AI-Hypercomputer · prishajain1 · Apr 20, 2026
@@ -2,11 +2,17 @@
 hardware: 'tpu'
 skip_jax_distributed_system: False
 attention: 'flash'
-a2v_attention_kernel: 'flash'
+a2v_attention_kernel: 'dot_product'
 v2a_attention_kernel: 'dot_product'
 attention_sharding_uniform: True 
 precision: 'bf16'
+
+# For scanning transformer layers
 scan_layers: True
+
+# For scanning diffusion loop
+scan_diffusion_loop: True
+
 names_which_can_be_saved: []
 names_which_can_be_offloaded: []
 remat_policy: "NONE"

@@ -287,11 +287,11 @@ def _tpu_flash_attention(
 ) -> jax.Array:
   """TPU Flash Attention"""
 
-  block_sizes = _select_flash_block_sizes(query, key, flash_block_sizes, dtype, attention_kernel)
   num_context_shards = mesh.shape["context"]
   query, orig_q_seq_len = _reshape_data_for_flash(query, heads, num_context_shards)
   key, _ = _reshape_data_for_flash(key, heads, num_context_shards)
   value, _ = _reshape_data_for_flash(value, heads, num_context_shards)
+  block_sizes = _select_flash_block_sizes(query, key, flash_block_sizes, dtype, attention_kernel)
 
   q_axis_names = nn.logical_to_mesh_axes(axis_names_q)
   kv_axis_names = nn.logical_to_mesh_axes(axis_names_kv)
@@ -892,7 +892,7 @@ def __init__(
         dtype=dtype,
         param_dtype=weights_dtype,
         precision=precision,
-        kernel_init=nnx.with_partitioning(nnx.initializers.lecun_normal(), ("embed", "mlp")),
+        kernel_init=nnx.with_partitioning(nnx.initializers.lecun_normal(), (None, "mlp")),
         bias_init=nnx.with_partitioning(nnx.initializers.zeros, ("mlp",)),
     )
     self.act = get_activation(activation_fn)
@@ -904,8 +904,8 @@ def __init__(
         dtype=dtype,
         param_dtype=weights_dtype,
         precision=precision,
-        kernel_init=nnx.with_partitioning(nnx.initializers.lecun_normal(), ("mlp", "embed")),
-        bias_init=nnx.with_partitioning(nnx.initializers.zeros, ("embed",)),
+        kernel_init=nnx.with_partitioning(nnx.initializers.lecun_normal(), ("mlp", None)),
+        bias_init=nnx.with_partitioning(nnx.initializers.zeros, (None,)),
     )
 
   def __call__(self, hidden_states: Array) -> Array:

@@ -20,6 +20,7 @@
 import jax.numpy as jnp
 from ... import common_types
 from ..attention_flax import NNXAttentionOp
+from maxdiffusion.tpu_utils import get_tpu_type, TpuType
 
 Array = common_types.Array
 Mesh = common_types.Mesh
@@ -349,23 +350,40 @@ def __init__(
       rope_type: str = "interleaved",
       flash_block_sizes: BlockSizes = None,
       flash_min_seq_length: int = 4096,
+      qkv_sharding_spec: Optional[tuple] = None,
+      out_sharding_spec: Optional[tuple] = None,
+      out_bias_sharding_spec: Optional[tuple] = None,
   ):
     self.heads = heads
     self.rope_type = rope_type
     self.dim_head = dim_head
     self.inner_dim = dim_head * heads
     self.dropout_rate = dropout
 
+    # Detect the TPU type; it only selects the default sharding specs below for arguments the caller left as None
+    tpu_type = get_tpu_type()
+    is_ironwood = tpu_type == TpuType.TPU_7X
+
+    # Hardware-aware sharding: Ironwood (v7x) uses 1D sharding along the heads dimension (leaving the embedding dimension replicated)
+    # to minimize cross-device communication, while other hardware defaults to 2D sharding along both heads and embed dimensions.
+    # This has currently only been tested on Trillium (v6e) and Ironwood (v7x).
+    if qkv_sharding_spec is None:
+      qkv_sharding_spec = (None, "heads") if is_ironwood else ("embed", "heads")
+    if out_sharding_spec is None:
+      out_sharding_spec = ("heads", None) if is_ironwood else ("heads", "embed")
+    if out_bias_sharding_spec is None:
+      out_bias_sharding_spec = (None,) if is_ironwood else ("embed",)
+
     # 1. Define Partitioned Initializers (Logical Axes)
     # Q, K, V kernels: [in_features (embed), out_features (heads)]
-    qkv_kernel_init = nnx.with_partitioning(nnx.initializers.lecun_normal(), ("embed", "heads"))
+    qkv_kernel_init = nnx.with_partitioning(nnx.initializers.lecun_normal(), qkv_sharding_spec)
     # Q, K, V biases: [out_features (heads)]
     qkv_bias_init = nnx.with_partitioning(nnx.initializers.zeros_init(), ("heads",))
 
     # Out kernel: [in_features (heads), out_features (embed)]
-    out_kernel_init = nnx.with_partitioning(nnx.initializers.lecun_normal(), ("heads", "embed"))
+    out_kernel_init = nnx.with_partitioning(nnx.initializers.lecun_normal(), out_sharding_spec)
     # Out bias: [out_features (embed)]
-    out_bias_init = nnx.with_partitioning(nnx.initializers.zeros_init(), ("embed",))
+    out_bias_init = nnx.with_partitioning(nnx.initializers.zeros_init(), out_bias_sharding_spec)
 
     # Norm scales
     norm_scale_init = nnx.with_partitioning(nnx.initializers.ones_init(), ("norm",))

@@ -165,12 +165,12 @@ def __init__(self, in_channels: int, mid_channels: int = 1024, scale: float = 2.
         in_channels, (num**2) * self.mid_channels, kernel_size=(3, 3), padding=((1, 1), (1, 1)), rngs=rngs
     )
     self.pixel_shuffle = PixelShuffleND(dims=2, upscale_factors=(num, num))
-    self.blur = BlurDownsample(dims=2, stride=den)
+    self.blur_down = BlurDownsample(dims=2, stride=den)
 
   def __call__(self, x: jax.Array) -> jax.Array:
     x = self.conv(x)
     x = self.pixel_shuffle(x)
-    x = self.blur(x)
+    x = self.blur_down(x)
     return x