@@ -278,29 +278,44 @@ def make_tpu_splash_attention(
         interpret=interpret,
         residual_checkpoint_name=f"tpu_attention.{FLASH_ATTN_RESIDUAL_NAME}",
     )
+    # args contains fwd_mask_info, dq_mask_info and dkv_mask_info, corresponding to the first three
+    # positional arguments to `splash_attention_kernel._splash_attention`.
     args, kwargs = kernel.tree_flatten()
     specs, _ = kernel.manual_sharding_spec(sharding).tree_flatten()
 
     def shard_map_fn(q_proj, k_proj, v_proj, bias, _, *args):
+        assert len(args) == 3
         if softmax_scale != 1.0:
             q_proj *= softmax_scale
         _, segment_ids, _ = split(bias, MaskFnAttentionBias, SegmentIdAttentionBias)
+        # Note: we cannot pass bias to vmap directly since it's possible that not all its tensors
+        # have the same batch dimension, which is required by vmap. For example, `target_positions`
+        # and `source_positions` from MaskFnAttentionBias may have batch dim == 1. Therefore, we
+        # extract the info we need from bias and pass that to vmap instead.
         seg_ids = None
-        if segment_ids.has_value():
+        if hasattr(segment_ids, "segment_ids"):
+            seg_ids = segment_ids.segment_ids
+        return jax.vmap(vmap_fn, in_axes=(0, 0, 0, 0) + (None,) * 3)(
+            q_proj, k_proj, v_proj, seg_ids, *args
+        )
+
+    def vmap_fn(q_proj, k_proj, v_proj, kv_seg_ids, *args):
+        if kv_seg_ids is None:
+            seg_ids = None
+        else:
             # SplashAttention requires q_seg_ids to have the same sequence length as q_proj and
-            # kv_seq_ids to have the same sequence length as k|v_proj. Therefore, we pass in a
+            # kv_seg_ids to have the same sequence length as k|v_proj. Therefore, we pass in a
             # segment id that's not sharded in the sequence dimension, and manually slice the
             # sequence dim to populate q_seg_ids.
-            kv_seq_ids = segment_ids.segment_ids
             if q_seq_shards == 1:
                 q_shard_idx = 0
             else:
                 q_shard_idx = jax.lax.axis_index("seq")
-            q_shard_size = kv_seq_ids.shape[0] // q_seq_shards
+            q_shard_size = kv_seg_ids.shape[0] // q_seq_shards
             q_seq_ids = jax.lax.dynamic_slice_in_dim(
-                kv_seq_ids, q_shard_idx * q_shard_size, q_shard_size
+                kv_seg_ids, q_shard_idx * q_shard_size, q_shard_size
             )
-            seg_ids = splash_attention_kernel.SegmentIds(q_seq_ids, kv_seq_ids)
+            seg_ids = splash_attention_kernel.SegmentIds(q_seq_ids, kv_seg_ids)
 
         q_proj = jnp.einsum("tnh->nth", q_proj)
         k_proj = jnp.einsum("snh->nsh", k_proj)
@@ -312,7 +327,7 @@ def shard_map_fn(q_proj, k_proj, v_proj, bias, _, *args):
         return jnp.einsum("nth->tnh", out)
 
     return FlashAttentionShardMapSpecs(
-        fn=jax.vmap(shard_map_fn, in_axes=(0, 0, 0, 0) + (None,) * 4),
+        fn=shard_map_fn,
         additional_in_specs=specs,
         additional_args=args,
     )
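
For reference, the `in_axes=(0, 0, 0, 0) + (None,) * 3` pattern above maps over the batch dimension of q_proj, k_proj, v_proj and the segment ids, while broadcasting the three mask-info arguments unchanged to every batch element. Below is a minimal sketch of just that vmap behavior; the shapes and the `per_example` body are hypothetical stand-ins for the real splash kernel call, not the kernel's actual signature.

import jax
import jax.numpy as jnp

# Hypothetical shapes: batch 2, query/key length 8, 2 heads, head dim 4.
batch, q_len, kv_len, heads, dim = 2, 8, 8, 2, 4
q = jnp.ones((batch, q_len, heads, dim))
k = jnp.ones((batch, kv_len, heads, dim))
v = jnp.ones((batch, kv_len, heads, dim))
seg_ids = jnp.zeros((batch, kv_len), dtype=jnp.int32)
# Stand-ins for fwd_mask_info, dq_mask_info and dkv_mask_info: shared by all
# batch elements, so they are broadcast via in_axes=None rather than batched.
mask_infos = (jnp.int32(0), jnp.int32(1), jnp.int32(2))

def per_example(q, k, v, seg, *mask_infos):
    # Inside vmap the batch dimension is gone: q is (t, n, h), seg is (s,),
    # and the three mask infos arrive exactly as passed in, unbatched.
    assert q.ndim == 3 and seg.ndim == 1 and len(mask_infos) == 3
    scores = jnp.einsum("tnh,snh->nts", q, k)     # placeholder for the splash attention kernel
    return jnp.einsum("nts,snh->tnh", scores, v)  # (t, n, h), matching the kernel's output layout

out = jax.vmap(per_example, in_axes=(0, 0, 0, 0) + (None,) * 3)(q, k, v, seg_ids, *mask_infos)
print(out.shape)  # (2, 8, 2, 4)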
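
The segment-id handling in vmap_fn relies on the kv segment ids being passed in full (unsharded along the sequence axis), with each sequence shard slicing out its own query portion via `jax.lax.axis_index` and `jax.lax.dynamic_slice_in_dim`. The following is a rough standalone illustration of that slicing pattern under shard_map; the mesh construction, sequence length, and segment-id values are assumptions for the sketch, with the "seq" axis sized to whatever devices are available.

import jax
import jax.numpy as jnp
import numpy as np
from jax.experimental.shard_map import shard_map
from jax.sharding import Mesh, PartitionSpec as P

devices = np.array(jax.devices())
mesh = Mesh(devices, axis_names=("seq",))
q_seq_shards = len(devices)
seq_len = 4 * q_seq_shards  # keep the sequence length divisible by the shard count

# Full-length kv segment ids, replicated across the "seq" axis (in_specs=P() below).
kv_seg_ids = jnp.arange(seq_len, dtype=jnp.int32) // 4

def slice_q_seg_ids(kv_seg_ids):
    # Each shard picks the query slice matching its position on the "seq" mesh
    # axis, mirroring the q_shard_idx / q_shard_size logic in the diff above.
    q_shard_idx = jax.lax.axis_index("seq") if q_seq_shards > 1 else 0
    q_shard_size = kv_seg_ids.shape[0] // q_seq_shards
    return jax.lax.dynamic_slice_in_dim(kv_seg_ids, q_shard_idx * q_shard_size, q_shard_size)

q_seg_ids = shard_map(
    slice_q_seg_ids,
    mesh=mesh,
    in_specs=P(),        # the kv segment ids arrive whole on every shard
    out_specs=P("seq"),  # per-shard query slices are laid out along the sequence axis
)(kv_seg_ids)
print(q_seg_ids.shape)  # (seq_len,): the per-shard slices tile the full length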