Commit 8fd9137

Add segment_ids option in DiTAttentionLayer (apple#976)
1 parent: e55a404

2 files changed (+50, -3 lines)

axlearn/common/dit.py

Lines changed: 12 additions & 3 deletions
@@ -405,8 +405,9 @@ def forward(
         shift: Optional[Tensor] = None,
         scale: Optional[Tensor] = None,
         gate: Optional[Tensor] = None,
-        query_positions: Optional[Tensor] = None,
         attention_logit_biases: Optional[Tensor] = None,
+        segment_ids: Optional[Tensor] = None,
+        query_positions: Optional[Tensor] = None,
     ) -> Tensor:
         """The forward function of DiTAttentionLayer.

@@ -418,7 +419,12 @@ def forward(
                 target_dim] and shift should be provided.
             gate: If provided, applying before the residual addition with shape
                 [batch_size, 1|num_length, target_dim].
-            attention_logit_biases: Optional Tensor representing the self attention biases.
+            attention_logit_biases: Optional Tensor representing the self attention biases with
+                shape [batch_size, num_length, num_length].
+            segment_ids: Optional int Tensor representing the segment each token belongs to with
+                shape [batch_size, num_length].
+            query_positions: Optional Tensor representing the query positions when computing the
+                attention with shape [batch_size, num_length]

         Returns:
             A tensor with shape [batch_size, num_length, target_dim].
@@ -442,7 +448,10 @@ def forward(
         x = modulate(x=x, shift=shift, scale=scale)

         x = self.attention(
-            query=x, query_positions=query_positions, attention_logit_biases=attention_logit_biases
+            query=x,
+            attention_logit_biases=attention_logit_biases,
+            segment_ids=segment_ids,
+            query_positions=query_positions,
         ).data

         if cfg.structure == "postnorm":

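For context, a rough sketch of how segment_ids are conventionally interpreted when sequences are packed: tokens may only attend to other tokens carrying the same segment id, which corresponds to a block-diagonal boolean mask. The actual masking in this commit happens inside the configured attention layer, so the helper below is an illustration of the convention, not axlearn's internal code path; segment_mask is a hypothetical name introduced here.

import jax.numpy as jnp

def segment_mask(segment_ids: jnp.ndarray) -> jnp.ndarray:
    # Illustrative helper (not part of this commit): True where the query token
    # and the key token share a segment id, i.e. where attention is allowed.
    # segment_ids: [batch_size, num_length] -> mask: [batch_size, num_length, num_length]
    return segment_ids[:, :, None] == segment_ids[:, None, :]

# Two packed sequences in one row: tokens 0-1 form segment 1, token 2 forms segment 2.
ids = jnp.asarray([[1, 1, 2]])
print(segment_mask(ids).astype(jnp.int32))
# [[[1 1 0]
#   [1 1 0]
#   [0 0 1]]]
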
axlearn/common/dit_test.py

Lines changed: 38 additions & 0 deletions
@@ -528,6 +528,44 @@ def test_dit_attn_logit_biases(self):
         # Expect the output be the same for valid items because of logit_biases.
         assert_allclose(layer_output * valid_mask, layer_output2 * valid_mask)

+    def test_dit_attn_segment_ids(self):
+        batch_size = 2
+        seq_len = 3
+        dim = 32
+        num_heads = 2
+
+        prng_key = jax.random.PRNGKey(0)
+        inputs = jax.random.normal(prng_key, shape=(batch_size, seq_len, dim))
+        shift = jax.random.normal(prng_key, shape=(batch_size, 1, dim))
+        scale = jax.random.normal(prng_key, shape=(batch_size, 1, dim))
+        gate = jax.random.normal(prng_key, shape=(batch_size, 1, dim))
+        segment_ids = jnp.ones((batch_size, seq_len))
+
+        layer_cfg = DiTAttentionLayer.default_config().set(
+            name="test",
+            source_dim=dim,
+            target_dim=dim,
+        )
+        layer_cfg.attention.num_heads = num_heads
+        layer_cfg.norm.eps = 1e-6
+        layer = layer_cfg.instantiate(parent=None)
+        state = layer.initialize_parameters_recursively(prng_key=prng_key)
+
+        layer_output, _ = F(
+            layer,
+            inputs=dict(
+                input=inputs,
+                shift=shift,
+                scale=scale,
+                gate=gate,
+                segment_ids=segment_ids,
+            ),
+            state=state,
+            is_training=False,
+            prng_key=prng_key,
+        )
+        assert_allclose(layer_output.shape, inputs.shape)
+
     @parameterized.parameters([True, False])
     def test_dit_attn_optional_input(self, use_ssg):
         batch_size = 2

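Note that the new test passes segment_ids of all ones, so it checks shapes on the single-segment path only. A hypothetical follow-up (not part of this commit) could pack two segments to confirm that cross-segment attention is actually masked:

# Hypothetical variant, assuming the same test setup as above (seq_len == 3):
segment_ids = jnp.asarray([[1, 1, 2], [1, 2, 2]])
# Reusing the same F(...) call, outputs for tokens in one segment should not
# change when tokens in the other segment are perturbed.
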