
Commit eaa1bd5

Clarify that setting sliding_window_size = 8 results in a window size of 9, including the token itself.
1 parent d27c562 commit eaa1bd5

File tree

2 files changed: +27 -0 lines changed


axlearn/common/attention_bias.py

Lines changed: 10 additions & 0 deletions
@@ -701,6 +701,12 @@ def sliding_window_causal_mask(sliding_window_size: int) -> MaskFn:
     """Returns a causal MaskFn for sliding window attentions of a given window size.
 
     Implements the `MaskFn` protocol.
+
+    Note: Setting sliding_window_size = 8 results in attending to 9 tokens - it attends to itself
+    and sliding_window_size tokens to the left.
+
+    Args:
+        sliding_window_size: Left context of sliding window mask.
     """
 
     def mask(query_position: Tensor, key_position: Tensor):
@@ -730,8 +736,12 @@ def make_causal_biases(seq_len: int) -> Tensor:
 def make_sliding_window_causal_biases(seq_len: int, sliding_window_size: int) -> Tensor:
     """Generates attention logit biases for sliding window attention.
 
+    Note: Setting sliding_window_size = 8 results in attending to 9 tokens - it attends to itself
+    and sliding_window_size tokens to the left.
+
     Args:
         seq_len: Sequence length.
+        sliding_window_size: Left context of sliding window mask.
 
     Returns:
         A float tensor of shape [seq_len, seq_len] where the value at [i, j] = -inf
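
For reference, a minimal sketch (not part of the committed diff) of the off-by-one behavior the new docstring note describes, using sliding_window_causal_mask from this file; the sequence length of 16 is arbitrary:

import jax.numpy as jnp

from axlearn.common.attention_bias import sliding_window_causal_mask

# sliding_window_size = 8 admits the query token itself plus 8 tokens to its left.
mask_fn = sliding_window_causal_mask(sliding_window_size=8)
seq_len = 16
query_positions = jnp.arange(seq_len)[:, None]
key_positions = jnp.arange(seq_len)[None, :]
bool_mask = mask_fn(query_positions, key_positions)  # [seq_len, seq_len] boolean mask.

# The last query position can attend to 9 keys: itself and the 8 preceding tokens.
print(int(bool_mask[-1].sum()))  # 9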

axlearn/common/attention_bias_test.py

Lines changed: 17 additions & 0 deletions
@@ -16,10 +16,27 @@
     MaskFnAttentionBias,
     SegmentIdAttentionBias,
     TensorAttentionBias,
+    sliding_window_causal_mask,
 )
 from axlearn.common.utils import Tensor
 
 
+class MaskTest(test_utils.TestCase):
+    @parameterized.parameters(
+        [0, [[1, 0, 0, 0, 0], [0, 1, 0, 0, 0], [0, 0, 1, 0, 0], [0, 0, 0, 1, 0], [0, 0, 0, 0, 1]]],
+        [2, [[1, 0, 0, 0, 0], [1, 1, 0, 0, 0], [1, 1, 1, 0, 0], [0, 1, 1, 1, 0], [0, 0, 1, 1, 1]]],
+        [4, [[1, 0, 0, 0, 0], [1, 1, 0, 0, 0], [1, 1, 1, 0, 0], [1, 1, 1, 1, 0], [1, 1, 1, 1, 1]]],
+    )
+    def test_sliding_window_mask(self, left_context, expected):
+        mask_fn = sliding_window_causal_mask(sliding_window_size=left_context)
+        step_len = 5
+        target_positions = jnp.arange(step_len)[:, None]
+        source_positions = jnp.arange(step_len)[None, :]
+        bool_mask = mask_fn(target_positions, source_positions)
+        out_mask = bool_mask.astype(jnp.int32)
+        self.assertEqual(out_mask.tolist(), expected)
+
+
 class AttentionBiasTest(test_utils.TestCase):
     @parameterized.parameters(
         [attention_bias.ZeroAttentionBias(), False],
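
Similarly, a hedged sketch of the bias variant touched by this commit, mirroring the left_context=2 case of the test above; the assumption that attendable positions carry a bias of exactly 0 (with masked positions pushed to a large negative value) follows the docstring but is not verified here:

import jax.numpy as jnp

from axlearn.common.attention_bias import make_sliding_window_causal_biases

# sliding_window_size = 2 means each token attends to itself plus 2 tokens to the left,
# i.e. a window of 3 tokens per query position.
biases = make_sliding_window_causal_biases(seq_len=5, sliding_window_size=2)
attended = (biases == 0).astype(jnp.int32)  # assumption: bias 0 marks attendable positions
print(attended.tolist())
# Expected to match the test's left_context=2 pattern:
# [[1, 0, 0, 0, 0], [1, 1, 0, 0, 0], [1, 1, 1, 0, 0], [0, 1, 1, 1, 0], [0, 0, 1, 1, 1]]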
