Commit ae855ed

Author: Mark Lee
Ensures that cache_dtype is respected. (#977)
1 parent cfef38b commit ae855ed
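Why the cast matters (context added here, not part of the commit message): under JAX's default type promotion, adding a float32 projection into a bfloat16 cache silently promotes the result to float32, so a configured cache_dtype would be lost on the first extend_step. A minimal sketch of that behavior, with made-up shapes:

import jax.numpy as jnp

# Hypothetical tensors, chosen only to show the dtype promotion the commit guards against.
cached_key = jnp.zeros((2, 4, 1, 8), dtype=jnp.bfloat16)  # KV cache stored in cache_dtype
k_proj = jnp.ones((2, 4, 1, 8), dtype=jnp.float32)        # incoming projection in compute dtype

promoted = cached_key + k_proj                        # bfloat16 + float32 -> float32
assert promoted.dtype == jnp.float32

kept = cached_key + k_proj.astype(cached_key.dtype)   # cast first, accumulate in bfloat16
assert kept.dtype == jnp.bfloat16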

File tree: 2 files changed, +34 -19 lines


axlearn/common/attention.py

Lines changed: 5 additions & 5 deletions
@@ -853,14 +853,15 @@ def extend_step(

         # Create a dispatch matrix of shape [B, T=step, S].
         oh_indices = jax.nn.one_hot(
-            time_step[:, None] + jnp.arange(num_query_steps), source_len, dtype=k_proj.dtype
+            time_step[:, None] + jnp.arange(num_query_steps), source_len, dtype=cached_key.dtype
         )
         # Create a mask of shape [B, S, 1, 1].
         negated_oh_indices = (1 - oh_indices.sum(axis=1))[..., None, None]
         k_proj = jnp.einsum("bt...,bts->bs...", k_proj, oh_indices)
         v_proj = jnp.einsum("bt...,bts->bs...", v_proj, oh_indices)
-        k_proj = cached_key * negated_oh_indices + k_proj
-        v_proj = cached_value * negated_oh_indices + v_proj
+        # Ensure that we accumulate using the original dtype.
+        k_proj = cached_key * negated_oh_indices + k_proj.astype(cached_key.dtype)
+        v_proj = cached_value * negated_oh_indices + v_proj.astype(cached_value.dtype)

         updated_state.update(key=k_proj, value=v_proj)
         return updated_state, self.Output(query=q_proj, key=k_proj, value=v_proj)
@@ -1750,8 +1751,7 @@ def _forward_for_mode(
         # Validate key & value combination.
         if (key is None) != (value is None):
             raise ValueError(
-                "key and value must be both None or both set, "
-                f"key:{type(key)}, value:{type(value)}"
+                f"key and value must be both None or both set, key:{type(key)}, value:{type(value)}"
             )
         if kv_state is not None:
             if key is not None or value is not None:
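For readers outside the codebase, the dispatch pattern touched in the first hunk can be sketched on its own (hypothetical shapes and values, not the axlearn API): a one-hot matrix scatters the current step's projection into the source axis of the cache, and building that matrix in the cache dtype plus casting the scattered update keeps the whole accumulation in cache_dtype.

import jax
import jax.numpy as jnp

# Hypothetical sizes for illustration only.
batch, step, source_len, num_heads, head_dim = 2, 1, 6, 1, 4
cache_dtype = jnp.bfloat16

cached_key = jnp.zeros((batch, source_len, num_heads, head_dim), dtype=cache_dtype)
k_proj = jnp.ones((batch, step, num_heads, head_dim), dtype=jnp.float32)
time_step = jnp.array([3, 0])

# Dispatch matrix of shape [B, T=step, S], built in the cache dtype.
oh_indices = jax.nn.one_hot(
    time_step[:, None] + jnp.arange(step), source_len, dtype=cached_key.dtype
)
# Mask of shape [B, S, 1, 1] that zeroes out the positions being overwritten.
negated_oh_indices = (1 - oh_indices.sum(axis=1))[..., None, None]
# Scatter the new step into the source axis, then accumulate in the cache dtype.
scattered = jnp.einsum("bt...,bts->bs...", k_proj, oh_indices)
updated_key = cached_key * negated_oh_indices + scattered.astype(cached_key.dtype)
assert updated_key.dtype == cache_dtype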

axlearn/common/attention_test.py

Lines changed: 29 additions & 14 deletions
@@ -122,6 +122,7 @@
     TensorSpec,
     VDict,
     as_tensor,
+    cast_floats,
     flatten_items,
     save_and_offload_only_these_names_regex,
     shapes,
@@ -1562,19 +1563,27 @@ def test_qlinear(self, base_cfg, test_cfg):
         # Check that the outputs are close for all pairs.
         self.assertNestedAllClose(outputs[layer_a], outputs[layer_b])

-    @parameterized.parameters(
-        (attention.QKVLinear, 1),
-        (attention.FusedQKVLinear, 1),
-        (attention.GroupedQKVLinear, 1),
-        (attention.FusedGroupedQKVLinear, 1),
-        (attention.RoFormerQKVLinear, 1),
-        (attention.QKVLinear, 2),
-        (attention.FusedQKVLinear, 3),
-        (attention.GroupedQKVLinear, 4),
-        (attention.FusedGroupedQKVLinear, 3),
-        (attention.RoFormerQKVLinear, 2),
+    @parameterized.product(
+        [
+            dict(layer_cls=attention.QKVLinear, extend_step_len=1),
+            dict(layer_cls=attention.FusedQKVLinear, extend_step_len=1),
+            dict(layer_cls=attention.GroupedQKVLinear, extend_step_len=1),
+            dict(layer_cls=attention.FusedGroupedQKVLinear, extend_step_len=1),
+            dict(layer_cls=attention.RoFormerQKVLinear, extend_step_len=1),
+            dict(layer_cls=attention.QKVLinear, extend_step_len=2),
+            dict(layer_cls=attention.FusedQKVLinear, extend_step_len=3),
+            dict(layer_cls=attention.GroupedQKVLinear, extend_step_len=4),
+            dict(layer_cls=attention.FusedGroupedQKVLinear, extend_step_len=3),
+            dict(layer_cls=attention.RoFormerQKVLinear, extend_step_len=2),
+        ],
+        cache_dtype=[None, jnp.bfloat16],
     )
-    def test_repeated_extend_step(self, layer_cls: type[attention.BaseQKVLinear], extend_step_len):
+    def test_repeated_extend_step(
+        self,
+        layer_cls: type[attention.BaseQKVLinear],
+        extend_step_len: int,
+        cache_dtype: Optional[jnp.dtype],
+    ):
         """Tests that calling QKVLinear.extend_step() multiple times with the
         same time_step results in the same output."""
         model_dim = 8
@@ -1586,10 +1595,12 @@ def test_repeated_extend_step(self, layer_cls: type[attention.BaseQKVLinear], ex
             value_dim=model_dim,
             num_heads=num_heads,
             per_head_dim=per_head_dim,
+            cache_dtype=cache_dtype,
         )
         cfg = layer_cls.default_config().set(**layer_kwargs)
         maybe_set_config(cfg, num_kv_heads=num_heads, rotary_value=False)
         layer = cfg.set(name="test").instantiate(parent=None)
+        expect_dtype = cache_dtype or layer.dtype()

         # Construct base layer state.
         layer_state = layer.initialize_parameters_recursively(jax.random.PRNGKey(0))
@@ -1609,6 +1620,8 @@ def test_repeated_extend_step(self, layer_cls: type[attention.BaseQKVLinear], ex
         cache_state, init_output = layer.init_states(
             time_step=None, query=TensorSpec([batch_size, tgt_len])
         )
+        self.assertEqual(cache_state["key"].dtype, expect_dtype)
+        self.assertEqual(cache_state["value"].dtype, expect_dtype)
         self.assertIsNone(init_output)
         step_querys = []
         step_keys = step_values = None
@@ -1624,10 +1637,12 @@ def test_repeated_extend_step(self, layer_cls: type[attention.BaseQKVLinear], ex
             step_querys.append(step_output.query)
             step_keys = step_output.key
             step_values = step_output.value
+            self.assertEqual(cache_state["key"].dtype, expect_dtype)
+            self.assertEqual(cache_state["value"].dtype, expect_dtype)

         self.assertNestedAllClose(fwd_output.query, jnp.concat(step_querys, axis=1))
-        self.assertNestedAllClose(fwd_output.key, step_keys)
-        self.assertNestedAllClose(fwd_output.value, step_values)
+        self.assertNestedAllClose(cast_floats(fwd_output.key, cache_dtype), step_keys)
+        self.assertNestedAllClose(cast_floats(fwd_output.value, cache_dtype), step_values)

     @parameterized.parameters(jnp.float32, jnp.float16, jnp.bfloat16)
     def test_dtypes_inherited_from_parent(self, dtype: jnp.dtype):
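A note on the test change, offered as a reading of the diff rather than documentation: parameterized.product takes a positional sequence of keyword dicts plus keyword lists and runs the cross product, so each of the ten (layer_cls, extend_step_len) cases above now runs with both cache_dtype=None and cache_dtype=jnp.bfloat16. The final assertions apply cast_floats to the forward outputs because extend_step now returns keys and values in the cache dtype. A minimal, self-contained sketch of the decorator pattern, with hypothetical names:

from absl.testing import absltest, parameterized


class ProductExampleTest(parameterized.TestCase):
    @parameterized.product(
        [
            dict(layer_name="qkv", extend_step_len=1),
            dict(layer_name="fused_qkv", extend_step_len=3),
        ],
        cache_dtype=[None, "bfloat16"],
    )
    def test_cross_product(self, layer_name: str, extend_step_len: int, cache_dtype):
        # 2 dict cases x 2 cache_dtype values = 4 generated test cases.
        self.assertIn(layer_name, ("qkv", "fused_qkv"))


if __name__ == "__main__":
    absltest.main()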
