@@ -89,7 +89,7 @@ class Qwen3NextDynamicCache:
     cache (which has a constant shape regardless of seq_len).
 
     This cache has two sets of lists of tensors: `key_cache` and `value_cache` for attention cache and `conv_states`
-    and `ssm_states` for mamba cache. Each of these lists has `num_layers` tensors. The expected shape for each tensor
+    and `ssm_states` for gated deltanet cache. Each of these lists has `num_layers` tensors. The expected shape for each tensor
     For attention layers, `key_cache` and `value_cache` have a shape of `(batch_size, num_heads, seq_len, head_dim)`,
     while `conv_states` and `ssm_states` have a shape of `(batch_size, 0)` (empty tensors).
     For linear attention layers, `key_cache` and `value_cache` have a shape of `(batch_size, 0)` (empty tensors),
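To make the docstring's layout concrete, here is a minimal sketch of the shape convention using plain tensors rather than the library class; `layer_types`, the sizes, and the linear-attention state shapes are illustrative assumptions, not the model's actual dimensions.

```python
import torch

# Illustrative sizes only; the real values come from Qwen3NextConfig.
batch_size, num_heads, head_dim = 2, 4, 64
layer_types = ["linear_attention", "full_attention"]

key_cache, value_cache, conv_states, ssm_states = [], [], [], []
for layer_type in layer_types:
    if layer_type == "linear_attention":
        # Attention slots stay as empty (batch_size, 0) placeholders;
        # the recurrent state lives in conv_states / ssm_states instead.
        key_cache.append(torch.zeros(batch_size, 0))
        value_cache.append(torch.zeros(batch_size, 0))
        conv_states.append(torch.zeros(batch_size, num_heads * head_dim, 4))       # shape is a guess
        ssm_states.append(torch.zeros(batch_size, num_heads, head_dim, head_dim))  # shape is a guess
    else:
        # Full-attention layers grow along seq_len (0 here, before any token is cached).
        key_cache.append(torch.zeros(batch_size, num_heads, 0, head_dim))
        value_cache.append(torch.zeros(batch_size, num_heads, 0, head_dim))
        conv_states.append(torch.zeros(batch_size, 0))
        ssm_states.append(torch.zeros(batch_size, 0))
```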
@@ -108,7 +108,7 @@ def __init__(self, config: Qwen3NextConfig, batch_size, dtype=torch.float16, dev
         self.recurrent_states = []
         self.transformer_layers = []
         for i in range(config.num_hidden_layers):
-            # NOTE: only use mamba2 and full attention now! need to change future for more blocks.
+            # NOTE: only use gated deltanet and full attention now! need to change future for more blocks.
             if self.layer_types[i] == "linear_attention":
                 self.conv_states += [
                     torch.zeros(
@@ -1196,7 +1196,7 @@ def forward(
         input_ids: Optional[torch.LongTensor] = None,
         attention_mask: Optional[torch.Tensor] = None,
         position_ids: Optional[torch.LongTensor] = None,
-        past_key_values: Optional[Cache] = None,
+        past_key_values: Optional[Qwen3NextDynamicCache] = None,
         inputs_embeds: Optional[torch.FloatTensor] = None,
         labels: Optional[torch.LongTensor] = None,
         use_cache: Optional[bool] = None,
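A hedged usage sketch of the updated annotation: an explicitly constructed cache is passed through `past_key_values`. The import path and the placeholder checkpoint name are assumptions, and the constructor keywords are read off the `__init__` hunk above rather than a verified API.

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
# Assumed import location for the cache class shown in this diff.
from transformers.models.qwen3_next.modeling_qwen3_next import Qwen3NextDynamicCache

model_id = "path/to/qwen3-next-checkpoint"  # placeholder
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16)

inputs = tokenizer("Hello", return_tensors="pt")
# Constructor arguments follow the __init__ signature in the hunk above.
cache = Qwen3NextDynamicCache(model.config, batch_size=1, dtype=model.dtype)
outputs = model(**inputs, past_key_values=cache, use_cache=True)
```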