[Qwen3 Next] Update qwen3_next to use moe_calibration_context (#1984)

dsikka · web-flow · commit ee755a419cbe · 2025-11-03T18:44:28.000-05:00
# SUMMARY:
- Update the qwen3_next_moe definition to use the moe context
- Update example to use the correct argument
- Add tests
- Update qwen3_moe model definition to implement `restore` 

# Testing:
- All modeling tests pass
diff --git a/examples/quantization_w4a4_fp4/qwen3_next_example.py b/examples/quantization_w4a4_fp4/qwen3_next_example.py
@@ -68,18 +68,22 @@ def tokenize(sample):
 )
 
 # Apply quantization.
-# We see `calibrate_moe_context` to True to update all `Qwen3MoeSparseMoeBlock`
-# during calibration.
+# MoE calibration is now handled automatically by the pipeline.
+# We set `moe_calibrate_all_experts` to True to ensure all experts receive
+# calibration data. This temporarily updates the model definition to use
+# `CalibrationQwen3NextSparseMoeBlock` (from `llmcompressor.modeling.qwen3_next_moe`)
+# which replaces the original `Qwen3NextSparseMoeBlock` class.
+# This updates how the forward pass is handled in the MoE block during calibration.
 # Feel free to update the definition under
-# llm-compressor/src/llmcompressor/modeling/qwen3_moe.py` to play around with
-# this behaviour and evaluate its impact on quantization performance
+# llm-compressor/src/llmcompressor/modeling/qwen3_next_moe.py to play around with
+# this behavior and evaluate its impact on quantization performance.
 oneshot(
     model=model,
     dataset=ds,
     recipe=recipe,
     max_seq_length=MAX_SEQUENCE_LENGTH,
     num_calibration_samples=NUM_CALIBRATION_SAMPLES,
-    calibrate_moe_context=True,
+    moe_calibrate_all_experts=True,
 )
 
 
diff --git a/src/llmcompressor/modeling/moe_context.py b/src/llmcompressor/modeling/moe_context.py
@@ -45,7 +45,7 @@ class MoECalibrationModule(ABC, torch.nn.Module):
 
     is_permanent: bool = False
 
-    def restore(self) -> torch.nn.Module:
+    def restore(self, original: torch.nn.Module) -> torch.nn.Module:
         """
         Restore the original module structure.
 
@@ -163,5 +163,5 @@ def moe_calibration_context(
         # Step 2: Restore non-permanent modules
         for name, (original, replacement) in replaced.items():
             if not replacement.is_permanent:
-                restored = replacement.restore()
+                restored = replacement.restore(original)
                 model.set_submodule(name, restored)
diff --git a/src/llmcompressor/modeling/prepare.py b/src/llmcompressor/modeling/prepare.py
@@ -29,6 +29,9 @@
 from llmcompressor.modeling.qwen3_moe import (  # noqa: F401
     CalibrationQwen3MoeSparseMoeBlock,
 )
+from llmcompressor.modeling.qwen3_next_moe import (  # noqa: F401
+    CalibrationQwen3NextSparseMoeBlock,
+)
 from llmcompressor.modeling.qwen3_vl_moe import (
     replace as replace_Qwen3VLMoE,
 )
diff --git a/src/llmcompressor/modeling/qwen3_moe.py b/src/llmcompressor/modeling/qwen3_moe.py
@@ -98,6 +98,9 @@ def forward(self, hidden_states: torch.Tensor):
         )
         return final_hidden_states, router_logits
 
+    def restore(self, original: torch.nn.Module) -> torch.nn.Module:
+        return original
+
 
 # Legacy function for backward compatibility
 def replace(
diff --git a/src/llmcompressor/modeling/qwen3_next_moe.py b/src/llmcompressor/modeling/qwen3_next_moe.py
@@ -16,17 +16,34 @@
 
 import torch
 
+from llmcompressor.modeling.moe_context import (
+    MoECalibrationModule,
+    register_moe_calibration,
+)
+
+
+@register_moe_calibration("Qwen3NextSparseMoeBlock")
+class CalibrationQwen3NextSparseMoeBlock(MoECalibrationModule):
+    from transformers import Qwen3NextConfig
+    from transformers.models.qwen3_next.modeling_qwen3_next import (
+        Qwen3NextSparseMoeBlock,
+    )
+
+    """
+    Calibration version of Qwen3NextSparseMoeBlock that sends all tokens to all experts.
+    """
+
+    is_permanent = False
 
-class Qwen3NextSparseMoeBlock(torch.nn.Module):
     def __init__(
         self,
-        config,
-        original,
-        calibrate_all_experts: bool,
+        original: Qwen3NextSparseMoeBlock,
+        config: Qwen3NextConfig,
+        calibrate_all_experts: bool = True,
     ):
         super().__init__()
         self.num_experts = config.num_experts
-        self.top_k = config.top_k
+        self.top_k = config.num_experts_per_tok
         self.norm_topk_prob = config.norm_topk_prob
 
         # gating
@@ -44,7 +61,7 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
         router_logits = self.gate(hidden_states)
 
         routing_weights = torch.nn.functional.softmax(
-            router_logits, dim=1, dtype=torch.float
+            router_logits, dim=-1, dtype=torch.float
         )
         routing_weights, selected_experts = torch.topk(
             routing_weights, self.top_k, dim=-1
@@ -103,12 +120,15 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
         )
         return final_hidden_states, router_logits
 
+    def restore(self, original: torch.nn.Module) -> torch.nn.Module:
+        return original
+
 
 def replace(
     config,
     module,
     calibrate_all_experts,
 ):
-    return Qwen3NextSparseMoeBlock(
+    return CalibrationQwen3NextSparseMoeBlock(
         config=config, original=module, calibrate_all_experts=calibrate_all_experts
     )
diff --git a/tests/llmcompressor/modeling/test_calib_qwen3_next.py b/tests/llmcompressor/modeling/test_calib_qwen3_next.py
@@ -0,0 +1,96 @@
+import contextlib
+from functools import partial
+
+import pytest
+import torch
+from transformers import AutoModelForCausalLM
+
+from llmcompressor.modeling.moe_context import moe_calibration_context
+from llmcompressor.modeling.qwen3_next_moe import CalibrationQwen3NextSparseMoeBlock
+from llmcompressor.utils.dev import skip_weights_download
+from llmcompressor.utils.helpers import DisableQuantization, calibration_forward_context
+from tests.testing_utils import requires_cadence, requires_gpu
+
+
+@requires_cadence("weekly")
+@pytest.mark.parametrize("model_stub", ["Qwen/Qwen3-Next-80B-A3B-Instruct"])
+def test_calib_replace_qwen3moe_all_experts(model_stub):
+    with skip_weights_download():
+        model = AutoModelForCausalLM.from_pretrained(model_stub)
+
+    # Qwen3MoE layer replacement is temporary within the context
+    with contextlib.ExitStack() as stack:
+        stack.enter_context(calibration_forward_context(model))
+        stack.enter_context(DisableQuantization(model))
+        stack.enter_context(moe_calibration_context(model, calibrate_all_experts=True))
+
+        # Find one MoE layer
+        moe_layer = None
+        for name, module in model.named_modules():
+            if isinstance(module, CalibrationQwen3NextSparseMoeBlock):
+                moe_layer = module
+                break
+
+        assert moe_layer is not None
+
+        num_experts = len(moe_layer.experts)
+        expert_triggered = [False for _ in range(num_experts)]
+
+        # Define the hook function
+        def hook_fn(i, module, input, output):
+            expert_triggered[i] = True
+
+        # Attach hooks using functools.partial to bind each index
+        for i, expert in enumerate(moe_layer.experts):
+            expert.register_forward_hook(partial(hook_fn, i))
+
+        # Create dummy input tensor that simulates hidden_states
+        hidden_dim = model.config.hidden_size
+        batch, seq_len = 4, 32
+        sample = torch.randn(batch, seq_len, hidden_dim, dtype=torch.float32)
+
+        # Forward through the MoE layer directly
+        with torch.no_grad():
+            _ = moe_layer(sample)
+
+        # Assert all experts are used
+        assert all(
+            expert_triggered
+        ), f"Not all experts were triggered: {expert_triggered}"
+
+
+@requires_gpu
+def test_calib_qwen3_moe_module():
+    from transformers import Qwen3NextConfig
+    from transformers.models.qwen3_next.modeling_qwen3_next import (
+        Qwen3NextSparseMoeBlock,
+    )
+
+    config = Qwen3NextConfig()
+    with torch.device("cuda"):
+        original = Qwen3NextSparseMoeBlock(config).eval()
+
+    # Create dummy input tensor that simulates hidden_states
+    hidden_dim = config.hidden_size
+    batch, seq_len = 4, 32
+    sample = torch.randn(batch, seq_len, hidden_dim, device="cuda")
+
+    with calibration_forward_context(original):
+        true_output = original(sample)
+
+    module = CalibrationQwen3NextSparseMoeBlock(
+        original, config, calibrate_all_experts=True
+    )
+
+    with calibration_forward_context(module):
+        output = module(sample)
+        assert torch.nn.functional.mse_loss(true_output[0], output[0]) < 1e-10
+        assert torch.nn.functional.mse_loss(true_output[1], output[1]) < 1e-10
+
+    module = CalibrationQwen3NextSparseMoeBlock(
+        original, config, calibrate_all_experts=False
+    )
+    with calibration_forward_context(module):
+        output = module(sample)
+        assert torch.nn.functional.mse_loss(true_output[0], output[0]) < 1e-10
+        assert torch.nn.functional.mse_loss(true_output[1], output[1]) < 1e-10

Original file line number	Diff line number	Diff line change
`@@ -29,6 +29,9 @@`
`29`	`29`	`from llmcompressor.modeling.qwen3_moe import ( # noqa: F401`
`30`	`30`	`CalibrationQwen3MoeSparseMoeBlock,`
`31`	`31`	`)`
	`32`	`+from llmcompressor.modeling.qwen3_next_moe import ( # noqa: F401`
	`33`	`+ CalibrationQwen3NextSparseMoeBlock,`
	`34`	`+)`
`32`	`35`	`from llmcompressor.modeling.qwen3_vl_moe import (`
`33`	`36`	`replace as replace_Qwen3VLMoE,`
`34`	`37`	`)`
Original file line number	Diff line number	Diff line change
`@@ -98,6 +98,9 @@ def forward(self, hidden_states: torch.Tensor):`
`98`	`98`	`)`
`99`	`99`	`return final_hidden_states, router_logits`
`100`	`100`
	`101`	`+ def restore(self, original: torch.nn.Module) -> torch.nn.Module:`
	`102`	`+ return original`
	`103`	`+`
`101`	`104`
`102`	`105`	`# Legacy function for backward compatibility`
`103`	`106`	`def replace(`