Commit 8b7eab5

add super kernel for decode moe
Signed-off-by: NNUCJ <616151263@qq.com>
1 parent 8181790 commit 8b7eab5

File tree: 6 files changed (+152, -89 lines)

tests/ut/ops/test_fused_ops.py

Lines changed: 27 additions & 7 deletions
@@ -238,11 +238,23 @@ def test_init_with_quant(self, mock_dist_env, default_moe_config):
 
     @pytest.mark.parametrize(
         "others_param",
-        [[None,
-          MagicMock(return_value=torch.randn(5, 32)), False, 5, None],
-         [2, None, False, 5, None], [None, None, True, 5, None],
-         [None, None, False, 1, None], [None, None, True, 5, 1],
-         [None, None, False, 5, 1]])
+        [[
+            None,
+            MagicMock(return_value=torch.randn(5, 32)), False, 5, None, False,
+            None, False
+        ], [2, None, False, 5, None, False, None, False],
+         [None, None, True, 5, None, False, None, False],
+         [None, None, False, 1, None, False, None, False],
+         [None, None, True, 5, 1, False, None, False],
+         [None, None, False, 5, 1, False, None, False],
+         [
+             None, None, True, 5, 1, True,
+             MagicMock(return_value=(torch.randn(5, 8), None)), False
+         ],
+         [
+             None, None, False, 5, 1, True,
+             MagicMock(return_value=(torch.randn(5, 8), None)), True
+         ]])
     def test_forward(self, mock_dist_env, default_moe_config, others_param):
         """
         1 test has shared_experts
@@ -251,15 +263,22 @@ def test_forward(self, mock_dist_env, default_moe_config, others_param):
         4 test single num_tokens(decode)
         5 test ep_size is 1 and is_prefill is true
         6 test ep_size is 1 and is_prefill is False
+        7 test enable_multistream_moe and is_prefill is true
         """
-        top_k, shared_experts, is_prefill, num_tokens, ep_size = others_param
+        top_k, shared_experts, is_prefill, num_tokens, ep_size, enable_multistream_moe, gate, enable_super_kernel = others_param
         inputs = torch.randn(num_tokens, 32)
         router_logits = torch.randn(num_tokens, 8)
         moe = AscendFusedMoE(**default_moe_config)
 
         if ep_size == 1:
             moe.moe_parallel_config.ep_size = 1
 
+        if enable_multistream_moe:
+            moe.enable_multistream_moe = True
+
+        if enable_super_kernel:
+            moe.enable_super_kernel = True
+
         moe.quant_method = MockQuantMethod(shared_experts, num_tokens)
         forward_context = MagicMock(mc2_mask=torch.zeros(num_tokens,
                                                          dtype=torch.bool),
@@ -270,7 +289,8 @@ def test_forward(self, mock_dist_env, default_moe_config, others_param):
                              router_logits,
                              is_prefill=is_prefill,
                              top_k=top_k,
-                             shared_experts=shared_experts)
+                             shared_experts=shared_experts,
+                             gate=gate)
 
         moe.quant_method.apply.assert_called_once()
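
For reference, the new eight-element others_param tuple unpacks in the order shown below. This is a standalone illustration inferred from the test_forward signature above, not part of the commit:

from unittest.mock import MagicMock

import torch

# One of the new parametrize cases, in the order test_forward expects:
# (top_k, shared_experts, is_prefill, num_tokens, ep_size,
#  enable_multistream_moe, gate, enable_super_kernel)
case = [
    None, None, False, 5, 1, True,
    MagicMock(return_value=(torch.randn(5, 8), None)), True
]
(top_k, shared_experts, is_prefill, num_tokens, ep_size,
 enable_multistream_moe, gate, enable_super_kernel) = case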

tests/ut/torchair/test_utils.py

Lines changed: 11 additions & 0 deletions
@@ -1,4 +1,7 @@
 import os
+from contextlib import nullcontext
+
+import torchair
 
 from tests.ut.base import TestBase
 from vllm_ascend.torchair import utils
@@ -26,3 +29,11 @@ def test_torchair_cache_dir(self):
                          "Delete torchair cache dir failed")
         self.assertFalse(utils.check_kv_cache_bytes_cache_exist(),
                          "Delete kv cache bytes cache dir failed")
+
+    def test_super_kernel(self):
+        super_kernel_unenable = utils.super_kernel("prefix", "stream-fusion=1",
+                                                   False)
+        self.assertTrue(super_kernel_unenable, nullcontext())
+        super_kernel_enable = utils.super_kernel("prefix", "stream-fusion=1",
+                                                 True)
+        self.assertIsInstance(super_kernel_enable, torchair.scope._Scope)

vllm_ascend/models/deepseek_v2.py

Lines changed: 16 additions & 8 deletions
@@ -315,15 +315,24 @@ def __init__(
         self.enable_multistream_moe = \
             ascend_config.torchair_graph_config.enable_multistream_moe and \
             self.torchair_graph_enabled
-
-        self.gate = ReplicatedLinear(config.hidden_size,
-                                     config.n_routed_experts,
-                                     bias=False,
-                                     quant_config=None,
-                                     prefix=f"{prefix}.gate")
+        self.enable_super_kernel = self.enable_multistream_moe and self.tp_size == 1
+        self.params_dtype = torch.float32 if self.enable_super_kernel else torch.get_default_dtype()
+
+        # Converting gate weight to fp32 is to adapt to the super kernel feature.
+        # Super kernel feature currently cannot fuse operators such as cast, stridedslice, and add.
+        # In the moe stage, Cast will interrupt the fusion of the super kernel. To avoid this problem,
+        # modifications will be made in the initialization stage.
+        self.gate = ReplicatedLinear(
+            config.hidden_size,
+            config.n_routed_experts,
+            bias=False,
+            quant_config=None,
+            params_dtype=self.params_dtype,
+            prefix=f"{prefix}.gate")
         if config.topk_method == "noaux_tc":
             self.gate.e_score_correction_bias = nn.Parameter(
-                torch.empty(config.n_routed_experts))
+                torch.empty(config.n_routed_experts,
+                            dtype=self.params_dtype))
         else:
             self.gate.e_score_correction_bias = None
 
@@ -370,7 +379,6 @@ def __init__(
         if transfer_config is not None:
             self.kv_consumer = transfer_config.kv_role == "kv_consumer"
 
-        self.params_dtype = torch.get_default_dtype()
         self.rm_router_logits = self.experts.rm_router_logits
 
     def forward(self,
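
To make the dtype decision above concrete, here is a minimal, self-contained sketch. It assumes only what the hunk states: the gate runs under the super kernel when multistream MoE is enabled and tensor parallelism is 1; pick_gate_dtype is a simplified stand-in, not the model's code:

import torch


def pick_gate_dtype(enable_multistream_moe: bool, tp_size: int) -> torch.dtype:
    # The super kernel cannot fuse Cast/StridedSlice/Add, so the gate weight
    # is created in fp32 up front instead of being cast inside the MoE stage.
    enable_super_kernel = enable_multistream_moe and tp_size == 1
    return torch.float32 if enable_super_kernel else torch.get_default_dtype()


torch.set_default_dtype(torch.bfloat16)
print(pick_gate_dtype(True, 1))  # torch.float32 (super kernel path)
print(pick_gate_dtype(True, 8))  # torch.bfloat16 (default dtype kept)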

vllm_ascend/ops/fused_moe.py

Lines changed: 22 additions & 10 deletions
@@ -48,7 +48,8 @@
 from vllm_ascend.ops.moe_dispatcher.token_dispatcher import (
     MoEAlltoAllSeqOverLapDispatcher, MoEDispatcherConfig)
 from vllm_ascend.ops.sequence_parallel import MetadataForPadding
-from vllm_ascend.torchair.utils import npu_stream_switch, npu_wait_tensor
+from vllm_ascend.torchair.utils import (npu_stream_switch, npu_wait_tensor,
+                                        super_kernel)
 from vllm_ascend.utils import (AscendSocVersion, dispose_tensor,
                                get_all_reduce_merge_state,
                                get_ascend_soc_version,
@@ -1204,6 +1205,7 @@ def __init__(
         )
         AscendFusedMoE.moe_counter += 1
         self.moe_instance_id = AscendFusedMoE.moe_counter
+        self.prefix = prefix
 
         if params_dtype is None:
             params_dtype = torch.get_default_dtype()
@@ -1268,6 +1270,7 @@ def __init__(
         self.enable_multistream_moe = \
             ascend_config.torchair_graph_config.enable_multistream_moe and \
             self.torchair_graph_enabled
+        self.enable_super_kernel = self.enable_multistream_moe and tp_size == 1
 
         if self.scoring_func != "softmax" and not self.use_grouped_topk:
             raise ValueError("Only softmax scoring function is supported for "
@@ -1372,16 +1375,23 @@ def forward(self,
         quantized_x_for_share, dynamic_scale_for_share = None, None
         from vllm_ascend.quantization.w8a8_dynamic import \
             AscendW8A8DynamicFusedMoEMethod
+        running_in_super_kernel = self.enable_super_kernel and fused_moe_state == FusedMoEState.MC2
         if self.enable_multistream_moe:
-            if not self.rm_router_logits:
-                router_logits, _ = gate(hidden_states)
-            if hasattr(self.quant_method, "quant_method") and \
-                isinstance(self.quant_method.quant_method,
-                           AscendW8A8DynamicFusedMoEMethod
-                           ) and fused_moe_state == FusedMoEState.MC2:
-                with npu_stream_switch("moe_secondary", 0):
-                    quantized_x_for_share, dynamic_scale_for_share = torch_npu.npu_dynamic_quant(
-                        hidden_states)
+            with super_kernel(self.prefix,
+                              "stream-fusion=1",
+                              enabled=running_in_super_kernel):
+                if not self.rm_router_logits:
+                    if self.enable_super_kernel:
+                        router_logits, _ = gate(hidden_states.float())
+                    else:
+                        router_logits, _ = gate(hidden_states)
+                if hasattr(self.quant_method, "quant_method") and \
+                    isinstance(self.quant_method.quant_method,
+                               AscendW8A8DynamicFusedMoEMethod
+                               ) and fused_moe_state == FusedMoEState.MC2:
+                    with npu_stream_switch("moe_secondary", 0):
+                        quantized_x_for_share, dynamic_scale_for_share = torch_npu.npu_dynamic_quant(
+                            hidden_states)
 
         if shared_experts:
             if not self.enable_multistream_moe or fused_moe_state != FusedMoEState.MC2:
@@ -1481,6 +1491,8 @@ def forward(self,
             token_dispatcher=self.token_dispatcher,
             quantized_x_for_share=quantized_x_for_share,
             dynamic_scale_for_share=dynamic_scale_for_share,
+            prefix=self.prefix,
+            running_in_super_kernel=running_in_super_kernel,
         )
 
         if shared_experts:
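
A hedged, standalone sketch of the fp32 gating path added above. Here gate is a plain torch.nn.Linear standing in for the model's ReplicatedLinear gate, and the bfloat16 inputs are illustrative:

import torch
import torch.nn as nn


def compute_router_logits(gate: nn.Module, hidden_states: torch.Tensor,
                          enable_super_kernel: bool) -> torch.Tensor:
    # Under the super kernel the gate weight is created in fp32 (see
    # deepseek_v2.py above), so the activations are promoted before the matmul
    # rather than inserting an in-graph Cast that would break kernel fusion.
    if enable_super_kernel:
        return gate(hidden_states.float())
    return gate(hidden_states)


gate = nn.Linear(32, 8, bias=False).float()
x = torch.randn(4, 32, dtype=torch.bfloat16)
print(compute_router_logits(gate, x, enable_super_kernel=True).dtype)  # float32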

vllm_ascend/quantization/w8a8_dynamic.py

Lines changed: 71 additions & 64 deletions
@@ -28,7 +28,8 @@
 from vllm_ascend.ascend_forward_context import FusedMoEState
 from vllm_ascend.distributed.parallel_state import get_mc2_group
 from vllm_ascend.ops.fused_moe import select_experts
-from vllm_ascend.torchair.utils import npu_stream_switch, npu_wait_tensor
+from vllm_ascend.torchair.utils import (npu_stream_switch, npu_wait_tensor,
+                                        super_kernel)
 from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_NZ, AscendSocVersion,
                                dispose_tensor, get_ascend_soc_version)
 
@@ -898,59 +899,62 @@ def apply(
         shared_experts: Optional[Any] = None,
         quantized_x_for_share: Optional[Any] = None,
         dynamic_scale_for_share: Optional[Any] = None,
+        prefix: str = "",
+        running_in_super_kernel: bool = False,
         **kwargs,
     ) -> torch.Tensor:
         assert router_logits.shape[
             1] == global_num_experts, "Number of global experts mismatch"
 
         is_deepseek_v3_r1 = global_num_experts == 256
-
-        # NOTE: now npu_moe_gating_top_k can only support `group_count=256` pattern
-        if is_deepseek_v3_r1:
-            topk_weights, topk_ids, _ = torch_npu.npu_moe_gating_top_k(
-                router_logits,
-                k=top_k,  # topk is currently fixed to 8
-                bias=e_score_correction_bias,
-                k_group=topk_group,  # fix: 4
-                group_count=num_expert_group,  # fix 8
-                group_select_mode=1,  # 0: max within the group; 1: topk2.sum (fix)
-                renorm=0,  # 0: softmax->topk (fix); 1: topk->softmax
-                norm_type=1,  # 0: softmax; 1: sigmoid (fix)
-                # out_flag=False, # todo new api; whether to output the third output
-                # y2_flag=False, # old api; whether to output the third output
-                routed_scaling_factor=1,
-                eps=float(1e-20))
-        else:
-            topk_weights, topk_ids = select_experts(
-                hidden_states=x,
-                router_logits=router_logits,
-                top_k=top_k,
-                use_grouped_topk=use_grouped_topk,
-                renormalize=renormalize,
-                topk_group=topk_group,
-                num_expert_group=num_expert_group,
-                custom_routing_function=custom_routing_function,
-                scoring_func=scoring_func,
-                e_score_correction_bias=e_score_correction_bias,
-            )
-
         fused_moe_state = get_forward_context().fused_moe_state
         shared_gate_up, shared_dequant_scale = None, None
-        if shared_experts is not None and fused_moe_state == FusedMoEState.MC2:
-            with npu_stream_switch("moe_secondary", 0):
-                npu_wait_tensor(quantized_x_for_share, router_logits)
-                share_up_out, _ = shared_experts.gate_up_proj(
-                    (quantized_x_for_share, dynamic_scale_for_share))
-                shared_gate_up, shared_dequant_scale = share_up_out[
-                    0], share_up_out[1]
-
-        # this is a naive implementation for experts load balance so as
-        # to avoid accumulating too much tokens on a single rank.
-        # currently it is only activated when doing profile runs.
-        if enable_force_load_balance:
-            topk_ids = torch.randint_like(topk_ids, 0, global_num_experts)
-
-        topk_weights = topk_weights.to(x.dtype)
+        with super_kernel(prefix,
+                          "stream-fusion=1",
+                          enabled=running_in_super_kernel):
+            # NOTE: now npu_moe_gating_top_k can only support `group_count=256` pattern
+            if is_deepseek_v3_r1:
+                topk_weights, topk_ids, _ = torch_npu.npu_moe_gating_top_k(
+                    router_logits,
+                    k=top_k,  # topk is currently fixed to 8
+                    bias=e_score_correction_bias,
+                    k_group=topk_group,  # fix: 4
+                    group_count=num_expert_group,  # fix 8
+                    group_select_mode=1,  # 0: max within the group; 1: topk2.sum (fix)
+                    renorm=0,  # 0: softmax->topk (fix); 1: topk->softmax
+                    norm_type=1,  # 0: softmax; 1: sigmoid (fix)
+                    # out_flag=False, # todo new api; whether to output the third output
+                    # y2_flag=False, # old api; whether to output the third output
+                    routed_scaling_factor=1,
+                    eps=float(1e-20))
+            else:
+                topk_weights, topk_ids = select_experts(
+                    hidden_states=x,
+                    router_logits=router_logits,
+                    top_k=top_k,
+                    use_grouped_topk=use_grouped_topk,
+                    renormalize=renormalize,
+                    topk_group=topk_group,
+                    num_expert_group=num_expert_group,
+                    custom_routing_function=custom_routing_function,
+                    scoring_func=scoring_func,
+                    e_score_correction_bias=e_score_correction_bias,
+                )
+            if shared_experts is not None and fused_moe_state == FusedMoEState.MC2:
+                with npu_stream_switch("moe_secondary", 0):
+                    npu_wait_tensor(quantized_x_for_share, router_logits)
+                    share_up_out, _ = shared_experts.gate_up_proj(
+                        (quantized_x_for_share, dynamic_scale_for_share))
+                    shared_gate_up, shared_dequant_scale = share_up_out[
+                        0], share_up_out[1]
+
+            # this is a naive implementation for experts load balance so as
+            # to avoid accumulating too much tokens on a single rank.
+            # currently it is only activated when doing profile runs.
+            if enable_force_load_balance:
+                topk_ids = torch.randint_like(topk_ids, 0, global_num_experts)
+
+            topk_weights = topk_weights.to(x.dtype)
         if fused_moe_state == FusedMoEState.AllGatherEP:
             return fused_experts_with_allgather(
                 hidden_states=x,
@@ -963,24 +967,27 @@ def apply(
                 top_k=top_k,
                 expert_map=expert_map)
         elif fused_moe_state == FusedMoEState.MC2:
-            return fused_experts_with_mc2(
-                hidden_states=x,
-                w1=layer.w13_weight,
-                w2=layer.w2_weight,
-                w1_scale=layer.w13_weight_scale_fp32,
-                w2_scale=layer.w2_weight_scale,
-                topk_weights=topk_weights,
-                topk_ids=topk_ids,
-                top_k=top_k,
-                expert_map=expert_map,
-                moe_all_to_all_group_name=self.moe_all_to_all_group_name,
-                log2phy=log2phy,
-                global_redundant_expert_num=global_redundant_expert_num,
-                shared_experts=shared_experts,
-                is_torchair=self.torchair_graph_enabled,
-                mc2_mask=kwargs.get("mc2_mask", None),
-                shared_gate_up=shared_gate_up,
-                shared_dequant_scale=shared_dequant_scale)
+            with super_kernel(prefix,
+                              "stream-fusion=1",
+                              enabled=running_in_super_kernel):
+                return fused_experts_with_mc2(
+                    hidden_states=x,
+                    w1=layer.w13_weight,
+                    w2=layer.w2_weight,
+                    w1_scale=layer.w13_weight_scale_fp32,
+                    w2_scale=layer.w2_weight_scale,
+                    topk_weights=topk_weights,
+                    topk_ids=topk_ids,
+                    top_k=top_k,
+                    expert_map=expert_map,
+                    moe_all_to_all_group_name=self.moe_all_to_all_group_name,
+                    log2phy=log2phy,
+                    global_redundant_expert_num=global_redundant_expert_num,
+                    shared_experts=shared_experts,
+                    is_torchair=self.torchair_graph_enabled,
+                    mc2_mask=kwargs.get("mc2_mask", None),
+                    shared_gate_up=shared_gate_up,
+                    shared_dequant_scale=shared_dequant_scale)
         elif fused_moe_state in [
                 FusedMoEState.AllGather, FusedMoEState.NaiveMulticast
         ]:
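
The two super_kernel scopes above are only active on the MC2 decode path. Below is a small sketch of that gating condition, using a stand-in enum for vllm_ascend's FusedMoEState:

from enum import Enum, auto


class FusedMoEStateLike(Enum):
    MC2 = auto()
    AllGather = auto()


def super_kernel_active(enable_super_kernel: bool,
                        state: FusedMoEStateLike) -> bool:
    # Mirrors running_in_super_kernel from fused_moe.py: the routing block and
    # fused_experts_with_mc2 are wrapped only for MC2; otherwise super_kernel
    # degrades to a nullcontext and the code runs unchanged.
    return enable_super_kernel and state is FusedMoEStateLike.MC2


print(super_kernel_active(True, FusedMoEStateLike.MC2))        # True
print(super_kernel_active(True, FusedMoEStateLike.AllGather))  # False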

vllm_ascend/torchair/utils.py

Lines changed: 5 additions & 0 deletions
@@ -4,6 +4,7 @@
 from contextlib import contextmanager, nullcontext
 
 import torch
+from torchair.scope import super_kernel as _super_kernel
 
 try:
     # Recent release of torchair has moved these ops to `.scope`.
@@ -96,3 +97,7 @@ def npu_wait_tensor(self: torch.Tensor,
                     *,
                     enabled: bool = True):
     return _npu_wait_tensor(self, dependency) if enabled else self
+
+
+def super_kernel(prefix: str, option: str, enabled: bool = True):
+    return _super_kernel(prefix, option) if enabled else nullcontext()
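
A hedged usage sketch of the new helper. It assumes a torchair build that provides torchair.scope.super_kernel, and the prefix string below is illustrative only:

from contextlib import nullcontext

from vllm_ascend.torchair.utils import super_kernel

# Disabled: the helper returns contextlib.nullcontext(), so wrapped ops run
# exactly as before (useful outside torchair graph mode).
assert isinstance(super_kernel("any.prefix", "stream-fusion=1", enabled=False),
                  nullcontext)

# Enabled: returns the torchair scope object that groups ops launched inside
# it into a single super kernel; entering it is only meaningful inside a
# torchair-compiled graph on an NPU, e.g.:
#
#     with super_kernel("model.layers.0.mlp.experts", "stream-fusion=1"):
#         ...  # MoE routing / dispatch ops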
