
Commit 8f5170e

tweak layer flags and add output_is_reduced method

Signed-off-by: Bill Nell <bnell@redhat.com>
1 parent 877e61b

8 files changed, 45 insertions(+), 13 deletions(-)

vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py

3 additions, 0 deletions

@@ -61,6 +61,9 @@ def __init__(self, buffer: deep_ep.Buffer, num_dispatchers: int,
     def num_dispatchers(self) -> int:
         return self.num_dispatchers_

+    def output_is_reduced(self) -> bool:
+        return True
+
     @property
     def activation_format(self) -> mk.FusedMoEActivationFormat:
         return mk.FusedMoEActivationFormat.Standard

vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py

3 additions, 0 deletions

@@ -63,6 +63,9 @@ def __init__(self,
     def num_dispatchers(self) -> int:
         return self.num_dispatchers_

+    def output_is_reduced(self) -> bool:
+        return True
+
     @property
     def activation_format(self) -> mk.FusedMoEActivationFormat:
         return mk.FusedMoEActivationFormat.BatchedExperts

vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py

3 additions, 0 deletions

@@ -47,6 +47,9 @@ def topk_indices_dtype(self) -> Optional[torch.dtype]:
     def num_dispatchers(self) -> int:
         return self.num_dispatchers_

+    def output_is_reduced(self) -> bool:
+        return False
+
     def _apply_router_weight_on_input(
         self,
         a1: torch.Tensor,

vllm/model_executor/layers/fused_moe/fused_batched_moe.py

3 additions, 0 deletions

@@ -495,6 +495,9 @@ def topk_indices_dtype(self) -> Optional[torch.dtype]:
     def num_dispatchers(self) -> int:
         return self.num_dispatchers_

+    def output_is_reduced(self) -> bool:
+        return False
+
     def prepare(
         self,
         a1: torch.Tensor,

vllm/model_executor/layers/fused_moe/layer.py

12 additions, 13 deletions

@@ -1147,17 +1147,6 @@ def __init__(
         self.batched_hidden_states: Optional[torch.Tensor] = None
         self.batched_router_logits: Optional[torch.Tensor] = None

-        # TODO(bnell): make these into methods on PrepareAndFinalize or all2all?
-        self.must_reduce_shared_experts = (self.use_pplx_kernels
-                                           or self.use_deepep_ht_kernels
-                                           or self.use_deepep_ll_kernels)
-
-        self.use_dp_chunking = (
-            self.moe_parallel_config.use_pplx_kernels
-            or self.moe_parallel_config.use_deepep_ll_kernels
-            or (self.dp_size > 1
-                and self.moe_config.use_flashinfer_cutlass_kernels))
-
         if self.use_dp_chunking:
             states_shape: tuple[int, ...]
             logits_shape: tuple[int, ...]

@@ -1230,6 +1219,14 @@ def use_flashinfer_cutlass_kernels(self):
                 and self.moe_quant_config.quant_dtype == "nvfp4"
                 and self.moe_config.use_flashinfer_cutlass_kernels)

+    @property
+    def use_dp_chunking(self) -> bool:
+        # Route to the chunked forward path using the FlashInfer Cutlass kernel
+        # only when data parallelism (DP) is enabled.
+        return (self.moe_parallel_config.use_pplx_kernels
+                or self.moe_parallel_config.use_deepep_ll_kernels
+                or (self.dp_size > 1 and self.use_flashinfer_cutlass_kernels))
+
     def update_expert_map(self):
         # ep_size and ep_rank should already be updated
         assert self.expert_map is not None

@@ -1827,14 +1824,16 @@ def must_reduce_shared_expert_outputs(self) -> bool:
         Therefore it is required that we reduce the shared_experts output
         early.
         """
-        return self.must_reduce_shared_experts
+        assert self.quant_method is not None
+        return (self.quant_method.fused_experts is not None
+                and self.quant_method.fused_experts.output_is_reduced())

     def maybe_all_reduce_tensor_model_parallel(
             self, final_hidden_states: torch.Tensor):
         """
         Some combine kernels reduce across GPU ranks by default.
         """
-        if self.must_reduce_shared_experts:
+        if self.must_reduce_shared_expert_outputs():
             return final_hidden_states
         else:
             return tensor_model_parallel_all_reduce(final_hidden_states)
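
Taken together, the layer.py hunks replace two flags cached in __init__ with values derived on demand: use_dp_chunking becomes a property over the current parallel and quant configuration, and must_reduce_shared_expert_outputs() now queries the installed fused-experts kernel. Below is a minimal, self-contained sketch of that dispatch pattern; the class names are toy stand-ins, not vLLM's real ones.

# Toy illustration (not vLLM's actual classes): behavior is derived from
# the kernel object at call time instead of a flag captured in __init__.
from typing import Optional

class ToyKernel:
    def __init__(self, reduces_output: bool):
        self._reduces_output = reduces_output

    def output_is_reduced(self) -> bool:
        return self._reduces_output

class ToyLayer:
    def __init__(self, fused_experts: Optional[ToyKernel]):
        self.fused_experts = fused_experts

    def must_reduce_shared_expert_outputs(self) -> bool:
        # Mirrors the new layer.py logic: True only when a kernel is
        # installed and its combine step reduces across ranks.
        return (self.fused_experts is not None
                and self.fused_experts.output_is_reduced())

# Swapping the kernel flips the layer's behavior with no flag bookkeeping:
assert ToyLayer(ToyKernel(True)).must_reduce_shared_expert_outputs()
assert not ToyLayer(None).must_reduce_shared_expert_outputs()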

vllm/model_executor/layers/fused_moe/modular_kernel.py

15 additions, 0 deletions

@@ -364,6 +364,14 @@ def max_num_tokens_per_rank(self) -> Optional[int]:
     def num_dispatchers(self) -> int:
         raise NotImplementedError

+    @abstractmethod
+    def output_is_reduced(self) -> bool:
+        """
+        Indicates whether or not the output of finalize is reduced across all
+        ranks.
+        """
+        raise NotImplementedError
+

 # TODO: add supported activations method (return string)
 class FusedMoEPermuteExpertsUnpermute(ABC):

@@ -671,6 +679,13 @@ def __init__(
             f"{fused_experts.__class__.__name__}."
             f"{fused_experts.activation_formats[0]}")

+    def output_is_reduced(self) -> bool:
+        """
+        Indicates whether or not the output of fused MoE kernel
+        is reduced across all ranks.
+        """
+        return self.prepare_finalize.output_is_reduced()
+
     def _chunk_info(self, M: int) -> tuple[int, int]:
         """
         Compute number of chunks and chunk size for given M.
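
The hunks above define the contract once on the abstract base class and delegate from the modular kernel wrapper that holds a prepare_finalize object. A compressed sketch of the same shape follows, under the assumption that each backend simply declares its combine behavior; the class names are illustrative, not the real implementations.

from abc import ABC, abstractmethod

class PrepareAndFinalize(ABC):
    @abstractmethod
    def output_is_reduced(self) -> bool:
        """True if finalize() already reduced the output across all ranks."""
        raise NotImplementedError

class AllToAllCombine(PrepareAndFinalize):  # pplx / DeepEP-style backends
    def output_is_reduced(self) -> bool:
        return True

class LocalFinalize(PrepareAndFinalize):  # naive / batched-style backends
    def output_is_reduced(self) -> bool:
        return False

class ModularKernel:
    """Stand-in for the wrapper holding a prepare_finalize object."""
    def __init__(self, prepare_finalize: PrepareAndFinalize):
        self.prepare_finalize = prepare_finalize

    def output_is_reduced(self) -> bool:
        # Pure delegation, matching the second hunk above.
        return self.prepare_finalize.output_is_reduced()

assert ModularKernel(AllToAllCombine()).output_is_reduced()
assert not ModularKernel(LocalFinalize()).output_is_reduced()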

vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py

3 additions, 0 deletions

@@ -89,6 +89,9 @@ def topk_indices_dtype(self) -> Optional[torch.dtype]:
     def num_dispatchers(self) -> int:
         return self.num_dispatchers_

+    def output_is_reduced(self) -> bool:
+        return True
+
     def supports_async(self) -> bool:
         return True

vllm/model_executor/layers/fused_moe/prepare_finalize.py

3 additions, 0 deletions

@@ -27,6 +27,9 @@ def topk_indices_dtype(self) -> Optional[torch.dtype]:
     def num_dispatchers(self) -> int:
         return 1

+    def output_is_reduced(self) -> bool:
+        return False
+
     def prepare(
         self,
         a1: torch.Tensor,
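
Read together, the per-file hunks split the touched prepare/finalize implementations cleanly by combine behavior; the table below only restates the return values shown above.

File                                        output_is_reduced()
deepep_ht_prepare_finalize.py               True
deepep_ll_prepare_finalize.py               True
pplx_prepare_finalize.py                    True
flashinfer_cutlass_prepare_finalize.py      False
fused_batched_moe.py                        False
prepare_finalize.py                         False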
