
Commit 8f5170e

tweak layer flags and add output_is_reduced method

Signed-off-by: Bill Nell <bnell@redhat.com>
1 parent 877e61b

8 files changed, 45 insertions(+), 13 deletions(-)

vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py

3 additions, 0 deletions

@@ -61,6 +61,9 @@ def __init__(self, buffer: deep_ep.Buffer, num_dispatchers: int,
     def num_dispatchers(self) -> int:
         return self.num_dispatchers_

+    def output_is_reduced(self) -> bool:
+        return True
+
     @property
     def activation_format(self) -> mk.FusedMoEActivationFormat:
         return mk.FusedMoEActivationFormat.Standard

vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py

3 additions, 0 deletions

@@ -63,6 +63,9 @@ def __init__(self,
     def num_dispatchers(self) -> int:
         return self.num_dispatchers_

+    def output_is_reduced(self) -> bool:
+        return True
+
     @property
     def activation_format(self) -> mk.FusedMoEActivationFormat:
         return mk.FusedMoEActivationFormat.BatchedExperts

vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py

3 additions, 0 deletions

@@ -47,6 +47,9 @@ def topk_indices_dtype(self) -> Optional[torch.dtype]:
     def num_dispatchers(self) -> int:
         return self.num_dispatchers_

+    def output_is_reduced(self) -> bool:
+        return False
+
     def _apply_router_weight_on_input(
         self,
         a1: torch.Tensor,

vllm/model_executor/layers/fused_moe/fused_batched_moe.py

3 additions, 0 deletions

@@ -495,6 +495,9 @@ def topk_indices_dtype(self) -> Optional[torch.dtype]:
     def num_dispatchers(self) -> int:
         return self.num_dispatchers_

+    def output_is_reduced(self) -> bool:
+        return False
+
     def prepare(
         self,
         a1: torch.Tensor,

vllm/model_executor/layers/fused_moe/layer.py

12 additions, 13 deletions

@@ -1147,17 +1147,6 @@ def __init__(
         self.batched_hidden_states: Optional[torch.Tensor] = None
         self.batched_router_logits: Optional[torch.Tensor] = None

-        # TODO(bnell): make these into methods on PrepareAndFinalize or all2all?
-        self.must_reduce_shared_experts = (self.use_pplx_kernels
-                                           or self.use_deepep_ht_kernels
-                                           or self.use_deepep_ll_kernels)
-
-        self.use_dp_chunking = (
-            self.moe_parallel_config.use_pplx_kernels
-            or self.moe_parallel_config.use_deepep_ll_kernels
-            or (self.dp_size > 1
-                and self.moe_config.use_flashinfer_cutlass_kernels))
-
         if self.use_dp_chunking:
             states_shape: tuple[int, ...]
             logits_shape: tuple[int, ...]

@@ -1230,6 +1219,14 @@ def use_flashinfer_cutlass_kernels(self):
                 and self.moe_quant_config.quant_dtype == "nvfp4"
                 and self.moe_config.use_flashinfer_cutlass_kernels)

+    @property
+    def use_dp_chunking(self) -> bool:
+        # Route to the chunked forward path using the FlashInfer Cutlass kernel
+        # only when data parallelism (DP) is enabled.
+        return (self.moe_parallel_config.use_pplx_kernels
+                or self.moe_parallel_config.use_deepep_ll_kernels
+                or (self.dp_size > 1 and self.use_flashinfer_cutlass_kernels))
+
     def update_expert_map(self):
         # ep_size and ep_rank should already be updated
         assert self.expert_map is not None

@@ -1827,14 +1824,16 @@ def must_reduce_shared_expert_outputs(self) -> bool:
         Therefore it is required that we reduce the shared_experts output
         early.
         """
-        return self.must_reduce_shared_experts
+        assert self.quant_method is not None
+        return (self.quant_method.fused_experts is not None
+                and self.quant_method.fused_experts.output_is_reduced())

     def maybe_all_reduce_tensor_model_parallel(
             self, final_hidden_states: torch.Tensor):
         """
         Some combine kernels reduce across GPU ranks by default.
         """
-        if self.must_reduce_shared_experts:
+        if self.must_reduce_shared_expert_outputs():
             return final_hidden_states
         else:
             return tensor_model_parallel_all_reduce(final_hidden_states)
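
Taken together, the layer.py hunks replace two flags cached in __init__ with values derived on demand: use_dp_chunking becomes a property over the current parallel and quant configuration, and must_reduce_shared_expert_outputs() now queries the installed fused-experts kernel. Below is a minimal, self-contained sketch of that dispatch pattern; the class names are toy stand-ins, not vLLM's real ones.

# Toy illustration (not vLLM's actual classes): behavior is derived from
# the kernel object at call time instead of a flag captured in __init__.
from typing import Optional

class ToyKernel:
    def __init__(self, reduces_output: bool):
        self._reduces_output = reduces_output

    def output_is_reduced(self) -> bool:
        return self._reduces_output

class ToyLayer:
    def __init__(self, fused_experts: Optional[ToyKernel]):
        self.fused_experts = fused_experts

    def must_reduce_shared_expert_outputs(self) -> bool:
        # Mirrors the new layer.py logic: True only when a kernel is
        # installed and its combine step reduces across ranks.
        return (self.fused_experts is not None
                and self.fused_experts.output_is_reduced())

# Swapping the kernel flips the layer's behavior with no flag bookkeeping:
assert ToyLayer(ToyKernel(True)).must_reduce_shared_expert_outputs()
assert not ToyLayer(None).must_reduce_shared_expert_outputs()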

vllm/model_executor/layers/fused_moe/modular_kernel.py

15 additions, 0 deletions

@@ -364,6 +364,14 @@ def max_num_tokens_per_rank(self) -> Optional[int]:
     def num_dispatchers(self) -> int:
         raise NotImplementedError

+    @abstractmethod
+    def output_is_reduced(self) -> bool:
+        """
+        Indicates whether or not the output of finalize is reduced across all
+        ranks.
+        """
+        raise NotImplementedError
+

 # TODO: add supported activations method (return string)
 class FusedMoEPermuteExpertsUnpermute(ABC):

@@ -671,6 +679,13 @@ def __init__(
             f"{fused_experts.__class__.__name__}."
             f"{fused_experts.activation_formats[0]}")

+    def output_is_reduced(self) -> bool:
+        """
+        Indicates whether or not the output of fused MoE kernel
+        is reduced across all ranks.
+        """
+        return self.prepare_finalize.output_is_reduced()
+
     def _chunk_info(self, M: int) -> tuple[int, int]:
         """
         Compute number of chunks and chunk size for given M.
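
The hunks above define the contract once on the abstract base class and delegate from the modular kernel wrapper that holds a prepare_finalize object. A compressed sketch of the same shape follows, under the assumption that each backend simply declares its combine behavior; the class names are illustrative, not the real implementations.

from abc import ABC, abstractmethod

class PrepareAndFinalize(ABC):
    @abstractmethod
    def output_is_reduced(self) -> bool:
        """True if finalize() already reduced the output across all ranks."""
        raise NotImplementedError

class AllToAllCombine(PrepareAndFinalize):  # pplx / DeepEP-style backends
    def output_is_reduced(self) -> bool:
        return True

class LocalFinalize(PrepareAndFinalize):  # naive / batched-style backends
    def output_is_reduced(self) -> bool:
        return False

class ModularKernel:
    """Stand-in for the wrapper holding a prepare_finalize object."""
    def __init__(self, prepare_finalize: PrepareAndFinalize):
        self.prepare_finalize = prepare_finalize

    def output_is_reduced(self) -> bool:
        # Pure delegation, matching the second hunk above.
        return self.prepare_finalize.output_is_reduced()

assert ModularKernel(AllToAllCombine()).output_is_reduced()
assert not ModularKernel(LocalFinalize()).output_is_reduced()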

vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py

3 additions, 0 deletions

@@ -89,6 +89,9 @@ def topk_indices_dtype(self) -> Optional[torch.dtype]:
     def num_dispatchers(self) -> int:
         return self.num_dispatchers_

+    def output_is_reduced(self) -> bool:
+        return True
+
     def supports_async(self) -> bool:
         return True

vllm/model_executor/layers/fused_moe/prepare_finalize.py

3 additions, 0 deletions

@@ -27,6 +27,9 @@ def topk_indices_dtype(self) -> Optional[torch.dtype]:
     def num_dispatchers(self) -> int:
         return 1

+    def output_is_reduced(self) -> bool:
+        return False
+
     def prepare(
         self,
         a1: torch.Tensor,
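
Read together, the per-file hunks split the touched prepare/finalize implementations cleanly by combine behavior; the table below only restates the return values shown above.

File                                        output_is_reduced()
deepep_ht_prepare_finalize.py               True
deepep_ll_prepare_finalize.py               True
pplx_prepare_finalize.py                    True
flashinfer_cutlass_prepare_finalize.py      False
fused_batched_moe.py                        False
prepare_finalize.py                         False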
