Skip to content

Commit 5ec1086

Browse files
committed
redo workspace allocation logic a bit
Signed-off-by: Bill Nell <bnell@redhat.com>
1 parent 8f5170e commit 5ec1086

File tree

11 files changed

+113
-101
lines changed

11 files changed

+113
-101
lines changed

vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -250,8 +250,8 @@ def finalize_weight_and_reduce_impl(self) -> mk.TopKWeightAndReduce:
250250

251251
def workspace_shapes(
252252
self,
253-
curr_M: int,
254-
M: int,
253+
M_chunk: int,
254+
M_full: int,
255255
N: int,
256256
K: int,
257257
topk: int,
@@ -264,7 +264,7 @@ def workspace_shapes(
264264
# end up sending their tokens. This needs to be fixed.
265265
num_dispatchers = self.num_dispatchers
266266
num_experts = local_num_experts
267-
max_num_tokens = (curr_M if self.max_num_tokens is None else
267+
max_num_tokens = (M_chunk if self.max_num_tokens is None else
268268
self.max_num_tokens)
269269
workspace13 = (num_experts, max_num_tokens * num_dispatchers,
270270
max(K, N))

vllm/model_executor/layers/fused_moe/batched_triton_or_deep_gemm_moe.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -93,8 +93,8 @@ def workspace_dtype(self, act_dtype: torch.dtype) -> torch.dtype:
9393

9494
def workspace_shapes(
9595
self,
96-
curr_M: int,
97-
M: int,
96+
M_chunk: int,
97+
M_full: int,
9898
N: int,
9999
K: int,
100100
topk: int,
@@ -108,13 +108,13 @@ def workspace_shapes(
108108
if self.allow_deep_gemm:
109109
assert self.batched_deep_gemm_experts is not None
110110
return self.batched_deep_gemm_experts.workspace_shapes(
111-
curr_M, M, N, K, topk, global_num_experts, local_num_experts,
112-
expert_tokens_metadata)
111+
M_chunk, M_full, N, K, topk, global_num_experts,
112+
local_num_experts, expert_tokens_metadata)
113113
else:
114114
assert self.batched_triton_experts is not None
115115
return self.batched_triton_experts.workspace_shapes(
116-
curr_M, M, N, K, topk, global_num_experts, local_num_experts,
117-
expert_tokens_metadata)
116+
M_chunk, M_full, N, K, topk, global_num_experts,
117+
local_num_experts, expert_tokens_metadata)
118118

119119
def apply(
120120
self,

vllm/model_executor/layers/fused_moe/cutlass_moe.py

Lines changed: 21 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -313,18 +313,18 @@ def workspace_dtype(self, act_dtype: torch.dtype) -> torch.dtype:
313313

314314
def workspace_shapes(
315315
self,
316-
curr_M: int,
317-
M: int,
316+
M_chunk: int,
317+
M_full: int,
318318
N: int,
319319
K: int,
320320
topk: int,
321321
global_num_experts: int,
322322
local_num_experts: int,
323323
expert_tokens_meta: Optional[mk.ExpertTokensMetadata],
324324
) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...]]:
325-
workspace1 = (curr_M * topk, max(N, K))
326-
workspace2 = (curr_M * topk, max(N // 2, K))
327-
output = (M, K)
325+
workspace1 = (M_chunk * topk, max(N, K))
326+
workspace2 = (M_chunk * topk, max(N // 2, K))
327+
output = (M_full, K)
328328
return (workspace1, workspace2, output)
329329

330330

@@ -371,8 +371,8 @@ def workspace_dtype(self, act_dtype: torch.dtype) -> torch.dtype:
371371

372372
def workspace_shapes(
373373
self,
374-
curr_M: int,
375-
M: int,
374+
M_chunk: int,
375+
M_full: int,
376376
N: int,
377377
K: int,
378378
topk: int,
@@ -382,9 +382,11 @@ def workspace_shapes(
382382
) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...]]:
383383
num_dp = self.num_dispatchers
384384
assert num_dp is not None
385-
workspace1 = (self.max_experts_per_worker, M * num_dp, max(N, K))
386-
workspace2 = (self.max_experts_per_worker, M * num_dp, max(N // 2, K))
387-
output = (self.max_experts_per_worker, M, K)
385+
assert M_chunk == M_full
386+
workspace1 = (self.max_experts_per_worker, M_full * num_dp, max(N, K))
387+
workspace2 = (self.max_experts_per_worker, M_full * num_dp,
388+
max(N // 2, K))
389+
output = (self.max_experts_per_worker, M_full, K)
388390
return (workspace1, workspace2, output)
389391

390392

@@ -670,8 +672,8 @@ def workspace_dtype(self, act_dtype: torch.dtype) -> torch.dtype:
670672

671673
def workspace_shapes(
672674
self,
673-
curr_M: int,
674-
M: int,
675+
M_chunk: int,
676+
M_full: int,
675677
N: int,
676678
K: int,
677679
topk: int,
@@ -683,13 +685,14 @@ def workspace_shapes(
683685
workspace2: tuple[int, ...] = ()
684686
output: tuple[int, ...] = ()
685687
if self.use_batched_format:
686-
workspace1 = (self.max_experts_per_worker, M, max(N, K))
687-
workspace2 = (self.max_experts_per_worker, M, (N // 2))
688-
output = (self.max_experts_per_worker, M, K)
688+
assert M_chunk == M_full
689+
workspace1 = (self.max_experts_per_worker, M_full, max(N, K))
690+
workspace2 = (self.max_experts_per_worker, M_full, (N // 2))
691+
output = (self.max_experts_per_worker, M_full, K)
689692
else:
690-
workspace1 = (curr_M * topk, max(2 * N, K))
691-
workspace2 = (curr_M * topk, N)
692-
output = (M, K)
693+
workspace1 = (M_chunk * topk, max(2 * N, K))
694+
workspace2 = (M_chunk * topk, N)
695+
output = (M_full, K)
693696
return (workspace1, workspace2, output)
694697

695698
def apply(

vllm/model_executor/layers/fused_moe/deep_gemm_moe.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -181,8 +181,8 @@ def finalize_weight_and_reduce_impl(self) -> mk.TopKWeightAndReduce:
181181

182182
def workspace_shapes(
183183
self,
184-
curr_M: int,
185-
M: int,
184+
M_chunk: int,
185+
M_full: int,
186186
N: int,
187187
K: int,
188188
topk: int,
@@ -192,13 +192,13 @@ def workspace_shapes(
192192
) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...]]:
193193
assert self.block_shape is not None
194194
block_m = self.block_shape[0]
195-
M_sum = compute_aligned_M(curr_M, topk, local_num_experts, block_m,
195+
M_sum = compute_aligned_M(M_chunk, topk, local_num_experts, block_m,
196196
expert_tokens_meta)
197197
assert M_sum % block_m == 0
198198

199199
workspace1 = (M_sum, max(N, K))
200200
workspace2 = (M_sum, max(N // 2, K))
201-
output = (M, K)
201+
output = (M_full, K)
202202
return (workspace1, workspace2, output)
203203

204204
def apply(

vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -81,8 +81,8 @@ def finalize_weight_and_reduce_impl(self) -> mk.TopKWeightAndReduce:
8181

8282
def workspace_shapes(
8383
self,
84-
curr_M: int,
85-
M: int,
84+
M_chunk: int,
85+
M_full: int,
8686
N: int,
8787
K: int,
8888
topk: int,
@@ -108,9 +108,9 @@ def workspace_shapes(
108108
- Note: in order for activation chunking to work, the first dimension
109109
of each tuple must be the number of tokens.
110110
"""
111-
workspace1 = (curr_M, K)
111+
workspace1 = (M_chunk, K)
112112
workspace2 = (0, )
113-
output_shape = (M, K * 2 if self.quant_dtype == "nvfp4" else K)
113+
output_shape = (M_full, K * 2 if self.quant_dtype == "nvfp4" else K)
114114
# The workspace is determined by `aq`, since it comes after any
115115
# potential communication op and is involved in the expert computation.
116116
return (workspace1, workspace2, output_shape)

vllm/model_executor/layers/fused_moe/fused_batched_moe.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -652,8 +652,8 @@ def finalize_weight_and_reduce_impl(self) -> mk.TopKWeightAndReduce:
652652

653653
def workspace_shapes(
654654
self,
655-
curr_M: int,
656-
M: int,
655+
M_chunk: int,
656+
M_full: int,
657657
N: int,
658658
K: int,
659659
topk: int,
@@ -850,8 +850,8 @@ def finalize_weight_and_reduce_impl(self) -> mk.TopKWeightAndReduce:
850850

851851
def workspace_shapes(
852852
self,
853-
curr_M: int,
854-
M: int,
853+
M_chunk: int,
854+
M_full: int,
855855
N: int,
856856
K: int,
857857
topk: int,

vllm/model_executor/layers/fused_moe/fused_moe.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1729,18 +1729,18 @@ def finalize_weight_and_reduce_impl(self) -> mk.TopKWeightAndReduce:
17291729

17301730
def workspace_shapes(
17311731
self,
1732-
curr_M: int,
1733-
M: int,
1732+
M_chunk: int,
1733+
M_full: int,
17341734
N: int,
17351735
K: int,
17361736
topk: int,
17371737
global_num_experts: int,
17381738
local_num_experts: int,
17391739
expert_tokens_meta: Optional[mk.ExpertTokensMetadata],
17401740
) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...]]:
1741-
workspace1 = (curr_M, topk, max(N // 2, K))
1742-
workspace2 = (curr_M, topk, max(N, K))
1743-
output = (M, K)
1741+
workspace1 = (M_chunk, topk, max(N // 2, K))
1742+
workspace2 = (M_chunk, topk, max(N, K))
1743+
output = (M_full, K)
17441744
return (workspace1, workspace2, output)
17451745

17461746
def apply(

vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -252,8 +252,8 @@ def supports_chunking(self) -> bool:
252252

253253
def workspace_shapes(
254254
self,
255-
curr_M: int,
256-
M: int,
255+
M_chunk: int,
256+
M_full: int,
257257
N: int,
258258
K: int,
259259
topk: int,
@@ -262,9 +262,9 @@ def workspace_shapes(
262262
expert_tokens_meta: Optional[mk.ExpertTokensMetadata],
263263
) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...]]:
264264
# workspace are allocated inside the kernel
265-
workspace1 = (M, K)
265+
workspace1 = (M_chunk, K)
266266
workspace2 = (0, 0)
267-
output = (M, K)
267+
output = (M_full, K)
268268
return (workspace1, workspace2, output)
269269

270270
def apply(

0 commit comments

Comments (0)