
[Bugfix] make moe_align_block_size compatible with CUDA graph #12036

Closed
27 changes: 13 additions & 14 deletions csrc/moe/moe_align_sum_kernels.cu
@@ -8,7 +8,7 @@
 #include "../cuda_compat.h"
 #include "../dispatch_utils.h"
 
-#define CEILDIV(x, y) (((x) + (y) - 1) / (y))
+#define CEILDIV(x, y) (((x) + (y)-1) / (y))
 
 namespace vllm {
 namespace moe {
@@ -221,29 +221,29 @@ __global__ void moe_sum_kernel(
 void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
                           int64_t block_size, torch::Tensor sorted_token_ids,
                           torch::Tensor experts_ids,
-                          torch::Tensor num_tokens_post_pad) {
+                          torch::Tensor num_tokens_post_pad,
+                          bool use_global_memory) {
   const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
 
   // If we have very large number of experts, we can no longer use shared
   // memory.
   // TODO(simon): the right solution should be calculating the exact right
   // amount of shared memory and use that. The num_experts >= 256 is just a
   // temporary solution to unblock Deepseek V3.
-  if (num_experts >= 256) {
+  if (use_global_memory) {
     VLLM_DISPATCH_INTEGRAL_TYPES(
         topk_ids.scalar_type(), "moe_align_block_size_global_mem_kernel", [&] {
           // calc needed amount of shared mem for `tokens_cnts` and `cumsum`
           // tensors
           const int32_t num_thread = max((int32_t)num_experts, WARP_SIZE);
 
-          const int32_t mem_tokens_cnts =
-              ((num_experts + 1) * num_experts) * sizeof(int32_t);
-          const int32_t mem_cumsum = (num_experts + 1) * sizeof(int32_t);
-          // allocate global memory
-          int32_t* tokens_cnts;
-          int32_t* cumsum;
-          cudaMalloc(&tokens_cnts, mem_tokens_cnts);
-          cudaMalloc(&cumsum, mem_cumsum);
+          auto options_int = torch::TensorOptions()
+                                 .dtype(torch::kInt)
+                                 .device(topk_ids.device());
+          torch::Tensor token_cnts_buffer =
+              torch::empty({(num_experts + 1) * num_experts}, options_int);
+          torch::Tensor cumsum_buffer =
+              torch::empty({num_experts + 1}, options_int);
 
           auto kernel =
               vllm::moe::moe_align_block_size_global_mem_kernel<scalar_t>;
@@ -252,9 +252,8 @@ void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
               sorted_token_ids.data_ptr<int32_t>(),
               experts_ids.data_ptr<int32_t>(),
               num_tokens_post_pad.data_ptr<int32_t>(), num_experts, block_size,
-              topk_ids.numel(), tokens_cnts, cumsum);
-          cudaFree(tokens_cnts);
-          cudaFree(cumsum);
+              topk_ids.numel(), token_cnts_buffer.data_ptr<int32_t>(),
+              cumsum_buffer.data_ptr<int32_t>());
         });
   } else {
     VLLM_DISPATCH_INTEGRAL_TYPES(
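Why this hunk is the heart of the fix: cudaMalloc and cudaFree are not permitted while a CUDA stream is being captured, so any attempt to capture a model that took the num_experts >= 256 path into a CUDA graph aborted here. Allocating token_cnts_buffer and cumsum_buffer with torch::empty instead routes the workspace through PyTorch's caching allocator, which is capture-aware. A minimal sketch of the property being relied on (illustration only, assuming a CUDA device and a PyTorch build with torch.cuda.CUDAGraph; the sizes are arbitrary):

```python
import torch

# Static output buffer: CUDA graphs replay over fixed addresses.
static_out = torch.zeros(1024, dtype=torch.float32, device="cuda")


def step() -> None:
    # Workspace allocated through PyTorch's caching allocator -- analogous to
    # the torch::empty buffers the patched kernel now uses. A raw cudaMalloc
    # issued at this point would end the stream capture with an error.
    workspace = torch.zeros(1024, dtype=torch.int32, device="cuda")
    static_out.copy_(workspace.to(torch.float32))


step()  # warm up outside of capture
torch.cuda.synchronize()

g = torch.cuda.CUDAGraph()
with torch.cuda.graph(g):
    step()
g.replay()
torch.cuda.synchronize()
```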
3 changes: 2 additions & 1 deletion csrc/moe/moe_ops.h
@@ -11,4 +11,5 @@ void moe_sum(torch::Tensor& input, torch::Tensor& output);
 void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
                           int64_t block_size, torch::Tensor sorted_token_ids,
                           torch::Tensor experts_ids,
-                          torch::Tensor num_tokens_post_pad);
+                          torch::Tensor num_tokens_post_pad,
+                          bool use_global_memory);
3 changes: 2 additions & 1 deletion csrc/moe/torch_bindings.cpp
@@ -19,7 +19,8 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {
       "moe_align_block_size(Tensor topk_ids, int num_experts,"
       "                     int block_size, Tensor! sorted_token_ids,"
      "                     Tensor! experts_ids,"
-      "                     Tensor! num_tokens_post_pad) -> ()");
+      "                     Tensor! num_tokens_post_pad,"
+      "                     bool use_global_memory) -> ()");
   m.impl("moe_align_block_size", torch::kCUDA, &moe_align_block_size);
 
 #ifndef USE_ROCM
12 changes: 8 additions & 4 deletions vllm/_custom_ops.py
@@ -914,13 +914,17 @@ def moe_sum(input: torch.Tensor, output: torch.Tensor):
     torch.ops._moe_C.moe_sum(input, output)
 
 
-def moe_align_block_size(topk_ids: torch.Tensor, num_experts: int,
-                         block_size: int, sorted_token_ids: torch.Tensor,
+def moe_align_block_size(topk_ids: torch.Tensor,
+                         num_experts: int,
+                         block_size: int,
+                         sorted_token_ids: torch.Tensor,
                          experts_ids: torch.Tensor,
-                         num_tokens_post_pad: torch.Tensor) -> None:
+                         num_tokens_post_pad: torch.Tensor,
+                         use_global_memory: bool = False) -> None:
     torch.ops._moe_C.moe_align_block_size(topk_ids, num_experts, block_size,
                                           sorted_token_ids, experts_ids,
-                                          num_tokens_post_pad)
+                                          num_tokens_post_pad,
+                                          use_global_memory)
 
 
 def topk_softmax(topk_weights: torch.Tensor, topk_ids: torch.Tensor,
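The Python wrapper keeps its old behaviour by default: callers that omit the new argument get use_global_memory=False and the shared-memory kernel. A hedged usage sketch against a vLLM build that includes this change; the output-buffer sizes below mirror the conventions used in fused_moe.moe_align_block_size and are an assumption for illustration, not a documented contract:

```python
import torch
from vllm import _custom_ops as ops

num_tokens, topk, num_experts, block_size = 32, 8, 256, 128
topk_ids = torch.randint(0, num_experts, (num_tokens, topk),
                         dtype=torch.int32, device="cuda")

# Outputs are pre-allocated by the caller, as fused_moe.py does.
max_num_tokens_padded = topk_ids.numel() + num_experts * (block_size - 1)
max_num_m_blocks = (max_num_tokens_padded + block_size - 1) // block_size
sorted_ids = torch.full((max_num_tokens_padded, ), topk_ids.numel(),
                        dtype=torch.int32, device="cuda")
expert_ids = torch.empty((max_num_m_blocks, ), dtype=torch.int32,
                         device="cuda")
num_tokens_post_pad = torch.empty((1, ), dtype=torch.int32, device="cuda")

# num_experts >= 256, so request the tensor-backed global-memory kernel.
ops.moe_align_block_size(topk_ids, num_experts, block_size, sorted_ids,
                         expert_ids, num_tokens_post_pad,
                         use_global_memory=True)
```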
2 changes: 1 addition & 1 deletion vllm/config.py
@@ -604,7 +604,7 @@ def _verify_cuda_graph(self) -> None:
                                           self.max_model_len)
 
         if (self.hf_config.model_type == 'deepseek_v3'
-                and not self.enforce_eager):
+                and self.quantization == "fp8" and not self.enforce_eager):
             logger.warning("CUDA graph is not supported for Deepseek V3 yet, "
                            "fallback to the eager mode.")
             self.enforce_eager = True
4 changes: 3 additions & 1 deletion vllm/model_executor/layers/fused_moe/fused_moe.py
@@ -256,8 +256,10 @@ def moe_align_block_size(
     num_tokens_post_pad = torch.empty((1),
                                       dtype=torch.int32,
                                       device=topk_ids.device)
+    use_global_memory = num_experts >= 256  # for deepseek-v3
     ops.moe_align_block_size(topk_ids, num_experts, block_size, sorted_ids,
-                             expert_ids, num_tokens_post_pad)
+                             expert_ids, num_tokens_post_pad,
+                             use_global_memory)
     return sorted_ids, expert_ids, num_tokens_post_pad
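With the workspace allocation moved into the caching allocator, the whole helper is capturable even on the num_experts >= 256 (DeepSeek-V3) path, which is what the relaxed _verify_cuda_graph check above relies on. A minimal capture sketch, assuming a CUDA device and a vLLM build containing this PR; moe_align_block_size here is the helper from vllm/model_executor/layers/fused_moe/fused_moe.py:

```python
import torch
from vllm.model_executor.layers.fused_moe.fused_moe import moe_align_block_size

num_tokens, topk, num_experts, block_size = 32, 8, 256, 128
# Static input buffer: refill it in place between graph replays.
topk_ids = torch.randint(0, num_experts, (num_tokens, topk),
                         dtype=torch.int32, device="cuda")

moe_align_block_size(topk_ids, block_size, num_experts)  # warm up eagerly
torch.cuda.synchronize()

g = torch.cuda.CUDAGraph()
with torch.cuda.graph(g):
    # Before this PR, capture aborted here because the large-expert path
    # called cudaMalloc inside the op; now the workspace comes from
    # torch::empty and the capture completes.
    sorted_ids, expert_ids, num_tokens_post_pad = moe_align_block_size(
        topk_ids, block_size, num_experts)
g.replay()
torch.cuda.synchronize()
```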