
Commit a931b70

Merge pull request vllm-project#11 from dcmaddix/fused_moe_lora_cleanup
Fused moe lora cleanup
2 parents 9f68dca + e5eec7b commit a931b70

9 files changed (+25, -36 lines)

csrc/moe/moe_lora_align_sum_kernels.cu
Lines changed: 1 addition & 0 deletions

@@ -21,6 +21,7 @@ __device__ __forceinline__ int32_t index(int32_t total_col, int32_t row,
 
 }  // namespace
 
+// TODO: Refactor common parts with moe_align_sum_kernels
 template <typename scalar_t, typename token_cnts_t>
 __global__ void moe_lora_align_sum_kernel(
     scalar_t* __restrict__ topk_ids, scalar_t* __restrict__ token_lora_mapping,

csrc/ops.h
Lines changed: 0 additions & 2 deletions

@@ -133,8 +133,6 @@ void silu_and_mul_quant(torch::Tensor& out, torch::Tensor& input,
                         torch::Tensor& scale);
 
 #ifndef USE_ROCM
-// #if (defined(ENABLE_NVFP4_SM100) && ENABLE_NVFP4_SM100) || \
-//     (defined(ENABLE_NVFP4_SM120) && ENABLE_NVFP4_SM120)
 void silu_and_mul_nvfp4_quant(torch::Tensor& out,
                               torch::Tensor& output_block_scale,
                               torch::Tensor& input,

csrc/torch_bindings.cpp
Lines changed: 1 addition & 2 deletions

@@ -122,8 +122,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
       "silu_and_mul_quant(Tensor! result, Tensor input, Tensor scale) -> ()");
   ops.impl("silu_and_mul_quant", torch::kCUDA, &silu_and_mul_quant);
 
-#if (defined(ENABLE_NVFP4_SM100) && ENABLE_NVFP4_SM100) || \
-    (defined(ENABLE_NVFP4_SM120) && ENABLE_NVFP4_SM120)
+#ifndef USE_ROCM
   ops.def(
       "silu_and_mul_nvfp4_quant(Tensor! result, Tensor! result_block_scale, "
       "Tensor input, Tensor input_global_scale) -> ()");

vllm/lora/models.py
Lines changed: 1 addition & 1 deletion

@@ -417,7 +417,7 @@ def activate_adapter(
             if module_lora:
                 module_lora.optimize()
                 # Note (gnovack) - If MOE lora weights are not split into
-                # um_experts chunks, we split them here
+                # num_experts chunks, we split them here
                 if isinstance(module, FusedMoEWithLoRA) and torch.is_tensor(
                     module_lora.lora_a
                 ):
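For context on the corrected comment: if an adapter ships its MoE LoRA weights as a single stacked tensor, activate_adapter splits them into num_experts chunks at this point. A rough sketch of the splitting idea; the sizes and split dimension here are assumptions, not taken from this diff:

import torch

num_experts, rank, hidden = 8, 16, 4096              # illustrative sizes
stacked_lora_a = torch.randn(num_experts * rank, hidden)

# Split the stacked weight into num_experts equal per-expert chunks.
lora_a_per_expert = list(torch.chunk(stacked_lora_a, num_experts, dim=0))
assert all(w.shape == (rank, hidden) for w in lora_a_per_expert)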

vllm/lora/utils.py
Lines changed: 2 additions & 1 deletion

@@ -286,7 +286,8 @@ def process_packed_modules_mapping(model: nn.Module) -> dict[str, list[str]]:
             return packed_modules_mapping
         else:
             raise AttributeError(
-                "To support LoRA for MoE model, 'get_expert_mapping' must be implemented"
+                "To support LoRA for MoE model, " \
+                "'get_expert_mapping' must be implemented"
             )
     else:
         return get_packed_modules_mapping(model)
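The reworded error spells out the contract: an MoE model has to expose get_expert_mapping() before its experts can be targeted by LoRA (the gpt_oss.py hunk at the end of this diff shows a real implementation). A bare sketch of the expected method shape, with the body left as a placeholder:

import torch.nn as nn

class MyMoEModel(nn.Module):
    def get_expert_mapping(self) -> list[tuple[str, str, int, str]]:
        # Each entry is (param_name, weight_name, expert_id, shard_id);
        # models typically build the list via FusedMoE.make_expert_params_mapping.
        raise NotImplementedError  # see gpt_oss.py below for a real example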

vllm/lora/worker_manager.py
Lines changed: 0 additions & 1 deletion

@@ -94,7 +94,6 @@ def _load_adapter(self, lora_request: LoRARequest) -> LoRAModel:
                 expected_lora_modules.extend(packed_modules_mapping[module])
             else:
                 expected_lora_modules.append(module)
-                # TODO(gnovack) - Attempting to load full-layer moe adapter
                 if module == "experts":
                     expected_lora_modules.append(module)
         expected_lora_modules = list(set(expected_lora_modules))

vllm/model_executor/layers/fused_moe/fused_marlin_moe.py
Lines changed: 17 additions & 20 deletions

@@ -192,25 +192,23 @@ def fused_marlin_moe(
         is_zp_float=False,
     )
 
-    if activation_func is not None:
-        activation_func(
-            activation, intermediate_cache2, intermediate_cache1.view(-1, 2 * N)
-        )
-    else:
-        if activation == "silu":
-            torch.ops._C.silu_and_mul(
-                intermediate_cache2, intermediate_cache1.view(-1, 2 * N)
-            )
-        elif activation == "swigluoai":
-            # alpha = 1.702, limit = 7.0
-            torch.ops._C.swigluoai_and_mul(
-                intermediate_cache2, intermediate_cache1.view(-1, 2 * N)
-            )
-        else:
-            raise ValueError(
-                f"Unsupported activation: {activation}. "
-                "Only silu and swigluoai activations are supported."
-            )
+    if activation_func is None:
+        def activation_func(activation: str, output: torch.Tensor, input: torch.Tensor) -> None:
+            if activation == "silu":
+                torch.ops._C.silu_and_mul(
+                    output, input
+                )
+            elif activation == "swigluoai":
+                # alpha = 1.702, limit = 7.0
+                torch.ops._C.swigluoai_and_mul(
+                    output, input
+                )
+            else:
+                raise ValueError(
+                    f"Unsupported activation: {activation}. "
+                    "Only silu and swigluoai activations are supported."
+                )
+    activation_func(activation, intermediate_cache2, intermediate_cache1.view(-1, 2 * N))
 
     if expert_map is not None:
         intermediate_cache3.zero_()

@@ -425,7 +423,6 @@ def apply(
     def moe_sum(self, input: torch.Tensor, output: torch.Tensor) -> None:
         ops.moe_sum(input, output)
 
-
 def modular_marlin_fused_moe(
     quant_config: FusedMoEQuantConfig, shared_experts: Optional[torch.nn.Module] = None
 ) -> mk.FusedMoEModularKernel:
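The refactor turns the activation into an injection point: when no activation_func is supplied, fused_marlin_moe defines the silu/swigluoai dispatch above as the default, and the final call always goes through the same (activation, output, input) signature. A hedged sketch of a caller-supplied replacement, e.g. for a MoE + LoRA path; this is a pure-PyTorch stand-in, not the fused kernels vLLM actually invokes:

import torch
import torch.nn.functional as F

def my_activation_func(activation: str, output: torch.Tensor, input: torch.Tensor) -> None:
    # `input` is intermediate_cache1 viewed as (-1, 2 * N): the first N columns
    # are activated and the last N multiply the result, mirroring silu_and_mul.
    if activation != "silu":
        raise ValueError(f"Unsupported activation: {activation}")
    gate, up = input.chunk(2, dim=-1)
    output.copy_(F.silu(gate) * up)

# Used as: fused_marlin_moe(..., activation_func=my_activation_func)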

vllm/model_executor/layers/fused_moe/fused_moe.py
Lines changed: 1 addition & 7 deletions

@@ -2047,12 +2047,6 @@ def apply(
         )
 
         invoke_fused_moe_kernel(
-            # The code `hidden_states` is not performing any specific action in
-            # the provided snippet. It seems to be a variable name or
-            # placeholder without any associated code or context.
-            # The code `hidden_states` is not performing any specific action in
-            # the provided snippet. It seems to be a variable or placeholder
-            # that has been declared but not used or assigned any value.
             hidden_states,
             w1,
             intermediate_cache1,

@@ -2114,7 +2108,7 @@ def apply(
             B_bias=self.w2_bias,
         )
 
-        # ops.moe_sum(intermediate_cache3, output)
+        # separate function is required for MoE + LoRA
         self.moe_sum(intermediate_cache3, output)
 
     def moe_sum(self, input: torch.Tensor, output: torch.Tensor) -> None:
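The new comment documents the indirection: the final reduction is routed through self.moe_sum instead of calling ops.moe_sum directly so that a MoE + LoRA implementation can override that one step. A hedged sketch of what an override could look like; the class names and the LoRA hook are illustrative, and torch.sum stands in for ops.moe_sum:

import torch

class PlainExperts:
    def moe_sum(self, input: torch.Tensor, output: torch.Tensor) -> None:
        # Reduce per-top-k expert outputs, roughly (num_tokens, top_k, hidden),
        # into (num_tokens, hidden); ops.moe_sum does this with a fused kernel.
        torch.sum(input, dim=1, out=output)

class LoRAExperts(PlainExperts):
    def moe_sum(self, input: torch.Tensor, output: torch.Tensor) -> None:
        super().moe_sum(input, output)
        # Hypothetical hook: fold the LoRA correction into the reduced output here.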

vllm/model_executor/models/gpt_oss.py
Lines changed: 2 additions & 2 deletions

@@ -697,13 +697,13 @@ def compute_logits(self, hidden_states: torch.Tensor) -> torch.Tensor:
         return logits
 
     def get_expert_mapping(self) -> list[tuple[str, str, int, str]]:
-        # Params for weights, fp8 weight scales, fp8 activation scales
+        # Params for weights, weight scales, activation scales
         # (param_name, weight_name, expert_id, shard_id)
         return FusedMoE.make_expert_params_mapping(
             ckpt_gate_proj_name="gate_proj",
             ckpt_down_proj_name="down_proj",
             ckpt_up_proj_name="up_proj",
-            num_experts=self.config.num_local_experts,  # FIXME: self.config.n_routed_experts if in config
+            num_experts=self.config.num_local_experts,
             num_redundant_experts=0,
         )
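For reference, each tuple produced by get_expert_mapping follows the (param_name, weight_name, expert_id, shard_id) layout noted in the comment above. A hedged sketch of how a weight loader might consume such a mapping; the function and matching rule are illustrative, not vLLM's actual loader:

def find_expert_target(ckpt_name: str,
                       expert_mapping: list[tuple[str, str, int, str]]):
    # Return the model parameter, expert index, and shard that a checkpoint
    # tensor maps onto, or None if it is not an expert weight.
    for param_name, weight_name, expert_id, shard_id in expert_mapping:
        if weight_name in ckpt_name:
            return param_name, expert_id, shard_id
    return None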
