Skip to content

Commit cff73cd

Browse files
committed
rebase fixup
1 parent d1aae3a commit cff73cd

File tree

5 files changed: +2 additions, −6 deletions

vllm/model_executor/layers/fused_moe/fused_moe.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,6 @@
66
from typing import Any, Callable, Dict, List, Optional, Tuple
77

88
import torch
9-
import triton
10-
import triton.language as tl
119

1210
import vllm.envs as envs
1311
from vllm import _custom_ops as ops
@@ -25,6 +23,7 @@
2523
dequant_mxfp4,
2624
)
2725
from vllm.platforms import current_platform
26+
from vllm.triton_utils import tl, triton
2827
from vllm.utils import direct_register_custom_op
2928

3029
from .rocm_aiter_fused_moe import is_rocm_aiter_moe_enabled

vllm/model_executor/layers/quantization/quark/quark.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,6 @@
66
import torch
77

88
from vllm.logger import init_logger
9-
import vllm.envs as envs
109
from vllm.model_executor.layers.fused_moe import FusedMoE
1110
from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase,
1211
UnquantizedLinearMethod)

vllm/model_executor/layers/quantization/quark/quark_moe.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -336,7 +336,6 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
336336
layer.w2_weight_scale = None
337337

338338
# This call is necessary to release the scales memory.
339-
# TODO: is it still?
340339
torch.cuda.empty_cache()
341340

342341
def apply(

vllm/model_executor/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,6 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
4747
layer.weight_scale = None
4848

4949
# This call is necessary to release the scales memory.
50-
# TODO: is it still?
5150
torch.cuda.empty_cache()
5251

5352
def create_weights(self, layer: torch.nn.Module,

vllm/worker/model_runner.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1795,7 +1795,7 @@ def execute_model(
17951795
])
17961796
else:
17971797
model_executable = self.model
1798-
1798+
17991799
# Receive KV cache in distributed KV cache transfer setting
18001800
# In disagg prefill setting, it will also recv hidden states and bypass
18011801
# model forwarding

Comments (0)