Skip to content

Commit 1a504af

Browse files
Isotr0py authored and simon-mo committed
[Bugfix] Fix broken CPU quantization due to triton import (#15038)
Signed-off-by: Isotr0py <2037008807@qq.com>
1 parent 01ca85b commit 1a504af

File tree

1 file changed

+4
-1
lines changed
  • vllm/model_executor/layers/quantization

1 file changed

+4
-1
lines changed

vllm/model_executor/layers/quantization/gguf.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,6 @@
1010
from vllm import _custom_ops as ops
1111
from vllm.logger import init_logger
1212
from vllm.model_executor.layers.activation import SiluAndMul
13-
from vllm.model_executor.layers.fused_moe.fused_moe import moe_align_block_size
1413
from vllm.model_executor.layers.fused_moe.layer import (FusedMoE,
1514
FusedMoEMethodBase)
1615
from vllm.model_executor.layers.linear import LinearBase, LinearMethodBase
@@ -140,6 +139,10 @@ def _fused_moe_gguf(
140139
qweight_type2: int,
141140
act,
142141
) -> torch.Tensor:
142+
# lazy import to avoid triggering triton import in CPU backend
143+
from vllm.model_executor.layers.fused_moe.fused_moe import (
144+
moe_align_block_size)
145+
143146
out_hidden_states = torch.empty_like(x)
144147
if qweight_type2 in MMQ_QUANT_TYPES and qweight_type in MMQ_QUANT_TYPES:
145148
num_tokens, _ = x.shape

0 commit comments

Comments (0)