File tree: 1 file changed, +4 -1 lines changed
vllm/model_executor/layers/quantization
1 file changed, +4 -1 lines changed
Original file line number | Diff line number | Diff line change
10
10
from vllm import _custom_ops as ops
11
11
from vllm .logger import init_logger
12
12
from vllm .model_executor .layers .activation import SiluAndMul
13
- from vllm .model_executor .layers .fused_moe .fused_moe import moe_align_block_size
14
13
from vllm .model_executor .layers .fused_moe .layer import (FusedMoE ,
15
14
FusedMoEMethodBase )
16
15
from vllm .model_executor .layers .linear import LinearBase , LinearMethodBase
@@ -140,6 +139,10 @@ def _fused_moe_gguf(
140
139
qweight_type2 : int ,
141
140
act ,
142
141
) -> torch .Tensor :
142
+ # Lazy import: avoids triggering the triton import on the CPU backend.
143
+ from vllm .model_executor .layers .fused_moe .fused_moe import (
144
+ moe_align_block_size )
145
+
143
146
out_hidden_states = torch .empty_like (x )
144
147
if qweight_type2 in MMQ_QUANT_TYPES and qweight_type in MMQ_QUANT_TYPES :
145
148
num_tokens , _ = x .shape
You can’t perform that action at this time.
0 commit comments