Skip to content

Commit 5dbf854

Browse files
authored
[CI/Build][CPU] Fix CPU CI by lazy importing triton FP8 kernels (#11618)
Signed-off-by: jiang1.li <jiang1.li@intel.com>
1 parent 970d6d0 commit 5dbf854

File tree

1 file changed

+3
-2
lines changed
  • vllm/model_executor/layers/quantization

1 file changed

+3
-2
lines changed

vllm/model_executor/layers/quantization/fp8.py

Lines changed: 3 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -15,8 +15,6 @@
15 15
from vllm.model_executor.layers.quantization.base_config import (
16 16
QuantizationConfig, QuantizeMethodBase)
17 17
from vllm.model_executor.layers.quantization.kv_cache import BaseKVCacheMethod
18 -
from vllm.model_executor.layers.quantization.utils.fp8_utils import (
19 -
apply_w8a8_block_fp8_linear)
20 18
from vllm.model_executor.layers.quantization.utils.marlin_utils_fp8 import (
21 19
apply_fp8_marlin_linear, prepare_fp8_layer_for_marlin)
22 20
from vllm.model_executor.layers.quantization.utils.quant_utils import (
@@ -337,6 +335,9 @@ def apply(self,
337 335
size_k=layer.input_size_per_partition,
338 336
bias=bias)
339 337

338 +
# Note: lazy import to avoid triton import error.
339 +
from vllm.model_executor.layers.quantization.utils.fp8_utils import (
340 +
apply_w8a8_block_fp8_linear)
340 341
if self.block_quant:
341 342
assert self.quant_config.weight_block_size is not None
342 343
return apply_w8a8_block_fp8_linear(

0 commit comments

Comments
 (0)