1 file changed (+3 −2 lines): vllm/model_executor/layers/quantization
@@ -15,8 +15,6 @@
 from vllm.model_executor.layers.quantization.base_config import (
     QuantizationConfig, QuantizeMethodBase)
 from vllm.model_executor.layers.quantization.kv_cache import BaseKVCacheMethod
-from vllm.model_executor.layers.quantization.utils.fp8_utils import (
-    apply_w8a8_block_fp8_linear)
 from vllm.model_executor.layers.quantization.utils.marlin_utils_fp8 import (
     apply_fp8_marlin_linear, prepare_fp8_layer_for_marlin)
 from vllm.model_executor.layers.quantization.utils.quant_utils import (
@@ -337,6 +335,9 @@ def apply(self,
                 size_k=layer.input_size_per_partition,
                 bias=bias)

+        # Note: lazy import to avoid triton import error.
+        from vllm.model_executor.layers.quantization.utils.fp8_utils import (
+            apply_w8a8_block_fp8_linear)
         if self.block_quant:
             assert self.quant_config.weight_block_size is not None
             return apply_w8a8_block_fp8_linear(
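For context, the fix relies on Python's standard deferred-import idiom: an import statement inside a function body only executes when the function is called, so a module whose import can fail (here, anything that pulls in triton) is never touched just by importing the enclosing module. A minimal, self-contained sketch of the idiom follows; numpy stands in for the heavy optional dependency, and scaled_matmul is a made-up name for illustration, not vLLM API:

def scaled_matmul(x, w, use_fast_kernel: bool = False):
    """Multiply 2-D lists x @ w, optionally via a heavy optional backend."""
    if use_fast_kernel:
        # Deferred import: the optional backend is loaded only when this
        # branch actually runs, so importing this module never requires it.
        # (numpy is a stand-in for a triton-backed kernel module.)
        import numpy as np
        return (np.array(x) @ np.array(w)).tolist()
    # Pure-Python fallback with no optional dependencies.
    return [[sum(a * b for a, b in zip(row, col)) for col in zip(*w)]
            for row in x]

# Importing this module succeeds even if numpy is absent; the dependency
# is only exercised when use_fast_kernel=True.
print(scaled_matmul([[1, 2]], [[3], [4]]))  # [[11]]

The trade-off is that the import cost and any import-time failure move from module load to the first call of the function, which is exactly what the patch wants: hosts without triton can still import the quantization layer and use the non-block-quant paths.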