|
22 | 22 | from vllm.model_executor.layers.quantization import (QUANTIZATION_METHODS, |
23 | 23 | get_quantization_config) |
24 | 24 | from vllm.model_executor.models import ModelRegistry |
25 | | -from vllm.platforms import current_platform, interface |
| 25 | +from vllm.platforms import CpuArchEnum |
26 | 26 | from vllm.tracing import is_otel_available, otel_import_error_traceback |
27 | 27 | from vllm.transformers_utils.config import ( |
28 | 28 | ConfigFormat, get_config, get_hf_image_processor_config, |
@@ -349,6 +349,7 @@ def __init__(self, |
349 | 349 | self.is_hybrid = self._init_is_hybrid() |
350 | 350 | self.has_inner_state = self._init_has_inner_state() |
351 | 351 |
|
| 352 | + from vllm.platforms import current_platform |
352 | 353 | if current_platform.is_neuron(): |
353 | 354 | self.override_neuron_config = override_neuron_config |
354 | 355 | else: |
@@ -589,6 +590,7 @@ def _verify_quantization(self) -> None: |
589 | 590 | raise ValueError( |
590 | 591 | f"Unknown quantization method: {self.quantization}. Must " |
591 | 592 | f"be one of {supported_quantization}.") |
| 593 | + from vllm.platforms import current_platform |
592 | 594 | current_platform.verify_quantization(self.quantization) |
593 | 595 | if self.quantization not in optimized_quantization_methods: |
594 | 596 | logger.warning( |
@@ -644,6 +646,7 @@ def verify_async_output_proc(self, parallel_config, speculative_config, |
644 | 646 |
|
645 | 647 | # Reminder: Please update docs/source/usage/compatibility_matrix.md |
646 | 648 | # If the feature combo becomes valid |
| 649 | + from vllm.platforms import current_platform |
647 | 650 | if not current_platform.is_async_output_supported(self.enforce_eager): |
648 | 651 | logger.warning( |
649 | 652 | "Async output processing is not supported on the " |
@@ -1012,6 +1015,7 @@ def _verify_args(self) -> None: |
1012 | 1015 | raise ValueError( |
1013 | 1016 | "GPU memory utilization must be less than 1.0. Got " |
1014 | 1017 | f"{self.gpu_memory_utilization}.") |
| 1018 | + from vllm.platforms import current_platform |
1015 | 1019 | if (current_platform.is_cuda() and self.block_size is not None |
1016 | 1020 | and self.block_size > 32): |
1017 | 1021 | raise ValueError("CUDA Paged Attention kernel only supports " |
@@ -1279,6 +1283,7 @@ def __post_init__(self) -> None: |
1279 | 1283 | f"distributed executor backend " |
1280 | 1284 | f"'{self.distributed_executor_backend}'.") |
1281 | 1285 | ray_only_devices = ["tpu", "hpu"] |
| 1286 | + from vllm.platforms import current_platform |
1282 | 1287 | if (current_platform.device_type in ray_only_devices |
1283 | 1288 | and self.world_size > 1): |
1284 | 1289 | if self.distributed_executor_backend is None: |
@@ -1327,7 +1332,7 @@ def use_ray(self) -> bool: |
1327 | 1332 | def _verify_args(self) -> None: |
1328 | 1333 | # Lazy import to avoid circular import |
1329 | 1334 | from vllm.executor.executor_base import ExecutorBase |
1330 | | - |
| 1335 | + from vllm.platforms import current_platform |
1331 | 1336 | if self.distributed_executor_backend not in ( |
1332 | 1337 | "ray", "mp", None) and not (isinstance( |
1333 | 1338 | self.distributed_executor_backend, type) and issubclass( |
@@ -1528,6 +1533,7 @@ def compute_hash(self) -> str: |
1528 | 1533 | def __init__(self, device: str = "auto") -> None: |
1529 | 1534 | if device == "auto": |
1530 | 1535 | # Automated device type detection |
| 1536 | + from vllm.platforms import current_platform |
1531 | 1537 | self.device_type = current_platform.device_type |
1532 | 1538 | if not self.device_type: |
1533 | 1539 | raise RuntimeError("Failed to infer device type") |
@@ -2241,9 +2247,10 @@ def _get_and_verify_dtype( |
2241 | 2247 | else: |
2242 | 2248 | torch_dtype = config_dtype |
2243 | 2249 |
|
| 2250 | + from vllm.platforms import current_platform |
2244 | 2251 | if (current_platform.is_cpu() |
2245 | 2252 | and current_platform.get_cpu_architecture() |
2246 | | - == interface.CpuArchEnum.POWERPC |
| 2253 | + == CpuArchEnum.POWERPC |
2247 | 2254 | and (config_dtype == torch.float16 |
2248 | 2255 | or config_dtype == torch.float32)): |
2249 | 2256 | logger.info( |
@@ -3083,6 +3090,7 @@ def _get_quantization_config( |
3083 | 3090 | model_config: ModelConfig, |
3084 | 3091 | load_config: LoadConfig) -> Optional[QuantizationConfig]: |
3085 | 3092 | """Get the quantization config.""" |
| 3093 | + from vllm.platforms import current_platform |
3086 | 3094 | if model_config.quantization is not None: |
3087 | 3095 | from vllm.model_executor.model_loader.weight_utils import ( |
3088 | 3096 | get_quant_config) |
@@ -3145,6 +3153,7 @@ def __post_init__(self): |
3145 | 3153 | self.quant_config = VllmConfig._get_quantization_config( |
3146 | 3154 | self.model_config, self.load_config) |
3147 | 3155 |
|
| 3156 | + from vllm.platforms import current_platform |
3148 | 3157 | if self.scheduler_config is not None and \ |
3149 | 3158 | self.model_config is not None and \ |
3150 | 3159 | self.scheduler_config.chunked_prefill_enabled and \ |
|
0 commit comments