[Continuation] Merge EmbeddedLLM/vllm-rocm into vLLM main #1836

Merged
63 commits merged on Dec 8, 2023

Changes from 1 commit (of 63)
43af310
port dtype_float16.cuh and cache_kernels.cu
pcmoritz Oct 10, 2023
cc81866
port dtype_bfloat16.cuh
pcmoritz Oct 10, 2023
475b5e2
port attention_utils.cuh
pcmoritz Oct 10, 2023
ddc496c
port more kernels
pcmoritz Oct 10, 2023
5eaa7a1
fix typo
pcmoritz Oct 10, 2023
f7273c6
add cuda_compat.h
pcmoritz Oct 10, 2023
99c3be7
Merge branch 'main' into port-to-rocm
pcmoritz Oct 16, 2023
f8093dc
sync branches
pcmoritz Oct 16, 2023
41df689
update
pcmoritz Oct 16, 2023
93be9c5
update
pcmoritz Oct 16, 2023
d96fa3c
fixes
pcmoritz Oct 16, 2023
421365b
cleanup
pcmoritz Oct 16, 2023
06b800e
update
pcmoritz Oct 16, 2023
2312beb
update
pcmoritz Oct 16, 2023
2958b39
update
pcmoritz Oct 16, 2023
3f89734
fmt
pcmoritz Oct 16, 2023
5397a57
cleanup
pcmoritz Oct 16, 2023
90e02d2
refactor
pcmoritz Oct 16, 2023
a420202
update
pcmoritz Oct 16, 2023
b072182
Merge branch 'main' into port-to-rocm
pcmoritz Oct 17, 2023
2d1e435
detecting rocm and adding flag for compiling
iAmir97 Oct 17, 2023
e231b79
using asm volatile instead of hip api
iAmir97 Oct 17, 2023
31bb335
using asm volatile for type casting of f16
iAmir97 Oct 17, 2023
b027d06
Hipifying csrc file to accomodate rocm builds
kliuae Nov 27, 2023
9a1781c
Checked CUDA ROCm Compatibility (#15)
tjtanaa Nov 29, 2023
0f67117
merged with latest upstream
kliuae Nov 29, 2023
7dbf2d4
format code
kliuae Nov 29, 2023
52ffcf0
downgrade torch requirement in toml to torch 2.0.1 to accommodate ROC…
kliuae Nov 29, 2023
27f0513
Merged changes from vllm main
kliuae Dec 1, 2023
5cce649
Merged with changes in vllm main
kliuae Dec 1, 2023
16d3ccc
Updated Dockerfile, rocm installation guide and setuppy
kliuae Dec 1, 2023
d764f9d
Updated amd installation guide and dockerfile
kliuae Dec 2, 2023
e798632
Added num_gpus for ray init in ROCm
kliuae Dec 2, 2023
0e8129f
Synced torch version with vllm main in pyproject.toml
kliuae Dec 2, 2023
2b3821b
Format code
kliuae Dec 2, 2023
0c8795a
Merge branch 'main' into vllm-cuda-rocm-dev
kliuae Dec 4, 2023
5793f30
Updated dockerfile.rocm and requirements-rocm.txt
kliuae Dec 4, 2023
b172cdd
Disable mistral for ROCm
kliuae Dec 4, 2023
9cd5b18
Format code
kliuae Dec 4, 2023
b86f88a
Revert to cuda kernels
kliuae Dec 5, 2023
9727ab4
Merge remote-tracking branch 'pcmoritz/port-to-rocm'
kliuae Dec 5, 2023
c4aa2af
Port latest kernels to ROCm
kliuae Dec 5, 2023
f8c304e
Update readme
kliuae Dec 5, 2023
e608c30
Cleaned up kernel code
kliuae Dec 5, 2023
951e225
Added wrapper for setting devFuncAttributeMaxDynamicSharedMemorySize
kliuae Dec 6, 2023
25f9a97
Added wrapper for setting devFuncAttributeMaxDynamicSharedMemorySize
kliuae Dec 6, 2023
e984ada
Updated ROCm warp size
kliuae Dec 6, 2023
cc1195f
Format code
kliuae Dec 6, 2023
f92980e
Check hip from wrapper
kliuae Dec 6, 2023
66b4aa1
Format code
kliuae Dec 6, 2023
4a0ecb8
Enable support for mistral models
kliuae Dec 6, 2023
acf51a8
Fixed hip device attribute
kliuae Dec 6, 2023
4a52977
Format code
kliuae Dec 6, 2023
23a987a
Restored awq file
kliuae Dec 7, 2023
8787a4e
Format code
kliuae Dec 7, 2023
5911131
Merge latest vllm main
kliuae Dec 7, 2023
9fa8075
Updated rocm dockerfile
kliuae Dec 7, 2023
81e052d
Update amd installation guide
kliuae Dec 7, 2023
fb8ac26
Update vLLM Documentations (#18)
tjtanaa Dec 7, 2023
98f5487
Updated setup.py, vllm/utils.py and amd-installation doc
kliuae Dec 8, 2023
d90187a
Updated setup.py
kliuae Dec 8, 2023
c840531
Format code
kliuae Dec 8, 2023
9dba1d8
Merge branch 'main' into vllm-cuda-rocm-mod
kliuae Dec 8, 2023

Check hip from wrapper
kliuae committed Dec 6, 2023
commit f92980e357d7fc0691f6ab54df885a2a86ee7ce9
25 changes: 17 additions & 8 deletions setup.py
@@ -19,15 +19,24 @@
ROCM_SUPPORTED_ARCHS = {"gfx90a", "gfx908", "gfx906", "gfx1030", "gfx1100"}
Collaborator:

Just curious: which part of the code imposes this requirement? That is, why is gfx8 not supported? While I don't think we have to support it, I'd like to know why we don't.

tjtanaa (Contributor, Author):

The list of ROCm-supported archs was compiled based on what AMD supports for ROCm and HIP. Furthermore, each arch has its own set of assembly instructions, so we also have to make sure that the assembly instructions currently in use are supported by those archs.

To the best of our knowledge, the following are the ARCH requirements needed by different libraries:

  1. PyTorch: gfx900, gfx906, gfx908, gfx90a, gfx1030, gfx1101
  2. vLLM custom ops: gfx90a, gfx908, gfx906, gfx1030, gfx1100
  3. Flash-Attention ROCm: gfx90a, gfx940, gfx941, gfx942

Should we use the intersection of all three ARCH requirements instead?
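For illustration only (not part of this PR), a minimal sketch of what that intersection would look like; the three sets below simply restate the arch lists quoted above, and the result would contain only gfx90a:

# Hypothetical sketch: intersect the arch lists quoted in the comment above.
PYTORCH_ARCHS = {"gfx900", "gfx906", "gfx908", "gfx90a", "gfx1030", "gfx1101"}
VLLM_CUSTOM_OP_ARCHS = {"gfx90a", "gfx908", "gfx906", "gfx1030", "gfx1100"}
FLASH_ATTN_ROCM_ARCHS = {"gfx90a", "gfx940", "gfx941", "gfx942"}

# Intersection of all three leaves only gfx90a.
ROCM_SUPPORTED_ARCHS = PYTORCH_ARCHS & VLLM_CUSTOM_OP_ARCHS & FLASH_ATTN_ROCM_ARCHS
print(sorted(ROCM_SUPPORTED_ARCHS))  # ['gfx90a']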

WoosukKwon (Collaborator), Dec 8, 2023:

@tjtanaa Thanks for the detailed explanation. Sorry, I have little background on this stuff. Maybe I should learn more about ROCm and AMD GPUs 😂

As far as I understand, the vLLM custom ops support every "recent" AMD GPU, and the supported GPU list is currently limited by ROCm Flash Attention. Is this correct?

tjtanaa (Contributor, Author), Dec 8, 2023:

@WoosukKwon We believe that, in the near future, the set of supported GPU archs is going to be restricted by ROCm Flash Attention.

hongxiayang (Collaborator), Dec 8, 2023:

FYI: the supported gfx archs for ROCm are documented here (the "LLVM target" column): https://rocm.docs.amd.com/en/latest/release/gpu_os_support.html#linux-supported-gpus.

# SUPPORTED_ARCHS = NVIDIA_SUPPORTED_ARCHS.union(ROCM_SUPPORTED_ARCHS)


+def _is_hip():
+return torch.version.hip


+def _is_cuda():
+return torch.version.cuda


# Compiler flags.
CXX_FLAGS = ["-g", "-O2", "-std=c++17"]
# TODO(woosuk): Should we use -O3?
NVCC_FLAGS = ["-O2", "-std=c++17"]

-if torch.version.hip and ROCM_HOME is not None:
+if _is_hip() and ROCM_HOME is not None:
NVCC_FLAGS += ["-DUSE_ROCM"]

-if torch.version.cuda and CUDA_HOME is None:
+if _is_cuda() and CUDA_HOME is None:
raise RuntimeError(
"Cannot find CUDA_HOME. CUDA must be available to build the package.")

@@ -129,7 +138,7 @@ def get_torch_arch_list() -> Set[str]:

# First, check the TORCH_CUDA_ARCH_LIST environment variable.
compute_capabilities = get_torch_arch_list()
-if torch.version.cuda and not compute_capabilities:
+if _is_cuda() and not compute_capabilities:
# If TORCH_CUDA_ARCH_LIST is not defined or empty, target all available
# GPUs on the current machine.
device_count = torch.cuda.device_count()
@@ -140,7 +149,7 @@ def get_torch_arch_list() -> Set[str]:
"GPUs with compute capability below 7.0 are not supported.")
compute_capabilities.add(f"{major}.{minor}")

-if torch.version.cuda:
+if _is_cuda():
nvcc_cuda_version = get_nvcc_cuda_version(CUDA_HOME)
if not compute_capabilities:
# If no GPU is specified nor available, add all supported architectures
@@ -191,7 +200,7 @@ def get_torch_arch_list() -> Set[str]:
num_threads = min(os.cpu_count(), 8)
NVCC_FLAGS += ["--threads", str(num_threads)]

-elif torch.version.hip:
+elif _is_hip():
amd_arch = get_amdgpu_offload_arch()
if amd_arch not in ROCM_SUPPORTED_ARCHS:
raise RuntimeError(
@@ -211,7 +220,7 @@ def get_torch_arch_list() -> Set[str]:
"csrc/pybind.cpp",
]

-if torch.version.cuda:
+if _is_cuda():
vllm_extension_sources.append("csrc/quantization/awq/gemm_kernels.cu")

vllm_extension = CUDAExtension(
@@ -245,7 +254,7 @@ def find_version(filepath: str) -> str:
def get_vllm_version() -> str:
version = find_version(get_path("vllm", "__init__.py"))

-if torch.version.hip:
+if _is_hip():
# Get the HIP version
hipcc_version = get_hipcc_rocm_version()
if hipcc_version != MAIN_CUDA_VERSION:
@@ -271,7 +280,7 @@ def read_readme() -> str:

def get_requirements() -> List[str]:
"""Get Python package dependencies from requirements.txt."""
-if torch.version.hip:
+if _is_hip():
with open(get_path("requirements-rocm.txt")) as f:
requirements = f.read().strip().split("\n")
else:
9 changes: 5 additions & 4 deletions vllm/engine/arg_utils.py
@@ -7,6 +7,7 @@

from vllm.config import (CacheConfig, ModelConfig, ParallelConfig,
SchedulerConfig)
+from vllm.utils import is_hip


@dataclass
@@ -89,7 +90,7 @@ def add_cli_args(
help='directory to download and load the weights, '
'default to the default cache dir of '
'huggingface')
-if torch.cuda.is_available() and torch.version.hip:
+if is_hip():
# do something specific for HIP
parser.add_argument(
'--load-format',
@@ -106,7 +107,7 @@ def add_cli_args(
help='data type for model weights and activations. '
'The default option is FP16 precision '
'Supports FP16 and BF16 ')
-elif torch.cuda.is_available() and torch.version.cuda:
+else:
# do something specific for CUDA
parser.add_argument(
'--load-format',
@@ -197,7 +198,7 @@ def add_cli_args(
parser.add_argument('--disable-log-stats',
action='store_true',
help='disable logging statistics')
-if torch.cuda.is_available() and torch.version.hip:
+if is_hip():
# Quantization settings.
parser.add_argument('--quantization',
'-q',
@@ -206,7 +207,7 @@ def add_cli_args(
default=None,
help='Method used to quantize the weights')

-elif torch.cuda.is_available() and torch.version.cuda:
+else:
# Quantization settings.
parser.add_argument('--quantization',
'-q',
3 changes: 2 additions & 1 deletion vllm/engine/ray_utils.py
@@ -3,6 +3,7 @@

from vllm.config import ParallelConfig
from vllm.logger import init_logger
+from vllm.utils import is_hip

import torch

@@ -75,7 +76,7 @@ def initialize_cluster(
"Ray is not installed. Please install Ray to use distributed "
"serving.")
# Connect to a ray cluster.
-if torch.version.hip:
+if is_hip():
ray.init(address=ray_address,
ignore_reinit_error=True,
num_gpus=parallel_config.world_size)
3 changes: 2 additions & 1 deletion vllm/model_executor/layers/attention.py
@@ -10,6 +10,7 @@
from vllm._C import ops
from vllm._C import cache_ops
from vllm.model_executor.input_metadata import InputMetadata
+from vllm.utils import is_hip

_SUPPORTED_HEAD_SIZES = [64, 80, 96, 112, 128, 256]
# Should be the same as PARTITION_SIZE in `paged_attention_v2_launcher`.
@@ -161,7 +162,7 @@ def forward(
p=0.0,
scale=self.scale,
op=xops.fmha.MemoryEfficientAttentionFlashAttentionOp[0] if
-(torch.cuda.is_available() and torch.version.hip) else None,
+(is_hip()) else None,
)
output = out.view_as(query)
else:
3 changes: 2 additions & 1 deletion vllm/model_executor/layers/quantization/__init__.py
@@ -2,12 +2,13 @@
import torch
from vllm.model_executor.layers.quantization.squeezellm import SqueezeLLMConfig
from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
+from vllm.utils import is_hip

_QUANTIZATION_CONFIG_REGISTRY = {
"squeezellm": SqueezeLLMConfig,
}

-if torch.cuda.is_available() and torch.version.cuda:
+if not is_hip():
from vllm.model_executor.layers.quantization.awq import AWQConfig
_QUANTIZATION_CONFIG_REGISTRY["awq"] = AWQConfig

5 changes: 3 additions & 2 deletions vllm/model_executor/layers/quantization/awq.py
@@ -2,10 +2,11 @@

import torch
from torch.nn.parameter import Parameter
-if torch.cuda.is_available() and torch.version.hip:
+from vllm.utils import is_hip
+if is_hip():
# do something specific for HIP
print("Warning: vLLM does not support AWQ on ROCm.")
-elif torch.cuda.is_available() and torch.version.cuda:
+else:
from vllm._C import ops

from vllm.model_executor.layers.linear import (LinearMethodBase,
10 changes: 4 additions & 6 deletions vllm/model_executor/layers/quantization/squeezellm.py
@@ -7,6 +7,7 @@
from vllm.model_executor.layers.linear import (LinearMethodBase,
set_weight_attrs)
from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
+from vllm.utils import is_hip


class SqueezeLLMConfig(QuantizationConfig):
@@ -114,14 +115,11 @@ def apply_weights(self,
lookup_table = weights["lookup_table"]
out_shape = x.shape[:-1] + (qweight.shape[-1], )
reshaped_x = x.reshape(-1, x.shape[-1])
-if torch.cuda.is_available() and torch.version.hip:
-out_float = torch.zeros(out_shape,
-device="cuda",
-dtype=torch.float)
+if is_hip():
+out_float = torch.zeros(out_shape, device="cuda", dtype=torch.float)
ops.squeezellm_gemm(reshaped_x, qweight, out_float, lookup_table)
out = out_float.to(dtype=torch.float16)
-# do something specific for HIP
-elif torch.cuda.is_available() and torch.version.cuda:
+else:
# NOTE: The output tensor should be zero-initialized.
out = torch.zeros(out_shape, device="cuda", dtype=torch.float16)
ops.squeezellm_gemm(reshaped_x, qweight, out, lookup_table)
3 changes: 2 additions & 1 deletion vllm/model_executor/model_loader.py
@@ -10,6 +10,7 @@
from vllm.model_executor.models import *
from vllm.model_executor.weight_utils import (get_quant_config,
initialize_dummy_weights)
+from vllm.utils import is_hip

# TODO(woosuk): Lazy-load the model classes.
_MODEL_REGISTRY = {
@@ -44,7 +45,7 @@
# in models such as Mistral
"MistralForCausalLM",
]
-if torch.version.hip:
+if is_hip():
for rocm_model in _ROCM_DISABLED_MODELS:
del _MODEL_REGISTRY[rocm_model]

4 changes: 4 additions & 0 deletions vllm/utils.py
@@ -53,3 +53,7 @@ def random_uuid() -> str:
def in_wsl() -> bool:
# Reference: https://github.com/microsoft/WSL/issues/4071
return "microsoft" in " ".join(uname()).lower()


+def is_hip():
+return torch.version.hip
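As a usage note rather than part of the diff: torch.version.hip is None on CUDA builds of PyTorch and a version string on ROCm builds, so callers are expected to treat the helper's return value as a boolean gate. A minimal sketch, assuming a PyTorch install is present:

import torch
from vllm.utils import is_hip

if is_hip():
    # ROCm/HIP build of PyTorch: torch.version.hip is a version string.
    print("ROCm build, HIP version:", torch.version.hip)
else:
    # CUDA (or CPU) build: torch.version.hip is None, so is_hip() is falsy.
    print("Non-ROCm build, CUDA version:", torch.version.cuda)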