
Commit 5f9f5cf (parent: fae2464)

Update to use the ROCm 6.0 / torch 2.1.1 image as the default base docker image

File tree

4 files changed: +35 -19 lines changed

  Dockerfile.rocm
  docs/source/getting_started/amd-installation.rst
  vllm/utils.py
  vllm/worker/worker.py

Dockerfile.rocm

Lines changed: 9 additions & 6 deletions
@@ -1,15 +1,14 @@
 # default base image
-ARG BASE_IMAGE="rocm/pytorch:rocm5.7_ubuntu22.04_py3.10_pytorch_2.0.1"
+ARG BASE_IMAGE="rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1"
 
 FROM $BASE_IMAGE
 
-ARG BASE_IMAGE="rocm/pytorch:rocm5.7_ubuntu22.04_py3.10_pytorch_2.0.1"
+ARG BASE_IMAGE="rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1"
 
 RUN echo "Base image is $BASE_IMAGE"
 
 # BASE_IMAGE for ROCm_5.7: "rocm/pytorch:rocm5.7_ubuntu22.04_py3.10_pytorch_2.0.1"
-# BASE_IMAGE for ROCm_6.0: "rocm/pytorch:rocm6.0_ubuntu22.04_py3.9_pytorch_2.0.1"
-# Testing image: "compute-artifactory.amd.com:5000/rocm-plus-docker/framework/release-public:rocm6.0_ubuntu20.04_py3.9_pytorch_rocm6.0_internal_testing"
+# BASE_IMAGE for ROCm_6.0: "rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1"
 
 # this does not always work for all rocm versions
 RUN LLVM_GFX_ARCH=$(/opt/rocm/llvm/bin/amdgpu-offload-arch) && \
@@ -26,7 +25,6 @@ RUN echo "FA_BRANCH is $FA_BRANCH"
 # Install some basic utilities
 RUN apt-get update && apt-get install python3 python3-pip -y
 
-
 # Install some basic utilities
 RUN apt-get update && apt-get install -y \
     curl \
@@ -72,7 +70,12 @@ RUN mkdir libs \
 COPY ./ /app/vllm
 
 RUN python3 -m pip install --upgrade pip
-RUN pip install xformers==0.0.23 --no-deps
+RUN python3 -m pip install xformers==0.0.23 --no-deps
+
+# Error related to odd state for numpy 1.20.3 where there is no METADATA etc, but an extra LICENSES_bundled.txt.
+# Manually removed it so that later steps of numpy upgrade can continue
+RUN if [ "$BASE_IMAGE" = "rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1" ]; then \
+    rm -rf /opt/conda/envs/py_3.9/lib/python3.9/site-packages/numpy-1.20.3.dist-info/; fi
 
 RUN cd /app \
     && cd vllm \
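
For anyone debugging the numpy issue addressed by the new RUN step, here is a small sketch (not part of the commit; the conda path is simply the one used in the RUN step above) that reports whether the dist-info directory is in the broken state described in the comment:

    # Sketch: detect the broken numpy-1.20.3 dist-info state described above.
    # The path mirrors the RUN step in Dockerfile.rocm; adjust for other base images.
    from pathlib import Path

    dist_info = Path("/opt/conda/envs/py_3.9/lib/python3.9/site-packages/numpy-1.20.3.dist-info")
    if dist_info.is_dir() and not (dist_info / "METADATA").is_file():
        print(f"{dist_info} has no METADATA; remove it before upgrading numpy")
    else:
        print("numpy dist-info looks consistent (or has already been removed)")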

docs/source/getting_started/amd-installation.rst

Lines changed: 10 additions & 5 deletions
@@ -97,22 +97,22 @@ You can build and install vLLM from source:
 
 Build a docker image from `Dockerfile.rocm`, and launch a docker container.
 
-The `Dokerfile.rocm` is designed to support both ROCm 5.7 and ROCm 6.0. It provides flexibility to customize the build of docker image using the following arguments:
+The `Dokerfile.rocm` is designed to support both ROCm 5.7 and ROCm 6.0 and later versions. It provides flexibility to customize the build of docker image using the following arguments:
 
-* `BASE_IMAGE`: specifies the base image used when running `docker build`, specifically the PyTorch on ROCm base image. We have tested ROCm 5.7 and ROCm 6.0. The default is `rocm/pytorch:rocm5.7_ubuntu22.04_py3.10_pytorch_2.0.1`
+* `BASE_IMAGE`: specifies the base image used when running `docker build`, specifically the PyTorch on ROCm base image. We have tested ROCm 5.7 and ROCm 6.0. The default is `rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1`
 * `FX_GFX_ARCHS`: specifies the GFX architecture that is used to build flash-attention, for example, `gfx90a;gfx942` for MI200 and MI300. The default is `gfx90a;gfx942`
 * `FA_BRANCH`: specifies the branch used to build the flash-attention in `ROCmSoftwarePlatform's flash-attention repo <https://github.com/ROCmSoftwarePlatform/flash-attention>`_. The default is `3d2b6f5`
 
 Their values can be passed in when running `docker build` with `--build-arg` options.
 
-For example, to build docker image for vllm on ROCm 6.0, you can run:
+For example, to build docker image for vllm on ROCm 5.7, you can run:
 
 .. code-block:: console
 
-    $ docker build --build-arg BASE_IMAGE="compute-artifactory.amd.com:5000/rocm-plus-docker/framework/release-public:rocm6.0_ubuntu20.04_py3.9_pytorch_rocm6.0_internal_testing" \
+    $ docker build --build-arg BASE_IMAGE="rocm/pytorch:rocm5.7_ubuntu22.04_py3.10_pytorch_2.0.1" \
        -f Dockerfile.rocm -t vllm-rocm .
 
-To build vllm on ROCm 5.7, you can use the default:
+To build vllm on ROCm 6.0, you can use the default:
 
 .. code-block:: console
 
@@ -161,3 +161,8 @@ Alternatively, if you plan to install vLLM-ROCm on a local machine or start from
    $ cd vllm
    $ pip install -U -r requirements-rocm.txt
    $ python setup.py install # This may take 5-10 minutes.
+
+.. note::
+
+    - You may need to turn on the "--enforce-eager" flag if you experience process hang when running the `run_benchmark.py` script to test your installation.
+
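
The "--enforce-eager" flag mentioned in the new note corresponds to the `enforce_eager` option of the vLLM Python API. A minimal sketch of testing an installation that way (the model name is only an example):

    # Minimal sketch: disable CUDA/HIP graph capture, the API equivalent of
    # passing --enforce-eager on the command line. Model name is illustrative.
    from vllm import LLM, SamplingParams

    llm = LLM(model="facebook/opt-125m", enforce_eager=True)
    outputs = llm.generate(["Hello, my name is"], SamplingParams(max_tokens=16))
    print(outputs[0].outputs[0].text)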

vllm/utils.py

Lines changed: 1 addition & 0 deletions
@@ -43,6 +43,7 @@ def get_max_shared_memory_bytes(gpu: int = 0) -> int:
         cudaDevAttrMaxSharedMemoryPerBlockOptin, gpu)
     if max_shared_mem == 0 and is_hip():
         # got 0 sometimes when using 74
+        print("get_max_shared_memory_bytes got 0, trying to use value 97 for ROCm")
         cudaDevAttrMaxSharedMemoryPerBlockOptin = 97
         max_shared_mem = cuda_utils.get_device_attribute(
             cudaDevAttrMaxSharedMemoryPerBlockOptin, gpu)
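
The surrounding function queries device attribute 74 (cudaDevAttrMaxSharedMemoryPerBlockOptin) and, when that comes back as 0 on ROCm, retries with attribute 97. A standalone sketch of that fallback, with a hypothetical `query_attr` standing in for vllm's internal `cuda_utils.get_device_attribute` binding:

    # Illustration only: `query_attr` stands in for cuda_utils.get_device_attribute.
    def shared_memory_bytes_with_fallback(query_attr, gpu: int = 0, on_hip: bool = True) -> int:
        primary_attr = 74   # cudaDevAttrMaxSharedMemoryPerBlockOptin
        fallback_attr = 97  # attribute observed to return a usable value on ROCm
        max_shared_mem = query_attr(primary_attr, gpu)
        if max_shared_mem == 0 and on_hip:
            print("get_max_shared_memory_bytes got 0, trying to use value 97 for ROCm")
            max_shared_mem = query_attr(fallback_attr, gpu)
        return max_shared_mem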

vllm/worker/worker.py

Lines changed: 15 additions & 8 deletions
@@ -65,20 +65,27 @@ def init_model(self) -> None:
 
         # This env var set by Ray causes exceptions with graph building.
         os.environ.pop("NCCL_ASYNC_ERROR_HANDLING", None)
-
-        # This caused problem for rank non-0 (for example, 1 when -tp 2), when calling torch.cuda.set_device(self.device) in ROCm.
-        # HIP Error invalid device ordial
-        # where CUDA_VISIABLE_DEVICES=0,1, and set_device with cuda:1.
+
         try:
             self.device = torch.device(f"cuda:{self.local_rank}")
             torch.cuda.set_device(self.device)
         except RuntimeError as re:
+            # On certain versions, we experienced RuntimeError for rank non-0 when running with tensor-parallel option on ROCm.
+            # For example, for option, -tp 2, calling torch.cuda.set_device(self.device) for device 1 would throw the following error:
+            # HIP Error invalid device ordial
+            # By debugging, we found that CUDA_VISIABLE_DEVICES=0,1, but device_count is 1 and env HIP_VISIBLE_DEVICES is None.
+            # below is a work around when that happens so that we can continue
+            device_count = torch.cuda.device_count()
             print(
-                f"RuntimeError {re} in cuda.set_device {self.device}, visible device={os.environ.get('CUDA_VISIBLE_DEVICES')}. "
+                f"RuntimeError {re} in cuda.set_device {self.device}, device_count={device_count}. "
             )
-            self.device = torch.device("cuda:0")
-            print(f"Trying get around by set_device to {self.device}")
-            torch.cuda.set_device(self.device)
+            if device_count > 0:
+                self.device = torch.device("cuda:0")
+                print(f"Trying get around by set_device to {self.device}")
+                torch.cuda.set_device(self.device)
+            else:
+                # no work around is available
+                raise
 
         _check_if_gpu_supports_dtype(self.model_config.dtype)
 
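As a self-contained illustration of the fallback introduced above (not the vllm Worker code itself; only torch is assumed), the pattern reduces to:

    # Sketch of the ROCm set_device workaround: try the rank's device, fall back
    # to cuda:0 if any device is visible, otherwise re-raise. Requires only torch.
    import torch

    def set_device_with_fallback(local_rank: int) -> torch.device:
        try:
            device = torch.device(f"cuda:{local_rank}")
            torch.cuda.set_device(device)
            return device
        except RuntimeError as re:
            device_count = torch.cuda.device_count()
            print(f"RuntimeError {re} in cuda.set_device cuda:{local_rank}, "
                  f"device_count={device_count}.")
            if device_count > 0:
                device = torch.device("cuda:0")
                print(f"Trying to get around it by set_device to {device}")
                torch.cuda.set_device(device)
                return device
            raise  # no workaround available without a visible device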

0 commit comments