vllm-project · simon-mo · May 2, 2024 · Apr 22, 2024 · Apr 22, 2024 · Apr 22, 2024
diff --git a/.buildkite/run-amd-test.sh b/.buildkite/run-amd-test.sh
@@ -1,10 +1,11 @@
-# This script build the ROCm docker image and run the API server inside the container.
-# It serves a sanity check for compilation and basic model usage.
+# This script builds the ROCm docker image and runs tests inside it.
 set -ex
 
 # Print ROCm version
+echo "--- ROCm info"
 rocminfo
 
+echo "--- Resetting GPUs"
 
 echo "reset" > /opt/amdgpu/etc/gpu_state
 
@@ -16,37 +17,28 @@ while true; do
         fi
 done
 
+echo "--- Building container"
+sha=$(git rev-parse --short HEAD)
+container_name=rocm_${sha}
+docker build \
+        -t ${container_name} \
+        -f Dockerfile.rocm \
+        --progress plain \
+        .
+
+# Best-effort cleanup of the per-commit container and image. The two removals are
+# independent so the image is still deleted when removing the container succeeds.
+remove_docker_container() { docker rm -f "${container_name}" || true; docker image rm -f "${container_name}" || true; }
+trap remove_docker_container EXIT
 
+echo "--- Running container"
 
-# Try building the docker image
-docker build -t rocm -f Dockerfile.rocm .
-
-# Setup cleanup
-remove_docker_container() { docker rm -f rocm || true; }
-trap remove_docker_container EXIT
-remove_docker_container
-
-# Run the image
-export HIP_VISIBLE_DEVICES=1
-docker run --device /dev/kfd --device /dev/dri --network host -e HIP_VISIBLE_DEVICES --name rocm rocm python3 -m vllm.entrypoints.api_server &
-
-# Wait for the server to start
-wait_for_server_to_start() {
-    timeout=300
-    counter=0
-
-    while [ "$(curl -s -o /dev/null -w ''%{http_code}'' localhost:8000/health)" != "200" ]; do
-        sleep 1
-        counter=$((counter + 1))
-        if [ $counter -ge $timeout ]; then
-            echo "Timeout after $timeout seconds"
-            break
-        fi
-    done
-}
-wait_for_server_to_start
+docker run \
+        --device /dev/kfd --device /dev/dri \
+        --network host \
+        --rm \
+        -e HF_TOKEN \
+        --name ${container_name} \
+        ${container_name} \
+        /bin/bash -c "$1"
 
-# Test a simple prompt
-curl -X POST -H "Content-Type: application/json" \
-    localhost:8000/generate \
-    -d '{"prompt": "San Francisco is a"}'
diff --git a/.buildkite/run-benchmarks.sh b/.buildkite/run-benchmarks.sh
@@ -53,6 +53,11 @@ echo '```' >> benchmark_results.md
 tail -n 20 benchmark_serving.txt >> benchmark_results.md # last 20 lines
 echo '```' >> benchmark_results.md
 
+# if the agent binary is not found, skip uploading the results, exit 0
+if [ ! -f /workspace/buildkite-agent ]; then
+    exit 0
+fi
+
 # upload the results to buildkite
 /workspace/buildkite-agent annotate --style "info" --context "benchmark-results" < benchmark_results.md
 

diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
@@ -19,6 +19,7 @@ steps:
   - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_chunked_prefill.py
 
 - label: Core Test
+  mirror_hardwares: [amd]
   command: pytest -v -s core
 
 - label: Distributed Comm Ops Test
@@ -29,6 +30,7 @@ steps:
 - label: Distributed Tests
   working_dir: "/vllm-workspace/tests/distributed"
   num_gpus: 2 # only support 1 or 2 for now.
+  mirror_hardwares: [amd]
   commands:
   - pytest -v -s test_pynccl.py
   - pytest -v -s test_pynccl_library.py
@@ -38,6 +40,7 @@ steps:
   - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf pytest -v -s test_chunked_prefill_distributed.py
 
 - label: Engine Test
+  mirror_hardwares: [amd]
   command: pytest -v -s engine tokenization test_sequence.py test_config.py test_logger.py
 
 - label: Entrypoints Test
@@ -48,6 +51,7 @@ steps:
 
 - label: Examples Test
   working_dir: "/vllm-workspace/examples"
+  mirror_hardwares: [amd]
   commands:
     # install aws cli for llava_example.py
     - pip install awscli
@@ -61,29 +65,35 @@ steps:
   parallelism: 4
 
 - label: Models Test
+  mirror_hardwares: [amd]
   commands:
     - bash ../.buildkite/download-images.sh
     - pytest -v -s models --ignore=models/test_llava.py --ignore=models/test_mistral.py
 
 - label: Llava Test
+  mirror_hardwares: [amd]
   commands:
     - bash ../.buildkite/download-images.sh
     - pytest -v -s models/test_llava.py
 
 - label: Prefix Caching Test
+  mirror_hardwares: [amd]
   commands:
     - pytest -v -s prefix_caching
 
 - label: Samplers Test
   command: pytest -v -s samplers
 
 - label: LogitsProcessor Test
+  mirror_hardwares: [amd]
   command: pytest -v -s test_logits_processor.py
 
 - label: Worker Test
+  mirror_hardwares: [amd]
   command: pytest -v -s worker
 
 - label: Speculative decoding tests
+  mirror_hardwares: [amd]
   command: pytest -v -s spec_decode
 
 - label: LoRA Test %N
@@ -101,6 +111,7 @@ steps:
 
 - label: Benchmarks
   working_dir: "/vllm-workspace/.buildkite"
+  mirror_hardwares: [amd]
   commands:
   - pip install aiohttp
   - bash run-benchmarks.sh

diff --git a/.buildkite/test-template.j2 b/.buildkite/test-template.j2
@@ -16,18 +16,29 @@ steps:
           limit: 5
   - wait
 
-  - label: "AMD Test"
-    agents:
-      queue: amd
-    command: bash .buildkite/run-amd-test.sh
+  - group: "AMD Tests"
+    depends_on: ~
+    steps:
+    {% for step in steps %}
+    {% if step.mirror_hardwares and "amd" in step.mirror_hardwares %}
+      - label: "AMD: {{ step.label }}"  # AMD mirror of a step; the command argument has no inner quotes because run-amd-test.sh passes $1 verbatim to bash -c
+        agents:
+          queue: amd
+        command: bash .buildkite/run-amd-test.sh "cd {{ (step.working_dir or default_working_dir) | safe  }} && {{ step.command  or (step.commands | join(' && ')) | safe }}"
+        env:
+          DOCKER_BUILDKIT: "1"
+    {% endif %}
+    {% endfor %}
 
   - label: "Neuron Test"
+    depends_on: ~
     agents:
       queue: neuron
     command: bash .buildkite/run-neuron-test.sh
     soft_fail: true
 
-  - label: "CPU Test"
+  - label: "Intel Test"
+    depends_on: ~
     command: bash .buildkite/run-cpu-test.sh
 
   {% for step in steps %}

diff --git a/Dockerfile.rocm b/Dockerfile.rocm
@@ -46,7 +46,7 @@ RUN apt-get update && apt-get install -y \
 
 ### Mount Point ###
 # When launching the container, mount the code directory to /app
-ARG APP_MOUNT=/app
+ARG APP_MOUNT=/vllm-workspace
 VOLUME [ ${APP_MOUNT} ]
 WORKDIR ${APP_MOUNT}
 
@@ -89,15 +89,16 @@ RUN if [ "$BUILD_TRITON" = "1" ]; then \
     && cd ../..; \
     fi
 
-COPY ./ /app/vllm
+WORKDIR /vllm-workspace
+COPY . .
 
 RUN python3 -m pip install --upgrade pip numba
 
-RUN cd /app \
-    && cd vllm \
-    && pip install -U -r requirements-rocm.txt \
-    && patch /opt/rocm/include/hip/amd_detail/amd_hip_bf16.h /app/vllm/rocm_patch/rocm_bf16.patch \
+RUN --mount=type=cache,target=/root/.cache/pip \
+    pip install -U -r requirements-rocm.txt \
+    && patch /opt/rocm/include/hip/amd_detail/amd_hip_bf16.h ./rocm_patch/rocm_bf16.patch \
     && python3 setup.py install \
+    && cp build/lib.linux-x86_64-cpython-39/vllm/_C.cpython-39-x86_64-linux-gnu.so vllm/ \
     && cd ..
 
 RUN python3 -m pip install --upgrade pip