ROCm vLLM CI fix #468


Open
wants to merge 20 commits into base: main

Commits (20)
ea787b0
Test build to check processing by different K8 queues.
Alexei-V-Ivanov-AMD Feb 4, 2025
01dfdda
Testing.
Alexei-V-Ivanov-AMD Feb 5, 2025
7f80bf8
Copying over the tests directory to enable CI testing.
Alexei-V-Ivanov-AMD Feb 5, 2025
14aaf35
Comparing with MI250 in the "mi250_8xGPU" queue.
Alexei-V-Ivanov-AMD Feb 5, 2025
a106489
Building with "test" as a --target
Alexei-V-Ivanov-AMD Feb 5, 2025
6acfc3a
Fixing working directory property.
Alexei-V-Ivanov-AMD Feb 5, 2025
172e0e8
Dummy alternation to confirm trouble with simultaneous test execution.
Alexei-V-Ivanov-AMD Feb 5, 2025
114e750
Dummy alternation to trigger a re-build and re-test.
Alexei-V-Ivanov-AMD Feb 6, 2025
0fc4050
Updating rocm dockerhub repo.
Alexei-V-Ivanov-AMD Feb 27, 2025
b2e3e12
Update run-amd-test.sh
Alexei-V-Ivanov-AMD Mar 3, 2025
cc41fa6
.
Alexei-V-Ivanov-AMD Mar 4, 2025
2c64618
Merge branch 'k8test' of github.com:ROCm/vllm into k8test
Alexei-V-Ivanov-AMD Mar 4, 2025
4022a8a
Importing Test improvements (Sage's PR #13970 to vllm-project).
Alexei-V-Ivanov-AMD Mar 4, 2025
84ea7b9
Restoring access to amd_gpu_1 queue
Alexei-V-Ivanov-AMD Mar 4, 2025
fbb39f3
Redirecting to the stable test-processing queues.
Alexei-V-Ivanov-AMD Mar 10, 2025
68c7701
Merge branch 'main' into rocm-vllm-ci-fix
Alexei-V-Ivanov-AMD Mar 10, 2025
e210fb7
Fix building architectures.
Alexei-V-Ivanov-AMD Mar 10, 2025
f591d18
Merge branch 'rocm-vllm-ci-fix' of github.com:ROCm/vllm into rocm-vll…
Alexei-V-Ivanov-AMD Mar 10, 2025
5e31d5c
Removing junk.
Alexei-V-Ivanov-AMD Mar 10, 2025
a699a11
Merging in the latest merge from vllm-project to ROCm (#472)
Alexei-V-Ivanov-AMD Mar 11, 2025
@@ -426,7 +426,7 @@ main() {

pip install -U transformers

- pip install -r requirements-dev.txt
+ pip install -r requirements/dev.txt
which genai-perf

# check storage
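Note: this change, and several like it below, track the upstream move of the pinned requirement files from top-level requirements-*.txt files into a requirements/ directory. A hedged sketch of the resulting layout, inferred only from the paths touched in this PR:

  ls requirements/
  # dev.txt  docs.txt  rocm.txt  test.in  test.txt   (other files may exist; not shown in this diff)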
9 changes: 7 additions & 2 deletions .buildkite/run-amd-test.sh
@@ -57,7 +57,7 @@ while true; do
done

echo "--- Pulling container"
- image_name="rocm/vllm-ci:${BUILDKITE_COMMIT}"
+ image_name="rocm/vllm-ci-private:${BUILDKITE_COMMIT}"
container_name="rocm_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
docker pull "${image_name}"
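Note: the container name appends a random 10-character alphanumeric suffix so that concurrent jobs testing the same commit on one node cannot collide. The generator in isolation:

  # emits 10 random characters drawn from A-Za-z0-9, then a newline
  tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo
  # e.g. q3ZrT0aBx9 (varies per run)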

@@ -93,7 +93,12 @@ if [[ $commands == *" kernels "* ]]; then
--ignore=kernels/test_rand.py \
--ignore=kernels/test_sampler.py \
--ignore=kernels/test_cascade_flash_attn.py \
--ignore=kernels/test_mamba_mixer2.py"
--ignore=kernels/test_mamba_mixer2.py \
--ignore=kernels/test_aqlm.py \
--ignore=kernels/test_machete_mm.py \
--ignore=kernels/test_mha_attn.py \
--ignore=kernels/test_block_fp8.py \
--ignore=kernels/test_permute_cols.py"
fi

#ignore certain Entrypoints tests
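Note: each --ignore flag removes a single file from pytest collection while the rest of the kernels suite still runs. A hedged sketch of the effect, with the command assembly simplified relative to the real run-amd-test.sh logic:

  # illustrative only: the script splices these flags into the step's command string
  pytest -v -s kernels \
    --ignore=kernels/test_mamba_mixer2.py \
    --ignore=kernels/test_machete_mm.py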
2 changes: 1 addition & 1 deletion .buildkite/run-cpu-test.sh
@@ -35,7 +35,7 @@ function cpu_tests() {
# Run basic model test
docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
set -e
- pip install -r vllm/requirements-test.txt
+ pip install -r vllm/requirements/test.txt
pytest -v -s tests/models/decoder_only/language -m cpu_model
pytest -v -s tests/models/embedding/language -m cpu_model
pytest -v -s tests/models/encoder_decoder/language -m cpu_model
11 changes: 8 additions & 3 deletions .buildkite/test-pipeline.yaml
@@ -35,7 +35,7 @@ steps:
fast_check: true
no_gpu: True
commands:
- - pip install -r requirements-docs.txt
+ - pip install -r ../../requirements/docs.txt
- SPHINXOPTS=\"-W\" make html
# Check API reference (if it fails, you may have missing mock imports)
- grep \"sig sig-object py\" build/html/api/inference_params.html
@@ -78,6 +78,7 @@ steps:
- tests/basic_correctness/test_preemption
- tests/basic_correctness/test_cumem.py
commands:
+ - export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -v -s basic_correctness/test_cumem.py
- pytest -v -s basic_correctness/test_basic_correctness.py
- pytest -v -s basic_correctness/test_cpu_offload.py
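Note: VLLM_WORKER_MULTIPROC_METHOD=spawn switches Python multiprocessing from fork to spawn for vLLM worker processes, so workers do not inherit an already-initialized GPU context from the parent pytest process. A minimal hedged illustration of the pattern:

  export VLLM_WORKER_MULTIPROC_METHOD=spawn   # read by vLLM when launching workers
  pytest -v -s basic_correctness/test_basic_correctness.py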
@@ -115,6 +116,7 @@ steps:
- tests/entrypoints/test_chat_utils
- tests/entrypoints/offline_mode
commands:
+ - export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_generate_multiple_loras.py --ignore=entrypoints/llm/test_guided_generate.py --ignore=entrypoints/llm/test_collective_rpc.py
- pytest -v -s entrypoints/llm/test_lazy_outlines.py # it needs a clean process
- pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
@@ -146,8 +148,10 @@ steps:
- pytest -v -s spec_decode/e2e/test_integration_dist_tp4.py
# TODO: create a dedicated test section for multi-GPU example tests
# when we have multiple distributed example tests
- - python3 ../examples/offline_inference/rlhf.py
- - RAY_DEDUP_LOGS=0 python3 ../examples/offline_inference/rlhf_colocate.py
+ - pushd ../examples/offline_inference
+ - python3 rlhf.py
+ - RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
+ - popd
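Note: wrapping the RLHF examples in pushd/popd runs them from their own directory, so relative paths inside the scripts resolve. A hedged equivalent that avoids shell state, using the same paths as the diff above:

  (cd ../examples/offline_inference && python3 rlhf.py)
  (cd ../examples/offline_inference && RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py)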

- label: Metrics, Tracing Test # 10min
num_gpus: 2
@@ -204,6 +208,7 @@ steps:
- VLLM_USE_V1=1 pytest -v -s v1/engine
- VLLM_USE_V1=1 pytest -v -s v1/sample
- VLLM_USE_V1=1 pytest -v -s v1/worker
+ - VLLM_USE_V1=1 pytest -v -s v1/structured_output
- VLLM_USE_V1=1 pytest -v -s v1/test_stats.py
- VLLM_USE_V1=1 pytest -v -s v1/test_utils.py
# TODO: accuracy does not match, whether setting
12 changes: 10 additions & 2 deletions .buildkite/test-template.j2
@@ -1,13 +1,13 @@
{% set docker_image = "public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT" %}
{% set docker_image_amd = "rocm/vllm-ci:$BUILDKITE_COMMIT" %}
{% set docker_image_amd = "rocm/vllm-ci-private:$BUILDKITE_COMMIT" %}
{% set default_working_dir = "vllm/tests" %}
{% set hf_home = "/root/.cache/huggingface" %}

steps:
- label: ":docker: build image"
depends_on: ~
commands:
- "docker build --build-arg max_jobs=16 --tag {{ docker_image_amd }} -f Dockerfile.rocm --target test --progress plain ."
- "docker build --build-arg max_jobs=16 --tag {{ docker_image_amd }} -f Dockerfile.rocm --build-arg ARG_PYTORCH_ROCM_ARCH='gfx90a;gfx942' --target test --progress plain ."
- "docker push {{ docker_image_amd }}"
key: "amd-build"
env:
@@ -27,7 +27,15 @@ steps:
depends_on:
- "amd-build"
agents:
+ {% if step.amd_gpus and step.amd_gpus==8%}
+ queue: amd_gpu
+ {% elif step.amd_gpus and step.amd_gpus==4%}
+ queue: amd_gpu
+ {% elif step.amd_gpus and step.amd_gpus==2%}
+ queue: amd_gpu
+ {% else%}
+ queue: amd_gpu
+ {% endif%}
commands:
- bash .buildkite/run-amd-test.sh "cd {{ (step.working_dir or default_working_dir) | safe }} ; {{ step.command or (step.commands | join(" && ")) | safe }}"
env:
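Note: every branch of the new amd_gpus conditional currently resolves to the same amd_gpu queue; the structure just leaves room to route 2-, 4-, and 8-GPU steps to dedicated queues later. After the template renders, each test step reduces to a single wrapper invocation. A hedged example for a step that uses the default working directory (the step command here is illustrative):

  bash .buildkite/run-amd-test.sh "cd vllm/tests ; pytest -v -s basic_correctness"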
15 changes: 15 additions & 0 deletions .github/mergify.yml
@@ -36,6 +36,21 @@ pull_request_rules:
add:
- frontend

+ - name: label-multi-modality
+ description: Automatically apply multi-modality label
+ conditions:
+ - or:
+ - files~=^vllm/multimodal/
+ - files~=^tests/multimodal/
+ - files~=^tests/models/multimodal/
+ - files~=^tests/models/*/audio_language/
+ - files~=^tests/models/*/vision_language/
+ - files=tests/models/test_vision.py
+ actions:
+ label:
+ add:
+ - multi-modality
+
- name: label-structured-output
description: Automatically apply structured-output label
conditions:
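Note: Mergify's files~= conditions are regular expressions matched against each changed file's path. A hedged local sanity check for the first condition of the new rule (file name illustrative):

  # prints the path when the pattern matches; in CI, Mergify would apply the multi-modality label
  echo "vllm/multimodal/processing.py" | grep -E "^vllm/multimodal/"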
2 changes: 1 addition & 1 deletion .github/workflows/scripts/build.sh
@@ -5,7 +5,7 @@ python_executable=python3

# Update paths
# Install requirements
- $python_executable -m pip install -r requirements-rocm.txt
+ $python_executable -m pip install -r requirements/rocm.txt

# Limit the number of parallel jobs to avoid OOM
export MAX_JOBS=1
2 changes: 1 addition & 1 deletion .gitignore
@@ -197,7 +197,7 @@ _build/
hip_compat.h

# Benchmark dataset
- benchmarks/*.json
+ benchmarks/**/*.json

# Linting
actionlint
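Note: benchmarks/**/*.json broadens the old benchmarks/*.json rule to ignore JSON files in nested subdirectories as well. A hedged way to confirm which rule matches a given path (path and line number illustrative):

  git check-ignore -v benchmarks/results/serving/run1.json
  # expected to cite the .gitignore line, e.g. .gitignore:200:benchmarks/**/*.json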
4 changes: 2 additions & 2 deletions .pre-commit-config.yaml
@@ -44,8 +44,8 @@ repos:
rev: 0.6.2
hooks:
- id: pip-compile
- args: [requirements-test.in, -o, requirements-test.txt]
- files: ^requirements-test\.(in|txt)$
+ args: [requirements/test.in, -o, requirements/test.txt]
+ files: ^requirements/test\.(in|txt)$
- repo: local
hooks:
- id: mypy-local
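Note: the pip-compile hook regenerates the pinned requirements/test.txt from requirements/test.in whenever either file changes; judging by the rev, the hook appears to be backed by uv. A hedged manual equivalent under that assumption:

  uv pip compile requirements/test.in -o requirements/test.txt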
2 changes: 1 addition & 1 deletion .readthedocs.yaml
@@ -18,4 +18,4 @@ formats: []
# Optionally declare the Python requirements required to build your docs
python:
install:
- - requirements: docs/requirements-docs.txt
+ - requirements: requirements/docs.txt
80 changes: 54 additions & 26 deletions CMakeLists.txt
@@ -31,7 +31,7 @@ set(ignoreMe "${VLLM_PYTHON_PATH}")
set(PYTHON_SUPPORTED_VERSIONS "3.9" "3.10" "3.11" "3.12")

# Supported NVIDIA architectures.
- set(CUDA_SUPPORTED_ARCHS "7.0;7.2;7.5;8.0;8.6;8.7;8.9;9.0")
+ set(CUDA_SUPPORTED_ARCHS "7.0;7.2;7.5;8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0")

# Supported AMD GPU architectures.
set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1200;gfx1201")
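Note: a hedged example of narrowing a local build to a subset of these supported architectures via the standard PyTorch arch-list variables (values illustrative):

  # NVIDIA: build only Hopper and Blackwell kernels
  TORCH_CUDA_ARCH_LIST="9.0;10.0" pip install -e . --no-build-isolation
  # AMD: build only MI200/MI300 kernels, matching the ARG_PYTORCH_ROCM_ARCH build arg above
  PYTORCH_ROCM_ARCH="gfx90a;gfx942" pip install -e . --no-build-isolation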
@@ -312,7 +312,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
# Only build Marlin kernels if we are building for at least some compatible archs.
# Keep building Marlin for 9.0 as there are some group sizes and shapes that
# are not supported by Machete yet.
- cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;8.6;8.7;8.9;9.0" "${CUDA_ARCHS}")
+ cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0" "${CUDA_ARCHS}")
if (MARLIN_ARCHS)
set(MARLIN_SRCS
"csrc/quantization/fp8/fp8_marlin.cu"
@@ -334,7 +334,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")

# Only build AllSpark kernels if we are building for at least some compatible archs.
cuda_archs_loose_intersection(ALLSPARK_ARCHS "8.0;8.6;8.7;8.9" "${CUDA_ARCHS}")
- if (ALLSPARK_ARCHS)
+ if (${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND ALLSPARK_ARCHS)
set(ALLSPARK_SRCS
"csrc/quantization/gptq_allspark/allspark_repack.cu"
"csrc/quantization/gptq_allspark/allspark_qgemm_w8a16.cu")
@@ -345,46 +345,74 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
message(STATUS "Building AllSpark kernels for archs: ${ALLSPARK_ARCHS}")
else()
message(STATUS "Not building AllSpark kernels as no compatible archs found"
" in CUDA target architectures")
" in CUDA target architectures, or CUDA not >= 12.0")
endif()


+ set(SCALED_MM_3X_ARCHS)
# The cutlass_scaled_mm kernels for Hopper (c3x, i.e. CUTLASS 3.x) require
- # CUDA 12.0 or later (and only work on Hopper, 9.0a for now).
- cuda_archs_loose_intersection(SCALED_MM_3X_ARCHS "9.0a" "${CUDA_ARCHS}")
- if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_3X_ARCHS)
+ # CUDA 12.0 or later
+ cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a;" "${CUDA_ARCHS}")
+ if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_ARCHS)
set(SRCS
"csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu"
"csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm90.cu"
"csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_fp8.cu"
"csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_int8.cu"
"csrc/quantization/cutlass_w8a8/c3x/scaled_mm_azp_sm90_int8.cu"
"csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm90_fp8.cu")
set_gencode_flags_for_srcs(
SRCS "${SRCS}"
CUDA_ARCHS "${SCALED_MM_3X_ARCHS}")
CUDA_ARCHS "${SCALED_MM_ARCHS}")
list(APPEND VLLM_EXT_SRC "${SRCS}")
- list(APPEND VLLM_GPU_FLAGS "-DENABLE_SCALED_MM_C3X=1")
- message(STATUS "Building scaled_mm_c3x for archs: ${SCALED_MM_3X_ARCHS}")
+ list(APPEND VLLM_GPU_FLAGS "-DENABLE_SCALED_MM_SM90=1")
+ # Let scaled_mm_c2x know it doesn't need to build these arches
+ list(APPEND SCALED_MM_3X_ARCHS "${SCALED_MM_ARCHS}")
+ message(STATUS "Building scaled_mm_c3x_sm90 for archs: ${SCALED_MM_ARCHS}")
else()
- if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_3X_ARCHS)
- message(STATUS "Not building scaled_mm_c3x as CUDA Compiler version is "
+ if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_ARCHS)
+ message(STATUS "Not building scaled_mm_c3x_sm90 as CUDA Compiler version is "
"not >= 12.0, we recommend upgrading to CUDA 12.0 or "
"later if you intend on running FP8 quantized models on "
"Hopper.")
else()
message(STATUS "Not building scaled_mm_c3x as no compatible archs found "
message(STATUS "Not building scaled_mm_c3x_sm90 as no compatible archs found "
"in CUDA target architectures")
endif()
endif()

- # clear SCALED_MM_3X_ARCHS so the scaled_mm_c2x kernels know we didn't
- # build any 3x kernels
- set(SCALED_MM_3X_ARCHS)
+ # The cutlass_scaled_mm kernels for Blackwell (c3x, i.e. CUTLASS 3.x) require
+ # CUDA 12.8 or later
+ cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a;12.0a" "${CUDA_ARCHS}")
+ if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND SCALED_MM_ARCHS)
+ set(SRCS
+ "csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm100.cu"
+ "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm100_fp8.cu"
+ )
+ set_gencode_flags_for_srcs(
+ SRCS "${SRCS}"
+ CUDA_ARCHS "${SCALED_MM_ARCHS}")
+ list(APPEND VLLM_EXT_SRC "${SRCS}")
+ list(APPEND VLLM_GPU_FLAGS "-DENABLE_SCALED_MM_SM100=1")
+ # Let scaled_mm_c2x know it doesn't need to build these arches
+ list(APPEND SCALED_MM_3X_ARCHS "${SCALED_MM_ARCHS}")
+ message(STATUS "Building scaled_mm_c3x_sm100 for archs: ${SCALED_MM_ARCHS}")
+ else()
+ if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND SCALED_MM_ARCHS)
+ message(STATUS "Not building scaled_mm_c3x_sm100 as CUDA Compiler version is "
+ "not >= 12.8, we recommend upgrading to CUDA 12.8 or "
+ "later if you intend on running FP8 quantized models on "
+ "Blackwell.")
+ else()
+ message(STATUS "Not building scaled_mm_c3x_sm100 as no compatible archs found "
+ "in CUDA target architectures")
+ endif()
+ endif()

#
# For the cutlass_scaled_mm kernels we want to build the c2x (CUTLASS 2.x)
# kernels for the remaining archs that are not already built for 3x.
cuda_archs_loose_intersection(SCALED_MM_2X_ARCHS
"7.5;8.0;8.6;8.7;8.9;9.0" "${CUDA_ARCHS}")
"7.5;8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0" "${CUDA_ARCHS}")
# subtract out the archs that are already built for 3x
list(REMOVE_ITEM SCALED_MM_2X_ARCHS ${SCALED_MM_3X_ARCHS})
if (SCALED_MM_2X_ARCHS)
@@ -409,17 +437,17 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
# 2:4 Sparse Kernels

# The 2:4 sparse kernels cutlass_scaled_sparse_mm and cutlass_compressor
- # require CUDA 12.2 or later (and only work on Hopper, 9.0a for now).
- if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.2 AND SCALED_MM_3X_ARCHS)
+ # require CUDA 12.2 or later (and only work on Hopper and Blackwell).
+ if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.2 AND SCALED_MM_ARCHS)
set(SRCS "csrc/sparse/cutlass/sparse_scaled_mm_c3x.cu")
set_gencode_flags_for_srcs(
SRCS "${SRCS}"
CUDA_ARCHS "${SCALED_MM_3X_ARCHS}")
CUDA_ARCHS "${SCALED_MM_ARCHS}")
list(APPEND VLLM_EXT_SRC "${SRCS}")
list(APPEND VLLM_GPU_FLAGS "-DENABLE_SPARSE_SCALED_MM_C3X=1")
message(STATUS "Building sparse_scaled_mm_c3x for archs: ${SCALED_MM_3X_ARCHS}")
message(STATUS "Building sparse_scaled_mm_c3x for archs: ${SCALED_MM_ARCHS}")
else()
- if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.2 AND SCALED_MM_3X_ARCHS)
+ if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.2 AND SCALED_MM_ARCHS)
message(STATUS "Not building sparse_scaled_mm_c3x kernels as CUDA Compiler version is "
"not >= 12.2, we recommend upgrading to CUDA 12.2 or later "
"if you intend on running FP8 sparse quantized models on Hopper.")
@@ -434,8 +462,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND FP4_ARCHS)
set(SRCS
"csrc/quantization/fp4/nvfp4_quant_kernels.cu"
"csrc/quantization/fp4/nvfp4_scaled_mm_kernels.cu"
)
"csrc/quantization/fp4/nvfp4_scaled_mm_kernels.cu")
set_gencode_flags_for_srcs(
SRCS "${SRCS}"
CUDA_ARCHS "${FP4_ARCHS}")
@@ -534,6 +561,7 @@ define_gpu_extension_target(
COMPILE_FLAGS ${VLLM_GPU_FLAGS}
ARCHITECTURES ${VLLM_GPU_ARCHES}
INCLUDE_DIRECTORIES ${CUTLASS_INCLUDE_DIR}
+ INCLUDE_DIRECTORIES ${CUTLASS_TOOLS_UTIL_INCLUDE_DIR}
USE_SABI 3
WITH_SOABI)

Expand All @@ -557,7 +585,7 @@ set_gencode_flags_for_srcs(
CUDA_ARCHS "${CUDA_ARCHS}")

if(VLLM_GPU_LANG STREQUAL "CUDA")
- cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0;8.6;8.7;8.9;9.0" "${CUDA_ARCHS}")
+ cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0" "${CUDA_ARCHS}")
if (MARLIN_MOE_ARCHS)
set(MARLIN_MOE_SRC
"csrc/moe/marlin_kernels/marlin_moe_kernel.h"