name: vLLM Benchmark
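
# A manual run against a specific branch and GPU pool can be triggered with
# the GitHub CLI, for example (illustrative values):
#
#   gh workflow run vllm-benchmark.yml -f vllm_branch=main -f gpus=h100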
on:
  schedule:
    # Run every 2 hours
    - cron: '0 */2 * * *'
  workflow_dispatch:
    inputs:
      vllm_branch:
        description: vLLM branch
        required: true
        type: string
        default: main
      vllm_commit:
        description: vLLM commit
        required: false
        type: string
      models:
        description: |
          A comma-separated list of models to benchmark; leave empty to run everything
        required: false
        type: string
      gpus:
        description: |
          A comma-separated list of GPUs to benchmark, e.g. h100, mi300
        required: true
        type: string
        default: h100,mi300
  pull_request:
    paths:
      - .github/workflows/vllm-benchmark.yml
      - vllm-benchmarks/**
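
# One concurrency group per workflow + PR (or commit) + trigger type: a newer
# run cancels an in-progress run of the same kind, while scheduled, manual,
# and pull_request runs never cancel each other.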
concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
  cancel-in-progress: true
jobs:
  set-parameters:
    runs-on: ubuntu-latest
    outputs:
      benchmark_matrix: ${{ steps.set-parameters.outputs.benchmark_matrix }}
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - uses: actions/setup-python@v5
        with:
          python-version: '3.12'

      - name: Set parameters
        id: set-parameters
        shell: bash
        env:
          MODELS: ${{ inputs.models || '' }}
          GPUS: ${{ inputs.gpus || '' }}
        run: |
          set -eux

          # The generated matrix is grouped by model and runner
          python .github/scripts/generate_vllm_benchmark_matrix.py \
            --benchmark-configs-dir vllm-benchmarks/benchmarks \
            --models "${MODELS}" \
            --gpus "${GPUS}"
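
  # Fan out one job per matrix entry (a group of models on a specific runner);
  # fail-fast is disabled so one failing combination does not cancel the rest.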
  benchmarks:
    name: Run vLLM benchmarks
    needs: set-parameters
    strategy:
      matrix: ${{ fromJson(needs.set-parameters.outputs.benchmark_matrix) }}
      fail-fast: false
    runs-on: ${{ matrix.runner }}
    environment: pytorch-x-vllm
    permissions:
      id-token: write
      contents: read
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Checkout vLLM repository
        uses: actions/checkout@v4
        with:
          repository: vllm-project/vllm
          path: vllm-benchmarks/vllm
          ref: ${{ inputs.vllm_branch || 'main' }}
          fetch-depth: 0

      - uses: actions/setup-python@v5
        with:
          python-version: '3.12'
          cache: 'pip'
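
      # DEVICE_NAME is exported through GITHUB_ENV so that every later step
      # (dependency install, registry selection, docker flags) can branch on it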
      - name: Check if the device is supported
        shell: bash
        run: |
          set -eux
          if command -v nvidia-smi; then
            DEVICE_NAME=cuda
            nvidia-smi
          elif command -v rocm-smi; then
            DEVICE_NAME=rocm
            rocm-smi
          else
            echo "Only CUDA and ROCm benchmarks are supported at the moment"
            exit 1
          fi
          echo "DEVICE_NAME=$DEVICE_NAME" >> $GITHUB_ENV
      - name: Set GPU name and type
        working-directory: vllm-benchmarks
        shell: bash
        run: |
          set -eux
          if [[ "${DEVICE_NAME}" == "cuda" ]]; then
            # Keep only the model token, e.g. "NVIDIA H100 80GB HBM3" -> "H100"
            DEVICE_TYPE=$(nvidia-smi -i 0 --query-gpu=name --format=csv,noheader | awk '{print $2}')
          elif [[ "${DEVICE_NAME}" == "rocm" ]]; then
            DEVICE_TYPE=$(rocminfo | grep "Marketing Name" | tail -n1 | awk -F':' '{print $2}' | xargs)
          fi
          echo "DEVICE_TYPE=$DEVICE_TYPE" >> $GITHUB_ENV
      - name: Install dependencies
        shell: bash
        run: |
          set -eux
          if [[ "${DEVICE_NAME}" == "cuda" ]]; then
            pip install -r .github/scripts/requirements.txt
          elif [[ "${DEVICE_NAME}" == "rocm" ]]; then
            # ROCm builds of the dependencies come from the PyTorch ROCm wheel index
            pip install -r .github/scripts/requirements.txt \
              --extra-index-url https://download.pytorch.org/whl/rocm6.3
          fi
      - name: Set Docker registry
        shell: bash
        run: |
          if [[ "${DEVICE_NAME}" == "cuda" ]]; then
            DOCKER_IMAGE_PREFIX=public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo
          elif [[ "${DEVICE_NAME}" == "rocm" ]]; then
            DOCKER_IMAGE_PREFIX=docker.io/rocm/vllm-ci
          fi
          echo "DOCKER_IMAGE_PREFIX=$DOCKER_IMAGE_PREFIX" >> $GITHUB_ENV
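
      # When no commit is pinned via the vllm_commit input, walk back through
      # recent vLLM commits and pick the newest one that already has a
      # published Docker image but no benchmark results in S3 yet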
      - name: Check for last benchmark commit
        working-directory: vllm-benchmarks
        env:
          HEAD_BRANCH: ${{ inputs.vllm_branch || 'main' }}
          HEAD_SHA: ${{ inputs.vllm_commit || '' }}
          MODELS: ${{ matrix.models }}
        run: |
          set -eux
          if [[ -z "${HEAD_SHA}" ]]; then
            pushd vllm
            # Looking back at the latest 100 commits is enough
            for i in {0..99}
            do
              # Check if the image is there; if it isn't, try an older commit,
              # because this one is probably too recent
              HEAD_SHA=$(git rev-parse --verify HEAD~${i})
              DOCKER_IMAGE="${DOCKER_IMAGE_PREFIX}:${HEAD_SHA}"
              # No Docker image available yet because the commit is too recent
              if ! docker manifest inspect "${DOCKER_IMAGE}"; then
                continue
              fi

              NOT_EXIST=0
              S3_PATH="v3/vllm-project/vllm/${HEAD_BRANCH}/${HEAD_SHA}/${DEVICE_TYPE// /_}/benchmark_results_${MODELS//\//_}.json"
              aws s3api head-object --bucket ossci-benchmarks --key ${S3_PATH} || NOT_EXIST=1

              if [[ ${NOT_EXIST} == "1" ]]; then
                echo "Found a vLLM commit ${HEAD_SHA} that hasn't been benchmarked yet"
                break
              fi
            done
            popd
          fi
          echo "HEAD_SHA=$HEAD_SHA" >> $GITHUB_ENV
      - name: Setup CUDA GPU_FLAG for docker run
        if: env.DEVICE_NAME == 'cuda'
        run: |
          echo "GPU_FLAG=--gpus all -e NVIDIA_DRIVER_CAPABILITIES=all" >> "${GITHUB_ENV}"

      - name: Setup ROCm
        if: env.DEVICE_NAME == 'rocm'
        uses: pytorch/pytorch/./.github/actions/setup-rocm@main
      - name: Setup SCCACHE_SERVER_PORT environment for docker run when running in a container
        run: |
          # Presumably a per-runner offset from sccache's default port (4226),
          # so that runners sharing a host don't contend for the same port
          echo "SCCACHE_SERVER_PORT_DOCKER_FLAG=-e SCCACHE_SERVER_PORT=$((RUNNER_UID + 4226))" >> "${GITHUB_ENV}"
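
      # Swap the benchmark configs shipped with vLLM for the ones kept in this
      # repo, filtered down to the models assigned to this runner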
      - name: Setup benchmark tests
        env:
          MODELS: ${{ matrix.models }}
        run: |
          set -eux
          pushd vllm-benchmarks/vllm
          git checkout "${HEAD_SHA}"
          rm .buildkite/nightly-benchmarks/tests/*.json
          popd

          # Set the list of benchmarks we want to cover in this runner
          python .github/scripts/setup_vllm_benchmark.py \
            --from-benchmark-configs-dir vllm-benchmarks/benchmarks \
            --to-benchmark-configs-dir vllm-benchmarks/vllm/.buildkite/nightly-benchmarks/tests \
            --models "${MODELS}"

          pushd vllm-benchmarks/vllm
          ls -lah .buildkite/nightly-benchmarks/tests
          find .buildkite/nightly-benchmarks/tests -type f -exec cat {} \;
          popd
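
      # The benchmark itself runs inside vLLM's prebuilt per-commit Docker
      # image using vLLM's own nightly-benchmarks harness; results land under
      # the mounted workspace, where the upload step below picks them up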
      - name: Run vLLM benchmark
        env:
          SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2
          SCCACHE_REGION: us-east-1
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
          DOCKER_IMAGE: ${{ env.DOCKER_IMAGE_PREFIX }}:${{ env.HEAD_SHA }}
          # vLLM-related environment variables
          ENGINE_VERSION: v1
          SAVE_TO_PYTORCH_BENCHMARK_FORMAT: 1
        run: |
          set -x
          docker run \
            ${GPU_FLAG:-} \
            ${SCCACHE_SERVER_PORT_DOCKER_FLAG:-} \
            -e SCCACHE_BUCKET \
            -e SCCACHE_REGION \
            -e DEVICE_NAME \
            -e DEVICE_TYPE \
            -e HF_TOKEN \
            -e ENGINE_VERSION \
            -e SAVE_TO_PYTORCH_BENCHMARK_FORMAT \
            --ipc=host \
            --tty \
            --security-opt seccomp=unconfined \
            -v "${GITHUB_WORKSPACE}:/tmp/workspace" \
            -w /tmp/workspace \
            "${DOCKER_IMAGE}" \
            bash -xc "cd vllm-benchmarks/vllm && bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh"
      - name: Authenticate with AWS
        # AWS CUDA runners already have access to the bucket via their runner IAM role
        if: env.DEVICE_NAME != 'cuda'
        uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0
        with:
          role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_upload-benchmark-results
          # The maximum duration enforced on the server side
          role-duration-seconds: 18000
          aws-region: us-east-1
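
      # Device and model names are sanitized (spaces and slashes replaced with
      # underscores) to match the S3 key probed in the discovery step above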
      - name: Upload the benchmark results
        env:
          BENCHMARK_RESULTS: vllm-benchmarks/vllm/benchmarks/results
          MODELS: ${{ matrix.models }}
        run: |
          set -eux
          sudo chown -R ${UID} "${BENCHMARK_RESULTS}"
          ls -lah "${BENCHMARK_RESULTS}"

          python .github/scripts/upload_benchmark_results.py \
            --repo vllm-benchmarks/vllm \
            --benchmark-name "vLLM benchmark" \
            --benchmark-results "${BENCHMARK_RESULTS}" \
            --device "${DEVICE_TYPE// /_}" \
            --model "${MODELS//\//_}"