Skip to content

Commit

Permalink
Merge remote-tracking branch 'upstream/main' into sharded-tensorizer
Browse files Browse the repository at this point in the history
* upstream/main: (126 commits)
  [Bugfix][Frontend] Cleanup "fix chat logprobs" (vllm-project#5026)
  [Bugfix] OpenAI entrypoint limits logprobs while ignoring server defined --max-logprobs (vllm-project#5312)
  [Misc] Various simplifications and typing fixes (vllm-project#5368)
  [ci] Fix Buildkite agent path (vllm-project#5392)
  [Doc] Add documentation for FP8 W8A8 (vllm-project#5388)
  Bump version to v0.5.0 (vllm-project#5384)
  [Docs] Alphabetically sort sponsors (vllm-project#5386)
  [Docs] Add Docs on Limitations of VLM Support (vllm-project#5383)
  [ci] Mount buildkite agent on Docker container to upload benchmark results (vllm-project#5330)
  [ci] Use small_cpu_queue for doc build (vllm-project#5331)
  [Bugfix] Fix LLaVA-NeXT (vllm-project#5380)
  [Feature][Frontend]:  Continued `stream_options` implementation also in CompletionRequest (vllm-project#5319)
  [Model] Initial support for LLaVA-NeXT (vllm-project#4199)
  [Misc] Improve error message when LoRA parsing fails (vllm-project#5194)
  [misc][typo] fix typo (vllm-project#5372)
  [Frontend][Misc] Enforce Pixel Values as Input Type for VLMs in API Server (vllm-project#5374)
  [Misc] Update to comply with the new `compressed-tensors` config (vllm-project#5350)
  [Bugfix] Fix KeyError: 1 When Using LoRA adapters (vllm-project#5164)
  [Kernel][Misc] Use TORCH_LIBRARY instead of PYBIND11_MODULE for custom ops (vllm-project#5047)
  [mis][ci/test] fix flaky test in test_sharded_state_loader.py (vllm-project#5361)
  ...
  • Loading branch information
tjohnson31415 committed Jun 11, 2024
2 parents 102c8a1 + 640052b commit 12adc56
Show file tree
Hide file tree
Showing 269 changed files with 12,084 additions and 3,334 deletions.
2 changes: 1 addition & 1 deletion .buildkite/check-wheel-size.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import os
import zipfile

MAX_SIZE_MB = 150
MAX_SIZE_MB = 200


def print_top_10_largest_files(zip_file):
Expand Down
26 changes: 26 additions & 0 deletions .buildkite/nightly-benchmarks/kickoff-pipeline.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
#!/usr/bin/env bash

set -euo pipefail

# Install system packages
apt update
apt install -y curl jq

# Install minijinja for templating
curl -sSfL https://github.com/mitsuhiko/minijinja/releases/latest/download/minijinja-cli-installer.sh | sh
source $HOME/.cargo/env

# If BUILDKITE_PULL_REQUEST != "false", then we check the PR labels using curl and jq
if [ "$BUILDKITE_PULL_REQUEST" != "false" ]; then
PR_LABELS=$(curl -s "https://api.github.com/repos/vllm-project/vllm/pulls/$BUILDKITE_PULL_REQUEST" | jq -r '.labels[].name')

if [[ $PR_LABELS == *"perf-benchmarks"* ]]; then
echo "This PR has the 'perf-benchmarks' label. Proceeding with the nightly benchmarks."
else
echo "This PR does not have the 'perf-benchmarks' label. Skipping the nightly benchmarks."
exit 0
fi
fi

# Upload sample.yaml
buildkite-agent pipeline upload .buildkite/nightly-benchmarks/sample.yaml
39 changes: 39 additions & 0 deletions .buildkite/nightly-benchmarks/sample.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
steps:
# NOTE(simon): You can create separate blocks for different jobs
- label: "A100: NVIDIA SMI"
agents:
queue: A100
plugins:
- kubernetes:
podSpec:
containers:
# - image: us-central1-docker.pkg.dev/vllm-405802/vllm-ci-test-repo/vllm-test:$BUILDKITE_COMMIT
# TODO(simon): check latest main branch or use the PR image.
- image: us-central1-docker.pkg.dev/vllm-405802/vllm-ci-test-repo/vllm-test:45c35f0d58f4508bf43bd6af1d3d0d0ec0c915e6
command:
- bash -c 'nvidia-smi && nvidia-smi topo -m && pwd && ls'
resources:
limits:
nvidia.com/gpu: 8
volumeMounts:
- name: devshm
mountPath: /dev/shm
nodeSelector:
nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
volumes:
- name: devshm
emptyDir:
medium: Memory
# TODO(simon): bring H100 online
# - label: "H100: NVIDIA SMI"
# agents:
# queue: H100
# plugins:
# - docker#v5.11.0:
# image: us-central1-docker.pkg.dev/vllm-405802/vllm-ci-test-repo/vllm-test:45c35f0d58f4508bf43bd6af1d3d0d0ec0c915e6
# command:
# - bash -c 'nvidia-smi && nvidia-smi topo -m'
# propagate-environment: true
# ipc: host
# gpus: all

28 changes: 28 additions & 0 deletions .buildkite/run-amd-test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,34 @@ set -ex
echo "--- ROCm info"
rocminfo

# cleanup older docker images
cleanup_docker() {
# Get Docker's root directory
docker_root=$(docker info -f '{{.DockerRootDir}}')
if [ -z "$docker_root" ]; then
echo "Failed to determine Docker root directory."
exit 1
fi
echo "Docker root directory: $docker_root"
# Check disk usage of the filesystem where Docker's root directory is located
disk_usage=$(df "$docker_root" | tail -1 | awk '{print $5}' | sed 's/%//')
# Define the threshold
threshold=70
if [ "$disk_usage" -gt "$threshold" ]; then
echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..."
# Remove dangling images (those that are not tagged and not used by any container)
docker image prune -f
# Remove unused volumes
docker volume prune -f
echo "Docker images and volumes cleanup completed."
else
echo "Disk usage is below $threshold%. No cleanup needed."
fi
}

# Call the cleanup docker function
cleanup_docker

echo "--- Resetting GPUs"

echo "reset" > /opt/amdgpu/etc/gpu_state
Expand Down
8 changes: 4 additions & 4 deletions .buildkite/run-benchmarks.sh
Original file line number Diff line number Diff line change
Expand Up @@ -50,16 +50,16 @@ echo "### Serving Benchmarks" >> benchmark_results.md
sed -n '1p' benchmark_serving.txt >> benchmark_results.md # first line
echo "" >> benchmark_results.md
echo '```' >> benchmark_results.md
tail -n 20 benchmark_serving.txt >> benchmark_results.md # last 20 lines
tail -n 24 benchmark_serving.txt >> benchmark_results.md # last 24 lines
echo '```' >> benchmark_results.md

# if the agent binary is not found, skip uploading the results, exit 0
if [ ! -f /workspace/buildkite-agent ]; then
if [ ! -f /usr/bin/buildkite-agent ]; then
exit 0
fi

# upload the results to buildkite
/workspace/buildkite-agent annotate --style "info" --context "benchmark-results" < benchmark_results.md
buildkite-agent annotate --style "info" --context "benchmark-results" < benchmark_results.md

# exit with the exit code of the benchmarks
if [ $bench_latency_exit_code -ne 0 ]; then
Expand All @@ -75,4 +75,4 @@ if [ $bench_serving_exit_code -ne 0 ]; then
fi

rm ShareGPT_V3_unfiltered_cleaned_split.json
/workspace/buildkite-agent artifact upload "*.json"
buildkite-agent artifact upload "*.json"
14 changes: 12 additions & 2 deletions .buildkite/run-cpu-test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -10,5 +10,15 @@ remove_docker_container() { docker rm -f cpu-test || true; }
trap remove_docker_container EXIT
remove_docker_container

# Run the image and launch offline inference
docker run --network host --env VLLM_CPU_KVCACHE_SPACE=1 --name cpu-test cpu-test python3 vllm/examples/offline_inference.py
# Run the image
docker run -itd -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus=48-95 --cpuset-mems=1 --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --name cpu-test cpu-test

# offline inference
docker exec cpu-test bash -c "python3 examples/offline_inference.py"

# Run basic model test
docker exec cpu-test bash -c "cd tests;
pip install pytest Pillow protobuf
bash ../.buildkite/download-images.sh
cd ../
pytest -v -s tests/models --ignore=tests/models/test_llava.py --ignore=tests/models/test_embedding.py --ignore=tests/models/test_registry.py"
31 changes: 16 additions & 15 deletions .buildkite/test-pipeline.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,6 @@ steps:
working_dir: "/vllm-workspace/tests"
num_gpus: 2
commands:
- pytest -v -s distributed/test_pynccl_library.py
- TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
- TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
- TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_chunked_prefill_distributed.py
Expand All @@ -46,7 +45,8 @@ steps:
- TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py
- TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_chunked_prefill_distributed.py
- TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_chunked_prefill_distributed.py
- pytest -v -s spec_decode/e2e/test_integration_dist.py
- pytest -v -s spec_decode/e2e/test_integration_dist.py
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py

- label: Distributed Tests (Multiple Groups)
#mirror_hardwares: [amd]
Expand All @@ -63,7 +63,6 @@ steps:
mirror_hardwares: [amd]

commands:
- pytest -v -s test_inputs.py
- pytest -v -s entrypoints -m llm
- pytest -v -s entrypoints -m openai

Expand All @@ -80,6 +79,13 @@ steps:
- python3 llava_example.py
- python3 tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors

- label: Inputs Test
#mirror_hardwares: [amd]
commands:
- bash ../.buildkite/download-images.sh
- pytest -v -s test_inputs.py
- pytest -v -s multimodal

- label: Kernels Test %N
#mirror_hardwares: [amd]
command: pytest -v -s kernels --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
Expand All @@ -88,14 +94,13 @@ steps:
- label: Models Test
#mirror_hardwares: [amd]
commands:
- bash ../.buildkite/download-images.sh
- pytest -v -s models --ignore=models/test_llava.py
- pytest -v -s models -m \"not llava\"

- label: Llava Test
mirror_hardwares: [amd]
commands:
- bash ../.buildkite/download-images.sh
- pytest -v -s models/test_llava.py
- pytest -v -s models -m llava

- label: Prefix Caching Test
mirror_hardwares: [amd]
Expand All @@ -119,7 +124,10 @@ steps:

- label: Speculative decoding tests
#mirror_hardwares: [amd]
command: pytest -v -s spec_decode
commands:
# See https://github.com/vllm-project/vllm/issues/5152
- export VLLM_ATTENTION_BACKEND=XFORMERS
- pytest -v -s spec_decode

- label: LoRA Test %N
#mirror_hardwares: [amd]
Expand All @@ -131,14 +139,7 @@ steps:
num_gpus: 4
# This test runs llama 13B, so it is required to run on 4 GPUs.
commands:
# Temporarily run this way because we cannot clean up GPU mem usage
# for multi GPU tests.
# TODO(sang): Fix it.
- pytest -v -s lora/test_long_context.py::test_rotary_emb_replaced
- pytest -v -s lora/test_long_context.py::test_batched_rope_kernel
- pytest -v -s lora/test_long_context.py::test_self_consistency
- pytest -v -s lora/test_long_context.py::test_quality
- pytest -v -s lora/test_long_context.py::test_max_len
- pytest -v -s -x lora/test_long_context.py

- label: Tensorizer Test
#mirror_hardwares: [amd]
Expand Down
64 changes: 64 additions & 0 deletions .buildkite/test-template-aws.j2
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
{% set docker_image = "public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT" %}
{% set default_working_dir = "/vllm-workspace/tests" %}

steps:
- label: ":docker: build image"
agents:
queue: cpu_queue
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- "docker build --build-arg max_jobs=16 --tag {{ docker_image }} --target test --progress plain ."
- "docker push {{ docker_image }}"
env:
DOCKER_BUILDKIT: "1"
retry:
automatic:
- exit_status: -1 # Agent was lost
limit: 5
- exit_status: -10 # Agent was lost
limit: 5
- wait

{% for step in steps %}
- label: "{{ step.label }}"
agents:
{% if step.label == "Documentation Build" %}
queue: small_cpu_queue
{% elif step.no_gpu %}
queue: cpu_queue
{% elif step.num_gpus == 2 or step.num_gpus == 4 %}
queue: gpu_4_queue
{% else %}
queue: gpu_1_queue
{% endif %}
soft_fail: true
{% if step.parallelism %}
parallelism: {{ step.parallelism }}
{% endif %}
retry:
automatic:
- exit_status: -1 # Agent was lost
limit: 5
- exit_status: -10 # Agent was lost
limit: 5
plugins:
- docker#v5.2.0:
image: {{ docker_image }}
always-pull: true
propagate-environment: true
{% if not step.no_gpu %}
gpus: all
{% endif %}
{% if step.label == "Benchmarks" %}
mount-buildkite-agent: true
{% endif %}
command: ["bash", "-c", "cd {{ (step.working_dir or default_working_dir) | safe }} && {{ step.command or (step.commands | join(' && ')) | safe }}"]
environment:
- VLLM_USAGE_SOURCE=ci-test
- HF_TOKEN
{% if step.label == "Speculative decoding tests" %}
- VLLM_ATTENTION_BACKEND=XFORMERS
{% endif %}
volumes:
- /dev/shm:/dev/shm
{% endfor %}
7 changes: 5 additions & 2 deletions .buildkite/test-template.j2
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

steps:
- label: ":docker: build image"
commands:
commands:
- "docker build --build-arg max_jobs=16 --tag {{ docker_image }} --target test --progress plain ."
- "docker push {{ docker_image }}"
env:
Expand All @@ -28,6 +28,7 @@ steps:
command: bash .buildkite/run-amd-test.sh "cd {{ (step.working_dir or default_working_dir) | safe }} ; {{ step.command or (step.commands | join(" ; ")) | safe }}"
env:
DOCKER_BUILDKIT: "1"
soft_fail: true
{% endif %}
{% endfor %}

Expand All @@ -36,10 +37,12 @@ steps:
agents:
queue: neuron
command: bash .buildkite/run-neuron-test.sh
soft_fail: true
soft_fail: false

- label: "Intel Test"
depends_on: ~
agents:
queue: intel
command: bash .buildkite/run-cpu-test.sh

{% for step in steps %}
Expand Down
1 change: 1 addition & 0 deletions .github/workflows/mypy.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ jobs:
mypy vllm/distributed --config-file pyproject.toml
mypy vllm/entrypoints --config-file pyproject.toml
mypy vllm/executor --config-file pyproject.toml
mypy vllm/multimodal --config-file pyproject.toml
mypy vllm/usage --config-file pyproject.toml
mypy vllm/*.py --config-file pyproject.toml
mypy vllm/transformers_utils --config-file pyproject.toml
Expand Down
Loading

0 comments on commit 12adc56

Please sign in to comment.