[CI] Add Buildkite (vllm-project#2355)
simon-mo authored Jan 14, 2024
1 parent 9f659bf commit 6e01e8c
Showing 13 changed files with 192 additions and 37 deletions.
24 changes: 24 additions & 0 deletions .buildkite/run-benchmarks.sh
@@ -0,0 +1,24 @@
# This script is run by Buildkite to execute the benchmarks and upload the results to Buildkite

set -ex

# cd into parent directory of this file
cd "$(dirname "${BASH_SOURCE[0]}")/.."

# run benchmarks and upload the result to buildkite
python3 benchmarks/benchmark_latency.py 2>&1 | tee benchmark_latency.txt

python3 benchmarks/benchmark_throughput.py --input-len 256 --output-len 256 2>&1 | tee benchmark_throughput.txt

# write the results into a markdown file
echo "### Latency Benchmarks" >> benchmark_results.md
sed -n '1p' benchmark_latency.txt >> benchmark_results.md
echo "" >> benchmark_results.md
sed -n '$p' benchmark_latency.txt >> benchmark_results.md
echo "### Throughput Benchmarks" >> benchmark_results.md
sed -n '1p' benchmark_throughput.txt >> benchmark_results.md
echo "" >> benchmark_results.md
sed -n '$p' benchmark_throughput.txt >> benchmark_results.md

# upload the results to buildkite
/workspace/buildkite-agent annotate --style "info" --context "benchmark-results" < benchmark_results.md
41 changes: 41 additions & 0 deletions .buildkite/test-pipeline.yaml
@@ -0,0 +1,41 @@
# In this file, you can add more tests to run, either by adding a new step or
# by adding a new command to an existing step. See the existing steps for examples.
# This file is fed into the Jinja template in `test-template.j2` to generate
# the final pipeline YAML file.

steps:
- label: Regression Test
command: pytest -v -s test_regression.py
working_dir: "/vllm-workspace/tests" # optional

- label: AsyncEngine Test
command: pytest -v -s async_engine

- label: Distributed Test
command: pytest -v -s test_comm_ops.py
working_dir: "/vllm-workspace/tests/distributed"
num_gpus: 2 # only 1 or 2 are supported for now.

- label: Engine Test
command: pytest -v -s engine

- label: Kernels Test
command: pytest -v -s kernels
soft_fail: true

- label: Models Test
commands:
- pytest -v -s models --forked
soft_fail: true

- label: Samplers Test
command: pytest -v -s samplers --forked

- label: Worker Test
command: pytest -v -s worker

- label: Benchmarks
working_dir: "/vllm-workspace/.buildkite"
commands:
- pip install aiohttp
- bash run-benchmarks.sh
46 changes: 46 additions & 0 deletions .buildkite/test-template.j2
@@ -0,0 +1,46 @@
{% set docker_image = "us-central1-docker.pkg.dev/vllm-405802/vllm-ci-test-repo/vllm-test:$BUILDKITE_COMMIT" %}
{% set default_num_gpu = 1 %}
{% set default_working_dir = "/vllm-workspace/tests" %}

steps:
- label: ":docker: build image"
commands:
- "docker build --tag {{ docker_image }} --target test --progress plain ."
- "docker push {{ docker_image }}"
env:
DOCKER_BUILDKIT: "1"
- wait

{% for step in steps %}
- label: "{{ step.label }}"
agents:
queue: kubernetes
soft_fail: {{ step.soft_fail or false }}
plugins:
- kubernetes:
podSpec:
volumes:
- name: dshm
emptyDir:
medium: Memory
containers:
- image: "{{ docker_image }}"
command: ["bash"]
args:
- "-c"
- "'cd {{ (step.working_dir or default_working_dir) | safe }} && {{ step.command or (step.commands | join(' && ')) | safe }}'"
resources:
requests:
nvidia.com/gpu: "{{ step.num_gpus or default_num_gpu }}"
limits:
nvidia.com/gpu: "{{ step.num_gpus or default_num_gpu }}"
env:
- name: HF_TOKEN
valueFrom:
secretKeyRef:
name: hf-token-secret
key: token
volumeMounts:
- mountPath: /dev/shm
name: dshm
{% endfor %}
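
The two files above work together: `test-pipeline.yaml` declares the test steps, and `test-template.j2` wraps each step in a Kubernetes pod spec (test image, GPU request, working directory) for the Buildkite agent. As a rough illustration of that flow, here is a minimal sketch (not part of this commit; it assumes `jinja2` and `pyyaml` are installed and the repository root is the working directory) that renders the final pipeline:

# Illustrative sketch only: render the Buildkite pipeline from the step
# definitions, roughly as the CI tooling is expected to do.
import yaml
from jinja2 import Template

with open(".buildkite/test-pipeline.yaml") as f:
    steps = yaml.safe_load(f)["steps"]

with open(".buildkite/test-template.j2") as f:
    template = Template(f.read())

# Each step becomes a Kubernetes pod running the test image with the
# requested GPU count and the step's command(s).
print(template.render(steps=steps))
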
36 changes: 24 additions & 12 deletions Dockerfile
@@ -1,7 +1,11 @@
# The vLLM Dockerfile is used to construct a vLLM image that can be directly used
# to run the OpenAI-compatible server.

#################### BASE BUILD IMAGE ####################
FROM nvidia/cuda:12.1.0-devel-ubuntu22.04 AS dev

RUN apt-get update -y \
&& apt-get install -y python3-pip
&& apt-get install -y python3-pip git

WORKDIR /workspace

@@ -14,8 +18,10 @@ RUN --mount=type=cache,target=/root/.cache/pip \
COPY requirements-dev.txt requirements-dev.txt
RUN --mount=type=cache,target=/root/.cache/pip \
pip install -r requirements-dev.txt
#################### BASE BUILD IMAGE ####################


# image to build pytorch extensions
#################### EXTENSION BUILD IMAGE ####################
FROM dev AS build

# install build dependencies
@@ -30,6 +36,7 @@ COPY requirements.txt requirements.txt
COPY pyproject.toml pyproject.toml
COPY vllm/__init__.py vllm/__init__.py

# cuda arch list used by torch
ARG torch_cuda_arch_list='7.0 7.5 8.0 8.6 8.9 9.0+PTX'
ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
# max jobs used by Ninja to build extensions
@@ -40,18 +47,26 @@ ARG nvcc_threads=8
ENV NVCC_THREADS=$nvcc_threads

RUN python3 setup.py build_ext --inplace
#################### EXTENSION BUILD IMAGE ####################


#################### TEST IMAGE ####################
# image to run the unit testing suite
FROM dev AS test

# copy pytorch extensions separately to avoid having to rebuild
# when python code changes
COPY --from=build /workspace/vllm/*.so /workspace/vllm/
COPY tests tests
COPY vllm vllm
WORKDIR /vllm-workspace
# ADD is used to preserve directory structure
ADD . /vllm-workspace/
COPY --from=build /workspace/vllm/*.so /vllm-workspace/vllm/
# skip build dependency installation because we are using pre-compiled extensions
RUN rm pyproject.toml
RUN --mount=type=cache,target=/root/.cache/pip VLLM_USE_PRECOMPILED=1 pip install . --verbose
#################### TEST IMAGE ####################

ENTRYPOINT ["python3", "-m", "pytest", "tests"]

#################### RUNTIME BASE IMAGE ####################
# use CUDA base as CUDA runtime dependencies are already installed via pip
FROM nvidia/cuda:12.1.0-base-ubuntu22.04 AS vllm-base

@@ -63,14 +78,10 @@ WORKDIR /workspace
COPY requirements.txt requirements.txt
RUN --mount=type=cache,target=/root/.cache/pip \
pip install -r requirements.txt
#################### RUNTIME BASE IMAGE ####################

FROM vllm-base AS vllm
COPY --from=build /workspace/vllm/*.so /workspace/vllm/
COPY vllm vllm

EXPOSE 8000
ENTRYPOINT ["python3", "-m", "vllm.entrypoints.api_server"]

#################### OPENAI API SERVER ####################
# openai api server alternative
FROM vllm-base AS vllm-openai
# install additional dependencies for openai api server
@@ -81,3 +92,4 @@ COPY --from=build /workspace/vllm/*.so /workspace/vllm/
COPY vllm vllm

ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
#################### OPENAI API SERVER ####################
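
The Dockerfile is now split into named stages (`dev`, `build`, `test`, `vllm-base`, `vllm`, `vllm-openai`), and CI builds the `test` target. As a hedged sketch of local usage (not part of this commit; the tag name is hypothetical, and it assumes the `test` stage keeps the pytest ENTRYPOINT shown above), the test image could be built and run like this:

# Illustrative sketch only: build the test-stage image and run the suite.
import subprocess

image = "vllm-test:local"  # hypothetical tag; CI tags with $BUILDKITE_COMMIT and pushes it
subprocess.run(
    ["docker", "build", "--target", "test", "--tag", image, "."],
    check=True,
)
# With GPU access, the ENTRYPOINT runs the whole test suite; CI instead
# overrides the command per step via .buildkite/test-template.j2.
subprocess.run(["docker", "run", "--gpus", "all", image], check=True)
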
4 changes: 3 additions & 1 deletion requirements-dev.txt
@@ -13,4 +13,6 @@ types-setuptools
pytest
pytest-forked
pytest-asyncio

httpx
einops # required for MPT
flash_attn # required for HuggingFace's llama implementation
7 changes: 6 additions & 1 deletion setup.py
@@ -293,6 +293,11 @@ def get_requirements() -> List[str]:
return requirements


package_data = {"vllm": ["py.typed"]}
if os.environ.get("VLLM_USE_PRECOMPILED"):
ext_modules = []
package_data["vllm"].append("*.so")

setuptools.setup(
name="vllm",
version=get_vllm_version(),
@@ -321,5 +326,5 @@ def get_requirements() -> List[str]:
install_requires=get_requirements(),
ext_modules=ext_modules,
cmdclass={"build_ext": BuildExtension},
package_data={"vllm": ["py.typed"]},
package_data=package_data,
)
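
The setup.py change lets an installation reuse extensions that were already compiled: when `VLLM_USE_PRECOMPILED` is set, `ext_modules` is emptied and the pre-built `*.so` files are shipped via `package_data` instead. A minimal sketch of how this is used (mirroring the `pip install` line in the Dockerfile above; illustrative only, assuming the `.so` files are already present in the source tree):

# Illustrative sketch only: install vLLM against pre-built extensions.
import os
import subprocess
import sys

env = dict(os.environ, VLLM_USE_PRECOMPILED="1")  # setup.py then skips ext_modules
subprocess.run(
    [sys.executable, "-m", "pip", "install", ".", "--verbose"],
    env=env,
    check=True,
)
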
12 changes: 10 additions & 2 deletions tests/async_engine/test_api_server.py
@@ -29,8 +29,13 @@ def api_server():
script_path = Path(__file__).parent.joinpath(
"api_server_async_engine.py").absolute()
uvicorn_process = subprocess.Popen([
sys.executable, "-u",
str(script_path), "--model", "facebook/opt-125m"
sys.executable,
"-u",
str(script_path),
"--model",
"facebook/opt-125m",
"--host",
"127.0.0.1",
])
yield
uvicorn_process.terminate()
@@ -81,6 +86,9 @@ def test_api_server(api_server):
pool.join()

# check cancellation stats
# give it some time to update the stats
time.sleep(1)

num_aborted_requests = requests.get(
"http://localhost:8000/stats").json()["num_aborted_requests"]
assert num_aborted_requests > 0
15 changes: 9 additions & 6 deletions tests/async_engine/test_openai_server.py
@@ -1,28 +1,32 @@
from argparse import Namespace
from dataclasses import dataclass
import os
import pathlib

import pytest
from fastapi.testclient import TestClient

from vllm.entrypoints.openai.api_server import *

chatml_jinja_path = pathlib.Path(os.path.dirname(os.path.abspath(
__file__))).parent.parent / "examples/template_chatml.jinja"
assert chatml_jinja_path.exists()

# Define models, templates, and their corresponding expected outputs
MODEL_TEMPLATE_GENERATON_OUTPUT = [
("facebook/opt-125m", None, True,
"Hello</s>Hi there!</s>What is the capital of</s>"),
("facebook/opt-125m", None, False,
"Hello</s>Hi there!</s>What is the capital of</s>"),
("facebook/opt-125m", "../../examples/template_chatml.jinja", True,
"""<|im_start|>user
("facebook/opt-125m", chatml_jinja_path, True, """<|im_start|>user
Hello<|im_end|>
<|im_start|>assistant
Hi there!<|im_end|>
<|im_start|>user
What is the capital of<|im_end|>
<|im_start|>assistant
"""),
("facebook/opt-125m", "../../examples/template_chatml.jinja", False,
"""<|im_start|>user
("facebook/opt-125m", chatml_jinja_path, False, """<|im_start|>user
Hello<|im_end|>
<|im_start|>assistant
Hi there!<|im_end|>
@@ -54,8 +58,7 @@ class MockTokenizer:

def test_load_chat_template():
# Testing chatml template
template = "../../examples/template_chatml.jinja"
mock_args = Namespace(chat_template=template)
mock_args = Namespace(chat_template=chatml_jinja_path)
tokenizer = MockTokenizer()

# Call the function with the mocked args
26 changes: 14 additions & 12 deletions tests/distributed/test_comm_ops.py
@@ -2,10 +2,9 @@
Run `pytest tests/distributed/test_comm_ops.py --forked`.
"""
from multiprocessing import Process, set_start_method

import pytest
import torch
import ray

from vllm.config import ParallelConfig
from vllm.utils import get_open_port
@@ -23,11 +22,11 @@ def init_test_distributed_environment(pipeline_parallel_size: int,
tensor_parallel_size,
worker_use_ray=True)
distributed_init_method = f"tcp://localhost:{distributed_init_port}"
torch.cuda.set_device(rank)
_init_distributed_environment(parallel_config, rank,
distributed_init_method)


@ray.remote(num_gpus=1, max_calls=1)
def all_reduce_test_worker(tensor_parallel_size: int, rank: int,
distributed_init_port: str):
init_test_distributed_environment(1, tensor_parallel_size, rank,
@@ -43,6 +42,7 @@ def all_reduce_test_worker(tensor_parallel_size: int, rank: int,
assert torch.allclose(t, expected)


@ray.remote(num_gpus=1, max_calls=1)
def all_gather_test_worker(tensor_parallel_size: int, rank: int,
distributed_init_port: str):
init_test_distributed_environment(1, tensor_parallel_size, rank,
@@ -70,14 +70,16 @@ def all_gather_test_worker(tensor_parallel_size: int, rank: int,
@pytest.mark.parametrize("test_target",
[all_reduce_test_worker, all_gather_test_worker])
def test_multi_process_tensor_parallel(tensor_parallel_size, test_target):
set_start_method("spawn", force=True)
# Using Ray makes it easier to debug errors when a worker fails
# than using multiprocessing.
ray.init()

distributed_init_port = get_open_port()
processes = []
refs = []
for rank in range(tensor_parallel_size):
p = Process(target=test_target,
args=(tensor_parallel_size, rank, distributed_init_port))
p.start()
processes.append(p)
for p in processes:
p.join()
assert all(p.exitcode == 0 for p in processes)
refs.append(
test_target.remote(tensor_parallel_size, rank,
distributed_init_port))
ray.get(refs)

ray.shutdown()
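
Replacing `multiprocessing.Process` with Ray tasks means a worker failure surfaces as an exception re-raised by `ray.get()` in the driver, rather than as an opaque non-zero exit code. A minimal sketch of the pattern (illustrative only; assumes `ray` is installed and at least two GPUs are available):

# Illustrative sketch only: the Ray task pattern adopted by this test.
import ray

@ray.remote(num_gpus=1, max_calls=1)  # max_calls=1 releases the GPU after each call
def worker(rank: int) -> int:
    # Ray sets CUDA_VISIBLE_DEVICES so each task only sees its assigned GPU.
    # Any exception raised here propagates through ray.get() below.
    return rank

ray.init()
try:
    assert ray.get([worker.remote(r) for r in range(2)]) == [0, 1]
finally:
    ray.shutdown()
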
2 changes: 1 addition & 1 deletion tests/kernels/test_attention.py
@@ -13,7 +13,7 @@
# This will change depending on the compute capability.
# - 512 as a buffer
MAX_SEQ_LEN = get_max_shared_memory_bytes() // FLOAT32_BYTES - 512
NUM_BLOCKS = 40000 # Arbitrary values for testing
NUM_BLOCKS = 12000 # Arbitrary values for testing
PARTITION_SIZE = 512

DTYPES = [torch.half, torch.bfloat16, torch.float]
4 changes: 2 additions & 2 deletions tests/kernels/test_cache.py
@@ -6,12 +6,12 @@
from vllm._C import cache_ops

DTYPES = [torch.half, torch.bfloat16, torch.float]
NUM_TOKENS = [83] # Arbitrary values for testing
NUM_TOKENS = [42] # Arbitrary values for testing
NUM_LAYERS = [1] # Arbitrary values for testing
NUM_HEADS = [8] # Arbitrary values for testing
HEAD_SIZES = [64, 80, 96, 112, 128, 256]
BLOCK_SIZES = [8, 16, 32]
NUM_BLOCKS = [1024, 36000] # Arbitrary values for testing
NUM_BLOCKS = [1024, 3600] # Arbitrary values for testing
NUM_MAPPINGS = [256] # Arbitrary values for testing
SEEDS = [0]
DEVICES = [i for i in range(1 if torch.cuda.device_count() == 1 else 2)]
1 change: 1 addition & 0 deletions tests/samplers/test_logprobs.py
@@ -30,6 +30,7 @@ def test_get_prompt_logprobs(
temperature=0.0)
vllm_results = vllm_model.model.generate(
example_prompts, sampling_params=vllm_sampling_params)
del vllm_model

# Test whether logprobs are included in the results.
for result in vllm_results: