From 6e01e8c1c8ea323d30e3f57050469b2df66b56c6 Mon Sep 17 00:00:00 2001 From: Simon Mo Date: Sun, 14 Jan 2024 12:37:58 -0800 Subject: [PATCH] [CI] Add Buildkite (#2355) --- .buildkite/run-benchmarks.sh | 24 +++++++++++++ .buildkite/test-pipeline.yaml | 41 +++++++++++++++++++++ .buildkite/test-template.j2 | 46 ++++++++++++++++++++++++ Dockerfile | 36 ++++++++++++------- requirements-dev.txt | 4 ++- setup.py | 7 +++- tests/async_engine/test_api_server.py | 12 +++++-- tests/async_engine/test_openai_server.py | 15 ++++---- tests/distributed/test_comm_ops.py | 26 +++++++------- tests/kernels/test_attention.py | 2 +- tests/kernels/test_cache.py | 4 +-- tests/samplers/test_logprobs.py | 1 + tests/samplers/test_sampler.py | 11 ++++++ 13 files changed, 192 insertions(+), 37 deletions(-) create mode 100644 .buildkite/run-benchmarks.sh create mode 100644 .buildkite/test-pipeline.yaml create mode 100644 .buildkite/test-template.j2 diff --git a/.buildkite/run-benchmarks.sh b/.buildkite/run-benchmarks.sh new file mode 100644 index 0000000000000..0a77f09b8a3f4 --- /dev/null +++ b/.buildkite/run-benchmarks.sh @@ -0,0 +1,24 @@ +# This script is run by buildkite to run the benchmarks and upload the results to buildkite + +set -ex + +# cd into parent directory of this file +cd "$(dirname "${BASH_SOURCE[0]}")/.." + +# run benchmarks and upload the result to buildkite +python3 benchmarks/benchmark_latency.py 2>&1 | tee benchmark_latency.txt + +python3 benchmarks/benchmark_throughput.py --input-len 256 --output-len 256 2>&1 | tee benchmark_throughput.txt + +# write the results into a markdown file +echo "### Latency Benchmarks" >> benchmark_results.md +sed -n '1p' benchmark_latency.txt >> benchmark_results.md +echo "" >> benchmark_results.md +sed -n '$p' benchmark_latency.txt >> benchmark_results.md +echo "### Throughput Benchmarks" >> benchmark_results.md +sed -n '1p' benchmark_throughput.txt >> benchmark_results.md +echo "" >> benchmark_results.md +sed -n '$p' benchmark_throughput.txt >> benchmark_results.md + +# upload the results to buildkite +/workspace/buildkite-agent annotate --style "info" --context "benchmark-results" < benchmark_results.md diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml new file mode 100644 index 0000000000000..3cd1bed0e50a2 --- /dev/null +++ b/.buildkite/test-pipeline.yaml @@ -0,0 +1,41 @@ +# In this file, you can add more tests to run either by adding a new step or +# adding a new command to an existing step. See different options here for examples. +# This script will be feed into Jinja template in `test-template.j2` to generate +# the final pipeline yaml file. + +steps: +- label: Regression Test + command: pytest -v -s test_regression.py + working_dir: "/vllm-workspace/tests" # optional + +- label: AsyncEngine Test + command: pytest -v -s async_engine + +- label: Distributed Test + command: pytest -v -s test_comm_ops.py + working_dir: "/vllm-workspace/tests/distributed" + num_gpus: 2 # only support 1 or 2 for now. 
+ +- label: Engine Test + command: pytest -v -s engine + +- label: Kernels Test + command: pytest -v -s kernels + soft_fail: true + +- label: Models Test + commands: + - pytest -v -s models --forked + soft_fail: true + +- label: Samplers Test + command: pytest -v -s samplers --forked + +- label: Worker Test + command: pytest -v -s worker + +- label: Benchmarks + working_dir: "/vllm-workspace/.buildkite" + commands: + - pip install aiohttp + - bash run-benchmarks.sh diff --git a/.buildkite/test-template.j2 b/.buildkite/test-template.j2 new file mode 100644 index 0000000000000..e94538bc996ab --- /dev/null +++ b/.buildkite/test-template.j2 @@ -0,0 +1,46 @@ +{% set docker_image = "us-central1-docker.pkg.dev/vllm-405802/vllm-ci-test-repo/vllm-test:$BUILDKITE_COMMIT" %} +{% set default_num_gpu = 1 %} +{% set default_working_dir = "/vllm-workspace/tests" %} + +steps: + - label: ":docker: build image" + commands: + - "docker build --tag {{ docker_image }} --target test --progress plain ." + - "docker push {{ docker_image }}" + env: + DOCKER_BUILDKIT: "1" + - wait + + {% for step in steps %} + - label: "{{ step.label }}" + agents: + queue: kubernetes + soft_fail: {{ step.soft_fail or false }} + plugins: + - kubernetes: + podSpec: + volumes: + - name: dshm + emptyDir: + medium: Memory + containers: + - image: "{{ docker_image }}" + command: ["bash"] + args: + - "-c" + - "'cd {{ (step.working_dir or default_working_dir) | safe }} && {{ step.command or (step.commands | join(' && ')) | safe }}'" + resources: + requests: + nvidia.com/gpu: "{{ step.num_gpus or default_num_gpu }}" + limits: + nvidia.com/gpu: "{{ step.num_gpus or default_num_gpu }}" + env: + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: hf-token-secret + key: token + volumeMounts: + - mountPath: /dev/shm + name: dshm + {% endfor %} diff --git a/Dockerfile b/Dockerfile index bd66afe79c7e1..44b1dd17d7e02 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,7 +1,11 @@ +# The vLLM Dockerfile is used to construct vLLM image that can be directly used +# to run the OpenAI compatible server. 
+ +#################### BASE BUILD IMAGE #################### FROM nvidia/cuda:12.1.0-devel-ubuntu22.04 AS dev RUN apt-get update -y \ - && apt-get install -y python3-pip + && apt-get install -y python3-pip git WORKDIR /workspace @@ -14,8 +18,10 @@ RUN --mount=type=cache,target=/root/.cache/pip \ COPY requirements-dev.txt requirements-dev.txt RUN --mount=type=cache,target=/root/.cache/pip \ pip install -r requirements-dev.txt +#################### BASE BUILD IMAGE #################### + -# image to build pytorch extensions +#################### EXTENSION BUILD IMAGE #################### FROM dev AS build # install build dependencies @@ -30,6 +36,7 @@ COPY requirements.txt requirements.txt COPY pyproject.toml pyproject.toml COPY vllm/__init__.py vllm/__init__.py +# cuda arch list used by torch ARG torch_cuda_arch_list='7.0 7.5 8.0 8.6 8.9 9.0+PTX' ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list} # max jobs used by Ninja to build extensions @@ -40,18 +47,26 @@ ARG nvcc_threads=8 ENV NVCC_THREADS=$nvcc_threads RUN python3 setup.py build_ext --inplace +#################### EXTENSION Build IMAGE #################### + +#################### TEST IMAGE #################### # image to run unit testing suite FROM dev AS test # copy pytorch extensions separately to avoid having to rebuild # when python code changes -COPY --from=build /workspace/vllm/*.so /workspace/vllm/ -COPY tests tests -COPY vllm vllm +WORKDIR /vllm-workspace +# ADD is used to preserve directory structure +ADD . /vllm-workspace/ +COPY --from=build /workspace/vllm/*.so /vllm-workspace/vllm/ +# ignore build dependencies installation because we are using pre-complied extensions +RUN rm pyproject.toml +RUN --mount=type=cache,target=/root/.cache/pip VLLM_USE_PRECOMPILED=1 pip install . --verbose +#################### TEST IMAGE #################### -ENTRYPOINT ["python3", "-m", "pytest", "tests"] +#################### RUNTIME BASE IMAGE #################### # use CUDA base as CUDA runtime dependencies are already installed via pip FROM nvidia/cuda:12.1.0-base-ubuntu22.04 AS vllm-base @@ -63,14 +78,10 @@ WORKDIR /workspace COPY requirements.txt requirements.txt RUN --mount=type=cache,target=/root/.cache/pip \ pip install -r requirements.txt +#################### RUNTIME BASE IMAGE #################### -FROM vllm-base AS vllm -COPY --from=build /workspace/vllm/*.so /workspace/vllm/ -COPY vllm vllm - -EXPOSE 8000 -ENTRYPOINT ["python3", "-m", "vllm.entrypoints.api_server"] +#################### OPENAI API SERVER #################### # openai api server alternative FROM vllm-base AS vllm-openai # install additional dependencies for openai api server @@ -81,3 +92,4 @@ COPY --from=build /workspace/vllm/*.so /workspace/vllm/ COPY vllm vllm ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"] +#################### OPENAI API SERVER #################### diff --git a/requirements-dev.txt b/requirements-dev.txt index cf15292749083..89f8b3f08dbfa 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -13,4 +13,6 @@ types-setuptools pytest pytest-forked pytest-asyncio - +httpx +einops # required for MPT +flash_attn # required for HuggingFace's llama implementation diff --git a/setup.py b/setup.py index 811d494e7a01f..fe8cd6d75ed76 100644 --- a/setup.py +++ b/setup.py @@ -293,6 +293,11 @@ def get_requirements() -> List[str]: return requirements +package_data = {"vllm": ["py.typed"]} +if os.environ.get("VLLM_USE_PRECOMPILED"): + ext_modules = [] + package_data["vllm"].append("*.so") + setuptools.setup( name="vllm", 
version=get_vllm_version(), @@ -321,5 +326,5 @@ def get_requirements() -> List[str]: install_requires=get_requirements(), ext_modules=ext_modules, cmdclass={"build_ext": BuildExtension}, - package_data={"vllm": ["py.typed"]}, + package_data=package_data, ) diff --git a/tests/async_engine/test_api_server.py b/tests/async_engine/test_api_server.py index 0b45e10dc5550..ed9017c1e3e9d 100644 --- a/tests/async_engine/test_api_server.py +++ b/tests/async_engine/test_api_server.py @@ -29,8 +29,13 @@ def api_server(): script_path = Path(__file__).parent.joinpath( "api_server_async_engine.py").absolute() uvicorn_process = subprocess.Popen([ - sys.executable, "-u", - str(script_path), "--model", "facebook/opt-125m" + sys.executable, + "-u", + str(script_path), + "--model", + "facebook/opt-125m", + "--host", + "127.0.0.1", ]) yield uvicorn_process.terminate() @@ -81,6 +86,9 @@ def test_api_server(api_server): pool.join() # check cancellation stats + # give it some times to update the stats + time.sleep(1) + num_aborted_requests = requests.get( "http://localhost:8000/stats").json()["num_aborted_requests"] assert num_aborted_requests > 0 diff --git a/tests/async_engine/test_openai_server.py b/tests/async_engine/test_openai_server.py index a61ff7e84ca66..ff1ce423c517e 100644 --- a/tests/async_engine/test_openai_server.py +++ b/tests/async_engine/test_openai_server.py @@ -1,19 +1,24 @@ from argparse import Namespace from dataclasses import dataclass +import os +import pathlib import pytest from fastapi.testclient import TestClient from vllm.entrypoints.openai.api_server import * +chatml_jinja_path = pathlib.Path(os.path.dirname(os.path.abspath( + __file__))).parent.parent / "examples/template_chatml.jinja" +assert chatml_jinja_path.exists() + # Define models, templates, and their corresponding expected outputs MODEL_TEMPLATE_GENERATON_OUTPUT = [ ("facebook/opt-125m", None, True, "HelloHi there!What is the capital of"), ("facebook/opt-125m", None, False, "HelloHi there!What is the capital of"), - ("facebook/opt-125m", "../../examples/template_chatml.jinja", True, - """<|im_start|>user + ("facebook/opt-125m", chatml_jinja_path, True, """<|im_start|>user Hello<|im_end|> <|im_start|>assistant Hi there!<|im_end|> @@ -21,8 +26,7 @@ What is the capital of<|im_end|> <|im_start|>assistant """), - ("facebook/opt-125m", "../../examples/template_chatml.jinja", False, - """<|im_start|>user + ("facebook/opt-125m", chatml_jinja_path, False, """<|im_start|>user Hello<|im_end|> <|im_start|>assistant Hi there!<|im_end|> @@ -54,8 +58,7 @@ class MockTokenizer: def test_load_chat_template(): # Testing chatml template - template = "../../examples/template_chatml.jinja" - mock_args = Namespace(chat_template=template) + mock_args = Namespace(chat_template=chatml_jinja_path) tokenizer = MockTokenizer() # Call the function with the mocked args diff --git a/tests/distributed/test_comm_ops.py b/tests/distributed/test_comm_ops.py index b9895b3e71794..75111feb39507 100644 --- a/tests/distributed/test_comm_ops.py +++ b/tests/distributed/test_comm_ops.py @@ -2,10 +2,9 @@ Run `pytest tests/distributed/test_comm_ops.py --forked`. 
""" -from multiprocessing import Process, set_start_method - import pytest import torch +import ray from vllm.config import ParallelConfig from vllm.utils import get_open_port @@ -23,11 +22,11 @@ def init_test_distributed_environment(pipeline_parallel_size: int, tensor_parallel_size, worker_use_ray=True) distributed_init_method = f"tcp://localhost:{distributed_init_port}" - torch.cuda.set_device(rank) _init_distributed_environment(parallel_config, rank, distributed_init_method) +@ray.remote(num_gpus=1, max_calls=1) def all_reduce_test_worker(tensor_parallel_size: int, rank: int, distributed_init_port: str): init_test_distributed_environment(1, tensor_parallel_size, rank, @@ -43,6 +42,7 @@ def all_reduce_test_worker(tensor_parallel_size: int, rank: int, assert torch.allclose(t, expected) +@ray.remote(num_gpus=1, max_calls=1) def all_gather_test_worker(tensor_parallel_size: int, rank: int, distributed_init_port: str): init_test_distributed_environment(1, tensor_parallel_size, rank, @@ -70,14 +70,16 @@ def all_gather_test_worker(tensor_parallel_size: int, rank: int, @pytest.mark.parametrize("test_target", [all_reduce_test_worker, all_gather_test_worker]) def test_multi_process_tensor_parallel(tensor_parallel_size, test_target): - set_start_method("spawn", force=True) + # Using ray helps debugging the error when it failed + # as compared to multiprocessing. + ray.init() + distributed_init_port = get_open_port() - processes = [] + refs = [] for rank in range(tensor_parallel_size): - p = Process(target=test_target, - args=(tensor_parallel_size, rank, distributed_init_port)) - p.start() - processes.append(p) - for p in processes: - p.join() - assert all(p.exitcode == 0 for p in processes) + refs.append( + test_target.remote(tensor_parallel_size, rank, + distributed_init_port)) + ray.get(refs) + + ray.shutdown() diff --git a/tests/kernels/test_attention.py b/tests/kernels/test_attention.py index 814d40f56def0..3949948e860f7 100644 --- a/tests/kernels/test_attention.py +++ b/tests/kernels/test_attention.py @@ -13,7 +13,7 @@ # This will change depending on the compute capability. 
# - 512 as a buffer MAX_SEQ_LEN = get_max_shared_memory_bytes() // FLOAT32_BYTES - 512 -NUM_BLOCKS = 40000 # Arbitrary values for testing +NUM_BLOCKS = 12000 # Arbitrary values for testing PARTITION_SIZE = 512 DTYPES = [torch.half, torch.bfloat16, torch.float] diff --git a/tests/kernels/test_cache.py b/tests/kernels/test_cache.py index 1d8d41e013b03..7b1cc058f2cb5 100644 --- a/tests/kernels/test_cache.py +++ b/tests/kernels/test_cache.py @@ -6,12 +6,12 @@ from vllm._C import cache_ops DTYPES = [torch.half, torch.bfloat16, torch.float] -NUM_TOKENS = [83] # Arbitrary values for testing +NUM_TOKENS = [42] # Arbitrary values for testing NUM_LAYERS = [1] # Arbitrary values for testing NUM_HEADS = [8] # Arbitrary values for testing HEAD_SIZES = [64, 80, 96, 112, 128, 256] BLOCK_SIZES = [8, 16, 32] -NUM_BLOCKS = [1024, 36000] # Arbitrary values for testing +NUM_BLOCKS = [1024, 3600] # Arbitrary values for testing NUM_MAPPINGS = [256] # Arbitrary values for testing SEEDS = [0] DEVICES = [i for i in range(1 if torch.cuda.device_count() == 1 else 2)] diff --git a/tests/samplers/test_logprobs.py b/tests/samplers/test_logprobs.py index 1c67cc5bd7394..0ea3704462fcb 100644 --- a/tests/samplers/test_logprobs.py +++ b/tests/samplers/test_logprobs.py @@ -30,6 +30,7 @@ def test_get_prompt_logprobs( temperature=0.0) vllm_results = vllm_model.model.generate( example_prompts, sampling_params=vllm_sampling_params) + del vllm_model # Test whether logprobs are included in the results. for result in vllm_results: diff --git a/tests/samplers/test_sampler.py b/tests/samplers/test_sampler.py index 76aca3ad80a63..996aa8e0a8d9a 100644 --- a/tests/samplers/test_sampler.py +++ b/tests/samplers/test_sampler.py @@ -75,6 +75,8 @@ def test_sampler_all_greedy(seed: int): for nth_output in sequence_output.samples: assert nth_output.output_token == expected[i].item() + del model_runner + @pytest.mark.parametrize("seed", RANDOM_SEEDS) def test_sampler_all_random(seed: int): @@ -111,6 +113,8 @@ def test_sampler_all_random(seed: int): for nth_output in sequence_output.samples: assert nth_output.output_token == i + del model_runner + @pytest.mark.parametrize("seed", RANDOM_SEEDS) def test_sampler_all_beam(seed: int): @@ -144,6 +148,7 @@ def test_sampler_all_beam(seed: int): # the outputs are expected - in other words, this just tests # whether there are no exceptions in the sampler # when handling an all-beam search case. + del model_runner @pytest.mark.parametrize("seed", RANDOM_SEEDS) @@ -198,6 +203,8 @@ def test_sampler_mixed(seed: int): for nth_output in sequence_output.samples: assert nth_output.output_token in expected_tokens + del model_runner + @pytest.mark.parametrize("seed", RANDOM_SEEDS) def test_sampler_logits_processors(seed: int): @@ -235,6 +242,8 @@ def pick_ith(token_ids, logits): for idx, nth_output in enumerate(sequence_output.samples): assert nth_output.output_token == idx + del model_runner + @pytest.mark.parametrize("seed", RANDOM_SEEDS) def test_sampler_top_k_top_p(seed: int): @@ -296,3 +305,5 @@ def mock_sample(probs, logprobs, sampling_metadata): hf_probs = torch.softmax(hf_probs, dim=-1, dtype=torch.float) assert torch.allclose(hf_probs, sample_probs, atol=1e-5) assert torch.equal(hf_probs.eq(0), sample_probs.eq(0)) + + del model_runner
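
For reference, the sed calls in .buildkite/run-benchmarks.sh keep only the first and last line of each benchmark log when assembling benchmark_results.md for the Buildkite annotation. The Python sketch below reproduces that summarization; the summarize() helper is hypothetical and only for illustration, the CI runs the shell script itself.

from pathlib import Path

def summarize(log_file: str, title: str) -> str:
    # Same effect as `sed -n '1p'` and `sed -n '$p'`: first and last line only.
    lines = Path(log_file).read_text().splitlines()
    return f"### {title}\n{lines[0]}\n\n{lines[-1]}\n"

results = summarize("benchmark_latency.txt", "Latency Benchmarks")
results += summarize("benchmark_throughput.txt", "Throughput Benchmarks")
Path("benchmark_results.md").write_text(results)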
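
The two .buildkite files added here work as a pair: test-pipeline.yaml only declares steps (label, command or commands, optional working_dir, num_gpus, soft_fail), and test-template.j2 expands each step into a Kubernetes pod spec that runs the freshly built test image. The patch does not include the script that performs the rendering, so the snippet below is only a sketch of that hand-off, assuming jinja2 and PyYAML are used; it can be handy for checking locally what pipeline a new step will produce.

import yaml                   # PyYAML
from jinja2 import Template   # jinja2

with open(".buildkite/test-pipeline.yaml") as f:
    steps = yaml.safe_load(f)["steps"]

with open(".buildkite/test-template.j2") as f:
    template = Template(f.read())

# Each step supplies label, command/commands, working_dir, num_gpus and
# soft_fail; missing keys fall back to the defaults set in the template.
print(template.render(steps=steps))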
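
The change in tests/distributed/test_comm_ops.py replaces multiprocessing.Process with Ray tasks: each worker is pinned to one GPU via num_gpus=1, max_calls=1 makes Ray recycle the worker process (and its GPU) after the call, and ray.get() re-raises any worker exception in the test process instead of reporting a bare non-zero exit code. The self-contained sketch below shows the same pattern with plain torch.distributed standing in for vLLM's _init_distributed_environment and tensor_model_parallel_all_reduce; the worker and run() names, the port, and the two-GPU assumption are illustrative only.

import ray
import torch
import torch.distributed as dist

@ray.remote(num_gpus=1, max_calls=1)
def all_reduce_worker(world_size: int, rank: int, port: int):
    # Ray sets CUDA_VISIBLE_DEVICES for the task, so "cuda" is the assigned GPU.
    dist.init_process_group(backend="nccl",
                            init_method=f"tcp://localhost:{port}",
                            world_size=world_size,
                            rank=rank)
    t = torch.ones(4, dtype=torch.float32, device="cuda") * (rank + 1)
    dist.all_reduce(t)  # defaults to a sum across ranks
    expected = torch.full_like(t, sum(range(1, world_size + 1)))
    assert torch.allclose(t, expected)

def run(world_size: int = 2, port: int = 29500):
    ray.init()
    try:
        ray.get([
            all_reduce_worker.remote(world_size, rank, port)
            for rank in range(world_size)
        ])
    finally:
        ray.shutdown()

if __name__ == "__main__":
    run()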