[Build] Add initial conditional testing spec #6841

Merged (8 commits, Aug 6, 2024)

Changes from 2 commits
123 changes: 88 additions & 35 deletions .buildkite/test-pipeline.yaml
@@ -5,11 +5,27 @@
# https://github.com/vllm-project/buildkite-ci/blob/main/scripts/test-template-aws.j2
# to generate the final pipeline yaml file.

# Documentation
# label(str): the name of the test. emoji allowed.
# fast_check(bool): whether to run this on each commit by default, without the /ready tag.
Collaborator: whether to run this on each commit on fastcheck pipeline

# fast_check_only(bool): whether to skip this test on full suite.
Collaborator: Run this test on fastcheck pipeline only

# command(str): the single command to run for the test. incompatible with commands.
# commands(list): the list of commands to run for the test. incompatible with command.
# mirror_hardwares(list): the list of additional hardware platforms to run the test on. currently only supports [amd]
# gpu(str): override the GPU selection for the test. defaults to L4 GPUs. currently only supports a100
# num_gpus(int): override the number of GPUs for the test. defaults to 1 GPU. currently supports 2 and 4.
# num_nodes(int): whether to simulate a multi-node setup by launching multiple containers on one host,
#   in this case, commands must be specified. the first command runs on the first host, the second
#   command runs on the second host.
# working_dir(str): override the place where the command executes. defaults to "/vllm-workspace/tests".
Collaborator: Specify the place where command should execute, default to /vllm-workspace/tests

# source_file_dependencies(list): the list of path prefixes to opt the test in for; if empty, the test will always run.
Collaborator (author): @khluu can you review some of the docs here? I'm afraid I might have misinterpreted some fields.

Collaborator: done!
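
To make the fields above concrete, here is a minimal sketch of a step that uses most of them. The label, dependency prefixes, and pytest target are hypothetical examples, not steps from this PR:

- label: Example Conditional Test            # hypothetical step, for illustration only
  fast_check: true                           # also run on the fastcheck pipeline
  working_dir: "/vllm-workspace/tests"       # the default working directory, shown explicitly
  num_gpus: 2                                # override the default of 1 GPU
  source_file_dependencies:                  # run only when a changed file matches one of these prefixes
    - vllm/example_module                    # hypothetical prefix
    - tests/example                          # hypothetical prefix
  commands:
    - pytest -v -s example                   # hypothetical test target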


steps:
- label: Async Engine, Inputs, Utils, Worker Test
fast_check: true
fast_check_only: true
source_file_dependencies:
- vllm/
commands:
- pytest -v -s async_engine # Async Engine
- pytest -v -s test_inputs.py
@@ -19,7 +35,8 @@ steps:

- label: Metrics, Tracing Test
Collaborator (author): moved to later section

fast_check: true
fast_check_only: true
source_file_dependencies:
- vllm/
commands:
- pytest -v -s metrics # Metrics
- "pip install \
@@ -31,17 +48,17 @@

- label: Regression Test
mirror_hardwares: [amd]
fast_check: true
fast_check: false
source_file_dependencies:
- vllm/
command: pytest -v -s test_regression.py
working_dir: "/vllm-workspace/tests" # optional

- label: AsyncEngine Test
Collaborator (author): removed because it is covered in Async Engine, Inputs, Utils, Worker Test

#mirror_hardwares: [amd]
command: pytest -v -s async_engine

- label: Basic Correctness Test
mirror_hardwares: [amd]
fast_check: true
source_file_dependencies:
- vllm/
commands:
# This flashinfer installation will fail on AMD ROCm, so it is set as optional.
- pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.0.8/flashinfer-0.0.8+cu121torch2.3-cp310-cp310-linux_x86_64.whl || true
@@ -54,14 +71,18 @@
- label: Core Test
mirror_hardwares: [amd]
fast_check: true
source_file_dependencies:
Collaborator: Besides the source file dependencies, I think we should add the tests themselves as dependencies too. Like if tests/core/ changes, this test should also run. Maybe we can rename source_file_dependencies to dependencies to be inclusive? (See the sketch after this step.)

- vllm/core
- vllm/distributed
commands:
- pytest -v -s core
- pytest -v -s distributed/test_parallel_state.py
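
A minimal sketch of the reviewer's suggestion above (hypothetical, not part of this PR): the Core Test dependency list could also include the test directories, so that a change under tests/core also triggers the step.

- label: Core Test
  source_file_dependencies:                  # hypothetical: also opt in when the tests themselves change
    - vllm/core
    - vllm/distributed
    - tests/core
    - tests/distributed/test_parallel_state.py
  commands:
    - pytest -v -s core
    - pytest -v -s distributed/test_parallel_state.py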

- label: Distributed Comm Ops Test
#mirror_hardwares: [amd]
working_dir: "/vllm-workspace/tests"
num_gpus: 2
source_file_dependencies:
- vllm/distributed
commands:
- pytest -v -s distributed/test_comm_ops.py
- pytest -v -s distributed/test_shm_broadcast.py
@@ -70,6 +91,8 @@
working_dir: "/vllm-workspace/tests"
num_gpus: 2
num_nodes: 2
source_file_dependencies:
- vllm/distributed
commands:
- # the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up)
- VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py
@@ -81,6 +104,8 @@
mirror_hardwares: [amd]
working_dir: "/vllm-workspace/tests"
num_gpus: 2
source_file_dependencies:
- vllm/
commands:
- VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py
- TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
@@ -102,10 +127,11 @@
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s distributed/test_utils.py

- label: Distributed Tests (4 GPUs)
#mirror_hardwares: [amd]
working_dir: "/vllm-workspace/tests"
num_gpus: 4
fast_check: true
source_file_dependencies:
- vllm/
commands:
- pytest -v -s distributed/test_pynccl.py
# We want to test that models which use 2 GPUs work with 4 GPUs, which is why we duplicate them here.
@@ -118,11 +144,15 @@
- label: Pipeline Parallelism Test
working_dir: "/vllm-workspace/tests"
num_gpus: 4
source_file_dependencies:
- vllm/
commands:
- pytest -v -s distributed/test_pipeline_parallel.py

- label: Engine Test
mirror_hardwares: [amd]
source_file_dependencies:
- vllm/
commands:
- pytest -v -s engine test_sequence.py test_config.py test_logger.py
# OOM in the CI unless we run this separately
@@ -131,18 +161,20 @@
- label: Entrypoints Test
fast_check: true
mirror_hardwares: [amd]

source_file_dependencies:
- vllm/entrypoints
commands:
- pytest -v -s entrypoints/llm
- pytest -v -s entrypoints/openai

- label: Examples Test
working_dir: "/vllm-workspace/examples"
mirror_hardwares: [amd]
source_file_dependencies:
- vllm/entrypoints
- examples/
commands:
# install aws cli for llava_example.py
# install tensorizer for tensorize_vllm_model.py
- pip install awscli tensorizer
- pip install awscli tensorizer # for llava example and tensorizer test
- python3 offline_inference.py
- python3 cpu_offload.py
- python3 offline_inference_with_prefix.py
@@ -151,108 +183,123 @@
- python3 tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors

- label: Inputs Test
#mirror_hardwares: [amd]
source_file_dependencies:
- vllm/
commands:
- pytest -v -s test_inputs.py
- pytest -v -s multimodal

- label: Kernels Test %N
#mirror_hardwares: [amd]
source_file_dependencies:
- csrc/
commands:
- pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.0.8/flashinfer-0.0.8+cu121torch2.3-cp310-cp310-linux_x86_64.whl
- pytest -v -s kernels --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
parallelism: 4

- label: Models Test
#mirror_hardwares: [amd]
source_file_dependencies:
- vllm/
commands:
- pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.0.8/flashinfer-0.0.8+cu121torch2.3-cp310-cp310-linux_x86_64.whl
- pytest -v -s models -m \"not vlm\"

- label: Vision Language Models Test
mirror_hardwares: [amd]
source_file_dependencies:
- vllm/
commands:
- pytest -v -s models -m vlm

- label: Prefix Caching Test
mirror_hardwares: [amd]
source_file_dependencies:
- vllm/
commands:
- pytest -v -s prefix_caching

- label: Samplers Test
#mirror_hardwares: [amd]
source_file_dependencies:
- vllm/model_executor/layers
- vllm/sampling_metadata.py
command: pytest -v -s samplers

- label: LogitsProcessor Test
mirror_hardwares: [amd]
source_file_dependencies:
- vllm/model_executor/layers
command: pytest -v -s test_logits_processor.py

- label: Utils Test
source_file_dependencies:
- vllm/
commands:
- pytest -v -s test_utils.py
- pytest -v -s test_embedded_commit.py

- label: Worker Test
mirror_hardwares: [amd]
source_file_dependencies:
- vllm/worker
command: pytest -v -s worker

- label: Speculative decoding tests
#mirror_hardwares: [amd]
source_file_dependencies:
- vllm/spec_decode
commands:
# See https://github.com/vllm-project/vllm/issues/5152
- export VLLM_ATTENTION_BACKEND=XFORMERS
- pytest -v -s spec_decode

- label: LoRA Test %N
#mirror_hardwares: [amd]
source_file_dependencies:
- vllm/lora
- csrc/punica
command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_long_context.py
parallelism: 4

- label: LoRA Long Context (Distributed)
#mirror_hardwares: [amd]
num_gpus: 4
# This test runs llama 13B, so it is required to run on 4 GPUs.
num_gpus: 4
source_file_dependencies:
- vllm/lora
- csrc/punica
commands:
# FIXIT: find out which code initialize cuda before running the test
# before the fix, we need to use spawn to test it
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -v -s -x lora/test_long_context.py

- label: Tensorizer Test
#mirror_hardwares: [amd]
soft_fail: true
fast_check: true
source_file_dependencies:
- vllm/model_executor/model_loader
commands:
- apt-get install -y curl libsodium23
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -v -s tensorizer_loader

- label: Metrics Test
Collaborator (author): covered in fastcheck that combined metrics and tracing

mirror_hardwares: [amd]
command: pytest -v -s metrics

- label: Quantization Test
#mirror_hardwares: [amd]
source_file_dependencies:
- csrc/
- vllm/model_executor/layers/quantization
command: pytest -v -s quantization

- label: Tracing Test
commands:
- "pip install \
opentelemetry-sdk \
opentelemetry-api \
opentelemetry-exporter-otlp \
opentelemetry-semantic-conventions-ai"
- pytest -v -s tracing
Collaborator (author), commenting on removed lines 219 to 226: covered in fastcheck that combined metrics and tracing


- label: Benchmarks
working_dir: "/vllm-workspace/.buildkite"
mirror_hardwares: [amd]
source_file_dependencies:
- benchmarks/
commands:
- pip install aiohttp
- bash run-benchmarks.sh

- label: LM Eval Small Models
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
source_file_dependencies:
- csrc/
- vllm/
commands:
- pip install lm-eval
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
@@ -262,6 +309,9 @@
gpu: a100
num_gpus: 4
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
source_file_dependencies:
- csrc/
- vllm/
commands:
- pip install lm-eval
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
@@ -271,13 +321,16 @@
working_dir: "/vllm-workspace/test_docs/docs"
fast_check: true
no_gpu: True
source_file_dependencies: [] # always run
commands:
- pip install -r requirements-docs.txt
- SPHINXOPTS=\"-W\" make html

- label: Distributed Tests (A100)
gpu: a100
num_gpus: 4
source_file_dependencies:
- vllm/
commands:
# NOTE: don't test llama model here, it seems hf implementation is buggy
# see https://github.com/vllm-project/vllm/pull/5689 for details