
Commit ab03bdb

DarkLight1337 authored and LeiWang1999 committed
[CI/Build] Reorganize models tests (vllm-project#7820)
Signed-off-by: LeiWang1999 <leiwang1999@outlook.com>
1 parent 2966fe6 commit ab03bdb


55 files changed (+415 -498 lines)

.buildkite/run-cpu-test.sh (+4 -6)

@@ -23,12 +23,10 @@ docker exec cpu-test-avx2 bash -c "python3 examples/offline_inference.py"
 # Run basic model test
 docker exec cpu-test bash -c "
   pip install pytest matplotlib einops transformers_stream_generator
-  pytest -v -s tests/models -m \"not vlm\" --ignore=tests/models/test_embedding.py \
-    --ignore=tests/models/test_oot_registration.py \
-    --ignore=tests/models/test_registry.py \
-    --ignore=tests/models/test_fp8.py \
-    --ignore=tests/models/test_jamba.py \
-    --ignore=tests/models/test_danube3_4b.py" # Mamba and Danube3-4B on CPU is not supported
+  pytest -v -s tests/models/decoder_only/language \
+    --ignore=tests/models/test_fp8.py \
+    --ignore=tests/models/decoder_only/language/test_jamba.py \
+    --ignore=tests/models/decoder_only/language/test_danube3_4b.py" # Mamba and Danube3-4B on CPU is not supported

 # Run compressed-tensor test
 docker exec cpu-test bash -c "

.buildkite/test-pipeline.yaml (+45 -25)

@@ -94,7 +94,6 @@ steps:
   - pytest -v -s entrypoints/test_chat_utils.py
   - pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests

-
 - label: Distributed Tests (4 GPUs) # 10min
   working_dir: "/vllm-workspace/tests"
   num_gpus: 4
@@ -164,30 +163,13 @@ steps:
   - python3 tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
   - python3 offline_inference_encoder_decoder.py

-- label: Models Test # 1hr10min
-  source_file_dependencies:
-  - vllm/
-  - tests/models
-  commands:
-  - pip install -e ./plugins/vllm_add_dummy_model
-  - pytest -v -s models/test_oot_registration.py # it needs a clean process
-  - pytest -v -s models -m \"not vlm\" --ignore=models/test_oot_registration.py
-
 - label: torch compile integration test
   source_file_dependencies:
   - vllm/
   commands:
   - pytest -v -s ./compile/test_full_graph.py
   - pytest -v -s ./compile/test_wrapper.py

-
-- label: Vision Language Models Test # 42min
-  #mirror_hardwares: [amd]
-  source_file_dependencies:
-  - vllm/
-  commands:
-  - pytest -v -s models -m vlm
-
 - label: Prefix Caching Test # 7min
   #mirror_hardwares: [amd]
   source_file_dependencies:
@@ -286,6 +268,45 @@ steps:
   commands:
   - pytest -v -s tool_use

+##### models test #####
+
+- label: Basic Models Test # 3min
+  source_file_dependencies:
+  - vllm/
+  - tests/models
+  commands:
+  - pip install -e ./plugins/vllm_add_dummy_model
+  - pytest -v -s models/test_oot_registration.py # it needs a clean process
+  - pytest -v -s models/*.py --ignore=models/test_oot_registration.py
+
+- label: Decoder-only Language Models Test # 1h3min
+  #mirror_hardwares: [amd]
+  source_file_dependencies:
+  - vllm/
+  - tests/models/decoder_only/language
+  commands:
+  - pytest -v -s models/decoder_only/language
+
+- label: Decoder-only Multi-Modal Models Test # 56min
+  #mirror_hardwares: [amd]
+  source_file_dependencies:
+  - vllm/
+  - tests/models/decoder_only/audio_language
+  - tests/models/decoder_only/vision_language
+  commands:
+  - pytest -v -s models/decoder_only/audio_language
+  - pytest -v -s models/decoder_only/vision_language
+
+- label: Other Models Test # 5min
+  #mirror_hardwares: [amd]
+  source_file_dependencies:
+  - vllm/
+  - tests/models/embedding/language
+  - tests/models/encoder_decoder/language
+  commands:
+  - pytest -v -s models/embedding/language
+  - pytest -v -s models/encoder_decoder/language
+
 ##### 1 GPU test #####
 ##### multi gpus test #####

@@ -311,11 +332,11 @@ steps:
   - tests/distributed/
   commands:
   - # the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up)
-  - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py
+  - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep -q 'Same node test passed'
   - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py
   - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py
   - # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up)
-  - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py
+  - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep -q 'Same node test passed'

 - label: Distributed Tests (2 GPUs) # 28min
   #mirror_hardwares: [amd]
@@ -328,11 +349,10 @@ steps:
   - vllm/model_executor/models/
   - tests/distributed/
   commands:
-  - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py
-  - TARGET_TEST_SUITE=L4 pytest -v -s distributed/test_basic_distributed_correctness.py
-  - pytest -v -s distributed/test_basic_distributed_correctness_enc_dec.py
-  - pytest -v -s distributed/test_chunked_prefill_distributed.py
-  - pytest -v -s distributed/test_multimodal_broadcast.py
+  - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep -q 'Same node test passed'
+  - TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m distributed_2_gpus
+  # Avoid importing model tests that cause CUDA reinitialization error
+  - pytest models/encoder_decoder/language/test_bart.py models/decoder_only/vision_language/test_broadcast.py -v -s -m distributed_2_gpus
   - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py
   - pip install -e ./plugins/vllm_add_dummy_model
   - pytest -v -s distributed/test_distributed_oot.py

docs/source/models/supported_models.rst (+1 -1)

@@ -342,7 +342,7 @@ Note that, as an inference engine, vLLM does not introduce new models. Therefore

 We have the following levels of testing for models:

-1. **Strict Consistency**: We compare the output of the model with the output of the model in the HuggingFace Transformers library under greedy decoding. This is the most stringent test. Please refer to `test_models.py <https://github.com/vllm-project/vllm/blob/main/tests/models/test_models.py>`_ and `test_big_models.py <https://github.com/vllm-project/vllm/blob/main/tests/models/test_big_models.py>`_ for the models that have passed this test.
+1. **Strict Consistency**: We compare the output of the model with the output of the model in the HuggingFace Transformers library under greedy decoding. This is the most stringent test. Please refer to `models tests <https://github.com/vllm-project/vllm/blob/main/tests/models>`_ for the models that have passed this test.
 2. **Output Sensibility**: We check if the output of the model is sensible and coherent, by measuring the perplexity of the output and checking for any obvious errors. This is a less stringent test.
 3. **Runtime Functionality**: We check if the model can be loaded and run without errors. This is the least stringent test. Please refer to `functionality tests <https://github.com/vllm-project/vllm/tree/main/tests>`_ and `examples <https://github.com/vllm-project/vllm/tree/main/examples>`_ for the models that have passed this test.
 4. **Community Feedback**: We rely on the community to provide feedback on the models. If a model is broken or not working as expected, we encourage users to raise issues to report it or open pull requests to fix it. The rest of the models fall under this category.
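For reference, a strict-consistency model test follows the same pattern as the distributed tests added elsewhere in this commit: generate with vLLM and with HuggingFace Transformers under greedy decoding, then compare token for token. A minimal sketch, assuming the hf_runner / vllm_runner fixtures from tests/conftest.py and the check_outputs_equal helper from tests/models/utils.py; the model, dtype, and token budget are illustrative:

# Minimal sketch of a "strict consistency" check; the fixtures come from
# tests/conftest.py and the helper from tests/models/utils.py. The model,
# dtype and max_tokens values are illustrative, not taken from this diff.
import pytest

from ..models.utils import check_outputs_equal


@pytest.mark.parametrize("model", ["facebook/opt-125m"])
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [32])
def test_strict_consistency(hf_runner, vllm_runner, example_prompts, model,
                            dtype, max_tokens) -> None:
    # Greedy decoding keeps both sides deterministic so outputs can be
    # compared exactly.
    with hf_runner(model, dtype=dtype) as hf_model:
        hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)

    with vllm_runner(model, dtype=dtype) as vllm_model:
        vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)

    # Token IDs and decoded text must match exactly.
    check_outputs_equal(
        outputs_0_lst=hf_outputs,
        outputs_1_lst=vllm_outputs,
        name_0="hf",
        name_1="vllm",
    )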

pyproject.toml (+2 -1)

@@ -85,5 +85,6 @@ skip_gitignore = true
 [tool.pytest.ini_options]
 markers = [
     "skip_global_cleanup",
-    "vlm: run tests for vision language models only",
+    "core_model: run this model test in each PR instead of just daily",
+    "distributed_2_gpus: run this test only in distributed tests for 2 GPUs",
 ]
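The two new markers are applied and selected like any other pytest marker; the 2-GPU CI job above picks up the second kind with `pytest -m distributed_2_gpus`. A hedged illustration (test names, bodies, and the model are invented for this example, not code from the commit):

# Illustrative only: how the newly registered markers might be used.
# Test names, bodies, and the model are placeholders, not part of this commit.
import pytest


@pytest.mark.core_model  # run in every PR instead of only the daily model sweep
def test_small_model(vllm_runner, example_prompts):
    with vllm_runner("facebook/opt-125m", dtype="half") as vllm_model:
        assert vllm_model.generate_greedy(example_prompts, 5)


@pytest.mark.distributed_2_gpus  # selected by the 2-GPU CI job via -m
def test_small_model_tp2(vllm_runner, example_prompts):
    with vllm_runner("facebook/opt-125m",
                     dtype="half",
                     tensor_parallel_size=2) as vllm_model:
        assert vllm_model.generate_greedy(example_prompts, 5)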

tests/basic_correctness/test_basic_correctness.py (+62)

@@ -15,12 +15,15 @@
 from vllm.worker.model_runner import ModelInputForGPUWithSamplingMetadata

 from ..models.utils import check_outputs_equal
+from ..utils import multi_gpu_test

 MODELS = [
     "facebook/opt-125m",
     "meta-llama/Llama-2-7b-hf",
 ]

+TARGET_TEST_SUITE = os.environ.get("TARGET_TEST_SUITE", "L4")
+

 def test_vllm_gc_ed():
     """Verify vllm instance is GC'ed when it is deleted"""
@@ -70,6 +73,65 @@ def test_models(
     )


+@multi_gpu_test(num_gpus=2)
+@pytest.mark.parametrize(
+    "model, distributed_executor_backend, attention_backend, "
+    "test_suite", [
+        ("facebook/opt-125m", "ray", "", "L4"),
+        ("facebook/opt-125m", "mp", "", "L4"),
+        ("meta-llama/Llama-2-7b-hf", "ray", "", "L4"),
+        ("meta-llama/Llama-2-7b-hf", "mp", "", "L4"),
+        ("facebook/opt-125m", "ray", "", "A100"),
+        ("facebook/opt-125m", "mp", "", "A100"),
+        ("facebook/opt-125m", "mp", "FLASHINFER", "A100"),
+        ("meta-llama/Meta-Llama-3-8B", "ray", "FLASHINFER", "A100"),
+    ])
+def test_models_distributed(
+    hf_runner,
+    vllm_runner,
+    example_prompts,
+    model: str,
+    distributed_executor_backend: str,
+    attention_backend: str,
+    test_suite: str,
+) -> None:
+
+    if test_suite != TARGET_TEST_SUITE:
+        pytest.skip(f"Skip test for {test_suite}")
+
+    if model == "meta-llama/Llama-2-7b-hf" and distributed_executor_backend == "ray" and attention_backend == "" and test_suite == "L4":  # noqa
+        # test ray adag
+        os.environ['VLLM_USE_RAY_SPMD_WORKER'] = "1"
+        os.environ['VLLM_USE_RAY_COMPILED_DAG'] = "1"
+
+    if attention_backend:
+        os.environ["VLLM_ATTENTION_BACKEND"] = attention_backend
+
+    dtype = "half"
+    max_tokens = 5
+
+    # NOTE: take care of the order. run vLLM first, and then run HF.
+    # vLLM needs a fresh new process without cuda initialization.
+    # if we run HF first, the cuda initialization will be done and it
+    # will hurt multiprocessing backend with fork method (the default method).
+    with vllm_runner(model,
+                     dtype=dtype,
+                     tensor_parallel_size=2,
+                     distributed_executor_backend=distributed_executor_backend
+                     ) as vllm_model:
+        vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
+
+    with hf_runner(model, dtype=dtype) as hf_model:
+        hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
+
+    check_outputs_equal(
+        outputs_0_lst=hf_outputs,
+        outputs_1_lst=vllm_outputs,
+        name_0="hf",
+        name_1="vllm",
+    )
+
+
 def test_model_with_failure(vllm_runner) -> None:
     try:
         with patch("vllm.model_executor.models.opt.OPTForCausalLM.forward",
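Both new test_models_distributed tests rely on the multi_gpu_test helper imported from tests/utils.py, whose implementation is not part of this diff. A plausible sketch, under the assumption that it simply combines a GPU-count skip with the new distributed_2_gpus marker:

# Hypothetical sketch of tests/utils.py::multi_gpu_test; the real helper is
# not shown in this commit, so treat every detail here as an assumption.
from typing import Callable, TypeVar

import pytest
import torch

_F = TypeVar("_F", bound=Callable[..., None])


def multi_gpu_test(*, num_gpus: int) -> Callable[[_F], _F]:
    """Skip the test unless `num_gpus` GPUs are visible, and tag it so the
    distributed CI job can select it with `-m distributed_{num_gpus}_gpus`."""

    def wrapper(func: _F) -> _F:
        # Skip locally when not enough GPUs are available.
        func = pytest.mark.skipif(
            torch.cuda.device_count() < num_gpus,
            reason=f"Need at least {num_gpus} GPUs to run the test.",
        )(func)
        # Tag for selection by the distributed CI pipeline.
        return getattr(pytest.mark, f"distributed_{num_gpus}_gpus")(func)

    return wrapper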

tests/basic_correctness/test_chunked_prefill.py (+55)

@@ -6,11 +6,13 @@

 Run `pytest tests/models/test_chunked_prefill.py`.
 """
+import os
 from contextlib import nullcontext

 import pytest

 from ..models.utils import check_logprobs_close, check_outputs_equal
+from ..utils import multi_gpu_test

 MODELS = [
     "facebook/opt-125m",
@@ -66,6 +68,59 @@ def test_models(
     )


+@multi_gpu_test(num_gpus=2)
+@pytest.mark.parametrize("distributed_executor_backend", ["ray", "mp"])
+@pytest.mark.parametrize("model", MODELS)
+def test_models_distributed(
+    hf_runner,
+    vllm_runner,
+    example_prompts,
+    model: str,
+    distributed_executor_backend: str,
+) -> None:
+    if (model == "meta-llama/Llama-2-7b-hf"
+            and distributed_executor_backend == "ray"):
+        # test ray adag
+        os.environ['VLLM_USE_RAY_SPMD_WORKER'] = "1"
+        os.environ['VLLM_USE_RAY_COMPILED_DAG'] = "1"
+
+    dtype = "half"
+    max_tokens = 5
+    chunked_prefill_token_size = 16
+
+    # Add a chunked prefill config.
+    max_num_seqs = min(chunked_prefill_token_size, 256)
+    assert chunked_prefill_token_size != -1
+    enable_chunked_prefill = True
+    max_num_batched_tokens = chunked_prefill_token_size
+
+    # NOTE: take care of the order. run vLLM first, and then run HF.
+    # vLLM needs a fresh new process without cuda initialization.
+    # if we run HF first, the cuda initialization will be done and it
+    # will hurt multiprocessing backend with fork method (the default method).
+
+    with vllm_runner(
+            model,
+            dtype=dtype,
+            tensor_parallel_size=2,
+            max_num_seqs=max_num_seqs,
+            enable_chunked_prefill=enable_chunked_prefill,
+            max_num_batched_tokens=max_num_batched_tokens,
+            distributed_executor_backend=distributed_executor_backend,
+    ) as vllm_model:
+        vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
+
+    with hf_runner(model, dtype=dtype) as hf_model:
+        hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
+
+    check_outputs_equal(
+        outputs_0_lst=hf_outputs,
+        outputs_1_lst=vllm_outputs,
+        name_0="hf",
+        name_1="vllm",
+    )
+
+
 @pytest.mark.parametrize(
     "kv_cache_dtype,model",
     [("fp8_e4m3",

tests/basic_correctness/test_preemption.py (+7 -4)

@@ -19,10 +19,13 @@
     "facebook/opt-125m",
 ]

-assert ENABLE_ARTIFICIAL_PREEMPT is True, (
-    "Use an env var VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1. "
-    "`VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest "
-    "tests/basic_correctness/test_preemption.py`")
+
+@pytest.fixture(scope="module", autouse=True)
+def check_settings():
+    assert ENABLE_ARTIFICIAL_PREEMPT is True, (
+        "Use an env var VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1. "
+        "`VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest "
+        "tests/basic_correctness/test_preemption.py`")


 @pytest.fixture

tests/conftest.py (+12 -17)

@@ -6,8 +6,8 @@
 import tempfile
 from collections import UserList
 from enum import Enum
-from typing import (Any, Callable, Dict, List, Optional, Tuple, TypedDict,
-                    TypeVar, Union)
+from typing import (Any, Callable, Dict, List, Optional, Tuple, Type,
+                    TypedDict, TypeVar, Union)

 import numpy as np
 import pytest
@@ -18,6 +18,7 @@
 from PIL import Image
 from transformers import (AutoModelForCausalLM, AutoTokenizer, BatchEncoding,
                           BatchFeature)
+from transformers.models.auto.auto_factory import _BaseAutoModelClass

 from vllm import LLM, SamplingParams
 from vllm.assets.image import ImageAsset
@@ -260,7 +261,7 @@ def __init__(
         *,
         model_kwargs: Optional[Dict[str, Any]] = None,
         is_embedding_model: bool = False,
-        auto_cls=AutoModelForCausalLM,
+        auto_cls: Type[_BaseAutoModelClass] = AutoModelForCausalLM,
         postprocess_inputs: Callable[[BatchEncoding],
                                      BatchEncoding] = identity,
     ) -> None:
@@ -292,20 +293,14 @@ def __init__(
             trust_remote_code=True,
         )

-        try:
-            # don't put this import at the top level
-            # it will call torch.cuda.device_count()
-            from transformers import AutoProcessor  # noqa: F401
-            self.processor = AutoProcessor.from_pretrained(
-                model_name,
-                torch_dtype=torch_dtype,
-                trust_remote_code=True,
-            )
-        except Exception as exc:
-            logger.warning(
-                "Unable to auto-load HuggingFace processor for model (%s). "
-                "Using tokenizer instead. Reason: %s", model_name, exc)
-            self.processor = self.tokenizer
+        # don't put this import at the top level
+        # it will call torch.cuda.device_count()
+        from transformers import AutoProcessor  # noqa: F401
+        self.processor = AutoProcessor.from_pretrained(
+            model_name,
+            torch_dtype=torch_dtype,
+            trust_remote_code=True,
+        )

         self.postprocess_inputs = postprocess_inputs

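The now-typed auto_cls parameter is the hook that lets individual model tests pick a Transformers loader other than AutoModelForCausalLM. A hedged usage sketch (the model and auto class are example values, not taken from this diff):

# Illustrative only: selecting a non-default loader through auto_cls.
# The model name and auto class are example values, not part of this commit.
from transformers import AutoModelForVision2Seq


def test_example_vlm(hf_runner, example_prompts):
    # auto_cls picks the Transformers loader that matches this architecture.
    with hf_runner("llava-hf/llava-1.5-7b-hf",
                   dtype="half",
                   auto_cls=AutoModelForVision2Seq) as hf_model:
        hf_outputs = hf_model.generate_greedy(example_prompts, 5)
    assert hf_outputs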