Skip to content

Commit e3096c3

Browse files
hmellor and shreyankg
authored and committed
Consolidate Llama model usage in tests (vllm-project#13094)
1 parent bc8c517 commit e3096c3

22 files changed

+44
-52
lines changed

tests/basic_correctness/test_basic_correctness.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717

1818
MODELS = [
1919
"google/gemma-2-2b-it",
20-
"meta-llama/Llama-3.2-1B",
20+
"meta-llama/Llama-3.2-1B-Instruct",
2121
]
2222

2323
TARGET_TEST_SUITE = os.environ.get("TARGET_TEST_SUITE", "L4")
@@ -96,12 +96,12 @@ def test_models(
9696
"test_suite", [
9797
("facebook/opt-125m", "ray", "", "L4"),
9898
("facebook/opt-125m", "mp", "", "L4"),
99-
("meta-llama/Llama-2-7b-hf", "ray", "", "L4"),
100-
("meta-llama/Llama-2-7b-hf", "mp", "", "L4"),
99+
("meta-llama/Llama-3.2-1B-Instruct", "ray", "", "L4"),
100+
("meta-llama/Llama-3.2-1B-Instruct", "mp", "", "L4"),
101101
("facebook/opt-125m", "ray", "", "A100"),
102102
("facebook/opt-125m", "mp", "", "A100"),
103103
("facebook/opt-125m", "mp", "FLASHINFER", "A100"),
104-
("meta-llama/Meta-Llama-3-8B", "ray", "FLASHINFER", "A100"),
104+
("meta-llama/Llama-3.2-1B-Instruct", "ray", "FLASHINFER", "A100"),
105105
])
106106
def test_models_distributed(
107107
hf_runner,
@@ -116,7 +116,7 @@ def test_models_distributed(
116116
if test_suite != TARGET_TEST_SUITE:
117117
pytest.skip(f"Skip test for {test_suite}")
118118

119-
if model == "meta-llama/Llama-2-7b-hf" and distributed_executor_backend == "ray" and attention_backend == "" and test_suite == "L4": # noqa
119+
if model == "meta-llama/Llama-3.2-1B-Instruct" and distributed_executor_backend == "ray" and attention_backend == "" and test_suite == "L4": # noqa
120120
# test ray adag
121121
os.environ['VLLM_USE_RAY_SPMD_WORKER'] = "1"
122122
os.environ['VLLM_USE_RAY_COMPILED_DAG'] = "1"

tests/basic_correctness/test_chunked_prefill.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020

2121
MODELS = [
2222
"facebook/opt-125m",
23-
"meta-llama/Llama-3.2-1B",
23+
"meta-llama/Llama-3.2-1B-Instruct",
2424
]
2525

2626

@@ -92,7 +92,7 @@ def test_models_distributed(
9292
) -> None:
9393
override_backend_env_variable(monkeypatch, attention_backend)
9494

95-
if (model == "meta-llama/Llama-2-7b-hf"
95+
if (model == "meta-llama/Llama-3.2-1B-Instruct"
9696
and distributed_executor_backend == "ray"):
9797
# test ray adag
9898
os.environ['VLLM_USE_RAY_SPMD_WORKER'] = "1"
@@ -221,7 +221,7 @@ def test_with_prefix_caching(
221221
Checks exact match decode with and without prefix caching
222222
with chunked prefill enabled.
223223
"""
224-
model = "meta-llama/Llama-2-7b-chat-hf"
224+
model = "meta-llama/Llama-3.2-1B-Instruct"
225225
# The common prompt has 142 tokens with Llama-2 tokenizer.
226226
common_prompt = "You are a helpful AI assistant " * 20
227227
unique_prompts = [

tests/basic_correctness/test_cpu_offload.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,5 +4,5 @@
44

55

66
def test_cpu_offload():
7-
compare_two_settings("meta-llama/Llama-3.2-1B", [],
7+
compare_two_settings("meta-llama/Llama-3.2-1B-Instruct", [],
88
["--cpu-offload-gb", "1"])

tests/basic_correctness/test_cumem.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -118,7 +118,7 @@ def model(x):
118118
@pytest.mark.parametrize(
119119
"model",
120120
[
121-
"meta-llama/Llama-3.2-1B", # sleep mode with safetensors
121+
"meta-llama/Llama-3.2-1B-Instruct", # sleep mode with safetensors
122122
"facebook/opt-125m" # sleep mode with pytorch checkpoint
123123
])
124124
def test_end_to_end(model):

tests/compile/test_basic_correctness.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ class TestSetting:
2626
test_settings = [
2727
# basic llama model
2828
TestSetting(
29-
model="meta-llama/Llama-3.2-1B",
29+
model="meta-llama/Llama-3.2-1B-Instruct",
3030
model_args=[],
3131
pp_size=2,
3232
tp_size=2,

tests/compile/utils.py

Lines changed: 4 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,6 @@
66

77
from tests.quantization.utils import is_quant_method_supported
88
from vllm import LLM, SamplingParams
9-
from vllm.config import CompilationLevel
109
from vllm.platforms import current_platform
1110

1211
TEST_MODELS = [
@@ -15,14 +14,14 @@
1514
"dtype": torch.float16,
1615
"quantization": "compressed-tensors"
1716
}),
18-
("neuralmagic/Meta-Llama-3-8B-Instruct-FP8", {
17+
("neuralmagic/Llama-3.2-1B-Instruct-FP8-dynamic", {
1918
"dtype": torch.float16,
20-
"quantization": "fp8"
19+
"quantization": "compressed-tensors"
2120
}),
22-
("nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Dyn-Per-Token-2048-Samples", {
21+
("neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8", {
2322
"quantization": "compressed-tensors"
2423
}),
25-
("meta-llama/Meta-Llama-3-8B", {}),
24+
("meta-llama/Llama-3.2-1B-Instruct", {}),
2625
]
2726

2827
if is_quant_method_supported("aqlm"):
@@ -69,11 +68,6 @@ def check_full_graph_support(model,
6968
# make sure these models can be captured in full graph mode
7069
os.environ["VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE"] = "1"
7170

72-
# The base meta llama uses too much memory.
73-
if (model == "meta-llama/Meta-Llama-3-8B"
74-
and optimization_level >= CompilationLevel.PIECEWISE):
75-
return
76-
7771
print(f"MODEL={model}")
7872

7973
prompts = [

tests/distributed/test_pipeline_parallel.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -162,7 +162,7 @@ def iter_params(self, model_id: str):
162162
"internlm/internlm2-chat-7b": PPTestSettings.fast(),
163163
"inceptionai/jais-13b-chat": PPTestSettings.fast(),
164164
"ai21labs/Jamba-tiny-dev": PPTestSettings.fast(),
165-
"meta-llama/Meta-Llama-3-8B": PPTestSettings.detailed(),
165+
"meta-llama/Llama-3.2-1B-Instruct": PPTestSettings.detailed(),
166166
"openbmb/MiniCPM-2B-sft-bf16": PPTestSettings.fast(),
167167
"openbmb/MiniCPM3-4B": PPTestSettings.fast(),
168168
# Uses Llama
@@ -230,7 +230,7 @@ def iter_params(self, model_id: str):
230230
TEST_MODELS = [
231231
# [LANGUAGE GENERATION]
232232
"microsoft/Phi-3.5-MoE-instruct",
233-
"meta-llama/Meta-Llama-3-8B",
233+
"meta-llama/Llama-3.2-1B-Instruct",
234234
"ibm/PowerLM-3b",
235235
# [LANGUAGE EMBEDDING]
236236
"intfloat/e5-mistral-7b-instruct",

tests/entrypoints/openai/test_serving_models.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414
OpenAIServingModels)
1515
from vllm.lora.request import LoRARequest
1616

17-
MODEL_NAME = "meta-llama/Llama-2-7b"
17+
MODEL_NAME = "meta-llama/Llama-3.2-1B-Instruct"
1818
BASE_MODEL_PATHS = [BaseModelPath(name=MODEL_NAME, model_path=MODEL_NAME)]
1919
LORA_LOADING_SUCCESS_MESSAGE = (
2020
"Success: LoRA adapter '{lora_name}' added successfully.")

tests/entrypoints/openai/test_shutdown.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55

66
from ...utils import RemoteOpenAIServer
77

8-
MODEL_NAME = "meta-llama/Llama-3.2-1B"
8+
MODEL_NAME = "meta-llama/Llama-3.2-1B-Instruct"
99

1010

1111
@pytest.mark.asyncio

tests/kv_transfer/disagg_test.py

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ def setup_servers():
2828
"-m",
2929
"vllm.entrypoints.openai.api_server",
3030
"--model",
31-
"meta-llama/Meta-Llama-3.1-8B-Instruct",
31+
"meta-llama/Llama-3.2-1B-Instruct",
3232
"--port",
3333
"8100",
3434
"--gpu-memory-utilization",
@@ -49,7 +49,7 @@ def setup_servers():
4949
"-m",
5050
"vllm.entrypoints.openai.api_server",
5151
"--model",
52-
"meta-llama/Meta-Llama-3.1-8B-Instruct",
52+
"meta-llama/Llama-3.2-1B-Instruct",
5353
"--port",
5454
"8200",
5555
"--gpu-memory-utilization",
@@ -100,8 +100,7 @@ def test_disaggregated_prefilling(prompt):
100100
response = requests.post("http://localhost:8100/v1/completions",
101101
headers={"Content-Type": "application/json"},
102102
json={
103-
"model":
104-
"meta-llama/Meta-Llama-3.1-8B-Instruct",
103+
"model": "meta-llama/Llama-3.2-1B-Instruct",
105104
"prompt": prompt,
106105
"max_tokens": 1,
107106
"temperature": 0
@@ -112,8 +111,7 @@ def test_disaggregated_prefilling(prompt):
112111
response = requests.post("http://localhost:8200/v1/completions",
113112
headers={"Content-Type": "application/json"},
114113
json={
115-
"model":
116-
"meta-llama/Meta-Llama-3.1-8B-Instruct",
114+
"model": "meta-llama/Llama-3.2-1B-Instruct",
117115
"prompt": prompt,
118116
"max_tokens": 10,
119117
"temperature": 0

0 commit comments

Comments (0)