tests/basic_correctness/test_basic_correctness.py (10 changes: 5 additions & 5 deletions)
@@ -17,7 +17,7 @@

MODELS = [
"google/gemma-2-2b-it",
"meta-llama/Llama-3.2-1B",
"meta-llama/Llama-3.2-1B-Instruct",
]

TARGET_TEST_SUITE = os.environ.get("TARGET_TEST_SUITE", "L4")
@@ -96,12 +96,12 @@ def test_models(
"test_suite", [
("facebook/opt-125m", "ray", "", "L4"),
("facebook/opt-125m", "mp", "", "L4"),
("meta-llama/Llama-2-7b-hf", "ray", "", "L4"),
("meta-llama/Llama-2-7b-hf", "mp", "", "L4"),
("meta-llama/Llama-3.2-1B-Instruct", "ray", "", "L4"),
("meta-llama/Llama-3.2-1B-Instruct", "mp", "", "L4"),
("facebook/opt-125m", "ray", "", "A100"),
("facebook/opt-125m", "mp", "", "A100"),
("facebook/opt-125m", "mp", "FLASHINFER", "A100"),
("meta-llama/Meta-Llama-3-8B", "ray", "FLASHINFER", "A100"),
("meta-llama/Llama-3.2-1B-Instruct", "ray", "FLASHINFER", "A100"),
])
def test_models_distributed(
hf_runner,
@@ -116,7 +116,7 @@ def test_models_distributed(
if test_suite != TARGET_TEST_SUITE:
pytest.skip(f"Skip test for {test_suite}")

if model == "meta-llama/Llama-2-7b-hf" and distributed_executor_backend == "ray" and attention_backend == "" and test_suite == "L4": # noqa
if model == "meta-llama/Llama-3.2-1B-Instruct" and distributed_executor_backend == "ray" and attention_backend == "" and test_suite == "L4": # noqa
# test ray adag
os.environ['VLLM_USE_RAY_SPMD_WORKER'] = "1"
os.environ['VLLM_USE_RAY_COMPILED_DAG'] = "1"
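Note (editor's illustration, not part of the diff): the two environment variables above switch the Ray backend to SPMD workers and the compiled-DAG ("adag") path. A rough offline sketch of the configuration this test exercises, assuming two GPUs and vLLM's distributed_executor_backend / tensor_parallel_size engine arguments:

import os

# Must be set before the engine is built.
os.environ["VLLM_USE_RAY_SPMD_WORKER"] = "1"
os.environ["VLLM_USE_RAY_COMPILED_DAG"] = "1"

from vllm import LLM, SamplingParams

llm = LLM(model="meta-llama/Llama-3.2-1B-Instruct",
          distributed_executor_backend="ray",
          tensor_parallel_size=2)
outputs = llm.generate(["Hello, my name is"],
                       SamplingParams(temperature=0, max_tokens=16))
print(outputs[0].outputs[0].text)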

tests/basic_correctness/test_chunked_prefill.py (6 changes: 3 additions & 3 deletions)
@@ -20,7 +20,7 @@

MODELS = [
"facebook/opt-125m",
"meta-llama/Llama-3.2-1B",
"meta-llama/Llama-3.2-1B-Instruct",
]


@@ -92,7 +92,7 @@ def test_models_distributed(
) -> None:
override_backend_env_variable(monkeypatch, attention_backend)

if (model == "meta-llama/Llama-2-7b-hf"
if (model == "meta-llama/Llama-3.2-1B-Instruct"
and distributed_executor_backend == "ray"):
# test ray adag
os.environ['VLLM_USE_RAY_SPMD_WORKER'] = "1"
@@ -221,7 +221,7 @@ def test_with_prefix_caching(
Checks exact match decode with and without prefix caching
with chunked prefill enabled.
"""
model = "meta-llama/Llama-2-7b-chat-hf"
model = "meta-llama/Llama-3.2-1B-Instruct"
# The common prompt has 142 tokens with Llama-2 tokenizer.
common_prompt = "You are a helpful AI assistant " * 20
unique_prompts = [
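Note (editor's illustration, not part of the diff): test_with_prefix_caching checks exact-match decoding with and without prefix caching while chunked prefill is enabled. A minimal offline sketch, assuming vLLM's enable_chunked_prefill, enable_prefix_caching, and max_num_batched_tokens engine arguments; the small token budget is an arbitrary choice to force prefills to be chunked:

from vllm import LLM, SamplingParams

llm = LLM(model="meta-llama/Llama-3.2-1B-Instruct",
          enable_chunked_prefill=True,
          enable_prefix_caching=True,
          max_num_batched_tokens=256)  # deliberately small so prefills get split

common_prompt = "You are a helpful AI assistant " * 20
prompts = [common_prompt + q for q in ("What is 2 + 2?", "Name a color.")]
for out in llm.generate(prompts, SamplingParams(temperature=0, max_tokens=8)):
    print(out.outputs[0].text)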

tests/basic_correctness/test_cpu_offload.py (2 changes: 1 addition & 1 deletion)
@@ -4,5 +4,5 @@


def test_cpu_offload():
compare_two_settings("meta-llama/Llama-3.2-1B", [],
compare_two_settings("meta-llama/Llama-3.2-1B-Instruct", [],
["--cpu-offload-gb", "1"])

tests/basic_correctness/test_cumem.py (2 changes: 1 addition & 1 deletion)
@@ -118,7 +118,7 @@ def model(x):
@pytest.mark.parametrize(
"model",
[
"meta-llama/Llama-3.2-1B", # sleep mode with safetensors
"meta-llama/Llama-3.2-1B-Instruct", # sleep mode with safetensors
"facebook/opt-125m" # sleep mode with pytorch checkpoint
])
def test_end_to_end(model):
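Note (editor's illustration, not part of the diff): test_cumem exercises sleep mode, which releases GPU memory between generations. A rough sketch, assuming the enable_sleep_mode flag and the sleep()/wake_up() methods this test file relies on:

from vllm import LLM, SamplingParams

llm = LLM(model="meta-llama/Llama-3.2-1B-Instruct", enable_sleep_mode=True)
print(llm.generate(["Hi"], SamplingParams(max_tokens=4))[0].outputs[0].text)

llm.sleep(level=1)   # drop the KV cache and offload weights to CPU RAM
llm.wake_up()        # restore weights and resume serving
print(llm.generate(["Hi again"], SamplingParams(max_tokens=4))[0].outputs[0].text)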

tests/compile/test_basic_correctness.py (2 changes: 1 addition & 1 deletion)
@@ -26,7 +26,7 @@ class TestSetting:
test_settings = [
# basic llama model
TestSetting(
model="meta-llama/Llama-3.2-1B",
model="meta-llama/Llama-3.2-1B-Instruct",
model_args=[],
pp_size=2,
tp_size=2,

tests/compile/utils.py (14 changes: 4 additions & 10 deletions)
@@ -6,7 +6,6 @@

from tests.quantization.utils import is_quant_method_supported
from vllm import LLM, SamplingParams
- from vllm.config import CompilationLevel
from vllm.platforms import current_platform

TEST_MODELS = [
@@ -15,14 +14,14 @@
"dtype": torch.float16,
"quantization": "compressed-tensors"
}),
("neuralmagic/Meta-Llama-3-8B-Instruct-FP8", {
("neuralmagic/Llama-3.2-1B-Instruct-FP8-dynamic", {
"dtype": torch.float16,
"quantization": "fp8"
"quantization": "compressed-tensors"
}),
("nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Dyn-Per-Token-2048-Samples", {
("neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8", {
"quantization": "compressed-tensors"
}),
("meta-llama/Meta-Llama-3-8B", {}),
("meta-llama/Llama-3.2-1B-Instruct", {}),
]

if is_quant_method_supported("aqlm"):
@@ -69,11 +68,6 @@ def check_full_graph_support(model,
# make sure these models can be captured in full graph mode
os.environ["VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE"] = "1"

- # The base meta llama uses too much memory.
- if (model == "meta-llama/Meta-Llama-3-8B"
-         and optimization_level >= CompilationLevel.PIECEWISE):
-     return

print(f"MODEL={model}")

prompts = [

tests/distributed/test_pipeline_parallel.py (4 changes: 2 additions & 2 deletions)
@@ -162,7 +162,7 @@ def iter_params(self, model_id: str):
"internlm/internlm2-chat-7b": PPTestSettings.fast(),
"inceptionai/jais-13b-chat": PPTestSettings.fast(),
"ai21labs/Jamba-tiny-dev": PPTestSettings.fast(),
"meta-llama/Meta-Llama-3-8B": PPTestSettings.detailed(),
"meta-llama/Llama-3.2-1B-Instruct": PPTestSettings.detailed(),
"openbmb/MiniCPM-2B-sft-bf16": PPTestSettings.fast(),
"openbmb/MiniCPM3-4B": PPTestSettings.fast(),
# Uses Llama
@@ -230,7 +230,7 @@ def iter_params(self, model_id: str):
TEST_MODELS = [
# [LANGUAGE GENERATION]
"microsoft/Phi-3.5-MoE-instruct",
"meta-llama/Meta-Llama-3-8B",
"meta-llama/Llama-3.2-1B-Instruct",
"ibm/PowerLM-3b",
# [LANGUAGE EMBEDDING]
"intfloat/e5-mistral-7b-instruct",

tests/entrypoints/openai/test_serving_models.py (2 changes: 1 addition & 1 deletion)
@@ -14,7 +14,7 @@
OpenAIServingModels)
from vllm.lora.request import LoRARequest

MODEL_NAME = "meta-llama/Llama-2-7b"
MODEL_NAME = "meta-llama/Llama-3.2-1B-Instruct"
BASE_MODEL_PATHS = [BaseModelPath(name=MODEL_NAME, model_path=MODEL_NAME)]
LORA_LOADING_SUCCESS_MESSAGE = (
"Success: LoRA adapter '{lora_name}' added successfully.")

tests/entrypoints/openai/test_shutdown.py (2 changes: 1 addition & 1 deletion)
@@ -5,7 +5,7 @@

from ...utils import RemoteOpenAIServer

MODEL_NAME = "meta-llama/Llama-3.2-1B"
MODEL_NAME = "meta-llama/Llama-3.2-1B-Instruct"


@pytest.mark.asyncio

tests/kv_transfer/disagg_test.py (10 changes: 4 additions & 6 deletions)
@@ -28,7 +28,7 @@ def setup_servers():
"-m",
"vllm.entrypoints.openai.api_server",
"--model",
"meta-llama/Meta-Llama-3.1-8B-Instruct",
"meta-llama/Llama-3.2-1B-Instruct",
"--port",
"8100",
"--gpu-memory-utilization",
@@ -49,7 +49,7 @@ def setup_servers():
"-m",
"vllm.entrypoints.openai.api_server",
"--model",
"meta-llama/Meta-Llama-3.1-8B-Instruct",
"meta-llama/Llama-3.2-1B-Instruct",
"--port",
"8200",
"--gpu-memory-utilization",
@@ -100,8 +100,7 @@ def test_disaggregated_prefilling(prompt):
response = requests.post("http://localhost:8100/v1/completions",
headers={"Content-Type": "application/json"},
json={
"model":
"meta-llama/Meta-Llama-3.1-8B-Instruct",
"model": "meta-llama/Llama-3.2-1B-Instruct",
"prompt": prompt,
"max_tokens": 1,
"temperature": 0
@@ -112,8 +111,7 @@ def test_disaggregated_prefilling(prompt):
response = requests.post("http://localhost:8200/v1/completions",
headers={"Content-Type": "application/json"},
json={
"model":
"meta-llama/Meta-Llama-3.1-8B-Instruct",
"model": "meta-llama/Llama-3.2-1B-Instruct",
"prompt": prompt,
"max_tokens": 10,
"temperature": 0

tests/models/decoder_only/language/test_fp8.py (8 changes: 4 additions & 4 deletions)
@@ -26,12 +26,12 @@
# Test FP8 checkpoint w. fp8_e4m3 kv-cache scaling factors.
("fp8_e4m3", "meta-llama/Llama-3.2-1B-Instruct",
"nm-testing/Llama-3.2-1B-Instruct-FP8-KV"),
- # Test FP16 checkpoint w. fp8_e5m2 kv-cache.
+ # Test BF16 checkpoint w. fp8_e5m2 kv-cache.
("fp8_e5m2", "meta-llama/Llama-3.2-1B-Instruct",
"meta-llama/Llama-3.2-1B-Instruct"),
- # Test FP16 checkpoint w. fp8_e4m3 kv-cache scaling factors in json.
- ("fp8_e4m3", "meta-llama/Llama-2-7b-chat-hf",
-  "meta-llama/Llama-2-7b-chat-hf")
+ # Test BF16 checkpoint w. fp8_e4m3 kv-cache scaling factors in json.
+ ("fp8_e4m3", "meta-llama/Llama-3.2-1B-Instruct",
+  "meta-llama/Llama-3.2-1B-Instruct")
])
# Due to low-precision numerical divergence, we only test logprob of 4 tokens
@pytest.mark.parametrize("max_tokens", [4])
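Note (editor's illustration, not part of the diff): each entry above pairs a kv-cache dtype with a reference checkpoint; fp8_e5m2 needs no scaling factors, while fp8_e4m3 picks up scales from the checkpoint when they are present. A minimal sketch of the fp8_e5m2 case, assuming vLLM's kv_cache_dtype engine argument:

from vllm import LLM, SamplingParams

# BF16 weights with the KV cache stored as fp8_e5m2 (no scales required).
llm = LLM(model="meta-llama/Llama-3.2-1B-Instruct", kv_cache_dtype="fp8_e5m2")
out = llm.generate(["San Francisco is"],
                   SamplingParams(temperature=0, max_tokens=4))
print(out[0].outputs[0].text)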

tests/models/registry.py (2 changes: 1 addition & 1 deletion)
@@ -141,7 +141,7 @@ def check_available_online(
"JAISLMHeadModel": _HfExamplesInfo("inceptionai/jais-13b-chat"),
"JambaForCausalLM": _HfExamplesInfo("ai21labs/AI21-Jamba-1.5-Mini",
extras={"tiny": "ai21labs/Jamba-tiny-dev"}), # noqa: E501
"LlamaForCausalLM": _HfExamplesInfo("meta-llama/Meta-Llama-3-8B"),
"LlamaForCausalLM": _HfExamplesInfo("meta-llama/Llama-3.2-1B-Instruct"),
"LLaMAForCausalLM": _HfExamplesInfo("decapoda-research/llama-7b-hf",
is_available_online=False),
"MambaForCausalLM": _HfExamplesInfo("state-spaces/mamba-130m-hf"),

tests/quantization/test_register_quantization_config.py (2 changes: 1 addition & 1 deletion)
@@ -99,7 +99,7 @@ def test_register_quantization_config():

@pytest.mark.parametrize(argnames="model",
argvalues=[
"meta-llama/Meta-Llama-3-8B-Instruct",
"meta-llama/Llama-3.2-1B-Instruct",
])
def test_custom_quant(vllm_runner, model):
"""Test infer with the custom quantization method."""

tests/samplers/test_ignore_eos.py (2 changes: 1 addition & 1 deletion)
@@ -10,7 +10,7 @@

# We also test with llama because it has generation_config to specify EOS
# (past regression).
MODELS = ["facebook/opt-125m", "meta-llama/Llama-2-7b-hf"]
MODELS = ["facebook/opt-125m", "meta-llama/Llama-3.2-1B-Instruct"]


@pytest.mark.parametrize("model", MODELS)

tests/spec_decode/e2e/test_compatibility.py (6 changes: 3 additions & 3 deletions)
@@ -8,7 +8,7 @@


@pytest.mark.parametrize("common_llm_kwargs", [{
"model": "meta-llama/Llama-2-7b-chat-hf",
"model": "meta-llama/Llama-3.2-1B-Instruct",
"speculative_model": "JackFram/llama-68m",
"num_speculative_tokens": 5,
}])
@@ -27,8 +27,8 @@
},
{
# Speculative max model len > target max model len should raise.
- # https://huggingface.co/meta-llama/Llama-2-7b-chat-hf/blob/f5db02db724555f92da89c216ac04704f23d4590/config.json#L12
- "speculative_max_model_len": 4096 + 1,
+ # https://huggingface.co/meta-llama/Llama-3.2-1B-Instruct/blob/9213176726f574b556790deb65791e0c5aa438b6/config.json#L18
+ "speculative_max_model_len": 131072 + 1,
},
])
@pytest.mark.parametrize("test_llm_kwargs", [{}])
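Note (editor's illustration, not part of the diff): Llama-3.2-1B-Instruct declares max_position_embeddings = 131072 in its config.json, so a speculative_max_model_len of 131072 + 1 exceeds the target model's context and should be rejected when the engine is built. A sketch of that expectation; the exact exception type (ValueError) is an assumption:

import pytest
from vllm import LLM

with pytest.raises(ValueError):
    LLM(model="meta-llama/Llama-3.2-1B-Instruct",
        speculative_model="JackFram/llama-68m",
        num_speculative_tokens=5,
        speculative_max_model_len=131072 + 1)  # one token past the target context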

tests/test_config.py (2 changes: 1 addition & 1 deletion)
@@ -251,7 +251,7 @@ def test_rope_customization():
@pytest.mark.parametrize(("model_id", "is_encoder_decoder"), [
("facebook/opt-125m", False),
("facebook/bart-base", True),
("meta-llama/Llama-3.2-1B", False),
("meta-llama/Llama-3.2-1B-Instruct", False),
("meta-llama/Llama-3.2-11B-Vision", True),
])
def test_is_encoder_decoder(model_id, is_encoder_decoder):

tests/test_sharded_state_loader.py (8 changes: 4 additions & 4 deletions)
@@ -46,9 +46,9 @@ def test_filter_subtensors():


@pytest.fixture(scope="module")
- def llama_2_7b_files():
+ def llama_3p2_1b_files():
with TemporaryDirectory() as cache_dir:
input_dir = snapshot_download("meta-llama/Llama-3.2-1B",
input_dir = snapshot_download("meta-llama/Llama-3.2-1B-Instruct",
cache_dir=cache_dir,
ignore_patterns=["*.bin*", "original/*"])

@@ -81,13 +81,13 @@ def _run_generate(input_dir, queue: mp.Queue, **kwargs):
@pytest.mark.parametrize("enable_lora", [False, True])
@pytest.mark.parametrize("tp_size", [1, 2])
def test_sharded_state_loader(enable_lora, tp_size, num_gpus_available,
- llama_2_7b_files):
+ llama_3p2_1b_files):
if num_gpus_available < tp_size:
pytest.skip(f"Not enough GPUs for tensor parallelism {tp_size}")

weights_patterns = ("*.safetensors", )
gpu_memory_utilization = 0.8
- input_dir = llama_2_7b_files
+ input_dir = llama_3p2_1b_files
ctx = mp.get_context("spawn")

# Run in separate processes for memory & CUDA isolation

tests/tokenization/test_detokenize.py (2 changes: 1 addition & 1 deletion)
@@ -31,7 +31,7 @@
"bigscience/bloom-560m",
"mosaicml/mpt-7b",
"tiiuae/falcon-7b",
"meta-llama/Llama-2-7b-hf",
"meta-llama/Llama-3.2-1B-Instruct",
"codellama/CodeLlama-7b-hf",
"mistralai/Pixtral-12B-2409",
]

tests/tokenization/test_get_eos.py (4 changes: 2 additions & 2 deletions)
@@ -9,15 +9,15 @@


def test_get_llama3_eos_token():
model_name = "meta-llama/Meta-Llama-3-8B-Instruct"
model_name = "meta-llama/Llama-3.2-1B-Instruct"

tokenizer = get_tokenizer(model_name)
assert tokenizer.eos_token_id == 128009

generation_config = try_get_generation_config(model_name,
trust_remote_code=False)
assert generation_config is not None
- assert generation_config.eos_token_id == [128001, 128009]
+ assert generation_config.eos_token_id == [128001, 128008, 128009]


def test_get_blip2_eos_token():
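Note (editor's illustration, not part of the diff): the updated assertions can be reproduced directly with transformers (the repository is gated, so Hugging Face access is required):

from transformers import AutoTokenizer, GenerationConfig

model_name = "meta-llama/Llama-3.2-1B-Instruct"

tokenizer = AutoTokenizer.from_pretrained(model_name)
print(tokenizer.eos_token_id)  # expected: 128009 (<|eot_id|>)

generation_config = GenerationConfig.from_pretrained(model_name)
print(generation_config.eos_token_id)  # expected: [128001, 128008, 128009]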

tests/v1/engine/test_async_llm.py (2 changes: 1 addition & 1 deletion)
@@ -17,7 +17,7 @@
pytest.skip(reason="V1 currently only supported on CUDA.",
allow_module_level=True)

ENGINE_ARGS = AsyncEngineArgs(model="meta-llama/Llama-3.2-1B",
ENGINE_ARGS = AsyncEngineArgs(model="meta-llama/Llama-3.2-1B-Instruct",
enforce_eager=True,
disable_log_requests=True)


tests/v1/sample/test_logprobs.py (2 changes: 1 addition & 1 deletion)
@@ -14,7 +14,7 @@

from ...conftest import VllmRunner

MODEL = "meta-llama/Llama-3.2-1B"
MODEL = "meta-llama/Llama-3.2-1B-Instruct"
DTYPE = "half"



tests/v1/sample/test_logprobs_e2e.py (2 changes: 1 addition & 1 deletion)
@@ -11,7 +11,7 @@
EXPECTED_VALUE = 0.62

# FIXME(rob): enable prefix caching once supported.
MODEL = "meta-llama/Llama-3.2-1B"
MODEL = "meta-llama/Llama-3.2-1B-Instruct"
MODEL_ARGS = f"pretrained={MODEL},enforce_eager=True,enable_prefix_caching=False" # noqa: E501
SERVER_ARGS = [
"--enforce_eager", "--no_enable_prefix_caching", "--disable-log-requests"