Commit 5134671

njhill authored and sumitd2 committed
[BugFix] Fix test breakages from transformers 4.45 upgrade (vllm-project#8829)
Signed-off-by: Sumit Dubey <sumit.dubey2@ibm.com>
1 parent e27aca0 commit 5134671

File tree

13 files changed: +62 -49 lines changed

.buildkite/test-pipeline.yaml

+3 -6

@@ -83,7 +83,6 @@ steps:
 
 - label: Entrypoints Test # 20min
   working_dir: "/vllm-workspace/tests"
-  soft_fail: true
   fast_check: true
   mirror_hardwares: [amd]
   source_file_dependencies:
@@ -96,7 +95,8 @@ steps:
   - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
   - pytest -v -s entrypoints/llm/test_generate_multiple_loras.py # it needs a clean process
   - pytest -v -s entrypoints/llm/test_guided_generate.py # it needs a clean process
-  - pytest -v -s entrypoints/openai
+  - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_oot_registration.py
+  - pytest -v -s entrypoints/openai/test_oot_registration.py # it needs a clean process
   - pytest -v -s entrypoints/test_chat_utils.py
   - pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
 
@@ -178,7 +178,6 @@ steps:
   - pytest -v -s prefix_caching
 
 - label: Samplers Test # 18min
-  soft_fail: true
   source_file_dependencies:
   - vllm/model_executor/layers
   - vllm/sampling_metadata.py
@@ -206,7 +205,6 @@ steps:
 
 - label: LoRA Test %N # 30min each
   mirror_hardwares: [amd]
-  soft_fail: true
   source_file_dependencies:
   - vllm/lora
   - tests/lora
@@ -311,7 +309,6 @@ steps:
   - pytest -v -s models/decoder_only/language
 
 - label: Decoder-only Multi-Modal Models Test # 56min
-  soft_fail: true
   #mirror_hardwares: [amd]
   source_file_dependencies:
   - vllm/
@@ -463,7 +460,7 @@ steps:
   # NOTE: don't test llama model here, it seems hf implementation is buggy
   # see https://github.com/vllm-project/vllm/pull/5689 for details
   - pytest -v -s distributed/test_custom_all_reduce.py
-  - TARGET_TEST_SUITE=A100 pytest -v -s distributed/test_basic_distributed_correctness.py
+  - TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m distributed_2_gpus
   - pytest -v -s -x lora/test_mixtral.py
 
 - label: LM Eval Large Models # optional
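
The pipeline change above runs entrypoints/openai/test_oot_registration.py in its own pytest invocation because out-of-tree (OOT) model registration mutates process-global state. Below is a minimal sketch, not taken from this commit, of the kind of registration that test exercises; the MyOPTForCausalLM subclass is purely illustrative, while ModelRegistry.register_model is vllm's documented OOT hook.

from vllm import ModelRegistry
from vllm.model_executor.models.opt import OPTForCausalLM


class MyOPTForCausalLM(OPTForCausalLM):
    """Hypothetical out-of-tree model reusing the in-tree OPT implementation."""


# The registry is a module-level singleton: once registered, the extra
# architecture is visible to every later test in the same process, which is
# why the pipeline gives this test a clean, dedicated pytest process.
ModelRegistry.register_model("MyOPTForCausalLM", MyOPTForCausalLM)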

tests/conftest.py

-1

@@ -699,7 +699,6 @@ def generate_w_logprobs(
         if videos is not None:
             for i, video in enumerate(videos):
                 inputs[i]["multi_modal_data"] = {"video": video}
-        print(f"[INPUTS!!!!]: {inputs}, {sampling_params}")
 
         req_outputs = self.model.generate(inputs,
                                           sampling_params=sampling_params)

tests/distributed/test_pipeline_parallel.py

-7

@@ -8,8 +8,6 @@
 import os
 
 import pytest
-from packaging import version
-from transformers import __version__ as transformers_version
 
 from vllm.logger import init_logger
 
@@ -49,11 +47,6 @@ def test_compare_tp(TP_SIZE, PP_SIZE, EAGER_MODE, CHUNKED_PREFILL,
         pytest.skip("Skipping multi-node pipeline parallel test for "
                     "multiprocessing distributed backend")
 
-    # Skip tests that require transformers>=4.45.0
-    if "Qwen2-VL" in MODEL_NAME and version.parse(
-            transformers_version) < version.parse("4.45.0.dev0"):
-        pytest.skip("This test requires transformers>=4.45.0")
-
     pp_args = [
         # use half precision for speed and memory savings in CI environment
         "--dtype",

tests/engine/test_custom_executor.py

+4 -4

@@ -48,9 +48,9 @@ def test_custom_executor_type_checking(model):
 
 
 @pytest.mark.parametrize("model", ["facebook/opt-125m"])
-def test_custom_executor(model, tmpdir):
+def test_custom_executor(model, tmp_path):
     cwd = os.path.abspath(".")
-    os.chdir(tmpdir)
+    os.chdir(tmp_path)
     try:
         assert not os.path.exists(".marker")
 
@@ -68,9 +68,9 @@ def test_custom_executor(model, tmpdir):
 
 
 @pytest.mark.parametrize("model", ["facebook/opt-125m"])
-def test_custom_executor_async(model, tmpdir):
+def test_custom_executor_async(model, tmp_path):
     cwd = os.path.abspath(".")
-    os.chdir(tmpdir)
+    os.chdir(tmp_path)
     try:
         assert not os.path.exists(".marker")
 
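
For context on the tmpdir to tmp_path swap above: both pytest fixtures provide a unique per-test directory, but tmpdir yields a legacy py.path.local while tmp_path yields a standard pathlib.Path. A minimal, self-contained sketch of the pattern used in these tests; the test name and marker file are illustrative only.

import os
from pathlib import Path


def test_writes_marker(tmp_path: Path):
    cwd = os.path.abspath(".")
    os.chdir(tmp_path)  # os.chdir accepts any path-like object
    try:
        # Relative paths now resolve inside the per-test directory.
        (tmp_path / ".marker").write_text("done")
        assert os.path.exists(".marker")
    finally:
        os.chdir(cwd)  # always restore the working directory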

tests/entrypoints/openai/test_serving_chat.py

+6

@@ -15,6 +15,11 @@
 BASE_MODEL_PATHS = [BaseModelPath(name=MODEL_NAME, model_path=MODEL_NAME)]
 
 
+@dataclass
+class MockHFConfig:
+    model_type: str = "any"
+
+
 @dataclass
 class MockModelConfig:
     tokenizer = MODEL_NAME
@@ -24,6 +29,7 @@ class MockModelConfig:
     tokenizer_revision = None
     embedding_mode = False
     multimodal_config = MultiModalConfig()
+    hf_config = MockHFConfig()
 
 
 @dataclass
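
The added MockHFConfig follows a common testing pattern: a small dataclass exposing only the attributes the code under test reads (here model_type) stands in for a full transformers.PretrainedConfig. A hedged, self-contained sketch of that pattern; the describe() consumer and the field values are invented for illustration and are not part of vllm.

from dataclasses import dataclass, field


@dataclass
class MockHFConfig:
    model_type: str = "any"


@dataclass
class MockModelConfig:
    tokenizer: str = "dummy-model"
    hf_config: MockHFConfig = field(default_factory=MockHFConfig)


def describe(model_config) -> str:
    # Hypothetical consumer that only touches the attributes mocked above.
    return f"{model_config.tokenizer} ({model_config.hf_config.model_type})"


assert describe(MockModelConfig()) == "dummy-model (any)"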

tests/lora/test_tokenizer_group.py

+2 -2

@@ -41,7 +41,7 @@ async def test_tokenizer_group_lora(sql_lora_files, tokenizer_group_type):
                                 lora_request)
 
 
-def test_get_lora_tokenizer(sql_lora_files, tmpdir):
+def test_get_lora_tokenizer(sql_lora_files, tmp_path):
     lora_request = None
     tokenizer = get_lora_tokenizer(lora_request)
     assert not tokenizer
@@ -50,6 +50,6 @@ def test_get_lora_tokenizer(sql_lora_files, tmpdir):
     tokenizer = get_lora_tokenizer(lora_request)
     assert tokenizer.get_added_vocab()
 
-    lora_request = LoRARequest("1", 1, str(tmpdir))
+    lora_request = LoRARequest("1", 1, str(tmp_path))
     tokenizer = get_lora_tokenizer(lora_request)
     assert not tokenizer

tests/models/decoder_only/language/test_granite.py

-4

@@ -3,7 +3,6 @@
 Run `pytest tests/models/test_granite.py`.
 """
 import pytest
-import transformers
 
 from ...utils import check_logprobs_close
 
@@ -12,9 +11,6 @@
 ]
 
 
-# GraniteForCausalLM will be in transformers >= 4.45
-@pytest.mark.skipif(transformers.__version__ < "4.45",
-                    reason="granite model test requires transformers >= 4.45")
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("dtype", ["bfloat16"])
 @pytest.mark.parametrize("max_tokens", [64])
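
The skip removed above compared transformers.__version__ against "4.45" as plain strings, which is lexicographic rather than semantic ("4.100" sorts below "4.45" even though it is the newer release). With the transformers 4.45 upgrade in place the guard is unnecessary; if a version gate were still wanted, a hedged sketch using packaging.version would look like this (the minimum version constant is assumed, not taken from vllm).

import transformers
from packaging import version

MIN_TRANSFORMERS = version.parse("4.45.0.dev0")
assert version.parse(transformers.__version__) >= MIN_TRANSFORMERS, (
    "these tests assume the transformers 4.45 upgrade has landed")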

tests/models/decoder_only/vision_language/test_llava_next_video.py

-5

@@ -1,7 +1,6 @@
 from typing import List, Optional, Tuple, Type, overload
 
 import pytest
-import transformers
 from transformers import AutoConfig, AutoModelForVision2Seq, AutoTokenizer
 
 from vllm.multimodal.utils import (rescale_video_size, resize_video,
@@ -158,8 +157,6 @@ def run_test(
     )
 
 
-@pytest.mark.skipif(transformers.__version__ < "4.45",
-                    reason="Waiting for next transformers release")
 @pytest.mark.parametrize("model", models)
 @pytest.mark.parametrize(
     "size_factors",
@@ -203,8 +200,6 @@ def test_models(hf_runner, vllm_runner, video_assets, model, size_factors,
     )
 
 
-@pytest.mark.skipif(transformers.__version__ < "4.45",
-                    reason="Waiting for next transformers release")
 @pytest.mark.parametrize("model", models)
 @pytest.mark.parametrize(
     "sizes",

tests/models/decoder_only/vision_language/test_llava_onevision.py

+5 -8

@@ -1,7 +1,6 @@
 from typing import List, Optional, Tuple, Type, overload
 
 import pytest
-import transformers
 from transformers import (AutoConfig, AutoModelForVision2Seq, AutoTokenizer,
                           BatchEncoding)
 
@@ -166,8 +165,6 @@ def process(hf_inputs: BatchEncoding):
     )
 
 
-@pytest.mark.skipif(transformers.__version__ < "4.45",
-                    reason="Waiting for next transformers release")
 @pytest.mark.parametrize("model", models)
 @pytest.mark.parametrize(
     "size_factors",
@@ -211,8 +208,6 @@ def test_models(hf_runner, vllm_runner, video_assets, model, size_factors,
     )
 
 
-@pytest.mark.skipif(transformers.__version__ < "4.45",
-                    reason="Waiting for next transformers release")
 @pytest.mark.parametrize("model", models)
 @pytest.mark.parametrize(
     "sizes",
@@ -259,7 +254,9 @@ def run_image_test(
     # max_model_len should be greater than image_feature_size
     with vllm_runner(model,
                      dtype=dtype,
-                     max_model_len=32768,
+                     max_num_seqs=1,
+                     max_model_len=16384,
+                     gpu_memory_utilization=0.98,
                      tensor_parallel_size=tensor_parallel_size,
                      distributed_executor_backend=distributed_executor_backend,
                      enforce_eager=True,
@@ -305,8 +302,8 @@ def process(hf_inputs: BatchEncoding):
     )
 
 
-@pytest.mark.skipif(transformers.__version__ < "4.45",
-                    reason="Waiting for next transformers release")
+# FIXME: Swap to a smaller model for this architecture
+@pytest.mark.skip(reason="Model OOMing on CI")
 @pytest.mark.parametrize("model", models)
 @pytest.mark.parametrize("dtype", ["half"])
 @pytest.mark.parametrize("max_tokens", [128])
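
The run_image_test changes above trade context length for batch size and GPU headroom so the llava-onevision image test fits in CI memory. A hedged sketch of the same knobs expressed through the public vllm.LLM constructor rather than the test-only vllm_runner fixture; the model name is illustrative, and only the keyword arguments come from the diff.

from vllm import LLM

llm = LLM(
    model="llava-hf/llava-onevision-qwen2-0.5b-ov-hf",  # illustrative model
    max_num_seqs=1,               # one sequence at a time limits activation memory
    max_model_len=16384,          # halved from 32768 to shrink the KV cache
    gpu_memory_utilization=0.98,  # let vLLM claim nearly all of the GPU
    enforce_eager=True,           # skip CUDA graph capture to save memory
)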

tests/models/test_registry.py

-6

@@ -1,15 +1,9 @@
 import pytest
-import transformers
 
 from vllm.model_executor.models import _MODELS, ModelRegistry
 
 
 @pytest.mark.parametrize("model_cls", _MODELS)
 def test_registry_imports(model_cls):
-    if (model_cls in ("LlavaOnevisionForConditionalGeneration",
-                      "Qwen2VLForConditionalGeneration")
-            and transformers.__version__ < "4.45"):
-        pytest.skip("Waiting for next transformers release")
-
     # Ensure all model classes can be imported successfully
     ModelRegistry.resolve_model_cls([model_cls])

tests/samplers/test_sampler.py

+15 -3

@@ -1,5 +1,6 @@
 import itertools
 import random
+from dataclasses import dataclass
 from typing import Dict, List, Optional, Tuple
 from unittest.mock import Mock, patch
 
@@ -596,8 +597,19 @@ def test_sampler_top_k_top_p(seed: int, device: str):
     generation_config = GenerationConfig(top_k=top_k,
                                          top_p=top_p,
                                          do_sample=True)
-    warpers = generation_model._get_logits_warper(generation_config, device)
-    assert len(warpers) == 2  # top_p and top_k
+
+    @dataclass
+    class MockConfig:
+        is_encoder_decoder: bool = False
+
+    generation_model.config = MockConfig()  # needed by the following method
+    generation_model._prepare_special_tokens(generation_config, device=device)
+    processors = generation_model._get_logits_processor(generation_config,
+                                                        None,
+                                                        None,
+                                                        None, [],
+                                                        device=device)
+    assert len(processors) == 2  # top_p and top_k
 
     seq_group_metadata_list: List[SequenceGroupMetadata] = []
     seq_lens: List[int] = []
@@ -639,7 +651,7 @@ def mock_sample(probs, *args, **kwargs):
 
     assert sample_probs is not None
 
-    hf_probs = warpers(torch.zeros_like(fake_logits), fake_logits.clone())
+    hf_probs = processors(torch.zeros_like(fake_logits), fake_logits.clone())
     hf_probs = torch.softmax(hf_probs, dim=-1, dtype=torch.float)
     torch.testing.assert_close(hf_probs, sample_probs, rtol=0.0, atol=1e-5)
     assert torch.equal(hf_probs.eq(0), sample_probs.eq(0))
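
This test change tracks a transformers 4.45 refactor: the private _get_logits_warper helper was folded into _get_logits_processor, which now also requires _prepare_special_tokens and a config exposing is_encoder_decoder. For reference, a public-API sketch of the same top-k/top-p filtering the test verifies; the vocabulary size and dummy tensors are illustrative.

import torch
from transformers import (LogitsProcessorList, TopKLogitsWarper,
                          TopPLogitsWarper)

processors = LogitsProcessorList([
    TopKLogitsWarper(top_k=10),
    TopPLogitsWarper(top_p=0.9),
])

input_ids = torch.zeros((1, 1), dtype=torch.long)  # dummy prompt ids
fake_logits = torch.randn(1, 32000)                # (batch, vocab) dummy logits
filtered = processors(input_ids, fake_logits)      # -inf outside top-k / top-p
assert len(processors) == 2                        # top_k and top_p, as in the test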

vllm/entrypoints/openai/serving_chat.py

+2 -2

@@ -152,13 +152,13 @@ async def create_chat_completion(
                 **(request.chat_template_kwargs or {}),
             )
         except Exception as e:
-            logger.error("Error in applying chat template from request: %s", e)
+            logger.exception("Error in applying chat template from request")
             return self.create_error_response(str(e))
 
         try:
             mm_data = await mm_data_future
         except Exception as e:
-            logger.error("Error in loading multi-modal data: %s", e)
+            logger.exception("Error in loading multi-modal data")
             return self.create_error_response(str(e))
 
         # validation for OpenAI tools
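
A minimal sketch of the logging change above: inside an except block, logger.exception() records the message at ERROR level and appends the active traceback automatically, which logger.error("...: %s", e) does not.

import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("example")

try:
    raise ValueError("bad chat template")
except Exception:
    # Same severity as logger.error, but the stack trace is included.
    logger.exception("Error in applying chat template from request")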

vllm/transformers_utils/tokenizer.py

+25 -1

@@ -1,6 +1,7 @@
 import os
 import warnings
 from pathlib import Path
+from types import MethodType
 from typing import Optional, Union
 
 import huggingface_hub
@@ -152,6 +153,29 @@ def get_tokenizer(
         else:
             raise e
 
+    # NOTE: We can remove this after https://github.com/THUDM/ChatGLM3/issues/1324
+    if type(tokenizer).__name__ in ("ChatGLMTokenizer",
+                                    "ChatGLM4Tokenizer"):
+        assert isinstance(tokenizer, PreTrainedTokenizer)
+        orig_pad = tokenizer._pad
+
+        # Patch _pad method to accept `padding_side`
+        def _pad(
+            self: PreTrainedTokenizer,
+            *args,
+            padding_side: Optional[str] = None,
+            **kwargs,
+        ):
+            if (padding_side is not None
+                    and padding_side != self.padding_side):
+                msg = ("`padding_side` argument is not supported by "
+                       "ChatGLMTokenizer and will be ignored.")
+                warnings.warn(msg, stacklevel=2)
+
+            return orig_pad(*args, **kwargs)
+
+        tokenizer._pad = MethodType(_pad, tokenizer)
+
     if not isinstance(tokenizer, PreTrainedTokenizerFast):
         logger.warning(
             "Using a slow tokenizer. This might cause a significant "
@@ -167,7 +191,7 @@ def get_lora_tokenizer(lora_request: LoRARequest, *args,
         return None
     try:
         tokenizer = get_tokenizer(lora_request.lora_path, *args, **kwargs)
-    except OSError as e:
+    except Exception as e:
         # No tokenizer was found in the LoRA folder,
         # use base model tokenizer
         logger.warning(
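
The get_tokenizer patch above uses types.MethodType to rebind a wrapper over the tokenizer's _pad so that the padding_side keyword passed by newer transformers is accepted (and ignored with a warning) on ChatGLM tokenizers. A self-contained sketch of that monkey-patching pattern on a toy class; Greeter and its shout argument are invented for illustration.

import warnings
from types import MethodType


class Greeter:
    def greet(self, name):
        return f"hello {name}"


g = Greeter()
orig_greet = g.greet  # keep the original bound method


def greet(self, name, shout=None):
    if shout is not None:
        warnings.warn("`shout` is not supported and will be ignored.",
                      stacklevel=2)
    return orig_greet(name)


# Bind the wrapper to this one instance only, shadowing the class method.
g.greet = MethodType(greet, g)
assert g.greet("world", shout=True) == "hello world"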
