Commit b89fb2a

[CI/Build] Use AutoModelForImageTextToText to load VLMs in tests (#14945)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
1 parent 5340b0e · commit b89fb2a

File tree (3 files changed: 19 additions, 19 deletions)

  tests/models/decoder_only/vision_language/test_models.py
  tests/models/embedding/vision_language/test_llava_next.py
  tests/models/encoder_decoder/vision_language/test_mllama.py
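The change standardizes the Hugging Face auto class the test suite uses to load vision-language reference models: every test now passes AutoModelForImageTextToText as auto_cls, replacing the earlier mix of AutoModelForVision2Seq and AutoModelForPreTraining. As a rough illustration (not part of this commit), loading one of the checkpoints exercised by these tests through the new auto class looks like the sketch below; the checkpoint name and prompt format are taken from the diff, while the placeholder image and generation settings are assumptions.

```python
# Illustrative sketch only (not from this commit): load a VLM with the
# AutoModelForImageTextToText auto class that the tests now use.
# Assumes a transformers version that ships this class.
from PIL import Image
from transformers import AutoModelForImageTextToText, AutoProcessor

model_id = "llava-hf/llava-1.5-7b-hf"  # one of the checkpoints exercised by the tests

processor = AutoProcessor.from_pretrained(model_id)
model = AutoModelForImageTextToText.from_pretrained(model_id, torch_dtype="auto")

# Placeholder image; the real tests feed curated image assets instead.
image = Image.new("RGB", (336, 336), color="red")
prompt = "USER: <image>\nWhat is in the picture? ASSISTANT:"

inputs = processor(text=prompt, images=image, return_tensors="pt")
output_ids = model.generate(**inputs, max_new_tokens=32, do_sample=False)
print(processor.batch_decode(output_ids, skip_special_tokens=True)[0])
```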

tests/models/decoder_only/vision_language/test_models.py

Lines changed: 14 additions & 14 deletions
@@ -9,7 +9,7 @@
 
 import pytest
 from packaging.version import Version
-from transformers import AutoModelForPreTraining, AutoModelForVision2Seq
+from transformers import AutoModelForImageTextToText, AutoModelForVision2Seq
 from transformers import __version__ as TRANSFORMERS_VERSION
 
 from vllm.platforms import current_platform
@@ -101,7 +101,7 @@
         prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:",
         convert_assets_to_embeddings=model_utils.get_llava_embeddings,
         max_model_len=4096,
-        auto_cls=AutoModelForVision2Seq,
+        auto_cls=AutoModelForImageTextToText,
         vllm_output_post_proc=model_utils.llava_image_vllm_to_hf_output,
         custom_test_opts=[CustomTestOptions(
             inputs=custom_inputs.multi_image_multi_aspect_ratio_inputs(
@@ -121,7 +121,7 @@
             "stop_sign": "caption es",
             "cherry_blossom": "What is in the picture?",
         }),
-        auto_cls=AutoModelForVision2Seq,
+        auto_cls=AutoModelForImageTextToText,
         postprocess_inputs=model_utils.cast_dtype_post_processor(
             "pixel_values"
         ),
@@ -190,7 +190,7 @@
         test_type=VLMTestType.IMAGE,
         prompt_formatter=lambda img_prompt: f"Question: {img_prompt} Answer:",
         img_idx_to_prompt=lambda idx: "",
-        auto_cls=AutoModelForVision2Seq,
+        auto_cls=AutoModelForImageTextToText,
         vllm_output_post_proc=model_utils.blip2_vllm_to_hf_output,
     ),
     "chameleon": VLMTestInfo(
@@ -199,7 +199,7 @@
         prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:",
         max_model_len=4096,
         max_num_seqs=2,
-        auto_cls=AutoModelForVision2Seq,
+        auto_cls=AutoModelForImageTextToText,
         postprocess_inputs=model_utils.cast_dtype_post_processor(
             "pixel_values"
         ),
@@ -240,6 +240,7 @@
         img_idx_to_prompt=lambda idx: "",
         max_model_len=2048,
         max_num_seqs=2,
+        auto_cls=AutoModelForImageTextToText,
         use_tokenizer_eos=True,
         vllm_output_post_proc=model_utils.fuyu_vllm_to_hf_output,
         num_logprobs=10,
@@ -256,8 +257,7 @@
         multi_image_prompt="<start_of_image><start_of_image>Describe the two images in detail.",  # noqa: E501
         max_model_len=4096,
         max_num_seqs=2,
-        # TODO: Use AutoModelForVision2Seq once transformers supports this
-        auto_cls=AutoModelForPreTraining,
+        auto_cls=AutoModelForImageTextToText,
         dtype="bfloat16",
         vllm_runner_kwargs={"mm_processor_kwargs": {"do_pan_and_scan": True}},
         patch_hf_runner=model_utils.gemma3_patch_hf_runner,
@@ -307,7 +307,7 @@
         img_idx_to_prompt=lambda idx: "<image>",
         max_model_len=8192,
         max_num_seqs=2,
-        auto_cls=AutoModelForVision2Seq,
+        auto_cls=AutoModelForImageTextToText,
         hf_output_post_proc=model_utils.idefics3_trunc_hf_output,
     ),
     "intern_vl": VLMTestInfo(
@@ -336,7 +336,7 @@
         test_type=(VLMTestType.IMAGE, VLMTestType.CUSTOM_INPUTS),
         prompt_formatter=lambda img_prompt: f"[INST] {img_prompt} [/INST]",
         max_model_len=10240,
-        auto_cls=AutoModelForVision2Seq,
+        auto_cls=AutoModelForImageTextToText,
         vllm_output_post_proc=model_utils.llava_image_vllm_to_hf_output,
         custom_test_opts=[CustomTestOptions(
             inputs=custom_inputs.multi_image_multi_aspect_ratio_inputs(
@@ -382,7 +382,7 @@
             "pixel_values"
         ),
         get_stop_token_ids=lambda tok: [128009],
-        auto_cls=AutoModelForVision2Seq,
+        auto_cls=AutoModelForImageTextToText,
         vllm_output_post_proc=model_utils.mantis_vllm_to_hf_output,
         patch_hf_runner=model_utils.mantis_patch_hf_runner,
         marks=[
@@ -463,7 +463,7 @@
         img_idx_to_prompt=lambda idx: "[IMG]",
         max_model_len=8192,
         max_num_seqs=2,
-        auto_cls=AutoModelForVision2Seq,
+        auto_cls=AutoModelForImageTextToText,
         marks=[large_gpu_mark(min_gb=48)],
     ),
     "qwen_vl": VLMTestInfo(
@@ -481,7 +481,7 @@
         models=["facebook/chameleon-7b"],
         prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:",
         max_model_len=4096,
-        auto_cls=AutoModelForVision2Seq,
+        auto_cls=AutoModelForImageTextToText,
         postprocess_inputs=model_utils.cast_dtype_post_processor(
             "pixel_values"
         ),
@@ -495,7 +495,7 @@
         models=["llava-hf/llava-1.5-7b-hf"],
         prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:",
         max_model_len=4096,
-        auto_cls=AutoModelForVision2Seq,
+        auto_cls=AutoModelForImageTextToText,
         vllm_output_post_proc=model_utils.llava_image_vllm_to_hf_output,
         marks=multi_gpu_marks(num_gpus=2),
         **COMMON_BROADCAST_SETTINGS  # type: ignore
@@ -504,7 +504,7 @@
         models=["llava-hf/llava-v1.6-mistral-7b-hf"],
         prompt_formatter=lambda img_prompt: f"[INST] {img_prompt} [/INST]",
         max_model_len=10240,
-        auto_cls=AutoModelForVision2Seq,
+        auto_cls=AutoModelForImageTextToText,
         vllm_output_post_proc=model_utils.llava_image_vllm_to_hf_output,
         marks=multi_gpu_marks(num_gpus=2),
         **COMMON_BROADCAST_SETTINGS  # type: ignore

tests/models/embedding/vision_language/test_llava_next.py

Lines changed: 2 additions & 2 deletions
@@ -2,7 +2,7 @@
 
 import pytest
 import torch.nn.functional as F
-from transformers import AutoModelForVision2Seq
+from transformers import AutoModelForImageTextToText
 
 from vllm.platforms import current_platform
 
@@ -70,7 +70,7 @@ def _run_test(
         vllm_outputs = vllm_model.encode(input_texts, images=input_images)
 
     with hf_runner(model, dtype=dtype,
-                   auto_cls=AutoModelForVision2Seq) as hf_model:
+                   auto_cls=AutoModelForImageTextToText) as hf_model:
         # Patch the issue where generation_config.json is missing
         hf_model.processor.patch_size = \
             hf_model.model.config.vision_config.patch_size

tests/models/encoder_decoder/vision_language/test_mllama.py

Lines changed: 3 additions & 3 deletions
@@ -4,8 +4,8 @@
 
 import pytest
 import torch
-from transformers import (AutoConfig, AutoModelForVision2Seq, AutoTokenizer,
-                          BatchEncoding)
+from transformers import (AutoConfig, AutoModelForImageTextToText,
+                          AutoTokenizer, BatchEncoding)
 
 from vllm import LLM, SamplingParams
 from vllm.attention.backends.flash_attn import FlashAttentionMetadata
@@ -234,7 +234,7 @@ def process(hf_inputs: BatchEncoding, **kwargs):
                    dtype=dtype,
                    model_kwargs={"device_map": "auto"},
                    postprocess_inputs=process,
-                   auto_cls=AutoModelForVision2Seq) as hf_model:
+                   auto_cls=AutoModelForImageTextToText) as hf_model:
        hf_outputs_per_image = [
            hf_model.generate_greedy_logprobs_limit(prompts,
                                                    max_tokens,
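For orientation only: the auto_cls argument these diffs change is consumed by the HF-side test runner, which presumably forwards it to from_pretrained to build the reference model that vLLM's outputs are compared against. The sketch below is a simplified, hypothetical stand-in for that pattern; SimpleHfRunner, its methods, and its defaults are assumptions for illustration and are not vLLM's actual HfRunner.

```python
# Hypothetical, simplified stand-in for a test runner that accepts an auto_cls
# argument, mirroring how these tests pass AutoModelForImageTextToText.
# NOT vLLM's actual HfRunner; names, signatures, and defaults are assumptions.
from transformers import AutoModelForImageTextToText, AutoProcessor


class SimpleHfRunner:
    def __init__(self, model_name, dtype="auto",
                 auto_cls=AutoModelForImageTextToText):
        # The auto class decides which architecture/head is instantiated for
        # the checkpoint; the tests now pin this to the image-text-to-text class.
        self.model = auto_cls.from_pretrained(model_name, torch_dtype=dtype)
        self.processor = AutoProcessor.from_pretrained(model_name)

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc, tb):
        del self.model  # drop the reference model once the test block exits

    def generate_greedy(self, prompt, image, max_tokens):
        inputs = self.processor(text=prompt, images=image, return_tensors="pt")
        output_ids = self.model.generate(**inputs, max_new_tokens=max_tokens,
                                         do_sample=False)
        return self.processor.batch_decode(output_ids,
                                           skip_special_tokens=True)[0]


# Usage mirroring the context-manager style seen in the tests:
# with SimpleHfRunner("llava-hf/llava-1.5-7b-hf",
#                     auto_cls=AutoModelForImageTextToText) as hf_model:
#     text = hf_model.generate_greedy(prompt, image, max_tokens=32)
```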
