@@ -9,7 +9,7 @@
 
 import pytest
 from packaging.version import Version
-from transformers import AutoModelForPreTraining, AutoModelForVision2Seq
+from transformers import AutoModelForImageTextToText, AutoModelForVision2Seq
 from transformers import __version__ as TRANSFORMERS_VERSION
 
 from vllm.platforms import current_platform
@@ -101,7 +101,7 @@
         prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:",
         convert_assets_to_embeddings=model_utils.get_llava_embeddings,
         max_model_len=4096,
-        auto_cls=AutoModelForVision2Seq,
+        auto_cls=AutoModelForImageTextToText,
         vllm_output_post_proc=model_utils.llava_image_vllm_to_hf_output,
         custom_test_opts=[CustomTestOptions(
             inputs=custom_inputs.multi_image_multi_aspect_ratio_inputs(
@@ -121,7 +121,7 @@
             "stop_sign": "caption es",
             "cherry_blossom": "What is in the picture?",
         }),
-        auto_cls=AutoModelForVision2Seq,
+        auto_cls=AutoModelForImageTextToText,
         postprocess_inputs=model_utils.cast_dtype_post_processor(
             "pixel_values"
         ),
@@ -190,7 +190,7 @@
         test_type=VLMTestType.IMAGE,
         prompt_formatter=lambda img_prompt: f"Question: {img_prompt} Answer:",
         img_idx_to_prompt=lambda idx: "",
-        auto_cls=AutoModelForVision2Seq,
+        auto_cls=AutoModelForImageTextToText,
         vllm_output_post_proc=model_utils.blip2_vllm_to_hf_output,
     ),
     "chameleon": VLMTestInfo(
@@ -199,7 +199,7 @@
         prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:",
         max_model_len=4096,
         max_num_seqs=2,
-        auto_cls=AutoModelForVision2Seq,
+        auto_cls=AutoModelForImageTextToText,
         postprocess_inputs=model_utils.cast_dtype_post_processor(
             "pixel_values"
         ),
@@ -240,6 +240,7 @@
         img_idx_to_prompt=lambda idx: "",
         max_model_len=2048,
         max_num_seqs=2,
+        auto_cls=AutoModelForImageTextToText,
         use_tokenizer_eos=True,
         vllm_output_post_proc=model_utils.fuyu_vllm_to_hf_output,
         num_logprobs=10,
@@ -256,8 +257,7 @@
         multi_image_prompt="<start_of_image><start_of_image>Describe the two images in detail.", # noqa: E501
         max_model_len=4096,
         max_num_seqs=2,
-        # TODO: Use AutoModelForVision2Seq once transformers supports this
-        auto_cls=AutoModelForPreTraining,
+        auto_cls=AutoModelForImageTextToText,
         dtype="bfloat16",
         vllm_runner_kwargs={"mm_processor_kwargs": {"do_pan_and_scan": True}},
         patch_hf_runner=model_utils.gemma3_patch_hf_runner,
@@ -307,7 +307,7 @@
         img_idx_to_prompt=lambda idx: "<image>",
         max_model_len=8192,
         max_num_seqs=2,
-        auto_cls=AutoModelForVision2Seq,
+        auto_cls=AutoModelForImageTextToText,
         hf_output_post_proc=model_utils.idefics3_trunc_hf_output,
     ),
     "intern_vl": VLMTestInfo(
@@ -336,7 +336,7 @@
         test_type=(VLMTestType.IMAGE, VLMTestType.CUSTOM_INPUTS),
         prompt_formatter=lambda img_prompt: f"[INST] {img_prompt} [/INST]",
         max_model_len=10240,
-        auto_cls=AutoModelForVision2Seq,
+        auto_cls=AutoModelForImageTextToText,
         vllm_output_post_proc=model_utils.llava_image_vllm_to_hf_output,
         custom_test_opts=[CustomTestOptions(
             inputs=custom_inputs.multi_image_multi_aspect_ratio_inputs(
@@ -382,7 +382,7 @@
             "pixel_values"
         ),
         get_stop_token_ids=lambda tok: [128009],
-        auto_cls=AutoModelForVision2Seq,
+        auto_cls=AutoModelForImageTextToText,
         vllm_output_post_proc=model_utils.mantis_vllm_to_hf_output,
         patch_hf_runner=model_utils.mantis_patch_hf_runner,
         marks=[
@@ -463,7 +463,7 @@
         img_idx_to_prompt=lambda idx: "[IMG]",
         max_model_len=8192,
         max_num_seqs=2,
-        auto_cls=AutoModelForVision2Seq,
+        auto_cls=AutoModelForImageTextToText,
         marks=[large_gpu_mark(min_gb=48)],
     ),
     "qwen_vl": VLMTestInfo(
@@ -481,7 +481,7 @@
         models=["facebook/chameleon-7b"],
         prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:",
         max_model_len=4096,
-        auto_cls=AutoModelForVision2Seq,
+        auto_cls=AutoModelForImageTextToText,
         postprocess_inputs=model_utils.cast_dtype_post_processor(
             "pixel_values"
         ),
@@ -495,7 +495,7 @@
         models=["llava-hf/llava-1.5-7b-hf"],
         prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:",
         max_model_len=4096,
-        auto_cls=AutoModelForVision2Seq,
+        auto_cls=AutoModelForImageTextToText,
         vllm_output_post_proc=model_utils.llava_image_vllm_to_hf_output,
         marks=multi_gpu_marks(num_gpus=2),
         **COMMON_BROADCAST_SETTINGS # type: ignore
@@ -504,7 +504,7 @@
         models=["llava-hf/llava-v1.6-mistral-7b-hf"],
         prompt_formatter=lambda img_prompt: f"[INST] {img_prompt} [/INST]",
         max_model_len=10240,
-        auto_cls=AutoModelForVision2Seq,
+        auto_cls=AutoModelForImageTextToText,
         vllm_output_post_proc=model_utils.llava_image_vllm_to_hf_output,
         marks=multi_gpu_marks(num_gpus=2),
         **COMMON_BROADCAST_SETTINGS # type: ignore
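
For reference, a minimal sketch of what this swap amounts to outside the test harness, assuming the installed transformers is recent enough to ship `AutoModelForImageTextToText` (the 4.46.0 threshold below is an assumption, not taken from this commit); older versions can keep falling back to `AutoModelForVision2Seq`:

```python
# Hedged sketch, not part of this commit: pick the auto class by version.
from packaging.version import Version

from transformers import AutoModelForVision2Seq
from transformers import __version__ as TRANSFORMERS_VERSION

# Assumption: AutoModelForImageTextToText ships in transformers >= ~4.46.0.
if Version(TRANSFORMERS_VERSION) >= Version("4.46.0"):
    from transformers import AutoModelForImageTextToText as auto_cls
else:
    auto_cls = AutoModelForVision2Seq

# One of the checkpoints exercised above; this downloads the full weights.
# Both auto classes expose the same from_pretrained entry point, which is why
# the test configs only need to change the `auto_cls` field.
model = auto_cls.from_pretrained("llava-hf/llava-1.5-7b-hf")
```

For a checkpoint like LLaVA, either auto class should resolve to the same concrete model class; the difference is simply which auto-mapping newer transformers releases maintain for image-text-to-text models.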