@@ -9,7 +9,7 @@
 
 import pytest
 from packaging.version import Version
-from transformers import AutoModelForPreTraining, AutoModelForVision2Seq
+from transformers import AutoModelForImageTextToText, AutoModelForVision2Seq
 from transformers import __version__ as TRANSFORMERS_VERSION
 
 from vllm.platforms import current_platform
@@ -101,7 +101,7 @@
         prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:",
         convert_assets_to_embeddings=model_utils.get_llava_embeddings,
         max_model_len=4096,
-        auto_cls=AutoModelForVision2Seq,
+        auto_cls=AutoModelForImageTextToText,
         vllm_output_post_proc=model_utils.llava_image_vllm_to_hf_output,
         custom_test_opts=[CustomTestOptions(
             inputs=custom_inputs.multi_image_multi_aspect_ratio_inputs(
@@ -121,7 +121,7 @@
             "stop_sign": "caption es",
             "cherry_blossom": "What is in the picture?",
         }),
-        auto_cls=AutoModelForVision2Seq,
+        auto_cls=AutoModelForImageTextToText,
         postprocess_inputs=model_utils.cast_dtype_post_processor(
             "pixel_values"
         ),
@@ -190,7 +190,7 @@
         test_type=VLMTestType.IMAGE,
         prompt_formatter=lambda img_prompt: f"Question: {img_prompt} Answer:",
         img_idx_to_prompt=lambda idx: "",
-        auto_cls=AutoModelForVision2Seq,
+        auto_cls=AutoModelForImageTextToText,
         vllm_output_post_proc=model_utils.blip2_vllm_to_hf_output,
     ),
     "chameleon": VLMTestInfo(
@@ -199,7 +199,7 @@
         prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:",
         max_model_len=4096,
         max_num_seqs=2,
-        auto_cls=AutoModelForVision2Seq,
+        auto_cls=AutoModelForImageTextToText,
         postprocess_inputs=model_utils.cast_dtype_post_processor(
             "pixel_values"
         ),
@@ -240,6 +240,7 @@
         img_idx_to_prompt=lambda idx: "",
         max_model_len=2048,
         max_num_seqs=2,
+        auto_cls=AutoModelForImageTextToText,
         use_tokenizer_eos=True,
         vllm_output_post_proc=model_utils.fuyu_vllm_to_hf_output,
         num_logprobs=10,
@@ -256,8 +257,7 @@
         multi_image_prompt="<start_of_image><start_of_image>Describe the two images in detail.", # noqa: E501
         max_model_len=4096,
         max_num_seqs=2,
-        # TODO: Use AutoModelForVision2Seq once transformers supports this
-        auto_cls=AutoModelForPreTraining,
+        auto_cls=AutoModelForImageTextToText,
         dtype="bfloat16",
         vllm_runner_kwargs={"mm_processor_kwargs": {"do_pan_and_scan": True}},
         patch_hf_runner=model_utils.gemma3_patch_hf_runner,
@@ -307,7 +307,7 @@
         img_idx_to_prompt=lambda idx: "<image>",
         max_model_len=8192,
         max_num_seqs=2,
-        auto_cls=AutoModelForVision2Seq,
+        auto_cls=AutoModelForImageTextToText,
         hf_output_post_proc=model_utils.idefics3_trunc_hf_output,
     ),
     "intern_vl": VLMTestInfo(
@@ -336,7 +336,7 @@
         test_type=(VLMTestType.IMAGE, VLMTestType.CUSTOM_INPUTS),
         prompt_formatter=lambda img_prompt: f"[INST] {img_prompt} [/INST]",
         max_model_len=10240,
-        auto_cls=AutoModelForVision2Seq,
+        auto_cls=AutoModelForImageTextToText,
         vllm_output_post_proc=model_utils.llava_image_vllm_to_hf_output,
         custom_test_opts=[CustomTestOptions(
             inputs=custom_inputs.multi_image_multi_aspect_ratio_inputs(
@@ -382,7 +382,7 @@
             "pixel_values"
         ),
         get_stop_token_ids=lambda tok: [128009],
-        auto_cls=AutoModelForVision2Seq,
+        auto_cls=AutoModelForImageTextToText,
         vllm_output_post_proc=model_utils.mantis_vllm_to_hf_output,
         patch_hf_runner=model_utils.mantis_patch_hf_runner,
         marks=[
@@ -463,7 +463,7 @@
         img_idx_to_prompt=lambda idx: "[IMG]",
         max_model_len=8192,
         max_num_seqs=2,
-        auto_cls=AutoModelForVision2Seq,
+        auto_cls=AutoModelForImageTextToText,
         marks=[large_gpu_mark(min_gb=48)],
     ),
     "qwen_vl": VLMTestInfo(
@@ -481,7 +481,7 @@
         models=["facebook/chameleon-7b"],
         prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:",
         max_model_len=4096,
-        auto_cls=AutoModelForVision2Seq,
+        auto_cls=AutoModelForImageTextToText,
         postprocess_inputs=model_utils.cast_dtype_post_processor(
             "pixel_values"
         ),
@@ -495,7 +495,7 @@
         models=["llava-hf/llava-1.5-7b-hf"],
         prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:",
         max_model_len=4096,
-        auto_cls=AutoModelForVision2Seq,
+        auto_cls=AutoModelForImageTextToText,
         vllm_output_post_proc=model_utils.llava_image_vllm_to_hf_output,
         marks=multi_gpu_marks(num_gpus=2),
         **COMMON_BROADCAST_SETTINGS # type: ignore
@@ -504,7 +504,7 @@
         models=["llava-hf/llava-v1.6-mistral-7b-hf"],
         prompt_formatter=lambda img_prompt: f"[INST] {img_prompt} [/INST]",
         max_model_len=10240,
-        auto_cls=AutoModelForVision2Seq,
+        auto_cls=AutoModelForImageTextToText,
         vllm_output_post_proc=model_utils.llava_image_vllm_to_hf_output,
         marks=multi_gpu_marks(num_gpus=2),
         **COMMON_BROADCAST_SETTINGS # type: ignore
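
For reference, a minimal sketch of what this swap amounts to outside the test harness, assuming the installed transformers is recent enough to ship `AutoModelForImageTextToText` (the 4.46.0 threshold below is an assumption, not taken from this commit); older versions can keep falling back to `AutoModelForVision2Seq`:

```python
# Hedged sketch, not part of this commit: pick the auto class by version.
from packaging.version import Version

from transformers import AutoModelForVision2Seq
from transformers import __version__ as TRANSFORMERS_VERSION

# Assumption: AutoModelForImageTextToText ships in transformers >= ~4.46.0.
if Version(TRANSFORMERS_VERSION) >= Version("4.46.0"):
    from transformers import AutoModelForImageTextToText as auto_cls
else:
    auto_cls = AutoModelForVision2Seq

# One of the checkpoints exercised above; this downloads the full weights.
# Both auto classes expose the same from_pretrained entry point, which is why
# the test configs only need to change the `auto_cls` field.
model = auto_cls.from_pretrained("llava-hf/llava-1.5-7b-hf")
```

For a checkpoint like LLaVA, either auto class should resolve to the same concrete model class; the difference is simply which auto-mapping newer transformers releases maintain for image-text-to-text models.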