|
112 | 112 | vllm_runner_kwargs={"enable_mm_embeds": True}, |
113 | 113 | marks=[pytest.mark.core_model, pytest.mark.cpu_model], |
114 | 114 | ), |
| 115 | + "paligemma": VLMTestInfo( |
| 116 | + models=["google/paligemma-3b-mix-224"], |
| 117 | + test_type=VLMTestType.IMAGE, |
| 118 | + prompt_formatter=identity, |
| 119 | + img_idx_to_prompt=lambda idx: "", |
| 120 | + # PaliGemma uses its own sample prompts because the default one fails
| 121 | + single_image_prompts=IMAGE_ASSETS.prompts( |
| 122 | + { |
| 123 | + "stop_sign": "caption es", |
| 124 | + "cherry_blossom": "What is in the picture?", |
| 125 | + } |
| 126 | + ), |
| 127 | + auto_cls=AutoModelForImageTextToText, |
| 128 | + vllm_output_post_proc=model_utils.paligemma_vllm_to_hf_output, |
| 129 | + dtype="bfloat16", |
| 130 | + marks=[ |
| 131 | + pytest.mark.skip(reason="vLLM does not support PrefixLM attention mask") |
| 132 | + ], |
| 133 | + ), |
115 | 134 | "qwen2_5_vl": VLMTestInfo( |
116 | 135 | models=["Qwen/Qwen2.5-VL-3B-Instruct"], |
117 | 136 | test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE, VLMTestType.VIDEO), |
|
176 | 195 | # Gemma3 has bidirectional mask on images |
177 | 196 | "gemma3-transformers": VLMTestInfo( |
178 | 197 | models=["google/gemma-3-4b-it"], |
179 | | - test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE), |
180 | | - prompt_formatter=lambda img_prompt: f"<bos><start_of_turn>user\n{img_prompt}<end_of_turn>\n<start_of_turn>model\n", # noqa: E501 |
181 | | - single_image_prompts=IMAGE_ASSETS.prompts( |
182 | | - { |
183 | | - "stop_sign": "<start_of_image>What's the content in the center of the image?", # noqa: E501 |
184 | | - "cherry_blossom": "<start_of_image>What is the season?", |
185 | | - } |
186 | | - ), |
187 | | - multi_image_prompt="<start_of_image><start_of_image>Describe the two images in detail.", # noqa: E501 |
188 | | - max_model_len=8192, |
| 198 | + test_type=VLMTestType.IMAGE, |
| 199 | + prompt_formatter=lambda img_prompt: f"<bos><start_of_turn>user\n{img_prompt}<start_of_image><end_of_turn>\n<start_of_turn>model\n", # noqa: E501
| 200 | + max_model_len=4096, |
189 | 201 | auto_cls=AutoModelForImageTextToText, |
190 | | - # TODO: Support `do_pan_and_scan` in transformers backend |
191 | | - # patch_hf_runner=model_utils.gemma3_patch_hf_runner, |
192 | 202 | vllm_output_post_proc=model_utils.gemma3_vllm_to_hf_output, |
193 | 203 | image_size_factors=[(0.25, 0.5, 1.0)], |
194 | 204 | vllm_runner_kwargs={ |
195 | 205 | "model_impl": "transformers", |
196 | | - # "mm_processor_kwargs": {"do_pan_and_scan": True}, |
197 | 206 | }, |
198 | 207 | marks=[pytest.mark.core_model], |
199 | 208 | ), |
|
212 | 221 | }, |
213 | 222 | marks=[pytest.mark.core_model], |
214 | 223 | ), |
215 | | - # PaliGemma has PrefixLM attention |
216 | | - "paligemma-transformers": VLMTestInfo( |
217 | | - models=["google/paligemma-3b-mix-224"], |
218 | | - test_type=VLMTestType.IMAGE, |
219 | | - prompt_formatter=identity, |
220 | | - img_idx_to_prompt=lambda idx: "", |
221 | | - # PaliGemma uses its own sample prompts because the default one fails |
222 | | - single_image_prompts=IMAGE_ASSETS.prompts( |
223 | | - { |
224 | | - "stop_sign": "caption es", |
225 | | - "cherry_blossom": "What is in the picture?", |
226 | | - } |
227 | | - ), |
228 | | - auto_cls=AutoModelForImageTextToText, |
229 | | - vllm_output_post_proc=model_utils.paligemma_vllm_to_hf_output, |
230 | | - image_size_factors=[(0.25, 0.5, 1.0)], |
231 | | - vllm_runner_kwargs={ |
232 | | - "model_impl": "transformers", |
233 | | - }, |
234 | | - marks=[pytest.mark.core_model], |
235 | | - ), |
236 | 224 | # Pixel values from processor are not 4D or 5D arrays |
237 | 225 | "qwen2_5_vl-transformers": VLMTestInfo( |
238 | 226 | models=["Qwen/Qwen2.5-VL-3B-Instruct"], |
|
359 | 347 | image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)], |
360 | 348 | marks=[large_gpu_mark(min_gb=32)], |
361 | 349 | ), |
| 350 | + "gemma3": VLMTestInfo( |
| 351 | + models=["google/gemma-3-4b-it"], |
| 352 | + test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE), |
| 353 | + prompt_formatter=lambda img_prompt: f"<bos><start_of_turn>user\n{img_prompt}<end_of_turn>\n<start_of_turn>model\n", # noqa: E501 |
| 354 | + single_image_prompts=IMAGE_ASSETS.prompts( |
| 355 | + { |
| 356 | + "stop_sign": "<start_of_image>What's the content in the center of the image?", # noqa: E501 |
| 357 | + "cherry_blossom": "<start_of_image>What is the season?", |
| 358 | + } |
| 359 | + ), |
| 360 | + multi_image_prompt="<start_of_image><start_of_image>Describe the two images in detail.", # noqa: E501 |
| 361 | + max_model_len=4096, |
| 362 | + max_num_seqs=2, |
| 363 | + auto_cls=AutoModelForImageTextToText, |
| 364 | + vllm_runner_kwargs={"mm_processor_kwargs": {"do_pan_and_scan": True}}, |
| 365 | + patch_hf_runner=model_utils.gemma3_patch_hf_runner, |
| 366 | + num_logprobs=10, |
| 367 | + ), |
362 | 368 | "glm4v": VLMTestInfo( |
363 | 369 | models=["zai-org/glm-4v-9b"], |
364 | 370 | test_type=VLMTestType.IMAGE, |
|