Skip to content

Commit ad3ec89

Browse files
Isotr0py and ywang96 authored
[VLM] Add Qwen3-VL generation test (#25185)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn> Signed-off-by: Roger Wang <hey@rogerw.io> Co-authored-by: Roger Wang <hey@rogerw.io>
1 parent 3481e40 commit ad3ec89

File tree

7 files changed

+108
-5
lines changed

7 files changed

+108
-5
lines changed

tests/models/multimodal/generation/test_common.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -159,6 +159,28 @@
159159
image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
160160
marks=[pytest.mark.core_model, pytest.mark.cpu_model],
161161
),
162+
"qwen3_vl": VLMTestInfo(
163+
models=["Qwen/Qwen3-VL-4B-Instruct"],
164+
test_type=(
165+
VLMTestType.IMAGE,
166+
VLMTestType.MULTI_IMAGE,
167+
VLMTestType.VIDEO,
168+
),
169+
needs_video_metadata=True,
170+
prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
171+
img_idx_to_prompt=lambda idx: "<|vision_start|><|image_pad|><|vision_end|>", # noqa: E501
172+
video_idx_to_prompt=lambda idx: "<|vision_start|><|video_pad|><|vision_end|>", # noqa: E501
173+
max_model_len=4096,
174+
max_num_seqs=2,
175+
num_logprobs=20,
176+
auto_cls=AutoModelForImageTextToText,
177+
vllm_output_post_proc=model_utils.qwen2_vllm_to_hf_output,
178+
patch_hf_runner=model_utils.qwen3_vl_patch_hf_runner,
179+
image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
180+
marks=[
181+
pytest.mark.core_model,
182+
],
183+
),
162184
"ultravox": VLMTestInfo(
163185
models=["fixie-ai/ultravox-v0_5-llama-3_2-1b"],
164186
test_type=VLMTestType.AUDIO,

tests/models/multimodal/generation/vlm_utils/builders.py

Lines changed: 30 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,9 @@
44

55
from collections.abc import Callable, Iterable
66
from pathlib import PosixPath
7+
from typing import Any
78

9+
import numpy.typing as npt
810
import torch
911

1012
from vllm.multimodal.audio import AudioResampler
@@ -236,6 +238,7 @@ def build_video_inputs_from_test_info(
236238
video_assets: VideoTestAssets,
237239
size_wrapper: ImageSizeWrapper,
238240
num_frames: int,
241+
needs_video_metadata: bool,
239242
) -> list[PromptWithMultiModalInput]:
240243
if test_info.prompt_formatter is None:
241244
raise ValueError("Prompt formatter must be set to build video inputs")
@@ -248,7 +251,10 @@ def build_video_inputs_from_test_info(
248251
)
249252

250253
sampled_vids = [
251-
sample_frames_from_video(asset.np_ndarrays, num_frames)
254+
sample_frames_with_video_metadata(
255+
(asset.np_ndarrays, asset.metadata),
256+
num_frames,
257+
)
252258
for asset in video_assets
253259
]
254260

@@ -259,12 +265,33 @@ def build_video_inputs_from_test_info(
259265
return [
260266
PromptWithMultiModalInput(
261267
prompts=[prompt for _ in size_wrapper.data],
262-
video_data=[video_scaler(video, size) for size in size_wrapper.data],
268+
video_data=[
269+
(
270+
video_scaler(video, size)
271+
if not needs_video_metadata
272+
else (video_scaler(video, size), meta)
273+
)
274+
for size in size_wrapper.data
275+
],
263276
)
264-
for video, prompt in zip(sampled_vids, model_prompts)
277+
for (video, meta), prompt in zip(sampled_vids, model_prompts)
265278
]
266279

267280

281+
def sample_frames_with_video_metadata(
    video_with_meta: tuple[npt.NDArray, dict[str, Any]],
    num_frames: int,
) -> tuple[npt.NDArray, dict[str, Any]]:
    """Subsample a video to ``num_frames`` frames and derive matching metadata.

    Args:
        video_with_meta: Tuple of the raw frame array and its metadata dict
            (assumes the dict carries ``total_num_frames`` and ``duration``
            keys — TODO confirm against ``VideoAsset.metadata``).
        num_frames: Number of frames to keep.

    Returns:
        The sampled frames together with an updated *copy* of the metadata
        describing the sampled clip.
    """
    video, meta = video_with_meta
    # Work on a copy: the incoming dict is the shared ``asset.metadata``.
    # Mutating it in place would overwrite ``total_num_frames`` so that a
    # second call computes ``do_sample_frames`` as (num_frames == num_frames),
    # i.e. always True, regardless of the original frame count.
    meta = dict(meta)
    video = sample_frames_from_video(video, num_frames)

    # NOTE(review): the flag is True only when the clip already has exactly
    # ``num_frames`` frames — presumably to avoid double-sampling in the HF
    # processor; confirm against the Qwen3-VL processor contract.
    meta["do_sample_frames"] = meta["total_num_frames"] == num_frames
    meta["total_num_frames"] = num_frames
    # NOTE(review): this is seconds-per-frame, not frames-per-second, but it
    # matches how ``video_get_metadata`` populates "fps" — verify intent.
    meta["fps"] = meta["duration"] / num_frames
    meta["frames_indices"] = list(range(num_frames))
    return video, meta
293+
294+
268295
def apply_image_size_scaling(image, size: float | tuple[int, int], size_type: SizeType):
269296
"""Applies a size scaler to one image; this can be an image size factor,
270297
which scales the image while maintaining the aspect ratio"""

tests/models/multimodal/generation/vlm_utils/case_filtering.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -100,6 +100,9 @@ def get_model_type_cases(model_type: str, test_info: VLMTestInfo):
100100
# num_frames is video only
101101
if test_type == VLMTestType.VIDEO:
102102
iter_kwargs["num_video_frames"] = ensure_wrapped(test_info.num_video_frames)
103+
iter_kwargs["needs_video_metadata"] = ensure_wrapped(
104+
test_info.needs_video_metadata
105+
)
103106

104107
# No sizes passed for custom inputs, since inputs are directly provided
105108
if test_type not in (VLMTestType.CUSTOM_INPUTS, VLMTestType.AUDIO):

tests/models/multimodal/generation/vlm_utils/model_utils.py

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -905,6 +905,54 @@ def qwen2_5_omni_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
905905
return hf_model
906906

907907

908+
def qwen3_vl_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
    """Patches and returns an instance of the HfRunner to use for Qwen3-VL.

    Wraps the HF processor so that ``videos`` passed as
    ``(frames, metadata)`` tuples are unpacked into the separate
    ``videos`` / ``video_metadata`` arguments the Qwen3-VL processor
    expects, forwarding ``do_sample_frames`` as a kwarg when the caller
    did not set it.
    """
    hf_processor = hf_model.processor

    def processor(*args, videos=None, **kwargs):
        if videos is not None and is_list_of(videos, tuple):
            # Batched multi-video input: every clip must agree on whether
            # the processor should (re)sample frames.
            do_sample_frames = {video[1]["do_sample_frames"] for video in videos}
            assert len(do_sample_frames) == 1
            if kwargs.get("do_sample_frames") is None:
                # Forward the single flag value, not the set itself: a
                # non-empty set such as {False} is truthy, so passing the
                # set would silently force frame sampling on.
                kwargs["do_sample_frames"] = next(iter(do_sample_frames))
            # Strip the flag from the metadata passed to VideoMetadata,
            # which only accepts the remaining descriptive fields.
            video_metadata = [
                [
                    VideoMetadata(
                        **{k: v for k, v in video[1].items() if k != "do_sample_frames"}
                    )
                ]
                for video in videos
            ]
            videos = [[video[0]] for video in videos]
        elif videos is not None and isinstance(videos, tuple):
            # Single (frames, metadata) video.
            do_sample_frames = videos[1]["do_sample_frames"]
            if kwargs.get("do_sample_frames") is None:
                kwargs["do_sample_frames"] = do_sample_frames
            video_metadata = [
                [
                    VideoMetadata(
                        **{
                            k: v
                            for k, v in videos[1].items()
                            if k != "do_sample_frames"
                        }
                    )
                ]
            ]
            videos = [[videos[0]]]
        else:
            # No videos (or already in the processor's native format).
            video_metadata = None

        return hf_processor(
            *args, videos=videos, video_metadata=video_metadata, **kwargs
        )

    hf_model.processor = processor
    return hf_model
954+
955+
908956
def tarsier_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
909957
from vllm.model_executor.models.tarsier import get_vision_encoder_info
910958

tests/models/multimodal/generation/vlm_utils/runners.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -117,6 +117,7 @@ def run_video_test(
117117
video_assets,
118118
test_case.size_wrapper,
119119
test_case.num_video_frames,
120+
test_case.needs_video_metadata,
120121
)
121122

122123
core.run_test(

tests/models/multimodal/generation/vlm_utils/types.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -154,7 +154,8 @@ class VLMTestInfo(NamedTuple):
154154
dtype: str = "auto"
155155
distributed_executor_backend: str | None = None
156156
# Only expanded in video tests
157-
num_video_frames: int = 16
157+
num_video_frames: int | tuple[int] = 16
158+
needs_video_metadata: bool = False
158159

159160
# Fixed image sizes / image size factors; most tests use image_size_factors
160161
# The values provided for these two fields will be stacked and expanded
@@ -212,5 +213,6 @@ class ExpandableVLMTestArgs(NamedTuple):
212213
size_wrapper: ImageSizeWrapper | None = None
213214
# Video only
214215
num_video_frames: int | None = None
216+
needs_video_metadata: bool = False
215217
# Custom inputs only
216218
custom_test_opts: CustomTestOptions | None = None

vllm/assets/video.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -94,7 +94,7 @@ def video_get_metadata(path: str, num_frames: int = -1) -> dict[str, Any]:
9494

9595
metadata = {
9696
"total_num_frames": num_frames,
97-
"fps": fps,
97+
"fps": duration / num_frames,
9898
"duration": duration,
9999
"video_backend": "opencv",
100100
"frames_indices": list(range(num_frames)),

0 commit comments

Comments
 (0)