44
55from collections .abc import Callable , Iterable
66from pathlib import PosixPath
7+ from typing import Any
78
9+ import numpy .typing as npt
810import torch
911
1012from vllm .multimodal .audio import AudioResampler
@@ -236,6 +238,7 @@ def build_video_inputs_from_test_info(
236238 video_assets : VideoTestAssets ,
237239 size_wrapper : ImageSizeWrapper ,
238240 num_frames : int ,
241+ needs_video_metadata : bool ,
239242) -> list [PromptWithMultiModalInput ]:
240243 if test_info .prompt_formatter is None :
241244 raise ValueError ("Prompt formatter must be set to build video inputs" )
@@ -248,7 +251,10 @@ def build_video_inputs_from_test_info(
248251 )
249252
250253 sampled_vids = [
251- sample_frames_from_video (asset .np_ndarrays , num_frames )
254+ sample_frames_with_video_metadata (
255+ (asset .np_ndarrays , asset .metadata ),
256+ num_frames ,
257+ )
252258 for asset in video_assets
253259 ]
254260
@@ -259,12 +265,33 @@ def build_video_inputs_from_test_info(
259265 return [
260266 PromptWithMultiModalInput (
261267 prompts = [prompt for _ in size_wrapper .data ],
262- video_data = [video_scaler (video , size ) for size in size_wrapper .data ],
268+ video_data = [
269+ (
270+ video_scaler (video , size )
271+ if not needs_video_metadata
272+ else (video_scaler (video , size ), meta )
273+ )
274+ for size in size_wrapper .data
275+ ],
263276 )
264- for video , prompt in zip (sampled_vids , model_prompts )
277+ for ( video , meta ) , prompt in zip (sampled_vids , model_prompts )
265278 ]
266279
267280
def sample_frames_with_video_metadata(
    video_with_meta: tuple[npt.NDArray, dict[str, Any]],
    num_frames: int,
) -> tuple[npt.NDArray, dict[str, Any]]:
    """Sample ``num_frames`` frames from a video and return updated metadata.

    Args:
        video_with_meta: A ``(video, metadata)`` pair. ``metadata`` must
            contain the keys ``"total_num_frames"`` and ``"duration"``.
        num_frames: Number of frames requested from
            ``sample_frames_from_video``.

    Returns:
        A ``(sampled_video, new_metadata)`` pair. ``new_metadata`` is a new
        dict holding every entry of the input metadata, with
        ``"do_sample_frames"``, ``"total_num_frames"``, ``"fps"`` and
        ``"frames_indices"`` overwritten. The input dict is left untouched.

    Raises:
        KeyError: If ``"total_num_frames"`` or ``"duration"`` is missing
            from the metadata.
    """
    video, meta = video_with_meta
    video = sample_frames_from_video(video, num_frames)

    # Update a shallow copy rather than the caller's dict: the caller hands in
    # the asset's own metadata object, and writing to it in place would make a
    # second call on the same asset see the already-overwritten
    # "total_num_frames" and compute a different "do_sample_frames".
    meta = dict(meta)
    meta["do_sample_frames"] = meta["total_num_frames"] == num_frames
    meta["total_num_frames"] = num_frames
    # NOTE(review): if "duration" is in seconds, frames-per-second would be
    # num_frames / duration; this expression is the inverse. Kept as-is
    # because the metadata units are not visible here -- confirm intent.
    # Presumably num_frames is never 0 here (division) -- verify against
    # callers.
    meta["fps"] = meta["duration"] / num_frames
    meta["frames_indices"] = list(range(num_frames))
    return video, meta
293+
294+
268295def apply_image_size_scaling (image , size : float | tuple [int , int ], size_type : SizeType ):
269296 """Applies a size scaler to one image; this can be an image size factor,
270297 which scales the image while maintaining the aspect ratio"""
0 commit comments