2828from vllm .model_executor .models .module_mapping import MultiModelKeys
2929from vllm .model_executor .sampling_metadata import SamplingMetadata
3030from vllm .multimodal import MULTIMODAL_REGISTRY
31- from vllm .multimodal .inputs import MultiModalInputs , NestedTensors
31+ from vllm .multimodal .inputs import MultiModalKwargs , NestedTensors
3232from vllm .sequence import IntermediateTensors , SequenceData
3333from vllm .transformers_utils .tokenizer import cached_tokenizer_from_config
3434
@@ -1319,9 +1319,9 @@ def dummy_data_for_phi4mm(ctx: InputContext, seq_len: int,
13191319
13201320
13211321def input_mapper_for_phi4mm_audio (ctx : InputContext ,
1322- data : object ) -> MultiModalInputs :
1322+ data : object ) -> MultiModalKwargs :
13231323 """
1324- This function is used to create the MultiModalInputs for the Phi4MM
1324+ This function is used to create the MultiModalKwargs for the Phi4MM
13251325 (audio) model.
13261326 Specifically, for audio, we extract the audio features from the sound
13271327 file and create pairs of audio features and audio embed lengths (the
@@ -1338,13 +1338,13 @@ def input_mapper_for_phi4mm_audio(ctx: InputContext,
13381338 data (object): Audio data.
13391339
13401340 Returns:
1341- MultiModalInputs : Multi-modal inputs.
1341+ MultiModalKwargs : Multi-modal inputs.
13421342 """
13431343 if not isinstance (data , list ):
13441344 data = [data ]
13451345
13461346 if len (data ) == 0 :
1347- return MultiModalInputs ()
1347+ return MultiModalKwargs ()
13481348
13491349 audio_features = []
13501350 for audio_input in data :
@@ -1365,15 +1365,15 @@ def input_mapper_for_phi4mm_audio(ctx: InputContext,
13651365 [single_audio_embed_size ],
13661366 )
13671367 audio_features .append (single_audio_feature_audio_len_pair )
1368- return MultiModalInputs ({"audio_features" : audio_features })
1368+ return MultiModalKwargs ({"audio_features" : audio_features })
13691369
13701370
13711371def input_mapper_for_phi4mm_image (ctx : InputContext , data : object ):
13721372 if not isinstance (data , list ):
13731373 data = [data ]
13741374 # data: list of PIL images
13751375 if len (data ) == 0 :
1376- return MultiModalInputs ()
1376+ return MultiModalKwargs ()
13771377 hf_config = ctx .get_hf_config ()
13781378 vision_encoder_name = hf_config .img_processor
13791379 if vision_encoder_name is None :
@@ -1385,7 +1385,7 @@ def input_mapper_for_phi4mm_image(ctx: InputContext, data: object):
13851385
13861386 image_input_dict = preprocess (data , dynamic_hd_size , vit_image_size ,
13871387 vit_patch_size )
1388- return MultiModalInputs ({
1388+ return MultiModalKwargs ({
13891389 "pixel_values" :
13901390 image_input_dict ["pixel_values" ],
13911391 "image_sizes" :
0 commit comments