Skip to content

[V1] Change return type on get_multimodal_embeddings() #19446

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Merged 1 commit on Jun 16, 2025
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions vllm/model_executor/models/aria.py
Original file line number Diff line number Diff line change
Expand Up @@ -601,11 +601,11 @@ def _process_image_input(
def get_language_model(self) -> torch.nn.Module:
return self.language_model

def get_multimodal_embeddings(
self, **kwargs: object) -> Optional[MultiModalEmbeddings]:
def get_multimodal_embeddings(self,
**kwargs: object) -> MultiModalEmbeddings:
image_input = self._parse_and_validate_image_input(**kwargs)
if image_input is None:
return None
return []
multimodal_embeddings = self._process_image_input(image_input)
return multimodal_embeddings

Expand Down
6 changes: 3 additions & 3 deletions vllm/model_executor/models/aya_vision.py
Original file line number Diff line number Diff line change
Expand Up @@ -406,11 +406,11 @@ def _parse_and_validate_image_input(
def get_language_model(self) -> torch.nn.Module:
return self.language_model

def get_multimodal_embeddings(
self, **kwargs: object) -> Optional[MultiModalEmbeddings]:
def get_multimodal_embeddings(self,
**kwargs: object) -> MultiModalEmbeddings:
image_input = self._parse_and_validate_image_input(**kwargs)
if image_input is None:
return None
return []

return self._process_image_input(image_input, **kwargs)

Expand Down
6 changes: 3 additions & 3 deletions vllm/model_executor/models/blip2.py
Original file line number Diff line number Diff line change
Expand Up @@ -627,11 +627,11 @@ def _process_image_input(self,
def get_language_model(self) -> torch.nn.Module:
return self.language_model

def get_multimodal_embeddings(
self, **kwargs: object) -> Optional[MultiModalEmbeddings]:
def get_multimodal_embeddings(self,
**kwargs: object) -> MultiModalEmbeddings:
image_input = self._parse_and_validate_image_input(**kwargs)
if image_input is None:
return None
return []
vision_embeddings = self._process_image_input(image_input)
return vision_embeddings

Expand Down
6 changes: 3 additions & 3 deletions vllm/model_executor/models/chameleon.py
Original file line number Diff line number Diff line change
Expand Up @@ -987,11 +987,11 @@ def _parse_and_validate_image_input(
def get_language_model(self) -> torch.nn.Module:
return self.model

def get_multimodal_embeddings(
self, **kwargs: object) -> Optional[MultiModalEmbeddings]:
def get_multimodal_embeddings(self,
**kwargs: object) -> MultiModalEmbeddings:
image_input = self._parse_and_validate_image_input(**kwargs)
if image_input is None:
return None
return []
assert self.model.vqmodel is not None
image_tokens = self.model.get_image_tokens(image_input["data"].to(
self.config.torch_dtype))
Expand Down
6 changes: 3 additions & 3 deletions vllm/model_executor/models/deepseek_vl2.py
Original file line number Diff line number Diff line change
Expand Up @@ -586,11 +586,11 @@ def _process_image_input(
def get_language_model(self) -> torch.nn.Module:
return self.language_model

def get_multimodal_embeddings(
self, **kwargs: object) -> Optional[MultiModalEmbeddings]:
def get_multimodal_embeddings(self,
**kwargs: object) -> MultiModalEmbeddings:
image_input = self._parse_and_validate_image_input(**kwargs)
if image_input is None:
return None
return []
vision_embeddings = self._process_image_input(image_input)
return vision_embeddings

Expand Down
6 changes: 3 additions & 3 deletions vllm/model_executor/models/florence2.py
Original file line number Diff line number Diff line change
Expand Up @@ -1032,11 +1032,11 @@ def _process_image_input(
def get_language_model(self) -> torch.nn.Module:
return self.language_model

def get_multimodal_embeddings(
self, **kwargs: object) -> Optional[MultiModalEmbeddings]:
def get_multimodal_embeddings(self,
**kwargs: object) -> MultiModalEmbeddings:
image_input = self._parse_and_validate_image_input(**kwargs)
if image_input is None:
return None
return []
vision_embeddings = self._process_image_input(image_input)
return vision_embeddings

Expand Down
6 changes: 3 additions & 3 deletions vllm/model_executor/models/fuyu.py
Original file line number Diff line number Diff line change
Expand Up @@ -324,11 +324,11 @@ def _process_image_input(
def get_language_model(self) -> torch.nn.Module:
return self.language_model

def get_multimodal_embeddings(
self, **kwargs: object) -> Optional[MultiModalEmbeddings]:
def get_multimodal_embeddings(self,
**kwargs: object) -> MultiModalEmbeddings:
image_input = self._parse_and_validate_image_input(**kwargs)
if image_input is None:
return None
return []

return self._process_image_input(image_input)

Expand Down
6 changes: 3 additions & 3 deletions vllm/model_executor/models/gemma3_mm.py
Original file line number Diff line number Diff line change
Expand Up @@ -568,11 +568,11 @@ def _process_image_input(
def get_language_model(self) -> torch.nn.Module:
return self.language_model

def get_multimodal_embeddings(
self, **kwargs: object) -> Optional[MultiModalEmbeddings]:
def get_multimodal_embeddings(self,
**kwargs: object) -> MultiModalEmbeddings:
image_input = self._parse_and_validate_image_input(**kwargs)
if image_input is None:
return None
return []

return self._process_image_input(image_input)

Expand Down
6 changes: 3 additions & 3 deletions vllm/model_executor/models/glm4v.py
Original file line number Diff line number Diff line change
Expand Up @@ -593,11 +593,11 @@ def _process_image_input(
def get_language_model(self) -> torch.nn.Module:
return self.transformer

def get_multimodal_embeddings(
self, **kwargs: object) -> Optional[MultiModalEmbeddings]:
def get_multimodal_embeddings(self,
**kwargs: object) -> MultiModalEmbeddings:
image_input = self._parse_and_validate_image_input(**kwargs)
if image_input is None:
return None
return []

vision_embeddings = self._process_image_input(image_input)
return vision_embeddings
Expand Down
3 changes: 2 additions & 1 deletion vllm/model_executor/models/granite_speech.py
Original file line number Diff line number Diff line change
Expand Up @@ -706,10 +706,11 @@ def _process_audio_input(
def get_multimodal_embeddings(
self,
**kwargs: object,
) -> Optional[MultiModalEmbeddings]:
) -> MultiModalEmbeddings:
"""Compute the audio embeddings if audio inputs are present."""
audio_input = self._parse_and_validate_audio_input(**kwargs)
if audio_input is None:
return []
return None
audio_features = self._process_audio_input(audio_input)
return audio_features
Expand Down
6 changes: 3 additions & 3 deletions vllm/model_executor/models/idefics3.py
Original file line number Diff line number Diff line change
Expand Up @@ -706,11 +706,11 @@ def _process_image_input(
def get_language_model(self) -> torch.nn.Module:
return self.model

def get_multimodal_embeddings(
self, **kwargs: object) -> Optional[MultiModalEmbeddings]:
def get_multimodal_embeddings(self,
**kwargs: object) -> MultiModalEmbeddings:
image_input = self._parse_and_validate_image_input(**kwargs)
if image_input is None:
return None
return []

return self._process_image_input(image_input)

Expand Down
4 changes: 2 additions & 2 deletions vllm/model_executor/models/interfaces.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,8 +44,8 @@ class SupportsMultiModal(Protocol):
MRO of your model class.
"""

def get_multimodal_embeddings(
self, **kwargs: object) -> Optional[MultiModalEmbeddings]:
def get_multimodal_embeddings(self,
**kwargs: object) -> MultiModalEmbeddings:
"""
Returns multimodal embeddings generated from multimodal kwargs
to be merged with text embeddings.
Expand Down
5 changes: 3 additions & 2 deletions vllm/model_executor/models/internvl.py
Original file line number Diff line number Diff line change
Expand Up @@ -1304,11 +1304,12 @@ def _set_visual_token_mask(self, input_ids: torch.Tensor) -> None:
def get_language_model(self) -> torch.nn.Module:
return self.language_model

def get_multimodal_embeddings(
self, **kwargs: object) -> Optional[MultiModalEmbeddings]:
def get_multimodal_embeddings(self,
**kwargs: object) -> MultiModalEmbeddings:

modalities = self._parse_and_validate_multimodal_inputs(**kwargs)
if not modalities:
return []
return None

# The result multimodal_embeddings is tuple of tensors, with each
Expand Down
6 changes: 3 additions & 3 deletions vllm/model_executor/models/llava.py
Original file line number Diff line number Diff line change
Expand Up @@ -659,11 +659,11 @@ def _process_image_input(
def get_language_model(self) -> torch.nn.Module:
return self.language_model

def get_multimodal_embeddings(
self, **kwargs: object) -> Optional[MultiModalEmbeddings]:
def get_multimodal_embeddings(self,
**kwargs: object) -> MultiModalEmbeddings:
image_input = self._parse_and_validate_image_input(**kwargs)
if image_input is None:
return None
return []

return self._process_image_input(image_input)

Expand Down
8 changes: 4 additions & 4 deletions vllm/model_executor/models/llava_next.py
Original file line number Diff line number Diff line change
Expand Up @@ -478,11 +478,11 @@ def _process_image_input(
def get_language_model(self) -> torch.nn.Module:
return self.language_model

def get_multimodal_embeddings(
self, **kwargs: object) -> Optional[MultiModalEmbeddings]:
def get_multimodal_embeddings(self,
**kwargs: object) -> MultiModalEmbeddings:
image_input = self._parse_and_validate_image_input(**kwargs)
if image_input is None:
return None
return []
vision_embeddings = self._process_image_input(image_input)
return vision_embeddings

Expand All @@ -492,7 +492,7 @@ def get_input_embeddings(
multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
) -> torch.Tensor:

if multimodal_embeddings is None:
if not multimodal_embeddings:
return self.language_model.get_input_embeddings(input_ids)

inputs_embeds = embed_multimodal(
Expand Down
6 changes: 3 additions & 3 deletions vllm/model_executor/models/llava_next_video.py
Original file line number Diff line number Diff line change
Expand Up @@ -401,11 +401,11 @@ def _process_video_pixels(self, inputs: LlavaNextVideoPixelInputs):
def get_language_model(self) -> torch.nn.Module:
return self.language_model

def get_multimodal_embeddings(
self, **kwargs: object) -> Optional[MultiModalEmbeddings]:
def get_multimodal_embeddings(self,
**kwargs: object) -> MultiModalEmbeddings:
video_input = self._parse_and_validate_video_input(**kwargs)
if video_input is None:
return None
return []
vision_embeddings = self._process_video_pixels(video_input)
return vision_embeddings

Expand Down
5 changes: 3 additions & 2 deletions vllm/model_executor/models/llava_onevision.py
Original file line number Diff line number Diff line change
Expand Up @@ -839,11 +839,12 @@ def apply_pooling(self, image_features: torch.Tensor, stride: int = 2):
def get_language_model(self) -> torch.nn.Module:
return self.language_model

def get_multimodal_embeddings(
self, **kwargs: object) -> Optional[MultiModalEmbeddings]:
def get_multimodal_embeddings(self,
**kwargs: object) -> MultiModalEmbeddings:
mm_input_by_modality = self._parse_and_validate_multimodal_inputs(
**kwargs)
if not mm_input_by_modality:
return []
return None

# The result multimodal_embeddings is tuple of tensors, with each
Expand Down
6 changes: 3 additions & 3 deletions vllm/model_executor/models/minicpmv.py
Original file line number Diff line number Diff line change
Expand Up @@ -878,11 +878,11 @@ def _process_multimodal_inputs(self, modalities: dict):
def get_language_model(self) -> torch.nn.Module:
return self.llm

def get_multimodal_embeddings(
self, **kwargs: object) -> Optional[MultiModalEmbeddings]:
def get_multimodal_embeddings(self,
**kwargs: object) -> MultiModalEmbeddings:
modalities = self._parse_and_validate_multimodal_inputs(**kwargs)
if not modalities:
return None
return []

return self._process_multimodal_inputs(modalities)

Expand Down
6 changes: 3 additions & 3 deletions vllm/model_executor/models/minimax_vl_01.py
Original file line number Diff line number Diff line change
Expand Up @@ -318,11 +318,11 @@ def _parse_and_validate_image_input(

raise AssertionError("This line should be unreachable.")

def get_multimodal_embeddings(
self, **kwargs: object) -> Optional[MultiModalEmbeddings]:
def get_multimodal_embeddings(self,
**kwargs: object) -> MultiModalEmbeddings:
image_input = self._parse_and_validate_image_input(**kwargs)
if image_input is None:
return None
return []

return self._process_image_input(image_input)

Expand Down
6 changes: 3 additions & 3 deletions vllm/model_executor/models/mistral3.py
Original file line number Diff line number Diff line change
Expand Up @@ -495,11 +495,11 @@ def _process_image_input(
def get_language_model(self) -> torch.nn.Module:
return self.language_model

def get_multimodal_embeddings(
self, **kwargs: object) -> Optional[MultiModalEmbeddings]:
def get_multimodal_embeddings(self,
**kwargs: object) -> MultiModalEmbeddings:
image_input = self._parse_and_validate_image_input(**kwargs)
if image_input is None:
return None
return []

vision_embeddings = self._process_image_input(image_input)

Expand Down
5 changes: 2 additions & 3 deletions vllm/model_executor/models/mllama4.py
Original file line number Diff line number Diff line change
Expand Up @@ -794,11 +794,10 @@ def _process_image_input(
def get_language_model(self) -> torch.nn.Module:
return self.language_model

def get_multimodal_embeddings(self,
**kwargs) -> Optional[MultiModalEmbeddings]:
def get_multimodal_embeddings(self, **kwargs) -> MultiModalEmbeddings:
image_input = self._parse_and_validate_image_input(**kwargs)
if image_input is None:
return None
return []

return self._process_image_input(image_input)

Expand Down
6 changes: 3 additions & 3 deletions vllm/model_executor/models/molmo.py
Original file line number Diff line number Diff line change
Expand Up @@ -1473,11 +1473,11 @@ def _process_image_input(
def get_language_model(self) -> torch.nn.Module:
return self.model

def get_multimodal_embeddings(
self, **kwargs: object) -> Optional[MultiModalEmbeddings]:
def get_multimodal_embeddings(self,
**kwargs: object) -> MultiModalEmbeddings:
image_input = self._parse_and_validate_image_input(**kwargs)
if image_input is None:
return None
return []

return self._process_image_input(image_input)

Expand Down
6 changes: 3 additions & 3 deletions vllm/model_executor/models/ovis.py
Original file line number Diff line number Diff line change
Expand Up @@ -499,11 +499,11 @@ def _process_image_input(

return tuple(vision_embeddings)

def get_multimodal_embeddings(
self, **kwargs: object) -> Optional[MultiModalEmbeddings]:
def get_multimodal_embeddings(self,
**kwargs: object) -> MultiModalEmbeddings:
image_input = self._parse_and_validate_image_input(**kwargs)
if image_input is None:
return None
return []

image_features = self._process_image_input(image_input)

Expand Down
6 changes: 3 additions & 3 deletions vllm/model_executor/models/paligemma.py
Original file line number Diff line number Diff line change
Expand Up @@ -338,11 +338,11 @@ def _process_image_input(
def get_language_model(self) -> torch.nn.Module:
return self.language_model

def get_multimodal_embeddings(
self, **kwargs: object) -> Optional[MultiModalEmbeddings]:
def get_multimodal_embeddings(self,
**kwargs: object) -> MultiModalEmbeddings:
image_input = self._parse_and_validate_image_input(**kwargs)
if image_input is None:
return None
return []
vision_embeddings = self._process_image_input(image_input)
# https://github.com/huggingface/transformers/blob/main/src/transformers/models/paligemma/modeling_paligemma.py#L294 # noqa
vision_embeddings = vision_embeddings * (self.config.hidden_size**-0.5)
Expand Down
8 changes: 4 additions & 4 deletions vllm/model_executor/models/phi3v.py
Original file line number Diff line number Diff line change
Expand Up @@ -655,11 +655,11 @@ def _process_image_input(
def get_language_model(self) -> torch.nn.Module:
return self.language_model

def get_multimodal_embeddings(
self, **kwargs: object) -> Optional[MultiModalEmbeddings]:
def get_multimodal_embeddings(self,
**kwargs: object) -> MultiModalEmbeddings:
image_input = self._parse_and_validate_image_input(**kwargs)
if image_input is None:
return None
return []
vision_embeddings = self._process_image_input(image_input)
return vision_embeddings

Expand All @@ -669,7 +669,7 @@ def get_input_embeddings(
multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
) -> torch.Tensor:
inputs_embeds = self.embed_tokens(input_ids)
if multimodal_embeddings is not None:
if multimodal_embeddings:
inputs_embeds = merge_multimodal_embeddings(
input_ids, inputs_embeds, multimodal_embeddings,
self.image_token_id)
Expand Down
Loading