[Bugfix] Fix broken v0 multimodal inference #19814

Status: Closed (wants to merge 1 commit)

docs/contributing/model/multimodal.md (1 addition, 1 deletion)

@@ -69,7 +69,7 @@ Further update the model as follows:
     # model as one of the requirements of basic vLLM model implementation.
     inputs_embeds = self.language_model.get_input_embeddings(input_ids)
 
-    if multimodal_embeddings is not None:
+    if multimodal_embeddings:
         inputs_embeds = merge_multimodal_embeddings(
             input_ids=input_ids,
             inputs_embeds=inputs_embeds,

Reviewer comment (severity: medium): The change from `if multimodal_embeddings is not None:` to `if multimodal_embeddings:` in this documentation example correctly reflects the corresponding code changes. The updated condition is more Pythonic and robustly handles cases where `multimodal_embeddings` might be an empty list or tuple (which evaluates to `False`), in addition to `None`.
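To make the failure mode concrete, here is a minimal, runnable sketch. The `merge_sketch` helper below is hypothetical (vLLM's real `merge_multimodal_embeddings` does more than concatenate), but it shows how an empty embeddings list slips past the old `is not None` guard and crashes, while the new truthiness check skips the merge entirely:

```python
import torch


def merge_sketch(inputs_embeds: torch.Tensor,
                 multimodal_embeddings: list[torch.Tensor]) -> torch.Tensor:
    # Stand-in for the real merge: torch.cat raises a RuntimeError when
    # handed an empty list of tensors, mirroring the v0 crash this PR fixes.
    return torch.cat(multimodal_embeddings)


inputs_embeds = torch.zeros(4, 8)
multimodal_embeddings: list[torch.Tensor] = []  # e.g. a text-only request

# Old guard: an empty list is not None, so the merge runs and fails.
if multimodal_embeddings is not None:
    try:
        merge_sketch(inputs_embeds, multimodal_embeddings)
    except RuntimeError as exc:
        print(f"old guard crashed: {exc}")

# New guard: [] (and None) are falsy, so the merge is skipped.
if multimodal_embeddings:
    merge_sketch(inputs_embeds, multimodal_embeddings)
print("new guard skipped the merge")
```
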
vllm/model_executor/models/aria.py (1 addition, 1 deletion)

@@ -620,7 +620,7 @@ def get_input_embeddings(
         multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
     ) -> torch.Tensor:
         inputs_embeds = self.language_model.get_input_embeddings(input_ids)
-        if multimodal_embeddings is not None:
+        if multimodal_embeddings:
             inputs_embeds = merge_multimodal_embeddings(
                 input_ids, inputs_embeds, multimodal_embeddings,
                 self.config.image_token_index)

vllm/model_executor/models/aya_vision.py (1 addition, 1 deletion)

@@ -430,7 +430,7 @@ def get_input_embeddings(
         multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
     ) -> torch.Tensor:
         inputs_embeds = self.language_model.get_input_embeddings(input_ids)
-        if multimodal_embeddings is not None:
+        if multimodal_embeddings:
             inputs_embeds = merge_multimodal_embeddings(
                 input_ids=input_ids,
                 inputs_embeds=inputs_embeds,

vllm/model_executor/models/blip2.py (1 addition, 1 deletion)

@@ -641,7 +641,7 @@ def get_input_embeddings(
         multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
     ) -> torch.Tensor:
         inputs_embeds = self.language_model.get_input_embeddings(input_ids)
-        if multimodal_embeddings is not None:
+        if multimodal_embeddings:
             inputs_embeds = merge_multimodal_embeddings(
                 input_ids, inputs_embeds, multimodal_embeddings,
                 _IMAGE_TOKEN_ID)

vllm/model_executor/models/chameleon.py (1 addition, 1 deletion)

@@ -1005,7 +1005,7 @@ def get_input_embeddings(
     ) -> torch.Tensor:
 
         inputs_embeds = self.model.get_input_embeddings(input_ids)
-        if multimodal_embeddings is not None:
+        if multimodal_embeddings:
             inputs_embeds = merge_multimodal_embeddings(
                 input_ids, inputs_embeds, multimodal_embeddings,
                 self.model.vocabulary_mapping.image_token_id)

vllm/model_executor/models/deepseek_vl2.py (1 addition, 1 deletion)

@@ -600,7 +600,7 @@ def get_input_embeddings(
         multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
     ) -> torch.Tensor:
         inputs_embeds = self.language_model.get_input_embeddings(input_ids)
-        if multimodal_embeddings is not None:
+        if multimodal_embeddings:
             inputs_embeds = merge_multimodal_embeddings(
                 input_ids, inputs_embeds, multimodal_embeddings,
                 self.image_token_id)

vllm/model_executor/models/florence2.py (1 addition, 1 deletion)

@@ -1046,7 +1046,7 @@ def get_input_embeddings(
         multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
     ) -> torch.Tensor:
         inputs_embeds = self.language_model.get_input_embeddings(input_ids)
-        if multimodal_embeddings is not None:
+        if multimodal_embeddings:
             inputs_embeds = merge_multimodal_embeddings(
                 input_ids, inputs_embeds, multimodal_embeddings,
                 self.pad_token_id)

vllm/model_executor/models/fuyu.py (1 addition, 1 deletion)

@@ -345,7 +345,7 @@ def get_input_embeddings(
         multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
     ) -> torch.Tensor:
         inputs_embeds = self.language_model.get_input_embeddings(input_ids)
-        if multimodal_embeddings is not None:
+        if multimodal_embeddings:
             inputs_embeds = merge_multimodal_embeddings(
                 input_ids,
                 inputs_embeds,

vllm/model_executor/models/gemma3_mm.py (1 addition, 1 deletion)

@@ -592,7 +592,7 @@ def get_input_embeddings(
         multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
     ) -> torch.Tensor:
         inputs_embeds = self.language_model.get_input_embeddings(input_ids)
-        if multimodal_embeddings is not None:
+        if multimodal_embeddings:
             inputs_embeds = merge_multimodal_embeddings(
                 input_ids,
                 inputs_embeds,

vllm/model_executor/models/glm4v.py (1 addition, 1 deletion)

@@ -609,7 +609,7 @@ def get_input_embeddings(
     ) -> torch.Tensor:
         inputs_embeds = self.transformer.get_input_embeddings(input_ids)
 
-        if multimodal_embeddings is not None:
+        if multimodal_embeddings:
             inputs_embeds = merge_multimodal_embeddings(
                 input_ids=input_ids,
                 inputs_embeds=inputs_embeds,

vllm/model_executor/models/idefics3.py (1 addition, 1 deletion)

@@ -720,7 +720,7 @@ def get_input_embeddings(
         multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
     ) -> torch.Tensor:
         inputs_embeds = self.model.get_input_embeddings(input_ids)
-        if multimodal_embeddings is not None:
+        if multimodal_embeddings:
             inputs_embeds = merge_multimodal_embeddings(
                 input_ids,
                 inputs_embeds,

vllm/model_executor/models/internvl.py (1 addition, 1 deletion)

@@ -1336,7 +1336,7 @@ def get_input_embeddings(
         multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
     ) -> torch.Tensor:
         inputs_embeds = self.language_model.get_input_embeddings(input_ids)
-        if multimodal_embeddings is not None:
+        if multimodal_embeddings:
             context_token_ids = [
                 token_id for token_id in (self.img_context_token_id,
                                           self.video_context_token_id)

vllm/model_executor/models/kimi_vl.py (1 addition, 1 deletion)

@@ -393,7 +393,7 @@ def get_input_embeddings(
         # model as one of the requirements of basic vLLM model implementation.
         inputs_embeds = self.language_model.get_input_embeddings(input_ids)
 
-        if multimodal_embeddings is not None:
+        if multimodal_embeddings:
             inputs_embeds = merge_multimodal_embeddings(
                 input_ids=input_ids,
                 inputs_embeds=inputs_embeds,

vllm/model_executor/models/llava.py (1 addition, 1 deletion)

@@ -683,7 +683,7 @@ def get_input_embeddings(
         multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
     ) -> torch.Tensor:
         inputs_embeds = self.language_model.get_input_embeddings(input_ids)
-        if multimodal_embeddings is not None:
+        if multimodal_embeddings:
             inputs_embeds = merge_multimodal_embeddings(
                 input_ids,
                 inputs_embeds,

Reviewer comment (severity: medium): This change from `if multimodal_embeddings is not None:` to `if multimodal_embeddings:` is a good fix.

As per the PR description, the previous condition (`is not None`) would evaluate to `True` if `multimodal_embeddings` was an empty list (e.g., `[]`). This would lead to `merge_multimodal_embeddings` being called with an empty list for `multimodal_embeddings`.

If `multimodal_embeddings` is an empty list but `input_ids` still contains placeholder tokens, this could lead to an `IndexError` within `merge_multimodal_embeddings` when trying to access `multimodal_embeddings[0]`. The `torch.cat()` error mentioned in the PR description might be another symptom, depending on the exact path taken within `merge_multimodal_embeddings`.

The new condition `if multimodal_embeddings:` correctly evaluates to `False` for both `None` and empty lists/tuples, thus preventing `merge_multimodal_embeddings` from being called inappropriately. This makes the code more robust and Pythonic.
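As a quick illustration of the point above (plain Python, independent of vLLM), the two guards differ exactly on empty containers:

```python
# Compare both guards for the values discussed in this review: None,
# an empty list, an empty tuple, and a non-empty list.
for mm in (None, [], (), ["image_embedding"]):
    old_guard = mm is not None  # True for [] and (): the buggy case
    new_guard = bool(mm)        # False for None, [], and ()
    print(f"{mm!r:>21}  old: {old_guard!s:<5}  new: {new_guard}")
```
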
vllm/model_executor/models/llava_next_video.py (1 addition, 1 deletion)

@@ -426,7 +426,7 @@ def get_input_embeddings(
         multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
     ) -> torch.Tensor:
         inputs_embeds = self.language_model.get_input_embeddings(input_ids)
-        if multimodal_embeddings is not None:
+        if multimodal_embeddings:
             inputs_embeds = merge_multimodal_embeddings(
                 input_ids, inputs_embeds, multimodal_embeddings,
                 self.config.video_token_index)

vllm/model_executor/models/llava_onevision.py (1 addition, 1 deletion)

@@ -881,7 +881,7 @@ def get_input_embeddings(
         multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
     ) -> torch.Tensor:
         inputs_embeds = self.language_model.get_input_embeddings(input_ids)
-        if multimodal_embeddings is not None:
+        if multimodal_embeddings:
             inputs_embeds = merge_multimodal_embeddings(
                 input_ids, inputs_embeds, multimodal_embeddings,
                 [self.config.image_token_index, self.config.video_token_index])

vllm/model_executor/models/minicpmv.py (1 addition, 1 deletion)

@@ -892,7 +892,7 @@ def get_input_embeddings(
         multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
     ) -> torch.Tensor:
         inputs_embeds = self.llm.get_input_embeddings(input_ids)
-        if multimodal_embeddings is not None:
+        if multimodal_embeddings:
             assert len(self.mm_token_ids) > 0
             inputs_embeds = merge_multimodal_embeddings(
                 input_ids,

vllm/model_executor/models/minimax_vl_01.py (1 addition, 1 deletion)

@@ -201,7 +201,7 @@ def get_input_embeddings(
         multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
     ) -> torch.Tensor:
         inputs_embeds = self.language_model.get_input_embeddings(input_ids)
-        if multimodal_embeddings is not None:
+        if multimodal_embeddings:
             inputs_embeds = merge_multimodal_embeddings(
                 input_ids,
                 inputs_embeds,

vllm/model_executor/models/mistral3.py (1 addition, 1 deletion)

@@ -521,7 +521,7 @@ def get_input_embeddings(
         multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
     ) -> torch.Tensor:
         inputs_embeds = self.language_model.get_input_embeddings(input_ids)
-        if multimodal_embeddings is not None:
+        if multimodal_embeddings:
             inputs_embeds = merge_multimodal_embeddings(
                 input_ids,
                 inputs_embeds,

vllm/model_executor/models/mllama4.py (1 addition, 1 deletion)

@@ -808,7 +808,7 @@ def get_input_embeddings(
     ) -> torch.Tensor:
         inputs_embeds = self.language_model.get_input_embeddings(input_ids)
 
-        if multimodal_embeddings is not None:
+        if multimodal_embeddings:
             inputs_embeds = merge_multimodal_embeddings(
                 input_ids,
                 inputs_embeds,

vllm/model_executor/models/molmo.py (1 addition, 1 deletion)

@@ -1487,7 +1487,7 @@ def get_input_embeddings(
         multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
     ) -> torch.Tensor:
         inputs_embeds = self.model.get_input_embeddings(input_ids)
-        if multimodal_embeddings is not None:
+        if multimodal_embeddings:
             assert self.img_patch_id is not None
 
             inputs_embeds = merge_multimodal_embeddings(

vllm/model_executor/models/ovis.py (1 addition, 1 deletion)

@@ -515,7 +515,7 @@ def get_input_embeddings(
         multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
     ) -> torch.Tensor:
         inputs_embeds = self.llm.get_input_embeddings(input_ids)
-        if multimodal_embeddings is not None:
+        if multimodal_embeddings:
             inputs_embeds = merge_multimodal_embeddings(
                 input_ids, inputs_embeds, multimodal_embeddings,
                 self.image_pad_token_id)

vllm/model_executor/models/paligemma.py (1 addition, 1 deletion)

@@ -364,7 +364,7 @@ def get_input_embeddings(
         multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
     ) -> torch.Tensor:
         inputs_embeds = self.language_model.get_input_embeddings(input_ids)
-        if multimodal_embeddings is not None:
+        if multimodal_embeddings:
             inputs_embeds = merge_multimodal_embeddings(
                 input_ids, inputs_embeds, multimodal_embeddings,
                 self.config.image_token_index)

vllm/model_executor/models/phi4mm.py (1 addition, 1 deletion)

@@ -1148,7 +1148,7 @@ def get_input_embeddings(
         multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
     ) -> torch.Tensor:
         inputs_embeds = self.model.embed_tokens(input_ids)
-        if multimodal_embeddings is not None:
+        if multimodal_embeddings:
             inputs_embeds = merge_multimodal_embeddings(
                 input_ids, inputs_embeds, multimodal_embeddings,
                 [_IMAGE_PLACEHOLDER_TOKEN_ID, _AUDIO_PLACEHOLDER_TOKEN_ID])

vllm/model_executor/models/pixtral.py (1 addition, 1 deletion)

@@ -423,7 +423,7 @@ def get_input_embeddings(
         multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
     ) -> torch.Tensor:
         inputs_embeds = self.language_model.get_input_embeddings(input_ids)
-        if multimodal_embeddings is not None:
+        if multimodal_embeddings:
             inputs_embeds = merge_multimodal_embeddings(
                 input_ids,
                 inputs_embeds,

vllm/model_executor/models/qwen2_5_omni_thinker.py (1 addition, 1 deletion)

@@ -805,7 +805,7 @@ def get_input_embeddings(
         multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
     ) -> torch.Tensor:
         inputs_embeds = self.language_model.get_input_embeddings(input_ids)
-        if multimodal_embeddings is not None:
+        if multimodal_embeddings:
 
             # TODO (ywang96): support overlapping modalitiy embeddings so that
             # `use_audio_in_video` will work on V1.

vllm/model_executor/models/qwen2_5_vl.py (1 addition, 1 deletion)

@@ -1046,7 +1046,7 @@ def get_input_embeddings(
         multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
     ) -> torch.Tensor:
         inputs_embeds = self.language_model.get_input_embeddings(input_ids)
-        if multimodal_embeddings is not None:
+        if multimodal_embeddings:
             inputs_embeds = merge_multimodal_embeddings(
                 input_ids, inputs_embeds, multimodal_embeddings,
                 [self.config.image_token_id, self.config.video_token_id])

vllm/model_executor/models/qwen2_audio.py (1 addition, 1 deletion)

@@ -364,7 +364,7 @@ def get_input_embeddings(
         multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
     ) -> torch.Tensor:
         inputs_embeds = self.language_model.get_input_embeddings(input_ids)
-        if multimodal_embeddings is not None:
+        if multimodal_embeddings:
             inputs_embeds = merge_multimodal_embeddings(
                 input_ids, inputs_embeds, multimodal_embeddings,
                 self.config.audio_token_index)

vllm/model_executor/models/qwen2_vl.py (1 addition, 1 deletion)

@@ -1289,7 +1289,7 @@ def get_input_embeddings(
         multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
     ) -> torch.Tensor:
         inputs_embeds = self.language_model.get_input_embeddings(input_ids)
-        if multimodal_embeddings is not None:
+        if multimodal_embeddings:
             inputs_embeds = merge_multimodal_embeddings(
                 input_ids, inputs_embeds, multimodal_embeddings,
                 [self.config.image_token_id, self.config.video_token_id])

vllm/model_executor/models/qwen_vl.py (1 addition, 1 deletion)

@@ -754,7 +754,7 @@ def get_input_embeddings(
     ) -> torch.Tensor:
         inputs_embeds = self.transformer.get_input_embeddings(input_ids)
 
-        if multimodal_embeddings is not None:
+        if multimodal_embeddings:
             inputs_embeds = merge_multimodal_embeddings(
                 input_ids, inputs_embeds, multimodal_embeddings,
                 self.transformer.visual.image_pad_id)

vllm/model_executor/models/skyworkr1v.py (1 addition, 1 deletion)

@@ -883,7 +883,7 @@ def get_input_embeddings(
         multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
     ) -> torch.Tensor:
         inputs_embeds = self.language_model.get_input_embeddings(input_ids)
-        if multimodal_embeddings is not None:
+        if multimodal_embeddings:
             assert self.img_context_token_id is not None
             self._set_visual_token_mask(input_ids)
             inputs_embeds = merge_multimodal_embeddings(

vllm/model_executor/models/tarsier.py (1 addition, 1 deletion)

@@ -598,7 +598,7 @@ def get_input_embeddings(
         multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
     ) -> torch.Tensor:
         inputs_embeds = self.language_model.get_input_embeddings(input_ids)
-        if multimodal_embeddings is not None:
+        if multimodal_embeddings:
             inputs_embeds = merge_multimodal_embeddings(
                 input_ids,
                 inputs_embeds,

vllm/model_executor/models/ultravox.py (1 addition, 1 deletion)

@@ -560,7 +560,7 @@ def get_input_embeddings(
         multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
     ) -> torch.Tensor:
         inputs_embeds = self.language_model.get_input_embeddings(input_ids)
-        if multimodal_embeddings is not None:
+        if multimodal_embeddings:
 
             # TODO(ywang96): remove this block after v0 is deprecated.
             if not envs.VLLM_USE_V1: