Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[VLM] Reorganize profiling/processing-related code #11812

Merged
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Cleanup
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
  • Loading branch information
DarkLight1337 committed Jan 7, 2025
commit b7e5324815aa48a00d2769bb37e0f695111a239e
4 changes: 2 additions & 2 deletions vllm/model_executor/models/llava.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,7 +123,7 @@ def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]:
return {"image": self.get_max_image_tokens()}

def apply_feature_select_strategy(
def _apply_feature_select_strategy(
self,
strategy: str,
encoder_num_image_tokens: int,
Expand All @@ -145,7 +145,7 @@ def get_num_image_tokens(
hf_config = self.get_hf_config()
vision_encoder_info = self.get_vision_encoder_info()

return self.apply_feature_select_strategy(
return self._apply_feature_select_strategy(
hf_config.vision_feature_select_strategy,
vision_encoder_info.get_num_image_tokens(
image_width=image_width,
Expand Down
25 changes: 17 additions & 8 deletions vllm/model_executor/models/llava_next.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from abc import abstractmethod
from functools import cached_property
from typing import (Final, Iterable, List, Literal, Mapping, Optional,
Protocol, Set, Tuple, TypedDict, TypeVar, Union)
Expand Down Expand Up @@ -82,7 +83,7 @@ def get_num_image_tokens(
hf_config = self.get_hf_config()
vision_encoder_info = self.get_vision_encoder_info()

base_feature_size = self.apply_feature_select_strategy(
base_feature_size = self._apply_feature_select_strategy(
hf_config.vision_feature_select_strategy,
vision_encoder_info.get_num_image_tokens(
image_width=image_width,
Expand All @@ -99,7 +100,7 @@ def get_num_image_tokens(
(
unpadded_feature_size,
newline_feature_size,
) = self.get_num_unpadded_features(
) = self._get_num_unpadded_features(
original_height=image_height,
original_width=image_width,
npatches=vision_encoder_info.get_patch_grid_length(),
Expand All @@ -110,7 +111,7 @@ def get_num_image_tokens(
return unpadded_feature_size + newline_feature_size + base_feature_size

# Based on: https://github.com/huggingface/text-generation-inference/blob/v3.0.1/server/text_generation_server/models/vlm_causal_lm.py#L86
def get_num_unpadded_features(
def _get_num_unpadded_features(
self,
*,
original_height: int,
Expand Down Expand Up @@ -162,6 +163,19 @@ def get_image_size_with_most_features(self) -> ImageSize:

class BaseLlavaNextMultiModalProcessor(BaseLlavaMultiModalProcessor[_I]):

# Copied from BaseMultiModalProcessor
@abstractmethod
def _get_mm_fields_config(
self,
hf_inputs: BatchFeature,
hf_processor_mm_kwargs: Mapping[str, object],
) -> Mapping[str, MultiModalFieldConfig]:
raise NotImplementedError


class LlavaNextMultiModalProcessor(
BaseLlavaNextMultiModalProcessor[LlavaNextProcessingInfo]):

def _get_mm_fields_config(
self,
hf_inputs: BatchFeature,
Expand All @@ -174,11 +188,6 @@ def _get_mm_fields_config(
)


class LlavaNextMultiModalProcessor(
BaseLlavaNextMultiModalProcessor[LlavaNextProcessingInfo]):
pass


@MULTIMODAL_REGISTRY.register_processor(LlavaNextMultiModalProcessor,
info=LlavaNextProcessingInfo,
dummy=LlavaDummyInputsBuilder)
Expand Down
17 changes: 10 additions & 7 deletions vllm/model_executor/models/llava_next_video.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]:
max_video_tokens = self.get_num_video_tokens(
image_width=target_width,
image_height=target_height,
num_frames=self.get_max_num_frames(seq_len),
num_frames=self.get_num_frames_with_most_features(seq_len),
)

return {"video": max_video_tokens}
Expand All @@ -76,7 +76,7 @@ def get_image_size_with_most_features(self) -> ImageSize:
width = height = vision_encoder_info.get_image_size()
return ImageSize(width=width, height=height)

def get_num_frame_tokens(
def _get_num_frame_tokens(
self,
*,
image_width: int,
Expand All @@ -98,14 +98,14 @@ def get_num_video_tokens(
image_height: int,
num_frames: int,
) -> int:
num_frame_tokens = self.get_num_frame_tokens(
num_frame_tokens = self._get_num_frame_tokens(
image_width=image_width,
image_height=image_height,
)

return num_frame_tokens * num_frames

def get_max_video_frames(self, max_tokens: int) -> int:
def _get_max_video_frames(self, max_tokens: int) -> int:
target_width, target_height = self.get_image_size_with_most_features()

num_frames = 0
Expand All @@ -125,11 +125,11 @@ def get_max_video_frames(self, max_tokens: int) -> int:

return num_frames

def get_max_num_frames(self, seq_len: int) -> int:
def get_num_frames_with_most_features(self, seq_len: int) -> int:
mm_config = self.ctx.get_mm_config()
max_videos = mm_config.limit_per_prompt.get("video", 1)

max_total_frames = self.get_max_video_frames(seq_len)
max_total_frames = self._get_max_video_frames(seq_len)

return max(max_total_frames // max(max_videos, 1), 1)

Expand All @@ -146,15 +146,18 @@ def get_dummy_processor_inputs(

processor = self.info.get_hf_processor()
video_token = processor.video_token

target_width, target_height = \
self.info.get_image_size_with_most_features()
target_num_frames = \
self.info.get_num_frames_with_most_features(seq_len)

mm_data = {
"video":
self._get_dummy_videos(
width=target_width,
height=target_height,
num_frames=self.info.get_max_num_frames(seq_len),
num_frames=target_num_frames,
num_videos=num_videos,
)
}
Expand Down
41 changes: 14 additions & 27 deletions vllm/model_executor/models/llava_onevision.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,8 @@
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.inputs import (MultiModalFieldConfig, MultiModalKwargs,
NestedTensors)
from vllm.multimodal.parse import (ImageSize, MultiModalDataItems,
VideoEmbeddingItems, VideoProcessorItems)
from vllm.multimodal.parse import (MultiModalDataItems, VideoEmbeddingItems,
VideoProcessorItems)
from vllm.multimodal.processing import PromptReplacement
from vllm.multimodal.profiling import ProcessorInputs
from vllm.sequence import IntermediateTensors
Expand Down Expand Up @@ -109,7 +109,7 @@ def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]:

# Based on: https://github.com/huggingface/text-generation-inference/blob/v3.0.1/server/text_generation_server/models/vlm_causal_lm.py#L86
# with additional logic afterwards taken from LlavaOnevisionProcessor
def get_num_unpadded_features(
def _get_num_unpadded_features(
self,
*,
original_height: int,
Expand Down Expand Up @@ -145,23 +145,7 @@ def get_num_unpadded_features(

return (unpadded_features, newline_features)

def get_image_size_with_most_features(self) -> ImageSize:
hf_config = self.get_hf_config()
largest_feature_size, largest_feature_pinpoint = 0, None
for (height, width) in hf_config.image_grid_pinpoints:
feat_size = self.get_num_image_tokens(image_width=width,
image_height=height)
if feat_size > largest_feature_size:
largest_feature_size = feat_size
largest_feature_pinpoint = ImageSize(width=width,
height=height)

if largest_feature_size == 0 or largest_feature_pinpoint is None:
raise ValueError("Cannot have a largest feature size of 0!")

return largest_feature_pinpoint

def get_num_frame_tokens(
def _get_num_frame_tokens(
self,
*,
image_width: int,
Expand All @@ -183,14 +167,14 @@ def get_num_video_tokens(
image_height: int,
num_frames: int,
) -> int:
num_frame_tokens = self.get_num_frame_tokens(
num_frame_tokens = self._get_num_frame_tokens(
image_width=image_width,
image_height=image_height,
)

return num_frame_tokens * num_frames + 1 # Newline token

def get_max_video_frames(self, max_tokens: int) -> int:
def _get_max_video_frames(self, max_tokens: int) -> int:
target_width, target_height = self.get_image_size_with_most_features()

num_frames = 0
Expand All @@ -210,14 +194,14 @@ def get_max_video_frames(self, max_tokens: int) -> int:

return num_frames

def get_max_num_frames(self, seq_len: int) -> int:
def get_num_frames_with_most_features(self, seq_len: int) -> int:
mm_config = self.ctx.get_mm_config()
max_images = mm_config.limit_per_prompt.get("image", 1)
max_videos = mm_config.limit_per_prompt.get("video", 1)

max_image_tokens = self.get_max_image_tokens() * max_images
max_total_frames = self.get_max_video_frames(seq_len -
max_image_tokens)
max_total_frames = self._get_max_video_frames(seq_len -
max_image_tokens)
max_frames_per_video = min(max_total_frames // max(max_videos, 1),
_MAX_FRAMES_PER_VIDEO)

Expand All @@ -229,7 +213,7 @@ def get_max_video_tokens(self, seq_len: int) -> int:
return self.get_num_video_tokens(
image_width=target_width,
image_height=target_height,
num_frames=self.get_max_num_frames(seq_len),
num_frames=self.get_num_frames_with_most_features(seq_len),
)


Expand All @@ -247,8 +231,11 @@ def get_dummy_processor_inputs(
processor = self.info.get_hf_processor()
image_token = processor.image_token
video_token = processor.video_token

target_width, target_height = \
self.info.get_image_size_with_most_features()
target_num_frames = \
self.info.get_num_frames_with_most_features(seq_len)

mm_data = {
"image":
Expand All @@ -259,7 +246,7 @@ def get_dummy_processor_inputs(
self._get_dummy_videos(
width=target_width,
height=target_height,
num_frames=self.info.get_max_num_frames(seq_len),
num_frames=target_num_frames,
num_videos=num_videos,
)
}
Expand Down
15 changes: 9 additions & 6 deletions vllm/model_executor/models/qwen2_vl.py
Original file line number Diff line number Diff line change
Expand Up @@ -836,7 +836,7 @@ def get_max_image_tokens(self) -> int:
image_height=target_height,
)

def get_max_video_frames(self, max_tokens: int) -> int:
def _get_max_video_frames(self, max_tokens: int) -> int:
target_width, target_height = self.get_image_size_with_most_features()

num_frames = 0
Expand All @@ -856,14 +856,14 @@ def get_max_video_frames(self, max_tokens: int) -> int:

return num_frames

def get_max_num_frames(self, seq_len: int) -> int:
def get_num_frames_with_most_features(self, seq_len: int) -> int:
mm_config = self.ctx.get_mm_config()
max_images = mm_config.limit_per_prompt.get("image", 1)
max_videos = mm_config.limit_per_prompt.get("video", 1)

max_image_tokens = self.get_max_image_tokens() * max_images
max_total_frames = self.get_max_video_frames(seq_len -
max_image_tokens)
max_total_frames = self._get_max_video_frames(seq_len -
max_image_tokens)

num_frames = max(max_total_frames // max(max_videos, 1), 1)

Expand All @@ -879,7 +879,7 @@ def get_max_video_tokens(self, seq_len: int) -> int:
return self.get_num_video_tokens(
image_width=target_width,
image_height=target_height,
num_frames=self.get_max_num_frames(seq_len),
num_frames=self.get_num_frames_with_most_features(seq_len),
)


Expand All @@ -896,8 +896,11 @@ def get_dummy_processor_inputs(
hf_processor = self.info.get_hf_processor()
image_token: str = hf_processor.image_token
video_token: str = hf_processor.video_token

target_width, target_height = \
self.info.get_image_size_with_most_features()
target_num_frames = \
self.info.get_num_frames_with_most_features(seq_len)

mm_data = {
"image":
Expand All @@ -908,7 +911,7 @@ def get_dummy_processor_inputs(
self._get_dummy_videos(
width=target_width,
height=target_height,
num_frames=self.info.get_max_num_frames(seq_len),
num_frames=target_num_frames,
num_videos=num_videos,
)
}
Expand Down
Loading