
Commit fe378e9

Isotr0py and garg-amit authored and committed
[Model] Expose InternVL2 max_dynamic_patch as a mm_processor_kwarg (vllm-project#8946)
Signed-off-by: Amit Garg <mitgarg17495@gmail.com>
1 parent 900231d commit fe378e9

File tree: 2 files changed (+90, -61 lines)


examples/offline_inference_vision_language_multi_image.py

Lines changed: 1 addition & 0 deletions
@@ -115,6 +115,7 @@ def load_internvl(question: str, image_urls: List[str]) -> ModelRequestData:
         trust_remote_code=True,
         max_model_len=4096,
         limit_mm_per_prompt={"image": len(image_urls)},
+        mm_processor_kwargs={"max_dynamic_patch": 4},
     )
 
     placeholders = "\n".join(f"Image-{i}: <image>\n"
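For reference, a minimal standalone sketch of the new knob as this example uses it. The model name is an assumption for illustration; the point of the commit is that mm_processor_kwargs entries are forwarded into InternVL2's multimodal processing functions, so "max_dynamic_patch": 4 caps each image at 4 dynamic tiles (plus an optional thumbnail tile):

from vllm import LLM

# Hypothetical usage sketch; model name assumed, mirrors the example above.
llm = LLM(
    model="OpenGVLab/InternVL2-2B",
    trust_remote_code=True,
    max_model_len=4096,
    mm_processor_kwargs={"max_dynamic_patch": 4},
)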

vllm/model_executor/models/internvl.py

Lines changed: 89 additions & 61 deletions
@@ -5,8 +5,9 @@
 # Licensed under The MIT License [see LICENSE for details]
 # --------------------------------------------------------
 import re
-from typing import (Iterable, List, Literal, Mapping, Optional, Tuple,
-                    TypedDict, Union)
+from functools import partial
+from typing import (Any, Dict, Iterable, List, Literal, Mapping, Optional,
+                    Tuple, TypedDict, Union)
 
 import torch
 import torch.nn as nn
@@ -122,6 +123,20 @@ def calculate_num_blocks(orig_width: int, orig_height: int, min_num: int,
     return blocks, target_width, target_height
 
 
+def calculate_num_blocks_wrapper(hf_config: Dict[str, Any],
+                                 max_dynamic_patch: Optional[int] = None):
+    if max_dynamic_patch is None:
+        max_dynamic_patch = hf_config.max_dynamic_patch
+    min_num = hf_config.min_dynamic_patch
+    image_size = hf_config.vision_config.image_size
+    use_thumbnail = hf_config.use_thumbnail
+    return partial(calculate_num_blocks,
+                   min_num=min_num,
+                   max_num=max_dynamic_patch,
+                   image_size=image_size,
+                   use_thumbnail=use_thumbnail)
+
+
 # adapted from https://huggingface.co/OpenGVLab/InternVL2-1B
 def dynamic_preprocess(image: Image.Image, min_num: int, max_num: int,
                        image_size: int,
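The wrapper binds every config-derived argument with functools.partial, so call sites only supply per-image dimensions. A minimal usage sketch (the dimensions and override value here are made up):

# Hypothetical sketch: bind config values once, then call per image,
# exactly as input_processor_for_internvl does below.
num_blocks_calculator = calculate_num_blocks_wrapper(hf_config,
                                                     max_dynamic_patch=4)
num_blocks, target_width, target_height = num_blocks_calculator(1920, 1080)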
@@ -168,62 +183,85 @@ def image_to_pixel_values(image: Image.Image, input_size: int, min_num: int,
     return pixel_values
 
 
-def get_internvl_num_patches(image_size: int, patch_size: int,
-                             downsample_ratio: float):
+def image_to_pixel_values_wrapper(hf_config: Dict[str, Any],
+                                  max_dynamic_patch: Optional[int] = None):
+    image_size = hf_config.vision_config.image_size
+    min_num = hf_config.min_dynamic_patch
+    if max_dynamic_patch is None:
+        max_dynamic_patch = hf_config.max_dynamic_patch
+    use_thumbnail = hf_config.use_thumbnail
+    return partial(image_to_pixel_values,
+                   input_size=image_size,
+                   min_num=min_num,
+                   max_num=max_dynamic_patch,
+                   use_thumbnail=use_thumbnail)
+
+
+def get_internvl_num_patches(hf_config: Dict[str, Any]):
+    vision_config = hf_config.vision_config
+    downsample_ratio = hf_config.downsample_ratio
+    image_size = vision_config.image_size
+    patch_size = vision_config.patch_size
     return int(
         get_clip_num_patches(image_size=image_size, patch_size=patch_size) *
         (downsample_ratio**2))
 
 
-def get_max_internvl_image_tokens(ctx: InputContext):
+def get_max_internvl_image_tokens(ctx: InputContext,
+                                  *,
+                                  max_dynamic_patch: Optional[int] = None):
     hf_config = ctx.get_hf_config()
-    vision_config = hf_config.vision_config
 
+    if max_dynamic_patch is None:
+        max_dynamic_patch = hf_config.max_dynamic_patch
     use_thumbnail = hf_config.use_thumbnail
-    max_dynamic_patch = hf_config.max_dynamic_patch
-    if use_thumbnail:
+    if use_thumbnail and max_dynamic_patch > 1:
         max_dynamic_patch += 1
-    downsample_ratio = hf_config.downsample_ratio
 
-    image_size = vision_config.image_size
-    patch_size = vision_config.patch_size
-    num_patches = get_internvl_num_patches(image_size, patch_size,
-                                           downsample_ratio)
+    num_patches = get_internvl_num_patches(hf_config)
     return num_patches * max_dynamic_patch
 
 
-def input_processor_for_internvl(ctx: InputContext, llm_inputs: LLMInputs):
+def get_max_internvl_image_size(ctx: InputContext,
+                                *,
+                                max_dynamic_patch: Optional[int] = None):
+    hf_config = ctx.get_hf_config()
+    image_size = hf_config.vision_config.image_size
+
+    if max_dynamic_patch is None:
+        max_dynamic_patch = hf_config.max_dynamic_patch
+    use_thumbnail = hf_config.use_thumbnail
+    if use_thumbnail and max_dynamic_patch > 1:
+        max_dynamic_patch += 1
+    width = image_size * max_dynamic_patch
+    height = image_size
+    return width, height
+
+
+def input_processor_for_internvl(ctx: InputContext,
+                                 llm_inputs: LLMInputs,
+                                 *,
+                                 max_dynamic_patch: Optional[int] = None):
     multi_modal_data = llm_inputs.get("multi_modal_data")
     if multi_modal_data is None or "image" not in multi_modal_data:
         return llm_inputs
 
     model_config = ctx.model_config
     hf_config = ctx.get_hf_config()
-    vision_config = hf_config.vision_config
-
-    image_size = vision_config.image_size
-    patch_size = vision_config.patch_size
-    downsample_ratio = hf_config.downsample_ratio
-    num_patches = get_internvl_num_patches(image_size, patch_size,
-                                           downsample_ratio)
 
     image_data = multi_modal_data["image"]
-    min_num = hf_config.min_dynamic_patch
-    max_num = hf_config.max_dynamic_patch
-    use_thumbnail = hf_config.use_thumbnail
+    num_patches = get_internvl_num_patches(hf_config)
+    num_blocks_calculator = calculate_num_blocks_wrapper(
+        hf_config, max_dynamic_patch)
     if isinstance(image_data, Image.Image):
         width, height = image_data.size
-        num_blocks, _, _ = calculate_num_blocks(width, height, min_num,
-                                                max_num, image_size,
-                                                use_thumbnail)
+        num_blocks, _, _ = num_blocks_calculator(width, height)
         image_feature_size = [num_blocks * num_patches]
     elif is_list_of(image_data, Image.Image):
         image_feature_size = []
         for image in image_data:
             width, height = image.size
-            num_blocks, _, _ = calculate_num_blocks(width, height, min_num,
-                                                    max_num, image_size,
-                                                    use_thumbnail)
+            num_blocks, _, _ = num_blocks_calculator(width, height)
             image_feature_size.append(num_blocks * num_patches)
     elif isinstance(image_data, torch.Tensor):
         num_images, image_feature_size, hidden_size = image_data.shape
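To make the effect on the token budget concrete, a rough worked example; the config values below are assumptions typical of InternVL2 checkpoints, not values taken from this diff:

# Assumed config: image_size=448, patch_size=14, downsample_ratio=0.5.
num_patches = int((448 // 14) ** 2 * 0.5 ** 2)  # 1024 * 0.25 = 256 tokens/tile
# max_dynamic_patch=4 with use_thumbnail=True gives 4 + 1 = 5 tiles:
max_tokens = num_patches * (4 + 1)              # 1280 image tokens
# With an assumed default of max_dynamic_patch=12: 256 * 13 = 3328 tokens.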
@@ -253,31 +291,21 @@ def input_processor_for_internvl(ctx: InputContext, llm_inputs: LLMInputs):
                      multi_modal_data=multi_modal_data)
 
 
-def input_mapper_for_internvl(ctx: InputContext, data: object):
+def input_mapper_for_internvl(ctx: InputContext,
+                              data: object,
+                              *,
+                              max_dynamic_patch: Optional[int] = None):
     hf_config = ctx.get_hf_config()
 
-    use_thumbnail = hf_config.use_thumbnail
-    min_num = hf_config.min_dynamic_patch
-    max_num = hf_config.max_dynamic_patch
-    image_size = hf_config.vision_config.image_size
-
+    image_pixel_values_mapper = image_to_pixel_values_wrapper(
+        hf_config, max_dynamic_patch)
     if isinstance(data, Image.Image):
-        data = image_to_pixel_values(data,
-                                     image_size,
-                                     min_num,
-                                     max_num,
-                                     use_thumbnail=use_thumbnail)
+        data = image_pixel_values_mapper(data)
         # Add an N dimension for number of images per prompt (currently 1).
         data = data.unsqueeze(0)
     elif is_list_of(data, Image.Image):
         # we can't stack here because the images may have different num_patches
-        data = [
-            image_to_pixel_values(img,
-                                  image_size,
-                                  min_num,
-                                  max_num,
-                                  use_thumbnail=use_thumbnail) for img in data
-        ]
+        data = [image_pixel_values_mapper(img) for img in data]
     model_config = ctx.model_config
     tokenizer = cached_get_tokenizer(
         model_config.tokenizer,
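The mapper side follows the same pattern; a small sketch of the bound function in isolation (the override value is made up, and the shape comment reflects the per-tile tensors that image_to_pixel_values stacks):

# Hypothetical sketch: one bound callable per request, honoring the override.
image_pixel_values_mapper = image_to_pixel_values_wrapper(
    hf_config, max_dynamic_patch=4)
pixel_values = image_pixel_values_mapper(image)
# pixel_values: (num_blocks, 3, image_size, image_size), num_blocks <= 4,
# plus one extra thumbnail tile when use_thumbnail is enabled.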
@@ -292,35 +320,36 @@ def input_mapper_for_internvl(ctx: InputContext, data: object):
     })
 
 
-def dummy_data_for_internvl(ctx: InputContext, seq_len: int,
-                            mm_counts: Mapping[str, int]):
+def dummy_data_for_internvl(ctx: InputContext,
+                            seq_len: int,
+                            mm_counts: Mapping[str, int],
+                            *,
+                            max_dynamic_patch: Optional[int] = None):
     num_images = mm_counts["image"]
 
-    image_feature_size = get_max_internvl_image_tokens(ctx)
-    model_config = ctx.model_config
     hf_config = ctx.get_hf_config()
-    vision_config = hf_config.vision_config
+
+    image_feature_size = get_max_internvl_image_tokens(
+        ctx, max_dynamic_patch=max_dynamic_patch)
+    model_config = ctx.model_config
     tokenizer = cached_get_tokenizer(
         model_config.tokenizer,
         trust_remote_code=model_config.trust_remote_code)
 
     seq_data = dummy_seq_data_for_clip(
-        vision_config,
+        hf_config.vision_config,
         seq_len,
         num_images,
         image_token_id=tokenizer.encode(IMG_CONTEXT,
                                         add_special_tokens=False)[0],
         image_feature_size_override=image_feature_size,
     )
 
-    image_size = vision_config.image_size
-    min_num = hf_config.min_dynamic_patch
-    max_num = hf_config.max_dynamic_patch
-    max_image_width = max_num * image_size
-    max_image_height = min_num * image_size
+    max_image_width, max_image_height = get_max_internvl_image_size(
+        ctx, max_dynamic_patch=max_dynamic_patch)
 
     mm_data = dummy_image_for_clip(
-        vision_config,
+        hf_config.vision_config,
         num_images,
         image_width_override=max_image_width,
         image_height_override=max_image_height,
@@ -470,7 +499,6 @@ def _process_image_input(
         self,
         image_input: InternVLImageInputs,
     ) -> torch.Tensor:
-
         if image_input["type"] == "image_embeds":
             return image_input["data"]
