
Commit 83b824c

[VLM] Remove BaseProcessingInfo.get_mm_max_tokens_per_item (#16408)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
1 parent: 7678fcd

39 files changed: 104 additions, 677 deletions
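This commit removes the per-model `get_mm_max_tokens_per_item` override; the worst-case token count is now implied by the dummy inputs that each model's `BaseDummyInputsBuilder` produces for memory profiling. Below is a minimal sketch of the pattern a model implementer follows after the change, assembled from the documentation changes in this commit. The class names, the `<image>` placeholder string, the fixed 336 px dummy size, and the `num_image_tokens` config field are illustrative; import paths follow the documentation references in the diff.

```python
from collections.abc import Mapping
from typing import Optional

from vllm.multimodal.processing import BaseProcessingInfo
from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs


class MyProcessingInfo(BaseProcessingInfo):
    """Per-model metadata; no get_mm_max_tokens_per_item override anymore."""

    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
        return {"image": 1}

    def get_num_image_tokens(self) -> int:
        # Hypothetical config field; still the single source of truth for
        # the per-image token count used elsewhere in the model's processor.
        return self.get_hf_config().num_image_tokens

    # Before this commit, each model also had to override:
    #
    #     def get_mm_max_tokens_per_item(
    #         self,
    #         seq_len: int,
    #         mm_counts: Mapping[str, int],
    #     ) -> Mapping[str, int]:
    #         return {"image": self.get_num_image_tokens()}
    #
    # The worst case is now inferred from the dummy inputs below instead.


class MyDummyInputsBuilder(BaseDummyInputsBuilder[MyProcessingInfo]):

    def get_dummy_processor_inputs(
        self,
        seq_len: int,
        mm_counts: Mapping[str, int],
    ) -> ProcessorInputs:
        num_images = mm_counts.get("image", 0)

        # Use the largest image the processor accepts so that profiling sees
        # the worst-case token count (336 px is an assumed placeholder size).
        target_width = target_height = 336

        return ProcessorInputs(
            prompt_text="<image>" * num_images,  # placeholder string assumed
            mm_data={
                "image":
                self._get_dummy_images(width=target_width,
                                       height=target_height,
                                       num_images=num_images)
            },
        )
```
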

docs/source/contributing/model/multimodal.md

Lines changed: 37 additions & 203 deletions
@@ -121,17 +121,21 @@ def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
         return {"image": None, "video": 1}
 ```

-### Maximum number of placeholder feature tokens
+## 3. Specify dummy inputs

-Also, override the abstract method {meth}`~vllm.multimodal.processing.BaseProcessingInfo.get_mm_max_tokens_per_item`
-to return the maximum number of placeholder feature tokens per input item for each modality.
+Then, inherit {class}`~vllm.multimodal.profiling.BaseDummyInputsBuilder` to construct dummy inputs for
+HF processing as well as memory profiling.

-When calling the model, the output embeddings from the visual encoder are assigned to the input positions
-containing placeholder feature tokens. Therefore, the number of placeholder feature tokens should be equal
-to the size of the output embeddings.
+### For memory profiling

-:::::{tab-set}
-::::{tab-item} Basic example: LLaVA
+Override the abstract method {meth}`~vllm.multimodal.profiling.BaseDummyInputsBuilder.get_dummy_processor_inputs`
+to construct dummy inputs for memory profiling. This dummy input should result in the worst-case memory usage of
+the model so that vLLM can reserve the correct amount of memory for it.
+
+Assuming that the memory usage increases with the number of tokens, the dummy input can be constructed to maximize the number of output embeddings, which is the same number as placeholder feature tokens.
+
+::::{tab-set}
+:::{tab-item} Basic example: LLaVA
 :sync: llava

 Looking at the code of HF's `LlavaForConditionalGeneration`:
@@ -240,41 +244,43 @@ def get_num_image_tokens(
 ```

 Notice that the number of image tokens doesn't depend on the image width and height.
-So, we can calculate the maximum number of image tokens using any image size:
+We can simply use a dummy `image_size`:

 ```python
 def get_image_size_with_most_features(self) -> ImageSize:
     hf_config = self.get_hf_config()
     width = height = hf_config.image_size
     return ImageSize(width=width, height=height)

-def get_max_image_tokens(self) -> int:
-    target_width, target_height = self.get_image_size_with_most_features()
-
-    return self.get_num_image_tokens(
-        image_width=target_width,
-        image_height=target_height,
-    )
-```
-
-And thus, we can override the method as:
-
-```python
-def get_mm_max_tokens_per_item(
+def get_dummy_processor_inputs(
     self,
     seq_len: int,
     mm_counts: Mapping[str, int],
-) -> Mapping[str, int]:
-    return {"image": self.get_max_image_tokens()}
+) -> ProcessorInputs:
+    num_images = mm_counts.get("image", 0)
+
+    processor = self.info.get_hf_processor()
+    image_token = processor.image_token
+
+    hf_config = self.get_hf_config()
+    target_width, target_height = self.info.get_image_size_with_most_features()
+
+    mm_data = {
+        "image":
+        self._get_dummy_images(width=target_width,
+                               height=target_height,
+                               num_images=num_images)
+    }
+
+    return ProcessorInputs(
+        prompt_text=image_token * num_images,
+        mm_data=mm_data,
+    )
 ```

-:::{note}
-Our [actual code](gh-file:vllm/model_executor/models/llava.py) is more abstracted to support vision encoders other than CLIP.
 :::

-::::
-
-::::{tab-item} Non-consecutive feature tokens: Fuyu
+:::{tab-item} No input placeholders: Fuyu
 :sync: fuyu

 Looking at the code of HF's `FuyuForCausalLM`:
@@ -394,188 +400,16 @@ num_patches_per_dim_w = image_width // patch_width
 num_patches = num_patches_per_dim_h * num_patches_per_dim_w
 ```

-We can calculate this in vLLM using this code:
-
-```python
-def get_num_image_patches(
-    self,
-    *,
-    image_width: int,
-    image_height: int,
-) -> int:
-    image_processor = self.get_image_processor()
-    target_width = image_processor.size["width"]
-    target_height = image_processor.size["height"]
-    patch_width = image_processor.patch_size["width"]
-    patch_height = image_processor.patch_size["height"]
-
-    if not (image_width <= target_width and image_height <= target_height):
-        height_scale_factor = target_height / image_height
-        width_scale_factor = target_width / image_width
-        optimal_scale_factor = min(height_scale_factor, width_scale_factor)
-
-        image_height = int(image_height * optimal_scale_factor)
-        image_width = int(image_width * optimal_scale_factor)
-
-    ncols = math.ceil(image_width / patch_width)
-    nrows = math.ceil(image_height / patch_height)
-    return ncols * nrows
-```
-
-These image patches correspond to placeholder tokens (`|SPEAKER|`). However, the processor also
-inserts newline tokens (`|NEWLINE|`) as shown here:
-
-```python
-# https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/image_processing_fuyu.py#L654-L670
-tensor_of_image_ids = torch.full(
-    [num_patches], image_placeholder_id, dtype=torch.int32, device=image_input.device
-)
-patches = self.patchify_image(image=image.unsqueeze(0)).squeeze(0)
-assert num_patches == patches.shape[0]
-
-if variable_sized:
-    # Now terminate each line with |NEWLINE|.
-    tensor_of_image_ids = tensor_of_image_ids.reshape(-1, image_width // patch_width)
-    newline_ids = torch.full(
-        [tensor_of_image_ids.shape[0], 1],
-        image_newline_id,
-        dtype=torch.int32,
-        device=image_input.device,
-    )
-    tensor_of_image_ids = torch.cat([tensor_of_image_ids, newline_ids], dim=1)
-    tensor_of_image_ids = tensor_of_image_ids.reshape(-1)
-```
-
-So, the layout of tokens for an image is:
-
-```
-|SPEAKER||SPEAKER|...|SPEAKER||NEWLINE|
-|SPEAKER||SPEAKER|...|SPEAKER||NEWLINE|
-...
-|SPEAKER||SPEAKER|...|SPEAKER||NEWLINE|
-```
-
-This makes the placeholder tokens non-consecutive in the prompt.
-Since vLLM requires the feature tokens to be consecutive, **we also treat the newline tokens as feature tokens**.
-
-So overall, the total number of feature tokens is
-
-```python
-def get_num_image_tokens(
-    self,
-    *,
-    image_width: int,
-    image_height: int,
-) -> int:
-    image_processor = self.get_image_processor()
-    target_width = image_processor.size["width"]
-    target_height = image_processor.size["height"]
-    patch_width = image_processor.patch_size["width"]
-    patch_height = image_processor.patch_size["height"]
-
-    if not (image_width <= target_width and image_height <= target_height):
-        height_scale_factor = target_height / image_height
-        width_scale_factor = target_width / image_width
-        optimal_scale_factor = min(height_scale_factor, width_scale_factor)
-
-        image_height = int(image_height * optimal_scale_factor)
-        image_width = int(image_width * optimal_scale_factor)
-
-    ncols = math.ceil(image_width / patch_width)
-    nrows = math.ceil(image_height / patch_height)
-    return (ncols + 1) * nrows
-```
-
-To calculate the maximum number of image tokens, recall that input images are first resized
-to fit within `image_processor.size`. The maximum possible dimensions of the image before
-being converted into patches is therefore equal to `image_processor.size`.
+These image patches correspond to placeholder tokens (`|SPEAKER|`). So, we just need to maximize the number of image patches. Since input images are first resized
+to fit within `image_processor.size`, we can maximize the number of image patches by inputting an image with size equal to `image_processor.size`.

 ```python
 def get_image_size_with_most_features(self) -> ImageSize:
     image_processor = self.get_image_processor()
     return ImageSize(width=image_processor.size["width"],
                      height=image_processor.size["height"])
-
-def get_max_image_tokens(self) -> int:
-    target_width, target_height = self.get_image_size_with_most_features()
-
-    return self.get_num_image_tokens(
-        image_width=target_width,
-        image_height=target_height,
-    )
-```
-
-And thus, we can override the method as:
-
-```python
-def get_mm_max_tokens_per_item(
-    self,
-    seq_len: int,
-    mm_counts: Mapping[str, int],
-) -> Mapping[str, int]:
-    return {"image": self.get_max_image_tokens()}
-```
-
-:::{note}
-Our [actual code](gh-file:vllm/model_executor/models/fuyu.py) returns `ncols` and `nrows` directly instead of the total token count.
-This is because `ncols` and `nrows` are used to specify the layout of the feature tokens (as shown in Step 4 of this guide).
-:::
-
-::::
-:::::
-
-## 3. Specify dummy inputs
-
-Then, inherit {class}`~vllm.multimodal.profiling.BaseDummyInputsBuilder` to construct dummy inputs for
-HF processing as well as memory profiling.
-
-### For memory profiling
-
-Override the abstract method {meth}`~vllm.multimodal.profiling.BaseDummyInputsBuilder.get_dummy_processor_inputs`
-to construct dummy inputs for memory profiling. This dummy input should result in the worst-case memory usage of
-the model so that vLLM can reserve the correct amount of memory for it.
-
-Assuming that the memory usage increases with the number of tokens, the dummy input can be constructed based
-on the code for {meth}`~vllm.multimodal.processing.BaseProcessingInfo.get_mm_max_tokens_per_item`.
-
-::::{tab-set}
-:::{tab-item} Basic example: LLaVA
-:sync: llava
-
-Making use of the `get_image_size_with_most_features` method implemented in Step 2:
-
-```python
-def get_dummy_processor_inputs(
-    self,
-    seq_len: int,
-    mm_counts: Mapping[str, int],
-) -> ProcessorInputs:
-    num_images = mm_counts.get("image", 0)
-
-    processor = self.info.get_hf_processor()
-    image_token = processor.image_token
-
-    hf_config = self.get_hf_config()
-    target_width, target_height = self.info.get_image_size_with_most_features()
-
-    mm_data = {
-        "image":
-        self._get_dummy_images(width=target_width,
-                               height=target_height,
-                               num_images=num_images)
-    }
-
-    return ProcessorInputs(
-        prompt_text=image_token * num_images,
-        mm_data=mm_data,
-    )
 ```

-:::
-
-:::{tab-item} No input placeholders: Fuyu
-:sync: fuyu
-
 Fuyu does not expect image placeholders in the inputs to HF processor, so
 the dummy prompt text is empty regardless of the number of images.
 Otherwise, the logic of this method is very similar to LLaVA:
tests/models/multimodal/processing/test_llama4.py

Lines changed: 0 additions & 5 deletions
@@ -76,11 +76,6 @@ def test_processor_override(
         if v == config.boi_token_index]

     # patch sizes and masks
-    patch_token_id = vocab[hf_processor.img_patch_token]
-    num_patches = processed_inputs["prompt_token_ids"].count(patch_token_id)
-    mm_counts = {"image": num_imgs}
-    assert num_patches / num_imgs <= \
-        processor.info.get_mm_max_tokens_per_item(32768, mm_counts)["image"]
     num_patches_per_chunk = processor.info.get_patch_per_chunk(
         config.vision_config)
     assert prompt_token_ids.count(config.image_token_index) \

vllm/model_executor/models/aria.py

Lines changed: 0 additions & 7 deletions
@@ -408,13 +408,6 @@ def get_hf_processor(self, **kwargs: object):
     def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
         return {"image": None}

-    def get_mm_max_tokens_per_item(
-        self,
-        seq_len: int,
-        mm_counts: Mapping[str, int],
-    ) -> Mapping[str, int]:
-        return {"image": self.get_num_image_tokens()}
-
     def get_num_image_tokens(self) -> int:
         hf_config = self.get_hf_config()
         return max(hf_config.projector_patch_to_query_dict.values())

vllm/model_executor/models/aya_vision.py

Lines changed: 0 additions & 25 deletions
@@ -117,31 +117,6 @@ def get_hf_processor(self, **kwargs: object) -> AyaVisionProcessor:
     def get_image_processor(self) -> GotOcr2ImageProcessor:
         return self.get_hf_processor().image_processor

-    def get_mm_max_tokens_per_item(
-        self,
-        seq_len: int,
-        mm_counts: Mapping[str, int],
-    ) -> Mapping[str, int]:
-        return {"image": self.get_max_image_tokens()}
-
-    def get_max_image_tokens(self) -> int:
-        hf_processor = self.get_hf_processor()
-        image_processor = hf_processor.image_processor
-
-        image_size = self.get_image_size_with_most_features()
-        num_patches = self.get_num_patches(
-            image_width=image_size.width,
-            image_height=image_size.height,
-            size=image_processor.size,
-            min_patches=image_processor.min_patches,
-            max_patches=image_processor.max_patches,
-        )
-
-        img_patches_per_tile = (hf_processor.img_size //
-                                hf_processor.patch_size)**2
-
-        return num_patches * img_patches_per_tile
-
     def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
         return {"image": None}

vllm/model_executor/models/blip2.py

Lines changed: 0 additions & 7 deletions
@@ -406,13 +406,6 @@ def get_hf_config(self):
     def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
         return {"image": 1}

-    def get_mm_max_tokens_per_item(
-        self,
-        seq_len: int,
-        mm_counts: Mapping[str, int],
-    ) -> Mapping[str, int]:
-        return {"image": self.get_num_image_tokens()}
-
     def get_num_image_tokens(self) -> int:
         hf_config = self.get_hf_config()
         return hf_config.num_query_tokens

vllm/model_executor/models/chameleon.py

Lines changed: 0 additions & 7 deletions
@@ -64,13 +64,6 @@ def get_hf_processor(self, **kwargs: object):
     def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
         return {"image": 1}

-    def get_mm_max_tokens_per_item(
-        self,
-        seq_len: int,
-        mm_counts: Mapping[str, int],
-    ) -> Mapping[str, int]:
-        return {"image": self.get_num_image_tokens()}
-
     def get_num_image_tokens(self) -> int:
         processor = self.get_hf_processor()
         return processor.image_seq_length

vllm/model_executor/models/clip.py

Lines changed: 0 additions & 3 deletions
@@ -30,9 +30,6 @@ def get_num_image_tokens(
     ) -> int:
         return self.get_patch_grid_length()**2 + 1

-    def get_max_image_tokens(self) -> int:
-        return self.get_patch_grid_length()**2 + 1
-
     def get_image_size(self) -> int:
         return self.vision_config.image_size
