Skip to content

Commit 4fd4b74

Browse files
authored
[Bugfix] Fix max image size for PaddleOCR-VL (#28442)
Signed-off-by: Roger Wang <hey@rogerw.io>
1 parent cc07976 commit 4fd4b74

File tree

1 file changed

+21
-15
lines changed

1 file changed

+21
-15
lines changed

vllm/model_executor/models/paddleocr_vl.py

Lines changed: 21 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -198,23 +198,18 @@ def get_num_image_tokens(
198198
if image_processor is None:
199199
image_processor = self.get_image_processor()
200200

201-
do_resize = True
202201
hf_config = self.get_hf_config()
203202
vision_config = hf_config.vision_config
204203
patch_size = vision_config.patch_size
205204
merge_size = vision_config.spatial_merge_size
206-
207-
if do_resize:
208-
resized_height, resized_width = smart_resize(
209-
height=image_height,
210-
width=image_width,
211-
factor=patch_size * merge_size,
212-
min_pixels=image_processor.min_pixels,
213-
max_pixels=image_processor.max_pixels,
214-
)
215-
preprocessed_size = ImageSize(width=resized_width, height=resized_height)
216-
else:
217-
preprocessed_size = ImageSize(width=image_width, height=image_height)
205+
resized_height, resized_width = smart_resize(
206+
height=image_height,
207+
width=image_width,
208+
factor=patch_size * merge_size,
209+
min_pixels=image_processor.min_pixels,
210+
max_pixels=image_processor.max_pixels,
211+
)
212+
preprocessed_size = ImageSize(width=resized_width, height=resized_height)
218213

219214
grid_t = 1
220215
grid_h = preprocessed_size.height // patch_size
@@ -227,8 +222,19 @@ def get_num_image_tokens(
227222

228223
def get_image_size_with_most_features(self) -> ImageSize:
229224
hf_config = self.get_hf_config()
230-
image_size = hf_config.vision_config.image_size
231-
return ImageSize(height=image_size, width=image_size)
225+
226+
# See `smart_resize` for the calculation of the image size.
227+
merge_size = hf_config.vision_config.spatial_merge_size
228+
patch_size = hf_config.vision_config.patch_size
229+
factor = merge_size * patch_size
230+
max_num_tokens = self.get_image_processor().max_pixels // (factor**2)
231+
# Find factors of max_num_tokens close to its square root
232+
# to create a dummy image with a reasonable aspect ratio.
233+
h_patches = int(math.sqrt(max_num_tokens))
234+
while max_num_tokens % h_patches != 0:
235+
h_patches -= 1
236+
w_patches = max_num_tokens // h_patches
237+
return ImageSize(height=h_patches * factor, width=w_patches * factor)
232238

233239

234240
class PaddleOCRVLDummyInputsBuilder(BaseDummyInputsBuilder[PaddleOCRVLProcessingInfo]):

0 commit comments

Comments
 (0)