@@ -198,23 +198,18 @@ def get_num_image_tokens(
198198 if image_processor is None :
199199 image_processor = self .get_image_processor ()
200200
201- do_resize = True
202201 hf_config = self .get_hf_config ()
203202 vision_config = hf_config .vision_config
204203 patch_size = vision_config .patch_size
205204 merge_size = vision_config .spatial_merge_size
206-
207- if do_resize :
208- resized_height , resized_width = smart_resize (
209- height = image_height ,
210- width = image_width ,
211- factor = patch_size * merge_size ,
212- min_pixels = image_processor .min_pixels ,
213- max_pixels = image_processor .max_pixels ,
214- )
215- preprocessed_size = ImageSize (width = resized_width , height = resized_height )
216- else :
217- preprocessed_size = ImageSize (width = image_width , height = image_height )
205+ resized_height , resized_width = smart_resize (
206+ height = image_height ,
207+ width = image_width ,
208+ factor = patch_size * merge_size ,
209+ min_pixels = image_processor .min_pixels ,
210+ max_pixels = image_processor .max_pixels ,
211+ )
212+ preprocessed_size = ImageSize (width = resized_width , height = resized_height )
218213
219214 grid_t = 1
220215 grid_h = preprocessed_size .height // patch_size
@@ -227,8 +222,19 @@ def get_num_image_tokens(
227222
def get_image_size_with_most_features(self) -> ImageSize:
    """Return a dummy image size that uses the full image-token budget.

    The budget is derived from the image processor's ``max_pixels``: one
    merged token covers a ``factor x factor`` pixel square, where
    ``factor = spatial_merge_size * patch_size``. The returned size is a
    multiple of ``factor`` in both dimensions and its area is
    ``max_num_tokens * factor ** 2``, chosen as close to square as an exact
    factorisation of the token budget allows.

    Returns:
        ImageSize: height and width in pixels, both multiples of ``factor``.
    """
    vision_config = self.get_hf_config().vision_config

    # See `smart_resize` for the calculation of the image size.
    factor = vision_config.spatial_merge_size * vision_config.patch_size
    max_pixels = self.get_image_processor().max_pixels

    # Clamp to at least one token: if `max_pixels` is smaller than a single
    # `factor x factor` tile, the floor division yields 0 and the divisor
    # search below would hit `0 % 0` (ZeroDivisionError). A single tile is
    # the smallest grid this method can describe.
    max_num_tokens = max(1, int(max_pixels // (factor ** 2)))

    # Find factors of max_num_tokens close to its square root to create a
    # dummy image with a reasonable aspect ratio. `math.isqrt` is exact for
    # integers, unlike `int(math.sqrt(...))`, which goes through a float.
    # The loop always terminates because 1 divides every integer.
    # NOTE(review): when max_num_tokens is prime the only factorisation is
    # 1 x max_num_tokens, i.e. a very elongated image -- confirm that
    # `smart_resize` accepts such an aspect ratio.
    h_patches = math.isqrt(max_num_tokens)
    while max_num_tokens % h_patches != 0:
        h_patches -= 1
    w_patches = max_num_tokens // h_patches

    return ImageSize(height=h_patches * factor, width=w_patches * factor)
232238
233239
234240class PaddleOCRVLDummyInputsBuilder (BaseDummyInputsBuilder [PaddleOCRVLProcessingInfo ]):
0 commit comments