huggingface · yonigozlan · Nov 4, 2025 · Oct 26, 2025 · Nov 4, 2025
diff --git a/src/transformers/models/idefics3/processing_idefics3.py b/src/transformers/models/idefics3/processing_idefics3.py
@@ -286,8 +286,8 @@ def __call__(
                         f"The number of images in the text {n_images_in_text} and images {n_images_in_images} should be the same."
                     )
 
-                image_rows = inputs.pop("rows", [[0] * len(text)])
-                image_cols = inputs.pop("cols", [[0] * len(text)])
+                image_rows = inputs.pop("rows", [[0] * n_images for n_images in n_images_in_text])
+                image_cols = inputs.pop("cols", [[0] * n_images for n_images in n_images_in_text])
 
                 fake_image_token = self.fake_image_token
                 image_token = self.image_token

diff --git a/src/transformers/models/smolvlm/processing_smolvlm.py b/src/transformers/models/smolvlm/processing_smolvlm.py
@@ -172,8 +172,6 @@ def __init__(
 
     def expand_text_with_image_tokens(self, text, image_rows, image_cols):
         prompt_strings = []
-        image_rows = image_rows if image_rows is not None else [[0] * len(text)]
-        image_cols = image_cols if image_cols is not None else [[0] * len(text)]
         for sample, sample_rows, sample_cols in zip(text, image_rows, image_cols):
             # Replace the image token with fake tokens around the expanded image token sequence of length `image_seq_len`
             image_prompt_strings = []
@@ -330,6 +328,11 @@ def __call__(
                     raise ValueError(
                         f"The number of images in the text {n_images_in_text} and images {n_images_in_images} should be the same."
                     )
+                # Default image_rows/image_cols to one 0 per image in each sample when not provided
+                if image_rows is None:
+                    image_rows = [[0] * n_images for n_images in n_images_in_text]
+                if image_cols is None:
+                    image_cols = [[0] * n_images for n_images in n_images_in_text]
                 text = self.expand_text_with_image_tokens(text, image_rows=image_rows, image_cols=image_cols)
 
         elif videos is not None: