Add default padding when multiple text inputs (batch size>1)
yonigozlan committed Oct 17, 2024
1 parent a6c8a5c commit 819e22c
Showing 1 changed file with 17 additions and 6 deletions.
23 changes: 17 additions & 6 deletions src/transformers/pipelines/image_text_to_text.py
@@ -323,10 +323,12 @@ def __call__(
                     if num_images_in_text != num_images_in_images:
                         raise ValueError(
                             f"The number of images in each nested image group should be the same as the number of {image_token} tokens in the corresponding prompt."
+                            f" Found {num_images_in_text} {image_token} tokens and {num_images_in_images} images."
                         )
                 elif sum(num_images_in_text) != len(images):
                     raise ValueError(
                         f"The total number of {image_token} tokens in the prompts should be the same as the number of images passed."
+                        f" Found {sum(num_images_in_text)} {image_token} tokens and {len(images)} images."
                     )
                 else:
                     # Reorganize the images to match the prompts
@@ -335,12 +337,18 @@ def __call__(
                         images_reorganized.append(images[:num_images])
                         images = images[num_images:]
                     images = images_reorganized
-            elif len(text) == 1 and len(images) > 1:
-                logger.warning(
-                    "The pipeline detected multiple images for one prompt, but no image tokens in the prompt. "
-                    "The prompt will be repeated for each image."
-                )
-                text = [text[0]] * len(images)
+            else:
+                if hasattr(self.processor, "image_token") and self.processor.image_token is not None:
+                    logger.warning(
+                        "The pipeline detected no image tokens in the prompt, but this model does support image tokens. "
+                        "Results may be suboptimal or unexpected."
+                    )
+                if len(text) == 1 and len(images) > 1:
+                    logger.warning(
+                        "The pipeline detected multiple images for one prompt, but no image tokens in the prompt. "
+                        "The prompt will be repeated for each image."
+                    )
+                    text = [text[0]] * len(images)
 
             # After reorganizing, these should be the same
             if len(text) > 1 and len(images) != len(text):
@@ -402,6 +410,9 @@ def preprocess(self, inputs=None, timeout=None, continue_final_message=None, pro
         else:
             images = [load_image(image, timeout=timeout) for image in images]
 
+        # if batched text inputs, we set padding to True unless specified otherwise
+        if isinstance(text, (list, tuple)) and len(text) > 1:
+            processing_kwargs.setdefault("padding", True)
         try:
             model_inputs = self.processor(
                 images=images, text=text, return_tensors=self.framework, **processing_kwargs
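For context, this change matters when the pipeline is called with more than one prompt at once: prompts of different lengths can now be batched because padding defaults to True, and since setdefault is used, an explicit padding value passed by the caller still takes precedence. A minimal sketch of such a batched call is shown below; the checkpoint name, image URLs, and the <image> placeholder token are illustrative assumptions, not part of this commit.

```python
from transformers import pipeline

# Illustrative checkpoint; any image-text-to-text model whose prompts use an image token
# should exercise the same code path.
pipe = pipeline("image-text-to-text", model="llava-hf/llava-interleave-qwen-0.5b-hf")

# Two images and two prompts of different lengths (batch size > 1).
images = [
    "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg",
    "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/bee.jpg",
]
prompts = [
    "<image> What animal is shown here?",
    "<image> Describe this image in one short sentence, mentioning the main colors you can see.",
]

# With this commit, padding=True is applied by default for the batched text inputs,
# so the processor can stack the uneven token sequences; pass padding=... explicitly to override.
outputs = pipe(images=images, text=prompts, max_new_tokens=20)
print(outputs)
```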
