Add default padding when multiple text inputs (batch size>1)
yonigozlan committed Oct 17, 2024
1 parent a6c8a5c commit 819e22c
Showing 1 changed file with 17 additions and 6 deletions.
23 changes: 17 additions & 6 deletions src/transformers/pipelines/image_text_to_text.py
@@ -323,10 +323,12 @@ def __call__(
                     if num_images_in_text != num_images_in_images:
                         raise ValueError(
                             f"The number of images in each nested image group should be the same as the number of {image_token} tokens in the corresponding prompt."
+                            f" Found {num_images_in_text} {image_token} tokens and {num_images_in_images} images."
                         )
                 elif sum(num_images_in_text) != len(images):
                     raise ValueError(
                         f"The total number of {image_token} tokens in the prompts should be the same as the number of images passed."
+                        f" Found {sum(num_images_in_text)} {image_token} tokens and {len(images)} images."
                     )
                 else:
                     # Reorganize the images to match the prompts
@@ -335,12 +337,18 @@ def __call__(
                         images_reorganized.append(images[:num_images])
                         images = images[num_images:]
                     images = images_reorganized
-            elif len(text) == 1 and len(images) > 1:
-                logger.warning(
-                    "The pipeline detected multiple images for one prompt, but no image tokens in the prompt. "
-                    "The prompt will be repeated for each image."
-                )
-                text = [text[0]] * len(images)
+            else:
+                if hasattr(self.processor, "image_token") and self.processor.image_token is not None:
+                    logger.warning(
+                        "The pipeline detected no image tokens in the prompt, but this model does support image tokens. "
+                        "Results may be suboptimal or unexpected."
+                    )
+                if len(text) == 1 and len(images) > 1:
+                    logger.warning(
+                        "The pipeline detected multiple images for one prompt, but no image tokens in the prompt. "
+                        "The prompt will be repeated for each image."
+                    )
+                    text = [text[0]] * len(images)
 
             # After reorganizing, these should be the same
             if len(text) > 1 and len(images) != len(text):
@@ -402,6 +410,9 @@ def preprocess(self, inputs=None, timeout=None, continue_final_message=None, pro
         else:
             images = [load_image(image, timeout=timeout) for image in images]
 
+        # if batched text inputs, we set padding to True unless specified otherwise
+        if isinstance(text, (list, tuple)) and len(text) > 1:
+            processing_kwargs.setdefault("padding", True)
         try:
             model_inputs = self.processor(
                 images=images, text=text, return_tensors=self.framework, **processing_kwargs
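For context, this change matters when the pipeline is called with more than one prompt at once: prompts of different lengths can now be batched because padding defaults to True, and since setdefault is used, an explicit padding value passed by the caller still takes precedence. A minimal sketch of such a batched call is shown below; the checkpoint name, image URLs, and the <image> placeholder token are illustrative assumptions, not part of this commit.

```python
from transformers import pipeline

# Illustrative checkpoint; any image-text-to-text model whose prompts use an image token
# should exercise the same code path.
pipe = pipeline("image-text-to-text", model="llava-hf/llava-interleave-qwen-0.5b-hf")

# Two images and two prompts of different lengths (batch size > 1).
images = [
    "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg",
    "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/bee.jpg",
]
prompts = [
    "<image> What animal is shown here?",
    "<image> Describe this image in one short sentence, mentioning the main colors you can see.",
]

# With this commit, padding=True is applied by default for the batched text inputs,
# so the processor can stack the uneven token sequences; pass padding=... explicitly to override.
outputs = pipe(images=images, text=prompts, max_new_tokens=20)
print(outputs)
```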
