diff --git a/src/transformers/pipelines/image_text_to_text.py b/src/transformers/pipelines/image_text_to_text.py
index daa0f5192f33d2..09ecc1e92ee4ec 100644
--- a/src/transformers/pipelines/image_text_to_text.py
+++ b/src/transformers/pipelines/image_text_to_text.py
@@ -287,9 +287,11 @@ def __call__(
                 chats = [Chat(chat, image) for chat, image in zip(text, images)]  # 🐈 🐈 🐈
                 return super().__call__(chats, **kwargs)
 
-        # If we are not in chat mode, we need both images and text
-        if images is None or text is None:
-            raise ValueError("You must provide both images and text when not using chat templates.")
+        # support text only generation
+        if images is None:
+            return super().__call__(text, **kwargs)
+        if text is None:
+            raise ValueError("You must provide text for this pipeline.")
 
         if not isinstance(images, (list, tuple)):
             images = [images]
@@ -386,28 +388,37 @@ def preprocess(self, inputs=None, timeout=None, continue_final_message=None, pro
             processing_kwargs["legacy"] = False
 
         processing_kwargs = {k: v for k, v in processing_kwargs.items() if v is not None}
-        images = inputs.images
-
-        if isinstance(inputs, Chat):
-            # If the user passes a chat that ends in an assistant message, we treat it as a prefill by default
-            # because very few models support multiple separate, consecutive assistant messages
-            if continue_final_message is None:
-                continue_final_message = inputs.messages[-1]["role"] == "assistant"
-            text = self.processor.apply_chat_template(
-                inputs.messages,
-                add_generation_prompt=not continue_final_message,
-                continue_final_message=continue_final_message,
-                return_tensors=self.framework,
-            )
+        # In case we only have text inputs
+        if isinstance(inputs, (list, tuple, str)):
+            images = None
+            text = inputs
             inputs_text = inputs
         else:
-            text = inputs.text
-            inputs_text = inputs.text
-
-        if not isinstance(images, (list, tuple)):
-            images = load_image(images, timeout=timeout)
-        else:
-            images = [load_image(image, timeout=timeout) for image in images]
+            # We have an ImageText or Chat inputs
+            images = inputs.images
+            if len(images) > 0:
+                if not isinstance(images, (list, tuple)):
+                    images = load_image(images, timeout=timeout)
+                else:
+                    images = [load_image(image, timeout=timeout) for image in images]
+            else:
+                images = None
+
+            if isinstance(inputs, Chat):
+                # If the user passes a chat that ends in an assistant message, we treat it as a prefill by default
+                # because very few models support multiple separate, consecutive assistant messages
+                if continue_final_message is None:
+                    continue_final_message = inputs.messages[-1]["role"] == "assistant"
+                text = self.processor.apply_chat_template(
+                    inputs.messages,
+                    add_generation_prompt=not continue_final_message,
+                    continue_final_message=continue_final_message,
+                    return_tensors=self.framework,
+                )
+                inputs_text = inputs
+            else:
+                text = inputs.text
+                inputs_text = inputs.text
 
         # if batched text inputs, we set padding to True unless specified otherwise
         if isinstance(text, (list, tuple)) and len(text) > 1:
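
For context, here is a minimal usage sketch of the behaviour this patch enables. The `pipeline("image-text-to-text", ...)` API is the one being modified; the checkpoint name and the image path are placeholders chosen for illustration, and any image-text-to-text model with a compatible processor should be usable in their place.

```python
from transformers import pipeline

# Placeholder checkpoint: substitute any image-text-to-text model with a processor.
pipe = pipeline("image-text-to-text", model="llava-hf/llava-interleave-qwen-0.5b-hf")

# New behaviour: with no `images` argument, the prompt is forwarded as plain text
# instead of raising "You must provide both images and text when not using chat templates."
print(pipe(text="What is the capital of France? Answer:", max_new_tokens=20))

# Existing behaviour is unchanged when both images and text are provided.
print(
    pipe(
        images="path/to/image.jpg",  # placeholder path or URL to an image
        text="Describe this image. Answer:",
        max_new_tokens=30,
    )
)
```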