Uniform kwargs for processors of audio-text models #32906

Draft · wants to merge 28 commits into main
Changes from 1 commit
Commits (28):
2f4163a  uniformize kwargs of Chameleon (leloykun, Aug 16, 2024)
2588144  fix linter nit (leloykun, Aug 16, 2024)
6454130  rm stride default (leloykun, Aug 16, 2024)
9949e72  add tests for chameleon processor (leloykun, Aug 16, 2024)
58c6b53  fix tests (leloykun, Aug 16, 2024)
6592ce3  fix chameleon tests (leloykun, Aug 16, 2024)
c4f5474  don't hardcode arg names (leloykun, Aug 16, 2024)
ce9cc73  uniformize processor kwargs of altclip, bridgetower, flava, instructb… (leloykun, Aug 17, 2024)
d325914  fix linter issue (leloykun, Aug 17, 2024)
935d6e5  address @zucchini-nlp's comments (leloykun, Aug 19, 2024)
39650f6  improve docs (leloykun, Aug 19, 2024)
539da9d  don't dw from hub for video tests (leloykun, Aug 19, 2024)
c8b2384  add video processing tests for instructblipvideo & video_llava (leloykun, Aug 19, 2024)
423d864  add git, mgp, tvp, & x-clip (leloykun, Aug 19, 2024)
5fd2c32  fix docs (leloykun, Aug 19, 2024)
9e00f68  address @zucchini-nlp's comments (leloykun, Aug 20, 2024)
a2672a6  simplify implementations (leloykun, Aug 20, 2024)
721d1c8  uniformize implementations of make_batched_videos and make_batched_im… (leloykun, Aug 20, 2024)
c0f3abb  fix instructblipvideo tests (leloykun, Aug 20, 2024)
bb5debd  fix copies (leloykun, Aug 20, 2024)
d9bc2e9  fix make_batched_videos (leloykun, Aug 20, 2024)
f6e7914  fix MGP-str (leloykun, Aug 20, 2024)
acd2c56  fix make_batched_videos (leloykun, Aug 20, 2024)
5c39f4f  fix make_batched_videos (leloykun, Aug 20, 2024)
ea06e45  fix make_batched_videos (leloykun, Aug 20, 2024)
44023bc  uniformize kwargs for audio-text processors (leloykun, Aug 21, 2024)
ea3d36e  add clap, clvp, musicgen melody, qwen2, & seamless m4t (leloykun, Aug 21, 2024)
3e46327  fix wav2vec2 bert & speecht5 (leloykun, Aug 21, 2024)
address @zucchini-nlp's comments
leloykun committed Aug 19, 2024
commit 935d6e51d470861484f0fd2ec1ea5e9a6982a6d2
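For orientation before the per-file diffs: every processor touched in this commit follows the same recipe. The `__call__` signature exposes all four modality slots, call-site kwargs are routed into per-modality buckets via `_merge_kwargs`, and the output is a `BatchFeature` cast with the shared `return_tensors`. Below is a minimal, self-contained sketch of the bucket-merging idea; it is a toy stand-in, not the actual `ProcessorMixin._merge_kwargs` implementation, and the default keys are made up for illustration.

from typing import Any, Dict

# Illustrative per-modality defaults, mirroring the `_defaults` dict that each
# ModelProcessorKwargs class in this PR declares (keys here are invented).
DEFAULTS: Dict[str, Dict[str, Any]] = {
    "text_kwargs": {"padding": False},
    "images_kwargs": {},
    "common_kwargs": {},
}

def merge_kwargs(**kwargs: Any) -> Dict[str, Dict[str, Any]]:
    """Toy stand-in for ProcessorMixin._merge_kwargs: call-site kwargs override
    the declared defaults of the bucket they belong to; kwargs meant for every
    backend call (e.g. return_tensors) land in common_kwargs."""
    merged = {bucket: dict(defaults) for bucket, defaults in DEFAULTS.items()}
    for key, value in kwargs.items():
        # route recognized keys to their modality bucket, the rest to common
        bucket = next((b for b, d in merged.items() if key in d), "common_kwargs")
        merged[bucket][key] = value
    return merged

output_kwargs = merge_kwargs(padding="max_length", return_tensors="pt")
# output_kwargs["text_kwargs"]   -> {"padding": "max_length"}
# output_kwargs["common_kwargs"] -> {"return_tensors": "pt"}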
16 changes: 3 additions & 13 deletions src/transformers/models/altclip/processing_altclip.py
@@ -22,7 +22,7 @@

from ...image_utils import ImageInput
from ...processing_utils import ProcessingKwargs, ProcessorMixin
-from ...tokenization_utils_base import BatchEncoding, PreTokenizedInput, TextInput
+from ...tokenization_utils_base import PreTokenizedInput, TextInput


if sys.version_info >= (3, 11):
@@ -94,16 +94,8 @@ def __call__(
The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
tensor. Both channels-first and channels-last formats are supported.

-return_tensors (`str` or [`~utils.TensorType`], *optional*):
-If set, will return tensors of a particular framework. Acceptable values are:
-
-- `'tf'`: Return TensorFlow `tf.constant` objects.
-- `'pt'`: Return PyTorch `torch.Tensor` objects.
-- `'np'`: Return NumPy `np.ndarray` objects.
-- `'jax'`: Return JAX `jnp.ndarray` objects.

Returns:
-[`BatchEncoding`]: A [`BatchEncoding`] with the following fields:
+[`BatchFeature`]: A [`BatchFeature`] with the following fields:

- **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
- **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
@@ -133,9 +125,7 @@ def __call__(
elif text is not None:
return encoding
else:
-return BatchEncoding(
-data=dict(**image_features), tensor_type=output_kwargs["common_kwargs"]["return_tensors"]
-)
+return image_features

def batch_decode(self, *args, **kwargs):
"""
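With the early return above, the output type now depends only on which modalities were passed: text plus images yields the tokenizer encoding with pixel_values attached, and an image-only call returns the image processor's BatchFeature directly. A usage sketch; the checkpoint name is the public AltCLIP one and is an assumption, not part of this diff:

import numpy as np
from PIL import Image
from transformers import AltCLIPProcessor

# Assumed public checkpoint; any AltCLIP checkpoint with a processor works.
processor = AltCLIPProcessor.from_pretrained("BAAI/AltCLIP")
image = Image.fromarray(np.zeros((224, 224, 3), dtype=np.uint8))

# text + images: tokenizer output with pixel_values attached
inputs = processor(images=image, text=["a photo"], return_tensors="pt")
print(sorted(inputs.keys()))  # e.g. ['attention_mask', 'input_ids', 'pixel_values']

# images only: the image processor's BatchFeature is returned as-is
image_only = processor(images=image, return_tensors="pt")
print(type(image_only).__name__)  # BatchFeature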
10 changes: 4 additions & 6 deletions src/transformers/models/chameleon/processing_chameleon.py
@@ -114,7 +114,7 @@ def __call__(
elif not isinstance(text, list) and not isinstance(text[0], str):
raise TypeError("Invalid input text. Please provide a string, or a list of strings")
if text is None and images is None:
raise ValueError("You must provide either text or images")
raise ValueError("You must provide either text or images as prompt")

output_kwargs = self._merge_kwargs(
ChameleonProcessorKwargs,
@@ -132,12 +132,10 @@ def __call__(
sample += self.tokenizer.sep_token # special Chameleon treatment to add sep for chat mode
prompt_strings.append(sample)

-data = self.tokenizer(prompt_strings, **output_kwargs["text_kwargs"])
-
+features = self.tokenizer(prompt_strings, **output_kwargs["text_kwargs"])
if images is not None:
-data["pixel_values"] = self.image_processor(images, **output_kwargs["images_kwargs"])["pixel_values"]
-
-return BatchFeature(data=data, tensor_type=output_kwargs["common_kwargs"]["return_tensors"])
+features["pixel_values"] = self.image_processor(images, **output_kwargs["images_kwargs"])["pixel_values"]
+return features

# Copied from transformers.models.clip.processing_clip.CLIPProcessor.batch_decode with CLIP->Llama
def batch_decode(self, *args, **kwargs):
8 changes: 4 additions & 4 deletions src/transformers/models/flava/processing_flava.py
@@ -22,7 +22,7 @@

from ...image_utils import ImageInput
from ...processing_utils import ImagesKwargs, ProcessingKwargs, ProcessorMixin
-from ...tokenization_utils_base import BatchEncoding, PreTokenizedInput, TextInput
+from ...tokenization_utils_base import PreTokenizedInput, TextInput


if sys.version_info >= (3, 11):
@@ -92,6 +92,8 @@ def __call__(
self,
images: Optional[ImageInput] = None,
text: Optional[Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]] = None,
+audio=None,
+videos=None,
**kwargs: Unpack[FlavaProcessorKwargs],
):
"""
@@ -121,9 +123,7 @@
elif text is not None:
return encoding
else:
-return BatchEncoding(
-data=dict(**image_features), tensor_type=output_kwargs["common_kwargs"]["return_tensors"]
-)
+return image_features

def batch_decode(self, *args, **kwargs):
"""
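Note the `audio=None, videos=None` placeholders added to FLAVA, a model that supports neither modality: part of the uniformization is that every processor exposes all four modality arguments, so generic code can pass any subset by keyword without TypeErrors. A sketch of the idea; this class is illustrative only, not the actual FLAVA code:

# Illustrative sketch: unsupported modality slots exist but are unused, so a
# generic caller can always pass modalities by keyword.
class UniformProcessor:
    def __call__(self, images=None, text=None, audio=None, videos=None, **kwargs):
        if audio is not None or videos is not None:
            raise ValueError("this processor only supports images and text")
        return {"processed": (images, text)}

UniformProcessor()(images=None, text="hello")  # ok; audio/videos simply unused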
src/transformers/models/instructblipvideo/processing_instructblipvideo.py
@@ -20,15 +20,10 @@
import sys
from typing import List, Optional, Union

-from ...image_processing_utils import BatchFeature
+from ...feature_extraction_utils import BatchFeature
from ...image_utils import VideoInput
from ...processing_utils import ProcessingKwargs, ProcessorMixin
-from ...tokenization_utils_base import (
-AddedToken,
-BatchEncoding,
-PreTokenizedInput,
-TextInput,
-)
+from ...tokenization_utils_base import AddedToken, PreTokenizedInput, TextInput
from ...utils import logging
from ..auto import AutoTokenizer

@@ -103,7 +98,26 @@ def __call__(
This method uses [`InstructBlipVideoImageProcessor.__call__`] method to prepare image(s) or video(s) for the model, and
[`BertTokenizerFast.__call__`] to prepare text for the model.

-Please refer to the docstring of the above two methods for more information.
+Args:
+text (`TextInput`, `PreTokenizedInput`, `List[TextInput]`, `List[PreTokenizedInput]`):
+The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
+(pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
+`is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
+images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
+The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
+tensor. In case of a NumPy array/PyTorch tensor, each image should be of shape (C, H, W), where C is a
+number of channels, H and W are image height and width.
+
+Returns:
+[`BatchFeature`]: A [`BatchFeature`] with the following fields:
+
+- **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
+- **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
+`return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
+`None`).
+- **qformer_input_ids** -- List of token ids from the Q-Former tokenizer to be fed to a model. Returned when `text` is not `None`.
+- **qformer_attention_mask** -- List of indices specifying which tokens from the Q-Former tokenizer should be attended to by the model. Returned when `text` is not `None`.
+- **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
"""
output_kwargs = self._merge_kwargs(
InstructBlipVideoProcessorKwargs,
@@ -150,7 +164,7 @@ def __call__(
)

# cast to desired return tensors type after concatenating
-text_encoding = BatchEncoding(
+text_encoding = BatchFeature(
text_encoding, tensor_type=output_kwargs["common_kwargs"].get("return_tensors")
)
encoding.update(text_encoding)
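The `BatchEncoding` to `BatchFeature` swap in the final cast keeps the return type consistent with the other processors; like `BatchEncoding`, `BatchFeature` converts plain Python lists to framework tensors at wrap time, which is what the concatenated q-former and text ids above rely on. A small standalone demonstration of that casting behavior, assuming PyTorch is installed:

from transformers import BatchFeature

# Lists collected during concatenation are converted once, at wrap time.
enc = BatchFeature({"input_ids": [[101, 7592, 102]]}, tensor_type="pt")
print(type(enc["input_ids"]))  # <class 'torch.Tensor'>

# tensor_type=None (the default when return_tensors is unset) leaves lists alone.
raw = BatchFeature({"input_ids": [[101, 7592, 102]]}, tensor_type=None)
print(type(raw["input_ids"]))  # <class 'list'>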
src/transformers/models/llava_next_video/processing_llava_next_video.py
@@ -139,7 +139,8 @@ def __call__(
- **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
`return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
`None`).
-- **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
+- **pixel_values_images** -- Pixel values of images to be fed to a model. Returned when `images` is not `None`.
+- **pixel_values_videos** -- Pixel values of videos to be fed to a model. Returned when `videos` is not `None`.
"""
output_kwargs = self._merge_kwargs(
LlavaNextVideoProcessorKwargs,
@@ -162,8 +163,6 @@ def __call__(
elif not isinstance(text, list) and not isinstance(text[0], str):
raise ValueError("Invalid input text. Please provide a string, or a list of strings")

-print(self.patch_size, self.vision_feature_select_strategy, image_inputs, videos_inputs.keys())
-
if self.patch_size is None or self.vision_feature_select_strategy is None:
prompt_strings = text
logger.warning_once(
@@ -207,7 +206,10 @@ def __call__(

text_inputs = self.tokenizer(prompt_strings, **output_kwargs["text_kwargs"])

-return BatchFeature(data={**text_inputs, **image_inputs, **videos_inputs})
+return BatchFeature(
+data={**text_inputs, **image_inputs, **videos_inputs},
+tensor_type=output_kwargs["common_kwargs"].get("return_tensors"),
+)

# Copied from transformers.models.clip.processing_clip.CLIPProcessor.batch_decode with CLIP->Llama
def batch_decode(self, *args, **kwargs):
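Two fixes in this file: the stray debug print is dropped, and return_tensors is finally honored by routing common_kwargs into the final BatchFeature. The defensive `.get(...)` means a caller who never passed return_tensors gets None, which skips conversion. A toy illustration of the final merge-and-cast step, with stand-in values instead of real tensors:

from transformers import BatchFeature

# The three modality outputs are plain dicts; one BatchFeature at the end
# casts them together (toy values stand in for real model inputs).
text_inputs = {"input_ids": [[1, 2, 3]]}
image_inputs = {"pixel_values": [[[0.0]]]}
videos_inputs = {"pixel_values_videos": [[[0.0]]]}
out = BatchFeature(
    data={**text_inputs, **image_inputs, **videos_inputs},
    tensor_type="np",  # e.g. return_tensors="np"; None would leave lists as-is
)
print(sorted(out.keys()))  # ['input_ids', 'pixel_values', 'pixel_values_videos']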
6 changes: 3 additions & 3 deletions src/transformers/models/siglip/processing_siglip.py
@@ -67,6 +67,8 @@ def __call__(
self,
text: Optional[Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]] = None,
images: Optional[ImageInput] = None,
+audio=None,
+videos=None,
**kwargs: Unpack[SiglipProcessingKwargs],
) -> BatchFeature:
"""
@@ -116,9 +118,7 @@
elif text is not None:
return encoding
else:
-return BatchFeature(
-data=dict(**image_features), tensor_type=output_kwargs["common_kwargs"]["return_tensors"]
-)
+return image_features

def decode(self, *args, **kwargs):
"""
6 changes: 3 additions & 3 deletions src/transformers/models/video_llava/processing_video_llava.py
@@ -20,7 +20,7 @@
from typing import List, Optional, Union

from ...feature_extraction_utils import BatchFeature
-from ...image_utils import ImageInput, get_image_size, to_numpy_array
+from ...image_utils import ImageInput, VideoInput, get_image_size, to_numpy_array
from ...processing_utils import ProcessingKwargs, ProcessorMixin
from ...tokenization_utils_base import PreTokenizedInput, TextInput
from ...utils import logging
@@ -97,7 +97,7 @@ def __call__(
self,
text: Optional[Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]] = None,
images: Optional[ImageInput] = None,
-videos: Optional[ImageInput] = None,
+videos: Optional[VideoInput] = None,
audio=None,
**kwargs: Unpack[VideoLlavaProcessorKwargs],
) -> BatchFeature:
@@ -181,7 +181,7 @@ def __call__(
text_inputs = self.tokenizer(prompt_strings, **output_kwargs["text_kwargs"])
data.update(text_inputs)

-return BatchFeature(data=data)
+return BatchFeature(data=data, tensor_type=output_kwargs["common_kwargs"].get("return_tensors"))

# Copied from transformers.models.clip.processing_clip.CLIPProcessor.batch_decode with CLIP->Llama
def batch_decode(self, *args, **kwargs):
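The annotation fix here is small but meaningful: `videos` was previously typed as `ImageInput`. Under `image_utils`, a video input is a frame sequence (or a batch of them). A minimal sketch of the shape involved, with an assumed 8-frame clip of 224x224 RGB:

import numpy as np

# One video as a sequence of HxWxC frames; a batch is a list of such sequences.
video = [np.zeros((224, 224, 3), dtype=np.uint8) for _ in range(8)]
batch_of_videos = [video]  # e.g. processor(videos=batch_of_videos, ...)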
26 changes: 23 additions & 3 deletions src/transformers/models/vilt/processing_vilt.py
@@ -20,9 +20,10 @@
import warnings
from typing import List, Optional, Union

+from ...feature_extraction_utils import BatchFeature
from ...image_utils import ImageInput
from ...processing_utils import ProcessingKwargs, ProcessorMixin
-from ...tokenization_utils_base import BatchEncoding, PreTokenizedInput, TextInput
+from ...tokenization_utils_base import PreTokenizedInput, TextInput


if sys.version_info >= (3, 11):
@@ -87,13 +88,32 @@ def __call__(
self,
images: ImageInput,
text: Optional[Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]] = None,
+audio=None,
+videos=None,
**kwargs: Unpack[ViltProcessorKwargs],
-) -> BatchEncoding:
+) -> BatchFeature:
"""
This method uses [`ViltImageProcessor.__call__`] method to prepare image(s) for the model, and
[`BertTokenizerFast.__call__`] to prepare text for the model.

-Please refer to the docstring of the above two methods for more information.
+Args:
+text (`TextInput`, `PreTokenizedInput`, `List[TextInput]`, `List[PreTokenizedInput]`):
+The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
+(pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
+`is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
+images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
+The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
+tensor. In case of a NumPy array/PyTorch tensor, each image should be of shape (C, H, W), where C is a
+number of channels, H and W are image height and width.
+
+Returns:
+[`BatchFeature`]: A [`BatchFeature`] with the following fields:
+
+- **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
+- **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
+`return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
+`None`).
+- **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
"""
output_kwargs = self._merge_kwargs(
ViltProcessorKwargs,
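End to end, the ViLT change mirrors AltCLIP and FLAVA: an explicit Args/Returns docstring, placeholder audio/videos slots, and a `BatchFeature` return annotation. A usage sketch against a public checkpoint; the checkpoint name is an assumption, not part of this diff:

import numpy as np
from PIL import Image
from transformers import ViltProcessor

processor = ViltProcessor.from_pretrained("dandelin/vilt-b32-finetuned-vqa")  # assumed checkpoint
image = Image.fromarray(np.zeros((384, 384, 3), dtype=np.uint8))
inputs = processor(images=image, text="How many cats are there?", return_tensors="pt")
print(sorted(inputs.keys()))
# e.g. ['attention_mask', 'input_ids', 'pixel_mask', 'pixel_values', 'token_type_ids']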