Uniform kwargs for processors of audio-text models #32906

Draft · wants to merge 28 commits into main
Changes from 1 commit
Commits (28):
2f4163a  uniformize kwargs of Chameleon (leloykun, Aug 16, 2024)
2588144  fix linter nit (leloykun, Aug 16, 2024)
6454130  rm stride default (leloykun, Aug 16, 2024)
9949e72  add tests for chameleon processor (leloykun, Aug 16, 2024)
58c6b53  fix tests (leloykun, Aug 16, 2024)
6592ce3  fix chameleon tests (leloykun, Aug 16, 2024)
c4f5474  don't hardcode arg names (leloykun, Aug 16, 2024)
ce9cc73  uniformize processor kwargs of altclip, bridgetower, flava, instructb… (leloykun, Aug 17, 2024)
d325914  fix linter issue (leloykun, Aug 17, 2024)
935d6e5  address @zucchini-nlp's comments (leloykun, Aug 19, 2024)
39650f6  improve docs (leloykun, Aug 19, 2024)
539da9d  don't dw from hub for video tests (leloykun, Aug 19, 2024)
c8b2384  add video processing tests for instructblipvideo & video_llava (leloykun, Aug 19, 2024)
423d864  add git, mgp, tvp, & x-clip (leloykun, Aug 19, 2024)
5fd2c32  fix docs (leloykun, Aug 19, 2024)
9e00f68  address @zucchini-nlp's comments (leloykun, Aug 20, 2024)
a2672a6  simplify implementations (leloykun, Aug 20, 2024)
721d1c8  uniformize implementations of make_batched_videos and make_batched_im… (leloykun, Aug 20, 2024)
c0f3abb  fix instructblipvideo tests (leloykun, Aug 20, 2024)
bb5debd  fix copies (leloykun, Aug 20, 2024)
d9bc2e9  fix make_batched_videos (leloykun, Aug 20, 2024)
f6e7914  fix MGP-str (leloykun, Aug 20, 2024)
acd2c56  fix make_batched_videos (leloykun, Aug 20, 2024)
5c39f4f  fix make_batched_videos (leloykun, Aug 20, 2024)
ea06e45  fix make_batched_videos (leloykun, Aug 20, 2024)
44023bc  uniformize kwargs for audio-text processors (leloykun, Aug 21, 2024)
ea3d36e  add clap, clvp, musicgen melody, qwen2, & seamless m4t (leloykun, Aug 21, 2024)
3e46327  fix wav2vec2 bert & speecht5 (leloykun, Aug 21, 2024)
address @zucchini-nlp's comments
leloykun committed Aug 19, 2024
commit 935d6e51d470861484f0fd2ec1ea5e9a6982a6d2
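For orientation before the per-file diffs: every processor touched in this commit follows the same recipe. The `__call__` signature exposes all four modality slots, call-site kwargs are routed into per-modality buckets via `_merge_kwargs`, and the output is a `BatchFeature` cast with the shared `return_tensors`. Below is a minimal, self-contained sketch of the bucket-merging idea; it is a toy stand-in, not the actual `ProcessorMixin._merge_kwargs` implementation, and the default keys are made up for illustration.

from typing import Any, Dict

# Illustrative per-modality defaults, mirroring the `_defaults` dict that each
# ModelProcessorKwargs class in this PR declares (keys here are invented).
DEFAULTS: Dict[str, Dict[str, Any]] = {
    "text_kwargs": {"padding": False},
    "images_kwargs": {},
    "common_kwargs": {},
}

def merge_kwargs(**kwargs: Any) -> Dict[str, Dict[str, Any]]:
    """Toy stand-in for ProcessorMixin._merge_kwargs: call-site kwargs override
    the declared defaults of the bucket they belong to; kwargs meant for every
    backend call (e.g. return_tensors) land in common_kwargs."""
    merged = {bucket: dict(defaults) for bucket, defaults in DEFAULTS.items()}
    for key, value in kwargs.items():
        # route recognized keys to their modality bucket, the rest to common
        bucket = next((b for b, d in merged.items() if key in d), "common_kwargs")
        merged[bucket][key] = value
    return merged

output_kwargs = merge_kwargs(padding="max_length", return_tensors="pt")
# output_kwargs["text_kwargs"]   -> {"padding": "max_length"}
# output_kwargs["common_kwargs"] -> {"return_tensors": "pt"}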
16 changes: 3 additions & 13 deletions src/transformers/models/altclip/processing_altclip.py
@@ -22,7 +22,7 @@

from ...image_utils import ImageInput
from ...processing_utils import ProcessingKwargs, ProcessorMixin
-from ...tokenization_utils_base import BatchEncoding, PreTokenizedInput, TextInput
+from ...tokenization_utils_base import PreTokenizedInput, TextInput


if sys.version_info >= (3, 11):
@@ -94,16 +94,8 @@ def __call__(
The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
tensor. Both channels-first and channels-last formats are supported.

-return_tensors (`str` or [`~utils.TensorType`], *optional*):
-If set, will return tensors of a particular framework. Acceptable values are:
-
-- `'tf'`: Return TensorFlow `tf.constant` objects.
-- `'pt'`: Return PyTorch `torch.Tensor` objects.
-- `'np'`: Return NumPy `np.ndarray` objects.
-- `'jax'`: Return JAX `jnp.ndarray` objects.

Returns:
-[`BatchEncoding`]: A [`BatchEncoding`] with the following fields:
+[`BatchFeature`]: A [`BatchFeature`] with the following fields:

- **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
- **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
@@ -133,9 +125,7 @@ def __call__(
elif text is not None:
return encoding
else:
-return BatchEncoding(
-data=dict(**image_features), tensor_type=output_kwargs["common_kwargs"]["return_tensors"]
-)
+return image_features

def batch_decode(self, *args, **kwargs):
"""
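With the early return above, the output type now depends only on which modalities were passed: text plus images yields the tokenizer encoding with pixel_values attached, and an image-only call returns the image processor's BatchFeature directly. A usage sketch; the checkpoint name is the public AltCLIP one and is an assumption, not part of this diff:

import numpy as np
from PIL import Image
from transformers import AltCLIPProcessor

# Assumed public checkpoint; any AltCLIP checkpoint with a processor works.
processor = AltCLIPProcessor.from_pretrained("BAAI/AltCLIP")
image = Image.fromarray(np.zeros((224, 224, 3), dtype=np.uint8))

# text + images: tokenizer output with pixel_values attached
inputs = processor(images=image, text=["a photo"], return_tensors="pt")
print(sorted(inputs.keys()))  # e.g. ['attention_mask', 'input_ids', 'pixel_values']

# images only: the image processor's BatchFeature is returned as-is
image_only = processor(images=image, return_tensors="pt")
print(type(image_only).__name__)  # BatchFeature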
10 changes: 4 additions & 6 deletions src/transformers/models/chameleon/processing_chameleon.py
@@ -114,7 +114,7 @@ def __call__(
elif not isinstance(text, list) and not isinstance(text[0], str):
raise TypeError("Invalid input text. Please provide a string, or a list of strings")
if text is None and images is None:
raise ValueError("You must provide either text or images")
raise ValueError("You must provide either text or images as prompt")

output_kwargs = self._merge_kwargs(
ChameleonProcessorKwargs,
@@ -132,12 +132,10 @@ def __call__(
sample += self.tokenizer.sep_token # special Chameleon treatment to add sep for chat mode
prompt_strings.append(sample)

-data = self.tokenizer(prompt_strings, **output_kwargs["text_kwargs"])
-
+features = self.tokenizer(prompt_strings, **output_kwargs["text_kwargs"])
if images is not None:
-data["pixel_values"] = self.image_processor(images, **output_kwargs["images_kwargs"])["pixel_values"]
-
-return BatchFeature(data=data, tensor_type=output_kwargs["common_kwargs"]["return_tensors"])
+features["pixel_values"] = self.image_processor(images, **output_kwargs["images_kwargs"])["pixel_values"]
+return features

# Copied from transformers.models.clip.processing_clip.CLIPProcessor.batch_decode with CLIP->Llama
def batch_decode(self, *args, **kwargs):
8 changes: 4 additions & 4 deletions src/transformers/models/flava/processing_flava.py
@@ -22,7 +22,7 @@

from ...image_utils import ImageInput
from ...processing_utils import ImagesKwargs, ProcessingKwargs, ProcessorMixin
-from ...tokenization_utils_base import BatchEncoding, PreTokenizedInput, TextInput
+from ...tokenization_utils_base import PreTokenizedInput, TextInput


if sys.version_info >= (3, 11):
@@ -92,6 +92,8 @@ def __call__(
self,
images: Optional[ImageInput] = None,
text: Optional[Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]] = None,
+audio=None,
+videos=None,
**kwargs: Unpack[FlavaProcessorKwargs],
):
"""
@@ -121,9 +123,7 @@
elif text is not None:
return encoding
else:
-return BatchEncoding(
-data=dict(**image_features), tensor_type=output_kwargs["common_kwargs"]["return_tensors"]
-)
+return image_features

def batch_decode(self, *args, **kwargs):
"""
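Note the `audio=None, videos=None` placeholders added to FLAVA, a model that supports neither modality: part of the uniformization is that every processor exposes all four modality arguments, so generic code can pass any subset by keyword without TypeErrors. A sketch of the idea; this class is illustrative only, not the actual FLAVA code:

# Illustrative sketch: unsupported modality slots exist but are unused, so a
# generic caller can always pass modalities by keyword.
class UniformProcessor:
    def __call__(self, images=None, text=None, audio=None, videos=None, **kwargs):
        if audio is not None or videos is not None:
            raise ValueError("this processor only supports images and text")
        return {"processed": (images, text)}

UniformProcessor()(images=None, text="hello")  # ok; audio/videos simply unused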
src/transformers/models/instructblipvideo/processing_instructblipvideo.py
@@ -20,15 +20,10 @@
import sys
from typing import List, Optional, Union

-from ...image_processing_utils import BatchFeature
+from ...feature_extraction_utils import BatchFeature
from ...image_utils import VideoInput
from ...processing_utils import ProcessingKwargs, ProcessorMixin
-from ...tokenization_utils_base import (
-AddedToken,
-BatchEncoding,
-PreTokenizedInput,
-TextInput,
-)
+from ...tokenization_utils_base import AddedToken, PreTokenizedInput, TextInput
from ...utils import logging
from ..auto import AutoTokenizer

@@ -103,7 +98,26 @@ def __call__(
This method uses [`InstructBlipVideoImageProcessor.__call__`] method to prepare image(s) or video(s) for the model, and
[`BertTokenizerFast.__call__`] to prepare text for the model.

-Please refer to the docstring of the above two methods for more information.
+Args:
+text (`TextInput`, `PreTokenizedInput`, `List[TextInput]`, `List[PreTokenizedInput]`):
+The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
+(pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
+`is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
+images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
+The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
+tensor. In case of a NumPy array/PyTorch tensor, each image should be of shape (C, H, W), where C is a
+number of channels, H and W are image height and width.
+
+Returns:
+[`BatchFeature`]: A [`BatchFeature`] with the following fields:
+
+- **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
+- **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
+`return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
+`None`).
+- **qformer_input_ids** -- List of token ids from the Q-Former tokenizer to be fed to a model. Returned when `text` is not `None`.
+- **qformer_attention_mask** -- List of indices specifying which tokens from the Q-Former tokenizer should be attended to by the model. Returned when `text` is not `None`.
+- **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
"""
output_kwargs = self._merge_kwargs(
InstructBlipVideoProcessorKwargs,
@@ -150,7 +164,7 @@ def __call__(
)

# cast to desired return tensors type after concatenating
-text_encoding = BatchEncoding(
+text_encoding = BatchFeature(
text_encoding, tensor_type=output_kwargs["common_kwargs"].get("return_tensors")
)
encoding.update(text_encoding)
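The `BatchEncoding` to `BatchFeature` swap in the final cast keeps the return type consistent with the other processors; like `BatchEncoding`, `BatchFeature` converts plain Python lists to framework tensors at wrap time, which is what the concatenated q-former and text ids above rely on. A small standalone demonstration of that casting behavior, assuming PyTorch is installed:

from transformers import BatchFeature

# Lists collected during concatenation are converted once, at wrap time.
enc = BatchFeature({"input_ids": [[101, 7592, 102]]}, tensor_type="pt")
print(type(enc["input_ids"]))  # <class 'torch.Tensor'>

# tensor_type=None (the default when return_tensors is unset) leaves lists alone.
raw = BatchFeature({"input_ids": [[101, 7592, 102]]}, tensor_type=None)
print(type(raw["input_ids"]))  # <class 'list'>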
src/transformers/models/llava_next_video/processing_llava_next_video.py
@@ -139,7 +139,8 @@ def __call__(
- **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
`return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
`None`).
-- **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
+- **pixel_values_images** -- Pixel values of images to be fed to a model. Returned when `images` is not `None`.
+- **pixel_values_videos** -- Pixel values of videos to be fed to a model. Returned when `videos` is not `None`.
"""
output_kwargs = self._merge_kwargs(
LlavaNextVideoProcessorKwargs,
@@ -162,8 +163,6 @@ def __call__(
elif not isinstance(text, list) and not isinstance(text[0], str):
raise ValueError("Invalid input text. Please provide a string, or a list of strings")

-print(self.patch_size, self.vision_feature_select_strategy, image_inputs, videos_inputs.keys())
-
if self.patch_size is None or self.vision_feature_select_strategy is None:
prompt_strings = text
logger.warning_once(
@@ -207,7 +206,10 @@ def __call__(

text_inputs = self.tokenizer(prompt_strings, **output_kwargs["text_kwargs"])

-return BatchFeature(data={**text_inputs, **image_inputs, **videos_inputs})
+return BatchFeature(
+data={**text_inputs, **image_inputs, **videos_inputs},
+tensor_type=output_kwargs["common_kwargs"].get("return_tensors"),
+)

# Copied from transformers.models.clip.processing_clip.CLIPProcessor.batch_decode with CLIP->Llama
def batch_decode(self, *args, **kwargs):
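Two fixes in this file: the stray debug print is dropped, and return_tensors is finally honored by routing common_kwargs into the final BatchFeature. The defensive `.get(...)` means a caller who never passed return_tensors gets None, which skips conversion. A toy illustration of the final merge-and-cast step, with stand-in values instead of real tensors:

from transformers import BatchFeature

# The three modality outputs are plain dicts; one BatchFeature at the end
# casts them together (toy values stand in for real model inputs).
text_inputs = {"input_ids": [[1, 2, 3]]}
image_inputs = {"pixel_values": [[[0.0]]]}
videos_inputs = {"pixel_values_videos": [[[0.0]]]}
out = BatchFeature(
    data={**text_inputs, **image_inputs, **videos_inputs},
    tensor_type="np",  # e.g. return_tensors="np"; None would leave lists as-is
)
print(sorted(out.keys()))  # ['input_ids', 'pixel_values', 'pixel_values_videos']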
6 changes: 3 additions & 3 deletions src/transformers/models/siglip/processing_siglip.py
@@ -67,6 +67,8 @@ def __call__(
self,
text: Optional[Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]] = None,
images: Optional[ImageInput] = None,
+audio=None,
+videos=None,
**kwargs: Unpack[SiglipProcessingKwargs],
) -> BatchFeature:
"""
@@ -116,9 +118,7 @@
elif text is not None:
return encoding
else:
-return BatchFeature(
-data=dict(**image_features), tensor_type=output_kwargs["common_kwargs"]["return_tensors"]
-)
+return image_features

def decode(self, *args, **kwargs):
"""
6 changes: 3 additions & 3 deletions src/transformers/models/video_llava/processing_video_llava.py
@@ -20,7 +20,7 @@
from typing import List, Optional, Union

from ...feature_extraction_utils import BatchFeature
-from ...image_utils import ImageInput, get_image_size, to_numpy_array
+from ...image_utils import ImageInput, VideoInput, get_image_size, to_numpy_array
from ...processing_utils import ProcessingKwargs, ProcessorMixin
from ...tokenization_utils_base import PreTokenizedInput, TextInput
from ...utils import logging
@@ -97,7 +97,7 @@ def __call__(
self,
text: Optional[Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]] = None,
images: Optional[ImageInput] = None,
-videos: Optional[ImageInput] = None,
+videos: Optional[VideoInput] = None,
audio=None,
**kwargs: Unpack[VideoLlavaProcessorKwargs],
) -> BatchFeature:
@@ -181,7 +181,7 @@ def __call__(
text_inputs = self.tokenizer(prompt_strings, **output_kwargs["text_kwargs"])
data.update(text_inputs)

-return BatchFeature(data=data)
+return BatchFeature(data=data, tensor_type=output_kwargs["common_kwargs"].get("return_tensors"))

# Copied from transformers.models.clip.processing_clip.CLIPProcessor.batch_decode with CLIP->Llama
def batch_decode(self, *args, **kwargs):
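The annotation fix here is small but meaningful: `videos` was previously typed as `ImageInput`. Under `image_utils`, a video input is a frame sequence (or a batch of them). A minimal sketch of the shape involved, with an assumed 8-frame clip of 224x224 RGB:

import numpy as np

# One video as a sequence of HxWxC frames; a batch is a list of such sequences.
video = [np.zeros((224, 224, 3), dtype=np.uint8) for _ in range(8)]
batch_of_videos = [video]  # e.g. processor(videos=batch_of_videos, ...)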
26 changes: 23 additions & 3 deletions src/transformers/models/vilt/processing_vilt.py
@@ -20,9 +20,10 @@
import warnings
from typing import List, Optional, Union

+from ...feature_extraction_utils import BatchFeature
from ...image_utils import ImageInput
from ...processing_utils import ProcessingKwargs, ProcessorMixin
-from ...tokenization_utils_base import BatchEncoding, PreTokenizedInput, TextInput
+from ...tokenization_utils_base import PreTokenizedInput, TextInput


if sys.version_info >= (3, 11):
@@ -87,13 +88,32 @@ def __call__(
self,
images: ImageInput,
text: Optional[Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]] = None,
+audio=None,
+videos=None,
**kwargs: Unpack[ViltProcessorKwargs],
-) -> BatchEncoding:
+) -> BatchFeature:
"""
This method uses [`ViltImageProcessor.__call__`] method to prepare image(s) for the model, and
[`BertTokenizerFast.__call__`] to prepare text for the model.

-Please refer to the docstring of the above two methods for more information.
+Args:
+text (`TextInput`, `PreTokenizedInput`, `List[TextInput]`, `List[PreTokenizedInput]`):
+The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
+(pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
+`is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
+images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
+The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
+tensor. In case of a NumPy array/PyTorch tensor, each image should be of shape (C, H, W), where C is a
+number of channels, H and W are image height and width.
+
+Returns:
+[`BatchFeature`]: A [`BatchFeature`] with the following fields:
+
+- **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
+- **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
+`return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
+`None`).
+- **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
"""
output_kwargs = self._merge_kwargs(
ViltProcessorKwargs,
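End to end, the ViLT change mirrors AltCLIP and FLAVA: an explicit Args/Returns docstring, placeholder audio/videos slots, and a `BatchFeature` return annotation. A usage sketch against a public checkpoint; the checkpoint name is an assumption, not part of this diff:

import numpy as np
from PIL import Image
from transformers import ViltProcessor

processor = ViltProcessor.from_pretrained("dandelin/vilt-b32-finetuned-vqa")  # assumed checkpoint
image = Image.fromarray(np.zeros((384, 384, 3), dtype=np.uint8))
inputs = processor(images=image, text="How many cats are there?", return_tensors="pt")
print(sorted(inputs.keys()))
# e.g. ['attention_mask', 'input_ids', 'pixel_mask', 'pixel_values', 'token_type_ids']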