Skip to content

Commit e2f3064

Browse files
committed
chore: add custom image and audio kwargs classes and some nits
1 parent 4b7f5a8 commit e2f3064

File tree

1 file changed

+22
-5
lines changed

1 file changed

+22
-5
lines changed

src/transformers/models/imagebind/processing_imagebind.py

Lines changed: 22 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -15,20 +15,37 @@
1515
Image/Text processor class for ImageBind
1616
"""
1717

18-
from typing import List, Union
18+
from typing import List, Optional, Union
1919

2020
try:
2121
from typing import Unpack
2222
except ImportError:
2323
from typing_extensions import Unpack
2424

2525
from ...image_utils import ImageInput
26-
from ...processing_utils import ProcessingKwargs, ProcessorMixin
27-
from ...tokenization_utils_base import BatchEncoding, PreTokenizedInput, TextInput
28-
26+
from ...processing_utils import AudioKwargs, ImagesKwargs, ProcessingKwargs, ProcessorMixin
27+
from ...tokenization_utils_base import AudioInput, BatchEncoding, PreTokenizedInput, TextInput
28+
29+
class ImageBindProcessorImagesKwargs(ImagesKwargs, total=False):
    """
    Image-modality keyword arguments accepted by the ImageBind processor, on top of those inherited from
    `ImagesKwargs`.

    The class is declared with `total=False`, so every key may be omitted. Each field defaults to `None` and is
    therefore annotated as `Optional[...]`, the same convention `ImageBindProcessorAudioKwargs` uses.
    """

    # NOTE(review): the meaning of each option is defined by the image processor that consumes these kwargs,
    # which is not visible from this file -- confirm there before relying on any particular semantics.
    do_convert_rgb: Optional[bool] = None
    do_chunk: Optional[bool] = None
    chunk_duration: Optional[float] = None
    num_chunks: Optional[int] = None
    num_frames_per_chunk: Optional[int] = None
    fps: Optional[int] = None
36+
37+
class ImageBindProcessorAudioKwargs(AudioKwargs, total=False):
    """
    Audio-modality keyword arguments accepted by the ImageBind processor, on top of those inherited from
    `AudioKwargs`.

    The class is declared with `total=False`, so every key may be omitted; each field is optional and defaults to
    `None`.
    """

    # Normalization options.
    # NOTE(review): presumably `mean`/`std` are the statistics used when `do_normalize` is set -- confirm against
    # the feature extractor that consumes them.
    do_normalize: Optional[bool] = None
    mean: Optional[float] = None
    std: Optional[float] = None

    # Chunking options.
    do_chunk: Optional[bool] = None
    chunk_duration: Optional[float] = None
    num_chunks: Optional[int] = None
2944

3045
class ImageBindProcessorKwargs(ProcessingKwargs, total=False):
    """
    Keyword-argument schema for the ImageBind processor.

    See the `processing_utils.ProcessingKwargs` documentation for usage.
    """

    # Model-specific kwargs classes used for the image and audio modalities.
    images_kwargs: ImageBindProcessorImagesKwargs
    audio_kwargs: ImageBindProcessorAudioKwargs

    # Empty mapping: no defaults are declared at this level.
    _defaults = {}
3350

3451

@@ -78,7 +95,7 @@ def __call__(
7895
The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
7996
(pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
8097
`is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
81-
audio (`np.ndarray`, `List[float]`, `List[np.ndarray]`, `List[List[float]]`, `List[List[List[float]]]`):
98+
audio (`AudioInput`, `List[float]`, `List[List[float]]`, `List[List[List[float]]]`):
8299
The sequence or batch of sequences to be padded. Each sequence can be a numpy array, a list of numpy
83100
arrays or a (possibly nested) list of float values. The supported input types are as follows:
84101

0 commit comments

Comments
 (0)