|
15 | 15 | Image/Text processor class for ImageBind |
16 | 16 | """ |
17 | 17 |
|
18 | | -from typing import List, Union |
| 18 | +from typing import List, Optional, Union |
19 | 19 |
|
20 | 20 | try: |
21 | 21 | from typing import Unpack |
22 | 22 | except ImportError: |
23 | 23 | from typing_extensions import Unpack |
24 | 24 |
|
25 | 25 | from ...image_utils import ImageInput |
26 | | -from ...processing_utils import ProcessingKwargs, ProcessorMixin |
27 | | -from ...tokenization_utils_base import BatchEncoding, PreTokenizedInput, TextInput |
28 | | - |
| 26 | +from ...processing_utils import AudioKwargs, ImagesKwargs, ProcessingKwargs, ProcessorMixin |
| 27 | +from ...tokenization_utils_base import AudioInput, BatchEncoding, PreTokenizedInput, TextInput |
| 28 | + |
class ImageBindProcessorImagesKwargs(ImagesKwargs, total=False):
    """
    Image-specific keyword arguments for the ImageBind processor.

    Extends the shared `ImagesKwargs` with the extra fields declared below. Every field defaults to `None`, so
    each one is annotated as `Optional[...]`.
    """

    # NOTE(review): the `= None` defaults are kept for backward compatibility. If `ImagesKwargs` is a `TypedDict`
    # (the `total=False` class keyword suggests it is), PEP 589 does not allow default values in the class body and
    # static type checkers will flag them -- confirm before removing.

    # Presumably toggles conversion of the inputs to RGB -- confirm against the image processor.
    do_convert_rgb: Optional[bool] = None

    # Presumably chunking / frame-sampling options for video-like inputs -- confirm against the image processor.
    do_chunk: Optional[bool] = None
    chunk_duration: Optional[float] = None
    num_chunks: Optional[int] = None
    num_frames_per_chunk: Optional[int] = None
    fps: Optional[int] = None
| 36 | + |
class ImageBindProcessorAudioKwargs(AudioKwargs, total=False):
    """
    Audio-specific keyword arguments for the ImageBind processor.

    Extends the shared `AudioKwargs` with the extra fields declared below; every field is optional and defaults
    to `None`.
    """

    # Presumably normalization controls (on/off switch plus the statistics used) -- confirm against the
    # feature extractor.
    do_normalize: Optional[bool] = None
    mean: Optional[float] = None
    std: Optional[float] = None

    # Presumably chunking controls for the audio input -- confirm against the feature extractor.
    do_chunk: Optional[bool] = None
    chunk_duration: Optional[float] = None
    num_chunks: Optional[int] = None
29 | 44 |
|
class ImageBindProcessorKwargs(ProcessingKwargs, total=False):
    """
    Keyword-argument schema for the ImageBind processor.

    Declares the `images_kwargs` and `audio_kwargs` groups with the ImageBind-specific types. See the
    `processing_utils.ProcessingKwargs` documentation for usage.
    """

    # Per-modality kwarg groups, typed with the ImageBind-specific classes.
    images_kwargs: ImageBindProcessorImagesKwargs
    audio_kwargs: ImageBindProcessorAudioKwargs

    # No default values are declared here.
    _defaults = {}
33 | 50 |
|
34 | 51 |
|
@@ -78,7 +95,7 @@ def __call__( |
78 | 95 | The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings |
79 | 96 | (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set |
80 | 97 | `is_split_into_words=True` (to lift the ambiguity with a batch of sequences). |
81 | | - audio (`np.ndarray`, `List[float]`, `List[np.ndarray]`, `List[List[float]]`, `List[List[List[float]]]`): |
| 98 | + audio (`AudioInput`, `List[float]`, `List[List[float]]`, `List[List[List[float]]]`): |
82 | 99 | The sequence or batch of sequences to be padded. Each sequence can be a numpy array, a list of numpy |
83 | 100 | arrays or a (possibly nested) list of float values. The supported input types are as follows: |
84 | 101 |
|
|
0 commit comments