diff --git a/src/transformers/models/audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.py b/src/transformers/models/audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.py index a3b1484e6b99d7..786548fd2336e9 100644 --- a/src/transformers/models/audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.py +++ b/src/transformers/models/audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.py @@ -135,7 +135,8 @@ def __call__( Args: raw_speech (`np.ndarray`, `List[float]`, `List[np.ndarray]`, `List[List[float]]`): The sequence or batch of sequences to be padded. Each sequence can be a numpy array, a list of float - values, a list of numpy arrays or a list of list of float values. + values, a list of numpy arrays or a list of list of float values. Must be mono channel audio, not + stereo, i.e. single float per timestep. sampling_rate (`int`, *optional*): The sampling rate at which the `raw_speech` input was sampled. It is strongly recommended to pass `sampling_rate` at the forward call to prevent silent errors. @@ -161,9 +162,8 @@ def __call__( ) is_batched_numpy = isinstance(raw_speech, np.ndarray) and len(raw_speech.shape) > 1 - if is_batched_numpy: - if len(raw_speech.shape) > 2: - raise ValueError(f"Only mono-channel audio is supported for input to {self}") + if is_batched_numpy and len(raw_speech.shape) > 2: + raise ValueError(f"Only mono-channel audio is supported for input to {self}") is_batched = is_batched_numpy or ( isinstance(raw_speech, (list, tuple)) and (isinstance(raw_speech[0], (np.ndarray, tuple, list))) ) diff --git a/src/transformers/models/clap/feature_extraction_clap.py b/src/transformers/models/clap/feature_extraction_clap.py index fc7b398bbfc8fb..d33307ffbd22fc 100644 --- a/src/transformers/models/clap/feature_extraction_clap.py +++ b/src/transformers/models/clap/feature_extraction_clap.py @@ -272,7 +272,8 @@ def __call__( Args: raw_speech (`np.ndarray`, `List[float]`, `List[np.ndarray]`, `List[List[float]]`): The sequence or batch of sequences to be padded. Each sequence can be a numpy array, a list of float - values, a list of numpy arrays or a list of list of float values. + values, a list of numpy arrays or a list of list of float values. Must be mono channel audio, not + stereo, i.e. single float per timestep. truncation (`str`, *optional*): Truncation pattern for long audio inputs. Two patterns are available: - `fusion` will use `_random_mel_fusion`, which stacks 3 random crops from the mel spectrogram and @@ -313,9 +314,8 @@ def __call__( ) is_batched_numpy = isinstance(raw_speech, np.ndarray) and len(raw_speech.shape) > 1 - if is_batched_numpy: - if len(raw_speech.shape) > 2: - raise ValueError(f"Only mono-channel audio is supported for input to {self}") + if is_batched_numpy and len(raw_speech.shape) > 2: + raise ValueError(f"Only mono-channel audio is supported for input to {self}") is_batched = is_batched_numpy or ( isinstance(raw_speech, (list, tuple)) and (isinstance(raw_speech[0], (np.ndarray, tuple, list))) ) diff --git a/src/transformers/models/mctct/feature_extraction_mctct.py b/src/transformers/models/mctct/feature_extraction_mctct.py index d07709f006eb08..9e9e276c168ca1 100644 --- a/src/transformers/models/mctct/feature_extraction_mctct.py +++ b/src/transformers/models/mctct/feature_extraction_mctct.py @@ -180,7 +180,8 @@ def __call__( Args: raw_speech (`torch.Tensor`, `np.ndarray`, `List[float]`, `List[torch.Tensor]`, `List[np.ndarray]`, `List[List[float]]`): The sequence or batch of sequences to be padded. Each sequence can be a tensor, a numpy array, a list - of float values, a list of tensors, a list of numpy arrays or a list of list of float values. + of float values, a list of tensors, a list of numpy arrays or a list of list of float values. Must be + mono channel audio, not stereo, i.e. single float per timestep. padding (`bool`, `str` or [`~file_utils.PaddingStrategy`], *optional*, defaults to `False`): Select a strategy to pad the returned sequences (according to the model's padding side and padding index) among: @@ -232,9 +233,8 @@ def __call__( ) is_batched_numpy = isinstance(raw_speech, np.ndarray) and len(raw_speech.shape) > 1 - if is_batched_numpy: - if len(raw_speech.shape) > 2: - raise ValueError(f"Only mono-channel audio is supported for input to {self}") + if is_batched_numpy and len(raw_speech.shape) > 2: + raise ValueError(f"Only mono-channel audio is supported for input to {self}") is_batched = is_batched_numpy or ( isinstance(raw_speech, (list, tuple)) and (isinstance(raw_speech[0], (np.ndarray, tuple, list))) ) diff --git a/src/transformers/models/speech_to_text/feature_extraction_speech_to_text.py b/src/transformers/models/speech_to_text/feature_extraction_speech_to_text.py index ff40d2d37a8a73..81f2ea4e99be22 100644 --- a/src/transformers/models/speech_to_text/feature_extraction_speech_to_text.py +++ b/src/transformers/models/speech_to_text/feature_extraction_speech_to_text.py @@ -141,7 +141,8 @@ def __call__( Args: raw_speech (`np.ndarray`, `List[float]`, `List[np.ndarray]`, `List[List[float]]`): The sequence or batch of sequences to be padded. Each sequence can be a numpy array, a list of float - values, a list of numpy arrays or a list of list of float values. + values, a list of numpy arrays or a list of list of float values. Must be mono channel audio, not + stereo, i.e. single float per timestep. padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `True`): Select a strategy to pad the returned sequences (according to the model's padding side and padding index) among: @@ -201,9 +202,8 @@ def __call__( ) is_batched_numpy = isinstance(raw_speech, np.ndarray) and len(raw_speech.shape) > 1 - if is_batched_numpy: - if len(raw_speech.shape) > 2: - raise ValueError(f"Only mono-channel audio is supported for input to {self}") + if is_batched_numpy and len(raw_speech.shape) > 2: + raise ValueError(f"Only mono-channel audio is supported for input to {self}") is_batched = is_batched_numpy or ( isinstance(raw_speech, (list, tuple)) and (isinstance(raw_speech[0], (np.ndarray, tuple, list))) ) diff --git a/src/transformers/models/speecht5/feature_extraction_speecht5.py b/src/transformers/models/speecht5/feature_extraction_speecht5.py index d9ec74107bcd52..dd5ff4c8a1afae 100644 --- a/src/transformers/models/speecht5/feature_extraction_speecht5.py +++ b/src/transformers/models/speecht5/feature_extraction_speecht5.py @@ -201,7 +201,8 @@ def __call__( Args: audio (`np.ndarray`, `List[float]`, `List[np.ndarray]`, `List[List[float]]`, *optional*): The sequence or batch of sequences to be processed. Each sequence can be a numpy array, a list of float - values, a list of numpy arrays or a list of list of float values. This outputs waveform features. + values, a list of numpy arrays or a list of list of float values. This outputs waveform features. Must + be mono channel audio, not stereo, i.e. single float per timestep. audio_target (`np.ndarray`, `List[float]`, `List[np.ndarray]`, `List[List[float]]`, *optional*): The sequence or batch of sequences to be processed as targets. Each sequence can be a numpy array, a list of float values, a list of numpy arrays or a list of list of float values. This outputs log-mel @@ -308,9 +309,8 @@ def _process_audio( **kwargs, ) -> BatchFeature: is_batched_numpy = isinstance(speech, np.ndarray) and len(speech.shape) > 1 - if is_batched_numpy: - if len(speech.shape) > 2: - raise ValueError(f"Only mono-channel audio is supported for input to {self}") + if is_batched_numpy and len(speech.shape) > 2: + raise ValueError(f"Only mono-channel audio is supported for input to {self}") is_batched = is_batched_numpy or ( isinstance(speech, (list, tuple)) and (isinstance(speech[0], (np.ndarray, tuple, list))) ) diff --git a/src/transformers/models/tvlt/feature_extraction_tvlt.py b/src/transformers/models/tvlt/feature_extraction_tvlt.py index b9c9402ea64111..d5beba76bd6986 100644 --- a/src/transformers/models/tvlt/feature_extraction_tvlt.py +++ b/src/transformers/models/tvlt/feature_extraction_tvlt.py @@ -129,7 +129,8 @@ def __call__( Args: raw_speech (`np.ndarray`, `List[float]`, `List[np.ndarray]`, `List[List[float]]`): The sequence or batch of sequences to be padded. Each sequence can be a numpy array, a list of float - values, a list of numpy arrays or a list of list of float values. + values, a list of numpy arrays or a list of list of float values. Must be mono channel audio, not + stereo, i.e. single float per timestep. return_tensors (`str` or [`~utils.TensorType`], *optional*): If set, will return tensors instead of list of python integers. Acceptable values are: - `'pt'`: Return PyTorch `torch.Tensor` objects. @@ -177,9 +178,8 @@ def __call__( ) is_batched_numpy = isinstance(raw_speech, np.ndarray) and len(raw_speech.shape) > 1 - if is_batched_numpy: - if len(raw_speech.shape) > 2: - raise ValueError(f"Only mono-channel audio is supported for input to {self}") + if is_batched_numpy and len(raw_speech.shape) > 2: + raise ValueError(f"Only mono-channel audio is supported for input to {self}") is_batched = is_batched_numpy or ( isinstance(raw_speech, (list, tuple)) and (isinstance(raw_speech[0], (np.ndarray, tuple, list))) ) diff --git a/src/transformers/models/whisper/feature_extraction_whisper.py b/src/transformers/models/whisper/feature_extraction_whisper.py index e6491097af58c6..70eb8bd94e7676 100644 --- a/src/transformers/models/whisper/feature_extraction_whisper.py +++ b/src/transformers/models/whisper/feature_extraction_whisper.py @@ -152,7 +152,8 @@ def __call__( Args: raw_speech (`np.ndarray`, `List[float]`, `List[np.ndarray]`, `List[List[float]]`): The sequence or batch of sequences to be padded. Each sequence can be a numpy array, a list of float - values, a list of numpy arrays or a list of list of float values. + values, a list of numpy arrays or a list of list of float values. Must be mono channel audio, not + stereo, i.e. single float per timestep. truncation (`bool`, *optional*, default to `True`): Activates truncation to cut input sequences longer than *max_length* to *max_length*. pad_to_multiple_of (`int`, *optional*, defaults to None): @@ -204,9 +205,8 @@ def __call__( ) is_batched_numpy = isinstance(raw_speech, np.ndarray) and len(raw_speech.shape) > 1 - if is_batched_numpy: - if len(raw_speech.shape) > 2: - raise ValueError(f"Only mono-channel audio is supported for input to {self}") + if is_batched_numpy and len(raw_speech.shape) > 2: + raise ValueError(f"Only mono-channel audio is supported for input to {self}") is_batched = is_batched_numpy or ( isinstance(raw_speech, (list, tuple)) and (isinstance(raw_speech[0], (np.ndarray, tuple, list))) )