Incorporate comments from PR huggingface#23223
LWprogramming committed May 22, 2023
1 parent 62c8b16 commit 1b23181
Showing 15 changed files with 60 additions and 28 deletions.
21 changes: 21 additions & 0 deletions reports/examples_torch/errors.txt
@@ -0,0 +1,21 @@
==================================== ERRORS ====================================
________ ERROR collecting examples/pytorch/test_accelerate_examples.py _________
ImportError while importing test module '/Users/leonwu/Documents/ai_cs/transformers/examples/pytorch/test_accelerate_examples.py'.
Hint: make sure your test modules/packages have valid Python names.
Traceback:
/opt/homebrew/Cellar/python@3.10/3.10.11/Frameworks/Python.framework/Versions/3.10/lib/python3.10/importlib/__init__.py:126: in import_module
return _bootstrap._gcd_import(name[level:], package, level)
examples/pytorch/test_accelerate_examples.py:27: in <module>
from accelerate.utils import write_basic_config
E ModuleNotFoundError: No module named 'accelerate'
__________ ERROR collecting examples/pytorch/test_pytorch_examples.py __________
ImportError while importing test module '/Users/leonwu/Documents/ai_cs/transformers/examples/pytorch/test_pytorch_examples.py'.
Hint: make sure your test modules/packages have valid Python names.
Traceback:
/opt/homebrew/Cellar/python@3.10/3.10.11/Frameworks/Python.framework/Versions/3.10/lib/python3.10/importlib/__init__.py:126: in import_module
return _bootstrap._gcd_import(name[level:], package, level)
examples/pytorch/test_pytorch_examples.py:70: in <module>
import run_wav2vec2_pretraining_no_trainer
examples/pytorch/speech-pretraining/run_wav2vec2_pretraining_no_trainer.py:27: in <module>
from accelerate import Accelerator
E ModuleNotFoundError: No module named 'accelerate'
Empty file.
Empty file.
Empty file.
1 change: 1 addition & 0 deletions reports/examples_torch/stats.txt
@@ -0,0 +1 @@
======================== 1 warning, 2 errors in 23.43s =========================
3 changes: 3 additions & 0 deletions reports/examples_torch/summary_short.txt
@@ -0,0 +1,3 @@
=========================== short test summary info ============================
ERROR examples/pytorch/test_accelerate_examples.py
ERROR examples/pytorch/test_pytorch_examples.py
7 changes: 7 additions & 0 deletions reports/examples_torch/warnings.txt
@@ -0,0 +1,7 @@
=========================== warnings summary (final) ===========================
venv/lib/python3.10/site-packages/_pytest/config/__init__.py:1302
/Users/leonwu/Documents/ai_cs/transformers/venv/lib/python3.10/site-packages/_pytest/config/__init__.py:1302: PytestConfigWarning: Unknown config option: doctest_glob

self._warn_or_fail_if_strict(f"Unknown config option: {key}\n")

-- Docs: https://docs.pytest.org/en/stable/how-to/capture-warnings.html
@@ -135,7 +135,8 @@ def __call__(
         Args:
             raw_speech (`np.ndarray`, `List[float]`, `List[np.ndarray]`, `List[List[float]]`):
                 The sequence or batch of sequences to be padded. Each sequence can be a numpy array, a list of float
-                values, a list of numpy arrays or a list of list of float values.
+                values, a list of numpy arrays or a list of list of float values. Must be mono channel audio, not
+                stereo, i.e. single float per timestep.
             sampling_rate (`int`, *optional*):
                 The sampling rate at which the `raw_speech` input was sampled. It is strongly recommended to pass
                 `sampling_rate` at the forward call to prevent silent errors.
@@ -161,9 +162,8 @@ def __call__(
             )
 
         is_batched_numpy = isinstance(raw_speech, np.ndarray) and len(raw_speech.shape) > 1
-        if is_batched_numpy:
-            if len(raw_speech.shape) > 2:
-                raise ValueError(f"Only mono-channel audio is supported for input to {self}")
+        if is_batched_numpy and len(raw_speech.shape) > 2:
+            raise ValueError(f"Only mono-channel audio is supported for input to {self}")
         is_batched = is_batched_numpy or (
             isinstance(raw_speech, (list, tuple)) and (isinstance(raw_speech[0], (np.ndarray, tuple, list)))
         )
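The refactor above folds the nested mono-channel check into a single condition without changing behaviour. As a minimal, self-contained illustration (not part of this commit; the helper name check_mono is made up), the guard accepts 1-D and 2-D numpy input and only rejects arrays with more than two dimensions:

import numpy as np

def check_mono(raw_speech):
    # Same logic as the refactored guard: batched numpy input with more than
    # two dimensions (e.g. a batch of stereo clips) is rejected.
    is_batched_numpy = isinstance(raw_speech, np.ndarray) and len(raw_speech.shape) > 1
    if is_batched_numpy and len(raw_speech.shape) > 2:
        raise ValueError("Only mono-channel audio is supported")
    return raw_speech.shape

print(check_mono(np.zeros(16000)))        # (16000,): a single mono clip is accepted
print(check_mono(np.zeros((4, 16000))))   # (4, 16000): a batch of mono clips is accepted
check_mono(np.zeros((4, 2, 16000)))       # 3-D input (batch of stereo clips) raises ValueError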
8 changes: 4 additions & 4 deletions src/transformers/models/clap/feature_extraction_clap.py
@@ -272,7 +272,8 @@ def __call__(
         Args:
             raw_speech (`np.ndarray`, `List[float]`, `List[np.ndarray]`, `List[List[float]]`):
                 The sequence or batch of sequences to be padded. Each sequence can be a numpy array, a list of float
-                values, a list of numpy arrays or a list of list of float values.
+                values, a list of numpy arrays or a list of list of float values. Must be mono channel audio, not
+                stereo, i.e. single float per timestep.
             truncation (`str`, *optional*):
                 Truncation pattern for long audio inputs. Two patterns are available:
                 - `fusion` will use `_random_mel_fusion`, which stacks 3 random crops from the mel spectrogram and
@@ -313,9 +314,8 @@ def __call__(
             )
 
         is_batched_numpy = isinstance(raw_speech, np.ndarray) and len(raw_speech.shape) > 1
-        if is_batched_numpy:
-            if len(raw_speech.shape) > 2:
-                raise ValueError(f"Only mono-channel audio is supported for input to {self}")
+        if is_batched_numpy and len(raw_speech.shape) > 2:
+            raise ValueError(f"Only mono-channel audio is supported for input to {self}")
         is_batched = is_batched_numpy or (
             isinstance(raw_speech, (list, tuple)) and (isinstance(raw_speech[0], (np.ndarray, tuple, list)))
         )
8 changes: 4 additions & 4 deletions src/transformers/models/mctct/feature_extraction_mctct.py
@@ -180,7 +180,8 @@ def __call__(
         Args:
             raw_speech (`torch.Tensor`, `np.ndarray`, `List[float]`, `List[torch.Tensor]`, `List[np.ndarray]`, `List[List[float]]`):
                 The sequence or batch of sequences to be padded. Each sequence can be a tensor, a numpy array, a list
-                of float values, a list of tensors, a list of numpy arrays or a list of list of float values.
+                of float values, a list of tensors, a list of numpy arrays or a list of list of float values. Must be
+                mono channel audio, not stereo, i.e. single float per timestep.
             padding (`bool`, `str` or [`~file_utils.PaddingStrategy`], *optional*, defaults to `False`):
                 Select a strategy to pad the returned sequences (according to the model's padding side and padding
                 index) among:
@@ -232,9 +233,8 @@ def __call__(
             )
 
         is_batched_numpy = isinstance(raw_speech, np.ndarray) and len(raw_speech.shape) > 1
-        if is_batched_numpy:
-            if len(raw_speech.shape) > 2:
-                raise ValueError(f"Only mono-channel audio is supported for input to {self}")
+        if is_batched_numpy and len(raw_speech.shape) > 2:
+            raise ValueError(f"Only mono-channel audio is supported for input to {self}")
         is_batched = is_batched_numpy or (
             isinstance(raw_speech, (list, tuple)) and (isinstance(raw_speech[0], (np.ndarray, tuple, list)))
         )
@@ -141,7 +141,8 @@ def __call__(
         Args:
             raw_speech (`np.ndarray`, `List[float]`, `List[np.ndarray]`, `List[List[float]]`):
                 The sequence or batch of sequences to be padded. Each sequence can be a numpy array, a list of float
-                values, a list of numpy arrays or a list of list of float values.
+                values, a list of numpy arrays or a list of list of float values. Must be mono channel audio, not
+                stereo, i.e. single float per timestep.
             padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `True`):
                 Select a strategy to pad the returned sequences (according to the model's padding side and padding
                 index) among:
@@ -201,9 +202,8 @@ def __call__(
             )
 
         is_batched_numpy = isinstance(raw_speech, np.ndarray) and len(raw_speech.shape) > 1
-        if is_batched_numpy:
-            if len(raw_speech.shape) > 2:
-                raise ValueError(f"Only mono-channel audio is supported for input to {self}")
+        if is_batched_numpy and len(raw_speech.shape) > 2:
+            raise ValueError(f"Only mono-channel audio is supported for input to {self}")
         is_batched = is_batched_numpy or (
             isinstance(raw_speech, (list, tuple)) and (isinstance(raw_speech[0], (np.ndarray, tuple, list)))
         )
@@ -201,7 +201,8 @@ def __call__(
         Args:
             audio (`np.ndarray`, `List[float]`, `List[np.ndarray]`, `List[List[float]]`, *optional*):
                 The sequence or batch of sequences to be processed. Each sequence can be a numpy array, a list of float
-                values, a list of numpy arrays or a list of list of float values. This outputs waveform features.
+                values, a list of numpy arrays or a list of list of float values. This outputs waveform features. Must
+                be mono channel audio, not stereo, i.e. single float per timestep.
             audio_target (`np.ndarray`, `List[float]`, `List[np.ndarray]`, `List[List[float]]`, *optional*):
                 The sequence or batch of sequences to be processed as targets. Each sequence can be a numpy array, a
                 list of float values, a list of numpy arrays or a list of list of float values. This outputs log-mel
@@ -308,9 +309,8 @@ def _process_audio(
         **kwargs,
     ) -> BatchFeature:
         is_batched_numpy = isinstance(speech, np.ndarray) and len(speech.shape) > 1
-        if is_batched_numpy:
-            if len(speech.shape) > 2:
-                raise ValueError(f"Only mono-channel audio is supported for input to {self}")
+        if is_batched_numpy and len(speech.shape) > 2:
+            raise ValueError(f"Only mono-channel audio is supported for input to {self}")
         is_batched = is_batched_numpy or (
             isinstance(speech, (list, tuple)) and (isinstance(speech[0], (np.ndarray, tuple, list)))
         )
8 changes: 4 additions & 4 deletions src/transformers/models/tvlt/feature_extraction_tvlt.py
@@ -129,7 +129,8 @@ def __call__(
         Args:
             raw_speech (`np.ndarray`, `List[float]`, `List[np.ndarray]`, `List[List[float]]`):
                 The sequence or batch of sequences to be padded. Each sequence can be a numpy array, a list of float
-                values, a list of numpy arrays or a list of list of float values.
+                values, a list of numpy arrays or a list of list of float values. Must be mono channel audio, not
+                stereo, i.e. single float per timestep.
             return_tensors (`str` or [`~utils.TensorType`], *optional*):
                 If set, will return tensors instead of list of python integers. Acceptable values are:
                 - `'pt'`: Return PyTorch `torch.Tensor` objects.
@@ -177,9 +178,8 @@ def __call__(
             )
 
         is_batched_numpy = isinstance(raw_speech, np.ndarray) and len(raw_speech.shape) > 1
-        if is_batched_numpy:
-            if len(raw_speech.shape) > 2:
-                raise ValueError(f"Only mono-channel audio is supported for input to {self}")
+        if is_batched_numpy and len(raw_speech.shape) > 2:
+            raise ValueError(f"Only mono-channel audio is supported for input to {self}")
         is_batched = is_batched_numpy or (
             isinstance(raw_speech, (list, tuple)) and (isinstance(raw_speech[0], (np.ndarray, tuple, list)))
         )
8 changes: 4 additions & 4 deletions src/transformers/models/whisper/feature_extraction_whisper.py
@@ -152,7 +152,8 @@ def __call__(
         Args:
             raw_speech (`np.ndarray`, `List[float]`, `List[np.ndarray]`, `List[List[float]]`):
                 The sequence or batch of sequences to be padded. Each sequence can be a numpy array, a list of float
-                values, a list of numpy arrays or a list of list of float values.
+                values, a list of numpy arrays or a list of list of float values. Must be mono channel audio, not
+                stereo, i.e. single float per timestep.
             truncation (`bool`, *optional*, default to `True`):
                 Activates truncation to cut input sequences longer than *max_length* to *max_length*.
             pad_to_multiple_of (`int`, *optional*, defaults to None):
@@ -204,9 +205,8 @@ def __call__(
             )
 
         is_batched_numpy = isinstance(raw_speech, np.ndarray) and len(raw_speech.shape) > 1
-        if is_batched_numpy:
-            if len(raw_speech.shape) > 2:
-                raise ValueError(f"Only mono-channel audio is supported for input to {self}")
+        if is_batched_numpy and len(raw_speech.shape) > 2:
+            raise ValueError(f"Only mono-channel audio is supported for input to {self}")
         is_batched = is_batched_numpy or (
             isinstance(raw_speech, (list, tuple)) and (isinstance(raw_speech[0], (np.ndarray, tuple, list)))
         )
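Because the updated docstrings require mono input, a caller holding stereo audio has to downmix before feature extraction. A minimal sketch (not part of this commit), assuming a stereo array of shape (num_samples, 2) and using WhisperFeatureExtractor; the checkpoint name is only illustrative:

import numpy as np
from transformers import WhisperFeatureExtractor

# Hypothetical 1-second stereo clip at 16 kHz: shape (num_samples, 2).
stereo = np.random.randn(16000, 2).astype(np.float32)

# Average the two channels so there is a single float per timestep.
mono = stereo.mean(axis=-1)

feature_extractor = WhisperFeatureExtractor.from_pretrained("openai/whisper-tiny")
inputs = feature_extractor(mono, sampling_rate=16000, return_tensors="np")
print(inputs["input_features"].shape)  # log-mel features, e.g. (1, 80, 3000)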
Empty file added tests_output.txt
