huggingface · patrickvonplaten · Feb 15, 2022 · Feb 9, 2022 · Feb 14, 2022 · Feb 14, 2022
diff --git a/src/transformers/models/wav2vec2_with_lm/processing_wav2vec2_with_lm.py b/src/transformers/models/wav2vec2_with_lm/processing_wav2vec2_with_lm.py
@@ -127,7 +127,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
 
         feature_extractor, tokenizer = super()._get_arguments_from_pretrained(pretrained_model_name_or_path, **kwargs)
 
-        if os.path.isdir(pretrained_model_name_or_path):
+        if os.path.isdir(pretrained_model_name_or_path) or os.path.isfile(pretrained_model_name_or_path):
             decoder = BeamSearchDecoderCTC.load_from_dir(pretrained_model_name_or_path)
         else:
             # BeamSearchDecoderCTC has no auto class

diff --git a/src/transformers/pipelines/__init__.py b/src/transformers/pipelines/__init__.py
@@ -621,15 +621,20 @@ def pipeline(
                     import kenlm  # to trigger `ImportError` if not installed
                     from pyctcdecode import BeamSearchDecoderCTC
 
-                    language_model_glob = os.path.join(BeamSearchDecoderCTC._LANGUAGE_MODEL_SERIALIZED_DIRECTORY, "*")
-                    alphabet_filename = BeamSearchDecoderCTC._ALPHABET_SERIALIZED_FILENAME
-                    allow_regex = [language_model_glob, alphabet_filename]
+                    if os.path.isdir(model_name) or os.path.isfile(model_name):
+                        decoder = BeamSearchDecoderCTC.load_from_dir(model_name)
+                    else:
+                        language_model_glob = os.path.join(
+                            BeamSearchDecoderCTC._LANGUAGE_MODEL_SERIALIZED_DIRECTORY, "*"
+                        )
+                        alphabet_filename = BeamSearchDecoderCTC._ALPHABET_SERIALIZED_FILENAME
+                        allow_regex = [language_model_glob, alphabet_filename]
+                        decoder = BeamSearchDecoderCTC.load_from_hf_hub(model_name, allow_regex=allow_regex)
 
-                    decoder = BeamSearchDecoderCTC.load_from_hf_hub(model_name, allow_regex=allow_regex)
                     kwargs["decoder"] = decoder
                 except ImportError as e:
                     logger.warning(
-                        "Could not load the `decoder` for {model_name}. Defaulting to raw CTC. Try to install `pyctcdecode` and `kenlm`: (`pip install pyctcdecode`, `pip install https://github.com/kpu/kenlm/archive/master.zip`): Error: {e}"
+                        f"Could not load the `decoder` for {model_name}. Defaulting to raw CTC. Try to install `pyctcdecode` and `kenlm`: (`pip install pyctcdecode`, `pip install https://github.com/kpu/kenlm/archive/master.zip`): Error: {e}"
                     )
 
     if task == "translation" and model.config.task_specific_params:

diff --git a/tests/test_modeling_wav2vec2.py b/tests/test_modeling_wav2vec2.py
@@ -64,6 +64,7 @@
 
 
 if is_pyctcdecode_available():
+    from huggingface_hub import snapshot_download
     from transformers import Wav2Vec2ProcessorWithLM
 
 
@@ -1495,6 +1496,30 @@ def test_wav2vec2_with_lm(self):
 
         self.assertEqual(transcription[0], "bien y qué regalo vas a abrir primero")
 
+    @require_pyctcdecode
+    @require_torchaudio
+    def test_wav2vec2_with_local_lm(self):
+        local_dir = snapshot_download("hf-internal-testing/processor_with_lm")
+
+        ds = load_dataset("common_voice", "es", split="test", streaming=True)
+        sample = next(iter(ds))
+
+        resampled_audio = torchaudio.functional.resample(
+            torch.tensor(sample["audio"]["array"]), 48_000, 16_000
+        ).numpy()
+
+        model = Wav2Vec2ForCTC.from_pretrained(local_dir).to(torch_device)
+        processor = Wav2Vec2ProcessorWithLM.from_pretrained(local_dir)
+
+        input_values = processor(resampled_audio, return_tensors="pt").input_values
+
+        with torch.no_grad():
+            logits = model(input_values.to(torch_device)).logits
+
+        transcription = processor.batch_decode(logits.cpu().numpy()).text
+
+        self.assertEqual(transcription[0], "bien y qué regalo vas a abrir primero")
+
     def test_inference_diarization(self):
         model = Wav2Vec2ForAudioFrameClassification.from_pretrained("anton-l/wav2vec2-base-superb-sd").to(torch_device)
         processor = Wav2Vec2FeatureExtractor.from_pretrained("anton-l/wav2vec2-base-superb-sd")

diff --git a/tests/test_pipelines_automatic_speech_recognition.py b/tests/test_pipelines_automatic_speech_recognition.py
@@ -31,6 +31,7 @@
 from transformers.pipelines.automatic_speech_recognition import apply_stride, chunk_iter
 from transformers.testing_utils import (
     is_pipeline_test,
+    is_pyctcdecode_available,
     is_torch_available,
     nested_simplify,
     require_pyctcdecode,
@@ -47,6 +48,10 @@
     import torch
 
 
+if is_pyctcdecode_available():
+    from huggingface_hub import snapshot_download
+
+
 # We can't use this mixin because it assumes TF support.
 # from .test_pipelines_common import CustomInputPipelineCommonMixin
 
@@ -368,6 +373,27 @@ def test_with_lm_fast(self):
         self.assertEqual(output, [{"text": ANY(str)}])
         self.assertEqual(output[0]["text"][:6], "<s> <s")
 
+    @require_torch
+    @require_pyctcdecode
+    def test_with_local_lm_fast(self):
+        local_dir = snapshot_download("hf-internal-testing/processor_with_lm")
+        speech_recognizer = pipeline(
+            task="automatic-speech-recognition",
+            model=local_dir,
+        )
+        self.assertEqual(speech_recognizer.type, "ctc_with_lm")
+
+        ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation").sort("id")
+        audio = ds[40]["audio"]["array"]
+
+        n_repeats = 2
+        audio_tiled = np.tile(audio, n_repeats)
+
+        output = speech_recognizer([audio_tiled], batch_size=2)
+
+        self.assertEqual(output, [{"text": ANY(str)}])
+        self.assertEqual(output[0]["text"][:6], "<s> <s")
+
     @require_torch
     @slow
     def test_chunking(self):

diff --git a/tests/test_processor_wav2vec2_with_lm.py b/tests/test_processor_wav2vec2_with_lm.py
@@ -31,6 +31,7 @@
 
 
 if is_pyctcdecode_available():
+    from huggingface_hub import snapshot_download
     from pyctcdecode import BeamSearchDecoderCTC
     from transformers.models.wav2vec2_with_lm import Wav2Vec2ProcessorWithLM
 
@@ -303,3 +304,20 @@ def test_decoder_download_ignores_files(self):
         # https://huggingface.co/hf-internal-testing/processor_with_lm/tree/main
         # are downloaded and none of the rest (e.g. README.md, ...)
         self.assertListEqual(downloaded_decoder_files, expected_decoder_files)
+
+    def test_decoder_local_files(self):
+        local_dir = snapshot_download("hf-internal-testing/processor_with_lm")
+
+        processor = Wav2Vec2ProcessorWithLM.from_pretrained(local_dir)
+
+        language_model = processor.decoder.model_container[processor.decoder._model_key]
+        path_to_cached_dir = Path(language_model._kenlm_model.path.decode("utf-8")).parent.parent.absolute()
+
+        local_decoder_files = os.listdir(local_dir)
+        expected_decoder_files = os.listdir(path_to_cached_dir)
+
+        local_decoder_files.sort()
+        expected_decoder_files.sort()
+
+        # test that both decoder from hub and local files in cache are the same
+        self.assertListEqual(local_decoder_files, expected_decoder_files)