Merge pull request #125 from neillu23/persephone-refactor

jesus-villalba · web-flow · commit 75008f003276 · 2022-11-30T15:55:52.000-05:00
upload missing file: download_lm.py
diff --git a/egs/librispeech/v1/local/download_lm.py b/egs/librispeech/v1/local/download_lm.py
@@ -0,0 +1,99 @@
+#!/usr/bin/env python3
+# Copyright    2021  Xiaomi Corp.        (authors: Fangjun Kuang)
+#
+# See ../../../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+"""
+This file downloads the following LibriSpeech LM files:
+
+    - 3-gram.pruned.1e-7.arpa.gz
+    - 4-gram.arpa.gz
+    - librispeech-vocab.txt
+    - librispeech-lexicon.txt
+    - librispeech-lm-norm.txt.gz
+
+from http://www.openslr.org/resources/11
+and save them in the user provided directory.
+
+Files are not re-downloaded if they already exist.
+
+Usage:
+    ./local/download_lm.py --out-dir ./download/lm
+"""
+
+import argparse
+import gzip
+import logging
+import os
+import shutil
+from pathlib import Path
+
+from lhotse.utils import urlretrieve_progress
+from tqdm.auto import tqdm
+
+
+def get_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--out-dir", type=str, help="Output directory.")
+
+    args = parser.parse_args()
+    return args
+
+
+def main(out_dir: str):
+    url = "http://www.openslr.org/resources/11"
+    out_dir = Path(out_dir)
+
+    files_to_download = (
+        "3-gram.pruned.1e-7.arpa.gz",
+        "4-gram.arpa.gz",
+        "librispeech-vocab.txt",
+        "librispeech-lexicon.txt",
+        "librispeech-lm-norm.txt.gz",
+    )
+
+    for f in tqdm(files_to_download, desc="Downloading LibriSpeech LM files"):
+        filename = out_dir / f
+        if filename.is_file() is False:
+            urlretrieve_progress(
+                f"{url}/{f}",
+                filename=filename,
+                desc=f"Downloading {filename}",
+            )
+        else:
+            logging.info(f"{filename} already exists - skipping")
+
+        if ".gz" in str(filename):
+            unzipped = Path(os.path.splitext(filename)[0])
+            if unzipped.is_file() is False:
+                with gzip.open(filename, "rb") as f_in:
+                    with open(unzipped, "wb") as f_out:
+                        shutil.copyfileobj(f_in, f_out)
+            else:
+                logging.info(f"{unzipped} already exist - skipping")
+
+
+if __name__ == "__main__":
+    formatter = (
+        "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
+    )
+
+    logging.basicConfig(format=formatter, level=logging.INFO)
+
+    args = get_args()
+    logging.info(f"out_dir: {args.out_dir}")
+
+    main(out_dir=args.out_dir)