Skip to content

Commit 75008f0

Browse files
Merge pull request #125 from neillu23/persephone-refactor
upload missing file: download_lm.py
2 parents 2cbefda + 49219e9 commit 75008f0

File tree

1 file changed

+99
-0
lines changed

1 file changed

+99
-0
lines changed
Lines changed: 99 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,99 @@
1+
#!/usr/bin/env python3
2+
# Copyright 2021 Xiaomi Corp. (authors: Fangjun Kuang)
3+
#
4+
# See ../../../../LICENSE for clarification regarding multiple authors
5+
#
6+
# Licensed under the Apache License, Version 2.0 (the "License");
7+
# you may not use this file except in compliance with the License.
8+
# You may obtain a copy of the License at
9+
#
10+
# http://www.apache.org/licenses/LICENSE-2.0
11+
#
12+
# Unless required by applicable law or agreed to in writing, software
13+
# distributed under the License is distributed on an "AS IS" BASIS,
14+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15+
# See the License for the specific language governing permissions and
16+
# limitations under the License.
17+
18+
19+
"""
20+
This file downloads the following LibriSpeech LM files:
21+
22+
- 3-gram.pruned.1e-7.arpa.gz
23+
- 4-gram.arpa.gz
24+
- librispeech-vocab.txt
25+
- librispeech-lexicon.txt
26+
- librispeech-lm-norm.txt.gz
27+
28+
from http://www.openslr.org/resources/11
29+
and saves them in the user-provided directory; .gz files are also decompressed there.
30+
31+
Files are not re-downloaded if they already exist.
32+
33+
Usage:
34+
./local/download_lm.py --out-dir ./download/lm
35+
"""
36+
37+
import argparse
38+
import gzip
39+
import logging
40+
import os
41+
import shutil
42+
from pathlib import Path
43+
44+
from lhotse.utils import urlretrieve_progress
45+
from tqdm.auto import tqdm
46+
47+
48+
def get_args():
    """Parse the command-line arguments of this script.

    Returns:
      An ``argparse.Namespace`` whose ``out_dir`` attribute holds the
      string passed via ``--out-dir``.

    Raises:
      SystemExit: raised by argparse (exit status 2) when ``--out-dir``
        is missing or the command line is otherwise invalid.
    """
    parser = argparse.ArgumentParser()
    # Required: main() builds a Path from this value, and Path(None) would
    # only fail later with an unhelpful TypeError instead of a usage message.
    parser.add_argument(
        "--out-dir",
        type=str,
        required=True,
        help="Output directory.",
    )
    return parser.parse_args()
54+
55+
56+
def main(out_dir: str):
    """Download the LibriSpeech LM files and decompress the gzipped ones.

    Each file is fetched from http://www.openslr.org/resources/11 into
    ``out_dir`` unless a file of that name is already there. Every
    downloaded ``*.gz`` file is additionally decompressed next to the
    archive (same name without the ``.gz`` suffix), again only if the
    decompressed file does not exist yet.

    Args:
      out_dir:
        Directory in which the files are stored. It is created (including
        parents) if it does not exist.
    """
    url = "http://www.openslr.org/resources/11"
    out_dir = Path(out_dir)
    # Make sure the destination exists before anything is written into it.
    out_dir.mkdir(parents=True, exist_ok=True)

    files_to_download = (
        "3-gram.pruned.1e-7.arpa.gz",
        "4-gram.arpa.gz",
        "librispeech-vocab.txt",
        "librispeech-lexicon.txt",
        "librispeech-lm-norm.txt.gz",
    )

    for f in tqdm(files_to_download, desc="Downloading LibriSpeech LM files"):
        filename = out_dir / f
        if not filename.is_file():
            # Download under a temporary name and rename once complete, so
            # an interrupted transfer never leaves a truncated file that a
            # later run would skip as "already exists".
            partial_download = filename.with_name(filename.name + ".part")
            urlretrieve_progress(
                f"{url}/{f}",
                filename=partial_download,
                desc=f"Downloading {filename}",
            )
            os.replace(partial_download, filename)
        else:
            logging.info(f"{filename} already exists - skipping")

        # Look at the file's own suffix only: a substring test on the whole
        # path would also fire when a parent directory contains ".gz".
        if filename.suffix != ".gz":
            continue

        unzipped = filename.with_suffix("")
        if unzipped.is_file():
            logging.info(f"{unzipped} already exist - skipping")
            continue

        # Same temporary-name-then-rename scheme for the decompressed output.
        partial_unzip = unzipped.with_name(unzipped.name + ".part")
        with gzip.open(filename, "rb") as f_in, open(
            partial_unzip, "wb"
        ) as f_out:
            shutil.copyfileobj(f_in, f_out)
        os.replace(partial_unzip, unzipped)
87+
88+
89+
if __name__ == "__main__":
    # Script entry point: set up INFO-level logging with timestamp, level
    # and source location, then download into the directory given on the
    # command line.
    logging.basicConfig(
        format=(
            "%(asctime)s %(levelname)s "
            "[%(filename)s:%(lineno)d] %(message)s"
        ),
        level=logging.INFO,
    )

    cli_args = get_args()
    logging.info(f"out_dir: {cli_args.out_dir}")
    main(out_dir=cli_args.out_dir)

0 commit comments

Comments
 (0)