-
Notifications
You must be signed in to change notification settings - Fork 0
/
kenlm_perplexity.sh
91 lines (74 loc) · 2.75 KB
/
kenlm_perplexity.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
# Will maybe do this
if [ "$stage" -le 4 ]; then
  echo "Create 3gram language model from kenlm"
  # Strip the utterance-id column and basic punctuation from the Kaldi
  # text file to get raw LM training text.  cut reads the file directly;
  # no need for 'cat file | cut'.
  cut -d' ' -f2- data/train/text | sed -e \
    's/[,.?!]//g' > data/text_lm_training.txt
  mkdir -p data/lang_3gsmall
  # language modeling files are typically in data/lang_Xgram
  # NOTE(review): $big_memory_cmd is deliberately unquoted — it is
  # presumably a Kaldi-style run command that may expand to several
  # words (e.g. 'run.pl --mem 16G'); confirm against its definition.
  $big_memory_cmd logs/make_LM_3gsmall.log local/make_LM.sh --order 3 \
    --small true --carpa false data/text_lm_training.txt data/lang/ \
    data/local/dict/lexicon.txt data/lang_3gsmall
fi
#!/bin/bash
# make_LM.sh — build an n-gram language model with KenLM's lmplz and
# package it for Kaldi decoding (const-ARPA or G.fst).
#
# 'set -e' lives here rather than in the shebang: shebang options are
# silently dropped when the script is invoked as 'bash script.sh'.
set -e
set -o pipefail

# Defaults; all of these are overridable via parse_options.sh flags.
stage=0
order=4       # n-gram order of the LM
small=false   # pruned or not
pruning=      # lmplz pruning spec, e.g. '--prune 0 0 1'
carpa=true    # build a constant ARPA LM if true, else compile to FST

. ./path.sh
. parse_options.sh || exit 1;
. ./local/utils.sh
# Require exactly four positional arguments; print usage otherwise.
if [ $# != 4 ]; then
  echo "This script creates language models"
  echo ""
  echo "Usage: local/make_LM.sh [options] <input-text-file> <lang-dir> <dict-dir> <language-model-dir>"
  echo "e.g.: local/make_LM.sh data/language_model/LMtext.txt data/lang data/local/dict/lexicon.txt models/language_model/"
  echo ""
  echo "Options:"
  echo " --order <num> # The ngram order of the LM"
  echo " --small <bool> # Prune if true (default: false)"
  echo " --pruning <string> # How to prune, e.g. '--prune 0 0 1' (default --prune 0 3 5)"
  echo " --carpa <bool> # Make a constant arpa lm if true, otherwise convert arpa to fst"
  exit 1;
fi

lmtext=$1    # LM training text, one sentence per line
lang=$2      # existing Kaldi lang directory to copy non-G files from
lexicon=$3   # lexicon.txt from the dict dir
dir=$4       # output directory for the language model

# Fail fast on missing inputs.  Paths are quoted so empty values or
# spaces don't break the tests.
[ ! -d "$lang" ] && echo "$0: expected $lang to exist" && exit 1;
for f in "$lmtext" "$lexicon"; do
  [ ! -f "$f" ] && echo "$0: expected $f to exist" && exit 1;
done
# Naming affixes for the output dir / arpa file; only set when pruning.
suffix=
affix=
if [ "$small" = true ]; then
  suffix=small
  affix=_035pruned
  # Default pruning spec — but respect a user-supplied --pruning value
  # instead of clobbering it (the original overwrote it unconditionally,
  # making the documented --pruning option a no-op with --small true).
  pruning=${pruning:-"--prune 0 3 5"}
fi
if [ "$stage" -le 1 ]; then
  # Preparing the language model directory: copy every lang file except
  # the grammar (G) from $lang, skipping files already present.
  mkdir -p "$dir/lang_${order}g${suffix}"
  for s in L_disambig.fst L.fst oov.int oov.txt phones phones.txt \
    topo words.txt; do
    [ ! -e "$dir/lang_${order}g${suffix}/$s" ] && cp -r "$lang/$s" "$dir/lang_${order}g${suffix}/$s"
  done
  echo "Build ARPA-format language model"
  # $pruning is deliberately unquoted: it may expand to several lmplz
  # arguments (e.g. '--prune 0 3 5') or to nothing at all.
  # --limit_vocab_file restricts the LM vocabulary to the lang dir's
  # word list, minus the <eps> and <unk> symbols (grep -E replaces the
  # deprecated egrep).
  lmplz \
    --skip_symbols \
    -o "${order}" -S 70% $pruning \
    --text "$lmtext" \
    --limit_vocab_file <(cut -d' ' -f1 "$dir/lang_${order}g${suffix}/words.txt" | grep -Ev "<eps>|<unk>") \
    | gzip -c > "$dir/lang_${order}g${suffix}/kenlm_${order}g${affix}.arpa.gz" || error 1 "lmplz failed"
fi
if [ "$stage" -le 2 ]; then
  if [ "$carpa" = true ]; then
    # Compact const-ARPA representation, used for e.g. lattice rescoring.
    echo "Build constant ARPA language model"
    utils/build_const_arpa_lm.sh \
      "$dir/lang_${order}g${suffix}/kenlm_${order}g${affix}.arpa.gz" \
      "$lang" "$dir/lang_${order}g${suffix}" || error 1 "Failed creating a const. ARPA LM"
  else
    # Compile the ARPA LM into G.fst for first-pass decoding.
    echo "Convert ARPA-format language models to FSTs."
    utils/format_lm.sh \
      "$lang" "$dir/lang_${order}g${suffix}/kenlm_${order}g${affix}.arpa.gz" \
      "$lexicon" "$dir/lang_${order}g${suffix}" || error 1 "Failed creating G.fst"
  fi
fi
exit 0;