#!/bin/bash
#
# Trains a speech recognizer using HTK. Starts from a flat start
# model. The final trained HMM set can be found in $TMPDIR/hmm_final/hmmdefs
TRAINFILE=$1
TRAINFEATFILE=$TRAINFILE
GRAMFILE=$2
WDLIST=$3
DICTFILE=$4
#word level transcripts
TRAINTRANSFILE=$5
#TRAINEVAL=./train_eval.mlf
PROTO_FILE=$6
# how many Gaussian mixture components per state?
NMIX=$7
TMPDIR=$8
cd $TMPDIR
# given a bunch of training wav files listed in $TRAINFILE, a grammar in
# $GRAMFILE, a word list file in $WDLIST, a pronunciation dictionary
# $DICTFILE, and a file of word level transcriptions for all of the
# training data $TRAINTRANSFILE, this script (and its associated files)
# will bootstrap and train an HTK recognizer...
# files it depends on:
# sil.hed
# outputs:
# wdnet - HTK grammar
# dlog - dictionary log (check for errors)
# monophones0 - list of monophones used by the pronunciation dictionary
# standard command line options (to get reasonable error reports)
#SOPTS="-A -D -T 1"
#SOPTS="-A -D "
SOPTS=$9
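# example invocation (all file names below are purely illustrative):
#   ./train_htk_recognizer.sh train.scp grammar wdlist dict words.mlf \
#       proto 8 /tmp/htk_train "-A -D -T 1"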
# trap any errors when running an HTK command
function runcmd ()
{
"$@"
local ERRORCODE=$?
if [ $ERRORCODE -ne 0 ]
then exit $ERRORCODE
fi
}
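# e.g. "runcmd HParse $GRAMFILE wdnet" runs HParse and aborts the whole
# script with HParse's exit code if the command fails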
# we do this a lot...
# arg 1 = starting HMM
# arg 2 = ending HMM (number of last HMM to be trained)
# arg 3 = name of file containing transcripts to use
# arg 4 = name of file containing list of symbols (e.g. phones) in transcripts
function trainhmms ()
{
local x=$1
while test $x -le $(($2-1))
do mkdir hmm$(($x+1))
#runcmd HERest $SOPTS -I $3 -s statistics -t 250.0 250.0 3000.0 -S $TRAINFILE -H hmm$x/macros -H hmm$x/hmmdefs -M hmm$(($x+1)) $4
#runcmd HERest $SOPTS -I $3 -t 250.0 250.0 3000.0 -S $TRAINFILE -H hmm$x/macros -H hmm$x/hmmdefs -M hmm$(($x+1)) $4
runcmd HERest $SOPTS -I $3 -t 250.0 250.0 3000.0 -v 1 -S $TRAINFILE -H hmm$x/macros -H hmm$x/hmmdefs -M hmm$(($x+1)) $4
x=$((x+1))
done
HMM=$x
}
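# e.g. "trainhmms 0 3 phones0.mlf monophones0" re-estimates hmm1..hmm3,
# each HERest pass reading the previous directory's macros/hmmdefs and
# leaving $HMM set to 3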
# #global config:
# export HCONFIG="
# SOURCEFORMAT = HTK
# SOURCEKIND = USER
# TARGETFORMAT = HTK
# TARGETKIND = USER"
# 1. make a grammar that HTK understands
if [ -e $GRAMFILE ]
then
# is the grammar already an SLF lattice file?
if [ `head -1 $GRAMFILE | cut -d " " -f 1 | cut -d "=" -f 1` == "VERSION" ]
then cp $GRAMFILE wdnet
else
runcmd HParse $GRAMFILE wdnet
fi
fi
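# if $GRAMFILE was not already an SLF lattice, HParse above expects its
# EBNF-style notation; a minimal illustrative grammar:
#   $digit = ONE | TWO | THREE;
#   ( SENT-START <$digit> SENT-END )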
# 2. compile an HTK formatted dictionary
runcmd HDMan $SOPTS -m -w $WDLIST -n monophones0 -e . -l dlog dict $DICTFILE
# add sil to monophones0
cp monophones0 tmp;
echo sil >> tmp;
sort tmp | uniq > monophones0;
rm -f tmp;
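# $DICTFILE is expected in the usual HTK dictionary layout, one word and
# its pronunciation per line, e.g. (entries are illustrative):
#   CALL   k ao l
#   DIAL   d ay ah l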
# 3. convert word level transcripts to phone level transcripts
echo "EX
IS sil sil
DE sp" > mkphones0.led
# FIXME
echo "EX
DE sp" > mkphones0.led
runcmd HLEd $SOPTS -l '*' -d dict -i phones0.mlf mkphones0.led $TRAINTRANSFILE
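# $TRAINTRANSFILE is a word-level MLF; an illustrative entry:
#   #!MLF!#
#   "*/utt001.lab"
#   CALL
#   DIAL
#   .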
# This script doesn't do feature extraction.
## 4. feature extraction from training data:
##HCopy $SOPTS -S $TRAINFEATFILE
# 5. HMM training:
# a. compute an initial prototype phone HMM - HCompV just gets the
# overall mean/variance of all the training data
HMM=0
mkdir hmm$HMM
runcmd HCompV $SOPTS -f 0.01 -m -S $TRAINFILE -M hmm$HMM $PROTO_FILE
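# the feature vector size is read off the prototype's "<Mean> N" line
# (assumes the usual HTK proto layout, with the dimension after <Mean>)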
VECSIZE=`cat $PROTO_FILE | grep -i MEAN | head -1 | cut -s -d ">" -f 2 | tr -d ' '`
# b. copy the phone HMM in proto (above) into a new HMM for every
# phone in monophones0 in hmmdefs. also create basic macro file
echo "~o" > hmm$HMM/macros
echo "<VecSize> $VECSIZE" >> hmm$HMM/macros
echo "<USER>" >> hmm$HMM/macros
#echo "<MFCC_0_D_A>" >> hmm$HMM/macros
cat hmm$HMM/vFloors >> hmm$HMM/macros
proto=$(basename $PROTO_FILE)
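# clone the re-estimated prototype for every monophone: drop the first
# four header lines of hmm$HMM/$proto and emit the rest once per phone,
# prefixed with ~h "<phone>", to build hmmdefs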
perl -ne "BEGIN {open(FD,\"hmm$HMM/$proto\"); @proto=<FD>; close FD; @proto=@proto[4..\$#proto];} chop; print \"\~h \\\"\$_\\\"\n@proto\n\";" monophones0 > hmm$HMM/hmmdefs
# c. do some training rounds (monophones0 should contain sil but not sp)
trainhmms $HMM $((HMM+3)) phones0.mlf monophones0
# e. realign the training data, forcing the beginning and end of the
# utterance to fall into the silence state
echo "silence sil" >> dict
#HVite $SOPTS -l '*' -o SWT -b silence -a -H hmm$HMM/macros -H hmm$HMM/hmmdefs -i aligned.mlf -m -t 250.0 -y lab -I $TRAINTRANSFILE -S $TRAINFILE dict monophones1
runcmd HVite $SOPTS -l '*' -o SWT -b silence -a -H hmm$HMM/macros -H hmm$HMM/hmmdefs -i aligned.mlf -m -y lab -I $TRAINTRANSFILE -S $TRAINFILE dict monophones0
trainhmms $HMM $((HMM+3)) aligned.mlf monophones0
# f. do a sequence of mixture splits (up to $NMIX mixture components
# per state for every phone)
NSTATES=`grep -i NUMSTATES $PROTO_FILE | sed -e 's/[^0-9]*//'`
# The HTK book recommends incrementing by only one or two components
# at a time, since HHEd uses its own heuristics to decide how best to
# split the existing components
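# HHEd's MU command ("MU N {...mix}") raises the mixture count of the
# matched states to N, so each pass below adds at most two components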
for ((N=2; N <= NMIX-1; N+=2))
do echo "MU $N {*.state[2-$(($NSTATES-1))].mix}" > mu.hed
echo Splitting GMMs to $N components...
if test ! -d hmm$(($HMM+1))
then mkdir hmm$(($HMM+1));
fi
runcmd HHEd $SOPTS -H hmm$HMM/macros -H hmm$HMM/hmmdefs -M hmm$(($HMM+1)) mu.hed monophones0
HMM=$(($HMM+1))
trainhmms $HMM $((HMM+3)) aligned.mlf monophones0
done
# make sure we get the correct number of mixture components
echo Splitting GMMs to $NMIX components...
echo "MU $((NMIX)) {*.state[2-$(($NSTATES-1))].mix}" > mu.hed
if test ! -d hmm$(($HMM+1))
then mkdir hmm$(($HMM+1));
fi
runcmd HHEd $SOPTS -H hmm$HMM/macros -H hmm$HMM/hmmdefs -M hmm$(($HMM+1)) mu.hed monophones0
HMM=$(($HMM+1))
trainhmms $HMM $((HMM+3)) aligned.mlf monophones0
# Done!
mv hmm$HMM hmm_final
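# the model in hmm_final can now be used for recognition; an illustrative
# decode over a test script file (test.scp / recout.mlf are placeholders):
#   HVite $SOPTS -H hmm_final/macros -H hmm_final/hmmdefs -S test.scp \
#         -i recout.mlf -w wdnet dict monophones0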