forked from Snowdar/asv-subtools
-
Notifications
You must be signed in to change notification settings - Fork 0
/
runMultiTaskXvector.sh
executable file
·268 lines (221 loc) · 10.8 KB
/
runMultiTaskXvector.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
#!/bin/bash
# Copyright Tsinghua (Author:YiLiu earlier than 2018-10-01)
# xmuspeech (Author:Snowdar 2018-10-01)
# This script trains a multitask xvector network with two tasks: speaker/language recognition and
# phonetic discrimination. It differs from the script "runPhoneticXvector.sh", which trains the xvector
# model after the phonetic model and only splices the output of a certain hidden layer of the phonetic
# model into a certain hidden layer of the xvector model. Instead, this script directly trains a joint
# network in which the xvector network and the phonetic network share some hidden layers, which means
# that the network params are updated in turn by randomly training on phonetic egs or speaker/language egs.
#
# [Reference]
# "Liu, Y., He, L., Liu, J. and Johnson, M.T.. Speaker Embedding Extraction with Phonetic Information. Interspeech 2018."
set -e

# ---------------------------------------------------------------------------
# User options. Every variable declared before parse_options.sh is sourced can
# be overridden on the command line as --name value (dashes map to
# underscores); options for variables that are not declared here are rejected.
# ---------------------------------------------------------------------------

# Stage control: a stage N runs when stage <= N <= endstage.
stage=0
endstage=6
train_stage=-10 # forwarded to train_cvector_dnn.py as --stage

use_gpu=true
clean=true # if "true" and final.raw exists, egs files and the *_nosil data dirs are deleted
remove_egs=true # forwarded to the trainer as --cleanup.remove-egs
force=false # if "true", stage 6 re-extracts embeddings even when xvector.scp already exists
cmn=true # do sliding cmn when getting egs

# Written to $nnet_dir/control.conf as SleepTime / Limit.
sleep_time=3
model_limit=8

phonetic_vad=false # if true,it works in both feats and ali during getting egs and it is equal to remove 'sil' phone
phonetic_min_len=20
xv_min_chunk=60
xv_max_chunk=80 # equal to xv_min_len:utt-length should be always >= xv_max_chunk
num_archives=150

xvTrainData=data/plp_20_5.0/baseTrain
phoneticTrainData=data/plp_20_5.0/thchs30_train # the feat-type and dim of two traindatas should be consistent
phoneticAliDir=exp/thchs30_train_dnn_ali # get ali from a am model by yourself
outputname=base_multiTask_xv_plp_20_5.0_cmn # just a output name and the real output-path is exp/$outputname

. subtools/path.sh
. subtools/kaldi/utils/parse_options.sh
########## auto variables ################
# All outputs of this run live under exp/$outputname.
nnet_dir=exp/$outputname
phonetic_egs_dir=$nnet_dir/phonetic_egs
xv_egs_dir=$nnet_dir/xvector_egs
mkdir -p $nnet_dir

# control.conf records the SleepTime / Limit settings for this run.
echo -e "SleepTime=$sleep_time\nLimit=$model_limit" > $nnet_dir/control.conf

# The two training sets feed one shared network, so their feature dims must match.
xv_feat_dim=$(feat-to-dim scp:$xvTrainData/feats.scp -) || exit 1
phonetic_feat_dim=$(feat-to-dim scp:$phoneticTrainData/feats.scp -) || exit 1
if [ $xv_feat_dim != $phonetic_feat_dim ]; then
  echo "[exit] Dim of $xvTrainData is not equal to $phoneticTrainData"
  exit 1
fi
feat_dim=$xv_feat_dim
#### stage --> go #####
# Stage 0: rebuild the "_nosil" copy of the xvector training data from scratch.
if [[ $stage -le 0 && 0 -le $endstage ]]; then
  echo "[stage 0] Prepare xvTrainData dir with no nonspeech frames for xvector egs"
  # Remove leftovers of a previous run: the data dir and its feature dir.
  for stale_dir in ${xvTrainData}_nosil exp/features/${xvTrainData}_nosil; do
    rm -rf $stale_dir
  done
  subtools/kaldi/sid/nnet3/xvector/prepare_feats_for_egs.sh \
    --nj 20 \
    --cmd "run.pl" \
    $xvTrainData \
    ${xvTrainData}_nosil \
    exp/features/${xvTrainData}_nosil
fi
# Stage 1: filter both training sets with subtools/removeUtt.sh.
if [[ $stage -le 1 && 1 -le $endstage ]]; then
  echo "[stage 1] Remove utts whose length is less than the lower limit value"
  # Phonetic data is limited by $phonetic_min_len, the _nosil xvector data by $xv_max_chunk.
  subtools/removeUtt.sh $phoneticTrainData $phonetic_min_len
  subtools/removeUtt.sh ${xvTrainData}_nosil $xv_max_chunk
fi
# Name of the output node of the phonetic branch. It is used in the xconfig
# below and handed to the trainer (--am-output-name) in stage 5, so it is set
# outside the stage guard.
phonetic_output="phonetic_output"

# Stage 2: write the shared-trunk xconfig and expand it twice, once with the
# xvector branch as "output" and once with the phonetic branch as "output".
if [[ $stage -le 2 && 2 -le $endstage ]]; then
  echo "[stage 2] Prepare multitask network config"

  # Output sizes: num-pdfs reported by tree-info, and the line count of spk2utt.
  phonetic_num_targets=$(tree-info "$phoneticAliDir/tree" | grep num-pdfs | awk '{print $2}') || exit 1
  xv_num_targets=$(awk '{print $1}' "$xvTrainData/spk2utt" | sort | wc -l | awk '{print $1}') || exit 1

  max_chunk_size=10000
  min_chunk_size=25

  mkdir -p "$nnet_dir/configs/phonetic"

  # NOTE: the here-document body and its EOF terminator must stay at column 0.
  cat <<EOF > "$nnet_dir/configs/network.xconfig"
# please note that it is important to have input layer with the name=input
# The frame-level layers
input dim=${feat_dim} name=input
# shared layers
relu-batchnorm-layer name=tdnn1 input=Append(-2,-1,0,1,2) dim=512
relu-batchnorm-layer name=tdnn2 input=Append(-2,0,2) dim=512
relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=512
relu-batchnorm-layer name=tdnn4 dim=512
# phonetic branch
relu-batchnorm-layer name=phonetic_tdnn5 dim=512 input=tdnn4
relu-batchnorm-layer name=phonetic_tdnn6 dim=512
relu-batchnorm-layer name=phonetic_tdnn7 dim=512
output-layer name=$phonetic_output dim=$phonetic_num_targets max-change=1.5
# xvector branch
relu-batchnorm-layer name=tdnn5 dim=1500 input=tdnn4
# The stats pooling layer. Layers after this are segment-level.
# In the config below, the first and last argument (0, and ${max_chunk_size})
# means that we pool over an input segment starting at frame 0
# and ending at frame ${max_chunk_size} or earlier. The other arguments (1:1)
# mean that no subsampling is performed.
stats-layer name=stats config=mean+stddev(0:1:1:${max_chunk_size})
# This is where we usually extract the embedding (aka xvector) from.
relu-batchnorm-layer name=tdnn6 dim=512 input=stats
# This is where another layer the embedding could be extracted
# from, but usually the previous one works better.
relu-batchnorm-layer name=tdnn7 dim=512
output-layer name=output include-log-softmax=true dim=${xv_num_targets}
EOF

  # parse nnet config with phonetic as main branch,but we just need the "vars" file here
  # Order matters: first rename the xvector "output" node out of the way, then
  # promote the phonetic node to "output". The second sed program is
  # double-quoted so that $phonetic_output expands inside a single shell word.
  sed 's/name=output/name=xvector_output/g' "$nnet_dir/configs/network.xconfig" \
    | sed "s/name=${phonetic_output}/name=output/g" > "$nnet_dir/configs/phonetic/network.xconfig"
  subtools/kaldi/steps/nnet3/xconfig_to_configs.py \
    --xconfig-file "$nnet_dir/configs/phonetic/network.xconfig" \
    --config-dir "$nnet_dir/configs/phonetic"

  # parse nnet config with xvector as main branch and use it to init raw model
  subtools/kaldi/steps/nnet3/xconfig_to_configs.py \
    --xconfig-file "$nnet_dir/configs/network.xconfig" \
    --config-dir "$nnet_dir/configs"

  # Keep one "vars" file per branch under distinct names.
  cp "$nnet_dir/configs/vars" "$nnet_dir/configs/vars_xvec"
  cp "$nnet_dir/configs/phonetic/vars" "$nnet_dir/configs/vars_am"

  # some configs for extracting xvector
  # extract.config defaults to the tdnn6 variant.
  echo "output-node name=output input=tdnn6.affine" > "$nnet_dir/extract_tdnn6.config"
  cp -f "$nnet_dir/extract_tdnn6.config" "$nnet_dir/extract.config"
  echo "output-node name=output input=tdnn7.affine" > "$nnet_dir/extract_tdnn7.config"
  echo "$max_chunk_size" > "$nnet_dir/max_chunk_size"
  echo "$min_chunk_size" > "$nnet_dir/min_chunk_size"
fi
# note:
# For the train_cvector_dnn.py script (by YiLiu), the number of egs of xvector and phonetic should be equal and
# *egs.*.scp is required to exist in both the xvector and phonetic egs dirs. Also, the "archive_chunk_lengths"
# file should exist in $xv_egs_dir rather than $xv_egs_dir/temp, where this file is initially generated.
#
# The reason the two types of egs are not combined before training is that the author provides a C++ program,
# "nnet3-copy-cvector-egs", which can combine multitask egs temporarily during training. It is not strictly
# required, because "nnet3-copy-egs" (provided by kaldi) can also achieve this with the option --outputs, though
# that could be more tedious than "nnet3-copy-cvector-egs". How the C++ training programs such as nnet3-train and
# nnet3-compute-prob recognize multi-task egs in order to update the params of different branches of a shared
# network is very interesting: it relies on the format of the egs. A string like "<NnetIo> output <I1V>" means
# that the eg will be used for the output-node (see the parsed config) whose name is "output", and the other
# branches are ignored. So the output-node named "output" is the main branch, and the others, such as
# "phonetic_output", are secondary branches (with regard to "nnet3-compute"). With "nnet3-[am-]copy" you can
# still change this master-slave relationship at any time, even when you only have a final.raw/final.mdl.
## xvector egs ##
##############################################
# Stage 3: generate the xvector egs from the _nosil data dir into $xv_egs_dir.
if [[ $stage -le 3 && 3 -le $endstage ]]; then
  echo "[stage 3] get xvector egs"
  # Options are collected first so that the call itself stays short.
  xv_egs_opts=(
    --cmd "run.pl"
    --nj 20
    --stage 0
    --num-train-archives $num_archives
    --frames-per-iter-diagnostic 100000
    --min-frames-per-chunk $xv_min_chunk
    --max-frames-per-chunk $xv_max_chunk
    --num-diagnostic-archives 3
    --num-repeats 6000
  )
  subtools/kaldi/sid/nnet3/xvector/get_egs.sh "${xv_egs_opts[@]}" \
    "${xvTrainData}_nosil" $xv_egs_dir
  # training script needs this file
  cp -f $xv_egs_dir/temp/archive_chunk_lengths $xv_egs_dir
fi
## phonetic egs ##
##############################################
# Stage 4: generate the phonetic egs into $phonetic_egs_dir.
if [[ $stage -le 4 && 4 -le $endstage ]]; then
  echo "[stage 4] get phonetic egs"

  # Model context comes from the "vars" file of the phonetic-as-main config.
  phonetic_vars=$nnet_dir/configs/phonetic/vars
  left_context=$(grep 'model_left_context' $phonetic_vars | cut -d '=' -f 2) || exit 1
  right_context=$(grep 'model_right_context' $phonetic_vars | cut -d '=' -f 2) || exit 1

  # Reuse the archive count recorded by the xvector egs, overriding the option value.
  num_archives=$(cat $xv_egs_dir/info/num_archives) || exit 1

  # Defaults to 1 unless the alignment dir provides its own value.
  frame_subsampling_factor=1
  if [ -f $phoneticAliDir/frame_subsampling_factor ]; then
    frame_subsampling_factor=$(awk '{print $1}' $phoneticAliDir/frame_subsampling_factor)
  fi

  phonetic_egs_opts=(
    --cmd "run.pl"
    --nj 10
    --stage 0
    --cmn $cmn
    --frame-subsampling-factor $frame_subsampling_factor
    --vad $phonetic_vad
    --generate-egs-scp true
    --num-archives $num_archives
    --frames-per-eg 1
    --left-context $left_context
    --right-context $right_context
  )
  subtools/kaldi/sid/nnet3/get_egs.sh "${phonetic_egs_opts[@]}" \
    ${phoneticTrainData} $phoneticAliDir $phonetic_egs_dir
fi
# Stage 5: train the joint network with train_cvector_dnn.py. The phonetic
# ("am") branch and the xvector ("xvec") branch each get their own egs dir,
# output-node name and weight.
if [[ $stage -le 5 && 5 -le $endstage ]]; then
  echo "[stage 5] train multitask nnet3 raw model"
  dropout_schedule='0,0@0.20,0.1@0.50,0'
  srand=123
  # --use-gpu follows the script-level $use_gpu option instead of a hard-coded
  # "true", so that --use-gpu false on the command line takes effect.
  subtools/kaldi/steps_multitask/nnet3/train_cvector_dnn.py --stage="$train_stage" \
    --cmd="run.pl" \
    --trainer.optimization.proportional-shrink 10 \
    --trainer.optimization.momentum=0.5 \
    --trainer.optimization.num-jobs-initial=2 \
    --trainer.optimization.num-jobs-final=8 \
    --trainer.optimization.initial-effective-lrate=0.001 \
    --trainer.optimization.final-effective-lrate=0.0001 \
    --trainer.optimization.minibatch-size="256;64" \
    --trainer.srand="$srand" \
    --trainer.max-param-change=2 \
    --trainer.num-epochs=3 \
    --trainer.dropout-schedule="$dropout_schedule" \
    --trainer.shuffle-buffer-size=1000 \
    --cleanup.remove-egs="$remove_egs" \
    --cleanup.preserve-model-interval=500 \
    --use-gpu="$use_gpu" \
    --am-output-name="$phonetic_output" \
    --am-weight=1.0 \
    --am-egs-dir="$phonetic_egs_dir" \
    --xvec-output-name="output" \
    --xvec-weight=1.0 \
    --xvec-egs-dir="$xv_egs_dir" \
    --dir="$nnet_dir" || exit 1
fi
# Cleanup: runs whenever a final model exists and cleaning is enabled,
# independent of the stage range. Removes the egs files and the _nosil data.
if [[ -f $nnet_dir/final.raw && "$clean" == "true" ]]; then
  for egs_dir in $xv_egs_dir $phonetic_egs_dir; do
    rm -f $egs_dir/egs*
  done
  for nosil_dir in ${xvTrainData}_nosil exp/features/${xvTrainData}_nosil; do
    rm -rf $nosil_dir
  done
fi
# Stage 6: extract embeddings for every dataset in $toEXdata from every layer
# in $layer. Results go to $nnet_dir/<layer>/<dataset>.
if [[ $stage -le 6 && 6 -le $endstage ]]; then
  echo "[stage 6] extract multitask-xvectors of several datasets"
  prefix=plp_20_5.0
  toEXdata="baseTrain test_1s test_1s_concat_sp"
  layer="tdnn6"
  nj=20
  gpu=false
  cache=1000
  # $toEXdata and $layer are space-separated lists; word splitting is intended.
  for x in $toEXdata; do
    for y in $layer; do
      out_dir=$nnet_dir/$y/$x

      # Count ERROR lines left in the logs of a previous extraction, if any.
      num=0
      if [ -f "$out_dir/xvector.scp" ]; then
        num=$(grep ERROR "$out_dir"/log/extract.*.log | wc -l)
      fi

      # (Re-)extract when forced, when nothing was extracted yet, or when the
      # previous run logged errors.
      if [[ "${force:-false}" == "true" || ! -f "$out_dir/xvector.scp" || $num -gt 0 ]]; then
        subtools/kaldi/sid/nnet3/xvector/extract_xvectors.sh \
          --cache-capacity $cache --extract-config "extract_${y}.config" \
          --use-gpu $gpu --nj $nj "$nnet_dir" "data/${prefix}/$x" "$out_dir"
      fi

      # Create (or truncate) an empty marker file named after the feature prefix.
      : > "$out_dir/$prefix"
      echo "$y layer embeddings of data/$prefix/$x extracted done."
    done
  done
fi