#!/bin/bash
# Copyright 2020 Idiap Research Institute (Srikanth Madikeri)
# chain2 recipe for monolingual systems for BABEL
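# Example invocation (illustrative only; the script path and option values
# depend on your setup), e.g.:
#   local/chain2/run_tdnn.sh --stage -1 --nj 30 --train-set train --gmm tri5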

set -e -o pipefail

# First the options that are passed through to run_ivector_common.sh
# (some of which are also used in this script directly).
stage=-1
nj=30
train_set=train
gmm=tri5  # the gmm for the target data
langdir=data/lang
num_threads_ubm=1
nnet3_affix=_cleaned  # cleanup affix for nnet3 and chain dirs, e.g. _cleaned

# The rest are configs specific to this script.  Most of the parameters
# are just hardcoded at this level, in the commands below.
train_stage=-10
tree_affix=  # affix for tree directory, e.g. "a" or "b", in case we change the configuration.
tdnn_affix=  # affix for TDNN directory, e.g. "a" or "b", in case we change the configuration.
common_egs_dir=  # you can set this to use previously dumped egs.
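# Chunk widths (in frames) used when dumping egs; the first value is the
# principal chunk width and the alternatives let utterances of different
# lengths be packed more efficiently.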
chunk_width=150,120,90,75
frame_subsampling_factor=3
# The amount of extra left/right context to put in the egs.  It could be zero
# with this non-recurrent topology, but a little extra context leaves room to
# change the configuration without re-dumping egs.
egs_extra_left_context=5
egs_extra_right_context=5
langs=default  # has multiple values for a multilingual system
srand=-1
num_jobs_initial=2
num_jobs_final=12
initial_effective_lrate=0.001
final_effective_lrate=0.0001
max_param_change=2.0
xent_regularize=0.1
# End configuration section.
echo "$0 $@"  # Print the command line for logging

. ./cmd.sh
. ./path.sh
. ./utils/parse_options.sh

if ! cuda-compiled; then
  cat <<EOF && exit 1
This script is intended to be used with GPUs, but you have not compiled Kaldi with CUDA.
If you want to use GPUs (and have them), go to src/, and run configure and make on a machine
where "nvcc" is installed.
EOF
fi

gmm_dir=exp/$gmm
ali_dir=exp/${gmm}_ali_${train_set}_sp
tree_dir=exp/chain2${nnet3_affix}/tree${tree_affix}
lat_dir=exp/chain2${nnet3_affix}/${gmm}_${train_set}_sp_lats
dir=exp/chain2${nnet3_affix}/tdnn${tdnn_affix}_sp
train_data_dir=data/${train_set}_sp_hires
lores_train_data_dir=data/${train_set}_sp
train_ivector_dir=exp/nnet3${nnet3_affix}/ivectors_${train_set}_sp_hires

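# run_ivector_common.sh takes care of the speed perturbation, high-resolution
# MFCC extraction, and i-vector extractor training/extraction that the chain
# model depends on.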
local/chain/run_ivector_common.sh --stage $stage \
    --nj $nj \
    --train-set $train_set \
    --gmm $gmm \
    --num-threads-ubm $num_threads_ubm \
    --nnet3-affix "$nnet3_affix"


for f in $gmm_dir/final.mdl $train_data_dir/feats.scp $train_ivector_dir/ivector_online.scp \
    $lores_train_data_dir/feats.scp $ali_dir/ali.1.gz; do
  [ ! -f $f ] && echo "$0: expected file $f to exist" && exit 1
done

if [ $stage -le 7 ]; then
  echo "$0: creating lang directory with one state per phone."
  # Create a version of the lang/ directory that has one state per phone in the
  # topo file.  [Note: it really has two states; the first one is only repeated
  # once, the second one has zero or more repeats.]
  if [ -d data/lang_chain ]; then
    if [ data/lang_chain/L.fst -nt data/lang/L.fst ]; then
      echo "$0: data/lang_chain already exists, not overwriting it; continuing"
    else
      echo "$0: data/lang_chain already exists and seems to be older than data/lang..."
      echo " ... not sure what to do.  Exiting."
      exit 1;
    fi
  else
    cp -r $langdir data/lang_chain
    silphonelist=$(cat data/lang_chain/phones/silence.csl) || exit 1;
    nonsilphonelist=$(cat data/lang_chain/phones/nonsilence.csl) || exit 1;
    # Use our special topology... note that we may later have to tune this
    # topology.
    steps/nnet3/chain/gen_topo.py $nonsilphonelist $silphonelist >data/lang_chain/topo
  fi
fi

if [ $stage -le 8 ]; then
  # Get the alignments as lattices (gives the chain training more freedom).
  # Use the same num-jobs as the alignments.
  steps/align_fmllr_lats.sh --nj $nj --cmd "$train_cmd" ${lores_train_data_dir} \
    $langdir $gmm_dir $lat_dir
  rm $lat_dir/fsts.*.gz  # save space
fi

if [ $stage -le 9 ]; then
  # Build a tree using our new topology.  We know we have alignments for the
  # speed-perturbed data (local/chain/run_ivector_common.sh made them), so use
  # those.
  if [ -f $tree_dir/final.mdl ]; then
    echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it."
    exit 1;
  fi
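  # The numeric argument to build_tree.sh below (4000) is the target number of
  # tree leaves, i.e. context-dependent states.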
  steps/nnet3/chain/build_tree.sh --frame-subsampling-factor $frame_subsampling_factor \
    --context-opts "--context-width=2 --central-position=1" \
    --leftmost-questions-truncate -1 \
    --cmd "$train_cmd" 4000 ${lores_train_data_dir} data/lang_chain $ali_dir $tree_dir
fi

if [ $stage -le 10 ]; then
  mkdir -p $dir

  echo "$0: creating neural net configs using the xconfig parser";

  num_targets=$(tree-info $tree_dir/tree | grep num-pdfs | awk '{print $2}')
  [ -z "$num_targets" ] && { echo "$0: error getting num-targets"; exit 1; }
  learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python)
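  # (with the default xent_regularize=0.1, this factor comes out to 5.0)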

  mkdir -p $dir/configs
  cat <<EOF > $dir/configs/network.xconfig
  input dim=100 name=ivector
  input dim=43 name=input

  # please note that it is important to have the input layer with name=input
  # as the layer immediately preceding the fixed-affine-layer, to enable
  # the use of short notation for the descriptor
  fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat

  # the first splicing is moved before the lda layer, so no splicing here
  relu-batchnorm-layer name=tdnn1 dim=450
  relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=450
  relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=450
  relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=450
  relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=450
  relu-batchnorm-layer name=tdnn7 input=Append(-6,-3,0) dim=450

  ## adding the layers for the chain branch
  relu-batchnorm-layer name=prefinal-chain input=tdnn7 dim=450 target-rms=0.5
  output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5
  output-layer name=output-default input=prefinal-chain include-log-softmax=false dim=$num_targets max-change=1.5

  ## adding the layers for the xent branch
  # This block adds a separate output that will be trained with a
  # cross-entropy objective in the 'chain' models; this has the effect of
  # regularizing the hidden parts of the model.  We use 0.5 / xent_regularize
  # as the learning-rate factor: that factor makes the xent final layer learn
  # at a rate independent of the regularization constant, and the 0.5 was
  # tuned so as to make the relative progress similar in the xent and regular
  # final layers.
  relu-batchnorm-layer name=prefinal-xent input=tdnn7 dim=450 target-rms=0.5
  output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5
  output-layer name=output-default-xent input=prefinal-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5

EOF
  steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/
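  # xconfig_to_configs.py writes init.config/init.raw (used to estimate the
  # LDA-like preconditioning matrix) and final.config under $dir/configs;
  # stages 16 and 17 below read them.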
  if [ ! -f $dir/init/default_trans.mdl ]; then  # checking this because it may have been copied in a previous run of the same script
    mkdir -p $dir/init
    copy-transition-model $tree_dir/final.mdl $dir/init/default_trans.mdl || exit 1
  else
    echo "Keeping the old $dir/init/default_trans.mdl as it already exists."
  fi

fi

init_info=$dir/init/info.txt
if [ $stage -le 11 ]; then

  if [ ! -f $dir/configs/ref.raw ]; then
    echo "Expected $dir/configs/ref.raw to exist"
    exit 1
  fi

  mkdir -p $dir/init
  nnet3-info $dir/configs/ref.raw > $dir/configs/temp.info
  model_left_context=$(grep 'left-context' $dir/configs/temp.info | awk '{print $2}')
  model_right_context=$(grep 'right-context' $dir/configs/temp.info | awk '{print $2}')
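  # Record the metadata that the chain2 training scripts expect to find in
  # $dir/init/info.txt.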
  cat >$init_info <<EOF
frame_subsampling_factor $frame_subsampling_factor
langs $langs
model_left_context $model_left_context
model_right_context $model_right_context
EOF
  rm $dir/configs/temp.info
fi


# Make phone LM and denominator and normalization FST
if [ $stage -le 12 ]; then
  echo "$0: Making phone LM and denominator and normalization FST"
  mkdir -p $dir/den_fsts/log

  # We may later reorganize this.
  cp $tree_dir/tree $dir/default.tree

  echo "$0: creating phone language-model"
  $train_cmd $dir/den_fsts/log/make_phone_lm_default.log \
    chain-est-phone-lm --num-extra-lm-states=2000 \
    "ark:gunzip -c $ali_dir/ali.*.gz | ali-to-phones $gmm_dir/final.mdl ark:- ark:- |" \
    $dir/den_fsts/default.phone_lm.fst

  echo "$0: creating denominator FST"
  $train_cmd $dir/den_fsts/log/make_den_fst.log \
    chain-make-den-fst $dir/default.tree $dir/init/default_trans.mdl $dir/den_fsts/default.phone_lm.fst \
    $dir/den_fsts/default.den.fst $dir/den_fsts/default.normalization.fst || exit 1;
fi

model_left_context=$(awk '/^model_left_context/ {print $2;}' $dir/init/info.txt)
model_right_context=$(awk '/^model_right_context/ {print $2;}' $dir/init/info.txt)
if [ -z "$model_left_context" ]; then
  echo "ERROR: Cannot find entry for model_left_context in $dir/init/info.txt"
  exit 1
fi
if [ -z "$model_right_context" ]; then
  echo "ERROR: Cannot find entry for model_right_context in $dir/init/info.txt"
  exit 1
fi
egs_left_context=$[model_left_context+(frame_subsampling_factor/2)+egs_extra_left_context]
egs_right_context=$[model_right_context+(frame_subsampling_factor/2)+egs_extra_right_context]
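# For example, with a model left context of 17, frame_subsampling_factor=3
# (3/2 rounds down to 1) and egs_extra_left_context=5, the egs are dumped
# with a left context of 17 + 1 + 5 = 23 frames.

# On the CLSP grid, spread the (potentially large) egs directories over
# several disks; the hostname check makes this block a no-op elsewhere.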
for d in $dir/raw_egs $dir/processed_egs; do
  if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $d/storage ]; then
    mkdir -p $d
    utils/create_split_dir.pl \
      /export/b0{3,4,5,6}/$USER/kaldi-data/egs/babel-$(date +'%m_%d_%H_%M')/s5/$d/storage $d/storage
  fi
done

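# egs preparation in chain2 is a three-step pipeline: dump raw egs from the
# lattices (stage 13), process them into groups (stage 14), then randomize
# and split them into per-job archives (stage 15).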
if [ -z "$common_egs_dir" ]; then
  if [ $stage -le 13 ]; then
    echo "$0: about to dump raw egs."
    # Dump raw egs.
    steps/chain2/get_raw_egs.sh --cmd "$train_cmd" \
      --lang "default" \
      --online-ivector-dir $train_ivector_dir \
      --left-context $egs_left_context \
      --right-context $egs_right_context \
      --frame-subsampling-factor $frame_subsampling_factor \
      --alignment-subsampling-factor $frame_subsampling_factor \
      --frames-per-chunk $chunk_width \
      ${train_data_dir} ${dir} ${lat_dir} ${dir}/raw_egs
  fi

  if [ $stage -le 14 ]; then
    echo "$0: about to process egs"
    steps/chain2/process_egs.sh --cmd "$train_cmd" \
      ${dir}/raw_egs ${dir}/processed_egs
  fi

  if [ $stage -le 15 ]; then
    echo "$0: about to randomize egs"
    steps/chain2/randomize_egs.sh --frames-per-job 1500000 \
      ${dir}/processed_egs ${dir}/egs
  fi
  common_egs_dir=$dir/egs
fi

if [ $stage -le 16 ]; then
  echo "$0: Training pre-conditioning matrix"
  num_lda_jobs=$(find $common_egs_dir/ -iname 'train.*.scp' | wc -l)
  steps/chain2/compute_preconditioning_matrix.sh --cmd "$train_cmd" \
    --nj $num_lda_jobs \
    $dir/configs/init.raw \
    $common_egs_dir \
    $dir || exit 1
fi

if [ $stage -le 17 ]; then
  echo "$0: Preparing initial acoustic model"
  if [ -f $dir/configs/init.config ]; then
    $train_cmd ${dir}/log/add_first_layer.log \
      nnet3-init --srand=${srand} ${dir}/configs/init.raw \
        ${dir}/configs/final.config ${dir}/init/default.raw || exit 1
  else
    $train_cmd ${dir}/log/init_model.log \
      nnet3-init --srand=${srand} ${dir}/configs/final.config ${dir}/init/default.raw || exit 1
  fi

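  # nnet3-am-init attaches the transition model to the raw network, producing
  # the initial acoustic model for training.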
  $train_cmd $dir/log/init_mdl.log \
    nnet3-am-init ${dir}/init/default_trans.mdl $dir/init/default.raw $dir/init/default.mdl || exit 1
fi

if [ $stage -le 18 ]; then
  echo "$0: Starting model training"
  steps/chain2/train.sh \
    --stage $train_stage --cmd "$cuda_cmd" \
    --xent-regularize $xent_regularize --leaky-hmm-coefficient 0.1 \
    --initial-effective-lrate $initial_effective_lrate \
    --final-effective-lrate $final_effective_lrate \
    --max-param-change $max_param_change \
    --groups-per-minibatch 128 \
    --l2-regularize 0.00005 \
    --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \
    $common_egs_dir $dir
fi

if [ $stage -le 19 ]; then
  # Note: this data/lang_chain directory might appear mismatched, and it is as
  # far as the 'topo' is concerned, but this script doesn't read the 'topo'
  # from the lang directory, so it doesn't matter.
  if [ ! -f $dir/tree ]; then
    cp $tree_dir/tree $dir/tree
  fi
  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_chain $dir $dir/graph
fi

exit 0