update training scripts.

kaldi-asr · danpovey · Feb 11, 2020 · Jan 30, 2020 · Jan 30, 2020 · Jan 30, 2020
commit 154e36680bf401bc3eeff3403f899b4be68667c1
diff --git a/egs/aishell/s10/chain/inference.py b/egs/aishell/s10/chain/inference.py
@@ -34,8 +34,9 @@ def main():
                             output_dim=args.output_dim,
                             lda_mat_filename=args.lda_mat_filename,
                             hidden_dim=args.hidden_dim,
-                            kernel_size_list=args.kernel_size_list,
-                            stride_list=args.stride_list)
+                            bottleneck_dim=args.bottleneck_dim,
+                            time_stride_list=args.time_stride_list,
+                            conv_stride_list=args.conv_stride_list)
 
     load_checkpoint(args.checkpoint, model)
 

diff --git a/egs/aishell/s10/chain/model.py b/egs/aishell/s10/chain/model.py
@@ -201,6 +201,14 @@ def forward(self, x):
 
         return nnet_output, xent_output
 
+    def constrain_orthonormal(self):
+        '''Call constrain_orthonormal() on each tdnnf layer and prefinal layer.'''
+        for tdnnf in self.tdnnfs:
+            tdnnf.constrain_orthonormal()
+        self.prefinal_l.constrain_orthonormal()
+        self.prefinal_chain.constrain_orthonormal()
+        self.prefinal_xent.constrain_orthonormal()
+
 
 if __name__ == '__main__':
     feat_dim = 43
@@ -212,3 +220,4 @@ def forward(self, x):
     x = torch.arange(N * T * C).reshape(N, T, C).float()
     nnet_output, xent_output = model(x)
     print(x.shape, nnet_output.shape, xent_output.shape)
+    model.constrain_orthonormal()
diff --git a/egs/aishell/s10/chain/options.py b/egs/aishell/s10/chain/options.py
@@ -129,18 +129,19 @@ def _check_args(args):
     assert args.feat_dim > 0
     assert args.output_dim > 0
     assert args.hidden_dim > 0
+    assert args.bottleneck_dim > 0
 
-    assert args.kernel_size_list is not None
-    assert len(args.kernel_size_list) > 0
+    assert args.time_stride_list is not None
+    assert len(args.time_stride_list) > 0
 
-    assert args.stride_list is not None
-    assert len(args.stride_list) > 0
+    assert args.conv_stride_list is not None
+    assert len(args.conv_stride_list) > 0
 
-    args.kernel_size_list = [int(k) for k in args.kernel_size_list.split(', ')]
+    args.time_stride_list = [int(k) for k in args.time_stride_list.split(',')]
 
-    args.stride_list = [int(k) for k in args.stride_list.split(', ')]
+    args.conv_stride_list = [int(k) for k in args.conv_stride_list.split(',')]
 
-    assert len(args.kernel_size_list) == len(args.stride_list)
+    assert len(args.time_stride_list) == len(args.conv_stride_list)
 
     assert args.log_level in ['debug', 'info', 'warning']
 
@@ -195,15 +196,21 @@ def get_args():
                         required=True,
                         type=int)
 
-    parser.add_argument('--kernel-size-list',
-                        dest='kernel_size_list',
-                        help='kernel size list',
+    parser.add_argument('--bottleneck-dim',
+                        dest='bottleneck_dim',
+                        help='nn bottleneck dimension',
+                        required=True,
+                        type=int)
+
+    parser.add_argument('--time-stride-list',
+                        dest='time_stride_list',
+                        help='time stride list',
                         required=True,
                         type=str)
 
-    parser.add_argument('--stride-list',
-                        dest='stride_list',
-                        help='stride list',
+    parser.add_argument('--conv-stride-list',
+                        dest='conv_stride_list',
+                        help='conv stride list',
                         required=True,
                         type=str)
 

diff --git a/egs/aishell/s10/chain/tdnnf_layer.py b/egs/aishell/s10/chain/tdnnf_layer.py
@@ -8,7 +8,7 @@
 import torch.nn.functional as F
 
 
-def _constraint_orthonormal_internal(M):
+def _constrain_orthonormal_internal(M):
     '''
     Refer to
         void ConstrainOrthonormalInternal(BaseFloat scale, CuMatrixBase<BaseFloat> *M)
@@ -58,7 +58,7 @@ def __init__(self, dim, bottleneck_dim, time_stride):
         assert time_stride in [0, 1]
         # WARNING(fangjun): kaldi uses [-1, 0] for the first linear layer
         # and [0, 1] for the second affine layer;
-        # We use [-1, 0, 1] for the first linear layer
+        # we use [-1, 0, 1] for the first linear layer if time_stride == 1
 
         if time_stride == 0:
             kernel_size = 1
@@ -79,7 +79,7 @@ def forward(self, x):
         x = self.conv(x)
         return x
 
-    def constraint_orthonormal(self):
+    def constrain_orthonormal(self):
         state_dict = self.conv.state_dict()
         w = state_dict['weight']
         # w is of shape [out_channels, in_channels, kernel_size]
@@ -97,7 +97,7 @@ def constraint_orthonormal(self):
             w = w.t()
             need_transpose = True
 
-        w = _constraint_orthonormal_internal(w)
+        w = _constrain_orthonormal_internal(w)
 
         if need_transpose:
             w = w.t()
@@ -142,6 +142,9 @@ def forward(self, x):
 
         return x
 
+    def constrain_orthonormal(self):
+        self.linear.constrain_orthonormal()
+
 
 class FactorizedTDNN(nn.Module):
     '''
@@ -175,6 +178,8 @@ def __init__(self,
                                         time_stride=time_stride)
 
         # affine requires [N, C, T]
+        # WARNING(fangjun): we do not use nn.Linear here
+        # since we want to use `stride`
         self.affine = nn.Conv1d(in_channels=bottleneck_dim,
                                 out_channels=dim,
                                 kernel_size=1,
@@ -191,31 +196,34 @@ def forward(self, x):
         input_x = x
 
         x = self.linear(x)
+
         # at this point, x is [N, C, T]
 
         x = self.affine(x)
+
         # at this point, x is [N, C, T]
 
         x = F.relu(x)
+
         # at this point, x is [N, C, T]
 
         x = self.batchnorm(x)
+
         # at this point, x is [N, C, T]
 
         # TODO(fangjun): implement GeneralDropoutComponent in PyTorch
 
-        # at this point, x is [N, C, T]
         if self.linear.kernel_size == 3:
             x = self.bypass_scale * input_x[:, :, 1:-1:self.conv_stride] + x
         else:
             x = self.bypass_scale * input_x[:, :, ::self.conv_stride] + x
         return x
 
-    def constraint_orthonormal(self):
-        self.linear.constraint_orthonormal()
+    def constrain_orthonormal(self):
+        self.linear.constrain_orthonormal()
 
 
-def _test_constraint_orthonormal():
+def _test_constrain_orthonormal():
 
     def compute_loss(M):
         P = torch.mm(M, M.t())
@@ -238,7 +246,7 @@ def compute_loss(M):
     loss.append(compute_loss(w))
 
     for i in range(15):
-        w = _constraint_orthonormal_internal(w)
+        w = _constrain_orthonormal_internal(w)
         loss.append(compute_loss(w))
 
     for i in range(1, len(loss)):
@@ -252,11 +260,11 @@ def compute_loss(M):
                            time_stride=1,
                            conv_stride=3)
     loss = []
-    model.constraint_orthonormal()
+    model.constrain_orthonormal()
     loss.append(
         compute_loss(model.linear.conv.state_dict()['weight'].reshape(128, -1)))
     for i in range(5):
-        model.constraint_orthonormal()
+        model.constrain_orthonormal()
         loss.append(
             compute_loss(model.linear.conv.state_dict()['weight'].reshape(
                 128, -1)))
@@ -308,4 +316,4 @@ def _test_factorized_tdnn():
 if __name__ == '__main__':
     torch.manual_seed(20200130)
     _test_factorized_tdnn()
-    _test_constraint_orthonormal()
+    _test_constrain_orthonormal()
diff --git a/egs/aishell/s10/chain/train.py b/egs/aishell/s10/chain/train.py
@@ -11,6 +11,7 @@
 # disable warnings when loading tensorboard
 warnings.simplefilter(action='ignore', category=FutureWarning)
 
+import numpy as np
 import torch
 import torch.optim as optim
 from torch.nn.utils import clip_grad_value_
@@ -84,6 +85,11 @@ def train_one_epoch(dataloader, model, device, optimizer, criterion,
             total_weight += objf_l2_term_weight[2].item()
             num_frames = nnet_output.shape[0]
             total_frames += num_frames
+
+        if np.random.choice(4) == 0:  # on average one batch in four
+            with torch.no_grad():
+                model.constrain_orthonormal()
+
         if batch_idx % 100 == 0:
             logging.info(
                 'Process {}/{}({:.6f}%) global average objf: {:.6f} over {} '
@@ -135,8 +141,9 @@ def main():
                             output_dim=args.output_dim,
                             lda_mat_filename=args.lda_mat_filename,
                             hidden_dim=args.hidden_dim,
-                            kernel_size_list=args.kernel_size_list,
-                            stride_list=args.stride_list)
+                            bottleneck_dim=args.bottleneck_dim,
+                            time_stride_list=args.time_stride_list,
+                            conv_stride_list=args.conv_stride_list)
 
     start_epoch = 0
     num_epochs = args.num_epochs

diff --git a/egs/aishell/s10/conf/mfcc_hires.conf b/egs/aishell/s10/conf/mfcc_hires.conf
@@ -0,0 +1,10 @@
+# config for high-resolution MFCC features, intended for neural network training.
+# Note: we keep all cepstra, so it has the same info as filterbank features,
+# but MFCC is more easily compressible (because less correlated) which is why
+# we prefer this method.
+--use-energy=false       # use average of log energy, not energy.
+--sample-frequency=16000 # AISHELL is sampled at 16kHz
+--num-mel-bins=40        # similar to Google's setup.
+--num-ceps=40            # there is no dimensionality reduction.
+--low-freq=20            # low cutoff frequency for mel bins
+--high-freq=-400         # high cutoff frequency, relative to Nyquist of 8000 (=7600)
diff --git a/egs/aishell/s10/local/run_chain.sh b/egs/aishell/s10/local/run_chain.sh
@@ -9,7 +9,7 @@ stage=0
 
 # GPU device id to use (count from 0).
 # you can also set `CUDA_VISIBLE_DEVICES` and set `device_id=0`
-device_id=0
+device_id=0
 
 nj=10
 
@@ -19,8 +19,8 @@ lat_dir=exp/tri5a_lats # input lat dir
 treedir=exp/chain/tri5_tree # output tree dir
 
 # You should know how to calculate your model's left/right context **manually**
-model_left_context=12
-model_right_context=12
+model_left_context=28
+model_right_context=28
 egs_left_context=$[$model_left_context + 1]
 egs_right_context=$[$model_right_context + 1]
 frames_per_eg=150,110,90
@@ -30,9 +30,10 @@ minibatch_size=128
 num_epochs=6
 lr=1e-3
 
-hidden_dim=625
-kernel_size_list="1, 3, 3, 3, 3, 3" # comma separated list
-stride_list="1, 1, 3, 1, 1, 1" # comma separated list
+hidden_dim=1024
+bottleneck_dim=128
+time_stride_list="1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1" # comma separated list
+conv_stride_list="1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1" # comma separated list
 
 log_level=info # valid values: debug, info, warning
 
@@ -47,11 +48,16 @@ save_nn_output_as_compressed=false
 
 if [[ $stage -le 0 ]]; then
   for datadir in train dev test; do
-    dst_dir=data/fbank_pitch/$datadir
+    dst_dir=data/mfcc_hires/$datadir
     if [[ ! -f $dst_dir/feats.scp ]]; then
+      echo "making mfcc-pitch features for LF-MMI training"
       utils/copy_data_dir.sh data/$datadir $dst_dir
-      echo "making fbank-pitch features for LF-MMI training"
-      steps/make_fbank_pitch.sh --cmd $train_cmd --nj $nj $dst_dir || exit 1
+      steps/make_mfcc_pitch.sh \
+        --mfcc-config conf/mfcc_hires.conf \
+        --pitch-config conf/pitch.conf \
+        --cmd "$train_cmd" \
+        --nj $nj \
+        $dst_dir || exit 1
       steps/compute_cmvn_stats.sh $dst_dir || exit 1
       utils/fix_data_dir.sh $dst_dir
     else
@@ -80,12 +86,12 @@ if [[ $stage -le 2 ]]; then
   # step compared with other recipes.
   steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \
       --context-opts "--context-width=2 --central-position=1" \
-      --cmd "$train_cmd" 5000 data/train $lang $ali_dir $treedir
+      --cmd "$train_cmd" 5000 data/mfcc/train $lang $ali_dir $treedir
 fi
 
 if  [[ $stage -le 3 ]]; then
   echo "creating phone language-model"
-  $train_cmd exp/chain/log/make_phone_lm.log \
+  $train_cmd exp/chain/log/make_phone_lm.log \
     chain-est-phone-lm \
      "ark:gunzip -c $treedir/ali.*.gz | ali-to-phones $treedir/final.mdl ark:- ark:- |" \
      exp/chain/phone_lm.fst || exit 1
@@ -95,7 +101,7 @@ if [[ $stage -le 4 ]]; then
   echo "creating denominator FST"
   copy-transition-model $treedir/final.mdl exp/chain/0.trans_mdl
   cp $treedir/tree exp/chain
-  $train_cmd exp/chain/log/make_den_fst.log \
+  $train_cmd exp/chain/log/make_den_fst.log \
     chain-make-den-fst exp/chain/tree exp/chain/0.trans_mdl exp/chain/phone_lm.fst \
        exp/chain/den.fst exp/chain/normalization.fst || exit 1
 fi
@@ -119,7 +125,7 @@ if [[ $stage -le 5 ]]; then
     --right-tolerance 5 \
     --srand 0 \
     --stage -10 \
-    data/fbank_pitch/train \
+    data/mfcc_hires/train \
     exp/chain $lat_dir exp/chain/egs
 fi
 
@@ -157,16 +163,17 @@ if [[ $stage -le 8 ]]; then
 
   # sort the options alphabetically
   python3 ./chain/train.py \
+    --bottleneck-dim $bottleneck_dim \
     --checkpoint=${train_checkpoint:-} \
+    --conv-stride-list "$conv_stride_list" \
     --device-id $device_id \
     --dir exp/chain/train \
     --feat-dim $feat_dim \
     --hidden-dim $hidden_dim \
     --is-training true \
-    --kernel-size-list "$kernel_size_list" \
     --log-level $log_level \
     --output-dim $output_dim \
-    --stride-list "$stride_list" \
+    --time-stride-list "$time_stride_list" \
     --train.cegs-dir exp/chain/merged_egs \
     --train.den-fst exp/chain/den.fst \
     --train.egs-left-context $egs_left_context \
@@ -186,20 +193,21 @@ if [[ $stage -le 9 ]]; then
       best_epoch=$(cat exp/chain/train/best-epoch-info | grep 'best epoch' | awk '{print $NF}')
       inference_checkpoint=exp/chain/train/epoch-${best_epoch}.pt
       python3 ./chain/inference.py \
+        --bottleneck-dim $bottleneck_dim \
         --checkpoint $inference_checkpoint \
+        --conv-stride-list "$conv_stride_list" \
         --device-id $device_id \
         --dir exp/chain/inference/$x \
         --feat-dim $feat_dim \
-        --feats-scp data/fbank_pitch/$x/feats.scp \
+        --feats-scp data/mfcc_hires/$x/feats.scp \
         --hidden-dim $hidden_dim \
         --is-training false \
-        --kernel-size-list "$kernel_size_list" \
         --log-level $log_level \
         --model-left-context $model_left_context \
         --model-right-context $model_right_context \
         --output-dim $output_dim \
         --save-as-compressed $save_nn_output_as_compressed \
-        --stride-list "$stride_list" || exit 1
+        --time-stride-list "$time_stride_list" || exit 1
     fi
   done
 fi
@@ -228,7 +236,7 @@ if [[ $stage -le 11 ]]; then
 
   for x in test dev; do
     ./local/score.sh --cmd "$decode_cmd" \
-      data/fbank_pitch/$x \
+      data/mfcc_hires/$x \
       exp/chain/graph \
       exp/chain/decode_res/$x || exit 1
   done