Commit 8c7a56b by zhuoyuan yao, committed on Feb 13, 2020 (1 parent: b206879)
Showing 10 changed files with 1,923 additions and 0 deletions.
@@ -0,0 +1,58 @@
# Assignments for the DNN-HMM Based Speech Recognition System

This lesson has two assignments, described below.

## Assignment 1

### Data description

The data for this experiment are 39-dimensional MFCC features extracted from English recordings of 11 characters: the digits 0-9 (where 0 is labeled Z, for "Zero") and the letter O. Specifically:
* Training data: 330 utterances, 11 characters with 30 utterances each, located in the train directory.
* Test data: 110 utterances, 11 characters with 10 utterances each, located in the test directory.
The train and test directories each contain three files (a minimal reading sketch follows this list):
* text: transcription file; the first column of each line is the utterance id, the second is the transcription.
* feats.scp: feature index file; the first column of each line is the utterance id, the second is the index (location) of the features for that utterance.
* feats.ark: the binary file in which the features are actually stored.
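
For example, here is a minimal sketch of inspecting one training utterance. It assumes the kaldi_io package that dnn.py already imports; the scp line format shown in the comment is Kaldi's usual "utt-id path:offset" convention.

``` python
# Minimal sketch: read one utterance's features from the training set.
import kaldi_io

with open('train/feats.scp') as f:
    for line in f:
        # each line looks like: <utt-id> train/feats.ark:<byte-offset>
        utt, ark = line.strip().split(None, 1)
        mat = kaldi_io.read_mat(ark)  # num_frames x 39 MFCC matrix
        print(utt, mat.shape)
        break
```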

### Experiment

This experiment provides a simple DNN framework and uses the DNN to train on and recognize the 11 digits.
The training and test data described above are used to train and test the DNN, respectively.
Read the code in dnn.py, understand the framework, and complete the forward and backward passes of the ReLU activation function and the FullyConnect (fully connected) layer.
You can refer to the forward and backward implementation of Softmax. The insertion points in dnn.py look like this:
``` python
# BEGIN_LAB
# write your code here
# END_LAB
```
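
After filling in a layer, a quick numerical gradient check can help verify it before training. The sketch below is not part of the assignment; it assumes your completed dnn.py is importable and compares ReLU's backward() against a finite-difference estimate.

``` python
# Hypothetical helper, not required by the assignment: numerically check a
# completed layer's backward() against finite differences.
import numpy as np
from dnn import ReLU  # assumes your filled-in dnn.py is on the path

layer = ReLU()
x = np.random.randn(4, 7)
y = layer.forward(x)
d_out = np.random.randn(*y.shape)
analytic = layer.backward(x, y, d_out)

eps = 1e-6
numeric = np.zeros_like(x)
for idx in np.ndindex(*x.shape):
    xp, xm = x.copy(), x.copy()
    xp[idx] += eps
    xm[idx] -= eps
    # derivative of sum(forward(x) * d_out) with respect to x[idx]
    numeric[idx] = np.sum((layer.forward(xp) - layer.forward(xm)) * d_out) / (2 * eps)

print('max abs diff:', np.abs(analytic - numeric).max())  # should be close to 0
```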

### Running and checking

Run the experiment with the following command. At the end, the program prints the accuracy on the test set. With a correct implementation you should get an accuracy above 95%; the author's implementation reaches a classification accuracy of 98.18%.

``` sh
python dnn.py
```

### Extensions

Besides running with the default parameters, you can also tune some hyperparameters and observe how they affect the final accuracy (a small sketch of where to change them follows this list), for example:
* learning rate
* number of hidden units
* number of hidden layers
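
These settings live in main() of dnn.py; a sketch of changing them follows, with values that are only illustrative, not recommendations:

``` python
# Illustrative values only; DNN(in_dim, out_dim, hidden_dim, num_hidden) as defined in dnn.py.
dnn = DNN(429, 11, 256, 2)   # 256 hidden units; num_hidden=2 adds two extra hidden layers
dnn.set_learning_rate(5e-3)  # the default in main() is 1e-2
```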

You can also implement some basic neural network algorithms on top of this framework (see the Sigmoid sketch at the end of this subsection), for example:
* sigmoid and tanh activation functions
* dropout
* L2 regularization
* optimizers (Momentum/Adam)
* ...

After implementing them, you can apply these algorithms to this digit recognition task and observe their effect on the recognition accuracy.

By tuning these hyperparameters and implementing other basic algorithms, you can gain a deeper understanding of neural networks.
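
As one possible starting point for the first item, here is a sketch of a Sigmoid layer that follows the same interface as the other layers. It assumes it is added inside dnn.py, where the Layer base class and numpy (as np) are already available; it is an illustration, not a required solution.

``` python
class Sigmoid(Layer):
    def forward(self, input):
        # squash each element to (0, 1)
        return 1.0 / (1.0 + np.exp(-input))

    def backward(self, input, output, d_output):
        # d sigmoid(x)/dx = sigmoid(x) * (1 - sigmoid(x)); reuse the cached forward output
        return d_output * output * (1.0 - output)
```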

## Assignment 2

Understand the DNN-HMM based speech recognition system through Kaldi. Install Kaldi and run its recipe for the standard THCHS30 dataset, shown at the following link:
https://github.com/kaldi-asr/kaldi/blob/master/egs/thchs30/s5/run.sh.

[THCHS30](http://www.openslr.org/18/) is an open-source Chinese dataset released by Tsinghua University, about 30 hours in total. Based on this dataset and Kaldi's recipe scripts for it, work out the **pipeline of the DNN-HMM based speech recognition system: what its stages are, the input and output of each stage, and how the stages relate to one another**, and record your understanding as flowcharts, diagrams, or written notes.
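
Roughly, running the recipe looks like the outline below. Exactly which variables to edit (the data path near the top of run.sh, and cmd.sh/path.sh for your environment) depends on your Kaldi installation, so treat this only as a sketch:

``` sh
# Outline only; adjust paths and parallelization settings for your machine.
cd kaldi/egs/thchs30/s5
# edit cmd.sh and path.sh for your environment, and point the data path in
# run.sh at your downloaded copy of THCHS30 (http://www.openslr.org/18/)
./run.sh
```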
@@ -0,0 +1,204 @@
# Author: Sining Sun, Zhanheng Yang, Binbin Zhang

import numpy as np
import kaldi_io
from utils import *

targets_list = ['Z', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'O']
targets_mapping = {}
for i, x in enumerate(targets_list):
    targets_mapping[x] = i

class Layer:
    def forward(self, input):
        ''' Forward function by input
        Args:
            input: input, B * N matrix, B for batch size
        Returns:
            output when applied this layer
        '''
        raise NotImplementedError('forward is not implemented')

    def backward(self, input, output, d_output):
        ''' Compute gradient of this layer's input by (input, output, d_output)
            as well as compute the gradient of the parameters of this layer
        Args:
            input: input of this layer
            output: output of this layer
            d_output: accumulated gradient from final output to this
                      layer's output
        Returns:
            accumulated gradient from final output to this layer's input
        '''
        raise NotImplementedError('backward is not implemented')

    def set_learning_rate(self, lr):
        ''' Set learning rate of this layer '''
        self.learning_rate = lr

    def update(self):
        ''' Update this layer's parameters if it has any, otherwise do nothing
        '''

class ReLU(Layer):
    def forward(self, input):
        # BEGIN_LAB
        # END_LAB

    def backward(self, input, output, d_output):
        # BEGIN_LAB
        # END_LAB


class FullyConnect(Layer):
    def __init__(self, in_dim, out_dim):
        self.w = np.random.randn(out_dim, in_dim) * np.sqrt(2.0 / in_dim)
        self.b = np.zeros(out_dim)
        self.dw = np.zeros((out_dim, in_dim))
        self.db = np.zeros(out_dim)

    def forward(self, input):
        # BEGIN_LAB
        # END_LAB

    def backward(self, input, output, d_output):
        batch_size = input.shape[0]
        in_diff = None
        # BEGIN_LAB, compute in_diff/dw/db here
        # END_LAB
        # Normalize dw/db by batch size
        self.dw = self.dw / batch_size
        self.db = self.db / batch_size
        return in_diff

    def update(self):
        self.w = self.w - self.learning_rate * self.dw
        self.b = self.b - self.learning_rate * self.db

class Softmax(Layer):
    def forward(self, input):
        # Subtract the row-wise max for numerical stability before exponentiating
        row_max = input.max(axis=1).reshape(input.shape[0], 1)
        x = input - row_max
        return np.exp(x) / np.sum(np.exp(x), axis=1).reshape(x.shape[0], 1)

    def backward(self, input, output, d_output):
        ''' Directly return the d_output as we show below, the grad is to
            the activation (input) of softmax
        '''
        return d_output


class DNN:
    def __init__(self, in_dim, out_dim, hidden_dim, num_hidden):
        self.layers = []
        self.layers.append(FullyConnect(in_dim, hidden_dim))
        self.layers.append(ReLU())
        for i in range(num_hidden):
            self.layers.append(FullyConnect(hidden_dim, hidden_dim))
            self.layers.append(ReLU())
        self.layers.append(FullyConnect(hidden_dim, out_dim))
        self.layers.append(Softmax())

    def set_learning_rate(self, lr):
        for layer in self.layers:
            layer.set_learning_rate(lr)

    def forward(self, input):
        self.forward_buf = []
        out = input
        self.forward_buf.append(out)
        for i in range(len(self.layers)):
            out = self.layers[i].forward(out)
            self.forward_buf.append(out)
        assert (len(self.forward_buf) == len(self.layers) + 1)
        return out

    def backward(self, grad):
        '''
        Args:
            grad: the grad is to the activation before softmax
        '''
        self.backward_buf = [None] * len(self.layers)
        self.backward_buf[len(self.layers) - 1] = grad
        for i in range(len(self.layers) - 2, -1, -1):
            grad = self.layers[i].backward(self.forward_buf[i],
                                           self.forward_buf[i + 1],
                                           self.backward_buf[i + 1])
            self.backward_buf[i] = grad

    def update(self):
        for layer in self.layers:
            layer.update()

def one_hot(labels, total_label):
    output = np.zeros((labels.shape[0], total_label))
    for i in range(labels.shape[0]):
        output[i][labels[i]] = 1.0
    return output


def train(dnn):
    utt2feat, utt2target = read_feats_and_targets('train/feats.scp',
                                                  'train/text')
    inputs, labels = build_input(targets_mapping, utt2feat, utt2target)
    num_samples = inputs.shape[0]
    # Shuffle data
    permute = np.random.permutation(num_samples)
    inputs = inputs[permute]
    labels = labels[permute]
    num_epochs = 20
    batch_size = 100
    for i in range(num_epochs):
        cur = 0
        while cur < num_samples:
            end = min(cur + batch_size, num_samples)
            input = inputs[cur:end]
            label = labels[cur:end]
            # Step1: forward
            out = dnn.forward(input)
            one_hot_label = one_hot(label, out.shape[1])
            # Step2: Compute cross entropy loss and backward
            loss = -np.sum(np.log(out + 1e-20) * one_hot_label) / out.shape[0]
            # For softmax + cross entropy, the grad w.r.t. the activation
            # before softmax is simply (softmax output - one-hot label)
            grad = out - one_hot_label
            dnn.backward(grad)
            # Step3: update parameters
            dnn.update()
            print('Epoch {} num_samples {} loss {}'.format(i, cur, loss))
            cur += batch_size

def test(dnn):
    utt2feat, utt2target = read_feats_and_targets('test/feats.scp',
                                                  'test/text')
    total = len(utt2feat)
    correct = 0
    for utt in utt2feat:
        t = utt2target[utt]
        ark = utt2feat[utt]
        mat = kaldi_io.read_mat(ark)
        mat = splice(mat, 5, 5)
        posterior = dnn.forward(mat)
        posterior = np.sum(posterior, axis=0) / float(mat.shape[0])
        predict = targets_list[np.argmax(posterior)]
        if t == predict: correct += 1
        print('label: {} predict: {}'.format(t, predict))
    print('Acc: {}'.format(float(correct) / total))


def main():
    np.random.seed(777)
    # We splice the raw feat with left 5 frames and right 5 frames
    # So the input here is 39 * (5 + 1 + 5) = 429
    dnn = DNN(429, 11, 128, 1)
    dnn.set_learning_rate(1e-2)
    train(dnn)
    test(dnn)


if __name__ == '__main__':
    main()