06 homework
zhuoyuan yao committed Feb 13, 2020
1 parent b206879 commit 8c7a56b
Showing 10 changed files with 1,923 additions and 0 deletions.
58 changes: 58 additions & 0 deletions 06-DNN-HMM/README.md
@@ -0,0 +1,58 @@
# Assignment: DNN-HMM based speech recognition system

This lesson includes two assignments, described below.

## Assignment 1

### Data description
The data for this experiment are 39-dimensional MFCC features extracted from English recordings of 11 characters: the digits 0-9 (where 0 is labeled Z (Zero)) plus O. Specifically:
* Training data: 330 utterances, 11 characters, 30 utterances per character, located in the train directory.
* Test data: 110 utterances, 11 characters, 10 utterances per character, located in the test directory.

The train and test directories each contain 3 files (a small reading example follows this list):
* text: transcription file; in each line the first column is the utterance id and the second column is the label.
* feats.scp: feature index file; in each line the first column is the utterance id and the second column points to where the features are stored.
* feats.ark: the file that actually stores the features; it is a binary file.
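
For orientation, the sketch below (not part of the assignment) shows one way to inspect a feats.scp entry with the kaldi_io package that dnn.py already uses; the train/feats.scp path and the 39-dimensional shape follow the description above.

``` python
# A minimal sketch, assuming the kaldi_io package used by dnn.py is available.
import kaldi_io

with open('train/feats.scp') as f:
    for line in f:
        # Each line is "<utt-id> <path-to-feats.ark>:<byte-offset>"
        utt, ark = line.strip().split(None, 1)
        feat = kaldi_io.read_mat(ark)  # numpy matrix, one 39-dim MFCC vector per frame
        print(utt, feat.shape)
        break  # inspect only the first utterance
```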

### Experiment
This experiment provides a simple DNN framework and uses the DNN to train on and recognize the 11 characters.
The training and test data described above are used to train and test the DNN, respectively.
Please read the code in dnn.py, understand the DNN framework, and complete the forward and backward passes of the ReLU activation function and the FullyConnect fully connected layer.
You can refer to the forward and backward implementation of Softmax. The insertion points in dnn.py are marked as follows (a rough, unofficial sketch of the ReLU part is given after the marker for orientation):
``` python
# BEGIN_LAB
# write your code here
# END_LAB
```
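
For orientation only, and not as the official solution, the ReLU part can be written roughly as below (shown as standalone functions rather than inside the Layer class of dnn.py); the FullyConnect layer is left entirely to the reader.

``` python
# A rough sketch of ReLU forward/backward, independent of the Layer class.
import numpy as np

def relu_forward(x):
    # Elementwise max(0, x)
    return np.maximum(0.0, x)

def relu_backward(x, d_out):
    # The gradient only flows through positions where the forward input was positive.
    return d_out * (x > 0)
```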

### Run and check

Run the experiment with the command below; at the end, the program prints the accuracy on the test set. With a correct implementation you should reach an accuracy above 95%; the author's implementation reaches a classification accuracy of 98.18%.

``` sh
python dnn.py
```

### Extensions
Besides running with the default parameters, readers can also tune some of the hyperparameters and observe how they affect the final accuracy, for example (a sketch follows this list):
* learning rate
* number of hidden units
* number of hidden layers
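
A sketch of changing these settings, based on the DNN constructor and set_learning_rate in dnn.py (the specific values below are arbitrary examples):

``` python
# Sketch: a wider/deeper network with a smaller learning rate (arbitrary example values).
from dnn import DNN, train, test  # assumes dnn.py, its dependencies and data files are available

dnn = DNN(429, 11, 256, 2)   # 256 hidden units; 2 extra hidden blocks after the first one
dnn.set_learning_rate(5e-3)  # dnn.py uses 1e-2 by default
train(dnn)
test(dnn)
```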

Readers can also implement some basic neural network techniques on top of this framework, such as:
* sigmoid and tanh activation functions
* dropout
* L2 regularization
* optimizers (Momentum/Adam)
* ...

After implementing them, readers can apply these techniques to the digit recognition task and observe their effect on the recognition accuracy; a sketch of one such layer is given below.
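
As an illustration of the first item above, here is a minimal sketch of a Sigmoid activation that plugs into the Layer interface of dnn.py (again, just a sketch, not a required part of the assignment):

``` python
# A minimal Sigmoid layer sketch following the Layer interface from dnn.py.
import numpy as np
from dnn import Layer  # assumes dnn.py and its dependencies are importable

class Sigmoid(Layer):
    def forward(self, input):
        return 1.0 / (1.0 + np.exp(-input))

    def backward(self, input, output, d_output):
        # d sigmoid(x)/dx = sigmoid(x) * (1 - sigmoid(x)); reuse the cached forward output.
        return d_output * output * (1.0 - output)
```

A tanh or dropout layer can follow the same pattern, and such a layer can be swapped in for ReLU where DNN.__init__ builds its layer list.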

By tuning these hyperparameters and implementing some of these basic techniques, readers can gain a deeper understanding of neural networks.

## Assignment 2

Understand a DNN-HMM based speech recognition system using Kaldi. Please install Kaldi and run the recipe for the standard THCHS30 dataset shipped with Kaldi:
https://github.com/kaldi-asr/kaldi/blob/master/egs/thchs30/s5/run.sh

[THCHS30](http://www.openslr.org/18/) is an open-source Chinese dataset released by Tsinghua University, about 30 hours in total. Based on this dataset and Kaldi's recipe scripts for it, work out the **pipeline of a DNN-HMM based speech recognition system: which steps it consists of, the inputs and outputs of each step, and how the steps relate to each other**. Record your understanding in whatever form works best, for example as a flowchart, diagrams, or text.
204 changes: 204 additions & 0 deletions 06-DNN-HMM/dnn.py
@@ -0,0 +1,204 @@
# Author: Sining Sun, Zhanheng Yang, Binbin Zhang

import numpy as np
import kaldi_io
from utils import *

targets_list = ['Z', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'O']
targets_mapping = {}
for i, x in enumerate(targets_list):
targets_mapping[x] = i


class Layer:
    def forward(self, input):
        ''' Forward pass of this layer
        Args:
            input: a B * N matrix, where B is the batch size
        Returns:
            the output after applying this layer
        '''
        raise NotImplementedError

    def backward(self, input, output, d_output):
        ''' Compute the gradient w.r.t. this layer's input from (input, output, d_output),
            and also compute the gradients of this layer's parameters
        Args:
            input: input of this layer
            output: output of this layer
            d_output: accumulated gradient from the final output to this
                layer's output
        Returns:
            accumulated gradient from the final output to this layer's input
        '''
        raise NotImplementedError

def set_learning_rate(self, lr):
''' Set learning rate of this layer'''
self.learning_rate = lr

    def update(self):
        ''' Update this layer's parameters if it has any; otherwise do nothing
        '''


class ReLU(Layer):
def forward(self, input):
# BEGIN_LAB
# END_LAB

def backward(self, input, output, d_output):
# BEGIN_LAB
# END_LAB


class FullyConnect(Layer):
def __init__(self, in_dim, out_dim):
self.w = np.random.randn(out_dim, in_dim) * np.sqrt(2.0 / in_dim)
self.b = np.zeros(out_dim)
self.dw = np.zeros((out_dim, in_dim))
self.db = np.zeros(out_dim)

def forward(self, input):
# BEGIN_LAB
# END_LAB

def backward(self, input, output, d_output):
batch_size = input.shape[0]
in_diff = None
# BEGIN_LAB, compute in_diff/dw/db here
# END_LAB
# Normalize dw/db by batch size
self.dw = self.dw / batch_size
self.db = self.db / batch_size
return in_diff

def update(self):
self.w = self.w - self.learning_rate * self.dw
self.b = self.b - self.learning_rate * self.db


class Softmax(Layer):
def forward(self, input):
row_max = input.max(axis=1).reshape(input.shape[0], 1)
x = input - row_max
return np.exp(x) / np.sum(np.exp(x), axis=1).reshape(x.shape[0], 1)

    def backward(self, input, output, d_output):
        ''' Directly return d_output: as noted in DNN.backward, the incoming
            gradient is already taken w.r.t. the activation (input) of softmax
        '''
        return d_output


class DNN:
def __init__(self, in_dim, out_dim, hidden_dim, num_hidden):
self.layers = []
self.layers.append(FullyConnect(in_dim, hidden_dim))
self.layers.append(ReLU())
for i in range(num_hidden):
self.layers.append(FullyConnect(hidden_dim, hidden_dim))
self.layers.append(ReLU())
self.layers.append(FullyConnect(hidden_dim, out_dim))
self.layers.append(Softmax())

def set_learning_rate(self, lr):
for layer in self.layers:
layer.set_learning_rate(lr)

def forward(self, input):
self.forward_buf = []
out = input
self.forward_buf.append(out)
for i in range(len(self.layers)):
out = self.layers[i].forward(out)
self.forward_buf.append(out)
assert (len(self.forward_buf) == len(self.layers) + 1)
return out

def backward(self, grad):
'''
Args:
            grad: gradient w.r.t. the activation before the softmax (the logits)
'''
self.backward_buf = [None] * len(self.layers)
self.backward_buf[len(self.layers) - 1] = grad
for i in range(len(self.layers) - 2, -1, -1):
grad = self.layers[i].backward(self.forward_buf[i],
self.forward_buf[i + 1],
self.backward_buf[i + 1])
self.backward_buf[i] = grad

def update(self):
for layer in self.layers:
layer.update()


def one_hot(labels, total_label):
output = np.zeros((labels.shape[0], total_label))
for i in range(labels.shape[0]):
output[i][labels[i]] = 1.0
return output


def train(dnn):
utt2feat, utt2target = read_feats_and_targets('train/feats.scp',
'train/text')
inputs, labels = build_input(targets_mapping, utt2feat, utt2target)
num_samples = inputs.shape[0]
# Shuffle data
permute = np.random.permutation(num_samples)
inputs = inputs[permute]
labels = labels[permute]
num_epochs = 20
batch_size = 100
for i in range(num_epochs):
cur = 0
while cur < num_samples:
end = min(cur + batch_size, num_samples)
input = inputs[cur:end]
label = labels[cur:end]
# Step1: forward
out = dnn.forward(input)
one_hot_label = one_hot(label, out.shape[1])
# Step2: Compute cross entropy loss and backward
loss = -np.sum(np.log(out + 1e-20) * one_hot_label) / out.shape[0]
# The grad is to activation before softmax
grad = out - one_hot_label
dnn.backward(grad)
# Step3: update parameters
dnn.update()
print('Epoch {} num_samples {} loss {}'.format(i, cur, loss))
cur += batch_size


def test(dnn):
utt2feat, utt2target = read_feats_and_targets('test/feats.scp',
'test/text')
total = len(utt2feat)
correct = 0
for utt in utt2feat:
t = utt2target[utt]
ark = utt2feat[utt]
mat = kaldi_io.read_mat(ark)
mat = splice(mat, 5, 5)
posterior = dnn.forward(mat)
posterior = np.sum(posterior, axis=0) / float(mat.shape[0])
predict = targets_list[np.argmax(posterior)]
if t == predict: correct += 1
print('label: {} predict: {}'.format(t, predict))
print('Acc: {}'.format(float(correct) / total))


def main():
np.random.seed(777)
# We splice the raw feat with left 5 frames and right 5 frames
# So the input here is 39 * (5 + 1 + 5) = 429
dnn = DNN(429, 11, 128, 1)
dnn.set_learning_rate(1e-2)
train(dnn)
test(dnn)


if __name__ == '__main__':
main()
