Commit 8c7a56b by zhuoyuan yao, committed on Feb 13, 2020 (1 parent: b206879)
Showing 10 changed files with 1,923 additions and 0 deletions.
@@ -0,0 +1,58 @@
# Assignments for the DNN-HMM Based Speech Recognition System

This lesson has two assignments, described below.

## Assignment 1

### Data description

The data for this experiment are 39-dimensional MFCC features extracted from English recordings of 11 characters: the digits 0-9 (where 0 is labeled Z, for "Zero") and the letter O. Specifically:
* Training data: 330 utterances, 11 characters with 30 utterances each, located in the train directory.
* Test data: 110 utterances, 11 characters with 10 utterances each, located in the test directory.
The train and test directories each contain three files (a minimal reading sketch follows this list):
* text: transcription file; the first column of each line is the utterance id, the second is the transcription.
* feats.scp: feature index file; the first column of each line is the utterance id, the second is the index (location) of the features for that utterance.
* feats.ark: the binary file in which the features are actually stored.
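
For example, here is a minimal sketch of inspecting one training utterance. It assumes the kaldi_io package that dnn.py already imports; the scp line format shown in the comment is Kaldi's usual "utt-id path:offset" convention.

``` python
# Minimal sketch: read one utterance's features from the training set.
import kaldi_io

with open('train/feats.scp') as f:
    for line in f:
        # each line looks like: <utt-id> train/feats.ark:<byte-offset>
        utt, ark = line.strip().split(None, 1)
        mat = kaldi_io.read_mat(ark)  # num_frames x 39 MFCC matrix
        print(utt, mat.shape)
        break
```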

### Experiment

This experiment provides a simple DNN framework and uses the DNN to train on and recognize the 11 digits.
The training and test data described above are used to train and test the DNN, respectively.
Read the code in dnn.py, understand the framework, and complete the forward and backward passes of the ReLU activation function and the FullyConnect (fully connected) layer.
You can refer to the forward and backward implementation of Softmax. The insertion points in dnn.py look like this:
``` python
# BEGIN_LAB
# write your code here
# END_LAB
```
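
After filling in a layer, a quick numerical gradient check can help verify it before training. The sketch below is not part of the assignment; it assumes your completed dnn.py is importable and compares ReLU's backward() against a finite-difference estimate.

``` python
# Hypothetical helper, not required by the assignment: numerically check a
# completed layer's backward() against finite differences.
import numpy as np
from dnn import ReLU  # assumes your filled-in dnn.py is on the path

layer = ReLU()
x = np.random.randn(4, 7)
y = layer.forward(x)
d_out = np.random.randn(*y.shape)
analytic = layer.backward(x, y, d_out)

eps = 1e-6
numeric = np.zeros_like(x)
for idx in np.ndindex(*x.shape):
    xp, xm = x.copy(), x.copy()
    xp[idx] += eps
    xm[idx] -= eps
    # derivative of sum(forward(x) * d_out) with respect to x[idx]
    numeric[idx] = np.sum((layer.forward(xp) - layer.forward(xm)) * d_out) / (2 * eps)

print('max abs diff:', np.abs(analytic - numeric).max())  # should be close to 0
```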

### Running and checking

Run the experiment with the following command. At the end, the program prints the accuracy on the test set. With a correct implementation you should get an accuracy above 95%; the author's implementation reaches a classification accuracy of 98.18%.

``` sh
python dnn.py
```

### Extensions

Besides running with the default parameters, you can also tune some hyperparameters and observe how they affect the final accuracy (a small sketch of where to change them follows this list), for example:
* learning rate
* number of hidden units
* number of hidden layers
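
These settings live in main() of dnn.py; a sketch of changing them follows, with values that are only illustrative, not recommendations:

``` python
# Illustrative values only; DNN(in_dim, out_dim, hidden_dim, num_hidden) as defined in dnn.py.
dnn = DNN(429, 11, 256, 2)   # 256 hidden units; num_hidden=2 adds two extra hidden layers
dnn.set_learning_rate(5e-3)  # the default in main() is 1e-2
```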

You can also implement some basic neural network algorithms on top of this framework (see the Sigmoid sketch at the end of this subsection), for example:
* sigmoid and tanh activation functions
* dropout
* L2 regularization
* optimizers (Momentum/Adam)
* ...

After implementing them, you can apply these algorithms to this digit recognition task and observe their effect on the recognition accuracy.

By tuning these hyperparameters and implementing other basic algorithms, you can gain a deeper understanding of neural networks.
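
As one possible starting point for the first item, here is a sketch of a Sigmoid layer that follows the same interface as the other layers. It assumes it is added inside dnn.py, where the Layer base class and numpy (as np) are already available; it is an illustration, not a required solution.

``` python
class Sigmoid(Layer):
    def forward(self, input):
        # squash each element to (0, 1)
        return 1.0 / (1.0 + np.exp(-input))

    def backward(self, input, output, d_output):
        # d sigmoid(x)/dx = sigmoid(x) * (1 - sigmoid(x)); reuse the cached forward output
        return d_output * output * (1.0 - output)
```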

## Assignment 2

Understand the DNN-HMM based speech recognition system through Kaldi. Install Kaldi and run its recipe for the standard THCHS30 dataset, shown at the following link:
https://github.com/kaldi-asr/kaldi/blob/master/egs/thchs30/s5/run.sh.

[THCHS30](http://www.openslr.org/18/) is an open-source Chinese dataset released by Tsinghua University, about 30 hours in total. Based on this dataset and Kaldi's recipe scripts for it, work out the **pipeline of the DNN-HMM based speech recognition system: what its stages are, the input and output of each stage, and how the stages relate to one another**, and record your understanding as flowcharts, diagrams, or written notes.
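
Roughly, running the recipe looks like the outline below. Exactly which variables to edit (the data path near the top of run.sh, and cmd.sh/path.sh for your environment) depends on your Kaldi installation, so treat this only as a sketch:

``` sh
# Outline only; adjust paths and parallelization settings for your machine.
cd kaldi/egs/thchs30/s5
# edit cmd.sh and path.sh for your environment, and point the data path in
# run.sh at your downloaded copy of THCHS30 (http://www.openslr.org/18/)
./run.sh
```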
@@ -0,0 +1,204 @@
# Author: Sining Sun, Zhanheng Yang, Binbin Zhang

import numpy as np
import kaldi_io
from utils import *

targets_list = ['Z', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'O']
targets_mapping = {}
for i, x in enumerate(targets_list):
    targets_mapping[x] = i

class Layer:
    def forward(self, input):
        ''' Forward function by input
        Args:
            input: input, B * N matrix, B for batch size
        Returns:
            output when applied this layer
        '''
        raise NotImplementedError('forward is not implemented')

    def backward(self, input, output, d_output):
        ''' Compute gradient of this layer's input by (input, output, d_output)
            as well as compute the gradient of the parameters of this layer
        Args:
            input: input of this layer
            output: output of this layer
            d_output: accumulated gradient from final output to this
                      layer's output
        Returns:
            accumulated gradient from final output to this layer's input
        '''
        raise NotImplementedError('backward is not implemented')

    def set_learning_rate(self, lr):
        ''' Set learning rate of this layer '''
        self.learning_rate = lr

    def update(self):
        ''' Update this layer's parameters if it has any, otherwise do nothing
        '''

class ReLU(Layer):
    def forward(self, input):
        # BEGIN_LAB
        # END_LAB

    def backward(self, input, output, d_output):
        # BEGIN_LAB
        # END_LAB


class FullyConnect(Layer):
    def __init__(self, in_dim, out_dim):
        self.w = np.random.randn(out_dim, in_dim) * np.sqrt(2.0 / in_dim)
        self.b = np.zeros(out_dim)
        self.dw = np.zeros((out_dim, in_dim))
        self.db = np.zeros(out_dim)

    def forward(self, input):
        # BEGIN_LAB
        # END_LAB

    def backward(self, input, output, d_output):
        batch_size = input.shape[0]
        in_diff = None
        # BEGIN_LAB, compute in_diff/dw/db here
        # END_LAB
        # Normalize dw/db by batch size
        self.dw = self.dw / batch_size
        self.db = self.db / batch_size
        return in_diff

    def update(self):
        self.w = self.w - self.learning_rate * self.dw
        self.b = self.b - self.learning_rate * self.db

class Softmax(Layer):
    def forward(self, input):
        # Subtract the row-wise max for numerical stability before exponentiating
        row_max = input.max(axis=1).reshape(input.shape[0], 1)
        x = input - row_max
        return np.exp(x) / np.sum(np.exp(x), axis=1).reshape(x.shape[0], 1)

    def backward(self, input, output, d_output):
        ''' Directly return the d_output as we show below, the grad is to
            the activation (input) of softmax
        '''
        return d_output


class DNN:
    def __init__(self, in_dim, out_dim, hidden_dim, num_hidden):
        self.layers = []
        self.layers.append(FullyConnect(in_dim, hidden_dim))
        self.layers.append(ReLU())
        for i in range(num_hidden):
            self.layers.append(FullyConnect(hidden_dim, hidden_dim))
            self.layers.append(ReLU())
        self.layers.append(FullyConnect(hidden_dim, out_dim))
        self.layers.append(Softmax())

    def set_learning_rate(self, lr):
        for layer in self.layers:
            layer.set_learning_rate(lr)

    def forward(self, input):
        self.forward_buf = []
        out = input
        self.forward_buf.append(out)
        for i in range(len(self.layers)):
            out = self.layers[i].forward(out)
            self.forward_buf.append(out)
        assert (len(self.forward_buf) == len(self.layers) + 1)
        return out

    def backward(self, grad):
        '''
        Args:
            grad: the grad is to the activation before softmax
        '''
        self.backward_buf = [None] * len(self.layers)
        self.backward_buf[len(self.layers) - 1] = grad
        for i in range(len(self.layers) - 2, -1, -1):
            grad = self.layers[i].backward(self.forward_buf[i],
                                           self.forward_buf[i + 1],
                                           self.backward_buf[i + 1])
            self.backward_buf[i] = grad

    def update(self):
        for layer in self.layers:
            layer.update()

def one_hot(labels, total_label):
    output = np.zeros((labels.shape[0], total_label))
    for i in range(labels.shape[0]):
        output[i][labels[i]] = 1.0
    return output


def train(dnn):
    utt2feat, utt2target = read_feats_and_targets('train/feats.scp',
                                                  'train/text')
    inputs, labels = build_input(targets_mapping, utt2feat, utt2target)
    num_samples = inputs.shape[0]
    # Shuffle data
    permute = np.random.permutation(num_samples)
    inputs = inputs[permute]
    labels = labels[permute]
    num_epochs = 20
    batch_size = 100
    for i in range(num_epochs):
        cur = 0
        while cur < num_samples:
            end = min(cur + batch_size, num_samples)
            input = inputs[cur:end]
            label = labels[cur:end]
            # Step1: forward
            out = dnn.forward(input)
            one_hot_label = one_hot(label, out.shape[1])
            # Step2: Compute cross entropy loss and backward
            loss = -np.sum(np.log(out + 1e-20) * one_hot_label) / out.shape[0]
            # For softmax + cross entropy, the grad w.r.t. the activation
            # before softmax is simply (softmax output - one-hot label)
            grad = out - one_hot_label
            dnn.backward(grad)
            # Step3: update parameters
            dnn.update()
            print('Epoch {} num_samples {} loss {}'.format(i, cur, loss))
            cur += batch_size

def test(dnn):
    utt2feat, utt2target = read_feats_and_targets('test/feats.scp',
                                                  'test/text')
    total = len(utt2feat)
    correct = 0
    for utt in utt2feat:
        t = utt2target[utt]
        ark = utt2feat[utt]
        mat = kaldi_io.read_mat(ark)
        mat = splice(mat, 5, 5)
        posterior = dnn.forward(mat)
        posterior = np.sum(posterior, axis=0) / float(mat.shape[0])
        predict = targets_list[np.argmax(posterior)]
        if t == predict: correct += 1
        print('label: {} predict: {}'.format(t, predict))
    print('Acc: {}'.format(float(correct) / total))


def main():
    np.random.seed(777)
    # We splice the raw feat with left 5 frames and right 5 frames
    # So the input here is 39 * (5 + 1 + 5) = 429
    dnn = DNN(429, 11, 128, 1)
    dnn.set_learning_rate(1e-2)
    train(dnn)
    test(dnn)


if __name__ == '__main__':
    main()