
Commit

fix and add n-gram
Former-commit-id: 50cdeba
nq555222 committed Mar 22, 2017
1 parent 6edb9ca commit 713d7fa
Showing 6 changed files with 206 additions and 28 deletions.
2 changes: 2 additions & 0 deletions README.md
@@ -10,6 +10,8 @@ End-to-end automatic speech recognition system implemented in TensorFlow.
- [x] **Improve some function APIs for reusability** (2017-03-14)
- [x] **Add scaling for data preprocessing** (2017-03-15)
- [x] **Add reusable support for LibriSpeech training** (2017-03-15)
- [x] **Add simple n-gram model for random generation or statistical use** (2017-03-23)
- [x] **Improve some code for pre-processing and training** (2017-03-23)

## Performance
### PER based on dynamic BLSTM on the TIMIT database, with casual tuning because time is limited
3 changes: 2 additions & 1 deletion src/feature/libri_preprocess.py
@@ -38,7 +38,7 @@
#subset = 0
#labels=[]

keywords = ['dev-clean', 'dev-other', 'test-clean', 'test-other', 'train-clean-100', 'train-clean-360', 'train-other-500']
keywords = ['myaudio', 'dev-clean', 'dev-other', 'test-clean', 'test-other', 'train-clean-100', 'train-clean-360', 'train-other-500']

keyword = keywords[0]
label_dir = '/home/pony/github/data/libri/cha-level/'+keyword+'/label/'
@@ -49,6 +49,7 @@
os.makedirs(mfcc_dir)

rootdir = '/media/pony/Seagate Expansion Drive/学习/语音识别/ASR数据库/LibriSpeech/'+keyword
rootdir = '/home/pony/github/data/'+keyword

if True:
    for subdir, dirs, files in os.walk(rootdir):
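A minimal sketch (not in the repo) of how the added 'myaudio' keyword redirects preprocessing: since keyword = keywords[0], the default selection now points at a user-supplied dataset, and the new rootdir assignment walks /home/pony/github/data/myaudio instead of the LibriSpeech mount. The mfcc_dir path below is assumed to mirror the label_dir pattern shown in the diff:

```python
import os

keyword = 'myaudio'  # keywords[0] after this change, so the default selection
label_dir = '/home/pony/github/data/libri/cha-level/' + keyword + '/label/'
mfcc_dir = '/home/pony/github/data/libri/cha-level/' + keyword + '/mfcc/'   # assumed: symmetric to label_dir
rootdir = '/home/pony/github/data/' + keyword                               # new rootdir from this commit

# create the output folders if they do not exist yet
for d in (label_dir, mfcc_dir):
    if not os.path.exists(d):
        os.makedirs(d)

# every file under the user-supplied dataset is then visited here
for subdir, dirs, files in os.walk(rootdir):
    pass  # per-file MFCC/label extraction happens in libri_preprocess.py
```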
39 changes: 39 additions & 0 deletions src/models/n-gram/generate.py
@@ -0,0 +1,39 @@

import numpy as np
import pickle

def load_obj(name):
    with open(name + '.pkl', 'rb') as f:
        return pickle.load(f)

def frequence(gram, type=2):
    if type == 2:
        for key, value in gram.items():
            total = 0.0
            for subkey, subvalue in value.items():
                total += subvalue

            for subkey, subvalue in value.items():
                gram[key][subkey] = subvalue/total
    else:
        raise NotImplementedError('%s-gram is being developed'%type)
    return gram


def generate_sentence(corpus_dir, seed='what are', length=10):
    bigram = load_obj(corpus_dir+'bigram')
    freq_bigram = frequence(bigram)
    sent = ''
    if not ' ' in seed:
        sent += seed
    prev = seed
    for i in range(length):
        probs = []
        for _, value in freq_bigram[prev].items():
            probs.append(value)
        sample = np.random.choice(range(len(freq_bigram[prev])),p=probs)
        prev = freq_bigram[prev].keys()[sample]
        sent += ' '+prev
    print sent

generate_sentence('/home/pony/github/data/libri/ngram/', seed='love', length=10)
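generate.py above is Python 2 code (`print sent`, indexing into `dict.keys()`). A minimal Python 3 sketch of the same bigram sampling step, assuming `bigram.pkl` holds the `{word: {next_word: count}}` dict produced by ngram.py below:

```python
import pickle
import numpy as np

def sample_next(bigram, prev):
    # normalize the successor counts of `prev` into probabilities and draw one word
    successors = list(bigram[prev].keys())
    counts = np.array([bigram[prev][w] for w in successors], dtype=float)
    return np.random.choice(successors, p=counts / counts.sum())

def generate(corpus_dir, seed='love', length=10):
    with open(corpus_dir + 'bigram.pkl', 'rb') as f:
        bigram = pickle.load(f)
    words = [seed]
    for _ in range(length):
        nxt = sample_next(bigram, words[-1])
        if nxt == 'EOS':          # stop at the end-of-sentence marker
            break
        words.append(nxt)
    return ' '.join(words)

print(generate('/home/pony/github/data/libri/ngram/', seed='love', length=10))
```

Unlike the committed version, this keeps the normalization inside the sampling step, avoids the Python 2-only `dict.keys()[i]` indexing, and stops at the 'EOS' marker instead of sampling past it.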
95 changes: 95 additions & 0 deletions src/models/n-gram/ngram.py
@@ -0,0 +1,95 @@
# -*- coding:utf-8 -*-
#!/usr/bin/python

''' Language modelling for automatic speech recognition based on n-gram
author:
iiiiiiiiiiii iiiiiiiiiiii !!!!!!! !!!!!!
# ### # ### ### I# #:
# ### # I##; ##; ## ##
### ### !## #### #
### ### ### ## ### #'
!##; `##% ##; ## ### ##
### ### $## `# ## #
### # ### # #### ####;
`### -# ### `# ### ###
############## ############## `# #
date:2017-04-17
'''
import numpy as np
import os
import operator
import pickle

def save_obj(name, obj):
    with open(name + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

class NGram:
    def __init__(self, rootdir):
        self.rootdir = rootdir

    def get_corpus(self):
        corpus = []
        word_count = {}
        biword_count = {}
        bigram = {}
        bigram['SOS'] = {}
        trigram = {}
        for subdir, dirs, files in os.walk(self.rootdir):
            for f in files:
                fullFilename = os.path.join(subdir, f)
                filenameNoSuffix = os.path.splitext(fullFilename)[0]
                if f.endswith('.label'):
                    with open(fullFilename, 'r') as f:
                        line = f.readline()
                        corpus.append(line)
                        line = line.strip().split(' ')
                        len_sent = range(len(line))
                        for idx in len_sent:
                            word = line[idx]
                            word_count = inc_dict(word_count, word)

                            if not bigram.has_key(word):
                                bigram[word] = {}

                            if idx == 0:
                                bigram['SOS'] = inc_dict(bigram['SOS'], word)

                            elif idx != len(line)-1:
                                bigram[word] = inc_dict(bigram[word], line[idx+1])

                            else:
                                bigram[word] = inc_dict(bigram[word], 'EOS')

                            if idx == 0:
                                tri_key = 'SOS ' + word
                            else:
                                tri_key = line[idx-1]+' '+word
                            if not trigram.has_key(tri_key):
                                trigram[tri_key] = {}
                            if idx == len(line)-1:
                                trigram[tri_key] = inc_dict(trigram[tri_key], 'EOS')
                            else:
                                trigram[tri_key] = inc_dict(trigram[tri_key], line[idx+1])

        return corpus, word_count, bigram, trigram


def inc_dict(dic, key):
    if not dic.has_key(key):
        dic[key] = 0
    dic[key] += 1
    return dic


if __name__ == '__main__':
    ngram = NGram('/media/pony/Seagate Expansion Drive/学习/语音识别/ASR数据库/LibriSpeech/')
    corpus, word_count, bigram, trigram = ngram.get_corpus()
    savedir = '/home/pony/github/data/libri/ngram/'
    save_obj(savedir+'corpus', corpus)
    save_obj(savedir+'word_count', word_count)
    save_obj(savedir+'bigram', bigram)
    save_obj(savedir+'trigram', trigram)
    #sorted_word_count = sorted(word_count.items(), key=operator.itemgetter(1), reverse=True)
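A worked example (not part of the commit) of the structures `NGram.get_corpus()` returns for a hypothetical corpus with a single label file containing `the cat sat`:

```python
word_count = {'the': 1, 'cat': 1, 'sat': 1}

bigram = {
    'SOS': {'the': 1},   # sentence start -> first word
    'the': {},           # as written, the first word's own successor is not counted (the elif skips idx == 0)
    'cat': {'sat': 1},
    'sat': {'EOS': 1},   # last word -> sentence end
}

trigram = {
    'SOS the': {'cat': 1},
    'the cat': {'sat': 1},
    'cat sat': {'EOS': 1},
}
```

generate.py's frequence() then turns each inner count dict into a probability distribution over successors.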
10 changes: 6 additions & 4 deletions src/run.sh
@@ -1,13 +1,15 @@
#!/bin/bash

for loop in {2..30}
for loop in {3..30}
do
echo "loop is $loop"
b=$(( $loop % 3 ))
echo "dataset is index $b"
if [ $loop -eq 1 ]
then
/usr/bin/python train.py --mode=train
/usr/bin/python train.py --lb=$b
else
/usr/bin/python train.py --mode=train --keep=True
fi
/usr/bin/python train.py --lb=$b --keep=True
/usr/bin/python train.py --mode=test
fi
done
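The reworked loop no longer passes --mode=train; it cycles a dataset index into --lb instead. A small sketch (not in the repo) of the schedule it produces, using the libri_data list from the updated train.py:

```python
# loop runs 3..30 in run.sh; b = loop % 3 is forwarded to train.py as --lb
libri_data = ['dev', 'train-clean-100', 'train-clean-360', 'train-other-500']
for loop in range(3, 31):
    b = loop % 3
    print(loop, b, libri_data[b])
# -> 3 0 dev, 4 1 train-clean-100, 5 2 train-clean-360, 6 0 dev, ...
```

Since loop starts at 3, the `[ $loop -eq 1 ]` branch never fires, so every iteration trains with --keep=True and then runs --mode=test; index 3 ('train-other-500') is never reached by loop % 3.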
85 changes: 62 additions & 23 deletions src/train.py
@@ -54,31 +54,40 @@
class Trainer(object):

    def __init__(self):
        parser = argparse.ArgumentParser()
        cat = 1
        libri_data = ['timit', 'dev', 'train-clean-100', 'train-other-240', 'train-other-260','train-clean-360']
        parser.add_argument('--lb', type=int, default=2, help='specify the dataset of libri')
        parser.add_argument('--keep', type=bool, default=False,
                            help='train the model based on model saved')
        parser.add_argument('--mode', type=str, default='train',
                            help='you can select two modes, "train" or "test"')
        self.args = parser.parse_args()
        lb = self.args.lb
        #lb = 2

        libri_data = ['dev', 'train-clean-100', 'train-clean-360', 'train-other-500']
        lb = 1

        train_mfcc_dir = ['/home/pony/github/data/timit/train/mfcc/',
        train_mfcc_dir = ['/home/pony/github/data/timit/cha/train/mfcc/',
                          '/home/pony/github/data/libri/cha-level/'+libri_data[lb]+'/mfcc/']

        train_label_dir = ['/home/pony/github/data/timit/train/label/',
        train_label_dir = ['/home/pony/github/data/timit/cha/train/label/',
                           '/home/pony/github/data/libri/cha-level/'+libri_data[lb]+'/label/']

        test_mfcc_dir = ['/home/pony/github/data/timit/test/mfcc/',
        test_mfcc_dir = ['/home/pony/github/data/timit/cha/test/mfcc/',
                         '/home/pony/github/data/libri/cha-level/test/mfcc/']

        test_label_dir = ['/home/pony/github/data/timit/test/label/',
        test_label_dir = ['/home/pony/github/data/timit/cha/test/label/',
                          '/home/pony/github/data/libri/cha-level/test/label/']

        task = ['timit', 'libri']
        level = ['phn', 'cha']
        level = ['cha', 'cha']
        timit_config = [['phn',62], ['cha', 29]]
        num_hidden = [128, 256]
        num_class = [62, 29]

        #num_class = [62, 29]
        num_class = [29, 29]
        save_dir = ['/home/pony/github/data/ASR/save/timit/', '/home/pony/github/data/ASR/save/libri/']
        log_dir = ['/home/pony/github/data/ASR/log/timit/', '/home/pony/github/data/ASR/log/libri/']

        parser = argparse.ArgumentParser()
        parser.add_argument('--task', type=str, default=task[cat], help='two tasks now, timit or libri')

        parser.add_argument('--level', type=str, default=level[cat], help='two levels now, phn or cha')
@@ -101,7 +110,7 @@ def __init__(self):
        parser.add_argument('--model', default='DBiRNN',
                            help='model for ASR:DBiRNN,BiRNN,ResNet,...')

        parser.add_argument('--keep_prob', type=float, default=0.99,
        parser.add_argument('--keep_prob', type=float, default=1,
                            help='set the keep probability of layer for dropout')

        parser.add_argument('--rnncell', type=str, default='gru',
@@ -110,28 +119,22 @@ def __init__(self):
        parser.add_argument('--num_layer', type=int, default=2,
                            help='set the number of hidden layer or bidirectional layer')

        parser.add_argument('--activation', default=tf.nn.relu,
        parser.add_argument('--activation', default=tf.nn.elu,
                            help='set the activation function of each layer')

        parser.add_argument('--optimizer', type=type, default=tf.train.AdamOptimizer,
                            help='set the optimizer to train the model,eg:AdamOptimizer,GradientDescentOptimizer')

        parser.add_argument('--grad_clip', default=15,
        parser.add_argument('--grad_clip', default=0.8,
                            help='set gradient clipping when backpropagating errors')

        parser.add_argument('--keep', type=bool, default=False,
                            help='train the model based on model saved')

        parser.add_argument('--save', type=bool, default=True,
                            help='to save the model in the disk')

        parser.add_argument('--mode', type=str, default='train',
                            help='test the model based on trained parameters, but at present, we can"t test during training.')

        parser.add_argument('--learning_rate', type=float, default=0.001,
        parser.add_argument('--learning_rate', type=float, default=0.0001,
                            help='set the step size of each iteration')

        parser.add_argument('--num_epoch', type=int, default=5000,
        parser.add_argument('--num_epoch', type=int, default=1,
                            help='set the total number of training epochs')

        parser.add_argument('--batch_size', type=int, default=32,
@@ -244,6 +247,9 @@ def train(self):
                        l,
                        er/args.batch_size))
                batchErrors[batch] = er*len(batchSeqLengths)
                # NOTE:
                if er/args.batch_size == 1.0:
                    break
                if batch%30==0:
                    print('Truth:\n'+output_to_sequence(y,type=args.level))
                    print('Output:\n'+output_to_sequence(pre,type=args.level))
Expand Down Expand Up @@ -285,6 +291,7 @@ def test(self):
            if ckpt and ckpt.model_checkpoint_path:
                model.saver.restore(sess, ckpt.model_checkpoint_path)
                print('Model restored from:'+args.save_dir)

            batchErrors = np.zeros(len(batchedData))
            batchRandIxs = np.random.permutation(len(batchedData))
            for batch, batchOrigI in enumerate(batchRandIxs):
@@ -296,6 +303,39 @@ def test(self):
                            model.targetShape: batchTargetShape,
                            model.seqLengths: batchSeqLengths}

                if args.level == 'cha':
                    l, pre, y, er = sess.run([model.loss,
                                              model.predictions,
                                              model.targetY,
                                              model.errorRate],
                                             feed_dict=feedDict)
                    batchErrors[batch] = er
                    print('\ntotal:{},batch:{}/{},loss={:.3f},mean CER={:.3f}\n'.format(
                        totalN,
                        batch+1,
                        len(batchRandIxs),
                        l,
                        er/args.batch_size))

                elif args.level == 'phn':
                    l, pre, y = sess.run([model.loss,
                                          model.predictions,
                                          model.targetY],
                                         feed_dict=feedDict)
                    er = get_edit_distance([pre.values], [y.values], True, 'test', args.level)
                    print('\ntotal:{},batch:{}/{},loss={:.3f},mean PER={:.3f}\n'.format(
                        totalN,
                        batch+1,
                        len(batchRandIxs),
                        l,
                        er/args.batch_size))
                    batchErrors[batch] = er*len(batchSeqLengths)


                print('Truth:\n'+output_to_sequence(y,type=args.level))
                print('Output:\n'+output_to_sequence(pre,type=args.level))

                '''
                l, pre, y = sess.run([ model.loss,
                                       model.predictions,
                                       model.targetY],
@@ -305,12 +345,11 @@ def test(self):
                er = get_edit_distance([pre.values], [y.values], True, 'test', args.level)
                print(output_to_sequence(y,type=args.level))
                print(output_to_sequence(pre,type=args.level))
                '''
                with open(args.task+'_result.txt', 'a') as result:
                    result.write(output_to_sequence(y,type=args.level)+'\n')
                    result.write(output_to_sequence(pre,type=args.level)+'\n')
                    result.write('\n')
                print('Minibatch', batch+1, 'test error rate:', er)
                batchErrors[batch] = er*len(batchSeqLengths)
            epochER = batchErrors.sum() / totalN
            print(args.task+' test error rate:', epochER)
            logging(model,self.logfile,epochER,mode='test')
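After the batch loop, test() reports batchErrors.sum() / totalN as the epoch error. A standalone sketch of that aggregation (names are illustrative, not the repo's API), assuming each entry is the error already summed over the utterances of its batch:

```python
import numpy as np

def epoch_error_rate(batch_errors, batch_sizes):
    # batch_errors[i]: edit-distance-based error summed over batch i
    # batch_sizes[i]:  number of utterances in batch i
    total_n = sum(batch_sizes)
    return float(np.sum(batch_errors)) / total_n

# e.g. two batches of 32 utterances with summed errors 8.0 and 6.4
print(epoch_error_rate([8.0, 6.4], [32, 32]))  # 0.225
```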
