
Commit

fix and add n-gram
Former-commit-id: 50cdeba
nq555222 committed Mar 22, 2017
1 parent 6edb9ca commit 713d7fa
Showing 6 changed files with 206 additions and 28 deletions.
2 changes: 2 additions & 0 deletions README.md
@@ -10,6 +10,8 @@ End-to-end automatic speech recognition system implemented in TensorFlow.
- [x] **Improve some function APIs for reusability** (2017-03-14)
- [x] **Add scaling for data preprocessing** (2017-03-15)
- [x] **Add reusable support for LibriSpeech training** (2017-03-15)
- [x] **Add simple n-gram model for random generation or statistical use** (2017-03-23)
- [x] **Improve some code for pre-processing and training** (2017-03-23)

## Performance
### PER based on dynamic BLSTM on the TIMIT database, with casual tuning because time is limited
3 changes: 2 additions & 1 deletion src/feature/libri_preprocess.py
@@ -38,7 +38,7 @@
#subset = 0
#labels=[]

keywords = ['dev-clean', 'dev-other', 'test-clean', 'test-other', 'train-clean-100', 'train-clean-360', 'train-other-500']
keywords = ['myaudio', 'dev-clean', 'dev-other', 'test-clean', 'test-other', 'train-clean-100', 'train-clean-360', 'train-other-500']

keyword = keywords[0]
label_dir = '/home/pony/github/data/libri/cha-level/'+keyword+'/label/'
@@ -49,6 +49,7 @@
os.makedirs(mfcc_dir)

rootdir = '/media/pony/Seagate Expansion Drive/学习/语音识别/ASR数据库/LibriSpeech/'+keyword
rootdir = '/home/pony/github/data/'+keyword

if True:
    for subdir, dirs, files in os.walk(rootdir):
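A minimal sketch (not in the repo) of how the added 'myaudio' keyword redirects preprocessing: since keyword = keywords[0], the default selection now points at a user-supplied dataset, and the new rootdir assignment walks /home/pony/github/data/myaudio instead of the LibriSpeech mount. The mfcc_dir path below is assumed to mirror the label_dir pattern shown in the diff:

```python
import os

keyword = 'myaudio'  # keywords[0] after this change, so the default selection
label_dir = '/home/pony/github/data/libri/cha-level/' + keyword + '/label/'
mfcc_dir = '/home/pony/github/data/libri/cha-level/' + keyword + '/mfcc/'   # assumed: symmetric to label_dir
rootdir = '/home/pony/github/data/' + keyword                               # new rootdir from this commit

# create the output folders if they do not exist yet
for d in (label_dir, mfcc_dir):
    if not os.path.exists(d):
        os.makedirs(d)

# every file under the user-supplied dataset is then visited here
for subdir, dirs, files in os.walk(rootdir):
    pass  # per-file MFCC/label extraction happens in libri_preprocess.py
```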
39 changes: 39 additions & 0 deletions src/models/n-gram/generate.py
@@ -0,0 +1,39 @@

import numpy as np
import pickle

def load_obj(name):
    with open(name + '.pkl', 'rb') as f:
        return pickle.load(f)

def frequence(gram, type=2):
    if type == 2:
        for key, value in gram.items():
            total = 0.0
            for subkey, subvalue in value.items():
                total += subvalue

            for subkey, subvalue in value.items():
                gram[key][subkey] = subvalue/total
    else:
        raise NotImplementedError('%s-gram is being developed'%type)
    return gram


def generate_sentence(corpus_dir, seed='what are', length=10):
    bigram = load_obj(corpus_dir+'bigram')
    freq_bigram = frequence(bigram)
    sent = ''
    if not ' ' in seed:
        sent += seed
    prev = seed
    for i in range(length):
        probs = []
        for _, value in freq_bigram[prev].items():
            probs.append(value)
        sample = np.random.choice(range(len(freq_bigram[prev])),p=probs)
        prev = freq_bigram[prev].keys()[sample]
        sent += ' '+prev
    print sent

generate_sentence('/home/pony/github/data/libri/ngram/', seed='love', length=10)
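generate.py above is Python 2 code (`print sent`, indexing into `dict.keys()`). A minimal Python 3 sketch of the same bigram sampling step, assuming `bigram.pkl` holds the `{word: {next_word: count}}` dict produced by ngram.py below:

```python
import pickle
import numpy as np

def sample_next(bigram, prev):
    # normalize the successor counts of `prev` into probabilities and draw one word
    successors = list(bigram[prev].keys())
    counts = np.array([bigram[prev][w] for w in successors], dtype=float)
    return np.random.choice(successors, p=counts / counts.sum())

def generate(corpus_dir, seed='love', length=10):
    with open(corpus_dir + 'bigram.pkl', 'rb') as f:
        bigram = pickle.load(f)
    words = [seed]
    for _ in range(length):
        nxt = sample_next(bigram, words[-1])
        if nxt == 'EOS':          # stop at the end-of-sentence marker
            break
        words.append(nxt)
    return ' '.join(words)

print(generate('/home/pony/github/data/libri/ngram/', seed='love', length=10))
```

Unlike the committed version, this keeps the normalization inside the sampling step, avoids the Python 2-only `dict.keys()[i]` indexing, and stops at the 'EOS' marker instead of sampling past it.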
95 changes: 95 additions & 0 deletions src/models/n-gram/ngram.py
@@ -0,0 +1,95 @@
# -*- coding:utf-8 -*-
#!/usr/bin/python

''' Language modelling for automatic speech recognition based on n-gram
author:
iiiiiiiiiiii iiiiiiiiiiii !!!!!!! !!!!!!
# ### # ### ### I# #:
# ### # I##; ##; ## ##
### ### !## #### #
### ### ### ## ### #'
!##; `##% ##; ## ### ##
### ### $## `# ## #
### # ### # #### ####;
`### -# ### `# ### ###
############## ############## `# #
date:2017-04-17
'''
import numpy as np
import os
import operator
import pickle

def save_obj(name, obj):
    with open(name + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

class NGram:
    def __init__(self, rootdir):
        self.rootdir = rootdir

    def get_corpus(self):
        corpus = []
        word_count = {}
        biword_count = {}
        bigram = {}
        bigram['SOS'] = {}
        trigram = {}
        for subdir, dirs, files in os.walk(self.rootdir):
            for f in files:
                fullFilename = os.path.join(subdir, f)
                filenameNoSuffix = os.path.splitext(fullFilename)[0]
                if f.endswith('.label'):
                    with open(fullFilename, 'r') as f:
                        line = f.readline()
                        corpus.append(line)
                        line = line.strip().split(' ')
                        len_sent = range(len(line))
                        for idx in len_sent:
                            word = line[idx]
                            word_count = inc_dict(word_count, word)

                            if not bigram.has_key(word):
                                bigram[word] = {}

                            if idx == 0:
                                bigram['SOS'] = inc_dict(bigram['SOS'], word)

                            elif idx != len(line)-1:
                                bigram[word] = inc_dict(bigram[word], line[idx+1])

                            else:
                                bigram[word] = inc_dict(bigram[word], 'EOS')

                            if idx == 0:
                                tri_key = 'SOS ' + word
                            else:
                                tri_key = line[idx-1]+' '+word
                            if not trigram.has_key(tri_key):
                                trigram[tri_key] = {}
                            if idx == len(line)-1:
                                trigram[tri_key] = inc_dict(trigram[tri_key], 'EOS')
                            else:
                                trigram[tri_key] = inc_dict(trigram[tri_key], line[idx+1])

        return corpus, word_count, bigram, trigram


def inc_dict(dic, key):
    if not dic.has_key(key):
        dic[key] = 0
    dic[key] += 1
    return dic


if __name__ == '__main__':
    ngram = NGram('/media/pony/Seagate Expansion Drive/学习/语音识别/ASR数据库/LibriSpeech/')
    corpus, word_count, bigram, trigram = ngram.get_corpus()
    savedir = '/home/pony/github/data/libri/ngram/'
    save_obj(savedir+'corpus', corpus)
    save_obj(savedir+'word_count', word_count)
    save_obj(savedir+'bigram', bigram)
    save_obj(savedir+'trigram', trigram)
    #sorted_word_count = sorted(word_count.items(), key=operator.itemgetter(1), reverse=True)
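A worked example (not part of the commit) of the structures `NGram.get_corpus()` returns for a hypothetical corpus with a single label file containing `the cat sat`:

```python
word_count = {'the': 1, 'cat': 1, 'sat': 1}

bigram = {
    'SOS': {'the': 1},   # sentence start -> first word
    'the': {},           # as written, the first word's own successor is not counted (the elif skips idx == 0)
    'cat': {'sat': 1},
    'sat': {'EOS': 1},   # last word -> sentence end
}

trigram = {
    'SOS the': {'cat': 1},
    'the cat': {'sat': 1},
    'cat sat': {'EOS': 1},
}
```

generate.py's frequence() then turns each inner count dict into a probability distribution over successors.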
10 changes: 6 additions & 4 deletions src/run.sh
@@ -1,13 +1,15 @@
#!/bin/bash

for loop in {2..30}
for loop in {3..30}
do
echo "loop is $loop"
b=$(( $loop % 3 ))
echo "dataset is index $b"
if [ $loop -eq 1 ]
then
/usr/bin/python train.py --mode=train
/usr/bin/python train.py --lb=$b
else
/usr/bin/python train.py --mode=train --keep=True
fi
/usr/bin/python train.py --lb=$b --keep=True
/usr/bin/python train.py --mode=test
fi
done
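The reworked loop no longer passes --mode=train; it cycles a dataset index into --lb instead. A small sketch (not in the repo) of the schedule it produces, using the libri_data list from the updated train.py:

```python
# loop runs 3..30 in run.sh; b = loop % 3 is forwarded to train.py as --lb
libri_data = ['dev', 'train-clean-100', 'train-clean-360', 'train-other-500']
for loop in range(3, 31):
    b = loop % 3
    print(loop, b, libri_data[b])
# -> 3 0 dev, 4 1 train-clean-100, 5 2 train-clean-360, 6 0 dev, ...
```

Since loop starts at 3, the `[ $loop -eq 1 ]` branch never fires, so every iteration trains with --keep=True and then runs --mode=test; index 3 ('train-other-500') is never reached by loop % 3.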
85 changes: 62 additions & 23 deletions src/train.py
@@ -54,31 +54,40 @@
class Trainer(object):

    def __init__(self):
        parser = argparse.ArgumentParser()
        cat = 1
        libri_data = ['timit', 'dev', 'train-clean-100', 'train-other-240', 'train-other-260','train-clean-360']
        parser.add_argument('--lb', type=int, default=2, help='specify the dataset of libri')
        parser.add_argument('--keep', type=bool, default=False,
                            help='train the model based on model saved')
        parser.add_argument('--mode', type=str, default='train',
                            help='you can select two modes, "train" or "test"')
        self.args = parser.parse_args()
        lb = self.args.lb
        #lb = 2

        libri_data = ['dev', 'train-clean-100', 'train-clean-360', 'train-other-500']
        lb = 1

        train_mfcc_dir = ['/home/pony/github/data/timit/train/mfcc/',
        train_mfcc_dir = ['/home/pony/github/data/timit/cha/train/mfcc/',
                          '/home/pony/github/data/libri/cha-level/'+libri_data[lb]+'/mfcc/']

        train_label_dir = ['/home/pony/github/data/timit/train/label/',
        train_label_dir = ['/home/pony/github/data/timit/cha/train/label/',
                           '/home/pony/github/data/libri/cha-level/'+libri_data[lb]+'/label/']

        test_mfcc_dir = ['/home/pony/github/data/timit/test/mfcc/',
        test_mfcc_dir = ['/home/pony/github/data/timit/cha/test/mfcc/',
                         '/home/pony/github/data/libri/cha-level/test/mfcc/']

        test_label_dir = ['/home/pony/github/data/timit/test/label/',
        test_label_dir = ['/home/pony/github/data/timit/cha/test/label/',
                          '/home/pony/github/data/libri/cha-level/test/label/']

        task = ['timit', 'libri']
        level = ['phn', 'cha']
        level = ['cha', 'cha']
        timit_config = [['phn',62], ['cha', 29]]
        num_hidden = [128, 256]
        num_class = [62, 29]

        #num_class = [62, 29]
        num_class = [29, 29]
        save_dir = ['/home/pony/github/data/ASR/save/timit/', '/home/pony/github/data/ASR/save/libri/']
        log_dir = ['/home/pony/github/data/ASR/log/timit/', '/home/pony/github/data/ASR/log/libri/']

        parser = argparse.ArgumentParser()
        parser.add_argument('--task', type=str, default=task[cat], help='two tasks now, timit or libri')

        parser.add_argument('--level', type=str, default=level[cat], help='two levels now, phn or cha')
@@ -101,7 +110,7 @@ def __init__(self):
        parser.add_argument('--model', default='DBiRNN',
                            help='model for ASR:DBiRNN,BiRNN,ResNet,...')

        parser.add_argument('--keep_prob', type=float, default=0.99,
        parser.add_argument('--keep_prob', type=float, default=1,
                            help='set the keep probability of layer for dropout')

        parser.add_argument('--rnncell', type=str, default='gru',
@@ -110,28 +119,22 @@ def __init__(self):
        parser.add_argument('--num_layer', type=int, default=2,
                            help='set the number of hidden layer or bidirectional layer')

        parser.add_argument('--activation', default=tf.nn.relu,
        parser.add_argument('--activation', default=tf.nn.elu,
                            help='set the activation function of each layer')

        parser.add_argument('--optimizer', type=type, default=tf.train.AdamOptimizer,
                            help='set the optimizer to train the model,eg:AdamOptimizer,GradientDescentOptimizer')

        parser.add_argument('--grad_clip', default=15,
        parser.add_argument('--grad_clip', default=0.8,
                            help='set gradient clipping when backpropagating errors')

        parser.add_argument('--keep', type=bool, default=False,
                            help='train the model based on model saved')

        parser.add_argument('--save', type=bool, default=True,
                            help='to save the model in the disk')

        parser.add_argument('--mode', type=str, default='train',
                            help='test the model based on trained parameters, but at present, we can"t test during training.')

        parser.add_argument('--learning_rate', type=float, default=0.001,
        parser.add_argument('--learning_rate', type=float, default=0.0001,
                            help='set the step size of each iteration')

        parser.add_argument('--num_epoch', type=int, default=5000,
        parser.add_argument('--num_epoch', type=int, default=1,
                            help='set the total number of training epochs')

        parser.add_argument('--batch_size', type=int, default=32,
@@ -244,6 +247,9 @@ def train(self):
                        l,
                        er/args.batch_size))
                batchErrors[batch] = er*len(batchSeqLengths)
                # NOTE:
                if er/args.batch_size == 1.0:
                    break
                if batch%30==0:
                    print('Truth:\n'+output_to_sequence(y,type=args.level))
                    print('Output:\n'+output_to_sequence(pre,type=args.level))
Expand Down Expand Up @@ -285,6 +291,7 @@ def test(self):
            if ckpt and ckpt.model_checkpoint_path:
                model.saver.restore(sess, ckpt.model_checkpoint_path)
                print('Model restored from:'+args.save_dir)

            batchErrors = np.zeros(len(batchedData))
            batchRandIxs = np.random.permutation(len(batchedData))
            for batch, batchOrigI in enumerate(batchRandIxs):
@@ -296,6 +303,39 @@ def test(self):
                            model.targetShape: batchTargetShape,
                            model.seqLengths: batchSeqLengths}

                if args.level == 'cha':
                    l, pre, y, er = sess.run([model.loss,
                                              model.predictions,
                                              model.targetY,
                                              model.errorRate],
                                             feed_dict=feedDict)
                    batchErrors[batch] = er
                    print('\ntotal:{},batch:{}/{},loss={:.3f},mean CER={:.3f}\n'.format(
                        totalN,
                        batch+1,
                        len(batchRandIxs),
                        l,
                        er/args.batch_size))

                elif args.level == 'phn':
                    l, pre, y = sess.run([model.loss,
                                          model.predictions,
                                          model.targetY],
                                         feed_dict=feedDict)
                    er = get_edit_distance([pre.values], [y.values], True, 'test', args.level)
                    print('\ntotal:{},batch:{}/{},loss={:.3f},mean PER={:.3f}\n'.format(
                        totalN,
                        batch+1,
                        len(batchRandIxs),
                        l,
                        er/args.batch_size))
                    batchErrors[batch] = er*len(batchSeqLengths)


                print('Truth:\n'+output_to_sequence(y,type=args.level))
                print('Output:\n'+output_to_sequence(pre,type=args.level))

                '''
                l, pre, y = sess.run([ model.loss,
                                       model.predictions,
                                       model.targetY],
@@ -305,12 +345,11 @@ def test(self):
                er = get_edit_distance([pre.values], [y.values], True, 'test', args.level)
                print(output_to_sequence(y,type=args.level))
                print(output_to_sequence(pre,type=args.level))
                '''
                with open(args.task+'_result.txt', 'a') as result:
                    result.write(output_to_sequence(y,type=args.level)+'\n')
                    result.write(output_to_sequence(pre,type=args.level)+'\n')
                    result.write('\n')
                print('Minibatch', batch+1, 'test error rate:', er)
                batchErrors[batch] = er*len(batchSeqLengths)
            epochER = batchErrors.sum() / totalN
            print(args.task+' test error rate:', epochER)
            logging(model,self.logfile,epochER,mode='test')
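After the batch loop, test() reports batchErrors.sum() / totalN as the epoch error. A standalone sketch of that aggregation (names are illustrative, not the repo's API), assuming each entry is the error already summed over the utterances of its batch:

```python
import numpy as np

def epoch_error_rate(batch_errors, batch_sizes):
    # batch_errors[i]: edit-distance-based error summed over batch i
    # batch_sizes[i]:  number of utterances in batch i
    total_n = sum(batch_sizes)
    return float(np.sum(batch_errors)) / total_n

# e.g. two batches of 32 utterances with summed errors 8.0 and 6.4
print(epoch_error_rate([8.0, 6.4], [32, 32]))  # 0.225
```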
