improve reusable support
Former-commit-id: 870d68a
nq555222 committed Mar 15, 2017
1 parent c324671 commit a8990ae
Showing 9 changed files with 4,702 additions and 90 deletions.
10 changes: 6 additions & 4 deletions README.md
@@ -1,16 +1,18 @@
# Automatic-Speech-Recognition
End-to-end automatic speech recognition system implemented in TensorFlow.

## Test Updates
- [x] **Test 1.0 version has been released** (2017-02-26)

## Recent Updates
- [x] **Support TensorFlow r1.0** (2017-02-24)
- [x] **Support dropout for dynamic RNN** (2017-03-11)
- [x] **Support running from a shell script** (2017-03-11)
- [x] **Support automatic evaluation every few training epochs** (2017-03-11)
- [x] **Fix bugs for character-level automatic speech recognition** (2017-03-14)
- [x] **Improve some function APIs for reusability** (2017-03-14)
- [x] **Add scaling for data preprocessing** (2017-03-15)
- [x] **Add reusable support for LibriSpeech training** (2017-03-15)

## PER-based dynamic BLSTM on TIMIT database, with casual tuning because time is limited
## Performance
### PER-based dynamic BLSTM on TIMIT database, with casual tuning because time is limited
![image](https://github.com/zzw922cn/Automatic_Speech_Recognition/blob/master/PER.png)

## Content
7 changes: 4 additions & 3 deletions src/analysis/parseLog.py
@@ -64,6 +64,7 @@ def visualize(trainERs, testERs):
    ax2.grid()
    plt.suptitle('dynamic bidirectional LSTM for Automatic Speech Recognition')
    plt.show()
rootdir = '/home/pony/github/data/ASR/log/'
train, test = readlogs(rootdir)
visualize(train, test)
if __name__ == '__main__':
    rootdir = '/home/pony/github/data/ASR/log/'
    train, test = readlogs(rootdir)
    visualize(train, test)
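Wrapping the driver code in an `if __name__ == '__main__':` guard is what makes `parseLog.py` reusable: the plotting no longer runs on import. A minimal usage sketch, assuming the package layout allows this import and that `readlogs`/`visualize` keep the signatures shown above (the log directory is a placeholder path):

```python
# Hypothetical caller that reuses the helpers without triggering the
# module-level plotting that previously ran on import.
from src.analysis.parseLog import readlogs, visualize

train_errors, test_errors = readlogs('/path/to/ASR/log/')  # placeholder log directory
visualize(train_errors, test_errors)  # plot train vs. test error rates per epoch
```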
9 changes: 5 additions & 4 deletions src/feature/libri_preprocess.py
@@ -30,6 +30,8 @@
import cPickle
import glob
import sys
import sklearn
from sklearn import preprocessing


count = 0
@@ -38,7 +40,7 @@

keywords = ['dev-clean', 'dev-other', 'test-clean', 'test-other', 'train-clean-100', 'train-clean-360', 'train-other-500']

keyword = keywords[6]
keyword = keywords[0]
label_dir = '/home/pony/github/data/libri/cha-level/'+keyword+'/label/'
mfcc_dir = '/home/pony/github/data/libri/cha-level/'+keyword+'/mfcc/'
if not os.path.exists(label_dir):
@@ -58,17 +60,16 @@
print fullFilename
(rate,sig)= wav.read(fullFilename)
mfcc = calcMFCC_delta_delta(sig,rate,win_length=0.020,win_step=0.010)
mfcc = preprocessing.scale(mfcc)
# transpose mfcc to array of (39,time_length)
mfcc = np.transpose(mfcc)
print mfcc.shape
# save mfcc to file
m_f = mfcc_dir + filenameNoSuffix.split('/')[-1] +'.npy'
np.save(m_f,mfcc)

labelFilename = filenameNoSuffix + '.label'
with open(labelFilename,'r') as f:
    characters = f.readline().strip()
    print characters
    characters = f.readline().strip().lower()
targets = []
## totally 28 real characters
for c in characters:
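The newly added `preprocessing.scale(mfcc)` call standardizes each of the 39 MFCC(+delta+delta) coefficients to roughly zero mean and unit variance over the utterance before the array is transposed and saved. A minimal sketch of the effect, with random data standing in for real features:

```python
import numpy as np
from sklearn import preprocessing

# Stand-in for calcMFCC_delta_delta output: a (time_length, 39) feature matrix.
mfcc = np.random.randn(200, 39) * 5.0 + 3.0

scaled = preprocessing.scale(mfcc)  # column-wise standardization
print(scaled.mean(axis=0))          # ~0 for every coefficient
print(scaled.std(axis=0))           # ~1 for every coefficient

# As in the preprocessing scripts, features are stored as (39, time_length).
np.save('example_mfcc.npy', np.transpose(scaled))
```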
53 changes: 40 additions & 13 deletions src/feature/timit_preprocess.py
@@ -29,15 +29,23 @@
import numpy as np
import glob
import sys
import sklearn
from sklearn import preprocessing


## keywords can be set to either 'train' or 'test'
keywords = 'test'
level = 'cha'
keywords = 'train'

mfcc_dir = '/home/pony/github/data/timit/'+keywords+'/mfcc/'
label_dir = '/home/pony/github/data/timit/'+keywords+'/label/'
mfcc_dir = '/home/pony/github/data/timit/'+level+'/'+keywords+'/mfcc/'
label_dir = '/home/pony/github/data/timit/'+level+'/'+keywords+'/label/'

rootdir = '/home/pony/ASR/datasets/TIMIT/'+keywords
if not os.path.exists(label_dir):
    os.makedirs(label_dir)
if not os.path.exists(mfcc_dir):
    os.makedirs(mfcc_dir)

rootdir = '/media/pony/Seagate Expansion Drive/学习/语音识别/ASR数据库/TIMIT/'+keywords

count = 0
## original phonemes
@@ -54,20 +62,39 @@
if file.endswith('.WAV'):
    (rate,sig)= wav.read(fullFilename)
    mfcc = calcMFCC_delta_delta(sig,rate,win_length=0.020,win_step=0.010)
    mfcc = preprocessing.scale(mfcc)
    mfcc = np.transpose(mfcc)
    print mfcc.shape
    m_f = mfcc_dir + filenameNoSuffix.split('/')[-2]+'-'+filenameNoSuffix.split('/')[-1]+'.npy'
    np.save(m_f,mfcc)
    if level == 'phn':
        labelFilename = filenameNoSuffix + '.PHN'
        phenome = []
        with open(labelFilename,'r') as f:
            for line in f.read().splitlines():
                s=line.split(' ')[2]
                p_index = phn.index(s)
                phenome.append(p_index)
        print phenome
        phenome = np.array(phenome)
    elif level == 'cha':
        labelFilename = filenameNoSuffix + '.WRD'
        phenome = []
        sentence = ''
        with open(labelFilename,'r') as f:
            for line in f.read().splitlines():
                s=line.split(' ')[2]
                sentence += s+' '
                for c in s:
                    if c=="'":
                        phenome.append(27)
                    else:
                        phenome.append(ord(c)-96)
                phenome.append(0)
        phenome = phenome[:-1]
        print phenome
        print sentence

    labelFilename = filenameNoSuffix + '.PHN'
    phenome = []
    with open(labelFilename,'r') as f:
        for line in f.read().splitlines():
            s=line.split(' ')[2]
            p_index = phn.index(s)
            phenome.append(p_index)
    print phenome
    phenome = np.array(phenome)
    t_f = label_dir + filenameNoSuffix.split('/')[-2]+'-'+filenameNoSuffix.split('/')[-1]+'.npy'
    print t_f
    np.save(t_f,phenome)
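The new `cha` branch builds character-level labels from the `.WRD` transcription: `a`-`z` map to 1-26 via `ord(c)-96`, an apostrophe maps to 27, a 0 is appended after each word as the space symbol, and the trailing 0 is stripped, giving 28 symbols in total. A small self-contained sketch of that mapping and its inverse, using a made-up word list rather than a real TIMIT file:

```python
def encode_words(words):
    """Mirror of the cha-level branch above: 0 = space, 1-26 = a-z, 27 = apostrophe."""
    labels = []
    for w in words:
        for c in w:
            labels.append(27 if c == "'" else ord(c) - 96)
        labels.append(0)   # word boundary (space)
    return labels[:-1]     # drop the trailing space, as phenome[:-1] does

def decode_labels(labels):
    """Inverse mapping, handy for sanity-checking saved .npy label files."""
    return ''.join(' ' if i == 0 else ("'" if i == 27 else chr(i + 96)) for i in labels)

words = ['she', "hadn't"]        # made-up example
labels = encode_words(words)
print(labels)                    # [19, 8, 5, 0, 8, 1, 4, 14, 27, 20]
print(decode_labels(labels))     # she hadn't
```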
14 changes: 7 additions & 7 deletions src/models/dynamic_brnn.py
@@ -30,7 +30,6 @@

import numpy as np
import tensorflow as tf
from tensorflow.python.ops import ctc_ops as ctc
from tensorflow.contrib.rnn.python.ops import rnn_cell
from tensorflow.contrib.rnn.python.ops import core_rnn_cell_impl
from tensorflow.python.ops.rnn import bidirectional_dynamic_rnn
@@ -43,6 +42,7 @@
from src.utils.utils import build_conv_layer
from src.utils.utils import list_to_sparse_tensor
from src.utils.utils import dropout
from src.utils.utils import get_edit_distance

def build_multi_dynamic_brnn(args,
                             maxTimeSteps,
@@ -119,7 +119,8 @@ def build_graph(self, args, maxTimeSteps):
'activation':args.activation,
'optimizer':args.optimizer,
'learning rate':args.learning_rate,
'keep prob':args.keep_prob
'keep prob':args.keep_prob,
'batch size':args.batch_size
}

fbHrs = build_multi_dynamic_brnn(self.args,maxTimeSteps,self.inputX,self.cell_fn,self.seqLengths)
@@ -129,8 +130,7 @@ def build_graph(self, args, maxTimeSteps):
biasesClasses = tf.Variable(tf.zeros([args.num_class]),name='biasesClasses')
logits = [tf.matmul(t, weightsClasses) + biasesClasses for t in fbHrs]
logits3d = tf.stack(logits)
self.loss = tf.reduce_mean(ctc.ctc_loss(self.targetY, logits3d, self.seqLengths))

self.loss = tf.reduce_mean(tf.nn.ctc_loss(self.targetY, logits3d, self.seqLengths))
self.var_op = tf.global_variables()
self.var_trainable_op = tf.trainable_variables()

@@ -142,9 +142,9 @@ def build_graph(self, args, maxTimeSteps):
grads, _ = tf.clip_by_global_norm(tf.gradients(self.loss, self.var_trainable_op),args.grad_clip)
opti = tf.train.AdamOptimizer(args.learning_rate)
self.optimizer = opti.apply_gradients(zip(grads, self.var_trainable_op))
self.logitsMaxTest = tf.slice(tf.argmax(logits3d, 2), [0, 0], [self.seqLengths[0], 1])
self.predictions = tf.to_int32(ctc.ctc_beam_search_decoder(logits3d, self.seqLengths)[0][0])
self.errorRate = tf.reduce_sum(tf.edit_distance(self.predictions, self.targetY, normalize=False))/tf.to_float(tf.size(self.targetY.values))
self.predictions = tf.to_int32(tf.nn.ctc_beam_search_decoder(logits3d, self.seqLengths, merge_repeated=False)[0][0])
if args.level == 'cha':
    self.errorRate = tf.reduce_sum(tf.edit_distance(self.predictions, self.targetY, normalize=True))
self.initial_op = tf.global_variables_initializer()
self.saver = tf.train.Saver(tf.global_variables(),max_to_keep=5,keep_checkpoint_every_n_hours=1)
self.logfile = args.log_dir+str(datetime.datetime.strftime(datetime.datetime.now(),'%Y-%m-%d %H:%M:%S')+'.txt').replace(' ','').replace('/','')
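With TensorFlow r1.0, the graph now calls `tf.nn.ctc_loss` and `tf.nn.ctc_beam_search_decoder` directly instead of the old `ctc_ops` module, and reports a normalized edit distance as the character error rate when `args.level == 'cha'`. A minimal sketch of how these ops fit together, with made-up shapes and placeholder-fed tensors rather than the repository's full graph:

```python
import tensorflow as tf

max_time, batch_size, num_class = 100, 16, 29   # made-up sizes (28 characters + CTC blank)

# Time-major logits, as produced by stacking the per-timestep projections.
logits3d = tf.placeholder(tf.float32, [max_time, batch_size, num_class])
seqLengths = tf.placeholder(tf.int32, [batch_size])
targetY = tf.sparse_placeholder(tf.int32)       # sparse character labels

# In r1.0 the argument order is (labels, inputs, sequence_length).
loss = tf.reduce_mean(tf.nn.ctc_loss(targetY, logits3d, seqLengths))

# Beam-search decoding; merge_repeated=False keeps repeated characters, as in the diff.
decoded, _ = tf.nn.ctc_beam_search_decoder(logits3d, seqLengths, merge_repeated=False)
predictions = tf.to_int32(decoded[0])

# Summed normalized edit distance between predictions and targets (character error rate).
errorRate = tf.reduce_sum(tf.edit_distance(predictions, targetY, normalize=True))
```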