Commit
fix bugs
Former-commit-id: e8e7025
nq555222 committed Dec 14, 2016
1 parent 307e992 commit 9b96088
Showing 21 changed files with 57 additions and 49 deletions.
4 changes: 2 additions & 2 deletions README.md
@@ -45,11 +45,11 @@ Instead of configuration in command line, you can also set the arguments above i
### Data preprocessing
The original TIMIT database contains 6300 utterances, but we found that the 'SA' audio files occur many times, which would introduce a bad bias into our speech recognition system. Therefore, we removed all 'SA' files from the original dataset to obtain a new TIMIT dataset, which contains only 5040 utterances: a standard training set of 3696 and a test set of 1344.

Automatic Speech Recognition transcribes a raw audio file into character sequences; data preprocessing converts a raw audio file into feature vectors of several frames. Here, we first split each audio file with a 20ms Hamming window with no overlap, and then calculate 12 mel-frequency cepstral coefficients plus an energy variable for each frame. Based on this vector of length 13, we calculate the delta and delta-delta coefficients, for a total of 39 coefficients per frame. Therefore, each audio file is split into several frames by the Hamming window, and each frame is converted into a feature vector of length 39.
Automatic Speech Recognition transcribes a raw audio file into character sequences; data preprocessing converts a raw audio file into feature vectors of several frames. Here, we first split each audio file with a 20ms Hamming window with an overlap of 10ms, and then calculate 12 mel-frequency cepstral coefficients plus an energy variable for each frame. Based on this vector of length 13, we calculate the delta and delta-delta coefficients, for a total of 39 coefficients per frame. Therefore, each audio file is split into several frames by the Hamming window, and each frame is converted into a feature vector of length 39. If you want feature vectors of a different length, you can adjust the settings in [timit_preprocess.py](https://github.com/zzw922cn/Automatic-Speech-Recognition/blob/master/src/feature/timit_preprocess.py).
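
As a concrete illustration, here is a minimal sketch of that 39-dimensional pipeline using the `python_speech_features` package (an assumption for illustration only; the repository's own calcmfcc.py is the authoritative implementation):

```python
import numpy as np
import scipy.io.wavfile as wav
from python_speech_features import mfcc, delta

# hypothetical input file; TIMIT's NIST-sphere audio must first be converted to plain wav
rate, signal = wav.read('sample.wav')

# 20ms Hamming window with a 10ms step (i.e. 10ms overlap); 12 cepstral
# coefficients plus an energy term -> 13 static features per frame
static = mfcc(signal, samplerate=rate, winlen=0.020, winstep=0.010,
              numcep=13, appendEnergy=True, winfunc=np.hamming)

d1 = delta(static, 2)                   # delta coefficients
d2 = delta(d1, 2)                       # delta-delta coefficients
features = np.hstack([static, d1, d2])  # shape: (timeLength, 39)
```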

In folder data/mfcc, each file is a timeLength×39 feature matrix for one audio file; in folder data/label, each file is the label vector corresponding to that mfcc file.
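
For example, assuming each matrix was written with `numpy.save` (the file names here are hypothetical), one sample pair can be loaded like this:

```python
import numpy as np

feat = np.load('data/mfcc/utterance_0.npy')    # shape (timeLength, 39)
label = np.load('data/label/utterance_0.npy')  # phoneme indices for the same utterance
```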

If you want to customize the data preprocessing, you can edit [calcmfcc.py](https://github.com/zzw922cn/Automatic-Speech-Recognition/blob/master/src/feature/calcmfcc.py) or [preprocess.py](https://github.com/zzw922cn/Automatic-Speech-Recognition/blob/master/src/feature/preprocess.py).
If you want to customize the data preprocessing, you can edit [calcmfcc.py](https://github.com/zzw922cn/Automatic-Speech-Recognition/blob/master/src/feature/calcmfcc.py) or [timit_preprocess.py](https://github.com/zzw922cn/Automatic-Speech-Recognition/blob/master/src/feature/timit_preprocess.py).

Since the original TIMIT dataset contains 61 phonemes, we use all 61 phonemes for training and evaluation, but when scoring we map the 61 phonemes down to 39 phonemes for better performance. We do this mapping according to the paper [Speaker-independent phone recognition using hidden Markov models](http://repository.cmu.edu/cgi/viewcontent.cgi?article=2768&context=compsci). The mapping details are as follows:

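The full table is collapsed in this diff view; as a sketch, the standard folding from the Lee & Hon paper groups phones like this (entries not listed map to themselves, and the repository's own table may differ in minor details):

```python
# partial 61 -> 39 phoneme mapping (Lee & Hon, 1989)
phoneme_map = {
    'ao': 'aa', 'ax': 'ah', 'ax-h': 'ah', 'axr': 'er', 'hv': 'hh',
    'ix': 'ih', 'el': 'l', 'em': 'm', 'en': 'n', 'nx': 'n',
    'eng': 'ng', 'zh': 'sh', 'ux': 'uw',
    # closures, pauses and non-speech events are folded into one silence class
    'pcl': 'sil', 'tcl': 'sil', 'kcl': 'sil', 'bcl': 'sil',
    'dcl': 'sil', 'gcl': 'sil', 'h#': 'sil', 'pau': 'sil', 'epi': 'sil',
    # the glottal stop 'q' is discarded during scoring
}
```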
12 changes: 12 additions & 0 deletions log/timit/2016-12-1413:03:54.txt
@@ -0,0 +1,12 @@
{'optimizer': <class 'tensorflow.python.training.adam.AdamOptimizer'>, 'name': 'residual network', 'learning rate': 0.01, 'trainable params': 1948274, 'num_layer': 10, 'num_class': 62, 'all params': 5844824.0, 'num_featuremap': 16}
13:08:10 12/14/16 CST
Epoch:1 train error rate:0.95813536386
Epoch:1 train time:145.198076963 s
{'optimizer': <class 'tensorflow.python.training.adam.AdamOptimizer'>, 'name': 'residual network', 'learning rate': 0.01, 'trainable params': 1948274, 'num_layer': 10, 'num_class': 62, 'all params': 5844824.0, 'num_featuremap': 16}
13:11:07 12/14/16 CST
Epoch:2 train error rate:0.849993554545
Epoch:2 train time:140.668783903 s
{'optimizer': <class 'tensorflow.python.training.adam.AdamOptimizer'>, 'name': 'residual network', 'learning rate': 0.01, 'trainable params': 1948274, 'num_layer': 10, 'num_class': 62, 'all params': 5844824.0, 'num_featuremap': 16}
13:13:59 12/14/16 CST
Epoch:3 train error rate:0.851595846089
Epoch:3 train time:136.583483934 s
4 changes: 3 additions & 1 deletion save/timit/checkpoint
@@ -1,2 +1,4 @@
model_checkpoint_path: "/home/pony/github/Automatic-Speech-Recognition/save/timit/model.ckpt-0"
model_checkpoint_path: "/home/pony/github/Automatic-Speech-Recognition/save/timit/model.ckpt-2"
all_model_checkpoint_paths: "/home/pony/github/Automatic-Speech-Recognition/save/timit/model.ckpt-0"
all_model_checkpoint_paths: "/home/pony/github/Automatic-Speech-Recognition/save/timit/model.ckpt-1"
all_model_checkpoint_paths: "/home/pony/github/Automatic-Speech-Recognition/save/timit/model.ckpt-2"
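
This `checkpoint` file is the small text index that TensorFlow's Saver maintains; a minimal sketch of restoring the newest checkpoint it points to (assuming a `saver` and a `sess` already exist):

```python
import tensorflow as tf

ckpt = tf.train.get_checkpoint_state(
    '/home/pony/github/Automatic-Speech-Recognition/save/timit/')
if ckpt and ckpt.model_checkpoint_path:
    # resolves to .../model.ckpt-2, the newest entry listed above
    saver.restore(sess, ckpt.model_checkpoint_path)
```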
1 change: 0 additions & 1 deletion save/timit/model.ckpt-0.REMOVED.git-id

This file was deleted.

1 change: 1 addition & 0 deletions save/timit/model.ckpt-0.data-00000-of-00001.REMOVED.git-id
@@ -0,0 +1 @@
851a94ba64f26cc1682d604f83d6e482a1c3ecff
Binary file added save/timit/model.ckpt-0.index
1 change: 1 addition & 0 deletions save/timit/model.ckpt-0.meta.REMOVED.git-id
@@ -0,0 +1 @@
cebb0fe2b6e3962034284b5495b5ea31f00c7121
1 change: 1 addition & 0 deletions save/timit/model.ckpt-1.data-00000-of-00001.REMOVED.git-id
@@ -0,0 +1 @@
b6ad35fa64199c5a7f23ea8e72660d6399b3ff8a
Binary file added save/timit/model.ckpt-1.index
1 change: 1 addition & 0 deletions save/timit/model.ckpt-1.meta.REMOVED.git-id
@@ -0,0 +1 @@
cebb0fe2b6e3962034284b5495b5ea31f00c7121
1 change: 1 addition & 0 deletions save/timit/model.ckpt-2.data-00000-of-00001.REMOVED.git-id
@@ -0,0 +1 @@
25f336b424a94caad171696986eb59b9c5ac7d54
Binary file added save/timit/model.ckpt-2.index
1 change: 1 addition & 0 deletions save/timit/model.ckpt-2.meta.REMOVED.git-id
@@ -0,0 +1 @@
cebb0fe2b6e3962034284b5495b5ea31f00c7121
10 changes: 7 additions & 3 deletions src/feature/timit_preprocess.py
@@ -30,10 +30,14 @@
import glob
import sys

mfcc_dir = '/home/pony//github/data/timit/test/mfcc/'
label_dir = '/home/pony//github/data/timit/test/label/'

rootdir = '/home/pony/ASR/datasets/TIMIT/TEST/'
## keywords can be set to either of 'train' and 'test'
keywords = 'test'

mfcc_dir = '/home/pony/github/data/timit/'+keywords+'/mfcc/'
label_dir = '/home/pony/github/data/timit/'+keywords+'/label/'

rootdir = '/home/pony/ASR/datasets/TIMIT/'+keywords

count = 0
## original phonemes
Expand Down
25 changes: 11 additions & 14 deletions src/main/brnn.py
@@ -42,18 +42,15 @@
from utils import build_forward_layer
from utils import build_conv_layer

def build_multi_brnn(args,maxTimeSteps,inputList,cell_fn=rnn_cell.BasicRNNCell):
# list of batchsize length, each is [seqlength*hidden]
def build_multi_brnn(args,maxTimeSteps,inputList,cell_fn,seqLengths):
hid_input = inputList
for i in range(args.num_layer):
scope = 'BRNN_'+str(i+1)
forwardH = cell_fn(args.num_hidden,activation=args.activation)

# backward layer
backwardH = cell_fn(args.num_hidden,activation=args.activation)

# bi-directional layer
fbH, f_state, b_state = bidirectional_rnn(forwardH,backwardH,hid_input,dtype=tf.float32,scope=scope)
forward_cell = cell_fn(args.num_hidden,activation=args.activation)
backward_cell = cell_fn(args.num_hidden,activation=args.activation)
fbH, f_state, b_state = bidirectional_rnn(forward_cell,backward_cell,
hid_input,dtype=tf.float32,sequence_length=seqLengths,scope=scope)

fbHrs = [tf.reshape(t, [args.batch_size, 2, args.num_hidden]) for t in fbH]
if i != args.num_layer-1:
@@ -63,10 +60,8 @@ def build_multi_brnn(args,maxTimeSteps,inputList,cell_fn=rnn_cell.BasicRNNCell):
# output size is seqlength*batchsize*hidden
output = tf.reduce_sum(output,2)

print(output.get_shape().as_list())
# outputXrs is of size [seqlength*batchsize, num_hidden]
outputXrs = tf.reshape(output, [-1, args.num_hidden])

hid_input = tf.split(0, maxTimeSteps, outputXrs) # split [maxTimeSteps*batch_size, num_hidden] back into a list of maxTimeSteps tensors, each [batch_size, num_hidden]

return fbHrs
@@ -110,7 +105,7 @@ def build_graph(self, args, maxTimeSteps):
'learning rate':args.learning_rate
}

fbHrs = build_multi_brnn(self.args,maxTimeSteps,self.inputList,self.cell_fn)
fbHrs = build_multi_brnn(self.args,maxTimeSteps,self.inputList,self.cell_fn,self.seqLengths)
with tf.name_scope('fc-layer'):
with tf.variable_scope('fc'):
weightsOutH1 = tf.Variable(tf.truncated_normal([2, args.num_hidden],name='weightsOutH1'))
@@ -121,7 +116,8 @@ def build_graph(self, args, maxTimeSteps):
logits = [tf.matmul(t, weightsClasses) + biasesClasses for t in outH1]
logits3d = tf.pack(logits)
self.loss = tf.reduce_mean(ctc.ctc_loss(logits3d, self.targetY, self.seqLengths))
self.var_op = tf.all_variables()
#self.var_op = tf.all_variables()
self.var_op = tf.global_variables()
self.var_trainable_op = tf.trainable_variables()

if args.grad_clip == -1:
@@ -135,6 +131,7 @@ def build_graph(self, args, maxTimeSteps):
self.logitsMaxTest = tf.slice(tf.argmax(logits3d, 2), [0, 0], [self.seqLengths[0], 1])
self.predictions = tf.to_int32(ctc.ctc_beam_search_decoder(logits3d, self.seqLengths)[0][0])
self.errorRate = tf.reduce_sum(tf.edit_distance(self.predictions, self.targetY, normalize=False))/tf.to_float(tf.size(self.targetY.values))
self.initial_op = tf.initialize_all_variables()
self.saver = tf.train.Saver(tf.all_variables(),max_to_keep=5,keep_checkpoint_every_n_hours=1)
#self.initial_op = tf.initialize_all_variables()
self.initial_op = tf.global_variables_initializer()
self.saver = tf.train.Saver(tf.global_variables(),max_to_keep=5,keep_checkpoint_every_n_hours=1)
self.logfile = args.log_dir+str(datetime.datetime.strftime(datetime.datetime.now(),'%Y-%m-%d %H:%M:%S')+'.txt').replace(' ','').replace('/','')
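
The renamed variable calls above track the TensorFlow r0.12 API changes; a minimal sketch of the old-to-new correspondence (assuming TF >= 0.12):

```python
import tensorflow as tf

init_op = tf.global_variables_initializer()  # replaces tf.initialize_all_variables()
all_vars = tf.global_variables()             # replaces tf.all_variables()
saver = tf.train.Saver(all_vars, max_to_keep=5,
                       keep_checkpoint_every_n_hours=1)
```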
Binary file modified src/main/brnn.pyc
15 changes: 0 additions & 15 deletions src/main/nohup.out

This file was deleted.

9 changes: 6 additions & 3 deletions src/main/resnet.py
@@ -130,8 +130,11 @@ def build_graph(self, args, maxTimeSteps):
self.logitsMaxTest = tf.slice(tf.argmax(conv_output, 2), [0, 0], [self.seqLengths[0], 1])
self.predictions = tf.to_int32(ctc.ctc_beam_search_decoder(conv_output, self.seqLengths)[0][0])
self.errorRate = tf.reduce_sum(tf.edit_distance(self.predictions, self.targetY, normalize=False))/tf.to_float(tf.size(self.targetY.values))
self.initial_op = tf.initialize_all_variables()
self.saver = tf.train.Saver(tf.all_variables(),max_to_keep=5,keep_checkpoint_every_n_hours=1)
#self.initial_op = tf.initialize_all_variables()
self.initial_op = tf.global_variables_initializer()

self.saver = tf.train.Saver(tf.global_variables(),max_to_keep=5,keep_checkpoint_every_n_hours=1)
self.logfile = args.log_dir+str(datetime.datetime.strftime(datetime.datetime.now(),'%Y-%m-%d %H:%M:%S')+'.txt').replace(' ','').replace('/','')
self.var_op = tf.all_variables()
#self.var_op = tf.all_variables()
self.var_op = tf.global_variables()
self.var_trainable_op = tf.trainable_variables()
Binary file modified src/main/resnet.pyc
20 changes: 10 additions & 10 deletions src/main/train.py
@@ -64,16 +64,16 @@ def __init__(self):
parser.add_argument('--log_dir', type=str, default='/home/pony/github/Automatic-Speech-Recognition/log/timit/',
help='directory to log events while training')

parser.add_argument('--model', default='BiRNN',
help='model for ASR')
parser.add_argument('--model', default='ResNet',
help='model for ASR:BiRNN,ResNet,...')

parser.add_argument('--rnncell', default='rnn',
help='rnn cell, 3 choices:rnn,lstm,gru')

parser.add_argument('--num_layer', type=int, default=2,
parser.add_argument('--num_layer', type=int, default=1,
help='set the number of hidden layer or bidirectional layer')

parser.add_argument('--activation', default=tf.nn.elu,
parser.add_argument('--activation', default=tf.nn.relu,
help='set the activation function of each layer')

parser.add_argument('--optimizer', type=type, default=tf.train.AdamOptimizer,
@@ -91,13 +91,13 @@ def __init__(self):
parser.add_argument('--evaluation', type=bool, default=False,
help='test the model based on trained parameters, but at present, we can"t test during training.')

parser.add_argument('--learning_rate', type=float, default=0.0001,
parser.add_argument('--learning_rate', type=float, default=0.01,
help='set the step size of each iteration')

parser.add_argument('--num_epoch', type=int, default=200000,
help='set the total number of training epochs')

parser.add_argument('--batch_size', type=int, default=64,
parser.add_argument('--batch_size', type=int, default=32,
help='set the number of training samples in a mini-batch')

parser.add_argument('--test_batch_size', type=int, default=1,
@@ -106,7 +106,7 @@ def __init__(self):
parser.add_argument('--num_feature', type=int, default=39,
help='set the dimension of the feature vector, e.g. 39 mfcc features')

parser.add_argument('--num_hidden', type=int, default=128,
parser.add_argument('--num_hidden', type=int, default=32,
help='set the number of neurons in hidden layer')

parser.add_argument('--num_class', type=int, default=62,
@@ -169,7 +169,7 @@ def train(self):
feedDict = {model.inputX: batchInputs, model.targetIxs: batchTargetIxs, model.targetVals: batchTargetVals,model.targetShape: batchTargetShape, model.seqLengths: batchSeqLengths}

_, l, er, lmt, pre = sess.run([model.optimizer, model.loss, model.errorRate, model.logitsMaxTest, model.predictions], feed_dict=feedDict)
print(output_to_sequence(pre,mode='phoneme'))
#print(output_to_sequence(pre,mode='phoneme'))

if (batch % 1) == 0:
print('Minibatch', batch, '/', batchOrigI, 'loss:', l)
@@ -178,15 +178,15 @@

if (args.save==True) and ((epoch*len(batchRandIxs)+batch+1)%5000==0 or (epoch==args.num_epoch-1 and batch==len(batchRandIxs)-1)):
checkpoint_path = os.path.join(args.save_dir, 'model.ckpt')
save_path = model.saver.save(sess,checkpoint_path,global_step=epoch)
model.saver.save(sess,checkpoint_path,global_step=epoch)
print('Model has been saved in: '+checkpoint_path)
end = time.time()
delta_time = end-start
print('Epoch '+str(epoch+1)+' needs time:'+str(delta_time)+' s')

if args.save==True and (epoch+1)%1==0:
checkpoint_path = os.path.join(args.save_dir, 'model.ckpt')
save_path = model.saver.save(sess,checkpoint_path,global_step=epoch)
model.saver.save(sess,checkpoint_path,global_step=epoch)
print('Model has been saved in file')
epochER = batchErrors.sum() / totalN
print('Epoch', epoch+1, 'train error rate:', epochER)
Binary file modified src/main/utils.pyc
