Commit 67690ec

Added Zoneout, added dev loss in tensorboard
1 parent 6c1575e commit 67690ec

File tree

7 files changed: +153 −31 lines changed

GRU.py

Lines changed: 52 additions & 1 deletion
@@ -9,6 +9,7 @@
 import hashlib
 import numbers
 
+import tensorflow as tf
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
@@ -32,6 +33,52 @@
 _BIAS_VARIABLE_NAME = "bias"
 _WEIGHTS_VARIABLE_NAME = "kernel"
 
+class GRUCell(RNNCell):
+    """Gated Recurrent Unit cell (cf. http://arxiv.org/abs/1406.1078)."""
+
+    def __init__(self,
+                 num_units,
+                 activation=None,
+                 reuse=None,
+                 kernel_initializer=None,
+                 bias_initializer=None,
+                 is_training = True):
+        super(GRUCell, self).__init__(_reuse=reuse)
+        self._num_units = num_units
+        self._activation = activation or math_ops.tanh
+        self._kernel_initializer = kernel_initializer
+        self._bias_initializer = bias_initializer
+        self._is_training = is_training
+
+    @property
+    def state_size(self):
+        return self._num_units
+
+    @property
+    def output_size(self):
+        return self._num_units
+
+    def call(self, inputs, state):
+        """Gated recurrent unit (GRU) with nunits cells."""
+        with vs.variable_scope("gates"):  # Reset gate and update gate.
+            # We start with bias of 1.0 to not reset and not update.
+            bias_ones = self._bias_initializer
+            if self._bias_initializer is None:
+                dtype = [a.dtype for a in [inputs, state]][0]
+                bias_ones = init_ops.constant_initializer(1.0, dtype=dtype)
+            value = math_ops.sigmoid(
+                _linear([inputs, state], 2 * self._num_units, True, bias_ones,
+                        self._kernel_initializer))
+            r, u = array_ops.split(value=value, num_or_size_splits=2, axis=1)
+        with vs.variable_scope("candidate"):
+            c = self._activation(
+                _linear([inputs, r * state], self._num_units, True,
+                        self._bias_initializer, self._kernel_initializer))
+        if self._is_training:
+            c = tf.nn.dropout(c, 1 - 0.2)
+        new_h = u * state + (1 - u) * c
+        return new_h, new_h
+
 class gated_attention_GRUCell(RNNCell):
 
     def __init__(self,
@@ -42,7 +89,8 @@ def __init__(self,
                  memory_len = None,
                  reuse=None,
                  kernel_initializer=None,
-                 bias_initializer=None):
+                 bias_initializer=None,
+                 is_training = True):
        super(gated_attention_GRUCell, self).__init__(_reuse=reuse)
        self._num_units = num_units
        self._activation = math_ops.tanh
@@ -52,6 +100,7 @@ def __init__(self,
        self._params = params
        self._self_matching = self_matching
        self._memory_len = memory_len
+       self._is_training = is_training
 
    @property
    def state_size(self):
@@ -85,6 +134,8 @@ def call(self, inputs, state):
            c = self._activation(
                _linear([inputs, r * state], self._num_units, True,
                        self._bias_initializer, self._kernel_initializer))
+           #if self._is_training:
+           #    c = tf.nn.dropout(c, 1 - 0.2)
            new_h = u * state + (1 - u) * c
            return new_h, new_h
 
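
The class added above is essentially the stock TensorFlow GRUCell with one extra step: when is_training is True, dropout with a hard-coded keep probability of 0.8 is applied to the candidate state c before the final interpolation. A minimal usage sketch (not part of this commit), assuming a TF 1.x environment and that GRU.py's module-level helpers (_linear, vs, math_ops, init_ops, array_ops) resolve as in the original file:

import tensorflow as tf
from GRU import GRUCell
from params import Params

# Illustrative shapes only: [batch, time, features] inputs and per-example lengths.
inputs = tf.placeholder(tf.float32, [None, None, 300])
lengths = tf.placeholder(tf.int32, [None])

cell = GRUCell(Params.attn_size, is_training=True)   # mirrors the commented-out usage in model.py
outputs, final_state = tf.nn.dynamic_rnn(cell, inputs, sequence_length=lengths, dtype=tf.float32)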

data_load.py

Lines changed: 9 additions & 0 deletions
@@ -159,6 +159,15 @@ def load_data(dir_):
                                  p_char_len, q_char_len,
                                  indices], shapes)
 
+def get_dev():
+    devset, shapes = load_data(Params.dev_dir)
+    indices = devset[-1]
+    # devset = [np.reshape(input_, shapes[i]) for i,input_ in enumerate(devset)]
+
+    dev_ind = np.arange(indices.shape[0], dtype = np.int32)
+    np.random.shuffle(dev_ind)
+    return devset, dev_ind
+
 def get_batch(is_training = True):
     """Loads training data and put them in queues"""
     with tf.device('/cpu:0'):
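
A hypothetical illustration (not in the commit) of how the new get_dev() is meant to be consumed: devset is a list of numpy arrays, one per model input, and dev_ind is a shuffled index vector, so a random dev mini-batch is plain fancy indexing along axis 0. This mirrors the feed_dict construction in model.py's main():

import numpy as np
from data_load import get_dev
from params import Params

devset, dev_ind = get_dev()
sample_ind = np.random.choice(dev_ind, Params.batch_size)   # random dev batch indices
dev_batch = [arr[sample_ind] for arr in devset]              # one array per graph input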

layers.py

Lines changed: 8 additions & 6 deletions
@@ -8,6 +8,10 @@
 from tensorflow.contrib.rnn import RNNCell
 from params import Params
 
+from tensorflow.contrib.rnn import MultiRNNCell
+from tensorflow.contrib.rnn import RNNCell
+from params import Params
+from zoneout import ZoneoutWrapper
 '''
 attention weights from https://www.microsoft.com/en-us/research/wp-content/uploads/2017/05/r-net.pdf
 W_u^Q.shape: (2 * attn_size, attn_size)
@@ -49,16 +53,14 @@ def encoding(word, char, word_embeddings, char_embeddings, scope = "embedding"):
     char_encoding = tf.nn.embedding_lookup(char_embeddings, char)
     return word_encoding, char_encoding
 
-def apply_dropout(inputs, dropout = Params.dropout, is_training = True):
+def apply_zoneout(inputs, dropout = Params.zoneout, is_training = True):
     '''
+    Implementation of Zoneout from https://arxiv.org/pdf/1606.01305.pdf
     Default is set to None due to high bias error
     '''
-    if not is_training or Params.dropout is None:
+    if Params.zoneout is None:
         return inputs
-    if isinstance(inputs, RNNCell):
-        return tf.contrib.rnn.DropoutWrapper(inputs, output_keep_prob=1.0 - dropout, dtype = tf.float32, variational_recurrent = True, input_size = inputs.output_size)
-    else:
-        return tf.nn.dropout(inputs, keep_prob = 1.0 - dropout)
+    return ZoneoutWrapper(inputs, state_zoneout_prob= dropout, is_training = is_training)
 
 def bidirectional_GRU(inputs, inputs_len, cell = None, units = Params.attn_size, layers = 1, scope = "Bidirectional_GRU", output = 0, is_training = True, reuse = None):
     '''
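
apply_dropout is renamed to apply_zoneout and now always wraps the passed-in cell in a ZoneoutWrapper (the old DropoutWrapper / tf.nn.dropout branches are gone), so callers are expected to hand it an RNNCell rather than a tensor, and Params.zoneout rather than Params.dropout decides whether it is a no-op. A hedged sketch of the intended call pattern (not in the commit), assuming TF 1.x and a float zoneout rate such as 0.1:

import tensorflow as tf
from layers import apply_zoneout

cell = tf.contrib.rnn.GRUCell(64)                        # 64 = the new attn_size in params.py
cell = apply_zoneout(cell, dropout=0.1, is_training=True)
# cell is now a ZoneoutWrapper and can be unrolled as usual,
# e.g. by bidirectional_GRU or tf.nn.dynamic_rnn.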

model.py

Lines changed: 31 additions & 19 deletions
@@ -5,10 +5,10 @@
 
 import tensorflow as tf
 from tqdm import tqdm
-from data_load import get_batch
+from data_load import get_batch, get_dev
 from params import Params
 from layers import *
-from GRU import gated_attention_GRUCell
+from GRU import gated_attention_GRUCell, GRUCell
 from evaluate import *
 import numpy as np
 import cPickle as pickle
@@ -26,19 +26,19 @@ def __init__(self,is_training = True):
         self.graph = tf.Graph()
         with self.graph.as_default():
             self.global_step = tf.Variable(0, name='global_step', trainable=False)
-            data, self.num_batch = get_batch(is_training = is_training)
+            self.data, self.num_batch = get_batch(is_training = is_training)
             (self.passage_w,
             self.question_w,
             self.passage_c,
             self.question_c,
-            self.passage_w_len,
-            self.question_w_len,
+            self.passage_w_len_,
+            self.question_w_len_,
             self.passage_c_len,
             self.question_c_len,
-            self.indices) = data
+            self.indices) = self.data
 
-            self.passage_w_len = tf.squeeze(self.passage_w_len)
-            self.question_w_len = tf.squeeze(self.question_w_len)
+            self.passage_w_len = tf.squeeze(self.passage_w_len_)
+            self.question_w_len = tf.squeeze(self.question_w_len_)
 
             self.encode_ids()
             self.params = get_attn_params(Params.attn_size, initializer = tf.contrib.layers.xavier_initializer)
@@ -73,28 +73,36 @@ def encode_ids(self):
                                                       word_embeddings = self.word_embeddings,
                                                       char_embeddings = self.char_embeddings,
                                                       scope = "question_embeddings")
+        #cell = [GRUCell(Params.attn_size, is_training = self.is_training) for _ in range(2)]
         self.passage_char_encoded = bidirectional_GRU(self.passage_char_encoded,
                                                       self.passage_c_len,
+                                                      # cell = cell,
                                                       scope = "passage_char_encoding",
                                                       output = 1,
                                                       is_training = self.is_training)
+        #cell = [GRUCell(Params.attn_size, is_training = self.is_training) for _ in range(2)]
         self.question_char_encoded = bidirectional_GRU(self.question_char_encoded,
                                                        self.question_c_len,
+                                                       # cell = cell,
                                                        scope = "question_char_encoding",
                                                        output = 1,
                                                        is_training = self.is_training)
         self.passage_encoding = tf.concat((self.passage_word_encoded, self.passage_char_encoded),axis = 2)
         self.question_encoding = tf.concat((self.question_word_encoded, self.question_char_encoded),axis = 2)
 
         # Passage and question encoding
+        #cell = [MultiRNNCell([GRUCell(Params.attn_size, is_training = self.is_training) for _ in range(3)]) for _ in range(2)]
         self.passage_encoding = bidirectional_GRU(self.passage_encoding,
                                                   self.passage_w_len,
+                                                  # cell = cell,
                                                   layers = Params.num_layers,
                                                   scope = "passage_encoding",
                                                   output = 0,
                                                   is_training = self.is_training)
+        #cell = [MultiRNNCell([GRUCell(Params.attn_size, is_training = self.is_training) for _ in range(3)]) for _ in range(2)]
         self.question_encoding = bidirectional_GRU(self.question_encoding,
                                                    self.question_w_len,
+                                                   # cell = cell,
                                                    layers = Params.num_layers,
                                                    scope = "question_encoding",
                                                    output = 0,
@@ -165,10 +173,13 @@ def summary(self):
         self.F1_placeholder = tf.placeholder(tf.float32, shape = (), name = "F1_placeholder")
         self.EM = tf.Variable(tf.constant(0.0, shape=(), dtype = tf.float32),trainable=False, name="EM")
         self.EM_placeholder = tf.placeholder(tf.float32, shape = (), name = "EM_placeholder")
-        self.metric_assign = tf.group(tf.assign(self.F1, self.F1_placeholder),tf.assign(self.EM, self.EM_placeholder))
-        tf.summary.scalar('mean_loss', self.mean_loss)
-        tf.summary.scalar("training_F1_Score",self.F1)
-        tf.summary.scalar("training_Exact_Match",self.EM)
+        self.dev_loss = tf.Variable(tf.constant(5.0, shape=(), dtype = tf.float32),trainable=False, name="dev_loss")
+        self.dev_loss_placeholder = tf.placeholder(tf.float32, shape = (), name = "dev_loss")
+        self.metric_assign = tf.group(tf.assign(self.F1, self.F1_placeholder),tf.assign(self.EM, self.EM_placeholder),tf.assign(self.dev_loss, self.dev_loss_placeholder))
+        tf.summary.scalar('loss_training', self.mean_loss)
+        tf.summary.scalar('loss_dev', self.dev_loss)
+        tf.summary.scalar("F1_Score",self.F1)
+        tf.summary.scalar("Exact_Match",self.EM)
         tf.summary.scalar('learning_rate', Params.opt_arg[Params.optimizer]['learning_rate'])
         self.merged = tf.summary.merge_all()
 
@@ -198,6 +209,7 @@ def main():
     model = Model(is_training = True); print("Built model")
     dict_ = pickle.load(open(Params.data_dir + "dictionary.pkl","r"))
    init = False
+    devdata, dev_ind = get_dev()
     if not os.path.isfile(os.path.join(Params.logdir,"checkpoint")):
         init = True
         glove = np.memmap(Params.data_dir + "glove.np", dtype = np.float32, mode = "r")
@@ -218,19 +230,19 @@
             for step in tqdm(range(model.num_batch), total = model.num_batch, ncols=70, leave=False, unit='b'):
                 sess.run(model.train_op)
                 if step % Params.save_steps == 0:
-                    gs = sess.run(model.global_step)
-                    sv.saver.save(sess, Params.logdir + '/model_epoch_%d_step_%d'%(gs//model.num_batch, gs%model.num_batch))
-                    index, ground_truth, passage = sess.run([model.points_logits, model.indices, model.passage_w])
-                    index = np.argmax(index, axis = 2)
+                    sample_ind = np.random.choice(dev_ind, Params.batch_size)
+                    feed_dict = {data: devdata[i][sample_ind] for i,data in enumerate(model.data)}
+                    logits, dev_loss, gs = sess.run([model.points_logits, model.mean_loss, model.global_step], feed_dict = feed_dict)
+                    index = np.argmax(logits, axis = 2)
                     F1, EM = 0.0, 0.0
                     for batch in range(Params.batch_size):
-                        f1, em = f1_and_EM(index[batch], ground_truth[batch], passage[batch], dict_)
+                        f1, em = f1_and_EM(index[batch], devdata[8][sample_ind][batch], devdata[0][sample_ind][batch], dict_)
                         F1 += f1
                         EM += em
                     F1 /= float(Params.batch_size)
                     EM /= float(Params.batch_size)
-                    sess.run(model.metric_assign,{model.F1_placeholder: F1, model.EM_placeholder: EM})
-                    print("\nExact_match: {}\nF1_score: {}".format(EM,F1))
+                    sess.run(model.metric_assign,{model.F1_placeholder: F1, model.EM_placeholder: EM, model.dev_loss_placeholder: dev_loss})
+                    print("\nDev_loss: {}\nDev_Exact_match: {}\nDev_F1_score: {}".format(dev_loss,EM,F1))
 
 if __name__ == '__main__':
     if Params.mode.lower() == "debug":
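
Two mechanics behind the change above, with a hedged sketch (not additional code from the commit): the dev batch is pushed through the same graph by overriding the queue-produced tensors in model.data via a feed_dict, and the value that reaches TensorBoard under loss_dev is the non-trainable dev_loss variable, refreshed through metric_assign whenever a dev batch is evaluated. The merged summary (model.merged) is assumed to be written out periodically by the tf.train.Supervisor set up elsewhere in main().

# sketch of one dev evaluation step, reusing names defined in main() above
sample_ind = np.random.choice(dev_ind, Params.batch_size)
feed_dict = {tensor: devdata[i][sample_ind] for i, tensor in enumerate(model.data)}
dev_loss = sess.run(model.mean_loss, feed_dict=feed_dict)
# F1 and EM computed over the same dev batch, as in the loop above
sess.run(model.metric_assign, {model.F1_placeholder: F1,
                               model.EM_placeholder: EM,
                               model.dev_loss_placeholder: dev_loss})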

params.py

Lines changed: 5 additions & 5 deletions
@@ -21,11 +21,11 @@ class Params():
 
     # Training
     mode = "train" # case-insensitive options: ["train", "test", "debug"]
-    dropout = None # dropout probability if None, don't use dropout
-    optimizer = "adadelta" # Options: ["adadelta", "adam", "gradientdescent", "adagrad"]
-    batch_size = 50 if mode is not "test" else 100 # Size of the mini-batch for training
+    dropout = 0.2 # dropout probability if None, don't use dropout
+    optimizer = "adam" # Options: ["adadelta", "adam", "gradientdescent", "adagrad"]
+    batch_size = 54 if mode is not "test" else 100 # Size of the mini-batch for training
     save_steps = 50 # Save the model at every 50 steps
-    clip = False # clip gradient norm
+    clip = True # clip gradient norm
     norm = 5.0 # global norm
     # Change the hyperparameters of your learning algorithms here
     opt_arg = {'adadelta':{'learning_rate':1, 'rho': 0.95, 'epsilon':1e-6},
@@ -40,6 +40,6 @@ class Params():
     vocab_size = 2196018 # Number of vocabs in glove.840B.300d.txt + 1 for an unknown token
     char_vocab_size = 95 # Number of characters in glove.840B.300d.char.txt + 1 for an unknown character
     emb_size = 300 # Embeddings size for both words and characters
-    attn_size = 75 # RNN cell and attention module size
+    attn_size = 64 # RNN cell and attention module size
     num_layers = 3 # Number of layers at question-passage matching and self matching network
     bias = True # Use bias term in attention

process.py

Lines changed: 6 additions & 0 deletions
@@ -231,13 +231,19 @@ def pad_data(data, max_word):
     padded_data = np.zeros((len(data),max_word),dtype = np.int32)
     for i,line in enumerate(data):
         for j,word in enumerate(line):
+            if j >= max_word:
+                print("skipped a word")
+                continue
             padded_data[i,j] = word
     return padded_data
 
 def pad_char_data(data, max_char, max_words):
     padded_data = np.zeros((len(data),max_words,max_char),dtype = np.int32)
     for i,line in enumerate(data):
         for j,word in enumerate(line):
+            if j >= max_words:
+                print("skipped a word")
+                break
             for k,char in enumerate(word):
                 if k >= max_char:
                     # ignore the rest of the word if it's longer than the limit
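
The two new guards only truncate: word ids past max_word (and, in the character case, words past max_words) are skipped instead of triggering an out-of-bounds IndexError on the preallocated numpy array. A tiny hypothetical example of the pad_data behaviour (not in the commit):

# rows of word ids, one longer and one shorter than max_word = 3
rows = [[3, 7, 9, 4], [5]]
# pad_data(rows, 3) now prints "skipped a word" once for the extra id and returns
# array([[3, 7, 9],
#        [5, 0, 0]], dtype=int32)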

zoneout.py

Lines changed: 42 additions & 0 deletions
@@ -0,0 +1,42 @@
+# from ipywidgets import interact
+import tensorflow as tf
+
+import numpy as np
+
+from tensorflow.python.framework import dtypes
+from tensorflow.python.ops import variable_scope
+
+# Wrapper for the TF RNN cell
+# For an LSTM, the 'cell' is a tuple containing state and cell
+# We use TF's dropout to implement zoneout
+class ZoneoutWrapper(tf.nn.rnn_cell.RNNCell):
+    """Operator adding zoneout to all states (states+cells) of the given cell."""
+
+    def __init__(self, cell, state_zoneout_prob, is_training=True, seed=None):
+        if not isinstance(cell, tf.nn.rnn_cell.RNNCell):
+            raise TypeError("The parameter cell is not an RNNCell.")
+        if (isinstance(state_zoneout_prob, float) and
+                not (state_zoneout_prob >= 0.0 and state_zoneout_prob <= 1.0)):
+            raise ValueError("Parameter zoneout_prob must be between 0 and 1: %d"
+                             % zoneout_prob)
+        self._cell = cell
+        self._zoneout_prob = state_zoneout_prob
+        self._seed = seed
+        self.is_training = is_training
+
+    @property
+    def state_size(self):
+        return self._cell.state_size
+
+    @property
+    def output_size(self):
+        return self._cell.output_size
+
+    def __call__(self, inputs, state, scope=None):
+        output, new_state = self._cell(inputs, state, scope)
+        if self.is_training:
+            new_state = (1 - self._zoneout_prob) * tf.nn.dropout(
+                new_state - state, (1 - self._zoneout_prob), seed=self._seed) + state
+        else:
+            new_state = self._zoneout_prob * state + (1 - self._zoneout_prob) * new_state
+        return output, new_state
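
How the wrapper implements Zoneout (https://arxiv.org/abs/1606.01305): during training, tf.nn.dropout zeroes each element of (new_state - state) with probability state_zoneout_prob and scales the survivors by 1 / (1 - state_zoneout_prob); multiplying by (1 - state_zoneout_prob) and adding state back therefore leaves each unit either at its freshly computed value or "zoned out" at its previous value. At evaluation time the expected update is used instead, i.e. a fixed interpolation between old and new state. A minimal usage sketch (not part of the commit), assuming TF 1.x:

import tensorflow as tf
from zoneout import ZoneoutWrapper

cell = tf.nn.rnn_cell.GRUCell(64)                    # any RNNCell works
cell = ZoneoutWrapper(cell, state_zoneout_prob=0.1,  # zone out 10% of state units
                      is_training=True)

inputs = tf.placeholder(tf.float32, [None, None, 300])   # [batch, time, features]
outputs, state = tf.nn.dynamic_rnn(cell, inputs, dtype=tf.float32)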
