Commit 67690ec

Added Zoneout, added dev loss in tensorboard
1 parent 6c1575e commit 67690ec

File tree

7 files changed: +153 −31 lines changed

GRU.py

Lines changed: 52 additions & 1 deletion
@@ -9,6 +9,7 @@
 import hashlib
 import numbers
 
+import tensorflow as tf
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
@@ -32,6 +33,52 @@
 _BIAS_VARIABLE_NAME = "bias"
 _WEIGHTS_VARIABLE_NAME = "kernel"
 
+class GRUCell(RNNCell):
+    """Gated Recurrent Unit cell (cf. http://arxiv.org/abs/1406.1078)."""
+
+    def __init__(self,
+                 num_units,
+                 activation=None,
+                 reuse=None,
+                 kernel_initializer=None,
+                 bias_initializer=None,
+                 is_training = True):
+        super(GRUCell, self).__init__(_reuse=reuse)
+        self._num_units = num_units
+        self._activation = activation or math_ops.tanh
+        self._kernel_initializer = kernel_initializer
+        self._bias_initializer = bias_initializer
+        self._is_training = is_training
+
+    @property
+    def state_size(self):
+        return self._num_units
+
+    @property
+    def output_size(self):
+        return self._num_units
+
+    def call(self, inputs, state):
+        """Gated recurrent unit (GRU) with nunits cells."""
+        with vs.variable_scope("gates"):  # Reset gate and update gate.
+            # We start with bias of 1.0 to not reset and not update.
+            bias_ones = self._bias_initializer
+            if self._bias_initializer is None:
+                dtype = [a.dtype for a in [inputs, state]][0]
+                bias_ones = init_ops.constant_initializer(1.0, dtype=dtype)
+            value = math_ops.sigmoid(
+                _linear([inputs, state], 2 * self._num_units, True, bias_ones,
+                        self._kernel_initializer))
+            r, u = array_ops.split(value=value, num_or_size_splits=2, axis=1)
+        with vs.variable_scope("candidate"):
+            c = self._activation(
+                _linear([inputs, r * state], self._num_units, True,
+                        self._bias_initializer, self._kernel_initializer))
+        if self._is_training:
+            c = tf.nn.dropout(c, 1 - 0.2)
+        new_h = u * state + (1 - u) * c
+        return new_h, new_h
+
 class gated_attention_GRUCell(RNNCell):
 
     def __init__(self,
@@ -42,7 +89,8 @@ def __init__(self,
                  memory_len = None,
                  reuse=None,
                  kernel_initializer=None,
-                 bias_initializer=None):
+                 bias_initializer=None,
+                 is_training = True):
        super(gated_attention_GRUCell, self).__init__(_reuse=reuse)
        self._num_units = num_units
        self._activation = math_ops.tanh
@@ -52,6 +100,7 @@ def __init__(self,
        self._params = params
        self._self_matching = self_matching
        self._memory_len = memory_len
+       self._is_training = is_training
 
    @property
    def state_size(self):
@@ -85,6 +134,8 @@ def call(self, inputs, state):
            c = self._activation(
                _linear([inputs, r * state], self._num_units, True,
                        self._bias_initializer, self._kernel_initializer))
+           #if self._is_training:
+           #    c = tf.nn.dropout(c, 1 - 0.2)
            new_h = u * state + (1 - u) * c
            return new_h, new_h
 
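
The class added above is essentially the stock TensorFlow GRUCell with one extra step: when is_training is True, dropout with a hard-coded keep probability of 0.8 is applied to the candidate state c before the final interpolation. A minimal usage sketch (not part of this commit), assuming a TF 1.x environment and that GRU.py's module-level helpers (_linear, vs, math_ops, init_ops, array_ops) resolve as in the original file:

import tensorflow as tf
from GRU import GRUCell
from params import Params

# Illustrative shapes only: [batch, time, features] inputs and per-example lengths.
inputs = tf.placeholder(tf.float32, [None, None, 300])
lengths = tf.placeholder(tf.int32, [None])

cell = GRUCell(Params.attn_size, is_training=True)   # mirrors the commented-out usage in model.py
outputs, final_state = tf.nn.dynamic_rnn(cell, inputs, sequence_length=lengths, dtype=tf.float32)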

data_load.py

Lines changed: 9 additions & 0 deletions
@@ -159,6 +159,15 @@ def load_data(dir_):
                                  p_char_len, q_char_len,
                                  indices], shapes)
 
+def get_dev():
+    devset, shapes = load_data(Params.dev_dir)
+    indices = devset[-1]
+    # devset = [np.reshape(input_, shapes[i]) for i,input_ in enumerate(devset)]
+
+    dev_ind = np.arange(indices.shape[0], dtype = np.int32)
+    np.random.shuffle(dev_ind)
+    return devset, dev_ind
+
 def get_batch(is_training = True):
     """Loads training data and put them in queues"""
     with tf.device('/cpu:0'):
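
A hypothetical illustration (not in the commit) of how the new get_dev() is meant to be consumed: devset is a list of numpy arrays, one per model input, and dev_ind is a shuffled index vector, so a random dev mini-batch is plain fancy indexing along axis 0. This mirrors the feed_dict construction in model.py's main():

import numpy as np
from data_load import get_dev
from params import Params

devset, dev_ind = get_dev()
sample_ind = np.random.choice(dev_ind, Params.batch_size)   # random dev batch indices
dev_batch = [arr[sample_ind] for arr in devset]              # one array per graph input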

layers.py

Lines changed: 8 additions & 6 deletions
@@ -8,6 +8,10 @@
 from tensorflow.contrib.rnn import RNNCell
 from params import Params
 
+from tensorflow.contrib.rnn import MultiRNNCell
+from tensorflow.contrib.rnn import RNNCell
+from params import Params
+from zoneout import ZoneoutWrapper
 '''
 attention weights from https://www.microsoft.com/en-us/research/wp-content/uploads/2017/05/r-net.pdf
 W_u^Q.shape: (2 * attn_size, attn_size)
@@ -49,16 +53,14 @@ def encoding(word, char, word_embeddings, char_embeddings, scope = "embedding"):
     char_encoding = tf.nn.embedding_lookup(char_embeddings, char)
     return word_encoding, char_encoding
 
-def apply_dropout(inputs, dropout = Params.dropout, is_training = True):
+def apply_zoneout(inputs, dropout = Params.zoneout, is_training = True):
     '''
+    Implementation of Zoneout from https://arxiv.org/pdf/1606.01305.pdf
     Default is set to None due to high bias error
     '''
-    if not is_training or Params.dropout is None:
+    if Params.zoneout is None:
         return inputs
-    if isinstance(inputs, RNNCell):
-        return tf.contrib.rnn.DropoutWrapper(inputs, output_keep_prob=1.0 - dropout, dtype = tf.float32, variational_recurrent = True, input_size = inputs.output_size)
-    else:
-        return tf.nn.dropout(inputs, keep_prob = 1.0 - dropout)
+    return ZoneoutWrapper(inputs, state_zoneout_prob= dropout, is_training = is_training)
 
 def bidirectional_GRU(inputs, inputs_len, cell = None, units = Params.attn_size, layers = 1, scope = "Bidirectional_GRU", output = 0, is_training = True, reuse = None):
     '''
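
apply_dropout is renamed to apply_zoneout and now always wraps the passed-in cell in a ZoneoutWrapper (the old DropoutWrapper / tf.nn.dropout branches are gone), so callers are expected to hand it an RNNCell rather than a tensor, and Params.zoneout rather than Params.dropout decides whether it is a no-op. A hedged sketch of the intended call pattern (not in the commit), assuming TF 1.x and a float zoneout rate such as 0.1:

import tensorflow as tf
from layers import apply_zoneout

cell = tf.contrib.rnn.GRUCell(64)                        # 64 = the new attn_size in params.py
cell = apply_zoneout(cell, dropout=0.1, is_training=True)
# cell is now a ZoneoutWrapper and can be unrolled as usual,
# e.g. by bidirectional_GRU or tf.nn.dynamic_rnn.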

model.py

Lines changed: 31 additions & 19 deletions
@@ -5,10 +5,10 @@
 
 import tensorflow as tf
 from tqdm import tqdm
-from data_load import get_batch
+from data_load import get_batch, get_dev
 from params import Params
 from layers import *
-from GRU import gated_attention_GRUCell
+from GRU import gated_attention_GRUCell, GRUCell
 from evaluate import *
 import numpy as np
 import cPickle as pickle
@@ -26,19 +26,19 @@ def __init__(self,is_training = True):
         self.graph = tf.Graph()
         with self.graph.as_default():
             self.global_step = tf.Variable(0, name='global_step', trainable=False)
-            data, self.num_batch = get_batch(is_training = is_training)
+            self.data, self.num_batch = get_batch(is_training = is_training)
             (self.passage_w,
             self.question_w,
             self.passage_c,
             self.question_c,
-            self.passage_w_len,
-            self.question_w_len,
+            self.passage_w_len_,
+            self.question_w_len_,
             self.passage_c_len,
             self.question_c_len,
-            self.indices) = data
+            self.indices) = self.data
 
-            self.passage_w_len = tf.squeeze(self.passage_w_len)
-            self.question_w_len = tf.squeeze(self.question_w_len)
+            self.passage_w_len = tf.squeeze(self.passage_w_len_)
+            self.question_w_len = tf.squeeze(self.question_w_len_)
 
             self.encode_ids()
             self.params = get_attn_params(Params.attn_size, initializer = tf.contrib.layers.xavier_initializer)
@@ -73,28 +73,36 @@ def encode_ids(self):
                                                       word_embeddings = self.word_embeddings,
                                                       char_embeddings = self.char_embeddings,
                                                       scope = "question_embeddings")
+        #cell = [GRUCell(Params.attn_size, is_training = self.is_training) for _ in range(2)]
         self.passage_char_encoded = bidirectional_GRU(self.passage_char_encoded,
                                                       self.passage_c_len,
+                                                      # cell = cell,
                                                       scope = "passage_char_encoding",
                                                       output = 1,
                                                       is_training = self.is_training)
+        #cell = [GRUCell(Params.attn_size, is_training = self.is_training) for _ in range(2)]
         self.question_char_encoded = bidirectional_GRU(self.question_char_encoded,
                                                        self.question_c_len,
+                                                       # cell = cell,
                                                        scope = "question_char_encoding",
                                                        output = 1,
                                                        is_training = self.is_training)
         self.passage_encoding = tf.concat((self.passage_word_encoded, self.passage_char_encoded),axis = 2)
         self.question_encoding = tf.concat((self.question_word_encoded, self.question_char_encoded),axis = 2)
 
         # Passage and question encoding
+        #cell = [MultiRNNCell([GRUCell(Params.attn_size, is_training = self.is_training) for _ in range(3)]) for _ in range(2)]
         self.passage_encoding = bidirectional_GRU(self.passage_encoding,
                                                   self.passage_w_len,
+                                                  # cell = cell,
                                                   layers = Params.num_layers,
                                                   scope = "passage_encoding",
                                                   output = 0,
                                                   is_training = self.is_training)
+        #cell = [MultiRNNCell([GRUCell(Params.attn_size, is_training = self.is_training) for _ in range(3)]) for _ in range(2)]
         self.question_encoding = bidirectional_GRU(self.question_encoding,
                                                    self.question_w_len,
+                                                   # cell = cell,
                                                    layers = Params.num_layers,
                                                    scope = "question_encoding",
                                                    output = 0,
@@ -165,10 +173,13 @@ def summary(self):
         self.F1_placeholder = tf.placeholder(tf.float32, shape = (), name = "F1_placeholder")
         self.EM = tf.Variable(tf.constant(0.0, shape=(), dtype = tf.float32),trainable=False, name="EM")
         self.EM_placeholder = tf.placeholder(tf.float32, shape = (), name = "EM_placeholder")
-        self.metric_assign = tf.group(tf.assign(self.F1, self.F1_placeholder),tf.assign(self.EM, self.EM_placeholder))
-        tf.summary.scalar('mean_loss', self.mean_loss)
-        tf.summary.scalar("training_F1_Score",self.F1)
-        tf.summary.scalar("training_Exact_Match",self.EM)
+        self.dev_loss = tf.Variable(tf.constant(5.0, shape=(), dtype = tf.float32),trainable=False, name="dev_loss")
+        self.dev_loss_placeholder = tf.placeholder(tf.float32, shape = (), name = "dev_loss")
+        self.metric_assign = tf.group(tf.assign(self.F1, self.F1_placeholder),tf.assign(self.EM, self.EM_placeholder),tf.assign(self.dev_loss, self.dev_loss_placeholder))
+        tf.summary.scalar('loss_training', self.mean_loss)
+        tf.summary.scalar('loss_dev', self.dev_loss)
+        tf.summary.scalar("F1_Score",self.F1)
+        tf.summary.scalar("Exact_Match",self.EM)
         tf.summary.scalar('learning_rate', Params.opt_arg[Params.optimizer]['learning_rate'])
         self.merged = tf.summary.merge_all()
 
@@ -198,6 +209,7 @@ def main():
     model = Model(is_training = True); print("Built model")
     dict_ = pickle.load(open(Params.data_dir + "dictionary.pkl","r"))
    init = False
+    devdata, dev_ind = get_dev()
     if not os.path.isfile(os.path.join(Params.logdir,"checkpoint")):
         init = True
         glove = np.memmap(Params.data_dir + "glove.np", dtype = np.float32, mode = "r")
@@ -218,19 +230,19 @@
             for step in tqdm(range(model.num_batch), total = model.num_batch, ncols=70, leave=False, unit='b'):
                 sess.run(model.train_op)
                 if step % Params.save_steps == 0:
-                    gs = sess.run(model.global_step)
-                    sv.saver.save(sess, Params.logdir + '/model_epoch_%d_step_%d'%(gs//model.num_batch, gs%model.num_batch))
-                    index, ground_truth, passage = sess.run([model.points_logits, model.indices, model.passage_w])
-                    index = np.argmax(index, axis = 2)
+                    sample_ind = np.random.choice(dev_ind, Params.batch_size)
+                    feed_dict = {data: devdata[i][sample_ind] for i,data in enumerate(model.data)}
+                    logits, dev_loss, gs = sess.run([model.points_logits, model.mean_loss, model.global_step], feed_dict = feed_dict)
+                    index = np.argmax(logits, axis = 2)
                     F1, EM = 0.0, 0.0
                     for batch in range(Params.batch_size):
-                        f1, em = f1_and_EM(index[batch], ground_truth[batch], passage[batch], dict_)
+                        f1, em = f1_and_EM(index[batch], devdata[8][sample_ind][batch], devdata[0][sample_ind][batch], dict_)
                         F1 += f1
                         EM += em
                     F1 /= float(Params.batch_size)
                     EM /= float(Params.batch_size)
-                    sess.run(model.metric_assign,{model.F1_placeholder: F1, model.EM_placeholder: EM})
-                    print("\nExact_match: {}\nF1_score: {}".format(EM,F1))
+                    sess.run(model.metric_assign,{model.F1_placeholder: F1, model.EM_placeholder: EM, model.dev_loss_placeholder: dev_loss})
+                    print("\nDev_loss: {}\nDev_Exact_match: {}\nDev_F1_score: {}".format(dev_loss,EM,F1))
 
 if __name__ == '__main__':
     if Params.mode.lower() == "debug":
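
Two mechanics behind the change above, with a hedged sketch (not additional code from the commit): the dev batch is pushed through the same graph by overriding the queue-produced tensors in model.data via a feed_dict, and the value that reaches TensorBoard under loss_dev is the non-trainable dev_loss variable, refreshed through metric_assign whenever a dev batch is evaluated. The merged summary (model.merged) is assumed to be written out periodically by the tf.train.Supervisor set up elsewhere in main().

# sketch of one dev evaluation step, reusing names defined in main() above
sample_ind = np.random.choice(dev_ind, Params.batch_size)
feed_dict = {tensor: devdata[i][sample_ind] for i, tensor in enumerate(model.data)}
dev_loss = sess.run(model.mean_loss, feed_dict=feed_dict)
# F1 and EM computed over the same dev batch, as in the loop above
sess.run(model.metric_assign, {model.F1_placeholder: F1,
                               model.EM_placeholder: EM,
                               model.dev_loss_placeholder: dev_loss})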

params.py

Lines changed: 5 additions & 5 deletions
@@ -21,11 +21,11 @@ class Params():
 
     # Training
     mode = "train" # case-insensitive options: ["train", "test", "debug"]
-    dropout = None # dropout probability if None, don't use dropout
-    optimizer = "adadelta" # Options: ["adadelta", "adam", "gradientdescent", "adagrad"]
-    batch_size = 50 if mode is not "test" else 100 # Size of the mini-batch for training
+    dropout = 0.2 # dropout probability if None, don't use dropout
+    optimizer = "adam" # Options: ["adadelta", "adam", "gradientdescent", "adagrad"]
+    batch_size = 54 if mode is not "test" else 100 # Size of the mini-batch for training
     save_steps = 50 # Save the model at every 50 steps
-    clip = False # clip gradient norm
+    clip = True # clip gradient norm
     norm = 5.0 # global norm
     # Change the hyperparameters of your learning algorithms here
     opt_arg = {'adadelta':{'learning_rate':1, 'rho': 0.95, 'epsilon':1e-6},
@@ -40,6 +40,6 @@ class Params():
     vocab_size = 2196018 # Number of vocabs in glove.840B.300d.txt + 1 for an unknown token
     char_vocab_size = 95 # Number of characters in glove.840B.300d.char.txt + 1 for an unknown character
     emb_size = 300 # Embeddings size for both words and characters
-    attn_size = 75 # RNN cell and attention module size
+    attn_size = 64 # RNN cell and attention module size
     num_layers = 3 # Number of layers at question-passage matching and self matching network
     bias = True # Use bias term in attention

process.py

Lines changed: 6 additions & 0 deletions
@@ -231,13 +231,19 @@ def pad_data(data, max_word):
     padded_data = np.zeros((len(data),max_word),dtype = np.int32)
     for i,line in enumerate(data):
         for j,word in enumerate(line):
+            if j >= max_word:
+                print("skipped a word")
+                continue
             padded_data[i,j] = word
     return padded_data
 
 def pad_char_data(data, max_char, max_words):
     padded_data = np.zeros((len(data),max_words,max_char),dtype = np.int32)
     for i,line in enumerate(data):
         for j,word in enumerate(line):
+            if j >= max_words:
+                print("skipped a word")
+                break
             for k,char in enumerate(word):
                 if k >= max_char:
                     # ignore the rest of the word if it's longer than the limit
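
The two new guards only truncate: word ids past max_word (and, in the character case, words past max_words) are skipped instead of triggering an out-of-bounds IndexError on the preallocated numpy array. A tiny hypothetical example of the pad_data behaviour (not in the commit):

# rows of word ids, one longer and one shorter than max_word = 3
rows = [[3, 7, 9, 4], [5]]
# pad_data(rows, 3) now prints "skipped a word" once for the extra id and returns
# array([[3, 7, 9],
#        [5, 0, 0]], dtype=int32)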

zoneout.py

Lines changed: 42 additions & 0 deletions
@@ -0,0 +1,42 @@
+# from ipywidgets import interact
+import tensorflow as tf
+
+import numpy as np
+
+from tensorflow.python.framework import dtypes
+from tensorflow.python.ops import variable_scope
+
+# Wrapper for the TF RNN cell
+# For an LSTM, the 'cell' is a tuple containing state and cell
+# We use TF's dropout to implement zoneout
+class ZoneoutWrapper(tf.nn.rnn_cell.RNNCell):
+    """Operator adding zoneout to all states (states+cells) of the given cell."""
+
+    def __init__(self, cell, state_zoneout_prob, is_training=True, seed=None):
+        if not isinstance(cell, tf.nn.rnn_cell.RNNCell):
+            raise TypeError("The parameter cell is not an RNNCell.")
+        if (isinstance(state_zoneout_prob, float) and
+                not (state_zoneout_prob >= 0.0 and state_zoneout_prob <= 1.0)):
+            raise ValueError("Parameter zoneout_prob must be between 0 and 1: %d"
+                             % zoneout_prob)
+        self._cell = cell
+        self._zoneout_prob = state_zoneout_prob
+        self._seed = seed
+        self.is_training = is_training
+
+    @property
+    def state_size(self):
+        return self._cell.state_size
+
+    @property
+    def output_size(self):
+        return self._cell.output_size
+
+    def __call__(self, inputs, state, scope=None):
+        output, new_state = self._cell(inputs, state, scope)
+        if self.is_training:
+            new_state = (1 - self._zoneout_prob) * tf.nn.dropout(
+                new_state - state, (1 - self._zoneout_prob), seed=self._seed) + state
+        else:
+            new_state = self._zoneout_prob * state + (1 - self._zoneout_prob) * new_state
+        return output, new_state
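
How the wrapper implements Zoneout (https://arxiv.org/abs/1606.01305): during training, tf.nn.dropout zeroes each element of (new_state - state) with probability state_zoneout_prob and scales the survivors by 1 / (1 - state_zoneout_prob); multiplying by (1 - state_zoneout_prob) and adding state back therefore leaves each unit either at its freshly computed value or "zoned out" at its previous value. At evaluation time the expected update is used instead, i.e. a fixed interpolation between old and new state. A minimal usage sketch (not part of the commit), assuming TF 1.x:

import tensorflow as tf
from zoneout import ZoneoutWrapper

cell = tf.nn.rnn_cell.GRUCell(64)                    # any RNNCell works
cell = ZoneoutWrapper(cell, state_zoneout_prob=0.1,  # zone out 10% of state units
                      is_training=True)

inputs = tf.placeholder(tf.float32, [None, None, 300])   # [batch, time, features]
outputs, state = tf.nn.dynamic_rnn(cell, inputs, dtype=tf.float32)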
