Stop criteria #79

Open
wants to merge 11 commits into master
212 changes: 124 additions & 88 deletions g2p_seq2seq/g2p.py
@@ -27,10 +27,10 @@
import math
import os
import time
import random

import numpy as np
import tensorflow as tf
from tensorflow.core.protobuf import saver_pb2

from g2p_seq2seq import data_utils
from g2p_seq2seq import seq2seq_model
@@ -126,6 +126,9 @@ def __put_into_buckets(self, source, target):
if len(source_ids) < source_size and len(target_ids) < target_size:
data_set[bucket_id].append([source_ids, target_ids])
break

for bucket_id in range(len(self._BUCKETS)):
random.shuffle(data_set[bucket_id])
return data_set


@@ -135,8 +138,8 @@ def prepare_data(self, train_path, valid_path, test_path):
print("Preparing G2P data")
train_gr_ids, train_ph_ids, valid_gr_ids, valid_ph_ids, self.gr_vocab,\
self.ph_vocab, self.test_lines =\
data_utils.prepare_g2p_data(self.model_dir, train_path, valid_path,
test_path)
data_utils.prepare_g2p_data(self.model_dir, train_path, valid_path,
test_path)
# Read data into buckets and compute their sizes.
print ("Reading development and training data.")
self.valid_set = self.__put_into_buckets(valid_gr_ids, valid_ph_ids)
@@ -203,74 +206,69 @@ def train(self):

train_bucket_sizes = [len(self.train_set[b])
for b in xrange(len(self._BUCKETS))]
train_total_size = float(sum(train_bucket_sizes))
# A bucket scale is a list of increasing numbers from 0 to 1 that we'll use
# to select a bucket. Length of [scale[i], scale[i+1]] is proportional to
the size of the i-th training bucket, as used later.
train_buckets_scale = [sum(train_bucket_sizes[:i + 1]) / train_total_size
for i in xrange(len(train_bucket_sizes))]

# This is the training loop.
step_time, train_loss = 0.0, 0.0
current_step, num_iter_wo_improve = 0, 0
prev_train_losses, prev_valid_losses = [], []
num_iter_cover_train = int(sum(train_bucket_sizes) /
self.params.batch_size /
self.params.steps_per_checkpoint)
step_time, train_loss, allow_excess_min = 0.0, 0.0, 1.5
current_step, self.epochs_wo_improvement,\
self.allow_epochs_wo_improvement = 0, 0, 2
train_losses, eval_losses, epoch_losses = [], [], []
while (self.params.max_steps == 0
or self.model.global_step.eval(self.session)
<= self.params.max_steps):
# Get a batch and make a step.
start_time = time.time()
step_loss = self.__calc_step_loss(train_buckets_scale)
step_time += (time.time() - start_time) / self.params.steps_per_checkpoint
train_loss += step_loss / self.params.steps_per_checkpoint
current_step += 1

# Once in a while, we save checkpoint, print statistics, and run evals.
if current_step % self.params.steps_per_checkpoint == 0:
# Print statistics for the previous steps.
train_ppx = math.exp(train_loss) if train_loss < 300 else float('inf')
print ("global step %d learning rate %.4f step-time %.2f perplexity "
"%.2f" % (self.model.global_step.eval(self.session),
self.model.learning_rate.eval(self.session),
step_time, train_ppx))
eval_loss = self.__calc_eval_loss()
eval_ppx = math.exp(eval_loss) if eval_loss < 300 else float('inf')
print(" eval: perplexity %.2f" % (eval_ppx))
# Decrease learning rate if no improvement was seen on train set
# over last 3 times.
if (len(prev_train_losses) > 2
and train_loss > max(prev_train_losses[-3:])):
self.session.run(self.model.learning_rate_decay_op)

if (len(prev_valid_losses) > 0
and eval_loss <= min(prev_valid_losses)):
# Save checkpoint and zero timer and loss.
self.model.saver.save(self.session,
os.path.join(self.model_dir, "model"),
write_meta_graph=False)

if (len(prev_valid_losses) > 0
and eval_loss >= min(prev_valid_losses)):
num_iter_wo_improve += 1
else:
num_iter_wo_improve = 0

if num_iter_wo_improve > num_iter_cover_train * 2:
print("No improvement over last %d times. Training will stop after %d"
"iterations if no improvement was seen."
% (num_iter_wo_improve,
num_iter_cover_train - num_iter_wo_improve))

# Stop train if no improvement was seen on validation set
# over last 3 epochs.
if num_iter_wo_improve > num_iter_cover_train * 3:
break
for from_row in range(0, max(train_bucket_sizes), self.params.batch_size):
for bucket_id in range(len(self._BUCKETS)):
if from_row < train_bucket_sizes[bucket_id]:
step_loss = self.__calc_step_loss(bucket_id, from_row)
step_time += (time.time() - start_time) /\
self.params.steps_per_checkpoint
train_loss += step_loss / self.params.steps_per_checkpoint
current_step += 1

# Once in a while, we save checkpoint, print statistics,
# and run evals.
if current_step % self.params.steps_per_checkpoint == 0:
# Print statistics for the previous steps.
train_ppx =\
math.exp(train_loss) if train_loss < 300 else float('inf')
print ("global step %d learning rate %.4f step-time %.2f "
"perplexity %.3f" %
(self.model.global_step.eval(self.session),
self.model.learning_rate.eval(self.session),
step_time, train_ppx))
eval_loss = self.__calc_eval_loss()
eval_ppx =\
math.exp(eval_loss) if eval_loss < 300 else float('inf')
print(" eval: perplexity %.3f" % (eval_ppx))
# Decrease learning rate if no improvement was seen on train set
# over last 3 times.
if (len(train_losses) > 2
and train_loss > max(train_losses[-3:])):
self.session.run(self.model.learning_rate_decay_op)

# Save checkpoint and zero timer and loss.
self.model.saver.save(self.session,
os.path.join(self.model_dir, "model"),
write_meta_graph=False)

train_losses.append(train_loss)
eval_losses.append(eval_loss)
step_time, train_loss = 0.0, 0.0

# After an epoch pass, calculate the average validation loss over
# the previous epoch.
eval_losses = [loss for loss in eval_losses
if loss < (min(eval_losses) * allow_excess_min)]
epoch_loss = (sum(eval_losses) / len(eval_losses)
if len(eval_losses) > 0 else float('inf'))
epoch_losses.append(epoch_loss)

# Make a decision to continue/stop training.
stop_training = self.__should_stop_training(epoch_losses)
if stop_training:
break

prev_train_losses.append(train_loss)
prev_valid_losses.append(eval_loss)
step_time, train_loss = 0.0, 0.0
eval_losses = []

print('Training done.')
with tf.Graph().as_default():
@@ -279,17 +277,57 @@ def train(self):
g2p_model_eval.evaluate(self.test_lines)
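
The epoch-level stop decision above averages the checkpoint eval losses collected during the epoch after dropping outliers that exceed 1.5x the best loss seen (allow_excess_min). A minimal standalone sketch of that aggregation, with illustrative names that are not part of this diff:

def epoch_eval_loss(checkpoint_losses, allow_excess_min=1.5):
    # Drop checkpoints whose loss is far above the epoch's best one,
    # then average the remainder; an empty list yields +inf.
    if not checkpoint_losses:
        return float('inf')
    best = min(checkpoint_losses)
    kept = [loss for loss in checkpoint_losses
            if loss < best * allow_excess_min]
    return sum(kept) / len(kept) if kept else float('inf')

# Example: one noisy checkpoint (4.0) is excluded from the average.
print(epoch_eval_loss([1.2, 1.1, 4.0, 1.3]))  # -> 1.2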


def __calc_step_loss(self, train_buckets_scale):
def __should_stop_training(self, epoch_losses, window_scale=1.5):
"""Check stop training condition.
Because models with different sizes need different number of epochs
for improvement, we implemented stop criteria based on a expanding window
of allowable number of epochs without improvement. Assuming how many
maximum epochs it was needed for the previous improvements, we may increase
allowable number of epochs without improvement. Model will stop training
if number of epochs passed from previous improvement exceed maximal
allowable number.

Args:
epoch_losses: losses on a validation set during the previous epochs;

Returns:
True/False: should or should not stop training;
"""
if len(epoch_losses) > 1:
print('Prev min epoch eval loss: %f, curr epoch eval loss: %f' %
(min(epoch_losses[:-1]), epoch_losses[-1]))
# Check if there was an improvement during the last epoch
if epoch_losses[-1] < min(epoch_losses[:-1]):
# Increase the window if a major part of the previous window has been used
if (self.allow_epochs_wo_improvement <
(self.epochs_wo_improvement * window_scale)):
self.allow_epochs_wo_improvement =\
int(math.ceil(self.epochs_wo_improvement * window_scale))
print('Improved during the last epoch.')
self.epochs_wo_improvement = 0
else:
print('No improvement during the last epoch.')
self.epochs_wo_improvement += 1

print('Number of epochs passed since the last improvement: %d'
% self.epochs_wo_improvement)
print('Max allowable number of epochs for improvement: %d'
% self.allow_epochs_wo_improvement)

# Stop training if no improvement was seen during the maximum
# allowable number of epochs.
if self.epochs_wo_improvement > self.allow_epochs_wo_improvement:
return True
return False
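
For reference, the expanding-window rule implemented above can be condensed into a self-contained sketch; the class and attribute names below are illustrative only, not the PR's attributes:

import math

class StopCriteria(object):
    # Minimal sketch of the expanding-window early-stopping rule.
    def __init__(self, window_scale=1.5, initial_window=2):
        self.window_scale = window_scale
        self.epochs_wo_improvement = 0
        self.allow_epochs_wo_improvement = initial_window
        self.best_loss = float('inf')

    def should_stop(self, epoch_loss):
        if epoch_loss < self.best_loss:
            self.best_loss = epoch_loss
            # Widen the window if most of it was needed for this improvement.
            if (self.allow_epochs_wo_improvement <
                    self.epochs_wo_improvement * self.window_scale):
                self.allow_epochs_wo_improvement = int(
                    math.ceil(self.epochs_wo_improvement * self.window_scale))
            self.epochs_wo_improvement = 0
        else:
            self.epochs_wo_improvement += 1
        return self.epochs_wo_improvement > self.allow_epochs_wo_improvement

# Example: steady improvement for three epochs, then a plateau; training
# stops on the third epoch without improvement (window is 2 here).
criteria = StopCriteria()
for loss in [2.0, 1.8, 1.7, 1.75, 1.72, 1.71]:
    if criteria.should_stop(loss):
        break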


def __calc_step_loss(self, bucket_id, from_row):
"""Choose a bucket according to data distribution. We pick a random number
in [0, 1] and use the corresponding interval in train_buckets_scale.
"""
random_number_01 = np.random.random_sample()
bucket_id = min([i for i in xrange(len(train_buckets_scale))
if train_buckets_scale[i] > random_number_01])

# Get a batch and make a step.
encoder_inputs, decoder_inputs, target_weights = self.model.get_batch(
self.train_set, bucket_id)
encoder_inputs, decoder_inputs, target_weights =\
self.model.get_batch(self.train_set, bucket_id, from_row)
_, step_loss, _ = self.model.step(self.session, encoder_inputs,
decoder_inputs, target_weights,
bucket_id, False)
@@ -299,21 +337,18 @@ def __calc_step_loss(self, train_buckets_scale):
def __calc_eval_loss(self):
"""Run evals on development set and print their perplexity.
"""
eval_loss, num_iter_total = 0.0, 0.0
eval_loss, steps = 0.0, 0
for bucket_id in xrange(len(self._BUCKETS)):
num_iter_cover_valid = int(math.ceil(len(self.valid_set[bucket_id])/
self.params.batch_size))
num_iter_total += num_iter_cover_valid
for batch_id in xrange(num_iter_cover_valid):
for from_row in xrange(0, len(self.valid_set[bucket_id]),
self.params.batch_size):
encoder_inputs, decoder_inputs, target_weights =\
self.model.get_eval_set_batch(self.valid_set, bucket_id,
batch_id * self.params.batch_size)
_, eval_batch_loss, _ = self.model.step(self.session, encoder_inputs,
decoder_inputs, target_weights,
bucket_id, True)
eval_loss += eval_batch_loss
eval_loss = eval_loss/num_iter_total if num_iter_total > 0 else float('inf')
return eval_loss
self.model.get_batch(self.valid_set, bucket_id, from_row)
_, loss, _ = self.model.step(self.session, encoder_inputs,
decoder_inputs, target_weights,
bucket_id, True)
eval_loss += loss
steps += 1
return eval_loss/steps if steps > 0 else float('inf')
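
Throughout train(), the eval loss returned here and the accumulated train loss are reported as perplexities, i.e. exp(average cross-entropy loss), with a guard for very large losses. A standalone sketch of that conversion (illustrative only):

import math

def perplexity(loss):
    # Mirror the 300-loss guard used in train(): report infinity instead
    # of a meaninglessly huge perplexity.
    return math.exp(loss) if loss < 300 else float('inf')

print(perplexity(1.386))  # ~4.0: as uncertain as a uniform choice
                          # among four phonemes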


def decode_word(self, word):
@@ -326,9 +361,10 @@ def decode_word(self, word):
phonemes: decoded phoneme sequence for input word;
"""
# Check that all graphemes are present in the vocabulary
gr_absent = [gr for gr in word if gr not in self.gr_vocab]
gr_absent = set([gr for gr in word if gr not in self.gr_vocab])
if gr_absent:
print("Symbols '%s' are not in vocabulary" % "','".join(gr_absent).encode('utf-8'))
print("Symbols '%s' are not in vocabulary" % (
"','".join(gr_absent).encode('utf-8')))
return ""

# Get token-ids for the input word.
@@ -337,8 +373,8 @@ def decode_word(self, word):
bucket_id = min([b for b in xrange(len(self._BUCKETS))
if self._BUCKETS[b][0] > len(token_ids)])
# Get a 1-element batch to feed the word to the model.
encoder_inputs, decoder_inputs, target_weights = self.model.get_batch(
{bucket_id: [(token_ids, [])]}, bucket_id)
encoder_inputs, decoder_inputs, target_weights =\
self.model.get_batch({bucket_id: [(token_ids, [])]}, bucket_id, 0)
# Get output logits for the word.
_, _, output_logits = self.model.step(self.session, encoder_inputs,
decoder_inputs, target_weights,
63 changes: 13 additions & 50 deletions g2p_seq2seq/seq2seq_model.py
@@ -99,7 +99,8 @@ def __init__(self,
softmax_loss_function = None
# Sampled softmax only makes sense if we sample less than vocabulary size.
if num_samples > 0 and num_samples < self.target_vocab_size:
w_t = tf.get_variable("proj_w", [self.target_vocab_size, size], dtype=dtype)
w_t = tf.get_variable("proj_w", [self.target_vocab_size, size],
dtype=dtype)
w = tf.transpose(w_t)
b = tf.get_variable("proj_b", [self.target_vocab_size], dtype=dtype)
output_projection = (w, b)
@@ -243,7 +244,7 @@ def step(self, session, encoder_inputs, decoder_inputs, target_weights,
# Since our targets are decoder inputs shifted by one, we need one more.
last_target = self.decoder_inputs[decoder_size].name
input_feed[last_target] = np.zeros([len(encoder_inputs[0])],
dtype=np.int32)
dtype=np.int32)

# Output feed: depends on whether we do a backward step or not.
if not forward_only:
@@ -262,43 +263,7 @@ def step(self, session, encoder_inputs, decoder_inputs, target_weights,
return None, outputs[0], outputs[1:] # No gradient norm, loss, outputs.


def get_batch(self, data, bucket_id):
"""Get a random batch of data from the specified bucket, prepare for step.

To feed data in step(..) it must be a list of batch-major vectors, while
data here contains single length-major cases. So the main logic of this
function is to re-index data cases to be in the proper format for feeding.

Args:
data: a tuple of size len(self.buckets) in which each element contains
lists of pairs of input and output data that we use to create a batch.
bucket_id: integer, which bucket to get the batch for.

Returns:
The triple (encoder_inputs, decoder_inputs, target_weights) for
the constructed batch that has the proper format to call step(...) later.
"""
encoder_size, decoder_size = self.buckets[bucket_id]
encoder_inputs, decoder_inputs = [], []

# Get a random batch of encoder and decoder inputs from data,
# pad them if needed, reverse encoder inputs and add GO to decoder.
for _ in xrange(self.batch_size):
encoder_input, decoder_input = random.choice(data[bucket_id])

# Encoder inputs are padded and then reversed.
encoder_pad = [PAD_ID] * (encoder_size - len(encoder_input))
encoder_inputs.append(list(reversed(encoder_input + encoder_pad)))

# Decoder inputs get an extra "GO" symbol, and are padded then.
decoder_pad_size = decoder_size - len(decoder_input) - 1
decoder_inputs.append([GO_ID] + decoder_input +
[PAD_ID] * decoder_pad_size)
return self.__create_batch_major_vecs(encoder_size, decoder_size,
encoder_inputs, decoder_inputs)


def get_eval_set_batch(self, data, bucket_id, from_row_idx):
def get_batch(self, data, bucket_id, from_row):
"""Get a batch from data with rows started with from_row_idx.

To feed data in step(..) it must be a list of batch-major vectors, while
@@ -316,14 +281,14 @@ def get_eval_set_batch(self, data, bucket_id, from_row_idx):
"""
encoder_size, decoder_size = self.buckets[bucket_id]
encoder_inputs, decoder_inputs = [], []
batch_row_idx = 0
batch_row = 0

# Get a batch of encoder and decoder inputs from data,
# pad them if needed, reverse encoder inputs and add GO to decoder.
while (from_row_idx+batch_row_idx < len(data[bucket_id])
and batch_row_idx < self.batch_size):
while (from_row + batch_row < len(data[bucket_id])
and batch_row < self.batch_size):
encoder_input, decoder_input =\
data[bucket_id][from_row_idx+batch_row_idx]
data[bucket_id][from_row + batch_row]

# Encoder inputs are padded and then reversed.
encoder_pad = [PAD_ID] * (encoder_size - len(encoder_input))
@@ -333,11 +298,9 @@ def get_eval_set_batch(self, data, bucket_id, from_row_idx):
decoder_pad_size = decoder_size - len(decoder_input) - 1
decoder_inputs.append([GO_ID] + decoder_input +
[PAD_ID] * decoder_pad_size)
batch_row_idx += 1
return self.__create_batch_major_vecs(encoder_size,
decoder_size,
encoder_inputs,
decoder_inputs)
batch_row += 1
return self.__create_batch_major_vecs(encoder_size, decoder_size,
encoder_inputs, decoder_inputs)
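
As the docstring notes, get_batch re-indexes length-major (grapheme_ids, phoneme_ids) pairs into batch-major vectors: encoder inputs are padded and reversed, decoder inputs get a leading GO symbol plus trailing padding, and the result is transposed into one vector per time step. A small worked sketch of that reshaping, assuming PAD_ID = 0 and GO_ID = 1 for illustration:

import numpy as np

PAD_ID, GO_ID = 0, 1  # assumed special ids for this sketch

def to_batch_major(pairs, encoder_size, decoder_size):
    # pairs: list of (grapheme_ids, phoneme_ids) examples for one bucket.
    encoder_inputs, decoder_inputs = [], []
    for encoder_input, decoder_input in pairs:
        # Encoder inputs are padded, then reversed.
        pad = [PAD_ID] * (encoder_size - len(encoder_input))
        encoder_inputs.append(list(reversed(encoder_input + pad)))
        # Decoder inputs get a leading GO symbol and trailing padding.
        pad_size = decoder_size - len(decoder_input) - 1
        decoder_inputs.append([GO_ID] + decoder_input + [PAD_ID] * pad_size)
    # Transpose: one int32 vector of length batch_size per time step.
    batch_encoder = [np.array([e[t] for e in encoder_inputs], dtype=np.int32)
                     for t in range(encoder_size)]
    batch_decoder = [np.array([d[t] for d in decoder_inputs], dtype=np.int32)
                     for t in range(decoder_size)]
    return batch_encoder, batch_decoder

# Two pairs in a (5, 10) bucket: enc[3] == [5, 0], enc[4] == [4, 8], and
# dec[0] == [1, 1] (the GO symbols).
enc, dec = to_batch_major([([4, 5], [6, 7]), ([8], [9])], 5, 10)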


def __create_batch_major_vecs(self, encoder_size, decoder_size,
@@ -350,14 +313,14 @@ def __create_batch_major_vecs(self, encoder_size, decoder_size,
batch_encoder_inputs.append(
np.array([encoder_inputs[batch_idx][length_idx]
for batch_idx in xrange(len(encoder_inputs))],
dtype=np.int32))
dtype=np.int32))

# Batch decoder inputs are re-indexed decoder_inputs, we create weights.
for length_idx in xrange(decoder_size):
batch_decoder_inputs.append(
np.array([decoder_inputs[batch_idx][length_idx]
for batch_idx in xrange(len(encoder_inputs))],
dtype=np.int32))
dtype=np.int32))

# Create target_weights to be 0 for targets that are padding.
batch_weight = np.ones(len(encoder_inputs), dtype=np.float32)