
Commit

update
zn-nlp committed Nov 4, 2019
1 parent c59f5a5 commit a2b1662
Showing 8 changed files with 296 additions and 111 deletions.
27 changes: 10 additions & 17 deletions seq2seq_tf2/batcher.py
@@ -116,16 +116,10 @@ def abstract_to_sents(abstract):
sents = []
while True:
try:
print('SENTENCE_START is ', SENTENCE_START)
print('in abstract is ', abstract)
start_p = abstract.index(SENTENCE_START, cur)
print('start_p is ', start_p)
end_p = abstract.index(SENTENCE_END, start_p + 1)
print('end_p is ', end_p)
cur = end_p + len(SENTENCE_END)
print('cur is ', cur)
sents.append(abstract[start_p + len(SENTENCE_START): end_p])
print('sents is ', sents)
except ValueError as e: # no more sentences
return sents

@@ -162,17 +156,14 @@ def get_dec_inp_targ_seqs(sequence, max_len, start_id, stop_id):
# return parsed_example


def example_generator(filenames_1, filenames_2, vocab_path, vocab_size, max_enc_len, max_dec_len, mode):
def example_generator(filenames_1, filenames_2, vocab, max_enc_len, max_dec_len, mode, batch_size):
dataset_1 = tf.data.TextLineDataset(filenames_1)
dataset_2 = tf.data.TextLineDataset(filenames_2)

train_dataset = tf.data.Dataset.zip((dataset_1, dataset_2))
if mode == "train":
train_dataset = train_dataset.shuffle(10, reshuffle_each_iteration=True).repeat()

vocab = Vocab(vocab_path, vocab_size)
# print('vocab is {}'.format(vocab.word2id))

for raw_record in train_dataset:
article = raw_record[0].numpy().decode("utf-8")
# print('article is ', article)
@@ -224,13 +215,15 @@ def example_generator(filenames_1, filenames_2, vocab_path, vocab_size, max_enc_len, max_dec_len, mode):
"abstract": abstract,
"abstract_sents": abstract
}
# print('output is ', output)
yield output
if mode == "test":
for _ in range(batch_size):
yield output
else:
yield output


def batch_generator(generator, filenames_1, filenames_2, vocab_path, vocab_size, max_enc_len, max_dec_len, batch_size, mode):
dataset = tf.data.Dataset.from_generator(generator,
args=[filenames_1, filenames_2, vocab_path, vocab_size, max_enc_len, max_dec_len, mode],
def batch_generator(generator, filenames_1, filenames_2, vocab, max_enc_len, max_dec_len, batch_size, mode):
dataset = tf.data.Dataset.from_generator(lambda: generator(filenames_1, filenames_2, vocab, max_enc_len, max_dec_len, mode, batch_size),
output_types={
"enc_len": tf.int32,
"enc_input": tf.int32,
@@ -296,9 +289,9 @@ def update(entry):
return dataset


def batcher(filenames_1, filenames_2, vocab_path, hpm):
def batcher(filenames_1, filenames_2, vocab, hpm):
# filenames = glob.glob("{}/*.tfrecords".format(data_path))
dataset = batch_generator(example_generator, filenames_1, filenames_2, vocab_path, hpm["vocab_size"], hpm["max_enc_len"],
dataset = batch_generator(example_generator, filenames_1, filenames_2, vocab, hpm["max_enc_len"],
hpm["max_dec_len"], hpm["batch_size"], hpm["mode"])
return dataset

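A minimal sketch of how the reworked batcher might be driven after this change. The file paths and hyperparameter values below are placeholders for illustration, not the repo's defaults; the Vocab constructor signature is the one used elsewhere in this commit.

from seq2seq_tf2.batcher import Vocab, batcher

# Placeholder paths and values, for illustration only.
hpm = {"vocab_size": 50000, "max_enc_len": 400, "max_dec_len": 100,
       "batch_size": 16, "mode": "train"}
vocab = Vocab("data/vocab.txt", hpm["vocab_size"])
dataset = batcher("data/train_articles.txt", "data/train_abstracts.txt", vocab, hpm)
for batch in dataset.take(1):
    print(batch)  # one padded batch of encoder/decoder features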
34 changes: 15 additions & 19 deletions seq2seq_tf2/layers.py
@@ -4,31 +4,29 @@


class Encoder(tf.keras.layers.Layer):
def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz):
def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz, embedding_matrix):
super(Encoder, self).__init__()
self.batch_sz = batch_sz
self.enc_units = enc_units
# embedding_matrix = load_word2vec(vocab_size)
# self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim,
# weights=[embedding_matrix],
# trainable=False)
self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim,
weights=[embedding_matrix],
trainable=False)
self.gru = tf.keras.layers.GRU(self.enc_units,
return_sequences=True,
return_state=True,
recurrent_initializer='glorot_uniform')
# self.bigru = tf.keras.layers.Bidirectional(self.gru, merge_mode='concat')
self.bigru = tf.keras.layers.Bidirectional(self.gru, merge_mode='concat')

def call(self, x, hidden):
x = self.embedding(x)
# hidden = tf.split(hidden, num_or_size_splits=2, axis=1)
# output, forward_state, backward_state = self.bigru(x, initial_state=hidden)
# state = tf.concat([forward_state, backward_state], axis=1)
output, state = self.gru(x, initial_state=hidden)
hidden = tf.split(hidden, num_or_size_splits=2, axis=1)
output, forward_state, backward_state = self.bigru(x, initial_state=hidden)
state = tf.concat([forward_state, backward_state], axis=1)
# output, state = self.gru(x, initial_state=hidden)
return output, state

def initialize_hidden_state(self):
return tf.zeros((self.batch_sz, self.enc_units))
return tf.zeros((self.batch_sz, 2*self.enc_units))


class BahdanauAttention(tf.keras.layers.Layer):
@@ -60,21 +58,19 @@ def call(self, query, values):


class Decoder(tf.keras.layers.Layer):
def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz):
def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz, embedding_matrix):
super(Decoder, self).__init__()
self.batch_sz = batch_sz
self.dec_units = dec_units
# embedding_matrix = load_word2vec(vocab_size)
# self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim,
# weights=[embedding_matrix],
# trainable=False)
self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim,
weights=[embedding_matrix],
trainable=False)
self.gru = tf.keras.layers.GRU(self.dec_units,
return_sequences=True,
return_state=True,
recurrent_initializer='glorot_uniform')
self.fc = tf.keras.layers.Dense(vocab_size, activation=tf.keras.activations.softmax)
# self.fc = tf.nn.dropout(0.5)
self.fc = tf.keras.layers.Dropout(0.5)

def call(self, x, hidden, enc_output, context_vector):
# enc_output shape == (batch_size, max_length, hidden_size)
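For orientation, the bidirectional rewrite above hinges on shapes: initialize_hidden_state now returns a (batch, 2*enc_units) tensor, call splits it into forward/backward halves before feeding the Bidirectional GRU, and the two final states are concatenated back together. A self-contained sketch of that shape handling (toy sizes, not the repo's defaults):

import tensorflow as tf

batch_sz, seq_len, enc_units, embedding_dim, vocab_size = 4, 10, 8, 16, 100

embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
gru = tf.keras.layers.GRU(enc_units, return_sequences=True, return_state=True,
                          recurrent_initializer='glorot_uniform')
bigru = tf.keras.layers.Bidirectional(gru, merge_mode='concat')

x = embedding(tf.zeros((batch_sz, seq_len), dtype=tf.int32))   # (4, 10, 16)
hidden = tf.zeros((batch_sz, 2 * enc_units))                    # (4, 16), as in initialize_hidden_state
fwd_init, bwd_init = tf.split(hidden, num_or_size_splits=2, axis=1)
output, forward_state, backward_state = bigru(x, initial_state=[fwd_init, bwd_init])
state = tf.concat([forward_state, backward_state], axis=1)
print(output.shape, state.shape)  # (4, 10, 16) and (4, 16): both are 2*enc_units wide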
14 changes: 8 additions & 6 deletions seq2seq_tf2/main.py → seq2seq_tf2/run_summarization.py
@@ -12,16 +12,23 @@ def main():
parser = argparse.ArgumentParser()
parser.add_argument("--max_enc_len", default=400, help="Encoder input max sequence length", type=int)
parser.add_argument("--max_dec_len", default=100, help="Decoder input max sequence length", type=int)
parser.add_argument("--max_dec_steps", default=120, help="maximum number of words of the predicted abstract", type=int)
parser.add_argument("--min_dec_steps", default=30, help="Minimum number of words of the predicted abstract", type=int)
parser.add_argument("--batch_size", default=16, help="batch size", type=int)
parser.add_argument("--beam_size", default=3,
help="beam size for beam search decoding (must be equal to batch size in decode mode)",
type=int)
parser.add_argument("--vocab_size", default=50000, help="Vocabulary size", type=int)
parser.add_argument("--embed_size", default=256, help="Words embeddings dimension", type=int)
parser.add_argument("--enc_units", default=256, help="Encoder GRU cell units number", type=int)
parser.add_argument("--dec_units", default=256, help="Decoder GRU cell units number", type=int)
parser.add_argument("--attn_units", default=512, help="[context vector, decoder state, decoder input] feedforward result dimension - this result is used to compute the attention weights", type=int)
parser.add_argument("--learning_rate", default=0.015, help="Learning rate", type=float)
parser.add_argument("--learning_rate", default=0.15, help="Learning rate", type=float)
parser.add_argument("--adagrad_init_acc", default=0.1, help="Adagrad optimizer initial accumulator value. Please refer to the Adagrad optimizer API documentation on tensorflow site for more details.", type=float)
parser.add_argument("--max_grad_norm", default=0.8, help="Gradient norm above which gradients must be clipped", type=float)
parser.add_argument("--checkpoints_save_steps", default=10, help="Save checkpoints every N steps", type=int)
parser.add_argument("--max_steps", default=10000, help="Max number of iterations", type=int)
parser.add_argument("--num_to_test", default=5, help="Number of examples to test", type=int)
parser.add_argument("--mode", default='train', help="training, eval or test options")
parser.add_argument("--pointer_gen", default=False, help="training, eval or test options")

@@ -48,11 +55,6 @@ def main():
# assert os.path.exists(params["data_dir"]), "data_dir doesn't exist"
# assert os.path.isfile(params["vocab_path"]), "vocab_path doesn't exist"

if not os.path.exists("{}".format(params["model_dir"])):
os.makedirs("{}".format(params["model_dir"]))
"""i = len([name for name in os.listdir("{}/{}".format(params["model_dir"], "logdir")) if os.path.isfile(name)])
params["log_file"] = "{}/logdir/tensorflow_{}.log".format(params["model_dir"],i)"""

if params["mode"] == "train":
train(params)

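The new flags presumably end up in the params dict that train() and test() read; the conversion itself happens in the elided part of main(), so the wiring below is an assumption for illustration (vars() on the parsed argparse namespace is the usual idiom):

import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--mode", default='train')
parser.add_argument("--learning_rate", default=0.15, type=float)
args = parser.parse_args([])   # [] -> use the defaults, just for illustration
params = vars(args)            # {'mode': 'train', 'learning_rate': 0.15}
print(params["mode"], params["learning_rate"])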
16 changes: 9 additions & 7 deletions seq2seq_tf2/seq2seq_model.py
@@ -6,12 +6,11 @@
class PGN(tf.keras.Model):
def __init__(self, params):
super(PGN, self).__init__()
# self.embedding_matrix = load_word2vec(params["vocab_size"])
# print()
self.embedding_matrix = load_word2vec(params["vocab_size"])
self.params = params
self.encoder = Encoder(params["vocab_size"], params["embed_size"], params["enc_units"], params["batch_size"])
self.encoder = Encoder(params["vocab_size"], params["embed_size"], params["enc_units"], params["batch_size"], self.embedding_matrix)
self.attention = BahdanauAttention(params["attn_units"])
self.decoder = Decoder(params["vocab_size"], params["embed_size"], params["dec_units"], params["batch_size"])
self.decoder = Decoder(params["vocab_size"], params["embed_size"], params["dec_units"], params["batch_size"], self.embedding_matrix)
self.pointer = Pointer()

def call_encoder(self, enc_inp):
@@ -31,12 +30,10 @@ def call_decoder_onestep(self, latest_tokens, enc_hidden, dec_hidden):
context_vector)
return dec_x, pred, dec_hidden


def call(self, enc_output, dec_hidden, enc_inp, enc_extended_inp, dec_inp, batch_oov_len):
predictions = []
attentions = []
p_gens = []
context_vector, _ = self.attention(dec_hidden, enc_output)

if self.params["pointer_gen"]:
for t in range(dec_inp.shape[1]):
@@ -52,12 +49,17 @@ def call(self, enc_output, dec_hidden, enc_inp, enc_extended_inp, dec_inp, batch_oov_len):

final_dists = _calc_final_dist(enc_extended_inp, predictions, attentions, p_gens, batch_oov_len,
self.params["vocab_size"], self.params["batch_size"])
return tf.stack(final_dists, 1), dec_hidden

if self.params["mode"] == "train":
return tf.stack(final_dists, 1), dec_hidden # predictions_shape = (batch_size, dec_len, vocab_size) with dec_len = 1 in pred mode
else:
return tf.stack(final_dists, 1), dec_hidden, context_vector, tf.stack(attentions, 1), tf.stack(p_gens, 1)

else:
print('dec_inp is ', dec_inp)
print('dec_inp.shape[1] is ', dec_inp.shape[1])
for t in range(dec_inp.shape[1]):
context_vector, _ = self.attention(dec_hidden, enc_output)
dec_x, pred, dec_hidden = self.decoder(tf.expand_dims(dec_inp[:, t], 1),
dec_hidden,
enc_output,
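The _calc_final_dist helper used above is not part of this diff; as a point of reference, the standard pointer-generator mixture it presumably implements (See et al., 2017) looks roughly like the sketch below. This is a hedged reconstruction of the usual formulation, not the repo's actual code.

import tensorflow as tf

def calc_final_dist_sketch(enc_extended_inp, vocab_dists, attn_dists, p_gens,
                           batch_oov_len, vocab_size, batch_size):
    # Mix the generation and copy distributions: p_gen * P_vocab and (1 - p_gen) * P_attn.
    vocab_dists = [p_gen * dist for p_gen, dist in zip(p_gens, vocab_dists)]
    attn_dists = [(1 - p_gen) * dist for p_gen, dist in zip(p_gens, attn_dists)]

    # Extend the vocabulary axis with slots for the in-article OOV words.
    extra_zeros = tf.zeros((batch_size, batch_oov_len))
    vocab_dists_extended = [tf.concat([dist, extra_zeros], axis=1) for dist in vocab_dists]

    # Scatter the attention mass onto the extended vocabulary via the source-word ids (int32).
    enc_len = tf.shape(enc_extended_inp)[1]
    batch_nums = tf.tile(tf.expand_dims(tf.range(batch_size), 1), [1, enc_len])
    indices = tf.stack([batch_nums, enc_extended_inp], axis=2)
    shape = [batch_size, vocab_size + batch_oov_len]
    attn_dists_projected = [tf.scatter_nd(indices, dist, shape) for dist in attn_dists]

    # Element-wise sum gives the final distribution for each decoder step.
    return [v + a for v, a in zip(vocab_dists_extended, attn_dists_projected)]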
147 changes: 94 additions & 53 deletions seq2seq_tf2/test.py
@@ -1,11 +1,13 @@
import tensorflow as tf
from seq2seq_tf2.seq2seq_model import PGN
from seq2seq_tf2.batcher import Vocab, START_DECODING, STOP_DECODING, article_to_ids, output_to_words, SENTENCE_END
from seq2seq_tf2.batcher import Vocab, START_DECODING, STOP_DECODING, article_to_ids, output_to_words, SENTENCE_END, batcher
from seq2seq_tf2.preprocess import preprocess_sentence
from seq2seq_tf2.test_helper import beam_decode
from tqdm import tqdm
from seq2seq_tf2 import config
import json

params = {'max_enc_len': 400,
_params = {'max_enc_len': 400,
'max_dec_len': 100,
'batch_size': 1,
'vocab_size': 50000,
@@ -31,60 +33,99 @@
'log_file': ''}


def test(sentence):
vocab = Vocab(params["vocab_path"], params["vocab_size"])
model = PGN(params)

ckpt = tf.train.Checkpoint(model=model)
checkpoint_dir = "{}/checkpoint".format(params["model_dir"])
latest_ckpt = tf.train.latest_checkpoint(checkpoint_dir)
ckpt.restore(latest_ckpt).expect_partial()

sentence = preprocess_sentence(sentence)
print('sentence is ', sentence)
sentence_words = sentence.split()[:params["max_enc_len"]]
print('sentence_words is ', sentence_words)
enc_input = [vocab.word_to_id(w) for w in sentence_words]
print('enc_input is ', enc_input)
enc_input_extend_vocab, article_oovs = article_to_ids(sentence_words, vocab)
print('enc_input_extend_vocab is ', enc_input_extend_vocab)
print('article_oovs', article_oovs)

start_decoding = vocab.word_to_id(START_DECODING)
stop_decoding = vocab.word_to_id(STOP_DECODING)

enc_input = tf.keras.preprocessing.sequence.pad_sequences([enc_input],
maxlen=params["max_enc_len"],
padding='post')
print('enc_input is ', enc_input)
enc_input = tf.convert_to_tensor(enc_input)
print('enc_input is ', enc_input)
def test(params):
assert params["mode"].lower() == "test", "change training mode to 'test' or 'eval'"
assert params["beam_size"] == params["batch_size"], "Beam size must be equal to batch_size, change the params"

enc_hidden, enc_output = model.call_encoder(enc_input)
print('enc_hidden is ', enc_hidden)
print('enc_output is ', enc_output)
dec_hidden = enc_hidden
dec_input = tf.expand_dims([start_decoding], 0)
print('dec_input is ', dec_input)

result = ''
while dec_input != vocab.word_to_id(STOP_DECODING):
_, predictions, dec_hidden = model.call_decoder_onestep(dec_input, enc_output, dec_hidden)
print('predictions is ', predictions)

predicted_id = tf.argmax(predictions[0]).numpy()
print('predicted_id', predicted_id)
result += vocab.id_to_word(predicted_id) + ' '

if vocab.id_to_word(predicted_id) == SENTENCE_END \
or len(result.split()) >= params['max_dec_len']:
print('Early stopping')
break
tf.compat.v1.logging.info("Building the model ...")
model = PGN(params)

dec_input = tf.expand_dims([predicted_id], 1)
print('dec_input:', dec_input)
print("Creating the vocab ...")
vocab = Vocab(params["vocab_path"], params["vocab_size"])

print('result: ', result)
print("Creating the batcher ...")
b = batcher(params["data_dir"], vocab, params)

print("Creating the checkpoint manager")
checkpoint_dir = "{}".format(params["checkpoint_dir"])
ckpt = tf.train.Checkpoint(step=tf.Variable(0), PGN=model)
ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_dir, max_to_keep=11)

path = params["model_path"] if params["model_path"] else ckpt_manager.latest_checkpoint
ckpt.restore(path)
print("Model restored")

for batch in b:
yield beam_decode(model, batch, vocab, params)


def test_and_save(params):
assert params["test_save_dir"], "provide a dir where to save the results"
gen = test(params)
with tqdm(total=params["num_to_test"], position=0, leave=True) as pbar:
for i in range(params["num_to_test"]):
trial = next(gen)
with open(params["test_save_dir"] + "/article_" + str(i) + ".txt", "w") as f:
f.write("article:\n")
f.write(trial.text)
f.write("\n\nabstract:\n")
f.write(trial.abstract)
pbar.update(1)

# def _test(sentence):
# vocab = Vocab(params["vocab_path"], params["vocab_size"])
# model = PGN(params)
#
# ckpt = tf.train.Checkpoint(model=model)
# checkpoint_dir = "{}/checkpoint".format(params["model_dir"])
# latest_ckpt = tf.train.latest_checkpoint(checkpoint_dir)
# ckpt.restore(latest_ckpt).expect_partial()
#
# sentence = preprocess_sentence(sentence)
# print('sentence is ', sentence)
# sentence_words = sentence.split()[:params["max_enc_len"]]
# print('sentence_words is ', sentence_words)
# enc_input = [vocab.word_to_id(w) for w in sentence_words]
# print('enc_input is ', enc_input)
# enc_input_extend_vocab, article_oovs = article_to_ids(sentence_words, vocab)
# print('enc_input_extend_vocab is ', enc_input_extend_vocab)
# print('article_oovs', article_oovs)
#
# start_decoding = vocab.word_to_id(START_DECODING)
# stop_decoding = vocab.word_to_id(STOP_DECODING)
#
# enc_input = tf.keras.preprocessing.sequence.pad_sequences([enc_input],
# maxlen=params["max_enc_len"],
# padding='post')
# print('enc_input is ', enc_input)
# enc_input = tf.convert_to_tensor(enc_input)
# print('enc_input is ', enc_input)
#
# enc_hidden, enc_output = model.call_encoder(enc_input)
# print('enc_hidden is ', enc_hidden)
# print('enc_output is ', enc_output)
# dec_hidden = enc_hidden
# dec_input = tf.expand_dims([start_decoding], 0)
# print('dec_input is ', dec_input)
#
# result = ''
# while dec_input != vocab.word_to_id(STOP_DECODING):
# _, predictions, dec_hidden = model.call_decoder_onestep(dec_input, enc_output, dec_hidden)
# print('predictions is ', predictions)
#
# predicted_id = tf.argmax(predictions[0]).numpy()
# print('predicted_id', predicted_id)
# result += vocab.id_to_word(predicted_id) + ' '
#
# if vocab.id_to_word(predicted_id) == SENTENCE_END \
# or len(result.split()) >= params['max_dec_len']:
# print('Early stopping')
# break
#
# dec_input = tf.expand_dims([predicted_id], 1)
# print('dec_input:', dec_input)
#
# print('result: ', result)


if __name__ == '__main__':
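Tying the test path together: with mode == "test" the reworked example_generator yields each article batch_size times, which is why the assertion above requires beam_size == batch_size — every batch handed to beam_decode is a single article tiled across the beam. A hedged sketch of invoking test_and_save directly; all values below are illustrative placeholders, and in practice they come from run_summarization.py's flags.

# Illustrative only: these are the keys test()/test_and_save() read, with placeholder values.
_params.update({"mode": "test", "batch_size": 3, "beam_size": 3,
                "num_to_test": 5, "test_save_dir": "results",
                "model_path": "", "checkpoint_dir": "checkpoints",
                "data_dir": "data", "vocab_path": "data/vocab.txt"})
test_and_save(_params)   # expected to write results/article_0.txt ... article_4.txt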

