
Commit

update
zn-nlp committed Nov 4, 2019
1 parent c59f5a5 commit a2b1662
Showing 8 changed files with 296 additions and 111 deletions.
27 changes: 10 additions & 17 deletions seq2seq_tf2/batcher.py
@@ -116,16 +116,10 @@ def abstract_to_sents(abstract):
sents = []
while True:
try:
print('SENTENCE_START is ', SENTENCE_START)
print('in abstract is ', abstract)
start_p = abstract.index(SENTENCE_START, cur)
print('start_p is ', start_p)
end_p = abstract.index(SENTENCE_END, start_p + 1)
print('end_p is ', end_p)
cur = end_p + len(SENTENCE_END)
print('cur is ', cur)
sents.append(abstract[start_p + len(SENTENCE_START): end_p])
print('sents is ', sents)
except ValueError as e: # no more sentences
return sents

@@ -162,17 +156,14 @@ def get_dec_inp_targ_seqs(sequence, max_len, start_id, stop_id):
# return parsed_example


def example_generator(filenames_1, filenames_2, vocab_path, vocab_size, max_enc_len, max_dec_len, mode):
def example_generator(filenames_1, filenames_2, vocab, max_enc_len, max_dec_len, mode, batch_size):
dataset_1 = tf.data.TextLineDataset(filenames_1)
dataset_2 = tf.data.TextLineDataset(filenames_2)

train_dataset = tf.data.Dataset.zip((dataset_1, dataset_2))
if mode == "train":
train_dataset = train_dataset.shuffle(10, reshuffle_each_iteration=True).repeat()

vocab = Vocab(vocab_path, vocab_size)
# print('vocab is {}'.format(vocab.word2id))

for raw_record in train_dataset:
article = raw_record[0].numpy().decode("utf-8")
# print('article is ', article)
@@ -224,13 +215,15 @@ def example_generator(filenames_1, filenames_2, vocab_path, vocab_size, max_enc_len, max_dec_len, mode):
"abstract": abstract,
"abstract_sents": abstract
}
# print('output is ', output)
yield output
if mode == "test":
for _ in range(batch_size):
yield output
else:
yield output


def batch_generator(generator, filenames_1, filenames_2, vocab_path, vocab_size, max_enc_len, max_dec_len, batch_size, mode):
dataset = tf.data.Dataset.from_generator(generator,
args=[filenames_1, filenames_2, vocab_path, vocab_size, max_enc_len, max_dec_len, mode],
def batch_generator(generator, filenames_1, filenames_2, vocab, max_enc_len, max_dec_len, batch_size, mode):
dataset = tf.data.Dataset.from_generator(lambda: generator(filenames_1, filenames_2, vocab, max_enc_len, max_dec_len, mode, batch_size),
output_types={
"enc_len": tf.int32,
"enc_input": tf.int32,
@@ -296,9 +289,9 @@ def update(entry):
return dataset


def batcher(filenames_1, filenames_2, vocab_path, hpm):
def batcher(filenames_1, filenames_2, vocab, hpm):
# filenames = glob.glob("{}/*.tfrecords".format(data_path))
dataset = batch_generator(example_generator, filenames_1, filenames_2, vocab_path, hpm["vocab_size"], hpm["max_enc_len"],
dataset = batch_generator(example_generator, filenames_1, filenames_2, vocab, hpm["max_enc_len"],
hpm["max_dec_len"], hpm["batch_size"], hpm["mode"])
return dataset

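A minimal sketch of how the reworked batcher might be driven after this change. The file paths and hyperparameter values below are placeholders for illustration, not the repo's defaults; the Vocab constructor signature is the one used elsewhere in this commit.

from seq2seq_tf2.batcher import Vocab, batcher

# Placeholder paths and values, for illustration only.
hpm = {"vocab_size": 50000, "max_enc_len": 400, "max_dec_len": 100,
       "batch_size": 16, "mode": "train"}
vocab = Vocab("data/vocab.txt", hpm["vocab_size"])
dataset = batcher("data/train_articles.txt", "data/train_abstracts.txt", vocab, hpm)
for batch in dataset.take(1):
    print(batch)  # one padded batch of encoder/decoder features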
34 changes: 15 additions & 19 deletions seq2seq_tf2/layers.py
@@ -4,31 +4,29 @@


class Encoder(tf.keras.layers.Layer):
def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz):
def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz, embedding_matrix):
super(Encoder, self).__init__()
self.batch_sz = batch_sz
self.enc_units = enc_units
# embedding_matrix = load_word2vec(vocab_size)
# self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim,
# weights=[embedding_matrix],
# trainable=False)
self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim,
weights=[embedding_matrix],
trainable=False)
self.gru = tf.keras.layers.GRU(self.enc_units,
return_sequences=True,
return_state=True,
recurrent_initializer='glorot_uniform')
# self.bigru = tf.keras.layers.Bidirectional(self.gru, merge_mode='concat')
self.bigru = tf.keras.layers.Bidirectional(self.gru, merge_mode='concat')

def call(self, x, hidden):
x = self.embedding(x)
# hidden = tf.split(hidden, num_or_size_splits=2, axis=1)
# output, forward_state, backward_state = self.bigru(x, initial_state=hidden)
# state = tf.concat([forward_state, backward_state], axis=1)
output, state = self.gru(x, initial_state=hidden)
hidden = tf.split(hidden, num_or_size_splits=2, axis=1)
output, forward_state, backward_state = self.bigru(x, initial_state=hidden)
state = tf.concat([forward_state, backward_state], axis=1)
# output, state = self.gru(x, initial_state=hidden)
return output, state

def initialize_hidden_state(self):
return tf.zeros((self.batch_sz, self.enc_units))
return tf.zeros((self.batch_sz, 2*self.enc_units))


class BahdanauAttention(tf.keras.layers.Layer):
@@ -60,21 +58,19 @@ def call(self, query, values):


class Decoder(tf.keras.layers.Layer):
def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz):
def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz, embedding_matrix):
super(Decoder, self).__init__()
self.batch_sz = batch_sz
self.dec_units = dec_units
# embedding_matrix = load_word2vec(vocab_size)
# self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim,
# weights=[embedding_matrix],
# trainable=False)
self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim,
weights=[embedding_matrix],
trainable=False)
self.gru = tf.keras.layers.GRU(self.dec_units,
return_sequences=True,
return_state=True,
recurrent_initializer='glorot_uniform')
self.fc = tf.keras.layers.Dense(vocab_size, activation=tf.keras.activations.softmax)
# self.fc = tf.nn.dropout(0.5)
self.fc = tf.keras.layers.Dropout(0.5)

def call(self, x, hidden, enc_output, context_vector):
# enc_output shape == (batch_size, max_length, hidden_size)
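For orientation, the bidirectional rewrite above hinges on shapes: initialize_hidden_state now returns a (batch, 2*enc_units) tensor, call splits it into forward/backward halves before feeding the Bidirectional GRU, and the two final states are concatenated back together. A self-contained sketch of that shape handling (toy sizes, not the repo's defaults):

import tensorflow as tf

batch_sz, seq_len, enc_units, embedding_dim, vocab_size = 4, 10, 8, 16, 100

embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
gru = tf.keras.layers.GRU(enc_units, return_sequences=True, return_state=True,
                          recurrent_initializer='glorot_uniform')
bigru = tf.keras.layers.Bidirectional(gru, merge_mode='concat')

x = embedding(tf.zeros((batch_sz, seq_len), dtype=tf.int32))   # (4, 10, 16)
hidden = tf.zeros((batch_sz, 2 * enc_units))                    # (4, 16), as in initialize_hidden_state
fwd_init, bwd_init = tf.split(hidden, num_or_size_splits=2, axis=1)
output, forward_state, backward_state = bigru(x, initial_state=[fwd_init, bwd_init])
state = tf.concat([forward_state, backward_state], axis=1)
print(output.shape, state.shape)  # (4, 10, 16) and (4, 16): both are 2*enc_units wide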
14 changes: 8 additions & 6 deletions seq2seq_tf2/main.py → seq2seq_tf2/run_summarization.py
@@ -12,16 +12,23 @@ def main():
parser = argparse.ArgumentParser()
parser.add_argument("--max_enc_len", default=400, help="Encoder input max sequence length", type=int)
parser.add_argument("--max_dec_len", default=100, help="Decoder input max sequence length", type=int)
parser.add_argument("--max_dec_steps", default=120, help="maximum number of words of the predicted abstract", type=int)
parser.add_argument("--min_dec_steps", default=30, help="Minimum number of words of the predicted abstract", type=int)
parser.add_argument("--batch_size", default=16, help="batch size", type=int)
parser.add_argument("--beam_size", default=3,
help="beam size for beam search decoding (must be equal to batch size in decode mode)",
type=int)
parser.add_argument("--vocab_size", default=50000, help="Vocabulary size", type=int)
parser.add_argument("--embed_size", default=256, help="Words embeddings dimension", type=int)
parser.add_argument("--enc_units", default=256, help="Encoder GRU cell units number", type=int)
parser.add_argument("--dec_units", default=256, help="Decoder GRU cell units number", type=int)
parser.add_argument("--attn_units", default=512, help="[context vector, decoder state, decoder input] feedforward result dimension - this result is used to compute the attention weights", type=int)
parser.add_argument("--learning_rate", default=0.015, help="Learning rate", type=float)
parser.add_argument("--learning_rate", default=0.15, help="Learning rate", type=float)
parser.add_argument("--adagrad_init_acc", default=0.1, help="Adagrad optimizer initial accumulator value. Please refer to the Adagrad optimizer API documentation on tensorflow site for more details.", type=float)
parser.add_argument("--max_grad_norm", default=0.8, help="Gradient norm above which gradients must be clipped", type=float)
parser.add_argument("--checkpoints_save_steps", default=10, help="Save checkpoints every N steps", type=int)
parser.add_argument("--max_steps", default=10000, help="Max number of iterations", type=int)
parser.add_argument("--num_to_test", default=5, help="Number of examples to test", type=int)
parser.add_argument("--mode", default='train', help="training, eval or test options")
parser.add_argument("--pointer_gen", default=False, help="training, eval or test options")

@@ -48,11 +55,6 @@ def main():
# assert os.path.exists(params["data_dir"]), "data_dir doesn't exist"
# assert os.path.isfile(params["vocab_path"]), "vocab_path doesn't exist"

if not os.path.exists("{}".format(params["model_dir"])):
os.makedirs("{}".format(params["model_dir"]))
"""i = len([name for name in os.listdir("{}/{}".format(params["model_dir"], "logdir")) if os.path.isfile(name)])
params["log_file"] = "{}/logdir/tensorflow_{}.log".format(params["model_dir"],i)"""

if params["mode"] == "train":
train(params)

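The new flags presumably end up in the params dict that train() and test() read; the conversion itself happens in the elided part of main(), so the wiring below is an assumption for illustration (vars() on the parsed argparse namespace is the usual idiom):

import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--mode", default='train')
parser.add_argument("--learning_rate", default=0.15, type=float)
args = parser.parse_args([])   # [] -> use the defaults, just for illustration
params = vars(args)            # {'mode': 'train', 'learning_rate': 0.15}
print(params["mode"], params["learning_rate"])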
16 changes: 9 additions & 7 deletions seq2seq_tf2/seq2seq_model.py
@@ -6,12 +6,11 @@
class PGN(tf.keras.Model):
def __init__(self, params):
super(PGN, self).__init__()
# self.embedding_matrix = load_word2vec(params["vocab_size"])
# print()
self.embedding_matrix = load_word2vec(params["vocab_size"])
self.params = params
self.encoder = Encoder(params["vocab_size"], params["embed_size"], params["enc_units"], params["batch_size"])
self.encoder = Encoder(params["vocab_size"], params["embed_size"], params["enc_units"], params["batch_size"], self.embedding_matrix)
self.attention = BahdanauAttention(params["attn_units"])
self.decoder = Decoder(params["vocab_size"], params["embed_size"], params["dec_units"], params["batch_size"])
self.decoder = Decoder(params["vocab_size"], params["embed_size"], params["dec_units"], params["batch_size"], self.embedding_matrix)
self.pointer = Pointer()

def call_encoder(self, enc_inp):
@@ -31,12 +30,10 @@ def call_decoder_onestep(self, latest_tokens, enc_hidden, dec_hidden):
context_vector)
return dec_x, pred, dec_hidden


def call(self, enc_output, dec_hidden, enc_inp, enc_extended_inp, dec_inp, batch_oov_len):
predictions = []
attentions = []
p_gens = []
context_vector, _ = self.attention(dec_hidden, enc_output)

if self.params["pointer_gen"]:
for t in range(dec_inp.shape[1]):
@@ -52,12 +49,17 @@ def call(self, enc_output, dec_hidden, enc_inp, enc_extended_inp, dec_inp, batch_oov_len):

final_dists = _calc_final_dist(enc_extended_inp, predictions, attentions, p_gens, batch_oov_len,
self.params["vocab_size"], self.params["batch_size"])
return tf.stack(final_dists, 1), dec_hidden

if self.params["mode"] == "train":
return tf.stack(final_dists, 1), dec_hidden # predictions_shape = (batch_size, dec_len, vocab_size) with dec_len = 1 in pred mode
else:
return tf.stack(final_dists, 1), dec_hidden, context_vector, tf.stack(attentions, 1), tf.stack(p_gens, 1)

else:
print('dec_inp is ', dec_inp)
print('dec_inp.shape[1] is ', dec_inp.shape[1])
for t in range(dec_inp.shape[1]):
context_vector, _ = self.attention(dec_hidden, enc_output)
dec_x, pred, dec_hidden = self.decoder(tf.expand_dims(dec_inp[:, t], 1),
dec_hidden,
enc_output,
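The _calc_final_dist helper used above is not part of this diff; as a point of reference, the standard pointer-generator mixture it presumably implements (See et al., 2017) looks roughly like the sketch below. This is a hedged reconstruction of the usual formulation, not the repo's actual code.

import tensorflow as tf

def calc_final_dist_sketch(enc_extended_inp, vocab_dists, attn_dists, p_gens,
                           batch_oov_len, vocab_size, batch_size):
    # Mix the generation and copy distributions: p_gen * P_vocab and (1 - p_gen) * P_attn.
    vocab_dists = [p_gen * dist for p_gen, dist in zip(p_gens, vocab_dists)]
    attn_dists = [(1 - p_gen) * dist for p_gen, dist in zip(p_gens, attn_dists)]

    # Extend the vocabulary axis with slots for the in-article OOV words.
    extra_zeros = tf.zeros((batch_size, batch_oov_len))
    vocab_dists_extended = [tf.concat([dist, extra_zeros], axis=1) for dist in vocab_dists]

    # Scatter the attention mass onto the extended vocabulary via the source-word ids (int32).
    enc_len = tf.shape(enc_extended_inp)[1]
    batch_nums = tf.tile(tf.expand_dims(tf.range(batch_size), 1), [1, enc_len])
    indices = tf.stack([batch_nums, enc_extended_inp], axis=2)
    shape = [batch_size, vocab_size + batch_oov_len]
    attn_dists_projected = [tf.scatter_nd(indices, dist, shape) for dist in attn_dists]

    # Element-wise sum gives the final distribution for each decoder step.
    return [v + a for v, a in zip(vocab_dists_extended, attn_dists_projected)]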
147 changes: 94 additions & 53 deletions seq2seq_tf2/test.py
@@ -1,11 +1,13 @@
import tensorflow as tf
from seq2seq_tf2.seq2seq_model import PGN
from seq2seq_tf2.batcher import Vocab, START_DECODING, STOP_DECODING, article_to_ids, output_to_words, SENTENCE_END
from seq2seq_tf2.batcher import Vocab, START_DECODING, STOP_DECODING, article_to_ids, output_to_words, SENTENCE_END, batcher
from seq2seq_tf2.preprocess import preprocess_sentence
from seq2seq_tf2.test_helper import beam_decode
from tqdm import tqdm
from seq2seq_tf2 import config
import json

params = {'max_enc_len': 400,
_params = {'max_enc_len': 400,
'max_dec_len': 100,
'batch_size': 1,
'vocab_size': 50000,
@@ -31,60 +33,99 @@
'log_file': ''}


def test(sentence):
vocab = Vocab(params["vocab_path"], params["vocab_size"])
model = PGN(params)

ckpt = tf.train.Checkpoint(model=model)
checkpoint_dir = "{}/checkpoint".format(params["model_dir"])
latest_ckpt = tf.train.latest_checkpoint(checkpoint_dir)
ckpt.restore(latest_ckpt).expect_partial()

sentence = preprocess_sentence(sentence)
print('sentence is ', sentence)
sentence_words = sentence.split()[:params["max_enc_len"]]
print('sentence_words is ', sentence_words)
enc_input = [vocab.word_to_id(w) for w in sentence_words]
print('enc_input is ', enc_input)
enc_input_extend_vocab, article_oovs = article_to_ids(sentence_words, vocab)
print('enc_input_extend_vocab is ', enc_input_extend_vocab)
print('article_oovs', article_oovs)

start_decoding = vocab.word_to_id(START_DECODING)
stop_decoding = vocab.word_to_id(STOP_DECODING)

enc_input = tf.keras.preprocessing.sequence.pad_sequences([enc_input],
maxlen=params["max_enc_len"],
padding='post')
print('enc_input is ', enc_input)
enc_input = tf.convert_to_tensor(enc_input)
print('enc_input is ', enc_input)
def test(params):
assert params["mode"].lower() == "test", "change training mode to 'test' or 'eval'"
assert params["beam_size"] == params["batch_size"], "Beam size must be equal to batch_size, change the params"

enc_hidden, enc_output = model.call_encoder(enc_input)
print('enc_hidden is ', enc_hidden)
print('enc_output is ', enc_output)
dec_hidden = enc_hidden
dec_input = tf.expand_dims([start_decoding], 0)
print('dec_input is ', dec_input)

result = ''
while dec_input != vocab.word_to_id(STOP_DECODING):
_, predictions, dec_hidden = model.call_decoder_onestep(dec_input, enc_output, dec_hidden)
print('predictions is ', predictions)

predicted_id = tf.argmax(predictions[0]).numpy()
print('predicted_id', predicted_id)
result += vocab.id_to_word(predicted_id) + ' '

if vocab.id_to_word(predicted_id) == SENTENCE_END \
or len(result.split()) >= params['max_dec_len']:
print('Early stopping')
break
tf.compat.v1.logging.info("Building the model ...")
model = PGN(params)

dec_input = tf.expand_dims([predicted_id], 1)
print('dec_input:', dec_input)
print("Creating the vocab ...")
vocab = Vocab(params["vocab_path"], params["vocab_size"])

print('result: ', result)
print("Creating the batcher ...")
b = batcher(params["data_dir"], vocab, params)

print("Creating the checkpoint manager")
checkpoint_dir = "{}".format(params["checkpoint_dir"])
ckpt = tf.train.Checkpoint(step=tf.Variable(0), PGN=model)
ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_dir, max_to_keep=11)

path = params["model_path"] if params["model_path"] else ckpt_manager.latest_checkpoint
ckpt.restore(path)
print("Model restored")

for batch in b:
yield beam_decode(model, batch, vocab, params)


def test_and_save(params):
assert params["test_save_dir"], "provide a dir where to save the results"
gen = test(params)
with tqdm(total=params["num_to_test"], position=0, leave=True) as pbar:
for i in range(params["num_to_test"]):
trial = next(gen)
with open(params["test_save_dir"] + "/article_" + str(i) + ".txt", "w") as f:
f.write("article:\n")
f.write(trial.text)
f.write("\n\nabstract:\n")
f.write(trial.abstract)
pbar.update(1)

# def _test(sentence):
# vocab = Vocab(params["vocab_path"], params["vocab_size"])
# model = PGN(params)
#
# ckpt = tf.train.Checkpoint(model=model)
# checkpoint_dir = "{}/checkpoint".format(params["model_dir"])
# latest_ckpt = tf.train.latest_checkpoint(checkpoint_dir)
# ckpt.restore(latest_ckpt).expect_partial()
#
# sentence = preprocess_sentence(sentence)
# print('sentence is ', sentence)
# sentence_words = sentence.split()[:params["max_enc_len"]]
# print('sentence_words is ', sentence_words)
# enc_input = [vocab.word_to_id(w) for w in sentence_words]
# print('enc_input is ', enc_input)
# enc_input_extend_vocab, article_oovs = article_to_ids(sentence_words, vocab)
# print('enc_input_extend_vocab is ', enc_input_extend_vocab)
# print('article_oovs', article_oovs)
#
# start_decoding = vocab.word_to_id(START_DECODING)
# stop_decoding = vocab.word_to_id(STOP_DECODING)
#
# enc_input = tf.keras.preprocessing.sequence.pad_sequences([enc_input],
# maxlen=params["max_enc_len"],
# padding='post')
# print('enc_input is ', enc_input)
# enc_input = tf.convert_to_tensor(enc_input)
# print('enc_input is ', enc_input)
#
# enc_hidden, enc_output = model.call_encoder(enc_input)
# print('enc_hidden is ', enc_hidden)
# print('enc_output is ', enc_output)
# dec_hidden = enc_hidden
# dec_input = tf.expand_dims([start_decoding], 0)
# print('dec_input is ', dec_input)
#
# result = ''
# while dec_input != vocab.word_to_id(STOP_DECODING):
# _, predictions, dec_hidden = model.call_decoder_onestep(dec_input, enc_output, dec_hidden)
# print('predictions is ', predictions)
#
# predicted_id = tf.argmax(predictions[0]).numpy()
# print('predicted_id', predicted_id)
# result += vocab.id_to_word(predicted_id) + ' '
#
# if vocab.id_to_word(predicted_id) == SENTENCE_END \
# or len(result.split()) >= params['max_dec_len']:
# print('Early stopping')
# break
#
# dec_input = tf.expand_dims([predicted_id], 1)
# print('dec_input:', dec_input)
#
# print('result: ', result)


if __name__ == '__main__':
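Tying the test path together: with mode == "test" the reworked example_generator yields each article batch_size times, which is why the assertion above requires beam_size == batch_size — every batch handed to beam_decode is a single article tiled across the beam. A hedged sketch of invoking test_and_save directly; all values below are illustrative placeholders, and in practice they come from run_summarization.py's flags.

# Illustrative only: these are the keys test()/test_and_save() read, with placeholder values.
_params.update({"mode": "test", "batch_size": 3, "beam_size": 3,
                "num_to_test": 5, "test_save_dir": "results",
                "model_path": "", "checkpoint_dir": "checkpoints",
                "data_dir": "data", "vocab_path": "data/vocab.txt"})
test_and_save(_params)   # expected to write results/article_0.txt ... article_4.txt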

