Skip to content

Commit 948f8ef

Browse files
gcampaxkpe
authored and committed
transformer_fast_decode, beam search: take an optional cache and return it (tensorflow#999)
Some models, eg. semantic parsing models with copying mechanisms, want to use the output of Transformer for multiple predictions. One way to do so is to modify the symbols_to_logits_fn to generate the additional predictions and save it in the cache dictionary. To do so, though, fast_decode() must allow an externally supplied cache, and must return it to the caller after the loop.
1 parent 9c36ecf commit 948f8ef

File tree

7 files changed

+58
-20
lines changed

7 files changed

+58
-20
lines changed

tensor2tensor/layers/latent_layers.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -168,7 +168,7 @@ def symbols_to_logits_fn(ids):
168168

169169
initial_ids = tf.zeros([tf.shape(latents_dense_in)[0]], dtype=tf.int32)
170170
length = tf.shape(latents_dense_in)[1]
171-
ids, _ = beam_search.beam_search(
171+
ids, _, _ = beam_search.beam_search(
172172
symbols_to_logits_fn,
173173
initial_ids,
174174
1,

tensor2tensor/models/research/transformer_nat.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -228,7 +228,7 @@ def symbols_to_logits_fn(ids):
228228

229229
initial_ids = tf.zeros([tf.shape(latents_dense_in)[0]], dtype=tf.int32)
230230
length = tf.shape(latents_dense_in)[1]
231-
ids, _ = beam_search.beam_search(
231+
ids, _, _ = beam_search.beam_search(
232232
symbols_to_logits_fn,
233233
initial_ids,
234234
beam_size=1,

tensor2tensor/models/research/transformer_vae.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -290,7 +290,7 @@ def symbols_to_logits_fn(ids):
290290

291291
initial_ids = tf.zeros([tf.shape(latents_dense_in)[0]], dtype=tf.int32)
292292
length = tf.shape(latents_dense_in)[1]
293-
ids, _ = beam_search.beam_search(
293+
ids, _, _ = beam_search.beam_search(
294294
symbols_to_logits_fn, initial_ids, beam_size, length,
295295
vocab_size, alpha=0.0, eos_id=-1, stop_early=False)
296296

tensor2tensor/models/transformer.py

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -910,7 +910,8 @@ def fast_decode(encoder_output,
910910
eos_id=beam_search.EOS_ID,
911911
batch_size=None,
912912
force_decode_length=False,
913-
scope_prefix="body/"):
913+
scope_prefix="body/",
914+
cache=None):
914915
"""Given encoder output and a symbols to logits function, does fast decoding.
915916
916917
Implements both greedy and beam search decoding, uses beam search iff
@@ -957,7 +958,9 @@ def fast_decode(encoder_output,
957958
vars_3d_num_heads = (
958959
hparams.num_heads if hparams.get("attention_variables_3d") else 0)
959960

960-
cache = {
961+
if cache is None:
962+
cache = dict()
963+
cache.update({
961964
"layer_%d" % layer: {
962965
"k":
963966
common_attention.split_heads(
@@ -966,7 +969,7 @@ def fast_decode(encoder_output,
966969
common_attention.split_heads(
967970
tf.zeros([batch_size, 0, value_channels]), hparams.num_heads),
968971
} for layer in range(num_layers)
969-
}
972+
})
970973

971974
# If `ffn_layer` is in `["dense_relu_dense" or "conv_hidden_relu"]`, then the
972975
# cache key "f" won't be used, which means that the` shape of cache["f"]`
@@ -1000,7 +1003,7 @@ def fast_decode(encoder_output,
10001003

10011004
if beam_size > 1: # Beam Search
10021005
initial_ids = sos_id * tf.ones([batch_size], dtype=tf.int32)
1003-
decoded_ids, scores = beam_search.beam_search(
1006+
decoded_ids, scores, cache = beam_search.beam_search(
10041007
symbols_to_logits_fn,
10051008
initial_ids,
10061009
beam_size,
@@ -1047,7 +1050,7 @@ def is_not_finished(i, hit_eos, *_):
10471050
hit_eos = tf.fill([batch_size], False)
10481051
next_id = sos_id * tf.ones([batch_size, 1], dtype=tf.int64)
10491052
initial_log_prob = tf.zeros([batch_size], dtype=tf.float32)
1050-
_, _, _, decoded_ids, _, log_prob = tf.while_loop(
1053+
_, _, _, decoded_ids, cache, log_prob = tf.while_loop(
10511054
is_not_finished,
10521055
inner_loop, [
10531056
tf.constant(0), hit_eos, next_id, decoded_ids, cache,
@@ -1063,7 +1066,7 @@ def is_not_finished(i, hit_eos, *_):
10631066
])
10641067
scores = log_prob
10651068

1066-
return {"outputs": decoded_ids, "scores": scores}
1069+
return {"outputs": decoded_ids, "scores": scores, "cache": cache}
10671070

10681071

10691072
@registry.register_model

tensor2tensor/utils/beam_search.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -752,7 +752,7 @@ def _is_finished(i, unused_alive_seq, alive_log_probs, unused_finished_seq,
752752
tf.less(i, decode_length), tf.logical_not(bound_is_met))
753753

754754
(_, alive_seq, alive_log_probs, finished_seq, finished_scores,
755-
finished_flags, _) = tf.while_loop(
755+
finished_flags, states) = tf.while_loop(
756756
_is_finished,
757757
inner_loop, [
758758
tf.constant(0), alive_seq, alive_log_probs, finished_seq,
@@ -786,4 +786,4 @@ def _is_finished(i, unused_alive_seq, alive_log_probs, unused_finished_seq,
786786
tf.reduce_any(finished_flags, 1), finished_seq, alive_seq)
787787
finished_scores = tf.where(
788788
tf.reduce_any(finished_flags, 1), finished_scores, alive_log_probs)
789-
return finished_seq, finished_scores
789+
return finished_seq, finished_scores, states

tensor2tensor/utils/beam_search_test.py

Lines changed: 43 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@ def symbols_to_logits(_):
3838
# Just return random logits
3939
return tf.random_uniform((batch_size * beam_size, vocab_size))
4040

41-
final_ids, final_probs = beam_search.beam_search(
41+
final_ids, final_probs, _ = beam_search.beam_search(
4242
symbols_to_logits, initial_ids, beam_size, decode_length, vocab_size,
4343
0.)
4444

@@ -114,7 +114,7 @@ def symbols_to_logits(ids):
114114
logits = tf.to_float(tf.log(probabilities[pos - 1, :]))
115115
return logits
116116

117-
final_ids, final_probs = beam_search.beam_search(
117+
final_ids, final_probs, _ = beam_search.beam_search(
118118
symbols_to_logits,
119119
initial_ids,
120120
beam_size,
@@ -145,7 +145,7 @@ def symbols_to_logits(ids):
145145
logits = tf.to_float(tf.log(probabilities[pos - 1, :]))
146146
return logits
147147

148-
final_ids, final_probs = beam_search.beam_search(
148+
final_ids, final_probs, _ = beam_search.beam_search(
149149
symbols_to_logits,
150150
initial_ids,
151151
beam_size,
@@ -214,7 +214,7 @@ def symbols_to_logits(ids):
214214
logits = tf.to_float(tf.log(probabilities[pos - 1, :]))
215215
return logits
216216

217-
final_ids, final_probs = beam_search.beam_search(
217+
final_ids, final_probs, _ = beam_search.beam_search(
218218
symbols_to_logits,
219219
initial_ids,
220220
beam_size,
@@ -254,7 +254,7 @@ def symbols_to_logits(ids):
254254
logits = tf.to_float(tf.log(probabilities[pos - 1, :]))
255255
return logits
256256

257-
final_ids, final_scores = beam_search.beam_search(
257+
final_ids, final_scores, _ = beam_search.beam_search(
258258
symbols_to_logits,
259259
initial_ids,
260260
beam_size,
@@ -297,7 +297,7 @@ def symbols_to_logits(ids):
297297
return logits
298298

299299
# Disable early stopping
300-
final_ids, final_scores = beam_search.beam_search(
300+
final_ids, final_scores, _ = beam_search.beam_search(
301301
symbols_to_logits,
302302
initial_ids,
303303
beam_size,
@@ -343,7 +343,7 @@ def symbols_to_logits(ids, _, states):
343343
states["state"] = tf.placeholder_with_default(
344344
states["state"], shape=(None, 1))
345345

346-
final_ids, _ = beam_search.beam_search(
346+
final_ids, _, _ = beam_search.beam_search(
347347
symbols_to_logits,
348348
initial_ids,
349349
beam_size,
@@ -360,6 +360,41 @@ def symbols_to_logits(ids, _, states):
360360
except tf.errors.InvalidArgumentError as e:
361361
raise AssertionError(e.message)
362362

363+
def testStatesAfterLoop(self):
364+
batch_size = 1
365+
beam_size = 1
366+
vocab_size = 2
367+
decode_length = 3
368+
369+
initial_ids = tf.constant([0] * batch_size) # GO
370+
probabilities = tf.constant([[[0.7, 0.3]], [[0.4, 0.6]], [[0.5, 0.5]]])
371+
372+
def symbols_to_logits(ids, _, states):
373+
pos = tf.shape(ids)[1] - 1
374+
logits = tf.to_float(tf.log(probabilities[pos, :]))
375+
states["state"] += 1
376+
return logits, states
377+
378+
states = {
379+
"state": tf.zeros((batch_size, 1)),
380+
}
381+
states["state"] = tf.placeholder_with_default(
382+
states["state"], shape=(None, 1))
383+
384+
_, _, final_states = beam_search.beam_search(
385+
symbols_to_logits,
386+
initial_ids,
387+
beam_size,
388+
decode_length,
389+
vocab_size,
390+
0.0,
391+
eos_id=1,
392+
states=states)
393+
394+
with self.test_session() as sess:
395+
final_states = sess.run(final_states)
396+
self.assertAllEqual([[1]], final_states["state"])
397+
363398
def testStateBeamTwo(self):
364399
batch_size = 1
365400
beam_size = 2
@@ -393,7 +428,7 @@ def symbols_to_logits(ids, _, states):
393428
states["state"] = tf.placeholder_with_default(
394429
states["state"], shape=(None, 1))
395430

396-
final_ids, _ = beam_search.beam_search(
431+
final_ids, _, _ = beam_search.beam_search(
397432
symbols_to_logits,
398433
initial_ids,
399434
beam_size,

tensor2tensor/utils/t2t_model.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -891,7 +891,7 @@ def symbols_to_logits_fn(ids, i=None):
891891
inputs = features["inputs"]
892892
decode_length = (common_layers.shape_list(inputs)[1] +
893893
features.get("decode_length", decode_length))
894-
ids, scores = beam_search.beam_search(
894+
ids, scores, _ = beam_search.beam_search(
895895
symbols_to_logits_fn,
896896
initial_ids,
897897
beam_size,

0 commit comments

Comments (0)