
Commit 151dc27

"Adding mixture transformer"

T2T Team authored and copybara-github committed

PiperOrigin-RevId: 240229309

1 parent 150aad3 · commit 151dc27

15 files changed: +267 −894 lines

tensor2tensor/layers/common_layers.py

Lines changed: 92 additions & 0 deletions
@@ -1790,6 +1790,98 @@ def padded_cross_entropy(logits,
   return tf.reduce_sum(xent * weights), tf.reduce_sum(weights)
 
 
+def padded_cross_entropy_mixture(logits,
+                                 labels,
+                                 label_smoothing,
+                                 num_mixtures,
+                                 weights_fn=weights_nonzero,
+                                 reduce_sum=False,
+                                 cutoff=0.0,
+                                 gaussian=False,
+                                 return_best_logits=False):
+  """Compute cross-entropy assuming 0s are padding.
+
+  Computes a loss numerator (the sum of losses) and a loss denominator
+  (the number of non-padding tokens).
+
+  Computes cross-entropy for each mixture and returns the corresponding
+  values for the mixture with the highest probability.
+
+  Args:
+    logits: `Tensor` with shape `[batch * num_mixtures, timesteps, vocab_size]`,
+      optionally a FactoredTensor.
+    labels: an integer `Tensor` with shape `[batch, timesteps]`.
+    label_smoothing: a floating point `Scalar`.
+    num_mixtures: an integer.
+    weights_fn: A function from labels to weights.
+    reduce_sum: a Boolean, whether to sum at the end or not.
+    cutoff: a float, at which point to have no loss.
+    gaussian: If true, use a Gaussian distribution for label smoothing.
+    return_best_logits: If true, return the logits of the mixture with the
+      highest probability for each example.
+
+  Returns:
+    loss_numerator: a `Scalar`. Sum of losses.
+    loss_denominator: a `Scalar`. The number of non-padding target tokens.
+
+  Raises:
+    ValueError: in case of unsupported argument types.
+  """
+  logit_shapes = shape_list(
+      logits)  # batch_size * num_mixtures, timesteps, 1, 1, vocab_size
+  batch_size = tf.cast(logit_shapes[0] / num_mixtures, dtype=tf.int32)
+  timesteps = logit_shapes[1]
+  vocab_size = logit_shapes[4]
+
+  new_shape_for_xent = [num_mixtures] + shape_list(labels)
+  labels = tf.tile(labels, [num_mixtures, 1, 1, 1])
+
+  xent, weights = padded_cross_entropy(
+      logits, labels, label_smoothing, weights_fn, reduce_sum, cutoff, gaussian)
+
+  # Reshape xent and weights to have num_mixtures as the first dimension.
+  xent = tf.reshape(xent, new_shape_for_xent)
+  weights = tf.reshape(weights, new_shape_for_xent[:-1])
+
+  # Sum up each sentence's negative log-probabilities.
+  xent = tf.reduce_sum(xent, axis=2)
+
+  # If requested, gather the logits of the best mixture for each example.
+  if return_best_logits:
+    best_mixture_indices = tf.cast(tf.argmin(xent, 0), dtype=tf.int32)
+    individual_element_indices = tf.range(batch_size)
+    stacked_mixture_element_indices = tf.stack(
+        (tf.squeeze(best_mixture_indices), individual_element_indices), -1)
+    best_logits = tf.reshape(logits,
+                             [num_mixtures, -1, timesteps, 1, 1, vocab_size])
+    best_logits = tf.gather_nd(best_logits, stacked_mixture_element_indices)
+    best_logits = tf.reshape(best_logits,
+                             [batch_size, timesteps, 1, 1, vocab_size])
+
+  with tf.control_dependencies([
+      tf.assert_equal(
+          tf.shape(xent)[:3], [num_mixtures, batch_size, 1],
+          message="Each batch element should have a probability value for "
+          "each mixture element")
+  ]):
+    xent = tf.reduce_min(xent, axis=0)
+    weights = tf.reduce_mean(weights, axis=0)
+
+  with tf.control_dependencies([
+      tf.assert_equal(
+          tf.shape(xent)[0], [batch_size],
+          message="There should be batch_size elements after selecting the "
+          "best mixture probabilities")
+  ]):
+    summed_xent = tf.reduce_sum(xent)
+    summed_weights = tf.reduce_sum(weights)
+
+  if return_best_logits:
+    return summed_xent, summed_weights, best_logits
+  else:
+    return summed_xent, summed_weights
+
+
 def _weights_one_third(labels):
   """Returns Tensor of shape [batch, height, width]. Each element is 1/3."""
   return tf.ones(tf.shape(labels)[:-1]) / 3.
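
For orientation, a minimal usage sketch of the new loss follows. It is not part of the commit; the shapes, random inputs, and TF1-style session are illustrative assumptions. Note that the labels must already carry the 4-D `[batch, timesteps, 1, 1]` layout that `padded_cross_entropy` works with, even though the docstring quotes the logical `[batch, timesteps]` shape.

# Illustrative sketch only (assumes tensor2tensor is installed and a
# TF1-style graph/session environment, matching the repo's era).
import tensorflow as tf
from tensor2tensor.layers import common_layers

batch_size, timesteps, vocab_size, num_mixtures = 2, 5, 11, 4

# One set of logits per mixture, stacked along the batch dimension:
# shape [batch * num_mixtures, timesteps, 1, 1, vocab_size].
logits = tf.random_normal(
    [batch_size * num_mixtures, timesteps, 1, 1, vocab_size])
# Integer labels in the 4-D layout used by padded_cross_entropy; zeros
# count as padding, and the function tiles the labels across mixtures.
labels = tf.random_uniform(
    [batch_size, timesteps, 1, 1], maxval=vocab_size, dtype=tf.int32)

loss_num, loss_den = common_layers.padded_cross_entropy_mixture(
    logits, labels, label_smoothing=0.1, num_mixtures=num_mixtures)
loss = loss_num / loss_den  # per-token loss of the best mixture

with tf.Session() as sess:
  print(sess.run(loss))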

tensor2tensor/layers/common_video.py

Lines changed: 0 additions & 2 deletions
@@ -790,8 +790,6 @@ def finish(self):
     (out, err) = [
         b"".join(chunks) for chunks in (self._out_chunks, self._err_chunks)
     ]
-    self.proc.stdout.close()
-    self.proc.stderr.close()
     if self.proc.returncode:
       err = "\n".join([" ".join(self.cmd), err.decode("utf8")])
       raise IOError(err)
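
For context, the deleted pair of calls closed the ffmpeg pipes by hand at the end of finish(). A simplified, self-contained sketch of the surrounding pattern (hypothetical standalone variables; the real class keeps the chunk lists and reader threads on self) shows the sequence: the reader threads drain both pipes to EOF and proc.wait() reaps the process before the return code is checked.

# Simplified sketch of the finish() pattern; names here are hypothetical.
import subprocess
import threading

# Stand-in command; the real code runs an ffmpeg encoding pipeline.
proc = subprocess.Popen(["ffmpeg", "-version"],
                        stdout=subprocess.PIPE, stderr=subprocess.PIPE)

out_chunks, err_chunks = [], []

def drain(pipe, chunks):
  # Read the pipe to EOF so the child never blocks on a full buffer.
  for chunk in iter(lambda: pipe.read(4096), b""):
    chunks.append(chunk)

threads = [
    threading.Thread(target=drain, args=(proc.stdout, out_chunks)),
    threading.Thread(target=drain, args=(proc.stderr, err_chunks)),
]
for t in threads:
  t.start()
proc.wait()
for t in threads:
  t.join()

out, err = [b"".join(chunks) for chunks in (out_chunks, err_chunks)]
if proc.returncode:
  raise IOError(err.decode("utf8"))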

tensor2tensor/models/research/rl.py

Lines changed: 9 additions & 43 deletions
@@ -354,18 +354,12 @@ def dqn_atari_base():
       optimizer_epsilon=0.00001,
       optimizer_centered=True,
 
-      # TODO: change names maybe replay_buffer -> agent? Also batch_size is now
-      # buffer_batch_size in _DQNAgent.
       replay_buffer_replay_capacity=1000000,
-      replay_buffer_buffer_batch_size=32,
+      replay_buffer_batch_size=32,
 
       time_limit=27000,
       save_every_steps=50000,
       num_frames=int(20 * 1e6),
-
-      # TODO(konradczechowski) this is not used in trainer_model_free, clean
-      # this up after evaluation refactor
-      eval_episodes_num=3,
   )
 
 
@@ -376,16 +370,6 @@ def dqn_original_params():
   hparams.set_hparam("num_frames", int(1e6))
   return hparams
 
-def rlmf_tiny_overrides():
-  """Parameters to override for tiny setting excluding agent-related hparams."""
-  return dict(
-      max_num_noops=1,
-      eval_max_num_noops=1,
-      rl_env_max_episode_steps=7,
-      eval_rl_env_max_episode_steps=7,
-      eval_sampling_temps=[0.0, 1.0],
-  )
-
 
 @registry.register_hparams
 def rlmf_original():
@@ -398,7 +382,6 @@ def rlmf_original():
       eval_batch_size=2,
       frame_stack_size=4,
       eval_sampling_temps=[0.0, 0.2, 0.5, 0.8, 1.0, 2.0],
-      max_num_noops=8,
       eval_max_num_noops=8,
       eval_rl_env_max_episode_steps=1000,
       resize_height_factor=2,
@@ -443,31 +426,6 @@ def rlmf_base():
   return hparams
 
 
-@registry.register_hparams
-def rlmf_tiny():
-  """Tiny set of hparams for model-free PPO."""
-  hparams = rlmf_original()
-  hparams = hparams.override_from_dict(rlmf_tiny_overrides())
-  hparams.batch_size = 2
-  hparams.add_hparam("ppo_epochs_num", 3)
-  hparams.add_hparam("ppo_epoch_length", 2)
-  return hparams
-
-
-@registry.register_hparams
-def rlmf_dqn_tiny():
-  hparams = rlmf_original()
-  hparams = hparams.override_from_dict(rlmf_tiny_overrides())
-  hparams.batch_size = 1
-  hparams.base_algo = "dqn"
-  hparams.base_algo_params = "dqn_original_params"
-  hparams.add_hparam("dqn_num_frames", 128)
-  hparams.add_hparam("dqn_save_every_steps", 128)
-  hparams.add_hparam("dqn_replay_buffer_replay_capacity", 100)
-  hparams.add_hparam("dqn_agent_min_replay_history", 10)
-  return hparams
-
-
 @registry.register_hparams
 def rlmf_eval():
   """Eval set of hparams for model-free PPO."""
@@ -484,6 +442,14 @@ def rlmf_eval():
   return hparams
 
 
+@registry.register_hparams
+def rlmf_tiny():
+  hparams = rlmf_base()
+  hparams.ppo_epochs_num = 100
+  hparams.ppo_eval_every_epochs = 10
+  return hparams
+
+
 class PolicyBase(t2t_model.T2TModel):
 
   def loss(self, *args, **kwargs):
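
One note on the dqn_atari_base() change: replay_buffer_buffer_batch_size becomes replay_buffer_batch_size, which matters if the replay_buffer_ prefix is stripped before the values are forwarded to the replay buffer, so the kwarg would arrive as batch_size rather than buffer_batch_size. A toy sketch of that convention follows; the helper name is hypothetical and the actual forwarding code in tensor2tensor may differ.

# Hypothetical helper illustrating the prefix-stripping convention the
# hparam names suggest; not the actual tensor2tensor forwarding code.
def extract_prefixed_kwargs(hparams_dict, prefix):
  """Returns {name_without_prefix: value} for keys starting with prefix."""
  return {
      key[len(prefix):]: value
      for key, value in hparams_dict.items()
      if key.startswith(prefix)
  }

hparams = {
    "replay_buffer_replay_capacity": 1000000,
    "replay_buffer_batch_size": 32,
    "time_limit": 27000,
}
print(extract_prefixed_kwargs(hparams, "replay_buffer_"))
# {'replay_capacity': 1000000, 'batch_size': 32}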

tensor2tensor/rl/batch_dqn_agent_test.py

Lines changed: 0 additions & 157 deletions
This file was deleted.
