
Commit 67c26e4

afrozenator authored and copybara-github committed
Probs, while sampling actions from the policy, still aren't normalized sometimes. They are very close, but the numpy check fails, so normalize them.
(Maybe do all that in JAX?)

PiperOrigin-RevId: 246932988
1 parent ccbe132 commit 67c26e4

File tree

1 file changed: +8 −2 lines

tensor2tensor/envs/env_problem_utils.py

Lines changed: 8 additions & 2 deletions
@@ -106,20 +106,26 @@ def multinomial_sample(probs):
   log_probs = log_prob_actions[np.arange(B)[:, None],
                                index[:, None],
                                np.arange(A)]
-  assert (B, A) == log_probs.shape
+  assert (B, A) == log_probs.shape, \
+      "B=%d, A=%d, log_probs.shape=%s" % (B, A, log_probs.shape)
 
   # Convert to probs, since we need to do categorical sampling.
   probs = np.exp(log_probs)
 
   # Sometimes log_probs contains a 0, it shouldn't. This makes the
   # probabilities sum up to more than 1, since the addition happens
   # in float64, so just add and subtract 1.0 to zero those probabilites
-  # out. Real example encountered probs = [1e-8, 1.0, 1e-22]
+  # out.
   #
   # Also testing for this is brittle.
   probs += 1
   probs -= 1
 
+  # For some reason, sometimes, this isn't the case.
+  probs_sum = np.sum(probs, axis=1, keepdims=True)
+  if not all(probs_sum == 1.0):
+    probs = probs / probs_sum
+
   # Now pick actions from this probs array.
   actions = np.apply_along_axis(multinomial_sample, 1, probs)
 
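For context, a minimal standalone sketch of the sampling path this change patches, assuming per-action log-probs of shape (B, A); the values B, A and log_probs are mocked stand-ins here for what env_problem_utils.py computes earlier in the real function, and the numpy check the commit message mentions is np.random.multinomial rejecting probability vectors whose sum drifts above 1.0:

import numpy as np


def multinomial_sample(probs):
  # Draw one categorical sample from a 1-D probability vector; the argmax
  # of the one-hot count vector is the sampled action index.
  return np.argmax(np.random.multinomial(1, probs))


# Mocked stand-ins for the batch of per-action log-probabilities.
B, A = 2, 3
log_probs = np.log(np.array([[0.7, 0.2, 0.1],
                             [0.1, 0.1, 0.8]]))
assert (B, A) == log_probs.shape

# Convert to probs, since we need to do categorical sampling.
probs = np.exp(log_probs)

# exp() can leave each row summing to slightly more or less than 1.0 in
# float64, which np.random.multinomial rejects, so renormalize per row.
probs_sum = np.sum(probs, axis=1, keepdims=True)
if not np.all(probs_sum == 1.0):
  probs = probs / probs_sum

actions = np.apply_along_axis(multinomial_sample, 1, probs)
print(actions.shape)  # (2,): one sampled action index per batch row.

The per-row renormalization is the substance of the commit: even after the add-and-subtract-1.0 trick, the row sums can still fail numpy's exact check, so dividing by the row sum makes the multinomial draw safe.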