Skip to content

Commit 200ab7b

Browse files
author
Ervin T
authored
[bug-fix] Fix entropy computation in MultiCategoricalDistribution (#3607)
1 parent cad085f commit 200ab7b

File tree

2 files changed

+11
-13
lines changed

2 files changed

+11
-13
lines changed

ml-agents/mlagents/trainers/distributions.py

Lines changed: 7 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -189,13 +189,13 @@ def __init__(self, logits: tf.Tensor, act_size: List[int], action_masks: tf.Tens
189189
and 1 for unmasked.
190190
"""
191191
unmasked_log_probs = self._create_policy_branches(logits, act_size)
192-
self._sampled_policy, self._all_probs, action_index = self._get_masked_actions_probs(
193-
unmasked_log_probs, act_size, action_masks
194-
)
192+
(
193+
self._sampled_policy,
194+
self._all_probs,
195+
action_index,
196+
) = self._get_masked_actions_probs(unmasked_log_probs, act_size, action_masks)
195197
self._sampled_onehot = self._action_onehot(self._sampled_policy, act_size)
196-
self._entropy = self._create_entropy(
197-
self._sampled_onehot, self._all_probs, action_index, act_size
198-
)
198+
self._entropy = self._create_entropy(self._all_probs, action_index, act_size)
199199
self._total_prob = self._get_log_probs(
200200
self._sampled_onehot, self._all_probs, action_index, act_size
201201
)
@@ -263,11 +263,7 @@ def _get_log_probs(
263263
return log_probs
264264

265265
def _create_entropy(
266-
self,
267-
all_log_probs: tf.Tensor,
268-
sample_onehot: tf.Tensor,
269-
action_idx: List[int],
270-
act_size: List[int],
266+
self, all_log_probs: tf.Tensor, action_idx: List[int], act_size: List[int]
271267
) -> tf.Tensor:
272268
entropy = tf.reduce_sum(
273269
(

ml-agents/mlagents/trainers/tests/test_distributions.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -113,8 +113,8 @@ def test_multicategorical_distribution():
113113
sess.run(init)
114114
output = sess.run(distribution.sample)
115115
for _ in range(10):
116-
sample, log_probs = sess.run(
117-
[distribution.sample, distribution.log_probs]
116+
sample, log_probs, entropy = sess.run(
117+
[distribution.sample, distribution.log_probs, distribution.entropy]
118118
)
119119
assert len(log_probs[0]) == sum(DISCRETE_ACTION_SPACE)
120120
# Assert action never exceeds [-1,1]
@@ -123,6 +123,8 @@ def test_multicategorical_distribution():
123123
assert act >= 0 and act <= DISCRETE_ACTION_SPACE[i]
124124
output = sess.run([distribution.total_log_probs])
125125
assert output[0].shape[0] == 1
126+
# Make sure entropy is correct
127+
assert entropy[0] > 3.8
126128

127129
# Test masks
128130
mask = []

0 commit comments

Comments
 (0)