Hotfix memory leak on Python #3664

Merged · 5 commits · Mar 24, 2020
3 changes: 2 additions & 1 deletion com.unity.ml-agents/Runtime/Agent.cs
```diff
@@ -315,6 +315,7 @@ protected virtual void OnDisable()

         void NotifyAgentDone(DoneReason doneReason)
         {
+            m_Info.episodeId = m_EpisodeId;
             m_Info.reward = m_Reward;
             m_Info.done = true;
             m_Info.maxStepReached = doneReason == DoneReason.MaxStepReached;
@@ -376,7 +377,7 @@ public void SetModel(
                 // If everything is the same, don't make any changes.
                 return;
             }
-
+            NotifyAgentDone(DoneReason.Disabled);
             m_PolicyFactory.model = model;
             m_PolicyFactory.inferenceDevice = inferenceDevice;
             m_PolicyFactory.behaviorName = behaviorName;
```
17 changes: 12 additions & 5 deletions com.unity.ml-agents/Runtime/Communicator/RpcCommunicator.cs
```diff
@@ -458,13 +458,20 @@ UnityRLInitializationOutputProto GetTempUnityRlInitializationOutput()
         {
             if (m_CurrentUnityRlOutput.AgentInfos.ContainsKey(behaviorName))
             {
-                if (output == null)
+                if (m_CurrentUnityRlOutput.AgentInfos[behaviorName].CalculateSize() > 0)
                 {
-                    output = new UnityRLInitializationOutputProto();
-                }
+                    // Only send the BrainParameters if there is a non-empty list of
+                    // AgentInfos ready to be sent.
+                    // This is to ensure that the Python side will always have a first
+                    // observation when receiving the BrainParameters.
+                    if (output == null)
+                    {
+                        output = new UnityRLInitializationOutputProto();
+                    }

-                var brainParameters = m_UnsentBrainKeys[behaviorName];
-                output.BrainParameters.Add(brainParameters.ToProto(behaviorName, true));
+                    var brainParameters = m_UnsentBrainKeys[behaviorName];
+                    output.BrainParameters.Add(brainParameters.ToProto(behaviorName, true));
+                }
             }
         }
```

Review thread on the new `CalculateSize() > 0` check:

Contributor: Can you explain why this is necessary? How could the AgentInfos for a behavior name be in the dictionary but the list be empty?

Contributor (Author): I am still investigating that part.

Contributor: (discussed on Slack) This change sounds OK, but can you add some comments to clarify what's going on (my bad for not documenting better in the first place)? Something like "Check the BrainParameters that we haven't sent yet, and add them to the InitializationOutput if we have observations for that behavior this step."
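For readers following the Python side of the protocol, here is a minimal sketch of the invariant the guard above enforces; the function and parameter names are hypothetical, not the actual RpcCommunicator API. A behavior's BrainParameters are only advertised once at least one AgentInfo for that behavior is queued, so the trainer never receives parameters without a first observation.

```python
from typing import Dict, List, Optional

# Hypothetical sketch, not an ml-agents API: advertise BrainParameters for a
# behavior only when at least one AgentInfo for it is queued this step.
def collect_unsent_brain_parameters(
    unsent_brain_keys: Dict[str, str],
    queued_agent_infos: Dict[str, List[object]],
) -> Optional[Dict[str, str]]:
    output: Optional[Dict[str, str]] = None
    for behavior_name, brain_parameters in unsent_brain_keys.items():
        # Analogous to AgentInfos[behaviorName].CalculateSize() > 0 above.
        if queued_agent_infos.get(behavior_name):
            if output is None:
                output = {}
            output[behavior_name] = brain_parameters
    return output

# No AgentInfos queued for "walker" yet, so nothing is advertised this step.
assert collect_unsent_brain_parameters({"walker": "params"}, {"walker": []}) is None
```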
5 changes: 4 additions & 1 deletion com.unity.ml-agents/Runtime/Policies/HeuristicPolicy.cs
```diff
@@ -29,7 +29,10 @@ public HeuristicPolicy(Func<float[]> heuristic)
         public void RequestDecision(AgentInfo info, List<ISensor> sensors)
         {
             StepSensors(sensors);
-            m_LastDecision = m_Heuristic.Invoke();
+            if (!info.done)
+            {
+                m_LastDecision = m_Heuristic.Invoke();
+            }
         }

         /// <inheritdoc />
```

Review thread on the new `!info.done` check:

Contributor: We clear the done flag right before calling RequestDecision, so I'm not sure this will have any effect:

```csharp
m_Info.done = false;
m_Info.maxStepReached = false;
m_Info.episodeId = m_EpisodeId;
m_Brain.RequestDecision(m_Info, sensors);
```

Contributor (Author): Yes, I want to avoid it being called when using NotifyAgentDone(). It does have an effect in that regard.
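A minimal sketch of the control flow being discussed, in plain Python with hypothetical names: on a normal step the Agent clears done right before requesting a decision, so the heuristic runs; NotifyAgentDone requests a decision with done set, and the new guard skips the possibly stateful user heuristic for that terminal request.

```python
# Hypothetical sketch of the two call paths into RequestDecision.
class HeuristicPolicySketch:
    def __init__(self, heuristic):
        self.heuristic = heuristic
        self.last_decision = None

    def request_decision(self, info_done: bool) -> None:
        # Mirrors the C# guard: skip the user heuristic for the terminal
        # request issued by NotifyAgentDone, where info.done is true.
        if not info_done:
            self.last_decision = self.heuristic()

policy = HeuristicPolicySketch(lambda: [0.0])
policy.request_decision(info_done=False)  # normal step: heuristic runs
policy.request_decision(info_done=True)   # terminal notification: skipped
```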
31 changes: 20 additions & 11 deletions ml-agents/mlagents/trainers/agent_processor.py
```diff
@@ -1,5 +1,5 @@
 import sys
-from typing import List, Dict, Deque, TypeVar, Generic, Tuple, Set
+from typing import List, Dict, Deque, TypeVar, Generic, Tuple, Any
 from collections import defaultdict, Counter, deque

 from mlagents_envs.base_env import BatchedStepResult, StepResult
@@ -66,7 +66,6 @@ def add_experiences(
         for _entropy in take_action_outputs["entropy"]:
             self.stats_reporter.add_stat("Policy/Entropy", _entropy)

-        terminated_agents: Set[str] = set()
         # Make unique agent_ids that are global across workers
         action_global_agent_ids = [
             get_global_agent_id(worker_id, ag_id) for ag_id in previous_action.agent_ids
@@ -85,6 +84,7 @@
             stored_take_action_outputs = self.last_take_action_outputs.get(
                 global_id, None
             )
+
             if stored_agent_step is not None and stored_take_action_outputs is not None:
                 # We know the step is from the same worker, so use the local agent id.
                 obs = stored_agent_step.obs
@@ -143,11 +143,12 @@ def add_experiences(
                     traj_queue.put(trajectory)
                 self.experience_buffers[global_id] = []
                 if curr_agent_step.done:
+                    # Record episode length for agents which have had at least
+                    # 1 step. Done after reset is ignored.
                     self.stats_reporter.add_stat(
                         "Environment/Episode Length",
                         self.episode_steps.get(global_id, 0),
                     )
-                    terminated_agents.add(global_id)
                 elif not curr_agent_step.done:
                     self.episode_steps[global_id] += 1
@@ -156,9 +157,9 @@
                     curr_agent_step,
                     batched_step_result.agent_id_to_index[_id],
                 )
-
-        for terminated_id in terminated_agents:
-            self._clean_agent_data(terminated_id)
+            # Delete all done agents, regardless of whether they had a 0-length episode.
+            if curr_agent_step.done:
+                self._clean_agent_data(global_id)

         for _gid in action_global_agent_ids:
             # If the ID doesn't have a last step result, the agent just reset,
```
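Restating the leak these two hunks fix as a runnable toy. Assumptions, flagged here and in the comments: the per-agent containers behave like defaultdicts, as the bare `+=` on `self.episode_steps[global_id]` above suggests, and the id string is illustrative.

```python
from collections import defaultdict

experience_buffers = defaultdict(list)  # per-agent state, keyed by global id
episode_steps = defaultdict(int)

def process_step(global_id: str, done: bool) -> None:
    experience_buffers[global_id]  # merely reading a defaultdict inserts the key
    if not done:
        episode_steps[global_id] += 1
    if done:
        # The fix: clean up every done agent unconditionally. Before, an agent
        # that was done on its first observed step never reached the old
        # terminated_agents set, so its entries outlived the episode and leaked.
        experience_buffers.pop(global_id, None)
        episode_steps.pop(global_id, None)

process_step("worker0-agent7", done=True)  # a 0-length episode
assert not experience_buffers and not episode_steps  # nothing left behind
```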
```diff
@@ -173,14 +174,22 @@ def _clean_agent_data(self, global_id: str) -> None:
         """
         Removes the data for an Agent.
         """
-        del self.experience_buffers[global_id]
-        del self.last_take_action_outputs[global_id]
-        del self.last_step_result[global_id]
-        del self.episode_steps[global_id]
-        del self.episode_rewards[global_id]
+        self._safe_delete(self.experience_buffers, global_id)
+        self._safe_delete(self.last_take_action_outputs, global_id)
+        self._safe_delete(self.last_step_result, global_id)
+        self._safe_delete(self.episode_steps, global_id)
+        self._safe_delete(self.episode_rewards, global_id)
         self.policy.remove_previous_action([global_id])
         self.policy.remove_memories([global_id])

+    def _safe_delete(self, my_dictionary: Dict[Any, Any], key: Any) -> None:
+        """
+        Safely removes a key from a dictionary. If the key is not found,
+        does nothing.
+        """
+        if key in my_dictionary:
+            del my_dictionary[key]
+
     def publish_trajectory_queue(
         self, trajectory_queue: "AgentManagerQueue[Trajectory]"
     ) -> None:
```
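A side note on `_safe_delete`: it is equivalent to the standard `dict.pop` idiom with a default, shown below; the explicit helper just keeps the five call sites above reading like the `del` statements they replace.

```python
episode_steps = {"worker0-agent7": 12}  # illustrative key, as above

# Equivalent to _safe_delete: pop with a default never raises KeyError.
episode_steps.pop("worker0-agent7", None)  # removes the entry
episode_steps.pop("worker0-agent7", None)  # second call is a harmless no-op
```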
19 changes: 5 additions & 14 deletions ml-agents/mlagents/trainers/policy/tf_policy.py
```diff
@@ -174,17 +174,6 @@ def get_action(
         if batched_step_result.n_agents() == 0:
             return ActionInfo.empty()

-        agents_done = [
-            agent
-            for agent, done in zip(
-                batched_step_result.agent_id, batched_step_result.done
-            )
-            if done
-        ]
-
-        self.remove_memories(agents_done)
-        self.remove_previous_action(agents_done)
-
         global_agent_ids = [
             get_global_agent_id(worker_id, int(agent_id))
             for agent_id in batched_step_result.agent_id
```
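One detail visible in the hunk above: `agents_done` was built from the raw `batched_step_result.agent_id` values, while the surviving lines directly below convert those to global ids, and the new cleanup path (`_clean_agent_data` in agent_processor.py) removes memories and previous actions under global ids. If memories are stored under global ids, the removed calls could never have found their targets. A toy illustration of deleting under the wrong key; the id format is hypothetical:

```python
def global_agent_id(worker_id: int, agent_id: int) -> str:
    return f"agent-{worker_id}-{agent_id}"  # illustrative, not the real format

memories = {global_agent_id(0, 3): "recurrent state"}

memories.pop("3", None)                    # local id: wrong key, silent no-op
memories.pop(global_agent_id(0, 3), None)  # global id: entry actually removed
assert not memories
```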
```diff
@@ -379,9 +368,11 @@ def _initialize_tensorflow_references(self):

     def create_input_placeholders(self):
         with self.graph.as_default():
-            self.global_step, self.increment_step_op, self.steps_to_increment = (
-                ModelUtils.create_global_steps()
-            )
+            (
+                self.global_step,
+                self.increment_step_op,
+                self.steps_to_increment,
+            ) = ModelUtils.create_global_steps()
             self.visual_in = ModelUtils.create_visual_input_placeholders(
                 self.brain.camera_resolutions
             )
```
9 changes: 9 additions & 0 deletions ml-agents/mlagents/trainers/tests/test_agent_processor.py
```diff
@@ -152,6 +152,15 @@ def test_agent_deletion():
     assert len(processor.last_take_action_outputs.keys()) == 0
     assert len(processor.episode_steps.keys()) == 0
     assert len(processor.episode_rewards.keys()) == 0
+    assert len(processor.last_step_result.keys()) == 0
+
+    # check that steps with immediate dones don't add to dicts
+    processor.add_experiences(mock_done_step, 0, ActionInfo.empty())
+    assert len(processor.experience_buffers.keys()) == 0
+    assert len(processor.last_take_action_outputs.keys()) == 0
+    assert len(processor.episode_steps.keys()) == 0
+    assert len(processor.episode_rewards.keys()) == 0
+    assert len(processor.last_step_result.keys()) == 0


 def test_end_episode():
```
Expand Down