[bug-fix] Fix when group terminal steps are deleted, robust test (#5441)

Ervin T · web-flow · commit 6150aa9d8f85 · 2021-06-25T16:31:39.000-07:00
* Fix when terminal steps are deleted, robust test

* Update changelog

* Fix test comment
diff --git a/com.unity.ml-agents/CHANGELOG.md b/com.unity.ml-agents/CHANGELOG.md
@@ -16,6 +16,8 @@ and this project adheres to
 ### Bug Fixes
 #### com.unity.ml-agents / com.unity.ml-agents.extensions (C#)
 #### ml-agents / ml-agents-envs / gym-unity (Python)
+- Fixed a bug in multi-agent cooperative training where agents might not receive all of the states of
+terminated teammates. (#5441)
 
 ## [2.1.0-exp.1] - 2021-06-09
 ### Minor Changes
diff --git a/ml-agents/mlagents/trainers/agent_processor.py b/ml-agents/mlagents/trainers/agent_processor.py
@@ -122,8 +122,6 @@ def add_experiences(
             self._process_step(
                 terminal_step, worker_id, terminal_steps.agent_id_to_index[local_id]
             )
-            # Clear the last seen group obs when agents die.
-            self._clear_group_status_and_obs(global_id)
 
         # Iterate over all the decision steps, first gather all the group obs
         # and then create the trajectories. _add_to_group_status
@@ -135,6 +133,12 @@ def add_experiences(
             self._process_step(
                 ongoing_step, worker_id, decision_steps.agent_id_to_index[local_id]
             )
+        # Clear the last seen group obs when agents die, but only after all of the group
+        # statuses were added to the trajectory.
+        for terminal_step in terminal_steps.values():
+            local_id = terminal_step.agent_id
+            global_id = get_global_agent_id(worker_id, local_id)
+            self._clear_group_status_and_obs(global_id)
 
         for _gid in action_global_agent_ids:
             # If the ID doesn't have a last step result, the agent just reset,
diff --git a/ml-agents/mlagents/trainers/tests/mock_brain.py b/ml-agents/mlagents/trainers/tests/mock_brain.py
@@ -1,4 +1,4 @@
-from typing import List, Tuple
+from typing import List, Optional, Tuple
 import numpy as np
 
 from mlagents.trainers.buffer import AgentBuffer, AgentBufferKey
@@ -21,6 +21,7 @@ def create_mock_steps(
     action_spec: ActionSpec,
     done: bool = False,
     grouped: bool = False,
+    agent_ids: Optional[List[int]] = None,
 ) -> Tuple[DecisionSteps, TerminalSteps]:
     """
     Creates a mock Tuple[DecisionSteps, TerminalSteps] with observations.
@@ -43,7 +44,10 @@ def create_mock_steps(
 
     reward = np.array(num_agents * [1.0], dtype=np.float32)
     interrupted = np.array(num_agents * [False], dtype=np.bool)
-    agent_id = np.arange(num_agents, dtype=np.int32)
+    if agent_ids is not None:
+        agent_id = np.array(agent_ids, dtype=np.int32)
+    else:
+        agent_id = np.arange(num_agents, dtype=np.int32)
     _gid = 1 if grouped else 0
     group_id = np.array(num_agents * [_gid], dtype=np.int32)
     group_reward = np.array(num_agents * [0.0], dtype=np.float32)
diff --git a/ml-agents/mlagents/trainers/tests/test_agent_processor.py b/ml-agents/mlagents/trainers/tests/test_agent_processor.py
@@ -137,32 +137,54 @@ def test_group_statuses():
         )
 
     # Make terminal steps for some dead agents
-    mock_decision_steps_2, mock_terminal_steps_2 = mb.create_mock_steps(
+    _, mock_terminal_steps_2 = mb.create_mock_steps(
         num_agents=2,
         observation_specs=create_observation_specs_with_shapes([(8,)]),
         action_spec=ActionSpec.create_continuous(2),
         done=True,
         grouped=True,
+        agent_ids=[2, 3],
+    )
+    # Make decision steps continue for other agents
+    mock_decision_steps_2, _ = mb.create_mock_steps(
+        num_agents=2,
+        observation_specs=create_observation_specs_with_shapes([(8,)]),
+        action_spec=ActionSpec.create_continuous(2),
+        done=False,
+        grouped=True,
+        agent_ids=[0, 1],
     )
 
     processor.add_experiences(
         mock_decision_steps_2, mock_terminal_steps_2, 0, fake_action_info
     )
-    fake_action_info = _create_action_info(4, mock_decision_steps.agent_id)
+    # Continue to add for remaining live agents
+    fake_action_info = _create_action_info(4, mock_decision_steps_2.agent_id)
     for _ in range(3):
         processor.add_experiences(
-            mock_decision_steps, mock_terminal_steps, 0, fake_action_info
+            mock_decision_steps_2, mock_terminal_steps, 0, fake_action_info
         )
 
     # Assert that four trajectories have been added to the Trainer
     assert len(tqueue.put.call_args_list) == 4
-    # Last trajectory should be the longest
+
+    # Get the first trajectory, which should have been agent 2 (one of the killed agents)
     trajectory = tqueue.put.call_args_list[0][0][-1]
+    assert len(trajectory.steps) == 3
+    # Make sure trajectory has the right Groupmate Experiences.
+    # All three steps should contain all agents
+    for step in trajectory.steps:
+        assert len(step.group_status) == 3
+
+    # Last trajectory should be the longest. It should be that of agent 1, one of the surviving agents.
+    trajectory = tqueue.put.call_args_list[-1][0][-1]
+    assert len(trajectory.steps) == 5
 
-    # Make sure trajectory has the right Groupmate Experiences
+    # Make sure trajectory has the right Groupmate Experiences.
+    # The first 3 steps should contain all of the obs (the 3rd step is also the terminal step of 2 of the agents).
     for step in trajectory.steps[0:3]:
         assert len(step.group_status) == 3
-    # After 2 agents has died
+    # After 2 agents have died, there should only be 1 group status.
     for step in trajectory.steps[3:]:
         assert len(step.group_status) == 1