
Multiagent simplerl #5066


Merged 15 commits on Mar 10, 2021
ml-agents/mlagents/trainers/tests/check_env_trains.py (5 additions, 1 deletion)

@@ -27,7 +27,11 @@ def write_stats(
         self, category: str, values: Dict[str, StatsSummary], step: int
     ) -> None:
         for val, stats_summary in values.items():
-            if val == "Environment/Cumulative Reward":
+            if (
+                val == "Environment/Cumulative Reward"
+                or val == "Environment/Group Cumulative Reward"
+            ):
                 print(step, val, stats_summary.aggregated_value)
                 self._last_reward_summary[category] = stats_summary.aggregated_value

ml-agents/mlagents/trainers/tests/dummy_config.py (18 additions)

@@ -50,6 +50,20 @@
    threaded=False,
)

_COMA_CONFIG = TrainerSettings(
    trainer_type=TrainerType.COMA,
    hyperparameters=PPOSettings(
        learning_rate=5.0e-3,
        learning_rate_schedule=ScheduleType.CONSTANT,
        batch_size=16,
        buffer_size=64,
    ),
    network_settings=NetworkSettings(num_layers=1, hidden_units=32),
    summary_freq=500,
    max_steps=3000,
    threaded=False,
)


def ppo_dummy_config():
    return copy.deepcopy(_PPO_CONFIG)

@@ -59,6 +73,10 @@ def sac_dummy_config():
    return copy.deepcopy(_SAC_CONFIG)


def coma_dummy_config():
    return copy.deepcopy(_COMA_CONFIG)


@pytest.fixture
def gail_dummy_config():
    return {RewardSignalType.GAIL: GAILSettings(demo_path=CONTINUOUS_DEMO_PATH)}
ml-agents/mlagents/trainers/tests/simple_test_envs.py (206 additions)

@@ -314,6 +314,212 @@ def _make_batched_step(
        return (decision_step, terminal_step)


Contributor: I understand nothing this class does. Please add comments.

Contributor (Author): Yeah, it is a pretty horrible thing.

class MultiAgentEnvironment(BaseEnv):
"""
The MultiAgentEnvironment maintains a list of SimpleEnvironment, one for each agent.
When sending DecisionSteps and TerminalSteps to the trainers, it first batches the
decision steps from the individual environments. When setting actions, it indexes the
batched ActionTuple to obtain the ActionTuple for individual agents
"""

    def __init__(
        self,
        brain_names,
        step_size=STEP_SIZE,
        num_visual=0,
        num_vector=1,
        num_var_len=0,
        vis_obs_size=VIS_OBS_SIZE,
        vec_obs_size=OBS_SIZE,
        var_len_obs_size=VAR_LEN_SIZE,
        action_sizes=(1, 0),
        num_agents=2,
    ):
        super().__init__()
        self.envs = {}
        self.dones = {}
        self.just_died = set()
        self.names = brain_names
        self.final_rewards: Dict[str, List[float]] = {}
        for name in brain_names:
            self.final_rewards[name] = []
            for i in range(num_agents):
                name_and_num = name + str(i)
                self.envs[name_and_num] = SimpleEnvironment(
                    [name],
                    step_size,
                    num_visual,
                    num_vector,
                    num_var_len,
                    vis_obs_size,
                    vec_obs_size,
                    var_len_obs_size,
                    action_sizes,
                )
                self.dones[name_and_num] = False
                self.envs[name_and_num].reset()
        # All envs have the same behavior spec, so just get the last one.
        self.behavior_spec = self.envs[name_and_num].behavior_spec
        self.action_spec = self.envs[name_and_num].action_spec
        self.num_agents = num_agents

    @property
    def all_done(self):
        return all(self.dones.values())

    @property
    def behavior_specs(self):
        behavior_dict = {}
        for n in self.names:
            behavior_dict[n] = self.behavior_spec
        return BehaviorMapping(behavior_dict)

    def set_action_for_agent(self, behavior_name, agent_id, action):
        pass

    def set_actions(self, behavior_name, action):
        # The ActionTuple contains the actions for all n_agents. This
        # slices the ActionTuple into an action tuple for each environment
        # and sets it. The index j is used to ignore agents that have already
        # reached done.
        j = 0
        for i in range(self.num_agents):
            _act = ActionTuple()
            name_and_num = behavior_name + str(i)
            env = self.envs[name_and_num]
            if not self.dones[name_and_num]:
                if self.action_spec.continuous_size > 0:
                    _act.add_continuous(action.continuous[j : j + 1])
                if self.action_spec.discrete_size > 0:
                    _disc_list = [action.discrete[j, :]]
                    _act.add_discrete(np.array(_disc_list))
                j += 1
                env.action[behavior_name] = _act

    def get_steps(self, behavior_name):
        # This gets the individual DecisionSteps and TerminalSteps
        # from the envs and merges them into a batch to be sent
        # to the AgentProcessor.
        dec_vec_obs = []
        dec_reward = []
        dec_group_reward = []
        dec_agent_id = []
        dec_group_id = []
        ter_vec_obs = []
        ter_reward = []
        ter_group_reward = []
        ter_agent_id = []
        ter_group_id = []
        interrupted = []

        action_mask = None
        terminal_step = TerminalSteps.empty(self.behavior_spec)
        decision_step = None
        for i in range(self.num_agents):
            name_and_num = behavior_name + str(i)
            env = self.envs[name_and_num]
            _dec, _term = env.step_result[behavior_name]
            if not self.dones[name_and_num]:
                dec_agent_id.append(i)
                dec_group_id.append(1)
                if len(dec_vec_obs) > 0:
                    for j, obs in enumerate(_dec.obs):
                        dec_vec_obs[j] = np.concatenate((dec_vec_obs[j], obs), axis=0)
                else:
                    for obs in _dec.obs:
                        dec_vec_obs.append(obs)
                dec_reward.append(_dec.reward[0])
                dec_group_reward.append(_dec.group_reward[0])
                if _dec.action_mask is not None:
                    if action_mask is None:
                        action_mask = []
                    if len(action_mask) > 0:
                        action_mask[0] = np.concatenate(
                            (action_mask[0], _dec.action_mask[0]), axis=0
                        )
                    else:
                        action_mask.append(_dec.action_mask[0])
            if len(_term.reward) > 0 and name_and_num in self.just_died:
                ter_agent_id.append(i)
                ter_group_id.append(1)
                if len(ter_vec_obs) > 0:
                    for j, obs in enumerate(_term.obs):
                        ter_vec_obs[j] = np.concatenate((ter_vec_obs[j], obs), axis=0)
                else:
                    for obs in _term.obs:
                        ter_vec_obs.append(obs)
                ter_reward.append(_term.reward[0])
                ter_group_reward.append(_term.group_reward[0])
                interrupted.append(False)
                self.just_died.remove(name_and_num)
        decision_step = DecisionSteps(
            dec_vec_obs,
            dec_reward,
            dec_agent_id,
            action_mask,
            dec_group_id,
            dec_group_reward,
        )
        terminal_step = TerminalSteps(
            ter_vec_obs,
            ter_reward,
            interrupted,
            ter_agent_id,
            ter_group_id,
            ter_group_reward,
        )
        return (decision_step, terminal_step)

    def step(self) -> None:
        # Steps all environments and calls reset if all agents are done.
        for name in self.names:
            for i in range(self.num_agents):
                name_and_num = name + str(i)
                # Does not step the env if done
                if not self.dones[name_and_num]:
                    env = self.envs[name_and_num]
                    # Reproducing part of env step to intercept Dones
                    assert all(action is not None for action in env.action.values())
                    done = env._take_action(name)
                    reward = env._compute_reward(name, done)
                    self.dones[name_and_num] = done
                    if done:
                        self.just_died.add(name_and_num)
                    if self.all_done:
                        env.step_result[name] = env._make_batched_step(
                            name, done, 0.0, reward
                        )
                        self.final_rewards[name].append(reward)
                        self.reset()
                    elif done:
                        # This agent has finished but others are still running.
                        # This gives a reward of the time penalty if this agent
                        # is successful and the negative env reward if it fails.
                        ceil_reward = min(-TIME_PENALTY, reward)
                        env.step_result[name] = env._make_batched_step(
                            name, done, ceil_reward, 0.0
                        )
                        self.final_rewards[name].append(reward)
                    else:
                        env.step_result[name] = env._make_batched_step(
                            name, done, reward, 0.0
                        )

    def reset(self) -> None:  # type: ignore
        for name in self.names:
            for i in range(self.num_agents):
                name_and_num = name + str(i)
                self.dones[name_and_num] = False

    @property
    def reset_parameters(self) -> Dict[str, str]:
        return {}

    def close(self):
        pass
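
The class docstring above describes a batch/slice round trip: per-agent observations are concatenated into one batch for the trainer, and the trainer's batched actions are sliced back out per agent. As a rough illustration only (a minimal NumPy sketch with hypothetical sizes, not part of this diff), the round trip looks like this:

import numpy as np

# Hypothetical sizes, for illustration only.
num_agents, obs_size, act_size = 2, 4, 1

# Each wrapped SimpleEnvironment reports an observation of shape (1, obs_size);
# batching concatenates them along axis 0, as get_steps does.
per_agent_obs = [np.random.rand(1, obs_size) for _ in range(num_agents)]
batched_obs = np.concatenate(per_agent_obs, axis=0)  # shape (num_agents, obs_size)

# The trainer returns one batched continuous action array; slicing row j back out
# gives the action for agent j, as set_actions does for agents that are not done.
batched_actions = np.random.rand(num_agents, act_size)
per_agent_actions = [batched_actions[j : j + 1] for j in range(num_agents)]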


class RecordEnvironment(SimpleEnvironment):
    def __init__(
        self,
ml-agents/mlagents/trainers/tests/torch/test_simple_rl.py (78 additions, 1 deletion)

@@ -4,6 +4,7 @@

from mlagents.trainers.tests.simple_test_envs import (
    SimpleEnvironment,
    MultiAgentEnvironment,
    MemoryEnvironment,
    RecordEnvironment,
)
@@ -27,7 +28,11 @@
    ActionSpecProto,
)

-from mlagents.trainers.tests.dummy_config import ppo_dummy_config, sac_dummy_config
+from mlagents.trainers.tests.dummy_config import (
+    ppo_dummy_config,
+    sac_dummy_config,
+    coma_dummy_config,
+)
 from mlagents.trainers.tests.check_env_trains import (
     check_environment_trains,
     default_reward_processor,

@@ -37,11 +42,83 @@

PPO_TORCH_CONFIG = ppo_dummy_config()
SAC_TORCH_CONFIG = sac_dummy_config()
COMA_TORCH_CONFIG = coma_dummy_config()

# tests in this file won't be tested on GPU machine
pytestmark = pytest.mark.check_environment_trains


@pytest.mark.parametrize("action_sizes", [(0, 1), (1, 0)])
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can this be tested for combinations of rank 1, 2 and 3 observations and with and LSTM config? (To make sure it does not crash at least)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added some tests for LSTM, variable length obs, and visual

def test_simple_coma(action_sizes):
    env = MultiAgentEnvironment([BRAIN_NAME], action_sizes=action_sizes, num_agents=2)
    config = attr.evolve(COMA_TORCH_CONFIG)
    check_environment_trains(env, {BRAIN_NAME: config})


@pytest.mark.parametrize("num_visual", [1, 2])
def test_visual_coma(num_visual):
env = MultiAgentEnvironment(
[BRAIN_NAME], action_sizes=(0, 1), num_agents=2, num_visual=num_visual
)
new_hyperparams = attr.evolve(
COMA_TORCH_CONFIG.hyperparameters, learning_rate=3.0e-4
)
config = attr.evolve(COMA_TORCH_CONFIG, hyperparameters=new_hyperparams)
check_environment_trains(env, {BRAIN_NAME: config})


@pytest.mark.parametrize("num_var_len", [1, 2])
@pytest.mark.parametrize("num_vector", [0, 1])
@pytest.mark.parametrize("num_vis", [0, 1])
def test_var_len_obs_coma(num_vis, num_vector, num_var_len):
env = MultiAgentEnvironment(
[BRAIN_NAME],
action_sizes=(0, 1),
num_visual=num_vis,
num_vector=num_vector,
num_var_len=num_var_len,
step_size=0.2,
num_agents=2,
)
new_hyperparams = attr.evolve(
COMA_TORCH_CONFIG.hyperparameters, learning_rate=3.0e-4
)
config = attr.evolve(COMA_TORCH_CONFIG, hyperparameters=new_hyperparams)
check_environment_trains(env, {BRAIN_NAME: config})


@pytest.mark.parametrize("action_sizes", [(0, 1), (1, 0)])
@pytest.mark.parametrize("is_multiagent", [True, False])
def test_recurrent_coma(action_sizes, is_multiagent):
if is_multiagent:
# This is not a recurrent environment, just check if LSTM doesn't crash
env = MultiAgentEnvironment(
[BRAIN_NAME], action_sizes=action_sizes, num_agents=2
)
else:
# Actually test LSTM here
env = MemoryEnvironment([BRAIN_NAME], action_sizes=action_sizes)
new_network_settings = attr.evolve(
COMA_TORCH_CONFIG.network_settings,
memory=NetworkSettings.MemorySettings(memory_size=16),
)
new_hyperparams = attr.evolve(
COMA_TORCH_CONFIG.hyperparameters,
learning_rate=1.0e-3,
batch_size=64,
buffer_size=128,
)
config = attr.evolve(
COMA_TORCH_CONFIG,
hyperparameters=new_hyperparams,
network_settings=new_network_settings,
max_steps=500 if is_multiagent else 6000,
)
check_environment_trains(
env, {BRAIN_NAME: config}, success_threshold=None if is_multiagent else 0.9
)


@pytest.mark.parametrize("action_sizes", [(0, 1), (1, 0)])
def test_simple_ppo(action_sizes):
    env = SimpleEnvironment([BRAIN_NAME], action_sizes=action_sizes)