
Multiagent simplerl #5066


Merged 15 commits on Mar 10, 2021
ml-agents/mlagents/trainers/tests/check_env_trains.py (5 additions, 1 deletion)

@@ -27,7 +27,11 @@ def write_stats(
         self, category: str, values: Dict[str, StatsSummary], step: int
     ) -> None:
         for val, stats_summary in values.items():
-            if val == "Environment/Cumulative Reward":
+            if (
+                val == "Environment/Cumulative Reward"
+                or val == "Environment/Group Cumulative Reward"
+            ):
                 print(step, val, stats_summary.aggregated_value)
                 self._last_reward_summary[category] = stats_summary.aggregated_value

ml-agents/mlagents/trainers/tests/dummy_config.py (18 additions)

@@ -50,6 +50,20 @@
    threaded=False,
)

_COMA_CONFIG = TrainerSettings(
    trainer_type=TrainerType.COMA,
    hyperparameters=PPOSettings(
        learning_rate=5.0e-3,
        learning_rate_schedule=ScheduleType.CONSTANT,
        batch_size=16,
        buffer_size=64,
    ),
    network_settings=NetworkSettings(num_layers=1, hidden_units=32),
    summary_freq=500,
    max_steps=3000,
    threaded=False,
)


def ppo_dummy_config():
    return copy.deepcopy(_PPO_CONFIG)

@@ -59,6 +73,10 @@ def sac_dummy_config():
    return copy.deepcopy(_SAC_CONFIG)


def coma_dummy_config():
    return copy.deepcopy(_COMA_CONFIG)


@pytest.fixture
def gail_dummy_config():
    return {RewardSignalType.GAIL: GAILSettings(demo_path=CONTINUOUS_DEMO_PATH)}
ml-agents/mlagents/trainers/tests/simple_test_envs.py (206 additions)

@@ -314,6 +314,212 @@ def _make_batched_step(
        return (decision_step, terminal_step)


Contributor: I understand nothing this class does. Please add comments.

Contributor (Author): Yeah, it is a pretty horrible thing.

class MultiAgentEnvironment(BaseEnv):
"""
The MultiAgentEnvironment maintains a list of SimpleEnvironment, one for each agent.
When sending DecisionSteps and TerminalSteps to the trainers, it first batches the
decision steps from the individual environments. When setting actions, it indexes the
batched ActionTuple to obtain the ActionTuple for individual agents
"""

    def __init__(
        self,
        brain_names,
        step_size=STEP_SIZE,
        num_visual=0,
        num_vector=1,
        num_var_len=0,
        vis_obs_size=VIS_OBS_SIZE,
        vec_obs_size=OBS_SIZE,
        var_len_obs_size=VAR_LEN_SIZE,
        action_sizes=(1, 0),
        num_agents=2,
    ):
        super().__init__()
        self.envs = {}
        self.dones = {}
        self.just_died = set()
        self.names = brain_names
        self.final_rewards: Dict[str, List[float]] = {}
        for name in brain_names:
            self.final_rewards[name] = []
            for i in range(num_agents):
                name_and_num = name + str(i)
                self.envs[name_and_num] = SimpleEnvironment(
                    [name],
                    step_size,
                    num_visual,
                    num_vector,
                    num_var_len,
                    vis_obs_size,
                    vec_obs_size,
                    var_len_obs_size,
                    action_sizes,
                )
                self.dones[name_and_num] = False
                self.envs[name_and_num].reset()
        # All envs have the same behavior spec, so just get the last one.
        self.behavior_spec = self.envs[name_and_num].behavior_spec
        self.action_spec = self.envs[name_and_num].action_spec
        self.num_agents = num_agents

    @property
    def all_done(self):
        return all(self.dones.values())

    @property
    def behavior_specs(self):
        behavior_dict = {}
        for n in self.names:
            behavior_dict[n] = self.behavior_spec
        return BehaviorMapping(behavior_dict)

    def set_action_for_agent(self, behavior_name, agent_id, action):
        pass

    def set_actions(self, behavior_name, action):
        # The ActionTuple contains the actions for all n_agents. This
        # slices the ActionTuple into an action tuple for each environment
        # and sets it. The index j is used to ignore agents that have already
        # reached done.
        j = 0
        for i in range(self.num_agents):
            _act = ActionTuple()
            name_and_num = behavior_name + str(i)
            env = self.envs[name_and_num]
            if not self.dones[name_and_num]:
                if self.action_spec.continuous_size > 0:
                    _act.add_continuous(action.continuous[j : j + 1])
                if self.action_spec.discrete_size > 0:
                    _disc_list = [action.discrete[j, :]]
                    _act.add_discrete(np.array(_disc_list))
                j += 1
                env.action[behavior_name] = _act

    def get_steps(self, behavior_name):
        # This gets the individual DecisionSteps and TerminalSteps
        # from the envs and merges them into a batch to be sent
        # to the AgentProcessor.
        dec_vec_obs = []
        dec_reward = []
        dec_group_reward = []
        dec_agent_id = []
        dec_group_id = []
        ter_vec_obs = []
        ter_reward = []
        ter_group_reward = []
        ter_agent_id = []
        ter_group_id = []
        interrupted = []

        action_mask = None
        terminal_step = TerminalSteps.empty(self.behavior_spec)
        decision_step = None
        for i in range(self.num_agents):
            name_and_num = behavior_name + str(i)
            env = self.envs[name_and_num]
            _dec, _term = env.step_result[behavior_name]
            if not self.dones[name_and_num]:
                dec_agent_id.append(i)
                dec_group_id.append(1)
                if len(dec_vec_obs) > 0:
                    for j, obs in enumerate(_dec.obs):
                        dec_vec_obs[j] = np.concatenate((dec_vec_obs[j], obs), axis=0)
                else:
                    for obs in _dec.obs:
                        dec_vec_obs.append(obs)
                dec_reward.append(_dec.reward[0])
                dec_group_reward.append(_dec.group_reward[0])
                if _dec.action_mask is not None:
                    if action_mask is None:
                        action_mask = []
                    if len(action_mask) > 0:
                        action_mask[0] = np.concatenate(
                            (action_mask[0], _dec.action_mask[0]), axis=0
                        )
                    else:
                        action_mask.append(_dec.action_mask[0])
            if len(_term.reward) > 0 and name_and_num in self.just_died:
                ter_agent_id.append(i)
                ter_group_id.append(1)
                if len(ter_vec_obs) > 0:
                    for j, obs in enumerate(_term.obs):
                        ter_vec_obs[j] = np.concatenate((ter_vec_obs[j], obs), axis=0)
                else:
                    for obs in _term.obs:
                        ter_vec_obs.append(obs)
                ter_reward.append(_term.reward[0])
                ter_group_reward.append(_term.group_reward[0])
                interrupted.append(False)
                self.just_died.remove(name_and_num)
        decision_step = DecisionSteps(
            dec_vec_obs,
            dec_reward,
            dec_agent_id,
            action_mask,
            dec_group_id,
            dec_group_reward,
        )
        terminal_step = TerminalSteps(
            ter_vec_obs,
            ter_reward,
            interrupted,
            ter_agent_id,
            ter_group_id,
            ter_group_reward,
        )
        return (decision_step, terminal_step)

    def step(self) -> None:
        # Steps all environments and calls reset if all agents are done.
        for name in self.names:
            for i in range(self.num_agents):
                name_and_num = name + str(i)
                # Does not step the env if done
                if not self.dones[name_and_num]:
                    env = self.envs[name_and_num]
                    # Reproducing part of env step to intercept Dones
                    assert all(action is not None for action in env.action.values())
                    done = env._take_action(name)
                    reward = env._compute_reward(name, done)
                    self.dones[name_and_num] = done
                    if done:
                        self.just_died.add(name_and_num)
                    if self.all_done:
                        env.step_result[name] = env._make_batched_step(
                            name, done, 0.0, reward
                        )
                        self.final_rewards[name].append(reward)
                        self.reset()
                    elif done:
                        # This agent has finished but others are still running.
                        # This gives a reward of the time penalty if this agent
                        # is successful and the negative env reward if it fails.
                        ceil_reward = min(-TIME_PENALTY, reward)
                        env.step_result[name] = env._make_batched_step(
                            name, done, ceil_reward, 0.0
                        )
                        self.final_rewards[name].append(reward)
                    else:
                        env.step_result[name] = env._make_batched_step(
                            name, done, reward, 0.0
                        )

    def reset(self) -> None:  # type: ignore
        for name in self.names:
            for i in range(self.num_agents):
                name_and_num = name + str(i)
                self.dones[name_and_num] = False

    @property
    def reset_parameters(self) -> Dict[str, str]:
        return {}

    def close(self):
        pass
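
The class docstring above describes a batch/slice round trip: per-agent observations are concatenated into one batch for the trainer, and the trainer's batched actions are sliced back out per agent. As a rough illustration only (a minimal NumPy sketch with hypothetical sizes, not part of this diff), the round trip looks like this:

import numpy as np

# Hypothetical sizes, for illustration only.
num_agents, obs_size, act_size = 2, 4, 1

# Each wrapped SimpleEnvironment reports an observation of shape (1, obs_size);
# batching concatenates them along axis 0, as get_steps does.
per_agent_obs = [np.random.rand(1, obs_size) for _ in range(num_agents)]
batched_obs = np.concatenate(per_agent_obs, axis=0)  # shape (num_agents, obs_size)

# The trainer returns one batched continuous action array; slicing row j back out
# gives the action for agent j, as set_actions does for agents that are not done.
batched_actions = np.random.rand(num_agents, act_size)
per_agent_actions = [batched_actions[j : j + 1] for j in range(num_agents)]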


class RecordEnvironment(SimpleEnvironment):
    def __init__(
        self,
ml-agents/mlagents/trainers/tests/torch/test_simple_rl.py (78 additions, 1 deletion)

@@ -4,6 +4,7 @@

from mlagents.trainers.tests.simple_test_envs import (
    SimpleEnvironment,
    MultiAgentEnvironment,
    MemoryEnvironment,
    RecordEnvironment,
)
@@ -27,7 +28,11 @@
    ActionSpecProto,
)

-from mlagents.trainers.tests.dummy_config import ppo_dummy_config, sac_dummy_config
+from mlagents.trainers.tests.dummy_config import (
+    ppo_dummy_config,
+    sac_dummy_config,
+    coma_dummy_config,
+)
 from mlagents.trainers.tests.check_env_trains import (
     check_environment_trains,
     default_reward_processor,

@@ -37,11 +42,83 @@

PPO_TORCH_CONFIG = ppo_dummy_config()
SAC_TORCH_CONFIG = sac_dummy_config()
COMA_TORCH_CONFIG = coma_dummy_config()

# tests in this file won't be tested on GPU machine
pytestmark = pytest.mark.check_environment_trains


@pytest.mark.parametrize("action_sizes", [(0, 1), (1, 0)])
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can this be tested for combinations of rank 1, 2 and 3 observations and with and LSTM config? (To make sure it does not crash at least)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added some tests for LSTM, variable length obs, and visual

def test_simple_coma(action_sizes):
    env = MultiAgentEnvironment([BRAIN_NAME], action_sizes=action_sizes, num_agents=2)
    config = attr.evolve(COMA_TORCH_CONFIG)
    check_environment_trains(env, {BRAIN_NAME: config})


@pytest.mark.parametrize("num_visual", [1, 2])
def test_visual_coma(num_visual):
env = MultiAgentEnvironment(
[BRAIN_NAME], action_sizes=(0, 1), num_agents=2, num_visual=num_visual
)
new_hyperparams = attr.evolve(
COMA_TORCH_CONFIG.hyperparameters, learning_rate=3.0e-4
)
config = attr.evolve(COMA_TORCH_CONFIG, hyperparameters=new_hyperparams)
check_environment_trains(env, {BRAIN_NAME: config})


@pytest.mark.parametrize("num_var_len", [1, 2])
@pytest.mark.parametrize("num_vector", [0, 1])
@pytest.mark.parametrize("num_vis", [0, 1])
def test_var_len_obs_coma(num_vis, num_vector, num_var_len):
env = MultiAgentEnvironment(
[BRAIN_NAME],
action_sizes=(0, 1),
num_visual=num_vis,
num_vector=num_vector,
num_var_len=num_var_len,
step_size=0.2,
num_agents=2,
)
new_hyperparams = attr.evolve(
COMA_TORCH_CONFIG.hyperparameters, learning_rate=3.0e-4
)
config = attr.evolve(COMA_TORCH_CONFIG, hyperparameters=new_hyperparams)
check_environment_trains(env, {BRAIN_NAME: config})


@pytest.mark.parametrize("action_sizes", [(0, 1), (1, 0)])
@pytest.mark.parametrize("is_multiagent", [True, False])
def test_recurrent_coma(action_sizes, is_multiagent):
if is_multiagent:
# This is not a recurrent environment, just check if LSTM doesn't crash
env = MultiAgentEnvironment(
[BRAIN_NAME], action_sizes=action_sizes, num_agents=2
)
else:
# Actually test LSTM here
env = MemoryEnvironment([BRAIN_NAME], action_sizes=action_sizes)
new_network_settings = attr.evolve(
COMA_TORCH_CONFIG.network_settings,
memory=NetworkSettings.MemorySettings(memory_size=16),
)
new_hyperparams = attr.evolve(
COMA_TORCH_CONFIG.hyperparameters,
learning_rate=1.0e-3,
batch_size=64,
buffer_size=128,
)
config = attr.evolve(
COMA_TORCH_CONFIG,
hyperparameters=new_hyperparams,
network_settings=new_network_settings,
max_steps=500 if is_multiagent else 6000,
)
check_environment_trains(
env, {BRAIN_NAME: config}, success_threshold=None if is_multiagent else 0.9
)


@pytest.mark.parametrize("action_sizes", [(0, 1), (1, 0)])
def test_simple_ppo(action_sizes):
    env = SimpleEnvironment([BRAIN_NAME], action_sizes=action_sizes)