[poca] Remove add_groupmate_rewards from settings #5082

Merged: 1 commit, Mar 11, 2021
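Summary of the change, as reflected in the diffs below: the ExtrinsicSettings class and its add_groupmate_rewards field are removed from settings.py, so the extrinsic reward signal is configured with plain RewardSignalSettings again. add_groupmate_rewards instead becomes a public attribute on ExtrinsicRewardProvider, defaulting to False, which TorchPOCAOptimizer switches on after creating the reward providers. A rough sketch of the resulting pattern, assuming an ml-agents build that includes this PR (enable_groupmate_rewards is a hypothetical helper for illustration, not part of the codebase):

from mlagents.trainers.settings import RewardSignalSettings, RewardSignalType
from mlagents.trainers.torch.components.reward_providers.extrinsic_reward_provider import (
    ExtrinsicRewardProvider,
)

# Plain RewardSignalSettings is enough; ExtrinsicSettings no longer exists.
reward_signal_configs = {
    RewardSignalType.EXTRINSIC: RewardSignalSettings(strength=1.0, gamma=0.99)
}

def enable_groupmate_rewards(reward_signals) -> None:
    # Mirrors what TorchPOCAOptimizer.create_reward_signals does after calling super():
    # flip the flag on the extrinsic provider so groupmate rewards are folded in.
    for provider in reward_signals.values():
        if isinstance(provider, ExtrinsicRewardProvider):
            provider.add_groupmate_rewards = True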
13 changes: 9 additions & 4 deletions ml-agents/mlagents/trainers/poca/optimizer_torch.py
@@ -1,4 +1,7 @@
 from typing import Dict, cast, List, Tuple, Optional
+from mlagents.trainers.torch.components.reward_providers.extrinsic_reward_provider import (
+    ExtrinsicRewardProvider,
+)
 import numpy as np
 import math
 from mlagents.torch_utils import torch
@@ -15,7 +18,6 @@
 from mlagents.trainers.policy.torch_policy import TorchPolicy
 from mlagents.trainers.optimizer.torch_optimizer import TorchOptimizer
 from mlagents.trainers.settings import (
-    ExtrinsicSettings,
     RewardSignalSettings,
     RewardSignalType,
     TrainerSettings,
@@ -187,15 +189,18 @@ def create_reward_signals(
         GAIL, and make sure Extrinsic adds team rewards.
         :param reward_signal_configs: Reward signal config.
         """
-        for reward_signal, settings in reward_signal_configs.items():
+        for reward_signal in reward_signal_configs.keys():
             if reward_signal != RewardSignalType.EXTRINSIC:
                 logger.warning(
                     f"Reward signal {reward_signal.value.capitalize()} is not supported with the POCA trainer; "
                     "results may be unexpected."
                 )
-            elif isinstance(settings, ExtrinsicSettings):
-                settings.add_groupmate_rewards = True
         super().create_reward_signals(reward_signal_configs)
+        # Make sure we add the groupmate rewards in POCA, so agents learn how to help each
+        # other achieve individual rewards as well
+        for reward_provider in self.reward_signals.values():
+            if isinstance(reward_provider, ExtrinsicRewardProvider):
+                reward_provider.add_groupmate_rewards = True

     @property
     def critic(self):
10 changes: 2 additions & 8 deletions ml-agents/mlagents/trainers/settings.py
@@ -171,7 +171,7 @@ class RewardSignalType(Enum):

     def to_settings(self) -> type:
         _mapping = {
-            RewardSignalType.EXTRINSIC: ExtrinsicSettings,
+            RewardSignalType.EXTRINSIC: RewardSignalSettings,
             RewardSignalType.GAIL: GAILSettings,
             RewardSignalType.CURIOSITY: CuriositySettings,
             RewardSignalType.RND: RNDSettings,
@@ -215,12 +215,6 @@ def structure(d: Mapping, t: type) -> Any:
         return d_final


-@attr.s(auto_attribs=True)
-class ExtrinsicSettings(RewardSignalSettings):
-    # For use with MA-POCA. Add groupmate rewards to the final extrinsic reward.
-    add_groupmate_rewards = False
-
-
 @attr.s(auto_attribs=True)
 class GAILSettings(RewardSignalSettings):
     learning_rate: float = 3e-4
@@ -629,7 +623,7 @@ def _set_default_hyperparameters(self):

     network_settings: NetworkSettings = attr.ib(factory=NetworkSettings)
     reward_signals: Dict[RewardSignalType, RewardSignalSettings] = attr.ib(
-        factory=lambda: {RewardSignalType.EXTRINSIC: ExtrinsicSettings()}
+        factory=lambda: {RewardSignalType.EXTRINSIC: RewardSignalSettings()}
     )
     init_path: Optional[str] = None
     keep_checkpoints: int = 5
4 changes: 2 additions & 2 deletions ml-agents/mlagents/trainers/tests/torch/test_poca.py
@@ -4,7 +4,7 @@
 import attr

 from mlagents.trainers.poca.optimizer_torch import TorchPOCAOptimizer
-from mlagents.trainers.settings import ExtrinsicSettings, RewardSignalType
+from mlagents.trainers.settings import RewardSignalSettings, RewardSignalType

 from mlagents.trainers.policy.torch_policy import TorchPolicy
 from mlagents.trainers.tests import mock_brain as mb
@@ -49,7 +49,7 @@ def create_test_poca_optimizer(dummy_config, use_rnn, use_discrete, use_visual):

     trainer_settings = attr.evolve(dummy_config)
     trainer_settings.reward_signals = {
-        RewardSignalType.EXTRINSIC: ExtrinsicSettings(strength=1.0, gamma=0.99)
+        RewardSignalType.EXTRINSIC: RewardSignalSettings(strength=1.0, gamma=0.99)
     }

     trainer_settings.network_settings.memory = (
ml-agents/mlagents/trainers/tests/torch/test_reward_providers/test_extrinsic.py
@@ -6,7 +6,7 @@
     create_reward_provider,
 )
 from mlagents_envs.base_env import BehaviorSpec, ActionSpec
-from mlagents.trainers.settings import ExtrinsicSettings, RewardSignalType
+from mlagents.trainers.settings import RewardSignalSettings, RewardSignalType
 from mlagents.trainers.tests.torch.test_reward_providers.utils import (
     create_agent_buffer,
 )
@@ -29,7 +29,7 @@
     ],
 )
 def test_construction(behavior_spec: BehaviorSpec) -> None:
-    settings = ExtrinsicSettings()
+    settings = RewardSignalSettings()
     settings.gamma = 0.2
     extrinsic_rp = ExtrinsicRewardProvider(behavior_spec, settings)
     assert extrinsic_rp.gamma == 0.2
@@ -48,7 +48,7 @@ def test_construction(behavior_spec: BehaviorSpec) -> None:
     ],
 )
 def test_factory(behavior_spec: BehaviorSpec) -> None:
-    settings = ExtrinsicSettings()
+    settings = RewardSignalSettings()
     extrinsic_rp = create_reward_provider(
         RewardSignalType.EXTRINSIC, behavior_spec, settings
     )
@@ -69,7 +69,7 @@ def test_factory(behavior_spec: BehaviorSpec) -> None:
 )
 def test_reward(behavior_spec: BehaviorSpec, reward: float) -> None:
     buffer = create_agent_buffer(behavior_spec, 1000, reward)
-    settings = ExtrinsicSettings()
+    settings = RewardSignalSettings()
     extrinsic_rp = ExtrinsicRewardProvider(behavior_spec, settings)
     generated_rewards = extrinsic_rp.evaluate(buffer)
     assert (generated_rewards == reward).all()
@@ -86,7 +86,7 @@ def test_reward(behavior_spec: BehaviorSpec, reward: float) -> None:
     assert (generated_rewards == 2 * reward).all()

     # Test groupmate rewards. Total reward should be indiv_reward + 2 * teammate_reward + group_reward
-    settings.add_groupmate_rewards = True
     extrinsic_rp = ExtrinsicRewardProvider(behavior_spec, settings)
+    extrinsic_rp.add_groupmate_rewards = True
     generated_rewards = extrinsic_rp.evaluate(buffer)
     assert (generated_rewards == 4 * reward).all()
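Assuming, as the comment above suggests, that the buffer built by create_agent_buffer gives each of two groupmates the same per-agent reward and a group reward equal to reward, the expected total is reward + 2 * reward + reward = 4 * reward, which is exactly what the final assertion checks.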
ml-agents/mlagents/trainers/torch/components/reward_providers/extrinsic_reward_provider.py
@@ -6,7 +6,7 @@
     BaseRewardProvider,
 )
 from mlagents_envs.base_env import BehaviorSpec
-from mlagents.trainers.settings import ExtrinsicSettings
+from mlagents.trainers.settings import RewardSignalSettings


 class ExtrinsicRewardProvider(BaseRewardProvider):
@@ -16,9 +16,9 @@ class ExtrinsicRewardProvider(BaseRewardProvider):
     but also the team and the individual rewards of the other agents.
     """

-    def __init__(self, specs: BehaviorSpec, settings: ExtrinsicSettings) -> None:
+    def __init__(self, specs: BehaviorSpec, settings: RewardSignalSettings) -> None:
         super().__init__(specs, settings)
-        self._add_groupmate_rewards = settings.add_groupmate_rewards
+        self.add_groupmate_rewards = False

     def evaluate(self, mini_batch: AgentBuffer) -> np.ndarray:
         indiv_rewards = np.array(
@@ -29,7 +29,7 @@ def evaluate(self, mini_batch: AgentBuffer) -> np.ndarray:
             BufferKey.GROUPMATE_REWARDS in mini_batch
             and BufferKey.GROUP_REWARD in mini_batch
         ):
-            if self._add_groupmate_rewards:
+            if self.add_groupmate_rewards:
                 groupmate_rewards_list = mini_batch[BufferKey.GROUPMATE_REWARDS]
                 groupmate_rewards_sum = np.array(
                     [sum(_rew) for _rew in groupmate_rewards_list], dtype=np.float32
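To make the reward composition concrete, here is a small standalone sketch of what evaluate() produces once the POCA optimizer has enabled the flag. The numbers are hypothetical and only mirror the test above; the BufferKey names are shown as comments rather than a real AgentBuffer:

import numpy as np

indiv_rewards = np.array([1.0, 1.0], dtype=np.float32)   # BufferKey.ENVIRONMENT_REWARDS
groupmate_rewards = [[1.0, 1.0], [1.0, 1.0]]              # BufferKey.GROUPMATE_REWARDS (two teammates)
group_rewards = np.array([1.0, 1.0], dtype=np.float32)    # BufferKey.GROUP_REWARD

total_rewards = indiv_rewards.copy()
# Summed groupmate rewards are only added when add_groupmate_rewards is True
# (set by TorchPOCAOptimizer, no longer by a settings field):
total_rewards += np.array([sum(rews) for rews in groupmate_rewards], dtype=np.float32)
# The group reward appears to be added whenever it is present in the batch,
# per the 2 * reward assertion in the test above:
total_rewards += group_rewards
print(total_rewards)  # [4. 4.] -> matches the 4 * reward assertion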