Commit 143876b

Added Reward Providers for Torch (#4280)
* Added Reward Providers for Torch
* Use NetworkBody to encode state in the reward providers
* Integrating the reward providers with PPO and torch
* Work in progress, integration with PPO; not training Pyramids properly at the moment
* Integration in PPO
* Removing duplicate file
* GAIL and Curiosity working
* Addressing comments
* Enforce float32 for tests
* Enforce np.float32 in buffer
1 parent c2b0074 commit 143876b

21 files changed: +1,111 −49 lines

ml-agents/mlagents/trainers/buffer.py

Lines changed: 1 addition & 1 deletion
@@ -48,7 +48,7 @@ def extend(self, data: np.ndarray) -> None:
         Adds a list of np.arrays to the end of the list of np.arrays.
         :param data: The np.array list to append.
         """
-        self += list(np.array(data))
+        self += list(np.array(data, dtype=np.float32))

     def set(self, data):
         """

ml-agents/mlagents/trainers/optimizer/torch_optimizer.py

Lines changed: 10 additions & 16 deletions
@@ -5,12 +5,11 @@

 from mlagents.trainers.buffer import AgentBuffer
 from mlagents.trainers.components.bc.module import BCModule
-from mlagents.trainers.components.reward_signals.extrinsic.signal import (
-    ExtrinsicRewardSignal,
-)
+from mlagents.trainers.torch.components.reward_providers import create_reward_provider
+
 from mlagents.trainers.policy.torch_policy import TorchPolicy
 from mlagents.trainers.optimizer import Optimizer
-from mlagents.trainers.settings import TrainerSettings, RewardSignalType
+from mlagents.trainers.settings import TrainerSettings
 from mlagents.trainers.trajectory import SplitObservations
 from mlagents.trainers.torch.utils import ModelUtils

@@ -37,16 +36,11 @@ def create_reward_signals(self, reward_signal_configs):
         Create reward signals
         :param reward_signal_configs: Reward signal config.
         """
-        extrinsic_signal = ExtrinsicRewardSignal(
-            self.policy, reward_signal_configs[RewardSignalType.EXTRINSIC]
-        )
-        self.reward_signals = {RewardSignalType.EXTRINSIC.value: extrinsic_signal}
-        # Create reward signals
-        # for reward_signal, config in reward_signal_configs.items():
-        #     self.reward_signals[reward_signal] = create_reward_signal(
-        #         self.policy, reward_signal, config
-        #     )
-        #     self.update_dict.update(self.reward_signals[reward_signal].update_dict)
+        for reward_signal, settings in reward_signal_configs.items():
+            # Name reward signals by string in case we have duplicates later
+            self.reward_signals[reward_signal.value] = create_reward_provider(
+                reward_signal, self.policy.behavior_spec, settings
+            )

     def get_value_estimates(
         self, decision_requests: DecisionSteps, idx: int, done: bool
@@ -72,7 +66,7 @@ def get_value_estimates(
         # If we're done, reassign all of the value estimates that need terminal states.
         if done:
             for k in value_estimates:
-                if self.reward_signals[k].use_terminal_states:
+                if not self.reward_signals[k].ignore_done:
                     value_estimates[k] = 0.0

         return value_estimates
@@ -111,7 +105,7 @@ def get_trajectory_value_estimates(

         if done:
             for k in next_value_estimate:
-                if self.reward_signals[k].use_terminal_states:
+                if not self.reward_signals[k].ignore_done:
                     next_value_estimate[k] = 0.0

         return value_estimates, next_value_estimate
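Taken together, these call sites assume a small provider interface: a factory plus objects exposing name, gamma, strength, ignore_done, evaluate() and update(). A minimal sketch of that interface, assuming the attribute names visible in this diff (the class name and defaults below are illustrative, not copied from the commit):

from typing import Dict
import numpy as np
from mlagents.trainers.buffer import AgentBuffer
from mlagents.trainers.settings import RewardSignalSettings
from mlagents_envs.base_env import BehaviorSpec


class RewardProviderSketch:
    """Hypothetical stand-in for the base class in
    mlagents.trainers.torch.components.reward_providers."""

    def __init__(self, specs: BehaviorSpec, settings: RewardSignalSettings) -> None:
        self.gamma = settings.gamma        # discount used by the trainers
        self.strength = settings.strength  # callers scale evaluate() by this
        self.ignore_done = False           # replaces use_terminal_states (inverted)

    @property
    def name(self) -> str:
        # e.g. "Extrinsic" or "Curiosity"; used for stats and buffer column names
        return self.__class__.__name__.replace("RewardProviderSketch", "Sketch")

    def evaluate(self, mini_batch: AgentBuffer) -> np.ndarray:
        """Return one unscaled reward per transition in the batch."""
        raise NotImplementedError

    def update(self, mini_batch: AgentBuffer) -> Dict[str, float]:
        """Train any internal models and return stats for the StatsReporter."""
        raise NotImplementedError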

ml-agents/mlagents/trainers/ppo/optimizer_torch.py

Lines changed: 3 additions & 0 deletions
@@ -176,4 +176,7 @@ def update(self, batch: AgentBuffer, num_sequences: int) -> Dict[str, float]:
             "Policy/Beta": decay_bet,
         }

+        for reward_provider in self.reward_signals.values():
+            update_stats.update(reward_provider.update(batch))
+
         return update_stats

ml-agents/mlagents/trainers/ppo/trainer.py

Lines changed: 19 additions & 6 deletions
@@ -21,6 +21,7 @@
     TestingConfiguration,
     FrameworkType,
 )
+from mlagents.trainers.components.reward_signals import RewardSignal

 try:
     from mlagents.trainers.policy.torch_policy import TorchPolicy
@@ -91,18 +92,30 @@ def _process_trajectory(self, trajectory: Trajectory) -> None:

         for name, v in value_estimates.items():
             agent_buffer_trajectory[f"{name}_value_estimates"].extend(v)
-            self._stats_reporter.add_stat(
-                self.optimizer.reward_signals[name].value_name, np.mean(v)
-            )
+            if isinstance(self.optimizer.reward_signals[name], RewardSignal):
+                self._stats_reporter.add_stat(
+                    self.optimizer.reward_signals[name].value_name, np.mean(v)
+                )
+            else:
+                self._stats_reporter.add_stat(
+                    f"Policy/{self.optimizer.reward_signals[name].name.capitalize()} Value Estimate",
+                    np.mean(v),
+                )

         # Evaluate all reward functions
         self.collected_rewards["environment"][agent_id] += np.sum(
             agent_buffer_trajectory["environment_rewards"]
         )
         for name, reward_signal in self.optimizer.reward_signals.items():
-            evaluate_result = reward_signal.evaluate_batch(
-                agent_buffer_trajectory
-            ).scaled_reward
+            if isinstance(reward_signal, RewardSignal):
+                evaluate_result = reward_signal.evaluate_batch(
+                    agent_buffer_trajectory
+                ).scaled_reward
+            else:
+                evaluate_result = (
+                    reward_signal.evaluate(agent_buffer_trajectory)
+                    * reward_signal.strength
+                )
             agent_buffer_trajectory[f"{name}_rewards"].extend(evaluate_result)
             # Report the reward signals
             self.collected_rewards[name][agent_id] += np.sum(evaluate_result)
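Both trainers now branch on the signal type. A hypothetical helper condensing the two paths added above (an illustration, not code from the commit):

from mlagents.trainers.components.reward_signals import RewardSignal


def scaled_rewards(reward_signal, agent_buffer_trajectory):
    """Return per-step scaled rewards regardless of which API the signal uses."""
    if isinstance(reward_signal, RewardSignal):
        # Legacy TF-based signals bundle scaling into a RewardSignalResult.
        return reward_signal.evaluate_batch(agent_buffer_trajectory).scaled_reward
    # New torch reward providers return raw rewards; the caller applies .strength.
    return reward_signal.evaluate(agent_buffer_trajectory) * reward_signal.strength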

ml-agents/mlagents/trainers/sac/optimizer_torch.py

Lines changed: 4 additions & 1 deletion
@@ -83,7 +83,7 @@ def __init__(self, policy: TorchPolicy, trainer_params: TrainerSettings):
         # Use to reduce "survivor bonus" when using Curiosity or GAIL.
         self.gammas = [_val.gamma for _val in trainer_params.reward_signals.values()]
         self.use_dones_in_backup = {
-            name: int(self.reward_signals[name].use_terminal_states)
+            name: int(not self.reward_signals[name].ignore_done)
             for name in self.stream_names
         }

@@ -472,6 +472,9 @@ def update(self, batch: AgentBuffer, num_sequences: int) -> Dict[str, float]:
             "Policy/Learning Rate": decay_lr,
         }

+        for signal in self.reward_signals.values():
+            signal.update(batch)
+
         return update_stats

     def update_reward_signals(

ml-agents/mlagents/trainers/sac/trainer.py

Lines changed: 34 additions & 15 deletions
@@ -19,6 +19,7 @@
 from mlagents.trainers.trajectory import Trajectory, SplitObservations
 from mlagents.trainers.behavior_id_utils import BehaviorIdentifiers
 from mlagents.trainers.settings import TrainerSettings, SACSettings, FrameworkType
+from mlagents.trainers.components.reward_signals import RewardSignal

 try:
     from mlagents.trainers.policy.torch_policy import TorchPolicy
@@ -143,9 +144,15 @@ def _process_trajectory(self, trajectory: Trajectory) -> None:
             agent_buffer_trajectory["environment_rewards"]
         )
         for name, reward_signal in self.optimizer.reward_signals.items():
-            evaluate_result = reward_signal.evaluate_batch(
-                agent_buffer_trajectory
-            ).scaled_reward
+            if isinstance(reward_signal, RewardSignal):
+                evaluate_result = reward_signal.evaluate_batch(
+                    agent_buffer_trajectory
+                ).scaled_reward
+            else:
+                evaluate_result = (
+                    reward_signal.evaluate(agent_buffer_trajectory)
+                    * reward_signal.strength
+                )
             # Report the reward signals
             self.collected_rewards[name][agent_id] += np.sum(evaluate_result)

@@ -154,9 +161,15 @@ def _process_trajectory(self, trajectory: Trajectory) -> None:
             agent_buffer_trajectory, trajectory.next_obs, trajectory.done_reached
         )
         for name, v in value_estimates.items():
-            self._stats_reporter.add_stat(
-                self.optimizer.reward_signals[name].value_name, np.mean(v)
-            )
+            if isinstance(self.optimizer.reward_signals[name], RewardSignal):
+                self._stats_reporter.add_stat(
+                    self.optimizer.reward_signals[name].value_name, np.mean(v)
+                )
+            else:
+                self._stats_reporter.add_stat(
+                    f"Policy/{self.optimizer.reward_signals[name].name.capitalize()} Value",
+                    np.mean(v),
+                )

         # Bootstrap using the last step rather than the bootstrap step if max step is reached.
         # Set last element to duplicate obs and remove dones.
@@ -277,9 +290,14 @@ def _update_sac_policy(self) -> bool:
                 )
                 # Get rewards for each reward
                 for name, signal in self.optimizer.reward_signals.items():
-                    sampled_minibatch[f"{name}_rewards"] = signal.evaluate_batch(
-                        sampled_minibatch
-                    ).scaled_reward
+                    if isinstance(signal, RewardSignal):
+                        sampled_minibatch[f"{name}_rewards"] = signal.evaluate_batch(
+                            sampled_minibatch
+                        ).scaled_reward
+                    else:
+                        sampled_minibatch[f"{name}_rewards"] = (
+                            signal.evaluate(sampled_minibatch) * signal.strength
+                        )

                 update_stats = self.optimizer.update(sampled_minibatch, n_sequences)
                 for stat_name, value in update_stats.items():
@@ -326,12 +344,13 @@ def _update_reward_signals(self) -> None:
         reward_signal_minibatches = {}
         for name, signal in self.optimizer.reward_signals.items():
             logger.debug(f"Updating {name} at step {self.step}")
-            # Some signals don't need a minibatch to be sampled - so we don't!
-            if signal.update_dict:
-                reward_signal_minibatches[name] = buffer.sample_mini_batch(
-                    self.hyperparameters.batch_size,
-                    sequence_length=self.policy.sequence_length,
-                )
+            if isinstance(signal, RewardSignal):
+                # Some signals don't need a minibatch to be sampled - so we don't!
+                if signal.update_dict:
+                    reward_signal_minibatches[name] = buffer.sample_mini_batch(
+                        self.hyperparameters.batch_size,
+                        sequence_length=self.policy.sequence_length,
+                    )
         update_stats = self.optimizer.update_reward_signals(
             reward_signal_minibatches, n_sequences
         )

Lines changed: 111 additions & 0 deletions
@@ -0,0 +1,111 @@
+import numpy as np
+import pytest
+import torch
+from mlagents.trainers.torch.components.reward_providers import (
+    CuriosityRewardProvider,
+    create_reward_provider,
+)
+from mlagents_envs.base_env import BehaviorSpec, ActionType
+from mlagents.trainers.settings import CuriositySettings, RewardSignalType
+from mlagents.trainers.tests.torch.test_reward_providers.utils import (
+    create_agent_buffer,
+)
+
+SEED = [42]
+
+
+@pytest.mark.parametrize(
+    "behavior_spec",
+    [
+        BehaviorSpec([(10,)], ActionType.CONTINUOUS, 5),
+        BehaviorSpec([(10,)], ActionType.DISCRETE, (2, 3)),
+    ],
+)
+def test_construction(behavior_spec: BehaviorSpec) -> None:
+    curiosity_settings = CuriositySettings(32, 0.01)
+    curiosity_settings.strength = 0.1
+    curiosity_rp = CuriosityRewardProvider(behavior_spec, curiosity_settings)
+    assert curiosity_rp.strength == 0.1
+    assert curiosity_rp.name == "Curiosity"
+
+
+@pytest.mark.parametrize(
+    "behavior_spec",
+    [
+        BehaviorSpec([(10,)], ActionType.CONTINUOUS, 5),
+        BehaviorSpec([(10,), (64, 66, 3), (84, 86, 1)], ActionType.CONTINUOUS, 5),
+        BehaviorSpec([(10,), (64, 66, 1)], ActionType.DISCRETE, (2, 3)),
+        BehaviorSpec([(10,)], ActionType.DISCRETE, (2,)),
+    ],
+)
+def test_factory(behavior_spec: BehaviorSpec) -> None:
+    curiosity_settings = CuriositySettings(32, 0.01)
+    curiosity_rp = create_reward_provider(
+        RewardSignalType.CURIOSITY, behavior_spec, curiosity_settings
+    )
+    assert curiosity_rp.name == "Curiosity"
+
+
+@pytest.mark.parametrize("seed", SEED)
+@pytest.mark.parametrize(
+    "behavior_spec",
+    [
+        BehaviorSpec([(10,), (64, 66, 3), (24, 26, 1)], ActionType.CONTINUOUS, 5),
+        BehaviorSpec([(10,)], ActionType.DISCRETE, (2, 3)),
+        BehaviorSpec([(10,)], ActionType.DISCRETE, (2,)),
+    ],
+)
+def test_reward_decreases(behavior_spec: BehaviorSpec, seed: int) -> None:
+    np.random.seed(seed)
+    torch.manual_seed(seed)
+    curiosity_settings = CuriositySettings(32, 0.01)
+    curiosity_rp = CuriosityRewardProvider(behavior_spec, curiosity_settings)
+    buffer = create_agent_buffer(behavior_spec, 5)
+    curiosity_rp.update(buffer)
+    reward_old = curiosity_rp.evaluate(buffer)[0]
+    for _ in range(10):
+        curiosity_rp.update(buffer)
+        reward_new = curiosity_rp.evaluate(buffer)[0]
+        assert reward_new < reward_old
+        reward_old = reward_new
+
+
+@pytest.mark.parametrize("seed", SEED)
+@pytest.mark.parametrize(
+    "behavior_spec", [BehaviorSpec([(10,)], ActionType.CONTINUOUS, 5)]
+)
+def test_continuous_action_prediction(behavior_spec: BehaviorSpec, seed: int) -> None:
+    np.random.seed(seed)
+    torch.manual_seed(seed)
+    curiosity_settings = CuriositySettings(32, 0.1)
+    curiosity_rp = CuriosityRewardProvider(behavior_spec, curiosity_settings)
+    buffer = create_agent_buffer(behavior_spec, 5)
+    for _ in range(200):
+        curiosity_rp.update(buffer)
+    prediction = curiosity_rp._network.predict_action(buffer)[0].detach()
+    target = buffer["actions"][0]
+    error = float(torch.mean((prediction - target) ** 2))
+    assert error < 0.001
+
+
+@pytest.mark.parametrize("seed", SEED)
+@pytest.mark.parametrize(
+    "behavior_spec",
+    [
+        BehaviorSpec([(10,), (64, 66, 3)], ActionType.CONTINUOUS, 5),
+        BehaviorSpec([(10,)], ActionType.DISCRETE, (2, 3)),
+        BehaviorSpec([(10,)], ActionType.DISCRETE, (2,)),
+    ],
+)
+def test_next_state_prediction(behavior_spec: BehaviorSpec, seed: int) -> None:
+    np.random.seed(seed)
+    torch.manual_seed(seed)
+    curiosity_settings = CuriositySettings(32, 0.1)
+    curiosity_rp = CuriosityRewardProvider(behavior_spec, curiosity_settings)
+    buffer = create_agent_buffer(behavior_spec, 5)
+    for _ in range(100):
+        curiosity_rp.update(buffer)
+    prediction = curiosity_rp._network.predict_next_state(buffer)[0]
+    target = curiosity_rp._network.get_next_state(buffer)[0]
+    error = float(torch.mean((prediction - target) ** 2).detach())
+    assert error < 0.001
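For context, the quantities exercised by test_reward_decreases and test_next_state_prediction match the usual curiosity formulation, where the reward is the forward model's prediction error on the encoded next state. A conceptual sketch (an illustration, not the module's actual code):

import torch


def curiosity_reward_sketch(
    predicted_next_state: torch.Tensor, encoded_next_state: torch.Tensor
) -> torch.Tensor:
    # Per-sample squared prediction error of the forward model: large for
    # transitions the model has not learned yet, so it shrinks as update()
    # runs repeatedly, which is exactly what test_reward_decreases checks.
    return torch.mean((predicted_next_state - encoded_next_state) ** 2, dim=1)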
Lines changed: 56 additions & 0 deletions
@@ -0,0 +1,56 @@
+import pytest
+from mlagents.trainers.torch.components.reward_providers import (
+    ExtrinsicRewardProvider,
+    create_reward_provider,
+)
+from mlagents_envs.base_env import BehaviorSpec, ActionType
+from mlagents.trainers.settings import RewardSignalSettings, RewardSignalType
+from mlagents.trainers.tests.torch.test_reward_providers.utils import (
+    create_agent_buffer,
+)
+
+
+@pytest.mark.parametrize(
+    "behavior_spec",
+    [
+        BehaviorSpec([(10,)], ActionType.CONTINUOUS, 5),
+        BehaviorSpec([(10,)], ActionType.DISCRETE, (2, 3)),
+    ],
+)
+def test_construction(behavior_spec: BehaviorSpec) -> None:
+    settings = RewardSignalSettings()
+    settings.gamma = 0.2
+    extrinsic_rp = ExtrinsicRewardProvider(behavior_spec, settings)
+    assert extrinsic_rp.gamma == 0.2
+    assert extrinsic_rp.name == "Extrinsic"
+
+
+@pytest.mark.parametrize(
+    "behavior_spec",
+    [
+        BehaviorSpec([(10,)], ActionType.CONTINUOUS, 5),
+        BehaviorSpec([(10,)], ActionType.DISCRETE, (2, 3)),
+    ],
+)
+def test_factory(behavior_spec: BehaviorSpec) -> None:
+    settings = RewardSignalSettings()
+    extrinsic_rp = create_reward_provider(
+        RewardSignalType.EXTRINSIC, behavior_spec, settings
+    )
+    assert extrinsic_rp.name == "Extrinsic"
+
+
+@pytest.mark.parametrize("reward", [2.0, 3.0, 4.0])
+@pytest.mark.parametrize(
+    "behavior_spec",
+    [
+        BehaviorSpec([(10,)], ActionType.CONTINUOUS, 5),
+        BehaviorSpec([(10,)], ActionType.DISCRETE, (2, 3)),
+    ],
+)
+def test_reward(behavior_spec: BehaviorSpec, reward: float) -> None:
+    buffer = create_agent_buffer(behavior_spec, 1000, reward)
+    settings = RewardSignalSettings()
+    extrinsic_rp = ExtrinsicRewardProvider(behavior_spec, settings)
+    generated_rewards = extrinsic_rp.evaluate(buffer)
+    assert (generated_rewards == reward).all()
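The last test pins down the extrinsic provider's behavior: it simply returns the reward the environment already produced. A minimal sketch, assuming the "environment_rewards" buffer key used elsewhere in this commit (the class name below is hypothetical):

from typing import Dict
import numpy as np
from mlagents.trainers.buffer import AgentBuffer


class ExtrinsicRewardProviderSketch:  # hypothetical class, mirroring the test's expectations
    def evaluate(self, mini_batch: AgentBuffer) -> np.ndarray:
        # The extrinsic "reward" is simply what the environment already paid out.
        return np.array(mini_batch["environment_rewards"], dtype=np.float32)

    def update(self, mini_batch: AgentBuffer) -> Dict[str, float]:
        return {}  # nothing to train for the extrinsic signal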
