Commit dfe9c11

Authored by Ervin T
Move add_experiences out of trainer, add Trajectories (#3067)
1 parent 2d37d14 commit dfe9c11

32 files changed: +1264 -840 lines

ml-agents/mlagents/trainers/action_info.py

Lines changed: 2 additions & 1 deletion
@@ -1,6 +1,7 @@
 from typing import NamedTuple, Any, Dict
+import numpy as np

-ActionInfoOutputs = Dict[str, Any]
+ActionInfoOutputs = Dict[str, np.ndarray]


 class ActionInfo(NamedTuple):
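
The alias is now restricted to numpy arrays, which matches how AgentProcessor indexes these outputs later in this commit ("action", "pre_action", "log_probs", "entropy", "learning_rate"). A minimal illustrative sketch of such a dictionary; the shapes and values are hypothetical, not part of the commit:

from typing import Dict

import numpy as np

ActionInfoOutputs = Dict[str, np.ndarray]  # same alias as in action_info.py

# Hypothetical outputs for a batch of two agents with three continuous actions each.
take_action_outputs: ActionInfoOutputs = {
    "action": np.zeros((2, 3), dtype=np.float32),
    "pre_action": np.zeros((2, 3), dtype=np.float32),
    "log_probs": np.zeros((2, 3), dtype=np.float32),
    "entropy": np.array([1.4, 1.6], dtype=np.float32),
    "learning_rate": np.array(3.0e-4, dtype=np.float32),
}
print(take_action_outputs["entropy"].mean())  # the value AgentProcessor reports as Policy/Entropy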
ml-agents/mlagents/trainers/agent_processor.py

Lines changed: 139 additions & 59 deletions

@@ -1,75 +1,155 @@
-from typing import List, Union
+import sys
+from typing import List, Dict
+from collections import defaultdict, Counter

-from mlagents.trainers.buffer import AgentBuffer, BufferException
+from mlagents.trainers.trainer import Trainer
+from mlagents.trainers.trajectory import Trajectory, AgentExperience
+from mlagents.trainers.brain import BrainInfo
+from mlagents.trainers.tf_policy import TFPolicy
+from mlagents.trainers.action_info import ActionInfoOutputs
+from mlagents.trainers.stats import StatsReporter


-class ProcessingBuffer(dict):
+class AgentProcessor:
     """
-    ProcessingBuffer contains a dictionary of AgentBuffer. The AgentBuffers are indexed by agent_id.
+    AgentProcessor contains a dictionary of per-agent trajectory buffers. The buffers are indexed by agent_id.
+    Buffer also contains an update_buffer that corresponds to the buffer used when updating the model.
+    One AgentProcessor should be created per agent group.
     """

-    def __str__(self):
-        return "local_buffers :\n{0}".format(
-            "\n".join(["\tagent {0} :{1}".format(k, str(self[k])) for k in self.keys()])
-        )
-
-    def __getitem__(self, key):
-        if key not in self.keys():
-            self[key] = AgentBuffer()
-        return super().__getitem__(key)
-
-    def reset_local_buffers(self) -> None:
+    def __init__(
+        self,
+        trainer: Trainer,
+        policy: TFPolicy,
+        stats_reporter: StatsReporter,
+        max_trajectory_length: int = sys.maxsize,
+    ):
         """
-        Resets all the local AgentBuffers.
+        Create an AgentProcessor.
+        :param trainer: Trainer instance connected to this AgentProcessor. Trainer is given trajectory
+        when it is finished.
+        :param policy: Policy instance associated with this AgentProcessor.
+        :param max_trajectory_length: Maximum length of a trajectory before it is added to the trainer.
+        :param stats_category: The category under which to write the stats. Usually, this comes from the Trainer.
         """
-        for buf in self.values():
-            buf.reset_agent()
+        self.experience_buffers: Dict[str, List[AgentExperience]] = defaultdict(list)
+        self.last_brain_info: Dict[str, BrainInfo] = {}
+        self.last_take_action_outputs: Dict[str, ActionInfoOutputs] = {}
+        # Note: this is needed until we switch to AgentExperiences as the data input type.
+        # We still need some info from the policy (memories, previous actions)
+        # that really should be gathered by the env-manager.
+        self.policy = policy
+        self.episode_steps: Counter = Counter()
+        self.episode_rewards: Dict[str, float] = defaultdict(float)
+        self.stats_reporter = stats_reporter
+        self.trainer = trainer
+        self.max_trajectory_length = max_trajectory_length

-    def append_to_update_buffer(
+    def add_experiences(
         self,
-        update_buffer: AgentBuffer,
-        agent_id: Union[int, str],
-        key_list: List[str] = None,
-        batch_size: int = None,
-        training_length: int = None,
+        curr_info: BrainInfo,
+        next_info: BrainInfo,
+        take_action_outputs: ActionInfoOutputs,
     ) -> None:
         """
-        Appends the buffer of an agent to the update buffer.
-        :param update_buffer: A reference to an AgentBuffer to append the agent's buffer to
-        :param agent_id: The id of the agent which data will be appended
-        :param key_list: The fields that must be added. If None: all fields will be appended.
-        :param batch_size: The number of elements that must be appended. If None: All of them will be.
-        :param training_length: The length of the samples that must be appended. If None: only takes one element.
+        Adds experiences to each agent's experience history.
+        :param curr_info: current BrainInfo.
+        :param next_info: next BrainInfo.
+        :param take_action_outputs: The outputs of the Policy's get_action method.
         """
-        if key_list is None:
-            key_list = self[agent_id].keys()
-        if not self[agent_id].check_length(key_list):
-            raise BufferException(
-                "The length of the fields {0} for agent {1} were not of same length".format(
-                    key_list, agent_id
-                )
+        if take_action_outputs:
+            self.stats_reporter.add_stat(
+                "Policy/Entropy", take_action_outputs["entropy"].mean()
             )
-        for field_key in key_list:
-            update_buffer[field_key].extend(
-                self[agent_id][field_key].get_batch(
-                    batch_size=batch_size, training_length=training_length
-                )
+            self.stats_reporter.add_stat(
+                "Policy/Learning Rate", take_action_outputs["learning_rate"]
            )

-    def append_all_agent_batch_to_update_buffer(
-        self,
-        update_buffer: AgentBuffer,
-        key_list: List[str] = None,
-        batch_size: int = None,
-        training_length: int = None,
-    ) -> None:
-        """
-        Appends the buffer of all agents to the update buffer.
-        :param key_list: The fields that must be added. If None: all fields will be appended.
-        :param batch_size: The number of elements that must be appended. If None: All of them will be.
-        :param training_length: The length of the samples that must be appended. If None: only takes one element.
-        """
-        for agent_id in self.keys():
-            self.append_to_update_buffer(
-                update_buffer, agent_id, key_list, batch_size, training_length
-            )
+        for agent_id in curr_info.agents:
+            self.last_brain_info[agent_id] = curr_info
+            self.last_take_action_outputs[agent_id] = take_action_outputs
+
+        # Store the environment reward
+        tmp_environment_reward = next_info.rewards
+
+        for next_idx, agent_id in enumerate(next_info.agents):
+            stored_info = self.last_brain_info.get(agent_id, None)
+            if stored_info is not None:
+                stored_take_action_outputs = self.last_take_action_outputs[agent_id]
+                idx = stored_info.agents.index(agent_id)
+                obs = []
+                if not stored_info.local_done[idx]:
+                    for i, _ in enumerate(stored_info.visual_observations):
+                        obs.append(stored_info.visual_observations[i][idx])
+                    if self.policy.use_vec_obs:
+                        obs.append(stored_info.vector_observations[idx])
+                    if self.policy.use_recurrent:
+                        memory = self.policy.retrieve_memories([agent_id])[0, :]
+                    else:
+                        memory = None
+
+                    done = next_info.local_done[next_idx]
+                    max_step = next_info.max_reached[next_idx]
+
+                    # Add the outputs of the last eval
+                    action = stored_take_action_outputs["action"][idx]
+                    if self.policy.use_continuous_act:
+                        action_pre = stored_take_action_outputs["pre_action"][idx]
+                    else:
+                        action_pre = None
+                    action_probs = stored_take_action_outputs["log_probs"][idx]
+                    action_masks = stored_info.action_masks[idx]
+                    prev_action = self.policy.retrieve_previous_action([agent_id])[0, :]
+
+                    experience = AgentExperience(
+                        obs=obs,
+                        reward=tmp_environment_reward[next_idx],
+                        done=done,
+                        action=action,
+                        action_probs=action_probs,
+                        action_pre=action_pre,
+                        action_mask=action_masks,
+                        prev_action=prev_action,
+                        max_step=max_step,
+                        memory=memory,
+                    )
+                    # Add the value outputs if needed
+                    self.experience_buffers[agent_id].append(experience)
+                    self.episode_rewards[agent_id] += tmp_environment_reward[next_idx]
+                if (
+                    next_info.local_done[next_idx]
+                    or (
+                        len(self.experience_buffers[agent_id])
+                        >= self.max_trajectory_length
+                    )
+                ) and len(self.experience_buffers[agent_id]) > 0:
+                    # Make next AgentExperience
+                    next_obs = []
+                    for i, _ in enumerate(next_info.visual_observations):
+                        next_obs.append(next_info.visual_observations[i][next_idx])
+                    if self.policy.use_vec_obs:
+                        next_obs.append(next_info.vector_observations[next_idx])
+                    trajectory = Trajectory(
+                        steps=self.experience_buffers[agent_id],
+                        agent_id=agent_id,
+                        next_obs=next_obs,
+                    )
+                    # This will eventually be replaced with a queue
+                    self.trainer.process_trajectory(trajectory)
+                    self.experience_buffers[agent_id] = []
+                    if next_info.local_done[next_idx]:
+                        self.stats_reporter.add_stat(
+                            "Environment/Cumulative Reward",
+                            self.episode_rewards.get(agent_id, 0),
+                        )
+                        self.stats_reporter.add_stat(
+                            "Environment/Episode Length",
+                            self.episode_steps.get(agent_id, 0),
+                        )
+                        del self.episode_steps[agent_id]
+                        del self.episode_rewards[agent_id]
+                elif not next_info.local_done[next_idx]:
+                    self.episode_steps[agent_id] += 1
+        self.policy.save_previous_action(
+            curr_info.agents, take_action_outputs["action"]
+        )
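
The behaviour this class adds over the old ProcessingBuffer is trajectory cutting: experiences accumulate per agent and are handed to the trainer either when the agent finishes an episode or when max_trajectory_length is hit. The toy below is a simplified, self-contained sketch of just that control flow; the names (ToyProcessor, Step, collect, handle_trajectory) are hypothetical and it deliberately omits observations, memories, and stats reporting.

import sys
from collections import defaultdict
from typing import Dict, List, NamedTuple


class Step(NamedTuple):
    reward: float
    done: bool


class ToyProcessor:
    def __init__(self, handle_trajectory, max_trajectory_length: int = sys.maxsize):
        self.buffers: Dict[str, List[Step]] = defaultdict(list)
        self.handle_trajectory = handle_trajectory  # stand-in for trainer.process_trajectory
        self.max_trajectory_length = max_trajectory_length

    def collect(self, agent_id: str, step: Step) -> None:
        self.buffers[agent_id].append(step)
        buf = self.buffers[agent_id]
        # Flush on episode end or when the trajectory gets too long, mirroring add_experiences.
        if (step.done or len(buf) >= self.max_trajectory_length) and len(buf) > 0:
            self.handle_trajectory(agent_id, buf)
            self.buffers[agent_id] = []


trajectories = []
proc = ToyProcessor(lambda aid, steps: trajectories.append((aid, len(steps))), max_trajectory_length=3)
for t in range(7):
    proc.collect("agent-0", Step(reward=1.0, done=(t == 6)))
print(trajectories)  # [('agent-0', 3), ('agent-0', 3), ('agent-0', 1)]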

ml-agents/mlagents/trainers/buffer.py

Lines changed: 29 additions & 0 deletions
@@ -255,6 +255,35 @@ def truncate(self, max_length: int, sequence_length: int = 1) -> None:
             for _key in self.keys():
                 self[_key] = self[_key][current_length - max_length :]

+    def resequence_and_append(
+        self,
+        target_buffer: "AgentBuffer",
+        key_list: List[str] = None,
+        batch_size: int = None,
+        training_length: int = None,
+    ) -> None:
+        """
+        Takes in a batch size and training length (sequence length), and appends this AgentBuffer to target_buffer
+        properly padded for LSTM use. Optionally, use key_list to restrict which fields are inserted into the new
+        buffer.
+        :param target_buffer: The buffer which to append the samples to.
+        :param key_list: The fields that must be added. If None: all fields will be appended.
+        :param batch_size: The number of elements that must be appended. If None: All of them will be.
+        :param training_length: The length of the samples that must be appended. If None: only takes one element.
+        """
+        if key_list is None:
+            key_list = list(self.keys())
+        if not self.check_length(key_list):
+            raise BufferException(
+                "The length of the fields {0} were not of same length".format(key_list)
+            )
+        for field_key in key_list:
+            target_buffer[field_key].extend(
+                self[field_key].get_batch(
+                    batch_size=batch_size, training_length=training_length
+                )
+            )
+
     @property
     def num_experiences(self) -> int:
         """

ml-agents/mlagents/trainers/components/bc/module.py

Lines changed: 0 additions & 1 deletion
@@ -150,7 +150,6 @@ def _update_batch(
             feed_dict[self.policy.model.prev_action] = mini_batch_demo[
                 "prev_action"
             ]
-
         network_out = self.policy.sess.run(
             list(self.out_dict.values()), feed_dict=feed_dict
         )

ml-agents/mlagents/trainers/components/reward_signals/extrinsic/signal.py

Lines changed: 1 addition & 1 deletion
@@ -31,5 +31,5 @@ def evaluate(
         return RewardSignalResult(scaled_reward, unscaled_reward)

     def evaluate_batch(self, mini_batch: Dict[str, np.array]) -> RewardSignalResult:
-        env_rews = np.array(mini_batch["environment_rewards"])
+        env_rews = np.array(mini_batch["environment_rewards"], dtype=np.float32)
         return RewardSignalResult(self.strength * env_rews, env_rews)
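
The only change is the explicit dtype. Without it, a list of Python floats becomes a float64 array, which can then propagate into the float32 reward math used elsewhere; a quick illustration:

import numpy as np

environment_rewards = [0.1, -1.0, 0.5]  # stand-in for mini_batch["environment_rewards"]
print(np.array(environment_rewards).dtype)                     # float64 by default
print(np.array(environment_rewards, dtype=np.float32).dtype)   # float32, as the trainer expects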

ml-agents/mlagents/trainers/curriculum.py

Lines changed: 7 additions & 6 deletions
@@ -1,6 +1,7 @@
 import os
 import json
 import math
+from typing import Dict, Any, TextIO

 from .exception import CurriculumConfigError, CurriculumLoadingError

@@ -51,14 +52,14 @@ def __init__(self, location):
                )

     @property
-    def lesson_num(self):
+    def lesson_num(self) -> int:
         return self._lesson_num

     @lesson_num.setter
-    def lesson_num(self, lesson_num):
+    def lesson_num(self, lesson_num: int) -> None:
         self._lesson_num = max(0, min(lesson_num, self.max_lesson_num))

-    def increment_lesson(self, measure_val):
+    def increment_lesson(self, measure_val: float) -> bool:
         """
         Increments the lesson number depending on the progress given.
         :param measure_val: Measure of progress (either reward or percentage
@@ -87,7 +88,7 @@ def increment_lesson(self, measure_val):
                return True
        return False

-    def get_config(self, lesson=None):
+    def get_config(self, lesson: int = None) -> Dict[str, Any]:
         """
         Returns reset parameters which correspond to the lesson.
         :param lesson: The lesson you want to get the config of. If None, the
@@ -106,7 +107,7 @@ def get_config(self, lesson=None):
         return config

     @staticmethod
-    def load_curriculum_file(location):
+    def load_curriculum_file(location: str) -> None:
         try:
             with open(location) as data_file:
                 return Curriculum._load_curriculum(data_file)
@@ -120,7 +121,7 @@ def load_curriculum_file(location):
            )

     @staticmethod
-    def _load_curriculum(fp):
+    def _load_curriculum(fp: TextIO) -> None:
         try:
             return json.load(fp)
         except json.decoder.JSONDecodeError as e:
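
The annotations document the intended usage: increment_lesson takes a measured progress value and returns whether the lesson advanced, while get_config returns the reset parameters for the current (or a given) lesson. A usage sketch, assuming ml-agents is installed; the curriculum JSON written below follows the usual measure/thresholds/min_lesson_length/signal_smoothing/parameters layout and is an assumption for illustration, not part of this commit.

import json
import tempfile

from mlagents.trainers.curriculum import Curriculum

# Hypothetical curriculum definition written to a temporary file.
curriculum_config = {
    "measure": "progress",
    "thresholds": [0.1, 0.3],
    "min_lesson_length": 100,
    "signal_smoothing": True,
    "parameters": {"wall_height": [1.5, 2.0, 4.0]},
}
with tempfile.NamedTemporaryFile("w", suffix=".json", delete=False) as f:
    json.dump(curriculum_config, f)
    location = f.name

curriculum = Curriculum(location)
print(curriculum.lesson_num)                  # starts at 0
advanced = curriculum.increment_lesson(0.5)   # True once progress clears the first threshold
print(advanced, curriculum.lesson_num)
print(curriculum.get_config())                # reset parameters for the current lesson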

ml-agents/mlagents/trainers/demo_loader.py

Lines changed: 14 additions & 18 deletions
@@ -4,7 +4,6 @@
 from typing import List, Tuple
 import numpy as np
 from mlagents.trainers.buffer import AgentBuffer
-from mlagents.trainers.agent_processor import ProcessingBuffer
 from mlagents.trainers.brain import BrainParameters, BrainInfo
 from mlagents_envs.communicator_objects.agent_info_action_pair_pb2 import (
     AgentInfoActionPairProto,
@@ -27,8 +26,8 @@ def make_demo_buffer(
     sequence_length: int,
 ) -> AgentBuffer:
     # Create and populate buffer using experiences
-    demo_process_buffer = ProcessingBuffer()
-    demo_buffer = AgentBuffer()
+    demo_raw_buffer = AgentBuffer()
+    demo_processed_buffer = AgentBuffer()
     for idx, experience in enumerate(pair_infos):
         if idx > len(pair_infos) - 2:
             break
@@ -47,30 +46,27 @@ def make_demo_buffer(
         previous_action = np.array(
             pair_infos[idx - 1].action_info.vector_actions, dtype=np.float32
         )
-        demo_process_buffer[0].last_brain_info = current_brain_info
-        demo_process_buffer[0]["done"].append(next_brain_info.local_done[0])
-        demo_process_buffer[0]["rewards"].append(next_brain_info.rewards[0])
+        demo_raw_buffer["done"].append(next_brain_info.local_done[0])
+        demo_raw_buffer["rewards"].append(next_brain_info.rewards[0])
         for i in range(brain_params.number_visual_observations):
-            demo_process_buffer[0]["visual_obs%d" % i].append(
+            demo_raw_buffer["visual_obs%d" % i].append(
                 current_brain_info.visual_observations[i][0]
             )
         if brain_params.vector_observation_space_size > 0:
-            demo_process_buffer[0]["vector_obs"].append(
+            demo_raw_buffer["vector_obs"].append(
                 current_brain_info.vector_observations[0]
             )
-        demo_process_buffer[0]["actions"].append(
-            current_pair_info.action_info.vector_actions
-        )
-        demo_process_buffer[0]["prev_action"].append(previous_action)
+        demo_raw_buffer["actions"].append(current_pair_info.action_info.vector_actions)
+        demo_raw_buffer["prev_action"].append(previous_action)
         if next_brain_info.local_done[0]:
-            demo_process_buffer.append_to_update_buffer(
-                demo_buffer, 0, batch_size=None, training_length=sequence_length
+            demo_raw_buffer.resequence_and_append(
+                demo_processed_buffer, batch_size=None, training_length=sequence_length
             )
-            demo_process_buffer.reset_local_buffers()
-    demo_process_buffer.append_to_update_buffer(
-        demo_buffer, 0, batch_size=None, training_length=sequence_length
+            demo_raw_buffer.reset_agent()
+    demo_raw_buffer.resequence_and_append(
+        demo_processed_buffer, batch_size=None, training_length=sequence_length
     )
-    return demo_buffer
+    return demo_processed_buffer


 @timed
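
The net effect is that demonstration loading no longer needs a ProcessingBuffer keyed by a placeholder agent id 0: per-step fields go straight into a single AgentBuffer and get resequenced at each episode boundary. The skeleton below restates that loop shape with hypothetical step tuples; it is a sketch of the pattern, not the module's API.

from mlagents.trainers.buffer import AgentBuffer


def fill_buffer(steps, sequence_length):
    # steps: iterable of (obs, action, reward, done) tuples -- a hypothetical stand-in
    # for the AgentInfoActionPairProto pairs that make_demo_buffer actually consumes.
    raw = AgentBuffer()
    processed = AgentBuffer()
    for obs, action, reward, done in steps:
        raw["vector_obs"].append(obs)
        raw["actions"].append(action)
        raw["rewards"].append(reward)
        raw["done"].append(done)
        if done:
            # Episode boundary: pad this episode out to whole sequences, then start fresh.
            raw.resequence_and_append(
                processed, batch_size=None, training_length=sequence_length
            )
            raw.reset_agent()
    # Flush whatever remains of the final, possibly unfinished episode.
    raw.resequence_and_append(
        processed, batch_size=None, training_length=sequence_length
    )
    return processed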
