
Move add_experiences out of trainer, add Trajectories #3067

Merged: 79 commits, Dec 19, 2019

Commits
9a16838  Split buffer into two buffers (PPO works)  (Nov 22, 2019)
55b2918  buffer split for SAC  (Nov 25, 2019)
38f5795  Fix buffer tests and truncate  (Nov 25, 2019)
453dd4c  Fix RL tests  (Nov 25, 2019)
b00f779  Fix demo loader and remaining tests  (Nov 25, 2019)
3b7191b  Remove MANIFEST file  (Nov 25, 2019)
9c47678  Add type hints to Buffer  (Nov 25, 2019)
a57a220  Rename append_update_buffer to append_to_update_buffer  (Nov 25, 2019)
efe29c8  Merge branch 'develop' into develop-splitbuffer  (Nov 26, 2019)
f5f9598  Non-working commit  (Nov 26, 2019)
f3459eb  Revert buffer for now  (Nov 26, 2019)
0b603c7  Another nonworking commit  (Nov 27, 2019)
ea6e79d  Runs but doesn't do anything yet  (Nov 27, 2019)
a264b48  Merge branch 'develop' into develop-agentprocessor  (Dec 4, 2019)
5e4f1bc  Use ProcessingBuffer in AgentProcessor  (Dec 4, 2019)
a5ac988  Convert to trajectory  (Dec 4, 2019)
a2e33e8  Looks like it's training  (Dec 5, 2019)
7004db8  Fix memory leak  (Dec 5, 2019)
0863ff5  Attempt reward reporting  (Dec 5, 2019)
88feb1b  Stats reporting is working  (Dec 5, 2019)
d6fe367  Clean up some stuff  (Dec 5, 2019)
8e43ecd  No longer using ProcessingBuffer for PPO  (Dec 5, 2019)
2b32d61  Move trajectory and related functions to trajectory.py  (Dec 5, 2019)
991be2c  Add back max_step logic  (Dec 6, 2019)
9b7969b  Merge branch 'master' of github.com:Unity-Technologies/ml-agents into…  (Dec 6, 2019)
5efd4e9  Remove epsilon  (Dec 6, 2019)
3bfe3df  Migrate SAC  (Dec 6, 2019)
f7649ae  Remove dead code  (Dec 6, 2019)
6b40d00  Move some common logic to buffer class  (Dec 6, 2019)
bf59521  Kill the ProcessingBuffer  (Dec 6, 2019)
68984df  Convert BC (warning) might be broken  (Dec 6, 2019)
12d4467  Fix some bugs for visual obs  (Dec 6, 2019)
2322150  Fixes for recurrent  (Dec 7, 2019)
2d084ed  Better decoupling for agent processor  (Dec 7, 2019)
295e3a0  Fix some of the tests  (Dec 7, 2019)
9334bb6  Add test for trajectory  (Dec 9, 2019)
93060b5  Fix BC and tests  (Dec 9, 2019)
3a3eb5b  Lots of test fixes  (Dec 9, 2019)
4c5bd73  Remove BootstrapExperience  (Dec 9, 2019)
1c95992  Move agent_id to Trajectory  (Dec 9, 2019)
a48e7f7  Add back next_obs  (Dec 10, 2019)
0053517  Fix test again  (Dec 10, 2019)
29797b1  Fix PPO value tests  (Dec 10, 2019)
e9dcdd9  Properly report value estimates and episode length  (Dec 10, 2019)
68a3b3d  Fix np float32 errors  (Dec 10, 2019)
6298731  Fix one more np float32 issue  (Dec 10, 2019)
cd4c09c  Merge branch 'master' into develop-agentprocessor  (Dec 10, 2019)
1a545c1  Fix some import errors  (Dec 10, 2019)
9452806  Make conversion methods part of NamedTuples  (Dec 11, 2019)
1052ad5  Add way to check if trajectory is done or max_reached  (Dec 11, 2019)
94c5f8c  Add docstring  (Dec 11, 2019)
866bf9c  Address AgentProcessor comments  (Dec 11, 2019)
03bd3e4  Allow None max steps  (Dec 12, 2019)
153368c  Merge branch 'master' into develop-agentprocessor  (Dec 12, 2019)
fd1312b  Fix tests  (Dec 12, 2019)
1a7fffd  Fix some mypy issues and remove unused code  (Dec 12, 2019)
d1b30b3  Fix numpy import  (Dec 12, 2019)
d9abe26  Remove defaultdict that didn't make sense  (Dec 12, 2019)
f090033  Fixed value estimate bug  (Dec 12, 2019)
6a1f275  Fix mypy issue  (Dec 12, 2019)
0f08718  Add stats reporter class and re-enable missing stats (#3076)  (Dec 13, 2019)
80a3359  Revert gitignore  (Dec 13, 2019)
a938d61  Normalize based on number of elements  (Dec 14, 2019)
63d6dd0  Add comment  (Dec 16, 2019)
82e8191  Merge branch 'master' into develop-agentprocessor  (Dec 16, 2019)
9a83b66  New way to update mean and var  (Dec 17, 2019)
c827581  Merge branch 'master' into develop-agentprocessor  (Dec 18, 2019)
89f9375  Fix tests  (Dec 18, 2019)
212cc3b  Add comments for normalization  (Dec 18, 2019)
10dcc1b  Remove dead code  (Dec 18, 2019)
2d72b06  Add type hints to rl_trainer  (Dec 19, 2019)
a0c76c7  Cleanup agent_processor  (Dec 19, 2019)
b1060e5  Make file creation safer  (Dec 19, 2019)
70f91af  Fix error message  (Dec 19, 2019)
8a44fc5  Clean up trajectory and splitobs  (Dec 19, 2019)
919a00b  Use .get for trainer_parameters  (Dec 19, 2019)
7122d39  Add test for normalization  (Dec 19, 2019)
cb1ec87  Float32 array in test  (Dec 19, 2019)
9d554bb  Fix comment in test  (Dec 19, 2019)
3 changes: 2 additions & 1 deletion ml-agents/mlagents/trainers/action_info.py
@@ -1,6 +1,7 @@
from typing import NamedTuple, Any, Dict
import numpy as np

ActionInfoOutputs = Dict[str, Any]
ActionInfoOutputs = Dict[str, np.ndarray]


class ActionInfo(NamedTuple):
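For context, a rough sketch (hypothetical shapes and values, not from this PR) of what a take_action_outputs dict looks like under the narrowed ActionInfoOutputs type; the keys mirror the ones read by AgentProcessor below:

    import numpy as np

    take_action_outputs = {
        "action": np.zeros((2, 3), dtype=np.float32),       # one row per agent
        "pre_action": np.zeros((2, 3), dtype=np.float32),    # continuous-control only
        "log_probs": np.zeros((2, 3), dtype=np.float32),
        "entropy": np.zeros(2, dtype=np.float32),
        "learning_rate": np.float32(3e-4),                   # scalar stat, illustrative value
    }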
198 changes: 139 additions & 59 deletions ml-agents/mlagents/trainers/agent_processor.py
@@ -1,75 +1,155 @@
from typing import List, Union
import sys
from typing import List, Dict
from collections import defaultdict, Counter

from mlagents.trainers.buffer import AgentBuffer, BufferException
from mlagents.trainers.trainer import Trainer
from mlagents.trainers.trajectory import Trajectory, AgentExperience
from mlagents.trainers.brain import BrainInfo
from mlagents.trainers.tf_policy import TFPolicy
from mlagents.trainers.action_info import ActionInfoOutputs
from mlagents.trainers.stats import StatsReporter


class ProcessingBuffer(dict):
class AgentProcessor:
"""
ProcessingBuffer contains a dictionary of AgentBuffer. The AgentBuffers are indexed by agent_id.
AgentProcessor contains a dictionary of per-agent trajectory buffers. The buffers are indexed by agent_id.
Buffer also contains an update_buffer that corresponds to the buffer used when updating the model.
One AgentProcessor should be created per agent group.
"""

def __str__(self):
return "local_buffers :\n{0}".format(
"\n".join(["\tagent {0} :{1}".format(k, str(self[k])) for k in self.keys()])
)

def __getitem__(self, key):
if key not in self.keys():
self[key] = AgentBuffer()
return super().__getitem__(key)

def reset_local_buffers(self) -> None:
def __init__(
self,
trainer: Trainer,
policy: TFPolicy,
stats_reporter: StatsReporter,
max_trajectory_length: int = sys.maxsize,
):
"""
Resets all the local AgentBuffers.
Create an AgentProcessor.
:param trainer: Trainer instance connected to this AgentProcessor. The trainer is given each Trajectory
when it is finished.
:param policy: Policy instance associated with this AgentProcessor.
:param max_trajectory_length: Maximum length of a trajectory before it is added to the trainer.
:param stats_reporter: StatsReporter instance used to write the stats. Its category usually comes from the Trainer.
"""
for buf in self.values():
buf.reset_agent()
self.experience_buffers: Dict[str, List[AgentExperience]] = defaultdict(list)
self.last_brain_info: Dict[str, BrainInfo] = {}
Review comment (Contributor): Is this the last_brain_info per agent?

Reply (Author): Yes.

self.last_take_action_outputs: Dict[str, ActionInfoOutputs] = {}
# Note: this is needed until we switch to AgentExperiences as the data input type.
# We still need some info from the policy (memories, previous actions)
# that really should be gathered by the env-manager.
self.policy = policy
self.episode_steps: Counter = Counter()
self.episode_rewards: Dict[str, float] = defaultdict(float)
self.stats_reporter = stats_reporter
self.trainer = trainer
self.max_trajectory_length = max_trajectory_length

def append_to_update_buffer(
def add_experiences(
self,
update_buffer: AgentBuffer,
agent_id: Union[int, str],
key_list: List[str] = None,
batch_size: int = None,
training_length: int = None,
curr_info: BrainInfo,
next_info: BrainInfo,
take_action_outputs: ActionInfoOutputs,
) -> None:
"""
Appends the buffer of an agent to the update buffer.
:param update_buffer: A reference to an AgentBuffer to append the agent's buffer to
:param agent_id: The id of the agent which data will be appended
:param key_list: The fields that must be added. If None: all fields will be appended.
:param batch_size: The number of elements that must be appended. If None: All of them will be.
:param training_length: The length of the samples that must be appended. If None: only takes one element.
Adds experiences to each agent's experience history.
:param curr_info: current BrainInfo.
:param next_info: next BrainInfo.
:param take_action_outputs: The outputs of the Policy's get_action method.
"""
if key_list is None:
key_list = self[agent_id].keys()
if not self[agent_id].check_length(key_list):
raise BufferException(
"The length of the fields {0} for agent {1} were not of same length".format(
key_list, agent_id
)
if take_action_outputs:
self.stats_reporter.add_stat(
"Policy/Entropy", take_action_outputs["entropy"].mean()
)
for field_key in key_list:
update_buffer[field_key].extend(
self[agent_id][field_key].get_batch(
batch_size=batch_size, training_length=training_length
)
self.stats_reporter.add_stat(
"Policy/Learning Rate", take_action_outputs["learning_rate"]
)

def append_all_agent_batch_to_update_buffer(
self,
update_buffer: AgentBuffer,
key_list: List[str] = None,
batch_size: int = None,
training_length: int = None,
) -> None:
"""
Appends the buffer of all agents to the update buffer.
:param key_list: The fields that must be added. If None: all fields will be appended.
:param batch_size: The number of elements that must be appended. If None: All of them will be.
:param training_length: The length of the samples that must be appended. If None: only takes one element.
"""
for agent_id in self.keys():
self.append_to_update_buffer(
update_buffer, agent_id, key_list, batch_size, training_length
)
for agent_id in curr_info.agents:
self.last_brain_info[agent_id] = curr_info
self.last_take_action_outputs[agent_id] = take_action_outputs

# Store the environment reward
tmp_environment_reward = next_info.rewards

for next_idx, agent_id in enumerate(next_info.agents):
stored_info = self.last_brain_info.get(agent_id, None)
if stored_info is not None:
stored_take_action_outputs = self.last_take_action_outputs[agent_id]
idx = stored_info.agents.index(agent_id)
Review comment (Contributor): Little worried about the O(N) lookup here since we're doing it N times.

Review comment (chriselion, Dec 18, 2019): You might want to do something like

    agent_id_to_index = {agent_id: i for i, agent_id in enumerate(stored_info.agents)}

outside the loop.

Reply (Author): The tricky bit here is that the stored_info might be different per iteration of the loop (some agents in next_info might not have been in the previous info and vice versa), so the index might change as well. To make matters worse, we do this indexing twice (once here, and once in the LL-Python API to convert BatchedState -> BrainInfo).

Long-term we will be removing BrainInfo (today: BatchedState -> BrainInfo -> AgentExperience; end goal: BatchedState -> AgentExperience), so I think we will be able to get away with simply adding to trajectories agent-by-agent. We won't have to store the stored_info anymore, and we will only have to do the indexing once.

obs = []
if not stored_info.local_done[idx]:
for i, _ in enumerate(stored_info.visual_observations):
obs.append(stored_info.visual_observations[i][idx])
if self.policy.use_vec_obs:
obs.append(stored_info.vector_observations[idx])
if self.policy.use_recurrent:
memory = self.policy.retrieve_memories([agent_id])[0, :]
else:
memory = None

done = next_info.local_done[next_idx]
max_step = next_info.max_reached[next_idx]

# Add the outputs of the last eval
action = stored_take_action_outputs["action"][idx]
if self.policy.use_continuous_act:
action_pre = stored_take_action_outputs["pre_action"][idx]
else:
action_pre = None
action_probs = stored_take_action_outputs["log_probs"][idx]
action_masks = stored_info.action_masks[idx]
prev_action = self.policy.retrieve_previous_action([agent_id])[0, :]

experience = AgentExperience(
obs=obs,
reward=tmp_environment_reward[next_idx],
done=done,
action=action,
action_probs=action_probs,
action_pre=action_pre,
action_mask=action_masks,
prev_action=prev_action,
max_step=max_step,
memory=memory,
)
# Add the value outputs if needed
self.experience_buffers[agent_id].append(experience)
self.episode_rewards[agent_id] += tmp_environment_reward[next_idx]
if (
next_info.local_done[next_idx]
or (
len(self.experience_buffers[agent_id])
>= self.max_trajectory_length
)
) and len(self.experience_buffers[agent_id]) > 0:
# Make next AgentExperience
next_obs = []
for i, _ in enumerate(next_info.visual_observations):
next_obs.append(next_info.visual_observations[i][next_idx])
if self.policy.use_vec_obs:
next_obs.append(next_info.vector_observations[next_idx])
trajectory = Trajectory(
steps=self.experience_buffers[agent_id],
agent_id=agent_id,
next_obs=next_obs,
)
# This will eventually be replaced with a queue
self.trainer.process_trajectory(trajectory)
self.experience_buffers[agent_id] = []
if next_info.local_done[next_idx]:
self.stats_reporter.add_stat(
"Environment/Cumulative Reward",
self.episode_rewards.get(agent_id, 0),
)
self.stats_reporter.add_stat(
"Environment/Episode Length",
self.episode_steps.get(agent_id, 0),
)
del self.episode_steps[agent_id]
del self.episode_rewards[agent_id]
elif not next_info.local_done[next_idx]:
self.episode_steps[agent_id] += 1
self.policy.save_previous_action(
curr_info.agents, take_action_outputs["action"]
)
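
For orientation, a minimal sketch (not part of this diff) of how an AgentProcessor is wired between an environment loop and a trainer. The helper names and the assumption that the trainer exposes a StatsReporter are hypothetical; the constructor arguments and the add_experiences call follow the code above:

    from mlagents.trainers.agent_processor import AgentProcessor

    def make_processor(trainer, policy, time_horizon=64):
        # One AgentProcessor per agent group; trajectories longer than time_horizon
        # are cut and handed to the trainer early.
        return AgentProcessor(
            trainer=trainer,
            policy=policy,
            stats_reporter=trainer.stats_reporter,  # assumption: the trainer owns a StatsReporter
            max_trajectory_length=time_horizon,
        )

    def on_step(processor, curr_info, next_info, take_action_outputs):
        # Called once per environment step. When an agent is done (or hits
        # max_trajectory_length), the processor assembles a Trajectory and passes it
        # to trainer.process_trajectory().
        processor.add_experiences(curr_info, next_info, take_action_outputs)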
29 changes: 29 additions & 0 deletions ml-agents/mlagents/trainers/buffer.py
@@ -255,6 +255,35 @@ def truncate(self, max_length: int, sequence_length: int = 1) -> None:
for _key in self.keys():
self[_key] = self[_key][current_length - max_length :]

def resequence_and_append(
self,
target_buffer: "AgentBuffer",
Review comment (Contributor): Why do we have this magic string here? Will it cause problems if a user names a brain AgentBuffer?

Reply (Author): This isn't a magic string - it's a type annotation :P -> https://mypy.readthedocs.io/en/latest/cheat_sheet_py3.html#miscellaneous

key_list: List[str] = None,
batch_size: int = None,
training_length: int = None,
) -> None:
"""
Takes in a batch size and training length (sequence length), and appends this AgentBuffer to target_buffer
properly padded for LSTM use. Optionally, use key_list to restrict which fields are inserted into the new
buffer.
:param target_buffer: The buffer to which to append the samples.
:param key_list: The fields that must be added. If None: all fields will be appended.
:param batch_size: The number of elements that must be appended. If None: All of them will be.
:param training_length: The length of the samples that must be appended. If None: only takes one element.
"""
if key_list is None:
key_list = list(self.keys())
if not self.check_length(key_list):
raise BufferException(
"The length of the fields {0} were not of same length".format(key_list)
)
for field_key in key_list:
target_buffer[field_key].extend(
self[field_key].get_batch(
batch_size=batch_size, training_length=training_length
)
)

@property
def num_experiences(self) -> int:
"""
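A small usage sketch of the new resequence_and_append (field names are illustrative, not prescribed): a per-agent buffer is appended to an update buffer, padded so each appended sample forms a full sequence for recurrent (LSTM) training:

    import numpy as np
    from mlagents.trainers.buffer import AgentBuffer

    agent_buffer = AgentBuffer()
    for reward in (0.0, 0.0, 1.0):
        agent_buffer["rewards"].append(reward)
        agent_buffer["vector_obs"].append(np.zeros(4, dtype=np.float32))

    update_buffer = AgentBuffer()
    agent_buffer.resequence_and_append(
        update_buffer,
        batch_size=None,     # append every element
        training_length=2,   # pad into full sequences of length 2
    )
    # Expected: 4 experiences (one padding element plus the three real ones)
    print(update_buffer.num_experiences)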
1 change: 0 additions & 1 deletion ml-agents/mlagents/trainers/components/bc/module.py
@@ -150,7 +150,6 @@ def _update_batch(
feed_dict[self.policy.model.prev_action] = mini_batch_demo[
"prev_action"
]

network_out = self.policy.sess.run(
list(self.out_dict.values()), feed_dict=feed_dict
)
@@ -31,5 +31,5 @@ def evaluate(
return RewardSignalResult(scaled_reward, unscaled_reward)

def evaluate_batch(self, mini_batch: Dict[str, np.array]) -> RewardSignalResult:
env_rews = np.array(mini_batch["environment_rewards"])
env_rews = np.array(mini_batch["environment_rewards"], dtype=np.float32)
return RewardSignalResult(self.strength * env_rews, env_rews)
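
The explicit dtype matters because NumPy builds float64 arrays from Python floats by default, while the rest of the training code works in float32 (see the "Fix np float32 errors" commits above); a quick standalone illustration:

    import numpy as np

    rewards = [0.1, 0.2, 0.3]
    print(np.array(rewards).dtype)                    # float64 (NumPy default)
    print(np.array(rewards, dtype=np.float32).dtype)  # float32, matching the training tensors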
13 changes: 7 additions & 6 deletions ml-agents/mlagents/trainers/curriculum.py
@@ -1,6 +1,7 @@
import os
import json
import math
from typing import Dict, Any, TextIO

from .exception import CurriculumConfigError, CurriculumLoadingError

@@ -51,14 +52,14 @@ def __init__(self, location):
)

@property
def lesson_num(self):
def lesson_num(self) -> int:
return self._lesson_num

@lesson_num.setter
def lesson_num(self, lesson_num):
def lesson_num(self, lesson_num: int) -> None:
self._lesson_num = max(0, min(lesson_num, self.max_lesson_num))

def increment_lesson(self, measure_val):
def increment_lesson(self, measure_val: float) -> bool:
"""
Increments the lesson number depending on the progress given.
:param measure_val: Measure of progress (either reward or percentage
@@ -87,7 +88,7 @@ def increment_lesson(self, measure_val):
return True
return False

def get_config(self, lesson=None):
def get_config(self, lesson: int = None) -> Dict[str, Any]:
"""
Returns reset parameters which correspond to the lesson.
:param lesson: The lesson you want to get the config of. If None, the
@@ -106,7 +107,7 @@ def get_config(self, lesson=None):
return config

@staticmethod
def load_curriculum_file(location):
def load_curriculum_file(location: str) -> None:
try:
with open(location) as data_file:
return Curriculum._load_curriculum(data_file)
@@ -120,7 +121,7 @@ def load_curriculum_file(location):
)

@staticmethod
def _load_curriculum(fp):
def _load_curriculum(fp: TextIO) -> None:
try:
return json.load(fp)
except json.decoder.JSONDecodeError as e:
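For reference, the lesson_num setter above clamps the requested lesson into [0, max_lesson_num]; a standalone illustration of the same expression, assuming max_lesson_num is 3:

    # max(0, min(lesson_num, max_lesson_num)) with max_lesson_num == 3
    for requested in (-5, 2, 99):
        print(max(0, min(requested, 3)))   # prints 0, 2, 3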
32 changes: 14 additions & 18 deletions ml-agents/mlagents/trainers/demo_loader.py
@@ -4,7 +4,6 @@
from typing import List, Tuple
import numpy as np
from mlagents.trainers.buffer import AgentBuffer
from mlagents.trainers.agent_processor import ProcessingBuffer
from mlagents.trainers.brain import BrainParameters, BrainInfo
from mlagents_envs.communicator_objects.agent_info_action_pair_pb2 import (
AgentInfoActionPairProto,
@@ -27,8 +26,8 @@ def make_demo_buffer(
sequence_length: int,
) -> AgentBuffer:
# Create and populate buffer using experiences
demo_process_buffer = ProcessingBuffer()
demo_buffer = AgentBuffer()
demo_raw_buffer = AgentBuffer()
demo_processed_buffer = AgentBuffer()
for idx, experience in enumerate(pair_infos):
if idx > len(pair_infos) - 2:
break
@@ -47,30 +46,27 @@
previous_action = np.array(
pair_infos[idx - 1].action_info.vector_actions, dtype=np.float32
)
demo_process_buffer[0].last_brain_info = current_brain_info
demo_process_buffer[0]["done"].append(next_brain_info.local_done[0])
demo_process_buffer[0]["rewards"].append(next_brain_info.rewards[0])
demo_raw_buffer["done"].append(next_brain_info.local_done[0])
demo_raw_buffer["rewards"].append(next_brain_info.rewards[0])
for i in range(brain_params.number_visual_observations):
demo_process_buffer[0]["visual_obs%d" % i].append(
demo_raw_buffer["visual_obs%d" % i].append(
current_brain_info.visual_observations[i][0]
)
if brain_params.vector_observation_space_size > 0:
demo_process_buffer[0]["vector_obs"].append(
demo_raw_buffer["vector_obs"].append(
current_brain_info.vector_observations[0]
)
demo_process_buffer[0]["actions"].append(
current_pair_info.action_info.vector_actions
)
demo_process_buffer[0]["prev_action"].append(previous_action)
demo_raw_buffer["actions"].append(current_pair_info.action_info.vector_actions)
demo_raw_buffer["prev_action"].append(previous_action)
if next_brain_info.local_done[0]:
demo_process_buffer.append_to_update_buffer(
demo_buffer, 0, batch_size=None, training_length=sequence_length
demo_raw_buffer.resequence_and_append(
demo_processed_buffer, batch_size=None, training_length=sequence_length
)
demo_process_buffer.reset_local_buffers()
demo_process_buffer.append_to_update_buffer(
demo_buffer, 0, batch_size=None, training_length=sequence_length
demo_raw_buffer.reset_agent()
demo_raw_buffer.resequence_and_append(
demo_processed_buffer, batch_size=None, training_length=sequence_length
)
return demo_buffer
return demo_processed_buffer


@timed