
Commit 0f08718

Authored by Ervin T
Add stats reporter class and re-enable missing stats (#3076)
1 parent 6a1f275 commit 0f08718

18 files changed: +306, -104 lines

ml-agents/mlagents/trainers/agent_processor.py

Lines changed: 28 additions & 12 deletions
@@ -7,6 +7,7 @@
 from mlagents.trainers.brain import BrainInfo
 from mlagents.trainers.tf_policy import TFPolicy
 from mlagents.trainers.action_info import ActionInfoOutputs
+from mlagents.trainers.stats import StatsReporter


 class AgentProcessor:
@@ -16,24 +17,31 @@ class AgentProcessor:
     One AgentProcessor should be created per agent group.
     """

-    def __init__(self, trainer: Trainer, policy: TFPolicy, max_trajectory_length: int):
+    def __init__(
+        self,
+        trainer: Trainer,
+        policy: TFPolicy,
+        max_trajectory_length: int,
+        stats_reporter: StatsReporter,
+    ):
         """
         Create an AgentProcessor.
         :param trainer: Trainer instance connected to this AgentProcessor. Trainer is given trajectory
         when it is finished.
         :param policy: Policy instance associated with this AgentProcessor.
         :param max_trajectory_length: Maximum length of a trajectory before it is added to the trainer.
+        :param stats_category: The category under which to write the stats. Usually, this comes from the Trainer.
         """
         self.experience_buffers: Dict[str, List[AgentExperience]] = defaultdict(list)
         self.last_brain_info: Dict[str, BrainInfo] = {}
         self.last_take_action_outputs: Dict[str, ActionInfoOutputs] = {}
-        self.stats: Dict[str, List[float]] = defaultdict(list)
         # Note: this is needed until we switch to AgentExperiences as the data input type.
         # We still need some info from the policy (memories, previous actions)
         # that really should be gathered by the env-manager.
         self.policy = policy
         self.episode_steps: Counter = Counter()
-        self.episode_rewards: Dict[str, float] = defaultdict(lambda: 0.0)
+        self.episode_rewards: Dict[str, float] = defaultdict(float)
+        self.stats_reporter = stats_reporter
         if max_trajectory_length:
             self.max_trajectory_length = max_trajectory_length
             self.ignore_max_length = False
@@ -55,12 +63,12 @@ def add_experiences(
         :param take_action_outputs: The outputs of the Policy's get_action method.
         """
         if take_action_outputs:
-            self.stats["Policy/Entropy"].append(take_action_outputs["entropy"].mean())
-            self.stats["Policy/Learning Rate"].append(
-                take_action_outputs["learning_rate"]
+            self.stats_reporter.add_stat(
+                "Policy/Entropy", take_action_outputs["entropy"].mean()
+            )
+            self.stats_reporter.add_stat(
+                "Policy/Learning Rate", take_action_outputs["learning_rate"]
             )
-            for name, values in take_action_outputs["value_heads"].items():
-                self.stats[name].append(np.mean(values))

         for agent_id in curr_info.agents:
             self.last_brain_info[agent_id] = curr_info
@@ -99,7 +107,6 @@ def add_experiences(
                 action_masks = stored_info.action_masks[idx]
                 prev_action = self.policy.retrieve_previous_action([agent_id])[0, :]

-                values = stored_take_action_outputs["value_heads"]
                 experience = AgentExperience(
                     obs=obs,
                     reward=tmp_environment_reward[next_idx],
@@ -114,7 +121,7 @@
                 )
                 # Add the value outputs if needed
                 self.experience_buffers[agent_id].append(experience)
-
+                self.episode_rewards[agent_id] += tmp_environment_reward[next_idx]
                 if (
                     next_info.local_done[next_idx]
                     or (
@@ -137,9 +144,18 @@ def add_experiences(
                     # This will eventually be replaced with a queue
                     self.trainer.process_trajectory(trajectory)
                     self.experience_buffers[agent_id] = []
+                    if next_info.local_done[next_idx]:
+                        self.stats_reporter.add_stat(
+                            "Environment/Cumulative Reward",
+                            self.episode_rewards.get(agent_id, 0),
+                        )
+                        self.stats_reporter.add_stat(
+                            "Environment/Episode Length",
+                            self.episode_steps.get(agent_id, 0),
+                        )
+                        del self.episode_steps[agent_id]
+                        del self.episode_rewards[agent_id]
                 elif not next_info.local_done[next_idx]:
-                    if agent_id not in self.episode_steps:
-                        self.episode_steps[agent_id] = 0
                     self.episode_steps[agent_id] += 1
                 self.policy.save_previous_action(
                     curr_info.agents, take_action_outputs["action"]
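
The processor now does its per-episode bookkeeping against the shared StatsReporter instead of a local stats dict: rewards and step counts accumulate per agent and are reported when that agent's episode ends. A stripped-down sketch of that pattern, assuming ml-agents at this commit is installed; the reporter category, the standalone dictionaries, and record_step are placeholders standing in for the processor's real state and add_experiences logic:

from collections import Counter, defaultdict
from typing import Dict

from mlagents.trainers.stats import StatsReporter

reporter = StatsReporter("SomeBehavior")  # placeholder category
episode_rewards: Dict[str, float] = defaultdict(float)
episode_steps: Counter = Counter()

def record_step(agent_id: str, reward: float, done: bool) -> None:
    # Accumulate per-agent reward, as add_experiences() now does on every step.
    episode_rewards[agent_id] += reward
    if not done:
        episode_steps[agent_id] += 1
        return
    # On episode end, push the totals to the reporter and reset per-agent state.
    reporter.add_stat("Environment/Cumulative Reward", episode_rewards.get(agent_id, 0))
    reporter.add_stat("Environment/Episode Length", episode_steps.get(agent_id, 0))
    episode_rewards.pop(agent_id, None)
    episode_steps.pop(agent_id, None)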

ml-agents/mlagents/trainers/curriculum.py

Lines changed: 7 additions & 6 deletions
@@ -1,6 +1,7 @@
 import os
 import json
 import math
+from typing import Dict, Any, TextIO

 from .exception import CurriculumConfigError, CurriculumLoadingError

@@ -51,14 +52,14 @@ def __init__(self, location):
         )

     @property
-    def lesson_num(self):
+    def lesson_num(self) -> int:
         return self._lesson_num

     @lesson_num.setter
-    def lesson_num(self, lesson_num):
+    def lesson_num(self, lesson_num: int) -> None:
         self._lesson_num = max(0, min(lesson_num, self.max_lesson_num))

-    def increment_lesson(self, measure_val):
+    def increment_lesson(self, measure_val: float) -> bool:
         """
         Increments the lesson number depending on the progress given.
         :param measure_val: Measure of progress (either reward or percentage
@@ -87,7 +88,7 @@ def increment_lesson(self, measure_val):
             return True
         return False

-    def get_config(self, lesson=None):
+    def get_config(self, lesson: int = None) -> Dict[str, Any]:
         """
         Returns reset parameters which correspond to the lesson.
         :param lesson: The lesson you want to get the config of. If None, the
@@ -106,7 +107,7 @@ def get_config(self, lesson=None):
         return config

     @staticmethod
-    def load_curriculum_file(location):
+    def load_curriculum_file(location: str) -> None:
         try:
             with open(location) as data_file:
                 return Curriculum._load_curriculum(data_file)
@@ -120,7 +121,7 @@ def load_curriculum_file(location):
             )

     @staticmethod
-    def _load_curriculum(fp):
+    def _load_curriculum(fp: TextIO) -> None:
         try:
             return json.load(fp)
         except json.decoder.JSONDecodeError as e:

ml-agents/mlagents/trainers/learn.py

Lines changed: 6 additions & 0 deletions
@@ -17,6 +17,7 @@
 from mlagents.trainers.exception import TrainerError
 from mlagents.trainers.meta_curriculum import MetaCurriculum
 from mlagents.trainers.trainer_util import load_config, TrainerFactory
+from mlagents.trainers.stats import TensorboardWriter, StatsReporter
 from mlagents.envs.environment import UnityEnvironment
 from mlagents.trainers.sampler_class import SamplerManager
 from mlagents.trainers.exception import SamplerException
@@ -248,6 +249,11 @@ def run_training(
     )
     trainer_config = load_config(trainer_config_path)
     port = options.base_port + (sub_id * options.num_envs)
+
+    # Configure Tensorboard Writers and StatsReporter
+    tb_writer = TensorboardWriter(summaries_dir)
+    StatsReporter.add_writer(tb_writer)
+
     if options.env_path is None:
         port = 5004  # This is the in Editor Training Port
     env_factory = create_environment_factory(
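
This is the only place writers get registered: learn.py adds one TensorboardWriter per run, and because StatsReporter keeps its writer list at class level, every reporter created later (by trainers and agent processors) fans out to it. A minimal sketch of that flow, assuming ml-agents at this commit is installed; the summaries directory, category name, stat values, and step are placeholders:

from mlagents.trainers.stats import StatsReporter, TensorboardWriter

# Register one writer for the whole run; the list is shared by all reporters.
StatsReporter.add_writer(TensorboardWriter("./summaries"))  # placeholder directory

# Each reporter is scoped to a category (run name + behavior name in practice).
reporter = StatsReporter("3DBall_run1")  # placeholder category
reporter.add_stat("Environment/Cumulative Reward", 1.5)
reporter.add_stat("Environment/Cumulative Reward", 2.5)

# write_stats averages the buffered values per key, hands the single scalar to
# every registered writer at the given step, and clears the buffer.
reporter.write_stats(step=1000)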

ml-agents/mlagents/trainers/ppo/policy.py

Lines changed: 0 additions & 2 deletions
@@ -104,8 +104,6 @@ def create_model(
             {
                 "action": self.model.output,
                 "log_probs": self.model.all_log_probs,
-                "value_heads": self.model.value_heads,
-                "value": self.model.value,
                 "entropy": self.model.entropy,
                 "learning_rate": self.model.learning_rate,
             }

ml-agents/mlagents/trainers/ppo/trainer.py

Lines changed: 5 additions & 3 deletions
@@ -99,7 +99,9 @@ def process_trajectory(self, trajectory: Trajectory) -> None:
         )
         for name, v in value_estimates.items():
             agent_buffer_trajectory["{}_value_estimates".format(name)].extend(v)
-            self.stats[self.policy.reward_signals[name].value_name].append(np.mean(v))
+            self.stats_reporter.add_stat(
+                self.policy.reward_signals[name].value_name, np.mean(v)
+            )

         value_next = self.policy.get_value_estimates(
             trajectory.next_obs,
@@ -212,12 +214,12 @@ def update_policy(self):
                 batch_update_stats[stat_name].append(value)

         for stat, stat_list in batch_update_stats.items():
-            self.stats[stat].append(np.mean(stat_list))
+            self.stats_reporter.add_stat(stat, np.mean(stat_list))

         if self.policy.bc_module:
             update_stats = self.policy.bc_module.update()
             for stat, val in update_stats.items():
-                self.stats[stat].append(val)
+                self.stats_reporter.add_stat(stat, val)
         self.clear_update_buffer()
         self.trainer_metrics.end_policy_update()

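
Note that two levels of averaging remain: update_policy still takes np.mean(stat_list) over the minibatches of one policy update, and StatsReporter averages those per-update values again when the summary is eventually written or queried. A small sketch of that second aggregation, assuming ml-agents at this commit is installed; the category, key, and loss values are placeholders:

import numpy as np

from mlagents.trainers.stats import StatsReporter

reporter = StatsReporter("PPO_demo")  # placeholder category

# One add_stat call per policy update, each already a mean over minibatches.
for minibatch_losses in ([0.9, 1.1], [0.7, 0.9], [0.5, 0.7]):
    reporter.add_stat("Losses/Value Loss", float(np.mean(minibatch_losses)))

# get_stats_summaries aggregates everything buffered since the last write:
# mean of the per-update means (0.8), their std (~0.16), and the count (3).
summary = reporter.get_stats_summaries("Losses/Value Loss")
print(summary.mean, summary.std, summary.num)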

ml-agents/mlagents/trainers/rl_trainer.py

Lines changed: 2 additions & 8 deletions
@@ -46,23 +46,17 @@ def end_episode(self) -> None:
                 rewards[agent_id] = 0

     def _update_end_episode_stats(self, agent_id: str) -> None:
-        self.stats["Environment/Episode Length"].append(
-            self.episode_steps.get(agent_id, 0)
-        )
         self.episode_steps[agent_id] = 0
         for name, rewards in self.collected_rewards.items():
             if name == "environment":
                 self.cumulative_returns_since_policy_update.append(
                     rewards.get(agent_id, 0)
                 )
-                self.stats["Environment/Cumulative Reward"].append(
-                    rewards.get(agent_id, 0)
-                )
                 self.reward_buffer.appendleft(rewards.get(agent_id, 0))
                 rewards[agent_id] = 0
             else:
-                self.stats[self.policy.reward_signals[name].stat_name].append(
-                    rewards.get(agent_id, 0)
+                self.stats_reporter.add_stat(
+                    self.policy.reward_signals[name].stat_name, rewards.get(agent_id, 0)
                 )
                 rewards[agent_id] = 0

ml-agents/mlagents/trainers/sac/policy.py

Lines changed: 0 additions & 2 deletions
@@ -124,8 +124,6 @@ def create_model(
             {
                 "action": self.model.output,
                 "log_probs": self.model.all_log_probs,
-                "value_heads": self.model.value_heads,
-                "value": self.model.value,
                 "entropy": self.model.entropy,
                 "learning_rate": self.model.learning_rate,
             }

ml-agents/mlagents/trainers/sac/trainer.py

Lines changed: 6 additions & 4 deletions
@@ -166,7 +166,9 @@ def process_trajectory(self, trajectory: Trajectory) -> None:
             agent_buffer_trajectory
         )
         for name, v in value_estimates.items():
-            self.stats[self.policy.reward_signals[name].value_name].append(np.mean(v))
+            self.stats_reporter.add_stat(
+                self.policy.reward_signals[name].value_name, np.mean(v)
+            )

         # Bootstrap using the last step rather than the bootstrap step if max step is reached.
         # Set last element to duplicate obs and remove dones.
@@ -258,13 +260,13 @@ def update_sac_policy(self) -> None:
         )

         for stat, stat_list in batch_update_stats.items():
-            self.stats[stat].append(np.mean(stat_list))
+            self.stats_reporter.add_stat(stat, np.mean(stat_list))

         bc_module = self.sac_policy.bc_module
         if bc_module:
             update_stats = bc_module.update()
             for stat, val in update_stats.items():
-                self.stats[stat].append(val)
+                self.stats_reporter.add_stat(stat, val)

     def update_reward_signals(self) -> None:
         """
@@ -299,4 +301,4 @@ def update_reward_signals(self) -> None:
             for stat_name, value in update_stats.items():
                 batch_update_stats[stat_name].append(value)
         for stat, stat_list in batch_update_stats.items():
-            self.stats[stat].append(np.mean(stat_list))
+            self.stats_reporter.add_stat(stat, np.mean(stat_list))

ml-agents/mlagents/trainers/stats.py

Lines changed: 119 additions & 0 deletions
@@ -0,0 +1,119 @@
+from collections import defaultdict
+from typing import List, Dict, NamedTuple
+import numpy as np
+import abc
+import os
+
+from mlagents.tf_utils import tf
+
+
+class StatsWriter(abc.ABC):
+    """
+    A StatsWriter abstract class. A StatsWriter takes in a category, key, scalar value, and step
+    and writes it out by some method.
+    """
+
+    @abc.abstractmethod
+    def write_stats(self, category: str, key: str, value: float, step: int) -> None:
+        pass
+
+    @abc.abstractmethod
+    def write_text(self, category: str, text: str, step: int) -> None:
+        pass
+
+
+class TensorboardWriter(StatsWriter):
+    def __init__(self, base_dir: str):
+        self.summary_writers: Dict[str, tf.summary.FileWriter] = {}
+        self.base_dir: str = base_dir
+
+    def write_stats(self, category: str, key: str, value: float, step: int) -> None:
+        self._maybe_create_summary_writer(category)
+        summary = tf.Summary()
+        summary.value.add(tag="{}".format(key), simple_value=value)
+        self.summary_writers[category].add_summary(summary, step)
+        self.summary_writers[category].flush()
+
+    def _maybe_create_summary_writer(self, category: str) -> None:
+        if category not in self.summary_writers:
+            filewriter_dir = "{basedir}/{category}".format(
+                basedir=self.base_dir, category=category
+            )
+            if not os.path.exists(filewriter_dir):
+                os.makedirs(filewriter_dir)
+            self.summary_writers[category] = tf.summary.FileWriter(filewriter_dir)
+
+    def write_text(self, category: str, text: str, step: int) -> None:
+        self._maybe_create_summary_writer(category)
+        self.summary_writers[category].add_summary(text, step)
+
+
+class StatsSummary(NamedTuple):
+    mean: float
+    std: float
+    num: int
+
+
+class StatsReporter:
+    writers: List[StatsWriter] = []
+    stats_dict: Dict[str, Dict[str, List]] = defaultdict(lambda: defaultdict(list))
+
+    def __init__(self, category):
+        """
+        Generic StatsReporter. A category is the broadest type of storage (would
+        correspond the run name and trainer name, e.g. 3DBalltest_3DBall. A key is the
+        type of stat it is (e.g. Environment/Reward). Finally the Value is the float value
+        attached to this stat.
+        """
+        self.category: str = category
+
+    @staticmethod
+    def add_writer(writer: StatsWriter) -> None:
+        StatsReporter.writers.append(writer)
+
+    def add_stat(self, key: str, value: float) -> None:
+        """
+        Add a float value stat to the StatsReporter.
+        :param category: The highest categorization of the statistic, e.g. behavior name.
+        :param key: The type of statistic, e.g. Environment/Reward.
+        :param value: the value of the statistic.
+        """
+        StatsReporter.stats_dict[self.category][key].append(value)
+
+    def write_stats(self, step: int) -> None:
+        """
+        Write out all stored statistics that fall under the category specified.
+        The currently stored values will be averaged, written out as a single value,
+        and the buffer cleared.
+        :param category: The category which to write out the stats.
+        :param step: Training step which to write these stats as.
+        """
+        for key in StatsReporter.stats_dict[self.category]:
+            if len(StatsReporter.stats_dict[self.category][key]) > 0:
+                stat_mean = float(np.mean(StatsReporter.stats_dict[self.category][key]))
+                for writer in StatsReporter.writers:
+                    writer.write_stats(self.category, key, stat_mean, step)
+        del StatsReporter.stats_dict[self.category]
+
+    def write_text(self, text: str, step: int) -> None:
+        """
+        Write out some text.
+        :param category: The highest categorization of the statistic, e.g. behavior name.
+        :param text: The text to write out.
+        :param step: Training step which to write these stats as.
+        """
+        for writer in StatsReporter.writers:
+            writer.write_text(self.category, text, step)
+
+    def get_stats_summaries(self, key: str) -> StatsSummary:
+        """
+        Get the mean, std, and count of a particular statistic, since last write.
+        :param category: The highest categorization of the statistic, e.g. behavior name.
+        :param key: The type of statistic, e.g. Environment/Reward.
+        :returns: A StatsSummary NamedTuple containing (mean, std, count).
+        """
+        return StatsSummary(
+            mean=np.mean(StatsReporter.stats_dict[self.category][key]),
+            std=np.std(StatsReporter.stats_dict[self.category][key]),
+            num=len(StatsReporter.stats_dict[self.category][key]),
+        )
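
Since StatsWriter is an abstract base class and the reporter fans out to every registered writer, backends other than Tensorboard can be plugged in without touching the trainers. A minimal sketch of a hypothetical console writer (not part of this commit) that satisfies the interface, assuming ml-agents at this commit is installed; the category, stat values, and step are placeholders:

from mlagents.trainers.stats import StatsReporter, StatsWriter


class ConsoleStatsWriter(StatsWriter):
    """Hypothetical writer that prints summarized stats instead of logging to Tensorboard."""

    def write_stats(self, category: str, key: str, value: float, step: int) -> None:
        print("[{}] step {}: {} = {:.3f}".format(category, step, key, value))

    def write_text(self, category: str, text: str, step: int) -> None:
        print("[{}] step {}: {}".format(category, step, text))


StatsReporter.add_writer(ConsoleStatsWriter())

reporter = StatsReporter("DemoBehavior")  # placeholder category
reporter.add_stat("Policy/Entropy", 1.2)
reporter.add_stat("Policy/Entropy", 1.0)
reporter.write_stats(step=500)  # prints the mean (1.1) and clears the category buffer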
