[refactor] Run Trainers in separate threads #3690

Merged
49 commits, merged on Apr 20, 2020

Commits (49)
9a703dc  Use steps_per_update to determine SAC train interval (Mar 20, 2020)
7235b1d  Update trainer config (Mar 20, 2020)
3abe198  Don't count buffer_init_steps (Mar 20, 2020)
bcea5ad  Fix comment (Mar 20, 2020)
4b980d7  Fix tests (Mar 20, 2020)
3371be8  Make trainer in separate threads (Mar 25, 2020)
1ab2606  Fix comments (Mar 25, 2020)
87a97da  Fix TC test (Mar 26, 2020)
c64de30  Merge branch 'master' into develop-sac-apex (Apr 1, 2020)
cabef5f  Update docs (Apr 2, 2020)
c3db5fe  Remove num_update as param (Apr 2, 2020)
0b29099  Update changelog (Apr 2, 2020)
80f46ec  Ability to disable threading (Apr 3, 2020)
c84c849  Don't block when disabling threading (Apr 3, 2020)
d13ec05  Fix ghost trainer locking up (Apr 3, 2020)
a9e881b  Fix subprocess test (Apr 3, 2020)
e9f1540  Address comments in docs (Apr 3, 2020)
e011a34  Update steps_per_update documentation (Apr 7, 2020)
b79fda7  Clean up interface for AP (Apr 8, 2020)
a3d7134  Make fields properties (Apr 8, 2020)
a1ad3e8  Don't block one policy queue (Apr 8, 2020)
eee33df  Improve tests (Apr 8, 2020)
94a428a  Trainer config adjustments (Apr 8, 2020)
5fcecdc  Fix env_manager test (Apr 8, 2020)
c1bdecd  Merge branch 'master' into develop-sac-apex (Apr 9, 2020)
e6ea3cc  Update migrating doc (Apr 10, 2020)
80d3150  Adjust SAC recurrent (Apr 10, 2020)
3221bc7  Adjust subprocessor test (Apr 10, 2020)
0c9c5e8  Remove empty_queue interface (Apr 13, 2020)
ac412a1  Revert to get_nowait method in AgentManagerQueue (Apr 13, 2020)
5bafadf  Remove params from get_nowait (Apr 13, 2020)
e938ee3  get_nowait in env_manager (Apr 13, 2020)
b9d908c  Adjust walker params (Apr 13, 2020)
ca2d184  Fix subprocess env manager test (Apr 13, 2020)
e5b4c04  Merge branch 'master' into develop-sac-apex (Apr 13, 2020)
14b3bad  Adjust Reacher steps_per_update (Apr 15, 2020)
51592ed  Increase PushBlock summary steps (Apr 15, 2020)
8dd7c0f  Make threading disable-able per trainer (Apr 15, 2020)
1e44bdd  Remove threaded from trainer_controller (Apr 15, 2020)
e99d927  Merge branch 'master' of github.com:Unity-Technologies/ml-agents into… (Apr 15, 2020)
ca7631e  Fix default value in docs (Apr 16, 2020)
afe03ec  Disable threading for all simple_rl tests (Apr 16, 2020)
71d30f0  kill trainer threads when training finishes (Apr 16, 2020)
7e11a32  Update docs/Training-SAC.md (Apr 17, 2020)
3ada415  Update docs/Training-SAC.md (Apr 17, 2020)
b068110  Update documentation about disabling threading (Apr 17, 2020)
d8913be  Merge branch 'develop-sac-apex' of github.com:Unity-Technologies/ml-a… (Apr 17, 2020)
1ce08a1  Update SAC documentation (Apr 20, 2020)
339fcf8  Merge branch 'master' of github.com:Unity-Technologies/ml-agents into… (Apr 20, 2020)

Files changed

2 changes: 2 additions & 0 deletions com.unity.ml-agents/CHANGELOG.md
@@ -61,6 +61,8 @@ and this project adheres to
overwrite the existing files. (#3705)
- `StackingSensor` was changed from `internal` visibility to `public`
- Updated Barracuda to 0.6.3-preview.
- Model updates can now happen asynchronously with environment steps for better performance. (#3690)
- `num_updates` and `train_interval` for SAC were replaced with `steps_per_update`. (#3690)

### Bug Fixes

21 changes: 10 additions & 11 deletions config/sac_trainer_config.yaml
@@ -10,8 +10,7 @@ default:
max_steps: 5.0e5
memory_size: 128
normalize: false
num_update: 1
train_interval: 1
steps_per_update: 10
num_layers: 2
time_horizon: 64
sequence_length: 64
@@ -30,11 +29,10 @@ FoodCollector:
buffer_size: 500000
max_steps: 2.0e6
init_entcoef: 0.05
train_interval: 1

Bouncer:
normalize: true
max_steps: 2.0e6
max_steps: 1.0e6
num_layers: 2
hidden_units: 64
summary_freq: 20000
@@ -43,7 +41,7 @@ PushBlock:
max_steps: 2e6
init_entcoef: 0.05
hidden_units: 256
summary_freq: 60000
summary_freq: 100000
time_horizon: 64
num_layers: 2

@@ -159,10 +157,10 @@ CrawlerStatic:
normalize: true
time_horizon: 1000
batch_size: 256
train_interval: 2
steps_per_update: 20
buffer_size: 500000
buffer_init_steps: 2000
max_steps: 5e6
max_steps: 3e6
summary_freq: 30000
init_entcoef: 1.0
num_layers: 3
@@ -178,9 +176,9 @@ CrawlerDynamic:
batch_size: 256
buffer_size: 500000
summary_freq: 30000
train_interval: 2
steps_per_update: 20
num_layers: 3
max_steps: 1e7
max_steps: 5e6
hidden_units: 512
reward_signals:
extrinsic:
@@ -195,7 +193,7 @@ Walker:
max_steps: 2e7
summary_freq: 30000
num_layers: 4
train_interval: 2
steps_per_update: 30
hidden_units: 512
reward_signals:
extrinsic:
@@ -208,6 +206,7 @@ Reacher:
batch_size: 128
buffer_size: 500000
max_steps: 2e7
steps_per_update: 20
summary_freq: 60000

Hallway:
@@ -216,7 +215,7 @@ Hallway:
hidden_units: 128
memory_size: 128
init_entcoef: 0.1
max_steps: 1.0e7
max_steps: 5.0e6
summary_freq: 10000
time_horizon: 64
use_recurrent: true
4 changes: 4 additions & 0 deletions docs/Migrating.md
@@ -33,6 +33,8 @@ double-check that the versions are in the same. The versions can be found in
- The signature of `Agent.Heuristic()` was changed to take a `float[]` as a
parameter, instead of returning the array. This was done to prevent a common
source of error where users would return arrays of the wrong size.
- `num_updates` and `train_interval` for SAC have been replaced with `steps_per_update`.


### Steps to Migrate

@@ -54,6 +56,8 @@ double-check that the versions are in the same. The versions can be found in
- If your Agent class overrides `Heuristic()`, change the signature to
`public override void Heuristic(float[] actionsOut)` and assign values to
`actionsOut` instead of returning an array.
- Set `steps_per_update` to be around equal to the number of agents in your environment,
times `num_updates` and divided by `train_interval`.
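
For example, the two old keys are replaced by a single new one. A minimal sketch with a hypothetical behavior name and hypothetical old values (the new value shown is the default from `sac_trainer_config.yaml`; tune it for your own environment using the rule of thumb above):

```yaml
# Old (ML-Agents 0.15) SAC settings being replaced:
#   num_update: 1
#   train_interval: 2
# New (0.16): a single ratio of agent steps per model update
MyBehavior:
    trainer: sac
    steps_per_update: 10
```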

## Migrating from 0.14 to 0.15

4 changes: 2 additions & 2 deletions docs/Training-ML-Agents.md
@@ -158,10 +158,10 @@ Cloning (Imitation), GAIL = Generative Adversarial Imitation Learning
| tau | How aggressively to update the target network used for bootstrapping value estimation in SAC. | SAC |
| time_horizon | How many steps of experience to collect per-agent before adding it to the experience buffer. | PPO, SAC |
| trainer | The type of training to perform: "ppo", "sac", "offline_bc" or "online_bc". | PPO, SAC |
| train_interval | How often to update the agent. | SAC |
| num_update | Number of mini-batches to update the agent with during each update. | SAC |
| steps_per_update | Ratio of agent steps per mini-batch update. | SAC |
| use_recurrent | Train using a recurrent neural network. See [Using Recurrent Neural Networks](Feature-Memory.md). | PPO, SAC |
| init_path | Initialize trainer from a previously saved model. | PPO, SAC |
| threaded | Run the trainer in a parallel thread from the environment steps. (Default: true) | PPO, SAC |

For specific advice on setting hyperparameters based on the type of training you
are conducting, see:
9 changes: 9 additions & 0 deletions docs/Training-PPO.md
@@ -300,6 +300,15 @@ This option is provided in case you want to initialize different behaviors from
in most cases, it is sufficient to use the `--initialize-from` CLI parameter to initialize
all models from the same run.

### (Optional) Advanced: Disable Threading

By default, PPO model updates can happen while the environment is being stepped. This violates the
[on-policy](https://spinningup.openai.com/en/latest/user/algorithms.html#the-on-policy-algorithms)
assumption of PPO slightly in exchange for a 10-20% training speedup. To maintain the
strict on-policyness of PPO, you can disable parallel updates by setting `threaded` to `false`.

Default Value: `true`
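
For instance, threading can be turned off for a single behavior by adding the `threaded` option to that behavior's section of the trainer configuration. A minimal sketch (the behavior name is a placeholder):

```yaml
MyBehavior:
    trainer: ppo
    # Disable asynchronous updates so PPO stays strictly on-policy
    threaded: false
```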

## Training Statistics

To view training statistics, use TensorBoard. For information on launching and
40 changes: 22 additions & 18 deletions docs/Training-SAC.md
@@ -40,19 +40,18 @@ ML-Agents provides two reward signals by default, the Extrinsic (environment) re
Curiosity reward, which can be used to encourage exploration in sparse extrinsic reward
environments.

#### Number of Updates for Reward Signal (Optional)
#### Steps Per Update for Reward Signal (Optional)

`reward_signal_num_update` for the reward signals corresponds to the number of mini batches sampled
and used for updating the reward signals during each
update. By default, we update the reward signals once every time the main policy is updated.
`reward_signal_steps_per_update` for the reward signals corresponds to the number of steps per mini batch sampled
and used for updating the reward signals. By default, we update the reward signals once every time the main policy is updated.
However, to imitate the training procedure in certain imitation learning papers (e.g.
[Kostrikov et. al](http://arxiv.org/abs/1809.02925), [Blondé et. al](http://arxiv.org/abs/1809.02064)),
we may want to update the policy N times, then update the reward signal (GAIL) M times.
We can change `train_interval` and `num_update` of SAC to N, as well as `reward_signal_num_update`
under `reward_signals` to M to accomplish this. By default, `reward_signal_num_update` is set to
`num_update`.
we may want to update the reward signal (GAIL) M times for every update of the policy.
We can change `steps_per_update` of SAC to N, as well as `reward_signal_steps_per_update`
under `reward_signals` to N / M to accomplish this. By default, `reward_signal_steps_per_update` is set to
`steps_per_update`.

Typical Range: `num_update`
Typical Range: `steps_per_update`

### Buffer Size

@@ -106,17 +105,22 @@ there may not be any new interesting information between steps, and `train_inter

Typical Range: `1` - `5`

### Number of Updates
### Steps Per Update

`num_update` corresponds to the number of mini batches sampled and used for training during each
training event. In SAC, a single "update" corresponds to grabbing a batch of size `batch_size` from the experience
replay buffer, and using this mini batch to update the models. Typically, this can be left at 1.
However, to imitate the training procedure in certain papers (e.g.
[Kostrikov et. al](http://arxiv.org/abs/1809.02925), [Blondé et. al](http://arxiv.org/abs/1809.02064)),
we may want to update N times with different mini batches before grabbing additional samples.
We can change `train_interval` and `num_update` to N to accomplish this.
`steps_per_update` corresponds to the average ratio of agent steps (actions) taken to updates made of the agent's
policy. In SAC, a single "update" corresponds to grabbing a batch of size `batch_size` from the experience
replay buffer, and using this mini batch to update the models. Note that it is not guaranteed that after
exactly `steps_per_update` steps an update will be made, only that the ratio will hold true over many steps.

Typically, `steps_per_update` should be greater than or equal to 1. Note that setting `steps_per_update` lower will
improve sample efficiency (reduce the number of steps required to train)
but increase the CPU time spent performing updates. For most environments where steps are fairly fast (e.g. our example
environments) `steps_per_update` equal to the number of agents in the scene is a good balance.
For slow environments (steps take 0.1 seconds or more) reducing `steps_per_update` may improve training speed.
We can also change `steps_per_update` to lower than 1 to update more often than once per step, though this will
usually result in a slowdown unless the environment is very slow.

Typical Range: `1`
Typical Range: `1` - `20`
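
As a rough sketch (hypothetical behavior name and agent count), an environment with around 20 agents stepping quickly could follow the guidance above and set `steps_per_update` close to the number of agents in the scene:

```yaml
MyBehavior:
    trainer: sac
    batch_size: 256
    buffer_size: 500000
    # With ~20 agents, this is roughly one model update per environment step
    steps_per_update: 20
```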

### Tau

52 changes: 41 additions & 11 deletions ml-agents/mlagents/trainers/agent_processor.py
@@ -1,6 +1,7 @@
import sys
from typing import List, Dict, Deque, TypeVar, Generic, Tuple, Any, Union
from collections import defaultdict, Counter, deque
from typing import List, Dict, TypeVar, Generic, Tuple, Any, Union
from collections import defaultdict, Counter
import queue

from mlagents_envs.base_env import (
DecisionSteps,
@@ -229,26 +230,53 @@ class Empty(Exception):

pass

def __init__(self, behavior_id: str, maxlen: int = 1000):
def __init__(self, behavior_id: str, maxlen: int = 20):
"""
Initializes an AgentManagerQueue. Note that we can give it a behavior_id so that it can be identified
separately from an AgentManager.
"""
self.maxlen: int = maxlen
self.queue: Deque[T] = deque(maxlen=self.maxlen)
self.behavior_id = behavior_id
self._maxlen: int = maxlen
self._queue: queue.Queue = queue.Queue(maxsize=maxlen)
self._behavior_id = behavior_id

@property
def maxlen(self):
"""
The maximum length of the queue.
:return: Maximum length of the queue.
"""
return self._maxlen

@property
def behavior_id(self):
"""
The Behavior ID of this queue.
:return: Behavior ID associated with the queue.
"""
return self._behavior_id

def qsize(self) -> int:
"""
Returns the approximate size of the queue. Note that values may differ
depending on the underlying queue implementation.
"""
return self._queue.qsize()

def empty(self) -> bool:
return len(self.queue) == 0
return self._queue.empty()

def get_nowait(self) -> T:
"""
Gets the next item from the queue, throwing an AgentManagerQueue.Empty exception
if the queue is empty.
"""
try:
return self.queue.popleft()
except IndexError:
return self._queue.get_nowait()
except queue.Empty:
raise self.Empty("The AgentManagerQueue is empty.")

def put(self, item: T) -> None:
self.queue.append(item)
self._queue.put(item)


class AgentManager(AgentProcessor):
@@ -268,8 +296,10 @@ def __init__(
self.trajectory_queue: AgentManagerQueue[Trajectory] = AgentManagerQueue(
self.behavior_id
)
# NOTE: we make policy queues of infinite length to avoid lockups of the trainers.
# In the environment manager, we make sure to empty the policy queue before continuing to produce steps.
self.policy_queue: AgentManagerQueue[Policy] = AgentManagerQueue(
self.behavior_id
self.behavior_id, maxlen=0
)
self.publish_trajectory_queue(self.trajectory_queue)

12 changes: 8 additions & 4 deletions ml-agents/mlagents/trainers/env_manager.py
@@ -88,13 +88,17 @@ def advance(self):
if self.first_step_infos is not None:
self._process_step_infos(self.first_step_infos)
self.first_step_infos = None
# Get new policies if found
# Get new policies if found. Always get the latest policy.
for brain_name in self.external_brains:
_policy = None
try:
_policy = self.agent_managers[brain_name].policy_queue.get_nowait()
self.set_policy(brain_name, _policy)
# We make sure to empty the policy queue before continuing to produce steps.
# This halts the trainers until the policy queue is empty.
while True:
_policy = self.agent_managers[brain_name].policy_queue.get_nowait()
except AgentManagerQueue.Empty:
pass
if _policy is not None:
self.set_policy(brain_name, _policy)
# Step the environment
new_step_infos = self._step()
# Add to AgentProcessor
4 changes: 2 additions & 2 deletions ml-agents/mlagents/trainers/ghost/trainer.py
@@ -223,7 +223,7 @@ def advance(self) -> None:
# We grab at most the maximum length of the queue.
# This ensures that even if the queue is being filled faster than it is
# being emptied, the trajectories in the queue are on-policy.
for _ in range(trajectory_queue.maxlen):
for _ in range(trajectory_queue.qsize()):
t = trajectory_queue.get_nowait()
# adds to wrapped trainers queue
internal_trajectory_queue.put(t)
@@ -233,7 +233,7 @@ else:
else:
# Dump trajectories from non-learning policy
try:
for _ in range(trajectory_queue.maxlen):
for _ in range(trajectory_queue.qsize()):
t = trajectory_queue.get_nowait()
# count ghost steps
self.ghost_step += len(t.steps)
1 change: 1 addition & 0 deletions ml-agents/mlagents/trainers/ppo/trainer.py
@@ -219,6 +219,7 @@ def _update_policy(self):
for stat, val in update_stats.items():
self._stats_reporter.add_stat(stat, val)
self._clear_update_buffer()
return True

def create_policy(
self, parsed_behavior_id: BehaviorIdentifiers, brain_parameters: BrainParameters