
[rllib] Pull out multi-gpu optimizer as a generic class #1313

Merged
merged 43 commits on Dec 17, 2017
Changes from 42 commits

Commits (43)
49dd02b
interface file
ericl Nov 21, 2017
cd50cdd
add dqn evaluator
ericl Nov 21, 2017
c5ef83e
Mon Nov 20 17:44:44 PST 2017
ericl Nov 21, 2017
3005f9e
implement more
ericl Nov 21, 2017
95b62c7
optdoc
ericl Nov 21, 2017
caed1df
add base eval
ericl Nov 22, 2017
00e1e1c
Wed Nov 29 14:27:56 PST 2017
ericl Nov 29, 2017
d1e6570
Merge remote-tracking branch 'upstream/master' into evaluator
ericl Nov 29, 2017
173e845
update dqn eval
ericl Nov 30, 2017
edadd65
doc optimizers
ericl Nov 30, 2017
edb17de
fix up graph
ericl Nov 30, 2017
293becc
move optimizers
ericl Nov 30, 2017
0f17fde
implement async optimizer
ericl Nov 30, 2017
bab5010
add async mode dqn test
ericl Nov 30, 2017
8b5b1a0
update
ericl Nov 30, 2017
de44c91
2 workers in test
ericl Nov 30, 2017
9839db3
wip need to make build_loss reuse the vars properly
ericl Nov 30, 2017
7e1f9e2
wip sample batches
ericl Nov 30, 2017
b9764c2
wip
ericl Nov 30, 2017
48a349c
cleanup some
ericl Nov 30, 2017
916f25b
review comments
ericl Dec 4, 2017
85cfe8d
Merge remote-tracking branch 'upstream/master' into evaluator
ericl Dec 4, 2017
191b159
remove logger
ericl Dec 6, 2017
1c4ef0d
make yamld ep soft
ericl Dec 6, 2017
c8acce0
fix importer
ericl Dec 6, 2017
2043a04
Merge branch 'evaluator' into evaluator-multi-gpu
ericl Dec 11, 2017
1e223e8
Merge remote-tracking branch 'upstream/master' into evaluator
ericl Dec 11, 2017
cf2eabd
Merge branch 'evaluator' into evaluator-multi-gpu
ericl Dec 11, 2017
e36dafd
fix serialization issues with sample batches
ericl Dec 12, 2017
05fca04
multi gpu now working
ericl Dec 12, 2017
636bf9e
add test
ericl Dec 12, 2017
f743ac8
move evaluator
ericl Dec 12, 2017
44ec06e
doc
ericl Dec 12, 2017
ee7e606
update docs some
ericl Dec 12, 2017
c069491
Merge remote-tracking branch 'upstream/master' into evaluator-multi-gpu
ericl Dec 14, 2017
e9b21c8
Wed Dec 13 17:40:11 PST 2017
ericl Dec 14, 2017
bbf6ebf
better err msg
ericl Dec 14, 2017
ff4b416
add tuned result
ericl Dec 15, 2017
d35a818
comments
ericl Dec 15, 2017
5e14b5e
Merge remote-tracking branch 'upstream/master' into evaluator-multi-gpu
ericl Dec 15, 2017
c9aa526
fix a3c import
ericl Dec 15, 2017
2e5bda1
Merge remote-tracking branch 'upstream/master' into evaluator-multi-gpu
ericl Dec 15, 2017
9906c54
fix jenkins
ericl Dec 17, 2017
28 changes: 11 additions & 17 deletions doc/source/rllib.rst
@@ -13,7 +13,7 @@ Ray RLlib is a reinforcement learning library that aims to provide both performa
- Scalable primitives for developing new algorithms
- Shared models between algorithms

You can find the code for RLlib `here on GitHub <https://github.com/ray-project/ray/tree/master/python/ray/rllib>`__.
You can find the code for RLlib `here on GitHub <https://github.com/ray-project/ray/tree/master/python/ray/rllib>`__, and the NIPS symposium paper `here <https://drive.google.com/open?id=1lDMOFLMUQXn8qGtuahOBUwjmFb2iASxu>`__.

RLlib currently provides the following algorithms:

@@ -30,11 +30,6 @@ RLlib currently provides the following algorithms:

- `Deep Q Network (DQN) <https://arxiv.org/abs/1312.5602>`__.

Proximal Policy Optimization scales to hundreds of cores and several GPUs,
Evolution Strategies to clusters with thousands of cores and
the Asynchronous Advantage Actor-Critic scales to dozens of cores
on a single node.

These algorithms can be run on any `OpenAI Gym MDP <https://github.com/openai/gym>`__,
including custom ones written and registered by the user.

@@ -119,16 +114,6 @@ and renders its behavior in the environment specified by ``--env``.
Checkpoints can be found within the experiment directory,
specified by ``--local-dir`` and ``--experiment-name`` when running ``train.py``.


The ``eval.py`` script has a number of options you can show by running

::
python ray/python/ray/rllib/eval.py --help

The most important argument is the checkpoint positional argument from which
the script reconstructs the agent. The options ``--env`` and ``--run``
must match the values chosen while running ``train.py``.

Tuned Examples
--------------

@@ -248,10 +233,19 @@ The Developer API
This part of the API will be useful if you need to change existing RL algorithms
or implement new ones. Note that the API is not considered to be stable yet.

Optimizers and Evaluators
~~~~~~~~~~~~~~~~~~~~~~~~~

.. autoclass:: ray.rllib.optimizers.optimizer.Optimizer
:members:

.. autoclass:: ray.rllib.optimizers.evaluator.Evaluator
:members:

Models
~~~~~~

Models are subclasses of the Model class:
Algorithms share neural network models which inherit from the following class:

.. autoclass:: ray.rllib.models.Model

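For context, here is a minimal sketch of the Evaluator/Optimizer contract that the new "Optimizers and Evaluators" docs section describes. The method names (sample, compute_gradients, apply_gradients) mirror the interface used throughout this PR; the toy linear model and the SimpleSyncOptimizer class below are illustrative stand-ins, not RLlib code.

import numpy as np


class ToyEvaluator(object):
    """Illustrative evaluator: holds a tiny linear model and produces samples/grads."""

    def __init__(self, dim=4, lr=0.1):
        self.weights = np.zeros(dim)
        self.lr = lr

    def sample(self):
        # Pretend to roll out the policy; return a batch of (x, y) pairs.
        xs = np.random.randn(32, self.weights.size)
        ys = xs.dot(np.ones(self.weights.size))  # fake regression targets
        return xs, ys

    def compute_gradients(self, samples):
        xs, ys = samples
        preds = xs.dot(self.weights)
        return 2.0 * xs.T.dot(preds - ys) / len(xs)  # MSE gradient

    def apply_gradients(self, grads):
        self.weights -= self.lr * grads


class SimpleSyncOptimizer(object):
    """Minimal stand-in for a synchronous optimizer: sample, then update in place."""

    def __init__(self, local_evaluator):
        self.local_evaluator = local_evaluator

    def step(self):
        samples = self.local_evaluator.sample()
        grads = self.local_evaluator.compute_gradients(samples)
        self.local_evaluator.apply_gradients(grads)


if __name__ == "__main__":
    ev = ToyEvaluator()
    opt = SimpleSyncOptimizer(ev)
    for _ in range(200):
        opt.step()
    print("learned weights:", ev.weights)  # should approach all-ones
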
5 changes: 1 addition & 4 deletions python/ray/rllib/README.rst
@@ -1,7 +1,7 @@
Ray RLlib: A Composable and Scalable Reinforcement Learning Library
===================================================================

This README provides a brief technical overview of RLlib. See also the `user documentation <http://ray.readthedocs.io/en/latest/rllib.html>`__.
This README provides a brief technical overview of RLlib. See also the `user documentation <http://ray.readthedocs.io/en/latest/rllib.html>`__ and `NIPS symposium paper <https://drive.google.com/open?id=1lDMOFLMUQXn8qGtuahOBUwjmFb2iASxu>`__.

RLlib currently provides the following algorithms:

@@ -18,11 +18,8 @@ RLlib currently provides the following algorithms:

- `Deep Q Network (DQN) <https://arxiv.org/abs/1312.5602>`__.

Proximal Policy Optimization scales to hundreds of cores and several GPUs, Evolution Strategies to clusters with thousands of cores and the Asynchronous Advantage Actor-Critic scales to dozens of cores on a single node.

These algorithms can be run on any OpenAI Gym MDP, including custom ones written and registered by the user.

For more detailed usage information, see the `user documentation <http://ray.readthedocs.io/en/latest/rllib.html>`__.

Training API
------------
2 changes: 1 addition & 1 deletion python/ray/rllib/a3c/runner.py
@@ -4,7 +4,7 @@

import ray
from ray.rllib.envs import create_and_wrap
from ray.rllib.evaluator import Evaluator
from ray.rllib.optimizers import Evaluator
from ray.rllib.a3c.common import get_policy_cls
from ray.rllib.utils.filter import get_filter
from ray.rllib.utils.sampler import AsyncSampler
25 changes: 14 additions & 11 deletions python/ray/rllib/dqn/base_evaluator.py
@@ -9,7 +9,7 @@
from ray.rllib.dqn import models
from ray.rllib.dqn.common.wrappers import wrap_dqn
from ray.rllib.dqn.common.schedules import LinearSchedule
from ray.rllib.evaluator import TFMultiGPUSupport
from ray.rllib.optimizers import SampleBatch, TFMultiGPUSupport


class DQNEvaluator(TFMultiGPUSupport):
@@ -55,20 +55,23 @@ def update_target(self):
self.dqn_graph.update_target(self.sess)

def sample(self):
output = []
obs, actions, rewards, new_obs, dones = [], [], [], [], []
Review comment (Contributor):
Does it make sense to replace this with a SyncSampler? A little modification would need to be made to support exploration, but this class will end up cleaner. Can be done in another PR I guess.

Reply (Contributor Author):
Sure, I think it should work.

for _ in range(self.config["sample_batch_size"]):
result = self._step(self.global_timestep)
output.append(result)
return output
ob, act, rew, ob1, done = self._step(self.global_timestep)
obs.append(ob)
actions.append(act)
rewards.append(rew)
new_obs.append(ob1)
dones.append(done)
return SampleBatch({
"obs": obs, "actions": actions, "rewards": rewards,
"new_obs": new_obs, "dones": dones,
"weights": np.ones_like(rewards)})

def compute_gradients(self, samples):
if self.config["prioritized_replay"]:
obses_t, actions, rewards, obses_tp1, dones, _ = samples
else:
obses_t, actions, rewards, obses_tp1, dones = samples
_, grad = self.dqn_graph.compute_gradients(
self.sess, obses_t, actions, rewards, obses_tp1, dones,
np.ones_like(rewards))
self.sess, samples["obs"], samples["actions"], samples["rewards"],
samples["new_obs"], samples["dones"], samples["weights"])
return grad

def apply_gradients(self, grads):
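The key change in this file is that sample() now returns a SampleBatch (a dict of equal-length, named columns) instead of a list of transition tuples. Below is a minimal stand-in illustrating only the two access patterns visible in this PR -- column lookup by name and per-row iteration via rows(); it is not the actual ray.rllib.optimizers.sample_batch implementation.

import numpy as np


class MiniSampleBatch(object):
    """Toy stand-in for SampleBatch: named columns of equal length."""

    def __init__(self, columns):
        lengths = {len(v) for v in columns.values()}
        assert len(lengths) == 1, "all columns must have the same length"
        self.data = {k: np.asarray(v) for k, v in columns.items()}

    def __getitem__(self, key):
        return self.data[key]  # column access, e.g. batch["rewards"]

    def rows(self):
        n = len(next(iter(self.data.values())))
        for i in range(n):
            yield {k: v[i] for k, v in self.data.items()}


batch = MiniSampleBatch({
    "obs": [[0.0], [1.0]], "actions": [0, 1], "rewards": [1.0, 0.5],
    "new_obs": [[1.0], [2.0]], "dones": [False, True],
    "weights": np.ones(2)})

print(batch["rewards"])       # column access, as in compute_gradients() above
for row in batch.rows():      # row access, as in DQNReplayEvaluator.sample() below
    print(row["obs"], row["actions"], row["dones"])
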
8 changes: 2 additions & 6 deletions python/ray/rllib/dqn/dqn.py
@@ -102,11 +102,7 @@
async_updates=False,
# (Experimental) Whether to use multiple GPUs for SGD optimization.
# Note that this only helps performance if the SGD batch size is large.
multi_gpu_optimize=False,
# Number of SGD iterations over the data. Only applies in multi-gpu mode.
num_sgd_iter=1,
# Devices to use for parallel SGD. Only applies in multi-gpu mode.
devices=["/gpu:0"])
multi_gpu=False)


class DQNAgent(Agent):
@@ -136,7 +132,7 @@ def _init(self):
# will internally create more workers for parallelism. This means
# there is only one replay buffer regardless of num_workers.
self.remote_evaluators = []
if self.config["multi_gpu_optimize"]:
if self.config["multi_gpu"]:
optimizer_cls = LocalMultiGPUOptimizer
else:
optimizer_cls = LocalSyncOptimizer
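The user-visible effect of this dqn.py change is that three config keys collapse into one boolean that picks the optimizer class in _init(). A sketch of the before/after config, using only the keys and defaults shown in this diff:

# Before this PR (keys now removed):
old_config = {
    "multi_gpu_optimize": False,   # removed
    "num_sgd_iter": 1,             # removed; only applied in multi-gpu mode
    "devices": ["/gpu:0"],         # removed; only applied in multi-gpu mode
}

# After this PR, a single flag selects the optimizer inside DQNAgent._init():
new_config = {
    "multi_gpu": False,  # False -> LocalSyncOptimizer, True -> LocalMultiGPUOptimizer
}
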
34 changes: 13 additions & 21 deletions python/ray/rllib/dqn/models.py
@@ -6,7 +6,7 @@
import tensorflow.contrib.layers as layers

from ray.rllib.models import ModelCatalog
from ray.rllib.parallel import LocalSyncParallelOptimizer, TOWER_SCOPE_NAME
from ray.rllib.parallel import TOWER_SCOPE_NAME


def _build_q_network(inputs, num_actions, config):
@@ -159,10 +159,7 @@ def __init__(self, env, config, logdir):
tf.float32, shape=(None,) + env.observation_space.shape)

# Action Q network
if config["multi_gpu_optimize"]:
q_scope_name = TOWER_SCOPE_NAME + "/q_func"
else:
q_scope_name = "q_func"
q_scope_name = TOWER_SCOPE_NAME + "/q_func"
with tf.variable_scope(q_scope_name) as scope:
q_values = _build_q_network(
self.cur_observations, num_actions, config)
@@ -194,26 +191,21 @@ def build_loss(
obs_t, act_t, rew_t, obs_tp1, done_mask, importance_weights)

self.loss_inputs = [
self.obs_t, self.act_t, self.rew_t, self.obs_tp1, self.done_mask,
self.importance_weights]
self.build_loss = build_loss

if config["multi_gpu_optimize"]:
self.multi_gpu_optimizer = LocalSyncParallelOptimizer(
optimizer,
config["devices"],
[self.obs_t, self.act_t, self.rew_t, self.obs_tp1,
self.done_mask, self.importance_weights],
int(config["sgd_batch_size"] / len(config["devices"])),
build_loss,
logdir,
grad_norm_clipping=config["grad_norm_clipping"])
loss_obj = self.multi_gpu_optimizer.get_common_loss()
else:
("obs", self.obs_t),
("actions", self.act_t),
("rewards", self.rew_t),
("new_obs", self.obs_tp1),
("dones", self.done_mask),
("weights", self.importance_weights),
]

with tf.variable_scope(TOWER_SCOPE_NAME):
loss_obj = build_loss(
self.obs_t, self.act_t, self.rew_t, self.obs_tp1,
self.done_mask, self.importance_weights)

self.build_loss = build_loss

weighted_error = loss_obj.loss
target_q_func_vars = loss_obj.target_q_func_vars
self.q_t = loss_obj.q_t
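The reason loss_inputs becomes a list of (column name, placeholder) pairs is that a generic optimizer can then build a feed_dict from any SampleBatch by name, with no DQN-specific knowledge. A small sketch of that pairing; the string "placeholders" below are hypothetical stand-ins for the TensorFlow tensors that models.py actually creates.

def make_feed_dict(loss_inputs, batch):
    """loss_inputs: [(column_name, placeholder), ...]; batch: dict/SampleBatch of columns."""
    return {placeholder: batch[name] for name, placeholder in loss_inputs}


# Strings stand in for the real tf placeholders (self.obs_t, self.act_t, ...).
loss_inputs = [("obs", "ph_obs"), ("actions", "ph_act"), ("rewards", "ph_rew"),
               ("new_obs", "ph_new_obs"), ("dones", "ph_done"),
               ("weights", "ph_weights")]
batch = {"obs": [[0.0]], "actions": [1], "rewards": [0.5],
         "new_obs": [[1.0]], "dones": [False], "weights": [1.0]}

print(make_feed_dict(loss_inputs, batch))
# {'ph_obs': [[0.0]], 'ph_act': [1], ...} -- this dict would be passed to session.run()
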
48 changes: 28 additions & 20 deletions python/ray/rllib/dqn/replay_evaluator.py
@@ -8,6 +8,7 @@
from ray.rllib.dqn.base_evaluator import DQNEvaluator
from ray.rllib.dqn.common.schedules import LinearSchedule
from ray.rllib.dqn.replay_buffer import ReplayBuffer, PrioritizedReplayBuffer
from ray.rllib.optimizers import SampleBatch


class DQNReplayEvaluator(DQNEvaluator):
@@ -63,38 +64,44 @@ def sample(self, no_replay=False):
samples = [DQNEvaluator.sample(self)]

for s in samples:
for obs, action, rew, new_obs, done in s:
self.replay_buffer.add(obs, action, rew, new_obs, done)
for row in s.rows():
self.replay_buffer.add(
row["obs"], row["actions"], row["rewards"], row["new_obs"],
row["dones"])

if no_replay:
return samples

# Then return a batch sampled from the buffer
if self.config["prioritized_replay"]:
Review comment (Contributor):
I hope later on all the ReplayBuffer casework can get pushed into the ReplayBuffer.

Reply (Contributor Author):
It's difficult. The prioritization requires extra computation that has to be part of the loss.

experience = self.replay_buffer.sample(
self.config["train_batch_size"],
beta=self.beta_schedule.value(self.global_timestep))
(obses_t, actions, rewards, obses_tp1,
dones, _, batch_idxes) = experience
dones, weights, batch_indexes) = self.replay_buffer.sample(
self.config["train_batch_size"],
beta=self.beta_schedule.value(self.global_timestep))
self._update_priorities_if_needed()
self.samples_to_prioritize = (
obses_t, actions, rewards, obses_tp1, dones, batch_idxes)
batch = SampleBatch({
Review comment (Contributor):
you could just have the replay_buffer return a SampleBatch right?

Reply (Contributor Author):
Yes, but I'm less concerned about that for now -- I'd rather keep replay_buffer unchanged as long as possible since it's still basically similar to the baselines code.

"obs": obses_t, "actions": actions, "rewards": rewards,
"new_obs": obses_tp1, "dones": dones, "weights": weights,
"batch_indexes": batch_indexes})
self.samples_to_prioritize = batch
else:
obses_t, actions, rewards, obses_tp1, dones = \
self.replay_buffer.sample(self.config["train_batch_size"])
batch_idxes = None

return self.samples_to_prioritize
batch = SampleBatch({
"obs": obses_t, "actions": actions, "rewards": rewards,
"new_obs": obses_tp1, "dones": dones,
"weights": np.ones_like(rewards)})
return batch

def compute_gradients(self, samples):
obses_t, actions, rewards, obses_tp1, dones, batch_indxes = samples
td_errors, grad = self.dqn_graph.compute_gradients(
self.sess, obses_t, actions, rewards, obses_tp1, dones,
np.ones_like(rewards))
self.sess, samples["obs"], samples["actions"], samples["rewards"],
samples["new_obs"], samples["dones"], samples["weights"])
if self.config["prioritized_replay"]:
new_priorities = (
np.abs(td_errors) + self.config["prioritized_replay_eps"])
self.replay_buffer.update_priorities(batch_indxes, new_priorities)
self.replay_buffer.update_priorities(
samples["batch_indexes"], new_priorities)
self.samples_to_prioritize = None
return grad

@@ -109,14 +116,15 @@ def _update_priorities_if_needed(self):
if not self.samples_to_prioritize:
return

obses_t, actions, rewards, obses_tp1, dones, batch_idxes = \
self.samples_to_prioritize
batch = self.samples_to_prioritize
td_errors = self.dqn_graph.compute_td_error(
self.sess, obses_t, actions, rewards, obses_tp1, dones,
np.ones_like(rewards))
self.sess, batch["obs"], batch["actions"], batch["rewards"],
batch["new_obs"], batch["dones"], batch["weights"])

new_priorities = (
np.abs(td_errors) + self.config["prioritized_replay_eps"])
self.replay_buffer.update_priorities(batch_idxes, new_priorities)
self.replay_buffer.update_priorities(
batch["batch_indexes"], new_priorities)
self.samples_to_prioritize = None

def stats(self):
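The prioritized-replay bookkeeping above boils down to: sample with importance weights and batch indexes, compute TD errors, and write |td_error| + prioritized_replay_eps back as the new priority for exactly those indexes. A self-contained sketch of that loop follows; the toy buffer and random TD errors are illustrative, not the RLlib replay buffer.

import numpy as np

PRIORITIZED_REPLAY_EPS = 1e-6  # plays the role of config["prioritized_replay_eps"]


class ToyPrioritizedBuffer(object):
    """Illustrative buffer: transitions sampled in proportion to their priority."""

    def __init__(self):
        self.storage = []
        self.priorities = []

    def add(self, transition, priority=1.0):
        self.storage.append(transition)
        self.priorities.append(priority)

    def sample(self, batch_size):
        probs = np.asarray(self.priorities) / np.sum(self.priorities)
        idxes = np.random.choice(len(self.storage), batch_size, p=probs)
        weights = 1.0 / (len(self.storage) * probs[idxes])  # unnormalized IS weights
        return idxes, weights

    def update_priorities(self, idxes, new_priorities):
        for i, p in zip(idxes, new_priorities):
            self.priorities[i] = p


buf = ToyPrioritizedBuffer()
for t in range(100):
    buf.add(("obs_%d" % t,))

idxes, weights = buf.sample(8)
td_errors = np.random.randn(8)  # stand-in for dqn_graph.compute_td_error(...)
new_priorities = np.abs(td_errors) + PRIORITIZED_REPLAY_EPS
buf.update_priorities(idxes, new_priorities)  # same pattern as compute_gradients() above
print(buf.priorities[:10])
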
50 changes: 0 additions & 50 deletions python/ray/rllib/evaluator.py

This file was deleted.

6 changes: 5 additions & 1 deletion python/ray/rllib/optimizers/__init__.py
@@ -1,6 +1,10 @@
from ray.rllib.optimizers.async import AsyncOptimizer
from ray.rllib.optimizers.local_sync import LocalSyncOptimizer
from ray.rllib.optimizers.multi_gpu import LocalMultiGPUOptimizer
from ray.rllib.optimizers.sample_batch import SampleBatch
from ray.rllib.optimizers.evaluator import Evaluator, TFMultiGPUSupport


__all__ = ["AsyncOptimizer", "LocalSyncOptimizer", "LocalMultiGPUOptimizer"]
__all__ = [
"AsyncOptimizer", "LocalSyncOptimizer", "LocalMultiGPUOptimizer",
"SampleBatch", "Evaluator", "TFMultiGPUSupport"]