Merged

85 commits
682ae7e
wip
ericl May 27, 2018
846a3a6
cls
ericl May 27, 2018
a5e1416
re
ericl May 27, 2018
cfb77be
wip
ericl May 28, 2018
7966e63
Merge branch 'fix-classmethod' into v2-refactor
ericl May 28, 2018
3c07c29
wip
ericl May 28, 2018
332683c
a3c working
ericl May 28, 2018
3cea2c9
torch support
ericl May 28, 2018
d7472e5
pg works
ericl May 28, 2018
b4a782b
lint
ericl May 28, 2018
8738fa3
rm v2
ericl May 28, 2018
a88957c
consumer id
ericl May 28, 2018
370abf0
clean up pg
ericl May 28, 2018
6c2bcbb
clean up more
ericl May 28, 2018
56429fb
fix python 2.7
ericl May 28, 2018
2380c8f
Merge branch 'fix-classmethod' into v2-refactor
ericl May 28, 2018
f16f8f0
tf session management
ericl May 28, 2018
71d78b5
docs
ericl May 28, 2018
5ab8723
dqn wip
ericl May 29, 2018
c6d68ff
fix compile
ericl May 29, 2018
fa015ff
dqn
ericl May 29, 2018
e2a41a9
apex runs
ericl May 29, 2018
84624fe
up
ericl May 29, 2018
3c4a9fd
impotrs
ericl May 29, 2018
c56dcef
ddpg
ericl May 29, 2018
04220bf
quotes
ericl May 29, 2018
6f5ef1b
Merge remote-tracking branch 'upstream/master' into v2-refactor
ericl May 29, 2018
95a69df
fix tests
ericl May 29, 2018
c62a236
fix last r
ericl May 29, 2018
a9090a4
fix tests
ericl May 29, 2018
a63efae
lint
ericl May 29, 2018
19db8bd
pass checkpoint restore
ericl May 29, 2018
c2b4243
kwar
ericl May 29, 2018
0e56fd4
nits
ericl May 30, 2018
ed0b359
policy graph
ericl May 30, 2018
70ea79d
fix yapf
ericl May 30, 2018
496946f
com
ericl May 30, 2018
53b4e55
class
ericl May 30, 2018
da02fa9
Merge remote-tracking branch 'upstream/master' into v2-refactor
ericl May 30, 2018
3657108
pyt
ericl May 30, 2018
f08544b
vectorization
ericl May 31, 2018
1f435f7
update
ericl Jun 7, 2018
6dbd0e8
Merge remote-tracking branch 'upstream/master' into v2-refactor
ericl Jun 7, 2018
f910464
test cpe
ericl Jun 7, 2018
5685e32
unit test
ericl Jun 7, 2018
f2af5dc
fix ddpg2
ericl Jun 7, 2018
06ba0af
Merge branch 'v2-refactor' into v2-vectorization
ericl Jun 7, 2018
5b27640
changes
ericl Jun 8, 2018
550fe45
wip
ericl Jun 8, 2018
ad9a205
args
ericl Jun 8, 2018
1b9b192
faster test
ericl Jun 8, 2018
21cecdd
common
ericl Jun 8, 2018
8ec7c2d
tests
ericl Jun 8, 2018
20715c5
Merge remote-tracking branch 'upstream/master' into v2-vectorization
ericl Jun 9, 2018
66f2c9d
fix
ericl Jun 9, 2018
0140db4
add alg option
ericl Jun 9, 2018
c51f799
batch mode and policy serving
ericl Jun 10, 2018
833aeb8
multi serving test
ericl Jun 10, 2018
dbce953
todo
ericl Jun 10, 2018
9ac1da2
wip
ericl Jun 10, 2018
fd9bc5b
serving test
ericl Jun 10, 2018
03f78b5
doc async env
ericl Jun 10, 2018
9bfcdaf
num envs
ericl Jun 10, 2018
23a5022
comments
ericl Jun 10, 2018
f9ff790
thread
ericl Jun 10, 2018
9f8ac7b
remove init hook
ericl Jun 10, 2018
e30424b
update
ericl Jun 11, 2018
722917b
fix ppo
ericl Jun 12, 2018
923df74
comments1
ericl Jun 12, 2018
6fc62fe
fix
ericl Jun 12, 2018
f1f7d5e
updates
ericl Jun 12, 2018
74375de
add jenkins tests
ericl Jun 13, 2018
7daf94d
fix
ericl Jun 13, 2018
544bb4e
fix pytorch
ericl Jun 13, 2018
da230c2
Merge branch 'v2-vectorization' of github.com:ericl/ray into v2-vecto…
ericl Jun 14, 2018
beaab29
fix
ericl Jun 14, 2018
27bea6b
fixes
ericl Jun 15, 2018
c8f85ce
fix a3c policy
ericl Jun 15, 2018
09df795
Merge remote-tracking branch 'upstream/master' into v2-vectorization
ericl Jun 15, 2018
f5bb43d
fix squeeze
ericl Jun 15, 2018
b3214f4
fix trunc on apex
ericl Jun 15, 2018
8ebf32f
fix squeezing for real
ericl Jun 16, 2018
516a595
update
ericl Jun 16, 2018
e191e82
remove horizon test for now
ericl Jun 17, 2018
dcb6eba
fix race condition
ericl Jun 18, 2018
6 changes: 5 additions & 1 deletion python/ray/rllib/__init__.py
@@ -9,6 +9,9 @@
from ray.rllib.utils.policy_graph import PolicyGraph
from ray.rllib.utils.tf_policy_graph import TFPolicyGraph
from ray.rllib.utils.common_policy_evaluator import CommonPolicyEvaluator
from ray.rllib.utils.async_vector_env import AsyncVectorEnv
from ray.rllib.utils.vector_env import VectorEnv
from ray.rllib.utils.serving_env import ServingEnv
from ray.rllib.optimizers.sample_batch import SampleBatch


@@ -23,5 +26,6 @@ def _register_all():
_register_all()

__all__ = [
"PolicyGraph", "TFPolicyGraph", "CommonPolicyEvaluator", "SampleBatch"
"PolicyGraph", "TFPolicyGraph", "CommonPolicyEvaluator", "SampleBatch",
"AsyncVectorEnv", "VectorEnv", "ServingEnv",
]
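After this change the three new environment interfaces are importable from the top-level rllib package. A minimal import sketch (class names taken from the diff above; their definitions live under ray.rllib.utils and are not shown here):

```python
# Minimal sketch: the new environment interfaces exported by this change.
# Subclassing details are defined in ray/rllib/utils/*_env.py (not shown here).
from ray.rllib import AsyncVectorEnv, VectorEnv, ServingEnv
```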
8 changes: 6 additions & 2 deletions python/ray/rllib/a3c/a3c.py
@@ -17,6 +17,8 @@
DEFAULT_CONFIG = {
# Number of workers (excluding master)
"num_workers": 2,
# Number of environments to evaluate vectorwise per worker.
"num_envs": 1,
# Size of rollout batch
"batch_size": 10,
# Use LSTM model - only applicable for image states
@@ -101,15 +103,17 @@ def session_creator():
batch_mode="truncate_episodes",
tf_session_creator=session_creator,
registry=self.registry, env_config=self.config["env_config"],
model_config=self.config["model"], policy_config=self.config)
model_config=self.config["model"], policy_config=self.config,
num_envs=self.config["num_envs"])
self.remote_evaluators = [
remote_cls.remote(
self.env_creator, self.policy_cls,
batch_steps=self.config["batch_size"],
batch_mode="truncate_episodes", sample_async=True,
tf_session_creator=session_creator,
registry=self.registry, env_config=self.config["env_config"],
model_config=self.config["model"], policy_config=self.config)
model_config=self.config["model"], policy_config=self.config,
num_envs=self.config["num_envs"])
for i in range(self.config["num_workers"])]

self.optimizer = AsyncOptimizer(
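A hypothetical config sketch for the new option, using the defaults shown above; "num_envs" controls how many environments each evaluator steps in a single vectorized batch (the same option is added to DDPG and DQN below):

```python
# Hypothetical usage sketch (agent construction elided): each of the two
# workers would step four environments per sample batch instead of one.
config = {
    "num_workers": 2,   # number of remote evaluators (excluding the master)
    "num_envs": 4,      # environments evaluated vectorwise per worker
    "batch_size": 10,   # rollout batch size, as in the defaults above
}
```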
8 changes: 4 additions & 4 deletions python/ray/rllib/a3c/a3c_torch_policy.py
@@ -2,6 +2,7 @@
from __future__ import division
from __future__ import print_function

import numpy as np
from threading import Lock

import torch
@@ -33,13 +34,12 @@ def setup_graph(self, obs_space, action_space):
self.optimizer = torch.optim.Adam(
self._model.parameters(), lr=self.config["lr"])

def compute_single_action(self, obs, state, is_training=False):
def compute_actions(self, obs, state, is_training=False):
assert not state, "RNN not supported"
with self.lock:
ob = torch.from_numpy(obs).float().unsqueeze(0)
ob = torch.from_numpy(np.array(obs)).float()
logits, values = self._model(ob)
samples = F.softmax(logits, dim=1).multinomial(1).squeeze()
values = values.squeeze()
samples = F.softmax(logits, dim=1).multinomial(1).squeeze(0)
return var_to_np(samples), [], {"vf_preds": var_to_np(values)}

def compute_gradients(self, samples):
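The rename from compute_single_action to compute_actions reflects that observations now arrive as a batch, one row per vectorized environment. A rough shape sketch of the per-row categorical sampling this implies, with torch.distributions standing in for the softmax/multinomial calls above and a random tensor standing in for the model output:

```python
import numpy as np
import torch

# Sketch only: `logits` stands in for the policy model's output over a batch
# of observations, one row per environment in the vector.
obs_batch = np.zeros((3, 4), dtype=np.float32)      # 3 envs, 4-dim observations
ob = torch.from_numpy(obs_batch).float()            # shape [3, 4]
logits = torch.randn(3, 6)                          # stand-in model output, 6 actions
actions = torch.distributions.Categorical(logits=logits).sample()
print(actions.shape)                                # torch.Size([3]) - one action per env
```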
85 changes: 0 additions & 85 deletions python/ray/rllib/a3c/shared_torch_policy.py

This file was deleted.

82 changes: 0 additions & 82 deletions python/ray/rllib/a3c/torchpolicy.py

This file was deleted.

2 changes: 2 additions & 0 deletions python/ray/rllib/ddpg/ddpg.py
@@ -95,6 +95,8 @@
# to increase if your environment is particularly slow to sample, or if
# you"re using the Async or Ape-X optimizers.
"num_workers": 0,
# Number of environments to evaluate vectorwise per worker.
"num_envs": 1,
# Whether to allocate GPUs for workers (if > 0).
"num_gpus_per_worker": 0,
# Whether to allocate CPUs for workers (if > 0).
12 changes: 8 additions & 4 deletions python/ray/rllib/dqn/dqn.py
@@ -89,6 +89,8 @@
# to increase if your environment is particularly slow to sample, or if
# you"re using the Async or Ape-X optimizers.
"num_workers": 0,
# Number of environments to evaluate vectorwise per worker.
"num_envs": 1,
# Whether to allocate GPUs for workers (if > 0).
"num_gpus_per_worker": 0,
# Whether to allocate CPUs for workers (if > 0).
@@ -125,21 +127,23 @@ def _init(self):
self.local_evaluator = CommonPolicyEvaluator(
self.env_creator, self._policy_graph,
batch_steps=adjusted_batch_size,
batch_mode="pack_episodes", preprocessor_pref="deepmind",
batch_mode="truncate_episodes", preprocessor_pref="deepmind",
compress_observations=True,
registry=self.registry, env_config=self.config["env_config"],
model_config=self.config["model"], policy_config=self.config)
model_config=self.config["model"], policy_config=self.config,
num_envs=self.config["num_envs"])
remote_cls = CommonPolicyEvaluator.as_remote(
num_cpus=self.config["num_cpus_per_worker"],
num_gpus=self.config["num_gpus_per_worker"])
self.remote_evaluators = [
remote_cls.remote(
self.env_creator, self._policy_graph,
batch_steps=adjusted_batch_size,
batch_mode="pack_episodes", preprocessor_pref="deepmind",
batch_mode="truncate_episodes", preprocessor_pref="deepmind",
compress_observations=True,
registry=self.registry, env_config=self.config["env_config"],
model_config=self.config["model"], policy_config=self.config)
model_config=self.config["model"], policy_config=self.config,
num_envs=self.config["num_envs"])
for _ in range(self.config["num_workers"])]

self.exploration0 = self._make_exploration_schedule(0)
4 changes: 1 addition & 3 deletions python/ray/rllib/dqn/dqn_policy_graph.py
@@ -223,11 +223,9 @@ def _postprocess_dqn(policy_graph, sample_batch):
"obs": obs, "actions": actions, "rewards": rewards,
"new_obs": new_obs, "dones": dones,
"weights": np.ones_like(rewards)})
assert batch.count == policy_graph.config["sample_batch_size"], \
(batch.count, policy_graph.config["sample_batch_size"])

# Prioritize on the worker side
if policy_graph.config["worker_side_prioritization"]:
if batch.count > 0 and policy_graph.config["worker_side_prioritization"]:
td_errors = policy_graph.compute_td_error(
batch["obs"], batch["actions"], batch["rewards"],
batch["new_obs"], batch["dones"], batch["weights"])
2 changes: 1 addition & 1 deletion python/ray/rllib/models/action_dist.py
@@ -63,7 +63,7 @@ def kl(self, other):
reduction_indices=[1])

def sample(self):
return tf.multinomial(self.inputs, 1)[0]
return tf.squeeze(tf.multinomial(self.inputs, 1), axis=1)


class DiagGaussian(ActionDistribution):
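The sample() change returns one action id per batch row instead of indexing out a single sample. A small shape sketch under the TF 1.x graph API in use here (the Categorical distribution's inputs are its logits):

```python
import numpy as np
import tensorflow as tf

# Shape sketch, assuming the TF 1.x graph API used by RLlib at this point:
# logits [batch, num_actions] -> sampled action ids of shape [batch].
logits = tf.constant(np.zeros((4, 6), dtype=np.float32))
actions = tf.squeeze(tf.multinomial(logits, 1), axis=1)

with tf.Session() as sess:
    print(sess.run(actions).shape)  # (4,)
```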
6 changes: 0 additions & 6 deletions python/ray/rllib/models/preprocessors.py
@@ -125,22 +125,16 @@ def get_preprocessor(space):

legacy_patch_shapes(space)
obs_shape = space.shape
print("Observation shape is {}".format(obs_shape))

if isinstance(space, gym.spaces.Discrete):
print("Using one-hot preprocessor for discrete envs.")
preprocessor = OneHotPreprocessor
elif obs_shape == ATARI_OBS_SHAPE:
print("Assuming Atari pixel env, using AtariPixelPreprocessor.")
preprocessor = AtariPixelPreprocessor
elif obs_shape == ATARI_RAM_OBS_SHAPE:
print("Assuming Atari ram env, using AtariRamPreprocessor.")
preprocessor = AtariRamPreprocessor
elif isinstance(space, gym.spaces.Tuple):
print("Using a TupleFlatteningPreprocessor")
preprocessor = TupleFlatteningPreprocessor
else:
print("Not using any observation preprocessor.")
preprocessor = NoPreprocessor

return preprocessor
2 changes: 1 addition & 1 deletion python/ray/rllib/models/pytorch/fcnet.py
@@ -56,5 +56,5 @@ def forward(self, obs):
value: value function for each state"""
res = self.hidden_layers(obs)
logits = self.logits(res)
value = self.value_branch(res).reshape(-1)
value = self.value_branch(res).squeeze(1)
return logits, value
2 changes: 1 addition & 1 deletion python/ray/rllib/models/pytorch/visionnet.py
@@ -65,5 +65,5 @@ def forward(self, obs):
value (PyTorch): value function for each state"""
res = self.hidden_layers(obs)
logits = self.logits(res)
value = self.value_branch(res)
value = self.value_branch(res).squeeze(1)
return logits, value
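Both torch models now squeeze the trailing dimension of the value branch, so the value head returns one entry per sample rather than a [batch, 1] column. A quick shape check of that pattern:

```python
import torch

# Shape sketch: the value branch emits [batch, 1]; squeeze(1) yields a
# per-sample value vector of shape [batch], matching the logits batch size.
value_out = torch.zeros(4, 1)        # stand-in for self.value_branch(res)
print(value_out.squeeze(1).shape)    # torch.Size([4])
```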
5 changes: 5 additions & 0 deletions python/ray/rllib/optimizers/policy_evaluator.py
@@ -105,6 +105,11 @@ def get_host(self):

return os.uname()[1]

def apply(self, func, *args):
"""Apply the given function to this evaluator instance."""

return func(self, *args)


class TFMultiGPUSupport(PolicyEvaluator):
"""The multi-GPU TF optimizer requires additional TF-specific support.
20 changes: 20 additions & 0 deletions python/ray/rllib/optimizers/policy_optimizer.py
@@ -110,3 +110,23 @@ def restore(self, data):

self.num_steps_trained = data[0]
self.num_steps_sampled = data[1]

def foreach_evaluator(self, func):
"""Apply the given function to each evaluator instance."""

local_result = [func(self.local_evaluator)]
remote_results = ray.get(
[ev.apply.remote(func) for ev in self.remote_evaluators])
return local_result + remote_results

def foreach_evaluator_with_index(self, func):
"""Apply the given function to each evaluator instance.

The index will be passed as the second arg to the given function.
"""

local_result = [func(self.local_evaluator, 0)]
remote_results = ray.get(
[ev.apply.remote(func, i + 1)
for i, ev in enumerate(self.remote_evaluators)])
return local_result + remote_results
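A hypothetical usage sketch for the two new helpers; `optimizer` stands in for any constructed PolicyOptimizer, and get_host() is the evaluator method shown earlier in this diff:

```python
# Hypothetical usage sketch (optimizer construction elided). Results come back
# as [local_result, remote_result_0, remote_result_1, ...].
hosts = optimizer.foreach_evaluator(lambda ev: ev.get_host())

# The evaluator index (0 for the local evaluator, 1.. for remote evaluators)
# is passed as the second argument.
tagged = optimizer.foreach_evaluator_with_index(
    lambda ev, i: (i, ev.get_host()))
```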