
Commit 5acd3e6

[rllib] Fix torch TD error, IMPALA LR updates (#9477)
* update
* add test
* lint
* fix super call
* speed es test up
1 parent ea4797b commit 5acd3e6

File tree: 7 files changed (+97, -54 lines)

rllib/agents/dqn/dqn_torch_policy.py

Lines changed: 2 additions & 5 deletions
@@ -52,7 +52,6 @@ def __init__(self,
             "mean_q": torch.mean(q_t_selected),
             "min_q": torch.min(q_t_selected),
             "max_q": torch.max(q_t_selected),
-            "td_error": self.td_error,
             "mean_td_error": torch.mean(self.td_error),
         }
 
@@ -250,10 +249,7 @@ def compute_q_values(policy, model, obs, explore, is_training=False):
 
 def grad_process_and_td_error_fn(policy, optimizer, loss):
     # Clip grads if configured.
-    info = apply_grad_clipping(policy, optimizer, loss)
-    # Add td-error to info dict.
-    info["td_error"] = policy.q_loss.td_error
-    return info
+    return apply_grad_clipping(policy, optimizer, loss)
 
 
 def extra_action_out_fn(policy, input_dict, state_batches, model, action_dist):
@@ -270,6 +266,7 @@ def extra_action_out_fn(policy, input_dict, state_batches, model, action_dist):
     postprocess_fn=postprocess_nstep_and_prio,
     optimizer_fn=adam_optimizer,
     extra_grad_process_fn=grad_process_and_td_error_fn,
+    extra_learn_fetches_fn=lambda policy: {"td_error": policy.q_loss.td_error},
     extra_action_out_fn=extra_action_out_fn,
     before_init=setup_early_mixins,
     after_init=after_init,
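With this change the TD error is no longer smuggled through the grad-process info or the stats dict; it is returned by extra_learn_fetches_fn and therefore surfaces at the top level of the fetch dict that learn_on_batch() returns, next to LEARNER_STATS_KEY. A minimal sketch of how a caller might read it after this commit; `policy` and `sample_batch` are assumed to be an already-built torch DQN policy and a postprocessed SampleBatch:

from ray.rllib.policy.policy import LEARNER_STATS_KEY

# Assumed: `policy` is a torch DQN policy built from the template above and
# `sample_batch` is a postprocessed SampleBatch.
fetches = policy.learn_on_batch(sample_batch)

stats = fetches[LEARNER_STATS_KEY]  # mean_q, mean_td_error, ... as before
td_error = fetches["td_error"]      # per-sample TD errors, now a top-level fetch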

rllib/agents/dqn/simple_q_torch_policy.py

Lines changed: 1 addition & 1 deletion
@@ -96,5 +96,5 @@ def setup_late_mixins(policy, obs_space, action_space, config):
     make_model_and_action_dist=build_q_model_and_distribution,
     mixins=[TargetNetworkMixin],
     action_distribution_fn=get_distribution_inputs_and_class,
-    stats_fn=lambda policy, config: {"td_error": policy.td_error},
+    extra_learn_fetches_fn=lambda policy: {"td_error": policy.td_error},
 )

rllib/agents/es/tests/test_es.py

Lines changed: 5 additions & 2 deletions
@@ -9,14 +9,17 @@
 class TestES(unittest.TestCase):
     def test_es_compilation(self):
         """Test whether an ESTrainer can be built on all frameworks."""
-        ray.init()
+        ray.init(num_cpus=2)
         config = es.DEFAULT_CONFIG.copy()
         # Keep it simple.
         config["model"]["fcnet_hiddens"] = [10]
         config["model"]["fcnet_activation"] = None
         config["noise_size"] = 2500000
+        config["num_workers"] = 1
+        config["episodes_per_batch"] = 10
+        config["train_batch_size"] = 100
 
-        num_iterations = 2
+        num_iterations = 1
 
         for _ in framework_iterator(config):
             plain_config = config.copy()
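The test now runs a single iteration with one worker and small batches purely to keep CI fast. For reference, a standalone sketch with the same slimmed-down settings; the env choice and the single train() call are illustrative, not part of the commit:

import ray
from ray.rllib.agents import es

ray.init(num_cpus=2)
config = es.DEFAULT_CONFIG.copy()
config["model"]["fcnet_hiddens"] = [10]
config["model"]["fcnet_activation"] = None
config["noise_size"] = 2500000
config["num_workers"] = 1
config["episodes_per_batch"] = 10
config["train_batch_size"] = 100

trainer = es.ESTrainer(config=config, env="CartPole-v0")
print(trainer.train())  # one fast iteration
trainer.stop()
ray.shutdown()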

rllib/agents/impala/impala.py

Lines changed: 2 additions & 0 deletions
@@ -194,6 +194,8 @@ def __call__(self, item):
         metrics = _get_shared_metrics()
         metrics.counters["num_weight_broadcasts"] += 1
         actor.set_weights.remote(self.weights, _get_global_vars())
+        # Also update global vars of the local worker.
+        self.workers.local_worker().set_global_vars(_get_global_vars())
 
 
 def record_steps_trained(item):
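The LR schedule is evaluated against the global timestep carried in the global vars, and the loss runs on the local (learner) worker; without this extra call the learner never sees an updated timestep and cur_lr stays frozen at its initial value. Roughly, a schedule such as [[0, 0.0005], [10000, 0.000001]] is linearly interpolated over that timestep; the helper below is an illustrative sketch, not RLlib's actual schedule class:

def piecewise_lr(schedule, timestep):
    """Linearly interpolate a [[timestep, lr], ...] schedule (illustrative only)."""
    if timestep <= schedule[0][0]:
        return schedule[0][1]
    if timestep >= schedule[-1][0]:
        return schedule[-1][1]
    # Find the surrounding breakpoints and interpolate between them.
    for (t0, lr0), (t1, lr1) in zip(schedule, schedule[1:]):
        if t0 <= timestep <= t1:
            frac = (timestep - t0) / (t1 - t0)
            return lr0 + frac * (lr1 - lr0)


print(piecewise_lr([[0, 0.0005], [10000, 0.000001]], 5000))  # ~0.00025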

rllib/agents/impala/tests/test_impala.py

Lines changed: 19 additions & 0 deletions
@@ -18,6 +18,25 @@ def setUpClass(cls) -> None:
     def tearDownClass(cls) -> None:
         ray.shutdown()
 
+    def test_impala_lr_schedule(self):
+        config = impala.DEFAULT_CONFIG.copy()
+        config["lr_schedule"] = [
+            [0, 0.0005],
+            [10000, 0.000001],
+        ]
+        local_cfg = config.copy()
+        trainer = impala.ImpalaTrainer(config=local_cfg, env="CartPole-v0")
+
+        def get_lr(result):
+            return result["info"]["learner"]["default_policy"]["cur_lr"]
+
+        try:
+            r1 = trainer.train()
+            r2 = trainer.train()
+            assert get_lr(r2) < get_lr(r1), (r1, r2)
+        finally:
+            trainer.stop()
+
     def test_impala_compilation(self):
         """Test whether an ImpalaTrainer can be built with both frameworks."""
         config = impala.DEFAULT_CONFIG.copy()
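The new test exercises the fix end to end: two consecutive train() calls must report a strictly decreasing cur_lr. Outside the unit test, the same schedule can be exercised through Tune; a hedged sketch (stopping criterion and worker count here are arbitrary), with the decaying learning rate visible under info/learner/default_policy/cur_lr in each result:

import ray
from ray import tune

ray.init()
tune.run(
    "IMPALA",
    stop={"training_iteration": 2},
    config={
        "env": "CartPole-v0",
        "num_workers": 1,
        "lr_schedule": [
            [0, 0.0005],
            [10000, 0.000001],
        ],
    },
)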

rllib/policy/torch_policy.py

Lines changed: 54 additions & 45 deletions
@@ -40,23 +40,22 @@ class TorchPolicy(Policy):
     """
 
     @DeveloperAPI
-    def __init__(self,
-                 observation_space: gym.spaces.Space,
-                 action_space: gym.spaces.Space,
-                 config: TrainerConfigDict,
-                 *,
-                 model: ModelV2,
-                 loss: Callable[
-                     [Policy, ModelV2, type, SampleBatch], TensorType],
-                 action_distribution_class: TorchDistributionWrapper,
-                 action_sampler_fn: Callable[
-                     [TensorType, List[TensorType]], Tuple[
-                         TensorType, TensorType]] = None,
-                 action_distribution_fn: Optional[Callable[
-                     [Policy, ModelV2, TensorType, TensorType, TensorType],
-                     Tuple[TensorType, type, List[TensorType]]]] = None,
-                 max_seq_len: int = 20,
-                 get_batch_divisibility_req: Optional[int] = None):
+    def __init__(
+            self,
+            observation_space: gym.spaces.Space,
+            action_space: gym.spaces.Space,
+            config: TrainerConfigDict,
+            *,
+            model: ModelV2,
+            loss: Callable[[Policy, ModelV2, type, SampleBatch], TensorType],
+            action_distribution_class: TorchDistributionWrapper,
+            action_sampler_fn: Callable[[TensorType, List[TensorType]], Tuple[
+                TensorType, TensorType]] = None,
+            action_distribution_fn: Optional[Callable[[
+                Policy, ModelV2, TensorType, TensorType, TensorType
+            ], Tuple[TensorType, type, List[TensorType]]]] = None,
+            max_seq_len: int = 20,
+            get_batch_divisibility_req: Optional[int] = None):
         """Build a policy from policy and loss torch modules.
 
         Note that model will be placed on GPU device if CUDA_VISIBLE_DEVICES
@@ -165,8 +164,8 @@ def compute_actions(
             extra_fetches[SampleBatch.ACTION_PROB] = np.exp(logp)
             extra_fetches[SampleBatch.ACTION_LOGP] = logp
 
-            return convert_to_non_torch_type(
-                (actions, state_out, extra_fetches))
+            return convert_to_non_torch_type((actions, state_out,
+                                               extra_fetches))
 
     @override(Policy)
     def compute_actions_from_trajectories(
@@ -183,8 +182,9 @@ def compute_actions_from_trajectories(
 
         with torch.no_grad():
             # Create a view and pass that to Model as `input_dict`.
-            input_dict = self._lazy_tensor_dict(get_trajectory_view(
-                self.model, trajectories, is_training=False))
+            input_dict = self._lazy_tensor_dict(
+                get_trajectory_view(
+                    self.model, trajectories, is_training=False))
             # TODO: (sven) support RNNs w/ fast sampling.
             state_batches = []
             seq_lens = None
@@ -232,8 +232,8 @@ def _compute_action_helper(self, input_dict, state_batches, seq_lens,
                     is_training=False)
             else:
                 dist_class = self.dist_class
-                dist_inputs, state_out = self.model(
-                    input_dict, state_batches, seq_lens)
+                dist_inputs, state_out = self.model(input_dict, state_batches,
+                                                    seq_lens)
 
             if not (isinstance(dist_class, functools.partial)
                     or issubclass(dist_class, TorchDistributionWrapper)):
@@ -270,10 +270,10 @@ def compute_log_likelihoods(
             actions: Union[List[TensorType], TensorType],
             obs_batch: Union[List[TensorType], TensorType],
             state_batches: Optional[List[TensorType]] = None,
-            prev_action_batch: Optional[
-                Union[List[TensorType], TensorType]] = None,
-            prev_reward_batch: Optional[
-                Union[List[TensorType], TensorType]] = None) -> TensorType:
+            prev_action_batch: Optional[Union[List[TensorType],
+                                              TensorType]] = None,
+            prev_reward_batch: Optional[Union[List[
+                TensorType], TensorType]] = None) -> TensorType:
 
         if self.action_sampler_fn and self.action_distribution_fn is None:
             raise ValueError("Cannot compute log-prob/likelihood w/o an "
@@ -314,8 +314,8 @@ def compute_log_likelihoods(
 
     @override(Policy)
     @DeveloperAPI
-    def learn_on_batch(self, postprocessed_batch: SampleBatch) -> Dict[
-            str, TensorType]:
+    def learn_on_batch(
+            self, postprocessed_batch: SampleBatch) -> Dict[str, TensorType]:
         # Get batch ready for RNNs, if applicable.
         pad_batch_to_sequences_of_same_size(
             postprocessed_batch,
@@ -331,6 +331,7 @@ def learn_on_batch(self, postprocessed_batch: SampleBatch) -> Dict[
             loss_out = self.model.custom_loss(loss_out, train_batch)
         assert len(loss_out) == len(self._optimizers)
         # assert not any(torch.isnan(l) for l in loss_out)
+        fetches = self.extra_compute_grad_fetches()
 
         # Loop through all optimizers.
         grad_info = {"allreduce_latency": 0.0}
@@ -370,7 +371,7 @@ def learn_on_batch(self, postprocessed_batch: SampleBatch) -> Dict[
 
         grad_info["allreduce_latency"] /= len(self._optimizers)
         grad_info.update(self.extra_grad_info(train_batch))
-        return {LEARNER_STATS_KEY: grad_info}
+        return dict(fetches, **{LEARNER_STATS_KEY: grad_info})
 
     @override(Policy)
     @DeveloperAPI
@@ -380,6 +381,7 @@ def compute_gradients(self,
         loss_out = force_list(
             self._loss(self, self.model, self.dist_class, train_batch))
         assert len(loss_out) == len(self._optimizers)
+        fetches = self.extra_compute_grad_fetches()
 
         grad_process_info = {}
         grads = []
@@ -399,7 +401,7 @@
 
         grad_info = self.extra_grad_info(train_batch)
         grad_info.update(grad_process_info)
-        return grads, {LEARNER_STATS_KEY: grad_info}
+        return grads, dict(fetches, **{LEARNER_STATS_KEY: grad_info})
 
     @override(Policy)
     @DeveloperAPI
@@ -466,10 +468,8 @@ def set_state(self, state: object) -> None:
         super().set_state(state)
 
     @DeveloperAPI
-    def extra_grad_process(
-            self,
-            optimizer: "torch.optim.Optimizer",
-            loss: TensorType):
+    def extra_grad_process(self, optimizer: "torch.optim.Optimizer",
+                           loss: TensorType):
         """Called after each optimizer.zero_grad() + loss.backward() call.
 
         Called for each self._optimizers/loss-value pair.
@@ -486,12 +486,20 @@ def extra_grad_process(
         """
         return {}
 
+    @DeveloperAPI
+    def extra_compute_grad_fetches(self) -> Dict[str, any]:
+        """Extra values to fetch and return from compute_gradients().
+
+        Returns:
+            Dict[str, any]: Extra fetch dict to be added to the fetch dict
+                of the compute_gradients call.
+        """
+        return {LEARNER_STATS_KEY: {}}  # e.g, stats, td error, etc.
+
     @DeveloperAPI
     def extra_action_out(
-            self,
-            input_dict: Dict[str, TensorType],
-            state_batches: List[TensorType],
-            model: TorchModelV2,
+            self, input_dict: Dict[str, TensorType],
+            state_batches: List[TensorType], model: TorchModelV2,
             action_dist: TorchDistributionWrapper) -> Dict[str, TensorType]:
         """Returns dict of extra info to include in experience batch.
 
@@ -509,8 +517,8 @@ def extra_action_out(
         return {}
 
     @DeveloperAPI
-    def extra_grad_info(self, train_batch: SampleBatch) -> Dict[
-            str, TensorType]:
+    def extra_grad_info(self,
+                        train_batch: SampleBatch) -> Dict[str, TensorType]:
        """Return dict of extra grad info.
 
         Args:
@@ -524,8 +532,9 @@ def extra_grad_info(self, train_batch: SampleBatch) -> Dict[
         return {}
 
     @DeveloperAPI
-    def optimizer(self) -> Union[
-            List["torch.optim.Optimizer"], "torch.optim.Optimizer"]:
+    def optimizer(
+            self
+    ) -> Union[List["torch.optim.Optimizer"], "torch.optim.Optimizer"]:
         """Custom the local PyTorch optimizer(s) to use.
 
         Returns:
@@ -560,8 +569,8 @@ def import_model_from_h5(self, import_file: str) -> None:
 
     def _lazy_tensor_dict(self, postprocessed_batch):
         train_batch = UsageTrackingDict(postprocessed_batch)
-        train_batch.set_get_interceptor(functools.partial(
-            convert_to_torch_tensor, device=self.device))
+        train_batch.set_get_interceptor(
+            functools.partial(convert_to_torch_tensor, device=self.device))
         return train_batch
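The new extra_compute_grad_fetches() hook is the generic mechanism behind the DQN change: whatever it returns is merged into the fetch dict of learn_on_batch() and compute_gradients(), alongside (not inside) LEARNER_STATS_KEY. A minimal sketch of overriding it in a hand-written subclass; MyTorchPolicy and self.td_error are assumptions, and the convert_to_non_torch_type import path is taken from the surrounding code:

from ray.rllib.policy.policy import LEARNER_STATS_KEY
from ray.rllib.policy.torch_policy import TorchPolicy
from ray.rllib.utils.torch_ops import convert_to_non_torch_type


class MyTorchPolicy(TorchPolicy):
    def extra_compute_grad_fetches(self):
        # Keep the (possibly empty) learner-stats entry and add our own
        # top-level fetch; torch tensors are converted to numpy first.
        return dict(
            {LEARNER_STATS_KEY: {}},
            **convert_to_non_torch_type({"td_error": self.td_error}))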

rllib/policy/torch_policy_template.py

Lines changed: 14 additions & 1 deletion
@@ -1,4 +1,4 @@
-from ray.rllib.policy.policy import Policy
+from ray.rllib.policy.policy import Policy, LEARNER_STATS_KEY
 from ray.rllib.policy.torch_policy import TorchPolicy
 from ray.rllib.models.catalog import ModelCatalog
 from ray.rllib.models.torch.torch_modelv2 import TorchModelV2
@@ -19,6 +19,7 @@ def build_torch_policy(name,
                        postprocess_fn=None,
                        extra_action_out_fn=None,
                        extra_grad_process_fn=None,
+                       extra_learn_fetches_fn=None,
                        optimizer_fn=None,
                        validate_spaces=None,
                        before_init=None,
@@ -47,6 +48,8 @@ def build_torch_policy(name,
             returns a dict of extra values to include in experiences.
         extra_grad_process_fn (Optional[callable]): Optional callable that is
             called after gradients are computed and returns processing info.
+        extra_learn_fetches_fn (func): optional function that returns a dict of
+            extra values to fetch from the policy after loss evaluation.
         optimizer_fn (Optional[callable]): Optional callable that returns a
             torch optimizer given the policy and config.
         validate_spaces (Optional[callable]): Optional callable that takes the
@@ -179,6 +182,16 @@ def extra_grad_process(self, optimizer, loss):
             else:
                 return TorchPolicy.extra_grad_process(self, optimizer, loss)
 
+        @override(TorchPolicy)
+        def extra_compute_grad_fetches(self):
+            if extra_learn_fetches_fn:
+                fetches = convert_to_non_torch_type(
+                    extra_learn_fetches_fn(self))
+                # Auto-add empty learner stats dict if needed.
+                return dict({LEARNER_STATS_KEY: {}}, **fetches)
+            else:
+                return TorchPolicy.extra_compute_grad_fetches(self)
+
         @override(TorchPolicy)
         def apply_gradients(self, gradients):
             if apply_gradients_fn:
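For template-built policies the hook is wired up via the new extra_learn_fetches_fn argument, exactly as the DQN and SimpleQ policies above do. A hedged sketch with a toy loss; my_loss_fn and its fake "TD error" are placeholders, not part of the commit:

import torch

from ray.rllib.policy.sample_batch import SampleBatch
from ray.rllib.policy.torch_policy_template import build_torch_policy


def my_loss_fn(policy, model, dist_class, train_batch):
    # Toy loss: treat the gap between the mean model output and the reward
    # as a stand-in "TD error" and stash it for extra_learn_fetches_fn.
    logits, _ = model.from_batch(train_batch)
    policy.td_error = torch.mean(logits, 1) - train_batch[SampleBatch.REWARDS]
    return torch.mean(policy.td_error ** 2)


MyTorchPolicy = build_torch_policy(
    name="MyTorchPolicy",
    loss_fn=my_loss_fn,
    extra_learn_fetches_fn=lambda policy: {"td_error": policy.td_error},
)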
