[RLlib] Support native tf.keras.Model (milestone toward obsoleting ModelV2 class). #14684
Changes to the PPO TF policy:

@@ -29,21 +29,29 @@
 def ppo_surrogate_loss(
-        policy: Policy, model: ModelV2, dist_class: Type[TFActionDistribution],
+        policy: Policy, model: Union[ModelV2, tf.keras.Model],
+        dist_class: Type[TFActionDistribution],
         train_batch: SampleBatch) -> Union[TensorType, List[TensorType]]:
     """Constructs the loss for Proximal Policy Objective.

     Args:
         policy (Policy): The Policy to calculate the loss for.
-        model (ModelV2): The Model to calculate the loss for.
+        model (Union[ModelV2, tf.keras.Model]): The Model to calculate
+            the loss for.
         dist_class (Type[ActionDistribution]): The action distr. class.
         train_batch (SampleBatch): The training data.

     Returns:
         Union[TensorType, List[TensorType]]: A single loss tensor or a list
             of loss tensors.
     """
-    logits, state = model(train_batch)
+    if isinstance(model, tf.keras.Model):
+        logits, state, extra_outs = model(train_batch)
+        value_fn_out = extra_outs[SampleBatch.VF_PREDS]
+    else:
+        logits, state = model(train_batch)
+        value_fn_out = model.value_function()

     curr_action_dist = dist_class(logits, model)

     # RNN case: Mask away 0-padded chunks at end of time axis.
@@ -86,7 +94,6 @@ def reduce_mean_valid(t):

     if policy.config["use_gae"]:
         prev_value_fn_out = train_batch[SampleBatch.VF_PREDS]
-        value_fn_out = model.value_function()
         vf_loss1 = tf.math.square(value_fn_out -
                                   train_batch[Postprocessing.VALUE_TARGETS])
         vf_clipped = prev_value_fn_out + tf.clip_by_value(

Review comment (on the removed value_function() call): One reason for the new API is that we can get rid of these async

@@ -112,6 +119,7 @@ def reduce_mean_valid(t):
     policy._mean_vf_loss = mean_vf_loss
     policy._mean_entropy = mean_entropy
     policy._mean_kl = mean_kl
+    policy._value_fn_out = value_fn_out

     return total_loss

@@ -134,14 +142,14 @@ def kl_and_loss_stats(policy: Policy,
         "policy_loss": policy._mean_policy_loss,
         "vf_loss": policy._mean_vf_loss,
         "vf_explained_var": explained_variance(
-            train_batch[Postprocessing.VALUE_TARGETS],
-            policy.model.value_function()),
+            train_batch[Postprocessing.VALUE_TARGETS], policy._value_fn_out),
         "kl": policy._mean_kl,
         "entropy": policy._mean_entropy,
         "entropy_coeff": tf.cast(policy.entropy_coeff, tf.float64),
     }


+# TODO: (sven) Deprecate once we only allow native keras models.
 def vf_preds_fetches(policy: Policy) -> Dict[str, TensorType]:
     """Defines extra fetches per action computation.

@@ -152,6 +160,10 @@ def vf_preds_fetches(policy: Policy) -> Dict[str, TensorType]:
         Dict[str, TensorType]: Dict with extra tf fetches to perform per
             action computation.
     """
+    # Keras models return values for each call in third return argument
+    # (dict).
+    if isinstance(policy.model, tf.keras.Model):
+        return {}
     # Return value function outputs. VF estimates will hence be added to the
     # SampleBatches produced by the sampler(s) to generate the train batches
     # going into the loss function.

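For context, the pre-existing ModelV2 branch of vf_preds_fetches is not shown in this hunk. The sketch below reconstructs the whole function as it presumably reads after this change; the ModelV2 return value is an assumption based on standard RLlib behavior, and the module's existing imports (tf, SampleBatch) are assumed:

def vf_preds_fetches_sketch(policy):
    # Hypothetical reconstruction, not copied from this diff.
    if isinstance(policy.model, tf.keras.Model):
        # Native keras models already emit VF_PREDS via the third (extra-outs)
        # return value of their call(), so nothing extra needs to be fetched.
        return {}
    # ModelV2 path: fetch the value-function output alongside each computed
    # action so it lands in the sampled batches.
    return {SampleBatch.VF_PREDS: policy.model.value_function()}
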
@@ -256,9 +268,13 @@ def __init__(self, obs_space, action_space, config):
             @make_tf_callable(self.get_session())
             def value(**input_dict):
                 input_dict = SampleBatch(input_dict)
-                model_out, _ = self.model(input_dict)
-                # [0] = remove the batch dim.
-                return self.model.value_function()[0]
+                if isinstance(self.model, tf.keras.Model):
+                    _, _, extra_outs = self.model(input_dict)
+                    return extra_outs[SampleBatch.VF_PREDS][0]
+                else:
+                    model_out, _ = self.model(input_dict)
+                    # [0] = remove the batch dim.
+                    return self.model.value_function()[0]

         # TODO: (sven) Remove once trajectory view API is all-algo default.
         else:
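For reference, the value() closure above is typically stored on the policy (as self._value) and used during GAE postprocessing to bootstrap the value estimate of a truncated rollout's last timestep. The snippet below is a hypothetical call site with illustrative keys, not code from this PR; the exact keys depend on the model's view requirements:

# Hypothetical bootstrap of the last timestep's value (illustrative keys only).
last_r = policy._value(
    obs=sample_batch[SampleBatch.NEXT_OBS][-1:],
    prev_actions=sample_batch[SampleBatch.ACTIONS][-1:],
    prev_rewards=sample_batch[SampleBatch.REWARDS][-1:])
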
Changes to the example RNN model (native keras version):

@@ -1,5 +1,6 @@
 import numpy as np

+from ray.rllib.policy.sample_batch import SampleBatch
 from ray.rllib.utils.framework import try_import_tf, try_import_torch

 tf1, tf, tfv = try_import_tf()

@@ -13,11 +14,11 @@ def __init__(self,
                  input_space,
                  action_space,
                  num_outputs,
-                 model_config,
-                 name,
+                 *,
+                 name="",
                  hiddens_size=256,
                  cell_size=64):
-        super(RNNModel, self).__init__()
+        super().__init__(name=name)

         self.cell_size = cell_size

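With name now keyword-only (and model_config dropped from the signature), constructing the example model would look roughly as follows; the argument values are illustrative:

# Hypothetical instantiation (spaces and output size depend on the env).
model = RNNModel(input_space, action_space, num_outputs,
                 name="example_rnn", hiddens_size=256, cell_size=64)
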
@@ -46,14 +47,11 @@ def call(self, sample_batch):
         )
         lstm_out = tf.reshape(lstm_out, [-1, lstm_out.shape.as_list()[2]])
         logits = self.logits(lstm_out)
-        self._value_out = self.values(lstm_out)
-        return logits, [h, c]
+        values = tf.reshape(self.values(lstm_out), [-1])
+        return logits, [h, c], {SampleBatch.VF_PREDS: values}

     def get_initial_state(self):
         return [
             np.zeros(self.cell_size, np.float32),
             np.zeros(self.cell_size, np.float32),
         ]

-    def value_function(self):
-        return tf.reshape(self._value_out, [-1])

Review comment (on get_initial_state): We should be able to get rid of this as well in the future via the trajectory view API.
Review comment: Some users would want different types of initial states (e.g. Xavier initialization), right?
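Regarding the review question above: a model wanting non-zero initial states could simply override get_initial_state() with a different initializer. A minimal sketch of such a variant inside the model class, using Glorot/Xavier uniform (eager mode assumed; not part of this PR):

    def get_initial_state(self):
        # Hypothetical variant: Xavier/Glorot-initialized initial LSTM state
        # instead of zeros.
        initializer = tf.keras.initializers.GlorotUniform()
        return [
            initializer(shape=(self.cell_size,)).numpy(),
            initializer(shape=(self.cell_size,)).numpy(),
        ]
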
Review comment: New model API: native + return 3 values (model-out, state-outs, extra-outs (e.g. vf)).
Review comment: Assuming ModelV2 will be deprecated soon :)
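To make the new contract concrete, here is a minimal, self-contained sketch (not from this PR) of a non-recurrent native keras model following the 3-value convention; the class name, layer sizes, and dummy input are made up:

import tensorflow as tf
from ray.rllib.policy.sample_batch import SampleBatch

class TinyKerasModel(tf.keras.Model):
    """Hypothetical minimal model following the new call() contract."""

    def __init__(self, num_outputs=2, hidden=32, name="tiny_model"):
        super().__init__(name=name)
        self.hidden_layer = tf.keras.layers.Dense(hidden, activation="relu")
        self.logits_layer = tf.keras.layers.Dense(num_outputs)
        self.value_layer = tf.keras.layers.Dense(1)

    def call(self, input_dict):
        features = self.hidden_layer(input_dict["obs"])
        logits = self.logits_layer(features)
        values = tf.reshape(self.value_layer(features), [-1])
        # model-out, state-outs (none here), extra-outs (incl. VF preds).
        return logits, [], {SampleBatch.VF_PREDS: values}

# Usage sketch (eager mode); a plain dict stands in for the SampleBatch here.
model = TinyKerasModel()
logits, state_out, extra_outs = model({"obs": tf.ones([4, 8])})
vf_preds = extra_outs[SampleBatch.VF_PREDS]  # shape [4], consumed by the loss above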