Commit b2eb1b8

tcbegley and vmoens authored
[BugFix] Restore missing keys in data collector output (#521)

* Ensure data collectors return all expected keys
* Rerun CI
* Add tests
* Format code
* correct unreachable test
* Fix broken test
* WIP: fix initialisation with policy + test
* Fix initialisation with policy + test
* Reset env after rollout initialisation
* fix build from spec
* Check policy has spec attribute before accessing
* Address comments

Co-authored-by: vmoens <vincentmoens@gmail.com>
1 parent bd0120e · commit b2eb1b8

File tree

2 files changed: +156 −7 lines changed
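Context for the fix: the collector's output buffer used to start empty and acquire its keys lazily from the first frames collected. With `init_random_frames > 0`, those first frames bypass the policy, so policy-only outputs (e.g. recurrent hidden states) never made it into the buffer. A minimal sketch of that failure mode, using plain dicts as hypothetical stand-ins for TensorDicts:

# Hypothetical sketch of the pre-fix behaviour: keys are frozen from the
# first step seen, so policy-only keys vanish when collection starts randomly.
def collect(buffer, step):
    if not buffer:
        buffer.update({k: [] for k in step})  # lazy init from first step
    for k in buffer:
        buffer[k].append(step.get(k))

buffer = {}
random_step = {"observation": 0.0, "action": 1.0, "reward": 0.0, "done": False}
policy_step = {**random_step, "hidden1": 0.5}  # the policy adds hidden state

collect(buffer, random_step)  # init_random_frames: policy not queried yet
collect(buffer, policy_step)
assert "hidden1" not in buffer  # the missing-key bug this commit fixes

The commit closes this gap by pre-allocating the buffer with every expected key before collection starts, as the diffs below show.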

test/test_collector.py

Lines changed: 100 additions & 3 deletions
@@ -25,11 +25,17 @@
     MultiSyncDataCollector,
     MultiaSyncDataCollector,
 )
+from torchrl.data import (
+    CompositeSpec,
+    NdUnboundedContinuousTensorSpec,
+    UnboundedContinuousTensorSpec,
+)
 from torchrl.data.tensordict.tensordict import assert_allclose_td
 from torchrl.envs import EnvCreator
 from torchrl.envs import ParallelEnv
 from torchrl.envs.libs.gym import _has_gym
 from torchrl.envs.transforms import TransformedEnv, VecNorm
+from torchrl.modules import LSTMNet, TensorDictModule
 from torchrl.modules import OrnsteinUhlenbeckProcessWrapper, Actor

 # torch.set_default_dtype(torch.double)
@@ -673,14 +679,22 @@ def test_collector_vecnorm_envcreator(static_seed):


 @pytest.mark.parametrize("use_async", [False, True])
-@pytest.mark.skipif(torch.cuda.device_count() <= 1, reason="no cuda device found")
+@pytest.mark.skipif(not torch.cuda.is_available(), reason="no cuda device found")
 def test_update_weights(use_async):
-    policy = torch.nn.Linear(3, 4).cuda(1)
+    def create_env():
+        return ContinuousActionVecMockEnv()
+
+    n_actions = ContinuousActionVecMockEnv().action_spec.shape[-1]
+    policy = TensorDictModule(
+        torch.nn.LazyLinear(n_actions), in_keys=["observation"], out_keys=["action"]
+    )
+    policy(create_env().reset())
+
     collector_class = (
         MultiSyncDataCollector if not use_async else MultiaSyncDataCollector
     )
     collector = collector_class(
-        [lambda: DiscreteActionVecMockEnv()] * 3,
+        [create_env] * 3,
         policy=policy,
         devices=[torch.device("cuda:0")] * 3,
         passing_devices=[torch.device("cuda:0")] * 3,
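A note on the rewritten test: `torch.nn.LazyLinear` leaves its parameters uninitialized until the first forward pass, which is why the test calls `policy(create_env().reset())` before handing the policy to the collector. A standalone sketch of that materialization step (the observation width 7 is an arbitrary assumption):

import torch

layer = torch.nn.LazyLinear(4)       # out_features known, in_features deferred
dummy_obs = torch.randn(1, 7)        # stand-in for env.reset()["observation"]
layer(dummy_obs)                     # first call materializes the parameters
assert layer.weight.shape == (4, 7)  # (out_features, in_features)

Without this warm-up call, the uninitialized module could not be serialized to the collector's worker processes.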
@@ -769,6 +783,89 @@ def make_env():
     dummy_env.close()


+@pytest.mark.skipif(not _has_gym, reason="test designed with GymEnv")
+@pytest.mark.parametrize(
+    "collector_class",
+    [
+        SyncDataCollector,
+        MultiaSyncDataCollector,
+        MultiSyncDataCollector,
+    ],
+)
+@pytest.mark.parametrize("init_random_frames", [0, 50])
+@pytest.mark.parametrize("explicit_spec", [True, False])
+def test_collector_output_keys(collector_class, init_random_frames, explicit_spec):
+    from torchrl.envs.libs.gym import GymEnv
+
+    out_features = 1
+    hidden_size = 12
+    total_frames = 200
+    frames_per_batch = 20
+    num_envs = 3
+
+    net = LSTMNet(
+        out_features,
+        {"input_size": hidden_size, "hidden_size": hidden_size},
+        {"out_features": hidden_size},
+    )
+
+    policy_kwargs = {
+        "module": net,
+        "in_keys": ["observation", "hidden1", "hidden2"],
+        "out_keys": ["action", "hidden1", "hidden2", "next_hidden1", "next_hidden2"],
+    }
+    if explicit_spec:
+        hidden_spec = NdUnboundedContinuousTensorSpec((1, hidden_size))
+        policy_kwargs["spec"] = CompositeSpec(
+            action=UnboundedContinuousTensorSpec(),
+            hidden1=hidden_spec,
+            hidden2=hidden_spec,
+            next_hidden1=hidden_spec,
+            next_hidden2=hidden_spec,
+        )
+
+    policy = TensorDictModule(**policy_kwargs)
+
+    env_maker = lambda: GymEnv("Pendulum-v1")
+
+    policy(env_maker().reset())
+
+    collector_kwargs = {
+        "create_env_fn": env_maker,
+        "policy": policy,
+        "total_frames": total_frames,
+        "frames_per_batch": frames_per_batch,
+        "init_random_frames": init_random_frames,
+    }
+
+    if collector_class is not SyncDataCollector:
+        collector_kwargs["create_env_fn"] = [
+            collector_kwargs["create_env_fn"] for _ in range(num_envs)
+        ]
+
+    collector = collector_class(**collector_kwargs)
+
+    keys = [
+        "action",
+        "done",
+        "hidden1",
+        "hidden2",
+        "mask",
+        "next_hidden1",
+        "next_hidden2",
+        "next_observation",
+        "observation",
+        "reward",
+        "step_count",
+        "traj_ids",
+    ]
+    b = next(iter(collector))
+
+    assert set(b.keys()) == set(keys)
+    collector.shutdown()
+    del collector
+
+
 def weight_reset(m):
     if isinstance(m, nn.Conv2d) or isinstance(m, nn.Linear):
         m.reset_parameters()
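The `explicit_spec=True` branch exercises the new fast path in the collector: when every policy out_key has a non-None spec entry, the output buffer can be built from the spec's `zero(...)` method without a trial rollout. A standalone sketch of that spec construction, reusing the class names imported by the test (shapes follow the test; the exact `zero()` signature is assumed to match the usage shown in this commit's diff):

from torchrl.data import (
    CompositeSpec,
    NdUnboundedContinuousTensorSpec,
    UnboundedContinuousTensorSpec,
)

hidden_size = 12
hidden_spec = NdUnboundedContinuousTensorSpec((1, hidden_size))
spec = CompositeSpec(
    action=UnboundedContinuousTensorSpec(),
    hidden1=hidden_spec,
    hidden2=hidden_spec,
    next_hidden1=hidden_spec,
    next_hidden2=hidden_spec,
)
# zero() yields a TensorDict of zeros, one entry per key; the collector
# expands it along the time dimension to pre-allocate its output buffer.
zeros = spec.zero()

With `explicit_spec=False`, the policy carries no usable spec and the collector falls back to a short probing rollout, covered by the collectors.py change below.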

torchrl/collectors/collectors.py

Lines changed: 56 additions & 4 deletions
@@ -322,10 +322,62 @@ def __init__(
         self._tensordict.set(
             "step_count", torch.zeros(*self.env.batch_size, 1, dtype=torch.int)
         )
-        self._tensordict_out = TensorDict(
-            {},
-            batch_size=[*self.env.batch_size, self.frames_per_batch],
-            device=self.passing_device,
+
+        if (
+            hasattr(policy, "spec")
+            and policy.spec is not None
+            and all(v is not None for v in policy.spec.values())
+            and set(policy.spec.keys()) == set(policy.out_keys)
+        ):
+            # if policy spec is non-empty, all the values are not None and the keys
+            # match the out_keys we assume the user has given all relevant information
+            self._tensordict_out = TensorDict(
+                {
+                    **env.observation_spec.zero(env.batch_size),
+                    "reward": env.reward_spec.zero(env.batch_size),
+                    "done": torch.zeros(
+                        env.batch_size, dtype=torch.bool, device=env.device
+                    ),
+                    **policy.spec.zero(env.batch_size),
+                },
+                env.batch_size,
+                device=env.device,
+            )
+            self._tensordict_out = (
+                self._tensordict_out.unsqueeze(-1)
+                .expand(*env.batch_size, self.frames_per_batch)
+                .to_tensordict()
+            )
+            self._tensordict_out = self._tensordict_out.update(
+                step_mdp(self._tensordict_out)
+            )  # add "observation" when there is "next_observation"
+        else:
+            # otherwise, we perform a small number of steps with the policy to
+            # determine the relevant keys with which to pre-populate _tensordict_out.
+            # See #505 for additional context.
+            self._tensordict_out = env.rollout(3, policy)
+            if env.batch_size:
+                self._tensordict_out = self._tensordict_out[..., :1]
+            else:
+                self._tensordict_out = self._tensordict_out[:1]
+            self._tensordict_out = (
+                self._tensordict_out.expand(*env.batch_size, self.frames_per_batch)
+                .to_tensordict()
+                .zero_()
+                .detach()
+            )
+            env.reset()
+
+        # in addition to outputs of the policy, we add traj_ids and step_count to
+        # _tensordict_out which will be collected during rollout
+        if len(self.env.batch_size):
+            traj_ids = torch.zeros(*self._tensordict_out.batch_size, 1)
+        else:
+            traj_ids = torch.zeros(*self._tensordict_out.batch_size, 1, 1)
+
+        self._tensordict_out.set("traj_ids", traj_ids)
+        self._tensordict_out.set(
+            "step_count", torch.zeros(*self._tensordict_out.batch_size, 1)
         )

         self.return_in_place = return_in_place
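In summary, `__init__` now pre-populates `_tensordict_out` by one of two routes: build it directly from the specs when the policy declares a complete spec, or run a short throwaway rollout to discover the keys, then zero and expand the result. A condensed sketch of the decision, with plain dicts standing in for TensorDicts and hypothetical helper names:

import torch

def preallocate(policy_spec, discover_step, frames_per_batch):
    """Sketch of the two buffer-initialization routes in this commit.

    policy_spec: {key: per-step shape} or None (hypothetical stand-in).
    discover_step: callable returning {key: tensor} for one env/policy step.
    """
    if policy_spec is not None and all(v is not None for v in policy_spec.values()):
        # Route 1: specs are complete; allocate zeros per key directly.
        return {k: torch.zeros(frames_per_batch, *shape)
                for k, shape in policy_spec.items()}
    # Route 2: probe with a short rollout, then zero and expand what came back.
    step = discover_step()
    return {k: torch.zeros(frames_per_batch, *v.shape).detach()
            for k, v in step.items()}

Either way, `traj_ids` and `step_count` are added to the buffer afterwards, since they are produced by the collector itself rather than by the policy or the environment.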
