AOS55/issue9 #12

Merged: 64 commits, Nov 23, 2022

Commits

ee7a9c9
Procedure to collect data from teacher
AOS55 Sep 30, 2022
3d9080e
sample over multiple seeds
AOS55 Sep 30, 2022
7686169
push sampling to datasets directory when done and option selected in …
AOS55 Sep 30, 2022
87c4345
changed obs from state input
AOS55 Sep 30, 2022
77a94f3
corrected for state observation
AOS55 Sep 30, 2022
cae00b7
load snapshot correctly
AOS55 Oct 3, 2022
39194a5
check observation is initiated correctly
AOS55 Oct 3, 2022
fcf11ac
add procedure to sample from constraints and goal_states
AOS55 Oct 3, 2022
d759e28
added skill_dim parameter
AOS55 Oct 3, 2022
c8b9db0
added pixel train param
AOS55 Oct 3, 2022
243fb4a
added configs for sampling
AOS55 Oct 3, 2022
8658285
removed extra line break
AOS55 Oct 3, 2022
fd77b64
divide by 255 to scale obs correctly
AOS55 Oct 3, 2022
3aaaadd
instantiate safe_set loading correctly
AOS55 Oct 3, 2022
3dc1a46
change to procedure used to upload_data
AOS55 Oct 3, 2022
d96f309
Merge branch 'AOS55/issue9' of github.com:AOS55/url-suite into AOS55/…
AOS55 Oct 3, 2022
c90dade
Safe Learning
AOS55 Oct 3, 2022
54d292a
added random_start option to safe environments
AOS55 Oct 7, 2022
eeb7aa6
corrected transform dict length for replay buffer
AOS55 Oct 7, 2022
1f0774d
added random_start
AOS55 Oct 7, 2022
121365f
added separate model dir if random for now
AOS55 Oct 7, 2022
527ba94
added separate dir for restart for now
AOS55 Oct 7, 2022
b7e4ab9
changed to 150 data_counts
AOS55 Oct 25, 2022
5d0e845
refactored to reflect new prioritized sampling
AOS55 Oct 25, 2022
c7f000c
printed pretrained_agent name
AOS55 Oct 25, 2022
198d9af
added optimistic forgetting rule
AOS55 Oct 25, 2022
5bf7d0b
sampling_batch updated to use smm and prioritized_sampling
AOS55 Oct 25, 2022
d9d68f1
increased number of samples to 150
AOS55 Oct 25, 2022
88886c9
increased num_updates to 500
AOS55 Oct 25, 2022
522f899
set random_start to false for pretraining
AOS55 Oct 25, 2022
9caf0f8
corrected based on new prioritized_sampling approach
AOS55 Oct 25, 2022
7aa5182
changed the number of skill dimensions to reflect z
AOS55 Oct 25, 2022
d3fca9e
added method to view goal indicator
AOS55 Oct 25, 2022
7950f4f
changed to view loss plotter
AOS55 Oct 25, 2022
7cc92c4
assert and store transitions
AOS55 Oct 25, 2022
324d99d
convert prior to correct shape in replay_buffer storage
AOS55 Oct 25, 2022
b2d1518
fixed diagram orientation
AOS55 Oct 25, 2022
a19aab8
added svb to environment types
AOS55 Oct 25, 2022
a29bf56
using simple_velocity_bot
AOS55 Oct 26, 2022
c2bfb72
correct for state representation
AOS55 Oct 26, 2022
0341ca5
add goal_behaviour
AOS55 Oct 28, 2022
573a262
added protocol for saving if ep length not 100
AOS55 Oct 31, 2022
ef8983f
working on ant environment
AOS55 Oct 31, 2022
c07c727
sampling prioritized replay
AOS55 Oct 31, 2022
eba2b4e
skill_dim
AOS55 Oct 31, 2022
199f422
remove print
AOS55 Oct 31, 2022
66b3eeb
change to sh script
AOS55 Oct 31, 2022
913b863
added log to gitignore
AOS55 Oct 31, 2022
31edba6
pass custom skill dim through smm.yaml
AOS55 Oct 31, 2022
81a7021
added .out to gitignore
AOS55 Oct 31, 2022
f2b9d46
Merge pull request #11 from AOS55/issue9-detach
AOS55 Oct 31, 2022
1160b3a
removed line that shouldn't be there
AOS55 Nov 1, 2022
f28e793
added reward function
AOS55 Nov 1, 2022
4ec240a
pulled out smm coefficients
AOS55 Nov 1, 2022
fea7dae
change p_reward when available
AOS55 Nov 6, 2022
78312e1
logging losses
AOS55 Nov 6, 2022
8e2849d
adding longer training
AOS55 Nov 6, 2022
9d2d788
added plot param and more training
AOS55 Nov 8, 2022
44b1ed9
fixed range of log_p_star
AOS55 Nov 8, 2022
74e7c6f
added smm params for pass through and log params
AOS55 Nov 8, 2022
36d8577
add pretrain ent_coef sweep
AOS55 Nov 8, 2022
44d77aa
reduced reward required, prioritized_sampling
AOS55 Nov 23, 2022
5df60b0
sampling batch over snapshot and skill dim
AOS55 Nov 23, 2022
50e6ab0
added procedure to rename files
AOS55 Nov 23, 2022
4 changes: 3 additions & 1 deletion .gitignore
@@ -10,4 +10,6 @@ data/
datasets/
libraries/gym
*outputs
models/
models/
*.log
*.out
59 changes: 49 additions & 10 deletions agents/unsupervised_learning/smm.py
@@ -147,17 +147,19 @@ def __init__(self, z_dim, sp_lr, vae_lr, vae_beta, state_ent_coef,
self.update_encoder = update_encoder

kwargs["meta_dim"] = self.z_dim
#TODO: Fix this!
self.obs_type = kwargs["obs_type"]
super().__init__(**kwargs)
# self.obs_dim is now the real obs_dim (or repr_dim) + z_dim
self.smm = SMM(self.obs_dim - z_dim,
z_dim,
hidden_dim=kwargs['hidden_dim'],
vae_beta=vae_beta,
device=kwargs['device']).to(kwargs['device'])
self.pred_optimizer = torch.optim.Adam(
self.smm.z_pred_net.parameters(), lr=sp_lr)
self.vae_optimizer = torch.optim.Adam(self.smm.vae.parameters(),
lr=vae_lr)

self.goal = (150, 75) # TODO: Fix as part of config
self.pred_optimizer = torch.optim.Adam(self.smm.z_pred_net.parameters(), lr=sp_lr)
self.vae_optimizer = torch.optim.Adam(self.smm.vae.parameters(), lr=vae_lr)

self.smm.train()

@@ -236,6 +238,21 @@ def update_pred(self, obs, z):

return metrics, h_z_s

def get_goal_p_star(self, agent_pos):
x_dist = agent_pos[:, 0] - self.goal[0]
y_dist = agent_pos[:, 1] - self.goal[1]
x_dist = x_dist.cpu().detach().numpy()
y_dist = y_dist.cpu().detach().numpy()
dist = np.linalg.norm((x_dist, y_dist), axis=0)
def _prior_distro(dist):
if dist > 1.0:
p_star = 1/dist
else:
p_star = 1.0
return p_star
p_star = np.array(list(map(_prior_distro, dist)), dtype=np.float32)
return p_star

def update(self, replay_iter, step):
metrics = dict()
if step % self.update_every_steps != 0:
@@ -244,7 +261,6 @@ def update(self, replay_iter, step):

obs, action, extr_reward, discount, next_obs, z = utils.to_torch(
batch, self.device)

obs = self.aug_and_encode(obs)
with torch.no_grad():
next_obs = self.aug_and_encode(next_obs)
@@ -258,14 +274,37 @@
h_z = np.log(self.z_dim) # One-hot z encoding
h_z *= torch.ones_like(extr_reward).to(self.device)

pred_log_ratios = self.state_ent_coef * h_s_z.detach(
) # p^*(s) is ignored, as state space dimension is inaccessible from pixel input
intr_reward = pred_log_ratios + self.latent_ent_coef * h_z + self.latent_cond_ent_coef * h_z_s.detach(
)
reward = intr_reward
pred_log_ratios = self.state_ent_coef * h_s_z.detach()

if self.obs_type=='pixels':
# p^*(s) is ignored, as state space dimension is inaccessible from pixel input
intr_reward = pred_log_ratios + self.latent_ent_coef * h_z + self.latent_cond_ent_coef * h_z_s.detach()
reward = intr_reward
else:
# p^*(s) is based on the goal hitting time
# TODO: Assumes obs is just (x, y) at front
p_star = self.get_goal_p_star(obs)
log_p_star = np.log(p_star)
log_p_star = torch.tensor(log_p_star).to(self.device)
# TODO: Check signs in this intrinsic reward function, maybe ask author
# intr_reward = log_p_star + pred_log_ratios + self.latent_ent_coef * h_z + self.latent_cond_ent_coef * h_z_s.detach()
intr_reward = log_p_star + pred_log_ratios + self.latent_ent_coef * h_z + self.latent_cond_ent_coef * h_z_s.detach()
# print(f'intr_reward: {intr_reward[0]} = p*: {100 * log_p_star[0]} + rho_pi: {pred_log_ratios[0]} +h(z): {self.latent_ent_coef * h_z[0]} + h(z|s): {self.latent_cond_ent_coef * h_z_s.detach()[0]}')
reward = intr_reward
else:
reward = extr_reward

if self.obs_type=='states' and self.reward_free:
# add reward free to states motivation
metrics['intr_reward'] = intr_reward.mean().item()
metrics['log_p_star'] = log_p_star.mean().item()
metrics['pred_log_ratios'] = pred_log_ratios.mean().item()
metrics['latent_ent_coef'] = (self.latent_ent_coef * h_z).mean().item()
metrics['latent_cond_ent_coef'] = (self.latent_cond_ent_coef * h_z_s.detach()).mean().item()
# add loss values
metrics['loss_vae'] = vae_metrics['loss_vae']
metrics['loss_pred'] = pred_metrics['loss_pred']

if self.use_tb or self.use_wandb:
metrics.update(vae_metrics)
metrics.update(pred_metrics)
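
The hunks above change SMMAgent so that, for state-based observations, the intrinsic reward includes a goal prior log p*(s) computed by get_goal_p_star. As a reading aid, here is a minimal self-contained sketch of that "states" branch; the goal coordinates and coefficient values are illustrative placeholders rather than the repository's configured settings.

```python
# Sketch only (not the repository's code): the state-space intrinsic reward
# assembled in SMMAgent.update() above, with placeholder goal and coefficients.
import numpy as np
import torch

GOAL = (150.0, 75.0)  # placeholder goal position, cf. self.goal in __init__

def goal_p_star(agent_pos: torch.Tensor) -> np.ndarray:
    """p*(s) = 1/dist to the goal, clipped to 1 within unit distance (as in get_goal_p_star)."""
    diff = agent_pos[:, :2].detach().cpu().numpy() - np.asarray(GOAL)
    dist = np.linalg.norm(diff, axis=1)
    return np.minimum(1.0, 1.0 / np.maximum(dist, 1.0)).astype(np.float32)

def intrinsic_reward(obs, h_s_z, h_z, h_z_s,
                     state_coef=1.0, latent_coef=1.0, latent_cond_coef=1.0):
    """r_intr = log p*(s) + a_s * h(s|z) + a_z * h(z) + a_{z|s} * h(z|s)."""
    log_p_star = torch.from_numpy(np.log(goal_p_star(obs)))
    return log_p_star + state_coef * h_s_z + latent_coef * h_z + latent_cond_coef * h_z_s
```

With all three coefficients at 1.0 (the defaults added to pretrain.yaml later in this diff), this matches the sum assembled in the else-branch above, the same terms the commented-out print statement was logging.
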
137 changes: 137 additions & 0 deletions collect_controlled_data.py
@@ -0,0 +1,137 @@
from libraries.latentsafesets.utils.arg_parser import parse_args
from libraries.latentsafesets.utils import utils
from libraries.latentsafesets.utils import plot_utils as pu

from pathlib import Path

import torch
import pprint
import hydra
import logging
import os
import numpy as np

from libraries.safe import SimplePointBot as SPB
from libraries.safe import SimpleVelocityBot as SVB
from libraries.safe import bottleneck_nav as BottleNeck
from libraries.latentsafesets.utils.teacher import ConstraintTeacher, SimplePointBotTeacher, SimpleVelocityBotTeacher, SimpleVelocityBotConstraintTeacher, BottleNeckTeacher, BottleNeckConstraintTeacher
log = logging.getLogger("collect")
from utils.env_constructor import make

ENV = {
'SimplePointBot' : SPB,
'SimpleVelocityBot' : SVB,
'BottleNeck' : BottleNeck
}

ENV_TEACHERS = {
'SimplePointBot' : [
SimplePointBotTeacher, ConstraintTeacher
],
'SimpleVelocityBot' : [
SimpleVelocityBotTeacher, SimpleVelocityBotConstraintTeacher
],
'BottleNeck' : [
BottleNeckTeacher, BottleNeckConstraintTeacher
]
}

DATA_DIRS = {
'SimplePointBot' : [
'SimplePointBot', 'SimplePointBot'
],
'SimpleVelocityBot' : [
'SimpleVelocityBot', 'SimpleVelocityBotConstraint'
],
'BottleNeck' : [
'BottleNeck', 'BottleNeckConstraints'
]
}

DATA_COUNTS = {
'SimplePointBot' : [
150, 150
],
'SimpleVelocityBot' : [
100, 100
],
'BottleNeck' : [
100, 100
]
}


class Workspace:

def __init__(self, cfg):
self.work_dir = Path.cwd()
self.logdir = cfg.log_dir
print(f'workspace: {self.work_dir}')
self.cfg = cfg
self.device = torch.device(cfg.device)
self.env = ENV[self.cfg.env]
if self.cfg.obs_type == 'pixels':
self.sample_env = self.env(from_pixels=True)
else:
self.sample_env = self.env(from_pixels=False)

def sample_demo_data(self):
teachers = ENV_TEACHERS[self.cfg.env]
data_dirs = DATA_DIRS[self.cfg.env]
data_counts = DATA_COUNTS[self.cfg.env]

idc = 0
for teacher, data_dir, count in list(zip(teachers, data_dirs, data_counts)):
self.generate_teacher_demo_data(data_dir, teacher, count, count_start=idc)
idc += count

def generate_teacher_demo_data(self, data_dir, teacher, count, count_start=0, noisy=False):
demo_dir = os.path.join(self.work_dir, data_dir)
if not os.path.exists(demo_dir):
os.makedirs(demo_dir)
# else:
# raise RuntimeError(f'Directory {demo_dir} already exists!')
teacher = teacher(self.sample_env, noisy=noisy)
demonstrations = []
for idc in range(count):
idc += count_start
traj = teacher.generate_trajectory()
reward = sum([frame['reward'] for frame in traj])
print(f'Trajectory {idc}, Reward {reward}')
demonstrations.append(traj)
self.save_trajectory(traj, demo_dir, idc)
# if idc < 50 and self.logdir is not None:
# pu.make_movie(traj, os.path.join(self.logdir, f'{data_dir}_{idc}.gif'))
return demonstrations

@staticmethod
def save_trajectory(traj, demo_dir, idc):
observation = []
action = []
reward = []
safe_set = []
constraint = []
on_policy = []
rtg = []
done = []
for trajectory in traj:
observation.append(trajectory['obs'])
action.append(trajectory['action'])
reward.append(trajectory['reward'])
safe_set.append(trajectory['safe_set'])
on_policy.append(trajectory['on_policy'])
constraint.append(trajectory['constraint'])
rtg.append(trajectory['rtg'])
done.append(trajectory['done'])
file_name = os.path.join(demo_dir, f'episode_{idc}_100')
np.savez_compressed(file_name, observation=observation, action=action, constraint=constraint, reward=reward,
safe_set=safe_set, on_policy=on_policy, rtg=rtg, done=done)

@hydra.main(config_path='configs/.', config_name='mpc')
def main(cfg):
from collect_controlled_data import Workspace as W
workspace = W(cfg)
workspace.sample_demo_data()

if __name__=='__main__':
main()
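
Each trajectory is written out by save_trajectory with np.savez_compressed, so the collected demonstrations can be inspected directly with np.load. A small sketch, assuming a SimplePointBot run; the path and episode index are hypothetical:

```python
# Sketch: reading back one episode written by Workspace.save_trajectory above.
# Directory and episode index are hypothetical; savez_compressed appends ".npz".
import numpy as np

data = np.load("SimplePointBot/episode_0_100.npz")
print(sorted(data.keys()))   # action, constraint, done, observation, on_policy, reward, rtg, safe_set
obs = data["observation"]    # one entry per frame of the trajectory
rtg = data["rtg"]            # reward-to-go stored alongside each frame
print(obs.shape, rtg.shape)
```
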
8 changes: 4 additions & 4 deletions configs/agent/smm.yaml
@@ -3,7 +3,7 @@ _target_: agents.unsupervised_learning.SMMAgent
name: smm

# z params
z_dim: 4 # default in codebase is 4
z_dim: ${skill_dim} # default in codebase is 4

# z discriminator params
sp_lr: 1e-3
@@ -13,9 +13,9 @@ vae_lr: 1e-2
vae_beta: 0.5

# reward params
state_ent_coef: 1.0
latent_ent_coef: 1.0
latent_cond_ent_coef: 1.0
state_ent_coef: ${state_ent_coef}
latent_ent_coef: ${latent_ent_coef}
latent_cond_ent_coef: ${latent_cond_ent_coef}

# DDPG params
reward_free: ${reward_free}
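
The ${skill_dim} and ${..._ent_coef} entries are Hydra/OmegaConf interpolations: they resolve against keys defined at the top level of the composed config (skill_dim and the smm reward params added to pretrain.yaml later in this diff), which makes them overridable from the command line without touching the agent config. A toy illustration of how such a reference resolves; the keys and values here are made up:

```python
# Toy OmegaConf interpolation example; not the project's actual config tree.
from omegaconf import OmegaConf

cfg = OmegaConf.create({
    "skill_dim": 10,                     # top-level value, as set in pretrain.yaml
    "agent": {"z_dim": "${skill_dim}"},  # agent entry refers back to it
})
print(cfg.agent.z_dim)  # -> 10, resolved when the value is accessed
```
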
30 changes: 30 additions & 0 deletions configs/controlled_data.yaml
@@ -0,0 +1,30 @@
defaults:
- agent: ddpg
- override hydra/launcher: submitit_local


# env settings
env: SimplePointBot
obs_type: states
num_samples: 150
frame_stack: 1
action_repeat: 1
seed: 1

# experiment
experiment: exp

hydra:
run:
dir: ./exp_local/${now:%Y.%m.%d}/${now:%H%M%S}_${teacher}
sweep:
dir: ./exp_sweep/${now:%Y.%m.%d}/${now:%H%M}_${teacher}_${experiment}
subdir: ${hydra.job.num}
launcher:
timeout_min: 4300
cpus_per_task: 10
gpus_per_node: 1
tasks_per_node: 1
mem_gb: 160
nodes: 1
submitit_folder: ./exp_sweep/${now:%Y.%m.%d}/${now:%H%M}_${teacher}_${experiment}/.slurm
1 change: 1 addition & 0 deletions configs/finetune.yaml
@@ -9,6 +9,7 @@ task: walker_stand
obs_type: states # [states, pixels]
frame_stack: 1 # only works if obs_type=pixels
action_repeat: 1 # set to 2 for pixels
skill_dim: 10
discount: 0.99
# train settings
num_train_frames: 2000010
16 changes: 8 additions & 8 deletions configs/mpc.yaml
@@ -4,22 +4,22 @@ defaults:


# Task Settings
task: SimplePointBot_goal
env: SimplePointBot
obs_type: pixels # [states, pixels]
task: SimpleVelocityBot_goal
env: SimpleVelocityBot
obs_type: states # [states, pixels]
frame_stack: 1
action_repeat: 1
discount: 1.0
num_updates: 25
num_updates: 500
log_freq: 100
plot_freq: 500
log_dir: ./logs

# Module Settings
# Encoder
enc_checkpoint: ../../../models/spb/vae.pth
d_latent: 32
d_obs: [3, 64, 64]
d_latent: 2
d_obs: [2]
enc_init_iters: 100000
enc_kl_multiplier: 1e-6
enc_data_aug: false
@@ -97,8 +97,8 @@ constr_hidden_size: 200
constr_lr: 1e-4

# Replay Buffer
data_dirs: datasets/pixels/SimplePointBot/diayn/buffer
data_counts: 1000
data_dirs: datasets/states/SimpleVelocityBot/controller/prioritized_sampling_1200
data_counts: 600
buffer_size: 35000

# Misc Settings
16 changes: 12 additions & 4 deletions configs/pretrain.yaml
@@ -9,16 +9,24 @@ domain: walker # primal task will be inferred in runtime
obs_type: states # [states, pixels]
frame_stack: 1 # only works if obs_type=pixels
action_repeat: 1 # set to 2 for pixels
skill_dim: 10

# smm reward params
state_ent_coef: 1.0
latent_ent_coef: 1.0
latent_cond_ent_coef: 1.0

skill_dim: 51
discount: 0.99
random_start: false
plot: false
# train settings
num_train_frames: 2000010
num_train_frames: 16000100
num_seed_frames: 4000
# eval
eval_every_frames: 10000
eval_every_frames: 100000
num_eval_episodes: 10
# snapshot
snapshots: [10000, 50000, 100000, 500000, 1000000, 1500000, 2000000]
snapshots: [10000, 50000, 100000, 500000, 1000000, 1500000, 2000000, 3000000, 4000000, 5000000, 6000000, 7000000, 8000000, 9000000, 10000000, 11000000, 12000000, 13000000, 14000000, 15000000, 16000000]
snapshot_dir: ../../../data/models/${obs_type}/${domain}/${agent.name}/${skill_dim}/${seed}
# replay buffer
replay_buffer_size: 1000000