Hw1/learn #1


Open: wants to merge 36 commits into base: main
Changes from all commits
2 changes: 2 additions & 0 deletions .gitignore
@@ -2,3 +2,5 @@
data/
*.DS_Store
*~
*.svg
*.png
27 changes: 26 additions & 1 deletion hw1/cs285/infrastructure/pytorch_util.py
@@ -47,8 +47,33 @@ def build_mlp(

    # TODO: return a MLP. This should be an instance of nn.Module
    # Note: nn.Sequential is an instance of nn.Module.
    raise NotImplementedError

    # build the model layer by layer with nn.Sequential
    model = nn.Sequential()

    # 1. input layer, followed by an activation
    # (without this activation, the input layer and the first hidden layer would
    # stack two Linear modules back to back and collapse into a single linear map)
    model.add_module('input', nn.Linear(input_size, size))
    model.add_module('input_activation', activation)

    # 2. hidden layers, each followed by an activation
    for n in range(n_layers):
        model.add_module('hidden' + str(n), nn.Linear(size, size))
        model.add_module('activation' + str(n), activation)

    # 3. output layer with its output activation
    model.add_module('output', nn.Linear(size, output_size))
    model.add_module('output_activation', output_activation)

    # 4. sanity check: nn.Sequential is an nn.Module
    assert isinstance(model, nn.Module)

    return model

    # Alternative constructions (see the sketch below):
    #   nn.Sequential(collections.OrderedDict([('name', nn.Linear(...)), ...]))
    #   nn.Sequential(*[nn.Linear(...), activation, ...])
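To make the note above concrete, here is a minimal illustrative sketch (not part of this PR) of the OrderedDict and plain-list constructions, assuming the same arguments build_mlp receives; the helper name build_mlp_ordered is invented for illustration:

import collections
from torch import nn

def build_mlp_ordered(input_size, size, output_size, n_layers, activation, output_activation):
    # named layers collected into an OrderedDict
    layers = [('input', nn.Linear(input_size, size)), ('input_activation', activation)]
    for n in range(n_layers):
        layers.append(('hidden' + str(n), nn.Linear(size, size)))
        layers.append(('activation' + str(n), activation))
    layers.append(('output', nn.Linear(size, output_size)))
    layers.append(('output_activation', output_activation))
    return nn.Sequential(collections.OrderedDict(layers))

# equivalently, unnamed modules can be unpacked from a plain list:
#   modules = [nn.Linear(input_size, size), activation, ..., nn.Linear(size, output_size), output_activation]
#   model = nn.Sequential(*modules)

Reusing one stateless activation instance (e.g. nn.Tanh()) across layers is fine; modules that own parameters must be distinct instances.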

device = None

15 changes: 13 additions & 2 deletions hw1/cs285/infrastructure/replay_buffer.py
@@ -77,8 +77,19 @@ def sample_random_data(self, batch_size):
        ## HINT 2: return corresponding data points from each array (i.e., not different indices from each array)
        ## HINT 3: look at the sample_recent_data function below

        return TODO, TODO, TODO, TODO, TODO

        # shuffle every array with the same permutation so that corresponding
        # entries stay aligned, then reuse sample_recent_data to slice off a batch
        random_indices = np.random.permutation(self.obs.shape[0])
        self.obs = self.obs[random_indices]
        self.acs = self.acs[random_indices]
        self.rews = self.rews[random_indices]
        self.next_obs = self.next_obs[random_indices]
        self.terminals = self.terminals[random_indices]
        return self.sample_recent_data(batch_size)

        # Alternative that leaves the buffer order untouched (see the sketch below):
        # idx = np.random.permutation(self.obs.shape[0])[:batch_size]
        # return self.obs[idx], self.acs[idx], self.rews[idx], self.next_obs[idx], self.terminals[idx]
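As a small, self-contained illustration of HINT 2 (not part of this PR): whichever variant is used, the same index array must be applied to every buffer so that (observation, action, reward, next observation, terminal) tuples stay aligned. The toy arrays below are hypothetical stand-ins for self.obs, self.acs, etc.

import numpy as np

obs = np.arange(5)          # stand-in for self.obs
acs = 10 * np.arange(5)     # stand-in for self.acs

# draw one set of random indices and reuse it for every array
idx = np.random.permutation(obs.shape[0])[:3]
print(obs[idx])   # e.g. [3 0 4]
print(acs[idx])   # then [30 0 40] -- the rows still describe the same transitions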



    def sample_recent_data(self, batch_size=1):
        return (
            self.obs[-batch_size:],
39 changes: 35 additions & 4 deletions hw1/cs285/infrastructure/rl_trainer.py
@@ -8,6 +8,7 @@
from cs285.infrastructure import pytorch_util as ptu
from cs285.infrastructure.logger import Logger
from cs285.infrastructure import utils
import pickle

# how many rollouts to save as videos to tensorboard
MAX_NVIDEO = 2
@@ -87,6 +88,8 @@ def run_training_loop(self, n_iter, collect_policy, eval_policy,
        self.total_envsteps = 0
        self.start_time = time.time()

        # track eval-return statistics across iterations (used for the report plots)
        mean_list = []
        std_list = []
        for itr in range(n_iter):
            print("\n\n********** Iteration %i ************" % itr)

@@ -127,13 +130,21 @@ def run_training_loop(self, n_iter, collect_policy, eval_policy,

            # perform logging
            print('\nBeginning logging procedure...')
            self.perform_logging(
            mean, std = self.perform_logging(
                itr, paths, eval_policy, train_video_paths, training_logs)

            mean_list.append(mean)
            std_list.append(std)

            if self.params['save_params']:
                print('\nSaving agent params')
                self.agent.save('{}/policy_itr_{}.pt'.format(self.params['logdir'], itr))

        # after the final iteration, print the eval-return history (used for the report)
        print('mean : ', np.round(mean_list))
        print('std : ', np.round(std_list))

        # self.perform_expert(expert_policy)

####################################
####################################

@@ -162,11 +173,16 @@ def collect_training_trajectories(

        # (2) collect `self.params['batch_size']` transitions

        # on the very first iteration, return the provided expert demonstrations
        # instead of rolling out the (still untrained) policy
        if itr == 0:
            with open(load_initial_expertdata, 'rb') as f:
                loaded_paths = pickle.load(f)
            return loaded_paths, 0, None

        # TODO collect `batch_size` samples to be used for training
        # HINT1: use sample_trajectories from utils
        # HINT2: you want each of these collected rollouts to be of length self.params['ep_len']
        print("\nCollecting data to be used for training...")
        paths, envsteps_this_batch = TODO
        paths, envsteps_this_batch = utils.sample_trajectories(
            self.env, collect_policy, batch_size, self.params['ep_len'])

        # collect more rollouts with the same policy, to be saved as videos in tensorboard
        # note: here, we collect MAX_NVIDEO rollouts, each of length MAX_VIDEO_LEN
@@ -187,12 +203,12 @@ def train_agent(self):
            # TODO sample some data from the data buffer
            # HINT1: use the agent's sample function
            # HINT2: how much data = self.params['train_batch_size']
            ob_batch, ac_batch, re_batch, next_ob_batch, terminal_batch = TODO
            ob_batch, ac_batch, re_batch, next_ob_batch, terminal_batch = self.agent.sample(self.params['train_batch_size'])

            # TODO use the sampled data to train an agent
            # HINT: use the agent's train function
            # HINT: keep the agent's training log for debugging
            train_log = TODO
            train_log = self.agent.train(ob_batch, ac_batch, re_batch, next_ob_batch, terminal_batch)
            all_logs.append(train_log)
        return all_logs

@@ -202,6 +218,8 @@ def do_relabel_with_expert(self, expert_policy, paths):
        # TODO relabel collected observations (from our policy) with labels from an expert policy
        # HINT: query the policy (using the get_action function) with paths[i]["observation"]
        # and replace paths[i]["action"] with these expert labels
        for path in paths:
            path["action"] = expert_policy.get_action(path["observation"])

        return paths

@@ -267,3 +285,16 @@ def perform_logging(self, itr, paths, eval_policy, train_video_paths, training_logs):
        print('Done logging...\n\n')

        self.logger.flush()

        return np.mean(eval_returns), np.std(eval_returns)

    def perform_expert(self, expert_policy):
        print("\nCollecting data for expert policy evaluation...")
        eval_paths, eval_envsteps_this_batch = utils.sample_trajectories(
            self.env, expert_policy, self.params['eval_batch_size'], self.params['ep_len'])
        eval_returns = [eval_path["reward"].sum() for eval_path in eval_paths]
        mean_exp = np.mean(eval_returns)
        std_exp = np.std(eval_returns)
        max_exp = np.max(eval_returns)
        min_exp = np.min(eval_returns)

        print('expert mean: {}, std: {}, max: {}, min: {}'.format(
            np.round(mean_exp), np.round(std_exp), np.round(max_exp), np.round(min_exp)))
16 changes: 10 additions & 6 deletions hw1/cs285/infrastructure/utils.py
@@ -7,7 +7,7 @@
def sample_trajectory(env, policy, max_path_length, render=False, render_mode=('rgb_array')):

    # initialize env for the beginning of a new rollout
    ob = TODO # HINT: should be the output of resetting the env
    ob = env.reset()  # the output of resetting the env

    # init vars
    obs, acs, rewards, next_obs, terminals, image_obs = [], [], [], [], [], []
@@ -27,7 +27,7 @@ def sample_trajectory(env, policy, max_path_length, render=False, render_mode=('rgb_array')):

        # use the most recent ob to decide what to do
        obs.append(ob)
        ac = TODO # HINT: query the policy's get_action function
        ac = policy.get_action(ob)  # query the policy's get_action function
        ac = ac[0]
        acs.append(ac)

@@ -41,7 +41,7 @@ def sample_trajectory(env, policy, max_path_length, render=False, render_mode=('rgb_array')):

        # TODO end the rollout if the rollout ended
        # HINT: rollout can end due to done, or due to max_path_length
        rollout_done = TODO # HINT: this is either 0 or 1
        rollout_done = int(done or steps >= max_path_length)  # either 0 or 1
        terminals.append(rollout_done)

        if rollout_done:
@@ -60,8 +60,9 @@ def sample_trajectories(env, policy, min_timesteps_per_batch, max_path_length, render=False, render_mode=('rgb_array')):
    timesteps_this_batch = 0
    paths = []
    while timesteps_this_batch < min_timesteps_per_batch:

        TODO
        path = sample_trajectory(env, policy, max_path_length, render, render_mode)
        paths.append(path)
        timesteps_this_batch += get_pathlength(path)

    return paths, timesteps_this_batch

@@ -74,7 +75,10 @@ def sample_n_trajectories(env, policy, ntraj, max_path_length, render=False, render_mode=('rgb_array')):
"""
paths = []

TODO
# TODO
for n in range(ntraj):
path = sample_trajectory(env, policy, max_path_length, render, render_mode)
paths.append(path)

return paths

36 changes: 33 additions & 3 deletions hw1/cs285/policies/MLP_policy.py
@@ -81,7 +81,9 @@ def get_action(self, obs: np.ndarray) -> np.ndarray:
            observation = obs[None]

        # TODO return the action that the policy prescribes
        raise NotImplementedError
        action = self.forward(ptu.from_numpy(observation))
        return ptu.to_numpy(action)

    # update/train this policy
    def update(self, observations, actions, **kwargs):
@@ -93,7 +95,21 @@ def update(self, observations, actions, **kwargs):
    # return more flexible objects, such as a
    # `torch.distributions.Distribution` object. It's up to you!
    def forward(self, observation: torch.FloatTensor) -> Any:
        raise NotImplementedError
        if self.discrete:
            action = self.logits_na(observation)
        else:
            action = self.mean_net(observation)
        return action

    '''
    Alternative: return a distribution instead of a raw action, e.g.

        if self.discrete:
            return distributions.Categorical(logits=self.logits_na(observation))
        else:
            return distributions.Normal(self.mean_net(observation), torch.exp(self.logstd)[None])

    and train with the negative log-likelihood:

        loss = -action_distribution.log_prob(actions).mean()
    '''
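To make the commented alternative above concrete, here is a standalone sketch (illustrative only, not part of this PR) of a forward pass that returns a torch.distributions object, assuming the MLPPolicy attributes discrete, logits_na, mean_net and logstd referenced elsewhere in this file:

import torch
from torch import distributions

def forward_as_distribution(policy, observation):
    # discrete actions: a categorical distribution over the logits
    if policy.discrete:
        return distributions.Categorical(logits=policy.logits_na(observation))
    # continuous actions: a Gaussian with state-dependent mean and a learned,
    # state-independent standard deviation
    mean = policy.mean_net(observation)
    std = torch.exp(policy.logstd)  # broadcasts over the batch dimension
    return distributions.Normal(mean, std)

# the supervised update would then minimize the negative log-likelihood of the
# expert actions, matching the commented loss above:
#   action_distribution = forward_as_distribution(policy, observations)
#   loss = -action_distribution.log_prob(actions).mean()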


#####################################################
@@ -109,7 +125,21 @@ def update(
            adv_n=None, acs_labels_na=None, qvals=None
    ):
        # TODO: update the policy and return the loss
        loss = TODO
        # question: why is a single gradient step enough here, rather than many?

        # convert the numpy inputs to torch tensors, then predict actions
        actions = ptu.from_numpy(actions)
        predicted_actions = self.forward(ptu.from_numpy(observations))
        # predicted_actions = self.get_action(ptu.from_numpy(observations))
        # (not used: get_action expects and returns numpy arrays, not tensors)

        # supervised loss between expert and predicted actions
        loss = self.loss(actions, predicted_actions)

        # backpropagate and take one optimizer step
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        return {
            # You can add extra logging information here, but keep this line
            'Training Loss': ptu.to_numpy(loss),
106 changes: 106 additions & 0 deletions hw1/cs285/report_plots.py
@@ -0,0 +1,106 @@
import matplotlib.pyplot as plt
import seaborn as sns

import numpy as np
import pandas as pd
import os


def plot_mean_std(ax, iterations, mean, std, mean_expert, mean_bc):
    mean = np.array(mean)
    std = np.array(std)

    ax.plot(iterations, mean_expert, 'r', label='expert')
    ax.plot(iterations, mean_bc, 'g', label='naive bc')
    ax.plot(iterations, mean, 'b-s', label='DAgger mean')
    ax.fill_between(iterations, mean - std, mean + std, alpha=0.2, label='DAgger std')


def set_plot_env(iterations, mean, std, mean_expert, mean_bc, exp_name):

    plt.figure(figsize=(10, 5))
    style = "whitegrid"
    sns.set_theme(style=style)  # background color
    ax = plt.gca()
    plot_mean_std(ax, iterations, mean, std, mean_expert, mean_bc)

    ax.legend(loc='center right')
    ax.set_xlabel('Iterations')
    ax.set_ylabel('Return')
    ax.set_title('return of ' + exp_name + ' experiment')
    ax.set_xlim([-0.5, 10])

    exp_dir = 'plots/'
    if not os.path.exists(exp_dir):
        os.makedirs(exp_dir)
    plt.savefig(fname=exp_dir + 'figure-2_' + exp_name + '.png', format='png')

def plot_DAgger(mean, std, mean_expert, exp_name):
    iterations = np.arange(len(mean))
    I = np.ones(len(mean))
    mean_bc = mean[0] * I  # iteration 0 of DAgger is plain behavioral cloning
    mean_expert = mean_expert * I
    set_plot_env(iterations, mean, std, mean_expert, mean_bc, exp_name)

def plot_changing_ep(ep_len, mean_len, std_len):
    plt.figure(figsize=(10, 5))
    style = "whitegrid"
    sns.set_theme(style=style)  # background color
    ax = plt.gca()

    mean_len = np.array(mean_len)
    std_len = np.array(std_len)
    I = np.ones(len(mean_len))
    mean_expert = Ant_exp.mean_expert * I

    ax.plot(ep_len, mean_expert, 'r', label='expert')
    ax.plot(ep_len, mean_len, 'b-s', label='bc mean')
    ax.fill_between(ep_len, mean_len - std_len, mean_len + std_len, alpha=0.2, label='bc std')
    ax.legend(loc='upper left')
    ax.set_xlabel('num of training steps')
    ax.set_ylabel('Return')
    ax.set_title('return of Ant experiments with varying training steps')

    exp_dir = 'plots/'
    if not os.path.exists(exp_dir):
        os.makedirs(exp_dir)
    plt.savefig(fname=exp_dir + 'figure-1_varying_train_step' + '.png', format='png')


class Ant_exp:
    # eval returns per iteration, used by plot_DAgger (figure 2)
    mean = [4274., 4648., 4746., 4619., 4447., 4356., 4731., 4739., 4581., 4834.]
    std = [1128., 53., 85., 103., 854., 1030., 124., 135., 336., 109.]
    mean_expert = 4710

    # behavioral cloning with varying training steps, used by plot_changing_ep (figure 1)
    ep_len = [100, 300, 500, 700, 990, 1100, 1400, 1500, 1700]
    mean_len = [567, 1505, 3849, 3296, 3774, 3570, 2227, 4236, 4249]
    std_len = [7, 1250, 1299, 1604, 1363, 1392, 1740, 1151, 901]

class Hooper_exp:
    mean = [523., 1700., 2453., 3763., 3778., 3790., 3388., 3771., 3537., 3788.]
    std = [75., 607., 725., 4., 3., 4., 671., 3., 160., 3.]
    mean_expert = 3779


if __name__ == "__main__":

    # figure 1
    exp = Ant_exp
    mean_len = exp.mean_len
    std_len = exp.std_len
    ep_len = exp.ep_len
    plot_changing_ep(ep_len, mean_len, std_len)

    # figure 2-1
    exp = Ant_exp
    mean = exp.mean
    std = exp.std
    mean_expert = exp.mean_expert
    plot_DAgger(mean, std, mean_expert, 'Ant')

    # figure 2-2
    exp = Hooper_exp
    mean = exp.mean
    std = exp.std
    mean_expert = exp.mean_expert
    plot_DAgger(mean, std, mean_expert, 'Hooper')
68 changes: 68 additions & 0 deletions hw1/repot_plots.ipynb

Large diffs are not rendered by default.