Hw1/learn #1


Open: wants to merge 36 commits into base: main
Changes from all commits
2 changes: 2 additions & 0 deletions .gitignore
@@ -2,3 +2,5 @@
data/
*.DS_Store
*~
*.svg
*.png
27 changes: 26 additions & 1 deletion hw1/cs285/infrastructure/pytorch_util.py
@@ -47,8 +47,33 @@ def build_mlp(

    # TODO: return a MLP. This should be an instance of nn.Module
    # Note: nn.Sequential is an instance of nn.Module.
    raise NotImplementedError

    # build the model layer by layer with nn.Sequential
    model = nn.Sequential()

    # 1. input layer, followed by an activation
    # (without this activation, the input layer and the first hidden layer would
    # stack two Linear modules back to back and collapse into a single linear map)
    model.add_module('input', nn.Linear(input_size, size))
    model.add_module('input_activation', activation)

    # 2. hidden layers, each followed by an activation
    for n in range(n_layers):
        model.add_module('hidden' + str(n), nn.Linear(size, size))
        model.add_module('activation' + str(n), activation)

    # 3. output layer with its output activation
    model.add_module('output', nn.Linear(size, output_size))
    model.add_module('output_activation', output_activation)

    # 4. sanity check: nn.Sequential is an nn.Module
    assert isinstance(model, nn.Module)

    return model

    # Alternative constructions (see the sketch below):
    #   nn.Sequential(collections.OrderedDict([('name', nn.Linear(...)), ...]))
    #   nn.Sequential(*[nn.Linear(...), activation, ...])
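To make the note above concrete, here is a minimal illustrative sketch (not part of this PR) of the OrderedDict and plain-list constructions, assuming the same arguments build_mlp receives; the helper name build_mlp_ordered is invented for illustration:

import collections
from torch import nn

def build_mlp_ordered(input_size, size, output_size, n_layers, activation, output_activation):
    # named layers collected into an OrderedDict
    layers = [('input', nn.Linear(input_size, size)), ('input_activation', activation)]
    for n in range(n_layers):
        layers.append(('hidden' + str(n), nn.Linear(size, size)))
        layers.append(('activation' + str(n), activation))
    layers.append(('output', nn.Linear(size, output_size)))
    layers.append(('output_activation', output_activation))
    return nn.Sequential(collections.OrderedDict(layers))

# equivalently, unnamed modules can be unpacked from a plain list:
#   modules = [nn.Linear(input_size, size), activation, ..., nn.Linear(size, output_size), output_activation]
#   model = nn.Sequential(*modules)

Reusing one stateless activation instance (e.g. nn.Tanh()) across layers is fine; modules that own parameters must be distinct instances.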

device = None

15 changes: 13 additions & 2 deletions hw1/cs285/infrastructure/replay_buffer.py
@@ -77,8 +77,19 @@ def sample_random_data(self, batch_size):
        ## HINT 2: return corresponding data points from each array (i.e., not different indices from each array)
        ## HINT 3: look at the sample_recent_data function below

        return TODO, TODO, TODO, TODO, TODO

        # shuffle every array with the same permutation so that corresponding
        # entries stay aligned, then reuse sample_recent_data to slice off a batch
        random_indices = np.random.permutation(self.obs.shape[0])
        self.obs = self.obs[random_indices]
        self.acs = self.acs[random_indices]
        self.rews = self.rews[random_indices]
        self.next_obs = self.next_obs[random_indices]
        self.terminals = self.terminals[random_indices]
        return self.sample_recent_data(batch_size)

        # Alternative that leaves the buffer order untouched (see the sketch below):
        # idx = np.random.permutation(self.obs.shape[0])[:batch_size]
        # return self.obs[idx], self.acs[idx], self.rews[idx], self.next_obs[idx], self.terminals[idx]
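As a small, self-contained illustration of HINT 2 (not part of this PR): whichever variant is used, the same index array must be applied to every buffer so that (observation, action, reward, next observation, terminal) tuples stay aligned. The toy arrays below are hypothetical stand-ins for self.obs, self.acs, etc.

import numpy as np

obs = np.arange(5)          # stand-in for self.obs
acs = 10 * np.arange(5)     # stand-in for self.acs

# draw one set of random indices and reuse it for every array
idx = np.random.permutation(obs.shape[0])[:3]
print(obs[idx])   # e.g. [3 0 4]
print(acs[idx])   # then [30 0 40] -- the rows still describe the same transitions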



    def sample_recent_data(self, batch_size=1):
        return (
            self.obs[-batch_size:],
39 changes: 35 additions & 4 deletions hw1/cs285/infrastructure/rl_trainer.py
@@ -8,6 +8,7 @@
from cs285.infrastructure import pytorch_util as ptu
from cs285.infrastructure.logger import Logger
from cs285.infrastructure import utils
import pickle

# how many rollouts to save as videos to tensorboard
MAX_NVIDEO = 2
@@ -87,6 +88,8 @@ def run_training_loop(self, n_iter, collect_policy, eval_policy,
        self.total_envsteps = 0
        self.start_time = time.time()

        # track eval-return statistics across iterations (used for the report plots)
        mean_list = []
        std_list = []
        for itr in range(n_iter):
            print("\n\n********** Iteration %i ************" % itr)

@@ -127,13 +130,21 @@ def run_training_loop(self, n_iter, collect_policy, eval_policy,

            # perform logging
            print('\nBeginning logging procedure...')
            self.perform_logging(
            mean, std = self.perform_logging(
                itr, paths, eval_policy, train_video_paths, training_logs)

            mean_list.append(mean)
            std_list.append(std)

            if self.params['save_params']:
                print('\nSaving agent params')
                self.agent.save('{}/policy_itr_{}.pt'.format(self.params['logdir'], itr))

        # after the final iteration, print the eval-return history (used for the report)
        print('mean : ', np.round(mean_list))
        print('std : ', np.round(std_list))

        # self.perform_expert(expert_policy)

####################################
####################################

@@ -162,11 +173,16 @@ def collect_training_trajectories(

        # (2) collect `self.params['batch_size']` transitions

        # on the very first iteration, return the provided expert demonstrations
        # instead of rolling out the (still untrained) policy
        if itr == 0:
            with open(load_initial_expertdata, 'rb') as f:
                loaded_paths = pickle.load(f)
            return loaded_paths, 0, None

        # TODO collect `batch_size` samples to be used for training
        # HINT1: use sample_trajectories from utils
        # HINT2: you want each of these collected rollouts to be of length self.params['ep_len']
        print("\nCollecting data to be used for training...")
        paths, envsteps_this_batch = TODO
        paths, envsteps_this_batch = utils.sample_trajectories(
            self.env, collect_policy, batch_size, self.params['ep_len'])

        # collect more rollouts with the same policy, to be saved as videos in tensorboard
        # note: here, we collect MAX_NVIDEO rollouts, each of length MAX_VIDEO_LEN
@@ -187,12 +203,12 @@ def train_agent(self):
            # TODO sample some data from the data buffer
            # HINT1: use the agent's sample function
            # HINT2: how much data = self.params['train_batch_size']
            ob_batch, ac_batch, re_batch, next_ob_batch, terminal_batch = TODO
            ob_batch, ac_batch, re_batch, next_ob_batch, terminal_batch = self.agent.sample(self.params['train_batch_size'])

            # TODO use the sampled data to train an agent
            # HINT: use the agent's train function
            # HINT: keep the agent's training log for debugging
            train_log = TODO
            train_log = self.agent.train(ob_batch, ac_batch, re_batch, next_ob_batch, terminal_batch)
            all_logs.append(train_log)
        return all_logs

@@ -202,6 +218,8 @@ def do_relabel_with_expert(self, expert_policy, paths):
        # TODO relabel collected observations (from our policy) with labels from an expert policy
        # HINT: query the policy (using the get_action function) with paths[i]["observation"]
        # and replace paths[i]["action"] with these expert labels
        for path in paths:
            path["action"] = expert_policy.get_action(path["observation"])

        return paths

@@ -267,3 +285,16 @@ def perform_logging(self, itr, paths, eval_policy, train_video_paths, training_logs):
        print('Done logging...\n\n')

        self.logger.flush()

        return np.mean(eval_returns), np.std(eval_returns)

    def perform_expert(self, expert_policy):
        print("\nCollecting data for expert policy evaluation...")
        eval_paths, eval_envsteps_this_batch = utils.sample_trajectories(
            self.env, expert_policy, self.params['eval_batch_size'], self.params['ep_len'])
        eval_returns = [eval_path["reward"].sum() for eval_path in eval_paths]
        mean_exp = np.mean(eval_returns)
        std_exp = np.std(eval_returns)
        max_exp = np.max(eval_returns)
        min_exp = np.min(eval_returns)

        print('expert mean: {}, std: {}, max: {}, min: {}'.format(
            np.round(mean_exp), np.round(std_exp), np.round(max_exp), np.round(min_exp)))
16 changes: 10 additions & 6 deletions hw1/cs285/infrastructure/utils.py
@@ -7,7 +7,7 @@
def sample_trajectory(env, policy, max_path_length, render=False, render_mode=('rgb_array')):

    # initialize env for the beginning of a new rollout
    ob = TODO # HINT: should be the output of resetting the env
    ob = env.reset()  # the output of resetting the env

    # init vars
    obs, acs, rewards, next_obs, terminals, image_obs = [], [], [], [], [], []
@@ -27,7 +27,7 @@ def sample_trajectory(env, policy, max_path_length, render=False, render_mode=('rgb_array')):

        # use the most recent ob to decide what to do
        obs.append(ob)
        ac = TODO # HINT: query the policy's get_action function
        ac = policy.get_action(ob)  # query the policy's get_action function
        ac = ac[0]
        acs.append(ac)

@@ -41,7 +41,7 @@ def sample_trajectory(env, policy, max_path_length, render=False, render_mode=('rgb_array')):

        # TODO end the rollout if the rollout ended
        # HINT: rollout can end due to done, or due to max_path_length
        rollout_done = TODO # HINT: this is either 0 or 1
        rollout_done = int(done or steps >= max_path_length)  # either 0 or 1
        terminals.append(rollout_done)

        if rollout_done:
@@ -60,8 +60,9 @@ def sample_trajectories(env, policy, min_timesteps_per_batch, max_path_length, render=False, render_mode=('rgb_array')):
    timesteps_this_batch = 0
    paths = []
    while timesteps_this_batch < min_timesteps_per_batch:

        TODO
        path = sample_trajectory(env, policy, max_path_length, render, render_mode)
        paths.append(path)
        timesteps_this_batch += get_pathlength(path)

    return paths, timesteps_this_batch

@@ -74,7 +75,10 @@ def sample_n_trajectories(env, policy, ntraj, max_path_length, render=False, render_mode=('rgb_array')):
"""
paths = []

TODO
# TODO
for n in range(ntraj):
path = sample_trajectory(env, policy, max_path_length, render, render_mode)
paths.append(path)

return paths

36 changes: 33 additions & 3 deletions hw1/cs285/policies/MLP_policy.py
@@ -81,7 +81,9 @@ def get_action(self, obs: np.ndarray) -> np.ndarray:
            observation = obs[None]

        # TODO return the action that the policy prescribes
        raise NotImplementedError
        action = self.forward(ptu.from_numpy(observation))
        return ptu.to_numpy(action)

    # update/train this policy
    def update(self, observations, actions, **kwargs):
@@ -93,7 +95,21 @@ def update(self, observations, actions, **kwargs):
    # return more flexible objects, such as a
    # `torch.distributions.Distribution` object. It's up to you!
    def forward(self, observation: torch.FloatTensor) -> Any:
        raise NotImplementedError
        if self.discrete:
            action = self.logits_na(observation)
        else:
            action = self.mean_net(observation)
        return action

    '''
    Alternative: return a distribution instead of a raw action, e.g.

        if self.discrete:
            return distributions.Categorical(logits=self.logits_na(observation))
        else:
            return distributions.Normal(self.mean_net(observation), torch.exp(self.logstd)[None])

    and train with the negative log-likelihood:

        loss = -action_distribution.log_prob(actions).mean()
    '''
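To make the commented alternative above concrete, here is a standalone sketch (illustrative only, not part of this PR) of a forward pass that returns a torch.distributions object, assuming the MLPPolicy attributes discrete, logits_na, mean_net and logstd referenced elsewhere in this file:

import torch
from torch import distributions

def forward_as_distribution(policy, observation):
    # discrete actions: a categorical distribution over the logits
    if policy.discrete:
        return distributions.Categorical(logits=policy.logits_na(observation))
    # continuous actions: a Gaussian with state-dependent mean and a learned,
    # state-independent standard deviation
    mean = policy.mean_net(observation)
    std = torch.exp(policy.logstd)  # broadcasts over the batch dimension
    return distributions.Normal(mean, std)

# the supervised update would then minimize the negative log-likelihood of the
# expert actions, matching the commented loss above:
#   action_distribution = forward_as_distribution(policy, observations)
#   loss = -action_distribution.log_prob(actions).mean()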


#####################################################
@@ -109,7 +125,21 @@ def update(
            adv_n=None, acs_labels_na=None, qvals=None
    ):
        # TODO: update the policy and return the loss
        loss = TODO
        # question: why is a single gradient step enough here, rather than many?

        # convert the numpy inputs to torch tensors, then predict actions
        actions = ptu.from_numpy(actions)
        predicted_actions = self.forward(ptu.from_numpy(observations))
        # predicted_actions = self.get_action(ptu.from_numpy(observations))
        # (not used: get_action expects and returns numpy arrays, not tensors)

        # supervised loss between expert and predicted actions
        loss = self.loss(actions, predicted_actions)

        # backpropagate and take one optimizer step
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        return {
            # You can add extra logging information here, but keep this line
            'Training Loss': ptu.to_numpy(loss),
106 changes: 106 additions & 0 deletions hw1/cs285/report_plots.py
@@ -0,0 +1,106 @@
import matplotlib.pyplot as plt
import seaborn as sns

import numpy as np
import pandas as pd
import os


def plot_mean_std(ax, iterations, mean, std, mean_expert, mean_bc):
    mean = np.array(mean)
    std = np.array(std)

    ax.plot(iterations, mean_expert, 'r', label='expert')
    ax.plot(iterations, mean_bc, 'g', label='naive bc')
    ax.plot(iterations, mean, 'b-s', label='DAgger mean')
    ax.fill_between(iterations, mean - std, mean + std, alpha=0.2, label='DAgger std')


def set_plot_env(iterations, mean, std, mean_expert, mean_bc, exp_name):

    plt.figure(figsize=(10, 5))
    style = "whitegrid"
    sns.set_theme(style=style)  # background color
    ax = plt.gca()
    plot_mean_std(ax, iterations, mean, std, mean_expert, mean_bc)

    ax.legend(loc='center right')
    ax.set_xlabel('Iterations')
    ax.set_ylabel('Return')
    ax.set_title('return of ' + exp_name + ' experiment')
    ax.set_xlim([-0.5, 10])

    exp_dir = 'plots/'
    if not os.path.exists(exp_dir):
        os.makedirs(exp_dir)
    plt.savefig(fname=exp_dir + 'figure-2_' + exp_name + '.png', format='png')

def plot_DAgger(mean, std, mean_expert, exp_name):
    iterations = np.arange(len(mean))
    I = np.ones(len(mean))
    mean_bc = mean[0] * I  # iteration 0 of DAgger is plain behavioral cloning
    mean_expert = mean_expert * I
    set_plot_env(iterations, mean, std, mean_expert, mean_bc, exp_name)

def plot_changing_ep(ep_len, mean_len, std_len):
    plt.figure(figsize=(10, 5))
    style = "whitegrid"
    sns.set_theme(style=style)  # background color
    ax = plt.gca()

    mean_len = np.array(mean_len)
    std_len = np.array(std_len)
    I = np.ones(len(mean_len))
    mean_expert = Ant_exp.mean_expert * I

    ax.plot(ep_len, mean_expert, 'r', label='expert')
    ax.plot(ep_len, mean_len, 'b-s', label='bc mean')
    ax.fill_between(ep_len, mean_len - std_len, mean_len + std_len, alpha=0.2, label='bc std')
    ax.legend(loc='upper left')
    ax.set_xlabel('num of training steps')
    ax.set_ylabel('Return')
    ax.set_title('return of Ant experiments with varying training steps')

    exp_dir = 'plots/'
    if not os.path.exists(exp_dir):
        os.makedirs(exp_dir)
    plt.savefig(fname=exp_dir + 'figure-1_varying_train_step' + '.png', format='png')


class Ant_exp:
    # eval returns per iteration, used by plot_DAgger (figure 2)
    mean = [4274., 4648., 4746., 4619., 4447., 4356., 4731., 4739., 4581., 4834.]
    std = [1128., 53., 85., 103., 854., 1030., 124., 135., 336., 109.]
    mean_expert = 4710

    # behavioral cloning with varying training steps, used by plot_changing_ep (figure 1)
    ep_len = [100, 300, 500, 700, 990, 1100, 1400, 1500, 1700]
    mean_len = [567, 1505, 3849, 3296, 3774, 3570, 2227, 4236, 4249]
    std_len = [7, 1250, 1299, 1604, 1363, 1392, 1740, 1151, 901]

class Hooper_exp:
    mean = [523., 1700., 2453., 3763., 3778., 3790., 3388., 3771., 3537., 3788.]
    std = [75., 607., 725., 4., 3., 4., 671., 3., 160., 3.]
    mean_expert = 3779


if __name__ == "__main__":

    # figure 1
    exp = Ant_exp
    mean_len = exp.mean_len
    std_len = exp.std_len
    ep_len = exp.ep_len
    plot_changing_ep(ep_len, mean_len, std_len)

    # figure 2-1
    exp = Ant_exp
    mean = exp.mean
    std = exp.std
    mean_expert = exp.mean_expert
    plot_DAgger(mean, std, mean_expert, 'Ant')

    # figure 2-2
    exp = Hooper_exp
    mean = exp.mean
    std = exp.std
    mean_expert = exp.mean_expert
    plot_DAgger(mean, std, mean_expert, 'Hooper')
68 changes: 68 additions & 0 deletions hw1/repot_plots.ipynb

Large diffs are not rendered by default.