Commit

added noisy networks & now supporting FlappyBird-v0
Andrewzh112 committed Feb 7, 2021
1 parent 13db269 commit c03f3ab
Showing 4 changed files with 108 additions and 80 deletions.
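This commit threads a new noised flag from the command line through DQNAgent and into the Q networks: when set, the fully connected heads of QBasic and QDueling are built from NoisedLinear layers (NoisyNet-style parameter noise) instead of nn.Linear, so epsilon-greedy exploration can be turned off. It also imports gym_ple so FlappyBird-v0 can be trained with the same script. A minimal invocation sketch, assuming the script is run as a module from the repository root and that the environment flag is spelled --env_name (both assumptions; only --noised and the FlappyBird-v0 choice are confirmed by the diff below):

python -m qlearning.atari.main --env_name FlappyBird-v0 --noised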
9 changes: 6 additions & 3 deletions qlearning/agent.py
@@ -114,15 +114,17 @@ def __init__(self, *args, **kwargs):
kwargs['cpt_dir'],
kwargs['algorithm'] + '_' + kwargs['env_name'],
kwargs['img_size'],
kwargs['hidden_dim']).to(self.device)
kwargs['hidden_dim'],
noised=kwargs['noised']).to(self.device)
else:
self.Q_function = QBasic(
kwargs['input_channels'],
self.n_actions,
kwargs['cpt_dir'],
kwargs['algorithm'] + '_' + kwargs['env_name'],
kwargs['img_size'],
kwargs['hidden_dim']).to(self.device)
kwargs['hidden_dim'],
noised=kwargs['noised']).to(self.device)

# instantiate target network
self.target_Q = deepcopy(self.Q_function)
Expand All @@ -148,10 +150,11 @@ def freeze_network(self, network):
p.requires_grad = False

def update(self):
# Q_t = Q_t + lr * (reward + gamma * Q'_t - Q^target_t) ** 2
# keep sampling until we have a full batch
if self.memory.ctr < self.batch_size:
return

# minimize the TD loss: (reward + gamma * Q_target(s', a') - Q(s, a)) ** 2
self.optimizer.zero_grad()
observations, rewards, actions, next_observations, dones, idx, weights = self.sample_transitions()

67 changes: 37 additions & 30 deletions qlearning/atari/main.py
@@ -5,7 +5,7 @@
from collections import deque
from pathlib import Path
from gym import wrappers
# from ple.games.flappybird import FlappyBird
import gym_ple

from qlearning.agent import DQNAgent
from qlearning.atari.utils import processed_atari
Expand All @@ -23,7 +23,8 @@
SpaceInvadersNoFrameskip-v4\n \
EnduroNoFrameskip-v4\n \
AtlantisNoFrameskip-v4\n \
BankHeistNoFrameskip-v4')
BankHeistNoFrameskip-v4\n \
FlappyBird-v0')
parser.add_argument('--n_repeats', type=int, default=4, help='The number of repeated actions')
parser.add_argument('--img_size', type=int, default=84, help='The height and width of images after resizing')
parser.add_argument('--input_channels', type=int, default=1, help='The input channels after preprocessing')
Expand All @@ -44,6 +45,7 @@
parser.add_argument('--alpha', type=float, default=0.6, help='Prioritized Experience Replay alpha')
parser.add_argument('--beta', type=float, default=0.4, help='Prioritized Experience Replay beta')
parser.add_argument('--eps', type=float, default=1e-5, help='Prioritized Experience Replay epsilon')
parser.add_argument('--noised', action="store_true", default=False, help='Using noisy networks')

# logging
parser.add_argument('--progress_window', type=int, default=100, help='Window of episodes for progress')
@@ -71,41 +73,45 @@
env = wrappers.Monitor(env, args.video_dir,
video_callable=lambda episode_id: True,
force=True)
if 'DQN' in args.algorithm:
agent = DQNAgent(env.observation_space.shape,
env.action_space.n,
args.epsilon_init, args.epsilon_min, args.epsilon_desc,
args.gamma, args.lr, args.n_episodes,
input_channels=args.input_channels,
algorithm=args.algorithm,
img_size=args.img_size,
hidden_dim=args.hidden_dim,
max_size=args.max_size,
target_update_interval=args.target_update_interval,
batch_size=args.batch_size,
cpt_dir=args.cpt_dir,
grad_clip=args.grad_clip,
prioritize=not args.no_prioritize,
alpha=args.alpha,
beta=args.beta,
eps=args.eps,
env_name=args.env_name)
else:
raise NotImplementedError
# force some parameters depending on if using priority replay

# force some parameters depending on whether priority replay is used, following the paper protocols
if args.no_prioritize:
args.alpha, args.beta, args.epsilon = 1, 0, 0
else:
args.lr /= 4
scores, best_score = deque(maxlen=args.progress_window), -np.inf

# load weights & make sure model in eval mode during test
# no need for exploration if using noisy networks
if args.noised:
args.epsilon_init, args.epsilon_min = 0, 0

agent = DQNAgent(env.observation_space.shape,
env.action_space.n,
args.epsilon_init, args.epsilon_min, args.epsilon_desc,
args.gamma, args.lr, args.n_episodes,
input_channels=args.input_channels,
algorithm=args.algorithm,
img_size=args.img_size,
hidden_dim=args.hidden_dim,
max_size=args.max_size,
target_update_interval=args.target_update_interval,
batch_size=args.batch_size,
cpt_dir=args.cpt_dir,
grad_clip=args.grad_clip,
prioritize=not args.no_prioritize,
alpha=args.alpha,
beta=args.beta,
eps=args.eps,
noised=args.noised,
env_name=args.env_name)

# load weights & make sure model in eval mode during test, only need online network for testings
if args.test:
agent.load_models()
agent.Q_function.eval()

scores, best_score = deque(maxlen=args.progress_window), -np.inf
pbar = tqdm(range(args.n_episodes))
for e in pbar:

# reset every episode and make sure functions are in training mode
done, score, observation = False, 0, env.reset()
agent.Q_function.train()
Expand All @@ -128,8 +134,9 @@
# logging
writer.add_scalars('Performance and training', {'Score': score, 'Epsilon': agent.epsilon})
scores.append(score)
if score > best_score and not args.test:
avg_score = np.mean(scores)
if avg_score > best_score and not args.test:
agent.save_models()
best_score = score
best_score = avg_score
if (e + 1) % args.print_every == 0:
tqdm.write(f'Episode: {e + 1}/{args.n_episodes}, Average Score: {np.mean(scores)}, Best Score {best_score}, Epsilon: {agent.epsilon}')
tqdm.write(f'Episode: {e + 1}/{args.n_episodes}, Average Score: {avg_score}, Best Score {best_score}, Epsilon: {agent.epsilon}')
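Two behavioural changes land in this hunk: with --noised, epsilon_init and epsilon_min are forced to 0, since exploration now comes from the weight noise resampled inside the NoisedLinear layers on every training-mode forward pass, and checkpoints are now saved when the rolling average over progress_window episodes improves rather than whenever a single episode sets a new high score. A minimal sketch of greedy action selection under the noisy setting (illustrative only; the function and variable names below are not the repository's agent API):

import torch

def choose_action_greedy(q_network, observation):
    # observation: a preprocessed frame stack as a torch tensor
    # epsilon is 0, so there is no random-action branch; stochasticity comes
    # from the noisy layers while q_network is in train() mode
    with torch.no_grad():
        q_values = q_network(observation.unsqueeze(0))
    return int(q_values.argmax(dim=-1).item())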
39 changes: 0 additions & 39 deletions qlearning/frozen_lake/agent.py

This file was deleted.

73 changes: 65 additions & 8 deletions qlearning/networks.py
@@ -1,6 +1,7 @@
import torch.nn as nn
from torch.nn import functional as F
import torch
import math
import os


Expand All @@ -19,7 +20,7 @@ def forward(self, state):
class QBasic(nn.Module):
def __init__(self, input_channels, n_actions, cpt_dir, name,
img_size=84, hidden_dim=512, n_repeats=4, channels=[32, 64, 64],
kernel_sizes=[8, 4, 3], strides=[4, 2, 1]):
kernel_sizes=[8, 4, 3], strides=[4, 2, 1], noised=False):
super().__init__()
q_network = []
# CNN layers
Expand All @@ -35,9 +36,12 @@ def __init__(self, input_channels, n_actions, cpt_dir, name,
fc_size = nn.Sequential(*q_network)(dummy_img).size(-1)

# FC layers
q_network.append(nn.Linear(fc_size, hidden_dim))
q_network.append(nn.ReLU())
q_network.append(nn.Linear(hidden_dim, n_actions))
if noised:
q_network.extend(
[NoisedLinear(fc_size, hidden_dim), nn.ReLU(), NoisedLinear(hidden_dim, n_actions)])
else:
q_network.extend(
[nn.Linear(fc_size, hidden_dim), nn.ReLU(), nn.Linear(hidden_dim, n_actions)])
self.q_network = nn.Sequential(*q_network)

# training
Expand All @@ -58,7 +62,7 @@ def load_checkpoint(self):
class QDueling(nn.Module):
def __init__(self, input_channels, n_actions, cpt_dir, name,
img_size=84, hidden_dim=512, n_repeats=4, channels=[32, 64, 64],
kernel_sizes=[8, 4, 3], strides=[4, 2, 1]):
kernel_sizes=[8, 4, 3], strides=[4, 2, 1], noised=False):
super().__init__()
feature_extractor = []
# CNN layers
Expand All @@ -74,13 +78,20 @@ def __init__(self, input_channels, n_actions, cpt_dir, name,
fc_size = nn.Sequential(*feature_extractor)(dummy_img).size(-1)

# FC layers
feature_extractor.append(nn.Linear(fc_size, hidden_dim))
if noised:
feature_extractor.append(NoisedLinear(fc_size, hidden_dim))
else:
feature_extractor.append(nn.Linear(fc_size, hidden_dim))
feature_extractor.append(nn.ReLU())
self.feature_extractor = nn.Sequential(*feature_extractor)

# value & advantage fns
self.value = nn.Linear(hidden_dim, 1)
self.advantage = nn.Linear(hidden_dim, n_actions)
if noised:
self.value = NoisedLinear(hidden_dim, 1)
self.advantage = NoisedLinear(hidden_dim, n_actions)
else:
self.value = nn.Linear(hidden_dim, 1)
self.advantage = nn.Linear(hidden_dim, n_actions)

# training
self.name = name
Expand All @@ -99,3 +110,49 @@ def check_point(self):

def load_checkpoint(self):
self.load_state_dict(torch.load(self.cpt + '.pth'))


class NoisedMatrix(nn.Module):
def __init__(self, in_features, out_features, sigma_init=0.017):
super().__init__()
self.matrix_mu = nn.Parameter(torch.randn(in_features, out_features))
self.matrix_sigma = nn.Parameter(torch.randn(in_features, out_features))
self.matrix_epsilon = torch.empty(in_features, out_features)

# initialization scheme
self.sigma_init = sigma_init
init_range = math.sqrt(3 / in_features)
self.init_weights(init_range)

def combine_parameters(self):
self.reset_epsilon()
return self.matrix_mu + self.matrix_sigma * self.matrix_epsilon

def reset_epsilon(self):
self.matrix_epsilon.normal_(0, 1)
self.matrix_epsilon = self.matrix_epsilon.to(self.matrix_mu.device)

def init_weights(self, init_range):
self.matrix_mu.data = torch.nn.init.uniform_(
self.matrix_mu.data, a=-init_range, b=init_range)
self.matrix_sigma.data = torch.ones_like(self.matrix_sigma.data) * self.sigma_init

def forward(self):
pass


class NoisedLinear(nn.Module):
def __init__(self, in_features, out_features):
super().__init__()
self.weight = NoisedMatrix(in_features, out_features)
self.bias = NoisedMatrix(out_features, 1)

def forward(self, state):
if self.training:
weight = self.weight.combine_parameters()
bias = self.bias.combine_parameters().squeeze()
else:
weight = self.weight.matrix_mu
bias = self.bias.matrix_mu.squeeze()
Qs = state @ weight + bias
return Qs
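
NoisedLinear is a drop-in replacement for nn.Linear: each weight keeps a learned mean (matrix_mu) and a learned noise scale (matrix_sigma), and training-mode forward passes use mu + sigma * epsilon with fresh epsilon sampled in combine_parameters(), while eval mode uses the means alone. The sigma_init of 0.017 and the uniform mu range of sqrt(3 / in_features) follow the independent-Gaussian-noise variant of NoisyNet (Fortunato et al., 2018). A small usage sketch, assuming the file is importable as qlearning.networks (shapes are illustrative):

import torch
from qlearning.networks import NoisedLinear

layer = NoisedLinear(in_features=512, out_features=6)
x = torch.randn(32, 512)            # batch of 32 feature vectors

layer.train()
q_noisy = layer(x)                  # weights are mu + sigma * freshly sampled epsilon

layer.eval()
q_mean = layer(x)                   # deterministic: only the learned means are used

print(q_noisy.shape, q_mean.shape)  # torch.Size([32, 6]) torch.Size([32, 6])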
