
Commit

fixed ou noise
Andrewzh112 committed Feb 20, 2021
1 parent c00e3bc commit dd1453f
Showing 3 changed files with 10 additions and 11 deletions.
policy/agent.py (9 changes: 7 additions, 2 deletions)
@@ -1,8 +1,9 @@
import numpy as np
import torch
from copy import deepcopy

from policy.networks import ActorCritic, Actor, Critic
from policy.utils import ReplayBuffer, OUActionNoise
from policy.utils import ReplayBuffer, OUActionNoise, clip_action


class BlackJackAgent:
@@ -208,6 +209,7 @@ def __init__(self, state_dim, action_dim, hidden_dims, max_action, gamma,
self.gamma = gamma
self.tau = tau
self.batch_size = batch_size
self.max_action = max_action
self.memory = ReplayBuffer(state_dim, action_dim, maxsize)
self.noise = OUActionNoise(torch.zeros(action_dim, device=self.device),
sigma=sigma,
@@ -270,8 +272,11 @@ def choose_action(self, observation):
with torch.no_grad():
mu = self.actor(observation)
action = mu + self.noise()
print(mu, action)
self.actor.train()
return action.cpu().detach().numpy()
action = action.cpu().detach().numpy()
# clip noised action to ensure not out of bounds
return clip_action(action, self.max_action)

def store_transition(self, state, action, reward, next_state, done):
state = torch.tensor(state)
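
The clip_action helper pulled in by the new import is not shown in this commit. A minimal sketch of what such a helper might look like, assuming it simply clamps the NumPy action vector to the symmetric bounds [-max_action, max_action] used by the environment:

import numpy as np

def clip_action(action, max_action):
    # clamp each action dimension to the environment's symmetric bounds
    return np.clip(action, -max_action, max_action)

Clipping after the OU noise is added keeps exploration inside the valid action range without altering the actor's raw output.
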
policy/lunarlander/main.py (8 changes: 2 additions, 6 deletions)
@@ -7,7 +7,6 @@
from pathlib import Path
from torch.utils.tensorboard import SummaryWriter

from policy.utils import clip_action
from policy import agent as Agent


@@ -90,9 +89,6 @@ def main():
env.render()

action = agent.choose_action(observation)
# clip noised action to ensure not out of bounds
if args.agent in ['DDPG']:
action = clip_action(action, max_action)
next_observation, reward, done, _ = env.step(action)
score += reward

@@ -131,8 +127,8 @@ def main():
global_step=e)
actor_losses, critic_losses = [], []

if score > best_score:
best_score = score
if np.mean(score_history) > best_score:
best_score = np.mean(score_history)
agent.save_models()
tqdm.write(
f'Episode: {e + 1}/{args.n_episodes}, Score: {score}, Average Score: {np.mean(score_history)}')
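
This hunk also changes the checkpointing rule: instead of saving whenever a single episode beats the previous best score, the model is saved when the running average over score_history improves. A sketch of the pattern, assuming score_history holds a bounded window of recent episode returns (the window length is not visible in this diff):

score_history.append(score)
avg_score = np.mean(score_history)
if avg_score > best_score:
    # save only when the smoothed performance improves, which is less
    # sensitive to a single lucky episode than the raw per-episode score
    best_score = avg_score
    agent.save_models()
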
policy/utils.py (4 changes: 1 addition, 3 deletions)
@@ -1,4 +1,3 @@
import math
import numpy as np
import torch

@@ -14,8 +13,7 @@ def __init__(self, mu, sigma=0.2, theta=0.15, dt=1e-2, x0=None):
self.reset()

def __call__(self):
x = self.x_prev + self.theta * (self.mu - self.x_prev) * \
self.dt + self.sigma + math.sqrt(self.dt) * torch.randn(*self.mu.shape, device=self.device)
x = self.x_prev + self.theta * (self.mu - self.x_prev) * self.dt + self.sigma * np.sqrt(self.dt) * torch.randn(size=self.mu.shape)
self.x_prev = x
return x

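The substance of the commit is the noise update itself: the old line added self.sigma and sqrt(dt) as two separate constant terms, whereas the Euler-Maruyama discretization of an Ornstein-Uhlenbeck process scales the Gaussian sample by sigma * sqrt(dt), i.e. x_{t+1} = x_t + theta * (mu - x_t) * dt + sigma * sqrt(dt) * N(0, 1). A self-contained sketch of the corrected class follows; the reset method is not part of this diff, so its body here is an assumption:

import torch

class OUActionNoise:
    def __init__(self, mu, sigma=0.2, theta=0.15, dt=1e-2, x0=None):
        self.mu = mu
        self.sigma = sigma
        self.theta = theta
        self.dt = dt
        self.x0 = x0
        self.reset()

    def __call__(self):
        # mean-reverting drift plus a Gaussian increment scaled by sigma * sqrt(dt);
        # randn_like keeps the sample on the same device and dtype as mu
        x = (self.x_prev
             + self.theta * (self.mu - self.x_prev) * self.dt
             + self.sigma * self.dt ** 0.5 * torch.randn_like(self.mu))
        self.x_prev = x
        return x

    def reset(self):
        # assumption: restart from x0 if given, otherwise from zero
        self.x_prev = self.x0 if self.x0 is not None else torch.zeros_like(self.mu)

Using torch.randn_like(self.mu) also keeps the noise on the same device as mu, which the committed one-liner no longer specifies explicitly.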
