From dd1453f462b4df7b161568117eafe25e97106429 Mon Sep 17 00:00:00 2001
From: Andrew Zhao
Date: Sat, 20 Feb 2021 18:54:25 +0800
Subject: [PATCH] fixed ou noise

---
 policy/agent.py            | 9 +++++++--
 policy/lunarlander/main.py | 8 ++------
 policy/utils.py            | 4 +---
 3 files changed, 10 insertions(+), 11 deletions(-)

diff --git a/policy/agent.py b/policy/agent.py
index e6e4190..42328f9 100644
--- a/policy/agent.py
+++ b/policy/agent.py
@@ -1,8 +1,9 @@
 import numpy as np
 import torch
 from copy import deepcopy
+
 from policy.networks import ActorCritic, Actor, Critic
-from policy.utils import ReplayBuffer, OUActionNoise
+from policy.utils import ReplayBuffer, OUActionNoise, clip_action
 
 
 class BlackJackAgent:
@@ -208,6 +209,7 @@ def __init__(self, state_dim, action_dim, hidden_dims, max_action, gamma,
         self.gamma = gamma
         self.tau = tau
         self.batch_size = batch_size
+        self.max_action = max_action
         self.memory = ReplayBuffer(state_dim, action_dim, maxsize)
         self.noise = OUActionNoise(torch.zeros(action_dim, device=self.device),
                                    sigma=sigma,
@@ -270,8 +272,11 @@ def choose_action(self, observation):
         with torch.no_grad():
             mu = self.actor(observation)
             action = mu + self.noise()
+            print(mu, action)
         self.actor.train()
-        return action.cpu().detach().numpy()
+        action = action.cpu().detach().numpy()
+        # clip noised action to ensure not out of bounds
+        return clip_action(action, self.max_action)
 
     def store_transition(self, state, action, reward, next_state, done):
         state = torch.tensor(state)
diff --git a/policy/lunarlander/main.py b/policy/lunarlander/main.py
index b36c6a6..fd08281 100644
--- a/policy/lunarlander/main.py
+++ b/policy/lunarlander/main.py
@@ -7,7 +7,6 @@
 from pathlib import Path
 from torch.utils.tensorboard import SummaryWriter
 
-from policy.utils import clip_action
 from policy import agent as Agent
 
 
@@ -90,9 +89,6 @@ def main():
                 env.render()
 
             action = agent.choose_action(observation)
-            # clip noised action to ensure not out of bounds
-            if args.agent in ['DDPG']:
-                action = clip_action(action, max_action)
             next_observation, reward, done, _ = env.step(action)
             score += reward
 
@@ -131,8 +127,8 @@ def main():
                     global_step=e)
                 actor_losses, critic_losses = [], []
 
-        if score > best_score:
-            best_score = score
+        if np.mean(score_history) > best_score:
+            best_score = np.mean(score_history)
             agent.save_models()
         tqdm.write(
             f'Episode: {e + 1}/{args.n_episodes}, Score: {score}, Average Score: {np.mean(score_history)}')
diff --git a/policy/utils.py b/policy/utils.py
index 5cc9dff..28d41c2 100644
--- a/policy/utils.py
+++ b/policy/utils.py
@@ -1,4 +1,3 @@
-import math
 import numpy as np
 import torch
 
@@ -14,8 +13,7 @@ def __init__(self, mu, sigma=0.2, theta=0.15, dt=1e-2, x0=None):
         self.reset()
 
     def __call__(self):
-        x = self.x_prev + self.theta * (self.mu - self.x_prev) * \
-            self.dt + self.sigma + math.sqrt(self.dt) * torch.randn(*self.mu.shape, device=self.device)
+        x = self.x_prev + self.theta * (self.mu - self.x_prev) * self.dt + self.sigma * np.sqrt(self.dt) * torch.randn(size=self.mu.shape)
         self.x_prev = x
         return x
 
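
Note on the core fix in policy/utils.py: the Ornstein-Uhlenbeck noise step is the Euler-Maruyama
discretization x_{t+1} = x_t + theta * (mu - x_t) * dt + sigma * sqrt(dt) * N(0, I). The old
expression added sigma as a constant offset instead of using it to scale the Gaussian diffusion
term, so every sample carried a fixed bias of sigma in each action dimension. A minimal,
self-contained sketch of what the corrected __call__ computes; the class below is illustrative
only, not the repository's OUActionNoise, though the constructor defaults mirror it:

    import numpy as np
    import torch


    class OUNoiseSketch:
        """Illustrative stand-in for OUActionNoise (not the repo's exact class)."""

        def __init__(self, mu, sigma=0.2, theta=0.15, dt=1e-2, x0=None):
            self.mu, self.sigma, self.theta, self.dt, self.x0 = mu, sigma, theta, dt, x0
            self.reset()

        def __call__(self):
            # Mean-reverting drift plus diffusion scaled by sigma * sqrt(dt); the bug
            # was adding sigma as an offset instead of multiplying the noise term by it.
            x = (self.x_prev
                 + self.theta * (self.mu - self.x_prev) * self.dt
                 + self.sigma * np.sqrt(self.dt) * torch.randn(size=self.mu.shape))
            self.x_prev = x
            return x

        def reset(self):
            # Restart the process at x0, or at zeros shaped like mu.
            self.x_prev = self.x0 if self.x0 is not None else torch.zeros_like(self.mu)


    noise = OUNoiseSketch(torch.zeros(2))
    print(noise(), noise())  # successive samples are temporally correlated

With the old formula the constant +sigma bias pushed the noised action toward one side of the
action range on every step, which is why the exploration behaviour looked skewed before this patch.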
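
The clipping itself now lives inside DDPGAgent.choose_action instead of the training loop, so the
agent keeps max_action and main.py no longer imports clip_action. clip_action is defined in
policy/utils.py and is not shown in this diff; a plausible sketch with the same assumed contract
(clamp a noised action into the environment's symmetric bounds, not the repository's actual code):

    import numpy as np


    def clip_action(action, max_action):
        # Assumed contract: keep each component of the noised action inside
        # [-max_action, max_action] so env.step never sees an out-of-range value.
        return np.clip(action, -max_action, max_action)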