
Commit

fixed ou noise
Andrewzh112 committed Feb 20, 2021
1 parent c00e3bc commit dd1453f
Showing 3 changed files with 10 additions and 11 deletions.
policy/agent.py (9 changes: 7 additions, 2 deletions)
@@ -1,8 +1,9 @@
import numpy as np
import torch
from copy import deepcopy

from policy.networks import ActorCritic, Actor, Critic
from policy.utils import ReplayBuffer, OUActionNoise
from policy.utils import ReplayBuffer, OUActionNoise, clip_action


class BlackJackAgent:
@@ -208,6 +209,7 @@ def __init__(self, state_dim, action_dim, hidden_dims, max_action, gamma,
self.gamma = gamma
self.tau = tau
self.batch_size = batch_size
self.max_action = max_action
self.memory = ReplayBuffer(state_dim, action_dim, maxsize)
self.noise = OUActionNoise(torch.zeros(action_dim, device=self.device),
sigma=sigma,
@@ -270,8 +272,11 @@ def choose_action(self, observation):
with torch.no_grad():
mu = self.actor(observation)
action = mu + self.noise()
print(mu, action)
self.actor.train()
return action.cpu().detach().numpy()
action = action.cpu().detach().numpy()
# clip noised action to ensure not out of bounds
return clip_action(action, self.max_action)

def store_transition(self, state, action, reward, next_state, done):
state = torch.tensor(state)
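
The clip_action helper pulled in by the new import is not shown in this commit. A minimal sketch of what such a helper might look like, assuming it simply clamps the NumPy action vector to the symmetric bounds [-max_action, max_action] used by the environment:

import numpy as np

def clip_action(action, max_action):
    # clamp each action dimension to the environment's symmetric bounds
    return np.clip(action, -max_action, max_action)

Clipping after the OU noise is added keeps exploration inside the valid action range without altering the actor's raw output.
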
policy/lunarlander/main.py (8 changes: 2 additions, 6 deletions)
@@ -7,7 +7,6 @@
from pathlib import Path
from torch.utils.tensorboard import SummaryWriter

from policy.utils import clip_action
from policy import agent as Agent


@@ -90,9 +89,6 @@ def main():
env.render()

action = agent.choose_action(observation)
# clip noised action to ensure not out of bounds
if args.agent in ['DDPG']:
action = clip_action(action, max_action)
next_observation, reward, done, _ = env.step(action)
score += reward

@@ -131,8 +127,8 @@ def main():
global_step=e)
actor_losses, critic_losses = [], []

if score > best_score:
best_score = score
if np.mean(score_history) > best_score:
best_score = np.mean(score_history)
agent.save_models()
tqdm.write(
f'Episode: {e + 1}/{args.n_episodes}, Score: {score}, Average Score: {np.mean(score_history)}')
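
This hunk also changes the checkpointing rule: instead of saving whenever a single episode beats the previous best score, the model is saved when the running average over score_history improves. A sketch of the pattern, assuming score_history holds a bounded window of recent episode returns (the window length is not visible in this diff):

score_history.append(score)
avg_score = np.mean(score_history)
if avg_score > best_score:
    # save only when the smoothed performance improves, which is less
    # sensitive to a single lucky episode than the raw per-episode score
    best_score = avg_score
    agent.save_models()
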
policy/utils.py (4 changes: 1 addition, 3 deletions)
@@ -1,4 +1,3 @@
import math
import numpy as np
import torch

@@ -14,8 +13,7 @@ def __init__(self, mu, sigma=0.2, theta=0.15, dt=1e-2, x0=None):
self.reset()

def __call__(self):
x = self.x_prev + self.theta * (self.mu - self.x_prev) * \
self.dt + self.sigma + math.sqrt(self.dt) * torch.randn(*self.mu.shape, device=self.device)
x = self.x_prev + self.theta * (self.mu - self.x_prev) * self.dt + self.sigma * np.sqrt(self.dt) * torch.randn(size=self.mu.shape)
self.x_prev = x
return x

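The substance of the commit is the noise update itself: the old line added self.sigma and sqrt(dt) as two separate constant terms, whereas the Euler-Maruyama discretization of an Ornstein-Uhlenbeck process scales the Gaussian sample by sigma * sqrt(dt), i.e. x_{t+1} = x_t + theta * (mu - x_t) * dt + sigma * sqrt(dt) * N(0, 1). A self-contained sketch of the corrected class follows; the reset method is not part of this diff, so its body here is an assumption:

import torch

class OUActionNoise:
    def __init__(self, mu, sigma=0.2, theta=0.15, dt=1e-2, x0=None):
        self.mu = mu
        self.sigma = sigma
        self.theta = theta
        self.dt = dt
        self.x0 = x0
        self.reset()

    def __call__(self):
        # mean-reverting drift plus a Gaussian increment scaled by sigma * sqrt(dt);
        # randn_like keeps the sample on the same device and dtype as mu
        x = (self.x_prev
             + self.theta * (self.mu - self.x_prev) * self.dt
             + self.sigma * self.dt ** 0.5 * torch.randn_like(self.mu))
        self.x_prev = x
        return x

    def reset(self):
        # assumption: restart from x0 if given, otherwise from zero
        self.x_prev = self.x0 if self.x0 is not None else torch.zeros_like(self.mu)

Using torch.randn_like(self.mu) also keeps the noise on the same device as mu, which the committed one-liner no longer specifies explicitly.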
