 from config import env_name, initial_exploration, batch_size, update_target, goal_score, log_interval, device, replay_memory_capacity, lr


+def get_action(state, target_net, epsilon, env):
+    if np.random.rand() <= epsilon:
+        return env.action_space.sample()
+    else:
+        return target_net.get_action(state)


 def update_target_model(online_net, target_net):
     # Target <- Net
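The get_action helper added above is epsilon-greedy action selection: with probability epsilon it samples a random action from the environment's action space, and otherwise it takes the action the target network currently rates best, so on average a fraction epsilon of the actions are exploratory. A minimal standalone sketch of the same rule, where num_actions and greedy_action are hypothetical stand-ins rather than names from this repo:

    import numpy as np

    def epsilon_greedy(greedy_action, epsilon, num_actions):
        # Explore: with probability epsilon, pick an action uniformly at random.
        if np.random.rand() <= epsilon:
            return np.random.randint(num_actions)
        # Exploit: otherwise keep the current greedy choice.
        return greedy_action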
@@ -43,6 +48,7 @@ def main():
     target_net.train()
     memory = Memory(replay_memory_capacity)
     running_score = 0
+    epsilon = 1.0
     steps = 0
     loss = 0

@@ -55,10 +61,9 @@
         state = state.unsqueeze(0)

         while not done:
-
             steps += 1

-            action = target_net.get_action(state)
+            action = get_action(state, target_net, epsilon, env)
             next_state, reward, done, _ = env.step(action)

             next_state = torch.Tensor(next_state)
@@ -74,6 +79,8 @@
             state = next_state

             if steps > initial_exploration:
+                epsilon -= 0.00005
+                epsilon = max(epsilon, 0.1)

                 batch = memory.sample(batch_size)
                 loss = QNet.train_model(online_net, target_net, optimizer, batch)
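Once steps exceeds initial_exploration, the two lines added above decay epsilon linearly by 0.00005 per step and clamp it at 0.1, so exploration falls from 1.0 to the 0.1 floor over (1.0 - 0.1) / 0.00005 = 18,000 decayed steps and then stays there. A quick standalone check of that schedule, independent of the training loop:

    epsilon = 1.0
    for step in range(1, 25001):
        epsilon = max(epsilon - 0.00005, 0.1)
        if epsilon <= 0.1:
            print('epsilon reaches its 0.1 floor after', step, 'decay steps')  # about 18,000
            break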
@@ -84,8 +91,8 @@
         score = score if score == 500.0 else score + 1
         running_score = 0.99 * running_score + 0.01 * score
         if e % log_interval == 0:
-            print('{} episode | score: {:.2f}'.format(
-                e, running_score))
+            print('{} episode | score: {:.2f} | epsilon: {:.2f}'.format(
+                e, running_score, epsilon))
             writer.add_scalar('log/score', float(running_score), e)
             writer.add_scalar('log/loss', float(loss), e)
