
Commit 3bfca8b

Feat: TNPG use memory
1 parent a621ab7 commit 3bfca8b

4 files changed: +37 additions, -20 deletions

PG/5-TNPG/config.py

Lines changed: 1 addition & 1 deletion
@@ -2,7 +2,7 @@
 
 env_name = 'CartPole-v1'
 gamma = 0.99
-lr = 0.01
+lr = 0.001
 goal_score = 200
 log_interval = 10
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

PG/5-TNPG/memory.py

Lines changed: 18 additions & 0 deletions
@@ -0,0 +1,18 @@
+import random
+from collections import namedtuple, deque
+
+Transition = namedtuple('Transition', ('state', 'next_state', 'action', 'reward', 'mask'))
+
+class Memory(object):
+    def __init__(self):
+        self.memory = deque()
+
+    def push(self, state, next_state, action, reward, mask):
+        self.memory.append(Transition(state, next_state, action, reward, mask))
+
+    def sample(self):
+        memory = self.memory
+        return Transition(*zip(*memory))
+
+    def __len__(self):
+        return len(self.memory)
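
A minimal usage sketch of the new Memory class; the tensor shapes and reward/mask values below are made up for illustration and are not part of the commit:

import torch
from memory import Memory

memory = Memory()
for step in range(3):
    state = torch.randn(1, 4)            # CartPole-sized observation (illustrative)
    next_state = torch.randn(1, 4)
    action_one_hot = torch.zeros(2)
    action_one_hot[step % 2] = 1
    reward = 1.0
    mask = 0 if step == 2 else 1         # 0 marks the terminal transition
    memory.push(state, next_state, action_one_hot, reward, mask)

batch = memory.sample()                  # one Transition holding a tuple per field
print(len(memory))                       # 3
print(len(batch.state), batch.reward)    # 3 (1.0, 1.0, 1.0)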

PG/5-TNPG/model.py

Lines changed: 13 additions & 5 deletions
@@ -98,17 +98,25 @@ def forward(self, input):
 
     @classmethod
     def train_model(cls, net, transitions, k):
-        states, actions, rewards, masks = transitions
+        states, actions, rewards, masks = transitions.state, transitions.action, transitions.reward, transitions.mask
+
         states = torch.stack(states)
         actions = torch.stack(actions)
         rewards = torch.Tensor(rewards)
         masks = torch.Tensor(masks)
 
-        policy = net(states)
-        policy = policy.view(-1, net.num_outputs)
-        policy_action = (policy * actions.detach()).sum(dim=1)
+        returns = torch.zeros_like(rewards)
+
+        running_return = 0
+        for t in reversed(range(len(rewards))):
+            running_return = rewards[t] + gamma * running_return * masks[t]
+            returns[t] = running_return
+
+        policies = net(states)
+        policies = policies.view(-1, net.num_outputs)
+        policy_actions = (policies * actions.detach()).sum(dim=1)
 
-        loss = (policy_action * rewards).mean()
+        loss = (policy_actions * returns).mean()
 
         loss_grad = torch.autograd.grad(loss, net.parameters())
         loss_grad = flat_grad(loss_grad)
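
The added loop is a standard backward recursion for discounted returns, with mask zeroing the running sum at episode boundaries; note that train_model now uses gamma, which model.py must already have in scope (e.g. imported from config), since this hunk does not add it. A standalone sketch of the same recursion, with illustrative reward/mask values:

import torch

gamma = 0.99
rewards = torch.Tensor([1.0, 1.0, 1.0])
masks = torch.Tensor([1, 1, 0])          # 0 = terminal step

returns = torch.zeros_like(rewards)
running_return = 0
for t in reversed(range(len(rewards))):
    running_return = rewards[t] + gamma * running_return * masks[t]
    returns[t] = running_return

print(returns)                           # tensor([2.9701, 1.9900, 1.0000])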

PG/5-TNPG/train.py

Lines changed: 5 additions & 14 deletions
@@ -10,6 +10,7 @@
 from model import QNet
 from tensorboardX import SummaryWriter
 
+from memory import Memory
 from config import env_name, goal_score, log_interval, device, lr, gamma
 
 
@@ -24,7 +25,6 @@ def main():
     print('action size:', num_actions)
 
     net = QNet(num_inputs, num_actions)
-
     writer = SummaryWriter('logs')
 
     net.to(device)
@@ -33,9 +33,9 @@ def main():
     steps = 0
     loss = 0
     k=0
-    for e in range(3000):
+    for e in range(30000):
         done = False
-        memory = []
+        memory = Memory()
 
         score = 0
         state = env.reset()
@@ -56,22 +56,13 @@ def main():
 
             action_one_hot = torch.zeros(2)
             action_one_hot[action] = 1
-            memory.append([state, next_state, action_one_hot, reward, mask])
+            memory.push(state, next_state, action_one_hot, reward, mask)
 
             score += reward
             state = next_state
 
         sum_reward = 0
-        memory.reverse()
-        states, actions, rewards, masks = [], [], [], []
-        for t, transition in enumerate(memory):
-            state, next_state, action, reward, mask = transition
-            sum_reward = (reward + gamma * sum_reward)
-            states.append(state)
-            actions.append(action)
-            rewards.append(sum_reward)
-            masks.append(mask)
-        loss = QNet.train_model(net, (states, actions, rewards, masks), k)
+        loss = QNet.train_model(net, memory.sample(), k)
         k+=1
 
         score = score if score == 500.0 else score + 1
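
Putting the pieces together, the per-episode flow now looks roughly like the sketch below. It assumes the older gym API (env.reset() returning an observation, env.step() returning four values) and replaces the repo's actual action selection with random sampling; only Memory, QNet, and the train_model call reflect this commit:

import gym
import torch
from memory import Memory
from model import QNet

env = gym.make('CartPole-v1')
net = QNet(env.observation_space.shape[0], env.action_space.n)

for e in range(3):                                   # tiny loop for illustration
    memory = Memory()
    state = torch.Tensor(env.reset()).unsqueeze(0)
    done = False
    while not done:
        action = env.action_space.sample()           # stand-in for the policy
        next_state, reward, done, _ = env.step(action)
        next_state = torch.Tensor(next_state).unsqueeze(0)

        action_one_hot = torch.zeros(2)
        action_one_hot[action] = 1
        mask = 0 if done else 1

        memory.push(state, next_state, action_one_hot, reward, mask)
        state = next_state

    # train_model now receives the whole episode as one Transition batch
    loss = QNet.train_model(net, memory.sample(), e)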
