@@ -123,9 +123,9 @@ def train(self, target_network):
 
     # randomly select a batch
     sample = random.sample(self.experience, self.batch_sz)
-    states, actions, rewards, next_states = map(np.array, zip(*sample))
+    states, actions, rewards, next_states, dones = map(np.array, zip(*sample))
     next_Q = np.max(target_network.predict(next_states), axis=1)
-    targets = [r + self.gamma * next_q for r, next_q in zip(rewards, next_Q)]
+    targets = [r + self.gamma * next_q if done is False else r for r, next_q, done in zip(rewards, next_Q, dones)]
 
     # call optimizer
     self.session.run(
@@ -137,12 +137,12 @@ def train(self, target_network):
       }
     )
 
-  def add_experience(self, s, a, r, s2):
+  def add_experience(self, s, a, r, s2, done):
     if len(self.experience) >= self.max_experiences:
       self.experience.pop(0)
     if len(s) != 4 or len(s2) != 4:
       print("BAD STATE")
-    self.experience.append((s, a, r, s2))
+    self.experience.append((s, a, r, s2, done))
 
   def sample_action(self, x, eps):
     if np.random.random() < eps:
@@ -192,7 +192,7 @@ def play_one(env, model, tmodel, eps, eps_step, gamma, copy_period):
 
     # update the model
     if len(state) == 4 and len(prev_state) == 4:
-      model.add_experience(prev_state, action, reward, state)
+      model.add_experience(prev_state, action, reward, state, done)
       model.train(tmodel)
 
     iters += 1
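
The change above truncates the Bellman backup at terminal states: when done is true the target is just the reward r, otherwise it is r + gamma * max_a' Q_target(s', a'). A minimal standalone sketch of that target computation, vectorized with NumPy instead of the list comprehension in the diff (function and array names are illustrative, not taken from the repository):

import numpy as np

def compute_targets(rewards, next_Q, dones, gamma):
    # next_Q holds max_a' Q_target(s', a') for each sampled transition;
    # terminal transitions bootstrap nothing, so their target is just the reward
    return np.where(dones, rewards, rewards + gamma * next_Q)

# example: two transitions, the second one terminal
targets = compute_targets(
    np.array([1.0, 1.0]),     # rewards
    np.array([5.0, 5.0]),     # max target-network Q at next state
    np.array([False, True]),  # done flags
    gamma=0.99,
)
# -> [5.95, 1.0]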