
Commit 42358e9

update
1 parent c61ac1d commit 42358e9

2 files changed: +30 -30 lines changed

rl2/cartpole/pg_tf.py

Lines changed: 2 additions & 2 deletions
@@ -169,8 +169,8 @@ def play_one_td(env, pmodel, vmodel, gamma):
       # reward = -200

     # update the models
-    V_next = vmodel.predict(observation)
-    G = reward + gamma*np.max(V_next)
+    V_next = vmodel.predict(observation)[0]
+    G = reward + gamma*V_next
     advantage = G - vmodel.predict(prev_observation)
     pmodel.partial_fit(prev_observation, action, advantage)
     vmodel.partial_fit(prev_observation, G)
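For context: this change swaps the Q-learning-style np.max over the value model's output for the scalar state value V(s'), taken as element 0 of the prediction. A minimal sketch of the resulting TD(0) target, assuming vmodel.predict returns a length-1 array; the td_target helper below is illustrative and not part of the commit:

    import numpy as np

    def td_target(reward, gamma, v_next_pred):
        # v_next_pred is assumed to be an array like [V(s')], so index 0
        # extracts the scalar value; no np.max is needed because a state-value
        # model outputs one number per state (unlike per-action Q-values)
        V_next = v_next_pred[0]
        return reward + gamma * V_next

    # example: reward 1.0, gamma 0.99, predicted next-state value 12.3
    print(td_target(1.0, 0.99, np.array([12.3])))  # -> approximately 13.177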

rl2/cartpole/pg_theano.py

Lines changed: 28 additions & 28 deletions
@@ -169,34 +169,34 @@ def predict(self, X):
     return self.predict_op(X)


-# def play_one_td(env, pmodel, vmodel, gamma):
-#   observation = env.reset()
-#   done = False
-#   totalreward = 0
-#   iters = 0
-
-#   while not done and iters < 2000:
-#     # if we reach 2000, just quit, don't want this going forever
-#     # the 200 limit seems a bit early
-#     action = pmodel.sample_action(observation)
-#     prev_observation = observation
-#     observation, reward, done, info = env.step(action)
-
-#     if done:
-#       reward = -200
-
-#     # update the models
-#     V_next = vmodel.predict(observation)
-#     G = reward + gamma*np.max(V_next)
-#     advantage = G - vmodel.predict(prev_observation)
-#     pmodel.partial_fit(prev_observation, action, advantage)
-#     vmodel.partial_fit(prev_observation, G)
-
-#     if reward == 1: # if we changed the reward to -200
-#       totalreward += reward
-#       iters += 1
-
-#   return totalreward
+def play_one_td(env, pmodel, vmodel, gamma):
+  observation = env.reset()
+  done = False
+  totalreward = 0
+  iters = 0
+
+  while not done and iters < 2000:
+    # if we reach 2000, just quit, don't want this going forever
+    # the 200 limit seems a bit early
+    action = pmodel.sample_action(observation)
+    prev_observation = observation
+    observation, reward, done, info = env.step(action)
+
+    if done:
+      reward = -200
+
+    # update the models
+    V_next = vmodel.predict(observation)
+    G = reward + gamma*np.max(V_next)
+    advantage = G - vmodel.predict(prev_observation)
+    pmodel.partial_fit(prev_observation, action, advantage)
+    vmodel.partial_fit(prev_observation, G)
+
+    if reward == 1: # if we changed the reward to -200
+      totalreward += reward
+      iters += 1
+
+  return totalreward


 def play_one_mc(env, pmodel, vmodel, gamma):
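The pg_theano.py change re-enables play_one_td, which had been commented out; note its TD target still uses np.max(V_next), unlike the pg_tf.py edit above. A minimal sketch of how the function might be driven from a training loop; the train_td driver below is illustrative, assuming pmodel and vmodel are the policy and value model objects constructed elsewhere in pg_theano.py and that the pre-0.26 Gym step API (four return values) is in use:

    import numpy as np

    def train_td(env, pmodel, vmodel, gamma=0.99, n_episodes=1000):
        # hypothetical driver for the re-enabled play_one_td (sketch only)
        totalrewards = np.empty(n_episodes)
        for n in range(n_episodes):
            # one TD(0) actor-critic episode: sample actions, step the env,
            # and update both models from the one-step return G
            totalrewards[n] = play_one_td(env, pmodel, vmodel, gamma)
            if n % 100 == 0:
                # running average over the most recent (up to) 100 episodes
                avg = totalrewards[max(0, n - 99):n + 1].mean()
                print("episode:", n, "avg reward:", avg)
        return totalrewards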
