
Commit 42358e9

update
1 parent c61ac1d commit 42358e9

2 files changed: +30 -30 lines changed

rl2/cartpole/pg_tf.py

Lines changed: 2 additions & 2 deletions
@@ -169,8 +169,8 @@ def play_one_td(env, pmodel, vmodel, gamma):
       # reward = -200

     # update the models
-    V_next = vmodel.predict(observation)
-    G = reward + gamma*np.max(V_next)
+    V_next = vmodel.predict(observation)[0]
+    G = reward + gamma*V_next
     advantage = G - vmodel.predict(prev_observation)
     pmodel.partial_fit(prev_observation, action, advantage)
     vmodel.partial_fit(prev_observation, G)
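For context: this change swaps the Q-learning-style np.max over the value model's output for the scalar state value V(s'), taken as element 0 of the prediction. A minimal sketch of the resulting TD(0) target, assuming vmodel.predict returns a length-1 array; the td_target helper below is illustrative and not part of the commit:

    import numpy as np

    def td_target(reward, gamma, v_next_pred):
        # v_next_pred is assumed to be an array like [V(s')], so index 0
        # extracts the scalar value; no np.max is needed because a state-value
        # model outputs one number per state (unlike per-action Q-values)
        V_next = v_next_pred[0]
        return reward + gamma * V_next

    # example: reward 1.0, gamma 0.99, predicted next-state value 12.3
    print(td_target(1.0, 0.99, np.array([12.3])))  # -> approximately 13.177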

rl2/cartpole/pg_theano.py

Lines changed: 28 additions & 28 deletions
@@ -169,34 +169,34 @@ def predict(self, X):
     return self.predict_op(X)


-# def play_one_td(env, pmodel, vmodel, gamma):
-#   observation = env.reset()
-#   done = False
-#   totalreward = 0
-#   iters = 0
-
-#   while not done and iters < 2000:
-#     # if we reach 2000, just quit, don't want this going forever
-#     # the 200 limit seems a bit early
-#     action = pmodel.sample_action(observation)
-#     prev_observation = observation
-#     observation, reward, done, info = env.step(action)
-
-#     if done:
-#       reward = -200
-
-#     # update the models
-#     V_next = vmodel.predict(observation)
-#     G = reward + gamma*np.max(V_next)
-#     advantage = G - vmodel.predict(prev_observation)
-#     pmodel.partial_fit(prev_observation, action, advantage)
-#     vmodel.partial_fit(prev_observation, G)
-
-#     if reward == 1: # if we changed the reward to -200
-#       totalreward += reward
-#       iters += 1
-
-#   return totalreward
+def play_one_td(env, pmodel, vmodel, gamma):
+  observation = env.reset()
+  done = False
+  totalreward = 0
+  iters = 0
+
+  while not done and iters < 2000:
+    # if we reach 2000, just quit, don't want this going forever
+    # the 200 limit seems a bit early
+    action = pmodel.sample_action(observation)
+    prev_observation = observation
+    observation, reward, done, info = env.step(action)
+
+    if done:
+      reward = -200
+
+    # update the models
+    V_next = vmodel.predict(observation)
+    G = reward + gamma*np.max(V_next)
+    advantage = G - vmodel.predict(prev_observation)
+    pmodel.partial_fit(prev_observation, action, advantage)
+    vmodel.partial_fit(prev_observation, G)
+
+    if reward == 1: # if we changed the reward to -200
+      totalreward += reward
+      iters += 1
+
+  return totalreward


 def play_one_mc(env, pmodel, vmodel, gamma):
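The pg_theano.py change re-enables play_one_td, which had been commented out; note its TD target still uses np.max(V_next), unlike the pg_tf.py edit above. A minimal sketch of how the function might be driven from a training loop; the train_td driver below is illustrative, assuming pmodel and vmodel are the policy and value model objects constructed elsewhere in pg_theano.py and that the pre-0.26 Gym step API (four return values) is in use:

    import numpy as np

    def train_td(env, pmodel, vmodel, gamma=0.99, n_episodes=1000):
        # hypothetical driver for the re-enabled play_one_td (sketch only)
        totalrewards = np.empty(n_episodes)
        for n in range(n_episodes):
            # one TD(0) actor-critic episode: sample actions, step the env,
            # and update both models from the one-step return G
            totalrewards[n] = play_one_td(env, pmodel, vmodel, gamma)
            if n % 100 == 0:
                # running average over the most recent (up to) 100 episodes
                avg = totalrewards[max(0, n - 99):n + 1].mean()
                print("episode:", n, "avg reward:", avg)
        return totalrewards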
