@@ -169,34 +169,34 @@ def predict(self, X):
169
169
return self .predict_op (X )
170
170
171
171
172
- # def play_one_td(env, pmodel, vmodel, gamma):
173
- # observation = env.reset()
174
- # done = False
175
- # totalreward = 0
176
- # iters = 0
177
-
178
- # while not done and iters < 2000:
179
- # # if we reach 2000, just quit, don't want this going forever
180
- # # the 200 limit seems a bit early
181
- # action = pmodel.sample_action(observation)
182
- # prev_observation = observation
183
- # observation, reward, done, info = env.step(action)
184
-
185
- # if done:
186
- # reward = -200
187
-
188
- # # update the models
189
- # V_next = vmodel.predict(observation)
190
- # G = reward + gamma*np.max(V_next)
191
- # advantage = G - vmodel.predict(prev_observation)
192
- # pmodel.partial_fit(prev_observation, action, advantage)
193
- # vmodel.partial_fit(prev_observation, G)
194
-
195
- # if reward == 1: # if we changed the reward to -200
196
- # totalreward += reward
197
- # iters += 1
198
-
199
- # return totalreward
172
def play_one_td(env, pmodel, vmodel, gamma, max_iters=2000, done_penalty=-200):
    """Play one episode, updating both models after every single step.

    On each step an action is sampled from ``pmodel``, the environment is
    advanced, and a one-step bootstrapped target
    ``G = reward + gamma * max(vmodel.predict(next_observation))`` is built.
    ``pmodel`` is then fitted on the advantage ``G - V(prev_observation)``
    and ``vmodel`` is fitted on ``G``.

    Args:
        env: Environment exposing ``reset()`` and ``step(action)``, where
            ``step`` returns ``(observation, reward, done, info)``.
        pmodel: Policy model exposing ``sample_action(observation)`` and
            ``partial_fit(observation, action, advantage)``.
        vmodel: Value model exposing ``predict(observation)`` and
            ``partial_fit(observation, target)``.
        gamma: Discount factor applied to the next-state value estimate.
        max_iters: Hard cap on the number of steps, so an episode cannot
            run forever. Defaults to 2000.
        done_penalty: Reward substituted for the environment's reward on
            the step where ``done`` becomes true. Defaults to -200.

    Returns:
        The sum of the rewards of every step whose (possibly overridden)
        reward equals 1; penalised steps are therefore not counted.
    """
    observation = env.reset()
    done = False
    totalreward = 0
    iters = 0

    while not done and iters < max_iters:
        action = pmodel.sample_action(observation)
        prev_observation = observation
        # The fourth element (info) is not used by this loop.
        observation, reward, done, _ = env.step(action)

        if done:
            # Replace the reward of the final step with the penalty.
            reward = done_penalty

        # Update the models from the one-step target.
        # NOTE(review): the target still bootstraps from the value estimate
        # of `observation` on the step where `done` is true -- confirm that
        # this is intended rather than using the bare reward there.
        V_next = vmodel.predict(observation)
        G = reward + gamma * np.max(V_next)
        advantage = G - vmodel.predict(prev_observation)
        pmodel.partial_fit(prev_observation, action, advantage)
        vmodel.partial_fit(prev_observation, G)

        # Only unmodified unit rewards count toward the score; a reward
        # that was overridden by the penalty above is skipped.
        if reward == 1:
            totalreward += reward
        iters += 1

    return totalreward
200
200
201
201
202
202
def play_one_mc (env , pmodel , vmodel , gamma ):
0 commit comments