@@ -100,10 +100,12 @@ def take_action(self, env):
     # make the move
     env.board[next_move[0], next_move[1]] = self.sym
 
-    # update state history
-    if best_state is None:
-      best_state = env.get_state()
-    self.state_history.append(best_state)
+  def update_state_history(self, s):
+    # cannot put this in take_action, because take_action only happens
+    # once every other iteration for each player
+    # state history needs to be updated every iteration
+    # s = env.get_state() # don't want to do this twice so pass it in
+    self.state_history.append(s)
 
   def update(self, env):
     # update value function based on the reward just received and the most recent
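
The body of update() falls outside this hunk, but the comment describes a backward, temporal-difference style sweep over the stored state history. A minimal sketch of what that might look like, assuming the agent keeps a value table self.V indexed by state hash, a learning rate self.alpha, and that env.reward(sym) returns this player's reward for the finished game (these names are assumptions, not shown in this diff):

  def update(self, env):
    # work backwards through the episode: the terminal reward is the first
    # target, and each earlier state's target is the updated value of the
    # state that followed it, i.e. V(s) <- V(s) + alpha * (V(s') - V(s))
    reward = env.reward(self.sym)   # assumed Environment API
    target = reward
    for prev in reversed(self.state_history):
      value = self.V[prev] + self.alpha * (target - self.V[prev])
      self.V[prev] = value
      target = value
    self.state_history = []  # the episode is over, so clear the history
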
@@ -267,10 +269,15 @@ def take_action(self, env):
   def update(self, env):
     pass
 
+  def update_state_history(self, s):
+    pass
+
 
 # recursive function that will return all
 # possible states (as ints) and who the corresponding winner is for those states (if any)
 # (i, j) refers to the next cell on the board to permute (we need to try -1, 0, 1)
+# impossible games are ignored, i.e. 3x's and 3o's in a row simultaneously
+# since that will never happen in a real game
 def get_state_hash_and_winner(env, i=0, j=0):
   results = []
 
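
The rest of get_state_hash_and_winner sits outside the diff; per the comments, it tries each possible value in cell (i, j) and recurses to the next cell, collecting every fully specified board. A rough sketch of that recursion, assuming env.x and env.o hold the two cell values (-1 and 1), that env.board is indexable as board[i, j], and that the Environment exposes get_state(), game_over() and a winner attribute (assumed names, not confirmed by this hunk); filtering of impossible boards is assumed to happen inside the winner / game-over check:

def get_state_hash_and_winner(env, i=0, j=0):
  results = []
  for v in (0, env.x, env.o):  # try empty, x, and o in cell (i, j)
    env.board[i, j] = v
    if j == 2:
      if i == 2:
        # the board is fully specified: record (state hash, winner, ended)
        state = env.get_state()
        ended = env.game_over()  # assumed to re-check the current board
        winner = env.winner
        results.append((state, winner, ended))
      else:
        # end of this row, move down to the next row
        results += get_state_hash_and_winner(env, i + 1, 0)
    else:
      # move to the next column in the same row
      results += get_state_hash_and_winner(env, i, j + 1)
  return results
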
@@ -386,15 +393,18 @@ def play_game(p1, p2, env, draw=False):
     # current player makes a move
     current_player.take_action(env)
 
+    # update state histories
+    state = env.get_state()
+    p1.update_state_history(state)
+    p2.update_state_history(state)
+
     if draw:
       env.draw_board()
 
   # do the value function update
   p1.update(env)
   p2.update(env)
 
-  # TODO: return useful stats
-
 
 if __name__ == '__main__':
   # train the agent
@@ -415,13 +425,11 @@ def play_game(p1, p2, env, draw=False):
   p1.set_symbol(env.x)
   p2.set_symbol(env.o)
 
-  for t in xrange(10000):
+  T = 10000
+  for t in xrange(T):
     if t % 200 == 0:
       print t
-    play_game(p1, p2, Environment())
-
-    # TODO: plot things to help us understand how well the agent has learned
-
+    winner = play_game(p1, p2, Environment())
 
 
   # play human vs. agent
   # do you think the agent learned to play the game well?
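
The closing comments point to the next step: playing against the trained agent. A minimal sketch of that loop, assuming a Human class with the same take_action / update / update_state_history interface as the agent (its two update methods are no-ops, like the pass stubs in an earlier hunk); the class itself is not part of this diff:

  human = Human()
  human.set_symbol(env.o)
  while True:
    # draw=True so the human can see the board after each move
    play_game(p1, human, Environment(), draw=True)
    answer = raw_input("Play again? [Y/n]: ")
    if answer and answer.lower()[0] == 'n':
      break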