Commit b239d19

update
1 parent e68e6a2 commit b239d19


rl/tic_tac_toe.py

Lines changed: 19 additions & 11 deletions
@@ -100,10 +100,12 @@ def take_action(self, env):
     # make the move
     env.board[next_move[0], next_move[1]] = self.sym

-    # update state history
-    if best_state is None:
-      best_state = env.get_state()
-    self.state_history.append(best_state)
+  def update_state_history(self, s):
+    # cannot put this in take_action, because take_action only happens
+    # once every other iteration for each player
+    # state history needs to be updated every iteration
+    # s = env.get_state() # don't want to do this twice so pass it in
+    self.state_history.append(s)

   def update(self, env):
     # update value function based on the reward just received and the most recent
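
Note on the method this hunk leads into: the diff shows only the first line of update(self, env). For orientation, here is a minimal sketch of the episode-end value backup such a tabular agent typically performs, assuming the agent stores a value table self.V, a learning rate self.alpha, and that env.reward(sym) returns the final reward for that symbol (all three names are assumptions, not shown in this diff):

def update(self, env):
  # run once per episode: walk the recorded states backwards,
  # pulling each state's value toward the value of its successor
  reward = env.reward(self.sym)   # assumed helper: final reward for this player
  target = reward
  for prev in reversed(self.state_history):
    value = self.V[prev] + self.alpha * (target - self.V[prev])
    self.V[prev] = value
    target = value
  self.state_history = []         # reset for the next episode

This is also why update_state_history runs every iteration while the backup runs only at game end: the backup needs the full trajectory of visited states.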
@@ -267,10 +269,15 @@ def take_action(self, env):
   def update(self, env):
     pass

+  def update_state_history(self, s):
+    pass
+

 # recursive function that will return all
 # possible states (as ints) and who the corresponding winner is for those states (if any)
 # (i, j) refers to the next cell on the board to permute (we need to try -1, 0, 1)
+# impossible games are ignored, i.e. 3x's and 3o's in a row simultaneously
+# since that will never happen in a real game
 def get_state_hash_and_winner(env, i=0, j=0):
   results = []

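The comment block above describes the enumeration: try every value in cell (i, j), recurse to the next cell, and record the hash and winner once the last cell is filled. A sketch of how such a recursion is commonly written, consistent with those comments (env.get_state(), env.game_over(force_recalculate=True), and env.winner are assumed helpers; the actual body is not part of this diff):

def get_state_hash_and_winner(env, i=0, j=0):
  results = []
  for v in (0, env.x, env.o):   # the three cell values: empty, x, o (-1, 0, 1)
    env.board[i, j] = v
    if j == 2:
      if i == 2:
        # last cell reached: record this full board's hash and outcome
        state = env.get_state()
        ended = env.game_over(force_recalculate=True)  # assumed keyword
        results.append((state, env.winner, ended))
      else:
        # wrap to the first column of the next row
        results += get_state_hash_and_winner(env, i + 1, 0)
    else:
      # advance to the next column in the same row
      results += get_state_hash_and_winner(env, i, j + 1)
  return results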
@@ -386,15 +393,18 @@ def play_game(p1, p2, env, draw=False):
     # current player makes a move
     current_player.take_action(env)

+    # update state histories
+    state = env.get_state()
+    p1.update_state_history(state)
+    p2.update_state_history(state)
+
   if draw:
     env.draw_board()

   # do the value function update
   p1.update(env)
   p2.update(env)

-  # TODO: return useful stats
-

 if __name__ == '__main__':
   # train the agent
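
One detail worth calling out in this hunk: env.get_state() is called once and the same hash is handed to both players, matching the earlier comment "don't want to do this twice so pass it in". The hash itself is not shown in this diff; a plausible sketch, treating the 3x3 board as a base-3 number so every state maps to an int in [0, 3**9) (LENGTH and the digit encoding are assumptions):

def get_state(self):
  # read the board cells as digits of a base-3 number:
  # 0 = empty, 1 = x, 2 = o
  k = 0
  h = 0
  for i in range(LENGTH):    # LENGTH = 3, assumed module constant
    for j in range(LENGTH):
      if self.board[i, j] == 0:
        v = 0
      elif self.board[i, j] == self.x:
        v = 1
      else:
        v = 2
      h += (3 ** k) * v
      k += 1
  return h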
@@ -415,13 +425,11 @@ def play_game(p1, p2, env, draw=False):
   p1.set_symbol(env.x)
   p2.set_symbol(env.o)

-  for t in xrange(10000):
+  T = 10000
+  for t in xrange(T):
     if t % 200 == 0:
       print t
-    play_game(p1, p2, Environment())
-
-  # TODO: plot things to help us understand how well the agent has learned
-
+    winner = play_game(p1, p2, Environment())

   # play human vs. agent
   # do you think the agent learned to play the game well?
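
The trailing comments point at an interactive loop that this hunk stops short of. A minimal sketch of that pattern, assuming the pass-through player patched in the second hunk is a Human class and reusing play_game as shown above (raw_input matches the script's Python 2 style; draw=True is an assumption about the flag):

  # play human vs. agent until the user quits
  human = Human()
  human.set_symbol(env.o)
  while True:
    play_game(p1, human, Environment(), draw=True)
    answer = raw_input("Play again? [Y/n]: ")
    if answer and answer.lower()[0] == 'n':
      break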
