monte_carlo_random.py
# https://deeplearningcourses.com/c/artificial-intelligence-reinforcement-learning-in-python
# https://www.udemy.com/artificial-intelligence-reinforcement-learning-in-python
import numpy as np
from grid_world import standard_grid, negative_grid
from iterative_policy_evaluation import print_values, print_policy

SMALL_ENOUGH = 10e-4 # convergence threshold (not actually used in this script)
GAMMA = 0.9
ALL_POSSIBLE_ACTIONS = ('U', 'D', 'L', 'R')

# NOTE: this is only policy evaluation, not optimization

def random_action(a):
  # choose given a with probability 0.5
  # choose some other a' != a with probability 0.5/3
  p = np.random.random()
  if p < 0.5:
    return a
  else:
    tmp = list(ALL_POSSIBLE_ACTIONS)
    tmp.remove(a)
    return np.random.choice(tmp)
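
# The function above turns the deterministic policy into a "windy" / stochastic one:
# the intended action is taken with probability 0.5 and each of the other three
# actions with probability 0.5/3 ~ 0.167. A quick sanity check (a sketch, not part
# of the original script) could tally empirical frequencies:
#
#   from collections import Counter
#   print(Counter(random_action('U') for _ in range(10000)))
#   # roughly Counter({'U': ~5000, 'D': ~1670, 'L': ~1670, 'R': ~1670})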

def play_game(grid, policy):
  # returns a list of states and corresponding returns

  # reset game to start at a random position
  # we need to do this, because given our current deterministic policy
  # we would never end up at certain states, but we still want to measure their value
  start_states = list(grid.actions.keys())
  start_idx = np.random.choice(len(start_states))
  grid.set_state(start_states[start_idx])

  s = grid.current_state()
  states_and_rewards = [(s, 0)] # list of tuples of (state, reward)
  while not grid.game_over():
    a = policy[s]
    a = random_action(a)
    r = grid.move(a)
    s = grid.current_state()
    states_and_rewards.append((s, r))

  # calculate the returns by working backwards from the terminal state
  G = 0
  states_and_returns = []
  first = True
  for s, r in reversed(states_and_rewards):
    # the value of the terminal state is 0 by definition
    # we should ignore the first state we encounter
    # and ignore the last G, which is meaningless since it doesn't correspond to any move
    if first:
      first = False
    else:
      states_and_returns.append((s, G))
    G = r + GAMMA*G
  states_and_returns.reverse() # we want it to be in order of states visited
  return states_and_returns
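
# The backwards loop in play_game implements the return recursion
#   G_t = r_{t+1} + GAMMA * G_{t+1}
# A small worked example (illustrative only, with made-up states s0, s1, s2
# and GAMMA = 0.9):
#   states_and_rewards = [(s0, 0), (s1, 0), (s2, 1)]   # s2 is terminal
#   reversed pass:
#     (s2, 1): skipped (terminal state), G = 1 + 0.9*0   = 1.0
#     (s1, 0): append (s1, 1.0),         G = 0 + 0.9*1.0 = 0.9
#     (s0, 0): append (s0, 0.9),         G = 0 + 0.9*0.9 = 0.81 (never used)
#   after reverse(): [(s0, 0.9), (s1, 1.0)]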

if __name__ == '__main__':
  # use the standard grid again (0 for every step) so that we can compare
  # to iterative policy evaluation
  grid = standard_grid()

  # print rewards
  print("rewards:")
  print_values(grid.rewards, grid)

  # state -> action
  # found by policy_iteration_random on standard_grid
  # MC method won't get exactly this, but should be close
  # values:
  # ---------------------------
  #  0.43|  0.56|  0.72|  0.00|
  # ---------------------------
  #  0.33|  0.00|  0.21|  0.00|
  # ---------------------------
  #  0.25|  0.18|  0.11| -0.17|
  # policy:
  # ---------------------------
  #   R  |   R  |   R  |      |
  # ---------------------------
  #   U  |      |   U  |      |
  # ---------------------------
  #   U  |   L  |   U  |   L  |
  policy = {
    (2, 0): 'U',
    (1, 0): 'U',
    (0, 0): 'R',
    (0, 1): 'R',
    (0, 2): 'R',
    (1, 2): 'U',
    (2, 1): 'L',
    (2, 2): 'U',
    (2, 3): 'L',
  }

  # initialize V(s) and returns
  V = {}
  returns = {} # dictionary of state -> list of returns we've received
  states = grid.all_states()
  for s in states:
    if s in grid.actions:
      returns[s] = []
    else:
      # terminal state or state we can't otherwise get to
      V[s] = 0
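
  # At this point, e.g. on the standard grid (assuming the 3x4 layout from
  # grid_world.py with terminal states (0, 3) and (1, 3)):
  #   returns holds an empty list for every non-terminal state such as (2, 0)
  #   V holds V[(0, 3)] = 0 and V[(1, 3)] = 0 for the terminal states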

  # run a fixed number of episodes (no explicit convergence check)
  for t in range(5000):

    # generate an episode using pi
    states_and_returns = play_game(grid, policy)
    seen_states = set()
    for s, G in states_and_returns:
      # check if we have already seen s
      # called "first-visit" MC policy evaluation
      if s not in seen_states:
        returns[s].append(G)
        V[s] = np.mean(returns[s])
        seen_states.add(s)
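
  # Note: V[s] = np.mean(returns[s]) recomputes the sample mean from the full
  # list of returns on every visit. An equivalent incremental update (a common
  # alternative, not used here) keeps a count N[s] and does
  #   V[s] = V[s] + (1.0 / N[s]) * (G - V[s])
  # which avoids storing every return.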
print "values:"
print_values(V, grid)
print "policy:"
print_policy(policy, grid)