
Commit 646df00

add course urls and rl files
1 parent 33fccb3 commit 646df00

20 files changed: +1664 additions, -12 deletions

rl/approx_mc_prediction.py

Lines changed: 99 additions & 0 deletions
@@ -0,0 +1,99 @@
# https://www.udemy.com/artificial-intelligence-reinforcement-learning-in-python
import numpy as np
import matplotlib.pyplot as plt
from grid_world import standard_grid, negative_grid
from iterative_policy_evaluation import print_values, print_policy

# NOTE: this is only policy evaluation, not optimization

# we'll try to obtain the same result as our other MC script
from monte_carlo_random import random_action, play_game, SMALL_ENOUGH, GAMMA, ALL_POSSIBLE_ACTIONS

LEARNING_RATE = 0.001

if __name__ == '__main__':
  # use the standard grid again (0 for every step) so that we can compare
  # to iterative policy evaluation
  grid = standard_grid()

  # print rewards
  print "rewards:"
  print_values(grid.rewards, grid)

  # state -> action
  # found by policy_iteration_random on standard_grid
  # MC method won't get exactly this, but should be close
  # values:
  # ---------------------------
  #  0.43|  0.56|  0.72|  0.00|
  # ---------------------------
  #  0.33|  0.00|  0.21|  0.00|
  # ---------------------------
  #  0.25|  0.18|  0.11| -0.17|
  # policy:
  # ---------------------------
  #   R  |   R  |   R  |      |
  # ---------------------------
  #   U  |      |   U  |      |
  # ---------------------------
  #   U  |   L  |   U  |   L  |
  policy = {
    (2, 0): 'U',
    (1, 0): 'U',
    (0, 0): 'R',
    (0, 1): 'R',
    (0, 2): 'R',
    (1, 2): 'U',
    (2, 1): 'L',
    (2, 2): 'U',
    (2, 3): 'L',
  }

  # initialize theta
  # our model is V_hat = theta.dot(x)
  # where x = [row, col, row*col, 1] - 1 for bias term
  theta = np.random.randn(4) / 2
  def s2x(s):
    return np.array([s[0] - 1, s[1] - 1.5, s[0]*s[1] - 3, 1])

  # repeat until convergence
  deltas = []
  t = 1.0
  for it in xrange(20000):
    if it % 100 == 0:
      t += 0.01
    alpha = LEARNING_RATE/t
    # generate an episode using pi
    biggest_change = 0
    states_and_returns = play_game(grid, policy)
    seen_states = set()
    for s, G in states_and_returns:
      # check if we have already seen s
      # called "first-visit" MC policy evaluation
      if s not in seen_states:
        old_theta = theta.copy()
        x = s2x(s)
        V_hat = theta.dot(x)
        # grad(V_hat) wrt theta = x
        theta += alpha*(G - V_hat)*x
        biggest_change = max(biggest_change, np.abs(old_theta - theta).sum())
        seen_states.add(s)
    deltas.append(biggest_change)

  plt.plot(deltas)
  plt.show()

  # obtain predicted values
  V = {}
  states = grid.all_states()
  for s in states:
    if s in grid.actions:
      V[s] = theta.dot(s2x(s))
    else:
      # terminal state or state we can't otherwise get to
      V[s] = 0

  print "values:"
  print_values(V, grid)
  print "policy:"
  print_policy(policy, grid)
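
For readers who only have this diff, the course helpers grid_world and monte_carlo_random are not part of this commit. A minimal, self-contained sketch of the same first-visit semi-gradient MC update, using an invented episode and the same hand-crafted features (everything below is illustrative, not the course's code), could look like this:

import numpy as np

GAMMA = 0.9
ALPHA = 0.01

def features(s):
  # same style of features as s2x above: [row, col, row*col, bias]
  return np.array([s[0] - 1, s[1] - 1.5, s[0]*s[1] - 3, 1.0])

theta = np.zeros(4)

# one invented episode as (state, reward) pairs; the reward is received on
# arriving in that state, and the last tuple is the terminal state
episode = [((2, 0), 0), ((1, 0), 0), ((0, 0), 0), ((0, 1), 0), ((0, 2), 0), ((0, 3), 1)]

# compute the return G for each visited state by working backwards;
# the terminal state's value is 0 by definition, so it is skipped
G = 0
states_and_returns = []
first = True
for s, r in reversed(episode):
  if first:
    first = False
  else:
    states_and_returns.append((s, G))
  G = r + GAMMA*G
states_and_returns.reverse()

# first-visit MC: update theta at most once per state per episode
seen = set()
for s, G in states_and_returns:
  if s not in seen:
    x = features(s)
    theta += ALPHA*(G - theta.dot(x))*x  # grad of theta.dot(x) wrt theta is x
    seen.add(s)

print(theta)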
Lines changed: 183 additions & 0 deletions
@@ -0,0 +1,183 @@
# https://www.udemy.com/artificial-intelligence-reinforcement-learning-in-python
import numpy as np
import matplotlib.pyplot as plt
from grid_world import standard_grid, negative_grid
from iterative_policy_evaluation import print_values, print_policy
from monte_carlo_es import max_dict
from sarsa import random_action, GAMMA, ALPHA, ALL_POSSIBLE_ACTIONS

SA2IDX = {}
IDX = 0

class Model:
  def __init__(self):
    self.theta = np.random.randn(25) / np.sqrt(25)
    # if we use SA2IDX, a one-hot encoding for every (s,a) pair
    # in reality we wouldn't want to do this b/c we have just
    # as many params as before
    # print "D:", IDX
    # self.theta = np.random.randn(IDX) / np.sqrt(IDX)

  def sa2x(self, s, a):
    # NOTE: using just (r, c, r*c, u, d, l, r, 1) is not expressive enough
    return np.array([
      s[0] - 1              if a == 'U' else 0,
      s[1] - 1.5            if a == 'U' else 0,
      (s[0]*s[1] - 3)/3     if a == 'U' else 0,
      (s[0]*s[0] - 2)/2     if a == 'U' else 0,
      (s[1]*s[1] - 4.5)/4.5 if a == 'U' else 0,
      1                     if a == 'U' else 0,
      s[0] - 1              if a == 'D' else 0,
      s[1] - 1.5            if a == 'D' else 0,
      (s[0]*s[1] - 3)/3     if a == 'D' else 0,
      (s[0]*s[0] - 2)/2     if a == 'D' else 0,
      (s[1]*s[1] - 4.5)/4.5 if a == 'D' else 0,
      1                     if a == 'D' else 0,
      s[0] - 1              if a == 'L' else 0,
      s[1] - 1.5            if a == 'L' else 0,
      (s[0]*s[1] - 3)/3     if a == 'L' else 0,
      (s[0]*s[0] - 2)/2     if a == 'L' else 0,
      (s[1]*s[1] - 4.5)/4.5 if a == 'L' else 0,
      1                     if a == 'L' else 0,
      s[0] - 1              if a == 'R' else 0,
      s[1] - 1.5            if a == 'R' else 0,
      (s[0]*s[1] - 3)/3     if a == 'R' else 0,
      (s[0]*s[0] - 2)/2     if a == 'R' else 0,
      (s[1]*s[1] - 4.5)/4.5 if a == 'R' else 0,
      1                     if a == 'R' else 0,
      1
    ])
    # if we use SA2IDX, a one-hot encoding for every (s,a) pair
    # in reality we wouldn't want to do this b/c we have just
    # as many params as before
    # x = np.zeros(len(self.theta))
    # idx = SA2IDX[s][a]
    # x[idx] = 1
    # return x

  def predict(self, s, a):
    x = self.sa2x(s, a)
    return self.theta.dot(x)

  def grad(self, s, a):
    return self.sa2x(s, a)


def getQs(model, s):
  # we need Q(s,a) to choose an action
  # i.e. a = argmax[a]{ Q(s,a) }
  Qs = {}
  for a in ALL_POSSIBLE_ACTIONS:
    q_sa = model.predict(s, a)
    Qs[a] = q_sa
  return Qs


if __name__ == '__main__':
  # NOTE: if we use the standard grid, there's a good chance we will end up with
  # suboptimal policies
  # e.g.
  # ---------------------------
  #   R  |   R  |   R  |      |
  # ---------------------------
  #   R* |      |   U  |      |
  # ---------------------------
  #   U  |   R  |   U  |   L  |
  # since going R at (1,0) (shown with a *) incurs no cost, it's OK to keep doing that.
  # we'll either end up staying in the same spot, or back at the start (2,0), at which
  # point we would then just go back up, or at (0,0), at which point we can continue
  # on right.
  # instead, let's penalize each movement so the agent will find a shorter route.
  #
  # grid = standard_grid()
  grid = negative_grid(step_cost=-0.1)

  # print rewards
  print "rewards:"
  print_values(grid.rewards, grid)

  # no policy initialization, we will derive our policy from the most recent Q
  # enumerate all (s,a) pairs, each will have its own weight in our "dumb" model
  # essentially each weight will be a measure of Q(s,a) itself
  states = grid.all_states()
  for s in states:
    SA2IDX[s] = {}
    for a in ALL_POSSIBLE_ACTIONS:
      SA2IDX[s][a] = IDX
      IDX += 1

  # initialize model
  model = Model()

  # repeat until convergence
  t = 1.0
  t2 = 1.0
  deltas = []
  for it in xrange(20000):
    if it % 100 == 0:
      t += 10e-3
      t2 += 0.01
    if it % 1000 == 0:
      print "it:", it
    alpha = ALPHA / t2

    # instead of 'generating' an episode, we will PLAY
    # an episode within this loop
    s = (2, 0) # start state
    grid.set_state(s)

    # get Q(s) so we can choose the first action
    Qs = getQs(model, s)

    # the first (s, r) tuple is the state we start in and 0
    # (since we don't get a reward) for simply starting the game
    # the last (s, r) tuple is the terminal state and the final reward
    # the value for the terminal state is by definition 0, so we don't
    # care about updating it.
    a = max_dict(Qs)[0]
    a = random_action(a, eps=0.5/t) # epsilon-greedy
    biggest_change = 0
    while not grid.game_over():
      r = grid.move(a)
      s2 = grid.current_state()

      # we need the next action as well since Q(s,a) depends on Q(s',a')
      # if s2 not in policy then it's a terminal state, all Q are 0
      old_theta = model.theta.copy()
      if grid.is_terminal(s2):
        model.theta += alpha*(r - model.predict(s, a))*model.grad(s, a)
      else:
        # not terminal
        Qs2 = getQs(model, s2)
        a2 = max_dict(Qs2)[0]
        a2 = random_action(a2, eps=0.5/t) # epsilon-greedy

        # we will update Q(s,a) AS we experience the episode
        model.theta += alpha*(r + GAMMA*model.predict(s2, a2) - model.predict(s, a))*model.grad(s, a)

        # next state becomes current state
        s = s2
        a = a2

      biggest_change = max(biggest_change, np.abs(model.theta - old_theta).sum())
    deltas.append(biggest_change)

  plt.plot(deltas)
  plt.show()

  # determine the policy from Q*
  # find V* from Q*
  policy = {}
  V = {}
  Q = {}
  for s in grid.actions.keys():
    Qs = getQs(model, s)
    Q[s] = Qs
    a, max_q = max_dict(Qs)
    policy[s] = a
    V[s] = max_q

  print "values:"
  print_values(V, grid)
  print "policy:"
  print_policy(policy, grid)
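
max_dict and random_action are imported from monte_carlo_es and sarsa, which are not included in this commit. Minimal stand-ins consistent with how they are used above (an assumption for illustration, not the course's actual code) might look like this:

import numpy as np

ALL_POSSIBLE_ACTIONS = ('U', 'D', 'L', 'R')

def max_dict(d):
  # return the (key, value) pair with the largest value
  max_key, max_val = None, float('-inf')
  for k, v in d.items():
    if v > max_val:
      max_key, max_val = k, v
  return max_key, max_val

def random_action(a, eps=0.1):
  # epsilon-greedy: keep the greedy action with probability 1 - eps,
  # otherwise choose uniformly among all actions
  if np.random.random() < (1 - eps):
    return a
  return np.random.choice(ALL_POSSIBLE_ACTIONS)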
Lines changed: 94 additions & 0 deletions
@@ -0,0 +1,94 @@
# https://www.udemy.com/artificial-intelligence-reinforcement-learning-in-python
import numpy as np
import matplotlib.pyplot as plt
from grid_world import standard_grid, negative_grid
from iterative_policy_evaluation import print_values, print_policy
from td0_prediction import play_game, SMALL_ENOUGH, GAMMA, ALPHA, ALL_POSSIBLE_ACTIONS

# NOTE: this is only policy evaluation, not optimization

class Model:
  def __init__(self):
    self.theta = np.random.randn(4) / 2

  def s2x(self, s):
    return np.array([s[0] - 1, s[1] - 1.5, s[0]*s[1] - 3, 1])

  def predict(self, s):
    x = self.s2x(s)
    return self.theta.dot(x)

  def grad(self, s):
    return self.s2x(s)


if __name__ == '__main__':
  # use the standard grid again (0 for every step) so that we can compare
  # to iterative policy evaluation
  grid = standard_grid()

  # print rewards
  print "rewards:"
  print_values(grid.rewards, grid)

  # state -> action
  policy = {
    (2, 0): 'U',
    (1, 0): 'U',
    (0, 0): 'R',
    (0, 1): 'R',
    (0, 2): 'R',
    (1, 2): 'R',
    (2, 1): 'R',
    (2, 2): 'R',
    (2, 3): 'U',
  }

  model = Model()
  deltas = []

  # repeat until convergence
  k = 1.0
  for it in xrange(20000):
    if it % 10 == 0:
      k += 0.01
    alpha = ALPHA/k
    biggest_change = 0

    # generate an episode using pi
    states_and_rewards = play_game(grid, policy)
    # the first (s, r) tuple is the state we start in and 0
    # (since we don't get a reward) for simply starting the game
    # the last (s, r) tuple is the terminal state and the final reward
    # the value for the terminal state is by definition 0, so we don't
    # care about updating it.
    for t in xrange(len(states_and_rewards) - 1):
      s, _ = states_and_rewards[t]
      s2, r = states_and_rewards[t+1]
      # we will update V(s) AS we experience the episode
      old_theta = model.theta.copy()
      if grid.is_terminal(s2):
        target = r
      else:
        target = r + GAMMA*model.predict(s2)
      model.theta += alpha*(target - model.predict(s))*model.grad(s)
      biggest_change = max(biggest_change, np.abs(old_theta - model.theta).sum())
    deltas.append(biggest_change)

  plt.plot(deltas)
  plt.show()

  # obtain predicted values
  V = {}
  states = grid.all_states()
  for s in states:
    if s in grid.actions:
      V[s] = model.predict(s)
    else:
      # terminal state or state we can't otherwise get to
      V[s] = 0

  print "values:"
  print_values(V, grid)
  print "policy:"
  print_policy(policy, grid)
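
To isolate what the inner loop above does, here is a single semi-gradient TD(0) step on one invented transition (the states, reward, and starting weights below are made up purely for illustration):

import numpy as np

GAMMA = 0.9
alpha = 0.1

def s2x(s):
  # same features as the Model above: [row, col, row*col, bias]
  return np.array([s[0] - 1, s[1] - 1.5, s[0]*s[1] - 3, 1.0])

theta = np.array([0.1, -0.2, 0.05, 0.3])  # arbitrary starting weights
s, s2, r = (2, 0), (1, 0), 0              # one observed (s, s', r) transition

# s2 is not terminal, so the target bootstraps off the current estimate of V(s2)
target = r + GAMMA*theta.dot(s2x(s2))
# semi-gradient update: grad of theta.dot(x) wrt theta is just x = s2x(s)
theta += alpha*(target - theta.dot(s2x(s)))*s2x(s)
print(theta)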
