
Commit f676a39

User committed: update
1 parent 802e81e commit f676a39

File tree

3 files changed: +459 -0 lines changed

rl/approx_control.py

Lines changed: 162 additions & 0 deletions
# https://deeplearningcourses.com/c/artificial-intelligence-reinforcement-learning-in-python
# https://www.udemy.com/artificial-intelligence-reinforcement-learning-in-python
from __future__ import print_function, division
from builtins import range
# Note: you may need to update your version of future
# sudo pip install -U future


import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from grid_world import standard_grid, negative_grid
from iterative_policy_evaluation import print_values, print_policy
from sklearn.kernel_approximation import Nystroem, RBFSampler

GAMMA = 0.9
ALPHA = 0.1
ALL_POSSIBLE_ACTIONS = ('U', 'D', 'L', 'R')
ACTION2INT = {a: i for i, a in enumerate(ALL_POSSIBLE_ACTIONS)}
INT2ONEHOT = np.eye(len(ALL_POSSIBLE_ACTIONS))


def epsilon_greedy(model, s, eps=0.1):
  # we'll use epsilon-soft to ensure all states are visited
  # what happens if you don't do this? i.e. eps=0
  p = np.random.random()
  if p < (1 - eps):
    values = model.predict_all_actions(s)
    return ALL_POSSIBLE_ACTIONS[np.argmax(values)]
  else:
    return np.random.choice(ALL_POSSIBLE_ACTIONS)


def one_hot(k):
  return INT2ONEHOT[k]


def merge_state_action(s, a):
  # concatenate the state coordinates with a one-hot encoding of the action
  ai = one_hot(ACTION2INT[a])
  return np.concatenate((s, ai))


def gather_samples(grid, n_episodes=1000):
  # collect (state, action) vectors from random-behavior episodes
  # so the featurizer can be fit to the input distribution
  samples = []
  for _ in range(n_episodes):
    s = grid.reset()
    while not grid.game_over():
      a = np.random.choice(ALL_POSSIBLE_ACTIONS)
      sa = merge_state_action(s, a)
      samples.append(sa)

      r = grid.move(a)
      s = grid.current_state()
  return samples


class Model:
  def __init__(self, grid):
    # fit the featurizer to data
    samples = gather_samples(grid)
    # self.featurizer = Nystroem()
    self.featurizer = RBFSampler()
    self.featurizer.fit(samples)
    dims = self.featurizer.n_components

    # initialize linear model weights
    self.w = np.zeros(dims)

  def predict(self, s, a):
    sa = merge_state_action(s, a)
    x = self.featurizer.transform([sa])[0]
    return x @ self.w

  def predict_all_actions(self, s):
    return [self.predict(s, a) for a in ALL_POSSIBLE_ACTIONS]

  def grad(self, s, a):
    # the gradient of a linear model w.r.t. w is just the feature vector
    sa = merge_state_action(s, a)
    x = self.featurizer.transform([sa])[0]
    return x


if __name__ == '__main__':
  # use the negative grid (step cost of -0.1) to encourage finding the
  # shortest path; switch to standard_grid() to compare against
  # iterative policy evaluation
  # grid = standard_grid()
  grid = negative_grid(step_cost=-0.1)

  # print rewards
  print("rewards:")
  print_values(grid.rewards, grid)

  model = Model(grid)
  reward_per_episode = []
  state_visit_count = {}

  # repeat until convergence
  n_episodes = 20000
  for it in range(n_episodes):
    if (it + 1) % 100 == 0:
      print(it + 1)

    s = grid.reset()
    state_visit_count[s] = state_visit_count.get(s, 0) + 1
    episode_reward = 0
    while not grid.game_over():
      a = epsilon_greedy(model, s)
      r = grid.move(a)
      s2 = grid.current_state()
      state_visit_count[s2] = state_visit_count.get(s2, 0) + 1

      # get the target
      if grid.game_over():
        target = r
      else:
        values = model.predict_all_actions(s2)
        target = r + GAMMA * np.max(values)

      # update the model
      g = model.grad(s, a)
      err = target - model.predict(s, a)
      model.w += ALPHA * err * g

      # accumulate reward
      episode_reward += r

      # update state
      s = s2

    reward_per_episode.append(episode_reward)

  plt.plot(reward_per_episode)
  plt.title("Reward per episode")
  plt.show()

  # obtain V* and pi*
  V = {}
  greedy_policy = {}
  states = grid.all_states()
  for s in states:
    if s in grid.actions:
      values = model.predict_all_actions(s)
      V[s] = np.max(values)
      greedy_policy[s] = ALL_POSSIBLE_ACTIONS[np.argmax(values)]
    else:
      # terminal state or state we can't otherwise get to
      V[s] = 0

  print("values:")
  print_values(V, grid)
  print("policy:")
  print_policy(greedy_policy, grid)


  print("state_visit_count:")
  state_sample_count_arr = np.zeros((grid.rows, grid.cols))
  for i in range(grid.rows):
    for j in range(grid.cols):
      if (i, j) in state_visit_count:
        state_sample_count_arr[i, j] = state_visit_count[(i, j)]
  df = pd.DataFrame(state_sample_count_arr)
  print(df)
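
A quick note on the update used above: it is semi-gradient Q-learning with a linear model over RBF features, i.e. w <- w + ALPHA * (r + GAMMA * max_a' Q(s', a') - Q(s, a)) * x(s, a), where x(s, a) is the featurized state-action pair. A minimal, hand-checkable sketch of a single such update (the feature vector and next-state action values below are made-up numbers for illustration, not output from the script):

import numpy as np

GAMMA, ALPHA = 0.9, 0.1
w = np.zeros(3)                         # linear model weights
x_sa = np.array([1.0, 0.5, 0.0])        # hypothetical features for (s, a)
q_s2 = np.array([0.2, -0.1, 0.0, 0.4])  # hypothetical Q(s', a') for each of the 4 actions

target = -0.1 + GAMMA * q_s2.max()      # r + gamma * max_a' Q(s', a'), with r = -0.1
err = target - x_sa @ w                 # TD error under the current (zero) weights
w += ALPHA * err * x_sa                 # move w along the feature vector
print(w)                                # [0.026 0.013 0.   ]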

rl/approx_prediction.py

Lines changed: 144 additions & 0 deletions
# https://deeplearningcourses.com/c/artificial-intelligence-reinforcement-learning-in-python
# https://www.udemy.com/artificial-intelligence-reinforcement-learning-in-python
from __future__ import print_function, division
from builtins import range
# Note: you may need to update your version of future
# sudo pip install -U future


import numpy as np
import matplotlib.pyplot as plt
from grid_world import standard_grid, negative_grid
from iterative_policy_evaluation import print_values, print_policy
from sklearn.kernel_approximation import Nystroem, RBFSampler

GAMMA = 0.9
ALPHA = 0.01
ALL_POSSIBLE_ACTIONS = ('U', 'D', 'L', 'R')


def epsilon_greedy(greedy, s, eps=0.1):
  # we'll use epsilon-soft to ensure all states are visited
  # what happens if you don't do this? i.e. eps=0
  p = np.random.random()
  if p < (1 - eps):
    return greedy[s]
  else:
    return np.random.choice(ALL_POSSIBLE_ACTIONS)


def gather_samples(grid, n_episodes=10000):
  # collect states from random-behavior episodes so the featurizer
  # can be fit to the input distribution
  samples = []
  for _ in range(n_episodes):
    s = grid.reset()
    samples.append(s)
    while not grid.game_over():
      a = np.random.choice(ALL_POSSIBLE_ACTIONS)
      r = grid.move(a)
      s = grid.current_state()
      samples.append(s)
  return samples


class Model:
  def __init__(self, grid):
    # fit the featurizer to data
    samples = gather_samples(grid)
    # self.featurizer = Nystroem()
    self.featurizer = RBFSampler()
    self.featurizer.fit(samples)
    dims = self.featurizer.n_components

    # initialize linear model weights
    self.w = np.zeros(dims)

  def predict(self, s):
    x = self.featurizer.transform([s])[0]
    return x @ self.w

  def grad(self, s):
    # the gradient of a linear model w.r.t. w is just the feature vector
    x = self.featurizer.transform([s])[0]
    return x


if __name__ == '__main__':
  # use the standard grid again (0 for every step) so that we can compare
  # to iterative policy evaluation
  grid = standard_grid()

  # print rewards
  print("rewards:")
  print_values(grid.rewards, grid)

  # state -> action
  greedy_policy = {
    (2, 0): 'U',
    (1, 0): 'U',
    (0, 0): 'R',
    (0, 1): 'R',
    (0, 2): 'R',
    (1, 2): 'R',
    (2, 1): 'R',
    (2, 2): 'R',
    (2, 3): 'U',
  }

  model = Model(grid)
  mse_per_episode = []

  # repeat until convergence
  n_episodes = 10000
  for it in range(n_episodes):
    if (it + 1) % 100 == 0:
      print(it + 1)

    s = grid.reset()
    Vs = model.predict(s)
    n_steps = 0
    episode_err = 0
    while not grid.game_over():
      a = epsilon_greedy(greedy_policy, s)
      r = grid.move(a)
      s2 = grid.current_state()

      # get the target
      if grid.is_terminal(s2):
        target = r
        Vs2 = 0  # value of a terminal state is 0
      else:
        Vs2 = model.predict(s2)
        target = r + GAMMA * Vs2

      # update the model
      g = model.grad(s)
      err = target - Vs
      model.w += ALPHA * err * g

      # accumulate error
      n_steps += 1
      episode_err += err * err

      # update state
      s = s2
      Vs = Vs2

    mse = episode_err / n_steps
    mse_per_episode.append(mse)

  plt.plot(mse_per_episode)
  plt.title("MSE per episode")
  plt.show()

  # obtain predicted values
  V = {}
  states = grid.all_states()
  for s in states:
    if s in grid.actions:
      V[s] = model.predict(s)
    else:
      # terminal state or state we can't otherwise get to
      V[s] = 0

  print("values:")
  print_values(V, grid)
  print("policy:")
  print_policy(greedy_policy, grid)
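
For reference, the featurizer both scripts rely on is scikit-learn's RBFSampler, which approximates an RBF kernel with random Fourier features: each (row, col) state becomes a dense feature vector, 100-dimensional by default. A minimal sketch of that mapping in isolation (the example states are arbitrary grid coordinates; scikit-learn is assumed to be installed):

import numpy as np
from sklearn.kernel_approximation import RBFSampler

featurizer = RBFSampler()               # n_components=100 by default
featurizer.fit([(2, 0), (0, 3)])        # fit only infers the input dimension; the projection is random
x = featurizer.transform([(1, 2)])[0]   # one state -> one feature vector
print(x.shape)                          # (100,)
print(x @ np.zeros(featurizer.n_components))  # prediction under zero weights: 0.0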
