
Commit 8e0100c

py3
1 parent a9ce012 · commit 8e0100c

16 files changed: +213 −118 lines

rl/approx_mc_prediction.py (+10 −4)

@@ -1,5 +1,11 @@
 # https://deeplearningcourses.com/c/artificial-intelligence-reinforcement-learning-in-python
 # https://www.udemy.com/artificial-intelligence-reinforcement-learning-in-python
+from __future__ import print_function, division
+from builtins import range
+# Note: you may need to update your version of future
+# sudo pip install -U future
+
+
 import numpy as np
 import matplotlib.pyplot as plt
 from grid_world import standard_grid, negative_grid
@@ -18,7 +24,7 @@
 grid = standard_grid()
 
 # print rewards
-print "rewards:"
+print("rewards:")
 print_values(grid.rewards, grid)
 
 # state -> action
@@ -60,7 +66,7 @@ def s2x(s):
 # repeat until convergence
 deltas = []
 t = 1.0
-for it in xrange(20000):
+for it in range(20000):
   if it % 100 == 0:
     t += 0.01
   alpha = LEARNING_RATE/t
@@ -94,7 +100,7 @@ def s2x(s):
   # terminal state or state we can't otherwise get to
   V[s] = 0
 
-print "values:"
+print("values:")
 print_values(V, grid)
-print "policy:"
+print("policy:")
 print_policy(policy, grid)
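
The six-line header added to the top of every file is the usual python-future compatibility pattern: print_function makes print a function on Python 2, division makes / mean true division everywhere, and builtins.range behaves like the lazy Python 3 range (i.e. like the old xrange) on either interpreter. A minimal sketch of what that buys, separate from the committed files and with illustrative numbers only:

# Illustrative sketch only -- not part of the commit.
from __future__ import print_function, division
from builtins import range  # provided by the third-party "future" package

print(7 / 2)      # 3.5 on Python 2 and 3 alike (true division)
print(7 // 2)     # 3 -- floor division is always spelled //

r = range(10**9)  # lazy on both interpreters; no billion-element list is built
print(r[5])       # 5 -- indexable, just as xrange was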

rl/approx_semigradient_sarsa_control.py (+11 −5)

@@ -1,5 +1,11 @@
 # https://deeplearningcourses.com/c/artificial-intelligence-reinforcement-learning-in-python
 # https://www.udemy.com/artificial-intelligence-reinforcement-learning-in-python
+from __future__ import print_function, division
+from builtins import range
+# Note: you may need to update your version of future
+# sudo pip install -U future
+
+
 import numpy as np
 import matplotlib.pyplot as plt
 from grid_world import standard_grid, negative_grid
@@ -94,7 +100,7 @@ def getQs(model, s):
 grid = negative_grid(step_cost=-0.1)
 
 # print rewards
-print "rewards:"
+print("rewards:")
 print_values(grid.rewards, grid)
 
 # no policy initialization, we will derive our policy from most recent Q
@@ -114,12 +120,12 @@ def getQs(model, s):
 t = 1.0
 t2 = 1.0
 deltas = []
-for it in xrange(20000):
+for it in range(20000):
   if it % 100 == 0:
     t += 0.01
     t2 += 0.01
   if it % 1000 == 0:
-    print "it:", it
+    print("it:", it)
   alpha = ALPHA / t2
 
   # instead of 'generating' an epsiode, we will PLAY
@@ -178,7 +184,7 @@ def getQs(model, s):
   policy[s] = a
   V[s] = max_q
 
-print "values:"
+print("values:")
 print_values(V, grid)
-print "policy:"
+print("policy:")
 print_policy(policy, grid)
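
The progress message is the one spot where adding parentheses alone would not have been enough: on Python 2 without the print_function import, print("it:", it) prints a tuple rather than two space-separated values. A small sketch of the difference (it = 1000 is just an example value):

# Illustrative sketch only.
# Python 2, old print statement:   print "it:", 1000    ->  it: 1000
# Python 2, no __future__ import:  print("it:", 1000)   ->  ('it:', 1000)
# Python 2 with print_function, or any Python 3:
from __future__ import print_function
it = 1000
print("it:", it)   # it: 1000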

rl/approx_semigradient_td0_prediction.py (+11 −5)

@@ -1,5 +1,11 @@
 # https://deeplearningcourses.com/c/artificial-intelligence-reinforcement-learning-in-python
 # https://www.udemy.com/artificial-intelligence-reinforcement-learning-in-python
+from __future__ import print_function, division
+from builtins import range
+# Note: you may need to update your version of future
+# sudo pip install -U future
+
+
 import numpy as np
 import matplotlib.pyplot as plt
 from grid_world import standard_grid, negative_grid
@@ -29,7 +35,7 @@ def grad(self, s):
 grid = standard_grid()
 
 # print rewards
-print "rewards:"
+print("rewards:")
 print_values(grid.rewards, grid)
 
 # state -> action
@@ -50,7 +56,7 @@ def grad(self, s):
 
 # repeat until convergence
 k = 1.0
-for it in xrange(20000):
+for it in range(20000):
   if it % 10 == 0:
     k += 0.01
   alpha = ALPHA/k
@@ -63,7 +69,7 @@ def grad(self, s):
   # the last (s, r) tuple is the terminal state and the final reward
   # the value for the terminal state is by definition 0, so we don't
   # care about updating it.
-  for t in xrange(len(states_and_rewards) - 1):
+  for t in range(len(states_and_rewards) - 1):
     s, _ = states_and_rewards[t]
     s2, r = states_and_rewards[t+1]
     # we will update V(s) AS we experience the episode
@@ -89,7 +95,7 @@ def grad(self, s):
   # terminal state or state we can't otherwise get to
   V[s] = 0
 
-print "values:"
+print("values:")
 print_values(V, grid)
-print "policy:"
+print("policy:")
 print_policy(policy, grid)
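
The converted TD(0) loop walks consecutive (state, reward) pairs by index; an equivalent and arguably more idiomatic Python 3 form zips the episode with itself shifted by one. A sketch with a made-up episode, not taken from the repo:

# Illustrative sketch only; the episode below is made up.
states_and_rewards = [((2, 0), 0), ((1, 0), 0), ((0, 0), 0), ((0, 1), 1)]

for (s, _), (s2, r) in zip(states_and_rewards, states_and_rewards[1:]):
  # the semigradient TD(0) update would move V(s) toward r + GAMMA * V(s2) here
  print(s, "->", s2, "reward:", r)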

rl/grid_world.py (+7 −1)

@@ -1,5 +1,11 @@
 # https://deeplearningcourses.com/c/artificial-intelligence-reinforcement-learning-in-python
 # https://www.udemy.com/artificial-intelligence-reinforcement-learning-in-python
+from __future__ import print_function, division
+from builtins import range
+# Note: you may need to update your version of future
+# sudo pip install -U future
+
+
 import numpy as np
 
 
@@ -63,7 +69,7 @@ def all_states(self):
     # possibly buggy but simple way to get all states
     # either a position that has possible next actions
     # or a position that yields a reward
-    return set(self.actions.keys() + self.rewards.keys())
+    return set(self.actions.keys()) | set(self.rewards.keys())
 
 
 def standard_grid():
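
This is the only behavioral fix in the commit rather than a syntax change: in Python 3, dict.keys() returns a view object, and concatenating two views with + raises a TypeError, so all_states() switches to a set union, which works on both versions. A quick illustration with throwaway dicts:

# Illustrative sketch with toy dicts.
actions = {(0, 0): ('D', 'R'), (0, 1): ('L', 'R')}
rewards = {(0, 3): 1, (1, 3): -1}

# Python 2: actions.keys() + rewards.keys() concatenates two lists.
# Python 3: the same expression raises TypeError on dict_keys views.
all_states = set(actions.keys()) | set(rewards.keys())
print(all_states)  # the four grid positions, in arbitrary order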

rl/iterative_policy_evaluation.py (+20 −14)

@@ -1,29 +1,35 @@
 # https://deeplearningcourses.com/c/artificial-intelligence-reinforcement-learning-in-python
 # https://www.udemy.com/artificial-intelligence-reinforcement-learning-in-python
+from __future__ import print_function, division
+from builtins import range
+# Note: you may need to update your version of future
+# sudo pip install -U future
+
+
 import numpy as np
 from grid_world import standard_grid
 
 SMALL_ENOUGH = 1e-3 # threshold for convergence
 
 def print_values(V, g):
-  for i in xrange(g.width):
-    print "---------------------------"
-    for j in xrange(g.height):
+  for i in range(g.width):
+    print("---------------------------")
+    for j in range(g.height):
       v = V.get((i,j), 0)
       if v >= 0:
-        print " %.2f|" % v,
+        print(" %.2f|" % v, end="")
       else:
-        print "%.2f|" % v, # -ve sign takes up an extra space
-    print ""
+        print("%.2f|" % v, end="") # -ve sign takes up an extra space
+    print("")
 
 
 def print_policy(P, g):
-  for i in xrange(g.width):
-    print "---------------------------"
-    for j in xrange(g.height):
+  for i in range(g.width):
+    print("---------------------------")
+    for j in range(g.height):
       a = P.get((i,j), ' ')
-      print " %s |" % a,
-    print ""
+      print(" %s |" % a, end="")
+    print("")
 
 if __name__ == '__main__':
   # iterative policy evaluation
@@ -68,9 +74,9 @@ def print_policy(P, g):
 
     if biggest_change < SMALL_ENOUGH:
       break
-  print "values for uniformly random actions:"
+  print("values for uniformly random actions:")
   print_values(V, grid)
-  print "\n\n"
+  print("\n\n")
 
   ### fixed policy ###
   policy = {
@@ -110,5 +116,5 @@ def print_policy(P, g):
 
     if biggest_change < SMALL_ENOUGH:
       break
-  print "values for fixed policy:"
+  print("values for fixed policy:")
   print_values(V, grid)
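
The fiddliest mechanical change is the old trailing-comma form (print x,), which suppressed the newline; the print function spells that out with end="". A sketch of the row-printing idiom with made-up values:

# Illustrative sketch with made-up values.
from __future__ import print_function

row = [0.62, -0.81, 0.0]
for v in row:
  if v >= 0:
    print(" %.2f|" % v, end="")   # leading space lines up with the minus sign
  else:
    print("%.2f|" % v, end="")
print("")                          # end the row with a newline
# prints:  0.62|-0.81| 0.00|

One subtle difference worth noting: the Python 2 trailing comma also emitted a soft space before the next print, so the converted rows come out one character tighter per cell; this is cosmetic only.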

rl/monte_carlo.py (+11 −5)

@@ -1,5 +1,11 @@
 # https://deeplearningcourses.com/c/artificial-intelligence-reinforcement-learning-in-python
 # https://www.udemy.com/artificial-intelligence-reinforcement-learning-in-python
+from __future__ import print_function, division
+from builtins import range
+# Note: you may need to update your version of future
+# sudo pip install -U future
+
+
 import numpy as np
 from grid_world import standard_grid, negative_grid
 from iterative_policy_evaluation import print_values, print_policy
@@ -16,7 +22,7 @@ def play_game(grid, policy):
   # reset game to start at a random position
   # we need to do this, because given our current deterministic policy
   # we would never end up at certain states, but we still want to measure their value
-  start_states = grid.actions.keys()
+  start_states = list(grid.actions.keys())
   start_idx = np.random.choice(len(start_states))
   grid.set_state(start_states[start_idx])
 
@@ -50,7 +56,7 @@ def play_game(grid, policy):
 grid = standard_grid()
 
 # print rewards
-print "rewards:"
+print("rewards:")
 print_values(grid.rewards, grid)
 
 # state -> action
@@ -78,7 +84,7 @@ def play_game(grid, policy):
   V[s] = 0
 
 # repeat
-for t in xrange(100):
+for t in range(100):
 
   # generate an episode using pi
   states_and_returns = play_game(grid, policy)
@@ -91,7 +97,7 @@ def play_game(grid, policy):
     V[s] = np.mean(returns[s])
     seen_states.add(s)
 
-print "values:"
+print("values:")
 print_values(V, grid)
-print "policy:"
+print("policy:")
 print_policy(policy, grid)
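
start_states had to be wrapped in list() because the very next lines index into it; a Python 3 dict_keys view supports iteration and len() but not subscripting. A sketch of the random-start idiom with a toy action dict:

# Illustrative sketch with a toy action dict.
import numpy as np

actions = {(2, 0): ('U', 'R'), (1, 0): ('U', 'D'), (0, 0): ('D', 'R')}

start_states = list(actions.keys())              # a view is not indexable in Python 3
start_idx = np.random.choice(len(start_states))  # random integer in [0, 3)
print(start_states[start_idx])                   # one of the three keys, at random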

rl/monte_carlo_es.py (+14 −8)

@@ -1,5 +1,11 @@
 # https://deeplearningcourses.com/c/artificial-intelligence-reinforcement-learning-in-python
 # https://www.udemy.com/artificial-intelligence-reinforcement-learning-in-python
+from __future__ import print_function, division
+from builtins import range
+# Note: you may need to update your version of future
+# sudo pip install -U future
+
+
 import numpy as np
 import matplotlib.pyplot as plt
 from grid_world import standard_grid, negative_grid
@@ -18,7 +24,7 @@ def play_game(grid, policy):
   # we need to do this if we have a deterministic policy
   # we would never end up at certain states, but we still want to measure their value
   # this is called the "exploring starts" method
-  start_states = grid.actions.keys()
+  start_states = list(grid.actions.keys())
   start_idx = np.random.choice(len(start_states))
   grid.set_state(start_states[start_idx])
 
@@ -70,7 +76,7 @@ def max_dict(d):
   # put this into a function since we are using it so often
   max_key = None
   max_val = float('-inf')
-  for k, v in d.iteritems():
+  for k, v in d.items():
     if v > max_val:
       max_val = v
       max_key = k
@@ -86,7 +92,7 @@ def max_dict(d):
 grid = negative_grid(step_cost=-0.9)
 
 # print rewards
-print "rewards:"
+print("rewards:")
 print_values(grid.rewards, grid)
 
 # state -> action
@@ -111,9 +117,9 @@ def max_dict(d):
 
 # repeat until convergence
 deltas = []
-for t in xrange(2000):
+for t in range(2000):
   if t % 100 == 0:
-    print t
+    print(t)
 
   # generate an episode using pi
   biggest_change = 0
@@ -138,13 +144,13 @@ def max_dict(d):
 plt.plot(deltas)
 plt.show()
 
-print "final policy:"
+print("final policy:")
 print_policy(policy, grid)
 
 # find V
 V = {}
-for s, Qs in Q.iteritems():
+for s, Qs in Q.items():
   V[s] = max_dict(Q[s])[1]
 
-print "final values:"
+print("final values:")
 print_values(V, grid)
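
dict.iteritems() no longer exists in Python 3; plain .items() is the drop-in replacement and is already a lazy view there, so nothing is lost. A sketch of the same argmax-over-a-dict pattern as max_dict() above, on a toy Q-table:

# Illustrative sketch; toy numbers.
def max_dict(d):
  max_key = None
  max_val = float('-inf')
  for k, v in d.items():   # .iteritems() would raise AttributeError on Python 3
    if v > max_val:
      max_val = v
      max_key = k
  return max_key, max_val

Qs = {'U': -0.4, 'D': 0.1, 'L': -1.2, 'R': 0.7}
print(max_dict(Qs))  # ('R', 0.7)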

rl/monte_carlo_no_es.py (+11 −5)

@@ -1,5 +1,11 @@
 # https://deeplearningcourses.com/c/artificial-intelligence-reinforcement-learning-in-python
 # https://www.udemy.com/artificial-intelligence-reinforcement-learning-in-python
+from __future__ import print_function, division
+from builtins import range
+# Note: you may need to update your version of future
+# sudo pip install -U future
+
+
 import numpy as np
 import matplotlib.pyplot as plt
 from grid_world import standard_grid, negative_grid
@@ -77,7 +83,7 @@ def play_game(grid, policy):
 grid = negative_grid(step_cost=-0.1)
 
 # print rewards
-print "rewards:"
+print("rewards:")
 print_values(grid.rewards, grid)
 
 # state -> action
@@ -102,9 +108,9 @@ def play_game(grid, policy):
 
 # repeat until convergence
 deltas = []
-for t in xrange(5000):
+for t in range(5000):
   if t % 1000 == 0:
-    print t
+    print(t)
 
   # generate an episode using pi
   biggest_change = 0
@@ -138,8 +144,8 @@ def play_game(grid, policy):
 for s in policy.keys():
   V[s] = max_dict(Q[s])[1]
 
-print "final values:"
+print("final values:")
 print_values(V, grid)
-print "final policy:"
+print("final policy:")
 print_policy(policy, grid)