
Commit 8e0100c

py3
1 parent a9ce012 · commit 8e0100c

16 files changed: +213 −118 lines

rl/approx_mc_prediction.py (+10 −4)

@@ -1,5 +1,11 @@
 # https://deeplearningcourses.com/c/artificial-intelligence-reinforcement-learning-in-python
 # https://www.udemy.com/artificial-intelligence-reinforcement-learning-in-python
+from __future__ import print_function, division
+from builtins import range
+# Note: you may need to update your version of future
+# sudo pip install -U future
+
+
 import numpy as np
 import matplotlib.pyplot as plt
 from grid_world import standard_grid, negative_grid
@@ -18,7 +24,7 @@
 grid = standard_grid()
 
 # print rewards
-print "rewards:"
+print("rewards:")
 print_values(grid.rewards, grid)
 
 # state -> action
@@ -60,7 +66,7 @@ def s2x(s):
 # repeat until convergence
 deltas = []
 t = 1.0
-for it in xrange(20000):
+for it in range(20000):
   if it % 100 == 0:
     t += 0.01
   alpha = LEARNING_RATE/t
@@ -94,7 +100,7 @@ def s2x(s):
   # terminal state or state we can't otherwise get to
   V[s] = 0
 
-print "values:"
+print("values:")
 print_values(V, grid)
-print "policy:"
+print("policy:")
 print_policy(policy, grid)
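
The six-line header added to the top of every file is the usual python-future compatibility pattern: print_function makes print a function on Python 2, division makes / mean true division everywhere, and builtins.range behaves like the lazy Python 3 range (i.e. like the old xrange) on either interpreter. A minimal sketch of what that buys, separate from the committed files and with illustrative numbers only:

# Illustrative sketch only -- not part of the commit.
from __future__ import print_function, division
from builtins import range  # provided by the third-party "future" package

print(7 / 2)      # 3.5 on Python 2 and 3 alike (true division)
print(7 // 2)     # 3 -- floor division is always spelled //

r = range(10**9)  # lazy on both interpreters; no billion-element list is built
print(r[5])       # 5 -- indexable, just as xrange was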

rl/approx_semigradient_sarsa_control.py (+11 −5)

@@ -1,5 +1,11 @@
 # https://deeplearningcourses.com/c/artificial-intelligence-reinforcement-learning-in-python
 # https://www.udemy.com/artificial-intelligence-reinforcement-learning-in-python
+from __future__ import print_function, division
+from builtins import range
+# Note: you may need to update your version of future
+# sudo pip install -U future
+
+
 import numpy as np
 import matplotlib.pyplot as plt
 from grid_world import standard_grid, negative_grid
@@ -94,7 +100,7 @@ def getQs(model, s):
 grid = negative_grid(step_cost=-0.1)
 
 # print rewards
-print "rewards:"
+print("rewards:")
 print_values(grid.rewards, grid)
 
 # no policy initialization, we will derive our policy from most recent Q
@@ -114,12 +120,12 @@ def getQs(model, s):
 t = 1.0
 t2 = 1.0
 deltas = []
-for it in xrange(20000):
+for it in range(20000):
   if it % 100 == 0:
     t += 0.01
     t2 += 0.01
   if it % 1000 == 0:
-    print "it:", it
+    print("it:", it)
   alpha = ALPHA / t2
 
   # instead of 'generating' an epsiode, we will PLAY
@@ -178,7 +184,7 @@ def getQs(model, s):
   policy[s] = a
   V[s] = max_q
 
-print "values:"
+print("values:")
 print_values(V, grid)
-print "policy:"
+print("policy:")
 print_policy(policy, grid)
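
The progress message is the one spot where adding parentheses alone would not have been enough: on Python 2 without the print_function import, print("it:", it) prints a tuple rather than two space-separated values. A small sketch of the difference (it = 1000 is just an example value):

# Illustrative sketch only.
# Python 2, old print statement:   print "it:", 1000    ->  it: 1000
# Python 2, no __future__ import:  print("it:", 1000)   ->  ('it:', 1000)
# Python 2 with print_function, or any Python 3:
from __future__ import print_function
it = 1000
print("it:", it)   # it: 1000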

rl/approx_semigradient_td0_prediction.py (+11 −5)

@@ -1,5 +1,11 @@
 # https://deeplearningcourses.com/c/artificial-intelligence-reinforcement-learning-in-python
 # https://www.udemy.com/artificial-intelligence-reinforcement-learning-in-python
+from __future__ import print_function, division
+from builtins import range
+# Note: you may need to update your version of future
+# sudo pip install -U future
+
+
 import numpy as np
 import matplotlib.pyplot as plt
 from grid_world import standard_grid, negative_grid
@@ -29,7 +35,7 @@ def grad(self, s):
 grid = standard_grid()
 
 # print rewards
-print "rewards:"
+print("rewards:")
 print_values(grid.rewards, grid)
 
 # state -> action
@@ -50,7 +56,7 @@ def grad(self, s):
 
 # repeat until convergence
 k = 1.0
-for it in xrange(20000):
+for it in range(20000):
   if it % 10 == 0:
     k += 0.01
   alpha = ALPHA/k
@@ -63,7 +69,7 @@ def grad(self, s):
   # the last (s, r) tuple is the terminal state and the final reward
   # the value for the terminal state is by definition 0, so we don't
   # care about updating it.
-  for t in xrange(len(states_and_rewards) - 1):
+  for t in range(len(states_and_rewards) - 1):
     s, _ = states_and_rewards[t]
     s2, r = states_and_rewards[t+1]
     # we will update V(s) AS we experience the episode
@@ -89,7 +95,7 @@ def grad(self, s):
   # terminal state or state we can't otherwise get to
   V[s] = 0
 
-print "values:"
+print("values:")
 print_values(V, grid)
-print "policy:"
+print("policy:")
 print_policy(policy, grid)
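
The converted TD(0) loop walks consecutive (state, reward) pairs by index; an equivalent and arguably more idiomatic Python 3 form zips the episode with itself shifted by one. A sketch with a made-up episode, not taken from the repo:

# Illustrative sketch only; the episode below is made up.
states_and_rewards = [((2, 0), 0), ((1, 0), 0), ((0, 0), 0), ((0, 1), 1)]

for (s, _), (s2, r) in zip(states_and_rewards, states_and_rewards[1:]):
  # the semigradient TD(0) update would move V(s) toward r + GAMMA * V(s2) here
  print(s, "->", s2, "reward:", r)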

rl/grid_world.py (+7 −1)

@@ -1,5 +1,11 @@
 # https://deeplearningcourses.com/c/artificial-intelligence-reinforcement-learning-in-python
 # https://www.udemy.com/artificial-intelligence-reinforcement-learning-in-python
+from __future__ import print_function, division
+from builtins import range
+# Note: you may need to update your version of future
+# sudo pip install -U future
+
+
 import numpy as np
 
 
@@ -63,7 +69,7 @@ def all_states(self):
     # possibly buggy but simple way to get all states
     # either a position that has possible next actions
     # or a position that yields a reward
-    return set(self.actions.keys() + self.rewards.keys())
+    return set(self.actions.keys()) | set(self.rewards.keys())
 
 
 def standard_grid():
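
This is the only behavioral fix in the commit rather than a syntax change: in Python 3, dict.keys() returns a view object, and concatenating two views with + raises a TypeError, so all_states() switches to a set union, which works on both versions. A quick illustration with throwaway dicts:

# Illustrative sketch with toy dicts.
actions = {(0, 0): ('D', 'R'), (0, 1): ('L', 'R')}
rewards = {(0, 3): 1, (1, 3): -1}

# Python 2: actions.keys() + rewards.keys() concatenates two lists.
# Python 3: the same expression raises TypeError on dict_keys views.
all_states = set(actions.keys()) | set(rewards.keys())
print(all_states)  # the four grid positions, in arbitrary order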

rl/iterative_policy_evaluation.py (+20 −14)

@@ -1,29 +1,35 @@
 # https://deeplearningcourses.com/c/artificial-intelligence-reinforcement-learning-in-python
 # https://www.udemy.com/artificial-intelligence-reinforcement-learning-in-python
+from __future__ import print_function, division
+from builtins import range
+# Note: you may need to update your version of future
+# sudo pip install -U future
+
+
 import numpy as np
 from grid_world import standard_grid
 
 SMALL_ENOUGH = 1e-3 # threshold for convergence
 
 def print_values(V, g):
-  for i in xrange(g.width):
-    print "---------------------------"
-    for j in xrange(g.height):
+  for i in range(g.width):
+    print("---------------------------")
+    for j in range(g.height):
       v = V.get((i,j), 0)
       if v >= 0:
-        print " %.2f|" % v,
+        print(" %.2f|" % v, end="")
       else:
-        print "%.2f|" % v, # -ve sign takes up an extra space
-    print ""
+        print("%.2f|" % v, end="") # -ve sign takes up an extra space
+    print("")
 
 
 def print_policy(P, g):
-  for i in xrange(g.width):
-    print "---------------------------"
-    for j in xrange(g.height):
+  for i in range(g.width):
+    print("---------------------------")
+    for j in range(g.height):
       a = P.get((i,j), ' ')
-      print " %s |" % a,
-    print ""
+      print(" %s |" % a, end="")
+    print("")
 
 if __name__ == '__main__':
   # iterative policy evaluation
@@ -68,9 +74,9 @@ def print_policy(P, g):
 
     if biggest_change < SMALL_ENOUGH:
       break
-  print "values for uniformly random actions:"
+  print("values for uniformly random actions:")
   print_values(V, grid)
-  print "\n\n"
+  print("\n\n")
 
   ### fixed policy ###
   policy = {
@@ -110,5 +116,5 @@ def print_policy(P, g):
 
     if biggest_change < SMALL_ENOUGH:
       break
-  print "values for fixed policy:"
+  print("values for fixed policy:")
   print_values(V, grid)
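
The fiddliest mechanical change is the old trailing-comma form (print x,), which suppressed the newline; the print function spells that out with end="". A sketch of the row-printing idiom with made-up values:

# Illustrative sketch with made-up values.
from __future__ import print_function

row = [0.62, -0.81, 0.0]
for v in row:
  if v >= 0:
    print(" %.2f|" % v, end="")   # leading space lines up with the minus sign
  else:
    print("%.2f|" % v, end="")
print("")                          # end the row with a newline
# prints:  0.62|-0.81| 0.00|

One subtle difference worth noting: the Python 2 trailing comma also emitted a soft space before the next print, so the converted rows come out one character tighter per cell; this is cosmetic only.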

rl/monte_carlo.py (+11 −5)

@@ -1,5 +1,11 @@
 # https://deeplearningcourses.com/c/artificial-intelligence-reinforcement-learning-in-python
 # https://www.udemy.com/artificial-intelligence-reinforcement-learning-in-python
+from __future__ import print_function, division
+from builtins import range
+# Note: you may need to update your version of future
+# sudo pip install -U future
+
+
 import numpy as np
 from grid_world import standard_grid, negative_grid
 from iterative_policy_evaluation import print_values, print_policy
@@ -16,7 +22,7 @@ def play_game(grid, policy):
   # reset game to start at a random position
   # we need to do this, because given our current deterministic policy
   # we would never end up at certain states, but we still want to measure their value
-  start_states = grid.actions.keys()
+  start_states = list(grid.actions.keys())
   start_idx = np.random.choice(len(start_states))
   grid.set_state(start_states[start_idx])
 
@@ -50,7 +56,7 @@ def play_game(grid, policy):
 grid = standard_grid()
 
 # print rewards
-print "rewards:"
+print("rewards:")
 print_values(grid.rewards, grid)
 
 # state -> action
@@ -78,7 +84,7 @@ def play_game(grid, policy):
   V[s] = 0
 
 # repeat
-for t in xrange(100):
+for t in range(100):
 
   # generate an episode using pi
   states_and_returns = play_game(grid, policy)
@@ -91,7 +97,7 @@ def play_game(grid, policy):
     V[s] = np.mean(returns[s])
     seen_states.add(s)
 
-print "values:"
+print("values:")
 print_values(V, grid)
-print "policy:"
+print("policy:")
 print_policy(policy, grid)
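
start_states had to be wrapped in list() because the very next lines index into it; a Python 3 dict_keys view supports iteration and len() but not subscripting. A sketch of the random-start idiom with a toy action dict:

# Illustrative sketch with a toy action dict.
import numpy as np

actions = {(2, 0): ('U', 'R'), (1, 0): ('U', 'D'), (0, 0): ('D', 'R')}

start_states = list(actions.keys())              # a view is not indexable in Python 3
start_idx = np.random.choice(len(start_states))  # random integer in [0, 3)
print(start_states[start_idx])                   # one of the three keys, at random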

rl/monte_carlo_es.py (+14 −8)

@@ -1,5 +1,11 @@
 # https://deeplearningcourses.com/c/artificial-intelligence-reinforcement-learning-in-python
 # https://www.udemy.com/artificial-intelligence-reinforcement-learning-in-python
+from __future__ import print_function, division
+from builtins import range
+# Note: you may need to update your version of future
+# sudo pip install -U future
+
+
 import numpy as np
 import matplotlib.pyplot as plt
 from grid_world import standard_grid, negative_grid
@@ -18,7 +24,7 @@ def play_game(grid, policy):
   # we need to do this if we have a deterministic policy
   # we would never end up at certain states, but we still want to measure their value
   # this is called the "exploring starts" method
-  start_states = grid.actions.keys()
+  start_states = list(grid.actions.keys())
   start_idx = np.random.choice(len(start_states))
   grid.set_state(start_states[start_idx])
 
@@ -70,7 +76,7 @@ def max_dict(d):
   # put this into a function since we are using it so often
   max_key = None
   max_val = float('-inf')
-  for k, v in d.iteritems():
+  for k, v in d.items():
     if v > max_val:
       max_val = v
       max_key = k
@@ -86,7 +92,7 @@ def max_dict(d):
 grid = negative_grid(step_cost=-0.9)
 
 # print rewards
-print "rewards:"
+print("rewards:")
 print_values(grid.rewards, grid)
 
 # state -> action
@@ -111,9 +117,9 @@ def max_dict(d):
 
 # repeat until convergence
 deltas = []
-for t in xrange(2000):
+for t in range(2000):
   if t % 100 == 0:
-    print t
+    print(t)
 
   # generate an episode using pi
   biggest_change = 0
@@ -138,13 +144,13 @@ def max_dict(d):
 plt.plot(deltas)
 plt.show()
 
-print "final policy:"
+print("final policy:")
 print_policy(policy, grid)
 
 # find V
 V = {}
-for s, Qs in Q.iteritems():
+for s, Qs in Q.items():
   V[s] = max_dict(Q[s])[1]
 
-print "final values:"
+print("final values:")
 print_values(V, grid)
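
dict.iteritems() no longer exists in Python 3; plain .items() is the drop-in replacement and is already a lazy view there, so nothing is lost. A sketch of the same argmax-over-a-dict pattern as max_dict() above, on a toy Q-table:

# Illustrative sketch; toy numbers.
def max_dict(d):
  max_key = None
  max_val = float('-inf')
  for k, v in d.items():   # .iteritems() would raise AttributeError on Python 3
    if v > max_val:
      max_val = v
      max_key = k
  return max_key, max_val

Qs = {'U': -0.4, 'D': 0.1, 'L': -1.2, 'R': 0.7}
print(max_dict(Qs))  # ('R', 0.7)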

rl/monte_carlo_no_es.py (+11 −5)

@@ -1,5 +1,11 @@
 # https://deeplearningcourses.com/c/artificial-intelligence-reinforcement-learning-in-python
 # https://www.udemy.com/artificial-intelligence-reinforcement-learning-in-python
+from __future__ import print_function, division
+from builtins import range
+# Note: you may need to update your version of future
+# sudo pip install -U future
+
+
 import numpy as np
 import matplotlib.pyplot as plt
 from grid_world import standard_grid, negative_grid
@@ -77,7 +83,7 @@ def play_game(grid, policy):
 grid = negative_grid(step_cost=-0.1)
 
 # print rewards
-print "rewards:"
+print("rewards:")
 print_values(grid.rewards, grid)
 
 # state -> action
@@ -102,9 +108,9 @@ def play_game(grid, policy):
 
 # repeat until convergence
 deltas = []
-for t in xrange(5000):
+for t in range(5000):
   if t % 1000 == 0:
-    print t
+    print(t)
 
   # generate an episode using pi
   biggest_change = 0
@@ -138,8 +144,8 @@ def play_game(grid, policy):
 for s in policy.keys():
   V[s] = max_dict(Q[s])[1]
 
-print "final values:"
+print("final values:")
 print_values(V, grid)
-print "final policy:"
+print("final policy:")
 print_policy(policy, grid)