# https://www.udemy.com/artificial-intelligence-reinforcement-learning-in-python
import numpy as np
import matplotlib.pyplot as plt
from grid_world import standard_grid, negative_grid
from iterative_policy_evaluation import print_values, print_policy
from monte_carlo_es import max_dict
from sarsa import random_action, GAMMA, ALPHA, ALL_POSSIBLE_ACTIONS

SA2IDX = {}
IDX = 0

class Model:
  def __init__(self):
    self.theta = np.random.randn(25) / np.sqrt(25)
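    # 25 is the length of the feature vector returned by sa2x:
    # 6 features for each of the 4 actions, plus a bias term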
    # if we use SA2IDX, a one-hot encoding for every (s,a) pair
    # in reality we wouldn't want to do this b/c we have just
    # as many params as before
    # print("D:", IDX)
    # self.theta = np.random.randn(IDX) / np.sqrt(IDX)

  def sa2x(self, s, a):
    # NOTE: using just (r, c, r*c, u, d, l, r, 1) is not expressive enough
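    # the constants subtracted and divided by below (1, 1.5, 3, 2, 4.5) appear
    # to roughly center and scale each raw feature over the 3x4 grid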
    return np.array([
      s[0] - 1              if a == 'U' else 0,
      s[1] - 1.5            if a == 'U' else 0,
      (s[0]*s[1] - 3)/3     if a == 'U' else 0,
      (s[0]*s[0] - 2)/2     if a == 'U' else 0,
      (s[1]*s[1] - 4.5)/4.5 if a == 'U' else 0,
      1                     if a == 'U' else 0,
      s[0] - 1              if a == 'D' else 0,
      s[1] - 1.5            if a == 'D' else 0,
      (s[0]*s[1] - 3)/3     if a == 'D' else 0,
      (s[0]*s[0] - 2)/2     if a == 'D' else 0,
      (s[1]*s[1] - 4.5)/4.5 if a == 'D' else 0,
      1                     if a == 'D' else 0,
      s[0] - 1              if a == 'L' else 0,
      s[1] - 1.5            if a == 'L' else 0,
      (s[0]*s[1] - 3)/3     if a == 'L' else 0,
      (s[0]*s[0] - 2)/2     if a == 'L' else 0,
      (s[1]*s[1] - 4.5)/4.5 if a == 'L' else 0,
      1                     if a == 'L' else 0,
      s[0] - 1              if a == 'R' else 0,
      s[1] - 1.5            if a == 'R' else 0,
      (s[0]*s[1] - 3)/3     if a == 'R' else 0,
      (s[0]*s[0] - 2)/2     if a == 'R' else 0,
      (s[1]*s[1] - 4.5)/4.5 if a == 'R' else 0,
      1                     if a == 'R' else 0,
      1
    ])
    # if we use SA2IDX, a one-hot encoding for every (s,a) pair
    # in reality we wouldn't want to do this b/c we have just
    # as many params as before
    # x = np.zeros(len(self.theta))
    # idx = SA2IDX[s][a]
    # x[idx] = 1
    # return x

  def predict(self, s, a):
    x = self.sa2x(s, a)
    return self.theta.dot(x)

  def grad(self, s, a):
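    # for a linear model Q(s,a) = theta.dot(x), the gradient with respect to
    # theta is just the feature vector x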
    return self.sa2x(s, a)


def getQs(model, s):
  # we need Q(s,a) to choose an action
  # i.e. a = argmax[a]{ Q(s,a) }
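  # with function approximation there is no Q-table, so we evaluate the model
  # once per action for the current state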
  Qs = {}
  for a in ALL_POSSIBLE_ACTIONS:
    q_sa = model.predict(s, a)
    Qs[a] = q_sa
  return Qs


if __name__ == '__main__':
  # NOTE: if we use the standard grid, there's a good chance we will end up with
  # suboptimal policies
  # e.g.
  # ---------------------------
  #   R  |   R  |   R  |      |
  # ---------------------------
  #   R* |      |   U  |      |
  # ---------------------------
  #   U  |   R  |   U  |   L  |
  # since going R at (1,0) (shown with a *) incurs no cost, it's OK to keep doing that.
  # we'll either end up staying in the same spot, or back to the start (2,0), at which
  # point we would then just go back up, or at (0,0), at which point we can continue
  # on right.
  # instead, let's penalize each movement so the agent will find a shorter route.
  #
  # grid = standard_grid()
  grid = negative_grid(step_cost=-0.1)

  # print rewards
  print("rewards:")
  print_values(grid.rewards, grid)

  # no policy initialization, we will derive our policy from the most recent Q
  # enumerate all (s,a) pairs, each will have its own weight in our "dumb" model
  # essentially each weight will be a measure of Q(s,a) itself
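  # (note: SA2IDX is only needed by the commented-out one-hot encoding in
  # Model.sa2x; the feature-based model doesn't use it, but it's kept here so
  # it's easy to switch between the two)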
  states = grid.all_states()
  for s in states:
    SA2IDX[s] = {}
    for a in ALL_POSSIBLE_ACTIONS:
      SA2IDX[s][a] = IDX
      IDX += 1

  # initialize model
  model = Model()

  # repeat until convergence
  t = 1.0
  t2 = 1.0
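  # t controls the exploration decay (eps = 0.5/t) and t2 controls the
  # learning rate decay (alpha = ALPHA/t2); both are bumped every 100 iterations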
  deltas = []
  for it in range(20000):
    if it % 100 == 0:
      t += 10e-3
      t2 += 0.01
    if it % 1000 == 0:
      print("it:", it)
    alpha = ALPHA / t2

    # instead of 'generating' an episode, we will PLAY
    # an episode within this loop
    s = (2, 0) # start state
    grid.set_state(s)

    # get Q(s) so we can choose the first action
    Qs = getQs(model, s)

    # the first (s, r) tuple is the state we start in and 0
    # (since we don't get a reward) for simply starting the game
    # the last (s, r) tuple is the terminal state and the final reward
    # the value for the terminal state is by definition 0, so we don't
    # care about updating it.
    a = max_dict(Qs)[0]
    a = random_action(a, eps=0.5/t) # epsilon-greedy
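    # track the largest single-step change in theta during this episode
    # as a rough measure of convergence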
    biggest_change = 0
    while not grid.game_over():
      r = grid.move(a)
      s2 = grid.current_state()

      # we need the next action as well since Q(s,a) depends on Q(s',a')
      # if s2 not in policy then it's a terminal state, all Q are 0
      old_theta = model.theta.copy()
      if grid.is_terminal(s2):
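        # the target is just r because the value of the terminal state is 0 by definition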
        model.theta += alpha*(r - model.predict(s, a))*model.grad(s, a)
      else:
        # not terminal
        Qs2 = getQs(model, s2)
        a2 = max_dict(Qs2)[0]
        a2 = random_action(a2, eps=0.5/t) # epsilon-greedy

        # we will update Q(s,a) AS we experience the episode
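        # semi-gradient SARSA update:
        #   theta <- theta + alpha*[r + gamma*Q(s',a') - Q(s,a)]*grad(Q(s,a))
        # (the TD target is treated as a constant when taking the gradient,
        # hence "semi-gradient")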
        model.theta += alpha*(r + GAMMA*model.predict(s2, a2) - model.predict(s, a))*model.grad(s, a)

        # next state becomes current state
        s = s2
        a = a2

      biggest_change = max(biggest_change, np.abs(model.theta - old_theta).sum())
    deltas.append(biggest_change)

  plt.plot(deltas)
  plt.show()

  # determine the policy from Q*
  # find V* from Q*
  policy = {}
  V = {}
  Q = {}
  for s in grid.actions.keys():
    Qs = getQs(model, s)
    Q[s] = Qs
    a, max_q = max_dict(Qs)
    policy[s] = a
    V[s] = max_q

| 180 | + print "values:" |
| 181 | + print_values(V, grid) |
| 182 | + print "policy:" |
| 183 | + print_policy(policy, grid) |