# https://www.udemy.com/artificial-intelligence-reinforcement-learning-in-python
import numpy as np
import matplotlib.pyplot as plt
from grid_world import standard_grid, negative_grid
from iterative_policy_evaluation import print_values, print_policy
from monte_carlo_es import max_dict
from sarsa import random_action, GAMMA, ALPHA, ALL_POSSIBLE_ACTIONS

SA2IDX = {}
IDX = 0

class Model:
  def __init__(self):
    self.theta = np.random.randn(25) / np.sqrt(25)
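    # 25 is the length of the feature vector returned by sa2x:
    # 6 features for each of the 4 actions, plus a bias term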
    # if we use SA2IDX, a one-hot encoding for every (s,a) pair
    # in reality we wouldn't want to do this b/c we have just
    # as many params as before
    # print("D:", IDX)
    # self.theta = np.random.randn(IDX) / np.sqrt(IDX)

  def sa2x(self, s, a):
    # NOTE: using just (r, c, r*c, u, d, l, r, 1) is not expressive enough
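    # the constants subtracted and divided by below (1, 1.5, 3, 2, 4.5) appear
    # to roughly center and scale each raw feature over the 3x4 grid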
    return np.array([
      s[0] - 1              if a == 'U' else 0,
      s[1] - 1.5            if a == 'U' else 0,
      (s[0]*s[1] - 3)/3     if a == 'U' else 0,
      (s[0]*s[0] - 2)/2     if a == 'U' else 0,
      (s[1]*s[1] - 4.5)/4.5 if a == 'U' else 0,
      1                     if a == 'U' else 0,
      s[0] - 1              if a == 'D' else 0,
      s[1] - 1.5            if a == 'D' else 0,
      (s[0]*s[1] - 3)/3     if a == 'D' else 0,
      (s[0]*s[0] - 2)/2     if a == 'D' else 0,
      (s[1]*s[1] - 4.5)/4.5 if a == 'D' else 0,
      1                     if a == 'D' else 0,
      s[0] - 1              if a == 'L' else 0,
      s[1] - 1.5            if a == 'L' else 0,
      (s[0]*s[1] - 3)/3     if a == 'L' else 0,
      (s[0]*s[0] - 2)/2     if a == 'L' else 0,
      (s[1]*s[1] - 4.5)/4.5 if a == 'L' else 0,
      1                     if a == 'L' else 0,
      s[0] - 1              if a == 'R' else 0,
      s[1] - 1.5            if a == 'R' else 0,
      (s[0]*s[1] - 3)/3     if a == 'R' else 0,
      (s[0]*s[0] - 2)/2     if a == 'R' else 0,
      (s[1]*s[1] - 4.5)/4.5 if a == 'R' else 0,
      1                     if a == 'R' else 0,
      1
    ])
    # if we use SA2IDX, a one-hot encoding for every (s,a) pair
    # in reality we wouldn't want to do this b/c we have just
    # as many params as before
    # x = np.zeros(len(self.theta))
    # idx = SA2IDX[s][a]
    # x[idx] = 1
    # return x

  def predict(self, s, a):
    x = self.sa2x(s, a)
    return self.theta.dot(x)

  def grad(self, s, a):
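    # for a linear model Q(s,a) = theta.dot(x), the gradient with respect to
    # theta is just the feature vector x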
    return self.sa2x(s, a)


def getQs(model, s):
  # we need Q(s,a) to choose an action
  # i.e. a = argmax[a]{ Q(s,a) }
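  # with function approximation there is no Q-table, so we evaluate the model
  # once per action for the current state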
  Qs = {}
  for a in ALL_POSSIBLE_ACTIONS:
    q_sa = model.predict(s, a)
    Qs[a] = q_sa
  return Qs


if __name__ == '__main__':
  # NOTE: if we use the standard grid, there's a good chance we will end up with
  # suboptimal policies
  # e.g.
  # ---------------------------
  #   R  |   R  |   R  |      |
  # ---------------------------
  #   R* |      |   U  |      |
  # ---------------------------
  #   U  |   R  |   U  |   L  |
  # since going R at (1,0) (shown with a *) incurs no cost, it's OK to keep doing that.
  # we'll either end up staying in the same spot, or back to the start (2,0), at which
  # point we would then just go back up, or at (0,0), at which point we can continue
  # on right.
  # instead, let's penalize each movement so the agent will find a shorter route.
  #
  # grid = standard_grid()
  grid = negative_grid(step_cost=-0.1)

  # print rewards
  print("rewards:")
  print_values(grid.rewards, grid)

  # no policy initialization, we will derive our policy from the most recent Q
  # enumerate all (s,a) pairs, each will have its own weight in our "dumb" model
  # essentially each weight will be a measure of Q(s,a) itself
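  # (note: SA2IDX is only needed by the commented-out one-hot encoding in
  # Model.sa2x; the feature-based model doesn't use it, but it's kept here so
  # it's easy to switch between the two)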
  states = grid.all_states()
  for s in states:
    SA2IDX[s] = {}
    for a in ALL_POSSIBLE_ACTIONS:
      SA2IDX[s][a] = IDX
      IDX += 1

  # initialize model
  model = Model()

  # repeat until convergence
  t = 1.0
  t2 = 1.0
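  # t controls the exploration decay (eps = 0.5/t) and t2 controls the
  # learning rate decay (alpha = ALPHA/t2); both are bumped every 100 iterations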
  deltas = []
  for it in range(20000):
    if it % 100 == 0:
      t += 10e-3
      t2 += 0.01
    if it % 1000 == 0:
      print("it:", it)
    alpha = ALPHA / t2

    # instead of 'generating' an episode, we will PLAY
    # an episode within this loop
    s = (2, 0) # start state
    grid.set_state(s)

    # get Q(s) so we can choose the first action
    Qs = getQs(model, s)

    # the first (s, r) tuple is the state we start in and 0
    # (since we don't get a reward) for simply starting the game
    # the last (s, r) tuple is the terminal state and the final reward
    # the value for the terminal state is by definition 0, so we don't
    # care about updating it.
    a = max_dict(Qs)[0]
    a = random_action(a, eps=0.5/t) # epsilon-greedy
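    # track the largest single-step change in theta during this episode
    # as a rough measure of convergence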
    biggest_change = 0
    while not grid.game_over():
      r = grid.move(a)
      s2 = grid.current_state()

      # we need the next action as well since Q(s,a) depends on Q(s',a')
      # if s2 not in policy then it's a terminal state, all Q are 0
      old_theta = model.theta.copy()
      if grid.is_terminal(s2):
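        # the target is just r because the value of the terminal state is 0 by definition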
        model.theta += alpha*(r - model.predict(s, a))*model.grad(s, a)
      else:
        # not terminal
        Qs2 = getQs(model, s2)
        a2 = max_dict(Qs2)[0]
        a2 = random_action(a2, eps=0.5/t) # epsilon-greedy

        # we will update Q(s,a) AS we experience the episode
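        # semi-gradient SARSA update:
        #   theta <- theta + alpha*[r + gamma*Q(s',a') - Q(s,a)]*grad(Q(s,a))
        # (the TD target is treated as a constant when taking the gradient,
        # hence "semi-gradient")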
        model.theta += alpha*(r + GAMMA*model.predict(s2, a2) - model.predict(s, a))*model.grad(s, a)

        # next state becomes current state
        s = s2
        a = a2

      biggest_change = max(biggest_change, np.abs(model.theta - old_theta).sum())
    deltas.append(biggest_change)

  plt.plot(deltas)
  plt.show()

  # determine the policy from Q*
  # find V* from Q*
  policy = {}
  V = {}
  Q = {}
  for s in grid.actions.keys():
    Qs = getQs(model, s)
    Q[s] = Qs
    a, max_q = max_dict(Qs)
    policy[s] = a
    V[s] = max_q

| 180 | + print "values:" |
| 181 | + print_values(V, grid) |
| 182 | + print "policy:" |
| 183 | + print_policy(policy, grid) |