# https://deeplearningcourses.com/c/deep-reinforcement-learning-in-python
# https://www.udemy.com/deep-reinforcement-learning-in-python
from __future__ import print_function, division
from builtins import range
# Note: you may need to update your version of future
# sudo pip install -U future

import gym
import os
import sys
import random
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
from gym import wrappers
from datetime import datetime
# note: scipy.misc.imresize was deprecated and removed in SciPy 1.3,
# so this script needs an older scipy (or a swap to PIL / cv2 resizing)
from scipy.misc import imresize

if '../cartpole' not in sys.path:
  sys.path.append('../cartpole')
from q_learning_bins import plot_running_avg

# constants
IM_WIDTH = 80
IM_HEIGHT = 80


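# Frame preprocessing: raw Breakout frames from Gym are 210x160x3 RGB images.
# We crop away the scoreboard, convert to grayscale, and shrink to 80x80 so the
# stacked 4-frame input to the network stays small.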
def downsample_image(A):
  B = A[31:195] # select the important parts of the image
  B = B.mean(axis=2) # convert to grayscale

  # downsample image
  # changing aspect ratio doesn't significantly distort the image
  # nearest neighbor interpolation produces a much sharper image
  # than default bilinear
  B = imresize(B, size=(IM_HEIGHT, IM_WIDTH), interp='nearest')

  # scale to 0..1 *after* resizing -- imresize returns uint8 in 0..255,
  # so dividing beforehand would just be undone
  return B / 255.0


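# DQN: a convolutional Q-network that maps a stack of 4 preprocessed frames to
# one Q-value per action. Each instance owns its own replay buffer; we create
# two instances ('main' and 'target') and periodically copy the main network's
# weights into the target network, the standard DQN stabilization trick.
#
# The Q-value of the action actually taken is picked out with a one-hot mask.
# As a rough numpy illustration (not part of the graph):
#   q = np.array([[0.1, 0.5, 0.2, 0.0]])   # Q-values for K=4 actions
#   a = np.array([1])                      # action taken
#   (q * np.eye(4)[a]).sum(axis=1)         # -> array([0.5])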
class DQN:
  def __init__(self, K, conv_layer_sizes, hidden_layer_sizes, gamma, scope, max_experiences=500000, min_experiences=50000, batch_sz=32):
    self.K = K
    self.scope = scope

    with tf.variable_scope(scope):

      # inputs and targets
      self.X = tf.placeholder(tf.float32, shape=(None, 4, IM_HEIGHT, IM_WIDTH), name='X')
      # tensorflow convolution needs the order to be:
      # (num_samples, height, width, "color")
      # so we need to transpose later
      self.G = tf.placeholder(tf.float32, shape=(None,), name='G')
      self.actions = tf.placeholder(tf.int32, shape=(None,), name='actions')

      # calculate output and cost
      # convolutional layers
      # these built-in layers are faster and don't require us to
      # calculate the size of the output of the final conv layer!
      Z = self.X
      Z = tf.transpose(Z, [0, 2, 3, 1])
      # note: the third element of each tuple is passed as the conv stride
      # (tf.contrib.layers.conv2d does no pooling)
      for num_output_filters, filtersz, poolsz in conv_layer_sizes:
        Z = tf.contrib.layers.conv2d(
          Z,
          num_output_filters,
          filtersz,
          poolsz, # stride
          activation_fn=tf.nn.relu
        )

      # fully connected layers
      Z = tf.contrib.layers.flatten(Z)
      for M in hidden_layer_sizes:
        Z = tf.contrib.layers.fully_connected(Z, M)

      # final output layer
      self.predict_op = tf.contrib.layers.fully_connected(Z, K)

      selected_action_values = tf.reduce_sum(
        self.predict_op * tf.one_hot(self.actions, K),
        axis=[1]
      )

      cost = tf.reduce_sum(tf.square(self.G - selected_action_values))
      # self.train_op = tf.train.AdamOptimizer(10e-3).minimize(cost)
      # self.train_op = tf.train.AdagradOptimizer(10e-3).minimize(cost)
      self.train_op = tf.train.RMSPropOptimizer(2.5e-4, decay=0.99, epsilon=10e-3).minimize(cost)
      # self.train_op = tf.train.MomentumOptimizer(10e-4, momentum=0.9).minimize(cost)
      # self.train_op = tf.train.GradientDescentOptimizer(10e-5).minimize(cost)

    # create replay memory
    self.experience = []
    self.max_experiences = max_experiences
    self.min_experiences = min_experiences
    self.batch_sz = batch_sz
    self.gamma = gamma

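  # copy_from: pull another DQN's current weights (the 'main' network) into
  # this one. Variables are matched by sorting names within each scope, which
  # assumes both networks were built identically.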
  def copy_from(self, other):
    mine = [t for t in tf.trainable_variables() if t.name.startswith(self.scope)]
    mine = sorted(mine, key=lambda v: v.name)
    theirs = [t for t in tf.trainable_variables() if t.name.startswith(other.scope)]
    theirs = sorted(theirs, key=lambda v: v.name)

    ops = []
    for p, q in zip(mine, theirs):
      actual = self.session.run(q)
      op = p.assign(actual)
      ops.append(op)

    self.session.run(ops)

  def set_session(self, session):
    self.session = session

  def predict(self, X):
    return self.session.run(self.predict_op, feed_dict={self.X: X})

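  # train: one Q-learning step on a random minibatch. The TD target is
  #   G = r + gamma * max_a' Q_target(s', a')
  # computed with the (periodically frozen) target network. Terminal states are
  # not masked here; the -200 reward assigned at the end of an episode in
  # play_one() stands in for that.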
  def train(self, target_network):
    # sample a random batch from buffer, do an iteration of GD
    if len(self.experience) < self.min_experiences:
      # don't do anything if we don't have enough experience
      return

    # randomly select a batch
    sample = random.sample(self.experience, self.batch_sz)
    states, actions, rewards, next_states = map(np.array, zip(*sample))
    next_Q = np.max(target_network.predict(next_states), axis=1)
    targets = [r + self.gamma*next_q for r, next_q in zip(rewards, next_Q)]

    # call optimizer
    self.session.run(
      self.train_op,
      feed_dict={
        self.X: states,
        self.G: targets,
        self.actions: actions
      }
    )

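  # add_experience: FIFO replay buffer -- drop the oldest transition once the
  # buffer is full, then append the new (s, a, r, s') tuple.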
  def add_experience(self, s, a, r, s2):
    if len(self.experience) >= self.max_experiences:
      self.experience.pop(0)
    self.experience.append((s, a, r, s2))

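  # sample_action: epsilon-greedy -- with probability eps take a random action,
  # otherwise take the greedy action under the current Q-values.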
  def sample_action(self, x, eps):
    if np.random.random() < eps:
      return np.random.choice(self.K)
    else:
      return np.argmax(self.predict([x])[0])


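# update_state: maintain a sliding window of the last 4 preprocessed frames.
# A single frame carries no motion information (e.g. ball direction), so the
# network is given a short history instead.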
def update_state(state, observation):
  # downsample and grayscale observation
  observation_small = downsample_image(observation)
  state.append(observation_small)
  if len(state) > 4:
    state.pop(0)


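# play_one: run a single episode. At every step we store the transition in the
# replay buffer, do one training step on the main network, decay epsilon, and
# every copy_period steps sync the target network with the main network.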
def play_one(env, model, tmodel, eps, eps_step, gamma, copy_period):
  observation = env.reset()
  done = False
  totalreward = 0
  iters = 0
  state = []
  prev_state = []
  update_state(state, observation) # add the first observation
  while not done and iters < 2000:
    # if we reach 2000, just quit, don't want this going forever
    # the 200 limit seems a bit early

    if len(state) < 4:
      # we can't choose an action based on model
      action = env.action_space.sample()
    else:
      action = model.sample_action(state, eps)

    # copy state to prev state
    prev_state.append(state[-1])
    if len(prev_state) > 4:
      prev_state.pop(0)

    # perform the action
    observation, reward, done, info = env.step(action)

    # add the new frame to the state
    update_state(state, observation)

    totalreward += reward
    if done:
      reward = -200

    # update the model
    # store copies (prev_state and state are mutated in place later) and only
    # full 4-frame states, so np.array stacking in train() gets a fixed shape
    if len(state) == 4 and len(prev_state) == 4:
      model.add_experience(list(prev_state), action, reward, list(state))
    model.train(tmodel)

    iters += 1
    eps = max(eps - eps_step, 0.1)

    if iters % copy_period == 0:
      tmodel.copy_from(model)

  return totalreward, eps, iters


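# main: build the main and target networks, then train for N episodes.
# The conv spec (num_filters, filter_size, stride) and the 512-unit hidden
# layer follow the architecture used in the DeepMind DQN papers.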
def main():
  env = gym.make('Breakout-v0')
  gamma = 0.99
  copy_period = 10000

  D = len(env.observation_space.sample())
  K = env.action_space.n
  conv_sizes = [(32, 8, 4), (64, 4, 2), (64, 3, 1)]
  hidden_sizes = [512]
  model = DQN(K, conv_sizes, hidden_sizes, gamma, scope='main')
  tmodel = DQN(K, conv_sizes, hidden_sizes, gamma, scope='target')
  init = tf.global_variables_initializer()
  session = tf.InteractiveSession()
  session.run(init)
  model.set_session(session)
  tmodel.set_session(session)


  if 'monitor' in sys.argv:
    filename = os.path.basename(__file__).split('.')[0]
    monitor_dir = './' + filename + '_' + str(datetime.now())
    env = wrappers.Monitor(env, monitor_dir)


  N = 100000
  totalrewards = np.empty(N)
  costs = np.empty(N)
  n_max = 500000 # last step to decrease epsilon
  eps_step = 0.9 / n_max
  eps = 1.0
  for n in range(N):
    t0 = datetime.now()
    totalreward, eps, num_steps = play_one(env, model, tmodel, eps, eps_step, gamma, copy_period)
    totalrewards[n] = totalreward
    if n % 1 == 0:
      print("episode:", n, "total reward:", totalreward, "eps:", "%.3f" % eps, "num steps:", num_steps, "episode duration:", (datetime.now() - t0), "avg reward (last 100):", "%.3f" % totalrewards[max(0, n-100):(n+1)].mean())

  print("avg reward for last 100 episodes:", totalrewards[-100:].mean())
  print("total rewards:", totalrewards.sum())

  plt.plot(totalrewards)
  plt.title("Rewards")
  plt.show()

  plot_running_avg(totalrewards)


if __name__ == '__main__':
  main()