
Commit ddbda89

initial commit rl2
1 parent 99ce309 commit ddbda89

18 files changed, +3081 -0 lines changed

rl2/atari/dqn_tf.py

+256
@@ -0,0 +1,256 @@
# https://deeplearningcourses.com/c/deep-reinforcement-learning-in-python
# https://www.udemy.com/deep-reinforcement-learning-in-python
from __future__ import print_function, division
from builtins import range
# Note: you may need to update your version of future
# sudo pip install -U future

import gym
import os
import sys
import random
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
from gym import wrappers
from datetime import datetime
from scipy.misc import imresize

if '../cartpole' not in sys.path:
  sys.path.append('../cartpole')
from q_learning_bins import plot_running_avg

# constants
IM_WIDTH = 80
IM_HEIGHT = 80


def downsample_image(A):
  B = A[31:195] # select the important parts of the image
  B = B.mean(axis=2) # convert to grayscale
  B = B / 255.0 # scale to 0..1

  # downsample image
  # changing aspect ratio doesn't significantly distort the image
  # nearest neighbor interpolation produces a much sharper image
  # than default bilinear
  B = imresize(B, size=(IM_HEIGHT, IM_WIDTH), interp='nearest')
  return B


class DQN:
  def __init__(self, K, conv_layer_sizes, hidden_layer_sizes, gamma, scope, max_experiences=500000, min_experiences=50000, batch_sz=32):
    self.K = K
    self.scope = scope

    with tf.variable_scope(scope):

      # inputs and targets
      self.X = tf.placeholder(tf.float32, shape=(None, 4, IM_HEIGHT, IM_WIDTH), name='X')
      # tensorflow convolution needs the order to be:
      # (num_samples, height, width, "color")
      # so we need to transpose later
      self.G = tf.placeholder(tf.float32, shape=(None,), name='G')
      self.actions = tf.placeholder(tf.int32, shape=(None,), name='actions')

      # calculate output and cost
      # convolutional layers
      # these built-in layers are faster and don't require us to
      # calculate the size of the output of the final conv layer!
      Z = self.X
      Z = tf.transpose(Z, [0, 2, 3, 1])
      for num_output_filters, filtersz, poolsz in conv_layer_sizes:
        Z = tf.contrib.layers.conv2d(
          Z,
          num_output_filters,
          filtersz,
          poolsz,
          activation_fn=tf.nn.relu
        )

      # fully connected layers
      Z = tf.contrib.layers.flatten(Z)
      for M in hidden_layer_sizes:
        Z = tf.contrib.layers.fully_connected(Z, M)

      # final output layer
      self.predict_op = tf.contrib.layers.fully_connected(Z, K)

      selected_action_values = tf.reduce_sum(
        self.predict_op * tf.one_hot(self.actions, K),
        reduction_indices=[1]
      )

      cost = tf.reduce_sum(tf.square(self.G - selected_action_values))
      # self.train_op = tf.train.AdamOptimizer(10e-3).minimize(cost)
      # self.train_op = tf.train.AdagradOptimizer(10e-3).minimize(cost)
      self.train_op = tf.train.RMSPropOptimizer(2.5e-4, decay=0.99, epsilon=10e-3).minimize(cost)
      # self.train_op = tf.train.MomentumOptimizer(10e-4, momentum=0.9).minimize(cost)
      # self.train_op = tf.train.GradientDescentOptimizer(10e-5).minimize(cost)

    # create replay memory
    self.experience = []
    self.max_experiences = max_experiences
    self.min_experiences = min_experiences
    self.batch_sz = batch_sz
    self.gamma = gamma

  def copy_from(self, other):
    # copy the weights of another network with the same architecture
    # (used to update the target network from the main network)
    mine = [t for t in tf.trainable_variables() if t.name.startswith(self.scope)]
    mine = sorted(mine, key=lambda v: v.name)
    theirs = [t for t in tf.trainable_variables() if t.name.startswith(other.scope)]
    theirs = sorted(theirs, key=lambda v: v.name)

    ops = []
    for p, q in zip(mine, theirs):
      actual = self.session.run(q)
      op = p.assign(actual)
      ops.append(op)

    self.session.run(ops)

  def set_session(self, session):
    self.session = session

  def predict(self, X):
    return self.session.run(self.predict_op, feed_dict={self.X: X})

  def train(self, target_network):
    # sample a random batch from buffer, do an iteration of GD
    if len(self.experience) < self.min_experiences:
      # don't do anything if we don't have enough experience
      return

    # randomly select a batch
    sample = random.sample(self.experience, self.batch_sz)
    states, actions, rewards, next_states = map(np.array, zip(*sample))
    next_Q = np.max(target_network.predict(next_states), axis=1)
    targets = [r + self.gamma*next_q for r, next_q in zip(rewards, next_Q)]

    # call optimizer
    self.session.run(
      self.train_op,
      feed_dict={
        self.X: states,
        self.G: targets,
        self.actions: actions
      }
    )

  def add_experience(self, s, a, r, s2):
    if len(self.experience) >= self.max_experiences:
      self.experience.pop(0)
    self.experience.append((s, a, r, s2))

  def sample_action(self, x, eps):
    if np.random.random() < eps:
      return np.random.choice(self.K)
    else:
      return np.argmax(self.predict([x])[0])


def update_state(state, observation):
  # downsample and grayscale observation
  observation_small = downsample_image(observation)
  state.append(observation_small)
  if len(state) > 4:
    state.pop(0)


def play_one(env, model, tmodel, eps, eps_step, gamma, copy_period):
  observation = env.reset()
  done = False
  totalreward = 0
  iters = 0
  state = []
  prev_state = []
  update_state(state, observation) # add the first observation
  while not done and iters < 2000:
    # if we reach 2000, just quit, don't want this going forever
    # the 200 limit seems a bit early

    if len(state) < 4:
      # we can't choose an action based on model
      action = env.action_space.sample()
    else:
      action = model.sample_action(state, eps)

    # copy state to prev state
    prev_state.append(state[-1])
    if len(prev_state) > 4:
      prev_state.pop(0)

    # perform the action
    observation, reward, done, info = env.step(action)

    # add the new frame to the state
    update_state(state, observation)

    totalreward += reward
    if done:
      reward = -200

    # update the model
    # store copies of the frame lists -- state and prev_state are mutated
    # in place every iteration, so storing the lists themselves would
    # corrupt the replay buffer; also wait until both are full 4-frame stacks
    if len(prev_state) == 4 and len(state) == 4:
      model.add_experience(list(prev_state), action, reward, list(state))
    model.train(tmodel)

    iters += 1
    eps = max(eps - eps_step, 0.1)

    if iters % copy_period == 0:
      tmodel.copy_from(model)

  return totalreward, eps, iters


def main():
  env = gym.make('Breakout-v0')
  gamma = 0.99
  copy_period = 10000

  D = len(env.observation_space.sample())
  K = env.action_space.n
  conv_sizes = [(32, 8, 4), (64, 4, 2), (64, 3, 1)]
  hidden_sizes = [512]
  model = DQN(K, conv_sizes, hidden_sizes, gamma, scope='main')
  tmodel = DQN(K, conv_sizes, hidden_sizes, gamma, scope='target')
  init = tf.global_variables_initializer()
  session = tf.InteractiveSession()
  session.run(init)
  model.set_session(session)
  tmodel.set_session(session)


  if 'monitor' in sys.argv:
    filename = os.path.basename(__file__).split('.')[0]
    monitor_dir = './' + filename + '_' + str(datetime.now())
    env = wrappers.Monitor(env, monitor_dir)


  N = 100000
  totalrewards = np.empty(N)
  costs = np.empty(N)
  n_max = 500000 # last step to decrease epsilon
  eps_step = 0.9 / n_max
  eps = 1.0
  for n in range(N):
    t0 = datetime.now()
    totalreward, eps, num_steps = play_one(env, model, tmodel, eps, eps_step, gamma, copy_period)
    totalrewards[n] = totalreward
    if n % 1 == 0:
      print("episode:", n, "total reward:", totalreward, "eps:", "%.3f" % eps, "num steps:", num_steps, "episode duration:", (datetime.now() - t0), "avg reward (last 100):", "%.3f" % totalrewards[max(0, n-100):(n+1)].mean())

  print("avg reward for last 100 episodes:", totalrewards[-100:].mean())
  print("total rewards:", totalrewards.sum())

  plt.plot(totalrewards)
  plt.title("Rewards")
  plt.show()

  plot_running_avg(totalrewards)


if __name__ == '__main__':
  main()
