Commit 7b03674

restructure

1 parent 820c263 commit 7b03674

4 files changed: +651 -0 lines changed

@@ -0,0 +1,309 @@
import os
import numpy as np
import tensorflow as tf
from tensorflow.initializers import random_uniform

class OUActionNoise(object):
    def __init__(self, mu, sigma=0.15, theta=.2, dt=1e-2, x0=None):
        self.theta = theta
        self.mu = mu
        self.sigma = sigma
        self.dt = dt
        self.x0 = x0
        self.reset()

    def __call__(self):
        x = self.x_prev + self.theta * (self.mu - self.x_prev) * self.dt + \
            self.sigma * np.sqrt(self.dt) * np.random.normal(size=self.mu.shape)
        self.x_prev = x
        return x

    def reset(self):
        self.x_prev = self.x0 if self.x0 is not None else np.zeros_like(self.mu)

    def __repr__(self):
        return 'OrnsteinUhlenbeckActionNoise(mu={}, sigma={})'.format(
            self.mu, self.sigma)

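# Note on OUActionNoise above: __call__ is the Euler-Maruyama discretization of
# the Ornstein-Uhlenbeck process,
#     x_{t+1} = x_t + theta * (mu - x_t) * dt + sigma * sqrt(dt) * N(0, 1),
# which gives temporally correlated exploration noise for continuous actions.
# For reference, the DDPG paper (Lillicrap et al., 2015) uses theta=0.15 and
# sigma=0.2; the defaults above have those two values swapped.
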
class ReplayBuffer(object):
    def __init__(self, max_size, input_shape, n_actions):
        self.mem_size = max_size
        self.mem_cntr = 0
        self.state_memory = np.zeros((self.mem_size, *input_shape))
        self.new_state_memory = np.zeros((self.mem_size, *input_shape))
        self.action_memory = np.zeros((self.mem_size, n_actions))
        self.reward_memory = np.zeros(self.mem_size)
        self.terminal_memory = np.zeros(self.mem_size, dtype=np.float32)

    def store_transition(self, state, action, reward, state_, done):
        index = self.mem_cntr % self.mem_size
        self.state_memory[index] = state
        self.new_state_memory[index] = state_
        self.action_memory[index] = action
        self.reward_memory[index] = reward
        self.terminal_memory[index] = 1 - done
        self.mem_cntr += 1

    def sample_buffer(self, batch_size):
        max_mem = min(self.mem_cntr, self.mem_size)

        batch = np.random.choice(max_mem, batch_size)

        states = self.state_memory[batch]
        actions = self.action_memory[batch]
        rewards = self.reward_memory[batch]
        states_ = self.new_state_memory[batch]
        terminal = self.terminal_memory[batch]

        return states, actions, rewards, states_, terminal

class Actor(object):
    def __init__(self, lr, n_actions, name, input_dims, sess, fc1_dims,
                 fc2_dims, action_bound, batch_size=64, chkpt_dir='tmp/ddpg'):
        self.lr = lr
        self.n_actions = n_actions
        self.name = name
        self.fc1_dims = fc1_dims
        self.fc2_dims = fc2_dims
        self.chkpt_dir = chkpt_dir
        self.input_dims = input_dims
        self.batch_size = batch_size
        self.sess = sess
        self.action_bound = action_bound
        self.build_network()
        self.params = tf.trainable_variables(scope=self.name)
        self.saver = tf.train.Saver()
        self.checkpoint_file = os.path.join(chkpt_dir, name + '_ddpg.ckpt')

        self.unnormalized_actor_gradients = tf.gradients(
            self.mu, self.params, -self.action_gradient)

        self.actor_gradients = list(map(lambda x: tf.div(x, self.batch_size),
                                        self.unnormalized_actor_gradients))

        self.optimize = tf.train.AdamOptimizer(self.lr).\
            apply_gradients(zip(self.actor_gradients, self.params))

    def build_network(self):
        with tf.variable_scope(self.name):
            self.input = tf.placeholder(tf.float32,
                                        shape=[None, *self.input_dims],
                                        name='inputs')

            self.action_gradient = tf.placeholder(tf.float32,
                                                  shape=[None, self.n_actions],
                                                  name='gradients')

            f1 = 1. / np.sqrt(self.fc1_dims)
            dense1 = tf.layers.dense(self.input, units=self.fc1_dims,
                                     kernel_initializer=random_uniform(-f1, f1),
                                     bias_initializer=random_uniform(-f1, f1))
            batch1 = tf.layers.batch_normalization(dense1)
            layer1_activation = tf.nn.relu(batch1)

            f2 = 1. / np.sqrt(self.fc2_dims)
            dense2 = tf.layers.dense(layer1_activation, units=self.fc2_dims,
                                     kernel_initializer=random_uniform(-f2, f2),
                                     bias_initializer=random_uniform(-f2, f2))
            batch2 = tf.layers.batch_normalization(dense2)
            layer2_activation = tf.nn.relu(batch2)

            f3 = 0.003
            mu = tf.layers.dense(layer2_activation, units=self.n_actions,
                                 activation='tanh',
                                 kernel_initializer=random_uniform(-f3, f3),
                                 bias_initializer=random_uniform(-f3, f3))
            self.mu = tf.multiply(mu, self.action_bound)

    def predict(self, inputs):
        return self.sess.run(self.mu, feed_dict={self.input: inputs})

    def train(self, inputs, gradients):
        self.sess.run(self.optimize,
                      feed_dict={self.input: inputs,
                                 self.action_gradient: gradients})

    def load_checkpoint(self):
        print("...Loading checkpoint...")
        self.saver.restore(self.sess, self.checkpoint_file)

    def save_checkpoint(self):
        print("...Saving checkpoint...")
        self.saver.save(self.sess, self.checkpoint_file)

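# Note on the Actor above: the gradient wiring in __init__ implements the sampled
# deterministic policy gradient,
#     grad_theta J ~ (1/N) * sum_i grad_a Q(s_i, a)|_{a=mu(s_i)} * grad_theta mu(s_i).
# tf.gradients(self.mu, self.params, -self.action_gradient) feeds -dQ/da in as the
# upstream gradient, so handing the result to AdamOptimizer (a minimizer) performs
# gradient ascent on Q; the division by batch_size averages over the minibatch.
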
class Critic(object):
    def __init__(self, lr, n_actions, name, input_dims, sess, fc1_dims, fc2_dims,
                 batch_size=64, chkpt_dir='tmp/ddpg'):
        self.lr = lr
        self.n_actions = n_actions
        self.name = name
        self.fc1_dims = fc1_dims
        self.fc2_dims = fc2_dims
        self.chkpt_dir = chkpt_dir
        self.input_dims = input_dims
        self.batch_size = batch_size
        self.sess = sess
        self.build_network()
        self.params = tf.trainable_variables(scope=self.name)
        self.saver = tf.train.Saver()
        self.checkpoint_file = os.path.join(chkpt_dir, name + '_ddpg.ckpt')

        self.optimize = tf.train.AdamOptimizer(self.lr).minimize(self.loss)

        self.action_gradients = tf.gradients(self.q, self.actions)

    def build_network(self):
        with tf.variable_scope(self.name):
            self.input = tf.placeholder(tf.float32,
                                        shape=[None, *self.input_dims],
                                        name='inputs')

            self.actions = tf.placeholder(tf.float32,
                                          shape=[None, self.n_actions],
                                          name='actions')

            self.q_target = tf.placeholder(tf.float32,
                                           shape=[None, 1],
                                           name='targets')

            f1 = 1. / np.sqrt(self.fc1_dims)
            dense1 = tf.layers.dense(self.input, units=self.fc1_dims,
                                     kernel_initializer=random_uniform(-f1, f1),
                                     bias_initializer=random_uniform(-f1, f1))
            batch1 = tf.layers.batch_normalization(dense1)
            layer1_activation = tf.nn.relu(batch1)

            f2 = 1. / np.sqrt(self.fc2_dims)
            dense2 = tf.layers.dense(layer1_activation, units=self.fc2_dims,
                                     kernel_initializer=random_uniform(-f2, f2),
                                     bias_initializer=random_uniform(-f2, f2))
            batch2 = tf.layers.batch_normalization(dense2)

            action_in = tf.layers.dense(self.actions, units=self.fc2_dims,
                                        activation='relu')
            state_actions = tf.add(batch2, action_in)
            state_actions = tf.nn.relu(state_actions)

            f3 = 0.003
            self.q = tf.layers.dense(state_actions, units=1,
                                     kernel_initializer=random_uniform(-f3, f3),
                                     bias_initializer=random_uniform(-f3, f3),
                                     kernel_regularizer=tf.keras.regularizers.l2(0.01))

            self.loss = tf.losses.mean_squared_error(self.q_target, self.q)

    def predict(self, inputs, actions):
        return self.sess.run(self.q,
                             feed_dict={self.input: inputs,
                                        self.actions: actions})

    def train(self, inputs, actions, q_target):
        return self.sess.run(self.optimize,
                             feed_dict={self.input: inputs,
                                        self.actions: actions,
                                        self.q_target: q_target})

    def get_action_gradients(self, inputs, actions):
        return self.sess.run(self.action_gradients,
                             feed_dict={self.input: inputs,
                                        self.actions: actions})

    def load_checkpoint(self):
        print("...Loading checkpoint...")
        self.saver.restore(self.sess, self.checkpoint_file)

    def save_checkpoint(self):
        print("...Saving checkpoint...")
        self.saver.save(self.sess, self.checkpoint_file)

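# Note on the Agent below: learn() follows the standard DDPG recipe
# (Lillicrap et al., 2015):
#   1. sample a minibatch (s, a, r, s', done) from the replay buffer;
#   2. form critic targets y = r + gamma * Q'(s', mu'(s')) from the target
#      networks, with the bootstrap term masked at terminal states (the buffer
#      stores 1 - done for exactly this purpose);
#   3. train the critic on the mean squared error (y - Q(s, a))^2 and the actor
#      on the action gradients returned by Critic.get_action_gradients;
#   4. soft-update the target networks: theta' <- tau * theta + (1 - tau) * theta'.
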
class Agent(object):
    def __init__(self, alpha, beta, input_dims, tau, env, gamma=0.99, n_actions=2,
                 max_size=1000000, layer1_size=400, layer2_size=300,
                 batch_size=64):
        self.gamma = gamma
        self.tau = tau
        self.memory = ReplayBuffer(max_size, input_dims, n_actions)
        self.batch_size = batch_size
        self.sess = tf.Session()
        self.actor = Actor(alpha, n_actions, 'Actor', input_dims, self.sess,
                           layer1_size, layer2_size, env.action_space.high)
        self.critic = Critic(beta, n_actions, 'Critic', input_dims, self.sess,
                             layer1_size, layer2_size)

        self.target_actor = Actor(alpha, n_actions, 'TargetActor',
                                  input_dims, self.sess, layer1_size,
                                  layer2_size, env.action_space.high)
        self.target_critic = Critic(beta, n_actions, 'TargetCritic', input_dims,
                                    self.sess, layer1_size, layer2_size)

        self.noise = OUActionNoise(mu=np.zeros(n_actions))

        # define the update ops here in __init__; rebuilding them on every call
        # would keep adding nodes to the graph, so the time to execute the op
        # would increase with each execution.
        self.update_critic = \
            [self.target_critic.params[i].assign(
                tf.multiply(self.critic.params[i], self.tau)
                + tf.multiply(self.target_critic.params[i], 1. - self.tau))
             for i in range(len(self.target_critic.params))]

        self.update_actor = \
            [self.target_actor.params[i].assign(
                tf.multiply(self.actor.params[i], self.tau)
                + tf.multiply(self.target_actor.params[i], 1. - self.tau))
             for i in range(len(self.target_actor.params))]

        # one-time hard copies used to initialize the target networks; the soft
        # ops above bake self.tau into the graph, so they cannot do a full copy.
        self.hard_update_critic = \
            [self.target_critic.params[i].assign(self.critic.params[i])
             for i in range(len(self.target_critic.params))]
        self.hard_update_actor = \
            [self.target_actor.params[i].assign(self.actor.params[i])
             for i in range(len(self.target_actor.params))]

        self.sess.run(tf.global_variables_initializer())

        self.update_network_parameters(first=True)

    def update_network_parameters(self, first=False):
        if first:
            # start the target networks as exact copies of the online networks
            self.target_critic.sess.run(self.hard_update_critic)
            self.target_actor.sess.run(self.hard_update_actor)
        else:
            self.target_critic.sess.run(self.update_critic)
            self.target_actor.sess.run(self.update_actor)

    def remember(self, state, action, reward, new_state, done):
        self.memory.store_transition(state, action, reward, new_state, done)

    def choose_action(self, state):
        state = state[np.newaxis, :]
        mu = self.actor.predict(state)  # returns a batch of one action
        noise = self.noise()
        mu_prime = mu + noise

        return mu_prime[0]

    def learn(self):
        if self.memory.mem_cntr < self.batch_size:
            return
        state, action, reward, new_state, done = \
            self.memory.sample_buffer(self.batch_size)

        critic_value_ = self.target_critic.predict(
            new_state, self.target_actor.predict(new_state))
        target = []
        for j in range(self.batch_size):
            # done holds 1 - terminal, so the bootstrap term vanishes at episode ends
            target.append(reward[j] + self.gamma*critic_value_[j]*done[j])
        target = np.reshape(target, (self.batch_size, 1))

        _ = self.critic.train(state, action, target)

        a_outs = self.actor.predict(state)
        grads = self.critic.get_action_gradients(state, a_outs)

        self.actor.train(state, grads[0])

        self.update_network_parameters()

    def save_models(self):
        self.actor.save_checkpoint()
        self.target_actor.save_checkpoint()
        self.critic.save_checkpoint()
        self.target_critic.save_checkpoint()

    def load_models(self):
        self.actor.load_checkpoint()
        self.target_actor.load_checkpoint()
        self.critic.load_checkpoint()
        self.target_critic.load_checkpoint()
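
For reference, a minimal training-loop sketch for the Agent class above. It assumes OpenAI Gym, a continuous-action environment (LunarLanderContinuous-v2, which matches the default n_actions=2 and has 8-dimensional observations), and that the file above is importable as ddpg_tf; the module name and hyperparameters are illustrative, not part of the commit.

# minimal usage sketch -- the module name ddpg_tf, the environment and the
# hyperparameters are assumptions, not part of the commit
import gym
import numpy as np
from ddpg_tf import Agent  # hypothetical module name for the file above

env = gym.make('LunarLanderContinuous-v2')
agent = Agent(alpha=0.0001, beta=0.001, input_dims=[8], tau=0.001, env=env,
              batch_size=64, layer1_size=400, layer2_size=300, n_actions=2)

score_history = []
for episode in range(1000):
    obs = env.reset()
    done = False
    score = 0
    while not done:
        act = agent.choose_action(obs)
        new_state, reward, done, info = env.step(act)
        agent.remember(obs, act, reward, new_state, int(done))
        agent.learn()  # no-op until the buffer holds at least one batch
        score += reward
        obs = new_state
    score_history.append(score)
    print('episode', episode, 'score %.2f' % score,
          '100-game average %.2f' % np.mean(score_history[-100:]))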
