
Commit 04d9f09

improve performance
1 parent ed018b4 commit 04d9f09

3 files changed (+166, -5 lines)


contents/9_Deep_Deterministic_Policy_Gradient_DDPG/DDPG.py

Lines changed: 7 additions & 2 deletions
@@ -13,6 +13,8 @@
 import tensorflow as tf
 import numpy as np
 import gym
+import time
+

 np.random.seed(1)
 tf.set_random_seed(1)
@@ -27,7 +29,7 @@
 REPLACEMENT = [
     dict(name='soft', tau=0.01),
     dict(name='hard', rep_iter_a=600, rep_iter_c=500)
-][1]            # you can try different target replacement strategies
+][0]            # you can try different target replacement strategies
 MEMORY_CAPACITY = 10000
 BATCH_SIZE = 32

@@ -225,6 +227,7 @@ def sample(self, n):

 var = 3  # control exploration

+t1 = time.time()
 for i in range(MAX_EPISODES):
     s = env.reset()
     ep_reward = 0
@@ -259,4 +262,6 @@ def sample(self, n):
             print('Episode:', i, ' Reward: %i' % int(ep_reward), 'Explore: %.2f' % var, )
             if ep_reward > -300:
                 RENDER = True
-            break
+            break
+
+print('Running time: ', time.time()-t1)
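
Note: besides the timing instrumentation, the functional change in DDPG.py is the REPLACEMENT index: [...][1] selected the 'hard' strategy, while [...][0] selects the 'soft' strategy with tau=0.01. As a rough illustration of the difference (a plain-NumPy sketch with made-up weight arrays, not the TensorFlow ops DDPG.py actually builds):

    import numpy as np

    tau = 0.01                    # from dict(name='soft', tau=0.01)
    target_w = np.zeros(4)        # stand-in for one target-network weight
    eval_w = np.ones(4)           # stand-in for the matching eval-network weight

    # soft replacement: blend a small fraction of the eval weights
    # into the target on every learn() call
    target_w = (1 - tau) * target_w + tau * eval_w

    # hard replacement: copy the eval weights wholesale
    # every rep_iter_a / rep_iter_c learn() calls
    step, rep_iter_a = 600, 600
    if step % rep_iter_a == 0:
        target_w = eval_w.copy()

Soft replacement moves the target a little on every update, which generally gives smoother target values than periodic hard copies.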

contents/9_Deep_Deterministic_Policy_Gradient_DDPG/DDPG_update.py

Lines changed: 6 additions & 3 deletions
@@ -13,6 +13,8 @@
 import tensorflow as tf
 import numpy as np
 import gym
+import time
+

 ##################### hyper parameters ####################

@@ -35,7 +37,6 @@ def __init__(self, a_dim, s_dim, a_bound,):
         self.memory = np.zeros((MEMORY_CAPACITY, s_dim * 2 + a_dim + 1), dtype=np.float32)
         self.pointer = 0
         self.sess = tf.Session()
-        self.a_replace_counter, self.c_replace_counter = 0, 0

         self.a_dim, self.s_dim, self.a_bound = a_dim, s_dim, a_bound,
         self.S = tf.placeholder(tf.float32, [None, s_dim], 's')
@@ -122,6 +123,7 @@ def _build_c(self, s, a, scope, trainable):
 ddpg = DDPG(a_dim, s_dim, a_bound)

 var = 3  # control exploration
+t1 = time.time()
 for i in range(MAX_EPISODES):
     s = env.reset()
     ep_reward = 0
@@ -144,5 +146,6 @@ def _build_c(self, s, a, scope, trainable):
         ep_reward += r
         if j == MAX_EP_STEPS-1:
             print('Episode:', i, ' Reward: %i' % int(ep_reward), 'Explore: %.2f' % var, )
-            if ep_reward > -300:RENDER = True
-            break
+            # if ep_reward > -300:RENDER = True
+            break
+print('Running time: ', time.time() - t1)
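
Note: both modified scripts now record wall-clock time with t1 = time.time() before the episode loop and print the elapsed 'Running time' at the end. A reusable variant of the same measurement (a hypothetical helper, not part of this commit) could wrap the loop in a context manager:

    import time
    from contextlib import contextmanager

    @contextmanager
    def timed(label='Running time: '):
        start = time.time()              # same wall-clock measurement the scripts use
        try:
            yield
        finally:
            print(label, time.time() - start)

    # usage sketch:
    # with timed():
    #     for i in range(MAX_EPISODES):
    #         ...                        # training loop body unchanged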
Lines changed: 153 additions & 0 deletions (new file)
@@ -0,0 +1,153 @@
"""
Deep Deterministic Policy Gradient (DDPG), Reinforcement Learning.
DDPG is Actor Critic based algorithm.
Pendulum example.

View more on my tutorial page: https://morvanzhou.github.io/tutorials/

Using:
tensorflow 1.0
gym 0.8.0
"""

import tensorflow as tf
import numpy as np
import gym
import time


##################### hyper parameters ####################

MAX_EPISODES = 200
MAX_EP_STEPS = 200
LR_A = 0.001    # learning rate for actor
LR_C = 0.002    # learning rate for critic
GAMMA = 0.9     # reward discount
TAU = 0.01      # soft replacement
MEMORY_CAPACITY = 10000
BATCH_SIZE = 32

RENDER = False
ENV_NAME = 'Pendulum-v0'


############################### DDPG ####################################


class DDPG(object):
    def __init__(self, a_dim, s_dim, a_bound,):
        self.memory = np.zeros((MEMORY_CAPACITY, s_dim * 2 + a_dim + 1), dtype=np.float32)
        self.pointer = 0
        self.sess = tf.Session()

        self.a_dim, self.s_dim, self.a_bound = a_dim, s_dim, a_bound,
        self.S = tf.placeholder(tf.float32, [None, s_dim], 's')
        self.S_ = tf.placeholder(tf.float32, [None, s_dim], 's_')
        self.R = tf.placeholder(tf.float32, [None, 1], 'r')

        ema = tf.train.ExponentialMovingAverage(decay=1 - TAU)

        def ema_getter(getter, name, *args, **kwargs):
            return ema.average(getter(name, *args, **kwargs))

        self.a = self._build_a(self.S,)
        a_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Actor')

        # assign self.a = a in memory when calculating q for td_error,
        # otherwise the self.a is from Actor when updating Actor
        q = self._build_c(self.S, self.a,)
        c_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Critic')

        target_update = [ema.apply(a_params), ema.apply(c_params)]
        a_ = self._build_a(self.S_, reuse=True, custom_getter=ema_getter)
        q_ = self._build_c(self.S_, a_, reuse=True, custom_getter=ema_getter)

        with tf.control_dependencies(target_update):
            q_target = self.R + GAMMA * q_
            # in the feed_dict for the td_error, the self.a should change to actions in memory
            td_error = tf.losses.mean_squared_error(labels=q_target, predictions=q)
            a_loss = - tf.reduce_mean(q)    # maximize the q
            self.atrain = tf.train.AdamOptimizer(LR_A).minimize(a_loss, var_list=a_params)
            self.ctrain = tf.train.AdamOptimizer(LR_C).minimize(td_error, var_list=c_params)

        self.sess.run(tf.global_variables_initializer())

    def choose_action(self, s):
        return self.sess.run(self.a, {self.S: s[np.newaxis, :]})[0]

    def learn(self):
        indices = np.random.choice(MEMORY_CAPACITY, size=BATCH_SIZE)
        bt = self.memory[indices, :]
        bs = bt[:, :self.s_dim]
        ba = bt[:, self.s_dim: self.s_dim + self.a_dim]
        br = bt[:, -self.s_dim - 1: -self.s_dim]
        bs_ = bt[:, -self.s_dim:]

        self.sess.run(self.atrain, {self.S: bs})
        self.sess.run(self.ctrain, {self.S: bs, self.a: ba, self.R: br, self.S_: bs_})

    def store_transition(self, s, a, r, s_):
        transition = np.hstack((s, a, [r], s_))
        index = self.pointer % MEMORY_CAPACITY  # replace the old memory with new memory
        self.memory[index, :] = transition
        self.pointer += 1

    def _build_a(self, s, reuse=None, custom_getter=None):
        trainable = True if reuse is None else False
        with tf.variable_scope('Actor', reuse=reuse, custom_getter=custom_getter):
            net = tf.layers.dense(s, 30, activation=tf.nn.relu, name='l1', trainable=trainable)
            a = tf.layers.dense(net, self.a_dim, activation=tf.nn.tanh, name='a', trainable=trainable)
            return tf.multiply(a, self.a_bound, name='scaled_a')

    def _build_c(self, s, a, reuse=None, custom_getter=None):
        trainable = True if reuse is None else False
        with tf.variable_scope('Critic', reuse=reuse, custom_getter=custom_getter):
            n_l1 = 30
            w1_s = tf.get_variable('w1_s', [self.s_dim, n_l1], trainable=trainable)
            w1_a = tf.get_variable('w1_a', [self.a_dim, n_l1], trainable=trainable)
            b1 = tf.get_variable('b1', [1, n_l1], trainable=trainable)
            net = tf.nn.relu(tf.matmul(s, w1_s) + tf.matmul(a, w1_a) + b1)
            return tf.layers.dense(net, 1, trainable=trainable)  # Q(s,a)


############################### training ####################################


env = gym.make(ENV_NAME)
env = env.unwrapped
env.seed(1)

s_dim = env.observation_space.shape[0]
a_dim = env.action_space.shape[0]
a_bound = env.action_space.high

ddpg = DDPG(a_dim, s_dim, a_bound)

var = 3  # control exploration
t1 = time.time()
for i in range(MAX_EPISODES):
    s = env.reset()
    ep_reward = 0
    for j in range(MAX_EP_STEPS):
        if RENDER:
            env.render()

        # Add exploration noise
        a = ddpg.choose_action(s)
        a = np.clip(np.random.normal(a, var), -2, 2)    # add randomness to action selection for exploration
        s_, r, done, info = env.step(a)

        ddpg.store_transition(s, a, r / 10, s_)

        if ddpg.pointer > MEMORY_CAPACITY:
            var *= .9995    # decay the action randomness
            ddpg.learn()

        s = s_
        ep_reward += r
        if j == MAX_EP_STEPS-1:
            print('Episode:', i, ' Reward: %i' % int(ep_reward), 'Explore: %.2f' % var, )
            # if ep_reward > -300:RENDER = True
            break

print('Running time: ', time.time() - t1)
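
Note: the new file drops the explicit target networks of the earlier scripts. It rebuilds the target actor and critic by reusing the 'Actor'/'Critic' variable scopes with a custom_getter that returns the ExponentialMovingAverage (shadow) copy of each weight, and the tf.control_dependencies(target_update) block makes every atrain/ctrain step refresh those shadow copies first. A minimal, self-contained TF 1.x sketch of that mechanism (a single toy variable, names chosen only for illustration):

    import tensorflow as tf

    TAU = 0.01
    ema = tf.train.ExponentialMovingAverage(decay=1 - TAU)

    def ema_getter(getter, name, *args, **kwargs):
        # hand back the shadow (moving-average) variable instead of the live one
        return ema.average(getter(name, *args, **kwargs))

    with tf.variable_scope('net'):
        w = tf.get_variable('w', initializer=1.0)    # "online" weight

    target_update = ema.apply([w])                   # creates/updates the shadow copy of w

    with tf.variable_scope('net', reuse=True, custom_getter=ema_getter):
        w_target = tf.get_variable('w')              # reads the shadow copy, not w itself

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        sess.run(tf.assign(w, 2.0))                  # pretend a training step changed w
        sess.run(target_update)                      # shadow <- (1 - TAU) * shadow + TAU * w
        print(sess.run([w, w_target]))               # ~[2.0, 1.01]: the target lags softly behind

The toy example ends with the online weight at 2.0 and its shadow near 1.01, the same tau=0.01 soft tracking that DDPG.py selects above, but with no second set of target variables to build or copy.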

0 commit comments
