Commit 835ebfd

Morvan Zhou authored and committed
update ppo
1 parent b55125a commit 835ebfd

File tree

1 file changed: +24, -26 lines


contents/12_Proximal_Policy_Optimization/simply_PPO.py

Lines changed: 24 additions & 26 deletions
@@ -26,6 +26,7 @@
 BATCH = 32
 A_UPDATE_STEPS = 10
 C_UPDATE_STEPS = 10
+S_DIM, A_DIM = 3, 1
 METHOD = [
     dict(name='kl_pen', kl_target=0.01, lam=0.5),   # KL penalty
     dict(name='clip', epsilon=0.2),                 # Clipped surrogate objective, find this is better
@@ -34,12 +35,9 @@

 class PPO(object):

-    def __init__(self, s_dim, a_dim,):
-        self.a_dim = a_dim
-        self.s_dim = s_dim
+    def __init__(self):
         self.sess = tf.Session()
-
-        self.tfs = tf.placeholder(tf.float32, [None, s_dim], 'state')
+        self.tfs = tf.placeholder(tf.float32, [None, S_DIM], 'state')

         # critic
         with tf.variable_scope('critic'):
@@ -53,24 +51,24 @@ def __init__(self, s_dim, a_dim,):
         # actor
         pi, pi_params = self._build_anet('pi', trainable=True)
         oldpi, oldpi_params = self._build_anet('oldpi', trainable=False)
-        self.sample_op = tf.squeeze(pi.sample(1), axis=0)  # choosing action
+        with tf.variable_scope('sample_action'):
+            self.sample_op = tf.squeeze(pi.sample(1), axis=0)  # choosing action
         with tf.variable_scope('update_oldpi'):
             self.update_oldpi_op = [oldp.assign(p) for p, oldp in zip(pi_params, oldpi_params)]

-        self.tfa = tf.placeholder(tf.float32, [None, a_dim], 'action')
+        self.tfa = tf.placeholder(tf.float32, [None, A_DIM], 'action')
         self.tfadv = tf.placeholder(tf.float32, [None, 1], 'advantage')
-        with tf.variable_scope('surrogate'):
-            # ratio = tf.exp(pi.log_prob(self.tfa) - oldpi.log_prob(self.tfa))
-            ratio = pi.prob(self.tfa) / oldpi.prob(self.tfa)
-            surr = ratio * self.tfadv
-        if METHOD['name'] == 'kl_pen':
-            self.tflam = tf.placeholder(tf.float32, None, 'lambda')
-            with tf.variable_scope('loss'):
+        with tf.variable_scope('loss'):
+            with tf.variable_scope('surrogate'):
+                # ratio = tf.exp(pi.log_prob(self.tfa) - oldpi.log_prob(self.tfa))
+                ratio = pi.prob(self.tfa) / oldpi.prob(self.tfa)
+                surr = ratio * self.tfadv
+            if METHOD['name'] == 'kl_pen':
+                self.tflam = tf.placeholder(tf.float32, None, 'lambda')
                 kl = tf.stop_gradient(kl_divergence(oldpi, pi))
                 self.kl_mean = tf.reduce_mean(kl)
                 self.aloss = -(tf.reduce_mean(surr - self.tflam * kl))
-        else:   # clipping method, find this is better
-            with tf.variable_scope('loss'):
+            else:   # clipping method, find this is better
                 self.aloss = -tf.reduce_mean(tf.minimum(
                     surr,
                     tf.clip_by_value(ratio, 1.-METHOD['epsilon'], 1.+METHOD['epsilon'])*self.tfadv))
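
Note (annotation, not part of the commit): the hunk above gathers the surrogate under one 'loss' scope, which builds either the KL-penalized surrogate or the clipped surrogate objective from the PPO paper, selected via METHOD. A minimal NumPy sketch of what those two objectives compute, using made-up ratio/advantage numbers:

import numpy as np

ratio = np.array([0.8, 1.0, 1.3])    # pi(a|s) / oldpi(a|s), hypothetical values
adv = np.array([1.0, -0.5, 2.0])     # advantage estimates, hypothetical values
epsilon, lam, kl = 0.2, 0.5, 0.02    # clip range, KL weight, mean KL (illustrative)

surr = ratio * adv

# KL-penalty objective (the graph above negates it into a loss to minimize)
kl_pen_obj = np.mean(surr - lam * kl)

# Clipped surrogate objective (the 'clip' branch above)
clip_obj = np.mean(np.minimum(surr, np.clip(ratio, 1 - epsilon, 1 + epsilon) * adv))

print(kl_pen_obj, clip_obj)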
@@ -82,14 +80,14 @@ def __init__(self, s_dim, a_dim,):

         self.sess.run(tf.global_variables_initializer())

-    def update(self, s, a, r, m=20, b=10):
+    def update(self, s, a, r):
         self.sess.run(self.update_oldpi_op)
         adv = self.sess.run(self.advantage, {self.tfs: s, self.tfdc_r: r})
         # adv = (adv - adv.mean())/(adv.std()+1e-6)     # sometimes helpful

         # update actor
         if METHOD['name'] == 'kl_pen':
-            for _ in range(m):
+            for _ in range(A_UPDATE_STEPS):
                 _, kl = self.sess.run(
                     [self.atrain_op, self.kl_mean],
                     {self.tfs: s, self.tfa: a, self.tfadv: adv, self.tflam: METHOD['lam']})
@@ -101,16 +99,16 @@ def update(self, s, a, r, m=20, b=10):
                 METHOD['lam'] *= 2
             METHOD['lam'] = np.clip(METHOD['lam'], 1e-4, 10)    # some time explode, this is my method
         else:   # clipping method, find this is better (OpenAI's paper)
-            [self.sess.run(self.atrain_op, {self.tfs: s, self.tfa: a, self.tfadv: adv}) for _ in range(m)]
+            [self.sess.run(self.atrain_op, {self.tfs: s, self.tfa: a, self.tfadv: adv}) for _ in range(A_UPDATE_STEPS)]

         # update critic
-        [self.sess.run(self.ctrain_op, {self.tfs: s, self.tfdc_r: r}) for _ in range(b)]
+        [self.sess.run(self.ctrain_op, {self.tfs: s, self.tfdc_r: r}) for _ in range(C_UPDATE_STEPS)]

     def _build_anet(self, name, trainable):
         with tf.variable_scope(name):
             l1 = tf.layers.dense(self.tfs, 100, tf.nn.relu, trainable=trainable)
-            mu = 2 * tf.layers.dense(l1, self.a_dim, tf.nn.tanh, trainable=trainable)
-            sigma = tf.layers.dense(l1, self.a_dim, tf.nn.softplus, trainable=trainable)
+            mu = 2 * tf.layers.dense(l1, A_DIM, tf.nn.tanh, trainable=trainable)
+            sigma = tf.layers.dense(l1, A_DIM, tf.nn.softplus, trainable=trainable)
             norm_dist = Normal(loc=mu, scale=sigma)
             params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=name)
             return norm_dist, params
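
Note (annotation, not part of the commit): for the 'kl_pen' method, update() now runs A_UPDATE_STEPS actor steps, measures the mean KL, then rescales METHOD['lam'] and clips it to [1e-4, 10] (visible in the context lines above). A small pure-Python sketch of that adaptive rule; the 1.5 thresholds are taken from the PPO paper's adaptive-KL variant, not from this diff:

def adapt_lambda(lam, kl, kl_target):
    # Adaptive KL penalty coefficient; thresholds assumed from the PPO paper, not shown in the diff.
    if kl < kl_target / 1.5:
        lam /= 2            # policy barely moved: weaken the KL penalty
    elif kl > kl_target * 1.5:
        lam *= 2            # policy moved too far: strengthen the KL penalty
    return min(max(lam, 1e-4), 10)   # mirrors np.clip(METHOD['lam'], 1e-4, 10) above

print(adapt_lambda(0.5, kl=0.02, kl_target=0.01))   # KL above target range -> lam doubled to 1.0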
@@ -125,14 +123,14 @@ def get_v(self, s):
         return self.sess.run(self.v, {self.tfs: s})[0, 0]

 env = gym.make('Pendulum-v0').unwrapped
-ppo = PPO(s_dim=3, a_dim=1)
+ppo = PPO()
 all_ep_r = []

 for ep in range(EP_MAX):
     s = env.reset()
     buffer_s, buffer_a, buffer_r = [], [], []
     ep_r = 0
-    for t in range(1, EP_LEN):    # one episode
+    for t in range(EP_LEN):    # in one episode
         env.render()
         a = ppo.choose_action(s)
         s_, r, done, _ = env.step(a)
@@ -143,7 +141,7 @@ def get_v(self, s):
         ep_r += r

         # update ppo
-        if t % (BATCH-1) == 0 or t == EP_LEN-1:
+        if (t+1) % BATCH == 0 or t == EP_LEN-1:
             v_s_ = ppo.get_v(s_)
             discounted_r = []
             for r in buffer_r[::-1]:
for r in buffer_r[::-1]:
@@ -153,7 +151,7 @@ def get_v(self, s):
153151

154152
bs, ba, br = np.vstack(buffer_s), np.vstack(buffer_a), np.array(discounted_r)[:, np.newaxis]
155153
buffer_s, buffer_a, buffer_r = [], [], []
156-
ppo.update(bs, ba, br, m=A_UPDATE_STEPS, b=C_UPDATE_STEPS)
154+
ppo.update(bs, ba, br)
157155
if ep == 0: all_ep_r.append(ep_r)
158156
else: all_ep_r.append(all_ep_r[-1]*0.9 + ep_r*0.1)
159157
print(
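
Note (annotation, not part of the commit): with the corrected trigger, the loop calls ppo.update(bs, ba, br) every BATCH-th step ((t+1) % BATCH == 0) and at episode end, after converting buffer_r into discounted returns bootstrapped from ppo.get_v(s_). A self-contained sketch of that return computation; the discount factor and reward values here are illustrative:

def discounted_returns(rewards, v_last, gamma=0.9):
    # Walk the reward buffer backwards from the critic's value of the last state,
    # as the training loop above does before calling ppo.update().
    out, v = [], v_last
    for r in reversed(rewards):
        v = r + gamma * v
        out.append(v)
    out.reverse()
    return out

print(discounted_returns([1.0, 0.0, -1.0], v_last=0.5))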
