BATCH = 32
A_UPDATE_STEPS = 10
C_UPDATE_STEPS = 10
+S_DIM, A_DIM = 3, 1
METHOD = [
    dict(name='kl_pen', kl_target=0.01, lam=0.5),   # KL penalty
    dict(name='clip', epsilon=0.2),                 # Clipped surrogate objective; we find this works better
][1]        # choose the method for optimization


class PPO(object):

-    def __init__(self, s_dim, a_dim,):
-        self.a_dim = a_dim
-        self.s_dim = s_dim
+    def __init__(self):
        self.sess = tf.Session()
-
-        self.tfs = tf.placeholder(tf.float32, [None, s_dim], 'state')
+        self.tfs = tf.placeholder(tf.float32, [None, S_DIM], 'state')

        # critic
        with tf.variable_scope('critic'):
@@ -53,24 +51,24 @@ def __init__(self, s_dim, a_dim,):
        # actor
        pi, pi_params = self._build_anet('pi', trainable=True)
        oldpi, oldpi_params = self._build_anet('oldpi', trainable=False)
-        self.sample_op = tf.squeeze(pi.sample(1), axis=0)       # choosing action
+        with tf.variable_scope('sample_action'):
+            self.sample_op = tf.squeeze(pi.sample(1), axis=0)       # choosing action
        with tf.variable_scope('update_oldpi'):
            self.update_oldpi_op = [oldp.assign(p) for p, oldp in zip(pi_params, oldpi_params)]

-        self.tfa = tf.placeholder(tf.float32, [None, a_dim], 'action')
+        self.tfa = tf.placeholder(tf.float32, [None, A_DIM], 'action')
        self.tfadv = tf.placeholder(tf.float32, [None, 1], 'advantage')
-        with tf.variable_scope('surrogate'):
-            # ratio = tf.exp(pi.log_prob(self.tfa) - oldpi.log_prob(self.tfa))
-            ratio = pi.prob(self.tfa) / oldpi.prob(self.tfa)
-            surr = ratio * self.tfadv
-        if METHOD['name'] == 'kl_pen':
-            self.tflam = tf.placeholder(tf.float32, None, 'lambda')
-            with tf.variable_scope('loss'):
+        with tf.variable_scope('loss'):
+            with tf.variable_scope('surrogate'):
+                # ratio = tf.exp(pi.log_prob(self.tfa) - oldpi.log_prob(self.tfa))
+                ratio = pi.prob(self.tfa) / oldpi.prob(self.tfa)
+                surr = ratio * self.tfadv
+            if METHOD['name'] == 'kl_pen':
+                self.tflam = tf.placeholder(tf.float32, None, 'lambda')
                kl = tf.stop_gradient(kl_divergence(oldpi, pi))
                self.kl_mean = tf.reduce_mean(kl)
                self.aloss = -(tf.reduce_mean(surr - self.tflam * kl))
-        else:   # clipping method, we find this works better
-            with tf.variable_scope('loss'):
+            else:   # clipping method, we find this works better
                self.aloss = -tf.reduce_mean(tf.minimum(
                    surr,
                    tf.clip_by_value(ratio, 1.-METHOD['epsilon'], 1.+METHOD['epsilon'])*self.tfadv))
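
As a brief aside (not part of the commit), a minimal NumPy sketch of the clipped surrogate objective that the graph code above builds; ratio, adv and epsilon are illustrative stand-ins:

import numpy as np

def clipped_surrogate_loss(ratio, adv, epsilon=0.2):
    # PPO clip objective: element-wise minimum of the unclipped and clipped
    # ratio terms, with the mean negated so the result can be minimized
    surr = ratio * adv
    clipped = np.clip(ratio, 1. - epsilon, 1. + epsilon) * adv
    return -np.mean(np.minimum(surr, clipped))

# example: probability ratios pi/oldpi for a small batch and their advantages
print(clipped_surrogate_loss(np.array([0.8, 1.3, 1.0]), np.array([1.0, -0.5, 2.0])))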
@@ -82,14 +80,14 @@ def __init__(self, s_dim, a_dim,):

        self.sess.run(tf.global_variables_initializer())

-    def update(self, s, a, r, m=20, b=10):
+    def update(self, s, a, r):
        self.sess.run(self.update_oldpi_op)
        adv = self.sess.run(self.advantage, {self.tfs: s, self.tfdc_r: r})
        # adv = (adv - adv.mean())/(adv.std()+1e-6)     # sometimes helpful

        # update actor
        if METHOD['name'] == 'kl_pen':
-            for _ in range(m):
+            for _ in range(A_UPDATE_STEPS):
                _, kl = self.sess.run(
                    [self.atrain_op, self.kl_mean],
                    {self.tfs: s, self.tfa: a, self.tfadv: adv, self.tflam: METHOD['lam']})
@@ -101,16 +99,16 @@ def update(self, s, a, r, m=20, b=10):
                METHOD['lam'] *= 2
            METHOD['lam'] = np.clip(METHOD['lam'], 1e-4, 10)    # lam sometimes explodes, this is my fix
        else:   # clipping method, we find this works better (OpenAI's paper)
-            [self.sess.run(self.atrain_op, {self.tfs: s, self.tfa: a, self.tfadv: adv}) for _ in range(m)]
+            [self.sess.run(self.atrain_op, {self.tfs: s, self.tfa: a, self.tfadv: adv}) for _ in range(A_UPDATE_STEPS)]

        # update critic
-        [self.sess.run(self.ctrain_op, {self.tfs: s, self.tfdc_r: r}) for _ in range(b)]
+        [self.sess.run(self.ctrain_op, {self.tfs: s, self.tfdc_r: r}) for _ in range(C_UPDATE_STEPS)]

    def _build_anet(self, name, trainable):
        with tf.variable_scope(name):
            l1 = tf.layers.dense(self.tfs, 100, tf.nn.relu, trainable=trainable)
-            mu = 2 * tf.layers.dense(l1, self.a_dim, tf.nn.tanh, trainable=trainable)
-            sigma = tf.layers.dense(l1, self.a_dim, tf.nn.softplus, trainable=trainable)
+            mu = 2 * tf.layers.dense(l1, A_DIM, tf.nn.tanh, trainable=trainable)
+            sigma = tf.layers.dense(l1, A_DIM, tf.nn.softplus, trainable=trainable)
            norm_dist = Normal(loc=mu, scale=sigma)
        params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=name)
        return norm_dist, params
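
As another aside (not part of the commit), a standalone sketch of the adaptive KL-penalty coefficient update performed in the 'kl_pen' branch: the coefficient shrinks when the measured KL falls well below the target, grows when it overshoots, and is clipped so it cannot explode. Only the doubling branch and the final clip are visible in the hunk above; the 1.5 thresholds follow the adaptive-KL rule from the PPO paper and are an assumption here:

import numpy as np

def adapt_kl_coef(lam, kl_measured, kl_target=0.01):
    # shrink the penalty when KL is far below target, grow it when far above
    if kl_measured < kl_target / 1.5:
        lam /= 2
    elif kl_measured > kl_target * 1.5:
        lam *= 2
    return np.clip(lam, 1e-4, 10)   # keep the coefficient in a sane range

print(adapt_kl_coef(lam=0.5, kl_measured=0.03))   # KL too large -> penalty doubled to 1.0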
@@ -125,14 +123,14 @@ def get_v(self, s):
        return self.sess.run(self.v, {self.tfs: s})[0, 0]

env = gym.make('Pendulum-v0').unwrapped
-ppo = PPO(s_dim=3, a_dim=1)
+ppo = PPO()
all_ep_r = []

for ep in range(EP_MAX):
    s = env.reset()
    buffer_s, buffer_a, buffer_r = [], [], []
    ep_r = 0
-    for t in range(1, EP_LEN):    # one episode
+    for t in range(EP_LEN):    # in one episode
        env.render()
        a = ppo.choose_action(s)
        s_, r, done, _ = env.step(a)
@@ -143,7 +141,7 @@ def get_v(self, s):
        ep_r += r

        # update ppo
-        if t % (BATCH - 1) == 0 or t == EP_LEN - 1:
+        if (t + 1) % BATCH == 0 or t == EP_LEN - 1:
            v_s_ = ppo.get_v(s_)
            discounted_r = []
            for r in buffer_r[::-1]:
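
For reference (not part of the commit), a self-contained sketch of the bootstrapped discounted-return computation this loop performs: rewards are scanned backwards, each folded into the value estimate of the state after the batch, and the list is reversed back into time order. GAMMA, buffer_r and v_s_ are stand-in example values:

GAMMA = 0.9                     # stand-in discount factor
buffer_r = [0.1, -0.2, 0.3]     # example rewards collected in the batch
v_s_ = 1.5                      # critic's value estimate for the next state

discounted_r = []
for r in buffer_r[::-1]:        # walk the rewards backwards
    v_s_ = r + GAMMA * v_s_     # bootstrap: r_t + gamma * return_{t+1}
    discounted_r.append(v_s_)
discounted_r.reverse()          # back to time order

print(discounted_r)             # regression targets for the critic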
@@ -153,7 +151,7 @@ def get_v(self, s):

            bs, ba, br = np.vstack(buffer_s), np.vstack(buffer_a), np.array(discounted_r)[:, np.newaxis]
            buffer_s, buffer_a, buffer_r = [], [], []
-            ppo.update(bs, ba, br, m=A_UPDATE_STEPS, b=C_UPDATE_STEPS)
+            ppo.update(bs, ba, br)
    if ep == 0: all_ep_r.append(ep_r)
    else: all_ep_r.append(all_ep_r[-1]*0.9 + ep_r*0.1)
    print(