 """
 A simple version of OpenAI's Proximal Policy Optimization (PPO). [http://adsabs.harvard.edu/abs/2017arXiv170706347S]
+
 Distribute workers in parallel to collect data, then stop the workers' roll-outs and train PPO on the collected data.
-Restart workers once PPO is updated. I think A3C may be faster than this version of PPO, because this PPO has to stop
-parallel data collection for training.
+Restart workers once PPO is updated.
+
+The global PPO updating rule is adopted from DeepMind's paper (DPPO):
+Emergence of Locomotion Behaviours in Rich Environments (Google DeepMind): [http://adsabs.harvard.edu/abs/2017arXiv170702286H]
 
 View more on my tutorial website: https://morvanzhou.github.io/tutorials
 
@@ -15,28 +18,26 @@
 from tensorflow.contrib.distributions import Normal
 import numpy as np
 import matplotlib.pyplot as plt
-import gym, threading
-from queue import Queue
+import gym, threading, queue
 
-EP_MAX = 600
+EP_MAX = 1000
 EP_LEN = 200
-N_WORKER = 3
-GAMMA = 0.9
-A_LR = 0.0001
-C_LR = 0.0002
-ROLL_OUT_STEP = 32
-UPDATE_STEP = 10
-EPSILON = 0.2               # Clipped surrogate objective
-S_DIM, A_DIM = 3, 1
+N_WORKER = 4                # parallel workers
+GAMMA = 0.9                 # reward discount factor
+A_LR = 0.0001               # learning rate for actor
+C_LR = 0.001                # learning rate for critic
+MIN_BATCH_SIZE = 64         # minimum batch size for updating PPO
+UPDATE_STEP = 5             # loop update operation n-steps
+EPSILON = 0.2               # for clipping the surrogate objective
+GAME = 'Pendulum-v0'
+S_DIM, A_DIM = 3, 1         # state and action dimension
 
 
 class PPO(object):
-    def __init__(self, s_dim, a_dim,):
-        self.a_dim = a_dim
-        self.s_dim = s_dim
+    def __init__(self):
         self.sess = tf.Session()
 
-        self.tfs = tf.placeholder(tf.float32, [None, s_dim], 'state')
+        self.tfs = tf.placeholder(tf.float32, [None, S_DIM], 'state')
 
         # critic
         l1 = tf.layers.dense(self.tfs, 100, tf.nn.relu)
@@ -52,7 +53,7 @@ def __init__(self, s_dim, a_dim,):
         self.sample_op = tf.squeeze(pi.sample(1), axis=0)       # choosing action
         self.update_oldpi_op = [oldp.assign(p) for p, oldp in zip(pi_params, oldpi_params)]
 
-        self.tfa = tf.placeholder(tf.float32, [None, a_dim], 'action')
+        self.tfa = tf.placeholder(tf.float32, [None, A_DIM], 'action')
         self.tfadv = tf.placeholder(tf.float32, [None, 1], 'advantage')
         # ratio = tf.exp(pi.log_prob(self.tfa) - oldpi.log_prob(self.tfa))
         ratio = pi.prob(self.tfa) / (oldpi.prob(self.tfa) + 1e-5)
@@ -65,25 +66,27 @@ def __init__(self, s_dim, a_dim,):
         self.atrain_op = tf.train.AdamOptimizer(A_LR).minimize(self.aloss)
         self.sess.run(tf.global_variables_initializer())
 
-    def update(self, coord, queue, rolling_events):
-        while not coord.should_stop():
-            if queue.full():
+    def update(self):
+        global GLOBAL_UPDATE_COUNTER
+        while not COORD.should_stop():
+            if GLOBAL_EP < EP_MAX:
+                UPDATE_EVENT.wait()                     # wait until a batch of data has been collected
                 self.sess.run(self.update_oldpi_op)     # copy pi to oldpi
-
-                data = [queue.get() for _ in range(queue.qsize())]
+                data = [QUEUE.get() for _ in range(QUEUE.qsize())]
                 data = np.vstack(data)
-                s, a, r = data[:, :self.s_dim], data[:, self.s_dim: self.s_dim + self.a_dim], data[:, -1:]
+                s, a, r = data[:, :S_DIM], data[:, S_DIM: S_DIM + A_DIM], data[:, -1:]
                 adv = self.sess.run(self.advantage, {self.tfs: s, self.tfdc_r: r})
                 [self.sess.run(self.atrain_op, {self.tfs: s, self.tfa: a, self.tfadv: adv}) for _ in range(UPDATE_STEP)]
                 [self.sess.run(self.ctrain_op, {self.tfs: s, self.tfdc_r: r}) for _ in range(UPDATE_STEP)]
-
-                [re.set() for re in rolling_events]     # set roll-out available
+                UPDATE_EVENT.clear()                    # updating finished
+                GLOBAL_UPDATE_COUNTER = 0               # reset counter
+                ROLLING_EVENT.set()                     # set roll-out available
 
     def _build_anet(self, name, trainable):
         with tf.variable_scope(name):
             l1 = tf.layers.dense(self.tfs, 200, tf.nn.relu, trainable=trainable)
-            mu = 2 * tf.layers.dense(l1, self.a_dim, tf.nn.tanh, trainable=trainable)
-            sigma = tf.layers.dense(l1, self.a_dim, tf.nn.softplus, trainable=trainable)
+            mu = 2 * tf.layers.dense(l1, A_DIM, tf.nn.tanh, trainable=trainable)
+            sigma = tf.layers.dense(l1, A_DIM, tf.nn.softplus, trainable=trainable)
             norm_dist = Normal(loc=mu, scale=sigma)
         params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=name)
         return norm_dist, params
@@ -99,83 +102,83 @@ def get_v(self, s):
 
 
 class Worker(object):
-    def __init__(self, globalPPO, roll_out_steps, wid, game, ep_len, rolling_event):
-        self.roll_out_steps = roll_out_steps
+    def __init__(self, wid):
         self.wid = wid
-        self.ep_len = ep_len
-        self.rolling_event = rolling_event
-        self.env = gym.make(game).unwrapped
-        self.ppo = globalPPO
-
-    def work(self, coord, queue,):
-        global GLOBAL_EP, GLOBAL_RUNNING_R
-        while not coord.should_stop():
+        self.env = gym.make(GAME).unwrapped
+        self.ppo = GLOBAL_PPO
+
+    def work(self):
+        global GLOBAL_EP, GLOBAL_RUNNING_R, GLOBAL_UPDATE_COUNTER
+        while not COORD.should_stop():
             s = self.env.reset()
             ep_r = 0
             buffer_s, buffer_a, buffer_r = [], [], []
-            for t in range(self.ep_len):
+            for t in range(EP_LEN):
+                if not ROLLING_EVENT.is_set():                  # while the global PPO is updating
+                    ROLLING_EVENT.wait()                        # wait until PPO is updated
+                    buffer_s, buffer_a, buffer_r = [], [], []   # clear the history buffer, collect data with the new policy
                 a = self.ppo.choose_action(s)
                 s_, r, done, _ = self.env.step(a)
                 buffer_s.append(s)
                 buffer_a.append(a)
-                buffer_r.append((r + 8) / 8)                    # normalize reward, found to be useful
+                buffer_r.append((r + 8) / 8)                    # normalize reward, found to be useful
                 s = s_
                 ep_r += r
 
-                # get update buffer
-                if (t + 1) % self.roll_out_steps == 0 or t == self.ep_len - 1:
+                GLOBAL_UPDATE_COUNTER += 1                      # count towards the minimum batch size
+                if t == EP_LEN - 1 or GLOBAL_UPDATE_COUNTER >= MIN_BATCH_SIZE:
                     v_s_ = self.ppo.get_v(s_)
-                    discounted_r = []                           # compute discounted reward
+                    discounted_r = []                           # compute discounted reward
                     for r in buffer_r[::-1]:
                         v_s_ = r + GAMMA * v_s_
                         discounted_r.append(v_s_)
                     discounted_r.reverse()
 
                     bs, ba, br = np.vstack(buffer_s), np.vstack(buffer_a), np.array(discounted_r)[:, np.newaxis]
                     buffer_s, buffer_a, buffer_r = [], [], []
-                    queue.put(np.hstack((bs, ba, br)))
-                    if GLOBAL_EP >= EP_MAX:                     # stop training
-                        coord.request_stop()
+                    QUEUE.put(np.hstack((bs, ba, br)))
+                    if GLOBAL_UPDATE_COUNTER >= MIN_BATCH_SIZE:
+                        ROLLING_EVENT.clear()                   # stop collecting data
+                        UPDATE_EVENT.set()                      # trigger the global PPO update
+
+                    if GLOBAL_EP >= EP_MAX:                     # stop training
+                        COORD.request_stop()
                         break
-                    else:
-                        self.rolling_event.clear()              # stop roll-out
-                        self.rolling_event.wait()               # stop and wait until the network is updated
 
             # record reward changes, plot later
             if len(GLOBAL_RUNNING_R) == 0: GLOBAL_RUNNING_R.append(ep_r)
             else: GLOBAL_RUNNING_R.append(GLOBAL_RUNNING_R[-1]*0.9 + ep_r*0.1)
             GLOBAL_EP += 1
-            print('W%i' % self.wid, '|Ep: %i' % GLOBAL_EP, '|Ep_r: %.2f' % ep_r,)
+            print('{0:.1f}%'.format(GLOBAL_EP/EP_MAX*100), '|W %i' % self.wid, '|Ep_r: %.2f' % ep_r,)
 
 
 if __name__ == '__main__':
-    globalPPO = PPO(S_DIM, A_DIM)
-    workers = [Worker(
-        globalPPO=globalPPO, roll_out_steps=ROLL_OUT_STEP, wid=i, game='Pendulum-v0',
-        ep_len=EP_LEN, rolling_event=threading.Event()) for i in range(N_WORKER)]
-
-    GLOBAL_EP = 0
+    GLOBAL_PPO = PPO()
+    UPDATE_EVENT, ROLLING_EVENT = threading.Event(), threading.Event()
+    UPDATE_EVENT.clear()            # no update at the start
+    ROLLING_EVENT.set()             # start to roll out
+    workers = [Worker(wid=i) for i in range(N_WORKER)]
+
+    GLOBAL_UPDATE_COUNTER, GLOBAL_EP = 0, 0
     GLOBAL_RUNNING_R = []
     COORD = tf.train.Coordinator()
-    QUEUE = Queue(maxsize=N_WORKER)
+    QUEUE = queue.Queue()
     threads = []
     for worker in workers:          # worker threads
-        t = threading.Thread(target=worker.work, args=(COORD, QUEUE))
+        t = threading.Thread(target=worker.work, args=())
         t.start()
         threads.append(t)
-    # update thread for network
-    threads.append(threading.Thread(target=globalPPO.update, args=(COORD, QUEUE, [w.rolling_event for w in workers])))
+    # add a PPO updating thread
+    threads.append(threading.Thread(target=GLOBAL_PPO.update,))
     threads[-1].start()
     COORD.join(threads)
 
-    # plot reward change
+    # plot reward change and testing
     plt.plot(np.arange(len(GLOBAL_RUNNING_R)), GLOBAL_RUNNING_R)
     plt.xlabel('Episode'); plt.ylabel('Moving reward'); plt.ion(); plt.show()
-
-    env = gym.make('Pendulum-v0')   # testing
+    env = gym.make('Pendulum-v0')
     while True:
         s = env.reset()
         for t in range(400):
             env.render()
-            a = globalPPO.choose_action(s)
-            s = env.step(a)[0]
+            s = env.step(GLOBAL_PPO.choose_action(s))[0]
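
The worker/updater handshake introduced in this commit (workers roll out while ROLLING_EVENT is set, pause once MIN_BATCH_SIZE samples are queued, and resume after the global PPO update drains the queue) can be studied on its own. The following minimal sketch shows just that coordination pattern with the environment and TensorFlow parts stripped out; the names and the fake "update" step are illustrative and are not part of this repository.

# Minimal sketch of the DPPO-style coordination used above: workers collect
# samples while ROLLING_EVENT is set; once enough samples are queued they wake
# the updater via UPDATE_EVENT and pause; the updater drains the queue and
# lets them roll out again. Names and the fake update are illustrative only.
import threading, queue, time

N_WORKER = 2
MIN_BATCH_SIZE = 8
N_UPDATES = 3                       # run a few fake updates, then stop

QUEUE = queue.Queue()
UPDATE_EVENT, ROLLING_EVENT = threading.Event(), threading.Event()
ROLLING_EVENT.set()                 # workers may roll out
STOP = threading.Event()
LOCK = threading.Lock()
counter = 0                         # samples collected since the last update

def worker(wid):
    global counter
    while not STOP.is_set():
        ROLLING_EVENT.wait()        # blocks here while the updater is running
        QUEUE.put('sample from worker %i' % wid)
        with LOCK:
            counter += 1
            if counter >= MIN_BATCH_SIZE:
                ROLLING_EVENT.clear()   # stop data collection
                UPDATE_EVENT.set()      # wake the updater
        time.sleep(0.01)

def updater():
    global counter
    for _ in range(N_UPDATES):
        UPDATE_EVENT.wait()         # wait until a batch is ready
        batch = [QUEUE.get() for _ in range(QUEUE.qsize())]
        print('updating on %i samples' % len(batch))
        with LOCK:
            counter = 0
        UPDATE_EVENT.clear()
        ROLLING_EVENT.set()         # let the workers roll out again
    STOP.set()
    ROLLING_EVENT.set()             # release any workers still waiting

threads = [threading.Thread(target=worker, args=(i,)) for i in range(N_WORKER)]
threads.append(threading.Thread(target=updater))
[t.start() for t in threads]
[t.join() for t in threads]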
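
For reference, EPSILON above parameterises PPO's clipped surrogate objective, mean(min(ratio * adv, clip(ratio, 1 - EPSILON, 1 + EPSILON) * adv)); the aloss line that builds it falls outside the hunks shown in this diff. Below is a small NumPy sketch of that objective, written for illustration rather than taken from the repository.

# Illustrative NumPy version of the clipped surrogate objective (to be maximised).
# In the file above, `ratio` comes from pi.prob / oldpi.prob and `adv` from the
# critic's advantage estimate; the arrays below are made-up toy values.
import numpy as np

EPSILON = 0.2

def clipped_surrogate(ratio, adv):
    unclipped = ratio * adv
    clipped = np.clip(ratio, 1. - EPSILON, 1. + EPSILON) * adv
    return np.mean(np.minimum(unclipped, clipped))

ratio = np.array([0.7, 1.0, 1.4])
adv = np.array([1.0, -0.5, 2.0])
print(clipped_surrogate(ratio, adv))    # the actor loss is the negative of this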