"""
Deep Deterministic Policy Gradient (DDPG), Reinforcement Learning.
DDPG is an Actor-Critic based algorithm.
Pendulum example.

View more on my tutorial page: https://morvanzhou.github.io/tutorials/

Using:
tensorflow 1.0
gym 0.8.0
"""

import tensorflow as tf
import numpy as np
import gym
import time


#####################  hyper parameters  ####################

MAX_EPISODES = 200
MAX_EP_STEPS = 200
LR_A = 0.001    # learning rate for actor
LR_C = 0.002    # learning rate for critic
GAMMA = 0.9     # reward discount
TAU = 0.01      # soft replacement
MEMORY_CAPACITY = 10000
BATCH_SIZE = 32

RENDER = False
ENV_NAME = 'Pendulum-v0'


###############################  DDPG  ####################################


class DDPG(object):
    def __init__(self, a_dim, s_dim, a_bound):
        self.memory = np.zeros((MEMORY_CAPACITY, s_dim * 2 + a_dim + 1), dtype=np.float32)
        self.pointer = 0
        self.sess = tf.Session()

        self.a_dim, self.s_dim, self.a_bound = a_dim, s_dim, a_bound
        self.S = tf.placeholder(tf.float32, [None, s_dim], 's')
        self.S_ = tf.placeholder(tf.float32, [None, s_dim], 's_')
        self.R = tf.placeholder(tf.float32, [None, 1], 'r')

        ema = tf.train.ExponentialMovingAverage(decay=1 - TAU)

        def ema_getter(getter, name, *args, **kwargs):
            return ema.average(getter(name, *args, **kwargs))
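
        # ema_getter returns the exponential-moving-average (shadow) copy of each
        # variable, so networks built with custom_getter=ema_getter become the
        # slowly-updated target networks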

        self.a = self._build_a(self.S,)
        a_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Actor')

        # assign self.a = a in memory when calculating q for td_error,
        # otherwise the self.a is from Actor when updating Actor
        q = self._build_c(self.S, self.a,)
        c_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Critic')

        target_update = [ema.apply(a_params), ema.apply(c_params)]
        a_ = self._build_a(self.S_, reuse=True, custom_getter=ema_getter)
        q_ = self._build_c(self.S_, a_, reuse=True, custom_getter=ema_getter)

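        # the train ops below are built under control_dependencies(target_update),
        # so every learning step also applies the EMA soft update to the target weights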
        with tf.control_dependencies(target_update):
            q_target = self.R + GAMMA * q_
            # in the feed_dict for the td_error, the self.a should change to actions in memory
            td_error = tf.losses.mean_squared_error(labels=q_target, predictions=q)
            a_loss = - tf.reduce_mean(q)    # maximize the q
            self.atrain = tf.train.AdamOptimizer(LR_A).minimize(a_loss, var_list=a_params)
            self.ctrain = tf.train.AdamOptimizer(LR_C).minimize(td_error, var_list=c_params)

        self.sess.run(tf.global_variables_initializer())

    def choose_action(self, s):
        return self.sess.run(self.a, {self.S: s[np.newaxis, :]})[0]

    def learn(self):
        indices = np.random.choice(MEMORY_CAPACITY, size=BATCH_SIZE)
        bt = self.memory[indices, :]
        bs = bt[:, :self.s_dim]
        ba = bt[:, self.s_dim: self.s_dim + self.a_dim]
        br = bt[:, -self.s_dim - 1: -self.s_dim]
        bs_ = bt[:, -self.s_dim:]
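        # each memory row stores [s, a, r, s_]; the slices above recover states,
        # actions, rewards and next states from the sampled batch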

        self.sess.run(self.atrain, {self.S: bs})
        self.sess.run(self.ctrain, {self.S: bs, self.a: ba, self.R: br, self.S_: bs_})
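        # note: the actor update above only feeds states (its actions come from the
        # Actor net inside the graph); the critic update feeds the stored actions
        # by overriding the self.a tensor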

    def store_transition(self, s, a, r, s_):
        transition = np.hstack((s, a, [r], s_))
        index = self.pointer % MEMORY_CAPACITY  # replace the old memory with new memory
        self.memory[index, :] = transition
        self.pointer += 1

    def _build_a(self, s, reuse=None, custom_getter=None):
        trainable = True if reuse is None else False
        with tf.variable_scope('Actor', reuse=reuse, custom_getter=custom_getter):
            net = tf.layers.dense(s, 30, activation=tf.nn.relu, name='l1', trainable=trainable)
            a = tf.layers.dense(net, self.a_dim, activation=tf.nn.tanh, name='a', trainable=trainable)
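            # tanh keeps the raw action in [-1, 1]; the multiply below rescales it by
            # a_bound to the environment's action range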
            return tf.multiply(a, self.a_bound, name='scaled_a')

    def _build_c(self, s, a, reuse=None, custom_getter=None):
        trainable = True if reuse is None else False
        with tf.variable_scope('Critic', reuse=reuse, custom_getter=custom_getter):
            n_l1 = 30
            w1_s = tf.get_variable('w1_s', [self.s_dim, n_l1], trainable=trainable)
            w1_a = tf.get_variable('w1_a', [self.a_dim, n_l1], trainable=trainable)
            b1 = tf.get_variable('b1', [1, n_l1], trainable=trainable)
            net = tf.nn.relu(tf.matmul(s, w1_s) + tf.matmul(a, w1_a) + b1)
            return tf.layers.dense(net, 1, trainable=trainable)  # Q(s,a)


###############################  training  ####################################


env = gym.make(ENV_NAME)
env = env.unwrapped
env.seed(1)

s_dim = env.observation_space.shape[0]
a_dim = env.action_space.shape[0]
a_bound = env.action_space.high

ddpg = DDPG(a_dim, s_dim, a_bound)

var = 3  # control exploration
t1 = time.time()
for i in range(MAX_EPISODES):
    s = env.reset()
    ep_reward = 0
    for j in range(MAX_EP_STEPS):
        if RENDER:
            env.render()

        # Add exploration noise
        a = ddpg.choose_action(s)
        a = np.clip(np.random.normal(a, var), -2, 2)    # add randomness to action selection for exploration
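        # Gaussian noise with std `var` around the deterministic action, clipped to
        # Pendulum's action range [-2, 2]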
138
+ s_ , r , done , info = env .step (a )
139
+
140
+ ddpg .store_transition (s , a , r / 10 , s_ )
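        # the reward is scaled down (r / 10) before being stored, a common trick to
        # keep the critic's targets in a small range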
141
+
142
+ if ddpg .pointer > MEMORY_CAPACITY :
143
+ var *= .9995 # decay the action randomness
144
+ ddpg .learn ()
145
+
146
+ s = s_
147
+ ep_reward += r
148
+ if j == MAX_EP_STEPS - 1 :
149
+ print ('Episode:' , i , ' Reward: %i' % int (ep_reward ), 'Explore: %.2f' % var , )
150
+ # if ep_reward > -300:RENDER = True
151
+ break
152
+
153
+ print ('Running time: ' , time .time () - t1 )
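
# A minimal sketch (not part of the original script): after training, the learned
# policy can be watched by running the deterministic actor without exploration noise.
# Uncomment to try it; it only uses objects already defined above.
# s = env.reset()
# for _ in range(MAX_EP_STEPS):
#     env.render()
#     s, r, done, _ = env.step(ddpg.choose_action(s))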