simplify code

MorvanZhou · Morvan Zhou · commit 5e76b6981d0b · 2017-08-16T18:09:10.000+10:00
diff --git a/contents/5.2_Prioritized_Replay_DQN/RL_brain.py b/contents/5.2_Prioritized_Replay_DQN/RL_brain.py
@@ -19,49 +19,38 @@ class SumTree(object):
     """
     This SumTree code is modified version and the original code is from: 
     https://github.com/jaara/AI-blog/blob/master/SumTree.py
-    
+
     Story the data with it priority in tree and data frameworks.
     """
     data_pointer = 0
 
     def __init__(self, capacity):
-        self.capacity = capacity    # for all priority values
-        self.tree = np.zeros(2*capacity - 1)
+        self.capacity = capacity  # for all priority values
+        self.tree = np.zeros(2 * capacity - 1)
         # [--------------Parent nodes-------------][-------leaves to recode priority-------]
         #             size: capacity - 1                       size: capacity
-        self.data = np.zeros(capacity, dtype=object)    # for all transitions
+        self.data = np.zeros(capacity, dtype=object)  # for all transitions
         # [--------------data frame-------------]
         #             size: capacity
 
-    def add_new_priority(self, p, data):
-        leaf_idx = self.data_pointer + self.capacity - 1
-
-        self.data[self.data_pointer] = data # update data_frame
-        self.update(leaf_idx, p)    # update tree_frame
+    def add(self, p, data):
+        tree_idx = self.data_pointer + self.capacity - 1
+        self.data[self.data_pointer] = data  # update data_frame
+        self.update(tree_idx, p)  # update tree_frame
 
         self.data_pointer += 1
         if self.data_pointer >= self.capacity:  # replace when exceed the capacity
             self.data_pointer = 0
 
     def update(self, tree_idx, p):
         change = p - self.tree[tree_idx]
-
         self.tree[tree_idx] = p
-        self._propagate_change(tree_idx, change)
-
-    def _propagate_change(self, tree_idx, change):
-        """change the sum of priority value in all parent nodes"""
-        parent_idx = (tree_idx - 1) // 2
-        self.tree[parent_idx] += change
-        if parent_idx != 0:
-            self._propagate_change(parent_idx, change)
-
-    def get_leaf(self, lower_bound):
-        leaf_idx = self._retrieve(lower_bound)  # search the max leaf priority based on the lower_bound
-        data_idx = leaf_idx - self.capacity + 1
-        return [leaf_idx, self.tree[leaf_idx], self.data[data_idx]]
+        # then propagate the change through tree
+        while tree_idx != 0:    # this method is faster than the recursive loop in the reference code
+            tree_idx = (tree_idx - 1) // 2
+            self.tree[tree_idx] += change
 
-    def _retrieve(self, lower_bound, parent_idx=0):
+    def get_leaf(self, v):
         """
         Tree structure and array storage:
 
@@ -75,32 +64,36 @@ def _retrieve(self, lower_bound, parent_idx=0):
         Array type for storing:
         [0,1,2,3,4,5,6]
         """
-        left_child_idx = 2 * parent_idx + 1
-        right_child_idx = left_child_idx + 1
+        parent_idx = 0
+        while True:     # the while loop is faster than the method in the reference code
+            cl_idx = 2 * parent_idx + 1         # this leaf's left and right kids
+            cr_idx = cl_idx + 1
+            if cl_idx >= len(self.tree):        # reach bottom, end search
+                leaf_idx = parent_idx
+                break
+            else:       # downward search, always search for a higher priority node
+                if v <= self.tree[cl_idx]:
+                    parent_idx = cl_idx
+                else:
+                    v -= self.tree[cl_idx]
+                    parent_idx = cr_idx
 
-        if left_child_idx >= len(self.tree):    # end search when no more child
-            return parent_idx
-
-        if self.tree[left_child_idx] == self.tree[right_child_idx]:
-            return self._retrieve(lower_bound, np.random.choice([left_child_idx, right_child_idx]))
-        if lower_bound <= self.tree[left_child_idx]:  # downward search, always search for a higher priority node
-            return self._retrieve(lower_bound, left_child_idx)
-        else:
-            return self._retrieve(lower_bound-self.tree[left_child_idx], right_child_idx)
+        data_idx = leaf_idx - self.capacity + 1
+        return leaf_idx, self.tree[leaf_idx], self.data[data_idx]
 
     @property
-    def root_priority(self):
-        return self.tree[0]     # the root
+    def total_p(self):
+        return self.tree[0]  # the root
 
 
-class Memory(object):   # stored as ( s, a, r, s_ ) in SumTree
+class Memory(object):  # stored as ( s, a, r, s_ ) in SumTree
     """
     This SumTree code is modified version and the original code is from:
     https://github.com/jaara/AI-blog/blob/master/Seaquest-DDQN-PER.py
     """
     epsilon = 0.01  # small amount to avoid zero priority
-    alpha = 0.6     # [0~1] convert the importance of TD error to priority
-    beta = 0.4      # importance-sampling, from initial value increasing to 1
+    alpha = 0.6  # [0~1] convert the importance of TD error to priority
+    beta = 0.4  # importance-sampling, from initial value increasing to 1
     beta_increment_per_sampling = 0.001
     abs_err_upper = 1.  # clipped abs error
 
@@ -111,37 +104,29 @@ def store(self, transition):
         max_p = np.max(self.tree.tree[-self.tree.capacity:])
         if max_p == 0:
             max_p = self.abs_err_upper
-        self.tree.add_new_priority(max_p, transition)   # set the max p for new p
+        self.tree.add(max_p, transition)   # set the max p for new p
 
     def sample(self, n):
-        batch_idx, batch_memory, ISWeights = [], [], []
-        segment = self.tree.root_priority / n
-        self.beta = np.min([1, self.beta + self.beta_increment_per_sampling])  # max = 1
+        b_idx, b_memory, ISWeights = np.empty((n,), dtype=np.int32), np.empty((n, self.tree.data[0].size)), np.empty((n, 1))
+        pri_seg = self.tree.total_p / n       # priority segment
+        self.beta = np.min([1., self.beta + self.beta_increment_per_sampling])  # max = 1
 
-        min_prob = np.min(self.tree.tree[-self.tree.capacity:]) / self.tree.root_priority
-        maxiwi = np.power(self.tree.capacity * min_prob, -self.beta)  # for later normalizing ISWeights
+        min_prob = np.min(self.tree.tree[-self.tree.capacity:]) / self.tree.total_p     # for later calculate ISweight
         for i in range(n):
-            a = segment * i
-            b = segment * (i + 1)
-            lower_bound = np.random.uniform(a, b)
-            idx, p, data = self.tree.get_leaf(lower_bound)
-            prob = p / self.tree.root_priority
-            ISWeights.append(self.tree.capacity * prob)
-            batch_idx.append(idx)
-            batch_memory.append(data)
-
-        ISWeights = np.vstack(ISWeights)
-        ISWeights = np.power(ISWeights, -self.beta) / maxiwi  # normalize
-        return batch_idx, np.vstack(batch_memory), ISWeights
-
-    def update(self, idx, error):
-        p = self._get_priority(error)
-        self.tree.update(idx, p)
-
-    def _get_priority(self, error):
-        error += self.epsilon  # avoid 0
-        clipped_error = np.clip(error, 0, self.abs_err_upper)
-        return np.power(clipped_error, self.alpha)
+            a, b = pri_seg * i, pri_seg * (i + 1)
+            v = np.random.uniform(a, b)
+            idx, p, data = self.tree.get_leaf(v)
+            prob = p / self.tree.total_p
+            ISWeights[i, 0] = np.power(prob/min_prob, -self.beta)
+            b_idx[i], b_memory[i, :] = idx, data
+        return b_idx, b_memory, ISWeights
+
+    def batch_update(self, tree_idx, abs_errors):
+        abs_errors += self.epsilon  # convert to abs and avoid 0
+        clipped_errors = np.minimum(abs_errors, self.abs_err_upper)
+        ps = np.power(clipped_errors, self.alpha)
+        for ti, p in zip(tree_idx, ps):
+            self.tree.update(ti, p)
 
 
 class DQNPrioritizedReplay:
@@ -197,15 +182,15 @@ def __init__(
         self.cost_his = []
 
     def _build_net(self):
-        def build_layers(s, c_names, n_l1, w_initializer, b_initializer):
+        def build_layers(s, c_names, n_l1, w_initializer, b_initializer, trainable):
             with tf.variable_scope('l1'):
-                w1 = tf.get_variable('w1', [self.n_features, n_l1], initializer=w_initializer, collections=c_names)
-                b1 = tf.get_variable('b1', [1, n_l1], initializer=b_initializer, collections=c_names)
+                w1 = tf.get_variable('w1', [self.n_features, n_l1], initializer=w_initializer, collections=c_names, trainable=trainable)
+                b1 = tf.get_variable('b1', [1, n_l1], initializer=b_initializer, collections=c_names,  trainable=trainable)
                 l1 = tf.nn.relu(tf.matmul(s, w1) + b1)
 
             with tf.variable_scope('l2'):
-                w2 = tf.get_variable('w2', [n_l1, self.n_actions], initializer=w_initializer, collections=c_names)
-                b2 = tf.get_variable('b2', [1, self.n_actions], initializer=b_initializer, collections=c_names)
+                w2 = tf.get_variable('w2', [n_l1, self.n_actions], initializer=w_initializer, collections=c_names,  trainable=trainable)
+                b2 = tf.get_variable('b2', [1, self.n_actions], initializer=b_initializer, collections=c_names,  trainable=trainable)
                 out = tf.matmul(l1, w2) + b2
             return out
 
@@ -219,7 +204,7 @@ def build_layers(s, c_names, n_l1, w_initializer, b_initializer):
                 ['eval_net_params', tf.GraphKeys.GLOBAL_VARIABLES], 20, \
                 tf.random_normal_initializer(0., 0.3), tf.constant_initializer(0.1)  # config of layers
 
-            self.q_eval = build_layers(self.s, c_names, n_l1, w_initializer, b_initializer)
+            self.q_eval = build_layers(self.s, c_names, n_l1, w_initializer, b_initializer, True)
 
         with tf.variable_scope('loss'):
             if self.prioritized:
@@ -234,7 +219,7 @@ def build_layers(s, c_names, n_l1, w_initializer, b_initializer):
         self.s_ = tf.placeholder(tf.float32, [None, self.n_features], name='s_')    # input
         with tf.variable_scope('target_net'):
             c_names = ['target_net_params', tf.GraphKeys.GLOBAL_VARIABLES]
-            self.q_next = build_layers(self.s_, c_names, n_l1, w_initializer, b_initializer)
+            self.q_next = build_layers(self.s_, c_names, n_l1, w_initializer, b_initializer, False)
 
     def store_transition(self, s, a, r, s_):
         if self.prioritized:    # prioritized replay
@@ -285,9 +270,7 @@ def learn(self):
                                          feed_dict={self.s: batch_memory[:, :self.n_features],
                                                     self.q_target: q_target,
                                                     self.ISWeights: ISWeights})
-            for i in range(len(tree_idx)):  # update priority
-                idx = tree_idx[i]
-                self.memory.update(idx, abs_errors[i])
+            self.memory.batch_update(tree_idx, abs_errors)     # update priority
         else:
             _, self.cost = self.sess.run([self._train_op, self.loss],
                                          feed_dict={self.s: batch_memory[:, :self.n_features],