
Commit bf92687

update a3c

1 parent fbe98b7 commit bf92687

File tree

5 files changed: +17 −8 lines changed

contents/10_A3C/A3C_continuous_action.py

Lines changed: 5 additions & 4 deletions
@@ -42,7 +42,7 @@
 
 N_S = env.observation_space.shape[0]
 N_A = env.action_space.shape[0]
-A_BOUND = [env.action_space.low, env.action_space.high]
+A_BOUND = [env.action_space.low, env.action_space.high]  # bounds of the given action space
 
 # the given Actor and Critic networks, combined to build the global_net
 class ACNet(object):

@@ -68,11 +68,12 @@ def __init__(self, scope, globalAC=None):
         with tf.name_scope('wrap_a_out'):
             mu, sigma = mu * A_BOUND[1], sigma + 1e-4
 
-        normal_dist = tf.distributions.Normal(mu, sigma)
+        normal_dist = tf.distributions.Normal(mu, sigma)  # normal distribution
 
+        # action selection is done through the probability density of this distribution
         with tf.name_scope('a_loss'):  # compute the actor's loss
-            log_prob = normal_dist.log_prob(self.a_his)
-            exp_v = log_prob * tf.stop_gradient(td)
+            log_prob = normal_dist.log_prob(self.a_his)  # get the log probability density
+            exp_v = log_prob * tf.stop_gradient(td)  # stop backprop through the TD error
             entropy = normal_dist.entropy()  # encourage exploration
             self.exp_v = ENTROPY_BETA * entropy + exp_v
             self.a_loss = tf.reduce_mean(-self.exp_v)
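The actor objective in this hunk weights the Gaussian log density of the taken action by the TD error (held constant by tf.stop_gradient) and adds an entropy bonus. A minimal NumPy sketch of the same arithmetic, with made-up values for mu, sigma, the action a and the TD error (none of them taken from the training code):

import numpy as np

ENTROPY_BETA = 0.01                      # hypothetical entropy weight
mu, sigma = 0.5, 1.0                     # policy output for one state
a, td = 0.3, 2.0                         # sampled action and its TD error

# log probability density of the Gaussian policy at the taken action
log_prob = -0.5 * np.log(2 * np.pi * sigma ** 2) - (a - mu) ** 2 / (2 * sigma ** 2)

# td is treated as a constant here, which is what tf.stop_gradient does above
exp_v = log_prob * td

# Gaussian entropy, 0.5 * log(2*pi*e*sigma^2), encourages exploration
entropy = 0.5 * np.log(2 * np.pi * np.e * sigma ** 2)

a_loss = -(ENTROPY_BETA * entropy + exp_v)   # the quantity the optimizer minimizes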

contents/10_A3C/A3C_discrete_action.py

Lines changed: 5 additions & 3 deletions
@@ -42,6 +42,7 @@
 class ACNet(object):
     def __init__(self, scope, globalAC=None):
 
+        # check whether this is the global_net
         if scope == GLOBAL_NET_SCOPE:   # get global network
             with tf.variable_scope(scope):
                 self.s = tf.placeholder(tf.float32, [None, N_S], 'S')

@@ -54,15 +55,16 @@ def __init__(self, scope, globalAC=None):
 
             self.a_prob, self.v, self.a_params, self.c_params = self._build_net(scope)
 
-            td = tf.subtract(self.v_target, self.v, name='TD_error')
+            td = tf.subtract(self.v_target, self.v, name='TD_error')  # the TD difference
             with tf.name_scope('c_loss'):
-                self.c_loss = tf.reduce_mean(tf.square(td))
+                self.c_loss = tf.reduce_mean(tf.square(td))  # the critic loss
 
             with tf.name_scope('a_loss'):
+                # log probability of the actions actually taken
                 log_prob = tf.reduce_sum(tf.log(self.a_prob + 1e-5) * tf.one_hot(self.a_his, N_A, dtype=tf.float32), axis=1, keep_dims=True)
                 exp_v = log_prob * tf.stop_gradient(td)
                 entropy = -tf.reduce_sum(self.a_prob * tf.log(self.a_prob + 1e-5),
-                                         axis=1, keep_dims=True)  # encourage exploration
+                                         axis=1, keep_dims=True)  # encourage exploration; the policy entropy
                 self.exp_v = ENTROPY_BETA * entropy + exp_v
                 self.a_loss = tf.reduce_mean(-self.exp_v)
 
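In the discrete case, the log probability is picked out with a one-hot mask instead of a density. A small NumPy sketch of the same computation, with hypothetical values for a_prob, a_his and td:

import numpy as np

ENTROPY_BETA = 0.001                    # hypothetical entropy weight
a_prob = np.array([[0.2, 0.5, 0.3]])    # softmax policy for one state
a_his = np.array([1])                   # the action actually taken
td = np.array([[1.5]])                  # TD error, treated as a constant

N_A = a_prob.shape[1]
one_hot = np.eye(N_A)[a_his]            # plays the role of tf.one_hot

# log probability of the taken action; 1e-5 avoids log(0)
log_prob = np.sum(np.log(a_prob + 1e-5) * one_hot, axis=1, keepdims=True)

exp_v = log_prob * td                   # no gradient would flow into td above
entropy = -np.sum(a_prob * np.log(a_prob + 1e-5), axis=1, keepdims=True)

a_loss = np.mean(-(ENTROPY_BETA * entropy + exp_v))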

contents/10_A3C/A3C_distributed_tf.py

Lines changed: 1 addition & 1 deletion
@@ -99,7 +99,7 @@ def work(job_name, task_index, global_ep, lock, r_queue, global_running_r):
         "ps": ['localhost:2220', 'localhost:2221',],
         "worker": ['localhost:2222', 'localhost:2223', 'localhost:2224', 'localhost:2225',]
     })
-    server = tf.train.Server(cluster, job_name=job_name, task_index=task_index)
+    server = tf.train.Server(cluster, job_name=job_name, task_index=task_index)  # create this job's server in the cluster
     if job_name == 'ps':
         print('Start Parameter Sever: ', task_index)
         server.join()
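The commented line is where each process joins the cluster. A minimal sketch of the ps/worker split it relies on, assuming TF 1.x; job_name and task_index are hypothetical here and would normally come from the command line:

import tensorflow as tf

cluster = tf.train.ClusterSpec({
    "ps": ['localhost:2220'],
    "worker": ['localhost:2222'],
})
job_name, task_index = 'worker', 0   # hypothetical role for this process

server = tf.train.Server(cluster, job_name=job_name, task_index=task_index)
if job_name == 'ps':
    server.join()   # parameter servers only host the shared variables
else:
    # variables are placed on the ps job, ops run on this worker
    with tf.device(tf.train.replica_device_setter(
            worker_device="/job:worker/task:%d" % task_index,
            cluster=cluster)):
        global_step = tf.train.get_or_create_global_step()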

contents/11_Dyna_Q/RL_brain.py

Lines changed: 3 additions & 0 deletions
@@ -3,6 +3,9 @@
 All decisions and learning processes are made in here.
 
 View more on my tutorial page: https://morvanzhou.github.io/tutorials/
+
+Mainly uses Q-Learning to deal with continuous actions;
+as with the Critic in Actor-Critic, it gives the Actor a better choice rather than only telling it good or bad
 """
 
 import numpy as np
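Since RL_brain.py here is built around Q-Learning, a minimal sketch of the tabular update it refers to, with made-up state and action counts and one hypothetical transition (not the file's actual implementation):

import numpy as np

n_states, n_actions = 4, 2
Q = np.zeros((n_states, n_actions))   # tabular action values
lr, gamma = 0.1, 0.9

s, a, r, s_ = 0, 1, 1.0, 2            # one made-up transition
q_target = r + gamma * Q[s_].max()    # bootstrapped target
Q[s, a] += lr * (q_target - Q[s, a])  # move the estimate toward the target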

contents/8_Actor_Critic_Advantage/AC_CartPole.py

Lines changed: 3 additions & 0 deletions
@@ -124,6 +124,9 @@ def __init__(self, sess, n_features, lr=0.01):
         )
 
         with tf.variable_scope('squared_TD_error'):  # td_error
+            # originally this would be Q(St, At) - V(St)
+            # since Q(St, At) = E[Rt + gamma * V(St+1)]
+            # it becomes Rt + gamma * V(St+1) - V(St)
             self.td_error = self.r + GAMMA * self.v_ - self.v    # td_error = R + gamma*V(t+1) - V(t)
             self.loss = tf.square(self.td_error)    # TD_error = (r + gamma*V_next) - V_eval
         with tf.variable_scope('train'):
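A tiny numeric check of that substitution, with made-up values for the reward and the two value estimates:

GAMMA = 0.9
r, v_next, v = 1.0, 2.0, 2.5        # hypothetical reward and critic outputs

td_error = r + GAMMA * v_next - v   # advantage estimate: 1.0 + 1.8 - 2.5 = 0.3
loss = td_error ** 2                # squared TD error the critic minimizes, 0.09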
