
Commit bf92687

update a3c

1 parent fbe98b7 commit bf92687

File tree

5 files changed: +17 −8 lines changed

contents/10_A3C/A3C_continuous_action.py

Lines changed: 5 additions & 4 deletions
@@ -42,7 +42,7 @@
 
 N_S = env.observation_space.shape[0]
 N_A = env.action_space.shape[0]
-A_BOUND = [env.action_space.low, env.action_space.high]
+A_BOUND = [env.action_space.low, env.action_space.high]  # bounds of the given action space
 
 # the given Actor and Critic networks, combined to build the global_net
 class ACNet(object):

@@ -68,11 +68,12 @@ def __init__(self, scope, globalAC=None):
         with tf.name_scope('wrap_a_out'):
             mu, sigma = mu * A_BOUND[1], sigma + 1e-4
 
-        normal_dist = tf.distributions.Normal(mu, sigma)
+        normal_dist = tf.distributions.Normal(mu, sigma)  # normal distribution
 
+        # action selection is done through the probability density of this distribution
         with tf.name_scope('a_loss'):  # compute the actor's loss
-            log_prob = normal_dist.log_prob(self.a_his)
-            exp_v = log_prob * tf.stop_gradient(td)
+            log_prob = normal_dist.log_prob(self.a_his)  # get the log probability density
+            exp_v = log_prob * tf.stop_gradient(td)  # stop backprop through the TD error
             entropy = normal_dist.entropy()  # encourage exploration
             self.exp_v = ENTROPY_BETA * entropy + exp_v
             self.a_loss = tf.reduce_mean(-self.exp_v)
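The actor objective in this hunk weights the Gaussian log density of the taken action by the TD error (held constant by tf.stop_gradient) and adds an entropy bonus. A minimal NumPy sketch of the same arithmetic, with made-up values for mu, sigma, the action a and the TD error (none of them taken from the training code):

import numpy as np

ENTROPY_BETA = 0.01                      # hypothetical entropy weight
mu, sigma = 0.5, 1.0                     # policy output for one state
a, td = 0.3, 2.0                         # sampled action and its TD error

# log probability density of the Gaussian policy at the taken action
log_prob = -0.5 * np.log(2 * np.pi * sigma ** 2) - (a - mu) ** 2 / (2 * sigma ** 2)

# td is treated as a constant here, which is what tf.stop_gradient does above
exp_v = log_prob * td

# Gaussian entropy, 0.5 * log(2*pi*e*sigma^2), encourages exploration
entropy = 0.5 * np.log(2 * np.pi * np.e * sigma ** 2)

a_loss = -(ENTROPY_BETA * entropy + exp_v)   # the quantity the optimizer minimizes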

contents/10_A3C/A3C_discrete_action.py

Lines changed: 5 additions & 3 deletions
@@ -42,6 +42,7 @@
 class ACNet(object):
     def __init__(self, scope, globalAC=None):
 
+        # check whether this is the global_net
         if scope == GLOBAL_NET_SCOPE:   # get global network
             with tf.variable_scope(scope):
                 self.s = tf.placeholder(tf.float32, [None, N_S], 'S')

@@ -54,15 +55,16 @@ def __init__(self, scope, globalAC=None):
 
             self.a_prob, self.v, self.a_params, self.c_params = self._build_net(scope)
 
-            td = tf.subtract(self.v_target, self.v, name='TD_error')
+            td = tf.subtract(self.v_target, self.v, name='TD_error')  # the TD difference
             with tf.name_scope('c_loss'):
-                self.c_loss = tf.reduce_mean(tf.square(td))
+                self.c_loss = tf.reduce_mean(tf.square(td))  # the critic loss
 
             with tf.name_scope('a_loss'):
+                # log probability of the actions actually taken
                 log_prob = tf.reduce_sum(tf.log(self.a_prob + 1e-5) * tf.one_hot(self.a_his, N_A, dtype=tf.float32), axis=1, keep_dims=True)
                 exp_v = log_prob * tf.stop_gradient(td)
                 entropy = -tf.reduce_sum(self.a_prob * tf.log(self.a_prob + 1e-5),
-                                         axis=1, keep_dims=True)  # encourage exploration
+                                         axis=1, keep_dims=True)  # encourage exploration; the policy entropy
                 self.exp_v = ENTROPY_BETA * entropy + exp_v
                 self.a_loss = tf.reduce_mean(-self.exp_v)
 
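In the discrete case, the log probability is picked out with a one-hot mask instead of a density. A small NumPy sketch of the same computation, with hypothetical values for a_prob, a_his and td:

import numpy as np

ENTROPY_BETA = 0.001                    # hypothetical entropy weight
a_prob = np.array([[0.2, 0.5, 0.3]])    # softmax policy for one state
a_his = np.array([1])                   # the action actually taken
td = np.array([[1.5]])                  # TD error, treated as a constant

N_A = a_prob.shape[1]
one_hot = np.eye(N_A)[a_his]            # plays the role of tf.one_hot

# log probability of the taken action; 1e-5 avoids log(0)
log_prob = np.sum(np.log(a_prob + 1e-5) * one_hot, axis=1, keepdims=True)

exp_v = log_prob * td                   # no gradient would flow into td above
entropy = -np.sum(a_prob * np.log(a_prob + 1e-5), axis=1, keepdims=True)

a_loss = np.mean(-(ENTROPY_BETA * entropy + exp_v))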

contents/10_A3C/A3C_distributed_tf.py

Lines changed: 1 addition & 1 deletion
@@ -99,7 +99,7 @@ def work(job_name, task_index, global_ep, lock, r_queue, global_running_r):
         "ps": ['localhost:2220', 'localhost:2221',],
         "worker": ['localhost:2222', 'localhost:2223', 'localhost:2224', 'localhost:2225',]
     })
-    server = tf.train.Server(cluster, job_name=job_name, task_index=task_index)
+    server = tf.train.Server(cluster, job_name=job_name, task_index=task_index)  # create this job's server in the cluster
     if job_name == 'ps':
         print('Start Parameter Sever: ', task_index)
         server.join()
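The commented line is where each process joins the cluster. A minimal sketch of the ps/worker split it relies on, assuming TF 1.x; job_name and task_index are hypothetical here and would normally come from the command line:

import tensorflow as tf

cluster = tf.train.ClusterSpec({
    "ps": ['localhost:2220'],
    "worker": ['localhost:2222'],
})
job_name, task_index = 'worker', 0   # hypothetical role for this process

server = tf.train.Server(cluster, job_name=job_name, task_index=task_index)
if job_name == 'ps':
    server.join()   # parameter servers only host the shared variables
else:
    # variables are placed on the ps job, ops run on this worker
    with tf.device(tf.train.replica_device_setter(
            worker_device="/job:worker/task:%d" % task_index,
            cluster=cluster)):
        global_step = tf.train.get_or_create_global_step()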

contents/11_Dyna_Q/RL_brain.py

Lines changed: 3 additions & 0 deletions
@@ -3,6 +3,9 @@
 All decisions and learning processes are made in here.
 
 View more on my tutorial page: https://morvanzhou.github.io/tutorials/
+
+Mainly uses Q-Learning to deal with continuous actions;
+as with the Critic in Actor-Critic, it gives the Actor a better choice rather than only telling it good or bad
 """
 
 import numpy as np
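Since RL_brain.py here is built around Q-Learning, a minimal sketch of the tabular update it refers to, with made-up state and action counts and one hypothetical transition (not the file's actual implementation):

import numpy as np

n_states, n_actions = 4, 2
Q = np.zeros((n_states, n_actions))   # tabular action values
lr, gamma = 0.1, 0.9

s, a, r, s_ = 0, 1, 1.0, 2            # one made-up transition
q_target = r + gamma * Q[s_].max()    # bootstrapped target
Q[s, a] += lr * (q_target - Q[s, a])  # move the estimate toward the target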

contents/8_Actor_Critic_Advantage/AC_CartPole.py

Lines changed: 3 additions & 0 deletions
@@ -124,6 +124,9 @@ def __init__(self, sess, n_features, lr=0.01):
         )
 
         with tf.variable_scope('squared_TD_error'):  # td_error
+            # originally this would be Q(St, At) - V(St)
+            # since Q(St, At) = E[Rt + gamma * V(St+1)]
+            # it becomes Rt + gamma * V(St+1) - V(St)
             self.td_error = self.r + GAMMA * self.v_ - self.v    # td_error = R + gamma*V(t+1) - V(t)
             self.loss = tf.square(self.td_error)    # TD_error = (r + gamma*V_next) - V_eval
         with tf.variable_scope('train'):
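A tiny numeric check of that substitution, with made-up values for the reward and the two value estimates:

GAMMA = 0.9
r, v_next, v = 1.0, 2.0, 2.5        # hypothetical reward and critic outputs

td_error = r + GAMMA * v_next - v   # advantage estimate: 1.0 + 1.8 - 2.5 = 0.3
loss = td_error ** 2                # squared TD error the critic minimizes, 0.09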
