
Commit b199cff

update policy_gradient
1 parent 790a048 commit b199cff

File tree: 2 files changed, +59 -32 lines changed


contents/7_Policy_gradient_softmax/RL_brain.py

Lines changed: 53 additions & 27 deletions
@@ -9,6 +9,10 @@
 Using:
 Tensorflow: 1.0
 gym: 0.8.0
+
+Uses policy gradients to play the pole-balancing (CartPole) and car (MountainCar) games
+
+Policy gradient write-up: http://www.algorithmdog.com/rl-policy-gradient
 """
 
 import numpy as np
@@ -49,75 +53,97 @@ def __init__(
 
     def _build_net(self):
         with tf.name_scope('inputs'):
-            self.tf_obs = tf.placeholder(tf.float32, [None, self.n_features], name="observations")
-            self.tf_acts = tf.placeholder(tf.int32, [None, ], name="actions_num")
-            self.tf_vt = tf.placeholder(tf.float32, [None, ], name="actions_value")
+            self.tf_obs = tf.placeholder(tf.float32, [None, self.n_features], name="observations")  # observations from the environment
+            self.tf_acts = tf.placeholder(tf.int32, [None, ], name="actions_num")  # actions that were taken
+            self.tf_vt = tf.placeholder(tf.float32, [None, ], name="actions_value")  # value of each action (discounted return)
         # fc1
+        """
+        tf.layers.dense arguments:
+        units: integer or long, dimensionality of the output space.
+        activation: activation function (callable); set it to None to keep a linear activation.
+        use_bias: Boolean, whether the layer uses a bias.
+        kernel_initializer: initializer for the weight matrix; if None (the default), weights are initialized with the default initializer used by tf.get_variable.
+        bias_initializer: initializer for the bias.
+        kernel_regularizer: regularizer for the weight matrix.
+        bias_regularizer: regularizer for the bias.
+        activity_regularizer: regularizer for the output.
+        kernel_constraint: optional projection function applied to the kernel after an Optimizer update (e.g. to implement norm or value constraints on the layer weights). The function must take the unprojected variable as input and return the projected variable (with the same shape). Constraints are not safe to use with asynchronous distributed training.
+        bias_constraint: optional projection function applied to the bias after an Optimizer update.
+        trainable: Boolean; if True, the variables are also added to the graph collection GraphKeys.TRAINABLE_VARIABLES (see tf.Variable).
+        name: String, the name of the layer; layers with the same name share weights, but to avoid errors this requires reuse=True.
+        reuse: Boolean, whether to reuse the weights of a previous layer with the same name.
+        """
         layer = tf.layers.dense(
-            inputs=self.tf_obs,
-            units=10,
+            inputs=self.tf_obs,  # input tensor
+            units=10,  # dimensionality of the output space
             activation=tf.nn.tanh,  # tanh activation
-            kernel_initializer=tf.random_normal_initializer(mean=0, stddev=0.3),
-            bias_initializer=tf.constant_initializer(0.1),
-            name='fc1'
+            kernel_initializer=tf.random_normal_initializer(mean=0, stddev=0.3),  # initializer for the weight matrix
+            bias_initializer=tf.constant_initializer(0.1),  # initializer for the bias
+            name='fc1'  # layer name
         )
         # fc2
         all_act = tf.layers.dense(
-            inputs=layer,
-            units=self.n_actions,
+            inputs=layer,  # input layer
+            units=self.n_actions,  # one output per action
             activation=None,
-            kernel_initializer=tf.random_normal_initializer(mean=0, stddev=0.3),
-            bias_initializer=tf.constant_initializer(0.1),
-            name='fc2'
+            kernel_initializer=tf.random_normal_initializer(mean=0, stddev=0.3),  # initializer for the weight matrix
+            bias_initializer=tf.constant_initializer(0.1),  # initializer for the bias
+            name='fc2'  # layer name
         )
 
         self.all_act_prob = tf.nn.softmax(all_act, name='act_prob')  # use softmax to convert to probability
 
-        with tf.name_scope('loss'):
+        with tf.name_scope('loss'):  # objective: sum((R - b) * log(p(a|s))), minimized as its negative
             # to maximize total reward (log_p * R) is to minimize -(log_p * R), and tf only has minimize(loss)
             neg_log_prob = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=all_act, labels=self.tf_acts)  # this is negative log of chosen action
-            # or in this way:
+            # or in this way: the same quantity can be computed as sum(-log(p(a|s)) * one_hot(a))
             # neg_log_prob = tf.reduce_sum(-tf.log(self.all_act_prob)*tf.one_hot(self.tf_acts, self.n_actions), axis=1)
-            loss = tf.reduce_mean(neg_log_prob * self.tf_vt)  # reward guided loss
+
+            loss = tf.reduce_mean(neg_log_prob * self.tf_vt)  # reward guided loss: average of neg_log_prob * return over the episode
 
         with tf.name_scope('train'):
             self.train_op = tf.train.AdamOptimizer(self.lr).minimize(loss)
 
     def choose_action(self, observation):
+        # run the network to get the probability of every action
         prob_weights = self.sess.run(self.all_act_prob, feed_dict={self.tf_obs: observation[np.newaxis, :]})
+
+        # sample an action; ravel() returns a view of the array (writes through it modify the original), while flatten() returns a copy
         action = np.random.choice(range(prob_weights.shape[1]), p=prob_weights.ravel())  # select action w.r.t the actions prob
         return action
 
     def store_transition(self, s, a, r):
-        self.ep_obs.append(s)
-        self.ep_as.append(a)
-        self.ep_rs.append(r)
+        self.ep_obs.append(s)  # record the observation
+        self.ep_as.append(a)  # record the action
+        self.ep_rs.append(r)  # record the reward
 
     def learn(self):
         # discount and normalize episode reward
-        discounted_ep_rs_norm = self._discount_and_norm_rewards()
+        discounted_ep_rs_norm = self._discount_and_norm_rewards()  # discounted returns: Monte Carlo policy gradient
 
         # train on episode
         self.sess.run(self.train_op, feed_dict={
-             self.tf_obs: np.vstack(self.ep_obs),  # shape=[None, n_obs]
-             self.tf_acts: np.array(self.ep_as),  # shape=[None, ]
-             self.tf_vt: discounted_ep_rs_norm,  # shape=[None, ]
+             self.tf_obs: np.vstack(self.ep_obs),  # shape=[None, n_obs]  observations
+             self.tf_acts: np.array(self.ep_as),  # shape=[None, ]  actions
+             self.tf_vt: discounted_ep_rs_norm,  # shape=[None, ]  action values (returns)
         })
 
         self.ep_obs, self.ep_as, self.ep_rs = [], [], []  # empty episode data
         return discounted_ep_rs_norm
 
     def _discount_and_norm_rewards(self):
         # discount episode rewards
-        discounted_ep_rs = np.zeros_like(self.ep_rs)
+        discounted_ep_rs = np.zeros_like(self.ep_rs)  # initialize the array of discounted rewards
         running_add = 0
+
+        # walk the episode backwards, accumulating the discounted return from the last step to the first
         for t in reversed(range(0, len(self.ep_rs))):
             running_add = running_add * self.gamma + self.ep_rs[t]
-            discounted_ep_rs[t] = running_add
+            discounted_ep_rs[t] = running_add  # store the return for step t
 
         # normalize episode rewards
-        discounted_ep_rs -= np.mean(discounted_ep_rs)
-        discounted_ep_rs /= np.std(discounted_ep_rs)
+        discounted_ep_rs -= np.mean(discounted_ep_rs)  # (R - b): subtract the mean as a baseline E[R], so rarely sampled good actions are not suppressed just because their raw reward is small
+        discounted_ep_rs /= np.std(discounted_ep_rs)  # scale by the standard deviation
         return discounted_ep_rs
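
The loss comments above describe two equivalent ways of getting the negative log-probability of the chosen action. A minimal NumPy sketch (not part of this commit; the logits, actions, and returns below are made up for illustration) that checks the two formulations agree and computes the reward-guided loss:

import numpy as np

logits = np.array([[2.0, 0.5, -1.0],   # network outputs for 2 steps and 3 actions (made-up values)
                   [0.1, 0.1, 3.0]])
actions = np.array([0, 2])             # actions actually taken at those steps
vt = np.array([1.2, -0.4])             # normalized discounted returns (made-up)

probs = np.exp(logits) / np.exp(logits).sum(axis=1, keepdims=True)   # softmax over actions

# formulation 1: sparse softmax cross-entropy = -log(prob of the chosen action)
neg_log_prob_1 = -np.log(probs[np.arange(len(actions)), actions])

# formulation 2: the commented-out one-hot version from the diff
one_hot = np.eye(probs.shape[1])[actions]
neg_log_prob_2 = np.sum(-np.log(probs) * one_hot, axis=1)

assert np.allclose(neg_log_prob_1, neg_log_prob_2)
loss = np.mean(neg_log_prob_1 * vt)    # reward-guided loss, as in the diff
print(loss)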
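
The new comment in choose_action points out that ravel() returns a view while flatten() returns a copy. A small sketch on an arbitrary toy array, just to illustrate that note:

import numpy as np

a = np.arange(4.0).reshape(2, 2)   # toy array, values are arbitrary
v = a.ravel()     # a view when possible: writing through it modifies `a`
c = a.flatten()   # always a copy: writing to it leaves `a` unchanged
v[0] = 99.0
c[1] = -1.0
print(a[0, 0])    # 99.0 -- changed through the ravel view
print(a[0, 1])    # 1.0  -- untouched by the flatten copy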

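
_discount_and_norm_rewards walks the episode backwards, accumulating running_add = running_add * gamma + r[t], then subtracts the mean as a baseline and divides by the standard deviation. A standalone sketch with made-up rewards and discount factor (gamma = 0.9 and the reward list are illustrative, not from the commit):

import numpy as np

gamma = 0.9                       # illustrative discount factor
ep_rs = [1.0, 1.0, 1.0]           # illustrative per-step rewards for one episode

discounted = np.zeros(len(ep_rs))
running_add = 0.0
for t in reversed(range(len(ep_rs))):        # walk the episode backwards
    running_add = running_add * gamma + ep_rs[t]
    discounted[t] = running_add
# discounted is now [2.71, 1.9, 1.0]

discounted -= discounted.mean()   # (R - b): subtract the mean as a baseline
discounted /= discounted.std()    # scale to unit variance
print(discounted)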
contents/7_Policy_gradient_softmax/run_CartPole.py

Lines changed: 6 additions & 5 deletions
@@ -14,6 +14,7 @@
 from RL_brain import PolicyGradient
 import matplotlib.pyplot as plt
 
+# reward threshold above which the environment is rendered
 DISPLAY_REWARD_THRESHOLD = 400  # renders environment if total episode reward is greater than this threshold
 RENDER = False  # rendering wastes time
 
@@ -30,7 +31,7 @@
     n_actions=env.action_space.n,
     n_features=env.observation_space.shape[0],
     learning_rate=0.02,
-    reward_decay=0.99,
+    reward_decay=0.99,  # reward discount factor
     # output_graph=True,
 )
 
@@ -45,16 +46,16 @@
 
         observation_, reward, done, info = env.step(action)
 
-        RL.store_transition(observation, action, reward)
+        RL.store_transition(observation, action, reward)  # store the transition
 
-        if done:
+        if done:  # the episode has finished
             ep_rs_sum = sum(RL.ep_rs)
 
             if 'running_reward' not in globals():
                 running_reward = ep_rs_sum
             else:
-                running_reward = running_reward * 0.99 + ep_rs_sum * 0.01
-            if running_reward > DISPLAY_REWARD_THRESHOLD: RENDER = True  # rendering
+                running_reward = running_reward * 0.99 + ep_rs_sum * 0.01  # running average: 0.99 * previous running reward + 0.01 * this episode's total reward
+            if running_reward > DISPLAY_REWARD_THRESHOLD: RENDER = True  # start rendering once the running reward exceeds the threshold
             print("episode:", i_episode, " reward:", int(running_reward))
 
             vt = RL.learn()
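
The corrected comment above reads the update as an exponential moving average: 99% of the previous running reward plus 1% of the latest episode return. A tiny worked example with illustrative numbers:

running_reward = 100.0   # illustrative previous running reward
ep_rs_sum = 200.0        # illustrative return of the latest episode
running_reward = running_reward * 0.99 + ep_rs_sum * 0.01
print(running_reward)    # 101.0 -- the estimate drifts slowly toward recent returns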
