|
9 | 9 | Using:
|
10 | 10 | Tensorflow: 1.0
|
11 | 11 | gym: 0.8.0

| 12 | +

| 13 | +Using policy gradients to play the CartPole and MountainCar games

| 14 | +

| 15 | +Policy-gradient reference: http://www.algorithmdog.com/rl-policy-gradient
12 | 16 | """
|
13 | 17 |
|
14 | 18 | import numpy as np
|
@@ -49,75 +53,97 @@ def __init__(
|
49 | 53 |
|
50 | 54 | def _build_net(self):
|
51 | 55 | with tf.name_scope('inputs'):
|
52 | | - self.tf_obs = tf.placeholder(tf.float32, [None, self.n_features], name="observations")
53 | | - self.tf_acts = tf.placeholder(tf.int32, [None, ], name="actions_num")
54 | | - self.tf_vt = tf.placeholder(tf.float32, [None, ], name="actions_value")
| 56 | + self.tf_obs = tf.placeholder(tf.float32, [None, self.n_features], name="observations")  # observations (environment states)
| 57 | + self.tf_acts = tf.placeholder(tf.int32, [None, ], name="actions_num")  # actions taken at each step of the episode
| 58 | + self.tf_vt = tf.placeholder(tf.float32, [None, ], name="actions_value")  # action value (discounted return) for each step
55 | 59 | # fc1

| 60 | + """
| 61 | + tf.layers.dense arguments:
| 62 | + units: integer or long, the dimensionality of the output space.
| 63 | + activation: activation function (callable); set it to None to keep a linear activation.
| 64 | + use_bias: Boolean, whether the layer uses a bias.
| 65 | + kernel_initializer: initializer function for the weight matrix; if None (the default), weights are initialized with the default initializer used by tf.get_variable.
| 66 | + bias_initializer: initializer function for the bias.
| 67 | + kernel_regularizer: regularizer function for the weight matrix.
| 68 | + bias_regularizer: regularizer function for the bias.
| 69 | + activity_regularizer: regularizer function for the output.
| 70 | + kernel_constraint: optional projection function applied to the kernel after an Optimizer update (e.g. to implement norm or value constraints on the layer weights). The function must take the unprojected variable as input and return the projected variable (with the same shape). Using constraints is not safe with asynchronous distributed training.
| 71 | + bias_constraint: optional projection function applied to the bias after an Optimizer update.
| 72 | + trainable: Boolean; if True, the variables are also added to the graph collection GraphKeys.TRAINABLE_VARIABLES (see tf.Variable).
| 73 | + name: String, the name of the layer; layers with the same name share weights, but to avoid errors this requires reuse=True.
| 74 | + reuse: Boolean, whether to reuse the weights of a previous layer with the same name (a short sketch of this behaviour follows the fc1 layer below).
| 75 | + """
56 | 76 | layer = tf.layers.dense(
|
57 | | - inputs=self.tf_obs,
58 | | - units=10,
| 77 | + inputs=self.tf_obs,  # network input: the observation placeholder
| 78 | + units=10,  # number of hidden units (output dimensionality of this layer)
59 | 79 | activation=tf.nn.tanh, # tanh activation
|
60 | | - kernel_initializer=tf.random_normal_initializer(mean=0, stddev=0.3),
61 | | - bias_initializer=tf.constant_initializer(0.1),
62 | | - name='fc1'
| 80 | + kernel_initializer=tf.random_normal_initializer(mean=0, stddev=0.3),  # initializer for the weight matrix
| 81 | + bias_initializer=tf.constant_initializer(0.1),  # initializer for the bias
| 82 | + name='fc1'  # layer name
63 | 83 | )
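# A minimal standalone sketch (assuming TF 1.x) of the name/reuse behaviour described
# in the tf.layers.dense notes above: two calls that use the same `name` share one
# kernel and bias when the second call passes reuse=True. The placeholders x1/x2 are
# illustrative only and not part of this file.
import tensorflow as tf

x1 = tf.placeholder(tf.float32, [None, 4])
x2 = tf.placeholder(tf.float32, [None, 4])
h1 = tf.layers.dense(x1, units=10, activation=tf.nn.tanh, name='shared_fc')
h2 = tf.layers.dense(x2, units=10, activation=tf.nn.tanh, name='shared_fc', reuse=True)
# h1 and h2 are different tensors but are computed with the same weight variables.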
|
64 | 84 | # fc2
|
65 | 85 | all_act = tf.layers.dense(
|
66 | | - inputs=layer,
67 | | - units=self.n_actions,
| 86 | + inputs=layer,  # output of fc1 as input
| 87 | + units=self.n_actions,  # one output unit per action
68 | 88 | activation=None,
|
69 | | - kernel_initializer=tf.random_normal_initializer(mean=0, stddev=0.3),
70 | | - bias_initializer=tf.constant_initializer(0.1),
71 | | - name='fc2'
| 89 | + kernel_initializer=tf.random_normal_initializer(mean=0, stddev=0.3),  # initializer for the weight matrix
| 90 | + bias_initializer=tf.constant_initializer(0.1),  # initializer for the bias
| 91 | + name='fc2'  # layer name
72 | 92 | )
|
73 | 93 |
|
74 | 94 | self.all_act_prob = tf.nn.softmax(all_act, name='act_prob') # use softmax to convert to probability
|
75 | 95 |
|
76 | | - with tf.name_scope('loss'):
| 96 | + with tf.name_scope('loss'):  # maximizes sum((R - b) * log(pi(a|s))) by minimizing its negative
77 | 97 | # to maximize total reward (log_p * R) is to minimize -(log_p * R), and the tf only have minimize(loss)
|
78 | 98 | neg_log_prob = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=all_act, labels=self.tf_acts) # this is negative log of chosen action
|
79 | | - # or in this way:
| 99 | + # or in this way: compute the negative log-probability manually as sum(-log(pi(a|s)) * one_hot(a), axis=1)
80 | 100 | # neg_log_prob = tf.reduce_sum(-tf.log(self.all_act_prob)*tf.one_hot(self.tf_acts, self.n_actions), axis=1)
|
81 | | - loss = tf.reduce_mean(neg_log_prob * self.tf_vt) # reward guided loss
| 101 | +
| 102 | + loss = tf.reduce_mean(neg_log_prob * self.tf_vt)  # reward-guided loss: mean over the episode of -log(pi(a|s)) * vt
82 | 103 |
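# Illustrative NumPy check (not part of the graph) that the two neg_log_prob forms
# above agree: sparse_softmax_cross_entropy_with_logits(logits, a) equals
# -log(softmax(logits)[a]), which is what the commented-out one_hot version computes.
import numpy as np

logits = np.array([2.0, 0.5, -1.0])   # unnormalized scores for 3 actions
action = 1                            # index of the chosen action
probs = np.exp(logits) / np.sum(np.exp(logits))
neg_log_prob = -np.log(probs[action])
neg_log_prob_alt = np.sum(-np.log(probs) * np.eye(3)[action])
assert np.isclose(neg_log_prob, neg_log_prob_alt)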
|
83 | 104 | with tf.name_scope('train'):
|
84 | 105 | self.train_op = tf.train.AdamOptimizer(self.lr).minimize(loss)
|
85 | 106 |
|
86 | 107 | def choose_action(self, observation):

| 108 | + # run the policy network to get the action probabilities for the current observation
87 | 109 | prob_weights = self.sess.run(self.all_act_prob, feed_dict={self.tf_obs: observation[np.newaxis, :]})

| 110 | +
| 111 | + # sample an action according to these probabilities; ravel() returns a flattened view of the array when possible (writes affect the original), while flatten() always returns a copy (see the short NumPy illustration after this method)
88 | 112 | action = np.random.choice(range(prob_weights.shape[1]), p=prob_weights.ravel()) # select action w.r.t the actions prob
|
89 | 113 | return action
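# Quick NumPy illustration of the ravel()/flatten() note above: ravel() returns a
# view when it can, so writing through it changes the original array, while
# flatten() always returns an independent copy.
import numpy as np

a = np.array([[0.1, 0.9]])
v = a.ravel()    # view of `a`
c = a.flatten()  # copy of `a`
v[0] = 0.5
print(a)         # [[0.5 0.9]] -- changed through the view
c[1] = 0.0
print(a)         # [[0.5 0.9]] -- unchanged, the copy is independent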
|
90 | 114 |
|
91 | 115 | def store_transition(self, s, a, r):
|
92 | | - self.ep_obs.append(s)
93 | | - self.ep_as.append(a)
94 | | - self.ep_rs.append(r)
| 116 | + self.ep_obs.append(s)  # record the observation
| 117 | + self.ep_as.append(a)  # record the action
| 118 | + self.ep_rs.append(r)  # record the reward
95 | 119 |
|
96 | 120 | def learn(self):
|
97 | 121 | # discount and normalize episode reward
|
98 | | - discounted_ep_rs_norm = self._discount_and_norm_rewards()
| 122 | + discounted_ep_rs_norm = self._discount_and_norm_rewards()  # discounted, normalized returns (Monte Carlo policy gradient)
99 | 123 |
|
100 | 124 | # train on episode
|
101 | 125 | self.sess.run(self.train_op, feed_dict={
|
102 | | - self.tf_obs: np.vstack(self.ep_obs), # shape=[None, n_obs]
103 | | - self.tf_acts: np.array(self.ep_as), # shape=[None, ]
104 | | - self.tf_vt: discounted_ep_rs_norm, # shape=[None, ]
| 126 | + self.tf_obs: np.vstack(self.ep_obs),  # shape=[None, n_obs] observations
| 127 | + self.tf_acts: np.array(self.ep_as),  # shape=[None, ] actions taken
| 128 | + self.tf_vt: discounted_ep_rs_norm,  # shape=[None, ] discounted returns
105 | 129 | })
|
106 | 130 |
|
107 | 131 | self.ep_obs, self.ep_as, self.ep_rs = [], [], [] # empty episode data
|
108 | 132 | return discounted_ep_rs_norm
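# Sketch of how these methods are usually driven from an episode loop; the class
# name PolicyGradient, the constructor keywords and the CartPole-v0 environment are
# assumptions here (they come from the surrounding project, not from this diff).
import gym

env = gym.make('CartPole-v0')
RL = PolicyGradient(n_actions=env.action_space.n,
                    n_features=env.observation_space.shape[0],
                    learning_rate=0.02,
                    reward_decay=0.99)

for episode in range(300):
    observation = env.reset()
    while True:
        action = RL.choose_action(observation)
        observation_, reward, done, info = env.step(action)
        RL.store_transition(observation, action, reward)  # collect the whole episode
        if done:
            vt = RL.learn()  # one Monte Carlo policy-gradient update per episode
            break
        observation = observation_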
|
109 | 133 |
|
110 | 134 | def _discount_and_norm_rewards(self):
|
111 | 135 | # discount episode rewards
|
112 | | - discounted_ep_rs = np.zeros_like(self.ep_rs)
| 136 | + discounted_ep_rs = np.zeros_like(self.ep_rs)  # initialize the array of discounted rewards
113 | 137 | running_add = 0

| 138 | +
| 139 | + # iterate over the episode in reverse so each step accumulates the discounted sum of its future rewards (the most recent rewards are processed first)
114 | 140 | for t in reversed(range(0, len(self.ep_rs))):
|
115 | 141 | running_add = running_add * self.gamma + self.ep_rs[t]
|
116 | | - discounted_ep_rs[t] = running_add
| 142 | + discounted_ep_rs[t] = running_add  # store the discounted return for step t
117 | 143 |
|
118 | 144 | # normalize episode rewards
|
119 | | - discounted_ep_rs -= np.mean(discounted_ep_rs)
120 | | - discounted_ep_rs /= np.std(discounted_ep_rs)
| 145 | + discounted_ep_rs -= np.mean(discounted_ep_rs)  # (R - b): subtract the mean return as a baseline E[R], so actions are reinforced only when their return beats the average rather than being judged by raw reward size
| 146 | + discounted_ep_rs /= np.std(discounted_ep_rs)  # divide by the standard deviation to normalize the returns
121 | 147 | return discounted_ep_rs
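# Worked example of the discount-and-normalize step above, with gamma = 0.9 and an
# episode of three rewards [1, 1, 1] (values chosen only for illustration):
import numpy as np

ep_rs, gamma = [1.0, 1.0, 1.0], 0.9
discounted = np.zeros_like(ep_rs)
running_add = 0.0
for t in reversed(range(len(ep_rs))):
    running_add = running_add * gamma + ep_rs[t]
    discounted[t] = running_add
print(discounted)                                               # [2.71 1.9  1.  ]
print((discounted - np.mean(discounted)) / np.std(discounted))  # normalized returns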
|
122 | 148 |
|
123 | 149 |
|
|