@@ -74,13 +74,13 @@ def __init__(self, D, K, hidden_layer_sizes, gamma, max_experiences=10000, min_e
     )

     cost = tf.reduce_sum(tf.square(self.G - selected_action_values))
-    # self.train_op = tf.train.AdamOptimizer(10e-3).minimize(cost)
-    self.train_op = tf.train.AdagradOptimizer(10e-3).minimize(cost)
+    self.train_op = tf.train.AdamOptimizer(10e-3).minimize(cost)
+    # self.train_op = tf.train.AdagradOptimizer(10e-3).minimize(cost)
     # self.train_op = tf.train.MomentumOptimizer(10e-4, momentum=0.9).minimize(cost)
     # self.train_op = tf.train.GradientDescentOptimizer(10e-5).minimize(cost)

     # create replay memory
-    self.experience = {'s': [], 'a': [], 'r': [], 's2': []}
+    self.experience = {'s': [], 'a': [], 'r': [], 's2': [], 'done': []}
     self.max_experiences = max_experiences
     self.min_experiences = min_experiences
     self.batch_sz = batch_sz
@@ -118,8 +118,9 @@ def train(self, target_network):
     actions = [self.experience['a'][i] for i in idx]
     rewards = [self.experience['r'][i] for i in idx]
     next_states = [self.experience['s2'][i] for i in idx]
+    dones = [self.experience['done'][i] for i in idx]
     next_Q = np.max(target_network.predict(next_states), axis=1)
-    targets = [r + self.gamma*next_q for r, next_q in zip(rewards, next_Q)]
+    targets = [r + self.gamma*next_q if not done else r for r, next_q, done in zip(rewards, next_Q, dones)]

     # call optimizer
     self.session.run(
@@ -131,16 +132,18 @@ def train(self, target_network):
       }
     )

-  def add_experience(self, s, a, r, s2):
+  def add_experience(self, s, a, r, s2, done):
     if len(self.experience['s']) >= self.max_experiences:
       self.experience['s'].pop(0)
       self.experience['a'].pop(0)
       self.experience['r'].pop(0)
       self.experience['s2'].pop(0)
+      self.experience['done'].pop(0)
     self.experience['s'].append(s)
     self.experience['a'].append(a)
     self.experience['r'].append(r)
     self.experience['s2'].append(s2)
+    self.experience['done'].append(done)

   def sample_action(self, x, eps):
     if np.random.random() < eps:
@@ -167,7 +170,7 @@ def play_one(env, model, tmodel, eps, gamma, copy_period):
       reward = -200

     # update the model
-    model.add_experience(prev_observation, action, reward, observation)
+    model.add_experience(prev_observation, action, reward, observation, done)
     model.train(tmodel)

     iters += 1
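
The target computation in the train() hunk drops the bootstrap term for terminal transitions: when done is True the target is just the reward, otherwise it is r + gamma * max_a' Q_target(s', a'). A minimal vectorized sketch of the same idea, assuming NumPy arrays; td_targets is a hypothetical helper and not part of this file:

import numpy as np

def td_targets(rewards, next_Q, dones, gamma):
    # Terminal transitions (done == True) must not bootstrap from the next
    # state, so their discounted next-state value is zeroed out.
    rewards = np.asarray(rewards, dtype=np.float32)
    next_Q = np.asarray(next_Q, dtype=np.float32)
    not_done = 1.0 - np.asarray(dones, dtype=np.float32)
    return rewards + gamma * next_Q * not_done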