
Commit e998de5

different optimizer
1 parent 397fb07 commit e998de5

1 file changed: +9 -6 lines changed

rl2/cartpole/dqn_tf.py

@@ -74,13 +74,13 @@ def __init__(self, D, K, hidden_layer_sizes, gamma, max_experiences=10000, min_e
     )
 
     cost = tf.reduce_sum(tf.square(self.G - selected_action_values))
-    # self.train_op = tf.train.AdamOptimizer(10e-3).minimize(cost)
-    self.train_op = tf.train.AdagradOptimizer(10e-3).minimize(cost)
+    self.train_op = tf.train.AdamOptimizer(10e-3).minimize(cost)
+    # self.train_op = tf.train.AdagradOptimizer(10e-3).minimize(cost)
     # self.train_op = tf.train.MomentumOptimizer(10e-4, momentum=0.9).minimize(cost)
     # self.train_op = tf.train.GradientDescentOptimizer(10e-5).minimize(cost)
 
     # create replay memory
-    self.experience = {'s': [], 'a': [], 'r': [], 's2': []}
+    self.experience = {'s': [], 'a': [], 'r': [], 's2': [], 'done': []}
     self.max_experiences = max_experiences
     self.min_experiences = min_experiences
     self.batch_sz = batch_sz
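
For readers skimming the hunk: the change makes Adam the active optimizer (note that 10e-3 is 1e-2, i.e. a learning rate of 0.01) and adds a 'done' key to the replay memory dict so terminal transitions can be flagged. Below is a minimal TF 1.x sketch of the same cost-plus-optimizer wiring; the tiny linear Q-network and its shapes are illustrative stand-ins, not taken from the file.

import tensorflow as tf  # TF 1.x graph-mode API, as used by dqn_tf.py

# illustrative placeholders: 4-dim CartPole states, 2 discrete actions
X = tf.placeholder(tf.float32, shape=(None, 4), name='X')         # states
G = tf.placeholder(tf.float32, shape=(None,), name='G')           # TD targets
actions = tf.placeholder(tf.int32, shape=(None,), name='actions')

# a toy linear Q-network, just enough to define the cost
W = tf.Variable(tf.random_normal((4, 2)))
Q = tf.matmul(X, W)                                                # Q(s, .) for both actions
selected_action_values = tf.reduce_sum(Q * tf.one_hot(actions, 2), axis=1)

cost = tf.reduce_sum(tf.square(G - selected_action_values))

# the commit swaps Adagrad out for Adam at the same 10e-3 (= 0.01) step size
train_op = tf.train.AdamOptimizer(10e-3).minimize(cost)
# train_op = tf.train.AdagradOptimizer(10e-3).minimize(cost)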
@@ -118,8 +118,9 @@ def train(self, target_network):
     actions = [self.experience['a'][i] for i in idx]
     rewards = [self.experience['r'][i] for i in idx]
     next_states = [self.experience['s2'][i] for i in idx]
+    dones = [self.experience['done'][i] for i in idx]
     next_Q = np.max(target_network.predict(next_states), axis=1)
-    targets = [r + self.gamma*next_q for r, next_q in zip(rewards, next_Q)]
+    targets = [r + self.gamma*next_q if not done else r for r, next_q, done in zip(rewards, next_Q, dones)]
 
     # call optimizer
     self.session.run(
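
The target computation above is the substantive fix: for a transition that ended the episode, the bootstrap term is dropped, so the target is just r rather than r + gamma * max_a' Q_target(s', a'). A small vectorized NumPy sketch of the same rule, with illustrative names that are not from the file:

import numpy as np

def compute_targets(rewards, next_Q_max, dones, gamma=0.99):
    # r + gamma * max_a' Q_target(s', a') for ordinary steps,
    # just r when the transition is terminal (done == True)
    rewards = np.asarray(rewards, dtype=np.float32)
    next_Q_max = np.asarray(next_Q_max, dtype=np.float32)
    not_done = 1.0 - np.asarray(dones, dtype=np.float32)
    return rewards + gamma * next_Q_max * not_done

# a terminal transition contributes only its reward:
print(compute_targets([1.0, 1.0], [5.0, 5.0], [False, True], gamma=0.9))
# -> [5.5 1. ]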
@@ -131,16 +132,18 @@ def train(self, target_network):
       }
     )
 
-  def add_experience(self, s, a, r, s2):
+  def add_experience(self, s, a, r, s2, done):
     if len(self.experience['s']) >= self.max_experiences:
       self.experience['s'].pop(0)
       self.experience['a'].pop(0)
       self.experience['r'].pop(0)
       self.experience['s2'].pop(0)
+      self.experience['done'].pop(0)
     self.experience['s'].append(s)
     self.experience['a'].append(a)
     self.experience['r'].append(r)
     self.experience['s2'].append(s2)
+    self.experience['done'].append(done)
 
   def sample_action(self, x, eps):
     if np.random.random() < eps:
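
The replay memory itself is a dict of plain Python lists, capped by pop(0)/append; the hunk above extends both the eviction and the insertion paths to cover the new 'done' list. For illustration only (not the author's implementation), the same bounded-FIFO behaviour can be expressed with collections.deque, which drops the oldest entry automatically once maxlen is reached:

from collections import deque
import random

class ReplayBuffer:
    # bounded FIFO replay memory; deque(maxlen=...) mirrors the
    # pop(0)/append pattern used in dqn_tf.py
    def __init__(self, max_size=10000):
        self.buffer = deque(maxlen=max_size)

    def add(self, s, a, r, s2, done):
        # one tuple per transition, terminal flag included
        self.buffer.append((s, a, r, s2, done))

    def sample(self, batch_size):
        # uniform random minibatch, unzipped into parallel tuples
        batch = random.sample(self.buffer, batch_size)
        states, actions, rewards, next_states, dones = zip(*batch)
        return states, actions, rewards, next_states, dones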
@@ -167,7 +170,7 @@ def play_one(env, model, tmodel, eps, gamma, copy_period):
       reward = -200
 
     # update the model
-    model.add_experience(prev_observation, action, reward, observation)
+    model.add_experience(prev_observation, action, reward, observation, done)
     model.train(tmodel)
 
     iters += 1
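
The last hunk forwards the env's terminal flag from play_one into the replay memory. A schematic run-through under the classic gym step API that returns (obs, reward, done, info), reusing the illustrative ReplayBuffer sketched above and a random policy, since learning is beside the point here:

import gym

env = gym.make('CartPole-v0')            # assumes the old gym API used by the repo
buffer = ReplayBuffer(max_size=10000)    # the illustrative buffer sketched above

observation = env.reset()
done = False
while not done:
    action = env.action_space.sample()   # random policy, just to fill the buffer
    prev_observation = observation
    observation, reward, done, info = env.step(action)
    if done:
        reward = -200                    # the same terminal penalty play_one applies
    # the terminal flag travels with the transition, so training can skip bootstrapping
    buffer.add(prev_observation, action, reward, observation, done)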
