# -*- coding: utf-8 -*-
"""dueling_ddqn_tf2.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1uqwom28keeUs7oSCeCruSLzR7i4TLdSn
"""
import tensorflow as tf
import tensorflow.keras as keras
from tensorflow.keras.optimizers import Adam
import numpy as np
class DuelingDeepQNetwork(keras.Model):
    def __init__(self, n_actions, fc1_dims, fc2_dims):
        super(DuelingDeepQNetwork, self).__init__()
        self.dense1 = keras.layers.Dense(fc1_dims, activation='relu')
        self.dense2 = keras.layers.Dense(fc2_dims, activation='relu')
        # Dueling streams: V estimates the state value, A the per-action advantage.
        self.V = keras.layers.Dense(1, activation=None)
        self.A = keras.layers.Dense(n_actions, activation=None)

    def call(self, state):
        x = self.dense1(state)
        x = self.dense2(x)
        V = self.V(x)
        A = self.A(x)
        # Aggregate the two streams; subtracting the mean advantage keeps the
        # decomposition identifiable: Q = V + (A - mean(A)).
        Q = (V + (A - tf.math.reduce_mean(A, axis=1, keepdims=True)))

        return Q

    def advantage(self, state):
        x = self.dense1(state)
        x = self.dense2(x)
        A = self.A(x)

        return A
class ReplayBuffer():
    def __init__(self, max_size, input_shape):
        self.mem_size = max_size
        self.mem_cntr = 0

        self.state_memory = np.zeros((self.mem_size, *input_shape),
                                     dtype=np.float32)
        self.new_state_memory = np.zeros((self.mem_size, *input_shape),
                                         dtype=np.float32)
        self.action_memory = np.zeros(self.mem_size, dtype=np.int32)
        self.reward_memory = np.zeros(self.mem_size, dtype=np.float32)
        # np.bool was deprecated in NumPy 1.20 and removed in 1.24; use plain bool.
        self.terminal_memory = np.zeros(self.mem_size, dtype=bool)

    def store_transition(self, state, action, reward, state_, done):
        # Overwrite the oldest transition once the buffer is full.
        index = self.mem_cntr % self.mem_size
        self.state_memory[index] = state
        self.new_state_memory[index] = state_
        self.action_memory[index] = action
        self.reward_memory[index] = reward
        self.terminal_memory[index] = done
        self.mem_cntr += 1

    def sample_buffer(self, batch_size):
        # Sample uniformly, without replacement, from the filled portion of the buffer.
        max_mem = min(self.mem_cntr, self.mem_size)
        batch = np.random.choice(max_mem, batch_size, replace=False)

        states = self.state_memory[batch]
        new_states = self.new_state_memory[batch]
        actions = self.action_memory[batch]
        rewards = self.reward_memory[batch]
        dones = self.terminal_memory[batch]

        return states, actions, rewards, new_states, dones
class Agent():
    def __init__(self, lr, gamma, n_actions, epsilon, batch_size,
                 input_dims, epsilon_dec=1e-3, eps_end=0.01,
                 mem_size=100000, fc1_dims=128,
                 fc2_dims=128, replace=100):
        self.action_space = [i for i in range(n_actions)]
        self.gamma = gamma
        self.epsilon = epsilon
        self.eps_dec = epsilon_dec
        self.eps_min = eps_end
        self.replace = replace
        self.batch_size = batch_size

        self.learn_step_counter = 0
        self.memory = ReplayBuffer(mem_size, input_dims)
        self.q_eval = DuelingDeepQNetwork(n_actions, fc1_dims, fc2_dims)
        self.q_next = DuelingDeepQNetwork(n_actions, fc1_dims, fc2_dims)

        self.q_eval.compile(optimizer=Adam(learning_rate=lr),
                            loss='mean_squared_error')
        # Just a formality; the target network is never trained directly,
        # its weights are copied from q_eval every `replace` learning steps.
        self.q_next.compile(optimizer=Adam(learning_rate=lr),
                            loss='mean_squared_error')

    def store_transition(self, state, action, reward, new_state, done):
        self.memory.store_transition(state, action, reward, new_state, done)

    def choose_action(self, observation):
        # Epsilon-greedy exploration.
        if np.random.random() < self.epsilon:
            action = np.random.choice(self.action_space)
        else:
            state = np.array([observation])
            # The advantage stream alone suffices for action selection, since
            # V is constant across actions for a given state.
            actions = self.q_eval.advantage(state)
            action = tf.math.argmax(actions, axis=1).numpy()[0]

        return action
    def learn(self):
        if self.memory.mem_cntr < self.batch_size:
            return

        # Sync the target network every `replace` learning steps.
        if self.learn_step_counter % self.replace == 0:
            self.q_next.set_weights(self.q_eval.get_weights())

        states, actions, rewards, states_, dones = \
            self.memory.sample_buffer(self.batch_size)

        q_pred = self.q_eval(states)
        q_next = self.q_next(states_)
        # .numpy() returns a copy, so q_target can be edited in place without
        # touching q_pred; targets equal the predictions everywhere except at
        # the actions actually taken.
        q_target = q_pred.numpy()
        # Double DQN: the online network selects the next action,
        # the target network evaluates it.
        max_actions = tf.math.argmax(self.q_eval(states_), axis=1)

        for idx, terminal in enumerate(dones):
            q_target[idx, actions[idx]] = rewards[idx] + \
                self.gamma*q_next[idx, max_actions[idx]]*(1-int(dones[idx]))
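        # A vectorized alternative to the loop above (a sketch, not part of the
        # original notebook); it builds the same Bellman targets without a
        # Python-level loop over the batch:
        #   batch_index = np.arange(self.batch_size)
        #   q_target[batch_index, actions] = rewards + \
        #       self.gamma * q_next.numpy()[batch_index, max_actions.numpy()] * (1 - dones)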
        self.q_eval.train_on_batch(states, q_target)

        # Linear epsilon decay down to eps_min.
        self.epsilon = self.epsilon - self.eps_dec if self.epsilon > \
            self.eps_min else self.eps_min

        self.learn_step_counter += 1
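

# ---------------------------------------------------------------------------
# Usage sketch (not part of the original notebook): a minimal training loop,
# assuming the classic OpenAI Gym API (gym < 0.26, where env.step returns four
# values) and a CartPole-v1 environment. The hyperparameter values and episode
# count below are illustrative assumptions only.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    import gym

    env = gym.make('CartPole-v1')
    agent = Agent(lr=1e-3, gamma=0.99, n_actions=env.action_space.n,
                  epsilon=1.0, batch_size=64,
                  input_dims=env.observation_space.shape)

    for episode in range(500):
        observation = env.reset()
        done, score = False, 0.0
        while not done:
            action = agent.choose_action(observation)
            observation_, reward, done, info = env.step(action)
            agent.store_transition(observation, action, reward,
                                   observation_, done)
            agent.learn()
            observation = observation_
            score += reward
        print('episode {} score {:.1f} epsilon {:.3f}'.format(
            episode, score, agent.epsilon))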