# rudder.py (forked from ml-jku/rudder-demonstration-code)
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from widis_lstm_tools.nn import LSTMLayer  # LSTM with configurable gates/initializations (widis-lstm-tools package)
from torch.autograd import Variable
from torch.nn import MSELoss
def to_one_hot(y, n_dims=None):
""" Take integer y (tensor or variable) with n dims and convert it to 1-hot representation with n+1 dims. """
y_tensor = y.data if isinstance(y, Variable) else y
y_tensor = y_tensor.type(torch.LongTensor).view(-1, 1)
n_dims = n_dims if n_dims is not None else int(torch.max(y_tensor)) + 1
y_one_hot = torch.zeros(y_tensor.size()[0], n_dims).scatter_(1, y_tensor, 1)
y_one_hot = y_one_hot.view(*y.shape, -1)
return Variable(y_one_hot) if isinstance(y, Variable) else y_one_hot
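# Example: to_one_hot(torch.tensor([0, 2]), n_dims=3) -> tensor([[1., 0., 0.], [0., 0., 1.]])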
class RRLSTM(nn.Module):
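    """LSTM that predicts the episode return at every timestep; differences of consecutive
    predictions are used to redistribute the delayed reward (RUDDER return decomposition)."""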
def __init__(self, state_input_size, n_actions, buffer, n_units, lstm_lr, l2_regularization,
return_scaling, lstm_batch_size=128, continuous_pred_factor=0.5):
super(RRLSTM, self).__init__()
self.buffer = buffer
self.return_scaling = return_scaling
self.lstm_batch_size = lstm_batch_size
self.continuous_pred_factor = continuous_pred_factor
self.n_actions = n_actions
        # Forget gate and output gate are deactivated, as in the Atari experiments (see Appendix S4.2.1 of the RUDDER paper).
self.lstm = LSTMLayer(in_features=state_input_size + n_actions, out_features=n_units,
w_ci=(lambda *args, **kwargs: nn.init.normal_(mean=0, std=0.1, *args, **kwargs), False),
w_ig=(False, lambda *args, **kwargs: nn.init.normal_(mean=0, std=0.1, *args, **kwargs)),
w_og=False,
b_ci=lambda *args, **kwargs: nn.init.normal_(mean=0, *args, **kwargs),
b_ig=lambda *args, **kwargs: nn.init.normal_(mean=-3, *args, **kwargs),
b_og=False,
a_out=lambda x: x
)
self.linear = nn.Linear(n_units, 1)
self.optimizer = optim.Adam(self.parameters(), lr=lstm_lr, weight_decay=l2_regularization)
self.lstm_updates = 0
def forward(self, input):
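        """Preprocess the state features, one-hot encode the actions, and return the LSTM's
        return prediction for every timestep."""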
states, actions = input
# Prepare input features
repaired = states[:, :, 0:1]
transport_cond = states[:, :, 1:3]
brands = to_one_hot(states[:, :, 3], 4)
time = states[:, :, 4:] / states.shape[1]
states = torch.cat([repaired, transport_cond, brands, time], 2)
actions = to_one_hot(actions, self.n_actions)
actions = torch.cat((actions, torch.zeros((actions.shape[0], 1, self.n_actions))), 1)
input = torch.cat((states, actions), 2)
# Run the lstm
lstm_out = self.lstm.forward(input, return_all_seq_pos=True)
return self.linear(lstm_out[0])
def redistribute_reward(self, states, actions):
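        """Redistribute the episode return over timesteps as differences of consecutive
        return predictions, scaled back to the original reward range."""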
# Prepare LSTM inputs
states_var = Variable(torch.FloatTensor(states)).detach()
delta_states = torch.cat([states_var[:, 0:1, :], states_var[:, 1:, :] - states_var[:, :-1, :]], dim=1)
actions_var = Variable(torch.FloatTensor(actions)).detach()
# Calculate LSTM predictions
lstm_out = self.forward([delta_states, actions_var])
pred_g0 = torch.cat([torch.zeros_like(lstm_out[:, 0:1, :]), lstm_out], dim=1)[:, :-1, :]
# Difference of predictions of two consecutive timesteps.
redistributed_reward = pred_g0[:, 1:, 0] - pred_g0[:, :-1, 0]
# Scale reward back up as LSTM targets have been scaled.
new_reward = redistributed_reward * self.return_scaling
return new_reward
    # Trains the LSTM until the running average of the main loss drops below 0.15.
def train(self, episode):
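        """Sample episodes from the lesson buffer and update the LSTM until the running
        average of the main (final-prediction) loss falls below 0.15."""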
i = 0
loss_average = 0.3
mse_loss = MSELoss(reduction="none")
while loss_average > 0.15:
i += 1
self.lstm_updates += 1
self.optimizer.zero_grad()
# Get samples from the lesson buffer and prepare them.
            states, actions, rewards, length = self.buffer.sample(self.lstm_batch_size)
            length = length[:, 0]
states_var = Variable(torch.FloatTensor(states)).detach()
actions_var = Variable(torch.FloatTensor(actions)).detach()
rewards_var = Variable(torch.FloatTensor(rewards)).detach()
# Scale the returns as they might have high / low values.
returns = torch.sum(rewards_var, 1, keepdim=True) / self.return_scaling
# Calculate differences of states
delta_states = torch.cat([states_var[:, 0:1, :], states_var[:, 1:, :] - states_var[:, :-1, :]], dim=1)
# Run the LSTM
lstm_out = self.forward([delta_states, actions_var])
predicted_G0 = lstm_out.squeeze()
# Loss calculations
all_timestep_loss = mse_loss(predicted_G0, returns.repeat(1, predicted_G0.size(1)))
# Loss at any position in the sequence
aux_loss = self.continuous_pred_factor * all_timestep_loss.mean()
# LSTM is mainly trained on getting the final prediction of g0 right.
            main_loss = all_timestep_loss[range(self.lstm_batch_size), length - 1].mean()
# LSTM update and loss tracking
lstm_loss = main_loss + aux_loss
lstm_loss.backward()
loss_np = lstm_loss.data.numpy()
main_loss_np = main_loss.data.numpy()
loss_average -= 0.01 * (loss_average - main_loss_np)
if main_loss_np > loss_average * 2:
loss_average = loss_np
self.optimizer.step()
class LessonBuffer:
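    """Ring buffer that stores complete episodes (states, actions, rewards, lengths)
    for training the reward-redistribution LSTM."""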
def __init__(self, size, max_time, n_features):
self.size = size
# Samples, time, features
self.states_buffer = np.empty(shape=(size, max_time + 1, n_features))
self.actions_buffer = np.empty(shape=(size, max_time))
self.rewards_buffer = np.empty(shape=(size, max_time))
self.lens_buffer = np.empty(shape=(size, 1), dtype=np.int32)
self.next_spot_to_add = 0
self.buffer_is_full = False
self.samples_since_last_training = 0
    # LSTM training only makes sense if the buffer contains sequences with different returns;
    # otherwise the LSTM could learn to ignore the input and just use the bias units.
def different_returns_encountered(self):
if self.buffer_is_full:
return np.unique(self.rewards_buffer[..., -1]).shape[0] > 1
else:
return np.unique(self.rewards_buffer[:self.next_spot_to_add, -1]).shape[0] > 1
    # We only start training once more than 256 episodes have been collected (or the buffer is full).
def full_enough(self):
return self.buffer_is_full or self.next_spot_to_add > 256
# Add a new episode to the buffer
def add(self, states, actions, rewards):
traj_length = states.shape[0]
next_ind = self.next_spot_to_add
self.next_spot_to_add = self.next_spot_to_add + 1
if self.next_spot_to_add >= self.size:
self.buffer_is_full = True
self.next_spot_to_add = self.next_spot_to_add % self.size
self.states_buffer[next_ind, :traj_length] = states.squeeze()
self.states_buffer[next_ind, traj_length:] = 0
        self.actions_buffer[next_ind, :traj_length - 1] = actions
        self.actions_buffer[next_ind, traj_length - 1:] = 0  # zero-pad the unused tail
        self.rewards_buffer[next_ind, :traj_length - 1] = rewards
        self.rewards_buffer[next_ind, traj_length - 1:] = 0
self.lens_buffer[next_ind] = traj_length
# Choose <batch_size> samples uniformly at random and return them.
def sample(self, batch_size):
self.samples_since_last_training = 0
if self.buffer_is_full:
indices = np.random.randint(0, self.size, batch_size)
else:
indices = np.random.randint(0, self.next_spot_to_add, batch_size)
return (self.states_buffer[indices, :, :], self.actions_buffer[indices, :],
self.rewards_buffer[indices, :], self.lens_buffer[indices, :])
def nograd(t):
return t.detach()
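
# Illustrative usage sketch; the sizes and feature dimensions below are assumptions
# chosen to be mutually consistent with the code above:
#   buffer = LessonBuffer(size=1024, max_time=50, n_features=5)
#   lstm = RRLSTM(state_input_size=8, n_actions=2, buffer=buffer, n_units=16,
#                 lstm_lr=1e-2, l2_regularization=1e-6, return_scaling=10)
#   buffer.add(states, actions, rewards)  # after every finished episode
#   if buffer.full_enough() and buffer.different_returns_encountered():
#       lstm.train(episode=None)
#       redistributed = lstm.redistribute_reward(states[None, ...], actions[None, ...])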