# simple_rnn_count_1s.py
# (forked from ivan-vasilev/advanced-deep-learning-with-python)

import numpy as np

def step(s, x, U, W):
    # Single recurrent step: s_{t+1} = x_t * U + s_t * W (linear, no activation)
    return x * U + s * W

def forward(x, U, W):
    # Number of samples in the mini-batch
    number_of_samples = len(x)

    # Length of each sample
    sequence_length = len(x[0])

    # Initialize the state activation for each sample along the sequence
    s = np.zeros((number_of_samples, sequence_length + 1))

    # Update the states over the sequence
    for t in range(0, sequence_length):
        s[:, t + 1] = step(s[:, t], x[:, t], U, W)  # step function

    return s
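
# A quick sanity check (an illustrative addition, not part of the original
# script): with U = 1 and W = 1 the recurrence s_{t+1} = x_t * U + s_t * W
# simply accumulates its inputs, so the final state equals the number of
# ones in the sequence. `_demo_x` is a hypothetical input chosen here.
_demo_x = np.array([[0, 1, 1, 0, 1]])
assert forward(_demo_x, 1, 1)[0, -1] == 3  # the sequence contains three ones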

def backward(x, s, y, W):
    sequence_length = len(x[0])

    # The network output is just the last state of the sequence
    s_t = s[:, -1]

    # Compute the gradient of the MSE cost w.r.t. the final state
    gS = 2 * (s_t - y)

    # Set the gradient accumulations to 0
    gU, gW = 0, 0

    # Accumulate the gradients backwards through time
    for k in range(sequence_length, 0, -1):
        # Compute the parameter gradients and accumulate the results
        gU += np.sum(gS * x[:, k - 1])
        gW += np.sum(gS * s[:, k - 1])

        # Propagate the gradient to the state of the previous time step
        gS = gS * W

    return gU, gW
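
# Optional numerical check (an illustrative addition, not in the original
# script): compare the analytic gradients from backward() against central
# finite differences of the loss. The helper name `_numeric_grads` and the
# epsilon value are choices made here for the sketch.
def _numeric_grads(x, y, U, W, eps=1e-6):
    def loss_at(u, w):
        s_final = forward(x, u, w)[0, -1]
        return (y[0] - s_final) ** 2
    gU = (loss_at(U + eps, W) - loss_at(U - eps, W)) / (2 * eps)
    gW = (loss_at(U, W + eps) - loss_at(U, W - eps)) / (2 * eps)
    return gU, gW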

def train(x, y, epochs, learning_rate=0.0005):
    """Train the network"""
    # Set initial parameters
    weights = (-2, 0)  # (U, W)

    # Accumulate the losses and their respective gradients
    losses = list()
    gradients_u = list()
    gradients_w = list()

    # Perform iterative gradient descent
    for i in range(epochs):
        # Perform the forward and backward passes to get the gradients
        s = forward(x, weights[0], weights[1])

        # Compute the loss
        loss = (y[0] - s[-1, -1]) ** 2

        # Store the loss value for later display
        losses.append(loss)

        gradients = backward(x, s, y, weights[1])
        gradients_u.append(gradients[0])
        gradients_w.append(gradients[1])

        # Update each parameter `p` by p = p - (gradient * learning_rate);
        # `gp` is the gradient of parameter `p`
        weights = tuple((p - gp * learning_rate) for p, gp in zip(weights, gradients))
        print(weights)

    return np.array(losses), np.array(gradients_u), np.array(gradients_w)
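
# Note on exploding gradients (an illustrative addition, not in the original
# script): in backward(), gS is multiplied by W once per time step, so for long
# sequences with |W| > 1 the accumulated gradients can grow without bound. One
# common remedy is gradient clipping; a minimal sketch of a clipped update step
# follows (the threshold value of 5.0 is an arbitrary choice for this sketch).
def _clipped_update(weights, gradients, learning_rate, threshold=5.0):
    norm = np.sqrt(sum(g ** 2 for g in gradients))
    if norm > threshold:
        # Rescale the gradient vector so its norm equals the threshold
        gradients = tuple(g * threshold / norm for g in gradients)
    return tuple(p - gp * learning_rate for p, gp in zip(weights, gradients))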

def plot_training(losses, gradients_u, gradients_w):
    import matplotlib.pyplot as plt

    # Drop nan values and the final (possibly infinite) entry
    losses = losses[~np.isnan(losses)][:-1]
    gradients_u = gradients_u[~np.isnan(gradients_u)][:-1]
    gradients_w = gradients_w[~np.isnan(gradients_w)][:-1]

    # Plot the gradients of U and W
    fig, ax1 = plt.subplots(figsize=(5, 3.4))
    ax1.set_ylim(-3, 600)
    ax1.set_xlabel('epochs')
    ax1.plot(gradients_u, label='grad U', color='blue', linestyle=':')
    ax1.plot(gradients_w, label='grad W', color='red', linestyle='--')
    ax1.legend(loc='upper left')

    # Instantiate a second axis that shares the same x-axis
    # and plot the loss on it
    ax2 = ax1.twinx()

    # This y-limit suits the exploding gradients scenario;
    # adjust or remove it for normal training
    ax2.set_ylim(-3, 200)
    ax2.plot(losses, label='Loss', color='green')
    ax2.tick_params(axis='y', labelcolor='green')
    ax2.legend(loc='upper right')

    fig.tight_layout()
    plt.show()

# Use these inputs for normal training
# (the first dimension represents the mini-batch)
# x = np.array([[0, 0, 0, 0, 1, 0, 1, 0, 1, 0]])
# y = np.array([3])

# Use these inputs to reproduce the exploding gradients scenario
x = np.array([[0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0]])
y = np.array([12])

print("Sum of ones RNN from scratch")

losses, gradients_u, gradients_w = train(x, y, epochs=150)
plot_training(losses, gradients_u, gradients_w)