-
Notifications
You must be signed in to change notification settings - Fork 0
/
backprop_network.py
294 lines (147 loc) · 7.15 KB
/
backprop_network.py
1
"""backprop_network.py

A small fully connected feed-forward neural network trained with mini-batch
stochastic gradient descent and backpropagation.

Hidden layers use the sigmoid activation.  The last layer produces raw scores
that are passed through a softmax, and training minimises the cross-entropy
between that softmax and a one-hot target.
"""

import math
import random

import numpy as np

try:
    import matplotlib.pyplot as plt
except ImportError:
    # Plotting is only needed at the end of ``Network.SGD``; everything else
    # (forward pass, backprop, loss, accuracy) works without matplotlib.
    plt = None


class Network(object):
    """Feed-forward network with sigmoid hidden layers and a softmax output."""

    def __init__(self, sizes):
        """Create a network with randomly initialised parameters.

        ``sizes`` lists the number of neurons per layer; ``[2, 3, 1]`` is a
        three-layer network with 2 inputs, 3 hidden neurons and 1 output.
        Biases and weights are drawn from a standard Gaussian (mean 0,
        variance 1).  The first layer is the input layer and has no biases,
        since biases are only used when computing the outputs of later layers.
        """
        self.num_layers = len(sizes)
        self.sizes = sizes
        self.biases = [np.random.randn(y, 1) for y in sizes[1:]]
        self.weights = [np.random.randn(y, x)
                        for x, y in zip(sizes[:-1], sizes[1:])]

    def SGD(self, training_data, epochs, mini_batch_size, learning_rate,
            test_data):
        """Train the network with mini-batch stochastic gradient descent.

        ``training_data`` is an iterable of ``(x, y)`` tuples where ``x`` is
        an input column vector and ``y`` the one-hot target.  ``test_data`` is
        an iterable of ``(x, label)`` tuples where ``label`` is the class
        index.  The test accuracy is printed before training and after every
        epoch; when training finishes, the per-epoch training accuracy, test
        accuracy and training loss are plotted (if matplotlib is available).
        """
        # Work on private lists: the data is traversed several times per
        # epoch, its length is needed, and shuffling must not reorder the
        # caller's own sequence.
        training_data = list(training_data)
        test_data = list(test_data)

        print("Initial test accuracy: {0}".format(
            self.one_label_accuracy(test_data)))
        n = len(training_data)
        train_acc = []
        train_loss = []
        test_acc = []
        for j in range(epochs):
            # Shuffle the list that is actually sliced below, so each epoch
            # sees the samples in a new order.
            random.shuffle(training_data)
            mini_batches = [training_data[k:k + mini_batch_size]
                            for k in range(0, n, mini_batch_size)]
            for mini_batch in mini_batches:
                self.update_mini_batch(mini_batch, learning_rate)
            train_acc.append(self.one_hot_accuracy(training_data))
            train_loss.append(self.loss(training_data))
            # Evaluate the test set once and reuse the value for both the
            # history and the progress message.
            epoch_test_acc = self.one_label_accuracy(test_data)
            test_acc.append(epoch_test_acc)
            print("Epoch {0} test accuracy: {1}".format(j, epoch_test_acc))
        self._plot_history(train_acc, test_acc, train_loss)

    def _plot_history(self, train_acc, test_acc, train_loss):
        """Plot the per-epoch training curves collected by ``SGD``."""
        if plt is None:
            print("matplotlib is not available; skipping training plots")
            return
        epoch_axis = list(range(len(train_acc)))
        plt.ylabel("Accuracy")
        plt.xlabel("Epochs")
        plt.plot(epoch_axis, train_acc, 'b-', label='Training Accuracy')
        plt.plot(epoch_axis, test_acc, 'r-', label='Test Accuracy')
        # 'p-' is a pentagon marker joined by a solid line.
        plt.plot(epoch_axis, train_loss, 'p-', label='Training Loss')
        plt.legend()
        plt.show()

    def update_mini_batch(self, mini_batch, learning_rate):
        """Apply one gradient-descent step computed from ``mini_batch``.

        ``mini_batch`` is a list of ``(x, y)`` tuples.  The gradients of all
        samples are summed and the parameters are moved by
        ``learning_rate / len(mini_batch)`` times that sum.  An empty batch
        leaves the network unchanged.
        """
        batch_size = len(mini_batch)
        if batch_size == 0:
            # Nothing to average over; avoid dividing by zero.
            return
        nabla_b = [np.zeros(b.shape) for b in self.biases]
        nabla_w = [np.zeros(w.shape) for w in self.weights]
        for x, y in mini_batch:
            delta_nabla_b, delta_nabla_w = self.backprop(x, y)
            nabla_b = [nb + dnb for nb, dnb in zip(nabla_b, delta_nabla_b)]
            nabla_w = [nw + dnw for nw, dnw in zip(nabla_w, delta_nabla_w)]
        step = learning_rate / batch_size
        self.weights = [w - step * nw
                        for w, nw in zip(self.weights, nabla_w)]
        self.biases = [b - step * nb
                       for b, nb in zip(self.biases, nabla_b)]

    def backprop(self, x, y):
        """Return the gradient of the loss for a single sample.

        ``x`` is the input column vector and ``y`` the one-hot target column
        vector.  The result is a tuple ``(db, dw)`` of two lists, ordered from
        the first non-input layer to the output layer, whose entries have the
        same shapes as ``self.biases`` and ``self.weights`` respectively.
        """
        # Forward pass.  Index 0 of both lists is the input itself so that
        # list positions coincide with layer numbers.
        zs = [x]
        activations = [x]
        for b, w in zip(self.biases, self.weights):
            z = np.dot(w, activations[-1]) + b
            zs.append(z)
            activations.append(sigmoid(z))

        # Error at the output layer: derivative of the cross-entropy loss
        # with respect to the pre-softmax scores.
        delta = self.loss_derivative_wr_output_activations(zs[-1], y)

        # Backward pass, from the output layer down to layer 1.
        db = []
        dw = []
        for layer in range(self.num_layers - 1, 0, -1):
            a_prev = activations[layer - 1]
            db.append(delta)
            dw.append(np.dot(delta, a_prev.T))
            if layer > 1:
                # sigmoid'(z) == a * (1 - a); reuse the stored activation
                # instead of evaluating the sigmoid again.  The input layer
                # (layer 0) has no parameters, so no delta is needed for it.
                delta = (np.dot(self.weights[layer - 1].T, delta)
                         * (a_prev * (1 - a_prev)))
        db.reverse()
        dw.reverse()
        return db, dw

    def one_label_accuracy(self, data):
        """Return the accuracy on ``data`` whose labels are class indices."""
        output_results = [
            (np.argmax(self.network_output_before_softmax(x)), y)
            for (x, y) in data]
        hits = sum(int(predicted == label)
                   for (predicted, label) in output_results)
        return hits / float(len(data))

    def one_hot_accuracy(self, data):
        """Return the accuracy on ``data`` whose labels are one-hot vectors."""
        output_results = [
            (np.argmax(self.network_output_before_softmax(x)), np.argmax(y))
            for (x, y) in data]
        hits = sum(int(predicted == label)
                   for (predicted, label) in output_results)
        return hits / float(len(data))

    def network_output_before_softmax(self, x):
        """Return the raw output-layer scores (before softmax) for input ``x``."""
        last = len(self.weights) - 1
        for layer, (b, w) in enumerate(zip(self.biases, self.weights)):
            x = np.dot(w, x) + b
            if layer != last:
                # Only hidden layers are squashed; the output layer stays
                # linear because softmax is applied separately.
                x = sigmoid(x)
        return x

    def loss(self, data):
        """Return the mean cross-entropy loss over ``data``.

        ``data`` is a sequence of ``(x, y)`` tuples with one-hot ``y``.
        """
        loss_list = []
        for (x, y) in data:
            scores = self.network_output_before_softmax(x)
            # Take the log of the softmax directly (log-sum-exp) so that
            # very confident predictions do not produce log(0) = -inf.
            log_probs = _log_softmax(scores)
            loss_list.append(np.dot(-log_probs.transpose(), y).flatten()[0])
        return sum(loss_list) / float(len(data))

    def output_softmax(self, output_activations):
        """Return the softmax of the pre-softmax scores.

        The maximum score is subtracted before exponentiating; this leaves
        the result mathematically unchanged but prevents overflow in
        ``np.exp`` for large scores.
        """
        shifted = output_activations - np.max(output_activations)
        output_exp = np.exp(shifted)
        return output_exp / output_exp.sum()

    def loss_derivative_wr_output_activations(self, output_activations, y):
        """Return the derivative of the loss w.r.t. the pre-softmax scores."""
        return self.output_softmax(output_activations) - y


def _log_softmax(scores):
    """Return ``log(softmax(scores))`` computed via the log-sum-exp trick."""
    shifted = scores - np.max(scores)
    return shifted - np.log(np.sum(np.exp(shifted)))


def sigmoid(z):
    """The sigmoid function."""
    return 1.0 / (1.0 + np.exp(-z))


def sigmoid_derivative(z):
    """Derivative of the sigmoid function."""
    s = sigmoid(z)
    return s * (1 - s)