From 22b18baf3fb6bf32d7038a37c18f1f0467e0a2d0 Mon Sep 17 00:00:00 2001
From: eriklindernoren
Date: Sat, 23 Dec 2017 21:22:33 +0100
Subject: [PATCH] Renamed 'acc_grad' to 'accum_grad' to reduce ambiguity

---
 mlfromscratch/deep_learning/layers.py         | 118 +++++++++---------
 mlfromscratch/deep_learning/neural_network.py |   3 +-
 .../supervised_learning/naive_bayes.py        |  12 +-
 .../particle_swarm_optimization.py            |   7 +-
 4 files changed, 73 insertions(+), 67 deletions(-)

diff --git a/mlfromscratch/deep_learning/layers.py b/mlfromscratch/deep_learning/layers.py
index 10da5c29..a8153696 100644
--- a/mlfromscratch/deep_learning/layers.py
+++ b/mlfromscratch/deep_learning/layers.py
@@ -26,10 +26,10 @@ def forward_pass(self, X, training):
         """ Propogates the signal forward in the network """
         raise NotImplementedError()
 
-    def backward_pass(self, acc_grad):
+    def backward_pass(self, accum_grad):
         """ Propogates the accumulated gradient backwards in the network.
         If the has trainable weights then these weights are also tuned in this method.
-        As input (acc_grad) it receives the gradient with respect to the output of the layer and
+        As input (accum_grad) it receives the gradient with respect to the output of the layer and
         returns the gradient with respect to the output of the previous layer.
         """
         raise NotImplementedError()
@@ -73,14 +73,14 @@ def forward_pass(self, X, training=True):
         self.layer_input = X
         return X.dot(self.W) + self.w0
 
-    def backward_pass(self, acc_grad):
+    def backward_pass(self, accum_grad):
         # Save weights used during forwards pass
         W = self.W
 
         if self.trainable:
             # Calculate gradient w.r.t layer weights
-            grad_w = self.layer_input.T.dot(acc_grad)
-            grad_w0 = np.sum(acc_grad, axis=0, keepdims=True)
+            grad_w = self.layer_input.T.dot(accum_grad)
+            grad_w0 = np.sum(accum_grad, axis=0, keepdims=True)
 
             # Update the layer weights
             self.W = self.W_opt.update(self.W, grad_w)
@@ -88,8 +88,8 @@ def backward_pass(self, acc_grad):
 
         # Return accumulated gradient for next layer
         # Calculated based on the weights used during the forward pass
-        acc_grad = acc_grad.dot(W.T)
-        return acc_grad
+        accum_grad = accum_grad.dot(W.T)
+        return accum_grad
 
     def output_shape(self):
         return (self.n_units, )
@@ -160,8 +160,8 @@ def forward_pass(self, X, training=True):
 
         return self.outputs
 
-    def backward_pass(self, acc_grad):
-        _, timesteps, _ = acc_grad.shape
+    def backward_pass(self, accum_grad):
+        _, timesteps, _ = accum_grad.shape
 
         # Variables where we save the accumulated gradient w.r.t each parameter
         grad_U = np.zeros_like(self.U)
@@ -169,16 +169,16 @@ def backward_pass(self, acc_grad):
         grad_W = np.zeros_like(self.W)
         # The gradient w.r.t the layer input.
         # Will be passed on to the previous layer in the network
-        acc_grad_next = np.zeros_like(acc_grad)
+        accum_grad_next = np.zeros_like(accum_grad)
 
         # Back Propagation Through Time
        for t in reversed(range(timesteps)):
             # Update gradient w.r.t V at time step t
-            grad_V += acc_grad[:, t].T.dot(self.states[:, t])
+            grad_V += accum_grad[:, t].T.dot(self.states[:, t])
             # Calculate the gradient w.r.t the state input
-            grad_wrt_state = acc_grad[:, t].dot(self.V) * self.activation.gradient(self.state_input[:, t])
+            grad_wrt_state = accum_grad[:, t].dot(self.V) * self.activation.gradient(self.state_input[:, t])
             # Gradient w.r.t the layer input
-            acc_grad_next[:, t] = grad_wrt_state.dot(self.U)
+            accum_grad_next[:, t] = grad_wrt_state.dot(self.U)
             # Update gradient w.r.t W and U by backprop. from time step t for at most
             # self.bptt_trunc number of time steps
             for t_ in reversed(np.arange(max(0, t - self.bptt_trunc), t+1)):
@@ -192,7 +192,7 @@ def backward_pass(self, acc_grad):
         self.V = self.V_opt.update(self.V, grad_V)
         self.W = self.W_opt.update(self.W, grad_W)
 
-        return acc_grad_next
+        return accum_grad_next
 
     def output_shape(self):
         return self.input_shape
@@ -253,31 +253,31 @@ def forward_pass(self, X, training=True):
         # Redistribute axises so that batch size comes first
         return output.transpose(3,0,1,2)
 
-    def backward_pass(self, acc_grad):
+    def backward_pass(self, accum_grad):
         # Reshape accumulated gradient into column shape
-        acc_grad = acc_grad.transpose(1, 2, 3, 0).reshape(self.n_filters, -1)
+        accum_grad = accum_grad.transpose(1, 2, 3, 0).reshape(self.n_filters, -1)
 
         if self.trainable:
             # Take dot product between column shaped accum. gradient and column shape
             # layer input to determine the gradient at the layer with respect to layer weights
-            grad_w = acc_grad.dot(self.X_col.T).reshape(self.W.shape)
+            grad_w = accum_grad.dot(self.X_col.T).reshape(self.W.shape)
             # The gradient with respect to bias terms is the sum similarly to in Dense layer
-            grad_w0 = np.sum(acc_grad, axis=1, keepdims=True)
+            grad_w0 = np.sum(accum_grad, axis=1, keepdims=True)
 
             # Update the layers weights
             self.W = self.W_opt.update(self.W, grad_w)
             self.w0 = self.w0_opt.update(self.w0, grad_w0)
 
         # Recalculate the gradient which will be propogated back to prev. layer
-        acc_grad = self.W_col.T.dot(acc_grad)
+        accum_grad = self.W_col.T.dot(accum_grad)
         # Reshape from column shape to image shape
-        acc_grad = column_to_image(acc_grad,
+        accum_grad = column_to_image(accum_grad,
                                 self.layer_input.shape,
                                 self.filter_shape,
                                 stride=self.stride,
                                 output_shape=self.padding)
 
-        return acc_grad
+        return accum_grad
 
     def output_shape(self):
         channels, height, width = self.input_shape
@@ -331,7 +331,7 @@ def forward_pass(self, X, training=True):
 
         return output
 
-    def backward_pass(self, acc_grad):
+    def backward_pass(self, accum_grad):
 
         # Save parameters used during the forward pass
         gamma = self.gamma
@@ -339,22 +339,22 @@ def backward_pass(self, acc_grad):
         # If the layer is trainable the parameters are updated
         if self.trainable:
             X_norm = self.X_centered * self.stddev_inv
-            grad_gamma = np.sum(acc_grad * X_norm, axis=0)
-            grad_beta = np.sum(acc_grad, axis=0)
+            grad_gamma = np.sum(accum_grad * X_norm, axis=0)
+            grad_beta = np.sum(accum_grad, axis=0)
 
             self.gamma = self.gamma_opt.update(self.gamma, grad_gamma)
             self.beta = self.beta_opt.update(self.beta, grad_beta)
 
-        batch_size = acc_grad.shape[0]
+        batch_size = accum_grad.shape[0]
 
         # The gradient of the loss with respect to the layer inputs (use weights from forward pass)
-        acc_grad = (1 / batch_size) * gamma * self.stddev_inv * (
-            batch_size * acc_grad
-            - np.sum(acc_grad, axis=0)
-            - self.X_centered * self.stddev_inv**2 * np.sum(acc_grad * self.X_centered, axis=0)
+        accum_grad = (1 / batch_size) * gamma * self.stddev_inv * (
+            batch_size * accum_grad
+            - np.sum(accum_grad, axis=0)
+            - self.X_centered * self.stddev_inv**2 * np.sum(accum_grad * self.X_centered, axis=0)
             )
 
-        return acc_grad
+        return accum_grad
 
     def output_shape(self):
         return self.input_shape
@@ -387,18 +387,18 @@ def forward_pass(self, X, training=True):
 
         return output
 
-    def backward_pass(self, acc_grad):
-        batch_size, _, _, _ = acc_grad.shape
+    def backward_pass(self, accum_grad):
+        batch_size, _, _, _ = accum_grad.shape
         channels, height, width = self.input_shape
-        acc_grad = acc_grad.transpose(2, 3, 0, 1).ravel()
+        accum_grad = accum_grad.transpose(2, 3, 0, 1).ravel()
 
         # MaxPool or AveragePool specific method
-        acc_grad_col = self._pool_backward(acc_grad)
+        accum_grad_col = self._pool_backward(accum_grad)
 
-        acc_grad = column_to_image(acc_grad_col, (batch_size * channels, 1, height, width), self.pool_shape, self.stride, 0)
-        acc_grad = acc_grad.reshape((batch_size,) + self.input_shape)
+        accum_grad = column_to_image(accum_grad_col, (batch_size * channels, 1, height, width), self.pool_shape, self.stride, 0)
+        accum_grad = accum_grad.reshape((batch_size,) + self.input_shape)
 
-        return acc_grad
+        return accum_grad
 
     def output_shape(self):
         channels, height, width = self.input_shape
@@ -416,21 +416,21 @@ def _pool_forward(self, X_col):
         self.cache = arg_max
         return output
 
-    def _pool_backward(self, acc_grad):
-        acc_grad_col = np.zeros((np.prod(self.pool_shape), acc_grad.size))
+    def _pool_backward(self, accum_grad):
+        accum_grad_col = np.zeros((np.prod(self.pool_shape), accum_grad.size))
         arg_max = self.cache
-        acc_grad_col[arg_max, range(acc_grad.size)] = acc_grad
-        return acc_grad_col
+        accum_grad_col[arg_max, range(accum_grad.size)] = accum_grad
+        return accum_grad_col
 
 class AveragePooling2D(PoolingLayer):
     def _pool_forward(self, X_col):
         output = np.mean(X_col, axis=0)
         return output
 
-    def _pool_backward(self, acc_grad):
-        acc_grad_col = np.zeros((np.prod(self.pool_shape), acc_grad.size))
-        acc_grad_col[:, range(acc_grad.size)] = 1. / acc_grad_col.shape[0] * acc_grad
-        return acc_grad_col
+    def _pool_backward(self, accum_grad):
+        accum_grad_col = np.zeros((np.prod(self.pool_shape), accum_grad.size))
+        accum_grad_col[:, range(accum_grad.size)] = 1. / accum_grad_col.shape[0] * accum_grad
+        return accum_grad_col
 
 
 class ConstantPadding2D(Layer):
@@ -463,11 +463,11 @@ def forward_pass(self, X, training=True):
             constant_values=self.padding_value)
         return output
 
-    def backward_pass(self, acc_grad):
+    def backward_pass(self, accum_grad):
         pad_top, pad_left = self.padding[0][0], self.padding[1][0]
         height, width = self.input_shape[1], self.input_shape[2]
-        acc_grad = acc_grad[:, :, pad_top:pad_top+height, pad_left:pad_left+width]
-        return acc_grad
+        accum_grad = accum_grad[:, :, pad_top:pad_top+height, pad_left:pad_left+width]
+        return accum_grad
 
     def output_shape(self):
         new_height = self.input_shape[1] + np.sum(self.padding[0])
@@ -507,8 +507,8 @@ def forward_pass(self, X, training=True):
         self.prev_shape = X.shape
         return X.reshape((X.shape[0], -1))
 
-    def backward_pass(self, acc_grad):
-        return acc_grad.reshape(self.prev_shape)
+    def backward_pass(self, accum_grad):
+        return accum_grad.reshape(self.prev_shape)
 
     def output_shape(self):
         return (np.prod(self.input_shape),)
@@ -535,10 +535,10 @@ def forward_pass(self, X, training=True):
         X_new = X.repeat(self.size[0], axis=2).repeat(self.size[1], axis=3)
         return X_new
 
-    def backward_pass(self, acc_grad):
+    def backward_pass(self, accum_grad):
         # Down sample input to previous shape
-        acc_grad = acc_grad[:, :, ::self.size[0], ::self.size[1]]
-        return acc_grad
+        accum_grad = accum_grad[:, :, ::self.size[0], ::self.size[1]]
+        return accum_grad
 
     def output_shape(self):
         channels, height, width = self.input_shape
@@ -563,8 +563,8 @@ def forward_pass(self, X, training=True):
         self.prev_shape = X.shape
         return X.reshape((X.shape[0], ) + self.shape)
 
-    def backward_pass(self, acc_grad):
-        return acc_grad.reshape(self.prev_shape)
+    def backward_pass(self, accum_grad):
+        return accum_grad.reshape(self.prev_shape)
 
     def output_shape(self):
         return self.shape
@@ -594,8 +594,8 @@ def forward_pass(self, X, training=True):
             c = self._mask
         return X * c
 
-    def backward_pass(self, acc_grad):
-        return acc_grad * self._mask
+    def backward_pass(self, accum_grad):
+        return accum_grad * self._mask
 
     def output_shape(self):
         return self.input_shape
@@ -632,8 +632,8 @@ def forward_pass(self, X, training=True):
         self.layer_input = X
         return self.activation_func(X)
 
-    def backward_pass(self, acc_grad):
-        return acc_grad * self.activation_func.gradient(self.layer_input)
+    def backward_pass(self, accum_grad):
+        return accum_grad * self.activation_func.gradient(self.layer_input)
 
     def output_shape(self):
         return self.input_shape
diff --git a/mlfromscratch/deep_learning/neural_network.py b/mlfromscratch/deep_learning/neural_network.py
index 84f08ae9..d27cdfb3 100644
--- a/mlfromscratch/deep_learning/neural_network.py
+++ b/mlfromscratch/deep_learning/neural_network.py
@@ -97,9 +97,8 @@ def _forward_pass(self, X, training=True):
 
     def _backward_pass(self, loss_grad):
         """ Propagate the gradient 'backwards' and update the weights in each layer """
-        acc_grad = loss_grad
         for layer in reversed(self.layers):
-            acc_grad = layer.backward_pass(acc_grad)
+            loss_grad = layer.backward_pass(loss_grad)
 
     def summary(self, name="Model Summary"):
         # Print model name
diff --git a/mlfromscratch/supervised_learning/naive_bayes.py b/mlfromscratch/supervised_learning/naive_bayes.py
index 5851278d..6781f8cb 100644
--- a/mlfromscratch/supervised_learning/naive_bayes.py
+++ b/mlfromscratch/supervised_learning/naive_bayes.py
@@ -38,8 +38,11 @@ def _calculate_prior(self, c):
         return n_class_instances / n_total_instances
 
     def _classify(self, sample):
-        """ Classification using Bayes Rule P(Y|X) = P(X|Y)*P(Y)/P(X)
+        """ Classification using Bayes Rule P(Y|X) = P(X|Y)*P(Y)/P(X),
+            or Posterior = Likelihood * Prior / Scaling Factor
+        P(Y|X) - The posterior is the probability that sample x is of class y given the
+                 feature values of x being distributed according to distribution of y and the prior.
         P(X|Y) - Likelihood of data X given class distribution Y.
                  Gaussian distribution (given by _calculate_likelihood)
         P(Y) - Prior (given by _calculate_prior)
@@ -52,17 +55,16 @@ def _classify(self, sample):
         posteriors = []
         # Go through list of classes
         for i, c in enumerate(self.classes):
+            # Initialize posterior as prior
             posterior = self._calculate_prior(c)
             # Naive assumption (independence):
             # P(x1,x2,x3|Y) = P(x1|Y)*P(x2|Y)*P(x3|Y)
-            # Multiply with the class likelihoods
+            # Posterior is product of prior and likelihoods (ignoring scaling factor)
             for j, params in enumerate(self.parameters[i]):
                 sample_feature = sample[j]
-                # Determine P(x|Y)
+                # Likelihood of sample x given distribution of y
                 likelihood = self._calculate_likelihood(params["mean"], params["var"], sample_feature)
-                # Multiply with the accumulated probability
                 posterior *= likelihood
-            # Total posterior = P(Y)*P(x1|Y)*P(x2|Y)*...*P(xN|Y)
             posteriors.append(posterior)
         # Return the class with the largest posterior probability
         index_of_max = np.argmax(posteriors)
diff --git a/mlfromscratch/supervised_learning/particle_swarm_optimization.py b/mlfromscratch/supervised_learning/particle_swarm_optimization.py
index b5bb61dc..31bf382c 100644
--- a/mlfromscratch/supervised_learning/particle_swarm_optimization.py
+++ b/mlfromscratch/supervised_learning/particle_swarm_optimization.py
@@ -21,7 +21,12 @@ class ParticleSwarmOptimizedNN():
     Neural Network Training Using Particle Swarm Optimization
     https://visualstudiomagazine.com/articles/2013/12/01/neural-network-training-using-particle-swarm-optimization.aspx
     """
-    def __init__(self, population_size, model_builder, inertia_weight=0.8, cognitive_weight=2, social_weight=2, max_velocity=10):
+    def __init__(self, population_size,
+                        model_builder,
+                        inertia_weight=0.8,
+                        cognitive_weight=2,
+                        social_weight=2,
+                        max_velocity=20):
         self.population_size = population_size
         self.model_builder = model_builder
         self.best_individual = None
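
For context, a minimal standalone sketch (not taken from the patched files) of the convention the rename documents: each layer's backward_pass receives the accumulated gradient with respect to its output and returns the gradient with respect to its input, and the network threads that value through the layers in reverse, just as the simplified _backward_pass loop does. The ToyDense class and the plain-SGD update below are illustrative stand-ins, not the library's Dense layer or optimizer objects.

import numpy as np

class ToyDense:
    """Simplified stand-in for a dense layer using the backward_pass(accum_grad) convention."""
    def __init__(self, n_in, n_out, lr=0.01):
        self.W = 0.1 * np.random.randn(n_in, n_out)
        self.lr = lr

    def forward_pass(self, X):
        self.layer_input = X
        return X.dot(self.W)

    def backward_pass(self, accum_grad):
        W = self.W                              # weights used during the forward pass
        grad_w = self.layer_input.T.dot(accum_grad)
        self.W = self.W - self.lr * grad_w      # plain SGD in place of an optimizer object
        return accum_grad.dot(W.T)              # gradient w.r.t. the layer input

layers = [ToyDense(4, 8), ToyDense(8, 2)]

X = np.random.randn(5, 4)
output = X
for layer in layers:
    output = layer.forward_pass(output)

# Pretend gradient of some loss w.r.t. the network output,
# threaded backwards through the layers in reverse order.
loss_grad = output - np.ones_like(output)
for layer in reversed(layers):
    loss_grad = layer.backward_pass(loss_grad)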
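
Similarly, a small throwaway snippet (again not part of the patch) that only prints the time window visited by the RNN layer's truncated-BPTT inner loop, using made-up values for timesteps and bptt_trunc:

import numpy as np

timesteps, bptt_trunc = 6, 3   # made-up values for illustration
for t in reversed(range(timesteps)):
    window = [int(t_) for t_ in reversed(np.arange(max(0, t - bptt_trunc), t + 1))]
    print(t, window)
# 5 [5, 4, 3, 2]
# 4 [4, 3, 2, 1]
# 3 [3, 2, 1, 0]
# 2 [2, 1, 0]
# 1 [1, 0]
# 0 [0]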
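
And a short self-contained sketch of the unnormalised posterior the naive_bayes docstring describes: the prior multiplied by one Gaussian likelihood per feature, with the scaling factor P(X) ignored. The class labels, priors, and (mean, var) parameters below are made up, and gaussian_likelihood only mirrors the general form of _calculate_likelihood.

import math

def gaussian_likelihood(mean, var, x):
    # Univariate Gaussian density, same general form as _calculate_likelihood
    coeff = 1.0 / math.sqrt(2.0 * math.pi * var)
    return coeff * math.exp(-((x - mean) ** 2) / (2.0 * var))

sample = [1.2, 0.4]
classes = {
    # label: (prior, [(mean, var) for each feature])
    "a": (0.5, [(1.0, 0.5), (0.0, 0.3)]),
    "b": (0.5, [(2.5, 0.5), (1.5, 0.3)]),
}

posteriors = {}
for label, (prior, params) in classes.items():
    posterior = prior                                   # initialize posterior as prior
    for (mean, var), x in zip(params, sample):
        posterior *= gaussian_likelihood(mean, var, x)  # multiply in each feature likelihood
    posteriors[label] = posterior                       # scaling factor P(X) is ignored

prediction = max(posteriors, key=posteriors.get)        # class with the largest posterior
print(prediction, posteriors)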