From 22b18baf3fb6bf32d7038a37c18f1f0467e0a2d0 Mon Sep 17 00:00:00 2001
From: eriklindernoren
Date: Sat, 23 Dec 2017 21:22:33 +0100
Subject: [PATCH] Renamed 'acc_grad' to 'accum_grad' to reduce ambiguity

---
 mlfromscratch/deep_learning/layers.py         | 118 +++++++++---------
 mlfromscratch/deep_learning/neural_network.py |   3 +-
 .../supervised_learning/naive_bayes.py        |  12 +-
 .../particle_swarm_optimization.py            |   7 +-
 4 files changed, 73 insertions(+), 67 deletions(-)

diff --git a/mlfromscratch/deep_learning/layers.py b/mlfromscratch/deep_learning/layers.py
index 10da5c29..a8153696 100644
--- a/mlfromscratch/deep_learning/layers.py
+++ b/mlfromscratch/deep_learning/layers.py
@@ -26,10 +26,10 @@ def forward_pass(self, X, training):
         """ Propogates the signal forward in the network """
         raise NotImplementedError()
 
-    def backward_pass(self, acc_grad):
+    def backward_pass(self, accum_grad):
         """ Propogates the accumulated gradient backwards in the network.
         If the has trainable weights then these weights are also tuned in this method.
-        As input (acc_grad) it receives the gradient with respect to the output of the layer and
+        As input (accum_grad) it receives the gradient with respect to the output of the layer and
         returns the gradient with respect to the output of the previous layer.
         """
         raise NotImplementedError()
@@ -73,14 +73,14 @@ def forward_pass(self, X, training=True):
         self.layer_input = X
         return X.dot(self.W) + self.w0
 
-    def backward_pass(self, acc_grad):
+    def backward_pass(self, accum_grad):
         # Save weights used during forwards pass
         W = self.W
 
         if self.trainable:
             # Calculate gradient w.r.t layer weights
-            grad_w = self.layer_input.T.dot(acc_grad)
-            grad_w0 = np.sum(acc_grad, axis=0, keepdims=True)
+            grad_w = self.layer_input.T.dot(accum_grad)
+            grad_w0 = np.sum(accum_grad, axis=0, keepdims=True)
 
             # Update the layer weights
             self.W = self.W_opt.update(self.W, grad_w)
@@ -88,8 +88,8 @@ def backward_pass(self, acc_grad):
 
         # Return accumulated gradient for next layer
         # Calculated based on the weights used during the forward pass
-        acc_grad = acc_grad.dot(W.T)
-        return acc_grad
+        accum_grad = accum_grad.dot(W.T)
+        return accum_grad
 
     def output_shape(self):
         return (self.n_units, )
@@ -160,8 +160,8 @@ def forward_pass(self, X, training=True):
 
         return self.outputs
 
-    def backward_pass(self, acc_grad):
-        _, timesteps, _ = acc_grad.shape
+    def backward_pass(self, accum_grad):
+        _, timesteps, _ = accum_grad.shape
 
         # Variables where we save the accumulated gradient w.r.t each parameter
         grad_U = np.zeros_like(self.U)
@@ -169,16 +169,16 @@ def backward_pass(self, acc_grad):
         grad_W = np.zeros_like(self.W)
         # The gradient w.r.t the layer input.
         # Will be passed on to the previous layer in the network
-        acc_grad_next = np.zeros_like(acc_grad)
+        accum_grad_next = np.zeros_like(accum_grad)
 
         # Back Propagation Through Time
        for t in reversed(range(timesteps)):
             # Update gradient w.r.t V at time step t
-            grad_V += acc_grad[:, t].T.dot(self.states[:, t])
+            grad_V += accum_grad[:, t].T.dot(self.states[:, t])
             # Calculate the gradient w.r.t the state input
-            grad_wrt_state = acc_grad[:, t].dot(self.V) * self.activation.gradient(self.state_input[:, t])
+            grad_wrt_state = accum_grad[:, t].dot(self.V) * self.activation.gradient(self.state_input[:, t])
             # Gradient w.r.t the layer input
-            acc_grad_next[:, t] = grad_wrt_state.dot(self.U)
+            accum_grad_next[:, t] = grad_wrt_state.dot(self.U)
             # Update gradient w.r.t W and U by backprop. from time step t for at most
             # self.bptt_trunc number of time steps
             for t_ in reversed(np.arange(max(0, t - self.bptt_trunc), t+1)):
@@ -192,7 +192,7 @@ def backward_pass(self, acc_grad):
         self.V = self.V_opt.update(self.V, grad_V)
         self.W = self.W_opt.update(self.W, grad_W)
 
-        return acc_grad_next
+        return accum_grad_next
 
     def output_shape(self):
         return self.input_shape
@@ -253,31 +253,31 @@ def forward_pass(self, X, training=True):
         # Redistribute axises so that batch size comes first
         return output.transpose(3,0,1,2)
 
-    def backward_pass(self, acc_grad):
+    def backward_pass(self, accum_grad):
         # Reshape accumulated gradient into column shape
-        acc_grad = acc_grad.transpose(1, 2, 3, 0).reshape(self.n_filters, -1)
+        accum_grad = accum_grad.transpose(1, 2, 3, 0).reshape(self.n_filters, -1)
 
         if self.trainable:
             # Take dot product between column shaped accum. gradient and column shape
             # layer input to determine the gradient at the layer with respect to layer weights
-            grad_w = acc_grad.dot(self.X_col.T).reshape(self.W.shape)
+            grad_w = accum_grad.dot(self.X_col.T).reshape(self.W.shape)
             # The gradient with respect to bias terms is the sum similarly to in Dense layer
-            grad_w0 = np.sum(acc_grad, axis=1, keepdims=True)
+            grad_w0 = np.sum(accum_grad, axis=1, keepdims=True)
 
             # Update the layers weights
             self.W = self.W_opt.update(self.W, grad_w)
             self.w0 = self.w0_opt.update(self.w0, grad_w0)
 
         # Recalculate the gradient which will be propogated back to prev. layer
-        acc_grad = self.W_col.T.dot(acc_grad)
+        accum_grad = self.W_col.T.dot(accum_grad)
         # Reshape from column shape to image shape
-        acc_grad = column_to_image(acc_grad,
+        accum_grad = column_to_image(accum_grad,
                                 self.layer_input.shape,
                                 self.filter_shape,
                                 stride=self.stride,
                                 output_shape=self.padding)
 
-        return acc_grad
+        return accum_grad
 
     def output_shape(self):
         channels, height, width = self.input_shape
@@ -331,7 +331,7 @@ def forward_pass(self, X, training=True):
 
         return output
 
-    def backward_pass(self, acc_grad):
+    def backward_pass(self, accum_grad):
 
         # Save parameters used during the forward pass
         gamma = self.gamma
@@ -339,22 +339,22 @@ def backward_pass(self, acc_grad):
         # If the layer is trainable the parameters are updated
         if self.trainable:
             X_norm = self.X_centered * self.stddev_inv
-            grad_gamma = np.sum(acc_grad * X_norm, axis=0)
-            grad_beta = np.sum(acc_grad, axis=0)
+            grad_gamma = np.sum(accum_grad * X_norm, axis=0)
+            grad_beta = np.sum(accum_grad, axis=0)
 
             self.gamma = self.gamma_opt.update(self.gamma, grad_gamma)
             self.beta = self.beta_opt.update(self.beta, grad_beta)
 
-        batch_size = acc_grad.shape[0]
+        batch_size = accum_grad.shape[0]
 
         # The gradient of the loss with respect to the layer inputs (use weights from forward pass)
-        acc_grad = (1 / batch_size) * gamma * self.stddev_inv * (
-            batch_size * acc_grad
-            - np.sum(acc_grad, axis=0)
-            - self.X_centered * self.stddev_inv**2 * np.sum(acc_grad * self.X_centered, axis=0)
+        accum_grad = (1 / batch_size) * gamma * self.stddev_inv * (
+            batch_size * accum_grad
+            - np.sum(accum_grad, axis=0)
+            - self.X_centered * self.stddev_inv**2 * np.sum(accum_grad * self.X_centered, axis=0)
             )
 
-        return acc_grad
+        return accum_grad
 
     def output_shape(self):
         return self.input_shape
@@ -387,18 +387,18 @@ def forward_pass(self, X, training=True):
 
         return output
 
-    def backward_pass(self, acc_grad):
-        batch_size, _, _, _ = acc_grad.shape
+    def backward_pass(self, accum_grad):
+        batch_size, _, _, _ = accum_grad.shape
         channels, height, width = self.input_shape
-        acc_grad = acc_grad.transpose(2, 3, 0, 1).ravel()
+        accum_grad = accum_grad.transpose(2, 3, 0, 1).ravel()
 
         # MaxPool or AveragePool specific method
-        acc_grad_col = self._pool_backward(acc_grad)
+        accum_grad_col = self._pool_backward(accum_grad)
 
-        acc_grad = column_to_image(acc_grad_col, (batch_size * channels, 1, height, width), self.pool_shape, self.stride, 0)
-        acc_grad = acc_grad.reshape((batch_size,) + self.input_shape)
+        accum_grad = column_to_image(accum_grad_col, (batch_size * channels, 1, height, width), self.pool_shape, self.stride, 0)
+        accum_grad = accum_grad.reshape((batch_size,) + self.input_shape)
 
-        return acc_grad
+        return accum_grad
 
     def output_shape(self):
         channels, height, width = self.input_shape
@@ -416,21 +416,21 @@ def _pool_forward(self, X_col):
         self.cache = arg_max
         return output
 
-    def _pool_backward(self, acc_grad):
-        acc_grad_col = np.zeros((np.prod(self.pool_shape), acc_grad.size))
+    def _pool_backward(self, accum_grad):
+        accum_grad_col = np.zeros((np.prod(self.pool_shape), accum_grad.size))
         arg_max = self.cache
-        acc_grad_col[arg_max, range(acc_grad.size)] = acc_grad
-        return acc_grad_col
+        accum_grad_col[arg_max, range(accum_grad.size)] = accum_grad
+        return accum_grad_col
 
 class AveragePooling2D(PoolingLayer):
     def _pool_forward(self, X_col):
         output = np.mean(X_col, axis=0)
         return output
 
-    def _pool_backward(self, acc_grad):
-        acc_grad_col = np.zeros((np.prod(self.pool_shape), acc_grad.size))
-        acc_grad_col[:, range(acc_grad.size)] = 1. / acc_grad_col.shape[0] * acc_grad
-        return acc_grad_col
+    def _pool_backward(self, accum_grad):
+        accum_grad_col = np.zeros((np.prod(self.pool_shape), accum_grad.size))
+        accum_grad_col[:, range(accum_grad.size)] = 1. / accum_grad_col.shape[0] * accum_grad
+        return accum_grad_col
 
 
 class ConstantPadding2D(Layer):
@@ -463,11 +463,11 @@ def forward_pass(self, X, training=True):
             constant_values=self.padding_value)
         return output
 
-    def backward_pass(self, acc_grad):
+    def backward_pass(self, accum_grad):
         pad_top, pad_left = self.padding[0][0], self.padding[1][0]
         height, width = self.input_shape[1], self.input_shape[2]
-        acc_grad = acc_grad[:, :, pad_top:pad_top+height, pad_left:pad_left+width]
-        return acc_grad
+        accum_grad = accum_grad[:, :, pad_top:pad_top+height, pad_left:pad_left+width]
+        return accum_grad
 
     def output_shape(self):
         new_height = self.input_shape[1] + np.sum(self.padding[0])
@@ -507,8 +507,8 @@ def forward_pass(self, X, training=True):
         self.prev_shape = X.shape
         return X.reshape((X.shape[0], -1))
 
-    def backward_pass(self, acc_grad):
-        return acc_grad.reshape(self.prev_shape)
+    def backward_pass(self, accum_grad):
+        return accum_grad.reshape(self.prev_shape)
 
     def output_shape(self):
         return (np.prod(self.input_shape),)
@@ -535,10 +535,10 @@ def forward_pass(self, X, training=True):
         X_new = X.repeat(self.size[0], axis=2).repeat(self.size[1], axis=3)
         return X_new
 
-    def backward_pass(self, acc_grad):
+    def backward_pass(self, accum_grad):
         # Down sample input to previous shape
-        acc_grad = acc_grad[:, :, ::self.size[0], ::self.size[1]]
-        return acc_grad
+        accum_grad = accum_grad[:, :, ::self.size[0], ::self.size[1]]
+        return accum_grad
 
     def output_shape(self):
         channels, height, width = self.input_shape
@@ -563,8 +563,8 @@ def forward_pass(self, X, training=True):
         self.prev_shape = X.shape
         return X.reshape((X.shape[0], ) + self.shape)
 
-    def backward_pass(self, acc_grad):
-        return acc_grad.reshape(self.prev_shape)
+    def backward_pass(self, accum_grad):
+        return accum_grad.reshape(self.prev_shape)
 
     def output_shape(self):
         return self.shape
@@ -594,8 +594,8 @@ def forward_pass(self, X, training=True):
             c = self._mask
         return X * c
 
-    def backward_pass(self, acc_grad):
-        return acc_grad * self._mask
+    def backward_pass(self, accum_grad):
+        return accum_grad * self._mask
 
     def output_shape(self):
         return self.input_shape
@@ -632,8 +632,8 @@ def forward_pass(self, X, training=True):
         self.layer_input = X
         return self.activation_func(X)
 
-    def backward_pass(self, acc_grad):
-        return acc_grad * self.activation_func.gradient(self.layer_input)
+    def backward_pass(self, accum_grad):
+        return accum_grad * self.activation_func.gradient(self.layer_input)
 
     def output_shape(self):
         return self.input_shape
diff --git a/mlfromscratch/deep_learning/neural_network.py b/mlfromscratch/deep_learning/neural_network.py
index 84f08ae9..d27cdfb3 100644
--- a/mlfromscratch/deep_learning/neural_network.py
+++ b/mlfromscratch/deep_learning/neural_network.py
@@ -97,9 +97,8 @@ def _forward_pass(self, X, training=True):
 
     def _backward_pass(self, loss_grad):
         """ Propagate the gradient 'backwards' and update the weights in each layer """
-        acc_grad = loss_grad
         for layer in reversed(self.layers):
-            acc_grad = layer.backward_pass(acc_grad)
+            loss_grad = layer.backward_pass(loss_grad)
 
     def summary(self, name="Model Summary"):
         # Print model name
diff --git a/mlfromscratch/supervised_learning/naive_bayes.py b/mlfromscratch/supervised_learning/naive_bayes.py
index 5851278d..6781f8cb 100644
--- a/mlfromscratch/supervised_learning/naive_bayes.py
+++ b/mlfromscratch/supervised_learning/naive_bayes.py
@@ -38,8 +38,11 @@ def _calculate_prior(self, c):
         return n_class_instances / n_total_instances
 
     def _classify(self, sample):
-        """ Classification using Bayes Rule P(Y|X) = P(X|Y)*P(Y)/P(X)
+        """ Classification using Bayes Rule P(Y|X) = P(X|Y)*P(Y)/P(X),
+            or Posterior = Likelihood * Prior / Scaling Factor
+        P(Y|X) - The posterior is the probability that sample x is of class y given the
+                 feature values of x being distributed according to distribution of y and the prior.
         P(X|Y) - Likelihood of data X given class distribution Y.
                  Gaussian distribution (given by _calculate_likelihood)
         P(Y) - Prior (given by _calculate_prior)
@@ -52,17 +55,16 @@ def _classify(self, sample):
         posteriors = []
         # Go through list of classes
         for i, c in enumerate(self.classes):
+            # Initialize posterior as prior
             posterior = self._calculate_prior(c)
             # Naive assumption (independence):
             # P(x1,x2,x3|Y) = P(x1|Y)*P(x2|Y)*P(x3|Y)
-            # Multiply with the class likelihoods
+            # Posterior is product of prior and likelihoods (ignoring scaling factor)
             for j, params in enumerate(self.parameters[i]):
                 sample_feature = sample[j]
-                # Determine P(x|Y)
+                # Likelihood of sample x given distribution of y
                 likelihood = self._calculate_likelihood(params["mean"], params["var"], sample_feature)
-                # Multiply with the accumulated probability
                 posterior *= likelihood
-            # Total posterior = P(Y)*P(x1|Y)*P(x2|Y)*...*P(xN|Y)
             posteriors.append(posterior)
         # Return the class with the largest posterior probability
         index_of_max = np.argmax(posteriors)
diff --git a/mlfromscratch/supervised_learning/particle_swarm_optimization.py b/mlfromscratch/supervised_learning/particle_swarm_optimization.py
index b5bb61dc..31bf382c 100644
--- a/mlfromscratch/supervised_learning/particle_swarm_optimization.py
+++ b/mlfromscratch/supervised_learning/particle_swarm_optimization.py
@@ -21,7 +21,12 @@ class ParticleSwarmOptimizedNN():
     Neural Network Training Using Particle Swarm Optimization
     https://visualstudiomagazine.com/articles/2013/12/01/neural-network-training-using-particle-swarm-optimization.aspx
     """
-    def __init__(self, population_size, model_builder, inertia_weight=0.8, cognitive_weight=2, social_weight=2, max_velocity=10):
+    def __init__(self, population_size,
+                        model_builder,
+                        inertia_weight=0.8,
+                        cognitive_weight=2,
+                        social_weight=2,
+                        max_velocity=20):
         self.population_size = population_size
         self.model_builder = model_builder
         self.best_individual = None
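
For context, a minimal standalone sketch (not taken from the patched files) of the convention the rename documents: each layer's backward_pass receives the accumulated gradient with respect to its output and returns the gradient with respect to its input, and the network threads that value through the layers in reverse, just as the simplified _backward_pass loop does. The ToyDense class and the plain-SGD update below are illustrative stand-ins, not the library's Dense layer or optimizer objects.

import numpy as np

class ToyDense:
    """Simplified stand-in for a dense layer using the backward_pass(accum_grad) convention."""
    def __init__(self, n_in, n_out, lr=0.01):
        self.W = 0.1 * np.random.randn(n_in, n_out)
        self.lr = lr

    def forward_pass(self, X):
        self.layer_input = X
        return X.dot(self.W)

    def backward_pass(self, accum_grad):
        W = self.W                              # weights used during the forward pass
        grad_w = self.layer_input.T.dot(accum_grad)
        self.W = self.W - self.lr * grad_w      # plain SGD in place of an optimizer object
        return accum_grad.dot(W.T)              # gradient w.r.t. the layer input

layers = [ToyDense(4, 8), ToyDense(8, 2)]

X = np.random.randn(5, 4)
output = X
for layer in layers:
    output = layer.forward_pass(output)

# Pretend gradient of some loss w.r.t. the network output,
# threaded backwards through the layers in reverse order.
loss_grad = output - np.ones_like(output)
for layer in reversed(layers):
    loss_grad = layer.backward_pass(loss_grad)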
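
Similarly, a small throwaway snippet (again not part of the patch) that only prints the time window visited by the RNN layer's truncated-BPTT inner loop, using made-up values for timesteps and bptt_trunc:

import numpy as np

timesteps, bptt_trunc = 6, 3   # made-up values for illustration
for t in reversed(range(timesteps)):
    window = [int(t_) for t_ in reversed(np.arange(max(0, t - bptt_trunc), t + 1))]
    print(t, window)
# 5 [5, 4, 3, 2]
# 4 [4, 3, 2, 1]
# 3 [3, 2, 1, 0]
# 2 [2, 1, 0]
# 1 [1, 0]
# 0 [0]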
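
And a short self-contained sketch of the unnormalised posterior the naive_bayes docstring describes: the prior multiplied by one Gaussian likelihood per feature, with the scaling factor P(X) ignored. The class labels, priors, and (mean, var) parameters below are made up, and gaussian_likelihood only mirrors the general form of _calculate_likelihood.

import math

def gaussian_likelihood(mean, var, x):
    # Univariate Gaussian density, same general form as _calculate_likelihood
    coeff = 1.0 / math.sqrt(2.0 * math.pi * var)
    return coeff * math.exp(-((x - mean) ** 2) / (2.0 * var))

sample = [1.2, 0.4]
classes = {
    # label: (prior, [(mean, var) for each feature])
    "a": (0.5, [(1.0, 0.5), (0.0, 0.3)]),
    "b": (0.5, [(2.5, 0.5), (1.5, 0.3)]),
}

posteriors = {}
for label, (prior, params) in classes.items():
    posterior = prior                                   # initialize posterior as prior
    for (mean, var), x in zip(params, sample):
        posterior *= gaussian_likelihood(mean, var, x)  # multiply in each feature likelihood
    posteriors[label] = posterior                       # scaling factor P(X) is ignored

prediction = max(posteriors, key=posteriors.get)        # class with the largest posterior
print(prediction, posteriors)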