From 80c5449224189a7825fef2a8f5f278f86b07126e Mon Sep 17 00:00:00 2001
From: Oleksii Trekhleb
Date: Thu, 20 Dec 2018 07:54:14 +0200
Subject: [PATCH] Add multilayer perceptron.

---
 .../neural_network/multilayer_perceptron.py | 133 +++++++++++++---
 homemade/utils/hypothesis/__init__.py | 1 +
 homemade/utils/hypothesis/sigmoid_gradient.py | 7 +
 .../multilayer_perceptron_demo.ipynb | 148 +-----------------
 4 files changed, 124 insertions(+), 165 deletions(-)
 create mode 100644 homemade/utils/hypothesis/sigmoid_gradient.py

diff --git a/homemade/neural_network/multilayer_perceptron.py b/homemade/neural_network/multilayer_perceptron.py
index c4d6943..ddd3a61 100644
--- a/homemade/neural_network/multilayer_perceptron.py
+++ b/homemade/neural_network/multilayer_perceptron.py
@@ -1,7 +1,7 @@
 import numpy as np
 from scipy.optimize import minimize
 from ..utils.features import prepare_for_training
-from ..utils.hypothesis import sigmoid
+from ..utils.hypothesis import sigmoid, sigmoid_gradient
 
 
 class MultilayerPerceptron:
@@ -76,7 +76,7 @@ def gradient_descent(data, labels, initial_theta, layers, regularization_param,
             method='CG',
             # Function that will help to calculate gradient direction on each step.
             jac=lambda current_theta: MultilayerPerceptron.gradient_step(
-                data, labels, current_theta, regularization_param
+                data, labels, current_theta, layers, regularization_param
             ),
             # Record gradient descent progress for debugging.
             callback=lambda current_theta: cost_history.append(MultilayerPerceptron.cost_function(
@@ -94,20 +94,28 @@ def gradient_descent(data, labels, initial_theta, layers, regularization_param,
         return optimized_theta, cost_history
 
     @staticmethod
-    def gradient_step(unrolled_thetas, layers):
+    def gradient_step(data, labels, unrolled_thetas, layers, regularization_param):
         """Gradient step function.
 
         Computes the cost and gradient of the neural network for unrolled theta parameters.
 
-        :param unrolled_thetas: flat vector of model parameters
-        :param layers: model layers configuration
+        :param data: training set.
+        :param labels: training set labels.
+        :param unrolled_thetas: flat vector of model parameters.
+        :param layers: model layers configuration.
+        :param regularization_param: regularization parameter that helps to prevent model over-fitting.
         """
 
         # Reshape nn_params back into the matrix parameters.
         thetas = MultilayerPerceptron.thetas_roll(unrolled_thetas, layers)
 
         # Do backpropagation.
-        MultilayerPerceptron.back_propagation()
+        thetas_rolled_gradients = MultilayerPerceptron.back_propagation(
+            data, labels, thetas, layers, regularization_param
+        )
+
+        # Unroll thetas gradients.
+        return MultilayerPerceptron.thetas_unroll(thetas_rolled_gradients)
 
     @staticmethod
     def cost_function(data, labels, thetas, layers, regularization_param):
@@ -169,22 +177,107 @@ def feedforward_propagation(data, thetas, layers):
         num_examples = data.shape[0]
 
         # Input layer (l=1)
-        layer_in = data
+        in_layer_activation = data
 
         # Propagate to hidden layers.
         for layer_index in range(num_layers - 1):
             theta = thetas[layer_index]
-            layer_out = sigmoid(layer_in @ theta.T)
+            out_layer_activation = sigmoid(in_layer_activation @ theta.T)
             # Add bias units.
-            layer_out = np.hstack((np.ones((num_examples, 1)), layer_out))
-            layer_in = layer_out
+            out_layer_activation = np.hstack((np.ones((num_examples, 1)), out_layer_activation))
+            in_layer_activation = out_layer_activation
 
         # Output layer should not contain bias units.
-        return layer_in[:, 1:]
+        return in_layer_activation[:, 1:]
 
     @staticmethod
-    def back_propagation():
-        pass
+    def back_propagation(data, labels, thetas, layers, regularization_param):
+        """Backpropagation function."""
+
+        # Get total number of layers.
+        num_layers = len(layers)
+
+        # Get total number of training examples and features.
+        (num_examples, num_features) = data.shape
+
+        # Get the number of possible output labels.
+        num_label_types = layers[-1]
+
+        # Initialize big deltas: gradient values accumulated over all training examples that
+        # indicate how each theta needs to be changed.
+        deltas = {}
+        for layer_index in range(num_layers - 1):
+            in_count = layers[layer_index]
+            out_count = layers[layer_index + 1]
+            deltas[layer_index] = np.zeros((out_count, in_count + 1))
+
+        # Let's go through all examples.
+        for example_index in range(num_examples):
+            # We will store layer inputs and activations in order to re-use them later.
+            layers_inputs = {}
+            layers_activations = {}
+
+            # Set up the input layer activations.
+            layer_activation = data[example_index, :].reshape((num_features, 1))
+            layers_activations[0] = layer_activation
+
+            # Perform a feedforward pass for the current training example.
+            for layer_index in range(num_layers - 1):
+                layer_theta = thetas[layer_index]
+                layer_input = layer_theta @ layer_activation
+                layer_activation = np.vstack((np.array([[1]]), sigmoid(layer_input)))
+
+                layers_inputs[layer_index + 1] = layer_input
+                layers_activations[layer_index + 1] = layer_activation
+
+            # Remove bias units from the output activations.
+            output_layer_activation = layer_activation[1:, :]
+
+            # Calculate deltas.
+
+            # We don't calculate delta for the input layer because we do not
+            # associate any error with the input.
+            delta = {}
+
+            # Convert the label from a number to a vector (e.g. 5 to [0; 0; 0; 0; 0; 1; 0; 0; 0; 0]).
+            bitwise_label = np.zeros((num_label_types, 1))
+            bitwise_label[labels[example_index][0]] = 1
+
+            # Calculate delta for the output layer for the current training example.
+            delta[num_layers - 1] = output_layer_activation - bitwise_label
+
+            # Calculate small deltas for the hidden layers for the current training example.
+            # The loop goes backwards over the layers: L-1, L-2, ..., 2.
+            for layer_index in range(num_layers - 2, 0, -1):
+                layer_theta = thetas[layer_index]
+                next_delta = delta[layer_index + 1]
+                layer_input = layers_inputs[layer_index]
+
+                # Add a bias row to the layer input.
+                layer_input = np.vstack((np.array([[1]]), layer_input))
+
+                # Calculate the delta for the current layer and then remove the bias row from it.
+                delta[layer_index] = (layer_theta.T @ next_delta) * sigmoid_gradient(layer_input)
+                delta[layer_index] = delta[layer_index][1:, :]
+
+            # Accumulate the gradient (update big deltas).
+            for layer_index in range(num_layers - 1):
+                layer_delta = delta[layer_index + 1] @ layers_activations[layer_index].T
+                deltas[layer_index] = deltas[layer_index] + layer_delta
+
+        # Calculate the regularized gradient of the neural network cost function.
+        for layer_index in range(num_layers - 1):
+            # Remember that we should NOT be regularizing the first column of theta.
+            current_delta = deltas[layer_index]
+            current_delta = np.hstack((np.zeros((current_delta.shape[0], 1)), current_delta[:, 1:]))
+
+            # Calculate regularization.
+            regularization = (regularization_param / num_examples) * current_delta
+
+            # Regularize deltas.
+ deltas[layer_index] = (1 / num_examples) * deltas[layer_index] + regularization + + return deltas @staticmethod def thetas_init(layers, epsilon): @@ -208,9 +301,9 @@ def thetas_init(layers, epsilon): # Generate Thetas only for input and hidden layers. # There is no need to generate Thetas for the output layer. for layer_index in range(num_layers - 1): - layers_in = layers[layer_index] - layers_out = layers[layer_index + 1] - thetas[layer_index] = np.random.rand(layers_out, layers_in + 1) * 2 * epsilon - epsilon + in_count = layers[layer_index] + out_count = layers[layer_index + 1] + thetas[layer_index] = np.random.rand(out_count, in_count + 1) * 2 * epsilon - epsilon return thetas @@ -238,11 +331,11 @@ def thetas_roll(unrolled_thetas, layers): unrolled_shift = 0 for layer_index in range(num_layers - 1): - layers_in = layers[layer_index] - layers_out = layers[layer_index + 1] + in_count = layers[layer_index] + out_count = layers[layer_index + 1] - thetas_width = layers_in + 1 # We need to remember about bias unit. - thetas_height = layers_out + thetas_width = in_count + 1 # We need to remember about bias unit. + thetas_height = out_count thetas_volume = thetas_width * thetas_height # We need to remember about bias units when rolling up params. diff --git a/homemade/utils/hypothesis/__init__.py b/homemade/utils/hypothesis/__init__.py index 39daf95..c5b0718 100644 --- a/homemade/utils/hypothesis/__init__.py +++ b/homemade/utils/hypothesis/__init__.py @@ -1 +1,2 @@ from .sigmoid import sigmoid +from .sigmoid_gradient import sigmoid_gradient diff --git a/homemade/utils/hypothesis/sigmoid_gradient.py b/homemade/utils/hypothesis/sigmoid_gradient.py new file mode 100644 index 0000000..2920a57 --- /dev/null +++ b/homemade/utils/hypothesis/sigmoid_gradient.py @@ -0,0 +1,7 @@ +from .sigmoid import sigmoid + + +def sigmoid_gradient(z): + """Computes the gradient of the sigmoid function evaluated at z.""" + + return sigmoid(z) * (1 - sigmoid(z)) diff --git a/notebooks/neural_network/multilayer_perceptron_demo.ipynb b/notebooks/neural_network/multilayer_perceptron_demo.ipynb index 85951a1..554c87f 100644 --- a/notebooks/neural_network/multilayer_perceptron_demo.ipynb +++ b/notebooks/neural_network/multilayer_perceptron_demo.ipynb @@ -489,157 +489,15 @@ }, { "cell_type": "code", - "execution_count": 45, + "execution_count": 125, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "{0: array([[-0.0758517 , -0.10922462, 0.03927368, ..., -0.10116138,\n", - " -0.06562225, 0.08434671],\n", - " [ 0.0605631 , 0.07263726, -0.02723178, ..., -0.11961522,\n", - " 0.11282753, -0.02994306],\n", - " [ 0.11871606, 0.07256746, -0.01404285, ..., 0.10602606,\n", - " 0.10491185, -0.05867995],\n", - " ...,\n", - " [ 0.0919954 , -0.04510364, 0.03600806, ..., 0.11729077,\n", - " -0.05894698, 0.06732849],\n", - " [-0.11695147, 0.10688355, 0.0685417 , ..., -0.03359375,\n", - " -0.0944231 , 0.05190055],\n", - " [-0.01826086, 0.09777763, 0.08879194, ..., 0.0213223 ,\n", - " -0.01198016, 0.08821151]]), 1: array([[-0.0758517 , -0.10922462, 0.03927368, 0.04616654, -0.03550035,\n", - " -0.04301836, -0.05111002, -0.01779515, 0.01221125, 0.01600122,\n", - " -0.07813955, -0.04786295, 0.08427782, 0.09664291, 0.02393845,\n", - " -0.11691394, -0.09485863, 0.10624724, 0.04490283, -0.10627072,\n", - " 0.10434709, 0.09732701, 0.00282358, -0.00734285, -0.09995941,\n", - " -0.11459936],\n", - " [-0.06447097, 0.08244544, 0.1055696 , 0.07628717, 0.07190959,\n", - " -0.03122521, 0.11683788, -0.11039357, 
-0.06716706, 0.01594479,\n", - " -0.00133989, 0.03575698, 0.05225359, 0.00083552, 0.02347394,\n", - " -0.06132896, 0.05638961, -0.0044371 , -0.02010712, -0.10999934,\n", - " -0.04849349, 0.04404446, 0.07483567, -0.06039278, 0.04542441,\n", - " 0.08904211],\n", - " [-0.04615776, -0.07739299, -0.09182918, -0.02440465, -0.05477409,\n", - " 0.03588698, 0.02053208, -0.04363991, 0.04534481, 0.05309741,\n", - " -0.07172521, -0.01942062, -0.06344989, 0.09863689, -0.11781185,\n", - " 0.02971791, -0.02973962, 0.06304532, -0.07716626, -0.03389946,\n", - " -0.04426616, 0.03890041, -0.07181278, 0.02769418, 0.00820932,\n", - " 0.10949384],\n", - " [-0.0584883 , -0.05273799, -0.04073093, -0.08154635, 0.08999456,\n", - " -0.09110997, 0.09805592, 0.02330922, 0.07835466, -0.11295456,\n", - " -0.05768334, 0.00250513, -0.00909849, -0.00671458, 0.06267393,\n", - " 0.07735554, 0.05565781, -0.06221527, 0.10644233, 0.03333939,\n", - " 0.02334794, -0.01852243, 0.03946706, 0.11171577, 0.00829028,\n", - " -0.05008512],\n", - " [-0.00948572, 0.00763778, 0.07092984, 0.03798784, -0.07694375,\n", - " 0.05564401, 0.11472868, 0.11388296, 0.08657028, -0.01318174,\n", - " 0.02493628, 0.01862749, 0.01416905, -0.10815415, 0.08573075,\n", - " 0.02036101, 0.06934405, 0.11281956, 0.02856743, -0.06820671,\n", - " -0.08479958, 0.02668589, -0.05561203, 0.05716293, 0.11849236,\n", - " 0.05245313],\n", - " [ 0.11395225, -0.07448341, -0.11355455, 0.07997803, -0.02016351,\n", - " 0.02623673, -0.09786482, -0.08886998, -0.02424251, 0.06848556,\n", - " -0.11399175, 0.01630017, 0.00199946, 0.00148151, -0.03053501,\n", - " 0.05940618, -0.05865 , 0.06081712, -0.06157728, -0.11024059,\n", - " -0.0677528 , 0.06100844, 0.02996631, -0.03733193, 0.09442967,\n", - " 0.0271904 ],\n", - " [ 0.03050059, -0.03451764, 0.07158443, 0.0541165 , 0.01873904,\n", - " -0.05535262, 0.00458515, -0.09848468, -0.01277639, -0.10496153,\n", - " -0.06116952, -0.0284652 , 0.0300631 , -0.02659276, 0.09268343,\n", - " -0.08086429, -0.07301074, -0.03411321, 0.1054892 , 0.0424244 ,\n", - " 0.09827251, 0.03980845, -0.09431661, -0.0580831 , -0.04872072,\n", - " 0.106885 ],\n", - " [ 0.08076684, -0.00780408, 0.06917175, 0.10370648, -0.00244977,\n", - " -0.09103661, -0.03319441, -0.10700324, 0.03875014, -0.02056288,\n", - " -0.01949595, -0.05121848, 0.10714613, -0.00404258, 0.0173522 ,\n", - " -0.05759117, -0.08206716, 0.08263817, -0.00864864, -0.08316974,\n", - " 0.08279706, 0.04957311, 0.03934321, 0.05675562, 0.04299622,\n", - " 0.04064601],\n", - " [ 0.00825281, -0.07706374, -0.00922871, 0.05605853, 0.00982105,\n", - " -0.05653799, -0.06617444, -0.08152387, 0.09066151, 0.00207551,\n", - " -0.03963645, 0.09282233, 0.02758925, 0.01784172, 0.11217704,\n", - " 0.05094281, 0.08854876, -0.09565834, 0.00443037, -0.01511557,\n", - " 0.10326956, -0.06927156, -0.0166677 , 0.0913672 , 0.06746135,\n", - " -0.04688244],\n", - " [ 0.02260412, 0.00678681, 0.00549161, -0.11994145, 0.04870088,\n", - " -0.05051432, -0.1141186 , 0.06037819, 0.04170217, -0.0586402 ,\n", - " -0.10248884, 0.01742958, -0.01947546, 0.06129252, 0.07150439,\n", - " -0.06523626, 0.09166035, 0.09504693, -0.03253129, -0.06043063,\n", - " -0.0926532 , -0.11705144, 0.0379782 , -0.05661604, -0.11245252,\n", - " -0.1087203 ]])}\n", - "{0: array([[-0.0758517 , -0.10922462, 0.03927368, ..., -0.10116138,\n", - " -0.06562225, 0.08434671],\n", - " [ 0.0605631 , 0.07263726, -0.02723178, ..., -0.11961522,\n", - " 0.11282753, -0.02994306],\n", - " [ 0.11871606, 0.07256746, -0.01404285, ..., 0.10602606,\n", - " 0.10491185, 
-0.05867995],\n", - " ...,\n", - " [ 0.0919954 , -0.04510364, 0.03600806, ..., 0.11729077,\n", - " -0.05894698, 0.06732849],\n", - " [-0.11695147, 0.10688355, 0.0685417 , ..., -0.03359375,\n", - " -0.0944231 , 0.05190055],\n", - " [-0.01826086, 0.09777763, 0.08879194, ..., 0.0213223 ,\n", - " -0.01198016, 0.08821151]]), 1: array([[-0.0758517 , -0.10922462, 0.03927368, 0.04616654, -0.03550035,\n", - " -0.04301836, -0.05111002, -0.01779515, 0.01221125, 0.01600122,\n", - " -0.07813955, -0.04786295, 0.08427782, 0.09664291, 0.02393845,\n", - " -0.11691394, -0.09485863, 0.10624724, 0.04490283, -0.10627072,\n", - " 0.10434709, 0.09732701, 0.00282358, -0.00734285, -0.09995941,\n", - " -0.11459936],\n", - " [-0.06447097, 0.08244544, 0.1055696 , 0.07628717, 0.07190959,\n", - " -0.03122521, 0.11683788, -0.11039357, -0.06716706, 0.01594479,\n", - " -0.00133989, 0.03575698, 0.05225359, 0.00083552, 0.02347394,\n", - " -0.06132896, 0.05638961, -0.0044371 , -0.02010712, -0.10999934,\n", - " -0.04849349, 0.04404446, 0.07483567, -0.06039278, 0.04542441,\n", - " 0.08904211],\n", - " [-0.04615776, -0.07739299, -0.09182918, -0.02440465, -0.05477409,\n", - " 0.03588698, 0.02053208, -0.04363991, 0.04534481, 0.05309741,\n", - " -0.07172521, -0.01942062, -0.06344989, 0.09863689, -0.11781185,\n", - " 0.02971791, -0.02973962, 0.06304532, -0.07716626, -0.03389946,\n", - " -0.04426616, 0.03890041, -0.07181278, 0.02769418, 0.00820932,\n", - " 0.10949384],\n", - " [-0.0584883 , -0.05273799, -0.04073093, -0.08154635, 0.08999456,\n", - " -0.09110997, 0.09805592, 0.02330922, 0.07835466, -0.11295456,\n", - " -0.05768334, 0.00250513, -0.00909849, -0.00671458, 0.06267393,\n", - " 0.07735554, 0.05565781, -0.06221527, 0.10644233, 0.03333939,\n", - " 0.02334794, -0.01852243, 0.03946706, 0.11171577, 0.00829028,\n", - " -0.05008512],\n", - " [-0.00948572, 0.00763778, 0.07092984, 0.03798784, -0.07694375,\n", - " 0.05564401, 0.11472868, 0.11388296, 0.08657028, -0.01318174,\n", - " 0.02493628, 0.01862749, 0.01416905, -0.10815415, 0.08573075,\n", - " 0.02036101, 0.06934405, 0.11281956, 0.02856743, -0.06820671,\n", - " -0.08479958, 0.02668589, -0.05561203, 0.05716293, 0.11849236,\n", - " 0.05245313],\n", - " [ 0.11395225, -0.07448341, -0.11355455, 0.07997803, -0.02016351,\n", - " 0.02623673, -0.09786482, -0.08886998, -0.02424251, 0.06848556,\n", - " -0.11399175, 0.01630017, 0.00199946, 0.00148151, -0.03053501,\n", - " 0.05940618, -0.05865 , 0.06081712, -0.06157728, -0.11024059,\n", - " -0.0677528 , 0.06100844, 0.02996631, -0.03733193, 0.09442967,\n", - " 0.0271904 ],\n", - " [ 0.03050059, -0.03451764, 0.07158443, 0.0541165 , 0.01873904,\n", - " -0.05535262, 0.00458515, -0.09848468, -0.01277639, -0.10496153,\n", - " -0.06116952, -0.0284652 , 0.0300631 , -0.02659276, 0.09268343,\n", - " -0.08086429, -0.07301074, -0.03411321, 0.1054892 , 0.0424244 ,\n", - " 0.09827251, 0.03980845, -0.09431661, -0.0580831 , -0.04872072,\n", - " 0.106885 ],\n", - " [ 0.08076684, -0.00780408, 0.06917175, 0.10370648, -0.00244977,\n", - " -0.09103661, -0.03319441, -0.10700324, 0.03875014, -0.02056288,\n", - " -0.01949595, -0.05121848, 0.10714613, -0.00404258, 0.0173522 ,\n", - " -0.05759117, -0.08206716, 0.08263817, -0.00864864, -0.08316974,\n", - " 0.08279706, 0.04957311, 0.03934321, 0.05675562, 0.04299622,\n", - " 0.04064601],\n", - " [ 0.00825281, -0.07706374, -0.00922871, 0.05605853, 0.00982105,\n", - " -0.05653799, -0.06617444, -0.08152387, 0.09066151, 0.00207551,\n", - " -0.03963645, 0.09282233, 0.02758925, 0.01784172, 0.11217704,\n", - " 0.05094281, 
0.08854876, -0.09565834, 0.00443037, -0.01511557,\n", - " 0.10326956, -0.06927156, -0.0166677 , 0.0913672 , 0.06746135,\n", - " -0.04688244],\n", - " [ 0.02260412, 0.00678681, 0.00549161, -0.11994145, 0.04870088,\n", - " -0.05051432, -0.1141186 , 0.06037819, 0.04170217, -0.0586402 ,\n", - " -0.10248884, 0.01742958, -0.01947546, 0.06129252, 0.07150439,\n", - " -0.06523626, 0.09166035, 0.09504693, -0.03253129, -0.06043063,\n", - " -0.0926532 , -0.11705144, 0.0379782 , -0.05661604, -0.11245252,\n", - " -0.1087203 ]])}\n" + "[1.54168371e-06 0.00000000e+00 0.00000000e+00 ... 2.03102558e-01\n", + " 1.36934643e-01 1.77471528e-01]\n" ] } ],
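As a quick sanity check for the new sigmoid_gradient helper introduced by this patch, the analytical gradient can be compared against a central finite-difference approximation of sigmoid. The snippet below is a minimal, self-contained sketch: it redefines sigmoid inline rather than importing homemade.utils.hypothesis (the repo's sigmoid is assumed to be the standard logistic function), and the test points and tolerance are arbitrary choices.

import numpy as np

def sigmoid(z):
    # Standard logistic function (assumed to match the repo's sigmoid helper).
    return 1 / (1 + np.exp(-z))

def sigmoid_gradient(z):
    # Gradient of the sigmoid, as added in homemade/utils/hypothesis/sigmoid_gradient.py.
    return sigmoid(z) * (1 - sigmoid(z))

# Compare the analytical gradient with a central finite-difference estimate.
z = np.linspace(-5, 5, 11)
epsilon = 1e-6
numerical_gradient = (sigmoid(z + epsilon) - sigmoid(z - epsilon)) / (2 * epsilon)

assert np.allclose(sigmoid_gradient(z), numerical_gradient, atol=1e-8)
print('sigmoid_gradient matches the finite-difference estimate.')

The same finite-difference idea extends to checking the output of back_propagation against a numerical gradient of cost_function, which is the usual way to validate a backpropagation implementation.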