Created using Colaboratory

sandipanpaul21 · sandipanpaul21 · commit 030bd0e04a1f · 2020-09-03T18:42:03.000+05:30
diff --git a/08_Logistic_Regression.ipynb b/08_Logistic_Regression.ipynb
@@ -5,7 +5,7 @@
     "colab": {
       "name": "08 Logistic Regression.ipynb",
       "provenance": [],
-      "authorship_tag": "ABX9TyNmC97wOqe43KRJd+bMnG4a",
+      "authorship_tag": "ABX9TyOzdICGm/7utO3jg6uo4fdn",
       "include_colab_link": true
     },
     "kernelspec": {
@@ -33,15 +33,21 @@
           "base_uri": "https://localhost:8080/",
           "height": 73
         },
-        "outputId": "488a4dcc-a320-41aa-d30d-18001ff28526"
+        "outputId": "b05857ae-b7ca-4eeb-af64-ee339d5216a8"
       },
       "source": [
         "# Libraries\n",
         "from sklearn import datasets\n",
         "import pandas as pd\n",
-        "import statsmodels.api as sm"
+        "import statsmodels.api as sm\n",
+        "from sklearn.linear_model import LogisticRegression\n",
+        "from sklearn import metrics\n",
+        "from sklearn.model_selection import train_test_split\n",
+        "import numpy as np\n",
+        "from sklearn.metrics import accuracy_score,recall_score,precision_score,f1_score\n",
+        "from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score"
       ],
-      "execution_count": 10,
+      "execution_count": 1,
       "outputs": [
         {
           "output_type": "stream",
@@ -62,7 +68,7 @@
           "base_uri": "https://localhost:8080/",
           "height": 204
         },
-        "outputId": "1ab9c9da-03ea-4454-e837-fa364133db44"
+        "outputId": "bc5937b1-e238-4776-e45e-b02333be145d"
       },
       "source": [
         "# IRIS Dataset\n",
@@ -79,7 +85,7 @@
         "iris.columns = iris.columns.str.replace(\")\",\"\")\n",
         "iris.head()"
       ],
-      "execution_count": 3,
+      "execution_count": 2,
       "outputs": [
         {
           "output_type": "execute_result",
@@ -167,7 +173,7 @@
           "metadata": {
             "tags": []
           },
-          "execution_count": 3
+          "execution_count": 2
         }
       ]
     },
@@ -180,13 +186,13 @@
           "base_uri": "https://localhost:8080/",
           "height": 71
         },
-        "outputId": "3d834a1c-9e56-4aec-abe7-c380280abb99"
+        "outputId": "4b1fcb62-60bb-4180-9b09-a053ebcde243"
       },
       "source": [
         "# Target Column Distribution\n",
         "iris['species'].value_counts()"
       ],
-      "execution_count": 4,
+      "execution_count": 3,
       "outputs": [
         {
           "output_type": "execute_result",
@@ -200,7 +206,7 @@
           "metadata": {
             "tags": []
           },
-          "execution_count": 4
+          "execution_count": 3
         }
       ]
     },
@@ -213,13 +219,13 @@
           "base_uri": "https://localhost:8080/",
           "height": 142
         },
-        "outputId": "9e9e1c39-ebe4-473b-dfb1-d6dfc94c75ce"
+        "outputId": "4d927c5f-d629-4f5b-be62-d548b75eb025"
       },
       "source": [
         "# Distribution (mean) of Independent Columns respect to Dependent Column\n",
         "iris.groupby('species').mean().round(2)"
       ],
-      "execution_count": 5,
+      "execution_count": 4,
       "outputs": [
         {
           "output_type": "execute_result",
@@ -285,7 +291,7 @@
           "metadata": {
             "tags": []
           },
-          "execution_count": 5
+          "execution_count": 4
         }
       ]
     },
@@ -298,14 +304,14 @@
           "base_uri": "https://localhost:8080/",
           "height": 204
         },
-        "outputId": "5260ca48-0b2f-4400-e29c-7137ecab6959"
+        "outputId": "10749ad7-318a-4caf-b8d1-59a5439dfc23"
       },
       "source": [
         "# Independent Variables\n",
         "Independent_Variable_Base_Set = iris[iris.columns[0:4]]\n",
         "Independent_Variable_Base_Set.head()"
       ],
-      "execution_count": 6,
+      "execution_count": 5,
       "outputs": [
         {
           "output_type": "execute_result",
@@ -387,7 +393,7 @@
           "metadata": {
             "tags": []
           },
-          "execution_count": 6
+          "execution_count": 5
         }
       ]
     },
@@ -400,14 +406,14 @@
           "base_uri": "https://localhost:8080/",
           "height": 204
         },
-        "outputId": "19e38ebb-d128-4087-8583-bf3a75fef110"
+        "outputId": "48a3efc3-1f7a-49bc-cf99-57875155d2b3"
       },
       "source": [
         "# Dependent Variable\n",
         "Dependent_Variable = iris[iris.columns[-1:iris.columns.size]]\n",
         "Dependent_Variable.head()"
       ],
-      "execution_count": 9,
+      "execution_count": 6,
       "outputs": [
         {
           "output_type": "execute_result",
@@ -471,7 +477,7 @@
           "metadata": {
             "tags": []
           },
-          "execution_count": 9
+          "execution_count": 6
         }
       ]
     },
@@ -484,7 +490,7 @@
           "base_uri": "https://localhost:8080/",
           "height": 431
         },
-        "outputId": "54db6207-45cd-4270-e0c9-e3dde41afa82"
+        "outputId": "e1db0f39-4317-4e06-b6b3-477716d48e15"
       },
       "source": [
         "# Fitting Logistic Model \n",
@@ -494,7 +500,7 @@
         "result = logit_model.fit(method = 'bfgs')\n",
         "print(result.summary2())"
       ],
-      "execution_count": 12,
+      "execution_count": 7,
       "outputs": [
         {
           "output_type": "stream",
@@ -508,7 +514,7 @@
             "===================================================================\n",
             "Model:               Logit            Pseudo R-squared: 1.000      \n",
             "Dependent Variable:  species          AIC:              8.0002     \n",
-            "Date:                2020-09-03 11:50 BIC:              18.4209    \n",
+            "Date:                2020-09-03 13:11 BIC:              18.4209    \n",
             "No. Observations:    100              Log-Likelihood:   -0.00011118\n",
             "Df Model:            3                LL-Null:          -69.315    \n",
             "Df Residuals:        96               LLR p-value:      7.4648e-30 \n",
@@ -536,7 +542,7 @@
           "base_uri": "https://localhost:8080/",
           "height": 179
         },
-        "outputId": "b9612ca6-26d7-4877-dffb-a28619558c19"
+        "outputId": "736bc19c-e30a-45d8-8eed-bb39dea0365e"
       },
       "source": [
         "# Model Summary\n",
@@ -554,7 +560,7 @@
         "print(\"BIC also work same as AIC, Lower BIC better is the Model\")\n",
         "print(\"Base Model, BIC :\",base_model_bic)"
       ],
-      "execution_count": 22,
+      "execution_count": 8,
       "outputs": [
         {
           "output_type": "stream",
@@ -578,6 +584,157 @@
       "metadata": {
         "id": "IdcrKXSXzxZY",
         "colab_type": "code",
+        "colab": {
+          "base_uri": "https://localhost:8080/",
+          "height": 539
+        },
+        "outputId": "73ef4b6f-f34d-4ea0-8c0f-3879eda08c43"
+      },
+      "source": [
+        "# Split the Dataset how logistic works\n",
+        "\n",
+        "# Lets start with selecting one variable\n",
+        "Independent_Variable_Set_v1 = iris[iris.columns[0:1]]\n",
+        "X_train, X_test, y_train, y_test = train_test_split(Independent_Variable_Set_v1,Dependent_Variable,test_size = 0.3,random_state = 21)\n",
+        "logit_model = sm.Logit(y_train,X_train)\n",
+        "result = logit_model.fit(method='bfgs')\n",
+        "print(result.summary2())\n",
+        "\n",
+        "# Model Summary\n",
+        "\n",
+        "print(\"Model 1 Summary\")\n",
+        "print(\"Iteration suggests how many loop model did to perform the fit\")\n",
+        "print(\"Iterations : 3\")\n",
+        "r_square_1 = result.prsquared.round(2)\n",
+        "print(\"Pseudo R Square suggests overall effect size (ideal value is close to 1)\")\n",
+        "print(\"Model 1, MacFadden Pseudo R Square : \",r_square_1)\n",
+        "base_model_aic_1 = result.aic.round(2)\n",
+        "print(\"AIC compares Goodness of Fit, Lower AIC better is the Model\")\n",
+        "print(\"Model 1, AIC  :\",base_model_aic_1)\n",
+        "base_model_bic_1 = result.bic.round(2)\n",
+        "print(\"BIC also work same as AIC, Lower BIC better is the Model\")\n",
+        "print(\"Model 1, BIC :\",base_model_bic_1)"
+      ],
+      "execution_count": 9,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "text": [
+            "Optimization terminated successfully.\n",
+            "         Current function value: 0.662428\n",
+            "         Iterations: 3\n",
+            "         Function evaluations: 5\n",
+            "         Gradient evaluations: 5\n",
+            "                        Results: Logit\n",
+            "==============================================================\n",
+            "Model:              Logit            Pseudo R-squared: 0.024  \n",
+            "Dependent Variable: species          AIC:              94.7399\n",
+            "Date:               2020-09-03 13:11 BIC:              96.9884\n",
+            "No. Observations:   70               Log-Likelihood:   -46.370\n",
+            "Df Model:           0                LL-Null:          -47.487\n",
+            "Df Residuals:       69               LLR p-value:      nan    \n",
+            "Converged:          1.0000           Scale:            1.0000 \n",
+            "---------------------------------------------------------------\n",
+            "               Coef.   Std.Err.    z     P>|z|   [0.025  0.975]\n",
+            "---------------------------------------------------------------\n",
+            "sepallengthcm  0.0902    0.0442  2.0399  0.0414  0.0035  0.1769\n",
+            "==============================================================\n",
+            "\n",
+            "Model 1 Summary\n",
+            "Iteration suggests how many loop model did to perform the fit\n",
+            "Iterations : 3\n",
+            "Pseudo R Square suggests overall effect size (ideal value is close to 1)\n",
+            "Model 1, MacFadden Pseudo R Square :  0.02\n",
+            "AIC compares Goodness of Fit, Lower AIC better is the Model\n",
+            "Model 1, AIC  : 94.74\n",
+            "BIC also work same as AIC, Lower BIC better is the Model\n",
+            "Model 1, BIC : 96.99\n"
+          ],
+          "name": "stdout"
+        }
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "QwGV0dfc5Tq0",
+        "colab_type": "code",
+        "colab": {
+          "base_uri": "https://localhost:8080/",
+          "height": 323
+        },
+        "outputId": "0ec5d4c8-56ff-4b9c-997b-1c61cff2886e"
+      },
+      "source": [
+        "# Model Prediction: score the test rows with Model 1 and report its metrics\n",
+        "pred = result.predict(X_test)  # predicted probability for each test row\n",
+        "model_prediction = pd.DataFrame(pred.round(2),columns = ['Prediction'])\n",
+        "# Classify on the unrounded probabilities so rounding cannot move a row across 0.5\n",
+        "model_prediction['Final_Class'] = np.where(pred > 0.5,1,0)\n",
+        "print(model_prediction.head())\n",
+        "y_pred = model_prediction['Final_Class'].tolist()\n",
+        "# Built-in round() works for both Python floats and NumPy scalars returned by sklearn\n",
+        "model_1_accuracy = round(accuracy_score(y_test,y_pred),2)\n",
+        "print(\"\\nModel Performance\")\n",
+        "print(\"Model 1, Accuracy :\",model_1_accuracy)\n",
+        "model_1_precision = round(precision_score(y_test,y_pred),2)\n",
+        "print(\"Model 1, Precision :\",model_1_precision)\n",
+        "model_1_recall = round(recall_score(y_test,y_pred),2)\n",
+        "print(\"Model 1, Recall :\",model_1_recall)\n",
+        "model_1_fscore = round(f1_score(y_test,y_pred),2)\n",
+        "print(\"Model 1, F1 Score :\",model_1_fscore)\n",
+        "# AUC ranks rows by score, so it is given the probabilities rather than the 0/1 labels\n",
+        "model_1_roc = roc_auc_score(y_test,pred)\n",
+        "print(\"Model 1, AUC :\",model_1_roc)\n",
+        "print(\"\\nConfusion Matrix Model 1\")\n",
+        "model_1_cm = confusion_matrix(y_test,y_pred)\n",
+        "print(model_1_cm)"
+      ],
+      "execution_count": 10,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "text": [
+            "    Prediction  temp  Final_Class\n",
+            "23        0.61  temp            1\n",
+            "81        0.62  temp            1\n",
+            "85        0.63  temp            1\n",
+            "34        0.61  temp            1\n",
+            "62        0.63  temp            1\n",
+            "\n",
+            "Model Performance\n",
+            "Model 1, Accuracy : 0.3\n",
+            "Model 1, Precision : 0.3\n",
+            "Model 1, Recall : 1.0\n",
+            "Model 1, F1 Score : 0.46\n",
+            "Model 1, AUC : 0.5\n",
+            "\n",
+            "Confusion Matrix Model 1\n",
+            "[[ 0 21]\n",
+            " [ 0  9]]\n"
+          ],
+          "name": "stdout"
+        }
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "dhcskdb5BCiU",
+        "colab_type": "code",
+        "colab": {}
+      },
+      "source": [
+        ""
+      ],
+      "execution_count": 10,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "v67NCxRZGCXx",
+        "colab_type": "code",
         "colab": {}
       },
       "source": [