|
5 | 5 | "colab": {
|
6 | 6 | "name": "08 Logistic Regression.ipynb",
|
7 | 7 | "provenance": [],
|
8 |
| - "authorship_tag": "ABX9TyNmC97wOqe43KRJd+bMnG4a", |
| 8 | + "authorship_tag": "ABX9TyOzdICGm/7utO3jg6uo4fdn", |
9 | 9 | "include_colab_link": true
|
10 | 10 | },
|
11 | 11 | "kernelspec": {
|
|
33 | 33 | "base_uri": "https://localhost:8080/",
|
34 | 34 | "height": 73
|
35 | 35 | },
|
36 |
| - "outputId": "488a4dcc-a320-41aa-d30d-18001ff28526" |
| 36 | + "outputId": "b05857ae-b7ca-4eeb-af64-ee339d5216a8" |
37 | 37 | },
|
38 | 38 | "source": [
|
39 | 39 | "# Libraries\n",
|
40 | 40 | "from sklearn import datasets\n",
|
41 | 41 | "import pandas as pd\n",
|
42 |
| - "import statsmodels.api as sm" |
| 42 | + "import statsmodels.api as sm\n", |
| 43 | + "from sklearn.linear_model import LogisticRegression\n", |
| 44 | + "from sklearn import metrics\n", |
| 45 | + "from sklearn.model_selection import train_test_split\n", |
| 46 | + "import numpy as np\n", |
| 47 | + "from sklearn.metrics import accuracy_score,recall_score,precision_score,f1_score\n", |
| 48 | + "from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score" |
43 | 49 | ],
|
44 |
| - "execution_count": 10, |
| 50 | + "execution_count": 1, |
45 | 51 | "outputs": [
|
46 | 52 | {
|
47 | 53 | "output_type": "stream",
|
|
62 | 68 | "base_uri": "https://localhost:8080/",
|
63 | 69 | "height": 204
|
64 | 70 | },
|
65 |
| - "outputId": "1ab9c9da-03ea-4454-e837-fa364133db44" |
| 71 | + "outputId": "bc5937b1-e238-4776-e45e-b02333be145d" |
66 | 72 | },
|
67 | 73 | "source": [
|
68 | 74 | "# IRIS Dataset\n",
|
|
79 | 85 | "iris.columns = iris.columns.str.replace(\")\",\"\")\n",
|
80 | 86 | "iris.head()"
|
81 | 87 | ],
|
82 |
| - "execution_count": 3, |
| 88 | + "execution_count": 2, |
83 | 89 | "outputs": [
|
84 | 90 | {
|
85 | 91 | "output_type": "execute_result",
|
|
167 | 173 | "metadata": {
|
168 | 174 | "tags": []
|
169 | 175 | },
|
170 |
| - "execution_count": 3 |
| 176 | + "execution_count": 2 |
171 | 177 | }
|
172 | 178 | ]
|
173 | 179 | },
|
|
180 | 186 | "base_uri": "https://localhost:8080/",
|
181 | 187 | "height": 71
|
182 | 188 | },
|
183 |
| - "outputId": "3d834a1c-9e56-4aec-abe7-c380280abb99" |
| 189 | + "outputId": "4b1fcb62-60bb-4180-9b09-a053ebcde243" |
184 | 190 | },
|
185 | 191 | "source": [
|
186 | 192 | "# Target Column Distribution\n",
|
187 | 193 | "iris['species'].value_counts()"
|
188 | 194 | ],
|
189 |
| - "execution_count": 4, |
| 195 | + "execution_count": 3, |
190 | 196 | "outputs": [
|
191 | 197 | {
|
192 | 198 | "output_type": "execute_result",
|
|
200 | 206 | "metadata": {
|
201 | 207 | "tags": []
|
202 | 208 | },
|
203 |
| - "execution_count": 4 |
| 209 | + "execution_count": 3 |
204 | 210 | }
|
205 | 211 | ]
|
206 | 212 | },
|
|
213 | 219 | "base_uri": "https://localhost:8080/",
|
214 | 220 | "height": 142
|
215 | 221 | },
|
216 |
| - "outputId": "9e9e1c39-ebe4-473b-dfb1-d6dfc94c75ce" |
| 222 | + "outputId": "4d927c5f-d629-4f5b-be62-d548b75eb025" |
217 | 223 | },
|
218 | 224 | "source": [
|
219 | 225 | "# Distribution (mean) of Independent Columns with respect to Dependent Column\n",
|
220 | 226 | "iris.groupby('species').mean().round(2)"
|
221 | 227 | ],
|
222 |
| - "execution_count": 5, |
| 228 | + "execution_count": 4, |
223 | 229 | "outputs": [
|
224 | 230 | {
|
225 | 231 | "output_type": "execute_result",
|
|
285 | 291 | "metadata": {
|
286 | 292 | "tags": []
|
287 | 293 | },
|
288 |
| - "execution_count": 5 |
| 294 | + "execution_count": 4 |
289 | 295 | }
|
290 | 296 | ]
|
291 | 297 | },
|
|
298 | 304 | "base_uri": "https://localhost:8080/",
|
299 | 305 | "height": 204
|
300 | 306 | },
|
301 |
| - "outputId": "5260ca48-0b2f-4400-e29c-7137ecab6959" |
| 307 | + "outputId": "10749ad7-318a-4caf-b8d1-59a5439dfc23" |
302 | 308 | },
|
303 | 309 | "source": [
|
304 | 310 | "# Independent Variables\n",
|
305 | 311 | "Independent_Variable_Base_Set = iris[iris.columns[0:4]]\n",
|
306 | 312 | "Independent_Variable_Base_Set.head()"
|
307 | 313 | ],
|
308 |
| - "execution_count": 6, |
| 314 | + "execution_count": 5, |
309 | 315 | "outputs": [
|
310 | 316 | {
|
311 | 317 | "output_type": "execute_result",
|
|
387 | 393 | "metadata": {
|
388 | 394 | "tags": []
|
389 | 395 | },
|
390 |
| - "execution_count": 6 |
| 396 | + "execution_count": 5 |
391 | 397 | }
|
392 | 398 | ]
|
393 | 399 | },
|
|
400 | 406 | "base_uri": "https://localhost:8080/",
|
401 | 407 | "height": 204
|
402 | 408 | },
|
403 |
| - "outputId": "19e38ebb-d128-4087-8583-bf3a75fef110" |
| 409 | + "outputId": "48a3efc3-1f7a-49bc-cf99-57875155d2b3" |
404 | 410 | },
|
405 | 411 | "source": [
|
406 | 412 | "# Dependent Variable\n",
|
407 | 413 | "Dependent_Variable = iris[iris.columns[-1:iris.columns.size]]\n",
|
408 | 414 | "Dependent_Variable.head()"
|
409 | 415 | ],
|
410 |
| - "execution_count": 9, |
| 416 | + "execution_count": 6, |
411 | 417 | "outputs": [
|
412 | 418 | {
|
413 | 419 | "output_type": "execute_result",
|
|
471 | 477 | "metadata": {
|
472 | 478 | "tags": []
|
473 | 479 | },
|
474 |
| - "execution_count": 9 |
| 480 | + "execution_count": 6 |
475 | 481 | }
|
476 | 482 | ]
|
477 | 483 | },
|
|
484 | 490 | "base_uri": "https://localhost:8080/",
|
485 | 491 | "height": 431
|
486 | 492 | },
|
487 |
| - "outputId": "54db6207-45cd-4270-e0c9-e3dde41afa82" |
| 493 | + "outputId": "e1db0f39-4317-4e06-b6b3-477716d48e15" |
488 | 494 | },
|
489 | 495 | "source": [
|
490 | 496 | "# Fitting Logistic Model \n",
|
|
494 | 500 | "result = logit_model.fit(method = 'bfgs')\n",
|
495 | 501 | "print(result.summary2())"
|
496 | 502 | ],
|
497 |
| - "execution_count": 12, |
| 503 | + "execution_count": 7, |
498 | 504 | "outputs": [
|
499 | 505 | {
|
500 | 506 | "output_type": "stream",
|
|
508 | 514 | "===================================================================\n",
|
509 | 515 | "Model: Logit Pseudo R-squared: 1.000 \n",
|
510 | 516 | "Dependent Variable: species AIC: 8.0002 \n",
|
511 |
| - "Date: 2020-09-03 11:50 BIC: 18.4209 \n", |
| 517 | + "Date: 2020-09-03 13:11 BIC: 18.4209 \n", |
512 | 518 | "No. Observations: 100 Log-Likelihood: -0.00011118\n",
|
513 | 519 | "Df Model: 3 LL-Null: -69.315 \n",
|
514 | 520 | "Df Residuals: 96 LLR p-value: 7.4648e-30 \n",
|
|
536 | 542 | "base_uri": "https://localhost:8080/",
|
537 | 543 | "height": 179
|
538 | 544 | },
|
539 |
| - "outputId": "b9612ca6-26d7-4877-dffb-a28619558c19" |
| 545 | + "outputId": "736bc19c-e30a-45d8-8eed-bb39dea0365e" |
540 | 546 | },
|
541 | 547 | "source": [
|
542 | 548 | "# Model Summary\n",
|
|
554 | 560 | "print(\"BIC also work same as AIC, Lower BIC better is the Model\")\n",
|
555 | 561 | "print(\"Base Model, BIC :\",base_model_bic)"
|
556 | 562 | ],
|
557 |
| - "execution_count": 22, |
| 563 | + "execution_count": 8, |
558 | 564 | "outputs": [
|
559 | 565 | {
|
560 | 566 | "output_type": "stream",
|
|
578 | 584 | "metadata": {
|
579 | 585 | "id": "IdcrKXSXzxZY",
|
580 | 586 | "colab_type": "code",
|
| 587 | + "colab": { |
| 588 | + "base_uri": "https://localhost:8080/", |
| 589 | + "height": 539 |
| 590 | + }, |
| 591 | + "outputId": "73ef4b6f-f34d-4ea0-8c0f-3879eda08c43" |
| 592 | + }, |
| 593 | + "source": [ |
| 594 | + "# Split the Dataset to see how logistic regression works\n", |
| 595 | + "\n", |
| 596 | + "# Let's start by selecting one variable\n", |
| 597 | + "Independent_Variable_Set_v1 = iris[iris.columns[0:1]]\n", |
| 598 | + "X_train, X_test, y_train, y_test = train_test_split(Independent_Variable_Set_v1,Dependent_Variable,test_size = 0.3,random_state = 21)\n", |
| 599 | + "logit_model = sm.Logit(y_train,X_train)\n", |
| 600 | + "result = logit_model.fit(method='bfgs')\n", |
| 601 | + "print(result.summary2())\n", |
| 602 | + "\n", |
| 603 | + "# Model Summary\n", |
| 604 | + "\n", |
| 605 | + "print(\"Model 1 Summary\")\n", |
| 606 | + "print(\"Iteration suggests how many loop model did to perform the fit\")\n", |
| 607 | + "print(\"Iterations : 3\")\n", |
| 608 | + "r_square_1 = result.prsquared.round(2)\n", |
| 609 | + "print(\"Pseudo R Square suggests overall effect size (ideal value is close to 1)\")\n", |
| 610 | + "print(\"Model 1, MacFadden Pseudo R Square : \",r_square_1)\n", |
| 611 | + "base_model_aic_1 = result.aic.round(2)\n", |
| 612 | + "print(\"AIC compares Goodness of Fit, Lower AIC better is the Model\")\n", |
| 613 | + "print(\"Model 1, AIC :\",base_model_aic_1)\n", |
| 614 | + "base_model_bic_1 = result.bic.round(2)\n", |
| 615 | + "print(\"BIC also work same as AIC, Lower BIC better is the Model\")\n", |
| 616 | + "print(\"Model 1, BIC :\",base_model_bic_1)" |
| 617 | + ], |
| 618 | + "execution_count": 9, |
| 619 | + "outputs": [ |
| 620 | + { |
| 621 | + "output_type": "stream", |
| 622 | + "text": [ |
| 623 | + "Optimization terminated successfully.\n", |
| 624 | + " Current function value: 0.662428\n", |
| 625 | + " Iterations: 3\n", |
| 626 | + " Function evaluations: 5\n", |
| 627 | + " Gradient evaluations: 5\n", |
| 628 | + " Results: Logit\n", |
| 629 | + "==============================================================\n", |
| 630 | + "Model: Logit Pseudo R-squared: 0.024 \n", |
| 631 | + "Dependent Variable: species AIC: 94.7399\n", |
| 632 | + "Date: 2020-09-03 13:11 BIC: 96.9884\n", |
| 633 | + "No. Observations: 70 Log-Likelihood: -46.370\n", |
| 634 | + "Df Model: 0 LL-Null: -47.487\n", |
| 635 | + "Df Residuals: 69 LLR p-value: nan \n", |
| 636 | + "Converged: 1.0000 Scale: 1.0000 \n", |
| 637 | + "---------------------------------------------------------------\n", |
| 638 | + " Coef. Std.Err. z P>|z| [0.025 0.975]\n", |
| 639 | + "---------------------------------------------------------------\n", |
| 640 | + "sepallengthcm 0.0902 0.0442 2.0399 0.0414 0.0035 0.1769\n", |
| 641 | + "==============================================================\n", |
| 642 | + "\n", |
| 643 | + "Model 1 Summary\n", |
| 644 | + "Iteration suggests how many loop model did to perform the fit\n", |
| 645 | + "Iterations : 3\n", |
| 646 | + "Pseudo R Square suggests overall effect size (ideal value is close to 1)\n", |
| 647 | + "Model 1, MacFadden Pseudo R Square : 0.02\n", |
| 648 | + "AIC compares Goodness of Fit, Lower AIC better is the Model\n", |
| 649 | + "Model 1, AIC : 94.74\n", |
| 650 | + "BIC also work same as AIC, Lower BIC better is the Model\n", |
| 651 | + "Model 1, BIC : 96.99\n" |
| 652 | + ], |
| 653 | + "name": "stdout" |
| 654 | + } |
| 655 | + ] |
| 656 | + }, |
| 657 | + { |
| 658 | + "cell_type": "code", |
| 659 | + "metadata": { |
| 660 | + "id": "QwGV0dfc5Tq0", |
| 661 | + "colab_type": "code", |
| 662 | + "colab": { |
| 663 | + "base_uri": "https://localhost:8080/", |
| 664 | + "height": 323 |
| 665 | + }, |
| 666 | + "outputId": "0ec5d4c8-56ff-4b9c-997b-1c61cff2886e" |
| 667 | + }, |
| 668 | + "source": [ |
| 669 | + "# Model Prediction\n", |
| 670 | + "\n", |
| 671 | + "pred = result.predict(X_test)\n", |
| 672 | + "model_prediction = pd.DataFrame(pred.round(2),columns = ['Prediction'])\n", |
| 673 | + "model_prediction['temp'] = 'temp'\n", |
| 674 | + "model_prediction['Final_Class'] = np.where(model_prediction['Prediction'] > 0.5,1,0)\n", |
| 675 | + "print(model_prediction.head())\n", |
| 676 | + "temp = model_prediction.groupby('temp')['Final_Class'].apply(list)\n", |
| 677 | + "y_pred = temp.loc['temp']\n", |
| 678 | + "model_1_accuracy = accuracy_score(y_test,y_pred).round(2)\n", |
| 679 | + "print(\"\\nModel Performance\")\n", |
| 680 | + "print(\"Model 1, Accuracy :\",model_1_accuracy)\n", |
| 681 | + "model_1_precision = precision_score(y_test,y_pred).round(2)\n", |
| 682 | + "print(\"Model 1, Precision :\",model_1_precision)\n", |
| 683 | + "model_1_recall = recall_score(y_test,y_pred).round(2)\n", |
| 684 | + "print(\"Model 1, Recall :\",model_1_recall)\n", |
| 685 | + "model_1_fscore = f1_score(y_test,y_pred).round(2)\n", |
| 686 | + "print(\"Model 1, F1 Score :\",model_1_fscore)\n", |
| 687 | + "model_1_roc = roc_auc_score(y_test,y_pred)\n", |
| 688 | + "print(\"Model 1, AUC :\",model_1_roc)\n", |
| 689 | + "print(\"\\nConfusion Matrix Model 1\")\n", |
| 690 | + "model_1_cm = confusion_matrix(y_test,y_pred)\n", |
| 691 | + "print(model_1_cm)" |
| 692 | + ], |
| 693 | + "execution_count": 10, |
| 694 | + "outputs": [ |
| 695 | + { |
| 696 | + "output_type": "stream", |
| 697 | + "text": [ |
| 698 | + " Prediction temp Final_Class\n", |
| 699 | + "23 0.61 temp 1\n", |
| 700 | + "81 0.62 temp 1\n", |
| 701 | + "85 0.63 temp 1\n", |
| 702 | + "34 0.61 temp 1\n", |
| 703 | + "62 0.63 temp 1\n", |
| 704 | + "\n", |
| 705 | + "Model Performance\n", |
| 706 | + "Model 1, Accuracy : 0.3\n", |
| 707 | + "Model 1, Precision : 0.3\n", |
| 708 | + "Model 1, Recall : 1.0\n", |
| 709 | + "Model 1, F1 Score : 0.46\n", |
| 710 | + "Model 1, AUC : 0.5\n", |
| 711 | + "\n", |
| 712 | + "Confusion Matrix Model 1\n", |
| 713 | + "[[ 0 21]\n", |
| 714 | + " [ 0 9]]\n" |
| 715 | + ], |
| 716 | + "name": "stdout" |
| 717 | + } |
| 718 | + ] |
| 719 | + }, |
| 720 | + { |
| 721 | + "cell_type": "code", |
| 722 | + "metadata": { |
| 723 | + "id": "dhcskdb5BCiU", |
| 724 | + "colab_type": "code", |
| 725 | + "colab": {} |
| 726 | + }, |
| 727 | + "source": [ |
| 728 | + "" |
| 729 | + ], |
| 730 | + "execution_count": 10, |
| 731 | + "outputs": [] |
| 732 | + }, |
| 733 | + { |
| 734 | + "cell_type": "code", |
| 735 | + "metadata": { |
| 736 | + "id": "v67NCxRZGCXx", |
| 737 | + "colab_type": "code", |
581 | 738 | "colab": {}
|
582 | 739 | },
|
583 | 740 | "source": [
|
|
0 commit comments