diff --git a/Prudential-life-insurance-assessment.ipynb b/Prudential-life-insurance-assessment.ipynb
new file mode 100644
index 0000000..5bfa475
--- /dev/null
+++ b/Prudential-life-insurance-assessment.ipynb
@@ -0,0 +1,5685 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "d652a156-492c-4a62-9e28-d3281f49b349",
+ "metadata": {},
+ "source": [
+ "## IMPORTANDO BIBLIOTECAS"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "id": "7456d8ed-d019-4375-925b-b0bd482c8bab",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# BIBLIOTECAS UTILIZADAS NESTE PROJETO\n",
+ "\n",
+ "from collections import Counter\n",
+ "import pandas as pd\n",
+ "import numpy as np\n",
+ "\n",
+ "import matplotlib.pyplot as plt\n",
+ "import seaborn as sns\n",
+ "\n",
+ "from sklearn.compose import ColumnTransformer\n",
+ "from sklearn.pipeline import Pipeline\n",
+ "\n",
+ "from sklearn.experimental import enable_iterative_imputer\n",
+ "from sklearn.impute import IterativeImputer\n",
+ "from sklearn.impute import SimpleImputer\n",
+ "from sklearn.preprocessing import OrdinalEncoder\n",
+ "\n",
+ "from imblearn.combine import SMOTEENN\n",
+ "\n",
+ "from sklearn.ensemble import RandomForestClassifier\n",
+ "from sklearn.model_selection import GridSearchCV\n",
+ "\n",
+ "from sklearn.model_selection import cross_val_score\n",
+ "from sklearn.model_selection import validation_curve\n",
+ "from sklearn.model_selection import learning_curve\n",
+ "\n",
+ "from sklearn.metrics import plot_confusion_matrix\n",
+ "from sklearn.metrics import recall_score\n",
+ "from sklearn.metrics import f1_score\n",
+ "from sklearn.metrics import accuracy_score\n",
+ "from sklearn.metrics import classification_report"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "id": "d95deae8-42d7-4baa-961a-57574271d1e2",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Author: Apolo Ferreira Santos\n",
+ "\n",
+ "seaborn : 0.11.1\n",
+ "sklearn : 0.24.1\n",
+ "pandas : 1.2.4\n",
+ "matplotlib: 3.3.4\n",
+ "numpy : 1.20.1\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "# VERSÕES DOS PACOTES USADOS NESTE JUPYTER NOTEBOOK\n",
+ "%reload_ext watermark\n",
+ "%watermark -a \"Apolo Ferreira Santos\" --iversions"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "id": "4a171a48-323d-481e-9691-4a2e4fb1d5d3",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# CONFIGURAÇÃO PARA MOSTRAR TODAS COLUNAS DE UM DATAFRAME\n",
+ "pd.set_option(\"display.max_columns\", None)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "303c9293-0862-4625-933b-89943007950b",
+ "metadata": {},
+ "source": [
+ "## ANÁLISE DO CONJUNTO DE DADOS"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "id": "46f9902e-31ee-4550-9374-34000fa5d754",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# IMPORTANDO CONJUNTO DE DADOS DE TREINO\n",
+ "dataset = pd.read_csv('train.csv')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "id": "c737d379-cee2-4dee-bd96-684ef1285452",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Id | \n",
+ " Product_Info_1 | \n",
+ " Product_Info_2 | \n",
+ " Product_Info_3 | \n",
+ " Product_Info_4 | \n",
+ " Product_Info_5 | \n",
+ " Product_Info_6 | \n",
+ " Product_Info_7 | \n",
+ " Ins_Age | \n",
+ " Ht | \n",
+ " Wt | \n",
+ " BMI | \n",
+ " Employment_Info_1 | \n",
+ " Employment_Info_2 | \n",
+ " Employment_Info_3 | \n",
+ " Employment_Info_4 | \n",
+ " Employment_Info_5 | \n",
+ " Employment_Info_6 | \n",
+ " InsuredInfo_1 | \n",
+ " InsuredInfo_2 | \n",
+ " InsuredInfo_3 | \n",
+ " InsuredInfo_4 | \n",
+ " InsuredInfo_5 | \n",
+ " InsuredInfo_6 | \n",
+ " InsuredInfo_7 | \n",
+ " Insurance_History_1 | \n",
+ " Insurance_History_2 | \n",
+ " Insurance_History_3 | \n",
+ " Insurance_History_4 | \n",
+ " Insurance_History_5 | \n",
+ " Insurance_History_7 | \n",
+ " Insurance_History_8 | \n",
+ " Insurance_History_9 | \n",
+ " Family_Hist_1 | \n",
+ " Family_Hist_2 | \n",
+ " Family_Hist_3 | \n",
+ " Family_Hist_4 | \n",
+ " Family_Hist_5 | \n",
+ " Medical_History_1 | \n",
+ " Medical_History_2 | \n",
+ " Medical_History_3 | \n",
+ " Medical_History_4 | \n",
+ " Medical_History_5 | \n",
+ " Medical_History_6 | \n",
+ " Medical_History_7 | \n",
+ " Medical_History_8 | \n",
+ " Medical_History_9 | \n",
+ " Medical_History_10 | \n",
+ " Medical_History_11 | \n",
+ " Medical_History_12 | \n",
+ " Medical_History_13 | \n",
+ " Medical_History_14 | \n",
+ " Medical_History_15 | \n",
+ " Medical_History_16 | \n",
+ " Medical_History_17 | \n",
+ " Medical_History_18 | \n",
+ " Medical_History_19 | \n",
+ " Medical_History_20 | \n",
+ " Medical_History_21 | \n",
+ " Medical_History_22 | \n",
+ " Medical_History_23 | \n",
+ " Medical_History_24 | \n",
+ " Medical_History_25 | \n",
+ " Medical_History_26 | \n",
+ " Medical_History_27 | \n",
+ " Medical_History_28 | \n",
+ " Medical_History_29 | \n",
+ " Medical_History_30 | \n",
+ " Medical_History_31 | \n",
+ " Medical_History_32 | \n",
+ " Medical_History_33 | \n",
+ " Medical_History_34 | \n",
+ " Medical_History_35 | \n",
+ " Medical_History_36 | \n",
+ " Medical_History_37 | \n",
+ " Medical_History_38 | \n",
+ " Medical_History_39 | \n",
+ " Medical_History_40 | \n",
+ " Medical_History_41 | \n",
+ " Medical_Keyword_1 | \n",
+ " Medical_Keyword_2 | \n",
+ " Medical_Keyword_3 | \n",
+ " Medical_Keyword_4 | \n",
+ " Medical_Keyword_5 | \n",
+ " Medical_Keyword_6 | \n",
+ " Medical_Keyword_7 | \n",
+ " Medical_Keyword_8 | \n",
+ " Medical_Keyword_9 | \n",
+ " Medical_Keyword_10 | \n",
+ " Medical_Keyword_11 | \n",
+ " Medical_Keyword_12 | \n",
+ " Medical_Keyword_13 | \n",
+ " Medical_Keyword_14 | \n",
+ " Medical_Keyword_15 | \n",
+ " Medical_Keyword_16 | \n",
+ " Medical_Keyword_17 | \n",
+ " Medical_Keyword_18 | \n",
+ " Medical_Keyword_19 | \n",
+ " Medical_Keyword_20 | \n",
+ " Medical_Keyword_21 | \n",
+ " Medical_Keyword_22 | \n",
+ " Medical_Keyword_23 | \n",
+ " Medical_Keyword_24 | \n",
+ " Medical_Keyword_25 | \n",
+ " Medical_Keyword_26 | \n",
+ " Medical_Keyword_27 | \n",
+ " Medical_Keyword_28 | \n",
+ " Medical_Keyword_29 | \n",
+ " Medical_Keyword_30 | \n",
+ " Medical_Keyword_31 | \n",
+ " Medical_Keyword_32 | \n",
+ " Medical_Keyword_33 | \n",
+ " Medical_Keyword_34 | \n",
+ " Medical_Keyword_35 | \n",
+ " Medical_Keyword_36 | \n",
+ " Medical_Keyword_37 | \n",
+ " Medical_Keyword_38 | \n",
+ " Medical_Keyword_39 | \n",
+ " Medical_Keyword_40 | \n",
+ " Medical_Keyword_41 | \n",
+ " Medical_Keyword_42 | \n",
+ " Medical_Keyword_43 | \n",
+ " Medical_Keyword_44 | \n",
+ " Medical_Keyword_45 | \n",
+ " Medical_Keyword_46 | \n",
+ " Medical_Keyword_47 | \n",
+ " Medical_Keyword_48 | \n",
+ " Response | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 2 | \n",
+ " 1 | \n",
+ " D3 | \n",
+ " 10 | \n",
+ " 0.076923 | \n",
+ " 2 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 0.641791 | \n",
+ " 0.581818 | \n",
+ " 0.148536 | \n",
+ " 0.323008 | \n",
+ " 0.028 | \n",
+ " 12 | \n",
+ " 1 | \n",
+ " 0.0 | \n",
+ " 3 | \n",
+ " NaN | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 6 | \n",
+ " 3 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 3 | \n",
+ " 1 | \n",
+ " 0.000667 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 2 | \n",
+ " NaN | \n",
+ " 0.598039 | \n",
+ " NaN | \n",
+ " 0.526786 | \n",
+ " 4.0 | \n",
+ " 112 | \n",
+ " 2 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 3 | \n",
+ " 2 | \n",
+ " 2 | \n",
+ " 1 | \n",
+ " NaN | \n",
+ " 3 | \n",
+ " 2 | \n",
+ " 3 | \n",
+ " 3 | \n",
+ " 240.0 | \n",
+ " 3 | \n",
+ " 3 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 3 | \n",
+ " NaN | \n",
+ " 1 | \n",
+ " 3 | \n",
+ " 3 | \n",
+ " 1 | \n",
+ " 3 | \n",
+ " 2 | \n",
+ " 3 | \n",
+ " NaN | \n",
+ " 1 | \n",
+ " 3 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 2 | \n",
+ " 1 | \n",
+ " 3 | \n",
+ " 3 | \n",
+ " 3 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 8 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 5 | \n",
+ " 1 | \n",
+ " A1 | \n",
+ " 26 | \n",
+ " 0.076923 | \n",
+ " 2 | \n",
+ " 3 | \n",
+ " 1 | \n",
+ " 0.059701 | \n",
+ " 0.600000 | \n",
+ " 0.131799 | \n",
+ " 0.272288 | \n",
+ " 0.000 | \n",
+ " 1 | \n",
+ " 3 | \n",
+ " 0.0 | \n",
+ " 2 | \n",
+ " 0.0018 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 6 | \n",
+ " 3 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 1 | \n",
+ " 3 | \n",
+ " 1 | \n",
+ " 0.000133 | \n",
+ " 1 | \n",
+ " 3 | \n",
+ " 2 | \n",
+ " 2 | \n",
+ " 0.188406 | \n",
+ " NaN | \n",
+ " 0.084507 | \n",
+ " NaN | \n",
+ " 5.0 | \n",
+ " 412 | \n",
+ " 2 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 3 | \n",
+ " 2 | \n",
+ " 2 | \n",
+ " 1 | \n",
+ " NaN | \n",
+ " 3 | \n",
+ " 2 | \n",
+ " 3 | \n",
+ " 3 | \n",
+ " 0.0 | \n",
+ " 1 | \n",
+ " 3 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 3 | \n",
+ " NaN | \n",
+ " 1 | \n",
+ " 3 | \n",
+ " 3 | \n",
+ " 1 | \n",
+ " 3 | \n",
+ " 2 | \n",
+ " 3 | \n",
+ " NaN | \n",
+ " 3 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 2 | \n",
+ " 1 | \n",
+ " 3 | \n",
+ " 3 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 4 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 6 | \n",
+ " 1 | \n",
+ " E1 | \n",
+ " 26 | \n",
+ " 0.076923 | \n",
+ " 2 | \n",
+ " 3 | \n",
+ " 1 | \n",
+ " 0.029851 | \n",
+ " 0.745455 | \n",
+ " 0.288703 | \n",
+ " 0.428780 | \n",
+ " 0.030 | \n",
+ " 9 | \n",
+ " 1 | \n",
+ " 0.0 | \n",
+ " 2 | \n",
+ " 0.0300 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 8 | \n",
+ " 3 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 3 | \n",
+ " NaN | \n",
+ " 3 | \n",
+ " 2 | \n",
+ " 3 | \n",
+ " 3 | \n",
+ " 0.304348 | \n",
+ " NaN | \n",
+ " 0.225352 | \n",
+ " NaN | \n",
+ " 10.0 | \n",
+ " 3 | \n",
+ " 2 | \n",
+ " 2 | \n",
+ " 1 | \n",
+ " 3 | \n",
+ " 2 | \n",
+ " 2 | \n",
+ " 2 | \n",
+ " NaN | \n",
+ " 3 | \n",
+ " 2 | \n",
+ " 3 | \n",
+ " 3 | \n",
+ " NaN | \n",
+ " 1 | \n",
+ " 3 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 3 | \n",
+ " NaN | \n",
+ " 2 | \n",
+ " 2 | \n",
+ " 3 | \n",
+ " 1 | \n",
+ " 3 | \n",
+ " 2 | \n",
+ " 3 | \n",
+ " NaN | \n",
+ " 3 | \n",
+ " 3 | \n",
+ " 1 | \n",
+ " 3 | \n",
+ " 2 | \n",
+ " 1 | \n",
+ " 3 | \n",
+ " 3 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 8 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 7 | \n",
+ " 1 | \n",
+ " D4 | \n",
+ " 10 | \n",
+ " 0.487179 | \n",
+ " 2 | \n",
+ " 3 | \n",
+ " 1 | \n",
+ " 0.164179 | \n",
+ " 0.672727 | \n",
+ " 0.205021 | \n",
+ " 0.352438 | \n",
+ " 0.042 | \n",
+ " 9 | \n",
+ " 1 | \n",
+ " 0.0 | \n",
+ " 3 | \n",
+ " 0.2000 | \n",
+ " 2 | \n",
+ " 2 | \n",
+ " 8 | \n",
+ " 3 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 3 | \n",
+ " NaN | \n",
+ " 3 | \n",
+ " 2 | \n",
+ " 3 | \n",
+ " 3 | \n",
+ " 0.420290 | \n",
+ " NaN | \n",
+ " 0.352113 | \n",
+ " NaN | \n",
+ " 0.0 | \n",
+ " 350 | \n",
+ " 2 | \n",
+ " 2 | \n",
+ " 1 | \n",
+ " 3 | \n",
+ " 2 | \n",
+ " 2 | \n",
+ " 2 | \n",
+ " NaN | \n",
+ " 3 | \n",
+ " 2 | \n",
+ " 3 | \n",
+ " 3 | \n",
+ " NaN | \n",
+ " 1 | \n",
+ " 3 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 2 | \n",
+ " 2 | \n",
+ " 3 | \n",
+ " NaN | \n",
+ " 1 | \n",
+ " 3 | \n",
+ " 3 | \n",
+ " 1 | \n",
+ " 3 | \n",
+ " 2 | \n",
+ " 3 | \n",
+ " NaN | \n",
+ " 3 | \n",
+ " 3 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 2 | \n",
+ " 1 | \n",
+ " 3 | \n",
+ " 3 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 8 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 8 | \n",
+ " 1 | \n",
+ " D2 | \n",
+ " 26 | \n",
+ " 0.230769 | \n",
+ " 2 | \n",
+ " 3 | \n",
+ " 1 | \n",
+ " 0.417910 | \n",
+ " 0.654545 | \n",
+ " 0.234310 | \n",
+ " 0.424046 | \n",
+ " 0.027 | \n",
+ " 9 | \n",
+ " 1 | \n",
+ " 0.0 | \n",
+ " 2 | \n",
+ " 0.0500 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 6 | \n",
+ " 3 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 3 | \n",
+ " NaN | \n",
+ " 3 | \n",
+ " 2 | \n",
+ " 3 | \n",
+ " 2 | \n",
+ " 0.463768 | \n",
+ " NaN | \n",
+ " 0.408451 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 162 | \n",
+ " 2 | \n",
+ " 2 | \n",
+ " 1 | \n",
+ " 3 | \n",
+ " 2 | \n",
+ " 2 | \n",
+ " 2 | \n",
+ " NaN | \n",
+ " 3 | \n",
+ " 2 | \n",
+ " 3 | \n",
+ " 3 | \n",
+ " NaN | \n",
+ " 1 | \n",
+ " 3 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 3 | \n",
+ " NaN | \n",
+ " 2 | \n",
+ " 2 | \n",
+ " 3 | \n",
+ " 1 | \n",
+ " 3 | \n",
+ " 2 | \n",
+ " 3 | \n",
+ " NaN | \n",
+ " 3 | \n",
+ " 3 | \n",
+ " 1 | \n",
+ " 3 | \n",
+ " 2 | \n",
+ " 1 | \n",
+ " 3 | \n",
+ " 3 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 8 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Id Product_Info_1 Product_Info_2 Product_Info_3 Product_Info_4 \\\n",
+ "0 2 1 D3 10 0.076923 \n",
+ "1 5 1 A1 26 0.076923 \n",
+ "2 6 1 E1 26 0.076923 \n",
+ "3 7 1 D4 10 0.487179 \n",
+ "4 8 1 D2 26 0.230769 \n",
+ "\n",
+ " Product_Info_5 Product_Info_6 Product_Info_7 Ins_Age Ht \\\n",
+ "0 2 1 1 0.641791 0.581818 \n",
+ "1 2 3 1 0.059701 0.600000 \n",
+ "2 2 3 1 0.029851 0.745455 \n",
+ "3 2 3 1 0.164179 0.672727 \n",
+ "4 2 3 1 0.417910 0.654545 \n",
+ "\n",
+ " Wt BMI Employment_Info_1 Employment_Info_2 \\\n",
+ "0 0.148536 0.323008 0.028 12 \n",
+ "1 0.131799 0.272288 0.000 1 \n",
+ "2 0.288703 0.428780 0.030 9 \n",
+ "3 0.205021 0.352438 0.042 9 \n",
+ "4 0.234310 0.424046 0.027 9 \n",
+ "\n",
+ " Employment_Info_3 Employment_Info_4 Employment_Info_5 Employment_Info_6 \\\n",
+ "0 1 0.0 3 NaN \n",
+ "1 3 0.0 2 0.0018 \n",
+ "2 1 0.0 2 0.0300 \n",
+ "3 1 0.0 3 0.2000 \n",
+ "4 1 0.0 2 0.0500 \n",
+ "\n",
+ " InsuredInfo_1 InsuredInfo_2 InsuredInfo_3 InsuredInfo_4 InsuredInfo_5 \\\n",
+ "0 1 2 6 3 1 \n",
+ "1 1 2 6 3 1 \n",
+ "2 1 2 8 3 1 \n",
+ "3 2 2 8 3 1 \n",
+ "4 1 2 6 3 1 \n",
+ "\n",
+ " InsuredInfo_6 InsuredInfo_7 Insurance_History_1 Insurance_History_2 \\\n",
+ "0 2 1 1 1 \n",
+ "1 2 1 2 1 \n",
+ "2 1 1 2 1 \n",
+ "3 2 1 2 1 \n",
+ "4 2 1 2 1 \n",
+ "\n",
+ " Insurance_History_3 Insurance_History_4 Insurance_History_5 \\\n",
+ "0 3 1 0.000667 \n",
+ "1 3 1 0.000133 \n",
+ "2 1 3 NaN \n",
+ "3 1 3 NaN \n",
+ "4 1 3 NaN \n",
+ "\n",
+ " Insurance_History_7 Insurance_History_8 Insurance_History_9 \\\n",
+ "0 1 1 2 \n",
+ "1 1 3 2 \n",
+ "2 3 2 3 \n",
+ "3 3 2 3 \n",
+ "4 3 2 3 \n",
+ "\n",
+ " Family_Hist_1 Family_Hist_2 Family_Hist_3 Family_Hist_4 Family_Hist_5 \\\n",
+ "0 2 NaN 0.598039 NaN 0.526786 \n",
+ "1 2 0.188406 NaN 0.084507 NaN \n",
+ "2 3 0.304348 NaN 0.225352 NaN \n",
+ "3 3 0.420290 NaN 0.352113 NaN \n",
+ "4 2 0.463768 NaN 0.408451 NaN \n",
+ "\n",
+ " Medical_History_1 Medical_History_2 Medical_History_3 Medical_History_4 \\\n",
+ "0 4.0 112 2 1 \n",
+ "1 5.0 412 2 1 \n",
+ "2 10.0 3 2 2 \n",
+ "3 0.0 350 2 2 \n",
+ "4 NaN 162 2 2 \n",
+ "\n",
+ " Medical_History_5 Medical_History_6 Medical_History_7 Medical_History_8 \\\n",
+ "0 1 3 2 2 \n",
+ "1 1 3 2 2 \n",
+ "2 1 3 2 2 \n",
+ "3 1 3 2 2 \n",
+ "4 1 3 2 2 \n",
+ "\n",
+ " Medical_History_9 Medical_History_10 Medical_History_11 \\\n",
+ "0 1 NaN 3 \n",
+ "1 1 NaN 3 \n",
+ "2 2 NaN 3 \n",
+ "3 2 NaN 3 \n",
+ "4 2 NaN 3 \n",
+ "\n",
+ " Medical_History_12 Medical_History_13 Medical_History_14 \\\n",
+ "0 2 3 3 \n",
+ "1 2 3 3 \n",
+ "2 2 3 3 \n",
+ "3 2 3 3 \n",
+ "4 2 3 3 \n",
+ "\n",
+ " Medical_History_15 Medical_History_16 Medical_History_17 \\\n",
+ "0 240.0 3 3 \n",
+ "1 0.0 1 3 \n",
+ "2 NaN 1 3 \n",
+ "3 NaN 1 3 \n",
+ "4 NaN 1 3 \n",
+ "\n",
+ " Medical_History_18 Medical_History_19 Medical_History_20 \\\n",
+ "0 1 1 2 \n",
+ "1 1 1 2 \n",
+ "2 1 1 2 \n",
+ "3 1 1 2 \n",
+ "4 1 1 2 \n",
+ "\n",
+ " Medical_History_21 Medical_History_22 Medical_History_23 \\\n",
+ "0 1 2 3 \n",
+ "1 1 2 3 \n",
+ "2 1 2 3 \n",
+ "3 2 2 3 \n",
+ "4 1 2 3 \n",
+ "\n",
+ " Medical_History_24 Medical_History_25 Medical_History_26 \\\n",
+ "0 NaN 1 3 \n",
+ "1 NaN 1 3 \n",
+ "2 NaN 2 2 \n",
+ "3 NaN 1 3 \n",
+ "4 NaN 2 2 \n",
+ "\n",
+ " Medical_History_27 Medical_History_28 Medical_History_29 \\\n",
+ "0 3 1 3 \n",
+ "1 3 1 3 \n",
+ "2 3 1 3 \n",
+ "3 3 1 3 \n",
+ "4 3 1 3 \n",
+ "\n",
+ " Medical_History_30 Medical_History_31 Medical_History_32 \\\n",
+ "0 2 3 NaN \n",
+ "1 2 3 NaN \n",
+ "2 2 3 NaN \n",
+ "3 2 3 NaN \n",
+ "4 2 3 NaN \n",
+ "\n",
+ " Medical_History_33 Medical_History_34 Medical_History_35 \\\n",
+ "0 1 3 1 \n",
+ "1 3 1 1 \n",
+ "2 3 3 1 \n",
+ "3 3 3 1 \n",
+ "4 3 3 1 \n",
+ "\n",
+ " Medical_History_36 Medical_History_37 Medical_History_38 \\\n",
+ "0 2 2 1 \n",
+ "1 2 2 1 \n",
+ "2 3 2 1 \n",
+ "3 2 2 1 \n",
+ "4 3 2 1 \n",
+ "\n",
+ " Medical_History_39 Medical_History_40 Medical_History_41 \\\n",
+ "0 3 3 3 \n",
+ "1 3 3 1 \n",
+ "2 3 3 1 \n",
+ "3 3 3 1 \n",
+ "4 3 3 1 \n",
+ "\n",
+ " Medical_Keyword_1 Medical_Keyword_2 Medical_Keyword_3 Medical_Keyword_4 \\\n",
+ "0 0 0 0 0 \n",
+ "1 0 0 0 0 \n",
+ "2 0 0 0 0 \n",
+ "3 0 0 0 0 \n",
+ "4 0 0 0 0 \n",
+ "\n",
+ " Medical_Keyword_5 Medical_Keyword_6 Medical_Keyword_7 Medical_Keyword_8 \\\n",
+ "0 0 0 0 0 \n",
+ "1 0 0 0 0 \n",
+ "2 0 0 0 0 \n",
+ "3 0 0 0 0 \n",
+ "4 0 0 0 0 \n",
+ "\n",
+ " Medical_Keyword_9 Medical_Keyword_10 Medical_Keyword_11 \\\n",
+ "0 0 0 0 \n",
+ "1 0 0 0 \n",
+ "2 0 0 0 \n",
+ "3 0 0 0 \n",
+ "4 0 0 0 \n",
+ "\n",
+ " Medical_Keyword_12 Medical_Keyword_13 Medical_Keyword_14 \\\n",
+ "0 0 0 0 \n",
+ "1 0 0 0 \n",
+ "2 0 0 0 \n",
+ "3 0 0 0 \n",
+ "4 0 0 0 \n",
+ "\n",
+ " Medical_Keyword_15 Medical_Keyword_16 Medical_Keyword_17 \\\n",
+ "0 0 0 0 \n",
+ "1 0 0 0 \n",
+ "2 0 0 0 \n",
+ "3 0 0 0 \n",
+ "4 0 0 0 \n",
+ "\n",
+ " Medical_Keyword_18 Medical_Keyword_19 Medical_Keyword_20 \\\n",
+ "0 0 0 0 \n",
+ "1 0 0 0 \n",
+ "2 0 0 0 \n",
+ "3 0 0 0 \n",
+ "4 0 0 0 \n",
+ "\n",
+ " Medical_Keyword_21 Medical_Keyword_22 Medical_Keyword_23 \\\n",
+ "0 0 0 0 \n",
+ "1 0 0 0 \n",
+ "2 0 0 0 \n",
+ "3 0 0 0 \n",
+ "4 0 0 0 \n",
+ "\n",
+ " Medical_Keyword_24 Medical_Keyword_25 Medical_Keyword_26 \\\n",
+ "0 0 0 0 \n",
+ "1 0 0 0 \n",
+ "2 0 0 0 \n",
+ "3 0 0 0 \n",
+ "4 0 0 0 \n",
+ "\n",
+ " Medical_Keyword_27 Medical_Keyword_28 Medical_Keyword_29 \\\n",
+ "0 0 0 0 \n",
+ "1 0 0 0 \n",
+ "2 0 0 0 \n",
+ "3 0 0 0 \n",
+ "4 0 0 0 \n",
+ "\n",
+ " Medical_Keyword_30 Medical_Keyword_31 Medical_Keyword_32 \\\n",
+ "0 0 0 0 \n",
+ "1 0 0 0 \n",
+ "2 0 0 0 \n",
+ "3 0 0 1 \n",
+ "4 0 0 0 \n",
+ "\n",
+ " Medical_Keyword_33 Medical_Keyword_34 Medical_Keyword_35 \\\n",
+ "0 0 0 0 \n",
+ "1 0 0 0 \n",
+ "2 0 0 0 \n",
+ "3 0 0 0 \n",
+ "4 0 0 0 \n",
+ "\n",
+ " Medical_Keyword_36 Medical_Keyword_37 Medical_Keyword_38 \\\n",
+ "0 0 0 0 \n",
+ "1 0 0 0 \n",
+ "2 0 0 0 \n",
+ "3 0 0 0 \n",
+ "4 0 0 0 \n",
+ "\n",
+ " Medical_Keyword_39 Medical_Keyword_40 Medical_Keyword_41 \\\n",
+ "0 0 0 0 \n",
+ "1 0 0 0 \n",
+ "2 0 0 0 \n",
+ "3 0 0 0 \n",
+ "4 0 0 0 \n",
+ "\n",
+ " Medical_Keyword_42 Medical_Keyword_43 Medical_Keyword_44 \\\n",
+ "0 0 0 0 \n",
+ "1 0 0 0 \n",
+ "2 0 0 0 \n",
+ "3 0 0 0 \n",
+ "4 0 0 0 \n",
+ "\n",
+ " Medical_Keyword_45 Medical_Keyword_46 Medical_Keyword_47 \\\n",
+ "0 0 0 0 \n",
+ "1 0 0 0 \n",
+ "2 0 0 0 \n",
+ "3 0 0 0 \n",
+ "4 0 0 0 \n",
+ "\n",
+ " Medical_Keyword_48 Response \n",
+ "0 0 8 \n",
+ "1 0 4 \n",
+ "2 0 8 \n",
+ "3 0 8 \n",
+ "4 0 8 "
+ ]
+ },
+ "execution_count": 5,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# VISUALIZAÇÃO DO DATASET\n",
+ "dataset.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "id": "129b50d4-27e3-4d49-92d6-c41791f9fe92",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Report the dataset shape and its missing values (any / count / share of all cells)\n",
+ "\n",
+ "def valuesDataset(df):\n",
+ "    \"\"\"Print the shape of ``df`` and a summary of its missing values.\n",
+ "\n",
+ "    Four lines are printed: rows x columns, whether any cell is null, the\n",
+ "    number of null cells, and that number as a percentage of all cells\n",
+ "    (formatted with three significant digits). Returns None.\n",
+ "    \"\"\"\n",
+ "    n_rows, n_cols = df.shape[0], df.shape[1]\n",
+ "    null_mask = df.isnull().values  # True where a cell is missing\n",
+ "    n_missing = null_mask.sum()\n",
+ "    pct_missing = n_missing / (n_rows * n_cols) * 100\n",
+ "\n",
+ "    report = [\n",
+ "        '\\033[1mTamanho do dataset\\033[0m -> {} x {}'.format(n_rows, n_cols),\n",
+ "        '\\033[1mExiste valores faltantes:\\033[0m {}'.format(null_mask.any()),\n",
+ "        '\\033[1mQuantidade de valores faltantes:\\033[0m {}'.format(n_missing),\n",
+ "        '\\033[1mPorcentagem dos valores faltantes:\\033[0m {:.3}%'.format(pct_missing),\n",
+ "    ]\n",
+ "    for line in report:\n",
+ "        print(line)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "id": "aafbfb9b-cb61-452b-8a22-5b1347602166",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[1mTamanho do dataset\u001b[0m -> 59381 x 128\n",
+ "\u001b[1mExiste valores faltantes:\u001b[0m True\n",
+ "\u001b[1mQuantidade de valores faltantes:\u001b[0m 393103\n",
+ "\u001b[1mPorcentagem dos valores faltantes:\u001b[0m 5.17%\n"
+ ]
+ }
+ ],
+ "source": [
+ "# FAZENDO A CHAMADA DA FUNÇÃO\n",
+ "valuesDataset(dataset)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "id": "00cdf78c-789a-4aae-9c47-adc57bc7ff62",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Per-column count and percentage of missing values, returned as a DataFrame in descending order\n",
+ "\n",
+ "def missing_percentage(df):\n",
+ "    \"\"\"Summarise the missing values of every column of ``df`` that has at least one.\n",
+ "\n",
+ "    Parameters\n",
+ "    ----------\n",
+ "    df : pandas.DataFrame\n",
+ "        Frame to inspect.\n",
+ "\n",
+ "    Returns\n",
+ "    -------\n",
+ "    pandas.DataFrame\n",
+ "        Indexed by column name, sorted by missing count (descending), with\n",
+ "        ``Total`` (number of null cells) and ``Percent`` (share of rows that\n",
+ "        are null, rounded to 2 decimals). Columns without nulls are omitted.\n",
+ "    \"\"\"\n",
+ "    # Count nulls once; every other value is derived from this Series.\n",
+ "    sorted_counts = df.isnull().sum().sort_values(ascending=False)\n",
+ "    total = sorted_counts[sorted_counts != 0]\n",
+ "    # Derive the percentage from the already-filtered counts so both columns\n",
+ "    # share one index: a column whose share rounds to 0.00 keeps a 0.0 entry\n",
+ "    # instead of turning into NaN when the two Series are concatenated.\n",
+ "    percent = (total / len(df) * 100).round(2)\n",
+ "    return pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "id": "f9a38aa9-1297-4d2e-b6d7-17d2fd94503d",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Total | \n",
+ " Percent | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " Medical_History_10 | \n",
+ " 58824 | \n",
+ " 99.06 | \n",
+ "
\n",
+ " \n",
+ " Medical_History_32 | \n",
+ " 58274 | \n",
+ " 98.14 | \n",
+ "
\n",
+ " \n",
+ " Medical_History_24 | \n",
+ " 55580 | \n",
+ " 93.60 | \n",
+ "
\n",
+ " \n",
+ " Medical_History_15 | \n",
+ " 44596 | \n",
+ " 75.10 | \n",
+ "
\n",
+ " \n",
+ " Family_Hist_5 | \n",
+ " 41811 | \n",
+ " 70.41 | \n",
+ "
\n",
+ " \n",
+ " Family_Hist_3 | \n",
+ " 34241 | \n",
+ " 57.66 | \n",
+ "
\n",
+ " \n",
+ " Family_Hist_2 | \n",
+ " 28656 | \n",
+ " 48.26 | \n",
+ "
\n",
+ " \n",
+ " Insurance_History_5 | \n",
+ " 25396 | \n",
+ " 42.77 | \n",
+ "
\n",
+ " \n",
+ " Family_Hist_4 | \n",
+ " 19184 | \n",
+ " 32.31 | \n",
+ "
\n",
+ " \n",
+ " Employment_Info_6 | \n",
+ " 10854 | \n",
+ " 18.28 | \n",
+ "
\n",
+ " \n",
+ " Medical_History_1 | \n",
+ " 8889 | \n",
+ " 14.97 | \n",
+ "
\n",
+ " \n",
+ " Employment_Info_4 | \n",
+ " 6779 | \n",
+ " 11.42 | \n",
+ "
\n",
+ " \n",
+ " Employment_Info_1 | \n",
+ " 19 | \n",
+ " 0.03 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Total Percent\n",
+ "Medical_History_10 58824 99.06\n",
+ "Medical_History_32 58274 98.14\n",
+ "Medical_History_24 55580 93.60\n",
+ "Medical_History_15 44596 75.10\n",
+ "Family_Hist_5 41811 70.41\n",
+ "Family_Hist_3 34241 57.66\n",
+ "Family_Hist_2 28656 48.26\n",
+ "Insurance_History_5 25396 42.77\n",
+ "Family_Hist_4 19184 32.31\n",
+ "Employment_Info_6 10854 18.28\n",
+ "Medical_History_1 8889 14.97\n",
+ "Employment_Info_4 6779 11.42\n",
+ "Employment_Info_1 19 0.03"
+ ]
+ },
+ "execution_count": 9,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# CHAMANDO FUNÇÃO\n",
+ "df_missing = missing_percentage(dataset)\n",
+ "\n",
+ "# VISUALIZANDO ATRIBUTOS COM DADOS FALTANTES E SUAS RESPECTIVAS PORCENTAGENS\n",
+ "df_missing"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "id": "11844316-9cc2-4b85-85d8-fc29d9124d68",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Split the attributes that have missing values into two lists using a 70% cut-off:\n",
+ "# attributes with 70% or more of their values missing are dropped, the rest are kept\n",
+ "\n",
+ "MISSING_DROP_THRESHOLD = 70  # percent, same unit as df_missing.Percent\n",
+ "\n",
+ "missing_values_drop = df_missing[df_missing.Percent >= MISSING_DROP_THRESHOLD].index.tolist()\n",
+ "missing_values_keep = df_missing[df_missing.Percent < MISSING_DROP_THRESHOLD].index.tolist()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "id": "e88902b7-288c-4b57-8736-237dd2d2d8c6",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " count | \n",
+ " mean | \n",
+ " std | \n",
+ " min | \n",
+ " 25% | \n",
+ " 50% | \n",
+ " 75% | \n",
+ " max | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " Family_Hist_3 | \n",
+ " 25140.0 | \n",
+ " 0.497737 | \n",
+ " 0.140187 | \n",
+ " 0.0 | \n",
+ " 0.401961 | \n",
+ " 0.519608 | \n",
+ " 0.598039 | \n",
+ " 1.000000 | \n",
+ "
\n",
+ " \n",
+ " Family_Hist_2 | \n",
+ " 30725.0 | \n",
+ " 0.474550 | \n",
+ " 0.154959 | \n",
+ " 0.0 | \n",
+ " 0.362319 | \n",
+ " 0.463768 | \n",
+ " 0.579710 | \n",
+ " 1.000000 | \n",
+ "
\n",
+ " \n",
+ " Insurance_History_5 | \n",
+ " 33985.0 | \n",
+ " 0.001733 | \n",
+ " 0.007338 | \n",
+ " 0.0 | \n",
+ " 0.000400 | \n",
+ " 0.000973 | \n",
+ " 0.002000 | \n",
+ " 1.000000 | \n",
+ "
\n",
+ " \n",
+ " Family_Hist_4 | \n",
+ " 40197.0 | \n",
+ " 0.444890 | \n",
+ " 0.163012 | \n",
+ " 0.0 | \n",
+ " 0.323944 | \n",
+ " 0.422535 | \n",
+ " 0.563380 | \n",
+ " 0.943662 | \n",
+ "
\n",
+ " \n",
+ " Employment_Info_6 | \n",
+ " 48527.0 | \n",
+ " 0.361469 | \n",
+ " 0.349551 | \n",
+ " 0.0 | \n",
+ " 0.060000 | \n",
+ " 0.250000 | \n",
+ " 0.550000 | \n",
+ " 1.000000 | \n",
+ "
\n",
+ " \n",
+ " Medical_History_1 | \n",
+ " 50492.0 | \n",
+ " 7.962172 | \n",
+ " 13.027697 | \n",
+ " 0.0 | \n",
+ " 2.000000 | \n",
+ " 4.000000 | \n",
+ " 9.000000 | \n",
+ " 240.000000 | \n",
+ "
\n",
+ " \n",
+ " Employment_Info_4 | \n",
+ " 52602.0 | \n",
+ " 0.006283 | \n",
+ " 0.032816 | \n",
+ " 0.0 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 1.000000 | \n",
+ "
\n",
+ " \n",
+ " Employment_Info_1 | \n",
+ " 59362.0 | \n",
+ " 0.077582 | \n",
+ " 0.082347 | \n",
+ " 0.0 | \n",
+ " 0.035000 | \n",
+ " 0.060000 | \n",
+ " 0.100000 | \n",
+ " 1.000000 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " count mean std min 25% 50% \\\n",
+ "Family_Hist_3 25140.0 0.497737 0.140187 0.0 0.401961 0.519608 \n",
+ "Family_Hist_2 30725.0 0.474550 0.154959 0.0 0.362319 0.463768 \n",
+ "Insurance_History_5 33985.0 0.001733 0.007338 0.0 0.000400 0.000973 \n",
+ "Family_Hist_4 40197.0 0.444890 0.163012 0.0 0.323944 0.422535 \n",
+ "Employment_Info_6 48527.0 0.361469 0.349551 0.0 0.060000 0.250000 \n",
+ "Medical_History_1 50492.0 7.962172 13.027697 0.0 2.000000 4.000000 \n",
+ "Employment_Info_4 52602.0 0.006283 0.032816 0.0 0.000000 0.000000 \n",
+ "Employment_Info_1 59362.0 0.077582 0.082347 0.0 0.035000 0.060000 \n",
+ "\n",
+ " 75% max \n",
+ "Family_Hist_3 0.598039 1.000000 \n",
+ "Family_Hist_2 0.579710 1.000000 \n",
+ "Insurance_History_5 0.002000 1.000000 \n",
+ "Family_Hist_4 0.563380 0.943662 \n",
+ "Employment_Info_6 0.550000 1.000000 \n",
+ "Medical_History_1 9.000000 240.000000 \n",
+ "Employment_Info_4 0.000000 1.000000 \n",
+ "Employment_Info_1 0.100000 1.000000 "
+ ]
+ },
+ "execution_count": 11,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# ATRIBUTOS COM VALORES FALTANTES ENTRE 70 E 0.03% DO TOTAL\n",
+ "dataset[missing_values_keep].describe().T"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "id": "a3a82275-3618-47ad-bbc2-7bd332fcf8e7",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "RangeIndex: 59381 entries, 0 to 59380\n",
+ "Data columns (total 128 columns):\n",
+ " # Column Non-Null Count Dtype \n",
+ "--- ------ -------------- ----- \n",
+ " 0 Id 59381 non-null int64 \n",
+ " 1 Product_Info_1 59381 non-null int64 \n",
+ " 2 Product_Info_2 59381 non-null object \n",
+ " 3 Product_Info_3 59381 non-null int64 \n",
+ " 4 Product_Info_4 59381 non-null float64\n",
+ " 5 Product_Info_5 59381 non-null int64 \n",
+ " 6 Product_Info_6 59381 non-null int64 \n",
+ " 7 Product_Info_7 59381 non-null int64 \n",
+ " 8 Ins_Age 59381 non-null float64\n",
+ " 9 Ht 59381 non-null float64\n",
+ " 10 Wt 59381 non-null float64\n",
+ " 11 BMI 59381 non-null float64\n",
+ " 12 Employment_Info_1 59362 non-null float64\n",
+ " 13 Employment_Info_2 59381 non-null int64 \n",
+ " 14 Employment_Info_3 59381 non-null int64 \n",
+ " 15 Employment_Info_4 52602 non-null float64\n",
+ " 16 Employment_Info_5 59381 non-null int64 \n",
+ " 17 Employment_Info_6 48527 non-null float64\n",
+ " 18 InsuredInfo_1 59381 non-null int64 \n",
+ " 19 InsuredInfo_2 59381 non-null int64 \n",
+ " 20 InsuredInfo_3 59381 non-null int64 \n",
+ " 21 InsuredInfo_4 59381 non-null int64 \n",
+ " 22 InsuredInfo_5 59381 non-null int64 \n",
+ " 23 InsuredInfo_6 59381 non-null int64 \n",
+ " 24 InsuredInfo_7 59381 non-null int64 \n",
+ " 25 Insurance_History_1 59381 non-null int64 \n",
+ " 26 Insurance_History_2 59381 non-null int64 \n",
+ " 27 Insurance_History_3 59381 non-null int64 \n",
+ " 28 Insurance_History_4 59381 non-null int64 \n",
+ " 29 Insurance_History_5 33985 non-null float64\n",
+ " 30 Insurance_History_7 59381 non-null int64 \n",
+ " 31 Insurance_History_8 59381 non-null int64 \n",
+ " 32 Insurance_History_9 59381 non-null int64 \n",
+ " 33 Family_Hist_1 59381 non-null int64 \n",
+ " 34 Family_Hist_2 30725 non-null float64\n",
+ " 35 Family_Hist_3 25140 non-null float64\n",
+ " 36 Family_Hist_4 40197 non-null float64\n",
+ " 37 Family_Hist_5 17570 non-null float64\n",
+ " 38 Medical_History_1 50492 non-null float64\n",
+ " 39 Medical_History_2 59381 non-null int64 \n",
+ " 40 Medical_History_3 59381 non-null int64 \n",
+ " 41 Medical_History_4 59381 non-null int64 \n",
+ " 42 Medical_History_5 59381 non-null int64 \n",
+ " 43 Medical_History_6 59381 non-null int64 \n",
+ " 44 Medical_History_7 59381 non-null int64 \n",
+ " 45 Medical_History_8 59381 non-null int64 \n",
+ " 46 Medical_History_9 59381 non-null int64 \n",
+ " 47 Medical_History_10 557 non-null float64\n",
+ " 48 Medical_History_11 59381 non-null int64 \n",
+ " 49 Medical_History_12 59381 non-null int64 \n",
+ " 50 Medical_History_13 59381 non-null int64 \n",
+ " 51 Medical_History_14 59381 non-null int64 \n",
+ " 52 Medical_History_15 14785 non-null float64\n",
+ " 53 Medical_History_16 59381 non-null int64 \n",
+ " 54 Medical_History_17 59381 non-null int64 \n",
+ " 55 Medical_History_18 59381 non-null int64 \n",
+ " 56 Medical_History_19 59381 non-null int64 \n",
+ " 57 Medical_History_20 59381 non-null int64 \n",
+ " 58 Medical_History_21 59381 non-null int64 \n",
+ " 59 Medical_History_22 59381 non-null int64 \n",
+ " 60 Medical_History_23 59381 non-null int64 \n",
+ " 61 Medical_History_24 3801 non-null float64\n",
+ " 62 Medical_History_25 59381 non-null int64 \n",
+ " 63 Medical_History_26 59381 non-null int64 \n",
+ " 64 Medical_History_27 59381 non-null int64 \n",
+ " 65 Medical_History_28 59381 non-null int64 \n",
+ " 66 Medical_History_29 59381 non-null int64 \n",
+ " 67 Medical_History_30 59381 non-null int64 \n",
+ " 68 Medical_History_31 59381 non-null int64 \n",
+ " 69 Medical_History_32 1107 non-null float64\n",
+ " 70 Medical_History_33 59381 non-null int64 \n",
+ " 71 Medical_History_34 59381 non-null int64 \n",
+ " 72 Medical_History_35 59381 non-null int64 \n",
+ " 73 Medical_History_36 59381 non-null int64 \n",
+ " 74 Medical_History_37 59381 non-null int64 \n",
+ " 75 Medical_History_38 59381 non-null int64 \n",
+ " 76 Medical_History_39 59381 non-null int64 \n",
+ " 77 Medical_History_40 59381 non-null int64 \n",
+ " 78 Medical_History_41 59381 non-null int64 \n",
+ " 79 Medical_Keyword_1 59381 non-null int64 \n",
+ " 80 Medical_Keyword_2 59381 non-null int64 \n",
+ " 81 Medical_Keyword_3 59381 non-null int64 \n",
+ " 82 Medical_Keyword_4 59381 non-null int64 \n",
+ " 83 Medical_Keyword_5 59381 non-null int64 \n",
+ " 84 Medical_Keyword_6 59381 non-null int64 \n",
+ " 85 Medical_Keyword_7 59381 non-null int64 \n",
+ " 86 Medical_Keyword_8 59381 non-null int64 \n",
+ " 87 Medical_Keyword_9 59381 non-null int64 \n",
+ " 88 Medical_Keyword_10 59381 non-null int64 \n",
+ " 89 Medical_Keyword_11 59381 non-null int64 \n",
+ " 90 Medical_Keyword_12 59381 non-null int64 \n",
+ " 91 Medical_Keyword_13 59381 non-null int64 \n",
+ " 92 Medical_Keyword_14 59381 non-null int64 \n",
+ " 93 Medical_Keyword_15 59381 non-null int64 \n",
+ " 94 Medical_Keyword_16 59381 non-null int64 \n",
+ " 95 Medical_Keyword_17 59381 non-null int64 \n",
+ " 96 Medical_Keyword_18 59381 non-null int64 \n",
+ " 97 Medical_Keyword_19 59381 non-null int64 \n",
+ " 98 Medical_Keyword_20 59381 non-null int64 \n",
+ " 99 Medical_Keyword_21 59381 non-null int64 \n",
+ " 100 Medical_Keyword_22 59381 non-null int64 \n",
+ " 101 Medical_Keyword_23 59381 non-null int64 \n",
+ " 102 Medical_Keyword_24 59381 non-null int64 \n",
+ " 103 Medical_Keyword_25 59381 non-null int64 \n",
+ " 104 Medical_Keyword_26 59381 non-null int64 \n",
+ " 105 Medical_Keyword_27 59381 non-null int64 \n",
+ " 106 Medical_Keyword_28 59381 non-null int64 \n",
+ " 107 Medical_Keyword_29 59381 non-null int64 \n",
+ " 108 Medical_Keyword_30 59381 non-null int64 \n",
+ " 109 Medical_Keyword_31 59381 non-null int64 \n",
+ " 110 Medical_Keyword_32 59381 non-null int64 \n",
+ " 111 Medical_Keyword_33 59381 non-null int64 \n",
+ " 112 Medical_Keyword_34 59381 non-null int64 \n",
+ " 113 Medical_Keyword_35 59381 non-null int64 \n",
+ " 114 Medical_Keyword_36 59381 non-null int64 \n",
+ " 115 Medical_Keyword_37 59381 non-null int64 \n",
+ " 116 Medical_Keyword_38 59381 non-null int64 \n",
+ " 117 Medical_Keyword_39 59381 non-null int64 \n",
+ " 118 Medical_Keyword_40 59381 non-null int64 \n",
+ " 119 Medical_Keyword_41 59381 non-null int64 \n",
+ " 120 Medical_Keyword_42 59381 non-null int64 \n",
+ " 121 Medical_Keyword_43 59381 non-null int64 \n",
+ " 122 Medical_Keyword_44 59381 non-null int64 \n",
+ " 123 Medical_Keyword_45 59381 non-null int64 \n",
+ " 124 Medical_Keyword_46 59381 non-null int64 \n",
+ " 125 Medical_Keyword_47 59381 non-null int64 \n",
+ " 126 Medical_Keyword_48 59381 non-null int64 \n",
+ " 127 Response 59381 non-null int64 \n",
+ "dtypes: float64(18), int64(109), object(1)\n",
+ "memory usage: 58.0+ MB\n"
+ ]
+ }
+ ],
+ "source": [
+ "# VISUALIZANDO INFORMAÇÕES INDIVIDUAIS DE CADA ATRIBUTO\n",
+ "dataset.info(verbose=True, show_counts=True)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "id": "7665736a-2385-4e89-addf-e2500e56ad72",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Column Type | \n",
+ " Count | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " int64 | \n",
+ " 109 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " float64 | \n",
+ " 18 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " object | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Column Type Count\n",
+ "0 int64 109\n",
+ "1 float64 18\n",
+ "2 object 1"
+ ]
+ },
+ "execution_count": 13,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Count how many attributes there are of each dtype\n",
+ "\n",
+ "dtype_data = dataset.dtypes.reset_index()\n",
+ "# The attribute-name column is labelled 'Count' so the per-dtype tally is shown under that header\n",
+ "dtype_data.columns = ['Count', 'Column Type']\n",
+ "dtype_data.groupby(by='Column Type').count().reset_index()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "id": "9c8276d9-e547-456e-aee4-391825403505",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# FUNÇÃO PARA FAZER A CONTAGEM DE QUANTOS VALORES ÚNICOS TEM CADA ATRIBUTO\n",
+ "\n",
+ "def uniqueValues(df):\n",
+ " df = df.drop(columns=['Response','Id'], axis=1)\n",
+ " count_values = {}\n",
+ " features_group = {}\n",
+ " two_values = []\n",
+ " tree_values = []\n",
+ " more_values = [] \n",
+ " \n",
+ " for column in df.columns:\n",
+ " unique_values = df[column].unique()\n",
+ " len_unique = len(unique_values)\n",
+ " count_values[column] = len_unique\n",
+ " \n",
+ " if len_unique == 2:\n",
+ " two_values.append(column) \n",
+ " elif len_unique == 3: \n",
+ " tree_values.append(column)\n",
+ " else: \n",
+ " more_values.append(column)\n",
+ " \n",
+ " features_group[2] = two_values\n",
+ " features_group[3] = tree_values\n",
+ " features_group['more'] = more_values \n",
+ " features_group = pd.DataFrame(data=features_group.items(), columns=['unique_values', 'features']) \n",
+ " \n",
+ " df_unique = pd.DataFrame(data=count_values.items(), columns=['Features', 'Unique_values'])\n",
+ " df_unique.sort_values(by='Unique_values', ascending=True, inplace=True)\n",
+ " df_unique.reset_index(inplace=True, drop=True)\n",
+ " df_unique = df_unique\n",
+ " return df_unique, features_group"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "id": "698b565c-d5d8-46f3-b50f-b4e4ca0bbe1d",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# CHAMANDO A FUNÇÃO\n",
+ "df_unique_values, df_group = uniqueValues(dataset)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "id": "bad1da7f-6b5f-4220-82b1-342b1dc3ea5e",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 3 | \n",
+ " 4 | \n",
+ " 5 | \n",
+ " 6 | \n",
+ " 7 | \n",
+ " 8 | \n",
+ " 9 | \n",
+ " 10 | \n",
+ " 11 | \n",
+ " 12 | \n",
+ " 13 | \n",
+ " 14 | \n",
+ " 15 | \n",
+ " 16 | \n",
+ " 17 | \n",
+ " 18 | \n",
+ " 19 | \n",
+ " 20 | \n",
+ " 21 | \n",
+ " 22 | \n",
+ " 23 | \n",
+ " 24 | \n",
+ " 25 | \n",
+ " 26 | \n",
+ " 27 | \n",
+ " 28 | \n",
+ " 29 | \n",
+ " 30 | \n",
+ " 31 | \n",
+ " 32 | \n",
+ " 33 | \n",
+ " 34 | \n",
+ " 35 | \n",
+ " 36 | \n",
+ " 37 | \n",
+ " 38 | \n",
+ " 39 | \n",
+ " 40 | \n",
+ " 41 | \n",
+ " 42 | \n",
+ " 43 | \n",
+ " 44 | \n",
+ " 45 | \n",
+ " 46 | \n",
+ " 47 | \n",
+ " 48 | \n",
+ " 49 | \n",
+ " 50 | \n",
+ " 51 | \n",
+ " 52 | \n",
+ " 53 | \n",
+ " 54 | \n",
+ " 55 | \n",
+ " 56 | \n",
+ " 57 | \n",
+ " 58 | \n",
+ " 59 | \n",
+ " 60 | \n",
+ " 61 | \n",
+ " 62 | \n",
+ " 63 | \n",
+ " 64 | \n",
+ " 65 | \n",
+ " 66 | \n",
+ " 67 | \n",
+ " 68 | \n",
+ " 69 | \n",
+ " 70 | \n",
+ " 71 | \n",
+ " 72 | \n",
+ " 73 | \n",
+ " 74 | \n",
+ " 75 | \n",
+ " 76 | \n",
+ " 77 | \n",
+ " 78 | \n",
+ " 79 | \n",
+ " 80 | \n",
+ " 81 | \n",
+ " 82 | \n",
+ " 83 | \n",
+ " 84 | \n",
+ " 85 | \n",
+ " 86 | \n",
+ " 87 | \n",
+ " 88 | \n",
+ " 89 | \n",
+ " 90 | \n",
+ " 91 | \n",
+ " 92 | \n",
+ " 93 | \n",
+ " 94 | \n",
+ " 95 | \n",
+ " 96 | \n",
+ " 97 | \n",
+ " 98 | \n",
+ " 99 | \n",
+ " 100 | \n",
+ " 101 | \n",
+ " 102 | \n",
+ " 103 | \n",
+ " 104 | \n",
+ " 105 | \n",
+ " 106 | \n",
+ " 107 | \n",
+ " 108 | \n",
+ " 109 | \n",
+ " 110 | \n",
+ " 111 | \n",
+ " 112 | \n",
+ " 113 | \n",
+ " 114 | \n",
+ " 115 | \n",
+ " 116 | \n",
+ " 117 | \n",
+ " 118 | \n",
+ " 119 | \n",
+ " 120 | \n",
+ " 121 | \n",
+ " 122 | \n",
+ " 123 | \n",
+ " 124 | \n",
+ " 125 | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " Features | \n",
+ " Product_Info_1 | \n",
+ " Medical_Keyword_20 | \n",
+ " Medical_Keyword_19 | \n",
+ " Medical_Keyword_18 | \n",
+ " Medical_Keyword_17 | \n",
+ " Medical_Keyword_16 | \n",
+ " Medical_Keyword_15 | \n",
+ " Medical_Keyword_14 | \n",
+ " Medical_Keyword_13 | \n",
+ " Medical_Keyword_12 | \n",
+ " Medical_Keyword_11 | \n",
+ " Medical_Keyword_21 | \n",
+ " Medical_Keyword_10 | \n",
+ " Medical_Keyword_8 | \n",
+ " Medical_Keyword_7 | \n",
+ " Medical_Keyword_6 | \n",
+ " Medical_Keyword_5 | \n",
+ " Medical_Keyword_4 | \n",
+ " Medical_Keyword_3 | \n",
+ " Medical_Keyword_2 | \n",
+ " Medical_Keyword_1 | \n",
+ " Medical_History_38 | \n",
+ " Medical_History_33 | \n",
+ " Medical_Keyword_9 | \n",
+ " Medical_Keyword_47 | \n",
+ " Medical_Keyword_22 | \n",
+ " Medical_Keyword_24 | \n",
+ " Medical_Keyword_46 | \n",
+ " Medical_Keyword_45 | \n",
+ " Medical_Keyword_44 | \n",
+ " Medical_Keyword_43 | \n",
+ " Medical_Keyword_42 | \n",
+ " Medical_Keyword_41 | \n",
+ " Medical_Keyword_40 | \n",
+ " Medical_Keyword_39 | \n",
+ " Medical_Keyword_38 | \n",
+ " Medical_Keyword_37 | \n",
+ " Medical_Keyword_23 | \n",
+ " Medical_Keyword_36 | \n",
+ " Medical_Keyword_34 | \n",
+ " Medical_Keyword_33 | \n",
+ " Medical_Keyword_32 | \n",
+ " Medical_Keyword_31 | \n",
+ " Medical_Keyword_30 | \n",
+ " Medical_Keyword_29 | \n",
+ " Medical_Keyword_28 | \n",
+ " Medical_Keyword_27 | \n",
+ " Medical_Keyword_26 | \n",
+ " Medical_Keyword_25 | \n",
+ " Medical_Keyword_35 | \n",
+ " Medical_History_22 | \n",
+ " Medical_Keyword_48 | \n",
+ " InsuredInfo_4 | \n",
+ " Product_Info_5 | \n",
+ " Product_Info_6 | \n",
+ " Employment_Info_3 | \n",
+ " Employment_Info_5 | \n",
+ " Medical_History_4 | \n",
+ " InsuredInfo_2 | \n",
+ " InsuredInfo_5 | \n",
+ " InsuredInfo_6 | \n",
+ " InsuredInfo_7 | \n",
+ " Insurance_History_1 | \n",
+ " Medical_History_3 | \n",
+ " Medical_History_5 | \n",
+ " Medical_History_16 | \n",
+ " Medical_History_6 | \n",
+ " Medical_History_7 | \n",
+ " Family_Hist_1 | \n",
+ " Insurance_History_9 | \n",
+ " Insurance_History_2 | \n",
+ " Insurance_History_7 | \n",
+ " Insurance_History_4 | \n",
+ " Insurance_History_3 | \n",
+ " Medical_History_8 | \n",
+ " InsuredInfo_1 | \n",
+ " Product_Info_7 | \n",
+ " Insurance_History_8 | \n",
+ " Medical_History_9 | \n",
+ " Medical_History_26 | \n",
+ " Medical_History_41 | \n",
+ " Medical_History_17 | \n",
+ " Medical_History_18 | \n",
+ " Medical_History_19 | \n",
+ " Medical_History_20 | \n",
+ " Medical_History_21 | \n",
+ " Medical_History_14 | \n",
+ " Medical_History_23 | \n",
+ " Medical_History_25 | \n",
+ " Medical_History_27 | \n",
+ " Medical_History_28 | \n",
+ " Medical_History_29 | \n",
+ " Medical_History_30 | \n",
+ " Medical_History_31 | \n",
+ " Medical_History_13 | \n",
+ " Medical_History_40 | \n",
+ " Medical_History_34 | \n",
+ " Medical_History_35 | \n",
+ " Medical_History_12 | \n",
+ " Medical_History_36 | \n",
+ " Medical_History_11 | \n",
+ " Medical_History_37 | \n",
+ " Medical_History_39 | \n",
+ " InsuredInfo_3 | \n",
+ " Product_Info_2 | \n",
+ " Product_Info_3 | \n",
+ " Employment_Info_2 | \n",
+ " Ht | \n",
+ " Ins_Age | \n",
+ " Family_Hist_2 | \n",
+ " Family_Hist_4 | \n",
+ " Family_Hist_3 | \n",
+ " Family_Hist_5 | \n",
+ " Medical_History_32 | \n",
+ " Medical_History_10 | \n",
+ " Medical_History_1 | \n",
+ " Medical_History_24 | \n",
+ " Medical_History_15 | \n",
+ " Wt | \n",
+ " Medical_History_2 | \n",
+ " Employment_Info_4 | \n",
+ " Employment_Info_6 | \n",
+ " Product_Info_4 | \n",
+ " Employment_Info_1 | \n",
+ " Insurance_History_5 | \n",
+ " BMI | \n",
+ "
\n",
+ " \n",
+ " Unique_values | \n",
+ " 2 | \n",
+ " 2 | \n",
+ " 2 | \n",
+ " 2 | \n",
+ " 2 | \n",
+ " 2 | \n",
+ " 2 | \n",
+ " 2 | \n",
+ " 2 | \n",
+ " 2 | \n",
+ " 2 | \n",
+ " 2 | \n",
+ " 2 | \n",
+ " 2 | \n",
+ " 2 | \n",
+ " 2 | \n",
+ " 2 | \n",
+ " 2 | \n",
+ " 2 | \n",
+ " 2 | \n",
+ " 2 | \n",
+ " 2 | \n",
+ " 2 | \n",
+ " 2 | \n",
+ " 2 | \n",
+ " 2 | \n",
+ " 2 | \n",
+ " 2 | \n",
+ " 2 | \n",
+ " 2 | \n",
+ " 2 | \n",
+ " 2 | \n",
+ " 2 | \n",
+ " 2 | \n",
+ " 2 | \n",
+ " 2 | \n",
+ " 2 | \n",
+ " 2 | \n",
+ " 2 | \n",
+ " 2 | \n",
+ " 2 | \n",
+ " 2 | \n",
+ " 2 | \n",
+ " 2 | \n",
+ " 2 | \n",
+ " 2 | \n",
+ " 2 | \n",
+ " 2 | \n",
+ " 2 | \n",
+ " 2 | \n",
+ " 2 | \n",
+ " 2 | \n",
+ " 2 | \n",
+ " 2 | \n",
+ " 2 | \n",
+ " 2 | \n",
+ " 2 | \n",
+ " 2 | \n",
+ " 2 | \n",
+ " 2 | \n",
+ " 2 | \n",
+ " 2 | \n",
+ " 2 | \n",
+ " 3 | \n",
+ " 3 | \n",
+ " 3 | \n",
+ " 3 | \n",
+ " 3 | \n",
+ " 3 | \n",
+ " 3 | \n",
+ " 3 | \n",
+ " 3 | \n",
+ " 3 | \n",
+ " 3 | \n",
+ " 3 | \n",
+ " 3 | \n",
+ " 3 | \n",
+ " 3 | \n",
+ " 3 | \n",
+ " 3 | \n",
+ " 3 | \n",
+ " 3 | \n",
+ " 3 | \n",
+ " 3 | \n",
+ " 3 | \n",
+ " 3 | \n",
+ " 3 | \n",
+ " 3 | \n",
+ " 3 | \n",
+ " 3 | \n",
+ " 3 | \n",
+ " 3 | \n",
+ " 3 | \n",
+ " 3 | \n",
+ " 3 | \n",
+ " 3 | \n",
+ " 3 | \n",
+ " 3 | \n",
+ " 3 | \n",
+ " 3 | \n",
+ " 3 | \n",
+ " 3 | \n",
+ " 3 | \n",
+ " 11 | \n",
+ " 19 | \n",
+ " 34 | \n",
+ " 36 | \n",
+ " 39 | \n",
+ " 65 | \n",
+ " 69 | \n",
+ " 69 | \n",
+ " 91 | \n",
+ " 91 | \n",
+ " 96 | \n",
+ " 104 | \n",
+ " 172 | \n",
+ " 228 | \n",
+ " 242 | \n",
+ " 300 | \n",
+ " 579 | \n",
+ " 872 | \n",
+ " 993 | \n",
+ " 1491 | \n",
+ " 1937 | \n",
+ " 2266 | \n",
+ " 3256 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " 0 1 2 \\\n",
+ "Features Product_Info_1 Medical_Keyword_20 Medical_Keyword_19 \n",
+ "Unique_values 2 2 2 \n",
+ "\n",
+ " 3 4 5 \\\n",
+ "Features Medical_Keyword_18 Medical_Keyword_17 Medical_Keyword_16 \n",
+ "Unique_values 2 2 2 \n",
+ "\n",
+ " 6 7 8 \\\n",
+ "Features Medical_Keyword_15 Medical_Keyword_14 Medical_Keyword_13 \n",
+ "Unique_values 2 2 2 \n",
+ "\n",
+ " 9 10 11 \\\n",
+ "Features Medical_Keyword_12 Medical_Keyword_11 Medical_Keyword_21 \n",
+ "Unique_values 2 2 2 \n",
+ "\n",
+ " 12 13 14 \\\n",
+ "Features Medical_Keyword_10 Medical_Keyword_8 Medical_Keyword_7 \n",
+ "Unique_values 2 2 2 \n",
+ "\n",
+ " 15 16 17 \\\n",
+ "Features Medical_Keyword_6 Medical_Keyword_5 Medical_Keyword_4 \n",
+ "Unique_values 2 2 2 \n",
+ "\n",
+ " 18 19 20 \\\n",
+ "Features Medical_Keyword_3 Medical_Keyword_2 Medical_Keyword_1 \n",
+ "Unique_values 2 2 2 \n",
+ "\n",
+ " 21 22 23 \\\n",
+ "Features Medical_History_38 Medical_History_33 Medical_Keyword_9 \n",
+ "Unique_values 2 2 2 \n",
+ "\n",
+ " 24 25 26 \\\n",
+ "Features Medical_Keyword_47 Medical_Keyword_22 Medical_Keyword_24 \n",
+ "Unique_values 2 2 2 \n",
+ "\n",
+ " 27 28 29 \\\n",
+ "Features Medical_Keyword_46 Medical_Keyword_45 Medical_Keyword_44 \n",
+ "Unique_values 2 2 2 \n",
+ "\n",
+ " 30 31 32 \\\n",
+ "Features Medical_Keyword_43 Medical_Keyword_42 Medical_Keyword_41 \n",
+ "Unique_values 2 2 2 \n",
+ "\n",
+ " 33 34 35 \\\n",
+ "Features Medical_Keyword_40 Medical_Keyword_39 Medical_Keyword_38 \n",
+ "Unique_values 2 2 2 \n",
+ "\n",
+ " 36 37 38 \\\n",
+ "Features Medical_Keyword_37 Medical_Keyword_23 Medical_Keyword_36 \n",
+ "Unique_values 2 2 2 \n",
+ "\n",
+ " 39 40 41 \\\n",
+ "Features Medical_Keyword_34 Medical_Keyword_33 Medical_Keyword_32 \n",
+ "Unique_values 2 2 2 \n",
+ "\n",
+ " 42 43 44 \\\n",
+ "Features Medical_Keyword_31 Medical_Keyword_30 Medical_Keyword_29 \n",
+ "Unique_values 2 2 2 \n",
+ "\n",
+ " 45 46 47 \\\n",
+ "Features Medical_Keyword_28 Medical_Keyword_27 Medical_Keyword_26 \n",
+ "Unique_values 2 2 2 \n",
+ "\n",
+ " 48 49 50 \\\n",
+ "Features Medical_Keyword_25 Medical_Keyword_35 Medical_History_22 \n",
+ "Unique_values 2 2 2 \n",
+ "\n",
+ " 51 52 53 \\\n",
+ "Features Medical_Keyword_48 InsuredInfo_4 Product_Info_5 \n",
+ "Unique_values 2 2 2 \n",
+ "\n",
+ " 54 55 56 \\\n",
+ "Features Product_Info_6 Employment_Info_3 Employment_Info_5 \n",
+ "Unique_values 2 2 2 \n",
+ "\n",
+ " 57 58 59 60 \\\n",
+ "Features Medical_History_4 InsuredInfo_2 InsuredInfo_5 InsuredInfo_6 \n",
+ "Unique_values 2 2 2 2 \n",
+ "\n",
+ " 61 62 63 \\\n",
+ "Features InsuredInfo_7 Insurance_History_1 Medical_History_3 \n",
+ "Unique_values 2 2 3 \n",
+ "\n",
+ " 64 65 66 \\\n",
+ "Features Medical_History_5 Medical_History_16 Medical_History_6 \n",
+ "Unique_values 3 3 3 \n",
+ "\n",
+ " 67 68 69 \\\n",
+ "Features Medical_History_7 Family_Hist_1 Insurance_History_9 \n",
+ "Unique_values 3 3 3 \n",
+ "\n",
+ " 70 71 72 \\\n",
+ "Features Insurance_History_2 Insurance_History_7 Insurance_History_4 \n",
+ "Unique_values 3 3 3 \n",
+ "\n",
+ " 73 74 75 \\\n",
+ "Features Insurance_History_3 Medical_History_8 InsuredInfo_1 \n",
+ "Unique_values 3 3 3 \n",
+ "\n",
+ " 76 77 78 \\\n",
+ "Features Product_Info_7 Insurance_History_8 Medical_History_9 \n",
+ "Unique_values 3 3 3 \n",
+ "\n",
+ " 79 80 81 \\\n",
+ "Features Medical_History_26 Medical_History_41 Medical_History_17 \n",
+ "Unique_values 3 3 3 \n",
+ "\n",
+ " 82 83 84 \\\n",
+ "Features Medical_History_18 Medical_History_19 Medical_History_20 \n",
+ "Unique_values 3 3 3 \n",
+ "\n",
+ " 85 86 87 \\\n",
+ "Features Medical_History_21 Medical_History_14 Medical_History_23 \n",
+ "Unique_values 3 3 3 \n",
+ "\n",
+ " 88 89 90 \\\n",
+ "Features Medical_History_25 Medical_History_27 Medical_History_28 \n",
+ "Unique_values 3 3 3 \n",
+ "\n",
+ " 91 92 93 \\\n",
+ "Features Medical_History_29 Medical_History_30 Medical_History_31 \n",
+ "Unique_values 3 3 3 \n",
+ "\n",
+ " 94 95 96 \\\n",
+ "Features Medical_History_13 Medical_History_40 Medical_History_34 \n",
+ "Unique_values 3 3 3 \n",
+ "\n",
+ " 97 98 99 \\\n",
+ "Features Medical_History_35 Medical_History_12 Medical_History_36 \n",
+ "Unique_values 3 3 3 \n",
+ "\n",
+ " 100 101 102 \\\n",
+ "Features Medical_History_11 Medical_History_37 Medical_History_39 \n",
+ "Unique_values 3 3 3 \n",
+ "\n",
+ " 103 104 105 \\\n",
+ "Features InsuredInfo_3 Product_Info_2 Product_Info_3 \n",
+ "Unique_values 11 19 34 \n",
+ "\n",
+ " 106 107 108 109 110 \\\n",
+ "Features Employment_Info_2 Ht Ins_Age Family_Hist_2 Family_Hist_4 \n",
+ "Unique_values 36 39 65 69 69 \n",
+ "\n",
+ " 111 112 113 \\\n",
+ "Features Family_Hist_3 Family_Hist_5 Medical_History_32 \n",
+ "Unique_values 91 91 96 \n",
+ "\n",
+ " 114 115 116 \\\n",
+ "Features Medical_History_10 Medical_History_1 Medical_History_24 \n",
+ "Unique_values 104 172 228 \n",
+ "\n",
+ " 117 118 119 120 \\\n",
+ "Features Medical_History_15 Wt Medical_History_2 Employment_Info_4 \n",
+ "Unique_values 242 300 579 872 \n",
+ "\n",
+ " 121 122 123 \\\n",
+ "Features Employment_Info_6 Product_Info_4 Employment_Info_1 \n",
+ "Unique_values 993 1491 1937 \n",
+ "\n",
+ " 124 125 \n",
+ "Features Insurance_History_5 BMI \n",
+ "Unique_values 2266 3256 "
+ ]
+ },
+ "execution_count": 16,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Show how many unique values each attribute has (transposed for a wide view)\n",
+ "df_unique_values.transpose()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "id": "ba10d10d-b5f5-49de-b8ed-6ba56db18e20",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Features | \n",
+ "
\n",
+ " \n",
+ " Unique_values | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 2 | \n",
+ " 63 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 40 | \n",
+ "
\n",
+ " \n",
+ " 11 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 19 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 34 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 36 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 39 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 65 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 69 | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ " 91 | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ " 96 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 104 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 172 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 228 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 242 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 300 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 579 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 872 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 993 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 1491 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 1937 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 2266 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 3256 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Features\n",
+ "Unique_values \n",
+ "2 63\n",
+ "3 40\n",
+ "11 1\n",
+ "19 1\n",
+ "34 1\n",
+ "36 1\n",
+ "39 1\n",
+ "65 1\n",
+ "69 2\n",
+ "91 2\n",
+ "96 1\n",
+ "104 1\n",
+ "172 1\n",
+ "228 1\n",
+ "242 1\n",
+ "300 1\n",
+ "579 1\n",
+ "872 1\n",
+ "993 1\n",
+ "1491 1\n",
+ "1937 1\n",
+ "2266 1\n",
+ "3256 1"
+ ]
+ },
+ "execution_count": 17,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Group the attributes by their number of unique values and count the attributes in each group\n",
+ "df_unique_values.groupby(by='Unique_values').agg('count')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "id": "0d6753eb-60d3-40e1-8826-6bd1f28e4c45",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Build one list of attribute names per bucket: 2 unique values, 3 unique values, and the remaining attributes\n",
+ "\n",
+ "two_values = df_group.loc[df_group['unique_values'] == 2, 'features'].item()\n",
+ "tree_values = df_group.loc[df_group['unique_values'] == 3, 'features'].item()\n",
+ "more_values = df_group.loc[df_group['unique_values'] == 'more', 'features'].item()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "id": "253d7c85-db6f-4d07-97c7-3a3f1eafde66",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Não existe valores faltantes na lista: \"two_values\"\n",
+ "Não existe valores faltantes na lista: \"tree_values\n",
+ "Existe valores faltantes na lista \"more_values\"\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Check, for each unique-value bucket, whether any of its attributes has missing values\n",
+ "\n",
+ "# Maps the bucket label stored in df_group to the name of the corresponding list\n",
+ "group_list_names = {2: 'two_values', 3: 'tree_values', 'more': 'more_values'}\n",
+ "\n",
+ "for row in df_group.itertuples():\n",
+ "    list_name = group_list_names[row.unique_values]\n",
+ "    # Exactly one message per bucket, decided by that bucket's own overlap with the\n",
+ "    # attributes known to contain missing values\n",
+ "    if set(row.features).intersection(missing_values_keep):\n",
+ "        print(f'Existe valores faltantes na lista \"{list_name}\"')\n",
+ "    else:\n",
+ "        print(f'Não existe valores faltantes na lista: \"{list_name}\"')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 20,
+ "id": "530e46d9-83d8-4d77-98f1-77f017b7b1ea",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "# Plot the number of samples per class\n",
+ "\n",
+ "fig, ax = plt.subplots(figsize=(15, 6))\n",
+ "sns.countplot(x='Response', data=dataset, palette='OrRd', ax=ax)\n",
+ "# Hide the axes frame\n",
+ "ax.set_frame_on(False)\n",
+ "ax.set_xlabel('Classes', fontsize=11)\n",
+ "ax.set_ylabel('Quantidade', fontsize=11)\n",
+ "ax.set_title('Contagem de Classes\\n')\n",
+ "plt.show()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 21,
+ "id": "07815c68-be10-4d76-bec8-ba1df8660a8f",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Strongest positive and negative correlations with the target variable\n",
+ "\n",
+ "# Compute the correlation matrix once and reuse it. Only numeric columns are used:\n",
+ "# the frame also holds the object column 'Product_Info_2', which corr() rejects on pandas >= 2.0\n",
+ "response_corr = dataset.select_dtypes(include='number').corr()['Response']\n",
+ "\n",
+ "# Note: the first entry of the positive ranking is 'Response' itself (correlation 1.0)\n",
+ "corr_positivo = response_corr.sort_values(ascending=False).head(10)\n",
+ "corr_negative = response_corr.sort_values(ascending=True).head(10)\n",
+ "correlacao = pd.concat([corr_positivo, corr_negative], axis=1, keys=['Corr Positiva', 'Corr Negativa'])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 22,
+ "id": "4a2a2e09-1201-4a67-a3ad-af13a7078cbe",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Corr Positiva | \n",
+ " Corr Negativa | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " Response | \n",
+ " 1.000000 | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " Medical_History_23 | \n",
+ " 0.286584 | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " Medical_History_15 | \n",
+ " 0.277311 | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " Medical_History_4 | \n",
+ " 0.239896 | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " Medical_History_39 | \n",
+ " 0.220176 | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " Product_Info_4 | \n",
+ " 0.202434 | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " Medical_History_6 | \n",
+ " 0.159230 | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " Medical_History_32 | \n",
+ " 0.144536 | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " Medical_History_13 | \n",
+ " 0.134863 | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " Medical_History_40 | \n",
+ " 0.131519 | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " BMI | \n",
+ " NaN | \n",
+ " -0.381601 | \n",
+ "
\n",
+ " \n",
+ " Wt | \n",
+ " NaN | \n",
+ " -0.351395 | \n",
+ "
\n",
+ " \n",
+ " Medical_Keyword_15 | \n",
+ " NaN | \n",
+ " -0.259169 | \n",
+ "
\n",
+ " \n",
+ " Medical_Keyword_3 | \n",
+ " NaN | \n",
+ " -0.257706 | \n",
+ "
\n",
+ " \n",
+ " Ins_Age | \n",
+ " NaN | \n",
+ " -0.209610 | \n",
+ "
\n",
+ " \n",
+ " Medical_Keyword_48 | \n",
+ " NaN | \n",
+ " -0.159557 | \n",
+ "
\n",
+ " \n",
+ " Medical_History_16 | \n",
+ " NaN | \n",
+ " -0.137542 | \n",
+ "
\n",
+ " \n",
+ " Insurance_History_2 | \n",
+ " NaN | \n",
+ " -0.122196 | \n",
+ "
\n",
+ " \n",
+ " Employment_Info_3 | \n",
+ " NaN | \n",
+ " -0.116408 | \n",
+ "
\n",
+ " \n",
+ " Medical_History_30 | \n",
+ " NaN | \n",
+ " -0.114870 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Corr Positiva Corr Negativa\n",
+ "Response 1.000000 NaN\n",
+ "Medical_History_23 0.286584 NaN\n",
+ "Medical_History_15 0.277311 NaN\n",
+ "Medical_History_4 0.239896 NaN\n",
+ "Medical_History_39 0.220176 NaN\n",
+ "Product_Info_4 0.202434 NaN\n",
+ "Medical_History_6 0.159230 NaN\n",
+ "Medical_History_32 0.144536 NaN\n",
+ "Medical_History_13 0.134863 NaN\n",
+ "Medical_History_40 0.131519 NaN\n",
+ "BMI NaN -0.381601\n",
+ "Wt NaN -0.351395\n",
+ "Medical_Keyword_15 NaN -0.259169\n",
+ "Medical_Keyword_3 NaN -0.257706\n",
+ "Ins_Age NaN -0.209610\n",
+ "Medical_Keyword_48 NaN -0.159557\n",
+ "Medical_History_16 NaN -0.137542\n",
+ "Insurance_History_2 NaN -0.122196\n",
+ "Employment_Info_3 NaN -0.116408\n",
+ "Medical_History_30 NaN -0.114870"
+ ]
+ },
+ "execution_count": 22,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# VISUALIZANDO CORRELAÇÕES\n",
+ "correlacao"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 23,
+ "id": "e9e4f5ce-2a5f-4913-806b-9b978ba644e6",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "# Bar chart of the correlation between each input variable and the target class\n",
+ "\n",
+ "dataset.corrwith(dataset['Response']).plot(\n",
+ "    kind='bar',\n",
+ "    figsize=(30, 8),\n",
+ "    title='Correlação das Variáveis de Entrada com a Classe de Saída',\n",
+ "    fontsize=8,\n",
+ "    grid=True,\n",
+ "    legend=False,\n",
+ ")\n",
+ "plt.show()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "038513f1-43e0-47e1-97ef-b76ffa4016ec",
+ "metadata": {},
+ "source": [
+ "# ENGENHARIA DE ATRIBUTOS"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 24,
+ "id": "b2dd9a1d-a8c2-4dc9-9a39-9a873a7cf34a",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Separate the attributes from the target variable and drop the attributes with more than 70% missing values\n",
+ "\n",
+ "x = dataset.drop(columns=[*missing_values_drop, 'Response', 'Id'])\n",
+ "# Flatten the single-column target into a 1-D array\n",
+ "y_train = dataset.loc[:, ['Response']].to_numpy().ravel()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 25,
+ "id": "783aa71b-e812-49b0-823a-b1f39f4b4295",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "['Product_Info_2']"
+ ]
+ },
+ "execution_count": 25,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# List the attributes of dtype 'object' in the feature frame\n",
+ "x.select_dtypes(include='object').columns.tolist()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 26,
+ "id": "5388b083-5861-43d4-9f92-6a58dc04318a",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "array(['D3', 'A1', 'E1', 'D4', 'D2', 'A8', 'A2', 'D1', 'A7', 'A6', 'A3',\n",
+ " 'A5', 'C4', 'C1', 'B2', 'C3', 'C2', 'A4', 'B1'], dtype=object)"
+ ]
+ },
+ "execution_count": 26,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Show the distinct values of the 'object' attribute\n",
+ "x.loc[:, 'Product_Info_2'].unique()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 27,
+ "id": "ee0d1d75-93b6-4cd7-9a9b-d42b2edacf5c",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Split 'Product_Info_2' (a letter followed by a digit, e.g. 'D3') into one column per part\n",
+ "x['Product_Info_2_char'] = x['Product_Info_2'].str[0]\n",
+ "# Cast the digit to int so the column that is passed through to the model matrix is numeric, not a string\n",
+ "x['Product_Info_2_num'] = x['Product_Info_2'].str[1].astype(int)\n",
+ "\n",
+ "# Distinct letters sorted in ascending order (np.sort); this list is the category order given to the OrdinalEncoder\n",
+ "producto_info_2_char = np.sort(x['Product_Info_2_char'].unique()).tolist()\n",
+ "\n",
+ "# Drop the original column now that it has been split in two\n",
+ "x = x.drop(columns='Product_Info_2')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "cf81e375-92d5-4560-8257-d79f0b3eba6a",
+ "metadata": {},
+ "source": [
+ "# Criando Pipeline para Tratamento dos Dados"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 28,
+ "id": "281f5cbd-1c38-4c01-a595-4bf014e97bb7",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Pipeline for the numeric attributes that contain missing values\n",
+ "\n",
+ "# IterativeImputer has to receive the NaNs itself: a SimpleImputer placed in front of it fills\n",
+ "# every gap first and leaves the iterative step nothing to impute. The column mean is still\n",
+ "# used as the starting guess through initial_strategy.\n",
+ "pipeline_numerical = Pipeline([\n",
+ "    ('iterative_imputer', IterativeImputer(initial_strategy='mean'))\n",
+ "])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 29,
+ "id": "1bca9a65-1f60-4512-86d1-150fd358c585",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# PIPELINE FOR THE 'OBJECT'-TYPED ATTRIBUTE: ORDINAL ENCODING WITH AN EXPLICIT CATEGORY ORDER\n",
+ "pipeline_object = Pipeline(steps=[\n",
+ "    ('ordinal_encoder', OrdinalEncoder(categories=[producto_info_2_char])),\n",
+ "])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 30,
+ "id": "3aa211df-06c3-4566-8b30-0528bce738f8",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# ROUTE EACH GROUP OF COLUMNS TO ITS PIPELINE; COLUMNS NOT LISTED ARE PASSED THROUGH UNCHANGED\n",
+ "\n",
+ "ct_preprocessor = ColumnTransformer(\n",
+ "    transformers=[\n",
+ "        ('numerical', pipeline_numerical, missing_values_keep),\n",
+ "        ('object', pipeline_object, ['Product_Info_2_char']),\n",
+ "    ],\n",
+ "    remainder='passthrough',\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 31,
+ "id": "8a495dc2-74f6-4338-99e5-af5e92432c23",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "ColumnTransformer(remainder='passthrough',\n",
+ " transformers=[('numerical',\n",
+ " Pipeline(steps=[('simple_imputer',\n",
+ " SimpleImputer()),\n",
+ " ('iterative_imputer',\n",
+ " IterativeImputer())]),\n",
+ " ['Family_Hist_3', 'Family_Hist_2',\n",
+ " 'Insurance_History_5', 'Family_Hist_4',\n",
+ " 'Employment_Info_6', 'Medical_History_1',\n",
+ " 'Employment_Info_4', 'Employment_Info_1']),\n",
+ " ('object',\n",
+ " Pipeline(steps=[('ordinal_encoder',\n",
+ " OrdinalEncoder(categories=[['A',\n",
+ " 'B',\n",
+ " 'C',\n",
+ " 'D',\n",
+ " 'E']]))]),\n",
+ " ['Product_Info_2_char'])])"
+ ]
+ },
+ "execution_count": 31,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# FIT THE PREPROCESSING TRANSFORMER ON THE DATA\n",
+ "ct_preprocessor.fit(X=x)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 32,
+ "id": "b898d50e-0f93-497c-afec-dc762cd6c9f8",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# TRANSFORM THE DATA AND STORE IT IN A DATAFRAME\n",
+ "#\n",
+ "# ColumnTransformer EMITS ITS OUTPUT IN TRANSFORMER ORDER - THE IMPUTED NUMERIC COLUMNS FIRST, THEN THE\n",
+ "# ENCODED COLUMN, THEN THE PASSTHROUGH REMAINDER IN ITS ORIGINAL ORDER - NOT IN THE ORDER OF x.columns.\n",
+ "# THE LABELS ARE THEREFORE REBUILT IN THAT SAME ORDER SO EVERY COLUMN NAME MATCHES ITS VALUES.\n",
+ "handled_columns = list(missing_values_keep) + ['Product_Info_2_char']\n",
+ "remainder_columns = [col for col in x.columns if col not in handled_columns]\n",
+ "x_train = pd.DataFrame(ct_preprocessor.transform(x), columns=handled_columns + remainder_columns)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 33,
+ "id": "1f7859bc-ea2f-4f17-805d-4f1ac757cadc",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Product_Info_1 | \n",
+ " Product_Info_3 | \n",
+ " Product_Info_4 | \n",
+ " Product_Info_5 | \n",
+ " Product_Info_6 | \n",
+ " Product_Info_7 | \n",
+ " Ins_Age | \n",
+ " Ht | \n",
+ " Wt | \n",
+ " BMI | \n",
+ " Employment_Info_1 | \n",
+ " Employment_Info_2 | \n",
+ " Employment_Info_3 | \n",
+ " Employment_Info_4 | \n",
+ " Employment_Info_5 | \n",
+ " Employment_Info_6 | \n",
+ " InsuredInfo_1 | \n",
+ " InsuredInfo_2 | \n",
+ " InsuredInfo_3 | \n",
+ " InsuredInfo_4 | \n",
+ " InsuredInfo_5 | \n",
+ " InsuredInfo_6 | \n",
+ " InsuredInfo_7 | \n",
+ " Insurance_History_1 | \n",
+ " Insurance_History_2 | \n",
+ " Insurance_History_3 | \n",
+ " Insurance_History_4 | \n",
+ " Insurance_History_5 | \n",
+ " Insurance_History_7 | \n",
+ " Insurance_History_8 | \n",
+ " Insurance_History_9 | \n",
+ " Family_Hist_1 | \n",
+ " Family_Hist_2 | \n",
+ " Family_Hist_3 | \n",
+ " Family_Hist_4 | \n",
+ " Medical_History_1 | \n",
+ " Medical_History_2 | \n",
+ " Medical_History_3 | \n",
+ " Medical_History_4 | \n",
+ " Medical_History_5 | \n",
+ " Medical_History_6 | \n",
+ " Medical_History_7 | \n",
+ " Medical_History_8 | \n",
+ " Medical_History_9 | \n",
+ " Medical_History_11 | \n",
+ " Medical_History_12 | \n",
+ " Medical_History_13 | \n",
+ " Medical_History_14 | \n",
+ " Medical_History_16 | \n",
+ " Medical_History_17 | \n",
+ " Medical_History_18 | \n",
+ " Medical_History_19 | \n",
+ " Medical_History_20 | \n",
+ " Medical_History_21 | \n",
+ " Medical_History_22 | \n",
+ " Medical_History_23 | \n",
+ " Medical_History_25 | \n",
+ " Medical_History_26 | \n",
+ " Medical_History_27 | \n",
+ " Medical_History_28 | \n",
+ " Medical_History_29 | \n",
+ " Medical_History_30 | \n",
+ " Medical_History_31 | \n",
+ " Medical_History_33 | \n",
+ " Medical_History_34 | \n",
+ " Medical_History_35 | \n",
+ " Medical_History_36 | \n",
+ " Medical_History_37 | \n",
+ " Medical_History_38 | \n",
+ " Medical_History_39 | \n",
+ " Medical_History_40 | \n",
+ " Medical_History_41 | \n",
+ " Medical_Keyword_1 | \n",
+ " Medical_Keyword_2 | \n",
+ " Medical_Keyword_3 | \n",
+ " Medical_Keyword_4 | \n",
+ " Medical_Keyword_5 | \n",
+ " Medical_Keyword_6 | \n",
+ " Medical_Keyword_7 | \n",
+ " Medical_Keyword_8 | \n",
+ " Medical_Keyword_9 | \n",
+ " Medical_Keyword_10 | \n",
+ " Medical_Keyword_11 | \n",
+ " Medical_Keyword_12 | \n",
+ " Medical_Keyword_13 | \n",
+ " Medical_Keyword_14 | \n",
+ " Medical_Keyword_15 | \n",
+ " Medical_Keyword_16 | \n",
+ " Medical_Keyword_17 | \n",
+ " Medical_Keyword_18 | \n",
+ " Medical_Keyword_19 | \n",
+ " Medical_Keyword_20 | \n",
+ " Medical_Keyword_21 | \n",
+ " Medical_Keyword_22 | \n",
+ " Medical_Keyword_23 | \n",
+ " Medical_Keyword_24 | \n",
+ " Medical_Keyword_25 | \n",
+ " Medical_Keyword_26 | \n",
+ " Medical_Keyword_27 | \n",
+ " Medical_Keyword_28 | \n",
+ " Medical_Keyword_29 | \n",
+ " Medical_Keyword_30 | \n",
+ " Medical_Keyword_31 | \n",
+ " Medical_Keyword_32 | \n",
+ " Medical_Keyword_33 | \n",
+ " Medical_Keyword_34 | \n",
+ " Medical_Keyword_35 | \n",
+ " Medical_Keyword_36 | \n",
+ " Medical_Keyword_37 | \n",
+ " Medical_Keyword_38 | \n",
+ " Medical_Keyword_39 | \n",
+ " Medical_Keyword_40 | \n",
+ " Medical_Keyword_41 | \n",
+ " Medical_Keyword_42 | \n",
+ " Medical_Keyword_43 | \n",
+ " Medical_Keyword_44 | \n",
+ " Medical_Keyword_45 | \n",
+ " Medical_Keyword_46 | \n",
+ " Medical_Keyword_47 | \n",
+ " Medical_Keyword_48 | \n",
+ " Product_Info_2_char | \n",
+ " Product_Info_2_num | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 0.598039 | \n",
+ " 0.47455 | \n",
+ " 0.000667 | \n",
+ " 0.44489 | \n",
+ " 0.361469 | \n",
+ " 4.0 | \n",
+ " 0.0 | \n",
+ " 0.028 | \n",
+ " 3.0 | \n",
+ " 1 | \n",
+ " 10 | \n",
+ " 0.076923 | \n",
+ " 2 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 0.641791 | \n",
+ " 0.581818 | \n",
+ " 0.148536 | \n",
+ " 0.323008 | \n",
+ " 12 | \n",
+ " 1 | \n",
+ " 3 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 6 | \n",
+ " 3 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 3 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 2 | \n",
+ " 112 | \n",
+ " 2 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 3 | \n",
+ " 2 | \n",
+ " 2 | \n",
+ " 1 | \n",
+ " 3 | \n",
+ " 2 | \n",
+ " 3 | \n",
+ " 3 | \n",
+ " 3 | \n",
+ " 3 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 3 | \n",
+ " 1 | \n",
+ " 3 | \n",
+ " 3 | \n",
+ " 1 | \n",
+ " 3 | \n",
+ " 2 | \n",
+ " 3 | \n",
+ " 1 | \n",
+ " 3 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 2 | \n",
+ " 1 | \n",
+ " 3 | \n",
+ " 3 | \n",
+ " 3 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 3 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 0.497737 | \n",
+ " 0.188406 | \n",
+ " 0.000133 | \n",
+ " 0.084507 | \n",
+ " 0.0018 | \n",
+ " 5.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 1 | \n",
+ " 26 | \n",
+ " 0.076923 | \n",
+ " 2 | \n",
+ " 3 | \n",
+ " 1 | \n",
+ " 0.059701 | \n",
+ " 0.6 | \n",
+ " 0.131799 | \n",
+ " 0.272288 | \n",
+ " 1 | \n",
+ " 3 | \n",
+ " 2 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 6 | \n",
+ " 3 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 1 | \n",
+ " 3 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 3 | \n",
+ " 2 | \n",
+ " 2 | \n",
+ " 412 | \n",
+ " 2 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 3 | \n",
+ " 2 | \n",
+ " 2 | \n",
+ " 1 | \n",
+ " 3 | \n",
+ " 2 | \n",
+ " 3 | \n",
+ " 3 | \n",
+ " 1 | \n",
+ " 3 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 3 | \n",
+ " 1 | \n",
+ " 3 | \n",
+ " 3 | \n",
+ " 1 | \n",
+ " 3 | \n",
+ " 2 | \n",
+ " 3 | \n",
+ " 3 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 2 | \n",
+ " 1 | \n",
+ " 3 | \n",
+ " 3 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 0.497737 | \n",
+ " 0.304348 | \n",
+ " 0.001733 | \n",
+ " 0.225352 | \n",
+ " 0.03 | \n",
+ " 10.0 | \n",
+ " 0.0 | \n",
+ " 0.03 | \n",
+ " 4.0 | \n",
+ " 1 | \n",
+ " 26 | \n",
+ " 0.076923 | \n",
+ " 2 | \n",
+ " 3 | \n",
+ " 1 | \n",
+ " 0.029851 | \n",
+ " 0.745455 | \n",
+ " 0.288703 | \n",
+ " 0.42878 | \n",
+ " 9 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 8 | \n",
+ " 3 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 3 | \n",
+ " 3 | \n",
+ " 2 | \n",
+ " 3 | \n",
+ " 3 | \n",
+ " 3 | \n",
+ " 2 | \n",
+ " 2 | \n",
+ " 1 | \n",
+ " 3 | \n",
+ " 2 | \n",
+ " 2 | \n",
+ " 2 | \n",
+ " 3 | \n",
+ " 2 | \n",
+ " 3 | \n",
+ " 3 | \n",
+ " 1 | \n",
+ " 3 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 3 | \n",
+ " 2 | \n",
+ " 2 | \n",
+ " 3 | \n",
+ " 1 | \n",
+ " 3 | \n",
+ " 2 | \n",
+ " 3 | \n",
+ " 3 | \n",
+ " 3 | \n",
+ " 1 | \n",
+ " 3 | \n",
+ " 2 | \n",
+ " 1 | \n",
+ " 3 | \n",
+ " 3 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 0.497737 | \n",
+ " 0.42029 | \n",
+ " 0.001733 | \n",
+ " 0.352113 | \n",
+ " 0.2 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.042 | \n",
+ " 3.0 | \n",
+ " 1 | \n",
+ " 10 | \n",
+ " 0.487179 | \n",
+ " 2 | \n",
+ " 3 | \n",
+ " 1 | \n",
+ " 0.164179 | \n",
+ " 0.672727 | \n",
+ " 0.205021 | \n",
+ " 0.352438 | \n",
+ " 9 | \n",
+ " 1 | \n",
+ " 3 | \n",
+ " 2 | \n",
+ " 2 | \n",
+ " 8 | \n",
+ " 3 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 3 | \n",
+ " 3 | \n",
+ " 2 | \n",
+ " 3 | \n",
+ " 3 | \n",
+ " 350 | \n",
+ " 2 | \n",
+ " 2 | \n",
+ " 1 | \n",
+ " 3 | \n",
+ " 2 | \n",
+ " 2 | \n",
+ " 2 | \n",
+ " 3 | \n",
+ " 2 | \n",
+ " 3 | \n",
+ " 3 | \n",
+ " 1 | \n",
+ " 3 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 2 | \n",
+ " 2 | \n",
+ " 3 | \n",
+ " 1 | \n",
+ " 3 | \n",
+ " 3 | \n",
+ " 1 | \n",
+ " 3 | \n",
+ " 2 | \n",
+ " 3 | \n",
+ " 3 | \n",
+ " 3 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 2 | \n",
+ " 1 | \n",
+ " 3 | \n",
+ " 3 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 4 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 0.497737 | \n",
+ " 0.463768 | \n",
+ " 0.001733 | \n",
+ " 0.408451 | \n",
+ " 0.05 | \n",
+ " 7.962172 | \n",
+ " 0.0 | \n",
+ " 0.027 | \n",
+ " 3.0 | \n",
+ " 1 | \n",
+ " 26 | \n",
+ " 0.230769 | \n",
+ " 2 | \n",
+ " 3 | \n",
+ " 1 | \n",
+ " 0.41791 | \n",
+ " 0.654545 | \n",
+ " 0.23431 | \n",
+ " 0.424046 | \n",
+ " 9 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 6 | \n",
+ " 3 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 3 | \n",
+ " 3 | \n",
+ " 2 | \n",
+ " 3 | \n",
+ " 2 | \n",
+ " 162 | \n",
+ " 2 | \n",
+ " 2 | \n",
+ " 1 | \n",
+ " 3 | \n",
+ " 2 | \n",
+ " 2 | \n",
+ " 2 | \n",
+ " 3 | \n",
+ " 2 | \n",
+ " 3 | \n",
+ " 3 | \n",
+ " 1 | \n",
+ " 3 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 3 | \n",
+ " 2 | \n",
+ " 2 | \n",
+ " 3 | \n",
+ " 1 | \n",
+ " 3 | \n",
+ " 2 | \n",
+ " 3 | \n",
+ " 3 | \n",
+ " 3 | \n",
+ " 1 | \n",
+ " 3 | \n",
+ " 2 | \n",
+ " 1 | \n",
+ " 3 | \n",
+ " 3 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Product_Info_1 Product_Info_3 Product_Info_4 Product_Info_5 Product_Info_6 \\\n",
+ "0 0.598039 0.47455 0.000667 0.44489 0.361469 \n",
+ "1 0.497737 0.188406 0.000133 0.084507 0.0018 \n",
+ "2 0.497737 0.304348 0.001733 0.225352 0.03 \n",
+ "3 0.497737 0.42029 0.001733 0.352113 0.2 \n",
+ "4 0.497737 0.463768 0.001733 0.408451 0.05 \n",
+ "\n",
+ " Product_Info_7 Ins_Age Ht Wt BMI Employment_Info_1 Employment_Info_2 \\\n",
+ "0 4.0 0.0 0.028 3.0 1 10 0.076923 \n",
+ "1 5.0 0.0 0.0 0.0 1 26 0.076923 \n",
+ "2 10.0 0.0 0.03 4.0 1 26 0.076923 \n",
+ "3 0.0 0.0 0.042 3.0 1 10 0.487179 \n",
+ "4 7.962172 0.0 0.027 3.0 1 26 0.230769 \n",
+ "\n",
+ " Employment_Info_3 Employment_Info_4 Employment_Info_5 Employment_Info_6 \\\n",
+ "0 2 1 1 0.641791 \n",
+ "1 2 3 1 0.059701 \n",
+ "2 2 3 1 0.029851 \n",
+ "3 2 3 1 0.164179 \n",
+ "4 2 3 1 0.41791 \n",
+ "\n",
+ " InsuredInfo_1 InsuredInfo_2 InsuredInfo_3 InsuredInfo_4 InsuredInfo_5 \\\n",
+ "0 0.581818 0.148536 0.323008 12 1 \n",
+ "1 0.6 0.131799 0.272288 1 3 \n",
+ "2 0.745455 0.288703 0.42878 9 1 \n",
+ "3 0.672727 0.205021 0.352438 9 1 \n",
+ "4 0.654545 0.23431 0.424046 9 1 \n",
+ "\n",
+ " InsuredInfo_6 InsuredInfo_7 Insurance_History_1 Insurance_History_2 \\\n",
+ "0 3 1 2 6 \n",
+ "1 2 1 2 6 \n",
+ "2 2 1 2 8 \n",
+ "3 3 2 2 8 \n",
+ "4 2 1 2 6 \n",
+ "\n",
+ " Insurance_History_3 Insurance_History_4 Insurance_History_5 \\\n",
+ "0 3 1 2 \n",
+ "1 3 1 2 \n",
+ "2 3 1 1 \n",
+ "3 3 1 2 \n",
+ "4 3 1 2 \n",
+ "\n",
+ " Insurance_History_7 Insurance_History_8 Insurance_History_9 Family_Hist_1 \\\n",
+ "0 1 1 1 3 \n",
+ "1 1 2 1 3 \n",
+ "2 1 2 1 1 \n",
+ "3 1 2 1 1 \n",
+ "4 1 2 1 1 \n",
+ "\n",
+ " Family_Hist_2 Family_Hist_3 Family_Hist_4 Medical_History_1 \\\n",
+ "0 1 1 1 2 \n",
+ "1 1 1 3 2 \n",
+ "2 3 3 2 3 \n",
+ "3 3 3 2 3 \n",
+ "4 3 3 2 3 \n",
+ "\n",
+ " Medical_History_2 Medical_History_3 Medical_History_4 Medical_History_5 \\\n",
+ "0 2 112 2 1 \n",
+ "1 2 412 2 1 \n",
+ "2 3 3 2 2 \n",
+ "3 3 350 2 2 \n",
+ "4 2 162 2 2 \n",
+ "\n",
+ " Medical_History_6 Medical_History_7 Medical_History_8 Medical_History_9 \\\n",
+ "0 1 3 2 2 \n",
+ "1 1 3 2 2 \n",
+ "2 1 3 2 2 \n",
+ "3 1 3 2 2 \n",
+ "4 1 3 2 2 \n",
+ "\n",
+ " Medical_History_11 Medical_History_12 Medical_History_13 Medical_History_14 \\\n",
+ "0 1 3 2 3 \n",
+ "1 1 3 2 3 \n",
+ "2 2 3 2 3 \n",
+ "3 2 3 2 3 \n",
+ "4 2 3 2 3 \n",
+ "\n",
+ " Medical_History_16 Medical_History_17 Medical_History_18 Medical_History_19 \\\n",
+ "0 3 3 3 1 \n",
+ "1 3 1 3 1 \n",
+ "2 3 1 3 1 \n",
+ "3 3 1 3 1 \n",
+ "4 3 1 3 1 \n",
+ "\n",
+ " Medical_History_20 Medical_History_21 Medical_History_22 Medical_History_23 \\\n",
+ "0 1 2 1 2 \n",
+ "1 1 2 1 2 \n",
+ "2 1 2 1 2 \n",
+ "3 1 2 2 2 \n",
+ "4 1 2 1 2 \n",
+ "\n",
+ " Medical_History_25 Medical_History_26 Medical_History_27 Medical_History_28 \\\n",
+ "0 3 1 3 3 \n",
+ "1 3 1 3 3 \n",
+ "2 3 2 2 3 \n",
+ "3 3 1 3 3 \n",
+ "4 3 2 2 3 \n",
+ "\n",
+ " Medical_History_29 Medical_History_30 Medical_History_31 Medical_History_33 \\\n",
+ "0 1 3 2 3 \n",
+ "1 1 3 2 3 \n",
+ "2 1 3 2 3 \n",
+ "3 1 3 2 3 \n",
+ "4 1 3 2 3 \n",
+ "\n",
+ " Medical_History_34 Medical_History_35 Medical_History_36 Medical_History_37 \\\n",
+ "0 1 3 1 2 \n",
+ "1 3 1 1 2 \n",
+ "2 3 3 1 3 \n",
+ "3 3 3 1 2 \n",
+ "4 3 3 1 3 \n",
+ "\n",
+ " Medical_History_38 Medical_History_39 Medical_History_40 Medical_History_41 \\\n",
+ "0 2 1 3 3 \n",
+ "1 2 1 3 3 \n",
+ "2 2 1 3 3 \n",
+ "3 2 1 3 3 \n",
+ "4 2 1 3 3 \n",
+ "\n",
+ " Medical_Keyword_1 Medical_Keyword_2 Medical_Keyword_3 Medical_Keyword_4 \\\n",
+ "0 3 0 0 0 \n",
+ "1 1 0 0 0 \n",
+ "2 1 0 0 0 \n",
+ "3 1 0 0 0 \n",
+ "4 1 0 0 0 \n",
+ "\n",
+ " Medical_Keyword_5 Medical_Keyword_6 Medical_Keyword_7 Medical_Keyword_8 \\\n",
+ "0 0 0 0 0 \n",
+ "1 0 0 0 0 \n",
+ "2 0 0 0 0 \n",
+ "3 0 0 0 0 \n",
+ "4 0 0 0 0 \n",
+ "\n",
+ " Medical_Keyword_9 Medical_Keyword_10 Medical_Keyword_11 Medical_Keyword_12 \\\n",
+ "0 0 0 0 0 \n",
+ "1 0 0 0 0 \n",
+ "2 0 0 0 0 \n",
+ "3 0 0 0 0 \n",
+ "4 0 0 0 0 \n",
+ "\n",
+ " Medical_Keyword_13 Medical_Keyword_14 Medical_Keyword_15 Medical_Keyword_16 \\\n",
+ "0 0 0 0 0 \n",
+ "1 0 0 0 0 \n",
+ "2 0 0 0 0 \n",
+ "3 0 0 0 0 \n",
+ "4 0 0 0 0 \n",
+ "\n",
+ " Medical_Keyword_17 Medical_Keyword_18 Medical_Keyword_19 Medical_Keyword_20 \\\n",
+ "0 0 0 0 0 \n",
+ "1 0 0 0 0 \n",
+ "2 0 0 0 0 \n",
+ "3 0 0 0 0 \n",
+ "4 0 0 0 0 \n",
+ "\n",
+ " Medical_Keyword_21 Medical_Keyword_22 Medical_Keyword_23 Medical_Keyword_24 \\\n",
+ "0 0 0 0 0 \n",
+ "1 0 0 0 0 \n",
+ "2 0 0 0 0 \n",
+ "3 0 0 0 0 \n",
+ "4 0 0 0 0 \n",
+ "\n",
+ " Medical_Keyword_25 Medical_Keyword_26 Medical_Keyword_27 Medical_Keyword_28 \\\n",
+ "0 0 0 0 0 \n",
+ "1 0 0 0 0 \n",
+ "2 0 0 0 0 \n",
+ "3 0 0 0 0 \n",
+ "4 0 0 0 0 \n",
+ "\n",
+ " Medical_Keyword_29 Medical_Keyword_30 Medical_Keyword_31 Medical_Keyword_32 \\\n",
+ "0 0 0 0 0 \n",
+ "1 0 0 0 0 \n",
+ "2 0 0 0 0 \n",
+ "3 0 0 0 0 \n",
+ "4 0 0 0 0 \n",
+ "\n",
+ " Medical_Keyword_33 Medical_Keyword_34 Medical_Keyword_35 Medical_Keyword_36 \\\n",
+ "0 0 0 0 0 \n",
+ "1 0 0 0 0 \n",
+ "2 0 0 0 0 \n",
+ "3 1 0 0 0 \n",
+ "4 0 0 0 0 \n",
+ "\n",
+ " Medical_Keyword_37 Medical_Keyword_38 Medical_Keyword_39 Medical_Keyword_40 \\\n",
+ "0 0 0 0 0 \n",
+ "1 0 0 0 0 \n",
+ "2 0 0 0 0 \n",
+ "3 0 0 0 0 \n",
+ "4 0 0 0 0 \n",
+ "\n",
+ " Medical_Keyword_41 Medical_Keyword_42 Medical_Keyword_43 Medical_Keyword_44 \\\n",
+ "0 0 0 0 0 \n",
+ "1 0 0 0 0 \n",
+ "2 0 0 0 0 \n",
+ "3 0 0 0 0 \n",
+ "4 0 0 0 0 \n",
+ "\n",
+ " Medical_Keyword_45 Medical_Keyword_46 Medical_Keyword_47 Medical_Keyword_48 \\\n",
+ "0 0 0 0 0 \n",
+ "1 0 0 0 0 \n",
+ "2 0 0 0 0 \n",
+ "3 0 0 0 0 \n",
+ "4 0 0 0 0 \n",
+ "\n",
+ " Product_Info_2_char Product_Info_2_num \n",
+ "0 0 3 \n",
+ "1 0 1 \n",
+ "2 0 1 \n",
+ "3 0 4 \n",
+ "4 0 2 "
+ ]
+ },
+ "execution_count": 33,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# PREVIEW THE FIRST ROWS OF THE TREATED DATA\n",
+ "x_train.head(n=5)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "ce8d1ae4-449a-4021-875c-208364ccb828",
+ "metadata": {},
+ "source": [
+ "# Balanceamento das Classes"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "1d9ae14e-bc06-46e0-913c-de1aeeb5650d",
+ "metadata": {},
+ "source": [
+ "-> SMOTE cria dados sintéticos para as classes minoritárias dentro da faixa de variação das amostras já existentes (por interpolação entre vizinhos próximos), fazendo assim o balanceamento igualitário da distribuição de dados para cada classe alvo.\n",
+ "\n",
+ "-> SMOTE + ENN é uma técnica híbrida: após a sobre-amostragem feita pelo SMOTE, o ENN (Edited Nearest Neighbours) aplica uma sub-amostragem de limpeza, na qual os vizinhos mais próximos de cada instância da classe majoritária são estimados. Caso os vizinhos mais próximos classifiquem incorretamente essa instância \n",
+ " particular da classe majoritária, então essa instância é excluída.\n",
+ " O resultado do balanceamento ocorre de forma inversa à distribuição original da entrada de dados. \n",
+ " Considerando que o modelo receberá mais dados de uma determinada classe, o balanceamento gera menos valores da mesma, visto que o modelo tende a prever com mais frequência\n",
+ " as classes que mais foram analisadas no treinamento. Ou seja, se treinarmos com mais dados as classes que naturalmente têm uma quantidade menor, o modelo não terá dificuldade para prevê-las em operação."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 34,
+ "id": "22f1072a-1f13-409e-bc10-80357f8fd9de",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# NUMBER OF SAMPLES PER CLASS\n",
+ "counter = Counter()\n",
+ "counter.update(y_train)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 35,
+ "id": "cc3b0cc7-4d96-4223-b1af-150ec775537c",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# BUILD THE SMOTEENN RESAMPLER; THE FIXED SEED MAKES THE RESAMPLING REPRODUCIBLE ACROSS RUNS\n",
+ "smote_enn = SMOTEENN(sampling_strategy='all', random_state=42)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 36,
+ "id": "96ccb080-372d-45fc-b4ce-282df78bd35b",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# FIT THE RESAMPLER AND APPLY THE CLASS BALANCING\n",
+ "x_balanced, y_balanced = smote_enn.fit_resample(X=x_train, y=y_train)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 37,
+ "id": "00317208-ecb5-4635-9d58-513f2dc74a19",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "# PLOT THE NUMBER OF SAMPLES PER CLASS AFTER BALANCING\n",
+ "\n",
+ "fig, ax = plt.subplots(figsize=(15, 6))\n",
+ "sns.countplot(x=y_balanced, palette='OrRd', ax=ax)\n",
+ "ax.set_frame_on(False)\n",
+ "ax.set_xlabel('Classes', fontsize=11)\n",
+ "ax.set_ylabel('Quantidade', fontsize=11)\n",
+ "ax.set_title('Contagem de Classes\\n')\n",
+ "plt.show()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 38,
+ "id": "09859742-0717-4095-a873-d0a67c682130",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Shape dos dados de treino antes do balanceamento -> atributos:(59381, 122) label:(59381,)\n",
+ "Shape dos dados de treino depois do balanceamento -> atributos:(98269, 122) label:(98269,)\n"
+ ]
+ }
+ ],
+ "source": [
+ "# SHOW HOW THE BALANCING CHANGED THE SIZE OF THE DATA\n",
+ "\n",
+ "print(f'Shape dos dados de treino antes do balanceamento -> atributos:{x_train.shape} label:{y_train.shape}')\n",
+ "print(f'Shape dos dados de treino depois do balanceamento -> atributos:{x_balanced.shape} label:{y_balanced.shape}')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "e8ba5b70-fe92-47b9-9af4-568709ac17db",
+ "metadata": {},
+ "source": [
+ "## Salvando dados tratados em um csv"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 55,
+ "id": "66cdc414-6ec4-40a9-b7cc-4775eec347e6",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# LABEL THE RESAMPLED FEATURES WITH THE COLUMNS OF x_train (THE FRAME THAT WAS RESAMPLED) RATHER THAN THOSE OF x:\n",
+ "# THE ColumnTransformer OUTPUT IS NOT IN THE COLUMN ORDER OF x, SO x.columns IS NOT A RELIABLE SOURCE OF LABELS.\n",
+ "train_data = pd.DataFrame(x_balanced, columns=x_train.columns.tolist())\n",
+ "# APPEND THE RESAMPLED TARGET AS THE 'Response' COLUMN\n",
+ "train_data['Response'] = y_balanced"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 56,
+ "id": "85b0a3ca-fadb-432d-907b-163683327e67",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Product_Info_1 | \n",
+ " Product_Info_3 | \n",
+ " Product_Info_4 | \n",
+ " Product_Info_5 | \n",
+ " Product_Info_6 | \n",
+ " Product_Info_7 | \n",
+ " Ins_Age | \n",
+ " Ht | \n",
+ " Wt | \n",
+ " BMI | \n",
+ " Employment_Info_1 | \n",
+ " Employment_Info_2 | \n",
+ " Employment_Info_3 | \n",
+ " Employment_Info_4 | \n",
+ " Employment_Info_5 | \n",
+ " Employment_Info_6 | \n",
+ " InsuredInfo_1 | \n",
+ " InsuredInfo_2 | \n",
+ " InsuredInfo_3 | \n",
+ " InsuredInfo_4 | \n",
+ " InsuredInfo_5 | \n",
+ " InsuredInfo_6 | \n",
+ " InsuredInfo_7 | \n",
+ " Insurance_History_1 | \n",
+ " Insurance_History_2 | \n",
+ " Insurance_History_3 | \n",
+ " Insurance_History_4 | \n",
+ " Insurance_History_5 | \n",
+ " Insurance_History_7 | \n",
+ " Insurance_History_8 | \n",
+ " Insurance_History_9 | \n",
+ " Family_Hist_1 | \n",
+ " Family_Hist_2 | \n",
+ " Family_Hist_3 | \n",
+ " Family_Hist_4 | \n",
+ " Medical_History_1 | \n",
+ " Medical_History_2 | \n",
+ " Medical_History_3 | \n",
+ " Medical_History_4 | \n",
+ " Medical_History_5 | \n",
+ " Medical_History_6 | \n",
+ " Medical_History_7 | \n",
+ " Medical_History_8 | \n",
+ " Medical_History_9 | \n",
+ " Medical_History_11 | \n",
+ " Medical_History_12 | \n",
+ " Medical_History_13 | \n",
+ " Medical_History_14 | \n",
+ " Medical_History_16 | \n",
+ " Medical_History_17 | \n",
+ " Medical_History_18 | \n",
+ " Medical_History_19 | \n",
+ " Medical_History_20 | \n",
+ " Medical_History_21 | \n",
+ " Medical_History_22 | \n",
+ " Medical_History_23 | \n",
+ " Medical_History_25 | \n",
+ " Medical_History_26 | \n",
+ " Medical_History_27 | \n",
+ " Medical_History_28 | \n",
+ " Medical_History_29 | \n",
+ " Medical_History_30 | \n",
+ " Medical_History_31 | \n",
+ " Medical_History_33 | \n",
+ " Medical_History_34 | \n",
+ " Medical_History_35 | \n",
+ " Medical_History_36 | \n",
+ " Medical_History_37 | \n",
+ " Medical_History_38 | \n",
+ " Medical_History_39 | \n",
+ " Medical_History_40 | \n",
+ " Medical_History_41 | \n",
+ " Medical_Keyword_1 | \n",
+ " Medical_Keyword_2 | \n",
+ " Medical_Keyword_3 | \n",
+ " Medical_Keyword_4 | \n",
+ " Medical_Keyword_5 | \n",
+ " Medical_Keyword_6 | \n",
+ " Medical_Keyword_7 | \n",
+ " Medical_Keyword_8 | \n",
+ " Medical_Keyword_9 | \n",
+ " Medical_Keyword_10 | \n",
+ " Medical_Keyword_11 | \n",
+ " Medical_Keyword_12 | \n",
+ " Medical_Keyword_13 | \n",
+ " Medical_Keyword_14 | \n",
+ " Medical_Keyword_15 | \n",
+ " Medical_Keyword_16 | \n",
+ " Medical_Keyword_17 | \n",
+ " Medical_Keyword_18 | \n",
+ " Medical_Keyword_19 | \n",
+ " Medical_Keyword_20 | \n",
+ " Medical_Keyword_21 | \n",
+ " Medical_Keyword_22 | \n",
+ " Medical_Keyword_23 | \n",
+ " Medical_Keyword_24 | \n",
+ " Medical_Keyword_25 | \n",
+ " Medical_Keyword_26 | \n",
+ " Medical_Keyword_27 | \n",
+ " Medical_Keyword_28 | \n",
+ " Medical_Keyword_29 | \n",
+ " Medical_Keyword_30 | \n",
+ " Medical_Keyword_31 | \n",
+ " Medical_Keyword_32 | \n",
+ " Medical_Keyword_33 | \n",
+ " Medical_Keyword_34 | \n",
+ " Medical_Keyword_35 | \n",
+ " Medical_Keyword_36 | \n",
+ " Medical_Keyword_37 | \n",
+ " Medical_Keyword_38 | \n",
+ " Medical_Keyword_39 | \n",
+ " Medical_Keyword_40 | \n",
+ " Medical_Keyword_41 | \n",
+ " Medical_Keyword_42 | \n",
+ " Medical_Keyword_43 | \n",
+ " Medical_Keyword_44 | \n",
+ " Medical_Keyword_45 | \n",
+ " Medical_Keyword_46 | \n",
+ " Medical_Keyword_47 | \n",
+ " Medical_Keyword_48 | \n",
+ " Product_Info_2_char | \n",
+ " Product_Info_2_num | \n",
+ " Response | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 0.497737 | \n",
+ " 0.797101 | \n",
+ " 0.000667 | \n",
+ " 0.44489 | \n",
+ " 0.05 | \n",
+ " 7.962172 | \n",
+ " 0.0 | \n",
+ " 0.025 | \n",
+ " 4.0 | \n",
+ " 1.0 | \n",
+ " 21.0 | \n",
+ " 0.076923 | \n",
+ " 2.0 | \n",
+ " 3.0 | \n",
+ " 1.0 | \n",
+ " 0.552239 | \n",
+ " 0.6 | \n",
+ " 0.284519 | \n",
+ " 0.587796 | \n",
+ " 1.0 | \n",
+ " 3.0 | \n",
+ " 3.0 | \n",
+ " 2.0 | \n",
+ " 2.0 | \n",
+ " 3.0 | \n",
+ " 3.0 | \n",
+ " 1.0 | \n",
+ " 2.0 | \n",
+ " 1.0 | \n",
+ " 2.0 | \n",
+ " 1.0 | \n",
+ " 3.0 | \n",
+ " 1.0 | \n",
+ " 1.0 | \n",
+ " 3.0 | \n",
+ " 2.0 | \n",
+ " 3.0 | \n",
+ " 162.0 | \n",
+ " 2.0 | \n",
+ " 2.0 | \n",
+ " 1.0 | \n",
+ " 3.0 | \n",
+ " 2.0 | \n",
+ " 2.0 | \n",
+ " 2.0 | \n",
+ " 3.0 | \n",
+ " 2.0 | \n",
+ " 3.0 | \n",
+ " 3.0 | \n",
+ " 3.0 | \n",
+ " 3.0 | \n",
+ " 1.0 | \n",
+ " 1.0 | \n",
+ " 2.0 | \n",
+ " 1.0 | \n",
+ " 2.0 | \n",
+ " 1.0 | \n",
+ " 1.0 | \n",
+ " 3.0 | \n",
+ " 3.0 | \n",
+ " 1.0 | \n",
+ " 3.0 | \n",
+ " 2.0 | \n",
+ " 3.0 | \n",
+ " 3.0 | \n",
+ " 3.0 | \n",
+ " 1.0 | \n",
+ " 2.0 | \n",
+ " 2.0 | \n",
+ " 1.0 | \n",
+ " 3.0 | \n",
+ " 3.0 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 0.5 | \n",
+ " 0.47455 | \n",
+ " 0.001733 | \n",
+ " 0.661972 | \n",
+ " 0.05 | \n",
+ " 6.0 | \n",
+ " 0.0 | \n",
+ " 0.1 | \n",
+ " 3.0 | \n",
+ " 1.0 | \n",
+ " 26.0 | \n",
+ " 0.230769 | \n",
+ " 2.0 | \n",
+ " 3.0 | \n",
+ " 1.0 | \n",
+ " 0.447761 | \n",
+ " 0.727273 | \n",
+ " 0.330544 | \n",
+ " 0.51639 | \n",
+ " 9.0 | \n",
+ " 1.0 | \n",
+ " 2.0 | \n",
+ " 1.0 | \n",
+ " 2.0 | \n",
+ " 6.0 | \n",
+ " 3.0 | \n",
+ " 1.0 | \n",
+ " 1.0 | \n",
+ " 1.0 | \n",
+ " 2.0 | \n",
+ " 1.0 | \n",
+ " 1.0 | \n",
+ " 3.0 | \n",
+ " 3.0 | \n",
+ " 2.0 | \n",
+ " 3.0 | \n",
+ " 3.0 | \n",
+ " 112.0 | \n",
+ " 2.0 | \n",
+ " 2.0 | \n",
+ " 1.0 | \n",
+ " 3.0 | \n",
+ " 2.0 | \n",
+ " 2.0 | \n",
+ " 2.0 | \n",
+ " 3.0 | \n",
+ " 2.0 | \n",
+ " 3.0 | \n",
+ " 3.0 | \n",
+ " 1.0 | \n",
+ " 3.0 | \n",
+ " 1.0 | \n",
+ " 1.0 | \n",
+ " 2.0 | \n",
+ " 1.0 | \n",
+ " 2.0 | \n",
+ " 3.0 | \n",
+ " 1.0 | \n",
+ " 3.0 | \n",
+ " 3.0 | \n",
+ " 1.0 | \n",
+ " 3.0 | \n",
+ " 2.0 | \n",
+ " 3.0 | \n",
+ " 3.0 | \n",
+ " 3.0 | \n",
+ " 1.0 | \n",
+ " 2.0 | \n",
+ " 2.0 | \n",
+ " 1.0 | \n",
+ " 3.0 | \n",
+ " 3.0 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 3.0 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 0.578431 | \n",
+ " 0.47455 | \n",
+ " 0.000667 | \n",
+ " 0.44489 | \n",
+ " 1.0 | \n",
+ " 7.962172 | \n",
+ " 0.006283 | \n",
+ " 0.15 | \n",
+ " 3.0 | \n",
+ " 1.0 | \n",
+ " 26.0 | \n",
+ " 0.076923 | \n",
+ " 2.0 | \n",
+ " 3.0 | \n",
+ " 1.0 | \n",
+ " 0.791045 | \n",
+ " 0.818182 | \n",
+ " 0.560669 | \n",
+ " 0.758997 | \n",
+ " 12.0 | \n",
+ " 1.0 | \n",
+ " 2.0 | \n",
+ " 1.0 | \n",
+ " 2.0 | \n",
+ " 6.0 | \n",
+ " 3.0 | \n",
+ " 1.0 | \n",
+ " 1.0 | \n",
+ " 1.0 | \n",
+ " 2.0 | \n",
+ " 1.0 | \n",
+ " 3.0 | \n",
+ " 1.0 | \n",
+ " 1.0 | \n",
+ " 3.0 | \n",
+ " 2.0 | \n",
+ " 3.0 | \n",
+ " 162.0 | \n",
+ " 2.0 | \n",
+ " 2.0 | \n",
+ " 1.0 | \n",
+ " 3.0 | \n",
+ " 2.0 | \n",
+ " 2.0 | \n",
+ " 2.0 | \n",
+ " 3.0 | \n",
+ " 2.0 | \n",
+ " 3.0 | \n",
+ " 3.0 | \n",
+ " 1.0 | \n",
+ " 3.0 | \n",
+ " 1.0 | \n",
+ " 1.0 | \n",
+ " 2.0 | \n",
+ " 1.0 | \n",
+ " 2.0 | \n",
+ " 3.0 | \n",
+ " 1.0 | \n",
+ " 3.0 | \n",
+ " 3.0 | \n",
+ " 1.0 | \n",
+ " 1.0 | \n",
+ " 2.0 | \n",
+ " 3.0 | \n",
+ " 3.0 | \n",
+ " 3.0 | \n",
+ " 1.0 | \n",
+ " 2.0 | \n",
+ " 2.0 | \n",
+ " 1.0 | \n",
+ " 3.0 | \n",
+ " 3.0 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 0.431373 | \n",
+ " 0.47455 | \n",
+ " 0.001733 | \n",
+ " 0.44489 | \n",
+ " 0.361469 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " 0.02 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 26.0 | \n",
+ " 0.025641 | \n",
+ " 2.0 | \n",
+ " 1.0 | \n",
+ " 1.0 | \n",
+ " 0.492537 | \n",
+ " 0.745455 | \n",
+ " 0.320084 | \n",
+ " 0.479639 | \n",
+ " 9.0 | \n",
+ " 1.0 | \n",
+ " 2.0 | \n",
+ " 2.0 | \n",
+ " 2.0 | \n",
+ " 8.0 | \n",
+ " 3.0 | \n",
+ " 1.0 | \n",
+ " 1.0 | \n",
+ " 1.0 | \n",
+ " 2.0 | \n",
+ " 1.0 | \n",
+ " 1.0 | \n",
+ " 3.0 | \n",
+ " 3.0 | \n",
+ " 2.0 | \n",
+ " 3.0 | \n",
+ " 3.0 | \n",
+ " 373.0 | \n",
+ " 2.0 | \n",
+ " 2.0 | \n",
+ " 1.0 | \n",
+ " 3.0 | \n",
+ " 2.0 | \n",
+ " 2.0 | \n",
+ " 2.0 | \n",
+ " 3.0 | \n",
+ " 2.0 | \n",
+ " 3.0 | \n",
+ " 3.0 | \n",
+ " 1.0 | \n",
+ " 3.0 | \n",
+ " 1.0 | \n",
+ " 1.0 | \n",
+ " 2.0 | \n",
+ " 1.0 | \n",
+ " 2.0 | \n",
+ " 3.0 | \n",
+ " 2.0 | \n",
+ " 2.0 | \n",
+ " 3.0 | \n",
+ " 1.0 | \n",
+ " 3.0 | \n",
+ " 2.0 | \n",
+ " 3.0 | \n",
+ " 3.0 | \n",
+ " 3.0 | \n",
+ " 1.0 | \n",
+ " 3.0 | \n",
+ " 2.0 | \n",
+ " 1.0 | \n",
+ " 1.0 | \n",
+ " 3.0 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 8.0 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 0.497737 | \n",
+ " 0.681159 | \n",
+ " 0.000167 | \n",
+ " 0.647887 | \n",
+ " 1.0 | \n",
+ " 6.0 | \n",
+ " 0.0 | \n",
+ " 0.09 | \n",
+ " 3.0 | \n",
+ " 1.0 | \n",
+ " 26.0 | \n",
+ " 0.487179 | \n",
+ " 2.0 | \n",
+ " 3.0 | \n",
+ " 1.0 | \n",
+ " 0.567164 | \n",
+ " 0.672727 | \n",
+ " 0.288703 | \n",
+ " 0.504999 | \n",
+ " 3.0 | \n",
+ " 1.0 | \n",
+ " 2.0 | \n",
+ " 1.0 | \n",
+ " 2.0 | \n",
+ " 8.0 | \n",
+ " 3.0 | \n",
+ " 1.0 | \n",
+ " 1.0 | \n",
+ " 1.0 | \n",
+ " 2.0 | \n",
+ " 3.0 | \n",
+ " 3.0 | \n",
+ " 1.0 | \n",
+ " 1.0 | \n",
+ " 3.0 | \n",
+ " 2.0 | \n",
+ " 3.0 | \n",
+ " 112.0 | \n",
+ " 2.0 | \n",
+ " 1.0 | \n",
+ " 1.0 | \n",
+ " 3.0 | \n",
+ " 2.0 | \n",
+ " 2.0 | \n",
+ " 2.0 | \n",
+ " 3.0 | \n",
+ " 2.0 | \n",
+ " 3.0 | \n",
+ " 3.0 | \n",
+ " 1.0 | \n",
+ " 3.0 | \n",
+ " 1.0 | \n",
+ " 1.0 | \n",
+ " 2.0 | \n",
+ " 2.0 | \n",
+ " 2.0 | \n",
+ " 3.0 | \n",
+ " 1.0 | \n",
+ " 3.0 | \n",
+ " 3.0 | \n",
+ " 1.0 | \n",
+ " 3.0 | \n",
+ " 2.0 | \n",
+ " 3.0 | \n",
+ " 3.0 | \n",
+ " 3.0 | \n",
+ " 1.0 | \n",
+ " 2.0 | \n",
+ " 2.0 | \n",
+ " 1.0 | \n",
+ " 3.0 | \n",
+ " 1.0 | \n",
+ " 3.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 3.0 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Product_Info_1 Product_Info_3 Product_Info_4 Product_Info_5 Product_Info_6 \\\n",
+ "0 0.497737 0.797101 0.000667 0.44489 0.05 \n",
+ "1 0.5 0.47455 0.001733 0.661972 0.05 \n",
+ "2 0.578431 0.47455 0.000667 0.44489 1.0 \n",
+ "3 0.431373 0.47455 0.001733 0.44489 0.361469 \n",
+ "4 0.497737 0.681159 0.000167 0.647887 1.0 \n",
+ "\n",
+ " Product_Info_7 Ins_Age Ht Wt BMI Employment_Info_1 \\\n",
+ "0 7.962172 0.0 0.025 4.0 1.0 21.0 \n",
+ "1 6.0 0.0 0.1 3.0 1.0 26.0 \n",
+ "2 7.962172 0.006283 0.15 3.0 1.0 26.0 \n",
+ "3 1.0 0.0 0.02 0.0 1.0 26.0 \n",
+ "4 6.0 0.0 0.09 3.0 1.0 26.0 \n",
+ "\n",
+ " Employment_Info_2 Employment_Info_3 Employment_Info_4 Employment_Info_5 \\\n",
+ "0 0.076923 2.0 3.0 1.0 \n",
+ "1 0.230769 2.0 3.0 1.0 \n",
+ "2 0.076923 2.0 3.0 1.0 \n",
+ "3 0.025641 2.0 1.0 1.0 \n",
+ "4 0.487179 2.0 3.0 1.0 \n",
+ "\n",
+ " Employment_Info_6 InsuredInfo_1 InsuredInfo_2 InsuredInfo_3 InsuredInfo_4 \\\n",
+ "0 0.552239 0.6 0.284519 0.587796 1.0 \n",
+ "1 0.447761 0.727273 0.330544 0.51639 9.0 \n",
+ "2 0.791045 0.818182 0.560669 0.758997 12.0 \n",
+ "3 0.492537 0.745455 0.320084 0.479639 9.0 \n",
+ "4 0.567164 0.672727 0.288703 0.504999 3.0 \n",
+ "\n",
+ " InsuredInfo_5 InsuredInfo_6 InsuredInfo_7 Insurance_History_1 \\\n",
+ "0 3.0 3.0 2.0 2.0 \n",
+ "1 1.0 2.0 1.0 2.0 \n",
+ "2 1.0 2.0 1.0 2.0 \n",
+ "3 1.0 2.0 2.0 2.0 \n",
+ "4 1.0 2.0 1.0 2.0 \n",
+ "\n",
+ " Insurance_History_2 Insurance_History_3 Insurance_History_4 \\\n",
+ "0 3.0 3.0 1.0 \n",
+ "1 6.0 3.0 1.0 \n",
+ "2 6.0 3.0 1.0 \n",
+ "3 8.0 3.0 1.0 \n",
+ "4 8.0 3.0 1.0 \n",
+ "\n",
+ " Insurance_History_5 Insurance_History_7 Insurance_History_8 \\\n",
+ "0 2.0 1.0 2.0 \n",
+ "1 1.0 1.0 2.0 \n",
+ "2 1.0 1.0 2.0 \n",
+ "3 1.0 1.0 2.0 \n",
+ "4 1.0 1.0 2.0 \n",
+ "\n",
+ " Insurance_History_9 Family_Hist_1 Family_Hist_2 Family_Hist_3 Family_Hist_4 \\\n",
+ "0 1.0 3.0 1.0 1.0 3.0 \n",
+ "1 1.0 1.0 3.0 3.0 2.0 \n",
+ "2 1.0 3.0 1.0 1.0 3.0 \n",
+ "3 1.0 1.0 3.0 3.0 2.0 \n",
+ "4 3.0 3.0 1.0 1.0 3.0 \n",
+ "\n",
+ " Medical_History_1 Medical_History_2 Medical_History_3 Medical_History_4 \\\n",
+ "0 2.0 3.0 162.0 2.0 \n",
+ "1 3.0 3.0 112.0 2.0 \n",
+ "2 2.0 3.0 162.0 2.0 \n",
+ "3 3.0 3.0 373.0 2.0 \n",
+ "4 2.0 3.0 112.0 2.0 \n",
+ "\n",
+ " Medical_History_5 Medical_History_6 Medical_History_7 Medical_History_8 \\\n",
+ "0 2.0 1.0 3.0 2.0 \n",
+ "1 2.0 1.0 3.0 2.0 \n",
+ "2 2.0 1.0 3.0 2.0 \n",
+ "3 2.0 1.0 3.0 2.0 \n",
+ "4 1.0 1.0 3.0 2.0 \n",
+ "\n",
+ " Medical_History_9 Medical_History_11 Medical_History_12 Medical_History_13 \\\n",
+ "0 2.0 2.0 3.0 2.0 \n",
+ "1 2.0 2.0 3.0 2.0 \n",
+ "2 2.0 2.0 3.0 2.0 \n",
+ "3 2.0 2.0 3.0 2.0 \n",
+ "4 2.0 2.0 3.0 2.0 \n",
+ "\n",
+ " Medical_History_14 Medical_History_16 Medical_History_17 Medical_History_18 \\\n",
+ "0 3.0 3.0 3.0 3.0 \n",
+ "1 3.0 3.0 1.0 3.0 \n",
+ "2 3.0 3.0 1.0 3.0 \n",
+ "3 3.0 3.0 1.0 3.0 \n",
+ "4 3.0 3.0 1.0 3.0 \n",
+ "\n",
+ " Medical_History_19 Medical_History_20 Medical_History_21 Medical_History_22 \\\n",
+ "0 1.0 1.0 2.0 1.0 \n",
+ "1 1.0 1.0 2.0 1.0 \n",
+ "2 1.0 1.0 2.0 1.0 \n",
+ "3 1.0 1.0 2.0 1.0 \n",
+ "4 1.0 1.0 2.0 2.0 \n",
+ "\n",
+ " Medical_History_23 Medical_History_25 Medical_History_26 Medical_History_27 \\\n",
+ "0 2.0 1.0 1.0 3.0 \n",
+ "1 2.0 3.0 1.0 3.0 \n",
+ "2 2.0 3.0 1.0 3.0 \n",
+ "3 2.0 3.0 2.0 2.0 \n",
+ "4 2.0 3.0 1.0 3.0 \n",
+ "\n",
+ " Medical_History_28 Medical_History_29 Medical_History_30 Medical_History_31 \\\n",
+ "0 3.0 1.0 3.0 2.0 \n",
+ "1 3.0 1.0 3.0 2.0 \n",
+ "2 3.0 1.0 1.0 2.0 \n",
+ "3 3.0 1.0 3.0 2.0 \n",
+ "4 3.0 1.0 3.0 2.0 \n",
+ "\n",
+ " Medical_History_33 Medical_History_34 Medical_History_35 Medical_History_36 \\\n",
+ "0 3.0 3.0 3.0 1.0 \n",
+ "1 3.0 3.0 3.0 1.0 \n",
+ "2 3.0 3.0 3.0 1.0 \n",
+ "3 3.0 3.0 3.0 1.0 \n",
+ "4 3.0 3.0 3.0 1.0 \n",
+ "\n",
+ " Medical_History_37 Medical_History_38 Medical_History_39 Medical_History_40 \\\n",
+ "0 2.0 2.0 1.0 3.0 \n",
+ "1 2.0 2.0 1.0 3.0 \n",
+ "2 2.0 2.0 1.0 3.0 \n",
+ "3 3.0 2.0 1.0 1.0 \n",
+ "4 2.0 2.0 1.0 3.0 \n",
+ "\n",
+ " Medical_History_41 Medical_Keyword_1 Medical_Keyword_2 Medical_Keyword_3 \\\n",
+ "0 3.0 1.0 0.0 0.0 \n",
+ "1 3.0 1.0 0.0 0.0 \n",
+ "2 3.0 1.0 0.0 0.0 \n",
+ "3 3.0 1.0 0.0 0.0 \n",
+ "4 1.0 3.0 0.0 0.0 \n",
+ "\n",
+ " Medical_Keyword_4 Medical_Keyword_5 Medical_Keyword_6 Medical_Keyword_7 \\\n",
+ "0 0.0 0.0 0.0 0.0 \n",
+ "1 0.0 0.0 0.0 0.0 \n",
+ "2 0.0 0.0 0.0 0.0 \n",
+ "3 1.0 0.0 0.0 0.0 \n",
+ "4 0.0 0.0 0.0 0.0 \n",
+ "\n",
+ " Medical_Keyword_8 Medical_Keyword_9 Medical_Keyword_10 Medical_Keyword_11 \\\n",
+ "0 0.0 0.0 0.0 0.0 \n",
+ "1 0.0 0.0 0.0 0.0 \n",
+ "2 0.0 0.0 0.0 0.0 \n",
+ "3 0.0 0.0 0.0 0.0 \n",
+ "4 0.0 0.0 0.0 0.0 \n",
+ "\n",
+ " Medical_Keyword_12 Medical_Keyword_13 Medical_Keyword_14 Medical_Keyword_15 \\\n",
+ "0 0.0 0.0 0.0 0.0 \n",
+ "1 0.0 0.0 0.0 0.0 \n",
+ "2 0.0 0.0 0.0 0.0 \n",
+ "3 0.0 0.0 0.0 0.0 \n",
+ "4 0.0 0.0 1.0 0.0 \n",
+ "\n",
+ " Medical_Keyword_16 Medical_Keyword_17 Medical_Keyword_18 Medical_Keyword_19 \\\n",
+ "0 1.0 0.0 0.0 0.0 \n",
+ "1 0.0 0.0 0.0 0.0 \n",
+ "2 0.0 0.0 0.0 0.0 \n",
+ "3 0.0 0.0 0.0 0.0 \n",
+ "4 0.0 0.0 0.0 0.0 \n",
+ "\n",
+ " Medical_Keyword_20 Medical_Keyword_21 Medical_Keyword_22 Medical_Keyword_23 \\\n",
+ "0 0.0 0.0 0.0 0.0 \n",
+ "1 0.0 0.0 0.0 0.0 \n",
+ "2 0.0 0.0 0.0 0.0 \n",
+ "3 0.0 0.0 0.0 0.0 \n",
+ "4 0.0 0.0 0.0 0.0 \n",
+ "\n",
+ " Medical_Keyword_24 Medical_Keyword_25 Medical_Keyword_26 Medical_Keyword_27 \\\n",
+ "0 0.0 0.0 1.0 0.0 \n",
+ "1 0.0 0.0 0.0 0.0 \n",
+ "2 0.0 0.0 0.0 0.0 \n",
+ "3 0.0 0.0 0.0 0.0 \n",
+ "4 0.0 0.0 0.0 0.0 \n",
+ "\n",
+ " Medical_Keyword_28 Medical_Keyword_29 Medical_Keyword_30 Medical_Keyword_31 \\\n",
+ "0 0.0 0.0 0.0 0.0 \n",
+ "1 0.0 0.0 0.0 0.0 \n",
+ "2 0.0 0.0 0.0 0.0 \n",
+ "3 0.0 0.0 0.0 0.0 \n",
+ "4 0.0 0.0 0.0 0.0 \n",
+ "\n",
+ " Medical_Keyword_32 Medical_Keyword_33 Medical_Keyword_34 Medical_Keyword_35 \\\n",
+ "0 0.0 0.0 0.0 0.0 \n",
+ "1 0.0 0.0 0.0 0.0 \n",
+ "2 0.0 0.0 0.0 0.0 \n",
+ "3 0.0 0.0 0.0 0.0 \n",
+ "4 0.0 0.0 0.0 0.0 \n",
+ "\n",
+ " Medical_Keyword_36 Medical_Keyword_37 Medical_Keyword_38 Medical_Keyword_39 \\\n",
+ "0 0.0 0.0 0.0 0.0 \n",
+ "1 0.0 0.0 0.0 0.0 \n",
+ "2 0.0 0.0 0.0 0.0 \n",
+ "3 0.0 0.0 0.0 0.0 \n",
+ "4 0.0 0.0 0.0 0.0 \n",
+ "\n",
+ " Medical_Keyword_40 Medical_Keyword_41 Medical_Keyword_42 Medical_Keyword_43 \\\n",
+ "0 0.0 0.0 0.0 0.0 \n",
+ "1 0.0 0.0 0.0 0.0 \n",
+ "2 0.0 0.0 0.0 0.0 \n",
+ "3 0.0 0.0 0.0 0.0 \n",
+ "4 0.0 0.0 0.0 0.0 \n",
+ "\n",
+ " Medical_Keyword_44 Medical_Keyword_45 Medical_Keyword_46 Medical_Keyword_47 \\\n",
+ "0 0.0 0.0 0.0 0.0 \n",
+ "1 0.0 0.0 0.0 0.0 \n",
+ "2 0.0 0.0 0.0 0.0 \n",
+ "3 0.0 0.0 0.0 0.0 \n",
+ "4 0.0 0.0 0.0 0.0 \n",
+ "\n",
+ " Medical_Keyword_48 Product_Info_2_char Product_Info_2_num Response \n",
+ "0 0.0 0.0 1.0 1 \n",
+ "1 0.0 0.0 3.0 1 \n",
+ "2 0.0 0.0 1.0 1 \n",
+ "3 0.0 0.0 8.0 1 \n",
+ "4 0.0 0.0 3.0 1 "
+ ]
+ },
+ "execution_count": 56,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "train_data.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 57,
+ "id": "35f8335f-dcaa-40b7-ba05-885be2c0fa1c",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+    "# Persist the treated training set next to the notebook. The previous target\n",
+    "# was the filesystem root, which is not portable and is normally not writable.\n",
+    "train_data.to_csv(\"train_data_trated.csv\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "4868f0c3-b1c8-4c39-9108-bd5a9abd81d9",
+ "metadata": {},
+ "source": [
+ "# CRIANDO MODELO"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "41bd4800-5e97-4e61-aeb5-aa2f60922e65",
+ "metadata": {},
+ "source": [
+    "## Otimização de Hiperparâmetros"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 53,
+ "id": "ddb5a4bb-fb1d-47f1-a936-6bc7af7ff868",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+    "# Hyper-parameter grid explored by the grid search.\n",
+    "# 'auto' is intentionally left out: for RandomForestClassifier it is an alias\n",
+    "# of 'sqrt', so it only duplicated fits, and newer scikit-learn releases\n",
+    "# no longer accept it.\n",
+    "\n",
+    "param = {'min_samples_leaf': [1, 3, 5, 7],\n",
+    "         'max_features': ['sqrt', 'log2']}"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 54,
+ "id": "162464a1-9146-4f27-b312-6be8085094dd",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+    "# CRIANDO OTIMIZADOR DE PARÂMETROS\n",
+ "\n",
+ "grid_search = GridSearchCV(estimator = RandomForestClassifier(n_estimators=50, criterion='gini', random_state=42), \n",
+ " param_grid = param, \n",
+ " cv = 5)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 55,
+ "id": "0a27a2fd-f1f9-4785-a26c-fed4714ad5cb",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# REALIZANDO TREINAMENTO\n",
+ "grid_random_forest = grid_search.fit(x_balanced, y_balanced)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 56,
+ "id": "6a9e3f15-7b17-48e2-8bf6-74692dbd8593",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "RandomForestClassifier(max_features='log2', n_estimators=50, random_state=42)"
+ ]
+ },
+ "execution_count": 56,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# VISUALIZANDO A MELHOR COMBINAÇÃO DE PARÂMETROS\n",
+ "grid_search.best_estimator_"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 57,
+ "id": "966d2ffa-3db8-4709-bd32-8374eb9417b5",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "{'max_features': 'log2', 'min_samples_leaf': 1}"
+ ]
+ },
+ "execution_count": 57,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# VISUALIZANDO A MELHOR COMBINAÇÃO DE PARÂMETROS DENTRO DAS OPÇÕES DADAS\n",
+ "grid_search.best_params_"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 58,
+ "id": "e5792cdb-da31-4147-9f51-03a4969e7623",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0.8440200136456252"
+ ]
+ },
+ "execution_count": 58,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+    "# MELHOR PONTUAÇÃO MÉDIA OBTIDA NA VALIDAÇÃO CRUZADA\n",
+ "grid_search.best_score_"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "a7b008dd-7f35-4ea9-b740-04e309575f36",
+ "metadata": {},
+ "source": [
+ "## Curvas de Validação"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "d9edb4a2-53ad-4b9b-ab63-58f17d6e6ef1",
+ "metadata": {},
+ "source": [
+    "-> A curva de validação serve para verificar como a pontuação do modelo se comporta em função da alteração de algum hiperparâmetro.<br>\n",
+    "-> Neste caso, foi alterado o número de árvores utilizadas pelo algoritmo RandomForestClassifier."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 61,
+ "id": "b748b82d-c2a8-4950-be50-a761d63a55b9",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# CRIANDO ARRAY COM VALORES PARA SEREM TESTADOS ([ 20, 60, 100, 140, 180, 220, 260, 300])\n",
+ "range_estimators = np.arange(20, 301, 40)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 62,
+ "id": "e74b9193-2a5a-4d0a-83c3-fbf051257b73",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+    "# Validation curve: score the forest for every tree count in range_estimators.\n",
+    "# random_state=42 mirrors the seed of the grid-search estimator, so the curve\n",
+    "# can be reproduced after a kernel restart.\n",
+    "\n",
+    "train_scores, test_scores = validation_curve(\n",
+    "    RandomForestClassifier(criterion='gini', max_features='log2',\n",
+    "                           min_samples_leaf=1, random_state=42),\n",
+    "    x_balanced, y_balanced,\n",
+    "    param_name = 'n_estimators',\n",
+    "    param_range = range_estimators,\n",
+    "    cv = 10,\n",
+    "    scoring = 'accuracy')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 63,
+ "id": "120021bf-2489-41b5-b0f1-0697fe5480d8",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+    "# Row-wise (axis=1) mean and standard deviation of the validation-curve\n",
+    "# scores, kept for plotting.\n",
+    "\n",
+    "train_mean, train_std = train_scores.mean(axis=1), train_scores.std(axis=1)\n",
+    "test_mean, test_std = test_scores.mean(axis=1), test_scores.std(axis=1)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 64,
+ "id": "359d1d5e-f351-45b7-acb2-945c27ab0aac",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ "