updated data preprocessing

chhayac · chhayac · commit f126f4bdd2e9 · 2019-07-11T23:21:56.000-07:00
diff --git a/customer_churn_prediction.ipynb b/customer_churn_prediction.ipynb
@@ -21,7 +21,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 1,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -33,7 +33,8 @@
     "%matplotlib inline\n",
     "import warnings\n",
     "warnings.filterwarnings(\"ignore\")\n",
-    "from pylab import rcParams"
+    "from pylab import rcParams\n",
+    "from sklearn.model_selection import train_test_split"
    ]
   },
   {
@@ -71,7 +72,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 2,
    "metadata": {},
    "outputs": [
     {
@@ -276,7 +277,7 @@
        "[5 rows x 21 columns]"
       ]
      },
-     "execution_count": 7,
+     "execution_count": 2,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -289,7 +290,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 3,
    "metadata": {},
    "outputs": [
     {
@@ -298,7 +299,7 @@
        "(7043, 21)"
       ]
      },
-     "execution_count": 8,
+     "execution_count": 3,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -309,7 +310,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": 4,
    "metadata": {},
    "outputs": [
     {
@@ -323,7 +324,7 @@
        "      dtype='object')"
       ]
      },
-     "execution_count": 9,
+     "execution_count": 4,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -334,7 +335,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": 5,
    "metadata": {},
    "outputs": [
     {
@@ -345,7 +346,7 @@
        "Name: Churn, dtype: int64"
       ]
      },
-     "execution_count": 10,
+     "execution_count": 5,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -356,7 +357,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 14,
+   "execution_count": 6,
    "metadata": {},
    "outputs": [
     {
@@ -373,7 +374,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 34,
+   "execution_count": 7,
    "metadata": {},
    "outputs": [
     {
@@ -406,15 +407,174 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "Next step is data preprocessing and feature engineering"
+    "Next step is data preprocessing and feature engineering.\n",
+    "In this step, we preprocess the data, because predictions can only be as good as the data they are built on. We will be doing the following:\n",
+    "* Dropping irrelevant data\n",
+    "* Handling missing data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Dropping irrelevant data\n",
+    "df.drop('customerID', axis=1, inplace=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "False"
+      ]
+     },
+     "execution_count": 9,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Checking whether the dataset contains any missing (null) values\n",
+    "df.isnull().values.any()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "gender              0\n",
+       "SeniorCitizen       0\n",
+       "Partner             0\n",
+       "Dependents          0\n",
+       "tenure              0\n",
+       "PhoneService        0\n",
+       "MultipleLines       0\n",
+       "InternetService     0\n",
+       "OnlineSecurity      0\n",
+       "OnlineBackup        0\n",
+       "DeviceProtection    0\n",
+       "TechSupport         0\n",
+       "StreamingTV         0\n",
+       "StreamingMovies     0\n",
+       "Contract            0\n",
+       "PaperlessBilling    0\n",
+       "PaymentMethod       0\n",
+       "MonthlyCharges      0\n",
+       "TotalCharges        0\n",
+       "Churn               0\n",
+       "dtype: int64"
+      ]
+     },
+     "execution_count": 10,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Missing values per column\n",
+    "df.isnull().sum()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
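+    "No column reports any null values at this point. Note that TotalCharges is still stored as text (object dtype), so entries that are not valid numbers are not counted as missing until the column is converted to a numeric type."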
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "gender               object\n",
+       "SeniorCitizen         int64\n",
+       "Partner              object\n",
+       "Dependents           object\n",
+       "tenure                int64\n",
+       "PhoneService         object\n",
+       "MultipleLines        object\n",
+       "InternetService      object\n",
+       "OnlineSecurity       object\n",
+       "OnlineBackup         object\n",
+       "DeviceProtection     object\n",
+       "TechSupport          object\n",
+       "StreamingTV          object\n",
+       "StreamingMovies      object\n",
+       "Contract             object\n",
+       "PaperlessBilling     object\n",
+       "PaymentMethod        object\n",
+       "MonthlyCharges      float64\n",
+       "TotalCharges         object\n",
+       "Churn                object\n",
+       "dtype: object"
+      ]
+     },
+     "execution_count": 11,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Inspecting column dtypes to find object-type columns that need converting to numerical\n",
+    "df.dtypes"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# TotalCharges is stored as object type but should be numerical; errors='coerce' turns unparseable values into NaN, so re-check for missing values after this step\n",
+    "df['TotalCharges'] = pd.to_numeric(df['TotalCharges'],errors='coerce')"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 13,
    "metadata": {},
    "outputs": [],
-   "source": []
+   "source": [
+    "# Convert categorical data to numerical data, starting with the gender column\n",
+    "# get_dummies replaces the gender column with one indicator column per category (named gender_<value>)\n",
+    "df = pd.get_dummies(data=df, columns=['gender'])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Encoding the target (Yes -> 1, No -> 0) and separating it (y) from the features (X)\n",
+    "df['Churn'] = df['Churn'].map({'Yes': 1, 'No': 0})\n",
+    "y = df[\"Churn\"].values\n",
+    "X = df.drop(labels = [\"Churn\"],axis = 1)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Create train and test sets (80/20 split, fixed random_state so the split is reproducible)\n",
+    "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=101)"
+   ]
   }
  ],
  "metadata": {