Skip to content

Commit f126f4b

Browse files
committed
updated data preprocessing
1 parent 287c541 commit f126f4b

File tree

1 file changed

+175
-15
lines changed

1 file changed

+175
-15
lines changed

customer_churn_prediction.ipynb

Lines changed: 175 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@
2121
},
2222
{
2323
"cell_type": "code",
24-
"execution_count": 3,
24+
"execution_count": 1,
2525
"metadata": {},
2626
"outputs": [],
2727
"source": [
@@ -33,7 +33,8 @@
3333
"%matplotlib inline\n",
3434
"import warnings\n",
3535
"warnings.filterwarnings(\"ignore\")\n",
36-
"from pylab import rcParams"
36+
"from pylab import rcParams\n",
37+
"from sklearn.model_selection import train_test_split"
3738
]
3839
},
3940
{
@@ -71,7 +72,7 @@
7172
},
7273
{
7374
"cell_type": "code",
74-
"execution_count": 7,
75+
"execution_count": 2,
7576
"metadata": {},
7677
"outputs": [
7778
{
@@ -276,7 +277,7 @@
276277
"[5 rows x 21 columns]"
277278
]
278279
},
279-
"execution_count": 7,
280+
"execution_count": 2,
280281
"metadata": {},
281282
"output_type": "execute_result"
282283
}
@@ -289,7 +290,7 @@
289290
},
290291
{
291292
"cell_type": "code",
292-
"execution_count": 8,
293+
"execution_count": 3,
293294
"metadata": {},
294295
"outputs": [
295296
{
@@ -298,7 +299,7 @@
298299
"(7043, 21)"
299300
]
300301
},
301-
"execution_count": 8,
302+
"execution_count": 3,
302303
"metadata": {},
303304
"output_type": "execute_result"
304305
}
@@ -309,7 +310,7 @@
309310
},
310311
{
311312
"cell_type": "code",
312-
"execution_count": 9,
313+
"execution_count": 4,
313314
"metadata": {},
314315
"outputs": [
315316
{
@@ -323,7 +324,7 @@
323324
" dtype='object')"
324325
]
325326
},
326-
"execution_count": 9,
327+
"execution_count": 4,
327328
"metadata": {},
328329
"output_type": "execute_result"
329330
}
@@ -334,7 +335,7 @@
334335
},
335336
{
336337
"cell_type": "code",
337-
"execution_count": 10,
338+
"execution_count": 5,
338339
"metadata": {},
339340
"outputs": [
340341
{
@@ -345,7 +346,7 @@
345346
"Name: Churn, dtype: int64"
346347
]
347348
},
348-
"execution_count": 10,
349+
"execution_count": 5,
349350
"metadata": {},
350351
"output_type": "execute_result"
351352
}
@@ -356,7 +357,7 @@
356357
},
357358
{
358359
"cell_type": "code",
359-
"execution_count": 14,
360+
"execution_count": 6,
360361
"metadata": {},
361362
"outputs": [
362363
{
@@ -373,7 +374,7 @@
373374
},
374375
{
375376
"cell_type": "code",
376-
"execution_count": 34,
377+
"execution_count": 7,
377378
"metadata": {},
378379
"outputs": [
379380
{
@@ -406,15 +407,174 @@
406407
"cell_type": "markdown",
407408
"metadata": {},
408409
"source": [
409-
"Next step is data preprocessing and feature engineering"
410+
"Next step is data preprocessing and feature engineering\n",
411+
"In this step, we will preprocess the data because if the data is not good, our predictions will not be good. So, we will be doing the following:\n",
412+
"* Dropping irrelevant data\n",
413+
"* Handling missing data"
414+
]
415+
},
416+
{
417+
"cell_type": "code",
418+
"execution_count": 8,
419+
"metadata": {},
420+
"outputs": [],
421+
"source": [
422+
"# Dropping irrelavant data\n",
423+
"df.drop('customerID', axis=1, inplace=True)"
424+
]
425+
},
426+
{
427+
"cell_type": "code",
428+
"execution_count": 9,
429+
"metadata": {},
430+
"outputs": [
431+
{
432+
"data": {
433+
"text/plain": [
434+
"False"
435+
]
436+
},
437+
"execution_count": 9,
438+
"metadata": {},
439+
"output_type": "execute_result"
440+
}
441+
],
442+
"source": [
443+
"# Handling missing data\n",
444+
"df.isnull().values.any()"
445+
]
446+
},
447+
{
448+
"cell_type": "code",
449+
"execution_count": 10,
450+
"metadata": {},
451+
"outputs": [
452+
{
453+
"data": {
454+
"text/plain": [
455+
"gender 0\n",
456+
"SeniorCitizen 0\n",
457+
"Partner 0\n",
458+
"Dependents 0\n",
459+
"tenure 0\n",
460+
"PhoneService 0\n",
461+
"MultipleLines 0\n",
462+
"InternetService 0\n",
463+
"OnlineSecurity 0\n",
464+
"OnlineBackup 0\n",
465+
"DeviceProtection 0\n",
466+
"TechSupport 0\n",
467+
"StreamingTV 0\n",
468+
"StreamingMovies 0\n",
469+
"Contract 0\n",
470+
"PaperlessBilling 0\n",
471+
"PaymentMethod 0\n",
472+
"MonthlyCharges 0\n",
473+
"TotalCharges 0\n",
474+
"Churn 0\n",
475+
"dtype: int64"
476+
]
477+
},
478+
"execution_count": 10,
479+
"metadata": {},
480+
"output_type": "execute_result"
481+
}
482+
],
483+
"source": [
484+
"# Missing values per column\n",
485+
"df.isnull().sum()"
486+
]
487+
},
488+
{
489+
"cell_type": "markdown",
490+
"metadata": {},
491+
"source": [
492+
"We don't have any missing values in any of the columns"
493+
]
494+
},
495+
{
496+
"cell_type": "code",
497+
"execution_count": 11,
498+
"metadata": {},
499+
"outputs": [
500+
{
501+
"data": {
502+
"text/plain": [
503+
"gender object\n",
504+
"SeniorCitizen int64\n",
505+
"Partner object\n",
506+
"Dependents object\n",
507+
"tenure int64\n",
508+
"PhoneService object\n",
509+
"MultipleLines object\n",
510+
"InternetService object\n",
511+
"OnlineSecurity object\n",
512+
"OnlineBackup object\n",
513+
"DeviceProtection object\n",
514+
"TechSupport object\n",
515+
"StreamingTV object\n",
516+
"StreamingMovies object\n",
517+
"Contract object\n",
518+
"PaperlessBilling object\n",
519+
"PaymentMethod object\n",
520+
"MonthlyCharges float64\n",
521+
"TotalCharges object\n",
522+
"Churn object\n",
523+
"dtype: object"
524+
]
525+
},
526+
"execution_count": 11,
527+
"metadata": {},
528+
"output_type": "execute_result"
529+
}
530+
],
531+
"source": [
532+
"# Converting object type columns to numerical\n",
533+
"df.dtypes"
534+
]
535+
},
536+
{
537+
"cell_type": "code",
538+
"execution_count": 12,
539+
"metadata": {},
540+
"outputs": [],
541+
"source": [
542+
"# TotalCharges column is object type where it actually should be numerical\n",
543+
"df['TotalCharges'] = pd.to_numeric(df['TotalCharges'],errors='coerce')"
410544
]
411545
},
412546
{
413547
"cell_type": "code",
414-
"execution_count": null,
548+
"execution_count": 13,
415549
"metadata": {},
416550
"outputs": [],
417-
"source": []
551+
"source": [
552+
"# Convert categorical data to numerical data like for the column gender\n",
553+
"# Using get_dummies function to replace gender column with gender_male and gender_female columns\n",
554+
"df = pd.get_dummies(data=df, columns=['gender'])"
555+
]
556+
},
557+
{
558+
"cell_type": "code",
559+
"execution_count": 14,
560+
"metadata": {},
561+
"outputs": [],
562+
"source": [
563+
"# Splitting the dataset\n",
564+
"df['Churn'] = df['Churn'].map({'Yes': 1, 'No': 0})\n",
565+
"y = df[\"Churn\"].values\n",
566+
"X = df.drop(labels = [\"Churn\"],axis = 1)"
567+
]
568+
},
569+
{
570+
"cell_type": "code",
571+
"execution_count": 18,
572+
"metadata": {},
573+
"outputs": [],
574+
"source": [
575+
"# Create Train & Test Data\n",
576+
"X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=101)"
577+
]
418578
}
419579
],
420580
"metadata": {

0 commit comments

Comments
 (0)