|
21 | 21 | },
|
22 | 22 | {
|
23 | 23 | "cell_type": "code",
|
24 |
| - "execution_count": 3, |
| 24 | + "execution_count": 1, |
25 | 25 | "metadata": {},
|
26 | 26 | "outputs": [],
|
27 | 27 | "source": [
|
|
33 | 33 | "%matplotlib inline\n",
|
34 | 34 | "import warnings\n",
|
35 | 35 | "warnings.filterwarnings(\"ignore\")\n",
|
36 |
| - "from pylab import rcParams" |
| 36 | + "from pylab import rcParams\n", |
| 37 | + "from sklearn.model_selection import train_test_split" |
37 | 38 | ]
|
38 | 39 | },
|
39 | 40 | {
|
|
71 | 72 | },
|
72 | 73 | {
|
73 | 74 | "cell_type": "code",
|
74 |
| - "execution_count": 7, |
| 75 | + "execution_count": 2, |
75 | 76 | "metadata": {},
|
76 | 77 | "outputs": [
|
77 | 78 | {
|
|
276 | 277 | "[5 rows x 21 columns]"
|
277 | 278 | ]
|
278 | 279 | },
|
279 |
| - "execution_count": 7, |
| 280 | + "execution_count": 2, |
280 | 281 | "metadata": {},
|
281 | 282 | "output_type": "execute_result"
|
282 | 283 | }
|
|
289 | 290 | },
|
290 | 291 | {
|
291 | 292 | "cell_type": "code",
|
292 |
| - "execution_count": 8, |
| 293 | + "execution_count": 3, |
293 | 294 | "metadata": {},
|
294 | 295 | "outputs": [
|
295 | 296 | {
|
|
298 | 299 | "(7043, 21)"
|
299 | 300 | ]
|
300 | 301 | },
|
301 |
| - "execution_count": 8, |
| 302 | + "execution_count": 3, |
302 | 303 | "metadata": {},
|
303 | 304 | "output_type": "execute_result"
|
304 | 305 | }
|
|
309 | 310 | },
|
310 | 311 | {
|
311 | 312 | "cell_type": "code",
|
312 |
| - "execution_count": 9, |
| 313 | + "execution_count": 4, |
313 | 314 | "metadata": {},
|
314 | 315 | "outputs": [
|
315 | 316 | {
|
|
323 | 324 | " dtype='object')"
|
324 | 325 | ]
|
325 | 326 | },
|
326 |
| - "execution_count": 9, |
| 327 | + "execution_count": 4, |
327 | 328 | "metadata": {},
|
328 | 329 | "output_type": "execute_result"
|
329 | 330 | }
|
|
334 | 335 | },
|
335 | 336 | {
|
336 | 337 | "cell_type": "code",
|
337 |
| - "execution_count": 10, |
| 338 | + "execution_count": 5, |
338 | 339 | "metadata": {},
|
339 | 340 | "outputs": [
|
340 | 341 | {
|
|
345 | 346 | "Name: Churn, dtype: int64"
|
346 | 347 | ]
|
347 | 348 | },
|
348 |
| - "execution_count": 10, |
| 349 | + "execution_count": 5, |
349 | 350 | "metadata": {},
|
350 | 351 | "output_type": "execute_result"
|
351 | 352 | }
|
|
356 | 357 | },
|
357 | 358 | {
|
358 | 359 | "cell_type": "code",
|
359 |
| - "execution_count": 14, |
| 360 | + "execution_count": 6, |
360 | 361 | "metadata": {},
|
361 | 362 | "outputs": [
|
362 | 363 | {
|
|
373 | 374 | },
|
374 | 375 | {
|
375 | 376 | "cell_type": "code",
|
376 |
| - "execution_count": 34, |
| 377 | + "execution_count": 7, |
377 | 378 | "metadata": {},
|
378 | 379 | "outputs": [
|
379 | 380 | {
|
|
406 | 407 | "cell_type": "markdown",
|
407 | 408 | "metadata": {},
|
408 | 409 | "source": [
|
409 |
| - "Next step is data preprocessing and feature engineering" |
| 410 | + "The next step is data preprocessing and feature engineering.\n", |
| 411 | + "In this step we preprocess the data, because a model trained on poor-quality data makes poor predictions. We will do the following:\n", |
| 412 | + "* Dropping irrelevant data\n", |
| 413 | + "* Handling missing data" |
| 414 | + ] |
| 415 | + }, |
| 416 | + { |
| 417 | + "cell_type": "code", |
| 418 | + "execution_count": 8, |
| 419 | + "metadata": {}, |
| 420 | + "outputs": [], |
| 421 | + "source": [ |
| 422 | + "# Dropping irrelevant data\n", |
| 423 | + "df.drop('customerID', axis=1, inplace=True)" |
| 424 | + ] |
| 425 | + }, |
| 426 | + { |
| 427 | + "cell_type": "code", |
| 428 | + "execution_count": 9, |
| 429 | + "metadata": {}, |
| 430 | + "outputs": [ |
| 431 | + { |
| 432 | + "data": { |
| 433 | + "text/plain": [ |
| 434 | + "False" |
| 435 | + ] |
| 436 | + }, |
| 437 | + "execution_count": 9, |
| 438 | + "metadata": {}, |
| 439 | + "output_type": "execute_result" |
| 440 | + } |
| 441 | + ], |
| 442 | + "source": [ |
| 443 | + "# Handling missing data\n", |
| 444 | + "df.isnull().values.any()" |
| 445 | + ] |
| 446 | + }, |
| 447 | + { |
| 448 | + "cell_type": "code", |
| 449 | + "execution_count": 10, |
| 450 | + "metadata": {}, |
| 451 | + "outputs": [ |
| 452 | + { |
| 453 | + "data": { |
| 454 | + "text/plain": [ |
| 455 | + "gender 0\n", |
| 456 | + "SeniorCitizen 0\n", |
| 457 | + "Partner 0\n", |
| 458 | + "Dependents 0\n", |
| 459 | + "tenure 0\n", |
| 460 | + "PhoneService 0\n", |
| 461 | + "MultipleLines 0\n", |
| 462 | + "InternetService 0\n", |
| 463 | + "OnlineSecurity 0\n", |
| 464 | + "OnlineBackup 0\n", |
| 465 | + "DeviceProtection 0\n", |
| 466 | + "TechSupport 0\n", |
| 467 | + "StreamingTV 0\n", |
| 468 | + "StreamingMovies 0\n", |
| 469 | + "Contract 0\n", |
| 470 | + "PaperlessBilling 0\n", |
| 471 | + "PaymentMethod 0\n", |
| 472 | + "MonthlyCharges 0\n", |
| 473 | + "TotalCharges 0\n", |
| 474 | + "Churn 0\n", |
| 475 | + "dtype: int64" |
| 476 | + ] |
| 477 | + }, |
| 478 | + "execution_count": 10, |
| 479 | + "metadata": {}, |
| 480 | + "output_type": "execute_result" |
| 481 | + } |
| 482 | + ], |
| 483 | + "source": [ |
| 484 | + "# Missing values per column\n", |
| 485 | + "df.isnull().sum()" |
| 486 | + ] |
| 487 | + }, |
| 488 | + { |
| 489 | + "cell_type": "markdown", |
| 490 | + "metadata": {}, |
| 491 | + "source": [ |
| 492 | + "isnull() reports no missing values in any of the columns (note that blank strings in object-type columns are not counted as missing)" |
| 493 | + ] |
| 494 | + }, |
| 495 | + { |
| 496 | + "cell_type": "code", |
| 497 | + "execution_count": 11, |
| 498 | + "metadata": {}, |
| 499 | + "outputs": [ |
| 500 | + { |
| 501 | + "data": { |
| 502 | + "text/plain": [ |
| 503 | + "gender object\n", |
| 504 | + "SeniorCitizen int64\n", |
| 505 | + "Partner object\n", |
| 506 | + "Dependents object\n", |
| 507 | + "tenure int64\n", |
| 508 | + "PhoneService object\n", |
| 509 | + "MultipleLines object\n", |
| 510 | + "InternetService object\n", |
| 511 | + "OnlineSecurity object\n", |
| 512 | + "OnlineBackup object\n", |
| 513 | + "DeviceProtection object\n", |
| 514 | + "TechSupport object\n", |
| 515 | + "StreamingTV object\n", |
| 516 | + "StreamingMovies object\n", |
| 517 | + "Contract object\n", |
| 518 | + "PaperlessBilling object\n", |
| 519 | + "PaymentMethod object\n", |
| 520 | + "MonthlyCharges float64\n", |
| 521 | + "TotalCharges object\n", |
| 522 | + "Churn object\n", |
| 523 | + "dtype: object" |
| 524 | + ] |
| 525 | + }, |
| 526 | + "execution_count": 11, |
| 527 | + "metadata": {}, |
| 528 | + "output_type": "execute_result" |
| 529 | + } |
| 530 | + ], |
| 531 | + "source": [ |
| 532 | + "# Inspecting column dtypes to find object-type columns that need converting to numerical\n", |
| 533 | + "df.dtypes" |
| 534 | + ] |
| 535 | + }, |
| 536 | + { |
| 537 | + "cell_type": "code", |
| 538 | + "execution_count": 12, |
| 539 | + "metadata": {}, |
| 540 | + "outputs": [], |
| 541 | + "source": [ |
| 542 | + "# TotalCharges column is object type where it actually should be numerical\n", |
| 543 | + "df['TotalCharges'] = pd.to_numeric(df['TotalCharges'],errors='coerce')" |
410 | 544 | ]
|
411 | 545 | },
|
412 | 546 | {
|
413 | 547 | "cell_type": "code",
|
414 |
| - "execution_count": null, |
| 548 | + "execution_count": 13, |
415 | 549 | "metadata": {},
|
416 | 550 | "outputs": [],
|
417 |
| - "source": [] |
| 551 | + "source": [ |
| 552 | + "# Convert categorical data to numerical data, for example the gender column\n", |
| 553 | + "# Use the get_dummies function to replace the gender column with one indicator column per category (gender_<category>)\n", |
| 554 | + "df = pd.get_dummies(data=df, columns=['gender'])" |
| 555 | + ] |
| 556 | + }, |
| 557 | + { |
| 558 | + "cell_type": "code", |
| 559 | + "execution_count": 14, |
| 560 | + "metadata": {}, |
| 561 | + "outputs": [], |
| 562 | + "source": [ |
| 563 | + "# Encode the Churn target as 0/1 and separate the target (y) from the features (X)\n", |
| 564 | + "df['Churn'] = df['Churn'].map({'Yes': 1, 'No': 0})\n", |
| 565 | + "y = df[\"Churn\"].values\n", |
| 566 | + "X = df.drop(labels = [\"Churn\"],axis = 1)" |
| 567 | + ] |
| 568 | + }, |
| 569 | + { |
| 570 | + "cell_type": "code", |
| 571 | + "execution_count": 18, |
| 572 | + "metadata": {}, |
| 573 | + "outputs": [], |
| 574 | + "source": [ |
| 575 | + "# Create Train & Test Data\n", |
| 576 | + "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=101)" |
| 577 | + ] |
418 | 578 | }
|
419 | 579 | ],
|
420 | 580 | "metadata": {
|
|
0 commit comments