-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathload_and_explore_dataset.py
More file actions
762 lines (643 loc) · 35.6 KB
/
load_and_explore_dataset.py
File metadata and controls
762 lines (643 loc) · 35.6 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
"""
Load, Explore, Analyze and Visualize the Dataset
===============================================
This comprehensive script performs the following tasks:
1. Load the dataset using pandas
2. Display the first few rows using .head()
3. Explore the structure of the dataset (data types and missing values)
4. Clean the dataset by handling missing values
5. Perform basic data analysis including:
- Compute basic statistics of numerical columns using .describe()
- Perform groupings on categorical columns and compute statistics
- Identify patterns and interesting findings from the analysis
6. Create comprehensive data visualizations including:
- Line chart showing trends over time (subscription patterns)
- Bar chart comparing numerical values across categories (name lengths by country)
- Histogram showing distribution of numerical data (email lengths)
- Scatter plot visualizing relationships between numerical columns
- Additional specialized visualizations (pie charts, heatmaps, box plots)
"""
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')
def load_and_explore_dataset(filepath='customers-100.csv'):
    """
    Load a CSV dataset, print an exploration report, and return a cleaned copy.

    The report covers the first/last rows, column names, dtypes, ``info()``,
    ``describe()``, missing-value counts and duplicate rows.

    Cleaning rules, applied per column that has missing values:
      * more than 50% missing  -> the column is dropped;
      * text (object/string)   -> missing values become ``'Unknown'``;
      * anything else          -> missing values become the column median.

    Args:
        filepath: Path of the CSV file to read. Defaults to
            ``'customers-100.csv'`` in the current working directory.

    Returns:
        The cleaned ``pandas.DataFrame``, or ``None`` if the file could not
        be read (the reason is printed).
    """

    def _is_text(series):
        # `dtype == 'object'` alone misses dedicated string dtypes, which
        # would then be sent to the median branch and fail.
        return (pd.api.types.is_object_dtype(series)
                or pd.api.types.is_string_dtype(series))

    print("=" * 50)
    print("LOAD AND EXPLORE THE DATASET")
    print("=" * 50)

    # Task 1: Load the dataset using pandas
    print("\n1. Loading the dataset...")
    try:
        df = pd.read_csv(filepath)
    except FileNotFoundError:
        print(f"❌ Error: {filepath} file not found!")
        return None
    except Exception as e:
        # Best-effort loader: any parse/permission problem is reported, not raised.
        print(f"❌ Error loading dataset: {e}")
        return None
    print("✓ Dataset loaded successfully!")
    print(f" Dataset shape: {df.shape} (rows: {df.shape[0]}, columns: {df.shape[1]})")

    # Task 2: Display the first few rows using .head()
    print("\n2. Displaying the first few rows of the dataset:")
    print("-" * 50)
    print(df.head())
    # Also show the last few rows for completeness
    print("\nLast few rows of the dataset:")
    print("-" * 30)
    print(df.tail())

    # Task 3: Explore the structure of the dataset
    print("\n3. Exploring the structure of the dataset:")
    print("-" * 50)
    print(f"Column names ({len(df.columns)} total):")
    for i, col in enumerate(df.columns, 1):
        print(f" {i:2d}. {col}")
    print("\nData types:")
    print(df.dtypes)
    print("\nDataset info:")
    print("-" * 20)
    df.info()
    print("\nBasic statistics:")
    print("-" * 20)
    print(df.describe(include='all'))

    # Check for missing values
    print("\n4. Checking for missing values:")
    print("-" * 40)
    missing_values = df.isnull().sum()
    total_missing = missing_values.sum()
    if total_missing == 0:
        print("✓ No missing values found in the dataset!")
    else:
        print("Missing values found:")
        for col, missing_count in missing_values[missing_values > 0].items():
            percentage = (missing_count / len(df)) * 100
            print(f" {col}: {missing_count} ({percentage:.2f}%)")
        print(f"\nTotal missing values: {total_missing}")

    # Task 4: Clean the dataset by handling missing values
    print("\n5. Cleaning the dataset:")
    print("-" * 30)
    cleaned_df = df.copy()
    if total_missing == 0:
        print("✓ Dataset is already clean - no missing values to handle!")
    else:
        print("Cleaning strategy:")
        for col in df.columns:
            missing_count = df[col].isnull().sum()
            if missing_count == 0:
                continue
            if missing_count / len(df) > 0.5:
                # Too sparse to impute sensibly, whatever the dtype.
                print(f" Dropping column '{col}' (too many missing values: {missing_count})")
                cleaned_df = cleaned_df.drop(columns=[col])
            elif _is_text(df[col]):
                print(f" Filling missing values in '{col}' with 'Unknown'")
                cleaned_df[col] = cleaned_df[col].fillna('Unknown')
            else:
                median_value = df[col].median()
                print(f" Filling missing values in '{col}' with median: {median_value}")
                cleaned_df[col] = cleaned_df[col].fillna(median_value)

    # Verify cleaning
    print("\n6. Verification after cleaning:")
    print("-" * 35)
    remaining_missing = cleaned_df.isnull().sum().sum()
    print(f"Original dataset shape: {df.shape}")
    print(f"Cleaned dataset shape: {cleaned_df.shape}")
    print(f"Remaining missing values: {remaining_missing}")
    if remaining_missing == 0:
        print("✓ Dataset successfully cleaned!")
    else:
        print("⚠ Some missing values still remain")

    # Additional exploration
    print("\n7. Additional dataset insights:")
    print("-" * 35)
    duplicates = cleaned_df.duplicated().sum()
    print(f"Duplicate rows: {duplicates}")

    # Show unique values for the first three text columns
    categorical_columns = [c for c in cleaned_df.columns if _is_text(cleaned_df[c])]
    for col in categorical_columns[:3]:
        unique_count = cleaned_df[col].nunique()
        print(f"Unique values in '{col}': {unique_count}")
        if unique_count <= 10:  # Show values only if the list stays short
            # key=str keeps the sort from failing on mixed-type object columns.
            print(f" Values: {sorted(cleaned_df[col].unique(), key=str)}")

    print("\n" + "=" * 50)
    print("DATASET EXPLORATION COMPLETED!")
    print("=" * 50)
    return cleaned_df
def perform_basic_data_analysis(df):
    """
    Print descriptive statistics, group-by summaries and a list of findings.

    When ``df`` has no numeric columns, numeric features are derived on a
    copy: ``'Subscription Year'`` (from ``'Subscription Date'``) and
    ``'<col> Length'`` for each of First Name, Last Name, Company, City and
    Email that is present. An ``'Email Domain'`` column is added whenever an
    ``'Email'`` column can be parsed.

    Args:
        df: The (already cleaned) dataset. It is not modified.

    Returns:
        A copy of ``df`` extended with the derived columns described above.
    """
    print("\n" + "=" * 60)
    print("BASIC DATA ANALYSIS")
    print("=" * 60)

    # Task 1: Compute basic statistics of numerical columns
    print("\n1. Basic Statistics of Numerical Columns:")
    print("-" * 45)
    analysis_df = df.copy()
    numerical_cols = df.select_dtypes(include=[np.number]).columns.tolist()

    if not numerical_cols:
        print("⚠ No numerical columns found in the dataset.")
        print("\nAttempting to extract numerical information from string columns...")

        # Extract year from subscription date if it exists
        if 'Subscription Date' in df.columns:
            try:
                analysis_df['Subscription Year'] = pd.to_datetime(df['Subscription Date']).dt.year
            except Exception:
                print("❌ Could not extract year from 'Subscription Date'")
            else:
                numerical_cols.append('Subscription Year')
                print("✓ Extracted 'Subscription Year' from 'Subscription Date'")

        # Count characters in text fields for analysis
        measured_cols = []
        for col in ['First Name', 'Last Name', 'Company', 'City', 'Email']:
            if col in df.columns:
                new_col = f"{col} Length"
                analysis_df[new_col] = df[col].astype(str).str.len()
                numerical_cols.append(new_col)
                measured_cols.append(col)
        if measured_cols:
            print(f"✓ Created length analysis for: {', '.join(measured_cols)}")

    if numerical_cols:
        print(f"\nNumerical columns found: {numerical_cols}")
        print("\nDetailed Statistics:")
        print("-" * 25)
        print(analysis_df[numerical_cols].describe())

        print("\nAdditional Statistics:")
        print("-" * 25)
        for col in numerical_cols:
            data = analysis_df[col].dropna()
            if data.empty:
                continue
            print(f"\n{col}:")
            print(f" Mean: {data.mean():.2f}")
            print(f" Median: {data.median():.2f}")
            print(f" Standard Deviation: {data.std():.2f}")
            print(f" Variance: {data.var():.2f}")
            print(f" Range: {data.max() - data.min():.2f}")
            print(f" Skewness: {data.skew():.2f}")
            print(f" Kurtosis: {data.kurtosis():.2f}")

    # Task 2: Perform groupings on categorical columns
    print("\n2. Grouping Analysis by Categorical Columns:")
    print("-" * 50)
    # Detect text columns by dtype helpers so dedicated string dtypes count too.
    categorical_cols = [
        col for col in analysis_df.columns
        if pd.api.types.is_object_dtype(analysis_df[col])
        or pd.api.types.is_string_dtype(analysis_df[col])
    ]
    if categorical_cols and numerical_cols:
        for cat_col in categorical_cols[:5]:  # Limit to first 5 to avoid too much output
            unique_values = analysis_df[cat_col].nunique()
            # Only analyze if there is a reasonable number of categories (2-50)
            if not 2 <= unique_values <= 50:
                reason = "too few" if unique_values < 2 else "too many"
                print(f"\n⚠ Skipping '{cat_col}' - {reason} unique values ({unique_values}) for meaningful grouping")
                continue
            print(f"\n📊 Analysis by '{cat_col}' ({unique_values} unique values):")
            print("-" * 40)
            grouped = analysis_df.groupby(cat_col)
            for num_col in numerical_cols[:3]:  # Limit to first 3 numerical columns
                try:
                    group_stats = grouped[num_col].agg(['count', 'mean', 'median', 'std', 'min', 'max'])
                    print(f"\n{num_col} by {cat_col}:")
                    print(group_stats.round(2))
                    max_mean_group = group_stats['mean'].idxmax()
                    min_mean_group = group_stats['mean'].idxmin()
                    print(f" 🔝 Highest mean {num_col}: {max_mean_group} ({group_stats.loc[max_mean_group, 'mean']:.2f})")
                    print(f" 🔻 Lowest mean {num_col}: {min_mean_group} ({group_stats.loc[min_mean_group, 'mean']:.2f})")
                except Exception as e:
                    print(f" ❌ Could not analyze {num_col}: {str(e)}")

    # Task 3: Identify patterns and interesting findings
    print("\n3. Patterns and Interesting Findings:")
    print("-" * 45)
    findings = []

    # Pattern 1: Country distribution
    if 'Country' in analysis_df.columns:
        country_counts = analysis_df['Country'].value_counts()
        if not country_counts.empty:
            # len() of the counts is the number of distinct countries;
            # .nunique() on it would count distinct *frequencies* instead.
            findings.append(f"🌍 Geographic Distribution: Dataset contains customers from {len(country_counts)} countries")
            findings.append(f" Top country: {country_counts.index[0]} ({country_counts.iloc[0]} customers)")
            if len(country_counts) > 1:
                findings.append(f" Second most: {country_counts.index[1]} ({country_counts.iloc[1]} customers)")

    # Pattern 2: Subscription patterns by year
    if 'Subscription Year' in analysis_df.columns:
        year_counts = analysis_df['Subscription Year'].value_counts().sort_index()
        if not year_counts.empty:
            findings.append(f"📅 Subscription Timeline: Customers joined between {year_counts.index.min()} and {year_counts.index.max()}")
            peak_year = year_counts.idxmax()
            findings.append(f" Peak subscription year: {peak_year} ({year_counts[peak_year]} customers)")

    # Pattern 3: Name length patterns
    if 'First Name Length' in analysis_df.columns and 'Last Name Length' in analysis_df.columns:
        avg_first_name = analysis_df['First Name Length'].mean()
        avg_last_name = analysis_df['Last Name Length'].mean()
        findings.append("👤 Name Patterns:")
        findings.append(f" Average first name length: {avg_first_name:.1f} characters")
        findings.append(f" Average last name length: {avg_last_name:.1f} characters")

    # Pattern 4: Email domain analysis
    if 'Email' in analysis_df.columns:
        try:
            # Capture everything after the last '@' so multi-label domains
            # (e.g. mail.example.co.uk) are kept instead of becoming NaN.
            analysis_df['Email Domain'] = analysis_df['Email'].str.extract(
                r'@([^@\s]+)\s*$', expand=False)
            domain_counts = analysis_df['Email Domain'].value_counts()
            findings.append(f"📧 Email Domains: {len(domain_counts)} unique email domains")
            if not domain_counts.empty:
                findings.append(f" Most common domain: {domain_counts.index[0]} ({domain_counts.iloc[0]} users)")
        except Exception:
            findings.append("📧 Email analysis: Could not extract email domain patterns")

    # Pattern 5: Company name analysis
    if 'Company' in analysis_df.columns:
        company_counts = analysis_df['Company'].value_counts()
        findings.append(f"🏢 Company Distribution: {len(company_counts)} unique companies")
        if not company_counts.empty and company_counts.iloc[0] > 1:
            findings.append(f" Largest client: {company_counts.index[0]} ({company_counts.iloc[0]} customers)")

    # Pattern 6: Data quality insights
    total_rows = len(analysis_df)
    if total_rows > 0:  # avoid dividing by zero on an empty dataset
        complete_rows = analysis_df.dropna().shape[0]
        completeness = (complete_rows / total_rows) * 100
        findings.append(f"📊 Data Quality: {completeness:.1f}% of rows have complete data")

    # Pattern 7: Correlation analysis (if we have multiple numerical columns)
    if len(numerical_cols) >= 2:
        try:
            correlation_matrix = analysis_df[numerical_cols].corr()
            # Upper triangle only: skips self-correlations and mirrored pairs.
            correlation_pairs = []
            for i, first in enumerate(numerical_cols):
                for j in range(i + 1, len(numerical_cols)):
                    corr_value = correlation_matrix.iloc[i, j]
                    if abs(corr_value) > 0.3:  # Only moderate to strong correlations
                        correlation_pairs.append((first, numerical_cols[j], corr_value))
            if correlation_pairs:
                findings.append("🔗 Correlations Found:")
                strongest = sorted(correlation_pairs, key=lambda pair: abs(pair[2]), reverse=True)[:3]
                for col1, col2, corr in strongest:
                    strength = "strong" if abs(corr) > 0.7 else "moderate"
                    direction = "positive" if corr > 0 else "negative"
                    findings.append(f" {col1} ↔ {col2}: {corr:.3f} ({strength} {direction})")
        except Exception:
            findings.append("🔗 Correlation analysis: Could not compute correlations")

    # Display all findings
    if findings:
        for finding in findings:
            print(finding)
    else:
        print("No specific patterns identified with available data types.")

    # Summary insights
    print("\n4. Summary Insights:")
    print("-" * 25)
    print(f"✓ Dataset contains {len(analysis_df)} customer records")
    print(f"✓ {len(categorical_cols)} categorical and {len(numerical_cols)} numerical features analyzed")
    print("✓ Data spans multiple countries and time periods")
    print("✓ Dataset appears suitable for customer segmentation and geographic analysis")
    return analysis_df
def _plot_subscription_trend(viz_df):
    """Draw the monthly subscription line chart on the current axes."""
    ax = plt.gca()
    if 'Subscription YearMonth' not in viz_df.columns:
        plt.text(0.5, 0.5, 'No time-series data available\nfor trend analysis',
                 ha='center', va='center', transform=ax.transAxes, fontsize=12)
        plt.title('📈 Time Series Analysis (Data Not Available)', fontsize=14)
        return

    try:
        # Count subscriptions by month
        monthly_subscriptions = viz_df['Subscription YearMonth'].value_counts().sort_index()
        # Period labels are converted to strings; the x axis uses positions.
        months = [str(period) for period in monthly_subscriptions.index]
        counts = monthly_subscriptions.values
        plt.plot(range(len(months)), counts, marker='o', linewidth=2.5, markersize=6, color='#2E86AB')
        plt.title('📈 Customer Subscription Trends Over Time', fontsize=14, fontweight='bold', pad=20)
        plt.xlabel('Time Period (Year-Month)', fontsize=12)
        plt.ylabel('Number of New Subscriptions', fontsize=12)
        plt.grid(True, alpha=0.3)

        # Show at most ~6 tick labels so they stay legible
        step = max(1, len(months) // 6)
        tick_positions = range(0, len(months), step)
        plt.xticks(tick_positions, [months[i] for i in tick_positions],
                   rotation=45, ha='right')

        # Trend annotation compares the last month with the first one only.
        if len(counts) > 1:
            if counts[-1] > counts[0]:
                trend = "📈 Increasing"
            elif counts[-1] < counts[0]:
                trend = "📉 Decreasing"
            else:
                trend = "➡ Stable"
            plt.text(0.02, 0.98, f'Trend: {trend}', transform=ax.transAxes,
                     fontsize=10, verticalalignment='top',
                     bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5))
        print("✓ Line chart: Subscription trends over time")
    except Exception:
        # Start from a clean axes so the fallback is not drawn over a
        # half-finished monthly chart.
        ax.clear()
        if 'Subscription Year' in viz_df.columns:
            yearly_subs = viz_df['Subscription Year'].value_counts().sort_index()
            plt.plot(yearly_subs.index, yearly_subs.values, marker='o', linewidth=3, markersize=8)
            plt.title('📈 Annual Subscription Trends', fontsize=14, fontweight='bold')
            plt.xlabel('Year', fontsize=12)
            plt.ylabel('Number of Subscriptions', fontsize=12)
            plt.grid(True, alpha=0.3)
        else:
            plt.text(0.5, 0.5, 'No time-series data available\nfor trend analysis',
                     ha='center', va='center', transform=ax.transAxes, fontsize=12)


def _plot_name_length_by_country(viz_df):
    """Draw the bar chart of mean first-name length for the top 10 countries."""
    ax = plt.gca()
    if 'Country' not in viz_df.columns or 'First Name Length' not in viz_df.columns:
        plt.text(0.5, 0.5, 'No categorical data available\nfor comparison analysis',
                 ha='center', va='center', transform=ax.transAxes, fontsize=12)
        plt.title('📊 Categorical Comparison (Data Not Available)', fontsize=14)
        return

    try:
        # "Top 10" means the ten highest averages, not the ten largest countries.
        country_name_avg = (viz_df.groupby('Country')['First Name Length']
                            .mean().sort_values(ascending=False).head(10))
        positions = range(len(country_name_avg))
        bars = plt.bar(positions, country_name_avg.values,
                       color=plt.cm.Set3(np.linspace(0, 1, len(country_name_avg))))
        plt.title('📊 Average First Name Length by Country (Top 10)', fontsize=14, fontweight='bold', pad=20)
        plt.xlabel('Country', fontsize=12)
        plt.ylabel('Average Name Length (characters)', fontsize=12)
        plt.xticks(positions, country_name_avg.index, rotation=45, ha='right')
        plt.grid(True, alpha=0.3, axis='y')

        # Value label above each bar
        for bar in bars:
            height = bar.get_height()
            plt.text(bar.get_x() + bar.get_width() / 2., height + 0.1,
                     f'{height:.1f}', ha='center', va='bottom', fontsize=9)

        plt.text(0.02, 0.98, f'Range: {country_name_avg.min():.1f} - {country_name_avg.max():.1f} chars',
                 transform=ax.transAxes, fontsize=10, verticalalignment='top',
                 bbox=dict(boxstyle='round', facecolor='lightblue', alpha=0.5))
        print("✓ Bar chart: Average name length by country")
    except Exception:
        ax.clear()
        # Alternative: company name length by country
        if 'Company Length' in viz_df.columns:
            country_company_avg = (viz_df.groupby('Country')['Company Length']
                                   .mean().sort_values(ascending=False).head(8))
            positions = range(len(country_company_avg))
            plt.bar(positions, country_company_avg.values, color='skyblue')
            plt.title('📊 Average Company Name Length by Country', fontsize=14, fontweight='bold')
            plt.xlabel('Country', fontsize=12)
            plt.ylabel('Average Company Name Length', fontsize=12)
            plt.xticks(positions, country_company_avg.index, rotation=45, ha='right')
        else:
            plt.text(0.5, 0.5, 'Insufficient data for\ncategorical comparison',
                     ha='center', va='center', transform=ax.transAxes, fontsize=12)


def _plot_email_length_histogram(viz_df, numerical_features):
    """Draw the email-length histogram, or one for the first derived feature."""
    ax = plt.gca()
    if 'Email Length' in viz_df.columns:
        try:
            email_lengths = viz_df['Email Length'].dropna()
            n, bins, patches = plt.hist(email_lengths, bins=20, color='lightcoral', alpha=0.7, edgecolor='black')

            # Recolour each bar by its relative frequency
            fracs = n / n.max()
            norm = plt.Normalize(vmin=fracs.min(), vmax=fracs.max())
            for frac, patch in zip(fracs, patches):
                patch.set_facecolor(plt.cm.viridis(norm(frac)))

            plt.title('📈 Distribution of Email Address Lengths', fontsize=14, fontweight='bold', pad=20)
            plt.xlabel('Email Address Length (characters)', fontsize=12)
            plt.ylabel('Frequency (Number of Customers)', fontsize=12)
            plt.grid(True, alpha=0.3, axis='y')

            # Mean / median markers
            mean_length = email_lengths.mean()
            median_length = email_lengths.median()
            plt.axvline(mean_length, color='red', linestyle='--', linewidth=2, label=f'Mean: {mean_length:.1f}')
            plt.axvline(median_length, color='blue', linestyle='--', linewidth=2, label=f'Median: {median_length:.1f}')
            plt.legend()

            plt.text(0.98, 0.98, f'Std Dev: {email_lengths.std():.1f}\nRange: {email_lengths.min()}-{email_lengths.max()}',
                     transform=ax.transAxes, fontsize=10, verticalalignment='top', horizontalalignment='right',
                     bbox=dict(boxstyle='round', facecolor='yellow', alpha=0.5))
            print("✓ Histogram: Email length distribution")
        except Exception as e:
            plt.text(0.5, 0.5, f'Error creating histogram: {str(e)}',
                     ha='center', va='center', transform=ax.transAxes, fontsize=12)
    elif numerical_features:
        # Alternative: use the first available derived feature
        alt_feature = numerical_features[0]
        data = viz_df[alt_feature].dropna()
        plt.hist(data, bins=15, color='lightgreen', alpha=0.7, edgecolor='black')
        plt.title(f'📈 Distribution of {alt_feature}', fontsize=14, fontweight='bold')
        plt.xlabel(f'{alt_feature}', fontsize=12)
        plt.ylabel('Frequency', fontsize=12)
        plt.grid(True, alpha=0.3, axis='y')
        print(f"✓ Histogram: {alt_feature} distribution")
    else:
        plt.text(0.5, 0.5, 'No numerical data available\nfor distribution analysis',
                 ha='center', va='center', transform=ax.transAxes, fontsize=12)
        plt.title('📈 Distribution Analysis (Data Not Available)', fontsize=14)


def _plot_name_length_scatter(viz_df, numerical_features):
    """Draw the first-name vs last-name length scatter plot (or a fallback pair)."""
    ax = plt.gca()
    name_cols = ['First Name Length', 'Last Name Length']
    if all(col in viz_df.columns for col in name_cols):
        try:
            # Drop incomplete rows jointly so each x stays paired with the y
            # of the same customer.
            pairs = viz_df[name_cols].dropna()
            x_data = pairs[name_cols[0]]
            y_data = pairs[name_cols[1]]

            # Colour encodes row position only (see the colorbar label).
            scatter = plt.scatter(x_data, y_data, alpha=0.6, s=60, c=range(len(x_data)),
                                  cmap='plasma', edgecolors='black', linewidth=0.5)
            plt.title('🔗 Relationship: First Name vs Last Name Length', fontsize=14, fontweight='bold', pad=20)
            plt.xlabel('First Name Length (characters)', fontsize=12)
            plt.ylabel('Last Name Length (characters)', fontsize=12)
            plt.grid(True, alpha=0.3)

            # Least-squares trend line
            if len(x_data) > 1:
                trend_line = np.poly1d(np.polyfit(x_data, y_data, 1))
                x_sorted = x_data.sort_values()
                plt.plot(x_sorted, trend_line(x_sorted), "r--", alpha=0.8, linewidth=2)

            correlation = np.corrcoef(x_data, y_data)[0, 1]
            plt.text(0.02, 0.98, f'Correlation: {correlation:.3f}',
                     transform=ax.transAxes, fontsize=10, verticalalignment='top',
                     bbox=dict(boxstyle='round', facecolor='lightcyan', alpha=0.8))

            cbar = plt.colorbar(scatter, ax=ax)
            cbar.set_label('Customer Index', fontsize=10)
            print("✓ Scatter plot: First name vs last name length relationship")
        except Exception as e:
            plt.text(0.5, 0.5, f'Error creating scatter plot: {str(e)}',
                     ha='center', va='center', transform=ax.transAxes, fontsize=12)
    elif len(numerical_features) >= 2:
        # Alternative: use the first two derived features
        try:
            feature1, feature2 = numerical_features[0], numerical_features[1]
            pairs = viz_df[[feature1, feature2]].dropna()
            plt.scatter(pairs[feature1], pairs[feature2], alpha=0.6, s=50, color='purple')
            plt.title(f'🔗 Relationship: {feature1} vs {feature2}', fontsize=14, fontweight='bold')
            plt.xlabel(feature1, fontsize=12)
            plt.ylabel(feature2, fontsize=12)
            plt.grid(True, alpha=0.3)
            print(f"✓ Scatter plot: {feature1} vs {feature2} relationship")
        except Exception:
            plt.text(0.5, 0.5, 'Insufficient numerical data\nfor relationship analysis',
                     ha='center', va='center', transform=ax.transAxes, fontsize=12)
    else:
        plt.text(0.5, 0.5, 'Insufficient numerical data\nfor relationship analysis',
                 ha='center', va='center', transform=ax.transAxes, fontsize=12)
        plt.title('🔗 Relationship Analysis (Data Not Available)', fontsize=14)


def create_data_visualizations(df):
    """
    Build the 2x2 dashboard (line, bar, histogram, scatter) and save it.

    Date parts are derived from ``'Subscription Date'`` and ``'<col> Length'``
    features from First Name, Last Name, Company and Email, all on a copy of
    ``df``. The figure is written to ``customer_data_visualization.png`` and
    shown; the copy is then passed to ``create_additional_visualizations``.

    Args:
        df: Dataset to visualise. It is not modified.

    Returns:
        None.
    """
    print("\n" + "=" * 60)
    print("DATA VISUALIZATION")
    print("=" * 60)

    # Set up the plotting style
    plt.style.use('default')
    sns.set_palette("husl")

    fig = plt.figure(figsize=(20, 16))
    fig.suptitle('Customer Dataset Analysis - Comprehensive Visualization Dashboard',
                 fontsize=20, fontweight='bold', y=0.98)

    # Prepare data for visualizations
    viz_df = df.copy()

    # Extract subscription date information for time series analysis
    if 'Subscription Date' in df.columns:
        try:
            viz_df['Subscription DateTime'] = pd.to_datetime(df['Subscription Date'])
            subscribed = viz_df['Subscription DateTime'].dt
            viz_df['Subscription Year'] = subscribed.year
            viz_df['Subscription Month'] = subscribed.month
            viz_df['Subscription YearMonth'] = subscribed.to_period('M')
        except Exception:
            print("⚠ Could not parse subscription dates for time series analysis")

    # Derived numeric features: character counts of selected text columns
    numerical_features = []
    for source_col in ('First Name', 'Last Name', 'Company', 'Email'):
        if source_col in df.columns:
            feature = f'{source_col} Length'
            viz_df[feature] = df[source_col].astype(str).str.len()
            numerical_features.append(feature)
    print(f"Creating visualizations with {len(numerical_features)} numerical features...")

    # Visualization 1: Line Chart - Time Series of Subscriptions
    plt.subplot(2, 2, 1)
    _plot_subscription_trend(viz_df)

    # Visualization 2: Bar Chart - Average Name Length by Country
    plt.subplot(2, 2, 2)
    _plot_name_length_by_country(viz_df)

    # Visualization 3: Histogram - Distribution of Email Length
    plt.subplot(2, 2, 3)
    _plot_email_length_histogram(viz_df, numerical_features)

    # Visualization 4: Scatter Plot - Relationship between Name Lengths
    plt.subplot(2, 2, 4)
    _plot_name_length_scatter(viz_df, numerical_features)

    # Adjust layout (leaving room for the suptitle) and save
    plt.tight_layout(rect=[0, 0.03, 1, 0.95])
    try:
        plt.savefig('customer_data_visualization.png', dpi=300, bbox_inches='tight',
                    facecolor='white', edgecolor='none')
        print("\n✓ Visualization saved as 'customer_data_visualization.png'")
    except Exception as e:
        print(f"\n⚠ Could not save visualization: {e}")
    plt.show()

    # Create additional specialized visualizations
    create_additional_visualizations(viz_df)

    print("\n" + "=" * 60)
    print("DATA VISUALIZATION COMPLETED!")
    print("=" * 60)
    print("📊 Four main visualization types created:")
    print(" 1. ✓ Line Chart - Time series trends")
    print(" 2. ✓ Bar Chart - Categorical comparisons")
    print(" 3. ✓ Histogram - Distribution analysis")
    print(" 4. ✓ Scatter Plot - Relationship analysis")
    print("🎨 All plots include custom titles, labels, legends, and styling")
def create_additional_visualizations(df):
    """
    Create a second figure: country pie chart, subscription heatmap, box plot.

    Nothing is drawn unless ``df`` has a ``'Country'`` column. The heatmap
    also needs ``'Subscription Year'`` and ``'Subscription Month'``; the box
    plot needs ``'First Name Length'``. The figure is written to
    ``additional_customer_visualizations.png`` and shown. Any failure is
    reported on stdout rather than raised.

    Args:
        df: Dataset including whatever derived columns are available.

    Returns:
        ``df`` itself, unchanged.
    """
    print(f"\n📈 Creating additional specialized visualizations...")
    if 'Country' not in df.columns:
        return df

    try:
        plt.figure(figsize=(12, 8))

        # Pie chart for the 8 most frequent countries
        plt.subplot(2, 2, 1)
        country_counts = df['Country'].value_counts().head(8)
        colors = plt.cm.Set3(np.linspace(0, 1, len(country_counts)))
        wedges, texts, autotexts = plt.pie(country_counts.values, labels=country_counts.index,
                                           autopct='%1.1f%%', colors=colors, startangle=90)
        plt.title('🌍 Customer Distribution by Country (Top 8)', fontsize=14, fontweight='bold')
        # Make the percentage labels readable on the coloured wedges
        for autotext in autotexts:
            autotext.set_color('white')
            autotext.set_fontweight('bold')

        # Heatmap of subscriptions by year and month
        plt.subplot(2, 2, 2)
        if 'Subscription Year' in df.columns and 'Subscription Month' in df.columns:
            try:
                subscription_pivot = (df.groupby(['Subscription Year', 'Subscription Month'])
                                      .size().unstack(fill_value=0))
                sns.heatmap(subscription_pivot, annot=True, fmt='d', cmap='YlOrRd',
                            cbar_kws={'label': 'Number of Subscriptions'})
                plt.title('🔥 Subscription Heatmap (Year vs Month)', fontsize=14, fontweight='bold')
                plt.xlabel('Month', fontsize=12)
                plt.ylabel('Year', fontsize=12)
            except Exception:
                plt.text(0.5, 0.5, 'Heatmap data\nnot available',
                         ha='center', va='center', transform=plt.gca().transAxes, fontsize=12)

        # Box plot of first-name lengths for the 6 most frequent countries
        plt.subplot(2, 1, 2)
        if 'First Name Length' in df.columns:
            country_name_data = []
            country_labels = []
            for country in df['Country'].value_counts().head(6).index:
                country_data = df.loc[df['Country'] == country, 'First Name Length'].dropna()
                if len(country_data) > 0:
                    country_name_data.append(country_data)
                    country_labels.append(country)

            if country_name_data:
                box_plot = plt.boxplot(country_name_data, patch_artist=True)
                # Label the boxes through xticks (default box positions are
                # 1..n) instead of the boxplot `labels` keyword, which is
                # deprecated in recent Matplotlib releases.
                plt.xticks(range(1, len(country_labels) + 1), country_labels, rotation=45)
                box_colors = plt.cm.viridis(np.linspace(0, 1, len(box_plot['boxes'])))
                for patch, color in zip(box_plot['boxes'], box_colors):
                    patch.set_facecolor(color)
                    patch.set_alpha(0.7)

            plt.title('📦 Distribution of Name Lengths by Country', fontsize=14, fontweight='bold')
            plt.xlabel('Country', fontsize=12)
            plt.ylabel('First Name Length', fontsize=12)
            plt.grid(True, alpha=0.3, axis='y')

        plt.tight_layout()
        try:
            plt.savefig('additional_customer_visualizations.png', dpi=300, bbox_inches='tight')
            print("✓ Additional visualizations saved as 'additional_customer_visualizations.png'")
        except Exception as e:
            # Still best-effort, but no longer silent about a failed save.
            print(f"⚠ Could not save additional visualizations: {e}")
        plt.show()
    except Exception as e:
        print(f"⚠ Could not create additional visualizations: {e}")
    return df
def save_cleaned_dataset(df, filename='customers_cleaned.csv'):
    """
    Write ``df`` to ``filename`` as CSV, without the index column.

    The outcome (success or the error text) is printed; failures are not
    raised to the caller.
    """
    try:
        df.to_csv(filename, index=False)
        outcome = f"\n✓ Cleaned dataset saved as '{filename}'"
    except Exception as err:
        outcome = f"\n❌ Error saving cleaned dataset: {err}"
    print(outcome)
def main():
    """Run the whole pipeline: load and clean, analyse, visualise, then save."""
    cleaned_data = load_and_explore_dataset()
    if cleaned_data is None:
        # Loading failed; the loader has already printed the reason.
        return

    # Perform basic data analysis
    analyzed_data = perform_basic_data_analysis(cleaned_data)

    # Create comprehensive data visualizations
    print("\nStarting data visualization process...")
    create_data_visualizations(analyzed_data)

    # The script runs unattended, so the result is always saved. It used to
    # print a y/n question that was never read from stdin.
    print("\nSaving the cleaned dataset automatically (non-interactive run)...", end="")
    save_cleaned_dataset(analyzed_data)

    print("\n🎉 Complete analysis with visualizations finished!")
    print("📋 Summary: Data loading ✓ | Exploration ✓ | Cleaning ✓ | Analysis ✓ | Visualization ✓")
    print("🚀 Dataset is ready for machine learning or advanced analytics!")
    print("📊 Visualization files created: 'customer_data_visualization.png' & 'additional_customer_visualizations.png'")


if __name__ == "__main__":
    main()