KamiLimu1 · root458 · Dec 12, 2022
diff --git a/exploratory_data_analysis.py b/exploratory_data_analysis.py
@@ -0,0 +1,298 @@
+# -*- coding: utf-8 -*-
+"""Exploratory Data Analysis.ipynb
+
+Automatically generated by Colaboratory.
+
+Original file is located at
+    https://colab.research.google.com/drive/1hcGgs0D5ZP3eIeYCONrR1fMvT4iQ5Agg
+"""
+
+# Commented out IPython magic to ensure Python compatibility.
+# Import libraries for EDA
+
+import pandas as pd
+import numpy as np
+import matplotlib as mpl
+import matplotlib.pyplot as plt
+# %matplotlib inline
+import seaborn as sns
+import scipy.stats as st
+from sklearn import ensemble, tree, linear_model
+import missingno as msno
+
+# Load the train and test sets from CSV files in the working directory
+
+train = pd.read_csv('train.csv')
+test = pd.read_csv('test.csv')
+
+# Show dataset description (bare expressions like this only render in a notebook)
+
+train.describe()
+
+# Show first 5 rows to get a feel of the data
+
+train.head()
+
+# Show last 5 rows to get a feel of the data
+
+train.tail()
+
+# Show data frame shapes
+# Test has fewer columns since it has no target
+
+train.shape , test.shape
+
+# Examine numerical features in the train dataset
+
+numeric_features = train.select_dtypes(include=[np.number])
+
+numeric_features.columns
+
+# Examine categorical features in the train dataset
+# np.object is deprecated therefore just use object to specify categorical data
+
+categorical_features = train.select_dtypes(include=[object])
+
+categorical_features.columns
+
+# Visualise missing values as a nullity matrix for a random sample of 250 rows
+# (sample() is called without a seed, so the rows differ between runs)
+msno.matrix(train.sample(250))
+
+# Measure nullity correlation: how strongly the presence or absence of
+# one variable affects the presence of another
+msno.heatmap(train)
+
+# Bar chart of nullity by column for a random sample of 1000 rows
+# (this shows per-column completeness, not a correlation)
+msno.bar(train.sample(1000))
+
+# Use dendrogram to allow you to more fully correlate variable completion,
+# revealing trends deeper than the pairwise ones visible
+# in the correlation heatmap
+
+msno.dendrogram(train)
+
+# Estimate skewness and Kurtosis
+
+train.skew(), train.kurt()
+
+y = train['SalePrice']
+plt.figure(1); plt.title('Johnson SU')
+sns.distplot(y, kde=False, fit=st.johnsonsu)
+plt.figure(2); plt.title('Normal')
+sns.distplot(y, kde=False, fit=st.norm)
+plt.figure(3); plt.title('Log Normal')
+sns.distplot(y, kde=False, fit=st.lognorm)
+
+# Show skewness
+
+sns.distplot(train.skew(),color='blue',axlabel ='Skewness')
+
+#  Show Kurtosis
+
+plt.figure(figsize = (12,8))
+sns.distplot(train.kurt(),color='r',axlabel ='Kurtosis',norm_hist= False, kde = True,rug = False)
+#plt.hist(train.kurt(),orientation = 'vertical',histtype = 'bar',label ='Kurtosis', color ='blue')
+plt.show()
+
+plt.hist(train['SalePrice'],orientation = 'vertical',histtype = 'bar', color ='blue')  # raw SalePrice histogram
+plt.show()
+# Log-transform the target and plot it again to compare the distribution shape
+target = np.log(train['SalePrice'])
+target.skew()  # bare expression: the value is only displayed when run as a notebook cell
+plt.hist(target,color='blue')
+
+correlation = numeric_features.corr()
+print(correlation['SalePrice'].sort_values(ascending = False),'\n')
+
+# Show corelation heatmap
+
+f , ax = plt.subplots(figsize = (14,12))
+
+plt.title('Correlation of Numeric Features with Sale Price',y=1,size=16)
+
+sns.heatmap(correlation,square = True,  vmax=0.8)
+
+# Observing correlation with Zoomed Heat Map
+
+k= 11
+cols = correlation.nlargest(k,'SalePrice')['SalePrice'].index
+print(cols)
+cm = np.corrcoef(train[cols].values.T)
+f , ax = plt.subplots(figsize = (14,12))
+sns.heatmap(cm, vmax=.8, linewidths=0.01,square=True,annot=True,cmap='viridis',
+            linecolor="white",xticklabels = cols.values ,annot_kws = {'size':12},yticklabels = cols.values)
+
+# Pair plot with identified columns
+
+sns.set()
+columns = ['SalePrice','OverallQual','TotalBsmtSF','GrLivArea','GarageArea','FullBath','YearBuilt','YearRemodAdd']
+sns.pairplot(train[columns],size = 2 ,kind ='scatter',diag_kind='kde')
+plt.show()
+
+# Scatter plots between the most correlated variables
+
+fig, ((ax1, ax2), (ax3, ax4),(ax5,ax6)) = plt.subplots(nrows=3, ncols=2, figsize=(14,10))
+OverallQual_scatter_plot = pd.concat([train['SalePrice'],train['OverallQual']],axis = 1)
+sns.regplot(x='OverallQual',y = 'SalePrice',data = OverallQual_scatter_plot,scatter= True, fit_reg=True, ax=ax1)
+TotalBsmtSF_scatter_plot = pd.concat([train['SalePrice'],train['TotalBsmtSF']],axis = 1)
+sns.regplot(x='TotalBsmtSF',y = 'SalePrice',data = TotalBsmtSF_scatter_plot,scatter= True, fit_reg=True, ax=ax2)
+GrLivArea_scatter_plot = pd.concat([train['SalePrice'],train['GrLivArea']],axis = 1)
+sns.regplot(x='GrLivArea',y = 'SalePrice',data = GrLivArea_scatter_plot,scatter= True, fit_reg=True, ax=ax3)
+GarageArea_scatter_plot = pd.concat([train['SalePrice'],train['GarageArea']],axis = 1)
+sns.regplot(x='GarageArea',y = 'SalePrice',data = GarageArea_scatter_plot,scatter= True, fit_reg=True, ax=ax4)
+FullBath_scatter_plot = pd.concat([train['SalePrice'],train['FullBath']],axis = 1)
+sns.regplot(x='FullBath',y = 'SalePrice',data = FullBath_scatter_plot,scatter= True, fit_reg=True, ax=ax5)
+YearBuilt_scatter_plot = pd.concat([train['SalePrice'],train['YearBuilt']],axis = 1)
+sns.regplot(x='YearBuilt',y = 'SalePrice',data = YearBuilt_scatter_plot,scatter= True, fit_reg=True, ax=ax6)
+YearRemodAdd_scatter_plot = pd.concat([train['SalePrice'],train['YearRemodAdd']],axis = 1)
+YearRemodAdd_scatter_plot.plot.scatter('YearRemodAdd','SalePrice')
+
+saleprice_overall_quality= train.pivot_table(index ='OverallQual',values = 'SalePrice', aggfunc = np.median)
+saleprice_overall_quality.plot(kind = 'bar',color = 'blue')
+plt.xlabel('Overall Quality')
+plt.ylabel('Median Sale Price')
+plt.show()
+
+# Box plot - OverallQual
+
+var = 'OverallQual'
+data = pd.concat([train['SalePrice'], train[var]], axis=1)
+f, ax = plt.subplots(figsize=(12, 8))
+fig = sns.boxplot(x=var, y="SalePrice", data=data)
+fig.axis(ymin=0, ymax=800000);
+
+# Box plot - Neighborhood
+
+var = 'Neighborhood'
+data = pd.concat([train['SalePrice'], train[var]], axis=1)
+f, ax = plt.subplots(figsize=(16, 10))
+fig = sns.boxplot(x=var, y="SalePrice", data=data)
+fig.axis(ymin=0, ymax=800000);
+xt = plt.xticks(rotation=45)
+
+# Count Plot - Neighborhood
+
+plt.figure(figsize = (12, 6))
+sns.countplot(x = 'Neighborhood', data = data)
+xt = plt.xticks(rotation=45)
+
+for c in categorical_features:
+    train[c] = train[c].astype('category')
+    if train[c].isnull().any():
+        train[c] = train[c].cat.add_categories(['MISSING'])
+        train[c] = train[c].fillna('MISSING')
+
+def boxplot(x, y, **kwargs):
+    sns.boxplot(x=x, y=y)
+    x=plt.xticks(rotation=90)
+f = pd.melt(train, id_vars=['SalePrice'], value_vars=categorical_features)
+g = sns.FacetGrid(f, col="variable",  col_wrap=2, sharex=False, sharey=False, size=5)
+g = g.map(boxplot, "value", "SalePrice")
+
+var = 'SaleType'
+data = pd.concat([train['SalePrice'], train[var]], axis=1)
+f, ax = plt.subplots(figsize=(16, 10))
+fig = sns.boxplot(x=var, y="SalePrice", data=data)
+fig.axis(ymin=0, ymax=800000);
+xt = plt.xticks(rotation=45)
+
+var = 'SaleCondition'
+data = pd.concat([train['SalePrice'], train[var]], axis=1)
+f, ax = plt.subplots(figsize=(16, 10))
+fig = sns.boxplot(x=var, y="SalePrice", data=data)
+fig.axis(ymin=0, ymax=800000);
+xt = plt.xticks(rotation=45)
+
+# ViolinPlot - Functional vs.SalePrice
+
+sns.violinplot('Functional', 'SalePrice', data = train)
+
+# FactorPlot - FirePlaceQC vs. SalePrice
+
+sns.factorplot('FireplaceQu', 'SalePrice', data = train, color = 'm', \
+               estimator = np.median, order = ['Ex', 'Gd', 'TA', 'Fa', 'Po'], size = 4.5,  aspect=1.35)
+
+# Facet Grid Plot - FirePlace QC vs.SalePrice
+
+g = sns.FacetGrid(train, col = 'FireplaceQu', col_wrap = 3, col_order=['Ex', 'Gd', 'TA', 'Fa', 'Po'])
+g.map(sns.boxplot, 'Fireplaces', 'SalePrice', order = [1, 2, 3], palette = 'Set2')
+
+# PointPlot
+
+plt.figure(figsize=(8,10))
+g1 = sns.pointplot(x='Neighborhood', y='SalePrice', 
+                   data=train, hue='LotShape')
+g1.set_xticklabels(g1.get_xticklabels(),rotation=90)
+g1.set_title("Lotshape Based on Neighborhood", fontsize=15)
+g1.set_xlabel("Neighborhood")
+g1.set_ylabel("Sale Price", fontsize=12)
+plt.show()
+
+# Missing Value Analysis
+# Numeric Features
+
+total = numeric_features.isnull().sum().sort_values(ascending=False)
+percent = (numeric_features.isnull().sum()/numeric_features.isnull().count()).sort_values(ascending=False)
+missing_data = pd.concat([total, percent], axis=1,join='outer', keys=['Total Missing Count', '% of Total Observations'])
+missing_data.index.name =' Numeric Feature'
+
+missing_data.head(20)
+
+# Missing values for all numeric features in Bar chart Representation
+
+missing_values = numeric_features.isnull().sum(axis=0).reset_index()
+missing_values.columns = ['column_name', 'missing_count']
+missing_values = missing_values.loc[missing_values['missing_count']>0]
+missing_values = missing_values.sort_values(by='missing_count')
+
+ind = np.arange(missing_values.shape[0])
+width = 0.1
+fig, ax = plt.subplots(figsize=(12,3))
+rects = ax.barh(ind, missing_values.missing_count.values, color='b')
+ax.set_yticks(ind)
+ax.set_yticklabels(missing_values.column_name.values, rotation='horizontal')
+ax.set_xlabel("Missing Observations Count")
+ax.set_title("Missing Observations Count - Numeric Features")
+plt.show()
+
+# Categorical Features
+
+total = categorical_features.isnull().sum().sort_values(ascending=False)
+percent = (categorical_features.isnull().sum()/categorical_features.isnull().count()).sort_values(ascending=False)
+missing_data = pd.concat([total, percent], axis=1,join='outer', keys=['Total Missing Count', ' % of Total Observations'])
+missing_data.index.name ='Feature'
+missing_data.head(20)
+
+# Missing values for Categorical features in Bar chart Representation
+
+missing_values = categorical_features.isnull().sum(axis=0).reset_index()
+missing_values.columns = ['column_name', 'missing_count']
+missing_values = missing_values.loc[missing_values['missing_count']>0]
+missing_values = missing_values.sort_values(by='missing_count')
+
+ind = np.arange(missing_values.shape[0])
+width = 0.9
+fig, ax = plt.subplots(figsize=(12,18))
+rects = ax.barh(ind, missing_values.missing_count.values, color='red')
+ax.set_yticks(ind)
+ax.set_yticklabels(missing_values.column_name.values, rotation='horizontal')
+ax.set_xlabel("Missing Observations Count")
+ax.set_title("Missing Observations Count - Categorical Features")
+plt.show()
+
+# Categorical Feature Exploration
+
+for column_name in train.columns:
+    if train[column_name].dtypes == 'object':
+        train[column_name] = train[column_name].fillna(train[column_name].mode().iloc[0])
+        unique_category = len(train[column_name].unique())
+        print("Feature '{column_name}' has '{unique_category}' unique categories".format(column_name = column_name,
+                                                                                         unique_category=unique_category))
+
+for column_name in test.columns:
+    if test[column_name].dtypes == 'object':
+        test[column_name] = test[column_name].fillna(test[column_name].mode().iloc[0])
+        unique_category = len(test[column_name].unique())
+        print("Features in test set '{column_name}' has '{unique_category}' unique categories".format(column_name = column_name, unique_category=unique_category))