# fitdata.py
# -*- coding: utf-8 -*-
"""
Created on Mon Jun 24 16:31:23 2019
@author: michaelboles
"""
# set up working directory
import os

# Known project locations, tried in order. The first one that exists becomes
# the working directory so the relative './data/...' paths used by this script
# resolve. If none exists (e.g. on another machine) the current directory is
# left untouched; in that case launch the script from the project root.
_PROJECT_DIRS = (
    'C:\\Users\\bolesmi\\Lam\\Coding\\Python\\2019\\Realestate',  # PC
    '/Users/michaelboles/Michael/Coding/2019/Realestate',  # Mac
)
for _project_dir in _PROJECT_DIRS:
    if os.path.isdir(_project_dir):
        os.chdir(_project_dir)
        break

# import packages
import pandas as pd
import statsmodels.formula.api as smf

# import full dataset (path is relative to the working directory chosen above)
data_all_raw = pd.read_csv('./data/listings/data_all.csv')
# format/clean -- drop rows containing NaNs or zeros, then remove outliers.
# Every column of the raw file is kept; no column selection happens here.
data_all_clean = data_all_raw.dropna()

# A row is kept only if none of its values is zero and it lies below every
# outlier cap. Column names still contain spaces at this point.
keep_rows = (
    (data_all_clean != 0).all(axis=1)
    & (data_all_clean['Home size'] < 5000)
    & (data_all_clean['Lot size'] < 20000)
    & (data_all_clean['Price'] < 5000000)
    & (data_all_clean['Beds'] < 6)
    & (data_all_clean['Baths'] < 6)
)

# .copy() gives an independent frame, so renaming its columns here and adding
# columns to it later never operates on a slice of the raw data.
data_all = data_all_clean[keep_rows].copy()

# replace spaces in column names with underscores; the model formulas further
# down refer to the columns by these underscore names (e.g. Home_size)
data_all.columns = data_all.columns.str.replace(' ', '_', regex=False)

# check for multicollinearity
# Restricted to numeric/bool columns: DataFrame.corr() no longer drops
# non-numeric columns by itself in recent pandas and would raise on them.
correlations = data_all.select_dtypes(include=['number', 'bool']).corr()

# query count, mean, stdev etc. of selected data
# (result is only displayed when run interactively; it is not stored)
data_all.describe()

## option to select one zipcode
#zipcode = 94401
#data_subset = data_all[data_all['Zip'] == zipcode]
### BUILD MODEL ###
# candidate model specifications tried during backward elimination
formula_1 = 'Price ~ Home_size + Lot_size + Beds + Baths + Commute_time + School_score'
formula_2 = 'Price ~ Home_size + Lot_size + Beds + Commute_time + School_score'
formula_3 = 'Price ~ Home_size + Lot_size + Commute_time + School_score'
formula_4 = 'Price ~ Home_size + Lot_size + Beds + Baths'

# Ordinary least squares fit of the chosen specification.
# NOTE(review): an earlier comment here said formula_3 was settled on, but the
# specification actually fitted is formula_4 -- confirm which one is intended.
regressor = smf.ols(formula_4, data = data_all).fit()

# keep the fit report both as a summary object and as plain text
summary = regressor.summary()
summary_text = summary.as_text()

## try same for only one zipcode
#data_subset = data_all.loc[data_all['Zip'] == 95126]
#formula_subset_1 = 'Price ~ Home_size + Lot_size + Beds + Baths + Commute_time + School_score'
#formula_subset_2 = 'Price ~ Home_size + Lot_size + Baths'
#regressor = smf.ols(formula_subset_2, data = data_subset).fit()
#summary_subset = regressor.summary()
#summary_subset_text = summary_subset.as_text()

# get variance inflation factor: one value per column of the fitted model's
# design matrix, in column order
from statsmodels.stats.outliers_influence import variance_inflation_factor
variables = regressor.model.exog
vif = []
for column_index in range(variables.shape[1]):
    vif.append(variance_inflation_factor(variables, column_index))
### sklearn ###
# set features (independent) and labels (dependent)
x = data_all[['Home_size', 'Lot_size', 'Commute_time', 'School_score']]
y = data_all['Price']
# before commute, school quality data
x2 = data_all[['Home_size', 'Lot_size', 'Beds', 'Baths']]

# fit data
from sklearn.linear_model import LinearRegression

# One estimator per feature set. A model fitted on x2 cannot be used to predict
# from x: the two frames hold different columns, so the coefficients would be
# applied to the wrong features (recent scikit-learn rejects this outright).
regressor_surroundings = LinearRegression()
regressor_surroundings.fit(x, y)    # listing + surroundings data
regressor = LinearRegression()
regressor.fit(x2, y)                # listing data only
#regressor.intercept_, regressor.coef_

# predict values based on model, each from the features it was fitted on
y_pred = regressor_surroundings.predict(x) # listing + surroundings data
y_pred2 = regressor.predict(x2) # listing data only

# calculate difference between predicted and actual prices
# (only the listing-data-only residual is stored below; y_pred is not)
diff = (y - y_pred2).round(6)

# add difference to full data set; assign() builds a new frame instead of
# writing into data_all in place, which may be a slice of the raw data
data_all = data_all.assign(**{'Price difference 2': diff})
#data_all.to_csv('data_all_price_predictions.csv')
# import fitted data (replaces the in-memory data_all with the saved
# predictions file, which must contain both price-difference columns used below)
data_all = pd.read_csv('./data/listings/data_all_price_predictions.csv')

# set up histogram text box
description = data_all['Price difference'].describe()
# Both statistics are reported in millions of dollars, so both are divided by
# 1,000,000 (the median was previously divided by 100,000 while labelled "M").
median_millions = description['50%'] / 1000000
stdev_millions = description['std'] / 1000000
# Emit the minus sign only for a negative median, so that a positive median is
# not rendered as "-$-0.05 M".
median_sign = '-' if median_millions < 0 else ''
prices_textbox = 'Median = %s$%.2f M \nStdev = $%.2f M' % (median_sign, abs(median_millions), stdev_millions)

# plot price difference
from plotfunctions import plothist2
figure_name = 'price_diff_loc_data.jpg'
# NOTE(review): binwidth is never used; the call below passes 0.1 as its third
# argument -- confirm which value is intended.
binwidth = 100
# price differences rescaled by 1,000,000 to match the '($M)' axis label
data1 = data_all['Price difference']/1000000
data2 = data_all['Price difference 2']/1000000
plothist2(data1, data2, 0.1, prices_textbox, -1, 1, 'Actual - predicted price ($M)', 'Counts', figure_name)
# plot price difference in seaborn
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt

# Plot histograms: both series are drawn on the same 5x5 figure, counts only
# (kde=False), with thin black bar outlines
plt.figure(figsize=(5,5))
# NOTE(review): distplot is deprecated in newer seaborn releases; it is kept
# here so the script still runs on the seaborn version it was written for.
ax1 = sns.distplot(data1, kde=False, color='blue', hist_kws=dict(edgecolor="k", linewidth=.5))
ax2 = sns.distplot(data2, kde=False, color='orangered', hist_kws=dict(edgecolor="k", linewidth=.5))
plt.xlabel('Actual - predicted price ($M)', fontsize=13) #, fontweight='bold')
plt.ylabel('Counts', fontsize=13) #, fontweight='bold')
#plt.legend()

# ticks every 0.25 from -1 to 1; the 1.1 stop makes arange include 1.0
plt.xticks(np.arange(-1, 1.1, 0.25))
plt.xlim(-1,1)

# Forward slashes, like the other relative paths in this script: the previous
# '.\Images\...' literal relied on invalid escape sequences ('\I', '\p') and
# only resolved on Windows. The Images directory must already exist.
plt.savefig('./Images/pricediffhist.jpg', dpi=600)