-
Notifications
You must be signed in to change notification settings - Fork 12
/
plotboxplots.py
191 lines (150 loc) · 8.33 KB
/
plotboxplots.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon May 27 10:03:23 2019
@author: michaelboles
"""
# Move into the project root first: every data path used below is relative to it.
import os
project_dir = '/Users/michaelboles/Michael/Coding/2019/Realestate' # Mac
#project_dir = 'C:\\Users\\bolesmi\\Lam\\Coding\\Python\\2019\\Realestate' # PC
os.chdir(project_dir)

# Packages used for the tables and the figures.
import pandas as pd
import matplotlib.pyplot as plt

# Input files, resolved against the working directory set above.
listings_csv = './Data/listings/data_bay.csv'
residuals_csv = './Data/listings/data_all_price_predictions.csv'

# Master listings dataset.
data_bay = pd.read_csv(listings_csv)
# Listings with price residuals (actual - predicted).
data_residuals = pd.read_csv(residuals_csv)
# Tally listings per city/town. groupby().count() counts the non-null cells of
# every column; the column at position 1 is kept as the per-city 'Count'.
city_counts = data_bay.groupby('City').count().iloc[:, 1].to_frame(name='Count')

# Cities with the most listings come first.
city_counts_sorted = city_counts.sort_values(by='Count', ascending=False)

# Empty summary table, one (empty) column per statistic.
# NOTE(review): not filled in or read anywhere in the code visible here.
city_info = pd.DataFrame(
    {column: [] for column in ['City', 'Count', 'Average Price', 'Stdev']}
)

# Cities kept when filtering the master listings table.
cities_of_interest = [
    'San Francisco', 'San Jose', 'Oakland', 'Berkeley', 'San Leandro',
    'Hayward', 'Fremont', 'Richmond', 'Vallejo', 'Antioch', 'Walnut Creek',
    'Orinda', 'San Mateo', 'Redwood City', 'Palo Alto', 'Mountain View',
    'Daly City', 'Woodside', 'Menlo Park',
    'Santa Clara', 'Cupertino', 'Los Altos', 'Los Gatos', 'Sunnyvale',
    'Mill Valley', 'Tiburon', 'Sausalito', 'Hillsborough', 'Piedmont',
]

# Cities kept when filtering the residuals table. Compared with the list above
# it drops Woodside and Hillsborough and adds East Palo Alto, San Anselmo and
# Alameda.
cities_of_interest2 = [
    'San Francisco', 'San Jose', 'Oakland', 'Berkeley', 'San Leandro',
    'Hayward', 'Fremont', 'Richmond', 'Vallejo', 'Antioch', 'Walnut Creek',
    'Orinda', 'San Mateo', 'Redwood City', 'Palo Alto', 'Mountain View',
    'Daly City', 'Menlo Park', 'East Palo Alto', 'San Anselmo',
    'Santa Clara', 'Cupertino', 'Los Altos', 'Los Gatos', 'Sunnyvale',
    'Mill Valley', 'Tiburon', 'Sausalito', 'Alameda', 'Piedmont',
]
# Keep only the listings located in the cities of interest. The explicit
# .copy() makes data_of_interest an independent frame, so the column added
# below modifies that frame itself rather than a filtered view of data_bay.
data_of_interest = data_bay[data_bay['City'].isin(cities_of_interest)].copy()
data_of_interest_residuals = data_residuals[data_residuals['City'].isin(cities_of_interest2)]

# Add price per square foot of lot as the column at position 6.
data_of_interest.insert(loc=6, column='Price per lot sqft',
                        value=data_of_interest['Price'] / data_of_interest['Lot size'])


def _city_order_by_median(listings, value_column):
    """Return the cities of *listings* ordered by ascending median of *value_column*.

    Only *value_column* is aggregated. Taking the median of the whole frame
    and then picking a column by position is fragile: it depends on the CSV
    column layout, and it fails on pandas versions that no longer drop
    non-numeric columns silently during a group-wise median.

    Parameters
    ----------
    listings : pandas.DataFrame
        Table with a 'City' column and the column named by *value_column*.
    value_column : str
        Name of the numeric column whose per-city median defines the order.

    Returns
    -------
    pandas.DataFrame
        Two columns, 'City' and *value_column* (the per-city median), with
        rows sorted from the lowest to the highest median.
    """
    medians = listings.groupby('City')[value_column].median()
    return medians.sort_values(ascending=True).to_frame().reset_index()


# City orderings used for the x axis of each figure (lowest median first).
city_order_price = _city_order_by_median(data_of_interest, 'Price')
city_order_pricesqft = _city_order_by_median(data_of_interest, 'Price per lot sqft')
city_order_residuals = _city_order_by_median(data_of_interest_residuals, 'Price difference')
# Figure: home price by city, drawn as a box plot with the individual listings
# overlaid as a jittered strip plot.
import seaborn as sns
import matplotlib.ticker as ticker

fig, ax = plt.subplots(1, 1, figsize=(60, 30))

# Both layers share the same data, axes and city order (from city_order_price).
price_city_order = list(city_order_price['City'])
sns.boxplot(x='City', y='Price', data=data_of_interest, order=price_city_order,
            showfliers=False, linewidth=5, ax=ax)
sns.stripplot(x='City', y='Price', data=data_of_interest, order=price_city_order,
              jitter=0.25, size=15, linewidth=3, edgecolor='black', alpha=0.5,
              ax=ax)

# Tick labels and axis titles.
plt.xticks(rotation=45, fontname='Helvetica', fontsize=42, ha='right')
plt.yticks(fontname='Helvetica', fontsize=42)
ax.set_xlabel('Location', fontsize=55, fontname='Arial', fontweight='bold')
ax.set_ylabel('Single Family Home Price ($M)', fontsize=55, fontname='Arial',
              fontweight='bold')

# The y axis is limited to 0-8,000,000 and its tick labels are divided by
# `scale`, so they read in millions as the axis title states.
scale = 1000000
ax.set_ylim(0, 8000000)
ax.yaxis.labelpad = 25
ticks = ticker.FuncFormatter(lambda y, pos: format(y / scale, 'g'))

# Heavier tick marks and frame to match the large figure size.
for axis in (ax.xaxis, ax.yaxis):
    axis.set_tick_params(width=3, length=15)
ax.yaxis.set_major_formatter(ticks)
for spine in ax.spines.values():
    spine.set_linewidth(3)
# Figure: price per square foot of lot by city, using the same box + strip
# layout as the price figure.
fig, ax = plt.subplots(1, 1, figsize=(60, 30))

# Both layers share the same data, axes and city order (from city_order_pricesqft).
pricesqft_city_order = list(city_order_pricesqft['City'])
sns.boxplot(x='City', y='Price per lot sqft', data=data_of_interest,
            order=pricesqft_city_order, showfliers=False, linewidth=5, ax=ax)
sns.stripplot(x='City', y='Price per lot sqft', data=data_of_interest,
              order=pricesqft_city_order, jitter=0.25, size=15,
              linewidth=3, edgecolor='black', alpha=0.5, ax=ax)

# Tick labels and axis titles.
plt.xticks(rotation=45, fontname='Helvetica', fontsize=42, ha='right')
plt.yticks(fontname='Helvetica', fontsize=42)
ax.set_xlabel('Location', fontsize=55, fontname='Arial', fontweight='bold')
ax.set_ylabel('Land Price Per Square Foot ($)', fontsize=55, fontname='Arial',
              fontweight='bold')

# The y axis is limited to 0-2000; tick values are printed unscaled.
ax.set_ylim(0, 2000)
ax.yaxis.labelpad = 25
ticks = ticker.FuncFormatter(lambda y, pos: format(y, 'g'))

# Heavier tick marks and frame to match the large figure size.
for axis in (ax.xaxis, ax.yaxis):
    axis.set_tick_params(width=3, length=15)
ax.yaxis.set_major_formatter(ticks)
for spine in ax.spines.values():
    spine.set_linewidth(3)
# Figure: actual minus predicted price by city, using the same box + strip
# layout as the other figures.
fig, ax = plt.subplots(1, 1, figsize=(60, 30))

# Dotted reference line at zero, i.e. where actual and predicted prices agree.
ax.axhline(y=0, color='k', linestyle=':', linewidth=2)

# Both layers share the same data, axes and city order (from city_order_residuals).
residual_city_order = list(city_order_residuals['City'])
sns.boxplot(x='City', y='Price difference', data=data_of_interest_residuals,
            order=residual_city_order, showfliers=False, linewidth=5, ax=ax)
sns.stripplot(x='City', y='Price difference', data=data_of_interest_residuals,
              order=residual_city_order, jitter=0.25, size=15,
              linewidth=3, edgecolor='black', alpha=0.5, ax=ax)

# Tick labels and axis titles.
plt.xticks(rotation=45, fontname='Helvetica', fontsize=42, ha='right')
plt.yticks(fontname='Helvetica', fontsize=42)
ax.set_xlabel('Location', fontsize=55, fontname='Arial', fontweight='bold')
ax.set_ylabel('Actual - predicted price ($M)', fontsize=55, fontname='Arial',
              fontweight='bold')

# The y axis is limited to -1,200,000..2,500,000 and its tick labels are
# divided by `scale`, so they read in millions as the axis title states.
scale = 1000000
ax.set_ylim(-1200000, 2500000)
ax.yaxis.labelpad = 25
ticks = ticker.FuncFormatter(lambda y, pos: format(y / scale, 'g'))

# Heavier tick marks and frame to match the large figure size.
for axis in (ax.xaxis, ax.yaxis):
    axis.set_tick_params(width=3, length=15)
ax.yaxis.set_major_formatter(ticks)
for spine in ax.spines.values():
    spine.set_linewidth(3)
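# other seaborn plot options (disabled; kept for reference only)
# NOTE: the snippets below refer to `city_order`, which is not defined in the
# code above; substitute `city_order_price` before re-enabling any of them.
#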
## boxenplot
#fig, ax = plt.subplots(1, 1, figsize = (20,10))
#ax = sns.boxenplot(x = 'City', y = 'Price', data = data_of_interest,
# outlier_prop = 0.01, order = list(city_order['City']))
#plt.xticks(rotation=45)
#plt.xlabel('City or Town', fontsize = 18, fontname = 'Arial', fontweight = 'bold')
#plt.ylabel('Single Family Home Price ($M)', fontsize = 18, fontweight = 'bold')
#ax.set_ylim(0,8000000)
#ticks = ticker.FuncFormatter(lambda y, pos: '{0:g}'.format(y/scale))
#ax.yaxis.set_major_formatter(ticks)
#
## violinplot
#fig, ax = plt.subplots(1, 1, figsize = (20,10))
#ax = sns.violinplot(x = 'City', y = 'Price', data = data_of_interest,
# outlier_prop = 0.01, order = list(city_order['City']))
#plt.xticks(rotation=45)
#plt.xlabel('City or Town', fontsize = 18, fontname = 'Arial', fontweight = 'bold')
#plt.ylabel('Single Family Home Price ($M)', fontsize = 18, fontweight = 'bold')
#ax.set_ylim(0,8000000)
#ticks = ticker.FuncFormatter(lambda y, pos: '{0:g}'.format(y/scale))
#ax.yaxis.set_major_formatter(ticks)
#
## stripplot
#fig, ax = plt.subplots(1, 1, figsize = (20,10))
#ax = sns.stripplot(x = 'City', y = 'Price', data = data_of_interest,
# order = list(city_order['City']))
#plt.xticks(rotation=45)
#plt.xlabel('City or Town', fontsize = 18, fontname = 'Arial', fontweight = 'bold')
#plt.ylabel('Single Family Home Price ($M)', fontsize = 18, fontweight = 'bold')
#ax.set_ylim(0,8000000)
#ticks = ticker.FuncFormatter(lambda y, pos: '{0:g}'.format(y/scale))
#ax.yaxis.set_major_formatter(ticks)
#
## swarmplot
#fig, ax = plt.subplots(1, 1, figsize = (20,10))
#ax = sns.swarmplot(x = 'City', y = 'Price', data = data_of_interest,
# order = list(city_order['City']))
#plt.xticks(rotation=45)
#plt.xlabel('City or Town', fontsize = 18, fontname = 'Arial', fontweight = 'bold')
#plt.ylabel('Single Family Home Price ($M)', fontsize = 18, fontweight = 'bold')
#ax.set_ylim(0,8000000)
#ticks = ticker.FuncFormatter(lambda y, pos: '{0:g}'.format(y/scale))
#ax.yaxis.set_major_formatter(ticks)