# LectureSoln_3.R
# Author: prof. Ing. Štefan Lyócsa, PhD./Sujit Thapa
###################################################################################
# Seminar No. 3
# 1) Work with raw data and convert to a workable dataset (training, testing)
# 2) Prepare summary statistics (to-do: write a function that returns a table with mean, sd, quantiles, skewness, kurtosis and a normality test, plus a correlation table)
# 3) Visualize data: Overlapping histograms, Ordered box-plots, x-y plots.
# 4) Estimate lm() models
# 5) Compare model accuracy
# 6) Consider other loss functions - create your own!
###################################################################################
# The raw dataset
getwd()
oc = read.csv(file='octavia.csv',fileEncoding="latin1")
# Let's take a look at first 10 observations
head(oc,n=10)
# Let's take a look at last 10 observations
tail(oc,n=10)
###################################################################################
# 1) Work with raw data and convert to a workable dataset (training, testing)
###################################################################################
# The first two columns are uninformative
##################################
## TASK 1 - Load the data without the first and the second column!
oc = oc[,-(1:2)]
##################################
# Now check
head(oc,n=10)
# Let's translate the names of the variables
names(oc)[c(4,5,7)] = c('power','region','transmission')
names(oc)
#################
# Check variables
#################
# YEAR
# Some older cars are not very well represented.
table(oc$year)
# First, all cars from 2004 or older are grouped into 2004
# Second, we will change year to age - more intuitive (better scale)
oc$year[oc$year <= 2004] = 2004
table(oc$year)
#mean(oc$year)
##################################
# TASK 2 - Create age variable
oc$age = 2022-oc$year
table(oc$age)
summary(oc$age)
##################################
# KM
summary(oc$km)
# I am suspicious -> outliers?
# I also do not like the unnecessarily large numbers - convert into 1000s of km
oc$km = oc$km/1000
summary(oc$km)
# An ugly histogram
hist(oc$km,prob=T)
hist(oc$km)
# Slightly better looking one
dev.off()
hist(oc$km,breaks=15,prob=T,yaxt='n',xaxt='n',ylim=c(-0.0004,0.008),xlim=c(0,500),density=30,col=rgb(0.5,0.5,0.5,alpha=0.8),xlab=c(),ylab=c(),main='')
axis(1,at=seq(from=0,to=max(oc$km),by=100),label=seq(from=0,to=max(oc$km),by=100),cex.axis=0.85)
axis(2,at=seq(from=0,to=0.01,by=0.001),label=seq(from=0,to=0.01,by=0.001),cex.axis=0.85,las=2)
legend('bottomright',bty='n',legend='Kilometers')
legend('topleft',bty='n',legend='Density')
# Flag relatively new cars: fewer than 10,000 km (km < 10 after the rescaling above).
oc$km10 = (oc$km<10)*1
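# How many nearly-new cars did we flag? (my own quick check)
table(oc$km10)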
##################################
# TASK 3 - Use log transform on km
oc$lkm = log(oc$km)
table(oc$km)
hist(oc$lkm,prob=T)
##################################
# POWER
summary(oc$power)
table(oc$power)
# It seems like there might be errors but also different power levels. Let's create dummies
oc$power_lowest = (oc$power<77)*1
oc$power_low = (oc$power>=77 & oc$power<85)*1
oc$power_mid = (oc$power>=85 & oc$power<103)*1
oc$power_high = (oc$power>=103 & oc$power<118)*1
oc$power_highest = (oc$power>=118)*1 # >= so that a car with exactly 118 kW is not left out of all bands
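# Sanity check (my own addition): every car should fall into exactly one power band,
# so the row sums of the five dummies should all equal 1.
table(rowSums(oc[, c('power_lowest','power_low','power_mid','power_high','power_highest')]))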
# Domain knowledge will guide us later, as power and fuel should be addressed together
# REGION
table(oc$region)
typeof(oc$region)
oc$region = as.character(oc$region)
# Simplify into dummies
oc$ba = (oc$region == 'BA kraj')*1
oc$bb = (oc$region == 'BB kraj')*1
oc$ke = (oc$region == 'KE kraj')*1
oc$nr = (oc$region == 'NR kraj')*1
oc$po = (oc$region == 'PO kraj')*1
oc$tn = (oc$region == 'TN kraj')*1
oc$tt = (oc$region == 'TT kraj')*1
oc$za = (oc$region == 'ZA kraj')*1
# FUEL
table(oc$fuel)
# Translate and convert:
typeof(oc$fuel)
oc$fuel = as.character(oc$fuel)
oc$diesel = (oc$fuel == 'Diesel')*1
oc$petrol = (oc$fuel == 'Benzín')*1 # you might have issues with the special character here
oc$petgas = (oc$fuel == 'Benzín+Plyn')*1
# TRANSMISSION
table(oc$transmission)
# Simplify and dummify
oc$man = (oc$transmission != 'Automat')*1
# POWER AND FUEL - some important combinations?
oc$eng1 = (oc$power == 77 & oc$diesel == 1)*1
oc$eng2 = (oc$power == 81 & oc$diesel == 1)*1
oc$eng3 = (oc$power == 85 & oc$diesel == 1)*1
oc$eng4 = (oc$power == 103 & oc$diesel == 1)*1
oc$eng5 = (oc$power == 103 & oc$diesel == 1)*1 # note: identical to eng4 as written
oc$eng6 = (oc$power == 110 & oc$diesel == 1)*1
# PRICE
summary(oc$price)
hist(oc$price,prob=F,breaks=15)
dev.off()
hist(oc$price,breaks=15,prob=F,yaxt='n',xaxt='n',ylim=c(0,180),xlim=c(0,40000),density=30,col=rgb(0.5,0.5,0.5,alpha=0.8),xlab=c(),ylab=c(),main='')
axis(1,at=seq(from=0,to=40000,by=5000),label=seq(from=0,to=40000,by=5000),cex.axis=0.85)
axis(2,at=seq(from=0,to=180,by=30),label=seq(from=0,to=180,by=30),cex.axis=0.85)
legend('bottomright',bty='n',legend='Price')
legend('topleft',bty='n',legend='Density')
# REMOVE WHAT WE DO NOT NEED
names(oc)
oc[,c('year','km','power','region','fuel','transmission')] = NULL
names(oc)
# Divide the sample into training and testing
set.seed(50)
# Overall number of observations
N = dim(oc)[1]
idx = sample(1:N,size=floor(0.8*N),replace=F)
#a = sample(1:N,size=floor(0.8*N),replace=F)
# Training
ocr = oc[ idx,]
# Number of observations in the Training
NR = dim(ocr)[1]
# Testing
oct = oc[-idx,]
# Number of observations in the Testing
NT = dim(oct)[1]
###################################################################################
# 2) Prepare summary statistics - work in training dataset
###################################################################################
##################################
## TASK 4 - Calculate and print correlation & stat. significance between price and all remaining variables:
y = ocr$price
table(y)
nv = dim(ocr)[2] # number of columns (variables)
#for (i in 2:nv) {
# m = cor.test(y,ocr[,i], method = 'kendall')
#print(m)
#}
for(i in 2:nv) {
  m = cor.test(y, ocr[,i], method = 'kendall')
  print(paste('corr. between price and', names(ocr)[i], 'is',
              round(m$estimate,3), 'and the sig. is', round(m$p.value,3)))
}
##################################
##################################
## Home assignment: Write a function() that will return 2 tables:
# 1) Descriptive statistics for given variables
# 2) Correlation table with statistical significance
statSigSumm <- function(x) {
  print(summary(x)) # summary() has to be printed explicitly inside a function
  for(i in 2:nv) {  # note: relies on nv and ocr from the global environment
    m = cor.test(x, ocr[,i], method = 'kendall')
    print(paste('corr. between price and', names(ocr)[i], 'is',
                round(m$estimate,3), 'and the sig. is', round(m$p.value,3)))
  }
}
summary(ocr$price)
statSigSumm(ocr$price)
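##################################
# A minimal sketch of the home assignment (the function name and layout are my own;
# skewness and kurtosis are computed by hand to avoid extra packages, normality via
# shapiro.test(), correlations via Kendall's tau as above):
descCorTables <- function(dat) {
  # Table 1: descriptive statistics for every variable
  desc = t(sapply(dat, function(x) {
    m = mean(x); s = sd(x)
    c(mean = m, sd = s,
      quantile(x, probs = c(0.05, 0.25, 0.5, 0.75, 0.95)),
      skew = mean((x - m)^3)/s^3,
      kurt = mean((x - m)^4)/s^4,
      shapiro.p = tryCatch(shapiro.test(x)$p.value, error = function(e) NA))
  }))
  # Table 2: Kendall correlation of each variable with price + its p-value
  others = setdiff(names(dat), 'price')
  corr = t(sapply(others, function(v) {
    ct = suppressWarnings(cor.test(dat$price, dat[, v], method = 'kendall'))
    c(tau = unname(ct$estimate), p.value = ct$p.value)
  }))
  list(descriptives = round(desc, 3), correlations = round(corr, 3))
}
tabs = descCorTables(ocr)
tabs$descriptives
tabs$correlations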
# Are there features that are excessively correlated? (say above 0.95?)
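# A quick check (my own sketch): Pearson correlations among all columns of the training set,
# listing the variable pairs whose absolute correlation exceeds 0.95.
cm = cor(ocr)
high = which(abs(cm) > 0.95 & upper.tri(cm), arr.ind = TRUE)
data.frame(var1 = rownames(cm)[high[, 1]], var2 = colnames(cm)[high[, 2]],
           corr = round(cm[high], 3))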
###################################################################################
# 3) Visualize data: Overlapping histograms, Ordered box-plots, x-y plots.
###################################################################################
dev.off()
par(mfrow=c(1, 1)) # layout of figures - (rows,columns)
par(cex = 1.1)
par(oma = c(2, 2.0, 1.0, 1.0))
par(tcl = -0.25)
par(mgp = c(2, 0.6, 0))
par(mar = c(2.0, 3.0, 1.5, 0.5))
hist(ocr$price[ocr$man==1],breaks=15,prob=T,xaxt='n',
xlim=c(0,40000),density=10,
col=rgb(0.85,0.5,0.05,alpha=0.9),
xlab=c(),ylab=c(),main='',cex.axis=0.55,
ylim=c(0,9.5^(-4)))
hist(ocr$price[ocr$man==0],breaks=15,prob=T,add=T,
col=rgb(0.15,0.85,0.85,alpha=0.9),density=10)
axis(1,at=seq(from=0,to=40000,by=5000),label=seq(from=0,to=40000,by=5000),cex.axis=0.65)
lines(density(ocr$price[ocr$man==1]),col=rgb(0.85,0.5,0.05),lwd=1.25,lty=2)
lines(density(ocr$price[ocr$man==0]),col=rgb(0.15,0.85,0.85),lwd=1.25,lty=2)
legend('topright',bty='n',legend=c('Distribution of price\nmanual shift\n',
'Distribution of price\nautomatic shift'),
col=c(rgb(0.85,0.5,0.05),rgb(0.15,0.85,0.85)),cex=0.75,lty=1)
boxplot(price~man,data=ocr,pch=19,cex=0.35,yaxt='n',xlab='',
ylab = 'Price of the car',xaxt='n',cex.lab=0.75)
axis(2,at=seq(from=0,to=40000,by=5000),label=seq(from=0,to=40000,by=5000),cex.axis=0.65,las=2)
axis(1,at=c(1,2),label=c('Automatic shift', 'Manual shift'),
cex.axis=0.65)
dev.off()
par(mfrow=c(1, 1)) # layout of figures - (rows,columns)
par(cex = 1.1)
par(oma = c(2, 2.0, 1.0, 1.0))
par(tcl = -0.25)
par(mgp = c(2, 0.6, 0))
par(mar = c(3.0, 3.0, 1.5, 0.5))
plot(x=exp(ocr$lkm[ocr$age > 9]),y=ocr$price[ocr$age > 9],
pch=19, cex=0.5, col='black',ylab='Price',xlab='Kilometers',
ylim=c(0,max(ocr$price)),xlim=c(0,max(exp(ocr$lkm))),
cex.axis=0.85,cex.lab=0.95)
abline(lm(price~I(exp(lkm)),data=ocr[ocr$age > 9,]),lwd=1.5)
points(x=exp(ocr$lkm[ocr$age <= 9]),y=ocr$price[ocr$age <= 9],
col='red',pch=19,cex=0.5)
abline(lm(price~I(exp(lkm)),data=ocr[ocr$age <= 9,]),lwd=1.5,
col='red')
legend('topright',bty='n',legend=c('Dependence for older cars\n(9 +)\n',
'Dependence for younger cars\n(9 -)'),
col=c('black','red'),cex=0.75,lty=1)
?boxplot
boxplot(price~man,data=ocr)
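# The section header also promises ordered box-plots; one way to get them (my own sketch)
# is to order the groups by their median price, e.g. price by car age:
ageOrd = reorder(factor(ocr$age), ocr$price, FUN = median)
boxplot(ocr$price ~ ageOrd, pch = 19, cex = 0.35,
        xlab = 'Age in years (groups ordered by median price)', ylab = 'Price of the car')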
###################################################################################
# 4) Estimate OLS models
###################################################################################
m1 = lm(price ~ lkm, data=ocr)
summary(m1)
m2 = lm(price ~ age, data=ocr)
summary(m2)
m3 = lm(price ~ lkm + age, data=ocr)
summary(m3)
m4 = lm(price ~ lkm + age + I(lkm*age), data=ocr)
summary(m4)
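# A quick in-sample comparison of the four specifications (my own addition) via information criteria:
AIC(m1, m2, m3, m4)
BIC(m1, m2, m3, m4)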
##################################
# TASK 5 - Estimate your own specification - try to beat the others! You can also create new variables
m5 = lm(price ~ lkm + age + I(lkm*age), data = ocr )
m6 = lm(price ~ lkm + age + I(lkm*age) + I(eng1*age)+I(eng2*age)+
I(eng3*age)+I(eng4*age)+I(eng5*age) + I(power_low*age)+
I(eng6*age)+I(ba*lkm)+I(bb*lkm)+I(ke*lkm) + I(nr*age)+
I(po*lkm)+I(tn*lkm)+I(tt*lkm),data = ocr)
##################################
#########
# Predict
# Store
predictions = matrix(NA,nrow=NT,ncol=7)
colnames(predictions) = c('True',paste('p',1:6,sep=''))
predictions[,1] = oct$price
# The predictions themselves
?predict
p1 = predict(m1,new=oct); summary(p1)
predictions[,2] = p1
p2 = predict(m2,new=oct); summary(p2) # Negative price does not make sense here... so some adjustment is in order
# We substitute the lowest positive predicted value
p2[p2 < 0] = min(p2[p2>0]); summary(p2)
predictions[,3] = p2
p3 = predict(m3,new=oct); summary(p3)
# Again
p3[p3 < 0] = min(p3[p3>0]); summary(p3)
predictions[,4] = p3
p4 = predict(m4,new=oct); summary(p4)
predictions[,5] = p4
p5 = predict(m5,new=oct); summary(p5) # Again
p5[p5 < 0] = min(p5[p5>0]); summary(p5)
predictions[,6] = p5
p6 = predict(m6,new=oct);
summary(p6)
p6[p6<0] = min(p6[p6>0])
predictions[,7] = p6
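# The truncation of negative predictions above is repeated for several models; a small helper
# (the name is my own suggestion) would keep that rule in one place, e.g.:
clipAtMinPos <- function(p) { p[p < 0] = min(p[p > 0]); p }
# predictions[,3] = clipAtMinPos(predict(m2, newdata = oct))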
#########
#########
# Evaluate
# Store
mse = matrix(NA,nrow=NT,ncol=6) # 6 models
for (i in 1:6) mse[,i] = (predictions[,1] - predictions[,i+1])^2
apply(mse,2,mean)
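# Item 6 of the header asks for other loss functions. A minimal sketch (the losses below are
# common choices plus one custom example; they are my own additions):
err  = predictions[, 1] - predictions[, 2:7]      # forecast errors, one column per model
mae  = apply(abs(err), 2, mean)                   # mean absolute error
mape = apply(abs(err)/predictions[, 1], 2, mean)  # mean absolute percentage error
# A custom, asymmetric loss: over-predicting the price (negative error) is penalised twice
# as much as under-predicting it.
asym = apply(ifelse(err < 0, 2*abs(err), abs(err)), 2, mean)
rbind(MAE = mae, MAPE = mape, Asymmetric = asym)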
#########
# MCS
library(MCS)
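# MCSprocedure() runs the Model Confidence Set of Hansen et al.: it keeps the subset of models
# whose losses are not statistically distinguishable from the best-performing one.
# Naming the loss columns (labels are my own) makes the output easier to read.
colnames(mse) = paste('m', 1:6, sep = '')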
MCSprocedure(mse)
#########