-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathRegressionProblem.R
263 lines (222 loc) · 10.6 KB
/
RegressionProblem.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
################################################################
# Households data - regression problem
households = read.csv(file='HouseholdsExamI.csv')
head(households)
#Looking for NA rows in dataset
any(is.na(households$dis_income))
apply(households, 2, function(x) any(is.na(x)))
#There are empty rows in dis-income which is our dependent variable, so will remove them
households=households[!is.na(households$dis_income),]
################################################################
################################################################
# This is a dataset from a survey of monthly household incomes and expenditures.
# The survey was conducted in years 2012 and 2015 in Slovakia.
################################################################
# The variables are as follows:
# year Year in which the survey was conducted
# dis_income Discretionary income (Nominal household income minus expenditures on necessities/consumption)
# income Nominal household income
# consumption Expenditures on necessities
# dis_income_p Discretionary income as a % of income
# Jan Monthly dummy, 1 - yes for given month
# Feb Monthly dummy, 1 - yes for given month
# Mar Monthly dummy, 1 - yes for given month
# Apr Monthly dummy, 1 - yes for given month
# Maj Monthly dummy, 1 - yes for given month
# Jun Monthly dummy, 1 - yes for given month
# Jul Monthly dummy, 1 - yes for given month
# Aug Monthly dummy, 1 - yes for given month
# Sep Monthly dummy, 1 - yes for given month
# Okt Monthly dummy, 1 - yes for given month
# Nov Monthly dummy, 1 - yes for given month
# Dec Monthly dummy, 1 - yes for given month
# BA region 1
# TT region 2
# TN region 3
# NR region 4
# ZA region 5
# BB region 6
# PO region 7
# KE region 8
# PC1 Number of household members, 1
# PC2 Number of household members, 2
# PC3 Number of household members, 3
# PC4 Number of household members, 4
# PC5 Number of household members, 5 or more
# PP1 Number of working household members, 1
# PP2 Number of working household members, 2
# PP3 Number of working household members, 3
# PP4 Number of working household members, 4 or more
# D0 Number of dependent (non-working) children, 0
# D1 Number of dependent (non-working) children, 1
# D2 Number of dependent (non-working) children, 2
# D3 Number of dependent (non-working) children, 3
# D4 Number of dependent (non-working) children, 4 or more
# N0 Number of unemployed household members, 0
# N1 Number of unemployed household members, 1
# N2 Number of unemployed household members, 2
# N3 Number of unemployed household members, 3 or more
# jed Is there a retiree in the household?
# PPD0 Number of working retirees in the household, 0
# PPD1 Number of working retirees in the household, 1
# PPD2 Number of working retirees in the household, 2 or more
# age Age of the household representative
# male Gender of the household representative, 1 - male
# Middle Highest education status - middle
# University Highest education status - university degree
# ver Is the household representative working in the public sector, 1 - yes
# full Is the household representative working full-time, 1 - yes
################################################################
################################################################
# - Training dataset consists of all observations in 2012
# - Testing dataset consists of all observations in 2015
# - Submit a working R (python) code - clear the workspace and test the code before submitting
################################################################
################################################################
# The goal is to design a prediction model that will be used to predict
# discretionary income of households in 2015.
# As features you can use everything, except 'consumption' and 'dis_income_p' variables.
################################################################
################################################################
# SCORING
# - 10 points with respect to data preparation (outliers, interactions, transformations, unsupervised learning,...)
# - 10 points for valid predictions & evaluation
# - 10 points for interpretation & presentation of results (tables or figures in R) and extra effort
################################################################
#Data Preperation
# Checking if age and income are highly correlated
colN=c('income', 'age')
x=households$income
for (i in 2:length(colN)) {
m=cor.test(x,households[,colN[i]],method='kendall')
print(m)
}
# It does not seem to have high correlation.
#Looking at some of the continuous variables
#Income
households$income
summary(households$income)
hist(households$income)
# It seems like the income is skewed. There may be some outliers, so using log on this feature might lead to better result and could reduce outlier to certain extent
households$income=log(households$income)
hist(households$income)
# We have sligly better distribution
#Age
households$age
summary(households$age)
hist(households$age)
#LETS SEE HOWW THE DISTRIBUTION WOULD BE IF USED LOG
hist(log(households$age))
#I would say better so will use log on age as well. Max age is 96 which may be normal. But, using log can ensure that even if its outlier, log can help to reduce to ceertain extent
households$age=log(households$age)
hist(households$age)
plot(households$dis_income,households$income)
plot(households$dis_income,households$age)
#It does not seem there is a very strong correlation.
library(moments)
rowN=c('mean','sd','kurtosis','skewness','min','25p','md','75p','max')
desStat = matrix(NA, nrow=length(rowN), ncol=length(colN))
colnames(desStat)=colN
rownames(desStat)=rowN
for (i in 1:length(colN)) {
x = households[,colN[i]]
desStat[,i]=round(c(mean(x),sd(x),kurtosis(x),skewness(x),quantile(x,p=c(0,0.25,0.5,0.75,1))),3)
}
desStat
# After using the log, we have better decriptive sstatistics on continuous variables.
#Lets create some intersectional variables between dummy variablers retiree and number of working retirees
households$jedp0=households$jed*households$PPD0
households$jedp1=households$jed*households$PPD1
households$jedp2=households$jed*households$PPD2
# Lets use some regression model.
# creating train dataset that has year 2012
train = households[households$year==2012,]
head(train)
# creating test dataset that has year 2015
test = households[households$year==2015,]
head(test)
##Estimating OLS Model
spec_one = as.formula(dis_income~income+Jan+Feb+Mar+Apr+Maj+Jun+Jul+Aug+Sep+Okt+Nov+Dec+BA+TT+TN
+NR+ZA+BB+PO+KE+PC1+PC2+PC3+PC4+PC5+PP1+PP2+PP3+PP4+D0+D1+D2+D3+D4+N0+N1+
N2+N3+age+male+Stredne+Vysoke+ver+full+jedp0+jedp1+jedp2)
m1=lm(spec_one, data = train)
head(train)
summary(m1)
pr1 = predict(m1, newdata = test)
plot(pr1,test$dis_income,pch=19,cex=0.5,col='red')
#Creating matrix for storing results
colN=c('true','ols','dt','lasso', 'ridge')
predictRes=matrix(NA,nrow=dim(test)[1],ncol = length(colN))
colnames(predictRes)=colN
head(predictRes)
predictRes[,1]=test$dis_income
predictRes[,2]=pr1
colN=c('ols','dt','lasso', 'ridge')
mseresult=matrix(NA,nrow=dim(test)[1],ncol = length(colN))
colnames(mseresult)=colN
mseresult[,1]=(predictRes[,1]-predictRes[,2])^2
head(mseresult)
apply(mseresult,2,mean)
colN=c('ols','dt','lasso', 'ridge')
maeresult=matrix(NA,nrow=dim(test)[1],ncol = length(colN))
colnames(maeresult)=colN
maeresult[,1]=abs(predictRes[,1]-predictRes[,2])
head(maeresult)
apply(maeresult,2,mean)
#Using Decision Tree
library(rpart)
library(rpart.plot)
m2=rpart(spec_one,data = train,method='anova', model=TRUE,
control=rpart.control(cp = 0.01, maxdepth = 9))
summary(m2)
pr2 = predict(m2, newdata = test)
plot(pr2,test$dis_income,pch=19,cex=0.5,col='red')
predictRes[,3]=pr2
head(predictRes)
mseresult[,2]=(predictRes[,1]-predictRes[,3])^2
head(mseresult)
maeresult[,2]=abs(predictRes[,1]-predictRes[,3])
head(maeresult)
apply(mseresult,2,mean)
apply(maeresult,2,mean)
library(glmnet)
#Using lasso as it ivolves feature selection as well and we have alot of features
features = c('income','Jan','Feb','Mar','Apr','Maj','Jun','Jul','Aug','Sep','Okt','Nov','Dec','BA','TT','TN','NR','ZA','BB','PO','KE','PC1','PC2','PC3','PC4','PC5','PP1','PP2','PP3','PP4','D0','D1','D2','D3','D4','N0','N1','N2','N3','age','male','Stredne','Vysoke','ver','full','jedp0','jedp1','jedp2')
X = as.matrix(train[,features])
Y = as.matrix(train[,'dis_income'])
XT = as.matrix(test[,features])
cvm = cv.glmnet(x=X,y=Y,type.measure='mse',nfolds=10,alpha=1)
p3 = predict(cvm,newx = XT,s='lambda.min')
plot(p3,test$dis_income,pch=19,cex=0.5,col='red')
predictRes[,4]=p3
head(predictRes)
mseresult[,3]=(predictRes[,1]-predictRes[,4])^2
head(mseresult)
maeresult[,3]=abs(predictRes[,1]-predictRes[,4])
head(maeresult)
apply(mseresult,2,mean)
apply(maeresult,2,mean)
#Using Ridge
cvm = cv.glmnet(x=X,y=Y,type.measure='mse',nfolds=10,alpha=0)
# Figure
plot(cvm)
p4 = predict(cvm,newx = XT,s='lambda.min')
predictRes[,5]=p4
plot(p4,test$dis_income,pch=19,cex=0.5,col='red')
head(predictRes)
mseresult[,4]=(predictRes[,1]-predictRes[,5])^2
head(mseresult)
maeresult[,4]=abs(predictRes[,1]-predictRes[,5])
head(maeresult)
apply(mseresult,2,mean)
apply(maeresult,2,mean)
library(MCS)
MCSprocedure(maeresult)
MCSprocedure(mseresult)
# Based on the overall evaluation, Interms of MSE result, decision tree performs the best. However, one problem
#with decision tree is overfitting. Here, it would be better if we could try both bagging and random forest.
#I didd try that, however, I had error in code and unfortunately because of time constraint, was not able to correct it.
#'based on the MAE results, here also decision tree outperforms others'
#'
#'Using MCSPROCEDURE; decision tree outperforms other moddel. Here, again recommendattion is to try bagging
#or RF, however as mentioned earlier, due to code errorr could not do that.