-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathLectureSoln_9.R
119 lines (110 loc) · 3.56 KB
/
LectureSoln_9.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
# rm(list=ls())
library(caret)
library(glmnet)
library(tree)
library(rpart)
library(rpart.plot)
library(ranger)
library(factoextra)
library(dummies)
library(cluster)
##################################
# Datasets - corp default again
##################################
corpdef = read.csv(file='corpdef.csv')
# Stratify the dataset into training and testing
N = dim(corpdef)[1]
set.seed(200)
idx = sample(1:N,size=floor(N*0.8),replace=F)
train = corpdef[+idx,]
test = corpdef[-idx,]
train = train[,-95]
test = test[,-95]
# Create alternative datasets - that are more balanced
idx.good = which(train$Bankrupt.==0); length(idx.good)
idx.bad = which(train$Bankrupt.==1); length(idx.bad)
# Undersampling majority and oversampling minority (3 times the number of minority)
rt = 3;
set.seed(300)
tmp.good = idx.good[sample(1:length(idx.good),size=length(idx.bad)*rt,replace=F)]
# Here we need replacement = T - why?
tmp.bad = idx.bad[sample(1:length(idx.bad),size=length(idx.bad)*rt,replace=T)] # Oversampling
train.uo = rbind(train[tmp.good,],train[tmp.bad,])
# Just some randomization - we do not want defaults and non-deafults to be stacked one next to each other (just to be sure)
train.uo = train.uo[sample(1:dim(train.uo)[1],size=dim(train.uo)[1],replace=F),]
# Check
dim(train.uo)
summary(train.uo)
summary(train.uo$Bankrupt.)
#############################
# Names corpdef
#############################
# Standardized
#############################
names(train)
prof = train.uo[,c(2:4,27:29,43,44,53,90)]
sprof = scale(prof)
debt = train.uo[,c(93:95,89,67:68,41,36:38,15)]
sdebt = scale(debt)
#############################
# Standardized
#############################
prof_test = test[,c(2:4,27:29,43,44,53,90)]
sprof_test = scale(prof_test)
debt_test = test[,c(93:95,89,67:68,41,36:38,15)]
sdebt = scale(debt_test)
#############################
library(factoextra)
#############################
# Number of variables in sprof
N1 = dim(sprof)[2]
m = prcomp(x=sprof)
str(m)
get_eigenvalue(m)
# Weights in the matrix
dim(m$rotation)
head(m$rotation)
# Rows are variables and Columns are principal components
# Values are weights
# Extract principal components
pcc = sprof %*% m$rotation
# Check - uncorrelated?
round(cor(cbind(pcc,sprof)),2)
# How to select number of components
fviz_eig(m)
# Values above 1?
get_eig(m)
# Add to training dataset
train.uo = data.frame(train.uo,pcc)
# Add to testing dataset
pcc = as.matrix(prof_test) %*% m$rotation
test = data.frame(test,pcc)
#############################
# Estimate models - RF
#############################
specs.old = gen.fm(dep='Bankrupt.',x=names(train.uo)[c(2:95)])
length(2:95)
# Now spec withouth profitability measures but instead using 2 PCs
idx = c(2:4,27:29,43,44,53,90)
all = 2:97
all = all[-which(all %in% idx)]
length(all)
specs.new = gen.fm(dep='Bankrupt.',x=names(train.uo)[all])
# Convert to factors
thr = 0.5
ref = as.factor(test$Bankrupt.)
# Random forest
rf.tree = ranger(specs.old,
data=train.uo,
num.trees=5000,mtry=9,min.node.size=10,max.depth=0)
p1 = predict(rf.tree,data=test)$predictions
rf.tree = ranger(specs.new,
data=train.uo,
num.trees=5000,mtry=9,min.node.size=10,max.depth=0)
p2 = predict(rf.tree,data=test)$predictions
# Evaluate and compare
f1 = as.factor((p1>thr)*1);
f2 = as.factor((p2>thr)*1)
confusionMatrix(f1,reference=ref,positive='1')
confusionMatrix(f2,reference=ref,positive='1')
# Prediction accuracy is similar but we have used less observations.