Skip to content

Commit d94d324

Browse files
committed
WIP
1 parent 92352b3 commit d94d324

8 files changed

+575
-402
lines changed

FeatureEng_GermanCredit_Template.Rmd

+98
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,98 @@
---
title: "Feature Engineering German Credit - Baseline Attempt"
output:
  html_document:
    toc: yes
    toc_float: yes
    code_folding: hide
---
```{r setup, include=FALSE}
# Show code chunks in the rendered HTML by default.
knitr::opts_chunk$set(echo = TRUE)
```
```{r}
# Load order preserved: caret attaches its dependencies before dplyr's
# masking of stats::filter/lag would matter.
library(dplyr)      # data wrangling
library(caret)      # preprocessing, resampling, model training
library(rpart)      # decision trees
library(rpart.plot) # tree visualization
```

# Load Data

```{r}
data(GermanCredit, package = "caret")
df <- GermanCredit

# Relabel the "Bad" class as "NotGood" (personal preference). Round-trip
# through character so the factor is rebuilt; note as.factor() sorts the
# levels alphabetically, giving ("Good", "NotGood").
df$Class <- as.character(df$Class)
df$Class[df$Class == "Bad"] <- "NotGood"
df$Class <- as.factor(df$Class)

# Quick sanity checks on structure and class balance.
str(df)
head(df)
table(df$Class)

# Model specification shared by the chunks below.
formula <- Class ~ .
positive <- "Good"
```

# Feature Engineering

```{r}
# Hint: use the preProcess() and predict() functions
#p1 <- preProcess(...)
#df = predict(p1, df)
```

# Feature Selection

```{r}
dim(df)
set.seed(10)

# The following snippet will perform feature selection using caret's
# SBF = Selection By Filtering, scored with a random forest (rfSBF).
#filterCtrl <- sbfControl(functions = rfSBF, number = 1, verbose = TRUE)
#r <- sbf(formula, data = df, sbfControl = filterCtrl)
#r
#df <- cbind(df[, predictors(r)], Class = df$Class)
#dim(df)
```

# Splitting the Data

```{r}
set.seed(123) # Set the seed to make it reproducible

# Stratified 80/20 split on the class label.
train.index <- createDataPartition(df$Class, p = .8, list = FALSE)
train <- df[train.index, ]
test <- df[-train.index, ]

# Double check that the stratified sampling worked: class proportions
# should be (nearly) identical across full data, train, and test.
table(df$Class) / nrow(df)
table(train$Class) / nrow(train)
table(test$Class) / nrow(test)

actual <- test$Class
```

# Parameter Tuning - KNN

```{r}
# Single-point "grid": one kknn configuration (k = 5, distance = 1 is
# Manhattan, "rectangular" = unweighted neighbors). Expand these vectors
# to actually tune.
grid <- expand.grid(.kmax = c(5),
                    .distance = c(1),
                    .kernel = c("rectangular"))

# 10-fold cross-validation repeated 5 times; keep class probabilities
# and all resampling results for later inspection.
ctrl <- trainControl(method = "repeatedcv",
                     number = 10, repeats = 5,
                     classProbs = TRUE, returnResamp = "all")

model_fit <- train(formula,
                   data = train,
                   method = "kknn",
                   metric = "Accuracy",
                   trControl = ctrl, tuneGrid = grid)

summary(model_fit)
model_fit

# Evaluate on the held-out test set.
pred <- predict(model_fit, test)
caret::confusionMatrix(data = pred, reference = actual, positive = positive, dnn = c("Predicted", "Actual"))
```
+120
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,120 @@
---
title: "Feature Engineering German Credit - Steve's Attempt"
output:
  html_document:
    toc: yes
    toc_float: yes
    code_folding: hide
---
```{r setup, include=FALSE}
# Show code chunks in the rendered HTML by default.
knitr::opts_chunk$set(echo = TRUE)
```
```{r}
# Load order preserved: caret attaches its dependencies before dplyr's
# masking of stats::filter/lag would matter.
library(dplyr)      # data wrangling
library(caret)      # preprocessing, resampling, model training
library(rpart)      # decision trees
library(rpart.plot) # tree visualization
```

# Load Data

```{r}
data(GermanCredit, package = "caret")
df <- GermanCredit

# Relabel the "Bad" class as "NotGood" (personal preference). Round-trip
# through character so the factor is rebuilt; note as.factor() sorts the
# levels alphabetically, giving ("Good", "NotGood").
df$Class <- as.character(df$Class)
df$Class[df$Class == "Bad"] <- "NotGood"
df$Class <- as.factor(df$Class)

# Quick sanity checks on structure and class balance.
str(df)
head(df)
table(df$Class)

# Model specification shared by the chunks below.
formula <- Class ~ .
positive <- "Good"
```

# Feature Engineering

```{r}
# Hint: use the preProcess() and predict() functions.
# preProcess() with defaults centers and scales the numeric columns;
# factor columns (including Class) are left untouched — presumably the
# intent here; confirm against caret::preProcess docs.
p1 <- preProcess(df)
df <- predict(p1, df)
```

# Feature Selection

```{r}
dim(df)
set.seed(10)

# Feature selection using caret's SBF = Selection By Filtering with a
# random-forest-based filter (rfSBF); number = 1 means a single
# resampling iteration, just to keep this fast.
filterCtrl <- sbfControl(functions = rfSBF, number = 1, verbose = TRUE)
r <- sbf(formula, data = df, sbfControl = filterCtrl)
r

# Keep only the selected predictors plus the class label.
# drop = FALSE guards the single-predictor edge case (otherwise the
# data.frame collapses to a vector and the column name is lost).
df <- cbind(df[, predictors(r), drop = FALSE], Class = df$Class)
dim(df)
```

# Splitting the Data

```{r}
set.seed(123) # Set the seed to make it reproducible

# Stratified 80/20 split on the class label.
train.index <- createDataPartition(df$Class, p = .8, list = FALSE)
train <- df[train.index, ]
test <- df[-train.index, ]

# Double check that the stratified sampling worked: class proportions
# should be (nearly) identical across full data, train, and test.
table(df$Class) / nrow(df)
table(train$Class) / nrow(train)
table(test$Class) / nrow(test)

actual <- test$Class
```

# Trying it all: KNN

```{r}
# Repeated 10-fold CV with down-sampling of the majority class inside
# each resample, to counter the class imbalance seen in table(df$Class).
ctrl <- trainControl(method = "repeatedcv",
                     number = 10, repeats = 5,
                     classProbs = TRUE, returnResamp = "all", sampling = "down")

# Tune over neighborhood size, Minkowski distance power (1 = Manhattan,
# 2 = Euclidean), and kernel weighting scheme.
kknn_grid <- expand.grid(.kmax = c(3, 5, 10),
                         .distance = c(1, 2),
                         .kernel = c("rectangular", "gaussian", "biweight"))

# Kappa is preferred over Accuracy here because the classes are
# imbalanced. preProc drops near-zero-variance columns, then centers
# and scales — applied inside each resample to avoid leakage.
kknn_fit <- train(formula,
                  data = train,
                  method = "kknn",
                  metric = "Kappa",
                  preProc = c("nzv", "center", "scale"),
                  trControl = ctrl, tuneGrid = kknn_grid)

summary(kknn_fit)

# Evaluate on the held-out test set.
pred <- predict(kknn_fit, test)
caret::confusionMatrix(data = pred, reference = actual, positive = positive, dnn = c("Predicted", "Actual"))
```

# Trying it all: RF

```{r, warning=FALSE}
# Same resampling scheme as the KNN chunk: repeated 10-fold CV with
# down-sampling. Fix: allowParallel is a trainControl() argument — when
# passed to train() it is merely forwarded via `...` to the underlying
# fit function, where it does not enable caret's parallel resampling.
ctrl <- trainControl(method = "repeatedcv",
                     number = 10, repeats = 5,
                     classProbs = TRUE, returnResamp = "all",
                     sampling = "down", allowParallel = TRUE)

# NOTE(review): mtry cannot exceed the number of predictors; after SBF
# feature selection above, 100 and 500 may be clipped by randomForest
# (its warning is hidden by warning=FALSE on this chunk) — confirm the
# post-selection dim(df) before trusting these grid points.
rf_grid <- expand.grid(.mtry = c(50, 100, 500))

rf_fit <- train(formula, data = train, method = "parRF",
                preProc = c("nzv", "center", "scale"),
                trControl = ctrl, tuneGrid = rf_grid,
                metric = "Kappa")

summary(rf_fit)

# Evaluate on the held-out test set.
pred <- predict(rf_fit, test)
caret::confusionMatrix(data = pred, reference = actual, positive = positive, dnn = c("Predicted", "Actual"))
```

0 commit comments

Comments
 (0)