---
title: "Feature Engineering German Credit - Steve's Attempt"
output:
  html_document:
    toc: yes
    toc_float: yes
    code_folding: hide
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```

```{r}
library(dplyr)
library(caret)
library(rpart)
library(rpart.plot)
```

# Load Data

```{r}
data(GermanCredit, package = "caret")
df = GermanCredit

# Recode the outcome levels: "Bad" becomes "NotGood", just for personal preference.
df$Class = as.character(df$Class)
df$Class[df$Class == "Bad"] = "NotGood"
df$Class = as.factor(df$Class)

str(df)
head(df)
table(df$Class)

# Reused throughout the rest of the document.
formula = Class ~ .
positive = "Good"
```

# Feature Engineering

```{r}
# Hint: use the preProcess() and predict() functions.
# With its defaults, preProcess() centers and scales the numeric columns;
# predict() then applies that transformation to the data itself.
p1 <- preProcess(df)
df = predict(p1, df)
```
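
As a quick sanity check (a minimal sketch; it only assumes the `df` produced above), the centered and scaled numeric columns should now have means near 0 and standard deviations near 1, while the factor `Class` is left untouched:

```{r}
# Sanity check (sketch): summarize the per-column means and standard deviations
# of the numeric columns after centering and scaling.
num_cols <- sapply(df, is.numeric)
summary(sapply(df[, num_cols], mean))
summary(sapply(df[, num_cols], sd))
```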

# Feature Selection

```{r}
dim(df)
set.seed(10)

# Feature selection with caret's SBF (Selection By Filtering), using the
# built-in rfSBF function set; number = 1 runs a single resample.
filterCtrl <- sbfControl(functions = rfSBF, number = 1, verbose = TRUE)
r <- sbf(formula, data = df, sbfControl = filterCtrl)
r

# Keep only the selected predictors, plus the outcome.
df = cbind(df[, predictors(r)], Class = df$Class)
dim(df)
```
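
For reference, the same subsetting step can be written with dplyr (loaded at the top). This is only a sketch and is not evaluated here, since `df` has already been reduced by the chunk above:

```{r, eval=FALSE}
# Equivalent subsetting with dplyr (not evaluated). all_of() errors on
# misspelled names instead of silently dropping them.
df <- df %>% select(all_of(predictors(r)), Class)
```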

# Splitting the Data

```{r}
set.seed(123) # Set the seed to make it reproducible

train.index <- createDataPartition(df$Class, p = .8, list = FALSE)
train <- df[ train.index, ]
test <- df[-train.index, ]

# Double check that the stratified sampling worked
table(df$Class)/nrow(df)
table(train$Class)/nrow(train)
table(test$Class)/nrow(test)

actual = test$Class
```
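
The same class-balance check can be written a little more compactly with `prop.table()` (a minimal sketch, equivalent to the three lines above):

```{r}
# Class proportions in the full, training, and test sets (sketch).
prop.table(table(df$Class))
prop.table(table(train$Class))
prop.table(table(test$Class))
```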

# Trying it all: KNN

```{r}
# Repeated 10-fold cross-validation (5 repeats) with down-sampling to
# balance the classes within each resample.
ctrl <- trainControl(method = "repeatedcv",
                     number = 10, repeats = 5,
                     classProbs = TRUE, returnResamp = "all", sampling = "down")

# Tuning grid for weighted k-NN (method "kknn"): maximum number of neighbors,
# Minkowski distance order, and weighting kernel.
kknn_grid <- expand.grid(.kmax = c(3, 5, 10),
                         .distance = c(1, 2),
                         .kernel = c("rectangular", "gaussian", "biweight"))

kknn_fit <- train(formula,
                  data = train,
                  method = "kknn",
                  metric = "Kappa",
                  preProc = c('nzv', 'center', 'scale'),
                  trControl = ctrl, tuneGrid = kknn_grid)

summary(kknn_fit)
pred = predict(kknn_fit, test)
caret::confusionMatrix(data = pred, reference = actual, positive = positive,
                       dnn = c("Predicted", "Actual"))
```
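
To see how the tuning went, caret's standard accessors on the fitted `train` object show the resampled performance profile and the hyperparameters selected by Kappa:

```{r}
# Resampled performance across the tuning grid, and the selected parameters.
plot(kknn_fit)
kknn_fit$bestTune
```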

# Trying it all: RF

```{r, warning=FALSE}
# Same resampling scheme as above. allowParallel is a trainControl argument
# (TRUE by default); it lets caret run resamples in parallel when a backend
# is registered.
ctrl <- trainControl(method = "repeatedcv",
                     number = 10, repeats = 5,
                     classProbs = TRUE, returnResamp = "all", sampling = "down",
                     allowParallel = TRUE)

# mtry values larger than the number of remaining predictors get clamped by
# randomForest with a warning, hence warning=FALSE on this chunk.
rf_grid <- expand.grid(.mtry = c(50, 100, 500))

rf_fit <- train(formula, data = train, method = "parRF",
                preProc = c('nzv', 'center', 'scale'),
                trControl = ctrl, tuneGrid = rf_grid,
                metric = "Kappa")

summary(rf_fit)
pred = predict(rf_fit, test)
caret::confusionMatrix(data = pred, reference = actual, positive = positive,
                       dnn = c("Predicted", "Actual"))
```
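
With both tuned models available, caret's `resamples()` collects their cross-validated Accuracy and Kappa for a side-by-side look (a sketch; since the two `train()` calls used different random folds, the comparison is indicative rather than strictly paired):

```{r}
# Which mtry the random forest settled on.
rf_fit$bestTune

# Side-by-side resampled performance of the two tuned models.
model_comparison <- resamples(list(kknn = kknn_fit, parRF = rf_fit))
summary(model_comparison)
dotplot(model_comparison, metric = "Kappa")
```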