-
Notifications
You must be signed in to change notification settings - Fork 0
/
04cv.Rmd
122 lines (107 loc) · 3.07 KB
/
04cv.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
# Cross-validation
```{block, type='rmdcomment'}
In this chapter, we will describe the ideas of cross-validation.
```
- Watch the video describing this chapter [![video](https://img.youtube.com/vi/BSDrIrSxqFU/maxresdefault.jpg)](https://youtu.be/BSDrIrSxqFU)
```{r setup04, include=FALSE}
require(Publish)
require(caret)
# Evaluate a fitted model on a data set and summarise its predictive fit.
#
# Arguments:
#   new.data       data frame holding the predictors used by `model.fit`
#                  and the outcome column identified by `y.name`.
#   model.fit      fitted model object that supports model.matrix() and
#                  predict() (e.g. the result of lm()).
#   model.formula  not used by this function; retained so that existing
#                  calls which pass it keep working.
#   y.name         name (or position) of the outcome column in `new.data`.
#   digits         number of decimal places used when rounding the result.
#
# Returns a one-row matrix with columns n, p, R2, adjR2 and RMSE.
# Stops with an informative error when `y.name` is a name that does not
# exist in `new.data`.
perform <- function(new.data,
                    model.fit, model.formula = NULL,
                    y.name = "Y",
                    digits = 3) {
  # Fail early, before any fitting work, if the outcome column is absent.
  if (is.character(y.name) && !all(y.name %in% names(new.data))) {
    stop(
      "Outcome column '", paste(y.name, collapse = "', '"),
      "' not found in new.data",
      call. = FALSE
    )
  }
  # number of columns of the design matrix (includes the intercept if any)
  p <- ncol(model.matrix(model.fit))
  # predicted value
  pred.y <- predict(model.fit, new.data)
  # sample size
  n <- length(pred.y)
  # outcome; `[[` always yields a plain vector, also for tibbles
  new.data.y <- as.numeric(new.data[[y.name]])
  # R2 (R2 is exported by caret, so `::` is sufficient)
  R2 <- caret::R2(pred.y, new.data.y)
  # adj R2 using alternate formula
  df.residual <- n - p
  adjR2 <- 1 - (1 - R2) * ((n - 1) / df.residual)
  # RMSE
  RMSE <- caret::RMSE(pred.y, new.data.y)
  # combine all of the results; cbind() names the columns after the variables
  round(cbind(n, p, R2, adjR2, RMSE), digits)
}
```
## Read previously saved data
```{r}
# Load the previously saved analytic data set
ObsData <- readRDS("data/rhcAnalytic.RDS")
# Load the previously saved model formula
out.formula1 <- readRDS("data/form1.RDS")
```
## k-fold cross-validation
[ref](https://en.wikipedia.org/wiki/Cross-validation_(statistics))
```{r c954, echo=FALSE, out.width = '90%'}
# Insert the image file images/kf.png into the rendered document
knitr::include_graphics("images/kf.png")
```
```{r}
# Number of folds; every size calculation below is derived from it
k <- 5
dim(ObsData)
# Fix the RNG state so the fold assignment is reproducible
set.seed(567)
# create folds (based on outcome); with returnTrain = TRUE each list
# element holds the row indices used for TRAINING in that fold
folds <- createFolds(ObsData$Length.of.Stay,
  k = k,
  list = TRUE, returnTrain = TRUE
)
mode(folds)
nrow(ObsData) * (k - 1) / k # approximate training data size
nrow(ObsData) / k # approximate test data size
# Compare the first and the last fold
length(folds[[1]])
length(folds[[k]])
str(folds[[1]])
str(folds[[k]])
```
### Calculation for Fold 1
```{r}
# Fold evaluated in this chunk
fold.index <- 1
# Row indices of the training part of the chosen fold
fold1.train.ids <- folds[[fold.index]]
head(fold1.train.ids)
# Listed rows form the training set; all remaining rows form the test set
fold1.train <- ObsData[fold1.train.ids, ]
fold1.test <- ObsData[-fold1.train.ids, ]
# Show the model formula being fitted
out.formula1
# Fit on the training rows only
model.fit <- lm(out.formula1, data = fold1.train)
# Predict the outcome for the held-out rows
predictions <- predict(model.fit, newdata = fold1.test)
# Performance of the fitted model on the held-out rows
perform(
  new.data = fold1.test,
  y.name = "Length.of.Stay",
  model.fit = model.fit
)
```
### Calculation for Fold 2
```{r}
# Fold evaluated in this chunk
fold.index <- 2
# Row indices of the training part of fold 2 (named fold2.* so the
# objects describe the fold they actually hold)
fold2.train.ids <- folds[[fold.index]]
head(fold2.train.ids)
# Listed rows form the training set; all remaining rows form the test set
fold2.train <- ObsData[fold2.train.ids, ]
fold2.test <- ObsData[-fold2.train.ids, ]
# Fit on the training rows only
model.fit <- lm(out.formula1, data = fold2.train)
# Predict the outcome for the held-out rows
predictions <- predict(model.fit, newdata = fold2.test)
# Performance of the fitted model on the held-out rows
perform(
  new.data = fold2.test,
  y.name = "Length.of.Stay",
  model.fit = model.fit
)
```
## Using caret package to automate
[ref](https://topepo.github.io/caret/model-training-and-tuning.html)
```{r cv, cache= TRUE}
# Automated cross-validation with the caret package
set.seed(504)
# resampling scheme: 5-fold CV
ctrl <- trainControl(method = "cv", number = 5)
# linear model (method = "lm") for out.formula1, evaluated under `ctrl`
fit.cv <- train(out.formula1,
  trControl = ctrl,
  data = ObsData, method = "lm"
)
fit.cv
# per-fold results on each held-out part
summary.res <- fit.cv$resample
summary.res
# average and spread of R-squared across the folds
mean(summary.res$Rsquared)
sd(summary.res$Rsquared)
# average and spread of RMSE across the folds
mean(summary.res$RMSE)
sd(summary.res$RMSE)
```