From 4a81f2e5c13b3ffc727dfca94b51708f993b8bb1 Mon Sep 17 00:00:00 2001
From: Sean Anderson <sean@seananderson.ca>
Date: Fri, 29 Sep 2023 10:48:06 -0700
Subject: [PATCH] Start cleaning up cross validation vignette #259

---
 vignettes/web_only/cross-validation.Rmd | 101 +++++++++---------------
 1 file changed, 39 insertions(+), 62 deletions(-)

diff --git a/vignettes/web_only/cross-validation.Rmd b/vignettes/web_only/cross-validation.Rmd
index 9ede6b6f3..392e5bc7b 100644
--- a/vignettes/web_only/cross-validation.Rmd
+++ b/vignettes/web_only/cross-validation.Rmd
@@ -62,10 +62,10 @@ pcod$fyear <- as.factor(pcod$year)
 ```
 
 
-```{r ex1, eval = FALSE}
+```{r ex1, eval=TRUE}
 # Set parallel processing if desired:
-# library(future)
-# plan(multisession)
+library(future)
+plan(multisession, workers = 2)
 m_cv <- sdmTMB_cv(
   density ~ 0 + s(depth_scaled) + fyear,
   data = pcod,
@@ -78,7 +78,7 @@ m_cv <- sdmTMB_cv(
 In the above example, folds are assigned randomly---but these can be modified to specific spatial or temporal applications.
 Without getting into the complexities of the `blockCV` or `spatialsample` packages, we could simply use `kmeans` to generate spatial clusters, e.g.
 
-```{r ex2, eval = FALSE}
+```{r ex2, eval=TRUE}
 clust <- kmeans(pcod[, c("X", "Y")], 20)$cluster
 
 m_cv <- sdmTMB_cv(
@@ -86,14 +86,13 @@ m_cv <- sdmTMB_cv(
   data = pcod,
   mesh = mesh,
   fold_ids = clust,
-  family = tweedie(link = "log"),
-  k_folds = length(unique(clust))
+  family = tweedie(link = "log")
 )
 ```
 
 Or similarly, these clusters could be assigned in time---here, each year to a unique fold. Note that year is not included as a factor and spatiotemporal fields are turned off because they cannot be estimated in missing years.
 
-```{r ex3, eval = FALSE}
+```{r ex3, eval=TRUE}
 clust <- as.numeric(as.factor(pcod$year))
 
 m_cv <- sdmTMB_cv(
@@ -102,19 +101,17 @@ m_cv <- sdmTMB_cv(
   mesh = mesh,
   fold_ids = clust,
   spatiotemporal = "off",
-  family = tweedie(link = "log"),
-  k_folds = length(unique(clust))
+  family = tweedie(link = "log")
 )
 ```
 
 # Measuring model performance
 
 Lots of measures of predictive accuracy can be used to evaluate model performance.
-By default, `sdmTMB_cv()` returns a list that contains 2 measures: the log likelihoods for each fold (and total), and the expected log predictive density for each fold (and total).
-The latter (ELPD) is a measure of the predictive ability of the model for new observations, while the log-likelihood of the hold out data corresponds to the density for those particular observations.
+By default, `sdmTMB_cv()` returns a list that contains the sum of the log likelihoods for each left-out fold and the total summed across the left-out folds. This is roughly equivalent to the expected log predictive density (ELPD) in the Bayesian literature and can be interpreted as the predictive ability of the model for new observations.
 These can be accessed as below, and inspecting the quantities across folds may help elucidate whether there are particular folds that are difficult to predict.
 
-```{r ex4, eval = FALSE}
+```{r ex4, eval=TRUE}
 m_cv <- sdmTMB_cv(
   density ~ 0 + s(depth_scaled) + fyear,
   data = pcod,
@@ -123,20 +120,17 @@ m_cv <- sdmTMB_cv(
   k_folds = 4
 )
 
-m_cv$fold_elpd # fold ELPD
-m_cv$elpd # total ELPD
-
 m_cv$fold_loglik # fold log-likelihood
 m_cv$sum_loglik # total log-likelihood
 ```
 
-# Single splits and Leave Future Out Cross-Validation
+# Single splits
 
-In cases where only a single test set is evaluated (e.g. 10% of the data), using the `sdmTMB_cv()` function may be overkill because two `sdmTMB()` models will be fit, but using this function may be worthwhile to reduce coding errors (in the log-likelihood or ELPD calculations).
+In cases where only a single test set is evaluated (e.g., 10% of the data), using the `sdmTMB_cv()` function may be overkill because two `sdmTMB()` models will be fit, but using this function may be worthwhile to reduce coding errors (in the log-likelihood calculations).
 For example, here we assign two folds, randomly holding out 10% of the observations as a test set (the test set is given ID = 1, and the training set is given ID = 2).
 
 ```{r ex5, warning=FALSE, message=FALSE}
-clust <- sample(1:2, size = nrow(pcod), replace = T, prob = c(0.1, 0.9))
+clust <- sample(1:2, size = nrow(pcod), replace = TRUE, prob = c(0.1, 0.9))
 
 m_cv <- sdmTMB_cv(
   density ~ 0 + s(depth_scaled) + fyear,
@@ -148,47 +142,36 @@ m_cv <- sdmTMB_cv(
 )
 ```
 
-We can ignore the total log-likelihood and total ELPD, and just focus on the first elements of these lists, e.g.
+We can ignore the total log-likelihood, and just focus on the first element of list list:
 
 ```{r}
 m_cv$fold_loglik[[1]]
 ```
 
-If we wanted to do LFOCV, we could also use the `sdmTMB_cv()` function---though either way, it gets complicated because we need to change the data for each prediction.
-With the `pcod` dataset, the years are
+<!-- # Leave Future Out Cross-Validation -->
 
-```{r}
-unique(pcod$year)
-```
+<!-- We can do LFOCV using the relevant arguments. Our dataset has 9 time steps. Therefore, if we use the arguments `lfo_forecast=2` and `lfo_validations=3`, the cross validation will: -->
+<!-- - Fit data to time steps 1 to 5, predict and validate step 7. -->
+<!-- - Fit data to time steps 1 to 6, predict and validate step 8. -->
+<!-- - Fit data to time steps 1 to 7, predict and validate step 9. -->
 
-As above with temporal folds, we cannot include year as a factor and turn spatiotemporal fields off. We can use years 2011-2017 as test years.
-Two things to note are that if we specified time varying coefficients or a smooth on year effects ~ s(year), we'd want to specify missing years with the `extra_time` argument.
-Second, given the ways the folds are set up below, we have to extract the log-likelihood values for just the years of interest ourselves. Finally, it's also possible to do this same procedure using `sdmTMB()` rather than `sdmTMB_cv()`.
-
-```{r eval = FALSE}
-test_years <- c(2011, 2013, 2015, 2017)
-
-models <- list()
-log_lik <- list()
-for (i in 1:length(test_years)) {
-  clust <- rep(1, nrow(pcod))
-  clust[which(pcod$year < test_years[i])] <- 2
-
-  models[[i]] <- sdmTMB_cv(
-    density ~ 0 + s(depth_scaled),
-    data = pcod,
-    mesh = mesh,
-    spatiotemporal = "off",
-    fold_ids = clust,
-    family = tweedie(link = "log"),
-    k_folds = length(unique(clust))
-  )
-
-  log_lik[[i]] <- sum(m_cv$data$cv_loglik[which(m_cv$data$year == test_years[i])])
-}
-```
+```{r, eval=FALSE, include=FALSE}
+m_lfocv <- sdmTMB_cv(
+  present ~ s(year, k = 3),
+  data = pcod,
+  mesh = mesh,
+  lfo = TRUE, # do LFOCV
+  lfo_forecast = 2, # number of time steps to forecast
+  lfo_validations = 3, # number of time steps to validate
+  family = binomial(),
+  spatiotemporal = "off",
+  time = "year" # must be specified
+)
 
-Note: in the above model, we're not using year as a factor because doing so would not make it possible to predict on new years.
+# See how the LFOCV folds were assigned:
+example_data <- m_lfocv$models[[2]]$data
+table(example_data$cv_fold, example_data$year)
+```
 
 # Comparing two or more models
 
@@ -196,16 +179,15 @@ We can use the output of `sdmTMB_cv()` to compare two or more models.
 For example, if we wanted to evaluate the support for a depth effect or not, we could do 10-fold cross validation (it's important that the folds be the same across the two models).
 In this example, using either the predictive log-likelihood or ELPD would lead one to conclude that including depth improves the predictive accuracy of the model.
 
-```{r ex6, eval=FALSE}
-clust <- sample(1:10, size = nrow(pcod), replace = T)
+```{r ex6, eval=TRUE}
+clust <- sample(seq_len(10), size = nrow(pcod), replace = TRUE)
 
 m1 <- sdmTMB_cv(
   density ~ 0 + fyear,
   data = pcod,
   mesh = mesh,
   fold_ids = clust,
-  family = tweedie(link = "log"),
-  k_folds = length(unique(clust))
+  family = tweedie(link = "log")
 )
 
 m2 <- sdmTMB_cv(
@@ -213,17 +195,12 @@ m2 <- sdmTMB_cv(
   data = pcod,
   mesh = mesh,
   fold_ids = clust,
-  family = tweedie(link = "log"),
-  k_folds = length(unique(clust))
+  family = tweedie(link = "log")
 )
 
 # Compare log-likelihoods -- higher is better!
 m1$sum_loglik
 m2$sum_loglik
-
-# Compare ELPD -- higher is better!
-m1$elpd
-m2$elpd
 ```
 
 # Model ensembling
@@ -231,7 +208,7 @@ m2$elpd
 Finally, instead of identifying single "best" models, we may be interested in doing model averaging.
 In the sdmTMB package, we've implemented the model stacking procedure described by [@yao_2018] in the `sdmTMB_stacking()` function.
 This procedure uses optimization to find the normalized weights that maximize the total log-likelihood across models (other metrics may also be used).
-Inputs to the function are a list of models, where each list element is the output of a call to `sdmTMB_cv()`:
+Inputs to the function are a list of models (a fictitious `model_list`), where each list element is the output of a call to `sdmTMB_cv()`:
 
 ```{r ex7, eval=FALSE}
 weights <- sdmTMB_stacking(model_list)