Skip to content

survival model fit augmentation #960

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 13 commits into from
May 5, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
Package: parsnip
Title: A Common API to Modeling and Analysis Functions
Version: 1.1.0.9000
Version: 1.1.0.9001
Authors@R: c(
person("Max", "Kuhn", , "max@posit.co", role = c("aut", "cre")),
person("Davis", "Vaughan", , "davis@posit.co", role = "aut"),
Expand Down
2 changes: 2 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
# parsnip (development version)

* `augment()` now works for censored regression models.

# parsnip 1.1.0

This release of parsnip contains a number of new features and bug fixes, accompanied by several optimizations that substantially decrease the time to `fit()` and `predict()` with the package.
Expand Down
112 changes: 80 additions & 32 deletions R/augment.R
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,34 @@
#'
#' `augment()` will add column(s) for predictions to the given data.
#'
#' @param x A `model_fit` object produced by [fit.model_spec()] or
#' [fit_xy.model_spec()].
#' @param eval_time For censored regression models, a vector of time points at
#' which the survival probability is estimated.
#' @details
#'
#' ## Regression
#' For regression models, a `.pred` column is added. If `x` was created using
#' [fit.model_spec()] and `new_data` contains the outcome column, a `.resid` column is
#' also added.
#' [fit.model_spec()] and `new_data` contains a regression outcome column, a
#' `.resid` column is also added.
#'
#' ## Classification
#'
#' For classification models, the results can include a column called
#' `.pred_class` as well as class probability columns named `.pred_{level}`.
#' This depends on what type of prediction types are available for the model.
#' @param x A `model_fit` object produced by [fit.model_spec()] or
#' [fit_xy.model_spec()] .
#'
#' ## Censored Regression
#'
#' For these models, predictions for the expected time and survival probability
#' are created (if the model engine supports them). If the model supports
#' survival prediction, the `eval_time` argument is required.
#'
#' If survival predictions are created and `new_data` contains a
#' [survival::Surv()] object, additional columns are added for inverse
#' probability of censoring weights (IPCW) are also created. This enables the
#' user to compute performance metrics in the \pkg{yardstick} package.
#'
#' @param new_data A data frame or matrix.
#' @param ... Not currently used.
#' @rdname augment
Expand Down Expand Up @@ -56,36 +75,65 @@
#' augment(cls_xy, cls_tst)
#' augment(cls_xy, cls_tst[, -3])
#'
augment.model_fit <- function(x, new_data, ...) {
augment.model_fit <- function(x, new_data, eval_time = NULL, ...) {
new_data <- tibble::new_tibble(new_data)
res <-
switch(
x$spec$mode,
"regression" = augment_regression(x, new_data),
"classification" = augment_classification(x, new_data),
"censored regression" = augment_censored(x, new_data, eval_time = eval_time),
rlang::abort(paste("Unknown mode:", x$spec$mode))
)
tibble::new_tibble(res)
}

augment_regression <- function(x, new_data) {
ret <- new_data
if (x$spec$mode == "regression") {
check_spec_pred_type(x, "numeric")
ret <-
ret %>%
dplyr::bind_cols(
predict(x, new_data = new_data)
)
if (length(x$preproc$y_var) > 0) {
y_nm <- x$preproc$y_var
if (any(names(new_data) == y_nm)) {
ret <- dplyr::mutate(ret, .resid = !!rlang::sym(y_nm) - .pred)
}
check_spec_pred_type(x, "numeric")
ret <- dplyr::bind_cols(predict(x, new_data = new_data), ret)
if (length(x$preproc$y_var) > 0) {
y_nm <- x$preproc$y_var
if (any(names(new_data) == y_nm)) {
ret <- dplyr::mutate(ret, .resid = !!rlang::sym(y_nm) - .pred)
}
} else if (x$spec$mode == "classification") {
if (spec_has_pred_type(x, "class")) {
ret <- dplyr::bind_cols(
ret,
predict(x, new_data = new_data, type = "class")
)
}
if (spec_has_pred_type(x, "prob")) {
ret <- dplyr::bind_cols(
ret,
predict(x, new_data = new_data, type = "prob")
)
}
dplyr::relocate(ret, dplyr::starts_with(".pred"), dplyr::starts_with(".resid"))
}

augment_classification <- function(x, new_data) {
ret <- new_data

if (spec_has_pred_type(x, "prob")) {
ret <- dplyr::bind_cols(predict(x, new_data = new_data, type = "prob"), ret)
}

if (spec_has_pred_type(x, "class")) {
ret <- dplyr::bind_cols(predict(x, new_data = new_data, type = "class"), ret)
}
ret
}

# nocov start
# tested in tidymodels/extratests#
augment_censored <- function(x, new_data, eval_time = NULL) {
ret <- new_data

if (spec_has_pred_type(x, "time")) {
ret <- dplyr::bind_cols(predict(x, new_data = new_data, type = "time"), ret)
}

if (spec_has_pred_type(x, "survival")) {
.filter_eval_time(eval_time)
ret <- dplyr::bind_cols(
predict(x, new_data = new_data, type = "survival", eval_time = eval_time),
ret)
# Add inverse probability weights when the outcome is present in new_data
y_col <- .find_surv_col(new_data, fail = FALSE)
if (length(y_col) != 0) {
ret <- .censoring_weights_graf(x, ret)
}
} else {
rlang::abort(paste("Unknown mode:", x$spec$mode))
}
as_tibble(ret)
ret
}
# nocov end
4 changes: 2 additions & 2 deletions R/survival-censoring-weights.R
Original file line number Diff line number Diff line change
Expand Up @@ -257,11 +257,11 @@ add_graf_weights_vec <- function(object, .pred, surv_obj, trunc = 0.05, eps = 10
vctrs::vec_chop(y, sizes = num_times)
}

.find_surv_col <- function(x, call = rlang::env_parent()) {
.find_surv_col <- function(x, call = rlang::env_parent(), fail = TRUE) {
is_lst_col <- purrr::map_lgl(x, purrr::is_list)
is_surv <- purrr::map_lgl(x[!is_lst_col], .is_surv, fail = FALSE)
num_surv <- sum(is_surv)
if (num_surv != 1) {
if (fail && num_surv != 1) {
rlang::abort("There should be a single column of class `Surv`", call = call)
}
names(is_surv)[is_surv]
Expand Down
29 changes: 25 additions & 4 deletions man/augment.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

30 changes: 17 additions & 13 deletions tests/testthat/test_augment.R
Original file line number Diff line number Diff line change
Expand Up @@ -6,27 +6,31 @@ test_that('regression models', {

expect_equal(
colnames(augment(reg_form, head(mtcars))),
c("mpg", "cyl", "disp", "hp", "drat", "wt", "qsec", "vs", "am",
"gear", "carb", ".pred", ".resid")
c( ".pred", ".resid",
"mpg", "cyl", "disp", "hp", "drat", "wt", "qsec", "vs", "am",
"gear", "carb")
)
expect_equal(nrow(augment(reg_form, head(mtcars))), 6)
expect_equal(
colnames(augment(reg_form, head(mtcars[, -1]))),
c("cyl", "disp", "hp", "drat", "wt", "qsec", "vs", "am",
"gear", "carb", ".pred")
c(".pred",
"cyl", "disp", "hp", "drat", "wt", "qsec", "vs", "am",
"gear", "carb")
)
expect_equal(nrow(augment(reg_form, head(mtcars[, -1]))), 6)

expect_equal(
colnames(augment(reg_xy, head(mtcars))),
c("mpg", "cyl", "disp", "hp", "drat", "wt", "qsec", "vs", "am",
"gear", "carb", ".pred")
c(".pred",
"mpg", "cyl", "disp", "hp", "drat", "wt", "qsec", "vs", "am",
"gear", "carb")
)
expect_equal(nrow(augment(reg_xy, head(mtcars))), 6)
expect_equal(
colnames(augment(reg_xy, head(mtcars[, -1]))),
c("cyl", "disp", "hp", "drat", "wt", "qsec", "vs", "am",
"gear", "carb", ".pred")
c(".pred",
"cyl", "disp", "hp", "drat", "wt", "qsec", "vs", "am",
"gear", "carb")
)
expect_equal(nrow(augment(reg_xy, head(mtcars[, -1]))), 6)

Expand All @@ -49,23 +53,23 @@ test_that('classification models', {

expect_equal(
colnames(augment(cls_form, head(two_class_dat))),
c("A", "B", "Class", ".pred_class", ".pred_Class1", ".pred_Class2")
c(".pred_class", ".pred_Class1", ".pred_Class2", "A", "B", "Class")
)
expect_equal(nrow(augment(cls_form, head(two_class_dat))), 6)
expect_equal(
colnames(augment(cls_form, head(two_class_dat[, -3]))),
c("A", "B", ".pred_class", ".pred_Class1", ".pred_Class2")
c(".pred_class", ".pred_Class1", ".pred_Class2", "A", "B")
)
expect_equal(nrow(augment(cls_form, head(two_class_dat[, -3]))), 6)

expect_equal(
colnames(augment(cls_xy, head(two_class_dat))),
c("A", "B", "Class", ".pred_class", ".pred_Class1", ".pred_Class2")
c(".pred_class", ".pred_Class1", ".pred_Class2", "A", "B", "Class")
)
expect_equal(nrow(augment(cls_xy, head(two_class_dat))), 6)
expect_equal(
colnames(augment(cls_xy, head(two_class_dat[, -3]))),
c("A", "B", ".pred_class", ".pred_Class1", ".pred_Class2")
c(".pred_class", ".pred_Class1", ".pred_Class2", "A", "B")
)
expect_equal(nrow(augment(cls_xy, head(two_class_dat[, -3]))), 6)

Expand All @@ -81,7 +85,7 @@ test_that('augment for model without class probabilities', {

expect_equal(
colnames(augment(cls_form, head(two_class_dat))),
c("A", "B", "Class", ".pred_class")
c(".pred_class", "A", "B", "Class")
)
expect_equal(nrow(augment(cls_form, head(two_class_dat))), 6)

Expand Down