
Commit

[R-package] [python-package] deprecate Dataset arguments to cv() and train() (#6446)
jameslamb authored May 11, 2024
1 parent ae55f32 commit a70e832
Showing 15 changed files with 185 additions and 61 deletions.
37 changes: 31 additions & 6 deletions R-package/R/lgb.cv.R
@@ -25,8 +25,8 @@ CVBooster <- R6::R6Class(
#' @description Cross validation logic used by LightGBM
#' @inheritParams lgb_shared_params
#' @param nfold the original dataset is randomly partitioned into \code{nfold} equal size subsamples.
#' @param label Vector of labels, used if \code{data} is not an \code{\link{lgb.Dataset}}
#' @param weight vector of response values. If not NULL, will set to dataset
#' @param label Deprecated. See "Deprecated Arguments" section below.
#' @param weight Deprecated. See "Deprecated Arguments" section below.
#' @param record Boolean, TRUE will record iteration message to \code{booster$record_evals}
#' @param showsd \code{boolean}, whether to show standard deviation of cross validation.
#' This parameter defaults to \code{TRUE}. Setting it to \code{FALSE} can lead to a
@@ -36,10 +36,8 @@ CVBooster <- R6::R6Class(
#' @param folds \code{list} provides a possibility to use a list of pre-defined CV folds
#' (each element must be a vector of test fold's indices). When folds are supplied,
#' the \code{nfold} and \code{stratified} parameters are ignored.
#' @param colnames feature names, if not null, will use this to overwrite the names in dataset
#' @param categorical_feature categorical features. This can either be a character vector of feature
#' names or an integer vector with the indices of the features (e.g.
#' \code{c(1L, 10L)} to say "the first and tenth columns").
#' @param colnames Deprecated. See "Deprecated Arguments" section below.
#' @param categorical_feature Deprecated. See "Deprecated Arguments" section below.
#' @param callbacks List of callback functions that are applied at each iteration.
#' @param reset_data Boolean, setting it to TRUE (not the default value) will transform the booster model
#' into a predictor model which frees up memory and the original datasets
@@ -70,6 +68,13 @@ CVBooster <- R6::R6Class(
#' , nfold = 3L
#' )
#' }
#'
#' @section Deprecated Arguments:
#'
#' A future release of \code{lightgbm} will require passing an \code{lgb.Dataset}
#' to argument \code{'data'}. It will also remove support for passing arguments
#' \code{'categorical_feature'}, \code{'colnames'}, \code{'label'}, and \code{'weight'}.
#'
#' @importFrom data.table data.table setorderv
#' @export
lgb.cv <- function(params = list()
@@ -102,12 +107,32 @@ lgb.cv <- function(params = list()

# If 'data' is not an lgb.Dataset, try to construct one using 'label'
if (!.is_Dataset(x = data)) {
warning(paste0(
"Passing anything other than an lgb.Dataset object to lgb.cv() is deprecated. "
, "Either pass an lgb.Dataset object, or use lightgbm()."
))
if (is.null(label)) {
stop("'label' must be provided for lgb.cv if 'data' is not an 'lgb.Dataset'")
}
data <- lgb.Dataset(data = data, label = label)
}

# raise deprecation warnings if necessary
# ref: https://github.com/microsoft/LightGBM/issues/6435
args <- names(match.call())
if ("categorical_feature" %in% args) {
.emit_dataset_kwarg_warning("categorical_feature", "lgb.cv")
}
if ("colnames" %in% args) {
.emit_dataset_kwarg_warning("colnames", "lgb.cv")
}
if ("label" %in% args) {
.emit_dataset_kwarg_warning("label", "lgb.cv")
}
if ("weight" %in% args) {
.emit_dataset_kwarg_warning("weight", "lgb.cv")
}

# set some parameters, resolving the way they were passed in with other parameters
# in `params`.
# this ensures that the model stored with Booster$save() correctly represents
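For readers migrating code affected by the lgb.cv() deprecations above, here is a minimal sketch (not part of this commit) of the recommended pattern: attach 'label', 'weight', 'colnames', and 'categorical_feature' to the lgb.Dataset rather than passing them to lgb.cv(). The data and parameter values are made up for illustration.

    library(lightgbm)

    X <- matrix(rnorm(400L), ncol = 4L)
    y <- rnorm(100L)
    w <- runif(100L)

    # previously: lgb.cv(data = X, label = y, weight = w, ...) built the Dataset
    # internally; that now emits a deprecation warning

    # preferred: build the lgb.Dataset explicitly, then pass it to lgb.cv()
    dtrain <- lgb.Dataset(
      data = X
      , label = y
      , weight = w
      , colnames = c("f1", "f2", "f3", "f4")
    )
    cv_results <- lgb.cv(
      params = list(objective = "regression")
      , data = dtrain
      , nrounds = 5L
      , nfold = 3L
    )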
23 changes: 19 additions & 4 deletions R-package/R/lgb.train.R
@@ -6,10 +6,8 @@
#' @inheritParams lgb_shared_params
#' @param valids a list of \code{lgb.Dataset} objects, used for validation
#' @param record Boolean, TRUE will record iteration message to \code{booster$record_evals}
#' @param colnames feature names, if not null, will use this to overwrite the names in dataset
#' @param categorical_feature categorical features. This can either be a character vector of feature
#' names or an integer vector with the indices of the features (e.g.
#' \code{c(1L, 10L)} to say "the first and tenth columns").
#' @param colnames Deprecated. See "Deprecated Arguments" section below.
#' @param categorical_feature Deprecated. See "Deprecated Arguments" section below.
#' @param callbacks List of callback functions that are applied at each iteration.
#' @param reset_data Boolean, setting it to TRUE (not the default value) will transform the
#' booster model into a predictor model which frees up memory and the
@@ -43,6 +41,13 @@
#' , early_stopping_rounds = 3L
#' )
#' }
#'
#' @section Deprecated Arguments:
#'
#' A future release of \code{lightgbm} will remove support for passing arguments
#' \code{'categorical_feature'} and \code{'colnames'}. Pass those things to
#' \code{lgb.Dataset} instead.
#'
#' @export
lgb.train <- function(params = list(),
data,
@@ -78,6 +83,16 @@ lgb.train <- function(params = list(),
}
}

# raise deprecation warnings if necessary
# ref: https://github.com/microsoft/LightGBM/issues/6435
args <- names(match.call())
if ("categorical_feature" %in% args) {
.emit_dataset_kwarg_warning("categorical_feature", "lgb.train")
}
if ("colnames" %in% args) {
.emit_dataset_kwarg_warning("colnames", "lgb.train")
}

# set some parameters, resolving the way they were passed in with other parameters
# in `params`.
# this ensures that the model stored with Booster$save() correctly represents
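The deprecation checks added to both lgb.cv() and lgb.train() rely on names(match.call()) to see which arguments the caller actually supplied. A standalone sketch (not part of this commit) of that mechanism, using a hypothetical function f():

    f <- function(data, colnames = NULL, categorical_feature = NULL) {
      # match.call() records only arguments explicitly supplied by the caller,
      # so their names can be checked without comparing values against defaults
      supplied <- names(match.call())
      if ("colnames" %in% supplied) {
        warning("'colnames' was passed explicitly")
      }
      return(invisible(supplied))
    }

    f(data = 1L)                   # no warning
    f(data = 1L, colnames = NULL)  # warns: supplied explicitly, even though the value matches the default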
21 changes: 16 additions & 5 deletions R-package/R/lightgbm.R
@@ -144,6 +144,12 @@ NULL
#'
#' \emph{New in version 4.0.0}
#'
#' @param colnames Character vector of features. Only used if \code{data} is not an \code{\link{lgb.Dataset}}.
#' @param categorical_feature categorical features. This can either be a character vector of feature
#' names or an integer vector with the indices of the features (e.g.
#' \code{c(1L, 10L)} to say "the first and tenth columns").
#' Only used if \code{data} is not an \code{\link{lgb.Dataset}}.
#'
#' @param ... Additional arguments passed to \code{\link{lgb.train}}. For example
#' \itemize{
#' \item{\code{valids}: a list of \code{lgb.Dataset} objects, used for validation}
@@ -152,10 +158,6 @@ NULL
#' \code{binary}, \code{lambdarank}, \code{multiclass}, \code{multiclass}}
#' \item{\code{eval}: evaluation function, can be (a list of) character or custom eval function}
#' \item{\code{record}: Boolean, TRUE will record iteration message to \code{booster$record_evals}}
#' \item{\code{colnames}: feature names, if not null, will use this to overwrite the names in dataset}
#' \item{\code{categorical_feature}: categorical features. This can either be a character vector of feature
#' names or an integer vector with the indices of the features (e.g. \code{c(1L, 10L)} to
#' say "the first and tenth columns").}
#' \item{\code{reset_data}: Boolean, setting it to TRUE (not the default value) will transform the booster model
#' into a predictor model which frees up memory and the original datasets}
#' }
@@ -176,6 +178,8 @@ lightgbm <- function(data,
objective = "auto",
init_score = NULL,
num_threads = NULL,
colnames = NULL,
categorical_feature = NULL,
...) {

# validate inputs early to avoid unnecessary computation
@@ -221,7 +225,14 @@

# Check whether data is lgb.Dataset, if not then create lgb.Dataset manually
if (!.is_Dataset(x = dtrain)) {
dtrain <- lgb.Dataset(data = data, label = label, weight = weights, init_score = init_score)
dtrain <- lgb.Dataset(
data = data
, label = label
, weight = weights
, init_score = init_score
, categorical_feature = categorical_feature
, colnames = colnames
)
}

train_args <- list(
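To illustrate the new pass-through in lightgbm() shown above: when 'data' is a raw matrix, 'colnames' and 'categorical_feature' are now forwarded to the internally constructed lgb.Dataset. A rough sketch with made-up data (column 4 treated as categorical, by index):

    X <- matrix(sample(0L:5L, 400L, replace = TRUE), ncol = 4L)
    y <- rnorm(100L)

    model <- lightgbm(
      data = X
      , label = y
      , objective = "regression"
      , nrounds = 5L
      , colnames = c("f1", "f2", "f3", "cat1")
      , categorical_feature = 4L  # by index; "cat1" should also work since colnames are set
      , verbose = -1L
    )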
16 changes: 16 additions & 0 deletions R-package/R/utils.R
@@ -260,3 +260,19 @@
return(a == b)
}
}

# ref: https://github.com/microsoft/LightGBM/issues/6435
.emit_dataset_kwarg_warning <- function(argname, calling_function) {
msg <- sprintf(
paste0(
"Argument '%s' to %s() is deprecated and will be removed in a future release. "
, "Set '%s' with lgb.Dataset() instead. "
, "See https://github.com/microsoft/LightGBM/issues/6435."
)
, argname
, calling_function
, argname
)
warning(msg)
return(invisible(NULL))
}
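As a quick check of the message template above, calling the (internal) helper with named arguments produces a warning along these lines — shown as a sketch, not as output captured from this commit:

    .emit_dataset_kwarg_warning(argname = "weight", calling_function = "lgb.cv")
    # Warning message:
    # Argument 'weight' to lgb.cv() is deprecated and will be removed in a future release.
    # Set 'weight' with lgb.Dataset() instead.
    # See https://github.com/microsoft/LightGBM/issues/6435.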
19 changes: 13 additions & 6 deletions R-package/man/lgb.cv.Rd

(Generated file; diff not rendered.)

15 changes: 11 additions & 4 deletions R-package/man/lgb.train.Rd

(Generated file; diff not rendered.)

13 changes: 9 additions & 4 deletions R-package/man/lightgbm.Rd

(Generated file; diff not rendered.)

22 changes: 13 additions & 9 deletions R-package/tests/testthat/test_basic.R
@@ -433,7 +433,7 @@ test_that("lgb.cv() rejects negative or 0 value passed to nrounds", {
}
})

test_that("lgb.cv() throws an informative error is 'data' is not an lgb.Dataset and labels are not given", {
test_that("lgb.cv() throws an informative error if 'data' is not an lgb.Dataset and labels are not given", {
bad_values <- list(
4L
, "hello"
@@ -1788,11 +1788,6 @@ test_that("lgb.train() works with early stopping for regression with a metric th


test_that("lgb.train() supports non-ASCII feature names", {
dtrain <- lgb.Dataset(
data = matrix(rnorm(400L), ncol = 4L)
, label = rnorm(100L)
, params = list(num_threads = .LGB_MAX_THREADS)
)
# content below is equivalent to
#
# feature_names <- c("F_零", "F_一", "F_二", "F_三")
@@ -1805,6 +1800,12 @@ test_that("lgb.train() supports non-ASCII feature names", {
, rawToChar(as.raw(c(0x46, 0x5f, 0xe4, 0xba, 0x8c)))
, rawToChar(as.raw(c(0x46, 0x5f, 0xe4, 0xb8, 0x89)))
)
dtrain <- lgb.Dataset(
data = matrix(rnorm(400L), ncol = 4L)
, label = rnorm(100L)
, params = list(num_threads = .LGB_MAX_THREADS)
, colnames = feature_names
)
bst <- lgb.train(
data = dtrain
, nrounds = 5L
@@ -1814,7 +1815,6 @@
, verbose = .LGB_VERBOSITY
, num_threads = .LGB_MAX_THREADS
)
, colnames = feature_names
)
expect_true(.is_Booster(bst))
dumped_model <- jsonlite::fromJSON(bst$dump_model())
@@ -2838,7 +2838,11 @@

test_that(paste0("lgb.train() gives same results when using interaction_constraints and specifying colnames"), {
set.seed(1L)
dtrain <- lgb.Dataset(train$data, label = train$label, params = list(num_threads = .LGB_MAX_THREADS))
dtrain <- lgb.Dataset(
train$data
, label = train$label
, params = list(num_threads = .LGB_MAX_THREADS)
)

params <- list(
objective = "regression"
@@ -2854,6 +2858,7 @@ test_that(paste0("lgb.train() gives same results when using interaction_constrai
pred1 <- bst$predict(test$data)

new_colnames <- paste0(colnames(train$data), "_x")
dtrain$set_colnames(new_colnames)
params <- list(
objective = "regression"
, interaction_constraints = list(c(new_colnames[1L], new_colnames[2L]), new_colnames[3L])
@@ -2864,7 +2869,6 @@ test_that(paste0("lgb.train() gives same results when using interaction_constrai
data = dtrain
, params = params
, nrounds = 2L
, colnames = new_colnames
)
pred2 <- bst$predict(test$data)

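The updated tests above rename features on the Dataset itself instead of passing 'colnames' to lgb.train(). A minimal sketch of that pattern with made-up data, using the Dataset's set_colnames() method:

    X <- matrix(rnorm(300L), ncol = 3L)
    dtrain <- lgb.Dataset(data = X, label = rnorm(100L))
    dtrain$set_colnames(c("feat_a", "feat_b", "feat_c"))

    bst <- lgb.train(
      params = list(
        objective = "regression"
        , interaction_constraints = list(c("feat_a", "feat_b"), "feat_c")
      )
      , data = dtrain
      , nrounds = 2L
    )
    preds <- bst$predict(X)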
(Diffs for the remaining changed files are not shown here.)
