
Commit

[R-package] [python-package] deprecate Dataset arguments to cv() and train() (#6446)
jameslamb authored May 11, 2024
1 parent ae55f32 commit a70e832
Showing 15 changed files with 185 additions and 61 deletions.
37 changes: 31 additions & 6 deletions R-package/R/lgb.cv.R
@@ -25,8 +25,8 @@ CVBooster <- R6::R6Class(
#' @description Cross validation logic used by LightGBM
#' @inheritParams lgb_shared_params
#' @param nfold the original dataset is randomly partitioned into \code{nfold} equal size subsamples.
#' @param label Vector of labels, used if \code{data} is not an \code{\link{lgb.Dataset}}
#' @param weight vector of response values. If not NULL, will set to dataset
#' @param label Deprecated. See "Deprecated Arguments" section below.
#' @param weight Deprecated. See "Deprecated Arguments" section below.
#' @param record Boolean, TRUE will record iteration message to \code{booster$record_evals}
#' @param showsd \code{boolean}, whether to show standard deviation of cross validation.
#' This parameter defaults to \code{TRUE}. Setting it to \code{FALSE} can lead to a
@@ -36,10 +36,8 @@ CVBooster <- R6::R6Class(
#' @param folds \code{list} provides a possibility to use a list of pre-defined CV folds
#' (each element must be a vector of test fold's indices). When folds are supplied,
#' the \code{nfold} and \code{stratified} parameters are ignored.
#' @param colnames feature names, if not null, will use this to overwrite the names in dataset
#' @param categorical_feature categorical features. This can either be a character vector of feature
#' names or an integer vector with the indices of the features (e.g.
#' \code{c(1L, 10L)} to say "the first and tenth columns").
#' @param colnames Deprecated. See "Deprecated Arguments" section below.
#' @param categorical_feature Deprecated. See "Deprecated Arguments" section below.
#' @param callbacks List of callback functions that are applied at each iteration.
#' @param reset_data Boolean, setting it to TRUE (not the default value) will transform the booster model
#' into a predictor model which frees up memory and the original datasets
@@ -70,6 +68,13 @@ CVBooster <- R6::R6Class(
#' , nfold = 3L
#' )
#' }
#'
#' @section Deprecated Arguments:
#'
#' A future release of \code{lightgbm} will require passing an \code{lgb.Dataset}
#' to argument \code{'data'}. It will also remove support for passing arguments
#' \code{'categorical_feature'}, \code{'colnames'}, \code{'label'}, and \code{'weight'}.
#'
#' @importFrom data.table data.table setorderv
#' @export
lgb.cv <- function(params = list()
@@ -102,12 +107,32 @@ lgb.cv <- function(params = list()

# If 'data' is not an lgb.Dataset, try to construct one using 'label'
if (!.is_Dataset(x = data)) {
warning(paste0(
"Passing anything other than an lgb.Dataset object to lgb.cv() is deprecated. "
, "Either pass an lgb.Dataset object, or use lightgbm()."
))
if (is.null(label)) {
stop("'label' must be provided for lgb.cv if 'data' is not an 'lgb.Dataset'")
}
data <- lgb.Dataset(data = data, label = label)
}

# raise deprecation warnings if necessary
# ref: https://github.com/microsoft/LightGBM/issues/6435
args <- names(match.call())
if ("categorical_feature" %in% args) {
.emit_dataset_kwarg_warning("categorical_feature", "lgb.cv")
}
if ("colnames" %in% args) {
.emit_dataset_kwarg_warning("colnames", "lgb.cv")
}
if ("label" %in% args) {
.emit_dataset_kwarg_warning("label", "lgb.cv")
}
if ("weight" %in% args) {
.emit_dataset_kwarg_warning("weight", "lgb.cv")
}

# set some parameters, resolving the way they were passed in with other parameters
# in `params`.
# this ensures that the model stored with Booster$save() correctly represents
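For readers migrating code affected by the lgb.cv() deprecations above, here is a minimal sketch (not part of this commit) of the recommended pattern: attach 'label', 'weight', 'colnames', and 'categorical_feature' to the lgb.Dataset rather than passing them to lgb.cv(). The data and parameter values are made up for illustration.

    library(lightgbm)

    X <- matrix(rnorm(400L), ncol = 4L)
    y <- rnorm(100L)
    w <- runif(100L)

    # previously: lgb.cv(data = X, label = y, weight = w, ...) built the Dataset
    # internally; that now emits a deprecation warning

    # preferred: build the lgb.Dataset explicitly, then pass it to lgb.cv()
    dtrain <- lgb.Dataset(
      data = X
      , label = y
      , weight = w
      , colnames = c("f1", "f2", "f3", "f4")
    )
    cv_results <- lgb.cv(
      params = list(objective = "regression")
      , data = dtrain
      , nrounds = 5L
      , nfold = 3L
    )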
23 changes: 19 additions & 4 deletions R-package/R/lgb.train.R
@@ -6,10 +6,8 @@
#' @inheritParams lgb_shared_params
#' @param valids a list of \code{lgb.Dataset} objects, used for validation
#' @param record Boolean, TRUE will record iteration message to \code{booster$record_evals}
#' @param colnames feature names, if not null, will use this to overwrite the names in dataset
#' @param categorical_feature categorical features. This can either be a character vector of feature
#' names or an integer vector with the indices of the features (e.g.
#' \code{c(1L, 10L)} to say "the first and tenth columns").
#' @param colnames Deprecated. See "Deprecated Arguments" section below.
#' @param categorical_feature Deprecated. See "Deprecated Arguments" section below.
#' @param callbacks List of callback functions that are applied at each iteration.
#' @param reset_data Boolean, setting it to TRUE (not the default value) will transform the
#' booster model into a predictor model which frees up memory and the
@@ -43,6 +41,13 @@
#' , early_stopping_rounds = 3L
#' )
#' }
#'
#' @section Deprecated Arguments:
#'
#' A future release of \code{lightgbm} will remove support for passing arguments
#' \code{'categorical_feature'} and \code{'colnames'}. Pass those things to
#' \code{lgb.Dataset} instead.
#'
#' @export
lgb.train <- function(params = list(),
data,
@@ -78,6 +83,16 @@ lgb.train <- function(params = list(),
}
}

# raise deprecation warnings if necessary
# ref: https://github.com/microsoft/LightGBM/issues/6435
args <- names(match.call())
if ("categorical_feature" %in% args) {
.emit_dataset_kwarg_warning("categorical_feature", "lgb.train")
}
if ("colnames" %in% args) {
.emit_dataset_kwarg_warning("colnames", "lgb.train")
}

# set some parameters, resolving the way they were passed in with other parameters
# in `params`.
# this ensures that the model stored with Booster$save() correctly represents
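The deprecation checks added to both lgb.cv() and lgb.train() rely on names(match.call()) to see which arguments the caller actually supplied. A standalone sketch (not part of this commit) of that mechanism, using a hypothetical function f():

    f <- function(data, colnames = NULL, categorical_feature = NULL) {
      # match.call() records only arguments explicitly supplied by the caller,
      # so their names can be checked without comparing values against defaults
      supplied <- names(match.call())
      if ("colnames" %in% supplied) {
        warning("'colnames' was passed explicitly")
      }
      return(invisible(supplied))
    }

    f(data = 1L)                   # no warning
    f(data = 1L, colnames = NULL)  # warns: supplied explicitly, even though the value matches the default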
21 changes: 16 additions & 5 deletions R-package/R/lightgbm.R
@@ -144,6 +144,12 @@ NULL
#'
#' \emph{New in version 4.0.0}
#'
#' @param colnames Character vector of features. Only used if \code{data} is not an \code{\link{lgb.Dataset}}.
#' @param categorical_feature categorical features. This can either be a character vector of feature
#' names or an integer vector with the indices of the features (e.g.
#' \code{c(1L, 10L)} to say "the first and tenth columns").
#' Only used if \code{data} is not an \code{\link{lgb.Dataset}}.
#'
#' @param ... Additional arguments passed to \code{\link{lgb.train}}. For example
#' \itemize{
#' \item{\code{valids}: a list of \code{lgb.Dataset} objects, used for validation}
@@ -152,10 +158,6 @@ NULL
#' \code{binary}, \code{lambdarank}, \code{multiclass}, \code{multiclass}}
#' \item{\code{eval}: evaluation function, can be (a list of) character or custom eval function}
#' \item{\code{record}: Boolean, TRUE will record iteration message to \code{booster$record_evals}}
#' \item{\code{colnames}: feature names, if not null, will use this to overwrite the names in dataset}
#' \item{\code{categorical_feature}: categorical features. This can either be a character vector of feature
#' names or an integer vector with the indices of the features (e.g. \code{c(1L, 10L)} to
#' say "the first and tenth columns").}
#' \item{\code{reset_data}: Boolean, setting it to TRUE (not the default value) will transform the booster model
#' into a predictor model which frees up memory and the original datasets}
#' }
@@ -176,6 +178,8 @@ lightgbm <- function(data,
objective = "auto",
init_score = NULL,
num_threads = NULL,
colnames = NULL,
categorical_feature = NULL,
...) {

# validate inputs early to avoid unnecessary computation
@@ -221,7 +225,14 @@

# Check whether data is lgb.Dataset, if not then create lgb.Dataset manually
if (!.is_Dataset(x = dtrain)) {
dtrain <- lgb.Dataset(data = data, label = label, weight = weights, init_score = init_score)
dtrain <- lgb.Dataset(
data = data
, label = label
, weight = weights
, init_score = init_score
, categorical_feature = categorical_feature
, colnames = colnames
)
}

train_args <- list(
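To illustrate the new pass-through in lightgbm() shown above: when 'data' is a raw matrix, 'colnames' and 'categorical_feature' are now forwarded to the internally constructed lgb.Dataset. A rough sketch with made-up data (column 4 treated as categorical, by index):

    X <- matrix(sample(0L:5L, 400L, replace = TRUE), ncol = 4L)
    y <- rnorm(100L)

    model <- lightgbm(
      data = X
      , label = y
      , objective = "regression"
      , nrounds = 5L
      , colnames = c("f1", "f2", "f3", "cat1")
      , categorical_feature = 4L  # by index; "cat1" should also work since colnames are set
      , verbose = -1L
    )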
16 changes: 16 additions & 0 deletions R-package/R/utils.R
@@ -260,3 +260,19 @@
return(a == b)
}
}

# ref: https://github.com/microsoft/LightGBM/issues/6435
.emit_dataset_kwarg_warning <- function(argname, calling_function) {
msg <- sprintf(
paste0(
"Argument '%s' to %s() is deprecated and will be removed in a future release. "
, "Set '%s' with lgb.Dataset() instead. "
, "See https://github.com/microsoft/LightGBM/issues/6435."
)
, argname
, calling_function
, argname
)
warning(msg)
return(invisible(NULL))
}
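As a quick check of the message template above, calling the (internal) helper with named arguments produces a warning along these lines — shown as a sketch, not as output captured from this commit:

    .emit_dataset_kwarg_warning(argname = "weight", calling_function = "lgb.cv")
    # Warning message:
    # Argument 'weight' to lgb.cv() is deprecated and will be removed in a future release.
    # Set 'weight' with lgb.Dataset() instead.
    # See https://github.com/microsoft/LightGBM/issues/6435.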
19 changes: 13 additions & 6 deletions R-package/man/lgb.cv.Rd

(Generated file; diff not rendered.)

15 changes: 11 additions & 4 deletions R-package/man/lgb.train.Rd

(Generated file; diff not rendered.)

13 changes: 9 additions & 4 deletions R-package/man/lightgbm.Rd

(Generated file; diff not rendered.)

22 changes: 13 additions & 9 deletions R-package/tests/testthat/test_basic.R
@@ -433,7 +433,7 @@ test_that("lgb.cv() rejects negative or 0 value passed to nrounds", {
}
})

test_that("lgb.cv() throws an informative error is 'data' is not an lgb.Dataset and labels are not given", {
test_that("lgb.cv() throws an informative error if 'data' is not an lgb.Dataset and labels are not given", {
bad_values <- list(
4L
, "hello"
@@ -1788,11 +1788,6 @@ test_that("lgb.train() works with early stopping for regression with a metric th


test_that("lgb.train() supports non-ASCII feature names", {
dtrain <- lgb.Dataset(
data = matrix(rnorm(400L), ncol = 4L)
, label = rnorm(100L)
, params = list(num_threads = .LGB_MAX_THREADS)
)
# content below is equivalent to
#
# feature_names <- c("F_零", "F_一", "F_二", "F_三")
@@ -1805,6 +1800,12 @@ test_that("lgb.train() supports non-ASCII feature names", {
, rawToChar(as.raw(c(0x46, 0x5f, 0xe4, 0xba, 0x8c)))
, rawToChar(as.raw(c(0x46, 0x5f, 0xe4, 0xb8, 0x89)))
)
dtrain <- lgb.Dataset(
data = matrix(rnorm(400L), ncol = 4L)
, label = rnorm(100L)
, params = list(num_threads = .LGB_MAX_THREADS)
, colnames = feature_names
)
bst <- lgb.train(
data = dtrain
, nrounds = 5L
@@ -1814,7 +1815,6 @@
, verbose = .LGB_VERBOSITY
, num_threads = .LGB_MAX_THREADS
)
, colnames = feature_names
)
expect_true(.is_Booster(bst))
dumped_model <- jsonlite::fromJSON(bst$dump_model())
@@ -2838,7 +2838,11 @@

test_that(paste0("lgb.train() gives same results when using interaction_constraints and specifying colnames"), {
set.seed(1L)
dtrain <- lgb.Dataset(train$data, label = train$label, params = list(num_threads = .LGB_MAX_THREADS))
dtrain <- lgb.Dataset(
train$data
, label = train$label
, params = list(num_threads = .LGB_MAX_THREADS)
)

params <- list(
objective = "regression"
@@ -2854,6 +2858,7 @@ test_that(paste0("lgb.train() gives same results when using interaction_constrai
pred1 <- bst$predict(test$data)

new_colnames <- paste0(colnames(train$data), "_x")
dtrain$set_colnames(new_colnames)
params <- list(
objective = "regression"
, interaction_constraints = list(c(new_colnames[1L], new_colnames[2L]), new_colnames[3L])
@@ -2864,7 +2869,6 @@ test_that(paste0("lgb.train() gives same results when using interaction_constrai
data = dtrain
, params = params
, nrounds = 2L
, colnames = new_colnames
)
pred2 <- bst$predict(test$data)

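The updated tests above rename features on the Dataset itself instead of passing 'colnames' to lgb.train(). A minimal sketch of that pattern with made-up data, using the Dataset's set_colnames() method:

    X <- matrix(rnorm(300L), ncol = 3L)
    dtrain <- lgb.Dataset(data = X, label = rnorm(100L))
    dtrain$set_colnames(c("feat_a", "feat_b", "feat_c"))

    bst <- lgb.train(
      params = list(
        objective = "regression"
        , interaction_constraints = list(c("feat_a", "feat_b"), "feat_c")
      )
      , data = dtrain
      , nrounds = 2L
    )
    preds <- bst$predict(X)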
(Diffs for the remaining changed files are not shown here.)
