set_threads() helper (#605)

mllg · web-flow · commit 3fac1e2741c0 · 2021-02-05T10:09:44.000+01:00
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -69,6 +69,7 @@ Imports:
     mlbench,
     mlr3measures (>= 0.3.0),
     mlr3misc (>= 0.7.0),
+    parallelly,
     palmerpenguins,
     paradox (>= 0.6.0),
     uuid
@@ -189,6 +190,7 @@ Collate:
     'predict.R'
     'reexports.R'
     'resample.R'
+    'set_threads.R'
     'task_converters.R'
     'worker.R'
     'zzz.R'
diff --git a/NAMESPACE b/NAMESPACE
@@ -62,6 +62,8 @@ S3method(predict,Learner)
 S3method(print,PredictionData)
 S3method(rd_info,Learner)
 S3method(rd_info,Task)
+S3method(set_threads,Learner)
+S3method(set_threads,list)
 export(BenchmarkResult)
 export(DataBackend)
 export(DataBackendDataTable)
@@ -163,6 +165,8 @@ export(msrs)
 export(resample)
 export(rsmp)
 export(rsmps)
+export(set.threads.default)
+export(set_threads)
 export(tgen)
 export(tgens)
 export(tsk)
diff --git a/R/LearnerClassifDebug.R b/R/LearnerClassifDebug.R
@@ -18,6 +18,7 @@
 #'    \item{segfault_predict:}{Probability to provokes a segfault during predict.}
 #'    \item{predict_missing}{Ratio of predictions which will be NA.}
 #'    \item{save_tasks:}{Saves input task in `model` slot during training and prediction.}
+#'    \item{threads:}{Number of threads to use. Has no effect.}
 #'    \item{x:}{Numeric tuning parameter. Has no effect.}
 #' }
 #' Note that segfaults may not be triggered on your operating system.
@@ -66,6 +67,7 @@ LearnerClassifDebug = R6Class("LearnerClassifDebug", inherit = LearnerClassif,
             ParamDbl$new("segfault_predict", lower = 0, upper = 1, default = 0, tags = "predict"),
             ParamDbl$new("predict_missing", lower = 0, upper = 1, default = 0, tags = "predict"),
             ParamLgl$new("save_tasks", default = FALSE, tags = c("train", "predict")),
+            ParamInt$new("threads", lower = 1, tags = c("train", "threads")),
             ParamDbl$new("x", lower = 0, upper = 1, tags = "train")
           )
         ),
diff --git a/R/set_threads.R b/R/set_threads.R
@@ -0,0 +1,53 @@
+#' @title Set the Number of Threads
+#'
+#' @description
+#' Control the parallelism via threading while calling external packages from \CRANpkg{mlr3}.
+#'
+#' For example, the random forest implementation in package \CRANpkg{ranger} (connected
+#' via \CRANpkg{mlr3learners}) supports threading via OpenMP.
+#' The number of threads to use can be set via hyperparameter `num.threads`, and
+#' defaults to 1. By calling `set_threads(x, 4)` with `x` being a ranger learner, the
+#' hyperparameter is changed so that 4 cores are used.
+#'
+#' For a [Learner], the hyperparameters to update are the ones tagged with `"threads"`
+#' in the learner's parameter set.
+#' If the object `x` does not support threading, `x` is returned as-is.
+#' If applied to a list, `set_threads()` is called on each list element.
+#'
+#' Note that threading is incompatible with other parallelization techniques such as forking
+#' via the [future::multicore] plan (see [future::plan]). For this reason all learners
+#' connected to \CRANpkg{mlr3} have threading disabled in their defaults.
+#'
+#' @param x (`any`)\cr
+#'   Object to set threads for, e.g. a [Learner].
+#'   This object is modified in-place.
+#' @param n (`integer(1)`)\cr
+#'   Number of threads to use.
+#'   Must be a single positive count; this is asserted before dispatching to a method.
+#'   Defaults to the value of [parallelly::availableCores()].
+#'
+#' @return Same object as input `x` (changed in-place),
+#'   with possibly updated parameter values.
+#'   For a list input, a list holding the result of `set_threads()` for each element.
+#' @export
+set_threads = function(x, n = parallelly::availableCores()) {
+  # validate once in the generic, before dispatching on the class of `x`
+  assert_count(n, positive = TRUE)
+  UseMethod("set_threads")
+}
+
+#' @rdname set_threads
+#' @export
+set_threads.default = function(x, n = parallelly::availableCores()) { # nolint
+  # Fallback for objects without threading support: hand the input back untouched.
+  # `n` is intentionally unused (and therefore never evaluated) here.
+  x
+}
+
+# Backward-compatible alias for the previously misspelled method name
+# (`set.threads.default`, with dots instead of the underscore of the generic).
+# Under the old name the function was never picked up as the default S3 method
+# of `set_threads()`. Regenerate NAMESPACE with roxygen2 so that
+# `S3method(set_threads,default)` gets registered.
+set.threads.default = set_threads.default # nolint
+
+#' @rdname set_threads
+#' @export
+set_threads.Learner = function(x, n = parallelly::availableCores()) { # nolint
+  # ids of all hyperparameters tagged with "threads"; empty if the learner has none
+  id = x$param_set$ids(tags = "threads")
+  if (length(id) > 0L) {
+    # parallelly::availableCores() returns a *named* integer (the name reports how
+    # the count was determined). as.integer() drops that name and also turns
+    # whole-number doubles such as `2` into integers, so a plain integer is stored
+    # as hyperparameter value.
+    x$param_set$values = insert_named(x$param_set$values, named_list(id, as.integer(n)))
+  }
+  # the learner is modified in-place; return it to allow chaining
+  x
+}
+
+#' @rdname set_threads
+#' @export
+set_threads.list = function(x, n = parallelly::availableCores()) { # nolint
+  # Dispatch `set_threads()` separately for every element and collect the
+  # results in a new list (names of `x` are carried over by lapply()).
+  lapply(x, function(element) {
+    set_threads(element, n = n)
+  })
+}
diff --git a/README.Rmd b/README.Rmd
@@ -142,6 +142,9 @@ Also, many helpful R libraries did not exist at the time [mlr](https://github.co
   All user input is checked with [`checkmate`](https://cran.r-project.org/package=checkmate).
   Return types are documented, and mechanisms popular in base R which "simplify" the result unpredictably (e.g., `sapply()` or `drop` argument in `[.data.frame`) are avoided.
 * Be light on dependencies. `mlr3` requires the following packages at runtime:
+    - [`parallelly`](https://cran.r-project.org/package=parallelly):
+      Helper functions for parallelization.
+      No extra recursive dependencies.
     - [`future.apply`](https://cran.r-project.org/package=future.apply):
       Resampling and benchmarking is parallelized with the [`future`](https://cran.r-project.org/package=future) abstraction interfacing many parallel backends.
     - [`backports`](https://cran.r-project.org/package=backports):
@@ -178,6 +181,9 @@ Also, many helpful R libraries did not exist at the time [mlr](https://github.co
     - [`mlbench`](https://cran.r-project.org/package=mlbench):
       A collection of machine learning data sets.
       No dependencies.
+    - [`palmerpenguins`](https://cran.r-project.org/package=palmerpenguins):
+      A classification data set about penguins, used in examples and provided as a
+      toy task.  No dependencies.
 * [Reflections](https://en.wikipedia.org/wiki/Reflection_%28computer_programming%29): Objects are queryable for properties and capabilities, allowing you to program on them.
 * Additional functionality that comes with extra dependencies:
     - To capture output, warnings and exceptions, [`evaluate`](https://cran.r-project.org/package=evaluate) and [`callr`](https://cran.r-project.org/package=callr) can be used.
diff --git a/README.md b/README.md
@@ -148,9 +148,9 @@ rr$score(measure)
 ```
 
     ##                 task  task_id                   learner    learner_id
-    ## 1: <TaskClassif[45]> penguins <LearnerClassifRpart[34]> classif.rpart
-    ## 2: <TaskClassif[45]> penguins <LearnerClassifRpart[34]> classif.rpart
-    ## 3: <TaskClassif[45]> penguins <LearnerClassifRpart[34]> classif.rpart
+    ## 1: <TaskClassif[46]> penguins <LearnerClassifRpart[34]> classif.rpart
+    ## 2: <TaskClassif[46]> penguins <LearnerClassifRpart[34]> classif.rpart
+    ## 3: <TaskClassif[46]> penguins <LearnerClassifRpart[34]> classif.rpart
     ##            resampling resampling_id iteration              prediction
     ## 1: <ResamplingCV[19]>            cv         1 <PredictionClassif[19]>
     ## 2: <ResamplingCV[19]>            cv         2 <PredictionClassif[19]>
@@ -217,6 +217,9 @@ would result in non-trivial API changes.
     argument in `[.data.frame`) are avoided.
 -   Be light on dependencies. `mlr3` requires the following packages at
     runtime:
+    -   [`parallelly`](https://cran.r-project.org/package=parallelly):
+        Helper functions for parallelization. No extra recursive
+        dependencies.
     -   [`future.apply`](https://cran.r-project.org/package=future.apply):
         Resampling and benchmarking is parallelized with the
         [`future`](https://cran.r-project.org/package=future)
@@ -248,6 +251,9 @@ would result in non-trivial API changes.
         Performance measures. No extra recursive dependencies.
     -   [`mlbench`](https://cran.r-project.org/package=mlbench): A
         collection of machine learning data sets. No dependencies.
+    -   [`palmerpenguins`](https://cran.r-project.org/package=palmerpenguins):
+        A classification data set about penguins, used in examples and
+        provided as a toy task. No dependencies.
 -   [Reflections](https://en.wikipedia.org/wiki/Reflection_%28computer_programming%29):
     Objects are queryable for properties and capabilities, allowing you
     to program on them.
diff --git a/inst/testthat/helper_expectations.R b/inst/testthat/helper_expectations.R
@@ -314,6 +314,7 @@ expect_learner = function(lrn, task = NULL) {
   checkmate::expect_choice(lrn$task_type, mlr3::mlr_reflections$task_types$type)
   checkmate::expect_character(lrn$packages, any.missing = FALSE, min.chars = 1L, unique = TRUE)
   checkmate::expect_class(lrn$param_set, "ParamSet")
+  testthat::expect_lte(length(lrn$param_set$ids(tags = "threads")), 1L)
   checkmate::expect_character(lrn$properties, any.missing = FALSE, min.chars = 1L, unique = TRUE)
   if (is.null(private(lrn)$.train)) {
     checkmate::expect_function(lrn$train_internal, args = "task", nargs = 1L)
diff --git a/man/mlr_learners_classif.debug.Rd b/man/mlr_learners_classif.debug.Rd
diff --git a/man/set_threads.Rd b/man/set_threads.Rd
diff --git a/tests/testthat/test_set_threads.R b/tests/testthat/test_set_threads.R
@@ -0,0 +1,13 @@
+test_that("set_threads", {
+  # calling with the default `n` must still yield a valid learner
+  # (presumably this learner has no parameter tagged "threads" -- TODO confirm)
+  lrn_featureless = lrn("classif.featureless")
+  expect_learner(set_threads(lrn_featureless))
+
+  # the debug learner has a `threads` parameter which is unset after construction
+  lrn_debug = lrn("classif.debug")
+  expect_null(lrn_debug$param_set$values$threads)
+  expect_learner(set_threads(lrn_debug, 1))
+  expect_equal(lrn_debug$param_set$values$threads, 1)
+
+  # list method: the learners inside the list are updated in-place
+  learners = list(lrn_featureless, lrn_debug)
+  expect_list(set_threads(learners, 2))
+  expect_equal(lrn_debug$param_set$values$threads, 2)
+})