e1071::svm(): Use formula interface only if factors are present (mlr-org#1740)

mb706 · vrodriguezf · commit 0facddd68bc7 · 2021-01-16T13:43:04.000+01:00
* Testing svm with many features task * svm use data.frame instead of formula * spaces around match operator * Only use svm data.frame interface if task is all numeric * Deploy from Travis build 13884 [ci skip] Build URL: https://travis-ci.org/mlr-org/mlr/builds/542175846 Commit: 5565287 * add NEWS entry * Deploy from Travis build 13922 [ci skip] Build URL: https://travis-ci.org/mlr-org/mlr/builds/546742364 Commit: de67d1a
diff --git a/NEWS.md b/NEWS.md
@@ -11,6 +11,7 @@
   See `?regr.randomForest` for more details.  
   `regr.ranger` relies on the functions provided by the package ("jackknife" and "infjackknife" (default))  
   (@jakob-r, #1784)
+- `e1071::svm()` now only uses the formula interface if factors are present. This change is intended to prevent the "stack overflow" errors some users encountered when using large datasets. See #1738 for more information. (@mb706, #1740)
 
 ## functions - general
 - `getClassWeightParam()` now also works for Wrapper* Models and ensemble models (@ja-thomas, #891)
diff --git a/R/RLearner_classif_svm.R b/R/RLearner_classif_svm.R
@@ -28,9 +28,16 @@ makeRLearner.classif.svm = function() {
 }
 
 #' @export
-trainLearner.classif.svm = function(.learner, .task, .subset, .weights = NULL, ...) {
-  f = getTaskFormula(.task)
-  e1071::svm(f, data = getTaskData(.task, .subset), probability = .learner$predict.type == "prob", ...)
+trainLearner.classif.svm = function(.learner, .task, .subset, .weights = NULL,  ...) {
+  if (sum(getTaskDesc(.task)$n.feat[c("factors", "ordered")]) > 0) {
+    # use the formula interface if factor or ordered features are present
+    f = getTaskFormula(.task)
+    e1071::svm(f, data = getTaskData(.task, .subset), probability = .learner$predict.type == "prob", ...)
+  } else {
+    # if no factor or ordered features are present, pass features and target separately (the "data.frame" approach) instead of a formula, to prevent issues like https://github.com/mlr-org/mlr/issues/1738
+    d = getTaskData(.task, .subset, target.extra = TRUE)
+    e1071::svm(d$data, d$target, probability = .learner$predict.type == "prob", ...)
+  }
 }
 
 #' @export
diff --git a/R/RLearner_regr_svm.R b/R/RLearner_regr_svm.R
@@ -27,9 +27,14 @@ makeRLearner.regr.svm = function() {
 }
 
 #' @export
-trainLearner.regr.svm = function(.learner, .task, .subset, .weights = NULL, ...) {
-  f = getTaskFormula(.task)
-  e1071::svm(f, data = getTaskData(.task, .subset), ...)
+trainLearner.regr.svm = function(.learner, .task, .subset, .weights = NULL,  ...) {
+  if (sum(getTaskDesc(.task)$n.feat[c("factors", "ordered")]) > 0) {
+    f = getTaskFormula(.task)
+    e1071::svm(f, data = getTaskData(.task, .subset), ...)
+  } else {
+    d = getTaskData(.task, .subset, target.extra = TRUE)
+    e1071::svm(d$data, d$target, ...)
+  }
 }
 
 #' @export
diff --git a/docs/news/index.html b/docs/news/index.html
diff --git a/tests/testthat/test_classif_svm.R b/tests/testthat/test_classif_svm.R
@@ -54,3 +54,11 @@ test_that("classif_svm", {
   preds = predict(model, multiclass.task)
   expect_lt(performance(preds), 0.3)
 })
+
+test_that("classif_svm with many features", {
+  set.seed(8008135)
+  xt = cbind(as.data.frame(matrix(rnorm(4e4), ncol = 2e4)), x = as.factor(c("a", "b")))
+  xt.task = makeClassifTask("xt", xt, "x")
+  # the task has many (20000) numeric features, for which the formula interface fails; the test passes if train() does not error
+  train("classif.svm", xt.task)
+})
diff --git a/tests/testthat/test_regr_svm.R b/tests/testthat/test_regr_svm.R
@@ -30,3 +30,12 @@ test_that("regr_svm", {
 
   testCVParsets("regr.svm", regr.df, regr.target, tune.train = tt, tune.predict = tp, parset.list = parset.list)
 })
+
+test_that("regr_svm with many features", {
+  set.seed(8008135)
+  xt = cbind(as.data.frame(matrix(rnorm(4e4), ncol = 2e4)), x = 1:2)
+  xt.task = makeRegrTask("xt", xt, "x")
+  # the task has many (20000) numeric features, for which the formula interface fails; the test passes if train() does not error
+  train("regr.svm", xt.task)
+})
+