
Commit 7f6a8ab

[SPARK-31777][ML][PYSPARK] Add user-specified fold column to CrossValidator
### What changes were proposed in this pull request?

This patch adds user-specified fold column support to `CrossValidator`. Users can assign fold numbers to the dataset instead of letting Spark do random splits.

### Why are the changes needed?

This gives `CrossValidator` users more flexibility in splitting folds.

### Does this PR introduce _any_ user-facing change?

Yes, a new `foldCol` param is added to `CrossValidator`. Users can use it to specify custom fold splitting.

### How was this patch tested?

Added unit tests.

Closes #28704 from viirya/SPARK-31777.

Authored-by: Liang-Chi Hsieh <viirya@gmail.com>
Signed-off-by: Liang-Chi Hsieh <liangchi@uber.com>
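In practice, the new param slots into the existing `CrossValidator` builder chain. A minimal usage sketch (the DataFrame `df` and its `fold` column are illustrative assumptions, not part of this patch):

```scala
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator
import org.apache.spark.ml.tuning.{CrossValidator, ParamGridBuilder}

// Assumption: `df` already carries an IntegerType column "fold" with
// values in [0, 3); out-of-range fold numbers make the fit throw.
val lr = new LogisticRegression()
val grid = new ParamGridBuilder()
  .addGrid(lr.regParam, Array(0.01, 0.1))
  .build()

val cv = new CrossValidator()
  .setEstimator(lr)
  .setEstimatorParamMaps(grid)
  .setEvaluator(new BinaryClassificationEvaluator())
  .setNumFolds(3)
  .setFoldCol("fold")  // skip the random k-fold split, use user-supplied folds

val cvModel = cv.fit(df)
```

For each fold `i`, the rows whose fold number equals `i` form the validation set and all remaining rows form the training set; the fold column itself is dropped before fitting.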
1 parent: 2ec9b86

File tree: 6 files changed, +308 −25 lines


mllib/src/main/scala/org/apache/spark/ml/tuning/CrossValidator.scala

Lines changed: 36 additions & 7 deletions
```diff
@@ -30,13 +30,13 @@ import org.apache.spark.annotation.Since
 import org.apache.spark.internal.Logging
 import org.apache.spark.ml.{Estimator, Model}
 import org.apache.spark.ml.evaluation.Evaluator
-import org.apache.spark.ml.param.{IntParam, ParamMap, ParamValidators}
+import org.apache.spark.ml.param.{IntParam, Param, ParamMap, ParamValidators}
 import org.apache.spark.ml.param.shared.{HasCollectSubModels, HasParallelism}
 import org.apache.spark.ml.util._
 import org.apache.spark.ml.util.Instrumentation.instrumented
 import org.apache.spark.mllib.util.MLUtils
 import org.apache.spark.sql.{DataFrame, Dataset}
-import org.apache.spark.sql.types.StructType
+import org.apache.spark.sql.types.{IntegerType, StructType}
 import org.apache.spark.util.ThreadUtils
 
 /**
@@ -56,6 +56,19 @@ private[ml] trait CrossValidatorParams extends ValidatorParams {
   def getNumFolds: Int = $(numFolds)
 
   setDefault(numFolds -> 3)
+
+  /**
+   * Param for the column name of user specified fold number. Once this is specified,
+   * `CrossValidator` won't do random k-fold split. Note that this column should be
+   * integer type with range [0, numFolds) and Spark will throw exception on out-of-range
+   * fold numbers.
+   */
+  val foldCol: Param[String] = new Param[String](this, "foldCol",
+    "the column name of user specified fold number")
+
+  def getFoldCol: String = $(foldCol)
+
+  setDefault(foldCol, "")
 }
 
 /**
@@ -94,6 +107,10 @@ class CrossValidator @Since("1.2.0") (@Since("1.4.0") override val uid: String)
   @Since("2.0.0")
   def setSeed(value: Long): this.type = set(seed, value)
 
+  /** @group setParam */
+  @Since("3.1.0")
+  def setFoldCol(value: String): this.type = set(foldCol, value)
+
   /**
    * Set the maximum level of parallelism to evaluate models in parallel.
    * Default is 1 for serial evaluation
@@ -132,7 +149,7 @@ class CrossValidator @Since("1.2.0") (@Since("1.4.0") override val uid: String)
 
     instr.logPipelineStage(this)
     instr.logDataset(dataset)
-    instr.logParams(this, numFolds, seed, parallelism)
+    instr.logParams(this, numFolds, seed, parallelism, foldCol)
     logTuningParams(instr)
 
     val collectSubModelsParam = $(collectSubModels)
@@ -142,10 +159,15 @@ class CrossValidator @Since("1.2.0") (@Since("1.4.0") override val uid: String)
     } else None
 
     // Compute metrics for each model over each split
-    val splits = MLUtils.kFold(dataset.toDF.rdd, $(numFolds), $(seed))
+    val (splits, schemaWithoutFold) = if ($(foldCol) == "") {
+      (MLUtils.kFold(dataset.toDF.rdd, $(numFolds), $(seed)), schema)
+    } else {
+      val filteredSchema = StructType(schema.filter(_.name != $(foldCol)).toArray)
+      (MLUtils.kFold(dataset.toDF, $(numFolds), $(foldCol)), filteredSchema)
+    }
     val metrics = splits.zipWithIndex.map { case ((training, validation), splitIndex) =>
-      val trainingDataset = sparkSession.createDataFrame(training, schema).cache()
-      val validationDataset = sparkSession.createDataFrame(validation, schema).cache()
+      val trainingDataset = sparkSession.createDataFrame(training, schemaWithoutFold).cache()
+      val validationDataset = sparkSession.createDataFrame(validation, schemaWithoutFold).cache()
       instr.logDebug(s"Train split $splitIndex with multiple sets of parameters.")
 
       // Fit models in a Future for training in parallel
@@ -183,7 +205,14 @@ class CrossValidator @Since("1.2.0") (@Since("1.4.0") override val uid: String)
   }
 
   @Since("1.4.0")
-  override def transformSchema(schema: StructType): StructType = transformSchemaImpl(schema)
+  override def transformSchema(schema: StructType): StructType = {
+    if ($(foldCol) != "") {
+      val foldColDt = schema.apply($(foldCol)).dataType
+      require(foldColDt.isInstanceOf[IntegerType],
+        s"The specified `foldCol` column ${$(foldCol)} must be integer type, but got $foldColDt.")
+    }
+    transformSchemaImpl(schema)
+  }
 
   @Since("1.4.0")
   override def copy(extra: ParamMap): CrossValidator = {
```
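Since the split is now driven entirely by the user's fold assignments, the caller has to produce a valid fold column up front. A hedged sketch of one way to do that, mirroring what the test suite below does (the `dataset` and column names are illustrative):

```scala
import org.apache.spark.sql.functions.{col, rand, when}

// Bucket a seeded uniform draw into folds 0, 1, 2 of roughly equal size.
// The integer literals make the column IntegerType, which transformSchema requires.
val foldCol = when(col("random") < 0.33, 0)
  .when(col("random") < 0.66, 1)
  .otherwise(2)
val datasetWithFold = dataset
  .withColumn("random", rand(100L))
  .withColumn("fold", foldCol)
  .drop("random")
```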

mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala

Lines changed: 32 additions & 2 deletions
```diff
@@ -20,15 +20,15 @@ package org.apache.spark.mllib.util
 import scala.annotation.varargs
 import scala.reflect.ClassTag
 
-import org.apache.spark.SparkContext
+import org.apache.spark.{SparkContext, SparkException}
 import org.apache.spark.annotation.Since
 import org.apache.spark.internal.Logging
 import org.apache.spark.ml.linalg.{MatrixUDT => MLMatrixUDT, VectorUDT => MLVectorUDT}
 import org.apache.spark.mllib.linalg._
 import org.apache.spark.mllib.linalg.BLAS.dot
 import org.apache.spark.mllib.regression.LabeledPoint
 import org.apache.spark.rdd.{PartitionwiseSampledRDD, RDD}
-import org.apache.spark.sql.{DataFrame, Dataset, SparkSession}
+import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}
 import org.apache.spark.sql.execution.datasources.DataSource
 import org.apache.spark.sql.execution.datasources.text.TextFileFormat
 import org.apache.spark.sql.functions._
@@ -248,6 +248,36 @@ object MLUtils extends Logging {
     }.toArray
   }
 
+  /**
+   * Version of `kFold()` taking a fold column name.
+   */
+  @Since("3.1.0")
+  def kFold(df: DataFrame, numFolds: Int, foldColName: String): Array[(RDD[Row], RDD[Row])] = {
+    val foldCol = df.col(foldColName)
+    val checker = udf { foldNum: Int =>
+      // Valid fold number is in range [0, numFolds).
+      if (foldNum < 0 || foldNum >= numFolds) {
+        throw new SparkException(s"Fold number must be in range [0, $numFolds), but got $foldNum.")
+      }
+      true
+    }
+    (0 until numFolds).map { fold =>
+      val training = df
+        .filter(checker(foldCol) && foldCol =!= fold)
+        .drop(foldColName).rdd
+      val validation = df
+        .filter(checker(foldCol) && foldCol === fold)
+        .drop(foldColName).rdd
+      if (training.isEmpty()) {
+        throw new SparkException(s"The training data at fold $fold is empty.")
+      }
+      if (validation.isEmpty()) {
+        throw new SparkException(s"The validation data at fold $fold is empty.")
+      }
+      (training, validation)
+    }.toArray
+  }
+
   /**
    * Returns a new vector with `1.0` (bias) appended to the input vector.
    */
```
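The new overload can also be called directly: each array element pairs a training `RDD[Row]` (rows with fold number != i) with a validation `RDD[Row]` (rows with fold number == i), and the fold column is dropped from both sides. A small sketch, assuming an active `SparkSession` named `spark` (the toy DataFrame is illustrative):

```scala
import org.apache.spark.mllib.util.MLUtils

// 100 rows, alternately assigned to folds 0 and 1 via an IntegerType column.
val data = spark.range(100).selectExpr("id", "cast(id % 2 as int) as fold")

val folds = MLUtils.kFold(data, numFolds = 2, foldColName = "fold")
folds.zipWithIndex.foreach { case ((train, validation), i) =>
  println(s"fold $i: train=${train.count()}, validation=${validation.count()}")
}
```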

mllib/src/test/scala/org/apache/spark/ml/tuning/CrossValidatorSuite.scala

Lines changed: 64 additions & 0 deletions
```diff
@@ -32,6 +32,7 @@ import org.apache.spark.ml.regression.LinearRegression
 import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTest, MLTestingUtils}
 import org.apache.spark.mllib.util.LinearDataGenerator
 import org.apache.spark.sql.Dataset
+import org.apache.spark.sql.functions._
 import org.apache.spark.sql.types.StructType
 
 class CrossValidatorSuite
@@ -40,10 +41,14 @@ class CrossValidatorSuite
   import testImplicits._
 
   @transient var dataset: Dataset[_] = _
+  @transient var datasetWithFold: Dataset[_] = _
 
   override def beforeAll(): Unit = {
     super.beforeAll()
     dataset = sc.parallelize(generateLogisticInput(1.0, 1.0, 100, 42), 2).toDF()
+    val dfWithRandom = dataset.repartition(1).withColumn("random", rand(100L))
+    val foldCol = when(col("random") < 0.33, 0).when(col("random") < 0.66, 1).otherwise(2)
+    datasetWithFold = dfWithRandom.withColumn("fold", foldCol).drop("random").repartition(2)
   }
 
   test("cross validation with logistic regression") {
@@ -75,6 +80,65 @@ class CrossValidatorSuite
     }
   }
 
+  test("cross validation with logistic regression with fold col") {
+    val lr = new LogisticRegression
+    val lrParamMaps = new ParamGridBuilder()
+      .addGrid(lr.regParam, Array(0.001, 1000.0))
+      .addGrid(lr.maxIter, Array(0, 10))
+      .build()
+    val eval = new BinaryClassificationEvaluator
+    val cv = new CrossValidator()
+      .setEstimator(lr)
+      .setEstimatorParamMaps(lrParamMaps)
+      .setEvaluator(eval)
+      .setNumFolds(3)
+      .setFoldCol("fold")
+    val cvModel = cv.fit(datasetWithFold)
+
+    MLTestingUtils.checkCopyAndUids(cv, cvModel)
+
+    val parent = cvModel.bestModel.parent.asInstanceOf[LogisticRegression]
+    assert(parent.getRegParam === 0.001)
+    assert(parent.getMaxIter === 10)
+    assert(cvModel.avgMetrics.length === lrParamMaps.length)
+
+    val result = cvModel.transform(dataset).select("prediction").as[Double].collect()
+    testTransformerByGlobalCheckFunc[(Double, Vector)](dataset.toDF(), cvModel, "prediction") {
+      rows =>
+        val result2 = rows.map(_.getDouble(0))
+        assert(result === result2)
+    }
+  }
+
+  test("cross validation with logistic regression with wrong fold col") {
+    val lr = new LogisticRegression
+    val lrParamMaps = new ParamGridBuilder()
+      .addGrid(lr.regParam, Array(0.001, 1000.0))
+      .addGrid(lr.maxIter, Array(0, 10))
+      .build()
+    val eval = new BinaryClassificationEvaluator
+    val cv = new CrossValidator()
+      .setEstimator(lr)
+      .setEstimatorParamMaps(lrParamMaps)
+      .setEvaluator(eval)
+      .setNumFolds(3)
+      .setFoldCol("fold1")
+    val err1 = intercept[IllegalArgumentException] {
+      cv.fit(datasetWithFold)
+    }
+    assert(err1.getMessage.contains("fold1 does not exist. Available: label, features, fold"))
+
+    // Fold column must be integer type.
+    val foldCol = udf(() => 1L)
+    val datasetWithWrongFoldType = dataset.withColumn("fold1", foldCol())
+    val err2 = intercept[IllegalArgumentException] {
+      cv.fit(datasetWithWrongFoldType)
+    }
+    assert(err2
+      .getMessage
+      .contains("The specified `foldCol` column fold1 must be integer type, but got LongType."))
+  }
+
   test("cross validation with linear regression") {
     val dataset = sc.parallelize(
       LinearDataGenerator.generateLinearInput(
```

mllib/src/test/scala/org/apache/spark/mllib/util/MLUtilsSuite.scala

Lines changed: 30 additions & 0 deletions
```diff
@@ -353,4 +353,34 @@ class MLUtilsSuite extends SparkFunSuite with MLlibTestSparkContext {
       convertMatrixColumnsFromML(df, "p._2")
     }
   }
+
+  test("kFold with fold column") {
+    val data = sc.parallelize(1 to 100, 2).map(x => (x, if (x <= 50) 0 else 1)).toDF("i", "fold")
+    val collectedData = data.collect().map(_.getInt(0)).sorted
+    val twoFoldedRdd = kFold(data, 2, "fold")
+    assert(twoFoldedRdd(0)._1.collect().map(_.getInt(0)).sorted ===
+      twoFoldedRdd(1)._2.collect().map(_.getInt(0)).sorted)
+    assert(twoFoldedRdd(0)._2.collect().map(_.getInt(0)).sorted ===
+      twoFoldedRdd(1)._1.collect().map(_.getInt(0)).sorted)
+
+    val result1 = twoFoldedRdd(0)._1.union(twoFoldedRdd(0)._2).collect().map(_.getInt(0)).sorted
+    assert(result1 === collectedData,
+      "Each training+validation set combined should contain all of the data.")
+    val result2 = twoFoldedRdd(1)._1.union(twoFoldedRdd(1)._2).collect().map(_.getInt(0)).sorted
+    assert(result2 === collectedData,
+      "Each training+validation set combined should contain all of the data.")
+  }
+
+  test("kFold with fold column: invalid fold numbers") {
+    val data = sc.parallelize(Seq(0, 1, 2), 2).toDF("fold")
+    val err1 = intercept[SparkException] {
+      kFold(data, 2, "fold")(0)._1.collect()
+    }
+    assert(err1.getMessage.contains("Fold number must be in range [0, 2), but got 2."))
+
+    val err2 = intercept[SparkException] {
+      kFold(data, 4, "fold")(0)._1.collect()
+    }
+    assert(err2.getMessage.contains("The validation data at fold 3 is empty."))
+  }
 }
```

python/pyspark/ml/tests/test_tuning.py

Lines changed: 74 additions & 0 deletions
```diff
@@ -380,6 +380,80 @@ def test_save_load_pipeline_estimator(self):
                                               original_nested_pipeline_model.stages):
             self.assertEqual(loadedStage.uid, originalStage.uid)
 
+    def test_user_specified_folds(self):
+        from pyspark.sql import functions as F
+
+        dataset = self.spark.createDataFrame(
+            [(Vectors.dense([0.0]), 0.0),
+             (Vectors.dense([0.4]), 1.0),
+             (Vectors.dense([0.5]), 0.0),
+             (Vectors.dense([0.6]), 1.0),
+             (Vectors.dense([1.0]), 1.0)] * 10,
+            ["features", "label"]).repartition(2, "features")
+
+        dataset_with_folds = dataset.repartition(1).withColumn("random", rand(100)) \
+            .withColumn("fold", F.when(F.col("random") < 0.33, 0)
+                        .when(F.col("random") < 0.66, 1)
+                        .otherwise(2)).repartition(2, "features")
+
+        lr = LogisticRegression()
+        grid = ParamGridBuilder().addGrid(lr.maxIter, [20]).build()
+        evaluator = BinaryClassificationEvaluator()
+
+        cv = CrossValidator(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator, numFolds=3)
+        cv_with_user_folds = CrossValidator(estimator=lr,
+                                            estimatorParamMaps=grid,
+                                            evaluator=evaluator,
+                                            numFolds=3,
+                                            foldCol="fold")
+
+        self.assertEqual(cv.getEstimator().uid, cv_with_user_folds.getEstimator().uid)
+
+        cvModel1 = cv.fit(dataset)
+        cvModel2 = cv_with_user_folds.fit(dataset_with_folds)
+        for index in range(len(cvModel1.avgMetrics)):
+            print(abs(cvModel1.avgMetrics[index] - cvModel2.avgMetrics[index]))
+            self.assertTrue(abs(cvModel1.avgMetrics[index] - cvModel2.avgMetrics[index])
+                            < 0.1)
+
+        # test save/load of CrossValidator
+        temp_path = tempfile.mkdtemp()
+        cvPath = temp_path + "/cv"
+        cv_with_user_folds.save(cvPath)
+        loadedCV = CrossValidator.load(cvPath)
+        self.assertEqual(loadedCV.getFoldCol(), cv_with_user_folds.getFoldCol())
+
+    def test_invalid_user_specified_folds(self):
+        from pyspark.sql import functions as F
+
+        dataset_with_folds = self.spark.createDataFrame(
+            [(Vectors.dense([0.0]), 0.0, 0),
+             (Vectors.dense([0.4]), 1.0, 1),
+             (Vectors.dense([0.5]), 0.0, 2),
+             (Vectors.dense([0.6]), 1.0, 0),
+             (Vectors.dense([1.0]), 1.0, 1)] * 10,
+            ["features", "label", "fold"])
+
+        lr = LogisticRegression()
+        grid = ParamGridBuilder().addGrid(lr.maxIter, [20]).build()
+        evaluator = BinaryClassificationEvaluator()
+
+        cv = CrossValidator(estimator=lr,
+                            estimatorParamMaps=grid,
+                            evaluator=evaluator,
+                            numFolds=2,
+                            foldCol="fold")
+        with self.assertRaisesRegexp(Exception, "Fold number must be in range"):
+            cv.fit(dataset_with_folds)
+
+        cv = CrossValidator(estimator=lr,
+                            estimatorParamMaps=grid,
+                            evaluator=evaluator,
+                            numFolds=4,
+                            foldCol="fold")
+        with self.assertRaisesRegexp(Exception, "The validation data at fold 3 is empty"):
+            cv.fit(dataset_with_folds)
+
 
 class TrainValidationSplitTests(SparkSessionTestCase):
 
```
