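Address review comments: rename computeError to computeWeightedError and drop the explicit broadcast of trees in per-iteration error evaluation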

zhengruifeng · zhengruifeng · commit 02457a785c61 · 2019-10-16T15:17:07.000+08:00
diff --git a/mllib/src/main/scala/org/apache/spark/ml/tree/impl/GradientBoostedTrees.scala b/mllib/src/main/scala/org/apache/spark/ml/tree/impl/GradientBoostedTrees.scala
@@ -169,7 +169,7 @@ private[spark] object GradientBoostedTrees extends Logging {
    * @param loss evaluation metric.
    * @return Measure of model error on data
    */
-  def computeError(
+  def computeWeightedError(
       data: RDD[Instance],
       trees: Array[DecisionTreeRegressionModel],
       treeWeights: Array[Double],
@@ -179,7 +179,7 @@ private[spark] object GradientBoostedTrees extends Logging {
         updatePrediction(features, acc, model, weight)
       }
       (loss.computeError(predicted, label) * weight, weight)
-    }.treeReduce{ case ((err1, weight1), (err2, weight2)) =>
+    }.treeReduce { case ((err1, weight1), (err2, weight2)) =>
         (err1 + err2, weight1 + weight2)
     }
     errSum / weightSum
@@ -191,13 +191,13 @@ private[spark] object GradientBoostedTrees extends Logging {
    * @param predError Prediction and error.
    * @return Measure of model error on data
    */
-  def computeError(
+  def computeWeightedError(
       data: RDD[Instance],
       predError: RDD[(Double, Double)]): Double = {
     val (errSum, weightSum) = data.zip(predError).map {
       case (Instance(_, weight, _), (_, err)) =>
         (err * weight, weight)
-    }.treeReduce{ case ((err1, weight1), (err2, weight2)) =>
+    }.treeReduce { case ((err1, weight1), (err2, weight2)) =>
       (err1 + err2, weight1 + weight2)
     }
     errSum / weightSum
@@ -220,24 +220,18 @@ private[spark] object GradientBoostedTrees extends Logging {
       treeWeights: Array[Double],
       loss: OldLoss,
       algo: OldAlgo.Value): Array[Double] = {
-
-    val sc = data.sparkContext
     val remappedData = algo match {
       case OldAlgo.Classification =>
         data.map(x => Instance((x.label * 2) - 1, x.weight, x.features))
       case _ => data
     }
 
-    val broadcastTrees = sc.broadcast(trees)
-    val localTreeWeights = treeWeights
     val numTrees = trees.length
-
     val (errSum, weightSum) = remappedData.mapPartitions { iter =>
-      val trees = broadcastTrees.value
       iter.map { case Instance(label, weight, features) =>
         val pred = Array.tabulate(numTrees) { i =>
           trees(i).rootNode.predictImpl(features)
-            .prediction * localTreeWeights(i)
+            .prediction * treeWeights(i)
         }
         val err = pred.scanLeft(0.0)(_ + _).drop(1)
           .map(p => loss.computeError(p, label) * weight)
@@ -248,7 +242,6 @@ private[spark] object GradientBoostedTrees extends Logging {
       (err1, weight1 + weight2)
     }
 
-    broadcastTrees.destroy()
     errSum.map(_ / weightSum)
   }
 
@@ -298,8 +291,10 @@ private[spark] object GradientBoostedTrees extends Logging {
     }
 
     // Prepare periodic checkpointers
+    // Note: this checkpoints the unweighted (prediction, error) pairs for the training data
     val predErrorCheckpointer = new PeriodicRDDCheckpointer[(Double, Double)](
       treeStrategy.getCheckpointInterval, input.sparkContext)
+    // Note: this checkpoints the unweighted (prediction, error) pairs for the validation data
     val validatePredErrorCheckpointer = new PeriodicRDDCheckpointer[(Double, Double)](
       treeStrategy.getCheckpointInterval, input.sparkContext)
 
@@ -319,15 +314,19 @@ private[spark] object GradientBoostedTrees extends Logging {
 
     var predError = computeInitialPredictionAndError(input, firstTreeWeight, firstTreeModel, loss)
     predErrorCheckpointer.update(predError)
-    logDebug("error of gbt = " + computeError(input, predError))
+    logDebug("error of gbt = " + computeWeightedError(input, predError))
 
     // Note: A model of type regression is used since we require raw prediction
     timer.stop("building tree 0")
 
     var validatePredError =
       computeInitialPredictionAndError(validationInput, firstTreeWeight, firstTreeModel, loss)
     if (validate) validatePredErrorCheckpointer.update(validatePredError)
-    var bestValidateError = if (validate) computeError(validationInput, validatePredError) else 0.0
+    var bestValidateError = if (validate) {
+      computeWeightedError(validationInput, validatePredError)
+    } else {
+      0.0
+    }
     var bestM = 1
 
     var m = 1
@@ -356,7 +355,7 @@ private[spark] object GradientBoostedTrees extends Logging {
       predError = updatePredictionError(
         input, predError, baseLearnerWeights(m), baseLearners(m), loss)
       predErrorCheckpointer.update(predError)
-      logDebug("error of gbt = " + computeError(input, predError))
+      logDebug("error of gbt = " + computeWeightedError(input, predError))
 
       if (validate) {
         // Stop training early if
@@ -367,7 +366,7 @@ private[spark] object GradientBoostedTrees extends Logging {
         validatePredError = updatePredictionError(
           validationInput, validatePredError, baseLearnerWeights(m), baseLearners(m), loss)
         validatePredErrorCheckpointer.update(validatePredError)
-        val currentValidateError = computeError(validationInput, validatePredError)
+        val currentValidateError = computeWeightedError(validationInput, validatePredError)
         if (bestValidateError - currentValidateError < validationTol * Math.max(
           currentValidateError, 0.01)) {
           doneLearning = true
diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/GBTClassifierSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/GBTClassifierSuite.scala
@@ -405,11 +405,11 @@ class GBTClassifierSuite extends MLTest with DefaultReadWriteTest {
       case LabeledPoint(label, features) =>
         Instance(label * 2 - 1, 1.0, features)
     }
-    val lossErr1 = GradientBoostedTrees.computeError(remappedValidationData,
+    val lossErr1 = GradientBoostedTrees.computeWeightedError(remappedValidationData,
       model1.trees, model1.treeWeights, model1.getOldLossType)
-    val lossErr2 = GradientBoostedTrees.computeError(remappedValidationData,
+    val lossErr2 = GradientBoostedTrees.computeWeightedError(remappedValidationData,
       model2.trees, model2.treeWeights, model2.getOldLossType)
-    val lossErr3 = GradientBoostedTrees.computeError(remappedValidationData,
+    val lossErr3 = GradientBoostedTrees.computeWeightedError(remappedValidationData,
       model3.trees, model3.treeWeights, model3.getOldLossType)
 
     assert(evalArr(0) ~== lossErr1 relTol 1E-3)
@@ -443,9 +443,9 @@ class GBTClassifierSuite extends MLTest with DefaultReadWriteTest {
           case LabeledPoint(label, features) =>
             Instance(label * 2 - 1, 1.0, features)
         }
-        (GradientBoostedTrees.computeError(remappedRdd, modelWithoutValidation.trees,
+        (GradientBoostedTrees.computeWeightedError(remappedRdd, modelWithoutValidation.trees,
           modelWithoutValidation.treeWeights, modelWithoutValidation.getOldLossType),
-          GradientBoostedTrees.computeError(remappedRdd, modelWithValidation.trees,
+          GradientBoostedTrees.computeWeightedError(remappedRdd, modelWithValidation.trees,
             modelWithValidation.treeWeights, modelWithValidation.getOldLossType))
       }
       assert(errorWithValidation < errorWithoutValidation)
diff --git a/mllib/src/test/scala/org/apache/spark/ml/regression/GBTRegressorSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/regression/GBTRegressorSuite.scala
@@ -243,11 +243,11 @@ class GBTRegressorSuite extends MLTest with DefaultReadWriteTest {
 
       for (evalLossType <- GBTRegressor.supportedLossTypes) {
         val evalArr = model3.evaluateEachIteration(validationData.toDF, evalLossType)
-        val lossErr1 = GradientBoostedTrees.computeError(validationData.map(_.toInstance),
+        val lossErr1 = GradientBoostedTrees.computeWeightedError(validationData.map(_.toInstance),
           model1.trees, model1.treeWeights, model1.convertToOldLossType(evalLossType))
-        val lossErr2 = GradientBoostedTrees.computeError(validationData.map(_.toInstance),
+        val lossErr2 = GradientBoostedTrees.computeWeightedError(validationData.map(_.toInstance),
           model2.trees, model2.treeWeights, model2.convertToOldLossType(evalLossType))
-        val lossErr3 = GradientBoostedTrees.computeError(validationData.map(_.toInstance),
+        val lossErr3 = GradientBoostedTrees.computeWeightedError(validationData.map(_.toInstance),
           model3.trees, model3.treeWeights, model3.convertToOldLossType(evalLossType))
 
         assert(evalArr(0) ~== lossErr1 relTol 1E-3)
@@ -278,11 +278,11 @@ class GBTRegressorSuite extends MLTest with DefaultReadWriteTest {
       // early stop
       assert(modelWithValidation.numTrees < numIter)
 
-      val errorWithoutValidation = GradientBoostedTrees.computeError(
+      val errorWithoutValidation = GradientBoostedTrees.computeWeightedError(
         validationData.map(_.toInstance),
         modelWithoutValidation.trees, modelWithoutValidation.treeWeights,
         modelWithoutValidation.getOldLossType)
-      val errorWithValidation = GradientBoostedTrees.computeError(
+      val errorWithValidation = GradientBoostedTrees.computeWeightedError(
         validationData.map(_.toInstance),
         modelWithValidation.trees, modelWithValidation.treeWeights,
         modelWithValidation.getOldLossType)
diff --git a/mllib/src/test/scala/org/apache/spark/ml/tree/impl/GradientBoostedTreesSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/tree/impl/GradientBoostedTreesSuite.scala
@@ -56,12 +56,12 @@ class GradientBoostedTreesSuite extends SparkFunSuite with MLlibTestSparkContext
       val (errorWithoutValidation, errorWithValidation) = {
         if (algo == Classification) {
           val remappedRdd = validateRdd.map(x => Instance(2 * x.label - 1, x.weight, x.features))
-          (GradientBoostedTrees.computeError(remappedRdd, trees, treeWeights, loss),
-            GradientBoostedTrees.computeError(remappedRdd, validateTrees,
+          (GradientBoostedTrees.computeWeightedError(remappedRdd, trees, treeWeights, loss),
+            GradientBoostedTrees.computeWeightedError(remappedRdd, validateTrees,
               validateTreeWeights, loss))
         } else {
-          (GradientBoostedTrees.computeError(validateRdd, trees, treeWeights, loss),
-            GradientBoostedTrees.computeError(validateRdd, validateTrees,
+          (GradientBoostedTrees.computeWeightedError(validateRdd, trees, treeWeights, loss),
+            GradientBoostedTrees.computeWeightedError(validateRdd, validateTrees,
               validateTreeWeights, loss))
         }
       }