Skip to content

Commit d839952

Browse files
holdenk authored and nemccarthy committed
[SPARK-7781] [MLLIB] gradient boosted trees.train regressor missing max bins
Author: Holden Karau <holden@pigscanfly.ca>

Closes apache#6331 from holdenk/SPARK-7781-GradientBoostedTrees.trainRegressor-missing-max-bins and squashes the following commits:

2894695 [Holden Karau] remove extra blank line
2573e8d [Holden Karau] Update the scala side of the pythonmllibapi and make the test a bit nicer too
3a09170 [Holden Karau] add maxBins to the train method as well
af7f274 [Holden Karau] Add maxBins to GradientBoostedTrees.trainRegressor and correctly mention the default of 32 in other places where it mentioned 100

(cherry picked from commit 164fe2a)
Signed-off-by: Joseph K. Bradley <joseph@databricks.com>
1 parent 805b62a commit d839952

File tree

3 files changed

+24
-9
lines changed

3 files changed

+24
-9
lines changed

mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -685,12 +685,14 @@ private[python] class PythonMLLibAPI extends Serializable {
685685
lossStr: String,
686686
numIterations: Int,
687687
learningRate: Double,
688-
maxDepth: Int): GradientBoostedTreesModel = {
688+
maxDepth: Int,
689+
maxBins: Int): GradientBoostedTreesModel = {
689690
val boostingStrategy = BoostingStrategy.defaultParams(algoStr)
690691
boostingStrategy.setLoss(Losses.fromString(lossStr))
691692
boostingStrategy.setNumIterations(numIterations)
692693
boostingStrategy.setLearningRate(learningRate)
693694
boostingStrategy.treeStrategy.setMaxDepth(maxDepth)
695+
boostingStrategy.treeStrategy.setMaxBins(maxBins)
694696
boostingStrategy.treeStrategy.categoricalFeaturesInfo = categoricalFeaturesInfo.asScala.toMap
695697

696698
val cached = data.rdd.persist(StorageLevel.MEMORY_AND_DISK)

python/pyspark/mllib/tests.py

Lines changed: 7 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -444,6 +444,13 @@ def test_regression(self):
444444
except ValueError:
445445
self.fail()
446446

447+
# Verify that maxBins is being passed through
448+
GradientBoostedTrees.trainRegressor(
449+
rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, numIterations=4, maxBins=32)
450+
with self.assertRaises(Exception) as cm:
451+
GradientBoostedTrees.trainRegressor(
452+
rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, numIterations=4, maxBins=1)
453+
447454

448455
class StatTests(MLlibTestCase):
449456
# SPARK-4023

python/pyspark/mllib/tree.py

Lines changed: 14 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -299,7 +299,7 @@ def trainClassifier(cls, data, numClasses, categoricalFeaturesInfo, numTrees,
299299
1 internal node + 2 leaf nodes. (default: 4)
300300
:param maxBins: maximum number of bins used for splitting
301301
features
302-
(default: 100)
302+
(default: 32)
303303
:param seed: Random seed for bootstrapping and choosing feature
304304
subsets.
305305
:return: RandomForestModel that can be used for prediction
@@ -377,7 +377,7 @@ def trainRegressor(cls, data, categoricalFeaturesInfo, numTrees, featureSubsetSt
377377
1 leaf node; depth 1 means 1 internal node + 2 leaf
378378
nodes. (default: 4)
379379
:param maxBins: maximum number of bins used for splitting
380-
features (default: 100)
380+
features (default: 32)
381381
:param seed: Random seed for bootstrapping and choosing feature
382382
subsets.
383383
:return: RandomForestModel that can be used for prediction
@@ -435,16 +435,17 @@ class GradientBoostedTrees(object):
435435

436436
@classmethod
437437
def _train(cls, data, algo, categoricalFeaturesInfo,
438-
loss, numIterations, learningRate, maxDepth):
438+
loss, numIterations, learningRate, maxDepth, maxBins):
439439
first = data.first()
440440
assert isinstance(first, LabeledPoint), "the data should be RDD of LabeledPoint"
441441
model = callMLlibFunc("trainGradientBoostedTreesModel", data, algo, categoricalFeaturesInfo,
442-
loss, numIterations, learningRate, maxDepth)
442+
loss, numIterations, learningRate, maxDepth, maxBins)
443443
return GradientBoostedTreesModel(model)
444444

445445
@classmethod
446446
def trainClassifier(cls, data, categoricalFeaturesInfo,
447-
loss="logLoss", numIterations=100, learningRate=0.1, maxDepth=3):
447+
loss="logLoss", numIterations=100, learningRate=0.1, maxDepth=3,
448+
maxBins=32):
448449
"""
449450
Method to train a gradient-boosted trees model for
450451
classification.
@@ -467,6 +468,8 @@ def trainClassifier(cls, data, categoricalFeaturesInfo,
467468
:param maxDepth: Maximum depth of the tree. E.g., depth 0 means
468469
1 leaf node; depth 1 means 1 internal node + 2 leaf
469470
nodes. (default: 3)
471+
:param maxBins: maximum number of bins used for splitting
472+
features (default: 32) DecisionTree requires maxBins >= max categories
470473
:return: GradientBoostedTreesModel that can be used for
471474
prediction
472475
@@ -499,11 +502,12 @@ def trainClassifier(cls, data, categoricalFeaturesInfo,
499502
[1.0, 0.0]
500503
"""
501504
return cls._train(data, "classification", categoricalFeaturesInfo,
502-
loss, numIterations, learningRate, maxDepth)
505+
loss, numIterations, learningRate, maxDepth, maxBins)
503506

504507
@classmethod
505508
def trainRegressor(cls, data, categoricalFeaturesInfo,
506-
loss="leastSquaresError", numIterations=100, learningRate=0.1, maxDepth=3):
509+
loss="leastSquaresError", numIterations=100, learningRate=0.1, maxDepth=3,
510+
maxBins=32):
507511
"""
508512
Method to train a gradient-boosted trees model for regression.
509513
@@ -522,6 +526,8 @@ def trainRegressor(cls, data, categoricalFeaturesInfo,
522526
contribution of each estimator. The learning rate
523527
should be between in the interval (0, 1].
524528
(default: 0.1)
529+
:param maxBins: maximum number of bins used for splitting
530+
features (default: 32) DecisionTree requires maxBins >= max categories
525531
:param maxDepth: Maximum depth of the tree. E.g., depth 0 means
526532
1 leaf node; depth 1 means 1 internal node + 2 leaf
527533
nodes. (default: 3)
@@ -556,7 +562,7 @@ def trainRegressor(cls, data, categoricalFeaturesInfo,
556562
[1.0, 0.0]
557563
"""
558564
return cls._train(data, "regression", categoricalFeaturesInfo,
559-
loss, numIterations, learningRate, maxDepth)
565+
loss, numIterations, learningRate, maxDepth, maxBins)
560566

561567

562568
def _test():

0 commit comments

Comments
 (0)