Skip to content

Commit 164fe2a

Browse files
holdenk authored and jkbradley committed
[SPARK-7781] [MLLIB] gradient boosted trees.train regressor missing max bins
Author: Holden Karau <holden@pigscanfly.ca>

Closes apache#6331 from holdenk/SPARK-7781-GradientBoostedTrees.trainRegressor-missing-max-bins and squashes the following commits:

2894695 [Holden Karau] remove extra blank line
2573e8d [Holden Karau] Update the scala side of the pythonmllibapi and make the test a bit nicer too
3a09170 [Holden Karau] add maxBins to the train method as well
af7f274 [Holden Karau] Add maxBins to GradientBoostedTrees.trainRegressor and correctly mention the default of 32 in other places where it mentioned 100
1 parent 44fa7df commit 164fe2a

File tree

3 files changed

+24
-9
lines changed

3 files changed

+24
-9
lines changed

mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -696,12 +696,14 @@ private[python] class PythonMLLibAPI extends Serializable {
696696
lossStr: String,
697697
numIterations: Int,
698698
learningRate: Double,
699-
maxDepth: Int): GradientBoostedTreesModel = {
699+
maxDepth: Int,
700+
maxBins: Int): GradientBoostedTreesModel = {
700701
val boostingStrategy = BoostingStrategy.defaultParams(algoStr)
701702
boostingStrategy.setLoss(Losses.fromString(lossStr))
702703
boostingStrategy.setNumIterations(numIterations)
703704
boostingStrategy.setLearningRate(learningRate)
704705
boostingStrategy.treeStrategy.setMaxDepth(maxDepth)
706+
boostingStrategy.treeStrategy.setMaxBins(maxBins)
705707
boostingStrategy.treeStrategy.categoricalFeaturesInfo = categoricalFeaturesInfo.asScala.toMap
706708

707709
val cached = data.rdd.persist(StorageLevel.MEMORY_AND_DISK)

python/pyspark/mllib/tests.py

Lines changed: 7 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -463,6 +463,13 @@ def test_regression(self):
463463
except ValueError:
464464
self.fail()
465465

466+
# Verify that maxBins is being passed through
467+
GradientBoostedTrees.trainRegressor(
468+
rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, numIterations=4, maxBins=32)
469+
with self.assertRaises(Exception) as cm:
470+
GradientBoostedTrees.trainRegressor(
471+
rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, numIterations=4, maxBins=1)
472+
466473

467474
class StatTests(MLlibTestCase):
468475
# SPARK-4023

python/pyspark/mllib/tree.py

Lines changed: 14 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -299,7 +299,7 @@ def trainClassifier(cls, data, numClasses, categoricalFeaturesInfo, numTrees,
299299
1 internal node + 2 leaf nodes. (default: 4)
300300
:param maxBins: maximum number of bins used for splitting
301301
features
302-
(default: 100)
302+
(default: 32)
303303
:param seed: Random seed for bootstrapping and choosing feature
304304
subsets.
305305
:return: RandomForestModel that can be used for prediction
@@ -377,7 +377,7 @@ def trainRegressor(cls, data, categoricalFeaturesInfo, numTrees, featureSubsetSt
377377
1 leaf node; depth 1 means 1 internal node + 2 leaf
378378
nodes. (default: 4)
379379
:param maxBins: maximum number of bins used for splitting
380-
features (default: 100)
380+
features (default: 32)
381381
:param seed: Random seed for bootstrapping and choosing feature
382382
subsets.
383383
:return: RandomForestModel that can be used for prediction
@@ -435,16 +435,17 @@ class GradientBoostedTrees(object):
435435

436436
@classmethod
437437
def _train(cls, data, algo, categoricalFeaturesInfo,
438-
loss, numIterations, learningRate, maxDepth):
438+
loss, numIterations, learningRate, maxDepth, maxBins):
439439
first = data.first()
440440
assert isinstance(first, LabeledPoint), "the data should be RDD of LabeledPoint"
441441
model = callMLlibFunc("trainGradientBoostedTreesModel", data, algo, categoricalFeaturesInfo,
442-
loss, numIterations, learningRate, maxDepth)
442+
loss, numIterations, learningRate, maxDepth, maxBins)
443443
return GradientBoostedTreesModel(model)
444444

445445
@classmethod
446446
def trainClassifier(cls, data, categoricalFeaturesInfo,
447-
loss="logLoss", numIterations=100, learningRate=0.1, maxDepth=3):
447+
loss="logLoss", numIterations=100, learningRate=0.1, maxDepth=3,
448+
maxBins=32):
448449
"""
449450
Method to train a gradient-boosted trees model for
450451
classification.
@@ -467,6 +468,8 @@ def trainClassifier(cls, data, categoricalFeaturesInfo,
467468
:param maxDepth: Maximum depth of the tree. E.g., depth 0 means
468469
1 leaf node; depth 1 means 1 internal node + 2 leaf
469470
nodes. (default: 3)
471+
:param maxBins: maximum number of bins used for splitting
472+
features (default: 32) DecisionTree requires maxBins >= max categories
470473
:return: GradientBoostedTreesModel that can be used for
471474
prediction
472475
@@ -499,11 +502,12 @@ def trainClassifier(cls, data, categoricalFeaturesInfo,
499502
[1.0, 0.0]
500503
"""
501504
return cls._train(data, "classification", categoricalFeaturesInfo,
502-
loss, numIterations, learningRate, maxDepth)
505+
loss, numIterations, learningRate, maxDepth, maxBins)
503506

504507
@classmethod
505508
def trainRegressor(cls, data, categoricalFeaturesInfo,
506-
loss="leastSquaresError", numIterations=100, learningRate=0.1, maxDepth=3):
509+
loss="leastSquaresError", numIterations=100, learningRate=0.1, maxDepth=3,
510+
maxBins=32):
507511
"""
508512
Method to train a gradient-boosted trees model for regression.
509513
@@ -522,6 +526,8 @@ def trainRegressor(cls, data, categoricalFeaturesInfo,
522526
contribution of each estimator. The learning rate
523527
should be between in the interval (0, 1].
524528
(default: 0.1)
529+
:param maxBins: maximum number of bins used for splitting
530+
features (default: 32) DecisionTree requires maxBins >= max categories
525531
:param maxDepth: Maximum depth of the tree. E.g., depth 0 means
526532
1 leaf node; depth 1 means 1 internal node + 2 leaf
527533
nodes. (default: 3)
@@ -556,7 +562,7 @@ def trainRegressor(cls, data, categoricalFeaturesInfo,
556562
[1.0, 0.0]
557563
"""
558564
return cls._train(data, "regression", categoricalFeaturesInfo,
559-
loss, numIterations, learningRate, maxDepth)
565+
loss, numIterations, learningRate, maxDepth, maxBins)
560566

561567

562568
def _test():

0 commit comments

Comments
 (0)