Skip to content

Commit 430d782

Browse files
committed
Added more debug info on binning error. Added some docs.
1 parent d036089 commit 430d782

File tree

2 files changed

+25
-24
lines changed

2 files changed

+25
-24
lines changed

mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/Strategy.scala

Lines changed: 21 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -27,22 +27,30 @@ import org.apache.spark.mllib.tree.configuration.QuantileStrategy._
2727
/**
2828
* :: Experimental ::
2929
* Stores all the configuration options for tree construction
30-
* @param algo classification or regression
31-
* @param impurity criterion used for information gain calculation
30+
* @param algo Learning goal. Supported:
31+
* [[org.apache.spark.mllib.tree.configuration.Algo.Classification]],
32+
* [[org.apache.spark.mllib.tree.configuration.Algo.Regression]].
33+
* @param impurity Criterion used for information gain calculation.
34+
* Supported for Classification: [[org.apache.spark.mllib.tree.impurity.Gini]],
35+
* [[org.apache.spark.mllib.tree.impurity.Entropy]].
36+
* Supported for Regression: [[org.apache.spark.mllib.tree.impurity.Variance]].
3237
* @param maxDepth Maximum depth of the tree.
3338
* E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes.
34-
* @param numClassesForClassification number of classes for classification. Default value is 2
35-
* leads to binary classification
36-
* @param maxBins maximum number of bins used for splitting features
37-
* @param quantileCalculationStrategy algorithm for calculating quantiles
39+
* @param numClassesForClassification Number of classes for classification.
40+
* (Ignored for regression.)
41+
* Default value is 2 (binary classification).
42+
* @param maxBins Maximum number of bins used for discretizing continuous features and
43+
* for choosing how to split on features at each node.
44+
* More bins give higher granularity.
45+
* @param quantileCalculationStrategy Algorithm for calculating quantiles. Supported:
46+
* [[org.apache.spark.mllib.tree.configuration.QuantileStrategy.Sort]].
3847
* @param categoricalFeaturesInfo A map storing information about the categorical variables and the
3948
* number of discrete values they take. For example, an entry (n ->
4049
* k) implies the feature n is categorical with k categories 0,
4150
* 1, 2, ... , k-1. It's important to note that features are
4251
* zero-indexed.
43-
* @param maxMemoryInMB maximum memory in MB allocated to histogram aggregation. Default value is
52+
* @param maxMemoryInMB Maximum memory in MB allocated to histogram aggregation. Default value is
4453
* 128 MB.
45-
*
4654
*/
4755
@Experimental
4856
class Strategy (
@@ -64,20 +72,7 @@ class Strategy (
6472
= isMulticlassClassification && (categoricalFeaturesInfo.size > 0)
6573

6674
/**
67-
* Java-friendly constructor.
68-
*
69-
* @param algo classification or regression
70-
* @param impurity criterion used for information gain calculation
71-
* @param maxDepth Maximum depth of the tree.
72-
* E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes.
73-
* @param numClassesForClassification number of classes for classification. Default value is 2
74-
* leads to binary classification
75-
* @param maxBins maximum number of bins used for splitting features
76-
* @param categoricalFeaturesInfo A map storing information about the categorical variables and
77-
* the number of discrete values they take. For example, an entry
78-
* (n -> k) implies the feature n is categorical with k categories
79-
* 0, 1, 2, ... , k-1. It's important to note that features are
80-
* zero-indexed.
75+
* Java-friendly constructor for [[org.apache.spark.mllib.tree.configuration.Strategy]].
8176
*/
8277
def this(
8378
algo: Algo,
@@ -90,6 +85,10 @@ class Strategy (
9085
categoricalFeaturesInfo.asInstanceOf[java.util.Map[Int, Int]].asScala.toMap)
9186
}
9287

88+
/**
89+
* Checks the validity of the parameters.
90+
* Throws an exception if any parameter is invalid.
91+
*/
9392
private[tree] def assertValid(): Unit = {
9493
algo match {
9594
case Classification =>

mllib/src/main/scala/org/apache/spark/mllib/tree/impl/TreePoint.scala

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -181,7 +181,8 @@ private[tree] object TreePoint {
181181
// Perform binary search for finding bin for continuous features.
182182
val binIndex = binarySearchForBins()
183183
if (binIndex == -1) {
184-
throw new UnknownError("no bin was found for continuous variable.")
184+
throw new UnknownError("No bin was found for continuous feature." +
185+
s" Feature index: $featureIndex. Feature value: ${labeledPoint.features(featureIndex)}")
185186
}
186187
binIndex
187188
} else {
@@ -192,7 +193,8 @@ private[tree] object TreePoint {
192193
sequentialBinSearchForOrderedCategoricalFeature()
193194
}
194195
if (binIndex == -1) {
195-
throw new UnknownError("no bin was found for categorical variable.")
196+
throw new UnknownError("No bin was found for categorical feature." +
197+
s" Feature index: $featureIndex. Feature value: ${labeledPoint.features(featureIndex)}")
196198
}
197199
binIndex
198200
}

0 commit comments

Comments
 (0)