
[MLlib] SPARK-1536: multiclass classification support for decision tree #886


Closed

80 commits
50b143a
adding support for very deep trees
manishamde Apr 20, 2014
abc5a23
Parameterizing max memory.
etrain Apr 22, 2014
2f6072c
Merge pull request #5 from etrain/deep_tree
manishamde Apr 22, 2014
2f1e093
minor: added doc for maxMemory parameter
manishamde Apr 22, 2014
0287772
Fixing scalastyle issue.
etrain Apr 22, 2014
fecf89a
Merge pull request #6 from etrain/deep_tree
manishamde Apr 22, 2014
719d009
updating user documentation
manishamde Apr 24, 2014
9dbdabe
merge from master
manishamde Apr 29, 2014
1517155
updated documentation
manishamde Apr 29, 2014
718506b
added unit test
manishamde Apr 30, 2014
e0426ee
renamed parameter
manishamde May 1, 2014
dad9652
removed unused imports
manishamde May 1, 2014
cbd9f14
modified scala.math to math
manishamde May 3, 2014
5e82202
added documentation, fixed off by 1 error in max level calculation
manishamde May 6, 2014
4731cda
formatting
manishamde May 6, 2014
5eca9e4
grammar
manishamde May 6, 2014
8053fed
more formatting
manishamde May 6, 2014
426bb28
programming guide blurb
manishamde May 6, 2014
b27ad2c
formatting
manishamde May 6, 2014
ce004a1
minor formatting
manishamde May 6, 2014
7fc9545
added docs
manishamde May 7, 2014
968ca9d
merged master
manishamde May 7, 2014
a1a6e09
added weighted point class
manishamde May 1, 2014
14aea48
changing instance format to weighted labeled point
manishamde May 1, 2014
455bea9
fixed tests
manishamde May 1, 2014
46f909c
todo for multiclass support
manishamde May 4, 2014
4d5f70c
added multiclass support for find splits bins
manishamde May 6, 2014
3f85a17
tests for multiclass classification
manishamde May 7, 2014
46e06ee
minor mods
manishamde May 7, 2014
6c7af22
prepared for multiclass without breaking binary classification
manishamde May 7, 2014
5c78e1a
added multiclass support
manishamde May 11, 2014
e006f9d
changing variable names
manishamde May 12, 2014
098e8c5
merged master
manishamde May 12, 2014
34549d0
fixing error during merge
manishamde May 12, 2014
e547151
minor modifications
manishamde May 12, 2014
75f2bfc
minor code style fix
manishamde May 12, 2014
6b912dc
added numclasses to tree runner, predict logic for multiclass, add mu…
manishamde May 12, 2014
18d2835
changing default values for num classes
manishamde May 12, 2014
d012be7
fixed while loop
manishamde May 12, 2014
ed5a2df
fixed classification requirements
manishamde May 13, 2014
d8e4a11
sample weights
manishamde May 13, 2014
ab5cb21
multiclass logic
manishamde May 18, 2014
d811425
multiclass bin aggregate logic
manishamde May 18, 2014
f16a9bb
fixing while loop
manishamde May 18, 2014
1dd2735
bin search logic for multiclass
manishamde May 18, 2014
7e5f08c
minor doc
manishamde May 18, 2014
bce835f
code cleanup
manishamde May 18, 2014
828ff16
added categorical variable test
manishamde May 21, 2014
8cfd3b6
working for categorical multiclass classification
manishamde May 22, 2014
f5f6b83
multiclass for continous variables
manishamde May 22, 2014
1892a2c
tests and use multiclass binaggregate length when atleast one categor…
manishamde May 23, 2014
9a90c93
Merge branch 'master' into multiclass
manishamde May 23, 2014
12e6d0a
minor: removing line in doc
manishamde May 26, 2014
237762d
renaming functions
manishamde May 26, 2014
34ee7b9
minor: code style
manishamde May 26, 2014
23d4268
minor: another minor code style
manishamde May 26, 2014
e3e8843
minor code formatting
manishamde May 27, 2014
adc7315
support ordered categorical splits for multiclass classification
manishamde Jun 4, 2014
8e44ab8
updated doc
manishamde Jun 4, 2014
3d7f911
updated doc
manishamde Jun 5, 2014
485eaae
implicit conversion from LabeledPoint to WeightedLabeledPoint
manishamde Jul 7, 2014
5c1b2ca
doc for PointConverter class
manishamde Jul 8, 2014
9cc3e31
added implicit conversion import
manishamde Jul 8, 2014
06b1690
fixed off-by-one error in bin to split conversion
manishamde Jul 9, 2014
2061cf5
merged from master
manishamde Jul 10, 2014
0fecd38
minor: add newline to EOF
manishamde Jul 10, 2014
d75ac32
removed WeightedLabeledPoint from this PR
manishamde Jul 11, 2014
e4c1321
using while loop for regression histograms
manishamde Jul 14, 2014
b2ae41f
minor: scalastyle
manishamde Jul 14, 2014
4e85f2c
minor: fixed scalastyle issues
manishamde Jul 14, 2014
2d85a48
minor: fixed scalastyle issues reprise
manishamde Jul 14, 2014
afced16
removed label weights support
manishamde Jul 14, 2014
c8428c4
fixing weird multiline bug
manishamde Jul 15, 2014
45e767a
adding developer api annotation for overriden methods
manishamde Jul 15, 2014
abf2901
adding classes to MimaExcludes.scala
manishamde Jul 17, 2014
e1c970d
merged master
manishamde Jul 17, 2014
10fdd82
fixing MIMA excludes
manishamde Jul 17, 2014
1ce7212
change problem filter for mima
manishamde Jul 18, 2014
c5b2d04
more MIMA fixes
manishamde Jul 18, 2014
26f8acc
another attempt at fixing mima
manishamde Jul 18, 2014
8 changes: 5 additions & 3 deletions docs/mllib-decision-tree.md
@@ -77,15 +77,17 @@ bins if the condition is not satisfied.

**Categorical features**

For `$M$` categorical features, one could come up with `$2^M-1$` split candidates. However, for
binary classification, the number of split candidates can be reduced to `$M-1$` by ordering the
For `$M$` categorical feature values, one could come up with `$2^{M-1}-1$` split candidates. For
binary classification, we can reduce the number of split candidates to `$M-1$` by ordering the
categorical feature values by the proportion of labels falling in one of the two classes (see
Section 9.2.4 in
[Elements of Statistical Machine Learning](http://statweb.stanford.edu/~tibs/ElemStatLearn/) for
details). For example, for a binary classification problem with one categorical feature with three
categories A, B and C with corresponding proportions of label 1 as 0.2, 0.6 and 0.4, the categorical
feature values are ordered as A followed by C followed by B, i.e. A, C, B. The two split candidates are
A \| C, B and A , C \| B where \| denotes the split. A similar heuristic is used for multiclass classification
when `$2^{M-1}-1$` is greater than the number of bins -- the impurity for each categorical feature value
is used for ordering.
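The ordering heuristic above can be sketched in isolation. The following is a standalone illustration, not the PR's implementation; the `proportionOfLabelOne` map and category names come from the documentation example:

```scala
object CategoricalSplitOrdering {
  // Order categorical values by the proportion of examples with label 1,
  // then evaluate only the M - 1 contiguous prefix splits instead of all
  // 2^(M-1) - 1 subset splits.
  def orderedSplits(
      proportionOfLabelOne: Map[String, Double]): Seq[(Set[String], Set[String])] = {
    val ordered = proportionOfLabelOne.toSeq.sortBy(_._2).map(_._1)
    // Each candidate split places a prefix of the ordered values on the left.
    (1 until ordered.length).map { i =>
      (ordered.take(i).toSet, ordered.drop(i).toSet)
    }
  }

  def main(args: Array[String]): Unit = {
    // A -> 0.2, B -> 0.6, C -> 0.4 orders the values as A, C, B,
    // giving the two candidates A | C,B and A,C | B.
    orderedSplits(Map("A" -> 0.2, "B" -> 0.6, "C" -> 0.4)).foreach(println)
  }
}
```

With three categories this reproduces exactly the two split candidates named in the text.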

### Stopping rule

examples/src/main/scala/org/apache/spark/examples/mllib/DecisionTreeRunner.scala
@@ -49,6 +49,7 @@ object DecisionTreeRunner {
case class Params(
input: String = null,
algo: Algo = Classification,
numClassesForClassification: Int = 2,
Contributor:

Do we want this to be a parameter and not inferred from the data?

Also - I'm wondering if it makes sense to subclass params with DecisionTreeParams vs. RegressionTreeParams so that we keep logically separate options separate.

Contributor Author:

Inference from a large dataset could take a lot of time. In general, most practitioners know the number of classes in advance. If not, we can add a pre-processing step.

Currently we have only numClassesForClassification as a classification specific parameter. In general, I agree with you. At the same time, didn't want to create more configuration classes for the user. Shall we leave it as is for now and handle it with the ensembles PR where we have more parameters (boosting iterations, num trees, feature subsetting, etc.) ?

Contributor:

Yeah, makes sense. If it doesn't complicate things too much we might consider adding an interface that doesn't have this specified and figures it out in one shot.

Worth noting is that in R, an object of type "factor" (the default for categorical/label data) has this information built in. It can be a big pain at load time while the system tries to figure out the cardinality of the factor, but it leads to a nice compact representation of the data and eliminates situations like this one.

I agree on doing the API separation with the ensembles PR.


Contributor Author:

Good point. Let me create a JIRA ticket for this so that we don't forget. :-)


maxDepth: Int = 5,
impurity: ImpurityType = Gini,
maxBins: Int = 100)
@@ -68,6 +69,10 @@ object DecisionTreeRunner {
opt[Int]("maxDepth")
.text(s"max depth of the tree, default: ${defaultParams.maxDepth}")
.action((x, c) => c.copy(maxDepth = x))
opt[Int]("numClassesForClassification")
.text(s"number of classes for classification, "
+ s"default: ${defaultParams.numClassesForClassification}")
.action((x, c) => c.copy(numClassesForClassification = x))
opt[Int]("maxBins")
.text(s"max number of bins, default: ${defaultParams.maxBins}")
.action((x, c) => c.copy(maxBins = x))
@@ -118,7 +123,13 @@ object DecisionTreeRunner {
case Variance => impurity.Variance
}

val strategy = new Strategy(params.algo, impurityCalculator, params.maxDepth, params.maxBins)
val strategy
= new Strategy(
algo = params.algo,
impurity = impurityCalculator,
maxDepth = params.maxDepth,
maxBins = params.maxBins,
numClassesForClassification = params.numClassesForClassification)
val model = DecisionTree.train(training, strategy)

if (params.algo == Classification) {
@@ -139,12 +150,8 @@
*/
private def accuracyScore(
model: DecisionTreeModel,
data: RDD[LabeledPoint],
threshold: Double = 0.5): Double = {
def predictedValue(features: Vector): Double = {
if (model.predict(features) < threshold) 0.0 else 1.0
}
val correctCount = data.filter(y => predictedValue(y.features) == y.label).count()
data: RDD[LabeledPoint]): Double = {
val correctCount = data.filter(y => model.predict(y.features) == y.label).count()
val count = data.count()
correctCount.toDouble / count
}
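The change above replaces the thresholded binary accuracy with exact-match accuracy, which also works for multiclass labels. The same logic can be checked in isolation (plain Scala over `(prediction, label)` pairs; the RDD version in the diff does the equivalent with `filter` and `count`):

```scala
object AccuracyCheck {
  // Fraction of examples whose predicted label exactly equals the true label.
  def accuracy(predictionsAndLabels: Seq[(Double, Double)]): Double = {
    val correctCount = predictionsAndLabels.count { case (prediction, label) =>
      prediction == label
    }
    correctCount.toDouble / predictionsAndLabels.size
  }

  def main(args: Array[String]): Unit = {
    // Three-class labels 0.0 / 1.0 / 2.0; 3 of the 4 predictions are correct.
    println(accuracy(Seq((0.0, 0.0), (1.0, 1.0), (2.0, 1.0), (2.0, 2.0))))  // prints 0.75
  }
}
```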
732 changes: 532 additions & 200 deletions mllib/src/main/scala/org/apache/spark/mllib/tree/DecisionTree.scala

Large diffs are not rendered by default.

@@ -28,6 +28,8 @@ import org.apache.spark.mllib.tree.configuration.QuantileStrategy._
* @param algo classification or regression
* @param impurity criterion used for information gain calculation
* @param maxDepth maximum depth of the tree
* @param numClassesForClassification number of classes for classification. The default value
*                                    of 2 leads to binary classification
* @param maxBins maximum number of bins used for splitting features
* @param quantileCalculationStrategy algorithm for calculating quantiles
* @param categoricalFeaturesInfo A map storing information about the categorical variables and the
@@ -44,7 +46,15 @@ class Strategy (
val algo: Algo,
val impurity: Impurity,
val maxDepth: Int,
val numClassesForClassification: Int = 2,
val maxBins: Int = 100,
val quantileCalculationStrategy: QuantileStrategy = Sort,
val categoricalFeaturesInfo: Map[Int, Int] = Map[Int, Int](),
val maxMemoryInMB: Int = 128) extends Serializable
val maxMemoryInMB: Int = 128) extends Serializable {

require(numClassesForClassification >= 2)
val isMulticlassClassification = numClassesForClassification > 2
val isMulticlassWithCategoricalFeatures
= isMulticlassClassification && (categoricalFeaturesInfo.size > 0)

}
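The two derived flags added to `Strategy` can be illustrated in isolation. This is a plain-Scala sketch mirroring the `require` and the flag logic from the diff, not the MLlib class itself:

```scala
// Hypothetical stand-in for the relevant slice of Strategy:
// numClassesForClassification must be at least 2, and the multiclass flags
// are derived rather than user-supplied.
case class StrategyFlags(
    numClassesForClassification: Int,
    categoricalFeaturesInfo: Map[Int, Int]) {
  require(numClassesForClassification >= 2)
  val isMulticlassClassification: Boolean = numClassesForClassification > 2
  val isMulticlassWithCategoricalFeatures: Boolean =
    isMulticlassClassification && categoricalFeaturesInfo.nonEmpty
}

object StrategyFlagsDemo {
  def main(args: Array[String]): Unit = {
    // Binary classification: neither flag is set.
    println(StrategyFlags(2, Map.empty).isMulticlassClassification)            // prints false
    // Three classes, feature 0 with 4 categories: both flags are set.
    println(StrategyFlags(3, Map(0 -> 4)).isMulticlassWithCategoricalFeatures) // prints true
  }
}
```

Keeping the flags as derived `val`s means callers of the real `Strategy` cannot put the two fields in an inconsistent state.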
@@ -31,23 +31,35 @@ object Entropy extends Impurity {

/**
* :: DeveloperApi ::
* entropy calculation
* @param c0 count of instances with label 0
* @param c1 count of instances with label 1
* @return entropy value
* information calculation for multiclass classification
* @param counts Array[Double] with counts for each label
* @param totalCount sum of counts for all labels
* @return information value
*/
@DeveloperApi
override def calculate(c0: Double, c1: Double): Double = {
if (c0 == 0 || c1 == 0) {
0
} else {
val total = c0 + c1
val f0 = c0 / total
val f1 = c1 / total
-(f0 * log2(f0)) - (f1 * log2(f1))
override def calculate(counts: Array[Double], totalCount: Double): Double = {
val numClasses = counts.length
var impurity = 0.0
var classIndex = 0
while (classIndex < numClasses) {
val classCount = counts(classIndex)
if (classCount != 0) {
val freq = classCount / totalCount
impurity -= freq * log2(freq)
}
classIndex += 1
}
impurity
}

/**
* :: DeveloperApi ::
* variance calculation
* @param count number of instances
* @param sum sum of labels
* @param sumSquares summation of squares of the labels
*/
@DeveloperApi
override def calculate(count: Double, sum: Double, sumSquares: Double): Double =
throw new UnsupportedOperationException("Entropy.calculate")
}
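As a sanity check of the multiclass entropy loop above, here is a self-contained re-implementation outside MLlib (the `entropy` helper is ours, not an MLlib API; it follows the same while-loop style as the diff):

```scala
object EntropyCheck {
  private def log2(x: Double): Double = math.log(x) / math.log(2)

  // Multiclass entropy: -sum_i f_i * log2(f_i) over the nonzero class frequencies.
  def entropy(counts: Array[Double], totalCount: Double): Double = {
    var impurity = 0.0
    var classIndex = 0
    while (classIndex < counts.length) {
      val classCount = counts(classIndex)
      if (classCount != 0) {
        val freq = classCount / totalCount
        impurity -= freq * log2(freq)
      }
      classIndex += 1
    }
    impurity
  }

  def main(args: Array[String]): Unit = {
    // A uniform 4-class distribution has entropy log2(4) = 2 bits.
    println(entropy(Array(25.0, 25.0, 25.0, 25.0), 100.0))
  }
}
```

With two classes this reduces to the old binary formula `-(f0 * log2(f0)) - (f1 * log2(f1))`, so the generalization is backward compatible.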
@@ -30,23 +30,32 @@ object Gini extends Impurity {

/**
* :: DeveloperApi ::
* Gini coefficient calculation
* @param c0 count of instances with label 0
* @param c1 count of instances with label 1
* @return Gini coefficient value
* information calculation for multiclass classification
* @param counts Array[Double] with counts for each label
* @param totalCount sum of counts for all labels
* @return information value
*/
@DeveloperApi
override def calculate(c0: Double, c1: Double): Double = {
if (c0 == 0 || c1 == 0) {
0
} else {
val total = c0 + c1
val f0 = c0 / total
val f1 = c1 / total
1 - f0 * f0 - f1 * f1
override def calculate(counts: Array[Double], totalCount: Double): Double = {
val numClasses = counts.length
var impurity = 1.0
var classIndex = 0
while (classIndex < numClasses) {
val freq = counts(classIndex) / totalCount
impurity -= freq * freq
classIndex += 1
}
impurity
}

/**
* :: DeveloperApi ::
* variance calculation
* @param count number of instances
* @param sum sum of labels
* @param sumSquares summation of squares of the labels
*/
@DeveloperApi
override def calculate(count: Double, sum: Double, sumSquares: Double): Double =
throw new UnsupportedOperationException("Gini.calculate")
}
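The Gini generalization can be checked the same way. Again this is a standalone re-implementation for illustration, not the MLlib object:

```scala
object GiniCheck {
  // Multiclass Gini impurity: 1 - sum_i f_i^2 over the class frequencies.
  def gini(counts: Array[Double], totalCount: Double): Double = {
    var impurity = 1.0
    var classIndex = 0
    while (classIndex < counts.length) {
      val freq = counts(classIndex) / totalCount
      impurity -= freq * freq
      classIndex += 1
    }
    impurity
  }

  def main(args: Array[String]): Unit = {
    // Three classes with frequencies 0.2, 0.3, 0.5:
    // 1 - 0.04 - 0.09 - 0.25 = 0.62 (up to floating point).
    println(gini(Array(2.0, 3.0, 5.0), 10.0))
  }
}
```

Note that unlike the old binary version, no zero-count special case is needed: a zero count contributes nothing to the sum.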
@@ -28,13 +28,13 @@ trait Impurity extends Serializable {

/**
* :: DeveloperApi ::
* information calculation for binary classification
* @param c0 count of instances with label 0
* @param c1 count of instances with label 1
* information calculation for multiclass classification
* @param counts Array[Double] with counts for each label
* @param totalCount sum of counts for all labels
* @return information value
*/
@DeveloperApi
def calculate(c0 : Double, c1 : Double): Double
def calculate(counts: Array[Double], totalCount: Double): Double

/**
* :: DeveloperApi ::
@@ -25,7 +25,16 @@ import org.apache.spark.annotation.{DeveloperApi, Experimental}
*/
@Experimental
object Variance extends Impurity {
override def calculate(c0: Double, c1: Double): Double =

/**
* :: DeveloperApi ::
* information calculation for multiclass classification
* @param counts Array[Double] with counts for each label
* @param totalCount sum of counts for all labels
* @return information value
*/
@DeveloperApi
override def calculate(counts: Array[Double], totalCount: Double): Double =
throw new UnsupportedOperationException("Variance.calculate")

/**
@@ -28,7 +28,7 @@ import org.apache.spark.mllib.tree.configuration.FeatureType._
* @param highSplit signifying the upper threshold for the continuous feature to be
* accepted in the bin
* @param featureType type of feature -- categorical or continuous
* @param category categorical label value accepted in the bin
* @param category categorical label value accepted in the bin for binary classification
*/
private[tree]
case class Bin(lowSplit: Split, highSplit: Split, featureType: FeatureType, category: Double)
@@ -27,17 +27,19 @@ import org.apache.spark.annotation.DeveloperApi
* @param leftImpurity left node impurity
* @param rightImpurity right node impurity
* @param predict predicted value
* @param prob probability of the label (classification only)
*/
@DeveloperApi
class InformationGainStats(
val gain: Double,
val impurity: Double,
val leftImpurity: Double,
val rightImpurity: Double,
val predict: Double) extends Serializable {
val predict: Double,
val prob: Double = 0.0) extends Serializable {

override def toString = {
"gain = %f, impurity = %f, left impurity = %f, right impurity = %f, predict = %f"
.format(gain, impurity, leftImpurity, rightImpurity, predict)
"gain = %f, impurity = %f, left impurity = %f, right impurity = %f, predict = %f, prob = %f"
.format(gain, impurity, leftImpurity, rightImpurity, predict, prob)
}
}