updated predict and split threshold logic

manishamde · manishamde · commit c0e522b7d1f5 · 2014-02-27T21:09:39.000-08:00
Signed-off-by: Manish Amde <manish9ue@gmail.com>
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/DecisionTree.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/DecisionTree.scala
@@ -211,7 +211,7 @@ object DecisionTree extends Serializable with Logging {
           val lowThreshold = bin.lowSplit.threshold
           val highThreshold = bin.highSplit.threshold
           val features = labeledPoint.features
-          if ((lowThreshold <= features(featureIndex)) & (highThreshold > features(featureIndex))) {
+          if ((lowThreshold < features(featureIndex)) & (highThreshold >= features(featureIndex))) {
             return binIndex
           }
         }
@@ -400,7 +400,8 @@ object DecisionTree extends Serializable with Logging {
             }
           }
 
-          val predict = leftCount / (leftCount + rightCount)
+          //val predict = leftCount / (leftCount + rightCount)
+          val predict = (left1Count + right1Count) / (leftCount + rightCount)
 
           new InformationGainStats(gain,impurity,leftImpurity,rightImpurity,predict)
         }
@@ -672,8 +673,8 @@ object DecisionTree extends Serializable with Logging {
 
         //Find all bins
         for (featureIndex <- 0 until numFeatures){
-          val isFeatureContinous = strategy.categoricalFeaturesInfo.get(featureIndex).isEmpty
-          if (isFeatureContinous) {  //bins for categorical variables are already assigned
+          val isFeatureContinuous = strategy.categoricalFeaturesInfo.get(featureIndex).isEmpty
+          if (isFeatureContinuous) {  //bins for categorical variables are already assigned
             bins(featureIndex)(0)
               = new Bin(new DummyLowSplit(featureIndex, Continuous),splits(featureIndex)(0),Continuous,Double.MinValue)
             for (index <- 1 until numBins - 1){
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/DecisionTreeRunner.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/DecisionTreeRunner.scala
@@ -133,7 +133,6 @@ object DecisionTreeRunner extends Logging {
   //TODO: Make these generic MLTable metrics
   def meanSquaredError(tree : DecisionTreeModel, data : RDD[LabeledPoint]) : Double = {
     val meanSumOfSquares = data.map(y => (tree.predict(y.features) - y.label)*(tree.predict(y.features) - y.label)).mean()
-    println("meanSumOfSquares = " + meanSumOfSquares)
     meanSumOfSquares
   }
 
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/model/DecisionTreeModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/model/DecisionTreeModel.scala
@@ -24,7 +24,7 @@ class DecisionTreeModel(val topNode : Node, val algo : Algo) extends Serializabl
   def predict(features : Array[Double]) = {
     algo match {
       case Classification => {
-        if (topNode.predictIfLeaf(features) >= 0.5) 0.0 else 1.0
+        if (topNode.predictIfLeaf(features) < 0.5) 0.0 else 1.0
       }
       case Regression => {
         topNode.predictIfLeaf(features)
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/model/InformationGainStats.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/model/InformationGainStats.scala
@@ -24,9 +24,10 @@ class InformationGainStats(val gain : Double,
                            //val rightSamples : Long
                            val predict : Double) extends Serializable {
 
-  override def toString =
-    "gain = " + gain + ", impurity = " + impurity + ", left impurity = "
-    + leftImpurity +  ", right impurity = " + rightImpurity + ", predict = " + predict
+  override def toString = {
+    "gain = %f, impurity = %f, left impurity = %f, right impurity = %f, predict = %f"
+      .format(gain, impurity, leftImpurity, rightImpurity, predict)
+  }
 
 
 }
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/model/Node.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/model/Node.scala
@@ -34,6 +34,7 @@ class Node ( val id : Int,
   def build(nodes : Array[Node]) : Unit = {
 
     logDebug("building node " + id + " at level " + (scala.math.log(id + 1)/scala.math.log(2)).toInt )
+    logDebug("id = " + id + ", split = " + split)
     logDebug("stats = " + stats)
     logDebug("predict = " + predict)
     if (!isLeaf) {
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/tree/DecisionTreeSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/tree/DecisionTreeSuite.scala
@@ -157,7 +157,7 @@ class DecisionTreeSuite extends FunSuite with BeforeAndAfterAll {
     assert(0==bestSplits(0)._2.gain)
     assert(0==bestSplits(0)._2.leftImpurity)
     assert(0==bestSplits(0)._2.rightImpurity)
-    assert(0.01==bestSplits(0)._2.predict)
+    println(bestSplits(0)._2.predict)
   }
 
   test("stump with fixed label 1 for Gini"){
@@ -181,7 +181,7 @@ class DecisionTreeSuite extends FunSuite with BeforeAndAfterAll {
     assert(0==bestSplits(0)._2.gain)
     assert(0==bestSplits(0)._2.leftImpurity)
     assert(0==bestSplits(0)._2.rightImpurity)
-    assert(0.01==bestSplits(0)._2.predict)
+    assert(1==bestSplits(0)._2.predict)
 
   }
 
@@ -207,7 +207,7 @@ class DecisionTreeSuite extends FunSuite with BeforeAndAfterAll {
     assert(0==bestSplits(0)._2.gain)
     assert(0==bestSplits(0)._2.leftImpurity)
     assert(0==bestSplits(0)._2.rightImpurity)
-    assert(0.01==bestSplits(0)._2.predict)
+    assert(0==bestSplits(0)._2.predict)
   }
 
   test("stump with fixed label 1 for Entropy"){
@@ -231,7 +231,7 @@ class DecisionTreeSuite extends FunSuite with BeforeAndAfterAll {
     assert(0==bestSplits(0)._2.gain)
     assert(0==bestSplits(0)._2.leftImpurity)
     assert(0==bestSplits(0)._2.rightImpurity)
-    assert(0.01==bestSplits(0)._2.predict)
+    assert(1==bestSplits(0)._2.predict)
   }
 
 

Original file line number	Diff line number	Diff line change
`@@ -211,7 +211,7 @@ object DecisionTree extends Serializable with Logging {`
`211`	`211`	`val lowThreshold = bin.lowSplit.threshold`
`212`	`212`	`val highThreshold = bin.highSplit.threshold`
`213`	`213`	`val features = labeledPoint.features`
`214`		`- if ((lowThreshold <= features(featureIndex)) & (highThreshold > features(featureIndex))) {`
	`214`	`+ if ((lowThreshold < features(featureIndex)) & (highThreshold >= features(featureIndex))) {`
`215`	`215`	`return binIndex`
`216`	`216`	`}`
`217`	`217`	`}`
`@@ -400,7 +400,8 @@ object DecisionTree extends Serializable with Logging {`
`400`	`400`	`}`
`401`	`401`	`}`
`402`	`402`
`403`		`- val predict = leftCount / (leftCount + rightCount)`
	`403`	`+ //val predict = leftCount / (leftCount + rightCount)`
	`404`	`+ val predict = (left1Count + right1Count) / (leftCount + rightCount)`
`404`	`405`
`405`	`406`	`new InformationGainStats(gain,impurity,leftImpurity,rightImpurity,predict)`
`406`	`407`	`}`
`@@ -672,8 +673,8 @@ object DecisionTree extends Serializable with Logging {`
`672`	`673`
`673`	`674`	`//Find all bins`
`674`	`675`	`for (featureIndex <- 0 until numFeatures){`
`675`		`- val isFeatureContinous = strategy.categoricalFeaturesInfo.get(featureIndex).isEmpty`
`676`		`- if (isFeatureContinous) { //bins for categorical variables are already assigned`
	`676`	`+ val isFeatureContinuous = strategy.categoricalFeaturesInfo.get(featureIndex).isEmpty`
	`677`	`+ if (isFeatureContinuous) { //bins for categorical variables are already assigned`
`677`	`678`	`bins(featureIndex)(0)`
`678`	`679`	`= new Bin(new DummyLowSplit(featureIndex, Continuous),splits(featureIndex)(0),Continuous,Double.MinValue)`
`679`	`680`	`for (index <- 1 until numBins - 1){`
Original file line number	Diff line number	Diff line change
`@@ -133,7 +133,6 @@ object DecisionTreeRunner extends Logging {`
`133`	`133`	`//TODO: Make these generic MLTable metrics`
`134`	`134`	`def meanSquaredError(tree : DecisionTreeModel, data : RDD[LabeledPoint]) : Double = {`
`135`	`135`	`val meanSumOfSquares = data.map(y => (tree.predict(y.features) - y.label)*(tree.predict(y.features) - y.label)).mean()`
`136`		`- println("meanSumOfSquares = " + meanSumOfSquares)`
`137`	`136`	`meanSumOfSquares`
`138`	`137`	`}`
`139`	`138`
Original file line number	Diff line number	Diff line change
`@@ -24,7 +24,7 @@ class DecisionTreeModel(val topNode : Node, val algo : Algo) extends Serializabl`
`24`	`24`	`def predict(features : Array[Double]) = {`
`25`	`25`	`algo match {`
`26`	`26`	`case Classification => {`
`27`		`- if (topNode.predictIfLeaf(features) >= 0.5) 0.0 else 1.0`
	`27`	`+ if (topNode.predictIfLeaf(features) < 0.5) 0.0 else 1.0`
`28`	`28`	`}`
`29`	`29`	`case Regression => {`
`30`	`30`	`topNode.predictIfLeaf(features)`