Commit a490787

hhbyyh authored and jkbradley committed
[SPARK-12026][MLLIB] ChiSqTest gets slower and slower over time when number of features is large
jira: https://issues.apache.org/jira/browse/SPARK-12026

The issue is valid: features.toArray.view.zipWithIndex.slice(startCol, endCol) becomes slower as startCol grows. I tested locally; the change improves performance and keeps the running time stable.

Author: Yuhao Yang <hhbyyh@gmail.com>

Closes #10146 from hhbyyh/chiSq.

(cherry picked from commit 021dafc)
Signed-off-by: Joseph K. Bradley <joseph@databricks.com>
1 parent: 26f13fa
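For intuition, here is a rough, hypothetical micro-benchmark (not part of the patch; object name, array size, and window width are made up) that reproduces the slowdown the commit message describes: slicing a zipped view still walks the underlying sequence from index 0, so each slice costs roughly O(startCol) rather than O(width).

```scala
// Hypothetical timing sketch; illustrates why the sliced view gets slower
// as startCol grows. None of these names come from the Spark source.
object SliceTiming {
  def main(args: Array[String]): Unit = {
    val features = Array.fill(2000000)(1.0) // stand-in for a wide feature vector
    val width = 1000
    for (startCol <- Seq(0, 500000, 1000000, 1500000)) {
      val t0 = System.nanoTime()
      // The access pattern removed by this commit:
      features.view.zipWithIndex.slice(startCol, startCol + width).foreach(_ => ())
      println(f"startCol=$startCol%8d  ${(System.nanoTime() - t0) / 1e6}%8.2f ms")
    }
  }
}
```

The patched code instead indexes directly into the Breeze vector, so each row's work is proportional to endCol - startCol regardless of where the window starts.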

File tree

1 file changed: +4 -2 lines changed

mllib/src/main/scala/org/apache/spark/mllib/stat/test/ChiSqTest.scala

Lines changed: 4 additions & 2 deletions
@@ -109,7 +109,9 @@ private[stat] object ChiSqTest extends Logging {
       }
       i += 1
       distinctLabels += label
-      features.toArray.view.zipWithIndex.slice(startCol, endCol).map { case (feature, col) =>
+      val brzFeatures = features.toBreeze
+      (startCol until endCol).map { col =>
+        val feature = brzFeatures(col)
         allDistinctFeatures(col) += feature
         (col, feature, label)
       }
@@ -122,7 +124,7 @@ private[stat] object ChiSqTest extends Logging {
        pairCounts.keys.filter(_._1 == startCol).map(_._3).toArray.distinct.zipWithIndex.toMap
      }
    val numLabels = labels.size
-    pairCounts.keys.groupBy(_._1).map { case (col, keys) =>
+    pairCounts.keys.groupBy(_._1).foreach { case (col, keys) =>
      val features = keys.map(_._2).toArray.distinct.zipWithIndex.toMap
      val numRows = features.size
      val contingency = new BDM(numRows, numLabels, new Array[Double](numRows * numLabels))
