Commit a5c13b0

Added min and max to Scala and Java RDD, added min and max to StatCounter
1 parent ed67136 · commit a5c13b0

6 files changed: +33 -2 lines changed

core/src/main/scala/org/apache/spark/api/java/JavaRDDLike.scala

Lines changed: 10 additions & 0 deletions
@@ -477,6 +477,16 @@ trait JavaRDDLike[T, This <: JavaRDDLike[T, This]] extends Serializable {
     new java.util.ArrayList(arr)
   }
 
+  def max(comp: Comparator[T]): T = {
+    import scala.collection.JavaConversions._
+    rdd.max()(Ordering.comparatorToOrdering(comp))
+  }
+
+  def min(comp: Comparator[T]): T = {
+    import scala.collection.JavaConversions._
+    rdd.min()(Ordering.comparatorToOrdering(comp))
+  }
+
   /**
    * Returns the first K elements from this RDD using the
    * natural ordering for T while maintain the order.
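
The Java-facing overloads above reuse the Scala implementation by bridging a java.util.Comparator to an Ordering via Ordering.comparatorToOrdering. A minimal sketch of that bridge on its own, outside Spark (the byLength comparator is a made-up example, not part of the patch):

import java.util.Comparator

// Hypothetical comparator ordering strings by length.
val byLength: Comparator[String] = new Comparator[String] {
  def compare(a: String, b: String): Int = a.length - b.length
}

// The same bridge the new max(comp)/min(comp) methods use internally.
val ord: Ordering[String] = Ordering.comparatorToOrdering(byLength)
ord.max("spark", "rdd")  // "spark" -- the longer string wins under byLength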

core/src/main/scala/org/apache/spark/rdd/RDD.scala

Lines changed: 4 additions & 0 deletions
@@ -958,6 +958,10 @@ abstract class RDD[T: ClassTag](
    */
   def takeOrdered(num: Int)(implicit ord: Ordering[T]): Array[T] = top(num)(ord.reverse)
 
+  def max()(implicit ord: Ordering[T]):T = this.reduce{(x,y) => ord.max(x,y)}
+
+  def min()(implicit ord: Ordering[T]):T = this.reduce{(x,y) => ord.min(x,y)}
+
   /**
    * Save this RDD as a text file, using string representations of elements.
    */
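
Both methods are plain reductions under the implicit Ordering, so they run pairwise across partitions and, like any reduce, throw on an empty RDD. A minimal sketch of the same semantics on an ordinary Scala collection (no SparkContext assumed):

// Pairwise reduction with Ordering.max/min, as in the two one-liners above.
val ord = Ordering[Int]  // the implicit natural ordering
val nums = Seq(3, 1, 4, 1, 5)
val maxValue = nums.reduce((x, y) => ord.max(x, y))  // 5
val minValue = nums.reduce((x, y) => ord.min(x, y))  // 1

// A non-default ordering can be supplied explicitly, mirroring rdd.max()(ord):
val byAbs: Ordering[Int] = Ordering.by((x: Int) => math.abs(x))
Seq(-7, 3).reduce((x, y) => byAbs.max(x, y))  // -7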

core/src/main/scala/org/apache/spark/util/StatCounter.scala

Lines changed: 15 additions & 1 deletion
@@ -29,6 +29,8 @@ class StatCounter(values: TraversableOnce[Double]) extends Serializable {
   private var n: Long = 0    // Running count of our values
   private var mu: Double = 0 // Running mean of our values
   private var m2: Double = 0 // Running variance numerator (sum of (x - mean)^2)
+  private var max_v: Double = 0 // Running max of our values
+  private var min_v: Double = 0 // Running min of our values
 
   merge(values)
 
@@ -41,6 +43,8 @@ class StatCounter(values: TraversableOnce[Double]) extends Serializable {
     n += 1
     mu += delta / n
     m2 += delta * (value - mu)
+    max_v = math.max(max_v, value)
+    min_v = math.min(min_v, value)
     this
   }
 
@@ -58,7 +62,9 @@ class StatCounter(values: TraversableOnce[Double]) extends Serializable {
       if (n == 0) {
         mu = other.mu
         m2 = other.m2
-        n = other.n
+        n = other.n
+        max_v = other.max_v
+        min_v = other.min_v
       } else if (other.n != 0) {
         val delta = other.mu - mu
         if (other.n * 10 < n) {
@@ -70,6 +76,8 @@ class StatCounter(values: TraversableOnce[Double]) extends Serializable {
         }
         m2 += other.m2 + (delta * delta * n * other.n) / (n + other.n)
         n += other.n
+        max_v = math.max(max_v, other.max_v)
+        min_v = math.min(min_v, other.min_v)
       }
       this
     }
@@ -81,6 +89,8 @@ class StatCounter(values: TraversableOnce[Double]) extends Serializable {
     other.n = n
     other.mu = mu
     other.m2 = m2
+    other.max_v = max_v
+    other.min_v = min_v
     other
   }
 
@@ -90,6 +100,10 @@ class StatCounter(values: TraversableOnce[Double]) extends Serializable {
 
   def sum: Double = n * mu
 
+  def max: Double = max_v
+
+  def min: Double = min_v
+
   /** Return the variance of the values. */
   def variance: Double = {
     if (n == 0) {
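
The running extremes follow the same single-pass, mergeable pattern as the existing mean/variance fields: fold values per partition, then merge the partial counters. A standalone sketch of just that pattern (the class name and seeds are this sketch's choices; note it seeds at positive/negative infinity, whereas the patch seeds both fields at 0, which would leave max at 0 for all-negative inputs and min at 0 for all-positive ones):

// Mergeable running min/max, mirroring the max_v/min_v fields added above.
class MinMax extends Serializable {
  private var maxV = Double.NegativeInfinity
  private var minV = Double.PositiveInfinity

  // Fold one value into the running extremes.
  def merge(value: Double): this.type = {
    maxV = math.max(maxV, value)
    minV = math.min(minV, value)
    this
  }

  // Combine with another partition's partial result.
  def merge(other: MinMax): this.type = {
    maxV = math.max(maxV, other.maxV)
    minV = math.min(minV, other.minV)
    this
  }

  def max: Double = maxV
  def min: Double = minV
}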

core/src/test/scala/org/apache/spark/PartitioningSuite.scala

Lines changed: 2 additions & 0 deletions
@@ -171,6 +171,8 @@ class PartitioningSuite extends FunSuite with SharedSparkContext with PrivateMethodTester {
     assert(abs(6.0/2 - rdd.mean) < 0.01)
     assert(abs(1.0 - rdd.variance) < 0.01)
     assert(abs(1.0 - rdd.stdev) < 0.01)
+    assert(abs(4.0 - stats.max) === 0)
+    assert(abs(-1.0 - stats.max) === 0)
 
     // Add other tests here for classes that should be able to handle empty partitions correctly
   }
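
The stats value asserted against here is bound earlier in the test, outside the hunk shown; on an RDD[Double] it would come from rdd.stats(), which builds one StatCounter per partition and merges them, so the new max/min ride along with count, mean, and variance in a single pass. A hedged sketch of that usage (assuming a live SparkContext sc and the double-RDD implicits in scope):

// rdd.stats() returns the merged StatCounter, now carrying max/min too.
val rdd = sc.parallelize(Seq(2.0, 4.0), 2)
val stats = rdd.stats()
stats.max  // 4.0
stats.min  // 2.0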

core/src/test/scala/org/apache/spark/rdd/RDDSuite.scala

Lines changed: 2 additions & 0 deletions
@@ -47,6 +47,8 @@ class RDDSuite extends FunSuite with SharedSparkContext {
     assert(nums.glom().map(_.toList).collect().toList === List(List(1, 2), List(3, 4)))
     assert(nums.collect({ case i if i >= 3 => i.toString }).collect().toList === List("3", "4"))
     assert(nums.keyBy(_.toString).collect().toList === List(("1", 1), ("2", 2), ("3", 3), ("4", 4)))
+    assert(nums.max() === 4)
+    assert(nums.min() === 1)
     val partitionSums = nums.mapPartitions(iter => Iterator(iter.reduceLeft(_ + _)))
     assert(partitionSums.collect().toList === List(3, 7))
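
The two added asserts exercise the natural-ordering path. Because the ordering is an implicit second parameter list, a caller can also pass one explicitly; a small sketch against the same nums RDD the suite builds:

// Natural ordering, as asserted above:
nums.max()  // 4
nums.min()  // 1

// An explicit Ordering flips the result:
nums.max()(Ordering[Int].reverse)  // 1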

python/pyspark/rdd.py

Lines changed: 0 additions & 1 deletion
@@ -24,7 +24,6 @@
 import sys
 import shlex
 import traceback
-from bisect import bisect_right
 from subprocess import Popen, PIPE
 from tempfile import NamedTemporaryFile
 from threading import Thread
