@@ -21,6 +21,7 @@ import breeze.linalg.{Vector => BV, DenseVector => BDV}
21
21
import org .apache .spark .mllib .linalg .{Vector , Vectors }
22
22
import org .apache .spark .mllib .util .MLUtils ._
23
23
import org .apache .spark .rdd .RDD
24
+ import breeze .numerics ._
24
25
25
26
/**
26
27
* Extra functions available on RDDs of [[org.apache.spark.mllib.linalg.Vector Vector ]] through an
@@ -161,4 +162,24 @@ class VectorRDDFunctions(self: RDD[Vector]) extends Serializable {
161
162
}
162
163
}
163
164
}
165
+
166
/**
 * Computes the element-wise mean and population variance of the vectors in this RDD in a
 * single `aggregate` pass.
 *
 * Every partition keeps a running (mean, M2n, count) triple, where M2n is the running sum of
 * squared deviations from the current mean (Welford-style update). Partition triples are then
 * merged pairwise with the parallel combination formula
 * `M2n = M2n_l + M2n_r + delta^2 * n_l * n_r / (n_l + n_r)`.
 *
 * @param size length of the accumulator vectors; assumed to equal the length of every vector
 *             in the RDD (not checked here -- TODO confirm against callers)
 * @return a pair (mean, variance), where variance is M2n divided by the element count
 *         (population variance, not the n - 1 sample variance). For an RDD with no elements
 *         the mean is all zeros and every variance entry is 0.0 / 0.0, i.e. NaN.
 */
def parallelMeanAndVar(size: Int): (Vector, Vector) = {
  val (mean, m2n, count) = self.map(_.toBreeze).aggregate(
    (BV.zeros[Double](size), BV.zeros[Double](size), 0.0))(
    seqOp = (acc, sample) => (acc, sample) match {
      case ((prevMean, prevM2n, cnt), currData) =>
        val newCnt = cnt + 1.0
        val currMean = ((prevMean :* cnt) + currData) :/ newCnt
        // Deviation from the old mean times deviation from the new mean keeps M2n exact
        // without a second pass over the data.
        val newM2n = prevM2n + ((currData - prevMean) :* (currData - currMean))
        (currMean, newM2n, newCnt)
    },
    combOp = (lhs, rhs) => (lhs, rhs) match {
      // An accumulator that saw no data (the zero value or an empty partition) must not take
      // part in the arithmetic: with both counts at 0 the mean would be 0 / 0 = NaN, and
      // that NaN would then poison every later merge.
      case ((_, _, lhsCnt), _) if lhsCnt == 0.0 => rhs
      case (_, (_, _, rhsCnt)) if rhsCnt == 0.0 => lhs
      case ((lhsMean, lhsM2n, lhsCnt), (rhsMean, rhsM2n, rhsCnt)) =>
        val totalCnt = lhsCnt + rhsCnt
        // Explicit parentheses: the weighted sum is formed first, then divided by the count.
        val totalMean = ((lhsMean :* lhsCnt) + (rhsMean :* rhsCnt)) :/ totalCnt
        val deltaMean = rhsMean - lhsMean
        val totalM2n =
          lhsM2n + rhsM2n + (((deltaMean :* deltaMean) :* (lhsCnt * rhsCnt)) :/ totalCnt)
        (totalMean, totalM2n, totalCnt)
    }
  )

  (Vectors.fromBreeze(mean), Vectors.fromBreeze(m2n :/ count))
}
164
185
}
0 commit comments