@@ -20,6 +20,7 @@ import breeze.linalg.{Vector => BV}
20
20
21
21
import org .apache .spark .mllib .linalg .{Vector , Vectors }
22
22
import org .apache .spark .rdd .RDD
23
+ import breeze .linalg .axpy
23
24
24
25
case class VectorRDDStatisticalSummary (
25
26
mean : Vector ,
@@ -58,17 +59,22 @@ class VectorRDDFunctions(self: RDD[Vector]) extends Serializable {
58
59
BV .fill(size){Double .MaxValue }))(
59
60
seqOp = (c, v) => (c, v) match {
60
61
case ((prevMean, prevM2n, cnt, nnzVec, maxVec, minVec), currData) =>
61
- val currMean = ((prevMean :* cnt) + currData) :/ (cnt + 1.0 )
62
- val nonZeroCnt = Vectors
63
- .sparse(size, currData.activeKeysIterator.toSeq.map(x => (x, 1.0 ))).toBreeze
62
+ val currMean = prevMean :* (cnt / (cnt + 1.0 ))
63
+ axpy(1.0 / (cnt+ 1.0 ), currData, currMean)
64
+ axpy(- 1.0 , currData, prevMean)
65
+ prevMean :*= (currMean - currData)
66
+ axpy(1.0 , prevMean, prevM2n)
67
+ axpy(1.0 ,
68
+ Vectors .sparse(size, currData.activeKeysIterator.toSeq.map(x => (x, 1.0 ))).toBreeze,
69
+ nnzVec)
64
70
currData.activeIterator.foreach { case (id, value) =>
65
71
if (maxVec(id) < value) maxVec(id) = value
66
72
if (minVec(id) > value) minVec(id) = value
67
73
}
68
74
(currMean,
69
- prevM2n + ((currData - prevMean) :* (currData - currMean)) ,
75
+ prevM2n,
70
76
cnt + 1.0 ,
71
- nnzVec + nonZeroCnt ,
77
+ nnzVec,
72
78
maxVec,
73
79
minVec)
74
80
},
@@ -77,23 +83,30 @@ class VectorRDDFunctions(self: RDD[Vector]) extends Serializable {
77
83
(lhsMean, lhsM2n, lhsCnt, lhsNNZ, lhsMax, lhsMin),
78
84
(rhsMean, rhsM2n, rhsCnt, rhsNNZ, rhsMax, rhsMin)) =>
79
85
val totalCnt = lhsCnt + rhsCnt
80
- val totalMean = (lhsMean :* lhsCnt) + (rhsMean :* rhsCnt) :/ totalCnt
81
86
val deltaMean = rhsMean - lhsMean
82
- val totalM2n =
83
- lhsM2n + rhsM2n + (((deltaMean :* deltaMean) :* (lhsCnt * rhsCnt)) :/ totalCnt)
87
+ lhsMean :*= (lhsCnt / totalCnt)
88
+ axpy(rhsCnt/ totalCnt, rhsMean, lhsMean)
89
+ val totalMean = lhsMean
90
+ deltaMean :*= deltaMean
91
+ axpy(lhsCnt* rhsCnt/ totalCnt, deltaMean, lhsM2n)
92
+ axpy(1.0 , rhsM2n, lhsM2n)
93
+ val totalM2n = lhsM2n
84
94
rhsMax.activeIterator.foreach { case (id, value) =>
85
95
if (lhsMax(id) < value) lhsMax(id) = value
86
96
}
87
97
rhsMin.activeIterator.foreach { case (id, value) =>
88
98
if (lhsMin(id) > value) lhsMin(id) = value
89
99
}
90
- (totalMean, totalM2n, totalCnt, lhsNNZ + rhsNNZ, lhsMax, lhsMin)
100
+ axpy(1.0 , rhsNNZ, lhsNNZ)
101
+ (totalMean, totalM2n, totalCnt, lhsNNZ, lhsMax, lhsMin)
91
102
}
92
103
)
93
104
105
+ results._2 :/= results._3
106
+
94
107
VectorRDDStatisticalSummary (
95
108
Vectors .fromBreeze(results._1),
96
- Vectors .fromBreeze(results._2 :/ results._3 ),
109
+ Vectors .fromBreeze(results._2),
97
110
results._3.toLong,
98
111
Vectors .fromBreeze(results._4),
99
112
Vectors .fromBreeze(results._5),
0 commit comments