16
16
*/
17
17
package org .apache .spark .mllib .rdd
18
18
19
- import breeze .linalg .{Vector => BV , axpy }
19
+ import breeze .linalg .{axpy , Vector => BV }
20
20
21
21
import org .apache .spark .mllib .linalg .{Vector , Vectors }
22
22
import org .apache .spark .rdd .RDD
23
23
24
+ /**
25
+ * Case class of the summary statistics, including mean, variance, count, max, min, and count of
25
+ * non-zero elements.
27
+ */
24
28
case class VectorRDDStatisticalSummary (
25
29
mean : Vector ,
26
30
variance : Vector ,
@@ -29,6 +33,12 @@ case class VectorRDDStatisticalSummary(
29
33
min : Vector ,
30
34
nonZeroCnt : Vector ) extends Serializable
31
35
36
+ /**
37
+ * Case class of the aggregate value for collecting summary statistics from RDD[Vector]. These
38
+ * values correspond to
39
+ * [[org.apache.spark.mllib.rdd.VectorRDDStatisticalSummary VectorRDDStatisticalSummary]]; the
40
+ * latter is computed from the former.
41
+ */
32
42
private case class VectorRDDStatisticalRing (
33
43
fakeMean : BV [Double ],
34
44
fakeM2n : BV [Double ],
@@ -45,18 +55,8 @@ private case class VectorRDDStatisticalRing(
45
55
class VectorRDDFunctions (self : RDD [Vector ]) extends Serializable {
46
56
47
57
/**
48
- * Compute full column-wise statistics for the RDD, including
49
- * {{{
50
- * Mean: Vector,
51
- * Variance: Vector,
52
- * Count: Double,
53
- * Non-zero count: Vector,
54
- * Maximum elements: Vector,
55
- * Minimum elements: Vector.
56
- * }}},
57
- * with the size of Vector as input parameter.
58
+ * Aggregate function used for aggregating elements within a single worker.
58
59
*/
59
-
60
60
private def seqOp (
61
61
aggregator : VectorRDDStatisticalRing ,
62
62
currData : BV [Double ]): VectorRDDStatisticalRing = {
@@ -84,6 +84,9 @@ class VectorRDDFunctions(self: RDD[Vector]) extends Serializable {
84
84
}
85
85
}
86
86
87
+ /**
88
+ * Combine function used for combining intermediate results together from every worker.
89
+ */
87
90
private def combOp (
88
91
statistics1 : VectorRDDStatisticalRing ,
89
92
statistics2 : VectorRDDStatisticalRing ): VectorRDDStatisticalRing = {
@@ -92,27 +95,38 @@ class VectorRDDFunctions(self: RDD[Vector]) extends Serializable {
92
95
VectorRDDStatisticalRing (mean2, m2n2, cnt2, nnz2, max2, min2)) =>
93
96
val totalCnt = cnt1 + cnt2
94
97
val deltaMean = mean2 - mean1
98
+
95
99
mean2.activeIterator.foreach {
96
100
case (id, 0.0 ) =>
97
- case (id, value) => mean1(id) = (mean1(id) * nnz1(id) + mean2(id) * nnz2(id)) / (nnz1(id) + nnz2(id))
101
+ case (id, value) =>
102
+ mean1(id) = (mean1(id) * nnz1(id) + mean2(id) * nnz2(id)) / (nnz1(id) + nnz2(id))
98
103
}
104
+
99
105
m2n2.activeIterator.foreach {
100
106
case (id, 0.0 ) =>
101
- case (id, value) => m2n1(id) += value + deltaMean(id) * deltaMean(id) * nnz1(id) * nnz2(id) / (nnz1(id)+ nnz2(id))
107
+ case (id, value) =>
108
+ m2n1(id) +=
109
+ value + deltaMean(id) * deltaMean(id) * nnz1(id) * nnz2(id) / (nnz1(id)+ nnz2(id))
102
110
}
111
+
103
112
max2.activeIterator.foreach {
104
113
case (id, value) =>
105
114
if (max1(id) < value) max1(id) = value
106
115
}
116
+
107
117
min2.activeIterator.foreach {
108
118
case (id, value) =>
109
119
if (min1(id) > value) min1(id) = value
110
120
}
121
+
111
122
axpy(1.0 , nnz2, nnz1)
112
123
VectorRDDStatisticalRing (mean1, m2n1, totalCnt, nnz1, max1, min1)
113
124
}
114
125
}
115
126
127
+ /**
128
+ * Compute full column-wise statistics for the RDD with the size of Vector as input parameter.
129
+ */
116
130
def summarizeStatistics (size : Int ): VectorRDDStatisticalSummary = {
117
131
val zeroValue = VectorRDDStatisticalRing (
118
132
BV .zeros[Double ](size),
@@ -122,16 +136,17 @@ class VectorRDDFunctions(self: RDD[Vector]) extends Serializable {
122
136
BV .fill(size)(Double .MinValue ),
123
137
BV .fill(size)(Double .MaxValue ))
124
138
125
- val breezeVectors = self.map(_.toBreeze)
126
139
val VectorRDDStatisticalRing (fakeMean, fakeM2n, totalCnt, nnz, fakeMax, fakeMin) =
127
- breezeVectors .aggregate(zeroValue)(seqOp, combOp)
140
+ self.map(_.toBreeze) .aggregate(zeroValue)(seqOp, combOp)
128
141
129
142
// solve real mean
130
143
val realMean = fakeMean :* nnz :/ totalCnt
131
- // solve real variance
132
- val deltaMean = fakeMean :- 0.0
133
- val realVar = fakeM2n - ((deltaMean :* deltaMean) :* (nnz :* (nnz :- totalCnt)) :/ totalCnt)
134
- // max, min
144
+
145
+ // solve real m2n
146
+ val deltaMean = fakeMean
147
+ val realM2n = fakeM2n - ((deltaMean :* deltaMean) :* (nnz :* (nnz :- totalCnt)) :/ totalCnt)
148
+
149
+ // remove the initial value in max and min, i.e. the Double.MaxValue or Double.MinValue.
135
150
val max = Vectors .sparse(size, fakeMax.activeIterator.map { case (id, value) =>
136
151
if ((value == Double .MinValue ) && (realMean(id) != Double .MinValue )) (id, 0.0 )
137
152
else (id, value)
@@ -142,11 +157,11 @@ class VectorRDDFunctions(self: RDD[Vector]) extends Serializable {
142
157
}.toSeq)
143
158
144
159
// get variance
145
- realVar :/= totalCnt
160
+ realM2n :/= totalCnt
146
161
147
162
VectorRDDStatisticalSummary (
148
163
Vectors .fromBreeze(realMean),
149
- Vectors .fromBreeze(realVar ),
164
+ Vectors .fromBreeze(realM2n ),
150
165
totalCnt.toLong,
151
166
Vectors .fromBreeze(nnz),
152
167
max,
0 commit comments