@@ -157,58 +157,6 @@ object MLUtils {
157
157
dataStr.saveAsTextFile(dir)
158
158
}
159
159
160
/**
 * Utility function to compute the label mean and the per-feature mean and standard deviation
 * of a given dataset in a single aggregation pass.
 *
 * @param data input data set whose statistics are computed
 * @param numFeatures number of features; used as the length of the per-feature accumulators
 * @param numExamples number of examples expected in the input dataset; it is checked against
 *                    the number of records actually aggregated
 *
 * @return (yMean, xColMean, xColSd) - Tuple consisting of
 *     yMean - mean of the labels
 *     xColMean - Row vector with mean for every column (or feature) of the input data
 *     xColSd - Row vector with the population standard deviation (normalized by the number
 *              of examples) for every column (or feature) of the input data
 * @throws IllegalArgumentException if the input is empty or its size differs from numExamples
 */
def computeStats(
    data: RDD[LabeledPoint],
    numFeatures: Int,
    numExamples: Long): (Double, Vector, Vector) = {
  val brzData = data.map { case LabeledPoint(label, features) =>
    (label, features.toBreeze)
  }
  // Accumulator layout: (count, sum of labels, per-feature sum, per-feature sum of squares).
  val aggStats = brzData.aggregate(
    (0L, 0.0, BDV.zeros[Double](numFeatures), BDV.zeros[Double](numFeatures))
  )(
    seqOp = (c, v) => (c, v) match {
      case ((n, sumLabel, sum, sumSq), (label, features)) =>
        // Only the active entries are visited when accumulating the squares.
        features.activeIterator.foreach { case (i, x) =>
          sumSq(i) += x * x
        }
        // The accumulator vectors are updated in place and handed back.
        (n + 1L, sumLabel + label, sum += features, sumSq)
    },
    combOp = (c1, c2) => (c1, c2) match {
      case ((n1, sumLabel1, sum1, sumSq1), (n2, sumLabel2, sum2, sumSq2)) =>
        (n1 + n2, sumLabel1 + sumLabel2, sum1 += sum2, sumSq1 += sumSq2)
    }
  )
  val (nl, sumLabel, sum, sumSq) = aggStats

  require(nl > 0, " Input data is empty.")
  require(nl == numExamples,
    s"Expected $numExamples examples but the input data contains $nl.")

  val n = nl.toDouble
  val yMean = sumLabel / n
  val mean = sum / n
  val std = new Array[Double](sum.length)
  var i = 0
  while (i < numFeatures) {
    // Var[x] = E[x^2] - E[x]^2. Rounding can push this marginally below zero for
    // (near-)constant columns, so clamp before taking the square root to avoid NaN.
    val variance = sumSq(i) / n - mean(i) * mean(i)
    std(i) = math.sqrt(math.max(variance, 0.0))
    i += 1
  }

  (yMean, Vectors.fromBreeze(mean), Vectors.dense(std))
}
211
-
212
160
/**
213
161
* Returns the squared Euclidean distance between two vectors. The following formula will be used
214
162
* if it does not introduce too much numerical error:
0 commit comments