Skip to content

Commit d61363f

Browse files
committed
rebase to latest master
1 parent 16ae684 commit d61363f

File tree

2 files changed

+1
-52
lines changed

2 files changed

+1
-52
lines changed

mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/RowMatrix.scala

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -161,6 +161,7 @@ private class ColumnStatisticsAggregator(private val n: Int)
161161
}
162162

163163
/**
164+
* :: Experimental ::
164165
* Represents a row-oriented distributed Matrix with no meaningful row indices.
165166
*
166167
* @param rows rows stored as an RDD[Vector]

mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala

Lines changed: 0 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -157,58 +157,6 @@ object MLUtils {
157157
dataStr.saveAsTextFile(dir)
158158
}
159159

160-
/**
161-
* Utility function to compute mean and standard deviation on a given dataset.
162-
*
163-
* @param data - input data set whose statistics are computed
164-
* @param numFeatures - number of features
165-
* @param numExamples - number of examples in input dataset
166-
*
167-
* @return (yMean, xColMean, xColSd) - Tuple consisting of
168-
* yMean - mean of the labels
169-
* xColMean - Row vector with mean for every column (or feature) of the input data
170-
* xColSd - Row vector with standard deviation for every column (or feature) of the input data.
171-
*/
172-
def computeStats(
173-
data: RDD[LabeledPoint],
174-
numFeatures: Int,
175-
numExamples: Long): (Double, Vector, Vector) = {
176-
val brzData = data.map { case LabeledPoint(label, features) =>
177-
(label, features.toBreeze)
178-
}
179-
val aggStats = brzData.aggregate(
180-
(0L, 0.0, BDV.zeros[Double](numFeatures), BDV.zeros[Double](numFeatures))
181-
)(
182-
seqOp = (c, v) => (c, v) match {
183-
case ((n, sumLabel, sum, sumSq), (label, features)) =>
184-
features.activeIterator.foreach { case (i, x) =>
185-
sumSq(i) += x * x
186-
}
187-
(n + 1L, sumLabel + label, sum += features, sumSq)
188-
},
189-
combOp = (c1, c2) => (c1, c2) match {
190-
case ((n1, sumLabel1, sum1, sumSq1), (n2, sumLabel2, sum2, sumSq2)) =>
191-
(n1 + n2, sumLabel1 + sumLabel2, sum1 += sum2, sumSq1 += sumSq2)
192-
}
193-
)
194-
val (nl, sumLabel, sum, sumSq) = aggStats
195-
196-
require(nl > 0, "Input data is empty.")
197-
require(nl == numExamples)
198-
199-
val n = nl.toDouble
200-
val yMean = sumLabel / n
201-
val mean = sum / n
202-
val std = new Array[Double](sum.length)
203-
var i = 0
204-
while (i < numFeatures) {
205-
std(i) = sumSq(i) / n - mean(i) * mean(i)
206-
i += 1
207-
}
208-
209-
(yMean, Vectors.fromBreeze(mean), Vectors.dense(std))
210-
}
211-
212160
/**
213161
* Returns the squared Euclidean distance between two vectors. The following formula will be used
214162
* if it does not introduce too much numerical error:

0 commit comments

Comments
 (0)