Skip to content

Commit 7cb0e40

Browse files
committed
fix for data inconsistency
1 parent 8881444 commit 7cb0e40

File tree

1 file changed

+8
-7
lines changed

1 file changed

+8
-7
lines changed

mllib/src/main/scala/org/apache/spark/mllib/rdd/RandomRDD.scala

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -30,12 +30,13 @@ private[mllib] class RandomRDDPartition(val idx: Int,
3030

3131
override val index: Int = idx
3232

33-
private val rng = distribution
34-
rng.setSeed(seed + idx)
35-
36-
private val iter = new FixedSizeIterator(size, rng)
37-
38-
def getIterator = iter
33+
// The RNG has to be reset every time the iterator is requested to guarantee same data
34+
// every time the content of the RDD is examined.
35+
def getIterator = {
36+
val newRng = distribution.copy()
37+
newRng.setSeed(seed + idx)
38+
new FixedSizeIterator(size, newRng)
39+
}
3940
}
4041

4142
private[mllib] class FixedSizeIterator(override val size: Long, val rng: Distribution)
@@ -71,7 +72,7 @@ private[mllib] class RandomRDD(@transient private var sc: SparkContext,
7172
partitions(i) = if (i == 0) {
7273
new RandomRDDPartition(i, firstPartitionSize, distribution, seed)
7374
} else {
74-
new RandomRDDPartition(i, partitionSize, distribution.copy, seed)
75+
new RandomRDDPartition(i, partitionSize, distribution, seed)
7576
}
7677
i += 1
7778
}

0 commit comments

Comments
 (0)