Skip to content

Commit 81fcdd2

Browse files
dorx authored and mengxr committed
[SPARK-2514] [mllib] Random RDD generator
Utilities for generating random RDDs. RandomRDD and RandomVectorRDD are created instead of using `sc.parallelize(range:Range)` because `Range` objects in Scala can only have `size <= Int.MaxValue`. The object `RandomRDDGenerators` can be transformed into a generator class to reduce the number of auxiliary methods for optional arguments. Author: Doris Xin <doris.s.xin@gmail.com> Closes #1520 from dorx/randomRDD and squashes the following commits: 01121ac [Doris Xin] reviewer comments 6bf27d8 [Doris Xin] Merge branch 'master' into randomRDD a8ea92d [Doris Xin] Reviewer comments 063ea0b [Doris Xin] Merge branch 'master' into randomRDD aec68eb [Doris Xin] newline bc90234 [Doris Xin] units passed. d56cacb [Doris Xin] impl with RandomRDD 92d6f1c [Doris Xin] solution for Cloneable df5bcff [Doris Xin] Merge branch 'generator' into randomRDD f46d928 [Doris Xin] WIP 49ed20d [Doris Xin] alternative poisson distribution generator 7cb0e40 [Doris Xin] fix for data inconsistency 8881444 [Doris Xin] RandomRDDGenerator: initial design
1 parent ecf30ee commit 81fcdd2

File tree

5 files changed

+940
-0
lines changed

5 files changed

+940
-0
lines changed
Lines changed: 101 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,101 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one or more
3+
* contributor license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright ownership.
5+
* The ASF licenses this file to You under the Apache License, Version 2.0
6+
* (the "License"); you may not use this file except in compliance with
7+
* the License. You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*/
17+
18+
package org.apache.spark.mllib.random
19+
20+
import cern.jet.random.Poisson
21+
import cern.jet.random.engine.DRand
22+
23+
import org.apache.spark.annotation.Experimental
24+
import org.apache.spark.util.random.{XORShiftRandom, Pseudorandom}
25+
26+
/**
 * :: Experimental ::
 * Contract for random number generators that produce independent, identically distributed
 * (i.i.d.) Double samples from a single underlying distribution.
 *
 * Seeding is inherited from Pseudorandom; the implementations in this file supply it by
 * overriding `setSeed(seed: Long)`.
 */
@Experimental
trait DistributionGenerator extends Pseudorandom with Serializable {

  /**
   * Draws the next sample.
   *
   * @return one i.i.d. value, as a Double, from the underlying distribution.
   */
  def nextValue(): Double

  /**
   * Creates another generator of the same kind that, where applicable, owns a new instance
   * of the rng object, so that copies can be used concurrently without locking.
   *
   * @return a copy of this DistributionGenerator.
   */
  def copy(): DistributionGenerator
}
44+
45+
/**
 * :: Experimental ::
 * Generates i.i.d. samples from U[0.0, 1.0]
 *
 * Every instance owns a private XORShiftRandom; samples are whatever its `nextDouble()`
 * returns.
 */
@Experimental
class UniformGenerator extends DistributionGenerator {

  // XORShiftRandom is used for better performance. Thread safety isn't necessary here:
  // each generator holds its own private instance, and copy() hands out a new one.
  private val random = new XORShiftRandom()

  /**
   * @return the next sample, taken from the underlying XORShiftRandom's `nextDouble()`.
   */
  override def nextValue(): Double = random.nextDouble()

  /**
   * Re-seeds the underlying XORShiftRandom.
   *
   * @param seed seed forwarded unchanged to the underlying rng.
   */
  override def setSeed(seed: Long): Unit = random.setSeed(seed)

  /**
   * @return a new UniformGenerator backed by its own, newly constructed XORShiftRandom.
   *         A seed previously applied through setSeed is not carried over to the copy.
   */
  override def copy(): UniformGenerator = new UniformGenerator()
}
63+
64+
/**
 * :: Experimental ::
 * Generates i.i.d. samples from the standard normal distribution.
 *
 * Every instance owns a private XORShiftRandom; samples are whatever its `nextGaussian()`
 * returns.
 */
@Experimental
class StandardNormalGenerator extends DistributionGenerator {

  // XORShiftRandom is used for better performance. Thread safety isn't necessary here:
  // each generator holds its own private instance, and copy() hands out a new one.
  private val random = new XORShiftRandom()

  /**
   * @return the next sample, taken from the underlying XORShiftRandom's `nextGaussian()`.
   */
  override def nextValue(): Double = random.nextGaussian()

  /**
   * Re-seeds the underlying XORShiftRandom.
   *
   * @param seed seed forwarded unchanged to the underlying rng.
   */
  override def setSeed(seed: Long): Unit = random.setSeed(seed)

  /**
   * @return a new StandardNormalGenerator backed by its own, newly constructed
   *         XORShiftRandom. A seed previously applied through setSeed is not carried over
   *         to the copy.
   */
  override def copy(): StandardNormalGenerator = new StandardNormalGenerator()
}
82+
83+
/**
 * :: Experimental ::
 * Generates i.i.d. samples from the Poisson distribution with the given mean.
 *
 * Sampling is delegated to a `cern.jet.random.Poisson` driven by a `DRand` engine.
 *
 * @param mean mean for the Poisson distribution.
 */
@Experimental
class PoissonGenerator(val mean: Double) extends DistributionGenerator {

  // Held in a var because setSeed replaces the whole sampler with one built on a freshly
  // seeded engine.
  // NOTE(review): `new DRand` with no arguments relies on the engine's own default seed --
  // confirm whether un-seeded PoissonGenerators are meant to produce the same stream.
  private var rng = new Poisson(mean, new DRand)

  /**
   * @return the next sample, as returned by the underlying sampler's `nextDouble()`.
   */
  override def nextValue(): Double = rng.nextDouble()

  /**
   * Re-seeds this generator by rebuilding the sampler around a new DRand engine.
   *
   * @param seed seed for the new engine. DRand is constructed from an Int here, so the seed
   *             is narrowed with `toInt` and only its low 32 bits take effect: two seeds
   *             that differ only in their upper 32 bits yield the same engine.
   */
  override def setSeed(seed: Long): Unit = {
    rng = new Poisson(mean, new DRand(seed.toInt))
  }

  /**
   * @return a new PoissonGenerator with the same mean and its own sampler and engine. A seed
   *         previously applied through setSeed is not carried over to the copy.
   */
  override def copy(): PoissonGenerator = new PoissonGenerator(mean)
}

0 commit comments

Comments
 (0)