@@ -22,7 +22,9 @@ import scala.reflect.ClassTag
22
22
23
23
import org .scalatest .FunSuite
24
24
25
+ import org .apache .commons .math3 .distribution .BinomialDistribution
25
26
import org .apache .commons .math3 .distribution .PoissonDistribution
27
+
26
28
import org .apache .spark ._
27
29
import org .apache .spark .SparkContext ._
28
30
import org .apache .spark .rdd ._
@@ -496,29 +498,25 @@ class RDDSuite extends FunSuite with SharedSparkContext {
496
498
}
497
499
498
500
test(" computeFraction" ) {
499
- // test that the computed fraction guarantees enough datapoints in the sample with a failure rate <= 0.0001
501
+ // test that the computed fraction guarantees enough datapoints
502
+ // in the sample with a failure rate <= 0.0001
500
503
val data = new EmptyRDD [Int ](sc)
501
504
val n = 100000
502
505
503
506
for (s <- 1 to 15 ) {
504
507
val frac = data.computeFraction(s, n, true )
505
- val qpois = new PoissonDistribution (frac * n)
506
- assert(qpois .inverseCumulativeProbability(0.0001 ) >= s, " Computed fraction is too low" )
508
+ val poisson = new PoissonDistribution (frac * n)
509
+ assert(poisson .inverseCumulativeProbability(0.0001 ) >= s, " Computed fraction is too low" )
507
510
}
508
- for (s <- 1 to 15 ) {
509
- val frac = data.computeFraction(s, n, false )
510
- val qpois = new PoissonDistribution (frac * n)
511
- assert(qpois.inverseCumulativeProbability(0.0001 ) >= s, " Computed fraction is too low" )
512
- }
513
- for (s <- List (1 , 10 , 100 , 1000 )) {
511
+ for (s <- List (20 , 100 , 1000 )) {
514
512
val frac = data.computeFraction(s, n, true )
515
- val qpois = new PoissonDistribution (frac * n)
516
- assert(qpois .inverseCumulativeProbability(0.0001 ) >= s, " Computed fraction is too low" )
513
+ val poisson = new PoissonDistribution (frac * n)
514
+ assert(poisson .inverseCumulativeProbability(0.0001 ) >= s, " Computed fraction is too low" )
517
515
}
518
516
for (s <- List (1 , 10 , 100 , 1000 )) {
519
517
val frac = data.computeFraction(s, n, false )
520
- val qpois = new PoissonDistribution (frac * n )
521
- assert(qpois .inverseCumulativeProbability(0.0001 ) >= s, " Computed fraction is too low" )
518
+ val binomial = new BinomialDistribution (n, frac )
519
+ assert(binomial .inverseCumulativeProbability(0.0001 )* n >= s, " Computed fraction is too low" )
522
520
}
523
521
}
524
522
@@ -530,37 +528,37 @@ class RDDSuite extends FunSuite with SharedSparkContext {
530
528
val sample = data.takeSample(withReplacement= false , num= num)
531
529
assert(sample.size === num) // Got exactly num elements
532
530
assert(sample.toSet.size === num) // Elements are distinct
533
- assert(sample.forall(x => 1 <= x && x <= n), " elements not in [1, 100 ]" )
531
+ assert(sample.forall(x => 1 <= x && x <= n), s " elements not in [1, $n ] " )
534
532
}
535
533
for (seed <- 1 to 5 ) {
536
534
val sample = data.takeSample(withReplacement= false , 20 , seed)
537
535
assert(sample.size === 20 ) // Got exactly 20 elements
538
536
assert(sample.toSet.size === 20 ) // Elements are distinct
539
- assert(sample.forall(x => 1 <= x && x <= n), " elements not in [1, 100 ]" )
537
+ assert(sample.forall(x => 1 <= x && x <= n), s " elements not in [1, $n ] " )
540
538
}
541
539
for (seed <- 1 to 5 ) {
542
540
val sample = data.takeSample(withReplacement= false , 100 , seed)
543
541
assert(sample.size === 100 ) // Got only 100 elements
544
542
assert(sample.toSet.size === 100 ) // Elements are distinct
545
- assert(sample.forall(x => 1 <= x && x <= n), " elements not in [1, 100 ]" )
543
+ assert(sample.forall(x => 1 <= x && x <= n), s " elements not in [1, $n ] " )
546
544
}
547
545
for (seed <- 1 to 5 ) {
548
546
val sample = data.takeSample(withReplacement= true , 20 , seed)
549
547
assert(sample.size === 20 ) // Got exactly 20 elements
550
- assert(sample.forall(x => 1 <= x && x <= n), " elements not in [1, 100 ]" )
548
+ assert(sample.forall(x => 1 <= x && x <= n), s " elements not in [1, $n ] " )
551
549
}
552
550
{
553
551
val sample = data.takeSample(withReplacement= true , num= 20 )
554
552
assert(sample.size === 20 ) // Got exactly 20 elements
555
553
assert(sample.toSet.size <= 20 , " sampling with replacement returned all distinct elements" )
556
- assert(sample.forall(x => 1 <= x && x <= n), " elements not in [1, 100 ]" )
554
+ assert(sample.forall(x => 1 <= x && x <= n), s " elements not in [1, $n ] " )
557
555
}
558
556
{
559
557
val sample = data.takeSample(withReplacement= true , num= n)
560
558
assert(sample.size === n) // Got exactly n elements
561
559
// Chance of getting all distinct elements is astronomically low, so test we got < n
562
560
assert(sample.toSet.size < n, " sampling with replacement returned all distinct elements" )
563
- assert(sample.forall(x => 1 <= x && x <= n), " elements not in [1, 100 ]" )
561
+ assert(sample.forall(x => 1 <= x && x <= n), s " elements not in [1, $n ] " )
564
562
}
565
563
for (seed <- 1 to 5 ) {
566
564
val sample = data.takeSample(withReplacement= true , n, seed)
0 commit comments