@@ -529,10 +529,127 @@ def test_deleting_input_files(self):
 
     def test_sampling_default_seed(self):
         # Test for SPARK-3995 (default seed setting)
-        data = self.sc.parallelize(range(1000), 1)
+        data = self.sc.parallelize(xrange(1000), 1)
         subset = data.takeSample(False, 10)
         self.assertEqual(len(subset), 10)
 
+    def test_aggregate_mutable_zero_value(self):
+        # Test for SPARK-9021; uses aggregate and treeAggregate to build dict
+        # representing a counter of ints
+        # NOTE: dict is used instead of collections.Counter for Python 2.6
+        # compatibility
+        from collections import defaultdict
+
+        # Show that single or multiple partitions work
+        data1 = self.sc.range(10, numSlices=1)
+        data2 = self.sc.range(10, numSlices=2)
+
+        def seqOp(x, y):
+            x[y] += 1
+            return x
+
+        def comboOp(x, y):
+            for key, val in y.items():
+                x[key] += val
+            return x
+
+        counts1 = data1.aggregate(defaultdict(int), seqOp, comboOp)
+        counts2 = data2.aggregate(defaultdict(int), seqOp, comboOp)
+        counts3 = data1.treeAggregate(defaultdict(int), seqOp, comboOp, 2)
+        counts4 = data2.treeAggregate(defaultdict(int), seqOp, comboOp, 2)
+
+        ground_truth = defaultdict(int, dict((i, 1) for i in range(10)))
+        self.assertEqual(counts1, ground_truth)
+        self.assertEqual(counts2, ground_truth)
+        self.assertEqual(counts3, ground_truth)
+        self.assertEqual(counts4, ground_truth)
+
+    def test_aggregate_by_key_mutable_zero_value(self):
+        # Test for SPARK-9021; uses aggregateByKey to make a pair RDD that
+        # contains lists of all values for each key in the original RDD
+
+        # list(range(...)) for Python 3.x compatibility (can't use * operator
+        # on a range object)
+        # list(zip(...)) for Python 3.x compatibility (want to parallelize a
+        # collection, not a zip object)
+        tuples = list(zip(list(range(10))*2, [1]*20))
+        # Show that single or multiple partitions work
+        data1 = self.sc.parallelize(tuples, 1)
+        data2 = self.sc.parallelize(tuples, 2)
+
+        def seqOp(x, y):
+            x.append(y)
+            return x
+
+        def comboOp(x, y):
+            x.extend(y)
+            return x
+
+        values1 = data1.aggregateByKey([], seqOp, comboOp).collect()
+        values2 = data2.aggregateByKey([], seqOp, comboOp).collect()
+        # Sort lists to ensure clean comparison with ground_truth
+        values1.sort()
+        values2.sort()
+
+        ground_truth = [(i, [1]*2) for i in range(10)]
+        self.assertEqual(values1, ground_truth)
+        self.assertEqual(values2, ground_truth)
+
+    def test_fold_mutable_zero_value(self):
+        # Test for SPARK-9021; uses fold to merge an RDD of dict counters into
+        # a single dict
+        # NOTE: dict is used instead of collections.Counter for Python 2.6
+        # compatibility
+        from collections import defaultdict
+
+        counts1 = defaultdict(int, dict((i, 1) for i in range(10)))
+        counts2 = defaultdict(int, dict((i, 1) for i in range(3, 8)))
+        counts3 = defaultdict(int, dict((i, 1) for i in range(4, 7)))
+        counts4 = defaultdict(int, dict((i, 1) for i in range(5, 6)))
+        all_counts = [counts1, counts2, counts3, counts4]
+        # Show that single or multiple partitions work
+        data1 = self.sc.parallelize(all_counts, 1)
+        data2 = self.sc.parallelize(all_counts, 2)
+
+        def comboOp(x, y):
+            for key, val in y.items():
+                x[key] += val
+            return x
+
+        fold1 = data1.fold(defaultdict(int), comboOp)
+        fold2 = data2.fold(defaultdict(int), comboOp)
+
+        ground_truth = defaultdict(int)
+        for counts in all_counts:
+            for key, val in counts.items():
+                ground_truth[key] += val
+        self.assertEqual(fold1, ground_truth)
+        self.assertEqual(fold2, ground_truth)
+
+    def test_fold_by_key_mutable_zero_value(self):
+        # Test for SPARK-9021; uses foldByKey to make a pair RDD that contains
+        # lists of all values for each key in the original RDD
+
+        tuples = [(i, range(i)) for i in range(10)]*2
+        # Show that single or multiple partitions work
+        data1 = self.sc.parallelize(tuples, 1)
+        data2 = self.sc.parallelize(tuples, 2)
+
+        def comboOp(x, y):
+            x.extend(y)
+            return x
+
+        values1 = data1.foldByKey([], comboOp).collect()
+        values2 = data2.foldByKey([], comboOp).collect()
+        # Sort lists to ensure clean comparison with ground_truth
+        values1.sort()
+        values2.sort()
+
+        # list(range(...)) for Python 3.x compatibility
+        ground_truth = [(i, list(range(i))*2) for i in range(10)]
+        self.assertEqual(values1, ground_truth)
+        self.assertEqual(values2, ground_truth)
+
     def test_aggregate_by_key(self):
         data = self.sc.parallelize([(1, 1), (1, 1), (3, 2), (5, 1), (5, 3)], 2)
 
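
All four tests above pin down the same contract (the subject of SPARK-9021): the zero value passed to aggregate, fold, and their tree/by-key variants may be a mutable object that seqOp/comboOp update in place, so the implementation has to hand every partition its own copy rather than a shared reference. A minimal sketch of that property over a toy list-of-partitions "RDD" (illustrative only, not Spark's code; toy_aggregate, seq_op, and comb_op are made-up names):

    import copy
    from collections import defaultdict

    def toy_aggregate(partitions, zero, seq_op, comb_op):
        # Each partition folds into its OWN deep copy of the zero value;
        # handing every partition the same mutable dict/list would let
        # seq_op's in-place updates corrupt other partitions' results.
        def run(part):
            acc = copy.deepcopy(zero)
            for item in part:
                acc = seq_op(acc, item)
            return acc
        # The final merge also starts from a copy, so the caller's zero
        # value comes back untouched.
        merged = copy.deepcopy(zero)
        for result in [run(p) for p in partitions]:
            merged = comb_op(merged, result)
        return merged

    def seq_op(x, y):
        x[y] += 1
        return x

    def comb_op(x, y):
        for key, val in y.items():
            x[key] += val
        return x

    # one or two partitions of range(10) produce the same counter
    expected = defaultdict(int, dict((i, 1) for i in range(10)))
    assert toy_aggregate([list(range(10))], defaultdict(int), seq_op, comb_op) == expected
    assert toy_aggregate([[0, 1, 2], [3, 4, 5, 6, 7, 8, 9]], defaultdict(int), seq_op, comb_op) == expected
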
@@ -624,8 +741,8 @@ def test_zip_with_different_serializers(self):
 
     def test_zip_with_different_object_sizes(self):
         # regress test for SPARK-5973
-        a = self.sc.parallelize(range(10000)).map(lambda i: '*' * i)
-        b = self.sc.parallelize(range(10000, 20000)).map(lambda i: '*' * i)
+        a = self.sc.parallelize(xrange(10000)).map(lambda i: '*' * i)
+        b = self.sc.parallelize(xrange(10000, 20000)).map(lambda i: '*' * i)
         self.assertEqual(10000, a.zip(b).count())
 
     def test_zip_with_different_number_of_items(self):
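
The range-to-xrange swaps here and in the hunks below keep the driver from materializing whole lists under Python 2: parallelize only needs an iterable, and xrange yields lazily. Python 3 has no xrange, so tests written this way rely on a compatibility alias assumed to sit near the top of tests.py, along these lines (a sketch, not a verbatim quote):

    import sys

    if sys.version_info[0] >= 3:
        # Python 3 dropped xrange, but its range is already lazy, so the
        # old name can simply point at it and the test bodies run unchanged.
        xrange = range
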
@@ -647,7 +764,7 @@ def test_zip_with_different_number_of_items(self):
         self.assertRaises(Exception, lambda: a.zip(b).count())
 
     def test_count_approx_distinct(self):
-        rdd = self.sc.parallelize(range(1000))
+        rdd = self.sc.parallelize(xrange(1000))
         self.assertTrue(950 < rdd.countApproxDistinct(0.03) < 1050)
         self.assertTrue(950 < rdd.map(float).countApproxDistinct(0.03) < 1050)
         self.assertTrue(950 < rdd.map(str).countApproxDistinct(0.03) < 1050)
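
countApproxDistinct returns a probabilistic estimate (it is HyperLogLog-based), so the three assertions above check a band rather than equality: with exactly 1000 distinct inputs and a requested relative accuracy of 0.03, the accepted 950-1050 window allows roughly +/-5%, giving the estimator headroom. A quick check of those bounds:

    # The asserted window is +/-5% around the true distinct count of 1000,
    # a little looser than the requested 3% relative error, so this
    # probabilistic test does not flake.
    true_count = 1000
    assert (true_count * 95 // 100, true_count * 105 // 100) == (950, 1050)
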
@@ -777,7 +894,7 @@ def test_distinct(self):
     def test_external_group_by_key(self):
         self.sc._conf.set("spark.python.worker.memory", "1m")
         N = 200001
-        kv = self.sc.parallelize(range(N)).map(lambda x: (x % 3, x))
+        kv = self.sc.parallelize(xrange(N)).map(lambda x: (x % 3, x))
         gkv = kv.groupByKey().cache()
         self.assertEqual(3, gkv.count())
         filtered = gkv.filter(lambda kv: kv[0] == 1)
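
The 1 MB cap on spark.python.worker.memory is the point of this test: grouping 200001 values under just three keys cannot fit in that budget, which is meant to push PySpark's grouping onto its external, disk-spilling merge path rather than an in-memory build; the assertions then check that the spilled groups come back complete.
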
@@ -871,7 +988,7 @@ def test_narrow_dependency_in_join(self):
 
     # Regression test for SPARK-6294
     def test_take_on_jrdd(self):
-        rdd = self.sc.parallelize(range(1 << 20)).map(lambda x: str(x))
+        rdd = self.sc.parallelize(xrange(1 << 20)).map(lambda x: str(x))
         rdd._jrdd.first()
 
     def test_sortByKey_uses_all_partitions_not_only_first_and_last(self):
@@ -1503,13 +1620,13 @@ def run():
             self.fail("daemon had been killed")
 
         # run a normal job
-        rdd = self.sc.parallelize(range(100), 1)
+        rdd = self.sc.parallelize(xrange(100), 1)
         self.assertEqual(100, rdd.map(str).count())
 
     def test_after_exception(self):
         def raise_exception(_):
             raise Exception()
-        rdd = self.sc.parallelize(range(100), 1)
+        rdd = self.sc.parallelize(xrange(100), 1)
         with QuietTest(self.sc):
             self.assertRaises(Exception, lambda: rdd.foreach(raise_exception))
         self.assertEqual(100, rdd.map(str).count())
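
Together with the hunk below, these worker tests guard one invariant: a failure inside a job, whether a Python exception raised by user code or an error surfaced from the JVM, must leave the reusable worker healthy enough that a plain count over the same 100-element RDD succeeds immediately afterwards.
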
@@ -1525,22 +1642,22 @@ def test_after_jvm_exception(self):
         with QuietTest(self.sc):
             self.assertRaises(Exception, lambda: filtered_data.count())
 
-        rdd = self.sc.parallelize(range(100), 1)
+        rdd = self.sc.parallelize(xrange(100), 1)
         self.assertEqual(100, rdd.map(str).count())
 
     def test_accumulator_when_reuse_worker(self):
         from pyspark.accumulators import INT_ACCUMULATOR_PARAM
         acc1 = self.sc.accumulator(0, INT_ACCUMULATOR_PARAM)
-        self.sc.parallelize(range(100), 20).foreach(lambda x: acc1.add(x))
+        self.sc.parallelize(xrange(100), 20).foreach(lambda x: acc1.add(x))
         self.assertEqual(sum(range(100)), acc1.value)
 
         acc2 = self.sc.accumulator(0, INT_ACCUMULATOR_PARAM)
-        self.sc.parallelize(range(100), 20).foreach(lambda x: acc2.add(x))
+        self.sc.parallelize(xrange(100), 20).foreach(lambda x: acc2.add(x))
         self.assertEqual(sum(range(100)), acc2.value)
         self.assertEqual(sum(range(100)), acc1.value)
 
     def test_reuse_worker_after_take(self):
-        rdd = self.sc.parallelize(range(100000), 1)
+        rdd = self.sc.parallelize(xrange(100000), 1)
         self.assertEqual(0, rdd.first())
 
         def count():