@@ -1,5 +1,8 @@
 from collections import defaultdict
 from itertools import chain, ifilter, imap
+import operator
+
+import logging

 from pyspark.serializers import NoOpSerializer,\
     BatchedSerializer, CloudPickleSerializer, pack_long
@@ -24,6 +27,18 @@ def generatedRDDs(self):
         """
         pass

+    def count(self):
+        """
+
+        """
+        # TODO: make sure of the count implementation; this is different from what PySpark does
+        return self.mapPartitions(lambda i: [sum(1 for _ in i)]).sum().map(lambda x: x[1])
+
+    def sum(self):
+        """
+        """
+        return self.mapPartitions(lambda x: [sum(x)]).reduce(operator.add)
+
     def print_(self):
         """
         """
@@ -63,9 +78,9 @@ def reduce(self, func, numPartitions=None):
         """

         """
-        return self._combineByKey(lambda x: x, func, func, numPartitions)
+        return self.combineByKey(lambda x: x, func, func, numPartitions)

-    def _combineByKey(self, createCombiner, mergeValue, mergeCombiners,
+    def combineByKey(self, createCombiner, mergeValue, mergeCombiners,
                       numPartitions=None):
         """
         """
@@ -74,6 +89,12 @@ def _combineByKey(self, createCombiner, mergeValue, mergeCombiners,
         def combineLocally(iterator):
             combiners = {}
             for x in iterator:
+
+                # TODO: for the count operation, make sure of the count implementation;
+                # this is different from what PySpark does
+                if isinstance(x, int):
+                    x = ("", x)
+
                 (k, v) = x
                 if k not in combiners:
                     combiners[k] = createCombiner(v)
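For context, a minimal standalone sketch (not part of the patch; helper names are illustrative) of why the local-combine step wraps bare integers: `count()` and `sum()` emit one integer per partition, and `reduce()` routes those through `combineByKey`, so wrapping each integer as `("", n)` lets the plain totals flow through the same `(k, v)` merge path as ordinary key-value pairs and collapse under a single placeholder key.

```python
import operator

def combine_locally(iterator, createCombiner, mergeValue):
    # Hypothetical helper mirroring the combineLocally logic in the diff.
    combiners = {}
    for x in iterator:
        if isinstance(x, int):   # bare per-partition total from count()/sum()
            x = ("", x)          # wrap so the (k, v) unpacking below still works
        (k, v) = x
        if k not in combiners:
            combiners[k] = createCombiner(v)
        else:
            combiners[k] = mergeValue(combiners[k], v)
    return list(combiners.items())

# e.g. three partitions counted 3, 4, and 5 elements
print(combine_locally([3, 4, 5], lambda v: v, operator.add))  # [('', 12)]
```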