@@ -20,9 +20,7 @@ def __init__(self, jdstream, ssc, jrdd_deserializer):
 
     def count(self):
         """
-
         """
-        # TODO: make sure count implementation, this different from what pyspark does
         return self._mapPartitions(lambda i: [sum(1 for _ in i)])._sum()
 
     def _sum(self):
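For reference, the count() kept above follows a count-per-partition-then-sum pattern: each partition is reduced to a single local count, and the partial counts are summed. A minimal pure-Python sketch of the same idea (the partition contents are made up for illustration):

    # Hypothetical partitioned data, standing in for an RDD's partitions.
    partitions = [[1, 2, 3], [4, 5], []]
    # One local count per partition, mirroring
    # mapPartitions(lambda i: [sum(1 for _ in i)]).
    per_partition = [sum(1 for _ in part) for part in partitions]
    # Summing the partial counts gives the total number of elements (5 here).
    total = sum(per_partition)
    print(total)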
@@ -79,7 +77,6 @@ def _mapPartitionsWithIndex(self, f, preservesPartitioning=False):
 
     def reduce(self, func):
         """
-
         """
         return self.map(lambda x: (None, x)).reduceByKey(func, 1).map(lambda x: x[1])
 
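reduce() above piggybacks on reduceByKey: every element is tagged with the same key (None), folded down to a single pair on one partition, and the key is then stripped off. A pure-Python sketch of that trick, assuming addition as the user-supplied reduce function and a made-up batch:

    from operator import add

    data = [1, 2, 3, 4]                      # hypothetical batch contents
    func = add                               # stands in for the user-supplied function
    keyed = [(None, x) for x in data]        # map(lambda x: (None, x))
    merged = {}
    for k, v in keyed:                       # reduceByKey(func, 1) on one partition
        merged[k] = v if k not in merged else func(merged[k], v)
    result = [v for _, v in merged.items()]  # map(lambda x: x[1])
    print(result)                            # [10]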
@@ -107,12 +104,6 @@ def combineByKey(self, createCombiner, mergeValue, mergeCombiners,
         def combineLocally(iterator):
             combiners = {}
             for x in iterator:
-
-                #TODO for count operation make sure count implementation
-                # This is different from what pyspark does
-                #if isinstance(x, int):
-                #    x = ("", x)
-
                 (k, v) = x
                 if k not in combiners:
                     combiners[k] = createCombiner(v)
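The surviving combineLocally loop is the local-combine step of combineByKey: the first value seen for a key is turned into a combiner with createCombiner, and later values for the same key are folded in with mergeValue. A standalone word-count-style sketch (the sample pairs and combiner functions are illustrative, not taken from this diff):

    def createCombiner(v):
        return v                      # first value for a key becomes its combiner

    def mergeValue(c, v):
        return c + v                  # fold later values into the existing combiner

    pairs = [("a", 1), ("b", 1), ("a", 1)]   # made-up (key, value) records
    combiners = {}
    for k, v in pairs:
        if k not in combiners:
            combiners[k] = createCombiner(v)
        else:
            combiners[k] = mergeValue(combiners[k], v)
    print(combiners)                  # {'a': 2, 'b': 1}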
@@ -142,6 +133,7 @@ def partitionBy(self, numPartitions, partitionFunc=None):
 
         if partitionFunc is None:
             partitionFunc = lambda x: 0 if x is None else hash(x)
+
         # Transferring O(n) objects to Java is too expensive. Instead, we'll
         # form the hash buckets in Python, transferring O(numPartitions) objects
         # to Java. Each object is a (splitNumber, [objects]) pair.
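The comment above describes the bucketing strategy: group elements into numPartitions hash buckets on the Python side, so that only (splitNumber, [objects]) pairs cross the Python-to-Java boundary. A rough sketch of that step with illustrative data:

    numPartitions = 3
    partitionFunc = lambda x: 0 if x is None else hash(x)

    items = ["a", "b", None, "c", "d"]        # made-up partition contents
    buckets = {}
    for x in items:
        split = partitionFunc(x) % numPartitions
        buckets.setdefault(split, []).append(x)

    # O(numPartitions) (splitNumber, [objects]) pairs go to Java,
    # instead of O(n) individual objects.
    outgoing = list(buckets.items())
    print(outgoing)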
@@ -228,7 +220,6 @@ def takeAndPrint(rdd, time):
 
         self.foreachRDD(takeAndPrint)
 
-
     #def transform(self, func):
    #    from utils import RDDFunction
    #    wrapped_func = RDDFunction(self.ctx, self._jrdd_deserializer, func)
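For context, the takeAndPrint callback named in the hunk header is registered with foreachRDD and is then invoked once per batch with that batch's RDD and timestamp. The sketch below shows only this registration pattern; FakeDStream, FakeRDD, and the take(11) limit are stand-ins invented so the sketch runs without Spark, not details taken from this diff.

    class FakeRDD:
        # Minimal stand-in for an RDD: just enough to support take(n).
        def __init__(self, data):
            self._data = data

        def take(self, n):
            return self._data[:n]

    class FakeDStream:
        # Minimal stand-in for a DStream: foreachRDD registers a callback
        # that a driver loop would invoke once per batch.
        def __init__(self):
            self._callbacks = []

        def foreachRDD(self, func):
            self._callbacks.append(func)

        def _deliver_batch(self, rdd, time):
            for func in self._callbacks:
                func(rdd, time)

    def takeAndPrint(rdd, time):
        taken = rdd.take(11)              # assumed limit, for illustration only
        print("Time: %s" % time)
        for record in taken[:10]:
            print(record)
        if len(taken) > 10:
            print("...")

    stream = FakeDStream()
    stream.foreachRDD(takeAndPrint)
    stream._deliver_batch(FakeRDD(list(range(15))), "2014-07-20 12:00:00")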