@@ -35,25 +35,31 @@ def __init__(self, jdstream, ssc, jrdd_deserializer):
         self.ctx = ssc._sc
         self._jrdd_deserializer = jrdd_deserializer
 
+    def context(self):
+        """
+        Return the StreamingContext associated with this DStream
+        """
+        return self._ssc
+
     def count(self):
         """
         Return a new DStream which contains the number of elements in this DStream.
         """
-        return self._mapPartitions(lambda i: [sum(1 for _ in i)])._sum()
+        return self.mapPartitions(lambda i: [sum(1 for _ in i)])._sum()
 
     def _sum(self):
         """
         Add up the elements in this DStream.
         """
-        return self._mapPartitions(lambda x: [sum(x)]).reduce(operator.add)
+        return self.mapPartitions(lambda x: [sum(x)]).reduce(operator.add)
 
     def print_(self, label=None):
         """
         Since print is reserved name for python, we cannot define a "print" method function.
         This function prints serialized data in RDD in DStream because Scala and Java cannot
-        deserialized pickled python object. Please use DStream.pyprint() instead to print results.
+        deserialize pickled Python objects. Please use DStream.pyprint() to print results.
 
-        Call DStream.print().
+        Call DStream.print() and this function will print the byte arrays in the DStream.
         """
         # a hack to call print function in DStream
         getattr(self._jdstream, "print")(label)
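The `count()`/`_sum()` pattern above composes a per-partition aggregation with a global reduce. A minimal sketch of the same logic on plain Python 2 iterables (the `partitions` list here is a hypothetical stand-in for a DStream's partitions, not part of this patch):

```python
from operator import add

# Hypothetical stand-in for a DStream: a list of partitions.
partitions = [[1, 2, 3], [4, 5]]

# count(): emit one element count per partition, then add the counts up.
per_partition_counts = [sum(1 for _ in iter(p)) for p in partitions]
print(sum(per_partition_counts))  # 5

# _sum(): emit one sum per partition, then reduce with operator.add.
per_partition_sums = [sum(p) for p in partitions]
print(reduce(add, per_partition_sums))  # 15 (reduce is a builtin in Python 2)
```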
@@ -63,29 +69,32 @@ def filter(self, f):
         Return a new DStream containing only the elements that satisfy predicate.
         """
         def func(iterator): return ifilter(f, iterator)
-        return self._mapPartitions(func)
+        return self.mapPartitions(func)
 
     def flatMap(self, f, preservesPartitioning=False):
         """
         Pass each value in the key-value pair DStream through flatMap function
         without changing the keys: this also retains the original RDD's partition.
         """
-        def func(s, iterator): return chain.from_iterable(imap(f, iterator))
+        def func(s, iterator):
+            return chain.from_iterable(imap(f, iterator))
         return self._mapPartitionsWithIndex(func, preservesPartitioning)
 
-    def map(self, f):
+    def map(self, f, preservesPartitioning=False):
         """
         Return a new DStream by applying a function to each element of DStream.
         """
-        def func(iterator): return imap(f, iterator)
-        return self._mapPartitions(func)
+        def func(iterator):
+            return imap(f, iterator)
+        return self.mapPartitions(func, preservesPartitioning)
 
-    def _mapPartitions(self, f):
+    def mapPartitions(self, f, preservesPartitioning=False):
         """
         Return a new DStream by applying a function to each partition of this DStream.
         """
-        def func(s, iterator): return f(iterator)
-        return self._mapPartitionsWithIndex(func)
+        def func(s, iterator):
+            return f(iterator)
+        return self._mapPartitionsWithIndex(func, preservesPartitioning)
 
     def _mapPartitionsWithIndex(self, f, preservesPartitioning=False):
         """
@@ -131,7 +140,7 @@ def combineLocally(iterator):
                 else:
                     combiners[k] = mergeValue(combiners[k], v)
             return combiners.iteritems()
-        locally_combined = self._mapPartitions(combineLocally)
+        locally_combined = self.mapPartitions(combineLocally)
         shuffled = locally_combined.partitionBy(numPartitions)
 
         def _mergeCombiners(iterator):
@@ -143,7 +152,7 @@ def _mergeCombiners(iterator):
                 combiners[k] = mergeCombiners(combiners[k], v)
             return combiners.iteritems()
 
-        return shuffled._mapPartitions(_mergeCombiners)
+        return shuffled.mapPartitions(_mergeCombiners)
 
     def partitionBy(self, numPartitions, partitionFunc=None):
         """
@@ -246,6 +255,34 @@ def takeAndPrint(rdd, time):
 
         self.foreachRDD(takeAndPrint)
 
+    def mapValues(self, f):
+        """
+        Pass each value in the key-value pair RDD through a map function
+        without changing the keys; this also retains the original RDD's
+        partitioning.
+        """
+        map_values_fn = lambda (k, v): (k, f(v))
+        return self.map(map_values_fn, preservesPartitioning=True)
+
+    def flatMapValues(self, f):
+        """
+        Pass each value in the key-value pair RDD through a flatMap function
+        without changing the keys; this also retains the original RDD's
+        partitioning.
+        """
+        flat_map_fn = lambda (k, v): ((k, x) for x in f(v))
+        return self.flatMap(flat_map_fn, preservesPartitioning=True)
+
+    def glom(self):
+        """
+        Return a new DStream in which each RDD is generated by applying glom() to each RDD of
+        this DStream. Applying glom() to an RDD coalesces all elements within each partition into
+        a list.
+        """
+        def func(iterator):
+            yield list(iterator)
+        return self.mapPartitions(func)
+
     #def transform(self, func): - TD
     #    from utils import RDDFunction
     #    wrapped_func = RDDFunction(self.ctx, self._jrdd_deserializer, func)
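The three new methods mirror their RDD counterparts; note that the `lambda (k, v):` tuple-parameter syntax in the hunk is Python 2 only. Their effect, approximated on ordinary Python data (plain lists standing in for RDD partitions):

```python
pairs = [("a", [1, 2]), ("b", [3])]

# mapValues: transform each value, keep its key (and, on a real DStream,
# the partitioning, via preservesPartitioning=True).
print([(k, len(v)) for (k, v) in pairs])        # [('a', 2), ('b', 1)]

# flatMapValues: one output pair per element produced from each value.
print([(k, x) for (k, v) in pairs for x in v])  # [('a', 1), ('a', 2), ('b', 3)]

# glom: collapse each partition into a single list, as the nested func() does.
def glom_partition(iterator):
    yield list(iterator)

partitions = [[1, 2], [3, 4, 5]]
print([next(glom_partition(iter(p))) for p in partitions])  # [[1, 2], [3, 4, 5]]
```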
@@ -255,7 +292,7 @@ def takeAndPrint(rdd, time):
     def _test_output(self, result):
         """
         This function is only for test case.
-        Store data in a DStream to result to verify the result in tese case
+        Store data from a DStream into result to verify the results in a test case
         """
         def get_output(rdd, time):
             taken = rdd.collect()
@@ -318,4 +355,4 @@ def _jdstream(self):
         return self._jdstream_val
 
     def _is_pipelinable(self):
-        return not (self.is_cached)
+        return not self.is_cached