from collections import defaultdict
from itertools import chain, ifilter, imap
-import time
import operator

from pyspark.serializers import NoOpSerializer, \
    BatchedSerializer, CloudPickleSerializer, pack_long
from pyspark.rdd import _JavaStackTrace
+from pyspark.storagelevel import StorageLevel
+from pyspark.resultiterable import ResultIterable

from py4j.java_collections import ListConverter, MapConverter

@@ -35,6 +36,8 @@ def __init__(self, jdstream, ssc, jrdd_deserializer):
        self._ssc = ssc
        self.ctx = ssc._sc
        self._jrdd_deserializer = jrdd_deserializer
+        self.is_cached = False
+        self.is_checkpointed = False

    def context(self):
        """
@@ -234,8 +237,6 @@ def takeAndPrint(rdd, time):
            taken = rdd.take(11)
            print "-------------------------------------------"
            print "Time: %s" % (str(time))
-            print rdd.glom().collect()
-            print "-------------------------------------------"
            print "-------------------------------------------"
            for record in taken[:10]:
                print record
@@ -290,32 +291,65 @@ def get_output(rdd, time):

        self.foreachRDD(get_output)

-    def _test_switch_dserializer(self, serializer_que):
+    def cache(self):
+        """
+        Persist this DStream with the default storage level (C{MEMORY_ONLY_SER}).
+        """
+        self.is_cached = True
+        self.persist(StorageLevel.MEMORY_ONLY_SER)
+        return self
+
+    def persist(self, storageLevel):
+        """
+        Set this DStream's storage level to persist its values across operations
+        after the first time it is computed. This can only be used to assign
+        a new storage level if the DStream does not have a storage level set yet.
+        """
+        self.is_cached = True
+        javaStorageLevel = self.ctx._getJavaStorageLevel(storageLevel)
+        self._jdstream.persist(javaStorageLevel)
+        return self
+
+    def checkpoint(self, interval):
        """
-        Deserializer is dynamically changed based on numSlice and the number of
-        input. This function choose deserializer. Currently this is just FIFO.
+        Mark this DStream for checkpointing. It will be saved to a file inside the
+        checkpoint directory set with L{SparkContext.setCheckpointDir()}.
+
+        (Note: I am not sure whether the following also applies to a DStream:
+        all references to its parent RDDs will be removed. This function must
+        be called before any job has been executed on this RDD. It is strongly
+        recommended that this RDD is persisted in memory, otherwise saving it
+        to a file will require recomputation.)
+
+        interval must be a pyspark.streaming.duration value
        """
-
-        jrdd_deserializer = self._jrdd_deserializer
+        self.is_checkpointed = True
+        self._jdstream.checkpoint(interval)
+        return self
+
+    def groupByKey(self, numPartitions=None):
+        def createCombiner(x):
+            return [x]

-        def switch(rdd, jtime):
-            try:
-                print serializer_que
-                jrdd_deserializer = serializer_que.pop(0)
-                print jrdd_deserializer
-            except Exception as e:
-                print e
+        def mergeValue(xs, x):
+            xs.append(x)
+            return xs

-        self.foreachRDD(switch)
+        def mergeCombiners(a, b):
+            a.extend(b)
+            return a

+        return self.combineByKey(createCombiner, mergeValue, mergeCombiners,
+                                 numPartitions).mapValues(lambda x: ResultIterable(x))


# TODO: implement groupByKey
+# TODO: implement saveAsTextFile
+
+# The following operations depend on transform
# TODO: implement union
-# TODO: implement cache
-# TODO: implement persist
# TODO: implement repartition
-# TODO: implement saveAsTextFile
# TODO: implement cogroup
# TODO: implement join
# TODO: implement countByValue
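To make the groupByKey change above concrete, here is a small, Spark-free sketch of what combineByKey computes with those three functions: a per-partition fold followed by a cross-partition merge. The combine_partition/merge_partitions helpers and the sample data are hypothetical and only illustrate the semantics.

from collections import defaultdict

def createCombiner(x):
    return [x]

def mergeValue(xs, x):
    xs.append(x)
    return xs

def mergeCombiners(a, b):
    a.extend(b)
    return a

def combine_partition(pairs):
    # Map side: build one combiner (a list of values) per key within a partition.
    combined = {}
    for k, v in pairs:
        combined[k] = mergeValue(combined[k], v) if k in combined else createCombiner(v)
    return combined

def merge_partitions(partitions):
    # Reduce side: merge the per-partition combiners after the shuffle.
    merged = defaultdict(list)
    for part in partitions:
        for k, xs in part.items():
            merged[k] = mergeCombiners(merged[k], xs)
    return dict(merged)

parts = [combine_partition([("a", 1), ("b", 2), ("a", 3)]),
         combine_partition([("a", 4), ("b", 5)])]
print merge_partitions(parts)   # e.g. {'a': [1, 3, 4], 'b': [2, 5]}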
@@ -342,6 +376,7 @@ def pipeline_func(split, iterator):
        self._prev_jdstream = prev._prev_jdstream  # maintain the pipeline
        self._prev_jrdd_deserializer = prev._prev_jrdd_deserializer
        self.is_cached = False
+        self.is_checkpointed = False
        self._ssc = prev._ssc
        self.ctx = prev.ctx
        self.prev = prev
@@ -378,4 +413,4 @@ def _jdstream(self):
        return self._jdstream_val

    def _is_pipelinable(self):
-        return not self.is_cached
+        return not (self.is_cached or self.is_checkpointed)
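A rough usage sketch of the methods added in this patch, assuming a DStream of (key, value) pairs named `pairs` already exists; the stream itself, the `interval` value, and the `grouped`/`counts` names are hypothetical, and only cache, persist, checkpoint, and groupByKey come from this diff. Note that, per the _is_pipelinable change above, a cached or checkpointed DStream is no longer pipelined with downstream operations.

from pyspark.storagelevel import StorageLevel

# `pairs` is assumed to be an existing DStream of (key, value) tuples.
pairs.cache()                                    # same as persist(StorageLevel.MEMORY_ONLY_SER)

# Or pick a storage level explicitly (it can only be assigned once):
# pairs.persist(StorageLevel.MEMORY_AND_DISK)

# Checkpoint the stream; `interval` should be a pyspark.streaming.duration value,
# and the checkpoint directory must have been set beforehand.
# pairs.checkpoint(interval)

# groupByKey yields, per batch, a DStream of (key, ResultIterable of values) pairs.
grouped = pairs.groupByKey()
counts = grouped.mapValues(lambda values: len(list(values)))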