 from collections import defaultdict
 from itertools import chain, ifilter, imap
-import time
 import operator

 from pyspark.serializers import NoOpSerializer,\
     BatchedSerializer, CloudPickleSerializer, pack_long
 from pyspark.rdd import _JavaStackTrace
+from pyspark.storagelevel import StorageLevel
+from pyspark.resultiterable import ResultIterable

 from py4j.java_collections import ListConverter, MapConverter

@@ -35,6 +36,8 @@ def __init__(self, jdstream, ssc, jrdd_deserializer):
         self._ssc = ssc
         self.ctx = ssc._sc
         self._jrdd_deserializer = jrdd_deserializer
+        self.is_cached = False
+        self.is_checkpointed = False

     def context(self):
         """
@@ -247,8 +250,6 @@ def takeAndPrint(rdd, time):
             taken = rdd.take(11)
             print "-------------------------------------------"
             print "Time: %s" % (str(time))
-            print rdd.glom().collect()
-            print "-------------------------------------------"
             print "-------------------------------------------"
             for record in taken[:10]:
                 print record
@@ -303,32 +304,65 @@ def get_output(rdd, time):

         self.foreachRDD(get_output)

-    def _test_switch_dserializer(self, serializer_que):
+    def cache(self):
+        """
+        Persist this DStream with the default storage level (C{MEMORY_ONLY_SER}).
+        """
+        self.is_cached = True
+        self.persist(StorageLevel.MEMORY_ONLY_SER)
+        return self
+
+    def persist(self, storageLevel):
+        """
+        Set this DStream's storage level to persist its values across operations
+        after the first time it is computed. This can only be used to assign
+        a new storage level if the DStream does not have a storage level set yet.
+        """
+        self.is_cached = True
+        javaStorageLevel = self.ctx._getJavaStorageLevel(storageLevel)
+        self._jdstream.persist(javaStorageLevel)
+        return self
+
+    def checkpoint(self, interval):
         """
-        Deserializer is dynamically changed based on numSlice and the number of
-        input. This function choose deserializer. Currently this is just FIFO.
+        Mark this DStream for checkpointing. It will be saved to a file inside the
+        checkpoint directory set with L{SparkContext.setCheckpointDir()}.
+
+        It is not yet clear whether the following (taken from RDD.checkpoint())
+        also applies to a DStream:
+        all references to its parent RDDs will be removed. This function must
+        be called before any job has been executed on this RDD. It is strongly
+        recommended that this RDD is persisted in memory, otherwise saving it
+        on a file will require recomputation.
+
+        interval must be a pyspark.streaming.duration
         """
-
-        jrdd_deserializer = self._jrdd_deserializer
+        self.is_checkpointed = True
+        self._jdstream.checkpoint(interval)
+        return self
+
+    def groupByKey(self, numPartitions=None):
+        def createCombiner(x):
+            return [x]

-        def switch(rdd, jtime):
-            try:
-                print serializer_que
-                jrdd_deserializer = serializer_que.pop(0)
-                print jrdd_deserializer
-            except Exception as e:
-                print e
+        def mergeValue(xs, x):
+            xs.append(x)
+            return xs

-        self.foreachRDD(switch)
+        def mergeCombiners(a, b):
+            a.extend(b)
+            return a

+        return self.combineByKey(createCombiner, mergeValue, mergeCombiners,
+                                 numPartitions).mapValues(lambda x: ResultIterable(x))


 # TODO: implement groupByKey
+# TODO: implement saveAsTextFile
+
+# The following operations depend on transform
 # TODO: implement union
-# TODO: implement cache
-# TODO: implement persist
 # TODO: implement repartition
-# TODO: implement saveAsTextFile
 # TODO: implement cogroup
 # TODO: implement join
 # TODO: implement countByValue
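
The groupByKey added in this hunk follows the standard combineByKey combiner protocol: createCombiner starts a list for the first value of a key, mergeValue appends further values within a partition, and mergeCombiners concatenates the per-partition lists. Below is a minimal, Spark-free sketch of that protocol on plain Python pairs; the helper group_partition and the variables p1/p2 are illustrative only and are not part of this patch.

def create_combiner(x):
    return [x]                     # first value for a key starts a new list

def merge_value(xs, x):
    xs.append(x)                   # fold one more value into the per-key list
    return xs

def merge_combiners(a, b):
    a.extend(b)                    # concatenate lists built on different partitions
    return a

def group_partition(pairs):
    # Hypothetical local stand-in: group one "partition" with the same
    # createCombiner/mergeValue functions combineByKey would use.
    out = {}
    for k, v in pairs:
        out[k] = merge_value(out[k], v) if k in out else create_combiner(v)
    return out

# Two partitions of (key, value) pairs, merged with mergeCombiners at the end,
# mirroring how combineByKey-based grouping behaves across partitions.
p1 = group_partition([("a", 1), ("b", 2)])
p2 = group_partition([("a", 3)])
merged = dict(p1)
for k, v in p2.items():
    merged[k] = merge_combiners(merged[k], v) if k in merged else v
print(merged)    # {'a': [1, 3], 'b': [2]}
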
@@ -355,6 +389,7 @@ def pipeline_func(split, iterator):
         self._prev_jdstream = prev._prev_jdstream  # maintain the pipeline
         self._prev_jrdd_deserializer = prev._prev_jrdd_deserializer
         self.is_cached = False
+        self.is_checkpointed = False
         self._ssc = prev._ssc
         self.ctx = prev.ctx
         self.prev = prev
@@ -391,4 +426,4 @@ def _jdstream(self):
         return self._jdstream_val

     def _is_pipelinable(self):
-        return not self.is_cached
+        return not (self.is_cached or self.is_checkpointed)
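
For reference, a minimal usage sketch of the new DStream methods. It assumes the eventual PySpark Streaming entry points (StreamingContext, socketTextStream, pprint, awaitTermination), which are not part of this diff, so treat it as an illustration rather than a verified example against this work-in-progress API.

from pyspark import SparkContext, StorageLevel
from pyspark.streaming import StreamingContext

sc = SparkContext(appName="dstream-persist-demo")    # app name is illustrative
ssc = StreamingContext(sc, 1)                        # 1 second batches (assumed signature)

# The (host, port) source and the word-count shape are assumptions for the example.
pairs = ssc.socketTextStream("localhost", 9999) \
           .flatMap(lambda line: line.split(" ")) \
           .map(lambda word: (word, 1))

pairs.cache()                                   # same as persist(StorageLevel.MEMORY_ONLY_SER)
# pairs.persist(StorageLevel.MEMORY_AND_DISK)   # or pick an explicit storage level
# pairs.checkpoint(interval)                    # interval: a pyspark.streaming.duration value in this patch

grouped = pairs.groupByKey()                    # values come back wrapped in ResultIterable
grouped.pprint()

ssc.start()
ssc.awaitTermination()
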