@@ -207,7 +207,7 @@ def _defaultReducePartitions(self):
         """
         Returns the default number of partitions to use during reduce tasks (e.g., groupBy).
         If spark.default.parallelism is set, then we'll use the value from SparkContext
-        defaultParallelism, otherwise we'll use the number of partitions in this RDD.
+        defaultParallelism, otherwise we'll use the number of partitions in this RDD
 
         This mirrors the behavior of the Scala Partitioner#defaultPartitioner, intended to reduce
         the likelihood of OOMs. Once PySpark adopts Partitioner-based APIs, this behavior will
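
The method body is outside this hunk, but the behaviour the docstring describes mirrors PySpark's RDD._defaultReducePartitions. A minimal sketch of that logic, assuming the DStream keeps its SparkContext in self.ctx (an assumption about this work-in-progress port):

    def _defaultReducePartitions(self):
        # Prefer the configured default parallelism; otherwise fall back to
        # the partition count of this RDD/DStream.
        if self.ctx._conf.contains("spark.default.parallelism"):
            return self.ctx.defaultParallelism
        else:
            return self.getNumPartitions()
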
@@ -222,7 +222,8 @@ def getNumPartitions(self):
         """
         Return the number of partitions in RDD
         """
-        # TODO: remove hardcoding. RDD has NumPartitions but DStream does not have.
+        # TODO: remove hardcoding. RDD has NumPartitions. How do we get the number of partitions
+        # through a DStream?
         return 2
 
     def foreachRDD(self, func):
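
While getNumPartitions() is still hard-coded to 2, the per-batch RDDs handed to foreachRDD do know their real partition count, so it can at least be observed there. A sketch, assuming foreachRDD passes its callback the same (rdd, time) pair that the closures in this file receive; stream is a hypothetical DStream:

    def print_partitions(rdd, time):
        # Each batch RDD reports its actual partition count, even though
        # the DStream-level value is stubbed for now.
        print "Batch at %s: %d partitions" % (str(time), rdd.getNumPartitions())

    stream.foreachRDD(print_partitions)
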
@@ -243,6 +244,10 @@ def pyprint(self):
         operator, so this DStream will be registered as an output stream and there materialized.
         """
         def takeAndPrint(rdd, time):
+            """
+            Closure that takes elements from the RDD and prints the first 10 of them.
+            This closure is called by the py4j callback server.
+            """
             taken = rdd.take(11)
             print "-------------------------------------------"
             print "Time: %s" % (str(time))
@@ -307,17 +312,11 @@ def checkpoint(self, interval):
         Mark this DStream for checkpointing. It will be saved to a file inside the
         checkpoint directory set with L{SparkContext.setCheckpointDir()}
 
-        I am not sure this part in DStream
-        and
-        all references to its parent RDDs will be removed. This function must
-        be called before any job has been executed on this RDD. It is strongly
-        recommended that this RDD is persisted in memory, otherwise saving it
-        on a file will require recomputation.
-
-        interval must be pysprak.streaming.duration
+        @param interval: Time interval after which generated RDD will be checkpointed
+               interval has to be pyspark.streaming.duration.Duration
         """
         self.is_checkpointed = True
-        self._jdstream.checkpoint(interval)
+        self._jdstream.checkpoint(interval._jduration)
         return self
 
     def groupByKey(self, numPartitions=None):
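
A hedged usage sketch for the parameter contract spelled out above; counts stands for any DStream, and constructing a Duration from milliseconds is an assumption about this work-in-progress API:

    from pyspark.streaming.duration import Duration

    # Checkpoint the generated RDDs roughly every 10 seconds.
    counts.checkpoint(Duration(10000))

The wrapper now passes interval._jduration across the py4j boundary, so the JVM side receives the underlying Java Duration rather than the Python wrapper object.
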
@@ -369,6 +368,10 @@ def saveAsTextFiles(self, prefix, suffix=None):
         Save this DStream as a text file, using string representations of elements.
         """
         def saveAsTextFile(rdd, time):
+            """
+            Closure that saves the elements of each RDD in this DStream as a text file.
+            This closure is called by the py4j callback server.
+            """
             path = rddToFileName(prefix, suffix, time)
             rdd.saveAsTextFile(path)
 
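
rddToFileName is used here but defined elsewhere; a guess at its behaviour, based on the prefix/suffix naming Spark Streaming uses for file output (the exact pattern is an assumption):

    def rddToFileName(prefix, suffix, time):
        # Build a per-batch path such as "/tmp/wc-1400000000000.txt".
        if suffix is None:
            return prefix + "-" + str(time)
        return prefix + "-" + str(time) + "." + suffix
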
@@ -410,9 +413,10 @@ def get_output(rdd, time):
     # TODO: implement countByWindow
     # TODO: implement reduceByWindow
 
-    # Following operation has dependency to transform
+    # transform operations
     # TODO: implement transform
     # TODO: implement transformWith
+    # The following operations depend on transform
     # TODO: implement union
     # TODO: implement repertitions
     # TODO: implement cogroup
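
Illustrative only, since none of these methods exist yet in this port: the comment above hints that union (and the operations after it) could be expressed through transform/transformWith once those land, roughly like this hypothetical method:

    def union(self, other):
        # Merge two DStreams by unioning their per-batch RDDs.
        return self.transformWith(lambda a, b: a.union(b), other)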