@@ -620,37 +620,36 @@ abstract class DStream[T: ClassTag] (
     new ForEachDStream(this, context.sparkContext.clean(foreachFunc)).register()
   }
 
-  // TODO move pyprint to PythonDStream
+  // TODO: move pyprint to PythonDStream and execute it via a py4j callback function
   /**
    * Print the first ten elements of each PythonRDD generated in this PythonDStream. This is an output
    * operator, so this PythonDStream will be registered as an output stream and there materialized.
    * Since serialized Python object is readable by Python, pyprint writes out binary data to
    * temporary file and run python script to deserialized and print the first ten elements
+   *
+   * Currently this calls the python script directly; we should avoid this.
   */
  private[streaming] def pyprint() {
    def foreachFunc = (rdd: RDD[T], time: Time) => {
      val iter = rdd.take(11).iterator
 
-      // make a temporary file
+      // Generate a temporary file
      val prefix = "spark"
      val suffix = ".tmp"
      val tempFile = File.createTempFile(prefix, suffix)
      val tempFileStream = new DataOutputStream(new FileOutputStream(tempFile.getAbsolutePath))
-      // write out serialized python object
+      // Write out the serialized python objects to the temporary file
      PythonRDD.writeIteratorToStream(iter, tempFileStream)
      tempFileStream.close()
 
-      // This value has to be passed from python
-      // Python currently does not do cluster deployment. But what happened
+      // pythonExec should be passed in from python; move pyprint to PythonDStream
      val pythonExec = new ProcessBuilder().environment().get("PYSPARK_PYTHON")
      val sparkHome = new ProcessBuilder().environment().get("SPARK_HOME")
-      // val pb = new ProcessBuilder(Seq(pythonExec, sparkHome + "/python/pyspark/streaming/pyprint.py", tempFile.getAbsolutePath())) // why this fails to compile???
-      // absolute path to the python script is needed to change because we do not use pysparkstreaming
+      // Call the python script to deserialize the data and print the result to stdout
      val pb = new ProcessBuilder(pythonExec, sparkHome + "/python/pyspark/streaming/pyprint.py", tempFile.getAbsolutePath)
      val workerEnv = pb.environment()
 
-      // envVars also need to be pass
-      // workerEnv.putAll(envVars)
+      // envVars should also be passed in from python
      val pythonPath = sparkHome + "/python/" + File.pathSeparator + workerEnv.get("PYTHONPATH")
      workerEnv.put("PYTHONPATH", pythonPath)
      val worker = pb.start()
@@ -662,7 +661,7 @@ abstract class DStream[T: ClassTag] (
      println("Time: " + time)
      println("-------------------------------------------")
 
-      // print value from python std out
+      // Print the values read from python's stdout
      var line = ""
      breakable {
        while (true) {
@@ -671,7 +670,7 @@ abstract class DStream[T: ClassTag] (
          println(line)
        }
      }
-      // delete temporary file
+      // Delete temporary file
      tempFile.delete()
      println()
 
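For reference, here is a minimal sketch of what the deserialize-and-print helper invoked above (python/pyspark/streaming/pyprint.py) might look like. This is an assumption, not the script shipped with this change: it presumes the temporary file holds the length-framed pickled records that PythonRDD.writeIteratorToStream produces, and that PySpark's PickleSerializer.load_stream can read them back.

# Hypothetical sketch of pyprint.py -- not the actual script in this change.
# Assumes the temp file holds length-framed pickled records, as written by
# PythonRDD.writeIteratorToStream on the Scala side.
import sys

from pyspark.serializers import PickleSerializer


def main(path):
    ser = PickleSerializer()
    with open(path, "rb") as f:
        # load_stream yields one deserialized object per framed record
        for i, obj in enumerate(ser.load_stream(f)):
            if i >= 10:
                print("...")
                break
            print(obj)


if __name__ == "__main__":
    main(sys.argv[1])

The Scala side passes the temp file's absolute path as the script's only argument; because rdd.take(11) is used, at most ten records are printed, with an ellipsis if an eleventh exists.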