Commit 1f68b78

WIP

1 parent c05922c

File tree

3 files changed: +14, -4 lines

core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala
Lines changed: 2 additions & 0 deletions

@@ -312,6 +312,8 @@ private[spark] object PythonRDD extends Logging {
     } catch {
       case eof: EOFException => {}
     }
+    println("RDDDD ==================")
+    println(objs)
     JavaRDD.fromRDD(sc.sc.parallelize(objs, parallelism))
   }
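The two added println calls are temporary debug output: they dump the deserialized objects right after the read loop that catches EOFException to detect the end of the stream, just before the objects are parallelized into a JavaRDD. For illustration only, here is a minimal Python sketch of the same read-until-EOF pattern, with plain pickle standing in for Spark's serializers (read_objects is a hypothetical helper, not part of this commit):

import pickle

def read_objects(path):
    # Read pickled objects until end-of-file, mirroring the Scala loop
    # above that treats EOFException as the end of the stream.
    objs = []
    with open(path, "rb") as f:
        while True:
            try:
                objs.append(pickle.load(f))
            except EOFError:
                break
    return objs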

examples/src/main/python/streaming/test_oprations.py
Lines changed: 7 additions & 3 deletions

@@ -9,11 +9,15 @@
 conf = SparkConf()
 conf.setAppName("PythonStreamingNetworkWordCount")
 ssc = StreamingContext(conf=conf, duration=Seconds(1))
+ssc.checkpoint("/tmp/spark_ckp")
 
-test_input = ssc._testInputStream([1,1,1,1])
-mapped = test_input.map(lambda x: (x, 1))
-mapped.pyprint()
+test_input = ssc._testInputStream([[1],[1],[1]])
+# ssc.checkpoint("/tmp/spark_ckp")
+fm_test = test_input.flatMap(lambda x: x.split(" "))
+mapped_test = fm_test.map(lambda x: (x, 1))
 
+
+mapped_test.print_()
 ssc.start()
 # ssc.awaitTermination()
 # ssc.stop()
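One thing worth flagging in the new script: the test input elements are lists ([1]), but the flatMap lambda calls x.split(" "), which only exists on strings, so this pipeline would fail at runtime as written; presumably string input is the eventual intent of this WIP. A minimal sketch of the flatMap/map word-count shape the script appears to be converging on, written against a plain RDD so it runs standalone (the names here are illustrative, not part of the commit):

from pyspark import SparkContext

sc = SparkContext("local", "wordcount_sketch")
lines = sc.parallelize(["a a b", "b c"])  # stand-in for the streaming test input
counts = (lines
          .flatMap(lambda line: line.split(" "))   # split each line into words
          .map(lambda word: (word, 1))             # pair each word with a count of 1
          .reduceByKey(lambda a, b: a + b))        # sum the counts per word
print(counts.collect())  # e.g. [('a', 2), ('b', 2), ('c', 1)]
sc.stop()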

python/pyspark/streaming/context.py
Lines changed: 5 additions & 1 deletion

@@ -146,7 +146,10 @@ def _testInputStream(self, test_input, numSlices=None):
         # Calling the Java parallelize() method with an ArrayList is too slow,
         # because it sends O(n) Py4J commands. As an alternative, serialized
         # objects are written to a file and loaded through textFile().
-        tempFile = NamedTemporaryFile(delete=False, dir=self._sc._temp_dir)
+
+        #tempFile = NamedTemporaryFile(delete=False, dir=self._sc._temp_dir)
+        tempFile = open("/tmp/spark_rdd", "wb")
+
         # Make sure we distribute data evenly if it's smaller than self.batchSize
         if "__len__" not in dir(test_input):
             c = list(test_input)    # Make it a list so we can compute its length
@@ -157,6 +160,7 @@ def _testInputStream(self, test_input, numSlices=None):
         else:
             serializer = self._sc._unbatched_serializer
         serializer.dump_stream(test_input, tempFile)
+        tempFile.flush()
         tempFile.close()
         print tempFile.name
         jinput_stream = self._jvm.PythonTestInputStream(self._jssc,
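Two debugging-oriented changes here: the randomly named NamedTemporaryFile is swapped for the fixed path /tmp/spark_rdd, so the serialized test input has a predictable name and can be inspected across runs, and an explicit flush() is added before close(). The flush() is redundant when close() follows immediately, since close() already flushes, but it makes the intent explicit while this write path is being debugged. A self-contained sketch of the write side of the Python-to-JVM handoff, with pickle standing in for the PySpark serializer (write_objects is a hypothetical helper, not part of the commit):

import pickle
from tempfile import NamedTemporaryFile

def write_objects(objs, path=None):
    # Dump a stream of pickled objects to a file and return its path,
    # mimicking serializer.dump_stream() feeding the JVM-side reader.
    f = open(path, "wb") if path else NamedTemporaryFile(delete=False)
    with f:
        for obj in objs:
            pickle.dump(obj, f)
        f.flush()  # make sure the bytes are on disk before the JVM opens the file
    return f.name

Pairing this with the read_objects sketch shown after the PythonRDD.scala diff above, read_objects(write_objects([1, 2, 3])) would round-trip the same objects.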
