Commit c455c8d

Ken Takagiwa authored and committed
added reducedByKey not working yet
1 parent dc6995d commit c455c8d

File tree

2 files changed: +86 -1 lines changed

python/pyspark/streaming/dstream.py

Lines changed: 1 addition & 1 deletion
@@ -143,7 +143,7 @@ def _defaultReducePartitions(self):
         if self.ctx._conf.contains("spark.default.parallelism"):
             return self.ctx.defaultParallelism
         else:
-            return self.getNumPartitions()
+            return 2
 
     def getNumPartitions(self):
         """

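The net effect of this one-line change: when spark.default.parallelism is not set, reduce operations now default to a hardcoded 2 partitions instead of the stream's own partition count. A minimal, runnable sketch of the fallback pattern, with illustrative names that are not part of this commit:

    # Sketch of the fallback pattern behind _defaultReducePartitions().
    class MiniDStream:
        def __init__(self, conf):
            self._conf = conf  # stand-in for self.ctx._conf

        def _default_reduce_partitions(self):
            # Mirrors the changed method: use the configured parallelism
            # if present, otherwise fall back to the hardcoded 2.
            if "spark.default.parallelism" in self._conf:
                return self._conf["spark.default.parallelism"]
            return 2

        def reduce_partitions(self, num_partitions=None):
            # Reduce-style operators consult the default only when the
            # caller gives no explicit partition count.
            if num_partitions is None:
                num_partitions = self._default_reduce_partitions()
            return num_partitions

    print(MiniDStream({}).reduce_partitions())                                # 2
    print(MiniDStream({"spark.default.parallelism": 8}).reduce_partitions())  # 8
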
streaming/src/main/scala/org/apache/spark/streaming/api/python/PythonDStream.scala

Lines changed: 85 additions & 0 deletions
@@ -55,6 +55,7 @@ class PythonDStream[T: ClassTag](
       case None => None
     }
   }
+<<<<<<< HEAD
 
   val asJavaDStream = JavaDStream.fromDStream(this)
 
@@ -133,3 +134,87 @@ DStream[(Long, Array[Byte])](prev.ssc){
   }
   val asJavaPairDStream : JavaPairDStream[Long, Array[Byte]] = JavaPairDStream.fromJavaDStream(this)
 }
+=======
+  val asJavaDStream = JavaDStream.fromDStream(this)
+
+  /**
+   * Print the first ten elements of each PythonRDD generated in this PythonDStream. This is an
+   * output operator, so this PythonDStream is registered as an output stream and materialized there.
+   * Since the serialized Python objects are only readable by Python, ppyprint writes the binary data
+   * to a temporary file and runs a python script to deserialize and print the first ten elements.
+   */
+  private[streaming] def ppyprint() {
+    def foreachFunc = (rdd: RDD[Array[Byte]], time: Time) => {
+      val iter = rdd.take(11).iterator
+
+      // make a temporary file
+      val prefix = "spark"
+      val suffix = ".tmp"
+      val tempFile = File.createTempFile(prefix, suffix)
+      val tempFileStream = new DataOutputStream(new FileOutputStream(tempFile.getAbsolutePath))
+      // write out the serialized python objects
+      PythonRDD.writeIteratorToStream(iter, tempFileStream)
+      tempFileStream.close()
+
+      // This value has to be passed from python
+      //val pythonExec = new ProcessBuilder().environment().get("PYSPARK_PYTHON")
+      val sparkHome = new ProcessBuilder().environment().get("SPARK_HOME")
+      //val pb = new ProcessBuilder(Seq(pythonExec, sparkHome + "/python/pyspark/streaming/pyprint.py", tempFile.getAbsolutePath())) // why this fails to compile???
+      // the absolute path to the python script needs to change because we do not use pysparkstreaming
+      val pb = new ProcessBuilder(pythonExec, sparkHome + "/python/pysparkstreaming/streaming/pyprint.py", tempFile.getAbsolutePath)
+      val workerEnv = pb.environment()
+
+      // envVars also needs to be passed
+      //workerEnv.putAll(envVars)
+      val pythonPath = sparkHome + "/python/" + File.pathSeparator + workerEnv.get("PYTHONPATH")
+      workerEnv.put("PYTHONPATH", pythonPath)
+      val worker = pb.start()
+      val is = worker.getInputStream()
+      val isr = new InputStreamReader(is)
+      val br = new BufferedReader(isr)
+
+      println("-------------------------------------------")
+      println("Time: " + time)
+      println("-------------------------------------------")
+
+      // print values from python stdout
+      var line = ""
+      breakable {
+        while (true) {
+          line = br.readLine()
+          if (line == null) break()
+          println(line)
+        }
+      }
+      // delete the temporary file
+      tempFile.delete()
+      println()
+
+    }
+    new ForEachDStream(this, context.sparkContext.clean(foreachFunc)).register()
+  }
+}
+
+
+private class PairwiseDStream(prev: DStream[Array[Byte]]) extends
+DStream[(Long, Array[Byte])](prev.ssc) {
+  override def dependencies = List(prev)
+
+  override def slideDuration: Duration = prev.slideDuration
+
+  override def compute(validTime: Time): Option[RDD[(Long, Array[Byte])]] = {
+    prev.getOrCompute(validTime) match {
+      case Some(rdd) => Some(rdd)
+        val pairwiseRDD = new PairwiseRDD(rdd)
+        Some(pairwiseRDD.asJavaPairRDD.rdd)
+      case None => None
+    }
+  }
+  val asJavaPairDStream: JavaPairDStream[Long, Array[Byte]] = JavaPairDStream.fromJavaDStream(this)
+}
+
+
+
+
+
+>>>>>>> added reducedByKey not working yet
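
The ppyprint operator shells out to a pyprint.py helper that is not included in this diff, and the hunk does not compile yet: pythonExec is referenced while its definition is still commented out, consistent with the "not working yet" commit message. A plausible sketch of such a helper, assuming the temp file holds the length-framed pickled elements that PythonRDD.writeIteratorToStream produces (the framing and the script's interface are assumptions, not confirmed by this commit):

    # Hypothetical pyprint.py: print the first ten elements from the temp
    # file written on the Scala side. Assumes a framed pickle stream:
    # a 4-byte big-endian length followed by that many pickled bytes.
    import pickle
    import struct
    import sys

    def load_stream(f):
        while True:
            header = f.read(4)
            if len(header) < 4:
                return
            (length,) = struct.unpack(">i", header)
            yield pickle.loads(f.read(length))

    if __name__ == "__main__":
        with open(sys.argv[1], "rb") as f:
            for i, obj in enumerate(load_stream(f)):
                if i == 10:
                    print("...")  # rdd.take(11) fetched one extra element
                    break
                print(obj)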

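PairwiseDStream is the JVM half of a Python-side shuffle, in the same mold as PySpark's PairwiseRDD: each element arrives as a (Long, Array[Byte]) pair of key hash plus pickled record, so the JVM can partition on the Long hash without deserializing the payload. Note that the first Some(rdd) in its compute method is dead code, since a Scala case body evaluates to its last expression; together with the unresolved conflict markers, this is part of what "not working yet" refers to. A rough sketch of the Python-side pairing step such a class expects, loosely modeled on the pattern in pyspark.rdd.partitionBy (all names below are illustrative):

    # Hypothetical Python-side pairing: emit the key's hash as an 8-byte
    # big-endian signed long, then the pickled (key, value) record, so the
    # JVM side can shuffle on the Long alone.
    import pickle
    import struct

    def add_shuffle_key(iterator):
        for key, value in iterator:
            yield struct.pack(">q", hash(key))  # read back as a Long on the JVM
            yield pickle.dumps((key, value))    # opaque payload for the JVM

    # Example: two records become four interleaved byte strings.
    for blob in add_shuffle_key([("a", 1), ("b", 2)]):
        print(len(blob))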