Commit d4cf6d0

poc using arrow streams
1 parent 8007fa6 commit d4cf6d0

6 files changed: +62, -164 lines changed

python/pyspark/serializers.py

Lines changed: 16 additions & 11 deletions
@@ -359,19 +359,24 @@ def __repr__(self):
 class InterleavedArrowReader(object):
 
     def __init__(self, stream):
-        import pyarrow as pa
-        self._schema1 = pa.read_schema(stream)
-        self._schema2 = pa.read_schema(stream)
-        self._reader = pa.MessageReader.open_stream(stream)
+        self._stream = stream
 
     def __iter__(self):
         return self
 
     def __next__(self):
+        stream_status = read_int(self._stream)
+        if stream_status == SpecialLengths.START_ARROW_STREAM:
+            return self._read_df(), self._read_df()
+        elif stream_status == SpecialLengths.END_OF_DATA_SECTION:
+            raise StopIteration
+        else:
+            raise ValueError('Received invalid stream status {0}'.format(stream_status))
+
+    def _read_df(self):
         import pyarrow as pa
-        batch1 = pa.read_record_batch(self._reader.read_next_message(), self._schema1)
-        batch2 = pa.read_record_batch(self._reader.read_next_message(), self._schema2)
-        return batch1, batch2
+        reader = pa.ipc.open_stream(self._stream)
+        return [b for b in reader]
 
 
 class ArrowStreamPandasUDFSerializer(ArrowStreamPandasSerializer):

@@ -428,11 +433,11 @@ def load_stream(self, stream):
         """
         Deserialize ArrowRecordBatches to an Arrow table and return as a list of pandas.Series.
         """
-        import pyarrow as pa
-        reader = InterleavedArrowReader(pa.input_stream(stream))
+        reader = InterleavedArrowReader(stream)
         for batch1, batch2 in reader:
-            yield ( [self.arrow_to_pandas(c) for c in pa.Table.from_batches([batch1]).itercolumns()],
-                    [self.arrow_to_pandas(c) for c in pa.Table.from_batches([batch2]).itercolumns()])
+            import pyarrow as pa
+            yield ([self.arrow_to_pandas(c) for c in pa.Table.from_batches(batch1).itercolumns()],
+                   [self.arrow_to_pandas(c) for c in pa.Table.from_batches(batch2).itercolumns()])
 
 
 class BatchedSerializer(Serializer):
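
The hunks above replace the message-level reader (two schemas read up front plus pa.MessageReader) with a simpler framing: for every co-group the JVM writes a status int, then two complete Arrow IPC streams (left group, then right group), and finally an end-of-data marker. The following is a minimal, self-contained sketch of that framing and of how load_stream consumes it; the numeric marker values and the helper names write_int/read_int/write_group are assumptions for illustration only, not code from this patch.

import io
import struct

import pyarrow as pa

START_ARROW_STREAM = -6    # assumed value standing in for SpecialLengths.START_ARROW_STREAM
END_OF_DATA_SECTION = -1   # assumed value standing in for SpecialLengths.END_OF_DATA_SECTION

def write_int(value, stream):
    # PySpark frames control values as 4-byte big-endian signed ints.
    stream.write(struct.pack("!i", value))

def read_int(stream):
    return struct.unpack("!i", stream.read(4))[0]

def write_group(batch, stream):
    # Each group is a complete Arrow IPC stream: schema, batches, end-of-stream marker.
    writer = pa.ipc.new_stream(stream, batch.schema)
    writer.write_batch(batch)
    writer.close()

# Writer side: one co-group (left, then right), followed by the end marker.
buf = io.BytesIO()
left = pa.RecordBatch.from_arrays([pa.array([1, 2])], names=["id"])
right = pa.RecordBatch.from_arrays([pa.array(["a", "b"])], names=["v"])
write_int(START_ARROW_STREAM, buf)
write_group(left, buf)
write_group(right, buf)
write_int(END_OF_DATA_SECTION, buf)
buf.seek(0)

# Reader side, mirroring __next__/_read_df and load_stream: one batch list per
# side of the co-group, each converted column-wise to pandas.Series.
while True:
    status = read_int(buf)
    if status == END_OF_DATA_SECTION:
        break
    if status != START_ARROW_STREAM:
        raise ValueError('Received invalid stream status {0}'.format(status))
    left_batches = list(pa.ipc.open_stream(buf))
    right_batches = list(pa.ipc.open_stream(buf))
    left_series = [c.to_pandas() for c in pa.Table.from_batches(left_batches).itercolumns()]
    right_series = [c.to_pandas() for c in pa.Table.from_batches(right_batches).itercolumns()]
    print(left_series, right_series)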
Lines changed: 13 additions & 16 deletions
@@ -20,7 +20,7 @@ import org.apache.spark.TaskContext
 import org.apache.spark.api.python.{BasePythonRunner, ChainedPythonFunctions}
 import org.apache.spark.rdd.RDD
 import org.apache.spark.sql.catalyst.InternalRow
-import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, PythonUDF, UnsafeProjection}
+import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeSet, Expression, PythonUDF, UnsafeProjection}
 import org.apache.spark.sql.execution.{GroupedIterator, SparkPlan}
 import org.apache.spark.sql.types.StructType
 import org.apache.spark.sql.util.ArrowUtils

@@ -29,18 +29,19 @@ import org.apache.spark.sql.vectorized.{ArrowColumnVector, ColumnarBatch}
 import scala.collection.mutable.ArrayBuffer
 import scala.collection.JavaConverters._
 
-trait AbstractPandasGroupExec extends SparkPlan {
+abstract class BasePandasGroupExec(func: Expression,
+                                   output: Seq[Attribute]) extends SparkPlan {
 
   protected val sessionLocalTimeZone = conf.sessionLocalTimeZone
 
   protected val pythonRunnerConf = ArrowUtils.getPythonRunnerConfMap(conf)
 
-  protected def chainedFunc = Seq(
-    ChainedPythonFunctions(Seq(func.asInstanceOf[PythonUDF].func)))
+  protected val pandasFunction = func.asInstanceOf[PythonUDF].func
 
-  def output: Seq[Attribute]
+  protected val chainedFunc = Seq(ChainedPythonFunctions(Seq(pandasFunction)))
+
+  override def producedAttributes: AttributeSet = AttributeSet(output)
 
-  def func: Expression
 
   protected def executePython[T](data: Iterator[T],
       runner: BasePythonRunner[T, ColumnarBatch]): Iterator[InternalRow] = {

@@ -62,16 +63,12 @@ trait AbstractPandasGroupExec extends SparkPlan {
 
   protected def groupAndDedup(
       input: Iterator[InternalRow], groupingAttributes: Seq[Attribute],
-      inputSchema: Seq[Attribute], dedupSchema: Seq[Attribute]): Iterator[Iterator[InternalRow]] = {
-    if (groupingAttributes.isEmpty) {
-      Iterator(input)
-    } else {
-      val groupedIter = GroupedIterator(input, groupingAttributes, inputSchema)
-      val dedupProj = UnsafeProjection.create(dedupSchema, inputSchema)
-      groupedIter.map {
-        case (_, groupedRowIter) => groupedRowIter.map(dedupProj)
-      }
-    }
+      inputSchema: Seq[Attribute], dedupSchema: Seq[Attribute]): Iterator[(InternalRow, Iterator[InternalRow])] = {
+    val groupedIter = GroupedIterator(input, groupingAttributes, inputSchema)
+    val dedupProj = UnsafeProjection.create(dedupSchema, inputSchema)
+    groupedIter.map {
+      case (k, groupedRowIter) => (k, groupedRowIter.map(dedupProj))
+    }
   }
 
   protected def createSchema(child: SparkPlan, groupingAttributes: Seq[Attribute])
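
With this refactor, groupAndDedup always runs GroupedIterator (the old empty-grouping short-circuit is dropped) and returns (group key, deduplicated rows) pairs, so FlatMapCoGroupsInPandasExec can align left and right groups on the key while FlatMapGroupsInPandasExec simply discards it. A rough Python analogue of that shape, with purely illustrative names and not code from the patch:

from itertools import groupby

def group_and_dedup(rows, key_fn, dedup_fn):
    # rows are assumed to arrive clustered by key, which the real operator
    # gets from Spark's required child distribution and ordering.
    for key, group in groupby(rows, key=key_fn):
        # dedup_fn plays the role of the dedup projection applied to each row.
        yield key, (dedup_fn(r) for r in group)

rows = [("a", 1), ("a", 2), ("b", 3)]
for key, group in group_and_dedup(rows, key_fn=lambda r: r[0], dedup_fn=lambda r: r[1]):
    # FlatMapGroupsInPandasExec drops the key (.map{case(_, x) => x});
    # FlatMapCoGroupsInPandasExec keeps it to co-group the left and right sides.
    print(key, list(group))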

sql/core/src/main/scala/org/apache/spark/sql/execution/python/FlatMapCoGroupsInPandasExec.scala

Lines changed: 7 additions & 10 deletions
@@ -31,12 +31,10 @@ case class FlatMapCoGroupsInPandasExec(
     output: Seq[Attribute],
     left: SparkPlan,
     right: SparkPlan)
-  extends BinaryExecNode with AbstractPandasGroupExec {
+  extends BasePandasGroupExec(func, output) with BinaryExecNode{
 
   override def outputPartitioning: Partitioning = left.outputPartitioning
 
-  override def producedAttributes: AttributeSet = AttributeSet(output)
-
   override def requiredChildDistribution: Seq[Distribution] = {
     ClusteredDistribution(leftGroup) :: ClusteredDistribution(rightGroup) :: Nil
   }

@@ -48,16 +46,15 @@ case class FlatMapCoGroupsInPandasExec(
 
   override protected def doExecute(): RDD[InternalRow] = {
 
-    val (schemaLeft, attrLeft, _) = createSchema(left, leftGroup)
-    val (schemaRight, attrRight, _) = createSchema(right, rightGroup)
+    val (schemaLeft, leftDedup, _) = createSchema(left, leftGroup)
+    val (schemaRight, rightDedup, _) = createSchema(right, rightGroup)
 
     left.execute().zipPartitions(right.execute()) { (leftData, rightData) =>
-      val leftGrouped = GroupedIterator(leftData, leftGroup, left.output)
-      val rightGrouped = GroupedIterator(rightData, rightGroup, right.output)
-      val projLeft = UnsafeProjection.create(attrLeft, left.output)
-      val projRight = UnsafeProjection.create(attrRight, right.output)
+
+      val leftGrouped = groupAndDedup(leftData, leftGroup, left.output, leftDedup)
+      val rightGrouped = groupAndDedup(rightData, rightGroup, right.output, rightDedup)
       val data = new CoGroupedIterator(leftGrouped, rightGrouped, leftGroup)
-        .map{case (k, l, r) => (l.map(projLeft), r.map(projRight))}
+        .map{case (k, l, r) => (l, r)}
 
       val runner = new InterleavedArrowPythonRunner(
         chainedFunc,

sql/core/src/main/scala/org/apache/spark/sql/execution/python/FlatMapGroupsInPandasExec.scala

Lines changed: 2 additions & 3 deletions
@@ -53,12 +53,10 @@ case class FlatMapGroupsInPandasExec(
     func: Expression,
     output: Seq[Attribute],
     child: SparkPlan)
-  extends UnaryExecNode with AbstractPandasGroupExec {
+  extends BasePandasGroupExec(func, output) with UnaryExecNode {
 
   override def outputPartitioning: Partitioning = child.outputPartitioning
 
-  override def producedAttributes: AttributeSet = AttributeSet(output)
-
   override def requiredChildDistribution: Seq[Distribution] = {
     if (groupingAttributes.isEmpty) {
       AllTuples :: Nil

@@ -79,6 +77,7 @@ case class FlatMapGroupsInPandasExec(
     inputRDD.mapPartitionsInternal { iter => if (iter.isEmpty) iter else {
 
       val data = groupAndDedup(iter, groupingAttributes, child.output, dedupAttributes)
+        .map{case(_, x) => x}
 
       val runner = new ArrowPythonRunner(
         chainedFunc,

sql/core/src/main/scala/org/apache/spark/sql/execution/python/InterleavedArrowPythonRunner.scala

Lines changed: 24 additions & 39 deletions
@@ -21,7 +21,8 @@ import java.io._
 import java.net._
 
 import org.apache.arrow.vector.VectorSchemaRoot
-
+import org.apache.arrow.vector.dictionary.DictionaryProvider
+import org.apache.arrow.vector.ipc.ArrowStreamWriter
 import org.apache.spark._
 import org.apache.spark.api.python._
 import org.apache.spark.sql.catalyst.InternalRow

@@ -64,55 +65,39 @@ class InterleavedArrowPythonRunner(
       }
 
       protected override def writeIteratorToStream(dataOut: DataOutputStream): Unit = {
-        val leftArrowSchema = ArrowUtils.toArrowSchema(leftSchema, timeZoneId)
-        val rightArrowSchema = ArrowUtils.toArrowSchema(rightSchema, timeZoneId)
+        while (inputIterator.hasNext) {
+          dataOut.writeInt(SpecialLengths.START_ARROW_STREAM)
+          val (nextLeft, nextRight) = inputIterator.next()
+          writeGroup(nextLeft, leftSchema, dataOut)
+          writeGroup(nextRight, rightSchema, dataOut)
+        }
+        dataOut.writeInt(SpecialLengths.END_OF_DATA_SECTION)
+      }
+
+      def writeGroup(group: Iterator[InternalRow], schema: StructType, dataOut: DataOutputStream
+        ) = {
+        val arrowSchema = ArrowUtils.toArrowSchema(schema, timeZoneId)
         val allocator = ArrowUtils.rootAllocator.newChildAllocator(
           s"stdout writer for $pythonExec", 0, Long.MaxValue)
-        val leftRoot = VectorSchemaRoot.create(leftArrowSchema, allocator)
-        val rightRoot = VectorSchemaRoot.create(rightArrowSchema, allocator)
+        val root = VectorSchemaRoot.create(arrowSchema, allocator)
 
         Utils.tryWithSafeFinally {
-          val leftArrowWriter = ArrowWriter.create(leftRoot)
-          val rightArrowWriter = ArrowWriter.create(rightRoot)
-          val writer = InterleavedArrowWriter(leftRoot, rightRoot, dataOut)
+          val writer = new ArrowStreamWriter(root, null, dataOut)
+          val arrowWriter = ArrowWriter.create(root)
          writer.start()
 
-          while (inputIterator.hasNext) {
-
-            val (nextLeft, nextRight) = inputIterator.next()
-
-            while (nextLeft.hasNext) {
-              leftArrowWriter.write(nextLeft.next())
-            }
-            while (nextRight.hasNext) {
-              rightArrowWriter.write(nextRight.next())
-            }
-            leftArrowWriter.finish()
-            rightArrowWriter.finish()
-            writer.writeBatch()
-            leftArrowWriter.reset()
-            rightArrowWriter.reset()
+          while (group.hasNext) {
+            arrowWriter.write(group.next())
           }
-          // end writes footer to the output stream and doesn't clean any resources.
-          // It could throw exception if the output stream is closed, so it should be
-          // in the try block.
+          arrowWriter.finish()
+          writer.writeBatch()
          writer.end()
-        } {
-          // If we close root and allocator in TaskCompletionListener, there could be a race
-          // condition where the writer thread keeps writing to the VectorSchemaRoot while
-          // it's being closed by the TaskCompletion listener.
-          // Closing root and allocator here is cleaner because root and allocator is owned
-          // by the writer thread and is only visible to the writer thread.
-          //
-          // If the writer thread is interrupted by TaskCompletionListener, it should either
-          // (1) in the try block, in which case it will get an InterruptedException when
-          // performing io, and goes into the finally block or (2) in the finally block,
-          // in which case it will ignore the interruption and close the resources.
-          leftRoot.close()
-          rightRoot.close()
+        }{
+          root.close()
          allocator.close()
        }
      }
    }
  }
 }
+
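
With this change each group is written as its own standard Arrow IPC stream via ArrowStreamWriter, bracketed by the START_ARROW_STREAM/END_OF_DATA_SECTION ints, which is what lets the custom InterleavedArrowWriter in the next section be deleted: an IPC stream is self-delimiting (it carries its own schema header, batches, and an end-of-stream marker), so back-to-back streams on one pipe can be read one at a time. One apparent trade-off is that the schema is re-sent with every group's stream. A small pyarrow sketch of the self-delimiting property, illustrative only and not code from the patch:

import io
import pyarrow as pa

buf = io.BytesIO()
for values in ([1, 2, 3], [10, 20]):
    batch = pa.RecordBatch.from_arrays([pa.array(values)], names=["x"])
    writer = pa.ipc.new_stream(buf, batch.schema)  # writes this stream's own schema header
    writer.write_batch(batch)
    writer.close()                                 # writes the end-of-stream marker
buf.seek(0)

first = list(pa.ipc.open_stream(buf))   # stops at the first stream's EOS marker
second = list(pa.ipc.open_stream(buf))  # continues with the second stream
print(first[0].num_rows, second[0].num_rows)  # 3 2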

sql/core/src/main/scala/org/apache/spark/sql/execution/python/InterleavedArrowWriter.scala

Lines changed: 0 additions & 85 deletions
This file was deleted.
