
Commit 8007fa6

common trait for grouped pandas udfs
1 parent e3b66ac commit 8007fa6

4 files changed: +156 -110 lines changed


sql/core/src/main/scala/org/apache/spark/sql/RelationalGroupedDataset.scala

Lines changed: 9 additions & 5 deletions
@@ -47,8 +47,8 @@ import org.apache.spark.sql.types.{NumericType, StructType}
  */
 @Stable
 class RelationalGroupedDataset protected[sql](
-    private val df: DataFrame,
-    private val groupingExprs: Seq[Expression],
+    val df: DataFrame,
+    val groupingExprs: Seq[Expression],
     groupType: RelationalGroupedDataset.GroupType) {
 
   private[this] def toDF(aggExprs: Seq[Expression]): DataFrame = {
@@ -542,11 +542,15 @@ class RelationalGroupedDataset protected[sql](
 
     val leftAttributes = leftGroupingNamedExpressions.map(_.toAttribute)
     val rightAttributes = rightGroupingNamedExpressions.map(_.toAttribute)
-    val left = df.logicalPlan
-    val right = r.df.logicalPlan
+
+    val leftChild = df.logicalPlan
+    val rightChild = r.df.logicalPlan
+
+    val left = Project(leftGroupingNamedExpressions ++ leftChild.output, leftChild)
+    val right = Project(rightGroupingNamedExpressions ++ rightChild.output, rightChild)
+
     val output = expr.dataType.asInstanceOf[StructType].toAttributes
     val plan = FlatMapCoGroupsInPandas(leftAttributes, rightAttributes, expr, output, left, right)
-
     Dataset.ofRows(df.sparkSession, plan)
   }
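Side note, not part of the commit: the Project added above prepends the grouping expressions to each child's output, so the physical operator later sees the grouping columns as a prefix of child.output and can peel them off again (see createSchema in the new trait below). A minimal plain-Scala sketch of that bookkeeping, using hypothetical column names in place of Spark expressions:

    // Toy model only: Seq[String] stands in for the attribute lists.
    object ProjectPrefixSketch extends App {
      val grouping    = Seq("id")       // leftGroupingNamedExpressions (hypothetical)
      val childOutput = Seq("id", "v")  // leftChild.output (hypothetical)

      // Project(grouping ++ childOutput, child): grouping columns become a prefix.
      val projected = grouping ++ childOutput               // Seq(id, id, v)

      // Downstream, the exec node recovers the data columns by dropping that prefix.
      val dataAttributes = projected.drop(grouping.length)  // Seq(id, v)

      println(projected.mkString(", "))       // id, id, v
      println(dataAttributes.mkString(", "))  // id, v
    }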

sql/core/src/main/scala/org/apache/spark/sql/execution/python/AbstractPandasGroupExec.scala

Lines changed: 128 additions & 0 deletions
@@ -0,0 +1,128 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.spark.sql.execution.python
+
+import org.apache.spark.TaskContext
+import org.apache.spark.api.python.{BasePythonRunner, ChainedPythonFunctions}
+import org.apache.spark.rdd.RDD
+import org.apache.spark.sql.catalyst.InternalRow
+import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, PythonUDF, UnsafeProjection}
+import org.apache.spark.sql.execution.{GroupedIterator, SparkPlan}
+import org.apache.spark.sql.types.StructType
+import org.apache.spark.sql.util.ArrowUtils
+import org.apache.spark.sql.vectorized.{ArrowColumnVector, ColumnarBatch}
+
+import scala.collection.mutable.ArrayBuffer
+import scala.collection.JavaConverters._
+
+trait AbstractPandasGroupExec extends SparkPlan {
+
+  protected val sessionLocalTimeZone = conf.sessionLocalTimeZone
+
+  protected val pythonRunnerConf = ArrowUtils.getPythonRunnerConfMap(conf)
+
+  protected def chainedFunc = Seq(
+    ChainedPythonFunctions(Seq(func.asInstanceOf[PythonUDF].func)))
+
+  def output: Seq[Attribute]
+
+  def func: Expression
+
+  protected def executePython[T](data: Iterator[T],
+      runner: BasePythonRunner[T, ColumnarBatch]): Iterator[InternalRow] = {
+
+    val context = TaskContext.get()
+    val columnarBatchIter = runner.compute(data, context.partitionId(), context)
+    val unsafeProj = UnsafeProjection.create(output, output)
+
+    columnarBatchIter.flatMap { batch =>
+      // UDF returns a StructType column in ColumnarBatch, select the children here
+      val structVector = batch.column(0).asInstanceOf[ArrowColumnVector]
+      val outputVectors = output.indices.map(structVector.getChild)
+      val flattenedBatch = new ColumnarBatch(outputVectors.toArray)
+      flattenedBatch.setNumRows(batch.numRows())
+      flattenedBatch.rowIterator.asScala
+    }.map(unsafeProj)
+
+  }
+
+  protected def groupAndDedup(
+      input: Iterator[InternalRow], groupingAttributes: Seq[Attribute],
+      inputSchema: Seq[Attribute], dedupSchema: Seq[Attribute]): Iterator[Iterator[InternalRow]] = {
+    if (groupingAttributes.isEmpty) {
+      Iterator(input)
+    } else {
+      val groupedIter = GroupedIterator(input, groupingAttributes, inputSchema)
+      val dedupProj = UnsafeProjection.create(dedupSchema, inputSchema)
+      groupedIter.map {
+        case (_, groupedRowIter) => groupedRowIter.map(dedupProj)
+      }
+    }
+  }
+
+  protected def createSchema(child: SparkPlan, groupingAttributes: Seq[Attribute])
+    : (StructType, Seq[Attribute], Array[Array[Int]]) = {
+
+    // Deduplicate the grouping attributes.
+    // If a grouping attribute also appears in data attributes, then we don't need to send the
+    // grouping attribute to Python worker. If a grouping attribute is not in data attributes,
+    // then we need to send this grouping attribute to python worker.
+    //
+    // We use argOffsets to distinguish grouping attributes and data attributes as following:
+    //
+    // argOffsets[0] is the length of grouping attributes
+    // argOffsets[1 .. argOffsets[0]+1] is the arg offsets for grouping attributes
+    // argOffsets[argOffsets[0]+1 .. ] is the arg offsets for data attributes
+
+    val dataAttributes = child.output.drop(groupingAttributes.length)
+    val groupingIndicesInData = groupingAttributes.map { attribute =>
+      dataAttributes.indexWhere(attribute.semanticEquals)
+    }
+
+    val groupingArgOffsets = new ArrayBuffer[Int]
+    val nonDupGroupingAttributes = new ArrayBuffer[Attribute]
+    val nonDupGroupingSize = groupingIndicesInData.count(_ == -1)
+
+    // Non duplicate grouping attributes are added to nonDupGroupingAttributes and
+    // their offsets are 0, 1, 2 ...
+    // Duplicate grouping attributes are NOT added to nonDupGroupingAttributes and
+    // their offsets are n + index, where n is the total number of non duplicate grouping
+    // attributes and index is the index in the data attributes that the grouping attribute
+    // is a duplicate of.
+
+    groupingAttributes.zip(groupingIndicesInData).foreach {
+      case (attribute, index) =>
+        if (index == -1) {
+          groupingArgOffsets += nonDupGroupingAttributes.length
+          nonDupGroupingAttributes += attribute
+        } else {
+          groupingArgOffsets += index + nonDupGroupingSize
+        }
+    }
+
+    val dataArgOffsets = nonDupGroupingAttributes.length until
+      (nonDupGroupingAttributes.length + dataAttributes.length)
+
+    val argOffsets = Array(Array(groupingAttributes.length) ++ groupingArgOffsets ++ dataArgOffsets)
+
+    // Attributes after deduplication
+    val dedupAttributes = nonDupGroupingAttributes ++ dataAttributes
+    val dedupSchema = StructType.fromAttributes(dedupAttributes)
+    (dedupSchema, dedupAttributes, argOffsets)
+  }
+
+}
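To make the argOffsets layout in createSchema concrete, here is a self-contained sketch that is not part of the commit: plain strings stand in for Attributes, semanticEquals is approximated by name equality, and the column names are purely illustrative. It models the inner array of the Array(Array(...)) the real method returns.

    import scala.collection.mutable.ArrayBuffer

    // Grouping columns that also appear in the data are not sent twice to the
    // Python worker; they are addressed by an offset into the deduplicated schema.
    object ArgOffsetsSketch extends App {
      val grouping = Seq("id", "bucket")  // grouping attributes (hypothetical)
      val data     = Seq("id", "v")       // data attributes after dropping the grouping prefix

      val indicesInData = grouping.map(data.indexOf)    // Seq(0, -1): "id" duplicates data(0)
      val nonDupSize    = indicesInData.count(_ == -1)  // one grouping column is not in the data

      val groupingOffsets = ArrayBuffer[Int]()
      val nonDupGrouping  = ArrayBuffer[String]()
      grouping.zip(indicesInData).foreach {
        case (attr, -1) =>
          groupingOffsets += nonDupGrouping.length
          nonDupGrouping += attr
        case (_, index) =>
          groupingOffsets += index + nonDupSize  // points into the data section
      }

      val dataOffsets = nonDupGrouping.length until (nonDupGrouping.length + data.length)
      val argOffsets  = Array(grouping.length) ++ groupingOffsets ++ dataOffsets
      val dedupSchema = nonDupGrouping ++ data

      println(dedupSchema.mkString(", "))  // bucket, id, v
      println(argOffsets.mkString(", "))   // 2, 1, 0, 1, 2
    }

So a duplicated grouping column ("id" here) is read straight out of the data section (offset 1), and only the non-duplicated one ("bucket") is shipped as an extra leading column.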

sql/core/src/main/scala/org/apache/spark/sql/execution/python/FlatMapCoGroupsInPandasExec.scala

Lines changed: 13 additions & 31 deletions
@@ -17,17 +17,12 @@
 
 package org.apache.spark.sql.execution.python
 
-import scala.collection.JavaConverters._
-
-import org.apache.spark.TaskContext
-import org.apache.spark.api.python.{ChainedPythonFunctions, PythonEvalType}
+import org.apache.spark.api.python.PythonEvalType
 import org.apache.spark.rdd.RDD
 import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.plans.physical.{ClusteredDistribution, Distribution, Partitioning}
 import org.apache.spark.sql.execution.{BinaryExecNode, CoGroupedIterator, GroupedIterator, SparkPlan}
-import org.apache.spark.sql.util.ArrowUtils
-import org.apache.spark.sql.vectorized.{ArrowColumnVector, ColumnarBatch}
 
 case class FlatMapCoGroupsInPandasExec(
     leftGroup: Seq[Attribute],
@@ -36,9 +31,7 @@ case class FlatMapCoGroupsInPandasExec(
     output: Seq[Attribute],
     left: SparkPlan,
     right: SparkPlan)
-  extends BinaryExecNode {
-
-  private val pandasFunction = func.asInstanceOf[PythonUDF].func
+  extends BinaryExecNode with AbstractPandasGroupExec {
 
   override def outputPartitioning: Partitioning = left.outputPartitioning
 
@@ -53,41 +46,30 @@ case class FlatMapCoGroupsInPandasExec(
       .map(SortOrder(_, Ascending)) :: rightGroup.map(SortOrder(_, Ascending)) :: Nil
   }
 
-
   override protected def doExecute(): RDD[InternalRow] = {
 
-    val chainedFunc = Seq(ChainedPythonFunctions(Seq(pandasFunction)))
-    val sessionLocalTimeZone = conf.sessionLocalTimeZone
-    val pythonRunnerConf = ArrowUtils.getPythonRunnerConfMap(conf)
-
+    val (schemaLeft, attrLeft, _) = createSchema(left, leftGroup)
+    val (schemaRight, attrRight, _) = createSchema(right, rightGroup)
 
     left.execute().zipPartitions(right.execute()) { (leftData, rightData) =>
       val leftGrouped = GroupedIterator(leftData, leftGroup, left.output)
       val rightGrouped = GroupedIterator(rightData, rightGroup, right.output)
-      val cogroup = new CoGroupedIterator(leftGrouped, rightGrouped, leftGroup)
-        .map{case (k, l, r) => (l, r)}
-      val context = TaskContext.get()
+      val projLeft = UnsafeProjection.create(attrLeft, left.output)
+      val projRight = UnsafeProjection.create(attrRight, right.output)
+      val data = new CoGroupedIterator(leftGrouped, rightGrouped, leftGroup)
+        .map{case (k, l, r) => (l.map(projLeft), r.map(projRight))}
 
-      val columnarBatchIter = new InterleavedArrowPythonRunner(
+      val runner = new InterleavedArrowPythonRunner(
        chainedFunc,
        PythonEvalType.SQL_COGROUPED_MAP_PANDAS_UDF,
        Array(Array.empty),
-       left.schema,
-       right.schema,
+       schemaLeft,
+       schemaRight,
       sessionLocalTimeZone,
-      pythonRunnerConf).compute(cogroup, context.partitionId(), context)
-
+      pythonRunnerConf)
 
-      val unsafeProj = UnsafeProjection.create(output, output)
+      executePython(data, runner)
 
-      columnarBatchIter.flatMap { batch =>
-        // UDF returns a StructType column in ColumnarBatch, select the children here
-        val structVector = batch.column(0).asInstanceOf[ArrowColumnVector]
-        val outputVectors = output.indices.map(structVector.getChild)
-        val flattenedBatch = new ColumnarBatch(outputVectors.toArray)
-        flattenedBatch.setNumRows(batch.numRows())
-        flattenedBatch.rowIterator.asScala
-      }.map(unsafeProj)
     }
 
   }
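As a rough mental model of what this node now feeds the runner (again, not from the commit): the two inputs are grouped by key on each side, CoGroupedIterator pairs up the per-key groups across the union of keys, the key is dropped, and each (left rows, right rows) pair goes to Python. A toy, Spark-free sketch with hypothetical (key, value) pairs:

    // Pair per-key groups from two key-sorted inputs, full-outer style.
    object CoGroupSketch extends App {
      val left  = Seq(1 -> "a", 1 -> "b", 2 -> "c")  // hypothetical (key, value) rows
      val right = Seq(1 -> "x", 3 -> "y")

      val keys = (left.map(_._1) ++ right.map(_._1)).distinct.sorted
      val cogrouped = keys.map { k =>
        (k, left.collect { case (`k`, v) => v }, right.collect { case (`k`, v) => v })
      }

      // The exec node then drops the key, as in .map{case (k, l, r) => ...} above.
      cogrouped.map { case (_, l, r) => (l, r) }.foreach(println)
      // (List(a, b),List(x))
      // (List(c),List())
      // (List(),List(y))
    }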

sql/core/src/main/scala/org/apache/spark/sql/execution/python/FlatMapGroupsInPandasExec.scala

Lines changed: 6 additions & 74 deletions
@@ -53,9 +53,7 @@ case class FlatMapGroupsInPandasExec(
     func: Expression,
     output: Seq[Attribute],
     child: SparkPlan)
-  extends UnaryExecNode {
-
-  private val pandasFunction = func.asInstanceOf[PythonUDF].func
+  extends UnaryExecNode with AbstractPandasGroupExec {
 
   override def outputPartitioning: Partitioning = child.outputPartitioning
 
@@ -75,88 +73,22 @@
   override protected def doExecute(): RDD[InternalRow] = {
     val inputRDD = child.execute()
 
-    val chainedFunc = Seq(ChainedPythonFunctions(Seq(pandasFunction)))
-    val sessionLocalTimeZone = conf.sessionLocalTimeZone
-    val pythonRunnerConf = ArrowUtils.getPythonRunnerConfMap(conf)
-
-    // Deduplicate the grouping attributes.
-    // If a grouping attribute also appears in data attributes, then we don't need to send the
-    // grouping attribute to Python worker. If a grouping attribute is not in data attributes,
-    // then we need to send this grouping attribute to python worker.
-    //
-    // We use argOffsets to distinguish grouping attributes and data attributes as following:
-    //
-    // argOffsets[0] is the length of grouping attributes
-    // argOffsets[1 .. argOffsets[0]+1] is the arg offsets for grouping attributes
-    // argOffsets[argOffsets[0]+1 .. ] is the arg offsets for data attributes
-
-    val dataAttributes = child.output.drop(groupingAttributes.length)
-    val groupingIndicesInData = groupingAttributes.map { attribute =>
-      dataAttributes.indexWhere(attribute.semanticEquals)
-    }
-
-    val groupingArgOffsets = new ArrayBuffer[Int]
-    val nonDupGroupingAttributes = new ArrayBuffer[Attribute]
-    val nonDupGroupingSize = groupingIndicesInData.count(_ == -1)
-
-    // Non duplicate grouping attributes are added to nonDupGroupingAttributes and
-    // their offsets are 0, 1, 2 ...
-    // Duplicate grouping attributes are NOT added to nonDupGroupingAttributes and
-    // their offsets are n + index, where n is the total number of non duplicate grouping
-    // attributes and index is the index in the data attributes that the grouping attribute
-    // is a duplicate of.
-
-    groupingAttributes.zip(groupingIndicesInData).foreach {
-      case (attribute, index) =>
-        if (index == -1) {
-          groupingArgOffsets += nonDupGroupingAttributes.length
-          nonDupGroupingAttributes += attribute
-        } else {
-          groupingArgOffsets += index + nonDupGroupingSize
-        }
-    }
-
-    val dataArgOffsets = nonDupGroupingAttributes.length until
-      (nonDupGroupingAttributes.length + dataAttributes.length)
-
-    val argOffsets = Array(Array(groupingAttributes.length) ++ groupingArgOffsets ++ dataArgOffsets)
-
-    // Attributes after deduplication
-    val dedupAttributes = nonDupGroupingAttributes ++ dataAttributes
-    val dedupSchema = StructType.fromAttributes(dedupAttributes)
+    val (dedupSchema, dedupAttributes, argOffsets) = createSchema(child, groupingAttributes)
 
     // Map grouped rows to ArrowPythonRunner results, Only execute if partition is not empty
     inputRDD.mapPartitionsInternal { iter => if (iter.isEmpty) iter else {
-      val grouped = if (groupingAttributes.isEmpty) {
-        Iterator(iter)
-      } else {
-        val groupedIter = GroupedIterator(iter, groupingAttributes, child.output)
-        val dedupProj = UnsafeProjection.create(dedupAttributes, child.output)
-        groupedIter.map {
-          case (_, groupedRowIter) => groupedRowIter.map(dedupProj)
-        }
-      }
 
-      val context = TaskContext.get()
+      val data = groupAndDedup(iter, groupingAttributes, child.output, dedupAttributes)
 
-      val columnarBatchIter = new ArrowPythonRunner(
+      val runner = new ArrowPythonRunner(
        chainedFunc,
        PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF,
        argOffsets,
        dedupSchema,
        sessionLocalTimeZone,
-       pythonRunnerConf).compute(grouped, context.partitionId(), context)
-
-      val unsafeProj = UnsafeProjection.create(output, output)
+       pythonRunnerConf)
 
-      columnarBatchIter.flatMap { batch =>
-        // Grouped Map UDF returns a StructType column in ColumnarBatch, select the children here
-        val structVector = batch.column(0).asInstanceOf[ArrowColumnVector]
-        val outputVectors = output.indices.map(structVector.getChild)
-        val flattenedBatch = new ColumnarBatch(outputVectors.toArray)
-        flattenedBatch.setNumRows(batch.numRows())
-        flattenedBatch.rowIterator.asScala
-      }.map(unsafeProj)
+      executePython(data, runner)
     }}
   }
 }
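Stepping back, the refactoring follows the usual Scala mixin pattern: the trait declares func and output as abstract members, the case classes satisfy them with their existing constructor parameters, and the shared plumbing (chainedFunc, sessionLocalTimeZone, pythonRunnerConf, executePython, createSchema, groupAndDedup) is written once against those members. A stripped-down illustration of the pattern with hypothetical names, independent of Spark:

    // Shared helpers live in the trait and are defined in terms of abstract members.
    trait GroupExecLike {
      def label: String                      // plays the role of func/output
      def banner: String = s"[exec] $label"  // plays the role of chainedFunc etc.
    }

    final case class MapNode(label: String) extends GroupExecLike
    final case class CoMapNode(label: String, other: String) extends GroupExecLike

    object MixinSketch extends App {
      println(MapNode("grouped map").banner)  // [exec] grouped map
      println(CoMapNode("cogrouped map", "right").banner)
    }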
