[ARROW] Arrow serialization should not introduce extra shuffle for outermost limit #4662

Status: Closed (wants to merge 28 commits)
Changes from 1 commit

Commits (28)
a584943  arrow take (cfmcgrady, Mar 23, 2023)
8593d85  driver slice last batch (cfmcgrady, Mar 24, 2023)
0088671  refine (cfmcgrady, Mar 29, 2023)
ed8c692  refactor (cfmcgrady, Apr 3, 2023)
4212a89  refactor and add ut (cfmcgrady, Apr 4, 2023)
6c5b1eb  add ut (cfmcgrady, Apr 4, 2023)
ee5a756  revert unnecessary changes (cfmcgrady, Apr 4, 2023)
4e7ca54  unnecessary changes (cfmcgrady, Apr 4, 2023)
885cf2c  infer row size by schema.defaultSize (cfmcgrady, Apr 4, 2023)
25e4f05  add docs (cfmcgrady, Apr 4, 2023)
03d0747  address comment (cfmcgrady, Apr 6, 2023)
2286afc  reflective call AdaptiveSparkPlanExec.finalPhysicalPlan (cfmcgrady, Apr 6, 2023)
81886f0  address comment (cfmcgrady, Apr 6, 2023)
e3bf84c  refactor (cfmcgrady, Apr 6, 2023)
d70aee3  SparkPlan.session -> SparkSession.active to adapt Spark-3.1.x (cfmcgrady, Apr 6, 2023)
4cef204  SparkArrowbasedOperationSuite adapt Spark-3.1.x (cfmcgrady, Apr 6, 2023)
573a262  fix (cfmcgrady, Apr 6, 2023)
c83cf3f  SparkArrowbasedOperationSuite adapt Spark-3.1.x (cfmcgrady, Apr 6, 2023)
9ffb44f  make toBatchIterator private (cfmcgrady, Apr 6, 2023)
b72bc6f  add offset support to adapt Spark-3.4.x (cfmcgrady, Apr 6, 2023)
22cc70f  add ut (cfmcgrady, Apr 6, 2023)
8280783  add `isStaticConfigKey` to adapt Spark-3.1.x (cfmcgrady, Apr 7, 2023)
6d596fc  address comment (cfmcgrady, Apr 7, 2023)
6064ab9  limit = 0 test case (cfmcgrady, Apr 7, 2023)
3700839  SparkArrowbasedOperationSuite adapt Spark-3.1.x (cfmcgrady, Apr 7, 2023)
facc13f  exclude rule OptimizeLimitZero (cfmcgrady, Apr 7, 2023)
130bcb1  finally close (cfmcgrady, Apr 7, 2023)
82c912e  close vector (cfmcgrady, Apr 8, 2023)
add offset support to adapt Spark-3.4.x
cfmcgrady committed Apr 6, 2023
commit b72bc6fb2da63bd91b12ab7f237a848b37b5b1ce
@@ -42,7 +42,8 @@ object SparkDatasetHelper {
   def executeArrowBatchCollect: SparkPlan => Array[Array[Byte]] = {
     case adaptiveSparkPlan: AdaptiveSparkPlanExec =>
       executeArrowBatchCollect(finalPhysicalPlan(adaptiveSparkPlan))
-    case collectLimit: CollectLimitExec =>
+    // TODO: avoid extra shuffle if `offset` > 0
+    case collectLimit: CollectLimitExec if offset(collectLimit) <= 0 =>
       doCollectLimit(collectLimit)
     case plan: SparkPlan =>
       toArrowBatchRdd(plan).collect()
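An aside for context (not part of the diff): the guarded branch matters because a bare outermost LIMIT collected on the driver is served by Spark's take-style path, while lowering the same plan to an RDD first (the toArrowBatchRdd fallback) plans the limit as LocalLimit + Exchange + GlobalLimit, i.e. the extra shuffle this PR avoids. A minimal sketch with public Dataset APIs, illustrative names only:

import org.apache.spark.sql.SparkSession

object OutermostLimitSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[2]").appName("limit-sketch").getOrCreate()
    val df = spark.range(0, 1000, 1, 10).limit(10)
    // Collected directly, the limit runs take-style on the driver: no exchange.
    df.collect()
    // Converted to an RDD first, the limit is planned with a shuffle down to a
    // single partition before rows can be taken.
    df.rdd.collect()
    spark.stop()
  }
}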
@@ -193,11 +194,24 @@ object SparkDatasetHelper {
     val result = fun(plan)
     val finalPlanUpdate = DynMethods.builder("finalPlanUpdate")
       .hiddenImpl(adaptiveSparkPlanExec.getClass)
-      .build(adaptiveSparkPlanExec)
+      .build()
+    finalPlanUpdate.invoke[Unit](adaptiveSparkPlanExec)
     result
   }
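Note the fix in this hunk: the handle is now built unbound and the receiver is supplied at invocation time. For readers unfamiliar with the DynMethods builder, a plain-reflection equivalent (hypothetical helper, not Kyuubi code) looks like:

// Invoke a private no-arg method by name on an arbitrary receiver, the
// plain-reflection analogue of hiddenImpl(...) + build() + invoke(receiver).
def invokeHidden(target: AnyRef, name: String): AnyRef = {
  val m = target.getClass.getDeclaredMethod(name) // getDeclaredMethod sees private members
  m.setAccessible(true)                           // lift the Java access check
  m.invoke(target)
}
// usage: invokeHidden(adaptiveSparkPlanExec, "finalPlanUpdate")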

+  /**
+   * Offset support was added in Spark 3.4 (see SPARK-28330). To ensure backward compatibility
+   * with earlier versions of Spark, this function calls `offset` reflectively and falls back
+   * to 0 when the method does not exist.
+   */
+  private def offset(collectLimitExec: CollectLimitExec): Int = {
+    val offset = DynMethods.builder("offset")
+      .impl(collectLimitExec.getClass)
+      .orNoop()
+      .build()
+    Option(offset.invoke[Int](collectLimitExec))
+      .getOrElse(0)
+  }
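The orNoop() above is what makes this safe on Spark < 3.4: the built handle degrades to a no-op that returns null, which Option(...).getOrElse(0) turns into a default. A self-contained sketch of the same fall-back-to-default pattern using plain reflection (illustrative names, not the PR's code):

import scala.util.Try

object OffsetFallbackSketch {
  // Call a public no-arg method if it exists on the target, else return default.
  def intOrDefault(target: AnyRef, method: String, default: Int): Int =
    Try(target.getClass.getMethod(method).invoke(target))
      .map(_.asInstanceOf[Int])
      .getOrElse(default)

  class NewStyle { def offset(): Int = 10 } // stand-in for Spark 3.4+ CollectLimitExec
  class OldStyle                            // stand-in for Spark < 3.4: no offset method

  def main(args: Array[String]): Unit = {
    assert(intOrDefault(new NewStyle, "offset", 0) == 10)
    assert(intOrDefault(new OldStyle, "offset", 0) == 0)
  }
}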

   /**
    * Refer to org.apache.spark.sql.Dataset#withAction(): assign a new execution id for the
    * arrow-based operation, so that we can track arrow-based queries on the UI tab.
@@ -204,6 +204,26 @@ class SparkArrowbasedOperationSuite extends WithSparkSQLEngine with SparkDataTyp
     }
   }

+  test("result offset support") {
+    assume(SPARK_ENGINE_RUNTIME_VERSION > "3.3")
+    var numStages = 0
+    val listener = new SparkListener {
+      override def onJobStart(jobStart: SparkListenerJobStart): Unit = {
+        numStages = jobStart.stageInfos.length
+      }
+    }
+    withJdbcStatement() { statement =>
+      withSparkListener(listener) {
+        withPartitionedTable("t_3") {
+          statement.executeQuery("select * from t_3 limit 10 offset 10")
+        }
+        KyuubiSparkContextHelper.waitListenerBus(spark)
+      }
+    }
+    // an extra shuffle is introduced when `offset` > 0
+    assert(numStages == 2)
+  }
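The assertion expects two stages because LIMIT ... OFFSET must drop the first rows globally, which forces a shuffle; the companion test below checks that the plain-limit path stays single-stage. The stage-counting technique can be reproduced outside the Kyuubi harness roughly as follows (assumed standalone setup; AQE disabled so the whole query runs as one job):

import org.apache.spark.scheduler.{SparkListener, SparkListenerJobStart}
import org.apache.spark.sql.SparkSession

object StageCountSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .master("local[2]")
      .appName("stage-count-sketch")
      .config("spark.sql.adaptive.enabled", "false")
      .getOrCreate()
    var numStages = 0
    spark.sparkContext.addSparkListener(new SparkListener {
      override def onJobStart(jobStart: SparkListenerJobStart): Unit =
        numStages = jobStart.stageInfos.length
    })
    // A grouped aggregation needs exactly one shuffle, hence a two-stage job.
    spark.range(0, 1000, 1, 10).groupBy("id").count().collect()
    Thread.sleep(1000) // crude wait for async listener delivery; the test above
                       // uses KyuubiSparkContextHelper.waitListenerBus(spark)
    println(s"stages: $numStages") // expected: 2
    spark.stop()
  }
}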

test("arrow serialization should not introduce extra shuffle for outermost limit") {
var numStages = 0
val listener = new SparkListener {