[ARROW] Arrow serialization should not introduce extra shuffle for outermost limit #4662

Closed
wants to merge 28 commits
Changes from 1 commit

Commits (28)
a584943
arrow take
cfmcgrady Mar 23, 2023
8593d85
driver slice last batch
cfmcgrady Mar 24, 2023
0088671
refine
cfmcgrady Mar 29, 2023
ed8c692
refactor
cfmcgrady Apr 3, 2023
4212a89
refactor and add ut
cfmcgrady Apr 4, 2023
6c5b1eb
add ut
cfmcgrady Apr 4, 2023
ee5a756
revert unnecessary changes
cfmcgrady Apr 4, 2023
4e7ca54
unnecessary changes
cfmcgrady Apr 4, 2023
885cf2c
infer row size by schema.defaultSize
cfmcgrady Apr 4, 2023
25e4f05
add docs
cfmcgrady Apr 4, 2023
03d0747
address comment
cfmcgrady Apr 6, 2023
2286afc
reflectively call AdaptiveSparkPlanExec.finalPhysicalPlan
cfmcgrady Apr 6, 2023
81886f0
address comment
cfmcgrady Apr 6, 2023
e3bf84c
refactor
cfmcgrady Apr 6, 2023
d70aee3
SparkPlan.session -> SparkSession.active to adapt Spark-3.1.x
cfmcgrady Apr 6, 2023
4cef204
SparkArrowbasedOperationSuite adapt Spark-3.1.x
cfmcgrady Apr 6, 2023
573a262
fix
cfmcgrady Apr 6, 2023
c83cf3f
SparkArrowbasedOperationSuite adapt Spark-3.1.x
cfmcgrady Apr 6, 2023
9ffb44f
make toBatchIterator private
cfmcgrady Apr 6, 2023
b72bc6f
add offset support to adapt Spark-3.4.x
cfmcgrady Apr 6, 2023
22cc70f
add ut
cfmcgrady Apr 6, 2023
8280783
add `isStaticConfigKey` to adapt Spark-3.1.x
cfmcgrady Apr 7, 2023
6d596fc
address comment
cfmcgrady Apr 7, 2023
6064ab9
limit = 0 test case
cfmcgrady Apr 7, 2023
3700839
SparkArrowbasedOperationSuite adapt Spark-3.1.x
cfmcgrady Apr 7, 2023
facc13f
exclude rule OptimizeLimitZero
cfmcgrady Apr 7, 2023
130bcb1
finally close
cfmcgrady Apr 7, 2023
82c912e
close vector
cfmcgrady Apr 8, 2023
add ut
cfmcgrady committed Apr 4, 2023
commit 6c5b1eb615b8d5e3232e4815c845a717df6926a5
@@ -20,7 +20,8 @@ package org.apache.kyuubi.engine.spark.operation
 import java.sql.Statement
 
 import org.apache.spark.KyuubiSparkContextHelper
-import org.apache.spark.sql.Row
+import org.apache.spark.scheduler.{SparkListener, SparkListenerJobStart}
+import org.apache.spark.sql.{Row, SparkSession}
 import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, Project}
 import org.apache.spark.sql.execution.{CollectLimitExec, QueryExecution}
 import org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanExec
@@ -143,8 +144,7 @@ class SparkArrowbasedOperationSuite extends WithSparkSQLEngine with SparkDataTyp
     assert(metrics("numOutputRows").value === 1)
   }
 
-  test("aa") {
-
+  test("SparkDatasetHelper.executeArrowBatchCollect should return expect row count") {
     val returnSize = Seq(
       7, // less than one partition
       10, // equal to one partition
@@ -202,6 +202,25 @@ class SparkArrowbasedOperationSuite extends WithSparkSQLEngine with SparkDataTyp
     }
   }
 
+  test("arrow serialization should not introduce extra shuffle for outermost limit") {
+    var numStages = 0
+    val listener = new SparkListener {
+      override def onJobStart(jobStart: SparkListenerJobStart): Unit = {
+        numStages = jobStart.stageInfos.length
+      }
+    }
+    withJdbcStatement() { statement =>
+      withSparkListener(listener) {
+        withPartitionedTable("t_3") {
+          statement.executeQuery("select * from t_3 limit 1000")
+        }
+        KyuubiSparkContextHelper.waitListenerBus(spark)
+      }
+    }
+    // Should be only one stage since there is no shuffle.
+    assert(numStages == 1)
+  }
+
   private def checkResultSetFormat(statement: Statement, expectFormat: String): Unit = {
     val query =
       s"""
@@ -241,4 +260,37 @@ class SparkArrowbasedOperationSuite extends WithSparkSQLEngine with SparkDataTyp
       .allSessions()
       .foreach(_.asInstanceOf[SparkSessionImpl].spark.listenerManager.unregister(listener))
   }
+
+  private def withSparkListener[T](listener: SparkListener)(body: => T): T = {
+    withAllSessions(s => s.sparkContext.addSparkListener(listener))
+    try {
+      body
+    } finally {
+      withAllSessions(s => s.sparkContext.removeSparkListener(listener))
+    }
+
+  }
+
+  private def withPartitionedTable[T](viewName: String)(body: => T): T = {
+    withAllSessions { spark =>
+      spark.range(0, 1000, 1, numPartitions = 100)
+        .createOrReplaceTempView(viewName)
+    }
+    try {
+      body
+    } finally {
+      withAllSessions { spark =>
+        spark.sql(s"DROP VIEW IF EXISTS $viewName")
+      }
+    }
+  }
+
+  private def withAllSessions(op: SparkSession => Unit): Unit = {
+    SparkSQLEngine.currentEngine.get
+      .backendService
+      .sessionManager
+      .allSessions()
+      .map(_.asInstanceOf[SparkSessionImpl].spark)
+      .foreach(op(_))
+  }
 }
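
A note on the "infer row size by schema.defaultSize" commit above: when slicing the last batch on the driver, the row budget per batch can be bounded by estimating a per-row byte size from the schema rather than measuring actual rows. A minimal sketch of that estimate, assuming a byte budget per batch (batchSizeBytes is an illustrative parameter, not a Kyuubi configuration key):

import org.apache.spark.sql.types.StructType

// StructType.defaultSize is Spark's rough per-row size estimate in bytes
// (e.g. an int column contributes 4 bytes, a string column 20).
def estimatedRowsPerBatch(schema: StructType, batchSizeBytes: Long): Int = {
  val rowSizeBytes = math.max(schema.defaultSize, 1)
  math.max((batchSizeBytes / rowSizeBytes).toInt, 1)
}

With a 4 MiB budget and a 24-byte row estimate, for example, a slice would be capped near 174,000 rows.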