Skip to content

Commit b5241c9

Browse files
imback82cloud-fan
authored andcommitted
[SPARK-34701][SQL] Introduce AnalysisOnlyCommand that allows its children to be removed once the command is marked as analyzed
### What changes were proposed in this pull request? This PR proposes to introduce the `AnalysisOnlyCommand` trait such that a command that extends this trait can have its children only analyzed, but not optimized. There is a corresponding analysis rule `HandleAnalysisOnlyCommand` that marks the command as analyzed after all other analysis rules are run. This can be useful if a logical plan has children where they need to be only analyzed, but not optimized - e.g., `CREATE VIEW` or `CACHE TABLE AS`. This also addresses the issue found in #31933. This PR also updates `CreateViewCommand`, `CacheTableAsSelect`, and `AlterViewAsCommand` to use the new trait / rule such that their children are only analyzed. ### Why are the changes needed? To address the issue where the plan is unnecessarily re-analyzed in `CreateViewCommand`. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? Existing tests should cover the changes. Closes #32032 from imback82/skip_transform. Authored-by: Terry Kim <yuminkim@gmail.com> Signed-off-by: Wenchen Fan <wenchen@databricks.com>
1 parent 816f6dd commit b5241c9

File tree

9 files changed

+102
-42
lines changed

9 files changed

+102
-42
lines changed

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -309,7 +309,9 @@ class Analyzer(override val catalogManager: CatalogManager)
309309
Batch("Subquery", Once,
310310
UpdateOuterReferences),
311311
Batch("Cleanup", fixedPoint,
312-
CleanupAliases)
312+
CleanupAliases),
313+
Batch("HandleAnalysisOnlyCommand", Once,
314+
HandleAnalysisOnlyCommand)
313315
)
314316

315317
/**
@@ -3543,6 +3545,18 @@ class Analyzer(override val catalogManager: CatalogManager)
35433545
}
35443546
}
35453547
}
3548+
3549+
/**
3550+
* A rule that marks a command as analyzed so that its children are removed to avoid
3551+
* being optimized. This rule should run after all other analysis rules are run.
3552+
*/
3553+
object HandleAnalysisOnlyCommand extends Rule[LogicalPlan] {
3554+
override def apply(plan: LogicalPlan): LogicalPlan = plan resolveOperators {
3555+
case c: AnalysisOnlyCommand if c.resolved =>
3556+
checkAnalysis(c)
3557+
c.markAsAnalyzed()
3558+
}
3559+
}
35463560
}
35473561

35483562
/**

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/Command.scala

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,3 +37,14 @@ trait Command extends LogicalPlan {
3737
trait LeafCommand extends Command with LeafLike[LogicalPlan]
3838
trait UnaryCommand extends Command with UnaryLike[LogicalPlan]
3939
trait BinaryCommand extends Command with BinaryLike[LogicalPlan]
40+
41+
/**
42+
* A logical node that can be used for a command that requires its children to be only analyzed,
43+
* but not optimized.
44+
*/
45+
trait AnalysisOnlyCommand extends Command {
46+
val isAnalyzed: Boolean
47+
def childrenToAnalyze: Seq[LogicalPlan]
48+
override final def children: Seq[LogicalPlan] = if (isAnalyzed) Nil else childrenToAnalyze
49+
def markAsAnalyzed(): LogicalPlan
50+
}

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/v2Commands.scala

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1022,7 +1022,18 @@ case class CacheTableAsSelect(
10221022
plan: LogicalPlan,
10231023
originalText: String,
10241024
isLazy: Boolean,
1025-
options: Map[String, String]) extends LeafCommand
1025+
options: Map[String, String],
1026+
isAnalyzed: Boolean = false) extends AnalysisOnlyCommand {
1027+
override protected def withNewChildrenInternal(
1028+
newChildren: IndexedSeq[LogicalPlan]): CacheTableAsSelect = {
1029+
assert(!isAnalyzed)
1030+
copy(plan = newChildren.head)
1031+
}
1032+
1033+
override def childrenToAnalyze: Seq[LogicalPlan] = plan :: Nil
1034+
1035+
override def markAsAnalyzed(): LogicalPlan = copy(isAnalyzed = true)
1036+
}
10261037

10271038
/**
10281039
* The logical plan of the UNCACHE TABLE command.

sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3370,10 +3370,11 @@ class Dataset[T] private[sql](
33703370
comment = None,
33713371
properties = Map.empty,
33723372
originalText = None,
3373-
child = logicalPlan,
3373+
plan = logicalPlan,
33743374
allowExisting = false,
33753375
replace = replace,
3376-
viewType = viewType)
3376+
viewType = viewType,
3377+
isAnalyzed = true)
33773378
}
33783379

33793380
/**

sql/core/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveSessionCatalog.scala

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -474,15 +474,15 @@ class ResolveSessionCatalog(val catalogManager: CatalogManager)
474474
case SetTableLocation(ResolvedV1TableIdentifier(ident), partitionSpec, location) =>
475475
AlterTableSetLocationCommand(ident.asTableIdentifier, partitionSpec, location)
476476

477-
case AlterViewAs(ResolvedView(ident, _), originalText, query) if query.resolved =>
477+
case AlterViewAs(ResolvedView(ident, _), originalText, query) =>
478478
AlterViewAsCommand(
479479
ident.asTableIdentifier,
480480
originalText,
481481
query)
482482

483483
case CreateViewStatement(
484484
tbl, userSpecifiedColumns, comment, properties,
485-
originalText, child, allowExisting, replace, viewType) if child.resolved =>
485+
originalText, child, allowExisting, replace, viewType) =>
486486

487487
val v1TableName = if (viewType != PersistedView) {
488488
// temp view doesn't belong to any catalog and we shouldn't resolve catalog in the name.
@@ -491,15 +491,15 @@ class ResolveSessionCatalog(val catalogManager: CatalogManager)
491491
parseV1Table(tbl, "CREATE VIEW")
492492
}
493493
CreateViewCommand(
494-
v1TableName.asTableIdentifier,
495-
userSpecifiedColumns,
496-
comment,
497-
properties,
498-
originalText,
499-
child,
500-
allowExisting,
501-
replace,
502-
viewType)
494+
name = v1TableName.asTableIdentifier,
495+
userSpecifiedColumns = userSpecifiedColumns,
496+
comment = comment,
497+
properties = properties,
498+
originalText = originalText,
499+
plan = child,
500+
allowExisting = allowExisting,
501+
replace = replace,
502+
viewType = viewType)
503503

504504
case ShowViews(resolved: ResolvedNamespace, pattern, output) =>
505505
resolved match {

sql/core/src/main/scala/org/apache/spark/sql/execution/command/views.scala

Lines changed: 35 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ import org.apache.spark.sql.catalyst.analysis.{GlobalTempView, LocalTempView, Pe
2929
import org.apache.spark.sql.catalyst.catalog.{CatalogStorageFormat, CatalogTable, CatalogTableType, SessionCatalog, TemporaryViewRelation}
3030
import org.apache.spark.sql.catalyst.expressions.{Alias, Attribute, SubqueryExpression, UserDefinedExpression}
3131
import org.apache.spark.sql.catalyst.plans.QueryPlan
32-
import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, Project, View}
32+
import org.apache.spark.sql.catalyst.plans.logical.{AnalysisOnlyCommand, LogicalPlan, Project, View}
3333
import org.apache.spark.sql.catalyst.util.CharVarcharUtils
3434
import org.apache.spark.sql.connector.catalog.CatalogV2Implicits.NamespaceHelper
3535
import org.apache.spark.sql.internal.{SQLConf, StaticSQLConf}
@@ -48,29 +48,41 @@ import org.apache.spark.sql.util.SchemaUtils
4848
* @param properties the properties of this view.
4949
* @param originalText the original SQL text of this view, can be None if this view is created via
5050
* Dataset API.
51-
* @param child the logical plan that represents the view; this is used to generate the logical
52-
* plan for temporary view and the view schema.
51+
* @param plan the logical plan that represents the view; this is used to generate the logical
52+
* plan for temporary view and the view schema.
5353
* @param allowExisting if true, and if the view already exists, noop; if false, and if the view
5454
* already exists, throws analysis exception.
5555
* @param replace if true, and if the view already exists, updates it; if false, and if the view
5656
* already exists, throws analysis exception.
5757
* @param viewType the expected view type to be created with this command.
58+
* @param isAnalyzed whether this command is analyzed or not.
5859
*/
5960
case class CreateViewCommand(
6061
name: TableIdentifier,
6162
userSpecifiedColumns: Seq[(String, Option[String])],
6263
comment: Option[String],
6364
properties: Map[String, String],
6465
originalText: Option[String],
65-
child: LogicalPlan,
66+
plan: LogicalPlan,
6667
allowExisting: Boolean,
6768
replace: Boolean,
68-
viewType: ViewType)
69-
extends LeafRunnableCommand {
69+
viewType: ViewType,
70+
isAnalyzed: Boolean = false) extends RunnableCommand with AnalysisOnlyCommand {
7071

7172
import ViewHelper._
7273

73-
override def innerChildren: Seq[QueryPlan[_]] = Seq(child)
74+
override protected def withNewChildrenInternal(
75+
newChildren: IndexedSeq[LogicalPlan]): CreateViewCommand = {
76+
assert(!isAnalyzed)
77+
copy(plan = newChildren.head)
78+
}
79+
80+
override def innerChildren: Seq[QueryPlan[_]] = Seq(plan)
81+
82+
// `plan` needs to be analyzed, but shouldn't be optimized so that caching works correctly.
83+
override def childrenToAnalyze: Seq[LogicalPlan] = plan :: Nil
84+
85+
def markAsAnalyzed(): LogicalPlan = copy(isAnalyzed = true)
7486

7587
if (viewType == PersistedView) {
7688
require(originalText.isDefined, "'originalText' must be provided to create permanent view")
@@ -96,10 +108,10 @@ case class CreateViewCommand(
96108
}
97109

98110
override def run(sparkSession: SparkSession): Seq[Row] = {
99-
// If the plan cannot be analyzed, throw an exception and don't proceed.
100-
val qe = sparkSession.sessionState.executePlan(child)
101-
qe.assertAnalyzed()
102-
val analyzedPlan = qe.analyzed
111+
if (!isAnalyzed) {
112+
throw new AnalysisException("The logical plan that represents the view is not analyzed.")
113+
}
114+
val analyzedPlan = plan
103115

104116
if (userSpecifiedColumns.nonEmpty &&
105117
userSpecifiedColumns.length != analyzedPlan.output.length) {
@@ -233,12 +245,23 @@ case class CreateViewCommand(
233245
case class AlterViewAsCommand(
234246
name: TableIdentifier,
235247
originalText: String,
236-
query: LogicalPlan) extends LeafRunnableCommand {
248+
query: LogicalPlan,
249+
isAnalyzed: Boolean = false) extends RunnableCommand with AnalysisOnlyCommand {
237250

238251
import ViewHelper._
239252

253+
override protected def withNewChildrenInternal(
254+
newChildren: IndexedSeq[LogicalPlan]): AlterViewAsCommand = {
255+
assert(!isAnalyzed)
256+
copy(query = newChildren.head)
257+
}
258+
240259
override def innerChildren: Seq[QueryPlan[_]] = Seq(query)
241260

261+
override def childrenToAnalyze: Seq[LogicalPlan] = query :: Nil
262+
263+
def markAsAnalyzed(): LogicalPlan = copy(isAnalyzed = true)
264+
242265
override def run(session: SparkSession): Seq[Row] = {
243266
if (session.sessionState.catalog.isTempView(name)) {
244267
alterTemporaryView(session, query)

sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/CacheTableExec.scala

Lines changed: 13 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -94,19 +94,19 @@ case class CacheTableAsSelectExec(
9494
override lazy val relationName: String = tempViewName
9595

9696
override lazy val planToCache: LogicalPlan = {
97-
Dataset.ofRows(sparkSession,
98-
CreateViewCommand(
99-
name = TableIdentifier(tempViewName),
100-
userSpecifiedColumns = Nil,
101-
comment = None,
102-
properties = Map.empty,
103-
originalText = Some(originalText),
104-
child = query,
105-
allowExisting = false,
106-
replace = false,
107-
viewType = LocalTempView
108-
)
109-
)
97+
CreateViewCommand(
98+
name = TableIdentifier(tempViewName),
99+
userSpecifiedColumns = Nil,
100+
comment = None,
101+
properties = Map.empty,
102+
originalText = Some(originalText),
103+
plan = query,
104+
allowExisting = false,
105+
replace = false,
106+
viewType = LocalTempView,
107+
isAnalyzed = true
108+
).run(sparkSession)
109+
110110
dataFrameForCachedPlan.logicalPlan
111111
}
112112

sql/core/src/test/resources/sql-tests/results/explain-aqe.sql.out

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -913,7 +913,7 @@ Execute CreateViewCommand (1)
913913
Output: []
914914

915915
(2) CreateViewCommand
916-
Arguments: `default`.`explain_view`, SELECT key, val FROM explain_temp1, false, false, PersistedView
916+
Arguments: `default`.`explain_view`, SELECT key, val FROM explain_temp1, false, false, PersistedView, true
917917

918918
(3) LogicalRelation
919919
Arguments: parquet, [key#x, val#x], CatalogTable(

sql/core/src/test/resources/sql-tests/results/explain.sql.out

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -858,7 +858,7 @@ Execute CreateViewCommand (1)
858858
Output: []
859859

860860
(2) CreateViewCommand
861-
Arguments: `default`.`explain_view`, SELECT key, val FROM explain_temp1, false, false, PersistedView
861+
Arguments: `default`.`explain_view`, SELECT key, val FROM explain_temp1, false, false, PersistedView, true
862862

863863
(3) LogicalRelation
864864
Arguments: parquet, [key#x, val#x], CatalogTable(

0 commit comments

Comments
 (0)