Skip to content

Commit a95360a

Browse files
committed
Split out Option[Map]
Signed-off-by: Karen Feng <karen.feng@databricks.com>
1 parent e44c683 commit a95360a

File tree

2 files changed

+40
-30
lines changed

2 files changed

+40
-30
lines changed

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/NestedColumnAliasing.scala

Lines changed: 38 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -24,9 +24,12 @@ import org.apache.spark.sql.types._
2424

2525
/**
2626
* This aims to handle a nested column aliasing pattern inside the [[ColumnPruning]] optimizer rule.
27-
* If a project or its child references to nested fields, and not all the fields
28-
* in a nested attribute are used, we can substitute them by alias attributes; then a project
29-
* of the nested fields as aliases on the children of the child will be created.
27+
* If:
28+
* - A [[Project]] or its child references nested fields
29+
* - Not all of the fields in a nested attribute are used
30+
* Then:
31+
* - Substitute the nested field references with alias attributes
32+
* - Add grandchild [[Project]]s transforming the nested fields to aliases
3033
*
3134
* Example 1: Project
3235
* ------------------
@@ -76,7 +79,7 @@ import org.apache.spark.sql.types._
7679
*/
7780
object NestedColumnAliasing {
7881

79-
def unapply(plan: LogicalPlan): Option[Map[Attribute, Seq[ExtractValue]]] = plan match {
82+
def unapply(plan: LogicalPlan): Option[LogicalPlan] = plan match {
8083
/**
8184
* This pattern is needed to support [[Filter]] plan cases like
8285
* [[Project]]->[[Filter]]->listed plan in [[canProjectPushThrough]] (e.g., [[Window]]).
@@ -85,25 +88,40 @@ object NestedColumnAliasing {
8588
*/
8689
case Project(projectList, Filter(condition, child)) if
8790
SQLConf.get.nestedSchemaPruningEnabled && canProjectPushThrough(child) =>
88-
getAttributeToExtractValues(
89-
projectList ++ Seq(condition) ++ child.expressions, child.producedAttributes.toSeq)
91+
rewritePlanIfSubsetFieldsUsed(
92+
plan, projectList ++ Seq(condition) ++ child.expressions, child.producedAttributes.toSeq)
9093

9194
case Project(projectList, child) if
9295
SQLConf.get.nestedSchemaPruningEnabled && canProjectPushThrough(child) =>
93-
getAttributeToExtractValues(
94-
projectList ++ child.expressions, child.producedAttributes.toSeq)
96+
rewritePlanIfSubsetFieldsUsed(
97+
plan, projectList ++ child.expressions, child.producedAttributes.toSeq)
9598

9699
case p if SQLConf.get.nestedSchemaPruningEnabled && canPruneOn(p) =>
97-
getAttributeToExtractValues(
98-
p.expressions, p.producedAttributes.toSeq)
100+
rewritePlanIfSubsetFieldsUsed(
101+
plan, p.expressions, p.producedAttributes.toSeq)
99102

100103
case _ => None
101104
}
102105

106+
/**
107+
* Rewrites a plan with aliases if only a subset of the nested fields are used.
108+
*/
109+
def rewritePlanIfSubsetFieldsUsed(
110+
plan: LogicalPlan,
111+
exprList: Seq[Expression],
112+
exclusiveAttrs: Seq[Attribute]): Option[LogicalPlan] = {
113+
val attrToExtractValues = getAttributeToExtractValues(exprList, exclusiveAttrs)
114+
if (attrToExtractValues.isEmpty) {
115+
None
116+
} else {
117+
Some(rewritePlanWithAliases(plan, attrToExtractValues))
118+
}
119+
}
120+
103121
/**
104122
* Replace nested columns to prune unused nested columns later.
105123
*/
106-
def replacePlanWithAliases(
124+
def rewritePlanWithAliases(
107125
plan: LogicalPlan,
108126
attributeToExtractValues: Map[Attribute, Seq[ExtractValue]]): LogicalPlan = {
109127
// Each expression can contain multiple nested fields.
@@ -218,20 +236,19 @@ object NestedColumnAliasing {
218236
}
219237

220238
/**
221-
* Creates a map from root [[Attribute]]s to non-redundant nested [[ExtractValue]]s in the
222-
* case that only a subset of the nested fields are used.
239+
* Creates a map from root [[Attribute]]s to non-redundant nested [[ExtractValue]]s.
223240
* Nested field accessors of `exclusiveAttrs` are not considered in nested field aliasing.
224241
*/
225242
def getAttributeToExtractValues(
226243
exprList: Seq[Expression],
227-
exclusiveAttrs: Seq[Attribute]): Option[Map[Attribute, Seq[ExtractValue]]] = {
244+
exclusiveAttrs: Seq[Attribute]): Map[Attribute, Seq[ExtractValue]] = {
228245

229246
val nestedFieldReferences = exprList.flatMap(collectExtractValue)
230247
val otherRootReferences = exprList.flatMap(collectAttributeReference)
231248
val exclusiveAttrSet = AttributeSet(exclusiveAttrs ++ otherRootReferences)
232249

233250
// Remove cosmetic variations when we group extractors by their references
234-
val attributeToExtractValues = nestedFieldReferences
251+
nestedFieldReferences
235252
.filter(!_.references.subsetOf(exclusiveAttrSet))
236253
.groupBy(_.references.head.canonicalized.asInstanceOf[Attribute])
237254
.flatMap { case (attr: Attribute, nestedFields: Seq[ExtractValue]) =>
@@ -258,12 +275,6 @@ object NestedColumnAliasing {
258275
None
259276
}
260277
}
261-
262-
if (attributeToExtractValues.isEmpty) {
263-
None
264-
} else {
265-
Some(attributeToExtractValues)
266-
}
267278
}
268279

269280
/**
@@ -281,11 +292,10 @@ object NestedColumnAliasing {
281292
}
282293

283294
/**
284-
* This prunes unnecessary nested columns from [[Generate]] and optional [[Project]] on top
285-
* of it.
295+
* This prunes unnecessary nested columns from [[Generate]], or [[Project]] -> [[Generate]]
286296
*/
287297
object GeneratorNestedColumnAliasing {
288-
def unapply(plan: LogicalPlan): Option[Map[Attribute, Seq[ExtractValue]]] = plan match {
298+
def unapply(plan: LogicalPlan): Option[LogicalPlan] = plan match {
289299
// When either `nestedPruningOnExpressions` or `nestedSchemaPruningEnabled` is enabled, we
290300
// need to prune nested columns through Project and under Generate. The difference is
291301
// when `nestedSchemaPruningEnabled` is on, nested columns will be pruned further at
@@ -294,17 +304,17 @@ object GeneratorNestedColumnAliasing {
294304
SQLConf.get.nestedSchemaPruningEnabled) && canPruneGenerator(g.generator) =>
295305
// On top of `Generate`, a `Project` that might have nested column accessors.
296306
// We try to get alias maps for both project list and generator's children expressions.
297-
NestedColumnAliasing.getAttributeToExtractValues(
298-
projectList ++ g.generator.children, g.qualifiedGeneratorOutput)
307+
NestedColumnAliasing.rewritePlanIfSubsetFieldsUsed(
308+
plan, projectList ++ g.generator.children, g.qualifiedGeneratorOutput)
299309

300310
case g: Generate if SQLConf.get.nestedSchemaPruningEnabled &&
301311
canPruneGenerator(g.generator) =>
302312
// If any child output is required by higher projection, we cannot prune on it even if we
303313
// only use part of nested column of it. A required child output means it is referred
304314
// as a whole or partially by higher projection, pruning it here will cause unresolved
305315
// query plan.
306-
NestedColumnAliasing.getAttributeToExtractValues(
307-
g.generator.children, g.requiredChildOutput)
316+
NestedColumnAliasing.rewritePlanIfSubsetFieldsUsed(
317+
plan, g.generator.children, g.requiredChildOutput)
308318

309319
case _ =>
310320
None

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -783,7 +783,7 @@ object ColumnPruning extends Rule[LogicalPlan] {
783783
p.copy(child = g.copy(child = newChild, unrequiredChildIndex = unrequiredIndices))
784784

785785
// prune unrequired nested fields from `Generate`.
786-
case p @ GeneratorNestedColumnAliasing(a) => NestedColumnAliasing.replacePlanWithAliases(p, a)
786+
case p @ GeneratorNestedColumnAliasing(rewrittenPlan) => rewrittenPlan
787787

788788
// Eliminate unneeded attributes from right side of a Left Existence Join.
789789
case j @ Join(_, right, LeftExistence(_), _, _) =>
@@ -817,7 +817,7 @@ object ColumnPruning extends Rule[LogicalPlan] {
817817
// Can't prune the columns on LeafNode
818818
case p @ Project(_, _: LeafNode) => p
819819

820-
case p @ NestedColumnAliasing(a) => NestedColumnAliasing.replacePlanWithAliases(p, a)
820+
case NestedColumnAliasing(rewrittenPlan) => rewrittenPlan
821821

822822
// for all other logical plans that inherit the output from their children
823823
// Project over project is handled by the first case, skip it here.

0 commit comments

Comments
 (0)