Skip to content

Commit e272123

Browse files
chenghao-intelyhuai
authored andcommitted
[SPARK-8972] [SQL] Incorrect result for rollup
We don't support the complex expression keys in the rollup/cube, and we even will not report it if we have the complex group by keys, that will cause very confusing/incorrect result. e.g. `SELECT key%100 FROM src GROUP BY key %100 with ROLLUP` This PR adds an additional project during the analyzing for the complex GROUP BY keys, and that projection will be the child of `Expand`, so to `Expand`, the GROUP BY KEY are always the simple key(attribute names). Author: Cheng Hao <hao.cheng@intel.com> Closes #7343 from chenghao-intel/expand and squashes the following commits: 1ebbb59 [Cheng Hao] update the comment 827873f [Cheng Hao] update as feedback 34def69 [Cheng Hao] Add more unit test and comments c695760 [Cheng Hao] fix bug of incorrect result for rollup
1 parent ba33096 commit e272123

8 files changed

+145
-3
lines changed

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala

Lines changed: 39 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -194,16 +194,52 @@ class Analyzer(
194194
}
195195

196196
def apply(plan: LogicalPlan): LogicalPlan = plan transform {
197+
case a if !a.childrenResolved => a // be sure all of the children are resolved.
197198
case a: Cube =>
198199
GroupingSets(bitmasks(a), a.groupByExprs, a.child, a.aggregations)
199200
case a: Rollup =>
200201
GroupingSets(bitmasks(a), a.groupByExprs, a.child, a.aggregations)
201202
case x: GroupingSets =>
202203
val gid = AttributeReference(VirtualColumn.groupingIdName, IntegerType, false)()
204+
// We will insert another Projection if the GROUP BY keys contains the
205+
// non-attribute expressions. And the top operators can references those
206+
// expressions by its alias.
207+
// e.g. SELECT key%5 as c1 FROM src GROUP BY key%5 ==>
208+
// SELECT a as c1 FROM (SELECT key%5 AS a FROM src) GROUP BY a
209+
210+
// find all of the non-attribute expressions in the GROUP BY keys
211+
val nonAttributeGroupByExpressions = new ArrayBuffer[Alias]()
212+
213+
// The pair of (the original GROUP BY key, associated attribute)
214+
val groupByExprPairs = x.groupByExprs.map(_ match {
215+
case e: NamedExpression => (e, e.toAttribute)
216+
case other => {
217+
val alias = Alias(other, other.toString)()
218+
nonAttributeGroupByExpressions += alias // add the non-attributes expression alias
219+
(other, alias.toAttribute)
220+
}
221+
})
222+
223+
// substitute the non-attribute expressions for aggregations.
224+
val aggregation = x.aggregations.map(expr => expr.transformDown {
225+
case e => groupByExprPairs.find(_._1.semanticEquals(e)).map(_._2).getOrElse(e)
226+
}.asInstanceOf[NamedExpression])
227+
228+
// substitute the group by expressions.
229+
val newGroupByExprs = groupByExprPairs.map(_._2)
230+
231+
val child = if (nonAttributeGroupByExpressions.length > 0) {
232+
// insert additional projection if contains the
233+
// non-attribute expressions in the GROUP BY keys
234+
Project(x.child.output ++ nonAttributeGroupByExpressions, x.child)
235+
} else {
236+
x.child
237+
}
238+
203239
Aggregate(
204-
x.groupByExprs :+ VirtualColumn.groupingIdAttribute,
205-
x.aggregations,
206-
Expand(x.bitmasks, x.groupByExprs, gid, x.child))
240+
newGroupByExprs :+ VirtualColumn.groupingIdAttribute,
241+
aggregation,
242+
Expand(x.bitmasks, newGroupByExprs, gid, child))
207243
}
208244
}
209245

Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
500 NULL 0
2+
91 0 1
3+
84 1 1
4+
105 2 1
5+
113 3 1
6+
107 4 1
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
1 NULL -3 2
2+
1 NULL -1 2
3+
1 NULL 3 2
4+
1 NULL 4 2
5+
1 NULL 5 2
6+
1 NULL 6 2
7+
1 NULL 12 2
8+
1 NULL 14 2
9+
1 NULL 15 2
10+
1 NULL 22 2
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
1 NULL -3 2
2+
1 NULL -1 2
3+
1 NULL 3 2
4+
1 NULL 4 2
5+
1 NULL 5 2
6+
1 NULL 6 2
7+
1 NULL 12 2
8+
1 NULL 14 2
9+
1 NULL 15 2
10+
1 NULL 22 2
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
500 NULL 0
2+
91 0 1
3+
84 1 1
4+
105 2 1
5+
113 3 1
6+
107 4 1
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
1 0 5 3
2+
1 0 15 3
3+
1 0 25 3
4+
1 0 60 3
5+
1 0 75 3
6+
1 0 80 3
7+
1 0 100 3
8+
1 0 140 3
9+
1 0 145 3
10+
1 0 150 3
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
1 0 5 3
2+
1 0 15 3
3+
1 0 25 3
4+
1 0 60 3
5+
1 0 75 3
6+
1 0 80 3
7+
1 0 100 3
8+
1 0 140 3
9+
1 0 145 3
10+
1 0 150 3

sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,60 @@ class HiveQuerySuite extends HiveComparisonTest with BeforeAndAfter {
8585
}
8686
}
8787

88+
createQueryTest("SPARK-8976 Wrong Result for Rollup #1",
89+
"""
90+
SELECT count(*) AS cnt, key % 5,GROUPING__ID FROM src group by key%5 WITH ROLLUP
91+
""".stripMargin)
92+
93+
createQueryTest("SPARK-8976 Wrong Result for Rollup #2",
94+
"""
95+
SELECT
96+
count(*) AS cnt,
97+
key % 5 as k1,
98+
key-5 as k2,
99+
GROUPING__ID as k3
100+
FROM src group by key%5, key-5
101+
WITH ROLLUP ORDER BY cnt, k1, k2, k3 LIMIT 10
102+
""".stripMargin)
103+
104+
createQueryTest("SPARK-8976 Wrong Result for Rollup #3",
105+
"""
106+
SELECT
107+
count(*) AS cnt,
108+
key % 5 as k1,
109+
key-5 as k2,
110+
GROUPING__ID as k3
111+
FROM (SELECT key, key%2, key - 5 FROM src) t group by key%5, key-5
112+
WITH ROLLUP ORDER BY cnt, k1, k2, k3 LIMIT 10
113+
""".stripMargin)
114+
115+
createQueryTest("SPARK-8976 Wrong Result for CUBE #1",
116+
"""
117+
SELECT count(*) AS cnt, key % 5,GROUPING__ID FROM src group by key%5 WITH CUBE
118+
""".stripMargin)
119+
120+
createQueryTest("SPARK-8976 Wrong Result for CUBE #2",
121+
"""
122+
SELECT
123+
count(*) AS cnt,
124+
key % 5 as k1,
125+
key-5 as k2,
126+
GROUPING__ID as k3
127+
FROM (SELECT key, key%2, key - 5 FROM src) t group by key%5, key-5
128+
WITH CUBE ORDER BY cnt, k1, k2, k3 LIMIT 10
129+
""".stripMargin)
130+
131+
createQueryTest("SPARK-8976 Wrong Result for GroupingSet",
132+
"""
133+
SELECT
134+
count(*) AS cnt,
135+
key % 5 as k1,
136+
key-5 as k2,
137+
GROUPING__ID as k3
138+
FROM (SELECT key, key%2, key - 5 FROM src) t group by key%5, key-5
139+
GROUPING SETS (key%5, key-5) ORDER BY cnt, k1, k2, k3 LIMIT 10
140+
""".stripMargin)
141+
88142
createQueryTest("insert table with generator with column name",
89143
"""
90144
| CREATE TABLE gen_tmp (key Int);

0 commit comments

Comments
 (0)