Skip to content

Commit c3cda49

Browse files
willb authored and rxin committed
SPARK-2180: support HAVING clauses in Hive queries
This PR extends Spark's HiveQL support to handle HAVING clauses in aggregations. The HAVING test from the Hive compatibility suite doesn't appear to be runnable from within Spark, so I added a simple comparable test to `HiveQuerySuite`.

Author: William Benton <willb@redhat.com>

Closes #1136 from willb/SPARK-2180 and squashes the following commits:

3bbaf26 [William Benton] Added casts to HAVING expressions
83f1340 [William Benton] scalastyle fixes
18387f1 [William Benton] Add test for HAVING without GROUP BY
b880bef [William Benton] Added semantic error for HAVING without GROUP BY
942428e [William Benton] Added test coverage for SPARK-2180.
56084cc [William Benton] Add support for HAVING clauses in Hive queries.

(cherry picked from commit 171ebb3)
Signed-off-by: Reynold Xin <rxin@apache.org>
1 parent b1ea9e5 commit c3cda49

File tree

2 files changed

+53
-6
lines changed

2 files changed

+53
-6
lines changed

sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala

Lines changed: 24 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -204,6 +204,9 @@ private[hive] object HiveQl {
204204
/**
 * Exception whose message embeds the SQL text that failed to parse and which wraps the
 * underlying `cause`.
 */
class ParseException(sql: String, cause: Throwable)
205205
extends Exception(s"Failed to parse: $sql", cause)
206206

207+
/**
 * Signals a query that is semantically invalid; the message is prefixed with
 * "Error in semantic analysis: ".
 */
class SemanticException(msg: String)
208+
extends Exception(s"Error in semantic analysis: $msg")
209+
207210
/**
208211
* Returns the AST for the given SQL string.
209212
*/
@@ -480,6 +483,7 @@ private[hive] object HiveQl {
480483
whereClause ::
481484
groupByClause ::
482485
orderByClause ::
486+
havingClause ::
483487
sortByClause ::
484488
clusterByClause ::
485489
distributeByClause ::
@@ -494,6 +498,7 @@ private[hive] object HiveQl {
494498
"TOK_WHERE",
495499
"TOK_GROUPBY",
496500
"TOK_ORDERBY",
501+
"TOK_HAVING",
497502
"TOK_SORTBY",
498503
"TOK_CLUSTERBY",
499504
"TOK_DISTRIBUTEBY",
@@ -576,21 +581,34 @@ private[hive] object HiveQl {
576581
val withDistinct =
577582
if (selectDistinctClause.isDefined) Distinct(withProject) else withProject
578583

584+
// HAVING becomes a Filter on top of the projected (and optionally de-duplicated) plan; when
// no HAVING clause is present the plan is passed through unchanged.
val withHaving = havingClause.map { having =>
  // HAVING without GROUP BY is reported as a semantic error rather than silently accepted.
  if (groupByClause.isEmpty) {
    throw new SemanticException("HAVING specified without GROUP BY")
  }

  val havingExpr = having.getChildren.toSeq match {
    case Seq(hexpr) => nodeToExpr(hexpr)
    // Fail with a descriptive error instead of an opaque MatchError on an unexpected AST shape.
    case other =>
      throw new SemanticException(
        s"HAVING expects exactly one condition expression, found ${other.size}")
  }

  // The cast lets a non-boolean HAVING expression be used as the filter condition.
  Filter(Cast(havingExpr, BooleanType), withDistinct)
}.getOrElse(withDistinct)
596+
579597
val withSort =
580598
(orderByClause, sortByClause, distributeByClause, clusterByClause) match {
581599
case (Some(totalOrdering), None, None, None) =>
582-
Sort(totalOrdering.getChildren.map(nodeToSortOrder), withDistinct)
600+
Sort(totalOrdering.getChildren.map(nodeToSortOrder), withHaving)
583601
case (None, Some(perPartitionOrdering), None, None) =>
584-
SortPartitions(perPartitionOrdering.getChildren.map(nodeToSortOrder), withDistinct)
602+
SortPartitions(perPartitionOrdering.getChildren.map(nodeToSortOrder), withHaving)
585603
case (None, None, Some(partitionExprs), None) =>
586-
Repartition(partitionExprs.getChildren.map(nodeToExpr), withDistinct)
604+
Repartition(partitionExprs.getChildren.map(nodeToExpr), withHaving)
587605
case (None, Some(perPartitionOrdering), Some(partitionExprs), None) =>
588606
SortPartitions(perPartitionOrdering.getChildren.map(nodeToSortOrder),
589-
Repartition(partitionExprs.getChildren.map(nodeToExpr), withDistinct))
607+
Repartition(partitionExprs.getChildren.map(nodeToExpr), withHaving))
590608
case (None, None, None, Some(clusterExprs)) =>
591609
SortPartitions(clusterExprs.getChildren.map(nodeToExpr).map(SortOrder(_, Ascending)),
592-
Repartition(clusterExprs.getChildren.map(nodeToExpr), withDistinct))
593-
case (None, None, None, None) => withDistinct
610+
Repartition(clusterExprs.getChildren.map(nodeToExpr), withHaving))
611+
case (None, None, None, None) => withHaving
594612
case _ => sys.error("Unsupported set of ordering / distribution clauses.")
595613
}
596614

sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala

Lines changed: 29 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -224,6 +224,32 @@ class HiveQuerySuite extends HiveComparisonTest {
224224
TestHive.reset()
225225
}
226226

227+
test("SPARK-2180: HAVING support in GROUP BY clauses (positive)") {
  // Fixture rows: `key` is the position in the list, `value` the group, `attr` the measure.
  val fixture = List(("foo", 2), ("bar", 1), ("foo", 4), ("bar", 3)).zipWithIndex.map {
    case ((value, attr), key) => HavingRow(key, value, attr)
  }

  TestHive.sparkContext.parallelize(fixture).registerAsTable("having_test")

  val query = "SELECT value, max(attr) AS attr FROM having_test GROUP BY value HAVING attr > 3"
  val results = hql(query).collect().map(row => (row.getString(0), row.getInt(1)))

  // Only the "foo" group has a max(attr) above 3.
  assert(results === Array(("foo", 4)))

  TestHive.reset()
}
242+
243+
test("SPARK-2180: HAVING without GROUP BY raises exception") {
  // The query has a HAVING clause but no GROUP BY, so running it must throw.
  val query = "SELECT value, attr FROM having_test HAVING attr > 3"
  intercept[Exception] {
    hql(query)
  }
}
248+
249+
test("SPARK-2180: HAVING with non-boolean clause raises no exceptions") {
  // `c` is a count rather than a boolean; the test passes as long as running the query
  // does not throw, so the collected rows are deliberately not bound to a name.
  hql("select key, count(*) c from src group by key having c").collect()
}
252+
227253
test("Query Hive native command execution result") {
228254
val tableName = "test_native_commands"
229255

@@ -441,3 +467,6 @@ class HiveQuerySuite extends HiveComparisonTest {
441467
// since they modify /clear stuff.
442468

443469
}
470+
471+
// for SPARK-2180 test
472+
/**
 * Row type for the `having_test` fixture table used by the SPARK-2180 HAVING test:
 * `key` is the row's index in the fixture list, `value` is the GROUP BY column and
 * `attr` is the column aggregated with `max`.
 */
case class HavingRow(key: Int, value: String, attr: Int)

0 commit comments

Comments
 (0)