Commit aa4ca8b

[SQL] Improve error messages

Author: Michael Armbrust <michael@databricks.com>
Author: wangfei <wangfei1@huawei.com>

Closes apache#4558 from marmbrus/errorMessages and squashes the following commits:

  5e5ab50 [Michael Armbrust] Merge pull request alteryx#15 from scwf/errorMessages
  fa38881 [wangfei] fix for grouping__id
  f279a71 [wangfei] make right references for ScriptTransformation
  d29fbde [Michael Armbrust] extra case
  1a797b4 [Michael Armbrust] comments
  d4e9015 [Michael Armbrust] add comment
  af9e668 [Michael Armbrust] no braces
  34eb3a4 [Michael Armbrust] more work
  6197cd5 [Michael Armbrust] [SQL] Better error messages for analysis failures

1 parent 6a1be02

File tree: 14 files changed (+164, -103 lines)

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala (+71, -52)

@@ -23,7 +23,7 @@ import org.apache.spark.sql.catalyst.errors.TreeNodeException
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.plans.logical._
 import org.apache.spark.sql.catalyst.rules._
-import org.apache.spark.sql.types.{ArrayType, StructField, StructType, IntegerType}
+import org.apache.spark.sql.types._

 /**
  * A trivial [[Analyzer]] with an [[EmptyCatalog]] and [[EmptyFunctionRegistry]]. Used for testing
@@ -66,32 +66,82 @@ class Analyzer(catalog: Catalog,
       typeCoercionRules ++
       extendedRules : _*),
     Batch("Check Analysis", Once,
-      CheckResolution ::
-      CheckAggregation ::
-      Nil: _*),
-    Batch("AnalysisOperators", fixedPoint,
-      EliminateAnalysisOperators)
+      CheckResolution),
+    Batch("Remove SubQueries", fixedPoint,
+      EliminateSubQueries)
   )

   /**
    * Makes sure all attributes and logical plans have been resolved.
    */
   object CheckResolution extends Rule[LogicalPlan] {
+    def failAnalysis(msg: String) = { throw new AnalysisException(msg) }
+
     def apply(plan: LogicalPlan): LogicalPlan = {
-      plan.transformUp {
-        case p if p.expressions.exists(!_.resolved) =>
-          val missing = p.expressions.filterNot(_.resolved).map(_.prettyString).mkString(",")
-          val from = p.inputSet.map(_.name).mkString("{", ", ", "}")
-
-          throw new AnalysisException(s"Cannot resolve '$missing' given input columns $from")
-        case p if !p.resolved && p.childrenResolved =>
-          throw new AnalysisException(s"Unresolved operator in the query plan ${p.simpleString}")
-      } match {
-        // As a backstop, use the root node to check that the entire plan tree is resolved.
-        case p if !p.resolved =>
-          throw new AnalysisException(s"Unresolved operator in the query plan ${p.simpleString}")
-        case p => p
+      // We transform up and order the rules so as to catch the first possible failure instead
+      // of the result of cascading resolution failures.
+      plan.foreachUp {
+        case operator: LogicalPlan =>
+          operator transformExpressionsUp {
+            case a: Attribute if !a.resolved =>
+              val from = operator.inputSet.map(_.name).mkString(", ")
+              failAnalysis(s"cannot resolve '${a.prettyString}' given input columns $from")
+
+            case c: Cast if !c.resolved =>
+              failAnalysis(
+                s"invalid cast from ${c.child.dataType.simpleString} to ${c.dataType.simpleString}")
+
+            case b: BinaryExpression if !b.resolved =>
+              failAnalysis(
+                s"invalid expression ${b.prettyString} " +
+                s"between ${b.left.simpleString} and ${b.right.simpleString}")
+          }
+
+          operator match {
+            case f: Filter if f.condition.dataType != BooleanType =>
+              failAnalysis(
+                s"filter expression '${f.condition.prettyString}' " +
+                s"of type ${f.condition.dataType.simpleString} is not a boolean.")
+
+            case aggregatePlan @ Aggregate(groupingExprs, aggregateExprs, child) =>
+              def checkValidAggregateExpression(expr: Expression): Unit = expr match {
+                case _: AggregateExpression => // OK
+                case e: Attribute if !groupingExprs.contains(e) =>
+                  failAnalysis(
+                    s"expression '${e.prettyString}' is neither present in the group by, " +
+                    s"nor is it an aggregate function. " +
+                    "Add to group by or wrap in first() if you don't care which value you get.")
+                case e if groupingExprs.contains(e) => // OK
+                case e if e.references.isEmpty => // OK
+                case e => e.children.foreach(checkValidAggregateExpression)
+              }
+
+              val cleaned = aggregateExprs.map(_.transform {
+                // Should trim aliases around `GetField`s. These aliases are introduced while
+                // resolving struct field accesses, because `GetField` is not a `NamedExpression`.
+                // (Should we just turn `GetField` into a `NamedExpression`?)
+                case Alias(g, _) => g
+              })
+
+              cleaned.foreach(checkValidAggregateExpression)
+
+            case o if o.children.nonEmpty &&
+              !o.references.filter(_.name != "grouping__id").subsetOf(o.inputSet) =>
+              val missingAttributes = (o.references -- o.inputSet).map(_.prettyString).mkString(",")
+              val input = o.inputSet.map(_.prettyString).mkString(",")
+
+              failAnalysis(s"resolved attributes $missingAttributes missing from $input")
+
+            // Catch all
+            case o if !o.resolved =>
+              failAnalysis(
+                s"unresolved operator ${operator.simpleString}")
+
+            case _ => // Analysis successful!
+          }
       }
+
+      plan
     }
   }

@@ -192,45 +242,14 @@ class Analyzer(catalog: Catalog,
     }
   }

-  /**
-   * Checks for non-aggregated attributes with aggregation
-   */
-  object CheckAggregation extends Rule[LogicalPlan] {
-    def apply(plan: LogicalPlan): LogicalPlan = {
-      plan.transform {
-        case aggregatePlan @ Aggregate(groupingExprs, aggregateExprs, child) =>
-          def isValidAggregateExpression(expr: Expression): Boolean = expr match {
-            case _: AggregateExpression => true
-            case e: Attribute => groupingExprs.contains(e)
-            case e if groupingExprs.contains(e) => true
-            case e if e.references.isEmpty => true
-            case e => e.children.forall(isValidAggregateExpression)
-          }
-
-          aggregateExprs.find { e =>
-            !isValidAggregateExpression(e.transform {
-              // Should trim aliases around `GetField`s. These aliases are introduced while
-              // resolving struct field accesses, because `GetField` is not a `NamedExpression`.
-              // (Should we just turn `GetField` into a `NamedExpression`?)
-              case Alias(g: GetField, _) => g
-            })
-          }.foreach { e =>
-            throw new TreeNodeException(plan, s"Expression not in GROUP BY: $e")
-          }
-
-          aggregatePlan
-      }
-    }
-  }
-
   /**
    * Replaces [[UnresolvedRelation]]s with concrete relations from the catalog.
    */
   object ResolveRelations extends Rule[LogicalPlan] {
     def apply(plan: LogicalPlan): LogicalPlan = plan transform {
       case i @ InsertIntoTable(UnresolvedRelation(tableIdentifier, alias), _, _, _) =>
         i.copy(
-          table = EliminateAnalysisOperators(catalog.lookupRelation(tableIdentifier, alias)))
+          table = EliminateSubQueries(catalog.lookupRelation(tableIdentifier, alias)))
       case UnresolvedRelation(tableIdentifier, alias) =>
         catalog.lookupRelation(tableIdentifier, alias)
     }
@@ -477,7 +496,7 @@ class Analyzer(catalog: Catalog,
  * only required to provide scoping information for attributes and can be removed once analysis is
  * complete.
  */
-object EliminateAnalysisOperators extends Rule[LogicalPlan] {
+object EliminateSubQueries extends Rule[LogicalPlan] {
   def apply(plan: LogicalPlan): LogicalPlan = plan transform {
     case Subquery(_, child) => child
   }
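
The heart of the change is above: instead of pattern-matching whole plans and throwing from several places, CheckResolution now walks the plan bottom-up and reports through a single failAnalysis helper, so the message names the first failure rather than a cascade of consequences. Below is a minimal, self-contained sketch of that pattern using toy node types rather than Spark's (Attr, Filter, Relation, and checkAnalysis are hypothetical names for illustration only):

```scala
// Toy sketch: walk the tree bottom-up and fail on the first unresolved node,
// so the error names the root cause, not a cascaded failure at the root.
case class AnalysisException(msg: String) extends Exception(msg)

sealed trait Node {
  def children: Seq[Node]
  def resolved: Boolean
  // Visit children (recursively) before this node, mirroring TreeNode.foreachUp.
  def foreachUp(f: Node => Unit): Unit = {
    children.foreach(_.foreachUp(f))
    f(this)
  }
}
case class Attr(name: String, resolved: Boolean) extends Node {
  def children: Seq[Node] = Nil
}
case class Filter(condition: Node, child: Node) extends Node {
  def children: Seq[Node] = Seq(condition, child)
  def resolved: Boolean = children.forall(_.resolved)
}
case class Relation(names: Seq[String]) extends Node {
  def children: Seq[Node] = Nil
  def resolved: Boolean = true
}

def failAnalysis(msg: String): Nothing = throw new AnalysisException(msg)

def checkAnalysis(plan: Node): Unit = plan.foreachUp {
  case a: Attr if !a.resolved => failAnalysis(s"cannot resolve '${a.name}'")
  case n if !n.resolved       => failAnalysis(s"unresolved operator $n")
  case _                      => // analysis successful
}

// checkAnalysis(Filter(Attr("abcd", resolved = false), Relation(Seq("a"))))
// ==> AnalysisException: cannot resolve 'abcd'
```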

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/AttributeSet.scala (+5, -1)

@@ -20,7 +20,11 @@ package org.apache.spark.sql.catalyst.expressions
 import org.apache.spark.sql.catalyst.analysis.Star

 protected class AttributeEquals(val a: Attribute) {
-  override def hashCode() = a.exprId.hashCode()
+  override def hashCode() = a match {
+    case ar: AttributeReference => ar.exprId.hashCode()
+    case a => a.hashCode()
+  }
+
   override def equals(other: Any) = (a, other.asInstanceOf[AttributeEquals].a) match {
     case (a1: AttributeReference, a2: AttributeReference) => a1.exprId == a2.exprId
     case (a1, a2) => a1 == a2
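
AttributeSet keys membership on these AttributeEquals wrappers, so hashCode must be derived from the same field that equals compares, and it can no longer assume every Attribute has a usable exprId (the new PrettyAttribute in this commit implements exprId as ???). A toy sketch of the wrapper pattern under simplified, assumed types (ExprId, AttrRef, PrettyAttr are illustrative, not Spark's classes):

```scala
// equals compares AttrRefs by exprId, so hashCode must hash that same key;
// any other attribute kind falls back to its own hashCode instead of touching
// an exprId it may not have.
final case class ExprId(id: Long)

trait Attr { def name: String }
case class AttrRef(name: String, exprId: ExprId) extends Attr
case class PrettyAttr(name: String) extends Attr // its exprId would be ???

class AttrEquals(val a: Attr) {
  override def hashCode(): Int = a match {
    case ar: AttrRef => ar.exprId.hashCode()
    case other       => other.hashCode()
  }
  override def equals(that: Any): Boolean = that match {
    case e: AttrEquals => (a, e.a) match {
      case (a1: AttrRef, a2: AttrRef) => a1.exprId == a2.exprId
      case (a1, a2)                   => a1 == a2
    }
    case _ => false
  }
}

// Two references with the same exprId collapse to one set entry,
// and wrapping a PrettyAttr no longer needs an exprId to be hashable:
val s = Set(new AttrEquals(AttrRef("x", ExprId(1))), new AttrEquals(AttrRef("y", ExprId(1))))
// s.size == 1
```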

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/namedExpressions.scala (+1, -1)

@@ -218,7 +218,7 @@ case class PrettyAttribute(name: String) extends Attribute with trees.LeafNode[E
   override def exprId: ExprId = ???
   override def eval(input: Row): EvaluatedType = ???
   override def nullable: Boolean = ???
-  override def dataType: DataType = ???
+  override def dataType: DataType = NullType
 }

 object VirtualColumn {
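
This one-liner matters because ??? is Predef.???, which throws scala.NotImplementedError the moment it is evaluated; PrettyAttribute exists only to render readable error messages, so anything that touches dataType while formatting a message must not throw. A tiny illustration with hypothetical names:

```scala
// `???` throws when evaluated, so a formatter that reads dataType would crash
// instead of printing the message; a harmless placeholder (like NullType) cannot.
trait Named { def name: String; def dataType: String }
case class Broken(name: String) extends Named { def dataType: String = ??? }
case class Pretty(name: String) extends Named { def dataType: String = "null" }

def describe(a: Named): String = s"${a.name}: ${a.dataType}"

// describe(Pretty("x")) // "x: null"
// describe(Broken("x")) // throws scala.NotImplementedError
```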

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/ScriptTransformation.scala (+4, -2)

@@ -17,7 +17,7 @@

 package org.apache.spark.sql.catalyst.plans.logical

-import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression}
+import org.apache.spark.sql.catalyst.expressions.{AttributeSet, Attribute, Expression}

 /**
  * Transforms the input by forking and running the specified script.
@@ -32,7 +32,9 @@ case class ScriptTransformation(
     script: String,
     output: Seq[Attribute],
     child: LogicalPlan,
-    ioschema: ScriptInputOutputSchema) extends UnaryNode
+    ioschema: ScriptInputOutputSchema) extends UnaryNode {
+  override def references: AttributeSet = AttributeSet(input.flatMap(_.references))
+}

 /**
  * A placeholder for implementation specific input and output properties when passing data
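
The override matters because the new check in CheckResolution verifies that every operator's references are a subset of its inputSet; a node that consumes expressions (here, the script's input) must therefore report exactly the attributes those expressions mention. A sketch under simplified, assumed types (AttrSet, Col, Concat, ScriptTransform are illustrative, not Catalyst's classes):

```scala
// Simplified stand-ins for AttributeSet and Expression, just enough to show
// why references must be derived from the input expressions.
case class AttrSet(names: Set[String]) {
  def subsetOf(other: AttrSet): Boolean = names.subsetOf(other.names)
}

trait Expr { def references: AttrSet }
case class Col(name: String) extends Expr {
  def references: AttrSet = AttrSet(Set(name))
}
case class Concat(parts: Seq[Expr]) extends Expr {
  def references: AttrSet = AttrSet(parts.flatMap(_.references.names).toSet)
}

case class ScriptTransform(input: Seq[Expr], script: String, childOutput: Set[String]) {
  // Mirrors the override above: collect references from the input expressions.
  def references: AttrSet = AttrSet(input.flatMap(_.references.names).toSet)
  def inputSet: AttrSet = AttrSet(childOutput)
}

val node = ScriptTransform(Seq(Concat(Seq(Col("a"), Col("b")))), "/bin/cat", Set("a", "b"))
// node.references.subsetOf(node.inputSet) == true, so the analysis check passes
```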

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/trees/TreeNode.scala (+9)

@@ -46,6 +46,15 @@ abstract class TreeNode[BaseType <: TreeNode[BaseType]] {
     children.foreach(_.foreach(f))
   }

+  /**
+   * Runs the given function recursively on [[children]] then on this node.
+   * @param f the function to be applied to each node in the tree.
+   */
+  def foreachUp(f: BaseType => Unit): Unit = {
+    children.foreach(_.foreachUp(f))
+    f(this)
+  }
+
   /**
    * Returns a Seq containing the result of applying the given function to each
    * node in this tree in a preorder traversal.
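
foreach (visible in the context lines above) is a preorder traversal; the new foreachUp is its postorder counterpart, and that child-first order is what lets CheckResolution surface the deepest, first-caused failure. A quick contrast on a toy tree (T is a hypothetical stand-in for TreeNode, not Spark code):

```scala
// foreach visits a node before its children; foreachUp visits children first.
case class T(label: String, kids: T*) {
  def foreach(f: T => Unit): Unit = { f(this); kids.foreach(_.foreach(f)) }
  def foreachUp(f: T => Unit): Unit = { kids.foreach(_.foreachUp(f)); f(this) }
}

val tree = T("root", T("a", T("a1")), T("b"))
val down = scala.collection.mutable.Buffer[String]()
val up   = scala.collection.mutable.Buffer[String]()
tree.foreach(n => down += n.label)   // down: root, a, a1, b  (preorder)
tree.foreachUp(n => up += n.label)   // up:   a1, a, b, root  (postorder)
```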

sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala (+54, -25)

@@ -20,7 +20,7 @@ package org.apache.spark.sql.catalyst.analysis
 import org.scalatest.{BeforeAndAfter, FunSuite}

 import org.apache.spark.sql.AnalysisException
-import org.apache.spark.sql.catalyst.expressions.{Alias, AttributeReference}
+import org.apache.spark.sql.catalyst.expressions.{Literal, Alias, AttributeReference}
 import org.apache.spark.sql.catalyst.plans.logical._
 import org.apache.spark.sql.types._

@@ -108,24 +108,56 @@ class AnalysisSuite extends FunSuite with BeforeAndAfter {
       testRelation)
   }

-  test("throw errors for unresolved attributes during analysis") {
-    val e = intercept[AnalysisException] {
-      caseSensitiveAnalyze(Project(Seq(UnresolvedAttribute("abcd")), testRelation))
+  def errorTest(
+      name: String,
+      plan: LogicalPlan,
+      errorMessages: Seq[String],
+      caseSensitive: Boolean = true) = {
+    test(name) {
+      val error = intercept[AnalysisException] {
+        if (caseSensitive) {
+          caseSensitiveAnalyze(plan)
+        } else {
+          caseInsensitiveAnalyze(plan)
+        }
+      }
+
+      errorMessages.foreach(m => assert(error.getMessage contains m))
     }
-    assert(e.getMessage().toLowerCase.contains("cannot resolve"))
   }

-  test("throw errors for unresolved plans during analysis") {
-    case class UnresolvedTestPlan() extends LeafNode {
-      override lazy val resolved = false
-      override def output = Nil
-    }
-    val e = intercept[AnalysisException] {
-      caseSensitiveAnalyze(UnresolvedTestPlan())
-    }
-    assert(e.getMessage().toLowerCase.contains("unresolved"))
+  errorTest(
+    "unresolved attributes",
+    testRelation.select('abcd),
+    "cannot resolve" :: "abcd" :: Nil)
+
+  errorTest(
+    "bad casts",
+    testRelation.select(Literal(1).cast(BinaryType).as('badCast)),
+    "invalid cast" :: Literal(1).dataType.simpleString :: BinaryType.simpleString :: Nil)
+
+  errorTest(
+    "non-boolean filters",
+    testRelation.where(Literal(1)),
+    "filter" :: "'1'" :: "not a boolean" :: Literal(1).dataType.simpleString :: Nil)
+
+  errorTest(
+    "missing group by",
+    testRelation2.groupBy('a)('b),
+    "'b'" :: "group by" :: Nil
+  )
+
+  case class UnresolvedTestPlan() extends LeafNode {
+    override lazy val resolved = false
+    override def output = Nil
   }

+  errorTest(
+    "catch all unresolved plan",
+    UnresolvedTestPlan(),
+    "unresolved" :: Nil)
+
   test("divide should be casted into fractional types") {
     val testRelation2 = LocalRelation(
       AttributeReference("a", StringType)(),
@@ -134,18 +166,15 @@ class AnalysisSuite extends FunSuite with BeforeAndAfter {
       AttributeReference("d", DecimalType.Unlimited)(),
       AttributeReference("e", ShortType)())

-    val expr0 = 'a / 2
-    val expr1 = 'a / 'b
-    val expr2 = 'a / 'c
-    val expr3 = 'a / 'd
-    val expr4 = 'e / 'e
-    val plan = caseInsensitiveAnalyze(Project(
-      Alias(expr0, s"Analyzer($expr0)")() ::
-      Alias(expr1, s"Analyzer($expr1)")() ::
-      Alias(expr2, s"Analyzer($expr2)")() ::
-      Alias(expr3, s"Analyzer($expr3)")() ::
-      Alias(expr4, s"Analyzer($expr4)")() :: Nil, testRelation2))
+    val plan = caseInsensitiveAnalyze(
+      testRelation2.select(
+        'a / Literal(2) as 'div1,
+        'a / 'b as 'div2,
+        'a / 'c as 'div3,
+        'a / 'd as 'div4,
+        'e / 'e as 'div5))
     val pl = plan.asInstanceOf[Project].projectList
+
     assert(pl(0).dataType == DoubleType)
     assert(pl(1).dataType == DoubleType)
     assert(pl(2).dataType == DoubleType)
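
The suite now uses a small table-driven helper instead of one hand-rolled test per failure mode. A standalone sketch of the same idea in plain Scala, with a stubbed analyze in place of caseSensitiveAnalyze (all names here are illustrative, no ScalaTest dependency):

```scala
// Run the analyzer, require an AnalysisException, then check that every
// expected fragment appears somewhere in the message.
case class AnalysisException(msg: String) extends Exception(msg)

// Stand-in for caseSensitiveAnalyze: fails on a known-bad "plan".
def analyze(plan: String): Unit =
  if (plan.contains("abcd")) throw new AnalysisException("cannot resolve 'abcd'")

def errorTest(name: String, plan: String, fragments: Seq[String]): Unit = {
  val error =
    try {
      analyze(plan)
      sys.error(s"$name: expected an AnalysisException")
    } catch {
      case e: AnalysisException => e
    }
  fragments.foreach(f => assert(error.msg.contains(f), s"$name: message missing '$f'"))
}

errorTest("unresolved attributes", "SELECT abcd", Seq("cannot resolve", "abcd"))
```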

sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/BooleanSimplificationSuite.scala (+2, -2)

@@ -17,7 +17,7 @@

 package org.apache.spark.sql.catalyst.optimizer

-import org.apache.spark.sql.catalyst.analysis.EliminateAnalysisOperators
+import org.apache.spark.sql.catalyst.analysis.EliminateSubQueries
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.plans.logical._
 import org.apache.spark.sql.catalyst.plans.PlanTest
@@ -30,7 +30,7 @@ class BooleanSimplificationSuite extends PlanTest with PredicateHelper {
   object Optimize extends RuleExecutor[LogicalPlan] {
     val batches =
       Batch("AnalysisNodes", Once,
-        EliminateAnalysisOperators) ::
+        EliminateSubQueries) ::
       Batch("Constant Folding", FixedPoint(50),
         NullPropagation,
         ConstantFolding,

sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/ConstantFoldingSuite.scala (+2, -2)

@@ -17,7 +17,7 @@

 package org.apache.spark.sql.catalyst.optimizer

-import org.apache.spark.sql.catalyst.analysis.{UnresolvedGetField, EliminateAnalysisOperators}
+import org.apache.spark.sql.catalyst.analysis.{UnresolvedGetField, EliminateSubQueries}
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.plans.logical.{LocalRelation, LogicalPlan}
 import org.apache.spark.sql.catalyst.plans.PlanTest
@@ -33,7 +33,7 @@ class ConstantFoldingSuite extends PlanTest {
   object Optimize extends RuleExecutor[LogicalPlan] {
     val batches =
       Batch("AnalysisNodes", Once,
-        EliminateAnalysisOperators) ::
+        EliminateSubQueries) ::
       Batch("ConstantFolding", Once,
         ConstantFolding,
         BooleanSimplification) :: Nil
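
Both optimizer suites only pick up the EliminateAnalysisOperators → EliminateSubQueries rename in their RuleExecutor configuration. For context, here is a minimal sketch of what such a rule executor does with Once versus FixedPoint batches, under a simplified, assumed API (not Catalyst's actual RuleExecutor):

```scala
// A Batch applies its rules once, or repeatedly until the plan stops changing
// (or an iteration cap is hit).
trait Rule[T] { def apply(plan: T): T }

sealed trait Strategy { def maxIterations: Int }
case object Once extends Strategy { val maxIterations = 1 }
case class FixedPoint(maxIterations: Int) extends Strategy

case class Batch[T](name: String, strategy: Strategy, rules: Rule[T]*)

def execute[T](plan: T, batches: Seq[Batch[T]]): T =
  batches.foldLeft(plan) { (batchStart, batch) =>
    var current = batchStart
    var iteration = 0
    var changed = true
    while (changed && iteration < batch.strategy.maxIterations) {
      val next = batch.rules.foldLeft(current)((p, rule) => rule(p))
      changed = next != current // stop early once a fixed point is reached
      current = next
      iteration += 1
    }
    current
  }
```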
