[SPARK-29239][SPARK-29221][SQL] Subquery should not cause NPE when eliminating subexpression

viirya · cloud-fan · commit b8b59d6fa3ac · 2019-09-26T13:55:01.000+08:00
### What changes were proposed in this pull request? This patch proposes to skip PlanExpression when doing subexpression elimination on executors. ### Why are the changes needed? Subexpression elimination can cause an NPE when it is applied on executors to an execution subquery expression such as ScalarSubquery. This is because PlanExpression wraps a query plan: comparing query plans on an executor while eliminating subexpressions can cause unexpected errors, such as an NPE when accessing transient fields. The NPE looks like: ``` [info] - SPARK-29239: Subquery should not cause NPE when eliminating subexpression *** FAILED *** (175 milliseconds) [info] org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 1395.0 failed 1 times, most recent failure: Lost task 0.0 in stage 1395.0 (TID 3447, 10.0.0.196, executor driver): java.lang.NullPointerException [info] at org.apache.spark.sql.execution.LocalTableScanExec.stringArgs(LocalTableScanExec.scala:62) [info] at org.apache.spark.sql.catalyst.trees.TreeNode.argString(TreeNode.scala:506) [info] at org.apache.spark.sql.catalyst.trees.TreeNode.simpleString(TreeNode.scala:534) [info] at org.apache.spark.sql.catalyst.plans.QueryPlan.simpleString(QueryPlan.scala:179) [info] at org.apache.spark.sql.catalyst.plans.QueryPlan.verboseString(QueryPlan.scala:181) [info] at org.apache.spark.sql.catalyst.trees.TreeNode.generateTreeString(TreeNode.scala:647) [info] at org.apache.spark.sql.catalyst.trees.TreeNode.generateTreeString(TreeNode.scala:675) [info] at org.apache.spark.sql.catalyst.trees.TreeNode.generateTreeString(TreeNode.scala:675) [info] at org.apache.spark.sql.catalyst.trees.TreeNode.treeString(TreeNode.scala:569) [info] at org.apache.spark.sql.catalyst.trees.TreeNode.treeString(TreeNode.scala:559) [info] at org.apache.spark.sql.catalyst.trees.TreeNode.treeString(TreeNode.scala:551) [info] at org.apache.spark.sql.catalyst.trees.TreeNode.toString(TreeNode.scala:548) [info] at 
org.apache.spark.sql.catalyst.errors.package$TreeNodeException.<init>(package.scala:36) [info] at org.apache.spark.sql.catalyst.errors.package$.attachTree(package.scala:56) [info] at org.apache.spark.sql.catalyst.trees.TreeNode.makeCopy(TreeNode.scala:436) [info] at org.apache.spark.sql.catalyst.trees.TreeNode.makeCopy(TreeNode.scala:425) [info] at org.apache.spark.sql.execution.SparkPlan.makeCopy(SparkPlan.scala:102) [info] at org.apache.spark.sql.execution.SparkPlan.makeCopy(SparkPlan.scala:63) [info] at org.apache.spark.sql.catalyst.plans.QueryPlan.mapExpressions(QueryPlan.scala:132) [info] at org.apache.spark.sql.catalyst.plans.QueryPlan.doCanonicalize(QueryPlan.scala:261) ``` ### Does this PR introduce any user-facing change? No ### How was this patch tested? Added unit test. Closes #25925 from viirya/SPARK-29239. Authored-by: Liang-Chi Hsieh <viirya@gmail.com> Signed-off-by: Wenchen Fan <wenchen@databricks.com>
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/EquivalentExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/EquivalentExpressions.scala
@@ -19,6 +19,7 @@ package org.apache.spark.sql.catalyst.expressions
 
 import scala.collection.mutable
 
+import org.apache.spark.TaskContext
 import org.apache.spark.sql.catalyst.expressions.codegen.CodegenFallback
 import org.apache.spark.sql.catalyst.expressions.objects.LambdaVariable
 
@@ -72,7 +73,10 @@ class EquivalentExpressions {
     val skip = expr.isInstanceOf[LeafExpression] ||
       // `LambdaVariable` is usually used as a loop variable, which can't be evaluated ahead of the
       // loop. So we can't evaluate sub-expressions containing `LambdaVariable` at the beginning.
-      expr.find(_.isInstanceOf[LambdaVariable]).isDefined
+      expr.find(_.isInstanceOf[LambdaVariable]).isDefined ||
+      // `PlanExpression` wraps a query plan. Comparing the query plans of `PlanExpression`s on an
+      // executor can cause errors such as NPE.
+      (expr.isInstanceOf[PlanExpression[_]] && TaskContext.get != null)
 
     // There are some special expressions that we should not recurse into all of its children.
     //   1. CodegenFallback: it's children will not be used to generate code (call eval() instead)
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
@@ -24,6 +24,7 @@ import java.util.concurrent.atomic.AtomicBoolean
 
 import org.apache.spark.{AccumulatorSuite, SparkException}
 import org.apache.spark.scheduler.{SparkListener, SparkListenerJobStart}
+import org.apache.spark.sql.catalyst.optimizer.ConvertToLocalRelation
 import org.apache.spark.sql.catalyst.util.StringUtils
 import org.apache.spark.sql.execution.aggregate.{HashAggregateExec, SortAggregateExec}
 import org.apache.spark.sql.execution.columnar.InMemoryTableScanExec
@@ -3173,6 +3174,7 @@ class SQLQuerySuite extends QueryTest with SharedSparkSession {
       checkAnswer(sql("select * from t1 where d > '1999-13'"), Row(result))
       checkAnswer(sql("select to_timestamp('2000-01-01 01:10:00') > '1'"), Row(true))
     }
+    sql("DROP VIEW t1")
   }
 
   test("SPARK-28156: self-join should not miss cached view") {
@@ -3216,6 +3218,21 @@ class SQLQuerySuite extends QueryTest with SharedSparkSession {
       checkAnswer(df3, Array(Row(new java.math.BigDecimal("0.100000000000000000000000100"))))
     }
   }
+
+  test("SPARK-29239: Subquery should not cause NPE when eliminating subexpression") {
+    withSQLConf(SQLConf.WHOLESTAGE_CODEGEN_ENABLED.key -> "false",
+        SQLConf.SUBQUERY_REUSE_ENABLED.key -> "false",
+        SQLConf.CODEGEN_FACTORY_MODE.key -> "CODEGEN_ONLY",
+        SQLConf.OPTIMIZER_EXCLUDED_RULES.key -> ConvertToLocalRelation.ruleName) {
+      withTempView("t1", "t2") {
+        sql("create temporary view t1 as select * from values ('val1a', 10L) as t1(t1a, t1b)")
+        sql("create temporary view t2 as select * from values ('val3a', 110L) as t2(t2a, t2b)")
+        val df = sql("SELECT min, min from (SELECT (SELECT min(t2b) FROM t2) min " +
+          "FROM t1 WHERE t1a = 'val1c')")
+        assert(df.collect().size == 0)
+      }
+    }
+  }
 }
 
 case class Foo(bar: Option[String])