Commit c4ce3c4

alarxin authored and committed
[SC-5985] Add spark.databricks.debug.taskKiller.minOutputRows config.
## What changes were proposed in this pull request?

Adds a configuration parameter, spark.databricks.debug.taskKiller.minOutputRows. It sets the minimum number of records that must have been produced at some point during task execution before the task can be terminated by DatabricksTaskDebugListener.

## How was this patch tested?

Adds unit tests.

Author: Ala Luszczak <ala@databricks.com>

Closes apache#250 from ala/min-output-rows.
1 parent 96022f2 commit c4ce3c4
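
In outline, after this patch the listener terminates a task only when four conditions hold at once. The snippet below is a minimal sketch distilled from the diffs that follow, not the actual DatabricksTaskDebugListener code; all names mirror the patch.

  // Sketch only: the kill decision after this patch, pulled out as a pure
  // function for readability. The Long division matches the listener's code.
  def shouldKill(
      runningTimeSec: Long,
      minRunningTimeSec: Long,
      recordsIn: Long,
      recordsOut: Long,
      minOutputRows: Long,
      outputRatioKillThreshold: Long): Boolean = {
    val outputRatio = if (recordsIn > 0) recordsOut / recordsIn else 0
    runningTimeSec > minRunningTimeSec &&
      recordsOut >= minOutputRows &&            // the new guard added here
      outputRatioKillThreshold > 0 &&
      outputRatio > outputRatioKillThreshold
  }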

File tree

3 files changed: +56 -15 lines changed

sql/core/src/main/scala/com/databricks/sql/DatabricksSQLConf.scala

Lines changed: 7 additions & 0 deletions
@@ -114,6 +114,13 @@ object DatabricksSQLConf {
     .longConf
     .createWithDefault(10L)
 
+  val TASK_KILLER_MIN_OUTPUT_ROWS = buildConf("spark.databricks.debug.taskKiller.minOutputRows")
+    .internal()
+    .doc("The minimum number of rows that need to be produced by the task before it can be " +
+      "cancelled.")
+    .longConf
+    .createWithDefault(1000L * 1000L)
+
   val TASK_KILLER_ERROR_MESSAGE = buildStaticConf("spark.databricks.debug.taskKiller.message")
     .internal()
     .doc("The error message to be displayed when a task is terminated by DatabricksTaskDebugListener.")
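
Since the new entry is registered with buildConf rather than buildStaticConf (unlike TASK_KILLER_ERROR_MESSAGE below it), it should be adjustable per session. A hedged usage sketch, assuming a runtime where this internal conf is available:

  // Sketch only: lowering the threshold from its default of 1,000,000 rows.
  // The key string comes from the definition above; the value is arbitrary.
  spark.conf.set("spark.databricks.debug.taskKiller.minOutputRows", 500000L)

  // Engine code reads it through the typed entry instead, as the next diff shows:
  // spark.sessionState.conf.getConf(DatabricksSQLConf.TASK_KILLER_MIN_OUTPUT_ROWS)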

sql/core/src/main/scala/com/databricks/sql/debugger/DatabricksTaskDebugListener.scala

Lines changed: 8 additions & 5 deletions
@@ -78,25 +78,28 @@ class DatabricksTaskDebugListener(
       }
     }
 
-    val outputRatio = if (recordsIn > 0) recordsOut / recordsIn else 0
-    checkOutputRatio(outputRatio)
+    checkOutputRatio(recordsIn, recordsOut)
   }
 
   /**
    * Compare running time and output ratio with the configured limits.
    * If needed, request task cancellation.
    */
-  private def checkOutputRatio(outputRatio: Long): Unit = {
+  private def checkOutputRatio(recordsIn: Long, recordsOut: Long): Unit = {
+    val outputRatio = if (recordsIn > 0) recordsOut / recordsIn else 0
+
     val queryExecution = SQLExecution.getQueryExecution(executionId)
     if (!cancelRequestIssued || launchTime > 0 || queryExecution != null) {
       val minRunningTimeSec = queryExecution.sparkSession.sessionState.conf.getConf(
         DatabricksSQLConf.TASK_KILLER_MIN_TIME)
+      val minOutputRows = queryExecution.sparkSession.sessionState.conf.getConf(
+        DatabricksSQLConf.TASK_KILLER_MIN_OUTPUT_ROWS)
       val outputRatioKillThreshold = queryExecution.sparkSession.sessionState.conf.getConf(
         DatabricksSQLConf.TASK_KILLER_OUTPUT_RATIO_THRESHOLD)
       val runningTimeSec = (System.currentTimeMillis() - launchTime) / 1000
 
-      if (runningTimeSec > minRunningTimeSec && outputRatioKillThreshold > 0 &&
-          outputRatio > outputRatioKillThreshold) {
+      if (runningTimeSec > minRunningTimeSec && recordsOut >= minOutputRows &&
+          outputRatioKillThreshold > 0 && outputRatio > outputRatioKillThreshold) {
         val errorMsgTemplate = queryExecution.sparkSession.sessionState.conf.getConf(
           DatabricksSQLConf.TASK_KILLER_ERROR_MESSAGE)
         terminateTask(outputRatio, outputRatioKillThreshold, errorMsgTemplate)
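
One subtlety when picking thresholds: outputRatio is computed with Long division, so it only takes whole-number values. An illustrative snippet, not part of the patch:

  // Illustrative only: Long division truncates, so a task emitting 100.9x its
  // input reports a ratio of 100 and does not exceed a threshold of 100.
  val recordsIn = 1000L
  val recordsOut = 100900L
  val outputRatio = recordsOut / recordsIn  // 100L, not 100.9
  assert(outputRatio == 100L)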

sql/core/src/test/scala/com/databricks/sql/debugger/DatabricksTaskDebugListenerSuite.scala

Lines changed: 41 additions & 10 deletions
@@ -16,7 +16,7 @@ import com.databricks.sql.DatabricksSQLConf
 import org.scalatest.concurrent.Eventually
 
 import org.apache.spark.SparkException
-import org.apache.spark.sql.QueryTest
+import org.apache.spark.sql.{DataFrame, QueryTest}
 import org.apache.spark.sql.functions._
 import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.sql.test.{SharedSQLContext, SQLTestUtils}
@@ -27,25 +27,33 @@ class DatabricksTaskDebugListenerSuite
   with SQLTestUtils
   with Eventually {
 
+  import testImplicits._
+
   val CART_PROD_INPUT_SIZE = 100000L
-  var prevKillerOutputRatioThreshold = 0L
+  var prevKillerOutputRatio = 0L
   var prevKillerMinTime = 0L
+  var prevMinOutputRows = 0L
 
   protected override def beforeAll(): Unit = {
     super.beforeAll()
 
-    prevKillerOutputRatioThreshold = spark.sessionState.conf.getConf(
-      DatabricksSQLConf.TASK_KILLER_OUTPUT_RATIO_THRESHOLD)
-    prevKillerMinTime = spark.sessionState.conf.getConf(DatabricksSQLConf.TASK_KILLER_MIN_TIME)
+    val conf = spark.sessionState.conf
+
+    prevKillerOutputRatio = conf.getConf(DatabricksSQLConf.TASK_KILLER_OUTPUT_RATIO_THRESHOLD)
+    prevKillerMinTime = conf.getConf(DatabricksSQLConf.TASK_KILLER_MIN_TIME)
+    prevMinOutputRows = conf.getConf(DatabricksSQLConf.TASK_KILLER_MIN_OUTPUT_ROWS)
 
-    spark.sessionState.conf.setConf(DatabricksSQLConf.TASK_KILLER_OUTPUT_RATIO_THRESHOLD, 100L)
-    spark.sessionState.conf.setConf(DatabricksSQLConf.TASK_KILLER_MIN_TIME, 5L)
+    conf.setConf(DatabricksSQLConf.TASK_KILLER_OUTPUT_RATIO_THRESHOLD, 100L)
+    conf.setConf(DatabricksSQLConf.TASK_KILLER_MIN_TIME, 5L)
+    conf.setConf(DatabricksSQLConf.TASK_KILLER_MIN_OUTPUT_ROWS, 1000L)
   }
 
   protected override def afterAll(): Unit = {
-    spark.sessionState.conf.setConf(DatabricksSQLConf.TASK_KILLER_OUTPUT_RATIO_THRESHOLD,
-      prevKillerOutputRatioThreshold)
-    spark.sessionState.conf.setConf(DatabricksSQLConf.TASK_KILLER_MIN_TIME, prevKillerMinTime)
+    val conf = spark.sessionState.conf
+    conf.setConf(DatabricksSQLConf.TASK_KILLER_OUTPUT_RATIO_THRESHOLD, prevKillerOutputRatio)
+    conf.setConf(DatabricksSQLConf.TASK_KILLER_MIN_TIME, prevKillerMinTime)
+    conf.setConf(DatabricksSQLConf.TASK_KILLER_MIN_OUTPUT_ROWS, prevMinOutputRows)
+
     super.afterAll()
   }
 
@@ -112,4 +120,27 @@ class DatabricksTaskDebugListenerSuite
       }
     }
   }
+
+  // Create a query that takes ca. 20 seconds to process (because heartbeats with metrics are sent
+  // around 10 seconds apart), but doesn't produce too much output (max 250,000 records) at any
+  // point in execution.
+  def gen20SecQuery: DataFrame = {
+    spark.range(200L).repartition(1).map { x =>
+      // Trickle out 10 rows per second
+      Thread.sleep(100)
+      x
+    }.crossJoin(spark.range(1000L)).toDF("a", "b").agg(sum("a"), sum("b"))
+  }
+
+  test("spark.databricks.debug.taskKiller.minOutputRows = 1,000,000 - query is not killed") {
+    spark.sessionState.conf.setConf(DatabricksSQLConf.TASK_KILLER_MIN_OUTPUT_ROWS, 1000L * 1000L)
+    gen20SecQuery.collect()
+  }
+
+  test("spark.databricks.debug.taskKiller.minOutputRows = 1000 - the same query is terminated") {
+    spark.sessionState.conf.setConf(DatabricksSQLConf.TASK_KILLER_MIN_OUTPUT_ROWS, 1000L)
+    testTaskTermination {
+      gen20SecQuery.collect()
+    }
+  }
 }
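
As a rough sanity check on why the two new tests diverge, consider a metrics heartbeat observed mid-flight through the cross join. The numbers below are assumptions for illustration only; the real counts depend on when a heartbeat fires.

  // Hypothetical heartbeat snapshot, not measured output.
  val recordsIn = 1200L      // assumed: range(200) plus range(1000), fully read
  val recordsOut = 150000L   // assumed: partial cross-join output, under the 250,000 cap
  val outputRatio = recordsOut / recordsIn  // 125, above the configured threshold of 100

  // With minOutputRows = 1,000,000: 150,000 < 1,000,000, so the kill is suppressed.
  // With minOutputRows = 1,000:     150,000 >= 1,000, so the task is terminated.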
