
[SPARK-18120 ][SQL] Call QueryExecutionListener callback methods for … #16664


Closed. Wants to merge 5 commits.
20 changes: 19 additions & 1 deletion docs/sql-programming-guide.md
@@ -1300,10 +1300,28 @@ Configuration of in-memory caching can be done using the `setConf` method on `Sp

</table>

## QueryExecutionListener Options
Contributor: this seems like a completely unrelated change to the bug fix.

Use this configuration option to attach query execution listeners

<table class="table">
<tr><th>Property Name</th><th>Default</th><th>Meaning</th></tr>
<tr>
<td><code>spark.sql.queryExecutionListeners</code></td>
<td></td>
<td>
A comma-separated list of classes that implement QueryExecutionListener. When creating a SparkSession,
instances of these listeners will be added to it. These classes needs to have a zero-argument
constructor. If the specified class can't be found or the class specified doesn't have a valid
constructor the SparkSession creation will fail with an exception.
</td>
</tr>
</table>
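
For illustration only (this example is not part of the PR's documentation change), a listener registered through this option could look roughly like the sketch below. The class name `com.example.AuditListener` and the `println` reporting are hypothetical; the four-argument callback signature and `OutputParams` come from this PR.

    package com.example

    import org.apache.spark.sql.execution.QueryExecution
    import org.apache.spark.sql.util.{OutputParams, QueryExecutionListener}

    // Hypothetical listener with the zero-argument constructor required by
    // spark.sql.queryExecutionListeners.
    class AuditListener extends QueryExecutionListener {

      override def onSuccess(
          funcName: String,
          qe: QueryExecution,
          durationNs: Long,
          outputParams: Option[OutputParams]): Unit = {
        val destination = outputParams.flatMap(_.destination).getOrElse("<none>")
        println(s"$funcName succeeded in ${durationNs / 1000000} ms, destination: $destination")
      }

      override def onFailure(
          funcName: String,
          qe: QueryExecution,
          exception: Exception,
          outputParams: Option[OutputParams]): Unit = {
        println(s"$funcName failed: ${exception.getMessage}")
      }
    }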

## Other Configuration Options

The following options can also be used to tune the performance of query execution. It is possible
that these options will be deprecated in future release as more optimizations are performed automatically.
that these options will be deprecated in future release as more optimizations are performed
automatically.

<table class="table">
<tr><th>Property Name</th><th>Default</th><th>Meaning</th></tr>
8 changes: 7 additions & 1 deletion project/MimaExcludes.scala
@@ -133,7 +133,13 @@ object MimaExcludes {
ProblemFilters.exclude[IncompatibleResultTypeProblem]("org.apache.spark.sql.streaming.StreamingQueryException.startOffset"),
ProblemFilters.exclude[IncompatibleResultTypeProblem]("org.apache.spark.sql.streaming.StreamingQueryException.endOffset"),
ProblemFilters.exclude[IncompatibleMethTypeProblem]("org.apache.spark.sql.streaming.StreamingQueryException.this"),
ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.sql.streaming.StreamingQueryException.query")
ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.sql.streaming.StreamingQueryException.query"),

// [SPARK-18120 ][SQL] Call QueryExecutionListener callback methods for DataFrameWriter methods
ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.sql.util.QueryExecutionListener.onSuccess"),
ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.sql.util.QueryExecutionListener.onFailure"),
ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.sql.util.QueryExecutionListener.onSuccess"),
ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.sql.util.QueryExecutionListener.onFailure")
)
}

52 changes: 46 additions & 6 deletions sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala
@@ -26,10 +26,13 @@ import org.apache.spark.sql.catalyst.TableIdentifier
import org.apache.spark.sql.catalyst.analysis.{EliminateSubqueryAliases, UnresolvedRelation}
import org.apache.spark.sql.catalyst.catalog.{BucketSpec, CatalogRelation, CatalogTable, CatalogTableType}
import org.apache.spark.sql.catalyst.plans.logical.InsertIntoTable
import org.apache.spark.sql.execution.QueryExecution
import org.apache.spark.sql.execution.command.DDLUtils
import org.apache.spark.sql.execution.datasources.{CreateTable, DataSource, LogicalRelation}
import org.apache.spark.sql.execution.datasources.jdbc.JDBCOptions
import org.apache.spark.sql.sources.BaseRelation
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.util.{OutputParams}

/**
* Interface used to write a [[Dataset]] to external storage systems (e.g. file systems,
@@ -189,6 +192,32 @@ final class DataFrameWriter[T] private[sql](ds: Dataset[T]) {
this
}

/**
* Wrap a DataFrameWriter action to track the query execution and time cost, then report to the
* user-registered callback functions.
*
* @param funcName An identifier for the method executing the query
* @param qe the `QueryExecution` object associated with the query
* @param outputParams The output parameters useful for query analysis
* @param action the function that executes the query after which the listener methods get
* called.
*/
private def withAction(
funcName: String,
qe: QueryExecution,
outputParams: OutputParams)(action: => Unit) = {
try {
val start = System.nanoTime()
Contributor: Dataset.withAction will reset metrics of physical plans, shall we do it here? And can we create a general function for both Dataset and DataFrameWriter?

action
val end = System.nanoTime()
df.sparkSession.listenerManager.onSuccess(funcName, qe, end - start, Some(outputParams))
} catch {
case e: Exception =>
df.sparkSession.listenerManager.onFailure(funcName, qe, e, Some(outputParams))
throw e
}
}
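
As a sketch only (not code from this PR), the kind of shared helper the comment above asks about could look like the following; the name `withCallbacks` and its placement are assumptions, and it relies on the session's `ExecutionListenerManager` being reachable from both call sites:

    // Hypothetical shared helper for Dataset and DataFrameWriter actions.
    private def withCallbacks[A](
        session: SparkSession,
        funcName: String,
        qe: QueryExecution,
        outputParams: Option[OutputParams])(action: => A): A = {
      val start = System.nanoTime()
      try {
        val result = action
        session.listenerManager.onSuccess(funcName, qe, System.nanoTime() - start, outputParams)
        result
      } catch {
        case e: Exception =>
          session.listenerManager.onFailure(funcName, qe, e, outputParams)
          throw e
      }
    }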

/**
* Saves the content of the `DataFrame` at the specified path.
*
@@ -218,7 +247,14 @@ final class DataFrameWriter[T] private[sql](ds: Dataset[T]) {
bucketSpec = getBucketSpec,
options = extraOptions.toMap)

dataSource.write(mode, df)
val destination = source match {
case "jdbc" => extraOptions.get(JDBCOptions.JDBC_TABLE_NAME)
Member: For JDBC, the source value might not be jdbc. For example, jDbC, JDBC, org.apache.spark.sql.jdbc.DefaultSource, org.apache.spark.sql.jdbc

Author: Could you please give me some more info. Looking at the DataFrameWriter#jdbc method it sets the source as "jdbc". Are there other places that this source is being set?

Member: For example,

    df.write.format("org.apache.spark.sql.jdbc")
    .options(Map("url" -> url, "dbtable" -> "TEST.SAVETEST"))
    .save()

case _ => extraOptions.get("path")
Member: For the external data source connectors, it might not have path.

Author: Yes, for methods like saveAsTable() there is no path. Do you see an issue here?

gatorsmile (Member), Feb 8, 2017:

> outputParams The output parameters in case the method is invoked as a result of a write operation.

It sounds like OutputParams is designed for the write path. It is being used for description? Could we make it more general? For example, using a Map[String, String] like data structure? In the future, we can easily use/extend it for the other code paths?

Contributor:

> Could we make it more general? For example, using a Map[String, String]

Being the person who requested this class instead of an opaque map, I think using an opaque map makes for a really bad user API. The listener now needs to know about "magic keys" that have special meaning, which can vary depending on the destination. So you end up making up some contract that certain keys have some special meanings an all sources need to use them that way, so basically you end up encoding this class in a map.

That being said I'm not super happy with the way JDBC works, because there's still some information embedded in the map. I thought about it a little but didn't come up with a good solution; embedding the table name in the JDBC URI sounds hacky and brittle. Best one I got is a separate field in this class (e.g. serverUri) that can be used to identify the server that is hosting the destination value (not needed for FS-based destinations since it's in the URI, but could be useful in other cases - maybe other table-based systems like Kudu or HBase).

Member: I think we need to make it more general instead of introducing a class for the write path only.

Contributor:

> e.g. calling the save method adds a "path" key to the option map, but is that key name a public API?

yes, it is. e.g. df.write.format("parquet").option("path", some_path).save(), the path is a "magic key" and we've exposed it to users, so path is a public API and if we change it, we will break existing applications.

cloud-fan (Contributor), Feb 9, 2017:
Actually all the "magic keys" in the options used by DataFrameWriter are public APIs, they are not going to change and users need to know about them if they wanna fine-grained control to the write operation.

Contributor:

> Actually all the "magic keys" in the options used by DataFrameWriter are public APIs

That's good to know, but they only seem to be, at best, indirectly documented. The DataFrameWriter API doesn't say anything about the keys used by any of the methods, and sql-programming-guide.md only touches on a handful of them; for example, none of the JDBC keys are documented.

> If you want to introduce an external public interface, we need a careful design. This should be done in a separate PR.

I agree that it needs a careful design and the current one doesn't cover all the options. But this PR is of very marginal value without this information being exposed in some way. If you guys feel strongly that it should be a map and that's it, I guess it will be hard to argue. Then we'll have to do that and document all the keys used internally by Spark and make them public, and promise ourselves that we'll never break them.

My belief is that a more structured type would help here. Since the current code is obviously not enough, we could have something that's more future-proof, like:

    // Generic, just exposes the raw options, no stability guarantee past what SQL API provides.
    class QueryExecutionParams(val options: Map[String, String])

    // For FS-based sources
    class FsOutputParams(val dataSourceType: String, val path: String, options: Map[String, String])
      extends QueryExecutionParams(options)

    // For JDBC
    class JdbcOutputParams(val table: String, val url: String, options: Map[String, String])
      extends QueryExecutionParams(options)

    // Add others that are interesting.

Then listeners can easily handle future params by matching and handling the generic params.

Anyway, my opinion is that a raw map is not a very good API, regardless of API stability; it's hard to use and easy to break. But I'll defer to you guys if you really don't like my suggestions.
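
Continuing that suggestion as a sketch (these classes are the hypothetical hierarchy proposed in the comment above, not APIs in this PR), a listener could match on the concrete parameter types and fall back to the generic one:

    // Sketch only: consuming the hypothetical hierarchy above, assuming its fields are exposed as vals.
    def describe(params: QueryExecutionParams): String = params match {
      case fs: FsOutputParams     => s"wrote ${fs.dataSourceType} output to ${fs.path}"
      case jdbc: JdbcOutputParams => s"wrote to JDBC table ${jdbc.table} at ${jdbc.url}"
      case other                  => s"query ran with options ${other.options}"
    }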

Contributor: Yes those are public APIs.

}
val outputParams = OutputParams(source, destination, extraOptions.toMap)
withAction("save", df.queryExecution, outputParams) {
dataSource.write(mode, df)
}
}

/**
@@ -261,13 +297,15 @@ final class DataFrameWriter[T] private[sql](ds: Dataset[T]) {
)
}

df.sparkSession.sessionState.executePlan(
val qe = df.sparkSession.sessionState.executePlan(
InsertIntoTable(
table = UnresolvedRelation(tableIdent),
partition = Map.empty[String, Option[String]],
child = df.logicalPlan,
overwrite = mode == SaveMode.Overwrite,
ifNotExists = false)).toRdd
ifNotExists = false))
val outputParams = OutputParams(source, Some(tableIdent.unquotedString), extraOptions.toMap)
withAction("insertInto", qe, outputParams)(qe.toRdd)
}

private def normalizedParCols: Option[Seq[String]] = partitioningColumns.map { cols =>
@@ -324,7 +362,7 @@ final class DataFrameWriter[T] private[sql](ds: Dataset[T]) {

private def assertNotPartitioned(operation: String): Unit = {
if (partitioningColumns.isDefined) {
throw new AnalysisException( s"'$operation' does not support partitioning")
throw new AnalysisException(s"'$operation' does not support partitioning")
}
}

@@ -428,8 +466,10 @@ final class DataFrameWriter[T] private[sql](ds: Dataset[T]) {
partitionColumnNames = partitioningColumns.getOrElse(Nil),
bucketSpec = getBucketSpec
)
df.sparkSession.sessionState.executePlan(
CreateTable(tableDesc, mode, Some(df.logicalPlan))).toRdd
val qe = df.sparkSession.sessionState.executePlan(
CreateTable(tableDesc, mode, Some(df.logicalPlan)))
val outputParams = OutputParams(source, Some(tableIdent.unquotedString), extraOptions.toMap)
withAction("saveAsTable", qe, outputParams)(qe.toRdd)
}

/**
13 changes: 11 additions & 2 deletions sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala
@@ -40,12 +40,12 @@ import org.apache.spark.sql.catalyst.plans.logical.{LocalRelation, Range}
import org.apache.spark.sql.execution._
import org.apache.spark.sql.execution.datasources.LogicalRelation
import org.apache.spark.sql.execution.ui.SQLListener
import org.apache.spark.sql.internal.{CatalogImpl, SessionState, SharedState}
import org.apache.spark.sql.internal._
import org.apache.spark.sql.internal.StaticSQLConf.CATALOG_IMPLEMENTATION
import org.apache.spark.sql.sources.BaseRelation
import org.apache.spark.sql.streaming._
import org.apache.spark.sql.types.{DataType, LongType, StructType}
import org.apache.spark.sql.util.ExecutionListenerManager
import org.apache.spark.sql.util.{ExecutionListenerManager, QueryExecutionListener}
import org.apache.spark.util.Utils


@@ -876,6 +876,9 @@ object SparkSession {
}
session = new SparkSession(sparkContext)
options.foreach { case (k, v) => session.sessionState.conf.setConfString(k, v) }
for (qeListener <- createQueryExecutionListeners(session.sparkContext.getConf)) {
session.listenerManager.register(qeListener)
}
defaultSession.set(session)

// Register a successfully instantiated context to the singleton. This should be at the
@@ -893,6 +896,12 @@
}
}

private def createQueryExecutionListeners(conf: SparkConf): Seq[QueryExecutionListener] = {
conf.get(StaticSQLConf.QUERY_EXECUTION_LISTENERS)
.map(Utils.classForName(_))
Member: Nit: -> .map(Utils.classForName)

.map(_.newInstance().asInstanceOf[QueryExecutionListener])
Member: Simply throwing ClassNotFoundException might not be good to end users, if we plan to make this SQL configuration external.

Could you use the try and catch to issue a better error message when we are unable to create/initialize the class? Thanks!

}
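
A sketch of the error handling the comment above asks for (the exception type and the message wording are assumptions, not part of the PR); it wraps the same instantiation logic as the method above:

    private def createQueryExecutionListeners(conf: SparkConf): Seq[QueryExecutionListener] = {
      conf.get(StaticSQLConf.QUERY_EXECUTION_LISTENERS).map { className =>
        try {
          Utils.classForName(className).newInstance().asInstanceOf[QueryExecutionListener]
        } catch {
          case e: Exception =>
            throw new IllegalArgumentException(
              s"Failed to instantiate query execution listener '$className' configured via " +
                "spark.sql.queryExecutionListeners; the class must be on the classpath and " +
                "have a zero-argument constructor.", e)
        }
      }
    }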

/**
* Creates a [[SparkSession.Builder]] for constructing a [[SparkSession]].
*
@@ -1047,4 +1047,14 @@ object StaticSQLConf {
"SQL configuration and the current database.")
.booleanConf
.createWithDefault(false)

val QUERY_EXECUTION_LISTENERS = buildConf("spark.sql.queryExecutionListeners")
.doc("A comma-separated list of classes that implement QueryExecutionListener. When creating " +
"a SparkSession, instances of these listeners will be added to it. These classes " +
"needs to have a zero-argument constructor. If the specified class can't be found or" +
Member: Nit: needs -> need

" the class specified doesn't have a valid constructor the SparkSession creation " +
Member: the class specified -> the specified class

Member: Nit: please move the starting space to the end of the last line.

"will fail with an exception.")
.stringConf
.toSequence
.createWithDefault(Nil)
Member: Not sure whether we should make it internal or external. Let the others decide it. Either is fine to me.

}
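
For context, setting the new option could look like this (the listener class name is hypothetical); note that, as wired up in SparkSession.getOrCreate above, the value is read from the SparkContext's SparkConf, so it needs to be present in that configuration:

    // Via spark-submit:
    //   spark-submit --conf spark.sql.queryExecutionListeners=com.example.AuditListener ...

    // Or programmatically, before any SparkContext/SparkSession exists:
    import org.apache.spark.SparkConf
    import org.apache.spark.sql.SparkSession

    val conf = new SparkConf()
      .set("spark.sql.queryExecutionListeners", "com.example.AuditListener")
    val spark = SparkSession.builder().config(conf).getOrCreate()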
@@ -44,27 +44,50 @@ trait QueryExecutionListener {
* @param qe the QueryExecution object that carries detail information like logical plan,
* physical plan, etc.
* @param durationNs the execution time for this query in nanoseconds.
*
* @note This can be invoked by multiple different threads.
* @param outputParams The output parameters in case the method is invoked as a result of a
* write operation. In case of a read, this will be `None`.
*/
@DeveloperApi
def onSuccess(funcName: String, qe: QueryExecution, durationNs: Long): Unit

def onSuccess(
funcName: String,
qe: QueryExecution,
durationNs: Long,
outputParams: Option[OutputParams]): Unit
/**
* A callback function that will be called when a query execution failed.
*
* @param funcName the name of the action that triggered this query.
* @param qe the QueryExecution object that carries detail information like logical plan,
* physical plan, etc.
* @param exception the exception that failed this query.
* @param outputParams The output parameters in case the method is invoked as a result of a
* write operation. In case of a read, this will be `None`.
*
* @note This can be invoked by multiple different threads.
*/
@DeveloperApi
def onFailure(funcName: String, qe: QueryExecution, exception: Exception): Unit
def onFailure(
funcName: String,
qe: QueryExecution,
exception: Exception,
outputParams: Option[OutputParams]): Unit
}
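
As an illustration of the changed signature (a sketch only; `spark` is assumed to be an existing SparkSession), a listener can also be registered programmatically through the session's listener manager. With the changes in this PR, a write such as `df.write.format("parquet").save("/tmp/out")` is expected to invoke onSuccess with funcName "save" and an OutputParams carrying the parquet destination.

    // Sketch: programmatic registration using the four-argument callbacks from this PR.
    import scala.collection.mutable.ArrayBuffer

    import org.apache.spark.sql.execution.QueryExecution
    import org.apache.spark.sql.util.{OutputParams, QueryExecutionListener}

    val reported = ArrayBuffer.empty[String]

    spark.listenerManager.register(new QueryExecutionListener {
      override def onSuccess(
          funcName: String,
          qe: QueryExecution,
          durationNs: Long,
          outputParams: Option[OutputParams]): Unit = {
        reported += s"$funcName -> ${outputParams.map(_.datasourceType).getOrElse("n/a")}"
      }

      override def onFailure(
          funcName: String,
          qe: QueryExecution,
          exception: Exception,
          outputParams: Option[OutputParams]): Unit = {
        reported += s"$funcName failed"
      }
    })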


/**
* Contains extra information useful for query analysis passed on from the methods in
* @see `org.apache.spark.sql.DataFrameWriter` while writing to a datasource
* @param datasourceType type of data source written to like csv, parquet, json, hive, jdbc etc.
* @param destination path or table name written to
* @param options the map containing the output options for the underlying datasource
* specified by using the @see `org.apache.spark.sql.DataFrameWriter#option` method
* @param writeParams will contain any extra information that the write method wants to provide
*/
@DeveloperApi
case class OutputParams(
Contributor: It looks reasonable to provide more information to the listeners for write operations. However, this will be public, I think we should think about it more carefully to get a better design, can we do it later?

Contributor: Sorry arguments to this class seem to have been picked pretty randomly. Can you explain more why these parameters are picked?

datasourceType: String,
destination: Option[String],
options: Map[String, String],
writeParams: Map[String, String] = Map.empty)
/**
* :: Experimental ::
*
@@ -98,18 +121,26 @@ class ExecutionListenerManager private[sql] () extends Logging {
listeners.clear()
}

private[sql] def onSuccess(funcName: String, qe: QueryExecution, duration: Long): Unit = {
private[sql] def onSuccess(
funcName: String,
qe: QueryExecution,
duration: Long,
outputParams: Option[OutputParams] = None): Unit = {
readLock {
withErrorHandling { listener =>
listener.onSuccess(funcName, qe, duration)
listener.onSuccess(funcName, qe, duration, outputParams)
}
}
}

private[sql] def onFailure(funcName: String, qe: QueryExecution, exception: Exception): Unit = {
private[sql] def onFailure(
funcName: String,
qe: QueryExecution,
exception: Exception,
outputParams: Option[OutputParams] = None): Unit = {
readLock {
withErrorHandling { listener =>
listener.onFailure(funcName, qe, exception)
listener.onFailure(funcName, qe, exception, outputParams)
}
}
}