
Commit 2ffa817

ulysses-you authored and cloud-fan committed

[SPARK-41407][SQL] Pull out v1 write to WriteFiles
### What changes were proposed in this pull request?

This PR pulls the details of v1 write files out into a new operator: `WriteFiles` (logical) and `WriteFilesExec` (physical). This lets us make v1 write files support whole-stage codegen in the future.

It introduces `WriteFilesSpec` to hold all v1 write files information:

```scala
case class WriteFilesSpec(
    description: WriteJobDescription,
    committer: FileCommitProtocol,
    concurrentOutputWriterSpecFunc: SparkPlan => Option[ConcurrentOutputWriterSpec])
  extends WriteSpec
```

To stay compatible with the existing code path, this PR adds a new method `executeWrite` to `SparkPlan`:

```scala
def executeWrite(writeSpec: WriteSpec): RDD[WriterCommitMessage]
```

It also refactors `FileFormatWriter` to make the write-files flow clearer:

- execute the write using the old code path
- execute the write using `SparkPlan.executeWrite`
- extract a `writeAndCommit` method that works with both code paths

### Why are the changes needed?

This is preparation work for supporting whole-stage codegen in v1 writes.

### Does this PR introduce _any_ user-facing change?

For users, no. For developers, yes:

- adds a new method `executeWrite` to `SparkPlan`
- adds a new interface `WriteSpec`

### How was this patch tested?

Passes CI with `spark.sql.optimizer.plannedWrite.enabled` both on and off.

Closes #38939 from ulysses-you/v1write-plan.

Authored-by: ulysses-you <ulyssesyou18@gmail.com>
Signed-off-by: Wenchen Fan <wenchen@databricks.com>
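For orientation, here is a minimal driver-side sketch of the new code path described above. It is not code from this commit: the import paths and the surrounding values (`description`, `committer`, `physicalPlan`) are assumed to be the ones `FileFormatWriter` already has in scope, and the concurrent-writer function is stubbed out.

```scala
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.connector.write.WriterCommitMessage
import org.apache.spark.sql.execution.SparkPlan
import org.apache.spark.sql.execution.datasources.WriteFilesSpec

// Sketch only: bundle the v1 write information into the new spec...
val writeSpec = WriteFilesSpec(
  description = description,   // WriteJobDescription for this write job
  committer = committer,        // FileCommitProtocol that commits the job
  concurrentOutputWriterSpecFunc = (_: SparkPlan) => None) // stubbed in this sketch

// ...and let the physical plan (which contains a WriteFilesExec node) run the
// write, returning one WriterCommitMessage per partition.
val messages: RDD[WriterCommitMessage] = physicalPlan.executeWrite(writeSpec)
```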
1 parent a1c727f commit 2ffa817

File tree: 11 files changed, +334 −53 lines

core/src/main/resources/error/error-classes.json

Lines changed: 1 addition & 1 deletion
```diff
@@ -3693,7 +3693,7 @@
   },
   "_LEGACY_ERROR_TEMP_2054" : {
     "message" : [
-      "Task failed while writing rows."
+      "Task failed while writing rows. <message>"
     ]
   },
   "_LEGACY_ERROR_TEMP_2055" : {
```

sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryExecutionErrors.scala

Lines changed: 1 addition & 1 deletion
```diff
@@ -785,7 +785,7 @@ private[sql] object QueryExecutionErrors extends QueryErrorsBase {
   def taskFailedWhileWritingRowsError(cause: Throwable): Throwable = {
     new SparkException(
       errorClass = "_LEGACY_ERROR_TEMP_2054",
-      messageParameters = Map.empty,
+      messageParameters = Map("message" -> cause.getMessage),
       cause = cause)
   }
```

WriteSpec.java (new file; package org.apache.spark.sql.internal)

Lines changed: 33 additions & 0 deletions

```java
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.spark.sql.internal;

import java.io.Serializable;

/**
 * Write spec is a input parameter of
 * {@link org.apache.spark.sql.execution.SparkPlan#executeWrite}.
 *
 * <p>
 * This is an empty interface, the concrete class which implements
 * {@link org.apache.spark.sql.execution.SparkPlan#doExecuteWrite}
 * should define its own class and use it.
 *
 * @since 3.4.0
 */
public interface WriteSpec extends Serializable {}
```
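Since `WriteSpec` is just a serializable marker, each operator that overrides `doExecuteWrite` pairs it with its own concrete spec (in this commit, the `WriteFilesSpec` shown in the description above). A hypothetical sketch of what such a spec can look like; the name and fields below are illustrative only:

```scala
import org.apache.spark.sql.internal.WriteSpec

// Hypothetical spec for some custom write node: a case class is already
// Serializable, so extending the marker interface is all that is required.
// Real specs (e.g. WriteFilesSpec) carry the WriteJobDescription, the
// FileCommitProtocol and the concurrent-writer function instead.
case class SimpleTextWriteSpec(
    outputPath: String,
    compress: Boolean) extends WriteSpec
```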

sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlan.scala

Lines changed: 25 additions & 1 deletion
```diff
@@ -34,9 +34,10 @@ import org.apache.spark.sql.catalyst.plans.QueryPlan
 import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
 import org.apache.spark.sql.catalyst.plans.physical._
 import org.apache.spark.sql.catalyst.trees.{BinaryLike, LeafLike, TreeNodeTag, UnaryLike}
+import org.apache.spark.sql.connector.write.WriterCommitMessage
 import org.apache.spark.sql.errors.QueryExecutionErrors
 import org.apache.spark.sql.execution.metric.SQLMetric
-import org.apache.spark.sql.internal.SQLConf
+import org.apache.spark.sql.internal.{SQLConf, WriteSpec}
 import org.apache.spark.sql.vectorized.ColumnarBatch
 import org.apache.spark.util.NextIterator
 import org.apache.spark.util.io.{ChunkedByteBuffer, ChunkedByteBufferOutputStream}
@@ -223,6 +224,19 @@ abstract class SparkPlan extends QueryPlan[SparkPlan] with Logging with Serializ
     doExecuteColumnar()
   }

+  /**
+   * Returns the result of writes as an RDD[WriterCommitMessage] variable by delegating to
+   * `doExecuteWrite` after preparations.
+   *
+   * Concrete implementations of SparkPlan should override `doExecuteWrite`.
+   */
+  def executeWrite(writeSpec: WriteSpec): RDD[WriterCommitMessage] = executeQuery {
+    if (isCanonicalizedPlan) {
+      throw SparkException.internalError("A canonicalized plan is not supposed to be executed.")
+    }
+    doExecuteWrite(writeSpec)
+  }
+
   /**
    * Executes a query after preparing the query and adding query plan information to created RDDs
    * for visualization.
@@ -324,6 +338,16 @@ abstract class SparkPlan extends QueryPlan[SparkPlan] with Logging with Serializ
       s" mismatch:\n${this}")
   }

+  /**
+   * Produces the result of the writes as an `RDD[WriterCommitMessage]`
+   *
+   * Overridden by concrete implementations of SparkPlan.
+   */
+  protected def doExecuteWrite(writeSpec: WriteSpec): RDD[WriterCommitMessage] = {
+    throw SparkException.internalError(s"Internal Error ${this.getClass} has write support" +
+      s" mismatch:\n${this}")
+  }
+
   /**
    * Converts the output of this plan to row-based if it is columnar plan.
    */
```
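A physical operator opts into this API by overriding `doExecuteWrite`; callers always go through `executeWrite`, which adds the canonicalized-plan check and the usual `executeQuery` preparation. Below is a hedged sketch of a hypothetical write-only node (the commit's real implementation is `WriteFilesExec`, which is not shown in this diff; the node and its body here are illustrative only):

```scala
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.Attribute
import org.apache.spark.sql.connector.write.WriterCommitMessage
import org.apache.spark.sql.execution.{SparkPlan, UnaryExecNode}
import org.apache.spark.sql.internal.WriteSpec

// Minimal serializable commit message for the sketch.
case class ExampleCommitMessage(rowCount: Long) extends WriterCommitMessage

// Hypothetical write-only node: rows flow in from `child`, commit messages
// flow out. Row-based execution is intentionally unsupported.
case class ExampleWriteExec(child: SparkPlan) extends UnaryExecNode {
  override def output: Seq[Attribute] = Seq.empty

  override protected def doExecute(): RDD[InternalRow] =
    throw new UnsupportedOperationException(s"$nodeName only supports executeWrite")

  override protected def doExecuteWrite(writeSpec: WriteSpec): RDD[WriterCommitMessage] = {
    child.execute().mapPartitions { rows =>
      // A real node would interpret `writeSpec` and write `rows` out; this
      // sketch only counts them and reports one message per partition.
      Iterator.single(ExampleCommitMessage(rows.size.toLong))
    }
  }

  override protected def withNewChildInternal(newChild: SparkPlan): SparkPlan =
    copy(child = newChild)
}
```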

sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala

Lines changed: 3 additions & 0 deletions
```diff
@@ -34,6 +34,7 @@ import org.apache.spark.sql.errors.{QueryCompilationErrors, QueryExecutionErrors
 import org.apache.spark.sql.execution.aggregate.AggUtils
 import org.apache.spark.sql.execution.columnar.{InMemoryRelation, InMemoryTableScanExec}
 import org.apache.spark.sql.execution.command._
+import org.apache.spark.sql.execution.datasources.{WriteFiles, WriteFilesExec}
 import org.apache.spark.sql.execution.exchange.{REBALANCE_PARTITIONS_BY_COL, REBALANCE_PARTITIONS_BY_NONE, REPARTITION_BY_COL, REPARTITION_BY_NUM, ShuffleExchangeExec}
 import org.apache.spark.sql.execution.python._
 import org.apache.spark.sql.execution.streaming._
@@ -894,6 +895,8 @@ abstract class SparkStrategies extends QueryPlanner[SparkPlan] {
         throw QueryExecutionErrors.ddlUnsupportedTemporarilyError("MERGE INTO TABLE")
       case logical.CollectMetrics(name, metrics, child) =>
         execution.CollectMetricsExec(name, metrics, planLater(child)) :: Nil
+      case WriteFiles(child) =>
+        WriteFilesExec(planLater(child)) :: Nil
       case _ => Nil
     }
   }
```

sql/core/src/main/scala/org/apache/spark/sql/execution/command/createDataSourceTables.scala

Lines changed: 6 additions & 0 deletions
```diff
@@ -145,6 +145,12 @@ case class CreateDataSourceTableAsSelectCommand(
     outputColumnNames: Seq[String])
   extends V1WriteCommand {

+  override def fileFormatProvider: Boolean = {
+    table.provider.forall { provider =>
+      classOf[FileFormat].isAssignableFrom(DataSource.providingClass(provider, conf))
+    }
+  }
+
   override lazy val partitionColumns: Seq[Attribute] = {
     val unresolvedPartitionColumns = table.partitionColumnNames.map(UnresolvedAttribute.quoted)
     DataSource.resolvePartitionColumns(
```

sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala

Lines changed: 16 additions & 13 deletions
```diff
@@ -97,19 +97,8 @@ case class DataSource(

   case class SourceInfo(name: String, schema: StructType, partitionColumns: Seq[String])

-  lazy val providingClass: Class[_] = {
-    val cls = DataSource.lookupDataSource(className, sparkSession.sessionState.conf)
-    // `providingClass` is used for resolving data source relation for catalog tables.
-    // As now catalog for data source V2 is under development, here we fall back all the
-    // [[FileDataSourceV2]] to [[FileFormat]] to guarantee the current catalog works.
-    // [[FileDataSourceV2]] will still be used if we call the load()/save() method in
-    // [[DataFrameReader]]/[[DataFrameWriter]], since they use method `lookupDataSource`
-    // instead of `providingClass`.
-    cls.newInstance() match {
-      case f: FileDataSourceV2 => f.fallbackFileFormat
-      case _ => cls
-    }
-  }
+  lazy val providingClass: Class[_] =
+    DataSource.providingClass(className, sparkSession.sessionState.conf)

   private[sql] def providingInstance(): Any = providingClass.getConstructor().newInstance()

@@ -843,4 +832,18 @@ object DataSource extends Logging {
       }
     }
   }
+
+  def providingClass(className: String, conf: SQLConf): Class[_] = {
+    val cls = DataSource.lookupDataSource(className, conf)
+    // `providingClass` is used for resolving data source relation for catalog tables.
+    // As now catalog for data source V2 is under development, here we fall back all the
+    // [[FileDataSourceV2]] to [[FileFormat]] to guarantee the current catalog works.
+    // [[FileDataSourceV2]] will still be used if we call the load()/save() method in
+    // [[DataFrameReader]]/[[DataFrameWriter]], since they use method `lookupDataSource`
+    // instead of `providingClass`.
+    cls.newInstance() match {
+      case f: FileDataSourceV2 => f.fallbackFileFormat
+      case _ => cls
+    }
+  }
 }
```

sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileFormatWriter.scala

Lines changed: 122 additions & 32 deletions
```diff
@@ -36,6 +36,7 @@ import org.apache.spark.sql.catalyst.catalog.CatalogTypes.TablePartitionSpec
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.expressions.BindReferences.bindReferences
 import org.apache.spark.sql.catalyst.util.{CaseInsensitiveMap, DateTimeUtils}
+import org.apache.spark.sql.connector.write.WriterCommitMessage
 import org.apache.spark.sql.errors.QueryExecutionErrors
 import org.apache.spark.sql.execution.{ProjectExec, SortExec, SparkPlan, SQLExecution, UnsafeExternalRowSorter}
 import org.apache.spark.sql.internal.SQLConf
@@ -103,14 +104,6 @@
         .map(FileSourceMetadataAttribute.cleanupFileSourceMetadataInformation))
     val dataColumns = finalOutputSpec.outputColumns.filterNot(partitionSet.contains)

-    val hasEmpty2Null = plan.exists(p => V1WritesUtils.hasEmptyToNull(p.expressions))
-    val empty2NullPlan = if (hasEmpty2Null) {
-      plan
-    } else {
-      val projectList = V1WritesUtils.convertEmptyToNull(plan.output, partitionColumns)
-      if (projectList.nonEmpty) ProjectExec(projectList, plan) else plan
-    }
-
     val writerBucketSpec = V1WritesUtils.getWriterBucketSpec(bucketSpec, dataColumns, options)
     val sortColumns = V1WritesUtils.getBucketSortColumns(bucketSpec, dataColumns)

@@ -144,9 +137,10 @@
     // columns.
     val requiredOrdering = partitionColumns.drop(numStaticPartitionCols) ++
       writerBucketSpec.map(_.bucketIdExpression) ++ sortColumns
+    val writeFilesOpt = V1WritesUtils.getWriteFilesOpt(plan)
     // the sort order doesn't matter
     // Use the output ordering from the original plan before adding the empty2null projection.
-    val actualOrdering = plan.outputOrdering.map(_.child)
+    val actualOrdering = writeFilesOpt.map(_.child).getOrElse(plan).outputOrdering.map(_.child)
     val orderingMatched = V1WritesUtils.isOrderingMatched(requiredOrdering, actualOrdering)

     SQLExecution.checkSQLExecutionId(sparkSession)
@@ -155,10 +149,6 @@
     // get an ID guaranteed to be unique.
     job.getConfiguration.set("spark.sql.sources.writeJobUUID", description.uuid)

-    // This call shouldn't be put into the `try` block below because it only initializes and
-    // prepares the job, any exception thrown from here shouldn't cause abortJob() to be called.
-    committer.setupJob(job)
-
     // When `PLANNED_WRITE_ENABLED` is true, the optimizer rule V1Writes will add logical sort
     // operator based on the required ordering of the V1 write command. So the output
     // ordering of the physical plan should always match the required ordering. Here
@@ -169,27 +159,55 @@
     // V1 write command will be empty).
     if (Utils.isTesting) outputOrderingMatched = orderingMatched

-    try {
+    if (writeFilesOpt.isDefined) {
+      // build `WriteFilesSpec` for `WriteFiles`
+      val concurrentOutputWriterSpecFunc = (plan: SparkPlan) => {
+        val sortPlan = createSortPlan(plan, requiredOrdering, outputSpec)
+        createConcurrentOutputWriterSpec(sparkSession, sortPlan, sortColumns)
+      }
+      val writeSpec = WriteFilesSpec(
+        description = description,
+        committer = committer,
+        concurrentOutputWriterSpecFunc = concurrentOutputWriterSpecFunc
+      )
+      executeWrite(sparkSession, plan, writeSpec, job)
+    } else {
+      executeWrite(sparkSession, plan, job, description, committer, outputSpec,
+        requiredOrdering, partitionColumns, sortColumns, orderingMatched)
+    }
+  }
+  // scalastyle:on argcount
+
+  private def executeWrite(
+      sparkSession: SparkSession,
+      plan: SparkPlan,
+      job: Job,
+      description: WriteJobDescription,
+      committer: FileCommitProtocol,
+      outputSpec: OutputSpec,
+      requiredOrdering: Seq[Expression],
+      partitionColumns: Seq[Attribute],
+      sortColumns: Seq[Attribute],
+      orderingMatched: Boolean): Set[String] = {
+    val hasEmpty2Null = plan.exists(p => V1WritesUtils.hasEmptyToNull(p.expressions))
+    val empty2NullPlan = if (hasEmpty2Null) {
+      plan
+    } else {
+      val projectList = V1WritesUtils.convertEmptyToNull(plan.output, partitionColumns)
+      if (projectList.nonEmpty) ProjectExec(projectList, plan) else plan
+    }
+
+    writeAndCommit(job, description, committer) {
       val (rdd, concurrentOutputWriterSpec) = if (orderingMatched) {
         (empty2NullPlan.execute(), None)
       } else {
-        // SPARK-21165: the `requiredOrdering` is based on the attributes from analyzed plan, and
-        // the physical plan may have different attribute ids due to optimizer removing some
-        // aliases. Here we bind the expression ahead to avoid potential attribute ids mismatch.
-        val orderingExpr = bindReferences(
-          requiredOrdering.map(SortOrder(_, Ascending)), finalOutputSpec.outputColumns)
-        val sortPlan = SortExec(
-          orderingExpr,
-          global = false,
-          child = empty2NullPlan)
-
-        val maxWriters = sparkSession.sessionState.conf.maxConcurrentOutputFileWriters
-        val concurrentWritersEnabled = maxWriters > 0 && sortColumns.isEmpty
-        if (concurrentWritersEnabled) {
-          (empty2NullPlan.execute(),
-            Some(ConcurrentOutputWriterSpec(maxWriters, () => sortPlan.createSorter())))
+        val sortPlan = createSortPlan(empty2NullPlan, requiredOrdering, outputSpec)
+        val concurrentOutputWriterSpec = createConcurrentOutputWriterSpec(
+          sparkSession, sortPlan, sortColumns)
+        if (concurrentOutputWriterSpec.isDefined) {
+          (empty2NullPlan.execute(), concurrentOutputWriterSpec)
         } else {
-          (sortPlan.execute(), None)
+          (sortPlan.execute(), concurrentOutputWriterSpec)
         }
       }

@@ -221,7 +239,19 @@
         committer.onTaskCommit(res.commitMsg)
         ret(index) = res
       })
+      ret
+    }
+  }

+  private def writeAndCommit(
+      job: Job,
+      description: WriteJobDescription,
+      committer: FileCommitProtocol)(f: => Array[WriteTaskResult]): Set[String] = {
+    // This call shouldn't be put into the `try` block below because it only initializes and
+    // prepares the job, any exception thrown from here shouldn't cause abortJob() to be called.
+    committer.setupJob(job)
+    try {
+      val ret = f
       val commitMsgs = ret.map(_.commitMsg)

       logInfo(s"Start to commit write Job ${description.uuid}.")
@@ -239,10 +269,70 @@
       throw cause
     }
   }
-  // scalastyle:on argcount
+
+  /**
+   * Write files using [[SparkPlan.executeWrite]]
+   */
+  private def executeWrite(
+      session: SparkSession,
+      planForWrites: SparkPlan,
+      writeFilesSpec: WriteFilesSpec,
+      job: Job): Set[String] = {
+    val committer = writeFilesSpec.committer
+    val description = writeFilesSpec.description
+
+    writeAndCommit(job, description, committer) {
+      val rdd = planForWrites.executeWrite(writeFilesSpec)
+      val ret = new Array[WriteTaskResult](rdd.partitions.length)
+      session.sparkContext.runJob(
+        rdd,
+        (context: TaskContext, iter: Iterator[WriterCommitMessage]) => {
+          assert(iter.hasNext)
+          val commitMessage = iter.next()
+          assert(!iter.hasNext)
+          commitMessage
+        },
+        rdd.partitions.indices,
+        (index, res: WriterCommitMessage) => {
+          assert(res.isInstanceOf[WriteTaskResult])
+          val writeTaskResult = res.asInstanceOf[WriteTaskResult]
+          committer.onTaskCommit(writeTaskResult.commitMsg)
+          ret(index) = writeTaskResult
+        })
+      ret
+    }
+  }
+
+  private def createSortPlan(
+      plan: SparkPlan,
+      requiredOrdering: Seq[Expression],
+      outputSpec: OutputSpec): SortExec = {
+    // SPARK-21165: the `requiredOrdering` is based on the attributes from analyzed plan, and
+    // the physical plan may have different attribute ids due to optimizer removing some
+    // aliases. Here we bind the expression ahead to avoid potential attribute ids mismatch.
+    val orderingExpr = bindReferences(
+      requiredOrdering.map(SortOrder(_, Ascending)), outputSpec.outputColumns)
+    SortExec(
+      orderingExpr,
+      global = false,
+      child = plan)
+  }
+
+  private def createConcurrentOutputWriterSpec(
+      sparkSession: SparkSession,
+      sortPlan: SortExec,
+      sortColumns: Seq[Attribute]): Option[ConcurrentOutputWriterSpec] = {
+    val maxWriters = sparkSession.sessionState.conf.maxConcurrentOutputFileWriters
+    val concurrentWritersEnabled = maxWriters > 0 && sortColumns.isEmpty
+    if (concurrentWritersEnabled) {
+      Some(ConcurrentOutputWriterSpec(maxWriters, () => sortPlan.createSorter()))
+    } else {
+      None
+    }
+  }

   /** Writes data out in a single Spark task. */
-  private def executeTask(
+  private[spark] def executeTask(
       description: WriteJobDescription,
       jobTrackerID: String,
       sparkStageId: Int,
```
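The extracted `writeAndCommit(job, description, committer)(f)` above uses a curried by-name block so that both write paths share the same setup/commit/abort lifecycle. Here is a standalone illustration of that pattern in plain Scala (generic names, not Spark API):

```scala
// Generic "setup, run body, commit or abort" helper, mirroring the shape of
// writeAndCommit: the by-name `body` is only evaluated inside the try block,
// while setup stays outside so its failures never trigger abort.
def runAndCommit[T](setup: () => Unit, commit: Seq[T] => Unit, abort: () => Unit)
    (body: => Seq[T]): Seq[T] = {
  setup()
  try {
    val results = body
    commit(results)
    results
  } catch {
    case t: Throwable =>
      abort()
      throw t
  }
}

// Both the classic path and the planned-write path can supply their own body.
val committed = runAndCommit[Int](
  setup = () => println("setupJob"),
  commit = rs => println(s"commitJob: ${rs.size} task results"),
  abort = () => println("abortJob"))(Seq(1, 2, 3))
```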
