Skip to content

Commit 0ebdf97

Browse files
fenzhu (GitHub Enterprise)
authored and committed
[CARMEL-7479][CARMEL-6358] Change to file commit algorithm V2 for CTAS (apache#238)
1 parent 1c2d02d commit 0ebdf97

File tree

3 files changed

+38
-0
lines changed

3 files changed

+38
-0
lines changed

core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -197,7 +197,9 @@ class HadoopMapReduceCommitProtocol(
197197
}
198198

199199
override def commitJob(jobContext: JobContext, taskCommits: Seq[TaskCommitMessage]): Unit = {
200+
logInfo("Start to commit job ......")
200201
committer.commitJob(jobContext)
202+
logInfo("Commit job finished!")
201203

202204
if (hasValidPath) {
203205
val (allAbsPathFiles, allPartitionPaths, _) =

sql/core/src/main/scala/org/apache/spark/sql/execution/command/createDataSourceTables.scala

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,8 @@ package org.apache.spark.sql.execution.command
1919

2020
import java.net.URI
2121

22+
import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter
23+
2224
import org.apache.spark.sql._
2325
import org.apache.spark.sql.catalyst.catalog._
2426
import org.apache.spark.sql.catalyst.plans.logical.{CTEInChildren, CTERelationDef, LogicalPlan, WithCTE}
@@ -167,6 +169,8 @@ case class CreateDataSourceTableAsSelectCommand(
167169
DDLUtils.verifyOperationNotSupported(table, "Create partitioned table")
168170
}
169171
DDLUtils.checkPrivilegeOfSpecifyTableLocation(table, sessionState)
172+
val originalFileOutputCommitterAlgorithm = sessionState.conf.getConfString(
173+
FileOutputCommitter.FILEOUTPUTCOMMITTER_ALGORITHM_VERSION, "1")
170174

171175
if (sessionState.catalog.tableExists(table)) {
172176
assert(mode != SaveMode.Overwrite,
@@ -193,6 +197,11 @@ case class CreateDataSourceTableAsSelectCommand(
193197
} else {
194198
table.storage.locationUri
195199
}
200+
if (!originalFileOutputCommitterAlgorithm.equals("2")) {
201+
sessionState.conf.setConfString(
202+
FileOutputCommitter.FILEOUTPUTCOMMITTER_ALGORITHM_VERSION, "2")
203+
logInfo("Set file output committer algorithm as version 2 when CTAS")
204+
}
196205
val result = saveDataIntoTable(
197206
sparkSession, table, tableLocation, SaveMode.Overwrite, tableExists = false)
198207
val tableSchema = CharVarcharUtils.getRawSchema(
@@ -216,6 +225,13 @@ case class CreateDataSourceTableAsSelectCommand(
216225
enableDropPartitions = false), CommandExecutionMode.SKIP).toRdd
217226
case _ =>
218227
}
228+
if (!originalFileOutputCommitterAlgorithm.equals("2")) {
229+
sessionState.conf.setConfString(
230+
FileOutputCommitter.FILEOUTPUTCOMMITTER_ALGORITHM_VERSION,
231+
originalFileOutputCommitterAlgorithm)
232+
logInfo(s"Set file output committer algorithm " +
233+
s"back to version $originalFileOutputCommitterAlgorithm")
234+
}
219235
}
220236

221237
CommandUtils.updateTableStats(sparkSession, table)

sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/CreateHiveTableAsSelectCommand.scala

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,8 @@ package org.apache.spark.sql.hive.execution
1919

2020
import scala.util.control.NonFatal
2121

22+
import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter
23+
2224
import org.apache.spark.sql.{Row, SaveMode, SparkSession}
2325
import org.apache.spark.sql.catalyst.catalog.CatalogTable
2426
import org.apache.spark.sql.catalyst.plans.logical.{CommandResult, CTEInChildren, CTERelationDef, LogicalPlan, WithCTE}
@@ -91,7 +93,15 @@ case class CreateHiveTableAsSelectCommand(
9193
sparkSession.sessionState.catalog.validateTableLocation(tableDesc)
9294
catalog.createTable(tableDesc.copy(schema = tableSchema), ignoreIfExists = false)
9395

96+
val originalFileOutputCommitterAlgorithm = sparkSession.sessionState.conf.getConfString(
97+
FileOutputCommitter.FILEOUTPUTCOMMITTER_ALGORITHM_VERSION, "1")
98+
9499
try {
100+
if (!originalFileOutputCommitterAlgorithm.equals("2")) {
101+
sparkSession.sessionState.conf.setConfString(
102+
FileOutputCommitter.FILEOUTPUTCOMMITTER_ALGORITHM_VERSION, "2")
103+
logInfo("Set file output committer algorithm as version 2 when CTAS for Hive table")
104+
}
95105
// Read back the metadata of the table which was created just now.
96106
val createdTableMeta = catalog.getTableMetadata(tableDesc.identifier)
97107
val command = getWritingCommand(createdTableMeta, tableExists = false)
@@ -108,6 +118,16 @@ case class CreateHiveTableAsSelectCommand(
108118
// drop the created table.
109119
catalog.dropTable(tableIdentifier, ignoreIfNotExists = true, purge = false)
110120
throw e
121+
} finally {
122+
val currentFileOutputCommitterAlgorithm = sparkSession.sessionState.conf.getConfString(
123+
FileOutputCommitter.FILEOUTPUTCOMMITTER_ALGORITHM_VERSION, "1")
124+
if (!currentFileOutputCommitterAlgorithm.equals(originalFileOutputCommitterAlgorithm)) {
125+
sparkSession.sessionState.conf.setConfString(
126+
FileOutputCommitter.FILEOUTPUTCOMMITTER_ALGORITHM_VERSION,
127+
originalFileOutputCommitterAlgorithm)
128+
logInfo(s"Set file output committer algorithm " +
129+
s"back to version $originalFileOutputCommitterAlgorithm")
130+
}
111131
}
112132
}
113133

0 commit comments

Comments
 (0)