/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.spark.sql.execution.command

import scala.collection.mutable.ArrayBuffer

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.hadoop.tools.{DistCp, DistCpOptions}

import org.apache.spark.SparkException
import org.apache.spark.sql.{AnalysisException, Row, SparkSession}
import org.apache.spark.sql.catalyst.TableIdentifier
import org.apache.spark.sql.catalyst.catalog.CatalogTableType
import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference}
import org.apache.spark.sql.types.StringType

/**
 * A command that backs up a source table to a target table by copying the source
 * table's data with DistCp and re-creating its metadata under the new identifier.
 * The syntax of this command in SQL is:
 * {{{
 *   BACKUP TABLE db1.tb1 TO db2.tb2
 * }}}
 */
case class BackupTableCommand(
    sourceTableName: TableIdentifier,
    targetTableName: TableIdentifier) extends RunnableCommand {

  override val output: Seq[Attribute] = {
    AttributeReference("Backup Table Operations", StringType, nullable = false)() ::
      AttributeReference("Result", StringType, nullable = false)() :: Nil
  }

  override def run(sparkSession: SparkSession): Seq[Row] = {
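    // Restrict the command to queues whose name contains "reserved", "test"
    // or "staging".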
    val currentQueue = sparkSession.sparkContext.conf.get("spark.yarn.queue", "")
    if (!currentQueue.contains("reserved")
        && !currentQueue.contains("test")
        && !currentQueue.contains("staging")) {
      throw new SparkException("BACKUP TABLE command can only be executed in a queue " +
        s"whose name contains 'reserved', 'test' or 'staging'; current queue is $currentQueue.")
    }
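    // Preconditions: the target table must not exist yet, and the source must be
    // a regular table rather than a view or a temporary table.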
    val result = new ArrayBuffer[Row]
    val catalog = sparkSession.sessionState.catalog
    if (catalog.tableExists(targetTableName)) {
      throw new AnalysisException(s"Target table ${targetTableName.quotedString} already exists!")
    }
    val sourceTable = catalog.getTableMetadata(sourceTableName)
    val sourceTablePath = new Path(sourceTable.location)
    val sourceTableIdentWithDB = sourceTableName.quotedString
    if (sourceTable.tableType == CatalogTableType.VIEW) {
      throw new AnalysisException(
        s"BACKUP TABLE command is not allowed on a view: $sourceTableIdentWithDB")
    }
    if (catalog.isTemporaryTable(sourceTableName)) {
      throw new AnalysisException(
        s"BACKUP TABLE command is not allowed on a temporary table: $sourceTableIdentWithDB")
    }

    val conf = sparkSession.sessionState.newHadoopConf()

    val targetTableLoc = catalog.defaultTablePath(targetTableName)
    val targetTablePath = new Path(targetTableLoc)
    val fs = targetTablePath.getFileSystem(conf)
    val qualifiedTargetPath = fs.makeQualified(targetTablePath)
    if (fs.exists(qualifiedTargetPath)) {
      throw new AnalysisException(
        s"Default target table path already exists: $targetTablePath")
    }

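    // Step 1: copy the source table's files to the target location with DistCp.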
    val (copied, copyMessage) = runDistCp(fs, currentQueue,
      java.util.Arrays.asList(sourceTablePath), qualifiedTargetPath, conf)
    val step1 = s"Step 1: DistCp $sourceTablePath to $qualifiedTargetPath"
    result.append(Row(step1, copyMessage))
    if (!copied) {
      return result
    }

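    // Step 2: register the target table in the catalog, reusing the source table's
    // definition but pointing its storage at the newly copied location.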
    val ddlProp = Map(DDLUtils.DDL_TIME -> (System.currentTimeMillis() / 1000).toString,
      "backup_table_source" -> sourceTableName.unquotedString)
    val newTableDesc = sourceTable
      .copy(identifier = targetTableName,
        createTime = System.currentTimeMillis,
        properties = sourceTable.properties ++ ddlProp)
      .withNewStorage(locationUri = Some(targetTableLoc))
    // Table location is already validated. No need to check it again during table creation.
    catalog.createTable(newTableDesc, ignoreIfExists = false, validateLocation = false)

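    // Verify the registration; if it failed, remove the data copied in Step 1.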
    val step2 = s"Step 2: Create target table ${targetTableName.unquotedString}"
    if (catalog.tableExists(targetTableName)) {
      result.append(Row(step2, "Success"))
    } else {
      result.append(Row(step2, "Failed"))
      deleteDirectory(fs, qualifiedTargetPath)
      return result
    }

    if (sourceTable.partitionColumnNames.nonEmpty) {
      val targetTable = catalog.getTableMetadata(targetTableName)
      try {
        // Need to recover partitions into the metastore so our saved data is visible.
        sparkSession.sessionState.executePlan(
          AlterTableRecoverPartitionsCommand(targetTable.identifier)).toRdd
        result.append(Row("Step 2 (repair table partitions)", "Success"))
      } catch {
        case e: Exception =>
          result.append(Row("Step 2 (repair table partitions)", s"Failed (${e.toString})"))
          catalog.dropTable(targetTableName, ignoreIfNotExists = true, purge = true)
          return result
      }
    }

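    // Step 3: sanity-check the copy by comparing source and target row counts.
    // The check can be skipped via spark.sql.backup.table.validation.skip.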
    val step3 = "Step 3: Validate row count"
    val skipValidation = sparkSession.sessionState.conf.getConfString(
      "spark.sql.backup.table.validation.skip", "false").toBoolean
    if (skipValidation) {
      result.append(Row(step3, "Skipped backup table row count validation"))
    } else {
      val sourceCount = sparkSession.table(sourceTableName).count()
      val targetCount = sparkSession.table(targetTableName).count()
      if (sourceCount == targetCount) {
        result.append(Row(step3, s"Success ($sourceCount)"))
      } else {
        result.append(Row(step3, s"Failed ($sourceCount vs $targetCount)"))
        catalog.dropTable(targetTableName, ignoreIfNotExists = true, purge = true)
        // Do not fall through to the overall "Success" row when validation failed.
        return result
      }
    }

    result.append(Row("All Backup Table Operations", "Success"))
    result
  }

  private def runDistCp(
      fs: FileSystem,
      currentQueue: String,
      srcPaths: java.util.List[Path],
      dst: Path,
      conf: Configuration): (Boolean, String) = {
    val options = new DistCpOptions(srcPaths, dst)
    logInfo(s"DistCp options: $options")
    val params = constructDistCpParams(srcPaths, dst, conf)
    logInfo(s"DistCp parameters: $params")
    try {
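      // DistCp runs as a MapReduce job: force the new MR API, name the job after
      // the tables involved, and submit it to the caller's YARN queue with a
      // fixed per-mapper memory budget.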
      conf.setBoolean("mapred.mapper.new-api", true)
      conf.set("mapreduce.job.name",
        s"backup data from ${sourceTableName.unquotedString} to ${targetTableName.unquotedString}")
      conf.set("mapreduce.job.queuename", currentQueue)
      conf.set("mapreduce.map.memory.mb", "4096")
      conf.set("mapreduce.map.java.opts", "-Xmx3072m")
      val distcp = new DistCp(conf, options)
      if (distcp.run(params.toArray(new Array[String](params.size))) == 0) {
        (true, "Success")
      } else {
        deleteDirectory(fs, dst)
        (false, "DistCp job exited with a non-zero code")
      }
    } catch {
      case e: Exception =>
        logError(s"Cannot execute DistCp process: ${e.toString}")
        deleteDirectory(fs, dst)
        (false, s"Fail to DistCp table data: ${e.toString}")
    }
  }

  private def constructDistCpParams(
      srcPaths: java.util.List[Path],
      dst: Path,
      conf: Configuration): java.util.ArrayList[String] = {
    val params = new java.util.ArrayList[String]()
    val DISTCP_OPTIONS_PREFIX = "distcp.options."
    val iter = conf.iterator()
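    // Translate any "distcp.options.<flag>" entries in the Hadoop conf into
    // DistCp command-line flags, e.g. distcp.options.m=500 becomes "-m 500".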
    var specifyMaxTask = false
    while (iter.hasNext) {
      val tuple = iter.next()
      val key = tuple.getKey
      if (key.startsWith(DISTCP_OPTIONS_PREFIX)) {
        val skey = key.substring(DISTCP_OPTIONS_PREFIX.length)
        if (skey == "m") {
          specifyMaxTask = true
        }
        params.add("-" + skey)
        val value = tuple.getValue
        if (value != null && value.nonEmpty) {
          params.add(value)
        }
      }
    }
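    // Cap the number of copy mappers at 1000 unless the caller specified -m.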
    if (!specifyMaxTask) {
      params.add("-m")
      params.add("1000")
    }
    srcPaths.forEach(p => params.add(p.toString))
    params.add(dst.toString)
    params
  }

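  /** Best-effort recursive cleanup of a (possibly partially copied) directory. */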
  private def deleteDirectory(fs: FileSystem, dir: Path): Unit = {
    if (fs.exists(dir)) {
      logInfo(s"Start to clean target path: $dir")
      try {
        if (fs.delete(dir, true)) {
          logInfo(s"Finished cleaning target path: $dir")
        } else {
          logWarning(s"Failed to clean target path: $dir, please delete it manually.")
        }
      } catch {
        case e: Exception =>
          logError(s"Exception when deleting target path $dir: $e")
      }
    }
  }
}