/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.spark.sql.hive.execution

import java.io.IOException
import java.net.URI
import java.text.SimpleDateFormat
import java.util.{Date, Locale, Random}

import scala.util.control.NonFatal

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.hadoop.hive.common.FileUtils
import org.apache.hadoop.hive.ql.exec.TaskRunner

import org.apache.spark.internal.Logging
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.errors.QueryExecutionErrors
import org.apache.spark.sql.hive.HiveExternalCatalog
import org.apache.spark.sql.hive.client.HiveVersion

class HiveTempPath(session: SparkSession, val hadoopConf: Configuration, path: Path)
  extends Logging {
  private var stagingDirForCreating: Option[Path] = None

  lazy val externalTempPath: Path = getExternalTmpPath(path)

  private def getExternalTmpPath(path: Path): Path = {
    import org.apache.spark.sql.hive.client.hive._

    // Before Hive 1.1, when inserting into a table, Hive creates the staging directory under
    // a common scratch directory. After the write finishes, Hive simply empties the table
    // directory and moves the staging directory into it.
    // Since Hive 1.1, Hive creates the staging directory under the table directory, and when
    // moving the staging directory into the table directory, Hive still empties the table
    // directory but excludes the staging directory from the deletion.
    // We have to follow the Hive behavior here to avoid trouble. For example, if we created the
    // staging directory under the table directory for Hive versions prior to 1.1, the staging
    // directory would be removed when Hive tries to empty the table directory.
    val hiveVersionsUsingOldExternalTempPath: Set[HiveVersion] = Set(v12, v13, v14, v1_0)
    val hiveVersionsUsingNewExternalTempPath: Set[HiveVersion] =
      Set(v1_1, v1_2, v2_0, v2_1, v2_2, v2_3, v3_0, v3_1)

    // Ensure all the supported versions are considered here.
    assert(hiveVersionsUsingNewExternalTempPath ++ hiveVersionsUsingOldExternalTempPath ==
      allSupportedHiveVersions)

    val externalCatalog = session.sharedState.externalCatalog
    val hiveVersion = externalCatalog.unwrapped.asInstanceOf[HiveExternalCatalog].client.version
    val stagingDir = hadoopConf.get("hive.exec.stagingdir", ".hive-staging")
    val scratchDir = hadoopConf.get("hive.exec.scratchdir", "/tmp/hive")

    if (hiveVersionsUsingOldExternalTempPath.contains(hiveVersion)) {
      oldVersionExternalTempPath(path, scratchDir)
    } else if (hiveVersionsUsingNewExternalTempPath.contains(hiveVersion)) {
      newVersionExternalTempPath(path, stagingDir)
    } else {
      throw new IllegalStateException("Unsupported hive version: " + hiveVersion.fullVersion)
    }
  }
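
  // Illustrative layouts for a table at hdfs://nn/warehouse/t (example values assumed,
  // not taken from this file; exact names depend on conf and timing):
  //   old (scratch-dir based):
  //     hdfs://nn/tmp/hive/hive_<timestamp>_<rand>-<taskRunnerId>
  //   new (staging-dir based):
  //     hdfs://nn/warehouse/t/.hive-staging_hive_<timestamp>_<rand>-<taskRunnerId>/-ext-10000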

  // Mostly copied from Context.java#getExternalTmpPath of Hive 0.13
  private def oldVersionExternalTempPath(path: Path, scratchDir: String): Path = {
    val extURI: URI = path.toUri
    val scratchPath = new Path(scratchDir, executionId)
    var dirPath = new Path(
      extURI.getScheme,
      extURI.getAuthority,
      scratchPath.toUri.getPath + "-" + TaskRunner.getTaskRunnerID())

    val fs = dirPath.getFileSystem(hadoopConf)
    dirPath = new Path(fs.makeQualified(dirPath).toString())
    stagingDirForCreating = Some(dirPath)
    dirPath
  }

  // Mostly copied from Context.java#getExternalTmpPath of Hive 1.2
  private def newVersionExternalTempPath(path: Path, stagingDir: String): Path = {
    val extURI: URI = path.toUri
    if (extURI.getScheme == "viewfs") {
      val qualifiedStagingDir = getStagingDir(path, stagingDir)
      stagingDirForCreating = Some(qualifiedStagingDir)
      // Hive uses 10000 in the name of its final output directory
      new Path(qualifiedStagingDir, "-ext-10000")
    } else {
      val qualifiedStagingDir = getExternalScratchDir(extURI, stagingDir)
      stagingDirForCreating = Some(qualifiedStagingDir)
      new Path(qualifiedStagingDir, "-ext-10000")
    }
  }
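
  // Aside (an assumption, not stated in the original source): "viewfs" is special-cased
  // above presumably because rebuilding a Path from its scheme/authority/path components,
  // as getExternalScratchDir does, does not round-trip reliably across viewfs mount points.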

  private def getExternalScratchDir(extURI: URI, stagingDir: String): Path = {
    getStagingDir(
      new Path(extURI.getScheme, extURI.getAuthority, extURI.getPath),
      stagingDir)
  }

  private[hive] def getStagingDir(inputPath: Path, stagingDir: String): Path = {
    val inputPathName: String = inputPath.toString
    val fs: FileSystem = inputPath.getFileSystem(hadoopConf)
    var stagingPathName: String =
      if (inputPathName.indexOf(stagingDir) == -1) {
        new Path(inputPathName, stagingDir).toString
      } else {
        inputPathName.substring(0, inputPathName.indexOf(stagingDir) + stagingDir.length)
      }

    // SPARK-20594: This is a workaround for a Hive bug. Hive requires that the staging
    // directory not be deleted when users set hive.exec.stagingdir under the table directory.
    if (isSubDir(new Path(stagingPathName), inputPath, fs) &&
        !stagingPathName.stripPrefix(inputPathName).stripPrefix("/").startsWith(".")) {
      logDebug(s"The staging dir '$stagingPathName' should be a child directory that starts " +
        "with '.' to avoid being deleted if we set hive.exec.stagingdir under the table " +
        "directory.")
      stagingPathName = new Path(inputPathName, ".hive-staging").toString
    }

    val dir: Path =
      fs.makeQualified(
        new Path(stagingPathName + "_" + executionId + "-" + TaskRunner.getTaskRunnerID))
    logDebug("Created staging dir = " + dir + " for path = " + inputPath)
    dir
  }
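
  // Example (hypothetical values): with the default stagingDir ".hive-staging",
  //   getStagingDir(new Path("hdfs://nn/warehouse/t"), ".hive-staging")
  // returns a qualified path like
  //   hdfs://nn/warehouse/t/.hive-staging_hive_<timestamp>_<rand>-<taskRunnerId>
  // If the input path already contains ".hive-staging", the name is truncated at that
  // component before the unique suffix is appended.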

  // HIVE-14259 removed FileUtils.isSubDir(). Adapted from Hive 1.2's FileUtils.isSubDir().
  private def isSubDir(p1: Path, p2: Path, fs: FileSystem): Boolean = {
    val path1 = fs.makeQualified(p1).toString + Path.SEPARATOR
    val path2 = fs.makeQualified(p2).toString + Path.SEPARATOR
    path1.startsWith(path2)
  }
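
  // For example: isSubDir(new Path("/a/b"), new Path("/a"), fs) is true, while
  // isSubDir(new Path("/ab"), new Path("/a"), fs) is false -- the trailing separator
  // keeps a sibling prefix like "/ab" from matching "/a".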

  private def executionId: String = {
    val rand: Random = new Random
    val format = new SimpleDateFormat("yyyy-MM-dd_HH-mm-ss_SSS", Locale.US)
    "hive_" + format.format(new Date) + "_" + Math.abs(rand.nextLong)
  }
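
  // Yields ids of the form "hive_2023-01-15_10-30-01_123_4567891234567890123"
  // (the date and random suffix here are made up for illustration).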

  def deleteTmpPath(): Unit = {
    // Attempt to delete the staging directory and the files under it. If that fails, the
    // files are expected to be removed at normal JVM termination, since deleteOnExit is used.
    try {
      stagingDirForCreating.foreach { stagingDir =>
        val fs = stagingDir.getFileSystem(hadoopConf)
        if (fs.delete(stagingDir, true)) {
          // If we successfully delete the staging directory, remove it from FileSystem's cache.
          fs.cancelDeleteOnExit(stagingDir)
        }
      }
    } catch {
      case NonFatal(e) =>
        val stagingDir = hadoopConf.get("hive.exec.stagingdir", ".hive-staging")
        logWarning(s"Unable to delete staging directory: $stagingDir.\n" + e)
    }
  }

  def createTmpPath(): Unit = {
    try {
      stagingDirForCreating.foreach { stagingDir =>
        val fs: FileSystem = stagingDir.getFileSystem(hadoopConf)
        if (!FileUtils.mkdir(fs, stagingDir, true, hadoopConf)) {
          throw new IllegalStateException(
            "Cannot create staging directory '" + stagingDir.toString + "'")
        }
        fs.deleteOnExit(stagingDir)
      }
    } catch {
      case e: IOException =>
        throw QueryExecutionErrors.cannotCreateStagingDirError(
          s"'${stagingDirForCreating.toString}': ${e.getMessage}", e)
    }
  }

  def deleteIfNotStagingDir(path: Path, fs: FileSystem): Unit = {
    // Delete the given path unless it is the staging directory tracked by this instance.
    if (Option(path) != stagingDirForCreating) fs.delete(path, true)
  }
}
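
// A minimal usage sketch (illustrative only, not part of this file). It assumes an
// active SparkSession `spark` and a table location `tableLocation: Path`:
//
//   val tmp = new HiveTempPath(spark, spark.sessionState.newHadoopConf(), tableLocation)
//   val out = tmp.externalTempPath   // version-appropriate temp location for the write
//   tmp.createTmpPath()              // create the staging dir and register deleteOnExit
//   // ... write job output under `out`, then move it into the table directory ...
//   tmp.deleteTmpPath()              // best-effort cleanup of the staging dir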