Commit 6d13d00

Merge remote-tracking branch 'origin/master' into json-line-sep
# Conflicts:
#   python/pyspark/sql/tests.py
#   sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/HadoopFileLinesReader.scala
#   sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/text/TextFileFormat.scala
#   sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/text/TextOptions.scala
2 parents: f99c1e1 + 4d37008

118 files changed: +1913 −2015 lines.

Note: this is a large commit, so some of the changed files are hidden by default; only a subset of the diffs is reproduced below.

bin/docker-image-tool.sh

Lines changed: 7 additions & 2 deletions
@@ -64,9 +64,11 @@ function build {
     error "Cannot find docker image. This script must be run from a runnable distribution of Apache Spark."
   fi

+  local DOCKERFILE=${DOCKERFILE:-"$IMG_PATH/spark/Dockerfile"}
+
   docker build "${BUILD_ARGS[@]}" \
     -t $(image_ref spark) \
-    -f "$IMG_PATH/spark/Dockerfile" .
+    -f "$DOCKERFILE" .
 }

 function push {
@@ -84,6 +86,7 @@ Commands:
   push        Push a pre-built image to a registry. Requires a repository address to be provided.

 Options:
+  -f file     Dockerfile to build. By default builds the Dockerfile shipped with Spark.
   -r repo     Repository address.
   -t tag      Tag to apply to the built image, or to identify the image to be pushed.
   -m          Use minikube's Docker daemon.
@@ -113,10 +116,12 @@ fi

 REPO=
 TAG=
-while getopts mr:t: option
+DOCKERFILE=
+while getopts f:mr:t: option
 do
   case "${option}"
   in
+    f) DOCKERFILE=${OPTARG};;
     r) REPO=${OPTARG};;
     t) TAG=${OPTARG};;
     m)

common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java

Lines changed: 40 additions & 8 deletions
@@ -57,12 +57,43 @@ public final class UTF8String implements Comparable<UTF8String>, Externalizable,
   public Object getBaseObject() { return base; }
   public long getBaseOffset() { return offset; }

-  private static int[] bytesOfCodePointInUTF8 = {2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
-    4, 4, 4, 4, 4, 4, 4, 4,
-    5, 5, 5, 5,
-    6, 6};
+  /**
+   * A char in UTF-8 encoding can take 1-4 bytes depending on the first byte which
+   * indicates the size of the char. See Unicode standard in page 126, Table 3-6:
+   * http://www.unicode.org/versions/Unicode10.0.0/UnicodeStandard-10.0.pdf
+   *
+   * Binary    Hex          Comments
+   * 0xxxxxxx  0x00..0x7F   Only byte of a 1-byte character encoding
+   * 10xxxxxx  0x80..0xBF   Continuation bytes (1-3 continuation bytes)
+   * 110xxxxx  0xC0..0xDF   First byte of a 2-byte character encoding
+   * 1110xxxx  0xE0..0xEF   First byte of a 3-byte character encoding
+   * 11110xxx  0xF0..0xF7   First byte of a 4-byte character encoding
+   *
+   * As a consequence of the well-formedness conditions specified in
+   * Table 3-7 (page 126), the following byte values are disallowed in UTF-8:
+   * C0–C1, F5–FF.
+   */
+  private static byte[] bytesOfCodePointInUTF8 = {
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x00..0x0F
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x10..0x1F
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x20..0x2F
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x30..0x3F
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x40..0x4F
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x50..0x5F
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x60..0x6F
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x70..0x7F
+    // Continuation bytes cannot appear as the first byte
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x80..0x8F
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x90..0x9F
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xA0..0xAF
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xB0..0xBF
+    0, 0, // 0xC0..0xC1 - disallowed in UTF-8
+    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // 0xC2..0xCF
+    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // 0xD0..0xDF
+    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // 0xE0..0xEF
+    4, 4, 4, 4, 4, // 0xF0..0xF4
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 // 0xF5..0xFF - disallowed in UTF-8
+  };

   private static final boolean IS_LITTLE_ENDIAN =
     ByteOrder.nativeOrder() == ByteOrder.LITTLE_ENDIAN;
@@ -187,8 +218,9 @@ public void writeTo(OutputStream out) throws IOException {
    * @param b The first byte of a code point
    */
   private static int numBytesForFirstByte(final byte b) {
-    final int offset = (b & 0xFF) - 192;
-    return (offset >= 0) ? bytesOfCodePointInUTF8[offset] : 1;
+    final int offset = b & 0xFF;
+    byte numBytes = bytesOfCodePointInUTF8[offset];
+    return (numBytes == 0) ? 1: numBytes; // Skip the first byte disallowed in UTF-8
   }

   /**
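The practical effect of the new 256-entry table is that numBytesForFirstByte is now defined for every possible first byte: the disallowed values 0xC0..0xC1 and 0xF5..0xFF, as well as continuation bytes, yield a width of 1, so malformed input is skipped one byte at a time rather than being given the 2-6 byte widths implied by the old table. The Scala sketch below mirrors that behaviour by deriving the width from byte ranges instead of copying the table; Utf8FirstByte is a hypothetical name used only for illustration, not part of Spark.

object Utf8FirstByte {
  // Mirrors the patched UTF8String.numBytesForFirstByte: first bytes that cannot
  // start a well-formed UTF-8 sequence fall through to a width of 1 and are skipped.
  def numBytesForFirstByte(b: Byte): Int = {
    val v = b & 0xFF
    if (v < 0x80) 1                     // 0xxxxxxx: 1-byte char
    else if (v >= 0xC2 && v <= 0xDF) 2  // 110xxxxx: 2-byte char (0xC0..0xC1 disallowed)
    else if (v >= 0xE0 && v <= 0xEF) 3  // 1110xxxx: 3-byte char
    else if (v >= 0xF0 && v <= 0xF4) 4  // 11110xxx: 4-byte char (0xF5..0xFF disallowed)
    else 1                              // continuation or disallowed byte: skip one byte
  }

  def main(args: Array[String]): Unit = {
    println(numBytesForFirstByte(0xC2.toByte)) // 2: first byte of "¡" (0xC2 0xA1)
    println(numBytesForFirstByte(0x80.toByte)) // 1: continuation byte, skipped
    println(numBytesForFirstByte(0xF5.toByte)) // 1: disallowed in UTF-8, skipped
  }
}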

common/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java

Lines changed: 22 additions & 1 deletion
@@ -58,8 +58,12 @@ private static void checkBasic(String str, int len) {
   @Test
   public void basicTest() {
     checkBasic("", 0);
-    checkBasic("hello", 5);
+    checkBasic("¡", 1); // 2 bytes char
+    checkBasic("ку", 2); // 2 * 2 bytes chars
+    checkBasic("hello", 5); // 5 * 1 byte chars
     checkBasic("大 千 世 界", 7);
+    checkBasic("︽﹋%", 3); // 3 * 3 bytes chars
+    checkBasic("\uD83E\uDD19", 1); // 4 bytes char
   }

   @Test
@@ -791,4 +795,21 @@ public void trimRightWithTrimString() {
     assertEquals(fromString("头"), fromString("头a???/").trimRight(fromString("数?/*&^%a")));
     assertEquals(fromString("头"), fromString("头数b数数 [").trimRight(fromString(" []数b")));
   }
+
+  @Test
+  public void skipWrongFirstByte() {
+    int[] wrongFirstBytes = {
+      0x80, 0x9F, 0xBF, // Skip Continuation bytes
+      0xC0, 0xC2, // 0xC0..0xC1 - disallowed in UTF-8
+      // 0xF5..0xFF - disallowed in UTF-8
+      0xF5, 0xF6, 0xF7, 0xF8, 0xF9,
+      0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF
+    };
+    byte[] c = new byte[1];
+
+    for (int i = 0; i < wrongFirstBytes.length; ++i) {
+      c[0] = (byte)wrongFirstBytes[i];
+      assertEquals(fromBytes(c).numChars(), 1);
+    }
+  }
 }

core/src/main/scala/org/apache/spark/deploy/DependencyUtils.scala

Lines changed: 25 additions & 9 deletions
@@ -18,12 +18,13 @@
 package org.apache.spark.deploy

 import java.io.File
+import java.net.URI

 import org.apache.commons.lang3.StringUtils
 import org.apache.hadoop.conf.Configuration
 import org.apache.hadoop.fs.{FileSystem, Path}

-import org.apache.spark.{SecurityManager, SparkConf}
+import org.apache.spark.{SecurityManager, SparkConf, SparkException}
 import org.apache.spark.util.{MutableURLClassLoader, Utils}

 private[deploy] object DependencyUtils {
@@ -137,16 +138,31 @@ private[deploy] object DependencyUtils {
   def resolveGlobPaths(paths: String, hadoopConf: Configuration): String = {
     require(paths != null, "paths cannot be null.")
     Utils.stringToSeq(paths).flatMap { path =>
-      val uri = Utils.resolveURI(path)
-      uri.getScheme match {
-        case "local" | "http" | "https" | "ftp" => Array(path)
-        case _ =>
-          val fs = FileSystem.get(uri, hadoopConf)
-          Option(fs.globStatus(new Path(uri))).map { status =>
-            status.filter(_.isFile).map(_.getPath.toUri.toString)
-          }.getOrElse(Array(path))
+      val (base, fragment) = splitOnFragment(path)
+      (resolveGlobPath(base, hadoopConf), fragment) match {
+        case (resolved, Some(_)) if resolved.length > 1 => throw new SparkException(
+            s"${base.toString} resolves ambiguously to multiple files: ${resolved.mkString(",")}")
+        case (resolved, Some(namedAs)) => resolved.map(_ + "#" + namedAs)
+        case (resolved, _) => resolved
       }
     }.mkString(",")
   }

+  private def splitOnFragment(path: String): (URI, Option[String]) = {
+    val uri = Utils.resolveURI(path)
+    val withoutFragment = new URI(uri.getScheme, uri.getSchemeSpecificPart, null)
+    (withoutFragment, Option(uri.getFragment))
+  }
+
+  private def resolveGlobPath(uri: URI, hadoopConf: Configuration): Array[String] = {
+    uri.getScheme match {
+      case "local" | "http" | "https" | "ftp" => Array(uri.toString)
+      case _ =>
+        val fs = FileSystem.get(uri, hadoopConf)
+        Option(fs.globStatus(new Path(uri))).map { status =>
+          status.filter(_.isFile).map(_.getPath.toUri.toString)
+        }.getOrElse(Array(uri.toString))
+    }
+  }
+
 }
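With the new helpers, an --archives entry such as "hdfs:/path/*.zip#archive3" keeps its "#archive3" alias after glob expansion, while a glob that matches more than one file under a single alias is rejected with a SparkException. The standalone Scala sketch below mirrors that logic; it substitutes plain java.net.URI for Spark's Utils.resolveURI and a hard-coded stub for resolveGlobPath, so the object name and file paths are illustrative only.

import java.net.URI

// Sketch of the fragment handling added to DependencyUtils.resolveGlobPaths:
// split "path#alias", resolve the glob on the base path, then either re-attach
// the alias or fail when the glob matched more than one file.
object GlobWithFragment {
  def splitOnFragment(path: String): (URI, Option[String]) = {
    val uri = new URI(path) // Spark uses Utils.resolveURI, which also handles bare local paths
    (new URI(uri.getScheme, uri.getSchemeSpecificPart, null), Option(uri.getFragment))
  }

  // Stand-in for resolveGlobPath: a fixed glob result, for illustration only.
  def resolveGlobPath(base: URI): Array[String] = base.toString match {
    case "file:/tmp/archives/*.zip" =>
      Array("file:/tmp/archives/first.zip", "file:/tmp/archives/second.zip")
    case other => Array(other)
  }

  def resolve(path: String): Array[String] = {
    val (base, fragment) = splitOnFragment(path)
    (resolveGlobPath(base), fragment) match {
      case (resolved, Some(_)) if resolved.length > 1 =>
        sys.error(s"$base resolves ambiguously to multiple files: ${resolved.mkString(",")}")
      case (resolved, Some(alias)) => resolved.map(_ + "#" + alias)
      case (resolved, _) => resolved
    }
  }

  def main(args: Array[String]): Unit = {
    // Alias is preserved on a single match.
    println(resolve("file:/tmp/archives/single.zip#archive3").mkString(","))
    // resolve("file:/tmp/archives/*.zip#archive3") would fail: two files under one alias.
  }
}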

core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala

Lines changed: 13 additions & 2 deletions
@@ -245,6 +245,19 @@ object SparkSubmit extends CommandLineUtils with Logging {
       args: SparkSubmitArguments,
       conf: Option[HadoopConfiguration] = None)
       : (Seq[String], Seq[String], SparkConf, String) = {
+    try {
+      doPrepareSubmitEnvironment(args, conf)
+    } catch {
+      case e: SparkException =>
+        printErrorAndExit(e.getMessage)
+        throw e
+    }
+  }
+
+  private def doPrepareSubmitEnvironment(
+      args: SparkSubmitArguments,
+      conf: Option[HadoopConfiguration] = None)
+      : (Seq[String], Seq[String], SparkConf, String) = {
     // Return values
     val childArgs = new ArrayBuffer[String]()
     val childClasspath = new ArrayBuffer[String]()
@@ -320,8 +333,6 @@ object SparkSubmit extends CommandLineUtils with Logging {
         printErrorAndExit("Python applications are currently not supported for Kubernetes.")
       case (KUBERNETES, _) if args.isR =>
         printErrorAndExit("R applications are currently not supported for Kubernetes.")
-      case (KUBERNETES, CLIENT) =>
-        printErrorAndExit("Client mode is currently not supported for Kubernetes.")
       case (LOCAL, CLUSTER) =>
         printErrorAndExit("Cluster deploy mode is not compatible with master \"local\"")
       case (_, CLUSTER) if isShell(args.primaryResource) =>
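The try/catch wrapper is what turns a SparkException (for example, the ambiguous-glob error from DependencyUtils) into a printed error and exit rather than a raw stack trace. A minimal sketch of the same pattern, assuming hypothetical stand-ins for printErrorAndExit, SparkException and doPrepareSubmitEnvironment rather than the real Spark classes:

// Sketch of the wrapper pattern added to SparkSubmit.prepareSubmitEnvironment.
object SubmitErrorWrapper {
  final class SparkException(msg: String) extends Exception(msg)

  private def reportAndExit(msg: String): Unit = {
    Console.err.println(s"Error: $msg")
    // The real printErrorAndExit terminates the JVM; tests swap in an exit hook
    // so they can assert on the message (see testPrematureExit in SparkSubmitSuite).
  }

  def prepareOrFail[T](prepare: => T): T =
    try {
      prepare
    } catch {
      case e: SparkException =>
        reportAndExit(e.getMessage)
        throw e // unreachable after a real exit, but keeps the method well-typed here
    }
}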

core/src/main/scala/org/apache/spark/util/ShutdownHookManager.scala

Lines changed: 1 addition & 1 deletion
@@ -143,7 +143,7 @@ private[spark] object ShutdownHookManager extends Logging {
   }

   /**
-   * Adds a shutdown hook with the given priority. Hooks with lower priority values run
+   * Adds a shutdown hook with the given priority. Hooks with higher priority values run
    * first.
    *
    * @param hook The code to run during shutdown.
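The corrected wording matters when ordering hooks. A small sketch, assuming the curried addShutdownHook(priority)(hook) signature and a caller compiled inside the org.apache.spark package tree (the object is private[spark]); the package name and priority values are arbitrary illustration choices:

package org.apache.spark.demo // hypothetical package; needed because ShutdownHookManager is private[spark]

import org.apache.spark.util.ShutdownHookManager

object HookOrderDemo {
  def register(): Unit = {
    // A higher priority value runs first, per the corrected scaladoc.
    ShutdownHookManager.addShutdownHook(100) { () => println("runs first (priority 100)") }
    ShutdownHookManager.addShutdownHook(10) { () => println("runs second (priority 10)") }
  }
}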

core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala

Lines changed: 34 additions & 7 deletions
@@ -20,7 +20,7 @@ package org.apache.spark.deploy
 import java.io._
 import java.net.URI
 import java.nio.charset.StandardCharsets
-import java.nio.file.Files
+import java.nio.file.{Files, Paths}

 import scala.collection.mutable
 import scala.collection.mutable.ArrayBuffer
@@ -606,10 +606,13 @@ class SparkSubmitSuite
   }

   test("resolves command line argument paths correctly") {
-    val jars = "/jar1,/jar2" // --jars
-    val files = "local:/file1,file2" // --files
-    val archives = "file:/archive1,archive2" // --archives
-    val pyFiles = "py-file1,py-file2" // --py-files
+    val dir = Utils.createTempDir()
+    val archive = Paths.get(dir.toPath.toString, "single.zip")
+    Files.createFile(archive)
+    val jars = "/jar1,/jar2"
+    val files = "local:/file1,file2"
+    val archives = s"file:/archive1,${dir.toPath.toAbsolutePath.toString}/*.zip#archive3"
+    val pyFiles = "py-file1,py-file2"

     // Test jars and files
     val clArgs = Seq(
@@ -636,9 +639,10 @@ class SparkSubmitSuite
     val appArgs2 = new SparkSubmitArguments(clArgs2)
     val (_, _, conf2, _) = SparkSubmit.prepareSubmitEnvironment(appArgs2)
     appArgs2.files should be (Utils.resolveURIs(files))
-    appArgs2.archives should be (Utils.resolveURIs(archives))
+    appArgs2.archives should fullyMatch regex ("file:/archive1,file:.*#archive3")
     conf2.get("spark.yarn.dist.files") should be (Utils.resolveURIs(files))
-    conf2.get("spark.yarn.dist.archives") should be (Utils.resolveURIs(archives))
+    conf2.get("spark.yarn.dist.archives") should fullyMatch regex
+      ("file:/archive1,file:.*#archive3")

     // Test python files
     val clArgs3 = Seq(
@@ -657,6 +661,29 @@ class SparkSubmitSuite
     conf3.get(PYSPARK_PYTHON.key) should be ("python3.5")
   }

+  test("ambiguous archive mapping results in error message") {
+    val dir = Utils.createTempDir()
+    val archive1 = Paths.get(dir.toPath.toString, "first.zip")
+    val archive2 = Paths.get(dir.toPath.toString, "second.zip")
+    Files.createFile(archive1)
+    Files.createFile(archive2)
+    val jars = "/jar1,/jar2"
+    val files = "local:/file1,file2"
+    val archives = s"file:/archive1,${dir.toPath.toAbsolutePath.toString}/*.zip#archive3"
+    val pyFiles = "py-file1,py-file2"
+
+    // Test files and archives (Yarn)
+    val clArgs2 = Seq(
+      "--master", "yarn",
+      "--class", "org.SomeClass",
+      "--files", files,
+      "--archives", archives,
+      "thejar.jar"
+    )
+
+    testPrematureExit(clArgs2.toArray, "resolves ambiguously to multiple files")
+  }
+
   test("resolves config paths correctly") {
     val jars = "/jar1,/jar2" // spark.jars
     val files = "local:/file1,file2" // spark.files / spark.yarn.dist.files
