
Commit 062c31d

Merge remote-tracking branch 'upstream/master' into dt-opt3alt
2 parents: 6d32ccd + db436e3

186 files changed: +6345, -2011 lines


README.md

Lines changed: 5 additions & 4 deletions
@@ -118,11 +118,10 @@ If your project is built with Maven, add this to your POM file's `<dependencies>
 ## A Note About Thrift JDBC server and CLI for Spark SQL
 
 Spark SQL supports Thrift JDBC server and CLI.
-See sql-programming-guide.md for more information about those features.
-You can use those features by setting `-Phive-thriftserver` when building Spark as follows.
-
-    $ sbt/sbt -Phive-thriftserver assembly
+See sql-programming-guide.md for more information about using the JDBC server and CLI.
+You can use those features by setting `-Phive` when building Spark as follows.
 
+    $ sbt/sbt -Phive assembly
 
 ## Configuration
 
@@ -140,3 +139,5 @@ submitting any copyrighted material via pull request, email, or other means
 you agree to license the material under the project's open source license and
 warrant that you have the legal authority to do so.
 
+Please see [Contributing to Spark wiki page](https://cwiki.apache.org/SPARK/Contributing+to+Spark)
+for more information.

assembly/pom.xml

Lines changed: 18 additions & 5 deletions
@@ -43,6 +43,12 @@
   </properties>
 
   <dependencies>
+    <!-- Promote Guava to compile scope in this module so it's included while shading. -->
+    <dependency>
+      <groupId>com.google.guava</groupId>
+      <artifactId>guava</artifactId>
+      <scope>compile</scope>
+    </dependency>
     <dependency>
       <groupId>org.apache.spark</groupId>
       <artifactId>spark-core_${scala.binary.version}</artifactId>
@@ -113,6 +119,18 @@
               <goal>shade</goal>
             </goals>
             <configuration>
+              <relocations>
+                <relocation>
+                  <pattern>com.google</pattern>
+                  <shadedPattern>org.spark-project.guava</shadedPattern>
+                  <includes>
+                    <include>com.google.common.**</include>
+                  </includes>
+                  <excludes>
+                    <exclude>com.google.common.base.Optional**</exclude>
+                  </excludes>
+                </relocation>
+              </relocations>
               <transformers>
                 <transformer implementation="org.apache.maven.plugins.shade.resource.ServicesResourceTransformer" />
                 <transformer implementation="org.apache.maven.plugins.shade.resource.AppendingTransformer">
@@ -163,11 +181,6 @@
         <artifactId>spark-hive_${scala.binary.version}</artifactId>
         <version>${project.version}</version>
       </dependency>
-      </dependencies>
-    </profile>
-    <profile>
-      <id>hive-thriftserver</id>
-      <dependencies>
       <dependency>
         <groupId>org.apache.spark</groupId>
         <artifactId>spark-hive-thriftserver_${scala.binary.version}</artifactId>
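
The relocation rule above rewrites Guava's `com.google.common` packages to the `org.spark-project.guava` prefix inside the assembly jar, while `com.google.common.base.Optional` is excluded so the Java API keeps its public signature. Below is a minimal sketch of how one could probe a built assembly for this; the `ShadingCheck` object and the specific class names tested are illustrative assumptions, not part of the commit:

    // Probe the classpath for a relocated Guava class and for the unrelocated Optional.
    // Run with a Spark assembly built from this branch on the classpath (assumption).
    object ShadingCheck {
      private def onClasspath(name: String): Boolean =
        try { Class.forName(name); true } catch { case _: ClassNotFoundException => false }

      def main(args: Array[String]): Unit = {
        // Most Guava classes should appear only under the shaded prefix from the rule above.
        println("relocated Lists:   " + onClasspath("org.spark-project.guava.common.collect.Lists"))
        // Optional is excluded from relocation, so it keeps its original name.
        println("unshaded Optional: " + onClasspath("com.google.common.base.Optional"))
      }
    }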

bin/spark-class

Lines changed: 44 additions & 12 deletions
@@ -17,6 +17,8 @@
 # limitations under the License.
 #
 
+# NOTE: Any changes to this file must be reflected in SparkSubmitDriverBootstrapper.scala!
+
 cygwin=false
 case "`uname`" in
     CYGWIN*) cygwin=true;;
@@ -39,7 +41,7 @@ fi
 
 if [ -n "$SPARK_MEM" ]; then
   echo -e "Warning: SPARK_MEM is deprecated, please use a more specific config option" 1>&2
-  echo -e "(e.g., spark.executor.memory or SPARK_DRIVER_MEMORY)." 1>&2
+  echo -e "(e.g., spark.executor.memory or spark.driver.memory)." 1>&2
 fi
 
 # Use SPARK_MEM or 512m as the default memory, to be overridden by specific options
@@ -73,11 +75,17 @@ case "$1" in
     OUR_JAVA_MEM=${SPARK_EXECUTOR_MEMORY:-$DEFAULT_MEM}
     ;;
 
-  # Spark submit uses SPARK_SUBMIT_OPTS and SPARK_JAVA_OPTS
-  'org.apache.spark.deploy.SparkSubmit')
-    OUR_JAVA_OPTS="$SPARK_JAVA_OPTS $SPARK_SUBMIT_OPTS \
-      -Djava.library.path=$SPARK_SUBMIT_LIBRARY_PATH"
+  # Spark submit uses SPARK_JAVA_OPTS + SPARK_SUBMIT_OPTS +
+  # SPARK_DRIVER_MEMORY + SPARK_SUBMIT_DRIVER_MEMORY.
+  'org.apache.spark.deploy.SparkSubmit')
+    OUR_JAVA_OPTS="$SPARK_JAVA_OPTS $SPARK_SUBMIT_OPTS"
     OUR_JAVA_MEM=${SPARK_DRIVER_MEMORY:-$DEFAULT_MEM}
+    if [ -n "$SPARK_SUBMIT_LIBRARY_PATH" ]; then
+      OUR_JAVA_OPTS="$OUR_JAVA_OPTS -Djava.library.path=$SPARK_SUBMIT_LIBRARY_PATH"
+    fi
+    if [ -n "$SPARK_SUBMIT_DRIVER_MEMORY" ]; then
+      OUR_JAVA_MEM="$SPARK_SUBMIT_DRIVER_MEMORY"
+    fi
     ;;
 
   *)
@@ -97,15 +105,21 @@ else
     exit 1
   fi
 fi
+JAVA_VERSION=$($RUNNER -version 2>&1 | sed 's/java version "\(.*\)\.\(.*\)\..*"/\1\2/; 1q')
 
 # Set JAVA_OPTS to be able to load native libraries and to set heap size
-JAVA_OPTS="-XX:MaxPermSize=128m $OUR_JAVA_OPTS"
+if [ "$JAVA_VERSION" -ge 18 ]; then
+  JAVA_OPTS="$OUR_JAVA_OPTS"
+else
+  JAVA_OPTS="-XX:MaxPermSize=128m $OUR_JAVA_OPTS"
+fi
 JAVA_OPTS="$JAVA_OPTS -Xms$OUR_JAVA_MEM -Xmx$OUR_JAVA_MEM"
+
 # Load extra JAVA_OPTS from conf/java-opts, if it exists
 if [ -e "$FWDIR/conf/java-opts" ] ; then
   JAVA_OPTS="$JAVA_OPTS `cat $FWDIR/conf/java-opts`"
 fi
-export JAVA_OPTS
+
 # Attention: when changing the way the JAVA_OPTS are assembled, the change must be reflected in CommandUtils.scala!
 
 TOOLS_DIR="$FWDIR"/tools
@@ -146,10 +160,28 @@ if $cygwin; then
 fi
 export CLASSPATH
 
-if [ "$SPARK_PRINT_LAUNCH_COMMAND" == "1" ]; then
-  echo -n "Spark Command: " 1>&2
-  echo "$RUNNER" -cp "$CLASSPATH" $JAVA_OPTS "$@" 1>&2
-  echo -e "========================================\n" 1>&2
+# In Spark submit client mode, the driver is launched in the same JVM as Spark submit itself.
+# Here we must parse the properties file for relevant "spark.driver.*" configs before launching
+# the driver JVM itself. Instead of handling this complexity in Bash, we launch a separate JVM
+# to prepare the launch environment of this driver JVM.
+
+if [ -n "$SPARK_SUBMIT_BOOTSTRAP_DRIVER" ]; then
+  # This is used only if the properties file actually contains these special configs
+  # Export the environment variables needed by SparkSubmitDriverBootstrapper
+  export RUNNER
+  export CLASSPATH
+  export JAVA_OPTS
+  export OUR_JAVA_MEM
+  export SPARK_CLASS=1
+  shift # Ignore main class (org.apache.spark.deploy.SparkSubmit) and use our own
+  exec "$RUNNER" org.apache.spark.deploy.SparkSubmitDriverBootstrapper "$@"
+else
+  # Note: The format of this command is closely echoed in SparkSubmitDriverBootstrapper.scala
+  if [ -n "$SPARK_PRINT_LAUNCH_COMMAND" ]; then
+    echo -n "Spark Command: " 1>&2
+    echo "$RUNNER" -cp "$CLASSPATH" $JAVA_OPTS "$@" 1>&2
+    echo -e "========================================\n" 1>&2
+  fi
+  exec "$RUNNER" -cp "$CLASSPATH" $JAVA_OPTS "$@"
 fi
 
-exec "$RUNNER" -cp "$CLASSPATH" $JAVA_OPTS "$@"

bin/spark-class2.cmd

Lines changed: 7 additions & 1 deletion
@@ -77,7 +77,13 @@ rem All drivers use SPARK_JAVA_OPTS + SPARK_DRIVER_MEMORY. The repl also uses SP
 )
 
 rem Set JAVA_OPTS to be able to load native libraries and to set heap size
-set JAVA_OPTS=-XX:MaxPermSize=128m %OUR_JAVA_OPTS% -Djava.library.path=%SPARK_LIBRARY_PATH% -Xms%OUR_JAVA_MEM% -Xmx%OUR_JAVA_MEM%
+for /f "tokens=3" %%i in ('java -version 2^>^&1 ^| find "version"') do set jversion=%%i
+for /f "tokens=1 delims=_" %%i in ("%jversion:~1,-1%") do set jversion=%%i
+if "%jversion%" geq "1.8.0" (
+  set JAVA_OPTS=%OUR_JAVA_OPTS% -Djava.library.path=%SPARK_LIBRARY_PATH% -Xms%OUR_JAVA_MEM% -Xmx%OUR_JAVA_MEM%
+) else (
+  set JAVA_OPTS=-XX:MaxPermSize=128m %OUR_JAVA_OPTS% -Djava.library.path=%SPARK_LIBRARY_PATH% -Xms%OUR_JAVA_MEM% -Xmx%OUR_JAVA_MEM%
+)
 rem Attention: when changing the way the JAVA_OPTS are assembled, the change must be reflected in CommandUtils.scala!
 
 rem Test whether the user has built Spark

bin/spark-submit

Lines changed: 23 additions & 5 deletions
@@ -17,14 +17,18 @@
 # limitations under the License.
 #
 
+# NOTE: Any changes in this file must be reflected in SparkClassLauncher.scala!
+
 export SPARK_HOME="$(cd `dirname $0`/..; pwd)"
 ORIG_ARGS=("$@")
 
 while (($#)); do
   if [ "$1" = "--deploy-mode" ]; then
-    DEPLOY_MODE=$2
+    SPARK_SUBMIT_DEPLOY_MODE=$2
+  elif [ "$1" = "--properties-file" ]; then
+    SPARK_SUBMIT_PROPERTIES_FILE=$2
   elif [ "$1" = "--driver-memory" ]; then
-    DRIVER_MEMORY=$2
+    export SPARK_SUBMIT_DRIVER_MEMORY=$2
   elif [ "$1" = "--driver-library-path" ]; then
     export SPARK_SUBMIT_LIBRARY_PATH=$2
   elif [ "$1" = "--driver-class-path" ]; then
@@ -35,10 +39,24 @@ while (($#)); do
   shift
 done
 
-DEPLOY_MODE=${DEPLOY_MODE:-"client"}
+DEFAULT_PROPERTIES_FILE="$SPARK_HOME/conf/spark-defaults.conf"
+export SPARK_SUBMIT_DEPLOY_MODE=${SPARK_SUBMIT_DEPLOY_MODE:-"client"}
+export SPARK_SUBMIT_PROPERTIES_FILE=${SPARK_SUBMIT_PROPERTIES_FILE:-"$DEFAULT_PROPERTIES_FILE"}
+
+# For client mode, the driver will be launched in the same JVM that launches
+# SparkSubmit, so we may need to read the properties file for any extra class
+# paths, library paths, java options and memory early on. Otherwise, it will
+# be too late by the time the driver JVM has started.
 
-if [ -n "$DRIVER_MEMORY" ] && [ $DEPLOY_MODE == "client" ]; then
-  export SPARK_DRIVER_MEMORY=$DRIVER_MEMORY
+if [[ "$SPARK_SUBMIT_DEPLOY_MODE" == "client" && -f "$SPARK_SUBMIT_PROPERTIES_FILE" ]]; then
+  # Parse the properties file only if the special configs exist
+  contains_special_configs=$(
+    grep -e "spark.driver.extra*\|spark.driver.memory" "$SPARK_SUBMIT_PROPERTIES_FILE" | \
+    grep -v "^[[:space:]]*#"
+  )
+  if [ -n "$contains_special_configs" ]; then
+    export SPARK_SUBMIT_BOOTSTRAP_DRIVER=1
+  fi
 fi
 
 exec $SPARK_HOME/bin/spark-class org.apache.spark.deploy.SparkSubmit "${ORIG_ARGS[@]}"
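
The grep in the client-mode branch above only decides whether the bootstrapper is needed: it looks for uncommented `spark.driver.extra*` or `spark.driver.memory` entries in the properties file. The same check is expressed below as a small Scala sketch for clarity; `needsBootstrap` is a hypothetical helper, not code from this commit:

    import scala.io.Source

    // Return true if the properties file contains any of the "special" driver configs
    // that must be known before the driver JVM starts (mirrors the grep in spark-submit).
    def needsBootstrap(propertiesFile: String): Boolean = {
      val source = Source.fromFile(propertiesFile)
      try {
        source.getLines()
          .map(_.trim)
          .filterNot(_.startsWith("#"))
          .exists(l => l.startsWith("spark.driver.extra") || l.startsWith("spark.driver.memory"))
      } finally {
        source.close()
      }
    }

    // Example: needsBootstrap("conf/spark-defaults.conf")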

bin/utils.sh

File mode changed: 100644 -> 100755

conf/spark-defaults.conf.template

Lines changed: 6 additions & 4 deletions
@@ -2,7 +2,9 @@
 # This is useful for setting default environmental settings.
 
 # Example:
-# spark.master            spark://master:7077
-# spark.eventLog.enabled  true
-# spark.eventLog.dir      hdfs://namenode:8021/directory
-# spark.serializer        org.apache.spark.serializer.KryoSerializer
+# spark.master                     spark://master:7077
+# spark.eventLog.enabled           true
+# spark.eventLog.dir               hdfs://namenode:8021/directory
+# spark.serializer                 org.apache.spark.serializer.KryoSerializer
+# spark.driver.memory              5g
+# spark.executor.extraJavaOptions  -XX:+PrintGCDetail -Dkey=value -Dnumbers="one two three"

core/pom.xml

Lines changed: 35 additions & 0 deletions
@@ -68,9 +68,15 @@
       <groupId>org.eclipse.jetty</groupId>
       <artifactId>jetty-server</artifactId>
     </dependency>
+    <!--
+      Promote Guava to "compile" so that maven-shade-plugin picks it up (for packaging the Optional
+      class exposed in the Java API). The plugin will then remove this dependency from the published
+      pom, so that Guava does not pollute the client's compilation classpath.
+    -->
     <dependency>
       <groupId>com.google.guava</groupId>
       <artifactId>guava</artifactId>
+      <scope>compile</scope>
     </dependency>
     <dependency>
       <groupId>org.apache.commons</groupId>
@@ -322,6 +328,35 @@
           </arguments>
         </configuration>
       </plugin>
+      <plugin>
+        <groupId>org.apache.maven.plugins</groupId>
+        <artifactId>maven-shade-plugin</artifactId>
+        <executions>
+          <execution>
+            <phase>package</phase>
+            <goals>
+              <goal>shade</goal>
+            </goals>
+            <configuration>
+              <shadedArtifactAttached>false</shadedArtifactAttached>
+              <artifactSet>
+                <includes>
+                  <include>com.google.guava:guava</include>
+                </includes>
+              </artifactSet>
+              <filters>
+                <!-- See comment in the guava dependency declaration above. -->
+                <filter>
+                  <artifact>com.google.guava:guava</artifact>
+                  <includes>
+                    <include>com/google/common/base/Optional*</include>
+                  </includes>
+                </filter>
+              </filters>
+            </configuration>
+          </execution>
+        </executions>
+      </plugin>
     </plugins>
 
     <resources>
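
As the comment in the dependency declaration explains, only `com.google.common.base.Optional` is packaged unrelocated into the spark-core jar because it appears in the public Java API. A short sketch of what that preserves for user code follows; the `OptionalExample` object is illustrative and assumes Guava's Optional (bundled by this shading step) is on the classpath:

    import com.google.common.base.Optional

    // User code keeps referring to Optional by its original Guava name, even though the
    // rest of Guava is shaded away from the published spark-core artifact.
    object OptionalExample {
      def main(args: Array[String]): Unit = {
        val maybeDir: Optional[String] = Optional.absent()
        println(maybeDir.or("/tmp/spark"))   // falls back to the default value
      }
    }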

core/src/main/scala/org/apache/spark/SparkConf.scala

Lines changed: 8 additions & 2 deletions
@@ -45,7 +45,7 @@ class SparkConf(loadDefaults: Boolean) extends Cloneable with Logging {
   /** Create a SparkConf that loads defaults from system properties and the classpath */
   def this() = this(true)
 
-  private val settings = new HashMap[String, String]()
+  private[spark] val settings = new HashMap[String, String]()
 
   if (loadDefaults) {
     // Load any spark.* system properties
@@ -210,6 +210,12 @@ class SparkConf(loadDefaults: Boolean) extends Cloneable with Logging {
     new SparkConf(false).setAll(settings)
   }
 
+  /**
+   * By using this instead of System.getenv(), environment variables can be mocked
+   * in unit tests.
+   */
+  private[spark] def getenv(name: String): String = System.getenv(name)
+
   /** Checks for illegal or deprecated config settings. Throws an exception for the former. Not
     * idempotent - may mutate this conf object to convert deprecated settings to supported ones. */
   private[spark] def validateSettings() {
@@ -227,7 +233,7 @@ class SparkConf(loadDefaults: Boolean) extends Cloneable with Logging {
     // Validate spark.executor.extraJavaOptions
     settings.get(executorOptsKey).map { javaOpts =>
       if (javaOpts.contains("-Dspark")) {
-        val msg = s"$executorOptsKey is not allowed to set Spark options (was '$javaOpts)'. " +
+        val msg = s"$executorOptsKey is not allowed to set Spark options (was '$javaOpts'). " +
           "Set them directly on a SparkConf or in a properties file when using ./bin/spark-submit."
         throw new Exception(msg)
       }
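
The new `private[spark] def getenv` indirection exists so tests can substitute environment variables without touching the real process environment. A hypothetical test sketch follows; `FakeEnvSparkConf` is not part of this commit, and the subclass must live in the `org.apache.spark` package to override a `private[spark]` member:

    package org.apache.spark

    // Override getenv with an in-memory map so environment-dependent code paths can be
    // exercised deterministically in unit tests.
    class FakeEnvSparkConf(env: Map[String, String]) extends SparkConf(loadDefaults = false) {
      override private[spark] def getenv(name: String): String = env.getOrElse(name, null)
    }

    // e.g. in a test:
    //   val conf = new FakeEnvSparkConf(Map("SPARK_LOCAL_DIRS" -> "/tmp/spark-test"))
    //   assert(conf.getenv("SPARK_LOCAL_DIRS") == "/tmp/spark-test")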

core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala

Lines changed: 1 addition & 1 deletion
@@ -62,7 +62,7 @@ private[spark] class PythonRDD(
     val env = SparkEnv.get
     val localdir = env.blockManager.diskBlockManager.localDirs.map(
       f => f.getPath()).mkString(",")
-    envVars += ("SPARK_LOCAL_DIR" -> localdir) // it's also used in monitor thread
+    envVars += ("SPARK_LOCAL_DIRS" -> localdir) // it's also used in monitor thread
     val worker: Socket = env.createPythonWorker(pythonExec, envVars.toMap)
 
     // Start a thread to feed the process input from our parent's iterator
// Start a thread to feed the process input from our parent's iterator

core/src/main/scala/org/apache/spark/api/python/PythonUtils.scala

Lines changed: 0 additions & 25 deletions
@@ -40,28 +40,3 @@ private[spark] object PythonUtils {
     paths.filter(_ != "").mkString(File.pathSeparator)
   }
 }
-
-
-/**
- * A utility class to redirect the child process's stdout or stderr.
- */
-private[spark] class RedirectThread(
-    in: InputStream,
-    out: OutputStream,
-    name: String)
-  extends Thread(name) {
-
-  setDaemon(true)
-  override def run() {
-    scala.util.control.Exception.ignoring(classOf[IOException]) {
-      // FIXME: We copy the stream on the level of bytes to avoid encoding problems.
-      val buf = new Array[Byte](1024)
-      var len = in.read(buf)
-      while (len != -1) {
-        out.write(buf, 0, len)
-        out.flush()
-        len = in.read(buf)
-      }
-    }
-  }
-}

core/src/main/scala/org/apache/spark/api/python/PythonWorkerFactory.scala

Lines changed: 1 addition & 2 deletions
@@ -17,15 +17,14 @@
 
 package org.apache.spark.api.python
 
-import java.lang.Runtime
 import java.io.{DataOutputStream, DataInputStream, InputStream, OutputStreamWriter}
 import java.net.{InetAddress, ServerSocket, Socket, SocketException}
 
 import scala.collection.mutable
 import scala.collection.JavaConversions._
 
 import org.apache.spark._
-import org.apache.spark.util.Utils
+import org.apache.spark.util.{RedirectThread, Utils}
 
 private[spark] class PythonWorkerFactory(pythonExec: String, envVars: Map[String, String])
   extends Logging {

core/src/main/scala/org/apache/spark/broadcast/BroadcastFactory.scala

Lines changed: 11 additions & 0 deletions
@@ -32,8 +32,19 @@ import org.apache.spark.annotation.DeveloperApi
  */
 @DeveloperApi
 trait BroadcastFactory {
+
   def initialize(isDriver: Boolean, conf: SparkConf, securityMgr: SecurityManager): Unit
+
+  /**
+   * Creates a new broadcast variable.
+   *
+   * @param value value to broadcast
+   * @param isLocal whether we are in local mode (single JVM process)
+   * @param id unique id representing this broadcast variable
+   */
   def newBroadcast[T: ClassTag](value: T, isLocal: Boolean, id: Long): Broadcast[T]
+
   def unbroadcast(id: Long, removeFromDriver: Boolean, blocking: Boolean): Unit
+
   def stop(): Unit
 }
