
Commit 062c31d

Merge remote-tracking branch 'upstream/master' into dt-opt3alt
2 parents: 6d32ccd + db436e3

186 files changed: +6345, -2011 lines


README.md

Lines changed: 5 additions & 4 deletions
@@ -118,11 +118,10 @@ If your project is built with Maven, add this to your POM file's `<dependencies>
 ## A Note About Thrift JDBC server and CLI for Spark SQL
 
 Spark SQL supports Thrift JDBC server and CLI.
-See sql-programming-guide.md for more information about those features.
-You can use those features by setting `-Phive-thriftserver` when building Spark as follows.
-
-    $ sbt/sbt -Phive-thriftserver assembly
+See sql-programming-guide.md for more information about using the JDBC server and CLI.
+You can use those features by setting `-Phive` when building Spark as follows.
 
+    $ sbt/sbt -Phive assembly
 
 ## Configuration
 
@@ -140,3 +139,5 @@ submitting any copyrighted material via pull request, email, or other means
 you agree to license the material under the project's open source license and
 warrant that you have the legal authority to do so.
 
+Please see [Contributing to Spark wiki page](https://cwiki.apache.org/SPARK/Contributing+to+Spark)
+for more information.

assembly/pom.xml

Lines changed: 18 additions & 5 deletions
@@ -43,6 +43,12 @@
   </properties>
 
   <dependencies>
+    <!-- Promote Guava to compile scope in this module so it's included while shading. -->
+    <dependency>
+      <groupId>com.google.guava</groupId>
+      <artifactId>guava</artifactId>
+      <scope>compile</scope>
+    </dependency>
     <dependency>
       <groupId>org.apache.spark</groupId>
       <artifactId>spark-core_${scala.binary.version}</artifactId>
@@ -113,6 +119,18 @@
               <goal>shade</goal>
             </goals>
             <configuration>
+              <relocations>
+                <relocation>
+                  <pattern>com.google</pattern>
+                  <shadedPattern>org.spark-project.guava</shadedPattern>
+                  <includes>
+                    <include>com.google.common.**</include>
+                  </includes>
+                  <excludes>
+                    <exclude>com.google.common.base.Optional**</exclude>
+                  </excludes>
+                </relocation>
+              </relocations>
               <transformers>
                 <transformer implementation="org.apache.maven.plugins.shade.resource.ServicesResourceTransformer" />
                 <transformer implementation="org.apache.maven.plugins.shade.resource.AppendingTransformer">
@@ -163,11 +181,6 @@
         <artifactId>spark-hive_${scala.binary.version}</artifactId>
         <version>${project.version}</version>
       </dependency>
-      </dependencies>
-    </profile>
-    <profile>
-      <id>hive-thriftserver</id>
-      <dependencies>
       <dependency>
         <groupId>org.apache.spark</groupId>
         <artifactId>spark-hive-thriftserver_${scala.binary.version}</artifactId>
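
The relocation rule above rewrites Guava's `com.google.common` packages to the `org.spark-project.guava` prefix inside the assembly jar, while `com.google.common.base.Optional` is excluded so the Java API keeps its public signature. Below is a minimal sketch of how one could probe a built assembly for this; the `ShadingCheck` object and the specific class names tested are illustrative assumptions, not part of the commit:

    // Probe the classpath for a relocated Guava class and for the unrelocated Optional.
    // Run with a Spark assembly built from this branch on the classpath (assumption).
    object ShadingCheck {
      private def onClasspath(name: String): Boolean =
        try { Class.forName(name); true } catch { case _: ClassNotFoundException => false }

      def main(args: Array[String]): Unit = {
        // Most Guava classes should appear only under the shaded prefix from the rule above.
        println("relocated Lists:   " + onClasspath("org.spark-project.guava.common.collect.Lists"))
        // Optional is excluded from relocation, so it keeps its original name.
        println("unshaded Optional: " + onClasspath("com.google.common.base.Optional"))
      }
    }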

bin/spark-class

Lines changed: 44 additions & 12 deletions
@@ -17,6 +17,8 @@
 # limitations under the License.
 #
 
+# NOTE: Any changes to this file must be reflected in SparkSubmitDriverBootstrapper.scala!
+
 cygwin=false
 case "`uname`" in
     CYGWIN*) cygwin=true;;
@@ -39,7 +41,7 @@ fi
 
 if [ -n "$SPARK_MEM" ]; then
   echo -e "Warning: SPARK_MEM is deprecated, please use a more specific config option" 1>&2
-  echo -e "(e.g., spark.executor.memory or SPARK_DRIVER_MEMORY)." 1>&2
+  echo -e "(e.g., spark.executor.memory or spark.driver.memory)." 1>&2
 fi
 
 # Use SPARK_MEM or 512m as the default memory, to be overridden by specific options
@@ -73,11 +75,17 @@ case "$1" in
     OUR_JAVA_MEM=${SPARK_EXECUTOR_MEMORY:-$DEFAULT_MEM}
     ;;
 
-  # Spark submit uses SPARK_SUBMIT_OPTS and SPARK_JAVA_OPTS
-  'org.apache.spark.deploy.SparkSubmit')
-    OUR_JAVA_OPTS="$SPARK_JAVA_OPTS $SPARK_SUBMIT_OPTS \
-      -Djava.library.path=$SPARK_SUBMIT_LIBRARY_PATH"
+  # Spark submit uses SPARK_JAVA_OPTS + SPARK_SUBMIT_OPTS +
+  # SPARK_DRIVER_MEMORY + SPARK_SUBMIT_DRIVER_MEMORY.
+  'org.apache.spark.deploy.SparkSubmit')
+    OUR_JAVA_OPTS="$SPARK_JAVA_OPTS $SPARK_SUBMIT_OPTS"
     OUR_JAVA_MEM=${SPARK_DRIVER_MEMORY:-$DEFAULT_MEM}
+    if [ -n "$SPARK_SUBMIT_LIBRARY_PATH" ]; then
+      OUR_JAVA_OPTS="$OUR_JAVA_OPTS -Djava.library.path=$SPARK_SUBMIT_LIBRARY_PATH"
+    fi
+    if [ -n "$SPARK_SUBMIT_DRIVER_MEMORY" ]; then
+      OUR_JAVA_MEM="$SPARK_SUBMIT_DRIVER_MEMORY"
+    fi
     ;;
 
   *)
@@ -97,15 +105,21 @@ else
     exit 1
   fi
 fi
+JAVA_VERSION=$($RUNNER -version 2>&1 | sed 's/java version "\(.*\)\.\(.*\)\..*"/\1\2/; 1q')
 
 # Set JAVA_OPTS to be able to load native libraries and to set heap size
-JAVA_OPTS="-XX:MaxPermSize=128m $OUR_JAVA_OPTS"
+if [ "$JAVA_VERSION" -ge 18 ]; then
+  JAVA_OPTS="$OUR_JAVA_OPTS"
+else
+  JAVA_OPTS="-XX:MaxPermSize=128m $OUR_JAVA_OPTS"
+fi
 JAVA_OPTS="$JAVA_OPTS -Xms$OUR_JAVA_MEM -Xmx$OUR_JAVA_MEM"
+
 # Load extra JAVA_OPTS from conf/java-opts, if it exists
 if [ -e "$FWDIR/conf/java-opts" ] ; then
   JAVA_OPTS="$JAVA_OPTS `cat $FWDIR/conf/java-opts`"
 fi
-export JAVA_OPTS
+
 # Attention: when changing the way the JAVA_OPTS are assembled, the change must be reflected in CommandUtils.scala!
 
 TOOLS_DIR="$FWDIR"/tools
@@ -146,10 +160,28 @@ if $cygwin; then
 fi
 export CLASSPATH
 
-if [ "$SPARK_PRINT_LAUNCH_COMMAND" == "1" ]; then
-  echo -n "Spark Command: " 1>&2
-  echo "$RUNNER" -cp "$CLASSPATH" $JAVA_OPTS "$@" 1>&2
-  echo -e "========================================\n" 1>&2
+# In Spark submit client mode, the driver is launched in the same JVM as Spark submit itself.
+# Here we must parse the properties file for relevant "spark.driver.*" configs before launching
+# the driver JVM itself. Instead of handling this complexity in Bash, we launch a separate JVM
+# to prepare the launch environment of this driver JVM.
+
+if [ -n "$SPARK_SUBMIT_BOOTSTRAP_DRIVER" ]; then
+  # This is used only if the properties file actually contains these special configs
+  # Export the environment variables needed by SparkSubmitDriverBootstrapper
+  export RUNNER
+  export CLASSPATH
+  export JAVA_OPTS
+  export OUR_JAVA_MEM
+  export SPARK_CLASS=1
+  shift # Ignore main class (org.apache.spark.deploy.SparkSubmit) and use our own
+  exec "$RUNNER" org.apache.spark.deploy.SparkSubmitDriverBootstrapper "$@"
+else
+  # Note: The format of this command is closely echoed in SparkSubmitDriverBootstrapper.scala
+  if [ -n "$SPARK_PRINT_LAUNCH_COMMAND" ]; then
+    echo -n "Spark Command: " 1>&2
+    echo "$RUNNER" -cp "$CLASSPATH" $JAVA_OPTS "$@" 1>&2
+    echo -e "========================================\n" 1>&2
+  fi
+  exec "$RUNNER" -cp "$CLASSPATH" $JAVA_OPTS "$@"
 fi
 
-exec "$RUNNER" -cp "$CLASSPATH" $JAVA_OPTS "$@"

bin/spark-class2.cmd

Lines changed: 7 additions & 1 deletion
@@ -77,7 +77,13 @@ rem All drivers use SPARK_JAVA_OPTS + SPARK_DRIVER_MEMORY. The repl also uses SP
 )
 
 rem Set JAVA_OPTS to be able to load native libraries and to set heap size
-set JAVA_OPTS=-XX:MaxPermSize=128m %OUR_JAVA_OPTS% -Djava.library.path=%SPARK_LIBRARY_PATH% -Xms%OUR_JAVA_MEM% -Xmx%OUR_JAVA_MEM%
+for /f "tokens=3" %%i in ('java -version 2^>^&1 ^| find "version"') do set jversion=%%i
+for /f "tokens=1 delims=_" %%i in ("%jversion:~1,-1%") do set jversion=%%i
+if "%jversion%" geq "1.8.0" (
+  set JAVA_OPTS=%OUR_JAVA_OPTS% -Djava.library.path=%SPARK_LIBRARY_PATH% -Xms%OUR_JAVA_MEM% -Xmx%OUR_JAVA_MEM%
+) else (
+  set JAVA_OPTS=-XX:MaxPermSize=128m %OUR_JAVA_OPTS% -Djava.library.path=%SPARK_LIBRARY_PATH% -Xms%OUR_JAVA_MEM% -Xmx%OUR_JAVA_MEM%
+)
 rem Attention: when changing the way the JAVA_OPTS are assembled, the change must be reflected in CommandUtils.scala!
 
 rem Test whether the user has built Spark

bin/spark-submit

Lines changed: 23 additions & 5 deletions
@@ -17,14 +17,18 @@
 # limitations under the License.
 #
 
+# NOTE: Any changes in this file must be reflected in SparkClassLauncher.scala!
+
 export SPARK_HOME="$(cd `dirname $0`/..; pwd)"
 ORIG_ARGS=("$@")
 
 while (($#)); do
   if [ "$1" = "--deploy-mode" ]; then
-    DEPLOY_MODE=$2
+    SPARK_SUBMIT_DEPLOY_MODE=$2
+  elif [ "$1" = "--properties-file" ]; then
+    SPARK_SUBMIT_PROPERTIES_FILE=$2
   elif [ "$1" = "--driver-memory" ]; then
-    DRIVER_MEMORY=$2
+    export SPARK_SUBMIT_DRIVER_MEMORY=$2
   elif [ "$1" = "--driver-library-path" ]; then
     export SPARK_SUBMIT_LIBRARY_PATH=$2
   elif [ "$1" = "--driver-class-path" ]; then
@@ -35,10 +39,24 @@ while (($#)); do
   shift
 done
 
-DEPLOY_MODE=${DEPLOY_MODE:-"client"}
+DEFAULT_PROPERTIES_FILE="$SPARK_HOME/conf/spark-defaults.conf"
+export SPARK_SUBMIT_DEPLOY_MODE=${SPARK_SUBMIT_DEPLOY_MODE:-"client"}
+export SPARK_SUBMIT_PROPERTIES_FILE=${SPARK_SUBMIT_PROPERTIES_FILE:-"$DEFAULT_PROPERTIES_FILE"}
+
+# For client mode, the driver will be launched in the same JVM that launches
+# SparkSubmit, so we may need to read the properties file for any extra class
+# paths, library paths, java options and memory early on. Otherwise, it will
+# be too late by the time the driver JVM has started.
 
-if [ -n "$DRIVER_MEMORY" ] && [ $DEPLOY_MODE == "client" ]; then
-  export SPARK_DRIVER_MEMORY=$DRIVER_MEMORY
+if [[ "$SPARK_SUBMIT_DEPLOY_MODE" == "client" && -f "$SPARK_SUBMIT_PROPERTIES_FILE" ]]; then
+  # Parse the properties file only if the special configs exist
+  contains_special_configs=$(
+    grep -e "spark.driver.extra*\|spark.driver.memory" "$SPARK_SUBMIT_PROPERTIES_FILE" | \
+    grep -v "^[[:space:]]*#"
+  )
+  if [ -n "$contains_special_configs" ]; then
+    export SPARK_SUBMIT_BOOTSTRAP_DRIVER=1
+  fi
 fi
 
 exec $SPARK_HOME/bin/spark-class org.apache.spark.deploy.SparkSubmit "${ORIG_ARGS[@]}"
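
The grep in the client-mode branch above only decides whether the bootstrapper is needed: it looks for uncommented `spark.driver.extra*` or `spark.driver.memory` entries in the properties file. The same check is expressed below as a small Scala sketch for clarity; `needsBootstrap` is a hypothetical helper, not code from this commit:

    import scala.io.Source

    // Return true if the properties file contains any of the "special" driver configs
    // that must be known before the driver JVM starts (mirrors the grep in spark-submit).
    def needsBootstrap(propertiesFile: String): Boolean = {
      val source = Source.fromFile(propertiesFile)
      try {
        source.getLines()
          .map(_.trim)
          .filterNot(_.startsWith("#"))
          .exists(l => l.startsWith("spark.driver.extra") || l.startsWith("spark.driver.memory"))
      } finally {
        source.close()
      }
    }

    // Example: needsBootstrap("conf/spark-defaults.conf")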

bin/utils.sh

File mode changed: 100644 -> 100755

conf/spark-defaults.conf.template

Lines changed: 6 additions & 4 deletions
@@ -2,7 +2,9 @@
 # This is useful for setting default environmental settings.
 
 # Example:
-# spark.master            spark://master:7077
-# spark.eventLog.enabled  true
-# spark.eventLog.dir      hdfs://namenode:8021/directory
-# spark.serializer        org.apache.spark.serializer.KryoSerializer
+# spark.master                     spark://master:7077
+# spark.eventLog.enabled           true
+# spark.eventLog.dir               hdfs://namenode:8021/directory
+# spark.serializer                 org.apache.spark.serializer.KryoSerializer
+# spark.driver.memory              5g
+# spark.executor.extraJavaOptions  -XX:+PrintGCDetail -Dkey=value -Dnumbers="one two three"

core/pom.xml

Lines changed: 35 additions & 0 deletions
@@ -68,9 +68,15 @@
       <groupId>org.eclipse.jetty</groupId>
       <artifactId>jetty-server</artifactId>
     </dependency>
+    <!--
+      Promote Guava to "compile" so that maven-shade-plugin picks it up (for packaging the Optional
+      class exposed in the Java API). The plugin will then remove this dependency from the published
+      pom, so that Guava does not pollute the client's compilation classpath.
+    -->
     <dependency>
       <groupId>com.google.guava</groupId>
       <artifactId>guava</artifactId>
+      <scope>compile</scope>
     </dependency>
     <dependency>
       <groupId>org.apache.commons</groupId>
@@ -322,6 +328,35 @@
           </arguments>
         </configuration>
       </plugin>
+      <plugin>
+        <groupId>org.apache.maven.plugins</groupId>
+        <artifactId>maven-shade-plugin</artifactId>
+        <executions>
+          <execution>
+            <phase>package</phase>
+            <goals>
+              <goal>shade</goal>
+            </goals>
+            <configuration>
+              <shadedArtifactAttached>false</shadedArtifactAttached>
+              <artifactSet>
+                <includes>
+                  <include>com.google.guava:guava</include>
+                </includes>
+              </artifactSet>
+              <filters>
+                <!-- See comment in the guava dependency declaration above. -->
+                <filter>
+                  <artifact>com.google.guava:guava</artifact>
+                  <includes>
+                    <include>com/google/common/base/Optional*</include>
+                  </includes>
+                </filter>
+              </filters>
+            </configuration>
+          </execution>
+        </executions>
+      </plugin>
     </plugins>
 
     <resources>
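
As the comment in the dependency declaration explains, only `com.google.common.base.Optional` is packaged unrelocated into the spark-core jar because it appears in the public Java API. A short sketch of what that preserves for user code follows; the `OptionalExample` object is illustrative and assumes Guava's Optional (bundled by this shading step) is on the classpath:

    import com.google.common.base.Optional

    // User code keeps referring to Optional by its original Guava name, even though the
    // rest of Guava is shaded away from the published spark-core artifact.
    object OptionalExample {
      def main(args: Array[String]): Unit = {
        val maybeDir: Optional[String] = Optional.absent()
        println(maybeDir.or("/tmp/spark"))   // falls back to the default value
      }
    }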

core/src/main/scala/org/apache/spark/SparkConf.scala

Lines changed: 8 additions & 2 deletions
@@ -45,7 +45,7 @@ class SparkConf(loadDefaults: Boolean) extends Cloneable with Logging {
   /** Create a SparkConf that loads defaults from system properties and the classpath */
   def this() = this(true)
 
-  private val settings = new HashMap[String, String]()
+  private[spark] val settings = new HashMap[String, String]()
 
   if (loadDefaults) {
     // Load any spark.* system properties
@@ -210,6 +210,12 @@ class SparkConf(loadDefaults: Boolean) extends Cloneable with Logging {
     new SparkConf(false).setAll(settings)
   }
 
+  /**
+   * By using this instead of System.getenv(), environment variables can be mocked
+   * in unit tests.
+   */
+  private[spark] def getenv(name: String): String = System.getenv(name)
+
   /** Checks for illegal or deprecated config settings. Throws an exception for the former. Not
     * idempotent - may mutate this conf object to convert deprecated settings to supported ones. */
   private[spark] def validateSettings() {
@@ -227,7 +233,7 @@ class SparkConf(loadDefaults: Boolean) extends Cloneable with Logging {
     // Validate spark.executor.extraJavaOptions
     settings.get(executorOptsKey).map { javaOpts =>
       if (javaOpts.contains("-Dspark")) {
-        val msg = s"$executorOptsKey is not allowed to set Spark options (was '$javaOpts)'. " +
+        val msg = s"$executorOptsKey is not allowed to set Spark options (was '$javaOpts'). " +
           "Set them directly on a SparkConf or in a properties file when using ./bin/spark-submit."
         throw new Exception(msg)
       }
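
The new `private[spark] def getenv` indirection exists so tests can substitute environment variables without touching the real process environment. A hypothetical test sketch follows; `FakeEnvSparkConf` is not part of this commit, and the subclass must live in the `org.apache.spark` package to override a `private[spark]` member:

    package org.apache.spark

    // Override getenv with an in-memory map so environment-dependent code paths can be
    // exercised deterministically in unit tests.
    class FakeEnvSparkConf(env: Map[String, String]) extends SparkConf(loadDefaults = false) {
      override private[spark] def getenv(name: String): String = env.getOrElse(name, null)
    }

    // e.g. in a test:
    //   val conf = new FakeEnvSparkConf(Map("SPARK_LOCAL_DIRS" -> "/tmp/spark-test"))
    //   assert(conf.getenv("SPARK_LOCAL_DIRS") == "/tmp/spark-test")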

core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala

Lines changed: 1 addition & 1 deletion
@@ -62,7 +62,7 @@ private[spark] class PythonRDD(
     val env = SparkEnv.get
     val localdir = env.blockManager.diskBlockManager.localDirs.map(
       f => f.getPath()).mkString(",")
-    envVars += ("SPARK_LOCAL_DIR" -> localdir) // it's also used in monitor thread
+    envVars += ("SPARK_LOCAL_DIRS" -> localdir) // it's also used in monitor thread
     val worker: Socket = env.createPythonWorker(pythonExec, envVars.toMap)
 
     // Start a thread to feed the process input from our parent's iterator
// Start a thread to feed the process input from our parent's iterator

core/src/main/scala/org/apache/spark/api/python/PythonUtils.scala

Lines changed: 0 additions & 25 deletions
@@ -40,28 +40,3 @@ private[spark] object PythonUtils {
     paths.filter(_ != "").mkString(File.pathSeparator)
   }
 }
-
-
-/**
- * A utility class to redirect the child process's stdout or stderr.
- */
-private[spark] class RedirectThread(
-    in: InputStream,
-    out: OutputStream,
-    name: String)
-  extends Thread(name) {
-
-  setDaemon(true)
-  override def run() {
-    scala.util.control.Exception.ignoring(classOf[IOException]) {
-      // FIXME: We copy the stream on the level of bytes to avoid encoding problems.
-      val buf = new Array[Byte](1024)
-      var len = in.read(buf)
-      while (len != -1) {
-        out.write(buf, 0, len)
-        out.flush()
-        len = in.read(buf)
-      }
-    }
-  }
-}

core/src/main/scala/org/apache/spark/api/python/PythonWorkerFactory.scala

Lines changed: 1 addition & 2 deletions
@@ -17,15 +17,14 @@
 
 package org.apache.spark.api.python
 
-import java.lang.Runtime
 import java.io.{DataOutputStream, DataInputStream, InputStream, OutputStreamWriter}
 import java.net.{InetAddress, ServerSocket, Socket, SocketException}
 
 import scala.collection.mutable
 import scala.collection.JavaConversions._
 
 import org.apache.spark._
-import org.apache.spark.util.Utils
+import org.apache.spark.util.{RedirectThread, Utils}
 
 private[spark] class PythonWorkerFactory(pythonExec: String, envVars: Map[String, String])
   extends Logging {

core/src/main/scala/org/apache/spark/broadcast/BroadcastFactory.scala

Lines changed: 11 additions & 0 deletions
@@ -32,8 +32,19 @@ import org.apache.spark.annotation.DeveloperApi
  */
 @DeveloperApi
 trait BroadcastFactory {
+
   def initialize(isDriver: Boolean, conf: SparkConf, securityMgr: SecurityManager): Unit
+
+  /**
+   * Creates a new broadcast variable.
+   *
+   * @param value value to broadcast
+   * @param isLocal whether we are in local mode (single JVM process)
+   * @param id unique id representing this broadcast variable
+   */
   def newBroadcast[T: ClassTag](value: T, isLocal: Boolean, id: Long): Broadcast[T]
+
   def unbroadcast(id: Long, removeFromDriver: Boolean, blocking: Boolean): Unit
+
   def stop(): Unit
 }
