Clean up and simplify Spark configuration #299
@@ -25,8 +25,13 @@ while (($#)); do
    DEPLOY_MODE=$2
  elif [ $1 = "--driver-memory" ]; then
    DRIVER_MEMORY=$2
  elif [ $1 = "--driver-library-path" ]; then
    export _SPARK_LIBRARY_PATH=$2
  elif [ $1 = "--driver-class-path" ]; then
    export SPARK_CLASSPATH="$SPARK_CLASSPATH:$2"
  elif [ $1 = "--driver-java-options" ]; then
Review comment: this doesn't match the usage in SparkSubmit, where it's --driver-java-opts.
    export SPARK_JAVA_OPTS="$SPARK_JAVA_OPTS $2"
  fi

  shift
done
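For context, a minimal sketch of how these driver-side flags might be combined in a submission; the master URL, paths, application jar, and class name below are hypothetical:

# Hypothetical invocation; adjust the master URL, paths, and class name for your site.
./bin/spark-submit \
  --master spark://master:7077 \
  --deploy-mode client \
  --driver-memory 2g \
  --driver-class-path /opt/extra/lib/extra.jar \
  --driver-library-path /opt/native/lib \
  --driver-java-options "-XX:+UseCompressedOops" \
  --class com.example.MyApp \
  my-app.jar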
@@ -0,0 +1,7 @@
# Default system properties included when running spark-submit.
# This is useful for setting default environmental settings.

# Example:
# spark.master            spark://master:7077
# spark.eventLog.enabled  true
# spark.eventLog.dir      hdfs://namenode:8021/directory
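A sketch of putting the template to use, assuming spark-submit picks up conf/spark-defaults.conf as its default properties file (an assumption; pass --properties-file explicitly if it does not). The values simply mirror the commented examples above:

# Copy the template and append site-specific defaults; values are illustrative only.
cp conf/spark-defaults.conf.template conf/spark-defaults.conf
cat >> conf/spark-defaults.conf <<'EOF'
spark.master            spark://master:7077
spark.eventLog.enabled  true
spark.eventLog.dir      hdfs://namenode:8021/directory
EOF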
@@ -1,22 +1,41 @@
#!/usr/bin/env bash

# This file contains environment variables required to run Spark. Copy it as
# spark-env.sh and edit that to configure Spark for your site.
#
# The following variables can be set in this file:
# This file is sourced when running various Spark programs.
# Copy it as spark-env.sh and edit that to configure Spark for your site.

# Options read when launching programs locally with
# ./bin/run-example or ./bin/spark-submit
# - SPARK_LOCAL_IP, to set the IP address Spark binds to on this node
# - SPARK_PUBLIC_DNS, to set the public dns name of the driver program
# - SPARK_CLASSPATH, default classpath entries to append

# Options read by executors and drivers running inside the cluster
# - SPARK_LOCAL_IP, to set the IP address Spark binds to on this node
# - SPARK_PUBLIC_DNS, to set the public DNS name of the driver program
# - SPARK_CLASSPATH, default classpath entries to append
# - SPARK_LOCAL_DIRS, storage directories to use on this node for shuffle and RDD data
# - MESOS_NATIVE_LIBRARY, to point to your libmesos.so if you use Mesos
# - SPARK_JAVA_OPTS, to set node-specific JVM options for Spark. Note that
#   we recommend setting app-wide options in the application's driver program.
#   Examples of node-specific options : -Dspark.local.dir, GC options
#   Examples of app-wide options : -Dspark.serializer
#
# If using the standalone deploy mode, you can also set variables for it here:

# Options read in YARN client mode
# - SPARK_EXECUTOR_INSTANCES, Number of workers to start (Default: 2)
# - SPARK_EXECUTOR_CORES, Number of cores for the workers (Default: 1).
# - SPARK_EXECUTOR_MEMORY, Memory per Worker (e.g. 1000M, 2G) (Default: 1G)
# - SPARK_DRIVER_MEMORY, Memory for Master (e.g. 1000M, 2G) (Default: 512 Mb)
# - SPARK_YARN_APP_NAME, The name of your application (Default: Spark)
# - SPARK_YARN_QUEUE, The hadoop queue to use for allocation requests (Default: ‘default’)
# - SPARK_YARN_DIST_FILES, Comma separated list of files to be distributed with the job.
# - SPARK_YARN_DIST_ARCHIVES, Comma separated list of archives to be distributed with the job.

# Options for the daemons used in the standalone deploy mode:
# - SPARK_MASTER_IP, to bind the master to a different IP address or hostname
# - SPARK_MASTER_PORT / SPARK_MASTER_WEBUI_PORT, to use non-default ports
# - SPARK_MASTER_OPTS, to set config properties only for the master (e.g. "-Dx=y")
# - SPARK_WORKER_CORES, to set the number of cores to use on this machine
# - SPARK_WORKER_MEMORY, to set how much memory to use (e.g. 1000m, 2g)
# - SPARK_WORKER_MEMORY, to set how much total memory workers have to give executors (e.g. 1000m, 2g)
# - SPARK_WORKER_PORT / SPARK_WORKER_WEBUI_PORT
# - SPARK_WORKER_INSTANCES, to set the number of worker processes per node
# - SPARK_WORKER_DIR, to set the working directory of worker processes
# - SPARK_PUBLIC_DNS, to set the public dns name of the master
# - SPARK_WORKER_OPTS, to set config properties only for the worker (e.g. "-Dx=y")
# - SPARK_HISTORY_OPTS, to set config properties only for the history server (e.g. "-Dx=y")
# - SPARK_DAEMON_OPTS, to set config properties for all daemons (e.g. "-Dx=y")
# - SPARK_PUBLIC_DNS, to set the public dns name of the master or workers
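As a sketch, a conf/spark-env.sh for a standalone worker node might set a handful of the variables documented above; the values are examples only, not recommendations:

# Hypothetical conf/spark-env.sh (copied from this template) for a standalone worker.
export SPARK_WORKER_CORES=8
export SPARK_WORKER_MEMORY=16g
export SPARK_LOCAL_DIRS=/mnt/spark1,/mnt/spark2
export SPARK_PUBLIC_DNS=worker1.example.com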
@@ -208,6 +208,82 @@ class SparkConf(loadDefaults: Boolean) extends Cloneable with Logging {
    new SparkConf(false).setAll(settings)
  }

  /** Checks for illegal or deprecated config settings. Throws an exception for the former. Not
   * idempotent - may mutate this conf object to convert deprecated settings to supported ones. */
  private[spark] def validateSettings() {
    if (settings.contains("spark.local.dir")) {
      val msg = "In Spark 1.0 and later spark.local.dir will be overridden by the value set by " +
        "the cluster manager (via SPARK_LOCAL_DIRS in mesos/standalone and LOCAL_DIRS in YARN)."
      logWarning(msg)
    }

    val executorOptsKey = "spark.executor.extraJavaOptions"
    val executorClasspathKey = "spark.executor.extraClassPath"
    val driverOptsKey = "spark.driver.extraJavaOptions"
    val driverClassPathKey = "spark.driver.extraClassPath"

    // Validate spark.executor.extraJavaOptions
    settings.get(executorOptsKey).map { javaOpts =>
      if (javaOpts.contains("-Dspark")) {
        val msg = s"$executorOptsKey is not allowed to set Spark options (was '$javaOpts)'. " +
Review comment: I'm getting this error message trying to set things in the properties file running on YARN: $ cat spark-conf.properties

Review comment (reply): Sorry, I misunderstood this setting. Ignore that comment.
"Set them directly on a SparkConf or in a properties file when using ./bin/spark-submit." | ||
throw new Exception(msg) | ||
} | ||
if (javaOpts.contains("-Xmx") || javaOpts.contains("-Xms")) { | ||
val msg = s"$executorOptsKey is not allowed to alter memory settings (was '$javaOpts'). " + | ||
"Use spark.executor.memory instead." | ||
throw new Exception(msg) | ||
} | ||
} | ||
|
||
// Check for legacy configs | ||
sys.env.get("SPARK_JAVA_OPTS").foreach { value => | ||
val error = | ||
s""" | ||
|SPARK_JAVA_OPTS was detected (set to '$value'). | ||
|This has undefined behavior when running on a cluster and is deprecated in Spark 1.0+. | ||
| | ||
|Please instead use: | ||
| - ./spark-submit with conf/spark-defaults.conf to set defaults for an application | ||
| - ./spark-submit with --driver-java-options to set -X options for a driver | ||
| - spark.executor.extraJavaOptions to set -X options for executors | ||
| - SPARK_DAEMON_OPTS to set java options for standalone daemons (i.e. master, worker) | ||
""".stripMargin | ||
logError(error) | ||
|
||
for (key <- Seq(executorOptsKey, driverOptsKey)) { | ||
if (getOption(key).isDefined) { | ||
throw new SparkException(s"Found both $key and SPARK_JAVA_OPTS. Use only the former.") | ||
} else { | ||
logWarning(s"Setting '$key' to '$value' as a work-around.") | ||
set(key, value) | ||
} | ||
} | ||
} | ||
|
||
sys.env.get("SPARK_CLASSPATH").foreach { value => | ||
val error = | ||
s""" | ||
|SPARK_CLASSPATH was detected (set to '$value'). | ||
| This has undefined behavior when running on a cluster and is deprecated in Spark 1.0+. | ||
| | ||
|Please instead use: | ||
| - ./spark-submit with --driver-class-path to augment the driver classpath | ||
| - spark.executor.extraClassPath to augment the executor classpath | ||
""".stripMargin | ||
logError(error) | ||
|
||
for (key <- Seq(executorClasspathKey, driverClassPathKey)) { | ||
if (getOption(key).isDefined) { | ||
throw new SparkException(s"Found both $key and SPARK_CLASSPATH. Use only the former.") | ||
} else { | ||
logWarning(s"Setting '$key' to '$value' as a work-around.") | ||
set(key, value) | ||
} | ||
} | ||
} | ||
} | ||
|
||
/** | ||
* Return a string listing all keys and values, one per line. This is useful to print the | ||
* configuration out for debugging. | ||
|
Review comment: It seems there is currently no way to set the library path for examples (save via SPARK_JAVA_OPTS). Do we need one?
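To make the migration path in the two error messages above concrete, here is a sketch of replacing the deprecated environment variables with the newer settings; the JVM option, jar paths, and class name are made up for illustration:

# Deprecated (triggers the warnings/errors in validateSettings above):
#   export SPARK_JAVA_OPTS="-XX:+UseCompressedOops"
#   export SPARK_CLASSPATH="/opt/extra/lib/extra.jar"

# Replacement for executors: properties in conf/spark-defaults.conf
#   spark.executor.extraJavaOptions  -XX:+UseCompressedOops
#   spark.executor.extraClassPath    /opt/extra/lib/extra.jar

# Replacement for the driver: flags passed per submission
./bin/spark-submit \
  --driver-java-options "-XX:+UseCompressedOops" \
  --driver-class-path /opt/extra/lib/extra.jar \
  --class com.example.MyApp my-app.jar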