
Commit b16e6a2

Cleanup of spark-submit script and Scala quick start guide
1 parent af0adf7 commit b16e6a2

6 files changed: +192, -79 lines changed

core/src/main/scala/org/apache/spark/SparkContext.scala

Lines changed: 9 additions & 5 deletions
@@ -171,11 +171,11 @@ class SparkContext(config: SparkConf) extends Logging {
   conf.setIfMissing("spark.driver.host", Utils.localHostName())
   conf.setIfMissing("spark.driver.port", "0")
 
-  val jars: Seq[String] = if (conf.contains("spark.jars")) {
-    conf.get("spark.jars").split(",").filter(_.size != 0)
-  } else {
-    null
-  }
+  val jars: Seq[String] =
+    conf.getOption("spark.jars").map(_.split(",")).map(_.filter(_.size != 0)).toSeq.flatten
+
+  val files: Seq[String] =
+    conf.getOption("spark.files").map(_.split(",")).map(_.filter(_.size != 0)).toSeq.flatten
 
   val master = conf.get("spark.master")
   val appName = conf.get("spark.app.name")
@@ -236,6 +236,10 @@ class SparkContext(config: SparkConf) extends Logging {
     jars.foreach(addJar)
   }
 
+  if (files != null) {
+    files.foreach(addFile)
+  }
+
   private def warnSparkMem(value: String): String = {
     logWarning("Using SPARK_MEM to set amount of memory to use per executor process is " +
       "deprecated, please use spark.executor.memory instead.")

core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala

Lines changed: 11 additions & 36 deletions
@@ -17,14 +17,11 @@
 
 package org.apache.spark.deploy
 
-import java.io.{File, FileInputStream, IOException, PrintStream}
+import java.io.{File, PrintStream}
 import java.net.{URI, URL}
-import java.util.Properties
 
-import scala.collection.JavaConversions._
 import scala.collection.mutable.{ArrayBuffer, HashMap, Map}
 
-import org.apache.spark.SparkException
 import org.apache.spark.executor.ExecutorURLClassLoader
 
 /**
@@ -110,23 +107,6 @@ object SparkSubmit {
     val sysProps = new HashMap[String, String]()
     var childMainClass = ""
 
-    // Load system properties by default from the file, if present
-    if (appArgs.verbose) printStream.println(s"Using properties file: ${appArgs.propertiesFile}")
-    Option(appArgs.propertiesFile).foreach { filename =>
-      val file = new File(filename)
-      getDefaultProperties(file).foreach { case (k, v) =>
-        if (k.startsWith("spark")) {
-          if (k == "spark.master")
-            throw new Exception("Setting spark.master in spark-defaults.properties is not " +
-              "supported. Use MASTER environment variable or --master.")
-          sysProps(k) = v
-          if (appArgs.verbose) printStream.println(s"Adding default property: $k=$v")
-        }
-        else {
-          printWarning(s"Ignoring non-spark config property: $k=$v")
-        }
-      }
-    }
 
     if (clusterManager == MESOS && deployOnCluster) {
       printErrorAndExit("Mesos does not support running the driver on the cluster")
@@ -166,10 +146,11 @@ object SparkSubmit {
         sysProp = "spark.cores.max"),
       new OptionAssigner(appArgs.files, YARN, false, sysProp = "spark.yarn.dist.files"),
       new OptionAssigner(appArgs.files, YARN, true, clOption = "--files"),
+      new OptionAssigner(appArgs.files, STANDALONE | MESOS, true, sysProp = "spark.files"),
      new OptionAssigner(appArgs.archives, YARN, false, sysProp = "spark.yarn.dist.archives"),
      new OptionAssigner(appArgs.archives, YARN, true, clOption = "--archives"),
      new OptionAssigner(appArgs.jars, YARN, true, clOption = "--addJars"),
-      new OptionAssigner(appArgs.jars, STANDALONE | YARN | MESOS, true, sysProp = "spark.jars")
+      new OptionAssigner(appArgs.jars, STANDALONE | MESOS, false, sysProp = "spark.jars")
    )
 
     // For client mode make any added jars immediately visible on the classpath
@@ -219,6 +200,10 @@ object SparkSubmit {
       }
     }
 
+    for ((k, v) <- appArgs.getDefaultSparkProperties) {
+      if (!sysProps.contains(k)) sysProps(k) = v
+    }
+
     (childArgs, childClasspath, sysProps, childMainClass)
   }
 
@@ -259,22 +244,12 @@ object SparkSubmit {
     val url = localJarFile.getAbsoluteFile.toURI.toURL
     loader.addURL(url)
   }
-
-  private def getDefaultProperties(file: File): Seq[(String, String)] = {
-    require(file.exists(), s"Default properties file ${file.getName} does not exist")
-    val inputStream = new FileInputStream(file)
-    val properties = new Properties()
-    try {
-      properties.load(inputStream)
-    } catch {
-      case e: IOException =>
-        val message = s"Failed when loading Spark properties file ${file.getName}"
-        throw new SparkException(message, e)
-    }
-    properties.stringPropertyNames().toSeq.map(k => (k, properties(k)))
-  }
 }
 
+/**
+ * Provides an indirection layer for passing arguments as system properties or flags to
+ * the user's driver program or to downstream launcher tools.
+ */
 private[spark] class OptionAssigner(val value: String,
   val clusterManager: Int,
   val deployOnCluster: Boolean,
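With this change, values from the defaults file are merged last and only for keys that are still missing, so anything set explicitly via command-line options wins. A small sketch of that merge rule in isolation (plain Scala, with illustrative values; not part of the commit):

import scala.collection.mutable.HashMap

object DefaultsMergeSketch {
  def main(args: Array[String]): Unit = {
    // Properties already assigned from command-line options.
    val sysProps = HashMap("spark.executor.memory" -> "4g")

    // Values read from a spark-defaults.properties file.
    val defaults = Seq("spark.executor.memory" -> "1g", "spark.cores.max" -> "8")

    // Same rule as the loop added above: only fill in keys that are still missing.
    for ((k, v) <- defaults) {
      if (!sysProps.contains(k)) sysProps(k) = v
    }

    println(sysProps("spark.executor.memory")) // 4g -- the explicit option wins
    println(sysProps("spark.cores.max"))       // 8  -- filled in from the defaults file
  }
}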

core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala

Lines changed: 104 additions & 24 deletions
@@ -17,14 +17,19 @@
 
 package org.apache.spark.deploy
 
-import scala.collection.mutable.ArrayBuffer
-import java.io.File
+import java.io.{File, FileInputStream, IOException}
+import java.util.Properties
+
+import scala.collection.JavaConversions._
+import scala.collection.mutable.{HashMap, ArrayBuffer}
+
+import org.apache.spark.SparkException
 
 /**
  * Parses and encapsulates arguments from the spark-submit script.
  */
 private[spark] class SparkSubmitArguments(args: Array[String]) {
-  var master: String = "local"
+  var master: String = null
   var deployMode: String = null
   var executorMemory: String = null
   var executorCores: String = null
@@ -47,22 +52,70 @@ private[spark] class SparkSubmitArguments(args: Array[String]) {
   var jars: String = null
   var verbose: Boolean = false
 
-  loadEnvVars()
   parseOpts(args.toList)
+  loadDefaults()
+  checkRequiredArguments()
+
+  /** Return default present in the currently defined defaults file. */
+  def getDefaultSparkProperties = {
+    val defaultProperties = new HashMap[String, String]()
+    if (verbose) SparkSubmit.printStream.println(s"Using properties file: $propertiesFile")
+    Option(propertiesFile).foreach { filename =>
+      val file = new File(filename)
+      SparkSubmitArguments.getPropertiesFromFile(file).foreach { case (k, v) =>
+        if (k.startsWith("spark")) {
+          defaultProperties(k) = v
+          if (verbose) SparkSubmit.printStream.println(s"Adding default property: $k=$v")
+        }
+        else {
+          SparkSubmit.printWarning(s"Ignoring non-spark config property: $k=$v")
+        }
+      }
+    }
+    defaultProperties
+  }
 
-  // Sanity checks
-  if (args.length == 0) printUsageAndExit(-1)
-  if (primaryResource == null) SparkSubmit.printErrorAndExit("Must specify a primary resource")
-  if (mainClass == null) SparkSubmit.printErrorAndExit("Must specify a main class with --class")
-  if (propertiesFile == null) {
-    sys.env.get("SPARK_HOME").foreach { sparkHome =>
-      val sep = File.separator
-      val defaultPath = s"${sparkHome}${sep}conf${sep}spark-defaults.properties"
-      val file = new File(defaultPath)
-      if (file.exists()) {
-        propertiesFile = file.getAbsolutePath
-      }
+  /** Fill in any undefined values based on the current properties file or built-in defaults. */
+  private def loadDefaults() = {
+
+    // Use common defaults file, if not specified by user
+    if (propertiesFile == null) {
+      sys.env.get("SPARK_HOME").foreach { sparkHome =>
+        val sep = File.separator
+        val defaultPath = s"${sparkHome}${sep}conf${sep}spark-defaults.properties"
+        val file = new File(defaultPath)
+        if (file.exists()) {
+          propertiesFile = file.getAbsolutePath
+        }
+      }
     }
+
+    val defaultProperties = getDefaultSparkProperties
+    // Use properties file as fallback for values which have a direct analog to
+    // arguments in this script.
+    master = Option(master).getOrElse(defaultProperties.get("spark.master").orNull)
+    executorMemory = Option(executorMemory)
+      .getOrElse(defaultProperties.get("spark.executor.memory").orNull)
+    executorCores = Option(executorCores)
+      .getOrElse(defaultProperties.get("spark.executor.cores").orNull)
+    totalExecutorCores = Option(totalExecutorCores)
+      .getOrElse(defaultProperties.get("spark.cores.max").orNull)
+    name = Option(name).getOrElse(defaultProperties.get("spark.app.name").orNull)
+    jars = Option(jars).getOrElse(defaultProperties.get("spark.jars").orNull)
+
+    // This supports env vars in older versions of Spark
+    master = Option(master).getOrElse(System.getenv("MASTER"))
+    deployMode = Option(deployMode).getOrElse(System.getenv("DEPLOY_MODE"))
+
+    // Global defaults. These should be keep to minimum to avoid confusing behavior.
+    master = Option(master).getOrElse("local")
+  }
+
+  /** Ensure that required fields exists. Call this only once all defaults are loaded. */
+  private def checkRequiredArguments() = {
+    if (args.length == 0) printUsageAndExit(-1)
+    if (primaryResource == null) SparkSubmit.printErrorAndExit("Must specify a primary resource")
+    if (mainClass == null) SparkSubmit.printErrorAndExit("Must specify a main class with --class")
   }
 
   override def toString = {
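The loadDefaults method introduced above resolves each setting in a fixed order: an explicit command-line option, then the properties file, then the legacy MASTER/DEPLOY_MODE environment variables, and finally a built-in default such as "local" for the master. A sketch of that chain in isolation (the firstDefined helper and the sample values are ours, not part of Spark):

object PrecedenceSketch {
  // Return the first value in the chain that is actually defined.
  def firstDefined(candidates: Option[String]*): Option[String] =
    candidates.flatten.headOption

  def main(args: Array[String]): Unit = {
    val fromCli = None                             // no --master given
    val fromPropsFile = Some("spark://host:7077")  // spark.master from spark-defaults.properties
    val fromEnv = Option(System.getenv("MASTER"))  // legacy environment variable

    val master = firstDefined(fromCli, fromPropsFile, fromEnv).getOrElse("local")
    println(master) // spark://host:7077
  }
}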
@@ -89,14 +142,12 @@ private[spark] class SparkSubmitArguments(args: Array[String]) {
    |  childArgs [${childArgs.mkString(" ")}]
    |  jars $jars
    |  verbose $verbose
+   |
+   |Default properties from $propertiesFile:
+   |${getDefaultSparkProperties.mkString(" ", "\n ", "\n")}
    """.stripMargin
  }
 
-  private def loadEnvVars() {
-    Option(System.getenv("MASTER")).map(master = _)
-    Option(System.getenv("DEPLOY_MODE")).map(deployMode = _)
-  }
-
   private def parseOpts(opts: List[String]): Unit = opts match {
     case ("--name") :: value :: tail =>
       name = value
@@ -189,6 +240,18 @@ private[spark] class SparkSubmitArguments(args: Array[String]) {
       parseOpts(tail)
 
     case value :: tail =>
+      if (value.startsWith("-")) {
+        val errMessage = s"Unrecognized option '$value'."
+        val suggestion: Option[String] = value match {
+          case v if v.startsWith("--") && v.contains("=") =>
+            val parts = v.split("=")
+            Some(s"Perhaps you meant '${parts(0)} ${parts(1)}'?")
+          case _ =>
+            None
+        }
+        SparkSubmit.printErrorAndExit(errMessage + suggestion.map(" " + _).getOrElse(""))
+      }
+
       if (primaryResource != null) {
         val error = s"Found two conflicting resources, $value and $primaryResource." +
           " Expecting only one resource."
@@ -217,6 +280,8 @@ private[spark] class SparkSubmitArguments(args: Array[String]) {
        |  --jars JARS                A comma-separated list of local jars to include on the
        |                             driver classpath and that SparkContext.addJar will work
        |                             with. Doesn't work on standalone with 'cluster' deploy mode.
+       |  --files FILES              Comma separated list of files to be placed in the working dir
+       |                             of each executor.
        |  --properties-file FILE     Path to a file from which to load extra properties. If not
        |                             specified, this will look for conf/spark-defaults.properties.
        |
@@ -225,6 +290,7 @@ private[spark] class SparkSubmitArguments(args: Array[String]) {
        |  --driver-library-path      Extra library path entries to pass to the driver
        |  --driver-class-path        Extra class path entries to pass to the driver
        |
+       |  --executor-memory MEM      Memory per executor (e.g. 1000M, 2G) (Default: 1G).
        |
        | Spark standalone with cluster deploy mode only:
        |  --driver-cores NUM         Cores for driver (Default: 1).
@@ -235,14 +301,28 @@ private[spark] class SparkSubmitArguments(args: Array[String]) {
        |
        | YARN-only:
        |  --executor-cores NUM       Number of cores per executor (Default: 1).
-       |  --executor-memory MEM      Memory per executor (e.g. 1000M, 2G) (Default: 1G).
        |  --queue QUEUE_NAME         The YARN queue to submit to (Default: 'default').
        |  --num-executors NUM        Number of executors to (Default: 2).
-       |  --files FILES              Comma separated list of files to be placed in the working dir
-       |                             of each executor.
        |  --archives ARCHIVES        Comma separated list of archives to be extracted into the
        |                             working dir of each executor.""".stripMargin
     )
     SparkSubmit.exitFn()
   }
 }
+
+object SparkSubmitArguments {
+  /** Load properties present in the given file. */
+  def getPropertiesFromFile(file: File): Seq[(String, String)] = {
+    require(file.exists(), s"Properties file ${file.getName} does not exist")
+    val inputStream = new FileInputStream(file)
+    val properties = new Properties()
+    try {
+      properties.load(inputStream)
+    } catch {
+      case e: IOException =>
+        val message = s"Failed when loading Spark properties file ${file.getName}"
+        throw new SparkException(message, e)
+    }
+    properties.stringPropertyNames().toSeq.map(k => (k, properties(k)))
+  }
+}
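The new SparkSubmitArguments.getPropertiesFromFile helper loads a java.util.Properties file and returns its entries as key/value pairs; only the spark.* keys end up in the defaults map. A sketch of exercising it against a throwaway defaults file (file contents and values are illustrative, and the sketch is placed in the same package so the helper is visible regardless of its access modifier):

package org.apache.spark.deploy

import java.io.{File, PrintWriter}

object PropertiesFileSketch {
  def main(args: Array[String]): Unit = {
    // Write an illustrative spark-defaults.properties to a temp file.
    val file = File.createTempFile("spark-defaults", ".properties")
    val out = new PrintWriter(file)
    out.println("spark.master          spark://host:7077")
    out.println("spark.executor.memory 2g")
    out.println("log4j.rootCategory    WARN") // not a spark.* key; SparkSubmitArguments would warn and skip it
    out.close()

    // Same call SparkSubmitArguments makes when building its defaults map.
    SparkSubmitArguments.getPropertiesFromFile(file).foreach { case (k, v) =>
      println(s"$k = $v")
    }
    file.delete()
  }
}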

core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala

Lines changed: 21 additions & 1 deletion
@@ -17,14 +17,17 @@
 
 package org.apache.spark.deploy
 
-import java.io.{OutputStream, PrintStream}
+import java.io.{File, OutputStream, PrintStream}
 
 import scala.collection.mutable.ArrayBuffer
 
 import org.scalatest.FunSuite
 import org.scalatest.matchers.ShouldMatchers
 
 import org.apache.spark.deploy.SparkSubmit._
+import org.scalatest.prop.Tables.Table
+import org.scalatest.prop.TableDrivenPropertyChecks._
+import org.apache.spark.util.Utils
 
 
 class SparkSubmitSuite extends FunSuite with ShouldMatchers {
@@ -71,6 +74,13 @@ class SparkSubmitSuite extends FunSuite with ShouldMatchers {
     testPrematureExit(Array("--help"), "Usage: spark-submit") should be (true)
   }
 
+  test("prints error with unrecognized option") {
+    testPrematureExit(Array("my.jar --blarg"), "Unrecognized option '--blarg'") should be (true)
+    testPrematureExit(Array("my.jar -bleg"), "Unrecognized option: '-bleg'") should be (true)
+    testPrematureExit(Array("my.jar --master=abc"),
+      "Unrecognized option: '--master=abc'. Perhaps you want '--master abc'?") should be (true)
+  }
+
   test("handles multiple binary definitions") {
     val adjacentJars = Array("foo.jar", "bar.jar")
     testPrematureExit(adjacentJars, "error: Found two conflicting resources") should be (true)
@@ -175,4 +185,14 @@ class SparkSubmitSuite extends FunSuite with ShouldMatchers {
     sysProps("spark.executor.memory") should be ("5g")
     sysProps("spark.cores.max") should be ("5")
   }
+
+  def runSparkSubmit(args: Seq[String]): String = {
+    val sparkHome = sys.env.get("SPARK_HOME").orElse(sys.props.get("spark.home")).get
+    Utils.executeAndGetOutput(
+      Seq("./bin/spark-submit") ++ args,
+      new File(sparkHome),
+      Map("SPARK_TESTING" -> "1", "SPARK_HOME" -> sparkHome))
+  }
+
+
 }
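The new runSparkSubmit helper shells out to bin/spark-submit with SPARK_TESTING=1 and returns its output; the end-to-end tests that would call it are not part of this diff. A hedged sketch of how such a test inside SparkSubmitSuite might look (the jar path, arguments, and expected output string are illustrative only):

  test("launches an application end-to-end via spark-submit") {
    // Illustrative only: assumes SPARK_HOME points at a built Spark tree with the example jar.
    val output = runSparkSubmit(Seq(
      "--class", "org.apache.spark.examples.SparkPi",
      "--master", "local",
      "examples/target/spark-examples.jar", "2"))
    output should include ("Pi is roughly")
  }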

docs/cluster-overview.md

Lines changed: 8 additions & 0 deletions
@@ -138,6 +138,14 @@ The following table summarizes terms you'll see used to refer to cluster concept
   <tr>
     <td>Application</td>
     <td>User program built on Spark. Consists of a <em>driver program</em> and <em>executors</em> on the cluster.</td>
   </tr>
+  <tr>
+    <td>Application jar</td>
+    <td>
+      A jar containing the user's Spark application. In some cases users will want to create
+      an "uber jar" containing their application along with its dependencies. The user's jar
+      should never include Hadoop or Spark libraries, however, these will be added at runtime.
+    </td>
+  </tr>
   <tr>
     <td>Driver program</td>
     <td>The process running the main() function of the application and creating the SparkContext</td>
