Commit 7a01e4a

Author: Marcelo Vanzin
Fix pyspark on Yarn.
pyspark (at least) relies on SPARK_HOME (the env variable) to be set for things to work properly. The launcher wasn't making sure that variable was set in all cases, so do that. Also, separately, the Yarn backend didn't seem to propagate that variable to the AM for some reason, so do that too. (Not sure how things worked previously...)

Extra: add ".pyo" files to .gitignore (these are generated by `python -O`).
1 parent 1b3f6e9 commit 7a01e4a

File tree

6 files changed: +23, -6 lines
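
The whole change hinges on SPARK_HOME being visible to every process that ends up running pyspark code. As a rough Scala sketch (illustrative only; `pysparkPythonPath` and the directory layout are assumptions for this example, not Spark source), pyspark can only assemble its Python search path once it knows where the installation lives:

object SparkHomeSketch {
  // Hypothetical helper for this example only: derive pyspark's Python search
  // path from SPARK_HOME. The directory layout shown is an assumption.
  def pysparkPythonPath(sparkHome: Option[String]): Either[String, Seq[String]] =
    sparkHome match {
      case Some(home) => Right(Seq(s"$home/python"))  // plus the bundled py4j under python/lib
      case None       => Left("SPARK_HOME not set: cannot locate pyspark sources")
    }

  def main(args: Array[String]): Unit = {
    println(pysparkPythonPath(sys.env.get("SPARK_HOME")))  // depends on the caller's environment
    println(pysparkPythonPath(Some("/opt/spark")))          // Right(List(/opt/spark/python))
  }
}

When neither the launcher nor the Yarn client passes the variable along, that lookup fails in the child process, which is the failure mode the launcher and ClientBase changes below address.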

.gitignore

Lines changed: 1 addition & 0 deletions
@@ -6,6 +6,7 @@
 *.iml
 *.iws
 *.pyc
+*.pyo
 .idea/
 .idea_modules/
 build/*.jar

bin/pyspark

Lines changed: 1 addition & 1 deletion
@@ -96,4 +96,4 @@ fi
 
 export PYSPARK_DRIVER_PYTHON
 export PYSPARK_DRIVER_PYTHON_OPTS
-exec $SPARK_HOME/bin/spark-class pyspark "$@"
+exec "$SPARK_HOME"/bin/spark-class pyspark "$@"

bin/spark-shell

Lines changed: 1 addition & 1 deletion
@@ -42,7 +42,7 @@ if [[ "$@" = *--help ]] || [[ "$@" = *-h ]]; then
 fi
 
 # SPARK-4161: scala does not assume use of the java classpath,
-# so we need to add the "-Dscala.usejavacp=true" flag mnually. We
+# so we need to add the "-Dscala.usejavacp=true" flag manually. We
 # do this specifically for the Spark shell because the scala REPL
 # has its own class loader, and any additional classpath specified
 # through spark.driver.extraClassPath is not automatically propagated.

bin/spark-shell2.cmd

Lines changed: 1 addition & 1 deletion
@@ -26,7 +26,7 @@ if %ERRORLEVEL% equ 0 (
 )
 
 rem SPARK-4161: scala does not assume use of the java classpath,
-rem so we need to add the "-Dscala.usejavacp=true" flag mnually. We
+rem so we need to add the "-Dscala.usejavacp=true" flag manually. We
 rem do this specifically for the Spark shell because the scala REPL
 rem has its own class loader, and any additional classpath specified
 rem through spark.driver.extraClassPath is not automatically propagated.

launcher/src/main/java/org/apache/spark/launcher/AbstractLauncher.java

Lines changed: 12 additions & 3 deletions
@@ -331,11 +331,20 @@ protected String getScalaVersion() {
   protected List<String> prepareForOs(List<String> cmd,
       String libPath,
       Map<String, String> env) {
+
+    // If SPARK_HOME does not come from the environment, explicitly set it
+    // in the child's environment.
+    Map<String, String> childEnv = env;
+    if (System.getenv("SPARK_HOME") == null && !env.containsKey("SPARK_HOME")) {
+      childEnv = new HashMap<String, String>(env);
+      childEnv.put("SPARK_HOME", sparkHome);
+    }
+
     if (isWindows()) {
-      return prepareForWindows(cmd, libPath, env);
+      return prepareForWindows(cmd, libPath, childEnv);
     }
 
-    if (isEmpty(libPath) && env.isEmpty()) {
+    if (isEmpty(libPath) && childEnv.isEmpty()) {
       return cmd;
     }
 
@@ -348,7 +357,7 @@ protected List<String> prepareForOs(List<String> cmd,
       String newEnvValue = join(File.pathSeparator, currEnvValue, libPath);
       newCmd.add(String.format("%s=%s", envName, newEnvValue));
     }
-    for (Map.Entry<String, String> e : env.entrySet()) {
+    for (Map.Entry<String, String> e : childEnv.entrySet()) {
      newCmd.add(String.format("%s=%s", e.getKey(), e.getValue()));
     }
     newCmd.addAll(cmd);
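
Restating the new launcher rule outside the diff, as a hedged Scala sketch (the launcher itself is Java, and `ensureSparkHome` is a name invented here): SPARK_HOME is only added to the child environment when neither the parent process environment nor the explicitly supplied child environment already defines it, so an existing value is never overridden.

object ChildEnvSketch {
  // Sketch of the condition added in prepareForOs, expressed over plain maps.
  def ensureSparkHome(
      childEnv: Map[String, String],
      sparkHome: String,
      parentEnv: Map[String, String] = sys.env): Map[String, String] =
    if (parentEnv.contains("SPARK_HOME") || childEnv.contains("SPARK_HOME")) childEnv
    else childEnv + ("SPARK_HOME" -> sparkHome)

  def main(args: Array[String]): Unit = {
    println(ensureSparkHome(Map.empty, "/opt/spark", parentEnv = Map.empty))
    // Map(SPARK_HOME -> /opt/spark): added because neither source provided it
    println(ensureSparkHome(Map("SPARK_HOME" -> "/custom"), "/opt/spark", parentEnv = Map.empty))
    // Map(SPARK_HOME -> /custom): an existing value is left alone
  }
}

The Java change follows the same idea while avoiding an unnecessary copy: the incoming env map is only duplicated into a new HashMap when SPARK_HOME actually needs to be injected.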

yarn/src/main/scala/org/apache/spark/deploy/yarn/ClientBase.scala

Lines changed: 7 additions & 0 deletions
@@ -242,6 +242,13 @@ private[spark] trait ClientBase extends Logging {
     env("SPARK_YARN_STAGING_DIR") = stagingDir
     env("SPARK_USER") = UserGroupInformation.getCurrentUser().getShortUserName()
 
+    // Propagate SPARK_HOME to the containers. This is needed for pyspark to
+    // work, since the executor's PYTHONPATH is built based on the location
+    // of SPARK_HOME.
+    sparkConf.getOption("spark.home").orElse(sys.env.get("SPARK_HOME")).foreach { path =>
+      env("SPARK_HOME") = path
+    }
+
     // Set the environment variables to be passed on to the executors.
     distCacheMgr.setDistFilesEnv(env)
     distCacheMgr.setDistArchivesEnv(env)
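
The Scala addition above leans on an Option fallback chain: prefer the `spark.home` config value, fall back to the submitter's SPARK_HOME environment variable, and do nothing when neither exists. A minimal sketch, using a plain Map as a stand-in for SparkConf (that substitution is an assumption of this example):

import scala.collection.mutable

object PropagateSparkHomeSketch {
  // `conf` stands in for SparkConf in this sketch; `env` mirrors the mutable
  // map of environment variables handed to the Yarn containers.
  def propagateSparkHome(conf: Map[String, String], env: mutable.Map[String, String]): Unit =
    conf.get("spark.home").orElse(sys.env.get("SPARK_HOME")).foreach { path =>
      env("SPARK_HOME") = path
    }

  def main(args: Array[String]): Unit = {
    val env = mutable.Map[String, String]()
    propagateSparkHome(Map("spark.home" -> "/opt/spark"), env)
    println(env)  // Map(SPARK_HOME -> /opt/spark)
  }
}

Using foreach on the Option means the containers' SPARK_HOME is simply left unset when the submitter provides no value, rather than being set to an empty or bogus path.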
