Commit 7a01e4a

Author: Marcelo Vanzin
Fix pyspark on Yarn.
pyspark (at least) relies on SPARK_HOME (the env variable) to be set for things to work properly. The launcher wasn't making sure that variable was set in all cases, so do that. Also, separately, the Yarn backend didn't seem to propagate that variable to the AM for some reason, so do that too. (Not sure how things worked previously...)

Extra: add ".pyo" files to .gitignore (these are generated by `python -O`).
1 parent 1b3f6e9 commit 7a01e4a

File tree

6 files changed: +23, -6 lines
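
The whole change hinges on SPARK_HOME being visible to every process that ends up running pyspark code. As a rough Scala sketch (illustrative only; `pysparkPythonPath` and the directory layout are assumptions for this example, not Spark source), pyspark can only assemble its Python search path once it knows where the installation lives:

object SparkHomeSketch {
  // Hypothetical helper for this example only: derive pyspark's Python search
  // path from SPARK_HOME. The directory layout shown is an assumption.
  def pysparkPythonPath(sparkHome: Option[String]): Either[String, Seq[String]] =
    sparkHome match {
      case Some(home) => Right(Seq(s"$home/python"))  // plus the bundled py4j under python/lib
      case None       => Left("SPARK_HOME not set: cannot locate pyspark sources")
    }

  def main(args: Array[String]): Unit = {
    println(pysparkPythonPath(sys.env.get("SPARK_HOME")))  // depends on the caller's environment
    println(pysparkPythonPath(Some("/opt/spark")))          // Right(List(/opt/spark/python))
  }
}

When neither the launcher nor the Yarn client passes the variable along, that lookup fails in the child process, which is the failure mode the launcher and ClientBase changes below address.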

.gitignore

Lines changed: 1 addition & 0 deletions
@@ -6,6 +6,7 @@
 *.iml
 *.iws
 *.pyc
+*.pyo
 .idea/
 .idea_modules/
 build/*.jar

bin/pyspark

Lines changed: 1 addition & 1 deletion
@@ -96,4 +96,4 @@ fi
 
 export PYSPARK_DRIVER_PYTHON
 export PYSPARK_DRIVER_PYTHON_OPTS
-exec $SPARK_HOME/bin/spark-class pyspark "$@"
+exec "$SPARK_HOME"/bin/spark-class pyspark "$@"

bin/spark-shell

Lines changed: 1 addition & 1 deletion
@@ -42,7 +42,7 @@ if [[ "$@" = *--help ]] || [[ "$@" = *-h ]]; then
 fi
 
 # SPARK-4161: scala does not assume use of the java classpath,
-# so we need to add the "-Dscala.usejavacp=true" flag mnually. We
+# so we need to add the "-Dscala.usejavacp=true" flag manually. We
 # do this specifically for the Spark shell because the scala REPL
 # has its own class loader, and any additional classpath specified
 # through spark.driver.extraClassPath is not automatically propagated.

bin/spark-shell2.cmd

Lines changed: 1 addition & 1 deletion
@@ -26,7 +26,7 @@ if %ERRORLEVEL% equ 0 (
 )
 
 rem SPARK-4161: scala does not assume use of the java classpath,
-rem so we need to add the "-Dscala.usejavacp=true" flag mnually. We
+rem so we need to add the "-Dscala.usejavacp=true" flag manually. We
 rem do this specifically for the Spark shell because the scala REPL
 rem has its own class loader, and any additional classpath specified
 rem through spark.driver.extraClassPath is not automatically propagated.

launcher/src/main/java/org/apache/spark/launcher/AbstractLauncher.java

Lines changed: 12 additions & 3 deletions
@@ -331,11 +331,20 @@ protected String getScalaVersion() {
   protected List<String> prepareForOs(List<String> cmd,
       String libPath,
       Map<String, String> env) {
+
+    // If SPARK_HOME does not come from the environment, explicitly set it
+    // in the child's environment.
+    Map<String, String> childEnv = env;
+    if (System.getenv("SPARK_HOME") == null && !env.containsKey("SPARK_HOME")) {
+      childEnv = new HashMap<String, String>(env);
+      childEnv.put("SPARK_HOME", sparkHome);
+    }
+
     if (isWindows()) {
-      return prepareForWindows(cmd, libPath, env);
+      return prepareForWindows(cmd, libPath, childEnv);
     }
 
-    if (isEmpty(libPath) && env.isEmpty()) {
+    if (isEmpty(libPath) && childEnv.isEmpty()) {
       return cmd;
     }
 
@@ -348,7 +357,7 @@ protected List<String> prepareForOs(List<String> cmd,
       String newEnvValue = join(File.pathSeparator, currEnvValue, libPath);
       newCmd.add(String.format("%s=%s", envName, newEnvValue));
     }
-    for (Map.Entry<String, String> e : env.entrySet()) {
+    for (Map.Entry<String, String> e : childEnv.entrySet()) {
      newCmd.add(String.format("%s=%s", e.getKey(), e.getValue()));
     }
     newCmd.addAll(cmd);
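
Restating the new launcher rule outside the diff, as a hedged Scala sketch (the launcher itself is Java, and `ensureSparkHome` is a name invented here): SPARK_HOME is only added to the child environment when neither the parent process environment nor the explicitly supplied child environment already defines it, so an existing value is never overridden.

object ChildEnvSketch {
  // Sketch of the condition added in prepareForOs, expressed over plain maps.
  def ensureSparkHome(
      childEnv: Map[String, String],
      sparkHome: String,
      parentEnv: Map[String, String] = sys.env): Map[String, String] =
    if (parentEnv.contains("SPARK_HOME") || childEnv.contains("SPARK_HOME")) childEnv
    else childEnv + ("SPARK_HOME" -> sparkHome)

  def main(args: Array[String]): Unit = {
    println(ensureSparkHome(Map.empty, "/opt/spark", parentEnv = Map.empty))
    // Map(SPARK_HOME -> /opt/spark): added because neither source provided it
    println(ensureSparkHome(Map("SPARK_HOME" -> "/custom"), "/opt/spark", parentEnv = Map.empty))
    // Map(SPARK_HOME -> /custom): an existing value is left alone
  }
}

The Java change follows the same idea while avoiding an unnecessary copy: the incoming env map is only duplicated into a new HashMap when SPARK_HOME actually needs to be injected.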

yarn/src/main/scala/org/apache/spark/deploy/yarn/ClientBase.scala

Lines changed: 7 additions & 0 deletions
@@ -242,6 +242,13 @@ private[spark] trait ClientBase extends Logging {
     env("SPARK_YARN_STAGING_DIR") = stagingDir
     env("SPARK_USER") = UserGroupInformation.getCurrentUser().getShortUserName()
 
+    // Propagate SPARK_HOME to the containers. This is needed for pyspark to
+    // work, since the executor's PYTHONPATH is built based on the location
+    // of SPARK_HOME.
+    sparkConf.getOption("spark.home").orElse(sys.env.get("SPARK_HOME")).foreach { path =>
+      env("SPARK_HOME") = path
+    }
+
     // Set the environment variables to be passed on to the executors.
     distCacheMgr.setDistFilesEnv(env)
     distCacheMgr.setDistArchivesEnv(env)
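
The Scala addition above leans on an Option fallback chain: prefer the `spark.home` config value, fall back to the submitter's SPARK_HOME environment variable, and do nothing when neither exists. A minimal sketch, using a plain Map as a stand-in for SparkConf (that substitution is an assumption of this example):

import scala.collection.mutable

object PropagateSparkHomeSketch {
  // `conf` stands in for SparkConf in this sketch; `env` mirrors the mutable
  // map of environment variables handed to the Yarn containers.
  def propagateSparkHome(conf: Map[String, String], env: mutable.Map[String, String]): Unit =
    conf.get("spark.home").orElse(sys.env.get("SPARK_HOME")).foreach { path =>
      env("SPARK_HOME") = path
    }

  def main(args: Array[String]): Unit = {
    val env = mutable.Map[String, String]()
    propagateSparkHome(Map("spark.home" -> "/opt/spark"), env)
    println(env)  // Map(SPARK_HOME -> /opt/spark)
  }
}

Using foreach on the Option means the containers' SPARK_HOME is simply left unset when the submitter provides no value, rather than being set to an empty or bogus path.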
