
Commit 2cfd215

[SPARK-33191][YARN][TESTS] Fix PySpark test cases in YarnClusterSuite
### What changes were proposed in this pull request?

This PR proposes to fix:

```
org.apache.spark.deploy.yarn.YarnClusterSuite.run Python application in yarn-client mode
org.apache.spark.deploy.yarn.YarnClusterSuite.run Python application in yarn-cluster mode
org.apache.spark.deploy.yarn.YarnClusterSuite.run Python application in yarn-cluster mode using spark.yarn.appMasterEnv to override local envvar
```

These currently fail as below:

```
20/10/16 19:20:36 WARN TaskSetManager: Lost task 0.0 in stage 0.0 (TID 0) (amp-jenkins-worker-03.amp executor 1): org.apache.spark.SparkException:
Error from python worker:
  Traceback (most recent call last):
    File "/usr/lib64/python2.6/runpy.py", line 104, in _run_module_as_main
      loader, code, fname = _get_module_details(mod_name)
    File "/usr/lib64/python2.6/runpy.py", line 79, in _get_module_details
      loader = get_loader(mod_name)
    File "/usr/lib64/python2.6/pkgutil.py", line 456, in get_loader
      return find_loader(fullname)
    File "/usr/lib64/python2.6/pkgutil.py", line 466, in find_loader
      for importer in iter_importers(fullname):
    File "/usr/lib64/python2.6/pkgutil.py", line 422, in iter_importers
      __import__(pkg)
    File "/home/jenkins/workspace/SparkPullRequestBuilder2/python/pyspark/__init__.py", line 53, in <module>
      from pyspark.rdd import RDD, RDDBarrier
    File "/home/jenkins/workspace/SparkPullRequestBuilder2/python/pyspark/rdd.py", line 34, in <module>
      from pyspark.java_gateway import local_connect_and_auth
    File "/home/jenkins/workspace/SparkPullRequestBuilder2/python/pyspark/java_gateway.py", line 29, in <module>
      from py4j.java_gateway import java_import, JavaGateway, JavaObject, GatewayParameters
    File "/home/jenkins/workspace/SparkPullRequestBuilder2/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py", line 60
      PY4J_TRUE = {"yes", "y", "t", "true"}
                ^
  SyntaxError: invalid syntax
```

I think this was broken when Python 2 support was dropped, but it was not caught because this specific test suite does not run unless the YARN code changes. See also #29843 (comment).

The root cause seems to be that the paths are different; see #29843 (review). I _think_ Jenkins uses a different Python executable via Anaconda, and for some reason the executor side does not know where it is. This PR proposes to fix it by explicitly specifying the absolute path of the Python executable, so the tests should pass in any environment.

### Why are the changes needed?

To make the tests pass.

### Does this PR introduce _any_ user-facing change?

No, dev-only.

### How was this patch tested?

This issue looks specific to Jenkins, so the tests should be run on Jenkins.

Closes #30099 from HyukjinKwon/SPARK-33191.

Authored-by: HyukjinKwon <gurwls223@apache.org>
Signed-off-by: HyukjinKwon <gurwls223@apache.org>
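The core of the fix is to resolve the bare interpreter name to an absolute path once on the test driver, rather than letting each YARN container re-resolve a plain `python` against its own environment. Below is a condensed, hypothetical sketch of that PATH lookup (`which` is an illustrative name; the actual helper added by this commit is `TestUtils.getAbsolutePathFromExecutable`, shown in the diff further down):

```scala
import java.io.File
import java.nio.file.{Files, Paths}
import java.util.regex.Pattern

// Sketch: scan PATH for the first directory containing a regular,
// executable file with the given name, like a shell `which`.
def which(executable: String): Option[String] =
  sys.env("PATH").split(Pattern.quote(File.pathSeparator))
    .map(dir => Paths.get(dir, executable))
    .find(p => Files.isRegularFile(p) && Files.isExecutable(p))
    .map(_.toString)
```

Pinning the resolved path means the driver, the application master, and the executors all agree on a single interpreter, which is what the Jenkins failure above was missing.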
1 parent eb9966b · commit 2cfd215

File tree: 2 files changed (+34, -4 lines)

core/src/main/scala/org/apache/spark/TestUtils.scala

Lines changed: 21 additions & 1 deletion
```diff
@@ -20,13 +20,14 @@ package org.apache.spark
 import java.io.{ByteArrayInputStream, File, FileInputStream, FileOutputStream}
 import java.net.{HttpURLConnection, URI, URL}
 import java.nio.charset.StandardCharsets
-import java.nio.file.{Files => JavaFiles}
+import java.nio.file.{Files => JavaFiles, Paths}
 import java.nio.file.attribute.PosixFilePermission.{OWNER_EXECUTE, OWNER_READ, OWNER_WRITE}
 import java.security.SecureRandom
 import java.security.cert.X509Certificate
 import java.util.{Arrays, EnumSet, Locale, Properties}
 import java.util.concurrent.{TimeoutException, TimeUnit}
 import java.util.jar.{JarEntry, JarOutputStream, Manifest}
+import java.util.regex.Pattern
 import javax.net.ssl._
 import javax.tools.{JavaFileObject, SimpleJavaFileObject, ToolProvider}
 
@@ -37,6 +38,7 @@ import scala.sys.process.{Process, ProcessLogger}
 import scala.util.Try
 
 import com.google.common.io.{ByteStreams, Files}
+import org.apache.commons.lang3.StringUtils
 import org.apache.log4j.PropertyConfigurator
 import org.json4s.JsonAST.JValue
 import org.json4s.jackson.JsonMethods.{compact, render}
@@ -268,6 +270,24 @@ private[spark] object TestUtils {
     attempt.isSuccess && attempt.get == 0
   }
 
+  /**
+   * Get the absolute path from the executable. This implementation was borrowed from
+   * `spark/dev/sparktestsupport/shellutils.py`.
+   */
+  def getAbsolutePathFromExecutable(executable: String): Option[String] = {
+    val command = if (Utils.isWindows) s"$executable.exe" else executable
+    if (command.split(File.separator, 2).length == 1 &&
+        JavaFiles.isRegularFile(Paths.get(command)) &&
+        JavaFiles.isExecutable(Paths.get(command))) {
+      Some(Paths.get(command).toAbsolutePath.toString)
+    } else {
+      sys.env("PATH").split(Pattern.quote(File.pathSeparator))
+        .map(path => Paths.get(s"${StringUtils.strip(path, "\"")}${File.separator}$command"))
+        .find(p => JavaFiles.isRegularFile(p) && JavaFiles.isExecutable(p))
+        .map(_.toString)
+    }
+  }
+
   /**
    * Returns the response code from an HTTP(S) URL.
   */
```
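For illustration, a minimal usage sketch of the new helper, assuming `python3` is on the build machine's PATH (the error message below is illustrative, not from the suite):

```scala
// Resolve python3 once and fail fast if it cannot be found.
val python3: String = TestUtils.getAbsolutePathFromExecutable("python3")
  .getOrElse(sys.error("python3 not found on PATH"))
// On a typical Linux host this yields something like "/usr/bin/python3".
```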

resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnClusterSuite.scala

Lines changed: 13 additions & 3 deletions
```diff
@@ -52,6 +52,13 @@ import org.apache.spark.util.{Utils, YarnContainerInfoHelper}
 @ExtendedYarnTest
 class YarnClusterSuite extends BaseYarnClusterSuite {
 
+  private val pythonExecutablePath = {
+    // To make sure to use the same Python executable.
+    val maybePath = TestUtils.getAbsolutePathFromExecutable("python3")
+    assert(maybePath.isDefined)
+    maybePath.get
+  }
+
   override def newYarnConfig(): YarnConfiguration = new YarnConfiguration()
 
   private val TEST_PYFILE = """
@@ -175,9 +182,9 @@ class YarnClusterSuite extends BaseYarnClusterSuite {
       clientMode = false,
       extraConf = Map(
         "spark.yarn.appMasterEnv.PYSPARK_DRIVER_PYTHON"
-          -> sys.env.getOrElse("PYSPARK_DRIVER_PYTHON", "python"),
+          -> sys.env.getOrElse("PYSPARK_DRIVER_PYTHON", pythonExecutablePath),
         "spark.yarn.appMasterEnv.PYSPARK_PYTHON"
-          -> sys.env.getOrElse("PYSPARK_PYTHON", "python")),
+          -> sys.env.getOrElse("PYSPARK_PYTHON", pythonExecutablePath)),
       extraEnv = Map(
         "PYSPARK_DRIVER_PYTHON" -> "not python",
         "PYSPARK_PYTHON" -> "not python"))
@@ -275,7 +282,10 @@ class YarnClusterSuite extends BaseYarnClusterSuite {
       s"$sparkHome/python")
     val extraEnvVars = Map(
       "PYSPARK_ARCHIVES_PATH" -> pythonPath.map("local:" + _).mkString(File.pathSeparator),
-      "PYTHONPATH" -> pythonPath.mkString(File.pathSeparator)) ++ extraEnv
+      "PYTHONPATH" -> pythonPath.mkString(File.pathSeparator),
+      "PYSPARK_DRIVER_PYTHON" -> pythonExecutablePath,
+      "PYSPARK_PYTHON" -> pythonExecutablePath
+    ) ++ extraEnv
 
     val moduleDir = {
       val subdir = new File(tempDir, "pyModules")
```
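One detail of the `Map(...) ++ extraEnv` shape is worth spelling out: Scala's `++` on maps keeps the right-hand value when keys collide, so a caller that passes its own `PYSPARK_PYTHON` through `extraEnv` still overrides the pinned default. A tiny sketch with illustrative paths:

```scala
// The right-hand operand of ++ wins on duplicate keys.
val defaults = Map("PYSPARK_PYTHON" -> "/usr/bin/python3")
val merged = defaults ++ Map("PYSPARK_PYTHON" -> "/opt/conda/bin/python3")
assert(merged("PYSPARK_PYTHON") == "/opt/conda/bin/python3")
```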
