Skip to content

[SPARK-20338][CORE]Spaces in spark.eventLog.dir are not correctly handled #17638

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 3 additions & 5 deletions core/src/main/scala/org/apache/spark/SparkContext.scala
Original file line number Diff line number Diff line change
Expand Up @@ -193,7 +193,7 @@ class SparkContext(config: SparkConf) extends Logging {
* ------------------------------------------------------------------------------------- */

private var _conf: SparkConf = _
private var _eventLogDir: Option[URI] = None
private var _eventLogDir: Option[String] = None
private var _eventLogCodec: Option[String] = None
private var _env: SparkEnv = _
private var _jobProgressListener: JobProgressListener = _
Expand Down Expand Up @@ -236,7 +236,7 @@ class SparkContext(config: SparkConf) extends Logging {
def appName: String = _conf.get("spark.app.name")

private[spark] def isEventLogEnabled: Boolean = _conf.getBoolean("spark.eventLog.enabled", false)
private[spark] def eventLogDir: Option[URI] = _eventLogDir
private[spark] def eventLogDir: Option[String] = _eventLogDir
private[spark] def eventLogCodec: Option[String] = _eventLogCodec

def isLocal: Boolean = Utils.isLocalMaster(_conf)
Expand Down Expand Up @@ -405,9 +405,7 @@ class SparkContext(config: SparkConf) extends Logging {

_eventLogDir =
if (isEventLogEnabled) {
val unresolvedDir = conf.get("spark.eventLog.dir", EventLoggingListener.DEFAULT_LOG_DIR)
.stripSuffix("/")
Some(Utils.resolveURI(unresolvedDir))
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Shouldn't you URI-encode this unresolvedDir here? I think encoding the space as "%20" should be enough. Not sure why you need to change lots of code.

scala> val a = new URI("/tmp/aa%20nn")
a: java.net.URI = /tmp/aa%20nn

scala> val path = new Path(a)
path: org.apache.hadoop.fs.Path = /tmp/aa nn

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If the dir contains a space and also contains %20 (e.g. "hdfs://nn:9000/a b%20c"), it seems to me that the encoding does not work well.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think in your case we need to percent-encode the unresolvedDir before calling resolveURI.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I suggest using new Path(path).toURI() instead of new URI(path), since new URI(path) does not support spaces in the path.
It is not necessary to encode if we use new Path(path).toURI().

Some(conf.get("spark.eventLog.dir", EventLoggingListener.DEFAULT_LOG_DIR).stripSuffix("/"))
} else {
None
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,15 +17,13 @@

package org.apache.spark.deploy

import java.net.URI

private[spark] case class ApplicationDescription(
name: String,
maxCores: Option[Int],
memoryPerExecutorMB: Int,
command: Command,
appUiUrl: String,
eventLogDir: Option[URI] = None,
eventLogDir: Option[String] = None,
// short name of compression codec used when writing event logs, if any (e.g. lzf)
eventLogCodec: Option[String] = None,
coresPerExecutor: Option[Int] = None,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@
package org.apache.spark.scheduler

import java.io._
import java.net.URI
import java.nio.charset.StandardCharsets
import java.util.Locale

Expand Down Expand Up @@ -50,22 +49,22 @@ import org.apache.spark.util.{JsonProtocol, Utils}
private[spark] class EventLoggingListener(
appId: String,
appAttemptId : Option[String],
logBaseDir: URI,
logBaseDir: String,
sparkConf: SparkConf,
hadoopConf: Configuration)
extends SparkListener with Logging {

import EventLoggingListener._

def this(appId: String, appAttemptId : Option[String], logBaseDir: URI, sparkConf: SparkConf) =
def this(appId: String, appAttemptId : Option[String], logBaseDir: String, sparkConf: SparkConf) =
this(appId, appAttemptId, logBaseDir, sparkConf,
SparkHadoopUtil.get.newConfiguration(sparkConf))

private val shouldCompress = sparkConf.getBoolean("spark.eventLog.compress", false)
private val shouldOverwrite = sparkConf.getBoolean("spark.eventLog.overwrite", false)
private val testing = sparkConf.getBoolean("spark.eventLog.testing", false)
private val outputBufferSize = sparkConf.getInt("spark.eventLog.buffer.kb", 100) * 1024
private val fileSystem = Utils.getHadoopFileSystem(logBaseDir, hadoopConf)
Copy link
Contributor

@jerryshao jerryshao Apr 20, 2017

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This probably is a by design choice, only when scheme is defined then Spark will pick the right FS, otherwise it will use local FS instead.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What about the URI "hdfs://nn:9000/a b/c"? Even though it has the right FS scheme, it will use the local FS instead.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Are you sure? Let me investigate a bit.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That is because resolveURI gets a URISyntaxException when resolving hdfs://nn:9000/a b/c, and resolveURI then falls back to a local File instead. Please see the implementation of resolveURI.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, I have tested it. In the resolveURI function, if the path contains a space, new URI(path) will throw an exception and the path will then be treated as a local FS path.
Thanks, shao.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So I think we should not use new URI(path), since it does not support spaces in the path.
I suggest using new Path(path).toURI() instead of new URI(path).

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't agree with you. The String and URI representations should be equivalent; merely switching to the String representation does not genuinely work around the issue.

I think in your case we need to fix resolveURI to handle the space case.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OK, I will try to fix resolveURI to handle the space case. Thanks.
What is your opinion on using val uri = new Path(path).toUri instead of val uri = new URI(path) in resolveURI? We would not need to encode then, right?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, from your case it is workable, but I'm not sure whether it could handle all the cases in the UTs.

private val fileSystem = new Path(logBaseDir).getFileSystem(hadoopConf)
private val compressionCodec =
if (shouldCompress) {
Some(CompressionCodec.createCodec(sparkConf))
Expand Down Expand Up @@ -96,8 +95,8 @@ private[spark] class EventLoggingListener(
}

val workingPath = logPath + IN_PROGRESS
val uri = new URI(workingPath)
val path = new Path(workingPath)
val uri = path.toUri
val defaultFs = FileSystem.getDefaultUri(hadoopConf).getScheme
val isDefaultLocal = defaultFs == null || defaultFs == "file"

Expand Down Expand Up @@ -303,11 +302,11 @@ private[spark] object EventLoggingListener extends Logging {
* @return A path which consists of file-system-safe characters.
*/
def getLogPath(
logBaseDir: URI,
logBaseDir: String,
appId: String,
appAttemptId: Option[String],
compressionCodecName: Option[String] = None): String = {
val base = logBaseDir.toString.stripSuffix("/") + "/" + sanitize(appId)
val base = logBaseDir.stripSuffix("/") + "/" + sanitize(appId)
val codec = compressionCodecName.map("." + _).getOrElse("")
if (appAttemptId.isDefined) {
base + "_" + sanitize(appAttemptId.get) + codec
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -61,8 +61,7 @@ class FsHistoryProviderSuite extends SparkFunSuite with BeforeAndAfter with Matc
inProgress: Boolean,
codec: Option[String] = None): File = {
val ip = if (inProgress) EventLoggingListener.IN_PROGRESS else ""
val logUri = EventLoggingListener.getLogPath(testDir.toURI, appId, appAttemptId)
val logPath = new URI(logUri).getPath + ip
val logPath = EventLoggingListener.getLogPath(testDir.getAbsolutePath, appId, appAttemptId) + ip
new File(logPath)
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ class EventLoggingListenerSuite extends SparkFunSuite with LocalSparkContext wit
private var testDirPath: Path = _

before {
testDir = Utils.createTempDir()
testDir = Utils.createTempDir(namePrefix = s"event log")
testDir.deleteOnExit()
testDirPath = new Path(testDir.getAbsolutePath())
}
Expand All @@ -62,7 +62,7 @@ class EventLoggingListenerSuite extends SparkFunSuite with LocalSparkContext wit
test("Verify log file exist") {
// Verify logging directory exists
val conf = getLoggingConf(testDirPath)
val eventLogger = new EventLoggingListener("test", None, testDirPath.toUri(), conf)
val eventLogger = new EventLoggingListener("test", None, testDirPath.toString, conf)
eventLogger.start()

val logPath = new Path(eventLogger.logPath + EventLoggingListener.IN_PROGRESS)
Expand Down Expand Up @@ -100,16 +100,15 @@ class EventLoggingListenerSuite extends SparkFunSuite with LocalSparkContext wit
val secretPassword = "secret_password"
val conf = getLoggingConf(testDirPath, None)
.set(key, secretPassword)
val eventLogger = new EventLoggingListener("test", None, testDirPath.toUri(), conf)
val eventLogger = new EventLoggingListener("test", None, testDirPath.toString, conf)
val envDetails = SparkEnv.environmentDetails(conf, "FIFO", Seq.empty, Seq.empty)
val event = SparkListenerEnvironmentUpdate(envDetails)
val redactedProps = eventLogger.redactEvent(event).environmentDetails("Spark Properties").toMap
assert(redactedProps(key) == "*********(redacted)")
}

test("Log overwriting") {
val logUri = EventLoggingListener.getLogPath(testDir.toURI, "test", None)
val logPath = new URI(logUri).getPath
val logPath = EventLoggingListener.getLogPath(testDir.toString, "test", None)
// Create file before writing the event log
new FileOutputStream(new File(logPath)).close()
// Expected IOException, since we haven't enabled log overwrite.
Expand All @@ -119,7 +118,7 @@ class EventLoggingListenerSuite extends SparkFunSuite with LocalSparkContext wit
}

test("Event log name") {
val baseDirUri = Utils.resolveURI("/base-dir")
val baseDirUri = "/base-dir"
// without compression
assert(s"${baseDirUri.toString}/app1" === EventLoggingListener.getLogPath(
baseDirUri, "app1", None))
Expand Down Expand Up @@ -154,7 +153,7 @@ class EventLoggingListenerSuite extends SparkFunSuite with LocalSparkContext wit
val conf = getLoggingConf(testDirPath, compressionCodec)
extraConf.foreach { case (k, v) => conf.set(k, v) }
val logName = compressionCodec.map("test-" + _).getOrElse("test")
val eventLogger = new EventLoggingListener(logName, None, testDirPath.toUri(), conf)
val eventLogger = new EventLoggingListener(logName, None, testDirPath.toString, conf)
val listenerBus = new LiveListenerBus(sc)
val applicationStart = SparkListenerApplicationStart("Greatest App (N)ever", None,
125L, "Mickey", None)
Expand Down Expand Up @@ -190,15 +189,12 @@ class EventLoggingListenerSuite extends SparkFunSuite with LocalSparkContext wit
* This runs a simple Spark job and asserts that the expected events are logged when expected.
*/
private def testApplicationEventLogging(compressionCodec: Option[String] = None) {
// Set defaultFS to something that would cause an exception, to make sure we don't run
// into SPARK-6688.
val conf = getLoggingConf(testDirPath, compressionCodec)
.set("spark.hadoop.fs.defaultFS", "unsupported://example.com")
sc = new SparkContext("local-cluster[2,2,1024]", "test", conf)
assert(sc.eventLogger.isDefined)
val eventLogger = sc.eventLogger.get
val eventLogPath = eventLogger.logPath
val expectedLogDir = testDir.toURI()
val expectedLogDir = testDir.getAbsolutePath
assert(eventLogPath === EventLoggingListener.getLogPath(
expectedLogDir, sc.applicationId, None, compressionCodec.map(CompressionCodec.getShortName)))

Expand Down Expand Up @@ -290,7 +286,7 @@ object EventLoggingListenerSuite {
val conf = new SparkConf
conf.set("spark.eventLog.enabled", "true")
conf.set("spark.eventLog.testing", "true")
conf.set("spark.eventLog.dir", logDir.toUri.toString)
conf.set("spark.eventLog.dir", logDir.toString)
compressionCodec.foreach { codec =>
conf.set("spark.eventLog.compress", "true")
conf.set("spark.io.compression.codec", codec)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -151,7 +151,7 @@ class ReplayListenerSuite extends SparkFunSuite with BeforeAndAfter with LocalSp
* log the events.
*/
private class EventMonster(conf: SparkConf)
extends EventLoggingListener("test", None, new URI("testdir"), conf) {
extends EventLoggingListener("test", None, "test dir", conf) {

override def start() { }

Expand Down