Skip to content

Commit

Permalink
[SPARK-18535][UI][YARN] Redact sensitive information from Spark logs …
Browse files Browse the repository at this point in the history
…and UI

## What changes were proposed in this pull request?

This patch adds a new property called `spark.secret.redactionPattern` that
allows users to specify a scala regex to decide which Spark configuration
properties and environment variables in driver and executor environments
contain sensitive information. When this regex matches the property or
environment variable name, its value is redacted from the environment UI and
various logs like YARN and event logs.

This change uses this property to redact information from event logs and YARN
logs. It also, updates the UI code to adhere to this property instead of
hardcoding the logic to decipher which properties are sensitive.

Here's an image of the UI post-redaction:
![image](https://cloud.githubusercontent.com/assets/1709451/20506215/4cc30654-b007-11e6-8aee-4cde253fba2f.png)

Here's the text in the YARN logs, post-redaction:
``HADOOP_CREDSTORE_PASSWORD -> *********(redacted)``

Here's the text in the event logs, post-redaction:
``...,"spark.executorEnv.HADOOP_CREDSTORE_PASSWORD":"*********(redacted)","spark.yarn.appMasterEnv.HADOOP_CREDSTORE_PASSWORD":"*********(redacted)",...``

## How was this patch tested?
1. Unit tests are added to ensure that redaction works.
2. A YARN job reading data off of S3 with confidential information
(hadoop credential provider password) being provided in the environment
variables of driver and executor. And, afterwards, logs were grepped to make
sure that no mention of secret password was present. It was also ensure that
the job was able to read the data off of S3 correctly, thereby ensuring that
the sensitive information was being trickled down to the right places to read
the data.
3. The event logs were checked to make sure no mention of secret password was
present.
4. UI environment tab was checked to make sure there was no secret information
being displayed.

Author: Mark Grover <mark@apache.org>

Closes apache#15971 from markgrover/master_redaction.
  • Loading branch information
markgrover authored and Marcelo Vanzin committed Nov 28, 2016
1 parent d31ff9b commit 237c3b9
Show file tree
Hide file tree
Showing 9 changed files with 81 additions and 12 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -223,4 +223,13 @@ package object config {
" bigger files.")
.longConf
.createWithDefault(4 * 1024 * 1024)

private[spark] val SECRET_REDACTION_PATTERN =
ConfigBuilder("spark.redaction.regex")
.doc("Regex to decide which Spark configuration properties and environment variables in " +
"driver and executor environments contain sensitive information. When this regex matches " +
"a property, its value is redacted from the environment UI and various logs like YARN " +
"and event logs.")
.stringConf
.createWithDefault("(?i)secret|password")
}
Original file line number Diff line number Diff line change
Expand Up @@ -153,7 +153,9 @@ private[spark] class EventLoggingListener(

override def onTaskEnd(event: SparkListenerTaskEnd): Unit = logEvent(event)

override def onEnvironmentUpdate(event: SparkListenerEnvironmentUpdate): Unit = logEvent(event)
override def onEnvironmentUpdate(event: SparkListenerEnvironmentUpdate): Unit = {
logEvent(redactEvent(event))
}

// Events that trigger a flush
override def onStageCompleted(event: SparkListenerStageCompleted): Unit = {
Expand Down Expand Up @@ -231,6 +233,15 @@ private[spark] class EventLoggingListener(
}
}

private[spark] def redactEvent(
event: SparkListenerEnvironmentUpdate): SparkListenerEnvironmentUpdate = {
// "Spark Properties" entry will always exist because the map is always populated with it.
val redactedProps = Utils.redact(sparkConf, event.environmentDetails("Spark Properties"))
val redactedEnvironmentDetails = event.environmentDetails +
("Spark Properties" -> redactedProps)
SparkListenerEnvironmentUpdate(redactedEnvironmentDetails)
}

}

private[spark] object EventLoggingListener extends Logging {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,21 +22,17 @@ import javax.servlet.http.HttpServletRequest
import scala.xml.Node

import org.apache.spark.ui.{UIUtils, WebUIPage}
import org.apache.spark.util.Utils

private[ui] class EnvironmentPage(parent: EnvironmentTab) extends WebUIPage("") {
private val listener = parent.listener

private def removePass(kv: (String, String)): (String, String) = {
if (kv._1.toLowerCase.contains("password") || kv._1.toLowerCase.contains("secret")) {
(kv._1, "******")
} else kv
}

def render(request: HttpServletRequest): Seq[Node] = {
val runtimeInformationTable = UIUtils.listingTable(
propertyHeader, jvmRow, listener.jvmInformation, fixedWidth = true)
val sparkPropertiesTable = UIUtils.listingTable(
propertyHeader, propertyRow, listener.sparkProperties.map(removePass), fixedWidth = true)
val sparkPropertiesTable = UIUtils.listingTable(propertyHeader, propertyRow,
Utils.redact(parent.conf, listener.sparkProperties), fixedWidth = true)

val systemPropertiesTable = UIUtils.listingTable(
propertyHeader, propertyRow, listener.systemProperties, fixedWidth = true)
val classpathEntriesTable = UIUtils.listingTable(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ import org.apache.spark.ui._

private[ui] class EnvironmentTab(parent: SparkUI) extends SparkUITab(parent, "environment") {
val listener = parent.environmentListener
val conf = parent.conf
attachPage(new EnvironmentPage(this))
}

Expand Down
14 changes: 13 additions & 1 deletion core/src/main/scala/org/apache/spark/util/Utils.scala
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ import org.slf4j.Logger
import org.apache.spark._
import org.apache.spark.deploy.SparkHadoopUtil
import org.apache.spark.internal.Logging
import org.apache.spark.internal.config.{DYN_ALLOCATION_INITIAL_EXECUTORS, DYN_ALLOCATION_MIN_EXECUTORS, EXECUTOR_INSTANCES}
import org.apache.spark.internal.config._
import org.apache.spark.network.util.JavaUtils
import org.apache.spark.serializer.{DeserializationStream, SerializationStream, SerializerInstance}
import org.apache.spark.util.logging.RollingFileAppender
Expand Down Expand Up @@ -2555,6 +2555,18 @@ private[spark] object Utils extends Logging {
sparkJars.map(_.split(",")).map(_.filter(_.nonEmpty)).toSeq.flatten
}
}

private[util] val REDACTION_REPLACEMENT_TEXT = "*********(redacted)"

def redact(conf: SparkConf, kvs: Seq[(String, String)]): Seq[(String, String)] = {
val redactionPattern = conf.get(SECRET_REDACTION_PATTERN).r
kvs.map { kv =>
redactionPattern.findFirstIn(kv._1)
.map { ignore => (kv._1, REDACTION_REPLACEMENT_TEXT) }
.getOrElse(kv)
}
}

}

private[util] object CallerContext extends Logging {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,18 @@ class EventLoggingListenerSuite extends SparkFunSuite with LocalSparkContext wit
}
}

test("Event logging with password redaction") {
val key = "spark.executorEnv.HADOOP_CREDSTORE_PASSWORD"
val secretPassword = "secret_password"
val conf = getLoggingConf(testDirPath, None)
.set(key, secretPassword)
val eventLogger = new EventLoggingListener("test", None, testDirPath.toUri(), conf)
val envDetails = SparkEnv.environmentDetails(conf, "FIFO", Seq.empty, Seq.empty)
val event = SparkListenerEnvironmentUpdate(envDetails)
val redactedProps = eventLogger.redactEvent(event).environmentDetails("Spark Properties").toMap
assert(redactedProps(key) == "*********(redacted)")
}

test("Log overwriting") {
val logUri = EventLoggingListener.getLogPath(testDir.toURI, "test", None)
val logPath = new URI(logUri).getPath
Expand Down
20 changes: 20 additions & 0 deletions core/src/test/scala/org/apache/spark/util/UtilsSuite.scala
Original file line number Diff line number Diff line change
Expand Up @@ -974,4 +974,24 @@ class UtilsSuite extends SparkFunSuite with ResetSystemProperties with Logging {

assert(pValue > threshold)
}

test("redact sensitive information") {
val sparkConf = new SparkConf

// Set some secret keys
val secretKeys = Seq(
"spark.executorEnv.HADOOP_CREDSTORE_PASSWORD",
"spark.my.password",
"spark.my.sECreT")
secretKeys.foreach { key => sparkConf.set(key, "secret_password") }
// Set a non-secret key
sparkConf.set("spark.regular.property", "not_a_secret")

// Redact sensitive information
val redactedConf = Utils.redact(sparkConf, sparkConf.getAll).toMap

// Assert that secret information got redacted while the regular property remained the same
secretKeys.foreach { key => assert(redactedConf(key) === Utils.REDACTION_REPLACEMENT_TEXT) }
assert(redactedConf("spark.regular.property") === "not_a_secret")
}
}
9 changes: 9 additions & 0 deletions docs/configuration.md
Original file line number Diff line number Diff line change
Expand Up @@ -356,6 +356,15 @@ Apart from these, the following properties are also available, and may be useful
process. The user can specify multiple of these to set multiple environment variables.
</td>
</tr>
<tr>
<td><code>spark.redaction.regex</code></td>
<td>(?i)secret|password</td>
<td>
Regex to decide which Spark configuration properties and environment variables in driver and
executor environments contain sensitive information. When this regex matches a property, its
value is redacted from the environment UI and various logs like YARN and event logs.
</td>
</tr>
<tr>
<td><code>spark.python.profile</code></td>
<td>false</td>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,6 @@ import org.apache.hadoop.yarn.ipc.YarnRPC
import org.apache.hadoop.yarn.util.{ConverterUtils, Records}

import org.apache.spark.{SecurityManager, SparkConf, SparkException}
import org.apache.spark.deploy.yarn.config._
import org.apache.spark.internal.Logging
import org.apache.spark.internal.config._
import org.apache.spark.launcher.YarnCommandBuilderUtils
Expand Down Expand Up @@ -75,7 +74,7 @@ private[yarn] class ExecutorRunnable(
|===============================================================================
|YARN executor launch context:
| env:
|${env.map { case (k, v) => s" $k -> $v\n" }.mkString}
|${Utils.redact(sparkConf, env.toSeq).map { case (k, v) => s" $k -> $v\n" }.mkString}
| command:
| ${commands.mkString(" \\ \n ")}
|
Expand Down

0 comments on commit 237c3b9

Please sign in to comment.