Skip to content

[SPARK-25865][CORE] Add GC information to ExecutorMetrics #22874

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 14 commits into from
Closed
19 changes: 19 additions & 0 deletions core/src/main/scala/org/apache/spark/internal/config/package.scala
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ package org.apache.spark.internal
import java.util.concurrent.TimeUnit

import org.apache.spark.launcher.SparkLauncher
import org.apache.spark.metrics.GarbageCollectionMetrics
import org.apache.spark.network.util.ByteUnit
import org.apache.spark.scheduler.{EventLoggingListener, SchedulingMode}
import org.apache.spark.storage.{DefaultTopologyMapper, RandomBlockReplicationPolicy}
Expand Down Expand Up @@ -114,6 +115,24 @@ package object config {
.booleanConf
.createWithDefault(false)

private[spark] val EVENT_LOG_GC_METRICS_YOUNG_GENERATION_GARBAGE_COLLECTORS =
ConfigBuilder("spark.eventLog.gcMetrics.youngGenerationGarbageCollectors")
.doc("Names of supported young generation garbage collector. A name usually is " +
" the return of GarbageCollectorMXBean.getName. The built-in young generation garbage " +
s"collectors are ${GarbageCollectionMetrics.YOUNG_GENERATION_BUILTIN_GARBAGE_COLLECTORS}")
.stringConf
.toSequence
.createWithDefault(GarbageCollectionMetrics.YOUNG_GENERATION_BUILTIN_GARBAGE_COLLECTORS)

private[spark] val EVENT_LOG_GC_METRICS_OLD_GENERATION_GARBAGE_COLLECTORS =
ConfigBuilder("spark.eventLog.gcMetrics.oldGenerationGarbageCollectors")
.doc("Names of supported old generation garbage collector. A name usually is " +
"the return of GarbageCollectorMXBean.getName. The built-in old generation garbage " +
s"collectors are ${GarbageCollectionMetrics.OLD_GENERATION_BUILTIN_GARBAGE_COLLECTORS}")
.stringConf
.toSequence
.createWithDefault(GarbageCollectionMetrics.OLD_GENERATION_BUILTIN_GARBAGE_COLLECTORS)

private[spark] val EVENT_LOG_OVERWRITE =
ConfigBuilder("spark.eventLog.overwrite").booleanConf.createWithDefault(false)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,12 @@ package org.apache.spark.metrics
import java.lang.management.{BufferPoolMXBean, ManagementFactory}
import javax.management.ObjectName

import scala.collection.JavaConverters._
import scala.collection.mutable

import org.apache.spark.SparkEnv
import org.apache.spark.executor.ProcfsMetricsGetter
import org.apache.spark.internal.{config, Logging}
import org.apache.spark.memory.MemoryManager

/**
Expand Down Expand Up @@ -99,6 +102,63 @@ case object ProcessTreeMetrics extends ExecutorMetricType {
}
}

case object GarbageCollectionMetrics extends ExecutorMetricType with Logging {
private var nonBuiltInCollectors: Seq[String] = Nil

override val names = Seq(
"MinorGCCount",
"MinorGCTime",
"MajorGCCount",
"MajorGCTime"
)

/* We builtin some common GC collectors which categorized as young generation and old */
private[spark] val YOUNG_GENERATION_BUILTIN_GARBAGE_COLLECTORS = Seq(
"Copy",
"PS Scavenge",
"ParNew",
"G1 Young Generation"
)

private[spark] val OLD_GENERATION_BUILTIN_GARBAGE_COLLECTORS = Seq(
"MarkSweepCompact",
"PS MarkSweep",
"ConcurrentMarkSweep",
"G1 Old Generation"
)

private lazy val youngGenerationGarbageCollector: Seq[String] = {
SparkEnv.get.conf.get(config.EVENT_LOG_GC_METRICS_YOUNG_GENERATION_GARBAGE_COLLECTORS)
}

private lazy val oldGenerationGarbageCollector: Seq[String] = {
SparkEnv.get.conf.get(config.EVENT_LOG_GC_METRICS_OLD_GENERATION_GARBAGE_COLLECTORS)
}

override private[spark] def getMetricValues(memoryManager: MemoryManager): Array[Long] = {
val gcMetrics = new Array[Long](names.length) // minorCount, minorTime, majorCount, majorTime
ManagementFactory.getGarbageCollectorMXBeans.asScala.foreach { mxBean =>
if (youngGenerationGarbageCollector.contains(mxBean.getName)) {
gcMetrics(0) = mxBean.getCollectionCount
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

is it ever possible that multiple collectors are considered young generation (or old generation)? If so, you'd want to use += here.

I'm not sure if there is a configuration where that might happen, but I'm also not super familiar with all variants here. @kiszk maybe you know?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

AFAIK, one collector per generation is supported only.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sorry for being late. I agree with one collector per generation.

gcMetrics(1) = mxBean.getCollectionTime
} else if (oldGenerationGarbageCollector.contains(mxBean.getName)) {
gcMetrics(2) = mxBean.getCollectionCount
gcMetrics(3) = mxBean.getCollectionTime
} else if (!nonBuiltInCollectors.contains(mxBean.getName)) {
nonBuiltInCollectors = mxBean.getName +: nonBuiltInCollectors
// log it when first seen
logWarning(s"To enable non-built-in garbage collector(s) " +
s"$nonBuiltInCollectors, users should configure it(them) to " +
s"${config.EVENT_LOG_GC_METRICS_YOUNG_GENERATION_GARBAGE_COLLECTORS.key} or " +
s"${config.EVENT_LOG_GC_METRICS_OLD_GENERATION_GARBAGE_COLLECTORS.key}")
} else {
// do nothing
}
}
gcMetrics
}
}

case object OnHeapExecutionMemory extends MemoryManagerExecutorMetricType(
_.onHeapExecutionMemoryUsed)

Expand Down Expand Up @@ -137,7 +197,8 @@ private[spark] object ExecutorMetricType {
OffHeapUnifiedMemory,
DirectPoolMemory,
MappedPoolMemory,
ProcessTreeMetrics
ProcessTreeMetrics,
GarbageCollectionMetrics
)


Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,22 @@
[ {
"id": "application_1536831636016_59384",
"name": "Spark Pi",
"attempts": [
{
"attemptId": "1",
"startTime": "2019-01-08T04:33:43.607GMT",
"endTime": "2019-01-08T04:33:58.745GMT",
"lastUpdated": "",
"duration": 15138,
"sparkUser": "lajin",
"completed": true,
"appSparkVersion": "3.0.0-SNAPSHOT",
"lastUpdatedEpoch": 0,
"startTimeEpoch": 1546922023607,
"endTimeEpoch": 1546922038745
}
]
}, {
"id" : "application_1538416563558_0014",
"name" : "PythonBisectingKMeansExample",
"attempts" : [ {
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,22 @@
[ {
"id": "application_1536831636016_59384",
"name": "Spark Pi",
"attempts": [
{
"attemptId": "1",
"startTime": "2019-01-08T04:33:43.607GMT",
"endTime": "2019-01-08T04:33:58.745GMT",
"lastUpdated": "",
"duration": 15138,
"sparkUser": "lajin",
"completed": true,
"appSparkVersion": "3.0.0-SNAPSHOT",
"lastUpdatedEpoch": 0,
"startTimeEpoch": 1546922023607,
"endTimeEpoch": 1546922038745
}
]
}, {
"id" : "application_1538416563558_0014",
"name" : "PythonBisectingKMeansExample",
"attempts" : [ {
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
[ {
"id" : "driver",
"hostPort" : "047.company.com:42509",
"isActive" : true,
"rddBlocks" : 0,
"memoryUsed" : 0,
"diskUsed" : 0,
"totalCores" : 0,
"maxTasks" : 0,
"activeTasks" : 0,
"failedTasks" : 0,
"completedTasks" : 0,
"totalTasks" : 0,
"totalDuration" : 0,
"totalGCTime" : 0,
"totalInputBytes" : 0,
"totalShuffleRead" : 0,
"totalShuffleWrite" : 0,
"isBlacklisted" : false,
"maxMemory" : 100977868,
"addTime" : "2019-01-08T04:33:44.502GMT",
"executorLogs" : {
"stdout" : "https://047.company.com:50060/node/containerlogs/container_e136_1536831636016_59384_01_000001/lajin/stdout?start=-4096",
"stderr" : "https://047.company.com:50060/node/containerlogs/container_e136_1536831636016_59384_01_000001/lajin/stderr?start=-4096"
},
"memoryMetrics" : {
"usedOnHeapStorageMemory" : 0,
"usedOffHeapStorageMemory" : 0,
"totalOnHeapStorageMemory" : 100977868,
"totalOffHeapStorageMemory" : 0
},
"blacklistedInStages" : [ ],
"peakMemoryMetrics" : {
"JVMHeapMemory" : 211171816,
"JVMOffHeapMemory" : 90237256,
"OnHeapExecutionMemory" : 0,
"OffHeapExecutionMemory" : 0,
"OnHeapStorageMemory" : 4876,
"OffHeapStorageMemory" : 0,
"OnHeapUnifiedMemory" : 4876,
"OffHeapUnifiedMemory" : 0,
"DirectPoolMemory" : 806275,
"MappedPoolMemory" : 0,
"ProcessTreeJVMVMemory" : 2646888448,
"ProcessTreeJVMRSSMemory" : 520900608,
"ProcessTreePythonVMemory" : 0,
"ProcessTreePythonRSSMemory" : 0,
"ProcessTreeOtherVMemory" : 0,
"ProcessTreeOtherRSSMemory" : 0,
"MinorGCCount" : 8,
"MinorGCTime" : 374,
"MajorGCCount" : 3,
"MajorGCTime" : 170
},
"attributes" : { }
}, {
"id" : "2",
"hostPort" : "028.company.com:46325",
"isActive" : true,
"rddBlocks" : 0,
"memoryUsed" : 0,
"diskUsed" : 0,
"totalCores" : 4,
"maxTasks" : 4,
"activeTasks" : 0,
"failedTasks" : 0,
"completedTasks" : 52,
"totalTasks" : 52,
"totalDuration" : 8879,
"totalGCTime" : 420,
"totalInputBytes" : 0,
"totalShuffleRead" : 0,
"totalShuffleWrite" : 0,
"isBlacklisted" : false,
"maxMemory" : 97832140,
"addTime" : "2019-01-08T04:33:54.270GMT",
"executorLogs" : {
"stdout" : "https://028.company.com:50060/node/containerlogs/container_e136_1536831636016_59384_01_000003/lajin/stdout?start=-4096",
"stderr" : "https://028.company.com:50060/node/containerlogs/container_e136_1536831636016_59384_01_000003/lajin/stderr?start=-4096"
},
"memoryMetrics" : {
"usedOnHeapStorageMemory" : 0,
"usedOffHeapStorageMemory" : 0,
"totalOnHeapStorageMemory" : 97832140,
"totalOffHeapStorageMemory" : 0
},
"blacklistedInStages" : [ ],
"attributes" : { }
}, {
"id" : "1",
"hostPort" : "036.company.com:35126",
"isActive" : true,
"rddBlocks" : 0,
"memoryUsed" : 0,
"diskUsed" : 0,
"totalCores" : 4,
"maxTasks" : 4,
"activeTasks" : 0,
"failedTasks" : 0,
"completedTasks" : 48,
"totalTasks" : 48,
"totalDuration" : 8837,
"totalGCTime" : 1192,
"totalInputBytes" : 0,
"totalShuffleRead" : 0,
"totalShuffleWrite" : 0,
"isBlacklisted" : false,
"maxMemory" : 97832140,
"addTime" : "2019-01-08T04:33:55.929GMT",
"executorLogs" : {
"stdout" : "https://036.company.com:50060/node/containerlogs/container_e136_1536831636016_59384_01_000002/lajin/stdout?start=-4096",
"stderr" : "https://036.company.com:50060/node/containerlogs/container_e136_1536831636016_59384_01_000002/lajin/stderr?start=-4096"
},
"memoryMetrics" : {
"usedOnHeapStorageMemory" : 0,
"usedOffHeapStorageMemory" : 0,
"totalOnHeapStorageMemory" : 97832140,
"totalOffHeapStorageMemory" : 0
},
"blacklistedInStages" : [ ],
"attributes" : { }
} ]
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,11 @@
"ProcessTreePythonVMemory": 0,
"ProcessTreePythonRSSMemory": 0,
"ProcessTreeOtherVMemory": 0,
"ProcessTreeOtherRSSMemory": 0
"ProcessTreeOtherRSSMemory": 0,
"MinorGCCount": 0,
"MinorGCTime": 0,
"MajorGCCount": 0,
"MajorGCTime": 0
},
"attributes" : { }
}, {
Expand Down Expand Up @@ -193,7 +197,11 @@
"ProcessTreePythonVMemory": 0,
"ProcessTreePythonRSSMemory": 0,
"ProcessTreeOtherVMemory": 0,
"ProcessTreeOtherRSSMemory": 0
"ProcessTreeOtherRSSMemory": 0,
"MinorGCCount": 0,
"MinorGCTime": 0,
"MajorGCCount": 0,
"MajorGCTime": 0
},
"attributes" : { }
}, {
Expand Down Expand Up @@ -244,7 +252,11 @@
"ProcessTreePythonVMemory": 0,
"ProcessTreePythonRSSMemory": 0,
"ProcessTreeOtherVMemory": 0,
"ProcessTreeOtherRSSMemory": 0
"ProcessTreeOtherRSSMemory": 0,
"MinorGCCount": 0,
"MinorGCTime": 0,
"MajorGCCount": 0,
"MajorGCTime": 0
},
"attributes" : { }
}, {
Expand Down Expand Up @@ -295,7 +307,11 @@
"ProcessTreePythonVMemory": 0,
"ProcessTreePythonRSSMemory": 0,
"ProcessTreeOtherVMemory": 0,
"ProcessTreeOtherRSSMemory": 0
"ProcessTreeOtherRSSMemory": 0,
"MinorGCCount": 0,
"MinorGCTime": 0,
"MajorGCCount": 0,
"MajorGCTime": 0
},
"attributes" : { }
}, {
Expand Down Expand Up @@ -346,7 +362,11 @@
"ProcessTreePythonVMemory": 0,
"ProcessTreePythonRSSMemory": 0,
"ProcessTreeOtherVMemory": 0,
"ProcessTreeOtherRSSMemory": 0
"ProcessTreeOtherRSSMemory": 0,
"MinorGCCount": 0,
"MinorGCTime": 0,
"MajorGCCount": 0,
"MajorGCTime": 0
},
"attributes" : { }
} ]
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,11 @@
"ProcessTreePythonVMemory" : 408375296,
"ProcessTreePythonRSSMemory" : 40284160,
"ProcessTreeOtherVMemory" : 0,
"ProcessTreeOtherRSSMemory" : 0
"ProcessTreeOtherRSSMemory" : 0,
"MinorGCCount": 0,
"MinorGCTime": 0,
"MajorGCCount": 0,
"MajorGCTime": 0
},
"attributes" : { }
}, {
Expand Down Expand Up @@ -94,7 +98,11 @@
"ProcessTreePythonVMemory" : 625926144,
"ProcessTreePythonRSSMemory" : 69013504,
"ProcessTreeOtherVMemory" : 0,
"ProcessTreeOtherRSSMemory" : 0
"ProcessTreeOtherRSSMemory" : 0,
"MinorGCCount": 0,
"MinorGCTime": 0,
"MajorGCCount": 0,
"MajorGCTime": 0
},
"attributes" : { }
} ]
Loading