Skip to content

Commit e5c502c

Browse files
LantaoJin authored and squito committed
[SPARK-25865][CORE] Add GC information to ExecutorMetrics
## What changes were proposed in this pull request? Memory usage alone, without GC information, is not enough to determine the proper memory settings. We need GC metrics about the frequency of major and minor GCs. For example, consider two cases where the configured executor memory is 10GB and usage is near 10GB in both. Should we increase or decrease the configured memory for them? These metrics may be helpful: we can increase the configured memory for the first one if it has very frequent major GCs, and decrease it for the second one if it has only some minor GCs and no major GC. GC metrics are only useful over the entire lifetime of executors rather than for separate stages. ## How was this patch tested? Added unit tests. Closes #22874 from LantaoJin/SPARK-25865. Authored-by: LantaoJin <jinlantao@gmail.com> Signed-off-by: Imran Rashid <irashid@cloudera.com>
1 parent caceaec commit e5c502c

15 files changed

+576
-46
lines changed

core/src/main/scala/org/apache/spark/internal/config/package.scala

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ package org.apache.spark.internal
2020
import java.util.concurrent.TimeUnit
2121

2222
import org.apache.spark.launcher.SparkLauncher
23+
import org.apache.spark.metrics.GarbageCollectionMetrics
2324
import org.apache.spark.network.util.ByteUnit
2425
import org.apache.spark.scheduler.{EventLoggingListener, SchedulingMode}
2526
import org.apache.spark.storage.{DefaultTopologyMapper, RandomBlockReplicationPolicy}
@@ -114,6 +115,24 @@ package object config {
114115
.booleanConf
115116
.createWithDefault(false)
116117

118+
/**
 * Configuration entry `spark.eventLog.gcMetrics.youngGenerationGarbageCollectors`: a sequence
 * of garbage collector names (as returned by `GarbageCollectorMXBean.getName`) to be treated as
 * young generation collectors. Defaults to
 * `GarbageCollectionMetrics.YOUNG_GENERATION_BUILTIN_GARBAGE_COLLECTORS`.
 */
private[spark] val EVENT_LOG_GC_METRICS_YOUNG_GENERATION_GARBAGE_COLLECTORS =
  ConfigBuilder("spark.eventLog.gcMetrics.youngGenerationGarbageCollectors")
    // The second fragment must not start with a space: the first one already ends with one,
    // and a leading space here would put a doubled space into the rendered documentation.
    .doc("Names of supported young generation garbage collector. A name usually is " +
      "the return of GarbageCollectorMXBean.getName. The built-in young generation garbage " +
      s"collectors are ${GarbageCollectionMetrics.YOUNG_GENERATION_BUILTIN_GARBAGE_COLLECTORS}")
    .stringConf
    .toSequence
    .createWithDefault(GarbageCollectionMetrics.YOUNG_GENERATION_BUILTIN_GARBAGE_COLLECTORS)
126+
127+
/**
 * Configuration entry `spark.eventLog.gcMetrics.oldGenerationGarbageCollectors`: a sequence
 * of garbage collector names (as returned by `GarbageCollectorMXBean.getName`) to be treated as
 * old generation collectors. Defaults to
 * `GarbageCollectionMetrics.OLD_GENERATION_BUILTIN_GARBAGE_COLLECTORS`.
 */
private[spark] val EVENT_LOG_GC_METRICS_OLD_GENERATION_GARBAGE_COLLECTORS =
  ConfigBuilder("spark.eventLog.gcMetrics.oldGenerationGarbageCollectors")
    .doc(
      "Names of supported old generation garbage collector. " +
        "A name usually is the return of GarbageCollectorMXBean.getName. " +
        "The built-in old generation garbage collectors are " +
        s"${GarbageCollectionMetrics.OLD_GENERATION_BUILTIN_GARBAGE_COLLECTORS}")
    .stringConf
    .toSequence
    .createWithDefault(GarbageCollectionMetrics.OLD_GENERATION_BUILTIN_GARBAGE_COLLECTORS)
135+
117136
/** Boolean configuration entry for `spark.eventLog.overwrite`; defaults to `false`. */
private[spark] val EVENT_LOG_OVERWRITE =
  ConfigBuilder("spark.eventLog.overwrite")
    .booleanConf
    .createWithDefault(false)
119138

core/src/main/scala/org/apache/spark/metrics/ExecutorMetricType.scala

Lines changed: 62 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,9 +19,12 @@ package org.apache.spark.metrics
1919
import java.lang.management.{BufferPoolMXBean, ManagementFactory}
2020
import javax.management.ObjectName
2121

22+
import scala.collection.JavaConverters._
2223
import scala.collection.mutable
2324

25+
import org.apache.spark.SparkEnv
2426
import org.apache.spark.executor.ProcfsMetricsGetter
27+
import org.apache.spark.internal.{config, Logging}
2528
import org.apache.spark.memory.MemoryManager
2629

2730
/**
@@ -99,6 +102,63 @@ case object ProcessTreeMetrics extends ExecutorMetricType {
99102
}
100103
}
101104

105+
/**
 * Executor metric type that reports garbage collection counts and times, split into minor
 * (young generation) and major (old generation) collections.
 *
 * Each `GarbageCollectorMXBean` of the running JVM is classified by its name, using the name
 * lists configured through `spark.eventLog.gcMetrics.youngGenerationGarbageCollectors` and
 * `spark.eventLog.gcMetrics.oldGenerationGarbageCollectors`. Collectors whose name appears in
 * neither list are not reported; a warning is logged the first time each one is seen.
 */
case object GarbageCollectionMetrics extends ExecutorMetricType with Logging {
  // Names of collectors already seen that match neither configured list. Tracked so the
  // warning in getMetricValues is logged once per unknown collector, not on every poll.
  private var nonBuiltInCollectors: Seq[String] = Nil

  // Order matters: getMetricValues fills its result array by these positions.
  override val names = Seq(
    "MinorGCCount",
    "MinorGCTime",
    "MajorGCCount",
    "MajorGCTime"
  )

  /* Common collectors that are built in, categorized as young generation or old generation. */
  private[spark] val YOUNG_GENERATION_BUILTIN_GARBAGE_COLLECTORS = Seq(
    "Copy",
    "PS Scavenge",
    "ParNew",
    "G1 Young Generation"
  )

  private[spark] val OLD_GENERATION_BUILTIN_GARBAGE_COLLECTORS = Seq(
    "MarkSweepCompact",
    "PS MarkSweep",
    "ConcurrentMarkSweep",
    "G1 Old Generation"
  )

  // Configured young generation collector names, read from the SparkEnv conf on first access.
  private lazy val youngGenerationGarbageCollector: Seq[String] = {
    SparkEnv.get.conf.get(config.EVENT_LOG_GC_METRICS_YOUNG_GENERATION_GARBAGE_COLLECTORS)
  }

  // Configured old generation collector names, read from the SparkEnv conf on first access.
  private lazy val oldGenerationGarbageCollector: Seq[String] = {
    SparkEnv.get.conf.get(config.EVENT_LOG_GC_METRICS_OLD_GENERATION_GARBAGE_COLLECTORS)
  }

  /**
   * Collects the current GC counters from the JVM's garbage collector MXBeans.
   *
   * @param memoryManager not used by this metric type
   * @return an array aligned with `names`: minor GC count, minor GC time, major GC count,
   *         major GC time
   */
  override private[spark] def getMetricValues(memoryManager: MemoryManager): Array[Long] = {
    val gcMetrics = new Array[Long](names.length) // minorCount, minorTime, majorCount, majorTime
    ManagementFactory.getGarbageCollectorMXBeans.asScala.foreach { mxBean =>
      val collectorName = mxBean.getName
      // Accumulate rather than assign: the configured lists may name several collectors, and
      // if more than one bean matches the same generation they should all be counted instead
      // of the last one overwriting the others. With a single match this equals assignment,
      // because the array starts out zeroed.
      if (youngGenerationGarbageCollector.contains(collectorName)) {
        gcMetrics(0) += mxBean.getCollectionCount
        gcMetrics(1) += mxBean.getCollectionTime
      } else if (oldGenerationGarbageCollector.contains(collectorName)) {
        gcMetrics(2) += mxBean.getCollectionCount
        gcMetrics(3) += mxBean.getCollectionTime
      } else if (!nonBuiltInCollectors.contains(collectorName)) {
        nonBuiltInCollectors = collectorName +: nonBuiltInCollectors
        // log it when first seen
        logWarning(s"To enable non-built-in garbage collector(s) " +
          s"$nonBuiltInCollectors, users should configure it(them) to " +
          s"${config.EVENT_LOG_GC_METRICS_YOUNG_GENERATION_GARBAGE_COLLECTORS.key} or " +
          s"${config.EVENT_LOG_GC_METRICS_OLD_GENERATION_GARBAGE_COLLECTORS.key}")
      }
    }
    gcMetrics
  }
}
161+
102162
/**
 * Metric type defined by passing the memory manager's `onHeapExecutionMemoryUsed` accessor
 * to `MemoryManagerExecutorMetricType`.
 */
case object OnHeapExecutionMemory extends MemoryManagerExecutorMetricType(
  memoryManager => memoryManager.onHeapExecutionMemoryUsed)
104164

@@ -137,7 +197,8 @@ private[spark] object ExecutorMetricType {
137197
OffHeapUnifiedMemory,
138198
DirectPoolMemory,
139199
MappedPoolMemory,
140-
ProcessTreeMetrics
200+
ProcessTreeMetrics,
201+
GarbageCollectionMetrics
141202
)
142203

143204

core/src/test/resources/HistoryServerExpectations/application_list_json_expectation.json

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,22 @@
11
[ {
2+
"id": "application_1536831636016_59384",
3+
"name": "Spark Pi",
4+
"attempts": [
5+
{
6+
"attemptId": "1",
7+
"startTime": "2019-01-08T04:33:43.607GMT",
8+
"endTime": "2019-01-08T04:33:58.745GMT",
9+
"lastUpdated": "",
10+
"duration": 15138,
11+
"sparkUser": "lajin",
12+
"completed": true,
13+
"appSparkVersion": "3.0.0-SNAPSHOT",
14+
"lastUpdatedEpoch": 0,
15+
"startTimeEpoch": 1546922023607,
16+
"endTimeEpoch": 1546922038745
17+
}
18+
]
19+
}, {
220
"id" : "application_1538416563558_0014",
321
"name" : "PythonBisectingKMeansExample",
422
"attempts" : [ {

core/src/test/resources/HistoryServerExpectations/completed_app_list_json_expectation.json

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,22 @@
11
[ {
2+
"id": "application_1536831636016_59384",
3+
"name": "Spark Pi",
4+
"attempts": [
5+
{
6+
"attemptId": "1",
7+
"startTime": "2019-01-08T04:33:43.607GMT",
8+
"endTime": "2019-01-08T04:33:58.745GMT",
9+
"lastUpdated": "",
10+
"duration": 15138,
11+
"sparkUser": "lajin",
12+
"completed": true,
13+
"appSparkVersion": "3.0.0-SNAPSHOT",
14+
"lastUpdatedEpoch": 0,
15+
"startTimeEpoch": 1546922023607,
16+
"endTimeEpoch": 1546922038745
17+
}
18+
]
19+
}, {
220
"id" : "application_1538416563558_0014",
321
"name" : "PythonBisectingKMeansExample",
422
"attempts" : [ {
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,122 @@
1+
[ {
2+
"id" : "driver",
3+
"hostPort" : "047.company.com:42509",
4+
"isActive" : true,
5+
"rddBlocks" : 0,
6+
"memoryUsed" : 0,
7+
"diskUsed" : 0,
8+
"totalCores" : 0,
9+
"maxTasks" : 0,
10+
"activeTasks" : 0,
11+
"failedTasks" : 0,
12+
"completedTasks" : 0,
13+
"totalTasks" : 0,
14+
"totalDuration" : 0,
15+
"totalGCTime" : 0,
16+
"totalInputBytes" : 0,
17+
"totalShuffleRead" : 0,
18+
"totalShuffleWrite" : 0,
19+
"isBlacklisted" : false,
20+
"maxMemory" : 100977868,
21+
"addTime" : "2019-01-08T04:33:44.502GMT",
22+
"executorLogs" : {
23+
"stdout" : "https://047.company.com:50060/node/containerlogs/container_e136_1536831636016_59384_01_000001/lajin/stdout?start=-4096",
24+
"stderr" : "https://047.company.com:50060/node/containerlogs/container_e136_1536831636016_59384_01_000001/lajin/stderr?start=-4096"
25+
},
26+
"memoryMetrics" : {
27+
"usedOnHeapStorageMemory" : 0,
28+
"usedOffHeapStorageMemory" : 0,
29+
"totalOnHeapStorageMemory" : 100977868,
30+
"totalOffHeapStorageMemory" : 0
31+
},
32+
"blacklistedInStages" : [ ],
33+
"peakMemoryMetrics" : {
34+
"JVMHeapMemory" : 211171816,
35+
"JVMOffHeapMemory" : 90237256,
36+
"OnHeapExecutionMemory" : 0,
37+
"OffHeapExecutionMemory" : 0,
38+
"OnHeapStorageMemory" : 4876,
39+
"OffHeapStorageMemory" : 0,
40+
"OnHeapUnifiedMemory" : 4876,
41+
"OffHeapUnifiedMemory" : 0,
42+
"DirectPoolMemory" : 806275,
43+
"MappedPoolMemory" : 0,
44+
"ProcessTreeJVMVMemory" : 2646888448,
45+
"ProcessTreeJVMRSSMemory" : 520900608,
46+
"ProcessTreePythonVMemory" : 0,
47+
"ProcessTreePythonRSSMemory" : 0,
48+
"ProcessTreeOtherVMemory" : 0,
49+
"ProcessTreeOtherRSSMemory" : 0,
50+
"MinorGCCount" : 8,
51+
"MinorGCTime" : 374,
52+
"MajorGCCount" : 3,
53+
"MajorGCTime" : 170
54+
},
55+
"attributes" : { }
56+
}, {
57+
"id" : "2",
58+
"hostPort" : "028.company.com:46325",
59+
"isActive" : true,
60+
"rddBlocks" : 0,
61+
"memoryUsed" : 0,
62+
"diskUsed" : 0,
63+
"totalCores" : 4,
64+
"maxTasks" : 4,
65+
"activeTasks" : 0,
66+
"failedTasks" : 0,
67+
"completedTasks" : 52,
68+
"totalTasks" : 52,
69+
"totalDuration" : 8879,
70+
"totalGCTime" : 420,
71+
"totalInputBytes" : 0,
72+
"totalShuffleRead" : 0,
73+
"totalShuffleWrite" : 0,
74+
"isBlacklisted" : false,
75+
"maxMemory" : 97832140,
76+
"addTime" : "2019-01-08T04:33:54.270GMT",
77+
"executorLogs" : {
78+
"stdout" : "https://028.company.com:50060/node/containerlogs/container_e136_1536831636016_59384_01_000003/lajin/stdout?start=-4096",
79+
"stderr" : "https://028.company.com:50060/node/containerlogs/container_e136_1536831636016_59384_01_000003/lajin/stderr?start=-4096"
80+
},
81+
"memoryMetrics" : {
82+
"usedOnHeapStorageMemory" : 0,
83+
"usedOffHeapStorageMemory" : 0,
84+
"totalOnHeapStorageMemory" : 97832140,
85+
"totalOffHeapStorageMemory" : 0
86+
},
87+
"blacklistedInStages" : [ ],
88+
"attributes" : { }
89+
}, {
90+
"id" : "1",
91+
"hostPort" : "036.company.com:35126",
92+
"isActive" : true,
93+
"rddBlocks" : 0,
94+
"memoryUsed" : 0,
95+
"diskUsed" : 0,
96+
"totalCores" : 4,
97+
"maxTasks" : 4,
98+
"activeTasks" : 0,
99+
"failedTasks" : 0,
100+
"completedTasks" : 48,
101+
"totalTasks" : 48,
102+
"totalDuration" : 8837,
103+
"totalGCTime" : 1192,
104+
"totalInputBytes" : 0,
105+
"totalShuffleRead" : 0,
106+
"totalShuffleWrite" : 0,
107+
"isBlacklisted" : false,
108+
"maxMemory" : 97832140,
109+
"addTime" : "2019-01-08T04:33:55.929GMT",
110+
"executorLogs" : {
111+
"stdout" : "https://036.company.com:50060/node/containerlogs/container_e136_1536831636016_59384_01_000002/lajin/stdout?start=-4096",
112+
"stderr" : "https://036.company.com:50060/node/containerlogs/container_e136_1536831636016_59384_01_000002/lajin/stderr?start=-4096"
113+
},
114+
"memoryMetrics" : {
115+
"usedOnHeapStorageMemory" : 0,
116+
"usedOffHeapStorageMemory" : 0,
117+
"totalOnHeapStorageMemory" : 97832140,
118+
"totalOffHeapStorageMemory" : 0
119+
},
120+
"blacklistedInStages" : [ ],
121+
"attributes" : { }
122+
} ]

core/src/test/resources/HistoryServerExpectations/executor_list_with_executor_metrics_json_expectation.json

Lines changed: 25 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,11 @@
4343
"ProcessTreePythonVMemory": 0,
4444
"ProcessTreePythonRSSMemory": 0,
4545
"ProcessTreeOtherVMemory": 0,
46-
"ProcessTreeOtherRSSMemory": 0
46+
"ProcessTreeOtherRSSMemory": 0,
47+
"MinorGCCount": 0,
48+
"MinorGCTime": 0,
49+
"MajorGCCount": 0,
50+
"MajorGCTime": 0
4751
},
4852
"attributes" : { }
4953
}, {
@@ -193,7 +197,11 @@
193197
"ProcessTreePythonVMemory": 0,
194198
"ProcessTreePythonRSSMemory": 0,
195199
"ProcessTreeOtherVMemory": 0,
196-
"ProcessTreeOtherRSSMemory": 0
200+
"ProcessTreeOtherRSSMemory": 0,
201+
"MinorGCCount": 0,
202+
"MinorGCTime": 0,
203+
"MajorGCCount": 0,
204+
"MajorGCTime": 0
197205
},
198206
"attributes" : { }
199207
}, {
@@ -244,7 +252,11 @@
244252
"ProcessTreePythonVMemory": 0,
245253
"ProcessTreePythonRSSMemory": 0,
246254
"ProcessTreeOtherVMemory": 0,
247-
"ProcessTreeOtherRSSMemory": 0
255+
"ProcessTreeOtherRSSMemory": 0,
256+
"MinorGCCount": 0,
257+
"MinorGCTime": 0,
258+
"MajorGCCount": 0,
259+
"MajorGCTime": 0
248260
},
249261
"attributes" : { }
250262
}, {
@@ -295,7 +307,11 @@
295307
"ProcessTreePythonVMemory": 0,
296308
"ProcessTreePythonRSSMemory": 0,
297309
"ProcessTreeOtherVMemory": 0,
298-
"ProcessTreeOtherRSSMemory": 0
310+
"ProcessTreeOtherRSSMemory": 0,
311+
"MinorGCCount": 0,
312+
"MinorGCTime": 0,
313+
"MajorGCCount": 0,
314+
"MajorGCTime": 0
299315
},
300316
"attributes" : { }
301317
}, {
@@ -346,7 +362,11 @@
346362
"ProcessTreePythonVMemory": 0,
347363
"ProcessTreePythonRSSMemory": 0,
348364
"ProcessTreeOtherVMemory": 0,
349-
"ProcessTreeOtherRSSMemory": 0
365+
"ProcessTreeOtherRSSMemory": 0,
366+
"MinorGCCount": 0,
367+
"MinorGCTime": 0,
368+
"MajorGCCount": 0,
369+
"MajorGCTime": 0
350370
},
351371
"attributes" : { }
352372
} ]

core/src/test/resources/HistoryServerExpectations/executor_list_with_executor_process_tree_metrics_json_expectation.json

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,11 @@
4343
"ProcessTreePythonVMemory" : 408375296,
4444
"ProcessTreePythonRSSMemory" : 40284160,
4545
"ProcessTreeOtherVMemory" : 0,
46-
"ProcessTreeOtherRSSMemory" : 0
46+
"ProcessTreeOtherRSSMemory" : 0,
47+
"MinorGCCount": 0,
48+
"MinorGCTime": 0,
49+
"MajorGCCount": 0,
50+
"MajorGCTime": 0
4751
},
4852
"attributes" : { }
4953
}, {
@@ -94,7 +98,11 @@
9498
"ProcessTreePythonVMemory" : 625926144,
9599
"ProcessTreePythonRSSMemory" : 69013504,
96100
"ProcessTreeOtherVMemory" : 0,
97-
"ProcessTreeOtherRSSMemory" : 0
101+
"ProcessTreeOtherRSSMemory" : 0,
102+
"MinorGCCount": 0,
103+
"MinorGCTime": 0,
104+
"MajorGCCount": 0,
105+
"MajorGCTime": 0
98106
},
99107
"attributes" : { }
100108
} ]

0 commit comments

Comments
 (0)