Skip to content

Commit 36edbac

Browse files
rxin
authored and gatorsmile committed
[SPARK-26226][SQL] Update query tracker to report timeline for phases
## What changes were proposed in this pull request? This patch changes the query plan tracker added earlier to report phase timeline, rather than just a duration for each phase. This way, we can easily find time that's unaccounted for. ## How was this patch tested? Updated test cases to reflect that. Closes #23183 from rxin/SPARK-26226. Authored-by: Reynold Xin <rxin@databricks.com> Signed-off-by: gatorsmile <gatorsmile@gmail.com>
1 parent 9b23be2 commit 36edbac

File tree

5 files changed

+55
-33
lines changed

5 files changed

+55
-33
lines changed

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/QueryPlanningTracker.scala

Lines changed: 37 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,13 @@ object QueryPlanningTracker {
4141
val OPTIMIZATION = "optimization"
4242
val PLANNING = "planning"
4343

44+
/**
45+
* Summary for a rule.
46+
* @param totalTimeNs total amount of time, in nanosecs, spent in this rule.
47+
* @param numInvocations number of times the rule has been invoked.
48+
* @param numEffectiveInvocations number of times the rule has been invoked and
49+
* resulted in a plan change.
50+
*/
4451
class RuleSummary(
4552
var totalTimeNs: Long, var numInvocations: Long, var numEffectiveInvocations: Long) {
4653

@@ -51,6 +58,18 @@ object QueryPlanningTracker {
5158
}
5259
}
5360

/**
 * Summary of a phase, with start time and end time so we can construct a timeline.
 */
class PhaseSummary(val startTimeMs: Long, val endTimeMs: Long) {

  /** Wall-clock duration of this phase, in milliseconds. */
  def durationMs: Long = endTimeMs - startTimeMs

  override def toString: String = s"PhaseSummary($startTimeMs, $endTimeMs)"
}
5473
/**
5574
* A thread local variable to implicitly pass the tracker around. This assumes the query planner
5675
* is single-threaded, and avoids passing the same tracker context in every function call.
@@ -79,15 +98,25 @@ class QueryPlanningTracker {
7998
// Use a Java HashMap for less overhead.
8099
private val rulesMap = new java.util.HashMap[String, RuleSummary]
81100

82-
// From a phase to time in ns.
83-
private val phaseToTimeNs = new java.util.HashMap[String, Long]
101+
// From a phase to its start time and end time, in ms.
102+
private val phasesMap = new java.util.HashMap[String, PhaseSummary]
84103

85-
/** Measure the runtime of function f, and add it to the time for the specified phase. */
86-
def measureTime[T](phase: String)(f: => T): T = {
87-
val startTime = System.nanoTime()
/**
 * Measure the start and end time of a phase. Note that if this function is called multiple
 * times for the same phase, the recorded start time will be the start time of the first call,
 * and the recorded end time will be the end time of the last call.
 */
def measurePhase[T](phase: String)(f: => T): T = {
  val startTime = System.currentTimeMillis()
  val ret = f
  val endTime = System.currentTimeMillis()

  // If the phase was measured before, keep its original start time and only
  // extend the end time; otherwise record a brand new summary.
  val previous = phasesMap.get(phase)
  val recordedStart = if (previous != null) previous.startTimeMs else startTime
  phasesMap.put(phase, new PhaseSummary(recordedStart, endTime))
  ret
}
93122

@@ -114,7 +143,7 @@ class QueryPlanningTracker {
114143

115144
def rules: Map[String, RuleSummary] = rulesMap.asScala.toMap
116145

117-
def phases: Map[String, Long] = phaseToTimeNs.asScala.toMap
146+
def phases: Map[String, PhaseSummary] = phasesMap.asScala.toMap
118147

119148
/**
120149
* Returns the top k most expensive rules (as measured by time). If k is larger than the rules

sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/QueryPlanningTrackerSuite.scala

Lines changed: 11 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -23,19 +23,23 @@ class QueryPlanningTrackerSuite extends SparkFunSuite {
2323

2424
test("phases") {
2525
val t = new QueryPlanningTracker
26-
t.measureTime("p1") {
26+
t.measurePhase("p1") {
2727
Thread.sleep(1)
2828
}
2929

30-
assert(t.phases("p1") > 0)
30+
assert(t.phases("p1").durationMs > 0)
3131
assert(!t.phases.contains("p2"))
32+
}
3233

33-
val old = t.phases("p1")
34+
test("multiple measurePhase call") {
35+
val t = new QueryPlanningTracker
36+
t.measurePhase("p1") { Thread.sleep(1) }
37+
val s1 = t.phases("p1")
38+
assert(s1.durationMs > 0)
3439

35-
t.measureTime("p1") {
36-
Thread.sleep(1)
37-
}
38-
assert(t.phases("p1") > old)
40+
t.measurePhase("p1") { Thread.sleep(1) }
41+
val s2 = t.phases("p1")
42+
assert(s2.durationMs > s1.durationMs)
3943
}
4044

4145
test("rules") {

sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -649,7 +649,7 @@ class SparkSession private(
649649
*/
650650
def sql(sqlText: String): DataFrame = {
651651
val tracker = new QueryPlanningTracker
652-
val plan = tracker.measureTime(QueryPlanningTracker.PARSING) {
652+
val plan = tracker.measurePhase(QueryPlanningTracker.PARSING) {
653653
sessionState.sqlParser.parsePlan(sqlText)
654654
}
655655
Dataset.ofRows(self, plan, tracker)

sql/core/src/main/scala/org/apache/spark/sql/execution/QueryExecution.scala

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,7 @@ class QueryExecution(
6060
}
6161
}
6262

63-
lazy val analyzed: LogicalPlan = tracker.measureTime(QueryPlanningTracker.ANALYSIS) {
63+
lazy val analyzed: LogicalPlan = tracker.measurePhase(QueryPlanningTracker.ANALYSIS) {
6464
SparkSession.setActiveSession(sparkSession)
6565
sparkSession.sessionState.analyzer.executeAndCheck(logical, tracker)
6666
}
@@ -71,11 +71,11 @@ class QueryExecution(
7171
sparkSession.sharedState.cacheManager.useCachedData(analyzed)
7272
}
7373

74-
lazy val optimizedPlan: LogicalPlan = tracker.measureTime(QueryPlanningTracker.OPTIMIZATION) {
74+
lazy val optimizedPlan: LogicalPlan = tracker.measurePhase(QueryPlanningTracker.OPTIMIZATION) {
7575
sparkSession.sessionState.optimizer.executeAndTrack(withCachedData, tracker)
7676
}
7777

78-
lazy val sparkPlan: SparkPlan = tracker.measureTime(QueryPlanningTracker.PLANNING) {
78+
lazy val sparkPlan: SparkPlan = tracker.measurePhase(QueryPlanningTracker.PLANNING) {
7979
SparkSession.setActiveSession(sparkSession)
8080
// TODO: We use next(), i.e. take the first plan returned by the planner, here for now,
8181
// but we will implement to choose the best plan.
@@ -84,7 +84,7 @@ class QueryExecution(
8484

8585
// executedPlan should not be used to initialize any SparkPlan. It should be
8686
// only used for execution.
87-
lazy val executedPlan: SparkPlan = tracker.measureTime(QueryPlanningTracker.PLANNING) {
87+
lazy val executedPlan: SparkPlan = tracker.measurePhase(QueryPlanningTracker.PLANNING) {
8888
prepareForExecution(sparkPlan)
8989
}
9090

sql/core/src/test/scala/org/apache/spark/sql/execution/QueryPlanningTrackerEndToEndSuite.scala

Lines changed: 2 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -25,12 +25,7 @@ class QueryPlanningTrackerEndToEndSuite extends SharedSQLContext {
2525
val df = spark.range(1000).selectExpr("count(*)")
2626
df.collect()
2727
val tracker = df.queryExecution.tracker
28-
29-
assert(tracker.phases.size == 3)
30-
assert(tracker.phases("analysis") > 0)
31-
assert(tracker.phases("optimization") > 0)
32-
assert(tracker.phases("planning") > 0)
33-
28+
assert(tracker.phases.keySet == Set("analysis", "optimization", "planning"))
3429
assert(tracker.rules.nonEmpty)
3530
}
3631

@@ -39,13 +34,7 @@ class QueryPlanningTrackerEndToEndSuite extends SharedSQLContext {
3934
df.collect()
4035

4136
val tracker = df.queryExecution.tracker
42-
43-
assert(tracker.phases.size == 4)
44-
assert(tracker.phases("parsing") > 0)
45-
assert(tracker.phases("analysis") > 0)
46-
assert(tracker.phases("optimization") > 0)
47-
assert(tracker.phases("planning") > 0)
48-
37+
assert(tracker.phases.keySet == Set("parsing", "analysis", "optimization", "planning"))
4938
assert(tracker.rules.nonEmpty)
5039
}
5140

0 commit comments

Comments
 (0)