 
 package org.apache.spark.scheduler.cluster.mesos
 
-import com.codahale.metrics.{Gauge, MetricRegistry}
+import java.util.concurrent.TimeUnit
+import java.util.Date
 
+import scala.collection.mutable.HashMap
+
+import com.codahale.metrics.{Counter, Gauge, MetricRegistry, Timer}
+
+import org.apache.spark.TaskState
+import org.apache.spark.deploy.mesos.MesosDriverDescription
 import org.apache.spark.metrics.source.Source
 
+import org.apache.mesos.Protos.{TaskState => MesosTaskState, _}
+
 private[mesos] class MesosClusterSchedulerSource(scheduler: MesosClusterScheduler)
-  extends Source {
+  extends Source with MesosSchedulerUtils {
+
+  // Submission state transitions, to derive metrics from:
+  // - submit():
+  //     From: NULL
+  //     To: queuedDrivers
+  // - offers/scheduleTasks():
+  //     From: queuedDrivers and any pendingRetryDrivers scheduled for retry
+  //     To: launchedDrivers if success, or
+  //         finishedDrivers(fail) if exception
+  // - taskStatus/statusUpdate():
+  //     From: launchedDrivers
+  //     To: finishedDrivers(success) if success (or fail and not eligible to retry), or
+  //         pendingRetryDrivers if failed (and eligible to retry)
+  // - pruning/retireDriver():
+  //     From: finishedDrivers
+  //     To: NULL
 
   override val sourceName: String = "mesos_cluster"
-  override val metricRegistry: MetricRegistry = new MetricRegistry()
+  override val metricRegistry: MetricRegistry = new MetricRegistry
+
+  // PULL METRICS:
+  // These gauge metrics are periodically polled/pulled by the metrics system
 
-  metricRegistry.register(MetricRegistry.name("waitingDrivers"), new Gauge[Int] {
+  metricRegistry.register(MetricRegistry.name("driver", "waiting"), new Gauge[Int] {
     override def getValue: Int = scheduler.getQueuedDriversSize
   })
 
-  metricRegistry.register(MetricRegistry.name("launchedDrivers"), new Gauge[Int] {
+  metricRegistry.register(MetricRegistry.name("driver", "launched"), new Gauge[Int] {
     override def getValue: Int = scheduler.getLaunchedDriversSize
   })
 
-  metricRegistry.register(MetricRegistry.name("retryDrivers"), new Gauge[Int] {
+  metricRegistry.register(MetricRegistry.name("driver", "retry"), new Gauge[Int] {
     override def getValue: Int = scheduler.getPendingRetryDriversSize
   })
+
+  metricRegistry.register(MetricRegistry.name("driver", "finished"), new Gauge[Int] {
+    override def getValue: Int = scheduler.getFinishedDriversSize
+  })
+
+  // PUSH METRICS:
+  // These metrics are updated directly as events occur
+
+  private val queuedCounter = metricRegistry.counter(MetricRegistry.name("driver", "waiting_count"))
+  private val launchedCounter =
+    metricRegistry.counter(MetricRegistry.name("driver", "launched_count"))
+  private val retryCounter = metricRegistry.counter(MetricRegistry.name("driver", "retry_count"))
+  private val exceptionCounter =
+    metricRegistry.counter(MetricRegistry.name("driver", "exception_count"))
+  private val finishedCounter =
+    metricRegistry.counter(MetricRegistry.name("driver", "finished_count"))
+
+  // Same as finishedCounter above, except grouped by MesosTaskState.
+  private val finishedMesosStateCounters = MesosTaskState.values
+    // Avoid registering 'finished' metrics for states that aren't considered finished:
+    .filter(state => TaskState.isFinished(mesosToTaskState(state)))
+    .map(state => (state, metricRegistry.counter(
+      MetricRegistry.name("driver", "finished_count_mesos_state", state.name.toLowerCase))))
+    .toMap
+  private val finishedMesosUnknownStateCounter =
+    metricRegistry.counter(MetricRegistry.name("driver", "finished_count_mesos_state", "UNKNOWN"))
+
+  // Duration from submission to FIRST launch.
+  // This omits retries since those would exaggerate the time since original submission.
+  private val submitToFirstLaunch =
+    metricRegistry.timer(MetricRegistry.name("driver", "submit_to_first_launch"))
+  // Duration from initial submission to an exception.
+  private val submitToException =
+    metricRegistry.timer(MetricRegistry.name("driver", "submit_to_exception"))
+
+  // Duration from (most recent) launch to a retry.
+  private val launchToRetry = metricRegistry.timer(MetricRegistry.name("driver", "launch_to_retry"))
+
+  // Duration from initial submission to finished.
+  private val submitToFinish =
+    metricRegistry.timer(MetricRegistry.name("driver", "submit_to_finish"))
+  // Duration from (most recent) launch to finished.
+  private val launchToFinish =
+    metricRegistry.timer(MetricRegistry.name("driver", "launch_to_finish"))
+
+  // Same as submitToFinish and launchToFinish above, except grouped by Spark TaskState.
+  class FinishStateTimers(state: String) {
+    val submitToFinish =
+      metricRegistry.timer(MetricRegistry.name("driver", "submit_to_finish_state", state))
+    val launchToFinish =
+      metricRegistry.timer(MetricRegistry.name("driver", "launch_to_finish_state", state))
+  }
+  private val finishSparkStateTimers = HashMap.empty[TaskState.TaskState, FinishStateTimers]
+  for (state <- TaskState.values) {
+    // Avoid registering 'finished' metrics for states that aren't considered finished:
+    if (TaskState.isFinished(state)) {
+      finishSparkStateTimers += (state -> new FinishStateTimers(state.toString.toLowerCase))
+    }
+  }
+  private val submitToFinishUnknownState = metricRegistry.timer(
+    MetricRegistry.name("driver", "submit_to_finish_state", "UNKNOWN"))
+  private val launchToFinishUnknownState = metricRegistry.timer(
+    MetricRegistry.name("driver", "launch_to_finish_state", "UNKNOWN"))
+
+  // Histogram of retry counts at retry scheduling
+  private val retryCount = metricRegistry.histogram(MetricRegistry.name("driver", "retry_counts"))
+
+  // Records when a submission initially enters the launch queue.
+  def recordQueuedDriver(): Unit = queuedCounter.inc
+
+  // Records when a submission has failed an attempt and is eligible to be retried
+  def recordRetryingDriver(state: MesosClusterSubmissionState): Unit = {
+    state.driverDescription.retryState.foreach(retryState => retryCount.update(retryState.retries))
+    recordTimeSince(state.startDate, launchToRetry)
+    retryCounter.inc
+  }
+
+  // Records when a submission is launched.
+  def recordLaunchedDriver(desc: MesosDriverDescription): Unit = {
+    if (!desc.retryState.isDefined) {
+      recordTimeSince(desc.submissionDate, submitToFirstLaunch)
+    }
+    launchedCounter.inc
+  }
+
+  // Records when a submission has successfully finished, or failed and was not eligible for retry.
+  def recordFinishedDriver(state: MesosClusterSubmissionState, mesosState: MesosTaskState): Unit = {
+    finishedCounter.inc
+
+    recordTimeSince(state.driverDescription.submissionDate, submitToFinish)
+    recordTimeSince(state.startDate, launchToFinish)
+
+    // Timers grouped by Spark TaskState:
+    val sparkState = mesosToTaskState(mesosState)
+    finishSparkStateTimers.get(sparkState) match {
+      case Some(timers) => {
+        recordTimeSince(state.driverDescription.submissionDate, timers.submitToFinish)
+        recordTimeSince(state.startDate, timers.launchToFinish)
+      }
+      case None => {
+        recordTimeSince(state.driverDescription.submissionDate, submitToFinishUnknownState)
+        recordTimeSince(state.startDate, launchToFinishUnknownState)
+      }
+    }
+
+    // Counter grouped by MesosTaskState:
+    finishedMesosStateCounters.get(mesosState) match {
+      case Some(counter) => counter.inc
+      case None => finishedMesosUnknownStateCounter.inc
+    }
+  }
+
+  // Records when a submission has terminally failed due to an exception at construction.
+  def recordExceptionDriver(desc: MesosDriverDescription): Unit = {
+    recordTimeSince(desc.submissionDate, submitToException)
+    exceptionCounter.inc
+  }
+
+  private def recordTimeSince(date: Date, timer: Timer): Unit =
+    timer.update(System.currentTimeMillis - date.getTime, TimeUnit.MILLISECONDS)
 }
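The class mixes two Codahale usage patterns: pull-style Gauges, whose getValue is evaluated whenever the metrics system polls the registry, and push-style Counters/Timers/Histograms that the record* hooks update as driver events occur. The standalone sketch below illustrates that split with the plain Dropwizard (com.codahale.metrics) API; the object name, the simulated queue, and the ConsoleReporter are illustrative choices for this sketch, not part of the patch.

```scala
import java.util.concurrent.TimeUnit
import java.util.concurrent.atomic.AtomicInteger

import com.codahale.metrics.{ConsoleReporter, Gauge, MetricRegistry}

object PullVsPushSketch {
  def main(args: Array[String]): Unit = {
    val registry = new MetricRegistry

    // Pull: the gauge is only evaluated when a reporter/sink polls the registry.
    val queueSize = new AtomicInteger(0)
    registry.register(MetricRegistry.name("driver", "waiting"), new Gauge[Int] {
      override def getValue: Int = queueSize.get
    })

    // Push: counters and timers are updated at the moment an event happens.
    val launchedCounter = registry.counter(MetricRegistry.name("driver", "launched_count"))
    val submitToFirstLaunch =
      registry.timer(MetricRegistry.name("driver", "submit_to_first_launch"))

    // Simulate one submission being queued, then launched ~250 ms later.
    queueSize.incrementAndGet()
    val submittedAt = System.currentTimeMillis
    Thread.sleep(250)
    queueSize.decrementAndGet()
    launchedCounter.inc()
    submitToFirstLaunch.update(System.currentTimeMillis - submittedAt, TimeUnit.MILLISECONDS)

    // One-shot console dump standing in for a long-running metrics sink.
    ConsoleReporter.forRegistry(registry)
      .convertRatesTo(TimeUnit.SECONDS)
      .convertDurationsTo(TimeUnit.MILLISECONDS)
      .build()
      .report()
  }
}
```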
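The state-transition comment at the top of the patch also pins down where each push hook is meant to fire. The sketch below gathers those call sites in one place; the wrapper class and its method names (onSubmit, onLaunch, ...) are hypothetical stand-ins for the corresponding points inside MesosClusterScheduler, which is not shown in this diff.

```scala
package org.apache.spark.scheduler.cluster.mesos

import org.apache.mesos.Protos.{TaskState => MesosTaskState}

import org.apache.spark.deploy.mesos.MesosDriverDescription

// Hypothetical wrapper: each method marks one transition from the comment block in the patch.
private[mesos] class DriverLifecycleHooksSketch(source: MesosClusterSchedulerSource) {

  // submit(): NULL -> queuedDrivers
  def onSubmit(): Unit = source.recordQueuedDriver()

  // offers/scheduleTasks(): queuedDrivers or pendingRetryDrivers -> launchedDrivers
  def onLaunch(desc: MesosDriverDescription): Unit = source.recordLaunchedDriver(desc)

  // offers/scheduleTasks(): queuedDrivers -> finishedDrivers(fail) when construction throws
  def onException(desc: MesosDriverDescription): Unit = source.recordExceptionDriver(desc)

  // statusUpdate(): launchedDrivers -> pendingRetryDrivers when the driver is eligible to retry
  def onRetry(state: MesosClusterSubmissionState): Unit = source.recordRetryingDriver(state)

  // statusUpdate(): launchedDrivers -> finishedDrivers when finished, or failed and not retried
  def onFinish(state: MesosClusterSubmissionState, mesosState: MesosTaskState): Unit =
    source.recordFinishedDriver(state, mesosState)
}
```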