@@ -143,7 +143,7 @@ def collect(self) -> Iterator[Metric]:
 
         now = time()
         max_tick_duration = max(
-            self.server.digests_max["tick_duration"],
+            self.server.digests_max.pop("tick_duration", 0),
             now - self.server._last_tick,
         )
         yield GaugeMetricFamily(
@@ -152,46 +152,12 @@ def collect(self) -> Iterator[Metric]:
             unit="seconds",
             value=max_tick_duration,
         )
-
         yield CounterMetricFamily(
             self.build_name("tick_count"),
             "Total number of ticks observed since the server started",
             value=self.server._tick_counter,
         )
 
-        # This duplicates spill_time_total; however the breakdown is different
-        evloop_blocked_total = CounterMetricFamily(
-            self.build_name("event_loop_blocked_time"),
-            "Total time during which the worker's event loop was blocked "
-            "by spill/unspill activity since the latest worker reset",
-            unit="seconds",
-            labels=["cause"],
-        )
-        # This is typically higher than spill_time_per_key_max, as multiple keys can be
-        # spilled/unspilled without yielding the event loop
-        evloop_blocked_max = GaugeMetricFamily(
-            self.build_name("event_loop_blocked_time_max"),
-            "Maximum contiguous time during which the worker's event loop was blocked "
-            "by spill/unspill activity since the previous Prometheus poll",
-            unit="seconds",
-            labels=["cause"],
-        )
-        for family, digest in (
-            (evloop_blocked_total, self.server.digests_total),
-            (evloop_blocked_max, self.server.digests_max),
-        ):
-            for family_label, digest_label in (
-                ("disk-write-target", "disk-write-target-duration"),
-                ("disk-write-spill", "disk-write-spill-duration"),
-                ("disk-read-execute", "disk-load-duration"),
-                ("disk-read-get-data", "get-data-load-duration"),
-            ):
-                family.add_metric([family_label], digest[digest_label])
-
-        yield evloop_blocked_total
-        yield evloop_blocked_max
-        self.server.digests_max.clear()
-
     def collect_crick(self) -> Iterator[Metric]:
         # All metrics using digests require crick to be installed.
         # The following metrics will export NaN, if the corresponding digests are None
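Taken together with the first hunk, this removal changes how `digests_max` is reset: instead of one collector wiping the whole dict with `clear()`, each reader now `pop`s only the key it owns, defaulting to 0 when nothing was recorded since the last poll, so entries belonging to other collectors survive. A minimal sketch of that reset-on-read pattern, with hypothetical dict contents standing in for `self.server.digests_max`:

```python
from collections import defaultdict

# Hypothetical stand-in for self.server.digests_max: the highest value
# observed per event since the previous Prometheus poll.
digests_max: defaultdict[str, float] = defaultdict(float)
digests_max["tick_duration"] = 0.25
digests_max["disk-write-target-duration"] = 1.5

# Reset-on-read: pop only the key this collector owns; the 0 default covers
# polls during which no tick was recorded. Here 0.1 stands in for
# `now - self.server._last_tick`.
max_tick_duration = max(digests_max.pop("tick_duration", 0), 0.1)
assert max_tick_duration == 0.25

# Entries owned by other collectors survive the poll, which a blanket
# digests_max.clear() would have discarded.
assert "disk-write-target-duration" in digests_max
```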
@@ -243,42 +209,36 @@ def collect_spillbuffer(self) -> Iterator[Metric]:
         read = spill_bytes.disk_read / spill_time.disk_read
         """
         try:
-            get_metrics = self.server.data.get_metrics  # type: ignore
+            metrics = self.server.data.cumulative_metrics  # type: ignore
         except AttributeError:
             return  # spilling is disabled
-        metrics = get_metrics()
 
-        total_bytes = CounterMetricFamily(
-            self.build_name("spill_bytes"),
-            "Total size of memory and disk accesses caused by managed data "
-            "since the latest worker restart",
-            labels=["activity"],
-        )
-        # Note: memory_read is used to calculate cache hit ratios (see docstring)
-        for k in ("memory_read", "disk_read", "disk_write"):
-            total_bytes.add_metric([k], metrics[f"{k}_bytes_total"])
-        yield total_bytes
-
-        total_counts = CounterMetricFamily(
-            self.build_name("spill_count"),
-            "Total number of memory and disk accesses caused by managed data "
-            "since the latest worker restart",
-            labels=["activity"],
-        )
+        counters = {
+            "bytes": CounterMetricFamily(
+                self.build_name("spill_bytes"),
+                "Total size of memory and disk accesses caused by managed data "
+                "since the latest worker restart",
+                labels=["activity"],
+            ),
+            "count": CounterMetricFamily(
+                self.build_name("spill_count"),
+                "Total number of memory and disk accesses caused by managed data "
+                "since the latest worker restart",
+                labels=["activity"],
+            ),
+            "seconds": CounterMetricFamily(
+                self.build_name("spill_time"),
+                "Total time spent spilling/unspilling since the latest worker restart",
+                unit="seconds",
+                labels=["activity"],
+            ),
+        }
+
         # Note: memory_read is used to calculate cache hit ratios (see docstring)
-        for k in ("memory_read", "disk_read", "disk_write"):
-            total_counts.add_metric([k], metrics[f"{k}_count_total"])
-        yield total_counts
+        for (label, unit), value in metrics.items():
+            counters[unit].add_metric([label], value)
 
-        total_times = CounterMetricFamily(
-            self.build_name("spill_time"),
-            "Total time spent spilling/unspilling since the latest worker restart",
-            unit="seconds",
-            labels=["activity"],
-        )
-        for k in ("pickle", "disk_write", "disk_read", "unpickle"):
-            total_times.add_metric([k], metrics[f"{k}_time_total"])
-        yield total_times
+        yield from counters.values()
 
 
 class PrometheusHandler(RequestHandler):
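For illustration: the rewritten `collect_spillbuffer` assumes `SpillBuffer.cumulative_metrics` maps `(activity_label, unit)` pairs to cumulative values, with `unit` being one of `"bytes"`, `"count"`, or `"seconds"`, so a single loop can route every entry to the matching counter family. Below is a self-contained sketch of that routing, plus a cache-hit-ratio derivation of the kind the `memory_read` comment refers to; the mapping contents and the `demo_spill_*` metric names are made up for the demo, not real output:

```python
from prometheus_client.core import CounterMetricFamily

# Hypothetical snapshot of SpillBuffer.cumulative_metrics:
# {(activity_label, unit): cumulative_value}
cumulative_metrics = {
    ("memory_read", "bytes"): 2_000_000.0,
    ("memory_read", "count"): 60.0,
    ("disk_read", "bytes"): 500_000.0,
    ("disk_read", "count"): 15.0,
    ("disk_read", "seconds"): 0.8,
    ("disk_write", "bytes"): 700_000.0,
    ("disk_write", "count"): 20.0,
    ("disk_write", "seconds"): 1.1,
}

# One family per unit, mirroring the counters dict in the patch
# (metric names here are illustrative).
counters = {
    "bytes": CounterMetricFamily(
        "demo_spill_bytes", "Bytes moved by spill activity", labels=["activity"]
    ),
    "count": CounterMetricFamily(
        "demo_spill_count", "Number of spill accesses", labels=["activity"]
    ),
    "seconds": CounterMetricFamily(
        "demo_spill_time",
        "Time spent spilling/unspilling",
        unit="seconds",
        labels=["activity"],
    ),
}

# Same routing as the patched loop: the unit selects the family and the
# activity becomes the Prometheus label.
for (label, unit), value in cumulative_metrics.items():
    counters[unit].add_metric([label], value)

# Derived metric in the spirit of the docstring: cache hit ratio by bytes,
# i.e. how much managed data was served from memory rather than disk.
bytes_by_activity = {
    label: v for (label, unit), v in cumulative_metrics.items() if unit == "bytes"
}
hit_ratio = bytes_by_activity["memory_read"] / (
    bytes_by_activity["memory_read"] + bytes_by_activity["disk_read"]
)
print(f"cache hit ratio (bytes): {hit_ratio:.2f}")  # 0.80
```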