Skip to content

Commit 93a8a4e

Browse files
committed
add new ingester metrics
Signed-off-by: Ben Ye <benye@amazon.com>
1 parent 752c354 commit 93a8a4e

File tree

2 files changed

+128
-5
lines changed

2 files changed

+128
-5
lines changed

pkg/ingester/metrics.go

Lines changed: 35 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -262,6 +262,11 @@ type tsdbMetrics struct {
262262
tsdbChunksRemovedTotal *prometheus.Desc
263263
tsdbMmapChunkCorruptionTotal *prometheus.Desc
264264
tsdbChunkwriteQueueOperationsTotal *prometheus.Desc
265+
tsdbSamplesAppended *prometheus.Desc
266+
tsdbOutOfOrderSamplesAppended *prometheus.Desc
267+
tsdbSnapshotReplayErrorTotal *prometheus.Desc
268+
tsdbOOOHistogram *prometheus.Desc
269+
tsdbMmapChunksTotal *prometheus.Desc
265270

266271
tsdbExemplarsTotal *prometheus.Desc
267272
tsdbExemplarsInStorage *prometheus.Desc
@@ -429,6 +434,26 @@ func newTSDBMetrics(r prometheus.Registerer) *tsdbMetrics {
429434
"cortex_ingester_tsdb_checkpoint_creations_total",
430435
"Total number of TSDB checkpoint creations attempted.",
431436
nil, nil),
437+
tsdbSamplesAppended: prometheus.NewDesc(
438+
"cortex_ingester_tsdb_head_samples_appended_total",
439+
"Total number of appended samples.",
440+
[]string{"type", "user"}, nil),
441+
tsdbOutOfOrderSamplesAppended: prometheus.NewDesc(
442+
"cortex_ingester_tsdb_head_out_of_order_samples_appended_total",
443+
"Total number of appended out of order samples.",
444+
[]string{"user"}, nil),
445+
tsdbSnapshotReplayErrorTotal: prometheus.NewDesc(
446+
"cortex_ingester_tsdb_snapshot_replay_error_total",
447+
"Total number snapshot replays that failed.",
448+
nil, nil),
449+
tsdbOOOHistogram: prometheus.NewDesc(
450+
"cortex_ingester_tsdb_sample_ooo_delta",
451+
"Delta in seconds by which a sample is considered out of order (reported regardless of OOO time window and whether sample is accepted or not).",
452+
nil, nil),
453+
tsdbMmapChunksTotal: prometheus.NewDesc(
454+
"cortex_ingester_tsdb_mmap_chunks_total",
455+
"Total number of chunks that were memory-mapped.",
456+
nil, nil),
432457

433458
// The most useful exemplar metrics are per-user. The rest
434459
// are global to reduce metrics overhead.
@@ -497,6 +522,11 @@ func (sm *tsdbMetrics) Describe(out chan<- *prometheus.Desc) {
497522
out <- sm.tsdbReloadsFailed
498523
out <- sm.tsdbTimeRetentionCount
499524
out <- sm.tsdbBlocksBytes
525+
out <- sm.tsdbSamplesAppended
526+
out <- sm.tsdbOutOfOrderSamplesAppended
527+
out <- sm.tsdbSnapshotReplayErrorTotal
528+
out <- sm.tsdbOOOHistogram
529+
out <- sm.tsdbMmapChunksTotal
500530
out <- sm.checkpointDeleteFail
501531
out <- sm.checkpointDeleteTotal
502532
out <- sm.checkpointCreationFail
@@ -547,6 +577,11 @@ func (sm *tsdbMetrics) Collect(out chan<- prometheus.Metric) {
547577
data.SendSumOfCounters(out, sm.tsdbReloadsFailed, "prometheus_tsdb_reloads_failures_total")
548578
data.SendSumOfCounters(out, sm.tsdbTimeRetentionCount, "prometheus_tsdb_time_retentions_total")
549579
data.SendSumOfGaugesPerUser(out, sm.tsdbBlocksBytes, "prometheus_tsdb_storage_blocks_bytes")
580+
data.SendSumOfCountersPerUserWithLabels(out, sm.tsdbSamplesAppended, "prometheus_tsdb_head_samples_appended_total", "type")
581+
data.SendSumOfCountersPerUser(out, sm.tsdbOutOfOrderSamplesAppended, "prometheus_tsdb_head_out_of_order_samples_appended_total")
582+
data.SendSumOfCounters(out, sm.tsdbSnapshotReplayErrorTotal, "prometheus_tsdb_snapshot_replay_error_total")
583+
data.SendSumOfHistograms(out, sm.tsdbOOOHistogram, "prometheus_tsdb_sample_ooo_delta")
584+
data.SendSumOfGauges(out, sm.tsdbMmapChunksTotal, "prometheus_tsdb_mmap_chunks_total")
550585
data.SendSumOfCounters(out, sm.checkpointDeleteFail, "prometheus_tsdb_checkpoint_deletions_failed_total")
551586
data.SendSumOfCounters(out, sm.checkpointDeleteTotal, "prometheus_tsdb_checkpoint_deletions_total")
552587
data.SendSumOfCounters(out, sm.checkpointCreationFail, "prometheus_tsdb_checkpoint_creations_failed_total")

pkg/ingester/metrics_test.go

Lines changed: 93 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -104,7 +104,16 @@ func TestTSDBMetrics(t *testing.T) {
104104
# TYPE cortex_ingester_tsdb_head_gc_duration_seconds summary
105105
cortex_ingester_tsdb_head_gc_duration_seconds_sum 9
106106
cortex_ingester_tsdb_head_gc_duration_seconds_count 3
107-
107+
# HELP cortex_ingester_tsdb_head_out_of_order_samples_appended_total Total number of appended out of order samples.
108+
# TYPE cortex_ingester_tsdb_head_out_of_order_samples_appended_total counter
109+
cortex_ingester_tsdb_head_out_of_order_samples_appended_total{user="user1"} 102
110+
cortex_ingester_tsdb_head_out_of_order_samples_appended_total{user="user2"} 102
111+
cortex_ingester_tsdb_head_out_of_order_samples_appended_total{user="user3"} 102
112+
# HELP cortex_ingester_tsdb_head_samples_appended_total Total number of appended samples.
113+
# TYPE cortex_ingester_tsdb_head_samples_appended_total counter
114+
cortex_ingester_tsdb_head_samples_appended_total{type="user1",user="float"} 101
115+
cortex_ingester_tsdb_head_samples_appended_total{type="user2",user="float"} 101
116+
cortex_ingester_tsdb_head_samples_appended_total{type="user3",user="float"} 101
108117
# HELP cortex_ingester_tsdb_checkpoint_deletions_failed_total Total number of TSDB checkpoint deletions that failed.
109118
# TYPE cortex_ingester_tsdb_checkpoint_deletions_failed_total counter
110119
cortex_ingester_tsdb_checkpoint_deletions_failed_total 1586096
@@ -167,15 +176,31 @@ func TestTSDBMetrics(t *testing.T) {
167176
# HELP cortex_ingester_tsdb_mmap_chunk_corruptions_total Total number of memory-mapped TSDB chunk corruptions.
168177
# TYPE cortex_ingester_tsdb_mmap_chunk_corruptions_total counter
169178
cortex_ingester_tsdb_mmap_chunk_corruptions_total 2577406
170-
179+
# HELP cortex_ingester_tsdb_mmap_chunks_total Total number of chunks that were memory-mapped.
180+
# TYPE cortex_ingester_tsdb_mmap_chunks_total gauge
181+
cortex_ingester_tsdb_mmap_chunks_total 0
171182
# HELP cortex_ingester_tsdb_blocks_loaded Number of currently loaded data blocks
172183
# TYPE cortex_ingester_tsdb_blocks_loaded gauge
173184
cortex_ingester_tsdb_blocks_loaded 15
174185
175186
# HELP cortex_ingester_tsdb_reloads_total Number of times the database reloaded block data from disk.
176187
# TYPE cortex_ingester_tsdb_reloads_total counter
177188
cortex_ingester_tsdb_reloads_total 30
178-
189+
# HELP cortex_ingester_tsdb_sample_ooo_delta Delta in seconds by which a sample is considered out of order (reported regardless of OOO time window and whether sample is accepted or not).
190+
# TYPE cortex_ingester_tsdb_sample_ooo_delta histogram
191+
cortex_ingester_tsdb_sample_ooo_delta_bucket{le="600"} 0
192+
cortex_ingester_tsdb_sample_ooo_delta_bucket{le="1800"} 3
193+
cortex_ingester_tsdb_sample_ooo_delta_bucket{le="3600"} 3
194+
cortex_ingester_tsdb_sample_ooo_delta_bucket{le="7200"} 3
195+
cortex_ingester_tsdb_sample_ooo_delta_bucket{le="10800"} 3
196+
cortex_ingester_tsdb_sample_ooo_delta_bucket{le="21600"} 3
197+
cortex_ingester_tsdb_sample_ooo_delta_bucket{le="43200"} 3
198+
cortex_ingester_tsdb_sample_ooo_delta_bucket{le="+Inf"} 3
199+
cortex_ingester_tsdb_sample_ooo_delta_sum 2700
200+
cortex_ingester_tsdb_sample_ooo_delta_count 3
201+
# HELP cortex_ingester_tsdb_snapshot_replay_error_total Total number snapshot replays that failed.
202+
# TYPE cortex_ingester_tsdb_snapshot_replay_error_total counter
203+
cortex_ingester_tsdb_snapshot_replay_error_total 309
179204
# HELP cortex_ingester_tsdb_reloads_failures_total Number of times the database failed to reloadBlocks block data from disk.
180205
# TYPE cortex_ingester_tsdb_reloads_failures_total counter
181206
cortex_ingester_tsdb_reloads_failures_total 21
@@ -318,6 +343,14 @@ func TestTSDBMetricsWithRemoval(t *testing.T) {
318343
# TYPE cortex_ingester_tsdb_head_gc_duration_seconds summary
319344
cortex_ingester_tsdb_head_gc_duration_seconds_sum 9
320345
cortex_ingester_tsdb_head_gc_duration_seconds_count 3
346+
# HELP cortex_ingester_tsdb_head_out_of_order_samples_appended_total Total number of appended out of order samples.
347+
# TYPE cortex_ingester_tsdb_head_out_of_order_samples_appended_total counter
348+
cortex_ingester_tsdb_head_out_of_order_samples_appended_total{user="user1"} 102
349+
cortex_ingester_tsdb_head_out_of_order_samples_appended_total{user="user2"} 102
350+
# HELP cortex_ingester_tsdb_head_samples_appended_total Total number of appended samples.
351+
# TYPE cortex_ingester_tsdb_head_samples_appended_total counter
352+
cortex_ingester_tsdb_head_samples_appended_total{type="user1",user="float"} 101
353+
cortex_ingester_tsdb_head_samples_appended_total{type="user2",user="float"} 101
321354
322355
# HELP cortex_ingester_tsdb_checkpoint_deletions_failed_total Total number of TSDB checkpoint deletions that failed.
323356
# TYPE cortex_ingester_tsdb_checkpoint_deletions_failed_total counter
@@ -377,15 +410,31 @@ func TestTSDBMetricsWithRemoval(t *testing.T) {
377410
# HELP cortex_ingester_tsdb_mmap_chunk_corruptions_total Total number of memory-mapped TSDB chunk corruptions.
378411
# TYPE cortex_ingester_tsdb_mmap_chunk_corruptions_total counter
379412
cortex_ingester_tsdb_mmap_chunk_corruptions_total 2577406
380-
413+
# HELP cortex_ingester_tsdb_mmap_chunks_total Total number of chunks that were memory-mapped.
414+
# TYPE cortex_ingester_tsdb_mmap_chunks_total gauge
415+
cortex_ingester_tsdb_mmap_chunks_total 0
381416
# HELP cortex_ingester_tsdb_blocks_loaded Number of currently loaded data blocks
382417
# TYPE cortex_ingester_tsdb_blocks_loaded gauge
383418
cortex_ingester_tsdb_blocks_loaded 10
384419
385420
# HELP cortex_ingester_tsdb_reloads_total Number of times the database reloaded block data from disk.
386421
# TYPE cortex_ingester_tsdb_reloads_total counter
387422
cortex_ingester_tsdb_reloads_total 30
388-
423+
# HELP cortex_ingester_tsdb_sample_ooo_delta Delta in seconds by which a sample is considered out of order (reported regardless of OOO time window and whether sample is accepted or not).
424+
# TYPE cortex_ingester_tsdb_sample_ooo_delta histogram
425+
cortex_ingester_tsdb_sample_ooo_delta_bucket{le="600"} 0
426+
cortex_ingester_tsdb_sample_ooo_delta_bucket{le="1800"} 3
427+
cortex_ingester_tsdb_sample_ooo_delta_bucket{le="3600"} 3
428+
cortex_ingester_tsdb_sample_ooo_delta_bucket{le="7200"} 3
429+
cortex_ingester_tsdb_sample_ooo_delta_bucket{le="10800"} 3
430+
cortex_ingester_tsdb_sample_ooo_delta_bucket{le="21600"} 3
431+
cortex_ingester_tsdb_sample_ooo_delta_bucket{le="43200"} 3
432+
cortex_ingester_tsdb_sample_ooo_delta_bucket{le="+Inf"} 3
433+
cortex_ingester_tsdb_sample_ooo_delta_sum 2700
434+
cortex_ingester_tsdb_sample_ooo_delta_count 3
435+
# HELP cortex_ingester_tsdb_snapshot_replay_error_total Total number snapshot replays that failed.
436+
# TYPE cortex_ingester_tsdb_snapshot_replay_error_total counter
437+
cortex_ingester_tsdb_snapshot_replay_error_total 309
389438
# HELP cortex_ingester_tsdb_reloads_failures_total Number of times the database failed to reloadBlocks block data from disk.
390439
# TYPE cortex_ingester_tsdb_reloads_failures_total counter
391440
cortex_ingester_tsdb_reloads_failures_total 21
@@ -608,6 +657,45 @@ func populateTSDBMetrics(base float64) *prometheus.Registry {
608657
})
609658
gcDuration.Observe(3)
610659

660+
samplesAppended := promauto.With(r).NewCounterVec(prometheus.CounterOpts{
661+
Name: "prometheus_tsdb_head_samples_appended_total",
662+
Help: "Total number of appended samples.",
663+
}, []string{"type"})
664+
samplesAppended.WithLabelValues("float").Add(101)
665+
666+
outOfOrderSamplesAppended := promauto.With(r).NewCounter(prometheus.CounterOpts{
667+
Name: "prometheus_tsdb_head_out_of_order_samples_appended_total",
668+
Help: "Total number of appended out of order samples.",
669+
})
670+
outOfOrderSamplesAppended.Add(102)
671+
672+
snapshotReplayErrorTotal := promauto.With(r).NewCounter(prometheus.CounterOpts{
673+
Name: "prometheus_tsdb_snapshot_replay_error_total",
674+
Help: "Total number snapshot replays that failed.",
675+
})
676+
snapshotReplayErrorTotal.Add(103)
677+
678+
oooHistogram := promauto.With(r).NewHistogram(prometheus.HistogramOpts{
679+
Name: "prometheus_tsdb_sample_ooo_delta",
680+
Help: "Delta in seconds by which a sample is considered out of order (reported regardless of OOO time window and whether sample is accepted or not).",
681+
Buckets: []float64{
682+
60 * 10, // 10 min
683+
60 * 30, // 30 min
684+
60 * 60, // 60 min
685+
60 * 60 * 2, // 2h
686+
60 * 60 * 3, // 3h
687+
60 * 60 * 6, // 6h
688+
60 * 60 * 12, // 12h
689+
},
690+
})
691+
oooHistogram.Observe(60 * 15)
692+
693+
mmapChunksTotal := promauto.With(r).NewCounter(prometheus.CounterOpts{
694+
Name: "prometheus_tsdb_mmap_chunks_total",
695+
Help: "Total number of chunks that were memory-mapped.",
696+
})
697+
mmapChunksTotal.Add(104)
698+
611699
loadedBlocks := promauto.With(r).NewGauge(prometheus.GaugeOpts{
612700
Name: "prometheus_tsdb_blocks_loaded",
613701
Help: "Number of currently loaded data blocks",

0 commit comments

Comments
 (0)