Skip to content

Add more TSDB metrics for ingester to emit #3800

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Feb 9, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,13 @@
* `-blocks-storage.s3.max-connections-per-host`: Maximum number of connections per host. 0 means no limit.
* [ENHANCEMENT] Ingester: when tenant's TSDB is closed, Ingester now removes pushed metrics-metadata from memory, and removes metadata (`cortex_ingester_memory_metadata`, `cortex_ingester_memory_metadata_created_total`, `cortex_ingester_memory_metadata_removed_total`) and validation metrics (`cortex_discarded_samples_total`, `cortex_discarded_metadata_total`). #3782
* [ENHANCEMENT] Distributor: cleanup metrics for inactive tenants. #3784
* [ENHANCEMENT] Ingester: the ingester now re-emits the following TSDB metrics. #3800
* `cortex_ingester_tsdb_blocks_loaded`
* `cortex_ingester_tsdb_reloads_total`
* `cortex_ingester_tsdb_reloads_failures_total`
* `cortex_ingester_tsdb_symbol_table_size_bytes`
* `cortex_ingester_tsdb_storage_blocks_bytes`
* `cortex_ingester_tsdb_time_retentions_total`
* [BUGFIX] Cortex: Fixed issue where fatal errors and various log messages were not logged. #3778
* [BUGFIX] HA Tracker: don't track as error in the `cortex_kv_request_duration_seconds` metric a CAS operation intentionally aborted. #3745
* [BUGFIX] Querier / ruler: do not log "error removing stale clients" if the ring is empty. #3761
Expand Down
44 changes: 44 additions & 0 deletions pkg/ingester/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -259,6 +259,14 @@ type tsdbMetrics struct {
tsdbChunksRemovedTotal *prometheus.Desc
tsdbMmapChunkCorruptionTotal *prometheus.Desc

// The following metrics are from https://github.com/prometheus/prometheus/blob/fbe960f2c1ad9d6f5fe2f267d2559bf7ecfab6df/tsdb/db.go#L179
tsdbLoadedBlocks *prometheus.Desc
tsdbSymbolTableSize *prometheus.Desc
tsdbReloads *prometheus.Desc
tsdbReloadsFailed *prometheus.Desc
tsdbTimeRetentionCount *prometheus.Desc
tsdbBlocksBytes *prometheus.Desc

checkpointDeleteFail *prometheus.Desc
checkpointDeleteTotal *prometheus.Desc
checkpointCreationFail *prometheus.Desc
Expand Down Expand Up @@ -367,6 +375,30 @@ func newTSDBMetrics(r prometheus.Registerer) *tsdbMetrics {
"cortex_ingester_tsdb_mmap_chunk_corruptions_total",
"Total number of memory-mapped TSDB chunk corruptions.",
nil, nil),
tsdbLoadedBlocks: prometheus.NewDesc(
"cortex_ingester_tsdb_blocks_loaded",
"Number of currently loaded data blocks",
nil, nil),
tsdbReloads: prometheus.NewDesc(
"cortex_ingester_tsdb_reloads_total",
"Number of times the database reloaded block data from disk.",
nil, nil),
tsdbReloadsFailed: prometheus.NewDesc(
"cortex_ingester_tsdb_reloads_failures_total",
"Number of times the database failed to reloadBlocks block data from disk.",
nil, nil),
tsdbSymbolTableSize: prometheus.NewDesc(
"cortex_ingester_tsdb_symbol_table_size_bytes",
"Size of symbol table in memory for loaded blocks",
[]string{"user"}, nil),
tsdbBlocksBytes: prometheus.NewDesc(
"cortex_ingester_tsdb_storage_blocks_bytes",
"The number of bytes that are currently used for local storage by all blocks.",
[]string{"user"}, nil),
tsdbTimeRetentionCount: prometheus.NewDesc(
"cortex_ingester_tsdb_time_retentions_total",
"The number of times that blocks were deleted because the maximum time limit was exceeded.",
nil, nil),
checkpointDeleteFail: prometheus.NewDesc(
"cortex_ingester_tsdb_checkpoint_deletions_failed_total",
"Total number of TSDB checkpoint deletions that failed.",
Expand Down Expand Up @@ -419,6 +451,12 @@ func (sm *tsdbMetrics) Describe(out chan<- *prometheus.Desc) {
out <- sm.tsdbChunksCreatedTotal
out <- sm.tsdbChunksRemovedTotal
out <- sm.tsdbMmapChunkCorruptionTotal
out <- sm.tsdbLoadedBlocks
out <- sm.tsdbSymbolTableSize
out <- sm.tsdbReloads
out <- sm.tsdbReloadsFailed
out <- sm.tsdbTimeRetentionCount
out <- sm.tsdbBlocksBytes
out <- sm.checkpointDeleteFail
out <- sm.checkpointDeleteTotal
out <- sm.checkpointCreationFail
Expand Down Expand Up @@ -456,6 +494,12 @@ func (sm *tsdbMetrics) Collect(out chan<- prometheus.Metric) {
data.SendSumOfCountersPerUser(out, sm.tsdbChunksCreatedTotal, "prometheus_tsdb_head_chunks_created_total")
data.SendSumOfCountersPerUser(out, sm.tsdbChunksRemovedTotal, "prometheus_tsdb_head_chunks_removed_total")
data.SendSumOfCounters(out, sm.tsdbMmapChunkCorruptionTotal, "prometheus_tsdb_mmap_chunk_corruptions_total")
data.SendSumOfGauges(out, sm.tsdbLoadedBlocks, "prometheus_tsdb_blocks_loaded")
data.SendSumOfGaugesPerUser(out, sm.tsdbSymbolTableSize, "prometheus_tsdb_symbol_table_size_bytes")
data.SendSumOfCounters(out, sm.tsdbReloads, "prometheus_tsdb_reloads_total")
data.SendSumOfCounters(out, sm.tsdbReloadsFailed, "prometheus_tsdb_reloads_failures_total")
data.SendSumOfCounters(out, sm.tsdbTimeRetentionCount, "prometheus_tsdb_time_retentions_total")
data.SendSumOfGaugesPerUser(out, sm.tsdbBlocksBytes, "prometheus_tsdb_storage_blocks_bytes")
data.SendSumOfCounters(out, sm.checkpointDeleteFail, "prometheus_tsdb_checkpoint_deletions_failed_total")
data.SendSumOfCounters(out, sm.checkpointDeleteTotal, "prometheus_tsdb_checkpoint_deletions_total")
data.SendSumOfCounters(out, sm.checkpointCreationFail, "prometheus_tsdb_checkpoint_creations_failed_total")
Expand Down
89 changes: 89 additions & 0 deletions pkg/ingester/metrics_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -167,6 +167,34 @@ func TestTSDBMetrics(t *testing.T) {
# HELP cortex_ingester_tsdb_mmap_chunk_corruptions_total Total number of memory-mapped TSDB chunk corruptions.
# TYPE cortex_ingester_tsdb_mmap_chunk_corruptions_total counter
cortex_ingester_tsdb_mmap_chunk_corruptions_total 2577406

# HELP cortex_ingester_tsdb_blocks_loaded Number of currently loaded data blocks
# TYPE cortex_ingester_tsdb_blocks_loaded gauge
cortex_ingester_tsdb_blocks_loaded 15

# HELP cortex_ingester_tsdb_reloads_total Number of times the database reloaded block data from disk.
# TYPE cortex_ingester_tsdb_reloads_total counter
cortex_ingester_tsdb_reloads_total 30

# HELP cortex_ingester_tsdb_reloads_failures_total Number of times the database failed to reloadBlocks block data from disk.
# TYPE cortex_ingester_tsdb_reloads_failures_total counter
cortex_ingester_tsdb_reloads_failures_total 21

# HELP cortex_ingester_tsdb_symbol_table_size_bytes Size of symbol table in memory for loaded blocks
# TYPE cortex_ingester_tsdb_symbol_table_size_bytes gauge
cortex_ingester_tsdb_symbol_table_size_bytes{user="user1"} 12641280
cortex_ingester_tsdb_symbol_table_size_bytes{user="user2"} 87845888
cortex_ingester_tsdb_symbol_table_size_bytes{user="user3"} 1022976

# HELP cortex_ingester_tsdb_storage_blocks_bytes The number of bytes that are currently used for local storage by all blocks.
# TYPE cortex_ingester_tsdb_storage_blocks_bytes gauge
cortex_ingester_tsdb_storage_blocks_bytes{user="user1"} 50565120
cortex_ingester_tsdb_storage_blocks_bytes{user="user2"} 351383552
cortex_ingester_tsdb_storage_blocks_bytes{user="user3"} 4091904

# HELP cortex_ingester_tsdb_time_retentions_total The number of times that blocks were deleted because the maximum time limit was exceeded.
# TYPE cortex_ingester_tsdb_time_retentions_total counter
cortex_ingester_tsdb_time_retentions_total 33
`))
require.NoError(t, err)
}
Expand Down Expand Up @@ -325,6 +353,32 @@ func TestTSDBMetricsWithRemoval(t *testing.T) {
# HELP cortex_ingester_tsdb_mmap_chunk_corruptions_total Total number of memory-mapped TSDB chunk corruptions.
# TYPE cortex_ingester_tsdb_mmap_chunk_corruptions_total counter
cortex_ingester_tsdb_mmap_chunk_corruptions_total 2577406

# HELP cortex_ingester_tsdb_blocks_loaded Number of currently loaded data blocks
# TYPE cortex_ingester_tsdb_blocks_loaded gauge
cortex_ingester_tsdb_blocks_loaded 10

# HELP cortex_ingester_tsdb_reloads_total Number of times the database reloaded block data from disk.
# TYPE cortex_ingester_tsdb_reloads_total counter
cortex_ingester_tsdb_reloads_total 30

# HELP cortex_ingester_tsdb_reloads_failures_total Number of times the database failed to reloadBlocks block data from disk.
# TYPE cortex_ingester_tsdb_reloads_failures_total counter
cortex_ingester_tsdb_reloads_failures_total 21

# HELP cortex_ingester_tsdb_symbol_table_size_bytes Size of symbol table in memory for loaded blocks
# TYPE cortex_ingester_tsdb_symbol_table_size_bytes gauge
cortex_ingester_tsdb_symbol_table_size_bytes{user="user1"} 12641280
cortex_ingester_tsdb_symbol_table_size_bytes{user="user2"} 87845888

# HELP cortex_ingester_tsdb_storage_blocks_bytes The number of bytes that are currently used for local storage by all blocks.
# TYPE cortex_ingester_tsdb_storage_blocks_bytes gauge
cortex_ingester_tsdb_storage_blocks_bytes{user="user1"} 50565120
cortex_ingester_tsdb_storage_blocks_bytes{user="user2"} 351383552

# HELP cortex_ingester_tsdb_time_retentions_total The number of times that blocks were deleted because the maximum time limit was exceeded.
# TYPE cortex_ingester_tsdb_time_retentions_total counter
cortex_ingester_tsdb_time_retentions_total 33
`))
require.NoError(t, err)
}
Expand Down Expand Up @@ -508,5 +562,40 @@ func populateTSDBMetrics(base float64) *prometheus.Registry {
})
gcDuration.Observe(3)

loadedBlocks := promauto.With(r).NewGauge(prometheus.GaugeOpts{
Name: "prometheus_tsdb_blocks_loaded",
Help: "Number of currently loaded data blocks",
})
loadedBlocks.Set(5)

reloadsTotal := promauto.With(r).NewCounter(prometheus.CounterOpts{
Name: "prometheus_tsdb_reloads_total",
Help: "Number of times the database reloaded block data from disk.",
})
reloadsTotal.Add(10)

reloadsFailed := promauto.With(r).NewCounter(prometheus.CounterOpts{
Name: "prometheus_tsdb_reloads_failures_total",
Help: "Number of times the database failed to reloadBlocks block data from disk.",
})
reloadsFailed.Add(7)

symbolTableSize := promauto.With(r).NewGauge(prometheus.GaugeOpts{
Name: "prometheus_tsdb_symbol_table_size_bytes",
Help: "Size of symbol table in memory for loaded blocks",
})
symbolTableSize.Set(1024 * base)

blocksSize := promauto.With(r).NewGauge(prometheus.GaugeOpts{
Name: "prometheus_tsdb_storage_blocks_bytes",
Help: "The number of bytes that are currently used for local storage by all blocks.",
})
blocksSize.Set(4096 * base)

retentionsTotal := promauto.With(r).NewCounter(prometheus.CounterOpts{
Name: "prometheus_tsdb_time_retentions_total",
Help: "The number of times that blocks were deleted because the maximum time limit was exceeded.",
})
retentionsTotal.Add(11)
return r
}