Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

*: Use exponential buckets for histogram metrics #1545

Merged
merged 5 commits into from
Nov 14, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 1 addition & 3 deletions cmd/thanos/query.go
Original file line number Diff line number Diff line change
Expand Up @@ -168,9 +168,7 @@ func registerQuery(m map[string]setupFunc, app *kingpin.Application) {
func storeClientGRPCOpts(logger log.Logger, reg *prometheus.Registry, tracer opentracing.Tracer, secure bool, cert, key, caCert, serverName string) ([]grpc.DialOption, error) {
grpcMets := grpc_prometheus.NewClientMetrics()
grpcMets.EnableClientHandlingTimeHistogram(
grpc_prometheus.WithHistogramBuckets([]float64{
0.001, 0.01, 0.05, 0.1, 0.2, 0.4, 0.8, 1.6, 3.2, 6.4,
}),
grpc_prometheus.WithHistogramBuckets(prometheus.ExponentialBuckets(0.001, 2, 15)),
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Before:

grpc_server_handling_seconds_bucket{grpc_method="Series",grpc_service="thanos.Store",grpc_type="server_stream",le="0.001"} 0
grpc_server_handling_seconds_bucket{grpc_method="Series",grpc_service="thanos.Store",grpc_type="server_stream",le="0.01"} 0
grpc_server_handling_seconds_bucket{grpc_method="Series",grpc_service="thanos.Store",grpc_type="server_stream",le="0.05"} 0
grpc_server_handling_seconds_bucket{grpc_method="Series",grpc_service="thanos.Store",grpc_type="server_stream",le="0.1"} 0
grpc_server_handling_seconds_bucket{grpc_method="Series",grpc_service="thanos.Store",grpc_type="server_stream",le="0.2"} 0
grpc_server_handling_seconds_bucket{grpc_method="Series",grpc_service="thanos.Store",grpc_type="server_stream",le="0.4"} 0
grpc_server_handling_seconds_bucket{grpc_method="Series",grpc_service="thanos.Store",grpc_type="server_stream",le="0.8"} 0
grpc_server_handling_seconds_bucket{grpc_method="Series",grpc_service="thanos.Store",grpc_type="server_stream",le="1.6"} 0
grpc_server_handling_seconds_bucket{grpc_method="Series",grpc_service="thanos.Store",grpc_type="server_stream",le="3.2"} 0
grpc_server_handling_seconds_bucket{grpc_method="Series",grpc_service="thanos.Store",grpc_type="server_stream",le="6.4"} 0
grpc_server_handling_seconds_bucket{grpc_method="Series",grpc_service="thanos.Store",grpc_type="server_stream",le="+Inf"} 0

After:

grpc_server_handling_seconds_bucket{grpc_method="Series",grpc_service="thanos.Store",grpc_type="server_stream",le="0.001"} 0
grpc_server_handling_seconds_bucket{grpc_method="Series",grpc_service="thanos.Store",grpc_type="server_stream",le="0.002"} 0
grpc_server_handling_seconds_bucket{grpc_method="Series",grpc_service="thanos.Store",grpc_type="server_stream",le="0.004"} 0
grpc_server_handling_seconds_bucket{grpc_method="Series",grpc_service="thanos.Store",grpc_type="server_stream",le="0.008"} 0
grpc_server_handling_seconds_bucket{grpc_method="Series",grpc_service="thanos.Store",grpc_type="server_stream",le="0.016"} 0
grpc_server_handling_seconds_bucket{grpc_method="Series",grpc_service="thanos.Store",grpc_type="server_stream",le="0.032"} 0
grpc_server_handling_seconds_bucket{grpc_method="Series",grpc_service="thanos.Store",grpc_type="server_stream",le="0.064"} 0
grpc_server_handling_seconds_bucket{grpc_method="Series",grpc_service="thanos.Store",grpc_type="server_stream",le="0.128"} 0
grpc_server_handling_seconds_bucket{grpc_method="Series",grpc_service="thanos.Store",grpc_type="server_stream",le="0.256"} 0
grpc_server_handling_seconds_bucket{grpc_method="Series",grpc_service="thanos.Store",grpc_type="server_stream",le="0.512"} 0
grpc_server_handling_seconds_bucket{grpc_method="Series",grpc_service="thanos.Store",grpc_type="server_stream",le="1.024"} 0
grpc_server_handling_seconds_bucket{grpc_method="Series",grpc_service="thanos.Store",grpc_type="server_stream",le="2.048"} 0
grpc_server_handling_seconds_bucket{grpc_method="Series",grpc_service="thanos.Store",grpc_type="server_stream",le="4.096"} 0
grpc_server_handling_seconds_bucket{grpc_method="Series",grpc_service="thanos.Store",grpc_type="server_stream",le="8.192"} 0
grpc_server_handling_seconds_bucket{grpc_method="Series",grpc_service="thanos.Store",grpc_type="server_stream",le="16.384"} 0
grpc_server_handling_seconds_bucket{grpc_method="Series",grpc_service="thanos.Store",grpc_type="server_stream",le="+Inf"} 0

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

An example distribution for the existing buckets, from a real-life system.

sum(grpc_server_handling_seconds_bucket{job=~"thanos-store.*", grpc_type="server_stream"}) by (le)
{le="6.4"} | 158
{le="0.05"} | 2
{le="0.1"} | 5
{le="0.2"} | 13
{le="0.4"} | 34
{le="0.8"} | 62
{le="+Inf"} | 187
{le="0.001"} | 0
{le="0.01"} | 0
{le="1.6"} | 103
{le="3.2"} | 133

)
dialOpts := []grpc.DialOption{
// We want to make sure that we can receive huge gRPC messages from storeAPI.
Expand Down
16 changes: 6 additions & 10 deletions pkg/compact/compact.go
Original file line number Diff line number Diff line change
Expand Up @@ -81,11 +81,9 @@ func newSyncerMetrics(reg prometheus.Registerer) *syncerMetrics {
Help: "Total number of failed sync meta operations.",
})
m.syncMetaDuration = prometheus.NewHistogram(prometheus.HistogramOpts{
Name: "thanos_compact_sync_meta_duration_seconds",
Help: "Time it took to sync meta files.",
Buckets: []float64{
0.25, 0.6, 1, 2, 3.5, 5, 7.5, 10, 15, 30, 60, 100, 200, 500,
},
Name: "thanos_compact_sync_meta_duration_seconds",
Help: "Time it took to sync meta files.",
Buckets: prometheus.ExponentialBuckets(0.001, 2, 15),
})

m.garbageCollectedBlocks = prometheus.NewCounter(prometheus.CounterOpts{
Expand All @@ -101,11 +99,9 @@ func newSyncerMetrics(reg prometheus.Registerer) *syncerMetrics {
Help: "Total number of failed garbage collection operations.",
})
m.garbageCollectionDuration = prometheus.NewHistogram(prometheus.HistogramOpts{
Name: "thanos_compact_garbage_collection_duration_seconds",
Help: "Time it took to perform garbage collection iteration.",
Buckets: []float64{
0.25, 0.6, 1, 2, 3.5, 5, 7.5, 10, 15, 30, 60, 100, 200, 500,
},
Name: "thanos_compact_garbage_collection_duration_seconds",
Help: "Time it took to perform garbage collection iteration.",
Buckets: prometheus.ExponentialBuckets(0.001, 2, 15),
})

m.compactions = prometheus.NewCounterVec(prometheus.CounterOpts{
Expand Down
5 changes: 3 additions & 2 deletions pkg/extprom/http/instrument_server.go
Original file line number Diff line number Diff line change
Expand Up @@ -39,8 +39,9 @@ func NewInstrumentationMiddleware(reg prometheus.Registerer) InstrumentationMidd
ins := defaultInstrumentationMiddleware{
requestDuration: prometheus.NewHistogramVec(
prometheus.HistogramOpts{
Name: "http_request_duration_seconds",
Help: "Tracks the latencies for HTTP requests.",
Name: "http_request_duration_seconds",
Help: "Tracks the latencies for HTTP requests.",
Buckets: prometheus.ExponentialBuckets(0.001, 2, 17),
},
[]string{"code", "handler", "method"},
),
Expand Down
2 changes: 1 addition & 1 deletion pkg/objstore/objstore.go
Original file line number Diff line number Diff line change
Expand Up @@ -204,7 +204,7 @@ func BucketWithMetrics(name string, b Bucket, r prometheus.Registerer) Bucket {
Name: "thanos_objstore_bucket_operation_duration_seconds",
Help: "Duration of operations against the bucket",
ConstLabels: prometheus.Labels{"bucket": name},
Buckets: []float64{0.005, 0.01, 0.02, 0.04, 0.08, 0.15, 0.3, 0.6, 1, 1.5, 2.5, 5, 10, 20, 30},
Buckets: prometheus.ExponentialBuckets(0.001, 2, 17),
}, []string{"operation"}),
lastSuccessfullUploadTime: prometheus.NewGaugeVec(prometheus.GaugeOpts{
Name: "thanos_objstore_bucket_last_successful_upload_time",
Expand Down
4 changes: 1 addition & 3 deletions pkg/server/grpc/grpc.go
Original file line number Diff line number Diff line change
Expand Up @@ -43,9 +43,7 @@ func New(logger log.Logger, reg prometheus.Registerer, tracer opentracing.Tracer

met := grpc_prometheus.NewServerMetrics()
met.EnableHandlingTimeHistogram(
grpc_prometheus.WithHistogramBuckets([]float64{
0.001, 0.01, 0.05, 0.1, 0.2, 0.4, 0.8, 1.6, 3.2, 6.4,
}),
grpc_prometheus.WithHistogramBuckets(prometheus.ExponentialBuckets(0.001, 2, 15)),
)
panicsTotal := prometheus.NewCounter(prometheus.CounterOpts{
Name: "thanos_grpc_req_panics_recovered_total",
Expand Down
16 changes: 6 additions & 10 deletions pkg/store/bucket.go
Original file line number Diff line number Diff line change
Expand Up @@ -135,18 +135,14 @@ func newBucketStoreMetrics(reg prometheus.Registerer) *bucketStoreMetrics {
Help: "Number of blocks in a bucket store that were touched to satisfy a query.",
})
m.seriesGetAllDuration = prometheus.NewHistogram(prometheus.HistogramOpts{
Name: "thanos_bucket_store_series_get_all_duration_seconds",
Help: "Time it takes until all per-block prepares and preloads for a query are finished.",
Buckets: []float64{
0.01, 0.05, 0.1, 0.25, 0.6, 1, 2, 3.5, 5, 7.5, 10, 15, 30, 60,
},
Name: "thanos_bucket_store_series_get_all_duration_seconds",
Help: "Time it takes until all per-block prepares and preloads for a query are finished.",
Buckets: prometheus.ExponentialBuckets(0.01, 2, 15),
})
m.seriesMergeDuration = prometheus.NewHistogram(prometheus.HistogramOpts{
Name: "thanos_bucket_store_series_merge_duration_seconds",
Help: "Time it takes to merge sub-results from all queried blocks into a single result.",
Buckets: []float64{
0.01, 0.05, 0.1, 0.25, 0.6, 1, 2, 3.5, 5, 7.5, 10, 15, 30, 60,
},
Name: "thanos_bucket_store_series_merge_duration_seconds",
Help: "Time it takes to merge sub-results from all queried blocks into a single result.",
Buckets: prometheus.ExponentialBuckets(0.01, 2, 15),
})
m.resultSeriesCount = prometheus.NewSummary(prometheus.SummaryOpts{
Name: "thanos_bucket_store_series_result_series",
Expand Down
8 changes: 3 additions & 5 deletions pkg/store/gate.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,11 +24,9 @@ func NewGate(maxConcurrent int, reg prometheus.Registerer) *Gate {
Help: "Number of queries that are currently in flight.",
}),
gateTiming: prometheus.NewHistogram(prometheus.HistogramOpts{
Name: "gate_duration_seconds",
Help: "How many seconds it took for queries to wait at the gate.",
Buckets: []float64{
0.01, 0.05, 0.1, 0.25, 0.6, 1, 2, 3.5, 5, 10,
},
Name: "gate_duration_seconds",
Help: "How many seconds it took for queries to wait at the gate.",
Buckets: prometheus.ExponentialBuckets(0.1, 2, 15),
}),
}

Expand Down