Skip to content

Commit 8175ef6

Browse files
authored
Add metrics for rejected queries in QFE (#5356)
* metrics for rejected queries Signed-off-by: Ben Ye <benye@amazon.com>
1 parent 7620194 commit 8175ef6

File tree

5 files changed

+360
-51
lines changed

5 files changed

+360
-51
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
* [FEATURE] Store Gateway: Add `max_downloaded_bytes_per_request` to limit max bytes to download per store gateway request.
88
* [FEATURE] Added 2 flags `-alertmanager.alertmanager-client.grpc-max-send-msg-size` and ` -alertmanager.alertmanager-client.grpc-max-recv-msg-size` to configure alert manager grpc client message size limits. #5338
99
* [FEATURE] Query Frontend: Add `cortex_queries_total` metric for total number of queries executed per user. #5360
10+
* [FEATURE] Query Frontend: Add `cortex_rejected_queries_total` metric for throttled queries. #5356
1011
* [ENHANCEMENT] Distributor/Ingester: Add span on push path #5319
1112
* [ENHANCEMENT] Support object storage backends for runtime configuration file. #5292
1213
* [ENHANCEMENT] Query Frontend: Reject subquery with too small step size. #5323

pkg/frontend/transport/handler.go

Lines changed: 85 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ import (
3030
)
3131

3232
const (
33-
// StatusClientClosedRequest is the status code for when a client request cancellation of an http request
33+
// StatusClientClosedRequest is the status code for when a client requests cancellation of an HTTP request
3434
StatusClientClosedRequest = 499
3535
ServiceTimingHeaderName = "Server-Timing"
3636
)
@@ -41,6 +41,33 @@ var (
4141
errRequestEntityTooLarge = httpgrpc.Errorf(http.StatusRequestEntityTooLarge, "http: request body too large")
4242
)
4343

44+
const (
45+
reasonRequestBodySizeExceeded = "request_body_size_exceeded"
46+
reasonResponseBodySizeExceeded = "response_body_size_exceeded"
47+
reasonTooManyRequests = "too_many_requests"
48+
reasonTimeRangeExceeded = "time_range_exceeded"
49+
reasonTooManySamples = "too_many_samples"
50+
reasonSeriesFetched = "series_fetched"
51+
reasonChunksFetched = "chunks_fetched"
52+
reasonChunkBytesFetched = "chunk_bytes_fetched"
53+
reasonDataBytesFetched = "data_bytes_fetched"
54+
reasonSeriesLimitStoreGateway = "store_gateway_series_limit"
55+
reasonChunksLimitStoreGateway = "store_gateway_chunks_limit"
56+
reasonBytesLimitStoreGateway = "store_gateway_bytes_limit"
57+
58+
limitTooManySamples = `query processing would load too many samples into memory`
59+
limitTimeRangeExceeded = `the query time range exceeds the limit`
60+
limitSeriesFetched = `the query hit the max number of series limit`
61+
limitChunksFetched = `the query hit the max number of chunks limit`
62+
limitChunkBytesFetched = `the query hit the aggregated chunks size limit`
63+
limitDataBytesFetched = `the query hit the aggregated data size limit`
64+
65+
// Store gateway limits.
66+
limitSeriesStoreGateway = `exceeded series limit`
67+
limitChunksStoreGateway = `exceeded chunks limit`
68+
limitBytesStoreGateway = `exceeded bytes limit`
69+
)
70+
4471
// Config for a Handler.
4572
type HandlerConfig struct {
4673
LogQueriesLongerThan time.Duration `yaml:"log_queries_longer_than"`
@@ -67,6 +94,7 @@ type Handler struct {
6794
querySeries *prometheus.CounterVec
6895
queryChunkBytes *prometheus.CounterVec
6996
queryDataBytes *prometheus.CounterVec
97+
rejectedQueries *prometheus.CounterVec
7098
activeUsers *util.ActiveUsersCleanupService
7199
}
72100

@@ -104,12 +132,23 @@ func NewHandler(cfg HandlerConfig, roundTripper http.RoundTripper, log log.Logge
104132
Help: "Size of all data fetched to execute a query in bytes.",
105133
}, []string{"user"})
106134

135+
h.rejectedQueries = prometheus.NewCounterVec(
136+
prometheus.CounterOpts{
137+
Name: "cortex_rejected_queries_total",
138+
Help: "The total number of queries that were rejected.",
139+
},
140+
[]string{"reason", "user"},
141+
)
142+
107143
h.activeUsers = util.NewActiveUsersCleanupWithDefaultValues(func(user string) {
108144
h.queriesCount.DeleteLabelValues(user)
109145
h.querySeconds.DeleteLabelValues(user)
110146
h.querySeries.DeleteLabelValues(user)
111147
h.queryChunkBytes.DeleteLabelValues(user)
112148
h.queryDataBytes.DeleteLabelValues(user)
149+
if err := util.DeleteMatchingLabels(h.rejectedQueries, map[string]string{"user": user}); err != nil {
150+
level.Warn(log).Log("msg", "failed to remove cortex_rejected_queries_total metric for user", "user", user, "err", err)
151+
}
113152
})
114153
// If the cleaner stops or fails, we will simply not clean the metrics for inactive users.
115154
_ = h.activeUsers.StartAsync(context.Background())
@@ -124,6 +163,12 @@ func (f *Handler) ServeHTTP(w http.ResponseWriter, r *http.Request) {
124163
queryString url.Values
125164
)
126165

166+
tenantIDs, err := tenant.TenantIDs(r.Context())
167+
if err != nil {
168+
return
169+
}
170+
userID := tenant.JoinTenantIDs(tenantIDs)
171+
127172
// Initialise the stats in the context and make sure it's propagated
128173
// down the request chain.
129174
if f.cfg.QueryStatsEnabled {
@@ -150,6 +195,9 @@ func (f *Handler) ServeHTTP(w http.ResponseWriter, r *http.Request) {
150195
if !strings.Contains(r.URL.Path, "api/v1/read") {
151196
if err := r.ParseForm(); err != nil {
152197
writeError(w, err)
198+
if f.cfg.QueryStatsEnabled && util.IsRequestBodyTooLarge(err) {
199+
f.rejectedQueries.WithLabelValues(reasonRequestBodySizeExceeded, userID).Inc()
200+
}
153201
return
154202
}
155203
r.Body = io.NopCloser(&buf)
@@ -168,7 +216,9 @@ func (f *Handler) ServeHTTP(w http.ResponseWriter, r *http.Request) {
168216
if shouldReportSlowQuery {
169217
f.reportSlowQuery(r, queryString, queryResponseTime)
170218
}
219+
171220
if f.cfg.QueryStatsEnabled {
221+
// Try to parse error and get status code.
172222
var statusCode int
173223
if err != nil {
174224
statusCode = getStatusCodeFromError(err)
@@ -184,7 +234,7 @@ func (f *Handler) ServeHTTP(w http.ResponseWriter, r *http.Request) {
184234
}
185235
}
186236

187-
f.reportQueryStats(r, queryString, queryResponseTime, stats, err, statusCode, resp)
237+
f.reportQueryStats(r, userID, queryString, queryResponseTime, stats, err, statusCode, resp)
188238
}
189239

190240
if err != nil {
@@ -239,12 +289,7 @@ func (f *Handler) reportSlowQuery(r *http.Request, queryString url.Values, query
239289
level.Info(util_log.WithContext(r.Context(), f.log)).Log(logMessage...)
240290
}
241291

242-
func (f *Handler) reportQueryStats(r *http.Request, queryString url.Values, queryResponseTime time.Duration, stats *querier_stats.QueryStats, error error, statusCode int, resp *http.Response) {
243-
tenantIDs, err := tenant.TenantIDs(r.Context())
244-
if err != nil {
245-
return
246-
}
247-
userID := tenant.JoinTenantIDs(tenantIDs)
292+
func (f *Handler) reportQueryStats(r *http.Request, userID string, queryString url.Values, queryResponseTime time.Duration, stats *querier_stats.QueryStats, error error, statusCode int, resp *http.Response) {
248293
wallTime := stats.LoadWallTime()
249294
numSeries := stats.LoadFetchedSeries()
250295
numChunks := stats.LoadFetchedChunks()
@@ -311,6 +356,38 @@ func (f *Handler) reportQueryStats(r *http.Request, queryString url.Values, quer
311356
} else {
312357
level.Info(util_log.WithContext(r.Context(), f.log)).Log(logMessage...)
313358
}
359+
360+
var reason string
361+
if statusCode == http.StatusTooManyRequests {
362+
reason = reasonTooManyRequests
363+
} else if statusCode == http.StatusRequestEntityTooLarge {
364+
reason = reasonResponseBodySizeExceeded
365+
} else if statusCode == http.StatusUnprocessableEntity {
366+
errMsg := error.Error()
367+
if strings.Contains(errMsg, limitTooManySamples) {
368+
reason = reasonTooManySamples
369+
} else if strings.Contains(errMsg, limitTimeRangeExceeded) {
370+
reason = reasonTimeRangeExceeded
371+
} else if strings.Contains(errMsg, limitSeriesFetched) {
372+
reason = reasonSeriesFetched
373+
} else if strings.Contains(errMsg, limitChunksFetched) {
374+
reason = reasonChunksFetched
375+
} else if strings.Contains(errMsg, limitChunkBytesFetched) {
376+
reason = reasonChunkBytesFetched
377+
} else if strings.Contains(errMsg, limitDataBytesFetched) {
378+
reason = reasonDataBytesFetched
379+
} else if strings.Contains(errMsg, limitSeriesStoreGateway) {
380+
reason = reasonSeriesLimitStoreGateway
381+
} else if strings.Contains(errMsg, limitChunksStoreGateway) {
382+
reason = reasonChunksLimitStoreGateway
383+
} else if strings.Contains(errMsg, limitBytesStoreGateway) {
384+
reason = reasonBytesLimitStoreGateway
385+
}
386+
}
387+
if len(reason) > 0 {
388+
f.rejectedQueries.WithLabelValues(reason, userID).Inc()
389+
stats.LimitHit = reason
390+
}
314391
}
315392

316393
func (f *Handler) parseRequestQueryString(r *http.Request, bodyBuf bytes.Buffer) url.Values {

0 commit comments

Comments
 (0)