-
Notifications
You must be signed in to change notification settings - Fork 594
ruler: don't retry on non-retriable error #7216
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -185,26 +185,22 @@ func TestRemoteQuerier_Query(t *testing.T) { | |
}) | ||
} | ||
|
||
func TestRemoteQuerier_SendRequest(t *testing.T) { | ||
func TestRemoteQuerier_QueryRetryOnFailure(t *testing.T) { | ||
const errMsg = "this is an error" | ||
var ( | ||
successfulResponse = &httpgrpc.HTTPResponse{ | ||
Code: http.StatusOK, | ||
Headers: []*httpgrpc.Header{ | ||
{Key: "Content-Type", Values: []string{"application/json"}}, | ||
}, | ||
Body: []byte(`{ | ||
"status": "success","data": {"resultType":"vector","result":[]} | ||
}`), | ||
} | ||
erroneousResponse = &httpgrpc.HTTPResponse{ | ||
Code: http.StatusBadRequest, | ||
Headers: []*httpgrpc.Header{ | ||
{Key: "Content-Type", Values: []string{"application/json"}}, | ||
}, | ||
Body: []byte("this is an error"), | ||
} | ||
) | ||
successfulResponse := &httpgrpc.HTTPResponse{ | ||
Code: http.StatusOK, | ||
Headers: []*httpgrpc.Header{ | ||
{Key: "Content-Type", Values: []string{"application/json"}}, | ||
}, | ||
Body: []byte(`{"status": "success","data": {"resultType":"vector","result":[]}}`), | ||
} | ||
erroneousResponse := &httpgrpc.HTTPResponse{ | ||
Code: http.StatusBadRequest, | ||
Headers: []*httpgrpc.Header{ | ||
{Key: "Content-Type", Values: []string{"application/json"}}, | ||
}, | ||
Body: []byte("this is an error"), | ||
} | ||
|
||
tests := map[string]struct { | ||
response *httpgrpc.HTTPResponse | ||
|
@@ -237,6 +233,16 @@ func TestRemoteQuerier_SendRequest(t *testing.T) { | |
expectedError: status.Error(codes.DeadlineExceeded, context.DeadlineExceeded.Error()), | ||
expectedRetries: true, | ||
}, | ||
"gRPC ResourceExhausted error is retried": { | ||
err: status.Error(codes.ResourceExhausted, errMsg), | ||
expectedError: status.Error(codes.ResourceExhausted, errMsg), | ||
expectedRetries: true, | ||
}, | ||
"errors about execeeding gRPC server's limit are not retried": { | ||
err: status.Error(codes.ResourceExhausted, "trying to send message larger than max"), | ||
expectedError: status.Error(codes.ResourceExhausted, "trying to send message larger than max"), | ||
expectedRetries: false, | ||
}, | ||
"errors with code 4xx are not retried": { | ||
err: httpgrpc.Errorf(http.StatusBadRequest, errMsg), | ||
expectedError: httpgrpc.Errorf(http.StatusBadRequest, errMsg), | ||
|
@@ -257,10 +263,7 @@ func TestRemoteQuerier_SendRequest(t *testing.T) { | |
} | ||
for testName, testCase := range tests { | ||
t.Run(testName, func(t *testing.T) { | ||
var ( | ||
inReq *httpgrpc.HTTPRequest | ||
count atomic.Int64 | ||
) | ||
var count atomic.Int64 | ||
|
||
ctx, cancel := context.WithCancel(context.Background()) | ||
mockClientFn := func(ctx context.Context, req *httpgrpc.HTTPRequest, _ ...grpc.CallOption) (*httpgrpc.HTTPResponse, error) { | ||
|
@@ -275,7 +278,7 @@ func TestRemoteQuerier_SendRequest(t *testing.T) { | |
} | ||
q := NewRemoteQuerier(mockHTTPGRPCClient(mockClientFn), time.Minute, formatJSON, "/prometheus", log.NewNopLogger()) | ||
require.Equal(t, int64(0), count.Load()) | ||
_, err := q.sendRequest(ctx, inReq, log.NewNopLogger()) | ||
_, err := q.Query(ctx, "qs", time.Now()) | ||
if testCase.err == nil { | ||
if testCase.expectedError == nil { | ||
require.NoError(t, err) | ||
|
@@ -717,63 +720,6 @@ func TestRemoteQuerier_QueryReqTimeout(t *testing.T) { | |
require.Error(t, err) | ||
} | ||
|
||
func TestRemoteQuerier_BackoffRetry(t *testing.T) { | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Note: these tests duplicate what is already covered in `TestRemoteQuerier_QueryRetryOnFailure`. |
||
tcs := map[string]struct { | ||
failedRequests int | ||
expectedError string | ||
requestDeadline time.Duration | ||
}{ | ||
"succeed on failed requests <= max retries": { | ||
failedRequests: maxRequestRetries, | ||
}, | ||
"fail on failed requests > max retries": { | ||
failedRequests: maxRequestRetries + 1, | ||
expectedError: "failed request: 4", | ||
}, | ||
"return last known error on context cancellation": { | ||
failedRequests: 1, | ||
requestDeadline: 50 * time.Millisecond, // force context cancellation while waiting for retry | ||
expectedError: "context deadline exceeded while retrying request, last err was: failed request: 1", | ||
}, | ||
} | ||
for tn, tc := range tcs { | ||
t.Run(tn, func(t *testing.T) { | ||
retries := 0 | ||
mockClientFn := func(ctx context.Context, req *httpgrpc.HTTPRequest, _ ...grpc.CallOption) (*httpgrpc.HTTPResponse, error) { | ||
retries++ | ||
if retries <= tc.failedRequests { | ||
return nil, fmt.Errorf("failed request: %d", retries) | ||
} | ||
return &httpgrpc.HTTPResponse{ | ||
Code: http.StatusOK, | ||
Headers: []*httpgrpc.Header{ | ||
{Key: "Content-Type", Values: []string{"application/json"}}, | ||
}, | ||
Body: []byte(`{ | ||
"status": "success","data": {"resultType":"vector","result":[{"metric":{"foo":"bar"},"value":[1,"773054.5916666666"]}]} | ||
}`), | ||
}, nil | ||
} | ||
q := NewRemoteQuerier(mockHTTPGRPCClient(mockClientFn), time.Minute, formatJSON, "/prometheus", log.NewNopLogger()) | ||
|
||
ctx := context.Background() | ||
if tc.requestDeadline > 0 { | ||
var cancelFn context.CancelFunc | ||
ctx, cancelFn = context.WithTimeout(ctx, tc.requestDeadline) | ||
defer cancelFn() | ||
} | ||
|
||
resp, err := q.Query(ctx, "qs", time.Now()) | ||
if tc.expectedError != "" { | ||
require.EqualError(t, err, tc.expectedError) | ||
} else { | ||
require.NotNil(t, resp) | ||
require.Len(t, resp, 1) | ||
} | ||
}) | ||
} | ||
} | ||
|
||
func TestRemoteQuerier_StatusErrorResponses(t *testing.T) { | ||
var ( | ||
errorResp = &httpgrpc.HTTPResponse{ | ||
|
@@ -785,8 +731,8 @@ func TestRemoteQuerier_StatusErrorResponses(t *testing.T) { | |
"status": "error","errorType": "execution" | ||
}`), | ||
} | ||
error4xx = status.Error(http.StatusUnprocessableEntity, "this is a 4xx error") | ||
error5xx = status.Error(http.StatusInternalServerError, "this is a 5xx error") | ||
error4xx = httpgrpc.Errorf(http.StatusUnprocessableEntity, "this is a 4xx error") | ||
error5xx = httpgrpc.Errorf(http.StatusInternalServerError, "this is a 5xx error") | ||
Comment on lines
+734
to
+735
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Note: these are outside the scope of changes. It was just confusing, while I was reading through the code, that we use non-gRPC codes in gRPC statuses, while their docs explicitly ask against doing that (I'm now aware that `httpgrpc` intentionally encodes HTTP status codes this way). |
||
) | ||
testCases := map[string]struct { | ||
resp *httpgrpc.HTTPResponse | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
As discussed via Slack, the initial idea was to make all errors with the `codes.ResourceExhausted` status code fail. Some of these errors come from gRPC itself (e.g., https://github.com/grpc/grpc-go/blob/84b85babc00b4b8460e53b6ee110bfb49e9311cf/rpc_util.go#L610-L615). On the other hand, there are some errors with the `codes.ResourceExhausted` status code returned by distributors and ingesters that should be retried. These cases are:

- an error that has the `codes.ResourceExhausted` status code, but also contains a `mimirpb.ErrorDetail` with cause `mimirpb.TOO_BUSY`, and should always be retried
- an error that has the `codes.ResourceExhausted` status code, but also contains a `mimirpb.ErrorDetail` with cause `mimirpb.INGESTION_RATE_LIMITED`, and should be retried if distributor's `-distributor.service-overload-status-code-on-rate-limit-enabled` is set to true
- an error that has the `codes.ResourceExhausted` status code, but also contains a `mimirpb.ErrorDetail` with cause `mimirpb.REQUEST_RATE_LIMITED`, and should be retried if distributor's `-distributor.service-overload-status-code-on-rate-limit-enabled` is set to true

So I recommend to check whether the error contains a detail of type `mimirpb.ErrorDetails` with one of the causes mentioned above, and to retry when needed. An example:

IMPORTANT: this code is just an example, and it doesn't take into account the `-distributor.service-overload-status-code-on-rate-limit-enabled` flag.
flag.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
After another chat with @narqo I actually realized that `mimirpb.INGESTION_RATE_LIMITED` and `mimirpb.REQUEST_RATE_LIMITED` could be thrown on the write path only, while the present PR concerns the read path. Therefore, the `-distributor.service-overload-status-code-on-rate-limit-enabled` CLI flag is not important, and the implementation can be simplified.