
Modifying https://github.com/cortexproject/cortex/pull/4422 to retry only for GET Requests #4840

Merged
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -40,6 +40,7 @@
* [CHANGE] Remove support for alertmanager and ruler legacy store configuration. Before upgrading, you need to convert your configuration to use the `alertmanager-storage` and `ruler-storage` configuration on the version that you're already running, then upgrade.
* [CHANGE] Disables TSDB isolation. #4825
* [CHANGE] Drops support for the Prometheus 1.x rule format on configdb. #4826
* [ENHANCEMENT] Alertmanager: Retry Alertmanager GET requests (Get Alertmanager status, Get Alertmanager Receivers) on the next replica on error. #4840
* [ENHANCEMENT] Querier/Ruler: Retry store-gateway in case of unexpected failure, instead of failing the query. #4532 #4839
* [ENHANCEMENT] Ring: DoBatch prioritizes 4xx errors when failing. #4783
* [ENHANCEMENT] Cortex is now built with Go 1.18. #4829
34 changes: 28 additions & 6 deletions pkg/alertmanager/distributor.go
@@ -254,14 +254,36 @@ func (d *Distributor) doUnary(userID string, w http.ResponseWriter, r *http.Requ
	defer sp.Finish()
	// Until we have a mechanism to combine the results from multiple alertmanagers,
	// we forward the request to only one of the alertmanagers.
	amDesc := replicationSet.Instances[rand.Intn(len(replicationSet.Instances))]
	resp, err := d.doRequest(ctx, amDesc, req)
	if err != nil {
		respondFromError(err, w, logger)
		return

	var instances []ring.InstanceDesc
	if req.GetMethod() == "GET" && d.isUnaryReadPath(r.URL.Path) {
		instances = replicationSet.Instances
		// Randomize the list of instances so we do not always query the same one.
		rand.Shuffle(len(instances), func(i, j int) {
			instances[i], instances[j] = instances[j], instances[i]
		})
	} else {
		// Pick a single instance at random for requests that are not unary GET reads;
		// shuffling a large number of instances would only add overhead here.
		randN := rand.Intn(len(replicationSet.Instances))
		instances = replicationSet.Instances[randN : randN+1]
	}

	respondFromHTTPGRPCResponse(w, resp)
	var lastErr error
	for _, instance := range instances {
		resp, err := d.doRequest(ctx, instance, req)
		// Keep track of the last error seen.
		if err != nil {
			lastErr = err
			continue
		}
		// Return on the first successful response.
		respondFromHTTPGRPCResponse(w, resp)
		return
	}
	// Respond with the last error if no instance succeeded.
	if lastErr != nil {
		respondFromError(lastErr, w, logger)
	}
}

func respondFromError(err error, w http.ResponseWriter, logger log.Logger) {
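Note: the read-path check above relies on d.isUnaryReadPath, which is defined elsewhere in pkg/alertmanager and is not part of this diff. A minimal sketch of what such a helper might look like, assuming it simply matches the unary read endpoints exercised by the tests below and that the strings package is imported; the actual implementation may differ:

// isUnaryReadPath reports whether the request targets a read-only unary
// endpoint, i.e. one that is safe to retry against another replica.
// Sketch only: the path suffixes are an assumption based on the test routes.
func (d *Distributor) isUnaryReadPath(path string) bool {
	return strings.HasSuffix(path, "/status") ||
		strings.HasSuffix(path, "/receivers")
}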
18 changes: 18 additions & 0 deletions pkg/alertmanager/distributor_test.go
@@ -196,6 +196,24 @@ func TestDistributor_DistributeRequest(t *testing.T) {
			expStatusCode:      http.StatusOK,
			expectedTotalCalls: 1,
			route:              "/status",
		}, {
			name:               "Read /status should try all alert managers on error",
			numAM:              3,
			numHappyAM:         0,
			replicationFactor:  3,
			isRead:             true,
			expStatusCode:      http.StatusInternalServerError,
			expectedTotalCalls: 3,
			route:              "/status",
		}, {
			name:               "Read /status is sent to 3 AM when 2 are not happy",
			numAM:              3,
			numHappyAM:         1,
			replicationFactor:  3,
			isRead:             true,
			expStatusCode:      http.StatusOK,
			expectedTotalCalls: 3,
			route:              "/status",
		}, {
			name:  "Write /status not supported",
			numAM: 5,
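The two new cases above assert the retry semantics directly from the table: with three replicas and none healthy, all three are tried and the request fails with a 500; with one healthy replica among three, the request still returns 200. A small self-contained illustration of the retry loop itself (not the actual test harness; trySequentially and the in-line fakes are made up for this sketch):

package main

import (
	"errors"
	"fmt"
)

// trySequentially mirrors the retry loop in doUnary: call each instance in
// order, return on the first success, and fall back to the last error.
func trySequentially(calls []func() (int, error)) (int, int, error) {
	var lastErr error
	attempts := 0
	for _, call := range calls {
		attempts++
		code, err := call()
		if err != nil {
			lastErr = err
			continue
		}
		return code, attempts, nil
	}
	return 500, attempts, lastErr
}

func main() {
	unhappy := func() (int, error) { return 0, errors.New("alertmanager unavailable") }
	happy := func() (int, error) { return 200, nil }

	// Three replicas, none healthy: three attempts, the request fails with the last error.
	fmt.Println(trySequentially([]func() (int, error){unhappy, unhappy, unhappy})) // 500 3 alertmanager unavailable

	// Three replicas, one healthy (tried last here): three attempts, the request succeeds.
	fmt.Println(trySequentially([]func() (int, error){unhappy, unhappy, happy})) // 200 3 <nil>
}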