Skip to content

Commit 48bc900

Browse files
Modifying #4422 to retry only for GET requests. Retrying AlertManager UnaryPath GET Requests on next replica if one fail. (#4840)
Signed-off-by: Krishna Teja Puttagunta <krishtez@amazon.com> Signed-off-by: Krishna Teja Puttagunta <krishtez@amazon.com> Signed-off-by: Krishna Teja Puttagunta <krishnateja325@gmail.com>
1 parent ad514c8 commit 48bc900

File tree

3 files changed

+47
-6
lines changed

3 files changed

+47
-6
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@
4040
* [CHANGE] Remove support for alertmanager and ruler legacy store configuration. Before upgrading, you need to convert your configuration to use the `alertmanager-storage` and `ruler-storage` configuration on the version that you're already running, then upgrade.
4141
* [CHANGE] Disables TSDB isolation. #4825
4242
* [CHANGE] Drops support Prometheus 1.x rule format on configdb. #4826
43+
* [ENHANCEMENT] AlertManager: Retrying AlertManager Get Requests (Get Alertmanager status, Get Alertmanager Receivers) on next replica on error #4840
4344
* [ENHANCEMENT] Querier/Ruler: Retry store-gateway in case of unexpected failure, instead of failing the query. #4532 #4839
4445
* [ENHANCEMENT] Ring: DoBatch prioritize 4xx errors when failing. #4783
4546
* [ENHANCEMENT] Cortex now built with Go 1.18. #4829

pkg/alertmanager/distributor.go

Lines changed: 28 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -254,14 +254,36 @@ func (d *Distributor) doUnary(userID string, w http.ResponseWriter, r *http.Requ
254254
defer sp.Finish()
255255
// Until we have a mechanism to combine the results from multiple alertmanagers,
256256
// we forward the request to only only of the alertmanagers.
257-
amDesc := replicationSet.Instances[rand.Intn(len(replicationSet.Instances))]
258-
resp, err := d.doRequest(ctx, amDesc, req)
259-
if err != nil {
260-
respondFromError(err, w, logger)
261-
return
257+
258+
var instances []ring.InstanceDesc
259+
if req.GetMethod() == "GET" && d.isUnaryReadPath(r.URL.Path) {
260+
instances = replicationSet.Instances
261+
// Randomize the list of instances to not always query the same one.
262+
rand.Shuffle(len(instances), func(i, j int) {
263+
instances[i], instances[j] = instances[j], instances[i]
264+
})
265+
} else {
266+
//Picking 1 instance at Random for Non-Get Unary Read requests, as shuffling through large number of instances might increase complexity
267+
randN := rand.Intn(len(replicationSet.Instances))
268+
instances = replicationSet.Instances[randN : randN+1]
262269
}
263270

264-
respondFromHTTPGRPCResponse(w, resp)
271+
var lastErr error
272+
for _, instance := range instances {
273+
resp, err := d.doRequest(ctx, instance, req)
274+
// storing the last error message
275+
if err != nil {
276+
lastErr = err
277+
continue
278+
}
279+
// Return on the first succeeded request
280+
respondFromHTTPGRPCResponse(w, resp)
281+
return
282+
}
283+
// throwing the last error if the for loop finish without succeeding
284+
if lastErr != nil {
285+
respondFromError(lastErr, w, logger)
286+
}
265287
}
266288

267289
func respondFromError(err error, w http.ResponseWriter, logger log.Logger) {

pkg/alertmanager/distributor_test.go

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -196,6 +196,24 @@ func TestDistributor_DistributeRequest(t *testing.T) {
196196
expStatusCode: http.StatusOK,
197197
expectedTotalCalls: 1,
198198
route: "/status",
199+
}, {
200+
name: "Read /status should try all alert managers on error",
201+
numAM: 3,
202+
numHappyAM: 0,
203+
replicationFactor: 3,
204+
isRead: true,
205+
expStatusCode: http.StatusInternalServerError,
206+
expectedTotalCalls: 3,
207+
route: "/status",
208+
}, {
209+
name: "Read /status is sent to 3 AM when 2 are not happy",
210+
numAM: 3,
211+
numHappyAM: 1,
212+
replicationFactor: 3,
213+
isRead: true,
214+
expStatusCode: http.StatusOK,
215+
expectedTotalCalls: 3,
216+
route: "/status",
199217
}, {
200218
name: "Write /status not supported",
201219
numAM: 5,

0 commit comments

Comments
 (0)