Skip to content

Commit 3416d5a

Browse files
wallee94Walther LeeSuperQ
authored
Add context reasons to notifications failed counter (#3631)
--------- Signed-off-by: Walther Lee <walther.lee@reddit.com> Co-authored-by: Walther Lee <walther.lee@reddit.com> Co-authored-by: Ben Kochie <superq@gmail.com>
1 parent dc14664 commit 3416d5a

File tree

3 files changed

+58
-8
lines changed

3 files changed

+58
-8
lines changed

notify/notify.go

Lines changed: 18 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -790,6 +790,11 @@ func (r RetryStage) exec(ctx context.Context, l log.Logger, alerts ...*types.Ale
790790
case <-ctx.Done():
791791
if iErr == nil {
792792
iErr = ctx.Err()
793+
if errors.Is(iErr, context.Canceled) {
794+
iErr = NewErrorWithReason(ContextCanceledReason, iErr)
795+
} else if errors.Is(iErr, context.DeadlineExceeded) {
796+
iErr = NewErrorWithReason(ContextDeadlineExceededReason, iErr)
797+
}
793798
}
794799

795800
return ctx, nil, errors.Wrapf(iErr, "%s/%s: notify retry canceled after %d attempts", r.groupName, r.integration.String(), i)
@@ -808,14 +813,15 @@ func (r RetryStage) exec(ctx context.Context, l log.Logger, alerts ...*types.Ale
808813
if !retry {
809814
return ctx, alerts, errors.Wrapf(err, "%s/%s: notify retry canceled due to unrecoverable error after %d attempts", r.groupName, r.integration.String(), i)
810815
}
811-
if ctx.Err() == nil && (iErr == nil || err.Error() != iErr.Error()) {
812-
// Log the error if the context isn't done and the error isn't the same as before.
813-
level.Warn(l).Log("msg", "Notify attempt failed, will retry later", "attempts", i, "err", err)
816+
if ctx.Err() == nil {
817+
if iErr == nil || err.Error() != iErr.Error() {
818+
// Log the error if the context isn't done and the error isn't the same as before.
819+
level.Warn(l).Log("msg", "Notify attempt failed, will retry later", "attempts", i, "err", err)
820+
}
821+
// Save this error to be able to return the last seen error by an
822+
// integration upon context timeout.
823+
iErr = err
814824
}
815-
816-
// Save this error to be able to return the last seen error by an
817-
// integration upon context timeout.
818-
iErr = err
819825
} else {
820826
lvl := level.Info(l)
821827
if i <= 1 {
@@ -828,6 +834,11 @@ func (r RetryStage) exec(ctx context.Context, l log.Logger, alerts ...*types.Ale
828834
case <-ctx.Done():
829835
if iErr == nil {
830836
iErr = ctx.Err()
837+
if errors.Is(iErr, context.Canceled) {
838+
iErr = NewErrorWithReason(ContextCanceledReason, iErr)
839+
} else if errors.Is(iErr, context.DeadlineExceeded) {
840+
iErr = NewErrorWithReason(ContextDeadlineExceededReason, iErr)
841+
}
831842
}
832843

833844
return ctx, nil, errors.Wrapf(iErr, "%s/%s: notify retry canceled after %d attempts", r.groupName, r.integration.String(), i)

notify/notify_test.go

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -469,6 +469,39 @@ func TestRetryStageWithErrorCode(t *testing.T) {
469469
}
470470
}
471471

472+
func TestRetryStageWithContextCanceled(t *testing.T) {
473+
ctx, cancel := context.WithCancel(context.Background())
474+
475+
i := Integration{
476+
name: "test",
477+
notifier: notifierFunc(func(ctx context.Context, alerts ...*types.Alert) (bool, error) {
478+
cancel()
479+
return true, errors.New("request failed: context canceled")
480+
}),
481+
rs: sendResolved(false),
482+
}
483+
r := NewRetryStage(i, "", NewMetrics(prometheus.NewRegistry(), featurecontrol.NoopFlags{}))
484+
485+
alerts := []*types.Alert{
486+
{
487+
Alert: model.Alert{
488+
EndsAt: time.Now().Add(time.Hour),
489+
},
490+
},
491+
}
492+
493+
ctx = WithFiringAlerts(ctx, []uint64{0})
494+
495+
// Notify with a non-recoverable error.
496+
resctx, _, err := r.Exec(ctx, log.NewNopLogger(), alerts...)
497+
counter := r.metrics.numTotalFailedNotifications
498+
499+
require.Equal(t, 1, int(prom_testutil.ToFloat64(counter.WithLabelValues(r.integration.Name(), ContextCanceledReason.String()))))
500+
501+
require.NotNil(t, err)
502+
require.NotNil(t, resctx)
503+
}
504+
472505
func TestRetryStageNoResolved(t *testing.T) {
473506
sent := []*types.Alert{}
474507
i := Integration{

notify/util.go

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -270,6 +270,8 @@ const (
270270
DefaultReason Reason = iota
271271
ClientErrorReason
272272
ServerErrorReason
273+
ContextCanceledReason
274+
ContextDeadlineExceededReason
273275
)
274276

275277
func (s Reason) String() string {
@@ -280,13 +282,17 @@ func (s Reason) String() string {
280282
return "clientError"
281283
case ServerErrorReason:
282284
return "serverError"
285+
case ContextCanceledReason:
286+
return "contextCanceled"
287+
case ContextDeadlineExceededReason:
288+
return "contextDeadlineExceeded"
283289
default:
284290
panic(fmt.Sprintf("unknown Reason: %d", s))
285291
}
286292
}
287293

288294
// possibleFailureReasonCategory is a list of possible failure reason.
289-
var possibleFailureReasonCategory = []string{DefaultReason.String(), ClientErrorReason.String(), ServerErrorReason.String()}
295+
var possibleFailureReasonCategory = []string{DefaultReason.String(), ClientErrorReason.String(), ServerErrorReason.String(), ContextCanceledReason.String(), ContextDeadlineExceededReason.String()}
290296

291297
// GetFailureReasonFromStatusCode returns the reason for the failure based on the status code provided.
292298
func GetFailureReasonFromStatusCode(statusCode int) Reason {

0 commit comments

Comments
 (0)