
Commit 1aedbee
add context reasons to alertmanager notifications_failed_total metric
Signed-off-by: Walther Lee <walther.lee@reddit.com>
Walther Lee committed Dec 5, 2023
1 parent 8348683 commit 1aedbee
Showing 3 changed files with 59 additions and 9 deletions.
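
With this change, notification failures caused by a canceled context or an expired deadline are counted under their own reason label values instead of the default bucket. Scraped from Alertmanager, the affected series would then look roughly like the sample below; the counter is the alertmanager_notifications_failed_total metric named in the commit title, and the label names are an assumption inferred from the two label values used in the test added by this commit (integration name plus reason string).

alertmanager_notifications_failed_total{integration="webhook",reason="contextCanceled"} 3
alertmanager_notifications_failed_total{integration="webhook",reason="contextDeadlineExceeded"} 1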
27 changes: 19 additions & 8 deletions notify/notify.go
@@ -790,6 +790,11 @@ func (r RetryStage) exec(ctx context.Context, l log.Logger, alerts ...*types.Alert
         case <-ctx.Done():
             if iErr == nil {
                 iErr = ctx.Err()
+                if errors.Is(iErr, context.Canceled) {
+                    iErr = NewErrorWithReason(ContextCanceledReason, iErr)
+                } else if errors.Is(iErr, context.DeadlineExceeded) {
+                    iErr = NewErrorWithReason(ContextDeadlineExceededReason, iErr)
+                }
             }
 
             return ctx, nil, errors.Wrapf(iErr, "%s/%s: notify retry canceled after %d attempts", r.groupName, r.integration.String(), i)
@@ -808,14 +813,15 @@ func (r RetryStage) exec(ctx context.Context, l log.Logger, alerts ...*types.Alert
             if !retry {
                 return ctx, alerts, errors.Wrapf(err, "%s/%s: notify retry canceled due to unrecoverable error after %d attempts", r.groupName, r.integration.String(), i)
             }
-            if ctx.Err() == nil && (iErr == nil || err.Error() != iErr.Error()) {
-                // Log the error if the context isn't done and the error isn't the same as before.
-                level.Warn(l).Log("msg", "Notify attempt failed, will retry later", "attempts", i, "err", err)
+            if ctx.Err() == nil {
+                if iErr == nil || err.Error() != iErr.Error() {
+                    // Log the error if the context isn't done and the error isn't the same as before.
+                    level.Warn(l).Log("msg", "Notify attempt failed, will retry later", "attempts", i, "err", err)
+                }
+                // Save this error to be able to return the last seen error by an
+                // integration upon context timeout.
+                iErr = err
             }
-
-            // Save this error to be able to return the last seen error by an
-            // integration upon context timeout.
-            iErr = err
         } else {
             lvl := level.Info(l)
             if i <= 1 {
@@ -827,7 +833,12 @@ func (r RetryStage) exec(ctx context.Context, l log.Logger, alerts ...*types.Alert
             }
         case <-ctx.Done():
             if iErr == nil {
-                iErr = ctx.Err()
+                iErr := ctx.Err()
+                if errors.Is(iErr, context.Canceled) {
+                    iErr = NewErrorWithReason(ContextCanceledReason, iErr)
+                } else if errors.Is(iErr, context.DeadlineExceeded) {
+                    iErr = NewErrorWithReason(ContextDeadlineExceededReason, iErr)
+                }
             }
 
             return ctx, nil, errors.Wrapf(iErr, "%s/%s: notify retry canceled after %d attempts", r.groupName, r.integration.String(), i)

Check failure (GitHub Actions / lint) on line 838 in notify/notify.go: ineffectual assignment to iErr (ineffassign)
Check failure (GitHub Actions / lint) on line 840 in notify/notify.go: ineffectual assignment to iErr (ineffassign)
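
The two lint failures above point at the iErr := ctx.Err() line in this second <-ctx.Done() branch: the := declares a new iErr that shadows the variable later passed to errors.Wrapf, so both NewErrorWithReason assignments are lost when the inner scope ends and the reason never reaches the returned error. A minimal sketch of the presumably intended version, which simply mirrors the first hunk by assigning to the existing variable with =:

        case <-ctx.Done():
            if iErr == nil {
                iErr = ctx.Err() // plain assignment, no shadowing, so the wrapped error survives
                if errors.Is(iErr, context.Canceled) {
                    iErr = NewErrorWithReason(ContextCanceledReason, iErr)
                } else if errors.Is(iErr, context.DeadlineExceeded) {
                    iErr = NewErrorWithReason(ContextDeadlineExceededReason, iErr)
                }
            }

            return ctx, nil, errors.Wrapf(iErr, "%s/%s: notify retry canceled after %d attempts", r.groupName, r.integration.String(), i)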
33 changes: 33 additions & 0 deletions notify/notify_test.go
@@ -469,6 +469,39 @@ func TestRetryStageWithErrorCode(t *testing.T) {
     }
 }
 
+func TestRetryStageWithContextCanceled(t *testing.T) {
+    ctx, cancel := context.WithCancel(context.Background())
+
+    i := Integration{
+        name: "test",
+        notifier: notifierFunc(func(ctx context.Context, alerts ...*types.Alert) (bool, error) {
+            cancel()
+            return true, errors.New("request failed: context canceled")
+        }),
+        rs: sendResolved(false),
+    }
+    r := NewRetryStage(i, "", NewMetrics(prometheus.NewRegistry(), featurecontrol.NoopFlags{}))
+
+    alerts := []*types.Alert{
+        {
+            Alert: model.Alert{
+                EndsAt: time.Now().Add(time.Hour),
+            },
+        },
+    }
+
+    ctx = WithFiringAlerts(ctx, []uint64{0})
+
+    // Notify with a non-recoverable error.
+    resctx, _, err := r.Exec(ctx, log.NewNopLogger(), alerts...)
+    counter := r.metrics.numTotalFailedNotifications
+
+    require.Equal(t, 1, int(prom_testutil.ToFloat64(counter.WithLabelValues(r.integration.Name(), ContextCanceledReason.String()))))
+
+    require.NotNil(t, err)
+    require.NotNil(t, resctx)
+}
+
 func TestRetryStageNoResolved(t *testing.T) {
     sent := []*types.Alert{}
     i := Integration{
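
Only the cancellation path gets a regression test here. A companion test for the deadline path might look roughly like the sketch below; it is not part of this commit, it reuses the notifierFunc, sendResolved, and WithFiringAlerts helpers from the existing test file, and it assumes that an already-expired deadline makes the retry stage give up with a contextDeadlineExceeded reason.

func TestRetryStageWithContextDeadlineExceeded(t *testing.T) {
    // The deadline is already in the past, so the retry loop should stop at its first context check.
    ctx, cancel := context.WithDeadline(context.Background(), time.Now().Add(-time.Second))
    defer cancel()

    i := Integration{
        name: "test",
        notifier: notifierFunc(func(ctx context.Context, alerts ...*types.Alert) (bool, error) {
            return true, errors.New("request failed: context deadline exceeded")
        }),
        rs: sendResolved(false),
    }
    r := NewRetryStage(i, "", NewMetrics(prometheus.NewRegistry(), featurecontrol.NoopFlags{}))

    alerts := []*types.Alert{
        {
            Alert: model.Alert{
                EndsAt: time.Now().Add(time.Hour),
            },
        },
    }

    ctx = WithFiringAlerts(ctx, []uint64{0})

    resctx, _, err := r.Exec(ctx, log.NewNopLogger(), alerts...)
    require.NotNil(t, err)
    require.NotNil(t, resctx)

    counter := r.metrics.numTotalFailedNotifications
    require.Equal(t, 1, int(prom_testutil.ToFloat64(counter.WithLabelValues(r.integration.Name(), ContextDeadlineExceededReason.String()))))
}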
8 changes: 7 additions & 1 deletion notify/util.go
@@ -270,6 +270,8 @@ const (
     DefaultReason Reason = iota
     ClientErrorReason
     ServerErrorReason
+    ContextCanceledReason
+    ContextDeadlineExceededReason
 )
 
 func (s Reason) String() string {
@@ -280,13 +282,17 @@ func (s Reason) String() string {
         return "clientError"
     case ServerErrorReason:
         return "serverError"
+    case ContextCanceledReason:
+        return "contextCanceled"
+    case ContextDeadlineExceededReason:
+        return "contextDeadlineExceeded"
     default:
         panic(fmt.Sprintf("unknown Reason: %d", s))
     }
 }
 
 // possibleFailureReasonCategory is a list of possible failure reason.
-var possibleFailureReasonCategory = []string{DefaultReason.String(), ClientErrorReason.String(), ServerErrorReason.String()}
+var possibleFailureReasonCategory = []string{DefaultReason.String(), ClientErrorReason.String(), ServerErrorReason.String(), ContextCanceledReason.String(), ContextDeadlineExceededReason.String()}
 
 // GetFailureReasonFromStatusCode returns the reason for the failure based on the status code provided.
 func GetFailureReasonFromStatusCode(statusCode int) Reason {
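
For a self-contained picture of the classification these new Reason values enable, the sketch below reproduces the errors.Is checks that notify.go now performs together with the label strings defined above. The Reason type and its String method here are simplified stand-ins for the real ones in notify/util.go, and the hand-off of the wrapped error to the metric (via NewErrorWithReason) happens elsewhere in the notify package; this is only an illustration.

package main

import (
    "context"
    "errors"
    "fmt"
    "time"
)

// Simplified stand-in for notify.Reason.
type Reason int

const (
    DefaultReason Reason = iota
    ContextCanceledReason
    ContextDeadlineExceededReason
)

func (r Reason) String() string {
    switch r {
    case ContextCanceledReason:
        return "contextCanceled"
    case ContextDeadlineExceededReason:
        return "contextDeadlineExceeded"
    default:
        return "default" // the real DefaultReason string lives in notify/util.go, outside this diff
    }
}

// classify mirrors the errors.Is checks added to RetryStage.exec in this commit.
func classify(err error) Reason {
    switch {
    case errors.Is(err, context.Canceled):
        return ContextCanceledReason
    case errors.Is(err, context.DeadlineExceeded):
        return ContextDeadlineExceededReason
    default:
        return DefaultReason
    }
}

func main() {
    canceled, cancel := context.WithCancel(context.Background())
    cancel()

    expired, cancelExpired := context.WithTimeout(context.Background(), time.Nanosecond)
    defer cancelExpired()
    <-expired.Done()

    fmt.Println(classify(canceled.Err())) // contextCanceled
    fmt.Println(classify(expired.Err()))  // contextDeadlineExceeded
}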