From dcddc121a81b40d9917d751141f08b565e638fa4 Mon Sep 17 00:00:00 2001 From: TJ Hoplock <33664289+tjhop@users.noreply.github.com> Date: Tue, 13 Feb 2024 06:17:24 -0500 Subject: [PATCH] feat: add counter to track alerts dropped outside of time_intervals (#3565) * feat: add counter to track alerts dropped outside of time_intervals Addresses: #3512 This adds a new counter metric `alertmanager_alerts_supressed_total` that is incremented by `len(alerts)` when an alert is suppressed for being outside of a time_interval, ie inside of a mute_time_intervals or outside of an active_time_intervals. Signed-off-by: TJ Hoplock * test: add time interval suppression metric checks for notify Signed-off-by: TJ Hoplock * test: fix failure message log values in notifier Signed-off-by: TJ Hoplock * ref: address PR feedback for #3565 Signed-off-by: TJ Hoplock * fix: track suppressed notifications metric for inhibit/silence Based on PR feedback: https://github.com/prometheus/alertmanager/pull/3565/files#r1393068026 Signed-off-by: TJ Hoplock * fix: broken notifier tests - fixed metric count check to properly check the diff between input/output notifications from the suppression to compare to suppression metric, was previously inverted to compare to how many notifications it suppressed. - stopped using `Reset()` to compare collection counts between the multiple stages that are executed in `TestMuteStageWithSilences()`. the intent was to compare a clean metric collection after each stage execution, but the final stage where all silences are lifted results in no metric being created in the test, causing `prom_testutil.ToFloat64()` to panic. changed to separate vars to check counts between each stage, with care to consider prior counts. Signed-off-by: TJ Hoplock * rename metric and add constants Signed-off-by: gotjosh --------- Signed-off-by: TJ Hoplock Signed-off-by: gotjosh Co-authored-by: gotjosh Signed-off-by: Gokhan Sari --- notify/notify.go | 58 ++++++++++++++++++++++++++++++++----------- notify/notify_test.go | 45 +++++++++++++++++++++++++++++---- 2 files changed, 83 insertions(+), 20 deletions(-) diff --git a/notify/notify.go b/notify/notify.go index 0a2b0d032b..30861a3027 100644 --- a/notify/notify.go +++ b/notify/notify.go @@ -251,6 +251,7 @@ type Metrics struct { numTotalFailedNotifications *prometheus.CounterVec numNotificationRequestsTotal *prometheus.CounterVec numNotificationRequestsFailedTotal *prometheus.CounterVec + numNotificationSuppressedTotal *prometheus.CounterVec notificationLatencySeconds *prometheus.HistogramVec ff featurecontrol.Flagger @@ -284,6 +285,11 @@ func NewMetrics(r prometheus.Registerer, ff featurecontrol.Flagger) *Metrics { Name: "notification_requests_failed_total", Help: "The total number of failed notification requests.", }, labels), + numNotificationSuppressedTotal: prometheus.NewCounterVec(prometheus.CounterOpts{ + Namespace: "alertmanager", + Name: "notifications_suppressed_total", + Help: "The total number of notifications suppressed for being outside of active time intervals or within muted time intervals.", + }, []string{"reason"}), notificationLatencySeconds: prometheus.NewHistogramVec(prometheus.HistogramOpts{ Namespace: "alertmanager", Name: "notification_latency_seconds", @@ -296,7 +302,7 @@ func NewMetrics(r prometheus.Registerer, ff featurecontrol.Flagger) *Metrics { r.MustRegister( m.numNotifications, m.numTotalFailedNotifications, m.numNotificationRequestsTotal, m.numNotificationRequestsFailedTotal, - m.notificationLatencySeconds, + m.numNotificationSuppressedTotal, m.notificationLatencySeconds, ) return m @@ -381,10 +387,10 @@ func (pb *PipelineBuilder) New( rs := make(RoutingStage, len(receivers)) ms := NewGossipSettleStage(peer) - is := NewMuteStage(inhibitor) - tas := NewTimeActiveStage(intervener) - tms := NewTimeMuteStage(intervener) - ss := NewMuteStage(silencer) + is := NewMuteStage(inhibitor, pb.metrics) + tas := NewTimeActiveStage(intervener, pb.metrics) + tms := NewTimeMuteStage(intervener, pb.metrics) + ss := NewMuteStage(silencer, pb.metrics) for name := range receivers { st := createReceiverStage(name, receivers[name], wait, notificationLog, pb.metrics) @@ -507,14 +513,22 @@ func (n *GossipSettleStage) Exec(ctx context.Context, _ log.Logger, alerts ...*t return ctx, alerts, nil } +const ( + suppressedReasonSilence = "silence" + suppressedReasonInhibition = "inhibition" + suppressedReasonMuteTimeInterval = "mute_time_interval" + suppressedReasonActiveTimeInterval = "active_time_interval" +) + // MuteStage filters alerts through a Muter. type MuteStage struct { - muter types.Muter + muter types.Muter + metrics *Metrics } // NewMuteStage return a new MuteStage. -func NewMuteStage(m types.Muter) *MuteStage { - return &MuteStage{muter: m} +func NewMuteStage(m types.Muter, metrics *Metrics) *MuteStage { + return &MuteStage{muter: m, metrics: metrics} } // Exec implements the Stage interface. @@ -535,7 +549,18 @@ func (n *MuteStage) Exec(ctx context.Context, logger log.Logger, alerts ...*type } if len(muted) > 0 { level.Debug(logger).Log("msg", "Notifications will not be sent for muted alerts", "alerts", fmt.Sprintf("%v", muted)) + + var reason string + switch n.muter.(type) { + case *silence.Silencer: + reason = suppressedReasonSilence + case *inhibit.Inhibitor: + reason = suppressedReasonInhibition + default: + } + n.metrics.numNotificationSuppressedTotal.WithLabelValues(reason).Add(float64(len(muted))) } + return ctx, filtered, nil } @@ -894,13 +919,14 @@ func (n SetNotifiesStage) Exec(ctx context.Context, l log.Logger, alerts ...*typ } type timeStage struct { - muter types.TimeMuter + muter types.TimeMuter + metrics *Metrics } type TimeMuteStage timeStage -func NewTimeMuteStage(m types.TimeMuter) *TimeMuteStage { - return &TimeMuteStage{m} +func NewTimeMuteStage(m types.TimeMuter, metrics *Metrics) *TimeMuteStage { + return &TimeMuteStage{m, metrics} } // Exec implements the stage interface for TimeMuteStage. @@ -927,7 +953,8 @@ func (tms TimeMuteStage) Exec(ctx context.Context, l log.Logger, alerts ...*type // If the current time is inside a mute time, all alerts are removed from the pipeline. if muted { - level.Debug(l).Log("msg", "Notifications not sent, route is within mute time") + tms.metrics.numNotificationSuppressedTotal.WithLabelValues(suppressedReasonMuteTimeInterval).Add(float64(len(alerts))) + level.Debug(l).Log("msg", "Notifications not sent, route is within mute time", "alerts", len(alerts)) return ctx, nil, nil } return ctx, alerts, nil @@ -935,8 +962,8 @@ func (tms TimeMuteStage) Exec(ctx context.Context, l log.Logger, alerts ...*type type TimeActiveStage timeStage -func NewTimeActiveStage(m types.TimeMuter) *TimeActiveStage { - return &TimeActiveStage{m} +func NewTimeActiveStage(m types.TimeMuter, metrics *Metrics) *TimeActiveStage { + return &TimeActiveStage{m, metrics} } // Exec implements the stage interface for TimeActiveStage. @@ -964,7 +991,8 @@ func (tas TimeActiveStage) Exec(ctx context.Context, l log.Logger, alerts ...*ty // If the current time is not inside an active time, all alerts are removed from the pipeline if !muted { - level.Debug(l).Log("msg", "Notifications not sent, route is not within active time") + tas.metrics.numNotificationSuppressedTotal.WithLabelValues(suppressedReasonActiveTimeInterval).Add(float64(len(alerts))) + level.Debug(l).Log("msg", "Notifications not sent, route is not within active time", "alerts", len(alerts)) return ctx, nil, nil } diff --git a/notify/notify_test.go b/notify/notify_test.go index 2ae452d48b..d5ce9612e3 100644 --- a/notify/notify_test.go +++ b/notify/notify_test.go @@ -666,7 +666,8 @@ func TestMuteStage(t *testing.T) { return ok }) - stage := NewMuteStage(muter) + metrics := NewMetrics(prometheus.NewRegistry(), featurecontrol.NoopFlags{}) + stage := NewMuteStage(muter, metrics) in := []model.LabelSet{ {}, @@ -705,6 +706,10 @@ func TestMuteStage(t *testing.T) { if !reflect.DeepEqual(got, out) { t.Fatalf("Muting failed, expected: %v\ngot %v", out, got) } + suppressed := int(prom_testutil.ToFloat64(metrics.numNotificationSuppressedTotal)) + if (len(in) - len(got)) != suppressed { + t.Fatalf("Expected %d alerts counted in suppressed metric but got %d", (len(in) - len(got)), suppressed) + } } func TestMuteStageWithSilences(t *testing.T) { @@ -720,9 +725,11 @@ func TestMuteStageWithSilences(t *testing.T) { t.Fatal(err) } - marker := types.NewMarker(prometheus.NewRegistry()) + reg := prometheus.NewRegistry() + marker := types.NewMarker(reg) silencer := silence.NewSilencer(silences, marker, log.NewNopLogger()) - stage := NewMuteStage(silencer) + metrics := NewMetrics(reg, featurecontrol.NoopFlags{}) + stage := NewMuteStage(silencer, metrics) in := []model.LabelSet{ {}, @@ -765,6 +772,10 @@ func TestMuteStageWithSilences(t *testing.T) { if !reflect.DeepEqual(got, out) { t.Fatalf("Muting failed, expected: %v\ngot %v", out, got) } + suppressedRoundOne := int(prom_testutil.ToFloat64(metrics.numNotificationSuppressedTotal)) + if (len(in) - len(got)) != suppressedRoundOne { + t.Fatalf("Expected %d alerts counted in suppressed metric but got %d", (len(in) - len(got)), suppressedRoundOne) + } // Do it again to exercise the version tracking of silences. _, alerts, err = stage.Exec(context.Background(), log.NewNopLogger(), inAlerts...) @@ -781,6 +792,11 @@ func TestMuteStageWithSilences(t *testing.T) { t.Fatalf("Muting failed, expected: %v\ngot %v", out, got) } + suppressedRoundTwo := int(prom_testutil.ToFloat64(metrics.numNotificationSuppressedTotal)) + if (len(in) - len(got) + suppressedRoundOne) != suppressedRoundTwo { + t.Fatalf("Expected %d alerts counted in suppressed metric but got %d", (len(in) - len(got)), suppressedRoundTwo) + } + // Expire the silence and verify that no alerts are silenced now. if err := silences.Expire(silID); err != nil { t.Fatal(err) @@ -798,6 +814,10 @@ func TestMuteStageWithSilences(t *testing.T) { if !reflect.DeepEqual(got, in) { t.Fatalf("Unmuting failed, expected: %v\ngot %v", in, got) } + suppressedRoundThree := int(prom_testutil.ToFloat64(metrics.numNotificationSuppressedTotal)) + if (len(in) - len(got) + suppressedRoundTwo) != suppressedRoundThree { + t.Fatalf("Expected %d alerts counted in suppressed metric but got %d", (len(in) - len(got)), suppressedRoundThree) + } } func TestTimeMuteStage(t *testing.T) { @@ -874,7 +894,8 @@ func TestTimeMuteStage(t *testing.T) { } m := map[string][]timeinterval.TimeInterval{"test": intervals} intervener := timeinterval.NewIntervener(m) - stage := NewTimeMuteStage(intervener) + metrics := NewMetrics(prometheus.NewRegistry(), featurecontrol.NoopFlags{}) + stage := NewTimeMuteStage(intervener, metrics) outAlerts := []*types.Alert{} nonMuteCount := 0 @@ -908,6 +929,10 @@ func TestTimeMuteStage(t *testing.T) { if len(outAlerts) != nonMuteCount { t.Fatalf("Expected %d alerts after time mute stage but got %d", nonMuteCount, len(outAlerts)) } + suppressed := int(prom_testutil.ToFloat64(metrics.numNotificationSuppressedTotal)) + if (len(cases) - nonMuteCount) != suppressed { + t.Fatalf("Expected %d alerts counted in suppressed metric but got %d", (len(cases) - nonMuteCount), suppressed) + } } func TestTimeActiveStage(t *testing.T) { @@ -933,6 +958,11 @@ func TestTimeActiveStage(t *testing.T) { labels: model.LabelSet{"mute": "me"}, shouldMute: true, }, + { + fireTime: "02 Dec 20 16:59 +0000", + labels: model.LabelSet{"mute": "me"}, + shouldMute: true, + }, { // Tuesday before 5pm fireTime: "01 Dec 20 16:59 +0000", @@ -959,7 +989,8 @@ func TestTimeActiveStage(t *testing.T) { } m := map[string][]timeinterval.TimeInterval{"test": intervals} intervener := timeinterval.NewIntervener(m) - stage := NewTimeActiveStage(intervener) + metrics := NewMetrics(prometheus.NewRegistry(), featurecontrol.NoopFlags{}) + stage := NewTimeActiveStage(intervener, metrics) outAlerts := []*types.Alert{} nonMuteCount := 0 @@ -993,6 +1024,10 @@ func TestTimeActiveStage(t *testing.T) { if len(outAlerts) != nonMuteCount { t.Fatalf("Expected %d alerts after time mute stage but got %d", nonMuteCount, len(outAlerts)) } + suppressed := int(prom_testutil.ToFloat64(metrics.numNotificationSuppressedTotal)) + if (len(cases) - nonMuteCount) != suppressed { + t.Fatalf("Expected %d alerts counted in suppressed metric but got %d", (len(cases) - nonMuteCount), suppressed) + } } func BenchmarkHashAlert(b *testing.B) {