Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 20 additions & 3 deletions silence/silence.go
Original file line number Diff line number Diff line change
Expand Up @@ -215,6 +215,8 @@ type metrics struct {
silencesPending prometheus.GaugeFunc
silencesExpired prometheus.GaugeFunc
propagatedMessagesTotal prometheus.Counter
maintenanceTotal prometheus.Counter
maintenanceErrorsTotal prometheus.Counter
}

func newSilenceMetricByState(s *Silences, st types.SilenceState) prometheus.GaugeFunc {
Expand Down Expand Up @@ -251,6 +253,14 @@ func newMetrics(r prometheus.Registerer, s *Silences) *metrics {
Name: "alertmanager_silences_snapshot_size_bytes",
Help: "Size of the last silence snapshot in bytes.",
})
m.maintenanceTotal = prometheus.NewCounter(prometheus.CounterOpts{
Name: "alertmanager_silences_maintenance_total",
Help: "How many maintenances were executed for silences.",
})
m.maintenanceErrorsTotal = prometheus.NewCounter(prometheus.CounterOpts{
Name: "alertmanager_silences_maintenance_errors_total",
Help: "How many maintenances were executed for silences that failed.",
})
m.queriesTotal = prometheus.NewCounter(prometheus.CounterOpts{
Name: "alertmanager_silences_queries_total",
Help: "How many silence queries were received.",
Expand Down Expand Up @@ -285,6 +295,8 @@ func newMetrics(r prometheus.Registerer, s *Silences) *metrics {
m.silencesPending,
m.silencesExpired,
m.propagatedMessagesTotal,
m.maintenanceTotal,
m.maintenanceErrorsTotal,
)
}
return m
Expand Down Expand Up @@ -395,12 +407,17 @@ func (s *Silences) Maintenance(interval time.Duration, snapf string, stopc <-cha
}

runMaintenance := func(do MaintenanceFunc) error {
start := s.nowUTC()
s.metrics.maintenanceTotal.Inc()
level.Debug(s.logger).Log("msg", "Running maintenance")
start := s.nowUTC()
size, err := do()
level.Debug(s.logger).Log("msg", "Maintenance done", "duration", s.clock.Since(start), "size", size)
s.metrics.snapshotSize.Set(float64(size))
return err
if err != nil {
s.metrics.maintenanceErrorsTotal.Inc()
return err
}
level.Debug(s.logger).Log("msg", "Maintenance done", "duration", s.clock.Since(start), "size", size)
return nil
}

Loop:
Expand Down
15 changes: 14 additions & 1 deletion silence/silence_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ import (
"github.com/go-kit/log"
"github.com/matttproud/golang_protobuf_extensions/pbutil"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/testutil"
"github.com/prometheus/common/model"
"github.com/stretchr/testify/require"
"go.uber.org/atomic"
Expand Down Expand Up @@ -208,7 +209,9 @@ func TestSilences_Maintenance_SupportsCustomCallback(t *testing.T) {
f, err := os.CreateTemp("", "snapshot")
require.NoError(t, err, "creating temp file failed")
clock := clock.NewMock()
s := &Silences{st: state{}, logger: log.NewNopLogger(), clock: clock, metrics: newMetrics(nil, nil)}
reg := prometheus.NewRegistry()
s := &Silences{st: state{}, logger: log.NewNopLogger(), clock: clock}
s.metrics = newMetrics(reg, s)
stopc := make(chan struct{})

var calls atomic.Int32
Expand Down Expand Up @@ -237,6 +240,16 @@ func TestSilences_Maintenance_SupportsCustomCallback(t *testing.T) {
wg.Wait()

require.EqualValues(t, 2, calls.Load())

// Check the maintenance metrics.
require.NoError(t, testutil.GatherAndCompare(reg, bytes.NewBufferString(`
# HELP alertmanager_silences_maintenance_errors_total How many maintenances were executed for silences that failed.
# TYPE alertmanager_silences_maintenance_errors_total counter
alertmanager_silences_maintenance_errors_total 0
# HELP alertmanager_silences_maintenance_total How many maintenances were executed for silences.
# TYPE alertmanager_silences_maintenance_total counter
alertmanager_silences_maintenance_total 2
`), "alertmanager_silences_maintenance_total", "alertmanager_silences_maintenance_errors_total"))
}

func TestSilencesSetSilence(t *testing.T) {
Expand Down