Skip to content

Commit 70f47b1

Browse files
author
Guido Trotter
committed
Add metrics for the case where the matchers fail to compile/add to the cache
Previously, such silences would be in the status, but querying would result in query errors. Now they are still in the status (since we have to calculate after calling Merge), but we also know that some broken ones exist. At snapshot load time we can also avoid importing them, since they would not be usable anyway, and the metric's "stage" label (silence_added vs. load_snapshot) distinguishes the two cases/behaviors. Signed-off-by: Guido Trotter <guido@hudson-trading.com>
1 parent 5387548 commit 70f47b1

File tree

1 file changed

+33
-15
lines changed

1 file changed

+33
-15
lines changed

silence/silence.go

Lines changed: 33 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -215,18 +215,19 @@ type Limits struct {
215215
type MaintenanceFunc func() (int64, error)
216216

217217
type metrics struct {
218-
gcDuration prometheus.Summary
219-
snapshotDuration prometheus.Summary
220-
snapshotSize prometheus.Gauge
221-
queriesTotal prometheus.Counter
222-
queryErrorsTotal prometheus.Counter
223-
queryDuration prometheus.Histogram
224-
silencesActive prometheus.GaugeFunc
225-
silencesPending prometheus.GaugeFunc
226-
silencesExpired prometheus.GaugeFunc
227-
propagatedMessagesTotal prometheus.Counter
228-
maintenanceTotal prometheus.Counter
229-
maintenanceErrorsTotal prometheus.Counter
218+
gcDuration prometheus.Summary
219+
snapshotDuration prometheus.Summary
220+
snapshotSize prometheus.Gauge
221+
queriesTotal prometheus.Counter
222+
queryErrorsTotal prometheus.Counter
223+
queryDuration prometheus.Histogram
224+
silencesActive prometheus.GaugeFunc
225+
silencesPending prometheus.GaugeFunc
226+
silencesExpired prometheus.GaugeFunc
227+
propagatedMessagesTotal prometheus.Counter
228+
maintenanceTotal prometheus.Counter
229+
maintenanceErrorsTotal prometheus.Counter
230+
matcherCompileErrorsTotal *prometheus.CounterVec
230231
}
231232

232233
func newSilenceMetricByState(s *Silences, st types.SilenceState) prometheus.GaugeFunc {
@@ -271,6 +272,13 @@ func newMetrics(r prometheus.Registerer, s *Silences) *metrics {
271272
Name: "alertmanager_silences_maintenance_errors_total",
272273
Help: "How many maintenances were executed for silences that failed.",
273274
})
275+
m.matcherCompileErrorsTotal = prometheus.NewCounterVec(
276+
prometheus.CounterOpts{
277+
Name: "alertmanager_silences_matcher_compile_errors_total",
278+
Help: "How many silence matcher compilations failed.",
279+
},
280+
[]string{"stage"},
281+
)
274282
m.queriesTotal = prometheus.NewCounter(prometheus.CounterOpts{
275283
Name: "alertmanager_silences_queries_total",
276284
Help: "How many silence queries were received.",
@@ -311,6 +319,7 @@ func newMetrics(r prometheus.Registerer, s *Silences) *metrics {
311319
m.propagatedMessagesTotal,
312320
m.maintenanceTotal,
313321
m.maintenanceErrorsTotal,
322+
m.matcherCompileErrorsTotal,
314323
)
315324
}
316325
return m
@@ -562,7 +571,11 @@ func (s *Silences) checkSizeLimits(msil *pb.MeshSilence) error {
562571

563572
func (s *Silences) silenceAdded(sil *pb.Silence) {
564573
s.version++
565-
s.mc.add(sil)
574+
_, err := s.mc.add(sil)
575+
if err != nil {
576+
s.metrics.matcherCompileErrorsTotal.WithLabelValues("silence_added").Inc()
577+
s.logger.Error("Failed to compile silence matchers", "silence_id", sil.Id, "err", err)
578+
}
566579
}
567580

568581
func (s *Silences) getSilence(id string) (*pb.Silence, bool) {
@@ -886,8 +899,13 @@ func (s *Silences) loadSnapshot(r io.Reader) error {
886899
e.Silence.CreatedBy = e.Silence.Comments[0].Author
887900
e.Silence.Comments = nil
888901
}
889-
st[e.Silence.Id] = e
890-
s.mc.add(e.Silence)
902+
// Add to matcher cache, and only if successful, to the new state.
903+
if _, err := s.mc.add(e.Silence); err != nil {
904+
s.metrics.matcherCompileErrorsTotal.WithLabelValues("load_snapshot").Inc()
905+
s.logger.Error("Failed to compile silence matchers during snapshot load", "silence_id", e.Silence.Id, "err", err)
906+
} else {
907+
st[e.Silence.Id] = e
908+
}
891909
}
892910
s.mtx.Lock()
893911
s.st = st

0 commit comments

Comments
 (0)