Skip to content

Commit 1b221e3

Browse files
committed
feat(dispatcher): add maintenance interval config
- make dispatcher maintenance interval configurable

Related to #4540

Signed-off-by: Siavash Safi <siavash@cloudflare.com>
1 parent 3e70148 commit 1b221e3

File tree

3 files changed

+31
-25
lines changed

3 files changed

+31
-25
lines changed

cmd/alertmanager/main.go

Lines changed: 9 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -143,13 +143,14 @@ func run() int {
143143
}
144144

145145
var (
146-
configFile = kingpin.Flag("config.file", "Alertmanager configuration file name.").Default("alertmanager.yml").String()
147-
dataDir = kingpin.Flag("storage.path", "Base path for data storage.").Default("data/").String()
148-
retention = kingpin.Flag("data.retention", "How long to keep data for.").Default("120h").Duration()
149-
maintenanceInterval = kingpin.Flag("data.maintenance-interval", "Interval between garbage collection and snapshotting to disk of the silences and the notification logs.").Default("15m").Duration()
150-
maxSilences = kingpin.Flag("silences.max-silences", "Maximum number of silences, including expired silences. If negative or zero, no limit is set.").Default("0").Int()
151-
maxSilenceSizeBytes = kingpin.Flag("silences.max-silence-size-bytes", "Maximum silence size in bytes. If negative or zero, no limit is set.").Default("0").Int()
152-
alertGCInterval = kingpin.Flag("alerts.gc-interval", "Interval between alert GC.").Default("30m").Duration()
146+
configFile = kingpin.Flag("config.file", "Alertmanager configuration file name.").Default("alertmanager.yml").String()
147+
dataDir = kingpin.Flag("storage.path", "Base path for data storage.").Default("data/").String()
148+
retention = kingpin.Flag("data.retention", "How long to keep data for.").Default("120h").Duration()
149+
maintenanceInterval = kingpin.Flag("data.maintenance-interval", "Interval between garbage collection and snapshotting to disk of the silences and the notification logs.").Default("15m").Duration()
150+
maxSilences = kingpin.Flag("silences.max-silences", "Maximum number of silences, including expired silences. If negative or zero, no limit is set.").Default("0").Int()
151+
maxSilenceSizeBytes = kingpin.Flag("silences.max-silence-size-bytes", "Maximum silence size in bytes. If negative or zero, no limit is set.").Default("0").Int()
152+
alertGCInterval = kingpin.Flag("alerts.gc-interval", "Interval between alert GC.").Default("30m").Duration()
153+
dispatchMaintenanceInterval = kingpin.Flag("dispatch.maintenance-interval", "Interval between maintenance of aggregation groups in the dispatcher.").Default("30s").Duration()
153154

154155
webConfig = webflag.AddFlags(kingpin.CommandLine, ":9093")
155156
externalURL = kingpin.Flag("web.external-url", "The URL under which Alertmanager is externally reachable (for example, if Alertmanager is served via a reverse proxy). Used for generating relative and absolute links back to Alertmanager itself. If the URL has a path portion, it will be used to prefix all HTTP endpoints served by Alertmanager. If omitted, relevant URL components will be derived automatically.").String()
@@ -492,7 +493,7 @@ func run() int {
492493
silencer.Mutes(labels)
493494
})
494495

495-
disp = dispatch.NewDispatcher(alerts, routes, pipeline, marker, timeoutFunc, nil, logger, dispMetrics)
496+
disp = dispatch.NewDispatcher(alerts, routes, pipeline, marker, timeoutFunc, *dispatchMaintenanceInterval, nil, logger, dispMetrics)
496497
routes.Walk(func(r *dispatch.Route) {
497498
if r.RouteOpts.RepeatInterval > *retention {
498499
configLogger.Warn(

dispatch/dispatch.go

Lines changed: 15 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -87,9 +87,10 @@ type Dispatcher struct {
8787
aggrGroupsPerRoute map[*Route]map[model.Fingerprint]*aggrGroup
8888
aggrGroupsNum int
8989

90-
done chan struct{}
91-
ctx context.Context
92-
cancel func()
90+
maintenanceInterval time.Duration
91+
done chan struct{}
92+
ctx context.Context
93+
cancel func()
9394

9495
logger *slog.Logger
9596
}
@@ -109,6 +110,7 @@ func NewDispatcher(
109110
s notify.Stage,
110111
mk types.GroupMarker,
111112
to func(time.Duration) time.Duration,
113+
mi time.Duration,
112114
lim Limits,
113115
l *slog.Logger,
114116
m *DispatcherMetrics,
@@ -118,14 +120,15 @@ func NewDispatcher(
118120
}
119121

120122
disp := &Dispatcher{
121-
alerts: ap,
122-
stage: s,
123-
route: r,
124-
marker: mk,
125-
timeout: to,
126-
logger: l.With("component", "dispatcher"),
127-
metrics: m,
128-
limits: lim,
123+
alerts: ap,
124+
stage: s,
125+
route: r,
126+
marker: mk,
127+
timeout: to,
128+
maintenanceInterval: mi,
129+
logger: l.With("component", "dispatcher"),
130+
metrics: m,
131+
limits: lim,
129132
}
130133
return disp
131134
}
@@ -146,7 +149,7 @@ func (d *Dispatcher) Run() {
146149
}
147150

148151
func (d *Dispatcher) run(it provider.AlertIterator) {
149-
maintenance := time.NewTicker(30 * time.Second)
152+
maintenance := time.NewTicker(d.maintenanceInterval)
150153
defer maintenance.Stop()
151154

152155
defer it.Close()

dispatch/dispatch_test.go

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,8 @@ import (
3535
"github.com/prometheus/alertmanager/types"
3636
)
3737

38+
// testMaintenanceInterval is the maintenance interval passed to NewDispatcher
// by the tests below. It equals the 30-second value that Dispatcher.run
// hard-coded for its maintenance ticker before the interval became a
// constructor argument.
const testMaintenanceInterval = 30 * time.Second
39+
3840
func TestAggrGroup(t *testing.T) {
3941
lset := model.LabelSet{
4042
"a": "v1",
@@ -399,7 +401,7 @@ route:
399401

400402
timeout := func(d time.Duration) time.Duration { return time.Duration(0) }
401403
recorder := &recordStage{alerts: make(map[string]map[model.Fingerprint]*types.Alert)}
402-
dispatcher := NewDispatcher(alerts, route, recorder, marker, timeout, nil, logger, NewDispatcherMetrics(false, prometheus.NewRegistry()))
404+
dispatcher := NewDispatcher(alerts, route, recorder, marker, timeout, testMaintenanceInterval, nil, logger, NewDispatcherMetrics(false, prometheus.NewRegistry()))
403405
go dispatcher.Run()
404406
defer dispatcher.Stop()
405407

@@ -551,7 +553,7 @@ route:
551553
recorder := &recordStage{alerts: make(map[string]map[model.Fingerprint]*types.Alert)}
552554
lim := limits{groups: 6}
553555
m := NewDispatcherMetrics(true, prometheus.NewRegistry())
554-
dispatcher := NewDispatcher(alerts, route, recorder, marker, timeout, lim, logger, m)
556+
dispatcher := NewDispatcher(alerts, route, recorder, marker, timeout, testMaintenanceInterval, lim, logger, m)
555557
go dispatcher.Run()
556558
defer dispatcher.Stop()
557559

@@ -669,7 +671,7 @@ func TestDispatcherRace(t *testing.T) {
669671
defer alerts.Close()
670672

671673
timeout := func(d time.Duration) time.Duration { return time.Duration(0) }
672-
dispatcher := NewDispatcher(alerts, nil, nil, marker, timeout, nil, logger, NewDispatcherMetrics(false, prometheus.NewRegistry()))
674+
dispatcher := NewDispatcher(alerts, nil, nil, marker, timeout, testMaintenanceInterval, nil, logger, NewDispatcherMetrics(false, prometheus.NewRegistry()))
673675
go dispatcher.Run()
674676
dispatcher.Stop()
675677
}
@@ -697,7 +699,7 @@ func TestDispatcherRaceOnFirstAlertNotDeliveredWhenGroupWaitIsZero(t *testing.T)
697699

698700
timeout := func(d time.Duration) time.Duration { return d }
699701
recorder := &recordStage{alerts: make(map[string]map[model.Fingerprint]*types.Alert)}
700-
dispatcher := NewDispatcher(alerts, route, recorder, marker, timeout, nil, logger, NewDispatcherMetrics(false, prometheus.NewRegistry()))
702+
dispatcher := NewDispatcher(alerts, route, recorder, marker, timeout, testMaintenanceInterval, nil, logger, NewDispatcherMetrics(false, prometheus.NewRegistry()))
701703
go dispatcher.Run()
702704
defer dispatcher.Stop()
703705

@@ -749,7 +751,7 @@ func TestDispatcher_DoMaintenance(t *testing.T) {
749751
recorder := &recordStage{alerts: make(map[string]map[model.Fingerprint]*types.Alert)}
750752

751753
ctx := context.Background()
752-
dispatcher := NewDispatcher(alerts, route, recorder, marker, timeout, nil, promslog.NewNopLogger(), NewDispatcherMetrics(false, r))
754+
dispatcher := NewDispatcher(alerts, route, recorder, marker, timeout, testMaintenanceInterval, nil, promslog.NewNopLogger(), NewDispatcherMetrics(false, r))
753755
aggrGroups := make(map[*Route]map[model.Fingerprint]*aggrGroup)
754756
aggrGroups[route] = make(map[model.Fingerprint]*aggrGroup)
755757

0 commit comments

Comments
 (0)