Skip to content

Commit f59460b

Browse files
authored
Refactor nflog configuration options to make it similar to Silences. (prometheus#3220)
* Refactor nflog configuration options to make it similar to Silences. The Notification Log is a similar component to Silences. They're the only two things that are shared between nodes when running in HA and they both hold some sort of internal state that needs to be cleaned up on an interval. To simplify the code and make it a bit more understandable (among other benefits such as improved testability) - I've refactored the notification log configuration and `run` to be similar to the silences.
1 parent 0f7d21f commit f59460b

File tree

4 files changed

+132
-147
lines changed

4 files changed

+132
-147
lines changed

cmd/alertmanager/main.go

Lines changed: 12 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -279,15 +279,14 @@ func run() int {
279279
var wg sync.WaitGroup
280280
wg.Add(1)
281281

282-
notificationLogOpts := []nflog.Option{
283-
nflog.WithRetention(*retention),
284-
nflog.WithSnapshot(filepath.Join(*dataDir, "nflog")),
285-
nflog.WithMaintenance(*maintenanceInterval, stopc, wg.Done, nil),
286-
nflog.WithMetrics(prometheus.DefaultRegisterer),
287-
nflog.WithLogger(log.With(logger, "component", "nflog")),
282+
notificationLogOpts := nflog.Options{
283+
SnapshotFile: filepath.Join(*dataDir, "nflog"),
284+
Retention: *retention,
285+
Logger: log.With(logger, "component", "nflog"),
286+
Metrics: prometheus.DefaultRegisterer,
288287
}
289288

290-
notificationLog, err := nflog.New(notificationLogOpts...)
289+
notificationLog, err := nflog.New(notificationLogOpts)
291290
if err != nil {
292291
level.Error(logger).Log("err", err)
293292
return 1
@@ -297,6 +296,12 @@ func run() int {
297296
notificationLog.SetBroadcast(c.Broadcast)
298297
}
299298

299+
wg.Add(1)
300+
go func() {
301+
notificationLog.Maintenance(*maintenanceInterval, filepath.Join(*dataDir, "nflog"), stopc, nil)
302+
wg.Done()
303+
}()
304+
300305
marker := types.NewMarker(prometheus.DefaultRegisterer)
301306

302307
silenceOpts := silence.Options{

nflog/nflog.go

Lines changed: 71 additions & 114 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@ import (
2727
"sync"
2828
"time"
2929

30+
"github.com/benbjohnson/clock"
3031
"github.com/go-kit/log"
3132
"github.com/go-kit/log/level"
3233
"github.com/matttproud/golang_protobuf_extensions/pbutil"
@@ -73,23 +74,19 @@ func QGroupKey(gk string) QueryParam {
7374
}
7475
}
7576

77+
// Log holds the notification log state for alerts that have been notified.
7678
type Log struct {
79+
clock clock.Clock
80+
7781
logger log.Logger
7882
metrics *metrics
79-
now func() time.Time
8083
retention time.Duration
8184

82-
runInterval time.Duration
83-
snapf string
84-
stopc chan struct{}
85-
done func()
86-
8785
// For now we only store the most recently added log entry.
8886
// The key is a serialized concatenation of group key and receiver.
89-
mtx sync.RWMutex
90-
st state
91-
broadcast func([]byte)
92-
maintenanceOverride MaintenanceFunc
87+
mtx sync.RWMutex
88+
st state
89+
broadcast func([]byte)
9390
}
9491

9592
// MaintenanceFunc represents the function to run as part of the periodic maintenance for the nflog.
@@ -154,76 +151,6 @@ func newMetrics(r prometheus.Registerer) *metrics {
154151
return m
155152
}
156153

157-
// Option configures a new Log implementation.
158-
type Option func(*Log) error
159-
160-
// WithRetention sets the retention time for log st.
161-
func WithRetention(d time.Duration) Option {
162-
return func(l *Log) error {
163-
l.retention = d
164-
return nil
165-
}
166-
}
167-
168-
// WithNow overwrites the function used to retrieve a timestamp
169-
// for the current point in time.
170-
// This is generally useful for injection during tests.
171-
func WithNow(f func() time.Time) Option {
172-
return func(l *Log) error {
173-
l.now = f
174-
return nil
175-
}
176-
}
177-
178-
// WithLogger configures a logger for the notification log.
179-
func WithLogger(logger log.Logger) Option {
180-
return func(l *Log) error {
181-
l.logger = logger
182-
return nil
183-
}
184-
}
185-
186-
// WithMetrics registers metrics for the notification log.
187-
func WithMetrics(r prometheus.Registerer) Option {
188-
return func(l *Log) error {
189-
l.metrics = newMetrics(r)
190-
return nil
191-
}
192-
}
193-
194-
// WithMaintenance configures the Log to run garbage collection
195-
// and snapshotting, if configured, at the given interval.
196-
//
197-
// The maintenance terminates on receiving from the provided channel.
198-
// The done function is called after the final snapshot was completed.
199-
// If not nil, the last argument is an override for what to do as part of the maintenance - for advanced usage.
200-
func WithMaintenance(d time.Duration, stopc chan struct{}, done func(), maintenanceOverride MaintenanceFunc) Option {
201-
return func(l *Log) error {
202-
if d == 0 {
203-
return errors.New("maintenance interval must not be 0")
204-
}
205-
l.runInterval = d
206-
l.stopc = stopc
207-
l.done = done
208-
l.maintenanceOverride = maintenanceOverride
209-
return nil
210-
}
211-
}
212-
213-
// WithSnapshot configures the log to be initialized from a given snapshot file.
214-
// If maintenance is configured, a snapshot will be saved periodically and on
215-
// shutdown as well.
216-
func WithSnapshot(sf string) Option {
217-
return func(l *Log) error {
218-
l.snapf = sf
219-
return nil
220-
}
221-
}
222-
223-
func utcNow() time.Time {
224-
return time.Now().UTC()
225-
}
226-
227154
type state map[string]*pb.MeshEntry
228155

229156
func (s state) clone() state {
@@ -289,48 +216,80 @@ func marshalMeshEntry(e *pb.MeshEntry) ([]byte, error) {
289216
return buf.Bytes(), nil
290217
}
291218

219+
// Options configures a new Log implementation.
220+
type Options struct {
221+
SnapshotReader io.Reader
222+
SnapshotFile string
223+
224+
Retention time.Duration
225+
226+
Logger log.Logger
227+
Metrics prometheus.Registerer
228+
}
229+
230+
func (o *Options) validate() error {
231+
if o.SnapshotFile != "" && o.SnapshotReader != nil {
232+
return errors.New("only one of SnapshotFile and SnapshotReader must be set")
233+
}
234+
235+
return nil
236+
}
237+
292238
// New creates a new notification log based on the provided options.
293239
// The snapshot is loaded into the Log if it is set.
294-
func New(opts ...Option) (*Log, error) {
240+
func New(o Options) (*Log, error) {
241+
if err := o.validate(); err != nil {
242+
return nil, err
243+
}
244+
295245
l := &Log{
246+
clock: clock.New(),
247+
retention: o.Retention,
296248
logger: log.NewNopLogger(),
297-
now: utcNow,
298249
st: state{},
299250
broadcast: func([]byte) {},
300-
}
301-
for _, o := range opts {
302-
if err := o(l); err != nil {
303-
return nil, err
304-
}
305-
}
306-
if l.metrics == nil {
307-
l.metrics = newMetrics(nil)
251+
metrics: newMetrics(o.Metrics),
308252
}
309253

310-
if l.snapf != "" {
311-
if f, err := os.Open(l.snapf); !os.IsNotExist(err) {
312-
if err != nil {
313-
return l, err
314-
}
315-
defer f.Close()
254+
if o.Logger != nil {
255+
l.logger = o.Logger
256+
}
316257

317-
if err := l.loadSnapshot(f); err != nil {
318-
return l, err
258+
if o.SnapshotFile != "" {
259+
if r, err := os.Open(o.SnapshotFile); err != nil {
260+
if !os.IsNotExist(err) {
261+
return nil, err
319262
}
263+
level.Debug(l.logger).Log("msg", "notification log snapshot file doesn't exist", "err", err)
264+
} else {
265+
o.SnapshotReader = r
266+
defer r.Close()
320267
}
321268
}
322269

323-
go l.run()
270+
if o.SnapshotReader != nil {
271+
if err := l.loadSnapshot(o.SnapshotReader); err != nil {
272+
return l, err
273+
}
274+
}
324275

325276
return l, nil
326277
}
327278

328-
// run periodic background maintenance.
329-
func (l *Log) run() {
330-
if l.runInterval == 0 || l.stopc == nil {
279+
func (l *Log) now() time.Time {
280+
return l.clock.Now()
281+
}
282+
283+
// Maintenance garbage collects the notification log state at the given interval. If the snapshot
284+
// file is set, a snapshot is written to it afterwards.
285+
// Terminates on receiving from stopc.
286+
// If not nil, the last argument is an override for what to do as part of the maintenance - for advanced usage.
287+
func (l *Log) Maintenance(interval time.Duration, snapf string, stopc <-chan struct{}, override MaintenanceFunc) {
288+
if interval == 0 || stopc == nil {
289+
level.Error(l.logger).Log("msg", "interval or stop signal are missing - not running maintenance")
331290
return
332291
}
333-
t := time.NewTicker(l.runInterval)
292+
t := l.clock.Ticker(interval)
334293
defer t.Stop()
335294

336295
var doMaintenance MaintenanceFunc
@@ -339,29 +298,26 @@ func (l *Log) run() {
339298
if _, err := l.GC(); err != nil {
340299
return size, err
341300
}
342-
if l.snapf == "" {
301+
if snapf == "" {
343302
return size, nil
344303
}
345-
f, err := openReplace(l.snapf)
304+
f, err := openReplace(snapf)
346305
if err != nil {
347306
return size, err
348307
}
349308
if size, err = l.Snapshot(f); err != nil {
309+
f.Close()
350310
return size, err
351311
}
352312
return size, f.Close()
353313
}
354314

355-
if l.maintenanceOverride != nil {
356-
doMaintenance = l.maintenanceOverride
357-
}
358-
359-
if l.done != nil {
360-
defer l.done()
315+
if override != nil {
316+
doMaintenance = override
361317
}
362318

363319
runMaintenance := func(do func() (int64, error)) error {
364-
start := l.now()
320+
start := l.now().UTC()
365321
level.Debug(l.logger).Log("msg", "Running maintenance")
366322
size, err := do()
367323
level.Debug(l.logger).Log("msg", "Maintenance done", "duration", l.now().Sub(start), "size", size)
@@ -372,16 +328,17 @@ func (l *Log) run() {
372328
Loop:
373329
for {
374330
select {
375-
case <-l.stopc:
331+
case <-stopc:
376332
break Loop
377333
case <-t.C:
378334
if err := runMaintenance(doMaintenance); err != nil {
379335
level.Error(l.logger).Log("msg", "Running maintenance failed", "err", err)
380336
}
381337
}
382338
}
339+
383340
// No need to run final maintenance if we don't want to snapshot.
384-
if l.snapf == "" {
341+
if snapf == "" {
385342
return
386343
}
387344
if err := runMaintenance(doMaintenance); err != nil {

0 commit comments

Comments
 (0)