@@ -16,7 +16,6 @@ import (
16
16
"github.com/cybertec-postgresql/pgwatch3/sources"
17
17
"github.com/shopspring/decimal"
18
18
"github.com/sirupsen/logrus"
19
- "golang.org/x/exp/maps"
20
19
)
21
20
22
21
var monitoredDbs sources.MonitoredDatabases
@@ -29,6 +28,7 @@ type Reaper struct {
29
28
opts * config.Options
30
29
sourcesReaderWriter sources.ReaderWriter
31
30
metricsReaderWriter metrics.ReaderWriter
31
+ measurementCh chan []metrics.MeasurementMessage
32
32
}
33
33
34
34
func NewReaper (opts * config.Options , sourcesReaderWriter sources.ReaderWriter , metricsReaderWriter metrics.ReaderWriter ) * Reaper {
@@ -42,7 +42,7 @@ func NewReaper(opts *config.Options, sourcesReaderWriter sources.ReaderWriter, m
42
42
func (r * Reaper ) Reap (mainContext context.Context ) (err error ) {
43
43
var measurementsWriter * sinks.MultiWriter
44
44
45
- controlChannels := make (map [string ]( chan metrics. ControlMessage ) ) // [db1+metric1]=chan
45
+ cancelFuncs := make (map [string ]context. CancelFunc ) // [db1+metric1]=chan
46
46
47
47
firstLoop := true
48
48
mainLoopCount := 0
@@ -60,14 +60,13 @@ func (r *Reaper) Reap(mainContext context.Context) (err error) {
60
60
if measurementsWriter , err = sinks .NewMultiWriter (mainContext , opts , metricDefinitionMap ); err != nil {
61
61
logger .Fatal (err )
62
62
}
63
- measurementCh : = make (chan []metrics.MeasurementMessage , 10000 )
63
+ r . measurementCh = make (chan []metrics.MeasurementMessage , 10000 )
64
64
if ! opts .Ping {
65
- go measurementsWriter .WriteMeasurements (mainContext , measurementCh )
65
+ go measurementsWriter .WriteMeasurements (mainContext , r . measurementCh )
66
66
}
67
67
68
68
for { //main loop
69
69
hostsToShutDownDueToRoleChange := make (map [string ]bool ) // hosts went from master to standby and have "only if master" set
70
- var controlChannelNameList []string
71
70
gatherersShutDown := 0
72
71
73
72
if monitoredDbs , err = sourcesReaderWriter .GetMonitoredDatabases (); err != nil {
@@ -92,7 +91,7 @@ func (r *Reaper) Reap(mainContext context.Context) (err error) {
92
91
UpdateMonitoredDBCache (monitoredDbs )
93
92
94
93
if lastMonitoredDBsUpdate .IsZero () || lastMonitoredDBsUpdate .Before (time .Now ().Add (- 1 * time .Second * monitoredDbsDatastoreSyncIntervalSeconds )) {
95
- go SyncMonitoredDBsToDatastore (mainContext , monitoredDbs , measurementCh )
94
+ go SyncMonitoredDBsToDatastore (mainContext , monitoredDbs , r . measurementCh )
96
95
lastMonitoredDBsUpdate = time .Now ()
97
96
}
98
97
@@ -237,13 +236,14 @@ func (r *Reaper) Reap(mainContext context.Context) (err error) {
237
236
}
238
237
239
238
dbMetric := dbUnique + dbMetricJoinStr + metric
240
- _ , chOk := controlChannels [dbMetric ]
239
+ _ , chOk := cancelFuncs [dbMetric ]
241
240
242
241
if metricDefOk && ! chOk { // initialize a new per db/per metric control channel
243
242
if interval > 0 {
244
243
hostMetricIntervalMap [dbMetric ] = interval
245
244
logger .WithField ("source" , dbUnique ).WithField ("metric" , metric ).WithField ("interval" , interval ).Info ("starting gatherer" )
246
- controlChannels [dbMetric ] = make (chan metrics.ControlMessage , 1 )
245
+ metricCtx , cancelFunc := context .WithCancel (mainContext )
246
+ cancelFuncs [dbMetric ] = cancelFunc
247
247
248
248
metricNameForStorage := metricName
249
249
if _ , isSpecialMetric := specialMetrics [metricName ]; ! isSpecialMetric {
@@ -264,21 +264,20 @@ func (r *Reaper) Reap(mainContext context.Context) (err error) {
264
264
logger .Error (err )
265
265
}
266
266
267
- go MetricGathererLoop ( mainContext ,
267
+ go r . reapMetricMeasurementsFromSource ( metricCtx ,
268
268
dbUnique ,
269
269
dbUniqueOrig ,
270
270
srcType ,
271
271
metric ,
272
- metricConfig ,
273
- controlChannels [dbMetric ],
274
- measurementCh ,
275
- opts )
272
+ metricConfig )
276
273
}
277
274
} else if (! metricDefOk && chOk ) || interval <= 0 {
278
275
// metric definition files were recently removed or interval set to zero
276
+ if cancelFunc , isOk := cancelFuncs [dbMetric ]; isOk {
277
+ cancelFunc ()
278
+ }
279
279
logger .Warning ("shutting down metric" , metric , "for" , monitoredDB .DBUniqueName )
280
- controlChannels [dbMetric ] <- metrics.ControlMessage {Action : gathererStatusStop }
281
- delete (controlChannels , dbMetric )
280
+ delete (cancelFuncs , dbMetric )
282
281
} else if ! metricDefOk {
283
282
epoch , ok := lastSQLFetchError .Load (metric )
284
283
if ! ok || ((time .Now ().Unix () - epoch .(int64 )) > 3600 ) { // complain only 1x per hour
@@ -288,8 +287,7 @@ func (r *Reaper) Reap(mainContext context.Context) (err error) {
288
287
} else {
289
288
// check if interval has changed
290
289
if hostMetricIntervalMap [dbMetric ] != interval {
291
- logger .Warning ("sending interval update for" , dbUnique , metric )
292
- controlChannels [dbMetric ] <- metrics.ControlMessage {Action : gathererStatusStart , Config : metricConfig }
290
+ logger .Warning ("updating interval update for" , dbUnique , metric )
293
291
hostMetricIntervalMap [dbMetric ] = interval
294
292
}
295
293
}
@@ -303,9 +301,7 @@ func (r *Reaper) Reap(mainContext context.Context) (err error) {
303
301
// loop over existing channels and stop workers if DB or metric removed from config
304
302
// or state change makes it uninteresting
305
303
logger .Debug ("checking if any workers need to be shut down..." )
306
- controlChannelNameList = maps .Keys (controlChannels )
307
-
308
- for _ , dbMetric := range controlChannelNameList {
304
+ for dbMetric , cancelFunc := range cancelFuncs {
309
305
var currentMetricConfig map [string ]float64
310
306
var dbInfo * sources.MonitoredDatabase
311
307
var ok , dbRemovedFromConfig bool
@@ -351,9 +347,9 @@ func (r *Reaper) Reap(mainContext context.Context) (err error) {
351
347
352
348
if mainContext .Err () != nil || wholeDbShutDownDueToRoleChange || dbRemovedFromConfig || singleMetricDisabled {
353
349
logger .Infof ("shutting down gatherer for [%s:%s] ..." , db , metric )
354
- controlChannels [ dbMetric ] <- metrics. ControlMessage { Action : gathererStatusStop }
355
- delete (controlChannels , dbMetric )
356
- logger .Debugf ("control channel for [%s:%s] deleted" , db , metric )
350
+ cancelFunc ()
351
+ delete (cancelFuncs , dbMetric )
352
+ logger .Debugf ("cancel function for [%s:%s] deleted" , db , metric )
357
353
gatherersShutDown ++
358
354
ClearDBUnreachableStateIfAny (db )
359
355
if err := measurementsWriter .SyncMetrics (db , metric , "remove" ); err != nil {
@@ -384,25 +380,19 @@ func (r *Reaper) Reap(mainContext context.Context) (err error) {
384
380
}
385
381
386
382
// metrics.ControlMessage notifies of shutdown + interval change
387
- func MetricGathererLoop (ctx context.Context ,
383
+ func ( r * Reaper ) reapMetricMeasurementsFromSource (ctx context.Context ,
388
384
dbUniqueName , dbUniqueNameOrig string ,
389
385
srcType sources.Kind ,
390
386
metricName string ,
391
- configMap map [string ]float64 ,
392
- controlCh <- chan metrics.ControlMessage ,
393
- storeCh chan <- []metrics.MeasurementMessage ,
394
- opts * config.Options ) {
387
+ configMap map [string ]float64 ) {
395
388
396
- config := configMap
397
- interval := config [metricName ]
398
389
hostState := make (map [string ]map [string ]string )
399
390
var lastUptimeS int64 = - 1 // used for "server restarted" event detection
400
391
var lastErrorNotificationTime time.Time
401
392
var vme DBVersionMapEntry
402
393
var mvp metrics.Metric
403
394
var err error
404
395
failedFetches := 0
405
- // metricNameForStorage := metricName
406
396
lastDBVersionFetchTime := time .Unix (0 , 0 ) // check DB ver. ev. 5 min
407
397
408
398
l := log .GetLogger (ctx ).WithField ("source" , dbUniqueName ).WithField ("metric" , metricName )
@@ -415,13 +405,14 @@ func MetricGathererLoop(ctx context.Context,
415
405
realDbname := dbPgVersionMap [dbUniqueName ].RealDbname // to manage 2 sets of event counts - monitored DB + global
416
406
dbPgVersionMapLock .RUnlock ()
417
407
conn := mdb .Conn
418
- metrics .ParseLogs (ctx , conn , mdb , realDbname , metricName , configMap , controlCh , storeCh ) // no return
408
+ metrics .ParseLogs (ctx , conn , mdb , realDbname , metricName , configMap , r . measurementCh ) // no return
419
409
return
420
410
}
421
411
422
412
for {
413
+ interval := configMap [metricName ]
423
414
if lastDBVersionFetchTime .Add (time .Minute * time .Duration (5 )).Before (time .Now ()) {
424
- vme , err = DBGetPGVersion (ctx , dbUniqueName , srcType , false , opts .Measurements .SystemIdentifierField ) // in case of errors just ignore metric "disabled" time ranges
415
+ vme , err = DBGetPGVersion (ctx , dbUniqueName , srcType , false , r . opts .Measurements .SystemIdentifierField ) // in case of errors just ignore metric "disabled" time ranges
425
416
if err != nil {
426
417
lastDBVersionFetchTime = time .Now ()
427
418
}
@@ -445,15 +436,15 @@ func MetricGathererLoop(ctx context.Context,
445
436
}
446
437
447
438
// 1st try local overrides for some metrics if operating in push mode
448
- if opts .Metrics .DirectOSStats && IsDirectlyFetchableMetric (metricName ) {
439
+ if r . opts .Metrics .DirectOSStats && IsDirectlyFetchableMetric (metricName ) {
449
440
metricStoreMessages , err = FetchStatsDirectlyFromOS (ctx , mfm , vme , mvp )
450
441
if err != nil {
451
442
l .WithError (err ).Errorf ("Could not reader metric directly from OS" )
452
443
}
453
444
}
454
445
t1 := time .Now ()
455
446
if metricStoreMessages == nil {
456
- metricStoreMessages , err = FetchMetrics (ctx , mfm , hostState , storeCh , "" , opts )
447
+ metricStoreMessages , err = FetchMetrics (ctx , mfm , hostState , r . measurementCh , "" , r . opts )
457
448
}
458
449
t2 := time .Now ()
459
450
@@ -499,27 +490,16 @@ func MetricGathererLoop(ctx context.Context,
499
490
}
500
491
}
501
492
502
- storeCh <- metricStoreMessages
493
+ r . measurementCh <- metricStoreMessages
503
494
}
504
495
}
505
496
506
497
select {
507
498
case <- ctx .Done ():
508
499
return
509
- case msg := <- controlCh :
510
- l .Debug ("got control msg" , msg )
511
- if msg .Action == gathererStatusStart {
512
- config = msg .Config
513
- interval = config [metricName ]
514
- l .Debug ("started MetricGathererLoop with interval:" , interval )
515
- } else if msg .Action == gathererStatusStop {
516
- l .Debug ("exiting MetricGathererLoop with interval:" , interval )
517
- return
518
- }
519
500
case <- time .After (time .Second * time .Duration (interval )):
520
501
l .Debugf ("MetricGathererLoop slept for %s" , time .Second * time .Duration (interval ))
521
502
}
522
-
523
503
}
524
504
}
525
505
0 commit comments