Skip to content

Alertmanager: Remove alertmanager instead of pausing #3722

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
* [ENHANCEMENT] Runtime Config: Add a `mode` query parameter for the runtime config endpoint. `/runtime_config?mode=diff` now shows the YAML runtime configuration with all values that differ from the defaults. #3700
* [ENHANCEMENT] Distributor: Enable downstream projects to wrap distributor push function and access the deserialized write requests before/after they are pushed. #3755
* [ENHANCEMENT] Add flag `-<prefix>.tls-server-name` to require a specific server name instead of the hostname on the certificate. #3156
* [ENHANCEMENT] Alertmanager: Remove a tenant's alertmanager instead of pausing it once we determine it is no longer needed. #3722
* [BUGFIX] HA Tracker: don't track as error in the `cortex_kv_request_duration_seconds` metric a CAS operation intentionally aborted. #3745

## 1.7.0 in progress
Expand Down
61 changes: 7 additions & 54 deletions pkg/alertmanager/alertmanager.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@ import (
"time"

"github.com/go-kit/kit/log"
"github.com/go-kit/kit/log/level"
"github.com/prometheus/alertmanager/api"
"github.com/prometheus/alertmanager/cluster"
"github.com/prometheus/alertmanager/config"
Expand Down Expand Up @@ -80,9 +79,6 @@ type Alertmanager struct {
// Further, in upstream AM, this metric is handled using the config coordinator which we don't use
// hence we need to generate the metric ourselves.
configHashMetric prometheus.Gauge

activeMtx sync.Mutex
active bool
}

var (
Expand All @@ -102,11 +98,9 @@ func init() {
// New creates a new Alertmanager.
func New(cfg *Config, reg *prometheus.Registry) (*Alertmanager, error) {
am := &Alertmanager{
cfg: cfg,
logger: log.With(cfg.Logger, "user", cfg.UserID),
stop: make(chan struct{}),
active: false,
activeMtx: sync.Mutex{},
cfg: cfg,
logger: log.With(cfg.Logger, "user", cfg.UserID),
stop: make(chan struct{}),
configHashMetric: promauto.With(reg).NewGauge(prometheus.GaugeOpts{
Name: "alertmanager_config_hash",
Help: "Hash of the currently loaded alertmanager configuration.",
Expand Down Expand Up @@ -269,55 +263,10 @@ func (am *Alertmanager) ApplyConfig(userID string, conf *config.Config, rawCfg s
go am.dispatcher.Run()
go am.inhibitor.Run()

// Ensure the alertmanager is set to active
am.activeMtx.Lock()
am.active = true
am.activeMtx.Unlock()

am.configHashMetric.Set(md5HashAsMetricValue([]byte(rawCfg)))
return nil
}

// IsActive reports whether the alertmanager is currently active (true)
// or has been paused (false). The flag is read under activeMtx.
func (am *Alertmanager) IsActive() bool {
	am.activeMtx.Lock()
	running := am.active
	am.activeMtx.Unlock()
	return running
}

// Pause marks the alertmanager as inactive, stops the jobs that can be
// restarted later, and expires every silence returned by the silence store.
func (am *Alertmanager) Pause() {
	// Flip the active flag first, under its mutex.
	am.activeMtx.Lock()
	am.active = false
	am.activeMtx.Unlock()

	// The inhibitor and dispatcher are stopped and dropped here; they are
	// recreated when a new config is applied.
	if inh := am.inhibitor; inh != nil {
		inh.Stop()
		am.inhibitor = nil
	}
	if disp := am.dispatcher; disp != nil {
		disp.Stop()
		am.dispatcher = nil
	}

	// Expire whatever silences the query returned. A failed query is only
	// logged; the loop below then simply has nothing to iterate over.
	found, _, queryErr := am.silences.Query()
	if queryErr != nil {
		level.Warn(am.logger).Log("msg", "unable to retrieve silences for removal", "err", queryErr)
	}
	for i := range found {
		id := found[i].Id
		if expireErr := am.silences.Expire(id); expireErr != nil {
			level.Warn(am.logger).Log("msg", "unable to remove silence", "err", expireErr, "silence", id)
		}
	}
}

// Stop stops the Alertmanager.
func (am *Alertmanager) Stop() {
if am.inhibitor != nil {
Expand All @@ -330,6 +279,10 @@ func (am *Alertmanager) Stop() {

am.alerts.Close()
close(am.stop)
}

// StopAndWait stops the Alertmanager by calling Stop and then blocks on
// am.wg until it is done.
// NOTE(review): which goroutines am.wg tracks is not visible here — confirm
// that everything started by the Alertmanager is registered with it.
func (am *Alertmanager) StopAndWait() {
am.Stop()
am.wg.Wait()
}

Expand Down
6 changes: 6 additions & 0 deletions pkg/alertmanager/alertmanager_metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -154,6 +154,12 @@ func (m *alertmanagerMetrics) addUserRegistry(user string, reg *prometheus.Regis
m.regs.AddUserRegistry(user, reg)
}

// removeUserRegistry removes the given user's registry from m.regs.
func (m *alertmanagerMetrics) removeUserRegistry(user string) {
// We need to go for a soft deletion here (second argument false), as hard
// deletion requires that _all_ metrics except gauges are per-user.
m.regs.RemoveUserRegistry(user, false)
}

func (m *alertmanagerMetrics) Describe(out chan<- *prometheus.Desc) {
out <- m.alertsReceived
out <- m.alertsInvalid
Expand Down
Loading