Alertmanager alerts limits #4253

Merged · 9 commits · Jun 10, 2021
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -39,6 +39,7 @@
* `memberlist_client_kv_store_value_tombstones_removed_total`
* `memberlist_client_messages_to_broadcast_dropped_total`
* [ENHANCEMENT] Alertmanager: Added `-alertmanager.max-dispatcher-aggregation-groups` option to control max number of active dispatcher groups in Alertmanager (per tenant, also overrideable). When the limit is reached, Dispatcher produces log message and increases `cortex_alertmanager_dispatcher_aggregation_group_limit_reached_total` metric. #4254
* [ENHANCEMENT] Alertmanager: Added `-alertmanager.max-alerts-count` and `-alertmanager.max-alerts-size-bytes` to control the max number of alerts and the total size of alerts that a single user can have in Alertmanager's memory. Adding alerts beyond these limits fails with a log message and increments the `cortex_alertmanager_alerts_insert_limited_total` metric (per-user). These limits can be overridden using per-tenant overrides. Current values are tracked in the `cortex_alertmanager_alerts_limiter_current_alerts` and `cortex_alertmanager_alerts_limiter_current_alerts_size_bytes` metrics. #4253
* [BUGFIX] Purger: fix `Invalid null value in condition for column range` caused by `nil` value in range for WriteBatch query. #4128
* [BUGFIX] Ingester: fixed infrequent panic caused by a race condition between TSDB mmap-ed head chunks truncation and queries. #4176
* [BUGFIX] Alertmanager: fix Alertmanager status page if clustering via gossip is disabled or sharding is enabled. #4184
11 changes: 11 additions & 0 deletions docs/configuration/config-file-reference.md
@@ -4180,6 +4180,17 @@ The `limits_config` configures default and per-tenant limits imposed by Cortex s
# 0 = no limit.
# CLI flag: -alertmanager.max-dispatcher-aggregation-groups
[alertmanager_max_dispatcher_aggregation_groups: <int> | default = 0]

# Maximum number of alerts that a single user can have. Inserting more alerts
# will fail with a log message and metric increment. 0 = no limit.
# CLI flag: -alertmanager.max-alerts-count
[alertmanager_max_alerts_count: <int> | default = 0]

# Maximum total size of alerts that a single user can have; an alert's size is
# the sum of the bytes of its labels, annotations and generatorURL. Inserting
# more alerts will fail with a log message and metric increment. 0 = no limit.
# CLI flag: -alertmanager.max-alerts-size-bytes
[alertmanager_max_alerts_size_bytes: <int> | default = 0]
```

### `redis_config`
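For illustration only (not part of this diff): a hypothetical sketch of how the two new limits could be configured, assuming the usual Cortex `limits` block for global defaults and the runtime-overrides file (`overrides:` keyed by tenant ID) for per-tenant values. The tenant name and numbers here are made up.

```yaml
# Global defaults (limits_config), applied to every tenant.
limits:
  alertmanager_max_alerts_count: 5000
  alertmanager_max_alerts_size_bytes: 5242880      # 5 MiB

# Per-tenant override in the runtime configuration file; "tenant-a" is a
# made-up tenant ID used only for this example.
overrides:
  tenant-a:
    alertmanager_max_alerts_count: 20000
    alertmanager_max_alerts_size_bytes: 20971520   # 20 MiB
```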
146 changes: 145 additions & 1 deletion pkg/alertmanager/alertmanager.go
@@ -166,6 +166,7 @@ func New(cfg *Config, reg *prometheus.Registry) (*Alertmanager, error) {
Name: "alertmanager_notification_rate_limited_total",
Help: "Number of rate-limited notifications per integration.",
}, []string{"integration"}), // "integration" is consistent with other alertmanager metrics.

}

am.registry = reg
@@ -241,7 +242,12 @@ func New(cfg *Config, reg *prometheus.Registry) (*Alertmanager, error) {
am.wg.Done()
}()

am.alerts, err = mem.NewAlerts(context.Background(), am.marker, 30*time.Minute, nil, am.logger)
var callback mem.AlertStoreCallback
if am.cfg.Limits != nil {
callback = newAlertsLimiter(am.cfg.UserID, am.cfg.Limits, reg)
}

am.alerts, err = mem.NewAlerts(context.Background(), am.marker, 30*time.Minute, callback, am.logger)
if err != nil {
return nil, fmt.Errorf("failed to create alerts: %v", err)
}
@@ -584,3 +590,141 @@ type dispatcherLimits struct {
func (g *dispatcherLimits) MaxNumberOfAggregationGroups() int {
return g.limits.AlertmanagerMaxDispatcherAggregationGroups(g.tenant)
}

var (
errTooManyAlerts = "too many alerts, limit: %d"
errAlertsTooBig = "alerts too big, total size limit: %d bytes"
)

// alertsLimiter limits the number and size of alerts being received by the Alertmanager.
// We consider an alert unique based on its fingerprint (a hash of its labels);
// its size is determined by the sum of bytes of its labels, annotations, and generator URL.
type alertsLimiter struct {
tenant string
limits Limits

failureCounter prometheus.Counter

mx sync.Mutex
sizes map[model.Fingerprint]int
count int
totalSize int
}

func newAlertsLimiter(tenant string, limits Limits, reg prometheus.Registerer) *alertsLimiter {
limiter := &alertsLimiter{
tenant: tenant,
limits: limits,
sizes: map[model.Fingerprint]int{},
failureCounter: promauto.With(reg).NewCounter(prometheus.CounterOpts{
Name: "alertmanager_alerts_insert_limited_total",
Help: "Number of failures to insert new alerts to in-memory alert store.",
}),
}

promauto.With(reg).NewGaugeFunc(prometheus.GaugeOpts{
Name: "alertmanager_alerts_limiter_current_alerts",
Help: "Number of alerts tracked by alerts limiter.",
}, func() float64 {
c, _ := limiter.currentStats()
return float64(c)
})

promauto.With(reg).NewGaugeFunc(prometheus.GaugeOpts{
Name: "alertmanager_alerts_limiter_current_alerts_size_bytes",
Help: "Total size of alerts tracked by alerts limiter.",
}, func() float64 {
_, s := limiter.currentStats()
return float64(s)
})

return limiter
}

func (a *alertsLimiter) PreStore(alert *types.Alert, existing bool) error {
if alert == nil {
return nil
}

fp := alert.Fingerprint()

countLimit := a.limits.AlertmanagerMaxAlertsCount(a.tenant)
sizeLimit := a.limits.AlertmanagerMaxAlertsSizeBytes(a.tenant)

sizeDiff := alertSize(alert.Alert)

a.mx.Lock()
defer a.mx.Unlock()

if !existing && countLimit > 0 && (a.count+1) > countLimit {
a.failureCounter.Inc()
return fmt.Errorf(errTooManyAlerts, countLimit)
}

if existing {
sizeDiff -= a.sizes[fp]
}

if sizeLimit > 0 && (a.totalSize+sizeDiff) > sizeLimit {
a.failureCounter.Inc()
return fmt.Errorf(errAlertsTooBig, sizeLimit)
}

return nil
}

func (a *alertsLimiter) PostStore(alert *types.Alert, existing bool) {
if alert == nil {
return
}

newSize := alertSize(alert.Alert)
fp := alert.Fingerprint()

a.mx.Lock()
defer a.mx.Unlock()

if existing {
a.totalSize -= a.sizes[fp]
} else {
a.count++
}
a.sizes[fp] = newSize
a.totalSize += newSize
}

func (a *alertsLimiter) PostDelete(alert *types.Alert) {
if alert == nil {
return
}

fp := alert.Fingerprint()

a.mx.Lock()
defer a.mx.Unlock()

a.totalSize -= a.sizes[fp]
delete(a.sizes, fp)
a.count--
}

func (a *alertsLimiter) currentStats() (count, totalSize int) {
a.mx.Lock()
defer a.mx.Unlock()

return a.count, a.totalSize
}

func alertSize(alert model.Alert) int {
size := 0
for l, v := range alert.Labels {
size += len(l)
size += len(v)
}
for l, v := range alert.Annotations {
size += len(l)
size += len(v)
}
size += len(alert.GeneratorURL)
return size
}
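For illustration only (not part of this diff): a self-contained sketch of the size accounting that `alertSize` implements, applied to a made-up alert. It reuses the same summation over label, annotation, and GeneratorURL bytes shown above and only depends on `github.com/prometheus/common/model`.

```go
package main

import (
	"fmt"

	"github.com/prometheus/common/model"
)

// alertSize mirrors the helper above: the size of an alert is the sum of the
// bytes of its label names/values, annotation names/values, and GeneratorURL.
func alertSize(alert model.Alert) int {
	size := 0
	for l, v := range alert.Labels {
		size += len(l) + len(v)
	}
	for l, v := range alert.Annotations {
		size += len(l) + len(v)
	}
	size += len(alert.GeneratorURL)
	return size
}

func main() {
	// Made-up alert, purely for illustration.
	a := model.Alert{
		Labels:       model.LabelSet{"alertname": "HighLatency", "severity": "page"},
		Annotations:  model.LabelSet{"summary": "p99 latency above 2s"},
		GeneratorURL: "http://prometheus.example/graph",
	}
	// Prints the byte total that would count against the per-tenant
	// -alertmanager.max-alerts-size-bytes limit for this alert.
	fmt.Println(alertSize(a))
}
```

This is the same total that `PreStore` compares against the configured size limit (adjusted for an existing alert's previous size) before admitting an alert into the in-memory store.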
21 changes: 21 additions & 0 deletions pkg/alertmanager/alertmanager_metrics.go
@@ -61,6 +61,9 @@ type alertmanagerMetrics struct {

notificationRateLimited *prometheus.Desc
dispatcherAggregationGroupsLimitReached *prometheus.Desc
insertAlertFailures *prometheus.Desc
alertsLimiterAlertsCount *prometheus.Desc
alertsLimiterAlertsSize *prometheus.Desc
}

func newAlertmanagerMetrics() *alertmanagerMetrics {
@@ -214,6 +217,18 @@ func newAlertmanagerMetrics() *alertmanagerMetrics {
"cortex_alertmanager_dispatcher_aggregation_group_limit_reached_total",
"Number of times when dispatcher failed to create new aggregation group due to limit.",
[]string{"user"}, nil),
insertAlertFailures: prometheus.NewDesc(
"cortex_alertmanager_alerts_insert_limited_total",
"Total number of failures to store alert due to hitting alertmanager limits.",
[]string{"user"}, nil),
alertsLimiterAlertsCount: prometheus.NewDesc(
"cortex_alertmanager_alerts_limiter_current_alerts",
"Number of alerts tracked by alerts limiter.",
[]string{"user"}, nil),
alertsLimiterAlertsSize: prometheus.NewDesc(
"cortex_alertmanager_alerts_limiter_current_alerts_size_bytes",
"Total size of alerts tracked by alerts limiter.",
[]string{"user"}, nil),
}
}

@@ -265,6 +280,9 @@ func (m *alertmanagerMetrics) Describe(out chan<- *prometheus.Desc) {
out <- m.persistFailed
out <- m.notificationRateLimited
out <- m.dispatcherAggregationGroupsLimitReached
out <- m.insertAlertFailures
out <- m.alertsLimiterAlertsCount
out <- m.alertsLimiterAlertsSize
}

func (m *alertmanagerMetrics) Collect(out chan<- prometheus.Metric) {
@@ -313,4 +331,7 @@ func (m *alertmanagerMetrics) Collect(out chan<- prometheus.Metric) {

data.SendSumOfCountersPerUserWithLabels(out, m.notificationRateLimited, "alertmanager_notification_rate_limited_total", "integration")
data.SendSumOfCountersPerUser(out, m.dispatcherAggregationGroupsLimitReached, "alertmanager_dispatcher_aggregation_group_limit_reached_total")
data.SendSumOfCountersPerUser(out, m.insertAlertFailures, "alertmanager_alerts_insert_limited_total")
data.SendSumOfGaugesPerUser(out, m.alertsLimiterAlertsCount, "alertmanager_alerts_limiter_current_alerts")
data.SendSumOfGaugesPerUser(out, m.alertsLimiterAlertsSize, "alertmanager_alerts_limiter_current_alerts_size_bytes")
}
80 changes: 80 additions & 0 deletions pkg/alertmanager/alertmanager_metrics_test.go
@@ -274,6 +274,22 @@ func TestAlertmanagerMetricsStore(t *testing.T) {
# HELP cortex_alertmanager_state_persist_total Number of times we have tried to persist the running state to storage.
# TYPE cortex_alertmanager_state_persist_total counter
cortex_alertmanager_state_persist_total 0

# HELP cortex_alertmanager_alerts_limiter_current_alerts Number of alerts tracked by alerts limiter.
# TYPE cortex_alertmanager_alerts_limiter_current_alerts gauge
cortex_alertmanager_alerts_limiter_current_alerts{user="user1"} 10
cortex_alertmanager_alerts_limiter_current_alerts{user="user2"} 100
cortex_alertmanager_alerts_limiter_current_alerts{user="user3"} 1000
# HELP cortex_alertmanager_alerts_limiter_current_alerts_size_bytes Total size of alerts tracked by alerts limiter.
# TYPE cortex_alertmanager_alerts_limiter_current_alerts_size_bytes gauge
cortex_alertmanager_alerts_limiter_current_alerts_size_bytes{user="user1"} 100
cortex_alertmanager_alerts_limiter_current_alerts_size_bytes{user="user2"} 1000
cortex_alertmanager_alerts_limiter_current_alerts_size_bytes{user="user3"} 10000
# HELP cortex_alertmanager_alerts_insert_limited_total Total number of failures to store alert due to hitting alertmanager limits.
# TYPE cortex_alertmanager_alerts_insert_limited_total counter
cortex_alertmanager_alerts_insert_limited_total{user="user1"} 7
cortex_alertmanager_alerts_insert_limited_total{user="user2"} 70
cortex_alertmanager_alerts_insert_limited_total{user="user3"} 700
`))
require.NoError(t, err)
}
@@ -557,6 +573,23 @@ func TestAlertmanagerMetricsRemoval(t *testing.T) {
# HELP cortex_alertmanager_state_persist_total Number of times we have tried to persist the running state to storage.
# TYPE cortex_alertmanager_state_persist_total counter
cortex_alertmanager_state_persist_total 0

# HELP cortex_alertmanager_alerts_limiter_current_alerts Number of alerts tracked by alerts limiter.
# TYPE cortex_alertmanager_alerts_limiter_current_alerts gauge
cortex_alertmanager_alerts_limiter_current_alerts{user="user1"} 10
cortex_alertmanager_alerts_limiter_current_alerts{user="user2"} 100
cortex_alertmanager_alerts_limiter_current_alerts{user="user3"} 1000
# HELP cortex_alertmanager_alerts_limiter_current_alerts_size_bytes Total size of alerts tracked by alerts limiter.
# TYPE cortex_alertmanager_alerts_limiter_current_alerts_size_bytes gauge
cortex_alertmanager_alerts_limiter_current_alerts_size_bytes{user="user1"} 100
cortex_alertmanager_alerts_limiter_current_alerts_size_bytes{user="user2"} 1000
cortex_alertmanager_alerts_limiter_current_alerts_size_bytes{user="user3"} 10000
# HELP cortex_alertmanager_alerts_insert_limited_total Total number of failures to store alert due to hitting alertmanager limits.
# TYPE cortex_alertmanager_alerts_insert_limited_total counter
cortex_alertmanager_alerts_insert_limited_total{user="user1"} 7
cortex_alertmanager_alerts_insert_limited_total{user="user2"} 70
cortex_alertmanager_alerts_insert_limited_total{user="user3"} 700

`))
require.NoError(t, err)

@@ -788,6 +821,19 @@ func TestAlertmanagerMetricsRemoval(t *testing.T) {
# HELP cortex_alertmanager_state_persist_total Number of times we have tried to persist the running state to storage.
# TYPE cortex_alertmanager_state_persist_total counter
cortex_alertmanager_state_persist_total 0

# HELP cortex_alertmanager_alerts_limiter_current_alerts Number of alerts tracked by alerts limiter.
# TYPE cortex_alertmanager_alerts_limiter_current_alerts gauge
cortex_alertmanager_alerts_limiter_current_alerts{user="user1"} 10
cortex_alertmanager_alerts_limiter_current_alerts{user="user2"} 100
# HELP cortex_alertmanager_alerts_limiter_current_alerts_size_bytes Total size of alerts tracked by alerts limiter.
# TYPE cortex_alertmanager_alerts_limiter_current_alerts_size_bytes gauge
cortex_alertmanager_alerts_limiter_current_alerts_size_bytes{user="user1"} 100
cortex_alertmanager_alerts_limiter_current_alerts_size_bytes{user="user2"} 1000
# HELP cortex_alertmanager_alerts_insert_limited_total Total number of failures to store alert due to hitting alertmanager limits.
# TYPE cortex_alertmanager_alerts_insert_limited_total counter
cortex_alertmanager_alerts_insert_limited_total{user="user1"} 7
cortex_alertmanager_alerts_insert_limited_total{user="user2"} 70
`))
require.NoError(t, err)
}
@@ -838,6 +884,11 @@ func populateAlertmanager(base float64) *prometheus.Registry {
v2APIMetrics.invalid.Add(base)
v2APIMetrics.resolved.Add(base * 3)

lm := newLimiterMetrics(reg)
lm.count.Set(10 * base)
lm.size.Set(100 * base)
lm.insertFailures.Add(7 * base)

return reg
}

@@ -1041,3 +1092,32 @@ func newAPIMetrics(version string, r prometheus.Registerer) *apiMetrics {
invalid: numInvalidAlerts,
}
}

type limiterMetrics struct {
count prometheus.Gauge
size prometheus.Gauge
insertFailures prometheus.Counter
}

func newLimiterMetrics(r prometheus.Registerer) *limiterMetrics {
count := promauto.With(r).NewGauge(prometheus.GaugeOpts{
Name: "alertmanager_alerts_limiter_current_alerts",
Help: "Number of alerts tracked by alerts limiter.",
})

size := promauto.With(r).NewGauge(prometheus.GaugeOpts{
Name: "alertmanager_alerts_limiter_current_alerts_size_bytes",
Help: "Total size of alerts tracked by alerts limiter.",
})

insertAlertFailures := promauto.With(r).NewCounter(prometheus.CounterOpts{
Name: "alertmanager_alerts_insert_limited_total",
Help: "Number of failures to insert new alerts to in-memory alert store.",
})

return &limiterMetrics{
count: count,
size: size,
insertFailures: insertAlertFailures,
}
}