
feat: configurable provider channel buffer length #4364


Open · wants to merge 1 commit into main
17 changes: 9 additions & 8 deletions cmd/alertmanager/main.go
@@ -143,13 +143,14 @@ func run() int {
}

var (
configFile = kingpin.Flag("config.file", "Alertmanager configuration file name.").Default("alertmanager.yml").String()
dataDir = kingpin.Flag("storage.path", "Base path for data storage.").Default("data/").String()
retention = kingpin.Flag("data.retention", "How long to keep data for.").Default("120h").Duration()
maintenanceInterval = kingpin.Flag("data.maintenance-interval", "Interval between garbage collection and snapshotting to disk of the silences and the notification logs.").Default("15m").Duration()
maxSilences = kingpin.Flag("silences.max-silences", "Maximum number of silences, including expired silences. If negative or zero, no limit is set.").Default("0").Int()
maxSilenceSizeBytes = kingpin.Flag("silences.max-silence-size-bytes", "Maximum silence size in bytes. If negative or zero, no limit is set.").Default("0").Int()
alertGCInterval = kingpin.Flag("alerts.gc-interval", "Interval between alert GC.").Default("30m").Duration()
configFile = kingpin.Flag("config.file", "Alertmanager configuration file name.").Default("alertmanager.yml").String()
dataDir = kingpin.Flag("storage.path", "Base path for data storage.").Default("data/").String()
retention = kingpin.Flag("data.retention", "How long to keep data for.").Default("120h").Duration()
maintenanceInterval = kingpin.Flag("data.maintenance-interval", "Interval between garbage collection and snapshotting to disk of the silences and the notification logs.").Default("15m").Duration()
maxSilences = kingpin.Flag("silences.max-silences", "Maximum number of silences, including expired silences. If negative or zero, no limit is set.").Default("0").Int()
maxSilenceSizeBytes = kingpin.Flag("silences.max-silence-size-bytes", "Maximum silence size in bytes. If negative or zero, no limit is set.").Default("0").Int()
alertChannelBufferLength = kingpin.Flag("alerts.channel-buffer-length", "Alert channel buffer length").Default("200").Int()
alertGCInterval = kingpin.Flag("alerts.gc-interval", "Interval between alert GC.").Default("30m").Duration()

webConfig = webflag.AddFlags(kingpin.CommandLine, ":9093")
externalURL = kingpin.Flag("web.external-url", "The URL under which Alertmanager is externally reachable (for example, if Alertmanager is served via a reverse proxy). Used for generating relative and absolute links back to Alertmanager itself. If the URL has a path portion, it will be used to prefix all HTTP endpoints served by Alertmanager. If omitted, relevant URL components will be derived automatically.").String()
@@ -342,7 +343,7 @@ func run() int {
go peer.Settle(ctx, *gossipInterval*10)
}

alerts, err := mem.NewAlerts(context.Background(), marker, *alertGCInterval, nil, logger, prometheus.DefaultRegisterer)
alerts, err := mem.NewAlerts(context.Background(), marker, *alertChannelBufferLength, *alertGCInterval, nil, logger, prometheus.DefaultRegisterer)
if err != nil {
logger.Error("error creating memory provider", "err", err)
return 1
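With this change the previously hard-coded buffer of 200 becomes configurable at startup, for example `--alerts.channel-buffer-length=500` (the value 500 is only an example; the default stays at 200).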
12 changes: 7 additions & 5 deletions dispatch/dispatch_test.go
@@ -35,6 +35,8 @@ import (
"github.com/prometheus/alertmanager/types"
)

var alertChannelLength = 200

func TestAggrGroup(t *testing.T) {
lset := model.LabelSet{
"a": "v1",
@@ -391,7 +393,7 @@ route:
logger := promslog.NewNopLogger()
route := NewRoute(conf.Route, nil)
marker := types.NewMarker(prometheus.NewRegistry())
alerts, err := mem.NewAlerts(context.Background(), marker, time.Hour, nil, logger, nil)
alerts, err := mem.NewAlerts(context.Background(), marker, alertChannelLength, time.Hour, nil, logger, nil)
if err != nil {
t.Fatal(err)
}
@@ -541,7 +543,7 @@ route:
logger := promslog.NewNopLogger()
route := NewRoute(conf.Route, nil)
marker := types.NewMarker(prometheus.NewRegistry())
alerts, err := mem.NewAlerts(context.Background(), marker, time.Hour, nil, logger, nil)
alerts, err := mem.NewAlerts(context.Background(), marker, alertChannelLength, time.Hour, nil, logger, nil)
if err != nil {
t.Fatal(err)
}
@@ -662,7 +664,7 @@ func newAlert(labels model.LabelSet) *types.Alert {
func TestDispatcherRace(t *testing.T) {
logger := promslog.NewNopLogger()
marker := types.NewMarker(prometheus.NewRegistry())
alerts, err := mem.NewAlerts(context.Background(), marker, time.Hour, nil, logger, nil)
alerts, err := mem.NewAlerts(context.Background(), marker, alertChannelLength, time.Hour, nil, logger, nil)
if err != nil {
t.Fatal(err)
}
@@ -679,7 +681,7 @@ func TestDispatcherRaceOnFirstAlertNotDeliveredWhenGroupWaitIsZero(t *testing.T)

logger := promslog.NewNopLogger()
marker := types.NewMarker(prometheus.NewRegistry())
alerts, err := mem.NewAlerts(context.Background(), marker, time.Hour, nil, logger, nil)
alerts, err := mem.NewAlerts(context.Background(), marker, alertChannelLength, time.Hour, nil, logger, nil)
if err != nil {
t.Fatal(err)
}
@@ -733,7 +735,7 @@ func TestDispatcher_DoMaintenance(t *testing.T) {
r := prometheus.NewRegistry()
marker := types.NewMarker(r)

alerts, err := mem.NewAlerts(context.Background(), marker, time.Minute, nil, promslog.NewNopLogger(), nil)
alerts, err := mem.NewAlerts(context.Background(), marker, alertChannelLength, time.Minute, nil, promslog.NewNopLogger(), nil)
if err != nil {
t.Fatal(err)
}
2 changes: 1 addition & 1 deletion inhibit/inhibit_bench_test.go
@@ -184,7 +184,7 @@ func lastRuleMatchesBenchmark(b *testing.B, n int) benchmarkOptions {
func benchmarkMutes(b *testing.B, opts benchmarkOptions) {
r := prometheus.NewRegistry()
m := types.NewMarker(r)
s, err := mem.NewAlerts(context.TODO(), m, time.Minute, nil, promslog.NewNopLogger(), r)
s, err := mem.NewAlerts(context.TODO(), m, 200, time.Minute, nil, promslog.NewNopLogger(), r)
if err != nil {
b.Fatal(err)
}
33 changes: 19 additions & 14 deletions provider/mem/mem.go
@@ -15,6 +15,7 @@ package mem

import (
"context"
"fmt"
"log/slog"
"sync"
"time"
@@ -27,8 +28,6 @@ import (
"github.com/prometheus/alertmanager/types"
)

const alertChannelLength = 200

// Alerts gives access to a set of alerts. All methods are goroutine-safe.
type Alerts struct {
cancel context.CancelFunc
@@ -38,8 +37,9 @@ type Alerts struct {
alerts *store.Alerts
marker types.AlertMarker

listeners map[int]listeningAlerts
next int
channelLength int
listeners map[int]listeningAlerts
next int

callback AlertStoreCallback

@@ -85,20 +85,25 @@ func (a *Alerts) registerMetrics(r prometheus.Registerer) {
}

// NewAlerts returns a new alert provider.
func NewAlerts(ctx context.Context, m types.AlertMarker, intervalGC time.Duration, alertCallback AlertStoreCallback, l *slog.Logger, r prometheus.Registerer) (*Alerts, error) {
func NewAlerts(ctx context.Context, m types.AlertMarker, channelLength int, intervalGC time.Duration, alertCallback AlertStoreCallback, l *slog.Logger, r prometheus.Registerer) (*Alerts, error) {
if alertCallback == nil {
alertCallback = noopCallback{}
}

if channelLength < 1 {
return nil, fmt.Errorf("channel length has to be > zero, provided %d", channelLength)
}

ctx, cancel := context.WithCancel(ctx)
a := &Alerts{
marker: m,
alerts: store.NewAlerts(),
cancel: cancel,
listeners: map[int]listeningAlerts{},
next: 0,
logger: l.With("component", "provider"),
callback: alertCallback,
marker: m,
alerts: store.NewAlerts(),
cancel: cancel,
channelLength: channelLength,
listeners: map[int]listeningAlerts{},
next: 0,
logger: l.With("component", "provider"),
callback: alertCallback,
}

if r != nil {
@@ -170,7 +175,7 @@ func (a *Alerts) Subscribe() provider.AlertIterator {
var (
done = make(chan struct{})
alerts = a.alerts.List()
ch = make(chan *types.Alert, max(len(alerts), alertChannelLength))
ch = make(chan *types.Alert, max(len(alerts), a.channelLength))
)

for _, a := range alerts {
@@ -187,7 +192,7 @@ func (a *Alerts) Subscribe() provider.AlertIterator {
// pending notifications.
func (a *Alerts) GetPending() provider.AlertIterator {
var (
ch = make(chan *types.Alert, alertChannelLength)
ch = make(chan *types.Alert, a.channelLength)
done = make(chan struct{})
)
a.mtx.Lock()
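For orientation, a minimal sketch of calling the updated constructor with the new buffer-length argument; the buffer length of 500, the 30-minute GC interval, and the nil callback/registerer are illustrative values, not part of this change:

```go
package main

import (
	"context"
	"log"
	"time"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/common/promslog"

	"github.com/prometheus/alertmanager/provider/mem"
	"github.com/prometheus/alertmanager/types"
)

func main() {
	marker := types.NewMarker(prometheus.NewRegistry())

	// The third argument is the new per-subscriber channel buffer length.
	// NewAlerts now returns an error for values below 1.
	alerts, err := mem.NewAlerts(context.Background(), marker, 500, 30*time.Minute, nil, promslog.NewNopLogger(), nil)
	if err != nil {
		log.Fatalf("creating memory provider: %v", err)
	}
	defer alerts.Close()

	// Subscribe returns an iterator whose channel is buffered to
	// max(number of currently stored alerts, configured buffer length).
	it := alerts.Subscribe()
	defer it.Close()
}
```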
20 changes: 11 additions & 9 deletions provider/mem/mem_test.go
@@ -35,6 +35,8 @@ import (
)

var (
alertChannelLength = 200

t0 = time.Now()
t1 = t0.Add(100 * time.Millisecond)

@@ -87,7 +89,7 @@ func init() {
// a listener can not unsubscribe as the lock is hold by `alerts.Lock`.
func TestAlertsSubscribePutStarvation(t *testing.T) {
marker := types.NewMarker(prometheus.NewRegistry())
alerts, err := NewAlerts(context.Background(), marker, 30*time.Minute, noopCallback{}, promslog.NewNopLogger(), nil)
alerts, err := NewAlerts(context.Background(), marker, alertChannelLength, 30*time.Minute, noopCallback{}, promslog.NewNopLogger(), nil)
if err != nil {
t.Fatal(err)
}
@@ -142,7 +144,7 @@ func TestDeadLock(t *testing.T) {

marker := types.NewMarker(prometheus.NewRegistry())
// Run gc every 5 milliseconds to increase the possibility of a deadlock with Subscribe()
alerts, err := NewAlerts(context.Background(), marker, 5*time.Millisecond, noopCallback{}, promslog.NewNopLogger(), nil)
alerts, err := NewAlerts(context.Background(), marker, alertChannelLength, 5*time.Millisecond, noopCallback{}, promslog.NewNopLogger(), nil)
if err != nil {
t.Fatal(err)
}
@@ -195,7 +197,7 @@

func TestAlertsPut(t *testing.T) {
marker := types.NewMarker(prometheus.NewRegistry())
alerts, err := NewAlerts(context.Background(), marker, 30*time.Minute, noopCallback{}, promslog.NewNopLogger(), nil)
alerts, err := NewAlerts(context.Background(), marker, alertChannelLength, 30*time.Minute, noopCallback{}, promslog.NewNopLogger(), nil)
if err != nil {
t.Fatal(err)
}
@@ -223,7 +225,7 @@ func TestAlertsSubscribe(t *testing.T) {

ctx, cancel := context.WithCancel(context.Background())
defer cancel()
alerts, err := NewAlerts(ctx, marker, 30*time.Minute, noopCallback{}, promslog.NewNopLogger(), nil)
alerts, err := NewAlerts(ctx, marker, alertChannelLength, 30*time.Minute, noopCallback{}, promslog.NewNopLogger(), nil)
if err != nil {
t.Fatal(err)
}
@@ -300,7 +302,7 @@

func TestAlertsGetPending(t *testing.T) {
marker := types.NewMarker(prometheus.NewRegistry())
alerts, err := NewAlerts(context.Background(), marker, 30*time.Minute, noopCallback{}, promslog.NewNopLogger(), nil)
alerts, err := NewAlerts(context.Background(), marker, alertChannelLength, 30*time.Minute, noopCallback{}, promslog.NewNopLogger(), nil)
if err != nil {
t.Fatal(err)
}
@@ -343,7 +345,7 @@ func TestAlertsGetPending(t *testing.T) {

func TestAlertsGC(t *testing.T) {
marker := types.NewMarker(prometheus.NewRegistry())
alerts, err := NewAlerts(context.Background(), marker, 200*time.Millisecond, noopCallback{}, promslog.NewNopLogger(), nil)
alerts, err := NewAlerts(context.Background(), marker, alertChannelLength, 200*time.Millisecond, noopCallback{}, promslog.NewNopLogger(), nil)
if err != nil {
t.Fatal(err)
}
@@ -380,7 +382,7 @@ func TestAlertsStoreCallback(t *testing.T) {
cb := &limitCountCallback{limit: 3}

marker := types.NewMarker(prometheus.NewRegistry())
alerts, err := NewAlerts(context.Background(), marker, 200*time.Millisecond, cb, promslog.NewNopLogger(), nil)
alerts, err := NewAlerts(context.Background(), marker, alertChannelLength, 200*time.Millisecond, cb, promslog.NewNopLogger(), nil)
if err != nil {
t.Fatal(err)
}
@@ -443,7 +445,7 @@ func TestAlertsStoreCallback(t *testing.T) {

func TestAlerts_Count(t *testing.T) {
marker := types.NewMarker(prometheus.NewRegistry())
alerts, err := NewAlerts(context.Background(), marker, 200*time.Millisecond, nil, promslog.NewNopLogger(), nil)
alerts, err := NewAlerts(context.Background(), marker, alertChannelLength, 200*time.Millisecond, nil, promslog.NewNopLogger(), nil)
require.NoError(t, err)

states := []types.AlertState{types.AlertStateActive, types.AlertStateSuppressed, types.AlertStateUnprocessed}
@@ -565,7 +567,7 @@ func (l *limitCountCallback) PostDelete(_ *types.Alert) {

func TestAlertsConcurrently(t *testing.T) {
callback := &limitCountCallback{limit: 100}
a, err := NewAlerts(context.Background(), types.NewMarker(prometheus.NewRegistry()), time.Millisecond, callback, promslog.NewNopLogger(), nil)
a, err := NewAlerts(context.Background(), types.NewMarker(prometheus.NewRegistry()), alertChannelLength, time.Millisecond, callback, promslog.NewNopLogger(), nil)
require.NoError(t, err)

stopc := make(chan struct{})