-
Notifications
You must be signed in to change notification settings - Fork 820
Delete alertmanager state objects from remote storage on user deletion. #4167
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -663,7 +663,7 @@ func (am *MultitenantAlertmanager) loadAndSyncConfigs(ctx context.Context, syncR | |
level.Info(am.logger).Log("msg", "synchronizing alertmanager configs for users") | ||
am.syncTotal.WithLabelValues(syncReason).Inc() | ||
|
||
cfgs, err := am.loadAlertmanagerConfigs(ctx) | ||
allUsers, cfgs, err := am.loadAlertmanagerConfigs(ctx) | ||
if err != nil { | ||
am.syncFailures.WithLabelValues(syncReason).Inc() | ||
return err | ||
|
@@ -672,6 +672,13 @@ func (am *MultitenantAlertmanager) loadAndSyncConfigs(ctx context.Context, syncR | |
am.syncConfigs(cfgs) | ||
am.deleteUnusedLocalUserState() | ||
|
||
// Currently, remote state persistence is only used when sharding is enabled. | ||
if am.cfg.ShardingEnabled { | ||
// Note when cleaning up remote state, remember that the user may not necessarily be configured | ||
// in this instance. Therefore, pass the list of _all_ configured users to filter by. | ||
am.deleteUnusedRemoteUserState(ctx, allUsers) | ||
} | ||
|
||
return nil | ||
} | ||
|
||
|
@@ -696,35 +703,35 @@ func (am *MultitenantAlertmanager) stopping(_ error) error { | |
return nil | ||
} | ||
|
||
// loadAlertmanagerConfigs Loads (and filters) the alertmanagers configuration from object storage, taking into consideration the sharding strategy. | ||
func (am *MultitenantAlertmanager) loadAlertmanagerConfigs(ctx context.Context) (map[string]alertspb.AlertConfigDesc, error) { | ||
// loadAlertmanagerConfigs Loads (and filters) the alertmanagers configuration from object storage, taking into consideration the sharding strategy. Returns: | ||
// - The list of discovered users (all users with a configuration in storage) | ||
// - The configurations of users owned by this instance. | ||
func (am *MultitenantAlertmanager) loadAlertmanagerConfigs(ctx context.Context) ([]string, map[string]alertspb.AlertConfigDesc, error) { | ||
stevesg marked this conversation as resolved.
Show resolved
Hide resolved
|
||
// Find all users with an alertmanager config. | ||
userIDs, err := am.store.ListAllUsers(ctx) | ||
allUserIDs, err := am.store.ListAllUsers(ctx) | ||
if err != nil { | ||
return nil, errors.Wrap(err, "failed to list users with alertmanager configuration") | ||
return nil, nil, errors.Wrap(err, "failed to list users with alertmanager configuration") | ||
} | ||
numUsersDiscovered := len(userIDs) | ||
numUsersDiscovered := len(allUserIDs) | ||
ownedUserIDs := make([]string, 0, len(allUserIDs)) | ||
|
||
// Filter out users not owned by this shard. | ||
for i := 0; i < len(userIDs); { | ||
if !am.isUserOwned(userIDs[i]) { | ||
userIDs = append(userIDs[:i], userIDs[i+1:]...) | ||
continue | ||
for _, userID := range allUserIDs { | ||
if am.isUserOwned(userID) { | ||
ownedUserIDs = append(ownedUserIDs, userID) | ||
} | ||
|
||
i++ | ||
} | ||
numUsersOwned := len(userIDs) | ||
numUsersOwned := len(ownedUserIDs) | ||
|
||
// Load the configs for the owned users. | ||
configs, err := am.store.GetAlertConfigs(ctx, userIDs) | ||
configs, err := am.store.GetAlertConfigs(ctx, ownedUserIDs) | ||
if err != nil { | ||
return nil, errors.Wrapf(err, "failed to load alertmanager configurations for owned users") | ||
return nil, nil, errors.Wrapf(err, "failed to load alertmanager configurations for owned users") | ||
} | ||
|
||
am.tenantsDiscovered.Set(float64(numUsersDiscovered)) | ||
am.tenantsOwned.Set(float64(numUsersOwned)) | ||
return configs, nil | ||
return allUserIDs, configs, nil | ||
} | ||
|
||
func (am *MultitenantAlertmanager) isUserOwned(userID string) bool { | ||
|
@@ -1147,6 +1154,34 @@ func (am *MultitenantAlertmanager) UpdateState(ctx context.Context, part *cluste | |
return &alertmanagerpb.UpdateStateResponse{Status: alertmanagerpb.OK}, nil | ||
} | ||
|
||
// deleteUnusedRemoteUserState deletes state objects in remote storage for users that are no longer configured. | ||
func (am *MultitenantAlertmanager) deleteUnusedRemoteUserState(ctx context.Context, allUsers []string) { | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. [food for thought] This will be executed on every alertmanager replica, so the number of @pstibrany @stevesg What's your take on this? I'm dubious, so asking. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. It only needs to be done by one/any instance, in fact it's a completely independent process. I don't have a good answer as to whether the saving is worth the added complexity though. One bucket iteration per There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Let's wait for @pstibrany feedback. We need the 2nd reviewer anyway, so I wouldn't be able to merge the PR right now. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I don't have a strong opinion on whether to do it by single instance or all... doing it by all is certainly easier. To reduce the API operations, we can do it less often than on every sync, and add some jitter to the process. |
||
|
||
users := make(map[string]struct{}, len(allUsers)) | ||
for _, userID := range allUsers { | ||
users[userID] = struct{}{} | ||
} | ||
|
||
usersWithState, err := am.store.ListUsersWithFullState(ctx) | ||
if err != nil { | ||
level.Warn(am.logger).Log("msg", "failed to list users with state", "err", err) | ||
return | ||
} | ||
|
||
for _, userID := range usersWithState { | ||
if _, ok := users[userID]; ok { | ||
continue | ||
} | ||
|
||
err := am.store.DeleteFullState(ctx, userID) | ||
if err != nil { | ||
level.Warn(am.logger).Log("msg", "failed to delete remote state for user", "user", userID, "err", err) | ||
} else { | ||
level.Info(am.logger).Log("msg", "deleted remote state for user", "user", userID) | ||
} | ||
} | ||
} | ||
|
||
// deleteUnusedLocalUserState deletes local files for users that we no longer need. | ||
func (am *MultitenantAlertmanager) deleteUnusedLocalUserState() { | ||
userDirs := am.getPerUserDirectories() | ||
|
Uh oh!
There was an error while loading. Please reload this page.