-
Notifications
You must be signed in to change notification settings - Fork 103
Add a safety check before changing coordinators #2373
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
|
@@ -126,6 +126,13 @@ type FoundationDBClusterReconciler struct { | |||||
// wait time will increase the chances that all updates are part of the list but will also delay the rollout of | ||||||
// the change. | ||||||
GlobalSynchronizationWaitDuration time.Duration | ||||||
// MinimumUptimeForCoordinatorChangeWithMissingProcess defines the minimum uptime of the cluster before coordinator | ||||||
// changes because of a missing coordinator are allowed. | ||||||
MinimumUptimeForCoordinatorChangeWithMissingProcess time.Duration | ||||||
// MinimumUptimeForCoordinatorChangeWithUndesiredProcess defines the minimum uptime of the cluster before coordinator | ||||||
// changes because of an undesired coordinator are allowed. | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
Is there a different meaning of undesired other than excluded here? If not, feels like we should just use "excluded" |
||||||
MinimumUptimeForCoordinatorChangeWithUndesiredProcess time.Duration | ||||||
|
||||||
// MinimumRecoveryTimeForInclusion defines the duration in seconds that a cluster must be up | ||||||
// before new inclusions are allowed. The operator issuing frequent inclusions in a short time window | ||||||
// could cause instability for the cluster as each inclusion will/can cause a recovery. Delaying the inclusion | ||||||
|
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
|
@@ -25,6 +25,7 @@ import ( | |||||
"math" | ||||||
"strconv" | ||||||
"strings" | ||||||
"time" | ||||||
|
||||||
fdbv1beta2 "github.com/FoundationDB/fdb-kubernetes-operator/v2/api/v1beta2" | ||||||
"github.com/FoundationDB/fdb-kubernetes-operator/v2/pkg/fdbadminclient" | ||||||
|
@@ -840,3 +841,87 @@ func ClusterIsConfigured( | |||||
status.Client.DatabaseStatus.Available && | ||||||
status.Cluster.Layers.Error != "configurationMissing" | ||||||
} | ||||||
|
||||||
// CanSafelyChangeCoordinators returns nil when it is safe to change coordinators in the cluster or returns an error | ||||||
// with more information why it's not safe to change coordinators. This function differentiates between missing (down) | ||||||
// processes and processes that are only excluded or undesired, applying different minimum uptime requirements for each case. | ||||||
func CanSafelyChangeCoordinators( | ||||||
logger logr.Logger, | ||||||
cluster *fdbv1beta2.FoundationDBCluster, | ||||||
status *fdbv1beta2.FoundationDBStatus, | ||||||
minimumUptimeForMissing time.Duration, | ||||||
minimumUptimeForExcluded time.Duration, | ||||||
recoveryStateEnabled bool, | ||||||
) error { | ||||||
// TODO double check setting here + true | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 👀 |
||||||
currentMinimumUptime, _, err := GetMinimumUptimeAndAddressMap( | ||||||
logger, | ||||||
cluster, | ||||||
status, | ||||||
recoveryStateEnabled, | ||||||
) | ||||||
if err != nil { | ||||||
return fmt.Errorf("failed to get minimum uptime: %w", err) | ||||||
} | ||||||
|
||||||
// Analyze current coordinators using the coordinator information from status | ||||||
// This gives us the definitive list of coordinators regardless of whether processes are running | ||||||
missingCoordinators := 0 | ||||||
|
||||||
// Create a map of process addresses to process group IDs for faster lookup | ||||||
coordinators := map[string]bool{} | ||||||
for _, coordinator := range status.Client.Coordinators.Coordinators { | ||||||
// TODO validate if logic will work with DNS names. | ||||||
coordinators[coordinator.Address.String()] = false | ||||||
} | ||||||
|
||||||
for _, process := range status.Cluster.Processes { | ||||||
processAddr := process.Address.String() | ||||||
if _, ok := coordinators[processAddr]; !ok { | ||||||
continue | ||||||
} | ||||||
|
||||||
coordinators[processAddr] = true | ||||||
} | ||||||
|
||||||
for _, isPresent := range coordinators { | ||||||
if !isPresent { | ||||||
missingCoordinators++ | ||||||
} | ||||||
} | ||||||
|
||||||
// Apply different uptime requirements based on the type of coordinator issues | ||||||
var requiredUptime float64 | ||||||
var reason string | ||||||
|
||||||
logger.V(1). | ||||||
Info("Checking if it is safe to change coordinators", "missingCoordinators", missingCoordinators, "currentMinimumUptime", currentMinimumUptime) | ||||||
if missingCoordinators > 0 { | ||||||
// Missing coordinators indicate processes that are down, use lower threshold | ||||||
requiredUptime = minimumUptimeForMissing.Seconds() | ||||||
reason = fmt.Sprintf("cluster has %d missing coordinators", missingCoordinators) | ||||||
} else { | ||||||
requiredUptime = minimumUptimeForExcluded.Seconds() | ||||||
reason = "cluster is not up for long enough" | ||||||
|
||||||
// Perform the default safet checks in case of "normal" coordinator changes or if processes are exclude. If | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||
// the cluster has missing coordinators, we should bypass those checks to ensure we recruit the new coordinators | ||||||
// in a timely manner. | ||||||
err = DefaultSafetyChecks(status, 10, "change coordinators") | ||||||
if err != nil { | ||||||
return err | ||||||
} | ||||||
} | ||||||
|
||||||
// Check that the cluster has been stable for the required time | ||||||
if currentMinimumUptime < requiredUptime { | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. edge case kinda but could this get thrown off by a crashlooping coordinator? |
||||||
return fmt.Errorf( | ||||||
"cannot change coordinators: %s, cluster minimum uptime is %.2f seconds but %.2f seconds required for safe coordinator change", | ||||||
reason, | ||||||
currentMinimumUptime, | ||||||
requiredUptime, | ||||||
) | ||||||
} | ||||||
|
||||||
return nil | ||||||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
does this guarantee it is changing the coordinators? feels like there should be a more explicit check, esp wrt time passed