Skip to content

Commit

Permalink
Merge pull request #9261 from benluddy/wait-for-etcd-bootstrap-member…
Browse files Browse the repository at this point in the history
…-removal

OCPBUGS-45482: Wait for etcd bootstrap member removal
  • Loading branch information
openshift-merge-bot[bot] authored Dec 7, 2024
2 parents 0170a6a + e1f9d5c commit 2ea001d
Show file tree
Hide file tree
Showing 246 changed files with 22,322 additions and 64 deletions.
77 changes: 13 additions & 64 deletions cmd/openshift-install/create.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,12 +18,9 @@ import (
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/fields"
"k8s.io/apimachinery/pkg/labels"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/runtime/schema"
"k8s.io/apimachinery/pkg/util/sets"
"k8s.io/apimachinery/pkg/util/wait"
"k8s.io/apimachinery/pkg/watch"
"k8s.io/client-go/dynamic"
"k8s.io/client-go/kubernetes"
"k8s.io/client-go/rest"
"k8s.io/client-go/tools/cache"
Expand All @@ -35,6 +32,7 @@ import (
configclient "github.com/openshift/client-go/config/clientset/versioned"
configinformers "github.com/openshift/client-go/config/informers/externalversions"
configlisters "github.com/openshift/client-go/config/listers/config/v1"
operatorclient "github.com/openshift/client-go/operator/clientset/versioned"
routeclient "github.com/openshift/client-go/route/clientset/versioned"
"github.com/openshift/installer/cmd/openshift-install/command"
"github.com/openshift/installer/pkg/asset"
Expand Down Expand Up @@ -483,7 +481,7 @@ func waitForBootstrapComplete(ctx context.Context, config *rest.Config) *cluster
return err
}

if err := waitForStableSNOBootstrap(ctx, config); err != nil {
if err := waitForEtcdBootstrapMemberRemoval(ctx, config); err != nil {
return newBootstrapError(err)
}

Expand Down Expand Up @@ -525,58 +523,31 @@ func waitForBootstrapConfigMap(ctx context.Context, client *kubernetes.Clientset
return nil
}

// When bootstrap on SNO deployments, we should not remove the bootstrap node prematurely,
// here we make sure that the deployment is stable.
// Given the nature of single node we just need to make sure things such as etcd are in the proper state
// before continuing.
func waitForStableSNOBootstrap(ctx context.Context, config *rest.Config) error {
// If the bootstrap etcd member is cleaned up before it has been removed from the etcd cluster, the
// etcd cluster cannot maintain quorum through the rollout of any single permanent member.
func waitForEtcdBootstrapMemberRemoval(ctx context.Context, config *rest.Config) error {
timeout := 5 * time.Minute

// If we're not in a single node deployment, bail early
if isSNO, err := IsSingleNode(); err != nil {
logrus.Warningf("Can not determine if installing a Single Node cluster, continuing as normal install: %v", err)
return nil
} else if !isSNO {
return nil
}

snoBootstrapContext, cancel := context.WithTimeout(ctx, timeout)
defer cancel()

untilTime := time.Now().Add(timeout)
timezone, _ := untilTime.Zone()
logrus.Info("Detected Single Node deployment")
logrus.Infof("Waiting up to %v (until %v %s) for the bootstrap etcd member to be removed...",
timeout, untilTime.Format(time.Kitchen), timezone)

client, err := dynamic.NewForConfig(config)
client, err := operatorclient.NewForConfig(config)
if err != nil {
return fmt.Errorf("error creating dynamic client: %w", err)
return fmt.Errorf("error creating operator client: %w", err)
}
gvr := schema.GroupVersionResource{
Group: operatorv1.SchemeGroupVersion.Group,
Version: operatorv1.SchemeGroupVersion.Version,
Resource: "etcds",
}
resourceClient := client.Resource(gvr)
// Validate the etcd operator has removed the bootstrap etcd member
return wait.PollUntilContextCancel(snoBootstrapContext, 1*time.Second, true, func(ctx context.Context) (done bool, err error) {
etcdOperator := &operatorv1.Etcd{}
etcdUnstructured, err := resourceClient.Get(ctx, "cluster", metav1.GetOptions{})
return wait.PollUntilContextTimeout(ctx, 1*time.Second, timeout, true, func(ctx context.Context) (done bool, err error) {
etcd, err := client.OperatorV1().Etcds().Get(ctx, "cluster", metav1.GetOptions{})
if err != nil {
// There might be service disruptions in SNO, we log those here but keep trying with in the time limit
logrus.Debugf("Error getting ETCD Cluster resource, retrying: %v", err)
logrus.Debugf("Error getting etcd operator singleton, retrying: %v", err)
return false, nil
}
err = runtime.DefaultUnstructuredConverter.FromUnstructured(etcdUnstructured.Object, etcdOperator)
if err != nil {
// This error should not happen, if we do, we log the error and keep retrying until we hit the limit
logrus.Debugf("Error parsing etcds resource, retrying: %v", err)
return false, nil
}
for _, condition := range etcdOperator.Status.Conditions {

for _, condition := range etcd.Status.Conditions {
if condition.Type == "EtcdBootstrapMemberRemoved" {
return configv1.ConditionStatus(condition.Status) == configv1.ConditionTrue, nil
return condition.Status == operatorv1.ConditionTrue, nil
}
}
return false, nil
Expand Down Expand Up @@ -942,25 +913,3 @@ func handleUnreachableAPIServer(ctx context.Context, config *rest.Config) error

return nil
}

// IsSingleNode determines if we are in a single node configuration based off of the install config
// loaded from the asset store.
func IsSingleNode() (bool, error) {
assetStore, err := assetstore.NewStore(command.RootOpts.Dir)
if err != nil {
return false, fmt.Errorf("error loading asset store: %w", err)
}
installConfig, err := assetStore.Load(&installconfig.InstallConfig{})
if err != nil {
return false, fmt.Errorf("error loading installConfig: %w", err)
}
if installConfig == nil {
return false, fmt.Errorf("installConfig loaded from asset store was nil")
}

config := installConfig.(*installconfig.InstallConfig).Config
if machinePool := config.ControlPlane; machinePool != nil {
return *machinePool.Replicas == int64(1), nil
}
return false, nil
}
Loading

0 comments on commit 2ea001d

Please sign in to comment.