
Commit ab53134

sethp-nr, mnguyen, and detiber
committed
✨ KCP adopts existing machines
The KCP controller identifies Machines that belong to the control plane of an existing cluster and adopts them, including finding PKI materials that may be owned by the machine's bootstrap config and pivoting their ownership to the KCP as well.

Prior to adopting machines (which, if unsuccessful, blocks the KCP from taking any management actions), it runs a number of safety checks, including:

- Ensuring the KCP has not been deleted (to prevent re-adoption of orphans, though this process races with the garbage collector)
- Checking that the machine's bootstrap provider is KubeadmConfig
- Verifying that the Machine is no further than one minor version off of the KCP's spec

Additionally, we set a "best guess" value for the kubeadm.controlplane.cluster.x-k8s.io/hash label on the adopted machine, as if it had been generated by a KCP in the past. The intent is that a KCP will adopt machines matching its "spec" (to the best of its ability) without modification, which in practice works well for adopting machines with the same spec'd version.

Co-authored-by: mnguyen <mnguyen@newrelic.com>
Co-authored-by: Jason DeTiberus <detiberusj@vmware.com>
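For a concrete sense of the two adoption details described above, here is a minimal, self-contained Go sketch of the one-minor-version skew rule and the "best guess" hash label. It is not the controller code from this commit: `minorVersion`, `withinSkew`, and `bestGuessHash` are hypothetical stand-ins for the `semver.ParseTolerant`, `util.IsSupportedVersionSkew`, and `hash.Compute` helpers that appear in the diff below, and FNV over a JSON encoding only approximates the real spec hash.

```go
// adoption_sketch.go — illustrative only, not the KubeadmControlPlane controller code.
package main

import (
	"encoding/json"
	"fmt"
	"hash/fnv"
	"strconv"
	"strings"
)

// minorVersion extracts the minor component from a version string such as
// "v1.17.3" or "1.17.3". It stands in for the semver parsing the controller
// performs with semver.ParseTolerant.
func minorVersion(v string) (int, error) {
	parts := strings.Split(strings.TrimPrefix(v, "v"), ".")
	if len(parts) < 2 {
		return 0, fmt.Errorf("unparseable version %q", v)
	}
	return strconv.Atoi(parts[1])
}

// withinSkew mirrors the rule the commit enforces via util.IsSupportedVersionSkew:
// an adoptable Machine may be at most one minor version away from the KCP's spec.
func withinSkew(kcpVersion, machineVersion string) (bool, error) {
	k, err := minorVersion(kcpVersion)
	if err != nil {
		return false, err
	}
	m, err := minorVersion(machineVersion)
	if err != nil {
		return false, err
	}
	diff := k - m
	if diff < 0 {
		diff = -diff
	}
	return diff <= 1, nil
}

// bestGuessHash imitates the "best guess" label value: hash a spec built from the
// Machine's version plus the KCP's infrastructure template, as if a KCP with that
// spec had created the Machine. The real controller hashes the KubeadmControlPlaneSpec
// with hash.Compute; FNV over a JSON encoding is only a stand-in here.
func bestGuessHash(machineVersion, infraTemplate string) string {
	spec := map[string]string{
		"version":                machineVersion,
		"infrastructureTemplate": infraTemplate,
	}
	b, _ := json.Marshal(spec)
	h := fnv.New32a()
	h.Write(b)
	return fmt.Sprintf("%d", h.Sum32())
}

func main() {
	ok, _ := withinSkew("v1.17.3", "v1.16.8")
	fmt.Println("adoptable:", ok) // true: one minor version apart

	tooFar, _ := withinSkew("v1.17.3", "v1.15.0")
	fmt.Println("adoptable:", tooFar) // false: two minor versions apart

	// The label value that would be stamped onto the adopted Machine.
	fmt.Println("kubeadm.controlplane.cluster.x-k8s.io/hash =", bestGuessHash("v1.16.8", "my-infra-template"))
}
```

In the actual diff, the label is computed from a KubeadmControlPlaneSpec containing only the Machine's version and the KCP's infrastructure template, so the value is a best guess rather than the exact hash a full KCP spec would produce.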
1 parent 48be8c5 commit ab53134

17 files changed: +929 -119 lines changed

controlplane/kubeadm/config/rbac/role.yaml

Lines changed: 1 addition & 0 deletions
@@ -61,6 +61,7 @@ rules:
   - get
   - list
   - patch
+  - update
   - watch

 ---

controlplane/kubeadm/controllers/fakes_test.go

Lines changed: 2 additions & 2 deletions
@@ -45,14 +45,14 @@ func (f *fakeManagementCluster) GetMachinesForCluster(c context.Context, n clien
 	return f.Machines, nil
 }

-func (f *fakeManagementCluster) TargetClusterControlPlaneIsHealthy(_ context.Context, _ client.ObjectKey, _ string) error {
+func (f *fakeManagementCluster) TargetClusterControlPlaneIsHealthy(_ context.Context, _ client.ObjectKey) error {
 	if !f.ControlPlaneHealthy {
 		return errors.New("control plane is not healthy")
 	}
 	return nil
 }

-func (f *fakeManagementCluster) TargetClusterEtcdIsHealthy(_ context.Context, _ client.ObjectKey, _ string) error {
+func (f *fakeManagementCluster) TargetClusterEtcdIsHealthy(_ context.Context, _ client.ObjectKey) error {
 	if !f.EtcdHealthy {
 		return errors.New("etcd is not healthy")
 	}

controlplane/kubeadm/controllers/kubeadm_control_plane_controller.go

Lines changed: 154 additions & 16 deletions
@@ -33,6 +33,7 @@ import (
 	kerrors "k8s.io/apimachinery/pkg/util/errors"
 	"k8s.io/apiserver/pkg/storage/names"
 	"k8s.io/client-go/tools/record"
+	"k8s.io/utils/pointer"
 	ctrl "sigs.k8s.io/controller-runtime"
 	"sigs.k8s.io/controller-runtime/pkg/client"
 	"sigs.k8s.io/controller-runtime/pkg/controller"
@@ -69,7 +70,7 @@ const (
 )

 // +kubebuilder:rbac:groups=core,resources=events,verbs=get;list;watch;create;patch
-// +kubebuilder:rbac:groups=core,resources=secrets,verbs=get;list;watch;create;patch
+// +kubebuilder:rbac:groups=core,resources=secrets,verbs=get;list;watch;create;update;patch
 // +kubebuilder:rbac:groups=core,resources=configmaps,namespace=kube-system,verbs=get;list;watch;create
 // +kubebuilder:rbac:groups=rbac,resources=roles,namespace=kube-system,verbs=get;list;watch;create
 // +kubebuilder:rbac:groups=rbac,resources=rolebindings,namespace=kube-system,verbs=get;list;watch;create
@@ -86,6 +87,8 @@ type KubeadmControlPlaneReconciler struct {
 	recorder record.EventRecorder

 	managementCluster internal.ManagementCluster
+
+	uncachedClient client.Reader
 }

 func (r *KubeadmControlPlaneReconciler) SetupWithManager(mgr ctrl.Manager, options controller.Options) error {
@@ -110,6 +113,7 @@ func (r *KubeadmControlPlaneReconciler) SetupWithManager(mgr ctrl.Manager, optio
 	if r.managementCluster == nil {
 		r.managementCluster = &internal.Management{Client: r.Client}
 	}
+	r.uncachedClient = mgr.GetAPIReader()

 	return nil
 }
@@ -227,13 +231,25 @@ func (r *KubeadmControlPlaneReconciler) reconcile(ctx context.Context, cluster *
 		return ctrl.Result{}, err
 	}

-	// TODO: handle proper adoption of Machines
-	ownedMachines, err := r.managementCluster.GetMachinesForCluster(ctx, util.ObjectKey(cluster), internal.OwnedControlPlaneMachines(kcp.Name))
+	controlPlaneMachines, err := r.managementCluster.GetMachinesForCluster(ctx, util.ObjectKey(cluster), internal.ControlPlaneMachines(cluster.Name))
 	if err != nil {
 		logger.Error(err, "failed to retrieve control plane machines for cluster")
 		return ctrl.Result{}, err
 	}

+	adoptableMachines := controlPlaneMachines.Filter(internal.AdoptableControlPlaneMachines(cluster.Name))
+	if len(adoptableMachines) > 0 {
+		// We adopt the Machines and then wait for the update event for the ownership reference to re-queue them so the cache is up-to-date
+		err = r.adoptMachines(ctx, kcp, adoptableMachines)
+		return ctrl.Result{}, err
+	}
+
+	ownedMachines := controlPlaneMachines.Filter(internal.OwnedMachines(kcp))
+	if len(ownedMachines) != len(controlPlaneMachines) {
+		logger.Info("Not all control plane machines are owned by this KubeadmControlPlane, refusing to operate in mixed management mode")
+		return ctrl.Result{}, nil
+	}
+
 	now := metav1.Now()
 	var requireUpgrade internal.FilterableMachineCollection
 	if kcp.Spec.UpgradeAfter != nil && kcp.Spec.UpgradeAfter.Before(&now) {
@@ -276,17 +292,12 @@ func (r *KubeadmControlPlaneReconciler) reconcile(ctx context.Context, cluster *
 }

 func (r *KubeadmControlPlaneReconciler) updateStatus(ctx context.Context, kcp *controlplanev1.KubeadmControlPlane, cluster *clusterv1.Cluster) error {
-	labelSelector := internal.ControlPlaneSelectorForCluster(cluster.Name)
-	selector, err := metav1.LabelSelectorAsSelector(labelSelector)
-	if err != nil {
-		// Since we are building up the LabelSelector above, this should not fail
-		return errors.Wrap(err, "failed to parse label selector")
-	}
+	selector := internal.ControlPlaneSelectorForCluster(cluster.Name)
 	// Copy label selector to its status counterpart in string format.
 	// This is necessary for CRDs including scale subresources.
 	kcp.Status.Selector = selector.String()

-	ownedMachines, err := r.managementCluster.GetMachinesForCluster(ctx, util.ObjectKey(cluster), internal.OwnedControlPlaneMachines(kcp.Name))
+	ownedMachines, err := r.managementCluster.GetMachinesForCluster(ctx, util.ObjectKey(cluster), internal.OwnedMachines(kcp))
 	if err != nil {
 		return errors.Wrap(err, "failed to get list of owned machines")
 	}
@@ -444,13 +455,13 @@ func (r *KubeadmControlPlaneReconciler) initializeControlPlane(ctx context.Conte

 func (r *KubeadmControlPlaneReconciler) scaleUpControlPlane(ctx context.Context, cluster *clusterv1.Cluster, kcp *controlplanev1.KubeadmControlPlane, machines internal.FilterableMachineCollection) (ctrl.Result, error) {
 	logger := r.Log.WithValues("namespace", kcp.Namespace, "kubeadmControlPlane", kcp.Name, "cluster", cluster.Name)
-	if err := r.managementCluster.TargetClusterControlPlaneIsHealthy(ctx, util.ObjectKey(cluster), kcp.Name); err != nil {
+	if err := r.managementCluster.TargetClusterControlPlaneIsHealthy(ctx, util.ObjectKey(cluster)); err != nil {
 		logger.Error(err, "waiting for control plane to pass control plane health check before adding an additional control plane machine")
 		r.recorder.Eventf(kcp, corev1.EventTypeWarning, "ControlPlaneUnhealthy", "Waiting for control plane to pass control plane health check before adding additional control plane machine: %v", err)
 		return ctrl.Result{}, &capierrors.RequeueAfterError{RequeueAfter: HealthCheckFailedRequeueAfter}
 	}

-	if err := r.managementCluster.TargetClusterEtcdIsHealthy(ctx, util.ObjectKey(cluster), kcp.Name); err != nil {
+	if err := r.managementCluster.TargetClusterEtcdIsHealthy(ctx, util.ObjectKey(cluster)); err != nil {
 		logger.Error(err, "waiting for control plane to pass etcd health check before adding an additional control plane machine")
 		r.recorder.Eventf(kcp, corev1.EventTypeWarning, "ControlPlaneUnhealthy", "Waiting for control plane to pass etcd health check before adding additional control plane machine: %v", err)
 		return ctrl.Result{}, &capierrors.RequeueAfterError{RequeueAfter: HealthCheckFailedRequeueAfter}
@@ -511,7 +522,7 @@ func (r *KubeadmControlPlaneReconciler) scaleDownControlPlane(ctx context.Contex

 	if !internal.HasAnnotationKey(controlplanev1.ScaleDownEtcdMemberRemovedAnnotation)(machineToDelete) {
 		// Ensure etcd is healthy prior to attempting to remove the member
-		if err := r.managementCluster.TargetClusterEtcdIsHealthy(ctx, util.ObjectKey(cluster), kcp.Name); err != nil {
+		if err := r.managementCluster.TargetClusterEtcdIsHealthy(ctx, util.ObjectKey(cluster)); err != nil {
 			logger.Error(err, "waiting for control plane to pass etcd health check before removing a control plane machine")
 			r.recorder.Eventf(kcp, corev1.EventTypeWarning, "ControlPlaneUnhealthy", "Waiting for control plane to pass etcd health check before removing a control plane machine: %v", err)
 			return ctrl.Result{}, &capierrors.RequeueAfterError{RequeueAfter: HealthCheckFailedRequeueAfter}
@@ -526,7 +537,7 @@ func (r *KubeadmControlPlaneReconciler) scaleDownControlPlane(ctx context.Contex
 	}

 	if !internal.HasAnnotationKey(controlplanev1.ScaleDownConfigMapEntryRemovedAnnotation)(machineToDelete) {
-		if err := r.managementCluster.TargetClusterControlPlaneIsHealthy(ctx, util.ObjectKey(cluster), kcp.Name); err != nil {
+		if err := r.managementCluster.TargetClusterControlPlaneIsHealthy(ctx, util.ObjectKey(cluster)); err != nil {
 			logger.Error(err, "waiting for control plane to pass control plane health check before removing a control plane machine")
 			r.recorder.Eventf(kcp, corev1.EventTypeWarning, "ControlPlaneUnhealthy", "Waiting for control plane to pass control plane health check before removing a control plane machine: %v", err)
 			return ctrl.Result{}, &capierrors.RequeueAfterError{RequeueAfter: HealthCheckFailedRequeueAfter}
@@ -542,7 +553,7 @@ func (r *KubeadmControlPlaneReconciler) scaleDownControlPlane(ctx context.Contex
 	}

 	// Do a final health check of the Control Plane components prior to actually deleting the machine
-	if err := r.managementCluster.TargetClusterControlPlaneIsHealthy(ctx, util.ObjectKey(cluster), kcp.Name); err != nil {
+	if err := r.managementCluster.TargetClusterControlPlaneIsHealthy(ctx, util.ObjectKey(cluster)); err != nil {
 		logger.Error(err, "waiting for control plane to pass control plane health check before removing a control plane machine")
 		r.recorder.Eventf(kcp, corev1.EventTypeWarning, "ControlPlaneUnhealthy", "Waiting for control plane to pass control plane health check before removing a control plane machine: %v", err)
 		return ctrl.Result{}, &capierrors.RequeueAfterError{RequeueAfter: HealthCheckFailedRequeueAfter}
@@ -724,7 +735,7 @@ func (r *KubeadmControlPlaneReconciler) reconcileDelete(ctx context.Context, clu
 		logger.Error(err, "failed to retrieve machines for cluster")
 		return ctrl.Result{}, err
 	}
-	ownedMachines := allMachines.Filter(internal.OwnedControlPlaneMachines(kcp.Name))
+	ownedMachines := allMachines.Filter(internal.OwnedMachines(kcp))

 	// If no control plane machines remain, remove the finalizer
 	if len(ownedMachines) == 0 {
@@ -834,3 +845,130 @@ func (r *KubeadmControlPlaneReconciler) ClusterToKubeadmControlPlane(o handler.M

 	return nil
 }
+
+func (r *KubeadmControlPlaneReconciler) adoptMachines(ctx context.Context, kcp *controlplanev1.KubeadmControlPlane, machines internal.FilterableMachineCollection) error {
+	// We do an uncached full quorum read against the KCP to avoid re-adopting Machines the garbage collector just intentionally orphaned
+	// See https://github.com/kubernetes/kubernetes/issues/42639
+	uncached := controlplanev1.KubeadmControlPlane{}
+	err := r.uncachedClient.Get(ctx, client.ObjectKey{Namespace: kcp.Namespace, Name: kcp.Name}, &uncached)
+	if err != nil {
+		return errors.Wrapf(err, "failed to check whether %v/%v was deleted before adoption", kcp.GetNamespace(), kcp.GetName())
+	}
+	if !uncached.DeletionTimestamp.IsZero() {
+		return errors.Errorf("%v/%v has just been deleted at %v", kcp.GetNamespace(), kcp.GetName(), kcp.GetDeletionTimestamp())
+	}
+
+	kcpVersion, err := semver.ParseTolerant(kcp.Spec.Version)
+	if err != nil {
+		return errors.Wrapf(err, "failed to parse kubernetes version %q", kcp.Spec.Version)
+	}
+
+	for _, m := range machines {
+		ref := m.Spec.Bootstrap.ConfigRef
+
+		// TODO instead of returning error here, we should instead Event and add a watch on potentially adoptable Machines
+		if ref == nil || ref.Kind != "KubeadmConfig" {
+			return errors.Errorf("unable to adopt Machine %v/%v: expected a ConfigRef of kind KubeadmConfig but instead found %v", m.Namespace, m.Name, ref)
+		}
+
+		// TODO instead of returning error here, we should instead Event and add a watch on potentially adoptable Machines
+		if ref.Namespace != "" && ref.Namespace != kcp.Namespace {
+			return errors.Errorf("could not adopt resources from KubeadmConfig %v/%v: cannot adopt across namespaces", ref.Namespace, ref.Name)
+		}
+
+		if m.Spec.Version == nil {
+			// if the machine's version is not immediately apparent, assume the operator knows what they're doing
+			continue
+		}
+
+		machineVersion, err := semver.ParseTolerant(*m.Spec.Version)
+		if err != nil {
+			return errors.Wrapf(err, "failed to parse kubernetes version %q", *m.Spec.Version)
+		}
+
+		if !util.IsSupportedVersionSkew(kcpVersion, machineVersion) {
+			r.recorder.Eventf(kcp, corev1.EventTypeWarning, "AdoptionFailed", "Could not adopt Machine %s/%s: its version (%q) is outside supported +/- one minor version skew from KCP's (%q)", m.Namespace, m.Name, *m.Spec.Version, kcp.Spec.Version)
+			// avoid returning an error here so we don't cause the KCP controller to spin until the operator clarifies their intent
+			return nil
+		}
+	}
+
+	for _, m := range machines {
+		ref := m.Spec.Bootstrap.ConfigRef
+		obj := &bootstrapv1.KubeadmConfig{}
+		err := r.Client.Get(ctx, client.ObjectKey{Name: ref.Name, Namespace: kcp.Namespace}, obj)
+		if err != nil {
+			return err
+		}
+
+		err = r.adoptOwnedSecrets(ctx, kcp, obj)
+		if err != nil {
+			return err
+		}
+
+		patchHelper, err := patch.NewHelper(m, r.Client)
+		if err != nil {
+			return err
+		}
+
+		if err := controllerutil.SetControllerReference(kcp, m, r.scheme); err != nil {
+			return err
+		}
+
+		// 0. get machine.Spec.Version - the easy answer
+		machineKubernetesVersion := ""
+		if m.Spec.Version != nil {
+			machineKubernetesVersion = *m.Spec.Version
+		}
+
+		// 1. hash the version (kubernetes version) and kubeadm_controlplane's Spec.infrastructureTemplate
+		asIfSpec := controlplanev1.KubeadmControlPlaneSpec{
+			Version:                machineKubernetesVersion,
+			InfrastructureTemplate: kcp.Spec.InfrastructureTemplate,
+		}
+		newConfigurationHash := hash.Compute(&asIfSpec)
+		// 2. add kubeadm.controlplane.cluster.x-k8s.io/hash as a label in each machine
+		m.Labels["kubeadm.controlplane.cluster.x-k8s.io/hash"] = newConfigurationHash
+
+		// Note that ValidateOwnerReferences() will reject this patch if another
+		// OwnerReference exists with controller=true.
+		if err := patchHelper.Patch(ctx, m); err != nil {
+			return err
+		}
+	}
+	return nil
+}
+
+func (r *KubeadmControlPlaneReconciler) adoptOwnedSecrets(ctx context.Context, kcp *controlplanev1.KubeadmControlPlane, currentOwner metav1.Object) error {
+	secrets := corev1.SecretList{}
+	if err := r.Client.List(ctx, &secrets, client.InNamespace(kcp.Namespace)); err != nil {
+		return errors.Wrap(err, "error finding secrets for adoption")
+	}
+
+	for _, s := range secrets.Items {
+		if !util.PointsTo(s.GetOwnerReferences(), currentOwner) {
+			continue
+		}
+		// avoid taking ownership of the bootstrap data secret
+		if s.Name == currentOwner.GetName() {
+			continue
+		}
+
+		ss := s.DeepCopy()
+
+		ss.SetOwnerReferences(util.ReplaceOwnerRef(ss.GetOwnerReferences(), metav1.OwnerReference{
+			APIVersion:         controlplanev1.GroupVersion.String(),
+			Kind:               "KubeadmControlPlane",
+			Name:               kcp.Name,
+			UID:                kcp.UID,
+			Controller:         pointer.BoolPtr(true),
+			BlockOwnerDeletion: pointer.BoolPtr(true),
+		}, currentOwner))
+
+		if err := r.Client.Update(ctx, ss); err != nil {
+			return errors.Wrapf(err, "error changing secret %v ownership from KubeadmConfig/%v to KubeadmControlPlane/%v", s.Name, currentOwner.GetName(), kcp.Name)
+		}
+	}
+
+	return nil
+}
