Skip to content

Commit

Permalink
Enhance error code detection in worker controller (gardener#8242)
Browse files Browse the repository at this point in the history
* Few nits

* Enhance worker controller to check for error codes in error returned from machine status

* Check extension codition first in case of EveryNodeReady check

There can be a scenario when infra resource is depleted while creating the first node(or min no of nodes for a worker pool). In this case, the error that comes now is `Not enough worker nodes registered` which is not correct rather error should say infra is at limit which comes from worker extension status.

* Address review
  • Loading branch information
acumino authored Jul 25, 2023
1 parent fa50315 commit 63346c3
Show file tree
Hide file tree
Showing 6 changed files with 27 additions and 14 deletions.
4 changes: 4 additions & 0 deletions extensions/pkg/controller/worker/genericactuator/actuator.go
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ import (
"sigs.k8s.io/controller-runtime/pkg/manager"

extensionscontroller "github.com/gardener/gardener/extensions/pkg/controller"
"github.com/gardener/gardener/extensions/pkg/controller/healthcheck"
"github.com/gardener/gardener/extensions/pkg/controller/worker"
extensionsworkerhelper "github.com/gardener/gardener/extensions/pkg/controller/worker/helper"
v1beta1constants "github.com/gardener/gardener/pkg/apis/core/v1beta1/constants"
Expand Down Expand Up @@ -61,6 +62,7 @@ type genericActuator struct {
gardenerClientset kubernetesclient.Interface
chartApplier kubernetesclient.ChartApplier
chartRendererFactory extensionscontroller.ChartRendererFactory
errorCodeCheckFunc healthcheck.ErrorCodeCheckFunc
}

// NewActuator creates a new Actuator that reconciles
Expand All @@ -75,6 +77,7 @@ func NewActuator(
mcmShootChart chart.Interface,
imageVector imagevector.ImageVector,
chartRendererFactory extensionscontroller.ChartRendererFactory,
errorCodeCheckFunc healthcheck.ErrorCodeCheckFunc,
) (worker.Actuator, error) {
gardenerClientset, err := kubernetesclient.NewWithConfig(kubernetesclient.WithRESTConfig(mgr.GetConfig()))
if err != nil {
Expand All @@ -94,6 +97,7 @@ func NewActuator(
gardenerClientset: gardenerClientset,
chartApplier: gardenerClientset.ChartApplier(),
chartRendererFactory: chartRendererFactory,
errorCodeCheckFunc: errorCodeCheckFunc,
}, nil
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ import (
"sigs.k8s.io/controller-runtime/pkg/controller/controllerutil"

extensionscontroller "github.com/gardener/gardener/extensions/pkg/controller"
v1beta1helper "github.com/gardener/gardener/pkg/apis/core/v1beta1/helper"
extensionsv1alpha1 "github.com/gardener/gardener/pkg/apis/extensions/v1alpha1"
"github.com/gardener/gardener/pkg/utils/flow"
kubernetesutils "github.com/gardener/gardener/pkg/utils/kubernetes"
Expand Down Expand Up @@ -101,7 +102,11 @@ func (a *genericActuator) Delete(ctx context.Context, log logr.Logger, worker *e

// Wait until all machine resources have been properly deleted.
if err := a.waitUntilMachineResourcesDeleted(ctx, log, worker, workerDelegate); err != nil {
return fmt.Errorf("Failed while waiting for all machine resources to be deleted: %w", err)
newError := fmt.Errorf("failed while waiting for all machine resources to be deleted: %w", err)
if a.errorCodeCheckFunc != nil {
return v1beta1helper.NewErrorWithCodes(newError, a.errorCodeCheckFunc(err)...)
}
return newError
}

// Wait until the machine class credentials secret has been released.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,11 +28,11 @@ import (
"k8s.io/apimachinery/pkg/util/sets"
"sigs.k8s.io/controller-runtime/pkg/client"

"github.com/gardener/gardener/extensions/pkg/controller"
extensionscontroller "github.com/gardener/gardener/extensions/pkg/controller"
extensionsworkercontroller "github.com/gardener/gardener/extensions/pkg/controller/worker"
extensionsworkerhelper "github.com/gardener/gardener/extensions/pkg/controller/worker/helper"
v1beta1constants "github.com/gardener/gardener/pkg/apis/core/v1beta1/constants"
v1beta1helper "github.com/gardener/gardener/pkg/apis/core/v1beta1/helper"
extensionsv1alpha1 "github.com/gardener/gardener/pkg/apis/extensions/v1alpha1"
extensionsv1alpha1helper "github.com/gardener/gardener/pkg/apis/extensions/v1alpha1/helper"
"github.com/gardener/gardener/pkg/client/kubernetes"
Expand All @@ -42,7 +42,7 @@ import (
retryutils "github.com/gardener/gardener/pkg/utils/retry"
)

func (a *genericActuator) Reconcile(ctx context.Context, log logr.Logger, worker *extensionsv1alpha1.Worker, cluster *controller.Cluster) error {
func (a *genericActuator) Reconcile(ctx context.Context, log logr.Logger, worker *extensionsv1alpha1.Worker, cluster *extensionscontroller.Cluster) error {
log = log.WithValues("operation", "reconcile")

workerDelegate, err := a.delegateFactory.WorkerDelegate(ctx, worker, cluster)
Expand Down Expand Up @@ -99,7 +99,7 @@ func (a *genericActuator) Reconcile(ctx context.Context, log logr.Logger, worker
// TODO(rfranzke): Remove this code after v1.77 was released.
// When the Shoot is hibernated we want to remove the cluster autoscaler so that it does not interfer
// with Gardeners modifications on the machine deployment's replicas fields.
isHibernationEnabled := controller.IsHibernationEnabled(cluster)
isHibernationEnabled := extensionscontroller.IsHibernationEnabled(cluster)
if clusterAutoscalerUsed && isHibernationEnabled {
if err = a.scaleClusterAutoscaler(ctx, log, worker, 0); err != nil {
return err
Expand Down Expand Up @@ -161,7 +161,11 @@ func (a *genericActuator) Reconcile(ctx context.Context, log logr.Logger, worker
log.Info("Successfully deleted stuck machine-controller-manager pod", "reason", msg)
}

return fmt.Errorf("Failed while waiting for all machine deployments to be ready: %w", err)
newError := fmt.Errorf("failed while waiting for all machine deployments to be ready: %w", err)
if a.errorCodeCheckFunc != nil {
return v1beta1helper.NewErrorWithCodes(newError, a.errorCodeCheckFunc(err)...)
}
return newError
}

// Delete all old machine deployments (i.e. those which were not previously computed but exist in the cluster).
Expand Down Expand Up @@ -241,7 +245,7 @@ func deployMachineDeployments(
switch {
// If the Shoot is hibernated then the machine deployment's replicas should be zero.
// Also mark all machines for forceful deletion to avoid respecting of PDBs/SLAs in case of cluster hibernation.
case controller.IsHibernationEnabled(cluster):
case extensionscontroller.IsHibernationEnabled(cluster):
replicas = 0
if err := markAllMachinesForcefulDeletion(ctx, log, cl, worker.Namespace); err != nil {
return fmt.Errorf("marking all machines for forceful deletion failed: %w", err)
Expand All @@ -262,7 +266,7 @@ func deployMachineDeployments(
}
// If the Shoot was hibernated and is now woken up we set replicas to min so that the cluster
// autoscaler can scale them as required.
case shootIsAwake(controller.IsHibernationEnabled(cluster), existingMachineDeployments):
case shootIsAwake(extensionscontroller.IsHibernationEnabled(cluster), existingMachineDeployments):
replicas = deployment.Minimum
// If the shoot worker pool minimum was updated and if the current machine deployment replica
// count is less than minimum, we update the machine deployment replica count to updated minimum.
Expand Down Expand Up @@ -377,7 +381,7 @@ func (a *genericActuator) waitUntilWantedMachineDeploymentsAvailable(ctx context
numberOfAwakeMachines += deployment.Status.Replicas

// Skip further checks if cluster is hibernated because machine-controller-manager is usually scaled down during hibernation.
if controller.IsHibernationEnabled(cluster) {
if extensionscontroller.IsHibernationEnabled(cluster) {
continue
}

Expand Down Expand Up @@ -417,7 +421,7 @@ func (a *genericActuator) waitUntilWantedMachineDeploymentsAvailable(ctx context

var msg string
switch {
case !controller.IsHibernationEnabled(cluster):
case !extensionscontroller.IsHibernationEnabled(cluster):
// numUpdated == numberOfAwakeMachines waits until the old machine is deleted in the case of a rolling update with maxUnavailability = 0
// numUnavailable == 0 makes sure that every machine joined the cluster (during creation & in the case of a rolling update with maxUnavailability > 0)
if numUnavailable == 0 && numUpdated == numberOfAwakeMachines && int(numHealthyDeployments) == len(wantedMachineDeployments) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,6 @@ import (
"k8s.io/apimachinery/pkg/util/wait"
"sigs.k8s.io/controller-runtime/pkg/client"

"github.com/gardener/gardener/extensions/pkg/controller"
extensionscontroller "github.com/gardener/gardener/extensions/pkg/controller"
extensionsv1alpha1 "github.com/gardener/gardener/pkg/apis/extensions/v1alpha1"
"github.com/gardener/gardener/pkg/client/kubernetes"
Expand Down Expand Up @@ -139,7 +138,7 @@ func scaleMachineControllerManager(ctx context.Context, logger logr.Logger, cl c
return client.IgnoreNotFound(kubernetes.ScaleDeployment(ctx, cl, kubernetesutils.Key(worker.Namespace, McmDeploymentName), replicas))
}

func (a *genericActuator) applyMachineControllerManagerShootChart(ctx context.Context, logger logr.Logger, workerDelegate WorkerDelegate, workerObj *extensionsv1alpha1.Worker, cluster *controller.Cluster) error {
func (a *genericActuator) applyMachineControllerManagerShootChart(ctx context.Context, logger logr.Logger, workerDelegate WorkerDelegate, workerObj *extensionsv1alpha1.Worker, cluster *extensionscontroller.Cluster) error {
if !a.mcmManaged {
logger.Info("Skip machine-controller-manager shoot chart application since gardenlet manages it")
return nil
Expand Down
6 changes: 3 additions & 3 deletions pkg/operation/care/health.go
Original file line number Diff line number Diff line change
Expand Up @@ -478,12 +478,12 @@ func (h *Health) checkClusterNodes(
condition gardencorev1beta1.Condition,
extensionConditions []ExtensionCondition,
) (*gardencorev1beta1.Condition, error) {
if exitCondition, err := checker.CheckClusterNodes(ctx, shootClient, seedNamespace, h.shoot.GetInfo().Spec.Provider.Workers, condition); err != nil || exitCondition != nil {
return exitCondition, err
}
if exitCondition := checker.CheckExtensionCondition(condition, extensionConditions); exitCondition != nil {
return exitCondition, nil
}
if exitCondition, err := checker.CheckClusterNodes(ctx, shootClient, seedNamespace, h.shoot.GetInfo().Spec.Provider.Workers, condition); err != nil || exitCondition != nil {
return exitCondition, err
}

c := v1beta1helper.UpdatedConditionWithClock(h.clock, condition, gardencorev1beta1.ConditionTrue, "EveryNodeReady", "All nodes are ready.")
return &c, nil
Expand Down
1 change: 1 addition & 0 deletions pkg/provider-local/controller/worker/actuator.go
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,7 @@ func NewActuator(mgr manager.Manager, gardenletManagesMCM bool) (worker.Actuator
mcmChartShoot,
imageVector,
chartRendererFactory,
nil,
)
if err != nil {
return nil, err
Expand Down

0 comments on commit 63346c3

Please sign in to comment.