Skip to content

Commit

Permalink
Deploy CA before worker pools are ready (gardener#7774)
Browse files Browse the repository at this point in the history
* deploy CA before worker pools are ready

* address review comments

* address review comments part 2

* address review comments part 3

* address review comments part 4

* address review comments part 5

* address review comments part 6

* update waitUntilWorkerReady dependencies

* fix unit tests after rebase
  • Loading branch information
rishabh-11 authored Apr 27, 2023
1 parent cd89145 commit 6c02eb3
Show file tree
Hide file tree
Showing 11 changed files with 221 additions and 16 deletions.
14 changes: 14 additions & 0 deletions docs/api-reference/extensions.md
Original file line number Diff line number Diff line change
Expand Up @@ -4294,6 +4294,20 @@ DefaultStatus
the cluster-autoscaler properly.</p>
</td>
</tr>
<tr>
<td>
<code>machineDeploymentsLastUpdateTime</code></br>
<em>
<a href="https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.25/#time-v1-meta">
Kubernetes meta/v1.Time
</a>
</em>
</td>
<td>
<em>(Optional)</em>
<p>MachineDeploymentsLastUpdateTime is the timestamp when the status.MachineDeployments slice was last updated.</p>
</td>
</tr>
</tbody>
</table>
<hr/>
Expand Down
3 changes: 2 additions & 1 deletion docs/extensions/worker.md
Original file line number Diff line number Diff line change
Expand Up @@ -278,7 +278,7 @@ Our cluster-autoscaler only needs to know the minimum and maximum number of repl
Gardener deploys this autoscaler if there is at least one worker pool that specifies `max>min`.
In order to know how it needs to configure it, the provider-specific `Worker` extension controller must expose which `MachineDeployment`s it has created and how the `min`/`max` numbers should look like.

Consequently, your controller should write this information into the `Worker` resource's `.status.machineDeployments` field:
Consequently, your controller should write this information into the `Worker` resource's `.status.machineDeployments` field. It should also update the `.status.machineDeploymentsLastUpdateTime` field along with `.status.machineDeployments`, so that gardener is able to deploy Cluster-Autoscaler right after the status is updated with the latest `MachineDeployment`s and does not wait for the reconciliation to be completed:

```yaml
---
Expand All @@ -298,6 +298,7 @@ status:
- name: shoot--foo--bar-cpu-worker-z2
minimum: 1
maximum: 2
machineDeploymentsLastUpdateTime: "2023-05-01T12:44:27Z"
```

In order to support a new worker provider, you need to write a controller that watches all `Worker`s with `.spec.type=<my-provider-name>`.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -447,6 +447,11 @@ spec:
- name
type: object
type: array
machineDeploymentsLastUpdateTime:
description: MachineDeploymentsLastUpdateTime is the timestamp when
the status.MachineDeployments slice was last updated.
format: date-time
type: string
observedGeneration:
description: ObservedGeneration is the most recent generation observed
for this resource.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,11 @@ func (a *genericActuator) Reconcile(ctx context.Context, log logr.Logger, worker
return fmt.Errorf("failed to generate the machine deployment config: %w", err)
}

// update machineDeploymentsLastUpdateTime and the machine deployment slice in worker status
if err := a.updateWorkerStatusMachineDeployments(ctx, worker, wantedMachineDeployments); err != nil {
return fmt.Errorf("failed to update the machine deployments in worker status: %w", err)
}

// Wait until all generated machine deployments are healthy/available.
if err := a.waitUntilWantedMachineDeploymentsAvailable(ctx, log, cluster, worker, existingMachineDeploymentNames, existingMachineClassNames, wantedMachineDeployments); err != nil {
// check if the machine-controller-manager is stuck
Expand Down Expand Up @@ -199,10 +204,6 @@ func (a *genericActuator) Reconcile(ctx context.Context, log logr.Logger, worker
}
}

if err := a.updateWorkerStatusMachineDeployments(ctx, worker, wantedMachineDeployments); err != nil {
return fmt.Errorf("failed to update the machine deployments in worker status: %w", err)
}

// Call post reconcilation hook after Worker reconciliation has happened.
if err := workerDelegate.PostReconcileHook(ctx); err != nil {
return fmt.Errorf("post worker reconciliation hook failed: %w", err)
Expand Down Expand Up @@ -469,9 +470,11 @@ func (a *genericActuator) updateWorkerStatusMachineDeployments(ctx context.Conte
Maximum: machineDeployment.Maximum,
})
}
updateTime := metav1.Now()

patch := client.MergeFrom(worker.DeepCopy())
worker.Status.MachineDeployments = statusMachineDeployments
worker.Status.MachineDeploymentsLastUpdateTime = &updateTime
return a.client.Status().Patch(ctx, worker, patch)
}

Expand Down
3 changes: 3 additions & 0 deletions pkg/apis/extensions/v1alpha1/types_worker.go
Original file line number Diff line number Diff line change
Expand Up @@ -210,6 +210,9 @@ type WorkerStatus struct {
// +patchMergeKey=name
// +patchStrategy=merge
MachineDeployments []MachineDeployment `json:"machineDeployments,omitempty" patchStrategy:"merge" patchMergeKey:"name"`
// MachineDeploymentsLastUpdateTime is the timestamp when the status.MachineDeployments slice was last updated.
// +optional
MachineDeploymentsLastUpdateTime *metav1.Time `json:"machineDeploymentsLastUpdateTime,omitempty"`
}

// MachineDeployment is a created machine deployment.
Expand Down
4 changes: 4 additions & 0 deletions pkg/apis/extensions/v1alpha1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

19 changes: 12 additions & 7 deletions pkg/gardenlet/controller/shoot/shoot/reconciler_reconcile.go
Original file line number Diff line number Diff line change
Expand Up @@ -606,6 +606,16 @@ func (r *Reconciler) runReconcileShootFlow(ctx context.Context, o *operation.Ope
Fn: flow.TaskFn(botanist.DeployWorker).RetryUntilTimeout(defaultInterval, defaultTimeout),
Dependencies: flow.NewTaskIDs(deployCloudProviderSecret, deployReferencedResources, waitUntilInfrastructureReady, initializeShootClients, waitUntilOperatingSystemConfigReady, waitUntilNetworkIsReady, createNewServiceAccountSecrets),
})
waitUntilWorkerStatusUpdate = g.Add(flow.Task{
Name: "Waiting until worker resource status is updated with latest machine deployments",
Fn: botanist.Shoot.Components.Extensions.Worker.WaitUntilWorkerStatusMachineDeploymentsUpdated,
Dependencies: flow.NewTaskIDs(deployWorker),
})
deployClusterAutoscaler = g.Add(flow.Task{
Name: "Deploying cluster autoscaler",
Fn: flow.TaskFn(botanist.DeployClusterAutoscaler).RetryUntilTimeout(defaultInterval, defaultTimeout),
Dependencies: flow.NewTaskIDs(waitUntilWorkerStatusUpdate, deployManagedResourcesForAddons, deployManagedResourceForCloudConfigExecutor),
})
_ = g.Add(flow.Task{
Name: "Reconciling Grafana for Shoot in Seed for the logging stack",
Fn: flow.TaskFn(botanist.DeploySeedGrafana).RetryUntilTimeout(defaultInterval, 2*time.Minute),
Expand All @@ -614,7 +624,7 @@ func (r *Reconciler) runReconcileShootFlow(ctx context.Context, o *operation.Ope
waitUntilWorkerReady = g.Add(flow.Task{
Name: "Waiting until shoot worker nodes have been reconciled",
Fn: flow.TaskFn(botanist.Shoot.Components.Extensions.Worker.Wait).SkipIf(skipReadiness),
Dependencies: flow.NewTaskIDs(deployWorker, deployManagedResourceForCloudConfigExecutor),
Dependencies: flow.NewTaskIDs(deployWorker, waitUntilWorkerStatusUpdate, deployManagedResourceForCloudConfigExecutor),
})
nginxLBReady = g.Add(flow.Task{
Name: "Waiting until nginx ingress LoadBalancer is ready",
Expand Down Expand Up @@ -678,11 +688,6 @@ func (r *Reconciler) runReconcileShootFlow(ctx context.Context, o *operation.Ope
Fn: flow.TaskFn(botanist.DeploySeedGrafana).RetryUntilTimeout(defaultInterval, 2*time.Minute),
Dependencies: flow.NewTaskIDs(deploySeedMonitoring),
})
deployClusterAutoscaler = g.Add(flow.Task{
Name: "Deploying cluster autoscaler",
Fn: flow.TaskFn(botanist.DeployClusterAutoscaler).RetryUntilTimeout(defaultInterval, defaultTimeout),
Dependencies: flow.NewTaskIDs(waitUntilWorkerReady, deployManagedResourcesForAddons, deployManagedResourceForCloudConfigExecutor),
})

deployExtensionResourcesAfterKAPI = g.Add(flow.Task{
Name: deployExtensionAfterKAPIMsg,
Expand All @@ -698,7 +703,7 @@ func (r *Reconciler) runReconcileShootFlow(ctx context.Context, o *operation.Ope
hibernateControlPlane = g.Add(flow.Task{
Name: "Hibernating control plane",
Fn: flow.TaskFn(botanist.HibernateControlPlane).RetryUntilTimeout(defaultInterval, 2*time.Minute).DoIf(o.Shoot.HibernationEnabled),
Dependencies: flow.NewTaskIDs(initializeShootClients, deploySeedMonitoring, deploySeedLogging, deployClusterAutoscaler, waitUntilExtensionResourcesAfterKAPIReady),
Dependencies: flow.NewTaskIDs(initializeShootClients, deploySeedMonitoring, deploySeedLogging, deployClusterAutoscaler, waitUntilWorkerReady, waitUntilExtensionResourcesAfterKAPIReady),
})

// logic is inverted here
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -449,6 +449,11 @@ spec:
- name
type: object
type: array
machineDeploymentsLastUpdateTime:
description: MachineDeploymentsLastUpdateTime is the timestamp when
the status.MachineDeployments slice was last updated.
format: date-time
type: string
observedGeneration:
description: ObservedGeneration is the most recent generation observed
for this resource.
Expand Down
14 changes: 14 additions & 0 deletions pkg/operation/botanist/component/extensions/worker/mock/mocks.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

49 changes: 47 additions & 2 deletions pkg/operation/botanist/component/extensions/worker/worker.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ package worker

import (
"context"
"fmt"
"time"

"github.com/Masterminds/semver"
Expand All @@ -35,6 +36,7 @@ import (
"github.com/gardener/gardener/pkg/operation/botanist/component"
"github.com/gardener/gardener/pkg/operation/botanist/component/extensions/operatingsystemconfig"
gardenerutils "github.com/gardener/gardener/pkg/utils/gardener"
"github.com/gardener/gardener/pkg/utils/kubernetes/health"
)

const (
Expand All @@ -58,6 +60,7 @@ type Interface interface {
SetInfrastructureProviderStatus(*runtime.RawExtension)
SetWorkerNameToOperatingSystemConfigsMap(map[string]*operatingsystemconfig.OperatingSystemConfigs)
MachineDeployments() []extensionsv1alpha1.MachineDeployment
WaitUntilWorkerStatusMachineDeploymentsUpdated(ctx context.Context) error
}

// Values contains the values used to create a Worker resources.
Expand Down Expand Up @@ -121,8 +124,9 @@ type worker struct {
waitSevereThreshold time.Duration
waitTimeout time.Duration

worker *extensionsv1alpha1.Worker
machineDeployments []extensionsv1alpha1.MachineDeployment
worker *extensionsv1alpha1.Worker
machineDeployments []extensionsv1alpha1.MachineDeployment
machineDeploymentsLastUpdateTime *metav1.Time
}

// Deploy uses the seed client to create or update the Worker resource.
Expand Down Expand Up @@ -250,6 +254,10 @@ func (w *worker) deploy(ctx context.Context, operation string) (extensionsv1alph
return nil
})

// populate the MachineDeploymentsLastUpdate time as it will be used later to confirm if the machineDeployments slice in the worker
// status got updated with the latest ones.
w.machineDeploymentsLastUpdateTime = obj.Status.MachineDeploymentsLastUpdateTime

return w.worker, err
}

Expand Down Expand Up @@ -293,6 +301,22 @@ func (w *worker) Wait(ctx context.Context) error {
w.waitInterval,
w.waitSevereThreshold,
w.waitTimeout,
nil,
)
}

// WaitUntilWorkerStatusMachineDeploymentsUpdated waits until the worker status is updated with the latest machineDeployment slice.
func (w *worker) WaitUntilWorkerStatusMachineDeploymentsUpdated(ctx context.Context) error {
return extensions.WaitUntilObjectReadyWithHealthFunction(
ctx,
w.client,
w.log,
w.checkWorkerStatusMachineDeploymentsUpdated,
w.worker,
extensionsv1alpha1.WorkerResource,
w.waitInterval,
w.waitSevereThreshold,
w.waitTimeout,
func() error {
w.machineDeployments = w.worker.Status.MachineDeployments
return nil
Expand Down Expand Up @@ -353,3 +377,24 @@ func (w *worker) findNodeTemplateAndMachineTypeByPoolName(ctx context.Context, o
}
return nil, ""
}

// checkWorkerStatusMachineDeploymentsUpdated checks if the status of the worker is updated or not during its reconciliation.
// It is updated if
// * The status.MachineDeploymentsLastUpdateTime > the value of the time stamp stored in worker struct before the reconciliation begins.
func (w *worker) checkWorkerStatusMachineDeploymentsUpdated(o client.Object) error {
obj, ok := o.(*extensionsv1alpha1.Worker)
if !ok {
return fmt.Errorf("expected *extensionsv1alpha1.Worker but got %T", o)
}

if obj.Status.MachineDeploymentsLastUpdateTime != nil && (w.machineDeploymentsLastUpdateTime == nil || obj.Status.MachineDeploymentsLastUpdateTime.After(w.machineDeploymentsLastUpdateTime.Time)) {
return nil
}

// TODO(rishabh-11): Remove this check in a future release when it's ensured that all extensions were upgraded and follow the contract to maintain `MachineDeploymentsLastUpdateTime`.
if obj.Status.MachineDeploymentsLastUpdateTime == nil {
return health.CheckExtensionObject(o)
}

return fmt.Errorf("worker status machineDeployments has not been updated")
}
Loading

0 comments on commit 6c02eb3

Please sign in to comment.