Don't CommitUpdate twice in a single reconcile loop #1684

Merged · merged 2 commits · Aug 4, 2021
20 changes: 9 additions & 11 deletions hack/generated/controllers/generic_controller.go
@@ -39,7 +39,7 @@ type GenericReconciler struct {
 	Name                 string
 	GVK                  schema.GroupVersionKind
 	Controller           controller.Controller
-	RequeueDelay         time.Duration
+	RequeueDelayOverride time.Duration
 	CreateDeploymentName func(obj metav1.Object) (string, error)
 }
@@ -54,11 +54,6 @@ type Options struct {
 }
 
 func (options *Options) setDefaults() {
-	// default requeue delay to 5 seconds
-	if options.RequeueDelay == 0 {
-		options.RequeueDelay = 5 * time.Second
-	}
-
 	// override deployment name generator, if provided
 	if options.CreateDeploymentName == nil {
 		options.CreateDeploymentName = createDeploymentName
@@ -135,7 +130,7 @@ func register(mgr ctrl.Manager, reconciledResourceLookup map[schema.GroupKind]sc
 		Log:                  log.WithName(controllerName),
 		Recorder:             mgr.GetEventRecorderFor(controllerName),
 		GVK:                  gvk,
-		RequeueDelay:         options.RequeueDelay,
+		RequeueDelayOverride: options.RequeueDelay,
 		CreateDeploymentName: options.CreateDeploymentName,
 	}
@@ -225,10 +220,13 @@ func (gr *GenericReconciler) Reconcile(ctx context.Context, req ctrl.Request) (c
 		return ctrl.Result{}, err
 	}
 
-	// TODO: This actually means we don't get exponential backoff which Kubernetes does by default,
-	// TODO: but we need this for test today to get fast reconciles.
-	if result.Requeue && result.RequeueAfter == time.Duration(0) {
-		result.RequeueAfter = gr.RequeueDelay
+	// If we have a requeue delay override, set it for all situations where
+	// we are requeueing.
+	hasRequeueDelayOverride := gr.RequeueDelayOverride != time.Duration(0)
+	isRequeueing := result.Requeue || result.RequeueAfter > time.Duration(0)
+	if hasRequeueDelayOverride && isRequeueing {
+		result.RequeueAfter = gr.RequeueDelayOverride
+		result.Requeue = true
 	}
 
 	return result, nil
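To make the new semantics concrete, here is a minimal, self-contained sketch (my illustration, not code from this PR): `applyRequeueDelayOverride` is a hypothetical free-standing copy of the logic in `Reconcile` above. The old code rewrote the delay only when `Requeue` was set with no `RequeueAfter`, and the options defaulted that delay to 5 seconds; the new code leaves results untouched unless an override is configured, and then applies the override to every requeueing result, including ones that carry an explicit `RequeueAfter`:

```go
package main

import (
	"fmt"
	"time"

	ctrl "sigs.k8s.io/controller-runtime"
)

// applyRequeueDelayOverride mirrors the logic added to GenericReconciler.Reconcile:
// when an override is configured, every requeueing result gets the override delay.
func applyRequeueDelayOverride(result ctrl.Result, override time.Duration) ctrl.Result {
	hasOverride := override != time.Duration(0)
	isRequeueing := result.Requeue || result.RequeueAfter > time.Duration(0)
	if hasOverride && isRequeueing {
		result.RequeueAfter = override
		result.Requeue = true
	}
	return result
}

func main() {
	override := 100 * time.Millisecond

	// No requeue requested: the result passes through untouched.
	fmt.Println(applyRequeueDelayOverride(ctrl.Result{}, override))

	// Requeue with no delay: previously always forced to the 5s default;
	// now it only changes when an override is configured.
	fmt.Println(applyRequeueDelayOverride(ctrl.Result{Requeue: true}, override))

	// Explicit RequeueAfter (e.g. derived from ARM's Retry-After): also replaced
	// by the override, which is what lets recorded tests reconcile quickly.
	fmt.Println(applyRequeueDelayOverride(ctrl.Result{RequeueAfter: time.Minute}, override))
}
```

With a zero override (the production default after this change), results pass through unmodified and controller-runtime's usual exponential backoff applies, which also resolves the removed TODO about losing backoff.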
169 changes: 69 additions & 100 deletions hack/generated/pkg/reconcilers/azure_deployment_reconciler.go
@@ -12,7 +12,6 @@ import (
 	"encoding/json"
 	"fmt"
 	"net/http"
-	"strconv"
 	"strings"
 	"time"
 
@@ -47,8 +46,6 @@
 	ResourceStateAnnotation  = "resource-state.infra.azure.com"
 	ResourceErrorAnnotation  = "resource-error.infra.azure.com"
 	ResourceSigAnnotationKey = "resource-sig.infra.azure.com"
-	// PreserveDeploymentAnnotation is the key which tells the applier to keep or delete the deployment
-	PreserveDeploymentAnnotation = "x-preserve-deployment"
 )
 
 // TODO: Do we actually want this at the controller level or this level?
@@ -186,23 +183,6 @@ func (r *AzureDeploymentReconciler) SetDeploymentName(name string) {
 	genruntime.AddAnnotation(r.obj, DeploymentNameAnnotation, name)
 }
 
-func (r *AzureDeploymentReconciler) GetShouldPreserveDeployment() bool {
-	preserveDeploymentString, ok := r.obj.GetAnnotations()[PreserveDeploymentAnnotation]
-	if !ok {
-		return false
-	}
-
-	preserveDeployment, err := strconv.ParseBool(preserveDeploymentString)
-	// Anything other than an error is assumed to be false...
-	// TODO: Would we rather have any usage of this key imply true (regardless of value?)
-	if err != nil {
-		// TODO: Log here
-		return false
-	}
-
-	return preserveDeployment
-}
-
 func (r *AzureDeploymentReconciler) SetResourceProvisioningState(state armclient.ProvisioningState) {
 	// TODO: It's almost certainly not safe to use this as our serialized format as it's not guaranteed backwards compatible?
 	genruntime.AddAnnotation(r.obj, ResourceStateAnnotation, string(state))
@@ -497,7 +477,7 @@ func (r *AzureDeploymentReconciler) CreateDeployment(ctx context.Context) (ctrl.
 
 	err = r.CommitUpdate(ctx)
 	if err != nil {
-		// This is a superfluous error as per https://github.com/kubernetes-sigs/controller-runtime/issues/377
+		// NotFound is a superfluous error as per https://github.com/kubernetes-sigs/controller-runtime/issues/377
 		// The correct handling is just to ignore it and we will get an event shortly with the updated version to patch
 		return ctrl.Result{}, client.IgnoreNotFound(err)
 	}
@@ -531,7 +511,7 @@ func (r *AzureDeploymentReconciler) CreateDeployment(ctx context.Context) (ctrl.
 	err = r.CommitUpdate(ctx)
 
 	if err != nil {
-		// This is a superfluous error as per https://github.com/kubernetes-sigs/controller-runtime/issues/377
+		// NotFound is a superfluous error as per https://github.com/kubernetes-sigs/controller-runtime/issues/377
 		// The correct handling is just to ignore it and we will get an event shortly with the updated version to patch
 		return ctrl.Result{}, client.IgnoreNotFound(err)
 	}
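The `client.IgnoreNotFound` handling described in the comments above is a stock controller-runtime pattern: a NotFound on commit means the object was deleted (or replaced) out from under the reconciler, and per the linked issue the right response is to swallow the error and let the next watch event deliver the fresh version to patch. A minimal sketch against the fake client (my illustration, not part of this PR):

```go
package main

import (
	"context"
	"fmt"

	corev1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"sigs.k8s.io/controller-runtime/pkg/client"
	"sigs.k8s.io/controller-runtime/pkg/client/fake"
)

func main() {
	ctx := context.Background()
	c := fake.NewClientBuilder().Build() // empty fake cluster

	// Committing an object that no longer exists yields a NotFound error...
	gone := &corev1.ConfigMap{ObjectMeta: metav1.ObjectMeta{Name: "gone", Namespace: "default"}}
	err := c.Update(ctx, gone)
	fmt.Println("raw error:", err)

	// ...which IgnoreNotFound maps to nil, ending the reconcile cleanly;
	// every other error passes through unchanged.
	fmt.Println("after IgnoreNotFound:", client.IgnoreNotFound(err))
}
```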
@@ -560,7 +540,40 @@ func (r *AzureDeploymentReconciler) CreateDeployment(ctx context.Context) (ctrl.
 	return result, err
 }
 
+// TODO: There's a bit too much duplicated code between this and create deployment -- should be a good way to combine them?
+func (r *AzureDeploymentReconciler) handleDeploymentFinished(ctx context.Context, deployment *armclient.Deployment) (ctrl.Result, error) {
+	var status genruntime.FromARMConverter
+	if deployment.IsSuccessful() {
+		// TODO: There's some overlap here with what Update does
+		if len(deployment.Properties.OutputResources) == 0 {
+			return ctrl.Result{}, errors.Errorf("template deployment didn't have any output resources")
+		}
+
+		resourceID, err := deployment.ResourceID()
+		if err != nil {
+			return ctrl.Result{}, errors.Wrap(err, "getting resource ID from resource")
+		}
+
+		status, _, err = r.getStatus(ctx, resourceID)
+		if err != nil {
+			return ctrl.Result{}, errors.Wrap(err, "getting status from ARM")
+		}
+	}
+
+	err := r.Update(deployment, status)
+	if err != nil {
+		return ctrl.Result{}, errors.Wrap(err, "updating obj")
+	}
+
+	err = r.CommitUpdate(ctx)
+	if err != nil {
+		// NotFound is a superfluous error as per https://github.com/kubernetes-sigs/controller-runtime/issues/377
+		// The correct handling is just to ignore it and we will get an event shortly with the updated version to patch
+		return ctrl.Result{}, client.IgnoreNotFound(err)
+	}
+
+	return ctrl.Result{}, nil
+}
+
 func (r *AzureDeploymentReconciler) MonitorDeployment(ctx context.Context) (ctrl.Result, error) {
 	deploymentID, deploymentIDOk := r.GetDeploymentID()
 	if !deploymentIDOk {
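The extracted `handleDeploymentFinished` above is what gives this PR its title: recording the final status and clearing the deployment ID/name now happen on the in-memory object before a single `CommitUpdate`, instead of commit, delete, commit again. Committing once matters because the first write bumps the object's `resourceVersion`, so a second write from the same now-stale copy can trip the API server's optimistic-concurrency check. A rough sketch of the pattern with a plain controller-runtime client (my illustration; `deployment-id.infra.azure.com` is a hypothetical annotation key, not taken from this diff):

```go
package main

import (
	"context"
	"fmt"

	corev1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"sigs.k8s.io/controller-runtime/pkg/client"
	"sigs.k8s.io/controller-runtime/pkg/client/fake"
)

func main() {
	ctx := context.Background()
	cm := &corev1.ConfigMap{ObjectMeta: metav1.ObjectMeta{Name: "demo", Namespace: "default"}}
	c := fake.NewClientBuilder().WithObjects(cm).Build()

	// Fetch one copy of the object, as a reconcile loop does.
	obj := &corev1.ConfigMap{}
	if err := c.Get(ctx, client.ObjectKeyFromObject(cm), obj); err != nil {
		panic(err)
	}

	// Accumulate every change on the in-memory copy first: final state,
	// cleared deployment ID/name, and so on...
	obj.Annotations = map[string]string{
		"resource-state.infra.azure.com": "Succeeded",
		"deployment-id.infra.azure.com":  "", // hypothetical annotation key
	}

	// ...then persist them with exactly one write per reconcile step, so no
	// second write can race against a stale resourceVersion.
	if err := c.Update(ctx, obj); err != nil {
		fmt.Println("commit failed:", err)
		return
	}
	fmt.Println("committed once; resourceVersion:", obj.ResourceVersion)
}
```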
@@ -576,7 +589,7 @@ func (r *AzureDeploymentReconciler) MonitorDeployment(ctx context.Context) (ctrl
 		r.SetDeploymentName("")
 		err = r.CommitUpdate(ctx)
 		if err != nil {
-			// This is a superfluous error as per https://github.com/kubernetes-sigs/controller-runtime/issues/377
+			// NotFound is a superfluous error as per https://github.com/kubernetes-sigs/controller-runtime/issues/377
 			// The correct handling is just to ignore it and we will get an event shortly with the updated version to patch
 			return ctrl.Result{}, client.IgnoreNotFound(err)
 		}
@@ -593,90 +606,46 @@ func (r *AzureDeploymentReconciler) MonitorDeployment(ctx context.Context) (ctrl
 		return ctrl.Result{}, errors.Wrapf(err, "getting deployment %q from ARM", deploymentID)
 	}
 
-	var status genruntime.FromARMConverter
-	if deployment.IsSuccessful() {
-		// TODO: There's some overlap here with what Update does
-		if len(deployment.Properties.OutputResources) == 0 {
-			return ctrl.Result{}, errors.Errorf("template deployment didn't have any output resources")
-		}
-
-		resourceID, idErr := deployment.ResourceID()
-		if idErr != nil {
-			return ctrl.Result{}, errors.Wrap(idErr, "getting resource ID from resource")
-		}
-
-		s, _, statusErr := r.getStatus(ctx, resourceID)
-		if statusErr != nil {
-			return ctrl.Result{}, errors.Wrap(statusErr, "getting status from ARM")
-		}
-
-		status = s
-	}
-
-	err = r.Update(deployment, status)
-	if err != nil {
-		return ctrl.Result{}, errors.Wrap(err, "updating obj")
-	}
-
-	err = r.CommitUpdate(ctx)
-	if err != nil {
-		// This is a superfluous error as per https://github.com/kubernetes-sigs/controller-runtime/issues/377
-		// The correct handling is just to ignore it and we will get an event shortly with the updated version to patch
-		return ctrl.Result{}, client.IgnoreNotFound(err)
-	}
-
-	// TODO: Could somehow have a method that grouped both of these calls
-	currentState := r.GetResourceProvisioningState()
-	r.log.V(4).Info("Monitoring deployment", "action", string(CreateOrUpdateActionMonitorDeployment), "id", deployment.ID, "state", currentState)
-	r.recorder.Event(r.obj, v1.EventTypeNormal, string(CreateOrUpdateActionMonitorDeployment), fmt.Sprintf("Monitoring Azure deployment ID=%q, state=%q", deployment.ID, currentState))
-
-	// We do two patches here because if we remove the deployment before we've actually confirmed we persisted
-	// the resource ID, then we will be unable to get the resource ID the next time around. Only once we have
-	// persisted the resource ID can we safely delete the deployment
-	retryAfter = time.Duration(0) // ARM can tell us how long to check after issuing DELETE
-	if deployment.IsTerminalProvisioningState() && !r.GetShouldPreserveDeployment() {
-		r.log.Info(
-			"Deployment in terminal state",
-			"DeploymentID", deployment.ID,
-			"State", deployment.ProvisioningStateOrUnknown(),
-			"Error", deployment.ErrorOrEmpty())
-
-		r.log.V(4).Info("Deleting deployment", "DeploymentID", deployment.ID)
-
-		retryAfter, err = r.ARMClient.DeleteDeployment(ctx, deployment.ID)
-		if err != nil {
-			return ctrl.Result{}, errors.Wrapf(err, "failed deleting deployment %q", deployment.ID)
-		}
-
-		deployment.ID = ""
-		deployment.Name = ""
-
-		err = r.Update(deployment, status)
-		if err != nil {
-			return ctrl.Result{}, errors.Wrap(err, "updating obj")
-		}
-
-		err = r.CommitUpdate(ctx)
-		if err != nil {
-			// This is a superfluous error as per https://github.com/kubernetes-sigs/controller-runtime/issues/377
-			// The correct handling is just to ignore it and we will get an event shortly with the updated version to patch
-			return ctrl.Result{}, client.IgnoreNotFound(err)
-		}
-
-		r.log.V(4).Info(
-			"Cleared deployment info from resource",
-			"DeploymentID", r.GetDeploymentIDOrDefault(),
-			"DeploymentName", r.GetDeploymentNameOrDefault())
-	}
-
-	if deployment.IsTerminalProvisioningState() {
-		// we are done
-		return ctrl.Result{}, nil
-	}
-
-	r.log.V(3).Info("Deployment still running")
-	return ctrl.Result{Requeue: true, RequeueAfter: retryAfter}, err
+	r.log.V(4).Info(
+		"Monitoring deployment",
+		"action", string(CreateOrUpdateActionMonitorDeployment),
+		"id", deploymentID,
+		"state", deployment.ProvisioningStateOrUnknown())
+	r.recorder.Event(
+		r.obj,
+		v1.EventTypeNormal,
+		string(CreateOrUpdateActionMonitorDeployment),
+		fmt.Sprintf("Monitoring Azure deployment ID=%q, state=%q", deploymentID, deployment.ProvisioningStateOrUnknown()))
+
+	// If the deployment isn't done yet, there's nothing to do just bail out
+	if !deployment.IsTerminalProvisioningState() {
+		r.log.V(3).Info("Deployment still running")
+		return ctrl.Result{Requeue: true, RequeueAfter: retryAfter}, nil
+	}
+
+	// The deployment is in a terminal state - let's handle it
+	r.log.Info(
+		"Deployment in terminal state",
+		"DeploymentID", deployment.ID,
+		"State", deployment.ProvisioningStateOrUnknown(),
+		"Error", deployment.ErrorOrEmpty())
+
+	// It is possible that we delete the deployment here and then are unable to persist the details of the created
+	// resource to etcd below. If this happens, a subsequent reconciliation will attempt to GET the deployment which will
+	// fail. That will trigger us to throw the deployment ID away and create a new one (which will end up being a no-op
+	// because the Azure resource already exists). Since it's expected that this sequence of events is rare, we don't
+	// try to optimize for preventing it with some sort of two phase commit or anything.
+	// TODO: Create a unit test that forces this specific sequence of events
+	r.log.V(4).Info("Deleting deployment", "DeploymentID", deployment.ID)
+	_, err = r.ARMClient.DeleteDeployment(ctx, deployment.ID)
+	if err != nil {
+		return ctrl.Result{}, errors.Wrapf(err, "failed deleting deployment %q", deployment.ID)
+	}
+
+	deployment.ID = ""
+	deployment.Name = ""
+
+	return r.handleDeploymentFinished(ctx, deployment)
 }
 
 func (r *AzureDeploymentReconciler) ManageOwnership(ctx context.Context) (ctrl.Result, error) {
2 changes: 1 addition & 1 deletion hack/generated/pkg/testcommon/kube_test_context_envtest.go
@@ -65,7 +65,7 @@ func createEnvtestContext(perTestContext PerTestContext) (*KubeBaseTestContext,
 		return nil, errors.Wrapf(err, "creating controller-runtime manager")
 	}
 
-	var requeueDelay time.Duration // defaults to 5s when zero is passed
+	var requeueDelay time.Duration
 	if perTestContext.AzureClientRecorder.Mode() == recorder.ModeReplaying {
 		perTestContext.T.Log("Minimizing requeue delay")
 		// skip requeue delays when replaying
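With the 5-second default gone, a zero delay now means "no override", which is why the stale comment is deleted here; the envtest context only supplies a non-zero delay when replaying recordings. A sketch of the resulting wiring (the `controllers` import path and the 100ms figure are assumptions, not taken from this PR):

```go
package testcommon

import (
	"time"

	"github.com/Azure/azure-service-operator/hack/generated/controllers"
)

// optionsForMode mirrors the setup above: zero RequeueDelay now means "no
// override" (controller-runtime's default rate limiting applies), and only
// replay runs get a small override so recorded reconciles finish quickly.
func optionsForMode(replaying bool) controllers.Options {
	var requeueDelay time.Duration // zero: keep default requeue behavior
	if replaying {
		requeueDelay = 100 * time.Millisecond // assumed value for fast replays
	}
	return controllers.Options{RequeueDelay: requeueDelay}
}
```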