Don't CommitUpdate twice in a single reconcile loop #1684

Merged · merged 2 commits · Aug 4, 2021
20 changes: 9 additions & 11 deletions hack/generated/controllers/generic_controller.go
@@ -39,7 +39,7 @@ type GenericReconciler struct {
 	Name                 string
 	GVK                  schema.GroupVersionKind
 	Controller           controller.Controller
-	RequeueDelay         time.Duration
+	RequeueDelayOverride time.Duration
 	CreateDeploymentName func(obj metav1.Object) (string, error)
 }
@@ -54,11 +54,6 @@ type Options struct {
 }
 
 func (options *Options) setDefaults() {
-	// default requeue delay to 5 seconds
-	if options.RequeueDelay == 0 {
-		options.RequeueDelay = 5 * time.Second
-	}
-
 	// override deployment name generator, if provided
 	if options.CreateDeploymentName == nil {
 		options.CreateDeploymentName = createDeploymentName
@@ -135,7 +130,7 @@ func register(mgr ctrl.Manager, reconciledResourceLookup map[schema.GroupKind]sc
 		Log:                  log.WithName(controllerName),
 		Recorder:             mgr.GetEventRecorderFor(controllerName),
 		GVK:                  gvk,
-		RequeueDelay:         options.RequeueDelay,
+		RequeueDelayOverride: options.RequeueDelay,
 		CreateDeploymentName: options.CreateDeploymentName,
 	}
@@ -225,10 +220,13 @@ func (gr *GenericReconciler) Reconcile(ctx context.Context, req ctrl.Request) (c
 		return ctrl.Result{}, err
 	}
 
-	// TODO: This actually means we don't get exponential backoff which Kubernetes does by default,
-	// TODO: but we need this for test today to get fast reconciles.
-	if result.Requeue && result.RequeueAfter == time.Duration(0) {
-		result.RequeueAfter = gr.RequeueDelay
+	// If we have a requeue delay override, set it for all situations where
+	// we are requeueing.
+	hasRequeueDelayOverride := gr.RequeueDelayOverride != time.Duration(0)
+	isRequeueing := result.Requeue || result.RequeueAfter > time.Duration(0)
+	if hasRequeueDelayOverride && isRequeueing {
+		result.RequeueAfter = gr.RequeueDelayOverride
+		result.Requeue = true
 	}
 
 	return result, nil
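To make the new semantics concrete, here is a minimal, self-contained sketch (my illustration, not code from this PR): `applyRequeueDelayOverride` is a hypothetical free-standing copy of the logic in `Reconcile` above. The old code rewrote the delay only when `Requeue` was set with no `RequeueAfter`, and the options defaulted that delay to 5 seconds; the new code leaves results untouched unless an override is configured, and then applies the override to every requeueing result, including ones that carry an explicit `RequeueAfter`:

```go
package main

import (
	"fmt"
	"time"

	ctrl "sigs.k8s.io/controller-runtime"
)

// applyRequeueDelayOverride mirrors the logic added to GenericReconciler.Reconcile:
// when an override is configured, every requeueing result gets the override delay.
func applyRequeueDelayOverride(result ctrl.Result, override time.Duration) ctrl.Result {
	hasOverride := override != time.Duration(0)
	isRequeueing := result.Requeue || result.RequeueAfter > time.Duration(0)
	if hasOverride && isRequeueing {
		result.RequeueAfter = override
		result.Requeue = true
	}
	return result
}

func main() {
	override := 100 * time.Millisecond

	// No requeue requested: the result passes through untouched.
	fmt.Println(applyRequeueDelayOverride(ctrl.Result{}, override))

	// Requeue with no delay: previously always forced to the 5s default;
	// now it only changes when an override is configured.
	fmt.Println(applyRequeueDelayOverride(ctrl.Result{Requeue: true}, override))

	// Explicit RequeueAfter (e.g. derived from ARM's Retry-After): also replaced
	// by the override, which is what lets recorded tests reconcile quickly.
	fmt.Println(applyRequeueDelayOverride(ctrl.Result{RequeueAfter: time.Minute}, override))
}
```

With a zero override (the production default after this change), results pass through unmodified and controller-runtime's usual exponential backoff applies, which also resolves the removed TODO about losing backoff.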
169 changes: 69 additions & 100 deletions hack/generated/pkg/reconcilers/azure_deployment_reconciler.go
@@ -12,7 +12,6 @@ import (
 	"encoding/json"
 	"fmt"
 	"net/http"
-	"strconv"
 	"strings"
 	"time"
 
@@ -47,8 +46,6 @@
 	ResourceStateAnnotation  = "resource-state.infra.azure.com"
 	ResourceErrorAnnotation  = "resource-error.infra.azure.com"
 	ResourceSigAnnotationKey = "resource-sig.infra.azure.com"
-	// PreserveDeploymentAnnotation is the key which tells the applier to keep or delete the deployment
-	PreserveDeploymentAnnotation = "x-preserve-deployment"
 )
 
 // TODO: Do we actually want this at the controller level or this level?
@@ -186,23 +183,6 @@ func (r *AzureDeploymentReconciler) SetDeploymentName(name string) {
 	genruntime.AddAnnotation(r.obj, DeploymentNameAnnotation, name)
 }
 
-func (r *AzureDeploymentReconciler) GetShouldPreserveDeployment() bool {
-	preserveDeploymentString, ok := r.obj.GetAnnotations()[PreserveDeploymentAnnotation]
-	if !ok {
-		return false
-	}
-
-	preserveDeployment, err := strconv.ParseBool(preserveDeploymentString)
-	// Anything other than an error is assumed to be false...
-	// TODO: Would we rather have any usage of this key imply true (regardless of value?)
-	if err != nil {
-		// TODO: Log here
-		return false
-	}
-
-	return preserveDeployment
-}
-
 func (r *AzureDeploymentReconciler) SetResourceProvisioningState(state armclient.ProvisioningState) {
 	// TODO: It's almost certainly not safe to use this as our serialized format as it's not guaranteed backwards compatible?
 	genruntime.AddAnnotation(r.obj, ResourceStateAnnotation, string(state))
@@ -497,7 +477,7 @@ func (r *AzureDeploymentReconciler) CreateDeployment(ctx context.Context) (ctrl.
 
 	err = r.CommitUpdate(ctx)
 	if err != nil {
-		// This is a superfluous error as per https://github.com/kubernetes-sigs/controller-runtime/issues/377
+		// NotFound is a superfluous error as per https://github.com/kubernetes-sigs/controller-runtime/issues/377
 		// The correct handling is just to ignore it and we will get an event shortly with the updated version to patch
 		return ctrl.Result{}, client.IgnoreNotFound(err)
 	}
@@ -531,7 +511,7 @@ func (r *AzureDeploymentReconciler) CreateDeployment(ctx context.Context) (ctrl.
 	err = r.CommitUpdate(ctx)
 
 	if err != nil {
-		// This is a superfluous error as per https://github.com/kubernetes-sigs/controller-runtime/issues/377
+		// NotFound is a superfluous error as per https://github.com/kubernetes-sigs/controller-runtime/issues/377
 		// The correct handling is just to ignore it and we will get an event shortly with the updated version to patch
 		return ctrl.Result{}, client.IgnoreNotFound(err)
 	}
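The `client.IgnoreNotFound` handling described in the comments above is a stock controller-runtime pattern: a NotFound on commit means the object was deleted (or replaced) out from under the reconciler, and per the linked issue the right response is to swallow the error and let the next watch event deliver the fresh version to patch. A minimal sketch against the fake client (my illustration, not part of this PR):

```go
package main

import (
	"context"
	"fmt"

	corev1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"sigs.k8s.io/controller-runtime/pkg/client"
	"sigs.k8s.io/controller-runtime/pkg/client/fake"
)

func main() {
	ctx := context.Background()
	c := fake.NewClientBuilder().Build() // empty fake cluster

	// Committing an object that no longer exists yields a NotFound error...
	gone := &corev1.ConfigMap{ObjectMeta: metav1.ObjectMeta{Name: "gone", Namespace: "default"}}
	err := c.Update(ctx, gone)
	fmt.Println("raw error:", err)

	// ...which IgnoreNotFound maps to nil, ending the reconcile cleanly;
	// every other error passes through unchanged.
	fmt.Println("after IgnoreNotFound:", client.IgnoreNotFound(err))
}
```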
@@ -560,7 +540,40 @@ func (r *AzureDeploymentReconciler) CreateDeployment(ctx context.Context) (ctrl.
 	return result, err
 }
 
+// TODO: There's a bit too much duplicated code between this and create deployment -- should be a good way to combine them?
+func (r *AzureDeploymentReconciler) handleDeploymentFinished(ctx context.Context, deployment *armclient.Deployment) (ctrl.Result, error) {
+	var status genruntime.FromARMConverter
+	if deployment.IsSuccessful() {
+		// TODO: There's some overlap here with what Update does
+		if len(deployment.Properties.OutputResources) == 0 {
+			return ctrl.Result{}, errors.Errorf("template deployment didn't have any output resources")
+		}
+
+		resourceID, err := deployment.ResourceID()
+		if err != nil {
+			return ctrl.Result{}, errors.Wrap(err, "getting resource ID from resource")
+		}
+
+		status, _, err = r.getStatus(ctx, resourceID)
+		if err != nil {
+			return ctrl.Result{}, errors.Wrap(err, "getting status from ARM")
+		}
+	}
+
+	err := r.Update(deployment, status)
+	if err != nil {
+		return ctrl.Result{}, errors.Wrap(err, "updating obj")
+	}
+
+	err = r.CommitUpdate(ctx)
+	if err != nil {
+		// NotFound is a superfluous error as per https://github.com/kubernetes-sigs/controller-runtime/issues/377
+		// The correct handling is just to ignore it and we will get an event shortly with the updated version to patch
+		return ctrl.Result{}, client.IgnoreNotFound(err)
+	}
+
+	return ctrl.Result{}, nil
+}
+
 func (r *AzureDeploymentReconciler) MonitorDeployment(ctx context.Context) (ctrl.Result, error) {
 	deploymentID, deploymentIDOk := r.GetDeploymentID()
 	if !deploymentIDOk {
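The extracted `handleDeploymentFinished` above is what gives this PR its title: recording the final status and clearing the deployment ID/name now happen on the in-memory object before a single `CommitUpdate`, instead of commit, delete, commit again. Committing once matters because the first write bumps the object's `resourceVersion`, so a second write from the same now-stale copy can trip the API server's optimistic-concurrency check. A rough sketch of the pattern with a plain controller-runtime client (my illustration; `deployment-id.infra.azure.com` is a hypothetical annotation key, not taken from this diff):

```go
package main

import (
	"context"
	"fmt"

	corev1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"sigs.k8s.io/controller-runtime/pkg/client"
	"sigs.k8s.io/controller-runtime/pkg/client/fake"
)

func main() {
	ctx := context.Background()
	cm := &corev1.ConfigMap{ObjectMeta: metav1.ObjectMeta{Name: "demo", Namespace: "default"}}
	c := fake.NewClientBuilder().WithObjects(cm).Build()

	// Fetch one copy of the object, as a reconcile loop does.
	obj := &corev1.ConfigMap{}
	if err := c.Get(ctx, client.ObjectKeyFromObject(cm), obj); err != nil {
		panic(err)
	}

	// Accumulate every change on the in-memory copy first: final state,
	// cleared deployment ID/name, and so on...
	obj.Annotations = map[string]string{
		"resource-state.infra.azure.com": "Succeeded",
		"deployment-id.infra.azure.com":  "", // hypothetical annotation key
	}

	// ...then persist them with exactly one write per reconcile step, so no
	// second write can race against a stale resourceVersion.
	if err := c.Update(ctx, obj); err != nil {
		fmt.Println("commit failed:", err)
		return
	}
	fmt.Println("committed once; resourceVersion:", obj.ResourceVersion)
}
```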
@@ -576,7 +589,7 @@ func (r *AzureDeploymentReconciler) MonitorDeployment(ctx context.Context) (ctrl
 		r.SetDeploymentName("")
 		err = r.CommitUpdate(ctx)
 		if err != nil {
-			// This is a superfluous error as per https://github.com/kubernetes-sigs/controller-runtime/issues/377
+			// NotFound is a superfluous error as per https://github.com/kubernetes-sigs/controller-runtime/issues/377
 			// The correct handling is just to ignore it and we will get an event shortly with the updated version to patch
 			return ctrl.Result{}, client.IgnoreNotFound(err)
 		}
@@ -593,90 +606,46 @@ func (r *AzureDeploymentReconciler) MonitorDeployment(ctx context.Context) (ctrl
 		return ctrl.Result{}, errors.Wrapf(err, "getting deployment %q from ARM", deploymentID)
 	}
 
-	var status genruntime.FromARMConverter
-	if deployment.IsSuccessful() {
-		// TODO: There's some overlap here with what Update does
-		if len(deployment.Properties.OutputResources) == 0 {
-			return ctrl.Result{}, errors.Errorf("template deployment didn't have any output resources")
-		}
-
-		resourceID, idErr := deployment.ResourceID()
-		if idErr != nil {
-			return ctrl.Result{}, errors.Wrap(idErr, "getting resource ID from resource")
-		}
-
-		s, _, statusErr := r.getStatus(ctx, resourceID)
-		if statusErr != nil {
-			return ctrl.Result{}, errors.Wrap(statusErr, "getting status from ARM")
-		}
-
-		status = s
-	}
-
-	err = r.Update(deployment, status)
-	if err != nil {
-		return ctrl.Result{}, errors.Wrap(err, "updating obj")
-	}
-
-	err = r.CommitUpdate(ctx)
-	if err != nil {
-		// This is a superfluous error as per https://github.com/kubernetes-sigs/controller-runtime/issues/377
-		// The correct handling is just to ignore it and we will get an event shortly with the updated version to patch
-		return ctrl.Result{}, client.IgnoreNotFound(err)
-	}
-
-	// TODO: Could somehow have a method that grouped both of these calls
-	currentState := r.GetResourceProvisioningState()
-	r.log.V(4).Info("Monitoring deployment", "action", string(CreateOrUpdateActionMonitorDeployment), "id", deployment.ID, "state", currentState)
-	r.recorder.Event(r.obj, v1.EventTypeNormal, string(CreateOrUpdateActionMonitorDeployment), fmt.Sprintf("Monitoring Azure deployment ID=%q, state=%q", deployment.ID, currentState))
-
-	// We do two patches here because if we remove the deployment before we've actually confirmed we persisted
-	// the resource ID, then we will be unable to get the resource ID the next time around. Only once we have
-	// persisted the resource ID can we safely delete the deployment
-	retryAfter = time.Duration(0) // ARM can tell us how long to check after issuing DELETE
-	if deployment.IsTerminalProvisioningState() && !r.GetShouldPreserveDeployment() {
-		r.log.Info(
-			"Deployment in terminal state",
-			"DeploymentID", deployment.ID,
-			"State", deployment.ProvisioningStateOrUnknown(),
-			"Error", deployment.ErrorOrEmpty())
-
-		r.log.V(4).Info("Deleting deployment", "DeploymentID", deployment.ID)
-
-		retryAfter, err = r.ARMClient.DeleteDeployment(ctx, deployment.ID)
-		if err != nil {
-			return ctrl.Result{}, errors.Wrapf(err, "failed deleting deployment %q", deployment.ID)
-		}
-
-		deployment.ID = ""
-		deployment.Name = ""
-
-		err = r.Update(deployment, status)
-		if err != nil {
-			return ctrl.Result{}, errors.Wrap(err, "updating obj")
-		}
-
-		err = r.CommitUpdate(ctx)
-		if err != nil {
-			// This is a superfluous error as per https://github.com/kubernetes-sigs/controller-runtime/issues/377
-			// The correct handling is just to ignore it and we will get an event shortly with the updated version to patch
-			return ctrl.Result{}, client.IgnoreNotFound(err)
-		}
-
-		r.log.V(4).Info(
-			"Cleared deployment info from resource",
-			"DeploymentID", r.GetDeploymentIDOrDefault(),
-			"DeploymentName", r.GetDeploymentNameOrDefault())
-	}
-
-	if deployment.IsTerminalProvisioningState() {
-		// we are done
-		return ctrl.Result{}, nil
-	}
-
-	r.log.V(3).Info("Deployment still running")
-	return ctrl.Result{Requeue: true, RequeueAfter: retryAfter}, err
+	r.log.V(4).Info(
+		"Monitoring deployment",
+		"action", string(CreateOrUpdateActionMonitorDeployment),
+		"id", deploymentID,
+		"state", deployment.ProvisioningStateOrUnknown())
+	r.recorder.Event(
+		r.obj,
+		v1.EventTypeNormal,
+		string(CreateOrUpdateActionMonitorDeployment),
+		fmt.Sprintf("Monitoring Azure deployment ID=%q, state=%q", deploymentID, deployment.ProvisioningStateOrUnknown()))
+
+	// If the deployment isn't done yet, there's nothing to do just bail out
+	if !deployment.IsTerminalProvisioningState() {
+		r.log.V(3).Info("Deployment still running")
+		return ctrl.Result{Requeue: true, RequeueAfter: retryAfter}, nil
+	}
+
+	// The deployment is in a terminal state - let's handle it
+	r.log.Info(
+		"Deployment in terminal state",
+		"DeploymentID", deployment.ID,
+		"State", deployment.ProvisioningStateOrUnknown(),
+		"Error", deployment.ErrorOrEmpty())
+
+	// It is possible that we delete the deployment here and then are unable to persist the details of the created
+	// resource to etcd below. If this happens, a subsequent reconciliation will attempt to GET the deployment which will
+	// fail. That will trigger us to throw the deployment ID away and create a new one (which will end up being a no-op
+	// because the Azure resource already exists). Since it's expected that this sequence of events is rare, we don't
+	// try to optimize for preventing it with some sort of two phase commit or anything.
+	// TODO: Create a unit test that forces this specific sequence of events
+	r.log.V(4).Info("Deleting deployment", "DeploymentID", deployment.ID)
+	_, err = r.ARMClient.DeleteDeployment(ctx, deployment.ID)
+	if err != nil {
+		return ctrl.Result{}, errors.Wrapf(err, "failed deleting deployment %q", deployment.ID)
+	}
+
+	deployment.ID = ""
+	deployment.Name = ""
+
+	return r.handleDeploymentFinished(ctx, deployment)
 }
 
 func (r *AzureDeploymentReconciler) ManageOwnership(ctx context.Context) (ctrl.Result, error) {
2 changes: 1 addition & 1 deletion hack/generated/pkg/testcommon/kube_test_context_envtest.go
@@ -65,7 +65,7 @@ func createEnvtestContext(perTestContext PerTestContext) (*KubeBaseTestContext,
 		return nil, errors.Wrapf(err, "creating controller-runtime manager")
 	}
 
-	var requeueDelay time.Duration // defaults to 5s when zero is passed
+	var requeueDelay time.Duration
 	if perTestContext.AzureClientRecorder.Mode() == recorder.ModeReplaying {
 		perTestContext.T.Log("Minimizing requeue delay")
 		// skip requeue delays when replaying
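With the 5-second default gone, a zero delay now means "no override", which is why the stale comment is deleted here; the envtest context only supplies a non-zero delay when replaying recordings. A sketch of the resulting wiring (the `controllers` import path and the 100ms figure are assumptions, not taken from this PR):

```go
package testcommon

import (
	"time"

	"github.com/Azure/azure-service-operator/hack/generated/controllers"
)

// optionsForMode mirrors the setup above: zero RequeueDelay now means "no
// override" (controller-runtime's default rate limiting applies), and only
// replay runs get a small override so recorded reconciles finish quickly.
func optionsForMode(replaying bool) controllers.Options {
	var requeueDelay time.Duration // zero: keep default requeue behavior
	if replaying {
		requeueDelay = 100 * time.Millisecond // assumed value for fast replays
	}
	return controllers.Options{RequeueDelay: requeueDelay}
}
```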