Skip to content

Commit dfd25de

Browse files
authored
Set Progressing to Succeeded on ClusterExtensionRevision only after availability probes pass (#2524)
Previously, `Progressing` reason was set to `Succeeded` as soon as the rollout transition completed, not taking into account availability probes of the last phase. This meant setting `.spec.progressDeadlineMinutes` could not catch workloads that were applied to the cluster but never became available, due to for example bad image reference. Now `Progressing` stays at `RollingOut` during transition and is only set to `Succeeded` once probes confirm availability, allowing the progress deadline to correctly expire when a workload fails to become available in time.
1 parent 662ecf2 commit dfd25de

File tree

4 files changed

+65
-19
lines changed

4 files changed

+65
-19
lines changed

internal/operator-controller/controllers/clusterextensionrevision_controller.go

Lines changed: 18 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,6 @@ import (
88
"errors"
99
"fmt"
1010
"strings"
11-
"sync"
1211
"time"
1312

1413
appsv1 "k8s.io/api/apps/v1"
@@ -20,6 +19,7 @@ import (
2019
"k8s.io/apimachinery/pkg/runtime/schema"
2120
"k8s.io/apimachinery/pkg/types"
2221
"k8s.io/apimachinery/pkg/util/sets"
22+
"k8s.io/utils/clock"
2323
"pkg.package-operator.run/boxcutter"
2424
"pkg.package-operator.run/boxcutter/machinery"
2525
machinerytypes "pkg.package-operator.run/boxcutter/machinery/types"
@@ -48,9 +48,7 @@ type ClusterExtensionRevisionReconciler struct {
4848
Client client.Client
4949
RevisionEngineFactory RevisionEngineFactory
5050
TrackingCache trackingCache
51-
// track if we have queued up the reconciliation that detects eventual progress deadline issues
52-
// keys is revision UUID, value is boolean
53-
progressDeadlineCheckInFlight sync.Map
51+
Clock clock.Clock
5452
}
5553

5654
type trackingCache interface {
@@ -86,15 +84,19 @@ func (c *ClusterExtensionRevisionReconciler) Reconcile(ctx context.Context, req
8684
// check if we reached the progress deadline only if the revision is still progressing and has not succeeded yet
8785
if isStillProgressing && !succeeded {
8886
timeout := time.Duration(pd) * time.Minute
89-
if time.Since(existingRev.CreationTimestamp.Time) > timeout {
87+
if c.Clock.Since(existingRev.CreationTimestamp.Time) > timeout {
9088
// progress deadline reached, reset any errors and stop reconciling this revision
91-
markAsNotProgressing(reconciledRev, ocv1.ReasonProgressDeadlineExceeded, fmt.Sprintf("Revision has not rolled out for %d minutes.", pd))
89+
markAsNotProgressing(reconciledRev, ocv1.ReasonProgressDeadlineExceeded, fmt.Sprintf("Revision has not rolled out for %d minute(s).", pd))
9290
reconcileErr = nil
9391
res = ctrl.Result{}
94-
} else if _, found := c.progressDeadlineCheckInFlight.Load(existingRev.GetUID()); !found && reconcileErr == nil {
95-
// if we haven't already queued up a reconcile to check for progress deadline, queue one up, but only once
96-
c.progressDeadlineCheckInFlight.Store(existingRev.GetUID(), true)
97-
res = ctrl.Result{RequeueAfter: timeout}
92+
} else if reconcileErr == nil {
93+
// We want to requeue so far in the future that the next reconciliation
94+
// can detect if the revision did not progress within the given timeout.
95+
// Thus, we plan the next reconcile slightly after (+2secs) the timeout is passed.
96+
drift := 2 * time.Second
97+
requeueAfter := existingRev.CreationTimestamp.Time.Add(timeout).Add(drift).Sub(c.Clock.Now()).Round(time.Second)
98+
l.Info(fmt.Sprintf("ProgressDeadline not exceeded, requeue after ~%v to check again.", requeueAfter))
99+
res = ctrl.Result{RequeueAfter: requeueAfter}
98100
}
99101
}
100102
}
@@ -208,9 +210,7 @@ func (c *ClusterExtensionRevisionReconciler) reconcile(ctx context.Context, cer
208210
}
209211

210212
revVersion := cer.GetAnnotations()[labels.BundleVersionKey]
211-
if !rres.InTransition() {
212-
markAsProgressing(cer, ocv1.ReasonSucceeded, fmt.Sprintf("Revision %s has rolled out.", revVersion))
213-
} else {
213+
if rres.InTransition() {
214214
markAsProgressing(cer, ocv1.ReasonRollingOut, fmt.Sprintf("Revision %s is rolling out.", revVersion))
215215
}
216216

@@ -231,6 +231,7 @@ func (c *ClusterExtensionRevisionReconciler) reconcile(ctx context.Context, cer
231231
}
232232
}
233233

234+
markAsProgressing(cer, ocv1.ReasonSucceeded, fmt.Sprintf("Revision %s has rolled out.", revVersion))
234235
markAsAvailable(cer, ocv1.ClusterExtensionRevisionReasonProbesSucceeded, "Objects are available and pass all probes.")
235236

236237
// We'll probably only want to remove this once we are done updating the ClusterExtension conditions
@@ -275,6 +276,9 @@ func (c *ClusterExtensionRevisionReconciler) reconcile(ctx context.Context, cer
275276
} else {
276277
markAsUnavailable(cer, ocv1.ReasonRollingOut, fmt.Sprintf("Revision %s is rolling out.", revVersion))
277278
}
279+
if meta.FindStatusCondition(cer.Status.Conditions, ocv1.ClusterExtensionRevisionTypeProgressing) == nil {
280+
markAsProgressing(cer, ocv1.ReasonRollingOut, fmt.Sprintf("Revision %s is rolling out.", revVersion))
281+
}
278282
}
279283

280284
return ctrl.Result{}, nil
@@ -333,6 +337,7 @@ func (c *ClusterExtensionRevisionReconciler) SetupWithManager(mgr ctrl.Manager)
333337
return true
334338
},
335339
}
340+
c.Clock = clock.RealClock{}
336341
return ctrl.NewControllerManagedBy(mgr).
337342
For(
338343
&ocv1.ClusterExtensionRevision{},

internal/operator-controller/controllers/clusterextensionrevision_controller_test.go

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,8 @@ import (
1616
"k8s.io/apimachinery/pkg/runtime/schema"
1717
"k8s.io/apimachinery/pkg/types"
1818
"k8s.io/apimachinery/pkg/util/sets"
19+
"k8s.io/utils/clock"
20+
clocktesting "k8s.io/utils/clock/testing"
1921
"pkg.package-operator.run/boxcutter"
2022
"pkg.package-operator.run/boxcutter/machinery"
2123
machinerytypes "pkg.package-operator.run/boxcutter/machinery/types"
@@ -372,6 +374,7 @@ func Test_ClusterExtensionRevisionReconciler_Reconcile_RevisionReconciliation(t
372374
name: "set Progressing:True:Succeeded once transition rollout is finished",
373375
revisionResult: mockRevisionResult{
374376
inTransition: false,
377+
isComplete: true,
375378
},
376379
reconcilingRevisionName: clusterExtensionRevisionName,
377380
existingObjs: func() []client.Object {
@@ -380,7 +383,7 @@ func Test_ClusterExtensionRevisionReconciler_Reconcile_RevisionReconciliation(t
380383
meta.SetStatusCondition(&rev1.Status.Conditions, metav1.Condition{
381384
Type: ocv1.TypeProgressing,
382385
Status: metav1.ConditionTrue,
383-
Reason: ocv1.ReasonSucceeded,
386+
Reason: ocv1.ReasonRollingOut,
384387
Message: "Revision 1.0.0 is rolling out.",
385388
ObservedGeneration: 1,
386389
})
@@ -945,16 +948,19 @@ func Test_ClusterExtensionRevisionReconciler_Reconcile_ProgressDeadline(t *testi
945948
validate func(*testing.T, client.Client)
946949
reconcileErr error
947950
reconcileResult ctrl.Result
951+
clock clock.Clock
948952
}{
949953
{
950954
name: "progressing set to false when progress deadline is exceeded",
951955
existingObjs: func() []client.Object {
952956
ext := newTestClusterExtension()
953957
rev1 := newTestClusterExtensionRevision(t, clusterExtensionRevisionName, ext, testScheme)
954958
rev1.Spec.ProgressDeadlineMinutes = 1
955-
rev1.CreationTimestamp = metav1.NewTime(time.Now().Add(-61 * time.Second))
959+
rev1.CreationTimestamp = metav1.NewTime(time.Date(2022, 1, 1, 0, 0, 0, 0, time.UTC))
956960
return []client.Object{rev1, ext}
957961
},
962+
// 61sec elapsed since the creation of the revision
963+
clock: clocktesting.NewFakeClock(time.Date(2022, 1, 1, 0, 1, 1, 0, time.UTC)),
958964
revisionResult: &mockRevisionResult{
959965
inTransition: true,
960966
},
@@ -975,13 +981,14 @@ func Test_ClusterExtensionRevisionReconciler_Reconcile_ProgressDeadline(t *testi
975981
ext := newTestClusterExtension()
976982
rev1 := newTestClusterExtensionRevision(t, clusterExtensionRevisionName, ext, testScheme)
977983
rev1.Spec.ProgressDeadlineMinutes = 1
978-
rev1.CreationTimestamp = metav1.NewTime(time.Now())
984+
rev1.CreationTimestamp = metav1.NewTime(time.Date(2021, 1, 1, 0, 0, 0, 0, time.UTC))
979985
return []client.Object{rev1, ext}
980986
},
987+
clock: clocktesting.NewFakeClock(time.Date(2021, 1, 1, 0, 0, 0, 0, time.UTC)),
981988
revisionResult: &mockRevisionResult{
982989
inTransition: true,
983990
},
984-
reconcileResult: ctrl.Result{RequeueAfter: 1 * time.Minute},
991+
reconcileResult: ctrl.Result{RequeueAfter: 62 * time.Second},
985992
validate: func(t *testing.T, c client.Client) {
986993
rev := &ocv1.ClusterExtensionRevision{}
987994
err := c.Get(t.Context(), client.ObjectKey{
@@ -1049,6 +1056,7 @@ func Test_ClusterExtensionRevisionReconciler_Reconcile_ProgressDeadline(t *testi
10491056
TrackingCache: &mockTrackingCache{
10501057
client: testClient,
10511058
},
1059+
Clock: tc.clock,
10521060
}).Reconcile(t.Context(), ctrl.Request{
10531061
NamespacedName: types.NamespacedName{
10541062
Name: clusterExtensionRevisionName,

test/e2e/features/install.feature

Lines changed: 34 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -383,6 +383,39 @@ Feature: Install ClusterExtension
383383
invalid ClusterExtension configuration: invalid configuration: unknown field "watchNamespace"
384384
"""
385385

386+
@BoxcutterRuntime
387+
@ProgressDeadline
388+
Scenario: Report ClusterExtension as not progressing if the rollout does not become available within given timeout
389+
Given min value for ClusterExtension .spec.progressDeadlineMinutes is set to 1
390+
And min value for ClusterExtensionRevision .spec.progressDeadlineMinutes is set to 1
391+
When ClusterExtension is applied
392+
"""
393+
apiVersion: olm.operatorframework.io/v1
394+
kind: ClusterExtension
395+
metadata:
396+
name: ${NAME}
397+
spec:
398+
namespace: ${TEST_NAMESPACE}
399+
progressDeadlineMinutes: 1
400+
serviceAccount:
401+
name: olm-sa
402+
source:
403+
sourceType: Catalog
404+
catalog:
405+
packageName: test
406+
# bundle refers bad image references, so that the deployment never becomes available
407+
version: 1.0.2
408+
selector:
409+
matchLabels:
410+
"olm.operatorframework.io/metadata.name": test-catalog
411+
"""
412+
Then ClusterExtensionRevision "${NAME}-1" reports Progressing as False with Reason ProgressDeadlineExceeded
413+
And ClusterExtension reports Progressing as False with Reason ProgressDeadlineExceeded and Message:
414+
"""
415+
Revision has not rolled out for 1 minute(s).
416+
"""
417+
And ClusterExtension reports Progressing transition between 1 and 2 minutes since its creation
418+
386419
@BoxcutterRuntime
387420
@ProgressDeadline
388421
Scenario: Report ClusterExtension as not progressing if the rollout does not complete within given timeout
@@ -411,7 +444,7 @@ Feature: Install ClusterExtension
411444
Then ClusterExtensionRevision "${NAME}-1" reports Progressing as False with Reason ProgressDeadlineExceeded
412445
And ClusterExtension reports Progressing as False with Reason ProgressDeadlineExceeded and Message:
413446
"""
414-
Revision has not rolled out for 1 minutes.
447+
Revision has not rolled out for 1 minute(s).
415448
"""
416449
And ClusterExtension reports Progressing transition between 1 and 2 minutes since its creation
417450

test/e2e/features/update.feature

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -240,6 +240,6 @@ Feature: Update ClusterExtension
240240
And ClusterExtension is available
241241
When ClusterExtension is updated to version "1.0.2"
242242
Then ClusterExtension reports "${NAME}-1, ${NAME}-2" as active revisions
243-
And ClusterExtensionRevision "${NAME}-2" reports Progressing as True with Reason Succeeded
243+
And ClusterExtensionRevision "${NAME}-2" reports Progressing as True with Reason RollingOut
244244
And ClusterExtensionRevision "${NAME}-2" reports Available as False with Reason ProbeFailure
245245

0 commit comments

Comments
 (0)