Skip to content

Commit e505739

Browse files
pedjak authored and claude committed
Set Progressing to Succeeded on ClusterExtensionRevision only after availability probes pass
Previously, `Progressing` reason was set to `Succeeded` as soon as the rollout transition completed, not taking into account availability probes of the last phase. This meant setting `.spec.progressDeadlineMinutes` could not catch workloads that were applied to the cluster but never became available due to, for example, a bad image reference. Now `Progressing` stays at `RollingOut` during transition and is only set to `Succeeded` once probes confirm availability, allowing the progress deadline to correctly expire when a workload fails to become available in time. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent af6733e commit e505739

File tree

3 files changed

+54
-16
lines changed

3 files changed

+54
-16
lines changed

internal/operator-controller/controllers/clusterextensionrevision_controller.go

Lines changed: 10 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,6 @@ import (
88
"errors"
99
"fmt"
1010
"strings"
11-
"sync"
1211
"time"
1312

1413
appsv1 "k8s.io/api/apps/v1"
@@ -20,6 +19,7 @@ import (
2019
"k8s.io/apimachinery/pkg/runtime/schema"
2120
"k8s.io/apimachinery/pkg/types"
2221
"k8s.io/apimachinery/pkg/util/sets"
22+
"k8s.io/utils/clock"
2323
"pkg.package-operator.run/boxcutter"
2424
"pkg.package-operator.run/boxcutter/machinery"
2525
machinerytypes "pkg.package-operator.run/boxcutter/machinery/types"
@@ -48,9 +48,7 @@ type ClusterExtensionRevisionReconciler struct {
4848
Client client.Client
4949
RevisionEngineFactory RevisionEngineFactory
5050
TrackingCache trackingCache
51-
// track if we have queued up the reconciliation that detects eventual progress deadline issues
52-
// keys is revision UUID, value is boolean
53-
progressDeadlineCheckInFlight sync.Map
51+
Clock clock.Clock
5452
}
5553

5654
type trackingCache interface {
@@ -86,15 +84,15 @@ func (c *ClusterExtensionRevisionReconciler) Reconcile(ctx context.Context, req
8684
// check if we reached the progress deadline only if the revision is still progressing and has not succeeded yet
8785
if isStillProgressing && !succeeded {
8886
timeout := time.Duration(pd) * time.Minute
89-
if time.Since(existingRev.CreationTimestamp.Time) > timeout {
87+
if c.Clock.Since(existingRev.CreationTimestamp.Time) > timeout {
9088
// progress deadline reached, reset any errors and stop reconciling this revision
9189
markAsNotProgressing(reconciledRev, ocv1.ReasonProgressDeadlineExceeded, fmt.Sprintf("Revision has not rolled out for %d minutes.", pd))
9290
reconcileErr = nil
9391
res = ctrl.Result{}
94-
} else if _, found := c.progressDeadlineCheckInFlight.Load(existingRev.GetUID()); !found && reconcileErr == nil {
95-
// if we haven't already queued up a reconcile to check for progress deadline, queue one up, but only once
96-
c.progressDeadlineCheckInFlight.Store(existingRev.GetUID(), true)
97-
res = ctrl.Result{RequeueAfter: timeout}
92+
} else if reconcileErr == nil {
93+
requeueAfter := existingRev.CreationTimestamp.Time.Add(timeout).Add(2 * time.Second).Sub(c.Clock.Now()).Round(time.Second)
94+
l.Info(fmt.Sprintf("ProgressDeadline not exceeded, requeue after ~%v to check again.", requeueAfter))
95+
res = ctrl.Result{RequeueAfter: requeueAfter}
9896
}
9997
}
10098
}
@@ -203,9 +201,7 @@ func (c *ClusterExtensionRevisionReconciler) reconcile(ctx context.Context, rev
203201
}
204202
}
205203

206-
if !rres.InTransition() {
207-
markAsProgressing(rev, ocv1.ReasonSucceeded, fmt.Sprintf("Revision %s has rolled out.", revVersion))
208-
} else {
204+
if rres.InTransition() {
209205
markAsProgressing(rev, ocv1.ReasonRollingOut, fmt.Sprintf("Revision %s is rolling out.", revVersion))
210206
}
211207

@@ -226,6 +222,7 @@ func (c *ClusterExtensionRevisionReconciler) reconcile(ctx context.Context, rev
226222
}
227223
}
228224

225+
markAsProgressing(rev, ocv1.ReasonSucceeded, fmt.Sprintf("Revision %s has rolled out.", revVersion))
229226
markAsAvailable(rev, ocv1.ClusterExtensionRevisionReasonProbesSucceeded, "Objects are available and pass all probes.")
230227

231228
// We'll probably only want to remove this once we are done updating the ClusterExtension conditions
@@ -313,6 +310,7 @@ func (c *ClusterExtensionRevisionReconciler) SetupWithManager(mgr ctrl.Manager)
313310
return true
314311
},
315312
}
313+
c.Clock = clock.RealClock{}
316314
return ctrl.NewControllerManagedBy(mgr).
317315
For(
318316
&ocv1.ClusterExtensionRevision{},

internal/operator-controller/controllers/clusterextensionrevision_controller_test.go

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,8 @@ import (
1717
"k8s.io/apimachinery/pkg/runtime/schema"
1818
"k8s.io/apimachinery/pkg/types"
1919
"k8s.io/apimachinery/pkg/util/sets"
20+
"k8s.io/utils/clock"
21+
testing2 "k8s.io/utils/clock/testing"
2022
"pkg.package-operator.run/boxcutter"
2123
"pkg.package-operator.run/boxcutter/machinery"
2224
machinerytypes "pkg.package-operator.run/boxcutter/machinery/types"
@@ -373,6 +375,7 @@ func Test_ClusterExtensionRevisionReconciler_Reconcile_RevisionReconciliation(t
373375
name: "set Progressing:True:Succeeded once transition rollout is finished",
374376
revisionResult: mockRevisionResult{
375377
inTransition: false,
378+
isComplete: true,
376379
},
377380
reconcilingRevisionName: clusterExtensionRevisionName,
378381
existingObjs: func() []client.Object {
@@ -381,7 +384,7 @@ func Test_ClusterExtensionRevisionReconciler_Reconcile_RevisionReconciliation(t
381384
meta.SetStatusCondition(&rev1.Status.Conditions, metav1.Condition{
382385
Type: ocv1.TypeProgressing,
383386
Status: metav1.ConditionTrue,
384-
Reason: ocv1.ReasonSucceeded,
387+
Reason: ocv1.ReasonRollingOut,
385388
Message: "Revision 1.0.0 is rolling out.",
386389
ObservedGeneration: 1,
387390
})
@@ -875,16 +878,19 @@ func Test_ClusterExtensionRevisionReconciler_Reconcile_ProgressDeadline(t *testi
875878
validate func(*testing.T, client.Client)
876879
reconcileErr error
877880
reconcileResult ctrl.Result
881+
clock clock.Clock
878882
}{
879883
{
880884
name: "progressing set to false when progress deadline is exceeded",
881885
existingObjs: func() []client.Object {
882886
ext := newTestClusterExtension()
883887
rev1 := newTestClusterExtensionRevision(t, clusterExtensionRevisionName, ext, testScheme)
884888
rev1.Spec.ProgressDeadlineMinutes = 1
885-
rev1.CreationTimestamp = metav1.NewTime(time.Now().Add(-61 * time.Second))
889+
rev1.CreationTimestamp = metav1.NewTime(time.Date(2022, 1, 1, 0, 0, 0, 0, time.UTC))
886890
return []client.Object{rev1, ext}
887891
},
892+
// 61sec elapsed since the creation of the revision
893+
clock: testing2.NewFakeClock(time.Date(2022, 1, 1, 0, 1, 1, 0, time.UTC)),
888894
revisionResult: &mockRevisionResult{
889895
inTransition: true,
890896
},
@@ -905,13 +911,14 @@ func Test_ClusterExtensionRevisionReconciler_Reconcile_ProgressDeadline(t *testi
905911
ext := newTestClusterExtension()
906912
rev1 := newTestClusterExtensionRevision(t, clusterExtensionRevisionName, ext, testScheme)
907913
rev1.Spec.ProgressDeadlineMinutes = 1
908-
rev1.CreationTimestamp = metav1.NewTime(time.Now())
914+
rev1.CreationTimestamp = metav1.NewTime(time.Date(2021, 1, 1, 0, 0, 0, 0, time.UTC))
909915
return []client.Object{rev1, ext}
910916
},
917+
clock: testing2.NewFakeClock(time.Date(2021, 1, 1, 0, 0, 0, 0, time.UTC)),
911918
revisionResult: &mockRevisionResult{
912919
inTransition: true,
913920
},
914-
reconcileResult: ctrl.Result{RequeueAfter: 1 * time.Minute},
921+
reconcileResult: ctrl.Result{RequeueAfter: 62 * time.Second},
915922
validate: func(t *testing.T, c client.Client) {
916923
rev := &ocv1.ClusterExtensionRevision{}
917924
err := c.Get(t.Context(), client.ObjectKey{
@@ -979,6 +986,7 @@ func Test_ClusterExtensionRevisionReconciler_Reconcile_ProgressDeadline(t *testi
979986
TrackingCache: &mockTrackingCache{
980987
client: testClient,
981988
},
989+
Clock: tc.clock,
982990
}).Reconcile(t.Context(), ctrl.Request{
983991
NamespacedName: types.NamespacedName{
984992
Name: clusterExtensionRevisionName,

test/e2e/features/install.feature

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -359,6 +359,38 @@ Feature: Install ClusterExtension
359359
invalid ClusterExtension configuration: invalid configuration: unknown field "watchNamespace"
360360
"""
361361

362+
@BoxcutterRuntime
363+
@ProgressDeadline
364+
Scenario: Report ClusterExtension as not progressing if the rollout does not become available within given timeout
365+
Given min value for ClusterExtension .spec.progressDeadlineMinutes is set to 1
366+
And min value for ClusterExtensionRevision .spec.progressDeadlineMinutes is set to 1
367+
When ClusterExtension is applied
368+
"""
369+
apiVersion: olm.operatorframework.io/v1
370+
kind: ClusterExtension
371+
metadata:
372+
name: ${NAME}
373+
spec:
374+
namespace: ${TEST_NAMESPACE}
375+
progressDeadlineMinutes: 1
376+
serviceAccount:
377+
name: olm-sa
378+
source:
379+
sourceType: Catalog
380+
catalog:
381+
packageName: test
382+
version: 1.0.2
383+
selector:
384+
matchLabels:
385+
"olm.operatorframework.io/metadata.name": test-catalog
386+
"""
387+
Then ClusterExtensionRevision "${NAME}-1" reports Progressing as False with Reason ProgressDeadlineExceeded
388+
And ClusterExtension reports Progressing as False with Reason ProgressDeadlineExceeded and Message:
389+
"""
390+
Revision has not rolled out for 1 minutes.
391+
"""
392+
And ClusterExtension reports Progressing transition between 1 and 2 minutes since its creation
393+
362394
@BoxcutterRuntime
363395
@ProgressDeadline
364396
Scenario: Report ClusterExtension as not progressing if the rollout does not complete within given timeout

0 commit comments

Comments
 (0)