Skip to content

Commit 046b827

Browse files
(fix) catalog deletion resilience support
Enables installed extensions to continue working when their source catalog becomes unavailable or is deleted. When resolution fails due to catalog unavailability, the operator now continues reconciling with the currently installed bundle instead of failing. Changes: - Resolution falls back to installed bundle when catalog unavailable - Unpacking skipped when maintaining current installed state - Helm and Boxcutter appliers handle nil contentFS gracefully - Version upgrades properly blocked without catalog access This ensures workloads remain stable and operational even when the catalog they were installed from is temporarily unavailable or deleted, while appropriately preventing version changes that require catalog access. Assisted-by: Cursor
1 parent bdf1b63 commit 046b827

File tree

12 files changed

+758
-34
lines changed

12 files changed

+758
-34
lines changed

cmd/operator-controller/main.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -629,7 +629,7 @@ func (c *boxcutterReconcilerConfigurator) Configure(ceReconciler *controllers.Cl
629629
controllers.HandleFinalizers(c.finalizers),
630630
controllers.MigrateStorage(storageMigrator),
631631
controllers.RetrieveRevisionStates(revisionStatesGetter),
632-
controllers.ResolveBundle(c.resolver),
632+
controllers.ResolveBundle(c.resolver, c.mgr.GetClient()),
633633
controllers.UnpackBundle(c.imagePuller, c.imageCache),
634634
controllers.ApplyBundleWithBoxcutter(appl.Apply),
635635
}
@@ -748,7 +748,7 @@ func (c *helmReconcilerConfigurator) Configure(ceReconciler *controllers.Cluster
748748
ceReconciler.ReconcileSteps = []controllers.ReconcileStepFunc{
749749
controllers.HandleFinalizers(c.finalizers),
750750
controllers.RetrieveRevisionStates(revisionStatesGetter),
751-
controllers.ResolveBundle(c.resolver),
751+
controllers.ResolveBundle(c.resolver, c.mgr.GetClient()),
752752
controllers.UnpackBundle(c.imagePuller, c.imageCache),
753753
controllers.ApplyBundle(appl),
754754
}

internal/operator-controller/applier/boxcutter.go

Lines changed: 32 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -312,21 +312,38 @@ func (bc *Boxcutter) createOrUpdate(ctx context.Context, user user.Info, rev *oc
312312
return bc.Client.Patch(ctx, rev, client.Apply, client.FieldOwner(bc.FieldOwner), client.ForceOwnership)
313313
}
314314

315-
func (bc *Boxcutter) Apply(ctx context.Context, contentFS fs.FS, ext *ocv1.ClusterExtension, objectLabels, revisionAnnotations map[string]string) error {
316-
// Generate desired revision
317-
desiredRevision, err := bc.RevisionGenerator.GenerateRevision(ctx, contentFS, ext, objectLabels, revisionAnnotations)
315+
func (bc *Boxcutter) Apply(ctx context.Context, contentFS fs.FS, ext *ocv1.ClusterExtension, objectLabels, revisionAnnotations map[string]string) (bool, string, error) {
316+
// List existing revisions first to validate cluster connectivity before checking contentFS.
317+
// This ensures we fail fast on API errors rather than attempting fallback behavior when
318+
// cluster access is unavailable (since the ClusterExtensionRevision controller also requires
319+
// API access to maintain resources). The revision list is also needed to determine if fallback
320+
// is possible when contentFS is nil (at least one revision must exist).
321+
existingRevisions, err := bc.getExistingRevisions(ctx, ext.GetName())
318322
if err != nil {
319-
return err
323+
return false, "", err
320324
}
321325

322-
if err := controllerutil.SetControllerReference(ext, desiredRevision, bc.Scheme); err != nil {
323-
return fmt.Errorf("set ownerref: %w", err)
326+
// If contentFS is nil, we're maintaining the current state without catalog access.
327+
// In this case, we should use the existing installed revision without generating a new one.
328+
if contentFS == nil {
329+
if len(existingRevisions) == 0 {
330+
return false, "", fmt.Errorf("catalog content unavailable and no revision installed")
331+
}
332+
// Returning true here signals that the rollout has succeeded using the current revision.
333+
// This assumes the ClusterExtensionRevision controller is running and will continue to
334+
// reconcile, apply, and maintain the resources defined in that revision via Server-Side Apply,
335+
// ensuring the workload keeps running even when catalog access is unavailable.
336+
return true, "", nil
324337
}
325338

326-
// List all existing revisions
327-
existingRevisions, err := bc.getExistingRevisions(ctx, ext.GetName())
339+
// Generate desired revision
340+
desiredRevision, err := bc.RevisionGenerator.GenerateRevision(ctx, contentFS, ext, objectLabels, revisionAnnotations)
328341
if err != nil {
329-
return err
342+
return false, "", err
343+
}
344+
345+
if err := controllerutil.SetControllerReference(ext, desiredRevision, bc.Scheme); err != nil {
346+
return false, "", fmt.Errorf("set ownerref: %w", err)
330347
}
331348

332349
currentRevision := &ocv1.ClusterExtensionRevision{}
@@ -348,7 +365,7 @@ func (bc *Boxcutter) Apply(ctx context.Context, contentFS fs.FS, ext *ocv1.Clust
348365
// inplace patch was successful, no changes in phases
349366
state = StateUnchanged
350367
default:
351-
return fmt.Errorf("patching %s Revision: %w", desiredRevision.Name, err)
368+
return false, "", fmt.Errorf("patching %s Revision: %w", desiredRevision.Name, err)
352369
}
353370
}
354371

@@ -362,7 +379,7 @@ func (bc *Boxcutter) Apply(ctx context.Context, contentFS fs.FS, ext *ocv1.Clust
362379
case StateNeedsInstall:
363380
err := preflight.Install(ctx, plainObjs)
364381
if err != nil {
365-
return err
382+
return false, "", err
366383
}
367384
// TODO: jlanford's IDE says that "StateNeedsUpgrade" condition is always true, but
368385
// it isn't immediately obvious why that is. Perhaps len(existingRevisions) is
@@ -371,7 +388,7 @@ func (bc *Boxcutter) Apply(ctx context.Context, contentFS fs.FS, ext *ocv1.Clust
371388
case StateNeedsUpgrade:
372389
err := preflight.Upgrade(ctx, plainObjs)
373390
if err != nil {
374-
return err
391+
return false, "", err
375392
}
376393
}
377394
}
@@ -385,15 +402,15 @@ func (bc *Boxcutter) Apply(ctx context.Context, contentFS fs.FS, ext *ocv1.Clust
385402
desiredRevision.Spec.Revision = revisionNumber
386403

387404
if err = bc.garbageCollectOldRevisions(ctx, prevRevisions); err != nil {
388-
return fmt.Errorf("garbage collecting old revisions: %w", err)
405+
return false, "", fmt.Errorf("garbage collecting old revisions: %w", err)
389406
}
390407

391408
if err := bc.createOrUpdate(ctx, getUserInfo(ext), desiredRevision); err != nil {
392-
return fmt.Errorf("creating new Revision: %w", err)
409+
return false, "", fmt.Errorf("creating new Revision: %w", err)
393410
}
394411
}
395412

396-
return nil
413+
return true, "", nil
397414
}
398415

399416
// runPreAuthorizationChecks runs PreAuthorization checks if the PreAuthorizer is set. An error will be returned if

internal/operator-controller/applier/boxcutter_test.go

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -991,14 +991,18 @@ func TestBoxcutter_Apply(t *testing.T) {
991991
labels.PackageNameKey: "test-package",
992992
}
993993
}
994-
err := boxcutter.Apply(t.Context(), testFS, ext, nil, revisionAnnotations)
994+
completed, status, err := boxcutter.Apply(t.Context(), testFS, ext, nil, revisionAnnotations)
995995

996996
// Assert
997997
if tc.expectedErr != "" {
998998
require.Error(t, err)
999999
assert.Contains(t, err.Error(), tc.expectedErr)
1000+
assert.False(t, completed)
1001+
assert.Empty(t, status)
10001002
} else {
10011003
require.NoError(t, err)
1004+
assert.True(t, completed)
1005+
assert.Empty(t, status)
10021006
}
10031007

10041008
if tc.validate != nil {

internal/operator-controller/applier/helm.go

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -84,6 +84,16 @@ func (h *Helm) runPreAuthorizationChecks(ctx context.Context, ext *ocv1.ClusterE
8484
}
8585

8686
func (h *Helm) Apply(ctx context.Context, contentFS fs.FS, ext *ocv1.ClusterExtension, objectLabels map[string]string, storageLabels map[string]string) (bool, string, error) {
87+
// If contentFS is nil, we're maintaining the current state without catalog access.
88+
// In this case, reconcile the existing Helm release if it exists.
89+
if contentFS == nil {
90+
ac, err := h.ActionClientGetter.ActionClientFor(ctx, ext)
91+
if err != nil {
92+
return false, "", err
93+
}
94+
return h.reconcileExistingRelease(ctx, ac, ext)
95+
}
96+
8797
chrt, err := h.buildHelmChart(contentFS, ext)
8898
if err != nil {
8999
return false, "", err
@@ -178,6 +188,62 @@ func (h *Helm) Apply(ctx context.Context, contentFS fs.FS, ext *ocv1.ClusterExte
178188
return true, "", nil
179189
}
180190

191+
// reconcileExistingRelease reconciles an existing Helm release without catalog access.
192+
// This is used when the catalog is unavailable but we need to maintain the current installation.
193+
// It reconciles the release to actively maintain resources, and sets up watchers for monitoring/observability.
194+
func (h *Helm) reconcileExistingRelease(ctx context.Context, ac helmclient.ActionInterface, ext *ocv1.ClusterExtension) (bool, string, error) {
195+
rel, err := ac.Get(ext.GetName())
196+
if errors.Is(err, driver.ErrReleaseNotFound) {
197+
return false, "", fmt.Errorf("catalog content unavailable and no release installed")
198+
}
199+
if err != nil {
200+
return false, "", fmt.Errorf("failed to get current release: %w", err)
201+
}
202+
203+
// Reconcile the existing release to ensure resources are maintained
204+
if err := ac.Reconcile(rel); err != nil {
205+
// Reconcile failed - resources NOT maintained
206+
// Return false (rollout failed) with error
207+
return false, "", err
208+
}
209+
210+
// At this point: Reconcile succeeded - resources ARE maintained (applied to cluster via Server-Side Apply)
211+
// The operations below are for setting up watches to detect drift (i.e., if someone manually modifies the
212+
// resources). If watch setup fails, the resources are still successfully maintained, but we won't detect
213+
// and auto-correct manual modifications. We return true (rollout succeeded) and log watch errors.
214+
logger := klog.FromContext(ctx)
215+
216+
relObjects, err := util.ManifestObjects(strings.NewReader(rel.Manifest), fmt.Sprintf("%s-release-manifest", rel.Name))
217+
if err != nil {
218+
logger.Error(err, "failed to parse manifest objects, cannot set up drift detection watches (resources are applied but drift detection disabled)")
219+
return true, "", nil
220+
}
221+
222+
logger.V(1).Info("setting up drift detection watches on managed objects")
223+
224+
// Defensive nil checks to prevent panics if Manager or Watcher not properly initialized
225+
if h.Manager == nil {
226+
logger.Error(fmt.Errorf("manager is nil"), "Manager not initialized, cannot set up drift detection watches (resources are applied but drift detection disabled)")
227+
return true, "", nil
228+
}
229+
cache, err := h.Manager.Get(ctx, ext)
230+
if err != nil {
231+
logger.Error(err, "failed to get managed content cache, cannot set up drift detection watches (resources are applied but drift detection disabled)")
232+
return true, "", nil
233+
}
234+
235+
if h.Watcher == nil {
236+
logger.Error(fmt.Errorf("watcher is nil"), "Watcher not initialized, cannot set up drift detection watches (resources are applied but drift detection disabled)")
237+
return true, "", nil
238+
}
239+
if err := cache.Watch(ctx, h.Watcher, relObjects...); err != nil {
240+
logger.Error(err, "failed to set up drift detection watches (resources are applied but drift detection disabled)")
241+
return true, "", nil
242+
}
243+
244+
return true, "", nil
245+
}
246+
181247
func (h *Helm) buildHelmChart(bundleFS fs.FS, ext *ocv1.ClusterExtension) (*chart.Chart, error) {
182248
if h.HelmChartProvider == nil {
183249
return nil, errors.New("HelmChartProvider is nil")

internal/operator-controller/controllers/boxcutter_reconcile_steps.go

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -94,7 +94,7 @@ func MigrateStorage(m StorageMigrator) ReconcileStepFunc {
9494
}
9595
}
9696

97-
func ApplyBundleWithBoxcutter(apply func(ctx context.Context, contentFS fs.FS, ext *ocv1.ClusterExtension, objectLabels, revisionAnnotations map[string]string) error) ReconcileStepFunc {
97+
func ApplyBundleWithBoxcutter(apply func(ctx context.Context, contentFS fs.FS, ext *ocv1.ClusterExtension, objectLabels, revisionAnnotations map[string]string) (bool, string, error)) ReconcileStepFunc {
9898
return func(ctx context.Context, state *reconcileState, ext *ocv1.ClusterExtension) (*ctrl.Result, error) {
9999
l := log.FromContext(ctx)
100100
revisionAnnotations := map[string]string{
@@ -109,7 +109,8 @@ func ApplyBundleWithBoxcutter(apply func(ctx context.Context, contentFS fs.FS, e
109109
}
110110

111111
l.Info("applying bundle contents")
112-
if err := apply(ctx, state.imageFS, ext, objLbls, revisionAnnotations); err != nil {
112+
_, _, err := apply(ctx, state.imageFS, ext, objLbls, revisionAnnotations)
113+
if err != nil {
113114
// If there was an error applying the resolved bundle,
114115
// report the error via the Progressing condition.
115116
setStatusProgressing(ext, wrapErrorWithResolutionInfo(state.resolvedRevisionMetadata.BundleMetadata, err))

internal/operator-controller/controllers/boxcutter_reconcile_steps_apply_test.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -133,8 +133,8 @@ func TestApplyBundleWithBoxcutter(t *testing.T) {
133133
imageFS: fstest.MapFS{},
134134
}
135135

136-
stepFunc := ApplyBundleWithBoxcutter(func(_ context.Context, _ fs.FS, _ *ocv1.ClusterExtension, _, _ map[string]string) error {
137-
return nil
136+
stepFunc := ApplyBundleWithBoxcutter(func(_ context.Context, _ fs.FS, _ *ocv1.ClusterExtension, _, _ map[string]string) (bool, string, error) {
137+
return true, "", nil
138138
})
139139
result, err := stepFunc(ctx, state, ext)
140140
require.NoError(t, err)

internal/operator-controller/controllers/clusterextension_admission_test.go

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,9 @@ import (
1313
)
1414

1515
func TestClusterExtensionSourceConfig(t *testing.T) {
16-
sourceTypeEmptyError := "Invalid value: null"
16+
// NOTE: Kubernetes validation error format for JSON null values varies across K8s versions.
17+
// We check for the common part "Invalid value:" which appears in all versions.
18+
sourceTypeEmptyError := "Invalid value:"
1719
sourceTypeMismatchError := "spec.source.sourceType: Unsupported value"
1820
sourceConfigInvalidError := "spec.source: Invalid value"
1921
// unionField represents the required Catalog or (future) Bundle field required by SourceConfig

internal/operator-controller/controllers/clusterextension_controller.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -168,6 +168,8 @@ func (r *ClusterExtensionReconciler) Reconcile(ctx context.Context, req ctrl.Req
168168

169169
// ensureAllConditionsWithReason checks that all defined condition types exist in the given ClusterExtension,
170170
// and assigns a specified reason and custom message to any missing condition.
171+
//
172+
//nolint:unparam // reason parameter is designed to be flexible, even if current callers use the same value
171173
func ensureAllConditionsWithReason(ext *ocv1.ClusterExtension, reason v1alpha1.ConditionReason, message string) {
172174
for _, condType := range conditionsets.ConditionTypes {
173175
cond := apimeta.FindStatusCondition(ext.Status.Conditions, condType)

0 commit comments

Comments
 (0)