Skip to content

Commit 492c7e5

Browse files
🐛 (fix) Fix race condition in Helm to Boxcutter migration during OLM upgrades (#2440)
* (fix) Helm to Boxcutter migration during OLM upgrade When upgrading OLM from standard (Helm runtime) to experimental (Boxcutter runtime), the BoxcutterStorageMigrator creates a ClusterExtensionRevision from the existing Helm release. However, the migrated revision was created without status conditions, causing a race condition where it wasn't recognized as "Installed". Assisted-by: CLAUDE * Update internal/operator-controller/applier/boxcutter_test.go Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> * Update internal/operator-controller/applier/boxcutter.go Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> * Add test requested by copilot --------- Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
1 parent ee4bab5 commit 492c7e5

File tree

4 files changed

+542
-13
lines changed

4 files changed

+542
-13
lines changed

internal/operator-controller/applier/boxcutter.go

Lines changed: 125 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ import (
1515
"helm.sh/helm/v3/pkg/release"
1616
"helm.sh/helm/v3/pkg/storage/driver"
1717
apierrors "k8s.io/apimachinery/pkg/api/errors"
18+
"k8s.io/apimachinery/pkg/api/meta"
1819
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
1920
"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
2021
"k8s.io/apimachinery/pkg/runtime"
@@ -244,8 +245,7 @@ func (m *BoxcutterStorageMigrator) Migrate(ctx context.Context, ext *ocv1.Cluste
244245
return fmt.Errorf("listing ClusterExtensionRevisions before attempting migration: %w", err)
245246
}
246247
if len(existingRevisionList.Items) != 0 {
247-
// No migration needed.
248-
return nil
248+
return m.ensureMigratedRevisionStatus(ctx, existingRevisionList.Items)
249249
}
250250

251251
ac, err := m.ActionClientGetter.ActionClientFor(ctx, ext)
@@ -262,11 +262,36 @@ func (m *BoxcutterStorageMigrator) Migrate(ctx context.Context, ext *ocv1.Cluste
262262
return err
263263
}
264264

265+
// Only migrate from a Helm release that represents a deployed, working installation.
266+
// If the latest revision is not deployed (e.g. FAILED), look through the history and
267+
// select the most-recent deployed release instead.
268+
if helmRelease == nil || helmRelease.Info == nil || helmRelease.Info.Status != release.StatusDeployed {
269+
var err error
270+
helmRelease, err = m.findLatestDeployedRelease(ac, ext.GetName())
271+
if err != nil {
272+
return err
273+
}
274+
if helmRelease == nil {
275+
// No deployed release found in history - skip migration. The ClusterExtension
276+
// controller will handle this via normal rollout.
277+
return nil
278+
}
279+
}
280+
265281
rev, err := m.RevisionGenerator.GenerateRevisionFromHelmRelease(ctx, helmRelease, ext, objectLabels)
266282
if err != nil {
267283
return err
268284
}
269285

286+
// Mark this revision as migrated from Helm so we can distinguish it from
287+
// normal Boxcutter revisions. This label is critical for ensuring we only
288+
// set Succeeded=True status on actually-migrated revisions, not on revision 1
289+
// created during normal Boxcutter operation.
290+
if rev.Labels == nil {
291+
rev.Labels = make(map[string]string)
292+
}
293+
rev.Labels[labels.MigratedFromHelmKey] = "true"
294+
270295
// Set ownerReference for proper garbage collection when the ClusterExtension is deleted.
271296
if err := controllerutil.SetControllerReference(ext, rev, m.Scheme); err != nil {
272297
return fmt.Errorf("set ownerref: %w", err)
@@ -276,9 +301,105 @@ func (m *BoxcutterStorageMigrator) Migrate(ctx context.Context, ext *ocv1.Cluste
276301
return err
277302
}
278303

279-
// Re-fetch to get server-managed fields like Generation
304+
// Set initial status on the migrated revision to mark it as succeeded.
305+
//
306+
// The revision must have a Succeeded=True status condition immediately after creation.
307+
//
308+
// A revision is only considered "Installed" (vs "RollingOut") when it has this condition.
309+
// Without it, the system cannot determine what version is currently installed, which breaks:
310+
// - Version resolution (can't compute upgrade paths from unknown starting point)
311+
// - Status reporting (installed bundle appears as nil)
312+
// - Subsequent upgrades (resolution fails without knowing current version)
313+
//
314+
// While the ClusterExtensionRevision controller would eventually reconcile and set this status,
315+
// that creates a timing gap where the ClusterExtension reconciliation happens before the status
316+
// is set, causing failures during the OLM upgrade window.
317+
//
318+
// Since we're creating this revision from a successfully deployed Helm release, we know it
319+
// represents a working installation and can safely mark it as succeeded immediately.
320+
return m.ensureRevisionStatus(ctx, rev)
321+
}
322+
323+
// ensureMigratedRevisionStatus checks if revision 1 exists and needs its status set.
324+
// This handles the case where revision creation succeeded but status update failed.
325+
// Returns nil if no action is needed.
326+
func (m *BoxcutterStorageMigrator) ensureMigratedRevisionStatus(ctx context.Context, revisions []ocv1.ClusterExtensionRevision) error {
327+
for i := range revisions {
328+
if revisions[i].Spec.Revision != 1 {
329+
continue
330+
}
331+
// Skip if already succeeded - status is already set correctly.
332+
if meta.IsStatusConditionTrue(revisions[i].Status.Conditions, ocv1.ClusterExtensionRevisionTypeSucceeded) {
333+
return nil
334+
}
335+
// Ensure revision 1 status is set correctly, including for previously migrated
336+
// revisions that may not carry the MigratedFromHelm label.
337+
return m.ensureRevisionStatus(ctx, &revisions[i])
338+
}
339+
// No revision 1 found - migration not applicable (revisions created by normal operation).
340+
return nil
341+
}
342+
343+
// findLatestDeployedRelease searches the Helm release history for the most recent deployed release.
344+
// Returns nil if no deployed release is found.
345+
func (m *BoxcutterStorageMigrator) findLatestDeployedRelease(ac helmclient.ActionInterface, name string) (*release.Release, error) {
346+
history, err := ac.History(name)
347+
if errors.Is(err, driver.ErrReleaseNotFound) {
348+
// no Helm Release history -> no prior installation.
349+
return nil, nil
350+
}
351+
if err != nil {
352+
return nil, err
353+
}
354+
355+
var latestDeployed *release.Release
356+
for _, rel := range history {
357+
if rel == nil || rel.Info == nil {
358+
continue
359+
}
360+
if rel.Info.Status != release.StatusDeployed {
361+
continue
362+
}
363+
if latestDeployed == nil || rel.Version > latestDeployed.Version {
364+
latestDeployed = rel
365+
}
366+
}
367+
368+
return latestDeployed, nil
369+
}
370+
371+
// ensureRevisionStatus ensures the revision has the Succeeded status condition set.
372+
// Returns nil if the status is already set or after successfully setting it.
373+
// Only sets status on revisions that were actually migrated from Helm (marked with MigratedFromHelmKey label).
374+
func (m *BoxcutterStorageMigrator) ensureRevisionStatus(ctx context.Context, rev *ocv1.ClusterExtensionRevision) error {
375+
// Re-fetch to get latest version before checking status
280376
if err := m.Client.Get(ctx, client.ObjectKeyFromObject(rev), rev); err != nil {
281-
return fmt.Errorf("getting created revision: %w", err)
377+
return fmt.Errorf("getting existing revision for status check: %w", err)
378+
}
379+
380+
// Only set status if this revision was actually migrated from Helm.
381+
// This prevents us from incorrectly marking normal Boxcutter revision 1 as succeeded
382+
// when it's still in progress.
383+
if rev.Labels[labels.MigratedFromHelmKey] != "true" {
384+
return nil
385+
}
386+
387+
// Check if status is already set to Succeeded=True
388+
if meta.IsStatusConditionTrue(rev.Status.Conditions, ocv1.ClusterExtensionRevisionTypeSucceeded) {
389+
return nil
390+
}
391+
392+
// Set the Succeeded status condition
393+
meta.SetStatusCondition(&rev.Status.Conditions, metav1.Condition{
394+
Type: ocv1.ClusterExtensionRevisionTypeSucceeded,
395+
Status: metav1.ConditionTrue,
396+
Reason: ocv1.ReasonSucceeded,
397+
Message: "Revision succeeded - migrated from Helm release",
398+
ObservedGeneration: rev.GetGeneration(),
399+
})
400+
401+
if err := m.Client.Status().Update(ctx, rev); err != nil {
402+
return fmt.Errorf("updating migrated revision status: %w", err)
282403
}
283404

284405
return nil

0 commit comments

Comments
 (0)