Skip to content

Commit bdf1b63

Browse files
(chore): add e2e tests for workload resilience when catalog is deleted
1 parent 8167ff8 commit bdf1b63

File tree

4 files changed

+172
-11
lines changed

4 files changed

+172
-11
lines changed

internal/operator-controller/applier/boxcutter_test.go

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1190,10 +1190,12 @@ func Test_PreAuthorizer_Integration(t *testing.T) {
11901190
RevisionGenerator: dummyGenerator,
11911191
PreAuthorizer: tc.preAuthorizer(t),
11921192
}
1193-
err := boxcutter.Apply(t.Context(), dummyBundleFs, ext, nil, revisionAnnotations)
1193+
completed, status, err := boxcutter.Apply(t.Context(), dummyBundleFs, ext, nil, revisionAnnotations)
11941194
if tc.validate != nil {
11951195
tc.validate(t, err)
11961196
}
1197+
_ = completed
1198+
_ = status
11971199
})
11981200
}
11991201
}

test/e2e/features/recover.feature

Lines changed: 124 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -149,3 +149,127 @@ Feature: Recover cluster extension from errors that might occur during its lifet
149149
Then ClusterExtension is available
150150
And ClusterExtension reports Progressing as True with Reason Succeeded
151151
And ClusterExtension reports Installed as True
152+
153+
# CATALOG DELETION RESILIENCE SCENARIOS
154+
155+
Scenario: Extension continues running after catalog deletion
156+
Given ServiceAccount "olm-sa" with needed permissions is available in ${TEST_NAMESPACE}
157+
And ClusterExtension is applied
158+
"""
159+
apiVersion: olm.operatorframework.io/v1
160+
kind: ClusterExtension
161+
metadata:
162+
name: ${NAME}
163+
spec:
164+
namespace: ${TEST_NAMESPACE}
165+
serviceAccount:
166+
name: olm-sa
167+
source:
168+
sourceType: Catalog
169+
catalog:
170+
packageName: test
171+
selector:
172+
matchLabels:
173+
"olm.operatorframework.io/metadata.name": test-catalog
174+
"""
175+
And ClusterExtension is rolled out
176+
And ClusterExtension is available
177+
And resource "deployment/test-operator" is available
178+
And resource "configmap/test-configmap" is available
179+
When ClusterCatalog "test" is deleted
180+
# Verify controller still maintains resources after catalog deletion by removing and restoring a resource.
181+
# This approach avoids race conditions because:
182+
# - We don't rely on status flags that might be unchanged (e.g., Installed=True before and after)
183+
# - Resource restoration is an observable event that PROVES the controller reconciled after deletion
184+
# - The controller must actively apply manifests to restore the removed resource
185+
And resource "configmap/test-configmap" is removed
186+
Then resource "configmap/test-configmap" is eventually restored
187+
And resource "deployment/test-operator" is available
188+
189+
Scenario: Config changes are allowed even when the catalog does not exist anymore
190+
Given ServiceAccount "olm-sa" with needed permissions is available in ${TEST_NAMESPACE}
191+
And ClusterExtension is applied
192+
"""
193+
apiVersion: olm.operatorframework.io/v1
194+
kind: ClusterExtension
195+
metadata:
196+
name: ${NAME}
197+
spec:
198+
namespace: ${TEST_NAMESPACE}
199+
serviceAccount:
200+
name: olm-sa
201+
source:
202+
sourceType: Catalog
203+
catalog:
204+
packageName: test
205+
selector:
206+
matchLabels:
207+
"olm.operatorframework.io/metadata.name": test-catalog
208+
"""
209+
And ClusterExtension is rolled out
210+
And ClusterExtension is available
211+
And ClusterCatalog "test" is deleted
212+
When ClusterExtension is updated to add preflight config
213+
"""
214+
apiVersion: olm.operatorframework.io/v1
215+
kind: ClusterExtension
216+
metadata:
217+
name: ${NAME}
218+
spec:
219+
namespace: ${TEST_NAMESPACE}
220+
serviceAccount:
221+
name: olm-sa
222+
install:
223+
preflight:
224+
crdUpgradeSafety:
225+
enforcement: None
226+
source:
227+
sourceType: Catalog
228+
catalog:
229+
packageName: test
230+
selector:
231+
matchLabels:
232+
"olm.operatorframework.io/metadata.name": test-catalog
233+
"""
234+
# Wait for reconciliation of the updated spec (config change should succeed without catalog)
235+
# First ensure the controller has reconciled the new generation (spec update)
236+
And ClusterExtension latest generation has been reconciled
237+
# Config-only changes don't trigger resolution failure because the bundle version hasn't changed.
238+
# The controller falls back to the installed bundle (via handleResolutionError), which allows
239+
# Apply to run and successfully maintain resources. This results in Progressing=Succeeded.
240+
And ClusterExtension reports Progressing as True with Reason Succeeded
241+
Then ClusterExtension is available
242+
And ClusterExtension reports Installed as True
243+
244+
Scenario: Version upgrade does not proceed when catalog does not exist
245+
Given ServiceAccount "olm-sa" with needed permissions is available in ${TEST_NAMESPACE}
246+
And ClusterExtension is applied
247+
"""
248+
apiVersion: olm.operatorframework.io/v1
249+
kind: ClusterExtension
250+
metadata:
251+
name: ${NAME}
252+
spec:
253+
namespace: ${TEST_NAMESPACE}
254+
serviceAccount:
255+
name: olm-sa
256+
source:
257+
sourceType: Catalog
258+
catalog:
259+
packageName: test
260+
version: "1.0.0"
261+
selector:
262+
matchLabels:
263+
"olm.operatorframework.io/metadata.name": test-catalog
264+
"""
265+
And ClusterExtension is rolled out
266+
And ClusterExtension is available
267+
And bundle "test-operator.1.0.0" is installed in version "1.0.0"
268+
When ClusterCatalog "test" is deleted
269+
And ClusterExtension is updated to version "1.0.1"
270+
# Wait for reconciliation after the version change request
271+
# Note: Retrying status means controller will auto-upgrade when catalog becomes available
272+
Then ClusterExtension reports Progressing as True with Reason Retrying
273+
# Verify upgrade did not proceed: version remains at 1.0.0 (not 1.0.1)
274+
And bundle "test-operator.1.0.0" is installed in version "1.0.0"
275+
And ClusterExtension reports Installed as True

test/e2e/steps/hooks.go

Lines changed: 15 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -139,28 +139,33 @@ func stderrOutput(err error) string {
139139
return ""
140140
}
141141

142-
func ScenarioCleanup(ctx context.Context, _ *godog.Scenario, err error) (context.Context, error) {
142+
func ScenarioCleanup(ctx context.Context, _ *godog.Scenario, scenarioErr error) (context.Context, error) {
143143
sc := scenarioCtx(ctx)
144144
for _, bgCmd := range sc.backGroundCmds {
145145
if p := bgCmd.Process; p != nil {
146146
_ = p.Kill()
147147
}
148148
}
149-
if err != nil {
150-
return ctx, err
151-
}
152149

150+
// Run cleanup ALWAYS, even if scenario failed (to prevent resource leaks into next scenario)
153151
forDeletion := []resource{}
154152
if sc.clusterExtensionName != "" {
155153
forDeletion = append(forDeletion, resource{name: sc.clusterExtensionName, kind: "clusterextension"})
156154
}
157155
forDeletion = append(forDeletion, resource{name: sc.namespace, kind: "namespace"})
158-
go func() {
159-
for _, r := range forDeletion {
160-
if _, err := k8sClient("delete", r.kind, r.name, "--ignore-not-found=true"); err != nil {
161-
logger.Info("Error deleting resource", "name", r.name, "namespace", sc.namespace, "stderr", stderrOutput(err))
156+
157+
// Cleanup must be synchronous to ensure proper test isolation.
158+
// If cleanup runs in background, the next scenario may start before resources are deleted.
159+
for _, r := range forDeletion {
160+
// Try graceful deletion first (60s timeout), fall back to force if stuck
161+
if _, err := k8sClient("delete", r.kind, r.name, "--ignore-not-found=true", "--wait=true", "--timeout=60s"); err != nil {
162+
// Force delete if stuck on finalizers. Use grace-period=30s instead of 0
163+
// to allow pods to shut down gracefully and avoid stale state in subsequent scenarios.
164+
if _, forceErr := k8sClient("delete", r.kind, r.name, "--ignore-not-found=true", "--force", "--grace-period=30"); forceErr != nil {
165+
logger.Info("Error force deleting resource", "kind", r.kind, "name", r.name, "stderr", stderrOutput(forceErr))
162166
}
163167
}
164-
}()
165-
return ctx, nil
168+
}
169+
170+
return ctx, scenarioErr
166171
}

test/e2e/steps/steps.go

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,7 @@ func RegisterSteps(sc *godog.ScenarioContext) {
5656
sc.Step(`^(?i)ClusterExtension is updated(?:\s+.*)?$`, ResourceIsApplied)
5757
sc.Step(`^(?i)ClusterExtension is available$`, ClusterExtensionIsAvailable)
5858
sc.Step(`^(?i)ClusterExtension is rolled out$`, ClusterExtensionIsRolledOut)
59+
sc.Step(`^(?i)ClusterExtension (?:latest generation )?has (?:been )?reconciled(?: the latest generation)?$`, ClusterExtensionReconciledLatestGeneration)
5960
sc.Step(`^(?i)ClusterExtension reports "([^"]+)" as active revision(s?)$`, ClusterExtensionReportsActiveRevisions)
6061
sc.Step(`^(?i)ClusterExtension reports ([[:alnum:]]+) as ([[:alnum:]]+) with Reason ([[:alnum:]]+) and Message:$`, ClusterExtensionReportsCondition)
6162
sc.Step(`^(?i)ClusterExtension reports ([[:alnum:]]+) as ([[:alnum:]]+) with Reason ([[:alnum:]]+) and Message includes:$`, ClusterExtensionReportsConditionWithMessageFragment)
@@ -89,6 +90,7 @@ func RegisterSteps(sc *godog.ScenarioContext) {
8990
sc.Step(`^(?i)ClusterCatalog "([^"]+)" serves bundles$`, CatalogServesBundles)
9091
sc.Step(`^"([^"]+)" catalog image version "([^"]+)" is also tagged as "([^"]+)"$`, TagCatalogImage)
9192
sc.Step(`^(?i)ClusterCatalog "([^"]+)" image version "([^"]+)" is also tagged as "([^"]+)"$`, TagCatalogImage)
93+
sc.Step(`^(?i)ClusterCatalog "([^"]+)" is deleted$`, CatalogIsDeleted)
9294

9395
sc.Step(`^(?i)operator "([^"]+)" target namespace is "([^"]+)"$`, OperatorTargetNamespace)
9496
sc.Step(`^(?i)Prometheus metrics are returned in the response$`, PrometheusMetricsAreReturned)
@@ -246,6 +248,25 @@ func ClusterExtensionIsAvailable(ctx context.Context) error {
246248
return nil
247249
}
248250

251+
func ClusterExtensionReconciledLatestGeneration(ctx context.Context) error {
252+
sc := scenarioCtx(ctx)
253+
waitFor(ctx, func() bool {
254+
// Get both generation and observedGeneration in a single kubectl call
255+
output, err := k8sClient("get", "clusterextension", sc.clusterExtensionName,
256+
"-o", "jsonpath={.metadata.generation},{.status.conditions[?(@.type=='Progressing')].observedGeneration}")
257+
if err != nil || output == "" {
258+
return false
259+
}
260+
parts := strings.Split(output, ",")
261+
if len(parts) != 2 || parts[0] == "" || parts[1] == "" {
262+
return false
263+
}
264+
// Both exist and are equal means reconciliation happened
265+
return parts[0] == parts[1]
266+
})
267+
return nil
268+
}
269+
249270
func ClusterExtensionIsRolledOut(ctx context.Context) error {
250271
sc := scenarioCtx(ctx)
251272
require.Eventually(godog.T(ctx), func() bool {
@@ -727,6 +748,15 @@ func TagCatalogImage(name, oldTag, newTag string) error {
727748
return crane.Tag(imageRef, newTag, crane.Insecure)
728749
}
729750

751+
func CatalogIsDeleted(ctx context.Context, catalogName string) error {
752+
catalogFullName := fmt.Sprintf("%s-catalog", catalogName)
753+
_, err := k8sClient("delete", "clustercatalog", catalogFullName, "--ignore-not-found=true", "--wait=true")
754+
if err != nil {
755+
return fmt.Errorf("failed to delete catalog: %v", err)
756+
}
757+
return nil
758+
}
759+
730760
func PrometheusMetricsAreReturned(ctx context.Context) error {
731761
sc := scenarioCtx(ctx)
732762
for podName, mr := range sc.metricsResponse {

0 commit comments

Comments
 (0)