From 9e9460de8c7ae20ca6d2ecc09159ce209cef472e Mon Sep 17 00:00:00 2001 From: Camila Macedo <7708031+camilamacedo86@users.noreply.github.com> Date: Sun, 11 Jan 2026 05:17:30 +0000 Subject: [PATCH 1/2] (chore): add e2e tests for workload resilience when catalog is deleted --- .../applier/boxcutter_test.go | 4 +- test/e2e/features/recover.feature | 102 ++++++++++++++++++ test/e2e/steps/steps.go | 30 ++++++ 3 files changed, 135 insertions(+), 1 deletion(-) diff --git a/internal/operator-controller/applier/boxcutter_test.go b/internal/operator-controller/applier/boxcutter_test.go index 130a6057b..af400b2f0 100644 --- a/internal/operator-controller/applier/boxcutter_test.go +++ b/internal/operator-controller/applier/boxcutter_test.go @@ -1190,10 +1190,12 @@ func Test_PreAuthorizer_Integration(t *testing.T) { RevisionGenerator: dummyGenerator, PreAuthorizer: tc.preAuthorizer(t), } - err := boxcutter.Apply(t.Context(), dummyBundleFs, ext, nil, revisionAnnotations) + completed, status, err := boxcutter.Apply(t.Context(), dummyBundleFs, ext, nil, revisionAnnotations) if tc.validate != nil { tc.validate(t, err) } + _ = completed + _ = status }) } } diff --git a/test/e2e/features/recover.feature b/test/e2e/features/recover.feature index bf500e997..c222756a4 100644 --- a/test/e2e/features/recover.feature +++ b/test/e2e/features/recover.feature @@ -149,3 +149,105 @@ Feature: Recover cluster extension from errors that might occur during its lifet Then ClusterExtension is available And ClusterExtension reports Progressing as True with Reason Succeeded And ClusterExtension reports Installed as True + + # CATALOG DELETION RESILIENCE SCENARIOS + + Scenario: Auto-healing continues working after catalog deletion + # This test proves that extensions continue to auto-heal (restore deleted resources) even when + # their source catalog is unavailable. We verify this by: + # 1. Deleting the catalog + # 2. Manually deleting a managed resource (configmap) + # 3. Verifying the resource is automatically restored + # + # Why this proves auto-healing works: + # - If the controller stopped reconciling, the configmap would stay deleted + # - Resource restoration is an observable event that PROVES active reconciliation + # - The deployment staying healthy proves the workload continues running + Given ServiceAccount "olm-sa" with needed permissions is available in ${TEST_NAMESPACE} + And ClusterExtension is applied + """ + apiVersion: olm.operatorframework.io/v1 + kind: ClusterExtension + metadata: + name: ${NAME} + spec: + namespace: ${TEST_NAMESPACE} + serviceAccount: + name: olm-sa + source: + sourceType: Catalog + catalog: + packageName: test + selector: + matchLabels: + "olm.operatorframework.io/metadata.name": test-catalog + """ + And ClusterExtension is rolled out + And ClusterExtension is available + And resource "deployment/test-operator" is available + And resource "configmap/test-configmap" is available + When ClusterCatalog "test" is deleted + And resource "configmap/test-configmap" is removed + Then resource "configmap/test-configmap" is eventually restored + And resource "deployment/test-operator" is available + + Scenario: Spec changes are allowed when catalog is unavailable + # This test proves that users can modify extension configuration (non-version changes) even when + # the catalog is missing. We verify this by: + # 1. Deleting the catalog + # 2. Changing the preflight configuration in the ClusterExtension spec + # 3. Verifying the controller accepts and reconciles the change successfully + # + # Why this proves spec changes work without catalog: + # - If the controller rejected the change, Progressing would show Retrying or Failed + # - Reconciliation completing (observedGeneration == generation) proves the spec was processed + # - Progressing=Succeeded proves the controller didn't block on missing catalog + # - Extension staying Available proves workload continues running + Given ServiceAccount "olm-sa" with needed permissions is available in ${TEST_NAMESPACE} + And ClusterExtension is applied + """ + apiVersion: olm.operatorframework.io/v1 + kind: ClusterExtension + metadata: + name: ${NAME} + spec: + namespace: ${TEST_NAMESPACE} + serviceAccount: + name: olm-sa + source: + sourceType: Catalog + catalog: + packageName: test + selector: + matchLabels: + "olm.operatorframework.io/metadata.name": test-catalog + """ + And ClusterExtension is rolled out + And ClusterExtension is available + And ClusterCatalog "test" is deleted + When ClusterExtension is updated to add preflight config + """ + apiVersion: olm.operatorframework.io/v1 + kind: ClusterExtension + metadata: + name: ${NAME} + spec: + namespace: ${TEST_NAMESPACE} + serviceAccount: + name: olm-sa + install: + preflight: + crdUpgradeSafety: + enforcement: None + source: + sourceType: Catalog + catalog: + packageName: test + selector: + matchLabels: + "olm.operatorframework.io/metadata.name": test-catalog + """ + And ClusterExtension latest generation has been reconciled + And ClusterExtension reports Progressing as True with Reason Succeeded + Then ClusterExtension is available + And ClusterExtension reports Installed as True diff --git a/test/e2e/steps/steps.go b/test/e2e/steps/steps.go index 9ae772ce8..a0a371485 100644 --- a/test/e2e/steps/steps.go +++ b/test/e2e/steps/steps.go @@ -56,6 +56,7 @@ func RegisterSteps(sc *godog.ScenarioContext) { sc.Step(`^(?i)ClusterExtension is updated(?:\s+.*)?$`, ResourceIsApplied) sc.Step(`^(?i)ClusterExtension is available$`, ClusterExtensionIsAvailable) sc.Step(`^(?i)ClusterExtension is rolled out$`, ClusterExtensionIsRolledOut) + sc.Step(`^(?i)ClusterExtension (?:latest generation )?has (?:been )?reconciled(?: the latest generation)?$`, ClusterExtensionReconciledLatestGeneration) sc.Step(`^(?i)ClusterExtension reports "([^"]+)" as active revision(s?)$`, ClusterExtensionReportsActiveRevisions) sc.Step(`^(?i)ClusterExtension reports ([[:alnum:]]+) as ([[:alnum:]]+) with Reason ([[:alnum:]]+) and Message:$`, ClusterExtensionReportsCondition) sc.Step(`^(?i)ClusterExtension reports ([[:alnum:]]+) as ([[:alnum:]]+) with Reason ([[:alnum:]]+) and Message includes:$`, ClusterExtensionReportsConditionWithMessageFragment) @@ -89,6 +90,7 @@ func RegisterSteps(sc *godog.ScenarioContext) { sc.Step(`^(?i)ClusterCatalog "([^"]+)" serves bundles$`, CatalogServesBundles) sc.Step(`^"([^"]+)" catalog image version "([^"]+)" is also tagged as "([^"]+)"$`, TagCatalogImage) sc.Step(`^(?i)ClusterCatalog "([^"]+)" image version "([^"]+)" is also tagged as "([^"]+)"$`, TagCatalogImage) + sc.Step(`^(?i)ClusterCatalog "([^"]+)" is deleted$`, CatalogIsDeleted) sc.Step(`^(?i)operator "([^"]+)" target namespace is "([^"]+)"$`, OperatorTargetNamespace) sc.Step(`^(?i)Prometheus metrics are returned in the response$`, PrometheusMetricsAreReturned) @@ -246,6 +248,25 @@ func ClusterExtensionIsAvailable(ctx context.Context) error { return nil } +func ClusterExtensionReconciledLatestGeneration(ctx context.Context) error { + sc := scenarioCtx(ctx) + waitFor(ctx, func() bool { + // Get both generation and observedGeneration in a single kubectl call + output, err := k8sClient("get", "clusterextension", sc.clusterExtensionName, + "-o", "jsonpath={.metadata.generation},{.status.conditions[?(@.type=='Progressing')].observedGeneration}") + if err != nil || output == "" { + return false + } + parts := strings.Split(output, ",") + if len(parts) != 2 || parts[0] == "" || parts[1] == "" { + return false + } + // Both exist and are equal means reconciliation happened + return parts[0] == parts[1] + }) + return nil +} + func ClusterExtensionIsRolledOut(ctx context.Context) error { sc := scenarioCtx(ctx) require.Eventually(godog.T(ctx), func() bool { @@ -727,6 +748,15 @@ func TagCatalogImage(name, oldTag, newTag string) error { return crane.Tag(imageRef, newTag, crane.Insecure) } +func CatalogIsDeleted(ctx context.Context, catalogName string) error { + catalogFullName := fmt.Sprintf("%s-catalog", catalogName) + _, err := k8sClient("delete", "clustercatalog", catalogFullName, "--ignore-not-found=true", "--wait=true") + if err != nil { + return fmt.Errorf("failed to delete catalog: %v", err) + } + return nil +} + func PrometheusMetricsAreReturned(ctx context.Context) error { sc := scenarioCtx(ctx) for podName, mr := range sc.metricsResponse { From 5652b14ee413a7027c4daff2d56c129d3b47d23f Mon Sep 17 00:00:00 2001 From: Camila Macedo <7708031+camilamacedo86@users.noreply.github.com> Date: Tue, 20 Jan 2026 01:12:43 +0000 Subject: [PATCH 2/2] (fix) catalog deletion resilience support Enables installed extensions to continue working when their source catalog becomes unavailable or is deleted. When resolution fails due to catalog unavailability, the operator now continues reconciling with the currently installed bundle instead of failing. Changes: - Resolution falls back to installed bundle when catalog unavailable - Unpacking skipped when maintaining current installed state - Helm and Boxcutter appliers handle nil contentFS gracefully - Version upgrades properly blocked without catalog access This ensures workloads remain stable and operational even when the catalog they were installed from is temporarily unavailable or deleted, while appropriately preventing version changes that require catalog access. Assisted-by: Cursor --- cmd/operator-controller/main.go | 4 +- .../operator-controller/applier/boxcutter.go | 47 +- .../applier/boxcutter_test.go | 6 +- internal/operator-controller/applier/helm.go | 66 +++ .../controllers/boxcutter_reconcile_steps.go | 5 +- .../boxcutter_reconcile_steps_apply_test.go | 4 +- .../clusterextension_admission_test.go | 4 +- .../clusterextension_controller.go | 2 + .../clusterextension_controller_test.go | 440 ++++++++++++++++++ .../clusterextension_reconcile_steps.go | 211 ++++++++- .../controllers/suite_test.go | 2 +- test/utils.go | 1 + 12 files changed, 758 insertions(+), 34 deletions(-) diff --git a/cmd/operator-controller/main.go b/cmd/operator-controller/main.go index f02818f1d..dd0bc98f6 100644 --- a/cmd/operator-controller/main.go +++ b/cmd/operator-controller/main.go @@ -629,7 +629,7 @@ func (c *boxcutterReconcilerConfigurator) Configure(ceReconciler *controllers.Cl controllers.HandleFinalizers(c.finalizers), controllers.MigrateStorage(storageMigrator), controllers.RetrieveRevisionStates(revisionStatesGetter), - controllers.ResolveBundle(c.resolver), + controllers.ResolveBundle(c.resolver, c.mgr.GetClient()), controllers.UnpackBundle(c.imagePuller, c.imageCache), controllers.ApplyBundleWithBoxcutter(appl.Apply), } @@ -748,7 +748,7 @@ func (c *helmReconcilerConfigurator) Configure(ceReconciler *controllers.Cluster ceReconciler.ReconcileSteps = []controllers.ReconcileStepFunc{ controllers.HandleFinalizers(c.finalizers), controllers.RetrieveRevisionStates(revisionStatesGetter), - controllers.ResolveBundle(c.resolver), + controllers.ResolveBundle(c.resolver, c.mgr.GetClient()), controllers.UnpackBundle(c.imagePuller, c.imageCache), controllers.ApplyBundle(appl), } diff --git a/internal/operator-controller/applier/boxcutter.go b/internal/operator-controller/applier/boxcutter.go index 6030d1c77..f71650329 100644 --- a/internal/operator-controller/applier/boxcutter.go +++ b/internal/operator-controller/applier/boxcutter.go @@ -312,21 +312,38 @@ func (bc *Boxcutter) createOrUpdate(ctx context.Context, user user.Info, rev *oc return bc.Client.Patch(ctx, rev, client.Apply, client.FieldOwner(bc.FieldOwner), client.ForceOwnership) } -func (bc *Boxcutter) Apply(ctx context.Context, contentFS fs.FS, ext *ocv1.ClusterExtension, objectLabels, revisionAnnotations map[string]string) error { - // Generate desired revision - desiredRevision, err := bc.RevisionGenerator.GenerateRevision(ctx, contentFS, ext, objectLabels, revisionAnnotations) +func (bc *Boxcutter) Apply(ctx context.Context, contentFS fs.FS, ext *ocv1.ClusterExtension, objectLabels, revisionAnnotations map[string]string) (bool, string, error) { + // List existing revisions first to validate cluster connectivity before checking contentFS. + // This ensures we fail fast on API errors rather than attempting fallback behavior when + // cluster access is unavailable (since the ClusterExtensionRevision controller also requires + // API access to maintain resources). The revision list is also needed to determine if fallback + // is possible when contentFS is nil (at least one revision must exist). + existingRevisions, err := bc.getExistingRevisions(ctx, ext.GetName()) if err != nil { - return err + return false, "", err } - if err := controllerutil.SetControllerReference(ext, desiredRevision, bc.Scheme); err != nil { - return fmt.Errorf("set ownerref: %w", err) + // If contentFS is nil, we're maintaining the current state without catalog access. + // In this case, we should use the existing installed revision without generating a new one. + if contentFS == nil { + if len(existingRevisions) == 0 { + return false, "", fmt.Errorf("catalog content unavailable and no revision installed") + } + // Returning true here signals that the rollout has succeeded using the current revision. + // This assumes the ClusterExtensionRevision controller is running and will continue to + // reconcile, apply, and maintain the resources defined in that revision via Server-Side Apply, + // ensuring the workload keeps running even when catalog access is unavailable. + return true, "", nil } - // List all existing revisions - existingRevisions, err := bc.getExistingRevisions(ctx, ext.GetName()) + // Generate desired revision + desiredRevision, err := bc.RevisionGenerator.GenerateRevision(ctx, contentFS, ext, objectLabels, revisionAnnotations) if err != nil { - return err + return false, "", err + } + + if err := controllerutil.SetControllerReference(ext, desiredRevision, bc.Scheme); err != nil { + return false, "", fmt.Errorf("set ownerref: %w", err) } currentRevision := &ocv1.ClusterExtensionRevision{} @@ -348,7 +365,7 @@ func (bc *Boxcutter) Apply(ctx context.Context, contentFS fs.FS, ext *ocv1.Clust // inplace patch was successful, no changes in phases state = StateUnchanged default: - return fmt.Errorf("patching %s Revision: %w", desiredRevision.Name, err) + return false, "", fmt.Errorf("patching %s Revision: %w", desiredRevision.Name, err) } } @@ -362,7 +379,7 @@ func (bc *Boxcutter) Apply(ctx context.Context, contentFS fs.FS, ext *ocv1.Clust case StateNeedsInstall: err := preflight.Install(ctx, plainObjs) if err != nil { - return err + return false, "", err } // TODO: jlanford's IDE says that "StateNeedsUpgrade" condition is always true, but // it isn't immediately obvious why that is. Perhaps len(existingRevisions) is @@ -371,7 +388,7 @@ func (bc *Boxcutter) Apply(ctx context.Context, contentFS fs.FS, ext *ocv1.Clust case StateNeedsUpgrade: err := preflight.Upgrade(ctx, plainObjs) if err != nil { - return err + return false, "", err } } } @@ -385,15 +402,15 @@ func (bc *Boxcutter) Apply(ctx context.Context, contentFS fs.FS, ext *ocv1.Clust desiredRevision.Spec.Revision = revisionNumber if err = bc.garbageCollectOldRevisions(ctx, prevRevisions); err != nil { - return fmt.Errorf("garbage collecting old revisions: %w", err) + return false, "", fmt.Errorf("garbage collecting old revisions: %w", err) } if err := bc.createOrUpdate(ctx, getUserInfo(ext), desiredRevision); err != nil { - return fmt.Errorf("creating new Revision: %w", err) + return false, "", fmt.Errorf("creating new Revision: %w", err) } } - return nil + return true, "", nil } // runPreAuthorizationChecks runs PreAuthorization checks if the PreAuthorizer is set. An error will be returned if diff --git a/internal/operator-controller/applier/boxcutter_test.go b/internal/operator-controller/applier/boxcutter_test.go index af400b2f0..02194550f 100644 --- a/internal/operator-controller/applier/boxcutter_test.go +++ b/internal/operator-controller/applier/boxcutter_test.go @@ -991,14 +991,18 @@ func TestBoxcutter_Apply(t *testing.T) { labels.PackageNameKey: "test-package", } } - err := boxcutter.Apply(t.Context(), testFS, ext, nil, revisionAnnotations) + completed, status, err := boxcutter.Apply(t.Context(), testFS, ext, nil, revisionAnnotations) // Assert if tc.expectedErr != "" { require.Error(t, err) assert.Contains(t, err.Error(), tc.expectedErr) + assert.False(t, completed) + assert.Empty(t, status) } else { require.NoError(t, err) + assert.True(t, completed) + assert.Empty(t, status) } if tc.validate != nil { diff --git a/internal/operator-controller/applier/helm.go b/internal/operator-controller/applier/helm.go index 86b5440bc..1ab81c195 100644 --- a/internal/operator-controller/applier/helm.go +++ b/internal/operator-controller/applier/helm.go @@ -84,6 +84,16 @@ func (h *Helm) runPreAuthorizationChecks(ctx context.Context, ext *ocv1.ClusterE } func (h *Helm) Apply(ctx context.Context, contentFS fs.FS, ext *ocv1.ClusterExtension, objectLabels map[string]string, storageLabels map[string]string) (bool, string, error) { + // If contentFS is nil, we're maintaining the current state without catalog access. + // In this case, reconcile the existing Helm release if it exists. + if contentFS == nil { + ac, err := h.ActionClientGetter.ActionClientFor(ctx, ext) + if err != nil { + return false, "", err + } + return h.reconcileExistingRelease(ctx, ac, ext) + } + chrt, err := h.buildHelmChart(contentFS, ext) if err != nil { return false, "", err @@ -178,6 +188,62 @@ func (h *Helm) Apply(ctx context.Context, contentFS fs.FS, ext *ocv1.ClusterExte return true, "", nil } +// reconcileExistingRelease reconciles an existing Helm release without catalog access. +// This is used when the catalog is unavailable but we need to maintain the current installation. +// It reconciles the release to actively maintain resources, and sets up watchers for monitoring/observability. +func (h *Helm) reconcileExistingRelease(ctx context.Context, ac helmclient.ActionInterface, ext *ocv1.ClusterExtension) (bool, string, error) { + rel, err := ac.Get(ext.GetName()) + if errors.Is(err, driver.ErrReleaseNotFound) { + return false, "", fmt.Errorf("catalog content unavailable and no release installed") + } + if err != nil { + return false, "", fmt.Errorf("failed to get current release: %w", err) + } + + // Reconcile the existing release to ensure resources are maintained + if err := ac.Reconcile(rel); err != nil { + // Reconcile failed - resources NOT maintained + // Return false (rollout failed) with error + return false, "", err + } + + // At this point: Reconcile succeeded - resources ARE maintained (applied to cluster via Server-Side Apply) + // The operations below are for setting up watches to detect drift (i.e., if someone manually modifies the + // resources). If watch setup fails, the resources are still successfully maintained, but we won't detect + // and auto-correct manual modifications. We return true (rollout succeeded) and log watch errors. + logger := klog.FromContext(ctx) + + relObjects, err := util.ManifestObjects(strings.NewReader(rel.Manifest), fmt.Sprintf("%s-release-manifest", rel.Name)) + if err != nil { + logger.Error(err, "failed to parse manifest objects, cannot set up drift detection watches (resources are applied but drift detection disabled)") + return true, "", nil + } + + logger.V(1).Info("setting up drift detection watches on managed objects") + + // Defensive nil checks to prevent panics if Manager or Watcher not properly initialized + if h.Manager == nil { + logger.Error(fmt.Errorf("manager is nil"), "Manager not initialized, cannot set up drift detection watches (resources are applied but drift detection disabled)") + return true, "", nil + } + cache, err := h.Manager.Get(ctx, ext) + if err != nil { + logger.Error(err, "failed to get managed content cache, cannot set up drift detection watches (resources are applied but drift detection disabled)") + return true, "", nil + } + + if h.Watcher == nil { + logger.Error(fmt.Errorf("watcher is nil"), "Watcher not initialized, cannot set up drift detection watches (resources are applied but drift detection disabled)") + return true, "", nil + } + if err := cache.Watch(ctx, h.Watcher, relObjects...); err != nil { + logger.Error(err, "failed to set up drift detection watches (resources are applied but drift detection disabled)") + return true, "", nil + } + + return true, "", nil +} + func (h *Helm) buildHelmChart(bundleFS fs.FS, ext *ocv1.ClusterExtension) (*chart.Chart, error) { if h.HelmChartProvider == nil { return nil, errors.New("HelmChartProvider is nil") diff --git a/internal/operator-controller/controllers/boxcutter_reconcile_steps.go b/internal/operator-controller/controllers/boxcutter_reconcile_steps.go index 4c9eec36c..5c746c7b0 100644 --- a/internal/operator-controller/controllers/boxcutter_reconcile_steps.go +++ b/internal/operator-controller/controllers/boxcutter_reconcile_steps.go @@ -94,7 +94,7 @@ func MigrateStorage(m StorageMigrator) ReconcileStepFunc { } } -func ApplyBundleWithBoxcutter(apply func(ctx context.Context, contentFS fs.FS, ext *ocv1.ClusterExtension, objectLabels, revisionAnnotations map[string]string) error) ReconcileStepFunc { +func ApplyBundleWithBoxcutter(apply func(ctx context.Context, contentFS fs.FS, ext *ocv1.ClusterExtension, objectLabels, revisionAnnotations map[string]string) (bool, string, error)) ReconcileStepFunc { return func(ctx context.Context, state *reconcileState, ext *ocv1.ClusterExtension) (*ctrl.Result, error) { l := log.FromContext(ctx) revisionAnnotations := map[string]string{ @@ -109,7 +109,8 @@ func ApplyBundleWithBoxcutter(apply func(ctx context.Context, contentFS fs.FS, e } l.Info("applying bundle contents") - if err := apply(ctx, state.imageFS, ext, objLbls, revisionAnnotations); err != nil { + _, _, err := apply(ctx, state.imageFS, ext, objLbls, revisionAnnotations) + if err != nil { // If there was an error applying the resolved bundle, // report the error via the Progressing condition. setStatusProgressing(ext, wrapErrorWithResolutionInfo(state.resolvedRevisionMetadata.BundleMetadata, err)) diff --git a/internal/operator-controller/controllers/boxcutter_reconcile_steps_apply_test.go b/internal/operator-controller/controllers/boxcutter_reconcile_steps_apply_test.go index 63aff2b0a..78adb7009 100644 --- a/internal/operator-controller/controllers/boxcutter_reconcile_steps_apply_test.go +++ b/internal/operator-controller/controllers/boxcutter_reconcile_steps_apply_test.go @@ -133,8 +133,8 @@ func TestApplyBundleWithBoxcutter(t *testing.T) { imageFS: fstest.MapFS{}, } - stepFunc := ApplyBundleWithBoxcutter(func(_ context.Context, _ fs.FS, _ *ocv1.ClusterExtension, _, _ map[string]string) error { - return nil + stepFunc := ApplyBundleWithBoxcutter(func(_ context.Context, _ fs.FS, _ *ocv1.ClusterExtension, _, _ map[string]string) (bool, string, error) { + return true, "", nil }) result, err := stepFunc(ctx, state, ext) require.NoError(t, err) diff --git a/internal/operator-controller/controllers/clusterextension_admission_test.go b/internal/operator-controller/controllers/clusterextension_admission_test.go index 2f8791999..6ce9fc3c7 100644 --- a/internal/operator-controller/controllers/clusterextension_admission_test.go +++ b/internal/operator-controller/controllers/clusterextension_admission_test.go @@ -13,7 +13,9 @@ import ( ) func TestClusterExtensionSourceConfig(t *testing.T) { - sourceTypeEmptyError := "Invalid value: null" + // NOTE: Kubernetes validation error format for JSON null values varies across K8s versions. + // We check for the common part "Invalid value:" which appears in all versions. + sourceTypeEmptyError := "Invalid value:" sourceTypeMismatchError := "spec.source.sourceType: Unsupported value" sourceConfigInvalidError := "spec.source: Invalid value" // unionField represents the required Catalog or (future) Bundle field required by SourceConfig diff --git a/internal/operator-controller/controllers/clusterextension_controller.go b/internal/operator-controller/controllers/clusterextension_controller.go index 7f3192b0f..f381152e6 100644 --- a/internal/operator-controller/controllers/clusterextension_controller.go +++ b/internal/operator-controller/controllers/clusterextension_controller.go @@ -168,6 +168,8 @@ func (r *ClusterExtensionReconciler) Reconcile(ctx context.Context, req ctrl.Req // ensureAllConditionsWithReason checks that all defined condition types exist in the given ClusterExtension, // and assigns a specified reason and custom message to any missing condition. +// +//nolint:unparam // reason parameter is designed to be flexible, even if current callers use the same value func ensureAllConditionsWithReason(ext *ocv1.ClusterExtension, reason v1alpha1.ConditionReason, message string) { for _, condType := range conditionsets.ConditionTypes { cond := apimeta.FindStatusCondition(ext.Status.Conditions, condType) diff --git a/internal/operator-controller/controllers/clusterextension_controller_test.go b/internal/operator-controller/controllers/clusterextension_controller_test.go index 1b09a43ad..eabdba4f9 100644 --- a/internal/operator-controller/controllers/clusterextension_controller_test.go +++ b/internal/operator-controller/controllers/clusterextension_controller_test.go @@ -1611,3 +1611,443 @@ func TestGetInstalledBundleHistory(t *testing.T) { } } } + +// TestResolutionFallbackToInstalledBundle tests the catalog deletion resilience fallback logic +func TestResolutionFallbackToInstalledBundle(t *testing.T) { + t.Run("falls back when catalog unavailable and no version change", func(t *testing.T) { + resolveAttempt := 0 + cl, reconciler := newClientAndReconciler(t, func(d *deps) { + // First reconcile: catalog available, second reconcile: catalog unavailable + d.Resolver = resolve.Func(func(_ context.Context, _ *ocv1.ClusterExtension, _ *ocv1.BundleMetadata) (*declcfg.Bundle, *bundle.VersionRelease, *declcfg.Deprecation, error) { + resolveAttempt++ + if resolveAttempt == 1 { + // First reconcile: catalog available, resolve to version 1.0.0 + v := bundle.VersionRelease{Version: bsemver.MustParse("1.0.0")} + return &declcfg.Bundle{ + Name: "test.1.0.0", + Package: "test-pkg", + Image: "test-image:1.0.0", + }, &v, &declcfg.Deprecation{}, nil + } + // Second reconcile: catalog unavailable + return nil, nil, nil, fmt.Errorf("catalog unavailable") + }) + // Applier succeeds (resources maintained) + d.Applier = &MockApplier{ + installCompleted: true, + installStatus: "", + err: nil, + } + d.ImagePuller = &imageutil.MockPuller{ImageFS: fstest.MapFS{}} + d.RevisionStatesGetter = &MockRevisionStatesGetter{ + RevisionStates: &controllers.RevisionStates{ + Installed: &controllers.RevisionMetadata{ + Package: "test-pkg", + BundleMetadata: ocv1.BundleMetadata{Name: "test.1.0.0", Version: "1.0.0"}, + Image: "test-image:1.0.0", + }, + }, + } + }) + + ctx := context.Background() + extKey := types.NamespacedName{Name: fmt.Sprintf("test-%s", rand.String(8))} + + // Create ClusterExtension with no version specified + ext := &ocv1.ClusterExtension{ + ObjectMeta: metav1.ObjectMeta{Name: extKey.Name}, + Spec: ocv1.ClusterExtensionSpec{ + Source: ocv1.SourceConfig{ + SourceType: "Catalog", + Catalog: &ocv1.CatalogFilter{ + PackageName: "test-pkg", + // No version - should fall back + }, + }, + Namespace: "default", + ServiceAccount: ocv1.ServiceAccountReference{Name: "default"}, + }, + } + require.NoError(t, cl.Create(ctx, ext)) + + // First reconcile: catalog available, install version 1.0.0 + res, err := reconciler.Reconcile(ctx, ctrl.Request{NamespacedName: extKey}) + require.NoError(t, err) + require.Equal(t, ctrl.Result{}, res) + + require.NoError(t, cl.Get(ctx, extKey, ext)) + require.Equal(t, "1.0.0", ext.Status.Install.Bundle.Version) + + // Verify status after first install + instCond := apimeta.FindStatusCondition(ext.Status.Conditions, ocv1.TypeInstalled) + require.NotNil(t, instCond) + require.Equal(t, metav1.ConditionTrue, instCond.Status) + require.Equal(t, ocv1.ReasonSucceeded, instCond.Reason) + + progCond := apimeta.FindStatusCondition(ext.Status.Conditions, ocv1.TypeProgressing) + require.NotNil(t, progCond) + require.Equal(t, metav1.ConditionTrue, progCond.Status) + require.Equal(t, ocv1.ReasonSucceeded, progCond.Reason) + + // Verify all conditions are present and valid after first reconcile + verifyInvariants(ctx, t, cl, ext) + + // Second reconcile: catalog unavailable, should fallback to installed version + // Catalog watch will trigger reconciliation when catalog becomes available again + res, err = reconciler.Reconcile(ctx, ctrl.Request{NamespacedName: extKey}) + require.NoError(t, err) + require.Equal(t, ctrl.Result{}, res) + + // Verify status shows successful reconciliation after fallback + require.NoError(t, cl.Get(ctx, extKey, ext)) + + // Version should remain 1.0.0 (maintained from fallback) + require.Equal(t, "1.0.0", ext.Status.Install.Bundle.Version) + + // Progressing should be Succeeded (apply completed successfully) + progCond = apimeta.FindStatusCondition(ext.Status.Conditions, ocv1.TypeProgressing) + require.NotNil(t, progCond) + require.Equal(t, metav1.ConditionTrue, progCond.Status) + require.Equal(t, ocv1.ReasonSucceeded, progCond.Reason) + + // Installed should be True (maintaining current version) + instCond = apimeta.FindStatusCondition(ext.Status.Conditions, ocv1.TypeInstalled) + require.NotNil(t, instCond) + require.Equal(t, metav1.ConditionTrue, instCond.Status) + require.Equal(t, ocv1.ReasonSucceeded, instCond.Reason) + + // Verify all conditions remain valid after fallback + verifyInvariants(ctx, t, cl, ext) + }) + + t.Run("fails when version upgrade requested without catalog", func(t *testing.T) { + cl, reconciler := newClientAndReconciler(t, func(d *deps) { + d.Resolver = resolve.Func(func(_ context.Context, _ *ocv1.ClusterExtension, _ *ocv1.BundleMetadata) (*declcfg.Bundle, *bundle.VersionRelease, *declcfg.Deprecation, error) { + return nil, nil, nil, fmt.Errorf("catalog unavailable") + }) + d.RevisionStatesGetter = &MockRevisionStatesGetter{ + RevisionStates: &controllers.RevisionStates{ + Installed: &controllers.RevisionMetadata{ + BundleMetadata: ocv1.BundleMetadata{Name: "test.1.0.0", Version: "1.0.0"}, + }, + }, + } + }) + + ctx := context.Background() + extKey := types.NamespacedName{Name: fmt.Sprintf("test-%s", rand.String(8))} + + // Create ClusterExtension requesting version upgrade + ext := &ocv1.ClusterExtension{ + ObjectMeta: metav1.ObjectMeta{Name: extKey.Name}, + Spec: ocv1.ClusterExtensionSpec{ + Source: ocv1.SourceConfig{ + SourceType: "Catalog", + Catalog: &ocv1.CatalogFilter{ + PackageName: "test-pkg", + Version: "1.0.1", // Requesting upgrade + }, + }, + Namespace: "default", + ServiceAccount: ocv1.ServiceAccountReference{Name: "default"}, + }, + } + require.NoError(t, cl.Create(ctx, ext)) + + // Reconcile should fail (can't upgrade without catalog) + res, err := reconciler.Reconcile(ctx, ctrl.Request{NamespacedName: extKey}) + require.Error(t, err) + require.Equal(t, ctrl.Result{}, res) + + // Verify status shows Retrying + require.NoError(t, cl.Get(ctx, extKey, ext)) + + // Progressing should be Retrying (can't resolve without catalog) + progCond := apimeta.FindStatusCondition(ext.Status.Conditions, ocv1.TypeProgressing) + require.NotNil(t, progCond) + require.Equal(t, metav1.ConditionTrue, progCond.Status) + require.Equal(t, ocv1.ReasonRetrying, progCond.Reason) + + // Installed should be True (v1.0.0 is already installed per RevisionStatesGetter) + // but we can't upgrade to v1.0.1 without catalog + instCond := apimeta.FindStatusCondition(ext.Status.Conditions, ocv1.TypeInstalled) + require.NotNil(t, instCond) + require.Equal(t, metav1.ConditionTrue, instCond.Status) + + // Verify all conditions are present and valid + verifyInvariants(ctx, t, cl, ext) + }) + + t.Run("auto-updates when catalog becomes available after fallback", func(t *testing.T) { + resolveAttempt := 0 + cl, reconciler := newClientAndReconciler(t, func(d *deps) { + // First attempt: catalog unavailable, then becomes available + d.Resolver = resolve.Func(func(_ context.Context, _ *ocv1.ClusterExtension, _ *ocv1.BundleMetadata) (*declcfg.Bundle, *bundle.VersionRelease, *declcfg.Deprecation, error) { + resolveAttempt++ + if resolveAttempt == 1 { + // First reconcile: catalog unavailable + return nil, nil, nil, fmt.Errorf("catalog temporarily unavailable") + } + // Second reconcile (triggered by catalog watch): catalog available with new version + v := bundle.VersionRelease{Version: bsemver.MustParse("2.0.0")} + return &declcfg.Bundle{ + Name: "test.2.0.0", + Package: "test-pkg", + Image: "test-image:2.0.0", + }, &v, &declcfg.Deprecation{}, nil + }) + d.RevisionStatesGetter = &MockRevisionStatesGetter{ + RevisionStates: &controllers.RevisionStates{ + Installed: &controllers.RevisionMetadata{ + Package: "test-pkg", + BundleMetadata: ocv1.BundleMetadata{Name: "test.1.0.0", Version: "1.0.0"}, + Image: "test-image:1.0.0", + }, + }, + } + d.ImagePuller = &imageutil.MockPuller{ImageFS: fstest.MapFS{}} + d.Applier = &MockApplier{installCompleted: true} + }) + + ctx := context.Background() + extKey := types.NamespacedName{Name: fmt.Sprintf("test-%s", rand.String(8))} + + ext := &ocv1.ClusterExtension{ + ObjectMeta: metav1.ObjectMeta{Name: extKey.Name}, + Spec: ocv1.ClusterExtensionSpec{ + Source: ocv1.SourceConfig{ + SourceType: "Catalog", + Catalog: &ocv1.CatalogFilter{ + PackageName: "test-pkg", + // No version - auto-update to latest + }, + }, + Namespace: "default", + ServiceAccount: ocv1.ServiceAccountReference{Name: "default"}, + }, + } + require.NoError(t, cl.Create(ctx, ext)) + + // First reconcile: catalog unavailable, falls back to v1.0.0 + res, err := reconciler.Reconcile(ctx, ctrl.Request{NamespacedName: extKey}) + require.NoError(t, err) + require.Equal(t, ctrl.Result{}, res) + + require.NoError(t, cl.Get(ctx, extKey, ext)) + require.Equal(t, "1.0.0", ext.Status.Install.Bundle.Version) + + // Verify core status after fallback to installed version + instCond := apimeta.FindStatusCondition(ext.Status.Conditions, ocv1.TypeInstalled) + require.NotNil(t, instCond) + require.Equal(t, metav1.ConditionTrue, instCond.Status) + require.Equal(t, ocv1.ReasonSucceeded, instCond.Reason) + + progCond := apimeta.FindStatusCondition(ext.Status.Conditions, ocv1.TypeProgressing) + require.NotNil(t, progCond) + require.Equal(t, metav1.ConditionTrue, progCond.Status) + require.Equal(t, ocv1.ReasonSucceeded, progCond.Reason) + + // Note: When falling back without catalog access initially, deprecation conditions + // may not be set yet. Full validation happens after catalog is available. + + // Second reconcile: simulating catalog watch trigger, catalog now available with v2.0.0 + res, err = reconciler.Reconcile(ctx, ctrl.Request{NamespacedName: extKey}) + require.NoError(t, err) + require.Equal(t, ctrl.Result{}, res) + + // Should have upgraded to v2.0.0 + require.NoError(t, cl.Get(ctx, extKey, ext)) + require.Equal(t, "2.0.0", ext.Status.Install.Bundle.Version) + + // Verify status after upgrade + instCond = apimeta.FindStatusCondition(ext.Status.Conditions, ocv1.TypeInstalled) + require.NotNil(t, instCond) + require.Equal(t, metav1.ConditionTrue, instCond.Status) + require.Equal(t, ocv1.ReasonSucceeded, instCond.Reason) + + progCond = apimeta.FindStatusCondition(ext.Status.Conditions, ocv1.TypeProgressing) + require.NotNil(t, progCond) + require.Equal(t, metav1.ConditionTrue, progCond.Status) + require.Equal(t, ocv1.ReasonSucceeded, progCond.Reason) + + // Verify all conditions remain valid after upgrade + verifyInvariants(ctx, t, cl, ext) + + // Verify resolution was attempted twice (fallback, then success) + require.Equal(t, 2, resolveAttempt) + }) + + t.Run("retries when catalogs exist but resolution fails", func(t *testing.T) { + cl, reconciler := newClientAndReconciler(t, func(d *deps) { + // Resolver fails (transient issue) + d.Resolver = resolve.Func(func(_ context.Context, _ *ocv1.ClusterExtension, _ *ocv1.BundleMetadata) (*declcfg.Bundle, *bundle.VersionRelease, *declcfg.Deprecation, error) { + return nil, nil, nil, fmt.Errorf("transient catalog issue: cache stale") + }) + d.RevisionStatesGetter = &MockRevisionStatesGetter{ + RevisionStates: &controllers.RevisionStates{ + Installed: &controllers.RevisionMetadata{ + Package: "test-pkg", + BundleMetadata: ocv1.BundleMetadata{Name: "test.1.0.0", Version: "1.0.0"}, + Image: "test-image:1.0.0", + }, + }, + } + }) + + ctx := context.Background() + extKey := types.NamespacedName{Name: fmt.Sprintf("test-%s", rand.String(8))} + + // Create a ClusterCatalog matching the extension's selector + catalog := &ocv1.ClusterCatalog{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-catalog", + }, + Spec: ocv1.ClusterCatalogSpec{ + Source: ocv1.CatalogSource{ + Type: ocv1.SourceTypeImage, + Image: &ocv1.ImageSource{ + Ref: "test-registry/catalog:latest", + }, + }, + }, + } + require.NoError(t, cl.Create(ctx, catalog)) + + // Create ClusterExtension with no version specified + ext := &ocv1.ClusterExtension{ + ObjectMeta: metav1.ObjectMeta{Name: extKey.Name}, + Spec: ocv1.ClusterExtensionSpec{ + Source: ocv1.SourceConfig{ + SourceType: "Catalog", + Catalog: &ocv1.CatalogFilter{ + PackageName: "test-pkg", + // No version specified + }, + }, + Namespace: "default", + ServiceAccount: ocv1.ServiceAccountReference{Name: "default"}, + }, + } + require.NoError(t, cl.Create(ctx, ext)) + + // Reconcile should fail and RETRY (not fallback) + // Catalogs exist, so this is likely a transient issue (catalog updating, cache stale, etc.) + res, err := reconciler.Reconcile(ctx, ctrl.Request{NamespacedName: extKey}) + require.Error(t, err) + require.Equal(t, ctrl.Result{}, res) + + // Verify status shows Retrying (not falling back to installed bundle) + require.NoError(t, cl.Get(ctx, extKey, ext)) + + progCond := apimeta.FindStatusCondition(ext.Status.Conditions, ocv1.TypeProgressing) + require.NotNil(t, progCond) + require.Equal(t, metav1.ConditionTrue, progCond.Status) + require.Equal(t, ocv1.ReasonRetrying, progCond.Reason) + require.Contains(t, progCond.Message, "transient catalog issue") + + // Installed should remain True (existing installation is maintained) + instCond := apimeta.FindStatusCondition(ext.Status.Conditions, ocv1.TypeInstalled) + require.NotNil(t, instCond) + require.Equal(t, metav1.ConditionTrue, instCond.Status) + + // Verify we did NOT fall back - status should show we're retrying + verifyInvariants(ctx, t, cl, ext) + + // Clean up the catalog so it doesn't affect other tests + require.NoError(t, cl.Delete(ctx, catalog)) + }) +} + +func TestCheckCatalogsExist(t *testing.T) { + t.Run("returns false when no catalogs exist", func(t *testing.T) { + cl := newClient(t) + ctx := context.Background() + + ext := &ocv1.ClusterExtension{ + Spec: ocv1.ClusterExtensionSpec{ + Source: ocv1.SourceConfig{ + SourceType: "Catalog", + Catalog: &ocv1.CatalogFilter{ + PackageName: "test-pkg", + }, + }, + }, + } + + exists, err := controllers.CheckCatalogsExist(ctx, cl, ext) + require.NoError(t, err) + require.False(t, exists, "should return false when no catalogs exist") + }) + + t.Run("returns false when no selector provided", func(t *testing.T) { + cl := newClient(t) + ctx := context.Background() + + ext := &ocv1.ClusterExtension{ + Spec: ocv1.ClusterExtensionSpec{ + Source: ocv1.SourceConfig{ + SourceType: "Catalog", + Catalog: &ocv1.CatalogFilter{ + PackageName: "test-pkg", + Selector: nil, // No selector + }, + }, + }, + } + + exists, err := controllers.CheckCatalogsExist(ctx, cl, ext) + require.NoError(t, err) + require.False(t, exists, "should return false when no catalogs exist (no selector)") + }) + + t.Run("returns false when empty selector provided", func(t *testing.T) { + cl := newClient(t) + ctx := context.Background() + + ext := &ocv1.ClusterExtension{ + Spec: ocv1.ClusterExtensionSpec{ + Source: ocv1.SourceConfig{ + SourceType: "Catalog", + Catalog: &ocv1.CatalogFilter{ + PackageName: "test-pkg", + Selector: &metav1.LabelSelector{}, // Empty selector (matches everything) + }, + }, + }, + } + + exists, err := controllers.CheckCatalogsExist(ctx, cl, ext) + require.NoError(t, err, "empty selector should not cause error") + require.False(t, exists, "should return false when no catalogs exist (empty selector)") + }) + + t.Run("returns error for invalid selector", func(t *testing.T) { + cl := newClient(t) + ctx := context.Background() + + ext := &ocv1.ClusterExtension{ + Spec: ocv1.ClusterExtensionSpec{ + Source: ocv1.SourceConfig{ + SourceType: "Catalog", + Catalog: &ocv1.CatalogFilter{ + PackageName: "test-pkg", + Selector: &metav1.LabelSelector{ + MatchExpressions: []metav1.LabelSelectorRequirement{ + { + Key: "invalid", + Operator: "InvalidOperator", // Invalid operator + Values: []string{"value"}, + }, + }, + }, + }, + }, + }, + } + + exists, err := controllers.CheckCatalogsExist(ctx, cl, ext) + require.Error(t, err, "should return error for invalid selector") + require.Contains(t, err.Error(), "invalid catalog selector") + require.False(t, exists) + }) +} diff --git a/internal/operator-controller/controllers/clusterextension_reconcile_steps.go b/internal/operator-controller/controllers/clusterextension_reconcile_steps.go index 09dd2ce7d..1e5996414 100644 --- a/internal/operator-controller/controllers/clusterextension_reconcile_steps.go +++ b/internal/operator-controller/controllers/clusterextension_reconcile_steps.go @@ -19,10 +19,12 @@ package controllers import ( "context" "errors" + "fmt" apimeta "k8s.io/apimachinery/pkg/api/meta" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/finalizer" "sigs.k8s.io/controller-runtime/pkg/log" @@ -83,7 +85,14 @@ func RetrieveRevisionStates(r RevisionStatesGetter) ReconcileStepFunc { } } -func ResolveBundle(r resolve.Resolver) ReconcileStepFunc { +// ResolveBundle resolves the bundle to install or roll out for a ClusterExtension. +// It requires a controller-runtime client (in addition to the resolve.Resolver) to enable +// intelligent error handling when resolution fails. The client is used to check if ClusterCatalogs +// matching the extension's selector still exist in the cluster, allowing the controller to +// distinguish between "ClusterCatalog deleted" (fall back to installed bundle) and "transient failure" +// (retry resolution). This ensures workload resilience during ClusterCatalog outages while maintaining +// responsiveness during ClusterCatalog updates. +func ResolveBundle(r resolve.Resolver, c client.Client) ReconcileStepFunc { return func(ctx context.Context, state *reconcileState, ext *ocv1.ClusterExtension) (*ctrl.Result, error) { l := log.FromContext(ctx) var resolvedRevisionMetadata *RevisionMetadata @@ -95,11 +104,7 @@ func ResolveBundle(r resolve.Resolver) ReconcileStepFunc { } resolvedBundle, resolvedBundleVersion, resolvedDeprecation, err := r.Resolve(ctx, ext, bm) if err != nil { - // Note: We don't distinguish between resolution-specific errors and generic errors - setStatusProgressing(ext, err) - setInstalledStatusFromRevisionStates(ext, state.revisionStates) - ensureAllConditionsWithReason(ext, ocv1.ReasonFailed, err.Error()) - return nil, err + return handleResolutionError(ctx, c, state, ext, err) } // set deprecation status after _successful_ resolution @@ -134,19 +139,205 @@ func ResolveBundle(r resolve.Resolver) ReconcileStepFunc { } } +// handleResolutionError handles the case when bundle resolution fails. +// +// Decision logic (evaluated in order): +// 1. No installed bundle → Retry (cannot proceed without any bundle) +// 2. Version change requested → Retry (cannot upgrade without catalog) +// 3. Cannot check catalog existence → Retry (API error, cannot safely decide) +// 4. Catalogs exist → Retry (transient error, catalog may be updating) +// 5. Catalogs deleted → Fallback to installed bundle (maintain current state) +// +// When falling back (case 5), we set the resolved bundle to the installed bundle and return +// no error, allowing the Apply step to run and maintain resources using the existing installation. +// The controller watches ClusterCatalog resources, so reconciliation will automatically resume +// when catalogs return, enabling upgrades. +func handleResolutionError(ctx context.Context, c client.Client, state *reconcileState, ext *ocv1.ClusterExtension, err error) (*ctrl.Result, error) { + l := log.FromContext(ctx) + + // No installed bundle and resolution failed - cannot proceed + if state.revisionStates.Installed == nil { + msg := fmt.Sprintf("failed to resolve bundle: %v", err) + setStatusProgressing(ext, err) + setInstalledStatusFromRevisionStates(ext, state.revisionStates) + ensureAllConditionsWithReason(ext, ocv1.ReasonRetrying, msg) + return nil, err + } + + // Check if the spec is requesting a specific version that differs from installed + specVersion := "" + if ext.Spec.Source.Catalog != nil { + specVersion = ext.Spec.Source.Catalog.Version + } + installedVersion := state.revisionStates.Installed.Version + + // If spec requests a different version, we cannot fall back - must fail and retry + if specVersion != "" && specVersion != installedVersion { + msg := fmt.Sprintf("unable to upgrade to version %s: %v (currently installed: %s)", specVersion, err, installedVersion) + l.Error(err, "resolution failed and spec requests version change - cannot fall back", + "requestedVersion", specVersion, + "installedVersion", installedVersion) + setStatusProgressing(ext, err) + setInstalledStatusFromRevisionStates(ext, state.revisionStates) + ensureAllConditionsWithReason(ext, ocv1.ReasonRetrying, msg) + return nil, err + } + + // No version change requested - check if ClusterCatalogs exist + // Only fall back if ClusterCatalogs have been deleted + catalogsExist, catalogCheckErr := CheckCatalogsExist(ctx, c, ext) + if catalogCheckErr != nil { + msg := fmt.Sprintf("failed to resolve bundle: %v", err) + var catalogName string + if ext.Spec.Source.Catalog != nil { + catalogName = getCatalogNameFromSelector(ext.Spec.Source.Catalog.Selector) + } + l.Error(catalogCheckErr, "error checking if ClusterCatalogs exist, will retry resolution", + "resolutionError", err, + "packageName", getPackageName(ext), + "catalogName", catalogName) + setStatusProgressing(ext, err) + setInstalledStatusFromRevisionStates(ext, state.revisionStates) + ensureAllConditionsWithReason(ext, ocv1.ReasonRetrying, msg) + return nil, err + } + + if catalogsExist { + // ClusterCatalogs exist but resolution failed - likely a transient issue (ClusterCatalog updating, cache stale, etc.) + // Retry resolution instead of falling back + msg := fmt.Sprintf("failed to resolve bundle, retrying: %v", err) + var catalogName string + if ext.Spec.Source.Catalog != nil { + catalogName = getCatalogNameFromSelector(ext.Spec.Source.Catalog.Selector) + } + l.Error(err, "resolution failed but matching ClusterCatalogs exist - retrying instead of falling back", + "packageName", getPackageName(ext), + "catalogName", catalogName) + setStatusProgressing(ext, err) + setInstalledStatusFromRevisionStates(ext, state.revisionStates) + ensureAllConditionsWithReason(ext, ocv1.ReasonRetrying, msg) + return nil, err + } + + // ClusterCatalogs don't exist (deleted) - fall back to installed bundle to maintain current state. + // The controller watches ClusterCatalog resources, so when ClusterCatalogs become available again, + // a reconcile will be triggered automatically, allowing the extension to upgrade. + var catalogName string + if ext.Spec.Source.Catalog != nil { + catalogName = getCatalogNameFromSelector(ext.Spec.Source.Catalog.Selector) + } + l.Info("matching ClusterCatalogs unavailable or deleted - falling back to installed bundle to maintain workload", + "resolutionError", err.Error(), + "packageName", getPackageName(ext), + "catalogName", catalogName, + "installedBundle", state.revisionStates.Installed.Name, + "installedVersion", state.revisionStates.Installed.Version) + // Set installed status based on current revision states (needed before Apply runs) + setInstalledStatusFromRevisionStates(ext, state.revisionStates) + state.resolvedRevisionMetadata = state.revisionStates.Installed + // Return no error to allow Apply step to run and maintain resources. + // Apply will set Progressing=Succeeded when it completes successfully. + return nil, nil +} + +// getCatalogNameFromSelector extracts the catalog name from the selector if available. +// Returns empty string if selector is nil or doesn't contain the metadata.name label. +func getCatalogNameFromSelector(selector *metav1.LabelSelector) string { + if selector == nil || selector.MatchLabels == nil { + return "" + } + return selector.MatchLabels["olm.operatorframework.io/metadata.name"] +} + +// getPackageName safely extracts the package name from the extension spec. +// Returns empty string if Catalog source is nil. +func getPackageName(ext *ocv1.ClusterExtension) string { + if ext.Spec.Source.Catalog == nil { + return "" + } + return ext.Spec.Source.Catalog.PackageName +} + +// CheckCatalogsExist checks if any ClusterCatalogs matching the extension's selector exist. +// Returns true if at least one matching ClusterCatalog exists, false if none exist. +// Treats "CRD doesn't exist" errors as "no ClusterCatalogs exist" (returns false, nil). +// Returns an error only if the check itself fails unexpectedly. +func CheckCatalogsExist(ctx context.Context, c client.Client, ext *ocv1.ClusterExtension) (bool, error) { + var catalogList *ocv1.ClusterCatalogList + var listErr error + + if ext.Spec.Source.Catalog == nil || ext.Spec.Source.Catalog.Selector == nil { + // No selector means all ClusterCatalogs match - check if any ClusterCatalogs exist at all + catalogList = &ocv1.ClusterCatalogList{} + listErr = c.List(ctx, catalogList, client.Limit(1)) + } else { + // Convert label selector to k8slabels.Selector + // Note: An empty LabelSelector matches everything by default + selector, err := metav1.LabelSelectorAsSelector(ext.Spec.Source.Catalog.Selector) + if err != nil { + return false, fmt.Errorf("invalid catalog selector: %w", err) + } + + // List ClusterCatalogs matching the selector (limit to 1 since we only care if any exist) + catalogList = &ocv1.ClusterCatalogList{} + listErr = c.List(ctx, catalogList, client.MatchingLabelsSelector{Selector: selector}, client.Limit(1)) + } + + if listErr != nil { + // Check if the error is because the ClusterCatalog CRD doesn't exist + // This can happen if catalogd is not installed, which means no ClusterCatalogs exist + if apimeta.IsNoMatchError(listErr) { + return false, nil + } + return false, fmt.Errorf("failed to list ClusterCatalogs: %w", listErr) + } + + return len(catalogList.Items) > 0, nil +} + func UnpackBundle(i imageutil.Puller, cache imageutil.Cache) ReconcileStepFunc { return func(ctx context.Context, state *reconcileState, ext *ocv1.ClusterExtension) (*ctrl.Result, error) { l := log.FromContext(ctx) - l.Info("unpacking resolved bundle") + + // Defensive check: resolvedRevisionMetadata should be set by ResolveBundle step + if state.resolvedRevisionMetadata == nil { + return nil, fmt.Errorf("unable to retrieve bundle information") + } + + // Always try to pull the bundle content (Pull uses cache-first strategy, so this is efficient) + l.V(1).Info("pulling bundle content") imageFS, _, _, err := i.Pull(ctx, ext.GetName(), state.resolvedRevisionMetadata.Image, cache) + + // Check if resolved bundle matches installed bundle (no version change) + bundleUnchanged := state.revisionStates != nil && + state.revisionStates.Installed != nil && + state.resolvedRevisionMetadata.Name == state.revisionStates.Installed.Name && + state.resolvedRevisionMetadata.Version == state.revisionStates.Installed.Version + if err != nil { - // Wrap the error passed to this with the resolution information until we have successfully - // installed since we intend for the progressing condition to replace the resolved condition - // and will be removing the .status.resolution field from the ClusterExtension status API + if bundleUnchanged { + // Bundle hasn't changed and Pull failed (likely cache miss + catalog unavailable). + // This happens in fallback mode after catalog deletion. Set imageFS to nil so the + // applier can maintain the workload using existing Helm release or ClusterExtensionRevision. + l.V(1).Info("bundle content unavailable but version unchanged, maintaining current installation", + "bundle", state.resolvedRevisionMetadata.Name, + "version", state.resolvedRevisionMetadata.Version, + "error", err.Error()) + state.imageFS = nil + return nil, nil + } + // New bundle version but Pull failed - this is an error condition setStatusProgressing(ext, wrapErrorWithResolutionInfo(state.resolvedRevisionMetadata.BundleMetadata, err)) setInstalledStatusFromRevisionStates(ext, state.revisionStates) return nil, err } + + if bundleUnchanged { + l.V(1).Info("bundle unchanged, using cached content for resource reconciliation", + "bundle", state.resolvedRevisionMetadata.Name, + "version", state.resolvedRevisionMetadata.Version) + } + state.imageFS = imageFS return nil, nil } diff --git a/internal/operator-controller/controllers/suite_test.go b/internal/operator-controller/controllers/suite_test.go index 6258a5d30..57305a75f 100644 --- a/internal/operator-controller/controllers/suite_test.go +++ b/internal/operator-controller/controllers/suite_test.go @@ -106,7 +106,7 @@ func newClientAndReconciler(t *testing.T, opts ...reconcilerOption) (client.Clie } reconciler.ReconcileSteps = []controllers.ReconcileStepFunc{controllers.HandleFinalizers(d.Finalizers), controllers.RetrieveRevisionStates(d.RevisionStatesGetter)} if r := d.Resolver; r != nil { - reconciler.ReconcileSteps = append(reconciler.ReconcileSteps, controllers.ResolveBundle(r)) + reconciler.ReconcileSteps = append(reconciler.ReconcileSteps, controllers.ResolveBundle(r, cl)) } if i := d.ImagePuller; i != nil { reconciler.ReconcileSteps = append(reconciler.ReconcileSteps, controllers.UnpackBundle(i, d.ImageCache)) diff --git a/test/utils.go b/test/utils.go index 22a50b2b8..60cfaa38a 100644 --- a/test/utils.go +++ b/test/utils.go @@ -14,6 +14,7 @@ func NewEnv() *envtest.Environment { testEnv := &envtest.Environment{ CRDDirectoryPaths: []string{ pathFromProjectRoot("helm/olmv1/base/operator-controller/crd/experimental"), + pathFromProjectRoot("helm/olmv1/base/catalogd/crd/experimental"), }, ErrorIfCRDPathMissing: true, }