Skip to content

Commit

Permalink
Update controller logic when some failure occurs
Browse files Browse the repository at this point in the history
An infinite loop may occur on certain misconfigurations, e.g., if the Github/OpenStack client is misconfigured. In such cases, the operator correctly sets a condition that can be used for the user to debug. However, it also returns an error, which means that the operator reconciles immediately again.

There is no need to return the error, as the user checks the conditions stored in the status of the objectrather than potential error logs. The controller's default syncPeriod, set to 5 minutes, ensures that the controller will try again later to check whether the failed state has been resolved.

See related SovereignCloudStack/cluster-stack-operator#50

Signed-off-by: Matej Feder <matej.feder@dnation.cloud>
  • Loading branch information
matofeder committed Jan 15, 2024
1 parent eb89331 commit 4ead3ef
Show file tree
Hide file tree
Showing 3 changed files with 55 additions and 38 deletions.
19 changes: 13 additions & 6 deletions cmd/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ package main
import (
"flag"
"os"
"time"

githubclient "github.com/SovereignCloudStack/cluster-stack-operator/pkg/github/client"
infrastructureclusterstackxk8siov1alpha1 "github.com/sovereignCloudStack/cluster-stack-provider-openstack/api/v1alpha1"
Expand All @@ -31,6 +32,7 @@ import (
// to ensure that exec-entrypoint and run can make use of them.
_ "k8s.io/client-go/plugin/pkg/client/auth"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/cache"
"sigs.k8s.io/controller-runtime/pkg/healthz"
"sigs.k8s.io/controller-runtime/pkg/log/zap"
metricsserver "sigs.k8s.io/controller-runtime/pkg/metrics/server"
Expand All @@ -49,8 +51,8 @@ func init() {
}

var (
releaseDir string
waitForImageBecomeActiveMinutes int
releaseDir string
imageImportTimeout int
)

func main() {
Expand All @@ -63,7 +65,7 @@ func main() {
"Enable leader election for controller manager. "+
"Enabling this will ensure there is only one active controller manager.")
flag.StringVar(&releaseDir, "release-dir", "/tmp/downloads/", "Specify release directory for cluster-stack releases")
flag.IntVar(&waitForImageBecomeActiveMinutes, "import-timeout", 0, "Maximum time in minutes that you allow cspo to import image. If import-timeout <= 0, cspo waits forever.")
flag.IntVar(&imageImportTimeout, "image-import-timeout", 0, "Maximum time in minutes that you allow cspo to import image. If image-import-timeout <= 0, cspo waits forever.")
opts := zap.Options{
Development: true,
}
Expand All @@ -72,6 +74,8 @@ func main() {

ctrl.SetLogger(zap.New(zap.UseFlagOptions(&opts)))

syncPeriod := 5 * time.Minute

mgr, err := ctrl.NewManager(ctrl.GetConfigOrDie(), ctrl.Options{
Scheme: scheme,
Metrics: metricsserver.Options{BindAddress: metricsAddr},
Expand All @@ -89,6 +93,9 @@ func main() {
// if you are doing or is intended to do any operation such as perform cleanups
// after the manager stops then its usage might be unsafe.
// LeaderElectionReleaseOnCancel: true,
Cache: cache.Options{
SyncPeriod: &syncPeriod,
},
})
if err != nil {
setupLog.Error(err, "unable to start manager")
Expand All @@ -107,9 +114,9 @@ func main() {
os.Exit(1)
}
if err = (&controller.OpenStackNodeImageReleaseReconciler{
Client: mgr.GetClient(),
Scheme: mgr.GetScheme(),
WaitForImageBecomeActiveMinutes: waitForImageBecomeActiveMinutes,
Client: mgr.GetClient(),
Scheme: mgr.GetScheme(),
ImageImportTimeout: imageImportTimeout,
}).SetupWithManager(mgr); err != nil {
setupLog.Error(err, "unable to create controller", "controller", "OpenStackNodeImageRelease")
os.Exit(1)
Expand Down
18 changes: 7 additions & 11 deletions internal/controller/openstackclusterstackrelease_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,6 @@ const (
metadataFileName = "metadata.yaml"
nodeImagesFileName = "node-images.yaml"
waitForOpenStackNodeImageReleasesBecomeReady = 30 * time.Second
reconcileOpenStackNodeImageReleases = 3 * time.Minute
)

//+kubebuilder:rbac:groups=infrastructure.clusterstack.x-k8s.io,resources=openstackclusterstackreleases,verbs=get;list;watch;create;update;patch;delete
Expand Down Expand Up @@ -113,7 +112,8 @@ func (r *OpenStackClusterStackReleaseReconciler) Reconcile(ctx context.Context,
err.Error(),
)
record.Warnf(openstackclusterstackrelease, "GitTokenOrEnvVariableNotSet", err.Error())
return ctrl.Result{}, fmt.Errorf("failed to create Github client: %w", err)
logger.Error(err, "failed to create Github client")
return ctrl.Result{}, nil
}

conditions.MarkTrue(openstackclusterstackrelease, apiv1alpha1.GitAPIAvailableCondition)
Expand All @@ -140,7 +140,7 @@ func (r *OpenStackClusterStackReleaseReconciler) Reconcile(ctx context.Context,
r.openStackClusterStackRelDownloadDirectoryMutex.Lock()

if err := downloadReleaseAssets(ctx, releaseTag, releaseAssets.LocalDownloadPath, gc); err != nil {
return ctrl.Result{}, fmt.Errorf("failed to download release assets: %w", err)
return ctrl.Result{RequeueAfter: 1 * time.Minute}, fmt.Errorf("failed to download release assets: %w", err)
}

r.openStackClusterStackRelDownloadDirectoryMutex.Unlock()
Expand Down Expand Up @@ -170,7 +170,7 @@ func (r *OpenStackClusterStackReleaseReconciler) Reconcile(ctx context.Context,

osnirName := fmt.Sprintf("%s-%s-%s", nameWithoutVersion, openStackNodeImage.CreateOpts.Name, nodeImageVersion)

if err := r.getOrCreateOpenStackNodeImageRelease(ctx, openstackclusterstackrelease, osnirName, openStackNodeImage, ownerRef); err != nil {
if err := r.createOrUpdateOpenStackNodeImageRelease(ctx, openstackclusterstackrelease, osnirName, openStackNodeImage, ownerRef); err != nil {
return ctrl.Result{}, fmt.Errorf("failed to get or create OpenStackNodeImageRelease %s/%s: %w", openstackclusterstackrelease.Namespace, osnirName, err)
}
}
Expand Down Expand Up @@ -209,11 +209,10 @@ func (r *OpenStackClusterStackReleaseReconciler) Reconcile(ctx context.Context,
conditions.MarkTrue(openstackclusterstackrelease, apiv1alpha1.OpenStackNodeImageReleasesReadyCondition)
openstackclusterstackrelease.Status.Ready = true

// Requeue to ensure the OpenStackNodeImageReleases are still ready
return ctrl.Result{Requeue: true, RequeueAfter: reconcileOpenStackNodeImageReleases}, nil
return ctrl.Result{}, nil
}

func (r *OpenStackClusterStackReleaseReconciler) getOrCreateOpenStackNodeImageRelease(ctx context.Context, openstackclusterstackrelease *apiv1alpha1.OpenStackClusterStackRelease, osnirName string, openStackNodeImage *apiv1alpha1.OpenStackNodeImage, ownerRef *metav1.OwnerReference) error {
func (r *OpenStackClusterStackReleaseReconciler) createOrUpdateOpenStackNodeImageRelease(ctx context.Context, openstackclusterstackrelease *apiv1alpha1.OpenStackClusterStackRelease, osnirName string, openStackNodeImage *apiv1alpha1.OpenStackNodeImage, ownerRef *metav1.OwnerReference) error {
openStackNodeImageRelease := &apiv1alpha1.OpenStackNodeImageRelease{}

err := r.Get(ctx, types.NamespacedName{Name: osnirName, Namespace: openstackclusterstackrelease.Namespace}, openStackNodeImageRelease)
Expand Down Expand Up @@ -252,10 +251,7 @@ func (r *OpenStackClusterStackReleaseReconciler) getOrCreateOpenStackNodeImageRe
openStackNodeImageRelease.Spec.IdentityRef = openstackclusterstackrelease.Spec.IdentityRef

if err := r.Create(ctx, openStackNodeImageRelease); err != nil {
record.Eventf(openStackNodeImageRelease,
"ErrorOpenStackNodeImageRelease",
"failed to create %s OpenStackNodeImageRelease: %s", osnirName, err.Error(),
)
record.Warnf(openStackNodeImageRelease, "ErrorOpenStackNodeImageRelease", err.Error())
return fmt.Errorf("failed to create OpenStackNodeImageRelease: %w", err)
}

Expand Down
56 changes: 35 additions & 21 deletions internal/controller/openstacknodeimagerelease_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -45,14 +45,13 @@ import (
// OpenStackNodeImageReleaseReconciler reconciles a OpenStackNodeImageRelease object.
type OpenStackNodeImageReleaseReconciler struct {
client.Client
Scheme *runtime.Scheme
WaitForImageBecomeActiveMinutes int
Scheme *runtime.Scheme
ImageImportTimeout int
}

const (
cloudsSecretKey = "clouds.yaml"
waitForImageBecomeActive = 30 * time.Second
reconcileImage = 3 * time.Minute
)

//+kubebuilder:rbac:groups=infrastructure.clusterstack.x-k8s.io,resources=openstacknodeimagereleases,verbs=get;list;watch;create;update;patch;delete
Expand Down Expand Up @@ -113,7 +112,9 @@ func (r *OpenStackNodeImageReleaseReconciler) Reconcile(ctx context.Context, req
opts := &clientconfig.ClientOpts{AuthInfo: cloud.AuthInfo}
providerClient, err := clientconfig.AuthenticatedClient(opts)
if err != nil {
return ctrl.Result{}, fmt.Errorf("failed to create a provider client: %w", err)
record.Warnf(openstacknodeimagerelease, "OpenStackProviderClientNotSet", err.Error())
logger.Error(err, "failed to create a provider client")
return ctrl.Result{}, nil
}

// Create an OpenStack image service client
Expand All @@ -126,7 +127,8 @@ func (r *OpenStackNodeImageReleaseReconciler) Reconcile(ctx context.Context, req
err.Error(),
)
record.Warnf(openstacknodeimagerelease, "OpenStackImageServiceClientNotSet", err.Error())
return ctrl.Result{}, fmt.Errorf("failed to create an image client: %w", err)
logger.Error(err, "failed to create an image client")
return ctrl.Result{}, nil
}

conditions.MarkTrue(openstacknodeimagerelease, apiv1alpha1.OpenStackImageServiceClientAvailableCondition)
Expand All @@ -139,7 +141,9 @@ func (r *OpenStackNodeImageReleaseReconciler) Reconcile(ctx context.Context, req
clusterv1beta1.ConditionSeverityError,
err.Error(),
)
return ctrl.Result{}, fmt.Errorf("failed to find an image: %w", err)
record.Warnf(openstacknodeimagerelease, "OpenStackImageFailedToFind", err.Error())
logger.Error(err, "failed to find an image")
return ctrl.Result{}, nil
}

if imageID == "" {
Expand All @@ -156,7 +160,9 @@ func (r *OpenStackNodeImageReleaseReconciler) Reconcile(ctx context.Context, req
clusterv1beta1.ConditionSeverityError,
err.Error(),
)
return ctrl.Result{}, fmt.Errorf("failed to create an image: %w", err)
record.Warnf(openstacknodeimagerelease, "OpenStackImageFailedToCreate", err.Error())
logger.Error(err, "failed to create an image")
return ctrl.Result{}, nil
}

imageImportOpts := imageimport.CreateOpts{
Expand All @@ -171,7 +177,9 @@ func (r *OpenStackNodeImageReleaseReconciler) Reconcile(ctx context.Context, req
clusterv1beta1.ConditionSeverityError,
err.Error(),
)
return ctrl.Result{}, fmt.Errorf("failed to import an image: %w", err)
record.Warnf(openstacknodeimagerelease, "OpenStackImageFailedToImport", err.Error())
logger.Error(err, "failed to import an image")
return ctrl.Result{}, nil
}

conditions.MarkTrue(openstacknodeimagerelease, apiv1alpha1.OpenStackImageImportStartedCondition)
Expand All @@ -188,27 +196,30 @@ func (r *OpenStackNodeImageReleaseReconciler) Reconcile(ctx context.Context, req
clusterv1beta1.ConditionSeverityError,
err.Error(),
)
return ctrl.Result{}, fmt.Errorf("failed to get an image: %w", err)
record.Warnf(openstacknodeimagerelease, "OpenStackImageFailedToGet", err.Error())
logger.Error(err, "failed to get an image")
return ctrl.Result{}, nil
}

// Check wait for image ACTIVE status duration
if r.WaitForImageBecomeActiveMinutes > 0 && conditions.IsTrue(openstacknodeimagerelease, apiv1alpha1.OpenStackImageImportStartedCondition) {
// Calculate elapsed time since the OpenStackImageImportStartedCondition is true
// Ensure that the import does not exceed the timeout duration
if r.ImageImportTimeout > 0 && conditions.IsTrue(openstacknodeimagerelease, apiv1alpha1.OpenStackImageImportStartedCondition) {
// Calculate elapsed time since the OpenStackImageImportStartCondition is true
startTime := conditions.GetLastTransitionTime(openstacknodeimagerelease, apiv1alpha1.OpenStackImageImportStartedCondition)
elapsedTime := time.Since(startTime.Time)

waitForImageBecomeActiveTimeout := time.Duration(r.WaitForImageBecomeActiveMinutes) * time.Minute
imageImportTimeout := time.Duration(r.ImageImportTimeout) * time.Minute

// Check if the image has been active after waitForImageBecomeActiveTimeout minutes
if image.Status != images.ImageStatusActive && elapsedTime > waitForImageBecomeActiveTimeout {
err = fmt.Errorf("timeout - wait for the image %s to transition to the ACTIVE status exceeds the timeout duration %d minutes", image.Name, r.WaitForImageBecomeActiveMinutes)
logger.Error(err, "Timeout duration exceeded")
if image.Status != images.ImageStatusActive && elapsedTime > imageImportTimeout {
err = fmt.Errorf("timeout - wait for the image %s to transition to the ACTIVE status exceeds the timeout duration %d minutes", image.Name, r.ImageImportTimeout)
conditions.MarkFalse(openstacknodeimagerelease,
apiv1alpha1.OpenStackImageReadyCondition,
apiv1alpha1.OpenStackImageImportTimeOutReason,
clusterv1beta1.ConditionSeverityError,
err.Error(),
)
record.Warnf(openstacknodeimagerelease, "OpenStackImageImportTimeout", err.Error())
logger.Error(err, "timeout - import duration exceeded")
// Image import timeout - nothing to do
return ctrl.Result{}, nil
}
Expand All @@ -233,7 +244,9 @@ func (r *OpenStackNodeImageReleaseReconciler) Reconcile(ctx context.Context, req
err.Error(),
)
openstacknodeimagerelease.Status.Ready = false
return ctrl.Result{}, err
record.Warnf(openstacknodeimagerelease, "OpenStackImageStatusUnexpected", err.Error())
logger.Error(err, "unexpected image status")
return ctrl.Result{}, nil

case images.ImageStatusQueued, images.ImageStatusSaving, images.ImageStatusDeleted, images.ImageStatusPendingDelete, images.ImageStatusImporting:
// The other statuses are expected. See the explanation below:
Expand All @@ -242,7 +255,7 @@ func (r *OpenStackNodeImageReleaseReconciler) Reconcile(ctx context.Context, req
logger.Info("OpenStackNodeImageRelease **not ready** yet - waiting for image to become ACTIVE", "name", openstacknodeimagerelease.Spec.Image.CreateOpts.Name, "ID", imageID, "status", image.Status)
conditions.MarkFalse(openstacknodeimagerelease, apiv1alpha1.OpenStackImageReadyCondition, apiv1alpha1.OpenStackImageNotImportedYetReason, clusterv1beta1.ConditionSeverityInfo, "waiting for image to become ACTIVE")
openstacknodeimagerelease.Status.Ready = false
// Wait for image
// Wait for image to become active
return ctrl.Result{RequeueAfter: waitForImageBecomeActive}, nil

default:
Expand All @@ -255,11 +268,12 @@ func (r *OpenStackNodeImageReleaseReconciler) Reconcile(ctx context.Context, req
err.Error(),
)
openstacknodeimagerelease.Status.Ready = false
return ctrl.Result{}, err
record.Warnf(openstacknodeimagerelease, "OpenStackImageStatusUnknown", err.Error())
logger.Error(err, "unknown image status")
return ctrl.Result{}, nil
}

// Requeue to ensure the image's presence
return ctrl.Result{Requeue: true, RequeueAfter: reconcileImage}, nil
return ctrl.Result{}, nil
}

func (r *OpenStackNodeImageReleaseReconciler) getCloudFromSecret(ctx context.Context, secretNamespace, secretName, cloudName string) (clientconfig.Cloud, error) {
Expand Down

0 comments on commit 4ead3ef

Please sign in to comment.