Skip to content

Commit

Permalink
handle sigterm when stateroot job is in progress
Browse files Browse the repository at this point in the history
  • Loading branch information
pixelsoccupied committed May 13, 2024
1 parent 3592ad0 commit 5e2b8d2
Show file tree
Hide file tree
Showing 5 changed files with 130 additions and 60 deletions.
14 changes: 8 additions & 6 deletions controllers/idle_handlers.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,10 +21,11 @@ import (
"fmt"
"os"

"github.com/openshift-kni/lifecycle-agent/internal/prep"

"github.com/go-logr/logr"
"github.com/openshift-kni/lifecycle-agent/internal/extramanifest"
"github.com/openshift-kni/lifecycle-agent/internal/ostreeclient"
"github.com/openshift-kni/lifecycle-agent/internal/prep"
"github.com/openshift-kni/lifecycle-agent/lca-cli/ops"
rpmostreeclient "github.com/openshift-kni/lifecycle-agent/lca-cli/ostreeclient"

Expand Down Expand Up @@ -150,22 +151,23 @@ func (r *ImageBasedUpgradeReconciler) cleanup(ctx context.Context, ibu *lcav1alp
r.Log.Error(err, msg)
errorMessage += err.Error() + " "
}

r.Log.Info("Cleaning up stateroot")
if err := CleanupUnbootedStateroots(r.Log, r.Ops, r.OstreeClient, r.RPMOstreeClient); err != nil {
handleError(err, "failed to cleanup stateroots.")
}
r.Log.Info("Cleaning up stateroot setup job")
err := prep.DeleteStaterootSetupJob(ctx, r.Client, r.Log)
if err != nil {
handleError(err, "failed to cleanup stateroots setup job.")
}

r.Log.Info("Cleaning up stateroot")
if err := CleanupUnbootedStateroots(r.Log, r.Ops, r.OstreeClient, r.RPMOstreeClient); err != nil {
handleError(err, "failed to cleanup stateroots.")
}

r.Log.Info("Cleaning up precache")
if err := r.Precache.Cleanup(ctx); err != nil {
handleError(err, "failed to cleanup precaching resources.")
}

r.Log.Info("Removing annotation with warning")
if err := extramanifest.RemoveAnnotationWarnUnknownCRD(r.Client, ibu, r.Log); err != nil {
handleError(err, "failed to remove extra manifest warning annotation from IBU")
}
Expand Down
8 changes: 4 additions & 4 deletions controllers/prep_handlers.go
Original file line number Diff line number Diff line change
Expand Up @@ -376,7 +376,7 @@ func (r *ImageBasedUpgradeReconciler) handlePrep(ctx context.Context, ibu *lcav1
if _, err := prep.LaunchStaterootSetupJob(ctx, r.Client, ibu, r.Scheme, r.Log); err != nil {
return prepFailDoNotRequeue(r.Log, fmt.Sprintf("failed launch stateroot job: %s", err.Error()), ibu)
}
return prepInProgressRequeue(r.Log, fmt.Sprintf("Successfully launched a new job `%s` in namespace `%s`", prep.JobName, common.LcaNamespace), ibu)
return prepInProgressRequeue(r.Log, fmt.Sprintf("Successfully launched a new job `%s` in namespace `%s`", prep.StaterootSetupJobName, common.LcaNamespace), ibu)
}
return prepFailDoNotRequeue(r.Log, fmt.Sprintf("failed to get stateroot setup job: %s", err.Error()), ibu)
}
Expand All @@ -385,7 +385,7 @@ func (r *ImageBasedUpgradeReconciler) handlePrep(ctx context.Context, ibu *lcav1

// job deletion not allowed
if staterootSetupJob.GetDeletionTimestamp() != nil {
return prepFailDoNotRequeue(r.Log, "stateroot job is marked to be deleted, this is not allowed", ibu)
return prepFailDoNotRequeue(r.Log, fmt.Sprintf("stateroot job is marked to be deleted, this is not allowed. please wait additional %d seconds before trying again. job_name: %s, job_ns: %s", prep.StaterootSetupTerminationGracePeriodSeconds, staterootSetupJob.GetName(), staterootSetupJob.GetNamespace()), ibu)
}

// check .status
Expand All @@ -395,7 +395,7 @@ func (r *ImageBasedUpgradeReconciler) handlePrep(ctx context.Context, ibu *lcav1
common.LogPodLogs(staterootSetupJob, r.Log, r.Clientset)
return prepInProgressRequeue(r.Log, "Stateroot setup in progress", ibu)
case kbatch.JobFailed:
return prepFailDoNotRequeue(r.Log, fmt.Sprintf("stateroot setup job could not complete. Please check job logs for more, job_name: %s, job_ns: %s", staterootSetupJob.GetName(), staterootSetupJob.GetNamespace()), ibu)
return prepFailDoNotRequeue(r.Log, fmt.Sprintf("stateroot setup job could not complete. please check job logs for more, job_name: %s, job_ns: %s", staterootSetupJob.GetName(), staterootSetupJob.GetNamespace()), ibu)
case kbatch.JobComplete:
r.Log.Info("Stateroot job completed successfully", "completion time", staterootSetupJob.Status.CompletionTime, "total time", staterootSetupJob.Status.CompletionTime.Sub(staterootSetupJob.Status.StartTime.Time))
}
Expand Down Expand Up @@ -427,7 +427,7 @@ func (r *ImageBasedUpgradeReconciler) handlePrep(ctx context.Context, ibu *lcav1
common.LogPodLogs(precacheJob, r.Log, r.Clientset) // pod logs
return prepInProgressRequeue(r.Log, fmt.Sprintf("Precache job in progress: %s", precache.GetPrecacheStatusFileContent()), ibu)
case kbatch.JobFailed:
return prepFailDoNotRequeue(r.Log, fmt.Sprintf("precache job could not complete. Please check job logs for more, job_name: %s, job_ns: %s", precacheJob.GetName(), precacheJob.GetNamespace()), ibu)
return prepFailDoNotRequeue(r.Log, fmt.Sprintf("precache job could not complete. please check job logs for more, job_name: %s, job_ns: %s", precacheJob.GetName(), precacheJob.GetNamespace()), ibu)
case kbatch.JobComplete:
r.Log.Info("Precache job completed successfully", "completion time", precacheJob.Status.CompletionTime, "total time", precacheJob.Status.CompletionTime.Sub(precacheJob.Status.StartTime.Time))
}
Expand Down
2 changes: 1 addition & 1 deletion internal/common/utils.go
Original file line number Diff line number Diff line change
Expand Up @@ -232,7 +232,7 @@ func IsJobFinished(job *kbatch.Job) (bool, kbatch.JobConditionType) {
}

func GenerateDeleteOptions() *client.DeleteOptions {
propagationPolicy := metav1.DeletePropagationBackground
propagationPolicy := metav1.DeletePropagationForeground // delete only when dependents are deleted

delOpt := client.DeleteOptions{
PropagationPolicy: &propagationPolicy,
Expand Down
61 changes: 46 additions & 15 deletions internal/prep/prep_stateroot_setup_job.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,9 @@ package prep
import (
"context"
"fmt"
"time"

"k8s.io/apimachinery/pkg/util/wait"

"github.com/go-logr/logr"
"github.com/openshift-kni/lifecycle-agent/api/v1alpha1"
Expand All @@ -20,13 +23,16 @@ import (
)

const (
JobName = "lca-prep-stateroot-setup"
jobFinalizer = "lca.openshift.io/stateroot-setup-finalizer"
StaterootSetupJobName = "lca-prep-stateroot-setup"
staterootSetupJobFinalizer = "lca.openshift.io/stateroot-setup-finalizer"
)

// StaterootSetupTerminationGracePeriodSeconds max time to wait before the stateroot job pod gets SIGKILL from k8s. Assuming the seed image is already in the system, the stateroot job should complete within this time.
var StaterootSetupTerminationGracePeriodSeconds int64 = 60

func GetStaterootSetupJob(ctx context.Context, c client.Client, log logr.Logger) (*batchv1.Job, error) {
job := &batchv1.Job{}
if err := c.Get(ctx, types.NamespacedName{Name: JobName, Namespace: common.LcaNamespace}, job); err != nil {
if err := c.Get(ctx, types.NamespacedName{Name: StaterootSetupJobName, Namespace: common.LcaNamespace}, job); err != nil {
return job, err //nolint:wrapcheck
}

Expand Down Expand Up @@ -65,10 +71,10 @@ func constructJobForStaterootSetup(ctx context.Context, c client.Client, ibu *v1
var backoffLimit int32 = 0
job := &batchv1.Job{
ObjectMeta: metav1.ObjectMeta{
Name: JobName,
Name: StaterootSetupJobName,
Namespace: common.LcaNamespace,
Annotations: map[string]string{
"app.kubernetes.io/name": JobName,
"app.kubernetes.io/name": StaterootSetupJobName,
},
},
Spec: batchv1.JobSpec{
Expand All @@ -82,7 +88,7 @@ func constructJobForStaterootSetup(ctx context.Context, c client.Client, ibu *v1
Spec: corev1.PodSpec{
Containers: []corev1.Container{
{
Name: JobName,
Name: StaterootSetupJobName,
Image: manager.Image,
ImagePullPolicy: manager.ImagePullPolicy,
Command: []string{"lca-cli", "ibu-stateroot-setup"},
Expand All @@ -93,10 +99,11 @@ func constructJobForStaterootSetup(ctx context.Context, c client.Client, ibu *v1
Resources: manager.Resources,
},
},
HostPID: lcaDeployment.Spec.Template.Spec.HostPID, // this is needed for rpmostree
ServiceAccountName: lcaDeployment.Spec.Template.Spec.ServiceAccountName,
RestartPolicy: corev1.RestartPolicyNever,
Volumes: lcaDeployment.Spec.Template.Spec.Volumes,
HostPID: lcaDeployment.Spec.Template.Spec.HostPID, // this is needed for rpmostree
ServiceAccountName: lcaDeployment.Spec.Template.Spec.ServiceAccountName,
RestartPolicy: corev1.RestartPolicyNever,
Volumes: lcaDeployment.Spec.Template.Spec.Volumes,
TerminationGracePeriodSeconds: &StaterootSetupTerminationGracePeriodSeconds,
},
},
},
Expand All @@ -108,7 +115,7 @@ func constructJobForStaterootSetup(ctx context.Context, c client.Client, ibu *v1
}

// set finalizer
controllerutil.AddFinalizer(job, jobFinalizer)
controllerutil.AddFinalizer(job, staterootSetupJobFinalizer)

log.Info("Done rendering a new job", "job", job.Name)
return job, nil
Expand All @@ -130,7 +137,7 @@ func DeleteStaterootSetupJob(ctx context.Context, c client.Client, log logr.Logg
}
stateroot := batchv1.Job{
ObjectMeta: metav1.ObjectMeta{
Name: JobName,
Name: StaterootSetupJobName,
Namespace: common.LcaNamespace,
},
}
Expand All @@ -140,9 +147,33 @@ func DeleteStaterootSetupJob(ctx context.Context, c client.Client, log logr.Logg
}
}

log.Info(fmt.Sprintf("Waiting up to additional %s to verify that job's pod no longer exists", time.Duration(StaterootSetupTerminationGracePeriodSeconds)*time.Second), "job", stateroot.GetName())
if err := waitUntilStaterootSetupPodIsRemoved(ctx, c); err != nil { // todo: in most cases we expect this to just go through very quickly...this is blocking call and can block up to StaterootSetupTerminationGracePeriodSeconds. Should look into using reconcile instead
return fmt.Errorf("failed to wait until stateroot setup job pod is removed: %w", err)
}

return nil
}

// waitUntilStaterootSetupPodIsRemoved blocks until the stateroot setup job's pod no longer
// exists. The delete client call is async, so we need to wait until the pod is completely
// removed to make sure the stateroot is cleaned up properly. Polls once per second, for up
// to StaterootSetupTerminationGracePeriodSeconds seconds, and returns the poll error (e.g.
// context deadline exceeded) if the pod is still present when the timeout expires.
func waitUntilStaterootSetupPodIsRemoved(ctx context.Context, c client.Client) error {
	return wait.PollUntilContextTimeout(ctx, time.Second, time.Duration(StaterootSetupTerminationGracePeriodSeconds)*time.Second, true, func(pollCtx context.Context) (bool, error) { //nolint:wrapcheck
		opts := []client.ListOption{
			client.InNamespace(common.LcaNamespace),
			client.MatchingLabels{"job-name": StaterootSetupJobName}, // pods created by a Job carry the job-name label
		}
		podList := &corev1.PodList{}
		// Use the poll-scoped context so the List call is bounded by the poll deadline,
		// rather than the outer ctx which may outlive it.
		if err := c.List(pollCtx, podList, opts...); err != nil {
			return false, fmt.Errorf("failed to list pods: %w", err)
		}

		// Done once no pods belonging to the stateroot setup job remain.
		return len(podList.Items) == 0, nil
	})
}

// removePrecacheFinalizer remove the finalizer if present
func removeStaterootSetupJobFinalizer(ctx context.Context, c client.Client, log logr.Logger) error {
staterootjob, err := GetStaterootSetupJob(ctx, c, log)
Expand All @@ -153,15 +184,15 @@ func removeStaterootSetupJobFinalizer(ctx context.Context, c client.Client, log
return fmt.Errorf("failed get precache job to remove finalizer: %w", err)
}

if controllerutil.ContainsFinalizer(staterootjob, jobFinalizer) {
finalizerRemoved := controllerutil.RemoveFinalizer(staterootjob, jobFinalizer)
if controllerutil.ContainsFinalizer(staterootjob, staterootSetupJobFinalizer) {
finalizerRemoved := controllerutil.RemoveFinalizer(staterootjob, staterootSetupJobFinalizer)
if finalizerRemoved {
if err := c.Update(ctx, staterootjob); err != nil {
return fmt.Errorf("failed to remove finalizer during update: %w", err)
}
}
}

log.Info("Removed stateroot setup finalizer", "finalizer", jobFinalizer)
log.Info("Removed stateroot setup finalizer", "finalizer", staterootSetupJobFinalizer)
return nil
}
105 changes: 71 additions & 34 deletions lca-cli/cmd/ibuStaterootSetup.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,19 +4,25 @@ import (
"context"
"fmt"
"os"
"os/signal"
"syscall"
"time"

"github.com/go-logr/logr"
"github.com/openshift-kni/lifecycle-agent/internal/ostreeclient"
"github.com/openshift-kni/lifecycle-agent/lca-cli/ops"
ostree "github.com/openshift-kni/lifecycle-agent/lca-cli/ostreeclient"

lcav1alpha1 "github.com/openshift-kni/lifecycle-agent/api/v1alpha1"
"github.com/openshift-kni/lifecycle-agent/controllers"
"github.com/openshift-kni/lifecycle-agent/controllers/utils"
"k8s.io/apimachinery/pkg/types"

"github.com/openshift-kni/lifecycle-agent/controllers"
"github.com/openshift-kni/lifecycle-agent/internal/common"
"github.com/openshift-kni/lifecycle-agent/internal/ostreeclient"
"github.com/openshift-kni/lifecycle-agent/internal/precache"
"github.com/openshift-kni/lifecycle-agent/internal/prep"
"github.com/openshift-kni/lifecycle-agent/internal/reboot"
"github.com/openshift-kni/lifecycle-agent/lca-cli/ops"
ostree "github.com/openshift-kni/lifecycle-agent/lca-cli/ostreeclient"
lcautils "github.com/openshift-kni/lifecycle-agent/utils"
"k8s.io/apimachinery/pkg/types"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/client/config"
Expand All @@ -25,51 +31,57 @@ import (
"github.com/spf13/cobra"
)

var (
// ibuStaterootSetupCmd represents the ibuStaterootSetup command
ibuStaterootSetupCmd = &cobra.Command{
Use: "ibuStaterootSetup",
Aliases: []string{"ibu-stateroot-setup"},
Short: "Setup a new stateroot during IBU",
Long: `Setup stateroot during IBU. This is meant to be used as k8s job!`,
Run: func(cmd *cobra.Command, args []string) {
if err := ibuStaterootSetupRun(); err != nil {
log.Error(err, "something went wrong!")
os.Exit(1)
}
},
}
)
// ibuStaterootSetupCmd represents the ibuStaterootSetup command
var ibuStaterootSetupCmd = &cobra.Command{
Use: "ibuStaterootSetup",
Aliases: []string{"ibu-stateroot-setup"},
Short: "Setup a new stateroot during IBU",
Long: `Setup stateroot during IBU. This is meant to be used as k8s job!`,
Run: func(cmd *cobra.Command, args []string) {
if err := ibuStaterootSetupRun(); err != nil {
log.Error(err)
os.Exit(1)
}
},
}

// init registers the stateroot setup subcommand on the lca-cli root command.
func init() {
	rootCmd.AddCommand(ibuStaterootSetupCmd)
}

// ibuStaterootSetupRun main function for do stateroot setup
// ibuStaterootSetupRun main function to do stateroot setup
func ibuStaterootSetupRun() error {
// k8s client
var (
hostCommandsExecutor = ops.NewChrootExecutor(log, true, common.Host)
opsClient = ops.NewOps(log, hostCommandsExecutor)
rpmOstreeClient = ostree.NewClient("lca-cli-stateroot-setup", hostCommandsExecutor)
ostreeClient = ostreeclient.NewClient(hostCommandsExecutor, false)
ctx, cancelCtx = context.WithCancel(context.Background())
)

// additional logger setup
loggerOpt := zap.Options{Development: true}
ctrl.SetLogger(zap.New(zap.UseFlagOptions(&loggerOpt)))
logger := ctrl.Log.WithName("prep-stateroot-job")

// defer cleanup
defer cancelCtx()

logger.Info("Starting a new client")
c, err := getClient()
if err != nil {
return err
}

// internals
hostCommandsExecutor := ops.NewChrootExecutor(log, true, common.Host)
opsClient := ops.NewOps(log, hostCommandsExecutor)
rpmOstreeClient := ostree.NewClient("lca-cli-stateroot-setup", hostCommandsExecutor)
ostreeClient := ostreeclient.NewClient(hostCommandsExecutor, false)
ctx := context.Background()
loggerOpt := zap.Options{Development: true}
// logger init...we have two loggers
ctrl.SetLogger(zap.New(zap.UseFlagOptions(&loggerOpt)))
logger := ctrl.Log.WithName("prep-stateroot-job")

logger.Info("Fetching the latest IBU cr")
ibu := &lcav1alpha1.ImageBasedUpgrade{}
if err := c.Get(context.Background(), types.NamespacedName{Namespace: common.LcaNamespace, Name: utils.IBUName}, ibu); err != nil {
if err := c.Get(ctx, types.NamespacedName{Namespace: common.LcaNamespace, Name: utils.IBUName}, ibu); err != nil {
return fmt.Errorf("failed get IBU cr: %w", err)
}

logger.Info("Starting signal handler")
initStaterootSetupSigHandler(logger, opsClient, ibu.Spec.SeedImageRef.Image)

logger.Info("Pulling seed image")
if err := controllers.GetSeedImage(c, ctx, ibu, logger, hostCommandsExecutor); err != nil {
return fmt.Errorf("failed to pull seed image: %w", err)
Expand All @@ -94,6 +106,31 @@ func ibuStaterootSetupRun() error {
return nil
}

// initStaterootSetupSigHandler installs a SIGTERM handler for the stateroot setup job.
// On SIGTERM, if the seed image is already present on the node, the handler sleeps for up
// to prep.StaterootSetupTerminationGracePeriodSeconds to let the in-flight stateroot setup
// proceed before kubelet escalates to SIGKILL; the process then exits with code 1.
func initStaterootSetupSigHandler(logger logr.Logger, opsClient ops.Ops, seedImage string) {
	signalChannel := make(chan os.Signal, 1)
	signal.Notify(signalChannel, syscall.SIGTERM) // to handle any additional signals add a new param here and also handle it specifically in the switch below

	go func() {
		logger.Info("Listening for signal")
		s := <-signalChannel
		switch s {
		case syscall.SIGTERM:
			logger.Error(fmt.Errorf("unexpected SIGTERM received to stop stateroot setup"), "")

			// If the seed image is present we can sleep and allow the rest of the stateroot
			// setup to proceed. A lookup error is deliberately ignored and treated as
			// "image not present" — this is a best-effort check on the way to shutdown.
			if seedImageExists, _ := opsClient.ImageExists(seedImage); seedImageExists {
				logger.Error(fmt.Errorf("seed image exists. prepare to shut down in at most %v sec", prep.StaterootSetupTerminationGracePeriodSeconds), "")
				time.Sleep(time.Duration(prep.StaterootSetupTerminationGracePeriodSeconds) * time.Second)
			}

			logger.Error(fmt.Errorf("proceeding to shutdown stateroot setup job"), "")
			os.Exit(1)
		default:
			// Not expected: signal.Notify above only subscribes to SIGTERM.
			// Use an explicit "%s" verb — passing s.String() directly as the format
			// string is a go vet printf violation and would mangle any '%' in it.
			logger.Error(fmt.Errorf("%s", s.String()), "Unknown signal")
		}
	}()
}

// getClient returns a client for this job
func getClient() (client.Client, error) {
cfg, err := config.GetConfig()
Expand Down

0 comments on commit 5e2b8d2

Please sign in to comment.