-
Notifications
You must be signed in to change notification settings - Fork 12
chore: rollback rule #733
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
chore: rollback rule #733
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,159 @@ | ||
| package rollback | ||
|
|
||
| import ( | ||
| "context" | ||
| "time" | ||
| "workspace-engine/pkg/oapi" | ||
| "workspace-engine/pkg/workspace/releasemanager/deployment/jobs" | ||
| "workspace-engine/pkg/workspace/releasemanager/verification" | ||
| "workspace-engine/pkg/workspace/store" | ||
|
|
||
| "github.com/google/uuid" | ||
| "go.opentelemetry.io/otel" | ||
| "go.opentelemetry.io/otel/attribute" | ||
| "go.opentelemetry.io/otel/codes" | ||
| ) | ||
|
|
||
| var hookTracer = otel.Tracer("RollbackHooks") | ||
|
|
||
| type RollbackHooks struct { | ||
| store *store.Store | ||
| dispatcher *jobs.Dispatcher | ||
| } | ||
|
|
||
| var _ verification.VerificationHooks = &RollbackHooks{} | ||
|
|
||
| func NewRollbackHooks(store *store.Store, dispatcher *jobs.Dispatcher) *RollbackHooks { | ||
| return &RollbackHooks{ | ||
| store: store, | ||
| dispatcher: dispatcher, | ||
| } | ||
| } | ||
|
|
||
| func (h *RollbackHooks) OnVerificationStarted(ctx context.Context, verification *oapi.JobVerification) error { | ||
| return nil | ||
| } | ||
|
|
||
| func (h *RollbackHooks) OnMeasurementTaken(ctx context.Context, verification *oapi.JobVerification, metricIndex int, measurement *oapi.VerificationMeasurement) error { | ||
| return nil | ||
| } | ||
|
|
||
| func (h *RollbackHooks) OnMetricComplete(ctx context.Context, verification *oapi.JobVerification, metricIndex int) error { | ||
| return nil | ||
| } | ||
|
|
||
| func (h *RollbackHooks) OnVerificationComplete(ctx context.Context, verificationResult *oapi.JobVerification) error { | ||
| ctx, span := hookTracer.Start(ctx, "RollbackHooks.OnVerificationComplete") | ||
| defer span.End() | ||
|
|
||
| span.SetAttributes( | ||
| attribute.String("verification.id", verificationResult.Id), | ||
| attribute.String("verification.job_id", verificationResult.JobId), | ||
| ) | ||
|
|
||
| status := verificationResult.Status() | ||
| span.SetAttributes(attribute.String("verification.status", string(status))) | ||
|
|
||
| if status != oapi.JobVerificationStatusFailed { | ||
| span.SetStatus(codes.Ok, "verification did not fail") | ||
| return nil | ||
| } | ||
|
|
||
| job, ok := h.store.Jobs.Get(verificationResult.JobId) | ||
| if !ok { | ||
| span.SetStatus(codes.Error, "job not found") | ||
| return nil | ||
| } | ||
|
|
||
| release, ok := h.store.Releases.Get(job.ReleaseId) | ||
| if !ok { | ||
| span.SetStatus(codes.Error, "release not found") | ||
| return nil | ||
| } | ||
|
|
||
| span.SetAttributes( | ||
| attribute.String("release.id", release.ID()), | ||
| attribute.String("release_target.key", release.ReleaseTarget.Key()), | ||
| ) | ||
|
|
||
| policies, err := h.store.ReleaseTargets.GetPolicies(ctx, &release.ReleaseTarget) | ||
| if err != nil { | ||
| span.RecordError(err) | ||
| span.SetStatus(codes.Error, "failed to get policies") | ||
| return nil | ||
| } | ||
|
|
||
| if !h.shouldRollbackOnVerificationFailure(policies) { | ||
| span.SetAttributes(attribute.Bool("rollback_applicable", false)) | ||
| span.SetStatus(codes.Ok, "no applicable rollback policy for verification failure") | ||
| return nil | ||
| } | ||
|
|
||
| span.SetAttributes(attribute.Bool("rollback_applicable", true)) | ||
|
|
||
| currentRelease, lastSuccessfulJob, err := h.store.ReleaseTargets.GetCurrentRelease(ctx, &release.ReleaseTarget) | ||
| if err != nil { | ||
| span.AddEvent("No previous release to roll back to") | ||
| span.SetStatus(codes.Ok, "no previous release available") | ||
| return nil | ||
| } | ||
|
|
||
| // Don't rollback to the same release | ||
| if currentRelease.ID() == release.ID() { | ||
| span.AddEvent("Current release is the same as failed release, no rollback needed") | ||
| span.SetStatus(codes.Ok, "already on current release") | ||
| return nil | ||
| } | ||
|
|
||
| span.SetAttributes( | ||
| attribute.String("rollback_to_release.id", currentRelease.ID()), | ||
| attribute.String("rollback_to_version.id", currentRelease.Version.Id), | ||
| attribute.String("rollback_to_version.tag", currentRelease.Version.Tag), | ||
| ) | ||
|
|
||
| now := time.Now() | ||
| newJob := oapi.Job{ | ||
| Id: uuid.New().String(), | ||
| ReleaseId: lastSuccessfulJob.ReleaseId, | ||
| JobAgentId: lastSuccessfulJob.JobAgentId, | ||
| JobAgentConfig: lastSuccessfulJob.JobAgentConfig, | ||
| Status: oapi.JobStatusPending, | ||
| CreatedAt: now, | ||
| UpdatedAt: now, | ||
| } | ||
|
|
||
| h.store.Jobs.Upsert(ctx, &newJob) | ||
|
|
||
| if err := h.dispatcher.DispatchJob(ctx, &newJob); err != nil { | ||
| span.RecordError(err) | ||
| span.SetStatus(codes.Error, "rollback execution failed") | ||
| return err | ||
| } | ||
|
|
||
| span.SetStatus(codes.Ok, "rollback executed successfully") | ||
| return nil | ||
| } | ||
|
|
||
| func (h *RollbackHooks) OnVerificationStopped(ctx context.Context, verification *oapi.JobVerification) error { | ||
| return nil | ||
| } | ||
|
|
||
| func (h *RollbackHooks) shouldRollbackOnVerificationFailure(policies []*oapi.Policy) bool { | ||
| for _, policy := range policies { | ||
| if !policy.Enabled { | ||
| continue | ||
| } | ||
|
|
||
| for _, rule := range policy.Rules { | ||
| if rule.Rollback == nil { | ||
| continue | ||
| } | ||
|
|
||
| if rule.Rollback.OnVerificationFailure != nil && *rule.Rollback.OnVerificationFailure { | ||
| return true | ||
| } | ||
| } | ||
| } | ||
|
|
||
| return false | ||
| } |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,120 @@ | ||
| package rollback | ||
|
|
||
| import ( | ||
| "context" | ||
| "slices" | ||
| "time" | ||
| "workspace-engine/pkg/oapi" | ||
| "workspace-engine/pkg/workspace/releasemanager/action" | ||
| "workspace-engine/pkg/workspace/releasemanager/deployment/jobs" | ||
| "workspace-engine/pkg/workspace/store" | ||
|
|
||
| "github.com/google/uuid" | ||
| "go.opentelemetry.io/otel" | ||
| "go.opentelemetry.io/otel/attribute" | ||
| "go.opentelemetry.io/otel/codes" | ||
| ) | ||
|
|
||
| var tracer = otel.Tracer("RollbackAction") | ||
|
|
||
| type RollbackAction struct { | ||
| store *store.Store | ||
| dispatcher *jobs.Dispatcher | ||
| } | ||
|
|
||
| func NewRollbackAction(store *store.Store, dispatcher *jobs.Dispatcher) *RollbackAction { | ||
| return &RollbackAction{ | ||
| store: store, | ||
| dispatcher: dispatcher, | ||
| } | ||
| } | ||
|
|
||
| func (r *RollbackAction) Name() string { | ||
| return "rollback" | ||
| } | ||
|
|
||
| func (r *RollbackAction) Execute( | ||
| ctx context.Context, | ||
| trigger action.ActionTrigger, | ||
| actx action.ActionContext, | ||
| ) error { | ||
| ctx, span := tracer.Start(ctx, "RollbackAction.Execute") | ||
| defer span.End() | ||
|
|
||
| span.SetAttributes( | ||
| attribute.String("trigger", string(trigger)), | ||
| attribute.String("release.id", actx.Release.ID()), | ||
| attribute.String("job.id", actx.Job.Id), | ||
| attribute.String("job.status", string(actx.Job.Status)), | ||
| ) | ||
|
|
||
| if !r.shouldRollback(actx.Policies, actx.Job.Status) { | ||
| span.SetAttributes(attribute.Bool("rollback_applicable", false)) | ||
| span.SetStatus(codes.Ok, "no applicable rollback policy") | ||
| return nil | ||
| } | ||
|
|
||
| span.SetAttributes(attribute.Bool("rollback_applicable", true)) | ||
|
|
||
| currentRelease, lastSuccessfulJob, err := r.store.ReleaseTargets.GetCurrentRelease(ctx, &actx.Release.ReleaseTarget) | ||
| if err != nil { | ||
| span.AddEvent("No previous release to roll back to") | ||
| span.SetStatus(codes.Ok, "no previous release available") | ||
| return nil | ||
| } | ||
|
|
||
| if currentRelease.ID() == actx.Release.ID() { | ||
| span.AddEvent("Current release is the same as failed release, no rollback needed") | ||
| span.SetStatus(codes.Ok, "already on current release") | ||
| return nil | ||
| } | ||
|
|
||
| span.SetAttributes( | ||
| attribute.String("rollback_to_release.id", currentRelease.ID()), | ||
| attribute.String("rollback_to_version.id", currentRelease.Version.Id), | ||
| attribute.String("rollback_to_version.tag", currentRelease.Version.Tag), | ||
| ) | ||
|
|
||
| now := time.Now() | ||
| newJob := oapi.Job{ | ||
| Id: uuid.New().String(), | ||
| ReleaseId: lastSuccessfulJob.ReleaseId, | ||
| JobAgentId: lastSuccessfulJob.JobAgentId, | ||
| JobAgentConfig: lastSuccessfulJob.JobAgentConfig, | ||
| Status: oapi.JobStatusPending, | ||
| CreatedAt: now, | ||
| UpdatedAt: now, | ||
| } | ||
|
|
||
| r.store.Jobs.Upsert(ctx, &newJob) | ||
|
Comment on lines
+79
to
+89
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Check Upsert error and initialize Job fields. Two issues here:
🔎 Proposed fix now := time.Now()
newJob := oapi.Job{
Id: uuid.New().String(),
ReleaseId: lastSuccessfulJob.ReleaseId,
JobAgentId: lastSuccessfulJob.JobAgentId,
JobAgentConfig: lastSuccessfulJob.JobAgentConfig,
Status: oapi.JobStatusPending,
CreatedAt: now,
UpdatedAt: now,
+ Metadata: make(map[string]string),
+ TraceToken: lastSuccessfulJob.TraceToken,
}
- r.store.Jobs.Upsert(ctx, &newJob)
+ if err := r.store.Jobs.Upsert(ctx, &newJob); err != nil {
+ span.RecordError(err)
+ span.SetStatus(codes.Error, "failed to persist rollback job")
+ return err
+ }🤖 Prompt for AI Agents |
||
|
|
||
| if err := r.dispatcher.DispatchJob(ctx, &newJob); err != nil { | ||
| span.RecordError(err) | ||
| span.SetStatus(codes.Error, "rollback execution failed") | ||
| return err | ||
| } | ||
|
|
||
| span.SetStatus(codes.Ok, "rollback executed successfully") | ||
| return nil | ||
| } | ||
|
|
||
| func (r *RollbackAction) shouldRollback(policies []*oapi.Policy, jobStatus oapi.JobStatus) bool { | ||
| for _, policy := range policies { | ||
| if !policy.Enabled { | ||
| continue | ||
| } | ||
|
|
||
| for _, rule := range policy.Rules { | ||
| if rule.Rollback == nil { | ||
| continue | ||
| } | ||
|
|
||
| if rule.Rollback.OnJobStatuses != nil && | ||
| slices.Contains(*rule.Rollback.OnJobStatuses, jobStatus) { | ||
| return true | ||
| } | ||
| } | ||
| } | ||
|
|
||
| return false | ||
| } | ||
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
OnVerificationFailure rollback rules are never honored in RollbackAction
Right now
Executebails out unless:Rollbackrule, andRollBackJobStatusesslice is non-nil and contains the current job status.OnVerificationFailureis not considered at all inshouldRollback, so any policy that configures rollback only viaOnVerificationFailure(withRollBackJobStatuses == nil) will never trigger a rollback from this action, even if verifications have clearly failed and the evaluator has decided the rollback rule should deny.This contradicts the intended behavior implied by the RollbackRule schema and the new e2e tests that exercise
WithRuleRollbackOnVerificationFailure()and combined job/verification configurations.To fix this, you probably want
shouldRollback(or the Execute gating) to also takeOnVerificationFailureinto account, for example by:shouldRollbackto:RollBackJobStatusesagainstjobStatus, andOnVerificationFailureis true, inspectstore.JobVerifications.GetByJobId(job.Id)and return true if any verification hasJobVerificationStatusFailed.A possible direction (simplified, assuming you pass the job into
shouldRollback) would look like:Example adjustment to consider
That keeps the local decision logic aligned with how the rollback evaluator interprets
RollbackRule, and makes theOnVerificationFailureflag effective for action execution.Also applies to: 101-117
🤖 Prompt for AI Agents
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
the rollback action only fires on job updates, so this is intentional, we only care about the job part of the rule here
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@adityachoudhari26, thank you for the clarification! I understand now—the rollback logic has two separate trigger paths:
RollbackAction.Execute()→ checksRollBackJobStatusesRollbackHooks.OnVerificationComplete()→ checksOnVerificationFailureSo
shouldRollback()correctly only needs to inspectRollBackJobStatusessince this action is exclusively invoked when job statuses change. The verification-driven rollback path is handled independently by the hooks. That makes perfect sense!✏️ Learnings added
🧠 Learnings used