Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 20 additions & 0 deletions apps/workspace-engine/oapi/openapi.json
Original file line number Diff line number Diff line change
Expand Up @@ -1313,6 +1313,9 @@
"retry": {
"$ref": "#/components/schemas/RetryRule"
},
"rollback": {
"$ref": "#/components/schemas/RollbackRule"
},
"verification": {
"$ref": "#/components/schemas/VerificationRule"
},
Expand Down Expand Up @@ -1851,6 +1854,23 @@
],
"type": "object"
},
"RollbackRule": {
"properties": {
"onJobStatuses": {
"description": "Job statuses that will trigger a rollback",
"items": {
"$ref": "#/components/schemas/JobStatus"
},
"type": "array"
},
"onVerificationFailure": {
"default": false,
"description": "If true, a release target will be rolled back if the verification fails",
"type": "boolean"
}
},
"type": "object"
},
"RuleEvaluation": {
"properties": {
"actionRequired": {
Expand Down
17 changes: 17 additions & 0 deletions apps/workspace-engine/oapi/spec/schemas/policy.jsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@ local openapi = import '../lib/openapi.libsonnet';
deploymentWindow: openapi.schemaRef('DeploymentWindowRule'),
verification: openapi.schemaRef('VerificationRule'),
versionCooldown: openapi.schemaRef('VersionCooldownRule'),
rollback: openapi.schemaRef('RollbackRule'),
},
},

Expand Down Expand Up @@ -235,6 +236,22 @@ local openapi = import '../lib/openapi.libsonnet';
},
},

RollbackRule: {
type: 'object',
properties: {
onJobStatuses: {
type: 'array',
items: openapi.schemaRef('JobStatus'),
description: 'Job statuses that will trigger a rollback',
},
onVerificationFailure: {
type: 'boolean',
default: false,
description: 'If true, a release target will be rolled back if the verification fails',
},
},
},

DeployDecision: {
type: 'object',
required: ['policyResults'],
Expand Down
10 changes: 10 additions & 0 deletions apps/workspace-engine/pkg/oapi/oapi.gen.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,11 @@ import (
type ActionTrigger string

const (
TriggerJobCreated ActionTrigger = "job.created"
TriggerJobStarted ActionTrigger = "job.started"
TriggerJobSuccess ActionTrigger = "job.success"
TriggerJobFailure ActionTrigger = "job.failure"
TriggerJobCreated ActionTrigger = "job.created"
TriggerJobStarted ActionTrigger = "job.started"
TriggerJobSuccess ActionTrigger = "job.success"
TriggerJobFailure ActionTrigger = "job.failure"
TriggerJobStatusChange ActionTrigger = "job.statuschange"
)

// ActionContext provides context for action execution
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -132,5 +132,9 @@ func determineTrigger(
return TriggerJobFailure
}

if currentStatus != previousStatus {
return TriggerJobStatusChange
}

return ""
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,159 @@
package rollback

import (
"context"
"time"
"workspace-engine/pkg/oapi"
"workspace-engine/pkg/workspace/releasemanager/deployment/jobs"
"workspace-engine/pkg/workspace/releasemanager/verification"
"workspace-engine/pkg/workspace/store"

"github.com/google/uuid"
"go.opentelemetry.io/otel"
"go.opentelemetry.io/otel/attribute"
"go.opentelemetry.io/otel/codes"
)

var hookTracer = otel.Tracer("RollbackHooks")

type RollbackHooks struct {
store *store.Store
dispatcher *jobs.Dispatcher
}

var _ verification.VerificationHooks = &RollbackHooks{}

func NewRollbackHooks(store *store.Store, dispatcher *jobs.Dispatcher) *RollbackHooks {
return &RollbackHooks{
store: store,
dispatcher: dispatcher,
}
}

func (h *RollbackHooks) OnVerificationStarted(ctx context.Context, verification *oapi.JobVerification) error {
return nil
}

func (h *RollbackHooks) OnMeasurementTaken(ctx context.Context, verification *oapi.JobVerification, metricIndex int, measurement *oapi.VerificationMeasurement) error {
return nil
}

func (h *RollbackHooks) OnMetricComplete(ctx context.Context, verification *oapi.JobVerification, metricIndex int) error {
return nil
}

func (h *RollbackHooks) OnVerificationComplete(ctx context.Context, verificationResult *oapi.JobVerification) error {
ctx, span := hookTracer.Start(ctx, "RollbackHooks.OnVerificationComplete")
defer span.End()

span.SetAttributes(
attribute.String("verification.id", verificationResult.Id),
attribute.String("verification.job_id", verificationResult.JobId),
)

status := verificationResult.Status()
span.SetAttributes(attribute.String("verification.status", string(status)))

if status != oapi.JobVerificationStatusFailed {
span.SetStatus(codes.Ok, "verification did not fail")
return nil
}

job, ok := h.store.Jobs.Get(verificationResult.JobId)
if !ok {
span.SetStatus(codes.Error, "job not found")
return nil
}

release, ok := h.store.Releases.Get(job.ReleaseId)
if !ok {
span.SetStatus(codes.Error, "release not found")
return nil
}

span.SetAttributes(
attribute.String("release.id", release.ID()),
attribute.String("release_target.key", release.ReleaseTarget.Key()),
)

policies, err := h.store.ReleaseTargets.GetPolicies(ctx, &release.ReleaseTarget)
if err != nil {
span.RecordError(err)
span.SetStatus(codes.Error, "failed to get policies")
return nil
}

if !h.shouldRollbackOnVerificationFailure(policies) {
span.SetAttributes(attribute.Bool("rollback_applicable", false))
span.SetStatus(codes.Ok, "no applicable rollback policy for verification failure")
return nil
}

span.SetAttributes(attribute.Bool("rollback_applicable", true))

currentRelease, lastSuccessfulJob, err := h.store.ReleaseTargets.GetCurrentRelease(ctx, &release.ReleaseTarget)
if err != nil {
span.AddEvent("No previous release to roll back to")
span.SetStatus(codes.Ok, "no previous release available")
return nil
}

// Don't rollback to the same release
if currentRelease.ID() == release.ID() {
span.AddEvent("Current release is the same as failed release, no rollback needed")
span.SetStatus(codes.Ok, "already on current release")
return nil
}

span.SetAttributes(
attribute.String("rollback_to_release.id", currentRelease.ID()),
attribute.String("rollback_to_version.id", currentRelease.Version.Id),
attribute.String("rollback_to_version.tag", currentRelease.Version.Tag),
)

now := time.Now()
newJob := oapi.Job{
Id: uuid.New().String(),
ReleaseId: lastSuccessfulJob.ReleaseId,
JobAgentId: lastSuccessfulJob.JobAgentId,
JobAgentConfig: lastSuccessfulJob.JobAgentConfig,
Status: oapi.JobStatusPending,
CreatedAt: now,
UpdatedAt: now,
}

h.store.Jobs.Upsert(ctx, &newJob)

if err := h.dispatcher.DispatchJob(ctx, &newJob); err != nil {
span.RecordError(err)
span.SetStatus(codes.Error, "rollback execution failed")
return err
}

span.SetStatus(codes.Ok, "rollback executed successfully")
return nil
}

func (h *RollbackHooks) OnVerificationStopped(ctx context.Context, verification *oapi.JobVerification) error {
return nil
}

func (h *RollbackHooks) shouldRollbackOnVerificationFailure(policies []*oapi.Policy) bool {
for _, policy := range policies {
if !policy.Enabled {
continue
}

for _, rule := range policy.Rules {
if rule.Rollback == nil {
continue
}

if rule.Rollback.OnVerificationFailure != nil && *rule.Rollback.OnVerificationFailure {
return true
}
}
}

return false
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
package rollback

import (
"context"
"slices"
"time"
"workspace-engine/pkg/oapi"
"workspace-engine/pkg/workspace/releasemanager/action"
"workspace-engine/pkg/workspace/releasemanager/deployment/jobs"
"workspace-engine/pkg/workspace/store"

"github.com/google/uuid"
"go.opentelemetry.io/otel"
"go.opentelemetry.io/otel/attribute"
"go.opentelemetry.io/otel/codes"
)

var tracer = otel.Tracer("RollbackAction")

type RollbackAction struct {
store *store.Store
dispatcher *jobs.Dispatcher
}

func NewRollbackAction(store *store.Store, dispatcher *jobs.Dispatcher) *RollbackAction {
return &RollbackAction{
store: store,
dispatcher: dispatcher,
}
}

func (r *RollbackAction) Name() string {
return "rollback"
}

func (r *RollbackAction) Execute(
ctx context.Context,
trigger action.ActionTrigger,
actx action.ActionContext,
) error {
ctx, span := tracer.Start(ctx, "RollbackAction.Execute")
defer span.End()

span.SetAttributes(
attribute.String("trigger", string(trigger)),
attribute.String("release.id", actx.Release.ID()),
attribute.String("job.id", actx.Job.Id),
attribute.String("job.status", string(actx.Job.Status)),
)

if !r.shouldRollback(actx.Policies, actx.Job.Status) {
span.SetAttributes(attribute.Bool("rollback_applicable", false))
span.SetStatus(codes.Ok, "no applicable rollback policy")
return nil
}

Comment on lines +51 to +56
Copy link
Contributor

@coderabbitai coderabbitai bot Dec 23, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

OnVerificationFailure rollback rules are never honored in RollbackAction

Right now Execute bails out unless:

  • At least one enabled policy has a Rollback rule, and
  • That rule’s RollBackJobStatuses slice is non-nil and contains the current job status.

OnVerificationFailure is not considered at all in shouldRollback, so any policy that configures rollback only via OnVerificationFailure (with RollBackJobStatuses == nil) will never trigger a rollback from this action, even if verifications have clearly failed and the evaluator has decided the rollback rule should deny.

This contradicts the intended behavior implied by the RollbackRule schema and the new e2e tests that exercise WithRuleRollbackOnVerificationFailure() and combined job/verification configurations.

To fix this, you probably want shouldRollback (or the Execute gating) to also take OnVerificationFailure into account, for example by:

  • Either removing the policy re-check here and letting the action be invoked only when the rollback evaluator has already decided rollback is required, or
  • Extending shouldRollback to:
    • Still check RollBackJobStatuses against jobStatus, and
    • When OnVerificationFailure is true, inspect store.JobVerifications.GetByJobId(job.Id) and return true if any verification has JobVerificationStatusFailed.

A possible direction (simplified, assuming you pass the job into shouldRollback) would look like:

Example adjustment to consider
-func (r *RollbackAction) Execute(
+func (r *RollbackAction) Execute(
     ctx context.Context,
     trigger action.ActionTrigger,
     actx action.ActionContext,
 ) error {
@@
-    if !r.shouldRollback(actx.Policies, actx.Job.Status) {
+    if !r.shouldRollback(actx.Policies, actx.Job) {
         span.SetAttributes(attribute.Bool("rollback_applicable", false))
         span.SetStatus(codes.Ok, "no applicable rollback policy")
         return nil
     }
@@
-func (r *RollbackAction) shouldRollback(policies []*oapi.Policy, jobStatus oapi.JobStatus) bool {
+func (r *RollbackAction) shouldRollback(policies []*oapi.Policy, job *oapi.Job) bool {
+    jobStatus := job.Status
     for _, policy := range policies {
         if !policy.Enabled {
             continue
         }
 
         for _, rule := range policy.Rules {
             if rule.Rollback == nil {
                 continue
             }
 
-            if rule.Rollback.RollBackJobStatuses != nil &&
-                slices.Contains(*rule.Rollback.RollBackJobStatuses, jobStatus) {
-                return true
-            }
+            rb := rule.Rollback
+            if rb.RollBackJobStatuses != nil &&
+                slices.Contains(*rb.RollBackJobStatuses, jobStatus) {
+                return true
+            }
+
+            if rb.OnVerificationFailure != nil && *rb.OnVerificationFailure {
+                verifications := r.store.JobVerifications.GetByJobId(job.Id)
+                for _, v := range verifications {
+                    if v.Status() == oapi.JobVerificationStatusFailed {
+                        return true
+                    }
+                }
+            }
         }
     }
 
     return false
 }

That keeps the local decision logic aligned with how the rollback evaluator interprets RollbackRule, and makes the OnVerificationFailure flag effective for action execution.

Also applies to: 101-117

🤖 Prompt for AI Agents
In
apps/workspace-engine/pkg/workspace/releasemanager/action/rollback/rollback.go
around lines 51-56 (also apply same change to 101-117), the Execute path bails
out unless a policy has Rollback.RollBackJobStatuses matching the current job
status and therefore ignores RollbackRule.OnVerificationFailure; update the
gating so OnVerificationFailure is honored: either remove this local policy
re-check and rely on the evaluator decision, or extend shouldRollback (or
Execute) to accept the job and when a policy's OnVerificationFailure is true
call store.JobVerifications.GetByJobId(job.ID) and return true if any
verification has JobVerificationStatusFailed while still keeping existing
RollBackJobStatuses checks. Ensure span attributes/status handling remains
consistent when deciding to return nil vs proceed.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

the rollback action only fires on job updates, so this is intentional, we only care about the job part of the rule here

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@adityachoudhari26, thank you for the clarification! I understand now—the rollback logic has two separate trigger paths:

  1. Job status changesRollbackAction.Execute() → checks RollBackJobStatuses
  2. Verification completionRollbackHooks.OnVerificationComplete() → checks OnVerificationFailure

So shouldRollback() correctly only needs to inspect RollBackJobStatuses since this action is exclusively invoked when job statuses change. The verification-driven rollback path is handled independently by the hooks. That makes perfect sense!


✏️ Learnings added
Learnt from: adityachoudhari26
Repo: ctrlplanedev/ctrlplane PR: 733
File: apps/workspace-engine/pkg/workspace/releasemanager/action/rollback/rollback.go:51-56
Timestamp: 2025-12-24T17:22:38.067Z
Learning: The rollback system uses two separate trigger mechanisms: RollbackAction (in rollback.go) handles rollbacks triggered by job status changes and checks the RollBackJobStatuses field, while RollbackHooks (in hooks.go) handles rollbacks triggered by verification failures and checks the OnVerificationFailure field. Each component only needs to check its respective portion of the RollbackRule configuration.

Note: Learnings are effective only in the context of similar code segments. To apply general coding guidelines, please configure review instructions. You can manage existing learnings in the app.


🧠 Learnings used
Learnt from: adityachoudhari26
Repo: ctrlplanedev/ctrlplane PR: 658
File: apps/workspace-engine/pkg/engine/policy/rules/environment-version-rollout/rollout_start_time.go:65-80
Timestamp: 2025-08-19T22:58:15.170Z
Learning: In the environment-version-rollout rule's rollout start time logic, the code intentionally returns the latest ApprovedAt time (first non-nil record from records ordered by updatedAt) rather than the earliest. This is because the goal is to find the latest approval that caused all the rules to pass. While the complete logic for determining "the latest that caused all rules to pass" would be too complex, this approximation works well given that the UI prevents approving if all rules are already passing.

Learnt from: adityachoudhari26
Repo: ctrlplanedev/ctrlplane PR: 637
File: packages/events/src/kafka/client.ts:10-16
Timestamp: 2025-08-01T04:41:41.345Z
Learning: User adityachoudhari26 prefers not to add null safety checks for required environment variables when they are guaranteed to be present in their deployment configuration, similar to their preference for simplicity over defensive programming in test code.

Learnt from: adityachoudhari26
Repo: ctrlplanedev/ctrlplane PR: 601
File: e2e/tests/api/policies/retry-policy.spec.ts:23-24
Timestamp: 2025-06-24T23:52:50.732Z
Learning: The user adityachoudhari26 prefers not to add null safety checks or defensive programming in test code, particularly in e2e tests, as they prioritize simplicity and focus on the main functionality being tested rather than comprehensive error handling within the test itself.

span.SetAttributes(attribute.Bool("rollback_applicable", true))

currentRelease, lastSuccessfulJob, err := r.store.ReleaseTargets.GetCurrentRelease(ctx, &actx.Release.ReleaseTarget)
if err != nil {
span.AddEvent("No previous release to roll back to")
span.SetStatus(codes.Ok, "no previous release available")
return nil
}

if currentRelease.ID() == actx.Release.ID() {
span.AddEvent("Current release is the same as failed release, no rollback needed")
span.SetStatus(codes.Ok, "already on current release")
return nil
}

span.SetAttributes(
attribute.String("rollback_to_release.id", currentRelease.ID()),
attribute.String("rollback_to_version.id", currentRelease.Version.Id),
attribute.String("rollback_to_version.tag", currentRelease.Version.Tag),
)

now := time.Now()
newJob := oapi.Job{
Id: uuid.New().String(),
ReleaseId: lastSuccessfulJob.ReleaseId,
JobAgentId: lastSuccessfulJob.JobAgentId,
JobAgentConfig: lastSuccessfulJob.JobAgentConfig,
Status: oapi.JobStatusPending,
CreatedAt: now,
UpdatedAt: now,
}

r.store.Jobs.Upsert(ctx, &newJob)
Comment on lines +79 to +89
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🔴 Critical

Check Upsert error and initialize Job fields.

Two issues here:

  1. Critical: The Upsert call on line 89 doesn't check for errors, which could lead to silent failures if the job cannot be persisted.

  2. Major: The new job doesn't initialize Metadata or copy TraceToken from lastSuccessfulJob, which may cause:

    • Nil map serialization issues or panics if downstream code attempts map operations
    • Broken trace continuity, making it harder to correlate rollback jobs with their triggers in observability tooling
🔎 Proposed fix
 	now := time.Now()
 	newJob := oapi.Job{
 		Id:             uuid.New().String(),
 		ReleaseId:      lastSuccessfulJob.ReleaseId,
 		JobAgentId:     lastSuccessfulJob.JobAgentId,
 		JobAgentConfig: lastSuccessfulJob.JobAgentConfig,
 		Status:         oapi.JobStatusPending,
 		CreatedAt:      now,
 		UpdatedAt:      now,
+		Metadata:       make(map[string]string),
+		TraceToken:     lastSuccessfulJob.TraceToken,
 	}
 
-	r.store.Jobs.Upsert(ctx, &newJob)
+	if err := r.store.Jobs.Upsert(ctx, &newJob); err != nil {
+		span.RecordError(err)
+		span.SetStatus(codes.Error, "failed to persist rollback job")
+		return err
+	}
🤖 Prompt for AI Agents
In
apps/workspace-engine/pkg/workspace/releasemanager/action/rollback/rollback.go
around lines 79 to 89, the new Job creation and persistence silently ignore
errors and omit important fields; ensure you initialize Metadata (set to an
empty map if lastSuccessfulJob.Metadata is nil) and copy TraceToken from
lastSuccessfulJob into newJob, then call r.store.Jobs.Upsert(ctx, &newJob) and
check its returned error; on error return or wrap it with context (or log and
return) so the caller can handle persistence failures instead of failing
silently.


if err := r.dispatcher.DispatchJob(ctx, &newJob); err != nil {
span.RecordError(err)
span.SetStatus(codes.Error, "rollback execution failed")
return err
}

span.SetStatus(codes.Ok, "rollback executed successfully")
return nil
}

func (r *RollbackAction) shouldRollback(policies []*oapi.Policy, jobStatus oapi.JobStatus) bool {
for _, policy := range policies {
if !policy.Enabled {
continue
}

for _, rule := range policy.Rules {
if rule.Rollback == nil {
continue
}

if rule.Rollback.OnJobStatuses != nil &&
slices.Contains(*rule.Rollback.OnJobStatuses, jobStatus) {
return true
}
}
}

return false
}
Loading
Loading