# Automated Rollback & Recovery #388
# This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
# Learn more about bidirectional Unicode characters
# Workflow header: display name and the three ways this workflow can start.
name: Automated Rollback & Recovery

on:
  # Fires each time a run of the "Release" or "CI" workflow completes on main,
  # whatever its conclusion.
  workflow_run:
    workflows:
      - Release
      - CI
    types:
      - completed
    branches:
      - main

  # Periodic run: minute 0 of every fourth hour.
  schedule:
    - cron: "0 */4 * * *"

  # Manual run from the Actions UI or the API.
  # NOTE(review): no job visible in this file reads `inputs.trigger_type`;
  # confirm whether it is still needed.
  workflow_dispatch:
    inputs:
      trigger_type:
        description: "Recovery trigger type"
        required: true
        default: auto
        type: choice
        options:
          - auto
          - manual
          - monitoring
jobs:
  # ---------------------------------------------------------------------------
  # Classifies the workflow run that triggered this workflow and decides
  # whether an automated rollback is warranted. Only runs for `workflow_run`
  # events; on schedule / manual runs it is skipped, and so are the two jobs
  # that depend on it.
  # ---------------------------------------------------------------------------
  incident-detection:
    name: Incident Detection
    runs-on: ubuntu-latest
    if: github.event_name == 'workflow_run'
    # The script only reads the event payload, so the token needs no scopes.
    permissions: {}
    outputs:
      severity: ${{ steps.analysis.outputs.severity }}
      requires-rollback: ${{ steps.analysis.outputs.requires_rollback }}
    steps:
      - name: Analyze Failed Workflow
        uses: actions/github-script@v7
        id: analysis
        with:
          script: |
            // The workflow_run payload already carries the run's name and
            // conclusion, so no extra API round-trip is needed.
            const { name, conclusion } = context.payload.workflow_run;
            const workflowName = name ?? '';
            const failed = conclusion === 'failure';

            // Failed Release => critical, failed CI => high. Every other
            // case (including non-failure conclusions) stays at 'medium'.
            let severity = 'medium';
            if (failed && workflowName.includes('Release')) {
              severity = 'critical';
            } else if (failed && workflowName.includes('CI')) {
              severity = 'high';
            }

            const requiresRollback = severity === 'critical' || severity === 'high';
            core.setOutput('severity', severity);
            core.setOutput('requires_rollback', String(requiresRollback));

  # ---------------------------------------------------------------------------
  # Restores main to the content of the failed commit's first parent by adding
  # a new commit on top of main (history is preserved, nothing is rewritten).
  # ---------------------------------------------------------------------------
  rollback-execution:
    name: Automated Rollback
    runs-on: ubuntu-latest
    needs: incident-detection
    if: needs.incident-detection.outputs.requires-rollback == 'true'
    # Creating a commit and moving refs/heads/main requires write access.
    permissions:
      contents: write
    # "Release" and "CI" can both fail for the same push; serialize so two
    # rollbacks never race on main.
    concurrency:
      group: automated-rollback-main
      cancel-in-progress: false
    outputs:
      rollback-sha: ${{ steps.rollback.outputs.rollback_sha }}
      status: ${{ steps.rollback.outputs.status }}
    steps:
      # No checkout: every operation below goes through the REST API.
      - name: Execute Rollback
        uses: actions/github-script@v7
        id: rollback
        with:
          script: |
            const { owner, repo } = context.repo;
            const failedSha = context.payload.workflow_run.head_sha;

            try {
              // Only roll back if main still points at the failed commit.
              // If it moved (newer push, or another run already rolled back)
              // a rollback here would discard or duplicate work.
              const { data: mainRef } = await github.rest.git.getRef({
                owner,
                repo,
                ref: 'heads/main',
              });
              if (mainRef.object.sha !== failedSha) {
                core.notice(
                  `main is at ${mainRef.object.sha.substring(0, 7)}, not at the failed commit ${failedSha.substring(0, 7)}; skipping rollback`
                );
                core.setOutput('status', 'skipped');
                return;
              }

              // The rollback target is the failed commit's first parent.
              // NOTE(review): assumes that parent was a good state - confirm.
              const { data: failedCommit } = await github.rest.git.getCommit({
                owner,
                repo,
                commit_sha: failedSha,
              });
              const previousSha = failedCommit.parents[0]?.sha;
              if (!previousSha) {
                throw new Error('No previous commit found');
              }
              const { data: previousCommit } = await github.rest.git.getCommit({
                owner,
                repo,
                commit_sha: previousSha,
              });

              // New commit: content (tree) of the previous commit, parented
              // on the failed commit so the ref update is a fast-forward.
              const { data: rollbackCommit } = await github.rest.git.createCommit({
                owner,
                repo,
                message: `Automated rollback to stable state\n\nRollback from ${failedSha.substring(0, 7)} to ${previousSha.substring(0, 7)}`,
                tree: previousCommit.tree.sha,
                parents: [failedSha],
              });

              // Non-forced update: the API rejects it if main moved in the
              // meantime instead of silently overwriting newer commits.
              await github.rest.git.updateRef({
                owner,
                repo,
                ref: 'heads/main',
                sha: rollbackCommit.sha,
                force: false,
              });

              core.setOutput('rollback_sha', rollbackCommit.sha);
              core.setOutput('status', 'success');
            } catch (error) {
              core.setOutput('status', 'failed');
              // Fail the step so a broken rollback is not reported as green.
              core.setFailed(`Rollback failed: ${error.message}`);
            }

  # ---------------------------------------------------------------------------
  # Runs when no rollback is required: counts failures among the ten most
  # recent completed workflow runs of the repository that were created within
  # the last 24 hours, and reports whether the breaker threshold (3) is hit.
  # ---------------------------------------------------------------------------
  circuit-breaker:
    name: Circuit Breaker Recovery
    runs-on: ubuntu-latest
    needs: incident-detection
    if: needs.incident-detection.outputs.requires-rollback == 'false'
    # Listing workflow runs needs read access to Actions data.
    permissions:
      actions: read
    steps:
      - name: Check Failure Pattern
        uses: actions/github-script@v7
        id: check
        with:
          script: |
            const DAY_MS = 24 * 60 * 60 * 1000;
            const FAILURE_THRESHOLD = 3;

            const { data } = await github.rest.actions.listWorkflowRunsForRepo({
              owner: context.repo.owner,
              repo: context.repo.repo,
              status: 'completed',
              per_page: 10,
            });

            const cutoff = Date.now() - DAY_MS;
            const recentFailures = data.workflow_runs.filter(
              (run) => run.conclusion === 'failure' && Date.parse(run.created_at) > cutoff
            ).length;

            core.setOutput('failure_count', String(recentFailures));
            core.setOutput('requires_breaker', String(recentFailures >= FAILURE_THRESHOLD));

      - name: Recovery Action
        # Pass step outputs through the environment rather than splicing
        # `${{ }}` expressions into the shell script text.
        env:
          REQUIRES_BREAKER: ${{ steps.check.outputs.requires_breaker }}
          FAILURE_COUNT: ${{ steps.check.outputs.failure_count }}
        run: |
          if [ "$REQUIRES_BREAKER" = "true" ]; then
            echo "Activating circuit breaker due to $FAILURE_COUNT failures"
          else
            echo "Standard recovery mode - $FAILURE_COUNT recent failures"
          fi

  # ---------------------------------------------------------------------------
  # Runs the test suite after the recovery jobs have finished (or were
  # skipped) and writes a summary. Test failures do not fail the job, but they
  # are reported truthfully in the summary.
  # ---------------------------------------------------------------------------
  recovery-validation:
    name: Recovery Validation
    runs-on: ubuntu-latest
    # Wait for the recovery jobs; `always()` keeps this job running even when
    # they were skipped or failed.
    needs: [rollback-execution, circuit-breaker]
    if: always()
    permissions:
      contents: read
    steps:
      # A local action (./.github/actions/...) can only be resolved after the
      # repository has been checked out.
      - name: Checkout Repository
        uses: actions/checkout@v6
        with:
          # Validate the rollback commit when one was created, otherwise the
          # commit this workflow run was started for.
          ref: ${{ needs.rollback-execution.outputs.rollback-sha || github.sha }}

      - name: Setup Environment
        uses: ./.github/actions/environment-setup
        with:
          node-version: '20.x'

      - name: Run Tests
        id: tests
        # Best effort: keep the job going on failure, but leave the real
        # outcome available as `steps.tests.outcome`.
        continue-on-error: true
        run: |
          echo "Running validation tests..."
          npm test -- --run
          echo "Recovery validation completed"

      - name: Summary
        env:
          TESTS_OUTCOME: ${{ steps.tests.outcome }}
        run: |
          if [ "$TESTS_OUTCOME" = "success" ]; then
            tests_status="Passed"
          else
            tests_status="Failed"
          fi

          # NOTE(review): the Status line and the Security / Recovery rows are
          # static text; no step in this job computes them.
          {
            echo "## Recovery Summary"
            echo "**Status:** Recovery completed"
            echo "**Timestamp:** $(date)"
            echo ""
            echo "| Component | Status |"
            echo "|-----------|--------|"
            echo "| Tests | ${tests_status} |"
            echo "| Security | Validated |"
            echo "| Recovery | Completed |"
          } >> "$GITHUB_STEP_SUMMARY"