Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
48 changes: 48 additions & 0 deletions autoresearch/command-specs.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
{
"version": 1,
"kind": "command_incident",
"specs": [
{
"name": "weibo-hot-smoke",
"command": "opencli weibo hot --limit 5 -f json",
"safety": "read_only",
"prerequisites": {},
"verify": [
{ "type": "exitCode", "expected": 0 },
{ "type": "stdoutContains", "value": "[" },
{ "type": "jsonField", "path": "[0].title", "matcher": "nonEmpty" }
],
"repairScope": ["clis/weibo/**/*.ts", "clis/weibo/**/*.yaml"],
"forbidden": ["autoresearch/**", "tests/**"]
},
{
"name": "xiaohongshu-search-smoke",
"command": "opencli xiaohongshu search 美食 --limit 3 -f json",
"safety": "read_only",
"prerequisites": {},
"verify": [
{ "type": "exitCode", "expected": 0 },
{ "type": "stdoutContains", "value": "[" },
{ "type": "jsonField", "path": "length", "matcher": "gte", "value": "1" }
],
"repairScope": ["clis/xiaohongshu/**/*.ts", "clis/xiaohongshu/**/*.yaml"],
"forbidden": ["autoresearch/**", "tests/**"]
},
{
"name": "twitter-reply-fill-smoke",
"command": "opencli twitter reply https://x.com/elonmusk/status/1909633658498916828 'opencli smoke test'",
"safety": "fill_only",
"prerequisites": {
"auth": ["twitter"]
},
"verify": [
{ "type": "exitCode", "expected": 0 },
{ "type": "stdoutContains", "value": "dry_run" },
{ "type": "jsonField", "path": "[0].status", "matcher": "contains", "value": "dry_run" }
],
"cleanup": [],
"repairScope": ["clis/twitter/reply.ts", "clis/twitter/*.yaml"],
"forbidden": ["autoresearch/**", "tests/**"]
}
]
}
135 changes: 135 additions & 0 deletions autoresearch/commands/fix.ts
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,9 @@
import { execSync } from 'node:child_process';
import { join, dirname } from 'node:path';
import { fileURLToPath } from 'node:url';
import { readFileSync } from 'node:fs';
import { parseArgs } from '../config.js';
import type { CommandSpecsFile } from '../config.js';
import { Engine, type ModifyContext } from '../engine.js';

const __dirname = dirname(fileURLToPath(import.meta.url));
Expand Down Expand Up @@ -77,10 +79,143 @@ function detectBrokenState(): { verify: string; errors: number; description: str
return null; // all clean
}

/**
 * Build the incident-mode engine configuration for a single command spec.
 *
 * Reads `command-specs.json` from the parent directory of this file and looks
 * the spec up by its `name`. When no spec matches, an error is printed and the
 * process exits with code 1.
 *
 * @param specName - `name` of the spec to repair, as listed in `command-specs.json`.
 * @param maxIterations - Passed through unchanged as the config's `iterations`.
 * @returns The generated config together with the matched spec.
 */
function buildIncidentConfig(specName: string, maxIterations: number) {
  const specsPath = join(__dirname, '..', 'command-specs.json');
  const specsFile: CommandSpecsFile = JSON.parse(readFileSync(specsPath, 'utf-8'));
  const spec = specsFile.specs.find(s => s.name === specName);
  if (!spec) {
    console.error(`Spec "${specName}" not found in command-specs.json`);
    process.exit(1);
  }

  // The verify string below is a shell pipeline, so the spec name must not be
  // able to split into several words or inject shell syntax. Plain names are
  // left untouched; anything else is wrapped in single quotes, with embedded
  // single quotes escaped the POSIX way ('\'').
  const specArg = /^[\w.-]+$/.test(spec.name)
    ? spec.name
    : "'" + spec.name.replace(/'/g, "'\\''") + "'";

  // Use REGRESSIONS=N (direction: lower, goal: 0) instead of SCORE=X/Y.
  // This ensures infra/precondition failures don't pollute the metric.
  // grep for REGRESSIONS= to extract only the regression count line.
  return {
    config: {
      goal: `Fix command regression: ${spec.command}`,
      // Repairs may touch the spec's own adapter files plus shared sources under src/.
      scope: [...spec.repairScope, 'src/**/*.ts'],
      metric: 'regression_count',
      direction: 'lower' as const,
      verify: `npx tsx autoresearch/eval-cli.ts --spec ${specArg} 2>&1 | grep "^REGRESSIONS=" | tail -1`,
      guard: 'npm run build && npm test',
      iterations: maxIterations,
      minDelta: 1,
    },
    spec,
  };
}

/**
 * Compose the repair prompt for one incident-mode iteration.
 *
 * `command-specs.json` is re-read on every call. If the named spec is not
 * present, a generic one-line instruction is returned instead.
 *
 * @param specName - `name` of the spec whose command is failing.
 * @param ctx - Iteration context; `currentMetric` and `stuckHint` are embedded in the prompt.
 * @returns The prompt text.
 */
function buildIncidentPrompt(specName: string, ctx: ModifyContext): string {
  const specsPath = join(__dirname, '..', 'command-specs.json');
  const parsed: CommandSpecsFile = JSON.parse(readFileSync(specsPath, 'utf-8'));
  const target = parsed.specs.find(candidate => candidate.name === specName);
  if (!target) return 'Fix the failing command.';

  // Only mention forbidden paths when the spec actually lists some.
  let forbiddenLine = '';
  if (target.forbidden.length > 0) {
    forbiddenLine = `Do NOT modify: ${target.forbidden.join(', ')}`;
  }

  // The stuck hint is optional; an empty final line is emitted when it is absent.
  const stuckLine = ctx.stuckHint ? `STUCK HINT: ${ctx.stuckHint}` : '';

  const promptLines = [
    `Command \`${target.command}\` is failing (regression).`,
    '',
    `Current regression count: ${ctx.currentMetric}. Goal: 0 regressions.`,
    '',
    `The command implementation is at: ${target.repairScope.join(', ')}`,
    'Read the adapter code, understand why the command fails against the live site, and fix it.',
    '',
    'Common causes:',
    '- Site updated DOM selectors',
    '- URL pattern changed',
    '- Response format changed',
    '- Auth/cookie handling broke',
    '',
    forbiddenLine,
    'Fix ONE issue at a time.',
    '',
    stuckLine,
  ];
  return promptLines.join('\n');
}

async function main() {
const args = parseArgs(process.argv.slice(2));
const maxIterations = args.iterations ?? 20;
const mode = args.mode ?? 'repo';
const specName = args.spec;

if (mode === 'incident') {
if (!specName) {
console.error('Incident mode requires --spec <name>');
process.exit(1);
}

console.log(`\n🔧 AutoResearch Fix — Incident Mode: ${specName}\n`);

// Pre-flight: run eval-cli once to check if spec has actual regressions
const preflight = exec(`npx tsx autoresearch/eval-cli.ts --spec ${specName} 2>&1`);
const regressionsMatch = preflight.output.match(/REGRESSIONS=(\d+)/);
const regressionCount = regressionsMatch ? parseInt(regressionsMatch[1], 10) : 0;

if (regressionCount === 0) {
// Check if it's because of infra/precondition (exit code 2) or actually passing
if (preflight.output.includes('failed_infrastructure')) {
console.log(' ⚡ Cannot run: infrastructure failure (browser bridge not connected?)');
console.log(' Fix the infrastructure issue first, then retry.\n');
process.exit(1);
}
if (preflight.output.includes('failed_precondition')) {
console.log(' ⊘ Cannot run: prerequisite not met (auth/env missing?)');
console.log(' Ensure prerequisites are satisfied, then retry.\n');
process.exit(1);
}
console.log(' ✓ Spec already passing — nothing to fix!\n');
return;
}

console.log(` Found: ${regressionCount} regression(s)`);

const { config } = buildIncidentConfig(specName, maxIterations);

console.log(` Command spec: ${specName}`);
console.log(` Verify: ${config.verify}`);
console.log(` Scope: ${config.scope.join(', ')}\n`);

const logPath = join(ROOT, 'autoresearch-results.tsv');
const engine = new Engine(config, logPath, {
modify: async (ctx: ModifyContext) => {
const prompt = buildIncidentPrompt(specName, ctx);
try {
// Pass prompt via stdin to avoid shell metacharacter expansion
const result = execSync(
'claude -p --dangerously-skip-permissions --allowedTools "Bash(npm:*),Bash(npx:*),Read,Edit,Write,Glob,Grep" --output-format text --no-session-persistence',
{ cwd: ROOT, timeout: 180_000, encoding: 'utf-8', input: prompt, stdio: ['pipe', 'pipe', 'pipe'] }
).trim();
const lines = result.split('\n').filter(l => l.trim());
return lines[lines.length - 1]?.trim()?.slice(0, 120) || 'incident fix attempt';
} catch {
return null;
}
},
onStatus: (msg) => console.log(msg),
});

try {
const results = await engine.run();
const finalMetric = results[results.length - 1]?.metric ?? regressionCount;
if (finalMetric === 0) {
console.log(`\n✅ Command spec "${specName}" — all regressions fixed!\n`);
} else {
console.log(`\n⚠ Command spec "${specName}" — ${finalMetric} regression(s) remaining after ${maxIterations} iterations.\n`);
}
} catch (err: any) {
console.error(`\n❌ ${err.message}`);
process.exit(1);
}
return;
}

// ── Repo mode (default, existing behavior) ──
console.log('\n🔧 AutoResearch Fix — Detecting broken state...\n');

const broken = detectBrokenState();
Expand Down
56 changes: 54 additions & 2 deletions autoresearch/config.ts
Original file line number Diff line number Diff line change
Expand Up @@ -43,8 +43,8 @@ export interface IterationResult {
}

/** Parse CLI args into a partial config (missing fields filled by preset or prompts) */
export function parseArgs(argv: string[]): Partial<AutoResearchConfig> & { preset?: string; task?: string } {
const config: Partial<AutoResearchConfig> & { preset?: string; task?: string } = {};
export function parseArgs(argv: string[]): Partial<AutoResearchConfig> & { preset?: string; task?: string; mode?: string; spec?: string } {
const config: Partial<AutoResearchConfig> & { preset?: string; task?: string; mode?: string; spec?: string } = {};
for (let i = 0; i < argv.length; i++) {
const arg = argv[i];
const next = argv[i + 1];
Expand All @@ -59,17 +59,69 @@ export function parseArgs(argv: string[]): Partial<AutoResearchConfig> & { prese
case '--iterations': config.iterations = parseInt(next, 10); i++; break;
case '--min-delta': config.minDelta = parseFloat(next); i++; break;
case '--task': config.task = next; i++; break;
case '--mode': config.mode = next; i++; break;
case '--spec': config.spec = next; i++; break;
}
}
return config;
}

/* ── Command Incident Spec (v1) ── */

/**
 * A single verification step listed in a spec's `verify` array.
 *
 * Discriminated by `type`; each variant carries only the fields relevant to it.
 * How the checks are evaluated is not defined here — presumably by the eval
 * runner (TODO confirm).
 */
export type VerifyCheck =
  // Compares against a numeric exit code.
  | {
      type: 'exitCode';
      expected: number;
    }
  // Looks for a substring in standard output.
  | {
      type: 'stdoutContains';
      value: string;
    }
  // Inspects a field of JSON output; `path` values seen in specs include "[0].title" and "length".
  | {
      type: 'jsonField';
      path: string;
      matcher: 'nonEmpty' | 'contains' | 'gte' | 'matches';
      // Comparison operand as a string; optional because not every matcher needs one.
      value?: string;
    }
  // Carries a JavaScript snippet and a matcher for its result.
  | {
      type: 'pageEval';
      js: string;
      matcher: 'contains' | 'truthy' | 'equals';
      value?: string;
    };

/**
 * One entry of `command-specs.json`: a CLI command plus the checks that decide
 * whether it has regressed and the limits placed on an automated repair.
 */
export interface CommandIncidentSpec {
  /** Unique identifier; this is the value matched against `--spec`. */
  name: string;
  /** The full command line under test. */
  command: string;
  /** Side-effect level of the command. NOTE(review): exact semantics of each level are not defined here — confirm. */
  safety: 'read_only' | 'fill_only' | 'publish';
  /** Optional conditions for running the command (presumably checked before execution — TODO confirm). */
  prerequisites?: {
    /** Names of services requiring authentication, e.g. `"twitter"`. */
    auth?: Array<string>;
    /** String-to-string map; presumably required environment variables — confirm. */
    env?: { [key: string]: string };
    browserProfile?: string;
  };
  /** Optional list of strings; presumably steps run before `command` — confirm. */
  setup?: Array<string>;
  /** Checks applied to the command's result. */
  verify: Array<VerifyCheck>;
  /** Optional list of strings; presumably steps run after verification — confirm. */
  cleanup?: Array<string>;
  /** Glob patterns for the files that implement the command and may be edited to repair it. */
  repairScope: Array<string>;
  /** Glob patterns for files a repair must not modify. */
  forbidden: Array<string>;
}

/** Top-level shape of `command-specs.json`. */
export interface CommandSpecsFile {
  /** Schema version number of the file. */
  version: number;
  /** Fixed tag identifying the file as a set of command-incident specs. */
  kind: 'command_incident';
  /** All command specs defined in the file. */
  specs: Array<CommandIncidentSpec>;
}

/**
 * Outcome category assigned to a spec run.
 *
 * NOTE(review): the meanings below are read off the member names; the code
 * that assigns them is not part of this declaration — confirm against the
 * eval runner.
 */
export type SpecClassification =
  // All checks succeeded.
  | 'passed'
  // The command itself misbehaved.
  | 'failed_regression'
  // A prerequisite was not satisfied.
  | 'failed_precondition'
  // The surrounding tooling could not run the command.
  | 'failed_infrastructure'
  // The spec was not run.
  | 'skipped';

/** Result record for one spec run. */
export interface SpecResult {
  /** Name of the spec this result belongs to. */
  name: string;
  /** Outcome category of the run. */
  classification: SpecClassification;
  /** Elapsed time of the run. NOTE(review): unit is not stated here (presumably milliseconds) — confirm. */
  duration: number;
  /** Optional list of strings identifying the checks that failed. */
  failedChecks?: Array<string>;
  /** Captured standard output, when recorded. */
  stdout?: string;
  /** Captured standard error, when recorded. */
  stderr?: string;
  /** Exit code of the command, when one is available. */
  exitCode?: number;
}

/** Extract a number from command output using common patterns */
export function extractMetric(output: string): number | null {
// Try: last line that looks like a number
const lines = output.trim().split('\n');
for (let i = lines.length - 1; i >= 0; i--) {
const line = lines[i].trim();
// Match REGRESSIONS=N → N (for incident mode)
const regMatch = line.match(/REGRESSIONS[=:]\s*(\d+)/i);
if (regMatch) return parseFloat(regMatch[1]);
// Match standalone numbers: "56", "95.2", "SCORE=56/59" → 56
const scoreMatch = line.match(/SCORE[=:]\s*(\d+)/i);
if (scoreMatch) return parseFloat(scoreMatch[1]);
Expand Down
Loading