Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
163 changes: 163 additions & 0 deletions autoresearch/commands/debug.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,163 @@
#!/usr/bin/env npx tsx
/**
* /autoresearch:debug — Hypothesis-driven debugging for specific failing tasks.
*
* Scientific method: Gather → Hypothesize → Test → Classify → Log → Repeat
*
* Usage:
* npx tsx autoresearch/commands/debug.ts --task extract-npm-description
* npx tsx autoresearch/commands/debug.ts --task bench-imdb-matrix --iterations 5
*/

import { execSync } from 'node:child_process';
import { readFileSync, appendFileSync, writeFileSync, existsSync } from 'node:fs';
import { join, dirname } from 'node:path';
import { fileURLToPath } from 'node:url';
import { parseArgs } from '../config.js';

// Resolve paths relative to this file so the script works from any cwd.
// ROOT is the repository root (two levels up from autoresearch/commands/).
const __dirname = dirname(fileURLToPath(import.meta.url));
const ROOT = join(__dirname, '..', '..');
// Task definitions consumed by this debugger.
const TASKS_FILE = join(__dirname, '..', 'browse-tasks.json');
// Tab-separated log of every debug iteration (see initLog/appendLog).
const DEBUG_LOG = join(ROOT, 'debug-results.tsv');

/**
 * One entry from browse-tasks.json: a named sequence of shell steps whose
 * final output is evaluated against a judge specification.
 */
interface BrowseTask {
  name: string;     // unique task identifier, matched against --task
  steps: string[];  // shell commands run in order; the last one's output is judged
  // Judge criteria; which optional fields apply presumably depends on `type` —
  // TODO confirm against the judge implementation.
  judge: { type: string; value?: string; minLength?: number; pattern?: string };
}

/**
 * Run a shell command in the repo root and return its trimmed stdout.
 *
 * Never throws: on failure it falls back to the child's captured stdout
 * (diagnostics often land there), then the error message, then ''.
 *
 * @param cmd Shell command line to execute.
 * @returns Trimmed stdout, or best-effort error text on failure.
 */
function exec(cmd: string): string {
  try {
    return execSync(cmd, {
      cwd: ROOT, timeout: 30_000, encoding: 'utf-8',
      stdio: ['pipe', 'pipe', 'pipe'],
    }).trim();
  } catch (err) {
    // execSync failures carry the child's stdout as an extra property;
    // narrow the unknown catch variable instead of using `any`.
    const e = err as { stdout?: string; message?: string };
    return e.stdout?.trim() ?? e.message ?? '';
  }
}

/** Create the TSV debug log with its comment header and column row, unless it already exists. */
function initLog(): void {
  if (existsSync(DEBUG_LOG)) return;
  const header = '# AutoResearch Debug Log\niteration\ttask\thypothesis\tresult\tverdict\tdescription\n';
  writeFileSync(DEBUG_LOG, header, 'utf-8');
}

/** Append one tab-separated iteration record to the debug log. */
function appendLog(iteration: number, task: string, hypothesis: string, result: string, verdict: string, description: string): void {
  const row = [iteration, task, hypothesis, result, verdict, description].join('\t');
  appendFileSync(DEBUG_LOG, `${row}\n`, 'utf-8');
}

/** Escape text for safe embedding inside a double-quoted POSIX shell string. */
function shellQuote(s: string): string {
  // Inside double quotes the shell still interprets \ " $ and ` — the
  // original escaped only '"', which left a command-injection hole via
  // $(...) or backticks in the task output interpolated into the prompt.
  return s.replace(/[\\"$`]/g, '\\$&');
}

/**
 * Entry point.
 *
 * Phase 1: run the target task's steps once and capture the final output.
 * Phase 2: loop up to --iterations times asking the Claude CLI to form a
 * falsifiable hypothesis, test it, and classify the result; every iteration
 * is appended to DEBUG_LOG. Stops early when a hypothesis is CONFIRMED.
 */
async function main(): Promise<void> {
  const args = parseArgs(process.argv.slice(2));
  const taskName = args.task;
  const maxIterations = args.iterations ?? 10;

  if (!taskName) {
    console.error('Usage: npx tsx autoresearch/commands/debug.ts --task <task-name> [--iterations N]');
    console.error('\nAvailable tasks:');
    const tasks: BrowseTask[] = JSON.parse(readFileSync(TASKS_FILE, 'utf-8'));
    // Show only failing tasks. exec() traps its own errors, so the original
    // `try { exec(...) } catch {}` wrappers were dead code and are dropped.
    for (const task of tasks) {
      exec('opencli operate close'); // reset browser state between tasks
      let lastOutput = '';
      for (const step of task.steps) lastOutput = exec(step);
      const passed = lastOutput.trim().length > 0; // simplified check
      if (!passed) console.error(`  ✗ ${task.name}`);
    }
    process.exit(1);
  }

  const tasks: BrowseTask[] = JSON.parse(readFileSync(TASKS_FILE, 'utf-8'));
  const task = tasks.find(t => t.name === taskName);
  if (!task) {
    console.error(`Task not found: ${taskName}`);
    process.exit(1);
  }

  console.log(`\n🔍 AutoResearch Debug: ${taskName}`);
  console.log(`   Steps: ${task.steps.length}`);
  console.log(`   Judge: ${task.judge.type}${task.judge.value ? ` "${task.judge.value}"` : ''}`);
  console.log(`   Max iterations: ${maxIterations}\n`);

  initLog();

  // Phase 1: Gather — run the task and capture output
  console.log('Phase 1: Gathering symptoms...');
  exec('opencli operate close');

  let lastOutput = '';
  for (let i = 0; i < task.steps.length; i++) {
    const step = task.steps[i];
    console.log(`  Step ${i + 1}: ${step.slice(0, 80)}`);
    lastOutput = exec(step);
    if (i < task.steps.length - 1) {
      console.log(`    → ${lastOutput.slice(0, 100)}`);
    }
  }
  console.log(`\n  Final output: ${lastOutput.slice(0, 200)}`);
  console.log(`  Judge expects: ${JSON.stringify(task.judge)}`);

  // Phase 2: Hypothesize + investigate via Claude Code
  for (let iter = 1; iter <= maxIterations; iter++) {
    console.log(`\n━━━ Debug Iteration ${iter}/${maxIterations} ━━━`);

    const prompt = `You are debugging a failing browser automation task.

## Task: ${taskName}
Steps:
${task.steps.map((s, i) => `  ${i + 1}. ${s}`).join('\n')}

## Judge criteria
${JSON.stringify(task.judge)}

## Last output
${lastOutput.slice(0, 500)}

## Instructions
1. Form a SPECIFIC, FALSIFIABLE hypothesis about why this task fails
2. Run the MINIMUM experiment to test your hypothesis (e.g. run one step, check output)
3. Classify: CONFIRMED (bug found), DISPROVEN (try different hypothesis), INCONCLUSIVE
4. If CONFIRMED: describe the root cause and suggest a fix
5. Output format: one line "HYPOTHESIS: ...", one line "RESULT: CONFIRMED|DISPROVEN|INCONCLUSIVE — ..."

Do NOT fix the code — just diagnose. Use opencli operate commands to investigate.`;

    try {
      const result = execSync(
        `claude -p --dangerously-skip-permissions --allowedTools "Bash(opencli:*),Bash(npm:*),Read,Grep,Glob" --output-format text --no-session-persistence "${shellQuote(prompt)}"`,
        { cwd: ROOT, timeout: 120_000, encoding: 'utf-8', stdio: ['pipe', 'pipe', 'pipe'] }
      ).trim();

      // Extract hypothesis and result from the agent's free-form reply.
      const hypMatch = result.match(/HYPOTHESIS:\s*(.+)/i);
      const resMatch = result.match(/RESULT:\s*(CONFIRMED|DISPROVEN|INCONCLUSIVE)\s*[-—]\s*(.+)/i);

      const hypothesis = hypMatch?.[1]?.trim() ?? 'unknown';
      // The match is case-insensitive, so normalize before comparing below —
      // the original compared the raw capture and could miss "confirmed".
      const verdict = (resMatch?.[1]?.trim() ?? 'INCONCLUSIVE').toUpperCase();
      const description = resMatch?.[2]?.trim() ?? result.split('\n').pop()?.trim() ?? '';

      console.log(`  Hypothesis: ${hypothesis.slice(0, 100)}`);
      console.log(`  Verdict: ${verdict} — ${description.slice(0, 100)}`);

      appendLog(iter, taskName, hypothesis, lastOutput.slice(0, 50), verdict, description);

      if (verdict === 'CONFIRMED') {
        console.log(`\n✅ Root cause found at iteration ${iter}!`);
        console.log(`   ${description}`);
        break;
      }
    } catch (err) {
      // execSync throws on non-zero exit or timeout from the claude CLI.
      const msg = err instanceof Error ? err.message : String(err);
      console.error(`  Error: ${msg.slice(0, 100)}`);
      appendLog(iter, taskName, 'error', '', 'CRASH', msg.slice(0, 80));
    }

    // Re-run task for fresh output before the next hypothesis.
    exec('opencli operate close');
    for (const step of task.steps) lastOutput = exec(step);
  }

  exec('opencli operate close');
  console.log(`\nDebug log saved to: ${DEBUG_LOG}\n`);
}

// Surface unhandled rejections instead of leaving a floating promise.
main().catch(err => {
  console.error(err);
  process.exit(1);
});
145 changes: 145 additions & 0 deletions autoresearch/commands/fix.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,145 @@
#!/usr/bin/env npx tsx
/**
* /autoresearch:fix — Iterative error elimination.
*
* Auto-detects broken state (build → test → browse tests) and iteratively
* fixes errors one at a time. Stops when error count reaches 0.
*
* Priority: build errors → test failures → browse task failures
*
* Usage:
* npx tsx autoresearch/commands/fix.ts
* npx tsx autoresearch/commands/fix.ts --iterations 10
*/

import { execSync } from 'node:child_process';
import { join, dirname } from 'node:path';
import { fileURLToPath } from 'node:url';
import { parseArgs } from '../config.js';
import { Engine, type ModifyContext } from '../engine.js';

// Resolve the repository root relative to this file so the script works
// regardless of the caller's cwd.
const __dirname = dirname(fileURLToPath(import.meta.url));
const ROOT = join(__dirname, '..', '..');

/**
 * Run a shell command in the repo root.
 *
 * Never throws. On success returns ok=true with trimmed stdout; on failure
 * returns ok=false with the child's stdout and stderr concatenated, since
 * compilers and test runners write diagnostics to either stream.
 *
 * @param cmd Shell command line to execute.
 */
function exec(cmd: string): { ok: boolean; output: string } {
  try {
    const output = execSync(cmd, {
      cwd: ROOT, timeout: 120_000, encoding: 'utf-8',
      stdio: ['pipe', 'pipe', 'pipe'],
    }).trim();
    return { ok: true, output };
  } catch (err) {
    // Narrow the unknown catch value instead of `any`; execSync attaches
    // the captured streams as extra properties on the thrown error.
    const e = err as { stdout?: string; stderr?: string };
    return { ok: false, output: (e.stdout ?? '') + '\n' + (e.stderr ?? '') };
  }
}

/**
 * Probe the project in priority order (build → tests → browse eval) and
 * report the first broken stage: a verify command that prints the error
 * count, the current count, and a human-readable description.
 * Returns null when every stage is clean.
 */
function detectBrokenState(): { verify: string; errors: number; description: string } | null {
  // Stage 1: TypeScript build.
  const buildResult = exec('npm run build 2>&1');
  if (!buildResult.ok) {
    const tsErrors = (buildResult.output.match(/error TS/g) ?? []).length || 1;
    return {
      verify: 'npm run build 2>&1 | grep -c "error TS" || echo 0',
      errors: tsErrors,
      description: `${tsErrors} TypeScript build error(s)`,
    };
  }

  // Stage 2: unit tests.
  const testResult = exec('npm test 2>&1');
  if (!testResult.ok) {
    const failMatch = testResult.output.match(/(\d+)\s+fail/i);
    const failing = failMatch ? parseInt(failMatch[1], 10) : 1;
    return {
      verify: 'npm test 2>&1 | grep -oP "\\d+(?= fail)" || echo 0',
      errors: failing,
      description: `${failing} test failure(s)`,
    };
  }

  // Stage 3: browse task evaluation (reports a SCORE=passed/total line).
  const browseResult = exec('npx tsx autoresearch/eval-browse.ts 2>&1');
  const score = browseResult.output.match(/SCORE=(\d+)\/(\d+)/);
  if (score) {
    const passed = parseInt(score[1], 10);
    const total = parseInt(score[2], 10);
    const failing = total - passed;
    if (failing > 0) {
      return {
        verify: 'npx tsx autoresearch/eval-browse.ts 2>&1 | tail -1',
        errors: failing,
        description: `${failing} browse task failure(s) (${passed}/${total})`,
      };
    }
  }

  return null; // all three stages clean
}

/** Escape text for safe embedding inside a double-quoted POSIX shell string. */
function shellQuote(s: string): string {
  // Inside double quotes the shell still interprets \ " $ and ` — the
  // original escaped only '"', allowing command injection via $(...) or
  // backticks in the interpolated error context.
  return s.replace(/[\\"$`]/g, '\\$&');
}

/**
 * Entry point: detect the first broken stage (build → tests → browse),
 * then drive the Engine loop, prompting the Claude CLI to fix one error
 * per iteration until the error count reaches 0 or the budget runs out.
 */
async function main(): Promise<void> {
  const args = parseArgs(process.argv.slice(2));
  const maxIterations = args.iterations ?? 20;

  console.log('\n🔧 AutoResearch Fix — Detecting broken state...\n');

  const broken = detectBrokenState();
  if (!broken) {
    console.log('  ✓ All clean — nothing to fix!\n');
    return;
  }

  console.log(`  Found: ${broken.description}`);
  console.log(`  Verify: ${broken.verify}\n`);

  const config = {
    goal: `Fix all errors: ${broken.description}`,
    scope: ['src/**/*.ts', 'extension/src/**/*.ts'],
    metric: 'error_count',
    direction: 'lower' as const, // fewer errors is better
    verify: broken.verify,
    guard: 'npm run build', // never accept a change that breaks the build
    iterations: maxIterations,
    minDelta: 1,
  };

  const logPath = join(ROOT, 'autoresearch-results.tsv');
  const engine = new Engine(config, logPath, {
    modify: async (ctx: ModifyContext) => {
      const prompt = `Fix ONE error. Current error count: ${ctx.currentMetric}. Goal: 0 errors.

Read the error output, understand the root cause, and make ONE focused fix.
Do NOT fix multiple unrelated errors at once.
Do NOT modify test files.

${ctx.stuckHint ? `STUCK HINT: ${ctx.stuckHint}` : ''}`;

      try {
        const result = execSync(
          `claude -p --dangerously-skip-permissions --allowedTools "Bash(npm:*),Bash(npx:*),Read,Edit,Write,Glob,Grep" --output-format text --no-session-persistence "${shellQuote(prompt)}"`,
          { cwd: ROOT, timeout: 180_000, encoding: 'utf-8', stdio: ['pipe', 'pipe', 'pipe'] }
        ).trim();
        // Use the agent's last non-empty line as the iteration summary.
        const lines = result.split('\n').filter(l => l.trim());
        return lines[lines.length - 1]?.trim()?.slice(0, 120) || 'fix attempt';
      } catch {
        // A crashed modify attempt is reported to the engine as no change.
        return null;
      }
    },
    onStatus: (msg) => console.log(msg),
  });

  try {
    const results = await engine.run();
    const finalMetric = results[results.length - 1]?.metric ?? broken.errors;
    if (finalMetric === 0) {
      console.log('\n✅ All errors fixed!\n');
    } else {
      console.log(`\n⚠ ${finalMetric} error(s) remaining after ${maxIterations} iterations.\n`);
    }
  } catch (err) {
    // Narrow the unknown catch value instead of assuming `any`.
    const msg = err instanceof Error ? err.message : String(err);
    console.error(`\n❌ ${msg}`);
    process.exit(1);
  }
}

// Surface unhandled rejections instead of leaving a floating promise.
main().catch(err => {
  console.error(err);
  process.exit(1);
});
88 changes: 88 additions & 0 deletions autoresearch/commands/plan.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
#!/usr/bin/env npx tsx
/**
* /autoresearch:plan — Interactive configuration wizard.
*
* Walks through goal, scope, metric, verify, guard settings
* and outputs a ready-to-paste run command.
*
* Usage:
* npx tsx autoresearch/commands/plan.ts
*/

import { execSync } from 'node:child_process';
import { createInterface } from 'node:readline';
import { join, dirname } from 'node:path';
import { fileURLToPath } from 'node:url';
import { PRESETS } from '../presets/index.js';

// Resolve the repository root relative to this file so the wizard works
// regardless of the caller's cwd.
const __dirname = dirname(fileURLToPath(import.meta.url));
const ROOT = join(__dirname, '..', '..');

// Shared readline interface over stdin/stdout, plus a promisified prompt
// helper so answers can be awaited.
const rl = createInterface({ input: process.stdin, output: process.stdout });
const ask = (q: string): Promise<string> => new Promise(r => rl.question(q, r));

/** Escape text for safe embedding inside a double-quoted POSIX shell string. */
const sh = (s: string): string => s.replace(/[\\"$`]/g, '\\$&');

/**
 * Entry point: offer a preset, or walk through a custom config
 * (goal/scope/metric/direction/verify/guard), dry-run the verify command,
 * and print a ready-to-paste run command. The readline interface is
 * always closed, even when a step throws.
 */
async function main(): Promise<void> {
  try {
    console.log('\n🔬 AutoResearch — Configuration Wizard\n');

    // Offer presets first
    const presetNames = Object.keys(PRESETS);
    console.log('Available presets:');
    presetNames.forEach((name, i) => {
      console.log(`  [${i + 1}] ${name} — ${PRESETS[name].goal}`);
    });
    console.log(`  [0] Custom config\n`);

    const choice = await ask('Choose preset or 0 for custom: ');
    const idx = parseInt(choice, 10);

    // Any answer in [1, presetCount] picks a preset; 0, NaN, or an
    // out-of-range number falls through to the custom flow.
    if (idx > 0 && idx <= presetNames.length) {
      const name = presetNames[idx - 1];
      const iterations = await ask('Iterations (empty = unbounded): ');
      const iterFlag = iterations ? ` --iterations ${iterations}` : '';
      console.log(`\n✅ Ready to run:\n`);
      console.log(`  npx tsx autoresearch/commands/run.ts --preset ${name}${iterFlag}\n`);
      return;
    }

    // Custom config
    const goal = await ask('Goal (what to improve): ');
    const scope = await ask('Scope (file globs, comma-separated): ');
    const metric = await ask('Metric name (e.g. pass_count, coverage): ');
    // Validate instead of blindly casting the raw answer to the union type:
    // anything other than "lower" (case-insensitive) means "higher".
    const dirAnswer = await ask('Direction (higher/lower): ');
    const direction: 'higher' | 'lower' =
      dirAnswer.trim().toLowerCase() === 'lower' ? 'lower' : 'higher';
    const verify = await ask('Verify command (must output a number): ');

    // Dry-run verify so the user learns immediately whether it produces a
    // usable metric, before committing to a full run.
    console.log('\n  Dry-running verify command...');
    try {
      const output = execSync(verify, { cwd: ROOT, timeout: 120_000, encoding: 'utf-8', stdio: ['pipe', 'pipe', 'pipe'] }).trim();
      const { extractMetric } = await import('../config.js');
      const value = extractMetric(output);
      if (value != null) {
        console.log(`  ✓ Verify works — current ${metric}: ${value}`);
      } else {
        console.log(`  ⚠ Verify ran but no number extracted from output:\n    ${output.slice(0, 200)}`);
      }
    } catch (err) {
      const msg = err instanceof Error ? err.message : String(err);
      console.log(`  ✗ Verify failed: ${msg.slice(0, 100)}`);
    }

    const guard = await ask('Guard command (optional, press Enter to skip): ');
    const iterations = await ask('Iterations (empty = unbounded): ');

    // Quote-escape user answers so the emitted command survives copy-paste
    // even when a value contains quotes, $ or backticks.
    const parts = ['npx tsx autoresearch/commands/run.ts'];
    parts.push(`--goal "${sh(goal)}"`);
    parts.push(`--scope "${sh(scope)}"`);
    parts.push(`--metric "${sh(metric)}"`);
    parts.push(`--direction ${direction}`);
    parts.push(`--verify "${sh(verify)}"`);
    if (guard) parts.push(`--guard "${sh(guard)}"`);
    if (iterations) parts.push(`--iterations ${iterations}`);

    console.log(`\n✅ Ready to run:\n`);
    console.log(`  ${parts.join(' \\\n    ')}\n`);
  } finally {
    rl.close();
  }
}

// Surface unhandled rejections instead of leaving a floating promise.
main().catch(err => {
  console.error(err);
  process.exit(1);
});
Loading