Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
163 changes: 163 additions & 0 deletions autoresearch/commands/debug.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,163 @@
#!/usr/bin/env npx tsx
/**
* /autoresearch:debug — Hypothesis-driven debugging for specific failing tasks.
*
* Scientific method: Gather → Hypothesize → Test → Classify → Log → Repeat
*
* Usage:
* npx tsx autoresearch/commands/debug.ts --task extract-npm-description
* npx tsx autoresearch/commands/debug.ts --task bench-imdb-matrix --iterations 5
*/

import { execSync } from 'node:child_process';
import { readFileSync, appendFileSync, writeFileSync, existsSync } from 'node:fs';
import { join, dirname } from 'node:path';
import { fileURLToPath } from 'node:url';
import { parseArgs } from '../config.js';

// Resolve paths relative to this file so the script works from any cwd.
// ROOT is the repository root (two levels up from autoresearch/commands/).
const __dirname = dirname(fileURLToPath(import.meta.url));
const ROOT = join(__dirname, '..', '..');
// Task definitions consumed by this debugger.
const TASKS_FILE = join(__dirname, '..', 'browse-tasks.json');
// Tab-separated log of every debug iteration (see initLog/appendLog).
const DEBUG_LOG = join(ROOT, 'debug-results.tsv');

/**
 * One entry from browse-tasks.json: a named sequence of shell steps whose
 * final output is evaluated against a judge specification.
 */
interface BrowseTask {
  name: string;     // unique task identifier, matched against --task
  steps: string[];  // shell commands run in order; the last one's output is judged
  // Judge criteria; which optional fields apply presumably depends on `type` —
  // TODO confirm against the judge implementation.
  judge: { type: string; value?: string; minLength?: number; pattern?: string };
}

/**
 * Run a shell command in the repo root and return its trimmed stdout.
 *
 * Never throws: on failure it falls back to the child's captured stdout
 * (diagnostics often land there), then the error message, then ''.
 *
 * @param cmd Shell command line to execute.
 * @returns Trimmed stdout, or best-effort error text on failure.
 */
function exec(cmd: string): string {
  try {
    return execSync(cmd, {
      cwd: ROOT, timeout: 30_000, encoding: 'utf-8',
      stdio: ['pipe', 'pipe', 'pipe'],
    }).trim();
  } catch (err) {
    // execSync failures carry the child's stdout as an extra property;
    // narrow the unknown catch variable instead of using `any`.
    const e = err as { stdout?: string; message?: string };
    return e.stdout?.trim() ?? e.message ?? '';
  }
}

/** Create the TSV debug log with its comment header and column row, unless it already exists. */
function initLog(): void {
  if (existsSync(DEBUG_LOG)) return;
  const header = '# AutoResearch Debug Log\niteration\ttask\thypothesis\tresult\tverdict\tdescription\n';
  writeFileSync(DEBUG_LOG, header, 'utf-8');
}

/** Append one tab-separated iteration record to the debug log. */
function appendLog(iteration: number, task: string, hypothesis: string, result: string, verdict: string, description: string): void {
  const row = [iteration, task, hypothesis, result, verdict, description].join('\t');
  appendFileSync(DEBUG_LOG, `${row}\n`, 'utf-8');
}

/** Escape text for safe embedding inside a double-quoted POSIX shell string. */
function shellQuote(s: string): string {
  // Inside double quotes the shell still interprets \ " $ and ` — the
  // original escaped only '"', which left a command-injection hole via
  // $(...) or backticks in the task output interpolated into the prompt.
  return s.replace(/[\\"$`]/g, '\\$&');
}

/**
 * Entry point.
 *
 * Phase 1: run the target task's steps once and capture the final output.
 * Phase 2: loop up to --iterations times asking the Claude CLI to form a
 * falsifiable hypothesis, test it, and classify the result; every iteration
 * is appended to DEBUG_LOG. Stops early when a hypothesis is CONFIRMED.
 */
async function main(): Promise<void> {
  const args = parseArgs(process.argv.slice(2));
  const taskName = args.task;
  const maxIterations = args.iterations ?? 10;

  if (!taskName) {
    console.error('Usage: npx tsx autoresearch/commands/debug.ts --task <task-name> [--iterations N]');
    console.error('\nAvailable tasks:');
    const tasks: BrowseTask[] = JSON.parse(readFileSync(TASKS_FILE, 'utf-8'));
    // Show only failing tasks. exec() traps its own errors, so the original
    // `try { exec(...) } catch {}` wrappers were dead code and are dropped.
    for (const task of tasks) {
      exec('opencli operate close'); // reset browser state between tasks
      let lastOutput = '';
      for (const step of task.steps) lastOutput = exec(step);
      const passed = lastOutput.trim().length > 0; // simplified check
      if (!passed) console.error(`  ✗ ${task.name}`);
    }
    process.exit(1);
  }

  const tasks: BrowseTask[] = JSON.parse(readFileSync(TASKS_FILE, 'utf-8'));
  const task = tasks.find(t => t.name === taskName);
  if (!task) {
    console.error(`Task not found: ${taskName}`);
    process.exit(1);
  }

  console.log(`\n🔍 AutoResearch Debug: ${taskName}`);
  console.log(`   Steps: ${task.steps.length}`);
  console.log(`   Judge: ${task.judge.type}${task.judge.value ? ` "${task.judge.value}"` : ''}`);
  console.log(`   Max iterations: ${maxIterations}\n`);

  initLog();

  // Phase 1: Gather — run the task and capture output
  console.log('Phase 1: Gathering symptoms...');
  exec('opencli operate close');

  let lastOutput = '';
  for (let i = 0; i < task.steps.length; i++) {
    const step = task.steps[i];
    console.log(`  Step ${i + 1}: ${step.slice(0, 80)}`);
    lastOutput = exec(step);
    if (i < task.steps.length - 1) {
      console.log(`    → ${lastOutput.slice(0, 100)}`);
    }
  }
  console.log(`\n  Final output: ${lastOutput.slice(0, 200)}`);
  console.log(`  Judge expects: ${JSON.stringify(task.judge)}`);

  // Phase 2: Hypothesize + investigate via Claude Code
  for (let iter = 1; iter <= maxIterations; iter++) {
    console.log(`\n━━━ Debug Iteration ${iter}/${maxIterations} ━━━`);

    const prompt = `You are debugging a failing browser automation task.

## Task: ${taskName}
Steps:
${task.steps.map((s, i) => `  ${i + 1}. ${s}`).join('\n')}

## Judge criteria
${JSON.stringify(task.judge)}

## Last output
${lastOutput.slice(0, 500)}

## Instructions
1. Form a SPECIFIC, FALSIFIABLE hypothesis about why this task fails
2. Run the MINIMUM experiment to test your hypothesis (e.g. run one step, check output)
3. Classify: CONFIRMED (bug found), DISPROVEN (try different hypothesis), INCONCLUSIVE
4. If CONFIRMED: describe the root cause and suggest a fix
5. Output format: one line "HYPOTHESIS: ...", one line "RESULT: CONFIRMED|DISPROVEN|INCONCLUSIVE — ..."

Do NOT fix the code — just diagnose. Use opencli operate commands to investigate.`;

    try {
      const result = execSync(
        `claude -p --dangerously-skip-permissions --allowedTools "Bash(opencli:*),Bash(npm:*),Read,Grep,Glob" --output-format text --no-session-persistence "${shellQuote(prompt)}"`,
        { cwd: ROOT, timeout: 120_000, encoding: 'utf-8', stdio: ['pipe', 'pipe', 'pipe'] }
      ).trim();

      // Extract hypothesis and result from the agent's free-form reply.
      const hypMatch = result.match(/HYPOTHESIS:\s*(.+)/i);
      const resMatch = result.match(/RESULT:\s*(CONFIRMED|DISPROVEN|INCONCLUSIVE)\s*[-—]\s*(.+)/i);

      const hypothesis = hypMatch?.[1]?.trim() ?? 'unknown';
      // The match is case-insensitive, so normalize before comparing below —
      // the original compared the raw capture and could miss "confirmed".
      const verdict = (resMatch?.[1]?.trim() ?? 'INCONCLUSIVE').toUpperCase();
      const description = resMatch?.[2]?.trim() ?? result.split('\n').pop()?.trim() ?? '';

      console.log(`  Hypothesis: ${hypothesis.slice(0, 100)}`);
      console.log(`  Verdict: ${verdict} — ${description.slice(0, 100)}`);

      appendLog(iter, taskName, hypothesis, lastOutput.slice(0, 50), verdict, description);

      if (verdict === 'CONFIRMED') {
        console.log(`\n✅ Root cause found at iteration ${iter}!`);
        console.log(`   ${description}`);
        break;
      }
    } catch (err) {
      // execSync throws on non-zero exit or timeout from the claude CLI.
      const msg = err instanceof Error ? err.message : String(err);
      console.error(`  Error: ${msg.slice(0, 100)}`);
      appendLog(iter, taskName, 'error', '', 'CRASH', msg.slice(0, 80));
    }

    // Re-run task for fresh output before the next hypothesis.
    exec('opencli operate close');
    for (const step of task.steps) lastOutput = exec(step);
  }

  exec('opencli operate close');
  console.log(`\nDebug log saved to: ${DEBUG_LOG}\n`);
}

// Surface unhandled rejections instead of leaving a floating promise.
main().catch(err => {
  console.error(err);
  process.exit(1);
});
145 changes: 145 additions & 0 deletions autoresearch/commands/fix.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,145 @@
#!/usr/bin/env npx tsx
/**
* /autoresearch:fix — Iterative error elimination.
*
* Auto-detects broken state (build → test → browse tests) and iteratively
* fixes errors one at a time. Stops when error count reaches 0.
*
* Priority: build errors → test failures → browse task failures
*
* Usage:
* npx tsx autoresearch/commands/fix.ts
* npx tsx autoresearch/commands/fix.ts --iterations 10
*/

import { execSync } from 'node:child_process';
import { join, dirname } from 'node:path';
import { fileURLToPath } from 'node:url';
import { parseArgs } from '../config.js';
import { Engine, type ModifyContext } from '../engine.js';

// Resolve the repository root relative to this file so the script works
// regardless of the caller's cwd.
const __dirname = dirname(fileURLToPath(import.meta.url));
const ROOT = join(__dirname, '..', '..');

/**
 * Run a shell command in the repo root.
 *
 * Never throws. On success returns ok=true with trimmed stdout; on failure
 * returns ok=false with the child's stdout and stderr concatenated, since
 * compilers and test runners write diagnostics to either stream.
 *
 * @param cmd Shell command line to execute.
 */
function exec(cmd: string): { ok: boolean; output: string } {
  try {
    const output = execSync(cmd, {
      cwd: ROOT, timeout: 120_000, encoding: 'utf-8',
      stdio: ['pipe', 'pipe', 'pipe'],
    }).trim();
    return { ok: true, output };
  } catch (err) {
    // Narrow the unknown catch value instead of `any`; execSync attaches
    // the captured streams as extra properties on the thrown error.
    const e = err as { stdout?: string; stderr?: string };
    return { ok: false, output: (e.stdout ?? '') + '\n' + (e.stderr ?? '') };
  }
}

/**
 * Probe the project in priority order (build → tests → browse eval) and
 * report the first broken stage: a verify command that prints the error
 * count, the current count, and a human-readable description.
 * Returns null when every stage is clean.
 */
function detectBrokenState(): { verify: string; errors: number; description: string } | null {
  // Stage 1: TypeScript build.
  const buildResult = exec('npm run build 2>&1');
  if (!buildResult.ok) {
    const tsErrors = (buildResult.output.match(/error TS/g) ?? []).length || 1;
    return {
      verify: 'npm run build 2>&1 | grep -c "error TS" || echo 0',
      errors: tsErrors,
      description: `${tsErrors} TypeScript build error(s)`,
    };
  }

  // Stage 2: unit tests.
  const testResult = exec('npm test 2>&1');
  if (!testResult.ok) {
    const failMatch = testResult.output.match(/(\d+)\s+fail/i);
    const failing = failMatch ? parseInt(failMatch[1], 10) : 1;
    return {
      verify: 'npm test 2>&1 | grep -oP "\\d+(?= fail)" || echo 0',
      errors: failing,
      description: `${failing} test failure(s)`,
    };
  }

  // Stage 3: browse task evaluation (reports a SCORE=passed/total line).
  const browseResult = exec('npx tsx autoresearch/eval-browse.ts 2>&1');
  const score = browseResult.output.match(/SCORE=(\d+)\/(\d+)/);
  if (score) {
    const passed = parseInt(score[1], 10);
    const total = parseInt(score[2], 10);
    const failing = total - passed;
    if (failing > 0) {
      return {
        verify: 'npx tsx autoresearch/eval-browse.ts 2>&1 | tail -1',
        errors: failing,
        description: `${failing} browse task failure(s) (${passed}/${total})`,
      };
    }
  }

  return null; // all three stages clean
}

/** Escape text for safe embedding inside a double-quoted POSIX shell string. */
function shellQuote(s: string): string {
  // Inside double quotes the shell still interprets \ " $ and ` — the
  // original escaped only '"', allowing command injection via $(...) or
  // backticks in the interpolated error context.
  return s.replace(/[\\"$`]/g, '\\$&');
}

/**
 * Entry point: detect the first broken stage (build → tests → browse),
 * then drive the Engine loop, prompting the Claude CLI to fix one error
 * per iteration until the error count reaches 0 or the budget runs out.
 */
async function main(): Promise<void> {
  const args = parseArgs(process.argv.slice(2));
  const maxIterations = args.iterations ?? 20;

  console.log('\n🔧 AutoResearch Fix — Detecting broken state...\n');

  const broken = detectBrokenState();
  if (!broken) {
    console.log('  ✓ All clean — nothing to fix!\n');
    return;
  }

  console.log(`  Found: ${broken.description}`);
  console.log(`  Verify: ${broken.verify}\n`);

  const config = {
    goal: `Fix all errors: ${broken.description}`,
    scope: ['src/**/*.ts', 'extension/src/**/*.ts'],
    metric: 'error_count',
    direction: 'lower' as const, // fewer errors is better
    verify: broken.verify,
    guard: 'npm run build', // never accept a change that breaks the build
    iterations: maxIterations,
    minDelta: 1,
  };

  const logPath = join(ROOT, 'autoresearch-results.tsv');
  const engine = new Engine(config, logPath, {
    modify: async (ctx: ModifyContext) => {
      const prompt = `Fix ONE error. Current error count: ${ctx.currentMetric}. Goal: 0 errors.

Read the error output, understand the root cause, and make ONE focused fix.
Do NOT fix multiple unrelated errors at once.
Do NOT modify test files.

${ctx.stuckHint ? `STUCK HINT: ${ctx.stuckHint}` : ''}`;

      try {
        const result = execSync(
          `claude -p --dangerously-skip-permissions --allowedTools "Bash(npm:*),Bash(npx:*),Read,Edit,Write,Glob,Grep" --output-format text --no-session-persistence "${shellQuote(prompt)}"`,
          { cwd: ROOT, timeout: 180_000, encoding: 'utf-8', stdio: ['pipe', 'pipe', 'pipe'] }
        ).trim();
        // Use the agent's last non-empty line as the iteration summary.
        const lines = result.split('\n').filter(l => l.trim());
        return lines[lines.length - 1]?.trim()?.slice(0, 120) || 'fix attempt';
      } catch {
        // A crashed modify attempt is reported to the engine as no change.
        return null;
      }
    },
    onStatus: (msg) => console.log(msg),
  });

  try {
    const results = await engine.run();
    const finalMetric = results[results.length - 1]?.metric ?? broken.errors;
    if (finalMetric === 0) {
      console.log('\n✅ All errors fixed!\n');
    } else {
      console.log(`\n⚠ ${finalMetric} error(s) remaining after ${maxIterations} iterations.\n`);
    }
  } catch (err) {
    // Narrow the unknown catch value instead of assuming `any`.
    const msg = err instanceof Error ? err.message : String(err);
    console.error(`\n❌ ${msg}`);
    process.exit(1);
  }
}

// Surface unhandled rejections instead of leaving a floating promise.
main().catch(err => {
  console.error(err);
  process.exit(1);
});
88 changes: 88 additions & 0 deletions autoresearch/commands/plan.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
#!/usr/bin/env npx tsx
/**
* /autoresearch:plan — Interactive configuration wizard.
*
* Walks through goal, scope, metric, verify, guard settings
* and outputs a ready-to-paste run command.
*
* Usage:
* npx tsx autoresearch/commands/plan.ts
*/

import { execSync } from 'node:child_process';
import { createInterface } from 'node:readline';
import { join, dirname } from 'node:path';
import { fileURLToPath } from 'node:url';
import { PRESETS } from '../presets/index.js';

// Resolve the repository root relative to this file so the wizard works
// regardless of the caller's cwd.
const __dirname = dirname(fileURLToPath(import.meta.url));
const ROOT = join(__dirname, '..', '..');

// Shared readline interface over stdin/stdout, plus a promisified prompt
// helper so answers can be awaited.
const rl = createInterface({ input: process.stdin, output: process.stdout });
const ask = (q: string): Promise<string> => new Promise(r => rl.question(q, r));

/** Escape text for safe embedding inside a double-quoted POSIX shell string. */
const sh = (s: string): string => s.replace(/[\\"$`]/g, '\\$&');

/**
 * Entry point: offer a preset, or walk through a custom config
 * (goal/scope/metric/direction/verify/guard), dry-run the verify command,
 * and print a ready-to-paste run command. The readline interface is
 * always closed, even when a step throws.
 */
async function main(): Promise<void> {
  try {
    console.log('\n🔬 AutoResearch — Configuration Wizard\n');

    // Offer presets first
    const presetNames = Object.keys(PRESETS);
    console.log('Available presets:');
    presetNames.forEach((name, i) => {
      console.log(`  [${i + 1}] ${name} — ${PRESETS[name].goal}`);
    });
    console.log(`  [0] Custom config\n`);

    const choice = await ask('Choose preset or 0 for custom: ');
    const idx = parseInt(choice, 10);

    // Any answer in [1, presetCount] picks a preset; 0, NaN, or an
    // out-of-range number falls through to the custom flow.
    if (idx > 0 && idx <= presetNames.length) {
      const name = presetNames[idx - 1];
      const iterations = await ask('Iterations (empty = unbounded): ');
      const iterFlag = iterations ? ` --iterations ${iterations}` : '';
      console.log(`\n✅ Ready to run:\n`);
      console.log(`  npx tsx autoresearch/commands/run.ts --preset ${name}${iterFlag}\n`);
      return;
    }

    // Custom config
    const goal = await ask('Goal (what to improve): ');
    const scope = await ask('Scope (file globs, comma-separated): ');
    const metric = await ask('Metric name (e.g. pass_count, coverage): ');
    // Validate instead of blindly casting the raw answer to the union type:
    // anything other than "lower" (case-insensitive) means "higher".
    const dirAnswer = await ask('Direction (higher/lower): ');
    const direction: 'higher' | 'lower' =
      dirAnswer.trim().toLowerCase() === 'lower' ? 'lower' : 'higher';
    const verify = await ask('Verify command (must output a number): ');

    // Dry-run verify so the user learns immediately whether it produces a
    // usable metric, before committing to a full run.
    console.log('\n  Dry-running verify command...');
    try {
      const output = execSync(verify, { cwd: ROOT, timeout: 120_000, encoding: 'utf-8', stdio: ['pipe', 'pipe', 'pipe'] }).trim();
      const { extractMetric } = await import('../config.js');
      const value = extractMetric(output);
      if (value != null) {
        console.log(`  ✓ Verify works — current ${metric}: ${value}`);
      } else {
        console.log(`  ⚠ Verify ran but no number extracted from output:\n    ${output.slice(0, 200)}`);
      }
    } catch (err) {
      const msg = err instanceof Error ? err.message : String(err);
      console.log(`  ✗ Verify failed: ${msg.slice(0, 100)}`);
    }

    const guard = await ask('Guard command (optional, press Enter to skip): ');
    const iterations = await ask('Iterations (empty = unbounded): ');

    // Quote-escape user answers so the emitted command survives copy-paste
    // even when a value contains quotes, $ or backticks.
    const parts = ['npx tsx autoresearch/commands/run.ts'];
    parts.push(`--goal "${sh(goal)}"`);
    parts.push(`--scope "${sh(scope)}"`);
    parts.push(`--metric "${sh(metric)}"`);
    parts.push(`--direction ${direction}`);
    parts.push(`--verify "${sh(verify)}"`);
    if (guard) parts.push(`--guard "${sh(guard)}"`);
    if (iterations) parts.push(`--iterations ${iterations}`);

    console.log(`\n✅ Ready to run:\n`);
    console.log(`  ${parts.join(' \\\n    ')}\n`);
  } finally {
    rl.close();
  }
}

// Surface unhandled rejections instead of leaving a floating promise.
main().catch(err => {
  console.error(err);
  process.exit(1);
});
Loading