Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 5 additions & 3 deletions autoresearch/browse-tasks.json
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,8 @@
"name": "extract-npm-description",
"steps": [
"opencli operate open https://www.npmjs.com/package/express",
"opencli operate eval \"document.querySelector('p[class*=description], [data-testid=package-description], #readme p')?.textContent?.trim()\""
"opencli operate wait time 2",
"opencli operate eval \"(function(){var ps=document.querySelectorAll('p');for(var i=0;i<ps.length;i++){var t=ps[i].textContent.trim();if(t.length>10&&t.length<200)return t;}return '';})()\""
],
"judge": {
"type": "nonEmpty"
Expand Down Expand Up @@ -294,7 +295,7 @@
"opencli operate open https://example.com",
"opencli operate eval \"document.querySelector('a')?.click();'clicked'\"",
"opencli operate wait time 2",
"opencli operate eval \"document.title\""
"opencli operate eval \"document.title + ' ' + location.href\""
],
"judge": {
"type": "contains",
Expand Down Expand Up @@ -580,7 +581,8 @@
"name": "bench-imdb-matrix",
"steps": [
"opencli operate open https://www.imdb.com/title/tt0133093/",
"opencli operate eval \"JSON.stringify({title:document.querySelector('h1 span, [data-testid=hero__pageTitle] span')?.textContent,year:document.querySelector('a[href*=releaseinfo], [data-testid=hero-title-block__metadata] a')?.textContent})\""
"opencli operate wait time 3",
"opencli operate eval \"(function(){var title=document.querySelector('h1')?.textContent?.trim()||'';var year='';var links=document.querySelectorAll('a');for(var i=0;i<links.length;i++){if(links[i].textContent.trim()==='1999'){year='1999';break;}}var rating=document.querySelector('[data-testid=hero-rating-bar__aggregate-rating__score] span, .sc-bde20123-1')?.textContent?.trim()||'';return JSON.stringify({title:title,year:year,rating:rating});})()\""
],
"judge": {
"type": "contains",
Expand Down
127 changes: 127 additions & 0 deletions autoresearch/eval-all.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
#!/usr/bin/env npx tsx
/**
* Combined Test Suite Runner — runs browse + V2EX + Zhihu tasks.
* Reports combined score for AutoResearch iteration.
*
* Usage:
* npx tsx autoresearch/eval-all.ts # Run all
* npx tsx autoresearch/eval-all.ts --suite v2ex # Run one suite
*/

import { execSync } from 'node:child_process';
import { readFileSync, writeFileSync, mkdirSync, readdirSync } from 'node:fs';
import { join, dirname } from 'node:path';
import { fileURLToPath } from 'node:url';

// ES modules have no built-in __dirname, so derive this file's directory from import.meta.url.
const __dirname = dirname(fileURLToPath(import.meta.url));
// Parent of this file's directory; used as the working directory when spawning suite scripts.
const ROOT = join(__dirname, '..');
// Directory next to this file where combined-run result JSON files are written.
const RESULTS_DIR = join(__dirname, 'results');

/** Outcome of running a single suite script, as parsed from its captured output. */
interface SuiteResult {
/** Suite label passed to runSuite (e.g. the `name` of an entry in main's suite list). */
name: string;
/** First number of the `SCORE=X/Y` marker; 0 when no marker was found. */
passed: number;
/** Second number of the `SCORE=X/Y` marker; 0 when no marker was found. */
total: number;
/** First token following each `✗` in the output, with any trailing colon removed. */
failures: string[];
/** Wall-clock time of the run in milliseconds (difference of two Date.now() readings). */
duration: number;
}

/**
 * Extracts the score and failed-task names from a suite's captured stdout.
 *
 * - Score: the first `SCORE=<passed>/<total>` marker; both numbers are 0 when absent.
 * - Failures: for every line containing `✗`, the first whitespace-delimited token
 *   after it (an optional leading `[tag]` is skipped, a trailing colon is dropped).
 *
 * `scored` tells the caller whether a SCORE marker was actually present, so that
 * "0/0 because nothing ran" can be distinguished from a real score.
 */
function parseSuiteOutput(output: string): {
  passed: number;
  total: number;
  failures: string[];
  scored: boolean;
} {
  const scoreMatch = output.match(/SCORE=(\d+)\/(\d+)/);
  const passed = scoreMatch ? parseInt(scoreMatch[1], 10) : 0;
  const total = scoreMatch ? parseInt(scoreMatch[2], 10) : 0;

  const failures: string[] = [];
  for (const line of output.match(/✗.*$/gm) ?? []) {
    const m = line.match(/✗\s+(?:\[.*?\]\s+)?(\S+)/);
    if (m) failures.push(m[1].replace(/:$/, ''));
  }

  return { passed, total, failures, scored: scoreMatch !== null };
}

/**
 * Runs one suite script via `npx tsx` and summarizes its output.
 *
 * The child runs with ROOT as its working directory, inherits the current
 * environment, and is killed after 10 minutes. A non-zero exit or timeout does
 * not throw: whatever stdout was captured before the failure is still parsed.
 *
 * If no `SCORE=X/Y` marker is found (the suite crashed, timed out, or printed
 * nothing useful), a synthetic `[suite-error] ...` entry is appended to
 * `failures` so the problem is visible instead of showing up as a bare 0/0.
 *
 * @param name   Label stored in the returned result.
 * @param script Script path handed to `npx tsx`, resolved relative to ROOT.
 *               It is interpolated into a shell command, so it must be a
 *               trusted value (main passes hard-coded paths only).
 * @returns The parsed score, failure names and elapsed time in milliseconds.
 */
function runSuite(name: string, script: string): SuiteResult {
  const start = Date.now();
  let output: string;
  let processError: string | null = null;

  try {
    output = execSync(`npx tsx ${script}`, {
      cwd: ROOT,
      timeout: 600_000,
      encoding: 'utf-8',
      env: process.env,
      stdio: ['pipe', 'pipe', 'pipe'],
    });
  } catch (err: unknown) {
    // execSync attaches the captured stdout to the thrown error; it can be
    // missing (e.g. spawn failure), so only accept an actual string.
    const stdout = (err as { stdout?: unknown } | null)?.stdout;
    output = typeof stdout === 'string' ? stdout : '';
    // Keep only the first line: the full message also embeds the child's stderr.
    processError = err instanceof Error ? err.message.split('\n', 1)[0] : String(err);
  }

  const { passed, total, failures, scored } = parseSuiteOutput(output);
  if (!scored) {
    failures.push(`[suite-error] ${processError ?? 'no SCORE=X/Y line in output'}`);
  }

  return { name, passed, total, failures, duration: Date.now() - start };
}

/**
 * CLI entry point: runs the selected suites sequentially, prints a per-suite
 * and combined summary, and writes the combined result to
 * `RESULTS_DIR/all-NNN.json`.
 *
 * Arguments (from process.argv):
 *   --suite <name>   Run only the named suite. An unknown or missing name is
 *                    reported on stderr and sets a non-zero exit code without
 *                    running anything or writing a result file.
 *
 * The final line printed is `SCORE=<passed>/<total>` for the combined run.
 */
function main(): void {
  const args = process.argv.slice(2);

  const allSuites = [
    { name: 'browse', script: 'autoresearch/eval-browse.ts' },
    { name: 'v2ex', script: 'autoresearch/eval-v2ex.ts' },
    { name: 'zhihu', script: 'autoresearch/eval-zhihu.ts' },
  ];

  // Validate --suite up front: silently running nothing (unknown name) or
  // everything (missing value) would produce a misleading result file.
  const flagIndex = args.indexOf('--suite');
  let singleSuite: string | null = null;
  if (flagIndex !== -1) {
    const requested = args[flagIndex + 1];
    if (!requested || !allSuites.some(s => s.name === requested)) {
      const available = allSuites.map(s => s.name).join(', ');
      console.error(`Unknown or missing suite name after --suite: ${requested ?? '(none)'}. Available: ${available}`);
      process.exitCode = 1;
      return;
    }
    singleSuite = requested;
  }

  const suites = allSuites.filter(s => singleSuite === null || s.name === singleSuite);

  console.log(`\n🔬 Combined AutoResearch — ${suites.length} suites\n`);

  const results: SuiteResult[] = [];
  for (const suite of suites) {
    console.log(` Running ${suite.name}...`);
    const result = runSuite(suite.name, suite.script);
    results.push(result);
    // A 0/0 result means the suite reported no score at all; that is not a pass.
    const icon = result.total > 0 && result.passed === result.total ? '✓' : '✗';
    console.log(` ${icon} ${result.name}: ${result.passed}/${result.total} (${Math.round(result.duration / 1000)}s)`);
    // Show at most five failures here; the full list is printed in the summary.
    for (const f of result.failures.slice(0, 5)) {
      console.log(` ✗ ${f}`);
    }
  }

  // Summary
  const totalPassed = results.reduce((s, r) => s + r.passed, 0);
  const totalTasks = results.reduce((s, r) => s + r.total, 0);
  const totalDuration = results.reduce((s, r) => s + r.duration, 0);
  const allFailures = results.flatMap(r => r.failures.map(f => `${r.name}:${f}`));

  console.log(`\n${'━'.repeat(50)}`);
  console.log(` Combined: ${totalPassed}/${totalTasks}`);
  for (const r of results) {
    console.log(` ${r.name}: ${r.passed}/${r.total}`);
  }
  console.log(` Time: ${Math.round(totalDuration / 60000)}min`);
  if (allFailures.length > 0) {
    console.log(`\n All failures:`);
    for (const f of allFailures) console.log(` ✗ ${f}`);
  }

  // Save result
  mkdirSync(RESULTS_DIR, { recursive: true });
  // Number the new file after the highest existing round rather than the file
  // count, so a gap (deleted result) can never cause an existing file to be
  // overwritten.
  const lastRound = readdirSync(RESULTS_DIR).reduce((max, file) => {
    const m = /^all-(\d+)\.json$/.exec(file);
    return m ? Math.max(max, parseInt(m[1], 10)) : max;
  }, 0);
  const roundNum = String(lastRound + 1).padStart(3, '0');
  const resultPath = join(RESULTS_DIR, `all-${roundNum}.json`);
  writeFileSync(resultPath, JSON.stringify({
    timestamp: new Date().toISOString(),
    score: `${totalPassed}/${totalTasks}`,
    suites: Object.fromEntries(results.map(r => [r.name, `${r.passed}/${r.total}`])),
    failures: allFailures,
    duration: `${Math.round(totalDuration / 60000)}min`,
  }, null, 2), 'utf-8');
  console.log(`\n Results saved to: ${resultPath}`);
  console.log(`\nSCORE=${totalPassed}/${totalTasks}`);
}

// Script entry point: run the combined evaluation when this file is executed.
main();
2 changes: 1 addition & 1 deletion autoresearch/eval-browse.ts
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ function runCommand(cmd: string): string {
stdio: ['pipe', 'pipe', 'pipe'],
}).trim();
} catch (err: any) {
return err.stdout?.trim() ?? '';
return err.stdout?.trim() || err.stderr?.trim() || '';
}
}

Expand Down
22 changes: 14 additions & 8 deletions autoresearch/eval-skill.ts
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@ import { join, dirname } from 'node:path';
import { fileURLToPath } from 'node:url';

const __dirname = dirname(fileURLToPath(import.meta.url));
const TASKS_FILE = join(__dirname, 'skill-tasks.yaml');
const RESULTS_DIR = join(__dirname, 'results');
const SKILL_PATH = join(__dirname, '..', 'skills', 'opencli-operate', 'SKILL.md');

Expand Down Expand Up @@ -160,13 +159,20 @@ Always close the browser with 'opencli operate close' when done.`;
}

function extractVerdict(text: string): { success: boolean; explanation: string } {
// Try to find {"success": ...} JSON in the text
const jsonMatches = text.match(/\{"success"\s*:\s*(true|false)\s*,\s*"explanation"\s*:\s*"([^"]*)"\s*\}/g);
if (jsonMatches) {
const last = jsonMatches[jsonMatches.length - 1];
try {
return JSON.parse(last);
} catch { /* fall through */ }
// Try to find and parse {"success": ...} JSON from the last occurrence
const idx = text.lastIndexOf('{"success"');
if (idx !== -1) {
// Find the matching closing brace (handle escaped quotes in explanation)
const sub = text.slice(idx);
let braceCount = 0;
let end = -1;
for (let i = 0; i < sub.length; i++) {
if (sub[i] === '{') braceCount++;
else if (sub[i] === '}') { braceCount--; if (braceCount === 0) { end = i + 1; break; } }
}
if (end > 0) {
try { return JSON.parse(sub.slice(0, end)); } catch { /* fall through */ }
}
}

// Fallback: check for success indicators in text
Expand Down
Loading
Loading