Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions .env.test.example
Original file line number Diff line number Diff line change
Expand Up @@ -63,3 +63,9 @@ E2B_TEMPLATE=base

# E2B 沙箱超时时间(毫秒,可选,默认 300000)
E2B_TIMEOUT_MS=300000

# =============================================================================
# Benchmark (for benchmark tests)
# =============================================================================
# Docker 代理(可选,SWE full 模式 git clone 和 Docker 容器使用)
# BENCHMARK_DOCKER_PROXY=http://127.0.0.1:7897
210 changes: 210 additions & 0 deletions .github/workflows/benchmark.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,210 @@
name: Benchmark Full Suite

on:
  workflow_dispatch:
    inputs:
      benchmark:
        description: "Which benchmark to run"
        type: choice
        required: true
        default: both
        options:
          - all
          - both
          - swe
          - tau
          - tb2
      provider:
        description: "SWE/TAU provider filter"
        type: choice
        required: true
        default: all
        options:
          - all
          - anthropic
          - openai
          - gemini
      tau_domain:
        description: "TAU domain (airline by default for faster runs)"
        type: choice
        required: true
        default: airline
        options:
          - airline
          - retail
          - telecom
          - all
      tb2_model:
        description: "TB2 model in provider/model format"
        type: string
        required: true
        default: openai/glm-5
  push:
    branches:
      - add_benchmark_test
  pull_request:
    branches:
      - main

env:
  NODE_VERSION: "20"

permissions:
  contents: read

jobs:
  benchmark:
    name: Benchmark
    runs-on: ubuntu-latest
    timeout-minutes: 360
    # Opt-in guard: the job runs only when the repo/org variable is set to "1".
    if: ${{ vars.BENCHMARK_ACTION_ENABLED == '1' }}
    env:
      DOCKERHUB_USERNAME: ${{ secrets.DOCKERHUB_USERNAME }}
      DOCKERHUB_TOKEN: ${{ secrets.DOCKERHUB_TOKEN }}

    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - name: Setup Node.js
        uses: actions/setup-node@v4
        with:
          node-version: ${{ env.NODE_VERSION }}
          cache: npm

      - name: Setup uv
        uses: astral-sh/setup-uv@v4

      - name: Login to Docker Hub (optional)
        # Skipped silently when the Docker Hub credentials are not configured.
        if: ${{ env.DOCKERHUB_USERNAME != '' && env.DOCKERHUB_TOKEN != '' }}
        uses: docker/login-action@v3
        with:
          username: ${{ env.DOCKERHUB_USERNAME }}
          password: ${{ env.DOCKERHUB_TOKEN }}

      - name: Install dependencies
        run: npm ci

      - name: Create benchmark environment
        # Secrets/vars are exposed as step env vars and written with `printf '%s'`
        # instead of interpolating `${{ }}` directly into the script: a value that
        # contains quotes, backticks, `$(...)` or newlines can no longer break the
        # script or be re-evaluated by the shell.
        env:
          ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
          ANTHROPIC_MODEL_ID: ${{ vars.ANTHROPIC_MODEL_ID }}
          ANTHROPIC_BASE_URL: ${{ vars.ANTHROPIC_BASE_URL }}
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
          OPENAI_MODEL_ID: ${{ vars.OPENAI_MODEL_ID }}
          OPENAI_BASE_URL: ${{ vars.OPENAI_BASE_URL }}
          GEMINI_API_KEY: ${{ secrets.GEMINI_API_KEY }}
          GEMINI_MODEL_ID: ${{ vars.GEMINI_MODEL_ID }}
          GEMINI_BASE_URL: ${{ vars.GEMINI_BASE_URL }}
          BENCHMARK_DOCKER_PROXY: ${{ vars.BENCHMARK_DOCKER_PROXY }}
          BENCHMARK_TIMEOUT_MS: ${{ vars.BENCHMARK_TIMEOUT_MS }}
        run: |
          {
            printf 'ANTHROPIC_API_KEY=%s\n' "$ANTHROPIC_API_KEY"
            printf 'ANTHROPIC_MODEL_ID=%s\n' "$ANTHROPIC_MODEL_ID"
            printf 'ANTHROPIC_BASE_URL=%s\n' "$ANTHROPIC_BASE_URL"
            printf '\n'
            printf 'OPENAI_API_KEY=%s\n' "$OPENAI_API_KEY"
            printf 'OPENAI_MODEL_ID=%s\n' "$OPENAI_MODEL_ID"
            printf 'OPENAI_BASE_URL=%s\n' "$OPENAI_BASE_URL"
            printf '\n'
            printf 'GEMINI_API_KEY=%s\n' "$GEMINI_API_KEY"
            printf 'GEMINI_MODEL_ID=%s\n' "$GEMINI_MODEL_ID"
            printf 'GEMINI_BASE_URL=%s\n' "$GEMINI_BASE_URL"
            printf '\n'
            printf 'BENCHMARK_DOCKER_PROXY=%s\n' "$BENCHMARK_DOCKER_PROXY"
            printf 'BENCHMARK_TIMEOUT_MS=%s\n' "$BENCHMARK_TIMEOUT_MS"
          } > .env.test

      - name: Run unified benchmark command
        # Workflow inputs are routed through env vars rather than interpolated into
        # the script, so the free-text `tb2_model` input cannot inject shell syntax.
        # `${VAR:-default}` reproduces the `|| 'default'` fallback used on
        # push/pull_request events, where inputs are empty.
        env:
          INPUT_BENCHMARK: ${{ github.event.inputs.benchmark }}
          INPUT_PROVIDER: ${{ github.event.inputs.provider }}
          INPUT_TAU_DOMAIN: ${{ github.event.inputs.tau_domain }}
          INPUT_TB2_MODEL: ${{ github.event.inputs.tb2_model }}
        run: |
          mkdir -p tests/tmp
          benchmark="${INPUT_BENCHMARK:-both}"
          provider="${INPUT_PROVIDER:-all}"
          tau_domain="${INPUT_TAU_DOMAIN:-airline}"
          tb2_model="${INPUT_TB2_MODEL:-openai/glm-5}"

          # Quoted expansions: a value containing whitespace stays one argument.
          args=(
            "--benchmark=${benchmark}"
            "--tau-domain=${tau_domain}"
            "--tb2-model=${tb2_model}"
            --tb2-agent=oracle
            --tb2-runner=uvx
            --tb2-python=3.12
            --tb2-jobs-dir=./tests/tmp/jobs
            --output=json
            --output-file=tests/tmp/benchmark-report.json
          )

          # The provider filter applies only to SWE/TAU runs.
          if [[ "${provider}" != "all" && "${benchmark}" != "tb2" ]]; then
            args+=("--provider=${provider}")
          fi

          npm run test:benchmark -- "${args[@]}"

      - name: Write step summary
        # Runs even when the benchmark step failed, so a partial report still
        # surfaces in the job summary.
        if: ${{ always() }}
        run: |
          node - <<'NODE' >> "$GITHUB_STEP_SUMMARY"
          const fs = require('fs');
          function readJson(p) {
            if (!fs.existsSync(p)) return null;
            try { return JSON.parse(fs.readFileSync(p, 'utf8')); } catch { return null; }
          }

          const report = readJson('tests/tmp/benchmark-report.json');
          console.log('## Benchmark Report');
          console.log('');

          if (!report) {
            console.log('- report not found');
            process.exit(0);
          }

          if (Array.isArray(report.swe) && report.swe.length > 0) {
            console.log('### SWE-bench-Verified');
            console.log('');
            console.log('| Provider / Model | Resolved | Rate |');
            console.log('|---|---:|---:|');
            for (const r of report.swe) {
              const name = `${r.provider.id} / ${r.provider.model}`;
              const resolved = `${r.summary.resolved}/${r.summary.total}`;
              const rate = `${(r.summary.rate * 100).toFixed(1)}%`;
              console.log(`| ${name} | ${resolved} | ${rate} |`);
            }
            console.log('');
          }

          if (Array.isArray(report.tau) && report.tau.length > 0) {
            console.log('### TAU-bench');
            console.log('');
            console.log('| Provider / Model | Domain | Pass^1 | Avg Tokens |');
            console.log('|---|---|---:|---:|');
            for (const r of report.tau) {
              const name = `${r.provider.id} / ${r.provider.model}`;
              const domain = r.summary.domain;
              const pass1 = `${((r.summary.pass_at_k?.[0] ?? 0) * 100).toFixed(1)}%`;
              const observed = (r.summary.token_observed_trials ?? 0) > 0;
              const avgTokens = observed
                ? (r.summary.avg_tokens >= 1000 ? `${(r.summary.avg_tokens / 1000).toFixed(1)}k` : `${r.summary.avg_tokens}`)
                : '-';
              console.log(`| ${name} | ${domain} | ${pass1} | ${avgTokens} |`);
            }
            console.log('');
          }

          if (report.tb2) {
            const tb2 = report.tb2;
            console.log('### Terminal Bench 2.0');
            console.log('');
            console.log(`- Agent: \`${tb2.agent}\``);
            if (tb2.model) console.log(`- Model: \`${tb2.model}\``);
            console.log(`- Passed: **${tb2.passed}/${tb2.total}**`);
            console.log(`- Rate: **${(tb2.rate * 100).toFixed(1)}%**`);
            if (typeof tb2.avg_total_tokens === 'number' && (tb2.token_observed_trials ?? 0) > 0) {
              console.log(`- Avg tokens: **${tb2.avg_total_tokens}** (observed ${tb2.token_observed_trials} trials)`);
            } else {
              console.log(`- Avg tokens: **N/A**`);
            }
            console.log('');
          }
          NODE

      - name: Upload benchmark artifacts
        if: ${{ always() }}
        uses: actions/upload-artifact@v4
        with:
          name: benchmark-artifacts-${{ github.run_id }}
          if-no-files-found: warn
          path: |
            tests/tmp/benchmark-report.json
            tests/tmp/jobs/*/result.json
            tests/tmp/tau2-data/simulations/*.json
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -146,6 +146,7 @@ See [docs/en/guides/architecture.md](./docs/en/guides/architecture.md) for detai
| [Providers](./docs/en/guides/providers.md) | Model provider configuration |
| [Database](./docs/en/guides/database.md) | SQLite/PostgreSQL persistence |
| [Resume & Fork](./docs/en/guides/resume-fork.md) | Crash recovery & branching |
| [Benchmark Results](./docs/en/guides/benchmark-results.md) | Confirmed benchmark score tables |
| **Project** | |
| [Contribution Guide](./docs/en/contribution.md) | How to contribute |
| **Reference** | |
Expand Down
1 change: 1 addition & 0 deletions README.zh-CN.md
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,7 @@ npm run example:room # 多Agent协作
| [Provider 配置](./docs/zh-CN/guides/providers.md) | 模型 Provider 配置 |
| [数据库存储](./docs/zh-CN/guides/database.md) | SQLite/PostgreSQL 持久化 |
| [恢复与分叉](./docs/zh-CN/guides/resume-fork.md) | 崩溃恢复与分支 |
| [Benchmark 结果](./docs/zh-CN/guides/benchmark-results.md) | 已确认的跑分结果表格 |
| **项目** | |
| [贡献指南](./docs/zh-CN/contribution.md) | 提交 PR 的要求与流程 |
| **参考** | |
Expand Down
32 changes: 32 additions & 0 deletions docs/en/guides/benchmark-results.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
# Benchmark Results (Confirmed)

Last updated: 2026-02-26

## SWE-bench-Verified

| Provider / Model | Instances | Resolved | Rate | Avg Tokens | Avg Duration |
|---|---:|---:|---:|---:|---:|
| openai / glm-5 | 12 | 12/12 | 100.0% | 17.2k | 134.5k ms |

Source: local full run log (`2026-02-25__21-06-21`).

## Terminal Bench 2.0

| Agent / Model | Passed | Parseable | Unknown | Rate (parseable) | Notes |
|---|---:|---:|---:|---:|---|
| oracle / glm-5 | 1 | 31 | 58 | 3.2% | From the same full run; many tasks ended with runtime/timeout errors. |

## Reproduce

```bash
npm run test:benchmark -- \
--benchmark=both \
--tb2-model=openai/glm-5 \
--tb2-agent=oracle \
--tb2-runner=uvx \
--tb2-jobs-dir=./tests/tmp/jobs \
--output=json \
--output-file=tests/tmp/benchmark-report.json
```

The JSON report includes both `swe` and `tb2` sections.
Loading