Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions .env.test.example
Original file line number Diff line number Diff line change
Expand Up @@ -63,3 +63,9 @@ E2B_TEMPLATE=base

# E2B 沙箱超时时间(毫秒,可选,默认 300000)
E2B_TIMEOUT_MS=300000

# =============================================================================
# Benchmark (for benchmark tests)
# =============================================================================
# Docker 代理(可选,SWE full 模式 git clone 和 Docker 容器使用)
# BENCHMARK_DOCKER_PROXY=http://127.0.0.1:7897
210 changes: 210 additions & 0 deletions .github/workflows/benchmark.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,210 @@
name: Benchmark Full Suite

on:
  workflow_dispatch:
    inputs:
      benchmark:
        description: "Which benchmark to run"
        type: choice
        required: true
        default: both
        options:
          - all
          - both
          - swe
          - tau
          - tb2
      provider:
        description: "SWE/TAU provider filter"
        type: choice
        required: true
        default: all
        options:
          - all
          - anthropic
          - openai
          - gemini
      tau_domain:
        description: "TAU domain (airline by default for faster runs)"
        type: choice
        required: true
        default: airline
        options:
          - airline
          - retail
          - telecom
          - all
      tb2_model:
        description: "TB2 model in provider/model format"
        type: string
        required: true
        default: openai/glm-5
  push:
    branches:
      - add_benchmark_test
  pull_request:
    branches:
      - main

env:
  NODE_VERSION: "20"

permissions:
  contents: read

jobs:
  benchmark:
    name: Benchmark
    runs-on: ubuntu-latest
    timeout-minutes: 360
    # Opt-in guard: the job runs only when the repo/org variable is set to "1".
    if: ${{ vars.BENCHMARK_ACTION_ENABLED == '1' }}
    env:
      DOCKERHUB_USERNAME: ${{ secrets.DOCKERHUB_USERNAME }}
      DOCKERHUB_TOKEN: ${{ secrets.DOCKERHUB_TOKEN }}

    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - name: Setup Node.js
        uses: actions/setup-node@v4
        with:
          node-version: ${{ env.NODE_VERSION }}
          cache: npm

      - name: Setup uv
        uses: astral-sh/setup-uv@v4

      - name: Login to Docker Hub (optional)
        # Skipped silently when the Docker Hub credentials are not configured.
        if: ${{ env.DOCKERHUB_USERNAME != '' && env.DOCKERHUB_TOKEN != '' }}
        uses: docker/login-action@v3
        with:
          username: ${{ env.DOCKERHUB_USERNAME }}
          password: ${{ env.DOCKERHUB_TOKEN }}

      - name: Install dependencies
        run: npm ci

      - name: Create benchmark environment
        # Secrets/vars are exposed as step env vars and written with `printf '%s'`
        # instead of interpolating `${{ }}` directly into the script: a value that
        # contains quotes, backticks, `$(...)` or newlines can no longer break the
        # script or be re-evaluated by the shell.
        env:
          ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
          ANTHROPIC_MODEL_ID: ${{ vars.ANTHROPIC_MODEL_ID }}
          ANTHROPIC_BASE_URL: ${{ vars.ANTHROPIC_BASE_URL }}
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
          OPENAI_MODEL_ID: ${{ vars.OPENAI_MODEL_ID }}
          OPENAI_BASE_URL: ${{ vars.OPENAI_BASE_URL }}
          GEMINI_API_KEY: ${{ secrets.GEMINI_API_KEY }}
          GEMINI_MODEL_ID: ${{ vars.GEMINI_MODEL_ID }}
          GEMINI_BASE_URL: ${{ vars.GEMINI_BASE_URL }}
          BENCHMARK_DOCKER_PROXY: ${{ vars.BENCHMARK_DOCKER_PROXY }}
          BENCHMARK_TIMEOUT_MS: ${{ vars.BENCHMARK_TIMEOUT_MS }}
        run: |
          {
            printf 'ANTHROPIC_API_KEY=%s\n' "$ANTHROPIC_API_KEY"
            printf 'ANTHROPIC_MODEL_ID=%s\n' "$ANTHROPIC_MODEL_ID"
            printf 'ANTHROPIC_BASE_URL=%s\n' "$ANTHROPIC_BASE_URL"
            printf '\n'
            printf 'OPENAI_API_KEY=%s\n' "$OPENAI_API_KEY"
            printf 'OPENAI_MODEL_ID=%s\n' "$OPENAI_MODEL_ID"
            printf 'OPENAI_BASE_URL=%s\n' "$OPENAI_BASE_URL"
            printf '\n'
            printf 'GEMINI_API_KEY=%s\n' "$GEMINI_API_KEY"
            printf 'GEMINI_MODEL_ID=%s\n' "$GEMINI_MODEL_ID"
            printf 'GEMINI_BASE_URL=%s\n' "$GEMINI_BASE_URL"
            printf '\n'
            printf 'BENCHMARK_DOCKER_PROXY=%s\n' "$BENCHMARK_DOCKER_PROXY"
            printf 'BENCHMARK_TIMEOUT_MS=%s\n' "$BENCHMARK_TIMEOUT_MS"
          } > .env.test

      - name: Run unified benchmark command
        # Workflow inputs are routed through env vars rather than interpolated into
        # the script, so the free-text `tb2_model` input cannot inject shell syntax.
        # `${VAR:-default}` reproduces the `|| 'default'` fallback used on
        # push/pull_request events, where inputs are empty.
        env:
          INPUT_BENCHMARK: ${{ github.event.inputs.benchmark }}
          INPUT_PROVIDER: ${{ github.event.inputs.provider }}
          INPUT_TAU_DOMAIN: ${{ github.event.inputs.tau_domain }}
          INPUT_TB2_MODEL: ${{ github.event.inputs.tb2_model }}
        run: |
          mkdir -p tests/tmp
          benchmark="${INPUT_BENCHMARK:-both}"
          provider="${INPUT_PROVIDER:-all}"
          tau_domain="${INPUT_TAU_DOMAIN:-airline}"
          tb2_model="${INPUT_TB2_MODEL:-openai/glm-5}"

          # Quoted expansions: a value containing whitespace stays one argument.
          args=(
            "--benchmark=${benchmark}"
            "--tau-domain=${tau_domain}"
            "--tb2-model=${tb2_model}"
            --tb2-agent=oracle
            --tb2-runner=uvx
            --tb2-python=3.12
            --tb2-jobs-dir=./tests/tmp/jobs
            --output=json
            --output-file=tests/tmp/benchmark-report.json
          )

          # The provider filter applies only to SWE/TAU runs.
          if [[ "${provider}" != "all" && "${benchmark}" != "tb2" ]]; then
            args+=("--provider=${provider}")
          fi

          npm run test:benchmark -- "${args[@]}"

      - name: Write step summary
        # Runs even when the benchmark step failed, so a partial report still
        # surfaces in the job summary.
        if: ${{ always() }}
        run: |
          node - <<'NODE' >> "$GITHUB_STEP_SUMMARY"
          const fs = require('fs');
          function readJson(p) {
            if (!fs.existsSync(p)) return null;
            try { return JSON.parse(fs.readFileSync(p, 'utf8')); } catch { return null; }
          }

          const report = readJson('tests/tmp/benchmark-report.json');
          console.log('## Benchmark Report');
          console.log('');

          if (!report) {
            console.log('- report not found');
            process.exit(0);
          }

          if (Array.isArray(report.swe) && report.swe.length > 0) {
            console.log('### SWE-bench-Verified');
            console.log('');
            console.log('| Provider / Model | Resolved | Rate |');
            console.log('|---|---:|---:|');
            for (const r of report.swe) {
              const name = `${r.provider.id} / ${r.provider.model}`;
              const resolved = `${r.summary.resolved}/${r.summary.total}`;
              const rate = `${(r.summary.rate * 100).toFixed(1)}%`;
              console.log(`| ${name} | ${resolved} | ${rate} |`);
            }
            console.log('');
          }

          if (Array.isArray(report.tau) && report.tau.length > 0) {
            console.log('### TAU-bench');
            console.log('');
            console.log('| Provider / Model | Domain | Pass^1 | Avg Tokens |');
            console.log('|---|---|---:|---:|');
            for (const r of report.tau) {
              const name = `${r.provider.id} / ${r.provider.model}`;
              const domain = r.summary.domain;
              const pass1 = `${((r.summary.pass_at_k?.[0] ?? 0) * 100).toFixed(1)}%`;
              const observed = (r.summary.token_observed_trials ?? 0) > 0;
              const avgTokens = observed
                ? (r.summary.avg_tokens >= 1000 ? `${(r.summary.avg_tokens / 1000).toFixed(1)}k` : `${r.summary.avg_tokens}`)
                : '-';
              console.log(`| ${name} | ${domain} | ${pass1} | ${avgTokens} |`);
            }
            console.log('');
          }

          if (report.tb2) {
            const tb2 = report.tb2;
            console.log('### Terminal Bench 2.0');
            console.log('');
            console.log(`- Agent: \`${tb2.agent}\``);
            if (tb2.model) console.log(`- Model: \`${tb2.model}\``);
            console.log(`- Passed: **${tb2.passed}/${tb2.total}**`);
            console.log(`- Rate: **${(tb2.rate * 100).toFixed(1)}%**`);
            if (typeof tb2.avg_total_tokens === 'number' && (tb2.token_observed_trials ?? 0) > 0) {
              console.log(`- Avg tokens: **${tb2.avg_total_tokens}** (observed ${tb2.token_observed_trials} trials)`);
            } else {
              console.log(`- Avg tokens: **N/A**`);
            }
            console.log('');
          }
          NODE

      - name: Upload benchmark artifacts
        if: ${{ always() }}
        uses: actions/upload-artifact@v4
        with:
          name: benchmark-artifacts-${{ github.run_id }}
          if-no-files-found: warn
          path: |
            tests/tmp/benchmark-report.json
            tests/tmp/jobs/*/result.json
            tests/tmp/tau2-data/simulations/*.json
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -146,6 +146,7 @@ See [docs/en/guides/architecture.md](./docs/en/guides/architecture.md) for detai
| [Providers](./docs/en/guides/providers.md) | Model provider configuration |
| [Database](./docs/en/guides/database.md) | SQLite/PostgreSQL persistence |
| [Resume & Fork](./docs/en/guides/resume-fork.md) | Crash recovery & branching |
| [Benchmark Results](./docs/en/guides/benchmark-results.md) | Confirmed benchmark score tables |
| **Project** | |
| [Contribution Guide](./docs/en/contribution.md) | How to contribute |
| **Reference** | |
Expand Down
1 change: 1 addition & 0 deletions README.zh-CN.md
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,7 @@ npm run example:room # 多Agent协作
| [Provider 配置](./docs/zh-CN/guides/providers.md) | 模型 Provider 配置 |
| [数据库存储](./docs/zh-CN/guides/database.md) | SQLite/PostgreSQL 持久化 |
| [恢复与分叉](./docs/zh-CN/guides/resume-fork.md) | 崩溃恢复与分支 |
| [Benchmark 结果](./docs/zh-CN/guides/benchmark-results.md) | 已确认的跑分结果表格 |
| **项目** | |
| [贡献指南](./docs/zh-CN/contribution.md) | 提交 PR 的要求与流程 |
| **参考** | |
Expand Down
32 changes: 32 additions & 0 deletions docs/en/guides/benchmark-results.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
# Benchmark Results (Confirmed)

Last updated: 2026-02-26

## SWE-bench-Verified

| Provider / Model | Instances | Resolved | Rate | Avg Tokens | Avg Duration |
|---|---:|---:|---:|---:|---:|
| openai / glm-5 | 12 | 12/12 | 100.0% | 17.2k | 134.5k ms |

Source: local full run log (`2026-02-25__21-06-21`).

## Terminal Bench 2.0

| Agent / Model | Passed | Parseable | Unknown | Rate (parseable) | Notes |
|---|---:|---:|---:|---:|---|
| oracle / glm-5 | 1 | 31 | 58 | 3.2% | From the same full run; many tasks ended with runtime/timeout errors. |

## Reproduce

```bash
npm run test:benchmark -- \
--benchmark=both \
--tb2-model=openai/glm-5 \
--tb2-agent=oracle \
--tb2-runner=uvx \
--tb2-jobs-dir=./tests/tmp/jobs \
--output=json \
--output-file=tests/tmp/benchmark-report.json
```

The JSON report includes both `swe` and `tb2` sections.
Loading