Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 5 additions & 5 deletions .github/workflows/nightly-terminal-bench.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,11 @@ name: Nightly Terminal-Bench

on:
schedule:
# Run twice daily, 12 h apart, at times representing peak and trough model load.
# 10:00 UTC (2 AM PT) – lowest US model usage (overnight)
# 22:00 UTC (2 PM PT) – highest US model usage (mid-afternoon business hours)
- cron: "0 10 * * *"
- cron: "0 22 * * *"
# Run twice weekly (Mon + Thu), at peak and trough model load times.
# Monday 10:00 UTC (2 AM PT) – lowest US model usage
# Thursday 22:00 UTC (2 PM PT) – highest US model usage
- cron: "0 10 * * 1"
- cron: "0 22 * * 4"
workflow_dispatch:
inputs:
models:
Expand Down
66 changes: 66 additions & 0 deletions .github/workflows/terminal-bench.yml
Original file line number Diff line number Diff line change
Expand Up @@ -147,6 +147,27 @@ jobs:
# Only the main and preload bundles are needed to execute the benchmark;
# icon generation is deliberately left out of this build to save time.
- name: Build dist/ (skip icons - not needed for benchmark)
run: make build-main build-preload

# Best-effort snapshot of existing Daytona sandboxes so post-run cleanup
# only deletes sandboxes created by *this* job, not ones from parallel jobs.
# continue-on-error: transient API/pip failures must not block the benchmark.
- name: Snapshot Daytona sandboxes
  if: inputs.env == 'daytona'
  continue-on-error: true
  env:
    DAYTONA_API_KEY: ${{ secrets.DAYTONA_API_KEY }}
  run: |
    if [ -z "$DAYTONA_API_KEY" ]; then
      exit 0
    fi
    pip install --quiet daytona
    # Write to a temp file and mv into place only on success. A plain
    # '> /tmp/daytona-pre-existing.txt' creates the target BEFORE python
    # runs, so a failed listing would leave a zero-byte snapshot behind
    # and the cleanup step would then treat every sandbox - including
    # other jobs' - as created by this run.
    python3 -c "
    from daytona import Daytona
    d = Daytona()
    ids = [sb.id for sb in d.list()]
    print('\n'.join(ids))
    " > /tmp/daytona-pre-existing.txt.tmp
    mv /tmp/daytona-pre-existing.txt.tmp /tmp/daytona-pre-existing.txt
    echo "Snapshot $(wc -l < /tmp/daytona-pre-existing.txt) pre-existing sandbox(es)"

- name: Run Terminal-Bench
run: make benchmark-terminal 2>&1 | tee benchmark.log
env:
Expand All @@ -169,6 +190,51 @@ jobs:
GOOGLE_GENERATIVE_AI_API_KEY: ${{ secrets.GOOGLE_API_KEY }}
DAYTONA_API_KEY: ${{ secrets.DAYTONA_API_KEY }}

# Best-effort cleanup of sandboxes created during this run. Compares
# against the pre-run snapshot and only deletes stopped/errored sandboxes,
# so active sandboxes from parallel jobs are never touched.
# Failures here must never mark the job as failed (benchmark results matter).
- name: Cleanup Daytona sandboxes
  if: always() && inputs.env == 'daytona'
  continue-on-error: true
  env:
    DAYTONA_API_KEY: ${{ secrets.DAYTONA_API_KEY }}
  run: |
    if [ -z "$DAYTONA_API_KEY" ]; then
      echo "No DAYTONA_API_KEY, skipping cleanup"
      exit 0
    fi
    # -s (exists AND non-empty) rather than -f: a snapshot step that died
    # after the shell had already created the redirect target leaves a
    # zero-byte file, and treating that as "no pre-existing sandboxes"
    # could delete stopped sandboxes belonging to parallel jobs. A
    # successful snapshot is never zero bytes - even an empty listing
    # prints one trailing newline.
    if [ ! -s /tmp/daytona-pre-existing.txt ]; then
      echo "No usable pre-run snapshot, skipping cleanup to avoid deleting unrelated sandboxes"
      exit 0
    fi
    pip install --quiet daytona
    python3 -c "
    from daytona import Daytona

    # Sandbox ids that existed before this run started; never delete these.
    with open('/tmp/daytona-pre-existing.txt') as f:
        pre_existing = {line.strip() for line in f if line.strip()}

    d = Daytona()

    # Sandboxes absent from the snapshot were created after it, i.e. by
    # this run. Of those, delete only ones that are no longer running so
    # live sandboxes from parallel jobs are never touched.
    # str(sb.state) may render as 'started' or 'SandboxState.STARTED'
    # depending on how the SDK implements its state enum, so match by
    # substring. A false positive here only skips a deletion, which is
    # the safe direction.
    # NOTE(review): confirm state names against the installed daytona SDK.
    ACTIVE_STATES = ('started', 'creating', 'starting')

    def is_active(sb):
        state = str(sb.state).lower()
        return any(active in state for active in ACTIVE_STATES)

    candidates = [sb for sb in d.list() if sb.id not in pre_existing]
    to_delete = [sb for sb in candidates if not is_active(sb)]
    skipped = len(candidates) - len(to_delete)
    if skipped:
        print(f'Skipping {skipped} still-active sandbox(es)')
    if not to_delete:
        print('No stopped sandboxes to clean up')
    else:
        print(f'Cleaning up {len(to_delete)} stopped sandbox(es) from this run...')
        for sb in to_delete:
            # One failed delete must not abort the rest of the sweep.
            try:
                print(f'  Deleting {sb.id} (state={sb.state})...')
                d.delete(sb)
                print(f'  Deleted {sb.id}')
            except Exception as e:
                print(f'  Failed to delete {sb.id}: {e}')
    "

- name: Print results summary
if: always()
run: |
Expand Down