Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions .actrc
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# Act configuration for running GitHub Actions locally
# Maps ubuntu-latest to act's medium-sized image (common tooling, not a full GitHub-hosted runner)

# Use medium-sized Ubuntu image (ubuntu-latest equivalent)
-P ubuntu-latest=catthehacker/ubuntu:act-latest

# Enable verbose output for debugging
--verbose

# Force container architecture to linux/amd64 (overrides act's default of using the host architecture)
--container-architecture linux/amd64

24 changes: 19 additions & 5 deletions .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -99,8 +99,12 @@ jobs:
integration-tests:
name: Terminal-bench Integration
runs-on: ubuntu-latest
# Temporarily disabled - needs debugging
if: false
timeout-minutes: 20 # Fail fast if tests hang
# Run on main, on pushes to e/* branches, and on pull requests whose head branch starts with e/
if: |
github.ref == 'refs/heads/main' ||
startsWith(github.ref, 'refs/heads/e/') ||
startsWith(github.head_ref, 'e/')

steps:
- uses: actions/checkout@v4
Expand All @@ -114,10 +118,15 @@ jobs:
uses: docker/setup-buildx-action@v3

- name: Install uv
run: curl -LsSf https://astral.sh/uv/install.sh | sh
run: |
curl -LsSf https://astral.sh/uv/install.sh | sh
echo "$HOME/.local/bin" >> $GITHUB_PATH

- name: Install terminal-bench
run: uv tool install terminal-bench
run: |
export PATH="$HOME/.local/bin:$PATH"
uv tool install terminal-bench
echo "$HOME/.local/bin" >> $GITHUB_PATH

- name: Create virtual environment
run: uv venv
Expand All @@ -134,9 +143,14 @@ jobs:
uv pip install -e .

- name: Run terminal-bench integration test
timeout-minutes: 15 # Per-step timeout
env:
ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
PYTHONUNBUFFERED: "1" # Force immediate output
run: |
source .venv/bin/activate
pytest tests/ -m "integration" -v --tb=short
export PATH="$HOME/.local/bin:$PATH"
echo "Starting terminal-bench integration tests at $(date)"
pytest tests/ -m "integration" -v --tb=short -s --log-cli-level=INFO
echo "Terminal-bench tests completed at $(date)"

1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -37,3 +37,4 @@ logs/

# Benchmark runs
runs/
.secrets
10 changes: 9 additions & 1 deletion benchmark/adapters/fireteam_adapter.py
Original file line number Diff line number Diff line change
Expand Up @@ -141,7 +141,7 @@ def perform_task(self, instruction, session, logging_dir):
fireteam_root = Path(__file__).parent.parent.parent

# Create directory structure in container first
session.container.exec_run(["mkdir", "-p", "/fireteam/src/agents", "/fireteam/src/state"])
session.container.exec_run(["mkdir", "-p", "/fireteam/src/agents", "/fireteam/src/state", "/fireteam/src/memory"])

# Copy main files
session.copy_to_container(
Expand Down Expand Up @@ -176,6 +176,14 @@ def perform_task(self, instruction, session, logging_dir):
container_filename=state_file.name
)

# Copy memory module files
for memory_file in (fireteam_root / "src" / "memory").glob("*.py"):
session.copy_to_container(
paths=[memory_file],
container_dir="/fireteam/src/memory",
container_filename=memory_file.name
)

# Run parent's setup and execution
return super().perform_task(instruction, session, logging_dir)

3 changes: 3 additions & 0 deletions benchmark/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,9 @@ dependencies = [
requires = ["hatchling"]
build-backend = "hatchling.build"

[tool.hatch.build.targets.wheel]
packages = ["adapters"]

[dependency-groups]
dev = []

Expand Down
4 changes: 2 additions & 2 deletions tests/test_terminal_bench_integration.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,8 +33,8 @@ def test_hello_world_task(self):
'--dataset', 'terminal-bench-core==0.1.1',
'--task-id', 'hello-world',
'--global-agent-timeout-sec', '600',
'--log-level', 'debug',
'--livestream' # Enable real-time output
'--log-level', 'debug'
# Note: --livestream removed to show output in CI/act logs
]

print("\n🚀 Running terminal-bench hello-world task...")
Expand Down
Loading