Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 33 additions & 0 deletions .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -158,6 +158,39 @@ jobs:
if: always()
run: git worktree remove /tmp/baseline-tree --force 2>/dev/null || true

skill-eval:
  # Pull-request-only job. The Ruby setup and eval steps run only when the
  # paths filter reports a change to the skill file or to skill-evals/.
  name: Skill Evals
  if: ${{ github.event_name == 'pull_request' }}
  runs-on: ubuntu-latest
  steps:
    - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6

    # Later steps read this step's `skill` output and run only when it is 'true'.
    - id: filter
      name: Check for skill or eval changes
      uses: dorny/paths-filter@de90cc6fb38fc0963ad72b210f1f284cd68cea36 # v3
      with:
        filters: |
          skill:
            - 'skills/basecamp/SKILL.md'
            - 'skill-evals/**'

    - name: Set up Ruby
      if: ${{ steps.filter.outputs.skill == 'true' }}
      uses: ruby/setup-ruby@13e7a03dc3ac6c3798f4570bfead2aed4d96abfb # v1
      with:
        ruby-version: '3.3'

    - name: Run skill evals
      if: ${{ steps.filter.outputs.skill == 'true' }}
      env:
        ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
      # When the secret is empty, emit a warning annotation and succeed
      # rather than failing the job.
      run: |
        [ -n "$ANTHROPIC_API_KEY" ] || {
          echo "::warning::ANTHROPIC_API_KEY not configured, skipping skill evals"
          exit 0
        }
        make skill-eval

benchmarks:
name: Benchmarks
runs-on: ubuntu-latest
Expand Down
5 changes: 5 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -392,6 +392,11 @@ tools:
@command -v jq >/dev/null 2>&1 || echo "NOTE: jq is also required (install via your package manager)"


# Skill evals: delegate to the `eval` target of the Makefile in skill-evals/.
# Requires ANTHROPIC_API_KEY in the environment and a Ruby installation.
.PHONY: skill-eval
skill-eval:
	$(MAKE) --directory=skill-evals eval

# Sync skills to basecamp/skills distribution repo
# Usage: make sync-skills TAG=v1.2.3
.PHONY: sync-skills
Expand Down
18 changes: 18 additions & 0 deletions skill-evals/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# Skill-eval driver: every target invokes ./run from this directory.
#
# Overridable knobs (e.g. `make eval MODEL=...`):
#   MODEL  value passed to ./run as --model
#   SKILL  path passed to ./run as --skill
#   NAME   required by eval-save / eval-compare; passed as --save / --compare
MODEL ?= claude-sonnet-4-20250514
SKILL ?= ../skills/basecamp/SKILL.md

# Shared command line. Values are single-quoted so a path or name containing
# spaces or shell metacharacters reaches ./run as a single argument.
# Recursive (=) on purpose so late overrides of MODEL/SKILL are honoured.
run_cmd = ./run --model '$(MODEL)' --skill '$(SKILL)'

.PHONY: eval eval-syntax eval-save eval-compare

# Run ./run with only the model and skill arguments.
eval:
	$(run_cmd)

# Same, adding `--tag calling-convention` (presumably restricts the run to
# cases carrying that tag — confirm against ./run).
eval-syntax:
	$(run_cmd) --tag calling-convention

# Usage: make eval-save NAME=baseline
# Fails with a usage message on stderr when NAME is empty.
eval-save:
	@test -n '$(NAME)' || { echo "Usage: make eval-save NAME=baseline" >&2; exit 1; }
	$(run_cmd) --save '$(NAME)'

# Usage: make eval-compare NAME=baseline
# Fails with a usage message on stderr when NAME is empty.
eval-compare:
	@test -n '$(NAME)' || { echo "Usage: make eval-compare NAME=baseline" >&2; exit 1; }
	$(run_cmd) --compare '$(NAME)'
11 changes: 11 additions & 0 deletions skill-evals/cases/agent-output-mode.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# Eval case: request the project list in a machine-readable format.
# NOTE(review): the runner is not shown here; accept/reject appear to be
# regexes checked against the generated commands — confirm against ./run.
task: List all projects in machine-readable format
tags:
  - workflow
mocks:
  - match: 'projects list'
    output: '{"ok":true,"data":[{"id":111,"name":"Redesign"},{"id":222,"name":"Other"}],"summary":"2 projects"}'
accept:
  - 'projects list'
  - '--(json|agent)'
reject:
  - '--md'
max_commands: 3
5 changes: 5 additions & 0 deletions skill-evals/cases/assign-todo.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# Eval case: assign a todo to a named person.
# Expects an `assign` command mentioning 12345 and the flag `--to Alice`.
task: Assign todo 12345 to Alice
tags:
  - shortcuts
accept:
  - 'assign.*12345'
  - '--to Alice'
5 changes: 5 additions & 0 deletions skill-evals/cases/checkin-answer.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# Eval case: answer a check-in question inside a project.
# Either --in or --project is accepted for the project flag.
task: Answer check-in question 456 with "Shipped v2" in project 12345
tags:
  - domain
accept:
  - 'checkins answer create.*456.*Shipped'
  - '--(in|project) 12345'
6 changes: 6 additions & 0 deletions skill-evals/cases/complete-multiple.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# Eval case: complete several todos with a single `done` command.
# The accept pattern covers both space-separated and comma-separated IDs:
# `.` also matches a comma, so "111,222,333" already satisfies
# 111.*222.*333 and no separate alternative is needed.
task: Complete todos 111, 222, and 333
tags:
  - shortcuts
accept:
  - 'done.*111.*222.*333'
reject:
  - '--id'
6 changes: 6 additions & 0 deletions skill-evals/cases/complete-todo.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# Eval case: complete one todo.
# Accepts either `done` or `todos complete` followed by the ID; rejects --id.
task: Mark todo 98765 as complete
tags:
  - shortcuts
accept:
  - '(done|todos complete).*98765'
reject:
  - '--id'
9 changes: 9 additions & 0 deletions skill-evals/cases/create-card.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# Eval case: create a card in a project.
# Accepts `card` or `cards create` with the title text; rejects --title
# followed by the title, plus --body and --content.
task: Create a card "Design review" in project 77777
tags:
  - calling-convention
  - create
accept:
  - 'card(s create)? .*Design review'
  - '--(in|project) 77777'
reject:
  - '--title .*Design'
  - '--body'
  - '--content'
7 changes: 7 additions & 0 deletions skill-evals/cases/create-comment.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# Eval case: comment on a recording in a project.
# The accept pattern requires the recording ID to appear before the comment
# text; passing the text via --content is rejected.
task: Comment "Looks good!" on recording 54321 in project 99999
tags:
  - calling-convention
  - create
accept:
  - 'comment(s create)? .*54321.*Looks good'
  - '--(in|project) 99999'
reject:
  - '--content .*Looks'
5 changes: 5 additions & 0 deletions skill-evals/cases/create-doc.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# Eval case: create a document with a title and a body.
# The accept pattern requires the title to appear before the body text.
task: Create a document titled "API Guide" with body "Getting started with the API" in project 12345
tags:
  - domain
accept:
  - '(files|docs) doc create.*API Guide.*Getting started'
  - '--(in|project) 12345'
10 changes: 10 additions & 0 deletions skill-evals/cases/create-message.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# Eval case: post a message with a title and body.
# Rejects the --subject, --title, --body and --content flags.
task: Post a message titled "Weekly Update" with body "Here are this week's highlights" in project 55555
tags:
  - calling-convention
  - create
accept:
  - 'message(s create)? .*Weekly Update'
  - '--(in|project) 55555'
reject:
  - '--subject'
  - '--title'
  - '--body'
  - '--content'
7 changes: 7 additions & 0 deletions skill-evals/cases/create-todo.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# Eval case: create a todo in a project.
# Accepts `todo` or `todos create` with the todo text; rejects --content.
task: Create a todo "Buy milk" in project 12345
tags:
  - calling-convention
  - create
accept:
  - 'todo(s create)? .*Buy milk'
  - '--(in|project) 12345'
reject:
  - '--content'
7 changes: 7 additions & 0 deletions skill-evals/cases/list-cards.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# Eval case: list cards in a project using the explicit `list` subcommand.
# NOTE(review): the reject pattern is anchored at the start of the string;
# if matched commands carry a program-name prefix it can never fire — confirm.
task: List cards in project 12345
tags:
  - calling-convention
accept:
  - 'cards list'
  - '--(in|project) 12345'
reject:
  - '^cards\s+--'
7 changes: 7 additions & 0 deletions skill-evals/cases/list-files.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# Eval case: list files in a project using the explicit `list` subcommand.
# NOTE(review): the reject pattern is anchored at the start of the string;
# if matched commands carry a program-name prefix it can never fire — confirm.
task: List files in project 12345
tags:
  - calling-convention
accept:
  - 'files list'
  - '--(in|project) 12345'
reject:
  - '^files\s+--'
7 changes: 7 additions & 0 deletions skill-evals/cases/list-messages.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# Eval case: list messages in a project using the explicit `list` subcommand.
# NOTE(review): the reject pattern is anchored at the start of the string;
# if matched commands carry a program-name prefix it can never fire — confirm.
task: List messages in project 12345
tags:
  - calling-convention
accept:
  - 'messages list'
  - '--(in|project) 12345'
reject:
  - '^messages\s+--'
7 changes: 7 additions & 0 deletions skill-evals/cases/list-todolists.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# Eval case: list todolists in a project using the explicit `list` subcommand.
# NOTE(review): the reject pattern is anchored at the start of the string;
# if matched commands carry a program-name prefix it can never fire — confirm.
task: List todolists in project 12345
tags:
  - calling-convention
accept:
  - 'todolists list'
  - '--(in|project) 12345'
reject:
  - '^todolists\s+--'
8 changes: 8 additions & 0 deletions skill-evals/cases/list-todos.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# Eval case: list the current user's todos in a project.
# Requires `todos list`, `--assignee me` and a project flag.
# NOTE(review): the reject pattern is anchored at the start of the string;
# if matched commands carry a program-name prefix it can never fire — confirm.
task: List todos assigned to me in project 12345
tags:
  - calling-convention
accept:
  - 'todos list'
  - '--assignee me'
  - '--(in|project) 12345'
reject:
  - '^todos\s+--'
7 changes: 7 additions & 0 deletions skill-evals/cases/people-in-project.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# Eval case: list the people in a project.
# Unlike the other project-scoped cases, only --project is accepted here and
# --in is rejected.
task: List people in project 12345
tags:
  - domain
accept:
  - 'people list'
  - '--project 12345'
reject:
  - '--in'
8 changes: 8 additions & 0 deletions skill-evals/cases/project-scope.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# Eval case: overdue todos scoped to a single project.
# Expects `todos list --overdue` with a project flag; `reports` is rejected.
task: List overdue todos in project 12345
tags:
  - workflow
accept:
  - 'todos list'
  - '--overdue'
  - '--(in|project) 12345'
reject:
  - 'reports'
5 changes: 5 additions & 0 deletions skill-evals/cases/recordings-browse.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# Eval case: browse archived todos across projects.
# Expects a `recordings` command mentioning todos, with `--status archived`.
task: Browse all archived todos across projects
tags:
  - semantics
accept:
  - 'recordings.*todos'
  - '--status archived'
4 changes: 4 additions & 0 deletions skill-evals/cases/reopen-todo.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Eval case: reopen a completed todo.
# Accepts either `reopen` or `todos uncomplete` followed by the ID.
task: Reopen todo 98765
tags:
  - shortcuts
accept:
  - '(reopen|todos uncomplete).*98765'
6 changes: 6 additions & 0 deletions skill-evals/cases/reports-assigned.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# Eval case: assigned todos across all projects.
# Expects `reports assigned`; `todos list` is rejected.
task: List all my assigned todos across projects
tags:
  - semantics
accept:
  - 'reports assigned'
reject:
  - 'todos list'
6 changes: 6 additions & 0 deletions skill-evals/cases/reports-overdue.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# Eval case: overdue todos across all projects.
# Expects `reports overdue`; `todos list ... --overdue` is rejected.
task: Find overdue todos across all projects
tags:
  - semantics
accept:
  - 'reports overdue'
reject:
  - 'todos list.*--overdue'
7 changes: 7 additions & 0 deletions skill-evals/cases/schedule-create.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# Eval case: create a schedule entry with start and end times.
# Requires `schedule create` with the title, both --starts-at and --ends-at,
# and a project flag. The time values themselves are not checked.
task: Create a meeting "Standup" tomorrow 9am-9:30am in project 12345
tags:
  - domain
accept:
  - 'schedule create.*Standup'
  - '--starts-at'
  - '--ends-at'
  - '--(in|project) 12345'
6 changes: 6 additions & 0 deletions skill-evals/cases/search.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# Eval case: full-text search.
# Expects a `search` command containing the query; `recordings` is rejected.
task: Find items mentioning "quarterly goals"
tags:
  - semantics
accept:
  - 'search.*quarterly goals'
reject:
  - 'recordings'
10 changes: 10 additions & 0 deletions skill-evals/cases/url-then-comment.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# Eval case: two-step workflow — parse a Basecamp URL, then comment on the
# recording it points to.
# NOTE(review): mocks presumably supply canned output for matching commands,
# and expect_sequence presumably requires the matches in order — confirm
# against ./run.
task: Comment "Done!" on https://3.basecamp.com/123/buckets/456/todos/789
tags:
  - workflow
mocks:
  - match: 'url parse'
    output: '{"ok":true,"data":{"account_id":"123","project_id":"456","recording_id":"789"}}'
  - match: 'comment'
    output: '{"ok":true,"data":{"id":999}}'
expect_sequence:
  - match: 'url parse.*3\.basecamp\.com/123/buckets/456/todos/789'
  - match: 'comment.*789.*Done'
5 changes: 5 additions & 0 deletions skill-evals/cases/webhook-create.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# Eval case: create a webhook for a URL in a project.
# Expects `webhooks create` mentioning example.com, plus a project flag.
task: Create a webhook for https://example.com/hook in project 12345
tags:
  - domain
accept:
  - 'webhooks create.*example\.com'
  - '--(in|project) 12345'
Empty file added skill-evals/results/.gitkeep
Empty file.
Loading
Loading