getsentry · dcramer · Feb 17, 2026 · Feb 17, 2026 · Feb 17, 2026 · Feb 17, 2026
diff --git a/AGENTS.md b/AGENTS.md
@@ -30,8 +30,14 @@ src/
 ├── cli/               # CLI entry and commands
 │   └── output/        # CLI output formatting
 ├── action/            # GitHub Action entry
+├── evals/             # Eval runner, judge, and types
 ├── utils/             # Shared utilities
 └── examples/          # Example configurations
+
+evals/                 # Eval specs, fixtures, and test skills (see evals/README.md)
+├── *.yaml             # YAML eval definitions
+├── skills/            # Test skills used as eval vehicles
+└── fixtures/          # Source code with known issues
 ```
 
 ## Key Conventions
@@ -86,6 +92,10 @@ Skills define **what to look for**, not how to respond to findings:
 - Skills should only change to improve detection accuracy, not to reduce reported findings
 - Each skill owns its domain expertise; severity definitions are intentionally domain-agnostic
 
+## Evals
+
+End-to-end behavioral tests for the full pipeline. See [`evals/README.md`](evals/README.md) for the YAML spec, how to add evals, and how it all works. Run with `pnpm test:evals`.
+
 ## Voice
 
 Warden watches over your code. Not "AI code reviewer" or similar.

diff --git a/README.md b/README.md
@@ -36,8 +36,12 @@ npx warden --fix
 git clone git@github.com:getsentry/warden.git
 cd warden
 pnpm install && pnpm build
+pnpm test              # unit tests
+pnpm test:evals        # end-to-end evals (requires API key)
 ```
 
+See [`evals/README.md`](evals/README.md) for the eval framework.
+
 ## License
 
 FSL-1.1-ALv2
diff --git a/evals/README.md b/evals/README.md
@@ -0,0 +1,154 @@
+# Warden Evals
+
+End-to-end behavioral evaluations for the Warden pipeline. These evals verify
+that Warden correctly runs skills, invokes the agent, extracts findings, and
+produces the expected behavioral outcomes on known code.
+
+## Philosophy
+
+Evals are not unit tests or A/B comparisons. They answer one question:
+
+> **Does the Warden pipeline behave correctly when given known inputs?**
+
+Each eval provides code with a known issue, runs the full Warden agent pipeline
+(skill loading, prompt construction, SDK invocation, finding extraction), and
+uses an LLM judge to verify the output matches behavioral expectations.
+
+Evals test **Warden's behavior**, not individual skills. Skills are used as
+test vehicles to exercise the pipeline.
+
+The only thing mocked is the GitHub event payload. Everything else runs for
+real.
+
+## YAML Format
+
+Evals are defined in YAML files at the top level of `evals/`. Each file
+describes a category of behaviors with a shared test skill and a list of
+scenarios. No custom code per eval. Adding a new eval means adding an entry
+to a YAML file and creating the fixture file(s) it references.
+
+```yaml
+skill: skills/bug-detection.md
+
+evals:
+  - name: null-property-access
+    given: code that accesses properties on an array .find() result without null checking
+    files:
+      - fixtures/null-property-access/handler.ts
+    should_find:
+      - finding: accessing .name on a potentially undefined user object from Array.find()
+        severity: high
+    should_not_find:
+      - style, formatting, or naming issues
+      - the lack of try/catch around the fetch call
+```
+
+This reads as:
+
+> **Given** code that accesses properties on an array `.find()` result without
+> null checking, Warden **should find** the unchecked access and **should not
+> find** style issues or the missing try/catch.
+
+## Eval Structure
+
+```
+evals/
+├── README.md
+├── bug-detection.yaml          # Category: finding logic bugs
+├── security-scanning.yaml      # Category: finding security vulnerabilities
+├── precision.yaml              # Category: avoiding false positives
+├── skills/                     # Test skills (vehicles for exercising pipeline)
+│   ├── bug-detection.md
+│   ├── security-scanning.md
+│   └── precision.md
+└── fixtures/                   # Source code with known issues
+    ├── null-property-access/
+    │   └── handler.ts
+    ├── off-by-one/
+    │   └── paginator.ts
+    ├── missing-await/
+    │   └── cache.ts
+    ├── wrong-comparison/
+    │   └── validator.ts
+    ├── stale-closure/
+    │   └── counter.tsx
+    ├── sql-injection/
+    │   └── api.ts
+    ├── xss-reflected/
+    │   └── server.ts
+    └── ignores-style-issues/
+        └── utils.ts
+```
+
+## YAML Schema
+
+### File-level fields
+
+| Field | Required | Description |
+|-------|----------|-------------|
+| `skill` | Yes | Path to test skill, relative to `evals/` |
+| `model` | No | Default model for all evals (default: `claude-sonnet-4-5-20250514`) |
+| `evals` | Yes | List of eval scenarios (at least one) |
+
+### Per-eval fields
+
+| Field | Required | Description |
+|-------|----------|-------------|
+| `name` | Yes | Scenario name (used in test output) |
+| `given` | Yes | What code/situation the eval sets up (BDD "given") |
+| `files` | Yes | Fixture files, relative to `evals/` |
+| `model` | No | Model override for this scenario |
+| `should_find` | Yes | What the pipeline should detect (at least one) |
+| `should_find[].finding` | Yes | Natural language description for the LLM judge |
+| `should_find[].severity` | No | Expected severity (hint, not strict) |
+| `should_find[].required` | No | Defaults to `true`; when true, the eval fails if this finding is not reported |
+| `should_not_find` | No | Things the pipeline should NOT report (precision) |
+
+## Running Evals
+
+```bash
+# Run all evals (requires ANTHROPIC_API_KEY)
+pnpm test:evals
+
+# Run evals for a specific category
+pnpm test:evals -- --grep "bug-detection"
+
+# Run a single eval
+pnpm test:evals -- --grep "null-property-access"
+```
+
+Evals make real API calls. They run skills on `claude-sonnet-4-5-20250514` by
+default.
+
+## Adding a New Eval
+
+1. Pick an existing YAML file or create a new `evals/<category>.yaml`
+2. Add a scenario entry under the `evals:` key
+3. Create a fixture file under `evals/fixtures/<scenario>/`
+4. Run `pnpm test:evals` to verify
+
+If a new category needs a different test skill, add it to `evals/skills/`.
+
+### Guidelines
+
+- **One bug per eval.** Each scenario tests one specific behavior.
+- **Make bugs realistic.** Code should look like something a human wrote.
+- **Write precise `should_find`.** "null access on user.name from Array.find()"
+  is better than "finds a bug."
+- **Include `should_not_find`.** If the code has issues the skill should ignore,
+  call them out.
+- **Keep fixtures small.** 20-80 lines. The agent analyzes hunks, not novels.
+- **No custom code.** Every eval is just YAML + fixture files.
+
+## How It Works
+
+1. **Discovery**: Scan `evals/` for `.yaml` files
+2. **Loading**: Parse YAML, validate with Zod, resolve paths
+3. **Git repo**: Create a temp repo with fixture files committed on an `eval`
+   branch (empty `main` as base), so the agent has a real repo to explore
+4. **Context**: Build `EventContext` from real `git diff main...eval`
+5. **Execution**: Run the skill via `runSkill()` with the real SDK pipeline;
+   the agent operates in the temp repo with Read/Grep tools
+6. **Judgment**: An LLM judge (Sonnet) evaluates findings against assertions
+7. **Verdict**: Pass if all required `should_find` are met and no
+   `should_not_find` are violated
diff --git a/evals/bug-detection.yaml b/evals/bug-detection.yaml
@@ -0,0 +1,56 @@
+skill: skills/bug-detection.md
+
+evals:
+  - name: null-property-access
+    given: code that accesses properties on an array .find() result without null checking
+    files:
+      - fixtures/null-property-access/handler.ts
+    should_find:
+      - finding: accessing .name and .profile.avatar on a potentially undefined user object from Array.find()
+        severity: high
+    should_not_find:
+      - style, formatting, or naming issues
+      - the lack of try/catch around the fetch call
+
+  - name: off-by-one
+    given: pagination logic that uses Math.floor instead of Math.ceil, skipping the last page
+    files:
+      - fixtures/off-by-one/paginator.ts
+    should_find:
+      - finding: off-by-one error in page count calculation that loses the last page when totalItems is not evenly divisible by pageSize
+        severity: medium
+    should_not_find:
+      - use of any[] type
+      - missing error handling
+
+  - name: missing-await
+    given: async cache lookup missing await, causing a Promise object to be used as a truthy value
+    files:
+      - fixtures/missing-await/cache.ts
+    should_find:
+      - finding: missing await on loadFromCache() call, so cached is always a truthy Promise and the function never actually fetches fresh data
+        severity: high
+    should_not_find:
+      - console.log statements
+      - missing return type annotations
+
+  - name: wrong-comparison
+    given: permission check using <= instead of >=, inverting the access control logic
+    files:
+      - fixtures/wrong-comparison/validator.ts
+    should_find:
+      - finding: comparison operator is <= instead of >=, granting access to lower-privilege users while denying higher-privilege users
+        severity: high
+    should_not_find:
+      - hardcoded role strings
+      - suggestion to use an enum for roles
+
+  - name: stale-closure
+    given: React useEffect with setInterval that captures count in a stale closure
+    files:
+      - fixtures/stale-closure/counter.tsx
+    should_find:
+      - finding: "stale closure: setInterval callback captures initial count value and never sees updates, so the counter always sets the same value"
+        severity: high
+    should_not_find:
+      - TypeScript type annotation issues
diff --git a/evals/fixtures/ignores-style-issues/utils.ts b/evals/fixtures/ignores-style-issues/utils.ts
@@ -0,0 +1,48 @@
+// This code is functionally correct but has style issues.
+// A precision-focused eval: the skill should NOT report any of these as bugs.
+
+// Inconsistent naming convention (camelCase vs snake_case)
+export function calculate_total(items: number[]): number {
+  let runningTotal = 0;
+  for (let i = 0; i < items.length; i++) {
+    runningTotal = runningTotal + items[i]!;
+  }
+  return runningTotal;
+}
+
+// Verbose conditional (could be simplified but is correct)
+export function isEligible(age: number, hasConsent: boolean): boolean {
+  if (age >= 18) {
+    if (hasConsent === true) {
+      return true;
+    } else {
+      return false;
+    }
+  } else {
+    return false;
+  }
+}
+
+// Missing JSDoc, long parameter list, but functionally correct
+export function formatAddress(
+  street: string,
+  city: string,
+  state: string,
+  zip: string,
+  country: string
+): string {
+  const parts = [street, city, state, zip, country];
+  return parts.filter((p) => p.length > 0).join(', ');
+}
+
+// Magic numbers but correct behavior
+export function calculateDiscount(price: number, quantity: number): number {
+  if (quantity >= 100) {
+    return price * 0.8;
+  } else if (quantity >= 50) {
+    return price * 0.9;
+  } else if (quantity >= 10) {
+    return price * 0.95;
+  }
+  return price;
+}
diff --git a/evals/fixtures/missing-await/cache.ts b/evals/fixtures/missing-await/cache.ts
@@ -0,0 +1,45 @@
+interface CacheEntry {
+  key: string;
+  value: string;
+  expiresAt: number;
+}
+
+const store = new Map<string, CacheEntry>();
+
+async function saveToCache(key: string, value: string, ttlMs: number): Promise<void> {
+  // Simulate async storage latency (e.g., Redis, database) before writing to the in-memory store
+  await new Promise((resolve) => setTimeout(resolve, 1));
+  store.set(key, {
+    key,
+    value,
+    expiresAt: Date.now() + ttlMs, // absolute expiry time in epoch milliseconds
+  });
+}
+
+async function loadFromCache(key: string): Promise<string | null> {
+  await new Promise((resolve) => setTimeout(resolve, 1)); // same simulated latency as saveToCache
+  const entry = store.get(key);
+  if (!entry) return null; // miss: no entry for this key
+  if (Date.now() > entry.expiresAt) {
+    store.delete(key); // expired entries are evicted on read
+    return null;
+  }
+  return entry.value;
+}
+
+export async function getOrFetchData(key: string, fetchFn: () => Promise<string>): Promise<string> {
+  // Bug: missing await on loadFromCache. `cached` is a Promise, which is
+  // always truthy, so the "cache hit" branch always runs and fetchFn is never
+  // called; on a real miss the caller receives null typed as a string.
+  const cached = loadFromCache(key);
+
+  if (cached) {
+    console.log('Cache hit:', key);
+    return cached as unknown as string;
+  }
+
+  console.log('Cache miss:', key);
+  const fresh = await fetchFn();
+  await saveToCache(key, fresh, 60_000);
+  return fresh;
+}
diff --git a/evals/fixtures/null-property-access/handler.ts b/evals/fixtures/null-property-access/handler.ts
@@ -0,0 +1,36 @@
+interface User {
+  id: string;
+  name: string;
+  email: string;
+  profile: {
+    avatar: string;
+    bio: string;
+  };
+}
+
+interface ApiResponse {
+  users: User[];
+  total: number;
+}
+
+async function fetchUsers(endpoint: string): Promise<ApiResponse> {
+  const response = await fetch(endpoint);
+  return response.json() as Promise<ApiResponse>; // parse the JSON body; the cast only asserts the ApiResponse shape
+}
+
+export async function getUserDisplayName(userId: string): Promise<string> {
+  const data = await fetchUsers(`/api/users?id=${userId}`);
+  const user = data.users.find((u) => u.id === userId);
+
+  // Bug: user could be undefined if not found in the array,
+  // but we access .name and .profile.avatar without checking
+  const displayName = user.name;
+  const avatarUrl = user.profile.avatar;
+
+  return `${displayName} (${avatarUrl})`;
+}
+
+export async function getTeamMembers(teamId: string): Promise<string[]> {
+  const data = await fetchUsers(`/api/teams/${teamId}/members`);
+  return data.users.map((u) => u.name); // one name per user, in response order
+}