Skip to content

Commit 0289ded

Browse files
sjarmak and claude committed
fix: hydrate oracle fields + add repo-name normalization to evaluator
Two bugs caused all 16 zero-score baseline results for tasks 121-141:

1. Empty oracle in task_spec.json: generate_mcp_unique_tasks.py (run for chown optimization) overwrote the hydrated oracle sections with empty arrays. Re-ran hydrate_task_specs.py to repopulate them from oracle_answer.json.

2. Repo name mismatch: the oracle uses mirror names (sg-evals/jdk--742e735d) while agents use upstream names (openjdk/jdk). Added three-tier matching to oracle_checks.py: exact match → normalized-repo match → path-only fallback. All comparison functions were updated (file_set_match, symbol_resolution, dependency_chain).

Verified: task 128 scores 0.9166 (was 0.0000); all doctests pass.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 8248c1c commit 0289ded

File tree

146 files changed

+16375
-6116
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

146 files changed

+16375
-6116
lines changed

benchmarks/ccb_mcp_compliance/ccx-compliance-051/tests/oracle_checks.py

Lines changed: 160 additions & 79 deletions
Original file line numberDiff line numberDiff line change
@@ -21,46 +21,115 @@
2121
import argparse
2222
import json
2323
import os
24+
import re
2425
import subprocess
2526
import sys
2627
from pathlib import Path
27-
import re
2828
from typing import Any, Dict, List, Optional
2929

30-
# Hosting prefixes that Sourcegraph MCP tools prepend to repo names.
31-
# Agents sometimes forget to strip these; normalize before comparison.
32-
_HOSTING_PREFIX_RE = re.compile(r"^(?:github\.com|gitlab\.com|bitbucket\.org)/")
33-
34-
# sg-evals mirrors → upstream canonical names.
35-
# Allows oracle answers and agent answers to use either name interchangeably.
36-
_SG_MIRROR_ALIASES = {
37-
"sg-evals/kubernetes-client-go": "kubernetes/client-go",
38-
"sg-evals/kubernetes-api": "kubernetes/api",
39-
"sg-evals/etcd-io-etcd": "etcd-io/etcd",
40-
"sg-evals/expressjs-express": "expressjs/express",
41-
"sg-evals/grafana-loki": "grafana/loki",
42-
"sg-evals/grafana-mimir": "grafana/mimir",
43-
"sg-evals/prisma-prisma": "prisma/prisma",
44-
"sg-evals/lodash": "lodash/lodash",
45-
"sg-evals/numpy": "numpy/numpy",
46-
"sg-evals/scipy": "scipy/scipy",
47-
"sg-evals/grafana": "grafana/grafana",
48-
"sg-evals/prometheus": "prometheus/prometheus",
49-
}
50-
51-
52-
def _normalize_repo(repo: str) -> str:
53-
"""Strip hosting-provider prefix and resolve sg-evals aliases.
54-
55-
>>> _normalize_repo("github.com/sg-evals/kubernetes-client-go")
56-
'kubernetes/client-go'
57-
>>> _normalize_repo("sg-evals/kubernetes-client-go")
58-
'kubernetes/client-go'
59-
>>> _normalize_repo("etcd-io/etcd")
60-
'etcd-io/etcd'
30+
31+
# ---------------------------------------------------------------------------
32+
# Repo-name normalization
33+
# ---------------------------------------------------------------------------
34+
# Oracle uses mirror names like "sg-evals/firefox--871325b8" while agents use
35+
# upstream names like "mozilla-firefox/firefox" or "openjdk/jdk". Normalise
36+
# both sides so that matching works regardless of which convention is used.
37+
38+
_MIRROR_HASH_RE = re.compile(r"--[0-9a-f]{6,}$")
39+
40+
41+
def _normalize_repo(name: str) -> str:
42+
"""Reduce a repo identifier to its base name for fuzzy comparison.
43+
44+
Examples:
45+
sg-evals/firefox--871325b8 -> firefox
46+
sg-evals/jdk--742e735d -> jdk
47+
openjdk/jdk -> jdk
48+
chromium/chromium -> chromium
49+
rust-lang/rust -> rust
50+
arangodb/arangodb -> arangodb
51+
"""
52+
# Strip leading org/ prefix (take the part after the last '/')
53+
base = name.rsplit("/", 1)[-1]
54+
# Strip mirror hash suffix (--<hex>)
55+
base = _MIRROR_HASH_RE.sub("", base)
56+
return base.lower()
57+
58+
59+
def _match_items(
60+
answer_items: List[Dict[str, str]],
61+
oracle_items: List[Dict[str, str]],
62+
key_fields: List[str],
63+
) -> tuple:
64+
"""Two-pass matching: exact first, then path-only fallback.
65+
66+
Returns (matched, missing, extra) as sets of tuples built from key_fields.
6167
"""
62-
repo = _HOSTING_PREFIX_RE.sub("", repo)
63-
return _SG_MIRROR_ALIASES.get(repo, repo)
68+
def _exact_key(item: Dict[str, str]) -> tuple:
69+
return tuple(item.get(k, "") for k in key_fields)
70+
71+
def _norm_key(item: Dict[str, str]) -> tuple:
72+
"""Normalized key: use _normalize_repo for repo field, rest as-is."""
73+
return tuple(
74+
_normalize_repo(item.get(k, "")) if k == "repo" else item.get(k, "")
75+
for k in key_fields
76+
)
77+
78+
def _path_key(item: Dict[str, str]) -> tuple:
79+
"""Path-only key: skip repo, keep remaining fields."""
80+
return tuple(item.get(k, "") for k in key_fields if k != "repo")
81+
82+
# Pass 1: exact (repo, path, ...) match
83+
oracle_exact = {_exact_key(f): f for f in oracle_items}
84+
answer_exact = {_exact_key(f): f for f in answer_items}
85+
exact_matched = set(oracle_exact.keys()) & set(answer_exact.keys())
86+
87+
# Pass 2: normalized-repo match on remaining items
88+
remaining_oracle = {k: v for k, v in oracle_exact.items() if k not in exact_matched}
89+
remaining_answer = {k: v for k, v in answer_exact.items() if k not in exact_matched}
90+
91+
norm_oracle = {} # norm_key -> exact_key
92+
for ek, item in remaining_oracle.items():
93+
norm_oracle[_norm_key(item)] = ek
94+
norm_answer = {}
95+
for ek, item in remaining_answer.items():
96+
norm_answer[_norm_key(item)] = ek
97+
98+
norm_matched_oracle = set()
99+
norm_matched_answer = set()
100+
for nk in set(norm_oracle.keys()) & set(norm_answer.keys()):
101+
norm_matched_oracle.add(norm_oracle[nk])
102+
norm_matched_answer.add(norm_answer[nk])
103+
104+
# Pass 3: path-only fallback for still-unmatched items
105+
still_oracle = {k: v for k, v in remaining_oracle.items()
106+
if k not in norm_matched_oracle}
107+
still_answer = {k: v for k, v in remaining_answer.items()
108+
if k not in norm_matched_answer}
109+
110+
path_oracle = {} # path_key -> exact_key
111+
for ek, item in still_oracle.items():
112+
pk = _path_key(item)
113+
path_oracle[pk] = ek
114+
path_answer = {}
115+
for ek, item in still_answer.items():
116+
pk = _path_key(item)
117+
path_answer[pk] = ek
118+
119+
path_matched_oracle = set()
120+
path_matched_answer = set()
121+
for pk in set(path_oracle.keys()) & set(path_answer.keys()):
122+
path_matched_oracle.add(path_oracle[pk])
123+
path_matched_answer.add(path_answer[pk])
124+
125+
# Combine all matched keys (using oracle keys as canonical)
126+
all_matched_oracle = exact_matched | norm_matched_oracle | path_matched_oracle
127+
all_matched_answer = exact_matched | norm_matched_answer | path_matched_answer
128+
129+
missing = set(oracle_exact.keys()) - all_matched_oracle
130+
extra = set(answer_exact.keys()) - all_matched_answer
131+
132+
return all_matched_oracle, missing, extra
64133

65134

66135
def check_file_set_match(
@@ -70,7 +139,8 @@ def check_file_set_match(
70139
"""Check overlap between agent-reported files and oracle files.
71140
72141
Each file item is a dict with at least {"repo", "path"}.
73-
Matching is by (repo, path) tuple — both must match.
142+
Matching uses two-pass repo normalization: exact match first, then
143+
normalised-repo and path-only fallback for mirror/upstream name mismatches.
74144
75145
Returns raw scores without thresholds.
76146
@@ -82,21 +152,21 @@ def check_file_set_match(
82152
0.5
83153
>>> result["precision"]
84154
1.0
85-
>>> result["matched"]
86-
[{'repo': 'a/b', 'path': 'x.go'}]
87-
"""
88-
def _key(item: Dict[str, str]) -> tuple:
89-
return (_normalize_repo(item.get("repo", "")), item.get("path", ""))
90155
91-
oracle_set = {_key(f) for f in oracle_files}
92-
answer_set = {_key(f) for f in answer_files}
156+
>>> result = check_file_set_match(
157+
... [{"repo": "openjdk/jdk", "path": "src/Foo.java"}],
158+
... [{"repo": "sg-evals/jdk--742e735d", "path": "src/Foo.java"}],
159+
... )
160+
>>> result["f1"]
161+
1.0
162+
"""
163+
matched, missing, extra = _match_items(answer_files, oracle_files, ["repo", "path"])
93164

94-
matched = oracle_set & answer_set
95-
missing = oracle_set - answer_set
96-
extra = answer_set - oracle_set
165+
n_oracle = len({(f.get("repo", ""), f.get("path", "")) for f in oracle_files})
166+
n_answer = len({(f.get("repo", ""), f.get("path", "")) for f in answer_files})
97167

98-
recall = len(matched) / len(oracle_set) if oracle_set else 1.0
99-
precision = len(matched) / len(answer_set) if answer_set else 0.0
168+
recall = len(matched) / n_oracle if n_oracle else 1.0
169+
precision = len(matched) / n_answer if n_answer else 0.0
100170
f1 = (2 * precision * recall / (precision + recall)) if (precision + recall) > 0 else 0.0
101171

102172
return {
@@ -116,7 +186,7 @@ def check_symbol_resolution(
116186
"""Check overlap between agent-identified symbols and oracle symbols.
117187
118188
Each symbol item has at least {"repo", "path", "symbol"}.
119-
Matching is by (repo, path, symbol) tuple.
189+
Matching uses two-pass repo normalization (see _match_items).
120190
121191
>>> result = check_symbol_resolution(
122192
... [{"repo": "a/b", "path": "x.go", "symbol": "Foo"}],
@@ -125,18 +195,15 @@ def check_symbol_resolution(
125195
>>> result["recall"]
126196
1.0
127197
"""
128-
def _key(item: Dict[str, str]) -> tuple:
129-
return (_normalize_repo(item.get("repo", "")), item.get("path", ""), item.get("symbol", ""))
130-
131-
oracle_set = {_key(s) for s in oracle_symbols}
132-
answer_set = {_key(s) for s in answer_symbols}
198+
matched, missing, extra = _match_items(
199+
answer_symbols, oracle_symbols, ["repo", "path", "symbol"]
200+
)
133201

134-
matched = oracle_set & answer_set
135-
missing = oracle_set - answer_set
136-
extra = answer_set - oracle_set
202+
n_oracle = len({(s.get("repo", ""), s.get("path", ""), s.get("symbol", "")) for s in oracle_symbols})
203+
n_answer = len({(s.get("repo", ""), s.get("path", ""), s.get("symbol", "")) for s in answer_symbols})
137204

138-
recall = len(matched) / len(oracle_set) if oracle_set else 1.0
139-
precision = len(matched) / len(answer_set) if answer_set else 0.0
205+
recall = len(matched) / n_oracle if n_oracle else 1.0
206+
precision = len(matched) / n_answer if n_answer else 0.0
140207

141208
return {
142209
"matched": [{"repo": r, "path": p, "symbol": s} for r, p, s in sorted(matched)],
@@ -155,7 +222,7 @@ def check_dependency_chain(
155222
156223
Each step is {"repo", "path", "symbol"}. Order matters — we check both
157224
set membership (did agent find the step?) and order (is the sequence
158-
correct?).
225+
correct?). Uses repo-name normalization for matching.
159226
160227
>>> result = check_dependency_chain(
161228
... [{"repo": "a", "path": "x", "symbol": "f1"},
@@ -168,30 +235,50 @@ def check_dependency_chain(
168235
>>> result["chain_recall"]
169236
1.0
170237
"""
171-
def _key(item: Dict[str, str]) -> tuple:
172-
return (_normalize_repo(item.get("repo", "")), item.get("path", ""), item.get("symbol", ""))
238+
def _norm_key(item: Dict[str, str]) -> tuple:
239+
return (_normalize_repo(item.get("repo", "")),
240+
item.get("path", ""),
241+
item.get("symbol", ""))
173242

174-
oracle_keys = [_key(s) for s in oracle_chain]
175-
answer_keys = [_key(s) for s in answer_chain]
243+
def _path_key(item: Dict[str, str]) -> tuple:
244+
return (item.get("path", ""), item.get("symbol", ""))
176245

177-
oracle_set = set(oracle_keys)
178-
answer_set = set(answer_keys)
246+
# Use normalised keys for set matching
247+
oracle_norm = [_norm_key(s) for s in oracle_chain]
248+
answer_norm = [_norm_key(s) for s in answer_chain]
179249

250+
oracle_set = set(oracle_norm)
251+
answer_set = set(answer_norm)
180252
matched = oracle_set & answer_set
181-
missing = oracle_set - answer_set
182253

183-
# Check order: extract the subsequence of answer that matches oracle steps
184-
# and verify it preserves the oracle ordering.
185-
oracle_positions = {k: i for i, k in enumerate(oracle_keys)}
186-
matched_in_order = [k for k in answer_keys if k in oracle_set]
254+
# Path-only fallback for remaining items
255+
remaining_oracle = oracle_set - matched
256+
remaining_answer = answer_set - matched
257+
path_oracle = {_path_key({"path": k[1], "symbol": k[2]}): k for k in remaining_oracle}
258+
path_answer = {_path_key({"path": k[1], "symbol": k[2]}): k for k in remaining_answer}
259+
path_matched = set(path_oracle.keys()) & set(path_answer.keys())
260+
for pk in path_matched:
261+
matched.add(path_oracle[pk])
262+
263+
missing_set = oracle_set - matched
264+
missing = sorted(missing_set)
265+
266+
# Check order using normalised keys
267+
oracle_positions = {k: i for i, k in enumerate(oracle_norm)}
268+
matched_in_order = [k for k in answer_norm if k in matched]
269+
# Also try path-only for order check
270+
if not matched_in_order:
271+
answer_path = [_path_key({"path": k[1], "symbol": k[2]}) for k in answer_norm]
272+
oracle_path_map = {_path_key({"path": k[1], "symbol": k[2]}): k for k in oracle_norm}
273+
matched_in_order = [oracle_path_map[pk] for pk in answer_path if pk in oracle_path_map]
187274
positions = [oracle_positions[k] for k in matched_in_order if k in oracle_positions]
188275
order_correct = positions == sorted(positions) and len(matched) == len(oracle_set)
189276

190277
chain_recall = len(matched) / len(oracle_set) if oracle_set else 1.0
191278

192279
return {
193280
"matched_steps": len(matched),
194-
"missing_steps": [{"repo": r, "path": p, "symbol": s} for r, p, s in sorted(missing)],
281+
"missing_steps": [{"repo": r, "path": p, "symbol": s} for r, p, s in missing],
195282
"order_correct": order_correct,
196283
"chain_recall": round(chain_recall, 4),
197284
}
@@ -420,16 +507,10 @@ def run_all_checks(
420507
oracle = spec.get("artifacts", {}).get("oracle", {})
421508
eval_checks = spec.get("evaluation", {}).get("checks", [])
422509

423-
# If answer is a dict with "text" key, extract the text for text-based checks.
424-
# Also include the full JSON serialization so that provenance citations in
425-
# structured fields (e.g. chain[].repo) are found by substring matching.
426-
# This prevents penalizing agents that correctly cite repos in structured
427-
# data but use natural language (e.g. "Loki") in the narrative text.
510+
# If answer is a dict with "text" key, extract the text for text-based checks
428511
answer_text = ""
429512
if isinstance(answer_data, dict):
430-
narrative = answer_data.get("text", answer_data.get("answer", ""))
431-
full_json = json.dumps(answer_data)
432-
answer_text = f"{narrative}\n{full_json}" if narrative else full_json
513+
answer_text = answer_data.get("text", answer_data.get("answer", json.dumps(answer_data)))
433514
elif isinstance(answer_data, str):
434515
answer_text = answer_data
435516

0 commit comments

Comments
 (0)