Skip to content

Commit a596e26

Browse files
sjarmak and claude committed
fix: hydrate oracle fields + normalize repo names for tasks 121-141
All 21 MCP-unique task_spec.json files had empty oracle arrays, causing the verifier to score 0.0 regardless of agent quality. This hydrates required_files, required_symbols, and dependency_chains from each task's oracle_answer.json and enriches evaluation checks to match oracle_check_types.

Also adds _normalize_repo() to oracle_checks.py to strip the github.com/ prefix from repo names, fixing the mismatch between agent-reported repos (github.com/sg-evals/...) and oracle repos (sg-evals/...).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent f7f92c4 commit a596e26

File tree

49 files changed

+3318
-432
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below for content that may be hidden.

49 files changed

+3318
-432
lines changed

AGENTS.md

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -44,6 +44,7 @@ full operations manual.
4444

4545
## Maintenance
4646
- Root and local `AGENTS.md` / `CLAUDE.md` files are generated from sources in `docs/ops/`.
47+
- `docs/START_HERE_BY_TASK.md` is generated from `docs/ops/task_routes.json`.
4748
- Regenerate after edits (single command):
4849
```bash
4950
python3 scripts/refresh_agent_navigation.py

benchmarks/ccb_mcp_compliance/ccx-compliance-124/tests/oracle_checks.py

Lines changed: 14 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -27,6 +27,17 @@
2727
from typing import Any, Dict, List, Optional
2828

2929

30+
def _normalize_repo(repo: str) -> str:
31+
"""Normalize repo name by stripping github.com/ prefix.
32+
33+
Agents may report repos as 'github.com/sg-evals/foo' while oracle
34+
uses 'sg-evals/foo'. This normalizes both to a common form.
35+
"""
36+
if repo.startswith("github.com/"):
37+
return repo[len("github.com/"):]
38+
return repo
39+
40+
3041
def check_file_set_match(
3142
answer_files: List[Dict[str, str]],
3243
oracle_files: List[Dict[str, str]],
@@ -50,7 +61,7 @@ def check_file_set_match(
5061
[{'repo': 'a/b', 'path': 'x.go'}]
5162
"""
5263
def _key(item: Dict[str, str]) -> tuple:
53-
return (item.get("repo", ""), item.get("path", ""))
64+
return (_normalize_repo(item.get("repo", "")), item.get("path", ""))
5465

5566
oracle_set = {_key(f) for f in oracle_files}
5667
answer_set = {_key(f) for f in answer_files}
@@ -90,7 +101,7 @@ def check_symbol_resolution(
90101
1.0
91102
"""
92103
def _key(item: Dict[str, str]) -> tuple:
93-
return (item.get("repo", ""), item.get("path", ""), item.get("symbol", ""))
104+
return (_normalize_repo(item.get("repo", "")), item.get("path", ""), item.get("symbol", ""))
94105

95106
oracle_set = {_key(s) for s in oracle_symbols}
96107
answer_set = {_key(s) for s in answer_symbols}
@@ -133,7 +144,7 @@ def check_dependency_chain(
133144
1.0
134145
"""
135146
def _key(item: Dict[str, str]) -> tuple:
136-
return (item.get("repo", ""), item.get("path", ""), item.get("symbol", ""))
147+
return (_normalize_repo(item.get("repo", "")), item.get("path", ""), item.get("symbol", ""))
137148

138149
oracle_keys = [_key(s) for s in oracle_chain]
139150
answer_keys = [_key(s) for s in answer_chain]

benchmarks/ccb_mcp_compliance/ccx-compliance-124/tests/task_spec.json

Lines changed: 191 additions & 16 deletions
Original file line number | Diff line number | Diff line change
@@ -5,35 +5,210 @@
55
"category": "F",
66
"mcp_suite": "ccb_mcp_compliance",
77
"prd": {
8-
"user_story": "As a developer, I want to: Audit the Content Security Policy (CSP) enforcement infrastructure in Firefox. Find all C++ source files in `mozilla-firefox/firefox` under `dom/security/` that implement CSP parsing, evaluation, and violation reporting. Specifically: 1. The file that defines `nsCSPParser` — the CSP directive parser. 2. The file that implements `nsCSPContext` — the main CSP context that holds policies. 3. The file that implements CSP violation reporting (`nsCSPUtils` or similar). 4. The header file that declares the `nsIContentSecurityPolicy` XPCOM interface. 5. The file under `dom/security/` that performs script-src evaluation for inline scripts. Report each file path and its primary class or function.",
9-
"constraints": ["Provide specific file paths and repository names in your answer.", "Write your findings to /workspace/answer.json."],
8+
"user_story": "As a developer, I want to: Audit the Content Security Policy (CSP) enforcement infrastructure in Firefox. Find all C++ source files in `mozilla-firefox/firefox` under `dom/security/` that implement CSP parsing, evaluation, and violation reporting. Specifically: 1. The file that defines `nsCSPParser` \u2014 the CSP directive parser. 2. The file that implements `nsCSPContext` \u2014 the main CSP context that holds policies. 3. The file that implements CSP violation reporting (`nsCSPUtils` or similar). 4. The header file that declares the `nsIContentSecurityPolicy` XPCOM interface. 5. The file under `dom/security/` that performs script-src evaluation for inline scripts. Report each file path and its primary class or function.",
9+
"constraints": [
10+
"Provide specific file paths and repository names in your answer.",
11+
"Write your findings to /workspace/answer.json."
12+
],
1013
"success_definition": "Agent successfully identifies relevant files and symbols across all repos in the mozilla-firefox fixture.",
11-
"seed_prompt": "Audit the Content Security Policy (CSP) enforcement infrastructure in Firefox. Find all C++ source files in `mozilla-firefox/firefox` under `dom/security/` that implement CSP parsing, evaluation, and violation reporting. Specifically: 1. The file that defines `nsCSPParser` the CSP directive parser. 2. The file that implements `nsCSPContext` the main CSP context that holds policies. 3. The file that implements CSP violation reporting (`nsCSPUtils` or similar). 4. The header file that declares the `nsIContentSecurityPolicy` XPCOM interface. 5. The file under `dom/security/` that performs script-src evaluation for inline scripts. Report each file path and its primary class or function."
14+
"seed_prompt": "Audit the Content Security Policy (CSP) enforcement infrastructure in Firefox. Find all C++ source files in `mozilla-firefox/firefox` under `dom/security/` that implement CSP parsing, evaluation, and violation reporting. Specifically: 1. The file that defines `nsCSPParser` \u2014 the CSP directive parser. 2. The file that implements `nsCSPContext` \u2014 the main CSP context that holds policies. 3. The file that implements CSP violation reporting (`nsCSPUtils` or similar). 4. The header file that declares the `nsIContentSecurityPolicy` XPCOM interface. 5. The file under `dom/security/` that performs script-src evaluation for inline scripts. Report each file path and its primary class or function."
1215
},
1316
"artifacts": {
1417
"repo_set_id": "mozilla-firefox",
1518
"oracle": {
16-
"required_files": [],
17-
"required_symbols": [],
19+
"required_files": [
20+
{
21+
"repo": "sg-evals/firefox--871325b8",
22+
"path": "dom/security/nsCSPParser.h"
23+
},
24+
{
25+
"repo": "sg-evals/firefox--871325b8",
26+
"path": "dom/security/nsCSPParser.cpp"
27+
},
28+
{
29+
"repo": "sg-evals/firefox--871325b8",
30+
"path": "dom/security/nsCSPContext.h"
31+
},
32+
{
33+
"repo": "sg-evals/firefox--871325b8",
34+
"path": "dom/security/nsCSPContext.cpp"
35+
},
36+
{
37+
"repo": "sg-evals/firefox--871325b8",
38+
"path": "dom/security/nsCSPUtils.h"
39+
},
40+
{
41+
"repo": "sg-evals/firefox--871325b8",
42+
"path": "dom/security/nsCSPUtils.cpp"
43+
},
44+
{
45+
"repo": "sg-evals/firefox--871325b8",
46+
"path": "dom/security/nsCSPService.h"
47+
},
48+
{
49+
"repo": "sg-evals/firefox--871325b8",
50+
"path": "dom/security/nsCSPService.cpp"
51+
},
52+
{
53+
"repo": "sg-evals/firefox--871325b8",
54+
"path": "dom/security/CSPViolationData.h"
55+
},
56+
{
57+
"repo": "sg-evals/firefox--871325b8",
58+
"path": "dom/security/CSPViolationData.cpp"
59+
},
60+
{
61+
"repo": "sg-evals/firefox--871325b8",
62+
"path": "dom/interfaces/security/nsIContentSecurityPolicy.idl"
63+
},
64+
{
65+
"repo": "sg-evals/firefox--871325b8",
66+
"path": "dom/security/moz.build"
67+
},
68+
{
69+
"repo": "sg-evals/firefox--871325b8",
70+
"path": "dom/security/nsContentSecurityUtils.cpp"
71+
},
72+
{
73+
"repo": "sg-evals/firefox--871325b8",
74+
"path": "dom/security/nsMixedContentBlocker.cpp"
75+
},
76+
{
77+
"repo": "sg-evals/firefox--871325b8",
78+
"path": "dom/security/SRIMetadata.cpp"
79+
},
80+
{
81+
"repo": "sg-evals/firefox--871325b8",
82+
"path": "dom/security/PolicyContainer.cpp"
83+
}
84+
],
85+
"required_symbols": [
86+
{
87+
"repo": "sg-evals/firefox--871325b8",
88+
"path": "dom/security/nsCSPParser.h",
89+
"symbol": "nsCSPParser"
90+
},
91+
{
92+
"repo": "sg-evals/firefox--871325b8",
93+
"path": "dom/security/nsCSPParser.h",
94+
"symbol": "parseContentSecurityPolicy"
95+
},
96+
{
97+
"repo": "sg-evals/firefox--871325b8",
98+
"path": "dom/security/nsCSPContext.h",
99+
"symbol": "nsCSPContext"
100+
},
101+
{
102+
"repo": "sg-evals/firefox--871325b8",
103+
"path": "dom/security/nsCSPContext.cpp",
104+
"symbol": "ShouldLoad"
105+
},
106+
{
107+
"repo": "sg-evals/firefox--871325b8",
108+
"path": "dom/security/nsCSPUtils.h",
109+
"symbol": "nsCSPUtils"
110+
},
111+
{
112+
"repo": "sg-evals/firefox--871325b8",
113+
"path": "dom/security/nsCSPService.h",
114+
"symbol": "nsCSPService"
115+
},
116+
{
117+
"repo": "sg-evals/firefox--871325b8",
118+
"path": "dom/security/CSPViolationData.h",
119+
"symbol": "CSPViolationData"
120+
},
121+
{
122+
"repo": "sg-evals/firefox--871325b8",
123+
"path": "dom/interfaces/security/nsIContentSecurityPolicy.idl",
124+
"symbol": "nsIContentSecurityPolicy"
125+
}
126+
],
18127
"required_references": [],
19-
"dependency_chains": []
128+
"dependency_chains": [
129+
{
130+
"steps": [
131+
{
132+
"repo": "sg-evals/firefox--871325b8",
133+
"path": "dom/interfaces/security/nsIContentSecurityPolicy.idl",
134+
"symbol": "nsIContentSecurityPolicy"
135+
},
136+
{
137+
"repo": "sg-evals/firefox--871325b8",
138+
"path": "dom/security/nsCSPParser.h",
139+
"symbol": "nsCSPParser"
140+
},
141+
{
142+
"repo": "sg-evals/firefox--871325b8",
143+
"path": "dom/security/nsCSPParser.cpp",
144+
"symbol": "parseContentSecurityPolicy"
145+
},
146+
{
147+
"repo": "sg-evals/firefox--871325b8",
148+
"path": "dom/security/nsCSPContext.h",
149+
"symbol": "nsCSPContext"
150+
},
151+
{
152+
"repo": "sg-evals/firefox--871325b8",
153+
"path": "dom/security/nsCSPContext.cpp",
154+
"symbol": "ShouldLoad"
155+
},
156+
{
157+
"repo": "sg-evals/firefox--871325b8",
158+
"path": "dom/security/nsCSPService.cpp",
159+
"symbol": "nsCSPService"
160+
},
161+
{
162+
"repo": "sg-evals/firefox--871325b8",
163+
"path": "dom/security/nsCSPUtils.cpp",
164+
"symbol": "nsCSPUtils"
165+
},
166+
{
167+
"repo": "sg-evals/firefox--871325b8",
168+
"path": "dom/security/CSPViolationData.h",
169+
"symbol": "CSPViolationData"
170+
}
171+
]
172+
}
173+
]
20174
}
21175
},
22176
"evaluation": {
23-
"modes": ["deterministic"],
177+
"modes": [
178+
"deterministic"
179+
],
24180
"checks": [
25-
{
26-
"type": "file_set_match",
27-
"params": {
28-
"search_pattern": "",
29-
"file_filter": ""
30-
}
31-
}
32-
],
181+
{
182+
"type": "file_set_match",
183+
"params": {
184+
"search_pattern": "",
185+
"file_filter": ""
186+
}
187+
},
188+
{
189+
"type": "keyword_presence",
190+
"params": {
191+
"required_keywords": [
192+
"nsCSPParser",
193+
"parseContentSecurityPolicy",
194+
"nsCSPContext",
195+
"ShouldLoad",
196+
"nsCSPUtils",
197+
"nsCSPService",
198+
"CSPViolationData",
199+
"nsIContentSecurityPolicy"
200+
]
201+
}
202+
}
203+
],
33204
"eval_script": "/tests/eval.sh",
34205
"pass_exit_code": 0
35206
},
36207
"logging": {
37-
"required_metrics": ["oracle_coverage", "time_to_first_oracle_hit_ms", "unique_repos_touched"]
208+
"required_metrics": [
209+
"oracle_coverage",
210+
"time_to_first_oracle_hit_ms",
211+
"unique_repos_touched"
212+
]
38213
}
39214
}

benchmarks/ccb_mcp_crossorg/ccx-crossorg-121/tests/oracle_checks.py

Lines changed: 14 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -27,6 +27,17 @@
2727
from typing import Any, Dict, List, Optional
2828

2929

30+
def _normalize_repo(repo: str) -> str:
31+
"""Normalize repo name by stripping github.com/ prefix.
32+
33+
Agents may report repos as 'github.com/sg-evals/foo' while oracle
34+
uses 'sg-evals/foo'. This normalizes both to a common form.
35+
"""
36+
if repo.startswith("github.com/"):
37+
return repo[len("github.com/"):]
38+
return repo
39+
40+
3041
def check_file_set_match(
3142
answer_files: List[Dict[str, str]],
3243
oracle_files: List[Dict[str, str]],
@@ -50,7 +61,7 @@ def check_file_set_match(
5061
[{'repo': 'a/b', 'path': 'x.go'}]
5162
"""
5263
def _key(item: Dict[str, str]) -> tuple:
53-
return (item.get("repo", ""), item.get("path", ""))
64+
return (_normalize_repo(item.get("repo", "")), item.get("path", ""))
5465

5566
oracle_set = {_key(f) for f in oracle_files}
5667
answer_set = {_key(f) for f in answer_files}
@@ -90,7 +101,7 @@ def check_symbol_resolution(
90101
1.0
91102
"""
92103
def _key(item: Dict[str, str]) -> tuple:
93-
return (item.get("repo", ""), item.get("path", ""), item.get("symbol", ""))
104+
return (_normalize_repo(item.get("repo", "")), item.get("path", ""), item.get("symbol", ""))
94105

95106
oracle_set = {_key(s) for s in oracle_symbols}
96107
answer_set = {_key(s) for s in answer_symbols}
@@ -133,7 +144,7 @@ def check_dependency_chain(
133144
1.0
134145
"""
135146
def _key(item: Dict[str, str]) -> tuple:
136-
return (item.get("repo", ""), item.get("path", ""), item.get("symbol", ""))
147+
return (_normalize_repo(item.get("repo", "")), item.get("path", ""), item.get("symbol", ""))
137148

138149
oracle_keys = [_key(s) for s in oracle_chain]
139150
answer_keys = [_key(s) for s in answer_chain]

0 commit comments

Comments
 (0)