2121import argparse
2222import json
2323import os
24+ import re
2425import subprocess
2526import sys
2627from pathlib import Path
27- import re
2828from typing import Any , Dict , List , Optional
2929
30- # Hosting prefixes that Sourcegraph MCP tools prepend to repo names.
31- # Agents sometimes forget to strip these; normalize before comparison.
32- _HOSTING_PREFIX_RE = re .compile (r"^(?:github\.com|gitlab\.com|bitbucket\.org)/" )
33-
34- # sg-evals mirrors → upstream canonical names.
35- # Allows oracle answers and agent answers to use either name interchangeably.
36- _SG_MIRROR_ALIASES = {
37- "sg-evals/kubernetes-client-go" : "kubernetes/client-go" ,
38- "sg-evals/kubernetes-api" : "kubernetes/api" ,
39- "sg-evals/etcd-io-etcd" : "etcd-io/etcd" ,
40- "sg-evals/expressjs-express" : "expressjs/express" ,
41- "sg-evals/grafana-loki" : "grafana/loki" ,
42- "sg-evals/grafana-mimir" : "grafana/mimir" ,
43- "sg-evals/prisma-prisma" : "prisma/prisma" ,
44- "sg-evals/lodash" : "lodash/lodash" ,
45- "sg-evals/numpy" : "numpy/numpy" ,
46- "sg-evals/scipy" : "scipy/scipy" ,
47- "sg-evals/grafana" : "grafana/grafana" ,
48- "sg-evals/prometheus" : "prometheus/prometheus" ,
49- }
50-
51-
52- def _normalize_repo (repo : str ) -> str :
53- """Strip hosting-provider prefix and resolve sg-evals aliases.
54-
55- >>> _normalize_repo("github.com/sg-evals/kubernetes-client-go")
56- 'kubernetes/client-go'
57- >>> _normalize_repo("sg-evals/kubernetes-client-go")
58- 'kubernetes/client-go'
59- >>> _normalize_repo("etcd-io/etcd")
60- 'etcd-io/etcd'
30+
31+ # ---------------------------------------------------------------------------
32+ # Repo-name normalization
33+ # ---------------------------------------------------------------------------
34+ # Oracle uses mirror names like "sg-evals/firefox--871325b8" while agents use
35+ # upstream names like "mozilla-firefox/firefox" or "openjdk/jdk". Normalise
36+ # both sides so that matching works regardless of which convention is used.
37+
38+ _MIRROR_HASH_RE = re .compile (r"--[0-9a-f]{6,}$" )
39+
40+
41+ def _normalize_repo (name : str ) -> str :
42+ """Reduce a repo identifier to its base name for fuzzy comparison.
43+
44+ Examples:
45+ sg-evals/firefox--871325b8 -> firefox
46+ sg-evals/jdk--742e735d -> jdk
47+ openjdk/jdk -> jdk
48+ chromium/chromium -> chromium
49+ rust-lang/rust -> rust
50+ arangodb/arangodb -> arangodb
51+ """
52+ # Strip leading org/ prefix (take the part after the last '/')
53+ base = name .rsplit ("/" , 1 )[- 1 ]
54+ # Strip mirror hash suffix (--<hex>)
55+ base = _MIRROR_HASH_RE .sub ("" , base )
56+ return base .lower ()
57+
58+
59+ def _match_items (
60+ answer_items : List [Dict [str , str ]],
61+ oracle_items : List [Dict [str , str ]],
62+ key_fields : List [str ],
63+ ) -> tuple :
64+ """Two-pass matching: exact first, then path-only fallback.
65+
66+ Returns (matched, missing, extra) as sets of tuples built from key_fields.
6167 """
62- repo = _HOSTING_PREFIX_RE .sub ("" , repo )
63- return _SG_MIRROR_ALIASES .get (repo , repo )
68+ def _exact_key (item : Dict [str , str ]) -> tuple :
69+ return tuple (item .get (k , "" ) for k in key_fields )
70+
71+ def _norm_key (item : Dict [str , str ]) -> tuple :
72+ """Normalized key: use _normalize_repo for repo field, rest as-is."""
73+ return tuple (
74+ _normalize_repo (item .get (k , "" )) if k == "repo" else item .get (k , "" )
75+ for k in key_fields
76+ )
77+
78+ def _path_key (item : Dict [str , str ]) -> tuple :
79+ """Path-only key: skip repo, keep remaining fields."""
80+ return tuple (item .get (k , "" ) for k in key_fields if k != "repo" )
81+
82+ # Pass 1: exact (repo, path, ...) match
83+ oracle_exact = {_exact_key (f ): f for f in oracle_items }
84+ answer_exact = {_exact_key (f ): f for f in answer_items }
85+ exact_matched = set (oracle_exact .keys ()) & set (answer_exact .keys ())
86+
87+ # Pass 2: normalized-repo match on remaining items
88+ remaining_oracle = {k : v for k , v in oracle_exact .items () if k not in exact_matched }
89+ remaining_answer = {k : v for k , v in answer_exact .items () if k not in exact_matched }
90+
91+ norm_oracle = {} # norm_key -> exact_key
92+ for ek , item in remaining_oracle .items ():
93+ norm_oracle [_norm_key (item )] = ek
94+ norm_answer = {}
95+ for ek , item in remaining_answer .items ():
96+ norm_answer [_norm_key (item )] = ek
97+
98+ norm_matched_oracle = set ()
99+ norm_matched_answer = set ()
100+ for nk in set (norm_oracle .keys ()) & set (norm_answer .keys ()):
101+ norm_matched_oracle .add (norm_oracle [nk ])
102+ norm_matched_answer .add (norm_answer [nk ])
103+
104+ # Pass 3: path-only fallback for still-unmatched items
105+ still_oracle = {k : v for k , v in remaining_oracle .items ()
106+ if k not in norm_matched_oracle }
107+ still_answer = {k : v for k , v in remaining_answer .items ()
108+ if k not in norm_matched_answer }
109+
110+ path_oracle = {} # path_key -> exact_key
111+ for ek , item in still_oracle .items ():
112+ pk = _path_key (item )
113+ path_oracle [pk ] = ek
114+ path_answer = {}
115+ for ek , item in still_answer .items ():
116+ pk = _path_key (item )
117+ path_answer [pk ] = ek
118+
119+ path_matched_oracle = set ()
120+ path_matched_answer = set ()
121+ for pk in set (path_oracle .keys ()) & set (path_answer .keys ()):
122+ path_matched_oracle .add (path_oracle [pk ])
123+ path_matched_answer .add (path_answer [pk ])
124+
125+ # Combine all matched keys (using oracle keys as canonical)
126+ all_matched_oracle = exact_matched | norm_matched_oracle | path_matched_oracle
127+ all_matched_answer = exact_matched | norm_matched_answer | path_matched_answer
128+
129+ missing = set (oracle_exact .keys ()) - all_matched_oracle
130+ extra = set (answer_exact .keys ()) - all_matched_answer
131+
132+ return all_matched_oracle , missing , extra
64133
65134
66135def check_file_set_match (
@@ -70,7 +139,8 @@ def check_file_set_match(
70139 """Check overlap between agent-reported files and oracle files.
71140
72141 Each file item is a dict with at least {"repo", "path"}.
73- Matching is by (repo, path) tuple — both must match.
142+ Matching uses two-pass repo normalization: exact match first, then
143+ normalised-repo and path-only fallback for mirror/upstream name mismatches.
74144
75145 Returns raw scores without thresholds.
76146
@@ -82,21 +152,21 @@ def check_file_set_match(
82152 0.5
83153 >>> result["precision"]
84154 1.0
85- >>> result["matched"]
86- [{'repo': 'a/b', 'path': 'x.go'}]
87- """
88- def _key (item : Dict [str , str ]) -> tuple :
89- return (_normalize_repo (item .get ("repo" , "" )), item .get ("path" , "" ))
90155
91- oracle_set = {_key (f ) for f in oracle_files }
92- answer_set = {_key (f ) for f in answer_files }
156+ >>> result = check_file_set_match(
157+ ... [{"repo": "openjdk/jdk", "path": "src/Foo.java"}],
158+ ... [{"repo": "sg-evals/jdk--742e735d", "path": "src/Foo.java"}],
159+ ... )
160+ >>> result["f1"]
161+ 1.0
162+ """
163+ matched , missing , extra = _match_items (answer_files , oracle_files , ["repo" , "path" ])
93164
94- matched = oracle_set & answer_set
95- missing = oracle_set - answer_set
96- extra = answer_set - oracle_set
165+ n_oracle = len ({(f .get ("repo" , "" ), f .get ("path" , "" )) for f in oracle_files })
166+ n_answer = len ({(f .get ("repo" , "" ), f .get ("path" , "" )) for f in answer_files })
97167
98- recall = len (matched ) / len ( oracle_set ) if oracle_set else 1.0
99- precision = len (matched ) / len ( answer_set ) if answer_set else 0.0
168+ recall = len (matched ) / n_oracle if n_oracle else 1.0
169+ precision = len (matched ) / n_answer if n_answer else 0.0
100170 f1 = (2 * precision * recall / (precision + recall )) if (precision + recall ) > 0 else 0.0
101171
102172 return {
@@ -116,7 +186,7 @@ def check_symbol_resolution(
116186 """Check overlap between agent-identified symbols and oracle symbols.
117187
118188 Each symbol item has at least {"repo", "path", "symbol"}.
119- Matching is by ( repo, path, symbol) tuple .
189+ Matching uses two-pass repo normalization (see _match_items) .
120190
121191 >>> result = check_symbol_resolution(
122192 ... [{"repo": "a/b", "path": "x.go", "symbol": "Foo"}],
@@ -125,18 +195,15 @@ def check_symbol_resolution(
125195 >>> result["recall"]
126196 1.0
127197 """
128- def _key (item : Dict [str , str ]) -> tuple :
129- return (_normalize_repo (item .get ("repo" , "" )), item .get ("path" , "" ), item .get ("symbol" , "" ))
130-
131- oracle_set = {_key (s ) for s in oracle_symbols }
132- answer_set = {_key (s ) for s in answer_symbols }
198+ matched , missing , extra = _match_items (
199+ answer_symbols , oracle_symbols , ["repo" , "path" , "symbol" ]
200+ )
133201
134- matched = oracle_set & answer_set
135- missing = oracle_set - answer_set
136- extra = answer_set - oracle_set
202+ n_oracle = len ({(s .get ("repo" , "" ), s .get ("path" , "" ), s .get ("symbol" , "" )) for s in oracle_symbols })
203+ n_answer = len ({(s .get ("repo" , "" ), s .get ("path" , "" ), s .get ("symbol" , "" )) for s in answer_symbols })
137204
138- recall = len (matched ) / len ( oracle_set ) if oracle_set else 1.0
139- precision = len (matched ) / len ( answer_set ) if answer_set else 0.0
205+ recall = len (matched ) / n_oracle if n_oracle else 1.0
206+ precision = len (matched ) / n_answer if n_answer else 0.0
140207
141208 return {
142209 "matched" : [{"repo" : r , "path" : p , "symbol" : s } for r , p , s in sorted (matched )],
@@ -155,7 +222,7 @@ def check_dependency_chain(
155222
156223 Each step is {"repo", "path", "symbol"}. Order matters — we check both
157224 set membership (did agent find the step?) and order (is the sequence
158- correct?).
225+ correct?). Uses repo-name normalization for matching.
159226
160227 >>> result = check_dependency_chain(
161228 ... [{"repo": "a", "path": "x", "symbol": "f1"},
@@ -168,30 +235,50 @@ def check_dependency_chain(
168235 >>> result["chain_recall"]
169236 1.0
170237 """
171- def _key (item : Dict [str , str ]) -> tuple :
172- return (_normalize_repo (item .get ("repo" , "" )), item .get ("path" , "" ), item .get ("symbol" , "" ))
238+ def _norm_key (item : Dict [str , str ]) -> tuple :
239+ return (_normalize_repo (item .get ("repo" , "" )),
240+ item .get ("path" , "" ),
241+ item .get ("symbol" , "" ))
173242
174- oracle_keys = [ _key ( s ) for s in oracle_chain ]
175- answer_keys = [ _key ( s ) for s in answer_chain ]
243+ def _path_key ( item : Dict [ str , str ]) -> tuple :
244+ return ( item . get ( "path" , "" ), item . get ( "symbol" , "" ))
176245
177- oracle_set = set (oracle_keys )
178- answer_set = set (answer_keys )
246+ # Use normalised keys for set matching
247+ oracle_norm = [_norm_key (s ) for s in oracle_chain ]
248+ answer_norm = [_norm_key (s ) for s in answer_chain ]
179249
250+ oracle_set = set (oracle_norm )
251+ answer_set = set (answer_norm )
180252 matched = oracle_set & answer_set
181- missing = oracle_set - answer_set
182253
183- # Check order: extract the subsequence of answer that matches oracle steps
184- # and verify it preserves the oracle ordering.
185- oracle_positions = {k : i for i , k in enumerate (oracle_keys )}
186- matched_in_order = [k for k in answer_keys if k in oracle_set ]
254+ # Path-only fallback for remaining items
255+ remaining_oracle = oracle_set - matched
256+ remaining_answer = answer_set - matched
257+ path_oracle = {_path_key ({"path" : k [1 ], "symbol" : k [2 ]}): k for k in remaining_oracle }
258+ path_answer = {_path_key ({"path" : k [1 ], "symbol" : k [2 ]}): k for k in remaining_answer }
259+ path_matched = set (path_oracle .keys ()) & set (path_answer .keys ())
260+ for pk in path_matched :
261+ matched .add (path_oracle [pk ])
262+
263+ missing_set = oracle_set - matched
264+ missing = sorted (missing_set )
265+
266+ # Check order using normalised keys
267+ oracle_positions = {k : i for i , k in enumerate (oracle_norm )}
268+ matched_in_order = [k for k in answer_norm if k in matched ]
269+ # Also try path-only for order check
270+ if not matched_in_order :
271+ answer_path = [_path_key ({"path" : k [1 ], "symbol" : k [2 ]}) for k in answer_norm ]
272+ oracle_path_map = {_path_key ({"path" : k [1 ], "symbol" : k [2 ]}): k for k in oracle_norm }
273+ matched_in_order = [oracle_path_map [pk ] for pk in answer_path if pk in oracle_path_map ]
187274 positions = [oracle_positions [k ] for k in matched_in_order if k in oracle_positions ]
188275 order_correct = positions == sorted (positions ) and len (matched ) == len (oracle_set )
189276
190277 chain_recall = len (matched ) / len (oracle_set ) if oracle_set else 1.0
191278
192279 return {
193280 "matched_steps" : len (matched ),
194- "missing_steps" : [{"repo" : r , "path" : p , "symbol" : s } for r , p , s in sorted ( missing ) ],
281+ "missing_steps" : [{"repo" : r , "path" : p , "symbol" : s } for r , p , s in missing ],
195282 "order_correct" : order_correct ,
196283 "chain_recall" : round (chain_recall , 4 ),
197284 }
@@ -420,16 +507,10 @@ def run_all_checks(
420507 oracle = spec .get ("artifacts" , {}).get ("oracle" , {})
421508 eval_checks = spec .get ("evaluation" , {}).get ("checks" , [])
422509
423- # If answer is a dict with "text" key, extract the text for text-based checks.
424- # Also include the full JSON serialization so that provenance citations in
425- # structured fields (e.g. chain[].repo) are found by substring matching.
426- # This prevents penalizing agents that correctly cite repos in structured
427- # data but use natural language (e.g. "Loki") in the narrative text.
510+ # If answer is a dict with "text" key, extract the text for text-based checks
428511 answer_text = ""
429512 if isinstance (answer_data , dict ):
430- narrative = answer_data .get ("text" , answer_data .get ("answer" , "" ))
431- full_json = json .dumps (answer_data )
432- answer_text = f"{ narrative } \n { full_json } " if narrative else full_json
513+ answer_text = answer_data .get ("text" , answer_data .get ("answer" , json .dumps (answer_data )))
433514 elif isinstance (answer_data , str ):
434515 answer_text = answer_data
435516
0 commit comments