Skip to content

Commit e00bee0

Browse files
sjarmak and claude committed
feat: add output_files/evidence_files/gt_type to TaskGroundTruth schema
Extends the GT data model to distinguish files the agent must write (output_files) from files the agent must read (evidence_files), plus a gt_type field (edit/generate/evidence/answer/mixed). Backward compatible: legacy entries with only `files` continue to work via `all_files` fallback. Also regenerates script registry + agent navigation (stale from prior sessions). Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 0289ded commit e00bee0

File tree

3 files changed

+65
-4
lines changed

3 files changed

+65
-4
lines changed

docs/ops/SCRIPT_INDEX.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@ Generated from `scripts/registry.json` by `scripts/generate_script_index.py`.
3636
- `scripts/analyze_run_coverage.py` - Analysis/comparison script for analyze run coverage.
3737
- `scripts/audit_traces.py` - Analysis/comparison script for audit traces.
3838
- `scripts/compare_configs.py` - Compares benchmark outcomes across configs on matched task sets.
39+
- `scripts/comprehensive_analysis.py` - Analysis/comparison script for comprehensive analysis.
3940
- `scripts/compute_retrieval_metrics.py` - Analysis/comparison script for compute retrieval metrics.
4041
- `scripts/cost_breakdown_analysis.py` - Analysis/comparison script for cost breakdown analysis.
4142
- `scripts/cost_report.py` - Aggregates token and cost metrics per run, suite, and config.
@@ -153,6 +154,7 @@ Generated from `scripts/registry.json` by `scripts/generate_script_index.py`.
153154
## Misc
154155

155156
- `scripts/add_verification_metadata.py` - Utility script for add verification metadata.
157+
- `scripts/audit_official_scores.py` - Utility script for audit official scores.
156158
- `scripts/audit_unpinned_repos.py` - Utility script for audit unpinned repos.
157159
- `scripts/backfill_instruction_artifacts.py` [one_off] - Historical one-off script: backfill instruction artifacts.
158160
- `scripts/backfill_size_metadata.py` [one_off] - Historical one-off script: backfill size metadata.

scripts/ccb_metrics/ground_truth.py

Lines changed: 45 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -74,14 +74,48 @@ def from_dict(cls, d: dict) -> "DefectAnnotation":
7474

7575
@dataclass
7676
class TaskGroundTruth:
77-
"""Ground truth files for a single benchmark task."""
77+
"""Ground truth files for a single benchmark task.
78+
79+
Fields:
80+
files: Flat list of all relevant repo-relative paths (backward compat).
81+
When output_files or evidence_files are populated, ``files`` is
82+
their union; otherwise it is set directly by legacy extractors.
83+
output_files: Files the agent must CREATE or MODIFY (the deliverable).
84+
evidence_files: Files the agent must READ to produce the output
85+
(the retrieval targets for IR evaluation).
86+
gt_type: Classification of the ground-truth mode:
87+
- "edit" — standard SWE-bench style (output_files are edits)
88+
- "generate" — agent creates new files (output_files are new)
89+
- "evidence" — agent reads files to produce non-file output (e.g. doc gen)
90+
- "answer" — factual answer task, no meaningful file GT
91+
- "mixed" — both output and evidence files matter
92+
"""
7893

7994
task_id: str
8095
benchmark: str
81-
files: list[str] # repo-relative paths needing modification
96+
files: list[str] # union of output + evidence (backward compat)
8297
source: str # "patch" | "diff" | "ground_truth_dir" | "test_script" | "instruction"
8398
confidence: str # "high" | "medium" | "low"
8499
defect_annotations: list[DefectAnnotation] = field(default_factory=list)
100+
output_files: list[str] = field(default_factory=list)
101+
evidence_files: list[str] = field(default_factory=list)
102+
gt_type: str = "edit"
103+
104+
@property
105+
def all_files(self) -> list[str]:
106+
"""Union of output + evidence files (deduped, ordered).
107+
108+
Falls back to ``files`` when neither output nor evidence is populated.
109+
"""
110+
if not self.output_files and not self.evidence_files:
111+
return self.files
112+
seen: set[str] = set()
113+
result: list[str] = []
114+
for f in self.output_files + self.evidence_files:
115+
if f not in seen:
116+
seen.add(f)
117+
result.append(f)
118+
return result
85119

86120
def to_dict(self) -> dict:
87121
d = {
@@ -93,6 +127,12 @@ def to_dict(self) -> dict:
93127
}
94128
if self.defect_annotations:
95129
d["defect_annotations"] = [a.to_dict() for a in self.defect_annotations]
130+
if self.output_files:
131+
d["output_files"] = self.output_files
132+
if self.evidence_files:
133+
d["evidence_files"] = self.evidence_files
134+
if self.gt_type != "edit":
135+
d["gt_type"] = self.gt_type
96136
return d
97137

98138
@classmethod
@@ -107,6 +147,9 @@ def from_dict(cls, d: dict) -> "TaskGroundTruth":
107147
source=d["source"],
108148
confidence=d["confidence"],
109149
defect_annotations=annotations,
150+
output_files=d.get("output_files", []),
151+
evidence_files=d.get("evidence_files", []),
152+
gt_type=d.get("gt_type", "edit"),
110153
)
111154

112155

scripts/registry.json

Lines changed: 18 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,14 @@
9090
"language": "shell",
9191
"summary": "Helper library/wrapper used by other scripts (artifact verifier lib)."
9292
},
93+
{
94+
"name": "audit_official_scores.py",
95+
"path": "scripts/audit_official_scores.py",
96+
"category": "misc",
97+
"status": "maintained",
98+
"language": "python",
99+
"summary": "Utility script for audit official scores."
100+
},
93101
{
94102
"name": "audit_traces.py",
95103
"path": "scripts/audit_traces.py",
@@ -162,6 +170,14 @@
162170
"language": "python",
163171
"summary": "Compares benchmark outcomes across configs on matched task sets."
164172
},
173+
{
174+
"name": "comprehensive_analysis.py",
175+
"path": "scripts/comprehensive_analysis.py",
176+
"category": "analysis_comparison",
177+
"status": "maintained",
178+
"language": "python",
179+
"summary": "Analysis/comparison script for comprehensive analysis."
180+
},
165181
{
166182
"name": "compute_retrieval_metrics.py",
167183
"path": "scripts/compute_retrieval_metrics.py",
@@ -1100,14 +1116,14 @@
11001116
}
11011117
],
11021118
"category_counts": {
1103-
"analysis_comparison": 19,
1119+
"analysis_comparison": 20,
11041120
"core_operations": 13,
11051121
"data_management": 10,
11061122
"generation": 5,
11071123
"infra_mirrors": 16,
11081124
"library_helpers": 7,
11091125
"migration": 3,
1110-
"misc": 34,
1126+
"misc": 35,
11111127
"qa_quality": 10,
11121128
"submission_reporting": 7,
11131129
"task_creation_selection": 12,

0 commit comments

Comments (0)