Skip to content

Commit e00bee0

Browse files
sjarmak and claude committed
feat: add output_files/evidence_files/gt_type to TaskGroundTruth schema
Extends the GT data model to distinguish files the agent must write (output_files) from files the agent must read (evidence_files), plus a gt_type field (edit/generate/evidence/answer/mixed). Backward compatible: legacy entries with only `files` continue to work via `all_files` fallback. Also regenerates script registry + agent navigation (stale from prior sessions). Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 0289ded commit e00bee0

File tree

3 files changed

+65
-4
lines changed

3 files changed

+65
-4
lines changed

docs/ops/SCRIPT_INDEX.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@ Generated from `scripts/registry.json` by `scripts/generate_script_index.py`.
3636
- `scripts/analyze_run_coverage.py` - Analysis/comparison script for analyze run coverage.
3737
- `scripts/audit_traces.py` - Analysis/comparison script for audit traces.
3838
- `scripts/compare_configs.py` - Compares benchmark outcomes across configs on matched task sets.
39+
- `scripts/comprehensive_analysis.py` - Analysis/comparison script for comprehensive analysis.
3940
- `scripts/compute_retrieval_metrics.py` - Analysis/comparison script for compute retrieval metrics.
4041
- `scripts/cost_breakdown_analysis.py` - Analysis/comparison script for cost breakdown analysis.
4142
- `scripts/cost_report.py` - Aggregates token and cost metrics per run, suite, and config.
@@ -153,6 +154,7 @@ Generated from `scripts/registry.json` by `scripts/generate_script_index.py`.
153154
## Misc
154155

155156
- `scripts/add_verification_metadata.py` - Utility script for add verification metadata.
157+
- `scripts/audit_official_scores.py` - Utility script for audit official scores.
156158
- `scripts/audit_unpinned_repos.py` - Utility script for audit unpinned repos.
157159
- `scripts/backfill_instruction_artifacts.py` [one_off] - Historical one-off script: backfill instruction artifacts.
158160
- `scripts/backfill_size_metadata.py` [one_off] - Historical one-off script: backfill size metadata.

scripts/ccb_metrics/ground_truth.py

Lines changed: 45 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -74,14 +74,48 @@ def from_dict(cls, d: dict) -> "DefectAnnotation":
7474

7575
@dataclass
7676
class TaskGroundTruth:
77-
"""Ground truth files for a single benchmark task."""
77+
"""Ground truth files for a single benchmark task.
78+
79+
Fields:
80+
files: Flat list of all relevant repo-relative paths (backward compat).
81+
When output_files or evidence_files are populated, ``files`` is
82+
their union; otherwise it is set directly by legacy extractors.
83+
output_files: Files the agent must CREATE or MODIFY (the deliverable).
84+
evidence_files: Files the agent must READ to produce the output
85+
(the retrieval targets for IR evaluation).
86+
gt_type: Classification of the ground-truth mode:
87+
- "edit" — standard SWE-bench style (output_files are edits)
88+
- "generate" — agent creates new files (output_files are new)
89+
- "evidence" — agent reads files to produce non-file output (e.g. doc gen)
90+
- "answer" — factual answer task, no meaningful file GT
91+
- "mixed" — both output and evidence files matter
92+
"""
7893

7994
task_id: str
8095
benchmark: str
81-
files: list[str] # repo-relative paths needing modification
96+
files: list[str] # union of output + evidence (backward compat)
8297
source: str # "patch" | "diff" | "ground_truth_dir" | "test_script" | "instruction"
8398
confidence: str # "high" | "medium" | "low"
8499
defect_annotations: list[DefectAnnotation] = field(default_factory=list)
100+
output_files: list[str] = field(default_factory=list)
101+
evidence_files: list[str] = field(default_factory=list)
102+
gt_type: str = "edit"
103+
104+
@property
105+
def all_files(self) -> list[str]:
106+
"""Union of output + evidence files (deduped, ordered).
107+
108+
Falls back to ``files`` when neither output nor evidence is populated.
109+
"""
110+
if not self.output_files and not self.evidence_files:
111+
return self.files
112+
seen: set[str] = set()
113+
result: list[str] = []
114+
for f in self.output_files + self.evidence_files:
115+
if f not in seen:
116+
seen.add(f)
117+
result.append(f)
118+
return result
85119

86120
def to_dict(self) -> dict:
87121
d = {
@@ -93,6 +127,12 @@ def to_dict(self) -> dict:
93127
}
94128
if self.defect_annotations:
95129
d["defect_annotations"] = [a.to_dict() for a in self.defect_annotations]
130+
if self.output_files:
131+
d["output_files"] = self.output_files
132+
if self.evidence_files:
133+
d["evidence_files"] = self.evidence_files
134+
if self.gt_type != "edit":
135+
d["gt_type"] = self.gt_type
96136
return d
97137

98138
@classmethod
@@ -107,6 +147,9 @@ def from_dict(cls, d: dict) -> "TaskGroundTruth":
107147
source=d["source"],
108148
confidence=d["confidence"],
109149
defect_annotations=annotations,
150+
output_files=d.get("output_files", []),
151+
evidence_files=d.get("evidence_files", []),
152+
gt_type=d.get("gt_type", "edit"),
110153
)
111154

112155

scripts/registry.json

Lines changed: 18 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,14 @@
9090
"language": "shell",
9191
"summary": "Helper library/wrapper used by other scripts (artifact verifier lib)."
9292
},
93+
{
94+
"name": "audit_official_scores.py",
95+
"path": "scripts/audit_official_scores.py",
96+
"category": "misc",
97+
"status": "maintained",
98+
"language": "python",
99+
"summary": "Utility script for audit official scores."
100+
},
93101
{
94102
"name": "audit_traces.py",
95103
"path": "scripts/audit_traces.py",
@@ -162,6 +170,14 @@
162170
"language": "python",
163171
"summary": "Compares benchmark outcomes across configs on matched task sets."
164172
},
173+
{
174+
"name": "comprehensive_analysis.py",
175+
"path": "scripts/comprehensive_analysis.py",
176+
"category": "analysis_comparison",
177+
"status": "maintained",
178+
"language": "python",
179+
"summary": "Analysis/comparison script for comprehensive analysis."
180+
},
165181
{
166182
"name": "compute_retrieval_metrics.py",
167183
"path": "scripts/compute_retrieval_metrics.py",
@@ -1100,14 +1116,14 @@
11001116
}
11011117
],
11021118
"category_counts": {
1103-
"analysis_comparison": 19,
1119+
"analysis_comparison": 20,
11041120
"core_operations": 13,
11051121
"data_management": 10,
11061122
"generation": 5,
11071123
"infra_mirrors": 16,
11081124
"library_helpers": 7,
11091125
"migration": 3,
1110-
"misc": 34,
1126+
"misc": 35,
11111127
"qa_quality": 10,
11121128
"submission_reporting": 7,
11131129
"task_creation_selection": 12,

0 commit comments

Comments (0)