Skip to content

Commit 4ec00f2

Browse files
sjarmak authored and claude committed
feat: add SCIP precise indexing ablation configs and tooling
Add mcp-scip-remote-direct and mcp-scip-remote-artifact configs for comparing SCIP precise code intelligence vs search-based heuristics. - configs/_common.sh: register SCIP configs in config_to_mcp_type(), validate_config_name(), add config_uses_scip() helper, export SOURCEGRAPH_SEARCH_BRANCH=scip-enabled for SCIP configs - agents/claude_baseline_agent.py: inject Branch Search Instructions into MCP preamble when SOURCEGRAPH_SEARCH_BRANCH is set, directing agent to use rev:scip-enabled in all MCP tool calls - configs/eval_matrix.json: add sourcegraph_scip config definition - scripts/create_scip_branches.sh: batch-create scip-enabled branches on all sg-benchmarks repos (1561 created, 31 empty skipped) - scripts/swap_default_branch.sh: swap default branch across org for Deep Search (which only indexes HEAD) - Retrieval eval pipeline: multi-run pooling, write-tool tracking, Spearman fix for tied ranks Sourcegraph policies created: - "Benchmarks: Main (No SCIP)" — indexingEnabled=false on main - "Benchmarks: SCIP Enabled" — indexingEnabled=true on scip-enabled Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 83edf34 commit 4ec00f2

9 files changed

+542
-122
lines changed

agents/claude_baseline_agent.py

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -607,6 +607,29 @@ def create_run_agent_commands(self, instruction: str) -> list[ExecInput]:
607607
"before searching.\n"
608608
)
609609

610+
# SCIP ablation: inject branch targeting instructions when
611+
# SOURCEGRAPH_SEARCH_BRANCH is set (e.g., "scip-enabled").
612+
scip_branch = os.environ.get("SOURCEGRAPH_SEARCH_BRANCH", "")
613+
if scip_branch:
614+
branch_instructions = (
615+
f"\n**Branch Search Instructions**\n\n"
616+
f"IMPORTANT: You must search the `{scip_branch}` branch for all "
617+
f"repositories in `github.com/sg-benchmarks/`.\n\n"
618+
f"When using search and file tools, always specify the "
619+
f"`{scip_branch}` branch:\n\n"
620+
f"- **keyword_search / nls_search:** Include "
621+
f"`rev:{scip_branch}` in your query alongside the repo filter\n"
622+
f' Example: `repo:^github\\.com/sg-benchmarks/REPO$ '
623+
f"rev:{scip_branch} YOUR_SEARCH_TERMS`\n"
624+
f"- **read_file / list_files:** Set the `revision` parameter "
625+
f'to `"{scip_branch}"`\n'
626+
f"- **go_to_definition / find_references:** Set the `revision` "
627+
f'parameter to `"{scip_branch}"`\n\n'
628+
f"Do NOT use the default branch. All searches and file "
629+
f"operations must target the `{scip_branch}` branch.\n"
630+
)
631+
repo_scope += branch_instructions
632+
610633
# Workflow steps 3-4 vary by config: direct configs edit+test
611634
# locally, artifact configs produce diffs as output artifacts.
612635
if mcp_type == "artifact_full":

configs/_common.sh

Lines changed: 25 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -51,18 +51,30 @@ VERIFIER_MODE="direct"
5151
SOURCE_ACCESS="local"
5252

5353
# Map composite config name → internal mcp_type for Harbor.
54-
# Side effects: sets VERIFIER_MODE and SOURCE_ACCESS globals.
54+
# Side effects: sets VERIFIER_MODE, SOURCE_ACCESS, and SOURCEGRAPH_SEARCH_BRANCH globals.
55+
# SCIP configs set SOURCEGRAPH_SEARCH_BRANCH=scip-enabled so the agent
56+
# targets the SCIP-indexed branch in all MCP tool calls.
5557
config_to_mcp_type() {
5658
local config_name="$1"
59+
# Clear branch override unless it's a SCIP config
60+
unset SOURCEGRAPH_SEARCH_BRANCH 2>/dev/null || true
5761
case "$config_name" in
5862
baseline-local-direct)
5963
VERIFIER_MODE="direct"; SOURCE_ACCESS="local"; echo "none" ;;
6064
mcp-remote-direct)
6165
VERIFIER_MODE="direct"; SOURCE_ACCESS="remote"; echo "sourcegraph_full" ;;
66+
mcp-scip-remote-direct)
67+
VERIFIER_MODE="direct"; SOURCE_ACCESS="remote"
68+
export SOURCEGRAPH_SEARCH_BRANCH="scip-enabled"
69+
echo "sourcegraph_full" ;;
6270
baseline-local-artifact)
6371
VERIFIER_MODE="artifact"; SOURCE_ACCESS="local"; echo "none" ;;
6472
mcp-remote-artifact)
6573
VERIFIER_MODE="artifact"; SOURCE_ACCESS="remote"; echo "artifact_full" ;;
74+
mcp-scip-remote-artifact)
75+
VERIFIER_MODE="artifact"; SOURCE_ACCESS="remote"
76+
export SOURCEGRAPH_SEARCH_BRANCH="scip-enabled"
77+
echo "artifact_full" ;;
6678
# Legacy names
6779
baseline)
6880
VERIFIER_MODE="direct"; SOURCE_ACCESS="local"; echo "none" ;;
@@ -88,18 +100,29 @@ baseline_config_for() {
88100
esac
89101
}
90102

103+
# Check whether a config uses SCIP precise indexing (requires branch swap).
104+
config_uses_scip() {
105+
local config_name="$1"
106+
case "$config_name" in
107+
mcp-scip-*) return 0 ;;
108+
*) return 1 ;;
109+
esac
110+
}
111+
91112
# Validate a config name against the known whitelist.
92113
# Exits 1 with error message if unknown. Call before config_to_mcp_type().
93114
validate_config_name() {
94115
local config_name="$1"
95116
case "$config_name" in
96117
baseline-local-direct|mcp-remote-direct|\
118+
mcp-scip-remote-direct|mcp-scip-remote-artifact|\
97119
baseline-local-artifact|mcp-remote-artifact|\
98120
baseline|sourcegraph_full|artifact_full|none)
99121
return 0 ;;
100122
*)
101123
echo "ERROR: Unknown config name: '$config_name'" >&2
102-
echo " Valid: baseline-local-direct, mcp-remote-direct, baseline-local-artifact, mcp-remote-artifact" >&2
124+
echo " Valid: baseline-local-direct, mcp-remote-direct, mcp-scip-remote-direct" >&2
125+
echo " baseline-local-artifact, mcp-remote-artifact, mcp-scip-remote-artifact" >&2
103126
echo " Legacy: baseline, sourcegraph_full, artifact_full, none" >&2
104127
exit 1 ;;
105128
esac

configs/eval_matrix.json

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
"supported_configs": [
88
"baseline",
99
"sourcegraph_full",
10+
"sourcegraph_scip",
1011
"sourcegraph_isolated",
1112
"sg_only_env",
1213
"github_full"
@@ -26,6 +27,15 @@
2627
"track_in_official": true,
2728
"status": "active"
2829
},
30+
"sourcegraph_scip": {
31+
"baseline_mcp_type": "sourcegraph_full",
32+
"mcp_enabled": true,
33+
"provider": "sourcegraph",
34+
"track_in_official": false,
35+
"status": "experimental",
36+
"composite_name": "mcp-scip-remote-direct",
37+
"notes": "SCIP precise indexing ablation. Same runtime as sourcegraph_full but sg-benchmarks repos have scip-enabled branch set as default. Requires branch swap before runs: scripts/swap_default_branch.sh scip-enabled"
38+
},
2939
"sourcegraph_isolated": {
3040
"baseline_mcp_type": "sourcegraph_isolated",
3141
"mcp_enabled": true,

scripts/create_scip_branches.sh

Lines changed: 115 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,115 @@
1+
#!/usr/bin/env bash
2+
# Create scip-enabled branches on all sg-benchmarks repos.
3+
# Each branch points to the same commit as the repo's default branch HEAD.
4+
#
5+
# Usage:
6+
# ./scripts/create_scip_branches.sh [--dry-run] [--parallel N]
7+
#
8+
# Requires: gh CLI authenticated with access to sg-benchmarks org.
9+
10+
set -euo pipefail
11+
12+
DRY_RUN=false
13+
PARALLEL=10
14+
LOG_DIR="/tmp/scip_branch_creation"
15+
BRANCH_NAME="scip-enabled"
16+
17+
while [[ $# -gt 0 ]]; do
18+
case "$1" in
19+
--dry-run) DRY_RUN=true; shift ;;
20+
--parallel) PARALLEL="$2"; shift 2 ;;
21+
*) echo "Unknown arg: $1"; exit 1 ;;
22+
esac
23+
done
24+
25+
mkdir -p "$LOG_DIR"
26+
SUCCESS_LOG="$LOG_DIR/success.log"
27+
SKIP_LOG="$LOG_DIR/skip.log"
28+
FAIL_LOG="$LOG_DIR/fail.log"
29+
> "$SUCCESS_LOG"
30+
> "$SKIP_LOG"
31+
> "$FAIL_LOG"
32+
33+
echo "=== SCIP Branch Creator ==="
34+
echo "Branch: $BRANCH_NAME"
35+
echo "Parallel: $PARALLEL"
36+
echo "Dry run: $DRY_RUN"
37+
echo "Logs: $LOG_DIR/"
38+
echo ""
39+
40+
# Fetch all repo names in the org
41+
echo "Fetching repo list from sg-benchmarks org..."
42+
REPOS=$(gh api --paginate orgs/sg-benchmarks/repos \
43+
--jq '.[].name' 2>/dev/null | sort)
44+
TOTAL=$(echo "$REPOS" | wc -l)
45+
echo "Found $TOTAL repos"
46+
echo ""
47+
48+
create_branch() {
49+
local repo_name="$1"
50+
local full_name="sg-benchmarks/$repo_name"
51+
52+
# Get default branch HEAD SHA
53+
local sha
54+
sha=$(gh api "repos/$full_name/git/refs/heads/main" --jq '.object.sha' 2>/dev/null || true)
55+
56+
# If main doesn't exist, try the default branch
57+
if [[ -z "$sha" ]]; then
58+
local default_branch
59+
default_branch=$(gh api "repos/$full_name" --jq '.default_branch' 2>/dev/null || true)
60+
if [[ -n "$default_branch" && "$default_branch" != "main" ]]; then
61+
sha=$(gh api "repos/$full_name/git/refs/heads/$default_branch" --jq '.object.sha' 2>/dev/null || true)
62+
fi
63+
fi
64+
65+
if [[ -z "$sha" ]]; then
66+
echo "SKIP $repo_name: empty repo or no resolvable HEAD" >> "$SKIP_LOG"
67+
return 0
68+
fi
69+
70+
# Check if branch already exists
71+
local existing
72+
existing=$(gh api "repos/$full_name/git/refs/heads/$BRANCH_NAME" --jq '.object.sha' 2>/dev/null || true)
73+
if [[ -n "$existing" ]]; then
74+
echo "SKIP $repo_name: $BRANCH_NAME already exists (sha=$existing)" >> "$SKIP_LOG"
75+
return 0
76+
fi
77+
78+
if $DRY_RUN; then
79+
echo "DRY-RUN: would create $BRANCH_NAME on $full_name at $sha"
80+
echo "DRYRUN $repo_name: $sha" >> "$SUCCESS_LOG"
81+
return 0
82+
fi
83+
84+
# Create the branch
85+
local result
86+
result=$(gh api "repos/$full_name/git/refs" \
87+
-f "ref=refs/heads/$BRANCH_NAME" \
88+
-f "sha=$sha" 2>&1) || {
89+
echo "FAIL $repo_name: $result" >> "$FAIL_LOG"
90+
return 1
91+
}
92+
93+
echo "OK $repo_name: created $BRANCH_NAME at $sha" >> "$SUCCESS_LOG"
94+
}
95+
96+
export -f create_branch
97+
export DRY_RUN BRANCH_NAME SUCCESS_LOG SKIP_LOG FAIL_LOG
98+
99+
# Run in parallel with progress
100+
echo "$REPOS" | xargs -P "$PARALLEL" -I {} bash -c 'create_branch "$@"' _ {}
101+
102+
echo ""
103+
echo "=== Results ==="
104+
echo "Success: $(wc -l < "$SUCCESS_LOG")"
105+
echo "Skipped: $(wc -l < "$SKIP_LOG")"
106+
echo "Failed: $(wc -l < "$FAIL_LOG")"
107+
108+
if [[ -s "$FAIL_LOG" ]]; then
109+
echo ""
110+
echo "=== Failures ==="
111+
cat "$FAIL_LOG"
112+
fi
113+
114+
echo ""
115+
echo "Full logs in $LOG_DIR/"

scripts/generate_retrieval_report.py

Lines changed: 25 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -227,8 +227,28 @@ def generate_report(
227227
if matched.get("computable"):
228228
lines.append("### Matched Comparison")
229229
lines.append("")
230-
lines.append(f"Paired comparison of **{matched['n_matched_tasks']}** matched tasks")
231-
lines.append(f"({matched['baseline_config']} vs {matched['mcp_config']}):")
230+
n_runs_compared = matched.get("n_runs_compared")
231+
if n_runs_compared and n_runs_compared > 1:
232+
lines.append(
233+
f"Pooled paired comparison of **{matched['n_matched_tasks']}** matched tasks "
234+
f"across **{n_runs_compared}** runs:"
235+
)
236+
baseline_cfgs = matched.get("baseline_configs") or []
237+
mcp_cfgs = matched.get("mcp_configs") or []
238+
if baseline_cfgs or mcp_cfgs:
239+
lines.append("")
240+
lines.append(f"- Baseline config(s): {', '.join(baseline_cfgs) if baseline_cfgs else 'N/A'}")
241+
lines.append(f"- MCP config(s): {', '.join(mcp_cfgs) if mcp_cfgs else 'N/A'}")
242+
run_pairs = matched.get("run_pairs", [])
243+
if run_pairs:
244+
lines.append("")
245+
lines.append(f"- Runs compared: {len(run_pairs)}")
246+
skipped = matched.get("n_runs_skipped", 0)
247+
if skipped:
248+
lines.append(f"- Runs skipped: {skipped}")
249+
else:
250+
lines.append(f"Paired comparison of **{matched['n_matched_tasks']}** matched tasks")
251+
lines.append(f"({matched.get('baseline_config')} vs {matched.get('mcp_config')}):")
232252
lines.append("")
233253
lines.append("| Metric | Mean Δ | Median Δ | IQR | +/−/0 | n |")
234254
lines.append("|--------|--------|---------|-----|-------|---|")
@@ -248,6 +268,9 @@ def generate_report(
248268
lines.append("**These are comparative observations, not causal claims.**")
249269
lines.append(f"**Unmatched tasks**: {matched.get('n_baseline_only', 0)} baseline-only, "
250270
f"{matched.get('n_mcp_only', 0)} MCP-only.")
271+
if matched.get("skipped_runs"):
272+
lines.append(f"**Skipped runs**: {len(matched.get('skipped_runs', []))} "
273+
f"(missing paired configs or insufficient matched tasks).")
251274
lines.append("")
252275
else:
253276
reason = matched.get("reason", "unknown")

scripts/normalize_retrieval_events.py

Lines changed: 20 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -115,10 +115,11 @@
115115
"WebSearch": "other",
116116
}
117117

118-
# Tools that are retrieval-relevant (we emit events for these)
119-
_RETRIEVAL_TOOLS = (
118+
# Tools that are tracked by the retrieval evaluation pipeline.
119+
# Includes retrieval tools plus local write tools for utilization/taxonomy stages.
120+
_TRACKED_TOOLS = (
120121
set(_MCP_TOOL_CATEGORIES.keys())
121-
| {"Read", "Glob", "Grep"}
122+
| {"Read", "Glob", "Grep", "Write", "Edit", "NotebookEdit"}
122123
)
123124

124125

@@ -137,8 +138,8 @@ def _is_mcp(name: str) -> bool:
137138

138139

139140
def _is_retrieval_tool(name: str) -> bool:
140-
"""True for tools that access or search for code."""
141-
if name in _RETRIEVAL_TOOLS:
141+
"""True for tools tracked by the retrieval evaluation pipeline."""
142+
if name in _TRACKED_TOOLS:
142143
return True
143144
if name.startswith("mcp__sourcegraph__"):
144145
return True
@@ -179,7 +180,7 @@ def _extract_files_from_tool_input(tool_name: str, tool_input: dict) -> list[str
179180
return files
180181

181182
# Local file tools
182-
if tool_name in ("Read", "Glob", "Grep"):
183+
if tool_name in ("Read", "Glob", "Grep", "Write", "Edit", "NotebookEdit"):
183184
fp = tool_input.get("file_path") or tool_input.get("path") or ""
184185
if fp and _looks_like_file(_normalize(fp)):
185186
files.append(fp)
@@ -252,6 +253,10 @@ def _salient_arguments(tool_name: str, tool_input: dict) -> dict:
252253
for k in ("file_path", "path", "repo", "startLine", "endLine"):
253254
if k in tool_input:
254255
args[k] = tool_input[k]
256+
elif cat == "file_write":
257+
for k in ("file_path", "path", "old_string", "new_string", "replace_all"):
258+
if k in tool_input:
259+
args[k] = tool_input[k]
255260
elif cat == "file_search":
256261
for k in ("pattern", "path", "repo", "query"):
257262
if k in tool_input:
@@ -627,7 +632,7 @@ def walk_run_tasks(run_dir: Path) -> list[dict]:
627632

628633
def walk_all_runs(runs_root: Path) -> list[dict]:
629634
"""Walk a runs/staging/ or runs/official/ root and yield task info dicts."""
630-
all_tasks: dict[tuple[str, str], dict] = {} # (config, task_name) -> info
635+
all_tasks: list[dict] = []
631636

632637
for run_dir in sorted(runs_root.iterdir()):
633638
if not run_dir.is_dir() or run_dir.name in ("archive", "MANIFEST.json"):
@@ -638,17 +643,9 @@ def walk_all_runs(runs_root: Path) -> list[dict]:
638643
for info in walk_run_tasks(run_dir):
639644
info["run_id"] = run_dir.name
640645
info["benchmark"] = _infer_benchmark(run_dir.name)
641-
key = (info["config_name"], info["task_name"])
642-
# Dedup: keep latest by started_at
643-
existing = all_tasks.get(key)
644-
if existing:
645-
new_ts = info["result_data"].get("started_at", "")
646-
old_ts = existing["result_data"].get("started_at", "")
647-
if new_ts <= old_ts:
648-
continue
649-
all_tasks[key] = info
646+
all_tasks.append(info)
650647

651-
return list(all_tasks.values())
648+
return all_tasks
652649

653650

654651
# ---------------------------------------------------------------------------
@@ -852,7 +849,12 @@ def main() -> None:
852849
sys.exit(0)
853850

854851
# Sort for deterministic output ordering
855-
task_infos.sort(key=lambda t: (t.get("config_name", ""), t.get("task_name", "")))
852+
task_infos.sort(key=lambda t: (
853+
t.get("run_id", ""),
854+
t.get("config_name", ""),
855+
t.get("task_name", ""),
856+
t.get("batch_timestamp", ""),
857+
))
856858

857859
written = 0
858860
skipped = 0

scripts/retrieval_eval_pipeline.py

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -621,9 +621,17 @@ def main() -> None:
621621
skipped_no_gt += 1
622622

623623
# Write task-level artifact
624-
out_dir = args.output_dir or ef.parent
625-
task_name = doc.get("provenance", {}).get("task_name", ef.stem.replace(".retrieval_events", ""))
626-
config_name = doc.get("provenance", {}).get("config_name", "unknown")
624+
prov = doc.get("provenance", {})
625+
task_name = prov.get("task_name", ef.stem.replace(".retrieval_events", ""))
626+
config_name = prov.get("config_name", "unknown")
627+
run_id = prov.get("run_id", "unknown_run")
628+
if args.output_dir:
629+
if args.all:
630+
out_dir = args.output_dir / run_id / config_name
631+
else:
632+
out_dir = args.output_dir / config_name
633+
else:
634+
out_dir = ef.parent
627635
out_path = out_dir / f"{task_name}.retrieval_metrics.json"
628636

629637
if args.dry_run:
@@ -641,6 +649,9 @@ def main() -> None:
641649

642650
if args.output_dir:
643651
summary_path = args.output_dir / "run_retrieval_summary.json"
652+
elif args.all and args.run_dir and not args.events_dir:
653+
# In --all mode this summary spans multiple runs; keep it out of any single run directory.
654+
summary_path = args.run_dir / "retrieval_events_aggregate" / "run_retrieval_summary.json"
644655
elif events_root:
645656
summary_path = events_root / "run_retrieval_summary.json"
646657
else:

0 commit comments

Comments (0)