Skip to content

Commit 4ec00f2

Browse files
sjarmak authored and claude committed
feat: add SCIP precise indexing ablation configs and tooling
Add mcp-scip-remote-direct and mcp-scip-remote-artifact configs for comparing SCIP precise code intelligence vs search-based heuristics. - configs/_common.sh: register SCIP configs in config_to_mcp_type(), validate_config_name(), add config_uses_scip() helper, export SOURCEGRAPH_SEARCH_BRANCH=scip-enabled for SCIP configs - agents/claude_baseline_agent.py: inject Branch Search Instructions into MCP preamble when SOURCEGRAPH_SEARCH_BRANCH is set, directing agent to use rev:scip-enabled in all MCP tool calls - configs/eval_matrix.json: add sourcegraph_scip config definition - scripts/create_scip_branches.sh: batch-create scip-enabled branches on all sg-benchmarks repos (1561 created, 31 empty skipped) - scripts/swap_default_branch.sh: swap default branch across org for Deep Search (which only indexes HEAD) - Retrieval eval pipeline: multi-run pooling, write-tool tracking, Spearman fix for tied ranks Sourcegraph policies created: - "Benchmarks: Main (No SCIP)" — indexingEnabled=false on main - "Benchmarks: SCIP Enabled" — indexingEnabled=true on scip-enabled Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 83edf34 commit 4ec00f2

9 files changed

+542
-122
lines changed

agents/claude_baseline_agent.py

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -607,6 +607,29 @@ def create_run_agent_commands(self, instruction: str) -> list[ExecInput]:
607607
"before searching.\n"
608608
)
609609

610+
# SCIP ablation: inject branch targeting instructions when
611+
# SOURCEGRAPH_SEARCH_BRANCH is set (e.g., "scip-enabled").
612+
scip_branch = os.environ.get("SOURCEGRAPH_SEARCH_BRANCH", "")
613+
if scip_branch:
614+
branch_instructions = (
615+
f"\n**Branch Search Instructions**\n\n"
616+
f"IMPORTANT: You must search the `{scip_branch}` branch for all "
617+
f"repositories in `github.com/sg-benchmarks/`.\n\n"
618+
f"When using search and file tools, always specify the "
619+
f"`{scip_branch}` branch:\n\n"
620+
f"- **keyword_search / nls_search:** Include "
621+
f"`rev:{scip_branch}` in your query alongside the repo filter\n"
622+
f' Example: `repo:^github\\.com/sg-benchmarks/REPO$ '
623+
f"rev:{scip_branch} YOUR_SEARCH_TERMS`\n"
624+
f"- **read_file / list_files:** Set the `revision` parameter "
625+
f'to `"{scip_branch}"`\n'
626+
f"- **go_to_definition / find_references:** Set the `revision` "
627+
f'parameter to `"{scip_branch}"`\n\n'
628+
f"Do NOT use the default branch. All searches and file "
629+
f"operations must target the `{scip_branch}` branch.\n"
630+
)
631+
repo_scope += branch_instructions
632+
610633
# Workflow steps 3-4 vary by config: direct configs edit+test
611634
# locally, artifact configs produce diffs as output artifacts.
612635
if mcp_type == "artifact_full":

configs/_common.sh

Lines changed: 25 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -51,18 +51,30 @@ VERIFIER_MODE="direct"
5151
SOURCE_ACCESS="local"
5252

5353
# Map composite config name → internal mcp_type for Harbor.
54-
# Side effects: sets VERIFIER_MODE and SOURCE_ACCESS globals.
54+
# Side effects: sets VERIFIER_MODE, SOURCE_ACCESS, and SOURCEGRAPH_SEARCH_BRANCH globals.
55+
# SCIP configs set SOURCEGRAPH_SEARCH_BRANCH=scip-enabled so the agent
56+
# targets the SCIP-indexed branch in all MCP tool calls.
5557
config_to_mcp_type() {
5658
local config_name="$1"
59+
# Clear branch override unless it's a SCIP config
60+
unset SOURCEGRAPH_SEARCH_BRANCH 2>/dev/null || true
5761
case "$config_name" in
5862
baseline-local-direct)
5963
VERIFIER_MODE="direct"; SOURCE_ACCESS="local"; echo "none" ;;
6064
mcp-remote-direct)
6165
VERIFIER_MODE="direct"; SOURCE_ACCESS="remote"; echo "sourcegraph_full" ;;
66+
mcp-scip-remote-direct)
67+
VERIFIER_MODE="direct"; SOURCE_ACCESS="remote"
68+
export SOURCEGRAPH_SEARCH_BRANCH="scip-enabled"
69+
echo "sourcegraph_full" ;;
6270
baseline-local-artifact)
6371
VERIFIER_MODE="artifact"; SOURCE_ACCESS="local"; echo "none" ;;
6472
mcp-remote-artifact)
6573
VERIFIER_MODE="artifact"; SOURCE_ACCESS="remote"; echo "artifact_full" ;;
74+
mcp-scip-remote-artifact)
75+
VERIFIER_MODE="artifact"; SOURCE_ACCESS="remote"
76+
export SOURCEGRAPH_SEARCH_BRANCH="scip-enabled"
77+
echo "artifact_full" ;;
6678
# Legacy names
6779
baseline)
6880
VERIFIER_MODE="direct"; SOURCE_ACCESS="local"; echo "none" ;;
@@ -88,18 +100,29 @@ baseline_config_for() {
88100
esac
89101
}
90102

103+
# Check whether a config uses SCIP precise indexing (requires branch swap).
104+
config_uses_scip() {
105+
local config_name="$1"
106+
case "$config_name" in
107+
mcp-scip-*) return 0 ;;
108+
*) return 1 ;;
109+
esac
110+
}
111+
91112
# Validate a config name against the known whitelist.
92113
# Exits 1 with error message if unknown. Call before config_to_mcp_type().
93114
validate_config_name() {
94115
local config_name="$1"
95116
case "$config_name" in
96117
baseline-local-direct|mcp-remote-direct|\
118+
mcp-scip-remote-direct|mcp-scip-remote-artifact|\
97119
baseline-local-artifact|mcp-remote-artifact|\
98120
baseline|sourcegraph_full|artifact_full|none)
99121
return 0 ;;
100122
*)
101123
echo "ERROR: Unknown config name: '$config_name'" >&2
102-
echo " Valid: baseline-local-direct, mcp-remote-direct, baseline-local-artifact, mcp-remote-artifact" >&2
124+
echo " Valid: baseline-local-direct, mcp-remote-direct, mcp-scip-remote-direct" >&2
125+
echo " baseline-local-artifact, mcp-remote-artifact, mcp-scip-remote-artifact" >&2
103126
echo " Legacy: baseline, sourcegraph_full, artifact_full, none" >&2
104127
exit 1 ;;
105128
esac

configs/eval_matrix.json

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
"supported_configs": [
88
"baseline",
99
"sourcegraph_full",
10+
"sourcegraph_scip",
1011
"sourcegraph_isolated",
1112
"sg_only_env",
1213
"github_full"
@@ -26,6 +27,15 @@
2627
"track_in_official": true,
2728
"status": "active"
2829
},
30+
"sourcegraph_scip": {
31+
"baseline_mcp_type": "sourcegraph_full",
32+
"mcp_enabled": true,
33+
"provider": "sourcegraph",
34+
"track_in_official": false,
35+
"status": "experimental",
36+
"composite_name": "mcp-scip-remote-direct",
37+
"notes": "SCIP precise indexing ablation. Same runtime as sourcegraph_full but sg-benchmarks repos have scip-enabled branch set as default. Requires branch swap before runs: scripts/swap_default_branch.sh scip-enabled"
38+
},
2939
"sourcegraph_isolated": {
3040
"baseline_mcp_type": "sourcegraph_isolated",
3141
"mcp_enabled": true,

scripts/create_scip_branches.sh

Lines changed: 115 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,115 @@
1+
#!/usr/bin/env bash
2+
# Create scip-enabled branches on all sg-benchmarks repos.
3+
# Each branch points to the same commit as the repo's default branch HEAD.
4+
#
5+
# Usage:
6+
# ./scripts/create_scip_branches.sh [--dry-run] [--parallel N]
7+
#
8+
# Requires: gh CLI authenticated with access to sg-benchmarks org.
9+
10+
set -euo pipefail
11+
12+
DRY_RUN=false
13+
PARALLEL=10
14+
LOG_DIR="/tmp/scip_branch_creation"
15+
BRANCH_NAME="scip-enabled"
16+
17+
while [[ $# -gt 0 ]]; do
18+
case "$1" in
19+
--dry-run) DRY_RUN=true; shift ;;
20+
--parallel) PARALLEL="$2"; shift 2 ;;
21+
*) echo "Unknown arg: $1"; exit 1 ;;
22+
esac
23+
done
24+
25+
mkdir -p "$LOG_DIR"
26+
SUCCESS_LOG="$LOG_DIR/success.log"
27+
SKIP_LOG="$LOG_DIR/skip.log"
28+
FAIL_LOG="$LOG_DIR/fail.log"
29+
> "$SUCCESS_LOG"
30+
> "$SKIP_LOG"
31+
> "$FAIL_LOG"
32+
33+
echo "=== SCIP Branch Creator ==="
34+
echo "Branch: $BRANCH_NAME"
35+
echo "Parallel: $PARALLEL"
36+
echo "Dry run: $DRY_RUN"
37+
echo "Logs: $LOG_DIR/"
38+
echo ""
39+
40+
# Fetch all repo names in the org
41+
echo "Fetching repo list from sg-benchmarks org..."
42+
REPOS=$(gh api --paginate orgs/sg-benchmarks/repos \
43+
--jq '.[].name' 2>/dev/null | sort)
44+
TOTAL=$(echo "$REPOS" | wc -l)
45+
echo "Found $TOTAL repos"
46+
echo ""
47+
48+
create_branch() {
49+
local repo_name="$1"
50+
local full_name="sg-benchmarks/$repo_name"
51+
52+
# Get default branch HEAD SHA
53+
local sha
54+
sha=$(gh api "repos/$full_name/git/refs/heads/main" --jq '.object.sha' 2>/dev/null || true)
55+
56+
# If main doesn't exist, try the default branch
57+
if [[ -z "$sha" ]]; then
58+
local default_branch
59+
default_branch=$(gh api "repos/$full_name" --jq '.default_branch' 2>/dev/null || true)
60+
if [[ -n "$default_branch" && "$default_branch" != "main" ]]; then
61+
sha=$(gh api "repos/$full_name/git/refs/heads/$default_branch" --jq '.object.sha' 2>/dev/null || true)
62+
fi
63+
fi
64+
65+
if [[ -z "$sha" ]]; then
66+
echo "SKIP $repo_name: empty repo or no resolvable HEAD" >> "$SKIP_LOG"
67+
return 0
68+
fi
69+
70+
# Check if branch already exists
71+
local existing
72+
existing=$(gh api "repos/$full_name/git/refs/heads/$BRANCH_NAME" --jq '.object.sha' 2>/dev/null || true)
73+
if [[ -n "$existing" ]]; then
74+
echo "SKIP $repo_name: $BRANCH_NAME already exists (sha=$existing)" >> "$SKIP_LOG"
75+
return 0
76+
fi
77+
78+
if $DRY_RUN; then
79+
echo "DRY-RUN: would create $BRANCH_NAME on $full_name at $sha"
80+
echo "DRYRUN $repo_name: $sha" >> "$SUCCESS_LOG"
81+
return 0
82+
fi
83+
84+
# Create the branch
85+
local result
86+
result=$(gh api "repos/$full_name/git/refs" \
87+
-f "ref=refs/heads/$BRANCH_NAME" \
88+
-f "sha=$sha" 2>&1) || {
89+
echo "FAIL $repo_name: $result" >> "$FAIL_LOG"
90+
return 1
91+
}
92+
93+
echo "OK $repo_name: created $BRANCH_NAME at $sha" >> "$SUCCESS_LOG"
94+
}
95+
96+
export -f create_branch
97+
export DRY_RUN BRANCH_NAME SUCCESS_LOG SKIP_LOG FAIL_LOG
98+
99+
# Run in parallel with progress
100+
echo "$REPOS" | xargs -P "$PARALLEL" -I {} bash -c 'create_branch "$@"' _ {}
101+
102+
echo ""
103+
echo "=== Results ==="
104+
echo "Success: $(wc -l < "$SUCCESS_LOG")"
105+
echo "Skipped: $(wc -l < "$SKIP_LOG")"
106+
echo "Failed: $(wc -l < "$FAIL_LOG")"
107+
108+
if [[ -s "$FAIL_LOG" ]]; then
109+
echo ""
110+
echo "=== Failures ==="
111+
cat "$FAIL_LOG"
112+
fi
113+
114+
echo ""
115+
echo "Full logs in $LOG_DIR/"

scripts/generate_retrieval_report.py

Lines changed: 25 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -227,8 +227,28 @@ def generate_report(
227227
if matched.get("computable"):
228228
lines.append("### Matched Comparison")
229229
lines.append("")
230-
lines.append(f"Paired comparison of **{matched['n_matched_tasks']}** matched tasks")
231-
lines.append(f"({matched['baseline_config']} vs {matched['mcp_config']}):")
230+
n_runs_compared = matched.get("n_runs_compared")
231+
if n_runs_compared and n_runs_compared > 1:
232+
lines.append(
233+
f"Pooled paired comparison of **{matched['n_matched_tasks']}** matched tasks "
234+
f"across **{n_runs_compared}** runs:"
235+
)
236+
baseline_cfgs = matched.get("baseline_configs") or []
237+
mcp_cfgs = matched.get("mcp_configs") or []
238+
if baseline_cfgs or mcp_cfgs:
239+
lines.append("")
240+
lines.append(f"- Baseline config(s): {', '.join(baseline_cfgs) if baseline_cfgs else 'N/A'}")
241+
lines.append(f"- MCP config(s): {', '.join(mcp_cfgs) if mcp_cfgs else 'N/A'}")
242+
run_pairs = matched.get("run_pairs", [])
243+
if run_pairs:
244+
lines.append("")
245+
lines.append(f"- Runs compared: {len(run_pairs)}")
246+
skipped = matched.get("n_runs_skipped", 0)
247+
if skipped:
248+
lines.append(f"- Runs skipped: {skipped}")
249+
else:
250+
lines.append(f"Paired comparison of **{matched['n_matched_tasks']}** matched tasks")
251+
lines.append(f"({matched.get('baseline_config')} vs {matched.get('mcp_config')}):")
232252
lines.append("")
233253
lines.append("| Metric | Mean Δ | Median Δ | IQR | +/−/0 | n |")
234254
lines.append("|--------|--------|---------|-----|-------|---|")
@@ -248,6 +268,9 @@ def generate_report(
248268
lines.append("**These are comparative observations, not causal claims.**")
249269
lines.append(f"**Unmatched tasks**: {matched.get('n_baseline_only', 0)} baseline-only, "
250270
f"{matched.get('n_mcp_only', 0)} MCP-only.")
271+
if matched.get("skipped_runs"):
272+
lines.append(f"**Skipped runs**: {len(matched.get('skipped_runs', []))} "
273+
f"(missing paired configs or insufficient matched tasks).")
251274
lines.append("")
252275
else:
253276
reason = matched.get("reason", "unknown")

scripts/normalize_retrieval_events.py

Lines changed: 20 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -115,10 +115,11 @@
115115
"WebSearch": "other",
116116
}
117117

118-
# Tools that are retrieval-relevant (we emit events for these)
119-
_RETRIEVAL_TOOLS = (
118+
# Tools that are tracked by the retrieval evaluation pipeline.
119+
# Includes retrieval tools plus local write tools for utilization/taxonomy stages.
120+
_TRACKED_TOOLS = (
120121
set(_MCP_TOOL_CATEGORIES.keys())
121-
| {"Read", "Glob", "Grep"}
122+
| {"Read", "Glob", "Grep", "Write", "Edit", "NotebookEdit"}
122123
)
123124

124125

@@ -137,8 +138,8 @@ def _is_mcp(name: str) -> bool:
137138

138139

139140
def _is_retrieval_tool(name: str) -> bool:
140-
"""True for tools that access or search for code."""
141-
if name in _RETRIEVAL_TOOLS:
141+
"""True for tools tracked by the retrieval evaluation pipeline."""
142+
if name in _TRACKED_TOOLS:
142143
return True
143144
if name.startswith("mcp__sourcegraph__"):
144145
return True
@@ -179,7 +180,7 @@ def _extract_files_from_tool_input(tool_name: str, tool_input: dict) -> list[str
179180
return files
180181

181182
# Local file tools
182-
if tool_name in ("Read", "Glob", "Grep"):
183+
if tool_name in ("Read", "Glob", "Grep", "Write", "Edit", "NotebookEdit"):
183184
fp = tool_input.get("file_path") or tool_input.get("path") or ""
184185
if fp and _looks_like_file(_normalize(fp)):
185186
files.append(fp)
@@ -252,6 +253,10 @@ def _salient_arguments(tool_name: str, tool_input: dict) -> dict:
252253
for k in ("file_path", "path", "repo", "startLine", "endLine"):
253254
if k in tool_input:
254255
args[k] = tool_input[k]
256+
elif cat == "file_write":
257+
for k in ("file_path", "path", "old_string", "new_string", "replace_all"):
258+
if k in tool_input:
259+
args[k] = tool_input[k]
255260
elif cat == "file_search":
256261
for k in ("pattern", "path", "repo", "query"):
257262
if k in tool_input:
@@ -627,7 +632,7 @@ def walk_run_tasks(run_dir: Path) -> list[dict]:
627632

628633
def walk_all_runs(runs_root: Path) -> list[dict]:
629634
"""Walk a runs/staging/ or runs/official/ root and yield task info dicts."""
630-
all_tasks: dict[tuple[str, str], dict] = {} # (config, task_name) -> info
635+
all_tasks: list[dict] = []
631636

632637
for run_dir in sorted(runs_root.iterdir()):
633638
if not run_dir.is_dir() or run_dir.name in ("archive", "MANIFEST.json"):
@@ -638,17 +643,9 @@ def walk_all_runs(runs_root: Path) -> list[dict]:
638643
for info in walk_run_tasks(run_dir):
639644
info["run_id"] = run_dir.name
640645
info["benchmark"] = _infer_benchmark(run_dir.name)
641-
key = (info["config_name"], info["task_name"])
642-
# Dedup: keep latest by started_at
643-
existing = all_tasks.get(key)
644-
if existing:
645-
new_ts = info["result_data"].get("started_at", "")
646-
old_ts = existing["result_data"].get("started_at", "")
647-
if new_ts <= old_ts:
648-
continue
649-
all_tasks[key] = info
646+
all_tasks.append(info)
650647

651-
return list(all_tasks.values())
648+
return all_tasks
652649

653650

654651
# ---------------------------------------------------------------------------
@@ -852,7 +849,12 @@ def main() -> None:
852849
sys.exit(0)
853850

854851
# Sort for deterministic output ordering
855-
task_infos.sort(key=lambda t: (t.get("config_name", ""), t.get("task_name", "")))
852+
task_infos.sort(key=lambda t: (
853+
t.get("run_id", ""),
854+
t.get("config_name", ""),
855+
t.get("task_name", ""),
856+
t.get("batch_timestamp", ""),
857+
))
856858

857859
written = 0
858860
skipped = 0

scripts/retrieval_eval_pipeline.py

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -621,9 +621,17 @@ def main() -> None:
621621
skipped_no_gt += 1
622622

623623
# Write task-level artifact
624-
out_dir = args.output_dir or ef.parent
625-
task_name = doc.get("provenance", {}).get("task_name", ef.stem.replace(".retrieval_events", ""))
626-
config_name = doc.get("provenance", {}).get("config_name", "unknown")
624+
prov = doc.get("provenance", {})
625+
task_name = prov.get("task_name", ef.stem.replace(".retrieval_events", ""))
626+
config_name = prov.get("config_name", "unknown")
627+
run_id = prov.get("run_id", "unknown_run")
628+
if args.output_dir:
629+
if args.all:
630+
out_dir = args.output_dir / run_id / config_name
631+
else:
632+
out_dir = args.output_dir / config_name
633+
else:
634+
out_dir = ef.parent
627635
out_path = out_dir / f"{task_name}.retrieval_metrics.json"
628636

629637
if args.dry_run:
@@ -641,6 +649,9 @@ def main() -> None:
641649

642650
if args.output_dir:
643651
summary_path = args.output_dir / "run_retrieval_summary.json"
652+
elif args.all and args.run_dir and not args.events_dir:
653+
# In --all mode this summary spans multiple runs; keep it out of any single run directory.
654+
summary_path = args.run_dir / "retrieval_events_aggregate" / "run_retrieval_summary.json"
644655
elif events_root:
645656
summary_path = events_root / "run_retrieval_summary.json"
646657
else:

0 commit comments

Comments (0)