sourcegraph
diff --git a/‎agents/claude_baseline_agent.py‎
Lines changed: 9 additions & 0 deletions b/‎agents/claude_baseline_agent.py‎
Lines changed: 9 additions & 0 deletions
diff --git a/‎configs/ground_truth_files.json‎
Lines changed: 393 additions & 2241 deletions b/‎configs/ground_truth_files.json‎
Lines changed: 393 additions & 2241 deletions
diff --git a/‎configs/rerun_mcp_distracted.sh‎
Lines changed: 115 additions & 0 deletions b/‎configs/rerun_mcp_distracted.sh‎
Lines changed: 115 additions & 0 deletions
diff --git a/‎configs/selected_benchmark_tasks.json‎
Lines changed: 209 additions & 11 deletions b/‎configs/selected_benchmark_tasks.json‎
Lines changed: 209 additions & 11 deletions
@@ -111,6 +111,15 @@
 
 {repo_scope}
 
+## Local File Editing
+
+Local source files may be truncated (empty). Use Sourcegraph to *read and understand* code, then *edit local files* based on what you learn. The verifier restores the full codebase and applies your local edits on top.
+
+- **Search/Read remotely:** Use MCP tools to find files, understand patterns, read implementations
+- **Edit locally:** Use Edit, Write, and Bash to modify files in your working directory
+- **Don't over-read:** Once you understand the pattern, start implementing. Reading 20+ remote files without writing code wastes time.
+- **Verify locally:** Run tests with Bash to check your changes
+
 ## Tool Selection Logic
 
 **Start here:**
 
@@ -0,0 +1,115 @@
+#!/bin/bash
+# Targeted rerun of 36 MCP-distracted tasks (SG_full reward < baseline - 0.10).
+#
+# Root causes:
+#   (a) 6 code review tasks — Dockerfile.sg_only bug (defect injection missing)
+#   (b) 11 doc-gen/understand/debug tasks — genuine mild distraction
+#   (c) 19 tasks with SG_full=0.0 — likely infra failures (rate limits) + navprove bugs
+#
+# The V4 preamble now includes "Local File Editing" guidance to reduce over-reading.
+# This rerun tests whether the preamble fix improves SG_full scores.
+#
+# Usage:
+#   ./configs/rerun_mcp_distracted.sh                  # all 36 tasks
+#   ./configs/rerun_mcp_distracted.sh --suite build     # only build suite
+#   ./configs/rerun_mcp_distracted.sh --full-only       # SG_full only (skip baseline)
+
+set -e
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+
+# Parse args
+SUITE_FILTER=""
+EXTRA_ARGS=()
+while [[ $# -gt 0 ]]; do
+    case $1 in
+        --suite) SUITE_FILTER="$2"; shift 2 ;;
+        *) EXTRA_ARGS+=("$1"); shift ;;
+    esac
+done
+
+run_suite() {
+    local suite=$1
+    shift
+    local tasks=("$@")
+
+    if [ -n "$SUITE_FILTER" ] && [ "$SUITE_FILTER" != "$suite" ]; then
+        return
+    fi
+
+    echo ""
+    echo "=========================================="
+    echo "Rerunning $suite: ${#tasks[@]} distracted tasks"
+    echo "=========================================="
+
+    local task_flags=""
+    for t in "${tasks[@]}"; do
+        task_flags="$task_flags --task $t"
+    done
+
+    "$SCRIPT_DIR/${suite}_2config.sh" $task_flags "${EXTRA_ARGS[@]}"
+}
+
+# ── build (3 tasks) ──
+run_suite build \
+    flipt-dep-refactor-001 \
+    rust-subtype-relation-refac-001 \
+    flink-pricing-window-feat-001
+
+# ── debug (5 tasks) ──
+run_suite debug \
+    envoy-duplicate-headers-debug-001 \
+    istio-xds-destrul-debug-001 \
+    qutebrowser-download-regression-prove-001 \
+    qutebrowser-bookmark-regression-prove-001 \
+    qutebrowser-tab-regression-prove-001
+
+# ── design (5 tasks) ──
+run_suite design \
+    django-pre-validate-signal-design-001 \
+    k8s-dra-allocation-impact-001 \
+    camel-routing-arch-001 \
+    kafka-flink-streaming-arch-001 \
+    flipt-protobuf-metadata-design-001
+
+# ── document (5 tasks) ──
+run_suite document \
+    k8s-controller-mgr-doc-gen-001 \
+    k8s-applyconfig-doc-gen-001 \
+    envoy-migration-doc-gen-001 \
+    k8s-clientgo-doc-gen-001 \
+    k8s-fairqueuing-doc-gen-001
+
+# ── fix (1 task) ──
+run_suite fix \
+    django-modelchoice-fk-fix-001
+
+# ── secure (6 tasks) ──
+run_suite secure \
+    django-policy-enforcement-001 \
+    curl-cve-triage-001 \
+    django-sensitive-file-exclusion-001 \
+    grpcurl-transitive-vuln-001 \
+    flipt-degraded-context-fix-001 \
+    django-cross-team-boundary-001
+
+# ── test (8 tasks) ──
+run_suite test \
+    terraform-code-review-001 \
+    kafka-security-review-001 \
+    vscode-code-review-001 \
+    ghost-code-review-001 \
+    envoy-code-review-001 \
+    curl-security-review-001 \
+    pandas-groupby-perf-001 \
+    test-unitgen-py-001
+
+# ── understand (3 tasks) ──
+run_suite understand \
+    kafka-message-lifecycle-qa-001 \
+    terraform-state-backend-handoff-001 \
+    cilium-ebpf-fault-qa-001
+
+echo ""
+echo "=========================================="
+echo "MCP distraction rerun complete"
+echo "=========================================="
@@ -5,7 +5,7 @@
     "generated_by": "SDLC suite migration from migration_map.json",
     "generated_date": "2026-02-18",
     "total_available": 835,
-    "total_selected": 157,
+    "total_selected": 166,
     "migration_source": "migration_map.json (157 mapped tasks across 8 SDLC suites)",
     "target_total": 170,
     "target_note": "ccb_test and ccb_document target 20 each (see docs/backlog_ccb_test.json, docs/backlog_ccb_document.json)"
@@ -46,33 +46,33 @@
       "Debugging": 14,
       "Documentation": 13,
       "Implementation (bug fix)": 33,
-      "Implementation (feature)": 25,
+      "Implementation (feature)": 26,
       "Implementation (refactor)": 2,
       "Implementation (refactoring)": 2,
       "Planning (impact analysis)": 2,
       "Refactoring": 4,
-      "Requirements & Discovery": 38,
+      "Requirements & Discovery": 44,
       "Security review": 3,
-      "Testing & QA": 12
+      "Testing & QA": 14
     },
     "tasks_per_benchmark": {
-      "ccb_build": 25,
+      "ccb_build": 26,
       "ccb_debug": 20,
       "ccb_design": 20,
-      "ccb_document": 13,
+      "ccb_document": 17,
       "ccb_fix": 25,
       "ccb_secure": 20,
-      "ccb_test": 14,
-      "ccb_understand": 20
+      "ccb_test": 16,
+      "ccb_understand": 22
     },
     "tasks_per_language": {
       "c": 10,
       "cpp": 20,
       "csharp": 3,
-      "go": 56,
-      "java": 16,
+      "go": 61,
+      "java": 17,
       "javascript": 5,
-      "python": 33,
+      "python": 36,
       "python,cpp": 1,
       "rust": 4,
       "typescript": 9
@@ -3533,6 +3533,204 @@
       "context_length_source": "task_metrics_run",
       "files_count": 6,
       "files_count_source": "task_metrics_run"
+    },
+    {
+      "task_id": "cgen-deps-install-001",
+      "benchmark": "ccb_build",
+      "sdlc_phase": "Implementation (feature)",
+      "language": "python",
+      "difficulty": "medium",
+      "category": "dependency-inference",
+      "repo": "",
+      "mcp_benefit_score": 0.55,
+      "mcp_breakdown": {
+        "context_complexity": 0.5,
+        "cross_file_deps": 0.4,
+        "semantic_search_potential": 0.6,
+        "task_category_weight": 0.7
+      },
+      "selection_rationale": "New SDLC task: dependency inference from DIBench",
+      "task_dir": "ccb_build/cgen-deps-install-001",
+      "context_length": 500000,
+      "context_length_source": "mcp_breakdown_proxy",
+      "files_count": 8,
+      "files_count_source": "mcp_breakdown_proxy"
+    },
+    {
+      "task_id": "django-composite-field-recover-001",
+      "benchmark": "ccb_understand",
+      "sdlc_phase": "Requirements & Discovery",
+      "language": "python",
+      "difficulty": "hard",
+      "category": "enterprise_knowledge_fragmentation",
+      "repo": "django/django",
+      "mcp_benefit_score": 0.85,
+      "mcp_breakdown": {
+        "context_complexity": 0.9,
+        "cross_file_deps": 0.85,
+        "semantic_search_potential": 0.8,
+        "task_category_weight": 0.85
+      },
+      "selection_rationale": "New SDLC task: knowledge fragmentation recovery across Django packages",
+      "task_dir": "ccb_understand/django-composite-field-recover-001",
+      "context_length": 850000,
+      "context_length_source": "mcp_breakdown_proxy",
+      "files_count": 15,
+      "files_count_source": "mcp_breakdown_proxy"
+    },
+    {
+      "task_id": "django-template-inherit-recall-001",
+      "benchmark": "ccb_understand",
+      "sdlc_phase": "Requirements & Discovery",
+      "language": "python",
+      "difficulty": "hard",
+      "category": "enterprise_institutional_memory",
+      "repo": "django/django",
+      "mcp_benefit_score": 0.85,
+      "mcp_breakdown": {
+        "context_complexity": 0.9,
+        "cross_file_deps": 0.85,
+        "semantic_search_potential": 0.8,
+        "task_category_weight": 0.85
+      },
+      "selection_rationale": "New SDLC task: institutional memory recall for Django template regression",
+      "task_dir": "ccb_understand/django-template-inherit-recall-001",
+      "context_length": 850000,
+      "context_length_source": "mcp_breakdown_proxy",
+      "files_count": 12,
+      "files_count_source": "mcp_breakdown_proxy"
+    },
+    {
+      "task_id": "docgen-changelog-001",
+      "benchmark": "ccb_document",
+      "sdlc_phase": "Requirements & Discovery",
+      "language": "go",
+      "difficulty": "medium",
+      "category": "changelog_generation",
+      "repo": "hashicorp/terraform",
+      "mcp_benefit_score": 0.82,
+      "mcp_breakdown": {
+        "context_complexity": 0.85,
+        "cross_file_deps": 0.75,
+        "semantic_search_potential": 0.85,
+        "task_category_weight": 0.85
+      },
+      "selection_rationale": "New SDLC task: changelog generation requiring cross-module change discovery",
+      "task_dir": "ccb_document/docgen-changelog-001",
+      "context_length": 750000,
+      "context_length_source": "mcp_breakdown_proxy",
+      "files_count": 10,
+      "files_count_source": "mcp_breakdown_proxy"
+    },
+    {
+      "task_id": "docgen-changelog-002",
+      "benchmark": "ccb_document",
+      "sdlc_phase": "Requirements & Discovery",
+      "language": "go",
+      "difficulty": "medium",
+      "category": "changelog_generation",
+      "repo": "flipt-io/flipt",
+      "mcp_benefit_score": 0.82,
+      "mcp_breakdown": {
+        "context_complexity": 0.85,
+        "cross_file_deps": 0.75,
+        "semantic_search_potential": 0.85,
+        "task_category_weight": 0.85
+      },
+      "selection_rationale": "New SDLC task: release notes generation requiring API change discovery",
+      "task_dir": "ccb_document/docgen-changelog-002",
+      "context_length": 600000,
+      "context_length_source": "mcp_breakdown_proxy",
+      "files_count": 10,
+      "files_count_source": "mcp_breakdown_proxy"
+    },
+    {
+      "task_id": "docgen-inline-002",
+      "benchmark": "ccb_document",
+      "sdlc_phase": "Requirements & Discovery",
+      "language": "java",
+      "difficulty": "hard",
+      "category": "inline_docstring_generation",
+      "repo": "apache/kafka",
+      "mcp_benefit_score": 0.88,
+      "mcp_breakdown": {
+        "context_complexity": 0.9,
+        "cross_file_deps": 0.85,
+        "semantic_search_potential": 0.9,
+        "task_category_weight": 0.85
+      },
+      "selection_rationale": "New SDLC task: Javadoc generation requiring thread-safety and performance analysis",
+      "task_dir": "ccb_document/docgen-inline-002",
+      "context_length": 800000,
+      "context_length_source": "mcp_breakdown_proxy",
+      "files_count": 12,
+      "files_count_source": "mcp_breakdown_proxy"
+    },
+    {
+      "task_id": "docgen-onboard-001",
+      "benchmark": "ccb_document",
+      "sdlc_phase": "Requirements & Discovery",
+      "language": "go",
+      "difficulty": "hard",
+      "category": "onboarding_guide",
+      "repo": "istio/istio",
+      "mcp_benefit_score": 0.9,
+      "mcp_breakdown": {
+        "context_complexity": 0.95,
+        "cross_file_deps": 0.85,
+        "semantic_search_potential": 0.9,
+        "task_category_weight": 0.9
+      },
+      "selection_rationale": "New SDLC task: onboarding guide requiring cross-package architecture discovery",
+      "task_dir": "ccb_document/docgen-onboard-001",
+      "context_length": 900000,
+      "context_length_source": "mcp_breakdown_proxy",
+      "files_count": 15,
+      "files_count_source": "mcp_breakdown_proxy"
+    },
+    {
+      "task_id": "test-integration-001",
+      "benchmark": "ccb_test",
+      "sdlc_phase": "Testing & QA",
+      "language": "go",
+      "difficulty": "hard",
+      "category": "integration-test-authoring",
+      "repo": "flipt-io/flipt",
+      "mcp_benefit_score": 0.78,
+      "mcp_breakdown": {
+        "context_complexity": 0.8,
+        "cross_file_deps": 0.75,
+        "semantic_search_potential": 0.75,
+        "task_category_weight": 0.8
+      },
+      "selection_rationale": "New SDLC task: integration test authoring requiring API endpoint discovery",
+      "task_dir": "ccb_test/test-integration-001",
+      "context_length": 700000,
+      "context_length_source": "mcp_breakdown_proxy",
+      "files_count": 10,
+      "files_count_source": "mcp_breakdown_proxy"
+    },
+    {
+      "task_id": "test-unitgen-go-001",
+      "benchmark": "ccb_test",
+      "sdlc_phase": "Testing & QA",
+      "language": "go",
+      "difficulty": "hard",
+      "category": "unit-test-generation",
+      "repo": "kubernetes/kubernetes",
+      "mcp_benefit_score": 0.8,
+      "mcp_breakdown": {
+        "context_complexity": 0.85,
+        "cross_file_deps": 0.75,
+        "semantic_search_potential": 0.8,
+        "task_category_weight": 0.8
+      },
+      "selection_rationale": "New SDLC task: unit test generation requiring function discovery and pattern analysis",
+      "task_dir": "ccb_test/test-unitgen-go-001",
+      "context_length": 800000,
+      "context_length_source": "mcp_breakdown_proxy",
+      "files_count": 12,
+      "files_count_source": "mcp_breakdown_proxy"
     }
   ]
 }