sourcegraph
diff --git a/‎agents/claude_baseline_agent.py‎
Lines changed: 18 additions & 6 deletions b/‎agents/claude_baseline_agent.py‎
Lines changed: 18 additions & 6 deletions
diff --git a/‎benchmarks/ccb_design/camel-routing-arch-001/tests/answer_json_verifier_lib.sh‎
Lines changed: 216 additions & 0 deletions b/‎benchmarks/ccb_design/camel-routing-arch-001/tests/answer_json_verifier_lib.sh‎
Lines changed: 216 additions & 0 deletions
diff --git a/‎benchmarks/ccb_design/camel-routing-arch-001/tests/test.sh‎
Lines changed: 10 additions & 0 deletions b/‎benchmarks/ccb_design/camel-routing-arch-001/tests/test.sh‎
Lines changed: 10 additions & 0 deletions
@@ -481,11 +481,23 @@ def create_run_agent_commands(self, instruction: str) -> list[ExecInput]:
             # locally, artifact configs produce diffs as output artifacts.
             if mcp_type == "artifact_full":
                 workflow_tail = (
-                    "3. **Produce artifacts** — Express all code changes as "
-                    "**unified diffs** in your output artifact (e.g., "
-                    "`fix_patch` fields in review.json, or a standalone "
-                    "`solution.patch` file). Do NOT edit source files directly "
-                    "— there are none in your workspace."
+                    "3. **Produce answer.json** — Write ALL output to "
+                    "`/workspace/answer.json` with this structure:\n"
+                    "   ```json\n"
+                    "   {\n"
+                    '     "analysis": {\n'
+                    '       "summary": "Brief description of your approach",\n'
+                    '       "files_examined": [{"path": "file.ext", "description": "..."}],\n'
+                    '       "reasoning": "Detailed explanation or analysis"\n'
+                    "     },\n"
+                    '     "changes": [\n'
+                    '       {"file": "path.ext", "description": "...", "diff": "unified diff"}\n'
+                    "     ]\n"
+                    "   }\n"
+                    "   ```\n"
+                    "   Omit `changes` if the task is analysis-only. "
+                    "Do NOT edit source files directly — produce diffs in "
+                    "`changes[]` instead."
                 )
             else:
                 workflow_tail = (
@@ -640,7 +652,7 @@ def create_run_agent_commands(self, instruction: str) -> list[ExecInput]:
                 repo_filter_system = "Use list_repos to discover available repositories first."
 
             if mcp_type == "artifact_full":
-                mcp_system_prompt = f"""IMPORTANT: Local source files are not present. You MUST use Sourcegraph MCP tools to discover and read code, then express your changes as unified diffs in your output artifact.
+                mcp_system_prompt = f"""IMPORTANT: Local source files are not present. You MUST use Sourcegraph MCP tools to discover and read code. Write ALL output to /workspace/answer.json with "analysis" (summary, files_examined, reasoning) and optional "changes" (file, description, diff) arrays. Do NOT edit source files directly.
 
 {repo_filter_system}"""
             else:
 
@@ -0,0 +1,216 @@
+#!/bin/bash
+# answer_json_verifier_lib.sh — Unified answer.json verifier for artifact configs.
+#
+# Source this at the TOP of test.sh. It detects /tmp/.artifact_only_mode and:
+#   1. Validates /workspace/answer.json exists and is valid JSON
+#   2. Extracts analysis.summary + analysis.reasoning → $ANALYSIS_TEXT_FILE (for keyword/pattern scoring)
+#   3. Extracts analysis.files_examined → $ANALYSIS_FILES_FILE (for IR metrics)
+#   4. If changes[] has diffs: copies /repo_full → /tmp/verify_repo, applies all diffs
+#   5. Exports VERIFY_REPO, ARTIFACT_ONLY, ANALYSIS_TEXT_FILE, etc.
+#
+# For non-artifact-only runs, this script is a no-op that sets safe defaults.
+#
+# Usage in test.sh:
+#   #!/bin/bash
+#   set -e
+#   # Artifact mode: parse answer.json, apply patches, export analysis
+#   if [ -f /tmp/.artifact_only_mode ] && [ -f /tests/answer_json_verifier_lib.sh ]; then
+#       source /tests/answer_json_verifier_lib.sh
+#   fi
+#   # ... rest of test.sh uses $VERIFY_REPO, $ANALYSIS_TEXT_FILE, etc. ...
+
+if ! [ -f /tmp/.artifact_only_mode ]; then
+    # Regular (non-artifact) run: there is no answer.json to parse. Publish
+    # neutral defaults so a test.sh that reads these variables keeps working.
+    export ARTIFACT_ONLY=false
+    export ANSWER_JSON="" ANALYSIS_TEXT_FILE="" ANALYSIS_FILES_FILE=""
+    # Respect a VERIFY_REPO the caller already chose; otherwise use /workspace.
+    export VERIFY_REPO="${VERIFY_REPO:-/workspace}"
+    # `return` only succeeds when this file is sourced; its error is silenced
+    # and `|| true` keeps the exit status at zero when it is executed directly.
+    return 0 2>/dev/null || true
+fi
+
+echo "[answer_json_verifier] Detected artifact-only mode"
+# Fixed locations for this run. The Python step reads ANALYSIS_TEXT_FILE and
+# ANALYSIS_FILES_FILE back from the environment, so they must be exported.
+export ARTIFACT_ONLY=true ANSWER_JSON="/workspace/answer.json"
+export ANALYSIS_TEXT_FILE="/tmp/analysis.txt" ANALYSIS_FILES_FILE="/tmp/analysis_files.json"
+
+# ── Validate answer.json ──────────────────────────────────────────────────
+
+if ! [ -f "$ANSWER_JSON" ]; then
+    # The agent produced no artifact. Report it, leave VERIFY_REPO usable, and
+    # stop loading; the analysis files are not created, so test.sh is expected
+    # to score the run as 0.
+    printf '%s\n' \
+        "[answer_json_verifier] ERROR: /workspace/answer.json not found" \
+        "[answer_json_verifier] Agent did not produce required artifact"
+    export VERIFY_REPO="${VERIFY_REPO:-/workspace}"
+    return 0 2>/dev/null || true
+fi
+
+# Validate JSON and extract fields using Python.
+# The `||` guard matters because this file is sourced by test.sh scripts that
+# run under `set -e`: without it, any unhandled Python error would abort the
+# whole verifier before a reward is written. After a failure the marker files
+# are normally absent, so the fallback VERIFY_REPO branch further down applies.
+python3 - "$ANSWER_JSON" <<'PYEOF' || echo "[answer_json_verifier] WARNING: answer.json processing exited with status $?" >&2
+import json, sys, os, subprocess, tempfile, re
+
+answer_path = sys.argv[1]
+analysis_text_file = os.environ.get("ANALYSIS_TEXT_FILE", "/tmp/analysis.txt")
+analysis_files_file = os.environ.get("ANALYSIS_FILES_FILE", "/tmp/analysis_files.json")
+
+# ── Parse answer.json ─────────────────────────────────────────────────────
+try:
+    with open(answer_path) as f:
+        raw = f.read()
+
+    # Strip markdown code fences if agent wrapped JSON in ```json blocks
+    m = re.search(r'```(?:json)?\s*\n(.*?)```', raw, re.DOTALL)
+    if m:
+        raw = m.group(1).strip()
+
+    answer = json.loads(raw)
+    if not isinstance(answer, dict):
+        print("[answer_json_verifier] WARNING: answer.json is not a JSON object", file=sys.stderr)
+        answer = {}
+except (json.JSONDecodeError, ValueError) as e:
+    print(f"[answer_json_verifier] ERROR: Failed to parse answer.json: {e}", file=sys.stderr)
+    answer = {}
+except FileNotFoundError:
+    print("[answer_json_verifier] ERROR: answer.json not found", file=sys.stderr)
+    answer = {}
+
+# ── Extract analysis fields ───────────────────────────────────────────────
+analysis = answer.get("analysis", {})
+if not isinstance(analysis, dict):
+    analysis = {}
+
+# Build analysis text from summary + reasoning (what verifiers will grep)
+parts = []
+summary = analysis.get("summary", "")
+if summary:
+    parts.append(summary)
+reasoning = analysis.get("reasoning", "")
+if reasoning:
+    parts.append(reasoning)
+analysis_text = "\n\n".join(parts)
+
+with open(analysis_text_file, "w") as f:
+    f.write(analysis_text)
+print(f"[answer_json_verifier] Wrote analysis text ({len(analysis_text)} chars) to {analysis_text_file}")
+
+# Extract files_examined for IR metrics
+files_examined = analysis.get("files_examined", [])
+if not isinstance(files_examined, list):
+    files_examined = []
+with open(analysis_files_file, "w") as f:
+    json.dump(files_examined, f, indent=2)
+print(f"[answer_json_verifier] Wrote {len(files_examined)} examined files to {analysis_files_file}")
+
+# ── Extract and apply diffs from changes[] ────────────────────────────────
+changes = answer.get("changes", [])
+if not isinstance(changes, list):
+    changes = []
+
+if not changes:
+    print("[answer_json_verifier] No changes[] in answer.json (analysis-only task)")
+    # Signal no patches needed
+    with open("/tmp/.answer_json_no_changes", "w") as f:
+        f.write("1")
+    sys.exit(0)
+
+# We have diffs to apply — need /repo_full
+verify_repo = "/tmp/verify_repo"
+repo_full = "/repo_full"
+
+if not os.path.isdir(repo_full):
+    print(f"[answer_json_verifier] WARNING: {repo_full} not found. Cannot apply diffs.")
+    with open("/tmp/.answer_json_no_changes", "w") as f:
+        f.write("1")
+    sys.exit(0)
+
+# Copy /repo_full to /tmp/verify_repo
+print(f"[answer_json_verifier] Copying {repo_full} -> {verify_repo}...")
+subprocess.run(["rm", "-rf", verify_repo], check=True)
+subprocess.run(["cp", "-a", repo_full, verify_repo], check=True)
+subprocess.run(
+    ["git", "config", "--global", "--add", "safe.directory", verify_repo],
+    capture_output=True
+)
+
+# Apply each diff
+applied = 0
+failed = 0
+
+for entry in changes:
+    diff_text = entry.get("diff", "")
+    if not diff_text or not diff_text.strip():
+        continue
+
+    file_name = entry.get("file", "unknown")
+
+    with tempfile.NamedTemporaryFile(mode='w', suffix='.patch', delete=False, dir='/tmp') as pf:
+        pf.write(diff_text)
+        pf.flush()
+        pf_path = pf.name
+
+    # Try git apply (strictest)
+    result = subprocess.run(
+        ["git", "apply", "--allow-empty", pf_path],
+        cwd=verify_repo, capture_output=True, text=True
+    )
+    if result.returncode == 0:
+        applied += 1
+        os.unlink(pf_path)
+        print(f"[answer_json_verifier] Applied diff for {file_name} (git apply)")
+        continue
+
+    # Fallback: patch -p1 --fuzz=3
+    result = subprocess.run(
+        ["patch", "-p1", "--fuzz=3", "-i", pf_path],
+        cwd=verify_repo, capture_output=True, text=True
+    )
+    if result.returncode == 0:
+        applied += 1
+        os.unlink(pf_path)
+        print(f"[answer_json_verifier] Applied diff for {file_name} (patch -p1)")
+        continue
+
+    # Fallback: git apply --3way
+    result = subprocess.run(
+        ["git", "apply", "--allow-empty", "--3way", pf_path],
+        cwd=verify_repo, capture_output=True, text=True
+    )
+    if result.returncode == 0:
+        applied += 1
+        os.unlink(pf_path)
+        print(f"[answer_json_verifier] Applied diff for {file_name} (git apply --3way)")
+        continue
+
+    failed += 1
+    print(f"[answer_json_verifier] WARNING: Diff for {file_name} failed to apply", file=sys.stderr)
+    os.unlink(pf_path)
+
+print(f"[answer_json_verifier] Diffs applied: {applied}, failed: {failed}")
+
+# Write verify_repo path for shell to pick up
+with open("/tmp/.answer_json_verify_repo", "w") as f:
+    f.write(verify_repo)
+PYEOF
+
+# Pick up VERIFY_REPO from Python output
+if [ -f /tmp/.answer_json_verify_repo ]; then
+    # The Python step prepared a scratch copy and recorded its path here.
+    export VERIFY_REPO="$(cat /tmp/.answer_json_verify_repo)"
+    cd "$VERIFY_REPO"
+    echo "[answer_json_verifier] VERIFY_REPO set to $VERIFY_REPO"
+elif [ ! -f /tmp/.answer_json_no_changes ]; then
+    # Neither marker exists: the Python step left no result to pick up.
+    export VERIFY_REPO="${VERIFY_REPO:-/workspace}"
+    echo "[answer_json_verifier] WARNING: Using fallback VERIFY_REPO=$VERIFY_REPO"
+else
+    # Analysis-only: no repo copy was made. Prefer the read-only full repo and
+    # fall back to the caller's VERIFY_REPO (or /workspace) when it is absent.
+    if [ -d /repo_full ]; then
+        VERIFY_REPO="/repo_full"
+    else
+        VERIFY_REPO="${VERIFY_REPO:-/workspace}"
+    fi
+    export VERIFY_REPO
+    echo "[answer_json_verifier] Analysis-only mode, VERIFY_REPO=$VERIFY_REPO"
+fi
+
+# The marker files only carry the Python step's result across to the shell.
+rm -f /tmp/.answer_json_verify_repo /tmp/.answer_json_no_changes
+
+echo "[answer_json_verifier] Library loaded (ARTIFACT_ONLY=$ARTIFACT_ONLY, VERIFY_REPO=$VERIFY_REPO)"
@@ -6,6 +6,10 @@ set -e
 
 # sg_only_env: restore full repo before verification (no-op for regular runs)
 [ -f /tmp/.sg_only_mode ] && [ -f /tests/sgonly_verifier_wrapper.sh ] && source /tests/sgonly_verifier_wrapper.sh
+# Artifact mode: parse answer.json, extract analysis text
+if [ -f /tmp/.artifact_only_mode ] && [ -f /tests/answer_json_verifier_lib.sh ]; then
+    source /tests/answer_json_verifier_lib.sh
+fi
 
 cd /workspace
 mkdir -p /logs/verifier
@@ -18,6 +22,12 @@ source /tests/verifier_lib.sh
 
 # ── Change detection guard ────────────────────────────────────────────────
 SOLUTION_FILE="/logs/agent/solution.md"
+# Artifact runs have no agent-written solution.md; use the analysis text
+# extracted from answer.json in its place so the guard below can find it.
+if [ "${ARTIFACT_ONLY:-false}" = "true" ] && [ -f "${ANALYSIS_TEXT_FILE:-}" ]; then
+    mkdir -p "$(dirname "$SOLUTION_FILE")"
+    cp "$ANALYSIS_TEXT_FILE" "$SOLUTION_FILE"
+    echo "[answer_json] Copied analysis text to $SOLUTION_FILE"
+fi
 if [ ! -f "$SOLUTION_FILE" ]; then
     echo "No solution.md found — agent did not produce output"
     echo "0.0" > /logs/verifier/reward.txt