Skip to content

Commit 8162342

Browse files
sjarmak and claude committed
feat: unified answer.json artifact evaluation for SDLC suites (Wave 1)
Add answer_json_verifier_lib.sh shared library and integrate it into 45 output-only tasks across the design/understand/document suites. In artifact mode, the library parses /workspace/answer.json, extracts the analysis text, and populates the expected output paths so existing verifiers score unchanged.

- New: scripts/answer_json_verifier_lib.sh (parse answer.json, apply diffs)
- New: scripts/integrate_answer_json_wave1.py (batch integration tool)
- Updated: claude_baseline_agent.py artifact_full preamble (answer.json schema)
- Integrated: 9 design, 16 understand, 20 document tasks

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 114fbac commit 8162342

File tree

93 files changed

+10636
-6
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

93 files changed

+10636
-6
lines changed

agents/claude_baseline_agent.py

Lines changed: 18 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -481,11 +481,23 @@ def create_run_agent_commands(self, instruction: str) -> list[ExecInput]:
481481
# locally, artifact configs produce diffs as output artifacts.
482482
if mcp_type == "artifact_full":
483483
workflow_tail = (
484-
"3. **Produce artifacts** — Express all code changes as "
485-
"**unified diffs** in your output artifact (e.g., "
486-
"`fix_patch` fields in review.json, or a standalone "
487-
"`solution.patch` file). Do NOT edit source files directly "
488-
"— there are none in your workspace."
484+
"3. **Produce answer.json** — Write ALL output to "
485+
"`/workspace/answer.json` with this structure:\n"
486+
" ```json\n"
487+
" {\n"
488+
' "analysis": {\n'
489+
' "summary": "Brief description of your approach",\n'
490+
' "files_examined": [{"path": "file.ext", "description": "..."}],\n'
491+
' "reasoning": "Detailed explanation or analysis"\n'
492+
" },\n"
493+
' "changes": [\n'
494+
' {"file": "path.ext", "description": "...", "diff": "unified diff"}\n'
495+
" ]\n"
496+
" }\n"
497+
" ```\n"
498+
" Omit `changes` if the task is analysis-only. "
499+
"Do NOT edit source files directly — produce diffs in "
500+
"`changes[]` instead."
489501
)
490502
else:
491503
workflow_tail = (
@@ -640,7 +652,7 @@ def create_run_agent_commands(self, instruction: str) -> list[ExecInput]:
640652
repo_filter_system = "Use list_repos to discover available repositories first."
641653

642654
if mcp_type == "artifact_full":
643-
mcp_system_prompt = f"""IMPORTANT: Local source files are not present. You MUST use Sourcegraph MCP tools to discover and read code, then express your changes as unified diffs in your output artifact.
655+
mcp_system_prompt = f"""IMPORTANT: Local source files are not present. You MUST use Sourcegraph MCP tools to discover and read code. Write ALL output to /workspace/answer.json with "analysis" (summary, files_examined, reasoning) and optional "changes" (file, description, diff) arrays. Do NOT edit source files directly.
644656
645657
{repo_filter_system}"""
646658
else:
Lines changed: 216 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,216 @@
#!/bin/bash
# answer_json_verifier_lib.sh — Unified answer.json verifier for artifact configs.
#
# Source this at the TOP of test.sh. It detects /tmp/.artifact_only_mode and:
#   1. Validates /workspace/answer.json exists and is valid JSON
#   2. Extracts analysis.summary + analysis.reasoning → $ANALYSIS_TEXT_FILE
#      (for keyword/pattern scoring)
#   3. Extracts analysis.files_examined → $ANALYSIS_FILES_FILE (for IR metrics)
#   4. If changes[] has diffs: copies /repo_full → /tmp/verify_repo, applies all diffs
#   5. Exports VERIFY_REPO, ARTIFACT_ONLY, ANALYSIS_TEXT_FILE, ANALYSIS_FILES_FILE,
#      ANSWER_JSON
#
# For non-artifact-only runs, this script is a no-op that sets safe defaults.
#
# The library never fails the sourcing script: every error path logs a message
# and falls back to a usable VERIFY_REPO, so a test.sh running under `set -e`
# still reaches the point where it writes its reward.
#
# Usage in test.sh:
#   #!/bin/bash
#   set -e
#   # Artifact mode: parse answer.json, apply patches, export analysis
#   if [ -f /tmp/.artifact_only_mode ] && [ -f /tests/answer_json_verifier_lib.sh ]; then
#       source /tests/answer_json_verifier_lib.sh
#   fi
#   # ... rest of test.sh uses $VERIFY_REPO, $ANALYSIS_TEXT_FILE, etc. ...

if [ ! -f /tmp/.artifact_only_mode ]; then
    # Not in artifact-only mode — export defaults for backward compat
    export VERIFY_REPO="${VERIFY_REPO:-/workspace}"
    export ARTIFACT_ONLY=false
    export ANALYSIS_TEXT_FILE=""
    export ANALYSIS_FILES_FILE=""
    export ANSWER_JSON=""
    # `return` only works when sourced; if the file is executed directly, stop
    # here instead of falling through into the artifact-mode code below.
    return 0 2>/dev/null || exit 0
fi

echo "[answer_json_verifier] Detected artifact-only mode"
export ARTIFACT_ONLY=true
export ANSWER_JSON="/workspace/answer.json"
export ANALYSIS_TEXT_FILE="/tmp/analysis.txt"
export ANALYSIS_FILES_FILE="/tmp/analysis_files.json"

# Drop leftovers from any earlier run so that a missing or unparseable
# answer.json can never be scored against stale analysis text or markers.
rm -f "$ANALYSIS_TEXT_FILE" "$ANALYSIS_FILES_FILE" \
    /tmp/.answer_json_verify_repo /tmp/.answer_json_no_changes

# ── Validate answer.json ──────────────────────────────────────────────────

if [ ! -f "$ANSWER_JSON" ]; then
    echo "[answer_json_verifier] ERROR: /workspace/answer.json not found"
    echo "[answer_json_verifier] Agent did not produce required artifact"
    export VERIFY_REPO="${VERIFY_REPO:-/workspace}"
    # Signal to test.sh that there's no output — it should score 0
    # ($ANALYSIS_TEXT_FILE does not exist at this point).
    return 0 2>/dev/null || exit 0
fi

# Validate JSON and extract fields using Python.
# The exit status is captured instead of propagated so that `set -e` in the
# sourcing test.sh does not abort before the fallback handling below runs.
_answer_json_py_status=0
python3 - "$ANSWER_JSON" <<'PYEOF' || _answer_json_py_status=$?
"""Parse answer.json, export its analysis fields, and apply changes[] diffs.

Outputs (all paths are fixed or taken from the environment):
  * $ANALYSIS_TEXT_FILE   — analysis.summary + analysis.reasoning as plain text
  * $ANALYSIS_FILES_FILE  — analysis.files_examined as a JSON list
  * /tmp/.answer_json_no_changes   — marker: nothing to apply (or nowhere to apply it)
  * /tmp/.answer_json_verify_repo  — marker holding the patched repo path
"""
import json
import os
import re
import subprocess
import sys
import tempfile

LOG = "[answer_json_verifier]"
NO_CHANGES_MARKER = "/tmp/.answer_json_no_changes"
VERIFY_REPO_MARKER = "/tmp/.answer_json_verify_repo"
VERIFY_REPO = "/tmp/verify_repo"
REPO_FULL = "/repo_full"
FENCE_RE = re.compile(r'```(?:json)?\s*\n(.*?)```', re.DOTALL)


def load_answer(path):
    """Return answer.json as a dict, or {} if it is missing or unusable."""
    try:
        with open(path, encoding="utf-8") as f:
            raw = f.read()
    except FileNotFoundError:
        print(f"{LOG} ERROR: answer.json not found", file=sys.stderr)
        return {}
    except (OSError, ValueError) as e:  # ValueError covers UnicodeDecodeError
        print(f"{LOG} ERROR: Failed to parse answer.json: {e}", file=sys.stderr)
        return {}

    try:
        answer = json.loads(raw)
    except ValueError as parse_error:
        # Only when the file is not valid JSON as-is: the agent may have
        # wrapped the object in a ```json fence, so retry on its contents.
        fenced = FENCE_RE.search(raw)
        if fenced is None:
            print(f"{LOG} ERROR: Failed to parse answer.json: {parse_error}", file=sys.stderr)
            return {}
        try:
            answer = json.loads(fenced.group(1).strip())
        except ValueError as e:
            print(f"{LOG} ERROR: Failed to parse answer.json: {e}", file=sys.stderr)
            return {}

    if not isinstance(answer, dict):
        print(f"{LOG} WARNING: answer.json is not a JSON object", file=sys.stderr)
        return {}
    return answer


def as_text(value):
    """Render an analysis field as text; non-strings are serialized as JSON."""
    if isinstance(value, str):
        return value
    return json.dumps(value, ensure_ascii=False)


def write_marker(path, content):
    """Write a small marker file that the shell wrapper inspects afterwards."""
    with open(path, "w", encoding="utf-8") as f:
        f.write(content)


def run_ok(cmd):
    """Run cmd inside the verification repo; True only on exit status 0."""
    try:
        result = subprocess.run(
            cmd, cwd=VERIFY_REPO,
            stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL,
        )
    except OSError:
        # Tool not installed (e.g. no `patch` binary): treat as "did not apply".
        return False
    return result.returncode == 0


def try_apply(patch_path):
    """Apply one patch file, strictest strategy first.

    Returns the label of the strategy that succeeded, or None if none did.
    """
    if run_ok(["git", "apply", "--allow-empty", patch_path]):
        return "git apply"

    # `patch` applies whatever hunks it can before reporting failure, so
    # rehearse with --dry-run first: a partial application would leave the
    # tree half-patched (plus .rej files) for the next strategy and for the
    # verifier. --no-backup-if-mismatch keeps fuzzy matches from dropping
    # .orig files into the repo.
    patch_cmd = ["patch", "-p1", "--fuzz=3", "--no-backup-if-mismatch", "-i", patch_path]
    if run_ok(patch_cmd + ["--dry-run"]) and run_ok(patch_cmd):
        return "patch -p1"

    if run_ok(["git", "apply", "--allow-empty", "--3way", patch_path]):
        return "git apply --3way"
    return None


def export_analysis(answer):
    """Write the analysis text and examined-files list for the verifiers."""
    analysis_text_file = os.environ.get("ANALYSIS_TEXT_FILE", "/tmp/analysis.txt")
    analysis_files_file = os.environ.get("ANALYSIS_FILES_FILE", "/tmp/analysis_files.json")

    analysis = answer.get("analysis", {})
    if not isinstance(analysis, dict):
        analysis = {}

    # Build analysis text from summary + reasoning (what verifiers will grep)
    parts = [as_text(analysis[key]) for key in ("summary", "reasoning") if analysis.get(key)]
    analysis_text = "\n\n".join(parts)
    with open(analysis_text_file, "w", encoding="utf-8") as f:
        f.write(analysis_text)
    print(f"{LOG} Wrote analysis text ({len(analysis_text)} chars) to {analysis_text_file}")

    # Extract files_examined for IR metrics
    files_examined = analysis.get("files_examined", [])
    if not isinstance(files_examined, list):
        files_examined = []
    with open(analysis_files_file, "w", encoding="utf-8") as f:
        json.dump(files_examined, f, indent=2)
    print(f"{LOG} Wrote {len(files_examined)} examined files to {analysis_files_file}")


def apply_changes(changes):
    """Copy /repo_full to the verification repo and apply every diff to it."""
    print(f"{LOG} Copying {REPO_FULL} -> {VERIFY_REPO}...")
    subprocess.run(["rm", "-rf", VERIFY_REPO], check=True)
    subprocess.run(["cp", "-a", REPO_FULL, VERIFY_REPO], check=True)
    # The copy may be owned by a different user than the one running git.
    subprocess.run(
        ["git", "config", "--global", "--add", "safe.directory", VERIFY_REPO],
        capture_output=True,
    )

    applied = 0
    failed = 0
    for entry in changes:
        if not isinstance(entry, dict):
            print(f"{LOG} WARNING: Skipping malformed changes[] entry (not an object)", file=sys.stderr)
            continue
        diff_text = entry.get("diff", "")
        if not isinstance(diff_text, str) or not diff_text.strip():
            continue
        if not diff_text.endswith("\n"):
            # git apply rejects a patch whose last line is unterminated.
            diff_text += "\n"
        file_name = entry.get("file", "unknown")

        with tempfile.NamedTemporaryFile(
            mode="w", suffix=".patch", delete=False, dir="/tmp", encoding="utf-8"
        ) as pf:
            pf.write(diff_text)
            pf_path = pf.name
        try:
            strategy = try_apply(pf_path)
        finally:
            os.unlink(pf_path)

        if strategy is None:
            failed += 1
            print(f"{LOG} WARNING: Diff for {file_name} failed to apply", file=sys.stderr)
        else:
            applied += 1
            print(f"{LOG} Applied diff for {file_name} ({strategy})")

    print(f"{LOG} Diffs applied: {applied}, failed: {failed}")


def main():
    answer = load_answer(sys.argv[1])
    export_analysis(answer)

    # ── Extract and apply diffs from changes[] ────────────────────────────
    changes = answer.get("changes", [])
    if not isinstance(changes, list):
        changes = []

    if not changes:
        print(f"{LOG} No changes[] in answer.json (analysis-only task)")
        write_marker(NO_CHANGES_MARKER, "1")  # Signal no patches needed
        return 0

    # We have diffs to apply — need /repo_full
    if not os.path.isdir(REPO_FULL):
        print(f"{LOG} WARNING: {REPO_FULL} not found. Cannot apply diffs.")
        write_marker(NO_CHANGES_MARKER, "1")
        return 0

    apply_changes(changes)

    # Write verify_repo path for shell to pick up
    write_marker(VERIFY_REPO_MARKER, VERIFY_REPO)
    return 0


sys.exit(main())
PYEOF

if [ "$_answer_json_py_status" -ne 0 ]; then
    echo "[answer_json_verifier] WARNING: answer.json processing exited with status $_answer_json_py_status"
fi
unset _answer_json_py_status

# Pick up VERIFY_REPO from Python output
if [ -f /tmp/.answer_json_verify_repo ]; then
    export VERIFY_REPO="$(cat /tmp/.answer_json_verify_repo)"
    cd "$VERIFY_REPO"
    echo "[answer_json_verifier] VERIFY_REPO set to $VERIFY_REPO"
elif [ -f /tmp/.answer_json_no_changes ]; then
    # Analysis-only: no repo copy needed, use /workspace or /repo_full as fallback
    if [ -d /repo_full ]; then
        export VERIFY_REPO="/repo_full"
    else
        export VERIFY_REPO="${VERIFY_REPO:-/workspace}"
    fi
    echo "[answer_json_verifier] Analysis-only mode, VERIFY_REPO=$VERIFY_REPO"
else
    # Neither marker exists: the Python step did not run to completion.
    export VERIFY_REPO="${VERIFY_REPO:-/workspace}"
    echo "[answer_json_verifier] WARNING: Using fallback VERIFY_REPO=$VERIFY_REPO"
fi

# Clean up temp markers
rm -f /tmp/.answer_json_verify_repo /tmp/.answer_json_no_changes

echo "[answer_json_verifier] Library loaded (ARTIFACT_ONLY=$ARTIFACT_ONLY, VERIFY_REPO=$VERIFY_REPO)"

benchmarks/ccb_design/camel-routing-arch-001/tests/test.sh

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,10 @@ set -e
66

77
# sg_only_env: restore full repo before verification (no-op for regular runs)
88
[ -f /tmp/.sg_only_mode ] && [ -f /tests/sgonly_verifier_wrapper.sh ] && source /tests/sgonly_verifier_wrapper.sh
9+
# Artifact mode: parse answer.json, extract analysis text
10+
if [ -f /tmp/.artifact_only_mode ] && [ -f /tests/answer_json_verifier_lib.sh ]; then
11+
source /tests/answer_json_verifier_lib.sh
12+
fi
913

1014
cd /workspace
1115
mkdir -p /logs/verifier
@@ -18,6 +22,12 @@ source /tests/verifier_lib.sh
1822

1923
# ── Change detection guard ────────────────────────────────────────────────
2024
SOLUTION_FILE="/logs/agent/solution.md"
25+
# In artifact mode, populate expected output from answer.json analysis
26+
if [ "${ARTIFACT_ONLY:-false}" = "true" ] && [ -f "${ANALYSIS_TEXT_FILE:-}" ]; then
27+
mkdir -p "/logs/agent"
28+
cp "$ANALYSIS_TEXT_FILE" "/logs/agent/solution.md"
29+
echo "[answer_json] Copied analysis text to /logs/agent/solution.md"
30+
fi
2131
if [ ! -f "$SOLUTION_FILE" ]; then
2232
echo "No solution.md found — agent did not produce output"
2333
echo "0.0" > /logs/verifier/reward.txt

0 commit comments

Comments
 (0)