Skip to content

Commit a5d9671

Browse files
sjarmak and claude committed
fix: add || true to oracle_checks.py invocation in 61 eval.sh files
oracle_checks.py exits 1 when the composite score is 0 (by design, for the SWE-Factory exit-code-first pattern). With `set -euo pipefail` in eval.sh, this kills the script before reward.txt gets written, causing Harbor to report RewardFileNotFoundError for every zero-score task.

Fix: append `|| true` to the SCORE= pipeline so eval.sh can proceed to write reward.txt regardless of the oracle_checks.py exit code. Also fixes the eval.sh.j2 template to prevent regression in future task generation.

Affected: 61 of 81 MCP-unique eval.sh files (the original 20 already had the fix from initial generation).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 4b17558 commit a5d9671

File tree

62 files changed

+62
-62
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

62 files changed

+62
-62
lines changed

benchmarks/ccb_mcp_compliance/ccx-compliance-052/tests/eval.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@ if [ ! -f "$ORACLE_CHECKS" ]; then
5151
fi
5252

5353
echo "Running oracle checks..."
54-
SCORE=$(python3 "$ORACLE_CHECKS" --answer "$ANSWER_PATH" --spec "$TASK_SPEC_PATH" --verbose 2>&1 | tee /dev/stderr | tail -1)
54+
SCORE=$(python3 "$ORACLE_CHECKS" --answer "$ANSWER_PATH" --spec "$TASK_SPEC_PATH" --verbose 2>&1 | tee /dev/stderr | tail -1) || true
5555

5656
# Validate score is a number
5757
if ! echo "$SCORE" | python3 -c "import sys; float(sys.stdin.read().strip())" 2>/dev/null; then

benchmarks/ccb_mcp_compliance/ccx-compliance-053/tests/eval.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@ if [ ! -f "$ORACLE_CHECKS" ]; then
5151
fi
5252

5353
echo "Running oracle checks..."
54-
SCORE=$(python3 "$ORACLE_CHECKS" --answer "$ANSWER_PATH" --spec "$TASK_SPEC_PATH" --verbose 2>&1 | tee /dev/stderr | tail -1)
54+
SCORE=$(python3 "$ORACLE_CHECKS" --answer "$ANSWER_PATH" --spec "$TASK_SPEC_PATH" --verbose 2>&1 | tee /dev/stderr | tail -1) || true
5555

5656
# Validate score is a number
5757
if ! echo "$SCORE" | python3 -c "import sys; float(sys.stdin.read().strip())" 2>/dev/null; then

benchmarks/ccb_mcp_compliance/ccx-compliance-115/tests/eval.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@ if [ ! -f "$ORACLE_CHECKS" ]; then
5151
fi
5252

5353
echo "Running oracle checks..."
54-
SCORE=$(python3 "$ORACLE_CHECKS" --answer "$ANSWER_PATH" --spec "$TASK_SPEC_PATH" --verbose 2>&1 | tee /dev/stderr | tail -1)
54+
SCORE=$(python3 "$ORACLE_CHECKS" --answer "$ANSWER_PATH" --spec "$TASK_SPEC_PATH" --verbose 2>&1 | tee /dev/stderr | tail -1) || true
5555

5656
# Validate score is a number
5757
if ! echo "$SCORE" | python3 -c "import sys; float(sys.stdin.read().strip())" 2>/dev/null; then

benchmarks/ccb_mcp_compliance/ccx-compliance-118/tests/eval.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@ if [ ! -f "$ORACLE_CHECKS" ]; then
5151
fi
5252

5353
echo "Running oracle checks..."
54-
SCORE=$(python3 "$ORACLE_CHECKS" --answer "$ANSWER_PATH" --spec "$TASK_SPEC_PATH" --verbose 2>&1 | tee /dev/stderr | tail -1)
54+
SCORE=$(python3 "$ORACLE_CHECKS" --answer "$ANSWER_PATH" --spec "$TASK_SPEC_PATH" --verbose 2>&1 | tee /dev/stderr | tail -1) || true
5555

5656
# Validate score is a number
5757
if ! echo "$SCORE" | python3 -c "import sys; float(sys.stdin.read().strip())" 2>/dev/null; then

benchmarks/ccb_mcp_compliance/ccx-compliance-124/tests/eval.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@ if [ ! -f "$ORACLE_CHECKS" ]; then
5151
fi
5252

5353
echo "Running oracle checks..."
54-
SCORE=$(python3 "$ORACLE_CHECKS" --answer "$ANSWER_PATH" --spec "$TASK_SPEC_PATH" --verbose 2>&1 | tee /dev/stderr | tail -1)
54+
SCORE=$(python3 "$ORACLE_CHECKS" --answer "$ANSWER_PATH" --spec "$TASK_SPEC_PATH" --verbose 2>&1 | tee /dev/stderr | tail -1) || true
5555

5656
# Validate score is a number
5757
if ! echo "$SCORE" | python3 -c "import sys; float(sys.stdin.read().strip())" 2>/dev/null; then

benchmarks/ccb_mcp_crossorg/ccx-crossorg-062/tests/eval.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@ if [ ! -f "$ORACLE_CHECKS" ]; then
5151
fi
5252

5353
echo "Running oracle checks..."
54-
SCORE=$(python3 "$ORACLE_CHECKS" --answer "$ANSWER_PATH" --spec "$TASK_SPEC_PATH" --verbose 2>&1 | tee /dev/stderr | tail -1)
54+
SCORE=$(python3 "$ORACLE_CHECKS" --answer "$ANSWER_PATH" --spec "$TASK_SPEC_PATH" --verbose 2>&1 | tee /dev/stderr | tail -1) || true
5555

5656
# Validate score is a number
5757
if ! echo "$SCORE" | python3 -c "import sys; float(sys.stdin.read().strip())" 2>/dev/null; then

benchmarks/ccb_mcp_crossorg/ccx-crossorg-121/tests/eval.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@ if [ ! -f "$ORACLE_CHECKS" ]; then
5151
fi
5252

5353
echo "Running oracle checks..."
54-
SCORE=$(python3 "$ORACLE_CHECKS" --answer "$ANSWER_PATH" --spec "$TASK_SPEC_PATH" --verbose 2>&1 | tee /dev/stderr | tail -1)
54+
SCORE=$(python3 "$ORACLE_CHECKS" --answer "$ANSWER_PATH" --spec "$TASK_SPEC_PATH" --verbose 2>&1 | tee /dev/stderr | tail -1) || true
5555

5656
# Validate score is a number
5757
if ! echo "$SCORE" | python3 -c "import sys; float(sys.stdin.read().strip())" 2>/dev/null; then

benchmarks/ccb_mcp_crossorg/ccx-crossorg-132/tests/eval.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@ if [ ! -f "$ORACLE_CHECKS" ]; then
5151
fi
5252

5353
echo "Running oracle checks..."
54-
SCORE=$(python3 "$ORACLE_CHECKS" --answer "$ANSWER_PATH" --spec "$TASK_SPEC_PATH" --verbose 2>&1 | tee /dev/stderr | tail -1)
54+
SCORE=$(python3 "$ORACLE_CHECKS" --answer "$ANSWER_PATH" --spec "$TASK_SPEC_PATH" --verbose 2>&1 | tee /dev/stderr | tail -1) || true
5555

5656
# Validate score is a number
5757
if ! echo "$SCORE" | python3 -c "import sys; float(sys.stdin.read().strip())" 2>/dev/null; then

benchmarks/ccb_mcp_crossrepo/ccx-dep-trace-106/tests/eval.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@ if [ ! -f "$ORACLE_CHECKS" ]; then
5151
fi
5252

5353
echo "Running oracle checks..."
54-
SCORE=$(python3 "$ORACLE_CHECKS" --answer "$ANSWER_PATH" --spec "$TASK_SPEC_PATH" --verbose 2>&1 | tee /dev/stderr | tail -1)
54+
SCORE=$(python3 "$ORACLE_CHECKS" --answer "$ANSWER_PATH" --spec "$TASK_SPEC_PATH" --verbose 2>&1 | tee /dev/stderr | tail -1) || true
5555

5656
# Validate score is a number
5757
if ! echo "$SCORE" | python3 -c "import sys; float(sys.stdin.read().strip())" 2>/dev/null; then

benchmarks/ccb_mcp_crossrepo_tracing/ccx-config-trace-003/tests/eval.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@ if [ ! -f "$ORACLE_CHECKS" ]; then
5151
fi
5252

5353
echo "Running oracle checks..."
54-
SCORE=$(python3 "$ORACLE_CHECKS" --answer "$ANSWER_PATH" --spec "$TASK_SPEC_PATH" --verbose 2>&1 | tee /dev/stderr | tail -1)
54+
SCORE=$(python3 "$ORACLE_CHECKS" --answer "$ANSWER_PATH" --spec "$TASK_SPEC_PATH" --verbose 2>&1 | tee /dev/stderr | tail -1) || true
5555

5656
# Validate score is a number
5757
if ! echo "$SCORE" | python3 -c "import sys; float(sys.stdin.read().strip())" 2>/dev/null; then

0 commit comments

Comments
 (0)