Skip to content

Commit edff972

Browse files
sbryngelson (Spencer Bryngelson)
and co-authors authored
Fix self-hosted CI robustness: build cache, SLURM QOS, and submit resilience (#1295)
Co-authored-by: Spencer Bryngelson <sbryngelson@login10.frontier.olcf.ornl.gov>
1 parent f95b2c4 commit edff972

File tree

21 files changed

+299
-249
lines changed

21 files changed

+299
-249
lines changed

.github/scripts/monitor_slurm_job.sh

Lines changed: 14 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -9,11 +9,17 @@ cleanup() {
99
if [ -n "${tail_pid:-}" ]; then
1010
kill "${tail_pid}" 2>/dev/null || true
1111
fi
12-
# Cancel the SLURM job if the monitor is exiting due to an error
13-
# (e.g., the CI runner is being killed). Don't cancel on success.
12+
# Cancel the SLURM job only if it is still active in the scheduler.
13+
# If the job already left the queue (squeue returns empty), it has finished
14+
# and run_monitored_slurm_job.sh will recover via sacct — don't cancel it.
1415
if [ "${monitor_success:-0}" -ne 1 ] && [ -n "${job_id:-}" ]; then
15-
echo "Monitor exiting abnormally — cancelling SLURM job $job_id"
16-
scancel "$job_id" 2>/dev/null || true
16+
active_state=$(squeue -j "$job_id" -h -o '%T' 2>/dev/null | head -n1 | tr -d ' ' || echo "")
17+
if [ -n "$active_state" ]; then
18+
echo "Monitor exiting abnormally — cancelling SLURM job $job_id (state: $active_state)"
19+
scancel "$job_id" 2>/dev/null || true
20+
else
21+
echo "Monitor exiting abnormally — SLURM job $job_id already left queue, not cancelling"
22+
fi
1723
fi
1824
}
1925
trap cleanup EXIT
@@ -56,9 +62,11 @@ get_job_state() {
5662
}
5763

5864
# Check if a state is terminal (job is done, for better or worse)
65+
# PREEMPTED is intentionally excluded: with --requeue the job restarts under
66+
# the same job ID and we must keep monitoring rather than exiting early.
5967
is_terminal_state() {
6068
case "$1" in
61-
COMPLETED|FAILED|CANCELLED|CANCELLED+|TIMEOUT|OUT_OF_MEMORY|NODE_FAIL|BOOT_FAIL|DEADLINE|PREEMPTED|REVOKED)
69+
COMPLETED|FAILED|CANCELLED|CANCELLED+|TIMEOUT|OUT_OF_MEMORY|NODE_FAIL|BOOT_FAIL|DEADLINE|REVOKED)
6270
return 0 ;;
6371
*)
6472
return 1 ;;
@@ -74,7 +82,7 @@ while [ ! -f "$output_file" ]; do
7482
state=$(get_job_state "$job_id")
7583

7684
case "$state" in
77-
PENDING|CONFIGURING)
85+
PENDING|CONFIGURING|PREEMPTED)
7886
unknown_count=0
7987
sleep 5
8088
;;

.github/scripts/prebuild-case-optimization.sh

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,8 @@ case "$cluster" in
2121
*) echo "ERROR: Unknown cluster '$cluster'"; exit 1 ;;
2222
esac
2323

24+
rm -rf build
25+
2426
. ./mfc.sh load -c "$flag" -m g
2527
source .github/scripts/gpu-opts.sh
2628

.github/scripts/retry-build.sh

Lines changed: 7 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -1,30 +1,13 @@
11
#!/bin/bash
2-
# Provides retry_build(): 3-attempt loop with configurable cleanup.
3-
# Set RETRY_CLEAN_CMD to override cleanup (default: rm -rf build/staging build/install build/lock.yaml).
2+
# Provides retry_build(): 2-attempt loop.
3+
# On failure of attempt 1, nukes the entire build directory before attempt 2.
44
# Set RETRY_VALIDATE_CMD to run a post-build validation; failure triggers a retry.
55
# Usage: source .github/scripts/retry-build.sh
66
# retry_build ./mfc.sh build -j 8 --gpu acc
77

8-
# Try normal cleanup; if it fails, escalate to cache nuke.
9-
_retry_clean() {
10-
local clean_cmd="$1"
11-
if eval "$clean_cmd" 2>/dev/null; then
12-
return 0
13-
fi
14-
echo " Normal cleanup failed."
15-
if type _cache_nuke > /dev/null 2>&1; then
16-
echo " Escalating to NFS cache nuke..."
17-
_cache_nuke
18-
else
19-
echo " _cache_nuke not available, best-effort rm."
20-
rm -rf build/staging build/install build/lock.yaml 2>/dev/null || true
21-
fi
22-
}
23-
248
retry_build() {
25-
local clean_cmd="${RETRY_CLEAN_CMD:-rm -rf build/staging build/install build/lock.yaml}"
269
local validate_cmd="${RETRY_VALIDATE_CMD:-}"
27-
local max_attempts=3
10+
local max_attempts=2
2811
local attempt=1
2912
while [ $attempt -le $max_attempts ]; do
3013
echo "Build attempt $attempt of $max_attempts..."
@@ -33,8 +16,8 @@ retry_build() {
3316
if ! eval "$validate_cmd"; then
3417
echo "Post-build validation failed on attempt $attempt."
3518
if [ $attempt -lt $max_attempts ]; then
36-
echo "Cleaning and retrying in 5s..."
37-
_retry_clean "$clean_cmd"
19+
echo " Nuking build directory before retry..."
20+
rm -rf build 2>/dev/null || true
3821
sleep 5
3922
attempt=$((attempt + 1))
4023
continue
@@ -48,8 +31,8 @@ retry_build() {
4831
return 0
4932
fi
5033
if [ $attempt -lt $max_attempts ]; then
51-
echo "Build failed on attempt $attempt. Retrying in 30s..."
52-
_retry_clean "$clean_cmd"
34+
echo " Build failed — nuking build directory before retry..."
35+
rm -rf build 2>/dev/null || true
5336
sleep 30
5437
else
5538
echo "Build failed after $max_attempts attempts."

.github/scripts/run_monitored_slurm_job.sh

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -25,8 +25,10 @@ if [ "$monitor_exit" -ne 0 ]; then
2525
echo "Monitor exited with code $monitor_exit; re-checking SLURM job $job_id final state..."
2626
# Give the SLURM epilog time to finalize if the job just finished
2727
sleep 30
28-
final_state=$(sacct -j "$job_id" -n -X -P -o State 2>/dev/null | head -n1 | cut -d'|' -f1 | tr -d ' ' || echo "UNKNOWN")
29-
final_exit=$(sacct -j "$job_id" --format=ExitCode --noheader --parsable2 2>/dev/null | head -n1 | tr -d ' ' || echo "")
28+
final_state=$(sacct -j "$job_id" -n -X -P -o State 2>/dev/null | head -n1 | cut -d'|' -f1 | tr -d ' ' || true)
29+
final_state="${final_state:-UNKNOWN}"
30+
final_exit=$(sacct -j "$job_id" -X --format=ExitCode --noheader --parsable2 2>/dev/null | head -n1 | tr -d ' ' || true)
31+
final_exit="${final_exit:-}"
3032
echo "Final SLURM state=$final_state exit=$final_exit"
3133
if [ "$final_state" = "COMPLETED" ] && [ "$final_exit" = "0:0" ]; then
3234
echo "SLURM job $job_id completed successfully despite monitor failure — continuing."

.github/scripts/run_parallel_benchmarks.sh

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,31 @@ echo "=========================================="
2020
echo "Starting parallel benchmark jobs..."
2121
echo "=========================================="
2222

23+
# For Phoenix GPU benchmarks, select a consistent GPU partition before launching
24+
# both parallel jobs so PR and master always land on the same GPU type.
25+
if [ "$device" = "gpu" ] && [ "$cluster" = "phoenix" ]; then
26+
echo "Selecting Phoenix GPU partition for benchmark consistency..."
27+
# Prefer older/smaller partitions first (rtx6000, l40s, v100) to leave
28+
# large modern nodes (h200, h100, a100) free for production workloads.
29+
# rtx6000 has the most nodes and gives the most consistent baselines.
30+
BENCH_GPU_PARTITION=""
31+
for part in gpu-rtx6000 gpu-l40s gpu-v100 gpu-h200 gpu-h100 gpu-a100; do
32+
# || true: grep -c exits 1 on zero matches (or when sinfo returns no output
33+
# for an unknown partition); suppress so set -euo pipefail doesn't abort.
34+
idle=$(sinfo -p "$part" --noheader -o "%t" 2>/dev/null | grep -cE "^(idle|mix)" || true)
35+
if [ "${idle:-0}" -gt 0 ]; then
36+
BENCH_GPU_PARTITION="$part"
37+
echo "Selected GPU partition: $BENCH_GPU_PARTITION ($idle idle/mix nodes)"
38+
break
39+
fi
40+
done
41+
if [ -z "$BENCH_GPU_PARTITION" ]; then
42+
echo "WARNING: No idle GPU partition found; falling back to gpu-rtx6000 (may queue)"
43+
BENCH_GPU_PARTITION="gpu-rtx6000"
44+
fi
45+
export BENCH_GPU_PARTITION
46+
fi
47+
2348
# Run both jobs with monitoring using dedicated script from PR
2449
# Use stdbuf for line-buffered output and prefix each line for clarity
2550
(set -o pipefail; stdbuf -oL -eL bash "${SCRIPT_DIR}/submit_and_monitor_bench.sh" pr "$device" "$interface" "$cluster" 2>&1 | while IFS= read -r line; do echo "[PR] $line"; done) &
@@ -40,6 +65,8 @@ wait "$pr_pid"
4065
pr_exit=$?
4166
if [ "$pr_exit" -ne 0 ]; then
4267
echo "PR job exited with code: $pr_exit"
68+
echo "Last 50 lines of PR job log:"
69+
tail -n 50 "pr/bench-${device}-${interface}.out" 2>/dev/null || echo " Could not read PR log"
4370
else
4471
echo "PR job completed successfully"
4572
fi
@@ -48,6 +75,8 @@ wait "$master_pid"
4875
master_exit=$?
4976
if [ "$master_exit" -ne 0 ]; then
5077
echo "Master job exited with code: $master_exit"
78+
echo "Last 50 lines of master job log:"
79+
tail -n 50 "master/bench-${device}-${interface}.out" 2>/dev/null || echo " Could not read master log"
5180
else
5281
echo "Master job completed successfully"
5382
fi

.github/scripts/setup-build-cache.sh

Lines changed: 0 additions & 101 deletions
This file was deleted.

.github/scripts/submit_and_monitor_bench.sh

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -14,12 +14,18 @@ device="$2"
1414
interface="$3"
1515
cluster="$4"
1616

17+
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
18+
1719
echo "[$dir] Submitting benchmark for $device-$interface on $cluster..."
1820
cd "$dir"
1921

20-
# Submit and monitor job (submit.sh auto-detects bench mode from script name)
21-
bash .github/workflows/$cluster/submit.sh \
22-
.github/workflows/$cluster/bench.sh "$device" "$interface"
22+
# Always use the PR's submit.sh so both master and PR builds benefit from the
23+
# run_monitored_slurm_job.sh SIGKILL recovery wrapper. The bench script is
24+
# still resolved relative to the current directory (master/ or pr/) so the
25+
# correct branch code is benchmarked. SLURM_SUBMIT_DIR ensures the job runs
26+
# in the right directory regardless of which submit.sh is invoked.
27+
PR_SUBMIT="${SCRIPT_DIR}/../workflows/${cluster}/submit.sh"
28+
bash "$PR_SUBMIT" .github/workflows/$cluster/bench.sh "$device" "$interface"
2329

2430
# Verify the YAML output file was created
2531
job_slug="bench-$device-$interface"

.github/workflows/bench.yml

Lines changed: 35 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,7 @@ jobs:
8585
device: gpu
8686
interface: omp
8787
build_script: "bash .github/workflows/frontier_amd/build.sh gpu omp bench"
88+
continue-on-error: ${{ matrix.cluster == 'frontier' || matrix.cluster == 'frontier_amd' }}
8889
runs-on:
8990
group: ${{ matrix.group }}
9091
labels: ${{ matrix.labels }}
@@ -106,7 +107,7 @@ jobs:
106107
if: matrix.build_script != ''
107108
uses: nick-fields/retry@v3
108109
with:
109-
max_attempts: 3
110+
max_attempts: 2
110111
retry_wait_seconds: 60
111112
timeout_minutes: 150
112113
command: |
@@ -118,13 +119,20 @@ jobs:
118119
wait $pid2; e2=$?
119120
[ $e1 -eq 0 ] && [ $e2 -eq 0 ]
120121
on_retry_command: |
121-
(cd pr && ./mfc.sh clean) &
122-
(cd master && ./mfc.sh clean) &
123-
wait
122+
rm -rf pr/build master/build
124123
125124
- name: Bench (Master v. PR)
126125
run: bash pr/.github/scripts/run_parallel_benchmarks.sh ${{ matrix.device }} ${{ matrix.interface }} ${{ matrix.cluster }}
127126

127+
- name: Cancel SLURM Jobs
128+
if: cancelled()
129+
run: |
130+
find . -name "*.slurm_job_id" | while read -r f; do
131+
job_id=$(cat "$f")
132+
echo "Cancelling SLURM job $job_id"
133+
scancel "$job_id" 2>/dev/null || true
134+
done
135+
128136
- name: Generate & Post Comment
129137
if: always()
130138
run: |
@@ -137,6 +145,29 @@ jobs:
137145
cat pr/bench-${{ matrix.device }}-${{ matrix.interface }}.* 2>/dev/null || true
138146
cat master/bench-${{ matrix.device }}-${{ matrix.interface }}.* 2>/dev/null || true
139147
148+
- name: Print Per-Case Logs
149+
if: always()
150+
run: |
151+
passed=() failed=()
152+
for out in pr/build/benchmarks/*/*.out master/build/benchmarks/*/*.out; do
153+
[ -f "$out" ] || continue
154+
[ -f "${out%.out}.yaml" ] && passed+=("$out") || failed+=("$out")
155+
done
156+
157+
echo "=== Per-Case Summary: ${#failed[@]} failed, ${#passed[@]} passed ==="
158+
for out in "${failed[@]}"; do echo " [FAILED] $out"; done
159+
for out in "${passed[@]}"; do echo " [PASSED] $out"; done
160+
161+
if [ ${#failed[@]} -gt 0 ]; then
162+
echo ""
163+
echo "=== Failed Case Logs ==="
164+
for out in "${failed[@]}"; do
165+
echo "--- $out ---"
166+
cat "$out"
167+
echo ""
168+
done
169+
fi
170+
140171
# All other runners (non-Phoenix) just run without special env
141172
- name: Archive Logs (Frontier)
142173
if: always() && matrix.cluster != 'phoenix'

.github/workflows/frontier/bench.sh

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,11 @@
22

33
source .github/scripts/bench-preamble.sh
44

5+
# Cap parallel jobs at 64 to avoid overwhelming MPI daemons on large nodes.
6+
n_jobs=$(( $(nproc) > 64 ? 64 : $(nproc) ))
7+
58
if [ "$job_device" = "gpu" ]; then
69
./mfc.sh bench --mem 4 -j $n_ranks -o "$job_slug.yaml" -- -c $job_cluster $device_opts -n $n_ranks
710
else
8-
./mfc.sh bench --mem 1 -j $(nproc) -o "$job_slug.yaml" -- -c $job_cluster $device_opts -n $n_ranks
11+
./mfc.sh bench --mem 1 -j $n_jobs -o "$job_slug.yaml" -- -c $job_cluster $device_opts -n $n_ranks
912
fi

0 commit comments

Comments (0)