@@ -20,6 +20,31 @@ echo "=========================================="
# Banner printed before the two benchmark jobs are launched.
echo "Starting parallel benchmark jobs..."
echo "=========================================="
2222
# Print the number of nodes in Slurm partition $1 whose state starts with
# "idle" or "mix". Prints 0 when sinfo is missing, fails, or reports nothing
# for the partition (e.g. an unknown partition name).
bench_count_free_nodes() {
    local partition=$1
    local total
    # "%t %D" prints one line per node state together with the number of nodes
    # in that state, so the counts are summed instead of counting lines
    # (counting lines would give the number of distinct states, not nodes).
    # NOTE(review): suffixed states such as "idle*" also match the prefix
    # test, exactly as the previous grep pattern did - confirm that is intended.
    # || true: a failing sinfo makes the pipeline fail under pipefail; suppress
    # so set -euo pipefail doesn't abort the script.
    total=$(sinfo -p "$partition" --noheader -o "%t %D" 2>/dev/null \
        | awk '$1 ~ /^(idle|mix)/ { n += $2 } END { print n + 0 }' || true)
    printf '%s\n' "${total:-0}"
}

# Set BENCH_GPU_PARTITION to the first candidate partition that has at least
# one idle/mix node, or to gpu-rtx6000 when none has any.
# Globals: BENCH_GPU_PARTITION (written)
# Outputs: progress / warning messages on stdout
bench_select_gpu_partition() {
    local candidate free
    BENCH_GPU_PARTITION=""
    # Prefer older/smaller partitions first (rtx6000, l40s, v100) to leave
    # large modern nodes (h200, h100, a100) free for production workloads.
    # rtx6000 has the most nodes and gives the most consistent baselines.
    for candidate in gpu-rtx6000 gpu-l40s gpu-v100 gpu-h200 gpu-h100 gpu-a100; do
        free=$(bench_count_free_nodes "$candidate")
        if [ "${free:-0}" -gt 0 ]; then
            BENCH_GPU_PARTITION="$candidate"
            echo "Selected GPU partition: $BENCH_GPU_PARTITION ($free idle/mix nodes)"
            break
        fi
    done
    if [ -z "$BENCH_GPU_PARTITION" ]; then
        echo "WARNING: No idle GPU partition found; falling back to gpu-rtx6000 (may queue)"
        BENCH_GPU_PARTITION="gpu-rtx6000"
    fi
}

# For Phoenix GPU benchmarks, select a consistent GPU partition before launching
# both parallel jobs so PR and master always land on the same GPU type.
if [ "$device" = "gpu" ] && [ "$cluster" = "phoenix" ]; then
    echo "Selecting Phoenix GPU partition for benchmark consistency..."
    bench_select_gpu_partition
    # Exported so the job scripts started below can read the chosen partition.
    export BENCH_GPU_PARTITION
fi
47+
# Run both jobs with monitoring using dedicated script from PR
# Use stdbuf for line-buffered output and prefix each line for clarity
# The subshell runs in the background; pipefail inside it makes the subshell's
# exit status reflect a failure of the monitor script, not just of the
# prefixing loop at the end of the pipeline.
(
    set -o pipefail
    stdbuf -oL -eL bash "${SCRIPT_DIR}/submit_and_monitor_bench.sh" \
        pr "$device" "$interface" "$cluster" 2>&1 \
        | while IFS= read -r line; do echo "[PR] $line"; done
) &
@@ -40,6 +65,8 @@ wait "$pr_pid"
# Capture the status of the command that ran immediately before this line
# (the hunk context shows `wait "$pr_pid"`) and report the PR job's outcome.
# NOTE(review): if the script runs under set -e, a non-zero `wait` would abort
# before this assignment is reached - confirm how the preceding wait is guarded.
pr_exit=$?
if [ "$pr_exit" -ne 0 ]; then
    echo "PR job exited with code: $pr_exit"
    echo "Last 50 lines of PR job log:"
    # Best effort: the log may not exist if the job failed before writing it.
    tail -n 50 "pr/bench-${device}-${interface}.out" 2>/dev/null || echo "Could not read PR log"
else
    echo "PR job completed successfully"
fi
@@ -48,6 +75,8 @@ wait "$master_pid"
# Capture the status of the command that ran immediately before this line
# (the hunk context shows `wait "$master_pid"`) and report the master job's
# outcome.
# NOTE(review): if the script runs under set -e, a non-zero `wait` would abort
# before this assignment is reached - confirm how the preceding wait is guarded.
master_exit=$?
if [ "$master_exit" -ne 0 ]; then
    echo "Master job exited with code: $master_exit"
    echo "Last 50 lines of master job log:"
    # Best effort: the log may not exist if the job failed before writing it.
    tail -n 50 "master/bench-${device}-${interface}.out" 2>/dev/null || echo "Could not read master log"
else
    echo "Master job completed successfully"
fi
0 commit comments