Skip to content

Speed up gfortran debug CI builds #5793

Speed up gfortran debug CI builds

Speed up gfortran debug CI builds #5793

Workflow file for this run

# Workflow-level settings: display name, triggers, and run concurrency.
name: Test Suite

on:
  push:
    branches:
      - master
  pull_request:
    types:
      - opened
      - synchronize
      - reopened
      - ready_for_review
  workflow_dispatch:
  schedule:
    # Weekly Monday 6 AM UTC: refresh coverage cache before 7-day expiry
    - cron: '0 6 * * 1'

concurrency:
  # Push events get a group keyed by commit SHA and are never cancelled
  # (cancel-in-progress evaluates to false for them). Every other event is
  # grouped by github.ref, so a newer run on the same ref cancels the older one.
  group: ${{ github.workflow }}-${{ github.event_name == 'push' && github.sha || github.ref }}
  cancel-in-progress: ${{ github.event_name != 'push' }}

jobs:
# Static checks: formatting, spelling, and the toolchain/source/docs linters.
# Every other job except file-changes lists this job in its `needs`.
lint-gate:
  name: Lint Gate
  runs-on: ubuntu-latest
  steps:
    - uses: actions/checkout@v4
      name: Clone

    - uses: actions/setup-python@v5
      name: Setup Python
      with:
        python-version: '3.12'

    - name: Initialize MFC
      run: ./mfc.sh init

    # Run the formatter, then fail if it modified anything in the work tree.
    - name: Check Formatting
      run: |
        ./mfc.sh format -j "$(nproc)"
        if ! git diff --exit-code; then
          echo "::error::Code is not formatted. Run './mfc.sh format' locally."
          exit 1
        fi

    - name: Spell Check
      run: ./mfc.sh spelling

    - name: Lint Toolchain
      run: ./mfc.sh lint

    - name: Lint Source
      run: python3 toolchain/mfc/lint_source.py

    - name: Lint Docs
      run: python3 toolchain/mfc/lint_docs.py

    - name: Lint Parameter Docs
      run: python3 toolchain/mfc/lint_param_docs.py
# Classifies what changed so downstream jobs can decide how much work to do.
# Outputs:
#   checkall    - output of the `changes` paths-filter step (filters are read
#                 from .github/file-filter.yml)
#   cases_py    - output of the `changes` paths-filter step
#   dep_changed - 'true' when the diff adds or removes a Fortran
#                 use / include / #:include line, or when the diff could not
#                 be obtained (safe fallback); 'false' otherwise
file-changes:
  name: Detect File Changes
  runs-on: 'ubuntu-latest'
  outputs:
    checkall: ${{ steps.changes.outputs.checkall }}
    cases_py: ${{ steps.changes.outputs.cases_py }}
    dep_changed: ${{ steps.dep-check.outputs.dep_changed }}
  steps:
    - name: Clone
      uses: actions/checkout@v4
    - name: Detect Changes
      uses: dorny/paths-filter@v3
      id: changes
      with:
        filters: ".github/file-filter.yml"
    - name: Check for Fortran dependency changes
      id: dep-check
      env:
        # Consumed by the `gh` CLI call below.
        GH_TOKEN: ${{ github.token }}
      run: |
        # Detect added/removed use/include statements that change the
        # Fortran dependency graph, which would make the coverage cache stale.
        PR_NUMBER="${{ github.event.pull_request.number }}"
        BEFORE="${{ github.event.before }}"
        AFTER="${{ github.event.after }}"
        if [ "${{ github.event_name }}" = "pull_request" ]; then
          # Default to dep_changed=true if gh pr diff fails (safe fallback).
          DIFF=$(gh pr diff "$PR_NUMBER" 2>/dev/null) || {
            echo "gh pr diff failed — defaulting to dep_changed=true for safety."
            echo "dep_changed=true" >> "$GITHUB_OUTPUT"
            exit 0
          }
        elif [ "${{ github.event_name }}" = "push" ]; then
          # The Clone step uses the actions/checkout defaults, which fetch only
          # the checked-out commit, so $BEFORE is normally absent from the local
          # object store and the diff could never succeed. Fetch it best-effort;
          # if the fetch fails (e.g. an all-zero SHA), the diff below fails too
          # and the safe fallback still applies.
          git fetch --no-tags --depth=1 origin "$BEFORE" 2>/dev/null || true
          DIFF=$(git diff "$BEFORE".."$AFTER" 2>/dev/null) || {
            echo "git diff failed for push event — defaulting to dep_changed=true for safety."
            echo "dep_changed=true" >> "$GITHUB_OUTPUT"
            exit 0
          }
        else
          # Other events (workflow_dispatch, schedule) have no diff to inspect.
          DIFF=""
        fi
        # Match added (+) or removed (-) diff lines that start with a
        # use / #:include / include '...' statement.
        if echo "$DIFF" | \
          grep -qE '^[+-][[:space:]]*(use[[:space:],]+[a-zA-Z_]|#:include[[:space:]]|include[[:space:]]+['"'"'"])'; then
          echo "dep_changed=true" >> "$GITHUB_OUTPUT"
          echo "Fortran dependency change detected — will rebuild coverage cache."
        else
          echo "dep_changed=false" >> "$GITHUB_OUTPUT"
        fi
# Regenerates the test coverage cache through a SLURM job, then publishes it
# both as a run artifact (for later jobs in this run) and via actions/cache
# (for later runs).
rebuild-cache:
  name: Rebuild Coverage Cache
  needs:
    - lint-gate
    - file-changes
  # Upstream repository only. Manual and scheduled runs always rebuild;
  # pull_request and push runs rebuild only when cases.py or the Fortran
  # dependency graph changed.
  if: >-
    github.repository == 'MFlowCode/MFC' &&
    (
    github.event_name == 'workflow_dispatch' ||
    github.event_name == 'schedule' ||
    (
    (github.event_name == 'pull_request' || github.event_name == 'push') &&
    (needs.file-changes.outputs.cases_py == 'true' ||
    needs.file-changes.outputs.dep_changed == 'true')
    )
    )
  timeout-minutes: 240
  runs-on:
    group: phoenix
    labels: gt
  steps:
    - name: Clone
      uses: actions/checkout@v4
      with:
        # For pull requests, check out the PR head commit; otherwise the
        # commit that triggered the run.
        ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
        clean: false

    - name: Rebuild Cache via SLURM
      run: bash .github/scripts/submit-slurm-job.sh .github/workflows/common/rebuild-cache.sh cpu none phoenix

    - name: Print Logs
      if: always()
      run: cat rebuild-cache-cpu-none.out

    # Short-lived artifact: only needed by the test jobs of this same run.
    - name: Upload Cache Artifact
      uses: actions/upload-artifact@v4
      with:
        name: coverage-cache
        path: toolchain/mfc/test/test_coverage_cache.json.gz
        retention-days: 1

    # Saving to actions/cache is best-effort; a failure here does not fail
    # the job.
    - name: Save Coverage Cache
      continue-on-error: true
      uses: actions/cache/save@v4
      with:
        path: toolchain/mfc/test/test_coverage_cache.json.gz
        key: coverage-cache-${{ github.event.pull_request.number || 'master' }}-${{ hashFiles('toolchain/mfc/test/cases.py') }}-${{ github.sha }}
# Build-and-test matrix on GitHub-hosted ubuntu/macos runners.
# Runs only when lint-gate and file-changes succeeded, rebuild-cache either
# succeeded or was skipped, and file-changes reported checkall == 'true'.
github:
  name: Github
  needs:
    - lint-gate
    - file-changes
    - rebuild-cache
  if: >-
    !cancelled() &&
    needs.lint-gate.result == 'success' &&
    needs.file-changes.result == 'success' &&
    (needs.rebuild-cache.result == 'success' || needs.rebuild-cache.result == 'skipped') &&
    needs.file-changes.outputs.checkall == 'true'
  strategy:
    fail-fast: false
    matrix:
      os:
        - ubuntu
        - macos
      mpi:
        - mpi
      precision:
        - ''
      debug:
        - reldebug
        - no-debug
      intel:
        - true
        - false
      # No Intel toolchain combination on macOS.
      exclude:
        - os: macos
          intel: true
      # One extra combination: single precision, no MPI.
      include:
        - os: ubuntu
          mpi: no-mpi
          precision: single
          debug: no-debug
          intel: false
  continue-on-error: true
  runs-on: ${{ matrix.os }}-latest
  steps:
    - name: Clone
      uses: actions/checkout@v4

    # Best-effort: a failure here does not fail the job.
    - name: Fetch master for coverage diff
      continue-on-error: true
      run: |
        git fetch origin master:master --depth=1
        git fetch --deepen=200

    # Coverage cache lookup order: artifact from this run's rebuild-cache job,
    # then actions/cache, then whatever file is already in the checkout.
    - id: cache-artifact
      name: Download Coverage Cache (from rebuild in this run)
      continue-on-error: true
      uses: actions/download-artifact@v4
      with:
        name: coverage-cache
        path: toolchain/mfc/test

    - id: cache-restore
      name: Restore Coverage Cache (from previous run)
      if: steps.cache-artifact.outcome != 'success'
      continue-on-error: true
      uses: actions/cache/restore@v4
      with:
        path: toolchain/mfc/test/test_coverage_cache.json.gz
        key: coverage-cache-${{ github.event.pull_request.number || 'master' }}-${{ hashFiles('toolchain/mfc/test/cases.py') }}-${{ github.sha }}
        restore-keys: |
          coverage-cache-${{ github.event.pull_request.number || 'master' }}-
          coverage-cache-master-

    # Purely informational: reports which cache source ended up being used.
    - name: Coverage Cache Status
      run: |
        if [ "${{ steps.cache-artifact.outcome }}" = "success" ]; then
          echo "Coverage cache: loaded from rebuild artifact (this run)"
        elif [ "${{ steps.cache-restore.outputs.cache-hit }}" = "true" ]; then
          echo "Coverage cache: restored from actions/cache (previous run)"
        elif [ -f toolchain/mfc/test/test_coverage_cache.json.gz ]; then
          echo "Coverage cache: using committed fallback in repo"
        else
          echo "Coverage cache: none available — full test suite will run"
        fi

    - name: Setup MacOS
      if: matrix.os == 'macos'
      run: |
        brew update
        brew upgrade || true
        brew install coreutils python fftw hdf5 gcc@15 boost open-mpi lapack
        echo "FC=gfortran-15" >> $GITHUB_ENV
        echo "BOOST_INCLUDE=/opt/homebrew/include/" >> $GITHUB_ENV

    - name: Setup Ubuntu
      if: matrix.os == 'ubuntu' && matrix.intel == false
      run: |
        sudo apt update -y
        sudo apt install -y cmake gcc g++ python3 python3-dev hdf5-tools \
          libfftw3-dev libhdf5-dev openmpi-bin libopenmpi-dev \
          libblas-dev liblapack-dev

    - name: Setup Ubuntu (Intel)
      if: matrix.os == 'ubuntu' && matrix.intel == true
      run: |
        wget https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
        sudo apt-key add GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
        sudo add-apt-repository "deb https://apt.repos.intel.com/oneapi all main"
        sudo apt-get update
        sudo apt-get install -y intel-oneapi-compiler-fortran intel-oneapi-mpi intel-oneapi-mpi-devel
        # Export only new/changed env vars from setvars.sh.
        # `printenv >> $GITHUB_ENV` dumps all vars including shell internals
        # with special characters that corrupt GITHUB_ENV parsing.
        printenv | sort > /tmp/env_before
        source /opt/intel/oneapi/setvars.sh
        printenv | sort > /tmp/env_after
        diff /tmp/env_before /tmp/env_after | grep '^>' | sed 's/^> //' >> $GITHUB_ENV

    # Invokes `mfc.sh test` with --dry-run and the matrix build flags.
    - name: Build
      env:
        TEST_ALL: ${{ matrix.mpi == 'mpi' && '--test-all' || '' }}
        PRECISION: ${{ matrix.precision != '' && format('--{0}', matrix.precision) || '' }}
      run: |
        /bin/bash mfc.sh test -v --dry-run -j $(nproc) --${{ matrix.debug }} --${{ matrix.mpi }} $PRECISION $TEST_ALL

    # reldebug entries pass `-% 20`; pull_request runs pass `--only-changes`.
    - name: Test
      env:
        TEST_ALL: ${{ matrix.mpi == 'mpi' && '--test-all' || '' }}
        TEST_PCT: ${{ matrix.debug == 'reldebug' && '-% 20' || '' }}
        ONLY_CHANGES: ${{ github.event_name == 'pull_request' && '--only-changes' || '' }}
      run: |
        /bin/bash mfc.sh test -v --max-attempts 3 -j $(nproc) $ONLY_CHANGES $TEST_ALL $TEST_PCT
# Build-and-test matrix on the self-hosted cluster runners (Phoenix, Frontier,
# Frontier AMD). Work is submitted through .github/scripts/submit-slurm-job.sh.
# Runs only in the upstream repository, for non-draft pull requests, when
# lint-gate and file-changes succeeded, rebuild-cache succeeded or was skipped,
# and file-changes reported checkall == 'true'.
self:
  name: "${{ matrix.cluster_name }} (${{ matrix.device }}${{ matrix.interface != 'none' && format('-{0}', matrix.interface) || '' }}${{ matrix.shard != '' && format(' [{0}]', matrix.shard) || '' }})"
  needs: [lint-gate, file-changes, rebuild-cache]
  if: >-
    !cancelled() &&
    needs.lint-gate.result == 'success' &&
    needs.file-changes.result == 'success' &&
    (needs.rebuild-cache.result == 'success' || needs.rebuild-cache.result == 'skipped') &&
    github.repository == 'MFlowCode/MFC' &&
    needs.file-changes.outputs.checkall == 'true' &&
    github.event.pull_request.draft != true
  # Frontier CCE compiler is periodically broken by toolchain updates (e.g.
  # cpe/25.03 introduced an IPA SIGSEGV in CCE 19.0.0). Allow Frontier to
  # fail without blocking PR merges; Phoenix remains a hard gate.
  continue-on-error: ${{ matrix.runner == 'frontier' }}
  timeout-minutes: 480
  strategy:
    matrix:
      include:
        # Phoenix (GT) — build+test combined in SLURM job
        - runner: 'gt'
          cluster: 'phoenix'
          cluster_name: 'Georgia Tech | Phoenix'
          device: 'gpu'
          interface: 'acc'
        - runner: 'gt'
          cluster: 'phoenix'
          cluster_name: 'Georgia Tech | Phoenix'
          device: 'gpu'
          interface: 'omp'
        - runner: 'gt'
          cluster: 'phoenix'
          cluster_name: 'Georgia Tech | Phoenix'
          device: 'cpu'
          interface: 'none'
        # Frontier (ORNL) — CCE
        - runner: 'frontier'
          cluster: 'frontier'
          cluster_name: 'Oak Ridge | Frontier'
          device: 'gpu'
          interface: 'acc'
          shard: '1/2'
        - runner: 'frontier'
          cluster: 'frontier'
          cluster_name: 'Oak Ridge | Frontier'
          device: 'gpu'
          interface: 'acc'
          shard: '2/2'
        - runner: 'frontier'
          cluster: 'frontier'
          cluster_name: 'Oak Ridge | Frontier'
          device: 'gpu'
          interface: 'omp'
          shard: '1/2'
        - runner: 'frontier'
          cluster: 'frontier'
          cluster_name: 'Oak Ridge | Frontier'
          device: 'gpu'
          interface: 'omp'
          shard: '2/2'
        - runner: 'frontier'
          cluster: 'frontier'
          cluster_name: 'Oak Ridge | Frontier'
          device: 'cpu'
          interface: 'none'
        # Frontier AMD — build on login node, GPU tests sharded for batch partition
        - runner: 'frontier'
          cluster: 'frontier_amd'
          cluster_name: 'Oak Ridge | Frontier (AMD)'
          device: 'gpu'
          interface: 'omp'
          shard: '1/2'
        - runner: 'frontier'
          cluster: 'frontier_amd'
          cluster_name: 'Oak Ridge | Frontier (AMD)'
          device: 'gpu'
          interface: 'omp'
          shard: '2/2'
        - runner: 'frontier'
          cluster: 'frontier_amd'
          cluster_name: 'Oak Ridge | Frontier (AMD)'
          device: 'cpu'
          interface: 'none'
  runs-on:
    group: phoenix
    labels: ${{ matrix.runner }}
  env:
    # Set only for Phoenix entries; empty elsewhere.
    NODE_OPTIONS: ${{ matrix.cluster == 'phoenix' && '--max-old-space-size=2048' || '' }}
  steps:
    - name: Clone
      uses: actions/checkout@v4
      with:
        # clean: false preserves .slurm_job_id files across reruns so
        # submit-slurm-job.sh can detect and cancel stale SLURM jobs on retry.
        clean: false
    # Because the checkout is not cleaned, remove logs left by earlier runs.
    - name: Clean stale output files
      run: rm -f *.out
    # Coverage cache lookup order: artifact from this run's rebuild-cache job,
    # then actions/cache, then whatever file is already in the checkout.
    - name: Download Coverage Cache (from rebuild in this run)
      id: cache-artifact
      uses: actions/download-artifact@v4
      with:
        name: coverage-cache
        path: toolchain/mfc/test
      continue-on-error: true
    - name: Restore Coverage Cache (from previous run)
      if: steps.cache-artifact.outcome != 'success'
      id: cache-restore
      uses: actions/cache/restore@v4
      with:
        path: toolchain/mfc/test/test_coverage_cache.json.gz
        key: coverage-cache-${{ github.event.pull_request.number || 'master' }}-${{ hashFiles('toolchain/mfc/test/cases.py') }}-${{ github.sha }}
        restore-keys: |
          coverage-cache-${{ github.event.pull_request.number || 'master' }}-
          coverage-cache-master-
      continue-on-error: true
    - name: Coverage Cache Status
      run: |
        if [ "${{ steps.cache-artifact.outcome }}" = "success" ]; then
          echo "Coverage cache: loaded from rebuild artifact (this run)"
        elif [ "${{ steps.cache-restore.outputs.cache-hit }}" = "true" ]; then
          echo "Coverage cache: restored from actions/cache (previous run)"
        elif [ -f toolchain/mfc/test/test_coverage_cache.json.gz ]; then
          echo "Coverage cache: using committed fallback in repo"
        else
          echo "Coverage cache: none available — full test suite will run"
        fi
    - name: Fetch Dependencies
      if: matrix.cluster != 'phoenix'
      timeout-minutes: 60
      run: bash .github/workflows/${{ matrix.cluster }}/build.sh ${{ matrix.device }} ${{ matrix.interface }}
    - name: Build
      run: bash .github/scripts/submit-slurm-job.sh .github/workflows/common/build.sh ${{ matrix.device }} ${{ matrix.interface }} ${{ matrix.cluster }} ${{ matrix.shard }}
    - name: Test
      run: bash .github/scripts/submit-slurm-job.sh .github/workflows/common/test.sh ${{ matrix.device }} ${{ matrix.interface }} ${{ matrix.cluster }} ${{ matrix.shard }}
    # On cancellation, scancel every job id recorded in a *.slurm_job_id file.
    - name: Cancel SLURM Jobs
      if: cancelled()
      run: |
        find . -name "*.slurm_job_id" | while read -r f; do
          job_id=$(cat "$f")
          echo "Cancelling SLURM job $job_id"
          scancel "$job_id" 2>/dev/null || true
        done
    # Derive the log file stems used by the two steps below. A shard such as
    # '1/2' becomes the suffix '-1-of-2'; unsharded entries get no suffix.
    - name: Compute Log Slug
      if: always()
      id: log
      run: |
        SHARD_SUFFIX=""
        SHARD="${{ matrix.shard }}"
        if [ -n "$SHARD" ]; then
          SHARD_SUFFIX="-$(echo "$SHARD" | sed 's|/|-of-|')"
        fi
        echo "build_slug=build-${{ matrix.device }}-${{ matrix.interface }}${SHARD_SUFFIX}" >> "$GITHUB_OUTPUT"
        echo "test_slug=test-${{ matrix.device }}-${{ matrix.interface }}${SHARD_SUFFIX}" >> "$GITHUB_OUTPUT"
    - name: Print Logs
      if: always()
      run: |
        # Use if/fi rather than `[ -f ] && ...`: with the short-circuit form a
        # missing last file leaves the loop (and therefore the script) with
        # exit status 1, which would fail this step for no real reason.
        for f in ${{ steps.log.outputs.build_slug }}.out ${{ steps.log.outputs.test_slug }}.out; do
          if [ -f "$f" ]; then
            echo "=== $f ==="
            cat "$f"
          fi
        done
    # always(): logs matter most when Build or Test failed; without a status
    # function this step would be skipped after any earlier failure.
    - name: Archive Logs
      uses: actions/upload-artifact@v4
      if: always() && matrix.cluster != 'phoenix'
      with:
        name: logs-${{ strategy.job-index }}-${{ steps.log.outputs.test_slug }}
        path: |
          ${{ steps.log.outputs.build_slug }}.out
          ${{ steps.log.outputs.test_slug }}.out
# Case-optimization tests on the self-hosted GPU cluster runners.
# Runs only in the upstream repository, for non-draft pull requests, when
# file-changes reported checkall == 'true'. Phoenix entries pre-build in one
# SLURM submission and run the tests in a second; the other clusters fetch
# dependencies first and then build and run in a single submission.
case-optimization:
  name: "Case Opt | ${{ matrix.cluster_name }} (${{ matrix.device }}-${{ matrix.interface }})"
  if: github.repository == 'MFlowCode/MFC' && needs.file-changes.outputs.checkall == 'true' && github.event.pull_request.draft != true
  needs: [lint-gate, file-changes]
  # Frontier is non-blocking for the same reason as the self job above.
  continue-on-error: ${{ matrix.runner == 'frontier' }}
  timeout-minutes: 480
  strategy:
    matrix:
      include:
        - runner: 'gt'
          cluster: 'phoenix'
          cluster_name: 'Georgia Tech | Phoenix'
          device: 'gpu'
          interface: 'acc'
        - runner: 'gt'
          cluster: 'phoenix'
          cluster_name: 'Georgia Tech | Phoenix'
          device: 'gpu'
          interface: 'omp'
        - runner: 'frontier'
          cluster: 'frontier'
          cluster_name: 'Oak Ridge | Frontier'
          device: 'gpu'
          interface: 'acc'
        - runner: 'frontier'
          cluster: 'frontier'
          cluster_name: 'Oak Ridge | Frontier'
          device: 'gpu'
          interface: 'omp'
        - runner: 'frontier'
          cluster: 'frontier_amd'
          cluster_name: 'Oak Ridge | Frontier (AMD)'
          device: 'gpu'
          interface: 'omp'
  runs-on:
    group: phoenix
    labels: ${{ matrix.runner }}
  steps:
    - name: Clone
      uses: actions/checkout@v4
      with:
        clean: false
    # Because the checkout is not cleaned, remove logs left by earlier runs.
    - name: Clean stale output files
      run: rm -f *.out
    - name: Fetch Dependencies
      if: matrix.cluster != 'phoenix'
      run: bash .github/workflows/${{ matrix.cluster }}/build.sh ${{ matrix.device }} ${{ matrix.interface }}
    # Note: the pre-build is submitted with device `cpu`, not matrix.device.
    - name: Pre-Build (SLURM)
      if: matrix.cluster == 'phoenix'
      run: bash .github/scripts/submit-slurm-job.sh .github/scripts/prebuild-case-optimization.sh cpu ${{ matrix.interface }} ${{ matrix.cluster }}
    - name: Build & Run Case-Optimization Tests
      if: matrix.cluster != 'phoenix'
      run: bash .github/scripts/submit-slurm-job.sh .github/scripts/run_case_optimization.sh ${{ matrix.device }} ${{ matrix.interface }} ${{ matrix.cluster }}
    - name: Run Case-Optimization Tests
      if: matrix.cluster == 'phoenix'
      run: bash .github/scripts/submit-slurm-job.sh .github/scripts/run_case_optimization.sh ${{ matrix.device }} ${{ matrix.interface }} ${{ matrix.cluster }}
    # On cancellation, scancel every job id recorded in a *.slurm_job_id file.
    - name: Cancel SLURM Jobs
      if: cancelled()
      run: |
        find . -name "*.slurm_job_id" | while read -r f; do
          job_id=$(cat "$f")
          echo "Cancelling SLURM job $job_id"
          scancel "$job_id" 2>/dev/null || true
        done
    - name: Print Logs
      if: always()
      run: |
        # NOTE(review): Pre-Build is submitted with device `cpu`, so its log is
        # presumably prebuild-case-optimization-cpu-<interface>.out (the
        # rebuild-cache job's `cpu none` submission is printed from
        # rebuild-cache-cpu-none.out). Both candidate names are checked;
        # confirm against submit-slurm-job.sh and drop the unused one.
        #
        # Use if/fi rather than `[ -f ] && ...`: with the short-circuit form a
        # missing last file leaves the loop (and therefore the script) with
        # exit status 1, which would fail this step for no real reason.
        for f in prebuild-case-optimization-cpu-${{ matrix.interface }}.out \
                 prebuild-case-optimization-${{ matrix.device }}-${{ matrix.interface }}.out \
                 run-case-optimization-${{ matrix.device }}-${{ matrix.interface }}.out; do
          if [ -f "$f" ]; then
            echo "=== $f ==="
            cat "$f"
          fi
        done
    # The first path is the presumed name of the cpu-submitted Pre-Build log;
    # see the NOTE(review) in Print Logs.
    - name: Archive Logs
      uses: actions/upload-artifact@v4
      if: always()
      with:
        name: case-opt-${{ strategy.job-index }}-${{ matrix.cluster }}-${{ matrix.interface }}
        path: |
          prebuild-case-optimization-cpu-${{ matrix.interface }}.out
          prebuild-case-optimization-${{ matrix.device }}-${{ matrix.interface }}.out
          run-case-optimization-${{ matrix.device }}-${{ matrix.interface }}.out