Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

chore(gpu): rework ci to adapt to the shortage of h100 #1742

Merged
merged 1 commit into from
Nov 4, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
185 changes: 185 additions & 0 deletions .github/workflows/gpu_signed_integer_classic_tests.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,185 @@
# Signed integer GPU tests on an RTXA6000 VM on hyperstack with classical PBS
name: TFHE Cuda Backend - Signed integer tests with classical PBS

env:
CARGO_TERM_COLOR: always
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
RUSTFLAGS: "-C target-cpu=native"
RUST_BACKTRACE: "full"
RUST_MIN_STACK: "8388608"
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
IS_PULL_REQUEST: ${{ github.event_name == 'pull_request' }}

on:
# Allows you to run this workflow manually from the Actions tab as an alternative.
workflow_dispatch:
pull_request:
types: [ labeled ]

jobs:
should-run:
runs-on: ubuntu-latest
permissions:
pull-requests: write
outputs:
gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
with:
fetch-depth: 0
token: ${{ secrets.FHE_ACTIONS_TOKEN }}

- name: Check for file changes
id: changed-files
uses: tj-actions/changed-files@c3a1bb2c992d77180ae65be6ae6c166cf40f857c
with:
since_last_remote_commit: true
files_yaml: |
gpu:
- tfhe/Cargo.toml
- tfhe/build.rs
- backends/tfhe-cuda-backend/**
- tfhe/src/core_crypto/gpu/**
- tfhe/src/integer/gpu/**
- tfhe/src/shortint/parameters/**
- tfhe/src/high_level_api/**
- tfhe/src/c_api/**
- 'tfhe/docs/**.md'
- '.github/workflows/gpu_signed_integer_classic_tests.yml'
- scripts/integer-tests.sh
- ci/slab.toml

setup-instance:
name: Setup instance (cuda-signed-classic-tests)
needs: should-run
if: github.event_name != 'pull_request' ||
(github.event.action != 'labeled' && needs.should-run.outputs.gpu_test == 'true') ||
(github.event.action == 'labeled' && github.event.label.name == 'approved' && needs.should-run.outputs.gpu_test == 'true')
runs-on: ubuntu-latest
outputs:
runner-name: ${{ steps.start-instance.outputs.label }}
steps:
- name: Start instance
id: start-instance
uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
slab-url: ${{ secrets.SLAB_BASE_URL }}
job-secret: ${{ secrets.JOB_SECRET }}
backend: hyperstack
profile: gpu-test

cuda-tests-linux:
name: CUDA signed integer tests with classical PBS
needs: [ should-run, setup-instance ]
if: github.event_name != 'pull_request' ||
(github.event_name == 'pull_request' && needs.setup-instance.result != 'skipped')
concurrency:
group: ${{ github.workflow }}_${{ github.ref }}
cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
strategy:
fail-fast: false
# explicit include-based build matrix, of known valid options
matrix:
include:
- os: ubuntu-22.04
cuda: "12.2"
gcc: 11
env:
CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
CMAKE_VERSION: 3.29.6
steps:
# Mandatory on hyperstack since a bootable volume is not re-usable yet.
- name: Install dependencies
run: |
sudo apt update
sudo apt install -y checkinstall zlib1g-dev libssl-dev libclang-dev
wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
cd cmake-${{ env.CMAKE_VERSION }}
./bootstrap
make -j"$(nproc)"
sudo make install


- name: Checkout tfhe-rs
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683

- name: Set up home
run: |
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

- name: Install latest stable
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
with:
toolchain: stable

- name: Export CUDA variables
if: ${{ !cancelled() }}
run: |
echo "CUDA_PATH=$CUDA_PATH" >> "${GITHUB_ENV}"
echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH" >> "${GITHUB_ENV}"
echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc" >> "${GITHUB_ENV}"

# Specify the correct host compilers
- name: Export gcc and g++ variables
if: ${{ !cancelled() }}
run: |
{
echo "CC=/usr/bin/gcc-${{ matrix.gcc }}";
echo "CXX=/usr/bin/g++-${{ matrix.gcc }}";
echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}";
echo "HOME=/home/ubuntu";
} >> "${GITHUB_ENV}"

- name: Check device is detected
if: ${{ !cancelled() }}
run: nvidia-smi

- name: Run signed integer tests
run: |
BIG_TESTS_INSTANCE=TRUE make test_signed_integer_gpu_ci

slack-notify:
name: Slack Notification
needs: [ setup-instance, cuda-tests-linux ]
runs-on: ubuntu-latest
if: ${{ always() && needs.cuda-tests-linux.result != 'skipped' && failure() }}
continue-on-error: true
steps:
- name: Send message
uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
env:
SLACK_COLOR: ${{ needs.cuda-tests-linux.result }}
SLACK_MESSAGE: "Integer GPU signed integer tests with classical PBS finished with status: ${{ needs.cuda-tests-linux.result }}. (${{ env.ACTION_RUN_URL }})"

teardown-instance:
name: Teardown instance (cuda-signed-classic-tests)
if: ${{ always() && needs.setup-instance.result != 'skipped' }}
needs: [ setup-instance, cuda-tests-linux ]
runs-on: ubuntu-latest
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
slab-url: ${{ secrets.SLAB_BASE_URL }}
job-secret: ${{ secrets.JOB_SECRET }}
label: ${{ needs.setup-instance.outputs.runner-name }}

- name: Slack Notification
if: ${{ failure() }}
continue-on-error: true
uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
env:
SLACK_COLOR: ${{ job.status }}
SLACK_MESSAGE: "Instance teardown (cuda-signed-classic-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
4 changes: 0 additions & 4 deletions .github/workflows/gpu_signed_integer_h100_tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -143,10 +143,6 @@ jobs:
if: ${{ !cancelled() }}
run: nvidia-smi

- name: Run signed integer tests
run: |
BIG_TESTS_INSTANCE=TRUE make test_signed_integer_gpu_ci

- name: Run signed integer multi-bit tests
run: |
BIG_TESTS_INSTANCE=TRUE make test_signed_integer_multi_bit_gpu_ci
Expand Down
185 changes: 185 additions & 0 deletions .github/workflows/gpu_unsigned_integer_classic_tests.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,185 @@
# Test unsigned integers on an RTXA6000 VM on hyperstack with the classical PBS
name: TFHE Cuda Backend - Unsigned integer tests with classical PBS

env:
CARGO_TERM_COLOR: always
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
RUSTFLAGS: "-C target-cpu=native"
RUST_BACKTRACE: "full"
RUST_MIN_STACK: "8388608"
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
IS_PULL_REQUEST: ${{ github.event_name == 'pull_request' }}

on:
# Allows you to run this workflow manually from the Actions tab as an alternative.
workflow_dispatch:
pull_request:
types: [ labeled ]

jobs:
should-run:
runs-on: ubuntu-latest
permissions:
pull-requests: write
outputs:
gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
with:
fetch-depth: 0
token: ${{ secrets.FHE_ACTIONS_TOKEN }}

- name: Check for file changes
id: changed-files
uses: tj-actions/changed-files@c3a1bb2c992d77180ae65be6ae6c166cf40f857c
with:
since_last_remote_commit: true
files_yaml: |
gpu:
- tfhe/Cargo.toml
- tfhe/build.rs
- backends/tfhe-cuda-backend/**
- tfhe/src/core_crypto/gpu/**
- tfhe/src/integer/gpu/**
- tfhe/src/shortint/parameters/**
- tfhe/src/high_level_api/**
- tfhe/src/c_api/**
- 'tfhe/docs/**.md'
- '.github/workflows/gpu_unsigned_integer_classic_tests.yml'
- scripts/integer-tests.sh
- ci/slab.toml

setup-instance:
name: Setup instance (cuda-unsigned-classic-tests)
needs: should-run
if: github.event_name != 'pull_request' ||
(github.event.action != 'labeled' && needs.should-run.outputs.gpu_test == 'true') ||
(github.event.action == 'labeled' && github.event.label.name == 'approved' && needs.should-run.outputs.gpu_test == 'true')
runs-on: ubuntu-latest
outputs:
runner-name: ${{ steps.start-instance.outputs.label }}
steps:
- name: Start instance
id: start-instance
uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
slab-url: ${{ secrets.SLAB_BASE_URL }}
job-secret: ${{ secrets.JOB_SECRET }}
backend: hyperstack
profile: gpu-test

cuda-tests-linux:
name: CUDA unsigned integer tests with classical PBS
needs: [ should-run, setup-instance ]
if: github.event_name != 'pull_request' ||
(github.event_name == 'pull_request' && needs.setup-instance.result != 'skipped')
concurrency:
group: ${{ github.workflow }}_${{ github.ref }}
cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
strategy:
fail-fast: false
# explicit include-based build matrix, of known valid options
matrix:
include:
- os: ubuntu-22.04
cuda: "12.2"
gcc: 11
env:
CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
CMAKE_VERSION: 3.29.6
steps:
# Mandatory on hyperstack since a bootable volume is not re-usable yet.
- name: Install dependencies
run: |
sudo apt update
sudo apt install -y checkinstall zlib1g-dev libssl-dev libclang-dev
wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
cd cmake-${{ env.CMAKE_VERSION }}
./bootstrap
make -j"$(nproc)"
sudo make install


- name: Checkout tfhe-rs
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683

- name: Set up home
run: |
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

- name: Install latest stable
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
with:
toolchain: stable

- name: Export CUDA variables
if: ${{ !cancelled() }}
run: |
echo "CUDA_PATH=$CUDA_PATH" >> "${GITHUB_ENV}"
echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH" >> "${GITHUB_ENV}"
echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc" >> "${GITHUB_ENV}"

# Specify the correct host compilers
- name: Export gcc and g++ variables
if: ${{ !cancelled() }}
run: |
{
echo "CC=/usr/bin/gcc-${{ matrix.gcc }}";
echo "CXX=/usr/bin/g++-${{ matrix.gcc }}";
echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}";
echo "HOME=/home/ubuntu";
} >> "${GITHUB_ENV}"

- name: Check device is detected
if: ${{ !cancelled() }}
run: nvidia-smi

- name: Run unsigned integer tests
run: |
BIG_TESTS_INSTANCE=TRUE make test_unsigned_integer_gpu_ci

slack-notify:
name: Slack Notification
needs: [ setup-instance, cuda-tests-linux ]
runs-on: ubuntu-latest
if: ${{ always() && needs.cuda-tests-linux.result != 'skipped' && failure() }}
continue-on-error: true
steps:
- name: Send message
uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
env:
SLACK_COLOR: ${{ needs.cuda-tests-linux.result }}
SLACK_MESSAGE: "Unsigned integer GPU classic tests finished with status: ${{ needs.cuda-tests-linux.result }}. (${{ env.ACTION_RUN_URL }})"

teardown-instance:
name: Teardown instance (cuda-unsigned-classic-tests)
if: ${{ always() && needs.setup-instance.result != 'skipped' }}
needs: [ setup-instance, cuda-tests-linux ]
runs-on: ubuntu-latest
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
slab-url: ${{ secrets.SLAB_BASE_URL }}
job-secret: ${{ secrets.JOB_SECRET }}
label: ${{ needs.setup-instance.outputs.runner-name }}

- name: Slack Notification
if: ${{ failure() }}
continue-on-error: true
uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
env:
SLACK_COLOR: ${{ job.status }}
SLACK_MESSAGE: "Instance teardown (cuda-unsigned-classic-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
Loading
Loading