Skip to content

Commit

Permalink
chore(ci): transfer all GPU CI to hyperstack
Browse files Browse the repository at this point in the history
  • Loading branch information
agnesLeroy committed Aug 6, 2024
1 parent a9bb6ea commit b7f70ee
Show file tree
Hide file tree
Showing 10 changed files with 84 additions and 23 deletions.
File renamed without changes.
File renamed without changes.
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ jobs:
- tfhe/src/high_level_api/**
- tfhe/src/c_api/**
- 'tfhe/docs/**.md'
- '.github/workflows/gpu_base_tests.yml'
setup-instance:
name: Setup instance (cuda-tests)
Expand All @@ -65,7 +66,7 @@ jobs:
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
slab-url: ${{ secrets.SLAB_BASE_URL }}
job-secret: ${{ secrets.JOB_SECRET }}
backend: aws
backend: hyperstack
profile: gpu-test

cuda-tests-linux:
Expand All @@ -84,11 +85,23 @@ jobs:
include:
- os: ubuntu-22.04
cuda: "12.2"
gcc: 9
gcc: 11
env:
CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}

steps:
# Mandatory on hyperstack since a bootable volume is not re-usable yet.
- name: Install dependencies
run: |
sudo apt update
sudo apt install -y checkinstall zlib1g-dev libssl-dev
wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
cd cmake-${{ env.CMAKE_VERSION }}
./bootstrap
make -j"$(nproc)"
sudo make install
- name: Checkout tfhe-rs
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
with:
Expand Down Expand Up @@ -122,6 +135,10 @@ jobs:
echo "HOME=/home/ubuntu";
} >> "${GITHUB_ENV}"
- name: Check device is detected
if: ${{ !cancelled() }}
run: nvidia-smi

- name: Run core crypto and internal CUDA backend tests
run: |
make test_core_crypto_gpu
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ jobs:
- tfhe/src/c_api/**
- 'tfhe/docs/**.md'
- Makefile
- '.github/workflows/aws_tfhe_multi_gpu**'
- '.github/workflows/**_multi_gpu_tests.yml'
- scripts/**
- ci/**
Expand All @@ -71,7 +71,7 @@ jobs:
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
slab-url: ${{ secrets.SLAB_BASE_URL }}
job-secret: ${{ secrets.JOB_SECRET }}
backend: aws
backend: hyperstack
profile: multi-gpu-test

cuda-tests-linux:
Expand All @@ -90,11 +90,23 @@ jobs:
include:
- os: ubuntu-22.04
cuda: "12.2"
gcc: 9
gcc: 11
env:
CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}

steps:
# Mandatory on hyperstack since a bootable volume is not re-usable yet.
- name: Install dependencies
run: |
sudo apt update
sudo apt install -y checkinstall zlib1g-dev libssl-dev
wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
cd cmake-${{ env.CMAKE_VERSION }}
./bootstrap
make -j"$(nproc)"
sudo make install
- name: Checkout tfhe-rs
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332

Expand Down Expand Up @@ -126,6 +138,10 @@ jobs:
echo "HOME=/home/ubuntu";
} >> "${GITHUB_ENV}"
- name: Check device is detected
if: ${{ !cancelled() }}
run: nvidia-smi

# No need to test core_crypto and classic PBS in integer since it's already tested on single GPU.
- name: Run multi-bit CUDA integer tests
run: |
Expand Down
File renamed without changes.
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ jobs:
- tfhe/src/high_level_api/**
- tfhe/src/c_api/**
- 'tfhe/docs/**.md'
- '.github/workflows/gpu_signed_integer_tests.yml'
setup-instance:
name: Setup instance (cuda-signed-integer-tests)
Expand All @@ -75,7 +76,7 @@ jobs:
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
slab-url: ${{ secrets.SLAB_BASE_URL }}
job-secret: ${{ secrets.JOB_SECRET }}
backend: aws
backend: hyperstack
profile: gpu-test

cuda-signed-integer-tests:
Expand All @@ -94,11 +95,23 @@ jobs:
include:
- os: ubuntu-22.04
cuda: "12.2"
gcc: 9
gcc: 11
env:
CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}

steps:
# Mandatory on hyperstack since a bootable volume is not re-usable yet.
- name: Install dependencies
run: |
sudo apt update
sudo apt install -y checkinstall zlib1g-dev libssl-dev
wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
cd cmake-${{ env.CMAKE_VERSION }}
./bootstrap
make -j"$(nproc)"
sudo make install
- name: Checkout tfhe-rs
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332

Expand Down Expand Up @@ -138,6 +151,10 @@ jobs:
echo "NIGHTLY_TESTS=TRUE";
} >> "${GITHUB_ENV}"
- name: Check device is detected
if: ${{ !cancelled() }}
run: nvidia-smi

- name: Run signed integer multi-bit tests
run: |
make test_signed_integer_multi_bit_gpu_ci
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@ jobs:
- tfhe/src/high_level_api/**
- tfhe/src/c_api/**
- 'tfhe/docs/**.md'
- '.github/workflows/gpu_unsigned_integer_tests.yml'
setup-instance:
name: Setup instance (cuda-unsigned-integer-tests)
Expand All @@ -74,7 +75,7 @@ jobs:
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
slab-url: ${{ secrets.SLAB_BASE_URL }}
job-secret: ${{ secrets.JOB_SECRET }}
backend: aws
backend: hyperstack
profile: gpu-test

cuda-unsigned-integer-tests:
Expand All @@ -93,11 +94,23 @@ jobs:
include:
- os: ubuntu-22.04
cuda: "12.2"
gcc: 9
gcc: 11
env:
CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}

steps:
# Mandatory on hyperstack since a bootable volume is not re-usable yet.
- name: Install dependencies
run: |
sudo apt update
sudo apt install -y checkinstall zlib1g-dev libssl-dev
wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
cd cmake-${{ env.CMAKE_VERSION }}
./bootstrap
make -j"$(nproc)"
sudo make install
- name: Checkout tfhe-rs
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332

Expand Down Expand Up @@ -137,6 +150,10 @@ jobs:
echo "NIGHTLY_TESTS=TRUE";
} >> "${GITHUB_ENV}"
- name: Check device is detected
if: ${{ !cancelled() }}
run: nvidia-smi

- name: Run unsigned integer multi-bit tests
run: |
make test_unsigned_integer_multi_bit_gpu_ci
Expand Down
22 changes: 8 additions & 14 deletions ci/slab.toml
Original file line number Diff line number Diff line change
Expand Up @@ -30,13 +30,10 @@ region = "us-east-1"
image_id = "ami-06b3d61f41bf8350a"
instance_type = "m6i.4xlarge"

[backend.aws.gpu-test]
region = "us-east-1"
image_id = "ami-06b3d61f41bf8350a"
instance_type = "p3.2xlarge"
# One spawn attempt every 30 seconds for 1 hour
spawn_retry_attempts = 120
spawn_retry_duration = 60
[backend.hyperstack.gpu-test]
environment_name = "canada"
image_name = "Ubuntu Server 22.04 LTS R535 CUDA 12.2"
flavor_name = "n3-RTX-A6000x1"

[backend.hyperstack.single-h100]
environment_name = "canada"
Expand All @@ -58,13 +55,10 @@ environment_name = "canada"
image_name = "Ubuntu Server 22.04 LTS R535 CUDA 12.2"
flavor_name = "n3-A100x8-NVLink"

[backend.aws.multi-gpu-test]
region = "us-east-1"
image_id = "ami-06b3d61f41bf8350a"
instance_type = "p3.8xlarge"
# One spawn attempt every 30 seconds for 1 hour
spawn_retry_attempts = 120
spawn_retry_duration = 60
[backend.hyperstack.multi-gpu-test]
environment_name = "canada"
image_name = "Ubuntu Server 22.04 LTS R535 CUDA 12.2"
flavor_name = "n3-RTX-A6000x4"

[command.signed_integer_full_bench]
workflow = "signed_integer_full_benchmark.yml"
Expand Down

0 comments on commit b7f70ee

Please sign in to comment.