From b7f70ee52a96013f9b16ecfa3176303a26fb242c Mon Sep 17 00:00:00 2001 From: Agnes Leroy Date: Tue, 6 Aug 2024 14:05:41 +0200 Subject: [PATCH] chore(ci): transfer all GPU CI to hyperstack --- ..._gpu_4090_tests.yml => gpu_4090_tests.yml} | 0 ..._gpu_tests.yml => gpu_base_h100_tests.yml} | 0 ..._tfhe_gpu_tests.yml => gpu_base_tests.yml} | 21 ++++++++++++++++-- ...tests.yml => gpu_full_multi_gpu_tests.yml} | 22 ++++++++++++++++--- .../{aws_tfhe_gpu_pcc.yml => gpu_pcc.yml} | 0 ....yml => gpu_signed_integer_h100_tests.yml} | 0 ...tests.yml => gpu_signed_integer_tests.yml} | 21 ++++++++++++++++-- ...ml => gpu_unsigned_integer_h100_tests.yml} | 0 ...sts.yml => gpu_unsigned_integer_tests.yml} | 21 ++++++++++++++++-- ci/slab.toml | 22 +++++++------------ 10 files changed, 84 insertions(+), 23 deletions(-) rename .github/workflows/{aws_tfhe_gpu_4090_tests.yml => gpu_4090_tests.yml} (100%) rename .github/workflows/{hyperstack_tfhe_gpu_tests.yml => gpu_base_h100_tests.yml} (100%) rename .github/workflows/{aws_tfhe_gpu_tests.yml => gpu_base_tests.yml} (88%) rename .github/workflows/{aws_tfhe_multi_gpu_tests.yml => gpu_full_multi_gpu_tests.yml} (89%) rename .github/workflows/{aws_tfhe_gpu_pcc.yml => gpu_pcc.yml} (100%) rename .github/workflows/{hyperstack_tfhe_signed_integer_gpu_tests.yml => gpu_signed_integer_h100_tests.yml} (100%) rename .github/workflows/{aws_tfhe_signed_integer_gpu_tests.yml => gpu_signed_integer_tests.yml} (89%) rename .github/workflows/{hyperstack_tfhe_unsigned_integer_gpu_tests.yml => gpu_unsigned_integer_h100_tests.yml} (100%) rename .github/workflows/{aws_tfhe_integer_gpu_tests.yml => gpu_unsigned_integer_tests.yml} (89%) diff --git a/.github/workflows/aws_tfhe_gpu_4090_tests.yml b/.github/workflows/gpu_4090_tests.yml similarity index 100% rename from .github/workflows/aws_tfhe_gpu_4090_tests.yml rename to .github/workflows/gpu_4090_tests.yml diff --git a/.github/workflows/hyperstack_tfhe_gpu_tests.yml b/.github/workflows/gpu_base_h100_tests.yml similarity index 100% rename from .github/workflows/hyperstack_tfhe_gpu_tests.yml rename to .github/workflows/gpu_base_h100_tests.yml diff --git a/.github/workflows/aws_tfhe_gpu_tests.yml b/.github/workflows/gpu_base_tests.yml similarity index 88% rename from .github/workflows/aws_tfhe_gpu_tests.yml rename to .github/workflows/gpu_base_tests.yml index 7f6908e3bb..85be570ef2 100644 --- a/.github/workflows/aws_tfhe_gpu_tests.yml +++ b/.github/workflows/gpu_base_tests.yml @@ -47,6 +47,7 @@ jobs: - tfhe/src/high_level_api/** - tfhe/src/c_api/** - 'tfhe/docs/**.md' + - '.github/workflows/gpu_base_tests.yml' setup-instance: name: Setup instance (cuda-tests) @@ -65,7 +66,7 @@ jobs: github-token: ${{ secrets.SLAB_ACTION_TOKEN }} slab-url: ${{ secrets.SLAB_BASE_URL }} job-secret: ${{ secrets.JOB_SECRET }} - backend: aws + backend: hyperstack profile: gpu-test cuda-tests-linux: @@ -84,11 +85,23 @@ jobs: include: - os: ubuntu-22.04 cuda: "12.2" - gcc: 9 + gcc: 11 env: CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }} steps: + # Mandatory on hyperstack since a bootable volume is not re-usable yet. + - name: Install dependencies + run: | + sudo apt update + sudo apt install -y checkinstall zlib1g-dev libssl-dev + wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz + tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz + cd cmake-${{ env.CMAKE_VERSION }} + ./bootstrap + make -j"$(nproc)" + sudo make install + - name: Checkout tfhe-rs uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 with: @@ -122,6 +135,10 @@ jobs: echo "HOME=/home/ubuntu"; } >> "${GITHUB_ENV}" + - name: Check device is detected + if: ${{ !cancelled() }} + run: nvidia-smi + - name: Run core crypto and internal CUDA backend tests run: | make test_core_crypto_gpu diff --git a/.github/workflows/aws_tfhe_multi_gpu_tests.yml b/.github/workflows/gpu_full_multi_gpu_tests.yml similarity index 89% rename from .github/workflows/aws_tfhe_multi_gpu_tests.yml rename to .github/workflows/gpu_full_multi_gpu_tests.yml index 5edeb280dc..11451651d0 100644 --- a/.github/workflows/aws_tfhe_multi_gpu_tests.yml +++ b/.github/workflows/gpu_full_multi_gpu_tests.yml @@ -49,7 +49,7 @@ jobs: - tfhe/src/c_api/** - 'tfhe/docs/**.md' - Makefile - - '.github/workflows/aws_tfhe_multi_gpu**' + - '.github/workflows/**_multi_gpu_tests.yml' - scripts/** - ci/** @@ -71,7 +71,7 @@ jobs: github-token: ${{ secrets.SLAB_ACTION_TOKEN }} slab-url: ${{ secrets.SLAB_BASE_URL }} job-secret: ${{ secrets.JOB_SECRET }} - backend: aws + backend: hyperstack profile: multi-gpu-test cuda-tests-linux: @@ -90,11 +90,23 @@ jobs: include: - os: ubuntu-22.04 cuda: "12.2" - gcc: 9 + gcc: 11 env: CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }} steps: + # Mandatory on hyperstack since a bootable volume is not re-usable yet. + - name: Install dependencies + run: | + sudo apt update + sudo apt install -y checkinstall zlib1g-dev libssl-dev + wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz + tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz + cd cmake-${{ env.CMAKE_VERSION }} + ./bootstrap + make -j"$(nproc)" + sudo make install + - name: Checkout tfhe-rs uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 @@ -126,6 +138,10 @@ jobs: echo "HOME=/home/ubuntu"; } >> "${GITHUB_ENV}" + - name: Check device is detected + if: ${{ !cancelled() }} + run: nvidia-smi + # No need to test core_crypto and classic PBS in integer since it's already tested on single GPU. - name: Run multi-bit CUDA integer tests run: | diff --git a/.github/workflows/aws_tfhe_gpu_pcc.yml b/.github/workflows/gpu_pcc.yml similarity index 100% rename from .github/workflows/aws_tfhe_gpu_pcc.yml rename to .github/workflows/gpu_pcc.yml diff --git a/.github/workflows/hyperstack_tfhe_signed_integer_gpu_tests.yml b/.github/workflows/gpu_signed_integer_h100_tests.yml similarity index 100% rename from .github/workflows/hyperstack_tfhe_signed_integer_gpu_tests.yml rename to .github/workflows/gpu_signed_integer_h100_tests.yml diff --git a/.github/workflows/aws_tfhe_signed_integer_gpu_tests.yml b/.github/workflows/gpu_signed_integer_tests.yml similarity index 89% rename from .github/workflows/aws_tfhe_signed_integer_gpu_tests.yml rename to .github/workflows/gpu_signed_integer_tests.yml index b4dc8dc92b..15a2588236 100644 --- a/.github/workflows/aws_tfhe_signed_integer_gpu_tests.yml +++ b/.github/workflows/gpu_signed_integer_tests.yml @@ -56,6 +56,7 @@ jobs: - tfhe/src/high_level_api/** - tfhe/src/c_api/** - 'tfhe/docs/**.md' + - '.github/workflows/gpu_signed_integer_tests.yml' setup-instance: name: Setup instance (cuda-signed-integer-tests) @@ -75,7 +76,7 @@ jobs: github-token: ${{ secrets.SLAB_ACTION_TOKEN }} slab-url: ${{ secrets.SLAB_BASE_URL }} job-secret: ${{ secrets.JOB_SECRET }} - backend: aws + backend: hyperstack profile: gpu-test cuda-signed-integer-tests: @@ -94,11 +95,23 @@ jobs: include: - os: ubuntu-22.04 cuda: "12.2" - gcc: 9 + gcc: 11 env: CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }} steps: + # Mandatory on hyperstack since a bootable volume is not re-usable yet. + - name: Install dependencies + run: | + sudo apt update + sudo apt install -y checkinstall zlib1g-dev libssl-dev + wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz + tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz + cd cmake-${{ env.CMAKE_VERSION }} + ./bootstrap + make -j"$(nproc)" + sudo make install + - name: Checkout tfhe-rs uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 @@ -138,6 +151,10 @@ jobs: echo "NIGHTLY_TESTS=TRUE"; } >> "${GITHUB_ENV}" + - name: Check device is detected + if: ${{ !cancelled() }} + run: nvidia-smi + - name: Run signed integer multi-bit tests run: | make test_signed_integer_multi_bit_gpu_ci diff --git a/.github/workflows/hyperstack_tfhe_unsigned_integer_gpu_tests.yml b/.github/workflows/gpu_unsigned_integer_h100_tests.yml similarity index 100% rename from .github/workflows/hyperstack_tfhe_unsigned_integer_gpu_tests.yml rename to .github/workflows/gpu_unsigned_integer_h100_tests.yml diff --git a/.github/workflows/aws_tfhe_integer_gpu_tests.yml b/.github/workflows/gpu_unsigned_integer_tests.yml similarity index 89% rename from .github/workflows/aws_tfhe_integer_gpu_tests.yml rename to .github/workflows/gpu_unsigned_integer_tests.yml index 2cee339d89..1417f95b52 100644 --- a/.github/workflows/aws_tfhe_integer_gpu_tests.yml +++ b/.github/workflows/gpu_unsigned_integer_tests.yml @@ -55,6 +55,7 @@ jobs: - tfhe/src/high_level_api/** - tfhe/src/c_api/** - 'tfhe/docs/**.md' + - '.github/workflows/gpu_unsigned_integer_tests.yml' setup-instance: name: Setup instance (cuda-unsigned-integer-tests) @@ -74,7 +75,7 @@ jobs: github-token: ${{ secrets.SLAB_ACTION_TOKEN }} slab-url: ${{ secrets.SLAB_BASE_URL }} job-secret: ${{ secrets.JOB_SECRET }} - backend: aws + backend: hyperstack profile: gpu-test cuda-unsigned-integer-tests: @@ -93,11 +94,23 @@ jobs: include: - os: ubuntu-22.04 cuda: "12.2" - gcc: 9 + gcc: 11 env: CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }} steps: + # Mandatory on hyperstack since a bootable volume is not re-usable yet. + - name: Install dependencies + run: | + sudo apt update + sudo apt install -y checkinstall zlib1g-dev libssl-dev + wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz + tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz + cd cmake-${{ env.CMAKE_VERSION }} + ./bootstrap + make -j"$(nproc)" + sudo make install + - name: Checkout tfhe-rs uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 @@ -137,6 +150,10 @@ jobs: echo "NIGHTLY_TESTS=TRUE"; } >> "${GITHUB_ENV}" + - name: Check device is detected + if: ${{ !cancelled() }} + run: nvidia-smi + - name: Run unsigned integer multi-bit tests run: | make test_unsigned_integer_multi_bit_gpu_ci diff --git a/ci/slab.toml b/ci/slab.toml index 8c6179be51..c6d36a8c14 100644 --- a/ci/slab.toml +++ b/ci/slab.toml @@ -30,13 +30,10 @@ region = "us-east-1" image_id = "ami-06b3d61f41bf8350a" instance_type = "m6i.4xlarge" -[backend.aws.gpu-test] -region = "us-east-1" -image_id = "ami-06b3d61f41bf8350a" -instance_type = "p3.2xlarge" -# One spawn attempt every 30 seconds for 1 hour -spawn_retry_attempts = 120 -spawn_retry_duration = 60 +[backend.hyperstack.gpu-test] +environment_name = "canada" +image_name = "Ubuntu Server 22.04 LTS R535 CUDA 12.2" +flavor_name = "n3-RTX-A6000x1" [backend.hyperstack.single-h100] environment_name = "canada" @@ -58,13 +55,10 @@ environment_name = "canada" image_name = "Ubuntu Server 22.04 LTS R535 CUDA 12.2" flavor_name = "n3-A100x8-NVLink" -[backend.aws.multi-gpu-test] -region = "us-east-1" -image_id = "ami-06b3d61f41bf8350a" -instance_type = "p3.8xlarge" -# One spawn attempt every 30 seconds for 1 hour -spawn_retry_attempts = 120 -spawn_retry_duration = 60 +[backend.hyperstack.multi-gpu-test] +environment_name = "canada" +image_name = "Ubuntu Server 22.04 LTS R535 CUDA 12.2" +flavor_name = "n3-RTX-A6000x4" [command.signed_integer_full_bench] workflow = "signed_integer_full_benchmark.yml"