diff --git a/.github/workflows/integer_multi_bit_multi_gpu_benchmark_p5.yml b/.github/workflows/integer_multi_bit_multi_gpu_benchmark_p5.yml
new file mode 100644
index 0000000000..6b93a98bed
--- /dev/null
+++ b/.github/workflows/integer_multi_bit_multi_gpu_benchmark_p5.yml
@@ -0,0 +1,217 @@
+# Run 64-bit multi-bit integer benchmarks on an instance with CUDA and return parsed results to Slab CI bot.
+name: AWS p5 integer multi GPU Multi-bit benchmarks
+
+on:
+  workflow_dispatch:
+    inputs:
+      all_precisions:
+        description: "Run all precisions"
+        type: boolean
+        default: false
+      fast_default:
+        description: "Run only deduplicated default operations without scalar variants"
+        type: boolean
+        default: false
+
+env:
+  CARGO_TERM_COLOR: always
+  RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
+  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
+  RUST_BACKTRACE: "full"
+  RUST_MIN_STACK: "8388608"
+  SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
+  SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
+  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
+  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+  # Quoted so the value is the literal string "TRUE"; the "all precisions" step overrides it with FALSE.
+  FAST_BENCH: "TRUE"
+  BENCH_OP_FLAVOR: default
+
+jobs:
+  # Start the instance through Slab; its label is exported as the runner name for the benchmark job.
+  setup-instance:
+    name: Setup instance (cuda-integer-multi-bit-multi-gpu-p5-benchmarks)
+    runs-on: ubuntu-latest
+    if: ${{ (github.event_name == 'push' && github.repository == 'zama-ai/tfhe-rs') || github.event_name == 'workflow_dispatch' }}
+    outputs:
+      runner-name: ${{ steps.start-instance.outputs.label }}
+    steps:
+      - name: Start instance
+        id: start-instance
+        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
+        with:
+          mode: start
+          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
+          slab-url: ${{ secrets.SLAB_BASE_URL }}
+          job-secret: ${{ secrets.JOB_SECRET }}
+          backend: aws
+          profile: multi-h100-nvlink
+
+  # Run the benchmarks on the started instance, parse the results and send them to Slab.
+  cuda-integer-multi-bit-multi-gpu-p5-benchmarks:
+    name: Execute multi GPU P5 integer multi-bit benchmarks
+    needs: setup-instance
+    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
+    timeout-minutes: 1440  # 24 hours
+    continue-on-error: true
+    strategy:
+      fail-fast: false
+      max-parallel: 1
+      matrix:
+        include:
+          - os: ubuntu-22.04
+            cuda: "12.2"
+            gcc: 11
+    env:
+      CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
+      # CMake release downloaded and built from source by the "Install dependencies" step.
+      CMAKE_VERSION: "3.29.6"
+
+    steps:
+      - name: Install dependencies
+        run: |
+          sudo apt update
+          sudo apt install -y checkinstall zlib1g-dev libssl-dev
+          wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
+          tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
+          cd cmake-${{ env.CMAKE_VERSION }}
+          ./bootstrap
+          make -j"$(nproc)"
+          sudo make install
+
+      - name: Checkout tfhe-rs repo with tags
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
+        with:
+          fetch-depth: 0
+
+      - name: Get benchmark details
+        run: |
+          {
+            echo "BENCH_DATE=$(date --iso-8601=seconds)";
+            echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
+            echo "COMMIT_HASH=$(git describe --tags --dirty)";
+          } >> "${GITHUB_ENV}"
+
+      - name: Set up home
+        # "Install rust" step require root user to have a HOME directory which is not set.
+        run: |
+          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
+
+      - name: Install rust
+        uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
+        with:
+          toolchain: nightly
+
+      - name: Export CUDA variables
+        if: ${{ !cancelled() }}
+        run: |
+          {
+            echo "CUDA_PATH=$CUDA_PATH";
+            echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH";
+            echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc";
+          } >> "${GITHUB_ENV}"
+          echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
+
+      # Specify the correct host compilers
+      - name: Export gcc and g++ variables
+        if: ${{ !cancelled() }}
+        run: |
+          {
+            echo "CC=/usr/bin/gcc-${{ matrix.gcc }}";
+            echo "CXX=/usr/bin/g++-${{ matrix.gcc }}";
+            echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}";
+          } >> "${GITHUB_ENV}"
+
+      - name: Checkout Slab repo
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
+        with:
+          repository: zama-ai/slab
+          path: slab
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}
+
+      - name: Should run benchmarks with all precisions
+        if: inputs.all_precisions
+        run: |
+          echo "FAST_BENCH=FALSE" >> "${GITHUB_ENV}"
+
+      - name: Should run fast subset benchmarks
+        if: inputs.fast_default
+        run: |
+          echo "BENCH_OP_FLAVOR=fast_default" >> "${GITHUB_ENV}"
+
+      - name: Run multi-bit benchmarks with AVX512
+        run: |
+          make bench_unsigned_integer_multi_bit_gpu
+
+      - name: Parse results
+        run: |
+          python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
+          --database tfhe_rs \
+          --hardware "p5.48xlarge" \
+          --backend gpu \
+          --project-version "${{ env.COMMIT_HASH }}" \
+          --branch "${{ github.ref_name }}" \
+          --commit-date "${{ env.COMMIT_DATE }}" \
+          --bench-date "${{ env.BENCH_DATE }}" \
+          --walk-subdirs \
+          --name-suffix avx512 \
+          --throughput
+
+      - name: Upload parsed results artifact
+        uses: actions/upload-artifact@0b2256b8c012f0828dc542b3febcab082c67f72b
+        with:
+          name: ${{ github.sha }}_integer
+          path: ${{ env.RESULTS_FILENAME }}
+
+      - name: Send data to Slab
+        shell: bash
+        run: |
+          echo "Computing HMac on results file"
+          SIGNATURE="$(slab/scripts/hmac_calculator.sh ${{ env.RESULTS_FILENAME }} '${{ secrets.JOB_SECRET }}')"
+          echo "Sending results to Slab..."
+          curl -v -k \
+          -H "Content-Type: application/json" \
+          -H "X-Slab-Repository: ${{ github.repository }}" \
+          -H "X-Slab-Command: store_data_v2" \
+          -H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
+          -d @${{ env.RESULTS_FILENAME }} \
+          ${{ secrets.SLAB_URL }}
+
+  # Post the benchmark job result to Slack when the needed jobs did not all succeed.
+  slack-notify:
+    name: Slack Notification
+    needs: [ setup-instance, cuda-integer-multi-bit-multi-gpu-p5-benchmarks ]
+    runs-on: ubuntu-latest
+    if: ${{ !success() && !cancelled() }}
+    continue-on-error: true
+    steps:
+      - name: Send message
+        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
+        env:
+          SLACK_COLOR: ${{ needs.cuda-integer-multi-bit-multi-gpu-p5-benchmarks.result }}
+          SLACK_MESSAGE: "Integer multi GPU multi-bit benchmarks finished with status: ${{ needs.cuda-integer-multi-bit-multi-gpu-p5-benchmarks.result }}. (${{ env.ACTION_RUN_URL }})"
+
+  # Stop the instance whenever setup-instance was not skipped, whatever the benchmark outcome.
+  teardown-instance:
+    name: Teardown instance (cuda-integer-multi-bit-multi-gpu-p5-benchmarks)
+    if: ${{ always() && needs.setup-instance.result != 'skipped' }}
+    needs: [ setup-instance, cuda-integer-multi-bit-multi-gpu-p5-benchmarks ]
+    runs-on: ubuntu-latest
+    steps:
+      - name: Stop instance
+        id: stop-instance
+        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
+        with:
+          mode: stop
+          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
+          slab-url: ${{ secrets.SLAB_BASE_URL }}
+          job-secret: ${{ secrets.JOB_SECRET }}
+          label: ${{ needs.setup-instance.outputs.runner-name }}
+
+      - name: Slack Notification
+        if: ${{ failure() }}
+        continue-on-error: true
+        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
+        env:
+          SLACK_COLOR: ${{ job.status }}
+          SLACK_MESSAGE: "Instance teardown (cuda-integer-multi-bit-multi-gpu-p5-benchmarks) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
diff --git a/.github/workflows/integer_multi_gpu_full_benchmark_p5.yml b/.github/workflows/integer_multi_gpu_full_benchmark_p5.yml
new file mode 100644
index 0000000000..3bedab3725
--- /dev/null
+++ b/.github/workflows/integer_multi_gpu_full_benchmark_p5.yml
@@ -0,0 +1,199 @@
+# Run all integer benchmarks on an instance with CUDA and return parsed results to Slab CI bot.
+name: AWS p5 integer multi GPU full benchmarks
+
+on:
+  workflow_dispatch:
+
+env:
+  CARGO_TERM_COLOR: always
+  RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
+  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
+  RUST_BACKTRACE: "full"
+  RUST_MIN_STACK: "8388608"
+  SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
+  SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
+  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
+  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+
+jobs:
+  # Start the instance through Slab; its label is exported as the runner name for the benchmark job.
+  setup-instance:
+    name: Setup instance (cuda-integer-full-multi-gpu-p5-benchmarks)
+    runs-on: ubuntu-latest
+    if: github.event_name != 'schedule' ||
+      (github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs')
+    outputs:
+      runner-name: ${{ steps.start-instance.outputs.label }}
+    steps:
+      - name: Start instance
+        id: start-instance
+        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
+        with:
+          mode: start
+          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
+          slab-url: ${{ secrets.SLAB_BASE_URL }}
+          job-secret: ${{ secrets.JOB_SECRET }}
+          backend: aws
+          profile: multi-h100-nvlink
+
+  # Run every command/op_flavor combination one at a time, parse the results and send them to Slab.
+  cuda-integer-full-multi-gpu-p5-benchmarks:
+    name: Execute multi GPU P5 integer benchmarks for all operations flavor
+    needs: setup-instance
+    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
+    timeout-minutes: 1440  # 24 hours
+    continue-on-error: true
+    strategy:
+      fail-fast: false
+      max-parallel: 1
+      matrix:
+        command: [integer, integer_multi_bit]
+        op_flavor: [default, unchecked]
+        # explicit include-based build matrix, of known valid options
+        include:
+          - os: ubuntu-22.04
+            cuda: "12.2"
+            gcc: 11
+    env:
+      CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
+      # CMake release downloaded and built from source by the "Install dependencies" step.
+      CMAKE_VERSION: "3.29.6"
+
+    steps:
+      - name: Install dependencies
+        run: |
+          sudo apt update
+          sudo apt install -y checkinstall zlib1g-dev libssl-dev
+          wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
+          tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
+          cd cmake-${{ env.CMAKE_VERSION }}
+          ./bootstrap
+          make -j"$(nproc)"
+          sudo make install
+
+      - name: Checkout tfhe-rs repo with tags
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
+        with:
+          fetch-depth: 0
+
+      - name: Get benchmark details
+        run: |
+          {
+            echo "BENCH_DATE=$(date --iso-8601=seconds)";
+            echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
+            echo "COMMIT_HASH=$(git describe --tags --dirty)";
+          } >> "${GITHUB_ENV}"
+
+      - name: Set up home
+        # "Install rust" step require root user to have a HOME directory which is not set.
+        run: |
+          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
+
+      - name: Install rust
+        uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
+        with:
+          toolchain: nightly
+
+      - name: Export CUDA variables
+        if: ${{ !cancelled() }}
+        run: |
+          {
+            echo "CUDA_PATH=$CUDA_PATH";
+            echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH";
+            echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc";
+          } >> "${GITHUB_ENV}"
+          echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
+
+      # Specify the correct host compilers
+      - name: Export gcc and g++ variables
+        if: ${{ !cancelled() }}
+        run: |
+          {
+            echo "CC=/usr/bin/gcc-${{ matrix.gcc }}";
+            echo "CXX=/usr/bin/g++-${{ matrix.gcc }}";
+            echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}";
+          } >> "${GITHUB_ENV}"
+
+      - name: Checkout Slab repo
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
+        with:
+          repository: zama-ai/slab
+          path: slab
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}
+
+      - name: Run benchmarks with AVX512
+        run: |
+          make BENCH_OP_FLAVOR=${{ matrix.op_flavor }} bench_${{ matrix.command }}_gpu
+
+      - name: Parse results
+        run: |
+          python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
+          --database tfhe_rs \
+          --hardware "p5.48xlarge" \
+          --backend gpu \
+          --project-version "${{ env.COMMIT_HASH }}" \
+          --branch "${{ github.ref_name }}" \
+          --commit-date "${{ env.COMMIT_DATE }}" \
+          --bench-date "${{ env.BENCH_DATE }}" \
+          --walk-subdirs \
+          --name-suffix avx512 \
+          --throughput
+
+      - name: Upload parsed results artifact
+        uses: actions/upload-artifact@0b2256b8c012f0828dc542b3febcab082c67f72b
+        with:
+          name: ${{ github.sha }}_${{ matrix.command }}_${{ matrix.op_flavor }}
+          path: ${{ env.RESULTS_FILENAME }}
+
+      - name: Send data to Slab
+        shell: bash
+        run: |
+          echo "Computing HMac on results file"
+          SIGNATURE="$(slab/scripts/hmac_calculator.sh ${{ env.RESULTS_FILENAME }} '${{ secrets.JOB_SECRET }}')"
+          echo "Sending results to Slab..."
+          curl -v -k \
+          -H "Content-Type: application/json" \
+          -H "X-Slab-Repository: ${{ github.repository }}" \
+          -H "X-Slab-Command: store_data_v2" \
+          -H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
+          -d @${{ env.RESULTS_FILENAME }} \
+          ${{ secrets.SLAB_URL }}
+
+  # Post the benchmark job result to Slack when the needed jobs did not all succeed.
+  slack-notify:
+    name: Slack Notification
+    needs: [ setup-instance, cuda-integer-full-multi-gpu-p5-benchmarks ]
+    runs-on: ubuntu-latest
+    if: ${{ !success() && !cancelled() }}
+    continue-on-error: true
+    steps:
+      - name: Send message
+        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
+        env:
+          SLACK_COLOR: ${{ needs.cuda-integer-full-multi-gpu-p5-benchmarks.result }}
+          SLACK_MESSAGE: "Integer GPU full benchmarks finished with status: ${{ needs.cuda-integer-full-multi-gpu-p5-benchmarks.result }}. (${{ env.ACTION_RUN_URL }})"
+
+  # Stop the instance whenever setup-instance was not skipped, whatever the benchmark outcome.
+  teardown-instance:
+    name: Teardown instance (cuda-integer-full-multi-gpu-p5-benchmarks)
+    if: ${{ always() && needs.setup-instance.result != 'skipped' }}
+    needs: [ setup-instance, cuda-integer-full-multi-gpu-p5-benchmarks ]
+    runs-on: ubuntu-latest
+    steps:
+      - name: Stop instance
+        id: stop-instance
+        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
+        with:
+          mode: stop
+          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
+          slab-url: ${{ secrets.SLAB_BASE_URL }}
+          job-secret: ${{ secrets.JOB_SECRET }}
+          label: ${{ needs.setup-instance.outputs.runner-name }}
+
+      - name: Slack Notification
+        if: ${{ failure() }}
+        continue-on-error: true
+        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
+        env:
+          SLACK_COLOR: ${{ job.status }}
+          SLACK_MESSAGE: "Instance teardown (cuda-integer-full-multi-gpu-p5-benchmarks) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
diff --git a/ci/slab.toml b/ci/slab.toml
index bbf2f11dc9..4660df8e69 100644
--- a/ci/slab.toml
+++ b/ci/slab.toml
@@ -38,6 +38,13 @@ instance_type = "p3.2xlarge"
 spawn_retry_attempts = 120
 spawn_retry_duration = 60
 
+[backend.aws.multi-h100-nvlink]
+region = "us-east-1"
+image_id = "ami-08f0d5468c82263f2"
+instance_type = "p5.48xlarge"
+spawn_retry_attempts = 240
+spawn_retry_duration = 60
+
 [backend.hyperstack.single-h100]
 environment_name = "canada"
 image_name = "Ubuntu Server 22.04 LTS R535 CUDA 12.2"