From ba5e71718362e536af53e0ffa3811344ab316b82 Mon Sep 17 00:00:00 2001
From: Agnes Leroy
Date: Tue, 5 Nov 2024 12:08:50 +0100
Subject: [PATCH] chore(gpu): add workflows for erc20 with 2 and 8 H100

---
 .../workflows/benchmark_gpu_erc20_2h100.yml   | 199 ++++++++++++++++++
 .../workflows/benchmark_gpu_erc20_8h100.yml   | 199 ++++++++++++++++++
 2 files changed, 398 insertions(+)
 create mode 100644 .github/workflows/benchmark_gpu_erc20_2h100.yml
 create mode 100644 .github/workflows/benchmark_gpu_erc20_8h100.yml

diff --git a/.github/workflows/benchmark_gpu_erc20_2h100.yml b/.github/workflows/benchmark_gpu_erc20_2h100.yml
new file mode 100644
index 0000000000..4bb01fc8ae
--- /dev/null
+++ b/.github/workflows/benchmark_gpu_erc20_2h100.yml
@@ -0,0 +1,199 @@
+# Run ERC20 benchmarks on an instance with CUDA and return parsed results to Slab CI bot.
+name: ERC20 GPU 2xH100 benchmarks
+
+on:
+  workflow_dispatch:
+  schedule:
+    # Weekly benchmarks will be triggered each Saturday at 5a.m.
+    - cron: '0 5 * * 6'
+
+env:
+  CARGO_TERM_COLOR: always
+  RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
+  PARSE_INTEGER_BENCH_CSV_FILE: tfhe_rs_integer_benches_${{ github.sha }}.csv
+  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
+  RUST_BACKTRACE: "full"
+  RUST_MIN_STACK: "8388608"
+  SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
+  SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
+  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
+  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+
+jobs:
+  setup-instance:
+    name: Setup instance (cuda-erc20-benchmarks)
+    runs-on: ubuntu-latest
+    if: github.event_name == 'workflow_dispatch' ||
+      (github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs')
+    outputs:
+      runner-name: ${{ steps.start-instance.outputs.label }}
+    steps:
+      - name: Start instance
+        id: start-instance
+        uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
+        with:
+          mode: start
+          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
+          slab-url: ${{ secrets.SLAB_BASE_URL }}
+          job-secret: ${{ secrets.JOB_SECRET }}
+          backend: hyperstack
+          profile: 2-h100
+
+  cuda-erc20-benchmarks:
+    name: Execute GPU integer benchmarks
+    needs: setup-instance
+    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
+    strategy:
+      fail-fast: false
+      # explicit include-based build matrix, of known valid options
+      matrix:
+        include:
+          - os: ubuntu-22.04
+            cuda: "12.2"
+            gcc: 11
+    env:
+      CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
+      CMAKE_VERSION: 3.29.6
+    steps:
+      # Mandatory on hyperstack since a bootable volume is not re-usable yet.
+      - name: Install dependencies
+        run: |
+          sudo apt update
+          sudo apt install -y checkinstall zlib1g-dev libssl-dev
+          wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
+          tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
+          cd cmake-${{ env.CMAKE_VERSION }}
+          ./bootstrap
+          make -j"$(nproc)"
+          sudo make install
+
+      - name: Checkout tfhe-rs repo with tags
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
+        with:
+          fetch-depth: 0
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}
+
+      - name: Get benchmark details
+        run: |
+          {
+            echo "BENCH_DATE=$(date --iso-8601=seconds)";
+            echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
+            echo "COMMIT_HASH=$(git describe --tags --dirty)";
+          } >> "${GITHUB_ENV}"
+
+      - name: Set up home
+        # "Install rust" step requires root user to have a HOME directory which is not set.
+        run: |
+          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
+
+      - name: Install rust
+        uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
+        with:
+          toolchain: nightly
+
+      - name: Export CUDA variables
+        if: ${{ !cancelled() }}
+        run: |
+          {
+            echo "CUDA_PATH=$CUDA_PATH";
+            echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH";
+            echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc";
+          } >> "${GITHUB_ENV}"
+          echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
+
+      # Specify the correct host compilers
+      - name: Export gcc and g++ variables
+        if: ${{ !cancelled() }}
+        run: |
+          {
+            echo "CC=/usr/bin/gcc-${{ matrix.gcc }}";
+            echo "CXX=/usr/bin/g++-${{ matrix.gcc }}";
+            echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}";
+          } >> "${GITHUB_ENV}"
+
+      - name: Check device is detected
+        if: ${{ !cancelled() }}
+        run: nvidia-smi
+
+      - name: Run benchmarks
+        run: |
+          make bench_hlapi_erc20_gpu
+
+      - name: Parse results
+        # Hand the branch name over via the environment: expanding it inside the script
+        # text would let a crafted ref name run arbitrary shell commands.
+        env:
+          BRANCH_NAME: ${{ github.ref_name }}
+        run: |
+          python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
+            --database tfhe_rs \
+            --hardware "n3-H100x2" \
+            --backend gpu \
+            --project-version "${{ env.COMMIT_HASH }}" \
+            --branch "${BRANCH_NAME}" \
+            --commit-date "${{ env.COMMIT_DATE }}" \
+            --bench-date "${{ env.BENCH_DATE }}" \
+            --walk-subdirs \
+            --name-suffix avx512
+
+      - name: Parse PBS counts
+        run: |
+          python3 ./ci/benchmark_parser.py tfhe/erc20_pbs_count.csv ${{ env.RESULTS_FILENAME }} \
+            --object-sizes \
+            --append-results
+
+      - name: Upload parsed results artifact
+        uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882
+        with:
+          name: ${{ github.sha }}_erc20
+          path: ${{ env.RESULTS_FILENAME }}
+
+      - name: Checkout Slab repo
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
+        with:
+          repository: zama-ai/slab
+          path: slab
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}
+
+      - name: Send data to Slab
+        shell: bash
+        run: |
+          python3 slab/scripts/data_sender.py ${{ env.RESULTS_FILENAME }} "${{ secrets.JOB_SECRET }}" \
+            --slab-url "${{ secrets.SLAB_URL }}"
+
+  slack-notify:
+    name: Slack Notification
+    needs: [ setup-instance, cuda-erc20-benchmarks ]
+    runs-on: ubuntu-latest
+    if: ${{ always() && needs.cuda-erc20-benchmarks.result != 'skipped' && failure() }}
+    continue-on-error: true
+    steps:
+      - name: Send message
+        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
+        env:
+          SLACK_COLOR: ${{ needs.cuda-erc20-benchmarks.result }}
+          SLACK_MESSAGE: "ERC20 2xH100 benchmarks finished with status: ${{ needs.cuda-erc20-benchmarks.result }}. (${{ env.ACTION_RUN_URL }})"
+
+  teardown-instance:
+    name: Teardown instance (cuda-erc20-benchmarks)
+    if: ${{ always() && needs.setup-instance.result != 'skipped' }}
+    needs: [ setup-instance, cuda-erc20-benchmarks, slack-notify ]
+    runs-on: ubuntu-latest
+    steps:
+      - name: Stop instance
+        id: stop-instance
+        uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
+        with:
+          mode: stop
+          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
+          slab-url: ${{ secrets.SLAB_BASE_URL }}
+          job-secret: ${{ secrets.JOB_SECRET }}
+          label: ${{ needs.setup-instance.outputs.runner-name }}
+
+      - name: Slack Notification
+        if: ${{ failure() }}
+        continue-on-error: true
+        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
+        env:
+          SLACK_COLOR: ${{ job.status }}
+          SLACK_MESSAGE: "Instance teardown (cuda-erc20-benchmarks) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
diff --git a/.github/workflows/benchmark_gpu_erc20_8h100.yml b/.github/workflows/benchmark_gpu_erc20_8h100.yml
new file mode 100644
index 0000000000..2218218ec5
--- /dev/null
+++ b/.github/workflows/benchmark_gpu_erc20_8h100.yml
@@ -0,0 +1,199 @@
+# Run ERC20 benchmarks on an instance with CUDA and return parsed results to Slab CI bot.
+name: ERC20 GPU 8xH100 benchmarks
+
+on:
+  workflow_dispatch:
+  schedule:
+    # Weekly benchmarks will be triggered each Saturday at 5a.m.
+    - cron: '0 5 * * 6'
+
+env:
+  CARGO_TERM_COLOR: always
+  RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
+  PARSE_INTEGER_BENCH_CSV_FILE: tfhe_rs_integer_benches_${{ github.sha }}.csv
+  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
+  RUST_BACKTRACE: "full"
+  RUST_MIN_STACK: "8388608"
+  SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
+  SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
+  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
+  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+
+jobs:
+  setup-instance:
+    name: Setup instance (cuda-erc20-benchmarks)
+    runs-on: ubuntu-latest
+    if: github.event_name == 'workflow_dispatch' ||
+      (github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs')
+    outputs:
+      runner-name: ${{ steps.start-instance.outputs.label }}
+    steps:
+      - name: Start instance
+        id: start-instance
+        uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
+        with:
+          mode: start
+          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
+          slab-url: ${{ secrets.SLAB_BASE_URL }}
+          job-secret: ${{ secrets.JOB_SECRET }}
+          backend: hyperstack
+          profile: multi-h100
+
+  cuda-erc20-benchmarks:
+    name: Execute GPU integer benchmarks
+    needs: setup-instance
+    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
+    strategy:
+      fail-fast: false
+      # explicit include-based build matrix, of known valid options
+      matrix:
+        include:
+          - os: ubuntu-22.04
+            cuda: "12.2"
+            gcc: 11
+    env:
+      CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
+      CMAKE_VERSION: 3.29.6
+    steps:
+      # Mandatory on hyperstack since a bootable volume is not re-usable yet.
+      - name: Install dependencies
+        run: |
+          sudo apt update
+          sudo apt install -y checkinstall zlib1g-dev libssl-dev
+          wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
+          tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
+          cd cmake-${{ env.CMAKE_VERSION }}
+          ./bootstrap
+          make -j"$(nproc)"
+          sudo make install
+
+      - name: Checkout tfhe-rs repo with tags
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
+        with:
+          fetch-depth: 0
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}
+
+      - name: Get benchmark details
+        run: |
+          {
+            echo "BENCH_DATE=$(date --iso-8601=seconds)";
+            echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
+            echo "COMMIT_HASH=$(git describe --tags --dirty)";
+          } >> "${GITHUB_ENV}"
+
+      - name: Set up home
+        # "Install rust" step requires root user to have a HOME directory which is not set.
+        run: |
+          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
+
+      - name: Install rust
+        uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
+        with:
+          toolchain: nightly
+
+      - name: Export CUDA variables
+        if: ${{ !cancelled() }}
+        run: |
+          {
+            echo "CUDA_PATH=$CUDA_PATH";
+            echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH";
+            echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc";
+          } >> "${GITHUB_ENV}"
+          echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
+
+      # Specify the correct host compilers
+      - name: Export gcc and g++ variables
+        if: ${{ !cancelled() }}
+        run: |
+          {
+            echo "CC=/usr/bin/gcc-${{ matrix.gcc }}";
+            echo "CXX=/usr/bin/g++-${{ matrix.gcc }}";
+            echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}";
+          } >> "${GITHUB_ENV}"
+
+      - name: Check device is detected
+        if: ${{ !cancelled() }}
+        run: nvidia-smi
+
+      - name: Run benchmarks
+        run: |
+          make bench_hlapi_erc20_gpu
+
+      - name: Parse results
+        # Hand the branch name over via the environment: expanding it inside the script
+        # text would let a crafted ref name run arbitrary shell commands.
+        env:
+          BRANCH_NAME: ${{ github.ref_name }}
+        run: |
+          python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
+            --database tfhe_rs \
+            --hardware "n3-H100x8" \
+            --backend gpu \
+            --project-version "${{ env.COMMIT_HASH }}" \
+            --branch "${BRANCH_NAME}" \
+            --commit-date "${{ env.COMMIT_DATE }}" \
+            --bench-date "${{ env.BENCH_DATE }}" \
+            --walk-subdirs \
+            --name-suffix avx512
+
+      - name: Parse PBS counts
+        run: |
+          python3 ./ci/benchmark_parser.py tfhe/erc20_pbs_count.csv ${{ env.RESULTS_FILENAME }} \
+            --object-sizes \
+            --append-results
+
+      - name: Upload parsed results artifact
+        uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882
+        with:
+          name: ${{ github.sha }}_erc20
+          path: ${{ env.RESULTS_FILENAME }}
+
+      - name: Checkout Slab repo
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
+        with:
+          repository: zama-ai/slab
+          path: slab
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}
+
+      - name: Send data to Slab
+        shell: bash
+        run: |
+          python3 slab/scripts/data_sender.py ${{ env.RESULTS_FILENAME }} "${{ secrets.JOB_SECRET }}" \
+            --slab-url "${{ secrets.SLAB_URL }}"
+
+  slack-notify:
+    name: Slack Notification
+    needs: [ setup-instance, cuda-erc20-benchmarks ]
+    runs-on: ubuntu-latest
+    if: ${{ always() && needs.cuda-erc20-benchmarks.result != 'skipped' && failure() }}
+    continue-on-error: true
+    steps:
+      - name: Send message
+        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
+        env:
+          SLACK_COLOR: ${{ needs.cuda-erc20-benchmarks.result }}
+          SLACK_MESSAGE: "ERC20 8xH100 benchmarks finished with status: ${{ needs.cuda-erc20-benchmarks.result }}. (${{ env.ACTION_RUN_URL }})"
+
+  teardown-instance:
+    name: Teardown instance (cuda-erc20-benchmarks)
+    if: ${{ always() && needs.setup-instance.result != 'skipped' }}
+    needs: [ setup-instance, cuda-erc20-benchmarks, slack-notify ]
+    runs-on: ubuntu-latest
+    steps:
+      - name: Stop instance
+        id: stop-instance
+        uses: zama-ai/slab-github-runner@801df0b8db5ea2b06128b7476c652f5ed5f193a8
+        with:
+          mode: stop
+          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
+          slab-url: ${{ secrets.SLAB_BASE_URL }}
+          job-secret: ${{ secrets.JOB_SECRET }}
+          label: ${{ needs.setup-instance.outputs.runner-name }}
+
+      - name: Slack Notification
+        if: ${{ failure() }}
+        continue-on-error: true
+        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
+        env:
+          SLACK_COLOR: ${{ job.status }}
+          SLACK_MESSAGE: "Instance teardown (cuda-erc20-benchmarks) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"