From 16c19eaa4abbfcd7bb7c1e15b6763d0c57e99c12 Mon Sep 17 00:00:00 2001
From: Agnes Leroy <agnes.leroy@zama.ai>
Date: Thu, 1 Aug 2024 17:12:09 +0200
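Subject: [PATCH] fix(gpu): fix multi-gpu error in division

Compute active_gpu_count in int_div_rem_memory from 2 * num_blocks
instead of num_blocks.

Also switch the multi-GPU integer benchmark workflows to the
multi-h100-nvlink profile and report their results under the
"n3-H100x8-NVLink" hardware label.
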
---
 .github/workflows/integer_multi_bit_multi_gpu_benchmark.yml | 4 ++--
 .github/workflows/integer_multi_gpu_full_benchmark.yml      | 4 ++--
 backends/tfhe-cuda-backend/cuda/include/integer.h           | 2 +-
 3 files changed, 5 insertions(+), 5 deletions(-)
diff --git a/.github/workflows/integer_multi_bit_multi_gpu_benchmark.yml b/.github/workflows/integer_multi_bit_multi_gpu_benchmark.yml
index ad8f43177d..017e15d91f 100644
--- a/.github/workflows/integer_multi_bit_multi_gpu_benchmark.yml
+++ b/.github/workflows/integer_multi_bit_multi_gpu_benchmark.yml
@@ -49,7 +49,7 @@ jobs:
           slab-url: ${{ secrets.SLAB_BASE_URL }}
           job-secret: ${{ secrets.JOB_SECRET }}
           backend: hyperstack
-          profile: multi-h100
+          profile: multi-h100-nvlink
 
   cuda-integer-multi-bit-multi-gpu-benchmarks:
     name: Execute multi GPU integer multi-bit benchmarks
@@ -153,7 +153,7 @@ jobs:
         run: |
           python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
           --database tfhe_rs \
-          --hardware "n3-H100x8" \
+          --hardware "n3-H100x8-NVLink" \
           --backend gpu \
           --project-version "${{ env.COMMIT_HASH }}" \
           --branch ${{ github.ref_name }} \
diff --git a/.github/workflows/integer_multi_gpu_full_benchmark.yml b/.github/workflows/integer_multi_gpu_full_benchmark.yml
index 677da988fa..b19ed9fe5b 100644
--- a/.github/workflows/integer_multi_gpu_full_benchmark.yml
+++ b/.github/workflows/integer_multi_gpu_full_benchmark.yml
@@ -36,7 +36,7 @@ jobs:
           slab-url: ${{ secrets.SLAB_BASE_URL }}
           job-secret: ${{ secrets.JOB_SECRET }}
           backend: hyperstack
-          profile: multi-h100
+          profile: multi-h100-nvlink
 
   cuda-integer-full-multi-gpu-benchmarks:
     name: Execute multi GPU integer benchmarks for all operations flavor
@@ -133,7 +133,7 @@ jobs:
         run: |
           python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
           --database tfhe_rs \
-          --hardware "n3-H100x8" \
+          --hardware "n3-H100x8-NVLink" \
           --backend gpu \
           --project-version "${{ env.COMMIT_HASH }}" \
           --branch ${{ github.ref_name }} \
diff --git a/backends/tfhe-cuda-backend/cuda/include/integer.h b/backends/tfhe-cuda-backend/cuda/include/integer.h
index 2fc75c10b9..265368ce27 100644
--- a/backends/tfhe-cuda-backend/cuda/include/integer.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer.h
@@ -2796,7 +2796,7 @@ template <typename Torus> struct int_div_rem_memory {
   int_div_rem_memory(cudaStream_t *streams, uint32_t *gpu_indexes,
                      uint32_t gpu_count, int_radix_params params,
                      uint32_t num_blocks, bool allocate_gpu_memory) {
-    active_gpu_count = get_active_gpu_count(num_blocks, gpu_count);
+    active_gpu_count = get_active_gpu_count(2 * num_blocks, gpu_count);
 
     this->params = params;
     shift_mem_1 = new int_logical_scalar_shift_buffer<Torus>(