fix(gpu): fix multi-gpu error in division
agnesLeroy committed Aug 2, 2024
1 parent 5547d92 commit 16c19ea
Showing 3 changed files with 5 additions and 5 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/integer_multi_bit_multi_gpu_benchmark.yml
@@ -49,7 +49,7 @@ jobs:
       slab-url: ${{ secrets.SLAB_BASE_URL }}
       job-secret: ${{ secrets.JOB_SECRET }}
       backend: hyperstack
-      profile: multi-h100
+      profile: multi-h100-nvlink

   cuda-integer-multi-bit-multi-gpu-benchmarks:
     name: Execute multi GPU integer multi-bit benchmarks
@@ -153,7 +153,7 @@ jobs:
       run: |
         python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
         --database tfhe_rs \
-        --hardware "n3-H100x8" \
+        --hardware "n3-H100x8-NVLink" \
         --backend gpu \
         --project-version "${{ env.COMMIT_HASH }}" \
         --branch ${{ github.ref_name }} \
4 changes: 2 additions & 2 deletions .github/workflows/integer_multi_gpu_full_benchmark.yml
@@ -36,7 +36,7 @@ jobs:
       slab-url: ${{ secrets.SLAB_BASE_URL }}
       job-secret: ${{ secrets.JOB_SECRET }}
       backend: hyperstack
-      profile: multi-h100
+      profile: multi-h100-nvlink

   cuda-integer-full-multi-gpu-benchmarks:
     name: Execute multi GPU integer benchmarks for all operations flavor
@@ -133,7 +133,7 @@ jobs:
       run: |
         python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
         --database tfhe_rs \
-        --hardware "n3-H100x8" \
+        --hardware "n3-H100x8-NVLink" \
         --backend gpu \
         --project-version "${{ env.COMMIT_HASH }}" \
         --branch ${{ github.ref_name }} \
2 changes: 1 addition & 1 deletion backends/tfhe-cuda-backend/cuda/include/integer.h
@@ -2796,7 +2796,7 @@ template <typename Torus> struct int_div_rem_memory {
   int_div_rem_memory(cudaStream_t *streams, uint32_t *gpu_indexes,
                      uint32_t gpu_count, int_radix_params params,
                      uint32_t num_blocks, bool allocate_gpu_memory) {
-    active_gpu_count = get_active_gpu_count(num_blocks, gpu_count);
+    active_gpu_count = get_active_gpu_count(2 * num_blocks, gpu_count);

     this->params = params;
     shift_mem_1 = new int_logical_scalar_shift_buffer<Torus>(
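
Note on the 2 * num_blocks change: division produces both a quotient and a remainder, so the scratch buffers in int_div_rem_memory plausibly hold intermediate ciphertexts spanning up to twice the radix block count of the operands. Sizing the active GPU count from num_blocks alone would then under-count the GPUs that later kernels spread those wider intermediates across, which is the kind of multi-GPU mismatch this commit addresses. The sketch below illustrates that sizing effect only: get_active_gpu_count_sketch is a hypothetical stand-in, and the min-based rule is an assumption about the real get_active_gpu_count helper, not its actual implementation.

#include <algorithm>
#include <cstdint>

// Hypothetical stand-in for the backend's get_active_gpu_count helper:
// use at most one GPU per radix block to distribute, and never more GPUs
// than are physically available. The real helper may apply other heuristics.
static uint32_t get_active_gpu_count_sketch(uint32_t num_radix_blocks,
                                            uint32_t gpu_count) {
  return std::min(num_radix_blocks, gpu_count);
}

// Example with num_blocks = 4 on an 8-GPU node:
//   get_active_gpu_count_sketch(4, 8)     == 4   (old call: num_blocks)
//   get_active_gpu_count_sketch(2 * 4, 8) == 8   (new call: 2 * num_blocks)
// i.e. the division scratch data is sized for every GPU that the wider,
// 2 * num_blocks intermediate ciphertexts may actually be spread over.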
