chore(gpu): define higher values for the sm size based on compute capability
zama-ai · Aug 6, 2024 · d69dd20 · d69dd20
1 parent 80fe45f
commit d69dd20
Show file tree

Hide file tree

Showing 6 changed files with 31 additions and 12 deletions.
diff --git a/.github/workflows/hyperstack_tfhe_gpu_tests.yml b/.github/workflows/hyperstack_tfhe_gpu_tests.yml
@@ -144,20 +144,20 @@ jobs:
 
       - name: Run core crypto and internal CUDA backend tests
         run: |
-          make test_core_crypto_gpu
-          make test_cuda_backend
+          BIG_TESTS_INSTANCE=TRUE make test_core_crypto_gpu
+          BIG_TESTS_INSTANCE=TRUE make test_cuda_backend
 
       - name: Run user docs tests
         run: |
-          make test_user_doc_gpu
+          BIG_TESTS_INSTANCE=TRUE make test_user_doc_gpu
 
       - name: Test C API
         run: |
-          make test_c_api_gpu
+          BIG_TESTS_INSTANCE=TRUE make test_c_api_gpu
 
       - name: Run High Level API Tests
         run: |
-          make test_high_level_api_gpu
+          BIG_TESTS_INSTANCE=TRUE make test_high_level_api_gpu
 
   slack-notify:
     name: Slack Notification

diff --git a/.github/workflows/hyperstack_tfhe_signed_integer_gpu_tests.yml b/.github/workflows/hyperstack_tfhe_signed_integer_gpu_tests.yml
@@ -144,11 +144,11 @@ jobs:
 
       - name: Run signed integer tests
         run: |
-          make test_signed_integer_gpu_ci
+          BIG_TESTS_INSTANCE=TRUE make test_signed_integer_gpu_ci
 
       - name: Run signed integer multi-bit tests
         run: |
-          make test_signed_integer_multi_bit_gpu_ci
+          BIG_TESTS_INSTANCE=TRUE make test_signed_integer_multi_bit_gpu_ci
 
   slack-notify:
     name: Slack Notification

diff --git a/.github/workflows/hyperstack_tfhe_unsigned_integer_gpu_tests.yml b/.github/workflows/hyperstack_tfhe_unsigned_integer_gpu_tests.yml
@@ -144,11 +144,11 @@ jobs:
 
       - name: Run unsigned integer tests
         run: |
-          make test_unsigned_integer_gpu_ci
+          BIG_TESTS_INSTANCE=TRUE make test_unsigned_integer_gpu_ci
 
       - name: Run unsigned integer multi-bit tests
         run: |
-          make test_unsigned_integer_multi_bit_gpu_ci
+          BIG_TESTS_INSTANCE=TRUE make test_unsigned_integer_multi_bit_gpu_ci
 
   slack-notify:
     name: Slack Notification

diff --git a/backends/tfhe-cuda-backend/cuda/src/device.cu b/backends/tfhe-cuda-backend/cuda/src/device.cu
@@ -247,5 +247,14 @@ int cuda_get_max_shared_memory(uint32_t gpu_index) {
   cudaDeviceGetAttribute(&max_shared_memory, cudaDevAttrMaxSharedMemoryPerBlock,
                          gpu_index);
   check_cuda_error(cudaGetLastError());
+#if CUDA_ARCH == 900
+  max_shared_memory = 226000;
+#elif CUDA_ARCH == 890
+  max_shared_memory = 127000;
+#elif CUDA_ARCH == 800
+  max_shared_memory = 163000;
+#elif CUDA_ARCH == 700
+  max_shared_memory = 95000;
+#endif
   return max_shared_memory;
 }
diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh
@@ -234,7 +234,12 @@ __host__ void host_integer_sum_ciphertexts_vec_kb(
   int32_t h_smart_copy_in[r * num_blocks];
   int32_t h_smart_copy_out[r * num_blocks];
 
-  auto max_shared_memory = cuda_get_max_shared_memory(gpu_indexes[0]);
+  /// Here it is important to query the default max shared memory on device 0
+  /// instead of cuda_get_max_shared_memory,
+  /// to avoid bugs with tree_add_chunks trying to use too much shared memory
+  int max_shared_memory = 0;
+  check_cuda_error(cudaDeviceGetAttribute(
+      &max_shared_memory, cudaDevAttrMaxSharedMemoryPerBlock, 0));
 
   // create lut object for message and carry
   // we allocate luts_message_carry in the host function (instead of scratch)

diff --git a/scripts/integer-tests.sh b/scripts/integer-tests.sh
@@ -129,8 +129,13 @@ fi
 
 # Override test-threads number to avoid Out-of-memory issues on GPU instances
 if [[ "${backend}" == "gpu" ]]; then
-    test_threads=5
-    doctest_threads=5
+    if [[ "${BIG_TESTS_INSTANCE}" == TRUE ]]; then
+        test_threads=5
+        doctest_threads=5
+    else
+        test_threads=3
+        doctest_threads=3
+    fi
 fi
 
 filter_expression=$(/usr/bin/python3 scripts/test_filtering.py --layer integer --backend "${backend}" ${fast_tests_argument} ${nightly_tests_argument} ${multi_bit_argument} ${sign_argument} ${no_big_params_argument})