From cc4721a9bc00a32a46b752d5f11102b880f797bd Mon Sep 17 00:00:00 2001
From: Agnes Leroy <agnes.leroy@zama.ai>
Date: Thu, 1 Aug 2024 10:35:43 +0200
Subject: [PATCH] chore(gpu): fix lwe chunk size argument

---
 .../cuda/include/programmable_bootstrap_multibit.h | 14 +++++++-------
 .../cuda/src/pbs/programmable_bootstrap.cuh        |  4 ++--
 .../src/pbs/programmable_bootstrap_cg_multibit.cuh |  4 ++--
 .../src/pbs/programmable_bootstrap_multibit.cuh    |  4 ++--
 .../pbs/programmable_bootstrap_tbc_multibit.cuh    |  4 ++--
 .../tests/test_multibit_pbs.cpp                    |  2 +-
 6 files changed, 16 insertions(+), 16 deletions(-)
diff --git a/backends/tfhe-cuda-backend/cuda/include/programmable_bootstrap_multibit.h b/backends/tfhe-cuda-backend/cuda/include/programmable_bootstrap_multibit.h
index ce1617872c..92378ed8e0 100644
--- a/backends/tfhe-cuda-backend/cuda/include/programmable_bootstrap_multibit.h
+++ b/backends/tfhe-cuda-backend/cuda/include/programmable_bootstrap_multibit.h
@@ -20,7 +20,7 @@ void scratch_cuda_multi_bit_programmable_bootstrap_64(
     uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
     uint32_t level_count, uint32_t grouping_factor,
     uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory,
-    uint32_t chunk_size = 0);
+    uint32_t chunk_size);
 
 void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64(
     void *stream, uint32_t gpu_index, void *lwe_array_out,
@@ -28,7 +28,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64(
     void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key,
     int8_t *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
     uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log,
-    uint32_t level_count, uint32_t num_samples, uint32_t lwe_chunk_size = 0);
+    uint32_t level_count, uint32_t num_samples, uint32_t lwe_chunk_size);
 
 void cleanup_cuda_multi_bit_programmable_bootstrap(void *stream,
                                                    uint32_t gpu_index,
@@ -71,14 +71,14 @@ void scratch_cuda_cg_multi_bit_programmable_bootstrap(
     uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
     uint32_t level_count, uint32_t grouping_factor,
     uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory,
-    uint32_t lwe_chunk_size = 0);
+    uint32_t lwe_chunk_size);
 
 template <typename Torus>
 void scratch_cuda_cg_multi_bit_programmable_bootstrap(
     void *stream, uint32_t gpu_index, pbs_buffer<Torus, MULTI_BIT> **pbs_buffer,
     uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
     uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory,
-    uint32_t lwe_chunk_size = 0);
+    uint32_t lwe_chunk_size);
 
 template <typename Torus>
 void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
@@ -88,7 +88,7 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
     pbs_buffer<Torus, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
     uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
     uint32_t base_log, uint32_t level_count, uint32_t num_samples,
-    uint32_t lwe_chunk_size = 0);
+    uint32_t lwe_chunk_size);
 
 template <typename Torus>
 void scratch_cuda_multi_bit_programmable_bootstrap(
@@ -96,7 +96,7 @@ void scratch_cuda_multi_bit_programmable_bootstrap(
     uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
     uint32_t level_count, uint32_t grouping_factor,
     uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory,
-    uint32_t lwe_chunk_size = 0);
+    uint32_t lwe_chunk_size);
 
 template <typename Torus>
 void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
@@ -106,7 +106,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
     pbs_buffer<Torus, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
     uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
     uint32_t base_log, uint32_t level_count, uint32_t num_samples,
-    uint32_t lwe_chunk_size = 0);
+    uint32_t lwe_chunk_size);
 
 template <typename Torus>
 __host__ __device__ uint64_t
diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap.cuh
index f8f12d4403..2eaad68560 100644
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap.cuh
@@ -198,7 +198,7 @@ void execute_pbs_async(
             current_lwe_array_in, current_lwe_input_indexes,
             bootstrapping_keys[i], pbs_buffer[i], lwe_dimension, glwe_dimension,
             polynomial_size, grouping_factor, base_log, level_count,
-            num_inputs_on_gpu);
+            num_inputs_on_gpu, 0);
       }
       break;
     case CLASSICAL:
@@ -270,7 +270,7 @@ void execute_scratch_pbs(cudaStream_t stream, uint32_t gpu_index,
       scratch_cuda_multi_bit_programmable_bootstrap_64(
           stream, gpu_index, pbs_buffer, lwe_dimension, glwe_dimension,
           polynomial_size, level_count, grouping_factor,
-          input_lwe_ciphertext_count, allocate_gpu_memory);
+          input_lwe_ciphertext_count, allocate_gpu_memory, 0);
       break;
     case CLASSICAL:
       scratch_cuda_programmable_bootstrap_64(
diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_multibit.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_multibit.cuh
index 4f9e96c4f4..99b55242f8 100644
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_multibit.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_multibit.cuh
@@ -177,7 +177,7 @@ __host__ void scratch_cg_multi_bit_programmable_bootstrap(
     pbs_buffer<Torus, MULTI_BIT> **buffer, uint32_t glwe_dimension,
     uint32_t polynomial_size, uint32_t level_count,
     uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory,
-    uint32_t lwe_chunk_size = 0) {
+    uint32_t lwe_chunk_size) {
 
   uint64_t full_sm_keybundle =
       get_buffer_size_full_sm_multibit_programmable_bootstrap_keybundle<Torus>(
@@ -337,7 +337,7 @@ __host__ void host_cg_multi_bit_programmable_bootstrap(
     pbs_buffer<Torus, MULTI_BIT> *buffer, uint32_t glwe_dimension,
     uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
     uint32_t base_log, uint32_t level_count, uint32_t num_samples,
-    uint32_t lwe_chunk_size = 0) {
+    uint32_t lwe_chunk_size) {
 
   if (!lwe_chunk_size)
     lwe_chunk_size = get_lwe_chunk_size<Torus, params>(gpu_index, num_samples,
diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cuh
index f913280eb9..4378a102ca 100644
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cuh
@@ -376,7 +376,7 @@ __host__ void scratch_multi_bit_programmable_bootstrap(
     pbs_buffer<Torus, MULTI_BIT> **buffer, uint32_t lwe_dimension,
     uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
     uint32_t input_lwe_ciphertext_count, uint32_t grouping_factor,
-    bool allocate_gpu_memory, uint32_t lwe_chunk_size = 0) {
+    bool allocate_gpu_memory, uint32_t lwe_chunk_size) {
 
   int max_shared_memory = cuda_get_max_shared_memory(0);
   uint64_t full_sm_keybundle =
@@ -632,7 +632,7 @@ __host__ void host_multi_bit_programmable_bootstrap(
     pbs_buffer<Torus, MULTI_BIT> *buffer, uint32_t glwe_dimension,
     uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
     uint32_t base_log, uint32_t level_count, uint32_t num_samples,
-    uint32_t lwe_chunk_size = 0) {
+    uint32_t lwe_chunk_size) {
 
   // If a chunk size is not passed to this function, select one.
   if (!lwe_chunk_size)
diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_multibit.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_multibit.cuh
index 7e13451fd1..0474957499 100644
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_multibit.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_multibit.cuh
@@ -172,7 +172,7 @@ __host__ void scratch_tbc_multi_bit_programmable_bootstrap(
     pbs_buffer<uint64_t, MULTI_BIT> **buffer, uint32_t lwe_dimension,
     uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
     uint32_t input_lwe_ciphertext_count, uint32_t grouping_factor,
-    bool allocate_gpu_memory, uint32_t lwe_chunk_size = 0) {
+    bool allocate_gpu_memory, uint32_t lwe_chunk_size) {
 
   bool supports_dsm =
       supports_distributed_shared_memory_on_multibit_programmable_bootstrap<
@@ -365,7 +365,7 @@ __host__ void host_tbc_multi_bit_programmable_bootstrap(
     pbs_buffer<Torus, MULTI_BIT> *buffer, uint32_t glwe_dimension,
     uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
     uint32_t base_log, uint32_t level_count, uint32_t num_samples,
-    uint32_t lwe_chunk_size = 0) {
+    uint32_t lwe_chunk_size) {
   cudaSetDevice(gpu_index);
 
   if (!lwe_chunk_size)
diff --git a/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/tests/test_multibit_pbs.cpp b/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/tests/test_multibit_pbs.cpp
index 82e3bbb193..335e6bdeda 100644
--- a/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/tests/test_multibit_pbs.cpp
+++ b/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/tests/test_multibit_pbs.cpp
@@ -93,7 +93,7 @@ class MultiBitProgrammableBootstrapTestPrimitives_u64
 
     scratch_cuda_multi_bit_programmable_bootstrap_64(
         stream, gpu_index, &pbs_buffer, lwe_dimension, glwe_dimension,
-        polynomial_size, pbs_level, grouping_factor, number_of_inputs, true);
+        polynomial_size, pbs_level, grouping_factor, number_of_inputs, true, 0);
 
     lwe_ct_out_array =
         (uint64_t *)malloc((glwe_dimension * polynomial_size + 1) *