From 256378f7b15ad62ea74ba7bd86a4b20dee10695e Mon Sep 17 00:00:00 2001
From: Agnes Leroy
Date: Thu, 3 Oct 2024 10:32:19 +0200
Subject: [PATCH] chore(gpu): increase max sm size again to gain performance
 on 3_3 params on H100

This reverts commit 1c0b6fbbd466b31e5594581218fcfa52355ec1c7.
---
 backends/tfhe-cuda-backend/cuda/src/device.cu |  9 ++
 .../cuda/src/pbs/bootstrapping_key.cuh        | 98 +++++++++++++++++++
 .../pbs/programmable_bootstrap_amortized.cuh  | 22 +++++
 .../pbs/programmable_bootstrap_cg_classic.cuh | 24 +++++
 .../programmable_bootstrap_cg_multibit.cuh    | 63 ++++++++++++
 .../pbs/programmable_bootstrap_classic.cuh    | 49 ++++++++++
 .../pbs/programmable_bootstrap_multibit.cuh   | 91 +++++++++++++++++
 .../programmable_bootstrap_tbc_classic.cuh    | 43 ++++++++
 .../programmable_bootstrap_tbc_multibit.cuh   | 78 +++++++++++++++
 9 files changed, 477 insertions(+)

diff --git a/backends/tfhe-cuda-backend/cuda/src/device.cu b/backends/tfhe-cuda-backend/cuda/src/device.cu
index 9a880ad3cd..afad0b21fb 100644
--- a/backends/tfhe-cuda-backend/cuda/src/device.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/device.cu
@@ -271,5 +271,14 @@ int cuda_get_max_shared_memory(uint32_t gpu_index) {
   cudaDeviceGetAttribute(&max_shared_memory,
                          cudaDevAttrMaxSharedMemoryPerBlock, gpu_index);
   check_cuda_error(cudaGetLastError());
+#if CUDA_ARCH == 900
+  max_shared_memory = 226000;
+#elif CUDA_ARCH == 890
+  max_shared_memory = 127000;
+#elif CUDA_ARCH == 800
+  max_shared_memory = 163000;
+#elif CUDA_ARCH == 700
+  max_shared_memory = 95000;
+#endif
   return max_shared_memory;
 }
diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/bootstrapping_key.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/bootstrapping_key.cuh
index 7c0a7d6241..13f878f055 100644
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/bootstrapping_key.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/bootstrapping_key.cuh
@@ -115,6 +115,12 @@ void cuda_convert_lwe_programmable_bootstrap_key(cudaStream_t stream,
   switch (polynomial_size) {
   case 256:
     if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
+      check_cuda_error(cudaFuncSetAttribute(
+          batch_NSMFFT<FFTDegree<AmortizedDegree<256>, ForwardFFT>, FULLSM>,
+          cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
+      check_cuda_error(cudaFuncSetCacheConfig(
+          batch_NSMFFT<FFTDegree<AmortizedDegree<256>, ForwardFFT>, FULLSM>,
+          cudaFuncCachePreferShared));
       batch_NSMFFT<FFTDegree<AmortizedDegree<256>, ForwardFFT>, FULLSM>
           <<<gridSize, blockSize, shared_memory_size, stream>>>(d_bsk, dest,
                                                                 buffer);
@@ -127,6 +133,12 @@ void cuda_convert_lwe_programmable_bootstrap_key(cudaStream_t stream,
     break;
   case 512:
     if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
+      check_cuda_error(cudaFuncSetAttribute(
+          batch_NSMFFT<FFTDegree<AmortizedDegree<512>, ForwardFFT>, FULLSM>,
+          cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
+      check_cuda_error(cudaFuncSetCacheConfig(
+          batch_NSMFFT<FFTDegree<AmortizedDegree<512>, ForwardFFT>, FULLSM>,
+          cudaFuncCachePreferShared));
       batch_NSMFFT<FFTDegree<AmortizedDegree<512>, ForwardFFT>, FULLSM>
           <<<gridSize, blockSize, shared_memory_size, stream>>>(d_bsk, dest,
                                                                 buffer);
@@ -139,6 +151,12 @@ void cuda_convert_lwe_programmable_bootstrap_key(cudaStream_t stream,
     break;
   case 1024:
     if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
+      check_cuda_error(cudaFuncSetAttribute(
+          batch_NSMFFT<FFTDegree<AmortizedDegree<1024>, ForwardFFT>, FULLSM>,
+          cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
+      check_cuda_error(cudaFuncSetCacheConfig(
+          batch_NSMFFT<FFTDegree<AmortizedDegree<1024>, ForwardFFT>, FULLSM>,
+          cudaFuncCachePreferShared));
       batch_NSMFFT<FFTDegree<AmortizedDegree<1024>, ForwardFFT>, FULLSM>
           <<<gridSize, blockSize, shared_memory_size, stream>>>(d_bsk, dest,
                                                                 buffer);
@@ -151,6 +169,12 @@ void cuda_convert_lwe_programmable_bootstrap_key(cudaStream_t stream,
     break;
   case 2048:
     if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
+      check_cuda_error(cudaFuncSetAttribute(
+          batch_NSMFFT<FFTDegree<AmortizedDegree<2048>, ForwardFFT>, FULLSM>,
+          cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
+      check_cuda_error(cudaFuncSetCacheConfig(
+          batch_NSMFFT<FFTDegree<AmortizedDegree<2048>, ForwardFFT>, FULLSM>,
+          cudaFuncCachePreferShared));
       batch_NSMFFT<FFTDegree<AmortizedDegree<2048>, ForwardFFT>, FULLSM>
           <<<gridSize, blockSize, shared_memory_size, stream>>>(d_bsk, dest,
                                                                 buffer);
@@ -163,6 +187,12 @@ void cuda_convert_lwe_programmable_bootstrap_key(cudaStream_t stream,
     break;
   case 4096:
     if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
+      check_cuda_error(cudaFuncSetAttribute(
+          batch_NSMFFT<FFTDegree<AmortizedDegree<4096>, ForwardFFT>, FULLSM>,
+          cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
+      check_cuda_error(cudaFuncSetCacheConfig(
+          batch_NSMFFT<FFTDegree<AmortizedDegree<4096>, ForwardFFT>, FULLSM>,
+          cudaFuncCachePreferShared));
       batch_NSMFFT<FFTDegree<AmortizedDegree<4096>, ForwardFFT>, FULLSM>
           <<<gridSize, blockSize, shared_memory_size, stream>>>(d_bsk, dest,
                                                                 buffer);
@@ -175,6 +205,12 @@ void cuda_convert_lwe_programmable_bootstrap_key(cudaStream_t stream,
     break;
   case 8192:
     if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
+      check_cuda_error(cudaFuncSetAttribute(
+          batch_NSMFFT<FFTDegree<AmortizedDegree<8192>, ForwardFFT>, FULLSM>,
+          cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
+      check_cuda_error(cudaFuncSetCacheConfig(
+          batch_NSMFFT<FFTDegree<AmortizedDegree<8192>, ForwardFFT>, FULLSM>,
+          cudaFuncCachePreferShared));
       batch_NSMFFT<FFTDegree<AmortizedDegree<8192>, ForwardFFT>, FULLSM>
           <<<gridSize, blockSize, shared_memory_size, stream>>>(d_bsk, dest,
                                                                 buffer);
@@ -187,6 +223,12 @@ void cuda_convert_lwe_programmable_bootstrap_key(cudaStream_t stream,
     break;
   case 16384:
     if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
+      check_cuda_error(cudaFuncSetAttribute(
+          batch_NSMFFT<FFTDegree<AmortizedDegree<16384>, ForwardFFT>, FULLSM>,
+          cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
+      check_cuda_error(cudaFuncSetCacheConfig(
+          batch_NSMFFT<FFTDegree<AmortizedDegree<16384>, ForwardFFT>, FULLSM>,
+          cudaFuncCachePreferShared));
       batch_NSMFFT<FFTDegree<AmortizedDegree<16384>, ForwardFFT>, FULLSM>
           <<<gridSize, blockSize, shared_memory_size, stream>>>(d_bsk, dest,
                                                                 buffer);
@@ -227,6 +269,14 @@ void cuda_fourier_polynomial_mul(cudaStream_t stream, uint32_t gpu_index,
   case 256:
     if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
       buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
+      check_cuda_error(cudaFuncSetAttribute(
+          batch_polynomial_mul<FFTDegree<AmortizedDegree<256>, ForwardFFT>,
+                               FULLSM>,
+          cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
+      check_cuda_error(cudaFuncSetCacheConfig(
+          batch_polynomial_mul<FFTDegree<AmortizedDegree<256>, ForwardFFT>,
+                               FULLSM>,
+          cudaFuncCachePreferShared));
       batch_polynomial_mul<FFTDegree<AmortizedDegree<256>, ForwardFFT>, FULLSM>
           <<<gridSize, blockSize, shared_memory_size, stream>>>(input1, input2,
                                                                 output, buffer);
@@ -240,6 +290,14 @@ void cuda_fourier_polynomial_mul(cudaStream_t stream, uint32_t gpu_index,
   case 512:
     if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
       buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
+      check_cuda_error(cudaFuncSetAttribute(
+          batch_polynomial_mul<FFTDegree<AmortizedDegree<512>, ForwardFFT>,
+                               FULLSM>,
+          cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
+      check_cuda_error(cudaFuncSetCacheConfig(
+          batch_polynomial_mul<FFTDegree<AmortizedDegree<512>, ForwardFFT>,
+                               FULLSM>,
+          cudaFuncCachePreferShared));
       batch_polynomial_mul<FFTDegree<AmortizedDegree<512>, ForwardFFT>, FULLSM>
           <<<gridSize, blockSize, shared_memory_size, stream>>>(input1, input2,
                                                                 output, buffer);
@@ -253,6 +311,14 @@ void cuda_fourier_polynomial_mul(cudaStream_t stream, uint32_t gpu_index,
   case 1024:
     if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
       buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
+      check_cuda_error(cudaFuncSetAttribute(
+          batch_polynomial_mul<FFTDegree<AmortizedDegree<1024>, ForwardFFT>,
+                               FULLSM>,
+          cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
+      check_cuda_error(cudaFuncSetCacheConfig(
+          batch_polynomial_mul<FFTDegree<AmortizedDegree<1024>, ForwardFFT>,
+                               FULLSM>,
+          cudaFuncCachePreferShared));
       batch_polynomial_mul<FFTDegree<AmortizedDegree<1024>, ForwardFFT>, FULLSM>
           <<<gridSize, blockSize, shared_memory_size, stream>>>(input1, input2,
                                                                 output, buffer);
@@ -266,6 +332,14 @@ void cuda_fourier_polynomial_mul(cudaStream_t stream, uint32_t gpu_index,
   case 2048:
     if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
       buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
+      check_cuda_error(cudaFuncSetAttribute(
+          batch_polynomial_mul<FFTDegree<AmortizedDegree<2048>, ForwardFFT>,
+                               FULLSM>,
+          cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
+      check_cuda_error(cudaFuncSetCacheConfig(
+          batch_polynomial_mul<FFTDegree<AmortizedDegree<2048>, ForwardFFT>,
+                               FULLSM>,
+          cudaFuncCachePreferShared));
       batch_polynomial_mul<FFTDegree<AmortizedDegree<2048>, ForwardFFT>, FULLSM>
           <<<gridSize, blockSize, shared_memory_size, stream>>>(input1, input2,
                                                                 output, buffer);
@@ -279,6 +353,14 @@ void cuda_fourier_polynomial_mul(cudaStream_t stream, uint32_t gpu_index,
   case 4096:
     if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
       buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
+      check_cuda_error(cudaFuncSetAttribute(
+          batch_polynomial_mul<FFTDegree<AmortizedDegree<4096>, ForwardFFT>,
+                               FULLSM>,
+          cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
+      check_cuda_error(cudaFuncSetCacheConfig(
+          batch_polynomial_mul<FFTDegree<AmortizedDegree<4096>, ForwardFFT>,
+                               FULLSM>,
+          cudaFuncCachePreferShared));
       batch_polynomial_mul<FFTDegree<AmortizedDegree<4096>, ForwardFFT>, FULLSM>
           <<<gridSize, blockSize, shared_memory_size, stream>>>(input1, input2,
                                                                 output, buffer);
@@ -292,6 +374,14 @@ void cuda_fourier_polynomial_mul(cudaStream_t stream, uint32_t gpu_index,
   case 8192:
     if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
       buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
+      check_cuda_error(cudaFuncSetAttribute(
+          batch_polynomial_mul<FFTDegree<AmortizedDegree<8192>, ForwardFFT>,
+                               FULLSM>,
+          cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
+      check_cuda_error(cudaFuncSetCacheConfig(
+          batch_polynomial_mul<FFTDegree<AmortizedDegree<8192>, ForwardFFT>,
+                               FULLSM>,
+          cudaFuncCachePreferShared));
       batch_polynomial_mul<FFTDegree<AmortizedDegree<8192>, ForwardFFT>, FULLSM>
           <<<gridSize, blockSize, shared_memory_size, stream>>>(input1, input2,
                                                                 output, buffer);
@@ -305,6 +395,14 @@ void cuda_fourier_polynomial_mul(cudaStream_t stream, uint32_t gpu_index,
   case 16384:
     if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
       buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
+      check_cuda_error(cudaFuncSetAttribute(
+          batch_polynomial_mul<FFTDegree<AmortizedDegree<16384>, ForwardFFT>,
+                               FULLSM>,
+          cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
+      check_cuda_error(cudaFuncSetCacheConfig(
+          batch_polynomial_mul<FFTDegree<AmortizedDegree<16384>, ForwardFFT>,
+                               FULLSM>,
+          cudaFuncCachePreferShared));
       batch_polynomial_mul<FFTDegree<AmortizedDegree<16384>, ForwardFFT>, FULLSM>
           <<<gridSize, blockSize, shared_memory_size, stream>>>(input1, input2,
                                                                 output, buffer);
diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_amortized.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_amortized.cuh
index 250f147988..6a60a0f6d1 100644
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_amortized.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_amortized.cuh
@@ -258,6 +258,28 @@ __host__ void scratch_programmable_bootstrap_amortized(
     uint32_t glwe_dimension, uint32_t polynomial_size,
     uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) {
+  uint64_t full_sm =
+      get_buffer_size_full_sm_programmable_bootstrap_amortized<Torus>(
+          polynomial_size, glwe_dimension);
+  uint64_t partial_sm =
+      get_buffer_size_partial_sm_programmable_bootstrap_amortized<Torus>(
+          polynomial_size);
+  int max_shared_memory = cuda_get_max_shared_memory(0);
+  if (max_shared_memory >= partial_sm && max_shared_memory < full_sm) {
+    check_cuda_error(cudaFuncSetAttribute(
+        device_programmable_bootstrap_amortized<Torus, params, PARTIALSM>,
+        cudaFuncAttributeMaxDynamicSharedMemorySize, partial_sm));
+    check_cuda_error(cudaFuncSetCacheConfig(
+        device_programmable_bootstrap_amortized<Torus, params, PARTIALSM>,
+        cudaFuncCachePreferShared));
+  } else if (max_shared_memory >= partial_sm) {
+    check_cuda_error(cudaFuncSetAttribute(
+        device_programmable_bootstrap_amortized<Torus, params, FULLSM>,
+        cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm));
+    check_cuda_error(cudaFuncSetCacheConfig(
+        device_programmable_bootstrap_amortized<Torus, params, FULLSM>,
+        cudaFuncCachePreferShared));
+  }
 
   if (allocate_gpu_memory) {
     uint64_t buffer_size = get_buffer_size_programmable_bootstrap_amortized<Torus>(
diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_classic.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_classic.cuh
index 668cf2e0ae..04ff5348c5 100644
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_classic.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_classic.cuh
@@ -194,6 +194,30 @@ __host__ void scratch_programmable_bootstrap_cg(
     uint32_t polynomial_size, uint32_t level_count,
     uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) {
+  uint64_t full_sm =
+      get_buffer_size_full_sm_programmable_bootstrap_cg<Torus>(polynomial_size);
+  uint64_t partial_sm =
+      get_buffer_size_partial_sm_programmable_bootstrap_cg<Torus>(
+          polynomial_size);
+  int max_shared_memory = cuda_get_max_shared_memory(0);
+  if (max_shared_memory >= partial_sm && max_shared_memory < full_sm) {
+    check_cuda_error(cudaFuncSetAttribute(
+        device_programmable_bootstrap_cg<Torus, params, PARTIALSM>,
+        cudaFuncAttributeMaxDynamicSharedMemorySize, partial_sm));
+    cudaFuncSetCacheConfig(
+        device_programmable_bootstrap_cg<Torus, params, PARTIALSM>,
+        cudaFuncCachePreferShared);
+    check_cuda_error(cudaGetLastError());
+  } else if (max_shared_memory >= partial_sm) {
+    check_cuda_error(cudaFuncSetAttribute(
+        device_programmable_bootstrap_cg<Torus, params, FULLSM>,
+        cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm));
+    cudaFuncSetCacheConfig(
+        device_programmable_bootstrap_cg<Torus, params, FULLSM>,
+        cudaFuncCachePreferShared);
+    check_cuda_error(cudaGetLastError());
+  }
+
   *buffer = new pbs_buffer<Torus, CLASSICAL>(
       stream, gpu_index, glwe_dimension, polynomial_size, level_count,
       input_lwe_ciphertext_count, PBS_VARIANT::CG, allocate_gpu_memory);
diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_multibit.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_multibit.cuh
index f26f45b810..29d8ee9590 100644
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_multibit.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_multibit.cuh
@@ -213,6 +213,69 @@ __host__ void scratch_cg_multi_bit_programmable_bootstrap(
     uint32_t polynomial_size, uint32_t level_count,
     uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) {
+  uint64_t full_sm_keybundle =
+      get_buffer_size_full_sm_multibit_programmable_bootstrap_keybundle<Torus>(
+          polynomial_size);
+  uint64_t full_sm_cg_accumulate =
+      get_buffer_size_full_sm_cg_multibit_programmable_bootstrap<Torus>(
+          polynomial_size);
+  uint64_t partial_sm_cg_accumulate =
+      get_buffer_size_partial_sm_cg_multibit_programmable_bootstrap<Torus>(
+          polynomial_size);
+
+  int max_shared_memory = cuda_get_max_shared_memory(0);
+  if (max_shared_memory < full_sm_keybundle) {
+    check_cuda_error(cudaFuncSetAttribute(
+        device_multi_bit_programmable_bootstrap_keybundle<Torus, params, NOSM>,
+        cudaFuncAttributeMaxDynamicSharedMemorySize, 0));
+    cudaFuncSetCacheConfig(
+        device_multi_bit_programmable_bootstrap_keybundle<Torus, params, NOSM>,
+        cudaFuncCachePreferShared);
+    check_cuda_error(cudaGetLastError());
+  } else {
+    check_cuda_error(cudaFuncSetAttribute(
+        device_multi_bit_programmable_bootstrap_keybundle<Torus, params,
+                                                          FULLSM>,
+        cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm_keybundle));
+    cudaFuncSetCacheConfig(
+        device_multi_bit_programmable_bootstrap_keybundle<Torus, params,
+                                                          FULLSM>,
+        cudaFuncCachePreferShared);
+    check_cuda_error(cudaGetLastError());
+  }
+
+  if (max_shared_memory < partial_sm_cg_accumulate) {
+    check_cuda_error(cudaFuncSetAttribute(
+        device_multi_bit_programmable_bootstrap_cg_accumulate<Torus, params,
+                                                              NOSM>,
+        cudaFuncAttributeMaxDynamicSharedMemorySize, 0));
+    cudaFuncSetCacheConfig(
+        device_multi_bit_programmable_bootstrap_cg_accumulate<Torus, params,
+                                                              NOSM>,
+        cudaFuncCachePreferShared);
+    check_cuda_error(cudaGetLastError());
+  } else if (max_shared_memory < full_sm_cg_accumulate) {
+    check_cuda_error(cudaFuncSetAttribute(
+        device_multi_bit_programmable_bootstrap_cg_accumulate<Torus, params,
+                                                              PARTIALSM>,
+        cudaFuncAttributeMaxDynamicSharedMemorySize, partial_sm_cg_accumulate));
+    cudaFuncSetCacheConfig(
+        device_multi_bit_programmable_bootstrap_cg_accumulate<Torus, params,
+                                                              PARTIALSM>,
+        cudaFuncCachePreferShared);
+    check_cuda_error(cudaGetLastError());
+  } else {
+    check_cuda_error(cudaFuncSetAttribute(
+        device_multi_bit_programmable_bootstrap_cg_accumulate<Torus, params,
+                                                              FULLSM>,
+        cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm_cg_accumulate));
+    cudaFuncSetCacheConfig(
+        device_multi_bit_programmable_bootstrap_cg_accumulate<Torus, params,
+                                                              FULLSM>,
+        cudaFuncCachePreferShared);
+    check_cuda_error(cudaGetLastError());
+  }
+
   auto lwe_chunk_size = get_lwe_chunk_size<Torus, params>(
       gpu_index, input_lwe_ciphertext_count, polynomial_size);
   *buffer = new pbs_buffer<Torus, MULTI_BIT>(
diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cuh
index d4966f1cdc..b9dfdf415c 100644
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cuh
@@ -309,6 +309,55 @@ __host__ void scratch_programmable_bootstrap(
     uint32_t polynomial_size, uint32_t level_count,
     uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) {
+  uint64_t full_sm_step_one =
+      get_buffer_size_full_sm_programmable_bootstrap_step_one<Torus>(
+          polynomial_size);
+  uint64_t full_sm_step_two =
+      get_buffer_size_full_sm_programmable_bootstrap_step_two<Torus>(
+          polynomial_size);
+  uint64_t partial_sm =
+      get_buffer_size_partial_sm_programmable_bootstrap<Torus>(polynomial_size);
+
+  int max_shared_memory = cuda_get_max_shared_memory(0);
+
+  // Configure step one
+  if (max_shared_memory >= partial_sm && max_shared_memory < full_sm_step_one) {
+    check_cuda_error(cudaFuncSetAttribute(
+        device_programmable_bootstrap_step_one<Torus, params, PARTIALSM>,
+        cudaFuncAttributeMaxDynamicSharedMemorySize, partial_sm));
+    cudaFuncSetCacheConfig(
+        device_programmable_bootstrap_step_one<Torus, params, PARTIALSM>,
+        cudaFuncCachePreferShared);
+    check_cuda_error(cudaGetLastError());
+  } else if (max_shared_memory >= partial_sm) {
+    check_cuda_error(cudaFuncSetAttribute(
+        device_programmable_bootstrap_step_one<Torus, params, FULLSM>,
+        cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm_step_one));
+    cudaFuncSetCacheConfig(
+        device_programmable_bootstrap_step_one<Torus, params, FULLSM>,
+        cudaFuncCachePreferShared);
+    check_cuda_error(cudaGetLastError());
+  }
+
+  // Configure step two
+  if (max_shared_memory >= partial_sm && max_shared_memory < full_sm_step_two) {
+    check_cuda_error(cudaFuncSetAttribute(
+        device_programmable_bootstrap_step_two<Torus, params, PARTIALSM>,
+        cudaFuncAttributeMaxDynamicSharedMemorySize, partial_sm));
+    cudaFuncSetCacheConfig(
+        device_programmable_bootstrap_step_two<Torus, params, PARTIALSM>,
+        cudaFuncCachePreferShared);
+    check_cuda_error(cudaGetLastError());
+  } else if (max_shared_memory >= partial_sm) {
+    check_cuda_error(cudaFuncSetAttribute(
+        device_programmable_bootstrap_step_two<Torus, params, FULLSM>,
+        cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm_step_two));
+    cudaFuncSetCacheConfig(
+        device_programmable_bootstrap_step_two<Torus, params, FULLSM>,
+        cudaFuncCachePreferShared);
+    check_cuda_error(cudaGetLastError());
+  }
+
   *buffer = new pbs_buffer<Torus, CLASSICAL>(
       stream, gpu_index, glwe_dimension, polynomial_size, level_count,
       input_lwe_ciphertext_count, PBS_VARIANT::DEFAULT, allocate_gpu_memory);
diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cuh
index 455233f057..0f0ddaf79d 100644
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cuh
@@ -389,6 +389,97 @@ __host__ void scratch_multi_bit_programmable_bootstrap(
     uint32_t polynomial_size, uint32_t level_count,
     uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) {
+  int max_shared_memory = cuda_get_max_shared_memory(0);
+  uint64_t full_sm_keybundle =
+      get_buffer_size_full_sm_multibit_programmable_bootstrap_keybundle<Torus>(
+          polynomial_size);
+  uint64_t full_sm_accumulate_step_one =
+      get_buffer_size_full_sm_multibit_programmable_bootstrap_step_one<Torus>(
+          polynomial_size);
+  uint64_t full_sm_accumulate_step_two =
+      get_buffer_size_full_sm_multibit_programmable_bootstrap_step_two<Torus>(
+          polynomial_size);
+  uint64_t partial_sm_accumulate_step_one =
+      get_buffer_size_partial_sm_multibit_programmable_bootstrap_step_one<
+          Torus>(polynomial_size);
+
+  if (max_shared_memory < full_sm_keybundle) {
+    check_cuda_error(cudaFuncSetAttribute(
+        device_multi_bit_programmable_bootstrap_keybundle<Torus, params, NOSM>,
+        cudaFuncAttributeMaxDynamicSharedMemorySize, 0));
+    cudaFuncSetCacheConfig(
+        device_multi_bit_programmable_bootstrap_keybundle<Torus, params, NOSM>,
+        cudaFuncCachePreferShared);
+    check_cuda_error(cudaGetLastError());
+  } else {
+    check_cuda_error(cudaFuncSetAttribute(
+        device_multi_bit_programmable_bootstrap_keybundle<Torus, params,
+                                                          FULLSM>,
+        cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm_keybundle));
+    cudaFuncSetCacheConfig(
+        device_multi_bit_programmable_bootstrap_keybundle<Torus, params,
+                                                          FULLSM>,
+        cudaFuncCachePreferShared);
+    check_cuda_error(cudaGetLastError());
+  }
+
+  if (max_shared_memory < partial_sm_accumulate_step_one) {
+    check_cuda_error(cudaFuncSetAttribute(
+        device_multi_bit_programmable_bootstrap_accumulate_step_one<
+            Torus, params, NOSM>,
+        cudaFuncAttributeMaxDynamicSharedMemorySize, 0));
+    cudaFuncSetCacheConfig(
+        device_multi_bit_programmable_bootstrap_accumulate_step_one<
+            Torus, params, NOSM>,
+        cudaFuncCachePreferShared);
+    check_cuda_error(cudaGetLastError());
+  } else if (max_shared_memory < full_sm_accumulate_step_one) {
+    check_cuda_error(cudaFuncSetAttribute(
+        device_multi_bit_programmable_bootstrap_accumulate_step_one<
+            Torus, params, PARTIALSM>,
+        cudaFuncAttributeMaxDynamicSharedMemorySize,
+        partial_sm_accumulate_step_one));
+    cudaFuncSetCacheConfig(
+        device_multi_bit_programmable_bootstrap_accumulate_step_one<
+            Torus, params, PARTIALSM>,
+        cudaFuncCachePreferShared);
+    check_cuda_error(cudaGetLastError());
+  } else {
+    check_cuda_error(cudaFuncSetAttribute(
+        device_multi_bit_programmable_bootstrap_accumulate_step_one<
+            Torus, params, FULLSM>,
+        cudaFuncAttributeMaxDynamicSharedMemorySize,
+        full_sm_accumulate_step_one));
+    cudaFuncSetCacheConfig(
+        device_multi_bit_programmable_bootstrap_accumulate_step_one<
+            Torus, params, FULLSM>,
+        cudaFuncCachePreferShared);
+    check_cuda_error(cudaGetLastError());
+  }
+
+  if (max_shared_memory < full_sm_accumulate_step_two) {
+    check_cuda_error(cudaFuncSetAttribute(
+        device_multi_bit_programmable_bootstrap_accumulate_step_two<
+            Torus, params, NOSM>,
+        cudaFuncAttributeMaxDynamicSharedMemorySize, 0));
+    cudaFuncSetCacheConfig(
+        device_multi_bit_programmable_bootstrap_accumulate_step_two<
+            Torus, params, NOSM>,
+        cudaFuncCachePreferShared);
+    check_cuda_error(cudaGetLastError());
+  } else {
+    check_cuda_error(cudaFuncSetAttribute(
+        device_multi_bit_programmable_bootstrap_accumulate_step_two<
+            Torus, params, FULLSM>,
+        cudaFuncAttributeMaxDynamicSharedMemorySize,
+        full_sm_accumulate_step_two));
+    cudaFuncSetCacheConfig(
+        device_multi_bit_programmable_bootstrap_accumulate_step_two<
+            Torus, params, FULLSM>,
+        cudaFuncCachePreferShared);
+    check_cuda_error(cudaGetLastError());
+  }
+
   auto lwe_chunk_size = get_lwe_chunk_size<Torus, params>(
       gpu_index, input_lwe_ciphertext_count, polynomial_size);
   *buffer = new pbs_buffer<Torus, MULTI_BIT>(
diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_classic.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_classic.cuh
index 515527ade1..5dccab3606 100644
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_classic.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_classic.cuh
@@ -200,6 +200,49 @@ __host__ void scratch_programmable_bootstrap_tbc(
     uint32_t polynomial_size, uint32_t level_count,
     uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) {
+  bool supports_dsm =
+      supports_distributed_shared_memory_on_classic_programmable_bootstrap<
+          Torus>(polynomial_size);
+
+  uint64_t full_sm = get_buffer_size_full_sm_programmable_bootstrap_tbc<Torus>(
+      polynomial_size);
+  uint64_t partial_sm =
+      get_buffer_size_partial_sm_programmable_bootstrap_tbc<Torus>(
+          polynomial_size);
+  uint64_t minimum_sm_tbc = 0;
+  if (supports_dsm)
+    minimum_sm_tbc =
+        get_buffer_size_sm_dsm_plus_tbc_classic_programmable_bootstrap<Torus>(
+            polynomial_size);
+  int max_shared_memory = cuda_get_max_shared_memory(0);
+
+  if (max_shared_memory >= full_sm + minimum_sm_tbc) {
+    check_cuda_error(cudaFuncSetAttribute(
+        device_programmable_bootstrap_tbc<Torus, params, FULLSM>,
+        cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm + minimum_sm_tbc));
+    cudaFuncSetCacheConfig(
+        device_programmable_bootstrap_tbc<Torus, params, FULLSM>,
+        cudaFuncCachePreferShared);
+    check_cuda_error(cudaGetLastError());
+  } else if (max_shared_memory >= partial_sm + minimum_sm_tbc) {
+    check_cuda_error(cudaFuncSetAttribute(
+        device_programmable_bootstrap_tbc<Torus, params, PARTIALSM>,
+        cudaFuncAttributeMaxDynamicSharedMemorySize,
+        partial_sm + minimum_sm_tbc));
+    cudaFuncSetCacheConfig(
+        device_programmable_bootstrap_tbc<Torus, params, PARTIALSM>,
+        cudaFuncCachePreferShared);
+    check_cuda_error(cudaGetLastError());
+  } else {
+    check_cuda_error(cudaFuncSetAttribute(
+        device_programmable_bootstrap_tbc<Torus, params, NOSM>,
+        cudaFuncAttributeMaxDynamicSharedMemorySize, minimum_sm_tbc));
+    cudaFuncSetCacheConfig(
+        device_programmable_bootstrap_tbc<Torus, params, NOSM>,
+        cudaFuncCachePreferShared);
+    check_cuda_error(cudaGetLastError());
+  }
+
   *buffer = new pbs_buffer<Torus, CLASSICAL>(
       stream, gpu_index, glwe_dimension, polynomial_size, level_count,
       input_lwe_ciphertext_count, PBS_VARIANT::TBC, allocate_gpu_memory);
diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_multibit.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_multibit.cuh
index 1e839e6c5b..06223abc3c 100644
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_multibit.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_multibit.cuh
@@ -203,6 +203,84 @@ __host__ void
 scratch_tbc_multi_bit_programmable_bootstrap(
     uint32_t polynomial_size, uint32_t level_count,
     uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) {
+  bool supports_dsm =
+      supports_distributed_shared_memory_on_multibit_programmable_bootstrap<
+          Torus>(polynomial_size);
+
+  uint64_t full_sm_keybundle =
+      get_buffer_size_full_sm_multibit_programmable_bootstrap_keybundle<Torus>(
+          polynomial_size);
+  uint64_t full_sm_tbc_accumulate =
+      get_buffer_size_full_sm_tbc_multibit_programmable_bootstrap<Torus>(
+          polynomial_size);
+  uint64_t partial_sm_tbc_accumulate =
+      get_buffer_size_partial_sm_tbc_multibit_programmable_bootstrap<Torus>(
+          polynomial_size);
+  uint64_t minimum_sm_tbc_accumulate = 0;
+  if (supports_dsm)
+    minimum_sm_tbc_accumulate =
+        get_buffer_size_sm_dsm_plus_tbc_multibit_programmable_bootstrap<Torus>(
+            polynomial_size);
+
+  int max_shared_memory = cuda_get_max_shared_memory(0);
+
+  if (max_shared_memory < full_sm_keybundle) {
+    check_cuda_error(cudaFuncSetAttribute(
+        device_multi_bit_programmable_bootstrap_keybundle<Torus, params, NOSM>,
+        cudaFuncAttributeMaxDynamicSharedMemorySize, 0));
+    cudaFuncSetCacheConfig(
+        device_multi_bit_programmable_bootstrap_keybundle<Torus, params, NOSM>,
+        cudaFuncCachePreferShared);
+    check_cuda_error(cudaGetLastError());
+  } else {
+    check_cuda_error(cudaFuncSetAttribute(
+        device_multi_bit_programmable_bootstrap_keybundle<Torus, params,
+                                                          FULLSM>,
+        cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm_keybundle));
+    cudaFuncSetCacheConfig(
+        device_multi_bit_programmable_bootstrap_keybundle<Torus, params,
+                                                          FULLSM>,
+        cudaFuncCachePreferShared);
+    check_cuda_error(cudaGetLastError());
+  }
+
+  if (max_shared_memory <
+      partial_sm_tbc_accumulate + minimum_sm_tbc_accumulate) {
+    check_cuda_error(cudaFuncSetAttribute(
+        device_multi_bit_programmable_bootstrap_tbc_accumulate<Torus, params,
+                                                               NOSM>,
+        cudaFuncAttributeMaxDynamicSharedMemorySize,
+        minimum_sm_tbc_accumulate));
+    cudaFuncSetCacheConfig(
+        device_multi_bit_programmable_bootstrap_tbc_accumulate<Torus, params,
+                                                               NOSM>,
+        cudaFuncCachePreferShared);
+    check_cuda_error(cudaGetLastError());
+  } else if (max_shared_memory <
+             full_sm_tbc_accumulate + minimum_sm_tbc_accumulate) {
+    check_cuda_error(cudaFuncSetAttribute(
+        device_multi_bit_programmable_bootstrap_tbc_accumulate<Torus, params,
+                                                               PARTIALSM>,
+        cudaFuncAttributeMaxDynamicSharedMemorySize,
+        partial_sm_tbc_accumulate + minimum_sm_tbc_accumulate));
+    cudaFuncSetCacheConfig(
+        device_multi_bit_programmable_bootstrap_tbc_accumulate<Torus, params,
+                                                               PARTIALSM>,
+        cudaFuncCachePreferShared);
+    check_cuda_error(cudaGetLastError());
+  } else {
+    check_cuda_error(cudaFuncSetAttribute(
+        device_multi_bit_programmable_bootstrap_tbc_accumulate<Torus, params,
+                                                               FULLSM>,
+        cudaFuncAttributeMaxDynamicSharedMemorySize,
+        full_sm_tbc_accumulate + minimum_sm_tbc_accumulate));
+    cudaFuncSetCacheConfig(
+        device_multi_bit_programmable_bootstrap_tbc_accumulate<Torus, params,
+                                                               FULLSM>,
+        cudaFuncCachePreferShared);
+    check_cuda_error(cudaGetLastError());
+  }
+
   auto lwe_chunk_size = get_lwe_chunk_size<Torus, params>(
       gpu_index, input_lwe_ciphertext_count, polynomial_size);
   *buffer = new pbs_buffer<Torus, MULTI_BIT>(
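
Note on the pattern above: `cudaDevAttrMaxSharedMemoryPerBlock` reports only the default 48 KB per-block limit, which is why `cuda_get_max_shared_memory` overrides it with hand-tuned per-architecture constants. A kernel can only receive more than 48 KB of dynamic shared memory after opting in through `cudaFuncSetAttribute` with `cudaFuncAttributeMaxDynamicSharedMemorySize`, which is exactly what each hunk adds before the corresponding launch. A minimal, self-contained sketch of that opt-in pattern follows; the kernel name and the 64 KB figure are hypothetical, not taken from the patch:

// Sketch only: opting a kernel into more than the default 48 KB of
// dynamic shared memory.
#include <cstdio>
#include <cuda_runtime.h>

__global__ void wide_tile_kernel(float *out, int n) {
  extern __shared__ float tile[]; // dynamic shared memory
  int i = threadIdx.x;
  if (i < n) {
    tile[i] = static_cast<float>(i);
    __syncthreads();
    out[i] = tile[i];
  }
}

int main() {
  int dflt = 0, optin = 0;
  // Default per-block limit (48 KB) vs. the larger opt-in limit.
  cudaDeviceGetAttribute(&dflt, cudaDevAttrMaxSharedMemoryPerBlock, 0);
  cudaDeviceGetAttribute(&optin, cudaDevAttrMaxSharedMemoryPerBlockOptin, 0);
  printf("default: %d bytes, opt-in: %d bytes\n", dflt, optin);

  size_t shmem = 64 * 1024; // above 48 KB, so the opt-in below is required
  if (cudaFuncSetAttribute(wide_tile_kernel,
                           cudaFuncAttributeMaxDynamicSharedMemorySize,
                           (int)shmem) != cudaSuccess) {
    printf("opt-in failed: requested size exceeds this arch's limit\n");
    return 1;
  }
  float *out;
  cudaMalloc(&out, 256 * sizeof(float));
  wide_tile_kernel<<<1, 256, shmem>>>(out, 256);
  printf("launch: %s\n", cudaGetErrorString(cudaGetLastError()));
  cudaFree(out);
  return 0;
}

The `cudaFuncSetCacheConfig(..., cudaFuncCachePreferShared)` calls in the patch additionally hint the driver to favor shared memory over L1 cache for the kernels that were just opted in.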