diff --git a/backends/tfhe-cuda-backend/cuda/include/pbs/pbs_multibit_utilities.h b/backends/tfhe-cuda-backend/cuda/include/pbs/pbs_multibit_utilities.h
index 1e109b4409..350b5862f4 100644
--- a/backends/tfhe-cuda-backend/cuda/include/pbs/pbs_multibit_utilities.h
+++ b/backends/tfhe-cuda-backend/cuda/include/pbs/pbs_multibit_utilities.h
@@ -106,7 +106,7 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::MULTI_BIT> {
   uint32_t lwe_chunk_size;
   double2 *keybundle_fft;
   Torus *global_accumulator;
-  double2 *global_accumulator_fft;
+  double2 *global_join_buffer;
 
   PBS_VARIANT pbs_variant;
 
@@ -225,10 +225,12 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::MULTI_BIT> {
           num_blocks_keybundle * (polynomial_size / 2) * sizeof(double2),
           stream, gpu_index);
       global_accumulator = (Torus *)cuda_malloc_async(
-          num_blocks_acc_step_one * polynomial_size * sizeof(Torus), stream,
-          gpu_index);
-      global_accumulator_fft = (double2 *)cuda_malloc_async(
-          num_blocks_acc_step_one * (polynomial_size / 2) * sizeof(double2),
+          input_lwe_ciphertext_count * (glwe_dimension + 1) * polynomial_size *
+              sizeof(Torus),
+          stream, gpu_index);
+      global_join_buffer = (double2 *)cuda_malloc_async(
+          level_count * (glwe_dimension + 1) * input_lwe_ciphertext_count *
+              (polynomial_size / 2) * sizeof(double2),
           stream, gpu_index);
     }
   }
@@ -260,7 +262,7 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::MULTI_BIT> {
 
     cuda_drop_async(keybundle_fft, stream, gpu_index);
     cuda_drop_async(global_accumulator, stream, gpu_index);
-    cuda_drop_async(global_accumulator_fft, stream, gpu_index);
+    cuda_drop_async(global_join_buffer, stream, gpu_index);
   }
 };
 
diff --git a/backends/tfhe-cuda-backend/cuda/include/pbs/pbs_utilities.h b/backends/tfhe-cuda-backend/cuda/include/pbs/pbs_utilities.h
index b5451f5bdd..a9e21f77ab 100644
--- a/backends/tfhe-cuda-backend/cuda/include/pbs/pbs_utilities.h
+++ b/backends/tfhe-cuda-backend/cuda/include/pbs/pbs_utilities.h
@@ -69,7 +69,7 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::CLASSICAL> {
   int8_t *d_mem;
 
   Torus *global_accumulator;
-  double2 *global_accumulator_fft;
+  double2 *global_join_buffer;
 
   PBS_VARIANT pbs_variant;
 
@@ -114,7 +114,7 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::CLASSICAL> {
         // Otherwise, both kernels run all in shared memory
         d_mem = (int8_t *)cuda_malloc_async(device_mem, stream, gpu_index);
 
-        global_accumulator_fft = (double2 *)cuda_malloc_async(
+        global_join_buffer = (double2 *)cuda_malloc_async(
             (glwe_dimension + 1) * level_count * input_lwe_ciphertext_count *
                 (polynomial_size / 2) * sizeof(double2),
             stream, gpu_index);
@@ -147,7 +147,7 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::CLASSICAL> {
         // Otherwise, both kernels run all in shared memory
         d_mem = (int8_t *)cuda_malloc_async(device_mem, stream, gpu_index);
 
-        global_accumulator_fft = (double2 *)cuda_malloc_async(
+        global_join_buffer = (double2 *)cuda_malloc_async(
             (glwe_dimension + 1) * level_count * input_lwe_ciphertext_count *
                 polynomial_size / 2 * sizeof(double2),
             stream, gpu_index);
@@ -194,7 +194,7 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::CLASSICAL> {
         // Otherwise, both kernels run all in shared memory
         d_mem = (int8_t *)cuda_malloc_async(device_mem, stream, gpu_index);
 
-        global_accumulator_fft = (double2 *)cuda_malloc_async(
+        global_join_buffer = (double2 *)cuda_malloc_async(
             (glwe_dimension + 1) * level_count * input_lwe_ciphertext_count *
                 polynomial_size / 2 * sizeof(double2),
             stream, gpu_index);
@@ -208,7 +208,7 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::CLASSICAL> {
 
   void release(cudaStream_t stream, uint32_t gpu_index) {
     cuda_drop_async(d_mem, stream, gpu_index);
-    cuda_drop_async(global_accumulator_fft, stream, gpu_index);
+    cuda_drop_async(global_join_buffer, stream, gpu_index);
 
     if (pbs_variant == DEFAULT)
       cuda_drop_async(global_accumulator, stream, gpu_index);
diff --git a/backends/tfhe-cuda-backend/cuda/src/crypto/gadget.cuh b/backends/tfhe-cuda-backend/cuda/src/crypto/gadget.cuh
index 32df0caff7..7b4653af3b 100644
--- a/backends/tfhe-cuda-backend/cuda/src/crypto/gadget.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/crypto/gadget.cuh
@@ -1,6 +1,7 @@
 #ifndef CNCRT_CRYPTO_CUH
 #define CNCRT_CRPYTO_CUH
 
+#include "crypto/torus.cuh"
 #include "device.h"
 #include <cstdint>
 
@@ -21,7 +22,6 @@ private:
   uint32_t base_log;
   uint32_t mask;
   uint32_t num_poly;
-  int current_level;
   T mask_mod_b;
   T *state;
 
@@ -32,7 +32,6 @@ public:
         state(state) {
 
     mask_mod_b = (1ll << base_log) - 1ll;
-    current_level = level_count;
     int tid = threadIdx.x;
     for (int i = 0; i < num_poly * params::opt; i++) {
       state[tid] >>= (sizeof(T) * 8 - base_log * level_count);
@@ -52,8 +51,6 @@ public:
   // Decomposes a single polynomial
   __device__ void decompose_and_compress_next_polynomial(double2 *result,
                                                          int j) {
-    if (j == 0)
-      current_level -= 1;
 
     int tid = threadIdx.x;
     auto state_slice = state + j * params::degree;
@@ -72,8 +69,8 @@ public:
       res_re -= carry_re << base_log;
       res_im -= carry_im << base_log;
 
-      result[tid].x = (int32_t)res_re;
-      result[tid].y = (int32_t)res_im;
+      typecast_torus_to_double(res_re, result[tid].x);
+      typecast_torus_to_double(res_im, result[tid].y);
 
       tid += params::degree / params::opt;
     }
diff --git a/backends/tfhe-cuda-backend/cuda/src/crypto/torus.cuh b/backends/tfhe-cuda-backend/cuda/src/crypto/torus.cuh
index b4ef3259a9..f9875b107a 100644
--- a/backends/tfhe-cuda-backend/cuda/src/crypto/torus.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/crypto/torus.cuh
@@ -1,6 +1,7 @@
 #ifndef CNCRT_TORUS_CUH
 #define CNCRT_TORUS_CUH
 
+#include "device.h"
 #include "polynomial/parameters.cuh"
 #include "types/int128.cuh"
 #include "utils/kernel_dimensions.cuh"
@@ -43,6 +44,21 @@ __device__ inline void typecast_double_round_to_torus(double x, T &r) {
   typecast_double_to_torus(round(frac), r);
 }
 
+template <typename T>
+__device__ inline void typecast_torus_to_double(T x, double &r);
+
+template <>
+__device__ inline void typecast_torus_to_double<uint32_t>(uint32_t x,
+                                                          double &r) {
+  r = __int2double_rn(x);
+}
+
+template <>
+__device__ inline void typecast_torus_to_double<uint64_t>(uint64_t x,
+                                                          double &r) {
+  r = __ll2double_rn(x);
+}
+
 template <typename T>
 __device__ inline T round_to_closest_multiple(T x, uint32_t base_log,
                                               uint32_t level_count) {
diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap.cuh
index 95a7a78c14..9215bc044e 100644
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap.cuh
@@ -7,6 +7,7 @@
 #include "fft/bnsmfft.cuh"
 #include "helper_multi_gpu.h"
 #include "pbs/programmable_bootstrap_multibit.h"
+#include "polynomial/polynomial_math.cuh"
 
 using namespace cooperative_groups;
 namespace cg = cooperative_groups;
@@ -20,59 +21,43 @@ get_join_buffer_element(int level_id, int glwe_id, G &group,
                         double2 *global_memory_buffer, uint32_t polynomial_size,
                         uint32_t glwe_dimension, bool support_dsm);
 
-template <typename Torus, typename G, class params>
+/** Perform the matrix multiplication between the GGSW and the GLWE,
+ * each block operating on a single level for mask and body.
+ * Both operands should be at fourier domain
+ *
+ * This function assumes:
+ *  - Thread blocks at dimension x relates to the decomposition level.
+ *  - Thread blocks at dimension y relates to the glwe dimension.
+ *  - polynomial_size / params::opt threads are available per block
+ */
+template <typename G, class params>
 __device__ void
-mul_ggsw_glwe(Torus *accumulator, double2 *fft, double2 *join_buffer,
-              const double2 *__restrict__ bootstrapping_key,
-              int polynomial_size, uint32_t glwe_dimension, int level_count,
-              int iteration, G &group, bool support_dsm = false) {
-
-  // Switch to the FFT space
-  NSMFFT_direct<HalfDegree<params>>(fft);
-  synchronize_threads_in_block();
-
-  // Get the pieces of the bootstrapping key that will be needed for the
-  // external product; blockIdx.x is the ID of the block that's executing
-  // this function, so we end up getting the lines of the bootstrapping key
-  // needed to perform the external product in this block (corresponding to
-  // the same decomposition level)
-  auto bsk_slice = get_ith_mask_kth_block(
-      bootstrapping_key, iteration, blockIdx.y, blockIdx.x, polynomial_size,
-      glwe_dimension, level_count);
-
-  // Perform the matrix multiplication between the GGSW and the GLWE,
-  // each block operating on a single level for mask and body
+mul_ggsw_glwe_in_fourier_domain(double2 *fft, double2 *join_buffer,
+                                const double2 *__restrict__ bootstrapping_key,
+                                int iteration, G &group,
+                                bool support_dsm = false) {
+  const uint32_t polynomial_size = params::degree;
+  const uint32_t glwe_dimension = gridDim.y - 1;
+  const uint32_t level_count = gridDim.x;
 
   // The first product is used to initialize level_join_buffer
-  auto bsk_poly = bsk_slice + blockIdx.y * params::degree / 2;
   auto this_block_rank = get_this_block_rank<G>(group, support_dsm);
-  auto buffer_slice =
-      get_join_buffer_element<G>(blockIdx.x, blockIdx.y, group, join_buffer,
-                                 polynomial_size, glwe_dimension, support_dsm);
-
-  int tid = threadIdx.x;
-  for (int i = 0; i < params::opt / 2; i++) {
-    buffer_slice[tid] = fft[tid] * bsk_poly[tid];
-    tid += params::degree / params::opt;
-  }
-
-  group.sync();
 
   // Continues multiplying fft by every polynomial in that particular bsk level
   // Each y-block accumulates in a different polynomial at each iteration
-  for (int j = 1; j < (glwe_dimension + 1); j++) {
+  auto bsk_slice = get_ith_mask_kth_block(
+      bootstrapping_key, iteration, blockIdx.y, blockIdx.x, polynomial_size,
+      glwe_dimension, level_count);
+  for (int j = 0; j < glwe_dimension + 1; j++) {
     int idx = (j + this_block_rank) % (glwe_dimension + 1);
 
-    auto bsk_poly = bsk_slice + idx * params::degree / 2;
+    auto bsk_poly = bsk_slice + idx * polynomial_size / 2;
     auto buffer_slice = get_join_buffer_element<G>(blockIdx.x, idx, group,
                                                    join_buffer, polynomial_size,
                                                    glwe_dimension, support_dsm);
 
-    int tid = threadIdx.x;
-    for (int i = 0; i < params::opt / 2; i++) {
-      buffer_slice[tid] += fft[tid] * bsk_poly[tid];
-      tid += params::degree / params::opt;
-    }
+    polynomial_product_accumulate_in_fourier_domain<params, double2>(
+        buffer_slice, fft, bsk_poly, j == 0);
     group.sync();
   }
 
@@ -80,40 +65,16 @@ mul_ggsw_glwe(Torus *accumulator, double2 *fft, double2 *join_buffer,
   // All blocks are synchronized here; after this sync, level_join_buffer has
   // the values needed from every other block
 
-  auto src_acc =
-      get_join_buffer_element<G>(0, blockIdx.y, group, join_buffer,
-                                 polynomial_size, glwe_dimension, support_dsm);
-
-  // copy first product into fft buffer
-  tid = threadIdx.x;
-  for (int i = 0; i < params::opt / 2; i++) {
-    fft[tid] = src_acc[tid];
-    tid += params::degree / params::opt;
-  }
-  synchronize_threads_in_block();
-
   // accumulate rest of the products into fft buffer
-  for (int l = 1; l < gridDim.x; l++) {
+  for (int l = 0; l < level_count; l++) {
     auto cur_src_acc = get_join_buffer_element<G>(l, blockIdx.y, group,
                                                   join_buffer, polynomial_size,
                                                   glwe_dimension, support_dsm);
-    tid = threadIdx.x;
-    for (int i = 0; i < params::opt / 2; i++) {
-      fft[tid] += cur_src_acc[tid];
-      tid += params::degree / params::opt;
-    }
-  }
 
-  synchronize_threads_in_block();
+    polynomial_accumulate_in_fourier_domain<params>(fft, cur_src_acc, l == 0);
+  }
 
-  // Perform the inverse FFT on the result of the GGSW x GLWE and add to the
-  // accumulator
-  NSMFFT_inverse<HalfDegree<params>>(fft);
   synchronize_threads_in_block();
-
-  add_to_torus<Torus, params>(fft, accumulator);
-
-  __syncthreads();
 }
 
 template <typename Torus>
diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_classic.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_classic.cuh
index 682f073ca7..0cfd95efb7 100644
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_classic.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_classic.cuh
@@ -129,18 +129,16 @@ __global__ void device_programmable_bootstrap_cg(
     GadgetMatrix<Torus, params> gadget_acc(base_log, level_count,
                                            accumulator_rotated);
     gadget_acc.decompose_and_compress_level(accumulator_fft, blockIdx.x);
-
-    // We are using the same memory space for accumulator_fft and
-    // accumulator_rotated, so we need to synchronize here to make sure they
-    // don't modify the same memory space at the same time
+    NSMFFT_direct<HalfDegree<params>>(accumulator_fft);
     synchronize_threads_in_block();
 
     // Perform G^-1(ACC) * GGSW -> GLWE
-    mul_ggsw_glwe<Torus, grid_group, params>(
-        accumulator, accumulator_fft, block_join_buffer, bootstrapping_key,
-        polynomial_size, glwe_dimension, level_count, i, grid);
-
+    mul_ggsw_glwe_in_fourier_domain<grid_group, params>(
+        accumulator_fft, block_join_buffer, bootstrapping_key, i, grid);
+    NSMFFT_inverse<HalfDegree<params>>(accumulator_fft);
     synchronize_threads_in_block();
+
+    add_to_torus<Torus, params>(accumulator_fft, accumulator);
   }
 
   auto block_lwe_array_out =
@@ -148,40 +146,42 @@ __global__ void device_programmable_bootstrap_cg(
                          (glwe_dimension * polynomial_size + 1) +
                      blockIdx.y * polynomial_size];
 
-  if (blockIdx.x == 0 && blockIdx.y < glwe_dimension) {
-    // Perform a sample extract. At this point, all blocks have the result, but
-    // we do the computation at block 0 to avoid waiting for extra blocks, in
-    // case they're not synchronized
-    sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator);
-    if (lut_count > 1) {
-      for (int i = 1; i < lut_count; i++) {
-        auto next_lwe_array_out =
-            lwe_array_out +
-            (i * gridDim.z * (glwe_dimension * polynomial_size + 1));
-        auto next_block_lwe_array_out =
-            &next_lwe_array_out[lwe_output_indexes[blockIdx.z] *
-                                    (glwe_dimension * polynomial_size + 1) +
-                                blockIdx.y * polynomial_size];
-
-        sample_extract_mask<Torus, params>(next_block_lwe_array_out,
-                                           accumulator, 1, i * lut_stride);
+  if (blockIdx.x == 0) {
+    if (blockIdx.y < glwe_dimension) {
+      // Perform a sample extract. At this point, all blocks have the result,
+      // but we do the computation at block 0 to avoid waiting for extra blocks,
+      // in case they're not synchronized
+      sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator);
+      if (lut_count > 1) {
+        for (int i = 1; i < lut_count; i++) {
+          auto next_lwe_array_out =
+              lwe_array_out +
+              (i * gridDim.z * (glwe_dimension * polynomial_size + 1));
+          auto next_block_lwe_array_out =
+              &next_lwe_array_out[lwe_output_indexes[blockIdx.z] *
+                                      (glwe_dimension * polynomial_size + 1) +
+                                  blockIdx.y * polynomial_size];
+
+          sample_extract_mask<Torus, params>(next_block_lwe_array_out,
+                                             accumulator, 1, i * lut_stride);
+        }
       }
-    }
-  } else if (blockIdx.x == 0 && blockIdx.y == glwe_dimension) {
-    sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0);
-    if (lut_count > 1) {
-      for (int i = 1; i < lut_count; i++) {
-
-        auto next_lwe_array_out =
-            lwe_array_out +
-            (i * gridDim.z * (glwe_dimension * polynomial_size + 1));
-        auto next_block_lwe_array_out =
-            &next_lwe_array_out[lwe_output_indexes[blockIdx.z] *
-                                    (glwe_dimension * polynomial_size + 1) +
-                                blockIdx.y * polynomial_size];
-
-        sample_extract_body<Torus, params>(next_block_lwe_array_out,
-                                           accumulator, 0, i * lut_stride);
+    } else if (blockIdx.y == glwe_dimension) {
+      sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0);
+      if (lut_count > 1) {
+        for (int i = 1; i < lut_count; i++) {
+
+          auto next_lwe_array_out =
+              lwe_array_out +
+              (i * gridDim.z * (glwe_dimension * polynomial_size + 1));
+          auto next_block_lwe_array_out =
+              &next_lwe_array_out[lwe_output_indexes[blockIdx.z] *
+                                      (glwe_dimension * polynomial_size + 1) +
+                                  blockIdx.y * polynomial_size];
+
+          sample_extract_body<Torus, params>(next_block_lwe_array_out,
+                                             accumulator, 0, i * lut_stride);
+        }
       }
     }
   }
@@ -254,7 +254,7 @@ __host__ void host_programmable_bootstrap_cg(
   uint64_t partial_dm = full_dm - partial_sm;
 
   int8_t *d_mem = buffer->d_mem;
-  double2 *buffer_fft = buffer->global_accumulator_fft;
+  double2 *buffer_fft = buffer->global_join_buffer;
 
   int thds = polynomial_size / params::opt;
   dim3 grid(level_count, glwe_dimension + 1, input_lwe_ciphertext_count);
diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_multibit.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_multibit.cuh
index b8ddc8fdc7..1e89164235 100644
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_multibit.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_multibit.cuh
@@ -33,7 +33,6 @@ __global__ void __launch_bounds__(params::degree / params::opt)
         uint32_t lwe_chunk_size, uint32_t keybundle_size_per_input,
         int8_t *device_mem, uint64_t device_memory_size_per_block,
         uint32_t lut_count, uint32_t lut_stride) {
-
   grid_group grid = this_grid();
 
   // We use shared memory for the polynomials that are used often during the
@@ -50,9 +49,9 @@ __global__ void __launch_bounds__(params::degree / params::opt)
     selected_memory = &device_mem[block_index * device_memory_size_per_block];
   }
 
-  Torus *accumulator = (Torus *)selected_memory;
+  Torus *accumulator_rotated = (Torus *)selected_memory;
   double2 *accumulator_fft =
-      (double2 *)accumulator +
+      (double2 *)accumulator_rotated +
       (ptrdiff_t)(sizeof(Torus) * polynomial_size / sizeof(double2));
 
   if constexpr (SMD == PARTIALSM)
@@ -71,13 +70,12 @@ __global__ void __launch_bounds__(params::degree / params::opt)
       &join_buffer[blockIdx.z * level_count * (glwe_dimension + 1) *
                    params::degree / 2];
 
-  Torus *global_slice =
-      global_accumulator +
-      (blockIdx.y + blockIdx.z * (glwe_dimension + 1)) * params::degree;
+  Torus *global_accumulator_slice =
+      &global_accumulator[(blockIdx.y + blockIdx.z * (glwe_dimension + 1)) *
+                          params::degree];
 
-  const double2 *keybundle = keybundle_array +
-                             // select the input
-                             blockIdx.z * keybundle_size_per_input;
+  const double2 *keybundle =
+      &keybundle_array[blockIdx.z * keybundle_size_per_input];
 
   if (lwe_offset == 0) {
     // Put "b" in [0, 2N[
@@ -87,12 +85,12 @@ __global__ void __launch_bounds__(params::degree / params::opt)
 
     divide_by_monomial_negacyclic_inplace<Torus, params::opt,
                                           params::degree / params::opt>(
-        accumulator, &block_lut_vector[blockIdx.y * params::degree], b_hat,
-        false);
+        accumulator_rotated, &block_lut_vector[blockIdx.y * params::degree],
+        b_hat, false);
   } else {
-    // Load the accumulator calculated in previous iterations
+    // Load the accumulator_rotated calculated in previous iterations
     copy_polynomial<Torus, params::opt, params::degree / params::opt>(
-        global_slice, accumulator);
+        global_accumulator_slice, accumulator_rotated);
   }
 
   for (int i = 0; (i + lwe_offset) < lwe_dimension && i < lwe_chunk_size; i++) {
@@ -100,79 +98,82 @@ __global__ void __launch_bounds__(params::degree / params::opt)
     // bootstrapped ciphertext
     round_to_closest_multiple_inplace<Torus, params::opt,
                                       params::degree / params::opt>(
-        accumulator, base_log, level_count);
+        accumulator_rotated, base_log, level_count);
 
-    // Decompose the accumulator. Each block gets one level of the
+    // Decompose the accumulator_rotated. Each block gets one level of the
     // decomposition, for the mask and the body (so block 0 will have the
-    // accumulator decomposed at level 0, 1 at 1, etc.)
-    GadgetMatrix<Torus, params> gadget_acc(base_log, level_count, accumulator);
+    // accumulator_rotated decomposed at level 0, 1 at 1, etc.)
+    GadgetMatrix<Torus, params> gadget_acc(base_log, level_count,
+                                           accumulator_rotated);
     gadget_acc.decompose_and_compress_level(accumulator_fft, blockIdx.x);
-
-    // We are using the same memory space for accumulator_fft and
-    // accumulator_rotated, so we need to synchronize here to make sure they
-    // don't modify the same memory space at the same time
+    NSMFFT_direct<HalfDegree<params>>(accumulator_fft);
     synchronize_threads_in_block();
 
     // Perform G^-1(ACC) * GGSW -> GLWE
-    mul_ggsw_glwe<Torus, grid_group, params>(
-        accumulator, accumulator_fft, block_join_buffer, keybundle,
-        polynomial_size, glwe_dimension, level_count, i, grid);
-
+    mul_ggsw_glwe_in_fourier_domain<grid_group, params>(
+        accumulator_fft, block_join_buffer, keybundle, i, grid);
+    NSMFFT_inverse<HalfDegree<params>>(accumulator_fft);
     synchronize_threads_in_block();
+
+    add_to_torus<Torus, params>(accumulator_fft, accumulator_rotated, true);
   }
 
-  if (lwe_offset + lwe_chunk_size >= (lwe_dimension / grouping_factor)) {
-    auto block_lwe_array_out =
-        &lwe_array_out[lwe_output_indexes[blockIdx.z] *
-                           (glwe_dimension * polynomial_size + 1) +
-                       blockIdx.y * polynomial_size];
-
-    if (blockIdx.x == 0 && blockIdx.y < glwe_dimension) {
-      // Perform a sample extract. At this point, all blocks have the result,
-      // but we do the computation at block 0 to avoid waiting for extra blocks,
-      // in case they're not synchronized
-      // Always extract one by default
-      sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator);
-
-      if (lut_count > 1) {
-        for (int i = 1; i < lut_count; i++) {
-          auto next_lwe_array_out =
-              lwe_array_out +
-              (i * gridDim.z * (glwe_dimension * polynomial_size + 1));
-          auto next_block_lwe_array_out =
-              &next_lwe_array_out[lwe_output_indexes[blockIdx.z] *
-                                      (glwe_dimension * polynomial_size + 1) +
-                                  blockIdx.y * polynomial_size];
-
-          sample_extract_mask<Torus, params>(next_block_lwe_array_out,
-                                             accumulator, 1, i * lut_stride);
+  auto accumulator = accumulator_rotated;
+
+  if (blockIdx.x == 0) {
+    if (lwe_offset + lwe_chunk_size >= (lwe_dimension / grouping_factor)) {
+      auto block_lwe_array_out =
+          &lwe_array_out[lwe_output_indexes[blockIdx.z] *
+                             (glwe_dimension * polynomial_size + 1) +
+                         blockIdx.y * polynomial_size];
+
+      if (blockIdx.y < glwe_dimension) {
+        // Perform a sample extract. At this point, all blocks have the result,
+        // but we do the computation at block 0 to avoid waiting for extra
+        // blocks, in case they're not synchronized Always extract one by
+        // default
+        sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator);
+
+        if (lut_count > 1) {
+          for (int i = 1; i < lut_count; i++) {
+            auto next_lwe_array_out =
+                lwe_array_out +
+                (i * gridDim.z * (glwe_dimension * polynomial_size + 1));
+            auto next_block_lwe_array_out =
+                &next_lwe_array_out[lwe_output_indexes[blockIdx.z] *
+                                        (glwe_dimension * polynomial_size + 1) +
+                                    blockIdx.y * polynomial_size];
+
+            sample_extract_mask<Torus, params>(next_block_lwe_array_out,
+                                               accumulator, 1, i * lut_stride);
+          }
         }
-      }
 
-    } else if (blockIdx.x == 0 && blockIdx.y == glwe_dimension) {
+      } else if (blockIdx.y == glwe_dimension) {
 
-      sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0);
+        sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0);
 
-      if (lut_count > 1) {
-        for (int i = 1; i < lut_count; i++) {
+        if (lut_count > 1) {
+          for (int i = 1; i < lut_count; i++) {
 
-          auto next_lwe_array_out =
-              lwe_array_out +
-              (i * gridDim.z * (glwe_dimension * polynomial_size + 1));
-          auto next_block_lwe_array_out =
-              &next_lwe_array_out[lwe_output_indexes[blockIdx.z] *
-                                      (glwe_dimension * polynomial_size + 1) +
-                                  blockIdx.y * polynomial_size];
+            auto next_lwe_array_out =
+                lwe_array_out +
+                (i * gridDim.z * (glwe_dimension * polynomial_size + 1));
+            auto next_block_lwe_array_out =
+                &next_lwe_array_out[lwe_output_indexes[blockIdx.z] *
+                                        (glwe_dimension * polynomial_size + 1) +
+                                    blockIdx.y * polynomial_size];
 
-          sample_extract_body<Torus, params>(next_block_lwe_array_out,
-                                             accumulator, 0, i * lut_stride);
+            sample_extract_body<Torus, params>(next_block_lwe_array_out,
+                                               accumulator, 0, i * lut_stride);
+          }
         }
       }
+    } else {
+      // Load the accumulator calculated in previous iterations
+      copy_polynomial<Torus, params::opt, params::degree / params::opt>(
+          accumulator, global_accumulator_slice);
     }
-  } else {
-    // Load the accumulator calculated in previous iterations
-    copy_polynomial<Torus, params::opt, params::degree / params::opt>(
-        accumulator, global_slice);
   }
 }
 
@@ -295,15 +296,18 @@ __host__ void execute_cg_external_product_loop(
     uint32_t level_count, uint32_t lwe_offset, uint32_t lut_count,
     uint32_t lut_stride) {
 
-  auto lwe_chunk_size = buffer->lwe_chunk_size;
-  uint64_t full_dm =
+  uint64_t full_sm =
       get_buffer_size_full_sm_cg_multibit_programmable_bootstrap<Torus>(
           polynomial_size);
-  uint64_t partial_dm =
+  uint64_t partial_sm =
       get_buffer_size_partial_sm_cg_multibit_programmable_bootstrap<Torus>(
           polynomial_size);
+
+  auto full_dm = full_sm;
+  auto partial_dm = full_sm - partial_sm;
   uint64_t no_dm = 0;
 
+  auto lwe_chunk_size = buffer->lwe_chunk_size;
   int max_shared_memory = cuda_get_max_shared_memory(0);
   cudaSetDevice(gpu_index);
 
@@ -313,13 +317,11 @@ __host__ void execute_cg_external_product_loop(
 
   uint32_t chunk_size =
       std::min(lwe_chunk_size, (lwe_dimension / grouping_factor) - lwe_offset);
-  if (chunk_size == 0)
-    return;
 
   auto d_mem = buffer->d_mem_acc_cg;
   auto keybundle_fft = buffer->keybundle_fft;
   auto global_accumulator = buffer->global_accumulator;
-  auto buffer_fft = buffer->global_accumulator_fft;
+  auto join_buffer = buffer->global_join_buffer;
 
   void *kernel_args[22];
   kernel_args[0] = &lwe_array_out;
@@ -329,7 +331,7 @@ __host__ void execute_cg_external_product_loop(
   kernel_args[4] = &lwe_array_in;
   kernel_args[5] = &lwe_input_indexes;
   kernel_args[6] = &keybundle_fft;
-  kernel_args[7] = &buffer_fft;
+  kernel_args[7] = &join_buffer;
   kernel_args[8] = &global_accumulator;
   kernel_args[9] = &lwe_dimension;
   kernel_args[10] = &glwe_dimension;
@@ -358,13 +360,13 @@ __host__ void execute_cg_external_product_loop(
     check_cuda_error(cudaLaunchCooperativeKernel(
         (void *)device_multi_bit_programmable_bootstrap_cg_accumulate<
             Torus, params, PARTIALSM>,
-        grid_accumulate, thds, (void **)kernel_args, partial_dm, stream));
+        grid_accumulate, thds, (void **)kernel_args, partial_sm, stream));
   } else {
     kernel_args[19] = &no_dm;
     check_cuda_error(cudaLaunchCooperativeKernel(
         (void *)device_multi_bit_programmable_bootstrap_cg_accumulate<
             Torus, params, FULLSM>,
-        grid_accumulate, thds, (void **)kernel_args, full_dm, stream));
+        grid_accumulate, thds, (void **)kernel_args, full_sm, stream));
   }
 }
 
diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cuh
index 8a534b2e83..831d3a6478 100644
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cuh
@@ -25,7 +25,7 @@ __global__ void __launch_bounds__(params::degree / params::opt)
         const Torus *__restrict__ lwe_array_in,
         const Torus *__restrict__ lwe_input_indexes,
         const double2 *__restrict__ bootstrapping_key,
-        Torus *global_accumulator, double2 *global_accumulator_fft,
+        Torus *global_accumulator, double2 *global_join_buffer,
         uint32_t lwe_iteration, uint32_t lwe_dimension,
         uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
         int8_t *device_mem, uint64_t device_memory_size_per_block) {
@@ -67,10 +67,9 @@ __global__ void __launch_bounds__(params::degree / params::opt)
       (blockIdx.y + blockIdx.z * (glwe_dimension + 1)) * params::degree;
 
   double2 *global_fft_slice =
-      global_accumulator_fft +
-      (blockIdx.y + blockIdx.x * (glwe_dimension + 1) +
-       blockIdx.z * level_count * (glwe_dimension + 1)) *
-          (polynomial_size / 2);
+      global_join_buffer + (blockIdx.y + blockIdx.x * (glwe_dimension + 1) +
+                            blockIdx.z * level_count * (glwe_dimension + 1)) *
+                               (polynomial_size / 2);
 
   if (lwe_iteration == 0) {
     // First iteration
@@ -139,7 +138,7 @@ __global__ void __launch_bounds__(params::degree / params::opt)
         const Torus *__restrict__ lut_vector,
         const Torus *__restrict__ lut_vector_indexes,
         const double2 *__restrict__ bootstrapping_key,
-        Torus *global_accumulator, double2 *global_accumulator_fft,
+        Torus *global_accumulator, double2 *global_join_buffer,
         uint32_t lwe_iteration, uint32_t lwe_dimension,
         uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
         int8_t *device_mem, uint64_t device_memory_size_per_block,
@@ -171,9 +170,9 @@ __global__ void __launch_bounds__(params::degree / params::opt)
     accumulator_fft = (double2 *)sharedmem;
 
   for (int level = 0; level < level_count; level++) {
-    double2 *global_fft_slice = global_accumulator_fft +
-                                (level + blockIdx.x * level_count) *
-                                    (glwe_dimension + 1) * (params::degree / 2);
+    double2 *global_fft_slice =
+        global_join_buffer + (level + blockIdx.x * level_count) *
+                                 (glwe_dimension + 1) * (params::degree / 2);
 
     for (int j = 0; j < (glwe_dimension + 1); j++) {
       double2 *fft = global_fft_slice + j * params::degree / 2;
@@ -292,7 +291,7 @@ uint64_t get_buffer_size_programmable_bootstrap(
   }
   // Otherwise, both kernels run all in shared memory
   uint64_t buffer_size = device_mem +
-                         // global_accumulator_fft
+                         // global_join_buffer
                          (glwe_dimension + 1) * level_count *
                              input_lwe_ciphertext_count *
                              (polynomial_size / 2) * sizeof(double2) +
@@ -368,7 +367,7 @@ __host__ void execute_step_one(
     cudaStream_t stream, uint32_t gpu_index, Torus const *lut_vector,
     Torus const *lut_vector_indexes, Torus const *lwe_array_in,
     Torus const *lwe_input_indexes, double2 const *bootstrapping_key,
-    Torus *global_accumulator, double2 *global_accumulator_fft,
+    Torus *global_accumulator, double2 *global_join_buffer,
     uint32_t input_lwe_ciphertext_count, uint32_t lwe_dimension,
     uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
     uint32_t level_count, int8_t *d_mem, int lwe_iteration, uint64_t partial_sm,
@@ -383,21 +382,21 @@ __host__ void execute_step_one(
     device_programmable_bootstrap_step_one<Torus, params, NOSM>
         <<<grid, thds, 0, stream>>>(
             lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes,
-            bootstrapping_key, global_accumulator, global_accumulator_fft,
+            bootstrapping_key, global_accumulator, global_join_buffer,
             lwe_iteration, lwe_dimension, polynomial_size, base_log,
             level_count, d_mem, full_dm);
   } else if (max_shared_memory < full_sm) {
     device_programmable_bootstrap_step_one<Torus, params, PARTIALSM>
         <<<grid, thds, partial_sm, stream>>>(
             lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes,
-            bootstrapping_key, global_accumulator, global_accumulator_fft,
+            bootstrapping_key, global_accumulator, global_join_buffer,
             lwe_iteration, lwe_dimension, polynomial_size, base_log,
             level_count, d_mem, partial_dm);
   } else {
     device_programmable_bootstrap_step_one<Torus, params, FULLSM>
         <<<grid, thds, full_sm, stream>>>(
             lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes,
-            bootstrapping_key, global_accumulator, global_accumulator_fft,
+            bootstrapping_key, global_accumulator, global_join_buffer,
             lwe_iteration, lwe_dimension, polynomial_size, base_log,
             level_count, d_mem, 0);
   }
@@ -409,7 +408,7 @@ __host__ void execute_step_two(
     cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out,
     Torus const *lwe_output_indexes, Torus const *lut_vector,
     Torus const *lut_vector_indexes, double2 const *bootstrapping_key,
-    Torus *global_accumulator, double2 *global_accumulator_fft,
+    Torus *global_accumulator, double2 *global_join_buffer,
     uint32_t input_lwe_ciphertext_count, uint32_t lwe_dimension,
     uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
     uint32_t level_count, int8_t *d_mem, int lwe_iteration, uint64_t partial_sm,
@@ -425,21 +424,21 @@ __host__ void execute_step_two(
     device_programmable_bootstrap_step_two<Torus, params, NOSM>
         <<<grid, thds, 0, stream>>>(
             lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes,
-            bootstrapping_key, global_accumulator, global_accumulator_fft,
+            bootstrapping_key, global_accumulator, global_join_buffer,
             lwe_iteration, lwe_dimension, polynomial_size, base_log,
             level_count, d_mem, full_dm, lut_count, lut_stride);
   } else if (max_shared_memory < full_sm) {
     device_programmable_bootstrap_step_two<Torus, params, PARTIALSM>
         <<<grid, thds, partial_sm, stream>>>(
             lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes,
-            bootstrapping_key, global_accumulator, global_accumulator_fft,
+            bootstrapping_key, global_accumulator, global_join_buffer,
             lwe_iteration, lwe_dimension, polynomial_size, base_log,
             level_count, d_mem, partial_dm, lut_count, lut_stride);
   } else {
     device_programmable_bootstrap_step_two<Torus, params, FULLSM>
         <<<grid, thds, full_sm, stream>>>(
             lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes,
-            bootstrapping_key, global_accumulator, global_accumulator_fft,
+            bootstrapping_key, global_accumulator, global_join_buffer,
             lwe_iteration, lwe_dimension, polynomial_size, base_log,
             level_count, d_mem, 0, lut_count, lut_stride);
   }
@@ -478,20 +477,20 @@ __host__ void host_programmable_bootstrap(
   uint64_t full_dm_step_two = full_sm_step_two;
 
   Torus *global_accumulator = pbs_buffer->global_accumulator;
-  double2 *global_accumulator_fft = pbs_buffer->global_accumulator_fft;
+  double2 *global_join_buffer = pbs_buffer->global_join_buffer;
   int8_t *d_mem = pbs_buffer->d_mem;
 
   for (int i = 0; i < lwe_dimension; i++) {
     execute_step_one<Torus, params>(
         stream, gpu_index, lut_vector, lut_vector_indexes, lwe_array_in,
         lwe_input_indexes, bootstrapping_key, global_accumulator,
-        global_accumulator_fft, input_lwe_ciphertext_count, lwe_dimension,
+        global_join_buffer, input_lwe_ciphertext_count, lwe_dimension,
         glwe_dimension, polynomial_size, base_log, level_count, d_mem, i,
         partial_sm, partial_dm_step_one, full_sm_step_one, full_dm_step_one);
     execute_step_two<Torus, params>(
         stream, gpu_index, lwe_array_out, lwe_output_indexes, lut_vector,
         lut_vector_indexes, bootstrapping_key, global_accumulator,
-        global_accumulator_fft, input_lwe_ciphertext_count, lwe_dimension,
+        global_join_buffer, input_lwe_ciphertext_count, lwe_dimension,
         glwe_dimension, polynomial_size, base_log, level_count, d_mem, i,
         partial_sm, partial_dm_step_two, full_sm_step_two, full_dm_step_two,
         lut_count, lut_stride);
diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cuh
index 15802c5996..b57c61cf90 100644
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cuh
@@ -50,7 +50,7 @@ __global__ void device_multi_bit_programmable_bootstrap_keybundle(
     uint64_t device_memory_size_per_block) {
 
   extern __shared__ int8_t sharedmem[];
-  int8_t *selected_memory = sharedmem;
+  int8_t *selected_memory;
 
   if constexpr (SMD == FULLSM) {
     selected_memory = sharedmem;
@@ -190,14 +190,14 @@ __global__ void __launch_bounds__(params::degree / params::opt)
                   (glwe_dimension + 1)];
 
   Torus *global_slice =
-      global_accumulator +
-      (blockIdx.y + blockIdx.z * (glwe_dimension + 1)) * params::degree;
+      &global_accumulator[(blockIdx.y + blockIdx.z * (glwe_dimension + 1)) *
+                          params::degree];
 
   double2 *global_fft_slice =
-      global_accumulator_fft +
-      (blockIdx.y + blockIdx.x * (glwe_dimension + 1) +
-       blockIdx.z * level_count * (glwe_dimension + 1)) *
-          (polynomial_size / 2);
+      &global_accumulator_fft[(blockIdx.y + blockIdx.x * (glwe_dimension + 1) +
+                               blockIdx.z * level_count *
+                                   (glwe_dimension + 1)) *
+                              (polynomial_size / 2)];
 
   if (lwe_iteration == 0) {
     // First iteration
@@ -249,8 +249,8 @@ __global__ void __launch_bounds__(params::degree / params::opt)
     device_multi_bit_programmable_bootstrap_accumulate_step_two(
         Torus *lwe_array_out, const Torus *__restrict__ lwe_output_indexes,
         const double2 *__restrict__ keybundle_array, Torus *global_accumulator,
-        double2 *global_accumulator_fft, uint32_t lwe_dimension,
-        uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
+        double2 *join_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
+        uint32_t polynomial_size, uint32_t level_count,
         uint32_t grouping_factor, uint32_t iteration, uint32_t lwe_offset,
         uint32_t lwe_chunk_size, int8_t *device_mem,
         uint64_t device_memory_size_per_block, uint32_t lut_count,
@@ -274,30 +274,29 @@ __global__ void __launch_bounds__(params::degree / params::opt)
   double2 *accumulator_fft = (double2 *)selected_memory;
 
   //
-  const double2 *keybundle = keybundle_array +
-                             // select the input
-                             blockIdx.x * lwe_chunk_size * level_count *
-                                 (glwe_dimension + 1) * (glwe_dimension + 1) *
-                                 (polynomial_size / 2);
+  const double2 *keybundle =
+      &keybundle_array[blockIdx.x * lwe_chunk_size * level_count *
+                       (glwe_dimension + 1) * (glwe_dimension + 1) *
+                       (polynomial_size / 2)];
 
-  double2 *global_accumulator_fft_input =
-      global_accumulator_fft +
-      blockIdx.x * level_count * (glwe_dimension + 1) * (polynomial_size / 2);
+  double2 *join_buffer_slice =
+      &join_buffer[blockIdx.x * level_count * (glwe_dimension + 1) *
+                   (polynomial_size / 2)];
 
   for (int level = 0; level < level_count; level++) {
     double2 *global_fft_slice =
-        global_accumulator_fft_input +
-        level * (glwe_dimension + 1) * (polynomial_size / 2);
+        &join_buffer_slice[level * (glwe_dimension + 1) *
+                           (polynomial_size / 2)];
 
     for (int j = 0; j < (glwe_dimension + 1); j++) {
-      double2 *fft = global_fft_slice + j * params::degree / 2;
+      double2 *fft = &global_fft_slice[j * params::degree / 2];
 
       // Get the bootstrapping key piece necessary for the multiplication
       // It is already in the Fourier domain
       auto bsk_slice =
           get_ith_mask_kth_block(keybundle, iteration, j, level,
                                  polynomial_size, glwe_dimension, level_count);
-      auto bsk_poly = bsk_slice + blockIdx.y * params::degree / 2;
+      auto bsk_poly = &bsk_slice[blockIdx.y * params::degree / 2];
 
       polynomial_product_accumulate_in_fourier_domain<params, double2>(
           accumulator_fft, fft, bsk_poly, !level && !j);
@@ -308,8 +307,8 @@ __global__ void __launch_bounds__(params::degree / params::opt)
   // accumulator
   NSMFFT_inverse<HalfDegree<params>>(accumulator_fft);
   Torus *global_slice =
-      global_accumulator +
-      (blockIdx.y + blockIdx.x * (glwe_dimension + 1)) * params::degree;
+      &global_accumulator[(blockIdx.y + blockIdx.x * (glwe_dimension + 1)) *
+                          params::degree];
 
   add_to_torus<Torus, params>(accumulator_fft, global_slice, true);
   synchronize_threads_in_block();
@@ -499,8 +498,6 @@ __host__ void execute_compute_keybundle(
   auto lwe_chunk_size = buffer->lwe_chunk_size;
   uint32_t chunk_size =
       std::min(lwe_chunk_size, (lwe_dimension / grouping_factor) - lwe_offset);
-  if (chunk_size == 0)
-    return;
 
   uint32_t keybundle_size_per_input =
       lwe_chunk_size * level_count * (glwe_dimension + 1) *
@@ -559,7 +556,7 @@ execute_step_one(cudaStream_t stream, uint32_t gpu_index,
   //
   auto d_mem = buffer->d_mem_acc_step_one;
   auto global_accumulator = buffer->global_accumulator;
-  auto global_accumulator_fft = buffer->global_accumulator_fft;
+  auto global_accumulator_fft = buffer->global_join_buffer;
 
   dim3 grid_accumulate_step_one(level_count, glwe_dimension + 1, num_samples);
   dim3 thds(polynomial_size / params::opt, 1, 1);
@@ -611,7 +608,7 @@ __host__ void execute_step_two(
   auto d_mem = buffer->d_mem_acc_step_two;
   auto keybundle_fft = buffer->keybundle_fft;
   auto global_accumulator = buffer->global_accumulator;
-  auto global_accumulator_fft = buffer->global_accumulator_fft;
+  auto global_accumulator_fft = buffer->global_join_buffer;
 
   dim3 grid_accumulate_step_two(num_samples, glwe_dimension + 1);
   dim3 thds(polynomial_size / params::opt, 1, 1);
diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_classic.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_classic.cuh
index 910a74c18e..bbdf1ab43e 100644
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_classic.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_classic.cuh
@@ -133,18 +133,17 @@ __global__ void device_programmable_bootstrap_tbc(
     GadgetMatrix<Torus, params> gadget_acc(base_log, level_count,
                                            accumulator_rotated);
     gadget_acc.decompose_and_compress_level(accumulator_fft, blockIdx.x);
-
-    // We are using the same memory space for accumulator_fft and
-    // accumulator_rotated, so we need to synchronize here to make sure they
-    // don't modify the same memory space at the same time
+    NSMFFT_direct<HalfDegree<params>>(accumulator_fft);
     synchronize_threads_in_block();
 
     // Perform G^-1(ACC) * GGSW -> GLWE
-    mul_ggsw_glwe<Torus, cluster_group, params>(
-        accumulator, accumulator_fft, block_join_buffer, bootstrapping_key,
-        polynomial_size, glwe_dimension, level_count, i, cluster, support_dsm);
-
+    mul_ggsw_glwe_in_fourier_domain<cluster_group, params>(
+        accumulator_fft, block_join_buffer, bootstrapping_key, i, cluster,
+        support_dsm);
+    NSMFFT_inverse<HalfDegree<params>>(accumulator_fft);
     synchronize_threads_in_block();
+
+    add_to_torus<Torus, params>(accumulator_fft, accumulator);
   }
 
   auto block_lwe_array_out =
@@ -152,42 +151,44 @@ __global__ void device_programmable_bootstrap_tbc(
                          (glwe_dimension * polynomial_size + 1) +
                      blockIdx.y * polynomial_size];
 
-  if (blockIdx.x == 0 && blockIdx.y < glwe_dimension) {
-    // Perform a sample extract. At this point, all blocks have the result, but
-    // we do the computation at block 0 to avoid waiting for extra blocks, in
-    // case they're not synchronized
-    sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator);
-
-    if (lut_count > 1) {
-      for (int i = 1; i < lut_count; i++) {
-        auto next_lwe_array_out =
-            lwe_array_out +
-            (i * gridDim.z * (glwe_dimension * polynomial_size + 1));
-        auto next_block_lwe_array_out =
-            &next_lwe_array_out[lwe_output_indexes[blockIdx.z] *
-                                    (glwe_dimension * polynomial_size + 1) +
-                                blockIdx.y * polynomial_size];
-
-        sample_extract_mask<Torus, params>(next_block_lwe_array_out,
-                                           accumulator, 1, i * lut_stride);
+  if (blockIdx.x == 0) {
+    if (blockIdx.y < glwe_dimension) {
+      // Perform a sample extract. At this point, all blocks have the result,
+      // but we do the computation at block 0 to avoid waiting for extra blocks,
+      // in case they're not synchronized
+      sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator);
+
+      if (lut_count > 1) {
+        for (int i = 1; i < lut_count; i++) {
+          auto next_lwe_array_out =
+              lwe_array_out +
+              (i * gridDim.z * (glwe_dimension * polynomial_size + 1));
+          auto next_block_lwe_array_out =
+              &next_lwe_array_out[lwe_output_indexes[blockIdx.z] *
+                                      (glwe_dimension * polynomial_size + 1) +
+                                  blockIdx.y * polynomial_size];
+
+          sample_extract_mask<Torus, params>(next_block_lwe_array_out,
+                                             accumulator, 1, i * lut_stride);
+        }
       }
-    }
-  } else if (blockIdx.x == 0 && blockIdx.y == glwe_dimension) {
-    sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0);
-
-    if (lut_count > 1) {
-      for (int i = 1; i < lut_count; i++) {
-
-        auto next_lwe_array_out =
-            lwe_array_out +
-            (i * gridDim.z * (glwe_dimension * polynomial_size + 1));
-        auto next_block_lwe_array_out =
-            &next_lwe_array_out[lwe_output_indexes[blockIdx.z] *
-                                    (glwe_dimension * polynomial_size + 1) +
-                                blockIdx.y * polynomial_size];
-
-        sample_extract_body<Torus, params>(next_block_lwe_array_out,
-                                           accumulator, 0, i * lut_stride);
+    } else if (blockIdx.y == glwe_dimension) {
+      sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0);
+
+      if (lut_count > 1) {
+        for (int i = 1; i < lut_count; i++) {
+
+          auto next_lwe_array_out =
+              lwe_array_out +
+              (i * gridDim.z * (glwe_dimension * polynomial_size + 1));
+          auto next_block_lwe_array_out =
+              &next_lwe_array_out[lwe_output_indexes[blockIdx.z] *
+                                      (glwe_dimension * polynomial_size + 1) +
+                                  blockIdx.y * polynomial_size];
+
+          sample_extract_body<Torus, params>(next_block_lwe_array_out,
+                                             accumulator, 0, i * lut_stride);
+        }
       }
     }
   }
@@ -287,7 +288,7 @@ __host__ void host_programmable_bootstrap_tbc(
   uint64_t partial_dm = full_dm - partial_sm;
 
   int8_t *d_mem = buffer->d_mem;
-  double2 *buffer_fft = buffer->global_accumulator_fft;
+  double2 *buffer_fft = buffer->global_join_buffer;
 
   int thds = polynomial_size / params::opt;
   dim3 grid(level_count, glwe_dimension + 1, input_lwe_ciphertext_count);
diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_multibit.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_multibit.cuh
index ac5fd8051b..ade8d1f423 100644
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_multibit.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_multibit.cuh
@@ -54,9 +54,9 @@ __global__ void __launch_bounds__(params::degree / params::opt)
     selected_memory = &device_mem[block_index * device_memory_size_per_block];
   }
 
-  Torus *accumulator = (Torus *)selected_memory;
+  Torus *accumulator_rotated = (Torus *)selected_memory;
   double2 *accumulator_fft =
-      (double2 *)accumulator +
+      (double2 *)accumulator_rotated +
       (ptrdiff_t)(sizeof(Torus) * polynomial_size / sizeof(double2));
 
   if constexpr (SMD == PARTIALSM) {
@@ -78,13 +78,12 @@ __global__ void __launch_bounds__(params::degree / params::opt)
       &join_buffer[blockIdx.z * level_count * (glwe_dimension + 1) *
                    params::degree / 2];
 
-  Torus *global_slice =
-      global_accumulator +
-      (blockIdx.y + blockIdx.z * (glwe_dimension + 1)) * params::degree;
+  Torus *global_accumulator_slice =
+      &global_accumulator[(blockIdx.y + blockIdx.z * (glwe_dimension + 1)) *
+                          params::degree];
 
-  const double2 *keybundle = keybundle_array +
-                             // select the input
-                             blockIdx.z * keybundle_size_per_input;
+  const double2 *keybundle =
+      &keybundle_array[blockIdx.z * keybundle_size_per_input];
 
   if (lwe_offset == 0) {
     // Put "b" in [0, 2N[
@@ -94,12 +93,12 @@ __global__ void __launch_bounds__(params::degree / params::opt)
 
     divide_by_monomial_negacyclic_inplace<Torus, params::opt,
                                           params::degree / params::opt>(
-        accumulator, &block_lut_vector[blockIdx.y * params::degree], b_hat,
-        false);
+        accumulator_rotated, &block_lut_vector[blockIdx.y * params::degree],
+        b_hat, false);
   } else {
     // Load the accumulator calculated in previous iterations
     copy_polynomial<Torus, params::opt, params::degree / params::opt>(
-        global_slice, accumulator);
+        global_accumulator_slice, accumulator_rotated);
   }
 
   for (int i = 0; (i + lwe_offset) < lwe_dimension && i < lwe_chunk_size; i++) {
@@ -107,75 +106,78 @@ __global__ void __launch_bounds__(params::degree / params::opt)
     // bootstrapped ciphertext
     round_to_closest_multiple_inplace<Torus, params::opt,
                                       params::degree / params::opt>(
-        accumulator, base_log, level_count);
+        accumulator_rotated, base_log, level_count);
 
     // Decompose the accumulator. Each block gets one level of the
     // decomposition, for the mask and the body (so block 0 will have the
     // accumulator decomposed at level 0, 1 at 1, etc.)
-    GadgetMatrix<Torus, params> gadget_acc(base_log, level_count, accumulator);
+    GadgetMatrix<Torus, params> gadget_acc(base_log, level_count,
+                                           accumulator_rotated);
     gadget_acc.decompose_and_compress_level(accumulator_fft, blockIdx.x);
-
-    // We are using the same memory space for accumulator_fft and
-    // accumulator_rotated, so we need to synchronize here to make sure they
-    // don't modify the same memory space at the same time
+    NSMFFT_direct<HalfDegree<params>>(accumulator_fft);
     synchronize_threads_in_block();
 
     // Perform G^-1(ACC) * GGSW -> GLWE
-    mul_ggsw_glwe<Torus, cluster_group, params>(
-        accumulator, accumulator_fft, block_join_buffer, keybundle,
-        polynomial_size, glwe_dimension, level_count, i, cluster, support_dsm);
-
+    mul_ggsw_glwe_in_fourier_domain<cluster_group, params>(
+        accumulator_fft, block_join_buffer, keybundle, i, cluster, support_dsm);
+    NSMFFT_inverse<HalfDegree<params>>(accumulator_fft);
     synchronize_threads_in_block();
+
+    add_to_torus<Torus, params>(accumulator_fft, accumulator_rotated, true);
   }
 
-  if (lwe_offset + lwe_chunk_size >= (lwe_dimension / grouping_factor)) {
-    auto block_lwe_array_out =
-        &lwe_array_out[lwe_output_indexes[blockIdx.z] *
-                           (glwe_dimension * polynomial_size + 1) +
-                       blockIdx.y * polynomial_size];
-
-    if (blockIdx.x == 0 && blockIdx.y < glwe_dimension) {
-      // Perform a sample extract. At this point, all blocks have the result,
-      // but we do the computation at block 0 to avoid waiting for extra blocks,
-      // in case they're not synchronized
-      sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator);
-
-      if (lut_count > 1) {
-        for (int i = 1; i < lut_count; i++) {
-          auto next_lwe_array_out =
-              lwe_array_out +
-              (i * gridDim.z * (glwe_dimension * polynomial_size + 1));
-          auto next_block_lwe_array_out =
-              &next_lwe_array_out[lwe_output_indexes[blockIdx.z] *
-                                      (glwe_dimension * polynomial_size + 1) +
-                                  blockIdx.y * polynomial_size];
-
-          sample_extract_mask<Torus, params>(next_block_lwe_array_out,
-                                             accumulator, 1, i * lut_stride);
+  auto accumulator = accumulator_rotated;
+
+  if (blockIdx.x == 0) {
+    if (lwe_offset + lwe_chunk_size >= (lwe_dimension / grouping_factor)) {
+      auto block_lwe_array_out =
+          &lwe_array_out[lwe_output_indexes[blockIdx.z] *
+                             (glwe_dimension * polynomial_size + 1) +
+                         blockIdx.y * polynomial_size];
+
+      if (blockIdx.y < glwe_dimension) {
+        // Perform a sample extract. At this point, all blocks have the result,
+        // but we do the computation at block 0 to avoid waiting for extra
+        // blocks, in case they're not synchronized
+        sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator);
+
+        if (lut_count > 1) {
+          for (int i = 1; i < lut_count; i++) {
+            auto next_lwe_array_out =
+                lwe_array_out +
+                (i * gridDim.z * (glwe_dimension * polynomial_size + 1));
+            auto next_block_lwe_array_out =
+                &next_lwe_array_out[lwe_output_indexes[blockIdx.z] *
+                                        (glwe_dimension * polynomial_size + 1) +
+                                    blockIdx.y * polynomial_size];
+
+            sample_extract_mask<Torus, params>(next_block_lwe_array_out,
+                                               accumulator, 1, i * lut_stride);
+          }
         }
-      }
-    } else if (blockIdx.x == 0 && blockIdx.y == glwe_dimension) {
-      sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0);
-      if (lut_count > 1) {
-        for (int i = 1; i < lut_count; i++) {
-
-          auto next_lwe_array_out =
-              lwe_array_out +
-              (i * gridDim.z * (glwe_dimension * polynomial_size + 1));
-          auto next_block_lwe_array_out =
-              &next_lwe_array_out[lwe_output_indexes[blockIdx.z] *
-                                      (glwe_dimension * polynomial_size + 1) +
-                                  blockIdx.y * polynomial_size];
-
-          sample_extract_body<Torus, params>(next_block_lwe_array_out,
-                                             accumulator, 0, i * lut_stride);
+      } else if (blockIdx.y == glwe_dimension) {
+        sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0);
+        if (lut_count > 1) {
+          for (int i = 1; i < lut_count; i++) {
+
+            auto next_lwe_array_out =
+                lwe_array_out +
+                (i * gridDim.z * (glwe_dimension * polynomial_size + 1));
+            auto next_block_lwe_array_out =
+                &next_lwe_array_out[lwe_output_indexes[blockIdx.z] *
+                                        (glwe_dimension * polynomial_size + 1) +
+                                    blockIdx.y * polynomial_size];
+
+            sample_extract_body<Torus, params>(next_block_lwe_array_out,
+                                               accumulator, 0, i * lut_stride);
+          }
         }
       }
+    } else {
+      // Load the accumulator calculated in previous iterations
+      copy_polynomial<Torus, params::opt, params::degree / params::opt>(
+          accumulator, global_accumulator_slice);
     }
-  } else {
-    // Load the accumulator calculated in previous iterations
-    copy_polynomial<Torus, params::opt, params::degree / params::opt>(
-        accumulator, global_slice);
   }
 }
 
@@ -326,13 +328,11 @@ __host__ void execute_tbc_external_product_loop(
 
   uint32_t chunk_size =
       std::min(lwe_chunk_size, (lwe_dimension / grouping_factor) - lwe_offset);
-  if (chunk_size == 0)
-    return;
 
   auto d_mem = buffer->d_mem_acc_tbc;
   auto keybundle_fft = buffer->keybundle_fft;
   auto global_accumulator = buffer->global_accumulator;
-  auto buffer_fft = buffer->global_accumulator_fft;
+  auto buffer_fft = buffer->global_join_buffer;
 
   dim3 grid_accumulate(level_count, glwe_dimension + 1, num_samples);
   dim3 thds(polynomial_size / params::opt, 1, 1);
diff --git a/backends/tfhe-cuda-backend/cuda/src/polynomial/functions.cuh b/backends/tfhe-cuda-backend/cuda/src/polynomial/functions.cuh
index f1da499b1c..6b19f08d76 100644
--- a/backends/tfhe-cuda-backend/cuda/src/polynomial/functions.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/polynomial/functions.cuh
@@ -56,8 +56,8 @@ divide_by_monomial_negacyclic_inplace(T *accumulator,
                                       bool zeroAcc, uint32_t num_poly = 1) {
   constexpr int degree = block_size * elems_per_thread;
   for (int z = 0; z < num_poly; z++) {
-    T *accumulator_slice = (T *)accumulator + (ptrdiff_t)(z * degree);
-    const T *input_slice = (T *)input + (ptrdiff_t)(z * degree);
+    T *accumulator_slice = &accumulator[z * degree];
+    const T *input_slice = &input[z * degree];
 
     int tid = threadIdx.x;
     if (zeroAcc) {
@@ -66,9 +66,8 @@ divide_by_monomial_negacyclic_inplace(T *accumulator,
         tid += block_size;
       }
     } else {
-      tid = threadIdx.x;
-      for (int i = 0; i < elems_per_thread; i++) {
-        if (j < degree) {
+      if (j < degree) {
+        for (int i = 0; i < elems_per_thread; i++) {
           // if (tid < degree - j)
           //  accumulator_slice[tid] = input_slice[tid + j];
           // else
@@ -76,8 +75,11 @@ divide_by_monomial_negacyclic_inplace(T *accumulator,
           int x = tid + j - SEL(degree, 0, tid < degree - j);
           accumulator_slice[tid] =
               SEL(-1, 1, tid < degree - j) * input_slice[x];
-        } else {
-          int32_t jj = j - degree;
+          tid += block_size;
+        }
+      } else {
+        int32_t jj = j - degree;
+        for (int i = 0; i < elems_per_thread; i++) {
           // if (tid < degree - jj)
           //  accumulator_slice[tid] = -input_slice[tid + jj];
           // else
@@ -85,8 +87,8 @@ divide_by_monomial_negacyclic_inplace(T *accumulator,
           int x = tid + jj - SEL(degree, 0, tid < degree - jj);
           accumulator_slice[tid] =
               SEL(1, -1, tid < degree - jj) * input_slice[x];
+          tid += block_size;
         }
-        tid += block_size;
       }
     }
   }
@@ -160,9 +162,13 @@ __device__ void round_to_closest_multiple_inplace(T *rotated_acc, int base_log,
   }
 }
 
+/**
+ * In case of classical PBS, this method should accumulate the result.
+ * In case of multi-bit PBS, it should overwrite.
+ */
 template <typename Torus, class params>
 __device__ void add_to_torus(double2 *m_values, Torus *result,
-                             bool init_torus = false) {
+                             bool overwrite_result = false) {
   int tid = threadIdx.x;
 #pragma unroll
   for (int i = 0; i < params::opt / 2; i++) {
@@ -175,7 +181,7 @@ __device__ void add_to_torus(double2 *m_values, Torus *result,
     Torus torus_imag = 0;
     typecast_double_round_to_torus<Torus>(double_imag, torus_imag);
 
-    if (init_torus) {
+    if (overwrite_result) {
       result[tid] = torus_real;
       result[tid + params::degree / 2] = torus_imag;
     } else {
diff --git a/backends/tfhe-cuda-backend/cuda/src/polynomial/polynomial_math.cuh b/backends/tfhe-cuda-backend/cuda/src/polynomial/polynomial_math.cuh
index c054926f70..0f825dca1b 100644
--- a/backends/tfhe-cuda-backend/cuda/src/polynomial/polynomial_math.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/polynomial/polynomial_math.cuh
@@ -3,6 +3,7 @@
 
 #include "crypto/torus.cuh"
 #include "parameters.cuh"
+#include "types/complex/operations.cuh"
 
 template <typename T>
 __device__ T *get_chunk(T *data, int chunk_num, int chunk_size) {
@@ -55,6 +56,27 @@ __device__ void polynomial_product_accumulate_in_fourier_domain(
   }
 }
 
+// Computes result += x
+// If init_accumulator is set, assumes that result was not initialized and does
+// that with the outcome of first * second
+template <class params>
+__device__ void
+polynomial_accumulate_in_fourier_domain(double2 *result, double2 *x,
+                                        bool init_accumulator = false) {
+  auto tid = threadIdx.x;
+  if (init_accumulator) {
+    for (int i = 0; i < params::opt / 2; i++) {
+      result[tid] = x[tid];
+      tid += params::degree / params::opt;
+    }
+  } else {
+    for (int i = 0; i < params::opt / 2; i++) {
+      result[tid] += x[tid];
+      tid += params::degree / params::opt;
+    }
+  }
+}
+
 // This method expects to work with polynomial_size / compression_params::opt
 // threads in the x-block If init_accumulator is set, assumes that result was
 // not initialized and does that with the outcome of first * second
diff --git a/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/tests/test_classical_pbs.cpp b/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/tests/test_classical_pbs.cpp
index 5ddb1430f2..11e4dd3122 100644
--- a/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/tests/test_classical_pbs.cpp
+++ b/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/tests/test_classical_pbs.cpp
@@ -233,147 +233,15 @@ ::testing::internal::ParamGenerator<ClassicalProgrammableBootstrapTestParams>
         // n, k, N, lwe_variance, glwe_variance, pbs_base_log, pbs_level,
         // message_modulus, carry_modulus, number_of_inputs, repetitions,
         // samples
-        // BOOLEAN_DEFAULT_PARAMETERS
+        // PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64
         (ClassicalProgrammableBootstrapTestParams){
-            777, 3, 512, new_gaussian_from_std_dev(sqrt(1.3880686109937e-11)),
-            new_gaussian_from_std_dev(sqrt(1.1919984450689246e-23)), 18, 1, 2,
-            2, 2, 2, 40},
-        // BOOLEAN_TFHE_LIB_PARAMETERS
+            887, 1, 2048, new_t_uniform(46), new_t_uniform(17), 22, 1, 4, 4,
+            100, 1, 1},
+        // PARAM_MESSAGE_3_CARRY_3_KS_PBS_GAUSSIAN_2M64
         (ClassicalProgrammableBootstrapTestParams){
-            830, 2, 1024,
-            new_gaussian_from_std_dev(sqrt(1.994564705573226e-12)),
-            new_gaussian_from_std_dev(sqrt(8.645717832544903e-32)), 23, 1, 2, 2,
-            2, 2, 40},
-        // SHORTINT_PARAM_MESSAGE_1_CARRY_0
-        (ClassicalProgrammableBootstrapTestParams){
-            678, 5, 256, new_gaussian_from_std_dev(sqrt(5.203010004723453e-10)),
-            new_gaussian_from_std_dev(sqrt(1.3996292326131784e-19)), 15, 1, 2,
-            1, 2, 2, 40},
-        // SHORTINT_PARAM_MESSAGE_1_CARRY_1
-        (ClassicalProgrammableBootstrapTestParams){
-            684, 3, 512, new_gaussian_from_std_dev(sqrt(4.177054989616946e-10)),
-            new_gaussian_from_std_dev(sqrt(1.1919984450689246e-23)), 18, 1, 2,
-            2, 2, 2, 40},
-        // SHORTINT_PARAM_MESSAGE_2_CARRY_0
-        (ClassicalProgrammableBootstrapTestParams){
-            656, 2, 512,
-            new_gaussian_from_std_dev(sqrt(1.1641198952558192e-09)),
-            new_gaussian_from_std_dev(sqrt(1.6434266310406663e-15)), 8, 2, 4, 1,
-            2, 2, 40},
-        // SHORTINT_PARAM_MESSAGE_1_CARRY_2
-        // SHORTINT_PARAM_MESSAGE_2_CARRY_1
-        // SHORTINT_PARAM_MESSAGE_3_CARRY_0
-        (ClassicalProgrammableBootstrapTestParams){
-            742, 2, 1024,
-            new_gaussian_from_std_dev(sqrt(4.998277131225527e-11)),
-            new_gaussian_from_std_dev(sqrt(8.645717832544903e-32)), 23, 1, 2, 4,
-            2, 2, 40},
-        // SHORTINT_PARAM_MESSAGE_1_CARRY_3
-        // SHORTINT_PARAM_MESSAGE_2_CARRY_2
-        // SHORTINT_PARAM_MESSAGE_3_CARRY_1
-        // SHORTINT_PARAM_MESSAGE_4_CARRY_0
-        (ClassicalProgrammableBootstrapTestParams){
-            745, 1, 2048,
-            new_gaussian_from_std_dev(sqrt(4.478453795193731e-11)),
-            new_gaussian_from_std_dev(sqrt(8.645717832544903e-32)), 23, 1, 2, 8,
-            2, 2, 40},
-        // SHORTINT_PARAM_MESSAGE_5_CARRY_0
-        // SHORTINT_PARAM_MESSAGE_3_CARRY_2
-        (ClassicalProgrammableBootstrapTestParams){
-            807, 1, 4096,
-            new_gaussian_from_std_dev(sqrt(4.629015039118823e-12)),
-            new_gaussian_from_std_dev(sqrt(4.70197740328915e-38)), 22, 1, 32, 1,
-            2, 1, 40},
-        // SHORTINT_PARAM_MESSAGE_6_CARRY_0
-        (ClassicalProgrammableBootstrapTestParams){
-            915, 1, 8192,
-            new_gaussian_from_std_dev(sqrt(8.883173851180252e-14)),
-            new_gaussian_from_std_dev(sqrt(4.70197740328915e-38)), 22, 1, 64, 1,
-            2, 1, 2},
-        // SHORTINT_PARAM_MESSAGE_3_CARRY_3
-        (ClassicalProgrammableBootstrapTestParams){
-            864, 1, 8192,
-            new_gaussian_from_std_dev(sqrt(1.5843564961097632e-15)),
-            new_gaussian_from_std_dev(sqrt(4.70197740328915e-38)), 15, 2, 8, 8,
-            2, 1, 2},
-        // SHORTINT_PARAM_MESSAGE_4_CARRY_3
-        // SHORTINT_PARAM_MESSAGE_7_CARRY_0
-        (ClassicalProgrammableBootstrapTestParams){
-            930, 1, 16384,
-            new_gaussian_from_std_dev(sqrt(5.129877458078009e-14)),
-            new_gaussian_from_std_dev(sqrt(4.70197740328915e-38)), 15, 2, 128,
-            1, 2, 1, 1},
-
-        // BOOLEAN_DEFAULT_PARAMETERS
-        (ClassicalProgrammableBootstrapTestParams){
-            777, 3, 512, new_gaussian_from_std_dev(sqrt(1.3880686109937e-11)),
-            new_gaussian_from_std_dev(sqrt(1.1919984450689246e-23)), 18, 1, 2,
-            2, 100, 2, 40},
-        // BOOLEAN_TFHE_LIB_PARAMETERS
-        (ClassicalProgrammableBootstrapTestParams){
-            830, 2, 1024,
-            new_gaussian_from_std_dev(sqrt(1.994564705573226e-12)),
-            new_gaussian_from_std_dev(sqrt(8.645717832544903e-32)), 23, 1, 2, 2,
-            100, 2, 40},
-        // SHORTINT_PARAM_MESSAGE_1_CARRY_0
-        (ClassicalProgrammableBootstrapTestParams){
-            678, 5, 256, new_gaussian_from_std_dev(sqrt(5.203010004723453e-10)),
-            new_gaussian_from_std_dev(sqrt(1.3996292326131784e-19)), 15, 1, 2,
-            1, 100, 2, 40},
-        // SHORTINT_PARAM_MESSAGE_1_CARRY_1
-        (ClassicalProgrammableBootstrapTestParams){
-            684, 3, 512, new_gaussian_from_std_dev(sqrt(4.177054989616946e-10)),
-            new_gaussian_from_std_dev(sqrt(1.1919984450689246e-23)), 18, 1, 2,
-            2, 100, 2, 40},
-        // SHORTINT_PARAM_MESSAGE_2_CARRY_0
-        (ClassicalProgrammableBootstrapTestParams){
-            656, 2, 512,
-            new_gaussian_from_std_dev(sqrt(1.1641198952558192e-09)),
-            new_gaussian_from_std_dev(sqrt(1.6434266310406663e-15)), 8, 2, 4, 1,
-            100, 2, 40},
-        // SHORTINT_PARAM_MESSAGE_1_CARRY_2
-        // SHORTINT_PARAM_MESSAGE_2_CARRY_1
-        // SHORTINT_PARAM_MESSAGE_3_CARRY_0
-        (ClassicalProgrammableBootstrapTestParams){
-            742, 2, 1024,
-            new_gaussian_from_std_dev(sqrt(4.998277131225527e-11)),
-            new_gaussian_from_std_dev(sqrt(8.645717832544903e-32)), 23, 1, 2, 4,
-            100, 2, 40},
-        // SHORTINT_PARAM_MESSAGE_1_CARRY_3
-        // SHORTINT_PARAM_MESSAGE_2_CARRY_2
-        // SHORTINT_PARAM_MESSAGE_3_CARRY_1
-        // SHORTINT_PARAM_MESSAGE_4_CARRY_0
-        (ClassicalProgrammableBootstrapTestParams){
-            745, 1, 2048,
-            new_gaussian_from_std_dev(sqrt(4.478453795193731e-11)),
-            new_gaussian_from_std_dev(sqrt(8.645717832544903e-32)), 23, 1, 2, 8,
-            100, 2, 40},
-        // SHORTINT_PARAM_MESSAGE_5_CARRY_0
-        // SHORTINT_PARAM_MESSAGE_3_CARRY_2
-        (ClassicalProgrammableBootstrapTestParams){
-            807, 1, 4096,
-            new_gaussian_from_std_dev(sqrt(4.629015039118823e-12)),
-            new_gaussian_from_std_dev(sqrt(4.70197740328915e-38)), 22, 1, 32, 1,
-            100, 1, 40},
-        // SHORTINT_PARAM_MESSAGE_6_CARRY_0
-        (ClassicalProgrammableBootstrapTestParams){
-            915, 1, 8192,
-            new_gaussian_from_std_dev(sqrt(8.883173851180252e-14)),
-            new_gaussian_from_std_dev(sqrt(4.70197740328915e-38)), 22, 1, 64, 1,
-            100, 1, 2},
-        // SHORTINT_PARAM_MESSAGE_3_CARRY_3
-        (ClassicalProgrammableBootstrapTestParams){
-            864, 1, 8192,
-            new_gaussian_from_std_dev(sqrt(1.5843564961097632e-15)),
-            new_gaussian_from_std_dev(sqrt(4.70197740328915e-38)), 15, 2, 8, 8,
-            100, 1, 2},
-        // SHORTINT_PARAM_MESSAGE_4_CARRY_3
-        // SHORTINT_PARAM_MESSAGE_7_CARRY_0
-        (ClassicalProgrammableBootstrapTestParams){
-            930, 1, 16384,
-            new_gaussian_from_std_dev(sqrt(5.129877458078009e-14)),
-            new_gaussian_from_std_dev(sqrt(4.70197740328915e-38)), 15, 2, 128,
-            1, 100, 1, 1});
+            977, 1, 8192, new_gaussian_from_std_dev(3.0144389706858286e-07),
+            new_gaussian_from_std_dev(2.168404344971009e-19), 16, 2, 8, 8, 100,
+            1, 1});
 std::string printParamName(
     ::testing::TestParamInfo<ClassicalProgrammableBootstrapTestParams> p) {
   ClassicalProgrammableBootstrapTestParams params = p.param;
diff --git a/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/tests/test_multibit_pbs.cpp b/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/tests/test_multibit_pbs.cpp
index 1adfd773b0..eec69ddd38 100644
--- a/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/tests/test_multibit_pbs.cpp
+++ b/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/tests/test_multibit_pbs.cpp
@@ -171,70 +171,44 @@ TEST_P(MultiBitProgrammableBootstrapTestPrimitives_u64,
   }
 }
 
+/**
+  int lwe_dimension;
+  int glwe_dimension;
+  int polynomial_size;
+  DynamicDistribution lwe_noise_distribution;
+  DynamicDistribution glwe_noise_distribution;
+  int pbs_base_log;
+  int pbs_level;
+  int message_modulus;
+  int carry_modulus;
+  int number_of_inputs;
+  int grouping_factor;
+  int repetitions;
+  int samples;
+ */
 // Defines for which parameters set the PBS will be tested.
 // It executes each src for all pairs on phis X qs (Cartesian product)
 ::testing::internal::ParamGenerator<MultiBitProgrammableBootstrapTestParams>
     multipbs_params_u64 = ::testing::Values(
-        // fast src
-        (MultiBitProgrammableBootstrapTestParams){
-            16, 1, 256, new_gaussian_from_std_dev(sqrt(1.3880686109937e-11)),
-            new_gaussian_from_std_dev(sqrt(1.1919984450689246e-23)), 23, 1, 2,
-            2, 1, 2, 1, 10},
-        (MultiBitProgrammableBootstrapTestParams){
-            16, 1, 256, new_gaussian_from_std_dev(sqrt(1.3880686109937e-11)),
-            new_gaussian_from_std_dev(sqrt(1.1919984450689246e-23)), 23, 1, 2,
-            2, 128, 2, 1, 10},
-        // 4_bits_multi_bit_group_2
-        (MultiBitProgrammableBootstrapTestParams){
-            818, 1, 2048, new_gaussian_from_std_dev(sqrt(1.3880686109937e-11)),
-            new_gaussian_from_std_dev(sqrt(1.1919984450689246e-23)), 22, 1, 2,
-            2, 1, 2, 1, 10},
-        (MultiBitProgrammableBootstrapTestParams){
-            818, 1, 2048, new_gaussian_from_std_dev(sqrt(1.3880686109937e-15)),
-            new_gaussian_from_std_dev(sqrt(1.1919984450689246e-24)), 22, 1, 2,
-            2, 128, 2, 1, 10},
-        // 4_bits_multi_bit_group_3
-        (MultiBitProgrammableBootstrapTestParams){
-            888, 1, 2048,
-            new_gaussian_from_std_dev(sqrt(4.9571231961752025e-12)),
-            new_gaussian_from_std_dev(sqrt(9.9409770026944e-32)), 21, 1, 2, 2,
-            1, 3, 1, 10},
-        (MultiBitProgrammableBootstrapTestParams){
-            888, 1, 16384,
-            new_gaussian_from_std_dev(sqrt(4.9571231961752025e-12)),
-            new_gaussian_from_std_dev(sqrt(9.9409770026944e-32)), 21, 1, 2, 2,
-            1, 3, 1, 1},
-
-        (MultiBitProgrammableBootstrapTestParams){
-            888, 1, 1024,
-            new_gaussian_from_std_dev(sqrt(4.9571231961752025e-12)),
-            new_gaussian_from_std_dev(sqrt(9.9409770026944e-32)), 21, 1, 2, 2,
-            128, 3, 1, 10},
-        (MultiBitProgrammableBootstrapTestParams){
-            888, 1, 2048,
-            new_gaussian_from_std_dev(sqrt(4.9571231961752025e-12)),
-            new_gaussian_from_std_dev(sqrt(9.9409770026944e-32)), 21, 1, 2, 2,
-            128, 3, 1, 10},
+        // PARAM_GPU_MULTI_BIT_GROUP_3_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64
         (MultiBitProgrammableBootstrapTestParams){
-            888, 1, 4096,
-            new_gaussian_from_std_dev(sqrt(4.9571231961752025e-12)),
-            new_gaussian_from_std_dev(sqrt(9.9409770026944e-32)), 21, 1, 2, 2,
-            128, 3, 1, 10},
+            882, 1, 2048, new_t_uniform(46), new_t_uniform(17), 14, 2, 8, 8,
+            100, 3, 1, 1},
+        // PARAM_GPU_MULTI_BIT_GROUP_3_MESSAGE_3_CARRY_3_KS_PBS_GAUSSIAN_2M64
         (MultiBitProgrammableBootstrapTestParams){
-            888, 1, 8192,
-            new_gaussian_from_std_dev(sqrt(4.9571231961752025e-12)),
-            new_gaussian_from_std_dev(sqrt(9.9409770026944e-32)), 21, 1, 2, 2,
-            128, 3, 1, 1},
+            978, 1, 8192, new_gaussian_from_std_dev((2.962875621642539e-07)),
+            new_gaussian_from_std_dev((2.168404344971009e-19)), 14, 2, 8, 8,
+            100, 3, 1, 1},
+        // PARAM_GPU_MULTI_BIT_GROUP_2_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M64
         (MultiBitProgrammableBootstrapTestParams){
-            888, 1, 16384,
-            new_gaussian_from_std_dev(sqrt(4.9571231961752025e-12)),
-            new_gaussian_from_std_dev(sqrt(9.9409770026944e-32)), 21, 1, 2, 2,
-            128, 3, 1, 1},
+            836, 1, 2048, new_gaussian_from_std_dev((3.433444883863949e-06)),
+            new_gaussian_from_std_dev((2.845267479601915e-15)), 22, 2, 4, 4,
+            100, 2, 1, 1},
+        // PARAM_GPU_MULTI_BIT_GROUP_2_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M64
         (MultiBitProgrammableBootstrapTestParams){
-            972, 1, 8192,
-            new_gaussian_from_std_dev(sqrt(4.9571231961752025e-12)),
-            new_gaussian_from_std_dev(sqrt(9.9409770026944e-32)), 14, 2, 8, 8,
-            68, 3, 1, 1});
+            978, 1, 8192, new_gaussian_from_std_dev((2.962875621642539e-07)),
+            new_gaussian_from_std_dev((2.168404344971009e-19)), 14, 2, 8, 8,
+            100, 2, 1, 1});
 
 std::string printParamName(
     ::testing::TestParamInfo<MultiBitProgrammableBootstrapTestParams> p) {