diff --git a/backends/tfhe-cuda-backend/cuda/include/device.h b/backends/tfhe-cuda-backend/cuda/include/device.h
index bcb2c6cbe9..3c3a61b8f6 100644
--- a/backends/tfhe-cuda-backend/cuda/include/device.h
+++ b/backends/tfhe-cuda-backend/cuda/include/device.h
@@ -27,6 +27,15 @@ inline void cuda_error(cudaError_t code, const char *file, int line) {
     std::abort(); \
   }
 
+cudaEvent_t cuda_create_event(uint32_t gpu_index);
+
+void cuda_event_record(cudaEvent_t event, cudaStream_t stream,
+    uint32_t gpu_index);
+void cuda_stream_wait_event(cudaStream_t stream, cudaEvent_t event,
+    uint32_t gpu_index);
+
+void cuda_event_destroy(cudaEvent_t event, uint32_t gpu_index);
+
 cudaStream_t cuda_create_stream(uint32_t gpu_index);
 
 void cuda_destroy_stream(cudaStream_t stream, uint32_t gpu_index);
diff --git a/backends/tfhe-cuda-backend/cuda/include/integer/integer.h b/backends/tfhe-cuda-backend/cuda/include/integer/integer.h
index 898daba86d..325891b860 100644
--- a/backends/tfhe-cuda-backend/cuda/include/integer/integer.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/integer.h
@@ -35,6 +35,8 @@ enum CMP_ORDERING { IS_INFERIOR = 0, IS_EQUAL = 1, IS_SUPERIOR = 2 };
 
 enum SIGNED_OPERATION { ADDITION = 1, SUBTRACTION = -1 };
 
+enum outputFlag { FLAG_NONE = 0, FLAG_OVERFLOW = 1, FLAG_CARRY = 2 };
+
 extern "C" {
 void scratch_cuda_apply_univariate_lut_kb_64(
     void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
@@ -282,23 +284,61 @@ void scratch_cuda_propagate_single_carry_kb_64_inplace(
     uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
     uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
     uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
-    uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory);
+    uint32_t carry_modulus, PBS_TYPE pbs_type, uint32_t requested_flag,
+    uint32_t uses_carry, bool allocate_gpu_memory);
+
+void scratch_cuda_add_and_propagate_single_carry_kb_64_inplace(
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
+    uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
+    uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
+    uint32_t carry_modulus, PBS_TYPE pbs_type, uint32_t requested_flag,
+    uint32_t uses_carry, bool allocate_gpu_memory);
 
 void cuda_propagate_single_carry_kb_64_inplace(
     void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    void *lwe_array, void *carry_out, int8_t *mem_ptr, void *const *bsks,
-    void *const *ksks, uint32_t num_blocks);
+    void *lwe_array, void *carry_out, const void *carry_in, int8_t *mem_ptr,
+    void *const *bsks, void *const *ksks, uint32_t num_blocks,
+    uint32_t requested_flag, uint32_t uses_carry);
 
-void cuda_propagate_single_carry_get_input_carries_kb_64_inplace(
+void cuda_add_and_propagate_single_carry_kb_64_inplace(
     void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    void *lwe_array, void *carry_out, void *input_carries, int8_t *mem_ptr,
-    void *const *bsks, void *const *ksks, uint32_t num_blocks);
+    void *lhs_array, const void *rhs_array, void *carry_out,
+    const void *carry_in, int8_t *mem_ptr, void *const *bsks, void *const *ksks,
+    uint32_t num_blocks, uint32_t requested_flag, uint32_t uses_carry);
 
 void cleanup_cuda_propagate_single_carry(void *const *streams,
                                          uint32_t const *gpu_indexes,
                                          uint32_t gpu_count, int8_t
**mem_ptr_void); +void cleanup_cuda_add_and_propagate_single_carry(void *const *streams, + uint32_t const *gpu_indexes, + uint32_t gpu_count, + int8_t **mem_ptr_void); + +void scratch_cuda_integer_overflowing_sub_kb_64_inplace( + void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count, + int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size, + uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level, + uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log, + uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus, + uint32_t carry_modulus, PBS_TYPE pbs_type, uint32_t compute_overflow, + bool allocate_gpu_memory); + +void cuda_integer_overflowing_sub_kb_64_inplace( + void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count, + void *lhs_array, const void *rhs_array, void *overflow_block, + const void *input_borrow, int8_t *mem_ptr, void *const *bsks, + void *const *ksks, uint32_t num_blocks, uint32_t compute_overflow, + uint32_t uses_input_borrow); + +void cleanup_cuda_integer_overflowing_sub(void *const *streams, + uint32_t const *gpu_indexes, + uint32_t gpu_count, + int8_t **mem_ptr_void); + void scratch_cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64( void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size, @@ -318,25 +358,6 @@ void cleanup_cuda_integer_radix_partial_sum_ciphertexts_vec( void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr_void); -void scratch_cuda_integer_radix_overflowing_sub_kb_64( - void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count, - int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size, - uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level, - uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log, - uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus, - uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory); - -void cuda_integer_radix_overflowing_sub_kb_64( - void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count, - void *radix_lwe_out, void *radix_lwe_overflowed, void const *radix_lwe_left, - void const *radix_lwe_right, int8_t *mem_ptr, void *const *bsks, - void *const *ksks, uint32_t num_blocks_in_radix); - -void cleanup_cuda_integer_radix_overflowing_sub(void *const *streams, - uint32_t const *gpu_indexes, - uint32_t gpu_count, - int8_t **mem_ptr_void); - void scratch_cuda_integer_scalar_mul_kb_64( void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size, @@ -376,26 +397,6 @@ void cleanup_cuda_integer_div_rem(void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr_void); -void scratch_cuda_signed_overflowing_add_or_sub_radix_ciphertext_kb_64( - void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count, - int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size, - uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level, - uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log, - uint32_t grouping_factor, uint32_t num_blocks, int8_t signed_operation, - uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type, - bool allocate_gpu_memory); - -void cuda_signed_overflowing_add_or_sub_radix_ciphertext_kb_64( - void *const *streams, uint32_t const *gpu_indexes, uint32_t 
gpu_count, - void *lhs, void const *rhs, void *overflowed, int8_t signed_operation, - int8_t *mem_ptr, void *const *bsks, void *const *ksks, - uint32_t num_blocks_in_radix); - -void cleanup_signed_overflowing_add_or_sub(void *const *streams, - uint32_t const *gpu_indexes, - uint32_t gpu_count, - int8_t **mem_ptr_void); - void scratch_cuda_integer_compute_prefix_sum_hillis_steele_64( void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr, void const *input_lut, uint32_t lwe_dimension, diff --git a/backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h b/backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h index 1a985c1ca0..ff3fac680f 100644 --- a/backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h +++ b/backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h @@ -53,6 +53,12 @@ void generate_device_accumulator(cudaStream_t stream, uint32_t gpu_index, uint32_t carry_modulus, std::function f); +template +void generate_many_lut_device_accumulator( + cudaStream_t stream, uint32_t gpu_index, Torus *acc, + uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t message_modulus, + uint32_t carry_modulus, std::vector> &f); + struct int_radix_params { PBS_TYPE pbs_type; uint32_t glwe_dimension; @@ -316,6 +322,113 @@ template struct int_radix_lut { num_radix_blocks * sizeof(Torus)); } + // Construction for many luts + int_radix_lut(cudaStream_t const *streams, uint32_t const *gpu_indexes, + uint32_t gpu_count, int_radix_params params, uint32_t num_luts, + uint32_t num_radix_blocks, uint32_t num_many_lut, + bool allocate_gpu_memory) { + + this->params = params; + this->num_blocks = num_radix_blocks; + this->num_luts = num_luts; + Torus lut_indexes_size = num_radix_blocks * sizeof(Torus); + Torus lut_buffer_size = + (params.glwe_dimension + 1) * params.polynomial_size * sizeof(Torus); + + /////////////// + active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count); + cuda_synchronize_stream(streams[0], gpu_indexes[0]); + for (uint i = 0; i < active_gpu_count; i++) { + cudaSetDevice(i); + int8_t *gpu_pbs_buffer; + auto num_blocks_on_gpu = + get_num_inputs_on_gpu(num_radix_blocks, i, active_gpu_count); + + execute_scratch_pbs( + streams[i], gpu_indexes[i], &gpu_pbs_buffer, params.glwe_dimension, + params.small_lwe_dimension, params.polynomial_size, params.pbs_level, + params.grouping_factor, num_blocks_on_gpu, params.pbs_type, + allocate_gpu_memory); + cuda_synchronize_stream(streams[i], gpu_indexes[i]); + buffer.push_back(gpu_pbs_buffer); + } + + if (allocate_gpu_memory) { + // Allocate LUT + // LUT is used as a trivial encryption and must be initialized outside + // this constructor + for (uint i = 0; i < active_gpu_count; i++) { + auto lut = (Torus *)cuda_malloc_async(num_luts * lut_buffer_size, + streams[i], gpu_indexes[i]); + auto lut_indexes = (Torus *)cuda_malloc_async( + lut_indexes_size, streams[i], gpu_indexes[i]); + // lut_indexes is initialized to 0 by default + // if a different behavior is wanted, it should be rewritten later + cuda_memset_async(lut_indexes, 0, lut_indexes_size, streams[i], + gpu_indexes[i]); + + lut_vec.push_back(lut); + lut_indexes_vec.push_back(lut_indexes); + + cuda_synchronize_stream(streams[i], gpu_indexes[i]); + } + + // lwe_(input/output)_indexes are initialized to range(num_radix_blocks) + // by default + lwe_indexes_in = (Torus *)cuda_malloc_async( + num_radix_blocks * sizeof(Torus), streams[0], gpu_indexes[0]); + lwe_indexes_out = (Torus *)cuda_malloc_async( + 
num_radix_blocks * sizeof(Torus), streams[0], gpu_indexes[0]); + lwe_trivial_indexes = (Torus *)cuda_malloc_async( + num_radix_blocks * sizeof(Torus), streams[0], gpu_indexes[0]); + + h_lwe_indexes_in = (Torus *)malloc(num_radix_blocks * sizeof(Torus)); + h_lwe_indexes_out = (Torus *)malloc(num_radix_blocks * sizeof(Torus)); + + for (int i = 0; i < num_radix_blocks; i++) + h_lwe_indexes_in[i] = i; + + cuda_memcpy_async_to_gpu(lwe_indexes_in, h_lwe_indexes_in, + num_radix_blocks * sizeof(Torus), streams[0], + gpu_indexes[0]); + cuda_memcpy_async_to_gpu(lwe_indexes_out, h_lwe_indexes_in, + num_radix_blocks * sizeof(Torus), streams[0], + gpu_indexes[0]); + cuda_memcpy_async_to_gpu(lwe_trivial_indexes, h_lwe_indexes_in, + num_radix_blocks * sizeof(Torus), streams[0], + gpu_indexes[0]); + memcpy(h_lwe_indexes_out, h_lwe_indexes_in, + num_radix_blocks * sizeof(Torus)); + + /// With multiple GPUs we allocate arrays to be pushed to the vectors and + /// copy data on each GPU then when we gather data to GPU 0 we can copy + /// back to the original indexing + multi_gpu_alloc_lwe_async(streams, gpu_indexes, active_gpu_count, + lwe_array_in_vec, num_radix_blocks, + params.big_lwe_dimension + 1); + multi_gpu_alloc_lwe_async(streams, gpu_indexes, active_gpu_count, + lwe_after_ks_vec, num_radix_blocks, + params.small_lwe_dimension + 1); + multi_gpu_alloc_lwe_many_lut_output_async( + streams, gpu_indexes, active_gpu_count, lwe_after_pbs_vec, + num_radix_blocks, num_many_lut, params.big_lwe_dimension + 1); + multi_gpu_alloc_array_async(streams, gpu_indexes, active_gpu_count, + lwe_trivial_indexes_vec, num_radix_blocks); + cuda_synchronize_stream(streams[0], gpu_indexes[0]); + multi_gpu_copy_array_async(streams, gpu_indexes, active_gpu_count, + lwe_trivial_indexes_vec, lwe_trivial_indexes, + num_radix_blocks); + + // Keyswitch + Torus big_size = + (params.big_lwe_dimension + 1) * num_radix_blocks * sizeof(Torus); + Torus small_size = + (params.small_lwe_dimension + 1) * num_radix_blocks * sizeof(Torus); + tmp_lwe_before_ks = + (Torus *)cuda_malloc_async(big_size, streams[0], gpu_indexes[0]); + } + } + // Return a pointer to idx-ith lut at gpu_index's global memory Torus *get_lut(uint32_t gpu_index, size_t idx) { auto lut = lut_vec[gpu_index]; @@ -377,14 +490,14 @@ template struct int_radix_lut { cuda_drop_async(lut_vec[i], streams[i], gpu_indexes[i]); cuda_drop_async(lut_indexes_vec[i], streams[i], gpu_indexes[i]); } - lut_vec.clear(); - lut_indexes_vec.clear(); cuda_drop_async(lwe_indexes_in, streams[0], gpu_indexes[0]); cuda_drop_async(lwe_indexes_out, streams[0], gpu_indexes[0]); cuda_drop_async(lwe_trivial_indexes, streams[0], gpu_indexes[0]); cuda_synchronize_stream(streams[0], gpu_indexes[0]); + lut_vec.clear(); + lut_indexes_vec.clear(); free(h_lwe_indexes_in); free(h_lwe_indexes_out); @@ -755,7 +868,7 @@ template struct int_fullprop_buffer { } }; -template struct int_sc_prop_memory { +template struct int_legacy_sc_prop_memory { Torus *generates_or_propagates; Torus *step_output; @@ -767,9 +880,10 @@ template struct int_sc_prop_memory { int_radix_params params; - int_sc_prop_memory(cudaStream_t const *streams, uint32_t const *gpu_indexes, - uint32_t gpu_count, int_radix_params params, - uint32_t num_radix_blocks, bool allocate_gpu_memory) { + int_legacy_sc_prop_memory(cudaStream_t const *streams, + uint32_t const *gpu_indexes, uint32_t gpu_count, + int_radix_params params, uint32_t num_radix_blocks, + bool allocate_gpu_memory) { this->params = params; auto glwe_dimension = params.glwe_dimension; auto 
polynomial_size = params.polynomial_size; @@ -879,8 +993,6 @@ template struct int_overflowing_sub_memory { Torus *generates_or_propagates; Torus *step_output; - // luts_array[2] = {lut_does_block_generate_carry, - // lut_does_block_generate_or_propagate} int_radix_lut *luts_array; int_radix_lut *luts_borrow_propagation_sum; int_radix_lut *message_acc; @@ -975,146 +1087,1324 @@ template struct int_overflowing_sub_memory { glwe_dimension, polynomial_size, message_modulus, carry_modulus, f_message_acc); - luts_array->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]); - luts_borrow_propagation_sum->broadcast_lut(streams, gpu_indexes, - gpu_indexes[0]); - message_acc->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]); - } + luts_array->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]); + luts_borrow_propagation_sum->broadcast_lut(streams, gpu_indexes, + gpu_indexes[0]); + message_acc->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]); + } + + void release(cudaStream_t const *streams, uint32_t const *gpu_indexes, + uint32_t gpu_count) { + cuda_drop_async(generates_or_propagates, streams[0], gpu_indexes[0]); + cuda_drop_async(step_output, streams[0], gpu_indexes[0]); + + luts_array->release(streams, gpu_indexes, gpu_count); + luts_borrow_propagation_sum->release(streams, gpu_indexes, gpu_count); + message_acc->release(streams, gpu_indexes, gpu_count); + + delete luts_array; + delete luts_borrow_propagation_sum; + delete message_acc; + } +}; + +template struct int_sum_ciphertexts_vec_memory { + Torus *new_blocks; + Torus *new_blocks_copy; + Torus *old_blocks; + Torus *small_lwe_vector; + int_radix_params params; + + int32_t *d_smart_copy_in; + int32_t *d_smart_copy_out; + + bool mem_reuse = false; + + int_sum_ciphertexts_vec_memory(cudaStream_t const *streams, + uint32_t const *gpu_indexes, + uint32_t gpu_count, int_radix_params params, + uint32_t num_blocks_in_radix, + uint32_t max_num_radix_in_vec, + bool allocate_gpu_memory) { + this->params = params; + + int max_pbs_count = num_blocks_in_radix * max_num_radix_in_vec; + + // allocate gpu memory for intermediate buffers + new_blocks = (Torus *)cuda_malloc_async( + max_pbs_count * (params.big_lwe_dimension + 1) * sizeof(Torus), + streams[0], gpu_indexes[0]); + new_blocks_copy = (Torus *)cuda_malloc_async( + max_pbs_count * (params.big_lwe_dimension + 1) * sizeof(Torus), + streams[0], gpu_indexes[0]); + old_blocks = (Torus *)cuda_malloc_async( + max_pbs_count * (params.big_lwe_dimension + 1) * sizeof(Torus), + streams[0], gpu_indexes[0]); + small_lwe_vector = (Torus *)cuda_malloc_async( + max_pbs_count * (params.small_lwe_dimension + 1) * sizeof(Torus), + streams[0], gpu_indexes[0]); + cuda_memset_async(new_blocks, 0, + max_pbs_count * (params.big_lwe_dimension + 1) * + sizeof(Torus), + streams[0], gpu_indexes[0]); + cuda_memset_async(new_blocks_copy, 0, + max_pbs_count * (params.big_lwe_dimension + 1) * + sizeof(Torus), + streams[0], gpu_indexes[0]); + cuda_memset_async(old_blocks, 0, + max_pbs_count * (params.big_lwe_dimension + 1) * + sizeof(Torus), + streams[0], gpu_indexes[0]); + cuda_memset_async(small_lwe_vector, 0, + max_pbs_count * (params.small_lwe_dimension + 1) * + sizeof(Torus), + streams[0], gpu_indexes[0]); + + d_smart_copy_in = (int32_t *)cuda_malloc_async( + max_pbs_count * sizeof(int32_t), streams[0], gpu_indexes[0]); + d_smart_copy_out = (int32_t *)cuda_malloc_async( + max_pbs_count * sizeof(int32_t), streams[0], gpu_indexes[0]); + cuda_memset_async(d_smart_copy_in, 0, max_pbs_count * sizeof(int32_t), + streams[0], 
gpu_indexes[0]); + cuda_memset_async(d_smart_copy_out, 0, max_pbs_count * sizeof(int32_t), + streams[0], gpu_indexes[0]); + } + + int_sum_ciphertexts_vec_memory(cudaStream_t const *streams, + uint32_t const *gpu_indexes, + uint32_t gpu_count, int_radix_params params, + uint32_t num_blocks_in_radix, + uint32_t max_num_radix_in_vec, + Torus *new_blocks, Torus *old_blocks, + Torus *small_lwe_vector) { + mem_reuse = true; + this->params = params; + + int max_pbs_count = num_blocks_in_radix * max_num_radix_in_vec; + + // assign gpu memory for intermediate buffers + this->new_blocks = new_blocks; + this->old_blocks = old_blocks; + this->small_lwe_vector = small_lwe_vector; + new_blocks_copy = (Torus *)cuda_malloc_async( + max_pbs_count * (params.big_lwe_dimension + 1) * sizeof(Torus), + streams[0], gpu_indexes[0]); + cuda_memset_async(new_blocks_copy, 0, + max_pbs_count * (params.big_lwe_dimension + 1) * + sizeof(Torus), + streams[0], gpu_indexes[0]); + + d_smart_copy_in = (int32_t *)cuda_malloc_async( + max_pbs_count * sizeof(int32_t), streams[0], gpu_indexes[0]); + d_smart_copy_out = (int32_t *)cuda_malloc_async( + max_pbs_count * sizeof(int32_t), streams[0], gpu_indexes[0]); + cuda_memset_async(d_smart_copy_in, 0, max_pbs_count * sizeof(int32_t), + streams[0], gpu_indexes[0]); + cuda_memset_async(d_smart_copy_out, 0, max_pbs_count * sizeof(int32_t), + streams[0], gpu_indexes[0]); + } + + void release(cudaStream_t const *streams, uint32_t const *gpu_indexes, + uint32_t gpu_count) { + cuda_drop_async(d_smart_copy_in, streams[0], gpu_indexes[0]); + cuda_drop_async(d_smart_copy_out, streams[0], gpu_indexes[0]); + + if (!mem_reuse) { + cuda_drop_async(new_blocks, streams[0], gpu_indexes[0]); + cuda_drop_async(old_blocks, streams[0], gpu_indexes[0]); + cuda_drop_async(small_lwe_vector, streams[0], gpu_indexes[0]); + } + + cuda_drop_async(new_blocks_copy, streams[0], gpu_indexes[0]); + } +}; +// For sequential algorithm in group propagation +template struct int_seq_group_prop_memory { + + Torus *group_resolved_carries; + int_radix_lut *lut_sequential_algorithm; + uint32_t grouping_size; + + int_seq_group_prop_memory(cudaStream_t const *streams, + uint32_t const *gpu_indexes, uint32_t gpu_count, + int_radix_params params, uint32_t group_size, + uint32_t big_lwe_size_bytes, + bool allocate_gpu_memory) { + + auto glwe_dimension = params.glwe_dimension; + auto polynomial_size = params.polynomial_size; + auto message_modulus = params.message_modulus; + auto carry_modulus = params.carry_modulus; + + grouping_size = group_size; + group_resolved_carries = (Torus *)cuda_malloc_async( + (grouping_size)*big_lwe_size_bytes, streams[0], gpu_indexes[0]); + cuda_memset_async(group_resolved_carries, 0, + (grouping_size)*big_lwe_size_bytes, streams[0], + gpu_indexes[0]); + + int num_seq_luts = grouping_size - 1; + Torus *h_seq_lut_indexes = (Torus *)malloc(num_seq_luts * sizeof(Torus)); + lut_sequential_algorithm = new int_radix_lut( + streams, gpu_indexes, gpu_count, params, num_seq_luts, num_seq_luts, + allocate_gpu_memory); + for (int index = 0; index < num_seq_luts; index++) { + auto f_lut_sequential = [index](Torus propa_cum_sum_block) { + return (propa_cum_sum_block >> (index + 1)) & 1; + }; + auto seq_lut = lut_sequential_algorithm->get_lut(gpu_indexes[0], index); + generate_device_accumulator( + streams[0], gpu_indexes[0], seq_lut, glwe_dimension, polynomial_size, + message_modulus, carry_modulus, f_lut_sequential); + h_seq_lut_indexes[index] = index; + } + Torus *seq_lut_indexes = + 
lut_sequential_algorithm->get_lut_indexes(gpu_indexes[0], 0); + cuda_memcpy_async_to_gpu(seq_lut_indexes, h_seq_lut_indexes, + num_seq_luts * sizeof(Torus), streams[0], + gpu_indexes[0]); + + lut_sequential_algorithm->broadcast_lut(streams, gpu_indexes, + gpu_indexes[0]); + free(h_seq_lut_indexes); + }; + void release(cudaStream_t const *streams, uint32_t const *gpu_indexes, + uint32_t gpu_count) { + cuda_drop_async(group_resolved_carries, streams[0], gpu_indexes[0]); + lut_sequential_algorithm->release(streams, gpu_indexes, gpu_count); + delete lut_sequential_algorithm; + }; +}; + +// For hillis steele algorithm in group propagation +template struct int_hs_group_prop_memory { + + int_radix_lut *lut_hillis_steele; + uint32_t grouping_size; + + int_hs_group_prop_memory(cudaStream_t const *streams, + uint32_t const *gpu_indexes, uint32_t gpu_count, + int_radix_params params, uint32_t num_groups, + uint32_t big_lwe_size_bytes, + bool allocate_gpu_memory) { + + auto glwe_dimension = params.glwe_dimension; + auto polynomial_size = params.polynomial_size; + auto message_modulus = params.message_modulus; + auto carry_modulus = params.carry_modulus; + + auto f_lut_hillis_steele = [](Torus msb, Torus lsb) -> Torus { + if (msb == 2) { + return 1; // Remap Generate to 1 + } else if (msb == 3) { + // MSB propagates + if (lsb == 2) { + return 1; + } else { + return lsb; + } // also remap here + } else { + return msb; + } + }; + + lut_hillis_steele = + new int_radix_lut(streams, gpu_indexes, gpu_count, params, 1, + num_groups, allocate_gpu_memory); + + auto hillis_steele_lut = lut_hillis_steele->get_lut(gpu_indexes[0], 0); + generate_device_accumulator_bivariate( + streams[0], gpu_indexes[0], hillis_steele_lut, glwe_dimension, + polynomial_size, message_modulus, carry_modulus, f_lut_hillis_steele); + + lut_hillis_steele->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]); + }; + void release(cudaStream_t const *streams, uint32_t const *gpu_indexes, + uint32_t gpu_count) { + + lut_hillis_steele->release(streams, gpu_indexes, gpu_count); + delete lut_hillis_steele; + } +}; + +// compute_shifted_blocks_and_block_states +template struct int_shifted_blocks_and_states_memory { + Torus *shifted_blocks_and_states; + Torus *shifted_blocks; + Torus *block_states; + + int_radix_lut *luts_array_first_step; + + int_shifted_blocks_and_states_memory( + cudaStream_t const *streams, uint32_t const *gpu_indexes, + uint32_t gpu_count, int_radix_params params, uint32_t num_radix_blocks, + uint32_t num_many_lut, uint32_t grouping_size, bool allocate_gpu_memory) { + + auto glwe_dimension = params.glwe_dimension; + auto polynomial_size = params.polynomial_size; + auto message_modulus = params.message_modulus; + auto carry_modulus = params.carry_modulus; + auto big_lwe_size = (polynomial_size * glwe_dimension + 1); + auto big_lwe_size_bytes = big_lwe_size * sizeof(Torus); + + shifted_blocks_and_states = (Torus *)cuda_malloc_async( + num_many_lut * num_radix_blocks * big_lwe_size_bytes, streams[0], + gpu_indexes[0]); + cuda_memset_async(shifted_blocks_and_states, 0, + num_many_lut * num_radix_blocks * big_lwe_size_bytes, + streams[0], gpu_indexes[0]); + shifted_blocks = (Torus *)cuda_malloc_async( + num_radix_blocks * big_lwe_size_bytes, streams[0], gpu_indexes[0]); + cuda_memset_async(shifted_blocks, 0, num_radix_blocks * big_lwe_size_bytes, + streams[0], gpu_indexes[0]); + block_states = (Torus *)cuda_malloc_async( + num_radix_blocks * big_lwe_size_bytes, streams[0], gpu_indexes[0]); + cuda_memset_async(block_states, 0, 
num_radix_blocks * big_lwe_size_bytes, + streams[0], gpu_indexes[0]); + + uint32_t num_luts_first_step = 2 * grouping_size + 1; + + luts_array_first_step = new int_radix_lut( + streams, gpu_indexes, gpu_count, params, num_luts_first_step, + num_radix_blocks, num_many_lut, allocate_gpu_memory); + + auto f_shift_block = [message_modulus](Torus block) -> Torus { + return (block % message_modulus) << 1; + }; + + auto f_first_block_state = [message_modulus](Torus block) -> Torus { + if (block >= message_modulus) + return OUTPUT_CARRY::GENERATED; + else { + return OUTPUT_CARRY::NONE; + } + }; + std::vector> f_first_grouping_luts = { + f_first_block_state, f_shift_block}; + + auto first_block_lut = luts_array_first_step->get_lut(gpu_indexes[0], 0); + + generate_many_lut_device_accumulator( + streams[0], gpu_indexes[0], first_block_lut, glwe_dimension, + polynomial_size, message_modulus, carry_modulus, f_first_grouping_luts); + + // luts for other blocks of the first grouping + for (int lut_id = 1; lut_id < grouping_size; lut_id++) { + auto f_state = [message_modulus, lut_id](Torus block) -> Torus { + uint64_t r = 0; + if (block >= message_modulus) { + r = 2; // Generates Carry + } else if (block == (message_modulus - 1)) { + r = 1; // Propagates a carry + } else { + r = 0; // Does not generate carry + } + return r << (lut_id - 1); + }; + std::vector> f_grouping_luts = { + f_state, f_shift_block}; + auto lut = luts_array_first_step->get_lut(gpu_indexes[0], lut_id); + generate_many_lut_device_accumulator( + streams[0], gpu_indexes[0], lut, glwe_dimension, polynomial_size, + message_modulus, carry_modulus, f_grouping_luts); + } + + // luts for the rest of groupings (except for the last block) + for (int i = 0; i < grouping_size; i++) { + uint32_t lut_id = i + grouping_size; + auto f_state = [message_modulus, i](Torus block) -> Torus { + uint64_t r = 0; + if (block >= message_modulus) { + r = 2; // Generates Carry + } else if (block == (message_modulus - 1)) { + r = 1; // Propagates a carry + } else { + r = 0; // Does not borrow + } + return r << i; + }; + std::vector> f_grouping_luts = { + f_state, f_shift_block}; + + auto lut = luts_array_first_step->get_lut(gpu_indexes[0], lut_id); + + generate_many_lut_device_accumulator( + streams[0], gpu_indexes[0], lut, glwe_dimension, polynomial_size, + message_modulus, carry_modulus, f_grouping_luts); + } + + // For the last block we need to generate a new lut + auto f_last_block_state = [message_modulus](Torus block) -> Torus { + if (block >= message_modulus) + return 2 << 1; // Generates + else + return 0; // Nothing + }; + + uint32_t lut_id = num_luts_first_step - 1; // The last lut of the first step + + auto last_block_lut = + luts_array_first_step->get_lut(gpu_indexes[0], lut_id); + + std::vector> f_last_grouping_luts = { + f_last_block_state, f_shift_block}; + + generate_many_lut_device_accumulator( + streams[0], gpu_indexes[0], last_block_lut, glwe_dimension, + polynomial_size, message_modulus, carry_modulus, f_last_grouping_luts); + + // Generate the indexes to switch between luts within the pbs + Torus lut_indexes_size = num_radix_blocks * sizeof(Torus); + Torus *h_lut_indexes = (Torus *)malloc(lut_indexes_size); + + for (int index = 0; index < num_radix_blocks; index++) { + uint32_t grouping_index = index / grouping_size; + bool is_in_first_grouping = (grouping_index == 0); + uint32_t index_in_grouping = index % grouping_size; + bool is_last_index = (index == (num_radix_blocks - 1)); + if (is_last_index) { + if (num_radix_blocks == 1) { + 
h_lut_indexes[index] = 2 * grouping_size; + } else { + h_lut_indexes[index] = 2; + } + } else if (is_in_first_grouping) { + h_lut_indexes[index] = index_in_grouping; + } else { + h_lut_indexes[index] = index_in_grouping + grouping_size; + } + } + + // copy the indexes to the gpu + Torus *lut_indexes = + luts_array_first_step->get_lut_indexes(gpu_indexes[0], 0); + cuda_memcpy_async_to_gpu(lut_indexes, h_lut_indexes, lut_indexes_size, + streams[0], gpu_indexes[0]); + // Do I need to do something else for the multi-gpu? + + luts_array_first_step->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]); + + free(h_lut_indexes); + }; + void release(cudaStream_t const *streams, uint32_t const *gpu_indexes, + uint32_t gpu_count) { + + cuda_drop_async(shifted_blocks_and_states, streams[0], gpu_indexes[0]); + cuda_drop_async(shifted_blocks, streams[0], gpu_indexes[0]); + cuda_drop_async(block_states, streams[0], gpu_indexes[0]); + + luts_array_first_step->release(streams, gpu_indexes, gpu_count); + delete luts_array_first_step; + }; +}; + +// compute_propagation simulator and group carries +template struct int_prop_simu_group_carries_memory { + Torus *scalar_array_cum_sum; + Torus *propagation_cum_sums; + Torus *simulators; + Torus *grouping_pgns; + Torus *prepared_blocks; + + Torus *resolved_carries; + + int_radix_lut *luts_array_second_step; + + int_seq_group_prop_memory *seq_group_prop_mem; + int_hs_group_prop_memory *hs_group_prop_mem; + + uint32_t group_size; + bool use_sequential_algorithm_to_resolver_group_carries; + + int_prop_simu_group_carries_memory( + cudaStream_t const *streams, uint32_t const *gpu_indexes, + uint32_t gpu_count, int_radix_params params, uint32_t num_radix_blocks, + uint32_t grouping_size, uint32_t num_groups, bool allocate_gpu_memory) { + + auto glwe_dimension = params.glwe_dimension; + auto polynomial_size = params.polynomial_size; + auto message_modulus = params.message_modulus; + auto carry_modulus = params.carry_modulus; + auto big_lwe_size = (polynomial_size * glwe_dimension + 1); + auto big_lwe_size_bytes = big_lwe_size * sizeof(Torus); + + uint32_t block_modulus = message_modulus * carry_modulus; + uint32_t num_bits_in_block = std::log2(block_modulus); + + group_size = grouping_size; + + scalar_array_cum_sum = (Torus *)cuda_malloc_async( + num_radix_blocks * sizeof(Torus), streams[0], gpu_indexes[0]); + cuda_memset_async(scalar_array_cum_sum, 0, num_radix_blocks * sizeof(Torus), + streams[0], gpu_indexes[0]); + propagation_cum_sums = (Torus *)cuda_malloc_async( + num_radix_blocks * big_lwe_size_bytes, streams[0], gpu_indexes[0]); + cuda_memset_async(propagation_cum_sums, 0, + num_radix_blocks * big_lwe_size_bytes, streams[0], + gpu_indexes[0]); + simulators = (Torus *)cuda_malloc_async( + num_radix_blocks * big_lwe_size_bytes, streams[0], gpu_indexes[0]); + cuda_memset_async(simulators, 0, num_radix_blocks * big_lwe_size_bytes, + streams[0], gpu_indexes[0]); + + grouping_pgns = (Torus *)cuda_malloc_async(num_groups * big_lwe_size_bytes, + streams[0], gpu_indexes[0]); + cuda_memset_async(grouping_pgns, 0, num_groups * big_lwe_size_bytes, + streams[0], gpu_indexes[0]); + + prepared_blocks = (Torus *)cuda_malloc_async( + num_radix_blocks * big_lwe_size_bytes, streams[0], gpu_indexes[0]); + cuda_memset_async(prepared_blocks, 0, num_radix_blocks * big_lwe_size_bytes, + streams[0], gpu_indexes[0]); + + resolved_carries = (Torus *)cuda_malloc_async( + (num_groups + 1) * big_lwe_size_bytes, streams[0], gpu_indexes[0]); + cuda_memset_async(resolved_carries, 0, + (num_groups 
+ 1) * big_lwe_size_bytes, streams[0], + gpu_indexes[0]); + + // create lut objects for step 2 + Torus lut_indexes_size = num_radix_blocks * sizeof(Torus); + uint32_t num_carry_to_resolve = num_groups - 1; + uint32_t saturated_sub = + ((num_carry_to_resolve > 1) ? num_carry_to_resolve - 1 : 0); + uint32_t sequential_depth = saturated_sub / (grouping_size - 1); + uint32_t hillis_steel_depth; + + if (num_carry_to_resolve == 0) { + hillis_steel_depth = 0; + } else { + hillis_steel_depth = std::ceil(std::log2(num_carry_to_resolve)); + } + + use_sequential_algorithm_to_resolver_group_carries = + sequential_depth <= hillis_steel_depth; + uint32_t num_extra_luts = 0; + if (use_sequential_algorithm_to_resolver_group_carries) { + num_extra_luts = (grouping_size - 1); + } else { + num_extra_luts = 1; + } + + uint32_t num_luts_second_step = 2 * grouping_size + num_extra_luts; + luts_array_second_step = new int_radix_lut( + streams, gpu_indexes, gpu_count, params, num_luts_second_step, + num_radix_blocks, allocate_gpu_memory); + + // luts for first group inner propagation + for (int lut_id = 0; lut_id < grouping_size - 1; lut_id++) { + auto f_first_grouping_inner_propagation = + [lut_id](Torus propa_cum_sum_block) -> Torus { + uint64_t carry = (propa_cum_sum_block >> lut_id) & 1; + + if (carry != 0) { + return 2ull; // Generates Carry + } else { + return 0ull; // Does not generate carry + } + }; + + auto lut = luts_array_second_step->get_lut(gpu_indexes[0], lut_id); + generate_device_accumulator( + streams[0], gpu_indexes[0], lut, glwe_dimension, polynomial_size, + message_modulus, carry_modulus, f_first_grouping_inner_propagation); + } + + auto f_first_grouping_outer_propagation = + [num_bits_in_block](Torus block) -> Torus { + return (block >> (num_bits_in_block - 1)) & 1; + }; + + int lut_id = grouping_size - 1; + auto lut_first_group_outer = + luts_array_second_step->get_lut(gpu_indexes[0], lut_id); + generate_device_accumulator( + streams[0], gpu_indexes[0], lut_first_group_outer, glwe_dimension, + polynomial_size, message_modulus, carry_modulus, + f_first_grouping_outer_propagation); + + // for other groupings inner propagation + for (int index = 0; index < grouping_size; index++) { + uint32_t lut_id = index + grouping_size; + + auto f_other_groupings_inner_propagation = + [index](Torus propa_cum_sum_block) -> Torus { + uint64_t mask = (2 << index) - 1; + if (propa_cum_sum_block >= (2 << index)) { + return 2ull; // Generates + } else if ((propa_cum_sum_block & mask) == mask) { + return 1ull; // Propagate + } else { + return 0ull; // Nothing + } + }; + + auto lut = luts_array_second_step->get_lut(gpu_indexes[0], lut_id); + generate_device_accumulator( + streams[0], gpu_indexes[0], lut, glwe_dimension, polynomial_size, + message_modulus, carry_modulus, f_other_groupings_inner_propagation); + } + + if (use_sequential_algorithm_to_resolver_group_carries) { + for (int index = 0; index < grouping_size - 1; index++) { + uint32_t lut_id = index + 2 * grouping_size; + + auto f_group_propagation = [index, block_modulus, + num_bits_in_block](Torus block) -> Torus { + if (block == (block_modulus - 1)) { + return 0ull; + } else { + return ((UINT64_MAX << index) % (1ull << (num_bits_in_block + 1))); + } + }; + + auto lut = luts_array_second_step->get_lut(gpu_indexes[0], lut_id); + generate_device_accumulator( + streams[0], gpu_indexes[0], lut, glwe_dimension, polynomial_size, + message_modulus, carry_modulus, f_group_propagation); + } + } else { + uint32_t lut_id = 2 * grouping_size; + auto 
f_group_propagation = [block_modulus](Torus block) { + if (block == (block_modulus - 1)) { + return 2ull; + } else { + return UINT64_MAX % (block_modulus * 2ull); + } + }; + + auto lut = luts_array_second_step->get_lut(gpu_indexes[0], lut_id); + generate_device_accumulator( + streams[0], gpu_indexes[0], lut, glwe_dimension, polynomial_size, + message_modulus, carry_modulus, f_group_propagation); + } + + Torus *h_second_lut_indexes = (Torus *)malloc(lut_indexes_size); + + Torus *h_scalar_array_cum_sum = + (Torus *)malloc(num_radix_blocks * sizeof(Torus)); + + for (int index = 0; index < num_radix_blocks; index++) { + uint32_t grouping_index = index / grouping_size; + bool is_in_first_grouping = (grouping_index == 0); + uint32_t index_in_grouping = index % grouping_size; + + if (is_in_first_grouping) { + h_second_lut_indexes[index] = index_in_grouping; + } else if (index_in_grouping == (grouping_size - 1)) { + if (use_sequential_algorithm_to_resolver_group_carries) { + int inner_index = (grouping_index - 1) % (grouping_size - 1); + h_second_lut_indexes[index] = inner_index + 2 * grouping_size; + } else { + h_second_lut_indexes[index] = 2 * grouping_size; + } + } else { + h_second_lut_indexes[index] = index_in_grouping + grouping_size; + } + + bool may_have_its_padding_bit_set = + !is_in_first_grouping && (index_in_grouping == grouping_size - 1); + + if (may_have_its_padding_bit_set) { + if (use_sequential_algorithm_to_resolver_group_carries) { + h_scalar_array_cum_sum[index] = + 1 << ((grouping_index - 1) % (grouping_size - 1)); + } else { + h_scalar_array_cum_sum[index] = 1; + } + } else { + h_scalar_array_cum_sum[index] = 0; + } + } + + // copy the indexes to the gpu + Torus *second_lut_indexes = + luts_array_second_step->get_lut_indexes(gpu_indexes[0], 0); + cuda_memcpy_async_to_gpu(second_lut_indexes, h_second_lut_indexes, + lut_indexes_size, streams[0], gpu_indexes[0]); + + cuda_memcpy_async_to_gpu(scalar_array_cum_sum, h_scalar_array_cum_sum, + num_radix_blocks * sizeof(Torus), streams[0], + gpu_indexes[0]); + luts_array_second_step->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]); + + if (use_sequential_algorithm_to_resolver_group_carries) { + + seq_group_prop_mem = new int_seq_group_prop_memory( + streams, gpu_indexes, gpu_count, params, grouping_size, + big_lwe_size_bytes, true); + + } else { + hs_group_prop_mem = new int_hs_group_prop_memory( + streams, gpu_indexes, gpu_count, params, num_groups, + big_lwe_size_bytes, true); + } + + free(h_scalar_array_cum_sum); + free(h_second_lut_indexes); + }; + + // needed for the division to update the lut indexes + void update_lut_indexes(cudaStream_t const *streams, + uint32_t const *gpu_indexes, Torus *new_lut_indexes, + Torus *new_scalars, uint32_t new_num_blocks) { + Torus *lut_indexes = + luts_array_second_step->get_lut_indexes(gpu_indexes[0], 0); + cuda_memcpy_async_gpu_to_gpu(lut_indexes, new_lut_indexes, + new_num_blocks * sizeof(Torus), streams[0], + gpu_indexes[0]); + + luts_array_second_step->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]); + + cuda_memcpy_async_gpu_to_gpu(scalar_array_cum_sum, new_scalars, + new_num_blocks * sizeof(Torus), streams[0], + gpu_indexes[0]); + } + + void release(cudaStream_t const *streams, uint32_t const *gpu_indexes, + uint32_t gpu_count) { + cuda_drop_async(scalar_array_cum_sum, streams[0], gpu_indexes[0]); + cuda_drop_async(propagation_cum_sums, streams[0], gpu_indexes[0]); + cuda_drop_async(simulators, streams[0], gpu_indexes[0]); + cuda_drop_async(grouping_pgns, streams[0], 
gpu_indexes[0]); + cuda_drop_async(prepared_blocks, streams[0], gpu_indexes[0]); + cuda_drop_async(resolved_carries, streams[0], gpu_indexes[0]); + + luts_array_second_step->release(streams, gpu_indexes, gpu_count); + + if (use_sequential_algorithm_to_resolver_group_carries) { + seq_group_prop_mem->release(streams, gpu_indexes, gpu_count); + delete seq_group_prop_mem; + } else { + hs_group_prop_mem->release(streams, gpu_indexes, gpu_count); + delete hs_group_prop_mem; + } + + delete luts_array_second_step; + }; +}; + +template struct int_sc_prop_memory { + uint32_t num_many_lut; + uint32_t lut_stride; + + uint32_t group_size; + uint32_t num_groups; + Torus *output_flag; + Torus *last_lhs; + Torus *last_rhs; + int_radix_lut *lut_message_extract; + + int_radix_lut *lut_overflow_flag_prep; + int_radix_lut *lut_overflow_flag_last; + int_radix_lut *lut_carry_flag_last; + + int_shifted_blocks_and_states_memory *shifted_blocks_state_mem; + int_prop_simu_group_carries_memory *prop_simu_group_carries_mem; + + int_radix_params params; + bool use_sequential_algorithm_to_resolver_group_carries; + uint32_t requested_flag; + + uint32_t active_gpu_count; + cudaStream_t *sub_streams_1; + cudaStream_t *sub_streams_2; + + cudaEvent_t *incoming_events1; + cudaEvent_t *incoming_events2; + cudaEvent_t *outgoing_events1; + cudaEvent_t *outgoing_events2; + cudaEvent_t *outgoing_events3; + cudaEvent_t *outgoing_events4; + + int_sc_prop_memory(cudaStream_t const *streams, uint32_t const *gpu_indexes, + uint32_t gpu_count, int_radix_params params, + uint32_t num_radix_blocks, uint32_t requested_flag_in, + uint32_t uses_carry, bool allocate_gpu_memory) { + this->params = params; + auto glwe_dimension = params.glwe_dimension; + auto polynomial_size = params.polynomial_size; + auto message_modulus = params.message_modulus; + auto carry_modulus = params.carry_modulus; + auto big_lwe_size = (polynomial_size * glwe_dimension + 1); + auto big_lwe_size_bytes = big_lwe_size * sizeof(Torus); + requested_flag = requested_flag_in; + // for compute shifted blocks and block states + uint32_t block_modulus = message_modulus * carry_modulus; + uint32_t num_bits_in_block = std::log2(block_modulus); + uint32_t grouping_size = num_bits_in_block; + group_size = grouping_size; + num_groups = (num_radix_blocks + grouping_size - 1) / grouping_size; + + num_many_lut = 2; // many luts apply 2 luts + uint32_t box_size = polynomial_size / block_modulus; + lut_stride = (block_modulus / num_many_lut) * box_size; + + shifted_blocks_state_mem = new int_shifted_blocks_and_states_memory( + streams, gpu_indexes, gpu_count, params, num_radix_blocks, num_many_lut, + grouping_size, true); + + prop_simu_group_carries_mem = new int_prop_simu_group_carries_memory( + streams, gpu_indexes, gpu_count, params, num_radix_blocks, + grouping_size, num_groups, true); + + // Step 3 elements + lut_message_extract = + new int_radix_lut(streams, gpu_indexes, gpu_count, params, 1, + num_radix_blocks, allocate_gpu_memory); + // lut for the first block in the first grouping + auto f_message_extract = [message_modulus](Torus block) -> Torus { + return (block >> 1) % message_modulus; + }; + + auto extract_lut = lut_message_extract->get_lut(gpu_indexes[0], 0); + + generate_device_accumulator( + streams[0], gpu_indexes[0], extract_lut, glwe_dimension, + polynomial_size, message_modulus, carry_modulus, f_message_extract); + + lut_message_extract->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]); + + // This store a single block that with be used to store the overflow or 
+ // carry results + output_flag = (Torus *)cuda_malloc_async(big_lwe_size_bytes, streams[0], + gpu_indexes[0]); + cuda_memset_async(output_flag, 0, big_lwe_size_bytes, streams[0], + gpu_indexes[0]); + + if (requested_flag == outputFlag::FLAG_OVERFLOW) { + last_lhs = (Torus *)cuda_malloc_async(big_lwe_size_bytes, streams[0], + gpu_indexes[0]); + last_rhs = (Torus *)cuda_malloc_async(big_lwe_size_bytes, streams[0], + gpu_indexes[0]); + cuda_memset_async(last_lhs, 0, big_lwe_size_bytes, streams[0], + gpu_indexes[0]); + cuda_memset_async(last_rhs, 0, big_lwe_size_bytes, streams[0], + gpu_indexes[0]); + + // For step 1 overflow should be enable only if flag overflow + uint32_t num_bits_in_message = std::log2(message_modulus); + lut_overflow_flag_prep = new int_radix_lut( + streams, gpu_indexes, gpu_count, params, 1, 1, allocate_gpu_memory); + + auto f_overflow_fp = [num_bits_in_message](Torus lhs, + Torus rhs) -> Torus { + Torus mask = (1 << (num_bits_in_message - 1)) - 1; + Torus lhs_except_last_bit = lhs & mask; + Torus rhs_except_last_bit = rhs & mask; + Torus input_carry1 = 1; + Torus input_carry2 = 0; + + Torus output_carry1 = + ((lhs + rhs + input_carry1) >> num_bits_in_message) & 1; + Torus output_carry2 = + ((lhs + rhs + input_carry2) >> num_bits_in_message) & 1; + Torus input_carry_last_bit1 = + ((lhs_except_last_bit + rhs_except_last_bit + input_carry1) >> + (num_bits_in_message - 1)) & + 1; + Torus input_carry_last_bit2 = + ((lhs_except_last_bit + rhs_except_last_bit + input_carry2) >> + (num_bits_in_message - 1)) & + 1; + + Torus output1 = (Torus)(input_carry_last_bit1 != output_carry1); + Torus output2 = (Torus)(input_carry_last_bit2 != output_carry2); + + return output1 << 3 | output2 << 2; + }; + + auto overflow_flag_prep_lut = + lut_overflow_flag_prep->get_lut(gpu_indexes[0], 0); + + generate_device_accumulator_bivariate( + streams[0], gpu_indexes[0], overflow_flag_prep_lut, glwe_dimension, + polynomial_size, message_modulus, carry_modulus, f_overflow_fp); + + lut_overflow_flag_prep->broadcast_lut(streams, gpu_indexes, + gpu_indexes[0]); + } + + // For the final cleanup in case of overflow or carry (it seems that I can) + // It seems that this lut could be apply together with the other one but for + // now we won't do it + if (requested_flag == outputFlag::FLAG_OVERFLOW) { // Overflow case + lut_overflow_flag_last = new int_radix_lut( + streams, gpu_indexes, gpu_count, params, 1, 1, allocate_gpu_memory); + + auto f_overflow_last = [num_radix_blocks, + requested_flag_in](Torus block) -> Torus { + uint32_t position = (num_radix_blocks == 1 && + requested_flag_in == outputFlag::FLAG_OVERFLOW) + ? 
0 + : 1; + Torus input_carry = (block >> position) & 1; + Torus does_overflow_if_carry_is_1 = (block >> 3) & 1; + Torus does_overflow_if_carry_is_0 = (block >> 2) & 1; + if (input_carry == outputFlag::FLAG_OVERFLOW) { + return does_overflow_if_carry_is_1; + } else { + return does_overflow_if_carry_is_0; + } + }; + auto overflow_flag_last = + lut_overflow_flag_last->get_lut(gpu_indexes[0], 0); + + generate_device_accumulator( + streams[0], gpu_indexes[0], overflow_flag_last, glwe_dimension, + polynomial_size, message_modulus, carry_modulus, f_overflow_last); + + lut_overflow_flag_last->broadcast_lut(streams, gpu_indexes, + gpu_indexes[0]); + } + if (requested_flag == outputFlag::FLAG_CARRY) { // Carry case + lut_carry_flag_last = new int_radix_lut( + streams, gpu_indexes, gpu_count, params, 1, 1, allocate_gpu_memory); + + auto f_carry_last = [](Torus block) -> Torus { + return ((block >> 2) & 1); + }; + auto carry_flag_last = lut_carry_flag_last->get_lut(gpu_indexes[0], 0); + + generate_device_accumulator( + streams[0], gpu_indexes[0], carry_flag_last, glwe_dimension, + polynomial_size, message_modulus, carry_modulus, f_carry_last); + + lut_carry_flag_last->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]); + } + + active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count); + sub_streams_1 = + (cudaStream_t *)malloc(active_gpu_count * sizeof(cudaStream_t)); + sub_streams_2 = + (cudaStream_t *)malloc(active_gpu_count * sizeof(cudaStream_t)); + for (uint j = 0; j < active_gpu_count; j++) { + sub_streams_1[j] = cuda_create_stream(gpu_indexes[j]); + sub_streams_2[j] = cuda_create_stream(gpu_indexes[j]); + } + + incoming_events1 = + (cudaEvent_t *)malloc(active_gpu_count * sizeof(cudaEvent_t)); + incoming_events2 = + (cudaEvent_t *)malloc(active_gpu_count * sizeof(cudaEvent_t)); + outgoing_events1 = + (cudaEvent_t *)malloc(active_gpu_count * sizeof(cudaEvent_t)); + outgoing_events2 = + (cudaEvent_t *)malloc(active_gpu_count * sizeof(cudaEvent_t)); + outgoing_events3 = + (cudaEvent_t *)malloc(active_gpu_count * sizeof(cudaEvent_t)); + outgoing_events4 = + (cudaEvent_t *)malloc(active_gpu_count * sizeof(cudaEvent_t)); + + for (uint j = 0; j < active_gpu_count; j++) { + incoming_events1[j] = cuda_create_event(gpu_indexes[j]); + incoming_events2[j] = cuda_create_event(gpu_indexes[j]); + outgoing_events1[j] = cuda_create_event(gpu_indexes[j]); + outgoing_events2[j] = cuda_create_event(gpu_indexes[j]); + outgoing_events3[j] = cuda_create_event(gpu_indexes[j]); + outgoing_events4[j] = cuda_create_event(gpu_indexes[j]); + } + }; + + void release(cudaStream_t const *streams, uint32_t const *gpu_indexes, + uint32_t gpu_count) { + + shifted_blocks_state_mem->release(streams, gpu_indexes, gpu_count); + prop_simu_group_carries_mem->release(streams, gpu_indexes, gpu_count); + cuda_drop_async(output_flag, streams[0], gpu_indexes[0]); + lut_message_extract->release(streams, gpu_indexes, gpu_count); + delete lut_message_extract; + + if (requested_flag == outputFlag::FLAG_OVERFLOW) { // In case of overflow + lut_overflow_flag_prep->release(streams, gpu_indexes, gpu_count); + lut_overflow_flag_last->release(streams, gpu_indexes, gpu_count); + delete lut_overflow_flag_prep; + delete lut_overflow_flag_last; + cuda_drop_async(last_lhs, streams[0], gpu_indexes[0]); + cuda_drop_async(last_rhs, streams[0], gpu_indexes[0]); + } + if (requested_flag == outputFlag::FLAG_CARRY) { // In case of carry + lut_carry_flag_last->release(streams, gpu_indexes, gpu_count); + delete lut_carry_flag_last; + } + + // release 
sub streams + for (uint i = 0; i < active_gpu_count; i++) { + cuda_destroy_stream(sub_streams_1[i], gpu_indexes[i]); + cuda_destroy_stream(sub_streams_2[i], gpu_indexes[i]); + } + free(sub_streams_1); + free(sub_streams_2); + + // release events + for (uint j = 0; j < active_gpu_count; j++) { + cuda_event_destroy(incoming_events1[j], gpu_indexes[j]); + cuda_event_destroy(incoming_events2[j], gpu_indexes[j]); + cuda_event_destroy(outgoing_events1[j], gpu_indexes[j]); + cuda_event_destroy(outgoing_events2[j], gpu_indexes[j]); + cuda_event_destroy(outgoing_events3[j], gpu_indexes[j]); + cuda_event_destroy(outgoing_events4[j], gpu_indexes[j]); + } + free(incoming_events1); + free(incoming_events2); + free(outgoing_events1); + free(outgoing_events2); + free(outgoing_events3); + free(outgoing_events4); + }; +}; + +template struct int_shifted_blocks_and_borrow_states_memory { + Torus *shifted_blocks_and_borrow_states; + Torus *shifted_blocks; + Torus *borrow_states; + + int_radix_lut *luts_array_first_step; + + int_shifted_blocks_and_borrow_states_memory( + cudaStream_t const *streams, uint32_t const *gpu_indexes, + uint32_t gpu_count, int_radix_params params, uint32_t num_radix_blocks, + uint32_t num_many_lut, uint32_t grouping_size, bool allocate_gpu_memory) { + + auto glwe_dimension = params.glwe_dimension; + auto polynomial_size = params.polynomial_size; + auto message_modulus = params.message_modulus; + auto carry_modulus = params.carry_modulus; + auto big_lwe_size = (polynomial_size * glwe_dimension + 1); + auto big_lwe_size_bytes = big_lwe_size * sizeof(Torus); + + shifted_blocks_and_borrow_states = (Torus *)cuda_malloc_async( + num_many_lut * num_radix_blocks * big_lwe_size_bytes, streams[0], + gpu_indexes[0]); + cuda_memset_async(shifted_blocks_and_borrow_states, 0, + num_many_lut * num_radix_blocks * big_lwe_size_bytes, + streams[0], gpu_indexes[0]); + shifted_blocks = (Torus *)cuda_malloc_async( + num_radix_blocks * big_lwe_size_bytes, streams[0], gpu_indexes[0]); + cuda_memset_async(shifted_blocks, 0, num_radix_blocks * big_lwe_size_bytes, + streams[0], gpu_indexes[0]); + borrow_states = (Torus *)cuda_malloc_async( + num_radix_blocks * big_lwe_size_bytes, streams[0], gpu_indexes[0]); + cuda_memset_async(borrow_states, 0, num_radix_blocks * big_lwe_size_bytes, + streams[0], gpu_indexes[0]); + + uint32_t num_luts_first_step = 2 * grouping_size + 1; + + luts_array_first_step = new int_radix_lut( + streams, gpu_indexes, gpu_count, params, num_luts_first_step, + num_radix_blocks, num_many_lut, allocate_gpu_memory); + + auto f_shift_block = [message_modulus](Torus block) -> Torus { + uint64_t overflow_guard = message_modulus; + uint64_t block_mod = block % message_modulus; + return (overflow_guard | block_mod) << 1; + }; + + auto f_first_block_state = [message_modulus](Torus block) -> Torus { + if (block < message_modulus) + return 1; // Borrows + else { + return 0; // Nothing + } + }; + std::vector> f_first_grouping_luts = { + f_first_block_state, f_shift_block}; + + auto first_block_lut = luts_array_first_step->get_lut(gpu_indexes[0], 0); + + generate_many_lut_device_accumulator( + streams[0], gpu_indexes[0], first_block_lut, glwe_dimension, + polynomial_size, message_modulus, carry_modulus, f_first_grouping_luts); + + // luts for other blocks of the first grouping + for (int lut_id = 1; lut_id < grouping_size; lut_id++) { + auto f_state = [message_modulus, lut_id](Torus block) -> Torus { + uint64_t r = 0; + if (block < message_modulus) { + r = 2; // Borrows + } else if (block == 
message_modulus) { + r = 1; // Propagates a borrow + } else { + r = 0; // Does not borrow + } + return r << (lut_id - 1); + }; + std::vector> f_grouping_luts = { + f_state, f_shift_block}; + auto lut = luts_array_first_step->get_lut(gpu_indexes[0], lut_id); + generate_many_lut_device_accumulator( + streams[0], gpu_indexes[0], lut, glwe_dimension, polynomial_size, + message_modulus, carry_modulus, f_grouping_luts); + } + + // luts for the rest of groupings (except for the last block) + for (int i = 0; i < grouping_size; i++) { + uint32_t lut_id = i + grouping_size; + auto f_state = [message_modulus, i](Torus block) -> Torus { + uint64_t r = 0; + if (block < message_modulus) { + r = 2; // Generates borrow + } else if (block == message_modulus) { + r = 1; // Propagates a borrow + } else { + r = 0; // Does not borrow + } + return r << i; + }; + std::vector> f_grouping_luts = { + f_state, f_shift_block}; + + auto lut = luts_array_first_step->get_lut(gpu_indexes[0], lut_id); + + generate_many_lut_device_accumulator( + streams[0], gpu_indexes[0], lut, glwe_dimension, polynomial_size, + message_modulus, carry_modulus, f_grouping_luts); + } + + auto f_last_block_state = [message_modulus](Torus block) -> Torus { + if (block < message_modulus) + return 2 << 1; // Generates a borrow + else + return 0; // Nothing + }; + + uint32_t lut_id = num_luts_first_step - 1; // The last lut of the first step + + auto last_block_lut = + luts_array_first_step->get_lut(gpu_indexes[0], lut_id); + + std::vector> f_last_grouping_luts = { + f_last_block_state, f_shift_block}; + + generate_many_lut_device_accumulator( + streams[0], gpu_indexes[0], last_block_lut, glwe_dimension, + polynomial_size, message_modulus, carry_modulus, f_last_grouping_luts); + + // Generate the indexes to switch between luts within the pbs + Torus lut_indexes_size = num_radix_blocks * sizeof(Torus); + Torus *h_lut_indexes = (Torus *)malloc(lut_indexes_size); + + for (int index = 0; index < num_radix_blocks; index++) { + uint32_t grouping_index = index / grouping_size; + bool is_in_first_grouping = (grouping_index == 0); + uint32_t index_in_grouping = index % grouping_size; + bool is_last_index = (index == (num_radix_blocks - 1)); + if (is_last_index) { + if (num_radix_blocks == 1) { + h_lut_indexes[index] = 2 * grouping_size; + } else { + h_lut_indexes[index] = 2; + } + } else if (is_in_first_grouping) { + h_lut_indexes[index] = index_in_grouping; + } else { + h_lut_indexes[index] = index_in_grouping + grouping_size; + } + } + // copy the indexes to the gpu + Torus *lut_indexes = + luts_array_first_step->get_lut_indexes(gpu_indexes[0], 0); + cuda_memcpy_async_to_gpu(lut_indexes, h_lut_indexes, lut_indexes_size, + streams[0], gpu_indexes[0]); + // Do I need to do something else for the multi-gpu? 
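Note for reviewers: the first-step many-LUT used in this constructor packs two results per PBS for each block of the subtraction, the block shifted left by one (with an overflow guard) and a borrow-state flag whose bit position depends on the block's index inside its grouping. A minimal standalone sketch of that encoding, mirroring the lambdas above (illustration only, not part of the patch; the parameter values in main are assumptions):

#include <cstdint>
#include <cstdio>

// Mirrors f_shift_block above: keep the message bits, OR in an overflow
// guard (message_modulus), then shift left by one so the low bit is free
// for the borrow bookkeeping.
static uint64_t shift_block(uint64_t block, uint64_t message_modulus) {
  return (message_modulus | (block % message_modulus)) << 1;
}

// Mirrors f_state above for blocks that are neither the first nor the last
// of the radix: 2 = generates a borrow, 1 = propagates a borrow, 0 = neither,
// packed at a bit position derived from the block's index in its grouping.
static uint64_t borrow_state(uint64_t block, uint64_t message_modulus,
                             uint32_t state_bit_position) {
  uint64_t r;
  if (block < message_modulus)
    r = 2;
  else if (block == message_modulus)
    r = 1;
  else
    r = 0;
  return r << state_bit_position;
}

int main() {
  const uint64_t message_modulus = 4; // e.g. 2_2 parameters (assumption)
  for (uint64_t block = 0; block < 2 * message_modulus; ++block)
    std::printf("block=%llu shifted=%llu state=%llu\n",
                (unsigned long long)block,
                (unsigned long long)shift_block(block, message_modulus),
                (unsigned long long)borrow_state(block, message_modulus, 0));
  return 0;
}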
+ + luts_array_first_step->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]); + + free(h_lut_indexes); + }; + // needed for the division to update the lut indexes + void update_lut_indexes(cudaStream_t const *streams, + uint32_t const *gpu_indexes, Torus *new_lut_indexes, + uint32_t new_num_blocks) { + Torus *lut_indexes = + luts_array_first_step->get_lut_indexes(gpu_indexes[0], 0); + cuda_memcpy_async_gpu_to_gpu(lut_indexes, new_lut_indexes, + new_num_blocks * sizeof(Torus), streams[0], + gpu_indexes[0]); + luts_array_first_step->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]); + } void release(cudaStream_t const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count) { - cuda_drop_async(generates_or_propagates, streams[0], gpu_indexes[0]); - cuda_drop_async(step_output, streams[0], gpu_indexes[0]); - luts_array->release(streams, gpu_indexes, gpu_count); - luts_borrow_propagation_sum->release(streams, gpu_indexes, gpu_count); - message_acc->release(streams, gpu_indexes, gpu_count); + cuda_drop_async(shifted_blocks_and_borrow_states, streams[0], + gpu_indexes[0]); + cuda_drop_async(shifted_blocks, streams[0], gpu_indexes[0]); + cuda_drop_async(borrow_states, streams[0], gpu_indexes[0]); - delete luts_array; - delete luts_borrow_propagation_sum; - delete message_acc; - } + luts_array_first_step->release(streams, gpu_indexes, gpu_count); + delete luts_array_first_step; + }; }; -template struct int_sum_ciphertexts_vec_memory { - Torus *new_blocks; - Torus *new_blocks_copy; - Torus *old_blocks; - Torus *small_lwe_vector; +template struct int_borrow_prop_memory { + uint32_t num_many_lut; + uint32_t lut_stride; + + uint32_t group_size; + uint32_t num_groups; + Torus *overflow_block; + + int_radix_lut *lut_message_extract; + int_radix_lut *lut_borrow_flag; + + int_shifted_blocks_and_borrow_states_memory + *shifted_blocks_borrow_state_mem; + int_prop_simu_group_carries_memory *prop_simu_group_carries_mem; + int_radix_params params; - int_sc_prop_memory *scp_mem; - int32_t *d_smart_copy_in; - int32_t *d_smart_copy_out; + uint32_t active_gpu_count; + cudaStream_t *sub_streams_1; + cudaStream_t *sub_streams_2; - bool mem_reuse = false; + cudaEvent_t *incoming_events; + cudaEvent_t *outgoing_events1; + cudaEvent_t *outgoing_events2; - int_sum_ciphertexts_vec_memory(cudaStream_t const *streams, - uint32_t const *gpu_indexes, - uint32_t gpu_count, int_radix_params params, - uint32_t num_blocks_in_radix, - uint32_t max_num_radix_in_vec, - bool allocate_gpu_memory) { + uint32_t compute_overflow; + int_borrow_prop_memory(cudaStream_t const *streams, + uint32_t const *gpu_indexes, uint32_t gpu_count, + int_radix_params params, uint32_t num_radix_blocks, + uint32_t compute_overflow_in, + bool allocate_gpu_memory) { this->params = params; + auto glwe_dimension = params.glwe_dimension; + auto polynomial_size = params.polynomial_size; + auto message_modulus = params.message_modulus; + auto carry_modulus = params.carry_modulus; + auto big_lwe_size = (polynomial_size * glwe_dimension + 1); + auto big_lwe_size_bytes = big_lwe_size * sizeof(Torus); + compute_overflow = compute_overflow_in; + // for compute shifted blocks and block states + uint32_t block_modulus = message_modulus * carry_modulus; + uint32_t num_bits_in_block = std::log2(block_modulus); + uint32_t grouping_size = num_bits_in_block; + group_size = grouping_size; + num_groups = (num_radix_blocks + grouping_size - 1) / grouping_size; + + num_many_lut = 2; // many luts apply 2 luts + uint32_t box_size = polynomial_size / block_modulus; + 
lut_stride = (block_modulus / num_many_lut) * box_size; + + shifted_blocks_borrow_state_mem = + new int_shifted_blocks_and_borrow_states_memory( + streams, gpu_indexes, gpu_count, params, num_radix_blocks, + num_many_lut, grouping_size, true); + + prop_simu_group_carries_mem = new int_prop_simu_group_carries_memory( + streams, gpu_indexes, gpu_count, params, num_radix_blocks, + grouping_size, num_groups, true); - // create single carry propagation memory object - scp_mem = - new int_sc_prop_memory(streams, gpu_indexes, gpu_count, params, - num_blocks_in_radix, allocate_gpu_memory); - int max_pbs_count = num_blocks_in_radix * max_num_radix_in_vec; + overflow_block = (Torus *)cuda_malloc_async(big_lwe_size_bytes, streams[0], + gpu_indexes[0]); + cuda_memset_async(overflow_block, 0, big_lwe_size_bytes, streams[0], + gpu_indexes[0]); - // allocate gpu memory for intermediate buffers - new_blocks = (Torus *)cuda_malloc_async( - max_pbs_count * (params.big_lwe_dimension + 1) * sizeof(Torus), - streams[0], gpu_indexes[0]); - new_blocks_copy = (Torus *)cuda_malloc_async( - max_pbs_count * (params.big_lwe_dimension + 1) * sizeof(Torus), - streams[0], gpu_indexes[0]); - old_blocks = (Torus *)cuda_malloc_async( - max_pbs_count * (params.big_lwe_dimension + 1) * sizeof(Torus), - streams[0], gpu_indexes[0]); - small_lwe_vector = (Torus *)cuda_malloc_async( - max_pbs_count * (params.small_lwe_dimension + 1) * sizeof(Torus), - streams[0], gpu_indexes[0]); - cuda_memset_async(new_blocks, 0, - max_pbs_count * (params.big_lwe_dimension + 1) * - sizeof(Torus), - streams[0], gpu_indexes[0]); - cuda_memset_async(new_blocks_copy, 0, - max_pbs_count * (params.big_lwe_dimension + 1) * - sizeof(Torus), - streams[0], gpu_indexes[0]); - cuda_memset_async(old_blocks, 0, - max_pbs_count * (params.big_lwe_dimension + 1) * - sizeof(Torus), - streams[0], gpu_indexes[0]); - cuda_memset_async(small_lwe_vector, 0, - max_pbs_count * (params.small_lwe_dimension + 1) * - sizeof(Torus), - streams[0], gpu_indexes[0]); + lut_message_extract = + new int_radix_lut(streams, gpu_indexes, gpu_count, params, 1, + num_radix_blocks, allocate_gpu_memory); + // lut for the first block in the first grouping + auto f_message_extract = [message_modulus](Torus block) -> Torus { + return (block >> 1) % message_modulus; + }; - d_smart_copy_in = (int32_t *)cuda_malloc_async( - max_pbs_count * sizeof(int32_t), streams[0], gpu_indexes[0]); - d_smart_copy_out = (int32_t *)cuda_malloc_async( - max_pbs_count * sizeof(int32_t), streams[0], gpu_indexes[0]); - cuda_memset_async(d_smart_copy_in, 0, max_pbs_count * sizeof(int32_t), - streams[0], gpu_indexes[0]); - cuda_memset_async(d_smart_copy_out, 0, max_pbs_count * sizeof(int32_t), - streams[0], gpu_indexes[0]); - } + auto extract_lut = lut_message_extract->get_lut(gpu_indexes[0], 0); - int_sum_ciphertexts_vec_memory(cudaStream_t const *streams, - uint32_t const *gpu_indexes, - uint32_t gpu_count, int_radix_params params, - uint32_t num_blocks_in_radix, - uint32_t max_num_radix_in_vec, - Torus *new_blocks, Torus *old_blocks, - Torus *small_lwe_vector) { - mem_reuse = true; - this->params = params; + generate_device_accumulator( + streams[0], gpu_indexes[0], extract_lut, glwe_dimension, + polynomial_size, message_modulus, carry_modulus, f_message_extract); - // create single carry propagation memory object - scp_mem = new int_sc_prop_memory(streams, gpu_indexes, gpu_count, - params, num_blocks_in_radix, true); - int max_pbs_count = num_blocks_in_radix * max_num_radix_in_vec; + 
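[Reviewer note] As I read the layout described later in this patch (shifted blocks hold (block % message_modulus) << 1, and the resolved carry/borrow is added into bit 0), the message-extract LUT above simply undoes that shift: (block >> 1) % message_modulus. A minimal check of that reading, under those assumptions:

    #include <cassert>
    #include <cstdint>

    int main() {
      const uint64_t message_modulus = 4; // assumed 2_2 parameters
      auto f_message_extract = [&](uint64_t block) -> uint64_t {
        return (block >> 1) % message_modulus;
      };
      // block = (msg << 1) + incoming carry/borrow bit (assumed layout)
      for (uint64_t msg = 0; msg < message_modulus; msg++)
        for (uint64_t bit = 0; bit <= 1; bit++)
          assert(f_message_extract((msg << 1) + bit) == msg);
      return 0;
    }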
lut_message_extract->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]); - // assign gpu memory for intermediate buffers - this->new_blocks = new_blocks; - this->old_blocks = old_blocks; - this->small_lwe_vector = small_lwe_vector; - new_blocks_copy = (Torus *)cuda_malloc_async( - max_pbs_count * (params.big_lwe_dimension + 1) * sizeof(Torus), - streams[0], gpu_indexes[0]); - cuda_memset_async(new_blocks_copy, 0, - max_pbs_count * (params.big_lwe_dimension + 1) * - sizeof(Torus), - streams[0], gpu_indexes[0]); + if (compute_overflow) { + lut_borrow_flag = + new int_radix_lut(streams, gpu_indexes, gpu_count, params, 1, + num_radix_blocks, allocate_gpu_memory); + // lut for the first block in the first grouping + auto f_borrow_flag = [](Torus block) -> Torus { + return ((block >> 2) & 1); + }; - d_smart_copy_in = (int32_t *)cuda_malloc_async( - max_pbs_count * sizeof(int32_t), streams[0], gpu_indexes[0]); - d_smart_copy_out = (int32_t *)cuda_malloc_async( - max_pbs_count * sizeof(int32_t), streams[0], gpu_indexes[0]); - cuda_memset_async(d_smart_copy_in, 0, max_pbs_count * sizeof(int32_t), - streams[0], gpu_indexes[0]); - cuda_memset_async(d_smart_copy_out, 0, max_pbs_count * sizeof(int32_t), - streams[0], gpu_indexes[0]); - } + auto borrow_flag_lut = lut_borrow_flag->get_lut(gpu_indexes[0], 0); + + generate_device_accumulator( + streams[0], gpu_indexes[0], borrow_flag_lut, glwe_dimension, + polynomial_size, message_modulus, carry_modulus, f_borrow_flag); + + lut_borrow_flag->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]); + } + + active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count); + sub_streams_1 = + (cudaStream_t *)malloc(active_gpu_count * sizeof(cudaStream_t)); + sub_streams_2 = + (cudaStream_t *)malloc(active_gpu_count * sizeof(cudaStream_t)); + for (uint j = 0; j < active_gpu_count; j++) { + sub_streams_1[j] = cuda_create_stream(gpu_indexes[j]); + sub_streams_2[j] = cuda_create_stream(gpu_indexes[j]); + } + + incoming_events = + (cudaEvent_t *)malloc(active_gpu_count * sizeof(cudaEvent_t)); + outgoing_events1 = + (cudaEvent_t *)malloc(active_gpu_count * sizeof(cudaEvent_t)); + outgoing_events2 = + (cudaEvent_t *)malloc(active_gpu_count * sizeof(cudaEvent_t)); + for (uint j = 0; j < active_gpu_count; j++) { + incoming_events[j] = cuda_create_event(gpu_indexes[j]); + outgoing_events1[j] = cuda_create_event(gpu_indexes[j]); + outgoing_events2[j] = cuda_create_event(gpu_indexes[j]); + } + }; + // needed for the division to update the lut indexes + void update_lut_indexes(cudaStream_t const *streams, + uint32_t const *gpu_indexes, + Torus *first_indexes_for_div, + Torus *second_indexes_for_div, Torus *scalars_for_div, + uint32_t new_num_blocks) { + shifted_blocks_borrow_state_mem->update_lut_indexes( + streams, gpu_indexes, first_indexes_for_div, new_num_blocks); + prop_simu_group_carries_mem->update_lut_indexes( + streams, gpu_indexes, second_indexes_for_div, scalars_for_div, + new_num_blocks); + } void release(cudaStream_t const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count) { - cuda_drop_async(d_smart_copy_in, streams[0], gpu_indexes[0]); - cuda_drop_async(d_smart_copy_out, streams[0], gpu_indexes[0]); - if (!mem_reuse) { - cuda_drop_async(new_blocks, streams[0], gpu_indexes[0]); - cuda_drop_async(old_blocks, streams[0], gpu_indexes[0]); - cuda_drop_async(small_lwe_vector, streams[0], gpu_indexes[0]); + shifted_blocks_borrow_state_mem->release(streams, gpu_indexes, gpu_count); + prop_simu_group_carries_mem->release(streams, gpu_indexes, gpu_count); + 
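[Reviewer note] The sub_streams and the incoming/outgoing event arrays allocated above look like the plumbing for a fork/join pattern: record an event on the main stream, have both sub-streams wait on it, run the message-extract and flag paths concurrently, then have the main stream wait on one outgoing event per sub-stream. The actual call sites live elsewhere in the patch, so treat the sketch below as an assumption about intended use (single-GPU case, using the helpers this patch adds to device.h):

    #include <cstdint>
    #include "device.h" // declares cuda_event_record / cuda_stream_wait_event

    // Hypothetical fork/join on one GPU (illustrative only).
    void fork_join_example(cudaStream_t main_stream, cudaStream_t sub_stream_1,
                           cudaStream_t sub_stream_2, cudaEvent_t incoming,
                           cudaEvent_t outgoing1, cudaEvent_t outgoing2,
                           uint32_t gpu_index) {
      // fork: both sub-streams wait for work already queued on the main stream
      cuda_event_record(incoming, main_stream, gpu_index);
      cuda_stream_wait_event(sub_stream_1, incoming, gpu_index);
      cuda_stream_wait_event(sub_stream_2, incoming, gpu_index);

      // ... enqueue message extraction on sub_stream_1 and the
      //     overflow/borrow flag computation on sub_stream_2 ...

      // join: the main stream resumes only after both sub-streams are done
      cuda_event_record(outgoing1, sub_stream_1, gpu_index);
      cuda_event_record(outgoing2, sub_stream_2, gpu_index);
      cuda_stream_wait_event(main_stream, outgoing1, gpu_index);
      cuda_stream_wait_event(main_stream, outgoing2, gpu_index);
    }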
cuda_drop_async(overflow_block, streams[0], gpu_indexes[0]); + + lut_message_extract->release(streams, gpu_indexes, gpu_count); + delete lut_message_extract; + if (compute_overflow) { + lut_borrow_flag->release(streams, gpu_indexes, gpu_count); + delete lut_borrow_flag; } - cuda_drop_async(new_blocks_copy, streams[0], gpu_indexes[0]); - scp_mem->release(streams, gpu_indexes, gpu_count); - delete scp_mem; - } + // The substreams have to be synchronized before destroying events + cuda_synchronize_stream(streams[0], gpu_indexes[0]); + + // release events + for (uint j = 0; j < active_gpu_count; j++) { + cuda_event_destroy(incoming_events[j], gpu_indexes[j]); + cuda_event_destroy(outgoing_events1[j], gpu_indexes[j]); + cuda_event_destroy(outgoing_events2[j], gpu_indexes[j]); + } + free(incoming_events); + free(outgoing_events1); + free(outgoing_events2); + + // release sub streams + for (uint i = 0; i < active_gpu_count; i++) { + cuda_destroy_stream(sub_streams_1[i], gpu_indexes[i]); + cuda_destroy_stream(sub_streams_2[i], gpu_indexes[i]); + } + free(sub_streams_1); + free(sub_streams_2); + }; }; template struct int_zero_out_if_buffer { @@ -1170,6 +2460,7 @@ template struct int_mul_memory { int_radix_lut *zero_out_predicate_lut; int_sum_ciphertexts_vec_memory *sum_ciphertexts_mem; + int_sc_prop_memory *sc_prop_mem; int_zero_out_if_buffer *zero_out_mem; int_radix_params params; @@ -1276,6 +2567,11 @@ template struct int_mul_memory { streams, gpu_indexes, gpu_count, params, num_radix_blocks, 2 * num_radix_blocks, block_mul_res, vector_result_sb, small_lwe_vector); + uint32_t uses_carry = 0; + uint32_t requested_flag = outputFlag::FLAG_NONE; + sc_prop_mem = new int_sc_prop_memory( + streams, gpu_indexes, gpu_count, params, num_radix_blocks, + requested_flag, uses_carry, allocate_gpu_memory); } void release(cudaStream_t const *streams, uint32_t const *gpu_indexes, @@ -1295,9 +2591,11 @@ template struct int_mul_memory { luts_array->release(streams, gpu_indexes, gpu_count); sum_ciphertexts_mem->release(streams, gpu_indexes, gpu_count); + sc_prop_mem->release(streams, gpu_indexes, gpu_count); delete luts_array; delete sum_ciphertexts_mem; + delete sc_prop_mem; } }; @@ -2313,7 +3611,7 @@ template struct unsigned_int_div_rem_memory { // memory objects for other operations int_logical_scalar_shift_buffer *shift_mem_1; int_logical_scalar_shift_buffer *shift_mem_2; - int_overflowing_sub_memory *overflow_sub_mem; + int_borrow_prop_memory *overflow_sub_mem; int_comparison_buffer *comparison_buffer; // lookup tables @@ -2350,6 +3648,11 @@ template struct unsigned_int_div_rem_memory { Torus *at_least_one_upper_block_is_non_zero; Torus *cleaned_merged_interesting_remainder; + Torus **first_indexes_for_overflow_sub; + Torus **second_indexes_for_overflow_sub; + Torus **scalars_for_overflow_sub; + uint32_t max_indexes_to_erase; + // allocate and initialize if needed, temporary arrays used to calculate // cuda integer div_rem operation void init_temporary_buffers(cudaStream_t const *streams, @@ -2558,8 +3861,15 @@ template struct unsigned_int_div_rem_memory { streams, gpu_indexes, gpu_count, SHIFT_OR_ROTATE_TYPE::LEFT_SHIFT, params, 2 * num_blocks, true); - overflow_sub_mem = new int_overflowing_sub_memory( - streams, gpu_indexes, gpu_count, params, num_blocks, true); + uint32_t compute_overflow = 1; + overflow_sub_mem = new int_borrow_prop_memory( + streams, gpu_indexes, gpu_count, params, num_blocks, compute_overflow, + true); + uint32_t group_size = overflow_sub_mem->group_size; + bool use_seq = 
overflow_sub_mem->prop_simu_group_carries_mem + ->use_sequential_algorithm_to_resolver_group_carries; + create_indexes_for_overflow_sub(streams, gpu_indexes, num_blocks, + group_size, use_seq); comparison_buffer = new int_comparison_buffer( streams, gpu_indexes, gpu_count, COMPARISON_TYPE::NE, params, @@ -2584,6 +3894,94 @@ template struct unsigned_int_div_rem_memory { } } + void create_indexes_for_overflow_sub(cudaStream_t const *streams, + uint32_t const *gpu_indexes, + uint32_t num_blocks, uint32_t group_size, + bool use_seq) { + max_indexes_to_erase = num_blocks; + + first_indexes_for_overflow_sub = + (Torus **)malloc(num_blocks * sizeof(Torus *)); + second_indexes_for_overflow_sub = + (Torus **)malloc(num_blocks * sizeof(Torus *)); + scalars_for_overflow_sub = (Torus **)malloc(num_blocks * sizeof(Torus *)); + + Torus *h_lut_indexes = (Torus *)malloc(num_blocks * sizeof(Torus)); + Torus *h_scalar = (Torus *)malloc(num_blocks * sizeof(Torus)); + + // Extra indexes for the luts in first step + for (int nb = 1; nb <= num_blocks; nb++) { + cudaMalloc((void **)&first_indexes_for_overflow_sub[nb - 1], + nb * sizeof(Torus)); + for (int index = 0; index < nb; index++) { + uint32_t grouping_index = index / group_size; + bool is_in_first_grouping = (grouping_index == 0); + uint32_t index_in_grouping = index % group_size; + bool is_last_index = (index == (nb - 1)); + if (is_last_index) { + if (nb == 1) { + h_lut_indexes[index] = 2 * group_size; + } else { + h_lut_indexes[index] = 2; + } + } else if (is_in_first_grouping) { + h_lut_indexes[index] = index_in_grouping; + } else { + h_lut_indexes[index] = index_in_grouping + group_size; + } + } + cuda_memcpy_async_to_gpu(first_indexes_for_overflow_sub[nb - 1], + h_lut_indexes, nb * sizeof(Torus), streams[0], + gpu_indexes[0]); + } + // Extra indexes for the luts in second step + for (int nb = 1; nb <= num_blocks; nb++) { + cudaMalloc((void **)&second_indexes_for_overflow_sub[nb - 1], + nb * sizeof(Torus)); + cudaMalloc((void **)&scalars_for_overflow_sub[nb - 1], + nb * sizeof(Torus)); + + for (int index = 0; index < nb; index++) { + uint32_t grouping_index = index / group_size; + bool is_in_first_grouping = (grouping_index == 0); + uint32_t index_in_grouping = index % group_size; + + if (is_in_first_grouping) { + h_lut_indexes[index] = index_in_grouping; + } else if (index_in_grouping == (group_size - 1)) { + if (use_seq) { + int inner_index = (grouping_index - 1) % (group_size - 1); + h_lut_indexes[index] = inner_index + 2 * group_size; + } else { + h_lut_indexes[index] = 2 * group_size; + } + } else { + h_lut_indexes[index] = index_in_grouping + group_size; + } + + bool may_have_its_padding_bit_set = + !is_in_first_grouping && (index_in_grouping == group_size - 1); + + if (may_have_its_padding_bit_set) { + if (use_seq) { + h_scalar[index] = 1 << ((grouping_index - 1) % (group_size - 1)); + } else { + h_scalar[index] = 1; + } + } else { + h_scalar[index] = 0; + } + } + cuda_memcpy_async_to_gpu(second_indexes_for_overflow_sub[nb - 1], + h_lut_indexes, nb * sizeof(Torus), streams[0], + gpu_indexes[0]); + cuda_memcpy_async_to_gpu(scalars_for_overflow_sub[nb - 1], h_scalar, + nb * sizeof(Torus), streams[0], gpu_indexes[0]); + } + free(h_lut_indexes); + free(h_scalar); + }; + void release(cudaStream_t const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count) { uint32_t num_bits_in_message = 31 - __builtin_clz(params.message_modulus); @@ -2678,6 +4076,17 @@ template struct unsigned_int_div_rem_memory { gpu_indexes[0]); 
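[Reviewer note] To make the two index tables and the scalar table above concrete: for num_blocks = 8, group_size = 4, use_seq = false (an illustrative configuration, not taken from the patch), the rules evaluate to first-step indexes {0,1,2,3,4,5,6,2}, second-step indexes {0,1,2,3,4,5,6,8}, and scalars {0,0,0,0,0,0,0,1}. The hypothetical helper below restates the second-step rules for a single block:

    #include <cstdint>
    #include <utility>

    // Mirrors the second-step loop above (illustrative only, not part of the patch).
    std::pair<uint64_t, uint64_t>
    second_step_index_and_scalar(uint32_t index, uint32_t group_size, bool use_seq) {
      uint32_t grouping_index = index / group_size;
      uint32_t in_group = index % group_size;
      bool first_grouping = (grouping_index == 0);

      uint64_t lut_index;
      if (first_grouping)
        lut_index = in_group;
      else if (in_group == group_size - 1)
        lut_index = use_seq ? (grouping_index - 1) % (group_size - 1) + 2 * group_size
                            : 2 * group_size;
      else
        lut_index = in_group + group_size;

      // Only the closing block of a non-first group may carry a padding bit.
      uint64_t scalar = 0;
      if (!first_grouping && in_group == group_size - 1)
        scalar = use_seq ? (uint64_t)1 << ((grouping_index - 1) % (group_size - 1))
                         : 1;

      return {lut_index, scalar};
    }
    // e.g. index 7 with group_size = 4, use_seq = false -> {8, 1}.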
cuda_drop_async(cleaned_merged_interesting_remainder, streams[0], gpu_indexes[0]); + + for (int i = 0; i < max_indexes_to_erase; i++) { + cuda_drop_async(first_indexes_for_overflow_sub[i], streams[0], + gpu_indexes[0]); + cuda_drop_async(second_indexes_for_overflow_sub[i], streams[0], + gpu_indexes[0]); + cuda_drop_async(scalars_for_overflow_sub[i], streams[0], gpu_indexes[0]); + } + free(first_indexes_for_overflow_sub); + free(second_indexes_for_overflow_sub); + free(scalars_for_overflow_sub); } }; @@ -2823,107 +4232,6 @@ template struct int_resolve_signed_overflow_memory { } }; -template struct int_signed_overflowing_add_or_sub_memory { - int_radix_params params; - uint32_t active_gpu_count; - - // memory objects for other operations - int_sc_prop_memory *scp_mem; - int_last_block_inner_propagate_memory *las_block_prop_mem; - int_resolve_signed_overflow_memory *resolve_overflow_mem; - - // sub streams - cudaStream_t *sub_streams_1; - cudaStream_t *sub_streams_2; - - // temporary device buffers - Torus *result; // num_blocks - Torus *input_carries; // num_blocks - Torus *neg_rhs; // num_blocks - Torus *output_carry; // single block - Torus *last_block_inner_propagation; // single block - - // allocate temporary arrays used to calculate - // cuda integer signed overflowing add or sub - void allocate_temporary_buffers(cudaStream_t const *streams, - uint32_t const *gpu_indexes, - uint32_t gpu_count, uint32_t num_blocks) { - uint32_t big_lwe_size = params.big_lwe_dimension + 1; - - result = (Torus *)cuda_malloc_async( - big_lwe_size * num_blocks * sizeof(Torus), streams[0], gpu_indexes[0]); - - neg_rhs = (Torus *)cuda_malloc_async( - big_lwe_size * num_blocks * sizeof(Torus), streams[0], gpu_indexes[0]); - - input_carries = (Torus *)cuda_malloc_async( - big_lwe_size * num_blocks * sizeof(Torus), streams[0], gpu_indexes[0]); - output_carry = (Torus *)cuda_malloc_async(big_lwe_size * sizeof(Torus), - streams[0], gpu_indexes[0]); - last_block_inner_propagation = (Torus *)cuda_malloc_async( - big_lwe_size * sizeof(Torus), streams[0], gpu_indexes[0]); - } - - // constructor without memory reuse - int_signed_overflowing_add_or_sub_memory( - cudaStream_t const *streams, uint32_t const *gpu_indexes, - uint32_t gpu_count, int_radix_params params, uint32_t num_blocks, - SIGNED_OPERATION op, bool allocate_gpu_memory) { - this->params = params; - active_gpu_count = get_active_gpu_count(num_blocks, gpu_count); - - allocate_temporary_buffers(streams, gpu_indexes, active_gpu_count, - num_blocks); - - // initialize streams - sub_streams_1 = - (cudaStream_t *)malloc(active_gpu_count * sizeof(cudaStream_t)); - sub_streams_2 = - (cudaStream_t *)malloc(active_gpu_count * sizeof(cudaStream_t)); - for (uint j = 0; j < active_gpu_count; j++) { - sub_streams_1[j] = cuda_create_stream(gpu_indexes[j]); - sub_streams_2[j] = cuda_create_stream(gpu_indexes[j]); - } - - // initialize memory objects for other operations - scp_mem = - new int_sc_prop_memory(streams, gpu_indexes, gpu_count, params, - num_blocks, allocate_gpu_memory); - las_block_prop_mem = new int_last_block_inner_propagate_memory( - streams, gpu_indexes, gpu_count, params, op, num_blocks, - allocate_gpu_memory); - - resolve_overflow_mem = new int_resolve_signed_overflow_memory( - streams, gpu_indexes, gpu_count, params, allocate_gpu_memory); - } - - void release(cudaStream_t const *streams, uint32_t const *gpu_indexes, - uint32_t gpu_count) { - // memory objects for other operations - scp_mem->release(streams, gpu_indexes, gpu_count); - 
las_block_prop_mem->release(streams, gpu_indexes, gpu_count); - resolve_overflow_mem->release(streams, gpu_indexes, gpu_count); - - delete scp_mem; - delete las_block_prop_mem; - delete resolve_overflow_mem; - - // temporary device buffers - cuda_drop_async(result, streams[0], gpu_indexes[0]); - cuda_drop_async(neg_rhs, streams[0], gpu_indexes[0]); - cuda_drop_async(input_carries, streams[0], gpu_indexes[0]); - cuda_drop_async(output_carry, streams[0], gpu_indexes[0]); - cuda_drop_async(last_block_inner_propagation, streams[0], gpu_indexes[0]); - - // sub streams - for (uint i = 0; i < active_gpu_count; i++) { - cuda_destroy_stream(sub_streams_1[i], gpu_indexes[i]); - cuda_destroy_stream(sub_streams_2[i], gpu_indexes[i]); - } - free(sub_streams_1); - free(sub_streams_2); - } -}; template struct int_bitop_buffer { int_radix_params params; @@ -3006,6 +4314,7 @@ template struct int_scalar_mul_buffer { int_sum_ciphertexts_vec_memory *sum_ciphertexts_vec_mem; Torus *preshifted_buffer; Torus *all_shifted_buffer; + int_sc_prop_memory *sc_prop_mem; int_scalar_mul_buffer(cudaStream_t const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count, @@ -3044,13 +4353,20 @@ template struct int_scalar_mul_buffer { sum_ciphertexts_vec_mem = new int_sum_ciphertexts_vec_memory( streams, gpu_indexes, gpu_count, params, num_radix_blocks, num_ciphertext_bits, allocate_gpu_memory); + uint32_t uses_carry = 0; + uint32_t requested_flag = outputFlag::FLAG_NONE; + sc_prop_mem = new int_sc_prop_memory( + streams, gpu_indexes, gpu_count, params, num_radix_blocks, + requested_flag, uses_carry, allocate_gpu_memory); } } void release(cudaStream_t const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count) { sum_ciphertexts_vec_mem->release(streams, gpu_indexes, gpu_count); + sc_prop_mem->release(streams, gpu_indexes, gpu_count); delete sum_ciphertexts_vec_mem; + delete sc_prop_mem; cuda_drop_async(all_shifted_buffer, streams[0], gpu_indexes[0]); } }; @@ -3074,9 +4390,11 @@ template struct int_abs_buffer { streams, gpu_indexes, gpu_count, SHIFT_OR_ROTATE_TYPE::RIGHT_SHIFT, params, num_radix_blocks, allocate_gpu_memory); - scp_mem = - new int_sc_prop_memory(streams, gpu_indexes, gpu_count, params, - num_radix_blocks, allocate_gpu_memory); + uint32_t requested_flag = outputFlag::FLAG_NONE; + uint32_t uses_carry = 0; + scp_mem = new int_sc_prop_memory( + streams, gpu_indexes, gpu_count, params, num_radix_blocks, + requested_flag, uses_carry, allocate_gpu_memory); bitxor_mem = new int_bitop_buffer( streams, gpu_indexes, gpu_count, BITOP_TYPE::BITXOR, params, num_radix_blocks, allocate_gpu_memory); @@ -3155,12 +4473,14 @@ template struct int_div_rem_memory { abs_mem_2 = new int_abs_buffer(streams, gpu_indexes, gpu_count, params, num_blocks, allocate_gpu_memory); - scp_mem_1 = - new int_sc_prop_memory(streams, gpu_indexes, gpu_count, params, - num_blocks, allocate_gpu_memory); - scp_mem_2 = - new int_sc_prop_memory(streams, gpu_indexes, gpu_count, params, - num_blocks, allocate_gpu_memory); + uint32_t requested_flag = outputFlag::FLAG_NONE; + uint32_t uses_carry = 0; + scp_mem_1 = new int_sc_prop_memory( + streams, gpu_indexes, gpu_count, params, num_blocks, requested_flag, + uses_carry, allocate_gpu_memory); + scp_mem_2 = new int_sc_prop_memory( + streams, gpu_indexes, gpu_count, params, num_blocks, requested_flag, + uses_carry, allocate_gpu_memory); std::function quotient_predicate_lut_f = [](uint64_t x) -> uint64_t { return x == 1; }; diff --git a/backends/tfhe-cuda-backend/cuda/include/linear_algebra.h 
b/backends/tfhe-cuda-backend/cuda/include/linear_algebra.h index 10c476c12b..ddc9a2a508 100644 --- a/backends/tfhe-cuda-backend/cuda/include/linear_algebra.h +++ b/backends/tfhe-cuda-backend/cuda/include/linear_algebra.h @@ -27,6 +27,7 @@ void cuda_add_lwe_ciphertext_vector_64(void *stream, uint32_t gpu_index, void const *lwe_array_in_2, uint32_t input_lwe_dimension, uint32_t input_lwe_ciphertext_count); + void cuda_add_lwe_ciphertext_vector_plaintext_vector_32( void *stream, uint32_t gpu_index, void *lwe_array_out, void const *lwe_array_in, void const *plaintext_array_in, diff --git a/backends/tfhe-cuda-backend/cuda/include/pbs/pbs_multibit_utilities.h b/backends/tfhe-cuda-backend/cuda/include/pbs/pbs_multibit_utilities.h index 350b5862f4..f0a54cca2f 100644 --- a/backends/tfhe-cuda-backend/cuda/include/pbs/pbs_multibit_utilities.h +++ b/backends/tfhe-cuda-backend/cuda/include/pbs/pbs_multibit_utilities.h @@ -28,7 +28,7 @@ void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( pbs_buffer *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log, uint32_t level_count, uint32_t num_samples, - uint32_t lut_count, uint32_t lut_stride); + uint32_t num_many_lut, uint32_t lut_stride); #endif template @@ -46,7 +46,7 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( pbs_buffer *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log, uint32_t level_count, uint32_t num_samples, - uint32_t lut_count, uint32_t lut_stride); + uint32_t num_many_lut, uint32_t lut_stride); template void scratch_cuda_multi_bit_programmable_bootstrap( @@ -63,7 +63,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( pbs_buffer *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log, uint32_t level_count, uint32_t num_samples, - uint32_t lut_count, uint32_t lut_stride); + uint32_t num_many_lut, uint32_t lut_stride); template uint64_t get_buffer_size_full_sm_multibit_programmable_bootstrap_keybundle( diff --git a/backends/tfhe-cuda-backend/cuda/include/pbs/pbs_utilities.h b/backends/tfhe-cuda-backend/cuda/include/pbs/pbs_utilities.h index a9e21f77ab..722a276235 100644 --- a/backends/tfhe-cuda-backend/cuda/include/pbs/pbs_utilities.h +++ b/backends/tfhe-cuda-backend/cuda/include/pbs/pbs_utilities.h @@ -255,7 +255,7 @@ void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector( Torus const *lwe_input_indexes, double2 const *bootstrapping_key, pbs_buffer *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, - uint32_t level_count, uint32_t num_samples, uint32_t lut_count, + uint32_t level_count, uint32_t num_samples, uint32_t num_many_lut, uint32_t lut_stride); template @@ -266,7 +266,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector( Torus const *lwe_input_indexes, double2 const *bootstrapping_key, pbs_buffer *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, - uint32_t level_count, uint32_t num_samples, uint32_t lut_count, + uint32_t level_count, uint32_t num_samples, uint32_t num_many_lut, uint32_t lut_stride); #if (CUDA_ARCH >= 900) @@ -278,7 +278,7 @@ void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector( Torus const *lwe_input_indexes, double2 const *bootstrapping_key, pbs_buffer *buffer, uint32_t lwe_dimension, uint32_t 
glwe_dimension, uint32_t polynomial_size, uint32_t base_log, - uint32_t level_count, uint32_t num_samples, uint32_t lut_count, + uint32_t level_count, uint32_t num_samples, uint32_t num_many_lut, uint32_t lut_stride); template diff --git a/backends/tfhe-cuda-backend/cuda/include/pbs/programmable_bootstrap.h b/backends/tfhe-cuda-backend/cuda/include/pbs/programmable_bootstrap.h index c90d671fdb..3596eeba4b 100644 --- a/backends/tfhe-cuda-backend/cuda/include/pbs/programmable_bootstrap.h +++ b/backends/tfhe-cuda-backend/cuda/include/pbs/programmable_bootstrap.h @@ -69,7 +69,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_32( void const *lwe_input_indexes, void const *bootstrapping_key, int8_t *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, - uint32_t num_samples, uint32_t lut_count, uint32_t lut_stride); + uint32_t num_samples, uint32_t num_many_lut, uint32_t lut_stride); void cuda_programmable_bootstrap_lwe_ciphertext_vector_64( void *stream, uint32_t gpu_index, void *lwe_array_out, @@ -78,7 +78,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_64( void const *lwe_input_indexes, void const *bootstrapping_key, int8_t *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, - uint32_t num_samples, uint32_t lut_count, uint32_t lut_stride); + uint32_t num_samples, uint32_t num_many_lut, uint32_t lut_stride); void cleanup_cuda_programmable_bootstrap(void *stream, uint32_t gpu_index, int8_t **pbs_buffer); diff --git a/backends/tfhe-cuda-backend/cuda/include/pbs/programmable_bootstrap_multibit.h b/backends/tfhe-cuda-backend/cuda/include/pbs/programmable_bootstrap_multibit.h index fceac32e97..504c864069 100644 --- a/backends/tfhe-cuda-backend/cuda/include/pbs/programmable_bootstrap_multibit.h +++ b/backends/tfhe-cuda-backend/cuda/include/pbs/programmable_bootstrap_multibit.h @@ -27,7 +27,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64( void const *lwe_input_indexes, void const *bootstrapping_key, int8_t *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log, - uint32_t level_count, uint32_t num_samples, uint32_t lut_count, + uint32_t level_count, uint32_t num_samples, uint32_t num_many_lut, uint32_t lut_stride); void cleanup_cuda_multi_bit_programmable_bootstrap(void *stream, diff --git a/backends/tfhe-cuda-backend/cuda/src/device.cu b/backends/tfhe-cuda-backend/cuda/src/device.cu index be24b0475b..e15fc72185 100644 --- a/backends/tfhe-cuda-backend/cuda/src/device.cu +++ b/backends/tfhe-cuda-backend/cuda/src/device.cu @@ -2,6 +2,30 @@ #include #include +cudaEvent_t cuda_create_event(uint32_t gpu_index) { + check_cuda_error(cudaSetDevice(gpu_index)); + cudaEvent_t event; + check_cuda_error(cudaEventCreate(&event)); + return event; +} + +void cuda_event_record(cudaEvent_t event, cudaStream_t stream, + uint32_t gpu_index) { + check_cuda_error(cudaSetDevice(gpu_index)); + check_cuda_error(cudaEventRecord(event, stream)); +} + +void cuda_stream_wait_event(cudaStream_t stream, cudaEvent_t event, + uint32_t gpu_index) { + check_cuda_error(cudaSetDevice(gpu_index)); + check_cuda_error(cudaStreamWaitEvent(stream, event, 0)); +} + +void cuda_event_destroy(cudaEvent_t event, uint32_t gpu_index) { + check_cuda_error(cudaSetDevice(gpu_index)); + check_cuda_error(cudaEventDestroy(event)); +} + /// Unsafe function to create a CUDA stream, must check 
first that GPU exists cudaStream_t cuda_create_stream(uint32_t gpu_index) { check_cuda_error(cudaSetDevice(gpu_index)); diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/abs.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/abs.cuh index ad1a4b9e23..d9053bbfbd 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/abs.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/integer/abs.cuh @@ -58,9 +58,11 @@ host_integer_abs_kb(cudaStream_t const *streams, uint32_t const *gpu_indexes, host_addition(streams[0], gpu_indexes[0], ct, mask, ct, radix_params.big_lwe_dimension, num_blocks); - host_propagate_single_carry(streams, gpu_indexes, gpu_count, ct, - nullptr, nullptr, mem_ptr->scp_mem, bsks, - ksks, num_blocks); + uint32_t requested_flag = outputFlag::FLAG_NONE; + uint32_t uses_carry = 0; + host_propagate_single_carry( + streams, gpu_indexes, gpu_count, ct, nullptr, nullptr, mem_ptr->scp_mem, + bsks, ksks, num_blocks, requested_flag, uses_carry); host_integer_radix_bitop_kb(streams, gpu_indexes, gpu_count, ct, mask, ct, mem_ptr->bitxor_mem, bsks, ksks, num_blocks); diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/addition.cu b/backends/tfhe-cuda-backend/cuda/src/integer/addition.cu deleted file mode 100644 index 2ae72ad2a6..0000000000 --- a/backends/tfhe-cuda-backend/cuda/src/integer/addition.cu +++ /dev/null @@ -1,50 +0,0 @@ -#include "integer/addition.cuh" - -void scratch_cuda_signed_overflowing_add_or_sub_radix_ciphertext_kb_64( - void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count, - int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size, - uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level, - uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log, - uint32_t grouping_factor, uint32_t num_blocks, int8_t signed_operation, - uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type, - bool allocate_gpu_memory) { - - SIGNED_OPERATION op = (signed_operation == 1) ? SIGNED_OPERATION::ADDITION - : SIGNED_OPERATION::SUBTRACTION; - int_radix_params params(pbs_type, glwe_dimension, polynomial_size, - big_lwe_dimension, small_lwe_dimension, ks_level, - ks_base_log, pbs_level, pbs_base_log, grouping_factor, - message_modulus, carry_modulus); - - scratch_cuda_integer_signed_overflowing_add_or_sub_kb( - (cudaStream_t *)(streams), gpu_indexes, gpu_count, - (int_signed_overflowing_add_or_sub_memory **)mem_ptr, - num_blocks, op, params, allocate_gpu_memory); -} - -void cuda_signed_overflowing_add_or_sub_radix_ciphertext_kb_64( - void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count, - void *lhs, void const *rhs, void *overflowed, int8_t signed_operation, - int8_t *mem_ptr, void *const *bsks, void *const *ksks, - uint32_t num_blocks) { - - auto mem = (int_signed_overflowing_add_or_sub_memory *)mem_ptr; - SIGNED_OPERATION op = (signed_operation == 1) ? 
SIGNED_OPERATION::ADDITION - : SIGNED_OPERATION::SUBTRACTION; - - host_integer_signed_overflowing_add_or_sub_kb( - (cudaStream_t *)(streams), gpu_indexes, gpu_count, - static_cast(lhs), static_cast(rhs), - static_cast(overflowed), op, bsks, (uint64_t *const *)(ksks), - mem, num_blocks); -} - -void cleanup_signed_overflowing_add_or_sub(void *const *streams, - uint32_t const *gpu_indexes, - uint32_t gpu_count, - int8_t **mem_ptr_void) { - int_signed_overflowing_add_or_sub_memory *mem_ptr = - (int_signed_overflowing_add_or_sub_memory *)(*mem_ptr_void); - - mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count); -} diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/addition.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/addition.cuh deleted file mode 100644 index 9c763596d0..0000000000 --- a/backends/tfhe-cuda-backend/cuda/src/integer/addition.cuh +++ /dev/null @@ -1,149 +0,0 @@ -#ifndef TFHE_RS_ADDITION_CUH -#define TFHE_RS_ADDITION_CUH - -#include "crypto/keyswitch.cuh" -#include "device.h" -#include "integer/comparison.cuh" -#include "integer/integer.cuh" -#include "integer/integer_utilities.h" -#include "integer/negation.cuh" -#include "integer/scalar_shifts.cuh" -#include "linear_algebra.h" -#include "pbs/programmable_bootstrap.h" -#include "utils/helper.cuh" -#include "utils/kernel_dimensions.cuh" -#include -#include -#include -#include -#include - -template -void host_resolve_signed_overflow( - cudaStream_t const *streams, uint32_t const *gpu_indexes, - uint32_t gpu_count, Torus *result, Torus *last_block_inner_propagation, - Torus const *last_block_input_carry, Torus *last_block_output_carry, - int_resolve_signed_overflow_memory *mem, void *const *bsks, - Torus *const *ksks) { - - auto x = mem->x; - - Torus *d_clears = - (Torus *)cuda_malloc_async(sizeof(Torus), streams[0], gpu_indexes[0]); - - cuda_set_value_async(streams[0], gpu_indexes[0], d_clears, 2, 1); - - // replace with host function call - cuda_mult_lwe_ciphertext_vector_cleartext_vector_64( - streams[0], gpu_indexes[0], x, last_block_output_carry, d_clears, - mem->params.big_lwe_dimension, 1); - - host_addition(streams[0], gpu_indexes[0], last_block_inner_propagation, - last_block_inner_propagation, x, - mem->params.big_lwe_dimension, 1); - host_addition(streams[0], gpu_indexes[0], last_block_inner_propagation, - last_block_inner_propagation, last_block_input_carry, - mem->params.big_lwe_dimension, 1); - - host_apply_univariate_lut_kb(streams, gpu_indexes, gpu_count, result, - last_block_inner_propagation, - mem->resolve_overflow_lut, ksks, bsks, 1); - - cuda_drop_async(d_clears, streams[0], gpu_indexes[0]); -} - -template -__host__ void scratch_cuda_integer_signed_overflowing_add_or_sub_kb( - cudaStream_t const *streams, uint32_t const *gpu_indexes, - uint32_t gpu_count, - int_signed_overflowing_add_or_sub_memory **mem_ptr, - uint32_t num_blocks, SIGNED_OPERATION op, int_radix_params params, - bool allocate_gpu_memory) { - - *mem_ptr = new int_signed_overflowing_add_or_sub_memory( - streams, gpu_indexes, gpu_count, params, num_blocks, op, - allocate_gpu_memory); -} - -/* - * Addition - signed_operation = 1 - * Subtraction - signed_operation = -1 - */ -template -__host__ void host_integer_signed_overflowing_add_or_sub_kb( - cudaStream_t const *streams, uint32_t const *gpu_indexes, - uint32_t gpu_count, Torus *lhs, Torus const *rhs, Torus *overflowed, - SIGNED_OPERATION op, void *const *bsks, uint64_t *const *ksks, - int_signed_overflowing_add_or_sub_memory *mem_ptr, - uint32_t num_blocks) { - - auto 
radix_params = mem_ptr->params; - - uint32_t big_lwe_dimension = radix_params.big_lwe_dimension; - uint32_t big_lwe_size = big_lwe_dimension + 1; - uint32_t big_lwe_size_bytes = big_lwe_size * sizeof(Torus); - - assert(radix_params.message_modulus >= 4 && radix_params.carry_modulus >= 4); - - auto result = mem_ptr->result; - auto neg_rhs = mem_ptr->neg_rhs; - auto input_carries = mem_ptr->input_carries; - auto output_carry = mem_ptr->output_carry; - auto last_block_inner_propagation = mem_ptr->last_block_inner_propagation; - - cuda_memcpy_async_gpu_to_gpu(result, lhs, num_blocks * big_lwe_size_bytes, - streams[0], gpu_indexes[0]); - - // phase 1 - if (op == SIGNED_OPERATION::ADDITION) { - host_addition(streams[0], gpu_indexes[0], result, lhs, rhs, - big_lwe_dimension, num_blocks); - } else { - host_integer_radix_negation( - streams, gpu_indexes, gpu_count, neg_rhs, rhs, big_lwe_dimension, - num_blocks, radix_params.message_modulus, radix_params.carry_modulus); - host_addition(streams[0], gpu_indexes[0], result, lhs, neg_rhs, - big_lwe_dimension, num_blocks); - } - - // phase 2 - for (uint j = 0; j < gpu_count; j++) { - cuda_synchronize_stream(streams[j], gpu_indexes[j]); - } - - host_propagate_single_carry( - mem_ptr->sub_streams_1, gpu_indexes, gpu_count, result, output_carry, - input_carries, mem_ptr->scp_mem, bsks, ksks, num_blocks); - host_generate_last_block_inner_propagation( - mem_ptr->sub_streams_2, gpu_indexes, gpu_count, - last_block_inner_propagation, &lhs[(num_blocks - 1) * big_lwe_size], - &rhs[(num_blocks - 1) * big_lwe_size], mem_ptr->las_block_prop_mem, bsks, - ksks); - - for (uint j = 0; j < mem_ptr->active_gpu_count; j++) { - cuda_synchronize_stream(mem_ptr->sub_streams_1[j], gpu_indexes[j]); - cuda_synchronize_stream(mem_ptr->sub_streams_2[j], gpu_indexes[j]); - } - - // phase 3 - auto input_carry = &input_carries[(num_blocks - 1) * big_lwe_size]; - if (op == SIGNED_OPERATION::SUBTRACTION && num_blocks == 1) { - // Quick fix for the case where the subtraction is done on a single block - Torus *one_scalar = - (Torus *)cuda_malloc_async(sizeof(Torus), streams[0], gpu_indexes[0]); - cuda_set_value_async(streams[0], gpu_indexes[0], one_scalar, 1, 1); - create_trivial_radix( - streams[0], gpu_indexes[0], input_carry, one_scalar, big_lwe_dimension, - 1, 1, radix_params.message_modulus, radix_params.carry_modulus); - cuda_drop_async(one_scalar, streams[0], gpu_indexes[0]); - } - - host_resolve_signed_overflow( - streams, gpu_indexes, gpu_count, overflowed, last_block_inner_propagation, - input_carry, output_carry, mem_ptr->resolve_overflow_mem, bsks, ksks); - - cuda_memcpy_async_gpu_to_gpu(lhs, result, num_blocks * big_lwe_size_bytes, - streams[0], gpu_indexes[0]); -} - -#endif // TFHE_RS_ADDITION_CUH diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cuh index 88d598fef6..097dc47f32 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cuh @@ -295,7 +295,7 @@ __host__ void host_integer_decompress( extracted_lwe = h_mem_ptr->tmp_extracted_lwe; // In the case of extracting a single LWE these parameters are dummy - uint32_t lut_count = 1; + uint32_t num_many_lut = 1; uint32_t lut_stride = 0; /// Apply PBS to apply a LUT, reduce the noise and go from a small LWE /// dimension to a big LWE dimension @@ -311,7 +311,7 @@ __host__ void host_integer_decompress( 
compression_params.small_lwe_dimension, encryption_params.polynomial_size, encryption_params.pbs_base_log, encryption_params.pbs_level, encryption_params.grouping_factor, - num_radix_blocks, encryption_params.pbs_type, lut_count, lut_stride); + num_radix_blocks, encryption_params.pbs_type, num_many_lut, lut_stride); } else { /// For multi GPU execution we create vectors of pointers for inputs and /// outputs @@ -338,7 +338,7 @@ __host__ void host_integer_decompress( compression_params.small_lwe_dimension, encryption_params.polynomial_size, encryption_params.pbs_base_log, encryption_params.pbs_level, encryption_params.grouping_factor, - num_radix_blocks, encryption_params.pbs_type, lut_count, lut_stride); + num_radix_blocks, encryption_params.pbs_type, num_many_lut, lut_stride); /// Copy data back to GPU 0 and release vecs multi_gpu_gather_lwe_async( diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/div_rem.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/div_rem.cuh index 394c816d07..ab574705f6 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/div_rem.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/integer/div_rem.cuh @@ -425,11 +425,24 @@ __host__ void host_unsigned_integer_div_rem_kb( auto do_overflowing_sub = [&](cudaStream_t const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count) { - host_integer_overflowing_sub_kb( - streams, gpu_indexes, gpu_count, new_remainder.data, - subtraction_overflowed.data, merged_interesting_remainder.data, - interesting_divisor.data, bsks, ksks, mem_ptr->overflow_sub_mem, + uint32_t compute_borrow = 1; + uint32_t uses_input_borrow = 0; + auto first_indexes = mem_ptr->first_indexes_for_overflow_sub + [merged_interesting_remainder.len - 1]; + auto second_indexes = mem_ptr->second_indexes_for_overflow_sub + [merged_interesting_remainder.len - 1]; + auto scalar_indexes = + mem_ptr + ->scalars_for_overflow_sub[merged_interesting_remainder.len - 1]; + mem_ptr->overflow_sub_mem->update_lut_indexes( + streams, gpu_indexes, first_indexes, second_indexes, scalar_indexes, merged_interesting_remainder.len); + host_integer_overflowing_sub( + streams, gpu_indexes, gpu_count, new_remainder.data, + (uint64_t *)merged_interesting_remainder.data, + interesting_divisor.data, subtraction_overflowed.data, + (const Torus *)nullptr, mem_ptr->overflow_sub_mem, bsks, ksks, + merged_interesting_remainder.len, compute_borrow, uses_input_borrow); }; // fills: @@ -657,10 +670,12 @@ __host__ void host_integer_div_rem_kb(cudaStream_t const *streams, int_mem_ptr->negated_quotient, quotient, radix_params.big_lwe_dimension, num_blocks, radix_params.message_modulus, radix_params.carry_modulus); - host_propagate_single_carry(int_mem_ptr->sub_streams_1, gpu_indexes, - gpu_count, int_mem_ptr->negated_quotient, - nullptr, nullptr, int_mem_ptr->scp_mem_1, - bsks, ksks, num_blocks); + uint32_t requested_flag = outputFlag::FLAG_NONE; + uint32_t uses_carry = 0; + host_propagate_single_carry( + int_mem_ptr->sub_streams_1, gpu_indexes, gpu_count, + int_mem_ptr->negated_quotient, nullptr, nullptr, int_mem_ptr->scp_mem_1, + bsks, ksks, num_blocks, requested_flag, uses_carry); host_integer_radix_negation(int_mem_ptr->sub_streams_2, gpu_indexes, gpu_count, int_mem_ptr->negated_remainder, @@ -671,7 +686,8 @@ __host__ void host_integer_div_rem_kb(cudaStream_t const *streams, host_propagate_single_carry( int_mem_ptr->sub_streams_2, gpu_indexes, gpu_count, int_mem_ptr->negated_remainder, nullptr, nullptr, - int_mem_ptr->scp_mem_2, bsks, ksks, num_blocks); + int_mem_ptr->scp_mem_2, bsks, 
ksks, num_blocks, requested_flag, + uses_carry); host_integer_radix_cmux_kb( int_mem_ptr->sub_streams_1, gpu_indexes, gpu_count, quotient, diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cu b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cu index 53b1366c37..6d224e64d7 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cu +++ b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cu @@ -1,4 +1,5 @@ #include "integer/integer.cuh" +#include "integer/negation.cuh" #include void cuda_full_propagation_64_inplace(void *const *streams, @@ -49,7 +50,8 @@ void scratch_cuda_propagate_single_carry_kb_64_inplace( uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus, - uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory) { + uint32_t carry_modulus, PBS_TYPE pbs_type, uint32_t requested_flag, + uint32_t uses_carry, bool allocate_gpu_memory) { int_radix_params params(pbs_type, glwe_dimension, polynomial_size, big_lwe_dimension, small_lwe_dimension, ks_level, @@ -59,30 +61,94 @@ void scratch_cuda_propagate_single_carry_kb_64_inplace( scratch_cuda_propagate_single_carry_kb_inplace( (cudaStream_t *)(streams), gpu_indexes, gpu_count, (int_sc_prop_memory **)mem_ptr, num_blocks, params, - allocate_gpu_memory); + requested_flag, uses_carry, allocate_gpu_memory); +} + +void scratch_cuda_add_and_propagate_single_carry_kb_64_inplace( + void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count, + int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size, + uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level, + uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log, + uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus, + uint32_t carry_modulus, PBS_TYPE pbs_type, uint32_t requested_flag, + uint32_t uses_carry, bool allocate_gpu_memory) { + + int_radix_params params(pbs_type, glwe_dimension, polynomial_size, + big_lwe_dimension, small_lwe_dimension, ks_level, + ks_base_log, pbs_level, pbs_base_log, grouping_factor, + message_modulus, carry_modulus); + + scratch_cuda_propagate_single_carry_kb_inplace( + (cudaStream_t *)(streams), gpu_indexes, gpu_count, + (int_sc_prop_memory **)mem_ptr, num_blocks, params, + requested_flag, uses_carry, allocate_gpu_memory); +} + +void scratch_cuda_integer_overflowing_sub_kb_64_inplace( + void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count, + int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size, + uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level, + uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log, + uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus, + uint32_t carry_modulus, PBS_TYPE pbs_type, uint32_t compute_overflow, + bool allocate_gpu_memory) { + + int_radix_params params(pbs_type, glwe_dimension, polynomial_size, + big_lwe_dimension, small_lwe_dimension, ks_level, + ks_base_log, pbs_level, pbs_base_log, grouping_factor, + message_modulus, carry_modulus); + + scratch_cuda_integer_overflowing_sub( + (cudaStream_t *)(streams), gpu_indexes, gpu_count, + (int_borrow_prop_memory **)mem_ptr, num_blocks, params, + compute_overflow, allocate_gpu_memory); } void cuda_propagate_single_carry_kb_64_inplace( void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count, - void *lwe_array, void *carry_out, 
int8_t *mem_ptr, void *const *bsks, - void *const *ksks, uint32_t num_blocks) { + void *lwe_array, void *carry_out, const void *carry_in, int8_t *mem_ptr, + void *const *bsks, void *const *ksks, uint32_t num_blocks, + uint32_t requested_flag, uint32_t uses_carry) { + host_propagate_single_carry( (cudaStream_t *)(streams), gpu_indexes, gpu_count, static_cast(lwe_array), static_cast(carry_out), - nullptr, (int_sc_prop_memory *)mem_ptr, bsks, - (uint64_t **)(ksks), num_blocks); + static_cast(carry_in), + (int_sc_prop_memory *)mem_ptr, bsks, (uint64_t **)(ksks), + num_blocks, requested_flag, uses_carry); } -void cuda_propagate_single_carry_get_input_carries_kb_64_inplace( +void cuda_add_and_propagate_single_carry_kb_64_inplace( void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count, - void *lwe_array, void *carry_out, void *input_carries, int8_t *mem_ptr, - void *const *bsks, void *const *ksks, uint32_t num_blocks) { - host_propagate_single_carry( + void *lhs_array, const void *rhs_array, void *carry_out, + const void *carry_in, int8_t *mem_ptr, void *const *bsks, void *const *ksks, + uint32_t num_blocks, uint32_t requested_flag, uint32_t uses_carry) { + + host_add_and_propagate_single_carry( (cudaStream_t *)(streams), gpu_indexes, gpu_count, - static_cast(lwe_array), static_cast(carry_out), - static_cast(input_carries), + static_cast(lhs_array), + static_cast(rhs_array), + static_cast(carry_out), + static_cast(carry_in), (int_sc_prop_memory *)mem_ptr, bsks, (uint64_t **)(ksks), - num_blocks); + num_blocks, requested_flag, uses_carry); +} + +void cuda_integer_overflowing_sub_kb_64_inplace( + void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count, + void *lhs_array, const void *rhs_array, void *overflow_block, + const void *input_borrow, int8_t *mem_ptr, void *const *bsks, + void *const *ksks, uint32_t num_blocks, uint32_t compute_overflow, + uint32_t uses_input_borrow) { + + host_integer_overflowing_sub( + (cudaStream_t const *)streams, gpu_indexes, gpu_count, + static_cast(lhs_array), static_cast(lhs_array), + static_cast(rhs_array), + static_cast(overflow_block), + static_cast(input_borrow), + (int_borrow_prop_memory *)mem_ptr, bsks, (uint64_t **)ksks, + num_blocks, compute_overflow, uses_input_borrow); } void cleanup_cuda_propagate_single_carry(void *const *streams, @@ -94,6 +160,23 @@ void cleanup_cuda_propagate_single_carry(void *const *streams, mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count); } +void cleanup_cuda_add_and_propagate_single_carry(void *const *streams, + uint32_t const *gpu_indexes, + uint32_t gpu_count, + int8_t **mem_ptr_void) { + int_sc_prop_memory *mem_ptr = + (int_sc_prop_memory *)(*mem_ptr_void); + mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count); +} +void cleanup_cuda_integer_overflowing_sub(void *const *streams, + uint32_t const *gpu_indexes, + uint32_t gpu_count, + int8_t **mem_ptr_void) { + int_borrow_prop_memory *mem_ptr = + (int_borrow_prop_memory *)(*mem_ptr_void); + mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count); +} + void scratch_cuda_apply_univariate_lut_kb_64( void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr, void const *input_lut, uint32_t lwe_dimension, @@ -142,14 +225,14 @@ void cuda_apply_many_univariate_lut_kb_64( void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count, void *output_radix_lwe, void const *input_radix_lwe, int8_t *mem_ptr, void *const *ksks, void *const *bsks, uint32_t num_blocks, - uint32_t 
lut_count, uint32_t lut_stride) { + uint32_t num_many_lut, uint32_t lut_stride) { host_apply_many_univariate_lut_kb( (cudaStream_t *)(streams), gpu_indexes, gpu_count, static_cast(output_radix_lwe), static_cast(input_radix_lwe), (int_radix_lut *)mem_ptr, (uint64_t **)(ksks), bsks, num_blocks, - lut_count, lut_stride); + num_many_lut, lut_stride); } void scratch_cuda_apply_bivariate_lut_kb_64( diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh index 28993ed406..51cedfd668 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh @@ -8,6 +8,7 @@ #include "integer/scalar_addition.cuh" #include "linear_algebra.h" #include "linearalgebra/addition.cuh" +#include "linearalgebra/negation.cuh" #include "pbs/programmable_bootstrap.h" #include "polynomial/functions.cuh" #include "utils/helper.cuh" @@ -80,6 +81,7 @@ host_radix_blocks_rotate_right(cudaStream_t const *streams, cudaSetDevice(gpu_indexes[0]); radix_blocks_rotate_right<<>>( dst, src, value, blocks_count, lwe_size); + check_cuda_error(cudaGetLastError()); } // rotate radix ciphertext left with specific value @@ -97,6 +99,7 @@ host_radix_blocks_rotate_left(cudaStream_t const *streams, cudaSetDevice(gpu_indexes[0]); radix_blocks_rotate_left<<>>( dst, src, value, blocks_count, lwe_size); + check_cuda_error(cudaGetLastError()); } // reverse the blocks in a list @@ -126,6 +129,138 @@ host_radix_blocks_reverse_inplace(cudaStream_t const *streams, int num_blocks = blocks_count / 2, num_threads = 1024; radix_blocks_reverse_lwe_inplace <<>>(src, blocks_count, lwe_size); + check_cuda_error(cudaGetLastError()); +} + +// If group_size = 4, the first group of 4 elements will be transformed as +// follows: +// dest[0] = src[0] +// dest[1] = src[0] + src[1] +// dest[2] = src[0] + src[1] + src[2] +// dest[3] = src[0] + src[1] + src[2] + src[3] +template +__global__ void +radix_cumulative_sum_in_groups(Torus *dest, Torus *src, uint32_t blocks_count, + uint32_t lwe_size, uint32_t group_size) { + + size_t block_offset = blockIdx.x * group_size * lwe_size; + + for (int j = threadIdx.x; j < lwe_size; j += blockDim.x) { + size_t idx = j + block_offset; + Torus sum = src[idx]; + dest[idx] = sum; + for (int gidx = 1; gidx < group_size; gidx++) { + if (gidx + blockIdx.x * group_size < + blocks_count) { // in case the last group is not full + sum += src[idx + gidx * lwe_size]; + dest[idx + gidx * lwe_size] = sum; + } + } + } +} + +template +__host__ void host_radix_cumulative_sum_in_groups( + cudaStream_t stream, uint32_t gpu_index, Torus *dest, Torus *src, + uint32_t radix_blocks_count, uint32_t lwe_size, uint32_t group_size) { + cudaSetDevice(gpu_index); + // Each CUDA block is responsible for a single group + int num_blocks = (radix_blocks_count + group_size - 1) / group_size, + num_threads = 512; + radix_cumulative_sum_in_groups<<>>( + dest, src, radix_blocks_count, lwe_size, group_size); + check_cuda_error(cudaGetLastError()); +} + +template +__global__ void radix_split_simulators_and_grouping_pgns( + Torus *simulators, Torus *grouping_pgns, Torus *src, uint32_t blocks_count, + uint32_t lwe_size, uint32_t group_size, Torus delta) { + + size_t block_offset = blockIdx.x * lwe_size; + if (blockIdx.x % group_size == 0) { + if (blockIdx.x == 0) { + // save trivial 0 + for (int j = threadIdx.x; j < lwe_size; j += blockDim.x) { + simulators[j] = 0; + } + } else { + // save trivial 1 + for (int j = threadIdx.x; j < 
lwe_size - 1; j += blockDim.x) { + size_t simu_idx = j + block_offset; + simulators[simu_idx] = 0; + } + if (threadIdx.x == 0) { + simulators[lwe_size - 1 + block_offset] = 1 * delta; + } + } + + if ((blockIdx.x / group_size + 1) < + (blocks_count + group_size - 1) / group_size) { + size_t src_offset = (blockIdx.x + group_size - 1) * lwe_size; + size_t pgns_offset = (blockIdx.x / group_size) * lwe_size; + for (int j = threadIdx.x; j < lwe_size; j += blockDim.x) { + size_t in_offset = j + src_offset; + size_t out_offset = j + pgns_offset; + grouping_pgns[out_offset] = src[in_offset]; + } + } + } else { + // save simulators + size_t src_offset = (blockIdx.x - 1) * lwe_size; + for (int j = threadIdx.x; j < lwe_size; j += blockDim.x) { + simulators[j + block_offset] = src[j + src_offset]; + } + } +} + +template +__host__ void host_radix_split_simulators_and_grouping_pgns( + cudaStream_t stream, uint32_t gpu_index, Torus *simulators, + Torus *grouping_pgns, Torus *src, uint32_t radix_blocks_count, + uint32_t lwe_size, uint32_t group_size, Torus delta) { + cudaSetDevice(gpu_index); + // Each CUDA block is responsible for a single group + int num_blocks = radix_blocks_count, num_threads = 512; + radix_split_simulators_and_grouping_pgns + <<>>(simulators, grouping_pgns, src, + radix_blocks_count, lwe_size, + group_size, delta); + check_cuda_error(cudaGetLastError()); +} + +// If group_size = 4, the first group of 4 elements will be transformed as +// follows: +// src1 size num_radix_blocks * lwe_size +// src2 size num_group * lwe_size +// dest[0] = src1[0] + src2[0] +// dest[1] = src1[1] + src2[0] +// dest[2] = src1[2] + src2[0] +// dest[3] = src1[3] + src2[0] +template +__global__ void radix_sum_in_groups(Torus *dest, Torus *src1, Torus *src2, + uint32_t blocks_count, uint32_t lwe_size, + uint32_t group_size) { + + size_t src1_offset = blockIdx.x * lwe_size; + size_t src2_index = (blockIdx.x / group_size) * lwe_size; + for (int j = threadIdx.x; j < lwe_size; j += blockDim.x) { + size_t idx = j + src1_offset; + dest[idx] = src1[idx] + src2[j + src2_index]; + } +} + +template +__host__ void host_radix_sum_in_groups(cudaStream_t stream, uint32_t gpu_index, + Torus *dest, Torus *src1, Torus *src2, + uint32_t radix_blocks_count, + uint32_t lwe_size, uint32_t group_size) { + cudaSetDevice(gpu_index); + + int num_blocks = radix_blocks_count, num_threads = 512; + radix_sum_in_groups<<>>( + dest, src1, src2, radix_blocks_count, lwe_size, group_size); + check_cuda_error(cudaGetLastError()); } // polynomial_size threads @@ -238,7 +373,7 @@ __host__ void integer_radix_apply_univariate_lookup_table_kb( auto grouping_factor = params.grouping_factor; // In the case of extracting a single LWE this parameters are dummy - uint32_t lut_count = 1; + uint32_t num_many_lut = 1; uint32_t lut_stride = 0; /// For multi GPU execution we create vectors of pointers for inputs and /// outputs @@ -262,7 +397,7 @@ __host__ void integer_radix_apply_univariate_lookup_table_kb( lut->lut_vec, lut->lut_indexes_vec, lwe_after_ks_vec[0], lwe_trivial_indexes_vec[0], bsks, lut->buffer, glwe_dimension, small_lwe_dimension, polynomial_size, pbs_base_log, pbs_level, - grouping_factor, num_radix_blocks, pbs_type, lut_count, lut_stride); + grouping_factor, num_radix_blocks, pbs_type, num_many_lut, lut_stride); } else { /// Make sure all data that should be on GPU 0 is indeed there cuda_synchronize_stream(streams[0], gpu_indexes[0]); @@ -288,7 +423,7 @@ __host__ void integer_radix_apply_univariate_lookup_table_kb( lwe_trivial_indexes_vec, 
lut->lut_vec, lut->lut_indexes_vec, lwe_after_ks_vec, lwe_trivial_indexes_vec, bsks, lut->buffer, glwe_dimension, small_lwe_dimension, polynomial_size, pbs_base_log, - pbs_level, grouping_factor, num_radix_blocks, pbs_type, lut_count, + pbs_level, grouping_factor, num_radix_blocks, pbs_type, num_many_lut, lut_stride); /// Copy data back to GPU 0 and release vecs @@ -310,7 +445,7 @@ __host__ void integer_radix_apply_many_univariate_lookup_table_kb( cudaStream_t const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count, Torus *lwe_array_out, Torus const *lwe_array_in, void *const *bsks, Torus *const *ksks, uint32_t num_radix_blocks, - int_radix_lut *lut, uint32_t lut_count, uint32_t lut_stride) { + int_radix_lut *lut, uint32_t num_many_lut, uint32_t lut_stride) { // apply_lookup_table auto params = lut->params; auto pbs_type = params.pbs_type; @@ -346,7 +481,7 @@ __host__ void integer_radix_apply_many_univariate_lookup_table_kb( lut->lut_vec, lut->lut_indexes_vec, lwe_after_ks_vec[0], lwe_trivial_indexes_vec[0], bsks, lut->buffer, glwe_dimension, small_lwe_dimension, polynomial_size, pbs_base_log, pbs_level, - grouping_factor, num_radix_blocks, pbs_type, lut_count, lut_stride); + grouping_factor, num_radix_blocks, pbs_type, num_many_lut, lut_stride); } else { /// Make sure all data that should be on GPU 0 is indeed there cuda_synchronize_stream(streams[0], gpu_indexes[0]); @@ -372,15 +507,15 @@ __host__ void integer_radix_apply_many_univariate_lookup_table_kb( lwe_trivial_indexes_vec, lut->lut_vec, lut->lut_indexes_vec, lwe_after_ks_vec, lwe_trivial_indexes_vec, bsks, lut->buffer, glwe_dimension, small_lwe_dimension, polynomial_size, pbs_base_log, - pbs_level, grouping_factor, num_radix_blocks, pbs_type, lut_count, + pbs_level, grouping_factor, num_radix_blocks, pbs_type, num_many_lut, lut_stride); /// Copy data back to GPU 0 and release vecs - multi_gpu_gather_lwe_async(streams, gpu_indexes, active_gpu_count, - lwe_array_out, lwe_after_pbs_vec, - lut->h_lwe_indexes_out, - lut->using_trivial_lwe_indexes, - num_radix_blocks, big_lwe_dimension + 1); + multi_gpu_gather_many_lut_lwe_async( + streams, gpu_indexes, active_gpu_count, lwe_array_out, + lwe_after_pbs_vec, lut->h_lwe_indexes_out, + lut->using_trivial_lwe_indexes, num_radix_blocks, big_lwe_dimension + 1, + num_many_lut); /// Synchronize all GPUs for (uint i = 0; i < active_gpu_count; i++) { @@ -409,7 +544,7 @@ __host__ void integer_radix_apply_bivariate_lookup_table_kb( auto grouping_factor = params.grouping_factor; // In the case of extracting a single LWE this parameters are dummy - uint32_t lut_count = 1; + uint32_t num_many_lut = 1; uint32_t lut_stride = 0; // Left message is shifted @@ -442,7 +577,7 @@ __host__ void integer_radix_apply_bivariate_lookup_table_kb( lut->lut_vec, lut->lut_indexes_vec, lwe_after_ks_vec[0], lwe_trivial_indexes_vec[0], bsks, lut->buffer, glwe_dimension, small_lwe_dimension, polynomial_size, pbs_base_log, pbs_level, - grouping_factor, num_radix_blocks, pbs_type, lut_count, lut_stride); + grouping_factor, num_radix_blocks, pbs_type, num_many_lut, lut_stride); } else { cuda_synchronize_stream(streams[0], gpu_indexes[0]); multi_gpu_scatter_lwe_async( @@ -464,7 +599,7 @@ __host__ void integer_radix_apply_bivariate_lookup_table_kb( lwe_trivial_indexes_vec, lut->lut_vec, lut->lut_indexes_vec, lwe_after_ks_vec, lwe_trivial_indexes_vec, bsks, lut->buffer, glwe_dimension, small_lwe_dimension, polynomial_size, pbs_base_log, - pbs_level, grouping_factor, num_radix_blocks, pbs_type, lut_count, + pbs_level, 
grouping_factor, num_radix_blocks, pbs_type, num_many_lut, lut_stride); /// Copy data back to GPU 0 and release vecs @@ -525,6 +660,48 @@ void generate_lookup_table(Torus *acc, uint32_t glwe_dimension, rotate_left(body, half_box_size, polynomial_size); } +template +void generate_many_lookup_table( + Torus *acc, uint32_t glwe_dimension, uint32_t polynomial_size, + uint32_t message_modulus, uint32_t carry_modulus, + std::vector> &functions) { + + uint32_t modulus_sup = message_modulus * carry_modulus; + uint32_t box_size = polynomial_size / modulus_sup; + Torus delta = (1ul << 63) / modulus_sup; + + memset(acc, 0, glwe_dimension * polynomial_size * sizeof(Torus)); + + auto body = &acc[glwe_dimension * polynomial_size]; + + size_t fn_counts = functions.size(); + + assert(fn_counts <= modulus_sup / 2); + + // Space used for each sub lut + uint32_t single_function_sub_lut_size = (modulus_sup / fn_counts) * box_size; + + // This accumulator extracts the carry bits + for (int f = 0; f < fn_counts; f++) { + int lut_offset = f * single_function_sub_lut_size; + for (int i = 0; i < modulus_sup / fn_counts; i++) { + int index = i * box_size + lut_offset; + for (int j = index; j < index + box_size; j++) { + auto f_eval = functions[f](i); + body[j] = f_eval * delta; + } + } + } + int half_box_size = box_size / 2; + + // Negate the first half_box_size coefficients + for (int i = 0; i < half_box_size; i++) { + body[i] = -body[i]; + } + + rotate_left(body, half_box_size, polynomial_size); +} + template void generate_lookup_table_bivariate(Torus *acc, uint32_t glwe_dimension, uint32_t polynomial_size, @@ -658,16 +835,145 @@ void generate_device_accumulator(cudaStream_t stream, uint32_t gpu_index, free(h_lut); } +/* + * generate many lut accumulator for device pointer + * v_stream - cuda stream + * acc - device pointer for accumulator + * ... + * vector - evaluating functions with one Torus input + */ template -void scratch_cuda_propagate_single_carry_kb_inplace( +void generate_many_lut_device_accumulator( + cudaStream_t stream, uint32_t gpu_index, Torus *acc, + uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t message_modulus, + uint32_t carry_modulus, + std::vector> &functions) { + + // host lut + Torus *h_lut = + (Torus *)malloc((glwe_dimension + 1) * polynomial_size * sizeof(Torus)); + + // fill accumulator + generate_many_lookup_table(h_lut, glwe_dimension, polynomial_size, + message_modulus, carry_modulus, functions); + + // copy host lut and lut_indexes_vec to device + cuda_memcpy_async_to_gpu( + acc, h_lut, (glwe_dimension + 1) * polynomial_size * sizeof(Torus), + stream, gpu_index); + + cuda_synchronize_stream(stream, gpu_index); + free(h_lut); +} + +// This function is used to perform step 1 of Thomas' new carry propagation +// algorithm It uses a many lut to calculate two luts in parallel +// shifted_blocks: contains (block % message modulus) << 1 +// block states: contains the propagation states for the different blocks +// depending on the group it belongs to and the internal position within the +// block. 
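+// A single many-LUT PBS produces both outputs back to back in
+// shifted_blocks_and_states: the first num_radix_blocks ciphertexts are copied
+// into block_states and the next num_radix_blocks into shifted_blocks (see the
+// two device-to-device copies below).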
+template +void host_compute_shifted_blocks_and_states( cudaStream_t const *streams, uint32_t const *gpu_indexes, - uint32_t gpu_count, int_sc_prop_memory **mem_ptr, - uint32_t num_radix_blocks, int_radix_params params, - bool allocate_gpu_memory) { + uint32_t gpu_count, Torus *lwe_array, int_radix_params params, + int_shifted_blocks_and_states_memory *mem, void *const *bsks, + Torus *const *ksks, uint32_t num_radix_blocks, uint32_t lut_stride, + uint32_t num_many_lut) { + + auto glwe_dimension = params.glwe_dimension; + auto polynomial_size = params.polynomial_size; + uint32_t big_lwe_size = glwe_dimension * polynomial_size + 1; + auto big_lwe_size_bytes = big_lwe_size * sizeof(Torus); + + auto shifted_blocks_and_states = mem->shifted_blocks_and_states; + auto luts_array_first_step = mem->luts_array_first_step; + + integer_radix_apply_many_univariate_lookup_table_kb( + streams, gpu_indexes, gpu_count, shifted_blocks_and_states, lwe_array, + bsks, ksks, num_radix_blocks, luts_array_first_step, num_many_lut, + lut_stride); + + auto shifted_blocks = mem->shifted_blocks; + auto block_states = mem->block_states; + cuda_memcpy_async_gpu_to_gpu(block_states, shifted_blocks_and_states, + big_lwe_size_bytes * num_radix_blocks, + streams[0], gpu_indexes[0]); + cuda_memcpy_async_gpu_to_gpu( + shifted_blocks, + shifted_blocks_and_states + big_lwe_size * num_radix_blocks, + big_lwe_size_bytes * num_radix_blocks, streams[0], gpu_indexes[0]); +} + +template +void host_resolve_group_carries_sequentially( + cudaStream_t const *streams, uint32_t const *gpu_indexes, + uint32_t gpu_count, Torus *resolved_carries, Torus *grouping_pgns, + int_radix_params params, int_seq_group_prop_memory *mem, + void *const *bsks, Torus *const *ksks, uint32_t num_groups) { + + auto glwe_dimension = params.glwe_dimension; + auto polynomial_size = params.polynomial_size; + uint32_t big_lwe_size = glwe_dimension * polynomial_size + 1; + auto big_lwe_size_bytes = big_lwe_size * sizeof(Torus); - *mem_ptr = - new int_sc_prop_memory(streams, gpu_indexes, gpu_count, params, - num_radix_blocks, allocate_gpu_memory); + auto group_resolved_carries = mem->group_resolved_carries; + if (num_groups > 1) { + // First carry is just copied + cuda_memcpy_async_gpu_to_gpu(resolved_carries + big_lwe_size, grouping_pgns, + big_lwe_size_bytes, streams[0], + gpu_indexes[0]); + uint32_t solve_per_iter = mem->grouping_size - 1; + uint32_t remaining_carries = + num_groups - + 2; // the first one has been resolved and we ignore the last one + uint32_t num_loops = + ceil(double(remaining_carries) / (double)(solve_per_iter)); + uint32_t last_resolved_pos = 1; + + for (int i = 0; i < num_loops; i++) { + uint32_t loop_offset = i * solve_per_iter; + uint32_t blocks_to_solve = solve_per_iter; + // In case the last iteration has to solve less + if (loop_offset + blocks_to_solve > num_groups - 2) { + blocks_to_solve = remaining_carries - loop_offset; + } + + // The group_resolved carries is used as an intermediate array + // First we need to copy the last resolved carry + cuda_memcpy_async_gpu_to_gpu( + group_resolved_carries, + resolved_carries + last_resolved_pos * big_lwe_size, + big_lwe_size_bytes, streams[0], gpu_indexes[0]); + + // The array is filled with the blocks_to_solve + cuda_memcpy_async_gpu_to_gpu( + group_resolved_carries + big_lwe_size, + grouping_pgns + last_resolved_pos * big_lwe_size, + blocks_to_solve * big_lwe_size_bytes, streams[0], gpu_indexes[0]); + + // Perform one group cumulative sum + host_radix_cumulative_sum_in_groups( + 
streams[0], gpu_indexes[0], group_resolved_carries, + group_resolved_carries, blocks_to_solve + 1, big_lwe_size, + mem->grouping_size); + + // Apply the lut + auto luts_sequential = mem->lut_sequential_algorithm; + integer_radix_apply_univariate_lookup_table_kb( + streams, gpu_indexes, gpu_count, + group_resolved_carries + big_lwe_size, + group_resolved_carries + big_lwe_size, bsks, ksks, blocks_to_solve, + luts_sequential); + + // Copy the result to the resolved carries array + cuda_memcpy_async_gpu_to_gpu( + resolved_carries + (last_resolved_pos + 1) * big_lwe_size, + group_resolved_carries + big_lwe_size, + blocks_to_solve * big_lwe_size_bytes, streams[0], gpu_indexes[0]); + + last_resolved_pos += blocks_to_solve; + } + } } template @@ -675,26 +981,26 @@ void host_compute_prefix_sum_hillis_steele( cudaStream_t const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count, Torus *step_output, Torus *generates_or_propagates, int_radix_params params, int_radix_lut *luts, void *const *bsks, - Torus *const *ksks, uint32_t num_blocks) { + Torus *const *ksks, uint32_t num_radix_blocks) { auto glwe_dimension = params.glwe_dimension; auto polynomial_size = params.polynomial_size; auto big_lwe_size = glwe_dimension * polynomial_size + 1; auto big_lwe_size_bytes = big_lwe_size * sizeof(Torus); - int num_steps = ceil(log2((double)num_blocks)); + int num_steps = ceil(log2((double)num_radix_blocks)); int space = 1; cuda_memcpy_async_gpu_to_gpu(step_output, generates_or_propagates, - big_lwe_size_bytes * num_blocks, streams[0], - gpu_indexes[0]); + big_lwe_size_bytes * num_radix_blocks, + streams[0], gpu_indexes[0]); for (int step = 0; step < num_steps; step++) { - if (space > num_blocks - 1) + if (space > num_radix_blocks - 1) PANIC("Cuda error: step output is going out of bounds in Hillis Steele " "propagation") auto cur_blocks = &step_output[space * big_lwe_size]; auto prev_blocks = generates_or_propagates; - int cur_total_blocks = num_blocks - space; + int cur_total_blocks = num_radix_blocks - space; integer_radix_apply_bivariate_lookup_table_kb( streams, gpu_indexes, gpu_count, cur_blocks, cur_blocks, prev_blocks, @@ -707,14 +1013,116 @@ void host_compute_prefix_sum_hillis_steele( } } +// This function is used to perform step 2 of Thomas' new propagation algorithm +// Consist three steps: +// - propagates the carry within each group with cheap LWE operations stored in +// simulators +// - calculates the propagation state of each group +// - resolves the carries between groups, either sequentially or with hillis +// steele template -void host_propagate_single_carry(cudaStream_t const *streams, - uint32_t const *gpu_indexes, - uint32_t gpu_count, Torus *lwe_array, - Torus *carry_out, Torus *input_carries, - int_sc_prop_memory *mem, - void *const *bsks, Torus *const *ksks, - uint32_t num_blocks) { +void host_compute_propagation_simulators_and_group_carries( + cudaStream_t const *streams, uint32_t const *gpu_indexes, + uint32_t gpu_count, Torus *block_states, int_radix_params params, + int_prop_simu_group_carries_memory *mem, void *const *bsks, + Torus *const *ksks, uint32_t num_radix_blocks, uint32_t num_groups) { + + auto glwe_dimension = params.glwe_dimension; + auto polynomial_size = params.polynomial_size; + auto message_modulus = params.message_modulus; + auto carry_modulus = params.carry_modulus; + + uint32_t big_lwe_size = glwe_dimension * polynomial_size + 1; + + auto propagation_cum_sums = mem->propagation_cum_sums; + auto group_size = mem->group_size; + 
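+  // Within each group the block states are accumulated, the second-step LUTs
+  // and a per-position scalar correction are applied, and the result is split
+  // into per-block simulators and per-group propagation states (grouping_pgns)
+  // before the group carries are resolved below.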
host_radix_cumulative_sum_in_groups( + streams[0], gpu_indexes[0], propagation_cum_sums, block_states, + num_radix_blocks, big_lwe_size, group_size); + + auto luts_array_second_step = mem->luts_array_second_step; + integer_radix_apply_univariate_lookup_table_kb( + streams, gpu_indexes, gpu_count, propagation_cum_sums, + propagation_cum_sums, bsks, ksks, num_radix_blocks, + luts_array_second_step); + + auto scalar_array_cum_sum = mem->scalar_array_cum_sum; + auto big_lwe_dimension = big_lwe_size - 1; + + host_integer_radix_scalar_addition_inplace( + streams, gpu_indexes, gpu_count, propagation_cum_sums, + scalar_array_cum_sum, big_lwe_dimension, num_radix_blocks, + message_modulus, carry_modulus); + + uint32_t modulus_sup = message_modulus * carry_modulus; + Torus delta = (1ull << 63) / modulus_sup; + auto simulators = mem->simulators; + auto grouping_pgns = mem->grouping_pgns; + host_radix_split_simulators_and_grouping_pgns( + streams[0], gpu_indexes[0], simulators, grouping_pgns, + propagation_cum_sums, num_radix_blocks, big_lwe_size, group_size, delta); + + auto resolved_carries = mem->resolved_carries; + if (mem->use_sequential_algorithm_to_resolver_group_carries) { + // Resolve group carries sequentially + host_resolve_group_carries_sequentially( + streams, gpu_indexes, gpu_count, resolved_carries, grouping_pgns, + params, mem->seq_group_prop_mem, bsks, ksks, num_groups); + } else { + // Resolve group carries with hillis steele + auto luts_carry_propagation_sum = mem->hs_group_prop_mem->lut_hillis_steele; + host_compute_prefix_sum_hillis_steele( + streams, gpu_indexes, gpu_count, &resolved_carries[big_lwe_size], + grouping_pgns, params, luts_carry_propagation_sum, bsks, ksks, + num_groups - 1); + } +} +// This function is used to perform step 1 of Thomas' new borrow propagation +// algorithm It uses a many lut to calculate two luts in parallel +// shifted_blocks: contains (block % message modulus) << 1 +// block states: contains the propagation states for the different blocks +// depending on the group it belongs to and the internal position within the +// block. 
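+// This mirrors host_compute_shifted_blocks_and_states: one many-LUT PBS
+// produces both outputs, which are then split into borrow_states (first half)
+// and shifted_blocks (second half) with two device-to-device copies.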
+template +void host_compute_shifted_blocks_and_borrow_states( + cudaStream_t const *streams, uint32_t const *gpu_indexes, + uint32_t gpu_count, Torus *lwe_array, int_radix_params params, + int_shifted_blocks_and_borrow_states_memory *mem, void *const *bsks, + Torus *const *ksks, uint32_t num_radix_blocks, uint32_t lut_stride, + uint32_t num_many_lut) { + + auto glwe_dimension = params.glwe_dimension; + auto polynomial_size = params.polynomial_size; + uint32_t big_lwe_size = glwe_dimension * polynomial_size + 1; + auto big_lwe_size_bytes = big_lwe_size * sizeof(Torus); + + auto shifted_blocks_and_borrow_states = mem->shifted_blocks_and_borrow_states; + auto luts_array_first_step = mem->luts_array_first_step; + + integer_radix_apply_many_univariate_lookup_table_kb( + streams, gpu_indexes, gpu_count, shifted_blocks_and_borrow_states, + lwe_array, bsks, ksks, num_radix_blocks, luts_array_first_step, + num_many_lut, lut_stride); + + auto shifted_blocks = mem->shifted_blocks; + auto borrow_states = mem->borrow_states; + cuda_memcpy_async_gpu_to_gpu(borrow_states, shifted_blocks_and_borrow_states, + big_lwe_size_bytes * num_radix_blocks, + streams[0], gpu_indexes[0]); + cuda_memcpy_async_gpu_to_gpu( + shifted_blocks, + shifted_blocks_and_borrow_states + big_lwe_size * num_radix_blocks, + big_lwe_size_bytes * num_radix_blocks, streams[0], gpu_indexes[0]); +} + +template +void host_legacy_propagate_single_carry(cudaStream_t const *streams, + uint32_t const *gpu_indexes, + uint32_t gpu_count, Torus *lwe_array, + Torus *carry_out, Torus *input_carries, + int_legacy_sc_prop_memory *mem, + void *const *bsks, Torus *const *ksks, + uint32_t num_blocks) { auto params = mem->params; if (params.message_modulus == 2) PANIC("Cuda error: single carry propagation is not supported for 1 bit " @@ -848,7 +1256,7 @@ void host_full_propagate_inplace(cudaStream_t const *streams, int small_lwe_size = (params.small_lwe_dimension + 1); // In the case of extracting a single LWE this parameters are dummy - uint32_t lut_count = 1; + uint32_t num_many_lut = 1; uint32_t lut_stride = 0; for (int i = 0; i < num_blocks; i++) { auto cur_input_block = &input_blocks[i * big_lwe_size]; @@ -872,7 +1280,7 @@ void host_full_propagate_inplace(cudaStream_t const *streams, mem_ptr->lut->lwe_trivial_indexes, bsks, mem_ptr->lut->buffer, params.glwe_dimension, params.small_lwe_dimension, params.polynomial_size, params.pbs_base_log, params.pbs_level, - params.grouping_factor, 2, params.pbs_type, lut_count, lut_stride); + params.grouping_factor, 2, params.pbs_type, num_many_lut, lut_stride); cuda_memcpy_async_gpu_to_gpu( (void *)cur_input_block, mem_ptr->tmp_big_lwe_vector, @@ -952,6 +1360,7 @@ __host__ void pack_blocks(cudaStream_t stream, uint32_t gpu_index, getNumBlocksAndThreads(num_entries, 1024, num_blocks, num_threads); device_pack_blocks<<>>( lwe_array_out, lwe_array_in, lwe_dimension, num_radix_blocks, factor); + check_cuda_error(cudaGetLastError()); } template @@ -1155,11 +1564,11 @@ void host_apply_many_univariate_lut_kb( cudaStream_t const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count, Torus *radix_lwe_out, Torus const *radix_lwe_in, int_radix_lut *mem, Torus *const *ksks, void *const *bsks, - uint32_t num_blocks, uint32_t lut_count, uint32_t lut_stride) { + uint32_t num_blocks, uint32_t num_many_lut, uint32_t lut_stride) { integer_radix_apply_many_univariate_lookup_table_kb( streams, gpu_indexes, gpu_count, radix_lwe_out, radix_lwe_in, bsks, ksks, - num_blocks, mem, lut_count, lut_stride); + num_blocks, mem, 
num_many_lut, lut_stride); } template @@ -1192,4 +1601,371 @@ void host_apply_bivariate_lut_kb( radix_lwe_in_2, bsks, ksks, num_blocks, mem, shift); } +template +void scratch_cuda_propagate_single_carry_kb_inplace( + cudaStream_t const *streams, uint32_t const *gpu_indexes, + uint32_t gpu_count, int_sc_prop_memory **mem_ptr, + uint32_t num_radix_blocks, int_radix_params params, uint32_t requested_flag, + uint32_t uses_carry, bool allocate_gpu_memory) { + + *mem_ptr = new int_sc_prop_memory( + streams, gpu_indexes, gpu_count, params, num_radix_blocks, requested_flag, + uses_carry, allocate_gpu_memory); +} +// This function perform the three steps of Thomas' new carry propagation +// includes the logic to extract overflow when requested +template +void host_propagate_single_carry(cudaStream_t const *streams, + uint32_t const *gpu_indexes, + uint32_t gpu_count, Torus *lwe_array, + Torus *carry_out, const Torus *input_carries, + int_sc_prop_memory *mem, + void *const *bsks, Torus *const *ksks, + uint32_t num_radix_blocks, + uint32_t requested_flag, uint32_t uses_carry) { + auto params = mem->params; + auto glwe_dimension = params.glwe_dimension; + auto polynomial_size = params.polynomial_size; + auto message_modulus = params.message_modulus; + auto carry_modulus = params.carry_modulus; + uint32_t big_lwe_size = glwe_dimension * polynomial_size + 1; + auto big_lwe_size_bytes = big_lwe_size * sizeof(Torus); + auto big_lwe_dimension = big_lwe_size - 1; // For host addition + auto lut_stride = mem->lut_stride; + auto num_many_lut = mem->num_many_lut; + if (requested_flag == outputFlag::FLAG_OVERFLOW) + PANIC("Cuda error: single carry propagation is not supported for overflow, " + "try using add_and_propagate_single_carry"); + if (uses_carry == 1) { + host_addition(streams[0], gpu_indexes[0], lwe_array, lwe_array, + input_carries, big_lwe_dimension, 1); + } + // Step 1 + host_compute_shifted_blocks_and_states( + streams, gpu_indexes, gpu_count, lwe_array, params, + mem->shifted_blocks_state_mem, bsks, ksks, num_radix_blocks, lut_stride, + num_many_lut); + auto block_states = mem->shifted_blocks_state_mem->block_states; + + if (requested_flag == outputFlag::FLAG_CARRY) { + cuda_memcpy_async_gpu_to_gpu( + mem->output_flag, block_states + (num_radix_blocks - 1) * big_lwe_size, + big_lwe_size_bytes, streams[0], gpu_indexes[0]); + } + // Step 2 + host_compute_propagation_simulators_and_group_carries( + streams, gpu_indexes, gpu_count, block_states, params, + mem->prop_simu_group_carries_mem, bsks, ksks, num_radix_blocks, + mem->num_groups); + + auto group_size = mem->prop_simu_group_carries_mem->group_size; + + auto prepared_blocks = mem->prop_simu_group_carries_mem->prepared_blocks; + auto shifted_blocks = mem->shifted_blocks_state_mem->shifted_blocks; + host_addition(streams[0], gpu_indexes[0], prepared_blocks, + shifted_blocks, + mem->prop_simu_group_carries_mem->simulators, + big_lwe_dimension, num_radix_blocks); + + if (requested_flag == outputFlag::FLAG_OVERFLOW || + requested_flag == outputFlag::FLAG_CARRY) { + host_addition(streams[0], gpu_indexes[0], mem->output_flag, + mem->output_flag, + mem->prop_simu_group_carries_mem->simulators + + (num_radix_blocks - 1) * big_lwe_size, + big_lwe_dimension, 1); + } + + cuda_synchronize_stream(streams[0], gpu_indexes[0]); + + // Step 3 + // Add carries and cleanup OutputFlag::None + host_radix_sum_in_groups( + mem->sub_streams_1[0], gpu_indexes[0], prepared_blocks, prepared_blocks, + mem->prop_simu_group_carries_mem->resolved_carries, num_radix_blocks, + 
big_lwe_size, group_size); + + auto message_extract = mem->lut_message_extract; + integer_radix_apply_univariate_lookup_table_kb( + mem->sub_streams_1, gpu_indexes, gpu_count, lwe_array, prepared_blocks, + bsks, ksks, num_radix_blocks, message_extract); + + if (requested_flag == outputFlag::FLAG_CARRY) { + host_addition(mem->sub_streams_2[0], gpu_indexes[0], + mem->output_flag, mem->output_flag, + mem->prop_simu_group_carries_mem->resolved_carries + + (mem->num_groups - 1) * big_lwe_size, + big_lwe_dimension, 1); + + integer_radix_apply_univariate_lookup_table_kb( + mem->sub_streams_2, gpu_indexes, gpu_count, mem->output_flag, + mem->output_flag, bsks, ksks, 1, mem->lut_carry_flag_last); + + cuda_memcpy_async_gpu_to_gpu(carry_out, mem->output_flag, + big_lwe_size_bytes, mem->sub_streams_2[0], + gpu_indexes[0]); + } + for (int j = 0; j < mem->active_gpu_count; j++) { + cuda_synchronize_stream(mem->sub_streams_1[j], gpu_indexes[j]); + cuda_synchronize_stream(mem->sub_streams_2[j], gpu_indexes[j]); + } +} + +// This function perform the three steps of Thomas' new carry propagation +// includes the logic to extract overflow when requested +template +void host_add_and_propagate_single_carry( + cudaStream_t const *streams, uint32_t const *gpu_indexes, + uint32_t gpu_count, Torus *lhs_array, const Torus *rhs_array, + Torus *carry_out, const Torus *input_carries, + int_sc_prop_memory *mem, void *const *bsks, Torus *const *ksks, + uint32_t num_radix_blocks, uint32_t requested_flag, uint32_t uses_carry) { + auto params = mem->params; + auto glwe_dimension = params.glwe_dimension; + auto polynomial_size = params.polynomial_size; + auto message_modulus = params.message_modulus; + auto carry_modulus = params.carry_modulus; + uint32_t big_lwe_size = glwe_dimension * polynomial_size + 1; + auto big_lwe_size_bytes = big_lwe_size * sizeof(Torus); + auto big_lwe_dimension = big_lwe_size - 1; // For host addition + auto lut_stride = mem->lut_stride; + auto num_many_lut = mem->num_many_lut; + + if (requested_flag == outputFlag::FLAG_OVERFLOW) { + cuda_memcpy_async_gpu_to_gpu( + mem->last_lhs, lhs_array + (num_radix_blocks - 1) * big_lwe_size, + big_lwe_size_bytes, streams[0], gpu_indexes[0]); + cuda_memcpy_async_gpu_to_gpu( + mem->last_rhs, rhs_array + (num_radix_blocks - 1) * big_lwe_size, + big_lwe_size_bytes, streams[0], gpu_indexes[0]); + } + + host_addition(streams[0], gpu_indexes[0], lhs_array, lhs_array, + rhs_array, big_lwe_dimension, num_radix_blocks); + + if (uses_carry == 1) { + host_addition(streams[0], gpu_indexes[0], lhs_array, lhs_array, + input_carries, big_lwe_dimension, 1); + } + // Step 1 + host_compute_shifted_blocks_and_states( + streams, gpu_indexes, gpu_count, lhs_array, params, + mem->shifted_blocks_state_mem, bsks, ksks, num_radix_blocks, lut_stride, + num_many_lut); + auto block_states = mem->shifted_blocks_state_mem->block_states; + if (requested_flag == outputFlag::FLAG_OVERFLOW) { + auto lut_overflow_prep = mem->lut_overflow_flag_prep; + integer_radix_apply_bivariate_lookup_table_kb( + streams, gpu_indexes, gpu_count, mem->output_flag, mem->last_lhs, + mem->last_rhs, bsks, ksks, 1, lut_overflow_prep, + lut_overflow_prep->params.message_modulus); + } else if (requested_flag == outputFlag::FLAG_CARRY) { + cuda_memcpy_async_gpu_to_gpu( + mem->output_flag, block_states + (num_radix_blocks - 1) * big_lwe_size, + big_lwe_size_bytes, streams[0], gpu_indexes[0]); + } + + // Step 2 + host_compute_propagation_simulators_and_group_carries( + streams, gpu_indexes, gpu_count, block_states, 
params, + mem->prop_simu_group_carries_mem, bsks, ksks, num_radix_blocks, + mem->num_groups); + + auto group_size = mem->prop_simu_group_carries_mem->group_size; + + auto prepared_blocks = mem->prop_simu_group_carries_mem->prepared_blocks; + auto shifted_blocks = mem->shifted_blocks_state_mem->shifted_blocks; + host_addition(streams[0], gpu_indexes[0], prepared_blocks, + shifted_blocks, + mem->prop_simu_group_carries_mem->simulators, + big_lwe_dimension, num_radix_blocks); + + if (requested_flag == outputFlag::FLAG_OVERFLOW || + requested_flag == outputFlag::FLAG_CARRY) { + host_addition(streams[0], gpu_indexes[0], mem->output_flag, + mem->output_flag, + mem->prop_simu_group_carries_mem->simulators + + (num_radix_blocks - 1) * big_lwe_size, + big_lwe_dimension, 1); + } + + cuda_synchronize_stream(streams[0], gpu_indexes[0]); + // Step 3 + // Add carries and cleanup OutputFlag::None + host_radix_sum_in_groups( + mem->sub_streams_1[0], gpu_indexes[0], prepared_blocks, prepared_blocks, + mem->prop_simu_group_carries_mem->resolved_carries, num_radix_blocks, + big_lwe_size, group_size); + + auto message_extract = mem->lut_message_extract; + integer_radix_apply_univariate_lookup_table_kb( + mem->sub_streams_1, gpu_indexes, gpu_count, lhs_array, prepared_blocks, + bsks, ksks, num_radix_blocks, message_extract); + + if (requested_flag == outputFlag::FLAG_OVERFLOW || + requested_flag == outputFlag::FLAG_CARRY) { + if (num_radix_blocks == 1 && requested_flag == outputFlag::FLAG_OVERFLOW && + uses_carry == 1) { + host_addition(mem->sub_streams_2[0], gpu_indexes[0], + mem->output_flag, mem->output_flag, input_carries, + big_lwe_dimension, 1); + + } else { + + host_addition(mem->sub_streams_2[0], gpu_indexes[0], + mem->output_flag, mem->output_flag, + mem->prop_simu_group_carries_mem->resolved_carries + + (mem->num_groups - 1) * big_lwe_size, + big_lwe_dimension, 1); + } + if (requested_flag == outputFlag::FLAG_OVERFLOW) { + integer_radix_apply_univariate_lookup_table_kb( + mem->sub_streams_2, gpu_indexes, gpu_count, mem->output_flag, + mem->output_flag, bsks, ksks, 1, mem->lut_overflow_flag_last); + } else { + integer_radix_apply_univariate_lookup_table_kb( + mem->sub_streams_2, gpu_indexes, gpu_count, mem->output_flag, + mem->output_flag, bsks, ksks, 1, mem->lut_carry_flag_last); + } + cuda_memcpy_async_gpu_to_gpu(carry_out, mem->output_flag, + big_lwe_size_bytes, mem->sub_streams_2[0], + gpu_indexes[0]); + } + for (int j = 0; j < mem->active_gpu_count; j++) { + cuda_synchronize_stream(mem->sub_streams_1[j], gpu_indexes[j]); + cuda_synchronize_stream(mem->sub_streams_2[j], gpu_indexes[j]); + } +} + +template +void scratch_cuda_integer_overflowing_sub( + cudaStream_t const *streams, uint32_t const *gpu_indexes, + uint32_t gpu_count, int_borrow_prop_memory **mem_ptr, + uint32_t num_radix_blocks, int_radix_params params, + uint32_t compute_overflow, bool allocate_gpu_memory) { + + *mem_ptr = new int_borrow_prop_memory( + streams, gpu_indexes, gpu_count, params, num_radix_blocks, + compute_overflow, allocate_gpu_memory); +} + +// This function perform the three steps of Thomas' new borrow propagation +// includes the logic to extract overflow when requested +template +void host_single_borrow_propagate( + cudaStream_t const *streams, uint32_t const *gpu_indexes, + uint32_t gpu_count, Torus *lhsrhs_array, Torus *overflow_block, + const Torus *input_borrow, int_borrow_prop_memory *mem, + void *const *bsks, Torus *const *ksks, uint32_t num_radix_blocks, + uint32_t num_groups, uint32_t compute_overflow, + 
uint32_t uses_input_borrow) { + auto params = mem->params; + auto glwe_dimension = params.glwe_dimension; + auto polynomial_size = params.polynomial_size; + auto message_modulus = params.message_modulus; + auto carry_modulus = params.carry_modulus; + uint32_t big_lwe_size = glwe_dimension * polynomial_size + 1; + auto big_lwe_size_bytes = big_lwe_size * sizeof(Torus); + auto big_lwe_dimension = big_lwe_size - 1; + auto lut_stride = mem->lut_stride; + auto num_many_lut = mem->num_many_lut; + + assert(mem->num_groups >= num_groups); + if (uses_input_borrow == 1) { + host_unchecked_sub_with_correcting_term( + streams[0], gpu_indexes[0], lhsrhs_array, lhsrhs_array, input_borrow, + big_lwe_dimension, 1, message_modulus, carry_modulus, + message_modulus - 1); + } + // Step 1 + host_compute_shifted_blocks_and_borrow_states( + streams, gpu_indexes, gpu_count, lhsrhs_array, params, + mem->shifted_blocks_borrow_state_mem, bsks, ksks, num_radix_blocks, + lut_stride, num_many_lut); + + auto borrow_states = mem->shifted_blocks_borrow_state_mem->borrow_states; + cuda_memcpy_async_gpu_to_gpu(mem->overflow_block, + borrow_states + + (num_radix_blocks - 1) * big_lwe_size, + big_lwe_size_bytes, streams[0], gpu_indexes[0]); + + // Step 2 + host_compute_propagation_simulators_and_group_carries( + streams, gpu_indexes, gpu_count, borrow_states, params, + mem->prop_simu_group_carries_mem, bsks, ksks, num_radix_blocks, + num_groups); + + auto shifted_blocks = mem->shifted_blocks_borrow_state_mem->shifted_blocks; + auto prepared_blocks = mem->prop_simu_group_carries_mem->prepared_blocks; + auto simulators = mem->prop_simu_group_carries_mem->simulators; + + host_subtraction(streams[0], gpu_indexes[0], prepared_blocks, + shifted_blocks, simulators, big_lwe_dimension, + num_radix_blocks); + + host_integer_radix_add_scalar_one_inplace( + streams, gpu_indexes, gpu_count, prepared_blocks, big_lwe_dimension, + num_radix_blocks, message_modulus, carry_modulus); + + if (compute_overflow == outputFlag::FLAG_OVERFLOW) { + host_addition(streams[0], gpu_indexes[0], mem->overflow_block, + mem->overflow_block, + mem->prop_simu_group_carries_mem->simulators + + (num_radix_blocks - 1) * big_lwe_size, + big_lwe_dimension, 1); + } + auto resolved_borrows = mem->prop_simu_group_carries_mem->resolved_carries; + + // Step 3 + // This needs to be done before because in next step we modify the resolved + // borrows + if (compute_overflow == outputFlag::FLAG_OVERFLOW) { + host_addition(streams[0], gpu_indexes[0], mem->overflow_block, + mem->overflow_block, + resolved_borrows + (num_groups - 1) * big_lwe_size, + big_lwe_dimension, 1); + } + + cuda_event_record(mem->incoming_events[0], streams[0], gpu_indexes[0]); + for (int j = 0; j < mem->active_gpu_count; j++) { + cuda_stream_wait_event(mem->sub_streams_1[j], mem->incoming_events[0], + gpu_indexes[j]); + cuda_stream_wait_event(mem->sub_streams_2[j], mem->incoming_events[0], + gpu_indexes[j]); + } + + if (compute_overflow == outputFlag::FLAG_OVERFLOW) { + auto borrow_flag = mem->lut_borrow_flag; + integer_radix_apply_univariate_lookup_table_kb( + mem->sub_streams_1, gpu_indexes, gpu_count, overflow_block, + mem->overflow_block, bsks, ksks, 1, borrow_flag); + } + for (int j = 0; j < mem->active_gpu_count; j++) { + cuda_event_record(mem->outgoing_events1[j], mem->sub_streams_1[j], + gpu_indexes[j]); + } + + // subtract borrow and cleanup prepared blocks + host_negation(mem->sub_streams_2[0], gpu_indexes[0], resolved_borrows, + resolved_borrows, big_lwe_dimension, num_groups); + + 
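+  // Adding the negated resolved borrows group-wise subtracts the incoming
+  // borrow from every block; the message-extract LUT applied right after
+  // reduces each block back to its message part.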
host_radix_sum_in_groups( + mem->sub_streams_2[0], gpu_indexes[0], prepared_blocks, prepared_blocks, + resolved_borrows, num_radix_blocks, big_lwe_size, mem->group_size); + + auto message_extract = mem->lut_message_extract; + integer_radix_apply_univariate_lookup_table_kb( + mem->sub_streams_2, gpu_indexes, gpu_count, lhsrhs_array, prepared_blocks, + bsks, ksks, num_radix_blocks, message_extract); + + for (int j = 0; j < mem->active_gpu_count; j++) { + cuda_event_record(mem->outgoing_events2[j], mem->sub_streams_2[j], + gpu_indexes[j]); + cuda_stream_wait_event(streams[0], mem->outgoing_events1[j], + gpu_indexes[0]); + cuda_stream_wait_event(streams[0], mem->outgoing_events2[j], + gpu_indexes[0]); + } +} + #endif // TFHE_RS_INTERNAL_INTEGER_CUH diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh index 1e2694e1e2..1a39b08567 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh @@ -209,7 +209,7 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb( auto small_lwe_size = small_lwe_dimension + 1; // In the case of extracting a single LWE this parameters are dummy - uint32_t lut_count = 1; + uint32_t num_many_lut = 1; uint32_t lut_stride = 0; if (num_radix_in_vec == 0) @@ -370,7 +370,7 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb( glwe_dimension, small_lwe_dimension, polynomial_size, mem_ptr->params.pbs_base_log, mem_ptr->params.pbs_level, mem_ptr->params.grouping_factor, total_count, - mem_ptr->params.pbs_type, lut_count, lut_stride); + mem_ptr->params.pbs_type, num_many_lut, lut_stride); } else { cuda_synchronize_stream(streams[0], gpu_indexes[0]); @@ -418,7 +418,7 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb( glwe_dimension, small_lwe_dimension, polynomial_size, mem_ptr->params.pbs_base_log, mem_ptr->params.pbs_level, mem_ptr->params.grouping_factor, total_count, - mem_ptr->params.pbs_type, lut_count, lut_stride); + mem_ptr->params.pbs_type, num_many_lut, lut_stride); multi_gpu_gather_lwe_async( streams, gpu_indexes, active_gpu_count, new_blocks, lwe_after_pbs_vec, @@ -578,10 +578,15 @@ __host__ void host_integer_mult_radix_kb( terms_degree, bsks, ksks, mem_ptr->sum_ciphertexts_mem, num_blocks, 2 * num_blocks, mem_ptr->luts_array); - auto scp_mem_ptr = mem_ptr->sum_ciphertexts_mem->scp_mem; - host_propagate_single_carry(streams, gpu_indexes, gpu_count, - radix_lwe_out, nullptr, nullptr, - scp_mem_ptr, bsks, ksks, num_blocks); + uint32_t block_modulus = message_modulus * carry_modulus; + uint32_t num_bits_in_block = std::log2(block_modulus); + + auto scp_mem_ptr = mem_ptr->sc_prop_mem; + uint32_t requested_flag = outputFlag::FLAG_NONE; + uint32_t uses_carry = 0; + host_propagate_single_carry( + streams, gpu_indexes, gpu_count, radix_lwe_out, nullptr, nullptr, + scp_mem_ptr, bsks, ksks, num_blocks, requested_flag, uses_carry); } template diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/negation.cu b/backends/tfhe-cuda-backend/cuda/src/integer/negation.cu index e55ea9e912..36972b29d3 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/negation.cu +++ b/backends/tfhe-cuda-backend/cuda/src/integer/negation.cu @@ -12,49 +12,3 @@ void cuda_negate_integer_radix_ciphertext_64( static_cast(lwe_array_in), lwe_dimension, lwe_ciphertext_count, message_modulus, carry_modulus); } - -void scratch_cuda_integer_radix_overflowing_sub_kb_64( - void *const *streams, uint32_t const 
*gpu_indexes, uint32_t gpu_count, - int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size, - uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level, - uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log, - uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus, - uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory) { - - int_radix_params params(pbs_type, glwe_dimension, polynomial_size, - big_lwe_dimension, small_lwe_dimension, ks_level, - ks_base_log, pbs_level, pbs_base_log, grouping_factor, - message_modulus, carry_modulus); - - scratch_cuda_integer_overflowing_sub_kb( - (cudaStream_t *)(streams), gpu_indexes, gpu_count, - (int_overflowing_sub_memory **)mem_ptr, num_blocks, params, - allocate_gpu_memory); -} - -void cuda_integer_radix_overflowing_sub_kb_64( - void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count, - void *radix_lwe_out, void *radix_lwe_overflowed, void const *radix_lwe_left, - void const *radix_lwe_right, int8_t *mem_ptr, void *const *bsks, - void *const *ksks, uint32_t num_blocks) { - - auto mem = (int_overflowing_sub_memory *)mem_ptr; - - host_integer_overflowing_sub_kb( - (cudaStream_t *)(streams), gpu_indexes, gpu_count, - static_cast(radix_lwe_out), - static_cast(radix_lwe_overflowed), - static_cast(radix_lwe_left), - static_cast(radix_lwe_right), bsks, (uint64_t **)(ksks), - mem, num_blocks); -} - -void cleanup_cuda_integer_radix_overflowing_sub(void *const *streams, - uint32_t const *gpu_indexes, - uint32_t gpu_count, - int8_t **mem_ptr_void) { - int_overflowing_sub_memory *mem_ptr = - (int_overflowing_sub_memory *)(*mem_ptr_void); - - mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count); -} diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/negation.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/negation.cuh index 6eda409df9..28f7ac93cf 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/negation.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/integer/negation.cuh @@ -91,7 +91,7 @@ __host__ void scratch_cuda_integer_overflowing_sub_kb( *mem_ptr = new int_overflowing_sub_memory( streams, gpu_indexes, gpu_count, params, num_blocks, allocate_gpu_memory); } - +/* template __host__ void host_integer_overflowing_sub_kb( cudaStream_t const *streams, uint32_t const *gpu_indexes, @@ -113,4 +113,39 @@ __host__ void host_integer_overflowing_sub_kb( mem_ptr, bsks, ksks, num_blocks); } +*/ +template +__host__ void host_integer_overflowing_sub( + cudaStream_t const *streams, uint32_t const *gpu_indexes, + uint32_t gpu_count, Torus *lwe_out_array, Torus *lhs_array, + const Torus *rhs_array, Torus *overflow_block, const Torus *input_borrow, + int_borrow_prop_memory *mem_ptr, void *const *bsks, + Torus *const *ksks, uint32_t num_blocks, uint32_t compute_overflow, + uint32_t uses_input_borrow) { + + auto radix_params = mem_ptr->params; + + // We need to recalculate the num_groups, because on the division the number + // of num_blocks changes + uint32_t block_modulus = + radix_params.message_modulus * radix_params.carry_modulus; + uint32_t num_bits_in_block = std::log2(block_modulus); + uint32_t grouping_size = num_bits_in_block; + uint32_t num_groups = (num_blocks + grouping_size - 1) / grouping_size; + + auto stream = (cudaStream_t *)streams; + host_unchecked_sub_with_correcting_term( + stream[0], gpu_indexes[0], static_cast(lwe_out_array), + static_cast(lhs_array), static_cast(rhs_array), + radix_params.big_lwe_dimension, num_blocks, 
radix_params.message_modulus, + radix_params.carry_modulus, radix_params.message_modulus - 1); + + host_single_borrow_propagate( + streams, gpu_indexes, gpu_count, static_cast(lwe_out_array), + static_cast(overflow_block), + static_cast(input_borrow), + (int_borrow_prop_memory *)mem_ptr, bsks, (Torus **)(ksks), + num_blocks, num_groups, compute_overflow, uses_input_borrow); +} + #endif diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_mul.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_mul.cuh index 37a51006ae..941f31bc42 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_mul.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_mul.cuh @@ -112,10 +112,12 @@ __host__ void host_integer_scalar_mul_radix( terms_degree, bsks, ksks, mem->sum_ciphertexts_vec_mem, num_radix_blocks, j, nullptr); - auto scp_mem_ptr = mem->sum_ciphertexts_vec_mem->scp_mem; - host_propagate_single_carry(streams, gpu_indexes, gpu_count, lwe_array, - nullptr, nullptr, scp_mem_ptr, bsks, ksks, - num_radix_blocks); + auto scp_mem_ptr = mem->sc_prop_mem; + uint32_t requested_flag = outputFlag::FLAG_NONE; + uint32_t uses_carry = 0; + host_propagate_single_carry( + streams, gpu_indexes, gpu_count, lwe_array, nullptr, nullptr, + scp_mem_ptr, bsks, ksks, num_radix_blocks, requested_flag, uses_carry); } } diff --git a/backends/tfhe-cuda-backend/cuda/src/linearalgebra/addition.cu b/backends/tfhe-cuda-backend/cuda/src/linearalgebra/addition.cu index d3f47ad263..03ded74b46 100644 --- a/backends/tfhe-cuda-backend/cuda/src/linearalgebra/addition.cu +++ b/backends/tfhe-cuda-backend/cuda/src/linearalgebra/addition.cu @@ -57,6 +57,7 @@ void cuda_add_lwe_ciphertext_vector_64(void *stream, uint32_t gpu_index, static_cast(lwe_array_in_2), input_lwe_dimension, input_lwe_ciphertext_count); } + /* * Perform the addition of a u32 input LWE ciphertext vector with a u32 * plaintext vector. See the equivalent operation on u64 data for more details. 
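For reviewers, the grouping arithmetic that host_integer_overflowing_sub (above) uses to recompute num_groups can be checked in isolation. The sketch below is illustrative only, with assumed parameters (message_modulus = carry_modulus = 4, eight radix blocks); it is not part of the backend.

#include <cmath>
#include <cstdint>
#include <cstdio>

int main() {
  // Assumed example parameters, not taken from the library defaults.
  uint32_t message_modulus = 4; // 2 message bits per block
  uint32_t carry_modulus = 4;   // 2 carry bits per block
  uint32_t num_blocks = 8;

  uint32_t block_modulus = message_modulus * carry_modulus; // 16
  uint32_t num_bits_in_block =
      static_cast<uint32_t>(std::log2(block_modulus)); // 4
  uint32_t grouping_size = num_bits_in_block;          // blocks per group
  uint32_t num_groups = (num_blocks + grouping_size - 1) / grouping_size;

  // Prints grouping_size=4 num_groups=2
  std::printf("grouping_size=%u num_groups=%u\n", grouping_size, num_groups);
  return 0;
}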
diff --git a/backends/tfhe-cuda-backend/cuda/src/linearalgebra/addition.cuh b/backends/tfhe-cuda-backend/cuda/src/linearalgebra/addition.cuh index 29e1f62689..3401cdadd2 100644 --- a/backends/tfhe-cuda-backend/cuda/src/linearalgebra/addition.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/linearalgebra/addition.cuh @@ -82,6 +82,46 @@ __host__ void host_addition(cudaStream_t stream, uint32_t gpu_index, T *output, check_cuda_error(cudaGetLastError()); } +template +__global__ void pack_for_overflowing_ops(T *output, T const *input_1, + T const *input_2, uint32_t num_entries, + uint32_t message_modulus) { + + int tid = threadIdx.x; + int index = blockIdx.x * blockDim.x + tid; + if (index < num_entries) { + // Here we take advantage of the wrapping behaviour of uint + output[index] = input_1[index] * message_modulus + input_2[index]; + } +} + +template +__host__ void host_pack_for_overflowing_ops(cudaStream_t stream, + uint32_t gpu_index, T *output, + T const *input_1, T const *input_2, + uint32_t input_lwe_dimension, + uint32_t input_lwe_ciphertext_count, + uint32_t message_modulus) { + + cudaSetDevice(gpu_index); + // lwe_size includes the presence of the body + // whereas lwe_dimension is the number of elements in the mask + int lwe_size = input_lwe_dimension + 1; + // Create a 1-dimensional grid of threads + int num_blocks = 0, num_threads = 0; + int num_entries = lwe_size; + getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads); + dim3 grid(num_blocks, 1, 1); + dim3 thds(num_threads, 1, 1); + + pack_for_overflowing_ops<<>>( + &output[(input_lwe_ciphertext_count - 1) * lwe_size], + &input_1[(input_lwe_ciphertext_count - 1) * lwe_size], + &input_2[(input_lwe_ciphertext_count - 1) * lwe_size], lwe_size, + message_modulus); + check_cuda_error(cudaGetLastError()); +} + template __global__ void subtraction(T *output, T const *input_1, T const *input_2, uint32_t num_entries) { diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap.cuh index 9215bc044e..7209aba72e 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap.cuh @@ -92,7 +92,7 @@ void execute_pbs_async(cudaStream_t const *streams, uint32_t const *gpu_indexes, uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, uint32_t grouping_factor, uint32_t input_lwe_ciphertext_count, PBS_TYPE pbs_type, - uint32_t lut_count, uint32_t lut_stride) { + uint32_t num_many_lut, uint32_t lut_stride) { switch (sizeof(Torus)) { case sizeof(uint32_t): @@ -126,7 +126,7 @@ void execute_pbs_async(cudaStream_t const *streams, uint32_t const *gpu_indexes, current_lwe_array_in, current_lwe_input_indexes, bootstrapping_keys[i], pbs_buffer[i], lwe_dimension, glwe_dimension, polynomial_size, base_log, level_count, num_inputs_on_gpu, - lut_count, lut_stride); + num_many_lut, lut_stride); } break; default: @@ -165,7 +165,7 @@ void execute_pbs_async(cudaStream_t const *streams, uint32_t const *gpu_indexes, current_lwe_array_in, current_lwe_input_indexes, bootstrapping_keys[i], pbs_buffer[i], lwe_dimension, glwe_dimension, polynomial_size, grouping_factor, base_log, level_count, - num_inputs_on_gpu, lut_count, lut_stride); + num_inputs_on_gpu, num_many_lut, lut_stride); } break; case CLASSICAL: @@ -194,7 +194,7 @@ void execute_pbs_async(cudaStream_t const *streams, uint32_t const *gpu_indexes, current_lwe_array_in, current_lwe_input_indexes, bootstrapping_keys[i], 
pbs_buffer[i], lwe_dimension, glwe_dimension, polynomial_size, base_log, level_count, num_inputs_on_gpu, - lut_count, lut_stride); + num_many_lut, lut_stride); } break; default: diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_classic.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_classic.cuh index 2e5f83d45b..c77b69b353 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_classic.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_classic.cuh @@ -45,7 +45,7 @@ __global__ void device_programmable_bootstrap_cg( const double2 *__restrict__ bootstrapping_key, double2 *join_buffer, uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, int8_t *device_mem, - uint64_t device_memory_size_per_block, uint32_t lut_count, + uint64_t device_memory_size_per_block, uint32_t num_many_lut, uint32_t lut_stride) { grid_group grid = this_grid(); @@ -152,8 +152,8 @@ __global__ void device_programmable_bootstrap_cg( // but we do the computation at block 0 to avoid waiting for extra blocks, // in case they're not synchronized sample_extract_mask(block_lwe_array_out, accumulator); - if (lut_count > 1) { - for (int i = 1; i < lut_count; i++) { + if (num_many_lut > 1) { + for (int i = 1; i < num_many_lut; i++) { auto next_lwe_array_out = lwe_array_out + (i * gridDim.z * (glwe_dimension * polynomial_size + 1)); @@ -168,8 +168,8 @@ __global__ void device_programmable_bootstrap_cg( } } else if (blockIdx.y == glwe_dimension) { sample_extract_body(block_lwe_array_out, accumulator, 0); - if (lut_count > 1) { - for (int i = 1; i < lut_count; i++) { + if (num_many_lut > 1) { + for (int i = 1; i < num_many_lut; i++) { auto next_lwe_array_out = lwe_array_out + @@ -235,7 +235,7 @@ __host__ void host_programmable_bootstrap_cg( pbs_buffer *buffer, uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, uint32_t input_lwe_ciphertext_count, - uint32_t lut_count, uint32_t lut_stride) { + uint32_t num_many_lut, uint32_t lut_stride) { // With SM each block corresponds to either the mask or body, no need to // duplicate data for each @@ -273,7 +273,7 @@ __host__ void host_programmable_bootstrap_cg( kernel_args[10] = &base_log; kernel_args[11] = &level_count; kernel_args[12] = &d_mem; - kernel_args[14] = &lut_count; + kernel_args[14] = &num_many_lut; kernel_args[15] = &lut_stride; if (max_shared_memory < partial_sm) { diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_multibit.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_multibit.cuh index d736534e48..5eb59c5b78 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_multibit.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_multibit.cuh @@ -32,7 +32,8 @@ __global__ void __launch_bounds__(params::degree / params::opt) uint32_t level_count, uint32_t grouping_factor, uint32_t lwe_offset, uint32_t lwe_chunk_size, uint32_t keybundle_size_per_input, int8_t *device_mem, uint64_t device_memory_size_per_block, - uint32_t lut_count, uint32_t lut_stride) { + uint32_t num_many_lut, uint32_t lut_stride) { + grid_group grid = this_grid(); // We use shared memory for the polynomials that are used often during the @@ -134,8 +135,8 @@ __global__ void __launch_bounds__(params::degree / params::opt) // default sample_extract_mask(block_lwe_array_out, accumulator); - if (lut_count > 1) { - for (int i = 
1; i < lut_count; i++) { + if (num_many_lut > 1) { + for (int i = 1; i < num_many_lut; i++) { auto next_lwe_array_out = lwe_array_out + (i * gridDim.z * (glwe_dimension * polynomial_size + 1)); @@ -153,8 +154,8 @@ __global__ void __launch_bounds__(params::degree / params::opt) sample_extract_body(block_lwe_array_out, accumulator, 0); - if (lut_count > 1) { - for (int i = 1; i < lut_count; i++) { + if (num_many_lut > 1) { + for (int i = 1; i < num_many_lut; i++) { auto next_lwe_array_out = lwe_array_out + @@ -293,7 +294,7 @@ __host__ void execute_cg_external_product_loop( Torus const *lwe_output_indexes, pbs_buffer *buffer, uint32_t num_samples, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log, - uint32_t level_count, uint32_t lwe_offset, uint32_t lut_count, + uint32_t level_count, uint32_t lwe_offset, uint32_t num_many_lut, uint32_t lut_stride) { uint64_t full_sm = @@ -343,7 +344,7 @@ __host__ void execute_cg_external_product_loop( kernel_args[16] = &chunk_size; kernel_args[17] = &keybundle_size_per_input; kernel_args[18] = &d_mem; - kernel_args[20] = &lut_count; + kernel_args[20] = &num_many_lut; kernel_args[21] = &lut_stride; dim3 grid_accumulate(level_count, glwe_dimension + 1, num_samples); @@ -379,7 +380,7 @@ __host__ void host_cg_multi_bit_programmable_bootstrap( pbs_buffer *buffer, uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log, uint32_t level_count, uint32_t num_samples, - uint32_t lut_count, uint32_t lut_stride) { + uint32_t num_many_lut, uint32_t lut_stride) { auto lwe_chunk_size = buffer->lwe_chunk_size; @@ -397,7 +398,7 @@ __host__ void host_cg_multi_bit_programmable_bootstrap( stream, gpu_index, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, lwe_array_out, lwe_output_indexes, buffer, num_samples, lwe_dimension, glwe_dimension, polynomial_size, - grouping_factor, base_log, level_count, lwe_offset, lut_count, + grouping_factor, base_log, level_count, lwe_offset, num_many_lut, lut_stride); } } diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cu b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cu index dd3d446204..a77db81e19 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cu +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cu @@ -123,7 +123,7 @@ void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector( Torus const *lwe_input_indexes, double2 const *bootstrapping_key, pbs_buffer *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, - uint32_t level_count, uint32_t num_samples, uint32_t lut_count, + uint32_t level_count, uint32_t num_samples, uint32_t num_many_lut, uint32_t lut_stride) { switch (polynomial_size) { @@ -133,7 +133,7 @@ void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, num_samples, - lut_count, lut_stride); + num_many_lut, lut_stride); break; case 512: host_programmable_bootstrap_tbc>( @@ -141,7 +141,7 @@ void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, 
num_samples, - lut_count, lut_stride); + num_many_lut, lut_stride); break; case 1024: host_programmable_bootstrap_tbc>( @@ -149,7 +149,7 @@ void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, num_samples, - lut_count, lut_stride); + num_many_lut, lut_stride); break; case 2048: host_programmable_bootstrap_tbc>( @@ -157,7 +157,7 @@ void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, num_samples, - lut_count, lut_stride); + num_many_lut, lut_stride); break; case 4096: host_programmable_bootstrap_tbc>( @@ -165,7 +165,7 @@ void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, num_samples, - lut_count, lut_stride); + num_many_lut, lut_stride); break; case 8192: host_programmable_bootstrap_tbc>( @@ -173,7 +173,7 @@ void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, num_samples, - lut_count, lut_stride); + num_many_lut, lut_stride); break; case 16384: host_programmable_bootstrap_tbc>( @@ -181,7 +181,7 @@ void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, num_samples, - lut_count, lut_stride); + num_many_lut, lut_stride); break; default: PANIC("Cuda error (classical PBS): unsupported polynomial size. 
" @@ -380,7 +380,7 @@ void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector( Torus const *lwe_input_indexes, double2 const *bootstrapping_key, pbs_buffer *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, - uint32_t level_count, uint32_t num_samples, uint32_t lut_count, + uint32_t level_count, uint32_t num_samples, uint32_t num_many_lut, uint32_t lut_stride) { switch (polynomial_size) { @@ -390,7 +390,7 @@ void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, num_samples, - lut_count, lut_stride); + num_many_lut, lut_stride); break; case 512: host_programmable_bootstrap_cg>( @@ -398,7 +398,7 @@ void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, num_samples, - lut_count, lut_stride); + num_many_lut, lut_stride); break; case 1024: host_programmable_bootstrap_cg>( @@ -406,7 +406,7 @@ void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, num_samples, - lut_count, lut_stride); + num_many_lut, lut_stride); break; case 2048: host_programmable_bootstrap_cg>( @@ -414,7 +414,7 @@ void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, num_samples, - lut_count, lut_stride); + num_many_lut, lut_stride); break; case 4096: host_programmable_bootstrap_cg>( @@ -422,7 +422,7 @@ void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, num_samples, - lut_count, lut_stride); + num_many_lut, lut_stride); break; case 8192: host_programmable_bootstrap_cg>( @@ -430,7 +430,7 @@ void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, num_samples, - lut_count, lut_stride); + num_many_lut, lut_stride); break; case 16384: host_programmable_bootstrap_cg>( @@ -438,7 +438,7 @@ void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, num_samples, - lut_count, lut_stride); + num_many_lut, lut_stride); break; default: PANIC("Cuda error (classical PBS): unsupported polynomial size. 
" @@ -455,7 +455,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector( Torus const *lwe_input_indexes, double2 const *bootstrapping_key, pbs_buffer *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, - uint32_t level_count, uint32_t num_samples, uint32_t lut_count, + uint32_t level_count, uint32_t num_samples, uint32_t num_many_lut, uint32_t lut_stride) { switch (polynomial_size) { @@ -465,7 +465,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, num_samples, - lut_count, lut_stride); + num_many_lut, lut_stride); break; case 512: host_programmable_bootstrap>( @@ -473,7 +473,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, num_samples, - lut_count, lut_stride); + num_many_lut, lut_stride); break; case 1024: host_programmable_bootstrap>( @@ -481,7 +481,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, num_samples, - lut_count, lut_stride); + num_many_lut, lut_stride); break; case 2048: host_programmable_bootstrap>( @@ -489,7 +489,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, num_samples, - lut_count, lut_stride); + num_many_lut, lut_stride); break; case 4096: host_programmable_bootstrap>( @@ -497,7 +497,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, num_samples, - lut_count, lut_stride); + num_many_lut, lut_stride); break; case 8192: host_programmable_bootstrap>( @@ -505,7 +505,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, num_samples, - lut_count, lut_stride); + num_many_lut, lut_stride); break; case 16384: host_programmable_bootstrap>( @@ -513,7 +513,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, num_samples, - lut_count, lut_stride); + num_many_lut, lut_stride); break; default: PANIC("Cuda error (classical PBS): unsupported polynomial size. 
" @@ -531,7 +531,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_32( void const *lwe_input_indexes, void const *bootstrapping_key, int8_t *mem_ptr, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, - uint32_t num_samples, uint32_t lut_count, uint32_t lut_stride) { + uint32_t num_samples, uint32_t num_many_lut, uint32_t lut_stride) { if (base_log > 32) PANIC("Cuda error (classical PBS): base log should be <= 32") @@ -551,7 +551,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_32( static_cast(lwe_input_indexes), static_cast(bootstrapping_key), buffer, lwe_dimension, glwe_dimension, polynomial_size, base_log, level_count, num_samples, - lut_count, lut_stride); + num_many_lut, lut_stride); break; #else PANIC("Cuda error (PBS): TBC pbs is not supported.") @@ -566,7 +566,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_32( static_cast(lwe_input_indexes), static_cast(bootstrapping_key), buffer, lwe_dimension, glwe_dimension, polynomial_size, base_log, level_count, num_samples, - lut_count, lut_stride); + num_many_lut, lut_stride); break; case DEFAULT: cuda_programmable_bootstrap_lwe_ciphertext_vector( @@ -578,7 +578,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_32( static_cast(lwe_input_indexes), static_cast(bootstrapping_key), buffer, lwe_dimension, glwe_dimension, polynomial_size, base_log, level_count, num_samples, - lut_count, lut_stride); + num_many_lut, lut_stride); break; default: PANIC("Cuda error (PBS): unknown pbs variant.") @@ -653,7 +653,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_64( void const *lwe_input_indexes, void const *bootstrapping_key, int8_t *mem_ptr, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, - uint32_t num_samples, uint32_t lut_count, uint32_t lut_stride) { + uint32_t num_samples, uint32_t num_many_lut, uint32_t lut_stride) { if (base_log > 64) PANIC("Cuda error (classical PBS): base log should be <= 64") @@ -672,7 +672,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_64( static_cast(lwe_input_indexes), static_cast(bootstrapping_key), buffer, lwe_dimension, glwe_dimension, polynomial_size, base_log, level_count, num_samples, - lut_count, lut_stride); + num_many_lut, lut_stride); break; #else PANIC("Cuda error (PBS): TBC pbs is not supported.") @@ -687,7 +687,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_64( static_cast(lwe_input_indexes), static_cast(bootstrapping_key), buffer, lwe_dimension, glwe_dimension, polynomial_size, base_log, level_count, num_samples, - lut_count, lut_stride); + num_many_lut, lut_stride); break; case PBS_VARIANT::DEFAULT: cuda_programmable_bootstrap_lwe_ciphertext_vector( @@ -699,7 +699,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_64( static_cast(lwe_input_indexes), static_cast(bootstrapping_key), buffer, lwe_dimension, glwe_dimension, polynomial_size, base_log, level_count, num_samples, - lut_count, lut_stride); + num_many_lut, lut_stride); break; default: PANIC("Cuda error (PBS): unknown pbs variant.") @@ -727,7 +727,7 @@ template void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector( uint64_t const *lwe_input_indexes, double2 const *bootstrapping_key, pbs_buffer *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, - uint32_t level_count, uint32_t num_samples, uint32_t lut_count, + uint32_t level_count, uint32_t num_samples, uint32_t num_many_lut, 
uint32_t lut_stride); template void cuda_programmable_bootstrap_lwe_ciphertext_vector( @@ -737,7 +737,7 @@ template void cuda_programmable_bootstrap_lwe_ciphertext_vector( uint64_t const *lwe_input_indexes, double2 const *bootstrapping_key, pbs_buffer *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, - uint32_t level_count, uint32_t num_samples, uint32_t lut_count, + uint32_t level_count, uint32_t num_samples, uint32_t num_many_lut, uint32_t lut_stride); template void scratch_cuda_programmable_bootstrap_cg( @@ -758,7 +758,7 @@ template void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector( uint32_t const *lwe_input_indexes, double2 const *bootstrapping_key, pbs_buffer *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, - uint32_t level_count, uint32_t num_samples, uint32_t lut_count, + uint32_t level_count, uint32_t num_samples, uint32_t num_many_lut, uint32_t lut_stride); template void cuda_programmable_bootstrap_lwe_ciphertext_vector( @@ -768,7 +768,7 @@ template void cuda_programmable_bootstrap_lwe_ciphertext_vector( uint32_t const *lwe_input_indexes, double2 const *bootstrapping_key, pbs_buffer *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, - uint32_t level_count, uint32_t num_samples, uint32_t lut_count, + uint32_t level_count, uint32_t num_samples, uint32_t num_many_lut, uint32_t lut_stride); template void scratch_cuda_programmable_bootstrap_cg( @@ -797,7 +797,7 @@ template void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector( uint32_t const *lwe_input_indexes, double2 const *bootstrapping_key, pbs_buffer *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, - uint32_t level_count, uint32_t num_samples, uint32_t lut_count, + uint32_t level_count, uint32_t num_samples, uint32_t num_many_lut, uint32_t lut_stride); template void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector( void *stream, uint32_t gpu_index, uint64_t *lwe_array_out, @@ -806,7 +806,7 @@ template void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector( uint64_t const *lwe_input_indexes, double2 const *bootstrapping_key, pbs_buffer *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, - uint32_t level_count, uint32_t num_samples, uint32_t lut_count, + uint32_t level_count, uint32_t num_samples, uint32_t num_many_lut, uint32_t lut_stride); template void scratch_cuda_programmable_bootstrap_tbc( void *stream, uint32_t gpu_index, diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cuh index 31f1e9487f..25701aca92 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cuh @@ -142,7 +142,7 @@ __global__ void __launch_bounds__(params::degree / params::opt) uint32_t lwe_iteration, uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, int8_t *device_mem, uint64_t device_memory_size_per_block, - uint32_t lut_count, uint32_t lut_stride) { + uint32_t num_many_lut, uint32_t lut_stride) { // We use shared memory for the polynomials that are used often during the // bootstrap, since shared memory is kept in L1 cache and accessing it is @@ -217,8 +217,8 @@ __global__ void 
__launch_bounds__(params::degree / params::opt) // but we do the computation at block 0 to avoid waiting for extra blocks, // in case they're not synchronized sample_extract_mask(block_lwe_array_out, accumulator); - if (lut_count > 1) { - for (int i = 1; i < lut_count; i++) { + if (num_many_lut > 1) { + for (int i = 1; i < num_many_lut; i++) { auto next_lwe_array_out = lwe_array_out + (i * gridDim.x * (glwe_dimension * polynomial_size + 1)); @@ -233,8 +233,8 @@ __global__ void __launch_bounds__(params::degree / params::opt) } } else if (blockIdx.y == glwe_dimension) { sample_extract_body(block_lwe_array_out, accumulator, 0); - if (lut_count > 1) { - for (int i = 1; i < lut_count; i++) { + if (num_many_lut > 1) { + for (int i = 1; i < num_many_lut; i++) { auto next_lwe_array_out = lwe_array_out + @@ -412,8 +412,8 @@ __host__ void execute_step_two( uint32_t input_lwe_ciphertext_count, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, int8_t *d_mem, int lwe_iteration, uint64_t partial_sm, - uint64_t partial_dm, uint64_t full_sm, uint64_t full_dm, uint32_t lut_count, - uint32_t lut_stride) { + uint64_t partial_dm, uint64_t full_sm, uint64_t full_dm, + uint32_t num_many_lut, uint32_t lut_stride) { int max_shared_memory = cuda_get_max_shared_memory(0); cudaSetDevice(gpu_index); @@ -426,21 +426,21 @@ __host__ void execute_step_two( lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes, bootstrapping_key, global_accumulator, global_join_buffer, lwe_iteration, lwe_dimension, polynomial_size, base_log, - level_count, d_mem, full_dm, lut_count, lut_stride); + level_count, d_mem, full_dm, num_many_lut, lut_stride); } else if (max_shared_memory < full_sm) { device_programmable_bootstrap_step_two <<>>( lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes, bootstrapping_key, global_accumulator, global_join_buffer, lwe_iteration, lwe_dimension, polynomial_size, base_log, - level_count, d_mem, partial_dm, lut_count, lut_stride); + level_count, d_mem, partial_dm, num_many_lut, lut_stride); } else { device_programmable_bootstrap_step_two <<>>( lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes, bootstrapping_key, global_accumulator, global_join_buffer, lwe_iteration, lwe_dimension, polynomial_size, base_log, - level_count, d_mem, 0, lut_count, lut_stride); + level_count, d_mem, 0, num_many_lut, lut_stride); } check_cuda_error(cudaGetLastError()); } @@ -456,7 +456,7 @@ __host__ void host_programmable_bootstrap( pbs_buffer *pbs_buffer, uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, uint32_t input_lwe_ciphertext_count, - uint32_t lut_count, uint32_t lut_stride) { + uint32_t num_many_lut, uint32_t lut_stride) { cudaSetDevice(gpu_index); // With SM each block corresponds to either the mask or body, no need to @@ -493,7 +493,7 @@ __host__ void host_programmable_bootstrap( global_join_buffer, input_lwe_ciphertext_count, lwe_dimension, glwe_dimension, polynomial_size, base_log, level_count, d_mem, i, partial_sm, partial_dm_step_two, full_sm_step_two, full_dm_step_two, - lut_count, lut_stride); + num_many_lut, lut_stride); } } diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cu b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cu index 72b8982549..b2a7f214e2 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cu +++ 
b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cu @@ -67,7 +67,7 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( pbs_buffer *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log, uint32_t level_count, uint32_t num_samples, - uint32_t lut_count, uint32_t lut_stride) { + uint32_t num_many_lut, uint32_t lut_stride) { switch (polynomial_size) { case 256: @@ -76,7 +76,7 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, grouping_factor, base_log, level_count, - num_samples, lut_count, lut_stride); + num_samples, num_many_lut, lut_stride); break; case 512: host_cg_multi_bit_programmable_bootstrap>( @@ -84,7 +84,7 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, grouping_factor, base_log, level_count, - num_samples, lut_count, lut_stride); + num_samples, num_many_lut, lut_stride); break; case 1024: host_cg_multi_bit_programmable_bootstrap>( @@ -92,7 +92,7 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, grouping_factor, base_log, level_count, - num_samples, lut_count, lut_stride); + num_samples, num_many_lut, lut_stride); break; case 2048: host_cg_multi_bit_programmable_bootstrap>( @@ -100,7 +100,7 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, grouping_factor, base_log, level_count, - num_samples, lut_count, lut_stride); + num_samples, num_many_lut, lut_stride); break; case 4096: host_cg_multi_bit_programmable_bootstrap>( @@ -108,7 +108,7 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, grouping_factor, base_log, level_count, - num_samples, lut_count, lut_stride); + num_samples, num_many_lut, lut_stride); break; case 8192: host_cg_multi_bit_programmable_bootstrap>( @@ -116,7 +116,7 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, grouping_factor, base_log, level_count, - num_samples, lut_count, lut_stride); + num_samples, num_many_lut, lut_stride); break; case 16384: host_cg_multi_bit_programmable_bootstrap>( @@ -124,7 +124,7 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, grouping_factor, base_log, level_count, - num_samples, lut_count, lut_stride); + num_samples, num_many_lut, lut_stride); break; default: PANIC("Cuda error (multi-bit PBS): unsupported polynomial size. 
Supported " @@ -142,7 +142,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( pbs_buffer *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log, uint32_t level_count, uint32_t num_samples, - uint32_t lut_count, uint32_t lut_stride) { + uint32_t num_many_lut, uint32_t lut_stride) { switch (polynomial_size) { case 256: @@ -151,7 +151,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, grouping_factor, base_log, level_count, - num_samples, lut_count, lut_stride); + num_samples, num_many_lut, lut_stride); break; case 512: host_multi_bit_programmable_bootstrap>( @@ -159,7 +159,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, grouping_factor, base_log, level_count, - num_samples, lut_count, lut_stride); + num_samples, num_many_lut, lut_stride); break; case 1024: host_multi_bit_programmable_bootstrap>( @@ -167,7 +167,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, grouping_factor, base_log, level_count, - num_samples, lut_count, lut_stride); + num_samples, num_many_lut, lut_stride); break; case 2048: host_multi_bit_programmable_bootstrap>( @@ -175,7 +175,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, grouping_factor, base_log, level_count, - num_samples, lut_count, lut_stride); + num_samples, num_many_lut, lut_stride); break; case 4096: host_multi_bit_programmable_bootstrap>( @@ -183,7 +183,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, grouping_factor, base_log, level_count, - num_samples, lut_count, lut_stride); + num_samples, num_many_lut, lut_stride); break; case 8192: host_multi_bit_programmable_bootstrap>( @@ -191,7 +191,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, grouping_factor, base_log, level_count, - num_samples, lut_count, lut_stride); + num_samples, num_many_lut, lut_stride); break; case 16384: host_multi_bit_programmable_bootstrap>( @@ -199,7 +199,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, grouping_factor, base_log, level_count, - num_samples, lut_count, lut_stride); + num_samples, num_many_lut, lut_stride); break; default: PANIC("Cuda error (multi-bit PBS): unsupported polynomial size. 
Supported " @@ -215,7 +215,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64( void const *lwe_input_indexes, void const *bootstrapping_key, int8_t *mem_ptr, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log, - uint32_t level_count, uint32_t num_samples, uint32_t lut_count, + uint32_t level_count, uint32_t num_samples, uint32_t num_many_lut, uint32_t lut_stride) { if (base_log > 64) @@ -236,7 +236,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64( static_cast(lwe_input_indexes), static_cast(bootstrapping_key), buffer, lwe_dimension, glwe_dimension, polynomial_size, grouping_factor, base_log, level_count, - num_samples, lut_count, lut_stride); + num_samples, num_many_lut, lut_stride); break; #else PANIC("Cuda error (multi-bit PBS): TBC pbs is not supported.") @@ -251,7 +251,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64( static_cast(lwe_input_indexes), static_cast(bootstrapping_key), buffer, lwe_dimension, glwe_dimension, polynomial_size, grouping_factor, base_log, level_count, - num_samples, lut_count, lut_stride); + num_samples, num_many_lut, lut_stride); break; case PBS_VARIANT::DEFAULT: cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( @@ -263,7 +263,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64( static_cast(lwe_input_indexes), static_cast(bootstrapping_key), buffer, lwe_dimension, glwe_dimension, polynomial_size, grouping_factor, base_log, level_count, - num_samples, lut_count, lut_stride); + num_samples, num_many_lut, lut_stride); break; default: PANIC("Cuda error (multi-bit PBS): unsupported implementation variant.") @@ -499,7 +499,7 @@ cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( pbs_buffer *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log, uint32_t level_count, uint32_t num_samples, - uint32_t lut_count, uint32_t lut_stride); + uint32_t num_many_lut, uint32_t lut_stride); template void scratch_cuda_cg_multi_bit_programmable_bootstrap( void *stream, uint32_t gpu_index, @@ -516,7 +516,7 @@ cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( pbs_buffer *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log, uint32_t level_count, uint32_t num_samples, - uint32_t lut_count, uint32_t lut_stride); + uint32_t num_many_lut, uint32_t lut_stride); template bool has_support_to_cuda_programmable_bootstrap_tbc_multi_bit( @@ -588,7 +588,7 @@ void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( pbs_buffer *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log, uint32_t level_count, uint32_t num_samples, - uint32_t lut_count, uint32_t lut_stride) { + uint32_t num_many_lut, uint32_t lut_stride) { if (base_log > 32) PANIC("Cuda error (multi-bit PBS): base log should be <= 32") @@ -600,7 +600,7 @@ void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, grouping_factor, base_log, level_count, - num_samples, lut_count, lut_stride); + num_samples, num_many_lut, lut_stride); break; case 512: host_tbc_multi_bit_programmable_bootstrap>( @@ -608,7 +608,7 @@ void 
cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, grouping_factor, base_log, level_count, - num_samples, lut_count, lut_stride); + num_samples, num_many_lut, lut_stride); break; case 1024: host_tbc_multi_bit_programmable_bootstrap>( @@ -616,7 +616,7 @@ void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, grouping_factor, base_log, level_count, - num_samples, lut_count, lut_stride); + num_samples, num_many_lut, lut_stride); break; case 2048: { int num_sms = 0; @@ -629,14 +629,14 @@ void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, grouping_factor, base_log, - level_count, num_samples, lut_count, lut_stride); + level_count, num_samples, num_many_lut, lut_stride); else host_tbc_multi_bit_programmable_bootstrap>( static_cast(stream), gpu_index, lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, grouping_factor, base_log, - level_count, num_samples, lut_count, lut_stride); + level_count, num_samples, num_many_lut, lut_stride); break; } @@ -646,7 +646,7 @@ void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, grouping_factor, base_log, level_count, - num_samples, lut_count, lut_stride); + num_samples, num_many_lut, lut_stride); break; case 8192: host_tbc_multi_bit_programmable_bootstrap>( @@ -654,7 +654,7 @@ void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, grouping_factor, base_log, level_count, - num_samples, lut_count, lut_stride); + num_samples, num_many_lut, lut_stride); break; case 16384: host_tbc_multi_bit_programmable_bootstrap>( @@ -662,7 +662,7 @@ void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, grouping_factor, base_log, level_count, - num_samples, lut_count, lut_stride); + num_samples, num_many_lut, lut_stride); break; default: PANIC("Cuda error (multi-bit PBS): unsupported polynomial size. 
Supported " @@ -685,5 +685,5 @@ cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( pbs_buffer *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log, uint32_t level_count, uint32_t num_samples, - uint32_t lut_count, uint32_t lut_stride); + uint32_t num_many_lut, uint32_t lut_stride); #endif diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cuh index a58647185b..ba73d29bf7 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cuh @@ -253,7 +253,7 @@ __global__ void __launch_bounds__(params::degree / params::opt) uint32_t polynomial_size, uint32_t level_count, uint32_t grouping_factor, uint32_t iteration, uint32_t lwe_offset, uint32_t lwe_chunk_size, int8_t *device_mem, - uint64_t device_memory_size_per_block, uint32_t lut_count, + uint64_t device_memory_size_per_block, uint32_t num_many_lut, uint32_t lut_stride) { // We use shared memory for the polynomials that are used often during the // bootstrap, since shared memory is kept in L1 cache and accessing it is @@ -326,8 +326,8 @@ __global__ void __launch_bounds__(params::degree / params::opt) // but we do the computation at block 0 to avoid waiting for extra blocks, // in case they're not synchronized sample_extract_mask(block_lwe_array_out, global_slice); - if (lut_count > 1) { - for (int i = 1; i < lut_count; i++) { + if (num_many_lut > 1) { + for (int i = 1; i < num_many_lut; i++) { auto next_lwe_array_out = lwe_array_out + (i * gridDim.x * (glwe_dimension * polynomial_size + 1)); @@ -342,8 +342,8 @@ __global__ void __launch_bounds__(params::degree / params::opt) } } else if (blockIdx.y == glwe_dimension) { sample_extract_body(block_lwe_array_out, global_slice, 0); - if (lut_count > 1) { - for (int i = 1; i < lut_count; i++) { + if (num_many_lut > 1) { + for (int i = 1; i < num_many_lut; i++) { auto next_lwe_array_out = lwe_array_out + @@ -591,12 +591,14 @@ execute_step_one(cudaStream_t stream, uint32_t gpu_index, } template -__host__ void execute_step_two( - cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out, - Torus const *lwe_output_indexes, pbs_buffer *buffer, - uint32_t num_samples, uint32_t lwe_dimension, uint32_t glwe_dimension, - uint32_t polynomial_size, int32_t grouping_factor, uint32_t level_count, - uint32_t j, uint32_t lwe_offset, uint32_t lut_count, uint32_t lut_stride) { +__host__ void +execute_step_two(cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out, + Torus const *lwe_output_indexes, + pbs_buffer *buffer, uint32_t num_samples, + uint32_t lwe_dimension, uint32_t glwe_dimension, + uint32_t polynomial_size, int32_t grouping_factor, + uint32_t level_count, uint32_t j, uint32_t lwe_offset, + uint32_t num_many_lut, uint32_t lut_stride) { auto lwe_chunk_size = buffer->lwe_chunk_size; uint64_t full_sm_accumulate_step_two = @@ -621,7 +623,7 @@ __host__ void execute_step_two( global_accumulator, global_accumulator_fft, lwe_dimension, glwe_dimension, polynomial_size, level_count, grouping_factor, j, lwe_offset, lwe_chunk_size, d_mem, full_sm_accumulate_step_two, - lut_count, lut_stride); + num_many_lut, lut_stride); else device_multi_bit_programmable_bootstrap_accumulate_step_two @@ -630,7 +632,7 @@ __host__ void execute_step_two( global_accumulator, global_accumulator_fft, lwe_dimension, 
glwe_dimension, polynomial_size, level_count, grouping_factor, j, lwe_offset, lwe_chunk_size, d_mem, 0, - lut_count, lut_stride); + num_many_lut, lut_stride); check_cuda_error(cudaGetLastError()); } @@ -643,7 +645,7 @@ __host__ void host_multi_bit_programmable_bootstrap( pbs_buffer *buffer, uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log, uint32_t level_count, uint32_t num_samples, - uint32_t lut_count, uint32_t lut_stride) { + uint32_t num_many_lut, uint32_t lut_stride) { auto lwe_chunk_size = buffer->lwe_chunk_size; @@ -667,7 +669,8 @@ __host__ void host_multi_bit_programmable_bootstrap( execute_step_two( stream, gpu_index, lwe_array_out, lwe_output_indexes, buffer, num_samples, lwe_dimension, glwe_dimension, polynomial_size, - grouping_factor, level_count, j, lwe_offset, lut_count, lut_stride); + grouping_factor, level_count, j, lwe_offset, num_many_lut, + lut_stride); } } } diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_classic.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_classic.cuh index b7dc557e3a..bbbf6f2eee 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_classic.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_classic.cuh @@ -45,8 +45,8 @@ __global__ void device_programmable_bootstrap_tbc( const double2 *__restrict__ bootstrapping_key, double2 *join_buffer, uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, int8_t *device_mem, - uint64_t device_memory_size_per_block, bool support_dsm, uint32_t lut_count, - uint32_t lut_stride) { + uint64_t device_memory_size_per_block, bool support_dsm, + uint32_t num_many_lut, uint32_t lut_stride) { cluster_group cluster = this_cluster(); @@ -158,8 +158,8 @@ __global__ void device_programmable_bootstrap_tbc( // in case they're not synchronized sample_extract_mask(block_lwe_array_out, accumulator); - if (lut_count > 1) { - for (int i = 1; i < lut_count; i++) { + if (num_many_lut > 1) { + for (int i = 1; i < num_many_lut; i++) { auto next_lwe_array_out = lwe_array_out + (i * gridDim.z * (glwe_dimension * polynomial_size + 1)); @@ -175,8 +175,8 @@ __global__ void device_programmable_bootstrap_tbc( } else if (blockIdx.y == glwe_dimension) { sample_extract_body(block_lwe_array_out, accumulator, 0); - if (lut_count > 1) { - for (int i = 1; i < lut_count; i++) { + if (num_many_lut > 1) { + for (int i = 1; i < num_many_lut; i++) { auto next_lwe_array_out = lwe_array_out + @@ -261,7 +261,7 @@ __host__ void host_programmable_bootstrap_tbc( pbs_buffer *buffer, uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, uint32_t input_lwe_ciphertext_count, - uint32_t lut_count, uint32_t lut_stride) { + uint32_t num_many_lut, uint32_t lut_stride) { auto supports_dsm = supports_distributed_shared_memory_on_classic_programmable_bootstrap< @@ -317,7 +317,7 @@ __host__ void host_programmable_bootstrap_tbc( lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer_fft, lwe_dimension, polynomial_size, base_log, level_count, d_mem, full_dm, - supports_dsm, lut_count, lut_stride)); + supports_dsm, num_many_lut, lut_stride)); } else if (max_shared_memory < full_sm + minimum_sm_tbc) { config.dynamicSmemBytes = partial_sm + minimum_sm_tbc; @@ -326,7 +326,7 @@ __host__ void host_programmable_bootstrap_tbc( lwe_array_out, 
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer_fft, lwe_dimension, polynomial_size, base_log, level_count, d_mem, - partial_dm, supports_dsm, lut_count, lut_stride)); + partial_dm, supports_dsm, num_many_lut, lut_stride)); } else { config.dynamicSmemBytes = full_sm + minimum_sm_tbc; @@ -335,7 +335,7 @@ __host__ void host_programmable_bootstrap_tbc( lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer_fft, lwe_dimension, polynomial_size, base_log, level_count, d_mem, 0, - supports_dsm, lut_count, lut_stride)); + supports_dsm, num_many_lut, lut_stride)); } } diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_multibit.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_multibit.cuh index 22b6f4e196..701f80379b 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_multibit.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_multibit.cuh @@ -32,7 +32,7 @@ __global__ void __launch_bounds__(params::degree / params::opt) uint32_t level_count, uint32_t grouping_factor, uint32_t lwe_offset, uint32_t lwe_chunk_size, uint32_t keybundle_size_per_input, int8_t *device_mem, uint64_t device_memory_size_per_block, - bool support_dsm, uint32_t lut_count, uint32_t lut_stride) { + bool support_dsm, uint32_t num_many_lut, uint32_t lut_stride) { cluster_group cluster = this_cluster(); @@ -141,8 +141,8 @@ __global__ void __launch_bounds__(params::degree / params::opt) // blocks, in case they're not synchronized sample_extract_mask(block_lwe_array_out, accumulator); - if (lut_count > 1) { - for (int i = 1; i < lut_count; i++) { + if (num_many_lut > 1) { + for (int i = 1; i < num_many_lut; i++) { auto next_lwe_array_out = lwe_array_out + (i * gridDim.z * (glwe_dimension * polynomial_size + 1)); @@ -157,8 +157,8 @@ __global__ void __launch_bounds__(params::degree / params::opt) } } else if (blockIdx.y == glwe_dimension) { sample_extract_body(block_lwe_array_out, accumulator, 0); - if (lut_count > 1) { - for (int i = 1; i < lut_count; i++) { + if (num_many_lut > 1) { + for (int i = 1; i < num_many_lut; i++) { auto next_lwe_array_out = lwe_array_out + @@ -299,7 +299,7 @@ __host__ void execute_tbc_external_product_loop( Torus const *lwe_output_indexes, pbs_buffer *buffer, uint32_t num_samples, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log, - uint32_t level_count, uint32_t lwe_offset, uint32_t lut_count, + uint32_t level_count, uint32_t lwe_offset, uint32_t num_many_lut, uint32_t lut_stride) { auto lwe_chunk_size = buffer->lwe_chunk_size; @@ -363,7 +363,7 @@ __host__ void execute_tbc_external_product_loop( lwe_array_in, lwe_input_indexes, keybundle_fft, buffer_fft, global_accumulator, lwe_dimension, glwe_dimension, polynomial_size, base_log, level_count, grouping_factor, lwe_offset, chunk_size, - keybundle_size_per_input, d_mem, full_dm, supports_dsm, lut_count, + keybundle_size_per_input, d_mem, full_dm, supports_dsm, num_many_lut, lut_stride)); } else if (max_shared_memory < full_dm + minimum_dm) { config.dynamicSmemBytes = partial_dm + minimum_dm; @@ -375,7 +375,7 @@ __host__ void execute_tbc_external_product_loop( lwe_array_in, lwe_input_indexes, keybundle_fft, buffer_fft, global_accumulator, lwe_dimension, glwe_dimension, polynomial_size, base_log, level_count, grouping_factor, lwe_offset, chunk_size, - 
keybundle_size_per_input, d_mem, partial_dm, supports_dsm, lut_count, + keybundle_size_per_input, d_mem, partial_dm, supports_dsm, num_many_lut, lut_stride)); } else { config.dynamicSmemBytes = full_dm + minimum_dm; @@ -387,7 +387,7 @@ __host__ void execute_tbc_external_product_loop( lwe_array_in, lwe_input_indexes, keybundle_fft, buffer_fft, global_accumulator, lwe_dimension, glwe_dimension, polynomial_size, base_log, level_count, grouping_factor, lwe_offset, chunk_size, - keybundle_size_per_input, d_mem, 0, supports_dsm, lut_count, + keybundle_size_per_input, d_mem, 0, supports_dsm, num_many_lut, lut_stride)); } } @@ -401,7 +401,7 @@ __host__ void host_tbc_multi_bit_programmable_bootstrap( pbs_buffer *buffer, uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log, uint32_t level_count, uint32_t num_samples, - uint32_t lut_count, uint32_t lut_stride) { + uint32_t num_many_lut, uint32_t lut_stride) { cudaSetDevice(gpu_index); auto lwe_chunk_size = buffer->lwe_chunk_size; @@ -419,7 +419,7 @@ __host__ void host_tbc_multi_bit_programmable_bootstrap( stream, gpu_index, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, lwe_array_out, lwe_output_indexes, buffer, num_samples, lwe_dimension, glwe_dimension, polynomial_size, - grouping_factor, base_log, level_count, lwe_offset, lut_count, + grouping_factor, base_log, level_count, lwe_offset, num_many_lut, lut_stride); } } diff --git a/backends/tfhe-cuda-backend/cuda/src/utils/helper_multi_gpu.cuh b/backends/tfhe-cuda-backend/cuda/src/utils/helper_multi_gpu.cuh index eb09d17e52..8de4cc197e 100644 --- a/backends/tfhe-cuda-backend/cuda/src/utils/helper_multi_gpu.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/utils/helper_multi_gpu.cuh @@ -46,6 +46,24 @@ void multi_gpu_alloc_lwe_async(cudaStream_t const *streams, } } +/// Allocates the input/output vector for all devices +/// Initializes also the related indexing and initializes it to the trivial +/// index +template +void multi_gpu_alloc_lwe_many_lut_output_async( + cudaStream_t const *streams, uint32_t const *gpu_indexes, + uint32_t gpu_count, std::vector &dest, uint32_t num_inputs, + uint32_t num_many_lut, uint32_t lwe_size) { + dest.resize(gpu_count); + for (uint i = 0; i < gpu_count; i++) { + auto inputs_on_gpu = get_num_inputs_on_gpu(num_inputs, i, gpu_count); + Torus *d_array = (Torus *)cuda_malloc_async(num_many_lut * inputs_on_gpu * + lwe_size * sizeof(Torus), + streams[i], gpu_indexes[i]); + dest[i] = d_array; + } +} + /// Load an array residing on one GPU to all active gpus /// and split the array among them. /// The input indexing logic is given by an index array. 
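Note on the hunk above: the new multi_gpu_alloc_lwe_many_lut_output_async helper sizes each GPU's output vector so that every input assigned to that GPU keeps room for all of its LUT results. The host-side sketch below only restates that sizing arithmetic for illustration; it is not part of the change, and split_inputs_evenly is a hypothetical stand-in for get_num_inputs_on_gpu under the simplifying assumption of an even input split.

#include <cstddef>
#include <cstdint>
#include <vector>

// Hypothetical stand-in for get_num_inputs_on_gpu: assumes inputs are split
// as evenly as possible across the GPUs (illustration only).
static uint32_t split_inputs_evenly(uint32_t num_inputs, uint32_t gpu_id,
                                    uint32_t gpu_count) {
  uint32_t base = num_inputs / gpu_count;
  uint32_t remainder = num_inputs % gpu_count;
  return base + (gpu_id < remainder ? 1u : 0u);
}

// Bytes to allocate on each GPU for the many-LUT output vector: every input
// assigned to a GPU stores num_many_lut LWE ciphertexts of lwe_size words.
template <typename Torus>
std::vector<size_t> many_lut_output_bytes_per_gpu(uint32_t gpu_count,
                                                  uint32_t num_inputs,
                                                  uint32_t num_many_lut,
                                                  uint32_t lwe_size) {
  std::vector<size_t> bytes(gpu_count);
  for (uint32_t i = 0; i < gpu_count; i++) {
    size_t inputs_on_gpu = split_inputs_evenly(num_inputs, i, gpu_count);
    bytes[i] = static_cast<size_t>(num_many_lut) * inputs_on_gpu * lwe_size *
               sizeof(Torus);
  }
  return bytes;
}

For example, with 2 GPUs, 5 inputs, num_many_lut = 2 and lwe_size = 2049, the split is 3 and 2 inputs, giving 2 * 3 * 2049 and 2 * 2 * 2049 Torus words per device.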
@@ -126,6 +144,49 @@ void multi_gpu_gather_lwe_async(cudaStream_t const *streams, } } +/// Copy data from multiple GPUs back to GPU 0 following the indexing given in +/// dest_indexes +/// The input indexing should be the trivial one +template +void multi_gpu_gather_many_lut_lwe_async( + cudaStream_t const *streams, uint32_t const *gpu_indexes, + uint32_t gpu_count, Torus *dest, const std::vector &src, + Torus *h_dest_indexes, bool is_trivial_index, uint32_t num_inputs, + uint32_t lwe_size, uint32_t num_many_lut) { + + for (uint lut_id = 0; lut_id < num_many_lut; lut_id++) { + for (uint i = 0; i < gpu_count; i++) { + auto inputs_on_gpu = get_num_inputs_on_gpu(num_inputs, i, gpu_count); + auto gpu_offset = 0; + for (uint j = 0; j < i; j++) { + gpu_offset += get_num_inputs_on_gpu(num_inputs, j, gpu_count); + } + + if (is_trivial_index) { + auto d_dest = + dest + gpu_offset * lwe_size + lut_id * num_inputs * lwe_size; + auto d_src = src[i] + lut_id * inputs_on_gpu * lwe_size; + + cuda_memcpy_async_gpu_to_gpu(d_dest, d_src, + inputs_on_gpu * lwe_size * sizeof(Torus), + streams[i], gpu_indexes[i]); + } else { + auto dest_indexes = h_dest_indexes + gpu_offset; + + for (uint j = 0; j < inputs_on_gpu; j++) { + auto d_dest = dest + dest_indexes[j] * lwe_size + + lut_id * num_inputs * lwe_size; + auto d_src = + src[i] + j * lwe_size + lut_id * inputs_on_gpu * lwe_size; + + cuda_memcpy_async_gpu_to_gpu(d_dest, d_src, lwe_size * sizeof(Torus), + streams[i], gpu_indexes[i]); + } + } + } + } +} + template void multi_gpu_release_async(cudaStream_t const *streams, uint32_t const *gpu_indexes, diff --git a/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/benchmarks/benchmark_pbs.cpp b/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/benchmarks/benchmark_pbs.cpp index ae6ec59f31..7b4cbe9802 100644 --- a/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/benchmarks/benchmark_pbs.cpp +++ b/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/benchmarks/benchmark_pbs.cpp @@ -177,7 +177,7 @@ BENCHMARK_DEFINE_F(MultiBitBootstrap_u64, TbcMultiBit) stream, gpu_index, (pbs_buffer **)&buffer, glwe_dimension, polynomial_size, pbs_level, input_lwe_ciphertext_count, true); - uint32_t lut_count = 1; + uint32_t num_many_lut = 1; uint32_t lut_stride = 0; for (auto _ : st) { // Execute PBS @@ -186,7 +186,7 @@ BENCHMARK_DEFINE_F(MultiBitBootstrap_u64, TbcMultiBit) d_lut_pbs_identity, d_lut_pbs_indexes, d_lwe_ct_in_array, d_lwe_input_indexes, d_bsk, (pbs_buffer *)buffer, lwe_dimension, glwe_dimension, polynomial_size, grouping_factor, - pbs_base_log, pbs_level, input_lwe_ciphertext_count, lut_count, + pbs_base_log, pbs_level, input_lwe_ciphertext_count, num_many_lut, lut_stride); cuda_synchronize_stream(stream, gpu_index); } @@ -208,7 +208,7 @@ BENCHMARK_DEFINE_F(MultiBitBootstrap_u64, CgMultiBit) stream, gpu_index, (pbs_buffer **)&buffer, glwe_dimension, polynomial_size, pbs_level, input_lwe_ciphertext_count, true); - uint32_t lut_count = 1; + uint32_t num_many_lut = 1; uint32_t lut_stride = 0; for (auto _ : st) { // Execute PBS @@ -221,7 +221,7 @@ BENCHMARK_DEFINE_F(MultiBitBootstrap_u64, CgMultiBit) (const uint64_t *)d_lwe_input_indexes, (const uint64_t *)d_bsk, (pbs_buffer *)buffer, lwe_dimension, glwe_dimension, polynomial_size, grouping_factor, pbs_base_log, - pbs_level, input_lwe_ciphertext_count, lut_count, lut_stride); + pbs_level, input_lwe_ciphertext_count, num_many_lut, lut_stride); cuda_synchronize_stream(stream, gpu_index); } @@ -234,7 +234,7 @@ BENCHMARK_DEFINE_F(MultiBitBootstrap_u64, 
DefaultMultiBit) stream, gpu_index, (pbs_buffer **)&buffer, glwe_dimension, polynomial_size, pbs_level, input_lwe_ciphertext_count, true); - uint32_t lut_count = 1; + uint32_t num_many_lut = 1; uint32_t lut_stride = 0; for (auto _ : st) { // Execute PBS @@ -243,7 +243,7 @@ BENCHMARK_DEFINE_F(MultiBitBootstrap_u64, DefaultMultiBit) d_lut_pbs_identity, d_lut_pbs_indexes, d_lwe_ct_in_array, d_lwe_input_indexes, d_bsk, (pbs_buffer *)buffer, lwe_dimension, glwe_dimension, polynomial_size, grouping_factor, - pbs_base_log, pbs_level, input_lwe_ciphertext_count, lut_count, + pbs_base_log, pbs_level, input_lwe_ciphertext_count, num_many_lut, lut_stride); cuda_synchronize_stream(stream, gpu_index); } @@ -265,7 +265,7 @@ BENCHMARK_DEFINE_F(ClassicalBootstrap_u64, TbcPBC) stream, gpu_index, (pbs_buffer **)&buffer, glwe_dimension, polynomial_size, pbs_level, input_lwe_ciphertext_count, true); - uint32_t lut_count = 1; + uint32_t num_many_lut = 1; uint32_t lut_stride = 0; for (auto _ : st) { // Execute PBS @@ -276,7 +276,7 @@ BENCHMARK_DEFINE_F(ClassicalBootstrap_u64, TbcPBC) (uint64_t *)d_lwe_input_indexes, (double2 *)d_fourier_bsk, (pbs_buffer *)buffer, lwe_dimension, glwe_dimension, polynomial_size, pbs_base_log, pbs_level, - input_lwe_ciphertext_count, lut_count, lut_stride); + input_lwe_ciphertext_count, num_many_lut, lut_stride); cuda_synchronize_stream(stream, gpu_index); } @@ -297,7 +297,7 @@ BENCHMARK_DEFINE_F(ClassicalBootstrap_u64, CgPBS) stream, gpu_index, (pbs_buffer **)&buffer, glwe_dimension, polynomial_size, pbs_level, input_lwe_ciphertext_count, true); - uint32_t lut_count = 1; + uint32_t num_many_lut = 1; uint32_t lut_stride = 0; for (auto _ : st) { // Execute PBS @@ -308,7 +308,7 @@ BENCHMARK_DEFINE_F(ClassicalBootstrap_u64, CgPBS) (uint64_t *)d_lwe_input_indexes, (double2 *)d_fourier_bsk, (pbs_buffer *)buffer, lwe_dimension, glwe_dimension, polynomial_size, pbs_base_log, pbs_level, - input_lwe_ciphertext_count, lut_count, lut_stride); + input_lwe_ciphertext_count, num_many_lut, lut_stride); cuda_synchronize_stream(stream, gpu_index); } @@ -322,7 +322,7 @@ BENCHMARK_DEFINE_F(ClassicalBootstrap_u64, DefaultPBS) stream, gpu_index, (pbs_buffer **)&buffer, glwe_dimension, polynomial_size, pbs_level, input_lwe_ciphertext_count, true); - uint32_t lut_count = 1; + uint32_t num_many_lut = 1; uint32_t lut_stride = 0; for (auto _ : st) { // Execute PBS @@ -333,7 +333,7 @@ BENCHMARK_DEFINE_F(ClassicalBootstrap_u64, DefaultPBS) (uint64_t *)d_lwe_input_indexes, (double2 *)d_fourier_bsk, (pbs_buffer *)buffer, lwe_dimension, glwe_dimension, polynomial_size, pbs_base_log, pbs_level, - input_lwe_ciphertext_count, lut_count, lut_stride); + input_lwe_ciphertext_count, num_many_lut, lut_stride); cuda_synchronize_stream(stream, gpu_index); } diff --git a/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/tests/test_classical_pbs.cpp b/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/tests/test_classical_pbs.cpp index 11e4dd3122..5a5223d4fe 100644 --- a/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/tests/test_classical_pbs.cpp +++ b/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/tests/test_classical_pbs.cpp @@ -173,7 +173,7 @@ TEST_P(ClassicalProgrammableBootstrapTestPrimitives_u64, bootstrap) { cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0); int bsk_size = (glwe_dimension + 1) * (glwe_dimension + 1) * pbs_level * polynomial_size * (lwe_dimension + 1); - uint32_t lut_count = 1; + uint32_t num_many_lut = 1; uint32_t lut_stride = 0; // Here execute the PBS for 
(int r = 0; r < repetitions; r++) { @@ -192,7 +192,7 @@ TEST_P(ClassicalProgrammableBootstrapTestPrimitives_u64, bootstrap) { (void *)d_lut_pbs_indexes, (void *)d_lwe_ct_in, (void *)d_lwe_input_indexes, (void *)d_fourier_bsk, pbs_buffer, lwe_dimension, glwe_dimension, polynomial_size, pbs_base_log, - pbs_level, number_of_inputs, lut_count, lut_stride); + pbs_level, number_of_inputs, num_many_lut, lut_stride); // Copy result back cuda_memcpy_async_to_cpu(lwe_ct_out_array, d_lwe_ct_out_array, (glwe_dimension * polynomial_size + 1) * diff --git a/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/tests/test_multibit_pbs.cpp b/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/tests/test_multibit_pbs.cpp index eec69ddd38..a621233454 100644 --- a/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/tests/test_multibit_pbs.cpp +++ b/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/tests/test_multibit_pbs.cpp @@ -119,7 +119,7 @@ TEST_P(MultiBitProgrammableBootstrapTestPrimitives_u64, (glwe_dimension + 1) * (glwe_dimension + 1) * polynomial_size * (1 << grouping_factor); - uint32_t lut_count = 1; + uint32_t num_many_lut = 1; uint32_t lut_stride = 0; for (int r = 0; r < repetitions; r++) { uint64_t *d_bsk = d_bsk_array + (ptrdiff_t)(bsk_size * r); @@ -137,7 +137,7 @@ TEST_P(MultiBitProgrammableBootstrapTestPrimitives_u64, (void *)d_lut_pbs_indexes, (void *)d_lwe_ct_in, (void *)d_lwe_input_indexes, (void *)d_bsk, pbs_buffer, lwe_dimension, glwe_dimension, polynomial_size, grouping_factor, pbs_base_log, - pbs_level, number_of_inputs, lut_count, lut_stride); + pbs_level, number_of_inputs, num_many_lut, lut_stride); // Copy result to the host memory cuda_memcpy_async_to_cpu(lwe_ct_out_array, d_lwe_ct_out_array, diff --git a/backends/tfhe-cuda-backend/src/bindings.rs b/backends/tfhe-cuda-backend/src/bindings.rs index df0f91ab23..d6bf96fe14 100644 --- a/backends/tfhe-cuda-backend/src/bindings.rs +++ b/backends/tfhe-cuda-backend/src/bindings.rs @@ -721,6 +721,32 @@ extern "C" { message_modulus: u32, carry_modulus: u32, pbs_type: PBS_TYPE, + requested_flag: u32, + uses_carry: u32, + allocate_gpu_memory: bool, + ); +} +extern "C" { + pub fn scratch_cuda_add_and_propagate_single_carry_kb_64_inplace( + streams: *const *mut ffi::c_void, + gpu_indexes: *const u32, + gpu_count: u32, + mem_ptr: *mut *mut i8, + glwe_dimension: u32, + polynomial_size: u32, + big_lwe_dimension: u32, + small_lwe_dimension: u32, + ks_level: u32, + ks_base_log: u32, + pbs_level: u32, + pbs_base_log: u32, + grouping_factor: u32, + num_blocks: u32, + message_modulus: u32, + carry_modulus: u32, + pbs_type: PBS_TYPE, + requested_flag: u32, + uses_carry: u32, allocate_gpu_memory: bool, ); } @@ -731,24 +757,30 @@ extern "C" { gpu_count: u32, lwe_array: *mut ffi::c_void, carry_out: *mut ffi::c_void, + carry_in: *const ffi::c_void, mem_ptr: *mut i8, bsks: *const *mut ffi::c_void, ksks: *const *mut ffi::c_void, num_blocks: u32, + requested_flag: u32, + uses_carry: u32, ); } extern "C" { - pub fn cuda_propagate_single_carry_get_input_carries_kb_64_inplace( + pub fn cuda_add_and_propagate_single_carry_kb_64_inplace( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, gpu_count: u32, - lwe_array: *mut ffi::c_void, + lhs_array: *mut ffi::c_void, + rhs_array: *const ffi::c_void, carry_out: *mut ffi::c_void, - input_carries: *mut ffi::c_void, + carry_in: *const ffi::c_void, mem_ptr: *mut i8, bsks: *const *mut ffi::c_void, ksks: *const *mut ffi::c_void, num_blocks: u32, + requested_flag: u32, + uses_carry: u32, ); } extern "C" { @@ 
-760,43 +792,55 @@ extern "C" { ); } extern "C" { - pub fn scratch_cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64( + pub fn cleanup_cuda_add_and_propagate_single_carry( + streams: *const *mut ffi::c_void, + gpu_indexes: *const u32, + gpu_count: u32, + mem_ptr_void: *mut *mut i8, + ); +} +extern "C" { + pub fn scratch_cuda_integer_overflowing_sub_kb_64_inplace( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, gpu_count: u32, mem_ptr: *mut *mut i8, glwe_dimension: u32, polynomial_size: u32, - lwe_dimension: u32, + big_lwe_dimension: u32, + small_lwe_dimension: u32, ks_level: u32, ks_base_log: u32, pbs_level: u32, pbs_base_log: u32, grouping_factor: u32, - num_blocks_in_radix: u32, - max_num_radix_in_vec: u32, + num_blocks: u32, message_modulus: u32, carry_modulus: u32, pbs_type: PBS_TYPE, + compute_overflow: u32, allocate_gpu_memory: bool, ); } extern "C" { - pub fn cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64( + pub fn cuda_integer_overflowing_sub_kb_64_inplace( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, gpu_count: u32, - radix_lwe_out: *mut ffi::c_void, - radix_lwe_vec: *mut ffi::c_void, - num_radix_in_vec: u32, + lhs_array: *mut ffi::c_void, + rhs_array: *const ffi::c_void, + overflow_block: *mut ffi::c_void, + input_borrow: *const ffi::c_void, mem_ptr: *mut i8, bsks: *const *mut ffi::c_void, ksks: *const *mut ffi::c_void, - num_blocks_in_radix: u32, + num_blocks: u32, + compute_overflow: u32, + uses_input_borrow: u32, ); } extern "C" { - pub fn cleanup_cuda_integer_radix_partial_sum_ciphertexts_vec( + pub fn cleanup_cuda_integer_overflowing_sub( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, gpu_count: u32, @@ -804,21 +848,21 @@ extern "C" { ); } extern "C" { - pub fn scratch_cuda_integer_radix_overflowing_sub_kb_64( + pub fn scratch_cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, gpu_count: u32, mem_ptr: *mut *mut i8, glwe_dimension: u32, polynomial_size: u32, - big_lwe_dimension: u32, - small_lwe_dimension: u32, + lwe_dimension: u32, ks_level: u32, ks_base_log: u32, pbs_level: u32, pbs_base_log: u32, grouping_factor: u32, - num_blocks: u32, + num_blocks_in_radix: u32, + max_num_radix_in_vec: u32, message_modulus: u32, carry_modulus: u32, pbs_type: PBS_TYPE, @@ -826,14 +870,13 @@ extern "C" { ); } extern "C" { - pub fn cuda_integer_radix_overflowing_sub_kb_64( + pub fn cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, gpu_count: u32, radix_lwe_out: *mut ffi::c_void, - radix_lwe_overflowed: *mut ffi::c_void, - radix_lwe_left: *const ffi::c_void, - radix_lwe_right: *const ffi::c_void, + radix_lwe_vec: *mut ffi::c_void, + num_radix_in_vec: u32, mem_ptr: *mut i8, bsks: *const *mut ffi::c_void, ksks: *const *mut ffi::c_void, @@ -841,7 +884,7 @@ extern "C" { ); } extern "C" { - pub fn cleanup_cuda_integer_radix_overflowing_sub( + pub fn cleanup_cuda_integer_radix_partial_sum_ciphertexts_vec( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, gpu_count: u32, @@ -942,52 +985,6 @@ extern "C" { mem_ptr_void: *mut *mut i8, ); } -extern "C" { - pub fn scratch_cuda_signed_overflowing_add_or_sub_radix_ciphertext_kb_64( - streams: *const *mut ffi::c_void, - gpu_indexes: *const u32, - gpu_count: u32, - mem_ptr: *mut *mut i8, - glwe_dimension: u32, - polynomial_size: u32, - big_lwe_dimension: u32, - small_lwe_dimension: u32, - ks_level: u32, - ks_base_log: u32, - pbs_level: u32, - pbs_base_log: u32, - grouping_factor: 
u32, - num_blocks: u32, - signed_operation: i8, - message_modulus: u32, - carry_modulus: u32, - pbs_type: PBS_TYPE, - allocate_gpu_memory: bool, - ); -} -extern "C" { - pub fn cuda_signed_overflowing_add_or_sub_radix_ciphertext_kb_64( - streams: *const *mut ffi::c_void, - gpu_indexes: *const u32, - gpu_count: u32, - lhs: *mut ffi::c_void, - rhs: *const ffi::c_void, - overflowed: *mut ffi::c_void, - signed_operation: i8, - mem_ptr: *mut i8, - bsks: *const *mut ffi::c_void, - ksks: *const *mut ffi::c_void, - num_blocks_in_radix: u32, - ); -} -extern "C" { - pub fn cleanup_signed_overflowing_add_or_sub( - streams: *const *mut ffi::c_void, - gpu_indexes: *const u32, - gpu_count: u32, - mem_ptr_void: *mut *mut i8, - ); -} extern "C" { pub fn scratch_cuda_integer_compute_prefix_sum_hillis_steele_64( streams: *const *mut ffi::c_void, @@ -1384,7 +1381,7 @@ extern "C" { base_log: u32, level_count: u32, num_samples: u32, - lut_count: u32, + num_many_lut: u32, lut_stride: u32, ); } @@ -1406,7 +1403,7 @@ extern "C" { base_log: u32, level_count: u32, num_samples: u32, - lut_count: u32, + num_many_lut: u32, lut_stride: u32, ); } @@ -1469,7 +1466,7 @@ extern "C" { base_log: u32, level_count: u32, num_samples: u32, - lut_count: u32, + num_many_lut: u32, lut_stride: u32, ); } diff --git a/tfhe/src/core_crypto/gpu/mod.rs b/tfhe/src/core_crypto/gpu/mod.rs index b25cde608f..8fd40719b7 100644 --- a/tfhe/src/core_crypto/gpu/mod.rs +++ b/tfhe/src/core_crypto/gpu/mod.rs @@ -111,7 +111,7 @@ pub unsafe fn programmable_bootstrap_async( level: DecompositionLevelCount, num_samples: u32, ) { - let lut_count = 1u32; + let num_many_lut = 1u32; let lut_stride = 0u32; let mut pbs_buffer: *mut i8 = std::ptr::null_mut(); scratch_cuda_programmable_bootstrap_64( @@ -141,7 +141,7 @@ pub unsafe fn programmable_bootstrap_async( base_log.0 as u32, level.0 as u32, num_samples, - lut_count, + num_many_lut, lut_stride, ); cleanup_cuda_programmable_bootstrap( @@ -175,7 +175,7 @@ pub unsafe fn programmable_bootstrap_multi_bit_async( grouping_factor: LweBskGroupingFactor, num_samples: u32, ) { - let lut_count = 1u32; + let num_many_lut = 1u32; let lut_stride = 0u32; let mut pbs_buffer: *mut i8 = std::ptr::null_mut(); scratch_cuda_multi_bit_programmable_bootstrap_64( @@ -206,7 +206,7 @@ pub unsafe fn programmable_bootstrap_multi_bit_async( base_log.0 as u32, level.0 as u32, num_samples, - lut_count, + num_many_lut, lut_stride, ); cleanup_cuda_multi_bit_programmable_bootstrap( diff --git a/tfhe/src/integer/gpu/ciphertext/info.rs b/tfhe/src/integer/gpu/ciphertext/info.rs index a2970b40fc..8770c11e6b 100644 --- a/tfhe/src/integer/gpu/ciphertext/info.rs +++ b/tfhe/src/integer/gpu/ciphertext/info.rs @@ -310,21 +310,6 @@ impl CudaRadixCiphertextInfo { .collect(), } } - pub(crate) fn after_bitnot(&self) -> Self { - Self { - blocks: self - .blocks - .iter() - .map(|left| CudaBlockInfo { - degree: Degree::new(left.message_modulus.0 - 1), - message_modulus: left.message_modulus, - carry_modulus: left.carry_modulus, - pbs_order: left.pbs_order, - noise_level: NoiseLevel::NOMINAL, - }) - .collect(), - } - } pub(crate) fn after_scalar_bitand(&self, scalar: T) -> Self where T: DecomposableInto, diff --git a/tfhe/src/integer/gpu/mod.rs b/tfhe/src/integer/gpu/mod.rs index de2038d9b4..1fead5183c 100644 --- a/tfhe/src/integer/gpu/mod.rs +++ b/tfhe/src/integer/gpu/mod.rs @@ -15,6 +15,7 @@ use crate::shortint::{CarryModulus, MessageModulus}; pub use server_key::CudaServerKey; use std::cmp::min; +use crate::integer::server_key::radix_parallel::OutputFlag; 
use tfhe_cuda_backend::bindings::*; use tfhe_cuda_backend::cuda_bind::*; @@ -1016,10 +1017,11 @@ pub unsafe fn full_propagate_assign_async( /// /// - [CudaStreams::synchronize] __must__ be called after this function as soon as synchronization /// is required -pub unsafe fn propagate_single_carry_assign_async( +pub(crate) unsafe fn propagate_single_carry_assign_async( streams: &CudaStreams, radix_lwe_input: &mut CudaVec, carry_out: &mut CudaVec, + carry_in: &CudaVec, bootstrapping_key: &CudaVec, keyswitch_key: &CudaVec, lwe_dimension: LweDimension, @@ -1034,6 +1036,8 @@ pub unsafe fn propagate_single_carry_assign_async( +pub(crate) unsafe fn add_and_propagate_single_carry_assign_async( streams: &CudaStreams, - radix_lwe_input: &mut CudaVec, + radix_lwe_lhs_input: &mut CudaVec, + radix_lwe_rhs_input: &CudaVec, carry_out: &mut CudaVec, - input_carries: &mut CudaVec, + carry_in: &CudaVec, bootstrapping_key: &CudaVec, keyswitch_key: &CudaVec, lwe_dimension: LweDimension, @@ -1118,10 +1125,17 @@ pub unsafe fn propagate_single_carry_get_input_carries_assign_async< carry_modulus: CarryModulus, pbs_type: PBSType, grouping_factor: LweBskGroupingFactor, + requested_flag: OutputFlag, + uses_carry: u32, ) { assert_eq!( streams.gpu_indexes[0], - radix_lwe_input.gpu_index(0), + radix_lwe_lhs_input.gpu_index(0), + "GPU error: all data should reside on the same GPU." + ); + assert_eq!( + streams.gpu_indexes[0], + radix_lwe_rhs_input.gpu_index(0), "GPU error: all data should reside on the same GPU." ); assert_eq!( @@ -1136,7 +1150,7 @@ pub unsafe fn propagate_single_carry_get_input_carries_assign_async< ); let mut mem_ptr: *mut i8 = std::ptr::null_mut(); let big_lwe_dimension: u32 = glwe_dimension.0 as u32 * polynomial_size.0 as u32; - scratch_cuda_propagate_single_carry_kb_64_inplace( + scratch_cuda_add_and_propagate_single_carry_kb_64_inplace( streams.ptr.as_ptr(), streams.gpu_indexes.as_ptr(), streams.len() as u32, @@ -1154,21 +1168,26 @@ pub unsafe fn propagate_single_carry_get_input_carries_assign_async< message_modulus.0 as u32, carry_modulus.0 as u32, pbs_type as u32, + requested_flag as u32, + uses_carry, true, ); - cuda_propagate_single_carry_get_input_carries_kb_64_inplace( + cuda_add_and_propagate_single_carry_kb_64_inplace( streams.ptr.as_ptr(), streams.gpu_indexes.as_ptr(), streams.len() as u32, - radix_lwe_input.as_mut_c_ptr(0), + radix_lwe_lhs_input.as_mut_c_ptr(0), + radix_lwe_rhs_input.as_c_ptr(0), carry_out.as_mut_c_ptr(0), - input_carries.as_mut_c_ptr(0), + carry_in.as_c_ptr(0), mem_ptr, bootstrapping_key.ptr.as_ptr(), keyswitch_key.ptr.as_ptr(), num_blocks, + requested_flag as u32, + uses_carry, ); - cleanup_cuda_propagate_single_carry( + cleanup_cuda_add_and_propagate_single_carry( streams.ptr.as_ptr(), streams.gpu_indexes.as_ptr(), streams.len() as u32, @@ -2145,108 +2164,6 @@ pub unsafe fn unchecked_partial_sum_ciphertexts_integer_radix_kb_assign_async< ); } -#[allow(clippy::too_many_arguments)] -/// # Safety -/// -/// - [CudaStreams::synchronize] __must__ be called after this function as soon as synchronization -/// is required -pub unsafe fn unchecked_unsigned_overflowing_sub_integer_radix_kb_assign_async< - T: UnsignedInteger, - B: Numeric, ->( - streams: &CudaStreams, - ct_res: &mut CudaVec, - ct_overflowed: &mut CudaVec, - lhs: &CudaVec, - rhs: &CudaVec, - bootstrapping_key: &CudaVec, - keyswitch_key: &CudaVec, - message_modulus: MessageModulus, - carry_modulus: CarryModulus, - glwe_dimension: GlweDimension, - polynomial_size: PolynomialSize, - big_lwe_dimension: LweDimension, - 
small_lwe_dimension: LweDimension, - ks_level: DecompositionLevelCount, - ks_base_log: DecompositionBaseLog, - pbs_level: DecompositionLevelCount, - pbs_base_log: DecompositionBaseLog, - num_blocks: u32, - pbs_type: PBSType, - grouping_factor: LweBskGroupingFactor, -) { - assert_eq!( - streams.gpu_indexes[0], - ct_res.gpu_index(0), - "GPU error: all data should reside on the same GPU." - ); - assert_eq!( - streams.gpu_indexes[0], - ct_overflowed.gpu_index(0), - "GPU error: all data should reside on the same GPU." - ); - assert_eq!( - streams.gpu_indexes[0], - lhs.gpu_index(0), - "GPU error: all data should reside on the same GPU." - ); - assert_eq!( - streams.gpu_indexes[0], - rhs.gpu_index(0), - "GPU error: all data should reside on the same GPU." - ); - assert_eq!( - streams.gpu_indexes[0], - bootstrapping_key.gpu_index(0), - "GPU error: all data should reside on the same GPU." - ); - assert_eq!( - streams.gpu_indexes[0], - keyswitch_key.gpu_index(0), - "GPU error: all data should reside on the same GPU." - ); - let mut mem_ptr: *mut i8 = std::ptr::null_mut(); - scratch_cuda_integer_radix_overflowing_sub_kb_64( - streams.ptr.as_ptr(), - streams.gpu_indexes.as_ptr(), - streams.len() as u32, - std::ptr::addr_of_mut!(mem_ptr), - glwe_dimension.0 as u32, - polynomial_size.0 as u32, - big_lwe_dimension.0 as u32, - small_lwe_dimension.0 as u32, - ks_level.0 as u32, - ks_base_log.0 as u32, - pbs_level.0 as u32, - pbs_base_log.0 as u32, - grouping_factor.0 as u32, - num_blocks, - message_modulus.0 as u32, - carry_modulus.0 as u32, - pbs_type as u32, - true, - ); - cuda_integer_radix_overflowing_sub_kb_64( - streams.ptr.as_ptr(), - streams.gpu_indexes.as_ptr(), - streams.len() as u32, - ct_res.as_mut_c_ptr(0), - ct_overflowed.as_mut_c_ptr(0), - lhs.as_c_ptr(0), - rhs.as_c_ptr(0), - mem_ptr, - bootstrapping_key.ptr.as_ptr(), - keyswitch_key.ptr.as_ptr(), - num_blocks, - ); - cleanup_cuda_integer_radix_overflowing_sub( - streams.ptr.as_ptr(), - streams.gpu_indexes.as_ptr(), - streams.len() as u32, - std::ptr::addr_of_mut!(mem_ptr), - ); -} - #[allow(clippy::too_many_arguments)] /// # Safety /// @@ -2356,7 +2273,7 @@ pub unsafe fn apply_many_univariate_lut_kb_async carry_modulus: CarryModulus, pbs_type: PBSType, grouping_factor: LweBskGroupingFactor, - lut_count: u32, + num_many_lut: u32, lut_stride: u32, ) { assert_eq!( @@ -2410,7 +2327,7 @@ pub unsafe fn apply_many_univariate_lut_kb_async keyswitch_key.ptr.as_ptr(), bootstrapping_key.ptr.as_ptr(), num_blocks, - lut_count, + num_many_lut, lut_stride, ); cleanup_cuda_apply_univariate_lut_kb_64( @@ -2592,67 +2509,83 @@ pub unsafe fn unchecked_div_rem_integer_radix_kb_assign_async( +pub unsafe fn compute_prefix_sum_hillis_steele_async( streams: &CudaStreams, - lhs: &mut CudaVec, - rhs: &CudaVec, - overflowed: &mut CudaVec, - signed_operation: i8, + radix_lwe_output: &mut CudaSliceMut, + generates_or_propagates: &mut CudaSliceMut, + input_lut: &[T], bootstrapping_key: &CudaVec, keyswitch_key: &CudaVec, - message_modulus: MessageModulus, - carry_modulus: CarryModulus, + lwe_dimension: LweDimension, glwe_dimension: GlweDimension, polynomial_size: PolynomialSize, - big_lwe_dimension: LweDimension, - small_lwe_dimension: LweDimension, ks_level: DecompositionLevelCount, ks_base_log: DecompositionBaseLog, pbs_level: DecompositionLevelCount, pbs_base_log: DecompositionBaseLog, num_blocks: u32, + message_modulus: MessageModulus, + carry_modulus: CarryModulus, pbs_type: PBSType, grouping_factor: LweBskGroupingFactor, + shift: u32, ) { + assert_eq!( + 
streams.gpu_indexes[0], + generates_or_propagates.gpu_index(0), + "GPU error: all data should reside on the same GPU." + ); + assert_eq!( + streams.gpu_indexes[0], + radix_lwe_output.gpu_index(0), + "GPU error: all data should reside on the same GPU." + ); + assert_eq!( + streams.gpu_indexes[0], + bootstrapping_key.gpu_index(0), + "GPU error: all data should reside on the same GPU." + ); + assert_eq!( + streams.gpu_indexes[0], + keyswitch_key.gpu_index(0), + "GPU error: all data should reside on the same GPU." + ); let mut mem_ptr: *mut i8 = std::ptr::null_mut(); - scratch_cuda_signed_overflowing_add_or_sub_radix_ciphertext_kb_64( + scratch_cuda_integer_compute_prefix_sum_hillis_steele_64( streams.ptr.as_ptr(), streams.gpu_indexes.as_ptr(), streams.len() as u32, std::ptr::addr_of_mut!(mem_ptr), + input_lut.as_ptr().cast(), + lwe_dimension.0 as u32, glwe_dimension.0 as u32, polynomial_size.0 as u32, - big_lwe_dimension.0 as u32, - small_lwe_dimension.0 as u32, ks_level.0 as u32, ks_base_log.0 as u32, pbs_level.0 as u32, pbs_base_log.0 as u32, grouping_factor.0 as u32, num_blocks, - signed_operation, message_modulus.0 as u32, carry_modulus.0 as u32, pbs_type as u32, true, ); - cuda_signed_overflowing_add_or_sub_radix_ciphertext_kb_64( + + cuda_integer_compute_prefix_sum_hillis_steele_64( streams.ptr.as_ptr(), streams.gpu_indexes.as_ptr(), streams.len() as u32, - lhs.as_mut_c_ptr(0), - rhs.as_c_ptr(0), - overflowed.as_mut_c_ptr(0), - signed_operation, + radix_lwe_output.as_mut_c_ptr(0), + generates_or_propagates.as_mut_c_ptr(0), mem_ptr, - bootstrapping_key.ptr.as_ptr(), keyswitch_key.ptr.as_ptr(), + bootstrapping_key.ptr.as_ptr(), num_blocks, + shift, ); - cleanup_signed_overflowing_add_or_sub( + + cleanup_cuda_integer_compute_prefix_sum_hillis_steele_64( streams.ptr.as_ptr(), streams.gpu_indexes.as_ptr(), streams.len() as u32, @@ -2665,11 +2598,43 @@ pub unsafe fn unchecked_signed_overflowing_add_or_sub_radix_kb_assign_async< /// /// - [CudaStreams::synchronize] __must__ be called after this function as soon as synchronization /// is required -pub unsafe fn compute_prefix_sum_hillis_steele_async( +pub unsafe fn reverse_blocks_inplace_async( streams: &CudaStreams, radix_lwe_output: &mut CudaSliceMut, - generates_or_propagates: &mut CudaSliceMut, - input_lut: &[T], + num_blocks: u32, + lwe_size: u32, +) { + assert_eq!( + streams.gpu_indexes[0], + radix_lwe_output.gpu_index(0), + "GPU error: all data should reside on the same GPU." + ); + if num_blocks > 1 { + cuda_integer_reverse_blocks_64_inplace( + streams.ptr.as_ptr(), + streams.gpu_indexes.as_ptr(), + streams.len() as u32, + radix_lwe_output.as_mut_c_ptr(0), + num_blocks, + lwe_size, + ); + } +} + +#[allow(clippy::too_many_arguments)] +/// # Safety +/// +/// - [CudaStreams::synchronize] __must__ be called after this function as soon as synchronization +/// is required +pub(crate) unsafe fn unchecked_unsigned_overflowing_sub_integer_radix_kb_assign_async< + T: UnsignedInteger, + B: Numeric, +>( + streams: &CudaStreams, + radix_lwe_input: &mut CudaVec, + radix_rhs_input: &CudaVec, + carry_out: &mut CudaVec, + carry_in: &CudaVec, bootstrapping_key: &CudaVec, keyswitch_key: &CudaVec, lwe_dimension: LweDimension, @@ -2684,16 +2649,12 @@ pub unsafe fn compute_prefix_sum_hillis_steele_async( - streams: &CudaStreams, - radix_lwe_output: &mut CudaSliceMut, - num_blocks: u32, - lwe_size: u32, -) { - assert_eq!( - streams.gpu_indexes[0], - radix_lwe_output.gpu_index(0), - "GPU error: all data should reside on the same GPU." 
- ); - if num_blocks > 1 { - cuda_integer_reverse_blocks_64_inplace( - streams.ptr.as_ptr(), - streams.gpu_indexes.as_ptr(), - streams.len() as u32, - radix_lwe_output.as_mut_c_ptr(0), - num_blocks, - lwe_size, - ); - } -} - #[allow(clippy::too_many_arguments)] /// # Safety /// diff --git a/tfhe/src/integer/gpu/server_key/radix/add.rs b/tfhe/src/integer/gpu/server_key/radix/add.rs index 3d746230a6..797de9836f 100644 --- a/tfhe/src/integer/gpu/server_key/radix/add.rs +++ b/tfhe/src/integer/gpu/server_key/radix/add.rs @@ -8,17 +8,11 @@ use crate::integer::gpu::ciphertext::{ use crate::integer::gpu::server_key::{CudaBootstrappingKey, CudaServerKey}; use crate::integer::gpu::{ unchecked_add_integer_radix_assign_async, - unchecked_partial_sum_ciphertexts_integer_radix_kb_assign_async, - unchecked_signed_overflowing_add_or_sub_radix_kb_assign_async, PBSType, + unchecked_partial_sum_ciphertexts_integer_radix_kb_assign_async, PBSType, }; +use crate::integer::server_key::radix_parallel::OutputFlag; use crate::shortint::ciphertext::NoiseLevel; -#[derive(Copy, Clone, PartialEq, Eq)] -pub(crate) enum SignedOperation { - Addition, - Subtraction, -} - impl CudaServerKey { /// Computes homomorphically an addition between two ciphertexts encrypting integer values. /// @@ -114,8 +108,14 @@ impl CudaServerKey { (ct_left, &tmp_rhs) } }; - self.unchecked_add_assign_async(lhs, rhs, streams); - let _carry = self.propagate_single_carry_assign_async(lhs, streams); + + let _carry = self.add_and_propagate_single_carry_assign_async( + lhs, + rhs, + streams, + None, + OutputFlag::None, + ); } pub fn add_assign( @@ -348,7 +348,7 @@ impl CudaServerKey { .unchecked_partial_sum_ciphertexts_async(ciphertexts, streams) .unwrap(); - self.propagate_single_carry_assign_async(&mut result, streams); + self.propagate_single_carry_assign_async(&mut result, streams, None, OutputFlag::None); assert!(result.block_carries_are_empty()); result } @@ -535,8 +535,58 @@ impl CudaServerKey { rhs: &CudaUnsignedRadixCiphertext, stream: &CudaStreams, ) -> (CudaUnsignedRadixCiphertext, CudaBooleanBlock) { - let mut ct_res = self.unchecked_add(lhs, rhs, stream); - let mut carry_out = self.propagate_single_carry_assign_async(&mut ct_res, stream); + let output_flag = OutputFlag::from_signedness(CudaUnsignedRadixCiphertext::IS_SIGNED); + + let mut ct_res = lhs.duplicate_async(stream); + let mut carry_out: CudaUnsignedRadixCiphertext = self + .add_and_propagate_single_carry_assign_async( + &mut ct_res, + rhs, + stream, + None, + output_flag, + ); + + ct_res.as_mut().info = ct_res + .as_ref() + .info + .after_overflowing_add(&rhs.as_ref().info); + + if lhs.as_ref().info.blocks.last().unwrap().noise_level == NoiseLevel::ZERO + && rhs.as_ref().info.blocks.last().unwrap().noise_level == NoiseLevel::ZERO + { + carry_out.as_mut().info = carry_out.as_ref().info.boolean_info(NoiseLevel::ZERO); + } else { + carry_out.as_mut().info = carry_out.as_ref().info.boolean_info(NoiseLevel::NOMINAL); + } + + let ct_overflowed = CudaBooleanBlock::from_cuda_radix_ciphertext(carry_out.ciphertext); + + (ct_res, ct_overflowed) + } + + /// # Safety + /// + /// - `stream` __must__ be synchronized to guarantee computation has finished, and inputs must + /// not be dropped until stream is synchronised + pub unsafe fn unchecked_signed_overflowing_add_async( + &self, + lhs: &CudaSignedRadixCiphertext, + rhs: &CudaSignedRadixCiphertext, + input_carry: Option<&CudaBooleanBlock>, + stream: &CudaStreams, + ) -> (CudaSignedRadixCiphertext, CudaBooleanBlock) { + let output_flag = 
OutputFlag::from_signedness(CudaSignedRadixCiphertext::IS_SIGNED); + + let mut ct_res = lhs.duplicate_async(stream); + let mut carry_out: CudaSignedRadixCiphertext = self + .add_and_propagate_single_carry_assign_async( + &mut ct_res, + rhs, + stream, + input_carry, + output_flag, + ); ct_res.as_mut().info = ct_res .as_ref() @@ -655,141 +705,13 @@ impl CudaServerKey { "inputs cannot be empty" ); - self.unchecked_signed_overflowing_add_or_sub( - ct_left, - ct_right, - SignedOperation::Addition, - stream, - ) - } - - pub(crate) fn unchecked_signed_overflowing_add_or_sub( - &self, - lhs: &CudaSignedRadixCiphertext, - rhs: &CudaSignedRadixCiphertext, - signed_operation: SignedOperation, - streams: &CudaStreams, - ) -> (CudaSignedRadixCiphertext, CudaBooleanBlock) { - assert!(self.message_modulus.0 >= 4 && self.carry_modulus.0 >= 4); - - let mut result: CudaSignedRadixCiphertext; + let result; + let overflowed; unsafe { - result = lhs.duplicate_async(streams); - } - let carry_out: CudaSignedRadixCiphertext = - unsafe { self.create_trivial_zero_radix_async(1, streams) }; - let mut overflowed = CudaBooleanBlock::from_cuda_radix_ciphertext(carry_out.ciphertext); - - unsafe { - self.unchecked_signed_overflowing_add_or_sub_assign_async( - &mut result, - rhs, - &mut overflowed, - signed_operation, - streams, - ); - } - streams.synchronize(); - - (result, overflowed) - } - - /// # Safety - /// - /// - `stream` __must__ be synchronized to guarantee computation has finished, and inputs must - /// not be dropped until stream is synchronized - pub(crate) unsafe fn unchecked_signed_overflowing_add_or_sub_assign_async( - &self, - lhs: &mut CudaSignedRadixCiphertext, - rhs: &CudaSignedRadixCiphertext, - overflowed: &mut CudaBooleanBlock, - signed_operation: SignedOperation, - streams: &CudaStreams, - ) { - if lhs.as_ref().info.blocks.last().unwrap().noise_level == NoiseLevel::ZERO - && rhs.as_ref().info.blocks.last().unwrap().noise_level == NoiseLevel::ZERO - { - overflowed.as_mut().ciphertext.info = overflowed - .as_ref() - .ciphertext - .info - .boolean_info(NoiseLevel::ZERO); - } else { - overflowed.as_mut().ciphertext.info = overflowed - .as_ref() - .ciphertext - .info - .boolean_info(NoiseLevel::NOMINAL); - } - let num_blocks = lhs.as_ref().d_blocks.lwe_ciphertext_count().0 as u32; - let signed_operation_numeric: i8 = - if matches!(signed_operation, SignedOperation::Subtraction) { - -1 - } else { - 1 - }; - match &self.bootstrapping_key { - CudaBootstrappingKey::Classic(d_bsk) => { - unchecked_signed_overflowing_add_or_sub_radix_kb_assign_async( - streams, - &mut lhs.as_mut().d_blocks.0.d_vec, - &rhs.as_ref().d_blocks.0.d_vec, - &mut overflowed.as_mut().ciphertext.d_blocks.0.d_vec, - signed_operation_numeric, - &d_bsk.d_vec, - &self.key_switching_key.d_vec, - self.message_modulus, - self.carry_modulus, - d_bsk.glwe_dimension, - d_bsk.polynomial_size, - self.key_switching_key - .input_key_lwe_size() - .to_lwe_dimension(), - self.key_switching_key - .output_key_lwe_size() - .to_lwe_dimension(), - self.key_switching_key.decomposition_level_count(), - self.key_switching_key.decomposition_base_log(), - d_bsk.decomp_level_count, - d_bsk.decomp_base_log, - num_blocks, - PBSType::Classical, - LweBskGroupingFactor(0), - ); - } - CudaBootstrappingKey::MultiBit(d_multibit_bsk) => { - unchecked_signed_overflowing_add_or_sub_radix_kb_assign_async( - streams, - &mut lhs.as_mut().d_blocks.0.d_vec, - &rhs.as_ref().d_blocks.0.d_vec, - &mut overflowed.as_mut().ciphertext.d_blocks.0.d_vec, - signed_operation_numeric, - 
&d_multibit_bsk.d_vec, - &self.key_switching_key.d_vec, - self.message_modulus, - self.carry_modulus, - d_multibit_bsk.glwe_dimension, - d_multibit_bsk.polynomial_size, - self.key_switching_key - .input_key_lwe_size() - .to_lwe_dimension(), - self.key_switching_key - .output_key_lwe_size() - .to_lwe_dimension(), - self.key_switching_key.decomposition_level_count(), - self.key_switching_key.decomposition_base_log(), - d_multibit_bsk.decomp_level_count, - d_multibit_bsk.decomp_base_log, - num_blocks, - PBSType::MultiBit, - d_multibit_bsk.grouping_factor, - ); - } + (result, overflowed) = + self.unchecked_signed_overflowing_add_async(ct_left, ct_right, None, stream); }; - - lhs.as_mut().info = lhs - .as_ref() - .info - .after_overflowing_add(&rhs.ciphertext.info); + stream.synchronize(); + (result, overflowed) } } diff --git a/tfhe/src/integer/gpu/server_key/radix/bitwise_op.rs b/tfhe/src/integer/gpu/server_key/radix/bitwise_op.rs index 43e82bcd79..71006cb289 100644 --- a/tfhe/src/integer/gpu/server_key/radix/bitwise_op.rs +++ b/tfhe/src/integer/gpu/server_key/radix/bitwise_op.rs @@ -90,7 +90,6 @@ impl CudaServerKey { &d_decomposed_scalar, streams, ); - ct.as_mut().info = ct.as_ref().info.after_bitnot(); } pub fn unchecked_bitnot_assign( diff --git a/tfhe/src/integer/gpu/server_key/radix/mod.rs b/tfhe/src/integer/gpu/server_key/radix/mod.rs index f7571bf7d0..08e15a1318 100644 --- a/tfhe/src/integer/gpu/server_key/radix/mod.rs +++ b/tfhe/src/integer/gpu/server_key/radix/mod.rs @@ -6,6 +6,7 @@ use crate::core_crypto::prelude::{ ContiguousEntityContainerMut, LweBskGroupingFactor, LweCiphertextCount, }; use crate::integer::block_decomposition::{BlockDecomposer, DecomposableInto}; +use crate::integer::gpu::ciphertext::boolean_value::CudaBooleanBlock; use crate::integer::gpu::ciphertext::info::{CudaBlockInfo, CudaRadixCiphertextInfo}; use crate::integer::gpu::ciphertext::{ CudaIntegerRadixCiphertext, CudaRadixCiphertext, CudaSignedRadixCiphertext, @@ -13,10 +14,11 @@ use crate::integer::gpu::ciphertext::{ }; use crate::integer::gpu::server_key::CudaBootstrappingKey; use crate::integer::gpu::{ - apply_many_univariate_lut_kb_async, apply_univariate_lut_kb_async, full_propagate_assign_async, - propagate_single_carry_assign_async, propagate_single_carry_get_input_carries_assign_async, - CudaServerKey, PBSType, + add_and_propagate_single_carry_assign_async, apply_many_univariate_lut_kb_async, + apply_univariate_lut_kb_async, full_propagate_assign_async, + propagate_single_carry_assign_async, CudaServerKey, PBSType, }; +use crate::integer::server_key::radix_parallel::OutputFlag; use crate::shortint::ciphertext::{Degree, NoiseLevel}; use crate::shortint::engine::{fill_accumulator, fill_many_lut_accumulator}; use crate::shortint::server_key::{ @@ -203,6 +205,8 @@ impl CudaServerKey { &self, ct: &mut T, streams: &CudaStreams, + input_carry: Option<&CudaBooleanBlock>, + requested_flag: OutputFlag, ) -> T where T: CudaIntegerRadixCiphertext, @@ -210,12 +214,20 @@ impl CudaServerKey { let mut carry_out: T = self.create_trivial_zero_radix(1, streams); let ciphertext = ct.as_mut(); let num_blocks = ciphertext.d_blocks.lwe_ciphertext_count().0 as u32; + let uses_carry = input_carry.map_or(0u32, |_block| 1u32); + let mut aux_block: T = self.create_trivial_zero_radix(1, streams); + let in_carry_dvec = input_carry.map_or_else( + || &aux_block.as_mut().d_blocks.0.d_vec, + |block| &block.0.ciphertext.d_blocks.0.d_vec, + ); + match &self.bootstrapping_key { CudaBootstrappingKey::Classic(d_bsk) => { 
propagate_single_carry_assign_async( streams, &mut ciphertext.d_blocks.0.d_vec, &mut carry_out.as_mut().d_blocks.0.d_vec, + in_carry_dvec, &d_bsk.d_vec, &self.key_switching_key.d_vec, d_bsk.input_lwe_dimension(), @@ -230,6 +242,8 @@ impl CudaServerKey { ciphertext.info.blocks.first().unwrap().carry_modulus, PBSType::Classical, LweBskGroupingFactor(0), + requested_flag, + uses_carry, ); } CudaBootstrappingKey::MultiBit(d_multibit_bsk) => { @@ -237,6 +251,7 @@ impl CudaServerKey { streams, &mut ciphertext.d_blocks.0.d_vec, &mut carry_out.as_mut().d_blocks.0.d_vec, + in_carry_dvec, &d_multibit_bsk.d_vec, &self.key_switching_key.d_vec, d_multibit_bsk.input_lwe_dimension(), @@ -251,6 +266,8 @@ impl CudaServerKey { ciphertext.info.blocks.first().unwrap().carry_modulus, PBSType::MultiBit, d_multibit_bsk.grouping_factor, + requested_flag, + uses_carry, ); } }; @@ -269,26 +286,35 @@ impl CudaServerKey { /// /// - `streams` __must__ be synchronized to guarantee computation has finished, and inputs must /// not be dropped until streams is synchronized - #[allow(dead_code)] - pub(crate) unsafe fn propagate_single_carry_get_input_carries_assign_async( + pub(crate) unsafe fn add_and_propagate_single_carry_assign_async( &self, - ct: &mut T, - input_carries: &mut T, + lhs: &mut T, + rhs: &T, streams: &CudaStreams, + input_carry: Option<&CudaBooleanBlock>, + requested_flag: OutputFlag, ) -> T where T: CudaIntegerRadixCiphertext, { let mut carry_out: T = self.create_trivial_zero_radix(1, streams); - let ciphertext = ct.as_mut(); - let num_blocks = ciphertext.d_blocks.lwe_ciphertext_count().0 as u32; + + let num_blocks = lhs.as_mut().d_blocks.lwe_ciphertext_count().0 as u32; + let uses_carry = input_carry.map_or(0u32, |_block| 1u32); + let mut aux_block: T = self.create_trivial_zero_radix(1, streams); + let in_carry_dvec = input_carry.map_or_else( + || &aux_block.as_mut().d_blocks.0.d_vec, + |block| &block.0.ciphertext.d_blocks.0.d_vec, + ); + match &self.bootstrapping_key { CudaBootstrappingKey::Classic(d_bsk) => { - propagate_single_carry_get_input_carries_assign_async( + add_and_propagate_single_carry_assign_async( streams, - &mut ciphertext.d_blocks.0.d_vec, + &mut lhs.as_mut().d_blocks.0.d_vec, + &rhs.as_ref().d_blocks.0.d_vec, &mut carry_out.as_mut().d_blocks.0.d_vec, - &mut input_carries.as_mut().d_blocks.0.d_vec, + in_carry_dvec, &d_bsk.d_vec, &self.key_switching_key.d_vec, d_bsk.input_lwe_dimension(), @@ -299,18 +325,21 @@ impl CudaServerKey { d_bsk.decomp_level_count(), d_bsk.decomp_base_log(), num_blocks, - ciphertext.info.blocks.first().unwrap().message_modulus, - ciphertext.info.blocks.first().unwrap().carry_modulus, + self.message_modulus, + self.carry_modulus, PBSType::Classical, LweBskGroupingFactor(0), + requested_flag, + uses_carry, ); } CudaBootstrappingKey::MultiBit(d_multibit_bsk) => { - propagate_single_carry_get_input_carries_assign_async( + add_and_propagate_single_carry_assign_async( streams, - &mut ciphertext.d_blocks.0.d_vec, + &mut lhs.as_mut().d_blocks.0.d_vec, + &rhs.as_ref().d_blocks.0.d_vec, &mut carry_out.as_mut().d_blocks.0.d_vec, - &mut input_carries.as_mut().d_blocks.0.d_vec, + in_carry_dvec, &d_multibit_bsk.d_vec, &self.key_switching_key.d_vec, d_multibit_bsk.input_lwe_dimension(), @@ -321,14 +350,16 @@ impl CudaServerKey { d_multibit_bsk.decomp_level_count(), d_multibit_bsk.decomp_base_log(), num_blocks, - ciphertext.info.blocks.first().unwrap().message_modulus, - ciphertext.info.blocks.first().unwrap().carry_modulus, + self.message_modulus, + self.carry_modulus, 
PBSType::MultiBit, d_multibit_bsk.grouping_factor, + requested_flag, + uses_carry, ); } }; - ciphertext.info.blocks.iter_mut().for_each(|b| { + lhs.as_mut().info.blocks.iter_mut().for_each(|b| { b.degree = Degree::new(b.message_modulus.0 - 1); b.noise_level = NoiseLevel::NOMINAL; }); diff --git a/tfhe/src/integer/gpu/server_key/radix/neg.rs b/tfhe/src/integer/gpu/server_key/radix/neg.rs index d7156919cf..ddae4c8620 100644 --- a/tfhe/src/integer/gpu/server_key/radix/neg.rs +++ b/tfhe/src/integer/gpu/server_key/radix/neg.rs @@ -1,6 +1,7 @@ use crate::core_crypto::gpu::{negate_integer_radix_async, CudaStreams}; use crate::integer::gpu::ciphertext::CudaIntegerRadixCiphertext; use crate::integer::gpu::server_key::CudaServerKey; +use crate::integer::server_key::radix_parallel::OutputFlag; impl CudaServerKey { /// Homomorphically computes the opposite of a ciphertext encrypting an integer message. @@ -144,7 +145,8 @@ impl CudaServerKey { }; let mut res = self.unchecked_neg_async(ct, streams); - let _carry = self.propagate_single_carry_assign_async(&mut res, streams); + let _carry = + self.propagate_single_carry_assign_async(&mut res, streams, None, OutputFlag::None); res } } diff --git a/tfhe/src/integer/gpu/server_key/radix/scalar_add.rs b/tfhe/src/integer/gpu/server_key/radix/scalar_add.rs index 7c30c789bb..68def18dec 100644 --- a/tfhe/src/integer/gpu/server_key/radix/scalar_add.rs +++ b/tfhe/src/integer/gpu/server_key/radix/scalar_add.rs @@ -8,6 +8,7 @@ use crate::integer::gpu::ciphertext::{ }; use crate::integer::gpu::scalar_addition_integer_radix_assign_async; use crate::integer::gpu::server_key::CudaServerKey; +use crate::integer::server_key::radix_parallel::OutputFlag; use crate::prelude::CastInto; use crate::shortint::ciphertext::NoiseLevel; @@ -186,7 +187,7 @@ impl CudaServerKey { }; self.unchecked_scalar_add_assign_async(ct, scalar, streams); - let _carry = self.propagate_single_carry_assign_async(ct, streams); + let _carry = self.propagate_single_carry_assign_async(ct, streams, None, OutputFlag::None); } pub fn scalar_add_assign(&self, ct: &mut T, scalar: Scalar, streams: &CudaStreams) @@ -264,7 +265,8 @@ impl CudaServerKey { self.unchecked_scalar_add_assign(ct_left, scalar, stream); let mut carry_out; unsafe { - carry_out = self.propagate_single_carry_assign_async(ct_left, stream); + carry_out = + self.propagate_single_carry_assign_async(ct_left, stream, None, OutputFlag::Carry); } stream.synchronize(); diff --git a/tfhe/src/integer/gpu/server_key/radix/scalar_sub.rs b/tfhe/src/integer/gpu/server_key/radix/scalar_sub.rs index d117927343..ff5e2801c7 100644 --- a/tfhe/src/integer/gpu/server_key/radix/scalar_sub.rs +++ b/tfhe/src/integer/gpu/server_key/radix/scalar_sub.rs @@ -4,6 +4,7 @@ use crate::integer::block_decomposition::{BlockDecomposer, DecomposableInto}; use crate::integer::gpu::ciphertext::boolean_value::CudaBooleanBlock; use crate::integer::gpu::ciphertext::{CudaIntegerRadixCiphertext, CudaSignedRadixCiphertext}; use crate::integer::gpu::server_key::CudaServerKey; +use crate::integer::server_key::radix_parallel::OutputFlag; use crate::integer::server_key::TwosComplementNegation; use crate::prelude::CastInto; @@ -151,7 +152,7 @@ impl CudaServerKey { }; self.unchecked_scalar_sub_assign_async(ct, scalar, stream); - let _carry = self.propagate_single_carry_assign_async(ct, stream); + let _carry = self.propagate_single_carry_assign_async(ct, stream, None, OutputFlag::None); } pub fn scalar_sub_assign(&self, ct: &mut T, scalar: Scalar, stream: &CudaStreams) diff --git 
a/tfhe/src/integer/gpu/server_key/radix/sub.rs b/tfhe/src/integer/gpu/server_key/radix/sub.rs index 8e784a3686..b44ae35710 100644 --- a/tfhe/src/integer/gpu/server_key/radix/sub.rs +++ b/tfhe/src/integer/gpu/server_key/radix/sub.rs @@ -1,18 +1,17 @@ -use super::add::SignedOperation; -use crate::core_crypto::gpu::lwe_ciphertext_list::CudaLweCiphertextList; use crate::core_crypto::gpu::CudaStreams; -use crate::core_crypto::prelude::{CiphertextModulus, LweBskGroupingFactor, LweCiphertextCount}; use crate::integer::gpu::ciphertext::boolean_value::CudaBooleanBlock; -use crate::integer::gpu::ciphertext::info::CudaRadixCiphertextInfo; use crate::integer::gpu::ciphertext::{ - CudaIntegerRadixCiphertext, CudaRadixCiphertext, CudaSignedRadixCiphertext, - CudaUnsignedRadixCiphertext, + CudaIntegerRadixCiphertext, CudaSignedRadixCiphertext, CudaUnsignedRadixCiphertext, }; -use crate::integer::gpu::server_key::{CudaBootstrappingKey, CudaServerKey}; +use crate::integer::gpu::server_key::CudaServerKey; + +use crate::integer::gpu::server_key::CudaBootstrappingKey; use crate::integer::gpu::{ unchecked_unsigned_overflowing_sub_integer_radix_kb_assign_async, PBSType, }; +use crate::integer::server_key::radix_parallel::OutputFlag; use crate::shortint::ciphertext::NoiseLevel; +use crate::shortint::parameters::{Degree, LweBskGroupingFactor}; impl CudaServerKey { /// Computes homomorphically a subtraction between two ciphertexts encrypting integer values. @@ -271,8 +270,14 @@ impl CudaServerKey { } }; - self.unchecked_sub_assign_async(lhs, rhs, streams); - let _carry = self.propagate_single_carry_assign_async(lhs, streams); + let neg_rhs = self.unchecked_neg_async(rhs, streams); + let _carry = self.add_and_propagate_single_carry_assign_async( + lhs, + &neg_rhs, + streams, + None, + OutputFlag::None, + ); } pub fn unsigned_overflowing_sub( @@ -353,87 +358,102 @@ impl CudaServerKey { rhs: &CudaUnsignedRadixCiphertext, stream: &CudaStreams, ) -> (CudaUnsignedRadixCiphertext, CudaBooleanBlock) { - let num_blocks = lhs.as_ref().d_blocks.lwe_ciphertext_count().0 as u32; - let mut tmp: CudaUnsignedRadixCiphertext = self.create_trivial_zero_radix(1, stream); - if lhs.as_ref().info.blocks.last().unwrap().noise_level == NoiseLevel::ZERO - && rhs.as_ref().info.blocks.last().unwrap().noise_level == NoiseLevel::ZERO - { - tmp.as_mut().info = tmp.as_ref().info.boolean_info(NoiseLevel::ZERO); - } else { - tmp.as_mut().info = tmp.as_ref().info.boolean_info(NoiseLevel::NOMINAL); - } let mut ct_res = lhs.duplicate_async(stream); - let block = CudaLweCiphertextList::new( - tmp.as_ref().d_blocks.lwe_dimension(), - LweCiphertextCount(1), - CiphertextModulus::new_native(), - stream, - ); - let block_info = tmp.as_ref().info.blocks[0]; - let ct_info = vec![block_info]; - let ct_info = CudaRadixCiphertextInfo { blocks: ct_info }; - let mut ct_overflowed = - CudaBooleanBlock::from_cuda_radix_ciphertext(CudaRadixCiphertext::new(block, ct_info)); + let compute_overflow = true; + const INPUT_BORROW: Option<&CudaBooleanBlock> = None; + + let mut overflow_block: CudaUnsignedRadixCiphertext = + self.create_trivial_zero_radix(1, stream); + let ciphertext = ct_res.as_mut(); + let num_blocks = ciphertext.d_blocks.lwe_ciphertext_count().0 as u32; + let uses_input_borrow = INPUT_BORROW.map_or(0u32, |_block| 1u32); + + let mut aux_block: CudaUnsignedRadixCiphertext = self.create_trivial_zero_radix(1, stream); + let in_carry_dvec = INPUT_BORROW.map_or_else( + || &aux_block.as_mut().d_blocks.0.d_vec, + |block| &block.0.ciphertext.d_blocks.0.d_vec, + 
); match &self.bootstrapping_key { CudaBootstrappingKey::Classic(d_bsk) => { unchecked_unsigned_overflowing_sub_integer_radix_kb_assign_async( stream, - &mut ct_res.as_mut().d_blocks.0.d_vec, - &mut ct_overflowed.as_mut().ciphertext.d_blocks.0.d_vec, - &lhs.as_ref().d_blocks.0.d_vec, + &mut ciphertext.d_blocks.0.d_vec, &rhs.as_ref().d_blocks.0.d_vec, + &mut overflow_block.as_mut().d_blocks.0.d_vec, + in_carry_dvec, &d_bsk.d_vec, &self.key_switching_key.d_vec, - self.message_modulus, - self.carry_modulus, - d_bsk.glwe_dimension, - d_bsk.polynomial_size, - self.key_switching_key - .input_key_lwe_size() - .to_lwe_dimension(), - self.key_switching_key - .output_key_lwe_size() - .to_lwe_dimension(), + d_bsk.input_lwe_dimension(), + d_bsk.glwe_dimension(), + d_bsk.polynomial_size(), self.key_switching_key.decomposition_level_count(), self.key_switching_key.decomposition_base_log(), - d_bsk.decomp_level_count, - d_bsk.decomp_base_log, + d_bsk.decomp_level_count(), + d_bsk.decomp_base_log(), num_blocks, + ciphertext.info.blocks.first().unwrap().message_modulus, + ciphertext.info.blocks.first().unwrap().carry_modulus, PBSType::Classical, LweBskGroupingFactor(0), + compute_overflow, + uses_input_borrow, ); } CudaBootstrappingKey::MultiBit(d_multibit_bsk) => { unchecked_unsigned_overflowing_sub_integer_radix_kb_assign_async( stream, - &mut ct_res.as_mut().d_blocks.0.d_vec, - &mut ct_overflowed.as_mut().ciphertext.d_blocks.0.d_vec, - &lhs.as_ref().d_blocks.0.d_vec, + &mut ciphertext.d_blocks.0.d_vec, &rhs.as_ref().d_blocks.0.d_vec, + &mut overflow_block.as_mut().d_blocks.0.d_vec, + in_carry_dvec, &d_multibit_bsk.d_vec, &self.key_switching_key.d_vec, - self.message_modulus, - self.carry_modulus, - d_multibit_bsk.glwe_dimension, - d_multibit_bsk.polynomial_size, - self.key_switching_key - .input_key_lwe_size() - .to_lwe_dimension(), - self.key_switching_key - .output_key_lwe_size() - .to_lwe_dimension(), + d_multibit_bsk.input_lwe_dimension(), + d_multibit_bsk.glwe_dimension(), + d_multibit_bsk.polynomial_size(), self.key_switching_key.decomposition_level_count(), self.key_switching_key.decomposition_base_log(), - d_multibit_bsk.decomp_level_count, - d_multibit_bsk.decomp_base_log, + d_multibit_bsk.decomp_level_count(), + d_multibit_bsk.decomp_base_log(), num_blocks, + ciphertext.info.blocks.first().unwrap().message_modulus, + ciphertext.info.blocks.first().unwrap().carry_modulus, PBSType::MultiBit, d_multibit_bsk.grouping_factor, + compute_overflow, + uses_input_borrow, ); } }; + ciphertext.info.blocks.iter_mut().for_each(|b| { + b.degree = Degree::new(b.message_modulus.0 - 1); + b.noise_level = NoiseLevel::NOMINAL; + }); + overflow_block + .as_mut() + .info + .blocks + .iter_mut() + .for_each(|b| { + b.degree = Degree::new(1); + b.noise_level = NoiseLevel::ZERO; + }); + + if lhs.as_ref().info.blocks.last().unwrap().noise_level == NoiseLevel::ZERO + && rhs.as_ref().info.blocks.last().unwrap().noise_level == NoiseLevel::ZERO + { + overflow_block.as_mut().info = + overflow_block.as_ref().info.boolean_info(NoiseLevel::ZERO); + } else { + overflow_block.as_mut().info = overflow_block + .as_ref() + .info + .boolean_info(NoiseLevel::NOMINAL); + } + + let ct_overflowed = CudaBooleanBlock::from_cuda_radix_ciphertext(overflow_block.ciphertext); ct_res.as_mut().info = ct_res .as_ref() @@ -541,11 +561,34 @@ impl CudaServerKey { ct_left.as_ref().d_blocks.lwe_ciphertext_count().0 > 0, "inputs cannot be empty" ); + let result; + let overflowed; + unsafe { + (result, overflowed) = + 
+                self.unchecked_signed_overflowing_sub_async(ct_left, ct_right, stream);
+        };
+        stream.synchronize();
+        (result, overflowed)
+    }
+    /// # Safety
+    ///
+    /// - `stream` __must__ be synchronized to guarantee computation has finished, and inputs must
+    ///   not be dropped until stream is synchronized
+    pub unsafe fn unchecked_signed_overflowing_sub_async(
+        &self,
+        ct_left: &CudaSignedRadixCiphertext,
+        ct_right: &CudaSignedRadixCiphertext,
+        stream: &CudaStreams,
+    ) -> (CudaSignedRadixCiphertext, CudaBooleanBlock) {
+        let flipped_rhs = self.bitnot(ct_right, stream);
+        let ct_input_carry: CudaUnsignedRadixCiphertext =
+            self.create_trivial_radix_async(1, 1, stream);
+        let input_carry = CudaBooleanBlock::from_cuda_radix_ciphertext(ct_input_carry.ciphertext);
-        self.unchecked_signed_overflowing_add_or_sub(
+        self.unchecked_signed_overflowing_add_async(
             ct_left,
-            ct_right,
-            SignedOperation::Subtraction,
+            &flipped_rhs,
+            Some(&input_carry),
             stream,
         )
     }
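
Note on the carry/borrow plumbing introduced above: the sketch below is a minimal clear-value analogue (plain Rust, no GPU types; the helper names, the `u8` block layout and `main` are illustrative assumptions, not tfhe-rs API) of what the unified entry points compute. `unchecked_signed_overflowing_sub_async` rewrites `lhs - rhs` as `lhs + bitnot(rhs)` with a trivial input carry of 1, and the add-and-propagate path then ripples a single carry across the radix blocks, optionally returning it as a carry/overflow block depending on `requested_flag` (`FLAG_NONE` / `FLAG_OVERFLOW` / `FLAG_CARRY`).

// Clear-value sketch: subtraction as addition of the block-wise complement
// with an input carry of 1, plus single-carry propagation across blocks.
// Blocks are little-endian and hold values < message_modulus, as in a clean
// radix ciphertext. Everything here is illustrative, not tfhe-rs API.
fn add_and_propagate_single_carry(
    lhs: &mut [u8],        // radix blocks, least significant first
    rhs: &[u8],
    mut carry: u8,         // the optional input carry (0 or 1), cf. `uses_carry`
    message_modulus: u8,
) -> u8 {
    for (l, r) in lhs.iter_mut().zip(rhs) {
        let sum = *l + *r + carry;     // stays within a block's carry space
        *l = sum % message_modulus;    // message part stays in the block
        carry = sum / message_modulus; // single carry moves to the next block
    }
    carry // what a FLAG_CARRY-style output block would encrypt
}

fn overflowing_sub(lhs: &mut [u8], rhs: &[u8], message_modulus: u8) -> bool {
    // lhs - rhs == lhs + !rhs + 1 on the radix representation
    let flipped: Vec<u8> = rhs
        .iter()
        .map(|r| (message_modulus - 1) - *r) // block-wise "bitnot"
        .collect();
    let carry_out = add_and_propagate_single_carry(lhs, &flipped, 1, message_modulus);
    carry_out == 0 // no carry out of the last block <=> the subtraction borrowed
}

fn main() {
    let message_modulus = 4u8;        // 2-bit message blocks
    let mut lhs = vec![1u8, 0, 0, 0]; // encodes 1 over four blocks (8 bits total)
    let rhs = vec![2u8, 0, 0, 0];     // encodes 2
    let overflowed = overflowing_sub(&mut lhs, &rhs, message_modulus);
    assert!(overflowed);                  // 1 - 2 underflows for an unsigned radix value
    assert_eq!(lhs, vec![3u8, 3, 3, 3]);  // wrapped result: 255 = (1 - 2) mod 256
}

The flag computed here is the unsigned borrow, i.e. the FLAG_CARRY-style output used by the unsigned overflowing-sub path; the signed paths pick their flag via `OutputFlag::from_signedness`, which for a signed ciphertext presumably maps to the overflow flag (in two's complement, signed overflow is detected from the carries around the most significant block rather than from the raw carry out). Folding the addition into the carry-propagation kernel is also what lets the `add_assign` and `sub_assign` paths above issue a single `add_and_propagate_single_carry_assign_async` call (for subtraction, on the negated rhs) instead of an unchecked add followed by `propagate_single_carry_assign_async`.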