diff --git a/backends/tfhe-cuda-backend/cuda/include/integer/integer.h b/backends/tfhe-cuda-backend/cuda/include/integer/integer.h index a9990423fe..8f3a1ce07d 100644 --- a/backends/tfhe-cuda-backend/cuda/include/integer/integer.h +++ b/backends/tfhe-cuda-backend/cuda/include/integer/integer.h @@ -297,6 +297,47 @@ void cleanup_cuda_propagate_single_carry(void *const *streams, uint32_t gpu_count, int8_t **mem_ptr_void); +void scratch_cuda_fast_propagate_single_carry_kb_64_inplace( + void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count, + int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size, + uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level, + uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log, + uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus, + uint32_t carry_modulus, PBS_TYPE pbs_type, uint32_t requested_flag, + uint32_t uses_carry, bool allocate_gpu_memory); + +void cuda_fast_propagate_single_carry_kb_64_inplace( + void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count, + void *lwe_array, void *carry_out, const void *carry_in, int8_t *mem_ptr, + void *const *bsks, void *const *ksks, uint32_t num_blocks, + uint32_t requested_flag, uint32_t uses_carry); + +void cleanup_cuda_fast_propagate_single_carry(void *const *streams, + uint32_t const *gpu_indexes, + uint32_t gpu_count, + int8_t **mem_ptr_void); + +void scratch_cuda_integer_overflowing_sub_kb_64_inplace( + void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count, + int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size, + uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level, + uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log, + uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus, + uint32_t carry_modulus, PBS_TYPE pbs_type, uint32_t compute_overflow, + bool allocate_gpu_memory); + +void cuda_integer_overflowing_sub_kb_64_inplace( + void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count, + void *lhs_array, const void *rhs_array, void *overflow_block, + const void *input_borrow, int8_t *mem_ptr, void *const *bsks, + void *const *ksks, uint32_t num_blocks, uint32_t compute_overflow, + uint32_t uses_input_borrow); + +void cleanup_cuda_integer_overflowing_sub(void *const *streams, + uint32_t const *gpu_indexes, + uint32_t gpu_count, + int8_t **mem_ptr_void); + void scratch_cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64( void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size, diff --git a/backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h b/backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h index 6dc85d4225..2823bd1179 100644 --- a/backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h +++ b/backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h @@ -53,6 +53,12 @@ void generate_device_accumulator(cudaStream_t stream, uint32_t gpu_index, uint32_t carry_modulus, std::function f); +template +void generate_many_lut_device_accumulator( + cudaStream_t stream, uint32_t gpu_index, Torus *acc, + uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t message_modulus, + uint32_t carry_modulus, std::vector> &f); + struct int_radix_params { PBS_TYPE pbs_type; uint32_t glwe_dimension; @@ -1116,6 +1122,1267 @@ template struct int_sum_ciphertexts_vec_memory { delete scp_mem; } }; +// For 
sequential algorithm in group propagation +template struct int_seq_group_prop_memory { + + Torus *group_resolved_carries; + int_radix_lut *lut_sequential_algorithm; + uint32_t grouping_size; + + int_seq_group_prop_memory(cudaStream_t const *streams, + uint32_t const *gpu_indexes, uint32_t gpu_count, + int_radix_params params, uint32_t group_size, + uint32_t big_lwe_size_bytes, + bool allocate_gpu_memory) { + + auto glwe_dimension = params.glwe_dimension; + auto polynomial_size = params.polynomial_size; + auto message_modulus = params.message_modulus; + auto carry_modulus = params.carry_modulus; + + grouping_size = group_size; + group_resolved_carries = (Torus *)cuda_malloc_async( + (grouping_size)*big_lwe_size_bytes, streams[0], gpu_indexes[0]); + cuda_memset_async(group_resolved_carries, 0, + (grouping_size)*big_lwe_size_bytes, streams[0], + gpu_indexes[0]); + + int num_seq_luts = grouping_size - 1; + Torus *h_seq_lut_indexes = (Torus *)malloc(num_seq_luts * sizeof(Torus)); + lut_sequential_algorithm = new int_radix_lut( + streams, gpu_indexes, gpu_count, params, num_seq_luts, num_seq_luts, + allocate_gpu_memory); + for (int index = 0; index < num_seq_luts; index++) { + auto f_lut_sequential = [index](Torus propa_cum_sum_block) { + return (propa_cum_sum_block >> (index + 1)) & 1; + }; + auto seq_lut = lut_sequential_algorithm->get_lut(gpu_indexes[0], index); + generate_device_accumulator( + streams[0], gpu_indexes[0], seq_lut, glwe_dimension, polynomial_size, + message_modulus, carry_modulus, f_lut_sequential); + h_seq_lut_indexes[index] = index; + } + Torus *seq_lut_indexes = + lut_sequential_algorithm->get_lut_indexes(gpu_indexes[0], 0); + cuda_memcpy_async_to_gpu(seq_lut_indexes, h_seq_lut_indexes, + num_seq_luts * sizeof(Torus), streams[0], + gpu_indexes[0]); + + lut_sequential_algorithm->broadcast_lut(streams, gpu_indexes, + gpu_indexes[0]); + free(h_seq_lut_indexes); + }; + void release(cudaStream_t const *streams, uint32_t const *gpu_indexes, + uint32_t gpu_count) { + cuda_drop_async(group_resolved_carries, streams[0], gpu_indexes[0]); + lut_sequential_algorithm->release(streams, gpu_indexes, gpu_count); + delete lut_sequential_algorithm; + }; +}; + +// For hillis steele algorithm in group propagation +template struct int_hs_group_prop_memory { + + int_radix_lut *lut_hillis_steele; + uint32_t grouping_size; + + int_hs_group_prop_memory(cudaStream_t const *streams, + uint32_t const *gpu_indexes, uint32_t gpu_count, + int_radix_params params, uint32_t num_groups, + uint32_t big_lwe_size_bytes, + bool allocate_gpu_memory) { + + auto glwe_dimension = params.glwe_dimension; + auto polynomial_size = params.polynomial_size; + auto message_modulus = params.message_modulus; + auto carry_modulus = params.carry_modulus; + + auto f_lut_hillis_steele = [](Torus msb, Torus lsb) -> Torus { + if (msb == 2) { + return 1; // Remap Generate to 1 + } else if (msb == 3) { + // MSB propagates + if (lsb == 2) { + return 1; + } else { + return lsb; + } // also remap here + } else { + return msb; + } + }; + + lut_hillis_steele = + new int_radix_lut(streams, gpu_indexes, gpu_count, params, 1, + num_groups, allocate_gpu_memory); + + auto hillis_steele_lut = lut_hillis_steele->get_lut(gpu_indexes[0], 0); + generate_device_accumulator_bivariate( + streams[0], gpu_indexes[0], hillis_steele_lut, glwe_dimension, + polynomial_size, message_modulus, carry_modulus, f_lut_hillis_steele); + + lut_hillis_steele->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]); + }; + void release(cudaStream_t const *streams, 
uint32_t const *gpu_indexes, + uint32_t gpu_count) { + + lut_hillis_steele->release(streams, gpu_indexes, gpu_count); + delete lut_hillis_steele; + } +}; + +// compute_shifted_blocks_and_block_states +template struct int_shifted_blocks_and_states_memory { + Torus *shifted_blocks_and_states; + Torus *shifted_blocks; + Torus *block_states; + + int_radix_lut *luts_array_first_step; + + int_shifted_blocks_and_states_memory( + cudaStream_t const *streams, uint32_t const *gpu_indexes, + uint32_t gpu_count, int_radix_params params, uint32_t num_radix_blocks, + uint32_t lut_count, uint32_t grouping_size, uint32_t requested_flag, + bool allocate_gpu_memory) { + + auto glwe_dimension = params.glwe_dimension; + auto polynomial_size = params.polynomial_size; + auto message_modulus = params.message_modulus; + auto carry_modulus = params.carry_modulus; + auto big_lwe_size = (polynomial_size * glwe_dimension + 1); + auto big_lwe_size_bytes = big_lwe_size * sizeof(Torus); + + shifted_blocks_and_states = (Torus *)cuda_malloc_async( + lut_count * num_radix_blocks * big_lwe_size_bytes, streams[0], + gpu_indexes[0]); + cuda_memset_async(shifted_blocks_and_states, 0, + lut_count * num_radix_blocks * big_lwe_size_bytes, + streams[0], gpu_indexes[0]); + shifted_blocks = (Torus *)cuda_malloc_async( + num_radix_blocks * big_lwe_size_bytes, streams[0], gpu_indexes[0]); + cuda_memset_async(shifted_blocks, 0, num_radix_blocks * big_lwe_size_bytes, + streams[0], gpu_indexes[0]); + block_states = (Torus *)cuda_malloc_async( + num_radix_blocks * big_lwe_size_bytes, streams[0], gpu_indexes[0]); + cuda_memset_async(block_states, 0, num_radix_blocks * big_lwe_size_bytes, + streams[0], gpu_indexes[0]); + + uint32_t num_luts_first_step = + requested_flag == 1 ? (2 * grouping_size + 2) : 2 * grouping_size + 1; + + luts_array_first_step = new int_radix_lut( + streams, gpu_indexes, gpu_count, params, num_luts_first_step, + num_radix_blocks, allocate_gpu_memory); + + auto f_shift_block = [message_modulus](Torus block) -> Torus { + return (block % message_modulus) << 1; + }; + + auto f_first_block_state = [message_modulus](Torus block) -> Torus { + if (block >= message_modulus) + return OUTPUT_CARRY::GENERATED; + else { + return OUTPUT_CARRY::NONE; + } + }; + std::vector> f_first_grouping_luts = { + f_first_block_state, f_shift_block}; + + auto first_block_lut = luts_array_first_step->get_lut(gpu_indexes[0], 0); + + generate_many_lut_device_accumulator( + streams[0], gpu_indexes[0], first_block_lut, glwe_dimension, + polynomial_size, message_modulus, carry_modulus, f_first_grouping_luts); + + // luts for other blocks of the first grouping + for (int lut_id = 1; lut_id < grouping_size; lut_id++) { + auto f_state = [message_modulus, lut_id](Torus block) -> Torus { + uint64_t r = 0; + if (block >= message_modulus) { + r = 2; // Generates Carry + } else if (block == (message_modulus - 1)) { + r = 1; // Propagates a carry + } else { + r = 0; // Does not generate carry + } + return r << (lut_id - 1); + }; + std::vector> f_grouping_luts = { + f_state, f_shift_block}; + auto lut = luts_array_first_step->get_lut(gpu_indexes[0], lut_id); + generate_many_lut_device_accumulator( + streams[0], gpu_indexes[0], lut, glwe_dimension, polynomial_size, + message_modulus, carry_modulus, f_grouping_luts); + } + + // luts for the rest of groupings (except for the last block) + for (int i = 0; i < grouping_size; i++) { + uint32_t lut_id = i + grouping_size; + auto f_state = [message_modulus, i](Torus block) -> Torus { + uint64_t r = 0; + if (block >= 
message_modulus) { + r = 2; // Generates Carry + } else if (block == (message_modulus - 1)) { + r = 1; // Propagates a carry + } else { + r = 0; // Does not generate a carry + } + return r << i; + }; + std::vector> f_grouping_luts = { + f_state, f_shift_block}; + + auto lut = luts_array_first_step->get_lut(gpu_indexes[0], lut_id); + + generate_many_lut_device_accumulator( + streams[0], gpu_indexes[0], lut, glwe_dimension, polynomial_size, + message_modulus, carry_modulus, f_grouping_luts); + } + + // For the last block we need to generate a new lut + auto f_last_block_state = [message_modulus](Torus block) -> Torus { + if (block >= message_modulus) + return 2 << 1; // Generates + else + return 0; // Nothing + }; + + uint32_t lut_id = + requested_flag == 1 + ? num_luts_first_step - 2 + : num_luts_first_step - 1; // The last lut of the first step + + auto last_block_lut = + luts_array_first_step->get_lut(gpu_indexes[0], lut_id); + + if (requested_flag == 1) { // If overflow + auto f_overflow_shift_block = [message_modulus](Torus block) -> Torus { + Torus lhs = block / message_modulus; + Torus rhs = block % message_modulus; + return ((lhs + rhs) % message_modulus) << 1; + }; + std::vector> f_last_grouping_luts = { + f_last_block_state, f_overflow_shift_block}; + generate_many_lut_device_accumulator( + streams[0], gpu_indexes[0], last_block_lut, glwe_dimension, + polynomial_size, message_modulus, carry_modulus, + f_last_grouping_luts); + + // We need an extra lut to handle the case with more than 1 radix block + // The first of the many luts will be discarded, so its content does not matter + lut_id = lut_id + 1; + auto extra_block_lut = + luts_array_first_step->get_lut(gpu_indexes[0], lut_id); + + std::vector> f_last_grouping_luts_overflow = { + f_overflow_shift_block, f_overflow_shift_block}; + generate_many_lut_device_accumulator( + streams[0], gpu_indexes[0], extra_block_lut, glwe_dimension, + polynomial_size, message_modulus, carry_modulus, + f_last_grouping_luts_overflow); + + } else { + std::vector> f_last_grouping_luts = { + f_last_block_state, f_shift_block}; + + generate_many_lut_device_accumulator( + streams[0], gpu_indexes[0], last_block_lut, glwe_dimension, + polynomial_size, message_modulus, carry_modulus, + f_last_grouping_luts); + } + // Generate the indexes to switch between luts within the pbs + Torus lut_indexes_size = num_radix_blocks * sizeof(Torus); + Torus *h_lut_indexes = (Torus *)malloc(lut_indexes_size); + + for (int index = 0; index < num_radix_blocks; index++) { + uint32_t grouping_index = index / grouping_size; + bool is_in_first_grouping = (grouping_index == 0); + uint32_t index_in_grouping = index % grouping_size; + bool is_last_index = (index == (num_radix_blocks - 1)); + if (is_last_index) { + if (num_radix_blocks == 1) { + h_lut_indexes[index] = 2 * grouping_size; + } else { + if (requested_flag == 1) { + h_lut_indexes[index] = 2 * grouping_size + 1; + } else { + h_lut_indexes[index] = 2; + } + } + } else if (is_in_first_grouping) { + h_lut_indexes[index] = index_in_grouping; + } else { + h_lut_indexes[index] = index_in_grouping + grouping_size; + } + } + + // copy the indexes to the gpu + Torus *lut_indexes = + luts_array_first_step->get_lut_indexes(gpu_indexes[0], 0); + cuda_memcpy_async_to_gpu(lut_indexes, h_lut_indexes, lut_indexes_size, + streams[0], gpu_indexes[0]); + // Do I need to do something else for the multi-gpu?
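+ // Lut index layout used above: blocks of the first grouping map to luts
+ // [0, grouping_size), blocks of later groupings map to [grouping_size,
+ // 2 * grouping_size), and the last radix block is special-cased as selected above.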
+ + luts_array_first_step->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]); + + free(h_lut_indexes); + }; + void release(cudaStream_t const *streams, uint32_t const *gpu_indexes, + uint32_t gpu_count) { + + cuda_drop_async(shifted_blocks_and_states, streams[0], gpu_indexes[0]); + cuda_drop_async(shifted_blocks, streams[0], gpu_indexes[0]); + cuda_drop_async(block_states, streams[0], gpu_indexes[0]); + + luts_array_first_step->release(streams, gpu_indexes, gpu_count); + delete luts_array_first_step; + }; +}; + +// compute_propagation simulator and group carries +template struct int_prop_simu_group_carries_memory { + Torus *scalar_array_cum_sum; + Torus *propagation_cum_sums; + Torus *simulators; + Torus *grouping_pgns; + Torus *prepared_blocks; + + Torus *resolved_carries; + + int_radix_lut *luts_array_second_step; + + int_seq_group_prop_memory *seq_group_prop_mem; + int_hs_group_prop_memory *hs_group_prop_mem; + + uint32_t group_size; + bool use_sequential_algorithm_to_resolver_group_carries; + + int_prop_simu_group_carries_memory( + cudaStream_t const *streams, uint32_t const *gpu_indexes, + uint32_t gpu_count, int_radix_params params, uint32_t num_radix_blocks, + uint32_t grouping_size, uint32_t num_groups, bool allocate_gpu_memory) { + + auto glwe_dimension = params.glwe_dimension; + auto polynomial_size = params.polynomial_size; + auto message_modulus = params.message_modulus; + auto carry_modulus = params.carry_modulus; + auto big_lwe_size = (polynomial_size * glwe_dimension + 1); + auto big_lwe_size_bytes = big_lwe_size * sizeof(Torus); + + uint32_t block_modulus = message_modulus * carry_modulus; + uint32_t num_bits_in_block = std::log2(block_modulus); + + group_size = grouping_size; + + scalar_array_cum_sum = (Torus *)cuda_malloc_async( + num_radix_blocks * sizeof(Torus), streams[0], gpu_indexes[0]); + cuda_memset_async(scalar_array_cum_sum, 0, num_radix_blocks * sizeof(Torus), + streams[0], gpu_indexes[0]); + propagation_cum_sums = (Torus *)cuda_malloc_async( + num_radix_blocks * big_lwe_size_bytes, streams[0], gpu_indexes[0]); + cuda_memset_async(propagation_cum_sums, 0, + num_radix_blocks * big_lwe_size_bytes, streams[0], + gpu_indexes[0]); + simulators = (Torus *)cuda_malloc_async( + num_radix_blocks * big_lwe_size_bytes, streams[0], gpu_indexes[0]); + cuda_memset_async(simulators, 0, num_radix_blocks * big_lwe_size_bytes, + streams[0], gpu_indexes[0]); + + grouping_pgns = (Torus *)cuda_malloc_async(num_groups * big_lwe_size_bytes, + streams[0], gpu_indexes[0]); + cuda_memset_async(grouping_pgns, 0, num_groups * big_lwe_size_bytes, + streams[0], gpu_indexes[0]); + + prepared_blocks = (Torus *)cuda_malloc_async( + num_radix_blocks * big_lwe_size_bytes, streams[0], gpu_indexes[0]); + cuda_memset_async(prepared_blocks, 0, num_radix_blocks * big_lwe_size_bytes, + streams[0], gpu_indexes[0]); + + resolved_carries = (Torus *)cuda_malloc_async( + (num_groups + 1) * big_lwe_size_bytes, streams[0], gpu_indexes[0]); + cuda_memset_async(resolved_carries, 0, + (num_groups + 1) * big_lwe_size_bytes, streams[0], + gpu_indexes[0]); + + // create lut objects for step 2 + Torus lut_indexes_size = num_radix_blocks * sizeof(Torus); + uint32_t num_carry_to_resolve = num_groups - 1; + uint32_t saturated_sub = + ((num_carry_to_resolve > 1) ? 
num_carry_to_resolve - 1 : 0); + uint32_t sequential_depth = saturated_sub / (grouping_size - 1); + uint32_t hillis_steel_depth; + + if (num_carry_to_resolve == 0) { + hillis_steel_depth = 0; + } else { + hillis_steel_depth = std::ceil(std::log2(num_carry_to_resolve)); + } + + use_sequential_algorithm_to_resolver_group_carries = + sequential_depth <= hillis_steel_depth; + uint32_t num_extra_luts = 0; + if (use_sequential_algorithm_to_resolver_group_carries) { + num_extra_luts = (grouping_size - 1); + } else { + num_extra_luts = 1; + } + + uint32_t num_luts_second_step = 2 * grouping_size + num_extra_luts; + luts_array_second_step = new int_radix_lut( + streams, gpu_indexes, gpu_count, params, num_luts_second_step, + num_radix_blocks, allocate_gpu_memory); + + // luts for first group inner propagation + for (int lut_id = 0; lut_id < grouping_size - 1; lut_id++) { + auto f_first_grouping_inner_propagation = + [lut_id](Torus propa_cum_sum_block) -> Torus { + uint64_t carry = (propa_cum_sum_block >> lut_id) & 1; + + if (carry != 0) { + return 2ull; // Generates Carry + } else { + return 0ull; // Does not generate carry + } + }; + + auto lut = luts_array_second_step->get_lut(gpu_indexes[0], lut_id); + generate_device_accumulator( + streams[0], gpu_indexes[0], lut, glwe_dimension, polynomial_size, + message_modulus, carry_modulus, f_first_grouping_inner_propagation); + } + + auto f_first_grouping_outer_propagation = + [num_bits_in_block](Torus block) -> Torus { + return (block >> (num_bits_in_block - 1)) & 1; + }; + + int lut_id = grouping_size - 1; + auto lut_first_group_outer = + luts_array_second_step->get_lut(gpu_indexes[0], lut_id); + generate_device_accumulator( + streams[0], gpu_indexes[0], lut_first_group_outer, glwe_dimension, + polynomial_size, message_modulus, carry_modulus, + f_first_grouping_outer_propagation); + + // for other groupings inner propagation + for (int index = 0; index < grouping_size; index++) { + uint32_t lut_id = index + grouping_size; + + auto f_other_groupings_inner_propagation = + [index](Torus propa_cum_sum_block) -> Torus { + uint64_t mask = (2 << index) - 1; + if (propa_cum_sum_block >= (2 << index)) { + return 2ull; // Generates + } else if ((propa_cum_sum_block & mask) == mask) { + return 1ull; // Propagate + } else { + return 0ull; // Nothing + } + }; + + auto lut = luts_array_second_step->get_lut(gpu_indexes[0], lut_id); + generate_device_accumulator( + streams[0], gpu_indexes[0], lut, glwe_dimension, polynomial_size, + message_modulus, carry_modulus, f_other_groupings_inner_propagation); + } + + if (use_sequential_algorithm_to_resolver_group_carries) { + for (int index = 0; index < grouping_size - 1; index++) { + uint32_t lut_id = index + 2 * grouping_size; + + auto f_group_propagation = [index, block_modulus, + num_bits_in_block](Torus block) -> Torus { + if (block == (block_modulus - 1)) { + return 0ull; + } else { + return ((UINT64_MAX << index) % (1ull << (num_bits_in_block + 1))); + } + }; + + auto lut = luts_array_second_step->get_lut(gpu_indexes[0], lut_id); + generate_device_accumulator( + streams[0], gpu_indexes[0], lut, glwe_dimension, polynomial_size, + message_modulus, carry_modulus, f_group_propagation); + } + } else { + uint32_t lut_id = 2 * grouping_size; + auto f_group_propagation = [block_modulus](Torus block) { + if (block == (block_modulus - 1)) { + return 2ull; + } else { + return UINT64_MAX % (block_modulus * 2ull); + } + }; + + auto lut = luts_array_second_step->get_lut(gpu_indexes[0], lut_id); + generate_device_accumulator( + 
streams[0], gpu_indexes[0], lut, glwe_dimension, polynomial_size, + message_modulus, carry_modulus, f_group_propagation); + } + + Torus *h_second_lut_indexes = (Torus *)malloc(lut_indexes_size); + + Torus *h_scalar_array_cum_sum = + (Torus *)malloc(num_radix_blocks * sizeof(Torus)); + + for (int index = 0; index < num_radix_blocks; index++) { + uint32_t grouping_index = index / grouping_size; + bool is_in_first_grouping = (grouping_index == 0); + uint32_t index_in_grouping = index % grouping_size; + + if (is_in_first_grouping) { + h_second_lut_indexes[index] = index_in_grouping; + } else if (index_in_grouping == (grouping_size - 1)) { + if (use_sequential_algorithm_to_resolver_group_carries) { + int inner_index = (grouping_index - 1) % (grouping_size - 1); + h_second_lut_indexes[index] = inner_index + 2 * grouping_size; + } else { + h_second_lut_indexes[index] = 2 * grouping_size; + } + } else { + h_second_lut_indexes[index] = index_in_grouping + grouping_size; + } + + bool may_have_its_padding_bit_set = + !is_in_first_grouping && (index_in_grouping == grouping_size - 1); + + if (may_have_its_padding_bit_set) { + if (use_sequential_algorithm_to_resolver_group_carries) { + h_scalar_array_cum_sum[index] = + 1 << ((grouping_index - 1) % (grouping_size - 1)); + } else { + h_scalar_array_cum_sum[index] = 1; + } + } else { + h_scalar_array_cum_sum[index] = 0; + } + } + + // copy the indexes to the gpu + Torus *second_lut_indexes = + luts_array_second_step->get_lut_indexes(gpu_indexes[0], 0); + cuda_memcpy_async_to_gpu(second_lut_indexes, h_second_lut_indexes, + lut_indexes_size, streams[0], gpu_indexes[0]); + + cuda_memcpy_async_to_gpu(scalar_array_cum_sum, h_scalar_array_cum_sum, + num_radix_blocks * sizeof(Torus), streams[0], + gpu_indexes[0]); + luts_array_second_step->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]); + + if (use_sequential_algorithm_to_resolver_group_carries) { + + seq_group_prop_mem = new int_seq_group_prop_memory( + streams, gpu_indexes, gpu_count, params, grouping_size, + big_lwe_size_bytes, true); + + } else { + hs_group_prop_mem = new int_hs_group_prop_memory( + streams, gpu_indexes, gpu_count, params, num_groups, + big_lwe_size_bytes, true); + } + + free(h_scalar_array_cum_sum); + free(h_second_lut_indexes); + }; + + // needed for the divison to update the lut indexes + void update_lut_indexes(cudaStream_t const *streams, + uint32_t const *gpu_indexes, Torus *new_lut_indexes, + Torus *new_scalars, uint32_t new_num_blocks) { + Torus *lut_indexes = + luts_array_second_step->get_lut_indexes(gpu_indexes[0], 0); + cuda_memcpy_async_gpu_to_gpu(lut_indexes, new_lut_indexes, + new_num_blocks * sizeof(Torus), streams[0], + gpu_indexes[0]); + + luts_array_second_step->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]); + + cuda_memcpy_async_gpu_to_gpu(scalar_array_cum_sum, new_scalars, + new_num_blocks * sizeof(Torus), streams[0], + gpu_indexes[0]); + } + + void release(cudaStream_t const *streams, uint32_t const *gpu_indexes, + uint32_t gpu_count) { + cuda_drop_async(scalar_array_cum_sum, streams[0], gpu_indexes[0]); + cuda_drop_async(propagation_cum_sums, streams[0], gpu_indexes[0]); + cuda_drop_async(simulators, streams[0], gpu_indexes[0]); + cuda_drop_async(grouping_pgns, streams[0], gpu_indexes[0]); + cuda_drop_async(prepared_blocks, streams[0], gpu_indexes[0]); + cuda_drop_async(resolved_carries, streams[0], gpu_indexes[0]); + + luts_array_second_step->release(streams, gpu_indexes, gpu_count); + + if (use_sequential_algorithm_to_resolver_group_carries) { + 
seq_group_prop_mem->release(streams, gpu_indexes, gpu_count); + } else { + hs_group_prop_mem->release(streams, gpu_indexes, gpu_count); + } + + delete luts_array_second_step; + }; +}; + +template struct int_fast_sc_prop_memory { + uint32_t lut_count; + uint32_t lut_stride; + + uint32_t group_size; + uint32_t num_groups; + Torus *output_flag; + + int_radix_lut *lut_message_extract; + + int_radix_lut *lut_overflow_flag_prep; + int_radix_lut *lut_overflow_flag_last; + int_radix_lut *lut_carry_flag_last; + + int_shifted_blocks_and_states_memory *shifted_blocks_state_mem; + int_prop_simu_group_carries_memory *prop_simu_group_carries_mem; + + int_radix_params params; + bool use_sequential_algorithm_to_resolver_group_carries; + uint32_t requested_flag; + + uint32_t active_gpu_count; + cudaStream_t *sub_streams_1; + cudaStream_t *sub_streams_2; + + int_fast_sc_prop_memory(cudaStream_t const *streams, + uint32_t const *gpu_indexes, uint32_t gpu_count, + int_radix_params params, uint32_t num_radix_blocks, + uint32_t requested_flag_in, uint32_t uses_carry, + bool allocate_gpu_memory) { + this->params = params; + auto glwe_dimension = params.glwe_dimension; + auto polynomial_size = params.polynomial_size; + auto message_modulus = params.message_modulus; + auto carry_modulus = params.carry_modulus; + auto big_lwe_size = (polynomial_size * glwe_dimension + 1); + auto big_lwe_size_bytes = big_lwe_size * sizeof(Torus); + requested_flag = requested_flag_in; + // for compute shifted blocks and block states + uint32_t block_modulus = message_modulus * carry_modulus; + uint32_t num_bits_in_block = std::log2(block_modulus); + uint32_t grouping_size = num_bits_in_block; + group_size = grouping_size; + num_groups = (num_radix_blocks + grouping_size - 1) / grouping_size; + + lut_count = 2; // number of luts applied by each many-lut PBS + uint32_t box_size = polynomial_size / block_modulus; + lut_stride = (block_modulus / lut_count) * box_size; + + shifted_blocks_state_mem = new int_shifted_blocks_and_states_memory( + streams, gpu_indexes, gpu_count, params, num_radix_blocks, lut_count, + grouping_size, requested_flag, true); + + prop_simu_group_carries_mem = new int_prop_simu_group_carries_memory( + streams, gpu_indexes, gpu_count, params, num_radix_blocks, + grouping_size, num_groups, true); + + // Step 3 elements + lut_message_extract = + new int_radix_lut(streams, gpu_indexes, gpu_count, params, 1, + num_radix_blocks, allocate_gpu_memory); + // lut to extract the message from the shifted blocks + auto f_message_extract = [message_modulus](Torus block) -> Torus { + return (block >> 1) % message_modulus; + }; + + auto extract_lut = lut_message_extract->get_lut(gpu_indexes[0], 0); + + generate_device_accumulator( + streams[0], gpu_indexes[0], extract_lut, glwe_dimension, + polynomial_size, message_modulus, carry_modulus, f_message_extract); + + lut_message_extract->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]); + + // This stores a single block that will be used to hold the overflow or + // carry result + output_flag = (Torus *)cuda_malloc_async(big_lwe_size_bytes, streams[0], + gpu_indexes[0]); + cuda_memset_async(output_flag, 0, big_lwe_size_bytes, streams[0], + gpu_indexes[0]); + if (requested_flag == 1) { + // For step 1, overflow detection is enabled only if the overflow flag is requested + uint32_t num_bits_in_message = std::log2(message_modulus); + lut_overflow_flag_prep = new int_radix_lut( + streams, gpu_indexes, gpu_count, params, 1, 1, allocate_gpu_memory); + + auto f_overflow_fp = [num_bits_in_message, + message_modulus](Torus packed_lhsrhs)
-> Torus { + Torus lhs = packed_lhsrhs / message_modulus; + Torus rhs = packed_lhsrhs % message_modulus; + + Torus mask = (1 << (num_bits_in_message - 1)) - 1; + Torus lhs_except_last_bit = lhs & mask; + Torus rhs_except_last_bit = rhs & mask; + Torus input_carry1 = 1; + Torus input_carry2 = 0; + + Torus output_carry1 = + ((lhs + rhs + input_carry1) >> num_bits_in_message) & 1; + Torus output_carry2 = + ((lhs + rhs + input_carry2) >> num_bits_in_message) & 1; + Torus input_carry_last_bit1 = + ((lhs_except_last_bit + rhs_except_last_bit + input_carry1) >> + (num_bits_in_message - 1)) & + 1; + Torus input_carry_last_bit2 = + ((lhs_except_last_bit + rhs_except_last_bit + input_carry2) >> + (num_bits_in_message - 1)) & + 1; + + Torus output1 = (Torus)(input_carry_last_bit1 != output_carry1); + Torus output2 = (Torus)(input_carry_last_bit2 != output_carry2); + + return output1 << 3 | output2 << 2; + }; + + auto overflow_flag_prep_lut = + lut_overflow_flag_prep->get_lut(gpu_indexes[0], 0); + + generate_device_accumulator( + streams[0], gpu_indexes[0], overflow_flag_prep_lut, glwe_dimension, + polynomial_size, message_modulus, carry_modulus, f_overflow_fp); + + lut_overflow_flag_prep->broadcast_lut(streams, gpu_indexes, + gpu_indexes[0]); + } + + // For the final cleanup in case of overflow or carry. + // This lut could probably be applied together with the previous one, but for + // now we keep them separate + if (requested_flag == 1) { // Overflow case + lut_overflow_flag_last = new int_radix_lut( + streams, gpu_indexes, gpu_count, params, 1, 1, allocate_gpu_memory); + + auto f_overflow_last = [](Torus block) -> Torus { + Torus input_carry = (block >> 1) & 1; + Torus does_overflow_if_carry_is_1 = (block >> 3) & 1; + Torus does_overflow_if_carry_is_0 = (block >> 2) & 1; + if (input_carry == 1) { + return does_overflow_if_carry_is_1; + } else { + return does_overflow_if_carry_is_0; + } + }; + auto overflow_flag_last = + lut_overflow_flag_last->get_lut(gpu_indexes[0], 0); + + generate_device_accumulator( + streams[0], gpu_indexes[0], overflow_flag_last, glwe_dimension, + polynomial_size, message_modulus, carry_modulus, f_overflow_last); + + lut_overflow_flag_last->broadcast_lut(streams, gpu_indexes, + gpu_indexes[0]); + } + if (requested_flag == 2) { // Carry case + lut_carry_flag_last = new int_radix_lut( + streams, gpu_indexes, gpu_count, params, 1, 1, allocate_gpu_memory); + + auto f_carry_last = [](Torus block) -> Torus { + return ((block >> 2) & 1); + }; + auto carry_flag_last = lut_carry_flag_last->get_lut(gpu_indexes[0], 0); + + generate_device_accumulator( + streams[0], gpu_indexes[0], carry_flag_last, glwe_dimension, + polynomial_size, message_modulus, carry_modulus, f_carry_last); + + lut_carry_flag_last->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]); + } + + active_gpu_count = get_active_gpu_count(2 * num_radix_blocks, gpu_count); + sub_streams_1 = + (cudaStream_t *)malloc(active_gpu_count * sizeof(cudaStream_t)); + sub_streams_2 = + (cudaStream_t *)malloc(active_gpu_count * sizeof(cudaStream_t)); + for (uint j = 0; j < active_gpu_count; j++) { + sub_streams_1[j] = cuda_create_stream(gpu_indexes[j]); + sub_streams_2[j] = cuda_create_stream(gpu_indexes[j]); + } + }; + + void release(cudaStream_t const *streams, uint32_t const *gpu_indexes, + uint32_t gpu_count) { + + shifted_blocks_state_mem->release(streams, gpu_indexes, gpu_count); + prop_simu_group_carries_mem->release(streams, gpu_indexes, gpu_count); + cuda_drop_async(output_flag, streams[0], gpu_indexes[0]); +
lut_message_extract->release(streams, gpu_indexes, gpu_count); + delete lut_message_extract; + + if (requested_flag == 1) { // In case of overflow + lut_overflow_flag_prep->release(streams, gpu_indexes, gpu_count); + lut_overflow_flag_last->release(streams, gpu_indexes, gpu_count); + delete lut_overflow_flag_prep; + delete lut_overflow_flag_last; + } + if (requested_flag == 2) { // In case of carry + lut_carry_flag_last->release(streams, gpu_indexes, gpu_count); + delete lut_carry_flag_last; + } + + // release sub streams + for (uint i = 0; i < active_gpu_count; i++) { + cuda_destroy_stream(sub_streams_1[i], gpu_indexes[i]); + cuda_destroy_stream(sub_streams_2[i], gpu_indexes[i]); + } + free(sub_streams_1); + free(sub_streams_2); + }; +}; + +template struct int_shifted_blocks_and_borrow_states_memory { + Torus *shifted_blocks_and_borrow_states; + Torus *shifted_blocks; + Torus *borrow_states; + + int_radix_lut *luts_array_first_step; + + int_shifted_blocks_and_borrow_states_memory( + cudaStream_t const *streams, uint32_t const *gpu_indexes, + uint32_t gpu_count, int_radix_params params, uint32_t num_radix_blocks, + uint32_t lut_count, uint32_t grouping_size, bool allocate_gpu_memory) { + + auto glwe_dimension = params.glwe_dimension; + auto polynomial_size = params.polynomial_size; + auto message_modulus = params.message_modulus; + auto carry_modulus = params.carry_modulus; + auto big_lwe_size = (polynomial_size * glwe_dimension + 1); + auto big_lwe_size_bytes = big_lwe_size * sizeof(Torus); + + shifted_blocks_and_borrow_states = (Torus *)cuda_malloc_async( + lut_count * num_radix_blocks * big_lwe_size_bytes, streams[0], + gpu_indexes[0]); + cuda_memset_async(shifted_blocks_and_borrow_states, 0, + lut_count * num_radix_blocks * big_lwe_size_bytes, + streams[0], gpu_indexes[0]); + shifted_blocks = (Torus *)cuda_malloc_async( + num_radix_blocks * big_lwe_size_bytes, streams[0], gpu_indexes[0]); + cuda_memset_async(shifted_blocks, 0, num_radix_blocks * big_lwe_size_bytes, + streams[0], gpu_indexes[0]); + borrow_states = (Torus *)cuda_malloc_async( + num_radix_blocks * big_lwe_size_bytes, streams[0], gpu_indexes[0]); + cuda_memset_async(borrow_states, 0, num_radix_blocks * big_lwe_size_bytes, + streams[0], gpu_indexes[0]); + + uint32_t num_luts_first_step = 2 * grouping_size + 1; + + luts_array_first_step = new int_radix_lut( + streams, gpu_indexes, gpu_count, params, num_luts_first_step, + num_radix_blocks, allocate_gpu_memory); + + auto f_shift_block = [message_modulus](Torus block) -> Torus { + uint64_t overflow_guard = message_modulus; + uint64_t block_mod = block % message_modulus; + return (overflow_guard | block_mod) << 1; + }; + + auto f_first_block_state = [message_modulus](Torus block) -> Torus { + if (block < message_modulus) + return 1; // Borrows + else { + return 0; // Nothing + } + }; + std::vector> f_first_grouping_luts = { + f_first_block_state, f_shift_block}; + + auto first_block_lut = luts_array_first_step->get_lut(gpu_indexes[0], 0); + + generate_many_lut_device_accumulator( + streams[0], gpu_indexes[0], first_block_lut, glwe_dimension, + polynomial_size, message_modulus, carry_modulus, f_first_grouping_luts); + + // luts for other blocks of the first grouping + for (int lut_id = 1; lut_id < grouping_size; lut_id++) { + auto f_state = [message_modulus, lut_id](Torus block) -> Torus { + uint64_t r = 0; + if (block < message_modulus) { + r = 2; // Borrows + } else if (block == message_modulus) { + r = 1; // Propagates a borrow + } else { + r = 0; // Does not borrow + } + 
return r << (lut_id - 1); + }; + std::vector> f_grouping_luts = { + f_state, f_shift_block}; + auto lut = luts_array_first_step->get_lut(gpu_indexes[0], lut_id); + generate_many_lut_device_accumulator( + streams[0], gpu_indexes[0], lut, glwe_dimension, polynomial_size, + message_modulus, carry_modulus, f_grouping_luts); + } + + // luts for the rest of groupings (except for the last block) + for (int i = 0; i < grouping_size; i++) { + uint32_t lut_id = i + grouping_size; + auto f_state = [message_modulus, i](Torus block) -> Torus { + uint64_t r = 0; + if (block < message_modulus) { + r = 2; // Generates borrow + } else if (block == message_modulus) { + r = 1; // Propagates a borrow + } else { + r = 0; // Does not borrow + } + return r << i; + }; + std::vector> f_grouping_luts = { + f_state, f_shift_block}; + + auto lut = luts_array_first_step->get_lut(gpu_indexes[0], lut_id); + + generate_many_lut_device_accumulator( + streams[0], gpu_indexes[0], lut, glwe_dimension, polynomial_size, + message_modulus, carry_modulus, f_grouping_luts); + } + + auto f_last_block_state = [message_modulus](Torus block) -> Torus { + if (block < message_modulus) + return 2 << 1; // Generates a borrow + else + return 0; // Nothing + }; + + uint32_t lut_id = num_luts_first_step - 1; // The last lut of the first step + + auto last_block_lut = + luts_array_first_step->get_lut(gpu_indexes[0], lut_id); + + std::vector> f_last_grouping_luts = { + f_last_block_state, f_shift_block}; + + generate_many_lut_device_accumulator( + streams[0], gpu_indexes[0], last_block_lut, glwe_dimension, + polynomial_size, message_modulus, carry_modulus, f_last_grouping_luts); + + // Generate the indexes to switch between luts within the pbs + Torus lut_indexes_size = num_radix_blocks * sizeof(Torus); + Torus *h_lut_indexes = (Torus *)malloc(lut_indexes_size); + + for (int index = 0; index < num_radix_blocks; index++) { + uint32_t grouping_index = index / grouping_size; + bool is_in_first_grouping = (grouping_index == 0); + uint32_t index_in_grouping = index % grouping_size; + bool is_last_index = (index == (num_radix_blocks - 1)); + if (is_last_index) { + if (num_radix_blocks == 1) { + h_lut_indexes[index] = 2 * grouping_size; + } else { + h_lut_indexes[index] = 2; + } + } else if (is_in_first_grouping) { + h_lut_indexes[index] = index_in_grouping; + } else { + h_lut_indexes[index] = index_in_grouping + grouping_size; + } + } + // copy the indexes to the gpu + Torus *lut_indexes = + luts_array_first_step->get_lut_indexes(gpu_indexes[0], 0); + cuda_memcpy_async_to_gpu(lut_indexes, h_lut_indexes, lut_indexes_size, + streams[0], gpu_indexes[0]); + // Do I need to do something else for the multi-gpu? 
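+ // When used inside the division, these indexes are later overwritten via
+ // update_lut_indexes with the per-block-count tables prepared by
+ // create_indexes_for_division.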
+ + luts_array_first_step->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]); + + free(h_lut_indexes); + }; + + // needed for the divison to update the lut indexes + void update_lut_indexes(cudaStream_t const *streams, + uint32_t const *gpu_indexes, Torus *new_lut_indexes, + uint32_t new_num_blocks) { + Torus *lut_indexes = + luts_array_first_step->get_lut_indexes(gpu_indexes[0], 0); + cuda_memcpy_async_gpu_to_gpu(lut_indexes, new_lut_indexes, + new_num_blocks * sizeof(Torus), streams[0], + gpu_indexes[0]); + luts_array_first_step->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]); + } + void release(cudaStream_t const *streams, uint32_t const *gpu_indexes, + uint32_t gpu_count) { + + cuda_drop_async(shifted_blocks_and_borrow_states, streams[0], + gpu_indexes[0]); + cuda_drop_async(shifted_blocks, streams[0], gpu_indexes[0]); + cuda_drop_async(borrow_states, streams[0], gpu_indexes[0]); + + luts_array_first_step->release(streams, gpu_indexes, gpu_count); + delete luts_array_first_step; + }; +}; + +template struct int_fast_borrow_prop_memory { + uint32_t lut_count; + uint32_t lut_stride; + + uint32_t group_size; + uint32_t num_groups; + Torus *overflow_block; + + int_radix_lut *lut_message_extract; + int_radix_lut *lut_borrow_flag; + + int_shifted_blocks_and_borrow_states_memory + *shifted_blocks_borrow_state_mem; + int_prop_simu_group_carries_memory *prop_simu_group_carries_mem; + + int_radix_params params; + + Torus **first_indexes_for_div; + Torus **second_indexes_for_div; + Torus **scalars_for_div; + uint32_t active_gpu_count; + cudaStream_t *sub_streams_1; + cudaStream_t *sub_streams_2; + + cudaEvent_t *incoming_events; + cudaEvent_t *outgoing_events1; + cudaEvent_t *outgoing_events2; + + bool is_in_div = false; + uint32_t max_indexes_to_erase = 0; + + uint32_t compute_overflow; + int_fast_borrow_prop_memory(cudaStream_t const *streams, + uint32_t const *gpu_indexes, uint32_t gpu_count, + int_radix_params params, + uint32_t num_radix_blocks, + uint32_t compute_overflow_in, + bool allocate_gpu_memory) { + this->params = params; + auto glwe_dimension = params.glwe_dimension; + auto polynomial_size = params.polynomial_size; + auto message_modulus = params.message_modulus; + auto carry_modulus = params.carry_modulus; + auto big_lwe_size = (polynomial_size * glwe_dimension + 1); + auto big_lwe_size_bytes = big_lwe_size * sizeof(Torus); + compute_overflow = compute_overflow_in; + // for compute shifted blocks and block states + uint32_t block_modulus = message_modulus * carry_modulus; + uint32_t num_bits_in_block = std::log2(block_modulus); + uint32_t grouping_size = num_bits_in_block; + group_size = grouping_size; + num_groups = (num_radix_blocks + grouping_size - 1) / grouping_size; + + lut_count = 2; // many luts apply 2 luts + uint32_t box_size = polynomial_size / block_modulus; + lut_stride = (block_modulus / lut_count) * box_size; + + shifted_blocks_borrow_state_mem = + new int_shifted_blocks_and_borrow_states_memory( + streams, gpu_indexes, gpu_count, params, num_radix_blocks, + lut_count, grouping_size, true); + + prop_simu_group_carries_mem = new int_prop_simu_group_carries_memory( + streams, gpu_indexes, gpu_count, params, num_radix_blocks, + grouping_size, num_groups, true); + + overflow_block = (Torus *)cuda_malloc_async(big_lwe_size_bytes, streams[0], + gpu_indexes[0]); + cuda_memset_async(overflow_block, 0, big_lwe_size_bytes, streams[0], + gpu_indexes[0]); + + lut_message_extract = + new int_radix_lut(streams, gpu_indexes, gpu_count, params, 1, + num_radix_blocks, 
allocate_gpu_memory); + // lut for the first block in the first grouping + auto f_message_extract = [message_modulus](Torus block) -> Torus { + return (block >> 1) % message_modulus; + }; + + auto extract_lut = lut_message_extract->get_lut(gpu_indexes[0], 0); + + generate_device_accumulator( + streams[0], gpu_indexes[0], extract_lut, glwe_dimension, + polynomial_size, message_modulus, carry_modulus, f_message_extract); + + lut_message_extract->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]); + + if (compute_overflow) { + lut_borrow_flag = + new int_radix_lut(streams, gpu_indexes, gpu_count, params, 1, + num_radix_blocks, allocate_gpu_memory); + // lut for the first block in the first grouping + auto f_borrow_flag = [](Torus block) -> Torus { + return ((block >> 2) & 1); + }; + + auto borrow_flag_lut = lut_borrow_flag->get_lut(gpu_indexes[0], 0); + + generate_device_accumulator( + streams[0], gpu_indexes[0], borrow_flag_lut, glwe_dimension, + polynomial_size, message_modulus, carry_modulus, f_borrow_flag); + + lut_borrow_flag->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]); + } + + active_gpu_count = get_active_gpu_count(2 * num_radix_blocks, gpu_count); + sub_streams_1 = + (cudaStream_t *)malloc(active_gpu_count * sizeof(cudaStream_t)); + sub_streams_2 = + (cudaStream_t *)malloc(active_gpu_count * sizeof(cudaStream_t)); + for (uint j = 0; j < active_gpu_count; j++) { + sub_streams_1[j] = cuda_create_stream(gpu_indexes[j]); + sub_streams_2[j] = cuda_create_stream(gpu_indexes[j]); + } + + incoming_events = + (cudaEvent_t *)malloc(active_gpu_count * sizeof(cudaEvent_t)); + outgoing_events1 = + (cudaEvent_t *)malloc(active_gpu_count * sizeof(cudaEvent_t)); + outgoing_events2 = + (cudaEvent_t *)malloc(active_gpu_count * sizeof(cudaEvent_t)); + for (uint j = 0; j < active_gpu_count; j++) { + cudaEventCreate(&incoming_events[j]); + cudaEventCreate(&outgoing_events1[j]); + cudaEventCreate(&outgoing_events2[j]); + } + }; + + void create_indexes_for_division(cudaStream_t const *streams, + uint32_t const *gpu_indexes, + uint32_t num_blocks) { + + is_in_div = true; + max_indexes_to_erase = num_blocks; + bool use_seq = prop_simu_group_carries_mem + ->use_sequential_algorithm_to_resolver_group_carries; + first_indexes_for_div = (Torus **)malloc(num_blocks * sizeof(Torus *)); + second_indexes_for_div = (Torus **)malloc(num_blocks * sizeof(Torus *)); + scalars_for_div = (Torus **)malloc(num_blocks * sizeof(Torus *)); + + Torus *h_lut_indexes = (Torus *)malloc(num_blocks * sizeof(Torus)); + Torus *h_scalar = (Torus *)malloc(num_blocks * sizeof(Torus)); + + // Extra indexes for the luts in first step + for (int nb = 1; nb <= num_blocks; nb++) { + cudaMalloc((void **)&first_indexes_for_div[nb - 1], nb * sizeof(Torus)); + for (int index = 0; index < nb; index++) { + uint32_t grouping_index = index / group_size; + bool is_in_first_grouping = (grouping_index == 0); + uint32_t index_in_grouping = index % group_size; + bool is_last_index = (index == (nb - 1)); + if (is_last_index) { + if (nb == 1) { + h_lut_indexes[index] = 2 * group_size; + } else { + h_lut_indexes[index] = 2; + } + } else if (is_in_first_grouping) { + h_lut_indexes[index] = index_in_grouping; + } else { + h_lut_indexes[index] = index_in_grouping + group_size; + } + } + cuda_memcpy_async_to_gpu(first_indexes_for_div[nb - 1], h_lut_indexes, + nb * sizeof(Torus), streams[0], gpu_indexes[0]); + } + // Extra indexes for the luts in second step + for (int nb = 1; nb <= num_blocks; nb++) { + cudaMalloc((void **)&second_indexes_for_div[nb - 
1], nb * sizeof(Torus)); + cudaMalloc((void **)&scalars_for_div[nb - 1], nb * sizeof(Torus)); + + for (int index = 0; index < nb; index++) { + uint32_t grouping_index = index / group_size; + bool is_in_first_grouping = (grouping_index == 0); + uint32_t index_in_grouping = index % group_size; + + if (is_in_first_grouping) { + h_lut_indexes[index] = index_in_grouping; + } else if (index_in_grouping == (group_size - 1)) { + if (use_seq) { + int inner_index = (grouping_index - 1) % (group_size - 1); + h_lut_indexes[index] = inner_index + 2 * group_size; + } else { + h_lut_indexes[index] = 2 * group_size; + } + } else { + h_lut_indexes[index] = index_in_grouping + group_size; + } + + bool may_have_its_padding_bit_set = + !is_in_first_grouping && (index_in_grouping == group_size - 1); + + if (may_have_its_padding_bit_set) { + if (use_seq) { + h_scalar[index] = 1 << ((grouping_index - 1) % (group_size - 1)); + } else { + h_scalar[index] = 1; + } + } else { + h_scalar[index] = 0; + } + } + cuda_memcpy_async_to_gpu(second_indexes_for_div[nb - 1], h_lut_indexes, + nb * sizeof(Torus), streams[0], gpu_indexes[0]); + cuda_memcpy_async_to_gpu(scalars_for_div[nb - 1], h_scalar, + nb * sizeof(Torus), streams[0], gpu_indexes[0]); + } + free(h_lut_indexes); + free(h_scalar); + }; + + // needed for the divison to update the lut indexes + void update_lut_indexes(cudaStream_t const *streams, + uint32_t const *gpu_indexes, + uint32_t new_num_blocks) { + assert(max_indexes_to_erase >= new_num_blocks); + shifted_blocks_borrow_state_mem->update_lut_indexes( + streams, gpu_indexes, first_indexes_for_div[new_num_blocks - 1], + new_num_blocks); + prop_simu_group_carries_mem->update_lut_indexes( + streams, gpu_indexes, second_indexes_for_div[new_num_blocks - 1], + scalars_for_div[new_num_blocks - 1], new_num_blocks); + } + void release(cudaStream_t const *streams, uint32_t const *gpu_indexes, + uint32_t gpu_count) { + + shifted_blocks_borrow_state_mem->release(streams, gpu_indexes, gpu_count); + prop_simu_group_carries_mem->release(streams, gpu_indexes, gpu_count); + cuda_drop_async(overflow_block, streams[0], gpu_indexes[0]); + + lut_message_extract->release(streams, gpu_indexes, gpu_count); + delete lut_message_extract; + if (compute_overflow) { + lut_borrow_flag->release(streams, gpu_indexes, gpu_count); + delete lut_borrow_flag; + } + + if (is_in_div) { + for (int i = 0; i < max_indexes_to_erase; i++) { + cuda_drop_async(first_indexes_for_div[i], streams[0], gpu_indexes[0]); + cuda_drop_async(second_indexes_for_div[i], streams[0], gpu_indexes[0]); + cuda_drop_async(scalars_for_div[i], streams[0], gpu_indexes[0]); + } + free(first_indexes_for_div); + free(second_indexes_for_div); + free(scalars_for_div); + } + // release sub streams + for (uint i = 0; i < active_gpu_count; i++) { + cuda_destroy_stream(sub_streams_1[i], gpu_indexes[i]); + cuda_destroy_stream(sub_streams_2[i], gpu_indexes[i]); + } + free(sub_streams_1); + free(sub_streams_2); + + // release events + for (uint j = 0; j < active_gpu_count; j++) { + cudaEventDestroy(incoming_events[j]); + cudaEventDestroy(outgoing_events1[j]); + cudaEventDestroy(outgoing_events2[j]); + } + free(incoming_events); + free(outgoing_events1); + free(outgoing_events2); + }; +}; template struct int_mul_memory { Torus *vector_result_sb; @@ -1124,6 +2391,7 @@ template struct int_mul_memory { int_radix_lut *luts_array; // lsb msb int_sum_ciphertexts_vec_memory *sum_ciphertexts_mem; + int_fast_sc_prop_memory *fast_sc_prop_mem; int_radix_params params; @@ -1199,6 +2467,11 @@ 
template struct int_mul_memory { streams, gpu_indexes, gpu_count, params, num_radix_blocks, 2 * num_radix_blocks, block_mul_res, vector_result_sb, small_lwe_vector); + uint32_t uses_carry = 0; + uint32_t requested_flag = 0; + fast_sc_prop_mem = new int_fast_sc_prop_memory( + streams, gpu_indexes, gpu_count, params, num_radix_blocks, + requested_flag, uses_carry, allocate_gpu_memory); } void release(cudaStream_t const *streams, uint32_t const *gpu_indexes, @@ -1209,9 +2482,11 @@ template struct int_mul_memory { luts_array->release(streams, gpu_indexes, gpu_count); sum_ciphertexts_mem->release(streams, gpu_indexes, gpu_count); + fast_sc_prop_mem->release(streams, gpu_indexes, gpu_count); delete luts_array; delete sum_ciphertexts_mem; + delete fast_sc_prop_mem; } }; @@ -2271,7 +3546,7 @@ template struct int_div_rem_memory { // memory objects for other operations int_logical_scalar_shift_buffer *shift_mem_1; int_logical_scalar_shift_buffer *shift_mem_2; - int_overflowing_sub_memory *overflow_sub_mem; + int_fast_borrow_prop_memory *overflow_sub_mem; int_comparison_buffer *comparison_buffer; // lookup tables @@ -2515,8 +3790,12 @@ template struct int_div_rem_memory { streams, gpu_indexes, gpu_count, SHIFT_OR_ROTATE_TYPE::LEFT_SHIFT, params, 2 * num_blocks, true); - overflow_sub_mem = new int_overflowing_sub_memory( - streams, gpu_indexes, gpu_count, params, num_blocks, true); + uint32_t compute_overflow = 1; + overflow_sub_mem = new int_fast_borrow_prop_memory( + streams, gpu_indexes, gpu_count, params, num_blocks, compute_overflow, + true); + overflow_sub_mem->create_indexes_for_division(streams, gpu_indexes, + num_blocks); comparison_buffer = new int_comparison_buffer( streams, gpu_indexes, gpu_count, COMPARISON_TYPE::NE, params, @@ -2963,6 +4242,7 @@ template struct int_scalar_mul_buffer { int_sum_ciphertexts_vec_memory *sum_ciphertexts_vec_mem; Torus *preshifted_buffer; Torus *all_shifted_buffer; + int_fast_sc_prop_memory *fast_sc_prop_mem; int_scalar_mul_buffer(cudaStream_t const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count, @@ -3001,13 +4281,20 @@ template struct int_scalar_mul_buffer { sum_ciphertexts_vec_mem = new int_sum_ciphertexts_vec_memory( streams, gpu_indexes, gpu_count, params, num_radix_blocks, num_ciphertext_bits, allocate_gpu_memory); + uint32_t uses_carry = 0; + uint32_t requested_flag = 0; + fast_sc_prop_mem = new int_fast_sc_prop_memory( + streams, gpu_indexes, gpu_count, params, num_radix_blocks, + requested_flag, uses_carry, allocate_gpu_memory); } } void release(cudaStream_t const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count) { sum_ciphertexts_vec_mem->release(streams, gpu_indexes, gpu_count); + fast_sc_prop_mem->release(streams, gpu_indexes, gpu_count); delete sum_ciphertexts_vec_mem; + delete fast_sc_prop_mem; cuda_drop_async(all_shifted_buffer, streams[0], gpu_indexes[0]); } }; diff --git a/backends/tfhe-cuda-backend/cuda/include/linear_algebra.h b/backends/tfhe-cuda-backend/cuda/include/linear_algebra.h index 10c476c12b..133103432c 100644 --- a/backends/tfhe-cuda-backend/cuda/include/linear_algebra.h +++ b/backends/tfhe-cuda-backend/cuda/include/linear_algebra.h @@ -27,6 +27,12 @@ void cuda_add_lwe_ciphertext_vector_64(void *stream, uint32_t gpu_index, void const *lwe_array_in_2, uint32_t input_lwe_dimension, uint32_t input_lwe_ciphertext_count); +void cuda_add_lwe_ciphertext_vector_64_with_packing( + void *stream, uint32_t gpu_index, void *lwe_array_out, + void const *lwe_array_in_1, void const *lwe_array_in_2, + uint32_t 
input_lwe_dimension, uint32_t input_lwe_ciphertext_count, + uint32_t message_modulus); + void cuda_add_lwe_ciphertext_vector_plaintext_vector_32( void *stream, uint32_t gpu_index, void *lwe_array_out, void const *lwe_array_in, void const *plaintext_array_in, diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/div_rem.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/div_rem.cuh index 4bd9933f2c..51ddd567a9 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/div_rem.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/integer/div_rem.cuh @@ -425,11 +425,16 @@ __host__ void host_integer_div_rem_kb(cudaStream_t const *streams, auto do_overflowing_sub = [&](cudaStream_t const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count) { - host_integer_overflowing_sub_kb( + uint32_t compute_borrow = 1; + uint32_t uses_input_borrow = 0; + mem_ptr->overflow_sub_mem->update_lut_indexes( + streams, gpu_indexes, merged_interesting_remainder.len); + host_integer_overflowing_sub( streams, gpu_indexes, gpu_count, new_remainder.data, - subtraction_overflowed.data, merged_interesting_remainder.data, - interesting_divisor.data, bsks, ksks, mem_ptr->overflow_sub_mem, - merged_interesting_remainder.len); + (uint64_t *)merged_interesting_remainder.data, + interesting_divisor.data, subtraction_overflowed.data, + (const Torus *)nullptr, mem_ptr->overflow_sub_mem, bsks, ksks, + merged_interesting_remainder.len, compute_borrow, uses_input_borrow); }; // fills: diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cu b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cu index 53b1366c37..8e13980c51 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cu +++ b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cu @@ -1,4 +1,5 @@ #include "integer/integer.cuh" +#include "integer/negation.cuh" #include void cuda_full_propagation_64_inplace(void *const *streams, @@ -62,6 +63,46 @@ void scratch_cuda_propagate_single_carry_kb_64_inplace( allocate_gpu_memory); } +void scratch_cuda_fast_propagate_single_carry_kb_64_inplace( + void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count, + int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size, + uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level, + uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log, + uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus, + uint32_t carry_modulus, PBS_TYPE pbs_type, uint32_t requested_flag, + uint32_t uses_carry, bool allocate_gpu_memory) { + + int_radix_params params(pbs_type, glwe_dimension, polynomial_size, + big_lwe_dimension, small_lwe_dimension, ks_level, + ks_base_log, pbs_level, pbs_base_log, grouping_factor, + message_modulus, carry_modulus); + + scratch_cuda_fast_propagate_single_carry_kb_inplace( + (cudaStream_t *)(streams), gpu_indexes, gpu_count, + (int_fast_sc_prop_memory **)mem_ptr, num_blocks, params, + requested_flag, uses_carry, allocate_gpu_memory); +} + +void scratch_cuda_integer_overflowing_sub_kb_64_inplace( + void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count, + int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size, + uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level, + uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log, + uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus, + uint32_t carry_modulus, PBS_TYPE pbs_type, uint32_t compute_overflow, + bool allocate_gpu_memory) { + + int_radix_params params(pbs_type, 
glwe_dimension, polynomial_size, + big_lwe_dimension, small_lwe_dimension, ks_level, + ks_base_log, pbs_level, pbs_base_log, grouping_factor, + message_modulus, carry_modulus); + + scratch_cuda_integer_overflowing_sub( + (cudaStream_t *)(streams), gpu_indexes, gpu_count, + (int_fast_borrow_prop_memory **)mem_ptr, num_blocks, params, + compute_overflow, allocate_gpu_memory); +} + void cuda_propagate_single_carry_kb_64_inplace( void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count, void *lwe_array, void *carry_out, int8_t *mem_ptr, void *const *bsks, @@ -73,6 +114,37 @@ void cuda_propagate_single_carry_kb_64_inplace( (uint64_t **)(ksks), num_blocks); } +void cuda_fast_propagate_single_carry_kb_64_inplace( + void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count, + void *lwe_array, void *carry_out, const void *carry_in, int8_t *mem_ptr, + void *const *bsks, void *const *ksks, uint32_t num_blocks, + uint32_t requested_flag, uint32_t uses_carry) { + + host_fast_propagate_single_carry( + (cudaStream_t *)(streams), gpu_indexes, gpu_count, + static_cast(lwe_array), static_cast(carry_out), + static_cast(carry_in), + (int_fast_sc_prop_memory *)mem_ptr, bsks, (uint64_t **)(ksks), + num_blocks, requested_flag, uses_carry); +} + +void cuda_integer_overflowing_sub_kb_64_inplace( + void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count, + void *lhs_array, const void *rhs_array, void *overflow_block, + const void *input_borrow, int8_t *mem_ptr, void *const *bsks, + void *const *ksks, uint32_t num_blocks, uint32_t compute_overflow, + uint32_t uses_input_borrow) { + + host_integer_overflowing_sub( + (cudaStream_t const *)streams, gpu_indexes, gpu_count, + static_cast(lhs_array), static_cast(lhs_array), + static_cast(rhs_array), + static_cast(overflow_block), + static_cast(input_borrow), + (int_fast_borrow_prop_memory *)mem_ptr, bsks, (uint64_t **)ksks, + num_blocks, compute_overflow, uses_input_borrow); +} + void cuda_propagate_single_carry_get_input_carries_kb_64_inplace( void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count, void *lwe_array, void *carry_out, void *input_carries, int8_t *mem_ptr, @@ -94,6 +166,24 @@ void cleanup_cuda_propagate_single_carry(void *const *streams, mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count); } +void cleanup_cuda_fast_propagate_single_carry(void *const *streams, + uint32_t const *gpu_indexes, + uint32_t gpu_count, + int8_t **mem_ptr_void) { + int_fast_sc_prop_memory *mem_ptr = + (int_fast_sc_prop_memory *)(*mem_ptr_void); + mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count); +} + +void cleanup_cuda_integer_overflowing_sub(void *const *streams, + uint32_t const *gpu_indexes, + uint32_t gpu_count, + int8_t **mem_ptr_void) { + int_fast_borrow_prop_memory *mem_ptr = + (int_fast_borrow_prop_memory *)(*mem_ptr_void); + mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count); +} + void scratch_cuda_apply_univariate_lut_kb_64( void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr, void const *input_lut, uint32_t lwe_dimension, diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh index 8560b94c8d..80c18dce38 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh @@ -8,6 +8,7 @@ #include "integer/scalar_addition.cuh" #include "linear_algebra.h" #include "linearalgebra/addition.cuh" +#include 
"linearalgebra/negation.cuh" #include "pbs/programmable_bootstrap.h" #include "polynomial/functions.cuh" #include "utils/helper.cuh" @@ -128,6 +129,134 @@ host_radix_blocks_reverse_inplace(cudaStream_t const *streams, <<>>(src, blocks_count, lwe_size); } +// If group_size = 4, the first group of 4 elements will be transformed as +// follows: +// dest[0] = src[0] +// dest[1] = src[0] + src[1] +// dest[2] = src[0] + src[1] + src[2] +// dest[3] = src[0] + src[1] + src[2] + src[3] +template +__global__ void +radix_cumulative_sum_in_groups(Torus *dest, Torus *src, uint32_t blocks_count, + uint32_t lwe_size, uint32_t group_size) { + + size_t block_offset = blockIdx.x * group_size * lwe_size; + + for (int j = threadIdx.x; j < lwe_size; j += blockDim.x) { + size_t idx = j + block_offset; + Torus sum = src[idx]; + dest[idx] = sum; + for (int gidx = 1; gidx < group_size; gidx++) { + if (gidx + blockIdx.x * group_size < + blocks_count) { // in case the last group is not full + sum += src[idx + gidx * lwe_size]; + dest[idx + gidx * lwe_size] = sum; + } + } + } +} + +template +__host__ void host_radix_cumulative_sum_in_groups( + cudaStream_t stream, uint32_t gpu_index, Torus *dest, Torus *src, + uint32_t radix_blocks_count, uint32_t lwe_size, uint32_t group_size) { + cudaSetDevice(gpu_index); + // Each CUDA block is responsible for a single group + int num_blocks = (radix_blocks_count + group_size - 1) / group_size, + num_threads = 512; + radix_cumulative_sum_in_groups<<>>( + dest, src, radix_blocks_count, lwe_size, group_size); +} + +template +__global__ void radix_split_simulators_and_grouping_pgns( + Torus *simulators, Torus *grouping_pgns, Torus *src, uint32_t blocks_count, + uint32_t lwe_size, uint32_t group_size, Torus delta) { + + size_t block_offset = blockIdx.x * lwe_size; + if (blockIdx.x % group_size == 0) { + if (blockIdx.x == 0) { + // save trivial 0 + for (int j = threadIdx.x; j < lwe_size; j += blockDim.x) { + simulators[j] = 0; + } + } else { + // save trivial 1 + for (int j = threadIdx.x; j < lwe_size - 1; j += blockDim.x) { + size_t simu_idx = j + block_offset; + simulators[simu_idx] = 0; + } + if (threadIdx.x == 0) { + simulators[lwe_size - 1 + block_offset] = 1 * delta; + } + } + + if ((blockIdx.x / group_size + 1) < + (blocks_count + group_size - 1) / group_size) { + size_t src_offset = (blockIdx.x + group_size - 1) * lwe_size; + size_t pgns_offset = (blockIdx.x / group_size) * lwe_size; + for (int j = threadIdx.x; j < lwe_size; j += blockDim.x) { + size_t in_offset = j + src_offset; + size_t out_offset = j + pgns_offset; + grouping_pgns[out_offset] = src[in_offset]; + } + } + } else { + // save simulators + size_t src_offset = (blockIdx.x - 1) * lwe_size; + for (int j = threadIdx.x; j < lwe_size; j += blockDim.x) { + simulators[j + block_offset] = src[j + src_offset]; + } + } +} + +template +__host__ void host_radix_split_simulators_and_grouping_pgns( + cudaStream_t stream, uint32_t gpu_index, Torus *simulators, + Torus *grouping_pgns, Torus *src, uint32_t radix_blocks_count, + uint32_t lwe_size, uint32_t group_size, Torus delta) { + cudaSetDevice(gpu_index); + // Each CUDA block is responsible for a single group + int num_blocks = radix_blocks_count, num_threads = 512; + radix_split_simulators_and_grouping_pgns + <<>>(simulators, grouping_pgns, src, + radix_blocks_count, lwe_size, + group_size, delta); +} + +// If group_size = 4, the first group of 4 elements will be transformed as +// follows: +// src1 size num_radix_blocks * lwe_size +// src2 size num_group * lwe_size +// 
dest[0] = src1[0] + src2[0]
+// dest[1] = src1[1] + src2[0]
+// dest[2] = src1[2] + src2[0]
+// dest[3] = src1[3] + src2[0]
+template <typename Torus>
+__global__ void radix_sum_in_groups(Torus *dest, Torus *src1, Torus *src2,
+                                    uint32_t blocks_count, uint32_t lwe_size,
+                                    uint32_t group_size) {
+
+  size_t src1_offset = blockIdx.x * lwe_size;
+  size_t src2_index = (blockIdx.x / group_size) * lwe_size;
+  for (int j = threadIdx.x; j < lwe_size; j += blockDim.x) {
+    size_t idx = j + src1_offset;
+    dest[idx] = src1[idx] + src2[j + src2_index];
+  }
+}
+
+template <typename Torus>
+__host__ void host_radix_sum_in_groups(cudaStream_t stream, uint32_t gpu_index,
+                                       Torus *dest, Torus *src1, Torus *src2,
+                                       uint32_t radix_blocks_count,
+                                       uint32_t lwe_size, uint32_t group_size) {
+  cudaSetDevice(gpu_index);
+
+  int num_blocks = radix_blocks_count, num_threads = 512;
+  radix_sum_in_groups<<<num_blocks, num_threads, 0, stream>>>(
+      dest, src1, src2, radix_blocks_count, lwe_size, group_size);
+}
+
 // polynomial_size threads
 template
 __global__ void
@@ -479,6 +608,48 @@ void generate_lookup_table(Torus *acc, uint32_t glwe_dimension,
   rotate_left(body, half_box_size, polynomial_size);
 }
+template <typename Torus>
+void generate_many_lookup_table(
+    Torus *acc, uint32_t glwe_dimension, uint32_t polynomial_size,
+    uint32_t message_modulus, uint32_t carry_modulus,
+    std::vector<std::function<Torus(Torus)>> &functions) {
+
+  uint32_t modulus_sup = message_modulus * carry_modulus;
+  uint32_t box_size = polynomial_size / modulus_sup;
+  Torus delta = (1ul << 63) / modulus_sup;
+
+  memset(acc, 0, glwe_dimension * polynomial_size * sizeof(Torus));
+
+  auto body = &acc[glwe_dimension * polynomial_size];
+
+  size_t fn_counts = functions.size();
+
+  assert(fn_counts <= modulus_sup / 2);
+
+  // Space used for each sub-LUT
+  uint32_t single_function_sub_lut_size = (modulus_sup / fn_counts) * box_size;
+
+  // Fill each function's sub-LUT with f(i) encoded under delta
+  for (int f = 0; f < fn_counts; f++) {
+    int lut_offset = f * single_function_sub_lut_size;
+    for (int i = 0; i < modulus_sup / fn_counts; i++) {
+      int index = i * box_size + lut_offset;
+      for (int j = index; j < index + box_size; j++) {
+        auto f_eval = functions[f](i);
+        body[j] = f_eval * delta;
+      }
+    }
+  }
+  int half_box_size = box_size / 2;
+
+  // Negate the first half_box_size coefficients
+  for (int i = 0; i < half_box_size; i++) {
+    body[i] = -body[i];
+  }
+
+  rotate_left(body, half_box_size, polynomial_size);
+}
+
 template <typename Torus>
 void generate_lookup_table_bivariate(Torus *acc, uint32_t glwe_dimension,
                                      uint32_t polynomial_size,
@@ -612,6 +783,37 @@ void generate_device_accumulator(cudaStream_t stream, uint32_t gpu_index,
   free(h_lut);
 }
+/*
+ * Generate a many-LUT accumulator into a device pointer
+ * stream - cuda stream
+ * acc - device pointer for the accumulator
+ * ...
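+ * Layout note (sketch): the host-side table is filled by
+ * generate_many_lookup_table above; with modulus_sup = message_modulus *
+ * carry_modulus and box_size = polynomial_size / modulus_sup, each of the
+ * fn_counts functions owns a sub-LUT of (modulus_sup / fn_counts) * box_size
+ * coefficients, e.g. modulus_sup = 16 with two functions gives two sub-LUTs
+ * of 8 boxes each.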
+ * vector - evaluating functions with one Torus input + */ +template +void generate_many_lut_device_accumulator( + cudaStream_t stream, uint32_t gpu_index, Torus *acc, + uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t message_modulus, + uint32_t carry_modulus, + std::vector> &functions) { + + // host lut + Torus *h_lut = + (Torus *)malloc((glwe_dimension + 1) * polynomial_size * sizeof(Torus)); + + // fill accumulator + generate_many_lookup_table(h_lut, glwe_dimension, polynomial_size, + message_modulus, carry_modulus, functions); + + // copy host lut and lut_indexes_vec to device + cuda_memcpy_async_to_gpu( + acc, h_lut, (glwe_dimension + 1) * polynomial_size * sizeof(Torus), + stream, gpu_index); + + cuda_synchronize_stream(stream, gpu_index); + free(h_lut); +} + template void scratch_cuda_propagate_single_carry_kb_inplace( cudaStream_t const *streams, uint32_t const *gpu_indexes, @@ -624,6 +826,108 @@ void scratch_cuda_propagate_single_carry_kb_inplace( num_radix_blocks, allocate_gpu_memory); } +template +void host_compute_shifted_blocks_and_states( + cudaStream_t const *streams, uint32_t const *gpu_indexes, + uint32_t gpu_count, Torus *lwe_array, int_radix_params params, + int_shifted_blocks_and_states_memory *mem, void *const *bsks, + Torus *const *ksks, uint32_t num_blocks, uint32_t lut_stride, + uint32_t lut_count) { + + auto glwe_dimension = params.glwe_dimension; + auto polynomial_size = params.polynomial_size; + uint32_t big_lwe_size = glwe_dimension * polynomial_size + 1; + auto big_lwe_size_bytes = big_lwe_size * sizeof(Torus); + + auto shifted_blocks_and_states = mem->shifted_blocks_and_states; + auto luts_array_first_step = mem->luts_array_first_step; + + integer_radix_apply_many_univariate_lookup_table_kb( + streams, gpu_indexes, gpu_count, shifted_blocks_and_states, lwe_array, + bsks, ksks, num_blocks, luts_array_first_step, lut_count, lut_stride); + + auto shifted_blocks = mem->shifted_blocks; + auto block_states = mem->block_states; + cuda_memcpy_async_gpu_to_gpu(block_states, shifted_blocks_and_states, + big_lwe_size_bytes * num_blocks, streams[0], + gpu_indexes[0]); + cuda_memcpy_async_gpu_to_gpu( + shifted_blocks, shifted_blocks_and_states + big_lwe_size * num_blocks, + big_lwe_size_bytes * num_blocks, streams[0], gpu_indexes[0]); +} + +template +void host_resolve_group_carries_sequentially( + cudaStream_t const *streams, uint32_t const *gpu_indexes, + uint32_t gpu_count, Torus *resolved_carries, Torus *grouping_pgns, + int_radix_params params, int_seq_group_prop_memory *mem, + void *const *bsks, Torus *const *ksks, uint32_t num_groups) { + + auto glwe_dimension = params.glwe_dimension; + auto polynomial_size = params.polynomial_size; + uint32_t big_lwe_size = glwe_dimension * polynomial_size + 1; + auto big_lwe_size_bytes = big_lwe_size * sizeof(Torus); + + auto group_resolved_carries = mem->group_resolved_carries; + if (num_groups > 1) { + // First carry is just copied + cuda_memcpy_async_gpu_to_gpu(resolved_carries + big_lwe_size, grouping_pgns, + big_lwe_size_bytes, streams[0], + gpu_indexes[0]); + uint32_t solve_per_iter = mem->grouping_size - 1; + uint32_t remaining_carries = + num_groups - + 2; // the first one has been resolved and we ignore the last one + uint32_t num_loops = + ceil(double(remaining_carries) / (double)(solve_per_iter)); + uint32_t last_resolved_pos = 1; + + for (int i = 0; i < num_loops; i++) { + uint32_t loop_offset = i * solve_per_iter; + uint32_t blocks_to_solve = solve_per_iter; + // In case the last iteration has to solve 
less + if (loop_offset + blocks_to_solve > num_groups - 2) { + blocks_to_solve = remaining_carries - loop_offset; + } + + // The group_resolved carries is used as an intermediate array + // First we need to copy the last resolved carry + cuda_memcpy_async_gpu_to_gpu( + group_resolved_carries, + resolved_carries + last_resolved_pos * big_lwe_size, + big_lwe_size_bytes, streams[0], gpu_indexes[0]); + + // The array is filled with the blocks_to_solve + cuda_memcpy_async_gpu_to_gpu( + group_resolved_carries + big_lwe_size, + grouping_pgns + last_resolved_pos * big_lwe_size, + blocks_to_solve * big_lwe_size_bytes, streams[0], gpu_indexes[0]); + + // Perform one group cumulative sum + host_radix_cumulative_sum_in_groups( + streams[0], gpu_indexes[0], group_resolved_carries, + group_resolved_carries, blocks_to_solve + 1, big_lwe_size, + mem->grouping_size); + + // Apply the lut + auto luts_sequential = mem->lut_sequential_algorithm; + integer_radix_apply_univariate_lookup_table_kb( + streams, gpu_indexes, gpu_count, + group_resolved_carries + big_lwe_size, + group_resolved_carries + big_lwe_size, bsks, ksks, blocks_to_solve, + luts_sequential); + + // Copy the result to the resolved carries array + cuda_memcpy_async_gpu_to_gpu( + resolved_carries + (last_resolved_pos + 1) * big_lwe_size, + group_resolved_carries + big_lwe_size, + blocks_to_solve * big_lwe_size_bytes, streams[0], gpu_indexes[0]); + + last_resolved_pos += blocks_to_solve; + } + } +} + template void host_compute_prefix_sum_hillis_steele( cudaStream_t const *streams, uint32_t const *gpu_indexes, @@ -661,6 +965,95 @@ void host_compute_prefix_sum_hillis_steele( } } +template +void host_compute_propagation_simulators_and_group_carries( + cudaStream_t const *streams, uint32_t const *gpu_indexes, + uint32_t gpu_count, Torus *block_states, int_radix_params params, + int_prop_simu_group_carries_memory *mem, void *const *bsks, + Torus *const *ksks, uint32_t num_blocks, uint32_t num_groups) { + + auto glwe_dimension = params.glwe_dimension; + auto polynomial_size = params.polynomial_size; + auto message_modulus = params.message_modulus; + auto carry_modulus = params.carry_modulus; + + uint32_t big_lwe_size = glwe_dimension * polynomial_size + 1; + + auto propagation_cum_sums = mem->propagation_cum_sums; + auto group_size = mem->group_size; + host_radix_cumulative_sum_in_groups( + streams[0], gpu_indexes[0], propagation_cum_sums, block_states, + num_blocks, big_lwe_size, group_size); + + auto luts_array_second_step = mem->luts_array_second_step; + integer_radix_apply_univariate_lookup_table_kb( + streams, gpu_indexes, gpu_count, propagation_cum_sums, + propagation_cum_sums, bsks, ksks, num_blocks, luts_array_second_step); + + auto scalar_array_cum_sum = mem->scalar_array_cum_sum; + auto big_lwe_dimension = big_lwe_size - 1; + + host_integer_radix_scalar_addition_inplace( + streams, gpu_indexes, gpu_count, propagation_cum_sums, + scalar_array_cum_sum, big_lwe_dimension, num_blocks, message_modulus, + carry_modulus); + + uint32_t modulus_sup = message_modulus * carry_modulus; + Torus delta = (1ull << 63) / modulus_sup; + auto simulators = mem->simulators; + auto grouping_pgns = mem->grouping_pgns; + host_radix_split_simulators_and_grouping_pgns( + streams[0], gpu_indexes[0], simulators, grouping_pgns, + propagation_cum_sums, num_blocks, big_lwe_size, group_size, delta); + + auto resolved_carries = mem->resolved_carries; + if (mem->use_sequential_algorithm_to_resolver_group_carries) { + // Resolve group carries sequentially + 
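+    // (The sequential path resolves up to grouping_size - 1 group carries per
+    // iteration, reusing the group cumulative-sum kernel followed by a single
+    // LUT application; see host_resolve_group_carries_sequentially above.)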
host_resolve_group_carries_sequentially( + streams, gpu_indexes, gpu_count, resolved_carries, grouping_pgns, + params, mem->seq_group_prop_mem, bsks, ksks, num_groups); + } else { + // Resolve group carries with hillis steele + auto luts_carry_propagation_sum = mem->hs_group_prop_mem->lut_hillis_steele; + host_compute_prefix_sum_hillis_steele( + streams, gpu_indexes, gpu_count, &resolved_carries[big_lwe_size], + grouping_pgns, params, luts_carry_propagation_sum, bsks, ksks, + num_groups - 1); + } +} + +template +void host_compute_shifted_blocks_and_borrow_states( + cudaStream_t const *streams, uint32_t const *gpu_indexes, + uint32_t gpu_count, Torus *lwe_array, int_radix_params params, + int_shifted_blocks_and_borrow_states_memory *mem, void *const *bsks, + Torus *const *ksks, uint32_t num_blocks, uint32_t lut_stride, + uint32_t lut_count) { + + auto glwe_dimension = params.glwe_dimension; + auto polynomial_size = params.polynomial_size; + uint32_t big_lwe_size = glwe_dimension * polynomial_size + 1; + auto big_lwe_size_bytes = big_lwe_size * sizeof(Torus); + + auto shifted_blocks_and_borrow_states = mem->shifted_blocks_and_borrow_states; + auto luts_array_first_step = mem->luts_array_first_step; + + integer_radix_apply_many_univariate_lookup_table_kb( + streams, gpu_indexes, gpu_count, shifted_blocks_and_borrow_states, + lwe_array, bsks, ksks, num_blocks, luts_array_first_step, lut_count, + lut_stride); + + auto shifted_blocks = mem->shifted_blocks; + auto borrow_states = mem->borrow_states; + cuda_memcpy_async_gpu_to_gpu(borrow_states, shifted_blocks_and_borrow_states, + big_lwe_size_bytes * num_blocks, streams[0], + gpu_indexes[0]); + cuda_memcpy_async_gpu_to_gpu( + shifted_blocks, + shifted_blocks_and_borrow_states + big_lwe_size * num_blocks, + big_lwe_size_bytes * num_blocks, streams[0], gpu_indexes[0]); +} + template void host_propagate_single_carry(cudaStream_t const *streams, uint32_t const *gpu_indexes, @@ -1146,4 +1539,247 @@ void host_apply_bivariate_lut_kb( radix_lwe_in_2, bsks, ksks, num_blocks, mem, shift); } +template +void scratch_cuda_fast_propagate_single_carry_kb_inplace( + cudaStream_t const *streams, uint32_t const *gpu_indexes, + uint32_t gpu_count, int_fast_sc_prop_memory **mem_ptr, + uint32_t num_radix_blocks, int_radix_params params, uint32_t requested_flag, + uint32_t uses_carry, bool allocate_gpu_memory) { + + *mem_ptr = new int_fast_sc_prop_memory( + streams, gpu_indexes, gpu_count, params, num_radix_blocks, requested_flag, + uses_carry, allocate_gpu_memory); +} + +template +void host_fast_propagate_single_carry( + cudaStream_t const *streams, uint32_t const *gpu_indexes, + uint32_t gpu_count, Torus *lwe_array, Torus *carry_out, + const Torus *input_carries, int_fast_sc_prop_memory *mem, + void *const *bsks, Torus *const *ksks, uint32_t num_blocks, + uint32_t requested_flag, uint32_t uses_carry) { + auto params = mem->params; + auto glwe_dimension = params.glwe_dimension; + auto polynomial_size = params.polynomial_size; + auto message_modulus = params.message_modulus; + auto carry_modulus = params.carry_modulus; + uint32_t big_lwe_size = glwe_dimension * polynomial_size + 1; + auto big_lwe_size_bytes = big_lwe_size * sizeof(Torus); + auto big_lwe_dimension = big_lwe_size - 1; // For host addition + auto lut_stride = mem->lut_stride; + auto lut_count = mem->lut_count; + + enum outputFlag { NONE = 0, OVERFLOW = 1, CARRY = 2 }; + if (uses_carry == 1) { + host_addition(streams[0], gpu_indexes[0], lwe_array, lwe_array, + input_carries, big_lwe_dimension, 1); + } + + 
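+  // Propagation sketch (rough outline of the steps below):
+  //   1) one many-LUT pass computes, for every block, its shifted value and
+  //      its propagation state,
+  //   2) states are summed inside each group and turned into per-block
+  //      simulators plus one propagation signal per group (grouping_pgns),
+  //   3) group carries are resolved either sequentially or with a
+  //      Hillis-Steele prefix scan,
+  //   4) resolved carries are added back in groups and a final LUT extracts
+  //      the message (and the carry/overflow flag when requested).
+  // (With message_modulus = carry_modulus = 4, a group typically holds
+  // log2(16) = 4 blocks.)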
+  host_compute_shifted_blocks_and_states(
+      streams, gpu_indexes, gpu_count, lwe_array, params,
+      mem->shifted_blocks_state_mem, bsks, ksks, num_blocks, lut_stride,
+      lut_count);
+  auto block_states = mem->shifted_blocks_state_mem->block_states;
+  if (requested_flag == outputFlag::OVERFLOW) {
+    // This LUT could be folded into the many-LUT pass above with some
+    // trickery so that it runs in parallel; for now it is applied as a
+    // separate call.
+    auto lut_overflow_prep = mem->lut_overflow_flag_prep;
+    integer_radix_apply_univariate_lookup_table_kb(
+        streams, gpu_indexes, gpu_count, mem->output_flag,
+        lwe_array + (num_blocks - 1) * big_lwe_size, bsks, ksks, 1,
+        lut_overflow_prep);
+  } else if (requested_flag == outputFlag::CARRY) {
+    cuda_memcpy_async_gpu_to_gpu(
+        mem->output_flag, block_states + (num_blocks - 1) * big_lwe_size,
+        big_lwe_size_bytes, streams[0], gpu_indexes[0]);
+  }
+
+  host_compute_propagation_simulators_and_group_carries(
+      streams, gpu_indexes, gpu_count, block_states, params,
+      mem->prop_simu_group_carries_mem, bsks, ksks, num_blocks,
+      mem->num_groups);
+
+  auto group_size = mem->prop_simu_group_carries_mem->group_size;
+
+  auto prepared_blocks = mem->prop_simu_group_carries_mem->prepared_blocks;
+  auto shifted_blocks = mem->shifted_blocks_state_mem->shifted_blocks;
+  host_addition(streams[0], gpu_indexes[0], prepared_blocks,
+                shifted_blocks,
+                mem->prop_simu_group_carries_mem->simulators,
+                big_lwe_dimension, num_blocks);
+
+  if (requested_flag == outputFlag::OVERFLOW ||
+      requested_flag == outputFlag::CARRY) {
+    host_addition(streams[0], gpu_indexes[0], mem->output_flag,
+                  mem->output_flag,
+                  mem->prop_simu_group_carries_mem->simulators +
+                      (num_blocks - 1) * big_lwe_size,
+                  big_lwe_dimension, 1);
+  }
+
+  for (uint j = 0; j < mem->active_gpu_count; j++) {
+    cuda_synchronize_stream(streams[j], gpu_indexes[j]);
+  }
+
+  // Add the resolved carries in groups and clean up the result blocks on
+  // sub_streams_1; the requested flag, if any, is finalized on sub_streams_2.
+  host_radix_sum_in_groups(
+      mem->sub_streams_1[0], gpu_indexes[0], prepared_blocks, prepared_blocks,
+      mem->prop_simu_group_carries_mem->resolved_carries, num_blocks,
+      big_lwe_size, group_size);
+
+  auto message_extract = mem->lut_message_extract;
+  integer_radix_apply_univariate_lookup_table_kb(
+      mem->sub_streams_1, gpu_indexes, gpu_count, lwe_array, prepared_blocks,
+      bsks, ksks, num_blocks, message_extract);
+
+  if (requested_flag == outputFlag::OVERFLOW ||
+      requested_flag == outputFlag::CARRY) {
+    // This step could also be parallelized with some extra work; for now it
+    // runs sequentially on sub_streams_2.
+    host_addition(mem->sub_streams_2[0], gpu_indexes[0],
+                  mem->output_flag, mem->output_flag,
+                  mem->prop_simu_group_carries_mem->resolved_carries +
+                      (mem->num_groups - 1) * big_lwe_size,
+                  big_lwe_dimension, 1);
+
+    if (requested_flag == outputFlag::OVERFLOW) {
+      integer_radix_apply_univariate_lookup_table_kb(
+          mem->sub_streams_2, gpu_indexes, gpu_count, mem->output_flag,
+          mem->output_flag, bsks, ksks, 1, mem->lut_overflow_flag_last);
+    } else {
+      integer_radix_apply_univariate_lookup_table_kb(
+          mem->sub_streams_2, gpu_indexes, gpu_count, mem->output_flag,
+          mem->output_flag, bsks, ksks, 1, mem->lut_carry_flag_last);
+    }
+    for (uint j = 0; j < mem->active_gpu_count; j++) {
+      cuda_memcpy_async_gpu_to_gpu(carry_out, mem->output_flag,
+                                   big_lwe_size_bytes, mem->sub_streams_2[j],
+                                   gpu_indexes[j]);
+    }
+  }
+
+  for (uint j = 0; j < mem->active_gpu_count; j++) {
+    cuda_synchronize_stream(mem->sub_streams_1[j], gpu_indexes[j]);
+    cuda_synchronize_stream(mem->sub_streams_2[j], gpu_indexes[j]);
+  }
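+  // At this point lwe_array holds the propagated result; carry_out (when a
+  // flag was requested) holds the carry or overflow flag computed on
+  // sub_streams_2.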
+}
+
+template <typename Torus>
+void scratch_cuda_integer_overflowing_sub(
+    cudaStream_t const *streams, uint32_t const *gpu_indexes,
+    uint32_t gpu_count, int_fast_borrow_prop_memory<Torus> **mem_ptr,
+    uint32_t num_radix_blocks, int_radix_params params,
+    uint32_t compute_overflow, bool allocate_gpu_memory) {
+
+  *mem_ptr = new int_fast_borrow_prop_memory<Torus>(
+      streams, gpu_indexes, gpu_count, params, num_radix_blocks,
+      compute_overflow, allocate_gpu_memory);
+}
+
+template <typename Torus>
+void host_fast_borrow_propagate(cudaStream_t const *streams,
+                                uint32_t const *gpu_indexes, uint32_t gpu_count,
+                                Torus *lhsrhs_array, Torus *overflow_block,
+                                const Torus *input_borrow,
+                                int_fast_borrow_prop_memory<Torus> *mem,
+                                void *const *bsks, Torus *const *ksks,
+                                uint32_t num_blocks, uint32_t num_groups,
+                                uint32_t compute_overflow,
+                                uint32_t uses_input_borrow) {
+  auto params = mem->params;
+  auto glwe_dimension = params.glwe_dimension;
+  auto polynomial_size = params.polynomial_size;
+  auto message_modulus = params.message_modulus;
+  auto carry_modulus = params.carry_modulus;
+  uint32_t big_lwe_size = glwe_dimension * polynomial_size + 1;
+  auto big_lwe_size_bytes = big_lwe_size * sizeof(Torus);
+  auto big_lwe_dimension = big_lwe_size - 1;
+  auto lut_stride = mem->lut_stride;
+  auto lut_count = mem->lut_count;
+
+  assert(mem->num_groups >= num_groups);
+  if (uses_input_borrow == 1) {
+    host_unchecked_sub_with_correcting_term(
+        streams[0], gpu_indexes[0], lhsrhs_array, lhsrhs_array, input_borrow,
+        big_lwe_dimension, 1, message_modulus, carry_modulus,
+        message_modulus - 1);
+  }
+
+  host_compute_shifted_blocks_and_borrow_states(
+      streams, gpu_indexes, gpu_count, lhsrhs_array, params,
+      mem->shifted_blocks_borrow_state_mem, bsks, ksks, num_blocks, lut_stride,
+      lut_count);
+
+  auto borrow_states = mem->shifted_blocks_borrow_state_mem->borrow_states;
+  cuda_memcpy_async_gpu_to_gpu(mem->overflow_block,
+                               borrow_states + (num_blocks - 1) * big_lwe_size,
+                               big_lwe_size_bytes, streams[0], gpu_indexes[0]);
+
+  host_compute_propagation_simulators_and_group_carries(
+      streams, gpu_indexes, gpu_count, borrow_states, params,
+      mem->prop_simu_group_carries_mem, bsks, ksks, num_blocks, num_groups);
+
+  auto shifted_blocks = mem->shifted_blocks_borrow_state_mem->shifted_blocks;
+  auto prepared_blocks = mem->prop_simu_group_carries_mem->prepared_blocks;
+  auto simulators = mem->prop_simu_group_carries_mem->simulators;
+
+  host_subtraction(streams[0], gpu_indexes[0], prepared_blocks,
+                   shifted_blocks, simulators, big_lwe_dimension,
+                   num_blocks);
+
+  // unchecked_scalar_add_assign
+  host_integer_radix_add_scalar_one_inplace(
+      streams, gpu_indexes, gpu_count, prepared_blocks, big_lwe_dimension,
+      num_blocks, message_modulus, carry_modulus);
+
+  // unchecked_add_assign into overflow_block
+  if (compute_overflow == 1) {
+    host_addition(streams[0], gpu_indexes[0], mem->overflow_block,
+                  mem->overflow_block,
+                  mem->prop_simu_group_carries_mem->simulators +
+                      (num_blocks - 1) * big_lwe_size,
+                  big_lwe_dimension, 1);
+  }
+  auto resolved_borrows = mem->prop_simu_group_carries_mem->resolved_carries;
+
+  // This must happen before the next step, which modifies the resolved
+  // borrows
+  if (compute_overflow == 1) {
+    host_addition(streams[0], gpu_indexes[0], mem->overflow_block,
+                  mem->overflow_block,
+                  resolved_borrows + (num_groups - 1) * big_lwe_size,
+                  big_lwe_dimension, 1);
+  }
+
+  cudaEventRecord(mem->incoming_events[0], streams[0]);
+  cudaStreamWaitEvent(mem->sub_streams_1[0], mem->incoming_events[0], 0);
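+  // Fork two helper streams off streams[0] via the incoming event:
+  // sub_streams_1 computes the borrow/overflow flag while sub_streams_2
+  // negates the resolved borrows, adds them back per group and extracts the
+  // final messages.
+  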
cudaStreamWaitEvent(mem->sub_streams_2[0], mem->incoming_events[0], 0); + + if (compute_overflow == 1) { + auto borrow_flag = mem->lut_borrow_flag; + integer_radix_apply_univariate_lookup_table_kb( + mem->sub_streams_1, gpu_indexes, gpu_count, overflow_block, + mem->overflow_block, bsks, ksks, 1, borrow_flag); + } + cudaEventRecord(mem->outgoing_events1[0], mem->sub_streams_1[0]); + // subtract borrow and cleanup prepared blocks + host_negation(mem->sub_streams_2[0], gpu_indexes[0], resolved_borrows, + resolved_borrows, big_lwe_dimension, num_groups); + + host_radix_sum_in_groups( + mem->sub_streams_2[0], gpu_indexes[0], prepared_blocks, prepared_blocks, + resolved_borrows, num_blocks, big_lwe_size, mem->group_size); + + auto message_extract = mem->lut_message_extract; + integer_radix_apply_univariate_lookup_table_kb( + mem->sub_streams_2, gpu_indexes, gpu_count, lhsrhs_array, prepared_blocks, + bsks, ksks, num_blocks, message_extract); + + cudaEventRecord(mem->outgoing_events2[0], mem->sub_streams_2[0]); + + cudaStreamWaitEvent(streams[0], mem->outgoing_events1[0], 0); + cudaStreamWaitEvent(streams[0], mem->outgoing_events2[0], 0); +} + #endif // TFHE_RS_INTERNAL_INTEGER_CUH diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh index 0ebf410125..6f51099a3b 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh @@ -562,10 +562,21 @@ __host__ void host_integer_mult_radix_kb( terms_degree, bsks, ksks, mem_ptr->sum_ciphertexts_mem, num_blocks, 2 * num_blocks, mem_ptr->luts_array); - auto scp_mem_ptr = mem_ptr->sum_ciphertexts_mem->scp_mem; - host_propagate_single_carry(streams, gpu_indexes, gpu_count, - radix_lwe_out, nullptr, nullptr, - scp_mem_ptr, bsks, ksks, num_blocks); + uint32_t block_modulus = message_modulus * carry_modulus; + uint32_t num_bits_in_block = std::log2(block_modulus); + // if (num_blocks < num_bits_in_block) { + // auto scp_mem_ptr = mem_ptr->sum_ciphertexts_mem->scp_mem; + // host_propagate_single_carry(streams, gpu_indexes, gpu_count, + // radix_lwe_out, nullptr, nullptr, + // scp_mem_ptr, bsks, ksks, num_blocks); + // } else { + auto fast_scp_mem_ptr = mem_ptr->fast_sc_prop_mem; + uint32_t requested_flag = 0; + uint32_t uses_carry = 0; + host_fast_propagate_single_carry( + streams, gpu_indexes, gpu_count, radix_lwe_out, nullptr, nullptr, + fast_scp_mem_ptr, bsks, ksks, num_blocks, requested_flag, uses_carry); + //} } template diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/negation.cu b/backends/tfhe-cuda-backend/cuda/src/integer/negation.cu index e55ea9e912..d7d104fa33 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/negation.cu +++ b/backends/tfhe-cuda-backend/cuda/src/integer/negation.cu @@ -38,15 +38,7 @@ void cuda_integer_radix_overflowing_sub_kb_64( void const *radix_lwe_right, int8_t *mem_ptr, void *const *bsks, void *const *ksks, uint32_t num_blocks) { - auto mem = (int_overflowing_sub_memory *)mem_ptr; - - host_integer_overflowing_sub_kb( - (cudaStream_t *)(streams), gpu_indexes, gpu_count, - static_cast(radix_lwe_out), - static_cast(radix_lwe_overflowed), - static_cast(radix_lwe_left), - static_cast(radix_lwe_right), bsks, (uint64_t **)(ksks), - mem, num_blocks); + // auto mem = (int_overflowing_sub_memory *)mem_ptr; } void cleanup_cuda_integer_radix_overflowing_sub(void *const *streams, diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/negation.cuh 
b/backends/tfhe-cuda-backend/cuda/src/integer/negation.cuh index 6eda409df9..fb88efc685 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/negation.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/integer/negation.cuh @@ -91,7 +91,7 @@ __host__ void scratch_cuda_integer_overflowing_sub_kb( *mem_ptr = new int_overflowing_sub_memory( streams, gpu_indexes, gpu_count, params, num_blocks, allocate_gpu_memory); } - +/* template __host__ void host_integer_overflowing_sub_kb( cudaStream_t const *streams, uint32_t const *gpu_indexes, @@ -113,4 +113,39 @@ __host__ void host_integer_overflowing_sub_kb( mem_ptr, bsks, ksks, num_blocks); } +*/ +template +__host__ void host_integer_overflowing_sub( + cudaStream_t const *streams, uint32_t const *gpu_indexes, + uint32_t gpu_count, Torus *lwe_out_array, Torus *lhs_array, + const Torus *rhs_array, Torus *overflow_block, const Torus *input_borrow, + int_fast_borrow_prop_memory *mem_ptr, void *const *bsks, + Torus *const *ksks, uint32_t num_blocks, uint32_t compute_overflow, + uint32_t uses_input_borrow) { + + auto radix_params = mem_ptr->params; + + // We need to recalculate the num_groups, because on the division the number + // of num_blocks changes + uint32_t block_modulus = + radix_params.message_modulus * radix_params.carry_modulus; + uint32_t num_bits_in_block = std::log2(block_modulus); + uint32_t grouping_size = num_bits_in_block; + uint32_t num_groups = (num_blocks + grouping_size - 1) / grouping_size; + + auto stream = (cudaStream_t *)streams; + host_unchecked_sub_with_correcting_term( + stream[0], gpu_indexes[0], static_cast(lwe_out_array), + static_cast(lhs_array), static_cast(rhs_array), + radix_params.big_lwe_dimension, num_blocks, radix_params.message_modulus, + radix_params.carry_modulus, radix_params.message_modulus - 1); + + host_fast_borrow_propagate( + streams, gpu_indexes, gpu_count, static_cast(lwe_out_array), + static_cast(overflow_block), + static_cast(input_borrow), + (int_fast_borrow_prop_memory *)mem_ptr, bsks, (Torus **)(ksks), + num_blocks, num_groups, compute_overflow, uses_input_borrow); +} + #endif diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_mul.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_mul.cuh index 37a51006ae..d7c4e1f10c 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_mul.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_mul.cuh @@ -112,10 +112,24 @@ __host__ void host_integer_scalar_mul_radix( terms_degree, bsks, ksks, mem->sum_ciphertexts_vec_mem, num_radix_blocks, j, nullptr); - auto scp_mem_ptr = mem->sum_ciphertexts_vec_mem->scp_mem; - host_propagate_single_carry(streams, gpu_indexes, gpu_count, lwe_array, - nullptr, nullptr, scp_mem_ptr, bsks, ksks, - num_radix_blocks); + // uint32_t carry_modulus = message_modulus; + // uint32_t block_modulus = message_modulus * carry_modulus; + // uint32_t num_bits_in_block = std::log2(block_modulus); + // if (num_radix_blocks < num_bits_in_block) { + // auto scp_mem_ptr = mem->sum_ciphertexts_vec_mem->scp_mem; + // host_propagate_single_carry(streams, gpu_indexes, gpu_count, + // lwe_array, + // nullptr, nullptr, scp_mem_ptr, bsks, + // ksks, num_radix_blocks); + // } else { + auto fast_scp_mem_ptr = mem->fast_sc_prop_mem; + uint32_t requested_flag = 0; + uint32_t uses_carry = 0; + host_fast_propagate_single_carry( + streams, gpu_indexes, gpu_count, lwe_array, nullptr, nullptr, + fast_scp_mem_ptr, bsks, ksks, num_radix_blocks, requested_flag, + uses_carry); + //} } } diff --git 
a/backends/tfhe-cuda-backend/cuda/src/linearalgebra/addition.cu b/backends/tfhe-cuda-backend/cuda/src/linearalgebra/addition.cu index d3f47ad263..51dfc6d3ea 100644 --- a/backends/tfhe-cuda-backend/cuda/src/linearalgebra/addition.cu +++ b/backends/tfhe-cuda-backend/cuda/src/linearalgebra/addition.cu @@ -57,6 +57,27 @@ void cuda_add_lwe_ciphertext_vector_64(void *stream, uint32_t gpu_index, static_cast(lwe_array_in_2), input_lwe_dimension, input_lwe_ciphertext_count); } +// last block it is the packing lhs*message_modulus + rhs +void cuda_add_lwe_ciphertext_vector_64_with_packing( + void *stream, uint32_t gpu_index, void *lwe_array_out, + void const *lwe_array_in_1, void const *lwe_array_in_2, + uint32_t input_lwe_dimension, uint32_t input_lwe_ciphertext_count, + uint32_t message_modulus) { + + host_addition(static_cast(stream), gpu_index, + static_cast(lwe_array_out), + static_cast(lwe_array_in_1), + static_cast(lwe_array_in_2), + input_lwe_dimension, input_lwe_ciphertext_count - 1); + + host_pack_for_overflowing_ops( + static_cast(stream), gpu_index, + static_cast(lwe_array_out), + static_cast(lwe_array_in_1), + static_cast(lwe_array_in_2), input_lwe_dimension, + input_lwe_ciphertext_count, message_modulus); +} + /* * Perform the addition of a u32 input LWE ciphertext vector with a u32 * plaintext vector. See the equivalent operation on u64 data for more details. diff --git a/backends/tfhe-cuda-backend/cuda/src/linearalgebra/addition.cuh b/backends/tfhe-cuda-backend/cuda/src/linearalgebra/addition.cuh index 29e1f62689..8da077414d 100644 --- a/backends/tfhe-cuda-backend/cuda/src/linearalgebra/addition.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/linearalgebra/addition.cuh @@ -82,6 +82,45 @@ __host__ void host_addition(cudaStream_t stream, uint32_t gpu_index, T *output, check_cuda_error(cudaGetLastError()); } +template +__global__ void pack_for_overflowing_ops(T *output, T const *input_1, + T const *input_2, uint32_t num_entries, + uint32_t message_modulus) { + + int tid = threadIdx.x; + int index = blockIdx.x * blockDim.x + tid; + if (index < num_entries) { + // Here we take advantage of the wrapping behaviour of uint + output[index] = input_1[index] * message_modulus + input_2[index]; + } +} + +template +__host__ void host_pack_for_overflowing_ops(cudaStream_t stream, + uint32_t gpu_index, T *output, + T const *input_1, T const *input_2, + uint32_t input_lwe_dimension, + uint32_t input_lwe_ciphertext_count, + uint32_t message_modulus) { + + cudaSetDevice(gpu_index); + // lwe_size includes the presence of the body + // whereas lwe_dimension is the number of elements in the mask + int lwe_size = input_lwe_dimension + 1; + // Create a 1-dimensional grid of threads + int num_blocks = 0, num_threads = 0; + int num_entries = lwe_size; + getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads); + dim3 grid(num_blocks, 1, 1); + dim3 thds(num_threads, 1, 1); + + pack_for_overflowing_ops<<>>( + &output[input_lwe_ciphertext_count - 1], + &input_1[input_lwe_ciphertext_count - 1], + &input_2[input_lwe_ciphertext_count - 1], lwe_size, message_modulus); + check_cuda_error(cudaGetLastError()); +} + template __global__ void subtraction(T *output, T const *input_1, T const *input_2, uint32_t num_entries) { diff --git a/backends/tfhe-cuda-backend/src/bindings.rs b/backends/tfhe-cuda-backend/src/bindings.rs index 2a665498f3..ea12bb14c3 100644 --- a/backends/tfhe-cuda-backend/src/bindings.rs +++ b/backends/tfhe-cuda-backend/src/bindings.rs @@ -755,6 +755,102 @@ extern "C" { mem_ptr_void: *mut *mut 
i8, ); } +extern "C" { + pub fn scratch_cuda_fast_propagate_single_carry_kb_64_inplace( + streams: *const *mut ffi::c_void, + gpu_indexes: *const u32, + gpu_count: u32, + mem_ptr: *mut *mut i8, + glwe_dimension: u32, + polynomial_size: u32, + big_lwe_dimension: u32, + small_lwe_dimension: u32, + ks_level: u32, + ks_base_log: u32, + pbs_level: u32, + pbs_base_log: u32, + grouping_factor: u32, + num_blocks: u32, + message_modulus: u32, + carry_modulus: u32, + pbs_type: PBS_TYPE, + requested_flag: u32, + uses_carry: u32, + allocate_gpu_memory: bool, + ); +} +extern "C" { + pub fn cuda_fast_propagate_single_carry_kb_64_inplace( + streams: *const *mut ffi::c_void, + gpu_indexes: *const u32, + gpu_count: u32, + lwe_array: *mut ffi::c_void, + carry_out: *mut ffi::c_void, + carry_in: *const ffi::c_void, + mem_ptr: *mut i8, + bsks: *const *mut ffi::c_void, + ksks: *const *mut ffi::c_void, + num_blocks: u32, + requested_flag: u32, + uses_carry: u32, + ); +} +extern "C" { + pub fn cleanup_cuda_fast_propagate_single_carry( + streams: *const *mut ffi::c_void, + gpu_indexes: *const u32, + gpu_count: u32, + mem_ptr_void: *mut *mut i8, + ); +} +extern "C" { + pub fn scratch_cuda_integer_overflowing_sub_kb_64_inplace( + streams: *const *mut ffi::c_void, + gpu_indexes: *const u32, + gpu_count: u32, + mem_ptr: *mut *mut i8, + glwe_dimension: u32, + polynomial_size: u32, + big_lwe_dimension: u32, + small_lwe_dimension: u32, + ks_level: u32, + ks_base_log: u32, + pbs_level: u32, + pbs_base_log: u32, + grouping_factor: u32, + num_blocks: u32, + message_modulus: u32, + carry_modulus: u32, + pbs_type: PBS_TYPE, + compute_overflow: u32, + allocate_gpu_memory: bool, + ); +} +extern "C" { + pub fn cuda_integer_overflowing_sub_kb_64_inplace( + streams: *const *mut ffi::c_void, + gpu_indexes: *const u32, + gpu_count: u32, + lhs_array: *mut ffi::c_void, + rhs_array: *const ffi::c_void, + overflow_block: *mut ffi::c_void, + input_borrow: *const ffi::c_void, + mem_ptr: *mut i8, + bsks: *const *mut ffi::c_void, + ksks: *const *mut ffi::c_void, + num_blocks: u32, + compute_overflow: u32, + uses_input_borrow: u32, + ); +} +extern "C" { + pub fn cleanup_cuda_integer_overflowing_sub( + streams: *const *mut ffi::c_void, + gpu_indexes: *const u32, + gpu_count: u32, + mem_ptr_void: *mut *mut i8, + ); +} extern "C" { pub fn scratch_cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64( streams: *const *mut ffi::c_void, @@ -1144,6 +1240,18 @@ extern "C" { input_lwe_ciphertext_count: u32, ); } +extern "C" { + pub fn cuda_add_lwe_ciphertext_vector_64_with_packing( + stream: *mut ffi::c_void, + gpu_index: u32, + lwe_array_out: *mut ffi::c_void, + lwe_array_in_1: *const ffi::c_void, + lwe_array_in_2: *const ffi::c_void, + input_lwe_dimension: u32, + input_lwe_ciphertext_count: u32, + message_modulus: u32, + ); +} extern "C" { pub fn cuda_add_lwe_ciphertext_vector_plaintext_vector_32( stream: *mut ffi::c_void, diff --git a/tfhe/src/core_crypto/gpu/mod.rs b/tfhe/src/core_crypto/gpu/mod.rs index b25cde608f..3766119d45 100644 --- a/tfhe/src/core_crypto/gpu/mod.rs +++ b/tfhe/src/core_crypto/gpu/mod.rs @@ -458,6 +458,33 @@ pub unsafe fn add_lwe_ciphertext_vector_assign_async( ); } +/// Discarding addition of a vector of LWE ciphertexts +/// +/// # Safety +/// +/// [CudaStreams::synchronize] __must__ be called as soon as synchronization is +/// required +pub unsafe fn add_lwe_ciphertext_vector_with_packing_async( + streams: &CudaStreams, + lwe_array_out: &mut CudaVec, + lwe_array_in_1: &CudaVec, + lwe_array_in_2: &CudaVec, + lwe_dimension: 
LweDimension, + num_samples: u32, + message_modulus: u32, +) { + cuda_add_lwe_ciphertext_vector_64_with_packing( + streams.ptr[0], + streams.gpu_indexes[0], + lwe_array_out.as_mut_c_ptr(0), + lwe_array_in_1.as_c_ptr(0), + lwe_array_in_2.as_c_ptr(0), + lwe_dimension.0 as u32, + num_samples, + message_modulus, + ); +} + /// Discarding addition of a vector of LWE ciphertexts with a vector of plaintexts /// /// # Safety diff --git a/tfhe/src/integer/gpu/mod.rs b/tfhe/src/integer/gpu/mod.rs index 0e40674f4c..5cfd32677d 100644 --- a/tfhe/src/integer/gpu/mod.rs +++ b/tfhe/src/integer/gpu/mod.rs @@ -15,6 +15,7 @@ use crate::shortint::{CarryModulus, MessageModulus}; pub use server_key::CudaServerKey; use std::cmp::min; +use crate::integer::server_key::radix_parallel::OutputFlag; use tfhe_cuda_backend::bindings::*; use tfhe_cuda_backend::cuda_bind::*; @@ -466,6 +467,41 @@ pub unsafe fn unchecked_add_integer_radix_assign_async( ); } +#[allow(clippy::too_many_arguments)] +/// # Safety +/// +/// - [CudaStreams::synchronize] __must__ be called after this function as soon as synchronization +/// is required +pub unsafe fn unchecked_add_integer_radix_assign_with_packing_async( + streams: &CudaStreams, + radix_lwe_left: &mut CudaVec, + radix_lwe_right: &CudaVec, + lwe_dimension: LweDimension, + num_blocks: u32, + message_modulus: u32, +) { + assert_eq!( + streams.gpu_indexes[0], + radix_lwe_left.gpu_index(0), + "GPU error: all data should reside on the same GPU." + ); + assert_eq!( + streams.gpu_indexes[0], + radix_lwe_right.gpu_index(0), + "GPU error: all data should reside on the same GPU." + ); + cuda_add_lwe_ciphertext_vector_64_with_packing( + streams.ptr[0], + streams.gpu_indexes[0], + radix_lwe_left.as_mut_c_ptr(0), + radix_lwe_left.as_c_ptr(0), + radix_lwe_right.as_c_ptr(0), + lwe_dimension.0 as u32, + num_blocks, + message_modulus, + ); +} + #[allow(clippy::too_many_arguments)] /// # Safety /// @@ -1085,6 +1121,94 @@ pub unsafe fn propagate_single_carry_assign_async( + streams: &CudaStreams, + radix_lwe_input: &mut CudaVec, + carry_out: &mut CudaVec, + carry_in: &CudaVec, + bootstrapping_key: &CudaVec, + keyswitch_key: &CudaVec, + lwe_dimension: LweDimension, + glwe_dimension: GlweDimension, + polynomial_size: PolynomialSize, + ks_level: DecompositionLevelCount, + ks_base_log: DecompositionBaseLog, + pbs_level: DecompositionLevelCount, + pbs_base_log: DecompositionBaseLog, + num_blocks: u32, + message_modulus: MessageModulus, + carry_modulus: CarryModulus, + pbs_type: PBSType, + grouping_factor: LweBskGroupingFactor, + requested_flag: OutputFlag, + uses_carry: u32, +) { + assert_eq!( + streams.gpu_indexes[0], + radix_lwe_input.gpu_index(0), + "GPU error: all data should reside on the same GPU." + ); + assert_eq!( + streams.gpu_indexes[0], + bootstrapping_key.gpu_index(0), + "GPU error: all data should reside on the same GPU." + ); + assert_eq!( + streams.gpu_indexes[0], + keyswitch_key.gpu_index(0), + "GPU error: all data should reside on the same GPU." 
+ ); + let mut mem_ptr: *mut i8 = std::ptr::null_mut(); + let big_lwe_dimension: u32 = glwe_dimension.0 as u32 * polynomial_size.0 as u32; + scratch_cuda_fast_propagate_single_carry_kb_64_inplace( + streams.ptr.as_ptr(), + streams.gpu_indexes.as_ptr(), + streams.len() as u32, + std::ptr::addr_of_mut!(mem_ptr), + glwe_dimension.0 as u32, + polynomial_size.0 as u32, + big_lwe_dimension, + lwe_dimension.0 as u32, + ks_level.0 as u32, + ks_base_log.0 as u32, + pbs_level.0 as u32, + pbs_base_log.0 as u32, + grouping_factor.0 as u32, + num_blocks, + message_modulus.0 as u32, + carry_modulus.0 as u32, + pbs_type as u32, + requested_flag as u32, + uses_carry as u32, + true, + ); + cuda_fast_propagate_single_carry_kb_64_inplace( + streams.ptr.as_ptr(), + streams.gpu_indexes.as_ptr(), + streams.len() as u32, + radix_lwe_input.as_mut_c_ptr(0), + carry_out.as_mut_c_ptr(0), + carry_in.as_c_ptr(0), + mem_ptr, + bootstrapping_key.ptr.as_ptr(), + keyswitch_key.ptr.as_ptr(), + num_blocks, + requested_flag as u32, + uses_carry as u32, + ); + cleanup_cuda_fast_propagate_single_carry( + streams.ptr.as_ptr(), + streams.gpu_indexes.as_ptr(), + streams.len() as u32, + std::ptr::addr_of_mut!(mem_ptr), + ); +} + #[allow(clippy::too_many_arguments)] /// # Safety /// @@ -2139,107 +2263,107 @@ pub unsafe fn unchecked_partial_sum_ciphertexts_integer_radix_kb_assign_async< ); } -#[allow(clippy::too_many_arguments)] -/// # Safety -/// -/// - [CudaStreams::synchronize] __must__ be called after this function as soon as synchronization -/// is required -pub unsafe fn unchecked_unsigned_overflowing_sub_integer_radix_kb_assign_async< - T: UnsignedInteger, - B: Numeric, ->( - streams: &CudaStreams, - ct_res: &mut CudaVec, - ct_overflowed: &mut CudaVec, - lhs: &CudaVec, - rhs: &CudaVec, - bootstrapping_key: &CudaVec, - keyswitch_key: &CudaVec, - message_modulus: MessageModulus, - carry_modulus: CarryModulus, - glwe_dimension: GlweDimension, - polynomial_size: PolynomialSize, - big_lwe_dimension: LweDimension, - small_lwe_dimension: LweDimension, - ks_level: DecompositionLevelCount, - ks_base_log: DecompositionBaseLog, - pbs_level: DecompositionLevelCount, - pbs_base_log: DecompositionBaseLog, - num_blocks: u32, - pbs_type: PBSType, - grouping_factor: LweBskGroupingFactor, -) { - assert_eq!( - streams.gpu_indexes[0], - ct_res.gpu_index(0), - "GPU error: all data should reside on the same GPU." - ); - assert_eq!( - streams.gpu_indexes[0], - ct_overflowed.gpu_index(0), - "GPU error: all data should reside on the same GPU." - ); - assert_eq!( - streams.gpu_indexes[0], - lhs.gpu_index(0), - "GPU error: all data should reside on the same GPU." - ); - assert_eq!( - streams.gpu_indexes[0], - rhs.gpu_index(0), - "GPU error: all data should reside on the same GPU." - ); - assert_eq!( - streams.gpu_indexes[0], - bootstrapping_key.gpu_index(0), - "GPU error: all data should reside on the same GPU." - ); - assert_eq!( - streams.gpu_indexes[0], - keyswitch_key.gpu_index(0), - "GPU error: all data should reside on the same GPU." 
- ); - let mut mem_ptr: *mut i8 = std::ptr::null_mut(); - scratch_cuda_integer_radix_overflowing_sub_kb_64( - streams.ptr.as_ptr(), - streams.gpu_indexes.as_ptr(), - streams.len() as u32, - std::ptr::addr_of_mut!(mem_ptr), - glwe_dimension.0 as u32, - polynomial_size.0 as u32, - big_lwe_dimension.0 as u32, - small_lwe_dimension.0 as u32, - ks_level.0 as u32, - ks_base_log.0 as u32, - pbs_level.0 as u32, - pbs_base_log.0 as u32, - grouping_factor.0 as u32, - num_blocks, - message_modulus.0 as u32, - carry_modulus.0 as u32, - pbs_type as u32, - true, - ); - cuda_integer_radix_overflowing_sub_kb_64( - streams.ptr.as_ptr(), - streams.gpu_indexes.as_ptr(), - streams.len() as u32, - ct_res.as_mut_c_ptr(0), - ct_overflowed.as_mut_c_ptr(0), - lhs.as_c_ptr(0), - rhs.as_c_ptr(0), - mem_ptr, - bootstrapping_key.ptr.as_ptr(), - keyswitch_key.ptr.as_ptr(), - num_blocks, - ); - cleanup_cuda_integer_radix_overflowing_sub( - streams.ptr.as_ptr(), - streams.gpu_indexes.as_ptr(), - streams.len() as u32, - std::ptr::addr_of_mut!(mem_ptr), - ); -} +// #[allow(clippy::too_many_arguments)] +// /// # Safety +// /// +// /// - [CudaStreams::synchronize] __must__ be called after this function as soon as +// synchronization /// is required +// pub unsafe fn unchecked_unsigned_overflowing_sub_integer_radix_kb_assign_async< +// T: UnsignedInteger, +// B: Numeric, +// >( +// streams: &CudaStreams, +// ct_res: &mut CudaVec, +// ct_overflowed: &mut CudaVec, +// lhs: &CudaVec, +// rhs: &CudaVec, +// bootstrapping_key: &CudaVec, +// keyswitch_key: &CudaVec, +// message_modulus: MessageModulus, +// carry_modulus: CarryModulus, +// glwe_dimension: GlweDimension, +// polynomial_size: PolynomialSize, +// big_lwe_dimension: LweDimension, +// small_lwe_dimension: LweDimension, +// ks_level: DecompositionLevelCount, +// ks_base_log: DecompositionBaseLog, +// pbs_level: DecompositionLevelCount, +// pbs_base_log: DecompositionBaseLog, +// num_blocks: u32, +// pbs_type: PBSType, +// grouping_factor: LweBskGroupingFactor, +// ) { +// assert_eq!( +// streams.gpu_indexes[0], +// ct_res.gpu_index(0), +// "GPU error: all data should reside on the same GPU." +// ); +// assert_eq!( +// streams.gpu_indexes[0], +// ct_overflowed.gpu_index(0), +// "GPU error: all data should reside on the same GPU." +// ); +// assert_eq!( +// streams.gpu_indexes[0], +// lhs.gpu_index(0), +// "GPU error: all data should reside on the same GPU." +// ); +// assert_eq!( +// streams.gpu_indexes[0], +// rhs.gpu_index(0), +// "GPU error: all data should reside on the same GPU." +// ); +// assert_eq!( +// streams.gpu_indexes[0], +// bootstrapping_key.gpu_index(0), +// "GPU error: all data should reside on the same GPU." +// ); +// assert_eq!( +// streams.gpu_indexes[0], +// keyswitch_key.gpu_index(0), +// "GPU error: all data should reside on the same GPU." 
+// ); +// let mut mem_ptr: *mut i8 = std::ptr::null_mut(); +// scratch_cuda_integer_radix_overflowing_sub_kb_64( +// streams.ptr.as_ptr(), +// streams.gpu_indexes.as_ptr(), +// streams.len() as u32, +// std::ptr::addr_of_mut!(mem_ptr), +// glwe_dimension.0 as u32, +// polynomial_size.0 as u32, +// big_lwe_dimension.0 as u32, +// small_lwe_dimension.0 as u32, +// ks_level.0 as u32, +// ks_base_log.0 as u32, +// pbs_level.0 as u32, +// pbs_base_log.0 as u32, +// grouping_factor.0 as u32, +// num_blocks, +// message_modulus.0 as u32, +// carry_modulus.0 as u32, +// pbs_type as u32, +// true, +// ); +// cuda_integer_radix_overflowing_sub_kb_64( +// streams.ptr.as_ptr(), +// streams.gpu_indexes.as_ptr(), +// streams.len() as u32, +// ct_res.as_mut_c_ptr(0), +// ct_overflowed.as_mut_c_ptr(0), +// lhs.as_c_ptr(0), +// rhs.as_c_ptr(0), +// mem_ptr, +// bootstrapping_key.ptr.as_ptr(), +// keyswitch_key.ptr.as_ptr(), +// num_blocks, +// ); +// cleanup_cuda_integer_radix_overflowing_sub( +// streams.ptr.as_ptr(), +// streams.gpu_indexes.as_ptr(), +// streams.len() as u32, +// std::ptr::addr_of_mut!(mem_ptr), +// ); +// } #[allow(clippy::too_many_arguments)] /// # Safety @@ -2770,3 +2894,95 @@ pub unsafe fn reverse_blocks_inplace_async( ); } } + +#[allow(clippy::too_many_arguments)] +/// # Safety +/// +/// - [CudaStreams::synchronize] __must__ be called after this function as soon as synchronization +/// is required +pub(crate) unsafe fn unchecked_unsigned_overflowing_sub_integer_radix_kb_assign_async< + T: UnsignedInteger, + B: Numeric, +>( + streams: &CudaStreams, + radix_lwe_input: &mut CudaVec, + radix_rhs_input: &CudaVec, + carry_out: &mut CudaVec, + carry_in: &CudaVec, + bootstrapping_key: &CudaVec, + keyswitch_key: &CudaVec, + lwe_dimension: LweDimension, + glwe_dimension: GlweDimension, + polynomial_size: PolynomialSize, + ks_level: DecompositionLevelCount, + ks_base_log: DecompositionBaseLog, + pbs_level: DecompositionLevelCount, + pbs_base_log: DecompositionBaseLog, + num_blocks: u32, + message_modulus: MessageModulus, + carry_modulus: CarryModulus, + pbs_type: PBSType, + grouping_factor: LweBskGroupingFactor, + compute_overflow: bool, + uses_input_borrow: u32, +) { + assert_eq!( + streams.gpu_indexes[0], + radix_lwe_input.gpu_index(0), + "GPU error: all data should reside on the same GPU." + ); + assert_eq!( + streams.gpu_indexes[0], + bootstrapping_key.gpu_index(0), + "GPU error: all data should reside on the same GPU." + ); + assert_eq!( + streams.gpu_indexes[0], + keyswitch_key.gpu_index(0), + "GPU error: all data should reside on the same GPU." 
+ ); + let mut mem_ptr: *mut i8 = std::ptr::null_mut(); + let big_lwe_dimension: u32 = glwe_dimension.0 as u32 * polynomial_size.0 as u32; + scratch_cuda_integer_overflowing_sub_kb_64_inplace( + streams.ptr.as_ptr(), + streams.gpu_indexes.as_ptr(), + streams.len() as u32, + std::ptr::addr_of_mut!(mem_ptr), + glwe_dimension.0 as u32, + polynomial_size.0 as u32, + big_lwe_dimension, + lwe_dimension.0 as u32, + ks_level.0 as u32, + ks_base_log.0 as u32, + pbs_level.0 as u32, + pbs_base_log.0 as u32, + grouping_factor.0 as u32, + num_blocks, + message_modulus.0 as u32, + carry_modulus.0 as u32, + pbs_type as u32, + compute_overflow as u32, + true, + ); + cuda_integer_overflowing_sub_kb_64_inplace( + streams.ptr.as_ptr(), + streams.gpu_indexes.as_ptr(), + streams.len() as u32, + radix_lwe_input.as_mut_c_ptr(0), + radix_rhs_input.as_c_ptr(0), + carry_out.as_mut_c_ptr(0), + carry_in.as_c_ptr(0), + mem_ptr, + bootstrapping_key.ptr.as_ptr(), + keyswitch_key.ptr.as_ptr(), + num_blocks, + compute_overflow as u32, + uses_input_borrow as u32, + ); + cleanup_cuda_integer_overflowing_sub( + streams.ptr.as_ptr(), + streams.gpu_indexes.as_ptr(), + streams.len() as u32, + std::ptr::addr_of_mut!(mem_ptr), + ); +} diff --git a/tfhe/src/integer/gpu/server_key/radix/add.rs b/tfhe/src/integer/gpu/server_key/radix/add.rs index 3d746230a6..cbf6c12777 100644 --- a/tfhe/src/integer/gpu/server_key/radix/add.rs +++ b/tfhe/src/integer/gpu/server_key/radix/add.rs @@ -8,9 +8,11 @@ use crate::integer::gpu::ciphertext::{ use crate::integer::gpu::server_key::{CudaBootstrappingKey, CudaServerKey}; use crate::integer::gpu::{ unchecked_add_integer_radix_assign_async, + unchecked_add_integer_radix_assign_with_packing_async, unchecked_partial_sum_ciphertexts_integer_radix_kb_assign_async, unchecked_signed_overflowing_add_or_sub_radix_kb_assign_async, PBSType, }; +use crate::integer::server_key::radix_parallel::OutputFlag; use crate::shortint::ciphertext::NoiseLevel; #[derive(Copy, Clone, PartialEq, Eq)] @@ -115,7 +117,8 @@ impl CudaServerKey { } }; self.unchecked_add_assign_async(lhs, rhs, streams); - let _carry = self.propagate_single_carry_assign_async(lhs, streams); + let _carry = + self.new_propagate_single_carry_assign_async(lhs, streams, None, OutputFlag::None); } pub fn add_assign( @@ -173,6 +176,18 @@ impl CudaServerKey { result } + pub fn unchecked_add_with_packing( + &self, + ct_left: &T, + ct_right: &T, + streams: &CudaStreams, + ) -> T { + let mut result = unsafe { ct_left.duplicate_async(streams) }; + self.unchecked_add_assign_with_packing(&mut result, ct_right, streams); + + result + } + /// # Safety /// /// - `stream` __must__ be synchronized to guarantee computation has finished, and inputs must @@ -215,6 +230,49 @@ impl CudaServerKey { ciphertext_left.info = ciphertext_left.info.after_add(&ciphertext_right.info); } + /// # Safety + /// + /// - `stream` __must__ be synchronized to guarantee computation has finished, and inputs must + /// not be dropped until stream is synchronised + pub unsafe fn unchecked_add_assign_with_packing_async( + &self, + ct_left: &mut T, + ct_right: &T, + streams: &CudaStreams, + ) { + let ciphertext_left = ct_left.as_mut(); + let ciphertext_right = ct_right.as_ref(); + assert_eq!( + ciphertext_left.d_blocks.lwe_dimension(), + ciphertext_right.d_blocks.lwe_dimension(), + "Mismatched lwe dimension between ct_left ({:?}) and ct_right ({:?})", + ciphertext_left.d_blocks.lwe_dimension(), + ciphertext_right.d_blocks.lwe_dimension() + ); + + assert_eq!( + 
ciphertext_left.d_blocks.ciphertext_modulus(), + ciphertext_right.d_blocks.ciphertext_modulus(), + "Mismatched moduli between ct_left ({:?}) and ct_right ({:?})", + ciphertext_left.d_blocks.ciphertext_modulus(), + ciphertext_right.d_blocks.ciphertext_modulus() + ); + + let lwe_dimension = ciphertext_left.d_blocks.lwe_dimension(); + let lwe_ciphertext_count = ciphertext_left.d_blocks.lwe_ciphertext_count(); + + unchecked_add_integer_radix_assign_with_packing_async( + streams, + &mut ciphertext_left.d_blocks.0.d_vec, + &ciphertext_right.d_blocks.0.d_vec, + lwe_dimension, + lwe_ciphertext_count.0 as u32, + self.message_modulus.0 as u32, + ); + + ciphertext_left.info = ciphertext_left.info.after_add(&ciphertext_right.info); + } + pub fn unchecked_add_assign( &self, ct_left: &mut T, @@ -227,6 +285,18 @@ impl CudaServerKey { streams.synchronize(); } + pub fn unchecked_add_assign_with_packing( + &self, + ct_left: &mut T, + ct_right: &T, + streams: &CudaStreams, + ) { + unsafe { + self.unchecked_add_assign_with_packing_async(ct_left, ct_right, streams); + } + streams.synchronize(); + } + /// # Safety /// /// - `stream` __must__ be synchronized to guarantee computation has finished, and inputs must @@ -348,7 +418,7 @@ impl CudaServerKey { .unchecked_partial_sum_ciphertexts_async(ciphertexts, streams) .unwrap(); - self.propagate_single_carry_assign_async(&mut result, streams); + self.new_propagate_single_carry_assign_async(&mut result, streams, None, OutputFlag::None); assert!(result.block_carries_are_empty()); result } @@ -535,8 +605,53 @@ impl CudaServerKey { rhs: &CudaUnsignedRadixCiphertext, stream: &CudaStreams, ) -> (CudaUnsignedRadixCiphertext, CudaBooleanBlock) { - let mut ct_res = self.unchecked_add(lhs, rhs, stream); - let mut carry_out = self.propagate_single_carry_assign_async(&mut ct_res, stream); + let output_flag = OutputFlag::from_signedness(CudaUnsignedRadixCiphertext::IS_SIGNED); + + let mut ct_res = match output_flag { + OutputFlag::Overflow => self.unchecked_add_with_packing(lhs, rhs, stream), + _ => self.unchecked_add(lhs, rhs, stream), + }; + + let mut carry_out = + self.new_propagate_single_carry_assign_async(&mut ct_res, stream, None, output_flag); + + ct_res.as_mut().info = ct_res + .as_ref() + .info + .after_overflowing_add(&rhs.as_ref().info); + + if lhs.as_ref().info.blocks.last().unwrap().noise_level == NoiseLevel::ZERO + && rhs.as_ref().info.blocks.last().unwrap().noise_level == NoiseLevel::ZERO + { + carry_out.as_mut().info = carry_out.as_ref().info.boolean_info(NoiseLevel::ZERO); + } else { + carry_out.as_mut().info = carry_out.as_ref().info.boolean_info(NoiseLevel::NOMINAL); + } + + let ct_overflowed = CudaBooleanBlock::from_cuda_radix_ciphertext(carry_out.ciphertext); + + (ct_res, ct_overflowed) + } + + /// # Safety + /// + /// - `stream` __must__ be synchronized to guarantee computation has finished, and inputs must + /// not be dropped until stream is synchronised + pub unsafe fn unchecked_signed_overflowing_add_async( + &self, + lhs: &CudaSignedRadixCiphertext, + rhs: &CudaSignedRadixCiphertext, + stream: &CudaStreams, + ) -> (CudaSignedRadixCiphertext, CudaBooleanBlock) { + let output_flag = OutputFlag::from_signedness(CudaSignedRadixCiphertext::IS_SIGNED); + + let mut ct_res = match output_flag { + OutputFlag::Overflow => self.unchecked_add_with_packing(lhs, rhs, stream), + _ => self.unchecked_add(lhs, rhs, stream), + }; + + let mut carry_out = + self.new_propagate_single_carry_assign_async(&mut ct_res, stream, None, output_flag); ct_res.as_mut().info = ct_res 
.as_ref() @@ -661,6 +776,7 @@ impl CudaServerKey { SignedOperation::Addition, stream, ) + //} } pub(crate) fn unchecked_signed_overflowing_add_or_sub( diff --git a/tfhe/src/integer/gpu/server_key/radix/mod.rs b/tfhe/src/integer/gpu/server_key/radix/mod.rs index 32daab51ce..7ed82650f2 100644 --- a/tfhe/src/integer/gpu/server_key/radix/mod.rs +++ b/tfhe/src/integer/gpu/server_key/radix/mod.rs @@ -6,6 +6,7 @@ use crate::core_crypto::prelude::{ ContiguousEntityContainerMut, LweBskGroupingFactor, LweCiphertextCount, }; use crate::integer::block_decomposition::{BlockDecomposer, DecomposableInto}; +use crate::integer::gpu::ciphertext::boolean_value::CudaBooleanBlock; use crate::integer::gpu::ciphertext::info::{CudaBlockInfo, CudaRadixCiphertextInfo}; use crate::integer::gpu::ciphertext::{ CudaIntegerRadixCiphertext, CudaRadixCiphertext, CudaSignedRadixCiphertext, @@ -14,9 +15,10 @@ use crate::integer::gpu::ciphertext::{ use crate::integer::gpu::server_key::CudaBootstrappingKey; use crate::integer::gpu::{ apply_many_univariate_lut_kb_async, apply_univariate_lut_kb_async, full_propagate_assign_async, - propagate_single_carry_assign_async, propagate_single_carry_get_input_carries_assign_async, - CudaServerKey, PBSType, + propagate_fast_single_carry_assign_async, propagate_single_carry_assign_async, + propagate_single_carry_get_input_carries_assign_async, CudaServerKey, PBSType, }; +use crate::integer::server_key::radix_parallel::OutputFlag; use crate::shortint::ciphertext::{Degree, NoiseLevel}; use crate::shortint::engine::{fill_accumulator, fill_many_lut_accumulator}; use crate::shortint::server_key::{ @@ -264,6 +266,110 @@ impl CudaServerKey { carry_out } + /// # Safety + /// + /// - `streams` __must__ be synchronized to guarantee computation has finished, and inputs must + /// not be dropped until streams is synchronized + pub(crate) unsafe fn propagate_fast_single_carry_assign_async( + &self, + ct: &mut T, + streams: &CudaStreams, + input_carry: Option<&CudaBooleanBlock>, + requested_flag: OutputFlag, + ) -> T + where + T: CudaIntegerRadixCiphertext, + { + let mut carry_out: T = self.create_trivial_zero_radix(1, streams); + let ciphertext = ct.as_mut(); + let num_blocks = ciphertext.d_blocks.lwe_ciphertext_count().0 as u32; + let uses_carry = match input_carry { + Some(_block) => 1u32, + None => 0u32, + }; + let mut aux_block: T = self.create_trivial_zero_radix(1, streams); + let in_carry_dvec = match input_carry { + Some(block) => &block.0.ciphertext.d_blocks.0.d_vec, + None => &aux_block.as_mut().d_blocks.0.d_vec, + }; + + match &self.bootstrapping_key { + CudaBootstrappingKey::Classic(d_bsk) => { + propagate_fast_single_carry_assign_async( + streams, + &mut ciphertext.d_blocks.0.d_vec, + &mut carry_out.as_mut().d_blocks.0.d_vec, + in_carry_dvec, + &d_bsk.d_vec, + &self.key_switching_key.d_vec, + d_bsk.input_lwe_dimension(), + d_bsk.glwe_dimension(), + d_bsk.polynomial_size(), + self.key_switching_key.decomposition_level_count(), + self.key_switching_key.decomposition_base_log(), + d_bsk.decomp_level_count(), + d_bsk.decomp_base_log(), + num_blocks, + ciphertext.info.blocks.first().unwrap().message_modulus, + ciphertext.info.blocks.first().unwrap().carry_modulus, + PBSType::Classical, + LweBskGroupingFactor(0), + requested_flag, + uses_carry, + ); + } + CudaBootstrappingKey::MultiBit(d_multibit_bsk) => { + propagate_fast_single_carry_assign_async( + streams, + &mut ciphertext.d_blocks.0.d_vec, + &mut carry_out.as_mut().d_blocks.0.d_vec, + in_carry_dvec, + &d_multibit_bsk.d_vec, + 
diff --git a/tfhe/src/integer/gpu/server_key/radix/neg.rs b/tfhe/src/integer/gpu/server_key/radix/neg.rs
index d7156919cf..77c7ec2762 100644
--- a/tfhe/src/integer/gpu/server_key/radix/neg.rs
+++ b/tfhe/src/integer/gpu/server_key/radix/neg.rs
@@ -1,6 +1,7 @@
 use crate::core_crypto::gpu::{negate_integer_radix_async, CudaStreams};
 use crate::integer::gpu::ciphertext::CudaIntegerRadixCiphertext;
 use crate::integer::gpu::server_key::CudaServerKey;
+use crate::integer::server_key::radix_parallel::OutputFlag;

 impl CudaServerKey {
     /// Homomorphically computes the opposite of a ciphertext encrypting an integer message.
@@ -144,7 +145,8 @@ impl CudaServerKey {
         };

         let mut res = self.unchecked_neg_async(ct, streams);
-        let _carry = self.propagate_single_carry_assign_async(&mut res, streams);
+        let _carry =
+            self.new_propagate_single_carry_assign_async(&mut res, streams, None, OutputFlag::None);
         res
     }
 }
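In `neg.rs` the returned carry is simply discarded (`OutputFlag::None`): negation leaves blocks that can exceed the message modulus, and one propagation pass renormalises them. A clear-text sketch of radix negation as digit-wise complement plus a single carry propagation (a model of the semantics only, not the kernel's actual per-block trick):

/// -x over n radix blocks: complement every digit and add one, which yields
/// B^n - x, then flush the carries once (the carry out of the last block is
/// dropped, matching `OutputFlag::None` above).
fn neg_blocks(blocks: &[u64], message_modulus: u64) -> Vec<u64> {
    let mut result: Vec<u64> = blocks.iter().map(|&b| message_modulus - 1 - b).collect();
    result[0] += 1;

    let mut carry = 0;
    for block in result.iter_mut() {
        let sum = *block + carry;
        *block = sum % message_modulus;
        carry = sum / message_modulus;
    }
    result
}

fn main() {
    // Over 4 blocks of 2 bits (base 4, little-endian): -1 is 255 = [3, 3, 3, 3].
    assert_eq!(neg_blocks(&[1, 0, 0, 0], 4), vec![3, 3, 3, 3]);
    // -0 stays 0: the +1 carry flushes all the way out and is discarded.
    assert_eq!(neg_blocks(&[0, 0, 0, 0], 4), vec![0, 0, 0, 0]);
}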
diff --git a/tfhe/src/integer/gpu/server_key/radix/scalar_add.rs b/tfhe/src/integer/gpu/server_key/radix/scalar_add.rs
index 7c30c789bb..2384c94359 100644
--- a/tfhe/src/integer/gpu/server_key/radix/scalar_add.rs
+++ b/tfhe/src/integer/gpu/server_key/radix/scalar_add.rs
@@ -8,6 +8,7 @@ use crate::integer::gpu::ciphertext::{
 };
 use crate::integer::gpu::scalar_addition_integer_radix_assign_async;
 use crate::integer::gpu::server_key::CudaServerKey;
+use crate::integer::server_key::radix_parallel::OutputFlag;
 use crate::prelude::CastInto;
 use crate::shortint::ciphertext::NoiseLevel;

@@ -186,7 +187,8 @@ impl CudaServerKey {
         };

         self.unchecked_scalar_add_assign_async(ct, scalar, streams);
-        let _carry = self.propagate_single_carry_assign_async(ct, streams);
+        let _carry =
+            self.new_propagate_single_carry_assign_async(ct, streams, None, OutputFlag::None);
     }

     pub fn scalar_add_assign<Scalar, T>(&self, ct: &mut T, scalar: Scalar, streams: &CudaStreams)
diff --git a/tfhe/src/integer/gpu/server_key/radix/scalar_sub.rs b/tfhe/src/integer/gpu/server_key/radix/scalar_sub.rs
index d117927343..34161ab9c5 100644
--- a/tfhe/src/integer/gpu/server_key/radix/scalar_sub.rs
+++ b/tfhe/src/integer/gpu/server_key/radix/scalar_sub.rs
@@ -4,6 +4,7 @@ use crate::integer::block_decomposition::{BlockDecomposer, DecomposableInto};
 use crate::integer::gpu::ciphertext::boolean_value::CudaBooleanBlock;
 use crate::integer::gpu::ciphertext::{CudaIntegerRadixCiphertext, CudaSignedRadixCiphertext};
 use crate::integer::gpu::server_key::CudaServerKey;
+use crate::integer::server_key::radix_parallel::OutputFlag;
 use crate::integer::server_key::TwosComplementNegation;
 use crate::prelude::CastInto;

@@ -151,7 +152,8 @@ impl CudaServerKey {
         };

         self.unchecked_scalar_sub_assign_async(ct, scalar, stream);
-        let _carry = self.propagate_single_carry_assign_async(ct, stream);
+        let _carry =
+            self.new_propagate_single_carry_assign_async(ct, stream, None, OutputFlag::None);
     }

     pub fn scalar_sub_assign<Scalar, T>(&self, ct: &mut T, scalar: Scalar, stream: &CudaStreams)
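The scalar paths follow the same pattern as the ciphertext ones: an unchecked block-wise operation followed by exactly one carry propagation with `OutputFlag::None`. On clear values the sequence looks like this (illustrative sketch; the real scalar paths must additionally respect the carry space allowed by the parameters):

/// Decompose the scalar into radix digits, add them block-wise without
/// propagation (the "unchecked" step), then run a single carry-propagation
/// pass so every block fits below `message_modulus` again.
fn scalar_add_assign_blocks(blocks: &mut [u64], scalar: u64, message_modulus: u64) {
    let mut remaining = scalar;
    for block in blocks.iter_mut() {
        *block += remaining % message_modulus;
        remaining /= message_modulus;
    }
    // Single carry propagation; the final carry is discarded (OutputFlag::None).
    let mut carry = 0;
    for block in blocks.iter_mut() {
        let sum = *block + carry;
        *block = sum % message_modulus;
        carry = sum / message_modulus;
    }
}

fn main() {
    let mut blocks = [3, 2, 0, 0]; // 11 in base 4, little-endian
    scalar_add_assign_blocks(&mut blocks, 7, 4); // 11 + 7 = 18
    assert_eq!(blocks, [2, 0, 1, 0]);
}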
diff --git a/tfhe/src/integer/gpu/server_key/radix/sub.rs b/tfhe/src/integer/gpu/server_key/radix/sub.rs
index 8e784a3686..5964a22331 100644
--- a/tfhe/src/integer/gpu/server_key/radix/sub.rs
+++ b/tfhe/src/integer/gpu/server_key/radix/sub.rs
@@ -1,18 +1,18 @@
 use super::add::SignedOperation;
-use crate::core_crypto::gpu::lwe_ciphertext_list::CudaLweCiphertextList;
 use crate::core_crypto::gpu::CudaStreams;
-use crate::core_crypto::prelude::{CiphertextModulus, LweBskGroupingFactor, LweCiphertextCount};
 use crate::integer::gpu::ciphertext::boolean_value::CudaBooleanBlock;
-use crate::integer::gpu::ciphertext::info::CudaRadixCiphertextInfo;
 use crate::integer::gpu::ciphertext::{
-    CudaIntegerRadixCiphertext, CudaRadixCiphertext, CudaSignedRadixCiphertext,
-    CudaUnsignedRadixCiphertext,
+    CudaIntegerRadixCiphertext, CudaSignedRadixCiphertext, CudaUnsignedRadixCiphertext,
 };
-use crate::integer::gpu::server_key::{CudaBootstrappingKey, CudaServerKey};
+use crate::integer::gpu::server_key::CudaServerKey;
+
+use crate::integer::gpu::server_key::CudaBootstrappingKey;
 use crate::integer::gpu::{
     unchecked_unsigned_overflowing_sub_integer_radix_kb_assign_async, PBSType,
 };
+use crate::integer::server_key::radix_parallel::OutputFlag;
 use crate::shortint::ciphertext::NoiseLevel;
+use crate::shortint::parameters::{Degree, LweBskGroupingFactor};

 impl CudaServerKey {
     /// Computes homomorphically a subtraction between two ciphertexts encrypting integer values.
@@ -272,7 +272,8 @@ impl CudaServerKey {
         };

         self.unchecked_sub_assign_async(lhs, rhs, streams);
-        let _carry = self.propagate_single_carry_assign_async(lhs, streams);
+        let _carry =
+            self.new_propagate_single_carry_assign_async(lhs, streams, None, OutputFlag::None);
     }

     pub fn unsigned_overflowing_sub(
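The hunk that follows rewires `unchecked_unsigned_overflowing_sub_async` so the kernel itself produces the overflow block (`compute_overflow`) and can consume an input borrow (`uses_input_borrow`), instead of assembling a boolean block on the Rust side. For reference, the clear-text semantics being delegated (illustrative only, not the GPU implementation):

/// Borrow propagation over radix blocks with an optional input borrow; the
/// borrow leaving the last block is the unsigned overflow indicator.
fn overflowing_sub_blocks(
    lhs: &[u64],
    rhs: &[u64],
    message_modulus: u64,
    input_borrow: bool,
) -> (Vec<u64>, bool) {
    assert_eq!(lhs.len(), rhs.len());
    let mut result = Vec::with_capacity(lhs.len());
    let mut borrow = u64::from(input_borrow);
    for (&a, &b) in lhs.iter().zip(rhs.iter()) {
        let needed = b + borrow;
        if a >= needed {
            result.push(a - needed);
            borrow = 0;
        } else {
            result.push(a + message_modulus - needed);
            borrow = 1;
        }
    }
    (result, borrow != 0) // borrow out of the last block <=> overflow
}

fn main() {
    // 2 - 3 on 4 blocks of 2 bits wraps around and reports an overflow.
    let (diff, overflowed) = overflowing_sub_blocks(&[2, 0, 0, 0], &[3, 0, 0, 0], 4, false);
    assert_eq!(diff, [3, 3, 3, 3]); // 255, i.e. -1 mod 256
    assert!(overflowed);
}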
@@ -353,87 +354,104 @@ impl CudaServerKey {
         rhs: &CudaUnsignedRadixCiphertext,
         stream: &CudaStreams,
     ) -> (CudaUnsignedRadixCiphertext, CudaBooleanBlock) {
-        let num_blocks = lhs.as_ref().d_blocks.lwe_ciphertext_count().0 as u32;
-        let mut tmp: CudaUnsignedRadixCiphertext = self.create_trivial_zero_radix(1, stream);
-        if lhs.as_ref().info.blocks.last().unwrap().noise_level == NoiseLevel::ZERO
-            && rhs.as_ref().info.blocks.last().unwrap().noise_level == NoiseLevel::ZERO
-        {
-            tmp.as_mut().info = tmp.as_ref().info.boolean_info(NoiseLevel::ZERO);
-        } else {
-            tmp.as_mut().info = tmp.as_ref().info.boolean_info(NoiseLevel::NOMINAL);
-        }
         let mut ct_res = lhs.duplicate_async(stream);
-        let block = CudaLweCiphertextList::new(
-            tmp.as_ref().d_blocks.lwe_dimension(),
-            LweCiphertextCount(1),
-            CiphertextModulus::new_native(),
-            stream,
-        );
-        let block_info = tmp.as_ref().info.blocks[0];
-        let ct_info = vec![block_info];
-        let ct_info = CudaRadixCiphertextInfo { blocks: ct_info };
-        let mut ct_overflowed =
-            CudaBooleanBlock::from_cuda_radix_ciphertext(CudaRadixCiphertext::new(block, ct_info));
+        let compute_overflow = true;
+        const INPUT_BORROW: Option<&CudaBooleanBlock> = None;
+
+        let mut overflow_block: CudaUnsignedRadixCiphertext =
+            self.create_trivial_zero_radix(1, stream);
+        let ciphertext = ct_res.as_mut();
+        let num_blocks = ciphertext.d_blocks.lwe_ciphertext_count().0 as u32;
+        let uses_input_borrow = match INPUT_BORROW {
+            Some(_block) => 1u32,
+            None => 0u32,
+        };
+        let mut aux_block: CudaUnsignedRadixCiphertext = self.create_trivial_zero_radix(1, stream);
+        let in_carry_dvec = match INPUT_BORROW {
+            Some(block) => &block.0.ciphertext.d_blocks.0.d_vec,
+            None => &aux_block.as_mut().d_blocks.0.d_vec,
+        };

         match &self.bootstrapping_key {
             CudaBootstrappingKey::Classic(d_bsk) => {
                 unchecked_unsigned_overflowing_sub_integer_radix_kb_assign_async(
                     stream,
-                    &mut ct_res.as_mut().d_blocks.0.d_vec,
-                    &mut ct_overflowed.as_mut().ciphertext.d_blocks.0.d_vec,
-                    &lhs.as_ref().d_blocks.0.d_vec,
+                    &mut ciphertext.d_blocks.0.d_vec,
                     &rhs.as_ref().d_blocks.0.d_vec,
+                    &mut overflow_block.as_mut().d_blocks.0.d_vec,
+                    in_carry_dvec,
                     &d_bsk.d_vec,
                     &self.key_switching_key.d_vec,
-                    self.message_modulus,
-                    self.carry_modulus,
-                    d_bsk.glwe_dimension,
-                    d_bsk.polynomial_size,
-                    self.key_switching_key
-                        .input_key_lwe_size()
-                        .to_lwe_dimension(),
-                    self.key_switching_key
-                        .output_key_lwe_size()
-                        .to_lwe_dimension(),
+                    d_bsk.input_lwe_dimension(),
+                    d_bsk.glwe_dimension(),
+                    d_bsk.polynomial_size(),
                     self.key_switching_key.decomposition_level_count(),
                     self.key_switching_key.decomposition_base_log(),
-                    d_bsk.decomp_level_count,
-                    d_bsk.decomp_base_log,
+                    d_bsk.decomp_level_count(),
+                    d_bsk.decomp_base_log(),
                     num_blocks,
+                    ciphertext.info.blocks.first().unwrap().message_modulus,
+                    ciphertext.info.blocks.first().unwrap().carry_modulus,
                     PBSType::Classical,
                     LweBskGroupingFactor(0),
+                    compute_overflow,
+                    uses_input_borrow,
                 );
             }
             CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
                 unchecked_unsigned_overflowing_sub_integer_radix_kb_assign_async(
                     stream,
-                    &mut ct_res.as_mut().d_blocks.0.d_vec,
-                    &mut ct_overflowed.as_mut().ciphertext.d_blocks.0.d_vec,
-                    &lhs.as_ref().d_blocks.0.d_vec,
+                    &mut ciphertext.d_blocks.0.d_vec,
                     &rhs.as_ref().d_blocks.0.d_vec,
+                    &mut overflow_block.as_mut().d_blocks.0.d_vec,
+                    in_carry_dvec,
                     &d_multibit_bsk.d_vec,
                     &self.key_switching_key.d_vec,
-                    self.message_modulus,
-                    self.carry_modulus,
-                    d_multibit_bsk.glwe_dimension,
-                    d_multibit_bsk.polynomial_size,
-                    self.key_switching_key
-                        .input_key_lwe_size()
-                        .to_lwe_dimension(),
-                    self.key_switching_key
-                        .output_key_lwe_size()
-                        .to_lwe_dimension(),
+                    d_multibit_bsk.input_lwe_dimension(),
+                    d_multibit_bsk.glwe_dimension(),
+                    d_multibit_bsk.polynomial_size(),
                     self.key_switching_key.decomposition_level_count(),
                     self.key_switching_key.decomposition_base_log(),
-                    d_multibit_bsk.decomp_level_count,
-                    d_multibit_bsk.decomp_base_log,
+                    d_multibit_bsk.decomp_level_count(),
+                    d_multibit_bsk.decomp_base_log(),
                     num_blocks,
+                    ciphertext.info.blocks.first().unwrap().message_modulus,
+                    ciphertext.info.blocks.first().unwrap().carry_modulus,
                     PBSType::MultiBit,
                     d_multibit_bsk.grouping_factor,
+                    compute_overflow,
+                    uses_input_borrow,
                 );
             }
         };
+        ciphertext.info.blocks.iter_mut().for_each(|b| {
+            b.degree = Degree::new(b.message_modulus.0 - 1);
+            b.noise_level = NoiseLevel::NOMINAL;
+        });
+        overflow_block
+            .as_mut()
+            .info
+            .blocks
+            .iter_mut()
+            .for_each(|b| {
+                b.degree = Degree::new(1);
+                b.noise_level = NoiseLevel::ZERO;
+            });
+
+        if lhs.as_ref().info.blocks.last().unwrap().noise_level == NoiseLevel::ZERO
+            && rhs.as_ref().info.blocks.last().unwrap().noise_level == NoiseLevel::ZERO
+        {
+            overflow_block.as_mut().info =
+                overflow_block.as_ref().info.boolean_info(NoiseLevel::ZERO);
+        } else {
+            overflow_block.as_mut().info = overflow_block
+                .as_ref()
+                .info
+                .boolean_info(NoiseLevel::NOMINAL);
+        }
+
+        let ct_overflowed = CudaBooleanBlock::from_cuda_radix_ciphertext(overflow_block.ciphertext);

         ct_res.as_mut().info = ct_res
             .as_ref()
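The metadata updates at the end of the new path encode simple post-conditions: after propagation every block fits in the message space again (degree at most `message_modulus - 1`), the overflow block is boolean (degree 1), and it can only be tagged `NoiseLevel::ZERO` when both operands' most significant blocks were trivial. A minimal clear-text mirror of those checks (illustrative only; the boolean flags stand in for the noise-level test on the operands' last blocks):

fn check_outputs(
    result: &[u64],
    overflow: u64,
    message_modulus: u64,
    lhs_top_is_trivial: bool,
    rhs_top_is_trivial: bool,
) -> bool {
    assert!(result.iter().all(|&b| b < message_modulus)); // degree <= message_modulus - 1
    assert!(overflow <= 1); // the overflow block is boolean (degree 1)
    // Returns whether the overflow flag may be marked as noiseless.
    lhs_top_is_trivial && rhs_top_is_trivial
}

fn main() {
    assert!(check_outputs(&[3, 3, 3, 3], 1, 4, true, true));
    assert!(!check_outputs(&[0, 0, 0, 0], 0, 4, true, false));
}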