diff --git a/backends/tfhe-cuda-backend/cuda/include/integer.h b/backends/tfhe-cuda-backend/cuda/include/integer.h index 99860fd1c7..9c2bfacdfa 100644 --- a/backends/tfhe-cuda-backend/cuda/include/integer.h +++ b/backends/tfhe-cuda-backend/cuda/include/integer.h @@ -112,10 +112,11 @@ void cuda_integer_mult_radix_ciphertext_kb_64( void cleanup_cuda_integer_mult(void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr_void); -void cuda_negate_integer_radix_ciphertext_64_inplace( - void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, void *lwe_array, - uint32_t lwe_dimension, uint32_t lwe_ciphertext_count, - uint32_t message_modulus, uint32_t carry_modulus); +void cuda_negate_integer_radix_ciphertext_64( + void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, + void *lwe_array_out, void *lwe_array_in, uint32_t lwe_dimension, + uint32_t lwe_ciphertext_count, uint32_t message_modulus, + uint32_t carry_modulus); void cuda_scalar_addition_integer_radix_ciphertext_64_inplace( void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, void *lwe_array, diff --git a/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cu b/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cu index 4f6761d6c6..20897b141b 100644 --- a/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cu +++ b/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cu @@ -10,7 +10,7 @@ void cuda_keyswitch_lwe_ciphertext_vector_32( void *lwe_output_indexes, void *lwe_array_in, void *lwe_input_indexes, void *ksk, uint32_t lwe_dimension_in, uint32_t lwe_dimension_out, uint32_t base_log, uint32_t level_count, uint32_t num_samples) { - host_keyswitch_lwe_ciphertext_vector( + host_keyswitch_lwe_ciphertext_vector( static_cast(stream), gpu_index, static_cast(lwe_array_out), static_cast(lwe_output_indexes), @@ -40,7 +40,7 @@ void cuda_keyswitch_lwe_ciphertext_vector_64( void *lwe_output_indexes, void *lwe_array_in, void *lwe_input_indexes, void *ksk, uint32_t lwe_dimension_in, uint32_t lwe_dimension_out, uint32_t base_log, uint32_t level_count, uint32_t num_samples) { - host_keyswitch_lwe_ciphertext_vector( + host_keyswitch_lwe_ciphertext_vector( static_cast(stream), gpu_index, static_cast(lwe_array_out), static_cast(lwe_output_indexes), @@ -66,7 +66,7 @@ void cuda_packing_keyswitch_lwe_list_to_glwe_64( uint32_t output_glwe_dimension, uint32_t output_polynomial_size, uint32_t base_log, uint32_t level_count, uint32_t num_lwes) { - host_packing_keyswitch_lwe_list_to_glwe( + host_packing_keyswitch_lwe_list_to_glwe( static_cast(stream), gpu_index, static_cast(glwe_array_out), static_cast(lwe_array_in), diff --git a/backends/tfhe-cuda-backend/cuda/src/device.cu b/backends/tfhe-cuda-backend/cuda/src/device.cu index eb881fedcd..3c520092f2 100644 --- a/backends/tfhe-cuda-backend/cuda/src/device.cu +++ b/backends/tfhe-cuda-backend/cuda/src/device.cu @@ -177,8 +177,8 @@ void cuda_set_value_async(cudaStream_t stream, uint32_t gpu_index, int num_blocks = (n + block_size - 1) / block_size; // Launch the kernel - cuda_set_value_kernel<<>>(d_array, value, - n); + cuda_set_value_kernel + <<>>(d_array, value, n); check_cuda_error(cudaGetLastError()); } } diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/addition.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/addition.cuh index e44cd0ad60..daf90239d3 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/addition.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/integer/addition.cuh @@ -37,12 +37,12 @@ void host_resolve_signed_overflow( streams[0], gpu_indexes[0], x, 
last_block_output_carry, d_clears, mem->params.big_lwe_dimension, 1); - host_addition(streams[0], gpu_indexes[0], last_block_inner_propagation, - last_block_inner_propagation, x, mem->params.big_lwe_dimension, - 1); - host_addition(streams[0], gpu_indexes[0], last_block_inner_propagation, - last_block_inner_propagation, last_block_input_carry, - mem->params.big_lwe_dimension, 1); + host_addition(streams[0], gpu_indexes[0], last_block_inner_propagation, + last_block_inner_propagation, x, + mem->params.big_lwe_dimension, 1); + host_addition(streams[0], gpu_indexes[0], last_block_inner_propagation, + last_block_inner_propagation, last_block_input_carry, + mem->params.big_lwe_dimension, 1); host_apply_univariate_lut_kb(streams, gpu_indexes, gpu_count, result, last_block_inner_propagation, @@ -94,14 +94,14 @@ __host__ void host_integer_signed_overflowing_add_or_sub_kb( // phase 1 if (op == SIGNED_OPERATION::ADDITION) { - host_addition(streams[0], gpu_indexes[0], result, lhs, rhs, - big_lwe_dimension, num_blocks); + host_addition(streams[0], gpu_indexes[0], result, lhs, rhs, + big_lwe_dimension, num_blocks); } else { - host_integer_radix_negation( + host_integer_radix_negation( streams, gpu_indexes, gpu_count, neg_rhs, rhs, big_lwe_dimension, num_blocks, radix_params.message_modulus, radix_params.carry_modulus); - host_addition(streams[0], gpu_indexes[0], result, lhs, neg_rhs, - big_lwe_dimension, num_blocks); + host_addition(streams[0], gpu_indexes[0], result, lhs, neg_rhs, + big_lwe_dimension, num_blocks); } // phase 2 @@ -109,10 +109,10 @@ __host__ void host_integer_signed_overflowing_add_or_sub_kb( cuda_synchronize_stream(streams[j], gpu_indexes[j]); } - host_propagate_single_carry(mem_ptr->sub_streams_1, gpu_indexes, gpu_count, - result, output_carry, input_carries, - mem_ptr->scp_mem, bsks, ksks, num_blocks); - host_generate_last_block_inner_propagation( + host_propagate_single_carry( + mem_ptr->sub_streams_1, gpu_indexes, gpu_count, result, output_carry, + input_carries, mem_ptr->scp_mem, bsks, ksks, num_blocks); + host_generate_last_block_inner_propagation( mem_ptr->sub_streams_2, gpu_indexes, gpu_count, last_block_inner_propagation, &lhs[(num_blocks - 1) * big_lwe_size], &rhs[(num_blocks - 1) * big_lwe_size], mem_ptr->las_block_prop_mem, bsks, @@ -126,7 +126,7 @@ __host__ void host_integer_signed_overflowing_add_or_sub_kb( // phase 3 auto input_carry = &input_carries[(num_blocks - 1) * big_lwe_size]; - host_resolve_signed_overflow( + host_resolve_signed_overflow( streams, gpu_indexes, gpu_count, overflowed, last_block_inner_propagation, input_carry, output_carry, mem_ptr->resolve_overflow_mem, bsks, ksks); diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/cmux.cu b/backends/tfhe-cuda-backend/cuda/src/integer/cmux.cu index 28f685b6e4..68a501eac6 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/cmux.cu +++ b/backends/tfhe-cuda-backend/cuda/src/integer/cmux.cu @@ -17,7 +17,7 @@ void scratch_cuda_integer_radix_cmux_kb_64( std::function predicate_lut_f = [](uint64_t x) -> uint64_t { return x == 1; }; - scratch_cuda_integer_radix_cmux_kb( + scratch_cuda_integer_radix_cmux_kb( (cudaStream_t *)(streams), gpu_indexes, gpu_count, (int_cmux_buffer **)mem_ptr, predicate_lut_f, lwe_ciphertext_count, params, allocate_gpu_memory); diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/cmux.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/cmux.cuh index 3082d5998c..9b72407f17 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/cmux.cuh +++ 
b/backends/tfhe-cuda-backend/cuda/src/integer/cmux.cuh @@ -27,10 +27,11 @@ __host__ void zero_out_if(cudaStream_t *streams, uint32_t *gpu_indexes, auto lwe_array_out_block = tmp_lwe_array_input + i * big_lwe_size; auto lwe_array_input_block = lwe_array_input + i * big_lwe_size; - device_pack_bivariate_blocks<<>>( - lwe_array_out_block, predicate->lwe_indexes_in, lwe_array_input_block, - lwe_condition, predicate->lwe_indexes_in, params.big_lwe_dimension, - params.message_modulus, 1); + device_pack_bivariate_blocks + <<>>( + lwe_array_out_block, predicate->lwe_indexes_in, + lwe_array_input_block, lwe_condition, predicate->lwe_indexes_in, + params.big_lwe_dimension, params.message_modulus, 1); check_cuda_error(cudaGetLastError()); } @@ -57,13 +58,15 @@ __host__ void host_integer_radix_cmux_kb( } auto mem_true = mem_ptr->zero_if_true_buffer; - zero_out_if(true_streams, gpu_indexes, gpu_count, mem_ptr->tmp_true_ct, - lwe_array_true, lwe_condition, mem_true, - mem_ptr->inverted_predicate_lut, bsks, ksks, num_radix_blocks); + zero_out_if(true_streams, gpu_indexes, gpu_count, mem_ptr->tmp_true_ct, + lwe_array_true, lwe_condition, mem_true, + mem_ptr->inverted_predicate_lut, bsks, ksks, + num_radix_blocks); auto mem_false = mem_ptr->zero_if_false_buffer; - zero_out_if(false_streams, gpu_indexes, gpu_count, mem_ptr->tmp_false_ct, - lwe_array_false, lwe_condition, mem_false, mem_ptr->predicate_lut, - bsks, ksks, num_radix_blocks); + zero_out_if(false_streams, gpu_indexes, gpu_count, + mem_ptr->tmp_false_ct, lwe_array_false, lwe_condition, + mem_false, mem_ptr->predicate_lut, bsks, ksks, + num_radix_blocks); for (uint j = 0; j < mem_ptr->zero_if_true_buffer->active_gpu_count; j++) { cuda_synchronize_stream(true_streams[j], gpu_indexes[j]); } @@ -75,9 +78,9 @@ __host__ void host_integer_radix_cmux_kb( // will be 0 If the condition was false, true_ct will be 0 and false_ct will // have kept its value auto added_cts = mem_ptr->tmp_true_ct; - host_addition(streams[0], gpu_indexes[0], added_cts, mem_ptr->tmp_true_ct, - mem_ptr->tmp_false_ct, params.big_lwe_dimension, - num_radix_blocks); + host_addition(streams[0], gpu_indexes[0], added_cts, + mem_ptr->tmp_true_ct, mem_ptr->tmp_false_ct, + params.big_lwe_dimension, num_radix_blocks); integer_radix_apply_univariate_lookup_table_kb( streams, gpu_indexes, gpu_count, lwe_array_out, added_cts, bsks, ksks, diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/comparison.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/comparison.cuh index d56d6e3297..ed2a3bbef5 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/comparison.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/integer/comparison.cuh @@ -43,7 +43,7 @@ __host__ void accumulate_all_blocks(cudaStream_t stream, uint32_t gpu_index, int num_entries = (lwe_dimension + 1); getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads); // Add all blocks and store in sum - device_accumulate_all_blocks<<>>( + device_accumulate_all_blocks<<>>( output, input, lwe_dimension, num_radix_blocks); check_cuda_error(cudaGetLastError()); } @@ -62,7 +62,6 @@ __host__ void are_all_comparisons_block_true( int_comparison_buffer *mem_ptr, void **bsks, Torus **ksks, uint32_t num_radix_blocks) { - cudaSetDevice(gpu_indexes[0]); auto params = mem_ptr->params; auto big_lwe_dimension = params.big_lwe_dimension; auto glwe_dimension = params.glwe_dimension; @@ -96,8 +95,9 @@ __host__ void are_all_comparisons_block_true( auto is_equal_to_num_blocks_map = &are_all_block_true_buffer->is_equal_to_lut_map; for (int i = 0; i < 
num_chunks; i++) { - accumulate_all_blocks(streams[0], gpu_indexes[0], accumulator, - input_blocks, big_lwe_dimension, chunk_length); + accumulate_all_blocks(streams[0], gpu_indexes[0], accumulator, + input_blocks, big_lwe_dimension, + chunk_length); accumulator += (big_lwe_dimension + 1); remaining_blocks -= (chunk_length - 1); @@ -165,7 +165,6 @@ __host__ void is_at_least_one_comparisons_block_true( int_comparison_buffer *mem_ptr, void **bsks, Torus **ksks, uint32_t num_radix_blocks) { - cudaSetDevice(gpu_indexes[0]); auto params = mem_ptr->params; auto big_lwe_dimension = params.big_lwe_dimension; auto message_modulus = params.message_modulus; @@ -192,8 +191,9 @@ __host__ void is_at_least_one_comparisons_block_true( auto input_blocks = mem_ptr->tmp_lwe_array_out; auto accumulator = buffer->tmp_block_accumulated; for (int i = 0; i < num_chunks; i++) { - accumulate_all_blocks(streams[0], gpu_indexes[0], accumulator, - input_blocks, big_lwe_dimension, chunk_length); + accumulate_all_blocks(streams[0], gpu_indexes[0], accumulator, + input_blocks, big_lwe_dimension, + chunk_length); accumulator += (big_lwe_dimension + 1); remaining_blocks -= (chunk_length - 1); @@ -280,8 +280,8 @@ __host__ void host_compare_with_zero_equality( uint32_t chunk_size = std::min(remainder_blocks, num_elements_to_fill_carry); - accumulate_all_blocks(streams[0], gpu_indexes[0], sum_i, chunk, - big_lwe_dimension, chunk_size); + accumulate_all_blocks(streams[0], gpu_indexes[0], sum_i, chunk, + big_lwe_dimension, chunk_size); num_sum_blocks++; remainder_blocks -= (chunk_size - 1); @@ -295,8 +295,9 @@ __host__ void host_compare_with_zero_equality( integer_radix_apply_univariate_lookup_table_kb( streams, gpu_indexes, gpu_count, sum, sum, bsks, ksks, num_sum_blocks, zero_comparison); - are_all_comparisons_block_true(streams, gpu_indexes, gpu_count, lwe_array_out, - sum, mem_ptr, bsks, ksks, num_sum_blocks); + are_all_comparisons_block_true(streams, gpu_indexes, gpu_count, + lwe_array_out, sum, mem_ptr, bsks, ksks, + num_sum_blocks); } template @@ -310,7 +311,7 @@ __host__ void host_integer_radix_equality_check_kb( // Applies the LUT for the comparison operation auto comparisons = mem_ptr->tmp_block_comparisons; - integer_radix_apply_bivariate_lookup_table_kb( + integer_radix_apply_bivariate_lookup_table_kb( streams, gpu_indexes, gpu_count, comparisons, lwe_array_1, lwe_array_2, bsks, ksks, num_radix_blocks, eq_buffer->operator_lut, eq_buffer->operator_lut->params.message_modulus); @@ -319,9 +320,9 @@ __host__ void host_integer_radix_equality_check_kb( // // It returns a block encrypting 1 if all input blocks are 1 // otherwise the block encrypts 0 - are_all_comparisons_block_true(streams, gpu_indexes, gpu_count, lwe_array_out, - comparisons, mem_ptr, bsks, ksks, - num_radix_blocks); + are_all_comparisons_block_true(streams, gpu_indexes, gpu_count, + lwe_array_out, comparisons, mem_ptr, + bsks, ksks, num_radix_blocks); } template @@ -352,19 +353,20 @@ compare_radix_blocks_kb(cudaStream_t *streams, uint32_t *gpu_indexes, // Subtract // Here we need the true lwe sub, not the one that comes from shortint. 
- host_subtraction(streams[0], gpu_indexes[0], lwe_array_out, lwe_array_left, - lwe_array_right, big_lwe_dimension, num_radix_blocks); + host_subtraction(streams[0], gpu_indexes[0], lwe_array_out, + lwe_array_left, lwe_array_right, big_lwe_dimension, + num_radix_blocks); // Apply LUT to compare to 0 auto is_non_zero_lut = mem_ptr->eq_buffer->is_non_zero_lut; - integer_radix_apply_univariate_lookup_table_kb( + integer_radix_apply_univariate_lookup_table_kb( streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_out, bsks, ksks, num_radix_blocks, is_non_zero_lut); // Add one // Here Lhs can have the following values: (-1) % (message modulus * carry // modulus), 0, 1 So the output values after the addition will be: 0, 1, 2 - host_integer_radix_add_scalar_one_inplace( + host_integer_radix_add_scalar_one_inplace( streams, gpu_indexes, gpu_count, lwe_array_out, big_lwe_dimension, num_radix_blocks, message_modulus, carry_modulus); } @@ -406,8 +408,8 @@ tree_sign_reduction(cudaStream_t *streams, uint32_t *gpu_indexes, auto inner_tree_leaf = tree_buffer->tree_inner_leaf_lut; while (partial_block_count > 2) { - pack_blocks(streams[0], gpu_indexes[0], y, x, big_lwe_dimension, - partial_block_count, 4); + pack_blocks(streams[0], gpu_indexes[0], y, x, big_lwe_dimension, + partial_block_count, 4); integer_radix_apply_univariate_lookup_table_kb( streams, gpu_indexes, gpu_count, x, y, bsks, ksks, @@ -433,8 +435,8 @@ tree_sign_reduction(cudaStream_t *streams, uint32_t *gpu_indexes, std::function f; if (partial_block_count == 2) { - pack_blocks(streams[0], gpu_indexes[0], y, x, big_lwe_dimension, - partial_block_count, 4); + pack_blocks(streams[0], gpu_indexes[0], y, x, big_lwe_dimension, + partial_block_count, 4); f = [block_selector_f, sign_handler_f](Torus x) -> Torus { int msb = (x >> 2) & 3; @@ -454,9 +456,9 @@ tree_sign_reduction(cudaStream_t *streams, uint32_t *gpu_indexes, last_lut->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]); // Last leaf - integer_radix_apply_univariate_lookup_table_kb(streams, gpu_indexes, - gpu_count, lwe_array_out, y, - bsks, ksks, 1, last_lut); + integer_radix_apply_univariate_lookup_table_kb( + streams, gpu_indexes, gpu_count, lwe_array_out, y, bsks, ksks, 1, + last_lut); } template @@ -488,19 +490,21 @@ __host__ void host_integer_radix_difference_check_kb( if (mem_ptr->is_signed) { packed_num_radix_blocks -= 2; } - pack_blocks(streams[0], gpu_indexes[0], packed_left, lwe_array_left, - big_lwe_dimension, packed_num_radix_blocks, message_modulus); - pack_blocks(streams[0], gpu_indexes[0], packed_right, lwe_array_right, - big_lwe_dimension, packed_num_radix_blocks, message_modulus); + pack_blocks(streams[0], gpu_indexes[0], packed_left, lwe_array_left, + big_lwe_dimension, packed_num_radix_blocks, + message_modulus); + pack_blocks(streams[0], gpu_indexes[0], packed_right, + lwe_array_right, big_lwe_dimension, + packed_num_radix_blocks, message_modulus); // From this point we have half number of blocks packed_num_radix_blocks /= 2; // Clean noise auto identity_lut = mem_ptr->identity_lut; - integer_radix_apply_univariate_lookup_table_kb( + integer_radix_apply_univariate_lookup_table_kb( streams, gpu_indexes, gpu_count, packed_left, packed_left, bsks, ksks, packed_num_radix_blocks, identity_lut); - integer_radix_apply_univariate_lookup_table_kb( + integer_radix_apply_univariate_lookup_table_kb( streams, gpu_indexes, gpu_count, packed_right, packed_right, bsks, ksks, packed_num_radix_blocks, identity_lut); @@ -517,16 +521,17 @@ __host__ void 
host_integer_radix_difference_check_kb( if (!mem_ptr->is_signed) { // Compare packed blocks, or simply the total number of radix blocks in the // inputs - compare_radix_blocks_kb(streams, gpu_indexes, gpu_count, comparisons, lhs, - rhs, mem_ptr, bsks, ksks, packed_num_radix_blocks); + compare_radix_blocks_kb(streams, gpu_indexes, gpu_count, comparisons, + lhs, rhs, mem_ptr, bsks, ksks, + packed_num_radix_blocks); num_comparisons = packed_num_radix_blocks; } else { // Packing is possible if (carry_modulus >= message_modulus) { // Compare (num_radix_blocks - 2) / 2 packed blocks - compare_radix_blocks_kb(streams, gpu_indexes, gpu_count, comparisons, lhs, - rhs, mem_ptr, bsks, ksks, - packed_num_radix_blocks); + compare_radix_blocks_kb(streams, gpu_indexes, gpu_count, + comparisons, lhs, rhs, mem_ptr, bsks, ksks, + packed_num_radix_blocks); // Compare the last block before the sign block separately auto identity_lut = mem_ptr->identity_lut; @@ -535,21 +540,21 @@ __host__ void host_integer_radix_difference_check_kb( Torus *last_right_block_before_sign_block = diff_buffer->tmp_packed_right + packed_num_radix_blocks * big_lwe_size; - integer_radix_apply_univariate_lookup_table_kb( + integer_radix_apply_univariate_lookup_table_kb( streams, gpu_indexes, gpu_count, last_left_block_before_sign_block, lwe_array_left + (num_radix_blocks - 2) * big_lwe_size, bsks, ksks, 1, identity_lut); - integer_radix_apply_univariate_lookup_table_kb( + integer_radix_apply_univariate_lookup_table_kb( streams, gpu_indexes, gpu_count, last_right_block_before_sign_block, lwe_array_right + (num_radix_blocks - 2) * big_lwe_size, bsks, ksks, 1, identity_lut); - compare_radix_blocks_kb( + compare_radix_blocks_kb( streams, gpu_indexes, gpu_count, comparisons + packed_num_radix_blocks * big_lwe_size, last_left_block_before_sign_block, last_right_block_before_sign_block, mem_ptr, bsks, ksks, 1); // Compare the sign block separately - integer_radix_apply_bivariate_lookup_table_kb( + integer_radix_apply_bivariate_lookup_table_kb( streams, gpu_indexes, gpu_count, comparisons + (packed_num_radix_blocks + 1) * big_lwe_size, lwe_array_left + (num_radix_blocks - 1) * big_lwe_size, @@ -558,11 +563,11 @@ __host__ void host_integer_radix_difference_check_kb( num_comparisons = packed_num_radix_blocks + 2; } else { - compare_radix_blocks_kb(streams, gpu_indexes, gpu_count, comparisons, - lwe_array_left, lwe_array_right, mem_ptr, bsks, - ksks, num_radix_blocks - 1); + compare_radix_blocks_kb( + streams, gpu_indexes, gpu_count, comparisons, lwe_array_left, + lwe_array_right, mem_ptr, bsks, ksks, num_radix_blocks - 1); // Compare the sign block separately - integer_radix_apply_bivariate_lookup_table_kb( + integer_radix_apply_bivariate_lookup_table_kb( streams, gpu_indexes, gpu_count, comparisons + (num_radix_blocks - 1) * big_lwe_size, lwe_array_left + (num_radix_blocks - 1) * big_lwe_size, @@ -575,9 +580,9 @@ __host__ void host_integer_radix_difference_check_kb( // Reduces a vec containing radix blocks that encrypts a sign // (inferior, equal, superior) to one single radix block containing the // final sign - tree_sign_reduction(streams, gpu_indexes, gpu_count, lwe_array_out, - comparisons, mem_ptr->diff_buffer->tree_buffer, - reduction_lut_f, bsks, ksks, num_comparisons); + tree_sign_reduction(streams, gpu_indexes, gpu_count, lwe_array_out, + comparisons, mem_ptr->diff_buffer->tree_buffer, + reduction_lut_f, bsks, ksks, num_comparisons); } template @@ -601,16 +606,16 @@ host_integer_radix_maxmin_kb(cudaStream_t *streams, uint32_t 
*gpu_indexes, Torus **ksks, uint32_t total_num_radix_blocks) { // Compute the sign - host_integer_radix_difference_check_kb( + host_integer_radix_difference_check_kb( streams, gpu_indexes, gpu_count, mem_ptr->tmp_lwe_array_out, lwe_array_left, lwe_array_right, mem_ptr, mem_ptr->identity_lut_f, bsks, ksks, total_num_radix_blocks); // Selector - host_integer_radix_cmux_kb(streams, gpu_indexes, gpu_count, lwe_array_out, - mem_ptr->tmp_lwe_array_out, lwe_array_left, - lwe_array_right, mem_ptr->cmux_buffer, bsks, ksks, - total_num_radix_blocks); + host_integer_radix_cmux_kb( + streams, gpu_indexes, gpu_count, lwe_array_out, + mem_ptr->tmp_lwe_array_out, lwe_array_left, lwe_array_right, + mem_ptr->cmux_buffer, bsks, ksks, total_num_radix_blocks); } #endif diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cu b/backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cu index 841041f27b..43c35dc435 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cu +++ b/backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cu @@ -14,7 +14,7 @@ void scratch_cuda_integer_compress_radix_ciphertext_64( lwe_dimension, ks_level, ks_base_log, 0, 0, 0, message_modulus, carry_modulus); - scratch_cuda_compress_integer_radix_ciphertext_64( + scratch_cuda_compress_integer_radix_ciphertext( (cudaStream_t *)(streams), gpu_indexes, gpu_count, (int_compression **)mem_ptr, num_lwes, compression_params, lwe_per_glwe, storage_log_modulus, allocate_gpu_memory); @@ -39,7 +39,7 @@ void scratch_cuda_integer_decompress_radix_ciphertext_64( lwe_dimension, compression_glwe_dimension * compression_polynomial_size, 0, 0, pbs_level, pbs_base_log, 0, message_modulus, carry_modulus); - scratch_cuda_integer_decompress_radix_ciphertext_64( + scratch_cuda_integer_decompress_radix_ciphertext( (cudaStream_t *)(streams), gpu_indexes, gpu_count, (int_decompression **)mem_ptr, num_lwes, body_count, encryption_params, compression_params, storage_log_modulus, diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cuh index 26bd6befed..164d1e908c 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cuh @@ -54,8 +54,8 @@ __host__ void host_pack(cudaStream_t stream, uint32_t gpu_index, dim3 grid(num_blocks); dim3 threads(num_threads); - pack<<>>(array_out, array_in, log_modulus, in_len, - len); + pack<<>>(array_out, array_in, log_modulus, + in_len, len); } template @@ -70,10 +70,10 @@ __host__ void host_integer_compress(cudaStream_t *streams, // Shift auto lwe_shifted = mem_ptr->tmp_lwe; - host_cleartext_multiplication(streams[0], gpu_indexes[0], lwe_shifted, - lwe_array_in, - (uint64_t)compression_params.message_modulus, - input_lwe_dimension, num_lwes); + host_cleartext_multiplication( + streams[0], gpu_indexes[0], lwe_shifted, lwe_array_in, + (uint64_t)compression_params.message_modulus, input_lwe_dimension, + num_lwes); uint32_t lwe_in_size = input_lwe_dimension + 1; uint32_t glwe_out_size = (compression_params.glwe_dimension + 1) * @@ -88,7 +88,7 @@ __host__ void host_integer_compress(cudaStream_t *streams, auto lwe_subset = lwe_shifted + i * lwe_in_size; auto glwe_out = tmp_glwe_array_out + i * glwe_out_size; - host_packing_keyswitch_lwe_list_to_glwe( + host_packing_keyswitch_lwe_list_to_glwe( streams[0], gpu_indexes[0], glwe_out, lwe_subset, fp_ksk[0], 
fp_ks_buffer, input_lwe_dimension, compression_params.glwe_dimension, compression_params.polynomial_size, compression_params.ks_base_log, @@ -96,16 +96,16 @@ __host__ void host_integer_compress(cudaStream_t *streams, } // Modulus switch - host_modulus_switch_inplace(streams[0], gpu_indexes[0], tmp_glwe_array_out, - num_glwes * - (compression_params.glwe_dimension * - compression_params.polynomial_size + - body_count), - mem_ptr->storage_log_modulus); + host_modulus_switch_inplace( + streams[0], gpu_indexes[0], tmp_glwe_array_out, + num_glwes * (compression_params.glwe_dimension * + compression_params.polynomial_size + + body_count), + mem_ptr->storage_log_modulus); check_cuda_error(cudaGetLastError()); - host_pack(streams[0], gpu_indexes[0], glwe_array_out, tmp_glwe_array_out, - num_glwes, body_count, mem_ptr); + host_pack(streams[0], gpu_indexes[0], glwe_array_out, + tmp_glwe_array_out, num_glwes, body_count, mem_ptr); } template @@ -167,8 +167,8 @@ __host__ void host_extract(cudaStream_t stream, uint32_t gpu_index, getNumBlocksAndThreads(initial_out_len, 128, num_blocks, num_threads); dim3 grid(num_blocks); dim3 threads(num_threads); - extract<<>>(glwe_array_out, array_in, glwe_index, - log_modulus, initial_out_len); + extract<<>>( + glwe_array_out, array_in, glwe_index, log_modulus, initial_out_len); check_cuda_error(cudaGetLastError()); } @@ -182,8 +182,8 @@ host_integer_decompress(cudaStream_t *streams, uint32_t *gpu_indexes, auto extracted_glwe = mem_ptr->tmp_extracted_glwe; auto compression_params = mem_ptr->compression_params; - host_extract(streams[0], gpu_indexes[0], extracted_glwe, packed_glwe_in, 0, - mem_ptr); + host_extract(streams[0], gpu_indexes[0], extracted_glwe, + packed_glwe_in, 0, mem_ptr); auto num_lwes = mem_ptr->num_lwes; @@ -252,7 +252,7 @@ host_integer_decompress(cudaStream_t *streams, uint32_t *gpu_indexes, } template -__host__ void scratch_cuda_compress_integer_radix_ciphertext_64( +__host__ void scratch_cuda_compress_integer_radix_ciphertext( cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count, int_compression **mem_ptr, uint32_t num_lwes, int_radix_params compression_params, uint32_t lwe_per_glwe, @@ -264,7 +264,7 @@ __host__ void scratch_cuda_compress_integer_radix_ciphertext_64( } template -__host__ void scratch_cuda_integer_decompress_radix_ciphertext_64( +__host__ void scratch_cuda_integer_decompress_radix_ciphertext( cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count, int_decompression **mem_ptr, uint32_t num_lwes, uint32_t body_count, int_radix_params encryption_params, int_radix_params compression_params, diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/div_rem.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/div_rem.cuh index e2b95a0203..9e68cb408d 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/div_rem.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/integer/div_rem.cuh @@ -282,7 +282,7 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes, // Shift the mask so that we will only keep bits we should uint32_t shifted_mask = full_message_mask >> shift_amount; - integer_radix_apply_univariate_lookup_table_kb( + integer_radix_apply_univariate_lookup_table_kb( streams, gpu_indexes, gpu_count, interesting_divisor.last_block(), interesting_divisor.last_block(), bsks, ksks, 1, mem_ptr->masking_luts_1[shifted_mask]); @@ -310,7 +310,7 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes, // the estimated degree of the output is < msg_modulus shifted_mask = shifted_mask & 
full_message_mask; - integer_radix_apply_univariate_lookup_table_kb( + integer_radix_apply_univariate_lookup_table_kb( streams, gpu_indexes, gpu_count, divisor_ms_blocks.first_block(), divisor_ms_blocks.first_block(), bsks, ksks, 1, mem_ptr->masking_luts_2[shifted_mask]); @@ -334,7 +334,7 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes, interesting_remainder1.insert(0, numerator_block_1.first_block(), streams[0], gpu_indexes[0]); - host_integer_radix_logical_scalar_shift_kb_inplace( + host_integer_radix_logical_scalar_shift_kb_inplace( streams, gpu_indexes, gpu_count, interesting_remainder1.data, 1, mem_ptr->shift_mem_1, bsks, ksks, interesting_remainder1.len); @@ -342,7 +342,7 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes, interesting_remainder1.len - 1, streams[0], gpu_indexes[0]); - host_radix_blocks_rotate_left( + host_radix_blocks_rotate_left( streams, gpu_indexes, gpu_count, interesting_remainder1.data, tmp_radix.data, 1, interesting_remainder1.len, big_lwe_size); @@ -363,7 +363,7 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes, auto left_shift_interesting_remainder2 = [&](cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count) { - host_integer_radix_logical_scalar_shift_kb_inplace( + host_integer_radix_logical_scalar_shift_kb_inplace( streams, gpu_indexes, gpu_count, interesting_remainder2.data, 1, mem_ptr->shift_mem_2, bsks, ksks, interesting_remainder2.len); }; // left_shift_interesting_remainder2 @@ -396,10 +396,10 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes, // but in that position, interesting_remainder2 always has a 0 auto &merged_interesting_remainder = interesting_remainder1; - host_addition(streams[0], gpu_indexes[0], merged_interesting_remainder.data, - merged_interesting_remainder.data, - interesting_remainder2.data, radix_params.big_lwe_dimension, - merged_interesting_remainder.len); + host_addition( + streams[0], gpu_indexes[0], merged_interesting_remainder.data, + merged_interesting_remainder.data, interesting_remainder2.data, + radix_params.big_lwe_dimension, merged_interesting_remainder.len); // after create_clean_version_of_merged_remainder // `merged_interesting_remainder` will be reused as @@ -439,7 +439,7 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes, // We could call unchecked_scalar_ne // But we are in the special case where scalar == 0 // So we can skip some stuff - host_compare_with_zero_equality( + host_compare_with_zero_equality( streams, gpu_indexes, gpu_count, tmp_1.data, trivial_blocks.data, mem_ptr->comparison_buffer, bsks, ksks, trivial_blocks.len, mem_ptr->comparison_buffer->eq_buffer->is_non_zero_lut); @@ -447,7 +447,7 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes, tmp_1.len = ceil_div(trivial_blocks.len, message_modulus * carry_modulus - 1); - is_at_least_one_comparisons_block_true( + is_at_least_one_comparisons_block_true( streams, gpu_indexes, gpu_count, at_least_one_upper_block_is_non_zero.data, tmp_1.data, mem_ptr->comparison_buffer, bsks, ksks, tmp_1.len); @@ -460,7 +460,7 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes, // `cleaned_merged_interesting_remainder` - radix ciphertext auto create_clean_version_of_merged_remainder = [&](cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count) { - integer_radix_apply_univariate_lookup_table_kb( + integer_radix_apply_univariate_lookup_table_kb( streams, gpu_indexes, gpu_count, 
cleaned_merged_interesting_remainder.data, cleaned_merged_interesting_remainder.data, bsks, ksks, @@ -486,10 +486,10 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes, cuda_synchronize_stream(mem_ptr->sub_streams_3[j], gpu_indexes[j]); } - host_addition(streams[0], gpu_indexes[0], overflow_sum.data, - subtraction_overflowed.data, - at_least_one_upper_block_is_non_zero.data, - radix_params.big_lwe_dimension, 1); + host_addition(streams[0], gpu_indexes[0], overflow_sum.data, + subtraction_overflowed.data, + at_least_one_upper_block_is_non_zero.data, + radix_params.big_lwe_dimension, 1); int factor = (i) ? 3 : 2; int factor_lut_id = factor - 2; @@ -528,10 +528,10 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes, mem_ptr->merge_overflow_flags_luts[pos_in_block] ->params.message_modulus); - host_addition(streams[0], gpu_indexes[0], - "ient[block_of_bit * big_lwe_size], - "ient[block_of_bit * big_lwe_size], - did_not_overflow.data, radix_params.big_lwe_dimension, 1); + host_addition( + streams[0], gpu_indexes[0], "ient[block_of_bit * big_lwe_size], + "ient[block_of_bit * big_lwe_size], did_not_overflow.data, + radix_params.big_lwe_dimension, 1); }; for (uint j = 0; j < gpu_count; j++) { @@ -564,17 +564,17 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes, // Clean the quotient and remainder // as even though they have no carries, they are not at nominal noise level - host_addition(streams[0], gpu_indexes[0], remainder, remainder1.data, - remainder2.data, radix_params.big_lwe_dimension, - remainder1.len); + host_addition(streams[0], gpu_indexes[0], remainder, remainder1.data, + remainder2.data, radix_params.big_lwe_dimension, + remainder1.len); for (uint j = 0; j < gpu_count; j++) { cuda_synchronize_stream(streams[j], gpu_indexes[j]); } - integer_radix_apply_univariate_lookup_table_kb( + integer_radix_apply_univariate_lookup_table_kb( mem_ptr->sub_streams_1, gpu_indexes, gpu_count, remainder, remainder, bsks, ksks, num_blocks, mem_ptr->message_extract_lut_1); - integer_radix_apply_univariate_lookup_table_kb( + integer_radix_apply_univariate_lookup_table_kb( mem_ptr->sub_streams_2, gpu_indexes, gpu_count, quotient, quotient, bsks, ksks, num_blocks, mem_ptr->message_extract_lut_2); for (uint j = 0; j < mem_ptr->active_gpu_count; j++) { diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cu b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cu index e5b4b2e742..b9f76c0ec6 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cu +++ b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cu @@ -53,7 +53,7 @@ void scratch_cuda_propagate_single_carry_kb_64_inplace( ks_base_log, pbs_level, pbs_base_log, grouping_factor, message_modulus, carry_modulus); - scratch_cuda_propagate_single_carry_kb_inplace( + scratch_cuda_propagate_single_carry_kb_inplace( (cudaStream_t *)(streams), gpu_indexes, gpu_count, (int_sc_prop_memory **)mem_ptr, num_blocks, params, allocate_gpu_memory); diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh index cd7640d00b..b23f6e5ab2 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh @@ -78,7 +78,7 @@ host_radix_blocks_rotate_right(cudaStream_t *streams, uint32_t *gpu_indexes, "pointers should be different"); } cudaSetDevice(gpu_indexes[0]); - radix_blocks_rotate_right<<>>( + radix_blocks_rotate_right<<>>( dst, src, value, blocks_count, 
lwe_size); } @@ -95,7 +95,7 @@ host_radix_blocks_rotate_left(cudaStream_t *streams, uint32_t *gpu_indexes, "pointers should be different"); } cudaSetDevice(gpu_indexes[0]); - radix_blocks_rotate_left<<>>( + radix_blocks_rotate_left<<>>( dst, src, value, blocks_count, lwe_size); } @@ -124,8 +124,8 @@ host_radix_blocks_reverse_inplace(cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t lwe_size) { cudaSetDevice(gpu_indexes[0]); int num_blocks = blocks_count / 2, num_threads = 1024; - radix_blocks_reverse_lwe_inplace<<>>( - src, blocks_count, lwe_size); + radix_blocks_reverse_lwe_inplace + <<>>(src, blocks_count, lwe_size); } // polynomial_size threads @@ -164,9 +164,10 @@ __host__ void pack_bivariate_blocks(cudaStream_t *streams, int num_blocks = 0, num_threads = 0; int num_entries = num_radix_blocks * (lwe_dimension + 1); getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads); - device_pack_bivariate_blocks<<>>( - lwe_array_out, lwe_indexes_out, lwe_array_1, lwe_array_2, lwe_indexes_in, - lwe_dimension, shift, num_radix_blocks); + device_pack_bivariate_blocks + <<>>( + lwe_array_out, lwe_indexes_out, lwe_array_1, lwe_array_2, + lwe_indexes_in, lwe_dimension, shift, num_radix_blocks); check_cuda_error(cudaGetLastError()); } @@ -273,10 +274,10 @@ __host__ void integer_radix_apply_bivariate_lookup_table_kb( // Left message is shifted auto lwe_array_pbs_in = lut->tmp_lwe_before_ks; - pack_bivariate_blocks(streams, gpu_indexes, gpu_count, lwe_array_pbs_in, - lut->lwe_trivial_indexes, lwe_array_1, lwe_array_2, - lut->lwe_indexes_in, big_lwe_dimension, shift, - num_radix_blocks); + pack_bivariate_blocks(streams, gpu_indexes, gpu_count, + lwe_array_pbs_in, lut->lwe_trivial_indexes, + lwe_array_1, lwe_array_2, lut->lwe_indexes_in, + big_lwe_dimension, shift, num_radix_blocks); check_cuda_error(cudaGetLastError()); /// For multi GPU execution we create vectors of pointers for inputs and @@ -380,7 +381,7 @@ void generate_lookup_table(Torus *acc, uint32_t glwe_dimension, body[i] = -body[i]; } - rotate_left(body, half_box_size, polynomial_size); + rotate_left(body, half_box_size, polynomial_size); } template @@ -590,13 +591,13 @@ void host_propagate_single_carry(cudaStream_t *streams, uint32_t *gpu_indexes, ksks, num_blocks, luts_array); // compute prefix sum with hillis&steele - host_compute_prefix_sum_hillis_steele( + host_compute_prefix_sum_hillis_steele( streams, gpu_indexes, gpu_count, step_output, generates_or_propagates, params, luts_carry_propagation_sum, bsks, ksks, num_blocks); - host_radix_blocks_rotate_right(streams, gpu_indexes, gpu_count, step_output, - generates_or_propagates, 1, num_blocks, - big_lwe_size); + host_radix_blocks_rotate_right(streams, gpu_indexes, gpu_count, + step_output, generates_or_propagates, 1, + num_blocks, big_lwe_size); if (carry_out != nullptr) { cuda_memcpy_async_gpu_to_gpu(carry_out, step_output, big_lwe_size_bytes, streams[0], gpu_indexes[0]); @@ -610,8 +611,9 @@ void host_propagate_single_carry(cudaStream_t *streams, uint32_t *gpu_indexes, gpu_indexes[0]); } - host_addition(streams[0], gpu_indexes[0], lwe_array, lwe_array, step_output, - glwe_dimension * polynomial_size, num_blocks); + host_addition(streams[0], gpu_indexes[0], lwe_array, lwe_array, + step_output, glwe_dimension * polynomial_size, + num_blocks); integer_radix_apply_univariate_lookup_table_kb( streams, gpu_indexes, gpu_count, lwe_array, lwe_array, bsks, ksks, @@ -664,14 +666,15 @@ void host_propagate_single_sub_borrow(cudaStream_t *streams, overflowed, 
&generates_or_propagates[big_lwe_size * (num_blocks - 1)], big_lwe_size_bytes, streams[0], gpu_indexes[0]); - host_radix_blocks_rotate_right(streams, gpu_indexes, gpu_count, step_output, - generates_or_propagates, 1, num_blocks, - big_lwe_size); + host_radix_blocks_rotate_right(streams, gpu_indexes, gpu_count, + step_output, generates_or_propagates, 1, + num_blocks, big_lwe_size); cuda_memset_async(step_output, 0, big_lwe_size_bytes, streams[0], gpu_indexes[0]); - host_subtraction(streams[0], gpu_indexes[0], lwe_array, lwe_array, - step_output, glwe_dimension * polynomial_size, num_blocks); + host_subtraction(streams[0], gpu_indexes[0], lwe_array, lwe_array, + step_output, glwe_dimension * polynomial_size, + num_blocks); integer_radix_apply_univariate_lookup_table_kb( streams, gpu_indexes, gpu_count, lwe_array, lwe_array, bsks, ksks, @@ -727,10 +730,10 @@ void host_full_propagate_inplace(cudaStream_t *streams, uint32_t *gpu_indexes, if (i < num_blocks - 1) { auto next_input_block = &input_blocks[(i + 1) * big_lwe_size]; - host_addition(streams[0], gpu_indexes[0], next_input_block, - next_input_block, - &mem_ptr->tmp_big_lwe_vector[big_lwe_size], - params.big_lwe_dimension, 1); + host_addition(streams[0], gpu_indexes[0], next_input_block, + next_input_block, + &mem_ptr->tmp_big_lwe_vector[big_lwe_size], + params.big_lwe_dimension, 1); } } } @@ -794,7 +797,7 @@ __host__ void pack_blocks(cudaStream_t stream, uint32_t gpu_index, int num_blocks = 0, num_threads = 0; int num_entries = (lwe_dimension + 1); getNumBlocksAndThreads(num_entries, 1024, num_blocks, num_threads); - device_pack_blocks<<>>( + device_pack_blocks<<>>( lwe_array_out, lwe_array_in, lwe_dimension, num_radix_blocks, factor); } @@ -840,7 +843,7 @@ create_trivial_radix(cudaStream_t stream, uint32_t gpu_index, // this uint64_t delta = ((uint64_t)1 << 63) / (message_modulus * carry_modulus); - device_create_trivial_radix<<>>( + device_create_trivial_radix<<>>( lwe_array_out, scalar_array, num_scalar_blocks, lwe_dimension, delta); check_cuda_error(cudaGetLastError()); } @@ -857,7 +860,7 @@ __host__ void extract_n_bits(cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t num_radix_blocks, uint32_t bits_per_block, int_bit_extract_luts_buffer *bit_extract) { - integer_radix_apply_univariate_lookup_table_kb( + integer_radix_apply_univariate_lookup_table_kb( streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_in, bsks, ksks, num_radix_blocks * bits_per_block, bit_extract->lut); } @@ -870,7 +873,6 @@ reduce_signs(cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count, std::function sign_handler_f, void **bsks, Torus **ksks, uint32_t num_sign_blocks) { - cudaSetDevice(gpu_indexes[0]); auto diff_buffer = mem_ptr->diff_buffer; auto params = mem_ptr->params; @@ -904,9 +906,9 @@ reduce_signs(cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count, lut->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]); while (num_sign_blocks > 2) { - pack_blocks(streams[0], gpu_indexes[0], signs_b, signs_a, - big_lwe_dimension, num_sign_blocks, 4); - integer_radix_apply_univariate_lookup_table_kb( + pack_blocks(streams[0], gpu_indexes[0], signs_b, signs_a, + big_lwe_dimension, num_sign_blocks, 4); + integer_radix_apply_univariate_lookup_table_kb( streams, gpu_indexes, gpu_count, signs_a, signs_b, bsks, ksks, num_sign_blocks / 2, lut); @@ -937,11 +939,11 @@ reduce_signs(cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count, final_lut_f); lut->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]); - pack_blocks(streams[0], 
gpu_indexes[0], signs_b, signs_a, big_lwe_dimension, - 2, 4); - integer_radix_apply_univariate_lookup_table_kb(streams, gpu_indexes, - gpu_count, signs_array_out, - signs_b, bsks, ksks, 1, lut); + pack_blocks(streams[0], gpu_indexes[0], signs_b, signs_a, + big_lwe_dimension, 2, 4); + integer_radix_apply_univariate_lookup_table_kb( + streams, gpu_indexes, gpu_count, signs_array_out, signs_b, bsks, ksks, + 1, lut); } else { @@ -957,9 +959,9 @@ reduce_signs(cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count, final_lut_f); lut->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]); - integer_radix_apply_univariate_lookup_table_kb(streams, gpu_indexes, - gpu_count, signs_array_out, - signs_a, bsks, ksks, 1, lut); + integer_radix_apply_univariate_lookup_table_kb( + streams, gpu_indexes, gpu_count, signs_array_out, signs_a, bsks, ksks, + 1, lut); } } diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh index ba831d1f9f..28045096bb 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh @@ -271,7 +271,6 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb( if (!ch_amount) ch_amount++; dim3 add_grid(ch_amount, num_blocks, 1); - size_t sm_size = big_lwe_size * sizeof(Torus); cudaSetDevice(gpu_indexes[0]); tree_add_chunks<<>>( @@ -303,7 +302,7 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb( // inside d_smart_copy_in there are only -1 values // it's fine to call smart_copy with same pointer // as source and destination - smart_copy<<>>( + smart_copy<<>>( new_blocks, new_blocks, d_smart_copy_out, d_smart_copy_in, big_lwe_size); check_cuda_error(cudaGetLastError()); @@ -422,9 +421,9 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb( luts_message_carry->release(streams, gpu_indexes, gpu_count); delete (luts_message_carry); - host_addition(streams[0], gpu_indexes[0], radix_lwe_out, old_blocks, - &old_blocks[num_blocks * big_lwe_size], big_lwe_dimension, - num_blocks); + host_addition(streams[0], gpu_indexes[0], radix_lwe_out, old_blocks, + &old_blocks[num_blocks * big_lwe_size], + big_lwe_dimension, num_blocks); } template diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/negation.cu b/backends/tfhe-cuda-backend/cuda/src/integer/negation.cu index 3ed45f89e1..67ae11be96 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/negation.cu +++ b/backends/tfhe-cuda-backend/cuda/src/integer/negation.cu @@ -1,14 +1,16 @@ #include "integer/negation.cuh" -void cuda_negate_integer_radix_ciphertext_64_inplace( - void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, void *lwe_array, - uint32_t lwe_dimension, uint32_t lwe_ciphertext_count, - uint32_t message_modulus, uint32_t carry_modulus) { +void cuda_negate_integer_radix_ciphertext_64( + void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, + void *lwe_array_out, void *lwe_array_in, uint32_t lwe_dimension, + uint32_t lwe_ciphertext_count, uint32_t message_modulus, + uint32_t carry_modulus) { - host_integer_radix_negation( + host_integer_radix_negation( (cudaStream_t *)(streams), gpu_indexes, gpu_count, - static_cast(lwe_array), static_cast(lwe_array), - lwe_dimension, lwe_ciphertext_count, message_modulus, carry_modulus); + static_cast(lwe_array_out), + static_cast(lwe_array_in), lwe_dimension, + lwe_ciphertext_count, message_modulus, carry_modulus); } void scratch_cuda_integer_radix_overflowing_sub_kb_64( diff --git 
a/backends/tfhe-cuda-backend/cuda/src/integer/negation.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/negation.cuh index 2a18793ba7..eb8307c93a 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/negation.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/integer/negation.cuh @@ -25,14 +25,13 @@ template __global__ void device_integer_radix_negation(Torus *output, Torus *input, int32_t num_blocks, uint64_t lwe_dimension, uint64_t message_modulus, - uint64_t carry_modulus, uint64_t delta) { + uint64_t delta) { int tid = blockIdx.x * blockDim.x + threadIdx.x; if (tid < lwe_dimension + 1) { bool is_body = (tid == lwe_dimension); // z = ceil( degree / 2^p ) * 2^p uint64_t z = (2 * message_modulus - 1) / message_modulus; - __syncthreads(); z *= message_modulus; // (0,Delta*z) - ct @@ -47,12 +46,9 @@ device_integer_radix_negation(Torus *output, Torus *input, int32_t num_blocks, uint64_t encoded_zb = zb * delta; - __syncthreads(); - // (0,Delta*z) - ct output[tid] = (is_body ? z * delta - (input[tid] + encoded_zb) : -input[tid]); - __syncthreads(); } } } @@ -75,16 +71,15 @@ host_integer_radix_negation(cudaStream_t *streams, uint32_t *gpu_indexes, getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads); dim3 grid(num_blocks, 1, 1); dim3 thds(num_threads, 1, 1); - uint64_t shared_mem = input_lwe_ciphertext_count * sizeof(uint32_t); // Value of the shift we multiply our messages by // If message_modulus and carry_modulus are always powers of 2 we can simplify // this uint64_t delta = ((uint64_t)1 << 63) / (message_modulus * carry_modulus); - device_integer_radix_negation<<>>( + device_integer_radix_negation<<>>( output, input, input_lwe_ciphertext_count, lwe_dimension, message_modulus, - carry_modulus, delta); + delta); check_cuda_error(cudaGetLastError()); } @@ -107,7 +102,7 @@ __host__ void host_integer_overflowing_sub_kb( auto radix_params = mem_ptr->params; - host_unchecked_sub_with_correcting_term( + host_unchecked_sub_with_correcting_term( streams[0], gpu_indexes[0], radix_lwe_out, radix_lwe_left, radix_lwe_right, radix_params.big_lwe_dimension, num_blocks, radix_params.message_modulus, radix_params.carry_modulus, diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_addition.cu b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_addition.cu index d1b65c5e71..153a6e5e86 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_addition.cu +++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_addition.cu @@ -5,7 +5,7 @@ void cuda_scalar_addition_integer_radix_ciphertext_64_inplace( void *scalar_input, uint32_t lwe_dimension, uint32_t lwe_ciphertext_count, uint32_t message_modulus, uint32_t carry_modulus) { - host_integer_radix_scalar_addition_inplace( + host_integer_radix_scalar_addition_inplace( (cudaStream_t *)(streams), gpu_indexes, gpu_count, static_cast(lwe_array), static_cast(scalar_input), lwe_dimension, lwe_ciphertext_count, message_modulus, carry_modulus); diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_addition.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_addition.cuh index 79a7c88072..1595651f06 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_addition.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_addition.cuh @@ -45,9 +45,10 @@ __host__ void host_integer_radix_scalar_addition_inplace( // this uint64_t delta = ((uint64_t)1 << 63) / (message_modulus * carry_modulus); - device_integer_radix_scalar_addition_inplace<<>>( - lwe_array, scalar_input, input_lwe_ciphertext_count, lwe_dimension, - 
delta); + device_integer_radix_scalar_addition_inplace + <<>>(lwe_array, scalar_input, + input_lwe_ciphertext_count, lwe_dimension, + delta); check_cuda_error(cudaGetLastError()); } @@ -83,8 +84,9 @@ __host__ void host_integer_radix_add_scalar_one_inplace( // this uint64_t delta = ((uint64_t)1 << 63) / (message_modulus * carry_modulus); - device_integer_radix_add_scalar_one_inplace<<>>( - lwe_array, input_lwe_ciphertext_count, lwe_dimension, delta); + device_integer_radix_add_scalar_one_inplace + <<>>(lwe_array, input_lwe_ciphertext_count, + lwe_dimension, delta); check_cuda_error(cudaGetLastError()); } @@ -122,10 +124,10 @@ __host__ void host_integer_radix_scalar_subtraction_inplace( // this uint64_t delta = ((uint64_t)1 << 63) / (message_modulus * carry_modulus); - device_integer_radix_scalar_subtraction_inplace<<>>( - lwe_array, scalar_input, input_lwe_ciphertext_count, lwe_dimension, - delta); + device_integer_radix_scalar_subtraction_inplace + <<>>(lwe_array, scalar_input, + input_lwe_ciphertext_count, lwe_dimension, + delta); check_cuda_error(cudaGetLastError()); } #endif diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_comparison.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_comparison.cuh index 7cc3e6cec0..a8cd292e7a 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_comparison.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_comparison.cuh @@ -3,6 +3,58 @@ #include "integer/comparison.cuh" +template +__host__ void scalar_compare_radix_blocks_kb( + cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count, + Torus *lwe_array_out, Torus *lwe_array_in, Torus *scalar_blocks, + int_comparison_buffer *mem_ptr, void **bsks, Torus **ksks, + uint32_t num_radix_blocks) { + + if (num_radix_blocks == 0) + return; + auto params = mem_ptr->params; + auto big_lwe_dimension = params.big_lwe_dimension; + auto message_modulus = params.message_modulus; + auto carry_modulus = params.carry_modulus; + + // When rhs > lhs, the subtraction will overflow, and the bit of padding will + // be set to 1 + // meaning that the output of the pbs will be the negative (modulo message + // space) + // + // Example: + // lhs: 1, rhs: 3, message modulus: 4, carry modulus 4 + // lhs - rhs = -2 % (4 * 4) = 14 = 1|1110 (padding_bit|b4b3b2b1) + // Since there was an overflow the bit of padding is 1 and not 0. + // When applying the LUT for an input value of 14 we would expect 1, + // but since the bit of padding is 1, we will get -1 modulus our message + // space, so (-1) % (4 * 4) = 15 = 1|1111 We then add one and get 0 = 0|0000 + + auto subtracted_blocks = mem_ptr->tmp_block_comparisons; + cuda_memcpy_async_gpu_to_gpu(subtracted_blocks, lwe_array_in, + num_radix_blocks * (big_lwe_dimension + 1) * + sizeof(Torus), + streams[0], gpu_indexes[0]); + // Subtract + // Here we need the true lwe sub, not the one that comes from shortint. 
+ host_integer_radix_scalar_subtraction_inplace( + streams, gpu_indexes, gpu_count, subtracted_blocks, scalar_blocks, + big_lwe_dimension, num_radix_blocks, message_modulus, carry_modulus); + + // Apply LUT to compare to 0 + auto sign_lut = mem_ptr->eq_buffer->is_non_zero_lut; + integer_radix_apply_univariate_lookup_table_kb( + streams, gpu_indexes, gpu_count, lwe_array_out, subtracted_blocks, bsks, + ksks, num_radix_blocks, sign_lut); + + // Add one + // Here Lhs can have the following values: (-1) % (message modulus * carry + // modulus), 0, 1 So the output values after the addition will be: 0, 1, 2 + host_integer_radix_add_scalar_one_inplace( + streams, gpu_indexes, gpu_count, lwe_array_out, big_lwe_dimension, + num_radix_blocks, message_modulus, carry_modulus); +} + template __host__ void integer_radix_unsigned_scalar_difference_check_kb( cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count, @@ -45,10 +97,10 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb( if (total_num_scalar_blocks == 0) { // We only have to compare blocks with zero // means scalar is zero - host_compare_with_zero_equality(streams, gpu_indexes, gpu_count, - mem_ptr->tmp_lwe_array_out, lwe_array_in, - mem_ptr, bsks, ksks, total_num_radix_blocks, - mem_ptr->is_zero_lut); + host_compare_with_zero_equality( + streams, gpu_indexes, gpu_count, mem_ptr->tmp_lwe_array_out, + lwe_array_in, mem_ptr, bsks, ksks, total_num_radix_blocks, + mem_ptr->is_zero_lut); auto scalar_last_leaf_lut_f = [sign_handler_f](Torus x) -> Torus { x = (x == 1 ? IS_EQUAL : IS_SUPERIOR); @@ -91,10 +143,11 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb( Torus *lhs = diff_buffer->tmp_packed_left; Torus *rhs = diff_buffer->tmp_packed_right; - pack_blocks(lsb_streams[0], gpu_indexes[0], lhs, lwe_array_in, - big_lwe_dimension, num_lsb_radix_blocks, message_modulus); - pack_blocks(lsb_streams[0], gpu_indexes[0], rhs, scalar_blocks, 0, - total_num_scalar_blocks, message_modulus); + pack_blocks(lsb_streams[0], gpu_indexes[0], lhs, lwe_array_in, + big_lwe_dimension, num_lsb_radix_blocks, + message_modulus); + pack_blocks(lsb_streams[0], gpu_indexes[0], rhs, scalar_blocks, 0, + total_num_scalar_blocks, message_modulus); // From this point we have half number of blocks num_lsb_radix_blocks /= 2; @@ -106,22 +159,22 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb( // - 2 if lhs > rhs auto comparisons = mem_ptr->tmp_block_comparisons; - scalar_compare_radix_blocks_kb(lsb_streams, gpu_indexes, gpu_count, - comparisons, lhs, rhs, mem_ptr, bsks, ksks, - num_lsb_radix_blocks); + scalar_compare_radix_blocks_kb(lsb_streams, gpu_indexes, gpu_count, + comparisons, lhs, rhs, mem_ptr, bsks, + ksks, num_lsb_radix_blocks); // Reduces a vec containing radix blocks that encrypts a sign // (inferior, equal, superior) to one single radix block containing the // final sign - tree_sign_reduction(lsb_streams, gpu_indexes, gpu_count, lwe_array_lsb_out, - comparisons, mem_ptr->diff_buffer->tree_buffer, - mem_ptr->identity_lut_f, bsks, ksks, - num_lsb_radix_blocks); + tree_sign_reduction( + lsb_streams, gpu_indexes, gpu_count, lwe_array_lsb_out, comparisons, + mem_ptr->diff_buffer->tree_buffer, mem_ptr->identity_lut_f, bsks, ksks, + num_lsb_radix_blocks); ////////////// // msb - host_compare_with_zero_equality(msb_streams, gpu_indexes, gpu_count, - lwe_array_msb_out, msb, mem_ptr, bsks, ksks, - num_msb_radix_blocks, mem_ptr->is_zero_lut); + host_compare_with_zero_equality( + msb_streams, gpu_indexes, gpu_count, 
lwe_array_msb_out, msb, mem_ptr, + bsks, ksks, num_msb_radix_blocks, mem_ptr->is_zero_lut); for (uint j = 0; j < mem_ptr->active_gpu_count; j++) { cuda_synchronize_stream(lsb_streams[j], gpu_indexes[j]); cuda_synchronize_stream(msb_streams[j], gpu_indexes[j]); @@ -145,7 +198,7 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb( scalar_bivariate_last_leaf_lut_f); lut->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]); - integer_radix_apply_bivariate_lookup_table_kb( + integer_radix_apply_bivariate_lookup_table_kb( streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_lsb_out, lwe_array_msb_out, bsks, ksks, 1, lut, lut->params.message_modulus); @@ -159,10 +212,11 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb( Torus *lhs = diff_buffer->tmp_packed_left; Torus *rhs = diff_buffer->tmp_packed_right; - pack_blocks(streams[0], gpu_indexes[0], lhs, lwe_array_in, - big_lwe_dimension, num_lsb_radix_blocks, message_modulus); - pack_blocks(streams[0], gpu_indexes[0], rhs, scalar_blocks, 0, - num_scalar_blocks, message_modulus); + pack_blocks(streams[0], gpu_indexes[0], lhs, lwe_array_in, + big_lwe_dimension, num_lsb_radix_blocks, + message_modulus); + pack_blocks(streams[0], gpu_indexes[0], rhs, scalar_blocks, 0, + num_scalar_blocks, message_modulus); // From this point we have half number of blocks num_lsb_radix_blocks /= 2; @@ -173,16 +227,17 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb( // - 1 if lhs == rhs // - 2 if lhs > rhs auto comparisons = mem_ptr->tmp_lwe_array_out; - scalar_compare_radix_blocks_kb(streams, gpu_indexes, gpu_count, comparisons, - lhs, rhs, mem_ptr, bsks, ksks, - num_lsb_radix_blocks); + scalar_compare_radix_blocks_kb(streams, gpu_indexes, gpu_count, + comparisons, lhs, rhs, mem_ptr, bsks, + ksks, num_lsb_radix_blocks); // Reduces a vec containing radix blocks that encrypts a sign // (inferior, equal, superior) to one single radix block containing the // final sign - tree_sign_reduction(streams, gpu_indexes, gpu_count, lwe_array_out, - comparisons, mem_ptr->diff_buffer->tree_buffer, - sign_handler_f, bsks, ksks, num_lsb_radix_blocks); + tree_sign_reduction(streams, gpu_indexes, gpu_count, lwe_array_out, + comparisons, mem_ptr->diff_buffer->tree_buffer, + sign_handler_f, bsks, ksks, + num_lsb_radix_blocks); } } @@ -229,7 +284,7 @@ __host__ void integer_radix_signed_scalar_difference_check_kb( // We only have to compare blocks with zero // means scalar is zero Torus *are_all_msb_zeros = mem_ptr->tmp_lwe_array_out; - host_compare_with_zero_equality( + host_compare_with_zero_equality( streams, gpu_indexes, gpu_count, are_all_msb_zeros, lwe_array_in, mem_ptr, bsks, ksks, total_num_radix_blocks, mem_ptr->is_zero_lut); Torus *sign_block = @@ -277,7 +332,7 @@ __host__ void integer_radix_signed_scalar_difference_check_kb( scalar_bivariate_last_leaf_lut_f); lut->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]); - integer_radix_apply_bivariate_lookup_table_kb( + integer_radix_apply_bivariate_lookup_table_kb( streams, gpu_indexes, gpu_count, lwe_array_out, are_all_msb_zeros, sign_block, bsks, ksks, 1, lut, lut->params.message_modulus); @@ -304,10 +359,11 @@ __host__ void integer_radix_signed_scalar_difference_check_kb( Torus *lhs = diff_buffer->tmp_packed_left; Torus *rhs = diff_buffer->tmp_packed_right; - pack_blocks(lsb_streams[0], gpu_indexes[0], lhs, lwe_array_in, - big_lwe_dimension, num_lsb_radix_blocks, message_modulus); - pack_blocks(lsb_streams[0], gpu_indexes[0], rhs, scalar_blocks, 0, - total_num_scalar_blocks, 
message_modulus); + pack_blocks(lsb_streams[0], gpu_indexes[0], lhs, lwe_array_in, + big_lwe_dimension, num_lsb_radix_blocks, + message_modulus); + pack_blocks(lsb_streams[0], gpu_indexes[0], rhs, scalar_blocks, 0, + total_num_scalar_blocks, message_modulus); // From this point we have half number of blocks num_lsb_radix_blocks /= 2; @@ -319,24 +375,24 @@ __host__ void integer_radix_signed_scalar_difference_check_kb( // - 2 if lhs > rhs auto comparisons = mem_ptr->tmp_block_comparisons; - scalar_compare_radix_blocks_kb(lsb_streams, gpu_indexes, gpu_count, - comparisons, lhs, rhs, mem_ptr, bsks, ksks, - num_lsb_radix_blocks); + scalar_compare_radix_blocks_kb(lsb_streams, gpu_indexes, gpu_count, + comparisons, lhs, rhs, mem_ptr, bsks, + ksks, num_lsb_radix_blocks); // Reduces a vec containing radix blocks that encrypts a sign // (inferior, equal, superior) to one single radix block containing the // final sign - tree_sign_reduction(lsb_streams, gpu_indexes, gpu_count, lwe_array_lsb_out, - comparisons, mem_ptr->diff_buffer->tree_buffer, - mem_ptr->identity_lut_f, bsks, ksks, - num_lsb_radix_blocks); + tree_sign_reduction( + lsb_streams, gpu_indexes, gpu_count, lwe_array_lsb_out, comparisons, + mem_ptr->diff_buffer->tree_buffer, mem_ptr->identity_lut_f, bsks, ksks, + num_lsb_radix_blocks); ////////////// // msb // We remove the last block (which is the sign) Torus *are_all_msb_zeros = lwe_array_msb_out; - host_compare_with_zero_equality(msb_streams, gpu_indexes, gpu_count, - are_all_msb_zeros, msb, mem_ptr, bsks, ksks, - num_msb_radix_blocks, mem_ptr->is_zero_lut); + host_compare_with_zero_equality( + msb_streams, gpu_indexes, gpu_count, are_all_msb_zeros, msb, mem_ptr, + bsks, ksks, num_msb_radix_blocks, mem_ptr->is_zero_lut); auto sign_bit_pos = (int)log2(message_modulus) - 1; @@ -371,7 +427,7 @@ __host__ void integer_radix_signed_scalar_difference_check_kb( signed_msb_lut->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]); Torus *sign_block = msb + (num_msb_radix_blocks - 1) * big_lwe_size; - integer_radix_apply_bivariate_lookup_table_kb( + integer_radix_apply_bivariate_lookup_table_kb( msb_streams, gpu_indexes, gpu_count, lwe_array_msb_out, sign_block, are_all_msb_zeros, bsks, ksks, 1, signed_msb_lut, signed_msb_lut->params.message_modulus); @@ -382,8 +438,9 @@ __host__ void integer_radix_signed_scalar_difference_check_kb( ////////////// // Reduce the two blocks into one final - reduce_signs(streams, gpu_indexes, gpu_count, lwe_array_out, - lwe_array_lsb_out, mem_ptr, sign_handler_f, bsks, ksks, 2); + reduce_signs(streams, gpu_indexes, gpu_count, lwe_array_out, + lwe_array_lsb_out, mem_ptr, sign_handler_f, bsks, ksks, + 2); } else { // We only have to do the regular comparison @@ -403,10 +460,11 @@ __host__ void integer_radix_signed_scalar_difference_check_kb( Torus *lhs = diff_buffer->tmp_packed_left; Torus *rhs = diff_buffer->tmp_packed_right; - pack_blocks(lsb_streams[0], gpu_indexes[0], lhs, lwe_array_in, - big_lwe_dimension, num_lsb_radix_blocks - 1, message_modulus); - pack_blocks(lsb_streams[0], gpu_indexes[0], rhs, scalar_blocks, 0, - num_lsb_radix_blocks - 1, message_modulus); + pack_blocks(lsb_streams[0], gpu_indexes[0], lhs, lwe_array_in, + big_lwe_dimension, num_lsb_radix_blocks - 1, + message_modulus); + pack_blocks(lsb_streams[0], gpu_indexes[0], rhs, scalar_blocks, 0, + num_lsb_radix_blocks - 1, message_modulus); // From this point we have half number of blocks num_lsb_radix_blocks /= 2; @@ -415,19 +473,19 @@ __host__ void integer_radix_signed_scalar_difference_check_kb( 
// - 0 if lhs < rhs // - 1 if lhs == rhs // - 2 if lhs > rhs - scalar_compare_radix_blocks_kb(lsb_streams, gpu_indexes, gpu_count, - lwe_array_ct_out, lhs, rhs, mem_ptr, bsks, - ksks, num_lsb_radix_blocks); + scalar_compare_radix_blocks_kb(lsb_streams, gpu_indexes, gpu_count, + lwe_array_ct_out, lhs, rhs, mem_ptr, + bsks, ksks, num_lsb_radix_blocks); Torus *encrypted_sign_block = lwe_array_in + (total_num_radix_blocks - 1) * big_lwe_size; Torus *scalar_sign_block = scalar_blocks + (total_num_scalar_blocks - 1); auto trivial_sign_block = mem_ptr->tmp_trivial_sign_block; - create_trivial_radix(msb_streams[0], gpu_indexes[0], trivial_sign_block, - scalar_sign_block, big_lwe_dimension, 1, 1, - message_modulus, carry_modulus); + create_trivial_radix( + msb_streams[0], gpu_indexes[0], trivial_sign_block, scalar_sign_block, + big_lwe_dimension, 1, 1, message_modulus, carry_modulus); - integer_radix_apply_bivariate_lookup_table_kb( + integer_radix_apply_bivariate_lookup_table_kb( msb_streams, gpu_indexes, gpu_count, lwe_array_sign_out, encrypted_sign_block, trivial_sign_block, bsks, ksks, 1, mem_ptr->signed_lut, mem_ptr->signed_lut->params.message_modulus); @@ -439,9 +497,9 @@ __host__ void integer_radix_signed_scalar_difference_check_kb( // Reduces a vec containing radix blocks that encrypts a sign // (inferior, equal, superior) to one single radix block containing the // final sign - reduce_signs(streams, gpu_indexes, gpu_count, lwe_array_out, - lwe_array_ct_out, mem_ptr, sign_handler_f, bsks, ksks, - num_lsb_radix_blocks + 1); + reduce_signs(streams, gpu_indexes, gpu_count, lwe_array_out, + lwe_array_ct_out, mem_ptr, sign_handler_f, bsks, ksks, + num_lsb_radix_blocks + 1); } } @@ -452,14 +510,13 @@ __host__ void integer_radix_signed_scalar_maxmin_kb( int_comparison_buffer *mem_ptr, void **bsks, Torus **ksks, uint32_t total_num_radix_blocks, uint32_t total_num_scalar_blocks) { - cudaSetDevice(gpu_indexes[0]); auto params = mem_ptr->params; // Calculates the difference sign between the ciphertext and the scalar // - 0 if lhs < rhs // - 1 if lhs == rhs // - 2 if lhs > rhs auto sign = mem_ptr->tmp_lwe_array_out; - integer_radix_signed_scalar_difference_check_kb( + integer_radix_signed_scalar_difference_check_kb( streams, gpu_indexes, gpu_count, sign, lwe_array_in, scalar_blocks, mem_ptr, mem_ptr->identity_lut_f, bsks, ksks, total_num_radix_blocks, total_num_scalar_blocks); @@ -469,17 +526,17 @@ __host__ void integer_radix_signed_scalar_maxmin_kb( auto lwe_array_left = lwe_array_in; auto lwe_array_right = mem_ptr->tmp_block_comparisons; - create_trivial_radix(streams[0], gpu_indexes[0], lwe_array_right, - scalar_blocks, params.big_lwe_dimension, - total_num_radix_blocks, total_num_scalar_blocks, - params.message_modulus, params.carry_modulus); + create_trivial_radix(streams[0], gpu_indexes[0], lwe_array_right, + scalar_blocks, params.big_lwe_dimension, + total_num_radix_blocks, total_num_scalar_blocks, + params.message_modulus, params.carry_modulus); // Selector // CMUX for Max or Min - host_integer_radix_cmux_kb(streams, gpu_indexes, gpu_count, lwe_array_out, - sign, lwe_array_left, lwe_array_right, - mem_ptr->cmux_buffer, bsks, ksks, - total_num_radix_blocks); + host_integer_radix_cmux_kb(streams, gpu_indexes, gpu_count, + lwe_array_out, sign, lwe_array_left, + lwe_array_right, mem_ptr->cmux_buffer, bsks, + ksks, total_num_radix_blocks); } template @@ -492,12 +549,12 @@ __host__ void host_integer_radix_scalar_difference_check_kb( if (mem_ptr->is_signed) { // is signed and scalar is positive - 
integer_radix_signed_scalar_difference_check_kb( + integer_radix_signed_scalar_difference_check_kb( streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_in, scalar_blocks, mem_ptr, sign_handler_f, bsks, ksks, total_num_radix_blocks, total_num_scalar_blocks); } else { - integer_radix_unsigned_scalar_difference_check_kb( + integer_radix_unsigned_scalar_difference_check_kb( streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_in, scalar_blocks, mem_ptr, sign_handler_f, bsks, ksks, total_num_radix_blocks, total_num_scalar_blocks); @@ -513,70 +570,16 @@ __host__ void host_integer_radix_signed_scalar_maxmin_kb( if (mem_ptr->is_signed) { // is signed and scalar is positive - integer_radix_signed_scalar_maxmin_kb( + integer_radix_signed_scalar_maxmin_kb( streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_in, scalar_blocks, mem_ptr, bsks, ksks, total_num_radix_blocks, total_num_scalar_blocks); } else { - integer_radix_unsigned_scalar_maxmin_kb( - streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_in, - scalar_blocks, mem_ptr, bsks, ksks, total_num_radix_blocks, - total_num_scalar_blocks); + PANIC("Cuda error: only signed scalar maxmin can be called in signed " + "scalar comparison") } } -template -__host__ void scalar_compare_radix_blocks_kb( - cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count, - Torus *lwe_array_out, Torus *lwe_array_in, Torus *scalar_blocks, - int_comparison_buffer *mem_ptr, void **bsks, Torus **ksks, - uint32_t num_radix_blocks) { - - if (num_radix_blocks == 0) - return; - auto params = mem_ptr->params; - auto big_lwe_dimension = params.big_lwe_dimension; - auto message_modulus = params.message_modulus; - auto carry_modulus = params.carry_modulus; - - // When rhs > lhs, the subtraction will overflow, and the bit of padding will - // be set to 1 - // meaning that the output of the pbs will be the negative (modulo message - // space) - // - // Example: - // lhs: 1, rhs: 3, message modulus: 4, carry modulus 4 - // lhs - rhs = -2 % (4 * 4) = 14 = 1|1110 (padding_bit|b4b3b2b1) - // Since there was an overflow the bit of padding is 1 and not 0. - // When applying the LUT for an input value of 14 we would expect 1, - // but since the bit of padding is 1, we will get -1 modulus our message - // space, so (-1) % (4 * 4) = 15 = 1|1111 We then add one and get 0 = 0|0000 - - auto subtracted_blocks = mem_ptr->tmp_block_comparisons; - cuda_memcpy_async_gpu_to_gpu(subtracted_blocks, lwe_array_in, - num_radix_blocks * (big_lwe_dimension + 1) * - sizeof(Torus), - streams[0], gpu_indexes[0]); - // Subtract - // Here we need the true lwe sub, not the one that comes from shortint. 
- host_integer_radix_scalar_subtraction_inplace( - streams, gpu_indexes, gpu_count, subtracted_blocks, scalar_blocks, - big_lwe_dimension, num_radix_blocks, message_modulus, carry_modulus); - - // Apply LUT to compare to 0 - auto sign_lut = mem_ptr->eq_buffer->is_non_zero_lut; - integer_radix_apply_univariate_lookup_table_kb( - streams, gpu_indexes, gpu_count, lwe_array_out, subtracted_blocks, bsks, - ksks, num_radix_blocks, sign_lut); - - // Add one - // Here Lhs can have the following values: (-1) % (message modulus * carry - // modulus), 0, 1 So the output values after the addition will be: 0, 1, 2 - host_integer_radix_add_scalar_one_inplace( - streams, gpu_indexes, gpu_count, lwe_array_out, big_lwe_dimension, - num_radix_blocks, message_modulus, carry_modulus); -} - template __host__ void host_integer_radix_scalar_maxmin_kb( cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count, @@ -591,7 +594,7 @@ __host__ void host_integer_radix_scalar_maxmin_kb( // - 1 if lhs == rhs // - 2 if lhs > rhs auto sign = mem_ptr->tmp_lwe_array_out; - host_integer_radix_scalar_difference_check_kb( + host_integer_radix_scalar_difference_check_kb( streams, gpu_indexes, gpu_count, sign, lwe_array_in, scalar_blocks, mem_ptr, mem_ptr->identity_lut_f, bsks, ksks, total_num_radix_blocks, total_num_scalar_blocks); @@ -601,17 +604,17 @@ __host__ void host_integer_radix_scalar_maxmin_kb( auto lwe_array_left = lwe_array_in; auto lwe_array_right = mem_ptr->tmp_block_comparisons; - create_trivial_radix(streams[0], gpu_indexes[0], lwe_array_right, - scalar_blocks, params.big_lwe_dimension, - total_num_radix_blocks, total_num_scalar_blocks, - params.message_modulus, params.carry_modulus); + create_trivial_radix(streams[0], gpu_indexes[0], lwe_array_right, + scalar_blocks, params.big_lwe_dimension, + total_num_radix_blocks, total_num_scalar_blocks, + params.message_modulus, params.carry_modulus); // Selector // CMUX for Max or Min - host_integer_radix_cmux_kb(streams, gpu_indexes, gpu_count, lwe_array_out, - mem_ptr->tmp_lwe_array_out, lwe_array_left, - lwe_array_right, mem_ptr->cmux_buffer, bsks, ksks, - total_num_radix_blocks); + host_integer_radix_cmux_kb( + streams, gpu_indexes, gpu_count, lwe_array_out, + mem_ptr->tmp_lwe_array_out, lwe_array_left, lwe_array_right, + mem_ptr->cmux_buffer, bsks, ksks, total_num_radix_blocks); } template @@ -659,10 +662,11 @@ __host__ void host_integer_radix_scalar_equality_check_kb( auto packed_scalar = packed_blocks + big_lwe_size * num_halved_lsb_radix_blocks; - pack_blocks(lsb_streams[0], gpu_indexes[0], packed_blocks, lsb, - big_lwe_dimension, num_lsb_radix_blocks, message_modulus); - pack_blocks(lsb_streams[0], gpu_indexes[0], packed_scalar, scalar_blocks, 0, - num_scalar_blocks, message_modulus); + pack_blocks(lsb_streams[0], gpu_indexes[0], packed_blocks, lsb, + big_lwe_dimension, num_lsb_radix_blocks, + message_modulus); + pack_blocks(lsb_streams[0], gpu_indexes[0], packed_scalar, + scalar_blocks, 0, num_scalar_blocks, message_modulus); cuda_memcpy_async_gpu_to_gpu( scalar_comparison_luts->get_lut_indexes(gpu_indexes[0], 0), @@ -670,7 +674,7 @@ __host__ void host_integer_radix_scalar_equality_check_kb( gpu_indexes[0]); scalar_comparison_luts->broadcast_lut(lsb_streams, gpu_indexes, 0); - integer_radix_apply_univariate_lookup_table_kb( + integer_radix_apply_univariate_lookup_table_kb( lsb_streams, gpu_indexes, gpu_count, lwe_array_lsb_out, packed_blocks, bsks, ksks, num_halved_lsb_radix_blocks, scalar_comparison_luts); } @@ -689,9 +693,9 @@ __host__ void 
host_integer_radix_scalar_equality_check_kb( PANIC("Cuda error: integer operation not supported") } - host_compare_with_zero_equality(msb_streams, gpu_indexes, gpu_count, - lwe_array_msb_out, msb, mem_ptr, bsks, ksks, - num_msb_radix_blocks, msb_lut); + host_compare_with_zero_equality( + msb_streams, gpu_indexes, gpu_count, lwe_array_msb_out, msb, mem_ptr, + bsks, ksks, num_msb_radix_blocks, msb_lut); } for (uint j = 0; j < mem_ptr->active_gpu_count; j++) { @@ -701,13 +705,13 @@ __host__ void host_integer_radix_scalar_equality_check_kb( switch (mem_ptr->op) { case COMPARISON_TYPE::EQ: - are_all_comparisons_block_true( + are_all_comparisons_block_true( streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_lsb_out, mem_ptr, bsks, ksks, num_halved_scalar_blocks + (num_msb_radix_blocks > 0)); break; case COMPARISON_TYPE::NE: - is_at_least_one_comparisons_block_true( + is_at_least_one_comparisons_block_true( streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_lsb_out, mem_ptr, bsks, ksks, num_halved_scalar_blocks + (num_msb_radix_blocks > 0)); diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_mul.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_mul.cuh index 8347945532..ad71ea5cb0 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_mul.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_mul.cuh @@ -65,7 +65,7 @@ __host__ void host_integer_scalar_mul_radix( cuda_memcpy_async_gpu_to_gpu(ptr, lwe_array, lwe_size_bytes * num_radix_blocks, streams[0], gpu_indexes[0]); - host_integer_radix_logical_scalar_shift_kb_inplace( + host_integer_radix_logical_scalar_shift_kb_inplace( streams, gpu_indexes, gpu_count, ptr, shift_amount, mem->logical_scalar_shift_buffer, bsks, ksks, num_radix_blocks); } else { @@ -82,15 +82,16 @@ __host__ void host_integer_scalar_mul_radix( preshifted_buffer + (i % msg_bits) * num_radix_blocks * lwe_size; T *block_shift_buffer = all_shifted_buffer + j * num_radix_blocks * lwe_size; - host_radix_blocks_rotate_right(streams, gpu_indexes, gpu_count, - block_shift_buffer, preshifted_radix_ct, - i / msg_bits, num_radix_blocks, lwe_size); + host_radix_blocks_rotate_right( + streams, gpu_indexes, gpu_count, block_shift_buffer, + preshifted_radix_ct, i / msg_bits, num_radix_blocks, lwe_size); // create trivial assign for value = 0 cuda_memset_async(block_shift_buffer, 0, (i / msg_bits) * lwe_size_bytes, streams[0], gpu_indexes[0]); j++; } } + cuda_synchronize_stream(streams[0], gpu_indexes[0]); cuda_drop_async(preshifted_buffer, streams[0], gpu_indexes[0]); mem->logical_scalar_shift_buffer->release(streams, gpu_indexes, gpu_count); diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_rotate.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_rotate.cuh index e951d2b2a6..02a0a77a40 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_rotate.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_rotate.cuh @@ -56,9 +56,9 @@ __host__ void host_integer_radix_scalar_rotate_kb_inplace( // one block is responsible to process single lwe ciphertext if (mem->shift_type == LEFT_SHIFT) { // rotate right as the blocks are from LSB to MSB - host_radix_blocks_rotate_right(streams, gpu_indexes, gpu_count, - rotated_buffer, lwe_array, rotations, - num_blocks, big_lwe_size); + host_radix_blocks_rotate_right(streams, gpu_indexes, gpu_count, + rotated_buffer, lwe_array, rotations, + num_blocks, big_lwe_size); cuda_memcpy_async_gpu_to_gpu(lwe_array, rotated_buffer, num_blocks * big_lwe_size_bytes, streams[0], @@ -70,9 
+70,9 @@ __host__ void host_integer_radix_scalar_rotate_kb_inplace( auto receiver_blocks = lwe_array; auto giver_blocks = rotated_buffer; - host_radix_blocks_rotate_right(streams, gpu_indexes, gpu_count, - giver_blocks, lwe_array, 1, num_blocks, - big_lwe_size); + host_radix_blocks_rotate_right(streams, gpu_indexes, gpu_count, + giver_blocks, lwe_array, 1, + num_blocks, big_lwe_size); auto lut_bivariate = mem->lut_buffers_bivariate[shift_within_block - 1]; @@ -83,9 +83,9 @@ __host__ void host_integer_radix_scalar_rotate_kb_inplace( } else { // rotate left as the blocks are from LSB to MSB - host_radix_blocks_rotate_left(streams, gpu_indexes, gpu_count, - rotated_buffer, lwe_array, rotations, - num_blocks, big_lwe_size); + host_radix_blocks_rotate_left(streams, gpu_indexes, gpu_count, + rotated_buffer, lwe_array, rotations, + num_blocks, big_lwe_size); cuda_memcpy_async_gpu_to_gpu(lwe_array, rotated_buffer, num_blocks * big_lwe_size_bytes, streams[0], @@ -97,8 +97,9 @@ __host__ void host_integer_radix_scalar_rotate_kb_inplace( auto receiver_blocks = lwe_array; auto giver_blocks = rotated_buffer; - host_radix_blocks_rotate_left(streams, gpu_indexes, gpu_count, giver_blocks, - lwe_array, 1, num_blocks, big_lwe_size); + host_radix_blocks_rotate_left(streams, gpu_indexes, gpu_count, + giver_blocks, lwe_array, 1, num_blocks, + big_lwe_size); auto lut_bivariate = mem->lut_buffers_bivariate[shift_within_block - 1]; diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_shifts.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_shifts.cuh index 348655d1db..6555fad9f1 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_shifts.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_shifts.cuh @@ -53,9 +53,9 @@ __host__ void host_integer_radix_logical_scalar_shift_kb_inplace( if (mem->shift_type == LEFT_SHIFT) { // rotate right as the blocks are from LSB to MSB - host_radix_blocks_rotate_right(streams, gpu_indexes, gpu_count, - rotated_buffer, lwe_array, rotations, - num_blocks, big_lwe_size); + host_radix_blocks_rotate_right(streams, gpu_indexes, gpu_count, + rotated_buffer, lwe_array, rotations, + num_blocks, big_lwe_size); // create trivial assign for value = 0 cuda_memset_async(rotated_buffer, 0, rotations * big_lwe_size_bytes, @@ -83,9 +83,9 @@ __host__ void host_integer_radix_logical_scalar_shift_kb_inplace( } else { // right shift - host_radix_blocks_rotate_left(streams, gpu_indexes, gpu_count, - rotated_buffer, lwe_array, rotations, - num_blocks, big_lwe_size); + host_radix_blocks_rotate_left(streams, gpu_indexes, gpu_count, + rotated_buffer, lwe_array, rotations, + num_blocks, big_lwe_size); // rotate left as the blocks are from LSB to MSB // create trivial assign for value = 0 @@ -156,9 +156,9 @@ __host__ void host_integer_radix_arithmetic_scalar_shift_kb_inplace( Torus *last_block_copy = &padding_block[big_lwe_size]; if (mem->shift_type == RIGHT_SHIFT) { - host_radix_blocks_rotate_left(streams, gpu_indexes, gpu_count, - rotated_buffer, lwe_array, rotations, - num_blocks, big_lwe_size); + host_radix_blocks_rotate_left(streams, gpu_indexes, gpu_count, + rotated_buffer, lwe_array, rotations, + num_blocks, big_lwe_size); cuda_memcpy_async_gpu_to_gpu(lwe_array, rotated_buffer, num_blocks * big_lwe_size_bytes, streams[0], gpu_indexes[0]); @@ -213,7 +213,7 @@ __host__ void host_integer_radix_arithmetic_scalar_shift_kb_inplace( } auto lut_univariate_padding_block = mem->lut_buffers_univariate[num_bits_in_block - 1]; - integer_radix_apply_univariate_lookup_table_kb( + 
integer_radix_apply_univariate_lookup_table_kb( mem->local_streams_1, gpu_indexes, gpu_count, padding_block, last_block_copy, bsks, ksks, 1, lut_univariate_padding_block); // Replace blocks 'pulled' from the left with the correct padding @@ -227,7 +227,7 @@ __host__ void host_integer_radix_arithmetic_scalar_shift_kb_inplace( if (shift_within_block != 0) { auto lut_univariate_shift_last_block = mem->lut_buffers_univariate[shift_within_block - 1]; - integer_radix_apply_univariate_lookup_table_kb( + integer_radix_apply_univariate_lookup_table_kb( mem->local_streams_2, gpu_indexes, gpu_count, last_block, last_block_copy, bsks, ksks, 1, lut_univariate_shift_last_block); } diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/shift_and_rotate.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/shift_and_rotate.cuh index 801c0f131a..ffd70a75e5 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/shift_and_rotate.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/integer/shift_and_rotate.cuh @@ -88,9 +88,9 @@ __host__ void host_integer_radix_shift_and_rotate_kb_inplace( switch (mem->shift_type) { case LEFT_SHIFT: // rotate right as the blocks are from LSB to MSB - host_radix_blocks_rotate_right(streams, gpu_indexes, gpu_count, - rotated_input, input_bits_b, rotations, - total_nb_bits, big_lwe_size); + host_radix_blocks_rotate_right( + streams, gpu_indexes, gpu_count, rotated_input, input_bits_b, + rotations, total_nb_bits, big_lwe_size); if (mem->is_signed && mem->shift_type == RIGHT_SHIFT) for (int i = 0; i < rotations; i++) @@ -103,9 +103,9 @@ __host__ void host_integer_radix_shift_and_rotate_kb_inplace( break; case RIGHT_SHIFT: // rotate left as the blocks are from LSB to MSB - host_radix_blocks_rotate_left(streams, gpu_indexes, gpu_count, - rotated_input, input_bits_b, rotations, - total_nb_bits, big_lwe_size); + host_radix_blocks_rotate_left( + streams, gpu_indexes, gpu_count, rotated_input, input_bits_b, + rotations, total_nb_bits, big_lwe_size); if (mem->is_signed) for (int i = 0; i < rotations; i++) @@ -119,15 +119,15 @@ __host__ void host_integer_radix_shift_and_rotate_kb_inplace( break; case LEFT_ROTATE: // rotate right as the blocks are from LSB to MSB - host_radix_blocks_rotate_right(streams, gpu_indexes, gpu_count, - rotated_input, input_bits_b, rotations, - total_nb_bits, big_lwe_size); + host_radix_blocks_rotate_right( + streams, gpu_indexes, gpu_count, rotated_input, input_bits_b, + rotations, total_nb_bits, big_lwe_size); break; case RIGHT_ROTATE: // rotate left as the blocks are from LSB to MSB - host_radix_blocks_rotate_left(streams, gpu_indexes, gpu_count, - rotated_input, input_bits_b, rotations, - total_nb_bits, big_lwe_size); + host_radix_blocks_rotate_left( + streams, gpu_indexes, gpu_count, rotated_input, input_bits_b, + rotations, total_nb_bits, big_lwe_size); break; default: PANIC("Unknown operation") @@ -137,20 +137,21 @@ __host__ void host_integer_radix_shift_and_rotate_kb_inplace( // control_bit|b|a cuda_memset_async(mux_inputs, 0, total_nb_bits * big_lwe_size_bytes, streams[0], gpu_indexes[0]); // Do we need this? 
- pack_bivariate_blocks(streams, gpu_indexes, gpu_count, mux_inputs, - mux_lut->lwe_indexes_out, rotated_input, input_bits_a, - mux_lut->lwe_indexes_in, big_lwe_dimension, 2, - total_nb_bits); + pack_bivariate_blocks(streams, gpu_indexes, gpu_count, mux_inputs, + mux_lut->lwe_indexes_out, rotated_input, + input_bits_a, mux_lut->lwe_indexes_in, + big_lwe_dimension, 2, total_nb_bits); // The shift bit is already properly aligned/positioned for (int i = 0; i < total_nb_bits; i++) - host_addition(streams[0], gpu_indexes[0], mux_inputs + i * big_lwe_size, - mux_inputs + i * big_lwe_size, shift_bit, - mem->params.big_lwe_dimension, 1); + host_addition(streams[0], gpu_indexes[0], + mux_inputs + i * big_lwe_size, + mux_inputs + i * big_lwe_size, shift_bit, + mem->params.big_lwe_dimension, 1); // we have // control_bit|b|a - integer_radix_apply_univariate_lookup_table_kb( + integer_radix_apply_univariate_lookup_table_kb( streams, gpu_indexes, gpu_count, input_bits_a, mux_inputs, bsks, ksks, total_nb_bits, mux_lut); } @@ -179,8 +180,8 @@ __host__ void host_integer_radix_shift_and_rotate_kb_inplace( auto bit_to_add = input_bits_a + i * big_lwe_size; for (int j = 0; j < num_radix_blocks; j++) { - host_addition(streams[0], gpu_indexes[0], block, block, bit_to_add, - big_lwe_dimension, 1); + host_addition(streams[0], gpu_indexes[0], block, block, bit_to_add, + big_lwe_dimension, 1); block += big_lwe_size; bit_to_add += bits_per_block * big_lwe_size; @@ -188,7 +189,7 @@ __host__ void host_integer_radix_shift_and_rotate_kb_inplace( // To give back a clean ciphertext auto cleaning_lut = mem->cleaning_lut; - integer_radix_apply_univariate_lookup_table_kb( + integer_radix_apply_univariate_lookup_table_kb( streams, gpu_indexes, gpu_count, lwe_last_out, lwe_last_out, bsks, ksks, num_radix_blocks, cleaning_lut); } diff --git a/backends/tfhe-cuda-backend/cuda/src/linearalgebra/addition.cu b/backends/tfhe-cuda-backend/cuda/src/linearalgebra/addition.cu index a9d590b93b..17c06326f6 100644 --- a/backends/tfhe-cuda-backend/cuda/src/linearalgebra/addition.cu +++ b/backends/tfhe-cuda-backend/cuda/src/linearalgebra/addition.cu @@ -11,11 +11,11 @@ void cuda_add_lwe_ciphertext_vector_32(void *stream, uint32_t gpu_index, uint32_t input_lwe_dimension, uint32_t input_lwe_ciphertext_count) { - host_addition(static_cast(stream), gpu_index, - static_cast(lwe_array_out), - static_cast(lwe_array_in_1), - static_cast(lwe_array_in_2), input_lwe_dimension, - input_lwe_ciphertext_count); + host_addition(static_cast(stream), gpu_index, + static_cast(lwe_array_out), + static_cast(lwe_array_in_1), + static_cast(lwe_array_in_2), + input_lwe_dimension, input_lwe_ciphertext_count); } /* @@ -51,11 +51,11 @@ void cuda_add_lwe_ciphertext_vector_64(void *stream, uint32_t gpu_index, uint32_t input_lwe_dimension, uint32_t input_lwe_ciphertext_count) { - host_addition(static_cast(stream), gpu_index, - static_cast(lwe_array_out), - static_cast(lwe_array_in_1), - static_cast(lwe_array_in_2), input_lwe_dimension, - input_lwe_ciphertext_count); + host_addition(static_cast(stream), gpu_index, + static_cast(lwe_array_out), + static_cast(lwe_array_in_1), + static_cast(lwe_array_in_2), + input_lwe_dimension, input_lwe_ciphertext_count); } /* * Perform the addition of a u32 input LWE ciphertext vector with a u32 @@ -66,11 +66,12 @@ void cuda_add_lwe_ciphertext_vector_plaintext_vector_32( void *plaintext_array_in, uint32_t input_lwe_dimension, uint32_t input_lwe_ciphertext_count) { - host_addition_plaintext(static_cast(stream), gpu_index, - 
static_cast(lwe_array_out), - static_cast(lwe_array_in), - static_cast(plaintext_array_in), - input_lwe_dimension, input_lwe_ciphertext_count); + host_addition_plaintext( + static_cast(stream), gpu_index, + static_cast(lwe_array_out), + static_cast(lwe_array_in), + static_cast(plaintext_array_in), input_lwe_dimension, + input_lwe_ciphertext_count); } /* * Perform the addition of a u64 input LWE ciphertext vector with a u64 input @@ -105,9 +106,10 @@ void cuda_add_lwe_ciphertext_vector_plaintext_vector_64( void *plaintext_array_in, uint32_t input_lwe_dimension, uint32_t input_lwe_ciphertext_count) { - host_addition_plaintext(static_cast(stream), gpu_index, - static_cast(lwe_array_out), - static_cast(lwe_array_in), - static_cast(plaintext_array_in), - input_lwe_dimension, input_lwe_ciphertext_count); + host_addition_plaintext( + static_cast(stream), gpu_index, + static_cast(lwe_array_out), + static_cast(lwe_array_in), + static_cast(plaintext_array_in), input_lwe_dimension, + input_lwe_ciphertext_count); } diff --git a/backends/tfhe-cuda-backend/cuda/src/linearalgebra/addition.cuh b/backends/tfhe-cuda-backend/cuda/src/linearalgebra/addition.cuh index 7e7695989f..8e0b627446 100644 --- a/backends/tfhe-cuda-backend/cuda/src/linearalgebra/addition.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/linearalgebra/addition.cuh @@ -43,7 +43,7 @@ host_addition_plaintext(cudaStream_t stream, uint32_t gpu_index, T *output, cuda_memcpy_async_gpu_to_gpu(output, lwe_input, (lwe_dimension + 1) * lwe_ciphertext_count, stream, gpu_index); - plaintext_addition<<>>( + plaintext_addition<<>>( output, lwe_input, plaintext_input, lwe_dimension, num_entries); check_cuda_error(cudaGetLastError()); } @@ -78,7 +78,7 @@ __host__ void host_addition(cudaStream_t stream, uint32_t gpu_index, T *output, dim3 grid(num_blocks, 1, 1); dim3 thds(num_threads, 1, 1); - addition<<>>(output, input_1, input_2, num_entries); + addition<<>>(output, input_1, input_2, num_entries); check_cuda_error(cudaGetLastError()); } @@ -112,7 +112,8 @@ __host__ void host_subtraction(cudaStream_t stream, uint32_t gpu_index, dim3 grid(num_blocks, 1, 1); dim3 thds(num_threads, 1, 1); - subtraction<<>>(output, input_1, input_2, num_entries); + subtraction + <<>>(output, input_1, input_2, num_entries); check_cuda_error(cudaGetLastError()); } @@ -150,7 +151,7 @@ __host__ void host_subtraction_plaintext(cudaStream_t stream, (input_lwe_dimension + 1) * sizeof(T), stream, gpu_index); - radix_body_subtraction_inplace<<>>( + radix_body_subtraction_inplace<<>>( output, plaintext_input, input_lwe_dimension, num_entries); check_cuda_error(cudaGetLastError()); } @@ -176,7 +177,6 @@ __global__ void unchecked_sub_with_correcting_term( } } template - __host__ void host_unchecked_sub_with_correcting_term( cudaStream_t stream, uint32_t gpu_index, T *output, T *input_1, T *input_2, uint32_t input_lwe_dimension, uint32_t input_lwe_ciphertext_count, @@ -193,7 +193,7 @@ __host__ void host_unchecked_sub_with_correcting_term( dim3 grid(num_blocks, 1, 1); dim3 thds(num_threads, 1, 1); - unchecked_sub_with_correcting_term<<>>( + unchecked_sub_with_correcting_term<<>>( output, input_1, input_2, num_entries, lwe_size, message_modulus, carry_modulus, degree); check_cuda_error(cudaGetLastError()); diff --git a/backends/tfhe-cuda-backend/cuda/src/linearalgebra/multiplication.cu b/backends/tfhe-cuda-backend/cuda/src/linearalgebra/multiplication.cu index a64c15378d..1c424b336c 100644 --- a/backends/tfhe-cuda-backend/cuda/src/linearalgebra/multiplication.cu +++ 
b/backends/tfhe-cuda-backend/cuda/src/linearalgebra/multiplication.cu @@ -9,7 +9,7 @@ void cuda_mult_lwe_ciphertext_vector_cleartext_vector_32( void *cleartext_array_in, uint32_t input_lwe_dimension, uint32_t input_lwe_ciphertext_count) { - host_cleartext_vec_multiplication( + host_cleartext_vec_multiplication( static_cast(stream), gpu_index, static_cast(lwe_array_out), static_cast(lwe_array_in), @@ -49,7 +49,7 @@ void cuda_mult_lwe_ciphertext_vector_cleartext_vector_64( void *cleartext_array_in, uint32_t input_lwe_dimension, uint32_t input_lwe_ciphertext_count) { - host_cleartext_vec_multiplication( + host_cleartext_vec_multiplication( static_cast(stream), gpu_index, static_cast(lwe_array_out), static_cast(lwe_array_in), diff --git a/backends/tfhe-cuda-backend/cuda/src/linearalgebra/multiplication.cuh b/backends/tfhe-cuda-backend/cuda/src/linearalgebra/multiplication.cuh index b9864973a0..9f9d396ed4 100644 --- a/backends/tfhe-cuda-backend/cuda/src/linearalgebra/multiplication.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/linearalgebra/multiplication.cuh @@ -46,7 +46,7 @@ host_cleartext_vec_multiplication(cudaStream_t stream, uint32_t gpu_index, dim3 grid(num_blocks, 1, 1); dim3 thds(num_threads, 1, 1); - cleartext_vec_multiplication<<>>( + cleartext_vec_multiplication<<>>( output, lwe_input, cleartext_input, input_lwe_dimension, num_entries); check_cuda_error(cudaGetLastError()); } @@ -82,7 +82,7 @@ host_cleartext_multiplication(cudaStream_t stream, uint32_t gpu_index, dim3 grid(num_blocks, 1, 1); dim3 thds(num_threads, 1, 1); - cleartext_multiplication<<>>( + cleartext_multiplication<<>>( output, lwe_input, cleartext_input, input_lwe_dimension, num_entries); check_cuda_error(cudaGetLastError()); } diff --git a/backends/tfhe-cuda-backend/cuda/src/linearalgebra/negation.cu b/backends/tfhe-cuda-backend/cuda/src/linearalgebra/negation.cu index b91115fca3..a1465e390f 100644 --- a/backends/tfhe-cuda-backend/cuda/src/linearalgebra/negation.cu +++ b/backends/tfhe-cuda-backend/cuda/src/linearalgebra/negation.cu @@ -10,10 +10,10 @@ void cuda_negate_lwe_ciphertext_vector_32(void *stream, uint32_t gpu_index, uint32_t input_lwe_dimension, uint32_t input_lwe_ciphertext_count) { - host_negation(static_cast(stream), gpu_index, - static_cast(lwe_array_out), - static_cast(lwe_array_in), input_lwe_dimension, - input_lwe_ciphertext_count); + host_negation(static_cast(stream), gpu_index, + static_cast(lwe_array_out), + static_cast(lwe_array_in), + input_lwe_dimension, input_lwe_ciphertext_count); } /* @@ -44,8 +44,8 @@ void cuda_negate_lwe_ciphertext_vector_64(void *stream, uint32_t gpu_index, uint32_t input_lwe_dimension, uint32_t input_lwe_ciphertext_count) { - host_negation(static_cast(stream), gpu_index, - static_cast(lwe_array_out), - static_cast(lwe_array_in), input_lwe_dimension, - input_lwe_ciphertext_count); + host_negation(static_cast(stream), gpu_index, + static_cast(lwe_array_out), + static_cast(lwe_array_in), + input_lwe_dimension, input_lwe_ciphertext_count); } diff --git a/backends/tfhe-cuda-backend/cuda/src/linearalgebra/negation.cuh b/backends/tfhe-cuda-backend/cuda/src/linearalgebra/negation.cuh index b1faeb6156..e8c4ba0a69 100644 --- a/backends/tfhe-cuda-backend/cuda/src/linearalgebra/negation.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/linearalgebra/negation.cuh @@ -37,7 +37,7 @@ __host__ void host_negation(cudaStream_t stream, uint32_t gpu_index, T *output, dim3 grid(num_blocks, 1, 1); dim3 thds(num_threads, 1, 1); - negation<<>>(output, input, num_entries); + negation<<>>(output, 
input, num_entries); check_cuda_error(cudaGetLastError()); } diff --git a/backends/tfhe-cuda-backend/src/cuda_bind.rs b/backends/tfhe-cuda-backend/src/cuda_bind.rs index 3ce55b8503..0d1e1c12c6 100644 --- a/backends/tfhe-cuda-backend/src/cuda_bind.rs +++ b/backends/tfhe-cuda-backend/src/cuda_bind.rs @@ -323,11 +323,12 @@ extern "C" { mem_ptr: *mut *mut i8, ); - pub fn cuda_negate_integer_radix_ciphertext_64_inplace( + pub fn cuda_negate_integer_radix_ciphertext_64( streams: *const *mut c_void, gpu_indexes: *const u32, gpu_count: u32, - lwe_array: *mut c_void, + lwe_array_out: *mut c_void, + lwe_array_in: *const c_void, lwe_dimension: u32, lwe_ciphertext_count: u32, message_modulus: u32, diff --git a/tfhe/src/core_crypto/gpu/mod.rs b/tfhe/src/core_crypto/gpu/mod.rs index d95cfa2bac..adc6717971 100644 --- a/tfhe/src/core_crypto/gpu/mod.rs +++ b/tfhe/src/core_crypto/gpu/mod.rs @@ -551,19 +551,21 @@ pub unsafe fn negate_lwe_ciphertext_vector_assign_async( /// /// [CudaStreams::synchronize] __must__ be called as soon as synchronization is /// required -pub unsafe fn negate_integer_radix_assign_async( +pub unsafe fn negate_integer_radix_async( streams: &CudaStreams, - lwe_array: &mut CudaVec, + lwe_array_out: &mut CudaVec, + lwe_array_in: &CudaVec, lwe_dimension: LweDimension, num_samples: u32, message_modulus: u32, carry_modulus: u32, ) { - cuda_negate_integer_radix_ciphertext_64_inplace( + cuda_negate_integer_radix_ciphertext_64( streams.ptr.as_ptr(), streams.gpu_indexes.as_ptr(), streams.len() as u32, - lwe_array.as_mut_c_ptr(0), + lwe_array_out.as_mut_c_ptr(0), + lwe_array_in.as_c_ptr(0), lwe_dimension.0 as u32, num_samples, message_modulus, diff --git a/tfhe/src/integer/gpu/server_key/radix/neg.rs b/tfhe/src/integer/gpu/server_key/radix/neg.rs index f3fd1d60c5..6af4269916 100644 --- a/tfhe/src/integer/gpu/server_key/radix/neg.rs +++ b/tfhe/src/integer/gpu/server_key/radix/neg.rs @@ -1,4 +1,4 @@ -use crate::core_crypto::gpu::{negate_integer_radix_assign_async, CudaStreams}; +use crate::core_crypto::gpu::{negate_integer_radix_async, CudaStreams}; use crate::integer::gpu::ciphertext::CudaIntegerRadixCiphertext; use crate::integer::gpu::server_key::CudaServerKey; @@ -58,49 +58,26 @@ impl CudaServerKey { pub unsafe fn unchecked_neg_async( &self, ctxt: &T, - stream: &CudaStreams, + streams: &CudaStreams, ) -> T { - let mut result = ctxt.duplicate_async(stream); - self.unchecked_neg_assign_async(&mut result, stream); - result - } + let mut ciphertext_out = ctxt.duplicate_async(streams); + let lwe_dimension = ctxt.as_ref().d_blocks.lwe_dimension(); + let lwe_ciphertext_count = ctxt.as_ref().d_blocks.lwe_ciphertext_count(); - /// # Safety - /// - /// - `stream` __must__ be synchronized to guarantee computation has finished, and inputs must - /// not be dropped until stream is synchronised - pub unsafe fn unchecked_neg_assign_async( - &self, - ctxt: &mut T, - stream: &CudaStreams, - ) { - let ciphertext = ctxt.as_mut(); - let lwe_dimension = ciphertext.d_blocks.lwe_dimension(); - let lwe_ciphertext_count = ciphertext.d_blocks.lwe_ciphertext_count(); + let info = ctxt.as_ref().info.blocks.first().unwrap(); - let info = ciphertext.info.blocks.first().unwrap(); - - negate_integer_radix_assign_async( - stream, - &mut ciphertext.d_blocks.0.d_vec, + negate_integer_radix_async( + streams, + &mut ciphertext_out.as_mut().d_blocks.0.d_vec, + &ctxt.as_ref().d_blocks.0.d_vec, lwe_dimension, lwe_ciphertext_count.0 as u32, info.message_modulus.0 as u32, info.carry_modulus.0 as u32, ); - ciphertext.info = 
ciphertext.info.after_neg(); - } - - pub fn unchecked_neg_assign( - &self, - ctxt: &mut T, - stream: &CudaStreams, - ) { - unsafe { - self.unchecked_neg_assign_async(ctxt, stream); - } - stream.synchronize(); + ciphertext_out.as_mut().info = ctxt.as_ref().info.after_neg(); + ciphertext_out } /// Homomorphically computes the opposite of a ciphertext encrypting an integer message. @@ -141,9 +118,9 @@ impl CudaServerKey { /// let dec: u64 = cks.decrypt(&res); /// assert_eq!(modulus - msg, dec); /// ``` - pub fn neg(&self, ctxt: &T, stream: &CudaStreams) -> T { - let mut result = unsafe { ctxt.duplicate_async(stream) }; - self.neg_assign(&mut result, stream); + pub fn neg(&self, ctxt: &T, streams: &CudaStreams) -> T { + let result = unsafe { self.neg_async(ctxt, streams) }; + streams.synchronize(); result } @@ -151,29 +128,23 @@ impl CudaServerKey { /// /// - `stream` __must__ be synchronized to guarantee computation has finished, and inputs must /// not be dropped until stream is synchronised - pub unsafe fn neg_assign_async( + pub unsafe fn neg_async( &self, - ctxt: &mut T, - stream: &CudaStreams, - ) { + ctxt: &T, + streams: &CudaStreams, + ) -> T { let mut tmp_ctxt; let ct = if ctxt.block_carries_are_empty() { ctxt } else { - tmp_ctxt = ctxt.duplicate_async(stream); - self.full_propagate_assign_async(&mut tmp_ctxt, stream); + tmp_ctxt = ctxt.duplicate_async(streams); + self.full_propagate_assign_async(&mut tmp_ctxt, streams); &mut tmp_ctxt }; - self.unchecked_neg_assign_async(ct, stream); - let _carry = self.propagate_single_carry_assign_async(ct, stream); - } - - pub fn neg_assign(&self, ctxt: &mut T, stream: &CudaStreams) { - unsafe { - self.neg_assign_async(ctxt, stream); - } - stream.synchronize(); + let mut res = self.unchecked_neg_async(ct, streams); + let _carry = self.propagate_single_carry_assign_async(&mut res, streams); + res } }
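The Rust-side changes above replace the in-place negation entry points (`unchecked_neg_assign_async`, `neg_assign_async`, `neg_assign`) with out-of-place ones that return a fresh ciphertext. Below is a minimal caller-side sketch of the new shape; it is illustrative only and assumes `sks` (a `CudaServerKey`), `streams` (a `CudaStreams`), and `d_ctxt` (a radix ciphertext already on the GPU) have been created as in the existing `neg` doc-test, which is elided here.

    // Synchronous path: `neg` now builds the result ciphertext itself and
    // synchronizes the streams before returning it.
    let d_res = sks.neg(&d_ctxt, &streams);

    // Asynchronous path: the caller owns synchronization and, per the safety
    // contract, must not drop the input until the streams are synchronized.
    let d_res_async = unsafe { sks.unchecked_neg_async(&d_ctxt, &streams) };
    streams.synchronize();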