diff --git a/backends/tfhe-cuda-backend/cuda/include/integer.h b/backends/tfhe-cuda-backend/cuda/include/integer.h index f5f8124aee..c7e07c9efb 100644 --- a/backends/tfhe-cuda-backend/cuda/include/integer.h +++ b/backends/tfhe-cuda-backend/cuda/include/integer.h @@ -3,6 +3,7 @@ #include "bootstrap.h" #include "bootstrap_multibit.h" +#include "pbs/bootstrap.cuh" #include #include #include @@ -297,31 +298,11 @@ template struct int_radix_lut { (params.glwe_dimension + 1) * params.polynomial_size * sizeof(Torus); /////////////// - // PBS - if (params.pbs_type == MULTI_BIT) { - // Only 64 bits is supported - static_assert( - sizeof(Torus) == 8, - "Error (GPU multi bit PBS): only 64 bits Torus is supported"); - scratch_cuda_multi_bit_pbs_64( - stream, &pbs_buffer, params.small_lwe_dimension, - params.glwe_dimension, params.polynomial_size, params.pbs_level, - params.grouping_factor, num_radix_blocks, - cuda_get_max_shared_memory(stream->gpu_index), allocate_gpu_memory); - } else { - // Classic - // We only use low latency for classic mode - if (sizeof(Torus) == sizeof(uint32_t)) - scratch_cuda_bootstrap_low_latency_32( - stream, &pbs_buffer, params.glwe_dimension, params.polynomial_size, - params.pbs_level, num_radix_blocks, - cuda_get_max_shared_memory(stream->gpu_index), allocate_gpu_memory); - else - scratch_cuda_bootstrap_low_latency_64( - stream, &pbs_buffer, params.glwe_dimension, params.polynomial_size, - params.pbs_level, num_radix_blocks, - cuda_get_max_shared_memory(stream->gpu_index), allocate_gpu_memory); - } + execute_scratch_pbs( + stream, &pbs_buffer, params.glwe_dimension, params.small_lwe_dimension, + params.polynomial_size, params.pbs_level, params.grouping_factor, + num_radix_blocks, cuda_get_max_shared_memory(stream->gpu_index), + params.pbs_type, allocate_gpu_memory); if (allocate_gpu_memory) { // Allocate LUT diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh index 
f257eeaab8..8da2b2aaa3 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh @@ -1,88 +1,17 @@ #ifndef CUDA_INTEGER_CUH #define CUDA_INTEGER_CUH +#include "bootstrap.h" #include "crypto/keyswitch.cuh" #include "device.h" #include "integer.h" #include "integer/scalar_addition.cuh" #include "linear_algebra.h" #include "linearalgebra/addition.cuh" -#include "pbs/bootstrap_low_latency.cuh" -#include "pbs/bootstrap_multibit.cuh" #include "polynomial/functions.cuh" #include "utils/kernel_dimensions.cuh" #include -template -void execute_pbs(cuda_stream_t *stream, Torus *lwe_array_out, - Torus *lwe_output_indexes, Torus *lut_vector, - Torus *lut_vector_indexes, Torus *lwe_array_in, - Torus *lwe_input_indexes, void *bootstrapping_key, - int8_t *pbs_buffer, uint32_t glwe_dimension, - uint32_t lwe_dimension, uint32_t polynomial_size, - uint32_t base_log, uint32_t level_count, - uint32_t grouping_factor, uint32_t input_lwe_ciphertext_count, - uint32_t num_luts, uint32_t lwe_idx, - uint32_t max_shared_memory, PBS_TYPE pbs_type) { - if (sizeof(Torus) == sizeof(uint32_t)) { - // 32 bits - switch (pbs_type) { - case MULTI_BIT: - printf("multibit\n"); - printf("Error: 32-bit multibit PBS is not supported.\n"); - break; - case LOW_LAT: - cuda_bootstrap_low_latency_lwe_ciphertext_vector_32( - stream, lwe_array_out, lwe_output_indexes, lut_vector, - lut_vector_indexes, lwe_array_in, lwe_input_indexes, - bootstrapping_key, pbs_buffer, lwe_dimension, glwe_dimension, - polynomial_size, base_log, level_count, input_lwe_ciphertext_count, - num_luts, lwe_idx, max_shared_memory); - break; - case AMORTIZED: - cuda_bootstrap_amortized_lwe_ciphertext_vector_32( - stream, lwe_array_out, lwe_output_indexes, lut_vector, - lut_vector_indexes, lwe_array_in, lwe_input_indexes, - bootstrapping_key, pbs_buffer, lwe_dimension, glwe_dimension, - polynomial_size, base_log, level_count, input_lwe_ciphertext_count, - num_luts, 
lwe_idx, max_shared_memory); - break; - default: - break; - } - } else { - // 64 bits - switch (pbs_type) { - case MULTI_BIT: - cuda_multi_bit_pbs_lwe_ciphertext_vector_64( - stream, lwe_array_out, lwe_output_indexes, lut_vector, - lut_vector_indexes, lwe_array_in, lwe_input_indexes, - bootstrapping_key, pbs_buffer, lwe_dimension, glwe_dimension, - polynomial_size, grouping_factor, base_log, level_count, - input_lwe_ciphertext_count, num_luts, lwe_idx, max_shared_memory); - break; - case LOW_LAT: - cuda_bootstrap_low_latency_lwe_ciphertext_vector_64( - stream, lwe_array_out, lwe_output_indexes, lut_vector, - lut_vector_indexes, lwe_array_in, lwe_input_indexes, - bootstrapping_key, pbs_buffer, lwe_dimension, glwe_dimension, - polynomial_size, base_log, level_count, input_lwe_ciphertext_count, - num_luts, lwe_idx, max_shared_memory); - break; - case AMORTIZED: - cuda_bootstrap_amortized_lwe_ciphertext_vector_64( - stream, lwe_array_out, lwe_output_indexes, lut_vector, - lut_vector_indexes, lwe_array_in, lwe_input_indexes, - bootstrapping_key, pbs_buffer, lwe_dimension, glwe_dimension, - polynomial_size, base_log, level_count, input_lwe_ciphertext_count, - num_luts, lwe_idx, max_shared_memory); - break; - default: - break; - } - } -} - // function rotates right radix ciphertext with specific value // grid is one dimensional // blockIdx.x represents x_th block of radix ciphertext @@ -187,12 +116,12 @@ __host__ void integer_radix_apply_univariate_lookup_table_kb( lut->lwe_indexes, ksk, big_lwe_dimension, small_lwe_dimension, ks_base_log, ks_level, num_radix_blocks); - execute_pbs(stream, lwe_array_out, lut->lwe_indexes, lut->lut, - lut->lut_indexes, lut->tmp_lwe_after_ks, lut->lwe_indexes, bsk, - lut->pbs_buffer, glwe_dimension, small_lwe_dimension, - polynomial_size, pbs_base_log, pbs_level, grouping_factor, - num_radix_blocks, 1, 0, - cuda_get_max_shared_memory(stream->gpu_index), pbs_type); + execute_pbs(stream, lwe_array_out, lut->lwe_indexes, lut->lut, + 
lut->lut_indexes, lut->tmp_lwe_after_ks, lut->lwe_indexes, + bsk, lut->pbs_buffer, glwe_dimension, small_lwe_dimension, + polynomial_size, pbs_base_log, pbs_level, grouping_factor, + num_radix_blocks, 1, 0, + cuda_get_max_shared_memory(stream->gpu_index), pbs_type); } template @@ -471,31 +400,12 @@ void scratch_cuda_full_propagation( uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory) { - // PBS int8_t *pbs_buffer; - if (pbs_type == MULTI_BIT) { - uint32_t lwe_chunk_size = get_average_lwe_chunk_size( - lwe_dimension, pbs_level, glwe_dimension, num_radix_blocks); - // Only 64 bits is supported - scratch_cuda_multi_bit_pbs_64(stream, &pbs_buffer, lwe_dimension, - glwe_dimension, polynomial_size, pbs_level, - grouping_factor, num_radix_blocks, - cuda_get_max_shared_memory(stream->gpu_index), - allocate_gpu_memory, lwe_chunk_size); - } else { - // Classic - // We only use low latency for classic mode - if (sizeof(Torus) == sizeof(uint32_t)) - scratch_cuda_bootstrap_low_latency_32( - stream, &pbs_buffer, glwe_dimension, polynomial_size, pbs_level, - num_radix_blocks, cuda_get_max_shared_memory(stream->gpu_index), - allocate_gpu_memory); - else - scratch_cuda_bootstrap_low_latency_64( - stream, &pbs_buffer, glwe_dimension, polynomial_size, pbs_level, - num_radix_blocks, cuda_get_max_shared_memory(stream->gpu_index), - allocate_gpu_memory); - } + execute_scratch_pbs(stream, &pbs_buffer, glwe_dimension, lwe_dimension, + polynomial_size, pbs_level, grouping_factor, + num_radix_blocks, + cuda_get_max_shared_memory(stream->gpu_index), + pbs_type, allocate_gpu_memory); // LUT Torus *lut_buffer; diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh index 9303e213a4..f02d42b921 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh @@ -7,15 +7,11 @@ #endif 
#include "bootstrap.h" -#include "bootstrap_multibit.h" #include "crypto/keyswitch.cuh" #include "device.h" #include "integer.h" #include "integer/integer.cuh" #include "linear_algebra.h" -#include "pbs/bootstrap_amortized.cuh" -#include "pbs/bootstrap_low_latency.cuh" -#include "pbs/bootstrap_multibit.cuh" #include "utils/helper.cuh" #include "utils/kernel_dimensions.cuh" #include diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/bootstrap.cu b/backends/tfhe-cuda-backend/cuda/src/pbs/bootstrap.cu index 73ade26d94..81cc429136 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/bootstrap.cu +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/bootstrap.cu @@ -1 +1 @@ -#include "bootstrapping_key.cuh" +#include "bootstrap.cuh" diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/bootstrap.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/bootstrap.cuh new file mode 100644 index 0000000000..6da130412a --- /dev/null +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/bootstrap.cuh @@ -0,0 +1,136 @@ +#include "../../include/bootstrap.h" +#include "../../include/device.h" +#include "../include/device.h" +#include "bootstrap_low_latency.cuh" +#include "bootstrap_multibit.cuh" + +template +void execute_pbs(cuda_stream_t *stream, Torus *lwe_array_out, + Torus *lwe_output_indexes, Torus *lut_vector, + Torus *lut_vector_indexes, Torus *lwe_array_in, + Torus *lwe_input_indexes, void *bootstrapping_key, + int8_t *pbs_buffer, uint32_t glwe_dimension, + uint32_t lwe_dimension, uint32_t polynomial_size, + uint32_t base_log, uint32_t level_count, + uint32_t grouping_factor, uint32_t input_lwe_ciphertext_count, + uint32_t num_luts, uint32_t lwe_idx, + uint32_t max_shared_memory, PBS_TYPE pbs_type) { + switch (sizeof(Torus)) { + case sizeof(uint32_t): + // 32 bits + switch (pbs_type) { + case MULTI_BIT: + PANIC("Error: 32-bit multibit PBS is not supported.\n"); + case LOW_LAT: + cuda_bootstrap_low_latency_lwe_ciphertext_vector_32( + stream, lwe_array_out, lwe_output_indexes, lut_vector, + 
lut_vector_indexes, lwe_array_in, lwe_input_indexes, + bootstrapping_key, pbs_buffer, lwe_dimension, glwe_dimension, + polynomial_size, base_log, level_count, input_lwe_ciphertext_count, + num_luts, lwe_idx, max_shared_memory); + break; + case AMORTIZED: + cuda_bootstrap_amortized_lwe_ciphertext_vector_32( + stream, lwe_array_out, lwe_output_indexes, lut_vector, + lut_vector_indexes, lwe_array_in, lwe_input_indexes, + bootstrapping_key, pbs_buffer, lwe_dimension, glwe_dimension, + polynomial_size, base_log, level_count, input_lwe_ciphertext_count, + num_luts, lwe_idx, max_shared_memory); + break; + default: + break; + } + break; + case sizeof(uint64_t): + // 64 bits + switch (pbs_type) { + case MULTI_BIT: + cuda_multi_bit_pbs_lwe_ciphertext_vector_64( + stream, lwe_array_out, lwe_output_indexes, lut_vector, + lut_vector_indexes, lwe_array_in, lwe_input_indexes, + bootstrapping_key, pbs_buffer, lwe_dimension, glwe_dimension, + polynomial_size, grouping_factor, base_log, level_count, + input_lwe_ciphertext_count, num_luts, lwe_idx, max_shared_memory); + break; + case LOW_LAT: + cuda_bootstrap_low_latency_lwe_ciphertext_vector_64( + stream, lwe_array_out, lwe_output_indexes, lut_vector, + lut_vector_indexes, lwe_array_in, lwe_input_indexes, + bootstrapping_key, pbs_buffer, lwe_dimension, glwe_dimension, + polynomial_size, base_log, level_count, input_lwe_ciphertext_count, + num_luts, lwe_idx, max_shared_memory); + break; + case AMORTIZED: + cuda_bootstrap_amortized_lwe_ciphertext_vector_64( + stream, lwe_array_out, lwe_output_indexes, lut_vector, + lut_vector_indexes, lwe_array_in, lwe_input_indexes, + bootstrapping_key, pbs_buffer, lwe_dimension, glwe_dimension, + polynomial_size, base_log, level_count, input_lwe_ciphertext_count, + num_luts, lwe_idx, max_shared_memory); + break; + default: + PANIC("Error: unsupported cuda PBS type."); + } + break; + default: + PANIC("Cuda error: unsupported modulus size: only 32 and 64 bit integer " + "moduli are supported."); + } 
+} + +template +void execute_scratch_pbs(cuda_stream_t *stream, int8_t **pbs_buffer, + uint32_t glwe_dimension, uint32_t lwe_dimension, + uint32_t polynomial_size, uint32_t level_count, + uint32_t grouping_factor, + uint32_t input_lwe_ciphertext_count, + uint32_t max_shared_memory, PBS_TYPE pbs_type, + bool allocate_gpu_memory) { + switch (sizeof(Torus)) { + case sizeof(uint32_t): + // 32 bits + switch (pbs_type) { + case MULTI_BIT: + PANIC("Error: 32-bit multibit PBS is not supported.\n"); + case LOW_LAT: + scratch_cuda_bootstrap_low_latency_32( + stream, pbs_buffer, glwe_dimension, polynomial_size, level_count, + input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory); + break; + case AMORTIZED: + scratch_cuda_bootstrap_amortized_32( + stream, pbs_buffer, glwe_dimension, polynomial_size, + input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory); + break; + default: + PANIC("Error: unsupported cuda PBS type."); + } + break; + case sizeof(uint64_t): + // 64 bits + switch (pbs_type) { + case MULTI_BIT: + scratch_cuda_multi_bit_pbs_64( + stream, pbs_buffer, lwe_dimension, glwe_dimension, polynomial_size, + level_count, grouping_factor, input_lwe_ciphertext_count, + max_shared_memory, allocate_gpu_memory); + break; + case LOW_LAT: + scratch_cuda_bootstrap_low_latency_64( + stream, pbs_buffer, glwe_dimension, polynomial_size, level_count, + input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory); + break; + case AMORTIZED: + scratch_cuda_bootstrap_amortized_64( + stream, pbs_buffer, glwe_dimension, polynomial_size, + input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory); + break; + default: + PANIC("Error: unsupported cuda PBS type."); + } + break; + default: + PANIC("Cuda error: unsupported modulus size: only 32 and 64 bit integer " + "moduli are supported."); + } +} diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/bootstrap_fast_multibit.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/bootstrap_fast_multibit.cuh 
index 2b10bd6ec8..b74eb989e1 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/bootstrap_fast_multibit.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/bootstrap_fast_multibit.cuh @@ -3,7 +3,6 @@ #include "bootstrap.h" #include "bootstrap_multibit.cuh" -#include "bootstrap_multibit.h" #include "cooperative_groups.h" #include "crypto/gadget.cuh" #include "crypto/ggsw.cuh" diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/bootstrapping_key.cu b/backends/tfhe-cuda-backend/cuda/src/pbs/bootstrapping_key.cu new file mode 100644 index 0000000000..821e12b6f8 --- /dev/null +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/bootstrapping_key.cu @@ -0,0 +1,78 @@ +#include "bootstrapping_key.cuh" + +void cuda_convert_lwe_bootstrap_key_32(void *dest, void *src, + cuda_stream_t *stream, + uint32_t input_lwe_dim, + uint32_t glwe_dim, uint32_t level_count, + uint32_t polynomial_size) { + uint32_t total_polynomials = + input_lwe_dim * (glwe_dim + 1) * (glwe_dim + 1) * level_count; + cuda_convert_lwe_bootstrap_key( + (double2 *)dest, (int32_t *)src, stream, input_lwe_dim, glwe_dim, + level_count, polynomial_size, total_polynomials); +} + +void cuda_convert_lwe_bootstrap_key_64(void *dest, void *src, + cuda_stream_t *stream, + uint32_t input_lwe_dim, + uint32_t glwe_dim, uint32_t level_count, + uint32_t polynomial_size) { + uint32_t total_polynomials = + input_lwe_dim * (glwe_dim + 1) * (glwe_dim + 1) * level_count; + cuda_convert_lwe_bootstrap_key( + (double2 *)dest, (int64_t *)src, stream, input_lwe_dim, glwe_dim, + level_count, polynomial_size, total_polynomials); +} + +void cuda_convert_lwe_multi_bit_bootstrap_key_64( + void *dest, void *src, cuda_stream_t *stream, uint32_t input_lwe_dim, + uint32_t glwe_dim, uint32_t level_count, uint32_t polynomial_size, + uint32_t grouping_factor) { + uint32_t total_polynomials = input_lwe_dim * (glwe_dim + 1) * (glwe_dim + 1) * + level_count * (1 << grouping_factor) / + grouping_factor; + size_t buffer_size = total_polynomials *
polynomial_size * sizeof(uint64_t); + + cuda_memcpy_async_to_gpu((uint64_t *)dest, (uint64_t *)src, buffer_size, + stream); +} + +// We need these lines so the compiler knows how to specialize these functions +template __device__ uint64_t *get_ith_mask_kth_block(uint64_t *ptr, int i, + int k, int level, + uint32_t polynomial_size, + int glwe_dimension, + uint32_t level_count); +template __device__ uint32_t *get_ith_mask_kth_block(uint32_t *ptr, int i, + int k, int level, + uint32_t polynomial_size, + int glwe_dimension, + uint32_t level_count); +template __device__ double2 *get_ith_mask_kth_block(double2 *ptr, int i, int k, + int level, + uint32_t polynomial_size, + int glwe_dimension, + uint32_t level_count); +template __device__ uint64_t *get_ith_body_kth_block(uint64_t *ptr, int i, + int k, int level, + uint32_t polynomial_size, + int glwe_dimension, + uint32_t level_count); +template __device__ uint32_t *get_ith_body_kth_block(uint32_t *ptr, int i, + int k, int level, + uint32_t polynomial_size, + int glwe_dimension, + uint32_t level_count); +template __device__ double2 *get_ith_body_kth_block(double2 *ptr, int i, int k, + int level, + uint32_t polynomial_size, + int glwe_dimension, + uint32_t level_count); + +template __device__ uint64_t *get_multi_bit_ith_lwe_gth_group_kth_block( + uint64_t *ptr, int g, int i, int k, int level, uint32_t grouping_factor, + uint32_t polynomial_size, uint32_t glwe_dimension, uint32_t level_count); + +template __device__ double2 *get_multi_bit_ith_lwe_gth_group_kth_block( + double2 *ptr, int g, int i, int k, int level, uint32_t grouping_factor, + uint32_t polynomial_size, uint32_t glwe_dimension, uint32_t level_count); diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/bootstrapping_key.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/bootstrapping_key.cuh index d009b6e7c6..7c1a3a9f5f 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/bootstrapping_key.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/bootstrapping_key.cuh @@ 
-244,43 +244,6 @@ void cuda_convert_lwe_bootstrap_key(double2 *dest, ST *src, free(h_bsk); } -void cuda_convert_lwe_bootstrap_key_32(void *dest, void *src, - cuda_stream_t *stream, - uint32_t input_lwe_dim, - uint32_t glwe_dim, uint32_t level_count, - uint32_t polynomial_size) { - uint32_t total_polynomials = - input_lwe_dim * (glwe_dim + 1) * (glwe_dim + 1) * level_count; - cuda_convert_lwe_bootstrap_key( - (double2 *)dest, (int32_t *)src, stream, input_lwe_dim, glwe_dim, - level_count, polynomial_size, total_polynomials); -} - -void cuda_convert_lwe_bootstrap_key_64(void *dest, void *src, - cuda_stream_t *stream, - uint32_t input_lwe_dim, - uint32_t glwe_dim, uint32_t level_count, - uint32_t polynomial_size) { - uint32_t total_polynomials = - input_lwe_dim * (glwe_dim + 1) * (glwe_dim + 1) * level_count; - cuda_convert_lwe_bootstrap_key( - (double2 *)dest, (int64_t *)src, stream, input_lwe_dim, glwe_dim, - level_count, polynomial_size, total_polynomials); -} - -void cuda_convert_lwe_multi_bit_bootstrap_key_64( - void *dest, void *src, cuda_stream_t *stream, uint32_t input_lwe_dim, - uint32_t glwe_dim, uint32_t level_count, uint32_t polynomial_size, - uint32_t grouping_factor) { - uint32_t total_polynomials = input_lwe_dim * (glwe_dim + 1) * (glwe_dim + 1) * - level_count * (1 << grouping_factor) / - grouping_factor; - size_t buffer_size = total_polynomials * polynomial_size * sizeof(uint64_t); - - cuda_memcpy_async_to_gpu((uint64_t *)dest, (uint64_t *)src, buffer_size, - stream); -} - void cuda_fourier_polynomial_mul(void *_input1, void *_input2, void *_output, cuda_stream_t *stream, uint32_t polynomial_size, @@ -458,43 +421,4 @@ void cuda_fourier_polynomial_mul(void *_input1, void *_input2, void *_output, cuda_drop_async(buffer, stream); } -// We need these lines so the compiler knows how to specialize these functions -template __device__ uint64_t *get_ith_mask_kth_block(uint64_t *ptr, int i, - int k, int level, - uint32_t polynomial_size, - int glwe_dimension, 
- uint32_t level_count); -template __device__ uint32_t *get_ith_mask_kth_block(uint32_t *ptr, int i, - int k, int level, - uint32_t polynomial_size, - int glwe_dimension, - uint32_t level_count); -template __device__ double2 *get_ith_mask_kth_block(double2 *ptr, int i, int k, - int level, - uint32_t polynomial_size, - int glwe_dimension, - uint32_t level_count); -template __device__ uint64_t *get_ith_body_kth_block(uint64_t *ptr, int i, - int k, int level, - uint32_t polynomial_size, - int glwe_dimension, - uint32_t level_count); -template __device__ uint32_t *get_ith_body_kth_block(uint32_t *ptr, int i, - int k, int level, - uint32_t polynomial_size, - int glwe_dimension, - uint32_t level_count); -template __device__ double2 *get_ith_body_kth_block(double2 *ptr, int i, int k, - int level, - uint32_t polynomial_size, - int glwe_dimension, - uint32_t level_count); - -template __device__ uint64_t *get_multi_bit_ith_lwe_gth_group_kth_block( - uint64_t *ptr, int g, int i, int k, int level, uint32_t grouping_factor, - uint32_t polynomial_size, uint32_t glwe_dimension, uint32_t level_count); - -template __device__ double2 *get_multi_bit_ith_lwe_gth_group_kth_block( - double2 *ptr, int g, int i, int k, int level, uint32_t grouping_factor, - uint32_t polynomial_size, uint32_t glwe_dimension, uint32_t level_count); #endif // CNCRT_BSK_H