diff --git a/backends/tfhe-cuda-backend/cuda/include/programmable_bootstrap.h b/backends/tfhe-cuda-backend/cuda/include/programmable_bootstrap.h
index 4a9a0c64a4..260c037b6a 100644
--- a/backends/tfhe-cuda-backend/cuda/include/programmable_bootstrap.h
+++ b/backends/tfhe-cuda-backend/cuda/include/programmable_bootstrap.h
@@ -411,6 +411,12 @@ __device__ inline int get_start_ith_ggsw(int i, uint32_t polynomial_size,
                                          int glwe_dimension,
                                          uint32_t level_count);
 
+template <typename T>
+__device__ const T *get_ith_mask_kth_block(const T *ptr, int i, int k,
+                                           int level, uint32_t polynomial_size,
+                                           int glwe_dimension,
+                                           uint32_t level_count);
+
 template <typename T>
 __device__ T *get_ith_mask_kth_block(T *ptr, int i, int k, int level,
                                      uint32_t polynomial_size,
@@ -422,8 +428,8 @@ __device__ T *get_ith_body_kth_block(T *ptr, int i, int k, int level,
                                      int glwe_dimension, uint32_t level_count);
 
 template <typename T>
-__device__ T *get_multi_bit_ith_lwe_gth_group_kth_block(
-    T *ptr, int g, int i, int k, int level, uint32_t grouping_factor,
+__device__ const T *get_multi_bit_ith_lwe_gth_group_kth_block(
+    const T *ptr, int g, int i, int k, int level, uint32_t grouping_factor,
     uint32_t polynomial_size, uint32_t glwe_dimension, uint32_t level_count);
 
 #endif
diff --git a/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cuh b/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cuh
index 964637afaf..a1bc861f76 100644
--- a/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cuh
@@ -38,11 +38,13 @@ __device__ Torus *get_ith_block(Torus *ksk, int i, int level,
 // threads in y are used to paralelize the lwe_dimension_in loop.
 // shared memory is used to store intermediate results of the reduction.
 template <typename Torus>
-__global__ void keyswitch(Torus *lwe_array_out, Torus *lwe_output_indexes,
-                          Torus *lwe_array_in, Torus *lwe_input_indexes,
-                          Torus *ksk, uint32_t lwe_dimension_in,
-                          uint32_t lwe_dimension_out, uint32_t base_log,
-                          uint32_t level_count, int gpu_offset) {
+__global__ void
+keyswitch(Torus *lwe_array_out, const Torus *__restrict__ lwe_output_indexes,
+          const Torus *__restrict__ lwe_array_in,
+          const Torus *__restrict__ lwe_input_indexes,
+          const Torus *__restrict__ ksk, uint32_t lwe_dimension_in,
+          uint32_t lwe_dimension_out, uint32_t base_log, uint32_t level_count,
+          int gpu_offset) {
 
   const int tid = threadIdx.x + blockIdx.x * blockDim.x;
   const int shmem_index = threadIdx.x + threadIdx.y * blockDim.x;
diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/bootstraping_key.cu b/backends/tfhe-cuda-backend/cuda/src/pbs/bootstraping_key.cu
index 703c31cdd0..abea6deba8 100644
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/bootstraping_key.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/bootstraping_key.cu
@@ -36,6 +36,18 @@ void cuda_convert_lwe_multi_bit_programmable_bootstrap_key_64(
 }
 
 // We need these lines so the compiler knows how to specialize these functions
+template __device__ const uint64_t *
+get_ith_mask_kth_block(const uint64_t *ptr, int i, int k, int level,
+                       uint32_t polynomial_size, int glwe_dimension,
+                       uint32_t level_count);
+template __device__ const uint32_t *
+get_ith_mask_kth_block(const uint32_t *ptr, int i, int k, int level,
+                       uint32_t polynomial_size, int glwe_dimension,
+                       uint32_t level_count);
+template __device__ const double2 *
+get_ith_mask_kth_block(const double2 *ptr, int i, int k, int level,
+                       uint32_t polynomial_size, int glwe_dimension,
+                       uint32_t level_count);
 template __device__ uint64_t *get_ith_mask_kth_block(uint64_t *ptr, int i,
                                                      int k, int level,
                                                      uint32_t polynomial_size,
@@ -51,6 +63,7 @@ template __device__ double2 *get_ith_mask_kth_block(double2 *ptr, int i, int k,
                                                     uint32_t polynomial_size,
                                                     int glwe_dimension,
                                                     uint32_t level_count);
+
 template __device__ uint64_t *get_ith_body_kth_block(uint64_t *ptr, int i,
                                                      int k, int level,
                                                      uint32_t polynomial_size,
@@ -67,10 +80,12 @@ template __device__ double2 *get_ith_body_kth_block(double2 *ptr, int i, int k,
                                                     int glwe_dimension,
                                                     uint32_t level_count);
 
-template __device__ uint64_t *get_multi_bit_ith_lwe_gth_group_kth_block(
-    uint64_t *ptr, int g, int i, int k, int level, uint32_t grouping_factor,
-    uint32_t polynomial_size, uint32_t glwe_dimension, uint32_t level_count);
+template __device__ const uint64_t *get_multi_bit_ith_lwe_gth_group_kth_block(
+    const uint64_t *ptr, int g, int i, int k, int level,
+    uint32_t grouping_factor, uint32_t polynomial_size, uint32_t glwe_dimension,
+    uint32_t level_count);
 
-template __device__ double2 *get_multi_bit_ith_lwe_gth_group_kth_block(
-    double2 *ptr, int g, int i, int k, int level, uint32_t grouping_factor,
-    uint32_t polynomial_size, uint32_t glwe_dimension, uint32_t level_count);
+template __device__ const double2 *get_multi_bit_ith_lwe_gth_group_kth_block(
+    const double2 *ptr, int g, int i, int k, int level,
+    uint32_t grouping_factor, uint32_t polynomial_size, uint32_t glwe_dimension,
+    uint32_t level_count);
diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/bootstrapping_key.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/bootstrapping_key.cuh
index eef7951c88..8ee59ff3a9 100644
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/bootstrapping_key.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/bootstrapping_key.cuh
@@ -17,6 +17,18 @@ __device__ inline int get_start_ith_ggsw(int i, uint32_t polynomial_size,
 }
 
 ////////////////////////////////////////////////
+template <typename T>
+__device__ const T *get_ith_mask_kth_block(const T *ptr, int i, int k,
+                                           int level, uint32_t polynomial_size,
+                                           int glwe_dimension,
+                                           uint32_t level_count) {
+  return &ptr[get_start_ith_ggsw(i, polynomial_size, glwe_dimension,
+                                 level_count) +
+              level * polynomial_size / 2 * (glwe_dimension + 1) *
+                  (glwe_dimension + 1) +
+              k * polynomial_size / 2 * (glwe_dimension + 1)];
+}
+
 template <typename T>
 __device__ T *get_ith_mask_kth_block(T *ptr, int i, int k, int level,
                                      uint32_t polynomial_size,
@@ -27,7 +39,6 @@ __device__ T *get_ith_mask_kth_block(T *ptr, int i, int k, int level,
                   (glwe_dimension + 1) +
               k * polynomial_size / 2 * (glwe_dimension + 1)];
 }
-
 template <typename T>
 __device__ T *get_ith_body_kth_block(T *ptr, int i, int k, int level,
                                      uint32_t polynomial_size,
@@ -50,14 +61,16 @@ __device__ inline int get_start_ith_lwe(uint32_t i, uint32_t grouping_factor,
 }
 
 template <typename T>
-__device__ T *get_multi_bit_ith_lwe_gth_group_kth_block(
-    T *ptr, int g, int i, int k, int level, uint32_t grouping_factor,
+__device__ const T *get_multi_bit_ith_lwe_gth_group_kth_block(
+    const T *ptr, int g, int i, int k, int level, uint32_t grouping_factor,
     uint32_t polynomial_size, uint32_t glwe_dimension, uint32_t level_count) {
-  T *ptr_group = ptr + get_start_ith_lwe(i, grouping_factor, polynomial_size,
-                                         glwe_dimension, level_count);
+  const T *ptr_group =
+      ptr + get_start_ith_lwe(i, grouping_factor, polynomial_size,
+                              glwe_dimension, level_count);
 
   return get_ith_mask_kth_block(ptr_group, g, k, level, polynomial_size,
                                 glwe_dimension, level_count);
 }
+
 ////////////////////////////////////////////////
 
 template <typename T>
 void cuda_convert_lwe_programmable_bootstrap_key(cudaStream_t stream,
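A note on the explicit instantiations added to bootstraping_key.cu: the const getters are function templates defined in a header that only this .cu compiles, so every qualified form a kernel may call must be instantiated there explicitly, or the const overload has no symbol to link against. A minimal standalone sketch of the pattern (hypothetical names, not the real key layout):

#include <cstdint>

// nth_slice returns a read-only view into a packed key buffer; the mutable
// overload is kept for code paths that still write through the pointer.
template <typename T>
__device__ const T *nth_slice(const T *ptr, int n, uint32_t slice_len) {
  return &ptr[(size_t)n * slice_len];
}

template <typename T>
__device__ T *nth_slice(T *ptr, int n, uint32_t slice_len) {
  return &ptr[(size_t)n * slice_len];
}

// Explicit instantiations, mirroring bootstraping_key.cu: without these, a
// kernel in another translation unit (built with nvcc -rdc=true) calling
// nth_slice on a const uint64_t * would fail to link.
template __device__ const uint64_t *nth_slice(const uint64_t *, int, uint32_t);
template __device__ uint64_t *nth_slice(uint64_t *, int, uint32_t);

Overload resolution then picks the const form whenever the argument is a const pointer, which is what lets the kernels below accept const bootstrapping keys without a cast.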
diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap.cuh
index 4a542ff5f5..4b5a2f1f3a 100644
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap.cuh
@@ -22,11 +22,11 @@ get_join_buffer_element(int level_id, int glwe_id, G &group,
                         uint32_t glwe_dimension, bool support_dsm);
 
 template <typename Torus, class params, typename G>
-__device__ void mul_ggsw_glwe(Torus *accumulator, double2 *fft,
-                              double2 *join_buffer, double2 *bootstrapping_key,
-                              int polynomial_size, uint32_t glwe_dimension,
-                              int level_count, int iteration, G &group,
-                              bool support_dsm = false) {
+__device__ void
+mul_ggsw_glwe(Torus *accumulator, double2 *fft, double2 *join_buffer,
+              const double2 *__restrict__ bootstrapping_key,
+              int polynomial_size, uint32_t glwe_dimension, int level_count,
+              int iteration, G &group, bool support_dsm = false) {
 
   // Switch to the FFT space
   NSMFFT_direct<HalfDegree<params>>(fft);
diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_amortized.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_amortized.cuh
index eae1b593f9..0f1f98a2ac 100644
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_amortized.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_amortized.cuh
@@ -47,11 +47,14 @@ template <typename Torus, class params, sharedMemDegree SMD>
  * is not FULLSM
  */
 __global__ void device_programmable_bootstrap_amortized(
-    Torus *lwe_array_out, Torus *lwe_output_indexes, Torus *lut_vector,
-    Torus *lut_vector_indexes, Torus *lwe_array_in, Torus *lwe_input_indexes,
-    double2 *bootstrapping_key, int8_t *device_mem, uint32_t glwe_dimension,
-    uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log,
-    uint32_t level_count, uint32_t lwe_idx,
+    Torus *lwe_array_out, const Torus *__restrict__ lwe_output_indexes,
+    const Torus *__restrict__ lut_vector,
+    const Torus *__restrict__ lut_vector_indexes,
+    const Torus *__restrict__ lwe_array_in,
+    const Torus *__restrict__ lwe_input_indexes,
+    const double2 *__restrict__ bootstrapping_key, int8_t *device_mem,
+    uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size,
+    uint32_t base_log, uint32_t level_count, uint32_t lwe_idx,
     size_t device_memory_size_per_sample, uint32_t gpu_offset) {
   // We use shared memory for the polynomials that are used often during the
   // bootstrap, since shared memory is kept in L1 cache and accessing it is
@@ -81,7 +84,7 @@ __global__ void device_programmable_bootstrap_amortized(
   auto block_lwe_array_in =
       &lwe_array_in[lwe_input_indexes[blockIdx.x + gpu_offset] *
                     (lwe_dimension + 1)];
 
-  Torus *block_lut_vector =
+  const Torus *block_lut_vector =
       &lut_vector[lut_vector_indexes[lwe_idx + blockIdx.x] * params::degree *
                   (glwe_dimension + 1)];
diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_classic.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_classic.cuh
index 97c01bdfeb..681ce13913 100644
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_classic.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_classic.cuh
@@ -36,12 +36,15 @@ namespace cg = cooperative_groups;
  */
 template <typename Torus, class params, sharedMemDegree SMD>
 __global__ void device_programmable_bootstrap_cg(
-    Torus *lwe_array_out, Torus *lwe_output_indexes, Torus *lut_vector,
-    Torus *lut_vector_indexes, Torus *lwe_array_in, Torus *lwe_input_indexes,
-    double2 *bootstrapping_key, double2 *join_buffer, uint32_t lwe_dimension,
-    uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
-    int8_t *device_mem, uint64_t device_memory_size_per_block,
-    uint32_t gpu_offset) {
+    Torus *lwe_array_out, const Torus *__restrict__ lwe_output_indexes,
+    const Torus *__restrict__ lut_vector,
+    const Torus *__restrict__ lut_vector_indexes,
+    const Torus *__restrict__ lwe_array_in,
+    const Torus *__restrict__ lwe_input_indexes,
+    const double2 *__restrict__ bootstrapping_key, double2 *join_buffer,
+    uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log,
+    uint32_t level_count, int8_t *device_mem,
+    uint64_t device_memory_size_per_block, uint32_t gpu_offset) {
 
   grid_group grid = this_grid();
 
@@ -74,12 +77,13 @@ __global__ void device_programmable_bootstrap_cg(
 
   // The third dimension of the block is used to determine on which ciphertext
   // this block is operating, in the case of batch bootstraps
-  Torus *block_lwe_array_in =
+  const Torus *block_lwe_array_in =
       &lwe_array_in[lwe_input_indexes[blockIdx.z + gpu_offset] *
                     (lwe_dimension + 1)];
 
-  Torus *block_lut_vector = &lut_vector[lut_vector_indexes[blockIdx.z] *
-                                        params::degree * (glwe_dimension + 1)];
+  const Torus *block_lut_vector =
+      &lut_vector[lut_vector_indexes[blockIdx.z] * params::degree *
+                  (glwe_dimension + 1)];
 
   double2 *block_join_buffer =
       &join_buffer[blockIdx.z * level_count * (glwe_dimension + 1) *
diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_multibit.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_multibit.cuh
index 5eba4f2f8f..18cce4e8ef 100644
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_multibit.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_multibit.cuh
@@ -19,12 +19,15 @@ template <typename Torus, class params, sharedMemDegree SMD>
 __global__ void device_multi_bit_programmable_bootstrap_cg_accumulate(
-    Torus *lwe_array_out, Torus *lwe_output_indexes, Torus *lut_vector,
-    Torus *lut_vector_indexes, Torus *lwe_array_in, Torus *lwe_input_indexes,
-    double2 *keybundle_array, double2 *join_buffer, Torus *global_accumulator,
-    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t base_log, uint32_t level_count, uint32_t grouping_factor,
-    uint32_t lwe_offset, uint32_t lwe_chunk_size,
+    Torus *lwe_array_out, const Torus *__restrict__ lwe_output_indexes,
+    const Torus *__restrict__ lut_vector,
+    const Torus *__restrict__ lut_vector_indexes,
+    const Torus *__restrict__ lwe_array_in,
+    const Torus *__restrict__ lwe_input_indexes,
+    const double2 *__restrict__ keybundle_array, double2 *join_buffer,
+    Torus *global_accumulator, uint32_t lwe_dimension, uint32_t glwe_dimension,
+    uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
+    uint32_t grouping_factor, uint32_t lwe_offset, uint32_t lwe_chunk_size,
     uint32_t keybundle_size_per_input, int8_t *device_mem,
     uint64_t device_memory_size_per_block, uint32_t gpu_offset) {
 
@@ -54,12 +57,13 @@ __global__ void device_multi_bit_programmable_bootstrap_cg_accumulate(
 
   // The third dimension of the block is used to determine on which ciphertext
   // this block is operating, in the case of batch bootstraps
-  Torus *block_lwe_array_in =
+  const Torus *block_lwe_array_in =
      &lwe_array_in[lwe_input_indexes[blockIdx.z + gpu_offset] *
                    (lwe_dimension + 1)];
 
-  Torus *block_lut_vector = &lut_vector[lut_vector_indexes[blockIdx.z] *
-                                        params::degree * (glwe_dimension + 1)];
+  const Torus *block_lut_vector =
+      &lut_vector[lut_vector_indexes[blockIdx.z] * params::degree *
+                  (glwe_dimension + 1)];
 
   double2 *block_join_buffer =
       &join_buffer[blockIdx.z * level_count * (glwe_dimension + 1) *
@@ -69,9 +73,9 @@ __global__ void device_multi_bit_programmable_bootstrap_cg_accumulate(
       global_accumulator +
       (blockIdx.y + blockIdx.z * (glwe_dimension + 1)) * params::degree;
 
-  double2 *keybundle = keybundle_array +
-                       // select the input
-                       blockIdx.z * keybundle_size_per_input;
+  const double2 *keybundle = keybundle_array +
+                             // select the input
+                             blockIdx.z * keybundle_size_per_input;
 
   if (lwe_offset == 0) {
     // Put "b" in [0, 2N[
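The signature change repeated across these kernels is the point of the patch: buffers the kernel only reads (index tables, LUTs, input ciphertexts, bootstrapping keys and keybundles) become const Torus *__restrict__, while everything written (output ciphertexts, join buffers, accumulators) stays mutable. const documents intent, and __restrict__ promises the compiler that no store in the kernel aliases these loads, which lets nvcc route them through the read-only data cache (the __ldg path) and hoist them across stores. A minimal sketch of the same gather pattern (hypothetical kernel, not part of the PR):

#include <cstdint>

// Each thread reads in[] through an index table, like lwe_input_indexes in
// the PBS kernels above; both read-only buffers carry const + __restrict__.
__global__ void gather_add(uint64_t *out, const uint64_t *__restrict__ in,
                           const uint64_t *__restrict__ indexes, uint32_t n) {
  uint32_t i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n)
    out[i] = in[indexes[i]] + 1; // loads may use the read-only cache
}

Host call sites need no change: a mutable uint64_t * argument converts implicitly to const uint64_t *__restrict__ at the kernel boundary, so only the kernels and the device helpers they call have to be updated.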
diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cuh
index baa50954c1..0f4dec5fcb 100644
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cuh
@@ -18,11 +18,14 @@ template <typename Torus, class params, sharedMemDegree SMD>
 __global__ void device_programmable_bootstrap_step_one(
-    Torus *lut_vector, Torus *lut_vector_indexes, Torus *lwe_array_in,
-    Torus *lwe_input_indexes, double2 *bootstrapping_key,
-    Torus *global_accumulator, double2 *global_accumulator_fft,
-    uint32_t lwe_iteration, uint32_t lwe_dimension, uint32_t polynomial_size,
-    uint32_t base_log, uint32_t level_count, int8_t *device_mem,
+    const Torus *__restrict__ lut_vector,
+    const Torus *__restrict__ lut_vector_indexes,
+    const Torus *__restrict__ lwe_array_in,
+    const Torus *__restrict__ lwe_input_indexes,
+    const double2 *__restrict__ bootstrapping_key, Torus *global_accumulator,
+    double2 *global_accumulator_fft, uint32_t lwe_iteration,
+    uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log,
+    uint32_t level_count, int8_t *device_mem,
     uint64_t device_memory_size_per_block, uint32_t gpu_offset) {
 
   // We use shared memory for the polynomials that are used often during the
@@ -50,12 +53,13 @@ __global__ void device_programmable_bootstrap_step_one(
 
   // The third dimension of the block is used to determine on which ciphertext
   // this block is operating, in the case of batch bootstraps
-  Torus *block_lwe_array_in =
+  const Torus *block_lwe_array_in =
       &lwe_array_in[lwe_input_indexes[blockIdx.z + gpu_offset] *
                     (lwe_dimension + 1)];
 
-  Torus *block_lut_vector = &lut_vector[lut_vector_indexes[blockIdx.z] *
-                                        params::degree * (glwe_dimension + 1)];
+  const Torus *block_lut_vector =
+      &lut_vector[lut_vector_indexes[blockIdx.z] * params::degree *
+                  (glwe_dimension + 1)];
 
   Torus *global_slice =
       global_accumulator +
@@ -129,11 +133,13 @@ __global__ void device_programmable_bootstrap_step_one(
 
 template <typename Torus, class params, sharedMemDegree SMD>
 __global__ void device_programmable_bootstrap_step_two(
-    Torus *lwe_array_out, Torus *lwe_output_indexes, Torus *lut_vector,
-    Torus *lut_vector_indexes, double2 *bootstrapping_key,
-    Torus *global_accumulator, double2 *global_accumulator_fft,
-    uint32_t lwe_iteration, uint32_t lwe_dimension, uint32_t polynomial_size,
-    uint32_t base_log, uint32_t level_count, int8_t *device_mem,
+    Torus *lwe_array_out, const Torus *__restrict__ lwe_output_indexes,
+    const Torus *__restrict__ lut_vector,
+    const Torus *__restrict__ lut_vector_indexes,
+    const double2 *__restrict__ bootstrapping_key, Torus *global_accumulator,
+    double2 *global_accumulator_fft, uint32_t lwe_iteration,
+    uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log,
+    uint32_t level_count, int8_t *device_mem,
     uint64_t device_memory_size_per_block, uint32_t gpu_offset) {
 
   // We use shared memory for the polynomials that are used often during the
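One caveat worth stating for the step-one/step-two kernels above, and for every kernel in this patch: __restrict__ is an unchecked promise. The compiler is free to cache and reorder loads from a __restrict__ pointer past stores, so a launch where an output buffer overlapped a qualified input would be undefined behavior. A sketch of the contract (hypothetical kernel):

#include <cstdint>

__global__ void axpy(uint64_t *out, const uint64_t *__restrict__ x, uint64_t a,
                     uint32_t n) {
  uint32_t i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n)
    out[i] += a * x[i]; // assumes out and x never overlap
}

// fine:   axpy<<<grid, block>>>(d_out, d_x, 3, n);   // distinct buffers
// broken: axpy<<<grid, block>>>(d_buf, d_buf, 3, n); // out aliases x: UB

The PBS kernels satisfy the promise by construction: none of them ever writes through the pointers this patch qualifies, which is exactly what the added const asserts.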
diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cuh
index 8b8abc0f71..b2069d2f65 100644
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cuh
@@ -18,7 +18,7 @@
 #include <vector>
 
 template <typename Torus>
-__device__ Torus calculates_monomial_degree(Torus *lwe_array_group,
+__device__ Torus calculates_monomial_degree(const Torus *lwe_array_group,
                                             uint32_t ggsw_idx,
                                             uint32_t grouping_factor) {
   Torus x = 0;
@@ -34,12 +34,14 @@ __device__ Torus calculates_monomial_degree(Torus *lwe_array_group,
 
 template <typename Torus, class params, sharedMemDegree SMD>
 __global__ void device_multi_bit_programmable_bootstrap_keybundle(
-    Torus *lwe_array_in, Torus *lwe_input_indexes, double2 *keybundle_array,
-    Torus *bootstrapping_key, uint32_t lwe_dimension, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log,
-    uint32_t level_count, uint32_t lwe_offset, uint32_t lwe_chunk_size,
-    uint32_t keybundle_size_per_input, int8_t *device_mem,
-    uint64_t device_memory_size_per_block, uint32_t gpu_offset) {
+    const Torus *__restrict__ lwe_array_in,
+    const Torus *__restrict__ lwe_input_indexes, double2 *keybundle_array,
+    const Torus *__restrict__ bootstrapping_key, uint32_t lwe_dimension,
+    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
+    uint32_t base_log, uint32_t level_count, uint32_t lwe_offset,
+    uint32_t lwe_chunk_size, uint32_t keybundle_size_per_input,
+    int8_t *device_mem, uint64_t device_memory_size_per_block,
+    uint32_t gpu_offset) {
 
   extern __shared__ int8_t sharedmem[];
   int8_t *selected_memory = sharedmem;
@@ -63,7 +65,7 @@ __global__ void device_multi_bit_programmable_bootstrap_keybundle(
 
   //
   Torus *accumulator = (Torus *)selected_memory;
 
-  Torus *block_lwe_array_in =
+  const Torus *block_lwe_array_in =
      &lwe_array_in[lwe_input_indexes[input_idx + gpu_offset] *
                    (lwe_dimension + 1)];
 
@@ -79,10 +81,10 @@ __global__ void device_multi_bit_programmable_bootstrap_keybundle(
 
     //
    ////////////////////////////////
    // Keygen guarantees the first term is a constant term of the polynomial, no
    // polynomial multiplication required
-    Torus *bsk_slice = get_multi_bit_ith_lwe_gth_group_kth_block(
+    const Torus *bsk_slice = get_multi_bit_ith_lwe_gth_group_kth_block(
         bootstrapping_key, 0, rev_lwe_iteration, glwe_id, level_id,
         grouping_factor, 2 * polynomial_size, glwe_dimension, level_count);
-    Torus *bsk_poly = bsk_slice + poly_id * params::degree;
+    const Torus *bsk_poly = bsk_slice + poly_id * params::degree;
 
     copy_polynomial<Torus, params::opt, params::degree / params::opt>(
         bsk_poly, accumulator);
@@ -90,13 +92,13 @@ __global__ void device_multi_bit_programmable_bootstrap_keybundle(
 
     // Accumulate the other terms
     for (int g = 1; g < (1 << grouping_factor); g++) {
 
-      Torus *bsk_slice = get_multi_bit_ith_lwe_gth_group_kth_block(
+      const Torus *bsk_slice = get_multi_bit_ith_lwe_gth_group_kth_block(
           bootstrapping_key, g, rev_lwe_iteration, glwe_id, level_id,
           grouping_factor, 2 * polynomial_size, glwe_dimension, level_count);
-      Torus *bsk_poly = bsk_slice + poly_id * params::degree;
+      const Torus *bsk_poly = bsk_slice + poly_id * params::degree;
 
       // Calculates the monomial degree
-      Torus *lwe_array_group =
+      const Torus *lwe_array_group =
           block_lwe_array_in + rev_lwe_iteration * grouping_factor;
       uint32_t monomial_degree = calculates_monomial_degree(
           lwe_array_group, g, grouping_factor);
@@ -148,8 +150,10 @@ __global__ void device_multi_bit_programmable_bootstrap_keybundle(
 
 template <typename Torus, class params, sharedMemDegree SMD>
 __global__ void device_multi_bit_programmable_bootstrap_accumulate_step_one(
-    Torus *lwe_array_in, Torus *lwe_input_indexes, Torus *lut_vector,
-    Torus *lut_vector_indexes, Torus *global_accumulator,
+    const Torus *__restrict__ lwe_array_in,
+    const Torus *__restrict__ lwe_input_indexes,
+    const Torus *__restrict__ lut_vector,
+    const Torus *__restrict__ lut_vector_indexes, Torus *global_accumulator,
     double2 *global_accumulator_fft, uint32_t lwe_dimension,
     uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
     uint32_t level_count, uint32_t lwe_iteration, int8_t *device_mem,
@@ -179,12 +183,13 @@ __global__ void device_multi_bit_programmable_bootstrap_accumulate_step_one(
   if constexpr (SMD == PARTIALSM)
     accumulator_fft = (double2 *)sharedmem;
 
-  Torus *block_lwe_array_in =
+  const Torus *block_lwe_array_in =
      &lwe_array_in[lwe_input_indexes[blockIdx.z + gpu_offset] *
                    (lwe_dimension + 1)];
 
-  Torus *block_lut_vector = &lut_vector[lut_vector_indexes[blockIdx.z] *
-                                        params::degree * (glwe_dimension + 1)];
+  const Torus *block_lut_vector =
+      &lut_vector[lut_vector_indexes[blockIdx.z] * params::degree *
+                  (glwe_dimension + 1)];
 
   Torus *global_slice =
       global_accumulator +
@@ -243,11 +248,12 @@ __global__ void device_multi_bit_programmable_bootstrap_accumulate_step_one(
 
 template <typename Torus, class params, sharedMemDegree SMD>
 __global__ void device_multi_bit_programmable_bootstrap_accumulate_step_two(
-    Torus *lwe_array_out, Torus *lwe_output_indexes, double2 *keybundle_array,
-    Torus *global_accumulator, double2 *global_accumulator_fft,
-    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t level_count, uint32_t grouping_factor, uint32_t iteration,
-    uint32_t lwe_offset, uint32_t lwe_chunk_size, int8_t *device_mem,
+    Torus *lwe_array_out, const Torus *__restrict__ lwe_output_indexes,
+    const double2 *__restrict__ keybundle_array, Torus *global_accumulator,
+    double2 *global_accumulator_fft, uint32_t lwe_dimension,
+    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
+    uint32_t grouping_factor, uint32_t iteration, uint32_t lwe_offset,
+    uint32_t lwe_chunk_size, int8_t *device_mem,
     uint64_t device_memory_size_per_block, uint32_t gpu_offset) {
   // We use shared memory for the polynomials that are used often during the
   // bootstrap, since shared memory is kept in L1 cache and accessing it is
@@ -268,11 +274,11 @@ __global__ void device_multi_bit_programmable_bootstrap_accumulate_step_two(
 
   double2 *accumulator_fft = (double2 *)selected_memory;
 
   //
-  double2 *keybundle = keybundle_array +
-                       // select the input
-                       blockIdx.x * lwe_chunk_size * level_count *
-                           (glwe_dimension + 1) * (glwe_dimension + 1) *
-                           (polynomial_size / 2);
+  const double2 *keybundle = keybundle_array +
+                             // select the input
+                             blockIdx.x * lwe_chunk_size * level_count *
+                                 (glwe_dimension + 1) * (glwe_dimension + 1) *
+                                 (polynomial_size / 2);
 
   double2 *global_accumulator_fft_input =
       global_accumulator_fft +
diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_classic.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_classic.cuh
index 71aaca6b6b..ff379bb919 100644
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_classic.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_classic.cuh
@@ -36,11 +36,15 @@ namespace cg = cooperative_groups;
  */
 template <typename Torus, class params, sharedMemDegree SMD>
 __global__ void device_programmable_bootstrap_tbc(
-    Torus *lwe_array_out, Torus *lwe_output_indexes, Torus *lut_vector,
-    Torus *lut_vector_indexes, Torus *lwe_array_in, Torus *lwe_input_indexes,
-    double2 *bootstrapping_key, double2 *join_buffer, uint32_t lwe_dimension,
-    uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
-    int8_t *device_mem, uint64_t device_memory_size_per_block, bool support_dsm,
+    Torus *lwe_array_out, const Torus *__restrict__ lwe_output_indexes,
+    const Torus *__restrict__ lut_vector,
+    const Torus *__restrict__ lut_vector_indexes,
+    const Torus *__restrict__ lwe_array_in,
+    const Torus *__restrict__ lwe_input_indexes,
+    const double2 *__restrict__ bootstrapping_key, double2 *join_buffer,
+    uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log,
+    uint32_t level_count, int8_t *device_mem,
+    uint64_t device_memory_size_per_block, bool support_dsm,
     uint32_t gpu_offset) {
 
   cluster_group cluster = this_cluster();
 
@@ -77,12 +81,13 @@ __global__ void device_programmable_bootstrap_tbc(
 
   // The third dimension of the block is used to determine on which ciphertext
   // this block is operating, in the case of batch bootstraps
-  Torus *block_lwe_array_in =
+  const Torus *block_lwe_array_in =
      &lwe_array_in[lwe_input_indexes[blockIdx.z + gpu_offset] *
                    (lwe_dimension + 1)];
 
-  Torus *block_lut_vector = &lut_vector[lut_vector_indexes[blockIdx.z] *
-                                        params::degree * (glwe_dimension + 1)];
+  const Torus *block_lut_vector =
+      &lut_vector[lut_vector_indexes[blockIdx.z] * params::degree *
+                  (glwe_dimension + 1)];
 
   double2 *block_join_buffer =
       &join_buffer[blockIdx.z * level_count * (glwe_dimension + 1) *
diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_multibit.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_multibit.cuh
index 9378cc832b..23f74a2d6b 100644
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_multibit.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_multibit.cuh
@@ -19,12 +19,15 @@ template <typename Torus, class params, sharedMemDegree SMD>
 __global__ void device_multi_bit_programmable_bootstrap_tbc_accumulate(
-    Torus *lwe_array_out, Torus *lwe_output_indexes, Torus *lut_vector,
-    Torus *lut_vector_indexes, Torus *lwe_array_in, Torus *lwe_input_indexes,
-    double2 *keybundle_array, double2 *join_buffer, Torus *global_accumulator,
-    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t base_log, uint32_t level_count, uint32_t grouping_factor,
-    uint32_t lwe_offset, uint32_t lwe_chunk_size,
+    Torus *lwe_array_out, const Torus *__restrict__ lwe_output_indexes,
+    const Torus *__restrict__ lut_vector,
+    const Torus *__restrict__ lut_vector_indexes,
+    const Torus *__restrict__ lwe_array_in,
+    const Torus *__restrict__ lwe_input_indexes,
+    const double2 *__restrict__ keybundle_array, double2 *join_buffer,
+    Torus *global_accumulator, uint32_t lwe_dimension, uint32_t glwe_dimension,
+    uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
+    uint32_t grouping_factor, uint32_t lwe_offset, uint32_t lwe_chunk_size,
     uint32_t keybundle_size_per_input, int8_t *device_mem,
     uint64_t device_memory_size_per_block, bool support_dsm,
     uint32_t gpu_offset) {
 
@@ -62,12 +65,13 @@ __global__ void device_multi_bit_programmable_bootstrap_tbc_accumulate(
 
   // The third dimension of the block is used to determine on which ciphertext
   // this block is operating, in the case of batch bootstraps
-  Torus *block_lwe_array_in =
+  const Torus *block_lwe_array_in =
      &lwe_array_in[lwe_input_indexes[blockIdx.z + gpu_offset] *
                    (lwe_dimension + 1)];
 
-  Torus *block_lut_vector = &lut_vector[lut_vector_indexes[blockIdx.z] *
-                                        params::degree * (glwe_dimension + 1)];
+  const Torus *block_lut_vector =
+      &lut_vector[lut_vector_indexes[blockIdx.z] * params::degree *
+                  (glwe_dimension + 1)];
 
   double2 *block_join_buffer =
       &join_buffer[blockIdx.z * level_count * (glwe_dimension + 1) *
@@ -77,9 +81,9 @@ __global__ void device_multi_bit_programmable_bootstrap_tbc_accumulate(
       global_accumulator +
       (blockIdx.y + blockIdx.z * (glwe_dimension + 1)) * params::degree;
 
-  double2 *keybundle = keybundle_array +
-                       // select the input
-                       blockIdx.z * keybundle_size_per_input;
+  const double2 *keybundle = keybundle_array +
+                             // select the input
+                             blockIdx.z * keybundle_size_per_input;
 
   if (lwe_offset == 0) {
     // Put "b" in [0, 2N[
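As the TBC variants above show, the const qualifier propagates: once lut_vector is const Torus *, every pointer derived from it (block_lut_vector, bsk_slice, keybundle, and so on) must be const as well, which is why each signature edit drags a matching set of body edits behind it. A reduced sketch of that chain (hypothetical helper, same arithmetic as block_lut_vector):

#include <cstdint>

__device__ const uint64_t *
select_lut(const uint64_t *lut_vector, const uint64_t *lut_vector_indexes,
           uint32_t degree, uint32_t glwe_dimension, uint32_t block) {
  // Returning const uint64_t * is mandatory here; dropping the qualifier
  // would require a const_cast, so read-only use is now compiler-enforced.
  return &lut_vector[lut_vector_indexes[block] * degree * (glwe_dimension + 1)];
}

The payoff is that any future code path that tried to write to a LUT or a keybundle from inside a bootstrap kernel becomes a compile error instead of a silent correctness bug.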
diff --git a/backends/tfhe-cuda-backend/cuda/src/polynomial/functions.cuh b/backends/tfhe-cuda-backend/cuda/src/polynomial/functions.cuh
index c81025afe6..d9c353721b 100644
--- a/backends/tfhe-cuda-backend/cuda/src/polynomial/functions.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/polynomial/functions.cuh
@@ -11,7 +11,8 @@
  * function compresses decomposed buffer into half size complex buffer for fft
  */
 template <class params>
-__device__ void real_to_complex_compressed(int16_t *src, double2 *dst) {
+__device__ void real_to_complex_compressed(const int16_t *__restrict__ src,
+                                           double2 *dst) {
   int tid = threadIdx.x;
 #pragma unroll
   for (int i = 0; i < params::opt / 2; i++) {
@@ -22,7 +23,7 @@ __device__ void real_to_complex_compressed(int16_t *src, double2 *dst) {
 }
 
 template <typename T, int elems_per_thread, int block_size>
-__device__ void copy_polynomial(T *source, T *dst) {
+__device__ void copy_polynomial(const T *__restrict__ source, T *dst) {
   int tid = threadIdx.x;
 #pragma unroll
   for (int i = 0; i < elems_per_thread; i++) {
@@ -41,13 +42,14 @@ __device__ void copy_polynomial(T *source, T *dst) {
  * By default, it works on a single polynomial.
  */
 template <typename T, int block_size, int elems_per_thread>
-__device__ void divide_by_monomial_negacyclic_inplace(T *accumulator, T *input,
-                                                      uint32_t j, bool zeroAcc,
-                                                      uint32_t num_poly = 1) {
+__device__ void
+divide_by_monomial_negacyclic_inplace(T *accumulator,
+                                      const T *__restrict__ input, uint32_t j,
+                                      bool zeroAcc, uint32_t num_poly = 1) {
   constexpr int degree = block_size * elems_per_thread;
   for (int z = 0; z < num_poly; z++) {
     T *accumulator_slice = (T *)accumulator + (ptrdiff_t)(z * degree);
-    T *input_slice = (T *)input + (ptrdiff_t)(z * degree);
+    const T *input_slice = (T *)input + (ptrdiff_t)(z * degree);
 
     int tid = threadIdx.x;
     if (zeroAcc) {
diff --git a/backends/tfhe-cuda-backend/cuda/src/polynomial/polynomial_math.cuh b/backends/tfhe-cuda-backend/cuda/src/polynomial/polynomial_math.cuh
index 3997d51512..a6d7901f20 100644
--- a/backends/tfhe-cuda-backend/cuda/src/polynomial/polynomial_math.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/polynomial/polynomial_math.cuh
@@ -39,9 +39,8 @@ __device__ void polynomial_product_in_fourier_domain(T *result, T *first,
 // If init_accumulator is set, assumes that result was not initialized and does
 // that with the outcome of first * second
 template <typename T, class params>
-__device__ void
-polynomial_product_accumulate_in_fourier_domain(T *result, T *first, T *second,
-                                                bool init_accumulator = false) {
+__device__ void polynomial_product_accumulate_in_fourier_domain(
+    T *result, T *first, const T *second, bool init_accumulator = false) {
   int tid = threadIdx.x;
   if (init_accumulator) {
     for (int i = 0; i < params::opt / 2; i++) {
@@ -60,7 +59,7 @@ polynomial_product_accumulate_in_fourier_domain(T *result, T *first, T *second,
 // that with the outcome of first * second
 template <typename T, class params>
 __device__ void
-polynomial_product_accumulate_by_monomial(T *result, T *poly,
+polynomial_product_accumulate_by_monomial(T *result, const T *__restrict__ poly,
                                           uint64_t monomial_degree,
                                           bool init_accumulator = false) {
   // monomial_degree \in [0, 2 * params::degree)
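To close the loop on the polynomial helpers: copy_polynomial is the template the multi-bit keybundle kernel instantiates on bootstrapping-key slices, and its two integer parameters encode a strided block copy: block_size threads each copy elems_per_thread coefficients, so one call moves block_size * elems_per_thread of them. A hedged usage sketch (hypothetical wrapper kernel; the definition mirrors the const form introduced by this patch):

#include <cstdint>

template <typename T, int elems_per_thread, int block_size>
__device__ void copy_polynomial(const T *__restrict__ source, T *dst) {
  int tid = threadIdx.x;
#pragma unroll
  for (int i = 0; i < elems_per_thread; i++) {
    dst[tid] = source[tid]; // stride-block_size accesses stay coalesced
    tid = tid + block_size;
  }
}

// 256 threads x 4 elements each = one 1024-coefficient polynomial per call.
// Launch with blockDim.x == 256.
__global__ void copy_poly_1024(uint64_t *dst,
                               const uint64_t *__restrict__ src) {
  copy_polynomial<uint64_t, 4, 256>(src, dst);
}

Because source is const T *__restrict__, the same definition now accepts both mutable scratch buffers and the const key slices that the rest of this patch threads through the PBS kernels.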