diff --git a/backends/tfhe-cuda-backend/cuda/include/programmable_bootstrap.h b/backends/tfhe-cuda-backend/cuda/include/programmable_bootstrap.h
index 4a9a0c64a4..260c037b6a 100644
--- a/backends/tfhe-cuda-backend/cuda/include/programmable_bootstrap.h
+++ b/backends/tfhe-cuda-backend/cuda/include/programmable_bootstrap.h
@@ -411,6 +411,12 @@ __device__ inline int get_start_ith_ggsw(int i, uint32_t polynomial_size,
                                          int glwe_dimension,
                                          uint32_t level_count);
 
+template <typename T>
+__device__ const T *get_ith_mask_kth_block(const T *ptr, int i, int k,
+                                           int level, uint32_t polynomial_size,
+                                           int glwe_dimension,
+                                           uint32_t level_count);
+
 template <typename T>
 __device__ T *get_ith_mask_kth_block(T *ptr, int i, int k, int level,
                                      uint32_t polynomial_size,
@@ -422,8 +428,8 @@ __device__ T *get_ith_body_kth_block(T *ptr, int i, int k, int level,
                                      int glwe_dimension, uint32_t level_count);
 
 template <typename T>
-__device__ T *get_multi_bit_ith_lwe_gth_group_kth_block(
-    T *ptr, int g, int i, int k, int level, uint32_t grouping_factor,
+__device__ const T *get_multi_bit_ith_lwe_gth_group_kth_block(
+    const T *ptr, int g, int i, int k, int level, uint32_t grouping_factor,
     uint32_t polynomial_size, uint32_t glwe_dimension, uint32_t level_count);
 
 #endif
diff --git a/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cuh b/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cuh
index 964637afaf..a1bc861f76 100644
--- a/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cuh
@@ -38,11 +38,13 @@ __device__ Torus *get_ith_block(Torus *ksk, int i, int level,
 // threads in y are used to paralelize the lwe_dimension_in loop.
 // shared memory is used to store intermediate results of the reduction.
 template <typename Torus>
-__global__ void keyswitch(Torus *lwe_array_out, Torus *lwe_output_indexes,
-                          Torus *lwe_array_in, Torus *lwe_input_indexes,
-                          Torus *ksk, uint32_t lwe_dimension_in,
-                          uint32_t lwe_dimension_out, uint32_t base_log,
-                          uint32_t level_count, int gpu_offset) {
+__global__ void
+keyswitch(Torus *lwe_array_out, const Torus *__restrict__ lwe_output_indexes,
+          const Torus *__restrict__ lwe_array_in,
+          const Torus *__restrict__ lwe_input_indexes,
+          const Torus *__restrict__ ksk, uint32_t lwe_dimension_in,
+          uint32_t lwe_dimension_out, uint32_t base_log, uint32_t level_count,
+          int gpu_offset) {
 
   const int tid = threadIdx.x + blockIdx.x * blockDim.x;
   const int shmem_index = threadIdx.x + threadIdx.y * blockDim.x;
diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/bootstraping_key.cu b/backends/tfhe-cuda-backend/cuda/src/pbs/bootstraping_key.cu
index 703c31cdd0..abea6deba8 100644
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/bootstraping_key.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/bootstraping_key.cu
@@ -36,6 +36,18 @@ void cuda_convert_lwe_multi_bit_programmable_bootstrap_key_64(
 }
 
 // We need these lines so the compiler knows how to specialize these functions
+template __device__ const uint64_t *
+get_ith_mask_kth_block(const uint64_t *ptr, int i, int k, int level,
+                       uint32_t polynomial_size, int glwe_dimension,
+                       uint32_t level_count);
+template __device__ const uint32_t *
+get_ith_mask_kth_block(const uint32_t *ptr, int i, int k, int level,
+                       uint32_t polynomial_size, int glwe_dimension,
+                       uint32_t level_count);
+template __device__ const double2 *
+get_ith_mask_kth_block(const double2 *ptr, int i, int k, int level,
+                       uint32_t polynomial_size, int glwe_dimension,
+                       uint32_t level_count);
 template __device__ uint64_t *get_ith_mask_kth_block(uint64_t *ptr, int i,
                                                      int k, int level,
                                                      uint32_t polynomial_size,
@@ -51,6 +63,7 @@ template __device__ double2 *get_ith_mask_kth_block(double2 *ptr, int i, int k,
                                                     uint32_t polynomial_size,
                                                     int glwe_dimension,
                                                     uint32_t level_count);
+
 template __device__ uint64_t *get_ith_body_kth_block(uint64_t *ptr, int i,
                                                      int k, int level,
                                                      uint32_t polynomial_size,
@@ -67,10 +80,12 @@ template __device__ double2 *get_ith_body_kth_block(double2 *ptr, int i, int k,
                                                     int glwe_dimension,
                                                     uint32_t level_count);
 
-template __device__ uint64_t *get_multi_bit_ith_lwe_gth_group_kth_block(
-    uint64_t *ptr, int g, int i, int k, int level, uint32_t grouping_factor,
-    uint32_t polynomial_size, uint32_t glwe_dimension, uint32_t level_count);
+template __device__ const uint64_t *get_multi_bit_ith_lwe_gth_group_kth_block(
+    const uint64_t *ptr, int g, int i, int k, int level,
+    uint32_t grouping_factor, uint32_t polynomial_size, uint32_t glwe_dimension,
+    uint32_t level_count);
 
-template __device__ double2 *get_multi_bit_ith_lwe_gth_group_kth_block(
-    double2 *ptr, int g, int i, int k, int level, uint32_t grouping_factor,
-    uint32_t polynomial_size, uint32_t glwe_dimension, uint32_t level_count);
+template __device__ const double2 *get_multi_bit_ith_lwe_gth_group_kth_block(
+    const double2 *ptr, int g, int i, int k, int level,
+    uint32_t grouping_factor, uint32_t polynomial_size, uint32_t glwe_dimension,
+    uint32_t level_count);
diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/bootstrapping_key.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/bootstrapping_key.cuh
index eef7951c88..8ee59ff3a9 100644
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/bootstrapping_key.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/bootstrapping_key.cuh
@@ -17,6 +17,18 @@ __device__ inline int get_start_ith_ggsw(int i, uint32_t polynomial_size,
 }
 
 ////////////////////////////////////////////////
+template <typename T>
+__device__ const T *get_ith_mask_kth_block(const T *ptr, int i, int k,
+                                           int level, uint32_t polynomial_size,
+                                           int glwe_dimension,
+                                           uint32_t level_count) {
+  return &ptr[get_start_ith_ggsw(i, polynomial_size, glwe_dimension,
+                                 level_count) +
+              level * polynomial_size / 2 * (glwe_dimension + 1) *
+                  (glwe_dimension + 1) +
+              k * polynomial_size / 2 * (glwe_dimension + 1)];
+}
+
 template <typename T>
 __device__ T *get_ith_mask_kth_block(T *ptr, int i, int k, int level,
                                      uint32_t polynomial_size,
@@ -27,7 +39,6 @@ __device__ T *get_ith_mask_kth_block(T *ptr, int i, int k, int level,
                   (glwe_dimension + 1) +
               k * polynomial_size / 2 * (glwe_dimension + 1)];
 }
-
 template <typename T>
 __device__ T *get_ith_body_kth_block(T *ptr, int i, int k, int level,
                                      uint32_t polynomial_size,
@@ -50,14 +61,16 @@ __device__ inline int get_start_ith_lwe(uint32_t i, uint32_t grouping_factor,
 }
 
 template <typename T>
-__device__ T *get_multi_bit_ith_lwe_gth_group_kth_block(
-    T *ptr, int g, int i, int k, int level, uint32_t grouping_factor,
+__device__ const T *get_multi_bit_ith_lwe_gth_group_kth_block(
+    const T *ptr, int g, int i, int k, int level, uint32_t grouping_factor,
     uint32_t polynomial_size, uint32_t glwe_dimension, uint32_t level_count) {
-  T *ptr_group = ptr + get_start_ith_lwe(i, grouping_factor, polynomial_size,
-                                         glwe_dimension, level_count);
+  const T *ptr_group =
+      ptr + get_start_ith_lwe(i, grouping_factor, polynomial_size,
+                              glwe_dimension, level_count);
 
   return get_ith_mask_kth_block(ptr_group, g, k, level, polynomial_size,
                                 glwe_dimension, level_count);
 }
+
 ////////////////////////////////////////////////
 
 template <typename T>
 void cuda_convert_lwe_programmable_bootstrap_key(cudaStream_t stream,
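A note on the explicit instantiations added to bootstraping_key.cu: the const getters are function templates defined in a header that only this .cu compiles, so every qualified form a kernel may call must be instantiated there explicitly, or the const overload has no symbol to link against. A minimal standalone sketch of the pattern (hypothetical names, not the real key layout):

#include <cstdint>

// nth_slice returns a read-only view into a packed key buffer; the mutable
// overload is kept for code paths that still write through the pointer.
template <typename T>
__device__ const T *nth_slice(const T *ptr, int n, uint32_t slice_len) {
  return &ptr[(size_t)n * slice_len];
}

template <typename T>
__device__ T *nth_slice(T *ptr, int n, uint32_t slice_len) {
  return &ptr[(size_t)n * slice_len];
}

// Explicit instantiations, mirroring bootstraping_key.cu: without these, a
// kernel in another translation unit (built with nvcc -rdc=true) calling
// nth_slice on a const uint64_t * would fail to link.
template __device__ const uint64_t *nth_slice(const uint64_t *, int, uint32_t);
template __device__ uint64_t *nth_slice(uint64_t *, int, uint32_t);

Overload resolution then picks the const form whenever the argument is a const pointer, which is what lets the kernels below accept const bootstrapping keys without a cast.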
diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap.cuh
index 4a542ff5f5..4b5a2f1f3a 100644
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap.cuh
@@ -22,11 +22,11 @@ get_join_buffer_element(int level_id, int glwe_id, G &group,
                         uint32_t glwe_dimension, bool support_dsm);
 
 template <typename Torus, class params, typename G>
-__device__ void mul_ggsw_glwe(Torus *accumulator, double2 *fft,
-                              double2 *join_buffer, double2 *bootstrapping_key,
-                              int polynomial_size, uint32_t glwe_dimension,
-                              int level_count, int iteration, G &group,
-                              bool support_dsm = false) {
+__device__ void
+mul_ggsw_glwe(Torus *accumulator, double2 *fft, double2 *join_buffer,
+              const double2 *__restrict__ bootstrapping_key,
+              int polynomial_size, uint32_t glwe_dimension, int level_count,
+              int iteration, G &group, bool support_dsm = false) {
 
   // Switch to the FFT space
   NSMFFT_direct<HalfDegree<params>>(fft);
diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_amortized.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_amortized.cuh
index eae1b593f9..0f1f98a2ac 100644
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_amortized.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_amortized.cuh
@@ -47,11 +47,14 @@ template <typename Torus, class params, sharedMemDegree SMD>
  * is not FULLSM
  */
 __global__ void device_programmable_bootstrap_amortized(
-    Torus *lwe_array_out, Torus *lwe_output_indexes, Torus *lut_vector,
-    Torus *lut_vector_indexes, Torus *lwe_array_in, Torus *lwe_input_indexes,
-    double2 *bootstrapping_key, int8_t *device_mem, uint32_t glwe_dimension,
-    uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log,
-    uint32_t level_count, uint32_t lwe_idx,
+    Torus *lwe_array_out, const Torus *__restrict__ lwe_output_indexes,
+    const Torus *__restrict__ lut_vector,
+    const Torus *__restrict__ lut_vector_indexes,
+    const Torus *__restrict__ lwe_array_in,
+    const Torus *__restrict__ lwe_input_indexes,
+    const double2 *__restrict__ bootstrapping_key, int8_t *device_mem,
+    uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size,
+    uint32_t base_log, uint32_t level_count, uint32_t lwe_idx,
     size_t device_memory_size_per_sample, uint32_t gpu_offset) {
   // We use shared memory for the polynomials that are used often during the
   // bootstrap, since shared memory is kept in L1 cache and accessing it is
@@ -81,7 +84,7 @@ __global__ void device_programmable_bootstrap_amortized(
   auto block_lwe_array_in =
       &lwe_array_in[lwe_input_indexes[blockIdx.x + gpu_offset] *
                     (lwe_dimension + 1)];
 
-  Torus *block_lut_vector =
+  const Torus *block_lut_vector =
       &lut_vector[lut_vector_indexes[lwe_idx + blockIdx.x] * params::degree *
                   (glwe_dimension + 1)];
diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_classic.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_classic.cuh
index 97c01bdfeb..681ce13913 100644
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_classic.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_classic.cuh
@@ -36,12 +36,15 @@ namespace cg = cooperative_groups;
  */
 template <typename Torus, class params, sharedMemDegree SMD>
 __global__ void device_programmable_bootstrap_cg(
-    Torus *lwe_array_out, Torus *lwe_output_indexes, Torus *lut_vector,
-    Torus *lut_vector_indexes, Torus *lwe_array_in, Torus *lwe_input_indexes,
-    double2 *bootstrapping_key, double2 *join_buffer, uint32_t lwe_dimension,
-    uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
-    int8_t *device_mem, uint64_t device_memory_size_per_block,
-    uint32_t gpu_offset) {
+    Torus *lwe_array_out, const Torus *__restrict__ lwe_output_indexes,
+    const Torus *__restrict__ lut_vector,
+    const Torus *__restrict__ lut_vector_indexes,
+    const Torus *__restrict__ lwe_array_in,
+    const Torus *__restrict__ lwe_input_indexes,
+    const double2 *__restrict__ bootstrapping_key, double2 *join_buffer,
+    uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log,
+    uint32_t level_count, int8_t *device_mem,
+    uint64_t device_memory_size_per_block, uint32_t gpu_offset) {
 
   grid_group grid = this_grid();
 
@@ -74,12 +77,13 @@ __global__ void device_programmable_bootstrap_cg(
 
   // The third dimension of the block is used to determine on which ciphertext
   // this block is operating, in the case of batch bootstraps
-  Torus *block_lwe_array_in =
+  const Torus *block_lwe_array_in =
       &lwe_array_in[lwe_input_indexes[blockIdx.z + gpu_offset] *
                     (lwe_dimension + 1)];
 
-  Torus *block_lut_vector = &lut_vector[lut_vector_indexes[blockIdx.z] *
-                                        params::degree * (glwe_dimension + 1)];
+  const Torus *block_lut_vector =
+      &lut_vector[lut_vector_indexes[blockIdx.z] * params::degree *
+                  (glwe_dimension + 1)];
 
   double2 *block_join_buffer =
       &join_buffer[blockIdx.z * level_count * (glwe_dimension + 1) *
diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_multibit.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_multibit.cuh
index 5eba4f2f8f..18cce4e8ef 100644
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_multibit.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_multibit.cuh
@@ -19,12 +19,15 @@ template <typename Torus, class params, sharedMemDegree SMD>
 __global__ void device_multi_bit_programmable_bootstrap_cg_accumulate(
-    Torus *lwe_array_out, Torus *lwe_output_indexes, Torus *lut_vector,
-    Torus *lut_vector_indexes, Torus *lwe_array_in, Torus *lwe_input_indexes,
-    double2 *keybundle_array, double2 *join_buffer, Torus *global_accumulator,
-    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t base_log, uint32_t level_count, uint32_t grouping_factor,
-    uint32_t lwe_offset, uint32_t lwe_chunk_size,
+    Torus *lwe_array_out, const Torus *__restrict__ lwe_output_indexes,
+    const Torus *__restrict__ lut_vector,
+    const Torus *__restrict__ lut_vector_indexes,
+    const Torus *__restrict__ lwe_array_in,
+    const Torus *__restrict__ lwe_input_indexes,
+    const double2 *__restrict__ keybundle_array, double2 *join_buffer,
+    Torus *global_accumulator, uint32_t lwe_dimension, uint32_t glwe_dimension,
+    uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
+    uint32_t grouping_factor, uint32_t lwe_offset, uint32_t lwe_chunk_size,
     uint32_t keybundle_size_per_input, int8_t *device_mem,
     uint64_t device_memory_size_per_block, uint32_t gpu_offset) {
 
@@ -54,12 +57,13 @@ __global__ void device_multi_bit_programmable_bootstrap_cg_accumulate(
 
   // The third dimension of the block is used to determine on which ciphertext
   // this block is operating, in the case of batch bootstraps
-  Torus *block_lwe_array_in =
+  const Torus *block_lwe_array_in =
      &lwe_array_in[lwe_input_indexes[blockIdx.z + gpu_offset] *
                    (lwe_dimension + 1)];
 
-  Torus *block_lut_vector = &lut_vector[lut_vector_indexes[blockIdx.z] *
-                                        params::degree * (glwe_dimension + 1)];
+  const Torus *block_lut_vector =
+      &lut_vector[lut_vector_indexes[blockIdx.z] * params::degree *
+                  (glwe_dimension + 1)];
 
   double2 *block_join_buffer =
       &join_buffer[blockIdx.z * level_count * (glwe_dimension + 1) *
@@ -69,9 +73,9 @@ __global__ void device_multi_bit_programmable_bootstrap_cg_accumulate(
       global_accumulator +
       (blockIdx.y + blockIdx.z * (glwe_dimension + 1)) * params::degree;
 
-  double2 *keybundle = keybundle_array +
-                       // select the input
-                       blockIdx.z * keybundle_size_per_input;
+  const double2 *keybundle = keybundle_array +
+                             // select the input
+                             blockIdx.z * keybundle_size_per_input;
 
   if (lwe_offset == 0) {
     // Put "b" in [0, 2N[
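The signature change repeated across these kernels is the point of the patch: buffers the kernel only reads (index tables, LUTs, input ciphertexts, bootstrapping keys and keybundles) become const Torus *__restrict__, while everything written (output ciphertexts, join buffers, accumulators) stays mutable. const documents intent, and __restrict__ promises the compiler that no store in the kernel aliases these loads, which lets nvcc route them through the read-only data cache (the __ldg path) and hoist them across stores. A minimal sketch of the same gather pattern (hypothetical kernel, not part of the PR):

#include <cstdint>

// Each thread reads in[] through an index table, like lwe_input_indexes in
// the PBS kernels above; both read-only buffers carry const + __restrict__.
__global__ void gather_add(uint64_t *out, const uint64_t *__restrict__ in,
                           const uint64_t *__restrict__ indexes, uint32_t n) {
  uint32_t i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n)
    out[i] = in[indexes[i]] + 1; // loads may use the read-only cache
}

Host call sites need no change: a mutable uint64_t * argument converts implicitly to const uint64_t *__restrict__ at the kernel boundary, so only the kernels and the device helpers they call have to be updated.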
diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cuh
index baa50954c1..0f4dec5fcb 100644
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cuh
@@ -18,11 +18,14 @@ template <typename Torus, class params, sharedMemDegree SMD>
 __global__ void device_programmable_bootstrap_step_one(
-    Torus *lut_vector, Torus *lut_vector_indexes, Torus *lwe_array_in,
-    Torus *lwe_input_indexes, double2 *bootstrapping_key,
-    Torus *global_accumulator, double2 *global_accumulator_fft,
-    uint32_t lwe_iteration, uint32_t lwe_dimension, uint32_t polynomial_size,
-    uint32_t base_log, uint32_t level_count, int8_t *device_mem,
+    const Torus *__restrict__ lut_vector,
+    const Torus *__restrict__ lut_vector_indexes,
+    const Torus *__restrict__ lwe_array_in,
+    const Torus *__restrict__ lwe_input_indexes,
+    const double2 *__restrict__ bootstrapping_key, Torus *global_accumulator,
+    double2 *global_accumulator_fft, uint32_t lwe_iteration,
+    uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log,
+    uint32_t level_count, int8_t *device_mem,
     uint64_t device_memory_size_per_block, uint32_t gpu_offset) {
 
   // We use shared memory for the polynomials that are used often during the
@@ -50,12 +53,13 @@ __global__ void device_programmable_bootstrap_step_one(
 
   // The third dimension of the block is used to determine on which ciphertext
   // this block is operating, in the case of batch bootstraps
-  Torus *block_lwe_array_in =
+  const Torus *block_lwe_array_in =
       &lwe_array_in[lwe_input_indexes[blockIdx.z + gpu_offset] *
                     (lwe_dimension + 1)];
 
-  Torus *block_lut_vector = &lut_vector[lut_vector_indexes[blockIdx.z] *
-                                        params::degree * (glwe_dimension + 1)];
+  const Torus *block_lut_vector =
+      &lut_vector[lut_vector_indexes[blockIdx.z] * params::degree *
+                  (glwe_dimension + 1)];
 
   Torus *global_slice =
       global_accumulator +
@@ -129,11 +133,13 @@ __global__ void device_programmable_bootstrap_step_one(
 
 template <typename Torus, class params, sharedMemDegree SMD>
 __global__ void device_programmable_bootstrap_step_two(
-    Torus *lwe_array_out, Torus *lwe_output_indexes, Torus *lut_vector,
-    Torus *lut_vector_indexes, double2 *bootstrapping_key,
-    Torus *global_accumulator, double2 *global_accumulator_fft,
-    uint32_t lwe_iteration, uint32_t lwe_dimension, uint32_t polynomial_size,
-    uint32_t base_log, uint32_t level_count, int8_t *device_mem,
+    Torus *lwe_array_out, const Torus *__restrict__ lwe_output_indexes,
+    const Torus *__restrict__ lut_vector,
+    const Torus *__restrict__ lut_vector_indexes,
+    const double2 *__restrict__ bootstrapping_key, Torus *global_accumulator,
+    double2 *global_accumulator_fft, uint32_t lwe_iteration,
+    uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log,
+    uint32_t level_count, int8_t *device_mem,
     uint64_t device_memory_size_per_block, uint32_t gpu_offset) {
 
   // We use shared memory for the polynomials that are used often during the
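One caveat worth stating for the step-one/step-two kernels above, and for every kernel in this patch: __restrict__ is an unchecked promise. The compiler is free to cache and reorder loads from a __restrict__ pointer past stores, so a launch where an output buffer overlapped a qualified input would be undefined behavior. A sketch of the contract (hypothetical kernel):

#include <cstdint>

__global__ void axpy(uint64_t *out, const uint64_t *__restrict__ x, uint64_t a,
                     uint32_t n) {
  uint32_t i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n)
    out[i] += a * x[i]; // assumes out and x never overlap
}

// fine:   axpy<<<grid, block>>>(d_out, d_x, 3, n);   // distinct buffers
// broken: axpy<<<grid, block>>>(d_buf, d_buf, 3, n); // out aliases x: UB

The PBS kernels satisfy the promise by construction: none of them ever writes through the pointers this patch qualifies, which is exactly what the added const asserts.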
diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cuh
index 8b8abc0f71..b2069d2f65 100644
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cuh
@@ -18,7 +18,7 @@
 #include <vector>
 
 template <typename Torus>
-__device__ Torus calculates_monomial_degree(Torus *lwe_array_group,
+__device__ Torus calculates_monomial_degree(const Torus *lwe_array_group,
                                             uint32_t ggsw_idx,
                                             uint32_t grouping_factor) {
   Torus x = 0;
@@ -34,12 +34,14 @@ __device__ Torus calculates_monomial_degree(Torus *lwe_array_group,
 
 template <typename Torus, class params, sharedMemDegree SMD>
 __global__ void device_multi_bit_programmable_bootstrap_keybundle(
-    Torus *lwe_array_in, Torus *lwe_input_indexes, double2 *keybundle_array,
-    Torus *bootstrapping_key, uint32_t lwe_dimension, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log,
-    uint32_t level_count, uint32_t lwe_offset, uint32_t lwe_chunk_size,
-    uint32_t keybundle_size_per_input, int8_t *device_mem,
-    uint64_t device_memory_size_per_block, uint32_t gpu_offset) {
+    const Torus *__restrict__ lwe_array_in,
+    const Torus *__restrict__ lwe_input_indexes, double2 *keybundle_array,
+    const Torus *__restrict__ bootstrapping_key, uint32_t lwe_dimension,
+    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
+    uint32_t base_log, uint32_t level_count, uint32_t lwe_offset,
+    uint32_t lwe_chunk_size, uint32_t keybundle_size_per_input,
+    int8_t *device_mem, uint64_t device_memory_size_per_block,
+    uint32_t gpu_offset) {
 
   extern __shared__ int8_t sharedmem[];
   int8_t *selected_memory = sharedmem;
@@ -63,7 +65,7 @@ __global__ void device_multi_bit_programmable_bootstrap_keybundle(
 
   //
   Torus *accumulator = (Torus *)selected_memory;
 
-  Torus *block_lwe_array_in =
+  const Torus *block_lwe_array_in =
      &lwe_array_in[lwe_input_indexes[input_idx + gpu_offset] *
                    (lwe_dimension + 1)];
 
@@ -79,10 +81,10 @@ __global__ void device_multi_bit_programmable_bootstrap_keybundle(
 
     //
    ////////////////////////////////
    // Keygen guarantees the first term is a constant term of the polynomial, no
    // polynomial multiplication required
-    Torus *bsk_slice = get_multi_bit_ith_lwe_gth_group_kth_block(
+    const Torus *bsk_slice = get_multi_bit_ith_lwe_gth_group_kth_block(
         bootstrapping_key, 0, rev_lwe_iteration, glwe_id, level_id,
         grouping_factor, 2 * polynomial_size, glwe_dimension, level_count);
-    Torus *bsk_poly = bsk_slice + poly_id * params::degree;
+    const Torus *bsk_poly = bsk_slice + poly_id * params::degree;
 
     copy_polynomial<Torus, params::opt, params::degree / params::opt>(
         bsk_poly, accumulator);
@@ -90,13 +92,13 @@ __global__ void device_multi_bit_programmable_bootstrap_keybundle(
 
     // Accumulate the other terms
     for (int g = 1; g < (1 << grouping_factor); g++) {
 
-      Torus *bsk_slice = get_multi_bit_ith_lwe_gth_group_kth_block(
+      const Torus *bsk_slice = get_multi_bit_ith_lwe_gth_group_kth_block(
           bootstrapping_key, g, rev_lwe_iteration, glwe_id, level_id,
           grouping_factor, 2 * polynomial_size, glwe_dimension, level_count);
-      Torus *bsk_poly = bsk_slice + poly_id * params::degree;
+      const Torus *bsk_poly = bsk_slice + poly_id * params::degree;
 
       // Calculates the monomial degree
-      Torus *lwe_array_group =
+      const Torus *lwe_array_group =
           block_lwe_array_in + rev_lwe_iteration * grouping_factor;
       uint32_t monomial_degree = calculates_monomial_degree(
           lwe_array_group, g, grouping_factor);
@@ -148,8 +150,10 @@ __global__ void device_multi_bit_programmable_bootstrap_keybundle(
 
 template <typename Torus, class params, sharedMemDegree SMD>
 __global__ void device_multi_bit_programmable_bootstrap_accumulate_step_one(
-    Torus *lwe_array_in, Torus *lwe_input_indexes, Torus *lut_vector,
-    Torus *lut_vector_indexes, Torus *global_accumulator,
+    const Torus *__restrict__ lwe_array_in,
+    const Torus *__restrict__ lwe_input_indexes,
+    const Torus *__restrict__ lut_vector,
+    const Torus *__restrict__ lut_vector_indexes, Torus *global_accumulator,
     double2 *global_accumulator_fft, uint32_t lwe_dimension,
     uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
     uint32_t level_count, uint32_t lwe_iteration, int8_t *device_mem,
@@ -179,12 +183,13 @@ __global__ void device_multi_bit_programmable_bootstrap_accumulate_step_one(
   if constexpr (SMD == PARTIALSM)
     accumulator_fft = (double2 *)sharedmem;
 
-  Torus *block_lwe_array_in =
+  const Torus *block_lwe_array_in =
      &lwe_array_in[lwe_input_indexes[blockIdx.z + gpu_offset] *
                    (lwe_dimension + 1)];
 
-  Torus *block_lut_vector = &lut_vector[lut_vector_indexes[blockIdx.z] *
-                                        params::degree * (glwe_dimension + 1)];
+  const Torus *block_lut_vector =
+      &lut_vector[lut_vector_indexes[blockIdx.z] * params::degree *
+                  (glwe_dimension + 1)];
 
   Torus *global_slice =
       global_accumulator +
@@ -243,11 +248,12 @@ __global__ void device_multi_bit_programmable_bootstrap_accumulate_step_one(
 
 template <typename Torus, class params, sharedMemDegree SMD>
 __global__ void device_multi_bit_programmable_bootstrap_accumulate_step_two(
-    Torus *lwe_array_out, Torus *lwe_output_indexes, double2 *keybundle_array,
-    Torus *global_accumulator, double2 *global_accumulator_fft,
-    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t level_count, uint32_t grouping_factor, uint32_t iteration,
-    uint32_t lwe_offset, uint32_t lwe_chunk_size, int8_t *device_mem,
+    Torus *lwe_array_out, const Torus *__restrict__ lwe_output_indexes,
+    const double2 *__restrict__ keybundle_array, Torus *global_accumulator,
+    double2 *global_accumulator_fft, uint32_t lwe_dimension,
+    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
+    uint32_t grouping_factor, uint32_t iteration, uint32_t lwe_offset,
+    uint32_t lwe_chunk_size, int8_t *device_mem,
     uint64_t device_memory_size_per_block, uint32_t gpu_offset) {
   // We use shared memory for the polynomials that are used often during the
   // bootstrap, since shared memory is kept in L1 cache and accessing it is
@@ -268,11 +274,11 @@ __global__ void device_multi_bit_programmable_bootstrap_accumulate_step_two(
 
   double2 *accumulator_fft = (double2 *)selected_memory;
 
   //
-  double2 *keybundle = keybundle_array +
-                       // select the input
-                       blockIdx.x * lwe_chunk_size * level_count *
-                           (glwe_dimension + 1) * (glwe_dimension + 1) *
-                           (polynomial_size / 2);
+  const double2 *keybundle = keybundle_array +
+                             // select the input
+                             blockIdx.x * lwe_chunk_size * level_count *
+                                 (glwe_dimension + 1) * (glwe_dimension + 1) *
+                                 (polynomial_size / 2);
 
   double2 *global_accumulator_fft_input =
       global_accumulator_fft +
diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_classic.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_classic.cuh
index 71aaca6b6b..ff379bb919 100644
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_classic.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_classic.cuh
@@ -36,11 +36,15 @@ namespace cg = cooperative_groups;
  */
 template <typename Torus, class params, sharedMemDegree SMD>
 __global__ void device_programmable_bootstrap_tbc(
-    Torus *lwe_array_out, Torus *lwe_output_indexes, Torus *lut_vector,
-    Torus *lut_vector_indexes, Torus *lwe_array_in, Torus *lwe_input_indexes,
-    double2 *bootstrapping_key, double2 *join_buffer, uint32_t lwe_dimension,
-    uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
-    int8_t *device_mem, uint64_t device_memory_size_per_block, bool support_dsm,
+    Torus *lwe_array_out, const Torus *__restrict__ lwe_output_indexes,
+    const Torus *__restrict__ lut_vector,
+    const Torus *__restrict__ lut_vector_indexes,
+    const Torus *__restrict__ lwe_array_in,
+    const Torus *__restrict__ lwe_input_indexes,
+    const double2 *__restrict__ bootstrapping_key, double2 *join_buffer,
+    uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log,
+    uint32_t level_count, int8_t *device_mem,
+    uint64_t device_memory_size_per_block, bool support_dsm,
     uint32_t gpu_offset) {
 
   cluster_group cluster = this_cluster();
 
@@ -77,12 +81,13 @@ __global__ void device_programmable_bootstrap_tbc(
 
   // The third dimension of the block is used to determine on which ciphertext
   // this block is operating, in the case of batch bootstraps
-  Torus *block_lwe_array_in =
+  const Torus *block_lwe_array_in =
      &lwe_array_in[lwe_input_indexes[blockIdx.z + gpu_offset] *
                    (lwe_dimension + 1)];
 
-  Torus *block_lut_vector = &lut_vector[lut_vector_indexes[blockIdx.z] *
-                                        params::degree * (glwe_dimension + 1)];
+  const Torus *block_lut_vector =
+      &lut_vector[lut_vector_indexes[blockIdx.z] * params::degree *
+                  (glwe_dimension + 1)];
 
   double2 *block_join_buffer =
       &join_buffer[blockIdx.z * level_count * (glwe_dimension + 1) *
diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_multibit.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_multibit.cuh
index 9378cc832b..23f74a2d6b 100644
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_multibit.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_multibit.cuh
@@ -19,12 +19,15 @@ template <typename Torus, class params, sharedMemDegree SMD>
 __global__ void device_multi_bit_programmable_bootstrap_tbc_accumulate(
-    Torus *lwe_array_out, Torus *lwe_output_indexes, Torus *lut_vector,
-    Torus *lut_vector_indexes, Torus *lwe_array_in, Torus *lwe_input_indexes,
-    double2 *keybundle_array, double2 *join_buffer, Torus *global_accumulator,
-    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t base_log, uint32_t level_count, uint32_t grouping_factor,
-    uint32_t lwe_offset, uint32_t lwe_chunk_size,
+    Torus *lwe_array_out, const Torus *__restrict__ lwe_output_indexes,
+    const Torus *__restrict__ lut_vector,
+    const Torus *__restrict__ lut_vector_indexes,
+    const Torus *__restrict__ lwe_array_in,
+    const Torus *__restrict__ lwe_input_indexes,
+    const double2 *__restrict__ keybundle_array, double2 *join_buffer,
+    Torus *global_accumulator, uint32_t lwe_dimension, uint32_t glwe_dimension,
+    uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
+    uint32_t grouping_factor, uint32_t lwe_offset, uint32_t lwe_chunk_size,
     uint32_t keybundle_size_per_input, int8_t *device_mem,
     uint64_t device_memory_size_per_block, bool support_dsm,
     uint32_t gpu_offset) {
 
@@ -62,12 +65,13 @@ __global__ void device_multi_bit_programmable_bootstrap_tbc_accumulate(
 
   // The third dimension of the block is used to determine on which ciphertext
   // this block is operating, in the case of batch bootstraps
-  Torus *block_lwe_array_in =
+  const Torus *block_lwe_array_in =
      &lwe_array_in[lwe_input_indexes[blockIdx.z + gpu_offset] *
                    (lwe_dimension + 1)];
 
-  Torus *block_lut_vector = &lut_vector[lut_vector_indexes[blockIdx.z] *
-                                        params::degree * (glwe_dimension + 1)];
+  const Torus *block_lut_vector =
+      &lut_vector[lut_vector_indexes[blockIdx.z] * params::degree *
+                  (glwe_dimension + 1)];
 
   double2 *block_join_buffer =
       &join_buffer[blockIdx.z * level_count * (glwe_dimension + 1) *
@@ -77,9 +81,9 @@ __global__ void device_multi_bit_programmable_bootstrap_tbc_accumulate(
       global_accumulator +
       (blockIdx.y + blockIdx.z * (glwe_dimension + 1)) * params::degree;
 
-  double2 *keybundle = keybundle_array +
-                       // select the input
-                       blockIdx.z * keybundle_size_per_input;
+  const double2 *keybundle = keybundle_array +
+                             // select the input
+                             blockIdx.z * keybundle_size_per_input;
 
   if (lwe_offset == 0) {
     // Put "b" in [0, 2N[
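As the TBC variants above show, the const qualifier propagates: once lut_vector is const Torus *, every pointer derived from it (block_lut_vector, bsk_slice, keybundle, and so on) must be const as well, which is why each signature edit drags a matching set of body edits behind it. A reduced sketch of that chain (hypothetical helper, same arithmetic as block_lut_vector):

#include <cstdint>

__device__ const uint64_t *
select_lut(const uint64_t *lut_vector, const uint64_t *lut_vector_indexes,
           uint32_t degree, uint32_t glwe_dimension, uint32_t block) {
  // Returning const uint64_t * is mandatory here; dropping the qualifier
  // would require a const_cast, so read-only use is now compiler-enforced.
  return &lut_vector[lut_vector_indexes[block] * degree * (glwe_dimension + 1)];
}

The payoff is that any future code path that tried to write to a LUT or a keybundle from inside a bootstrap kernel becomes a compile error instead of a silent correctness bug.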
diff --git a/backends/tfhe-cuda-backend/cuda/src/polynomial/functions.cuh b/backends/tfhe-cuda-backend/cuda/src/polynomial/functions.cuh
index c81025afe6..d9c353721b 100644
--- a/backends/tfhe-cuda-backend/cuda/src/polynomial/functions.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/polynomial/functions.cuh
@@ -11,7 +11,8 @@
  * function compresses decomposed buffer into half size complex buffer for fft
  */
 template <class params>
-__device__ void real_to_complex_compressed(int16_t *src, double2 *dst) {
+__device__ void real_to_complex_compressed(const int16_t *__restrict__ src,
+                                           double2 *dst) {
   int tid = threadIdx.x;
 #pragma unroll
   for (int i = 0; i < params::opt / 2; i++) {
@@ -22,7 +23,7 @@ __device__ void real_to_complex_compressed(int16_t *src, double2 *dst) {
 }
 
 template <typename T, int elems_per_thread, int block_size>
-__device__ void copy_polynomial(T *source, T *dst) {
+__device__ void copy_polynomial(const T *__restrict__ source, T *dst) {
   int tid = threadIdx.x;
 #pragma unroll
   for (int i = 0; i < elems_per_thread; i++) {
@@ -41,13 +42,14 @@ __device__ void copy_polynomial(T *source, T *dst) {
  * By default, it works on a single polynomial.
  */
 template <typename T, int block_size, int elems_per_thread>
-__device__ void divide_by_monomial_negacyclic_inplace(T *accumulator, T *input,
-                                                      uint32_t j, bool zeroAcc,
-                                                      uint32_t num_poly = 1) {
+__device__ void
+divide_by_monomial_negacyclic_inplace(T *accumulator,
+                                      const T *__restrict__ input, uint32_t j,
+                                      bool zeroAcc, uint32_t num_poly = 1) {
   constexpr int degree = block_size * elems_per_thread;
   for (int z = 0; z < num_poly; z++) {
     T *accumulator_slice = (T *)accumulator + (ptrdiff_t)(z * degree);
-    T *input_slice = (T *)input + (ptrdiff_t)(z * degree);
+    const T *input_slice = (T *)input + (ptrdiff_t)(z * degree);
 
     int tid = threadIdx.x;
     if (zeroAcc) {
diff --git a/backends/tfhe-cuda-backend/cuda/src/polynomial/polynomial_math.cuh b/backends/tfhe-cuda-backend/cuda/src/polynomial/polynomial_math.cuh
index 3997d51512..a6d7901f20 100644
--- a/backends/tfhe-cuda-backend/cuda/src/polynomial/polynomial_math.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/polynomial/polynomial_math.cuh
@@ -39,9 +39,8 @@ __device__ void polynomial_product_in_fourier_domain(T *result, T *first,
 // If init_accumulator is set, assumes that result was not initialized and does
 // that with the outcome of first * second
 template <typename T, class params>
-__device__ void
-polynomial_product_accumulate_in_fourier_domain(T *result, T *first, T *second,
-                                                bool init_accumulator = false) {
+__device__ void polynomial_product_accumulate_in_fourier_domain(
+    T *result, T *first, const T *second, bool init_accumulator = false) {
   int tid = threadIdx.x;
   if (init_accumulator) {
     for (int i = 0; i < params::opt / 2; i++) {
@@ -60,7 +59,7 @@ polynomial_product_accumulate_in_fourier_domain(T *result, T *first, T *second,
 // that with the outcome of first * second
 template <typename T, class params>
 __device__ void
-polynomial_product_accumulate_by_monomial(T *result, T *poly,
+polynomial_product_accumulate_by_monomial(T *result, const T *__restrict__ poly,
                                           uint64_t monomial_degree,
                                           bool init_accumulator = false) {
   // monomial_degree \in [0, 2 * params::degree)
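To close the loop on the polynomial helpers: copy_polynomial is the template the multi-bit keybundle kernel instantiates on bootstrapping-key slices, and its two integer parameters encode a strided block copy: block_size threads each copy elems_per_thread coefficients, so one call moves block_size * elems_per_thread of them. A hedged usage sketch (hypothetical wrapper kernel; the definition mirrors the const form introduced by this patch):

#include <cstdint>

template <typename T, int elems_per_thread, int block_size>
__device__ void copy_polynomial(const T *__restrict__ source, T *dst) {
  int tid = threadIdx.x;
#pragma unroll
  for (int i = 0; i < elems_per_thread; i++) {
    dst[tid] = source[tid]; // stride-block_size accesses stay coalesced
    tid = tid + block_size;
  }
}

// 256 threads x 4 elements each = one 1024-coefficient polynomial per call.
// Launch with blockDim.x == 256.
__global__ void copy_poly_1024(uint64_t *dst,
                               const uint64_t *__restrict__ src) {
  copy_polynomial<uint64_t, 4, 256>(src, dst);
}

Because source is const T *__restrict__, the same definition now accepts both mutable scratch buffers and the const key slices that the rest of this patch threads through the PBS kernels.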