diff --git a/backends/tfhe-cuda-backend/cuda/include/pbs/pbs_multibit_utilities.h b/backends/tfhe-cuda-backend/cuda/include/pbs/pbs_multibit_utilities.h index 1e109b4409..350b5862f4 100644 --- a/backends/tfhe-cuda-backend/cuda/include/pbs/pbs_multibit_utilities.h +++ b/backends/tfhe-cuda-backend/cuda/include/pbs/pbs_multibit_utilities.h @@ -106,7 +106,7 @@ template struct pbs_buffer { uint32_t lwe_chunk_size; double2 *keybundle_fft; Torus *global_accumulator; - double2 *global_accumulator_fft; + double2 *global_join_buffer; PBS_VARIANT pbs_variant; @@ -225,10 +225,12 @@ template struct pbs_buffer { num_blocks_keybundle * (polynomial_size / 2) * sizeof(double2), stream, gpu_index); global_accumulator = (Torus *)cuda_malloc_async( - num_blocks_acc_step_one * polynomial_size * sizeof(Torus), stream, - gpu_index); - global_accumulator_fft = (double2 *)cuda_malloc_async( - num_blocks_acc_step_one * (polynomial_size / 2) * sizeof(double2), + input_lwe_ciphertext_count * (glwe_dimension + 1) * polynomial_size * + sizeof(Torus), + stream, gpu_index); + global_join_buffer = (double2 *)cuda_malloc_async( + level_count * (glwe_dimension + 1) * input_lwe_ciphertext_count * + (polynomial_size / 2) * sizeof(double2), stream, gpu_index); } } @@ -260,7 +262,7 @@ template struct pbs_buffer { cuda_drop_async(keybundle_fft, stream, gpu_index); cuda_drop_async(global_accumulator, stream, gpu_index); - cuda_drop_async(global_accumulator_fft, stream, gpu_index); + cuda_drop_async(global_join_buffer, stream, gpu_index); } }; diff --git a/backends/tfhe-cuda-backend/cuda/include/pbs/pbs_utilities.h b/backends/tfhe-cuda-backend/cuda/include/pbs/pbs_utilities.h index b5451f5bdd..a9e21f77ab 100644 --- a/backends/tfhe-cuda-backend/cuda/include/pbs/pbs_utilities.h +++ b/backends/tfhe-cuda-backend/cuda/include/pbs/pbs_utilities.h @@ -69,7 +69,7 @@ template struct pbs_buffer { int8_t *d_mem; Torus *global_accumulator; - double2 *global_accumulator_fft; + double2 *global_join_buffer; PBS_VARIANT pbs_variant; @@ -114,7 +114,7 @@ template struct pbs_buffer { // Otherwise, both kernels run all in shared memory d_mem = (int8_t *)cuda_malloc_async(device_mem, stream, gpu_index); - global_accumulator_fft = (double2 *)cuda_malloc_async( + global_join_buffer = (double2 *)cuda_malloc_async( (glwe_dimension + 1) * level_count * input_lwe_ciphertext_count * (polynomial_size / 2) * sizeof(double2), stream, gpu_index); @@ -147,7 +147,7 @@ template struct pbs_buffer { // Otherwise, both kernels run all in shared memory d_mem = (int8_t *)cuda_malloc_async(device_mem, stream, gpu_index); - global_accumulator_fft = (double2 *)cuda_malloc_async( + global_join_buffer = (double2 *)cuda_malloc_async( (glwe_dimension + 1) * level_count * input_lwe_ciphertext_count * polynomial_size / 2 * sizeof(double2), stream, gpu_index); @@ -194,7 +194,7 @@ template struct pbs_buffer { // Otherwise, both kernels run all in shared memory d_mem = (int8_t *)cuda_malloc_async(device_mem, stream, gpu_index); - global_accumulator_fft = (double2 *)cuda_malloc_async( + global_join_buffer = (double2 *)cuda_malloc_async( (glwe_dimension + 1) * level_count * input_lwe_ciphertext_count * polynomial_size / 2 * sizeof(double2), stream, gpu_index); @@ -208,7 +208,7 @@ template struct pbs_buffer { void release(cudaStream_t stream, uint32_t gpu_index) { cuda_drop_async(d_mem, stream, gpu_index); - cuda_drop_async(global_accumulator_fft, stream, gpu_index); + cuda_drop_async(global_join_buffer, stream, gpu_index); if (pbs_variant == DEFAULT) cuda_drop_async(global_accumulator, stream, gpu_index); diff --git a/backends/tfhe-cuda-backend/cuda/src/crypto/gadget.cuh b/backends/tfhe-cuda-backend/cuda/src/crypto/gadget.cuh index 32df0caff7..7b4653af3b 100644 --- a/backends/tfhe-cuda-backend/cuda/src/crypto/gadget.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/crypto/gadget.cuh @@ -1,6 +1,7 @@ #ifndef CNCRT_CRYPTO_CUH #define CNCRT_CRPYTO_CUH +#include "crypto/torus.cuh" #include "device.h" #include @@ -21,7 +22,6 @@ private: uint32_t base_log; uint32_t mask; uint32_t num_poly; - int current_level; T mask_mod_b; T *state; @@ -32,7 +32,6 @@ public: state(state) { mask_mod_b = (1ll << base_log) - 1ll; - current_level = level_count; int tid = threadIdx.x; for (int i = 0; i < num_poly * params::opt; i++) { state[tid] >>= (sizeof(T) * 8 - base_log * level_count); @@ -52,8 +51,6 @@ public: // Decomposes a single polynomial __device__ void decompose_and_compress_next_polynomial(double2 *result, int j) { - if (j == 0) - current_level -= 1; int tid = threadIdx.x; auto state_slice = state + j * params::degree; @@ -72,8 +69,8 @@ public: res_re -= carry_re << base_log; res_im -= carry_im << base_log; - result[tid].x = (int32_t)res_re; - result[tid].y = (int32_t)res_im; + typecast_torus_to_double(res_re, result[tid].x); + typecast_torus_to_double(res_im, result[tid].y); tid += params::degree / params::opt; } diff --git a/backends/tfhe-cuda-backend/cuda/src/crypto/torus.cuh b/backends/tfhe-cuda-backend/cuda/src/crypto/torus.cuh index b4ef3259a9..f9875b107a 100644 --- a/backends/tfhe-cuda-backend/cuda/src/crypto/torus.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/crypto/torus.cuh @@ -1,6 +1,7 @@ #ifndef CNCRT_TORUS_CUH #define CNCRT_TORUS_CUH +#include "device.h" #include "polynomial/parameters.cuh" #include "types/int128.cuh" #include "utils/kernel_dimensions.cuh" @@ -43,6 +44,21 @@ __device__ inline void typecast_double_round_to_torus(double x, T &r) { typecast_double_to_torus(round(frac), r); } +template +__device__ inline void typecast_torus_to_double(T x, double &r); + +template <> +__device__ inline void typecast_torus_to_double(uint32_t x, + double &r) { + r = __int2double_rn(x); +} + +template <> +__device__ inline void typecast_torus_to_double(uint64_t x, + double &r) { + r = __ll2double_rn(x); +} + template __device__ inline T round_to_closest_multiple(T x, uint32_t base_log, uint32_t level_count) { diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap.cuh index 95a7a78c14..9215bc044e 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap.cuh @@ -7,6 +7,7 @@ #include "fft/bnsmfft.cuh" #include "helper_multi_gpu.h" #include "pbs/programmable_bootstrap_multibit.h" +#include "polynomial/polynomial_math.cuh" using namespace cooperative_groups; namespace cg = cooperative_groups; @@ -20,59 +21,43 @@ get_join_buffer_element(int level_id, int glwe_id, G &group, double2 *global_memory_buffer, uint32_t polynomial_size, uint32_t glwe_dimension, bool support_dsm); -template +/** Perform the matrix multiplication between the GGSW and the GLWE, + * each block operating on a single level for mask and body. + * Both operands should be at fourier domain + * + * This function assumes: + * - Thread blocks at dimension x relates to the decomposition level. + * - Thread blocks at dimension y relates to the glwe dimension. + * - polynomial_size / params::opt threads are available per block + */ +template __device__ void -mul_ggsw_glwe(Torus *accumulator, double2 *fft, double2 *join_buffer, - const double2 *__restrict__ bootstrapping_key, - int polynomial_size, uint32_t glwe_dimension, int level_count, - int iteration, G &group, bool support_dsm = false) { - - // Switch to the FFT space - NSMFFT_direct>(fft); - synchronize_threads_in_block(); - - // Get the pieces of the bootstrapping key that will be needed for the - // external product; blockIdx.x is the ID of the block that's executing - // this function, so we end up getting the lines of the bootstrapping key - // needed to perform the external product in this block (corresponding to - // the same decomposition level) - auto bsk_slice = get_ith_mask_kth_block( - bootstrapping_key, iteration, blockIdx.y, blockIdx.x, polynomial_size, - glwe_dimension, level_count); - - // Perform the matrix multiplication between the GGSW and the GLWE, - // each block operating on a single level for mask and body +mul_ggsw_glwe_in_fourier_domain(double2 *fft, double2 *join_buffer, + const double2 *__restrict__ bootstrapping_key, + int iteration, G &group, + bool support_dsm = false) { + const uint32_t polynomial_size = params::degree; + const uint32_t glwe_dimension = gridDim.y - 1; + const uint32_t level_count = gridDim.x; // The first product is used to initialize level_join_buffer - auto bsk_poly = bsk_slice + blockIdx.y * params::degree / 2; auto this_block_rank = get_this_block_rank(group, support_dsm); - auto buffer_slice = - get_join_buffer_element(blockIdx.x, blockIdx.y, group, join_buffer, - polynomial_size, glwe_dimension, support_dsm); - - int tid = threadIdx.x; - for (int i = 0; i < params::opt / 2; i++) { - buffer_slice[tid] = fft[tid] * bsk_poly[tid]; - tid += params::degree / params::opt; - } - - group.sync(); // Continues multiplying fft by every polynomial in that particular bsk level // Each y-block accumulates in a different polynomial at each iteration - for (int j = 1; j < (glwe_dimension + 1); j++) { + auto bsk_slice = get_ith_mask_kth_block( + bootstrapping_key, iteration, blockIdx.y, blockIdx.x, polynomial_size, + glwe_dimension, level_count); + for (int j = 0; j < glwe_dimension + 1; j++) { int idx = (j + this_block_rank) % (glwe_dimension + 1); - auto bsk_poly = bsk_slice + idx * params::degree / 2; + auto bsk_poly = bsk_slice + idx * polynomial_size / 2; auto buffer_slice = get_join_buffer_element(blockIdx.x, idx, group, join_buffer, polynomial_size, glwe_dimension, support_dsm); - int tid = threadIdx.x; - for (int i = 0; i < params::opt / 2; i++) { - buffer_slice[tid] += fft[tid] * bsk_poly[tid]; - tid += params::degree / params::opt; - } + polynomial_product_accumulate_in_fourier_domain( + buffer_slice, fft, bsk_poly, j == 0); group.sync(); } @@ -80,40 +65,16 @@ mul_ggsw_glwe(Torus *accumulator, double2 *fft, double2 *join_buffer, // All blocks are synchronized here; after this sync, level_join_buffer has // the values needed from every other block - auto src_acc = - get_join_buffer_element(0, blockIdx.y, group, join_buffer, - polynomial_size, glwe_dimension, support_dsm); - - // copy first product into fft buffer - tid = threadIdx.x; - for (int i = 0; i < params::opt / 2; i++) { - fft[tid] = src_acc[tid]; - tid += params::degree / params::opt; - } - synchronize_threads_in_block(); - // accumulate rest of the products into fft buffer - for (int l = 1; l < gridDim.x; l++) { + for (int l = 0; l < level_count; l++) { auto cur_src_acc = get_join_buffer_element(l, blockIdx.y, group, join_buffer, polynomial_size, glwe_dimension, support_dsm); - tid = threadIdx.x; - for (int i = 0; i < params::opt / 2; i++) { - fft[tid] += cur_src_acc[tid]; - tid += params::degree / params::opt; - } - } - synchronize_threads_in_block(); + polynomial_accumulate_in_fourier_domain(fft, cur_src_acc, l == 0); + } - // Perform the inverse FFT on the result of the GGSW x GLWE and add to the - // accumulator - NSMFFT_inverse>(fft); synchronize_threads_in_block(); - - add_to_torus(fft, accumulator); - - __syncthreads(); } template diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_classic.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_classic.cuh index 682f073ca7..0cfd95efb7 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_classic.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_classic.cuh @@ -129,18 +129,16 @@ __global__ void device_programmable_bootstrap_cg( GadgetMatrix gadget_acc(base_log, level_count, accumulator_rotated); gadget_acc.decompose_and_compress_level(accumulator_fft, blockIdx.x); - - // We are using the same memory space for accumulator_fft and - // accumulator_rotated, so we need to synchronize here to make sure they - // don't modify the same memory space at the same time + NSMFFT_direct>(accumulator_fft); synchronize_threads_in_block(); // Perform G^-1(ACC) * GGSW -> GLWE - mul_ggsw_glwe( - accumulator, accumulator_fft, block_join_buffer, bootstrapping_key, - polynomial_size, glwe_dimension, level_count, i, grid); - + mul_ggsw_glwe_in_fourier_domain( + accumulator_fft, block_join_buffer, bootstrapping_key, i, grid); + NSMFFT_inverse>(accumulator_fft); synchronize_threads_in_block(); + + add_to_torus(accumulator_fft, accumulator); } auto block_lwe_array_out = @@ -148,40 +146,42 @@ __global__ void device_programmable_bootstrap_cg( (glwe_dimension * polynomial_size + 1) + blockIdx.y * polynomial_size]; - if (blockIdx.x == 0 && blockIdx.y < glwe_dimension) { - // Perform a sample extract. At this point, all blocks have the result, but - // we do the computation at block 0 to avoid waiting for extra blocks, in - // case they're not synchronized - sample_extract_mask(block_lwe_array_out, accumulator); - if (lut_count > 1) { - for (int i = 1; i < lut_count; i++) { - auto next_lwe_array_out = - lwe_array_out + - (i * gridDim.z * (glwe_dimension * polynomial_size + 1)); - auto next_block_lwe_array_out = - &next_lwe_array_out[lwe_output_indexes[blockIdx.z] * - (glwe_dimension * polynomial_size + 1) + - blockIdx.y * polynomial_size]; - - sample_extract_mask(next_block_lwe_array_out, - accumulator, 1, i * lut_stride); + if (blockIdx.x == 0) { + if (blockIdx.y < glwe_dimension) { + // Perform a sample extract. At this point, all blocks have the result, + // but we do the computation at block 0 to avoid waiting for extra blocks, + // in case they're not synchronized + sample_extract_mask(block_lwe_array_out, accumulator); + if (lut_count > 1) { + for (int i = 1; i < lut_count; i++) { + auto next_lwe_array_out = + lwe_array_out + + (i * gridDim.z * (glwe_dimension * polynomial_size + 1)); + auto next_block_lwe_array_out = + &next_lwe_array_out[lwe_output_indexes[blockIdx.z] * + (glwe_dimension * polynomial_size + 1) + + blockIdx.y * polynomial_size]; + + sample_extract_mask(next_block_lwe_array_out, + accumulator, 1, i * lut_stride); + } } - } - } else if (blockIdx.x == 0 && blockIdx.y == glwe_dimension) { - sample_extract_body(block_lwe_array_out, accumulator, 0); - if (lut_count > 1) { - for (int i = 1; i < lut_count; i++) { - - auto next_lwe_array_out = - lwe_array_out + - (i * gridDim.z * (glwe_dimension * polynomial_size + 1)); - auto next_block_lwe_array_out = - &next_lwe_array_out[lwe_output_indexes[blockIdx.z] * - (glwe_dimension * polynomial_size + 1) + - blockIdx.y * polynomial_size]; - - sample_extract_body(next_block_lwe_array_out, - accumulator, 0, i * lut_stride); + } else if (blockIdx.y == glwe_dimension) { + sample_extract_body(block_lwe_array_out, accumulator, 0); + if (lut_count > 1) { + for (int i = 1; i < lut_count; i++) { + + auto next_lwe_array_out = + lwe_array_out + + (i * gridDim.z * (glwe_dimension * polynomial_size + 1)); + auto next_block_lwe_array_out = + &next_lwe_array_out[lwe_output_indexes[blockIdx.z] * + (glwe_dimension * polynomial_size + 1) + + blockIdx.y * polynomial_size]; + + sample_extract_body(next_block_lwe_array_out, + accumulator, 0, i * lut_stride); + } } } } @@ -254,7 +254,7 @@ __host__ void host_programmable_bootstrap_cg( uint64_t partial_dm = full_dm - partial_sm; int8_t *d_mem = buffer->d_mem; - double2 *buffer_fft = buffer->global_accumulator_fft; + double2 *buffer_fft = buffer->global_join_buffer; int thds = polynomial_size / params::opt; dim3 grid(level_count, glwe_dimension + 1, input_lwe_ciphertext_count); diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_multibit.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_multibit.cuh index b8ddc8fdc7..1e89164235 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_multibit.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_multibit.cuh @@ -33,7 +33,6 @@ __global__ void __launch_bounds__(params::degree / params::opt) uint32_t lwe_chunk_size, uint32_t keybundle_size_per_input, int8_t *device_mem, uint64_t device_memory_size_per_block, uint32_t lut_count, uint32_t lut_stride) { - grid_group grid = this_grid(); // We use shared memory for the polynomials that are used often during the @@ -50,9 +49,9 @@ __global__ void __launch_bounds__(params::degree / params::opt) selected_memory = &device_mem[block_index * device_memory_size_per_block]; } - Torus *accumulator = (Torus *)selected_memory; + Torus *accumulator_rotated = (Torus *)selected_memory; double2 *accumulator_fft = - (double2 *)accumulator + + (double2 *)accumulator_rotated + (ptrdiff_t)(sizeof(Torus) * polynomial_size / sizeof(double2)); if constexpr (SMD == PARTIALSM) @@ -71,13 +70,12 @@ __global__ void __launch_bounds__(params::degree / params::opt) &join_buffer[blockIdx.z * level_count * (glwe_dimension + 1) * params::degree / 2]; - Torus *global_slice = - global_accumulator + - (blockIdx.y + blockIdx.z * (glwe_dimension + 1)) * params::degree; + Torus *global_accumulator_slice = + &global_accumulator[(blockIdx.y + blockIdx.z * (glwe_dimension + 1)) * + params::degree]; - const double2 *keybundle = keybundle_array + - // select the input - blockIdx.z * keybundle_size_per_input; + const double2 *keybundle = + &keybundle_array[blockIdx.z * keybundle_size_per_input]; if (lwe_offset == 0) { // Put "b" in [0, 2N[ @@ -87,12 +85,12 @@ __global__ void __launch_bounds__(params::degree / params::opt) divide_by_monomial_negacyclic_inplace( - accumulator, &block_lut_vector[blockIdx.y * params::degree], b_hat, - false); + accumulator_rotated, &block_lut_vector[blockIdx.y * params::degree], + b_hat, false); } else { - // Load the accumulator calculated in previous iterations + // Load the accumulator_rotated calculated in previous iterations copy_polynomial( - global_slice, accumulator); + global_accumulator_slice, accumulator_rotated); } for (int i = 0; (i + lwe_offset) < lwe_dimension && i < lwe_chunk_size; i++) { @@ -100,79 +98,82 @@ __global__ void __launch_bounds__(params::degree / params::opt) // bootstrapped ciphertext round_to_closest_multiple_inplace( - accumulator, base_log, level_count); + accumulator_rotated, base_log, level_count); - // Decompose the accumulator. Each block gets one level of the + // Decompose the accumulator_rotated. Each block gets one level of the // decomposition, for the mask and the body (so block 0 will have the - // accumulator decomposed at level 0, 1 at 1, etc.) - GadgetMatrix gadget_acc(base_log, level_count, accumulator); + // accumulator_rotated decomposed at level 0, 1 at 1, etc.) + GadgetMatrix gadget_acc(base_log, level_count, + accumulator_rotated); gadget_acc.decompose_and_compress_level(accumulator_fft, blockIdx.x); - - // We are using the same memory space for accumulator_fft and - // accumulator_rotated, so we need to synchronize here to make sure they - // don't modify the same memory space at the same time + NSMFFT_direct>(accumulator_fft); synchronize_threads_in_block(); // Perform G^-1(ACC) * GGSW -> GLWE - mul_ggsw_glwe( - accumulator, accumulator_fft, block_join_buffer, keybundle, - polynomial_size, glwe_dimension, level_count, i, grid); - + mul_ggsw_glwe_in_fourier_domain( + accumulator_fft, block_join_buffer, keybundle, i, grid); + NSMFFT_inverse>(accumulator_fft); synchronize_threads_in_block(); + + add_to_torus(accumulator_fft, accumulator_rotated, true); } - if (lwe_offset + lwe_chunk_size >= (lwe_dimension / grouping_factor)) { - auto block_lwe_array_out = - &lwe_array_out[lwe_output_indexes[blockIdx.z] * - (glwe_dimension * polynomial_size + 1) + - blockIdx.y * polynomial_size]; - - if (blockIdx.x == 0 && blockIdx.y < glwe_dimension) { - // Perform a sample extract. At this point, all blocks have the result, - // but we do the computation at block 0 to avoid waiting for extra blocks, - // in case they're not synchronized - // Always extract one by default - sample_extract_mask(block_lwe_array_out, accumulator); - - if (lut_count > 1) { - for (int i = 1; i < lut_count; i++) { - auto next_lwe_array_out = - lwe_array_out + - (i * gridDim.z * (glwe_dimension * polynomial_size + 1)); - auto next_block_lwe_array_out = - &next_lwe_array_out[lwe_output_indexes[blockIdx.z] * - (glwe_dimension * polynomial_size + 1) + - blockIdx.y * polynomial_size]; - - sample_extract_mask(next_block_lwe_array_out, - accumulator, 1, i * lut_stride); + auto accumulator = accumulator_rotated; + + if (blockIdx.x == 0) { + if (lwe_offset + lwe_chunk_size >= (lwe_dimension / grouping_factor)) { + auto block_lwe_array_out = + &lwe_array_out[lwe_output_indexes[blockIdx.z] * + (glwe_dimension * polynomial_size + 1) + + blockIdx.y * polynomial_size]; + + if (blockIdx.y < glwe_dimension) { + // Perform a sample extract. At this point, all blocks have the result, + // but we do the computation at block 0 to avoid waiting for extra + // blocks, in case they're not synchronized Always extract one by + // default + sample_extract_mask(block_lwe_array_out, accumulator); + + if (lut_count > 1) { + for (int i = 1; i < lut_count; i++) { + auto next_lwe_array_out = + lwe_array_out + + (i * gridDim.z * (glwe_dimension * polynomial_size + 1)); + auto next_block_lwe_array_out = + &next_lwe_array_out[lwe_output_indexes[blockIdx.z] * + (glwe_dimension * polynomial_size + 1) + + blockIdx.y * polynomial_size]; + + sample_extract_mask(next_block_lwe_array_out, + accumulator, 1, i * lut_stride); + } } - } - } else if (blockIdx.x == 0 && blockIdx.y == glwe_dimension) { + } else if (blockIdx.y == glwe_dimension) { - sample_extract_body(block_lwe_array_out, accumulator, 0); + sample_extract_body(block_lwe_array_out, accumulator, 0); - if (lut_count > 1) { - for (int i = 1; i < lut_count; i++) { + if (lut_count > 1) { + for (int i = 1; i < lut_count; i++) { - auto next_lwe_array_out = - lwe_array_out + - (i * gridDim.z * (glwe_dimension * polynomial_size + 1)); - auto next_block_lwe_array_out = - &next_lwe_array_out[lwe_output_indexes[blockIdx.z] * - (glwe_dimension * polynomial_size + 1) + - blockIdx.y * polynomial_size]; + auto next_lwe_array_out = + lwe_array_out + + (i * gridDim.z * (glwe_dimension * polynomial_size + 1)); + auto next_block_lwe_array_out = + &next_lwe_array_out[lwe_output_indexes[blockIdx.z] * + (glwe_dimension * polynomial_size + 1) + + blockIdx.y * polynomial_size]; - sample_extract_body(next_block_lwe_array_out, - accumulator, 0, i * lut_stride); + sample_extract_body(next_block_lwe_array_out, + accumulator, 0, i * lut_stride); + } } } + } else { + // Load the accumulator calculated in previous iterations + copy_polynomial( + accumulator, global_accumulator_slice); } - } else { - // Load the accumulator calculated in previous iterations - copy_polynomial( - accumulator, global_slice); } } @@ -295,15 +296,18 @@ __host__ void execute_cg_external_product_loop( uint32_t level_count, uint32_t lwe_offset, uint32_t lut_count, uint32_t lut_stride) { - auto lwe_chunk_size = buffer->lwe_chunk_size; - uint64_t full_dm = + uint64_t full_sm = get_buffer_size_full_sm_cg_multibit_programmable_bootstrap( polynomial_size); - uint64_t partial_dm = + uint64_t partial_sm = get_buffer_size_partial_sm_cg_multibit_programmable_bootstrap( polynomial_size); + + auto full_dm = full_sm; + auto partial_dm = full_sm - partial_sm; uint64_t no_dm = 0; + auto lwe_chunk_size = buffer->lwe_chunk_size; int max_shared_memory = cuda_get_max_shared_memory(0); cudaSetDevice(gpu_index); @@ -313,13 +317,11 @@ __host__ void execute_cg_external_product_loop( uint32_t chunk_size = std::min(lwe_chunk_size, (lwe_dimension / grouping_factor) - lwe_offset); - if (chunk_size == 0) - return; auto d_mem = buffer->d_mem_acc_cg; auto keybundle_fft = buffer->keybundle_fft; auto global_accumulator = buffer->global_accumulator; - auto buffer_fft = buffer->global_accumulator_fft; + auto join_buffer = buffer->global_join_buffer; void *kernel_args[22]; kernel_args[0] = &lwe_array_out; @@ -329,7 +331,7 @@ __host__ void execute_cg_external_product_loop( kernel_args[4] = &lwe_array_in; kernel_args[5] = &lwe_input_indexes; kernel_args[6] = &keybundle_fft; - kernel_args[7] = &buffer_fft; + kernel_args[7] = &join_buffer; kernel_args[8] = &global_accumulator; kernel_args[9] = &lwe_dimension; kernel_args[10] = &glwe_dimension; @@ -358,13 +360,13 @@ __host__ void execute_cg_external_product_loop( check_cuda_error(cudaLaunchCooperativeKernel( (void *)device_multi_bit_programmable_bootstrap_cg_accumulate< Torus, params, PARTIALSM>, - grid_accumulate, thds, (void **)kernel_args, partial_dm, stream)); + grid_accumulate, thds, (void **)kernel_args, partial_sm, stream)); } else { kernel_args[19] = &no_dm; check_cuda_error(cudaLaunchCooperativeKernel( (void *)device_multi_bit_programmable_bootstrap_cg_accumulate< Torus, params, FULLSM>, - grid_accumulate, thds, (void **)kernel_args, full_dm, stream)); + grid_accumulate, thds, (void **)kernel_args, full_sm, stream)); } } diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cuh index 8a534b2e83..831d3a6478 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cuh @@ -25,7 +25,7 @@ __global__ void __launch_bounds__(params::degree / params::opt) const Torus *__restrict__ lwe_array_in, const Torus *__restrict__ lwe_input_indexes, const double2 *__restrict__ bootstrapping_key, - Torus *global_accumulator, double2 *global_accumulator_fft, + Torus *global_accumulator, double2 *global_join_buffer, uint32_t lwe_iteration, uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, int8_t *device_mem, uint64_t device_memory_size_per_block) { @@ -67,10 +67,9 @@ __global__ void __launch_bounds__(params::degree / params::opt) (blockIdx.y + blockIdx.z * (glwe_dimension + 1)) * params::degree; double2 *global_fft_slice = - global_accumulator_fft + - (blockIdx.y + blockIdx.x * (glwe_dimension + 1) + - blockIdx.z * level_count * (glwe_dimension + 1)) * - (polynomial_size / 2); + global_join_buffer + (blockIdx.y + blockIdx.x * (glwe_dimension + 1) + + blockIdx.z * level_count * (glwe_dimension + 1)) * + (polynomial_size / 2); if (lwe_iteration == 0) { // First iteration @@ -139,7 +138,7 @@ __global__ void __launch_bounds__(params::degree / params::opt) const Torus *__restrict__ lut_vector, const Torus *__restrict__ lut_vector_indexes, const double2 *__restrict__ bootstrapping_key, - Torus *global_accumulator, double2 *global_accumulator_fft, + Torus *global_accumulator, double2 *global_join_buffer, uint32_t lwe_iteration, uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, int8_t *device_mem, uint64_t device_memory_size_per_block, @@ -171,9 +170,9 @@ __global__ void __launch_bounds__(params::degree / params::opt) accumulator_fft = (double2 *)sharedmem; for (int level = 0; level < level_count; level++) { - double2 *global_fft_slice = global_accumulator_fft + - (level + blockIdx.x * level_count) * - (glwe_dimension + 1) * (params::degree / 2); + double2 *global_fft_slice = + global_join_buffer + (level + blockIdx.x * level_count) * + (glwe_dimension + 1) * (params::degree / 2); for (int j = 0; j < (glwe_dimension + 1); j++) { double2 *fft = global_fft_slice + j * params::degree / 2; @@ -292,7 +291,7 @@ uint64_t get_buffer_size_programmable_bootstrap( } // Otherwise, both kernels run all in shared memory uint64_t buffer_size = device_mem + - // global_accumulator_fft + // global_join_buffer (glwe_dimension + 1) * level_count * input_lwe_ciphertext_count * (polynomial_size / 2) * sizeof(double2) + @@ -368,7 +367,7 @@ __host__ void execute_step_one( cudaStream_t stream, uint32_t gpu_index, Torus const *lut_vector, Torus const *lut_vector_indexes, Torus const *lwe_array_in, Torus const *lwe_input_indexes, double2 const *bootstrapping_key, - Torus *global_accumulator, double2 *global_accumulator_fft, + Torus *global_accumulator, double2 *global_join_buffer, uint32_t input_lwe_ciphertext_count, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, int8_t *d_mem, int lwe_iteration, uint64_t partial_sm, @@ -383,21 +382,21 @@ __host__ void execute_step_one( device_programmable_bootstrap_step_one <<>>( lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, - bootstrapping_key, global_accumulator, global_accumulator_fft, + bootstrapping_key, global_accumulator, global_join_buffer, lwe_iteration, lwe_dimension, polynomial_size, base_log, level_count, d_mem, full_dm); } else if (max_shared_memory < full_sm) { device_programmable_bootstrap_step_one <<>>( lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, - bootstrapping_key, global_accumulator, global_accumulator_fft, + bootstrapping_key, global_accumulator, global_join_buffer, lwe_iteration, lwe_dimension, polynomial_size, base_log, level_count, d_mem, partial_dm); } else { device_programmable_bootstrap_step_one <<>>( lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, - bootstrapping_key, global_accumulator, global_accumulator_fft, + bootstrapping_key, global_accumulator, global_join_buffer, lwe_iteration, lwe_dimension, polynomial_size, base_log, level_count, d_mem, 0); } @@ -409,7 +408,7 @@ __host__ void execute_step_two( cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out, Torus const *lwe_output_indexes, Torus const *lut_vector, Torus const *lut_vector_indexes, double2 const *bootstrapping_key, - Torus *global_accumulator, double2 *global_accumulator_fft, + Torus *global_accumulator, double2 *global_join_buffer, uint32_t input_lwe_ciphertext_count, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, int8_t *d_mem, int lwe_iteration, uint64_t partial_sm, @@ -425,21 +424,21 @@ __host__ void execute_step_two( device_programmable_bootstrap_step_two <<>>( lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes, - bootstrapping_key, global_accumulator, global_accumulator_fft, + bootstrapping_key, global_accumulator, global_join_buffer, lwe_iteration, lwe_dimension, polynomial_size, base_log, level_count, d_mem, full_dm, lut_count, lut_stride); } else if (max_shared_memory < full_sm) { device_programmable_bootstrap_step_two <<>>( lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes, - bootstrapping_key, global_accumulator, global_accumulator_fft, + bootstrapping_key, global_accumulator, global_join_buffer, lwe_iteration, lwe_dimension, polynomial_size, base_log, level_count, d_mem, partial_dm, lut_count, lut_stride); } else { device_programmable_bootstrap_step_two <<>>( lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes, - bootstrapping_key, global_accumulator, global_accumulator_fft, + bootstrapping_key, global_accumulator, global_join_buffer, lwe_iteration, lwe_dimension, polynomial_size, base_log, level_count, d_mem, 0, lut_count, lut_stride); } @@ -478,20 +477,20 @@ __host__ void host_programmable_bootstrap( uint64_t full_dm_step_two = full_sm_step_two; Torus *global_accumulator = pbs_buffer->global_accumulator; - double2 *global_accumulator_fft = pbs_buffer->global_accumulator_fft; + double2 *global_join_buffer = pbs_buffer->global_join_buffer; int8_t *d_mem = pbs_buffer->d_mem; for (int i = 0; i < lwe_dimension; i++) { execute_step_one( stream, gpu_index, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, global_accumulator, - global_accumulator_fft, input_lwe_ciphertext_count, lwe_dimension, + global_join_buffer, input_lwe_ciphertext_count, lwe_dimension, glwe_dimension, polynomial_size, base_log, level_count, d_mem, i, partial_sm, partial_dm_step_one, full_sm_step_one, full_dm_step_one); execute_step_two( stream, gpu_index, lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes, bootstrapping_key, global_accumulator, - global_accumulator_fft, input_lwe_ciphertext_count, lwe_dimension, + global_join_buffer, input_lwe_ciphertext_count, lwe_dimension, glwe_dimension, polynomial_size, base_log, level_count, d_mem, i, partial_sm, partial_dm_step_two, full_sm_step_two, full_dm_step_two, lut_count, lut_stride); diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cuh index 15802c5996..b57c61cf90 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cuh @@ -50,7 +50,7 @@ __global__ void device_multi_bit_programmable_bootstrap_keybundle( uint64_t device_memory_size_per_block) { extern __shared__ int8_t sharedmem[]; - int8_t *selected_memory = sharedmem; + int8_t *selected_memory; if constexpr (SMD == FULLSM) { selected_memory = sharedmem; @@ -190,14 +190,14 @@ __global__ void __launch_bounds__(params::degree / params::opt) (glwe_dimension + 1)]; Torus *global_slice = - global_accumulator + - (blockIdx.y + blockIdx.z * (glwe_dimension + 1)) * params::degree; + &global_accumulator[(blockIdx.y + blockIdx.z * (glwe_dimension + 1)) * + params::degree]; double2 *global_fft_slice = - global_accumulator_fft + - (blockIdx.y + blockIdx.x * (glwe_dimension + 1) + - blockIdx.z * level_count * (glwe_dimension + 1)) * - (polynomial_size / 2); + &global_accumulator_fft[(blockIdx.y + blockIdx.x * (glwe_dimension + 1) + + blockIdx.z * level_count * + (glwe_dimension + 1)) * + (polynomial_size / 2)]; if (lwe_iteration == 0) { // First iteration @@ -249,8 +249,8 @@ __global__ void __launch_bounds__(params::degree / params::opt) device_multi_bit_programmable_bootstrap_accumulate_step_two( Torus *lwe_array_out, const Torus *__restrict__ lwe_output_indexes, const double2 *__restrict__ keybundle_array, Torus *global_accumulator, - double2 *global_accumulator_fft, uint32_t lwe_dimension, - uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count, + double2 *join_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, + uint32_t polynomial_size, uint32_t level_count, uint32_t grouping_factor, uint32_t iteration, uint32_t lwe_offset, uint32_t lwe_chunk_size, int8_t *device_mem, uint64_t device_memory_size_per_block, uint32_t lut_count, @@ -274,30 +274,29 @@ __global__ void __launch_bounds__(params::degree / params::opt) double2 *accumulator_fft = (double2 *)selected_memory; // - const double2 *keybundle = keybundle_array + - // select the input - blockIdx.x * lwe_chunk_size * level_count * - (glwe_dimension + 1) * (glwe_dimension + 1) * - (polynomial_size / 2); + const double2 *keybundle = + &keybundle_array[blockIdx.x * lwe_chunk_size * level_count * + (glwe_dimension + 1) * (glwe_dimension + 1) * + (polynomial_size / 2)]; - double2 *global_accumulator_fft_input = - global_accumulator_fft + - blockIdx.x * level_count * (glwe_dimension + 1) * (polynomial_size / 2); + double2 *join_buffer_slice = + &join_buffer[blockIdx.x * level_count * (glwe_dimension + 1) * + (polynomial_size / 2)]; for (int level = 0; level < level_count; level++) { double2 *global_fft_slice = - global_accumulator_fft_input + - level * (glwe_dimension + 1) * (polynomial_size / 2); + &join_buffer_slice[level * (glwe_dimension + 1) * + (polynomial_size / 2)]; for (int j = 0; j < (glwe_dimension + 1); j++) { - double2 *fft = global_fft_slice + j * params::degree / 2; + double2 *fft = &global_fft_slice[j * params::degree / 2]; // Get the bootstrapping key piece necessary for the multiplication // It is already in the Fourier domain auto bsk_slice = get_ith_mask_kth_block(keybundle, iteration, j, level, polynomial_size, glwe_dimension, level_count); - auto bsk_poly = bsk_slice + blockIdx.y * params::degree / 2; + auto bsk_poly = &bsk_slice[blockIdx.y * params::degree / 2]; polynomial_product_accumulate_in_fourier_domain( accumulator_fft, fft, bsk_poly, !level && !j); @@ -308,8 +307,8 @@ __global__ void __launch_bounds__(params::degree / params::opt) // accumulator NSMFFT_inverse>(accumulator_fft); Torus *global_slice = - global_accumulator + - (blockIdx.y + blockIdx.x * (glwe_dimension + 1)) * params::degree; + &global_accumulator[(blockIdx.y + blockIdx.x * (glwe_dimension + 1)) * + params::degree]; add_to_torus(accumulator_fft, global_slice, true); synchronize_threads_in_block(); @@ -499,8 +498,6 @@ __host__ void execute_compute_keybundle( auto lwe_chunk_size = buffer->lwe_chunk_size; uint32_t chunk_size = std::min(lwe_chunk_size, (lwe_dimension / grouping_factor) - lwe_offset); - if (chunk_size == 0) - return; uint32_t keybundle_size_per_input = lwe_chunk_size * level_count * (glwe_dimension + 1) * @@ -559,7 +556,7 @@ execute_step_one(cudaStream_t stream, uint32_t gpu_index, // auto d_mem = buffer->d_mem_acc_step_one; auto global_accumulator = buffer->global_accumulator; - auto global_accumulator_fft = buffer->global_accumulator_fft; + auto global_accumulator_fft = buffer->global_join_buffer; dim3 grid_accumulate_step_one(level_count, glwe_dimension + 1, num_samples); dim3 thds(polynomial_size / params::opt, 1, 1); @@ -611,7 +608,7 @@ __host__ void execute_step_two( auto d_mem = buffer->d_mem_acc_step_two; auto keybundle_fft = buffer->keybundle_fft; auto global_accumulator = buffer->global_accumulator; - auto global_accumulator_fft = buffer->global_accumulator_fft; + auto global_accumulator_fft = buffer->global_join_buffer; dim3 grid_accumulate_step_two(num_samples, glwe_dimension + 1); dim3 thds(polynomial_size / params::opt, 1, 1); diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_classic.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_classic.cuh index 910a74c18e..bbdf1ab43e 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_classic.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_classic.cuh @@ -133,18 +133,17 @@ __global__ void device_programmable_bootstrap_tbc( GadgetMatrix gadget_acc(base_log, level_count, accumulator_rotated); gadget_acc.decompose_and_compress_level(accumulator_fft, blockIdx.x); - - // We are using the same memory space for accumulator_fft and - // accumulator_rotated, so we need to synchronize here to make sure they - // don't modify the same memory space at the same time + NSMFFT_direct>(accumulator_fft); synchronize_threads_in_block(); // Perform G^-1(ACC) * GGSW -> GLWE - mul_ggsw_glwe( - accumulator, accumulator_fft, block_join_buffer, bootstrapping_key, - polynomial_size, glwe_dimension, level_count, i, cluster, support_dsm); - + mul_ggsw_glwe_in_fourier_domain( + accumulator_fft, block_join_buffer, bootstrapping_key, i, cluster, + support_dsm); + NSMFFT_inverse>(accumulator_fft); synchronize_threads_in_block(); + + add_to_torus(accumulator_fft, accumulator); } auto block_lwe_array_out = @@ -152,42 +151,44 @@ __global__ void device_programmable_bootstrap_tbc( (glwe_dimension * polynomial_size + 1) + blockIdx.y * polynomial_size]; - if (blockIdx.x == 0 && blockIdx.y < glwe_dimension) { - // Perform a sample extract. At this point, all blocks have the result, but - // we do the computation at block 0 to avoid waiting for extra blocks, in - // case they're not synchronized - sample_extract_mask(block_lwe_array_out, accumulator); - - if (lut_count > 1) { - for (int i = 1; i < lut_count; i++) { - auto next_lwe_array_out = - lwe_array_out + - (i * gridDim.z * (glwe_dimension * polynomial_size + 1)); - auto next_block_lwe_array_out = - &next_lwe_array_out[lwe_output_indexes[blockIdx.z] * - (glwe_dimension * polynomial_size + 1) + - blockIdx.y * polynomial_size]; - - sample_extract_mask(next_block_lwe_array_out, - accumulator, 1, i * lut_stride); + if (blockIdx.x == 0) { + if (blockIdx.y < glwe_dimension) { + // Perform a sample extract. At this point, all blocks have the result, + // but we do the computation at block 0 to avoid waiting for extra blocks, + // in case they're not synchronized + sample_extract_mask(block_lwe_array_out, accumulator); + + if (lut_count > 1) { + for (int i = 1; i < lut_count; i++) { + auto next_lwe_array_out = + lwe_array_out + + (i * gridDim.z * (glwe_dimension * polynomial_size + 1)); + auto next_block_lwe_array_out = + &next_lwe_array_out[lwe_output_indexes[blockIdx.z] * + (glwe_dimension * polynomial_size + 1) + + blockIdx.y * polynomial_size]; + + sample_extract_mask(next_block_lwe_array_out, + accumulator, 1, i * lut_stride); + } } - } - } else if (blockIdx.x == 0 && blockIdx.y == glwe_dimension) { - sample_extract_body(block_lwe_array_out, accumulator, 0); - - if (lut_count > 1) { - for (int i = 1; i < lut_count; i++) { - - auto next_lwe_array_out = - lwe_array_out + - (i * gridDim.z * (glwe_dimension * polynomial_size + 1)); - auto next_block_lwe_array_out = - &next_lwe_array_out[lwe_output_indexes[blockIdx.z] * - (glwe_dimension * polynomial_size + 1) + - blockIdx.y * polynomial_size]; - - sample_extract_body(next_block_lwe_array_out, - accumulator, 0, i * lut_stride); + } else if (blockIdx.y == glwe_dimension) { + sample_extract_body(block_lwe_array_out, accumulator, 0); + + if (lut_count > 1) { + for (int i = 1; i < lut_count; i++) { + + auto next_lwe_array_out = + lwe_array_out + + (i * gridDim.z * (glwe_dimension * polynomial_size + 1)); + auto next_block_lwe_array_out = + &next_lwe_array_out[lwe_output_indexes[blockIdx.z] * + (glwe_dimension * polynomial_size + 1) + + blockIdx.y * polynomial_size]; + + sample_extract_body(next_block_lwe_array_out, + accumulator, 0, i * lut_stride); + } } } } @@ -287,7 +288,7 @@ __host__ void host_programmable_bootstrap_tbc( uint64_t partial_dm = full_dm - partial_sm; int8_t *d_mem = buffer->d_mem; - double2 *buffer_fft = buffer->global_accumulator_fft; + double2 *buffer_fft = buffer->global_join_buffer; int thds = polynomial_size / params::opt; dim3 grid(level_count, glwe_dimension + 1, input_lwe_ciphertext_count); diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_multibit.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_multibit.cuh index ac5fd8051b..ade8d1f423 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_multibit.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_multibit.cuh @@ -54,9 +54,9 @@ __global__ void __launch_bounds__(params::degree / params::opt) selected_memory = &device_mem[block_index * device_memory_size_per_block]; } - Torus *accumulator = (Torus *)selected_memory; + Torus *accumulator_rotated = (Torus *)selected_memory; double2 *accumulator_fft = - (double2 *)accumulator + + (double2 *)accumulator_rotated + (ptrdiff_t)(sizeof(Torus) * polynomial_size / sizeof(double2)); if constexpr (SMD == PARTIALSM) { @@ -78,13 +78,12 @@ __global__ void __launch_bounds__(params::degree / params::opt) &join_buffer[blockIdx.z * level_count * (glwe_dimension + 1) * params::degree / 2]; - Torus *global_slice = - global_accumulator + - (blockIdx.y + blockIdx.z * (glwe_dimension + 1)) * params::degree; + Torus *global_accumulator_slice = + &global_accumulator[(blockIdx.y + blockIdx.z * (glwe_dimension + 1)) * + params::degree]; - const double2 *keybundle = keybundle_array + - // select the input - blockIdx.z * keybundle_size_per_input; + const double2 *keybundle = + &keybundle_array[blockIdx.z * keybundle_size_per_input]; if (lwe_offset == 0) { // Put "b" in [0, 2N[ @@ -94,12 +93,12 @@ __global__ void __launch_bounds__(params::degree / params::opt) divide_by_monomial_negacyclic_inplace( - accumulator, &block_lut_vector[blockIdx.y * params::degree], b_hat, - false); + accumulator_rotated, &block_lut_vector[blockIdx.y * params::degree], + b_hat, false); } else { // Load the accumulator calculated in previous iterations copy_polynomial( - global_slice, accumulator); + global_accumulator_slice, accumulator_rotated); } for (int i = 0; (i + lwe_offset) < lwe_dimension && i < lwe_chunk_size; i++) { @@ -107,75 +106,78 @@ __global__ void __launch_bounds__(params::degree / params::opt) // bootstrapped ciphertext round_to_closest_multiple_inplace( - accumulator, base_log, level_count); + accumulator_rotated, base_log, level_count); // Decompose the accumulator. Each block gets one level of the // decomposition, for the mask and the body (so block 0 will have the // accumulator decomposed at level 0, 1 at 1, etc.) - GadgetMatrix gadget_acc(base_log, level_count, accumulator); + GadgetMatrix gadget_acc(base_log, level_count, + accumulator_rotated); gadget_acc.decompose_and_compress_level(accumulator_fft, blockIdx.x); - - // We are using the same memory space for accumulator_fft and - // accumulator_rotated, so we need to synchronize here to make sure they - // don't modify the same memory space at the same time + NSMFFT_direct>(accumulator_fft); synchronize_threads_in_block(); // Perform G^-1(ACC) * GGSW -> GLWE - mul_ggsw_glwe( - accumulator, accumulator_fft, block_join_buffer, keybundle, - polynomial_size, glwe_dimension, level_count, i, cluster, support_dsm); - + mul_ggsw_glwe_in_fourier_domain( + accumulator_fft, block_join_buffer, keybundle, i, cluster, support_dsm); + NSMFFT_inverse>(accumulator_fft); synchronize_threads_in_block(); + + add_to_torus(accumulator_fft, accumulator_rotated, true); } - if (lwe_offset + lwe_chunk_size >= (lwe_dimension / grouping_factor)) { - auto block_lwe_array_out = - &lwe_array_out[lwe_output_indexes[blockIdx.z] * - (glwe_dimension * polynomial_size + 1) + - blockIdx.y * polynomial_size]; - - if (blockIdx.x == 0 && blockIdx.y < glwe_dimension) { - // Perform a sample extract. At this point, all blocks have the result, - // but we do the computation at block 0 to avoid waiting for extra blocks, - // in case they're not synchronized - sample_extract_mask(block_lwe_array_out, accumulator); - - if (lut_count > 1) { - for (int i = 1; i < lut_count; i++) { - auto next_lwe_array_out = - lwe_array_out + - (i * gridDim.z * (glwe_dimension * polynomial_size + 1)); - auto next_block_lwe_array_out = - &next_lwe_array_out[lwe_output_indexes[blockIdx.z] * - (glwe_dimension * polynomial_size + 1) + - blockIdx.y * polynomial_size]; - - sample_extract_mask(next_block_lwe_array_out, - accumulator, 1, i * lut_stride); + auto accumulator = accumulator_rotated; + + if (blockIdx.x == 0) { + if (lwe_offset + lwe_chunk_size >= (lwe_dimension / grouping_factor)) { + auto block_lwe_array_out = + &lwe_array_out[lwe_output_indexes[blockIdx.z] * + (glwe_dimension * polynomial_size + 1) + + blockIdx.y * polynomial_size]; + + if (blockIdx.y < glwe_dimension) { + // Perform a sample extract. At this point, all blocks have the result, + // but we do the computation at block 0 to avoid waiting for extra + // blocks, in case they're not synchronized + sample_extract_mask(block_lwe_array_out, accumulator); + + if (lut_count > 1) { + for (int i = 1; i < lut_count; i++) { + auto next_lwe_array_out = + lwe_array_out + + (i * gridDim.z * (glwe_dimension * polynomial_size + 1)); + auto next_block_lwe_array_out = + &next_lwe_array_out[lwe_output_indexes[blockIdx.z] * + (glwe_dimension * polynomial_size + 1) + + blockIdx.y * polynomial_size]; + + sample_extract_mask(next_block_lwe_array_out, + accumulator, 1, i * lut_stride); + } } - } - } else if (blockIdx.x == 0 && blockIdx.y == glwe_dimension) { - sample_extract_body(block_lwe_array_out, accumulator, 0); - if (lut_count > 1) { - for (int i = 1; i < lut_count; i++) { - - auto next_lwe_array_out = - lwe_array_out + - (i * gridDim.z * (glwe_dimension * polynomial_size + 1)); - auto next_block_lwe_array_out = - &next_lwe_array_out[lwe_output_indexes[blockIdx.z] * - (glwe_dimension * polynomial_size + 1) + - blockIdx.y * polynomial_size]; - - sample_extract_body(next_block_lwe_array_out, - accumulator, 0, i * lut_stride); + } else if (blockIdx.y == glwe_dimension) { + sample_extract_body(block_lwe_array_out, accumulator, 0); + if (lut_count > 1) { + for (int i = 1; i < lut_count; i++) { + + auto next_lwe_array_out = + lwe_array_out + + (i * gridDim.z * (glwe_dimension * polynomial_size + 1)); + auto next_block_lwe_array_out = + &next_lwe_array_out[lwe_output_indexes[blockIdx.z] * + (glwe_dimension * polynomial_size + 1) + + blockIdx.y * polynomial_size]; + + sample_extract_body(next_block_lwe_array_out, + accumulator, 0, i * lut_stride); + } } } + } else { + // Load the accumulator calculated in previous iterations + copy_polynomial( + accumulator, global_accumulator_slice); } - } else { - // Load the accumulator calculated in previous iterations - copy_polynomial( - accumulator, global_slice); } } @@ -326,13 +328,11 @@ __host__ void execute_tbc_external_product_loop( uint32_t chunk_size = std::min(lwe_chunk_size, (lwe_dimension / grouping_factor) - lwe_offset); - if (chunk_size == 0) - return; auto d_mem = buffer->d_mem_acc_tbc; auto keybundle_fft = buffer->keybundle_fft; auto global_accumulator = buffer->global_accumulator; - auto buffer_fft = buffer->global_accumulator_fft; + auto buffer_fft = buffer->global_join_buffer; dim3 grid_accumulate(level_count, glwe_dimension + 1, num_samples); dim3 thds(polynomial_size / params::opt, 1, 1); diff --git a/backends/tfhe-cuda-backend/cuda/src/polynomial/functions.cuh b/backends/tfhe-cuda-backend/cuda/src/polynomial/functions.cuh index f1da499b1c..6b19f08d76 100644 --- a/backends/tfhe-cuda-backend/cuda/src/polynomial/functions.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/polynomial/functions.cuh @@ -56,8 +56,8 @@ divide_by_monomial_negacyclic_inplace(T *accumulator, bool zeroAcc, uint32_t num_poly = 1) { constexpr int degree = block_size * elems_per_thread; for (int z = 0; z < num_poly; z++) { - T *accumulator_slice = (T *)accumulator + (ptrdiff_t)(z * degree); - const T *input_slice = (T *)input + (ptrdiff_t)(z * degree); + T *accumulator_slice = &accumulator[z * degree]; + const T *input_slice = &input[z * degree]; int tid = threadIdx.x; if (zeroAcc) { @@ -66,9 +66,8 @@ divide_by_monomial_negacyclic_inplace(T *accumulator, tid += block_size; } } else { - tid = threadIdx.x; - for (int i = 0; i < elems_per_thread; i++) { - if (j < degree) { + if (j < degree) { + for (int i = 0; i < elems_per_thread; i++) { // if (tid < degree - j) // accumulator_slice[tid] = input_slice[tid + j]; // else @@ -76,8 +75,11 @@ divide_by_monomial_negacyclic_inplace(T *accumulator, int x = tid + j - SEL(degree, 0, tid < degree - j); accumulator_slice[tid] = SEL(-1, 1, tid < degree - j) * input_slice[x]; - } else { - int32_t jj = j - degree; + tid += block_size; + } + } else { + int32_t jj = j - degree; + for (int i = 0; i < elems_per_thread; i++) { // if (tid < degree - jj) // accumulator_slice[tid] = -input_slice[tid + jj]; // else @@ -85,8 +87,8 @@ divide_by_monomial_negacyclic_inplace(T *accumulator, int x = tid + jj - SEL(degree, 0, tid < degree - jj); accumulator_slice[tid] = SEL(1, -1, tid < degree - jj) * input_slice[x]; + tid += block_size; } - tid += block_size; } } } @@ -160,9 +162,13 @@ __device__ void round_to_closest_multiple_inplace(T *rotated_acc, int base_log, } } +/** + * In case of classical PBS, this method should accumulate the result. + * In case of multi-bit PBS, it should overwrite. + */ template __device__ void add_to_torus(double2 *m_values, Torus *result, - bool init_torus = false) { + bool overwrite_result = false) { int tid = threadIdx.x; #pragma unroll for (int i = 0; i < params::opt / 2; i++) { @@ -175,7 +181,7 @@ __device__ void add_to_torus(double2 *m_values, Torus *result, Torus torus_imag = 0; typecast_double_round_to_torus(double_imag, torus_imag); - if (init_torus) { + if (overwrite_result) { result[tid] = torus_real; result[tid + params::degree / 2] = torus_imag; } else { diff --git a/backends/tfhe-cuda-backend/cuda/src/polynomial/polynomial_math.cuh b/backends/tfhe-cuda-backend/cuda/src/polynomial/polynomial_math.cuh index c054926f70..0f825dca1b 100644 --- a/backends/tfhe-cuda-backend/cuda/src/polynomial/polynomial_math.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/polynomial/polynomial_math.cuh @@ -3,6 +3,7 @@ #include "crypto/torus.cuh" #include "parameters.cuh" +#include "types/complex/operations.cuh" template __device__ T *get_chunk(T *data, int chunk_num, int chunk_size) { @@ -55,6 +56,27 @@ __device__ void polynomial_product_accumulate_in_fourier_domain( } } +// Computes result += x +// If init_accumulator is set, assumes that result was not initialized and does +// that with the outcome of first * second +template +__device__ void +polynomial_accumulate_in_fourier_domain(double2 *result, double2 *x, + bool init_accumulator = false) { + auto tid = threadIdx.x; + if (init_accumulator) { + for (int i = 0; i < params::opt / 2; i++) { + result[tid] = x[tid]; + tid += params::degree / params::opt; + } + } else { + for (int i = 0; i < params::opt / 2; i++) { + result[tid] += x[tid]; + tid += params::degree / params::opt; + } + } +} + // This method expects to work with polynomial_size / compression_params::opt // threads in the x-block If init_accumulator is set, assumes that result was // not initialized and does that with the outcome of first * second diff --git a/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/tests/test_classical_pbs.cpp b/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/tests/test_classical_pbs.cpp index 5ddb1430f2..11e4dd3122 100644 --- a/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/tests/test_classical_pbs.cpp +++ b/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/tests/test_classical_pbs.cpp @@ -233,147 +233,15 @@ ::testing::internal::ParamGenerator // n, k, N, lwe_variance, glwe_variance, pbs_base_log, pbs_level, // message_modulus, carry_modulus, number_of_inputs, repetitions, // samples - // BOOLEAN_DEFAULT_PARAMETERS + // PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64 (ClassicalProgrammableBootstrapTestParams){ - 777, 3, 512, new_gaussian_from_std_dev(sqrt(1.3880686109937e-11)), - new_gaussian_from_std_dev(sqrt(1.1919984450689246e-23)), 18, 1, 2, - 2, 2, 2, 40}, - // BOOLEAN_TFHE_LIB_PARAMETERS + 887, 1, 2048, new_t_uniform(46), new_t_uniform(17), 22, 1, 4, 4, + 100, 1, 1}, + // PARAM_MESSAGE_3_CARRY_3_KS_PBS_GAUSSIAN_2M64 (ClassicalProgrammableBootstrapTestParams){ - 830, 2, 1024, - new_gaussian_from_std_dev(sqrt(1.994564705573226e-12)), - new_gaussian_from_std_dev(sqrt(8.645717832544903e-32)), 23, 1, 2, 2, - 2, 2, 40}, - // SHORTINT_PARAM_MESSAGE_1_CARRY_0 - (ClassicalProgrammableBootstrapTestParams){ - 678, 5, 256, new_gaussian_from_std_dev(sqrt(5.203010004723453e-10)), - new_gaussian_from_std_dev(sqrt(1.3996292326131784e-19)), 15, 1, 2, - 1, 2, 2, 40}, - // SHORTINT_PARAM_MESSAGE_1_CARRY_1 - (ClassicalProgrammableBootstrapTestParams){ - 684, 3, 512, new_gaussian_from_std_dev(sqrt(4.177054989616946e-10)), - new_gaussian_from_std_dev(sqrt(1.1919984450689246e-23)), 18, 1, 2, - 2, 2, 2, 40}, - // SHORTINT_PARAM_MESSAGE_2_CARRY_0 - (ClassicalProgrammableBootstrapTestParams){ - 656, 2, 512, - new_gaussian_from_std_dev(sqrt(1.1641198952558192e-09)), - new_gaussian_from_std_dev(sqrt(1.6434266310406663e-15)), 8, 2, 4, 1, - 2, 2, 40}, - // SHORTINT_PARAM_MESSAGE_1_CARRY_2 - // SHORTINT_PARAM_MESSAGE_2_CARRY_1 - // SHORTINT_PARAM_MESSAGE_3_CARRY_0 - (ClassicalProgrammableBootstrapTestParams){ - 742, 2, 1024, - new_gaussian_from_std_dev(sqrt(4.998277131225527e-11)), - new_gaussian_from_std_dev(sqrt(8.645717832544903e-32)), 23, 1, 2, 4, - 2, 2, 40}, - // SHORTINT_PARAM_MESSAGE_1_CARRY_3 - // SHORTINT_PARAM_MESSAGE_2_CARRY_2 - // SHORTINT_PARAM_MESSAGE_3_CARRY_1 - // SHORTINT_PARAM_MESSAGE_4_CARRY_0 - (ClassicalProgrammableBootstrapTestParams){ - 745, 1, 2048, - new_gaussian_from_std_dev(sqrt(4.478453795193731e-11)), - new_gaussian_from_std_dev(sqrt(8.645717832544903e-32)), 23, 1, 2, 8, - 2, 2, 40}, - // SHORTINT_PARAM_MESSAGE_5_CARRY_0 - // SHORTINT_PARAM_MESSAGE_3_CARRY_2 - (ClassicalProgrammableBootstrapTestParams){ - 807, 1, 4096, - new_gaussian_from_std_dev(sqrt(4.629015039118823e-12)), - new_gaussian_from_std_dev(sqrt(4.70197740328915e-38)), 22, 1, 32, 1, - 2, 1, 40}, - // SHORTINT_PARAM_MESSAGE_6_CARRY_0 - (ClassicalProgrammableBootstrapTestParams){ - 915, 1, 8192, - new_gaussian_from_std_dev(sqrt(8.883173851180252e-14)), - new_gaussian_from_std_dev(sqrt(4.70197740328915e-38)), 22, 1, 64, 1, - 2, 1, 2}, - // SHORTINT_PARAM_MESSAGE_3_CARRY_3 - (ClassicalProgrammableBootstrapTestParams){ - 864, 1, 8192, - new_gaussian_from_std_dev(sqrt(1.5843564961097632e-15)), - new_gaussian_from_std_dev(sqrt(4.70197740328915e-38)), 15, 2, 8, 8, - 2, 1, 2}, - // SHORTINT_PARAM_MESSAGE_4_CARRY_3 - // SHORTINT_PARAM_MESSAGE_7_CARRY_0 - (ClassicalProgrammableBootstrapTestParams){ - 930, 1, 16384, - new_gaussian_from_std_dev(sqrt(5.129877458078009e-14)), - new_gaussian_from_std_dev(sqrt(4.70197740328915e-38)), 15, 2, 128, - 1, 2, 1, 1}, - - // BOOLEAN_DEFAULT_PARAMETERS - (ClassicalProgrammableBootstrapTestParams){ - 777, 3, 512, new_gaussian_from_std_dev(sqrt(1.3880686109937e-11)), - new_gaussian_from_std_dev(sqrt(1.1919984450689246e-23)), 18, 1, 2, - 2, 100, 2, 40}, - // BOOLEAN_TFHE_LIB_PARAMETERS - (ClassicalProgrammableBootstrapTestParams){ - 830, 2, 1024, - new_gaussian_from_std_dev(sqrt(1.994564705573226e-12)), - new_gaussian_from_std_dev(sqrt(8.645717832544903e-32)), 23, 1, 2, 2, - 100, 2, 40}, - // SHORTINT_PARAM_MESSAGE_1_CARRY_0 - (ClassicalProgrammableBootstrapTestParams){ - 678, 5, 256, new_gaussian_from_std_dev(sqrt(5.203010004723453e-10)), - new_gaussian_from_std_dev(sqrt(1.3996292326131784e-19)), 15, 1, 2, - 1, 100, 2, 40}, - // SHORTINT_PARAM_MESSAGE_1_CARRY_1 - (ClassicalProgrammableBootstrapTestParams){ - 684, 3, 512, new_gaussian_from_std_dev(sqrt(4.177054989616946e-10)), - new_gaussian_from_std_dev(sqrt(1.1919984450689246e-23)), 18, 1, 2, - 2, 100, 2, 40}, - // SHORTINT_PARAM_MESSAGE_2_CARRY_0 - (ClassicalProgrammableBootstrapTestParams){ - 656, 2, 512, - new_gaussian_from_std_dev(sqrt(1.1641198952558192e-09)), - new_gaussian_from_std_dev(sqrt(1.6434266310406663e-15)), 8, 2, 4, 1, - 100, 2, 40}, - // SHORTINT_PARAM_MESSAGE_1_CARRY_2 - // SHORTINT_PARAM_MESSAGE_2_CARRY_1 - // SHORTINT_PARAM_MESSAGE_3_CARRY_0 - (ClassicalProgrammableBootstrapTestParams){ - 742, 2, 1024, - new_gaussian_from_std_dev(sqrt(4.998277131225527e-11)), - new_gaussian_from_std_dev(sqrt(8.645717832544903e-32)), 23, 1, 2, 4, - 100, 2, 40}, - // SHORTINT_PARAM_MESSAGE_1_CARRY_3 - // SHORTINT_PARAM_MESSAGE_2_CARRY_2 - // SHORTINT_PARAM_MESSAGE_3_CARRY_1 - // SHORTINT_PARAM_MESSAGE_4_CARRY_0 - (ClassicalProgrammableBootstrapTestParams){ - 745, 1, 2048, - new_gaussian_from_std_dev(sqrt(4.478453795193731e-11)), - new_gaussian_from_std_dev(sqrt(8.645717832544903e-32)), 23, 1, 2, 8, - 100, 2, 40}, - // SHORTINT_PARAM_MESSAGE_5_CARRY_0 - // SHORTINT_PARAM_MESSAGE_3_CARRY_2 - (ClassicalProgrammableBootstrapTestParams){ - 807, 1, 4096, - new_gaussian_from_std_dev(sqrt(4.629015039118823e-12)), - new_gaussian_from_std_dev(sqrt(4.70197740328915e-38)), 22, 1, 32, 1, - 100, 1, 40}, - // SHORTINT_PARAM_MESSAGE_6_CARRY_0 - (ClassicalProgrammableBootstrapTestParams){ - 915, 1, 8192, - new_gaussian_from_std_dev(sqrt(8.883173851180252e-14)), - new_gaussian_from_std_dev(sqrt(4.70197740328915e-38)), 22, 1, 64, 1, - 100, 1, 2}, - // SHORTINT_PARAM_MESSAGE_3_CARRY_3 - (ClassicalProgrammableBootstrapTestParams){ - 864, 1, 8192, - new_gaussian_from_std_dev(sqrt(1.5843564961097632e-15)), - new_gaussian_from_std_dev(sqrt(4.70197740328915e-38)), 15, 2, 8, 8, - 100, 1, 2}, - // SHORTINT_PARAM_MESSAGE_4_CARRY_3 - // SHORTINT_PARAM_MESSAGE_7_CARRY_0 - (ClassicalProgrammableBootstrapTestParams){ - 930, 1, 16384, - new_gaussian_from_std_dev(sqrt(5.129877458078009e-14)), - new_gaussian_from_std_dev(sqrt(4.70197740328915e-38)), 15, 2, 128, - 1, 100, 1, 1}); + 977, 1, 8192, new_gaussian_from_std_dev(3.0144389706858286e-07), + new_gaussian_from_std_dev(2.168404344971009e-19), 16, 2, 8, 8, 100, + 1, 1}); std::string printParamName( ::testing::TestParamInfo p) { ClassicalProgrammableBootstrapTestParams params = p.param; diff --git a/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/tests/test_multibit_pbs.cpp b/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/tests/test_multibit_pbs.cpp index 1adfd773b0..eec69ddd38 100644 --- a/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/tests/test_multibit_pbs.cpp +++ b/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/tests/test_multibit_pbs.cpp @@ -171,70 +171,44 @@ TEST_P(MultiBitProgrammableBootstrapTestPrimitives_u64, } } +/** + int lwe_dimension; + int glwe_dimension; + int polynomial_size; + DynamicDistribution lwe_noise_distribution; + DynamicDistribution glwe_noise_distribution; + int pbs_base_log; + int pbs_level; + int message_modulus; + int carry_modulus; + int number_of_inputs; + int grouping_factor; + int repetitions; + int samples; + */ // Defines for which parameters set the PBS will be tested. // It executes each src for all pairs on phis X qs (Cartesian product) ::testing::internal::ParamGenerator multipbs_params_u64 = ::testing::Values( - // fast src - (MultiBitProgrammableBootstrapTestParams){ - 16, 1, 256, new_gaussian_from_std_dev(sqrt(1.3880686109937e-11)), - new_gaussian_from_std_dev(sqrt(1.1919984450689246e-23)), 23, 1, 2, - 2, 1, 2, 1, 10}, - (MultiBitProgrammableBootstrapTestParams){ - 16, 1, 256, new_gaussian_from_std_dev(sqrt(1.3880686109937e-11)), - new_gaussian_from_std_dev(sqrt(1.1919984450689246e-23)), 23, 1, 2, - 2, 128, 2, 1, 10}, - // 4_bits_multi_bit_group_2 - (MultiBitProgrammableBootstrapTestParams){ - 818, 1, 2048, new_gaussian_from_std_dev(sqrt(1.3880686109937e-11)), - new_gaussian_from_std_dev(sqrt(1.1919984450689246e-23)), 22, 1, 2, - 2, 1, 2, 1, 10}, - (MultiBitProgrammableBootstrapTestParams){ - 818, 1, 2048, new_gaussian_from_std_dev(sqrt(1.3880686109937e-15)), - new_gaussian_from_std_dev(sqrt(1.1919984450689246e-24)), 22, 1, 2, - 2, 128, 2, 1, 10}, - // 4_bits_multi_bit_group_3 - (MultiBitProgrammableBootstrapTestParams){ - 888, 1, 2048, - new_gaussian_from_std_dev(sqrt(4.9571231961752025e-12)), - new_gaussian_from_std_dev(sqrt(9.9409770026944e-32)), 21, 1, 2, 2, - 1, 3, 1, 10}, - (MultiBitProgrammableBootstrapTestParams){ - 888, 1, 16384, - new_gaussian_from_std_dev(sqrt(4.9571231961752025e-12)), - new_gaussian_from_std_dev(sqrt(9.9409770026944e-32)), 21, 1, 2, 2, - 1, 3, 1, 1}, - - (MultiBitProgrammableBootstrapTestParams){ - 888, 1, 1024, - new_gaussian_from_std_dev(sqrt(4.9571231961752025e-12)), - new_gaussian_from_std_dev(sqrt(9.9409770026944e-32)), 21, 1, 2, 2, - 128, 3, 1, 10}, - (MultiBitProgrammableBootstrapTestParams){ - 888, 1, 2048, - new_gaussian_from_std_dev(sqrt(4.9571231961752025e-12)), - new_gaussian_from_std_dev(sqrt(9.9409770026944e-32)), 21, 1, 2, 2, - 128, 3, 1, 10}, + // PARAM_GPU_MULTI_BIT_GROUP_3_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64 (MultiBitProgrammableBootstrapTestParams){ - 888, 1, 4096, - new_gaussian_from_std_dev(sqrt(4.9571231961752025e-12)), - new_gaussian_from_std_dev(sqrt(9.9409770026944e-32)), 21, 1, 2, 2, - 128, 3, 1, 10}, + 882, 1, 2048, new_t_uniform(46), new_t_uniform(17), 14, 2, 8, 8, + 100, 3, 1, 1}, + // PARAM_GPU_MULTI_BIT_GROUP_3_MESSAGE_3_CARRY_3_KS_PBS_GAUSSIAN_2M64 (MultiBitProgrammableBootstrapTestParams){ - 888, 1, 8192, - new_gaussian_from_std_dev(sqrt(4.9571231961752025e-12)), - new_gaussian_from_std_dev(sqrt(9.9409770026944e-32)), 21, 1, 2, 2, - 128, 3, 1, 1}, + 978, 1, 8192, new_gaussian_from_std_dev((2.962875621642539e-07)), + new_gaussian_from_std_dev((2.168404344971009e-19)), 14, 2, 8, 8, + 100, 3, 1, 1}, + // PARAM_GPU_MULTI_BIT_GROUP_2_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M64 (MultiBitProgrammableBootstrapTestParams){ - 888, 1, 16384, - new_gaussian_from_std_dev(sqrt(4.9571231961752025e-12)), - new_gaussian_from_std_dev(sqrt(9.9409770026944e-32)), 21, 1, 2, 2, - 128, 3, 1, 1}, + 836, 1, 2048, new_gaussian_from_std_dev((3.433444883863949e-06)), + new_gaussian_from_std_dev((2.845267479601915e-15)), 22, 2, 4, 4, + 100, 2, 1, 1}, + // PARAM_GPU_MULTI_BIT_GROUP_2_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M64 (MultiBitProgrammableBootstrapTestParams){ - 972, 1, 8192, - new_gaussian_from_std_dev(sqrt(4.9571231961752025e-12)), - new_gaussian_from_std_dev(sqrt(9.9409770026944e-32)), 14, 2, 8, 8, - 68, 3, 1, 1}); + 978, 1, 8192, new_gaussian_from_std_dev((2.962875621642539e-07)), + new_gaussian_from_std_dev((2.168404344971009e-19)), 14, 2, 8, 8, + 100, 2, 1, 1}); std::string printParamName( ::testing::TestParamInfo p) {