diff --git a/backends/tfhe-cuda-backend/cuda/include/integer/integer.h b/backends/tfhe-cuda-backend/cuda/include/integer/integer.h index a9990423fe..8f3a1ce07d 100644 --- a/backends/tfhe-cuda-backend/cuda/include/integer/integer.h +++ b/backends/tfhe-cuda-backend/cuda/include/integer/integer.h @@ -297,6 +297,47 @@ void cleanup_cuda_propagate_single_carry(void *const *streams, uint32_t gpu_count, int8_t **mem_ptr_void); +void scratch_cuda_fast_propagate_single_carry_kb_64_inplace( + void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count, + int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size, + uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level, + uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log, + uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus, + uint32_t carry_modulus, PBS_TYPE pbs_type, uint32_t requested_flag, + uint32_t uses_carry, bool allocate_gpu_memory); + +void cuda_fast_propagate_single_carry_kb_64_inplace( + void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count, + void *lwe_array, void *carry_out, const void *carry_in, int8_t *mem_ptr, + void *const *bsks, void *const *ksks, uint32_t num_blocks, + uint32_t requested_flag, uint32_t uses_carry); + +void cleanup_cuda_fast_propagate_single_carry(void *const *streams, + uint32_t const *gpu_indexes, + uint32_t gpu_count, + int8_t **mem_ptr_void); + +void scratch_cuda_integer_overflowing_sub_kb_64_inplace( + void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count, + int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size, + uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level, + uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log, + uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus, + uint32_t carry_modulus, PBS_TYPE pbs_type, uint32_t compute_overflow, + bool allocate_gpu_memory); + +void cuda_integer_overflowing_sub_kb_64_inplace( + void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count, + void *lhs_array, const void *rhs_array, void *overflow_block, + const void *input_borrow, int8_t *mem_ptr, void *const *bsks, + void *const *ksks, uint32_t num_blocks, uint32_t compute_overflow, + uint32_t uses_input_borrow); + +void cleanup_cuda_integer_overflowing_sub(void *const *streams, + uint32_t const *gpu_indexes, + uint32_t gpu_count, + int8_t **mem_ptr_void); + void scratch_cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64( void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size, diff --git a/backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h b/backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h index 6dc85d4225..2823bd1179 100644 --- a/backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h +++ b/backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h @@ -53,6 +53,12 @@ void generate_device_accumulator(cudaStream_t stream, uint32_t gpu_index, uint32_t carry_modulus, std::function f); +template +void generate_many_lut_device_accumulator( + cudaStream_t stream, uint32_t gpu_index, Torus *acc, + uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t message_modulus, + uint32_t carry_modulus, std::vector> &f); + struct int_radix_params { PBS_TYPE pbs_type; uint32_t glwe_dimension; @@ -1116,6 +1122,1267 @@ template struct int_sum_ciphertexts_vec_memory { delete scp_mem; } }; +// For 
sequential algorithm in group propagation +template struct int_seq_group_prop_memory { + + Torus *group_resolved_carries; + int_radix_lut *lut_sequential_algorithm; + uint32_t grouping_size; + + int_seq_group_prop_memory(cudaStream_t const *streams, + uint32_t const *gpu_indexes, uint32_t gpu_count, + int_radix_params params, uint32_t group_size, + uint32_t big_lwe_size_bytes, + bool allocate_gpu_memory) { + + auto glwe_dimension = params.glwe_dimension; + auto polynomial_size = params.polynomial_size; + auto message_modulus = params.message_modulus; + auto carry_modulus = params.carry_modulus; + + grouping_size = group_size; + group_resolved_carries = (Torus *)cuda_malloc_async( + (grouping_size)*big_lwe_size_bytes, streams[0], gpu_indexes[0]); + cuda_memset_async(group_resolved_carries, 0, + (grouping_size)*big_lwe_size_bytes, streams[0], + gpu_indexes[0]); + + int num_seq_luts = grouping_size - 1; + Torus *h_seq_lut_indexes = (Torus *)malloc(num_seq_luts * sizeof(Torus)); + lut_sequential_algorithm = new int_radix_lut( + streams, gpu_indexes, gpu_count, params, num_seq_luts, num_seq_luts, + allocate_gpu_memory); + for (int index = 0; index < num_seq_luts; index++) { + auto f_lut_sequential = [index](Torus propa_cum_sum_block) { + return (propa_cum_sum_block >> (index + 1)) & 1; + }; + auto seq_lut = lut_sequential_algorithm->get_lut(gpu_indexes[0], index); + generate_device_accumulator( + streams[0], gpu_indexes[0], seq_lut, glwe_dimension, polynomial_size, + message_modulus, carry_modulus, f_lut_sequential); + h_seq_lut_indexes[index] = index; + } + Torus *seq_lut_indexes = + lut_sequential_algorithm->get_lut_indexes(gpu_indexes[0], 0); + cuda_memcpy_async_to_gpu(seq_lut_indexes, h_seq_lut_indexes, + num_seq_luts * sizeof(Torus), streams[0], + gpu_indexes[0]); + + lut_sequential_algorithm->broadcast_lut(streams, gpu_indexes, + gpu_indexes[0]); + free(h_seq_lut_indexes); + }; + void release(cudaStream_t const *streams, uint32_t const *gpu_indexes, + uint32_t gpu_count) { + cuda_drop_async(group_resolved_carries, streams[0], gpu_indexes[0]); + lut_sequential_algorithm->release(streams, gpu_indexes, gpu_count); + delete lut_sequential_algorithm; + }; +}; + +// For hillis steele algorithm in group propagation +template struct int_hs_group_prop_memory { + + int_radix_lut *lut_hillis_steele; + uint32_t grouping_size; + + int_hs_group_prop_memory(cudaStream_t const *streams, + uint32_t const *gpu_indexes, uint32_t gpu_count, + int_radix_params params, uint32_t num_groups, + uint32_t big_lwe_size_bytes, + bool allocate_gpu_memory) { + + auto glwe_dimension = params.glwe_dimension; + auto polynomial_size = params.polynomial_size; + auto message_modulus = params.message_modulus; + auto carry_modulus = params.carry_modulus; + + auto f_lut_hillis_steele = [](Torus msb, Torus lsb) -> Torus { + if (msb == 2) { + return 1; // Remap Generate to 1 + } else if (msb == 3) { + // MSB propagates + if (lsb == 2) { + return 1; + } else { + return lsb; + } // also remap here + } else { + return msb; + } + }; + + lut_hillis_steele = + new int_radix_lut(streams, gpu_indexes, gpu_count, params, 1, + num_groups, allocate_gpu_memory); + + auto hillis_steele_lut = lut_hillis_steele->get_lut(gpu_indexes[0], 0); + generate_device_accumulator_bivariate( + streams[0], gpu_indexes[0], hillis_steele_lut, glwe_dimension, + polynomial_size, message_modulus, carry_modulus, f_lut_hillis_steele); + + lut_hillis_steele->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]); + }; + void release(cudaStream_t const *streams, 
uint32_t const *gpu_indexes, + uint32_t gpu_count) { + + lut_hillis_steele->release(streams, gpu_indexes, gpu_count); + delete lut_hillis_steele; + } +}; + +// compute_shifted_blocks_and_block_states +template struct int_shifted_blocks_and_states_memory { + Torus *shifted_blocks_and_states; + Torus *shifted_blocks; + Torus *block_states; + + int_radix_lut *luts_array_first_step; + + int_shifted_blocks_and_states_memory( + cudaStream_t const *streams, uint32_t const *gpu_indexes, + uint32_t gpu_count, int_radix_params params, uint32_t num_radix_blocks, + uint32_t lut_count, uint32_t grouping_size, uint32_t requested_flag, + bool allocate_gpu_memory) { + + auto glwe_dimension = params.glwe_dimension; + auto polynomial_size = params.polynomial_size; + auto message_modulus = params.message_modulus; + auto carry_modulus = params.carry_modulus; + auto big_lwe_size = (polynomial_size * glwe_dimension + 1); + auto big_lwe_size_bytes = big_lwe_size * sizeof(Torus); + + shifted_blocks_and_states = (Torus *)cuda_malloc_async( + lut_count * num_radix_blocks * big_lwe_size_bytes, streams[0], + gpu_indexes[0]); + cuda_memset_async(shifted_blocks_and_states, 0, + lut_count * num_radix_blocks * big_lwe_size_bytes, + streams[0], gpu_indexes[0]); + shifted_blocks = (Torus *)cuda_malloc_async( + num_radix_blocks * big_lwe_size_bytes, streams[0], gpu_indexes[0]); + cuda_memset_async(shifted_blocks, 0, num_radix_blocks * big_lwe_size_bytes, + streams[0], gpu_indexes[0]); + block_states = (Torus *)cuda_malloc_async( + num_radix_blocks * big_lwe_size_bytes, streams[0], gpu_indexes[0]); + cuda_memset_async(block_states, 0, num_radix_blocks * big_lwe_size_bytes, + streams[0], gpu_indexes[0]); + + uint32_t num_luts_first_step = + requested_flag == 1 ? (2 * grouping_size + 2) : 2 * grouping_size + 1; + + luts_array_first_step = new int_radix_lut( + streams, gpu_indexes, gpu_count, params, num_luts_first_step, + num_radix_blocks, allocate_gpu_memory); + + auto f_shift_block = [message_modulus](Torus block) -> Torus { + return (block % message_modulus) << 1; + }; + + auto f_first_block_state = [message_modulus](Torus block) -> Torus { + if (block >= message_modulus) + return OUTPUT_CARRY::GENERATED; + else { + return OUTPUT_CARRY::NONE; + } + }; + std::vector> f_first_grouping_luts = { + f_first_block_state, f_shift_block}; + + auto first_block_lut = luts_array_first_step->get_lut(gpu_indexes[0], 0); + + generate_many_lut_device_accumulator( + streams[0], gpu_indexes[0], first_block_lut, glwe_dimension, + polynomial_size, message_modulus, carry_modulus, f_first_grouping_luts); + + // luts for other blocks of the first grouping + for (int lut_id = 1; lut_id < grouping_size; lut_id++) { + auto f_state = [message_modulus, lut_id](Torus block) -> Torus { + uint64_t r = 0; + if (block >= message_modulus) { + r = 2; // Generates Carry + } else if (block == (message_modulus - 1)) { + r = 1; // Propagates a carry + } else { + r = 0; // Does not generate carry + } + return r << (lut_id - 1); + }; + std::vector> f_grouping_luts = { + f_state, f_shift_block}; + auto lut = luts_array_first_step->get_lut(gpu_indexes[0], lut_id); + generate_many_lut_device_accumulator( + streams[0], gpu_indexes[0], lut, glwe_dimension, polynomial_size, + message_modulus, carry_modulus, f_grouping_luts); + } + + // luts for the rest of groupings (except for the last block) + for (int i = 0; i < grouping_size; i++) { + uint32_t lut_id = i + grouping_size; + auto f_state = [message_modulus, i](Torus block) -> Torus { + uint64_t r = 0; + if (block >= 
message_modulus) { + r = 2; // Generates Carry + } else if (block == (message_modulus - 1)) { + r = 1; // Propagates a carry + } else { + r = 0; // Does not generate a carry + } + return r << i; + }; + std::vector> f_grouping_luts = { + f_state, f_shift_block}; + + auto lut = luts_array_first_step->get_lut(gpu_indexes[0], lut_id); + + generate_many_lut_device_accumulator( + streams[0], gpu_indexes[0], lut, glwe_dimension, polynomial_size, + message_modulus, carry_modulus, f_grouping_luts); + } + + // For the last block we need to generate a new lut + auto f_last_block_state = [message_modulus](Torus block) -> Torus { + if (block >= message_modulus) + return 2 << 1; // Generates + else + return 0; // Nothing + }; + + uint32_t lut_id = + requested_flag == 1 + ? num_luts_first_step - 2 + : num_luts_first_step - 1; // The last lut of the first step + + auto last_block_lut = + luts_array_first_step->get_lut(gpu_indexes[0], lut_id); + + if (requested_flag == 1) { // If overflow + auto f_overflow_shift_block = [message_modulus](Torus block) -> Torus { + Torus lhs = block / message_modulus; + Torus rhs = block % message_modulus; + return ((lhs + rhs) % message_modulus) << 1; + }; + std::vector> f_last_grouping_luts = { + f_last_block_state, f_overflow_shift_block}; + generate_many_lut_device_accumulator( + streams[0], gpu_indexes[0], last_block_lut, glwe_dimension, + polynomial_size, message_modulus, carry_modulus, + f_last_grouping_luts); + + // We need an extra lut to handle the case with more than 1 radix block + // The first of the many luts will be discarded, so its content does not matter + lut_id = lut_id + 1; + auto extra_block_lut = + luts_array_first_step->get_lut(gpu_indexes[0], lut_id); + + std::vector> f_last_grouping_luts_overflow = { + f_overflow_shift_block, f_overflow_shift_block}; + generate_many_lut_device_accumulator( + streams[0], gpu_indexes[0], extra_block_lut, glwe_dimension, + polynomial_size, message_modulus, carry_modulus, + f_last_grouping_luts_overflow); + + } else { + std::vector> f_last_grouping_luts = { + f_last_block_state, f_shift_block}; + + generate_many_lut_device_accumulator( + streams[0], gpu_indexes[0], last_block_lut, glwe_dimension, + polynomial_size, message_modulus, carry_modulus, + f_last_grouping_luts); + } + // Generate the indexes to switch between luts within the pbs + Torus lut_indexes_size = num_radix_blocks * sizeof(Torus); + Torus *h_lut_indexes = (Torus *)malloc(lut_indexes_size); + + for (int index = 0; index < num_radix_blocks; index++) { + uint32_t grouping_index = index / grouping_size; + bool is_in_first_grouping = (grouping_index == 0); + uint32_t index_in_grouping = index % grouping_size; + bool is_last_index = (index == (num_radix_blocks - 1)); + if (is_last_index) { + if (num_radix_blocks == 1) { + h_lut_indexes[index] = 2 * grouping_size; + } else { + if (requested_flag == 1) { + h_lut_indexes[index] = 2 * grouping_size + 1; + } else { + h_lut_indexes[index] = 2; + } + } + } else if (is_in_first_grouping) { + h_lut_indexes[index] = index_in_grouping; + } else { + h_lut_indexes[index] = index_in_grouping + grouping_size; + } + } + + // copy the indexes to the gpu + Torus *lut_indexes = + luts_array_first_step->get_lut_indexes(gpu_indexes[0], 0); + cuda_memcpy_async_to_gpu(lut_indexes, h_lut_indexes, lut_indexes_size, + streams[0], gpu_indexes[0]); + // Do I need to do something else for the multi-gpu?
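+ // Lut index layout used above: blocks of the first grouping map to luts
+ // [0, grouping_size), blocks of later groupings map to [grouping_size,
+ // 2 * grouping_size), and the last radix block is special-cased as selected above.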
+ + luts_array_first_step->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]); + + free(h_lut_indexes); + }; + void release(cudaStream_t const *streams, uint32_t const *gpu_indexes, + uint32_t gpu_count) { + + cuda_drop_async(shifted_blocks_and_states, streams[0], gpu_indexes[0]); + cuda_drop_async(shifted_blocks, streams[0], gpu_indexes[0]); + cuda_drop_async(block_states, streams[0], gpu_indexes[0]); + + luts_array_first_step->release(streams, gpu_indexes, gpu_count); + delete luts_array_first_step; + }; +}; + +// compute_propagation simulator and group carries +template struct int_prop_simu_group_carries_memory { + Torus *scalar_array_cum_sum; + Torus *propagation_cum_sums; + Torus *simulators; + Torus *grouping_pgns; + Torus *prepared_blocks; + + Torus *resolved_carries; + + int_radix_lut *luts_array_second_step; + + int_seq_group_prop_memory *seq_group_prop_mem; + int_hs_group_prop_memory *hs_group_prop_mem; + + uint32_t group_size; + bool use_sequential_algorithm_to_resolver_group_carries; + + int_prop_simu_group_carries_memory( + cudaStream_t const *streams, uint32_t const *gpu_indexes, + uint32_t gpu_count, int_radix_params params, uint32_t num_radix_blocks, + uint32_t grouping_size, uint32_t num_groups, bool allocate_gpu_memory) { + + auto glwe_dimension = params.glwe_dimension; + auto polynomial_size = params.polynomial_size; + auto message_modulus = params.message_modulus; + auto carry_modulus = params.carry_modulus; + auto big_lwe_size = (polynomial_size * glwe_dimension + 1); + auto big_lwe_size_bytes = big_lwe_size * sizeof(Torus); + + uint32_t block_modulus = message_modulus * carry_modulus; + uint32_t num_bits_in_block = std::log2(block_modulus); + + group_size = grouping_size; + + scalar_array_cum_sum = (Torus *)cuda_malloc_async( + num_radix_blocks * sizeof(Torus), streams[0], gpu_indexes[0]); + cuda_memset_async(scalar_array_cum_sum, 0, num_radix_blocks * sizeof(Torus), + streams[0], gpu_indexes[0]); + propagation_cum_sums = (Torus *)cuda_malloc_async( + num_radix_blocks * big_lwe_size_bytes, streams[0], gpu_indexes[0]); + cuda_memset_async(propagation_cum_sums, 0, + num_radix_blocks * big_lwe_size_bytes, streams[0], + gpu_indexes[0]); + simulators = (Torus *)cuda_malloc_async( + num_radix_blocks * big_lwe_size_bytes, streams[0], gpu_indexes[0]); + cuda_memset_async(simulators, 0, num_radix_blocks * big_lwe_size_bytes, + streams[0], gpu_indexes[0]); + + grouping_pgns = (Torus *)cuda_malloc_async(num_groups * big_lwe_size_bytes, + streams[0], gpu_indexes[0]); + cuda_memset_async(grouping_pgns, 0, num_groups * big_lwe_size_bytes, + streams[0], gpu_indexes[0]); + + prepared_blocks = (Torus *)cuda_malloc_async( + num_radix_blocks * big_lwe_size_bytes, streams[0], gpu_indexes[0]); + cuda_memset_async(prepared_blocks, 0, num_radix_blocks * big_lwe_size_bytes, + streams[0], gpu_indexes[0]); + + resolved_carries = (Torus *)cuda_malloc_async( + (num_groups + 1) * big_lwe_size_bytes, streams[0], gpu_indexes[0]); + cuda_memset_async(resolved_carries, 0, + (num_groups + 1) * big_lwe_size_bytes, streams[0], + gpu_indexes[0]); + + // create lut objects for step 2 + Torus lut_indexes_size = num_radix_blocks * sizeof(Torus); + uint32_t num_carry_to_resolve = num_groups - 1; + uint32_t saturated_sub = + ((num_carry_to_resolve > 1) ? 
num_carry_to_resolve - 1 : 0); + uint32_t sequential_depth = saturated_sub / (grouping_size - 1); + uint32_t hillis_steel_depth; + + if (num_carry_to_resolve == 0) { + hillis_steel_depth = 0; + } else { + hillis_steel_depth = std::ceil(std::log2(num_carry_to_resolve)); + } + + use_sequential_algorithm_to_resolver_group_carries = + sequential_depth <= hillis_steel_depth; + uint32_t num_extra_luts = 0; + if (use_sequential_algorithm_to_resolver_group_carries) { + num_extra_luts = (grouping_size - 1); + } else { + num_extra_luts = 1; + } + + uint32_t num_luts_second_step = 2 * grouping_size + num_extra_luts; + luts_array_second_step = new int_radix_lut( + streams, gpu_indexes, gpu_count, params, num_luts_second_step, + num_radix_blocks, allocate_gpu_memory); + + // luts for first group inner propagation + for (int lut_id = 0; lut_id < grouping_size - 1; lut_id++) { + auto f_first_grouping_inner_propagation = + [lut_id](Torus propa_cum_sum_block) -> Torus { + uint64_t carry = (propa_cum_sum_block >> lut_id) & 1; + + if (carry != 0) { + return 2ull; // Generates Carry + } else { + return 0ull; // Does not generate carry + } + }; + + auto lut = luts_array_second_step->get_lut(gpu_indexes[0], lut_id); + generate_device_accumulator( + streams[0], gpu_indexes[0], lut, glwe_dimension, polynomial_size, + message_modulus, carry_modulus, f_first_grouping_inner_propagation); + } + + auto f_first_grouping_outer_propagation = + [num_bits_in_block](Torus block) -> Torus { + return (block >> (num_bits_in_block - 1)) & 1; + }; + + int lut_id = grouping_size - 1; + auto lut_first_group_outer = + luts_array_second_step->get_lut(gpu_indexes[0], lut_id); + generate_device_accumulator( + streams[0], gpu_indexes[0], lut_first_group_outer, glwe_dimension, + polynomial_size, message_modulus, carry_modulus, + f_first_grouping_outer_propagation); + + // for other groupings inner propagation + for (int index = 0; index < grouping_size; index++) { + uint32_t lut_id = index + grouping_size; + + auto f_other_groupings_inner_propagation = + [index](Torus propa_cum_sum_block) -> Torus { + uint64_t mask = (2 << index) - 1; + if (propa_cum_sum_block >= (2 << index)) { + return 2ull; // Generates + } else if ((propa_cum_sum_block & mask) == mask) { + return 1ull; // Propagate + } else { + return 0ull; // Nothing + } + }; + + auto lut = luts_array_second_step->get_lut(gpu_indexes[0], lut_id); + generate_device_accumulator( + streams[0], gpu_indexes[0], lut, glwe_dimension, polynomial_size, + message_modulus, carry_modulus, f_other_groupings_inner_propagation); + } + + if (use_sequential_algorithm_to_resolver_group_carries) { + for (int index = 0; index < grouping_size - 1; index++) { + uint32_t lut_id = index + 2 * grouping_size; + + auto f_group_propagation = [index, block_modulus, + num_bits_in_block](Torus block) -> Torus { + if (block == (block_modulus - 1)) { + return 0ull; + } else { + return ((UINT64_MAX << index) % (1ull << (num_bits_in_block + 1))); + } + }; + + auto lut = luts_array_second_step->get_lut(gpu_indexes[0], lut_id); + generate_device_accumulator( + streams[0], gpu_indexes[0], lut, glwe_dimension, polynomial_size, + message_modulus, carry_modulus, f_group_propagation); + } + } else { + uint32_t lut_id = 2 * grouping_size; + auto f_group_propagation = [block_modulus](Torus block) { + if (block == (block_modulus - 1)) { + return 2ull; + } else { + return UINT64_MAX % (block_modulus * 2ull); + } + }; + + auto lut = luts_array_second_step->get_lut(gpu_indexes[0], lut_id); + generate_device_accumulator( + 
streams[0], gpu_indexes[0], lut, glwe_dimension, polynomial_size, + message_modulus, carry_modulus, f_group_propagation); + } + + Torus *h_second_lut_indexes = (Torus *)malloc(lut_indexes_size); + + Torus *h_scalar_array_cum_sum = + (Torus *)malloc(num_radix_blocks * sizeof(Torus)); + + for (int index = 0; index < num_radix_blocks; index++) { + uint32_t grouping_index = index / grouping_size; + bool is_in_first_grouping = (grouping_index == 0); + uint32_t index_in_grouping = index % grouping_size; + + if (is_in_first_grouping) { + h_second_lut_indexes[index] = index_in_grouping; + } else if (index_in_grouping == (grouping_size - 1)) { + if (use_sequential_algorithm_to_resolver_group_carries) { + int inner_index = (grouping_index - 1) % (grouping_size - 1); + h_second_lut_indexes[index] = inner_index + 2 * grouping_size; + } else { + h_second_lut_indexes[index] = 2 * grouping_size; + } + } else { + h_second_lut_indexes[index] = index_in_grouping + grouping_size; + } + + bool may_have_its_padding_bit_set = + !is_in_first_grouping && (index_in_grouping == grouping_size - 1); + + if (may_have_its_padding_bit_set) { + if (use_sequential_algorithm_to_resolver_group_carries) { + h_scalar_array_cum_sum[index] = + 1 << ((grouping_index - 1) % (grouping_size - 1)); + } else { + h_scalar_array_cum_sum[index] = 1; + } + } else { + h_scalar_array_cum_sum[index] = 0; + } + } + + // copy the indexes to the gpu + Torus *second_lut_indexes = + luts_array_second_step->get_lut_indexes(gpu_indexes[0], 0); + cuda_memcpy_async_to_gpu(second_lut_indexes, h_second_lut_indexes, + lut_indexes_size, streams[0], gpu_indexes[0]); + + cuda_memcpy_async_to_gpu(scalar_array_cum_sum, h_scalar_array_cum_sum, + num_radix_blocks * sizeof(Torus), streams[0], + gpu_indexes[0]); + luts_array_second_step->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]); + + if (use_sequential_algorithm_to_resolver_group_carries) { + + seq_group_prop_mem = new int_seq_group_prop_memory( + streams, gpu_indexes, gpu_count, params, grouping_size, + big_lwe_size_bytes, true); + + } else { + hs_group_prop_mem = new int_hs_group_prop_memory( + streams, gpu_indexes, gpu_count, params, num_groups, + big_lwe_size_bytes, true); + } + + free(h_scalar_array_cum_sum); + free(h_second_lut_indexes); + }; + + // needed for the divison to update the lut indexes + void update_lut_indexes(cudaStream_t const *streams, + uint32_t const *gpu_indexes, Torus *new_lut_indexes, + Torus *new_scalars, uint32_t new_num_blocks) { + Torus *lut_indexes = + luts_array_second_step->get_lut_indexes(gpu_indexes[0], 0); + cuda_memcpy_async_gpu_to_gpu(lut_indexes, new_lut_indexes, + new_num_blocks * sizeof(Torus), streams[0], + gpu_indexes[0]); + + luts_array_second_step->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]); + + cuda_memcpy_async_gpu_to_gpu(scalar_array_cum_sum, new_scalars, + new_num_blocks * sizeof(Torus), streams[0], + gpu_indexes[0]); + } + + void release(cudaStream_t const *streams, uint32_t const *gpu_indexes, + uint32_t gpu_count) { + cuda_drop_async(scalar_array_cum_sum, streams[0], gpu_indexes[0]); + cuda_drop_async(propagation_cum_sums, streams[0], gpu_indexes[0]); + cuda_drop_async(simulators, streams[0], gpu_indexes[0]); + cuda_drop_async(grouping_pgns, streams[0], gpu_indexes[0]); + cuda_drop_async(prepared_blocks, streams[0], gpu_indexes[0]); + cuda_drop_async(resolved_carries, streams[0], gpu_indexes[0]); + + luts_array_second_step->release(streams, gpu_indexes, gpu_count); + + if (use_sequential_algorithm_to_resolver_group_carries) { + 
seq_group_prop_mem->release(streams, gpu_indexes, gpu_count); + } else { + hs_group_prop_mem->release(streams, gpu_indexes, gpu_count); + } + + delete luts_array_second_step; + }; +}; + +template struct int_fast_sc_prop_memory { + uint32_t lut_count; + uint32_t lut_stride; + + uint32_t group_size; + uint32_t num_groups; + Torus *output_flag; + + int_radix_lut *lut_message_extract; + + int_radix_lut *lut_overflow_flag_prep; + int_radix_lut *lut_overflow_flag_last; + int_radix_lut *lut_carry_flag_last; + + int_shifted_blocks_and_states_memory *shifted_blocks_state_mem; + int_prop_simu_group_carries_memory *prop_simu_group_carries_mem; + + int_radix_params params; + bool use_sequential_algorithm_to_resolver_group_carries; + uint32_t requested_flag; + + uint32_t active_gpu_count; + cudaStream_t *sub_streams_1; + cudaStream_t *sub_streams_2; + + int_fast_sc_prop_memory(cudaStream_t const *streams, + uint32_t const *gpu_indexes, uint32_t gpu_count, + int_radix_params params, uint32_t num_radix_blocks, + uint32_t requested_flag_in, uint32_t uses_carry, + bool allocate_gpu_memory) { + this->params = params; + auto glwe_dimension = params.glwe_dimension; + auto polynomial_size = params.polynomial_size; + auto message_modulus = params.message_modulus; + auto carry_modulus = params.carry_modulus; + auto big_lwe_size = (polynomial_size * glwe_dimension + 1); + auto big_lwe_size_bytes = big_lwe_size * sizeof(Torus); + requested_flag = requested_flag_in; + // for compute shifted blocks and block states + uint32_t block_modulus = message_modulus * carry_modulus; + uint32_t num_bits_in_block = std::log2(block_modulus); + uint32_t grouping_size = num_bits_in_block; + group_size = grouping_size; + num_groups = (num_radix_blocks + grouping_size - 1) / grouping_size; + + lut_count = 2; // number of luts applied by each many-lut PBS + uint32_t box_size = polynomial_size / block_modulus; + lut_stride = (block_modulus / lut_count) * box_size; + + shifted_blocks_state_mem = new int_shifted_blocks_and_states_memory( + streams, gpu_indexes, gpu_count, params, num_radix_blocks, lut_count, + grouping_size, requested_flag, true); + + prop_simu_group_carries_mem = new int_prop_simu_group_carries_memory( + streams, gpu_indexes, gpu_count, params, num_radix_blocks, + grouping_size, num_groups, true); + + // Step 3 elements + lut_message_extract = + new int_radix_lut(streams, gpu_indexes, gpu_count, params, 1, + num_radix_blocks, allocate_gpu_memory); + // lut to extract the message from the shifted blocks + auto f_message_extract = [message_modulus](Torus block) -> Torus { + return (block >> 1) % message_modulus; + }; + + auto extract_lut = lut_message_extract->get_lut(gpu_indexes[0], 0); + + generate_device_accumulator( + streams[0], gpu_indexes[0], extract_lut, glwe_dimension, + polynomial_size, message_modulus, carry_modulus, f_message_extract); + + lut_message_extract->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]); + + // This stores a single block that will be used to hold the overflow or + // carry result + output_flag = (Torus *)cuda_malloc_async(big_lwe_size_bytes, streams[0], + gpu_indexes[0]); + cuda_memset_async(output_flag, 0, big_lwe_size_bytes, streams[0], + gpu_indexes[0]); + if (requested_flag == 1) { + // For step 1, overflow detection is enabled only if the overflow flag is requested + uint32_t num_bits_in_message = std::log2(message_modulus); + lut_overflow_flag_prep = new int_radix_lut( + streams, gpu_indexes, gpu_count, params, 1, 1, allocate_gpu_memory); + + auto f_overflow_fp = [num_bits_in_message, + message_modulus](Torus packed_lhsrhs)
-> Torus { + Torus lhs = packed_lhsrhs / message_modulus; + Torus rhs = packed_lhsrhs % message_modulus; + + Torus mask = (1 << (num_bits_in_message - 1)) - 1; + Torus lhs_except_last_bit = lhs & mask; + Torus rhs_except_last_bit = rhs & mask; + Torus input_carry1 = 1; + Torus input_carry2 = 0; + + Torus output_carry1 = + ((lhs + rhs + input_carry1) >> num_bits_in_message) & 1; + Torus output_carry2 = + ((lhs + rhs + input_carry2) >> num_bits_in_message) & 1; + Torus input_carry_last_bit1 = + ((lhs_except_last_bit + rhs_except_last_bit + input_carry1) >> + (num_bits_in_message - 1)) & + 1; + Torus input_carry_last_bit2 = + ((lhs_except_last_bit + rhs_except_last_bit + input_carry2) >> + (num_bits_in_message - 1)) & + 1; + + Torus output1 = (Torus)(input_carry_last_bit1 != output_carry1); + Torus output2 = (Torus)(input_carry_last_bit2 != output_carry2); + + return output1 << 3 | output2 << 2; + }; + + auto overflow_flag_prep_lut = + lut_overflow_flag_prep->get_lut(gpu_indexes[0], 0); + + generate_device_accumulator( + streams[0], gpu_indexes[0], overflow_flag_prep_lut, glwe_dimension, + polynomial_size, message_modulus, carry_modulus, f_overflow_fp); + + lut_overflow_flag_prep->broadcast_lut(streams, gpu_indexes, + gpu_indexes[0]); + } + + // For the final cleanup in case of overflow or carry. + // This lut could probably be applied together with the previous one, but for + // now we keep them separate + if (requested_flag == 1) { // Overflow case + lut_overflow_flag_last = new int_radix_lut( + streams, gpu_indexes, gpu_count, params, 1, 1, allocate_gpu_memory); + + auto f_overflow_last = [](Torus block) -> Torus { + Torus input_carry = (block >> 1) & 1; + Torus does_overflow_if_carry_is_1 = (block >> 3) & 1; + Torus does_overflow_if_carry_is_0 = (block >> 2) & 1; + if (input_carry == 1) { + return does_overflow_if_carry_is_1; + } else { + return does_overflow_if_carry_is_0; + } + }; + auto overflow_flag_last = + lut_overflow_flag_last->get_lut(gpu_indexes[0], 0); + + generate_device_accumulator( + streams[0], gpu_indexes[0], overflow_flag_last, glwe_dimension, + polynomial_size, message_modulus, carry_modulus, f_overflow_last); + + lut_overflow_flag_last->broadcast_lut(streams, gpu_indexes, + gpu_indexes[0]); + } + if (requested_flag == 2) { // Carry case + lut_carry_flag_last = new int_radix_lut( + streams, gpu_indexes, gpu_count, params, 1, 1, allocate_gpu_memory); + + auto f_carry_last = [](Torus block) -> Torus { + return ((block >> 2) & 1); + }; + auto carry_flag_last = lut_carry_flag_last->get_lut(gpu_indexes[0], 0); + + generate_device_accumulator( + streams[0], gpu_indexes[0], carry_flag_last, glwe_dimension, + polynomial_size, message_modulus, carry_modulus, f_carry_last); + + lut_carry_flag_last->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]); + } + + active_gpu_count = get_active_gpu_count(2 * num_radix_blocks, gpu_count); + sub_streams_1 = + (cudaStream_t *)malloc(active_gpu_count * sizeof(cudaStream_t)); + sub_streams_2 = + (cudaStream_t *)malloc(active_gpu_count * sizeof(cudaStream_t)); + for (uint j = 0; j < active_gpu_count; j++) { + sub_streams_1[j] = cuda_create_stream(gpu_indexes[j]); + sub_streams_2[j] = cuda_create_stream(gpu_indexes[j]); + } + }; + + void release(cudaStream_t const *streams, uint32_t const *gpu_indexes, + uint32_t gpu_count) { + + shifted_blocks_state_mem->release(streams, gpu_indexes, gpu_count); + prop_simu_group_carries_mem->release(streams, gpu_indexes, gpu_count); + cuda_drop_async(output_flag, streams[0], gpu_indexes[0]); +
lut_message_extract->release(streams, gpu_indexes, gpu_count); + delete lut_message_extract; + + if (requested_flag == 1) { // In case of overflow + lut_overflow_flag_prep->release(streams, gpu_indexes, gpu_count); + lut_overflow_flag_last->release(streams, gpu_indexes, gpu_count); + delete lut_overflow_flag_prep; + delete lut_overflow_flag_last; + } + if (requested_flag == 2) { // In case of carry + lut_carry_flag_last->release(streams, gpu_indexes, gpu_count); + delete lut_carry_flag_last; + } + + // release sub streams + for (uint i = 0; i < active_gpu_count; i++) { + cuda_destroy_stream(sub_streams_1[i], gpu_indexes[i]); + cuda_destroy_stream(sub_streams_2[i], gpu_indexes[i]); + } + free(sub_streams_1); + free(sub_streams_2); + }; +}; + +template struct int_shifted_blocks_and_borrow_states_memory { + Torus *shifted_blocks_and_borrow_states; + Torus *shifted_blocks; + Torus *borrow_states; + + int_radix_lut *luts_array_first_step; + + int_shifted_blocks_and_borrow_states_memory( + cudaStream_t const *streams, uint32_t const *gpu_indexes, + uint32_t gpu_count, int_radix_params params, uint32_t num_radix_blocks, + uint32_t lut_count, uint32_t grouping_size, bool allocate_gpu_memory) { + + auto glwe_dimension = params.glwe_dimension; + auto polynomial_size = params.polynomial_size; + auto message_modulus = params.message_modulus; + auto carry_modulus = params.carry_modulus; + auto big_lwe_size = (polynomial_size * glwe_dimension + 1); + auto big_lwe_size_bytes = big_lwe_size * sizeof(Torus); + + shifted_blocks_and_borrow_states = (Torus *)cuda_malloc_async( + lut_count * num_radix_blocks * big_lwe_size_bytes, streams[0], + gpu_indexes[0]); + cuda_memset_async(shifted_blocks_and_borrow_states, 0, + lut_count * num_radix_blocks * big_lwe_size_bytes, + streams[0], gpu_indexes[0]); + shifted_blocks = (Torus *)cuda_malloc_async( + num_radix_blocks * big_lwe_size_bytes, streams[0], gpu_indexes[0]); + cuda_memset_async(shifted_blocks, 0, num_radix_blocks * big_lwe_size_bytes, + streams[0], gpu_indexes[0]); + borrow_states = (Torus *)cuda_malloc_async( + num_radix_blocks * big_lwe_size_bytes, streams[0], gpu_indexes[0]); + cuda_memset_async(borrow_states, 0, num_radix_blocks * big_lwe_size_bytes, + streams[0], gpu_indexes[0]); + + uint32_t num_luts_first_step = 2 * grouping_size + 1; + + luts_array_first_step = new int_radix_lut( + streams, gpu_indexes, gpu_count, params, num_luts_first_step, + num_radix_blocks, allocate_gpu_memory); + + auto f_shift_block = [message_modulus](Torus block) -> Torus { + uint64_t overflow_guard = message_modulus; + uint64_t block_mod = block % message_modulus; + return (overflow_guard | block_mod) << 1; + }; + + auto f_first_block_state = [message_modulus](Torus block) -> Torus { + if (block < message_modulus) + return 1; // Borrows + else { + return 0; // Nothing + } + }; + std::vector> f_first_grouping_luts = { + f_first_block_state, f_shift_block}; + + auto first_block_lut = luts_array_first_step->get_lut(gpu_indexes[0], 0); + + generate_many_lut_device_accumulator( + streams[0], gpu_indexes[0], first_block_lut, glwe_dimension, + polynomial_size, message_modulus, carry_modulus, f_first_grouping_luts); + + // luts for other blocks of the first grouping + for (int lut_id = 1; lut_id < grouping_size; lut_id++) { + auto f_state = [message_modulus, lut_id](Torus block) -> Torus { + uint64_t r = 0; + if (block < message_modulus) { + r = 2; // Borrows + } else if (block == message_modulus) { + r = 1; // Propagates a borrow + } else { + r = 0; // Does not borrow + } + 
return r << (lut_id - 1); + }; + std::vector> f_grouping_luts = { + f_state, f_shift_block}; + auto lut = luts_array_first_step->get_lut(gpu_indexes[0], lut_id); + generate_many_lut_device_accumulator( + streams[0], gpu_indexes[0], lut, glwe_dimension, polynomial_size, + message_modulus, carry_modulus, f_grouping_luts); + } + + // luts for the rest of groupings (except for the last block) + for (int i = 0; i < grouping_size; i++) { + uint32_t lut_id = i + grouping_size; + auto f_state = [message_modulus, i](Torus block) -> Torus { + uint64_t r = 0; + if (block < message_modulus) { + r = 2; // Generates borrow + } else if (block == message_modulus) { + r = 1; // Propagates a borrow + } else { + r = 0; // Does not borrow + } + return r << i; + }; + std::vector> f_grouping_luts = { + f_state, f_shift_block}; + + auto lut = luts_array_first_step->get_lut(gpu_indexes[0], lut_id); + + generate_many_lut_device_accumulator( + streams[0], gpu_indexes[0], lut, glwe_dimension, polynomial_size, + message_modulus, carry_modulus, f_grouping_luts); + } + + auto f_last_block_state = [message_modulus](Torus block) -> Torus { + if (block < message_modulus) + return 2 << 1; // Generates a borrow + else + return 0; // Nothing + }; + + uint32_t lut_id = num_luts_first_step - 1; // The last lut of the first step + + auto last_block_lut = + luts_array_first_step->get_lut(gpu_indexes[0], lut_id); + + std::vector> f_last_grouping_luts = { + f_last_block_state, f_shift_block}; + + generate_many_lut_device_accumulator( + streams[0], gpu_indexes[0], last_block_lut, glwe_dimension, + polynomial_size, message_modulus, carry_modulus, f_last_grouping_luts); + + // Generate the indexes to switch between luts within the pbs + Torus lut_indexes_size = num_radix_blocks * sizeof(Torus); + Torus *h_lut_indexes = (Torus *)malloc(lut_indexes_size); + + for (int index = 0; index < num_radix_blocks; index++) { + uint32_t grouping_index = index / grouping_size; + bool is_in_first_grouping = (grouping_index == 0); + uint32_t index_in_grouping = index % grouping_size; + bool is_last_index = (index == (num_radix_blocks - 1)); + if (is_last_index) { + if (num_radix_blocks == 1) { + h_lut_indexes[index] = 2 * grouping_size; + } else { + h_lut_indexes[index] = 2; + } + } else if (is_in_first_grouping) { + h_lut_indexes[index] = index_in_grouping; + } else { + h_lut_indexes[index] = index_in_grouping + grouping_size; + } + } + // copy the indexes to the gpu + Torus *lut_indexes = + luts_array_first_step->get_lut_indexes(gpu_indexes[0], 0); + cuda_memcpy_async_to_gpu(lut_indexes, h_lut_indexes, lut_indexes_size, + streams[0], gpu_indexes[0]); + // Do I need to do something else for the multi-gpu? 
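+ // When used inside the division, these indexes are later overwritten via
+ // update_lut_indexes with the per-block-count tables prepared by
+ // create_indexes_for_division.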
+ + luts_array_first_step->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]); + + free(h_lut_indexes); + }; + + // needed for the divison to update the lut indexes + void update_lut_indexes(cudaStream_t const *streams, + uint32_t const *gpu_indexes, Torus *new_lut_indexes, + uint32_t new_num_blocks) { + Torus *lut_indexes = + luts_array_first_step->get_lut_indexes(gpu_indexes[0], 0); + cuda_memcpy_async_gpu_to_gpu(lut_indexes, new_lut_indexes, + new_num_blocks * sizeof(Torus), streams[0], + gpu_indexes[0]); + luts_array_first_step->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]); + } + void release(cudaStream_t const *streams, uint32_t const *gpu_indexes, + uint32_t gpu_count) { + + cuda_drop_async(shifted_blocks_and_borrow_states, streams[0], + gpu_indexes[0]); + cuda_drop_async(shifted_blocks, streams[0], gpu_indexes[0]); + cuda_drop_async(borrow_states, streams[0], gpu_indexes[0]); + + luts_array_first_step->release(streams, gpu_indexes, gpu_count); + delete luts_array_first_step; + }; +}; + +template struct int_fast_borrow_prop_memory { + uint32_t lut_count; + uint32_t lut_stride; + + uint32_t group_size; + uint32_t num_groups; + Torus *overflow_block; + + int_radix_lut *lut_message_extract; + int_radix_lut *lut_borrow_flag; + + int_shifted_blocks_and_borrow_states_memory + *shifted_blocks_borrow_state_mem; + int_prop_simu_group_carries_memory *prop_simu_group_carries_mem; + + int_radix_params params; + + Torus **first_indexes_for_div; + Torus **second_indexes_for_div; + Torus **scalars_for_div; + uint32_t active_gpu_count; + cudaStream_t *sub_streams_1; + cudaStream_t *sub_streams_2; + + cudaEvent_t *incoming_events; + cudaEvent_t *outgoing_events1; + cudaEvent_t *outgoing_events2; + + bool is_in_div = false; + uint32_t max_indexes_to_erase = 0; + + uint32_t compute_overflow; + int_fast_borrow_prop_memory(cudaStream_t const *streams, + uint32_t const *gpu_indexes, uint32_t gpu_count, + int_radix_params params, + uint32_t num_radix_blocks, + uint32_t compute_overflow_in, + bool allocate_gpu_memory) { + this->params = params; + auto glwe_dimension = params.glwe_dimension; + auto polynomial_size = params.polynomial_size; + auto message_modulus = params.message_modulus; + auto carry_modulus = params.carry_modulus; + auto big_lwe_size = (polynomial_size * glwe_dimension + 1); + auto big_lwe_size_bytes = big_lwe_size * sizeof(Torus); + compute_overflow = compute_overflow_in; + // for compute shifted blocks and block states + uint32_t block_modulus = message_modulus * carry_modulus; + uint32_t num_bits_in_block = std::log2(block_modulus); + uint32_t grouping_size = num_bits_in_block; + group_size = grouping_size; + num_groups = (num_radix_blocks + grouping_size - 1) / grouping_size; + + lut_count = 2; // many luts apply 2 luts + uint32_t box_size = polynomial_size / block_modulus; + lut_stride = (block_modulus / lut_count) * box_size; + + shifted_blocks_borrow_state_mem = + new int_shifted_blocks_and_borrow_states_memory( + streams, gpu_indexes, gpu_count, params, num_radix_blocks, + lut_count, grouping_size, true); + + prop_simu_group_carries_mem = new int_prop_simu_group_carries_memory( + streams, gpu_indexes, gpu_count, params, num_radix_blocks, + grouping_size, num_groups, true); + + overflow_block = (Torus *)cuda_malloc_async(big_lwe_size_bytes, streams[0], + gpu_indexes[0]); + cuda_memset_async(overflow_block, 0, big_lwe_size_bytes, streams[0], + gpu_indexes[0]); + + lut_message_extract = + new int_radix_lut(streams, gpu_indexes, gpu_count, params, 1, + num_radix_blocks, 
allocate_gpu_memory); + // lut for the first block in the first grouping + auto f_message_extract = [message_modulus](Torus block) -> Torus { + return (block >> 1) % message_modulus; + }; + + auto extract_lut = lut_message_extract->get_lut(gpu_indexes[0], 0); + + generate_device_accumulator( + streams[0], gpu_indexes[0], extract_lut, glwe_dimension, + polynomial_size, message_modulus, carry_modulus, f_message_extract); + + lut_message_extract->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]); + + if (compute_overflow) { + lut_borrow_flag = + new int_radix_lut(streams, gpu_indexes, gpu_count, params, 1, + num_radix_blocks, allocate_gpu_memory); + // lut for the first block in the first grouping + auto f_borrow_flag = [](Torus block) -> Torus { + return ((block >> 2) & 1); + }; + + auto borrow_flag_lut = lut_borrow_flag->get_lut(gpu_indexes[0], 0); + + generate_device_accumulator( + streams[0], gpu_indexes[0], borrow_flag_lut, glwe_dimension, + polynomial_size, message_modulus, carry_modulus, f_borrow_flag); + + lut_borrow_flag->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]); + } + + active_gpu_count = get_active_gpu_count(2 * num_radix_blocks, gpu_count); + sub_streams_1 = + (cudaStream_t *)malloc(active_gpu_count * sizeof(cudaStream_t)); + sub_streams_2 = + (cudaStream_t *)malloc(active_gpu_count * sizeof(cudaStream_t)); + for (uint j = 0; j < active_gpu_count; j++) { + sub_streams_1[j] = cuda_create_stream(gpu_indexes[j]); + sub_streams_2[j] = cuda_create_stream(gpu_indexes[j]); + } + + incoming_events = + (cudaEvent_t *)malloc(active_gpu_count * sizeof(cudaEvent_t)); + outgoing_events1 = + (cudaEvent_t *)malloc(active_gpu_count * sizeof(cudaEvent_t)); + outgoing_events2 = + (cudaEvent_t *)malloc(active_gpu_count * sizeof(cudaEvent_t)); + for (uint j = 0; j < active_gpu_count; j++) { + cudaEventCreate(&incoming_events[j]); + cudaEventCreate(&outgoing_events1[j]); + cudaEventCreate(&outgoing_events2[j]); + } + }; + + void create_indexes_for_division(cudaStream_t const *streams, + uint32_t const *gpu_indexes, + uint32_t num_blocks) { + + is_in_div = true; + max_indexes_to_erase = num_blocks; + bool use_seq = prop_simu_group_carries_mem + ->use_sequential_algorithm_to_resolver_group_carries; + first_indexes_for_div = (Torus **)malloc(num_blocks * sizeof(Torus *)); + second_indexes_for_div = (Torus **)malloc(num_blocks * sizeof(Torus *)); + scalars_for_div = (Torus **)malloc(num_blocks * sizeof(Torus *)); + + Torus *h_lut_indexes = (Torus *)malloc(num_blocks * sizeof(Torus)); + Torus *h_scalar = (Torus *)malloc(num_blocks * sizeof(Torus)); + + // Extra indexes for the luts in first step + for (int nb = 1; nb <= num_blocks; nb++) { + cudaMalloc((void **)&first_indexes_for_div[nb - 1], nb * sizeof(Torus)); + for (int index = 0; index < nb; index++) { + uint32_t grouping_index = index / group_size; + bool is_in_first_grouping = (grouping_index == 0); + uint32_t index_in_grouping = index % group_size; + bool is_last_index = (index == (nb - 1)); + if (is_last_index) { + if (nb == 1) { + h_lut_indexes[index] = 2 * group_size; + } else { + h_lut_indexes[index] = 2; + } + } else if (is_in_first_grouping) { + h_lut_indexes[index] = index_in_grouping; + } else { + h_lut_indexes[index] = index_in_grouping + group_size; + } + } + cuda_memcpy_async_to_gpu(first_indexes_for_div[nb - 1], h_lut_indexes, + nb * sizeof(Torus), streams[0], gpu_indexes[0]); + } + // Extra indexes for the luts in second step + for (int nb = 1; nb <= num_blocks; nb++) { + cudaMalloc((void **)&second_indexes_for_div[nb - 
1], nb * sizeof(Torus)); + cudaMalloc((void **)&scalars_for_div[nb - 1], nb * sizeof(Torus)); + + for (int index = 0; index < nb; index++) { + uint32_t grouping_index = index / group_size; + bool is_in_first_grouping = (grouping_index == 0); + uint32_t index_in_grouping = index % group_size; + + if (is_in_first_grouping) { + h_lut_indexes[index] = index_in_grouping; + } else if (index_in_grouping == (group_size - 1)) { + if (use_seq) { + int inner_index = (grouping_index - 1) % (group_size - 1); + h_lut_indexes[index] = inner_index + 2 * group_size; + } else { + h_lut_indexes[index] = 2 * group_size; + } + } else { + h_lut_indexes[index] = index_in_grouping + group_size; + } + + bool may_have_its_padding_bit_set = + !is_in_first_grouping && (index_in_grouping == group_size - 1); + + if (may_have_its_padding_bit_set) { + if (use_seq) { + h_scalar[index] = 1 << ((grouping_index - 1) % (group_size - 1)); + } else { + h_scalar[index] = 1; + } + } else { + h_scalar[index] = 0; + } + } + cuda_memcpy_async_to_gpu(second_indexes_for_div[nb - 1], h_lut_indexes, + nb * sizeof(Torus), streams[0], gpu_indexes[0]); + cuda_memcpy_async_to_gpu(scalars_for_div[nb - 1], h_scalar, + nb * sizeof(Torus), streams[0], gpu_indexes[0]); + } + free(h_lut_indexes); + free(h_scalar); + }; + + // needed for the divison to update the lut indexes + void update_lut_indexes(cudaStream_t const *streams, + uint32_t const *gpu_indexes, + uint32_t new_num_blocks) { + assert(max_indexes_to_erase >= new_num_blocks); + shifted_blocks_borrow_state_mem->update_lut_indexes( + streams, gpu_indexes, first_indexes_for_div[new_num_blocks - 1], + new_num_blocks); + prop_simu_group_carries_mem->update_lut_indexes( + streams, gpu_indexes, second_indexes_for_div[new_num_blocks - 1], + scalars_for_div[new_num_blocks - 1], new_num_blocks); + } + void release(cudaStream_t const *streams, uint32_t const *gpu_indexes, + uint32_t gpu_count) { + + shifted_blocks_borrow_state_mem->release(streams, gpu_indexes, gpu_count); + prop_simu_group_carries_mem->release(streams, gpu_indexes, gpu_count); + cuda_drop_async(overflow_block, streams[0], gpu_indexes[0]); + + lut_message_extract->release(streams, gpu_indexes, gpu_count); + delete lut_message_extract; + if (compute_overflow) { + lut_borrow_flag->release(streams, gpu_indexes, gpu_count); + delete lut_borrow_flag; + } + + if (is_in_div) { + for (int i = 0; i < max_indexes_to_erase; i++) { + cuda_drop_async(first_indexes_for_div[i], streams[0], gpu_indexes[0]); + cuda_drop_async(second_indexes_for_div[i], streams[0], gpu_indexes[0]); + cuda_drop_async(scalars_for_div[i], streams[0], gpu_indexes[0]); + } + free(first_indexes_for_div); + free(second_indexes_for_div); + free(scalars_for_div); + } + // release sub streams + for (uint i = 0; i < active_gpu_count; i++) { + cuda_destroy_stream(sub_streams_1[i], gpu_indexes[i]); + cuda_destroy_stream(sub_streams_2[i], gpu_indexes[i]); + } + free(sub_streams_1); + free(sub_streams_2); + + // release events + for (uint j = 0; j < active_gpu_count; j++) { + cudaEventDestroy(incoming_events[j]); + cudaEventDestroy(outgoing_events1[j]); + cudaEventDestroy(outgoing_events2[j]); + } + free(incoming_events); + free(outgoing_events1); + free(outgoing_events2); + }; +}; template struct int_mul_memory { Torus *vector_result_sb; @@ -1124,6 +2391,7 @@ template struct int_mul_memory { int_radix_lut *luts_array; // lsb msb int_sum_ciphertexts_vec_memory *sum_ciphertexts_mem; + int_fast_sc_prop_memory *fast_sc_prop_mem; int_radix_params params; @@ -1199,6 +2467,11 @@ 
template struct int_mul_memory { streams, gpu_indexes, gpu_count, params, num_radix_blocks, 2 * num_radix_blocks, block_mul_res, vector_result_sb, small_lwe_vector); + uint32_t uses_carry = 0; + uint32_t requested_flag = 0; + fast_sc_prop_mem = new int_fast_sc_prop_memory( + streams, gpu_indexes, gpu_count, params, num_radix_blocks, + requested_flag, uses_carry, allocate_gpu_memory); } void release(cudaStream_t const *streams, uint32_t const *gpu_indexes, @@ -1209,9 +2482,11 @@ template struct int_mul_memory { luts_array->release(streams, gpu_indexes, gpu_count); sum_ciphertexts_mem->release(streams, gpu_indexes, gpu_count); + fast_sc_prop_mem->release(streams, gpu_indexes, gpu_count); delete luts_array; delete sum_ciphertexts_mem; + delete fast_sc_prop_mem; } }; @@ -2271,7 +3546,7 @@ template struct int_div_rem_memory { // memory objects for other operations int_logical_scalar_shift_buffer *shift_mem_1; int_logical_scalar_shift_buffer *shift_mem_2; - int_overflowing_sub_memory *overflow_sub_mem; + int_fast_borrow_prop_memory *overflow_sub_mem; int_comparison_buffer *comparison_buffer; // lookup tables @@ -2515,8 +3790,12 @@ template struct int_div_rem_memory { streams, gpu_indexes, gpu_count, SHIFT_OR_ROTATE_TYPE::LEFT_SHIFT, params, 2 * num_blocks, true); - overflow_sub_mem = new int_overflowing_sub_memory( - streams, gpu_indexes, gpu_count, params, num_blocks, true); + uint32_t compute_overflow = 1; + overflow_sub_mem = new int_fast_borrow_prop_memory( + streams, gpu_indexes, gpu_count, params, num_blocks, compute_overflow, + true); + overflow_sub_mem->create_indexes_for_division(streams, gpu_indexes, + num_blocks); comparison_buffer = new int_comparison_buffer( streams, gpu_indexes, gpu_count, COMPARISON_TYPE::NE, params, @@ -2963,6 +4242,7 @@ template struct int_scalar_mul_buffer { int_sum_ciphertexts_vec_memory *sum_ciphertexts_vec_mem; Torus *preshifted_buffer; Torus *all_shifted_buffer; + int_fast_sc_prop_memory *fast_sc_prop_mem; int_scalar_mul_buffer(cudaStream_t const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count, @@ -3001,13 +4281,20 @@ template struct int_scalar_mul_buffer { sum_ciphertexts_vec_mem = new int_sum_ciphertexts_vec_memory( streams, gpu_indexes, gpu_count, params, num_radix_blocks, num_ciphertext_bits, allocate_gpu_memory); + uint32_t uses_carry = 0; + uint32_t requested_flag = 0; + fast_sc_prop_mem = new int_fast_sc_prop_memory( + streams, gpu_indexes, gpu_count, params, num_radix_blocks, + requested_flag, uses_carry, allocate_gpu_memory); } } void release(cudaStream_t const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count) { sum_ciphertexts_vec_mem->release(streams, gpu_indexes, gpu_count); + fast_sc_prop_mem->release(streams, gpu_indexes, gpu_count); delete sum_ciphertexts_vec_mem; + delete fast_sc_prop_mem; cuda_drop_async(all_shifted_buffer, streams[0], gpu_indexes[0]); } }; diff --git a/backends/tfhe-cuda-backend/cuda/include/linear_algebra.h b/backends/tfhe-cuda-backend/cuda/include/linear_algebra.h index 10c476c12b..133103432c 100644 --- a/backends/tfhe-cuda-backend/cuda/include/linear_algebra.h +++ b/backends/tfhe-cuda-backend/cuda/include/linear_algebra.h @@ -27,6 +27,12 @@ void cuda_add_lwe_ciphertext_vector_64(void *stream, uint32_t gpu_index, void const *lwe_array_in_2, uint32_t input_lwe_dimension, uint32_t input_lwe_ciphertext_count); +void cuda_add_lwe_ciphertext_vector_64_with_packing( + void *stream, uint32_t gpu_index, void *lwe_array_out, + void const *lwe_array_in_1, void const *lwe_array_in_2, + uint32_t 
input_lwe_dimension, uint32_t input_lwe_ciphertext_count, + uint32_t message_modulus); + void cuda_add_lwe_ciphertext_vector_plaintext_vector_32( void *stream, uint32_t gpu_index, void *lwe_array_out, void const *lwe_array_in, void const *plaintext_array_in, diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/div_rem.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/div_rem.cuh index 4bd9933f2c..51ddd567a9 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/div_rem.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/integer/div_rem.cuh @@ -425,11 +425,16 @@ __host__ void host_integer_div_rem_kb(cudaStream_t const *streams, auto do_overflowing_sub = [&](cudaStream_t const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count) { - host_integer_overflowing_sub_kb( + uint32_t compute_borrow = 1; + uint32_t uses_input_borrow = 0; + mem_ptr->overflow_sub_mem->update_lut_indexes( + streams, gpu_indexes, merged_interesting_remainder.len); + host_integer_overflowing_sub( streams, gpu_indexes, gpu_count, new_remainder.data, - subtraction_overflowed.data, merged_interesting_remainder.data, - interesting_divisor.data, bsks, ksks, mem_ptr->overflow_sub_mem, - merged_interesting_remainder.len); + (uint64_t *)merged_interesting_remainder.data, + interesting_divisor.data, subtraction_overflowed.data, + (const Torus *)nullptr, mem_ptr->overflow_sub_mem, bsks, ksks, + merged_interesting_remainder.len, compute_borrow, uses_input_borrow); }; // fills: diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cu b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cu index 53b1366c37..8e13980c51 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cu +++ b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cu @@ -1,4 +1,5 @@ #include "integer/integer.cuh" +#include "integer/negation.cuh" #include void cuda_full_propagation_64_inplace(void *const *streams, @@ -62,6 +63,46 @@ void scratch_cuda_propagate_single_carry_kb_64_inplace( allocate_gpu_memory); } +void scratch_cuda_fast_propagate_single_carry_kb_64_inplace( + void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count, + int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size, + uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level, + uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log, + uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus, + uint32_t carry_modulus, PBS_TYPE pbs_type, uint32_t requested_flag, + uint32_t uses_carry, bool allocate_gpu_memory) { + + int_radix_params params(pbs_type, glwe_dimension, polynomial_size, + big_lwe_dimension, small_lwe_dimension, ks_level, + ks_base_log, pbs_level, pbs_base_log, grouping_factor, + message_modulus, carry_modulus); + + scratch_cuda_fast_propagate_single_carry_kb_inplace( + (cudaStream_t *)(streams), gpu_indexes, gpu_count, + (int_fast_sc_prop_memory **)mem_ptr, num_blocks, params, + requested_flag, uses_carry, allocate_gpu_memory); +} + +void scratch_cuda_integer_overflowing_sub_kb_64_inplace( + void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count, + int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size, + uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level, + uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log, + uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus, + uint32_t carry_modulus, PBS_TYPE pbs_type, uint32_t compute_overflow, + bool allocate_gpu_memory) { + + int_radix_params params(pbs_type, 
glwe_dimension, polynomial_size, + big_lwe_dimension, small_lwe_dimension, ks_level, + ks_base_log, pbs_level, pbs_base_log, grouping_factor, + message_modulus, carry_modulus); + + scratch_cuda_integer_overflowing_sub( + (cudaStream_t *)(streams), gpu_indexes, gpu_count, + (int_fast_borrow_prop_memory **)mem_ptr, num_blocks, params, + compute_overflow, allocate_gpu_memory); +} + void cuda_propagate_single_carry_kb_64_inplace( void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count, void *lwe_array, void *carry_out, int8_t *mem_ptr, void *const *bsks, @@ -73,6 +114,37 @@ void cuda_propagate_single_carry_kb_64_inplace( (uint64_t **)(ksks), num_blocks); } +void cuda_fast_propagate_single_carry_kb_64_inplace( + void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count, + void *lwe_array, void *carry_out, const void *carry_in, int8_t *mem_ptr, + void *const *bsks, void *const *ksks, uint32_t num_blocks, + uint32_t requested_flag, uint32_t uses_carry) { + + host_fast_propagate_single_carry( + (cudaStream_t *)(streams), gpu_indexes, gpu_count, + static_cast(lwe_array), static_cast(carry_out), + static_cast(carry_in), + (int_fast_sc_prop_memory *)mem_ptr, bsks, (uint64_t **)(ksks), + num_blocks, requested_flag, uses_carry); +} + +void cuda_integer_overflowing_sub_kb_64_inplace( + void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count, + void *lhs_array, const void *rhs_array, void *overflow_block, + const void *input_borrow, int8_t *mem_ptr, void *const *bsks, + void *const *ksks, uint32_t num_blocks, uint32_t compute_overflow, + uint32_t uses_input_borrow) { + + host_integer_overflowing_sub( + (cudaStream_t const *)streams, gpu_indexes, gpu_count, + static_cast(lhs_array), static_cast(lhs_array), + static_cast(rhs_array), + static_cast(overflow_block), + static_cast(input_borrow), + (int_fast_borrow_prop_memory *)mem_ptr, bsks, (uint64_t **)ksks, + num_blocks, compute_overflow, uses_input_borrow); +} + void cuda_propagate_single_carry_get_input_carries_kb_64_inplace( void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count, void *lwe_array, void *carry_out, void *input_carries, int8_t *mem_ptr, @@ -94,6 +166,24 @@ void cleanup_cuda_propagate_single_carry(void *const *streams, mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count); } +void cleanup_cuda_fast_propagate_single_carry(void *const *streams, + uint32_t const *gpu_indexes, + uint32_t gpu_count, + int8_t **mem_ptr_void) { + int_fast_sc_prop_memory *mem_ptr = + (int_fast_sc_prop_memory *)(*mem_ptr_void); + mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count); +} + +void cleanup_cuda_integer_overflowing_sub(void *const *streams, + uint32_t const *gpu_indexes, + uint32_t gpu_count, + int8_t **mem_ptr_void) { + int_fast_borrow_prop_memory *mem_ptr = + (int_fast_borrow_prop_memory *)(*mem_ptr_void); + mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count); +} + void scratch_cuda_apply_univariate_lut_kb_64( void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr, void const *input_lut, uint32_t lwe_dimension, diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh index 8560b94c8d..80c18dce38 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh @@ -8,6 +8,7 @@ #include "integer/scalar_addition.cuh" #include "linear_algebra.h" #include "linearalgebra/addition.cuh" +#include 
"linearalgebra/negation.cuh" #include "pbs/programmable_bootstrap.h" #include "polynomial/functions.cuh" #include "utils/helper.cuh" @@ -128,6 +129,134 @@ host_radix_blocks_reverse_inplace(cudaStream_t const *streams, <<>>(src, blocks_count, lwe_size); } +// If group_size = 4, the first group of 4 elements will be transformed as +// follows: +// dest[0] = src[0] +// dest[1] = src[0] + src[1] +// dest[2] = src[0] + src[1] + src[2] +// dest[3] = src[0] + src[1] + src[2] + src[3] +template +__global__ void +radix_cumulative_sum_in_groups(Torus *dest, Torus *src, uint32_t blocks_count, + uint32_t lwe_size, uint32_t group_size) { + + size_t block_offset = blockIdx.x * group_size * lwe_size; + + for (int j = threadIdx.x; j < lwe_size; j += blockDim.x) { + size_t idx = j + block_offset; + Torus sum = src[idx]; + dest[idx] = sum; + for (int gidx = 1; gidx < group_size; gidx++) { + if (gidx + blockIdx.x * group_size < + blocks_count) { // in case the last group is not full + sum += src[idx + gidx * lwe_size]; + dest[idx + gidx * lwe_size] = sum; + } + } + } +} + +template +__host__ void host_radix_cumulative_sum_in_groups( + cudaStream_t stream, uint32_t gpu_index, Torus *dest, Torus *src, + uint32_t radix_blocks_count, uint32_t lwe_size, uint32_t group_size) { + cudaSetDevice(gpu_index); + // Each CUDA block is responsible for a single group + int num_blocks = (radix_blocks_count + group_size - 1) / group_size, + num_threads = 512; + radix_cumulative_sum_in_groups<<>>( + dest, src, radix_blocks_count, lwe_size, group_size); +} + +template +__global__ void radix_split_simulators_and_grouping_pgns( + Torus *simulators, Torus *grouping_pgns, Torus *src, uint32_t blocks_count, + uint32_t lwe_size, uint32_t group_size, Torus delta) { + + size_t block_offset = blockIdx.x * lwe_size; + if (blockIdx.x % group_size == 0) { + if (blockIdx.x == 0) { + // save trivial 0 + for (int j = threadIdx.x; j < lwe_size; j += blockDim.x) { + simulators[j] = 0; + } + } else { + // save trivial 1 + for (int j = threadIdx.x; j < lwe_size - 1; j += blockDim.x) { + size_t simu_idx = j + block_offset; + simulators[simu_idx] = 0; + } + if (threadIdx.x == 0) { + simulators[lwe_size - 1 + block_offset] = 1 * delta; + } + } + + if ((blockIdx.x / group_size + 1) < + (blocks_count + group_size - 1) / group_size) { + size_t src_offset = (blockIdx.x + group_size - 1) * lwe_size; + size_t pgns_offset = (blockIdx.x / group_size) * lwe_size; + for (int j = threadIdx.x; j < lwe_size; j += blockDim.x) { + size_t in_offset = j + src_offset; + size_t out_offset = j + pgns_offset; + grouping_pgns[out_offset] = src[in_offset]; + } + } + } else { + // save simulators + size_t src_offset = (blockIdx.x - 1) * lwe_size; + for (int j = threadIdx.x; j < lwe_size; j += blockDim.x) { + simulators[j + block_offset] = src[j + src_offset]; + } + } +} + +template +__host__ void host_radix_split_simulators_and_grouping_pgns( + cudaStream_t stream, uint32_t gpu_index, Torus *simulators, + Torus *grouping_pgns, Torus *src, uint32_t radix_blocks_count, + uint32_t lwe_size, uint32_t group_size, Torus delta) { + cudaSetDevice(gpu_index); + // Each CUDA block is responsible for a single group + int num_blocks = radix_blocks_count, num_threads = 512; + radix_split_simulators_and_grouping_pgns + <<>>(simulators, grouping_pgns, src, + radix_blocks_count, lwe_size, + group_size, delta); +} + +// If group_size = 4, the first group of 4 elements will be transformed as +// follows: +// src1 size num_radix_blocks * lwe_size +// src2 size num_group * lwe_size +// 
dest[0] = src1[0] + src2[0]
+// dest[1] = src1[1] + src2[0]
+// dest[2] = src1[2] + src2[0]
+// dest[3] = src1[3] + src2[0]
+template <typename Torus>
+__global__ void radix_sum_in_groups(Torus *dest, Torus *src1, Torus *src2,
+                                    uint32_t blocks_count, uint32_t lwe_size,
+                                    uint32_t group_size) {
+
+  size_t src1_offset = blockIdx.x * lwe_size;
+  size_t src2_index = (blockIdx.x / group_size) * lwe_size;
+  for (int j = threadIdx.x; j < lwe_size; j += blockDim.x) {
+    size_t idx = j + src1_offset;
+    dest[idx] = src1[idx] + src2[j + src2_index];
+  }
+}
+
+template <typename Torus>
+__host__ void host_radix_sum_in_groups(cudaStream_t stream, uint32_t gpu_index,
+                                       Torus *dest, Torus *src1, Torus *src2,
+                                       uint32_t radix_blocks_count,
+                                       uint32_t lwe_size, uint32_t group_size) {
+  cudaSetDevice(gpu_index);
+
+  int num_blocks = radix_blocks_count, num_threads = 512;
+  radix_sum_in_groups<<<num_blocks, num_threads, 0, stream>>>(
+      dest, src1, src2, radix_blocks_count, lwe_size, group_size);
+}
+
 // polynomial_size threads
 template
 __global__ void
@@ -479,6 +608,48 @@ void generate_lookup_table(Torus *acc, uint32_t glwe_dimension,
   rotate_left(body, half_box_size, polynomial_size);
 }
+template <typename Torus>
+void generate_many_lookup_table(
+    Torus *acc, uint32_t glwe_dimension, uint32_t polynomial_size,
+    uint32_t message_modulus, uint32_t carry_modulus,
+    std::vector<std::function<Torus(Torus)>> &functions) {
+
+  uint32_t modulus_sup = message_modulus * carry_modulus;
+  uint32_t box_size = polynomial_size / modulus_sup;
+  Torus delta = (1ul << 63) / modulus_sup;
+
+  memset(acc, 0, glwe_dimension * polynomial_size * sizeof(Torus));
+
+  auto body = &acc[glwe_dimension * polynomial_size];
+
+  size_t fn_counts = functions.size();
+
+  assert(fn_counts <= modulus_sup / 2);
+
+  // Space used for each sub-LUT
+  uint32_t single_function_sub_lut_size = (modulus_sup / fn_counts) * box_size;
+
+  // Fill each function's sub-LUT with f(i) encoded under delta
+  for (int f = 0; f < fn_counts; f++) {
+    int lut_offset = f * single_function_sub_lut_size;
+    for (int i = 0; i < modulus_sup / fn_counts; i++) {
+      int index = i * box_size + lut_offset;
+      for (int j = index; j < index + box_size; j++) {
+        auto f_eval = functions[f](i);
+        body[j] = f_eval * delta;
+      }
+    }
+  }
+  int half_box_size = box_size / 2;
+
+  // Negate the first half_box_size coefficients
+  for (int i = 0; i < half_box_size; i++) {
+    body[i] = -body[i];
+  }
+
+  rotate_left(body, half_box_size, polynomial_size);
+}
+
 template <typename Torus>
 void generate_lookup_table_bivariate(Torus *acc, uint32_t glwe_dimension,
                                      uint32_t polynomial_size,
@@ -612,6 +783,37 @@ void generate_device_accumulator(cudaStream_t stream, uint32_t gpu_index,
   free(h_lut);
 }
+/*
+ * Generate a many-LUT accumulator into a device pointer
+ * stream - cuda stream
+ * acc - device pointer for the accumulator
+ * ...
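+ * Layout note (sketch): the host-side table is filled by
+ * generate_many_lookup_table above; with modulus_sup = message_modulus *
+ * carry_modulus and box_size = polynomial_size / modulus_sup, each of the
+ * fn_counts functions owns a sub-LUT of (modulus_sup / fn_counts) * box_size
+ * coefficients, e.g. modulus_sup = 16 with two functions gives two sub-LUTs
+ * of 8 boxes each.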
+ * vector - evaluating functions with one Torus input + */ +template +void generate_many_lut_device_accumulator( + cudaStream_t stream, uint32_t gpu_index, Torus *acc, + uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t message_modulus, + uint32_t carry_modulus, + std::vector> &functions) { + + // host lut + Torus *h_lut = + (Torus *)malloc((glwe_dimension + 1) * polynomial_size * sizeof(Torus)); + + // fill accumulator + generate_many_lookup_table(h_lut, glwe_dimension, polynomial_size, + message_modulus, carry_modulus, functions); + + // copy host lut and lut_indexes_vec to device + cuda_memcpy_async_to_gpu( + acc, h_lut, (glwe_dimension + 1) * polynomial_size * sizeof(Torus), + stream, gpu_index); + + cuda_synchronize_stream(stream, gpu_index); + free(h_lut); +} + template void scratch_cuda_propagate_single_carry_kb_inplace( cudaStream_t const *streams, uint32_t const *gpu_indexes, @@ -624,6 +826,108 @@ void scratch_cuda_propagate_single_carry_kb_inplace( num_radix_blocks, allocate_gpu_memory); } +template +void host_compute_shifted_blocks_and_states( + cudaStream_t const *streams, uint32_t const *gpu_indexes, + uint32_t gpu_count, Torus *lwe_array, int_radix_params params, + int_shifted_blocks_and_states_memory *mem, void *const *bsks, + Torus *const *ksks, uint32_t num_blocks, uint32_t lut_stride, + uint32_t lut_count) { + + auto glwe_dimension = params.glwe_dimension; + auto polynomial_size = params.polynomial_size; + uint32_t big_lwe_size = glwe_dimension * polynomial_size + 1; + auto big_lwe_size_bytes = big_lwe_size * sizeof(Torus); + + auto shifted_blocks_and_states = mem->shifted_blocks_and_states; + auto luts_array_first_step = mem->luts_array_first_step; + + integer_radix_apply_many_univariate_lookup_table_kb( + streams, gpu_indexes, gpu_count, shifted_blocks_and_states, lwe_array, + bsks, ksks, num_blocks, luts_array_first_step, lut_count, lut_stride); + + auto shifted_blocks = mem->shifted_blocks; + auto block_states = mem->block_states; + cuda_memcpy_async_gpu_to_gpu(block_states, shifted_blocks_and_states, + big_lwe_size_bytes * num_blocks, streams[0], + gpu_indexes[0]); + cuda_memcpy_async_gpu_to_gpu( + shifted_blocks, shifted_blocks_and_states + big_lwe_size * num_blocks, + big_lwe_size_bytes * num_blocks, streams[0], gpu_indexes[0]); +} + +template +void host_resolve_group_carries_sequentially( + cudaStream_t const *streams, uint32_t const *gpu_indexes, + uint32_t gpu_count, Torus *resolved_carries, Torus *grouping_pgns, + int_radix_params params, int_seq_group_prop_memory *mem, + void *const *bsks, Torus *const *ksks, uint32_t num_groups) { + + auto glwe_dimension = params.glwe_dimension; + auto polynomial_size = params.polynomial_size; + uint32_t big_lwe_size = glwe_dimension * polynomial_size + 1; + auto big_lwe_size_bytes = big_lwe_size * sizeof(Torus); + + auto group_resolved_carries = mem->group_resolved_carries; + if (num_groups > 1) { + // First carry is just copied + cuda_memcpy_async_gpu_to_gpu(resolved_carries + big_lwe_size, grouping_pgns, + big_lwe_size_bytes, streams[0], + gpu_indexes[0]); + uint32_t solve_per_iter = mem->grouping_size - 1; + uint32_t remaining_carries = + num_groups - + 2; // the first one has been resolved and we ignore the last one + uint32_t num_loops = + ceil(double(remaining_carries) / (double)(solve_per_iter)); + uint32_t last_resolved_pos = 1; + + for (int i = 0; i < num_loops; i++) { + uint32_t loop_offset = i * solve_per_iter; + uint32_t blocks_to_solve = solve_per_iter; + // In case the last iteration has to solve 
less + if (loop_offset + blocks_to_solve > num_groups - 2) { + blocks_to_solve = remaining_carries - loop_offset; + } + + // The group_resolved carries is used as an intermediate array + // First we need to copy the last resolved carry + cuda_memcpy_async_gpu_to_gpu( + group_resolved_carries, + resolved_carries + last_resolved_pos * big_lwe_size, + big_lwe_size_bytes, streams[0], gpu_indexes[0]); + + // The array is filled with the blocks_to_solve + cuda_memcpy_async_gpu_to_gpu( + group_resolved_carries + big_lwe_size, + grouping_pgns + last_resolved_pos * big_lwe_size, + blocks_to_solve * big_lwe_size_bytes, streams[0], gpu_indexes[0]); + + // Perform one group cumulative sum + host_radix_cumulative_sum_in_groups( + streams[0], gpu_indexes[0], group_resolved_carries, + group_resolved_carries, blocks_to_solve + 1, big_lwe_size, + mem->grouping_size); + + // Apply the lut + auto luts_sequential = mem->lut_sequential_algorithm; + integer_radix_apply_univariate_lookup_table_kb( + streams, gpu_indexes, gpu_count, + group_resolved_carries + big_lwe_size, + group_resolved_carries + big_lwe_size, bsks, ksks, blocks_to_solve, + luts_sequential); + + // Copy the result to the resolved carries array + cuda_memcpy_async_gpu_to_gpu( + resolved_carries + (last_resolved_pos + 1) * big_lwe_size, + group_resolved_carries + big_lwe_size, + blocks_to_solve * big_lwe_size_bytes, streams[0], gpu_indexes[0]); + + last_resolved_pos += blocks_to_solve; + } + } +} + template void host_compute_prefix_sum_hillis_steele( cudaStream_t const *streams, uint32_t const *gpu_indexes, @@ -661,6 +965,95 @@ void host_compute_prefix_sum_hillis_steele( } } +template +void host_compute_propagation_simulators_and_group_carries( + cudaStream_t const *streams, uint32_t const *gpu_indexes, + uint32_t gpu_count, Torus *block_states, int_radix_params params, + int_prop_simu_group_carries_memory *mem, void *const *bsks, + Torus *const *ksks, uint32_t num_blocks, uint32_t num_groups) { + + auto glwe_dimension = params.glwe_dimension; + auto polynomial_size = params.polynomial_size; + auto message_modulus = params.message_modulus; + auto carry_modulus = params.carry_modulus; + + uint32_t big_lwe_size = glwe_dimension * polynomial_size + 1; + + auto propagation_cum_sums = mem->propagation_cum_sums; + auto group_size = mem->group_size; + host_radix_cumulative_sum_in_groups( + streams[0], gpu_indexes[0], propagation_cum_sums, block_states, + num_blocks, big_lwe_size, group_size); + + auto luts_array_second_step = mem->luts_array_second_step; + integer_radix_apply_univariate_lookup_table_kb( + streams, gpu_indexes, gpu_count, propagation_cum_sums, + propagation_cum_sums, bsks, ksks, num_blocks, luts_array_second_step); + + auto scalar_array_cum_sum = mem->scalar_array_cum_sum; + auto big_lwe_dimension = big_lwe_size - 1; + + host_integer_radix_scalar_addition_inplace( + streams, gpu_indexes, gpu_count, propagation_cum_sums, + scalar_array_cum_sum, big_lwe_dimension, num_blocks, message_modulus, + carry_modulus); + + uint32_t modulus_sup = message_modulus * carry_modulus; + Torus delta = (1ull << 63) / modulus_sup; + auto simulators = mem->simulators; + auto grouping_pgns = mem->grouping_pgns; + host_radix_split_simulators_and_grouping_pgns( + streams[0], gpu_indexes[0], simulators, grouping_pgns, + propagation_cum_sums, num_blocks, big_lwe_size, group_size, delta); + + auto resolved_carries = mem->resolved_carries; + if (mem->use_sequential_algorithm_to_resolver_group_carries) { + // Resolve group carries sequentially + 
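+    // (The sequential path resolves up to grouping_size - 1 group carries per
+    // iteration, reusing the group cumulative-sum kernel followed by a single
+    // LUT application; see host_resolve_group_carries_sequentially above.)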
host_resolve_group_carries_sequentially( + streams, gpu_indexes, gpu_count, resolved_carries, grouping_pgns, + params, mem->seq_group_prop_mem, bsks, ksks, num_groups); + } else { + // Resolve group carries with hillis steele + auto luts_carry_propagation_sum = mem->hs_group_prop_mem->lut_hillis_steele; + host_compute_prefix_sum_hillis_steele( + streams, gpu_indexes, gpu_count, &resolved_carries[big_lwe_size], + grouping_pgns, params, luts_carry_propagation_sum, bsks, ksks, + num_groups - 1); + } +} + +template +void host_compute_shifted_blocks_and_borrow_states( + cudaStream_t const *streams, uint32_t const *gpu_indexes, + uint32_t gpu_count, Torus *lwe_array, int_radix_params params, + int_shifted_blocks_and_borrow_states_memory *mem, void *const *bsks, + Torus *const *ksks, uint32_t num_blocks, uint32_t lut_stride, + uint32_t lut_count) { + + auto glwe_dimension = params.glwe_dimension; + auto polynomial_size = params.polynomial_size; + uint32_t big_lwe_size = glwe_dimension * polynomial_size + 1; + auto big_lwe_size_bytes = big_lwe_size * sizeof(Torus); + + auto shifted_blocks_and_borrow_states = mem->shifted_blocks_and_borrow_states; + auto luts_array_first_step = mem->luts_array_first_step; + + integer_radix_apply_many_univariate_lookup_table_kb( + streams, gpu_indexes, gpu_count, shifted_blocks_and_borrow_states, + lwe_array, bsks, ksks, num_blocks, luts_array_first_step, lut_count, + lut_stride); + + auto shifted_blocks = mem->shifted_blocks; + auto borrow_states = mem->borrow_states; + cuda_memcpy_async_gpu_to_gpu(borrow_states, shifted_blocks_and_borrow_states, + big_lwe_size_bytes * num_blocks, streams[0], + gpu_indexes[0]); + cuda_memcpy_async_gpu_to_gpu( + shifted_blocks, + shifted_blocks_and_borrow_states + big_lwe_size * num_blocks, + big_lwe_size_bytes * num_blocks, streams[0], gpu_indexes[0]); +} + template void host_propagate_single_carry(cudaStream_t const *streams, uint32_t const *gpu_indexes, @@ -1146,4 +1539,247 @@ void host_apply_bivariate_lut_kb( radix_lwe_in_2, bsks, ksks, num_blocks, mem, shift); } +template +void scratch_cuda_fast_propagate_single_carry_kb_inplace( + cudaStream_t const *streams, uint32_t const *gpu_indexes, + uint32_t gpu_count, int_fast_sc_prop_memory **mem_ptr, + uint32_t num_radix_blocks, int_radix_params params, uint32_t requested_flag, + uint32_t uses_carry, bool allocate_gpu_memory) { + + *mem_ptr = new int_fast_sc_prop_memory( + streams, gpu_indexes, gpu_count, params, num_radix_blocks, requested_flag, + uses_carry, allocate_gpu_memory); +} + +template +void host_fast_propagate_single_carry( + cudaStream_t const *streams, uint32_t const *gpu_indexes, + uint32_t gpu_count, Torus *lwe_array, Torus *carry_out, + const Torus *input_carries, int_fast_sc_prop_memory *mem, + void *const *bsks, Torus *const *ksks, uint32_t num_blocks, + uint32_t requested_flag, uint32_t uses_carry) { + auto params = mem->params; + auto glwe_dimension = params.glwe_dimension; + auto polynomial_size = params.polynomial_size; + auto message_modulus = params.message_modulus; + auto carry_modulus = params.carry_modulus; + uint32_t big_lwe_size = glwe_dimension * polynomial_size + 1; + auto big_lwe_size_bytes = big_lwe_size * sizeof(Torus); + auto big_lwe_dimension = big_lwe_size - 1; // For host addition + auto lut_stride = mem->lut_stride; + auto lut_count = mem->lut_count; + + enum outputFlag { NONE = 0, OVERFLOW = 1, CARRY = 2 }; + if (uses_carry == 1) { + host_addition(streams[0], gpu_indexes[0], lwe_array, lwe_array, + input_carries, big_lwe_dimension, 1); + } + + 
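+  // Propagation sketch (rough outline of the steps below):
+  //   1) one many-LUT pass computes, for every block, its shifted value and
+  //      its propagation state,
+  //   2) states are summed inside each group and turned into per-block
+  //      simulators plus one propagation signal per group (grouping_pgns),
+  //   3) group carries are resolved either sequentially or with a
+  //      Hillis-Steele prefix scan,
+  //   4) resolved carries are added back in groups and a final LUT extracts
+  //      the message (and the carry/overflow flag when requested).
+  // (With message_modulus = carry_modulus = 4, a group typically holds
+  // log2(16) = 4 blocks.)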
+  host_compute_shifted_blocks_and_states(
+      streams, gpu_indexes, gpu_count, lwe_array, params,
+      mem->shifted_blocks_state_mem, bsks, ksks, num_blocks, lut_stride,
+      lut_count);
+  auto block_states = mem->shifted_blocks_state_mem->block_states;
+  if (requested_flag == outputFlag::OVERFLOW) {
+    // This LUT could be folded into the many-LUT pass above with some
+    // trickery so that it runs in parallel; for now it is applied as a
+    // separate call.
+    auto lut_overflow_prep = mem->lut_overflow_flag_prep;
+    integer_radix_apply_univariate_lookup_table_kb(
+        streams, gpu_indexes, gpu_count, mem->output_flag,
+        lwe_array + (num_blocks - 1) * big_lwe_size, bsks, ksks, 1,
+        lut_overflow_prep);
+  } else if (requested_flag == outputFlag::CARRY) {
+    cuda_memcpy_async_gpu_to_gpu(
+        mem->output_flag, block_states + (num_blocks - 1) * big_lwe_size,
+        big_lwe_size_bytes, streams[0], gpu_indexes[0]);
+  }
+
+  host_compute_propagation_simulators_and_group_carries(
+      streams, gpu_indexes, gpu_count, block_states, params,
+      mem->prop_simu_group_carries_mem, bsks, ksks, num_blocks,
+      mem->num_groups);
+
+  auto group_size = mem->prop_simu_group_carries_mem->group_size;
+
+  auto prepared_blocks = mem->prop_simu_group_carries_mem->prepared_blocks;
+  auto shifted_blocks = mem->shifted_blocks_state_mem->shifted_blocks;
+  host_addition(streams[0], gpu_indexes[0], prepared_blocks,
+                shifted_blocks,
+                mem->prop_simu_group_carries_mem->simulators,
+                big_lwe_dimension, num_blocks);
+
+  if (requested_flag == outputFlag::OVERFLOW ||
+      requested_flag == outputFlag::CARRY) {
+    host_addition(streams[0], gpu_indexes[0], mem->output_flag,
+                  mem->output_flag,
+                  mem->prop_simu_group_carries_mem->simulators +
+                      (num_blocks - 1) * big_lwe_size,
+                  big_lwe_dimension, 1);
+  }
+
+  for (uint j = 0; j < mem->active_gpu_count; j++) {
+    cuda_synchronize_stream(streams[j], gpu_indexes[j]);
+  }
+
+  // Add the resolved carries in groups and clean up the result blocks on
+  // sub_streams_1; the requested flag, if any, is finalized on sub_streams_2.
+  host_radix_sum_in_groups(
+      mem->sub_streams_1[0], gpu_indexes[0], prepared_blocks, prepared_blocks,
+      mem->prop_simu_group_carries_mem->resolved_carries, num_blocks,
+      big_lwe_size, group_size);
+
+  auto message_extract = mem->lut_message_extract;
+  integer_radix_apply_univariate_lookup_table_kb(
+      mem->sub_streams_1, gpu_indexes, gpu_count, lwe_array, prepared_blocks,
+      bsks, ksks, num_blocks, message_extract);
+
+  if (requested_flag == outputFlag::OVERFLOW ||
+      requested_flag == outputFlag::CARRY) {
+    // This step could also be parallelized with some extra work; for now it
+    // runs sequentially on sub_streams_2.
+    host_addition(mem->sub_streams_2[0], gpu_indexes[0],
+                  mem->output_flag, mem->output_flag,
+                  mem->prop_simu_group_carries_mem->resolved_carries +
+                      (mem->num_groups - 1) * big_lwe_size,
+                  big_lwe_dimension, 1);
+
+    if (requested_flag == outputFlag::OVERFLOW) {
+      integer_radix_apply_univariate_lookup_table_kb(
+          mem->sub_streams_2, gpu_indexes, gpu_count, mem->output_flag,
+          mem->output_flag, bsks, ksks, 1, mem->lut_overflow_flag_last);
+    } else {
+      integer_radix_apply_univariate_lookup_table_kb(
+          mem->sub_streams_2, gpu_indexes, gpu_count, mem->output_flag,
+          mem->output_flag, bsks, ksks, 1, mem->lut_carry_flag_last);
+    }
+    for (uint j = 0; j < mem->active_gpu_count; j++) {
+      cuda_memcpy_async_gpu_to_gpu(carry_out, mem->output_flag,
+                                   big_lwe_size_bytes, mem->sub_streams_2[j],
+                                   gpu_indexes[j]);
+    }
+  }
+
+  for (uint j = 0; j < mem->active_gpu_count; j++) {
+    cuda_synchronize_stream(mem->sub_streams_1[j], gpu_indexes[j]);
+    cuda_synchronize_stream(mem->sub_streams_2[j], gpu_indexes[j]);
+  }
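+  // At this point lwe_array holds the propagated result; carry_out (when a
+  // flag was requested) holds the carry or overflow flag computed on
+  // sub_streams_2.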
+}
+
+template <typename Torus>
+void scratch_cuda_integer_overflowing_sub(
+    cudaStream_t const *streams, uint32_t const *gpu_indexes,
+    uint32_t gpu_count, int_fast_borrow_prop_memory<Torus> **mem_ptr,
+    uint32_t num_radix_blocks, int_radix_params params,
+    uint32_t compute_overflow, bool allocate_gpu_memory) {
+
+  *mem_ptr = new int_fast_borrow_prop_memory<Torus>(
+      streams, gpu_indexes, gpu_count, params, num_radix_blocks,
+      compute_overflow, allocate_gpu_memory);
+}
+
+template <typename Torus>
+void host_fast_borrow_propagate(cudaStream_t const *streams,
+                                uint32_t const *gpu_indexes, uint32_t gpu_count,
+                                Torus *lhsrhs_array, Torus *overflow_block,
+                                const Torus *input_borrow,
+                                int_fast_borrow_prop_memory<Torus> *mem,
+                                void *const *bsks, Torus *const *ksks,
+                                uint32_t num_blocks, uint32_t num_groups,
+                                uint32_t compute_overflow,
+                                uint32_t uses_input_borrow) {
+  auto params = mem->params;
+  auto glwe_dimension = params.glwe_dimension;
+  auto polynomial_size = params.polynomial_size;
+  auto message_modulus = params.message_modulus;
+  auto carry_modulus = params.carry_modulus;
+  uint32_t big_lwe_size = glwe_dimension * polynomial_size + 1;
+  auto big_lwe_size_bytes = big_lwe_size * sizeof(Torus);
+  auto big_lwe_dimension = big_lwe_size - 1;
+  auto lut_stride = mem->lut_stride;
+  auto lut_count = mem->lut_count;
+
+  assert(mem->num_groups >= num_groups);
+  if (uses_input_borrow == 1) {
+    host_unchecked_sub_with_correcting_term(
+        streams[0], gpu_indexes[0], lhsrhs_array, lhsrhs_array, input_borrow,
+        big_lwe_dimension, 1, message_modulus, carry_modulus,
+        message_modulus - 1);
+  }
+
+  host_compute_shifted_blocks_and_borrow_states(
+      streams, gpu_indexes, gpu_count, lhsrhs_array, params,
+      mem->shifted_blocks_borrow_state_mem, bsks, ksks, num_blocks, lut_stride,
+      lut_count);
+
+  auto borrow_states = mem->shifted_blocks_borrow_state_mem->borrow_states;
+  cuda_memcpy_async_gpu_to_gpu(mem->overflow_block,
+                               borrow_states + (num_blocks - 1) * big_lwe_size,
+                               big_lwe_size_bytes, streams[0], gpu_indexes[0]);
+
+  host_compute_propagation_simulators_and_group_carries(
+      streams, gpu_indexes, gpu_count, borrow_states, params,
+      mem->prop_simu_group_carries_mem, bsks, ksks, num_blocks, num_groups);
+
+  auto shifted_blocks = mem->shifted_blocks_borrow_state_mem->shifted_blocks;
+  auto prepared_blocks = mem->prop_simu_group_carries_mem->prepared_blocks;
+  auto simulators = mem->prop_simu_group_carries_mem->simulators;
+
+  host_subtraction(streams[0], gpu_indexes[0], prepared_blocks,
+                   shifted_blocks, simulators, big_lwe_dimension,
+                   num_blocks);
+
+  // unchecked_scalar_add_assign
+  host_integer_radix_add_scalar_one_inplace(
+      streams, gpu_indexes, gpu_count, prepared_blocks, big_lwe_dimension,
+      num_blocks, message_modulus, carry_modulus);
+
+  // unchecked_add_assign into overflow_block
+  if (compute_overflow == 1) {
+    host_addition(streams[0], gpu_indexes[0], mem->overflow_block,
+                  mem->overflow_block,
+                  mem->prop_simu_group_carries_mem->simulators +
+                      (num_blocks - 1) * big_lwe_size,
+                  big_lwe_dimension, 1);
+  }
+  auto resolved_borrows = mem->prop_simu_group_carries_mem->resolved_carries;
+
+  // This must happen before the next step, which modifies the resolved
+  // borrows
+  if (compute_overflow == 1) {
+    host_addition(streams[0], gpu_indexes[0], mem->overflow_block,
+                  mem->overflow_block,
+                  resolved_borrows + (num_groups - 1) * big_lwe_size,
+                  big_lwe_dimension, 1);
+  }
+
+  cudaEventRecord(mem->incoming_events[0], streams[0]);
+  cudaStreamWaitEvent(mem->sub_streams_1[0], mem->incoming_events[0], 0);
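+  // Fork two helper streams off streams[0] via the incoming event:
+  // sub_streams_1 computes the borrow/overflow flag while sub_streams_2
+  // negates the resolved borrows, adds them back per group and extracts the
+  // final messages.
+  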
cudaStreamWaitEvent(mem->sub_streams_2[0], mem->incoming_events[0], 0); + + if (compute_overflow == 1) { + auto borrow_flag = mem->lut_borrow_flag; + integer_radix_apply_univariate_lookup_table_kb( + mem->sub_streams_1, gpu_indexes, gpu_count, overflow_block, + mem->overflow_block, bsks, ksks, 1, borrow_flag); + } + cudaEventRecord(mem->outgoing_events1[0], mem->sub_streams_1[0]); + // subtract borrow and cleanup prepared blocks + host_negation(mem->sub_streams_2[0], gpu_indexes[0], resolved_borrows, + resolved_borrows, big_lwe_dimension, num_groups); + + host_radix_sum_in_groups( + mem->sub_streams_2[0], gpu_indexes[0], prepared_blocks, prepared_blocks, + resolved_borrows, num_blocks, big_lwe_size, mem->group_size); + + auto message_extract = mem->lut_message_extract; + integer_radix_apply_univariate_lookup_table_kb( + mem->sub_streams_2, gpu_indexes, gpu_count, lhsrhs_array, prepared_blocks, + bsks, ksks, num_blocks, message_extract); + + cudaEventRecord(mem->outgoing_events2[0], mem->sub_streams_2[0]); + + cudaStreamWaitEvent(streams[0], mem->outgoing_events1[0], 0); + cudaStreamWaitEvent(streams[0], mem->outgoing_events2[0], 0); +} + #endif // TFHE_RS_INTERNAL_INTEGER_CUH diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh index 0ebf410125..6f51099a3b 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh @@ -562,10 +562,21 @@ __host__ void host_integer_mult_radix_kb( terms_degree, bsks, ksks, mem_ptr->sum_ciphertexts_mem, num_blocks, 2 * num_blocks, mem_ptr->luts_array); - auto scp_mem_ptr = mem_ptr->sum_ciphertexts_mem->scp_mem; - host_propagate_single_carry(streams, gpu_indexes, gpu_count, - radix_lwe_out, nullptr, nullptr, - scp_mem_ptr, bsks, ksks, num_blocks); + uint32_t block_modulus = message_modulus * carry_modulus; + uint32_t num_bits_in_block = std::log2(block_modulus); + // if (num_blocks < num_bits_in_block) { + // auto scp_mem_ptr = mem_ptr->sum_ciphertexts_mem->scp_mem; + // host_propagate_single_carry(streams, gpu_indexes, gpu_count, + // radix_lwe_out, nullptr, nullptr, + // scp_mem_ptr, bsks, ksks, num_blocks); + // } else { + auto fast_scp_mem_ptr = mem_ptr->fast_sc_prop_mem; + uint32_t requested_flag = 0; + uint32_t uses_carry = 0; + host_fast_propagate_single_carry( + streams, gpu_indexes, gpu_count, radix_lwe_out, nullptr, nullptr, + fast_scp_mem_ptr, bsks, ksks, num_blocks, requested_flag, uses_carry); + //} } template diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/negation.cu b/backends/tfhe-cuda-backend/cuda/src/integer/negation.cu index e55ea9e912..d7d104fa33 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/negation.cu +++ b/backends/tfhe-cuda-backend/cuda/src/integer/negation.cu @@ -38,15 +38,7 @@ void cuda_integer_radix_overflowing_sub_kb_64( void const *radix_lwe_right, int8_t *mem_ptr, void *const *bsks, void *const *ksks, uint32_t num_blocks) { - auto mem = (int_overflowing_sub_memory *)mem_ptr; - - host_integer_overflowing_sub_kb( - (cudaStream_t *)(streams), gpu_indexes, gpu_count, - static_cast(radix_lwe_out), - static_cast(radix_lwe_overflowed), - static_cast(radix_lwe_left), - static_cast(radix_lwe_right), bsks, (uint64_t **)(ksks), - mem, num_blocks); + // auto mem = (int_overflowing_sub_memory *)mem_ptr; } void cleanup_cuda_integer_radix_overflowing_sub(void *const *streams, diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/negation.cuh 
b/backends/tfhe-cuda-backend/cuda/src/integer/negation.cuh index 6eda409df9..fb88efc685 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/negation.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/integer/negation.cuh @@ -91,7 +91,7 @@ __host__ void scratch_cuda_integer_overflowing_sub_kb( *mem_ptr = new int_overflowing_sub_memory( streams, gpu_indexes, gpu_count, params, num_blocks, allocate_gpu_memory); } - +/* template __host__ void host_integer_overflowing_sub_kb( cudaStream_t const *streams, uint32_t const *gpu_indexes, @@ -113,4 +113,39 @@ __host__ void host_integer_overflowing_sub_kb( mem_ptr, bsks, ksks, num_blocks); } +*/ +template +__host__ void host_integer_overflowing_sub( + cudaStream_t const *streams, uint32_t const *gpu_indexes, + uint32_t gpu_count, Torus *lwe_out_array, Torus *lhs_array, + const Torus *rhs_array, Torus *overflow_block, const Torus *input_borrow, + int_fast_borrow_prop_memory *mem_ptr, void *const *bsks, + Torus *const *ksks, uint32_t num_blocks, uint32_t compute_overflow, + uint32_t uses_input_borrow) { + + auto radix_params = mem_ptr->params; + + // We need to recalculate the num_groups, because on the division the number + // of num_blocks changes + uint32_t block_modulus = + radix_params.message_modulus * radix_params.carry_modulus; + uint32_t num_bits_in_block = std::log2(block_modulus); + uint32_t grouping_size = num_bits_in_block; + uint32_t num_groups = (num_blocks + grouping_size - 1) / grouping_size; + + auto stream = (cudaStream_t *)streams; + host_unchecked_sub_with_correcting_term( + stream[0], gpu_indexes[0], static_cast(lwe_out_array), + static_cast(lhs_array), static_cast(rhs_array), + radix_params.big_lwe_dimension, num_blocks, radix_params.message_modulus, + radix_params.carry_modulus, radix_params.message_modulus - 1); + + host_fast_borrow_propagate( + streams, gpu_indexes, gpu_count, static_cast(lwe_out_array), + static_cast(overflow_block), + static_cast(input_borrow), + (int_fast_borrow_prop_memory *)mem_ptr, bsks, (Torus **)(ksks), + num_blocks, num_groups, compute_overflow, uses_input_borrow); +} + #endif diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_mul.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_mul.cuh index 37a51006ae..d7c4e1f10c 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_mul.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_mul.cuh @@ -112,10 +112,24 @@ __host__ void host_integer_scalar_mul_radix( terms_degree, bsks, ksks, mem->sum_ciphertexts_vec_mem, num_radix_blocks, j, nullptr); - auto scp_mem_ptr = mem->sum_ciphertexts_vec_mem->scp_mem; - host_propagate_single_carry(streams, gpu_indexes, gpu_count, lwe_array, - nullptr, nullptr, scp_mem_ptr, bsks, ksks, - num_radix_blocks); + // uint32_t carry_modulus = message_modulus; + // uint32_t block_modulus = message_modulus * carry_modulus; + // uint32_t num_bits_in_block = std::log2(block_modulus); + // if (num_radix_blocks < num_bits_in_block) { + // auto scp_mem_ptr = mem->sum_ciphertexts_vec_mem->scp_mem; + // host_propagate_single_carry(streams, gpu_indexes, gpu_count, + // lwe_array, + // nullptr, nullptr, scp_mem_ptr, bsks, + // ksks, num_radix_blocks); + // } else { + auto fast_scp_mem_ptr = mem->fast_sc_prop_mem; + uint32_t requested_flag = 0; + uint32_t uses_carry = 0; + host_fast_propagate_single_carry( + streams, gpu_indexes, gpu_count, lwe_array, nullptr, nullptr, + fast_scp_mem_ptr, bsks, ksks, num_radix_blocks, requested_flag, + uses_carry); + //} } } diff --git 
a/backends/tfhe-cuda-backend/cuda/src/linearalgebra/addition.cu b/backends/tfhe-cuda-backend/cuda/src/linearalgebra/addition.cu index d3f47ad263..51dfc6d3ea 100644 --- a/backends/tfhe-cuda-backend/cuda/src/linearalgebra/addition.cu +++ b/backends/tfhe-cuda-backend/cuda/src/linearalgebra/addition.cu @@ -57,6 +57,27 @@ void cuda_add_lwe_ciphertext_vector_64(void *stream, uint32_t gpu_index, static_cast(lwe_array_in_2), input_lwe_dimension, input_lwe_ciphertext_count); } +// last block it is the packing lhs*message_modulus + rhs +void cuda_add_lwe_ciphertext_vector_64_with_packing( + void *stream, uint32_t gpu_index, void *lwe_array_out, + void const *lwe_array_in_1, void const *lwe_array_in_2, + uint32_t input_lwe_dimension, uint32_t input_lwe_ciphertext_count, + uint32_t message_modulus) { + + host_addition(static_cast(stream), gpu_index, + static_cast(lwe_array_out), + static_cast(lwe_array_in_1), + static_cast(lwe_array_in_2), + input_lwe_dimension, input_lwe_ciphertext_count - 1); + + host_pack_for_overflowing_ops( + static_cast(stream), gpu_index, + static_cast(lwe_array_out), + static_cast(lwe_array_in_1), + static_cast(lwe_array_in_2), input_lwe_dimension, + input_lwe_ciphertext_count, message_modulus); +} + /* * Perform the addition of a u32 input LWE ciphertext vector with a u32 * plaintext vector. See the equivalent operation on u64 data for more details. diff --git a/backends/tfhe-cuda-backend/cuda/src/linearalgebra/addition.cuh b/backends/tfhe-cuda-backend/cuda/src/linearalgebra/addition.cuh index 29e1f62689..8da077414d 100644 --- a/backends/tfhe-cuda-backend/cuda/src/linearalgebra/addition.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/linearalgebra/addition.cuh @@ -82,6 +82,45 @@ __host__ void host_addition(cudaStream_t stream, uint32_t gpu_index, T *output, check_cuda_error(cudaGetLastError()); } +template +__global__ void pack_for_overflowing_ops(T *output, T const *input_1, + T const *input_2, uint32_t num_entries, + uint32_t message_modulus) { + + int tid = threadIdx.x; + int index = blockIdx.x * blockDim.x + tid; + if (index < num_entries) { + // Here we take advantage of the wrapping behaviour of uint + output[index] = input_1[index] * message_modulus + input_2[index]; + } +} + +template +__host__ void host_pack_for_overflowing_ops(cudaStream_t stream, + uint32_t gpu_index, T *output, + T const *input_1, T const *input_2, + uint32_t input_lwe_dimension, + uint32_t input_lwe_ciphertext_count, + uint32_t message_modulus) { + + cudaSetDevice(gpu_index); + // lwe_size includes the presence of the body + // whereas lwe_dimension is the number of elements in the mask + int lwe_size = input_lwe_dimension + 1; + // Create a 1-dimensional grid of threads + int num_blocks = 0, num_threads = 0; + int num_entries = lwe_size; + getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads); + dim3 grid(num_blocks, 1, 1); + dim3 thds(num_threads, 1, 1); + + pack_for_overflowing_ops<<>>( + &output[input_lwe_ciphertext_count - 1], + &input_1[input_lwe_ciphertext_count - 1], + &input_2[input_lwe_ciphertext_count - 1], lwe_size, message_modulus); + check_cuda_error(cudaGetLastError()); +} + template __global__ void subtraction(T *output, T const *input_1, T const *input_2, uint32_t num_entries) { diff --git a/backends/tfhe-cuda-backend/src/bindings.rs b/backends/tfhe-cuda-backend/src/bindings.rs index 2a665498f3..ea12bb14c3 100644 --- a/backends/tfhe-cuda-backend/src/bindings.rs +++ b/backends/tfhe-cuda-backend/src/bindings.rs @@ -755,6 +755,102 @@ extern "C" { mem_ptr_void: *mut *mut 
i8, ); } +extern "C" { + pub fn scratch_cuda_fast_propagate_single_carry_kb_64_inplace( + streams: *const *mut ffi::c_void, + gpu_indexes: *const u32, + gpu_count: u32, + mem_ptr: *mut *mut i8, + glwe_dimension: u32, + polynomial_size: u32, + big_lwe_dimension: u32, + small_lwe_dimension: u32, + ks_level: u32, + ks_base_log: u32, + pbs_level: u32, + pbs_base_log: u32, + grouping_factor: u32, + num_blocks: u32, + message_modulus: u32, + carry_modulus: u32, + pbs_type: PBS_TYPE, + requested_flag: u32, + uses_carry: u32, + allocate_gpu_memory: bool, + ); +} +extern "C" { + pub fn cuda_fast_propagate_single_carry_kb_64_inplace( + streams: *const *mut ffi::c_void, + gpu_indexes: *const u32, + gpu_count: u32, + lwe_array: *mut ffi::c_void, + carry_out: *mut ffi::c_void, + carry_in: *const ffi::c_void, + mem_ptr: *mut i8, + bsks: *const *mut ffi::c_void, + ksks: *const *mut ffi::c_void, + num_blocks: u32, + requested_flag: u32, + uses_carry: u32, + ); +} +extern "C" { + pub fn cleanup_cuda_fast_propagate_single_carry( + streams: *const *mut ffi::c_void, + gpu_indexes: *const u32, + gpu_count: u32, + mem_ptr_void: *mut *mut i8, + ); +} +extern "C" { + pub fn scratch_cuda_integer_overflowing_sub_kb_64_inplace( + streams: *const *mut ffi::c_void, + gpu_indexes: *const u32, + gpu_count: u32, + mem_ptr: *mut *mut i8, + glwe_dimension: u32, + polynomial_size: u32, + big_lwe_dimension: u32, + small_lwe_dimension: u32, + ks_level: u32, + ks_base_log: u32, + pbs_level: u32, + pbs_base_log: u32, + grouping_factor: u32, + num_blocks: u32, + message_modulus: u32, + carry_modulus: u32, + pbs_type: PBS_TYPE, + compute_overflow: u32, + allocate_gpu_memory: bool, + ); +} +extern "C" { + pub fn cuda_integer_overflowing_sub_kb_64_inplace( + streams: *const *mut ffi::c_void, + gpu_indexes: *const u32, + gpu_count: u32, + lhs_array: *mut ffi::c_void, + rhs_array: *const ffi::c_void, + overflow_block: *mut ffi::c_void, + input_borrow: *const ffi::c_void, + mem_ptr: *mut i8, + bsks: *const *mut ffi::c_void, + ksks: *const *mut ffi::c_void, + num_blocks: u32, + compute_overflow: u32, + uses_input_borrow: u32, + ); +} +extern "C" { + pub fn cleanup_cuda_integer_overflowing_sub( + streams: *const *mut ffi::c_void, + gpu_indexes: *const u32, + gpu_count: u32, + mem_ptr_void: *mut *mut i8, + ); +} extern "C" { pub fn scratch_cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64( streams: *const *mut ffi::c_void, @@ -1144,6 +1240,18 @@ extern "C" { input_lwe_ciphertext_count: u32, ); } +extern "C" { + pub fn cuda_add_lwe_ciphertext_vector_64_with_packing( + stream: *mut ffi::c_void, + gpu_index: u32, + lwe_array_out: *mut ffi::c_void, + lwe_array_in_1: *const ffi::c_void, + lwe_array_in_2: *const ffi::c_void, + input_lwe_dimension: u32, + input_lwe_ciphertext_count: u32, + message_modulus: u32, + ); +} extern "C" { pub fn cuda_add_lwe_ciphertext_vector_plaintext_vector_32( stream: *mut ffi::c_void, diff --git a/tfhe/src/core_crypto/gpu/mod.rs b/tfhe/src/core_crypto/gpu/mod.rs index b25cde608f..3766119d45 100644 --- a/tfhe/src/core_crypto/gpu/mod.rs +++ b/tfhe/src/core_crypto/gpu/mod.rs @@ -458,6 +458,33 @@ pub unsafe fn add_lwe_ciphertext_vector_assign_async( ); } +/// Discarding addition of a vector of LWE ciphertexts +/// +/// # Safety +/// +/// [CudaStreams::synchronize] __must__ be called as soon as synchronization is +/// required +pub unsafe fn add_lwe_ciphertext_vector_with_packing_async( + streams: &CudaStreams, + lwe_array_out: &mut CudaVec, + lwe_array_in_1: &CudaVec, + lwe_array_in_2: &CudaVec, + lwe_dimension: 
LweDimension, + num_samples: u32, + message_modulus: u32, +) { + cuda_add_lwe_ciphertext_vector_64_with_packing( + streams.ptr[0], + streams.gpu_indexes[0], + lwe_array_out.as_mut_c_ptr(0), + lwe_array_in_1.as_c_ptr(0), + lwe_array_in_2.as_c_ptr(0), + lwe_dimension.0 as u32, + num_samples, + message_modulus, + ); +} + /// Discarding addition of a vector of LWE ciphertexts with a vector of plaintexts /// /// # Safety diff --git a/tfhe/src/integer/gpu/mod.rs b/tfhe/src/integer/gpu/mod.rs index 0e40674f4c..5cfd32677d 100644 --- a/tfhe/src/integer/gpu/mod.rs +++ b/tfhe/src/integer/gpu/mod.rs @@ -15,6 +15,7 @@ use crate::shortint::{CarryModulus, MessageModulus}; pub use server_key::CudaServerKey; use std::cmp::min; +use crate::integer::server_key::radix_parallel::OutputFlag; use tfhe_cuda_backend::bindings::*; use tfhe_cuda_backend::cuda_bind::*; @@ -466,6 +467,41 @@ pub unsafe fn unchecked_add_integer_radix_assign_async( ); } +#[allow(clippy::too_many_arguments)] +/// # Safety +/// +/// - [CudaStreams::synchronize] __must__ be called after this function as soon as synchronization +/// is required +pub unsafe fn unchecked_add_integer_radix_assign_with_packing_async( + streams: &CudaStreams, + radix_lwe_left: &mut CudaVec, + radix_lwe_right: &CudaVec, + lwe_dimension: LweDimension, + num_blocks: u32, + message_modulus: u32, +) { + assert_eq!( + streams.gpu_indexes[0], + radix_lwe_left.gpu_index(0), + "GPU error: all data should reside on the same GPU." + ); + assert_eq!( + streams.gpu_indexes[0], + radix_lwe_right.gpu_index(0), + "GPU error: all data should reside on the same GPU." + ); + cuda_add_lwe_ciphertext_vector_64_with_packing( + streams.ptr[0], + streams.gpu_indexes[0], + radix_lwe_left.as_mut_c_ptr(0), + radix_lwe_left.as_c_ptr(0), + radix_lwe_right.as_c_ptr(0), + lwe_dimension.0 as u32, + num_blocks, + message_modulus, + ); +} + #[allow(clippy::too_many_arguments)] /// # Safety /// @@ -1085,6 +1121,94 @@ pub unsafe fn propagate_single_carry_assign_async( + streams: &CudaStreams, + radix_lwe_input: &mut CudaVec, + carry_out: &mut CudaVec, + carry_in: &CudaVec, + bootstrapping_key: &CudaVec, + keyswitch_key: &CudaVec, + lwe_dimension: LweDimension, + glwe_dimension: GlweDimension, + polynomial_size: PolynomialSize, + ks_level: DecompositionLevelCount, + ks_base_log: DecompositionBaseLog, + pbs_level: DecompositionLevelCount, + pbs_base_log: DecompositionBaseLog, + num_blocks: u32, + message_modulus: MessageModulus, + carry_modulus: CarryModulus, + pbs_type: PBSType, + grouping_factor: LweBskGroupingFactor, + requested_flag: OutputFlag, + uses_carry: u32, +) { + assert_eq!( + streams.gpu_indexes[0], + radix_lwe_input.gpu_index(0), + "GPU error: all data should reside on the same GPU." + ); + assert_eq!( + streams.gpu_indexes[0], + bootstrapping_key.gpu_index(0), + "GPU error: all data should reside on the same GPU." + ); + assert_eq!( + streams.gpu_indexes[0], + keyswitch_key.gpu_index(0), + "GPU error: all data should reside on the same GPU." 
+ ); + let mut mem_ptr: *mut i8 = std::ptr::null_mut(); + let big_lwe_dimension: u32 = glwe_dimension.0 as u32 * polynomial_size.0 as u32; + scratch_cuda_fast_propagate_single_carry_kb_64_inplace( + streams.ptr.as_ptr(), + streams.gpu_indexes.as_ptr(), + streams.len() as u32, + std::ptr::addr_of_mut!(mem_ptr), + glwe_dimension.0 as u32, + polynomial_size.0 as u32, + big_lwe_dimension, + lwe_dimension.0 as u32, + ks_level.0 as u32, + ks_base_log.0 as u32, + pbs_level.0 as u32, + pbs_base_log.0 as u32, + grouping_factor.0 as u32, + num_blocks, + message_modulus.0 as u32, + carry_modulus.0 as u32, + pbs_type as u32, + requested_flag as u32, + uses_carry as u32, + true, + ); + cuda_fast_propagate_single_carry_kb_64_inplace( + streams.ptr.as_ptr(), + streams.gpu_indexes.as_ptr(), + streams.len() as u32, + radix_lwe_input.as_mut_c_ptr(0), + carry_out.as_mut_c_ptr(0), + carry_in.as_c_ptr(0), + mem_ptr, + bootstrapping_key.ptr.as_ptr(), + keyswitch_key.ptr.as_ptr(), + num_blocks, + requested_flag as u32, + uses_carry as u32, + ); + cleanup_cuda_fast_propagate_single_carry( + streams.ptr.as_ptr(), + streams.gpu_indexes.as_ptr(), + streams.len() as u32, + std::ptr::addr_of_mut!(mem_ptr), + ); +} + #[allow(clippy::too_many_arguments)] /// # Safety /// @@ -2139,107 +2263,107 @@ pub unsafe fn unchecked_partial_sum_ciphertexts_integer_radix_kb_assign_async< ); } -#[allow(clippy::too_many_arguments)] -/// # Safety -/// -/// - [CudaStreams::synchronize] __must__ be called after this function as soon as synchronization -/// is required -pub unsafe fn unchecked_unsigned_overflowing_sub_integer_radix_kb_assign_async< - T: UnsignedInteger, - B: Numeric, ->( - streams: &CudaStreams, - ct_res: &mut CudaVec, - ct_overflowed: &mut CudaVec, - lhs: &CudaVec, - rhs: &CudaVec, - bootstrapping_key: &CudaVec, - keyswitch_key: &CudaVec, - message_modulus: MessageModulus, - carry_modulus: CarryModulus, - glwe_dimension: GlweDimension, - polynomial_size: PolynomialSize, - big_lwe_dimension: LweDimension, - small_lwe_dimension: LweDimension, - ks_level: DecompositionLevelCount, - ks_base_log: DecompositionBaseLog, - pbs_level: DecompositionLevelCount, - pbs_base_log: DecompositionBaseLog, - num_blocks: u32, - pbs_type: PBSType, - grouping_factor: LweBskGroupingFactor, -) { - assert_eq!( - streams.gpu_indexes[0], - ct_res.gpu_index(0), - "GPU error: all data should reside on the same GPU." - ); - assert_eq!( - streams.gpu_indexes[0], - ct_overflowed.gpu_index(0), - "GPU error: all data should reside on the same GPU." - ); - assert_eq!( - streams.gpu_indexes[0], - lhs.gpu_index(0), - "GPU error: all data should reside on the same GPU." - ); - assert_eq!( - streams.gpu_indexes[0], - rhs.gpu_index(0), - "GPU error: all data should reside on the same GPU." - ); - assert_eq!( - streams.gpu_indexes[0], - bootstrapping_key.gpu_index(0), - "GPU error: all data should reside on the same GPU." - ); - assert_eq!( - streams.gpu_indexes[0], - keyswitch_key.gpu_index(0), - "GPU error: all data should reside on the same GPU." 
- ); - let mut mem_ptr: *mut i8 = std::ptr::null_mut(); - scratch_cuda_integer_radix_overflowing_sub_kb_64( - streams.ptr.as_ptr(), - streams.gpu_indexes.as_ptr(), - streams.len() as u32, - std::ptr::addr_of_mut!(mem_ptr), - glwe_dimension.0 as u32, - polynomial_size.0 as u32, - big_lwe_dimension.0 as u32, - small_lwe_dimension.0 as u32, - ks_level.0 as u32, - ks_base_log.0 as u32, - pbs_level.0 as u32, - pbs_base_log.0 as u32, - grouping_factor.0 as u32, - num_blocks, - message_modulus.0 as u32, - carry_modulus.0 as u32, - pbs_type as u32, - true, - ); - cuda_integer_radix_overflowing_sub_kb_64( - streams.ptr.as_ptr(), - streams.gpu_indexes.as_ptr(), - streams.len() as u32, - ct_res.as_mut_c_ptr(0), - ct_overflowed.as_mut_c_ptr(0), - lhs.as_c_ptr(0), - rhs.as_c_ptr(0), - mem_ptr, - bootstrapping_key.ptr.as_ptr(), - keyswitch_key.ptr.as_ptr(), - num_blocks, - ); - cleanup_cuda_integer_radix_overflowing_sub( - streams.ptr.as_ptr(), - streams.gpu_indexes.as_ptr(), - streams.len() as u32, - std::ptr::addr_of_mut!(mem_ptr), - ); -} +// #[allow(clippy::too_many_arguments)] +// /// # Safety +// /// +// /// - [CudaStreams::synchronize] __must__ be called after this function as soon as +// synchronization /// is required +// pub unsafe fn unchecked_unsigned_overflowing_sub_integer_radix_kb_assign_async< +// T: UnsignedInteger, +// B: Numeric, +// >( +// streams: &CudaStreams, +// ct_res: &mut CudaVec, +// ct_overflowed: &mut CudaVec, +// lhs: &CudaVec, +// rhs: &CudaVec, +// bootstrapping_key: &CudaVec, +// keyswitch_key: &CudaVec, +// message_modulus: MessageModulus, +// carry_modulus: CarryModulus, +// glwe_dimension: GlweDimension, +// polynomial_size: PolynomialSize, +// big_lwe_dimension: LweDimension, +// small_lwe_dimension: LweDimension, +// ks_level: DecompositionLevelCount, +// ks_base_log: DecompositionBaseLog, +// pbs_level: DecompositionLevelCount, +// pbs_base_log: DecompositionBaseLog, +// num_blocks: u32, +// pbs_type: PBSType, +// grouping_factor: LweBskGroupingFactor, +// ) { +// assert_eq!( +// streams.gpu_indexes[0], +// ct_res.gpu_index(0), +// "GPU error: all data should reside on the same GPU." +// ); +// assert_eq!( +// streams.gpu_indexes[0], +// ct_overflowed.gpu_index(0), +// "GPU error: all data should reside on the same GPU." +// ); +// assert_eq!( +// streams.gpu_indexes[0], +// lhs.gpu_index(0), +// "GPU error: all data should reside on the same GPU." +// ); +// assert_eq!( +// streams.gpu_indexes[0], +// rhs.gpu_index(0), +// "GPU error: all data should reside on the same GPU." +// ); +// assert_eq!( +// streams.gpu_indexes[0], +// bootstrapping_key.gpu_index(0), +// "GPU error: all data should reside on the same GPU." +// ); +// assert_eq!( +// streams.gpu_indexes[0], +// keyswitch_key.gpu_index(0), +// "GPU error: all data should reside on the same GPU." 
+// ); +// let mut mem_ptr: *mut i8 = std::ptr::null_mut(); +// scratch_cuda_integer_radix_overflowing_sub_kb_64( +// streams.ptr.as_ptr(), +// streams.gpu_indexes.as_ptr(), +// streams.len() as u32, +// std::ptr::addr_of_mut!(mem_ptr), +// glwe_dimension.0 as u32, +// polynomial_size.0 as u32, +// big_lwe_dimension.0 as u32, +// small_lwe_dimension.0 as u32, +// ks_level.0 as u32, +// ks_base_log.0 as u32, +// pbs_level.0 as u32, +// pbs_base_log.0 as u32, +// grouping_factor.0 as u32, +// num_blocks, +// message_modulus.0 as u32, +// carry_modulus.0 as u32, +// pbs_type as u32, +// true, +// ); +// cuda_integer_radix_overflowing_sub_kb_64( +// streams.ptr.as_ptr(), +// streams.gpu_indexes.as_ptr(), +// streams.len() as u32, +// ct_res.as_mut_c_ptr(0), +// ct_overflowed.as_mut_c_ptr(0), +// lhs.as_c_ptr(0), +// rhs.as_c_ptr(0), +// mem_ptr, +// bootstrapping_key.ptr.as_ptr(), +// keyswitch_key.ptr.as_ptr(), +// num_blocks, +// ); +// cleanup_cuda_integer_radix_overflowing_sub( +// streams.ptr.as_ptr(), +// streams.gpu_indexes.as_ptr(), +// streams.len() as u32, +// std::ptr::addr_of_mut!(mem_ptr), +// ); +// } #[allow(clippy::too_many_arguments)] /// # Safety @@ -2770,3 +2894,95 @@ pub unsafe fn reverse_blocks_inplace_async( ); } } + +#[allow(clippy::too_many_arguments)] +/// # Safety +/// +/// - [CudaStreams::synchronize] __must__ be called after this function as soon as synchronization +/// is required +pub(crate) unsafe fn unchecked_unsigned_overflowing_sub_integer_radix_kb_assign_async< + T: UnsignedInteger, + B: Numeric, +>( + streams: &CudaStreams, + radix_lwe_input: &mut CudaVec, + radix_rhs_input: &CudaVec, + carry_out: &mut CudaVec, + carry_in: &CudaVec, + bootstrapping_key: &CudaVec, + keyswitch_key: &CudaVec, + lwe_dimension: LweDimension, + glwe_dimension: GlweDimension, + polynomial_size: PolynomialSize, + ks_level: DecompositionLevelCount, + ks_base_log: DecompositionBaseLog, + pbs_level: DecompositionLevelCount, + pbs_base_log: DecompositionBaseLog, + num_blocks: u32, + message_modulus: MessageModulus, + carry_modulus: CarryModulus, + pbs_type: PBSType, + grouping_factor: LweBskGroupingFactor, + compute_overflow: bool, + uses_input_borrow: u32, +) { + assert_eq!( + streams.gpu_indexes[0], + radix_lwe_input.gpu_index(0), + "GPU error: all data should reside on the same GPU." + ); + assert_eq!( + streams.gpu_indexes[0], + bootstrapping_key.gpu_index(0), + "GPU error: all data should reside on the same GPU." + ); + assert_eq!( + streams.gpu_indexes[0], + keyswitch_key.gpu_index(0), + "GPU error: all data should reside on the same GPU." 
+ ); + let mut mem_ptr: *mut i8 = std::ptr::null_mut(); + let big_lwe_dimension: u32 = glwe_dimension.0 as u32 * polynomial_size.0 as u32; + scratch_cuda_integer_overflowing_sub_kb_64_inplace( + streams.ptr.as_ptr(), + streams.gpu_indexes.as_ptr(), + streams.len() as u32, + std::ptr::addr_of_mut!(mem_ptr), + glwe_dimension.0 as u32, + polynomial_size.0 as u32, + big_lwe_dimension, + lwe_dimension.0 as u32, + ks_level.0 as u32, + ks_base_log.0 as u32, + pbs_level.0 as u32, + pbs_base_log.0 as u32, + grouping_factor.0 as u32, + num_blocks, + message_modulus.0 as u32, + carry_modulus.0 as u32, + pbs_type as u32, + compute_overflow as u32, + true, + ); + cuda_integer_overflowing_sub_kb_64_inplace( + streams.ptr.as_ptr(), + streams.gpu_indexes.as_ptr(), + streams.len() as u32, + radix_lwe_input.as_mut_c_ptr(0), + radix_rhs_input.as_c_ptr(0), + carry_out.as_mut_c_ptr(0), + carry_in.as_c_ptr(0), + mem_ptr, + bootstrapping_key.ptr.as_ptr(), + keyswitch_key.ptr.as_ptr(), + num_blocks, + compute_overflow as u32, + uses_input_borrow as u32, + ); + cleanup_cuda_integer_overflowing_sub( + streams.ptr.as_ptr(), + streams.gpu_indexes.as_ptr(), + streams.len() as u32, + std::ptr::addr_of_mut!(mem_ptr), + ); +} diff --git a/tfhe/src/integer/gpu/server_key/radix/add.rs b/tfhe/src/integer/gpu/server_key/radix/add.rs index 3d746230a6..cbf6c12777 100644 --- a/tfhe/src/integer/gpu/server_key/radix/add.rs +++ b/tfhe/src/integer/gpu/server_key/radix/add.rs @@ -8,9 +8,11 @@ use crate::integer::gpu::ciphertext::{ use crate::integer::gpu::server_key::{CudaBootstrappingKey, CudaServerKey}; use crate::integer::gpu::{ unchecked_add_integer_radix_assign_async, + unchecked_add_integer_radix_assign_with_packing_async, unchecked_partial_sum_ciphertexts_integer_radix_kb_assign_async, unchecked_signed_overflowing_add_or_sub_radix_kb_assign_async, PBSType, }; +use crate::integer::server_key::radix_parallel::OutputFlag; use crate::shortint::ciphertext::NoiseLevel; #[derive(Copy, Clone, PartialEq, Eq)] @@ -115,7 +117,8 @@ impl CudaServerKey { } }; self.unchecked_add_assign_async(lhs, rhs, streams); - let _carry = self.propagate_single_carry_assign_async(lhs, streams); + let _carry = + self.new_propagate_single_carry_assign_async(lhs, streams, None, OutputFlag::None); } pub fn add_assign( @@ -173,6 +176,18 @@ impl CudaServerKey { result } + pub fn unchecked_add_with_packing( + &self, + ct_left: &T, + ct_right: &T, + streams: &CudaStreams, + ) -> T { + let mut result = unsafe { ct_left.duplicate_async(streams) }; + self.unchecked_add_assign_with_packing(&mut result, ct_right, streams); + + result + } + /// # Safety /// /// - `stream` __must__ be synchronized to guarantee computation has finished, and inputs must @@ -215,6 +230,49 @@ impl CudaServerKey { ciphertext_left.info = ciphertext_left.info.after_add(&ciphertext_right.info); } + /// # Safety + /// + /// - `stream` __must__ be synchronized to guarantee computation has finished, and inputs must + /// not be dropped until stream is synchronised + pub unsafe fn unchecked_add_assign_with_packing_async( + &self, + ct_left: &mut T, + ct_right: &T, + streams: &CudaStreams, + ) { + let ciphertext_left = ct_left.as_mut(); + let ciphertext_right = ct_right.as_ref(); + assert_eq!( + ciphertext_left.d_blocks.lwe_dimension(), + ciphertext_right.d_blocks.lwe_dimension(), + "Mismatched lwe dimension between ct_left ({:?}) and ct_right ({:?})", + ciphertext_left.d_blocks.lwe_dimension(), + ciphertext_right.d_blocks.lwe_dimension() + ); + + assert_eq!( + 
ciphertext_left.d_blocks.ciphertext_modulus(), + ciphertext_right.d_blocks.ciphertext_modulus(), + "Mismatched moduli between ct_left ({:?}) and ct_right ({:?})", + ciphertext_left.d_blocks.ciphertext_modulus(), + ciphertext_right.d_blocks.ciphertext_modulus() + ); + + let lwe_dimension = ciphertext_left.d_blocks.lwe_dimension(); + let lwe_ciphertext_count = ciphertext_left.d_blocks.lwe_ciphertext_count(); + + unchecked_add_integer_radix_assign_with_packing_async( + streams, + &mut ciphertext_left.d_blocks.0.d_vec, + &ciphertext_right.d_blocks.0.d_vec, + lwe_dimension, + lwe_ciphertext_count.0 as u32, + self.message_modulus.0 as u32, + ); + + ciphertext_left.info = ciphertext_left.info.after_add(&ciphertext_right.info); + } + pub fn unchecked_add_assign( &self, ct_left: &mut T, @@ -227,6 +285,18 @@ impl CudaServerKey { streams.synchronize(); } + pub fn unchecked_add_assign_with_packing( + &self, + ct_left: &mut T, + ct_right: &T, + streams: &CudaStreams, + ) { + unsafe { + self.unchecked_add_assign_with_packing_async(ct_left, ct_right, streams); + } + streams.synchronize(); + } + /// # Safety /// /// - `stream` __must__ be synchronized to guarantee computation has finished, and inputs must @@ -348,7 +418,7 @@ impl CudaServerKey { .unchecked_partial_sum_ciphertexts_async(ciphertexts, streams) .unwrap(); - self.propagate_single_carry_assign_async(&mut result, streams); + self.new_propagate_single_carry_assign_async(&mut result, streams, None, OutputFlag::None); assert!(result.block_carries_are_empty()); result } @@ -535,8 +605,53 @@ impl CudaServerKey { rhs: &CudaUnsignedRadixCiphertext, stream: &CudaStreams, ) -> (CudaUnsignedRadixCiphertext, CudaBooleanBlock) { - let mut ct_res = self.unchecked_add(lhs, rhs, stream); - let mut carry_out = self.propagate_single_carry_assign_async(&mut ct_res, stream); + let output_flag = OutputFlag::from_signedness(CudaUnsignedRadixCiphertext::IS_SIGNED); + + let mut ct_res = match output_flag { + OutputFlag::Overflow => self.unchecked_add_with_packing(lhs, rhs, stream), + _ => self.unchecked_add(lhs, rhs, stream), + }; + + let mut carry_out = + self.new_propagate_single_carry_assign_async(&mut ct_res, stream, None, output_flag); + + ct_res.as_mut().info = ct_res + .as_ref() + .info + .after_overflowing_add(&rhs.as_ref().info); + + if lhs.as_ref().info.blocks.last().unwrap().noise_level == NoiseLevel::ZERO + && rhs.as_ref().info.blocks.last().unwrap().noise_level == NoiseLevel::ZERO + { + carry_out.as_mut().info = carry_out.as_ref().info.boolean_info(NoiseLevel::ZERO); + } else { + carry_out.as_mut().info = carry_out.as_ref().info.boolean_info(NoiseLevel::NOMINAL); + } + + let ct_overflowed = CudaBooleanBlock::from_cuda_radix_ciphertext(carry_out.ciphertext); + + (ct_res, ct_overflowed) + } + + /// # Safety + /// + /// - `stream` __must__ be synchronized to guarantee computation has finished, and inputs must + /// not be dropped until stream is synchronised + pub unsafe fn unchecked_signed_overflowing_add_async( + &self, + lhs: &CudaSignedRadixCiphertext, + rhs: &CudaSignedRadixCiphertext, + stream: &CudaStreams, + ) -> (CudaSignedRadixCiphertext, CudaBooleanBlock) { + let output_flag = OutputFlag::from_signedness(CudaSignedRadixCiphertext::IS_SIGNED); + + let mut ct_res = match output_flag { + OutputFlag::Overflow => self.unchecked_add_with_packing(lhs, rhs, stream), + _ => self.unchecked_add(lhs, rhs, stream), + }; + + let mut carry_out = + self.new_propagate_single_carry_assign_async(&mut ct_res, stream, None, output_flag); ct_res.as_mut().info = ct_res 
.as_ref() @@ -661,6 +776,7 @@ impl CudaServerKey { SignedOperation::Addition, stream, ) + //} } pub(crate) fn unchecked_signed_overflowing_add_or_sub( diff --git a/tfhe/src/integer/gpu/server_key/radix/mod.rs b/tfhe/src/integer/gpu/server_key/radix/mod.rs index 32daab51ce..7ed82650f2 100644 --- a/tfhe/src/integer/gpu/server_key/radix/mod.rs +++ b/tfhe/src/integer/gpu/server_key/radix/mod.rs @@ -6,6 +6,7 @@ use crate::core_crypto::prelude::{ ContiguousEntityContainerMut, LweBskGroupingFactor, LweCiphertextCount, }; use crate::integer::block_decomposition::{BlockDecomposer, DecomposableInto}; +use crate::integer::gpu::ciphertext::boolean_value::CudaBooleanBlock; use crate::integer::gpu::ciphertext::info::{CudaBlockInfo, CudaRadixCiphertextInfo}; use crate::integer::gpu::ciphertext::{ CudaIntegerRadixCiphertext, CudaRadixCiphertext, CudaSignedRadixCiphertext, @@ -14,9 +15,10 @@ use crate::integer::gpu::ciphertext::{ use crate::integer::gpu::server_key::CudaBootstrappingKey; use crate::integer::gpu::{ apply_many_univariate_lut_kb_async, apply_univariate_lut_kb_async, full_propagate_assign_async, - propagate_single_carry_assign_async, propagate_single_carry_get_input_carries_assign_async, - CudaServerKey, PBSType, + propagate_fast_single_carry_assign_async, propagate_single_carry_assign_async, + propagate_single_carry_get_input_carries_assign_async, CudaServerKey, PBSType, }; +use crate::integer::server_key::radix_parallel::OutputFlag; use crate::shortint::ciphertext::{Degree, NoiseLevel}; use crate::shortint::engine::{fill_accumulator, fill_many_lut_accumulator}; use crate::shortint::server_key::{ @@ -264,6 +266,110 @@ impl CudaServerKey { carry_out } + /// # Safety + /// + /// - `streams` __must__ be synchronized to guarantee computation has finished, and inputs must + /// not be dropped until streams is synchronized + pub(crate) unsafe fn propagate_fast_single_carry_assign_async( + &self, + ct: &mut T, + streams: &CudaStreams, + input_carry: Option<&CudaBooleanBlock>, + requested_flag: OutputFlag, + ) -> T + where + T: CudaIntegerRadixCiphertext, + { + let mut carry_out: T = self.create_trivial_zero_radix(1, streams); + let ciphertext = ct.as_mut(); + let num_blocks = ciphertext.d_blocks.lwe_ciphertext_count().0 as u32; + let uses_carry = match input_carry { + Some(_block) => 1u32, + None => 0u32, + }; + let mut aux_block: T = self.create_trivial_zero_radix(1, streams); + let in_carry_dvec = match input_carry { + Some(block) => &block.0.ciphertext.d_blocks.0.d_vec, + None => &aux_block.as_mut().d_blocks.0.d_vec, + }; + + match &self.bootstrapping_key { + CudaBootstrappingKey::Classic(d_bsk) => { + propagate_fast_single_carry_assign_async( + streams, + &mut ciphertext.d_blocks.0.d_vec, + &mut carry_out.as_mut().d_blocks.0.d_vec, + in_carry_dvec, + &d_bsk.d_vec, + &self.key_switching_key.d_vec, + d_bsk.input_lwe_dimension(), + d_bsk.glwe_dimension(), + d_bsk.polynomial_size(), + self.key_switching_key.decomposition_level_count(), + self.key_switching_key.decomposition_base_log(), + d_bsk.decomp_level_count(), + d_bsk.decomp_base_log(), + num_blocks, + ciphertext.info.blocks.first().unwrap().message_modulus, + ciphertext.info.blocks.first().unwrap().carry_modulus, + PBSType::Classical, + LweBskGroupingFactor(0), + requested_flag, + uses_carry, + ); + } + CudaBootstrappingKey::MultiBit(d_multibit_bsk) => { + propagate_fast_single_carry_assign_async( + streams, + &mut ciphertext.d_blocks.0.d_vec, + &mut carry_out.as_mut().d_blocks.0.d_vec, + in_carry_dvec, + &d_multibit_bsk.d_vec, + 
diff --git a/tfhe/src/integer/gpu/server_key/radix/neg.rs b/tfhe/src/integer/gpu/server_key/radix/neg.rs
index d7156919cf..77c7ec2762 100644
--- a/tfhe/src/integer/gpu/server_key/radix/neg.rs
+++ b/tfhe/src/integer/gpu/server_key/radix/neg.rs
@@ -1,6 +1,7 @@
 use crate::core_crypto::gpu::{negate_integer_radix_async, CudaStreams};
 use crate::integer::gpu::ciphertext::CudaIntegerRadixCiphertext;
 use crate::integer::gpu::server_key::CudaServerKey;
+use crate::integer::server_key::radix_parallel::OutputFlag;

 impl CudaServerKey {
     /// Homomorphically computes the opposite of a ciphertext encrypting an integer message.
@@ -144,7 +145,8 @@ impl CudaServerKey {
         };

         let mut res = self.unchecked_neg_async(ct, streams);
-        let _carry = self.propagate_single_carry_assign_async(&mut res, streams);
+        let _carry =
+            self.new_propagate_single_carry_assign_async(&mut res, streams, None, OutputFlag::None);
         res
     }
 }
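In `neg.rs` the returned carry is simply discarded (`OutputFlag::None`): negation leaves blocks that can exceed the message modulus, and one propagation pass renormalises them. A clear-text sketch of radix negation as digit-wise complement plus a single carry propagation (a model of the semantics only, not the kernel's actual per-block trick):

/// -x over n radix blocks: complement every digit and add one, which yields
/// B^n - x, then flush the carries once (the carry out of the last block is
/// dropped, matching `OutputFlag::None` above).
fn neg_blocks(blocks: &[u64], message_modulus: u64) -> Vec<u64> {
    let mut result: Vec<u64> = blocks.iter().map(|&b| message_modulus - 1 - b).collect();
    result[0] += 1;

    let mut carry = 0;
    for block in result.iter_mut() {
        let sum = *block + carry;
        *block = sum % message_modulus;
        carry = sum / message_modulus;
    }
    result
}

fn main() {
    // Over 4 blocks of 2 bits (base 4, little-endian): -1 is 255 = [3, 3, 3, 3].
    assert_eq!(neg_blocks(&[1, 0, 0, 0], 4), vec![3, 3, 3, 3]);
    // -0 stays 0: the +1 carry flushes all the way out and is discarded.
    assert_eq!(neg_blocks(&[0, 0, 0, 0], 4), vec![0, 0, 0, 0]);
}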
diff --git a/tfhe/src/integer/gpu/server_key/radix/scalar_add.rs b/tfhe/src/integer/gpu/server_key/radix/scalar_add.rs
index 7c30c789bb..2384c94359 100644
--- a/tfhe/src/integer/gpu/server_key/radix/scalar_add.rs
+++ b/tfhe/src/integer/gpu/server_key/radix/scalar_add.rs
@@ -8,6 +8,7 @@ use crate::integer::gpu::ciphertext::{
 };
 use crate::integer::gpu::scalar_addition_integer_radix_assign_async;
 use crate::integer::gpu::server_key::CudaServerKey;
+use crate::integer::server_key::radix_parallel::OutputFlag;
 use crate::prelude::CastInto;
 use crate::shortint::ciphertext::NoiseLevel;

@@ -186,7 +187,8 @@ impl CudaServerKey {
         };

         self.unchecked_scalar_add_assign_async(ct, scalar, streams);
-        let _carry = self.propagate_single_carry_assign_async(ct, streams);
+        let _carry =
+            self.new_propagate_single_carry_assign_async(ct, streams, None, OutputFlag::None);
     }

     pub fn scalar_add_assign<Scalar, T>(&self, ct: &mut T, scalar: Scalar, streams: &CudaStreams)
diff --git a/tfhe/src/integer/gpu/server_key/radix/scalar_sub.rs b/tfhe/src/integer/gpu/server_key/radix/scalar_sub.rs
index d117927343..34161ab9c5 100644
--- a/tfhe/src/integer/gpu/server_key/radix/scalar_sub.rs
+++ b/tfhe/src/integer/gpu/server_key/radix/scalar_sub.rs
@@ -4,6 +4,7 @@ use crate::integer::block_decomposition::{BlockDecomposer, DecomposableInto};
 use crate::integer::gpu::ciphertext::boolean_value::CudaBooleanBlock;
 use crate::integer::gpu::ciphertext::{CudaIntegerRadixCiphertext, CudaSignedRadixCiphertext};
 use crate::integer::gpu::server_key::CudaServerKey;
+use crate::integer::server_key::radix_parallel::OutputFlag;
 use crate::integer::server_key::TwosComplementNegation;
 use crate::prelude::CastInto;

@@ -151,7 +152,8 @@ impl CudaServerKey {
         };

         self.unchecked_scalar_sub_assign_async(ct, scalar, stream);
-        let _carry = self.propagate_single_carry_assign_async(ct, stream);
+        let _carry =
+            self.new_propagate_single_carry_assign_async(ct, stream, None, OutputFlag::None);
     }

     pub fn scalar_sub_assign<Scalar, T>(&self, ct: &mut T, scalar: Scalar, stream: &CudaStreams)
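The scalar paths follow the same pattern as the ciphertext ones: an unchecked block-wise operation followed by exactly one carry propagation with `OutputFlag::None`. On clear values the sequence looks like this (illustrative sketch; the real scalar paths must additionally respect the carry space allowed by the parameters):

/// Decompose the scalar into radix digits, add them block-wise without
/// propagation (the "unchecked" step), then run a single carry-propagation
/// pass so every block fits below `message_modulus` again.
fn scalar_add_assign_blocks(blocks: &mut [u64], scalar: u64, message_modulus: u64) {
    let mut remaining = scalar;
    for block in blocks.iter_mut() {
        *block += remaining % message_modulus;
        remaining /= message_modulus;
    }
    // Single carry propagation; the final carry is discarded (OutputFlag::None).
    let mut carry = 0;
    for block in blocks.iter_mut() {
        let sum = *block + carry;
        *block = sum % message_modulus;
        carry = sum / message_modulus;
    }
}

fn main() {
    let mut blocks = [3, 2, 0, 0]; // 11 in base 4, little-endian
    scalar_add_assign_blocks(&mut blocks, 7, 4); // 11 + 7 = 18
    assert_eq!(blocks, [2, 0, 1, 0]);
}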
diff --git a/tfhe/src/integer/gpu/server_key/radix/sub.rs b/tfhe/src/integer/gpu/server_key/radix/sub.rs
index 8e784a3686..5964a22331 100644
--- a/tfhe/src/integer/gpu/server_key/radix/sub.rs
+++ b/tfhe/src/integer/gpu/server_key/radix/sub.rs
@@ -1,18 +1,18 @@
 use super::add::SignedOperation;
-use crate::core_crypto::gpu::lwe_ciphertext_list::CudaLweCiphertextList;
 use crate::core_crypto::gpu::CudaStreams;
-use crate::core_crypto::prelude::{CiphertextModulus, LweBskGroupingFactor, LweCiphertextCount};
 use crate::integer::gpu::ciphertext::boolean_value::CudaBooleanBlock;
-use crate::integer::gpu::ciphertext::info::CudaRadixCiphertextInfo;
 use crate::integer::gpu::ciphertext::{
-    CudaIntegerRadixCiphertext, CudaRadixCiphertext, CudaSignedRadixCiphertext,
-    CudaUnsignedRadixCiphertext,
+    CudaIntegerRadixCiphertext, CudaSignedRadixCiphertext, CudaUnsignedRadixCiphertext,
 };
-use crate::integer::gpu::server_key::{CudaBootstrappingKey, CudaServerKey};
+use crate::integer::gpu::server_key::CudaServerKey;
+
+use crate::integer::gpu::server_key::CudaBootstrappingKey;
 use crate::integer::gpu::{
     unchecked_unsigned_overflowing_sub_integer_radix_kb_assign_async, PBSType,
 };
+use crate::integer::server_key::radix_parallel::OutputFlag;
 use crate::shortint::ciphertext::NoiseLevel;
+use crate::shortint::parameters::{Degree, LweBskGroupingFactor};

 impl CudaServerKey {
     /// Computes homomorphically a subtraction between two ciphertexts encrypting integer values.
@@ -272,7 +272,8 @@ impl CudaServerKey {
         };

         self.unchecked_sub_assign_async(lhs, rhs, streams);
-        let _carry = self.propagate_single_carry_assign_async(lhs, streams);
+        let _carry =
+            self.new_propagate_single_carry_assign_async(lhs, streams, None, OutputFlag::None);
     }

     pub fn unsigned_overflowing_sub(
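The hunk that follows rewires `unchecked_unsigned_overflowing_sub_async` so the kernel itself produces the overflow block (`compute_overflow`) and can consume an input borrow (`uses_input_borrow`), instead of assembling a boolean block on the Rust side. For reference, the clear-text semantics being delegated (illustrative only, not the GPU implementation):

/// Borrow propagation over radix blocks with an optional input borrow; the
/// borrow leaving the last block is the unsigned overflow indicator.
fn overflowing_sub_blocks(
    lhs: &[u64],
    rhs: &[u64],
    message_modulus: u64,
    input_borrow: bool,
) -> (Vec<u64>, bool) {
    assert_eq!(lhs.len(), rhs.len());
    let mut result = Vec::with_capacity(lhs.len());
    let mut borrow = u64::from(input_borrow);
    for (&a, &b) in lhs.iter().zip(rhs.iter()) {
        let needed = b + borrow;
        if a >= needed {
            result.push(a - needed);
            borrow = 0;
        } else {
            result.push(a + message_modulus - needed);
            borrow = 1;
        }
    }
    (result, borrow != 0) // borrow out of the last block <=> overflow
}

fn main() {
    // 2 - 3 on 4 blocks of 2 bits wraps around and reports an overflow.
    let (diff, overflowed) = overflowing_sub_blocks(&[2, 0, 0, 0], &[3, 0, 0, 0], 4, false);
    assert_eq!(diff, [3, 3, 3, 3]); // 255, i.e. -1 mod 256
    assert!(overflowed);
}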
@@ -353,87 +354,104 @@ impl CudaServerKey {
         rhs: &CudaUnsignedRadixCiphertext,
         stream: &CudaStreams,
     ) -> (CudaUnsignedRadixCiphertext, CudaBooleanBlock) {
-        let num_blocks = lhs.as_ref().d_blocks.lwe_ciphertext_count().0 as u32;
-        let mut tmp: CudaUnsignedRadixCiphertext = self.create_trivial_zero_radix(1, stream);
-        if lhs.as_ref().info.blocks.last().unwrap().noise_level == NoiseLevel::ZERO
-            && rhs.as_ref().info.blocks.last().unwrap().noise_level == NoiseLevel::ZERO
-        {
-            tmp.as_mut().info = tmp.as_ref().info.boolean_info(NoiseLevel::ZERO);
-        } else {
-            tmp.as_mut().info = tmp.as_ref().info.boolean_info(NoiseLevel::NOMINAL);
-        }
         let mut ct_res = lhs.duplicate_async(stream);
-        let block = CudaLweCiphertextList::new(
-            tmp.as_ref().d_blocks.lwe_dimension(),
-            LweCiphertextCount(1),
-            CiphertextModulus::new_native(),
-            stream,
-        );
-        let block_info = tmp.as_ref().info.blocks[0];
-        let ct_info = vec![block_info];
-        let ct_info = CudaRadixCiphertextInfo { blocks: ct_info };
-        let mut ct_overflowed =
-            CudaBooleanBlock::from_cuda_radix_ciphertext(CudaRadixCiphertext::new(block, ct_info));
+        let compute_overflow = true;
+        const INPUT_BORROW: Option<&CudaBooleanBlock> = None;
+
+        let mut overflow_block: CudaUnsignedRadixCiphertext =
+            self.create_trivial_zero_radix(1, stream);
+        let ciphertext = ct_res.as_mut();
+        let num_blocks = ciphertext.d_blocks.lwe_ciphertext_count().0 as u32;
+        let uses_input_borrow = match INPUT_BORROW {
+            Some(_block) => 1u32,
+            None => 0u32,
+        };
+        let mut aux_block: CudaUnsignedRadixCiphertext = self.create_trivial_zero_radix(1, stream);
+        let in_carry_dvec = match INPUT_BORROW {
+            Some(block) => &block.0.ciphertext.d_blocks.0.d_vec,
+            None => &aux_block.as_mut().d_blocks.0.d_vec,
+        };

         match &self.bootstrapping_key {
             CudaBootstrappingKey::Classic(d_bsk) => {
                 unchecked_unsigned_overflowing_sub_integer_radix_kb_assign_async(
                     stream,
-                    &mut ct_res.as_mut().d_blocks.0.d_vec,
-                    &mut ct_overflowed.as_mut().ciphertext.d_blocks.0.d_vec,
-                    &lhs.as_ref().d_blocks.0.d_vec,
+                    &mut ciphertext.d_blocks.0.d_vec,
                     &rhs.as_ref().d_blocks.0.d_vec,
+                    &mut overflow_block.as_mut().d_blocks.0.d_vec,
+                    in_carry_dvec,
                     &d_bsk.d_vec,
                     &self.key_switching_key.d_vec,
-                    self.message_modulus,
-                    self.carry_modulus,
-                    d_bsk.glwe_dimension,
-                    d_bsk.polynomial_size,
-                    self.key_switching_key
-                        .input_key_lwe_size()
-                        .to_lwe_dimension(),
-                    self.key_switching_key
-                        .output_key_lwe_size()
-                        .to_lwe_dimension(),
+                    d_bsk.input_lwe_dimension(),
+                    d_bsk.glwe_dimension(),
+                    d_bsk.polynomial_size(),
                     self.key_switching_key.decomposition_level_count(),
                     self.key_switching_key.decomposition_base_log(),
-                    d_bsk.decomp_level_count,
-                    d_bsk.decomp_base_log,
+                    d_bsk.decomp_level_count(),
+                    d_bsk.decomp_base_log(),
                     num_blocks,
+                    ciphertext.info.blocks.first().unwrap().message_modulus,
+                    ciphertext.info.blocks.first().unwrap().carry_modulus,
                     PBSType::Classical,
                     LweBskGroupingFactor(0),
+                    compute_overflow,
+                    uses_input_borrow,
                 );
             }
             CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
                 unchecked_unsigned_overflowing_sub_integer_radix_kb_assign_async(
                     stream,
-                    &mut ct_res.as_mut().d_blocks.0.d_vec,
-                    &mut ct_overflowed.as_mut().ciphertext.d_blocks.0.d_vec,
-                    &lhs.as_ref().d_blocks.0.d_vec,
+                    &mut ciphertext.d_blocks.0.d_vec,
                     &rhs.as_ref().d_blocks.0.d_vec,
+                    &mut overflow_block.as_mut().d_blocks.0.d_vec,
+                    in_carry_dvec,
                     &d_multibit_bsk.d_vec,
                     &self.key_switching_key.d_vec,
-                    self.message_modulus,
-                    self.carry_modulus,
-                    d_multibit_bsk.glwe_dimension,
-                    d_multibit_bsk.polynomial_size,
-                    self.key_switching_key
-                        .input_key_lwe_size()
-                        .to_lwe_dimension(),
-                    self.key_switching_key
-                        .output_key_lwe_size()
-                        .to_lwe_dimension(),
+                    d_multibit_bsk.input_lwe_dimension(),
+                    d_multibit_bsk.glwe_dimension(),
+                    d_multibit_bsk.polynomial_size(),
                     self.key_switching_key.decomposition_level_count(),
                     self.key_switching_key.decomposition_base_log(),
-                    d_multibit_bsk.decomp_level_count,
-                    d_multibit_bsk.decomp_base_log,
+                    d_multibit_bsk.decomp_level_count(),
+                    d_multibit_bsk.decomp_base_log(),
                     num_blocks,
+                    ciphertext.info.blocks.first().unwrap().message_modulus,
+                    ciphertext.info.blocks.first().unwrap().carry_modulus,
                     PBSType::MultiBit,
                     d_multibit_bsk.grouping_factor,
+                    compute_overflow,
+                    uses_input_borrow,
                 );
             }
         };
+        ciphertext.info.blocks.iter_mut().for_each(|b| {
+            b.degree = Degree::new(b.message_modulus.0 - 1);
+            b.noise_level = NoiseLevel::NOMINAL;
+        });
+        overflow_block
+            .as_mut()
+            .info
+            .blocks
+            .iter_mut()
+            .for_each(|b| {
+                b.degree = Degree::new(1);
+                b.noise_level = NoiseLevel::ZERO;
+            });
+
+        if lhs.as_ref().info.blocks.last().unwrap().noise_level == NoiseLevel::ZERO
+            && rhs.as_ref().info.blocks.last().unwrap().noise_level == NoiseLevel::ZERO
+        {
+            overflow_block.as_mut().info =
+                overflow_block.as_ref().info.boolean_info(NoiseLevel::ZERO);
+        } else {
+            overflow_block.as_mut().info = overflow_block
+                .as_ref()
+                .info
+                .boolean_info(NoiseLevel::NOMINAL);
+        }
+
+        let ct_overflowed = CudaBooleanBlock::from_cuda_radix_ciphertext(overflow_block.ciphertext);

         ct_res.as_mut().info = ct_res
             .as_ref()
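The metadata updates at the end of the new path encode simple post-conditions: after propagation every block fits in the message space again (degree at most `message_modulus - 1`), the overflow block is boolean (degree 1), and it can only be tagged `NoiseLevel::ZERO` when both operands' most significant blocks were trivial. A minimal clear-text mirror of those checks (illustrative only; the boolean flags stand in for the noise-level test on the operands' last blocks):

fn check_outputs(
    result: &[u64],
    overflow: u64,
    message_modulus: u64,
    lhs_top_is_trivial: bool,
    rhs_top_is_trivial: bool,
) -> bool {
    assert!(result.iter().all(|&b| b < message_modulus)); // degree <= message_modulus - 1
    assert!(overflow <= 1); // the overflow block is boolean (degree 1)
    // Returns whether the overflow flag may be marked as noiseless.
    lhs_top_is_trivial && rhs_top_is_trivial
}

fn main() {
    assert!(check_outputs(&[3, 3, 3, 3], 1, 4, true, true));
    assert!(!check_outputs(&[0, 0, 0, 0], 0, 4, true, false));
}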