diff --git a/backends/tfhe-cuda-backend/cuda/include/device.h b/backends/tfhe-cuda-backend/cuda/include/device.h
index bcb2c6cbe9..3c3a61b8f6 100644
--- a/backends/tfhe-cuda-backend/cuda/include/device.h
+++ b/backends/tfhe-cuda-backend/cuda/include/device.h
@@ -27,6 +27,15 @@ inline void cuda_error(cudaError_t code, const char *file, int line) {
     std::abort(); \
   }
 
+cudaEvent_t cuda_create_event(uint32_t gpu_index);
+
+void cuda_event_record(cudaEvent_t event, cudaStream_t stream,
+    uint32_t gpu_index);
+void cuda_stream_wait_event(cudaStream_t stream, cudaEvent_t event,
+    uint32_t gpu_index);
+
+void cuda_event_destroy(cudaEvent_t event, uint32_t gpu_index);
+
 cudaStream_t cuda_create_stream(uint32_t gpu_index);
 
 void cuda_destroy_stream(cudaStream_t stream, uint32_t gpu_index);
diff --git a/backends/tfhe-cuda-backend/cuda/include/integer/integer.h b/backends/tfhe-cuda-backend/cuda/include/integer/integer.h
index 898daba86d..325891b860 100644
--- a/backends/tfhe-cuda-backend/cuda/include/integer/integer.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/integer.h
@@ -35,6 +35,8 @@ enum CMP_ORDERING { IS_INFERIOR = 0, IS_EQUAL = 1, IS_SUPERIOR = 2 };
 
 enum SIGNED_OPERATION { ADDITION = 1, SUBTRACTION = -1 };
 
+enum outputFlag { FLAG_NONE = 0, FLAG_OVERFLOW = 1, FLAG_CARRY = 2 };
+
 extern "C" {
 void scratch_cuda_apply_univariate_lut_kb_64(
     void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
@@ -282,23 +284,61 @@ void scratch_cuda_propagate_single_carry_kb_64_inplace(
     uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
     uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
     uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
-    uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory);
+    uint32_t carry_modulus, PBS_TYPE pbs_type, uint32_t requested_flag,
+    uint32_t uses_carry, bool allocate_gpu_memory);
+
+void scratch_cuda_add_and_propagate_single_carry_kb_64_inplace(
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
+    uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
+    uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
+    uint32_t carry_modulus, PBS_TYPE pbs_type, uint32_t requested_flag,
+    uint32_t uses_carry, bool allocate_gpu_memory);
 
 void cuda_propagate_single_carry_kb_64_inplace(
     void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    void *lwe_array, void *carry_out, int8_t *mem_ptr, void *const *bsks,
-    void *const *ksks, uint32_t num_blocks);
+    void *lwe_array, void *carry_out, const void *carry_in, int8_t *mem_ptr,
+    void *const *bsks, void *const *ksks, uint32_t num_blocks,
+    uint32_t requested_flag, uint32_t uses_carry);
 
-void cuda_propagate_single_carry_get_input_carries_kb_64_inplace(
+void cuda_add_and_propagate_single_carry_kb_64_inplace(
     void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    void *lwe_array, void *carry_out, void *input_carries, int8_t *mem_ptr,
-    void *const *bsks, void *const *ksks, uint32_t num_blocks);
+    void *lhs_array, const void *rhs_array, void *carry_out,
+    const void *carry_in, int8_t *mem_ptr, void *const *bsks, void *const *ksks,
+    uint32_t num_blocks, uint32_t requested_flag, uint32_t uses_carry);
 
 void cleanup_cuda_propagate_single_carry(void *const *streams,
                                          uint32_t const *gpu_indexes,
                                          uint32_t gpu_count, int8_t
**mem_ptr_void); +void cleanup_cuda_add_and_propagate_single_carry(void *const *streams, + uint32_t const *gpu_indexes, + uint32_t gpu_count, + int8_t **mem_ptr_void); + +void scratch_cuda_integer_overflowing_sub_kb_64_inplace( + void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count, + int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size, + uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level, + uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log, + uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus, + uint32_t carry_modulus, PBS_TYPE pbs_type, uint32_t compute_overflow, + bool allocate_gpu_memory); + +void cuda_integer_overflowing_sub_kb_64_inplace( + void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count, + void *lhs_array, const void *rhs_array, void *overflow_block, + const void *input_borrow, int8_t *mem_ptr, void *const *bsks, + void *const *ksks, uint32_t num_blocks, uint32_t compute_overflow, + uint32_t uses_input_borrow); + +void cleanup_cuda_integer_overflowing_sub(void *const *streams, + uint32_t const *gpu_indexes, + uint32_t gpu_count, + int8_t **mem_ptr_void); + void scratch_cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64( void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size, @@ -318,25 +358,6 @@ void cleanup_cuda_integer_radix_partial_sum_ciphertexts_vec( void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr_void); -void scratch_cuda_integer_radix_overflowing_sub_kb_64( - void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count, - int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size, - uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level, - uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log, - uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus, - uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory); - -void cuda_integer_radix_overflowing_sub_kb_64( - void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count, - void *radix_lwe_out, void *radix_lwe_overflowed, void const *radix_lwe_left, - void const *radix_lwe_right, int8_t *mem_ptr, void *const *bsks, - void *const *ksks, uint32_t num_blocks_in_radix); - -void cleanup_cuda_integer_radix_overflowing_sub(void *const *streams, - uint32_t const *gpu_indexes, - uint32_t gpu_count, - int8_t **mem_ptr_void); - void scratch_cuda_integer_scalar_mul_kb_64( void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size, @@ -376,26 +397,6 @@ void cleanup_cuda_integer_div_rem(void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr_void); -void scratch_cuda_signed_overflowing_add_or_sub_radix_ciphertext_kb_64( - void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count, - int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size, - uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level, - uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log, - uint32_t grouping_factor, uint32_t num_blocks, int8_t signed_operation, - uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type, - bool allocate_gpu_memory); - -void cuda_signed_overflowing_add_or_sub_radix_ciphertext_kb_64( - void *const *streams, uint32_t const *gpu_indexes, uint32_t 
gpu_count, - void *lhs, void const *rhs, void *overflowed, int8_t signed_operation, - int8_t *mem_ptr, void *const *bsks, void *const *ksks, - uint32_t num_blocks_in_radix); - -void cleanup_signed_overflowing_add_or_sub(void *const *streams, - uint32_t const *gpu_indexes, - uint32_t gpu_count, - int8_t **mem_ptr_void); - void scratch_cuda_integer_compute_prefix_sum_hillis_steele_64( void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr, void const *input_lut, uint32_t lwe_dimension, diff --git a/backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h b/backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h index 1a985c1ca0..ff3fac680f 100644 --- a/backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h +++ b/backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h @@ -53,6 +53,12 @@ void generate_device_accumulator(cudaStream_t stream, uint32_t gpu_index, uint32_t carry_modulus, std::function f); +template +void generate_many_lut_device_accumulator( + cudaStream_t stream, uint32_t gpu_index, Torus *acc, + uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t message_modulus, + uint32_t carry_modulus, std::vector> &f); + struct int_radix_params { PBS_TYPE pbs_type; uint32_t glwe_dimension; @@ -316,6 +322,113 @@ template struct int_radix_lut { num_radix_blocks * sizeof(Torus)); } + // Construction for many luts + int_radix_lut(cudaStream_t const *streams, uint32_t const *gpu_indexes, + uint32_t gpu_count, int_radix_params params, uint32_t num_luts, + uint32_t num_radix_blocks, uint32_t num_many_lut, + bool allocate_gpu_memory) { + + this->params = params; + this->num_blocks = num_radix_blocks; + this->num_luts = num_luts; + Torus lut_indexes_size = num_radix_blocks * sizeof(Torus); + Torus lut_buffer_size = + (params.glwe_dimension + 1) * params.polynomial_size * sizeof(Torus); + + /////////////// + active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count); + cuda_synchronize_stream(streams[0], gpu_indexes[0]); + for (uint i = 0; i < active_gpu_count; i++) { + cudaSetDevice(i); + int8_t *gpu_pbs_buffer; + auto num_blocks_on_gpu = + get_num_inputs_on_gpu(num_radix_blocks, i, active_gpu_count); + + execute_scratch_pbs( + streams[i], gpu_indexes[i], &gpu_pbs_buffer, params.glwe_dimension, + params.small_lwe_dimension, params.polynomial_size, params.pbs_level, + params.grouping_factor, num_blocks_on_gpu, params.pbs_type, + allocate_gpu_memory); + cuda_synchronize_stream(streams[i], gpu_indexes[i]); + buffer.push_back(gpu_pbs_buffer); + } + + if (allocate_gpu_memory) { + // Allocate LUT + // LUT is used as a trivial encryption and must be initialized outside + // this constructor + for (uint i = 0; i < active_gpu_count; i++) { + auto lut = (Torus *)cuda_malloc_async(num_luts * lut_buffer_size, + streams[i], gpu_indexes[i]); + auto lut_indexes = (Torus *)cuda_malloc_async( + lut_indexes_size, streams[i], gpu_indexes[i]); + // lut_indexes is initialized to 0 by default + // if a different behavior is wanted, it should be rewritten later + cuda_memset_async(lut_indexes, 0, lut_indexes_size, streams[i], + gpu_indexes[i]); + + lut_vec.push_back(lut); + lut_indexes_vec.push_back(lut_indexes); + + cuda_synchronize_stream(streams[i], gpu_indexes[i]); + } + + // lwe_(input/output)_indexes are initialized to range(num_radix_blocks) + // by default + lwe_indexes_in = (Torus *)cuda_malloc_async( + num_radix_blocks * sizeof(Torus), streams[0], gpu_indexes[0]); + lwe_indexes_out = (Torus *)cuda_malloc_async( + 
num_radix_blocks * sizeof(Torus), streams[0], gpu_indexes[0]); + lwe_trivial_indexes = (Torus *)cuda_malloc_async( + num_radix_blocks * sizeof(Torus), streams[0], gpu_indexes[0]); + + h_lwe_indexes_in = (Torus *)malloc(num_radix_blocks * sizeof(Torus)); + h_lwe_indexes_out = (Torus *)malloc(num_radix_blocks * sizeof(Torus)); + + for (int i = 0; i < num_radix_blocks; i++) + h_lwe_indexes_in[i] = i; + + cuda_memcpy_async_to_gpu(lwe_indexes_in, h_lwe_indexes_in, + num_radix_blocks * sizeof(Torus), streams[0], + gpu_indexes[0]); + cuda_memcpy_async_to_gpu(lwe_indexes_out, h_lwe_indexes_in, + num_radix_blocks * sizeof(Torus), streams[0], + gpu_indexes[0]); + cuda_memcpy_async_to_gpu(lwe_trivial_indexes, h_lwe_indexes_in, + num_radix_blocks * sizeof(Torus), streams[0], + gpu_indexes[0]); + memcpy(h_lwe_indexes_out, h_lwe_indexes_in, + num_radix_blocks * sizeof(Torus)); + + /// With multiple GPUs we allocate arrays to be pushed to the vectors and + /// copy data on each GPU then when we gather data to GPU 0 we can copy + /// back to the original indexing + multi_gpu_alloc_lwe_async(streams, gpu_indexes, active_gpu_count, + lwe_array_in_vec, num_radix_blocks, + params.big_lwe_dimension + 1); + multi_gpu_alloc_lwe_async(streams, gpu_indexes, active_gpu_count, + lwe_after_ks_vec, num_radix_blocks, + params.small_lwe_dimension + 1); + multi_gpu_alloc_lwe_many_lut_output_async( + streams, gpu_indexes, active_gpu_count, lwe_after_pbs_vec, + num_radix_blocks, num_many_lut, params.big_lwe_dimension + 1); + multi_gpu_alloc_array_async(streams, gpu_indexes, active_gpu_count, + lwe_trivial_indexes_vec, num_radix_blocks); + cuda_synchronize_stream(streams[0], gpu_indexes[0]); + multi_gpu_copy_array_async(streams, gpu_indexes, active_gpu_count, + lwe_trivial_indexes_vec, lwe_trivial_indexes, + num_radix_blocks); + + // Keyswitch + Torus big_size = + (params.big_lwe_dimension + 1) * num_radix_blocks * sizeof(Torus); + Torus small_size = + (params.small_lwe_dimension + 1) * num_radix_blocks * sizeof(Torus); + tmp_lwe_before_ks = + (Torus *)cuda_malloc_async(big_size, streams[0], gpu_indexes[0]); + } + } + // Return a pointer to idx-ith lut at gpu_index's global memory Torus *get_lut(uint32_t gpu_index, size_t idx) { auto lut = lut_vec[gpu_index]; @@ -377,14 +490,14 @@ template struct int_radix_lut { cuda_drop_async(lut_vec[i], streams[i], gpu_indexes[i]); cuda_drop_async(lut_indexes_vec[i], streams[i], gpu_indexes[i]); } - lut_vec.clear(); - lut_indexes_vec.clear(); cuda_drop_async(lwe_indexes_in, streams[0], gpu_indexes[0]); cuda_drop_async(lwe_indexes_out, streams[0], gpu_indexes[0]); cuda_drop_async(lwe_trivial_indexes, streams[0], gpu_indexes[0]); cuda_synchronize_stream(streams[0], gpu_indexes[0]); + lut_vec.clear(); + lut_indexes_vec.clear(); free(h_lwe_indexes_in); free(h_lwe_indexes_out); @@ -755,7 +868,7 @@ template struct int_fullprop_buffer { } }; -template struct int_sc_prop_memory { +template struct int_legacy_sc_prop_memory { Torus *generates_or_propagates; Torus *step_output; @@ -767,9 +880,10 @@ template struct int_sc_prop_memory { int_radix_params params; - int_sc_prop_memory(cudaStream_t const *streams, uint32_t const *gpu_indexes, - uint32_t gpu_count, int_radix_params params, - uint32_t num_radix_blocks, bool allocate_gpu_memory) { + int_legacy_sc_prop_memory(cudaStream_t const *streams, + uint32_t const *gpu_indexes, uint32_t gpu_count, + int_radix_params params, uint32_t num_radix_blocks, + bool allocate_gpu_memory) { this->params = params; auto glwe_dimension = params.glwe_dimension; auto 
polynomial_size = params.polynomial_size; @@ -879,8 +993,6 @@ template struct int_overflowing_sub_memory { Torus *generates_or_propagates; Torus *step_output; - // luts_array[2] = {lut_does_block_generate_carry, - // lut_does_block_generate_or_propagate} int_radix_lut *luts_array; int_radix_lut *luts_borrow_propagation_sum; int_radix_lut *message_acc; @@ -975,146 +1087,1324 @@ template struct int_overflowing_sub_memory { glwe_dimension, polynomial_size, message_modulus, carry_modulus, f_message_acc); - luts_array->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]); - luts_borrow_propagation_sum->broadcast_lut(streams, gpu_indexes, - gpu_indexes[0]); - message_acc->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]); - } + luts_array->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]); + luts_borrow_propagation_sum->broadcast_lut(streams, gpu_indexes, + gpu_indexes[0]); + message_acc->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]); + } + + void release(cudaStream_t const *streams, uint32_t const *gpu_indexes, + uint32_t gpu_count) { + cuda_drop_async(generates_or_propagates, streams[0], gpu_indexes[0]); + cuda_drop_async(step_output, streams[0], gpu_indexes[0]); + + luts_array->release(streams, gpu_indexes, gpu_count); + luts_borrow_propagation_sum->release(streams, gpu_indexes, gpu_count); + message_acc->release(streams, gpu_indexes, gpu_count); + + delete luts_array; + delete luts_borrow_propagation_sum; + delete message_acc; + } +}; + +template struct int_sum_ciphertexts_vec_memory { + Torus *new_blocks; + Torus *new_blocks_copy; + Torus *old_blocks; + Torus *small_lwe_vector; + int_radix_params params; + + int32_t *d_smart_copy_in; + int32_t *d_smart_copy_out; + + bool mem_reuse = false; + + int_sum_ciphertexts_vec_memory(cudaStream_t const *streams, + uint32_t const *gpu_indexes, + uint32_t gpu_count, int_radix_params params, + uint32_t num_blocks_in_radix, + uint32_t max_num_radix_in_vec, + bool allocate_gpu_memory) { + this->params = params; + + int max_pbs_count = num_blocks_in_radix * max_num_radix_in_vec; + + // allocate gpu memory for intermediate buffers + new_blocks = (Torus *)cuda_malloc_async( + max_pbs_count * (params.big_lwe_dimension + 1) * sizeof(Torus), + streams[0], gpu_indexes[0]); + new_blocks_copy = (Torus *)cuda_malloc_async( + max_pbs_count * (params.big_lwe_dimension + 1) * sizeof(Torus), + streams[0], gpu_indexes[0]); + old_blocks = (Torus *)cuda_malloc_async( + max_pbs_count * (params.big_lwe_dimension + 1) * sizeof(Torus), + streams[0], gpu_indexes[0]); + small_lwe_vector = (Torus *)cuda_malloc_async( + max_pbs_count * (params.small_lwe_dimension + 1) * sizeof(Torus), + streams[0], gpu_indexes[0]); + cuda_memset_async(new_blocks, 0, + max_pbs_count * (params.big_lwe_dimension + 1) * + sizeof(Torus), + streams[0], gpu_indexes[0]); + cuda_memset_async(new_blocks_copy, 0, + max_pbs_count * (params.big_lwe_dimension + 1) * + sizeof(Torus), + streams[0], gpu_indexes[0]); + cuda_memset_async(old_blocks, 0, + max_pbs_count * (params.big_lwe_dimension + 1) * + sizeof(Torus), + streams[0], gpu_indexes[0]); + cuda_memset_async(small_lwe_vector, 0, + max_pbs_count * (params.small_lwe_dimension + 1) * + sizeof(Torus), + streams[0], gpu_indexes[0]); + + d_smart_copy_in = (int32_t *)cuda_malloc_async( + max_pbs_count * sizeof(int32_t), streams[0], gpu_indexes[0]); + d_smart_copy_out = (int32_t *)cuda_malloc_async( + max_pbs_count * sizeof(int32_t), streams[0], gpu_indexes[0]); + cuda_memset_async(d_smart_copy_in, 0, max_pbs_count * sizeof(int32_t), + streams[0], 
gpu_indexes[0]); + cuda_memset_async(d_smart_copy_out, 0, max_pbs_count * sizeof(int32_t), + streams[0], gpu_indexes[0]); + } + + int_sum_ciphertexts_vec_memory(cudaStream_t const *streams, + uint32_t const *gpu_indexes, + uint32_t gpu_count, int_radix_params params, + uint32_t num_blocks_in_radix, + uint32_t max_num_radix_in_vec, + Torus *new_blocks, Torus *old_blocks, + Torus *small_lwe_vector) { + mem_reuse = true; + this->params = params; + + int max_pbs_count = num_blocks_in_radix * max_num_radix_in_vec; + + // assign gpu memory for intermediate buffers + this->new_blocks = new_blocks; + this->old_blocks = old_blocks; + this->small_lwe_vector = small_lwe_vector; + new_blocks_copy = (Torus *)cuda_malloc_async( + max_pbs_count * (params.big_lwe_dimension + 1) * sizeof(Torus), + streams[0], gpu_indexes[0]); + cuda_memset_async(new_blocks_copy, 0, + max_pbs_count * (params.big_lwe_dimension + 1) * + sizeof(Torus), + streams[0], gpu_indexes[0]); + + d_smart_copy_in = (int32_t *)cuda_malloc_async( + max_pbs_count * sizeof(int32_t), streams[0], gpu_indexes[0]); + d_smart_copy_out = (int32_t *)cuda_malloc_async( + max_pbs_count * sizeof(int32_t), streams[0], gpu_indexes[0]); + cuda_memset_async(d_smart_copy_in, 0, max_pbs_count * sizeof(int32_t), + streams[0], gpu_indexes[0]); + cuda_memset_async(d_smart_copy_out, 0, max_pbs_count * sizeof(int32_t), + streams[0], gpu_indexes[0]); + } + + void release(cudaStream_t const *streams, uint32_t const *gpu_indexes, + uint32_t gpu_count) { + cuda_drop_async(d_smart_copy_in, streams[0], gpu_indexes[0]); + cuda_drop_async(d_smart_copy_out, streams[0], gpu_indexes[0]); + + if (!mem_reuse) { + cuda_drop_async(new_blocks, streams[0], gpu_indexes[0]); + cuda_drop_async(old_blocks, streams[0], gpu_indexes[0]); + cuda_drop_async(small_lwe_vector, streams[0], gpu_indexes[0]); + } + + cuda_drop_async(new_blocks_copy, streams[0], gpu_indexes[0]); + } +}; +// For sequential algorithm in group propagation +template struct int_seq_group_prop_memory { + + Torus *group_resolved_carries; + int_radix_lut *lut_sequential_algorithm; + uint32_t grouping_size; + + int_seq_group_prop_memory(cudaStream_t const *streams, + uint32_t const *gpu_indexes, uint32_t gpu_count, + int_radix_params params, uint32_t group_size, + uint32_t big_lwe_size_bytes, + bool allocate_gpu_memory) { + + auto glwe_dimension = params.glwe_dimension; + auto polynomial_size = params.polynomial_size; + auto message_modulus = params.message_modulus; + auto carry_modulus = params.carry_modulus; + + grouping_size = group_size; + group_resolved_carries = (Torus *)cuda_malloc_async( + (grouping_size)*big_lwe_size_bytes, streams[0], gpu_indexes[0]); + cuda_memset_async(group_resolved_carries, 0, + (grouping_size)*big_lwe_size_bytes, streams[0], + gpu_indexes[0]); + + int num_seq_luts = grouping_size - 1; + Torus *h_seq_lut_indexes = (Torus *)malloc(num_seq_luts * sizeof(Torus)); + lut_sequential_algorithm = new int_radix_lut( + streams, gpu_indexes, gpu_count, params, num_seq_luts, num_seq_luts, + allocate_gpu_memory); + for (int index = 0; index < num_seq_luts; index++) { + auto f_lut_sequential = [index](Torus propa_cum_sum_block) { + return (propa_cum_sum_block >> (index + 1)) & 1; + }; + auto seq_lut = lut_sequential_algorithm->get_lut(gpu_indexes[0], index); + generate_device_accumulator( + streams[0], gpu_indexes[0], seq_lut, glwe_dimension, polynomial_size, + message_modulus, carry_modulus, f_lut_sequential); + h_seq_lut_indexes[index] = index; + } + Torus *seq_lut_indexes = + 
lut_sequential_algorithm->get_lut_indexes(gpu_indexes[0], 0); + cuda_memcpy_async_to_gpu(seq_lut_indexes, h_seq_lut_indexes, + num_seq_luts * sizeof(Torus), streams[0], + gpu_indexes[0]); + + lut_sequential_algorithm->broadcast_lut(streams, gpu_indexes, + gpu_indexes[0]); + free(h_seq_lut_indexes); + }; + void release(cudaStream_t const *streams, uint32_t const *gpu_indexes, + uint32_t gpu_count) { + cuda_drop_async(group_resolved_carries, streams[0], gpu_indexes[0]); + lut_sequential_algorithm->release(streams, gpu_indexes, gpu_count); + delete lut_sequential_algorithm; + }; +}; + +// For hillis steele algorithm in group propagation +template struct int_hs_group_prop_memory { + + int_radix_lut *lut_hillis_steele; + uint32_t grouping_size; + + int_hs_group_prop_memory(cudaStream_t const *streams, + uint32_t const *gpu_indexes, uint32_t gpu_count, + int_radix_params params, uint32_t num_groups, + uint32_t big_lwe_size_bytes, + bool allocate_gpu_memory) { + + auto glwe_dimension = params.glwe_dimension; + auto polynomial_size = params.polynomial_size; + auto message_modulus = params.message_modulus; + auto carry_modulus = params.carry_modulus; + + auto f_lut_hillis_steele = [](Torus msb, Torus lsb) -> Torus { + if (msb == 2) { + return 1; // Remap Generate to 1 + } else if (msb == 3) { + // MSB propagates + if (lsb == 2) { + return 1; + } else { + return lsb; + } // also remap here + } else { + return msb; + } + }; + + lut_hillis_steele = + new int_radix_lut(streams, gpu_indexes, gpu_count, params, 1, + num_groups, allocate_gpu_memory); + + auto hillis_steele_lut = lut_hillis_steele->get_lut(gpu_indexes[0], 0); + generate_device_accumulator_bivariate( + streams[0], gpu_indexes[0], hillis_steele_lut, glwe_dimension, + polynomial_size, message_modulus, carry_modulus, f_lut_hillis_steele); + + lut_hillis_steele->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]); + }; + void release(cudaStream_t const *streams, uint32_t const *gpu_indexes, + uint32_t gpu_count) { + + lut_hillis_steele->release(streams, gpu_indexes, gpu_count); + delete lut_hillis_steele; + } +}; + +// compute_shifted_blocks_and_block_states +template struct int_shifted_blocks_and_states_memory { + Torus *shifted_blocks_and_states; + Torus *shifted_blocks; + Torus *block_states; + + int_radix_lut *luts_array_first_step; + + int_shifted_blocks_and_states_memory( + cudaStream_t const *streams, uint32_t const *gpu_indexes, + uint32_t gpu_count, int_radix_params params, uint32_t num_radix_blocks, + uint32_t num_many_lut, uint32_t grouping_size, bool allocate_gpu_memory) { + + auto glwe_dimension = params.glwe_dimension; + auto polynomial_size = params.polynomial_size; + auto message_modulus = params.message_modulus; + auto carry_modulus = params.carry_modulus; + auto big_lwe_size = (polynomial_size * glwe_dimension + 1); + auto big_lwe_size_bytes = big_lwe_size * sizeof(Torus); + + shifted_blocks_and_states = (Torus *)cuda_malloc_async( + num_many_lut * num_radix_blocks * big_lwe_size_bytes, streams[0], + gpu_indexes[0]); + cuda_memset_async(shifted_blocks_and_states, 0, + num_many_lut * num_radix_blocks * big_lwe_size_bytes, + streams[0], gpu_indexes[0]); + shifted_blocks = (Torus *)cuda_malloc_async( + num_radix_blocks * big_lwe_size_bytes, streams[0], gpu_indexes[0]); + cuda_memset_async(shifted_blocks, 0, num_radix_blocks * big_lwe_size_bytes, + streams[0], gpu_indexes[0]); + block_states = (Torus *)cuda_malloc_async( + num_radix_blocks * big_lwe_size_bytes, streams[0], gpu_indexes[0]); + cuda_memset_async(block_states, 0, 
num_radix_blocks * big_lwe_size_bytes, + streams[0], gpu_indexes[0]); + + uint32_t num_luts_first_step = 2 * grouping_size + 1; + + luts_array_first_step = new int_radix_lut( + streams, gpu_indexes, gpu_count, params, num_luts_first_step, + num_radix_blocks, num_many_lut, allocate_gpu_memory); + + auto f_shift_block = [message_modulus](Torus block) -> Torus { + return (block % message_modulus) << 1; + }; + + auto f_first_block_state = [message_modulus](Torus block) -> Torus { + if (block >= message_modulus) + return OUTPUT_CARRY::GENERATED; + else { + return OUTPUT_CARRY::NONE; + } + }; + std::vector> f_first_grouping_luts = { + f_first_block_state, f_shift_block}; + + auto first_block_lut = luts_array_first_step->get_lut(gpu_indexes[0], 0); + + generate_many_lut_device_accumulator( + streams[0], gpu_indexes[0], first_block_lut, glwe_dimension, + polynomial_size, message_modulus, carry_modulus, f_first_grouping_luts); + + // luts for other blocks of the first grouping + for (int lut_id = 1; lut_id < grouping_size; lut_id++) { + auto f_state = [message_modulus, lut_id](Torus block) -> Torus { + uint64_t r = 0; + if (block >= message_modulus) { + r = 2; // Generates Carry + } else if (block == (message_modulus - 1)) { + r = 1; // Propagates a carry + } else { + r = 0; // Does not generate carry + } + return r << (lut_id - 1); + }; + std::vector> f_grouping_luts = { + f_state, f_shift_block}; + auto lut = luts_array_first_step->get_lut(gpu_indexes[0], lut_id); + generate_many_lut_device_accumulator( + streams[0], gpu_indexes[0], lut, glwe_dimension, polynomial_size, + message_modulus, carry_modulus, f_grouping_luts); + } + + // luts for the rest of groupings (except for the last block) + for (int i = 0; i < grouping_size; i++) { + uint32_t lut_id = i + grouping_size; + auto f_state = [message_modulus, i](Torus block) -> Torus { + uint64_t r = 0; + if (block >= message_modulus) { + r = 2; // Generates Carry + } else if (block == (message_modulus - 1)) { + r = 1; // Propagates a carry + } else { + r = 0; // Does not borrow + } + return r << i; + }; + std::vector> f_grouping_luts = { + f_state, f_shift_block}; + + auto lut = luts_array_first_step->get_lut(gpu_indexes[0], lut_id); + + generate_many_lut_device_accumulator( + streams[0], gpu_indexes[0], lut, glwe_dimension, polynomial_size, + message_modulus, carry_modulus, f_grouping_luts); + } + + // For the last block we need to generate a new lut + auto f_last_block_state = [message_modulus](Torus block) -> Torus { + if (block >= message_modulus) + return 2 << 1; // Generates + else + return 0; // Nothing + }; + + uint32_t lut_id = num_luts_first_step - 1; // The last lut of the first step + + auto last_block_lut = + luts_array_first_step->get_lut(gpu_indexes[0], lut_id); + + std::vector> f_last_grouping_luts = { + f_last_block_state, f_shift_block}; + + generate_many_lut_device_accumulator( + streams[0], gpu_indexes[0], last_block_lut, glwe_dimension, + polynomial_size, message_modulus, carry_modulus, f_last_grouping_luts); + + // Generate the indexes to switch between luts within the pbs + Torus lut_indexes_size = num_radix_blocks * sizeof(Torus); + Torus *h_lut_indexes = (Torus *)malloc(lut_indexes_size); + + for (int index = 0; index < num_radix_blocks; index++) { + uint32_t grouping_index = index / grouping_size; + bool is_in_first_grouping = (grouping_index == 0); + uint32_t index_in_grouping = index % grouping_size; + bool is_last_index = (index == (num_radix_blocks - 1)); + if (is_last_index) { + if (num_radix_blocks == 1) { + 
h_lut_indexes[index] = 2 * grouping_size; + } else { + h_lut_indexes[index] = 2; + } + } else if (is_in_first_grouping) { + h_lut_indexes[index] = index_in_grouping; + } else { + h_lut_indexes[index] = index_in_grouping + grouping_size; + } + } + + // copy the indexes to the gpu + Torus *lut_indexes = + luts_array_first_step->get_lut_indexes(gpu_indexes[0], 0); + cuda_memcpy_async_to_gpu(lut_indexes, h_lut_indexes, lut_indexes_size, + streams[0], gpu_indexes[0]); + // Do I need to do something else for the multi-gpu? + + luts_array_first_step->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]); + + free(h_lut_indexes); + }; + void release(cudaStream_t const *streams, uint32_t const *gpu_indexes, + uint32_t gpu_count) { + + cuda_drop_async(shifted_blocks_and_states, streams[0], gpu_indexes[0]); + cuda_drop_async(shifted_blocks, streams[0], gpu_indexes[0]); + cuda_drop_async(block_states, streams[0], gpu_indexes[0]); + + luts_array_first_step->release(streams, gpu_indexes, gpu_count); + delete luts_array_first_step; + }; +}; + +// compute_propagation simulator and group carries +template struct int_prop_simu_group_carries_memory { + Torus *scalar_array_cum_sum; + Torus *propagation_cum_sums; + Torus *simulators; + Torus *grouping_pgns; + Torus *prepared_blocks; + + Torus *resolved_carries; + + int_radix_lut *luts_array_second_step; + + int_seq_group_prop_memory *seq_group_prop_mem; + int_hs_group_prop_memory *hs_group_prop_mem; + + uint32_t group_size; + bool use_sequential_algorithm_to_resolver_group_carries; + + int_prop_simu_group_carries_memory( + cudaStream_t const *streams, uint32_t const *gpu_indexes, + uint32_t gpu_count, int_radix_params params, uint32_t num_radix_blocks, + uint32_t grouping_size, uint32_t num_groups, bool allocate_gpu_memory) { + + auto glwe_dimension = params.glwe_dimension; + auto polynomial_size = params.polynomial_size; + auto message_modulus = params.message_modulus; + auto carry_modulus = params.carry_modulus; + auto big_lwe_size = (polynomial_size * glwe_dimension + 1); + auto big_lwe_size_bytes = big_lwe_size * sizeof(Torus); + + uint32_t block_modulus = message_modulus * carry_modulus; + uint32_t num_bits_in_block = std::log2(block_modulus); + + group_size = grouping_size; + + scalar_array_cum_sum = (Torus *)cuda_malloc_async( + num_radix_blocks * sizeof(Torus), streams[0], gpu_indexes[0]); + cuda_memset_async(scalar_array_cum_sum, 0, num_radix_blocks * sizeof(Torus), + streams[0], gpu_indexes[0]); + propagation_cum_sums = (Torus *)cuda_malloc_async( + num_radix_blocks * big_lwe_size_bytes, streams[0], gpu_indexes[0]); + cuda_memset_async(propagation_cum_sums, 0, + num_radix_blocks * big_lwe_size_bytes, streams[0], + gpu_indexes[0]); + simulators = (Torus *)cuda_malloc_async( + num_radix_blocks * big_lwe_size_bytes, streams[0], gpu_indexes[0]); + cuda_memset_async(simulators, 0, num_radix_blocks * big_lwe_size_bytes, + streams[0], gpu_indexes[0]); + + grouping_pgns = (Torus *)cuda_malloc_async(num_groups * big_lwe_size_bytes, + streams[0], gpu_indexes[0]); + cuda_memset_async(grouping_pgns, 0, num_groups * big_lwe_size_bytes, + streams[0], gpu_indexes[0]); + + prepared_blocks = (Torus *)cuda_malloc_async( + num_radix_blocks * big_lwe_size_bytes, streams[0], gpu_indexes[0]); + cuda_memset_async(prepared_blocks, 0, num_radix_blocks * big_lwe_size_bytes, + streams[0], gpu_indexes[0]); + + resolved_carries = (Torus *)cuda_malloc_async( + (num_groups + 1) * big_lwe_size_bytes, streams[0], gpu_indexes[0]); + cuda_memset_async(resolved_carries, 0, + (num_groups 
+ 1) * big_lwe_size_bytes, streams[0], + gpu_indexes[0]); + + // create lut objects for step 2 + Torus lut_indexes_size = num_radix_blocks * sizeof(Torus); + uint32_t num_carry_to_resolve = num_groups - 1; + uint32_t saturated_sub = + ((num_carry_to_resolve > 1) ? num_carry_to_resolve - 1 : 0); + uint32_t sequential_depth = saturated_sub / (grouping_size - 1); + uint32_t hillis_steel_depth; + + if (num_carry_to_resolve == 0) { + hillis_steel_depth = 0; + } else { + hillis_steel_depth = std::ceil(std::log2(num_carry_to_resolve)); + } + + use_sequential_algorithm_to_resolver_group_carries = + sequential_depth <= hillis_steel_depth; + uint32_t num_extra_luts = 0; + if (use_sequential_algorithm_to_resolver_group_carries) { + num_extra_luts = (grouping_size - 1); + } else { + num_extra_luts = 1; + } + + uint32_t num_luts_second_step = 2 * grouping_size + num_extra_luts; + luts_array_second_step = new int_radix_lut( + streams, gpu_indexes, gpu_count, params, num_luts_second_step, + num_radix_blocks, allocate_gpu_memory); + + // luts for first group inner propagation + for (int lut_id = 0; lut_id < grouping_size - 1; lut_id++) { + auto f_first_grouping_inner_propagation = + [lut_id](Torus propa_cum_sum_block) -> Torus { + uint64_t carry = (propa_cum_sum_block >> lut_id) & 1; + + if (carry != 0) { + return 2ull; // Generates Carry + } else { + return 0ull; // Does not generate carry + } + }; + + auto lut = luts_array_second_step->get_lut(gpu_indexes[0], lut_id); + generate_device_accumulator( + streams[0], gpu_indexes[0], lut, glwe_dimension, polynomial_size, + message_modulus, carry_modulus, f_first_grouping_inner_propagation); + } + + auto f_first_grouping_outer_propagation = + [num_bits_in_block](Torus block) -> Torus { + return (block >> (num_bits_in_block - 1)) & 1; + }; + + int lut_id = grouping_size - 1; + auto lut_first_group_outer = + luts_array_second_step->get_lut(gpu_indexes[0], lut_id); + generate_device_accumulator( + streams[0], gpu_indexes[0], lut_first_group_outer, glwe_dimension, + polynomial_size, message_modulus, carry_modulus, + f_first_grouping_outer_propagation); + + // for other groupings inner propagation + for (int index = 0; index < grouping_size; index++) { + uint32_t lut_id = index + grouping_size; + + auto f_other_groupings_inner_propagation = + [index](Torus propa_cum_sum_block) -> Torus { + uint64_t mask = (2 << index) - 1; + if (propa_cum_sum_block >= (2 << index)) { + return 2ull; // Generates + } else if ((propa_cum_sum_block & mask) == mask) { + return 1ull; // Propagate + } else { + return 0ull; // Nothing + } + }; + + auto lut = luts_array_second_step->get_lut(gpu_indexes[0], lut_id); + generate_device_accumulator( + streams[0], gpu_indexes[0], lut, glwe_dimension, polynomial_size, + message_modulus, carry_modulus, f_other_groupings_inner_propagation); + } + + if (use_sequential_algorithm_to_resolver_group_carries) { + for (int index = 0; index < grouping_size - 1; index++) { + uint32_t lut_id = index + 2 * grouping_size; + + auto f_group_propagation = [index, block_modulus, + num_bits_in_block](Torus block) -> Torus { + if (block == (block_modulus - 1)) { + return 0ull; + } else { + return ((UINT64_MAX << index) % (1ull << (num_bits_in_block + 1))); + } + }; + + auto lut = luts_array_second_step->get_lut(gpu_indexes[0], lut_id); + generate_device_accumulator( + streams[0], gpu_indexes[0], lut, glwe_dimension, polynomial_size, + message_modulus, carry_modulus, f_group_propagation); + } + } else { + uint32_t lut_id = 2 * grouping_size; + auto 
f_group_propagation = [block_modulus](Torus block) { + if (block == (block_modulus - 1)) { + return 2ull; + } else { + return UINT64_MAX % (block_modulus * 2ull); + } + }; + + auto lut = luts_array_second_step->get_lut(gpu_indexes[0], lut_id); + generate_device_accumulator( + streams[0], gpu_indexes[0], lut, glwe_dimension, polynomial_size, + message_modulus, carry_modulus, f_group_propagation); + } + + Torus *h_second_lut_indexes = (Torus *)malloc(lut_indexes_size); + + Torus *h_scalar_array_cum_sum = + (Torus *)malloc(num_radix_blocks * sizeof(Torus)); + + for (int index = 0; index < num_radix_blocks; index++) { + uint32_t grouping_index = index / grouping_size; + bool is_in_first_grouping = (grouping_index == 0); + uint32_t index_in_grouping = index % grouping_size; + + if (is_in_first_grouping) { + h_second_lut_indexes[index] = index_in_grouping; + } else if (index_in_grouping == (grouping_size - 1)) { + if (use_sequential_algorithm_to_resolver_group_carries) { + int inner_index = (grouping_index - 1) % (grouping_size - 1); + h_second_lut_indexes[index] = inner_index + 2 * grouping_size; + } else { + h_second_lut_indexes[index] = 2 * grouping_size; + } + } else { + h_second_lut_indexes[index] = index_in_grouping + grouping_size; + } + + bool may_have_its_padding_bit_set = + !is_in_first_grouping && (index_in_grouping == grouping_size - 1); + + if (may_have_its_padding_bit_set) { + if (use_sequential_algorithm_to_resolver_group_carries) { + h_scalar_array_cum_sum[index] = + 1 << ((grouping_index - 1) % (grouping_size - 1)); + } else { + h_scalar_array_cum_sum[index] = 1; + } + } else { + h_scalar_array_cum_sum[index] = 0; + } + } + + // copy the indexes to the gpu + Torus *second_lut_indexes = + luts_array_second_step->get_lut_indexes(gpu_indexes[0], 0); + cuda_memcpy_async_to_gpu(second_lut_indexes, h_second_lut_indexes, + lut_indexes_size, streams[0], gpu_indexes[0]); + + cuda_memcpy_async_to_gpu(scalar_array_cum_sum, h_scalar_array_cum_sum, + num_radix_blocks * sizeof(Torus), streams[0], + gpu_indexes[0]); + luts_array_second_step->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]); + + if (use_sequential_algorithm_to_resolver_group_carries) { + + seq_group_prop_mem = new int_seq_group_prop_memory( + streams, gpu_indexes, gpu_count, params, grouping_size, + big_lwe_size_bytes, true); + + } else { + hs_group_prop_mem = new int_hs_group_prop_memory( + streams, gpu_indexes, gpu_count, params, num_groups, + big_lwe_size_bytes, true); + } + + free(h_scalar_array_cum_sum); + free(h_second_lut_indexes); + }; + + // needed for the division to update the lut indexes + void update_lut_indexes(cudaStream_t const *streams, + uint32_t const *gpu_indexes, Torus *new_lut_indexes, + Torus *new_scalars, uint32_t new_num_blocks) { + Torus *lut_indexes = + luts_array_second_step->get_lut_indexes(gpu_indexes[0], 0); + cuda_memcpy_async_gpu_to_gpu(lut_indexes, new_lut_indexes, + new_num_blocks * sizeof(Torus), streams[0], + gpu_indexes[0]); + + luts_array_second_step->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]); + + cuda_memcpy_async_gpu_to_gpu(scalar_array_cum_sum, new_scalars, + new_num_blocks * sizeof(Torus), streams[0], + gpu_indexes[0]); + } + + void release(cudaStream_t const *streams, uint32_t const *gpu_indexes, + uint32_t gpu_count) { + cuda_drop_async(scalar_array_cum_sum, streams[0], gpu_indexes[0]); + cuda_drop_async(propagation_cum_sums, streams[0], gpu_indexes[0]); + cuda_drop_async(simulators, streams[0], gpu_indexes[0]); + cuda_drop_async(grouping_pgns, streams[0], 
gpu_indexes[0]); + cuda_drop_async(prepared_blocks, streams[0], gpu_indexes[0]); + cuda_drop_async(resolved_carries, streams[0], gpu_indexes[0]); + + luts_array_second_step->release(streams, gpu_indexes, gpu_count); + + if (use_sequential_algorithm_to_resolver_group_carries) { + seq_group_prop_mem->release(streams, gpu_indexes, gpu_count); + delete seq_group_prop_mem; + } else { + hs_group_prop_mem->release(streams, gpu_indexes, gpu_count); + delete hs_group_prop_mem; + } + + delete luts_array_second_step; + }; +}; + +template struct int_sc_prop_memory { + uint32_t num_many_lut; + uint32_t lut_stride; + + uint32_t group_size; + uint32_t num_groups; + Torus *output_flag; + Torus *last_lhs; + Torus *last_rhs; + int_radix_lut *lut_message_extract; + + int_radix_lut *lut_overflow_flag_prep; + int_radix_lut *lut_overflow_flag_last; + int_radix_lut *lut_carry_flag_last; + + int_shifted_blocks_and_states_memory *shifted_blocks_state_mem; + int_prop_simu_group_carries_memory *prop_simu_group_carries_mem; + + int_radix_params params; + bool use_sequential_algorithm_to_resolver_group_carries; + uint32_t requested_flag; + + uint32_t active_gpu_count; + cudaStream_t *sub_streams_1; + cudaStream_t *sub_streams_2; + + cudaEvent_t *incoming_events1; + cudaEvent_t *incoming_events2; + cudaEvent_t *outgoing_events1; + cudaEvent_t *outgoing_events2; + cudaEvent_t *outgoing_events3; + cudaEvent_t *outgoing_events4; + + int_sc_prop_memory(cudaStream_t const *streams, uint32_t const *gpu_indexes, + uint32_t gpu_count, int_radix_params params, + uint32_t num_radix_blocks, uint32_t requested_flag_in, + uint32_t uses_carry, bool allocate_gpu_memory) { + this->params = params; + auto glwe_dimension = params.glwe_dimension; + auto polynomial_size = params.polynomial_size; + auto message_modulus = params.message_modulus; + auto carry_modulus = params.carry_modulus; + auto big_lwe_size = (polynomial_size * glwe_dimension + 1); + auto big_lwe_size_bytes = big_lwe_size * sizeof(Torus); + requested_flag = requested_flag_in; + // for compute shifted blocks and block states + uint32_t block_modulus = message_modulus * carry_modulus; + uint32_t num_bits_in_block = std::log2(block_modulus); + uint32_t grouping_size = num_bits_in_block; + group_size = grouping_size; + num_groups = (num_radix_blocks + grouping_size - 1) / grouping_size; + + num_many_lut = 2; // many luts apply 2 luts + uint32_t box_size = polynomial_size / block_modulus; + lut_stride = (block_modulus / num_many_lut) * box_size; + + shifted_blocks_state_mem = new int_shifted_blocks_and_states_memory( + streams, gpu_indexes, gpu_count, params, num_radix_blocks, num_many_lut, + grouping_size, true); + + prop_simu_group_carries_mem = new int_prop_simu_group_carries_memory( + streams, gpu_indexes, gpu_count, params, num_radix_blocks, + grouping_size, num_groups, true); + + // Step 3 elements + lut_message_extract = + new int_radix_lut(streams, gpu_indexes, gpu_count, params, 1, + num_radix_blocks, allocate_gpu_memory); + // lut for the first block in the first grouping + auto f_message_extract = [message_modulus](Torus block) -> Torus { + return (block >> 1) % message_modulus; + }; + + auto extract_lut = lut_message_extract->get_lut(gpu_indexes[0], 0); + + generate_device_accumulator( + streams[0], gpu_indexes[0], extract_lut, glwe_dimension, + polynomial_size, message_modulus, carry_modulus, f_message_extract); + + lut_message_extract->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]); + + // This store a single block that with be used to store the overflow or 
+ // carry results + output_flag = (Torus *)cuda_malloc_async(big_lwe_size_bytes, streams[0], + gpu_indexes[0]); + cuda_memset_async(output_flag, 0, big_lwe_size_bytes, streams[0], + gpu_indexes[0]); + + if (requested_flag == outputFlag::FLAG_OVERFLOW) { + last_lhs = (Torus *)cuda_malloc_async(big_lwe_size_bytes, streams[0], + gpu_indexes[0]); + last_rhs = (Torus *)cuda_malloc_async(big_lwe_size_bytes, streams[0], + gpu_indexes[0]); + cuda_memset_async(last_lhs, 0, big_lwe_size_bytes, streams[0], + gpu_indexes[0]); + cuda_memset_async(last_rhs, 0, big_lwe_size_bytes, streams[0], + gpu_indexes[0]); + + // For step 1 overflow should be enable only if flag overflow + uint32_t num_bits_in_message = std::log2(message_modulus); + lut_overflow_flag_prep = new int_radix_lut( + streams, gpu_indexes, gpu_count, params, 1, 1, allocate_gpu_memory); + + auto f_overflow_fp = [num_bits_in_message](Torus lhs, + Torus rhs) -> Torus { + Torus mask = (1 << (num_bits_in_message - 1)) - 1; + Torus lhs_except_last_bit = lhs & mask; + Torus rhs_except_last_bit = rhs & mask; + Torus input_carry1 = 1; + Torus input_carry2 = 0; + + Torus output_carry1 = + ((lhs + rhs + input_carry1) >> num_bits_in_message) & 1; + Torus output_carry2 = + ((lhs + rhs + input_carry2) >> num_bits_in_message) & 1; + Torus input_carry_last_bit1 = + ((lhs_except_last_bit + rhs_except_last_bit + input_carry1) >> + (num_bits_in_message - 1)) & + 1; + Torus input_carry_last_bit2 = + ((lhs_except_last_bit + rhs_except_last_bit + input_carry2) >> + (num_bits_in_message - 1)) & + 1; + + Torus output1 = (Torus)(input_carry_last_bit1 != output_carry1); + Torus output2 = (Torus)(input_carry_last_bit2 != output_carry2); + + return output1 << 3 | output2 << 2; + }; + + auto overflow_flag_prep_lut = + lut_overflow_flag_prep->get_lut(gpu_indexes[0], 0); + + generate_device_accumulator_bivariate( + streams[0], gpu_indexes[0], overflow_flag_prep_lut, glwe_dimension, + polynomial_size, message_modulus, carry_modulus, f_overflow_fp); + + lut_overflow_flag_prep->broadcast_lut(streams, gpu_indexes, + gpu_indexes[0]); + } + + // For the final cleanup in case of overflow or carry (it seems that I can) + // It seems that this lut could be apply together with the other one but for + // now we won't do it + if (requested_flag == outputFlag::FLAG_OVERFLOW) { // Overflow case + lut_overflow_flag_last = new int_radix_lut( + streams, gpu_indexes, gpu_count, params, 1, 1, allocate_gpu_memory); + + auto f_overflow_last = [num_radix_blocks, + requested_flag_in](Torus block) -> Torus { + uint32_t position = (num_radix_blocks == 1 && + requested_flag_in == outputFlag::FLAG_OVERFLOW) + ? 
0 + : 1; + Torus input_carry = (block >> position) & 1; + Torus does_overflow_if_carry_is_1 = (block >> 3) & 1; + Torus does_overflow_if_carry_is_0 = (block >> 2) & 1; + if (input_carry == outputFlag::FLAG_OVERFLOW) { + return does_overflow_if_carry_is_1; + } else { + return does_overflow_if_carry_is_0; + } + }; + auto overflow_flag_last = + lut_overflow_flag_last->get_lut(gpu_indexes[0], 0); + + generate_device_accumulator( + streams[0], gpu_indexes[0], overflow_flag_last, glwe_dimension, + polynomial_size, message_modulus, carry_modulus, f_overflow_last); + + lut_overflow_flag_last->broadcast_lut(streams, gpu_indexes, + gpu_indexes[0]); + } + if (requested_flag == outputFlag::FLAG_CARRY) { // Carry case + lut_carry_flag_last = new int_radix_lut( + streams, gpu_indexes, gpu_count, params, 1, 1, allocate_gpu_memory); + + auto f_carry_last = [](Torus block) -> Torus { + return ((block >> 2) & 1); + }; + auto carry_flag_last = lut_carry_flag_last->get_lut(gpu_indexes[0], 0); + + generate_device_accumulator( + streams[0], gpu_indexes[0], carry_flag_last, glwe_dimension, + polynomial_size, message_modulus, carry_modulus, f_carry_last); + + lut_carry_flag_last->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]); + } + + active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count); + sub_streams_1 = + (cudaStream_t *)malloc(active_gpu_count * sizeof(cudaStream_t)); + sub_streams_2 = + (cudaStream_t *)malloc(active_gpu_count * sizeof(cudaStream_t)); + for (uint j = 0; j < active_gpu_count; j++) { + sub_streams_1[j] = cuda_create_stream(gpu_indexes[j]); + sub_streams_2[j] = cuda_create_stream(gpu_indexes[j]); + } + + incoming_events1 = + (cudaEvent_t *)malloc(active_gpu_count * sizeof(cudaEvent_t)); + incoming_events2 = + (cudaEvent_t *)malloc(active_gpu_count * sizeof(cudaEvent_t)); + outgoing_events1 = + (cudaEvent_t *)malloc(active_gpu_count * sizeof(cudaEvent_t)); + outgoing_events2 = + (cudaEvent_t *)malloc(active_gpu_count * sizeof(cudaEvent_t)); + outgoing_events3 = + (cudaEvent_t *)malloc(active_gpu_count * sizeof(cudaEvent_t)); + outgoing_events4 = + (cudaEvent_t *)malloc(active_gpu_count * sizeof(cudaEvent_t)); + + for (uint j = 0; j < active_gpu_count; j++) { + incoming_events1[j] = cuda_create_event(gpu_indexes[j]); + incoming_events2[j] = cuda_create_event(gpu_indexes[j]); + outgoing_events1[j] = cuda_create_event(gpu_indexes[j]); + outgoing_events2[j] = cuda_create_event(gpu_indexes[j]); + outgoing_events3[j] = cuda_create_event(gpu_indexes[j]); + outgoing_events4[j] = cuda_create_event(gpu_indexes[j]); + } + }; + + void release(cudaStream_t const *streams, uint32_t const *gpu_indexes, + uint32_t gpu_count) { + + shifted_blocks_state_mem->release(streams, gpu_indexes, gpu_count); + prop_simu_group_carries_mem->release(streams, gpu_indexes, gpu_count); + cuda_drop_async(output_flag, streams[0], gpu_indexes[0]); + lut_message_extract->release(streams, gpu_indexes, gpu_count); + delete lut_message_extract; + + if (requested_flag == outputFlag::FLAG_OVERFLOW) { // In case of overflow + lut_overflow_flag_prep->release(streams, gpu_indexes, gpu_count); + lut_overflow_flag_last->release(streams, gpu_indexes, gpu_count); + delete lut_overflow_flag_prep; + delete lut_overflow_flag_last; + cuda_drop_async(last_lhs, streams[0], gpu_indexes[0]); + cuda_drop_async(last_rhs, streams[0], gpu_indexes[0]); + } + if (requested_flag == outputFlag::FLAG_CARRY) { // In case of carry + lut_carry_flag_last->release(streams, gpu_indexes, gpu_count); + delete lut_carry_flag_last; + } + + // release 
sub streams + for (uint i = 0; i < active_gpu_count; i++) { + cuda_destroy_stream(sub_streams_1[i], gpu_indexes[i]); + cuda_destroy_stream(sub_streams_2[i], gpu_indexes[i]); + } + free(sub_streams_1); + free(sub_streams_2); + + // release events + for (uint j = 0; j < active_gpu_count; j++) { + cuda_event_destroy(incoming_events1[j], gpu_indexes[j]); + cuda_event_destroy(incoming_events2[j], gpu_indexes[j]); + cuda_event_destroy(outgoing_events1[j], gpu_indexes[j]); + cuda_event_destroy(outgoing_events2[j], gpu_indexes[j]); + cuda_event_destroy(outgoing_events3[j], gpu_indexes[j]); + cuda_event_destroy(outgoing_events4[j], gpu_indexes[j]); + } + free(incoming_events1); + free(incoming_events2); + free(outgoing_events1); + free(outgoing_events2); + free(outgoing_events3); + free(outgoing_events4); + }; +}; + +template struct int_shifted_blocks_and_borrow_states_memory { + Torus *shifted_blocks_and_borrow_states; + Torus *shifted_blocks; + Torus *borrow_states; + + int_radix_lut *luts_array_first_step; + + int_shifted_blocks_and_borrow_states_memory( + cudaStream_t const *streams, uint32_t const *gpu_indexes, + uint32_t gpu_count, int_radix_params params, uint32_t num_radix_blocks, + uint32_t num_many_lut, uint32_t grouping_size, bool allocate_gpu_memory) { + + auto glwe_dimension = params.glwe_dimension; + auto polynomial_size = params.polynomial_size; + auto message_modulus = params.message_modulus; + auto carry_modulus = params.carry_modulus; + auto big_lwe_size = (polynomial_size * glwe_dimension + 1); + auto big_lwe_size_bytes = big_lwe_size * sizeof(Torus); + + shifted_blocks_and_borrow_states = (Torus *)cuda_malloc_async( + num_many_lut * num_radix_blocks * big_lwe_size_bytes, streams[0], + gpu_indexes[0]); + cuda_memset_async(shifted_blocks_and_borrow_states, 0, + num_many_lut * num_radix_blocks * big_lwe_size_bytes, + streams[0], gpu_indexes[0]); + shifted_blocks = (Torus *)cuda_malloc_async( + num_radix_blocks * big_lwe_size_bytes, streams[0], gpu_indexes[0]); + cuda_memset_async(shifted_blocks, 0, num_radix_blocks * big_lwe_size_bytes, + streams[0], gpu_indexes[0]); + borrow_states = (Torus *)cuda_malloc_async( + num_radix_blocks * big_lwe_size_bytes, streams[0], gpu_indexes[0]); + cuda_memset_async(borrow_states, 0, num_radix_blocks * big_lwe_size_bytes, + streams[0], gpu_indexes[0]); + + uint32_t num_luts_first_step = 2 * grouping_size + 1; + + luts_array_first_step = new int_radix_lut( + streams, gpu_indexes, gpu_count, params, num_luts_first_step, + num_radix_blocks, num_many_lut, allocate_gpu_memory); + + auto f_shift_block = [message_modulus](Torus block) -> Torus { + uint64_t overflow_guard = message_modulus; + uint64_t block_mod = block % message_modulus; + return (overflow_guard | block_mod) << 1; + }; + + auto f_first_block_state = [message_modulus](Torus block) -> Torus { + if (block < message_modulus) + return 1; // Borrows + else { + return 0; // Nothing + } + }; + std::vector> f_first_grouping_luts = { + f_first_block_state, f_shift_block}; + + auto first_block_lut = luts_array_first_step->get_lut(gpu_indexes[0], 0); + + generate_many_lut_device_accumulator( + streams[0], gpu_indexes[0], first_block_lut, glwe_dimension, + polynomial_size, message_modulus, carry_modulus, f_first_grouping_luts); + + // luts for other blocks of the first grouping + for (int lut_id = 1; lut_id < grouping_size; lut_id++) { + auto f_state = [message_modulus, lut_id](Torus block) -> Torus { + uint64_t r = 0; + if (block < message_modulus) { + r = 2; // Borrows + } else if (block == 
message_modulus) { + r = 1; // Propagates a borrow + } else { + r = 0; // Does not borrow + } + return r << (lut_id - 1); + }; + std::vector> f_grouping_luts = { + f_state, f_shift_block}; + auto lut = luts_array_first_step->get_lut(gpu_indexes[0], lut_id); + generate_many_lut_device_accumulator( + streams[0], gpu_indexes[0], lut, glwe_dimension, polynomial_size, + message_modulus, carry_modulus, f_grouping_luts); + } + + // luts for the rest of groupings (except for the last block) + for (int i = 0; i < grouping_size; i++) { + uint32_t lut_id = i + grouping_size; + auto f_state = [message_modulus, i](Torus block) -> Torus { + uint64_t r = 0; + if (block < message_modulus) { + r = 2; // Generates borrow + } else if (block == message_modulus) { + r = 1; // Propagates a borrow + } else { + r = 0; // Does not borrow + } + return r << i; + }; + std::vector> f_grouping_luts = { + f_state, f_shift_block}; + + auto lut = luts_array_first_step->get_lut(gpu_indexes[0], lut_id); + + generate_many_lut_device_accumulator( + streams[0], gpu_indexes[0], lut, glwe_dimension, polynomial_size, + message_modulus, carry_modulus, f_grouping_luts); + } + + auto f_last_block_state = [message_modulus](Torus block) -> Torus { + if (block < message_modulus) + return 2 << 1; // Generates a borrow + else + return 0; // Nothing + }; + + uint32_t lut_id = num_luts_first_step - 1; // The last lut of the first step + + auto last_block_lut = + luts_array_first_step->get_lut(gpu_indexes[0], lut_id); + + std::vector> f_last_grouping_luts = { + f_last_block_state, f_shift_block}; + + generate_many_lut_device_accumulator( + streams[0], gpu_indexes[0], last_block_lut, glwe_dimension, + polynomial_size, message_modulus, carry_modulus, f_last_grouping_luts); + + // Generate the indexes to switch between luts within the pbs + Torus lut_indexes_size = num_radix_blocks * sizeof(Torus); + Torus *h_lut_indexes = (Torus *)malloc(lut_indexes_size); + + for (int index = 0; index < num_radix_blocks; index++) { + uint32_t grouping_index = index / grouping_size; + bool is_in_first_grouping = (grouping_index == 0); + uint32_t index_in_grouping = index % grouping_size; + bool is_last_index = (index == (num_radix_blocks - 1)); + if (is_last_index) { + if (num_radix_blocks == 1) { + h_lut_indexes[index] = 2 * grouping_size; + } else { + h_lut_indexes[index] = 2; + } + } else if (is_in_first_grouping) { + h_lut_indexes[index] = index_in_grouping; + } else { + h_lut_indexes[index] = index_in_grouping + grouping_size; + } + } + // copy the indexes to the gpu + Torus *lut_indexes = + luts_array_first_step->get_lut_indexes(gpu_indexes[0], 0); + cuda_memcpy_async_to_gpu(lut_indexes, h_lut_indexes, lut_indexes_size, + streams[0], gpu_indexes[0]); + // Do I need to do something else for the multi-gpu? 
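Note for reviewers: the first-step many-LUT used in this constructor packs two results per PBS for each block of the subtraction, the block shifted left by one (with an overflow guard) and a borrow-state flag whose bit position depends on the block's index inside its grouping. A minimal standalone sketch of that encoding, mirroring the lambdas above (illustration only, not part of the patch; the parameter values in main are assumptions):

#include <cstdint>
#include <cstdio>

// Mirrors f_shift_block above: keep the message bits, OR in an overflow
// guard (message_modulus), then shift left by one so the low bit is free
// for the borrow bookkeeping.
static uint64_t shift_block(uint64_t block, uint64_t message_modulus) {
  return (message_modulus | (block % message_modulus)) << 1;
}

// Mirrors f_state above for blocks that are neither the first nor the last
// of the radix: 2 = generates a borrow, 1 = propagates a borrow, 0 = neither,
// packed at a bit position derived from the block's index in its grouping.
static uint64_t borrow_state(uint64_t block, uint64_t message_modulus,
                             uint32_t state_bit_position) {
  uint64_t r;
  if (block < message_modulus)
    r = 2;
  else if (block == message_modulus)
    r = 1;
  else
    r = 0;
  return r << state_bit_position;
}

int main() {
  const uint64_t message_modulus = 4; // e.g. 2_2 parameters (assumption)
  for (uint64_t block = 0; block < 2 * message_modulus; ++block)
    std::printf("block=%llu shifted=%llu state=%llu\n",
                (unsigned long long)block,
                (unsigned long long)shift_block(block, message_modulus),
                (unsigned long long)borrow_state(block, message_modulus, 0));
  return 0;
}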
+ + luts_array_first_step->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]); + + free(h_lut_indexes); + }; + // needed for the division to update the lut indexes + void update_lut_indexes(cudaStream_t const *streams, + uint32_t const *gpu_indexes, Torus *new_lut_indexes, + uint32_t new_num_blocks) { + Torus *lut_indexes = + luts_array_first_step->get_lut_indexes(gpu_indexes[0], 0); + cuda_memcpy_async_gpu_to_gpu(lut_indexes, new_lut_indexes, + new_num_blocks * sizeof(Torus), streams[0], + gpu_indexes[0]); + luts_array_first_step->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]); + } void release(cudaStream_t const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count) { - cuda_drop_async(generates_or_propagates, streams[0], gpu_indexes[0]); - cuda_drop_async(step_output, streams[0], gpu_indexes[0]); - luts_array->release(streams, gpu_indexes, gpu_count); - luts_borrow_propagation_sum->release(streams, gpu_indexes, gpu_count); - message_acc->release(streams, gpu_indexes, gpu_count); + cuda_drop_async(shifted_blocks_and_borrow_states, streams[0], + gpu_indexes[0]); + cuda_drop_async(shifted_blocks, streams[0], gpu_indexes[0]); + cuda_drop_async(borrow_states, streams[0], gpu_indexes[0]); - delete luts_array; - delete luts_borrow_propagation_sum; - delete message_acc; - } + luts_array_first_step->release(streams, gpu_indexes, gpu_count); + delete luts_array_first_step; + }; }; -template struct int_sum_ciphertexts_vec_memory { - Torus *new_blocks; - Torus *new_blocks_copy; - Torus *old_blocks; - Torus *small_lwe_vector; +template struct int_borrow_prop_memory { + uint32_t num_many_lut; + uint32_t lut_stride; + + uint32_t group_size; + uint32_t num_groups; + Torus *overflow_block; + + int_radix_lut *lut_message_extract; + int_radix_lut *lut_borrow_flag; + + int_shifted_blocks_and_borrow_states_memory + *shifted_blocks_borrow_state_mem; + int_prop_simu_group_carries_memory *prop_simu_group_carries_mem; + int_radix_params params; - int_sc_prop_memory *scp_mem; - int32_t *d_smart_copy_in; - int32_t *d_smart_copy_out; + uint32_t active_gpu_count; + cudaStream_t *sub_streams_1; + cudaStream_t *sub_streams_2; - bool mem_reuse = false; + cudaEvent_t *incoming_events; + cudaEvent_t *outgoing_events1; + cudaEvent_t *outgoing_events2; - int_sum_ciphertexts_vec_memory(cudaStream_t const *streams, - uint32_t const *gpu_indexes, - uint32_t gpu_count, int_radix_params params, - uint32_t num_blocks_in_radix, - uint32_t max_num_radix_in_vec, - bool allocate_gpu_memory) { + uint32_t compute_overflow; + int_borrow_prop_memory(cudaStream_t const *streams, + uint32_t const *gpu_indexes, uint32_t gpu_count, + int_radix_params params, uint32_t num_radix_blocks, + uint32_t compute_overflow_in, + bool allocate_gpu_memory) { this->params = params; + auto glwe_dimension = params.glwe_dimension; + auto polynomial_size = params.polynomial_size; + auto message_modulus = params.message_modulus; + auto carry_modulus = params.carry_modulus; + auto big_lwe_size = (polynomial_size * glwe_dimension + 1); + auto big_lwe_size_bytes = big_lwe_size * sizeof(Torus); + compute_overflow = compute_overflow_in; + // for compute shifted blocks and block states + uint32_t block_modulus = message_modulus * carry_modulus; + uint32_t num_bits_in_block = std::log2(block_modulus); + uint32_t grouping_size = num_bits_in_block; + group_size = grouping_size; + num_groups = (num_radix_blocks + grouping_size - 1) / grouping_size; + + num_many_lut = 2; // many luts apply 2 luts + uint32_t box_size = polynomial_size / block_modulus; + 
lut_stride = (block_modulus / num_many_lut) * box_size; + + shifted_blocks_borrow_state_mem = + new int_shifted_blocks_and_borrow_states_memory( + streams, gpu_indexes, gpu_count, params, num_radix_blocks, + num_many_lut, grouping_size, true); + + prop_simu_group_carries_mem = new int_prop_simu_group_carries_memory( + streams, gpu_indexes, gpu_count, params, num_radix_blocks, + grouping_size, num_groups, true); - // create single carry propagation memory object - scp_mem = - new int_sc_prop_memory(streams, gpu_indexes, gpu_count, params, - num_blocks_in_radix, allocate_gpu_memory); - int max_pbs_count = num_blocks_in_radix * max_num_radix_in_vec; + overflow_block = (Torus *)cuda_malloc_async(big_lwe_size_bytes, streams[0], + gpu_indexes[0]); + cuda_memset_async(overflow_block, 0, big_lwe_size_bytes, streams[0], + gpu_indexes[0]); - // allocate gpu memory for intermediate buffers - new_blocks = (Torus *)cuda_malloc_async( - max_pbs_count * (params.big_lwe_dimension + 1) * sizeof(Torus), - streams[0], gpu_indexes[0]); - new_blocks_copy = (Torus *)cuda_malloc_async( - max_pbs_count * (params.big_lwe_dimension + 1) * sizeof(Torus), - streams[0], gpu_indexes[0]); - old_blocks = (Torus *)cuda_malloc_async( - max_pbs_count * (params.big_lwe_dimension + 1) * sizeof(Torus), - streams[0], gpu_indexes[0]); - small_lwe_vector = (Torus *)cuda_malloc_async( - max_pbs_count * (params.small_lwe_dimension + 1) * sizeof(Torus), - streams[0], gpu_indexes[0]); - cuda_memset_async(new_blocks, 0, - max_pbs_count * (params.big_lwe_dimension + 1) * - sizeof(Torus), - streams[0], gpu_indexes[0]); - cuda_memset_async(new_blocks_copy, 0, - max_pbs_count * (params.big_lwe_dimension + 1) * - sizeof(Torus), - streams[0], gpu_indexes[0]); - cuda_memset_async(old_blocks, 0, - max_pbs_count * (params.big_lwe_dimension + 1) * - sizeof(Torus), - streams[0], gpu_indexes[0]); - cuda_memset_async(small_lwe_vector, 0, - max_pbs_count * (params.small_lwe_dimension + 1) * - sizeof(Torus), - streams[0], gpu_indexes[0]); + lut_message_extract = + new int_radix_lut(streams, gpu_indexes, gpu_count, params, 1, + num_radix_blocks, allocate_gpu_memory); + // lut for the first block in the first grouping + auto f_message_extract = [message_modulus](Torus block) -> Torus { + return (block >> 1) % message_modulus; + }; - d_smart_copy_in = (int32_t *)cuda_malloc_async( - max_pbs_count * sizeof(int32_t), streams[0], gpu_indexes[0]); - d_smart_copy_out = (int32_t *)cuda_malloc_async( - max_pbs_count * sizeof(int32_t), streams[0], gpu_indexes[0]); - cuda_memset_async(d_smart_copy_in, 0, max_pbs_count * sizeof(int32_t), - streams[0], gpu_indexes[0]); - cuda_memset_async(d_smart_copy_out, 0, max_pbs_count * sizeof(int32_t), - streams[0], gpu_indexes[0]); - } + auto extract_lut = lut_message_extract->get_lut(gpu_indexes[0], 0); - int_sum_ciphertexts_vec_memory(cudaStream_t const *streams, - uint32_t const *gpu_indexes, - uint32_t gpu_count, int_radix_params params, - uint32_t num_blocks_in_radix, - uint32_t max_num_radix_in_vec, - Torus *new_blocks, Torus *old_blocks, - Torus *small_lwe_vector) { - mem_reuse = true; - this->params = params; + generate_device_accumulator( + streams[0], gpu_indexes[0], extract_lut, glwe_dimension, + polynomial_size, message_modulus, carry_modulus, f_message_extract); - // create single carry propagation memory object - scp_mem = new int_sc_prop_memory(streams, gpu_indexes, gpu_count, - params, num_blocks_in_radix, true); - int max_pbs_count = num_blocks_in_radix * max_num_radix_in_vec; + 
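[Reviewer note] As I read the layout described later in this patch (shifted blocks hold (block % message_modulus) << 1, and the resolved carry/borrow is added into bit 0), the message-extract LUT above simply undoes that shift: (block >> 1) % message_modulus. A minimal check of that reading, under those assumptions:

    #include <cassert>
    #include <cstdint>

    int main() {
      const uint64_t message_modulus = 4; // assumed 2_2 parameters
      auto f_message_extract = [&](uint64_t block) -> uint64_t {
        return (block >> 1) % message_modulus;
      };
      // block = (msg << 1) + incoming carry/borrow bit (assumed layout)
      for (uint64_t msg = 0; msg < message_modulus; msg++)
        for (uint64_t bit = 0; bit <= 1; bit++)
          assert(f_message_extract((msg << 1) + bit) == msg);
      return 0;
    }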
lut_message_extract->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]); - // assign gpu memory for intermediate buffers - this->new_blocks = new_blocks; - this->old_blocks = old_blocks; - this->small_lwe_vector = small_lwe_vector; - new_blocks_copy = (Torus *)cuda_malloc_async( - max_pbs_count * (params.big_lwe_dimension + 1) * sizeof(Torus), - streams[0], gpu_indexes[0]); - cuda_memset_async(new_blocks_copy, 0, - max_pbs_count * (params.big_lwe_dimension + 1) * - sizeof(Torus), - streams[0], gpu_indexes[0]); + if (compute_overflow) { + lut_borrow_flag = + new int_radix_lut(streams, gpu_indexes, gpu_count, params, 1, + num_radix_blocks, allocate_gpu_memory); + // lut for the first block in the first grouping + auto f_borrow_flag = [](Torus block) -> Torus { + return ((block >> 2) & 1); + }; - d_smart_copy_in = (int32_t *)cuda_malloc_async( - max_pbs_count * sizeof(int32_t), streams[0], gpu_indexes[0]); - d_smart_copy_out = (int32_t *)cuda_malloc_async( - max_pbs_count * sizeof(int32_t), streams[0], gpu_indexes[0]); - cuda_memset_async(d_smart_copy_in, 0, max_pbs_count * sizeof(int32_t), - streams[0], gpu_indexes[0]); - cuda_memset_async(d_smart_copy_out, 0, max_pbs_count * sizeof(int32_t), - streams[0], gpu_indexes[0]); - } + auto borrow_flag_lut = lut_borrow_flag->get_lut(gpu_indexes[0], 0); + + generate_device_accumulator( + streams[0], gpu_indexes[0], borrow_flag_lut, glwe_dimension, + polynomial_size, message_modulus, carry_modulus, f_borrow_flag); + + lut_borrow_flag->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]); + } + + active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count); + sub_streams_1 = + (cudaStream_t *)malloc(active_gpu_count * sizeof(cudaStream_t)); + sub_streams_2 = + (cudaStream_t *)malloc(active_gpu_count * sizeof(cudaStream_t)); + for (uint j = 0; j < active_gpu_count; j++) { + sub_streams_1[j] = cuda_create_stream(gpu_indexes[j]); + sub_streams_2[j] = cuda_create_stream(gpu_indexes[j]); + } + + incoming_events = + (cudaEvent_t *)malloc(active_gpu_count * sizeof(cudaEvent_t)); + outgoing_events1 = + (cudaEvent_t *)malloc(active_gpu_count * sizeof(cudaEvent_t)); + outgoing_events2 = + (cudaEvent_t *)malloc(active_gpu_count * sizeof(cudaEvent_t)); + for (uint j = 0; j < active_gpu_count; j++) { + incoming_events[j] = cuda_create_event(gpu_indexes[j]); + outgoing_events1[j] = cuda_create_event(gpu_indexes[j]); + outgoing_events2[j] = cuda_create_event(gpu_indexes[j]); + } + }; + // needed for the division to update the lut indexes + void update_lut_indexes(cudaStream_t const *streams, + uint32_t const *gpu_indexes, + Torus *first_indexes_for_div, + Torus *second_indexes_for_div, Torus *scalars_for_div, + uint32_t new_num_blocks) { + shifted_blocks_borrow_state_mem->update_lut_indexes( + streams, gpu_indexes, first_indexes_for_div, new_num_blocks); + prop_simu_group_carries_mem->update_lut_indexes( + streams, gpu_indexes, second_indexes_for_div, scalars_for_div, + new_num_blocks); + } void release(cudaStream_t const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count) { - cuda_drop_async(d_smart_copy_in, streams[0], gpu_indexes[0]); - cuda_drop_async(d_smart_copy_out, streams[0], gpu_indexes[0]); - if (!mem_reuse) { - cuda_drop_async(new_blocks, streams[0], gpu_indexes[0]); - cuda_drop_async(old_blocks, streams[0], gpu_indexes[0]); - cuda_drop_async(small_lwe_vector, streams[0], gpu_indexes[0]); + shifted_blocks_borrow_state_mem->release(streams, gpu_indexes, gpu_count); + prop_simu_group_carries_mem->release(streams, gpu_indexes, gpu_count); + 
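[Reviewer note] The sub_streams and the incoming/outgoing event arrays allocated above look like the plumbing for a fork/join pattern: record an event on the main stream, have both sub-streams wait on it, run the message-extract and flag paths concurrently, then have the main stream wait on one outgoing event per sub-stream. The actual call sites live elsewhere in the patch, so treat the sketch below as an assumption about intended use (single-GPU case, using the helpers this patch adds to device.h):

    #include <cstdint>
    #include "device.h" // declares cuda_event_record / cuda_stream_wait_event

    // Hypothetical fork/join on one GPU (illustrative only).
    void fork_join_example(cudaStream_t main_stream, cudaStream_t sub_stream_1,
                           cudaStream_t sub_stream_2, cudaEvent_t incoming,
                           cudaEvent_t outgoing1, cudaEvent_t outgoing2,
                           uint32_t gpu_index) {
      // fork: both sub-streams wait for work already queued on the main stream
      cuda_event_record(incoming, main_stream, gpu_index);
      cuda_stream_wait_event(sub_stream_1, incoming, gpu_index);
      cuda_stream_wait_event(sub_stream_2, incoming, gpu_index);

      // ... enqueue message extraction on sub_stream_1 and the
      //     overflow/borrow flag computation on sub_stream_2 ...

      // join: the main stream resumes only after both sub-streams are done
      cuda_event_record(outgoing1, sub_stream_1, gpu_index);
      cuda_event_record(outgoing2, sub_stream_2, gpu_index);
      cuda_stream_wait_event(main_stream, outgoing1, gpu_index);
      cuda_stream_wait_event(main_stream, outgoing2, gpu_index);
    }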
cuda_drop_async(overflow_block, streams[0], gpu_indexes[0]); + + lut_message_extract->release(streams, gpu_indexes, gpu_count); + delete lut_message_extract; + if (compute_overflow) { + lut_borrow_flag->release(streams, gpu_indexes, gpu_count); + delete lut_borrow_flag; } - cuda_drop_async(new_blocks_copy, streams[0], gpu_indexes[0]); - scp_mem->release(streams, gpu_indexes, gpu_count); - delete scp_mem; - } + // The substreams have to be synchronized before destroying events + cuda_synchronize_stream(streams[0], gpu_indexes[0]); + + // release events + for (uint j = 0; j < active_gpu_count; j++) { + cuda_event_destroy(incoming_events[j], gpu_indexes[j]); + cuda_event_destroy(outgoing_events1[j], gpu_indexes[j]); + cuda_event_destroy(outgoing_events2[j], gpu_indexes[j]); + } + free(incoming_events); + free(outgoing_events1); + free(outgoing_events2); + + // release sub streams + for (uint i = 0; i < active_gpu_count; i++) { + cuda_destroy_stream(sub_streams_1[i], gpu_indexes[i]); + cuda_destroy_stream(sub_streams_2[i], gpu_indexes[i]); + } + free(sub_streams_1); + free(sub_streams_2); + }; }; template struct int_zero_out_if_buffer { @@ -1170,6 +2460,7 @@ template struct int_mul_memory { int_radix_lut *zero_out_predicate_lut; int_sum_ciphertexts_vec_memory *sum_ciphertexts_mem; + int_sc_prop_memory *sc_prop_mem; int_zero_out_if_buffer *zero_out_mem; int_radix_params params; @@ -1276,6 +2567,11 @@ template struct int_mul_memory { streams, gpu_indexes, gpu_count, params, num_radix_blocks, 2 * num_radix_blocks, block_mul_res, vector_result_sb, small_lwe_vector); + uint32_t uses_carry = 0; + uint32_t requested_flag = outputFlag::FLAG_NONE; + sc_prop_mem = new int_sc_prop_memory( + streams, gpu_indexes, gpu_count, params, num_radix_blocks, + requested_flag, uses_carry, allocate_gpu_memory); } void release(cudaStream_t const *streams, uint32_t const *gpu_indexes, @@ -1295,9 +2591,11 @@ template struct int_mul_memory { luts_array->release(streams, gpu_indexes, gpu_count); sum_ciphertexts_mem->release(streams, gpu_indexes, gpu_count); + sc_prop_mem->release(streams, gpu_indexes, gpu_count); delete luts_array; delete sum_ciphertexts_mem; + delete sc_prop_mem; } }; @@ -2313,7 +3611,7 @@ template struct unsigned_int_div_rem_memory { // memory objects for other operations int_logical_scalar_shift_buffer *shift_mem_1; int_logical_scalar_shift_buffer *shift_mem_2; - int_overflowing_sub_memory *overflow_sub_mem; + int_borrow_prop_memory *overflow_sub_mem; int_comparison_buffer *comparison_buffer; // lookup tables @@ -2350,6 +3648,11 @@ template struct unsigned_int_div_rem_memory { Torus *at_least_one_upper_block_is_non_zero; Torus *cleaned_merged_interesting_remainder; + Torus **first_indexes_for_overflow_sub; + Torus **second_indexes_for_overflow_sub; + Torus **scalars_for_overflow_sub; + uint32_t max_indexes_to_erase; + // allocate and initialize if needed, temporary arrays used to calculate // cuda integer div_rem operation void init_temporary_buffers(cudaStream_t const *streams, @@ -2558,8 +3861,15 @@ template struct unsigned_int_div_rem_memory { streams, gpu_indexes, gpu_count, SHIFT_OR_ROTATE_TYPE::LEFT_SHIFT, params, 2 * num_blocks, true); - overflow_sub_mem = new int_overflowing_sub_memory( - streams, gpu_indexes, gpu_count, params, num_blocks, true); + uint32_t compute_overflow = 1; + overflow_sub_mem = new int_borrow_prop_memory( + streams, gpu_indexes, gpu_count, params, num_blocks, compute_overflow, + true); + uint32_t group_size = overflow_sub_mem->group_size; + bool use_seq = 
overflow_sub_mem->prop_simu_group_carries_mem + ->use_sequential_algorithm_to_resolver_group_carries; + create_indexes_for_overflow_sub(streams, gpu_indexes, num_blocks, + group_size, use_seq); comparison_buffer = new int_comparison_buffer( streams, gpu_indexes, gpu_count, COMPARISON_TYPE::NE, params, @@ -2584,6 +3894,94 @@ template struct unsigned_int_div_rem_memory { } } + void create_indexes_for_overflow_sub(cudaStream_t const *streams, + uint32_t const *gpu_indexes, + uint32_t num_blocks, uint32_t group_size, + bool use_seq) { + max_indexes_to_erase = num_blocks; + + first_indexes_for_overflow_sub = + (Torus **)malloc(num_blocks * sizeof(Torus *)); + second_indexes_for_overflow_sub = + (Torus **)malloc(num_blocks * sizeof(Torus *)); + scalars_for_overflow_sub = (Torus **)malloc(num_blocks * sizeof(Torus *)); + + Torus *h_lut_indexes = (Torus *)malloc(num_blocks * sizeof(Torus)); + Torus *h_scalar = (Torus *)malloc(num_blocks * sizeof(Torus)); + + // Extra indexes for the luts in first step + for (int nb = 1; nb <= num_blocks; nb++) { + cudaMalloc((void **)&first_indexes_for_overflow_sub[nb - 1], + nb * sizeof(Torus)); + for (int index = 0; index < nb; index++) { + uint32_t grouping_index = index / group_size; + bool is_in_first_grouping = (grouping_index == 0); + uint32_t index_in_grouping = index % group_size; + bool is_last_index = (index == (nb - 1)); + if (is_last_index) { + if (nb == 1) { + h_lut_indexes[index] = 2 * group_size; + } else { + h_lut_indexes[index] = 2; + } + } else if (is_in_first_grouping) { + h_lut_indexes[index] = index_in_grouping; + } else { + h_lut_indexes[index] = index_in_grouping + group_size; + } + } + cuda_memcpy_async_to_gpu(first_indexes_for_overflow_sub[nb - 1], + h_lut_indexes, nb * sizeof(Torus), streams[0], + gpu_indexes[0]); + } + // Extra indexes for the luts in second step + for (int nb = 1; nb <= num_blocks; nb++) { + cudaMalloc((void **)&second_indexes_for_overflow_sub[nb - 1], + nb * sizeof(Torus)); + cudaMalloc((void **)&scalars_for_overflow_sub[nb - 1], + nb * sizeof(Torus)); + + for (int index = 0; index < nb; index++) { + uint32_t grouping_index = index / group_size; + bool is_in_first_grouping = (grouping_index == 0); + uint32_t index_in_grouping = index % group_size; + + if (is_in_first_grouping) { + h_lut_indexes[index] = index_in_grouping; + } else if (index_in_grouping == (group_size - 1)) { + if (use_seq) { + int inner_index = (grouping_index - 1) % (group_size - 1); + h_lut_indexes[index] = inner_index + 2 * group_size; + } else { + h_lut_indexes[index] = 2 * group_size; + } + } else { + h_lut_indexes[index] = index_in_grouping + group_size; + } + + bool may_have_its_padding_bit_set = + !is_in_first_grouping && (index_in_grouping == group_size - 1); + + if (may_have_its_padding_bit_set) { + if (use_seq) { + h_scalar[index] = 1 << ((grouping_index - 1) % (group_size - 1)); + } else { + h_scalar[index] = 1; + } + } else { + h_scalar[index] = 0; + } + } + cuda_memcpy_async_to_gpu(second_indexes_for_overflow_sub[nb - 1], + h_lut_indexes, nb * sizeof(Torus), streams[0], + gpu_indexes[0]); + cuda_memcpy_async_to_gpu(scalars_for_overflow_sub[nb - 1], h_scalar, + nb * sizeof(Torus), streams[0], gpu_indexes[0]); + } + free(h_lut_indexes); + free(h_scalar); + }; + void release(cudaStream_t const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count) { uint32_t num_bits_in_message = 31 - __builtin_clz(params.message_modulus); @@ -2678,6 +4076,17 @@ template struct unsigned_int_div_rem_memory { gpu_indexes[0]); 
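[Reviewer note] To make the two index tables and the scalar table above concrete: for num_blocks = 8, group_size = 4, use_seq = false (an illustrative configuration, not taken from the patch), the rules evaluate to first-step indexes {0,1,2,3,4,5,6,2}, second-step indexes {0,1,2,3,4,5,6,8}, and scalars {0,0,0,0,0,0,0,1}. The hypothetical helper below restates the second-step rules for a single block:

    #include <cstdint>
    #include <utility>

    // Mirrors the second-step loop above (illustrative only, not part of the patch).
    std::pair<uint64_t, uint64_t>
    second_step_index_and_scalar(uint32_t index, uint32_t group_size, bool use_seq) {
      uint32_t grouping_index = index / group_size;
      uint32_t in_group = index % group_size;
      bool first_grouping = (grouping_index == 0);

      uint64_t lut_index;
      if (first_grouping)
        lut_index = in_group;
      else if (in_group == group_size - 1)
        lut_index = use_seq ? (grouping_index - 1) % (group_size - 1) + 2 * group_size
                            : 2 * group_size;
      else
        lut_index = in_group + group_size;

      // Only the closing block of a non-first group may carry a padding bit.
      uint64_t scalar = 0;
      if (!first_grouping && in_group == group_size - 1)
        scalar = use_seq ? (uint64_t)1 << ((grouping_index - 1) % (group_size - 1))
                         : 1;

      return {lut_index, scalar};
    }
    // e.g. index 7 with group_size = 4, use_seq = false -> {8, 1}.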
cuda_drop_async(cleaned_merged_interesting_remainder, streams[0], gpu_indexes[0]); + + for (int i = 0; i < max_indexes_to_erase; i++) { + cuda_drop_async(first_indexes_for_overflow_sub[i], streams[0], + gpu_indexes[0]); + cuda_drop_async(second_indexes_for_overflow_sub[i], streams[0], + gpu_indexes[0]); + cuda_drop_async(scalars_for_overflow_sub[i], streams[0], gpu_indexes[0]); + } + free(first_indexes_for_overflow_sub); + free(second_indexes_for_overflow_sub); + free(scalars_for_overflow_sub); } }; @@ -2823,107 +4232,6 @@ template struct int_resolve_signed_overflow_memory { } }; -template struct int_signed_overflowing_add_or_sub_memory { - int_radix_params params; - uint32_t active_gpu_count; - - // memory objects for other operations - int_sc_prop_memory *scp_mem; - int_last_block_inner_propagate_memory *las_block_prop_mem; - int_resolve_signed_overflow_memory *resolve_overflow_mem; - - // sub streams - cudaStream_t *sub_streams_1; - cudaStream_t *sub_streams_2; - - // temporary device buffers - Torus *result; // num_blocks - Torus *input_carries; // num_blocks - Torus *neg_rhs; // num_blocks - Torus *output_carry; // single block - Torus *last_block_inner_propagation; // single block - - // allocate temporary arrays used to calculate - // cuda integer signed overflowing add or sub - void allocate_temporary_buffers(cudaStream_t const *streams, - uint32_t const *gpu_indexes, - uint32_t gpu_count, uint32_t num_blocks) { - uint32_t big_lwe_size = params.big_lwe_dimension + 1; - - result = (Torus *)cuda_malloc_async( - big_lwe_size * num_blocks * sizeof(Torus), streams[0], gpu_indexes[0]); - - neg_rhs = (Torus *)cuda_malloc_async( - big_lwe_size * num_blocks * sizeof(Torus), streams[0], gpu_indexes[0]); - - input_carries = (Torus *)cuda_malloc_async( - big_lwe_size * num_blocks * sizeof(Torus), streams[0], gpu_indexes[0]); - output_carry = (Torus *)cuda_malloc_async(big_lwe_size * sizeof(Torus), - streams[0], gpu_indexes[0]); - last_block_inner_propagation = (Torus *)cuda_malloc_async( - big_lwe_size * sizeof(Torus), streams[0], gpu_indexes[0]); - } - - // constructor without memory reuse - int_signed_overflowing_add_or_sub_memory( - cudaStream_t const *streams, uint32_t const *gpu_indexes, - uint32_t gpu_count, int_radix_params params, uint32_t num_blocks, - SIGNED_OPERATION op, bool allocate_gpu_memory) { - this->params = params; - active_gpu_count = get_active_gpu_count(num_blocks, gpu_count); - - allocate_temporary_buffers(streams, gpu_indexes, active_gpu_count, - num_blocks); - - // initialize streams - sub_streams_1 = - (cudaStream_t *)malloc(active_gpu_count * sizeof(cudaStream_t)); - sub_streams_2 = - (cudaStream_t *)malloc(active_gpu_count * sizeof(cudaStream_t)); - for (uint j = 0; j < active_gpu_count; j++) { - sub_streams_1[j] = cuda_create_stream(gpu_indexes[j]); - sub_streams_2[j] = cuda_create_stream(gpu_indexes[j]); - } - - // initialize memory objects for other operations - scp_mem = - new int_sc_prop_memory(streams, gpu_indexes, gpu_count, params, - num_blocks, allocate_gpu_memory); - las_block_prop_mem = new int_last_block_inner_propagate_memory( - streams, gpu_indexes, gpu_count, params, op, num_blocks, - allocate_gpu_memory); - - resolve_overflow_mem = new int_resolve_signed_overflow_memory( - streams, gpu_indexes, gpu_count, params, allocate_gpu_memory); - } - - void release(cudaStream_t const *streams, uint32_t const *gpu_indexes, - uint32_t gpu_count) { - // memory objects for other operations - scp_mem->release(streams, gpu_indexes, gpu_count); - 
las_block_prop_mem->release(streams, gpu_indexes, gpu_count); - resolve_overflow_mem->release(streams, gpu_indexes, gpu_count); - - delete scp_mem; - delete las_block_prop_mem; - delete resolve_overflow_mem; - - // temporary device buffers - cuda_drop_async(result, streams[0], gpu_indexes[0]); - cuda_drop_async(neg_rhs, streams[0], gpu_indexes[0]); - cuda_drop_async(input_carries, streams[0], gpu_indexes[0]); - cuda_drop_async(output_carry, streams[0], gpu_indexes[0]); - cuda_drop_async(last_block_inner_propagation, streams[0], gpu_indexes[0]); - - // sub streams - for (uint i = 0; i < active_gpu_count; i++) { - cuda_destroy_stream(sub_streams_1[i], gpu_indexes[i]); - cuda_destroy_stream(sub_streams_2[i], gpu_indexes[i]); - } - free(sub_streams_1); - free(sub_streams_2); - } -}; template struct int_bitop_buffer { int_radix_params params; @@ -3006,6 +4314,7 @@ template struct int_scalar_mul_buffer { int_sum_ciphertexts_vec_memory *sum_ciphertexts_vec_mem; Torus *preshifted_buffer; Torus *all_shifted_buffer; + int_sc_prop_memory *sc_prop_mem; int_scalar_mul_buffer(cudaStream_t const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count, @@ -3044,13 +4353,20 @@ template struct int_scalar_mul_buffer { sum_ciphertexts_vec_mem = new int_sum_ciphertexts_vec_memory( streams, gpu_indexes, gpu_count, params, num_radix_blocks, num_ciphertext_bits, allocate_gpu_memory); + uint32_t uses_carry = 0; + uint32_t requested_flag = outputFlag::FLAG_NONE; + sc_prop_mem = new int_sc_prop_memory( + streams, gpu_indexes, gpu_count, params, num_radix_blocks, + requested_flag, uses_carry, allocate_gpu_memory); } } void release(cudaStream_t const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count) { sum_ciphertexts_vec_mem->release(streams, gpu_indexes, gpu_count); + sc_prop_mem->release(streams, gpu_indexes, gpu_count); delete sum_ciphertexts_vec_mem; + delete sc_prop_mem; cuda_drop_async(all_shifted_buffer, streams[0], gpu_indexes[0]); } }; @@ -3074,9 +4390,11 @@ template struct int_abs_buffer { streams, gpu_indexes, gpu_count, SHIFT_OR_ROTATE_TYPE::RIGHT_SHIFT, params, num_radix_blocks, allocate_gpu_memory); - scp_mem = - new int_sc_prop_memory(streams, gpu_indexes, gpu_count, params, - num_radix_blocks, allocate_gpu_memory); + uint32_t requested_flag = outputFlag::FLAG_NONE; + uint32_t uses_carry = 0; + scp_mem = new int_sc_prop_memory( + streams, gpu_indexes, gpu_count, params, num_radix_blocks, + requested_flag, uses_carry, allocate_gpu_memory); bitxor_mem = new int_bitop_buffer( streams, gpu_indexes, gpu_count, BITOP_TYPE::BITXOR, params, num_radix_blocks, allocate_gpu_memory); @@ -3155,12 +4473,14 @@ template struct int_div_rem_memory { abs_mem_2 = new int_abs_buffer(streams, gpu_indexes, gpu_count, params, num_blocks, allocate_gpu_memory); - scp_mem_1 = - new int_sc_prop_memory(streams, gpu_indexes, gpu_count, params, - num_blocks, allocate_gpu_memory); - scp_mem_2 = - new int_sc_prop_memory(streams, gpu_indexes, gpu_count, params, - num_blocks, allocate_gpu_memory); + uint32_t requested_flag = outputFlag::FLAG_NONE; + uint32_t uses_carry = 0; + scp_mem_1 = new int_sc_prop_memory( + streams, gpu_indexes, gpu_count, params, num_blocks, requested_flag, + uses_carry, allocate_gpu_memory); + scp_mem_2 = new int_sc_prop_memory( + streams, gpu_indexes, gpu_count, params, num_blocks, requested_flag, + uses_carry, allocate_gpu_memory); std::function quotient_predicate_lut_f = [](uint64_t x) -> uint64_t { return x == 1; }; diff --git a/backends/tfhe-cuda-backend/cuda/include/linear_algebra.h 
b/backends/tfhe-cuda-backend/cuda/include/linear_algebra.h index 10c476c12b..ddc9a2a508 100644 --- a/backends/tfhe-cuda-backend/cuda/include/linear_algebra.h +++ b/backends/tfhe-cuda-backend/cuda/include/linear_algebra.h @@ -27,6 +27,7 @@ void cuda_add_lwe_ciphertext_vector_64(void *stream, uint32_t gpu_index, void const *lwe_array_in_2, uint32_t input_lwe_dimension, uint32_t input_lwe_ciphertext_count); + void cuda_add_lwe_ciphertext_vector_plaintext_vector_32( void *stream, uint32_t gpu_index, void *lwe_array_out, void const *lwe_array_in, void const *plaintext_array_in, diff --git a/backends/tfhe-cuda-backend/cuda/include/pbs/pbs_multibit_utilities.h b/backends/tfhe-cuda-backend/cuda/include/pbs/pbs_multibit_utilities.h index 350b5862f4..f0a54cca2f 100644 --- a/backends/tfhe-cuda-backend/cuda/include/pbs/pbs_multibit_utilities.h +++ b/backends/tfhe-cuda-backend/cuda/include/pbs/pbs_multibit_utilities.h @@ -28,7 +28,7 @@ void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( pbs_buffer *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log, uint32_t level_count, uint32_t num_samples, - uint32_t lut_count, uint32_t lut_stride); + uint32_t num_many_lut, uint32_t lut_stride); #endif template @@ -46,7 +46,7 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( pbs_buffer *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log, uint32_t level_count, uint32_t num_samples, - uint32_t lut_count, uint32_t lut_stride); + uint32_t num_many_lut, uint32_t lut_stride); template void scratch_cuda_multi_bit_programmable_bootstrap( @@ -63,7 +63,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( pbs_buffer *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log, uint32_t level_count, uint32_t num_samples, - uint32_t lut_count, uint32_t lut_stride); + uint32_t num_many_lut, uint32_t lut_stride); template uint64_t get_buffer_size_full_sm_multibit_programmable_bootstrap_keybundle( diff --git a/backends/tfhe-cuda-backend/cuda/include/pbs/pbs_utilities.h b/backends/tfhe-cuda-backend/cuda/include/pbs/pbs_utilities.h index a9e21f77ab..722a276235 100644 --- a/backends/tfhe-cuda-backend/cuda/include/pbs/pbs_utilities.h +++ b/backends/tfhe-cuda-backend/cuda/include/pbs/pbs_utilities.h @@ -255,7 +255,7 @@ void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector( Torus const *lwe_input_indexes, double2 const *bootstrapping_key, pbs_buffer *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, - uint32_t level_count, uint32_t num_samples, uint32_t lut_count, + uint32_t level_count, uint32_t num_samples, uint32_t num_many_lut, uint32_t lut_stride); template @@ -266,7 +266,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector( Torus const *lwe_input_indexes, double2 const *bootstrapping_key, pbs_buffer *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, - uint32_t level_count, uint32_t num_samples, uint32_t lut_count, + uint32_t level_count, uint32_t num_samples, uint32_t num_many_lut, uint32_t lut_stride); #if (CUDA_ARCH >= 900) @@ -278,7 +278,7 @@ void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector( Torus const *lwe_input_indexes, double2 const *bootstrapping_key, pbs_buffer *buffer, uint32_t lwe_dimension, uint32_t 
glwe_dimension, uint32_t polynomial_size, uint32_t base_log, - uint32_t level_count, uint32_t num_samples, uint32_t lut_count, + uint32_t level_count, uint32_t num_samples, uint32_t num_many_lut, uint32_t lut_stride); template diff --git a/backends/tfhe-cuda-backend/cuda/include/pbs/programmable_bootstrap.h b/backends/tfhe-cuda-backend/cuda/include/pbs/programmable_bootstrap.h index c90d671fdb..3596eeba4b 100644 --- a/backends/tfhe-cuda-backend/cuda/include/pbs/programmable_bootstrap.h +++ b/backends/tfhe-cuda-backend/cuda/include/pbs/programmable_bootstrap.h @@ -69,7 +69,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_32( void const *lwe_input_indexes, void const *bootstrapping_key, int8_t *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, - uint32_t num_samples, uint32_t lut_count, uint32_t lut_stride); + uint32_t num_samples, uint32_t num_many_lut, uint32_t lut_stride); void cuda_programmable_bootstrap_lwe_ciphertext_vector_64( void *stream, uint32_t gpu_index, void *lwe_array_out, @@ -78,7 +78,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_64( void const *lwe_input_indexes, void const *bootstrapping_key, int8_t *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, - uint32_t num_samples, uint32_t lut_count, uint32_t lut_stride); + uint32_t num_samples, uint32_t num_many_lut, uint32_t lut_stride); void cleanup_cuda_programmable_bootstrap(void *stream, uint32_t gpu_index, int8_t **pbs_buffer); diff --git a/backends/tfhe-cuda-backend/cuda/include/pbs/programmable_bootstrap_multibit.h b/backends/tfhe-cuda-backend/cuda/include/pbs/programmable_bootstrap_multibit.h index fceac32e97..504c864069 100644 --- a/backends/tfhe-cuda-backend/cuda/include/pbs/programmable_bootstrap_multibit.h +++ b/backends/tfhe-cuda-backend/cuda/include/pbs/programmable_bootstrap_multibit.h @@ -27,7 +27,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64( void const *lwe_input_indexes, void const *bootstrapping_key, int8_t *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log, - uint32_t level_count, uint32_t num_samples, uint32_t lut_count, + uint32_t level_count, uint32_t num_samples, uint32_t num_many_lut, uint32_t lut_stride); void cleanup_cuda_multi_bit_programmable_bootstrap(void *stream, diff --git a/backends/tfhe-cuda-backend/cuda/src/device.cu b/backends/tfhe-cuda-backend/cuda/src/device.cu index be24b0475b..e15fc72185 100644 --- a/backends/tfhe-cuda-backend/cuda/src/device.cu +++ b/backends/tfhe-cuda-backend/cuda/src/device.cu @@ -2,6 +2,30 @@ #include #include +cudaEvent_t cuda_create_event(uint32_t gpu_index) { + check_cuda_error(cudaSetDevice(gpu_index)); + cudaEvent_t event; + check_cuda_error(cudaEventCreate(&event)); + return event; +} + +void cuda_event_record(cudaEvent_t event, cudaStream_t stream, + uint32_t gpu_index) { + check_cuda_error(cudaSetDevice(gpu_index)); + check_cuda_error(cudaEventRecord(event, stream)); +} + +void cuda_stream_wait_event(cudaStream_t stream, cudaEvent_t event, + uint32_t gpu_index) { + check_cuda_error(cudaSetDevice(gpu_index)); + check_cuda_error(cudaStreamWaitEvent(stream, event, 0)); +} + +void cuda_event_destroy(cudaEvent_t event, uint32_t gpu_index) { + check_cuda_error(cudaSetDevice(gpu_index)); + check_cuda_error(cudaEventDestroy(event)); +} + /// Unsafe function to create a CUDA stream, must check 
first that GPU exists cudaStream_t cuda_create_stream(uint32_t gpu_index) { check_cuda_error(cudaSetDevice(gpu_index)); diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/abs.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/abs.cuh index ad1a4b9e23..d9053bbfbd 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/abs.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/integer/abs.cuh @@ -58,9 +58,11 @@ host_integer_abs_kb(cudaStream_t const *streams, uint32_t const *gpu_indexes, host_addition(streams[0], gpu_indexes[0], ct, mask, ct, radix_params.big_lwe_dimension, num_blocks); - host_propagate_single_carry(streams, gpu_indexes, gpu_count, ct, - nullptr, nullptr, mem_ptr->scp_mem, bsks, - ksks, num_blocks); + uint32_t requested_flag = outputFlag::FLAG_NONE; + uint32_t uses_carry = 0; + host_propagate_single_carry( + streams, gpu_indexes, gpu_count, ct, nullptr, nullptr, mem_ptr->scp_mem, + bsks, ksks, num_blocks, requested_flag, uses_carry); host_integer_radix_bitop_kb(streams, gpu_indexes, gpu_count, ct, mask, ct, mem_ptr->bitxor_mem, bsks, ksks, num_blocks); diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/addition.cu b/backends/tfhe-cuda-backend/cuda/src/integer/addition.cu deleted file mode 100644 index 2ae72ad2a6..0000000000 --- a/backends/tfhe-cuda-backend/cuda/src/integer/addition.cu +++ /dev/null @@ -1,50 +0,0 @@ -#include "integer/addition.cuh" - -void scratch_cuda_signed_overflowing_add_or_sub_radix_ciphertext_kb_64( - void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count, - int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size, - uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level, - uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log, - uint32_t grouping_factor, uint32_t num_blocks, int8_t signed_operation, - uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type, - bool allocate_gpu_memory) { - - SIGNED_OPERATION op = (signed_operation == 1) ? SIGNED_OPERATION::ADDITION - : SIGNED_OPERATION::SUBTRACTION; - int_radix_params params(pbs_type, glwe_dimension, polynomial_size, - big_lwe_dimension, small_lwe_dimension, ks_level, - ks_base_log, pbs_level, pbs_base_log, grouping_factor, - message_modulus, carry_modulus); - - scratch_cuda_integer_signed_overflowing_add_or_sub_kb( - (cudaStream_t *)(streams), gpu_indexes, gpu_count, - (int_signed_overflowing_add_or_sub_memory **)mem_ptr, - num_blocks, op, params, allocate_gpu_memory); -} - -void cuda_signed_overflowing_add_or_sub_radix_ciphertext_kb_64( - void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count, - void *lhs, void const *rhs, void *overflowed, int8_t signed_operation, - int8_t *mem_ptr, void *const *bsks, void *const *ksks, - uint32_t num_blocks) { - - auto mem = (int_signed_overflowing_add_or_sub_memory *)mem_ptr; - SIGNED_OPERATION op = (signed_operation == 1) ? 
SIGNED_OPERATION::ADDITION - : SIGNED_OPERATION::SUBTRACTION; - - host_integer_signed_overflowing_add_or_sub_kb( - (cudaStream_t *)(streams), gpu_indexes, gpu_count, - static_cast(lhs), static_cast(rhs), - static_cast(overflowed), op, bsks, (uint64_t *const *)(ksks), - mem, num_blocks); -} - -void cleanup_signed_overflowing_add_or_sub(void *const *streams, - uint32_t const *gpu_indexes, - uint32_t gpu_count, - int8_t **mem_ptr_void) { - int_signed_overflowing_add_or_sub_memory *mem_ptr = - (int_signed_overflowing_add_or_sub_memory *)(*mem_ptr_void); - - mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count); -} diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/addition.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/addition.cuh deleted file mode 100644 index 9c763596d0..0000000000 --- a/backends/tfhe-cuda-backend/cuda/src/integer/addition.cuh +++ /dev/null @@ -1,149 +0,0 @@ -#ifndef TFHE_RS_ADDITION_CUH -#define TFHE_RS_ADDITION_CUH - -#include "crypto/keyswitch.cuh" -#include "device.h" -#include "integer/comparison.cuh" -#include "integer/integer.cuh" -#include "integer/integer_utilities.h" -#include "integer/negation.cuh" -#include "integer/scalar_shifts.cuh" -#include "linear_algebra.h" -#include "pbs/programmable_bootstrap.h" -#include "utils/helper.cuh" -#include "utils/kernel_dimensions.cuh" -#include -#include -#include -#include -#include - -template -void host_resolve_signed_overflow( - cudaStream_t const *streams, uint32_t const *gpu_indexes, - uint32_t gpu_count, Torus *result, Torus *last_block_inner_propagation, - Torus const *last_block_input_carry, Torus *last_block_output_carry, - int_resolve_signed_overflow_memory *mem, void *const *bsks, - Torus *const *ksks) { - - auto x = mem->x; - - Torus *d_clears = - (Torus *)cuda_malloc_async(sizeof(Torus), streams[0], gpu_indexes[0]); - - cuda_set_value_async(streams[0], gpu_indexes[0], d_clears, 2, 1); - - // replace with host function call - cuda_mult_lwe_ciphertext_vector_cleartext_vector_64( - streams[0], gpu_indexes[0], x, last_block_output_carry, d_clears, - mem->params.big_lwe_dimension, 1); - - host_addition(streams[0], gpu_indexes[0], last_block_inner_propagation, - last_block_inner_propagation, x, - mem->params.big_lwe_dimension, 1); - host_addition(streams[0], gpu_indexes[0], last_block_inner_propagation, - last_block_inner_propagation, last_block_input_carry, - mem->params.big_lwe_dimension, 1); - - host_apply_univariate_lut_kb(streams, gpu_indexes, gpu_count, result, - last_block_inner_propagation, - mem->resolve_overflow_lut, ksks, bsks, 1); - - cuda_drop_async(d_clears, streams[0], gpu_indexes[0]); -} - -template -__host__ void scratch_cuda_integer_signed_overflowing_add_or_sub_kb( - cudaStream_t const *streams, uint32_t const *gpu_indexes, - uint32_t gpu_count, - int_signed_overflowing_add_or_sub_memory **mem_ptr, - uint32_t num_blocks, SIGNED_OPERATION op, int_radix_params params, - bool allocate_gpu_memory) { - - *mem_ptr = new int_signed_overflowing_add_or_sub_memory( - streams, gpu_indexes, gpu_count, params, num_blocks, op, - allocate_gpu_memory); -} - -/* - * Addition - signed_operation = 1 - * Subtraction - signed_operation = -1 - */ -template -__host__ void host_integer_signed_overflowing_add_or_sub_kb( - cudaStream_t const *streams, uint32_t const *gpu_indexes, - uint32_t gpu_count, Torus *lhs, Torus const *rhs, Torus *overflowed, - SIGNED_OPERATION op, void *const *bsks, uint64_t *const *ksks, - int_signed_overflowing_add_or_sub_memory *mem_ptr, - uint32_t num_blocks) { - - auto 
radix_params = mem_ptr->params; - - uint32_t big_lwe_dimension = radix_params.big_lwe_dimension; - uint32_t big_lwe_size = big_lwe_dimension + 1; - uint32_t big_lwe_size_bytes = big_lwe_size * sizeof(Torus); - - assert(radix_params.message_modulus >= 4 && radix_params.carry_modulus >= 4); - - auto result = mem_ptr->result; - auto neg_rhs = mem_ptr->neg_rhs; - auto input_carries = mem_ptr->input_carries; - auto output_carry = mem_ptr->output_carry; - auto last_block_inner_propagation = mem_ptr->last_block_inner_propagation; - - cuda_memcpy_async_gpu_to_gpu(result, lhs, num_blocks * big_lwe_size_bytes, - streams[0], gpu_indexes[0]); - - // phase 1 - if (op == SIGNED_OPERATION::ADDITION) { - host_addition(streams[0], gpu_indexes[0], result, lhs, rhs, - big_lwe_dimension, num_blocks); - } else { - host_integer_radix_negation( - streams, gpu_indexes, gpu_count, neg_rhs, rhs, big_lwe_dimension, - num_blocks, radix_params.message_modulus, radix_params.carry_modulus); - host_addition(streams[0], gpu_indexes[0], result, lhs, neg_rhs, - big_lwe_dimension, num_blocks); - } - - // phase 2 - for (uint j = 0; j < gpu_count; j++) { - cuda_synchronize_stream(streams[j], gpu_indexes[j]); - } - - host_propagate_single_carry( - mem_ptr->sub_streams_1, gpu_indexes, gpu_count, result, output_carry, - input_carries, mem_ptr->scp_mem, bsks, ksks, num_blocks); - host_generate_last_block_inner_propagation( - mem_ptr->sub_streams_2, gpu_indexes, gpu_count, - last_block_inner_propagation, &lhs[(num_blocks - 1) * big_lwe_size], - &rhs[(num_blocks - 1) * big_lwe_size], mem_ptr->las_block_prop_mem, bsks, - ksks); - - for (uint j = 0; j < mem_ptr->active_gpu_count; j++) { - cuda_synchronize_stream(mem_ptr->sub_streams_1[j], gpu_indexes[j]); - cuda_synchronize_stream(mem_ptr->sub_streams_2[j], gpu_indexes[j]); - } - - // phase 3 - auto input_carry = &input_carries[(num_blocks - 1) * big_lwe_size]; - if (op == SIGNED_OPERATION::SUBTRACTION && num_blocks == 1) { - // Quick fix for the case where the subtraction is done on a single block - Torus *one_scalar = - (Torus *)cuda_malloc_async(sizeof(Torus), streams[0], gpu_indexes[0]); - cuda_set_value_async(streams[0], gpu_indexes[0], one_scalar, 1, 1); - create_trivial_radix( - streams[0], gpu_indexes[0], input_carry, one_scalar, big_lwe_dimension, - 1, 1, radix_params.message_modulus, radix_params.carry_modulus); - cuda_drop_async(one_scalar, streams[0], gpu_indexes[0]); - } - - host_resolve_signed_overflow( - streams, gpu_indexes, gpu_count, overflowed, last_block_inner_propagation, - input_carry, output_carry, mem_ptr->resolve_overflow_mem, bsks, ksks); - - cuda_memcpy_async_gpu_to_gpu(lhs, result, num_blocks * big_lwe_size_bytes, - streams[0], gpu_indexes[0]); -} - -#endif // TFHE_RS_ADDITION_CUH diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cuh index 88d598fef6..097dc47f32 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cuh @@ -295,7 +295,7 @@ __host__ void host_integer_decompress( extracted_lwe = h_mem_ptr->tmp_extracted_lwe; // In the case of extracting a single LWE these parameters are dummy - uint32_t lut_count = 1; + uint32_t num_many_lut = 1; uint32_t lut_stride = 0; /// Apply PBS to apply a LUT, reduce the noise and go from a small LWE /// dimension to a big LWE dimension @@ -311,7 +311,7 @@ __host__ void host_integer_decompress( 
compression_params.small_lwe_dimension, encryption_params.polynomial_size, encryption_params.pbs_base_log, encryption_params.pbs_level, encryption_params.grouping_factor, - num_radix_blocks, encryption_params.pbs_type, lut_count, lut_stride); + num_radix_blocks, encryption_params.pbs_type, num_many_lut, lut_stride); } else { /// For multi GPU execution we create vectors of pointers for inputs and /// outputs @@ -338,7 +338,7 @@ __host__ void host_integer_decompress( compression_params.small_lwe_dimension, encryption_params.polynomial_size, encryption_params.pbs_base_log, encryption_params.pbs_level, encryption_params.grouping_factor, - num_radix_blocks, encryption_params.pbs_type, lut_count, lut_stride); + num_radix_blocks, encryption_params.pbs_type, num_many_lut, lut_stride); /// Copy data back to GPU 0 and release vecs multi_gpu_gather_lwe_async( diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/div_rem.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/div_rem.cuh index 394c816d07..ab574705f6 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/div_rem.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/integer/div_rem.cuh @@ -425,11 +425,24 @@ __host__ void host_unsigned_integer_div_rem_kb( auto do_overflowing_sub = [&](cudaStream_t const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count) { - host_integer_overflowing_sub_kb( - streams, gpu_indexes, gpu_count, new_remainder.data, - subtraction_overflowed.data, merged_interesting_remainder.data, - interesting_divisor.data, bsks, ksks, mem_ptr->overflow_sub_mem, + uint32_t compute_borrow = 1; + uint32_t uses_input_borrow = 0; + auto first_indexes = mem_ptr->first_indexes_for_overflow_sub + [merged_interesting_remainder.len - 1]; + auto second_indexes = mem_ptr->second_indexes_for_overflow_sub + [merged_interesting_remainder.len - 1]; + auto scalar_indexes = + mem_ptr + ->scalars_for_overflow_sub[merged_interesting_remainder.len - 1]; + mem_ptr->overflow_sub_mem->update_lut_indexes( + streams, gpu_indexes, first_indexes, second_indexes, scalar_indexes, merged_interesting_remainder.len); + host_integer_overflowing_sub( + streams, gpu_indexes, gpu_count, new_remainder.data, + (uint64_t *)merged_interesting_remainder.data, + interesting_divisor.data, subtraction_overflowed.data, + (const Torus *)nullptr, mem_ptr->overflow_sub_mem, bsks, ksks, + merged_interesting_remainder.len, compute_borrow, uses_input_borrow); }; // fills: @@ -657,10 +670,12 @@ __host__ void host_integer_div_rem_kb(cudaStream_t const *streams, int_mem_ptr->negated_quotient, quotient, radix_params.big_lwe_dimension, num_blocks, radix_params.message_modulus, radix_params.carry_modulus); - host_propagate_single_carry(int_mem_ptr->sub_streams_1, gpu_indexes, - gpu_count, int_mem_ptr->negated_quotient, - nullptr, nullptr, int_mem_ptr->scp_mem_1, - bsks, ksks, num_blocks); + uint32_t requested_flag = outputFlag::FLAG_NONE; + uint32_t uses_carry = 0; + host_propagate_single_carry( + int_mem_ptr->sub_streams_1, gpu_indexes, gpu_count, + int_mem_ptr->negated_quotient, nullptr, nullptr, int_mem_ptr->scp_mem_1, + bsks, ksks, num_blocks, requested_flag, uses_carry); host_integer_radix_negation(int_mem_ptr->sub_streams_2, gpu_indexes, gpu_count, int_mem_ptr->negated_remainder, @@ -671,7 +686,8 @@ __host__ void host_integer_div_rem_kb(cudaStream_t const *streams, host_propagate_single_carry( int_mem_ptr->sub_streams_2, gpu_indexes, gpu_count, int_mem_ptr->negated_remainder, nullptr, nullptr, - int_mem_ptr->scp_mem_2, bsks, ksks, num_blocks); + int_mem_ptr->scp_mem_2, bsks, 
ksks, num_blocks, requested_flag, + uses_carry); host_integer_radix_cmux_kb( int_mem_ptr->sub_streams_1, gpu_indexes, gpu_count, quotient, diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cu b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cu index 53b1366c37..6d224e64d7 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cu +++ b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cu @@ -1,4 +1,5 @@ #include "integer/integer.cuh" +#include "integer/negation.cuh" #include void cuda_full_propagation_64_inplace(void *const *streams, @@ -49,7 +50,8 @@ void scratch_cuda_propagate_single_carry_kb_64_inplace( uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus, - uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory) { + uint32_t carry_modulus, PBS_TYPE pbs_type, uint32_t requested_flag, + uint32_t uses_carry, bool allocate_gpu_memory) { int_radix_params params(pbs_type, glwe_dimension, polynomial_size, big_lwe_dimension, small_lwe_dimension, ks_level, @@ -59,30 +61,94 @@ void scratch_cuda_propagate_single_carry_kb_64_inplace( scratch_cuda_propagate_single_carry_kb_inplace( (cudaStream_t *)(streams), gpu_indexes, gpu_count, (int_sc_prop_memory **)mem_ptr, num_blocks, params, - allocate_gpu_memory); + requested_flag, uses_carry, allocate_gpu_memory); +} + +void scratch_cuda_add_and_propagate_single_carry_kb_64_inplace( + void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count, + int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size, + uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level, + uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log, + uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus, + uint32_t carry_modulus, PBS_TYPE pbs_type, uint32_t requested_flag, + uint32_t uses_carry, bool allocate_gpu_memory) { + + int_radix_params params(pbs_type, glwe_dimension, polynomial_size, + big_lwe_dimension, small_lwe_dimension, ks_level, + ks_base_log, pbs_level, pbs_base_log, grouping_factor, + message_modulus, carry_modulus); + + scratch_cuda_propagate_single_carry_kb_inplace( + (cudaStream_t *)(streams), gpu_indexes, gpu_count, + (int_sc_prop_memory **)mem_ptr, num_blocks, params, + requested_flag, uses_carry, allocate_gpu_memory); +} + +void scratch_cuda_integer_overflowing_sub_kb_64_inplace( + void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count, + int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size, + uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level, + uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log, + uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus, + uint32_t carry_modulus, PBS_TYPE pbs_type, uint32_t compute_overflow, + bool allocate_gpu_memory) { + + int_radix_params params(pbs_type, glwe_dimension, polynomial_size, + big_lwe_dimension, small_lwe_dimension, ks_level, + ks_base_log, pbs_level, pbs_base_log, grouping_factor, + message_modulus, carry_modulus); + + scratch_cuda_integer_overflowing_sub( + (cudaStream_t *)(streams), gpu_indexes, gpu_count, + (int_borrow_prop_memory **)mem_ptr, num_blocks, params, + compute_overflow, allocate_gpu_memory); } void cuda_propagate_single_carry_kb_64_inplace( void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count, - void *lwe_array, void *carry_out, 
int8_t *mem_ptr, void *const *bsks, - void *const *ksks, uint32_t num_blocks) { + void *lwe_array, void *carry_out, const void *carry_in, int8_t *mem_ptr, + void *const *bsks, void *const *ksks, uint32_t num_blocks, + uint32_t requested_flag, uint32_t uses_carry) { + host_propagate_single_carry( (cudaStream_t *)(streams), gpu_indexes, gpu_count, static_cast(lwe_array), static_cast(carry_out), - nullptr, (int_sc_prop_memory *)mem_ptr, bsks, - (uint64_t **)(ksks), num_blocks); + static_cast(carry_in), + (int_sc_prop_memory *)mem_ptr, bsks, (uint64_t **)(ksks), + num_blocks, requested_flag, uses_carry); } -void cuda_propagate_single_carry_get_input_carries_kb_64_inplace( +void cuda_add_and_propagate_single_carry_kb_64_inplace( void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count, - void *lwe_array, void *carry_out, void *input_carries, int8_t *mem_ptr, - void *const *bsks, void *const *ksks, uint32_t num_blocks) { - host_propagate_single_carry( + void *lhs_array, const void *rhs_array, void *carry_out, + const void *carry_in, int8_t *mem_ptr, void *const *bsks, void *const *ksks, + uint32_t num_blocks, uint32_t requested_flag, uint32_t uses_carry) { + + host_add_and_propagate_single_carry( (cudaStream_t *)(streams), gpu_indexes, gpu_count, - static_cast(lwe_array), static_cast(carry_out), - static_cast(input_carries), + static_cast(lhs_array), + static_cast(rhs_array), + static_cast(carry_out), + static_cast(carry_in), (int_sc_prop_memory *)mem_ptr, bsks, (uint64_t **)(ksks), - num_blocks); + num_blocks, requested_flag, uses_carry); +} + +void cuda_integer_overflowing_sub_kb_64_inplace( + void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count, + void *lhs_array, const void *rhs_array, void *overflow_block, + const void *input_borrow, int8_t *mem_ptr, void *const *bsks, + void *const *ksks, uint32_t num_blocks, uint32_t compute_overflow, + uint32_t uses_input_borrow) { + + host_integer_overflowing_sub( + (cudaStream_t const *)streams, gpu_indexes, gpu_count, + static_cast(lhs_array), static_cast(lhs_array), + static_cast(rhs_array), + static_cast(overflow_block), + static_cast(input_borrow), + (int_borrow_prop_memory *)mem_ptr, bsks, (uint64_t **)ksks, + num_blocks, compute_overflow, uses_input_borrow); } void cleanup_cuda_propagate_single_carry(void *const *streams, @@ -94,6 +160,23 @@ void cleanup_cuda_propagate_single_carry(void *const *streams, mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count); } +void cleanup_cuda_add_and_propagate_single_carry(void *const *streams, + uint32_t const *gpu_indexes, + uint32_t gpu_count, + int8_t **mem_ptr_void) { + int_sc_prop_memory *mem_ptr = + (int_sc_prop_memory *)(*mem_ptr_void); + mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count); +} +void cleanup_cuda_integer_overflowing_sub(void *const *streams, + uint32_t const *gpu_indexes, + uint32_t gpu_count, + int8_t **mem_ptr_void) { + int_borrow_prop_memory *mem_ptr = + (int_borrow_prop_memory *)(*mem_ptr_void); + mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count); +} + void scratch_cuda_apply_univariate_lut_kb_64( void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr, void const *input_lut, uint32_t lwe_dimension, @@ -142,14 +225,14 @@ void cuda_apply_many_univariate_lut_kb_64( void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count, void *output_radix_lwe, void const *input_radix_lwe, int8_t *mem_ptr, void *const *ksks, void *const *bsks, uint32_t num_blocks, - uint32_t 
lut_count, uint32_t lut_stride) { + uint32_t num_many_lut, uint32_t lut_stride) { host_apply_many_univariate_lut_kb( (cudaStream_t *)(streams), gpu_indexes, gpu_count, static_cast(output_radix_lwe), static_cast(input_radix_lwe), (int_radix_lut *)mem_ptr, (uint64_t **)(ksks), bsks, num_blocks, - lut_count, lut_stride); + num_many_lut, lut_stride); } void scratch_cuda_apply_bivariate_lut_kb_64( diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh index 28993ed406..51cedfd668 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh @@ -8,6 +8,7 @@ #include "integer/scalar_addition.cuh" #include "linear_algebra.h" #include "linearalgebra/addition.cuh" +#include "linearalgebra/negation.cuh" #include "pbs/programmable_bootstrap.h" #include "polynomial/functions.cuh" #include "utils/helper.cuh" @@ -80,6 +81,7 @@ host_radix_blocks_rotate_right(cudaStream_t const *streams, cudaSetDevice(gpu_indexes[0]); radix_blocks_rotate_right<<>>( dst, src, value, blocks_count, lwe_size); + check_cuda_error(cudaGetLastError()); } // rotate radix ciphertext left with specific value @@ -97,6 +99,7 @@ host_radix_blocks_rotate_left(cudaStream_t const *streams, cudaSetDevice(gpu_indexes[0]); radix_blocks_rotate_left<<>>( dst, src, value, blocks_count, lwe_size); + check_cuda_error(cudaGetLastError()); } // reverse the blocks in a list @@ -126,6 +129,138 @@ host_radix_blocks_reverse_inplace(cudaStream_t const *streams, int num_blocks = blocks_count / 2, num_threads = 1024; radix_blocks_reverse_lwe_inplace <<>>(src, blocks_count, lwe_size); + check_cuda_error(cudaGetLastError()); +} + +// If group_size = 4, the first group of 4 elements will be transformed as +// follows: +// dest[0] = src[0] +// dest[1] = src[0] + src[1] +// dest[2] = src[0] + src[1] + src[2] +// dest[3] = src[0] + src[1] + src[2] + src[3] +template +__global__ void +radix_cumulative_sum_in_groups(Torus *dest, Torus *src, uint32_t blocks_count, + uint32_t lwe_size, uint32_t group_size) { + + size_t block_offset = blockIdx.x * group_size * lwe_size; + + for (int j = threadIdx.x; j < lwe_size; j += blockDim.x) { + size_t idx = j + block_offset; + Torus sum = src[idx]; + dest[idx] = sum; + for (int gidx = 1; gidx < group_size; gidx++) { + if (gidx + blockIdx.x * group_size < + blocks_count) { // in case the last group is not full + sum += src[idx + gidx * lwe_size]; + dest[idx + gidx * lwe_size] = sum; + } + } + } +} + +template +__host__ void host_radix_cumulative_sum_in_groups( + cudaStream_t stream, uint32_t gpu_index, Torus *dest, Torus *src, + uint32_t radix_blocks_count, uint32_t lwe_size, uint32_t group_size) { + cudaSetDevice(gpu_index); + // Each CUDA block is responsible for a single group + int num_blocks = (radix_blocks_count + group_size - 1) / group_size, + num_threads = 512; + radix_cumulative_sum_in_groups<<>>( + dest, src, radix_blocks_count, lwe_size, group_size); + check_cuda_error(cudaGetLastError()); +} + +template +__global__ void radix_split_simulators_and_grouping_pgns( + Torus *simulators, Torus *grouping_pgns, Torus *src, uint32_t blocks_count, + uint32_t lwe_size, uint32_t group_size, Torus delta) { + + size_t block_offset = blockIdx.x * lwe_size; + if (blockIdx.x % group_size == 0) { + if (blockIdx.x == 0) { + // save trivial 0 + for (int j = threadIdx.x; j < lwe_size; j += blockDim.x) { + simulators[j] = 0; + } + } else { + // save trivial 1 + for (int j = threadIdx.x; j < 
lwe_size - 1; j += blockDim.x) { + size_t simu_idx = j + block_offset; + simulators[simu_idx] = 0; + } + if (threadIdx.x == 0) { + simulators[lwe_size - 1 + block_offset] = 1 * delta; + } + } + + if ((blockIdx.x / group_size + 1) < + (blocks_count + group_size - 1) / group_size) { + size_t src_offset = (blockIdx.x + group_size - 1) * lwe_size; + size_t pgns_offset = (blockIdx.x / group_size) * lwe_size; + for (int j = threadIdx.x; j < lwe_size; j += blockDim.x) { + size_t in_offset = j + src_offset; + size_t out_offset = j + pgns_offset; + grouping_pgns[out_offset] = src[in_offset]; + } + } + } else { + // save simulators + size_t src_offset = (blockIdx.x - 1) * lwe_size; + for (int j = threadIdx.x; j < lwe_size; j += blockDim.x) { + simulators[j + block_offset] = src[j + src_offset]; + } + } +} + +template +__host__ void host_radix_split_simulators_and_grouping_pgns( + cudaStream_t stream, uint32_t gpu_index, Torus *simulators, + Torus *grouping_pgns, Torus *src, uint32_t radix_blocks_count, + uint32_t lwe_size, uint32_t group_size, Torus delta) { + cudaSetDevice(gpu_index); + // Each CUDA block is responsible for a single group + int num_blocks = radix_blocks_count, num_threads = 512; + radix_split_simulators_and_grouping_pgns + <<>>(simulators, grouping_pgns, src, + radix_blocks_count, lwe_size, + group_size, delta); + check_cuda_error(cudaGetLastError()); +} + +// If group_size = 4, the first group of 4 elements will be transformed as +// follows: +// src1 size num_radix_blocks * lwe_size +// src2 size num_group * lwe_size +// dest[0] = src1[0] + src2[0] +// dest[1] = src1[1] + src2[0] +// dest[2] = src1[2] + src2[0] +// dest[3] = src1[3] + src2[0] +template +__global__ void radix_sum_in_groups(Torus *dest, Torus *src1, Torus *src2, + uint32_t blocks_count, uint32_t lwe_size, + uint32_t group_size) { + + size_t src1_offset = blockIdx.x * lwe_size; + size_t src2_index = (blockIdx.x / group_size) * lwe_size; + for (int j = threadIdx.x; j < lwe_size; j += blockDim.x) { + size_t idx = j + src1_offset; + dest[idx] = src1[idx] + src2[j + src2_index]; + } +} + +template +__host__ void host_radix_sum_in_groups(cudaStream_t stream, uint32_t gpu_index, + Torus *dest, Torus *src1, Torus *src2, + uint32_t radix_blocks_count, + uint32_t lwe_size, uint32_t group_size) { + cudaSetDevice(gpu_index); + + int num_blocks = radix_blocks_count, num_threads = 512; + radix_sum_in_groups<<>>( + dest, src1, src2, radix_blocks_count, lwe_size, group_size); + check_cuda_error(cudaGetLastError()); } // polynomial_size threads @@ -238,7 +373,7 @@ __host__ void integer_radix_apply_univariate_lookup_table_kb( auto grouping_factor = params.grouping_factor; // In the case of extracting a single LWE this parameters are dummy - uint32_t lut_count = 1; + uint32_t num_many_lut = 1; uint32_t lut_stride = 0; /// For multi GPU execution we create vectors of pointers for inputs and /// outputs @@ -262,7 +397,7 @@ __host__ void integer_radix_apply_univariate_lookup_table_kb( lut->lut_vec, lut->lut_indexes_vec, lwe_after_ks_vec[0], lwe_trivial_indexes_vec[0], bsks, lut->buffer, glwe_dimension, small_lwe_dimension, polynomial_size, pbs_base_log, pbs_level, - grouping_factor, num_radix_blocks, pbs_type, lut_count, lut_stride); + grouping_factor, num_radix_blocks, pbs_type, num_many_lut, lut_stride); } else { /// Make sure all data that should be on GPU 0 is indeed there cuda_synchronize_stream(streams[0], gpu_indexes[0]); @@ -288,7 +423,7 @@ __host__ void integer_radix_apply_univariate_lookup_table_kb( lwe_trivial_indexes_vec, 
lut->lut_vec, lut->lut_indexes_vec, lwe_after_ks_vec, lwe_trivial_indexes_vec, bsks, lut->buffer, glwe_dimension, small_lwe_dimension, polynomial_size, pbs_base_log, - pbs_level, grouping_factor, num_radix_blocks, pbs_type, lut_count, + pbs_level, grouping_factor, num_radix_blocks, pbs_type, num_many_lut, lut_stride); /// Copy data back to GPU 0 and release vecs @@ -310,7 +445,7 @@ __host__ void integer_radix_apply_many_univariate_lookup_table_kb( cudaStream_t const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count, Torus *lwe_array_out, Torus const *lwe_array_in, void *const *bsks, Torus *const *ksks, uint32_t num_radix_blocks, - int_radix_lut *lut, uint32_t lut_count, uint32_t lut_stride) { + int_radix_lut *lut, uint32_t num_many_lut, uint32_t lut_stride) { // apply_lookup_table auto params = lut->params; auto pbs_type = params.pbs_type; @@ -346,7 +481,7 @@ __host__ void integer_radix_apply_many_univariate_lookup_table_kb( lut->lut_vec, lut->lut_indexes_vec, lwe_after_ks_vec[0], lwe_trivial_indexes_vec[0], bsks, lut->buffer, glwe_dimension, small_lwe_dimension, polynomial_size, pbs_base_log, pbs_level, - grouping_factor, num_radix_blocks, pbs_type, lut_count, lut_stride); + grouping_factor, num_radix_blocks, pbs_type, num_many_lut, lut_stride); } else { /// Make sure all data that should be on GPU 0 is indeed there cuda_synchronize_stream(streams[0], gpu_indexes[0]); @@ -372,15 +507,15 @@ __host__ void integer_radix_apply_many_univariate_lookup_table_kb( lwe_trivial_indexes_vec, lut->lut_vec, lut->lut_indexes_vec, lwe_after_ks_vec, lwe_trivial_indexes_vec, bsks, lut->buffer, glwe_dimension, small_lwe_dimension, polynomial_size, pbs_base_log, - pbs_level, grouping_factor, num_radix_blocks, pbs_type, lut_count, + pbs_level, grouping_factor, num_radix_blocks, pbs_type, num_many_lut, lut_stride); /// Copy data back to GPU 0 and release vecs - multi_gpu_gather_lwe_async(streams, gpu_indexes, active_gpu_count, - lwe_array_out, lwe_after_pbs_vec, - lut->h_lwe_indexes_out, - lut->using_trivial_lwe_indexes, - num_radix_blocks, big_lwe_dimension + 1); + multi_gpu_gather_many_lut_lwe_async( + streams, gpu_indexes, active_gpu_count, lwe_array_out, + lwe_after_pbs_vec, lut->h_lwe_indexes_out, + lut->using_trivial_lwe_indexes, num_radix_blocks, big_lwe_dimension + 1, + num_many_lut); /// Synchronize all GPUs for (uint i = 0; i < active_gpu_count; i++) { @@ -409,7 +544,7 @@ __host__ void integer_radix_apply_bivariate_lookup_table_kb( auto grouping_factor = params.grouping_factor; // In the case of extracting a single LWE this parameters are dummy - uint32_t lut_count = 1; + uint32_t num_many_lut = 1; uint32_t lut_stride = 0; // Left message is shifted @@ -442,7 +577,7 @@ __host__ void integer_radix_apply_bivariate_lookup_table_kb( lut->lut_vec, lut->lut_indexes_vec, lwe_after_ks_vec[0], lwe_trivial_indexes_vec[0], bsks, lut->buffer, glwe_dimension, small_lwe_dimension, polynomial_size, pbs_base_log, pbs_level, - grouping_factor, num_radix_blocks, pbs_type, lut_count, lut_stride); + grouping_factor, num_radix_blocks, pbs_type, num_many_lut, lut_stride); } else { cuda_synchronize_stream(streams[0], gpu_indexes[0]); multi_gpu_scatter_lwe_async( @@ -464,7 +599,7 @@ __host__ void integer_radix_apply_bivariate_lookup_table_kb( lwe_trivial_indexes_vec, lut->lut_vec, lut->lut_indexes_vec, lwe_after_ks_vec, lwe_trivial_indexes_vec, bsks, lut->buffer, glwe_dimension, small_lwe_dimension, polynomial_size, pbs_base_log, - pbs_level, grouping_factor, num_radix_blocks, pbs_type, lut_count, + pbs_level, 
grouping_factor, num_radix_blocks, pbs_type, num_many_lut, lut_stride); /// Copy data back to GPU 0 and release vecs @@ -525,6 +660,48 @@ void generate_lookup_table(Torus *acc, uint32_t glwe_dimension, rotate_left(body, half_box_size, polynomial_size); } +template +void generate_many_lookup_table( + Torus *acc, uint32_t glwe_dimension, uint32_t polynomial_size, + uint32_t message_modulus, uint32_t carry_modulus, + std::vector> &functions) { + + uint32_t modulus_sup = message_modulus * carry_modulus; + uint32_t box_size = polynomial_size / modulus_sup; + Torus delta = (1ul << 63) / modulus_sup; + + memset(acc, 0, glwe_dimension * polynomial_size * sizeof(Torus)); + + auto body = &acc[glwe_dimension * polynomial_size]; + + size_t fn_counts = functions.size(); + + assert(fn_counts <= modulus_sup / 2); + + // Space used for each sub lut + uint32_t single_function_sub_lut_size = (modulus_sup / fn_counts) * box_size; + + // This accumulator extracts the carry bits + for (int f = 0; f < fn_counts; f++) { + int lut_offset = f * single_function_sub_lut_size; + for (int i = 0; i < modulus_sup / fn_counts; i++) { + int index = i * box_size + lut_offset; + for (int j = index; j < index + box_size; j++) { + auto f_eval = functions[f](i); + body[j] = f_eval * delta; + } + } + } + int half_box_size = box_size / 2; + + // Negate the first half_box_size coefficients + for (int i = 0; i < half_box_size; i++) { + body[i] = -body[i]; + } + + rotate_left(body, half_box_size, polynomial_size); +} + template void generate_lookup_table_bivariate(Torus *acc, uint32_t glwe_dimension, uint32_t polynomial_size, @@ -658,16 +835,145 @@ void generate_device_accumulator(cudaStream_t stream, uint32_t gpu_index, free(h_lut); } +/* + * generate many lut accumulator for device pointer + * v_stream - cuda stream + * acc - device pointer for accumulator + * ... + * vector - evaluating functions with one Torus input + */ template -void scratch_cuda_propagate_single_carry_kb_inplace( +void generate_many_lut_device_accumulator( + cudaStream_t stream, uint32_t gpu_index, Torus *acc, + uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t message_modulus, + uint32_t carry_modulus, + std::vector> &functions) { + + // host lut + Torus *h_lut = + (Torus *)malloc((glwe_dimension + 1) * polynomial_size * sizeof(Torus)); + + // fill accumulator + generate_many_lookup_table(h_lut, glwe_dimension, polynomial_size, + message_modulus, carry_modulus, functions); + + // copy host lut and lut_indexes_vec to device + cuda_memcpy_async_to_gpu( + acc, h_lut, (glwe_dimension + 1) * polynomial_size * sizeof(Torus), + stream, gpu_index); + + cuda_synchronize_stream(stream, gpu_index); + free(h_lut); +} + +// This function is used to perform step 1 of Thomas' new carry propagation +// algorithm It uses a many lut to calculate two luts in parallel +// shifted_blocks: contains (block % message modulus) << 1 +// block states: contains the propagation states for the different blocks +// depending on the group it belongs to and the internal position within the +// block. 
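+// A single many-LUT PBS produces both outputs back to back in
+// shifted_blocks_and_states: the first num_radix_blocks ciphertexts are copied
+// into block_states and the next num_radix_blocks into shifted_blocks (see the
+// two device-to-device copies below).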
+template +void host_compute_shifted_blocks_and_states( cudaStream_t const *streams, uint32_t const *gpu_indexes, - uint32_t gpu_count, int_sc_prop_memory **mem_ptr, - uint32_t num_radix_blocks, int_radix_params params, - bool allocate_gpu_memory) { + uint32_t gpu_count, Torus *lwe_array, int_radix_params params, + int_shifted_blocks_and_states_memory *mem, void *const *bsks, + Torus *const *ksks, uint32_t num_radix_blocks, uint32_t lut_stride, + uint32_t num_many_lut) { + + auto glwe_dimension = params.glwe_dimension; + auto polynomial_size = params.polynomial_size; + uint32_t big_lwe_size = glwe_dimension * polynomial_size + 1; + auto big_lwe_size_bytes = big_lwe_size * sizeof(Torus); + + auto shifted_blocks_and_states = mem->shifted_blocks_and_states; + auto luts_array_first_step = mem->luts_array_first_step; + + integer_radix_apply_many_univariate_lookup_table_kb( + streams, gpu_indexes, gpu_count, shifted_blocks_and_states, lwe_array, + bsks, ksks, num_radix_blocks, luts_array_first_step, num_many_lut, + lut_stride); + + auto shifted_blocks = mem->shifted_blocks; + auto block_states = mem->block_states; + cuda_memcpy_async_gpu_to_gpu(block_states, shifted_blocks_and_states, + big_lwe_size_bytes * num_radix_blocks, + streams[0], gpu_indexes[0]); + cuda_memcpy_async_gpu_to_gpu( + shifted_blocks, + shifted_blocks_and_states + big_lwe_size * num_radix_blocks, + big_lwe_size_bytes * num_radix_blocks, streams[0], gpu_indexes[0]); +} + +template +void host_resolve_group_carries_sequentially( + cudaStream_t const *streams, uint32_t const *gpu_indexes, + uint32_t gpu_count, Torus *resolved_carries, Torus *grouping_pgns, + int_radix_params params, int_seq_group_prop_memory *mem, + void *const *bsks, Torus *const *ksks, uint32_t num_groups) { + + auto glwe_dimension = params.glwe_dimension; + auto polynomial_size = params.polynomial_size; + uint32_t big_lwe_size = glwe_dimension * polynomial_size + 1; + auto big_lwe_size_bytes = big_lwe_size * sizeof(Torus); - *mem_ptr = - new int_sc_prop_memory(streams, gpu_indexes, gpu_count, params, - num_radix_blocks, allocate_gpu_memory); + auto group_resolved_carries = mem->group_resolved_carries; + if (num_groups > 1) { + // First carry is just copied + cuda_memcpy_async_gpu_to_gpu(resolved_carries + big_lwe_size, grouping_pgns, + big_lwe_size_bytes, streams[0], + gpu_indexes[0]); + uint32_t solve_per_iter = mem->grouping_size - 1; + uint32_t remaining_carries = + num_groups - + 2; // the first one has been resolved and we ignore the last one + uint32_t num_loops = + ceil(double(remaining_carries) / (double)(solve_per_iter)); + uint32_t last_resolved_pos = 1; + + for (int i = 0; i < num_loops; i++) { + uint32_t loop_offset = i * solve_per_iter; + uint32_t blocks_to_solve = solve_per_iter; + // In case the last iteration has to solve less + if (loop_offset + blocks_to_solve > num_groups - 2) { + blocks_to_solve = remaining_carries - loop_offset; + } + + // The group_resolved carries is used as an intermediate array + // First we need to copy the last resolved carry + cuda_memcpy_async_gpu_to_gpu( + group_resolved_carries, + resolved_carries + last_resolved_pos * big_lwe_size, + big_lwe_size_bytes, streams[0], gpu_indexes[0]); + + // The array is filled with the blocks_to_solve + cuda_memcpy_async_gpu_to_gpu( + group_resolved_carries + big_lwe_size, + grouping_pgns + last_resolved_pos * big_lwe_size, + blocks_to_solve * big_lwe_size_bytes, streams[0], gpu_indexes[0]); + + // Perform one group cumulative sum + host_radix_cumulative_sum_in_groups( + 
streams[0], gpu_indexes[0], group_resolved_carries, + group_resolved_carries, blocks_to_solve + 1, big_lwe_size, + mem->grouping_size); + + // Apply the lut + auto luts_sequential = mem->lut_sequential_algorithm; + integer_radix_apply_univariate_lookup_table_kb( + streams, gpu_indexes, gpu_count, + group_resolved_carries + big_lwe_size, + group_resolved_carries + big_lwe_size, bsks, ksks, blocks_to_solve, + luts_sequential); + + // Copy the result to the resolved carries array + cuda_memcpy_async_gpu_to_gpu( + resolved_carries + (last_resolved_pos + 1) * big_lwe_size, + group_resolved_carries + big_lwe_size, + blocks_to_solve * big_lwe_size_bytes, streams[0], gpu_indexes[0]); + + last_resolved_pos += blocks_to_solve; + } + } } template @@ -675,26 +981,26 @@ void host_compute_prefix_sum_hillis_steele( cudaStream_t const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count, Torus *step_output, Torus *generates_or_propagates, int_radix_params params, int_radix_lut *luts, void *const *bsks, - Torus *const *ksks, uint32_t num_blocks) { + Torus *const *ksks, uint32_t num_radix_blocks) { auto glwe_dimension = params.glwe_dimension; auto polynomial_size = params.polynomial_size; auto big_lwe_size = glwe_dimension * polynomial_size + 1; auto big_lwe_size_bytes = big_lwe_size * sizeof(Torus); - int num_steps = ceil(log2((double)num_blocks)); + int num_steps = ceil(log2((double)num_radix_blocks)); int space = 1; cuda_memcpy_async_gpu_to_gpu(step_output, generates_or_propagates, - big_lwe_size_bytes * num_blocks, streams[0], - gpu_indexes[0]); + big_lwe_size_bytes * num_radix_blocks, + streams[0], gpu_indexes[0]); for (int step = 0; step < num_steps; step++) { - if (space > num_blocks - 1) + if (space > num_radix_blocks - 1) PANIC("Cuda error: step output is going out of bounds in Hillis Steele " "propagation") auto cur_blocks = &step_output[space * big_lwe_size]; auto prev_blocks = generates_or_propagates; - int cur_total_blocks = num_blocks - space; + int cur_total_blocks = num_radix_blocks - space; integer_radix_apply_bivariate_lookup_table_kb( streams, gpu_indexes, gpu_count, cur_blocks, cur_blocks, prev_blocks, @@ -707,14 +1013,116 @@ void host_compute_prefix_sum_hillis_steele( } } +// This function is used to perform step 2 of Thomas' new propagation algorithm +// Consist three steps: +// - propagates the carry within each group with cheap LWE operations stored in +// simulators +// - calculates the propagation state of each group +// - resolves the carries between groups, either sequentially or with hillis +// steele template -void host_propagate_single_carry(cudaStream_t const *streams, - uint32_t const *gpu_indexes, - uint32_t gpu_count, Torus *lwe_array, - Torus *carry_out, Torus *input_carries, - int_sc_prop_memory *mem, - void *const *bsks, Torus *const *ksks, - uint32_t num_blocks) { +void host_compute_propagation_simulators_and_group_carries( + cudaStream_t const *streams, uint32_t const *gpu_indexes, + uint32_t gpu_count, Torus *block_states, int_radix_params params, + int_prop_simu_group_carries_memory *mem, void *const *bsks, + Torus *const *ksks, uint32_t num_radix_blocks, uint32_t num_groups) { + + auto glwe_dimension = params.glwe_dimension; + auto polynomial_size = params.polynomial_size; + auto message_modulus = params.message_modulus; + auto carry_modulus = params.carry_modulus; + + uint32_t big_lwe_size = glwe_dimension * polynomial_size + 1; + + auto propagation_cum_sums = mem->propagation_cum_sums; + auto group_size = mem->group_size; + 
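+  // Within each group the block states are accumulated, the second-step LUTs
+  // and a per-position scalar correction are applied, and the result is split
+  // into per-block simulators and per-group propagation states (grouping_pgns)
+  // before the group carries are resolved below.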
host_radix_cumulative_sum_in_groups( + streams[0], gpu_indexes[0], propagation_cum_sums, block_states, + num_radix_blocks, big_lwe_size, group_size); + + auto luts_array_second_step = mem->luts_array_second_step; + integer_radix_apply_univariate_lookup_table_kb( + streams, gpu_indexes, gpu_count, propagation_cum_sums, + propagation_cum_sums, bsks, ksks, num_radix_blocks, + luts_array_second_step); + + auto scalar_array_cum_sum = mem->scalar_array_cum_sum; + auto big_lwe_dimension = big_lwe_size - 1; + + host_integer_radix_scalar_addition_inplace( + streams, gpu_indexes, gpu_count, propagation_cum_sums, + scalar_array_cum_sum, big_lwe_dimension, num_radix_blocks, + message_modulus, carry_modulus); + + uint32_t modulus_sup = message_modulus * carry_modulus; + Torus delta = (1ull << 63) / modulus_sup; + auto simulators = mem->simulators; + auto grouping_pgns = mem->grouping_pgns; + host_radix_split_simulators_and_grouping_pgns( + streams[0], gpu_indexes[0], simulators, grouping_pgns, + propagation_cum_sums, num_radix_blocks, big_lwe_size, group_size, delta); + + auto resolved_carries = mem->resolved_carries; + if (mem->use_sequential_algorithm_to_resolver_group_carries) { + // Resolve group carries sequentially + host_resolve_group_carries_sequentially( + streams, gpu_indexes, gpu_count, resolved_carries, grouping_pgns, + params, mem->seq_group_prop_mem, bsks, ksks, num_groups); + } else { + // Resolve group carries with hillis steele + auto luts_carry_propagation_sum = mem->hs_group_prop_mem->lut_hillis_steele; + host_compute_prefix_sum_hillis_steele( + streams, gpu_indexes, gpu_count, &resolved_carries[big_lwe_size], + grouping_pgns, params, luts_carry_propagation_sum, bsks, ksks, + num_groups - 1); + } +} +// This function is used to perform step 1 of Thomas' new borrow propagation +// algorithm It uses a many lut to calculate two luts in parallel +// shifted_blocks: contains (block % message modulus) << 1 +// block states: contains the propagation states for the different blocks +// depending on the group it belongs to and the internal position within the +// block. 
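+// This mirrors host_compute_shifted_blocks_and_states: one many-LUT PBS
+// produces both outputs, which are then split into borrow_states (first half)
+// and shifted_blocks (second half) with two device-to-device copies.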
+template +void host_compute_shifted_blocks_and_borrow_states( + cudaStream_t const *streams, uint32_t const *gpu_indexes, + uint32_t gpu_count, Torus *lwe_array, int_radix_params params, + int_shifted_blocks_and_borrow_states_memory *mem, void *const *bsks, + Torus *const *ksks, uint32_t num_radix_blocks, uint32_t lut_stride, + uint32_t num_many_lut) { + + auto glwe_dimension = params.glwe_dimension; + auto polynomial_size = params.polynomial_size; + uint32_t big_lwe_size = glwe_dimension * polynomial_size + 1; + auto big_lwe_size_bytes = big_lwe_size * sizeof(Torus); + + auto shifted_blocks_and_borrow_states = mem->shifted_blocks_and_borrow_states; + auto luts_array_first_step = mem->luts_array_first_step; + + integer_radix_apply_many_univariate_lookup_table_kb( + streams, gpu_indexes, gpu_count, shifted_blocks_and_borrow_states, + lwe_array, bsks, ksks, num_radix_blocks, luts_array_first_step, + num_many_lut, lut_stride); + + auto shifted_blocks = mem->shifted_blocks; + auto borrow_states = mem->borrow_states; + cuda_memcpy_async_gpu_to_gpu(borrow_states, shifted_blocks_and_borrow_states, + big_lwe_size_bytes * num_radix_blocks, + streams[0], gpu_indexes[0]); + cuda_memcpy_async_gpu_to_gpu( + shifted_blocks, + shifted_blocks_and_borrow_states + big_lwe_size * num_radix_blocks, + big_lwe_size_bytes * num_radix_blocks, streams[0], gpu_indexes[0]); +} + +template +void host_legacy_propagate_single_carry(cudaStream_t const *streams, + uint32_t const *gpu_indexes, + uint32_t gpu_count, Torus *lwe_array, + Torus *carry_out, Torus *input_carries, + int_legacy_sc_prop_memory *mem, + void *const *bsks, Torus *const *ksks, + uint32_t num_blocks) { auto params = mem->params; if (params.message_modulus == 2) PANIC("Cuda error: single carry propagation is not supported for 1 bit " @@ -848,7 +1256,7 @@ void host_full_propagate_inplace(cudaStream_t const *streams, int small_lwe_size = (params.small_lwe_dimension + 1); // In the case of extracting a single LWE this parameters are dummy - uint32_t lut_count = 1; + uint32_t num_many_lut = 1; uint32_t lut_stride = 0; for (int i = 0; i < num_blocks; i++) { auto cur_input_block = &input_blocks[i * big_lwe_size]; @@ -872,7 +1280,7 @@ void host_full_propagate_inplace(cudaStream_t const *streams, mem_ptr->lut->lwe_trivial_indexes, bsks, mem_ptr->lut->buffer, params.glwe_dimension, params.small_lwe_dimension, params.polynomial_size, params.pbs_base_log, params.pbs_level, - params.grouping_factor, 2, params.pbs_type, lut_count, lut_stride); + params.grouping_factor, 2, params.pbs_type, num_many_lut, lut_stride); cuda_memcpy_async_gpu_to_gpu( (void *)cur_input_block, mem_ptr->tmp_big_lwe_vector, @@ -952,6 +1360,7 @@ __host__ void pack_blocks(cudaStream_t stream, uint32_t gpu_index, getNumBlocksAndThreads(num_entries, 1024, num_blocks, num_threads); device_pack_blocks<<>>( lwe_array_out, lwe_array_in, lwe_dimension, num_radix_blocks, factor); + check_cuda_error(cudaGetLastError()); } template @@ -1155,11 +1564,11 @@ void host_apply_many_univariate_lut_kb( cudaStream_t const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count, Torus *radix_lwe_out, Torus const *radix_lwe_in, int_radix_lut *mem, Torus *const *ksks, void *const *bsks, - uint32_t num_blocks, uint32_t lut_count, uint32_t lut_stride) { + uint32_t num_blocks, uint32_t num_many_lut, uint32_t lut_stride) { integer_radix_apply_many_univariate_lookup_table_kb( streams, gpu_indexes, gpu_count, radix_lwe_out, radix_lwe_in, bsks, ksks, - num_blocks, mem, lut_count, lut_stride); + num_blocks, mem, 
num_many_lut, lut_stride); } template @@ -1192,4 +1601,371 @@ void host_apply_bivariate_lut_kb( radix_lwe_in_2, bsks, ksks, num_blocks, mem, shift); } +template +void scratch_cuda_propagate_single_carry_kb_inplace( + cudaStream_t const *streams, uint32_t const *gpu_indexes, + uint32_t gpu_count, int_sc_prop_memory **mem_ptr, + uint32_t num_radix_blocks, int_radix_params params, uint32_t requested_flag, + uint32_t uses_carry, bool allocate_gpu_memory) { + + *mem_ptr = new int_sc_prop_memory( + streams, gpu_indexes, gpu_count, params, num_radix_blocks, requested_flag, + uses_carry, allocate_gpu_memory); +} +// This function perform the three steps of Thomas' new carry propagation +// includes the logic to extract overflow when requested +template +void host_propagate_single_carry(cudaStream_t const *streams, + uint32_t const *gpu_indexes, + uint32_t gpu_count, Torus *lwe_array, + Torus *carry_out, const Torus *input_carries, + int_sc_prop_memory *mem, + void *const *bsks, Torus *const *ksks, + uint32_t num_radix_blocks, + uint32_t requested_flag, uint32_t uses_carry) { + auto params = mem->params; + auto glwe_dimension = params.glwe_dimension; + auto polynomial_size = params.polynomial_size; + auto message_modulus = params.message_modulus; + auto carry_modulus = params.carry_modulus; + uint32_t big_lwe_size = glwe_dimension * polynomial_size + 1; + auto big_lwe_size_bytes = big_lwe_size * sizeof(Torus); + auto big_lwe_dimension = big_lwe_size - 1; // For host addition + auto lut_stride = mem->lut_stride; + auto num_many_lut = mem->num_many_lut; + if (requested_flag == outputFlag::FLAG_OVERFLOW) + PANIC("Cuda error: single carry propagation is not supported for overflow, " + "try using add_and_propagate_single_carry"); + if (uses_carry == 1) { + host_addition(streams[0], gpu_indexes[0], lwe_array, lwe_array, + input_carries, big_lwe_dimension, 1); + } + // Step 1 + host_compute_shifted_blocks_and_states( + streams, gpu_indexes, gpu_count, lwe_array, params, + mem->shifted_blocks_state_mem, bsks, ksks, num_radix_blocks, lut_stride, + num_many_lut); + auto block_states = mem->shifted_blocks_state_mem->block_states; + + if (requested_flag == outputFlag::FLAG_CARRY) { + cuda_memcpy_async_gpu_to_gpu( + mem->output_flag, block_states + (num_radix_blocks - 1) * big_lwe_size, + big_lwe_size_bytes, streams[0], gpu_indexes[0]); + } + // Step 2 + host_compute_propagation_simulators_and_group_carries( + streams, gpu_indexes, gpu_count, block_states, params, + mem->prop_simu_group_carries_mem, bsks, ksks, num_radix_blocks, + mem->num_groups); + + auto group_size = mem->prop_simu_group_carries_mem->group_size; + + auto prepared_blocks = mem->prop_simu_group_carries_mem->prepared_blocks; + auto shifted_blocks = mem->shifted_blocks_state_mem->shifted_blocks; + host_addition(streams[0], gpu_indexes[0], prepared_blocks, + shifted_blocks, + mem->prop_simu_group_carries_mem->simulators, + big_lwe_dimension, num_radix_blocks); + + if (requested_flag == outputFlag::FLAG_OVERFLOW || + requested_flag == outputFlag::FLAG_CARRY) { + host_addition(streams[0], gpu_indexes[0], mem->output_flag, + mem->output_flag, + mem->prop_simu_group_carries_mem->simulators + + (num_radix_blocks - 1) * big_lwe_size, + big_lwe_dimension, 1); + } + + cuda_synchronize_stream(streams[0], gpu_indexes[0]); + + // Step 3 + // Add carries and cleanup OutputFlag::None + host_radix_sum_in_groups( + mem->sub_streams_1[0], gpu_indexes[0], prepared_blocks, prepared_blocks, + mem->prop_simu_group_carries_mem->resolved_carries, num_radix_blocks, + 
big_lwe_size, group_size); + + auto message_extract = mem->lut_message_extract; + integer_radix_apply_univariate_lookup_table_kb( + mem->sub_streams_1, gpu_indexes, gpu_count, lwe_array, prepared_blocks, + bsks, ksks, num_radix_blocks, message_extract); + + if (requested_flag == outputFlag::FLAG_CARRY) { + host_addition(mem->sub_streams_2[0], gpu_indexes[0], + mem->output_flag, mem->output_flag, + mem->prop_simu_group_carries_mem->resolved_carries + + (mem->num_groups - 1) * big_lwe_size, + big_lwe_dimension, 1); + + integer_radix_apply_univariate_lookup_table_kb( + mem->sub_streams_2, gpu_indexes, gpu_count, mem->output_flag, + mem->output_flag, bsks, ksks, 1, mem->lut_carry_flag_last); + + cuda_memcpy_async_gpu_to_gpu(carry_out, mem->output_flag, + big_lwe_size_bytes, mem->sub_streams_2[0], + gpu_indexes[0]); + } + for (int j = 0; j < mem->active_gpu_count; j++) { + cuda_synchronize_stream(mem->sub_streams_1[j], gpu_indexes[j]); + cuda_synchronize_stream(mem->sub_streams_2[j], gpu_indexes[j]); + } +} + +// This function perform the three steps of Thomas' new carry propagation +// includes the logic to extract overflow when requested +template +void host_add_and_propagate_single_carry( + cudaStream_t const *streams, uint32_t const *gpu_indexes, + uint32_t gpu_count, Torus *lhs_array, const Torus *rhs_array, + Torus *carry_out, const Torus *input_carries, + int_sc_prop_memory *mem, void *const *bsks, Torus *const *ksks, + uint32_t num_radix_blocks, uint32_t requested_flag, uint32_t uses_carry) { + auto params = mem->params; + auto glwe_dimension = params.glwe_dimension; + auto polynomial_size = params.polynomial_size; + auto message_modulus = params.message_modulus; + auto carry_modulus = params.carry_modulus; + uint32_t big_lwe_size = glwe_dimension * polynomial_size + 1; + auto big_lwe_size_bytes = big_lwe_size * sizeof(Torus); + auto big_lwe_dimension = big_lwe_size - 1; // For host addition + auto lut_stride = mem->lut_stride; + auto num_many_lut = mem->num_many_lut; + + if (requested_flag == outputFlag::FLAG_OVERFLOW) { + cuda_memcpy_async_gpu_to_gpu( + mem->last_lhs, lhs_array + (num_radix_blocks - 1) * big_lwe_size, + big_lwe_size_bytes, streams[0], gpu_indexes[0]); + cuda_memcpy_async_gpu_to_gpu( + mem->last_rhs, rhs_array + (num_radix_blocks - 1) * big_lwe_size, + big_lwe_size_bytes, streams[0], gpu_indexes[0]); + } + + host_addition(streams[0], gpu_indexes[0], lhs_array, lhs_array, + rhs_array, big_lwe_dimension, num_radix_blocks); + + if (uses_carry == 1) { + host_addition(streams[0], gpu_indexes[0], lhs_array, lhs_array, + input_carries, big_lwe_dimension, 1); + } + // Step 1 + host_compute_shifted_blocks_and_states( + streams, gpu_indexes, gpu_count, lhs_array, params, + mem->shifted_blocks_state_mem, bsks, ksks, num_radix_blocks, lut_stride, + num_many_lut); + auto block_states = mem->shifted_blocks_state_mem->block_states; + if (requested_flag == outputFlag::FLAG_OVERFLOW) { + auto lut_overflow_prep = mem->lut_overflow_flag_prep; + integer_radix_apply_bivariate_lookup_table_kb( + streams, gpu_indexes, gpu_count, mem->output_flag, mem->last_lhs, + mem->last_rhs, bsks, ksks, 1, lut_overflow_prep, + lut_overflow_prep->params.message_modulus); + } else if (requested_flag == outputFlag::FLAG_CARRY) { + cuda_memcpy_async_gpu_to_gpu( + mem->output_flag, block_states + (num_radix_blocks - 1) * big_lwe_size, + big_lwe_size_bytes, streams[0], gpu_indexes[0]); + } + + // Step 2 + host_compute_propagation_simulators_and_group_carries( + streams, gpu_indexes, gpu_count, block_states, 
params, + mem->prop_simu_group_carries_mem, bsks, ksks, num_radix_blocks, + mem->num_groups); + + auto group_size = mem->prop_simu_group_carries_mem->group_size; + + auto prepared_blocks = mem->prop_simu_group_carries_mem->prepared_blocks; + auto shifted_blocks = mem->shifted_blocks_state_mem->shifted_blocks; + host_addition(streams[0], gpu_indexes[0], prepared_blocks, + shifted_blocks, + mem->prop_simu_group_carries_mem->simulators, + big_lwe_dimension, num_radix_blocks); + + if (requested_flag == outputFlag::FLAG_OVERFLOW || + requested_flag == outputFlag::FLAG_CARRY) { + host_addition(streams[0], gpu_indexes[0], mem->output_flag, + mem->output_flag, + mem->prop_simu_group_carries_mem->simulators + + (num_radix_blocks - 1) * big_lwe_size, + big_lwe_dimension, 1); + } + + cuda_synchronize_stream(streams[0], gpu_indexes[0]); + // Step 3 + // Add carries and cleanup OutputFlag::None + host_radix_sum_in_groups( + mem->sub_streams_1[0], gpu_indexes[0], prepared_blocks, prepared_blocks, + mem->prop_simu_group_carries_mem->resolved_carries, num_radix_blocks, + big_lwe_size, group_size); + + auto message_extract = mem->lut_message_extract; + integer_radix_apply_univariate_lookup_table_kb( + mem->sub_streams_1, gpu_indexes, gpu_count, lhs_array, prepared_blocks, + bsks, ksks, num_radix_blocks, message_extract); + + if (requested_flag == outputFlag::FLAG_OVERFLOW || + requested_flag == outputFlag::FLAG_CARRY) { + if (num_radix_blocks == 1 && requested_flag == outputFlag::FLAG_OVERFLOW && + uses_carry == 1) { + host_addition(mem->sub_streams_2[0], gpu_indexes[0], + mem->output_flag, mem->output_flag, input_carries, + big_lwe_dimension, 1); + + } else { + + host_addition(mem->sub_streams_2[0], gpu_indexes[0], + mem->output_flag, mem->output_flag, + mem->prop_simu_group_carries_mem->resolved_carries + + (mem->num_groups - 1) * big_lwe_size, + big_lwe_dimension, 1); + } + if (requested_flag == outputFlag::FLAG_OVERFLOW) { + integer_radix_apply_univariate_lookup_table_kb( + mem->sub_streams_2, gpu_indexes, gpu_count, mem->output_flag, + mem->output_flag, bsks, ksks, 1, mem->lut_overflow_flag_last); + } else { + integer_radix_apply_univariate_lookup_table_kb( + mem->sub_streams_2, gpu_indexes, gpu_count, mem->output_flag, + mem->output_flag, bsks, ksks, 1, mem->lut_carry_flag_last); + } + cuda_memcpy_async_gpu_to_gpu(carry_out, mem->output_flag, + big_lwe_size_bytes, mem->sub_streams_2[0], + gpu_indexes[0]); + } + for (int j = 0; j < mem->active_gpu_count; j++) { + cuda_synchronize_stream(mem->sub_streams_1[j], gpu_indexes[j]); + cuda_synchronize_stream(mem->sub_streams_2[j], gpu_indexes[j]); + } +} + +template +void scratch_cuda_integer_overflowing_sub( + cudaStream_t const *streams, uint32_t const *gpu_indexes, + uint32_t gpu_count, int_borrow_prop_memory **mem_ptr, + uint32_t num_radix_blocks, int_radix_params params, + uint32_t compute_overflow, bool allocate_gpu_memory) { + + *mem_ptr = new int_borrow_prop_memory( + streams, gpu_indexes, gpu_count, params, num_radix_blocks, + compute_overflow, allocate_gpu_memory); +} + +// This function perform the three steps of Thomas' new borrow propagation +// includes the logic to extract overflow when requested +template +void host_single_borrow_propagate( + cudaStream_t const *streams, uint32_t const *gpu_indexes, + uint32_t gpu_count, Torus *lhsrhs_array, Torus *overflow_block, + const Torus *input_borrow, int_borrow_prop_memory *mem, + void *const *bsks, Torus *const *ksks, uint32_t num_radix_blocks, + uint32_t num_groups, uint32_t compute_overflow, + 
uint32_t uses_input_borrow) { + auto params = mem->params; + auto glwe_dimension = params.glwe_dimension; + auto polynomial_size = params.polynomial_size; + auto message_modulus = params.message_modulus; + auto carry_modulus = params.carry_modulus; + uint32_t big_lwe_size = glwe_dimension * polynomial_size + 1; + auto big_lwe_size_bytes = big_lwe_size * sizeof(Torus); + auto big_lwe_dimension = big_lwe_size - 1; + auto lut_stride = mem->lut_stride; + auto num_many_lut = mem->num_many_lut; + + assert(mem->num_groups >= num_groups); + if (uses_input_borrow == 1) { + host_unchecked_sub_with_correcting_term( + streams[0], gpu_indexes[0], lhsrhs_array, lhsrhs_array, input_borrow, + big_lwe_dimension, 1, message_modulus, carry_modulus, + message_modulus - 1); + } + // Step 1 + host_compute_shifted_blocks_and_borrow_states( + streams, gpu_indexes, gpu_count, lhsrhs_array, params, + mem->shifted_blocks_borrow_state_mem, bsks, ksks, num_radix_blocks, + lut_stride, num_many_lut); + + auto borrow_states = mem->shifted_blocks_borrow_state_mem->borrow_states; + cuda_memcpy_async_gpu_to_gpu(mem->overflow_block, + borrow_states + + (num_radix_blocks - 1) * big_lwe_size, + big_lwe_size_bytes, streams[0], gpu_indexes[0]); + + // Step 2 + host_compute_propagation_simulators_and_group_carries( + streams, gpu_indexes, gpu_count, borrow_states, params, + mem->prop_simu_group_carries_mem, bsks, ksks, num_radix_blocks, + num_groups); + + auto shifted_blocks = mem->shifted_blocks_borrow_state_mem->shifted_blocks; + auto prepared_blocks = mem->prop_simu_group_carries_mem->prepared_blocks; + auto simulators = mem->prop_simu_group_carries_mem->simulators; + + host_subtraction(streams[0], gpu_indexes[0], prepared_blocks, + shifted_blocks, simulators, big_lwe_dimension, + num_radix_blocks); + + host_integer_radix_add_scalar_one_inplace( + streams, gpu_indexes, gpu_count, prepared_blocks, big_lwe_dimension, + num_radix_blocks, message_modulus, carry_modulus); + + if (compute_overflow == outputFlag::FLAG_OVERFLOW) { + host_addition(streams[0], gpu_indexes[0], mem->overflow_block, + mem->overflow_block, + mem->prop_simu_group_carries_mem->simulators + + (num_radix_blocks - 1) * big_lwe_size, + big_lwe_dimension, 1); + } + auto resolved_borrows = mem->prop_simu_group_carries_mem->resolved_carries; + + // Step 3 + // This needs to be done before because in next step we modify the resolved + // borrows + if (compute_overflow == outputFlag::FLAG_OVERFLOW) { + host_addition(streams[0], gpu_indexes[0], mem->overflow_block, + mem->overflow_block, + resolved_borrows + (num_groups - 1) * big_lwe_size, + big_lwe_dimension, 1); + } + + cuda_event_record(mem->incoming_events[0], streams[0], gpu_indexes[0]); + for (int j = 0; j < mem->active_gpu_count; j++) { + cuda_stream_wait_event(mem->sub_streams_1[j], mem->incoming_events[0], + gpu_indexes[j]); + cuda_stream_wait_event(mem->sub_streams_2[j], mem->incoming_events[0], + gpu_indexes[j]); + } + + if (compute_overflow == outputFlag::FLAG_OVERFLOW) { + auto borrow_flag = mem->lut_borrow_flag; + integer_radix_apply_univariate_lookup_table_kb( + mem->sub_streams_1, gpu_indexes, gpu_count, overflow_block, + mem->overflow_block, bsks, ksks, 1, borrow_flag); + } + for (int j = 0; j < mem->active_gpu_count; j++) { + cuda_event_record(mem->outgoing_events1[j], mem->sub_streams_1[j], + gpu_indexes[j]); + } + + // subtract borrow and cleanup prepared blocks + host_negation(mem->sub_streams_2[0], gpu_indexes[0], resolved_borrows, + resolved_borrows, big_lwe_dimension, num_groups); + + 
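+  // Adding the negated resolved borrows group-wise subtracts the incoming
+  // borrow from every block; the message-extract LUT applied right after
+  // reduces each block back to its message part.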
host_radix_sum_in_groups( + mem->sub_streams_2[0], gpu_indexes[0], prepared_blocks, prepared_blocks, + resolved_borrows, num_radix_blocks, big_lwe_size, mem->group_size); + + auto message_extract = mem->lut_message_extract; + integer_radix_apply_univariate_lookup_table_kb( + mem->sub_streams_2, gpu_indexes, gpu_count, lhsrhs_array, prepared_blocks, + bsks, ksks, num_radix_blocks, message_extract); + + for (int j = 0; j < mem->active_gpu_count; j++) { + cuda_event_record(mem->outgoing_events2[j], mem->sub_streams_2[j], + gpu_indexes[j]); + cuda_stream_wait_event(streams[0], mem->outgoing_events1[j], + gpu_indexes[0]); + cuda_stream_wait_event(streams[0], mem->outgoing_events2[j], + gpu_indexes[0]); + } +} + #endif // TFHE_RS_INTERNAL_INTEGER_CUH diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh index 1e2694e1e2..1a39b08567 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh @@ -209,7 +209,7 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb( auto small_lwe_size = small_lwe_dimension + 1; // In the case of extracting a single LWE this parameters are dummy - uint32_t lut_count = 1; + uint32_t num_many_lut = 1; uint32_t lut_stride = 0; if (num_radix_in_vec == 0) @@ -370,7 +370,7 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb( glwe_dimension, small_lwe_dimension, polynomial_size, mem_ptr->params.pbs_base_log, mem_ptr->params.pbs_level, mem_ptr->params.grouping_factor, total_count, - mem_ptr->params.pbs_type, lut_count, lut_stride); + mem_ptr->params.pbs_type, num_many_lut, lut_stride); } else { cuda_synchronize_stream(streams[0], gpu_indexes[0]); @@ -418,7 +418,7 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb( glwe_dimension, small_lwe_dimension, polynomial_size, mem_ptr->params.pbs_base_log, mem_ptr->params.pbs_level, mem_ptr->params.grouping_factor, total_count, - mem_ptr->params.pbs_type, lut_count, lut_stride); + mem_ptr->params.pbs_type, num_many_lut, lut_stride); multi_gpu_gather_lwe_async( streams, gpu_indexes, active_gpu_count, new_blocks, lwe_after_pbs_vec, @@ -578,10 +578,15 @@ __host__ void host_integer_mult_radix_kb( terms_degree, bsks, ksks, mem_ptr->sum_ciphertexts_mem, num_blocks, 2 * num_blocks, mem_ptr->luts_array); - auto scp_mem_ptr = mem_ptr->sum_ciphertexts_mem->scp_mem; - host_propagate_single_carry(streams, gpu_indexes, gpu_count, - radix_lwe_out, nullptr, nullptr, - scp_mem_ptr, bsks, ksks, num_blocks); + uint32_t block_modulus = message_modulus * carry_modulus; + uint32_t num_bits_in_block = std::log2(block_modulus); + + auto scp_mem_ptr = mem_ptr->sc_prop_mem; + uint32_t requested_flag = outputFlag::FLAG_NONE; + uint32_t uses_carry = 0; + host_propagate_single_carry( + streams, gpu_indexes, gpu_count, radix_lwe_out, nullptr, nullptr, + scp_mem_ptr, bsks, ksks, num_blocks, requested_flag, uses_carry); } template diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/negation.cu b/backends/tfhe-cuda-backend/cuda/src/integer/negation.cu index e55ea9e912..36972b29d3 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/negation.cu +++ b/backends/tfhe-cuda-backend/cuda/src/integer/negation.cu @@ -12,49 +12,3 @@ void cuda_negate_integer_radix_ciphertext_64( static_cast(lwe_array_in), lwe_dimension, lwe_ciphertext_count, message_modulus, carry_modulus); } - -void scratch_cuda_integer_radix_overflowing_sub_kb_64( - void *const *streams, uint32_t const 
*gpu_indexes, uint32_t gpu_count, - int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size, - uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level, - uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log, - uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus, - uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory) { - - int_radix_params params(pbs_type, glwe_dimension, polynomial_size, - big_lwe_dimension, small_lwe_dimension, ks_level, - ks_base_log, pbs_level, pbs_base_log, grouping_factor, - message_modulus, carry_modulus); - - scratch_cuda_integer_overflowing_sub_kb( - (cudaStream_t *)(streams), gpu_indexes, gpu_count, - (int_overflowing_sub_memory **)mem_ptr, num_blocks, params, - allocate_gpu_memory); -} - -void cuda_integer_radix_overflowing_sub_kb_64( - void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count, - void *radix_lwe_out, void *radix_lwe_overflowed, void const *radix_lwe_left, - void const *radix_lwe_right, int8_t *mem_ptr, void *const *bsks, - void *const *ksks, uint32_t num_blocks) { - - auto mem = (int_overflowing_sub_memory *)mem_ptr; - - host_integer_overflowing_sub_kb( - (cudaStream_t *)(streams), gpu_indexes, gpu_count, - static_cast(radix_lwe_out), - static_cast(radix_lwe_overflowed), - static_cast(radix_lwe_left), - static_cast(radix_lwe_right), bsks, (uint64_t **)(ksks), - mem, num_blocks); -} - -void cleanup_cuda_integer_radix_overflowing_sub(void *const *streams, - uint32_t const *gpu_indexes, - uint32_t gpu_count, - int8_t **mem_ptr_void) { - int_overflowing_sub_memory *mem_ptr = - (int_overflowing_sub_memory *)(*mem_ptr_void); - - mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count); -} diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/negation.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/negation.cuh index 6eda409df9..28f7ac93cf 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/negation.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/integer/negation.cuh @@ -91,7 +91,7 @@ __host__ void scratch_cuda_integer_overflowing_sub_kb( *mem_ptr = new int_overflowing_sub_memory( streams, gpu_indexes, gpu_count, params, num_blocks, allocate_gpu_memory); } - +/* template __host__ void host_integer_overflowing_sub_kb( cudaStream_t const *streams, uint32_t const *gpu_indexes, @@ -113,4 +113,39 @@ __host__ void host_integer_overflowing_sub_kb( mem_ptr, bsks, ksks, num_blocks); } +*/ +template +__host__ void host_integer_overflowing_sub( + cudaStream_t const *streams, uint32_t const *gpu_indexes, + uint32_t gpu_count, Torus *lwe_out_array, Torus *lhs_array, + const Torus *rhs_array, Torus *overflow_block, const Torus *input_borrow, + int_borrow_prop_memory *mem_ptr, void *const *bsks, + Torus *const *ksks, uint32_t num_blocks, uint32_t compute_overflow, + uint32_t uses_input_borrow) { + + auto radix_params = mem_ptr->params; + + // We need to recalculate the num_groups, because on the division the number + // of num_blocks changes + uint32_t block_modulus = + radix_params.message_modulus * radix_params.carry_modulus; + uint32_t num_bits_in_block = std::log2(block_modulus); + uint32_t grouping_size = num_bits_in_block; + uint32_t num_groups = (num_blocks + grouping_size - 1) / grouping_size; + + auto stream = (cudaStream_t *)streams; + host_unchecked_sub_with_correcting_term( + stream[0], gpu_indexes[0], static_cast(lwe_out_array), + static_cast(lhs_array), static_cast(rhs_array), + radix_params.big_lwe_dimension, num_blocks, 
radix_params.message_modulus, + radix_params.carry_modulus, radix_params.message_modulus - 1); + + host_single_borrow_propagate( + streams, gpu_indexes, gpu_count, static_cast(lwe_out_array), + static_cast(overflow_block), + static_cast(input_borrow), + (int_borrow_prop_memory *)mem_ptr, bsks, (Torus **)(ksks), + num_blocks, num_groups, compute_overflow, uses_input_borrow); +} + #endif diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_mul.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_mul.cuh index 37a51006ae..941f31bc42 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_mul.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_mul.cuh @@ -112,10 +112,12 @@ __host__ void host_integer_scalar_mul_radix( terms_degree, bsks, ksks, mem->sum_ciphertexts_vec_mem, num_radix_blocks, j, nullptr); - auto scp_mem_ptr = mem->sum_ciphertexts_vec_mem->scp_mem; - host_propagate_single_carry(streams, gpu_indexes, gpu_count, lwe_array, - nullptr, nullptr, scp_mem_ptr, bsks, ksks, - num_radix_blocks); + auto scp_mem_ptr = mem->sc_prop_mem; + uint32_t requested_flag = outputFlag::FLAG_NONE; + uint32_t uses_carry = 0; + host_propagate_single_carry( + streams, gpu_indexes, gpu_count, lwe_array, nullptr, nullptr, + scp_mem_ptr, bsks, ksks, num_radix_blocks, requested_flag, uses_carry); } } diff --git a/backends/tfhe-cuda-backend/cuda/src/linearalgebra/addition.cu b/backends/tfhe-cuda-backend/cuda/src/linearalgebra/addition.cu index d3f47ad263..03ded74b46 100644 --- a/backends/tfhe-cuda-backend/cuda/src/linearalgebra/addition.cu +++ b/backends/tfhe-cuda-backend/cuda/src/linearalgebra/addition.cu @@ -57,6 +57,7 @@ void cuda_add_lwe_ciphertext_vector_64(void *stream, uint32_t gpu_index, static_cast(lwe_array_in_2), input_lwe_dimension, input_lwe_ciphertext_count); } + /* * Perform the addition of a u32 input LWE ciphertext vector with a u32 * plaintext vector. See the equivalent operation on u64 data for more details. 
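For reviewers, the grouping arithmetic that host_integer_overflowing_sub (above) uses to recompute num_groups can be checked in isolation. The sketch below is illustrative only, with assumed parameters (message_modulus = carry_modulus = 4, eight radix blocks); it is not part of the backend.

#include <cmath>
#include <cstdint>
#include <cstdio>

int main() {
  // Assumed example parameters, not taken from the library defaults.
  uint32_t message_modulus = 4; // 2 message bits per block
  uint32_t carry_modulus = 4;   // 2 carry bits per block
  uint32_t num_blocks = 8;

  uint32_t block_modulus = message_modulus * carry_modulus; // 16
  uint32_t num_bits_in_block =
      static_cast<uint32_t>(std::log2(block_modulus)); // 4
  uint32_t grouping_size = num_bits_in_block;          // blocks per group
  uint32_t num_groups = (num_blocks + grouping_size - 1) / grouping_size;

  // Prints grouping_size=4 num_groups=2
  std::printf("grouping_size=%u num_groups=%u\n", grouping_size, num_groups);
  return 0;
}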
diff --git a/backends/tfhe-cuda-backend/cuda/src/linearalgebra/addition.cuh b/backends/tfhe-cuda-backend/cuda/src/linearalgebra/addition.cuh index 29e1f62689..3401cdadd2 100644 --- a/backends/tfhe-cuda-backend/cuda/src/linearalgebra/addition.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/linearalgebra/addition.cuh @@ -82,6 +82,46 @@ __host__ void host_addition(cudaStream_t stream, uint32_t gpu_index, T *output, check_cuda_error(cudaGetLastError()); } +template +__global__ void pack_for_overflowing_ops(T *output, T const *input_1, + T const *input_2, uint32_t num_entries, + uint32_t message_modulus) { + + int tid = threadIdx.x; + int index = blockIdx.x * blockDim.x + tid; + if (index < num_entries) { + // Here we take advantage of the wrapping behaviour of uint + output[index] = input_1[index] * message_modulus + input_2[index]; + } +} + +template +__host__ void host_pack_for_overflowing_ops(cudaStream_t stream, + uint32_t gpu_index, T *output, + T const *input_1, T const *input_2, + uint32_t input_lwe_dimension, + uint32_t input_lwe_ciphertext_count, + uint32_t message_modulus) { + + cudaSetDevice(gpu_index); + // lwe_size includes the presence of the body + // whereas lwe_dimension is the number of elements in the mask + int lwe_size = input_lwe_dimension + 1; + // Create a 1-dimensional grid of threads + int num_blocks = 0, num_threads = 0; + int num_entries = lwe_size; + getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads); + dim3 grid(num_blocks, 1, 1); + dim3 thds(num_threads, 1, 1); + + pack_for_overflowing_ops<<>>( + &output[(input_lwe_ciphertext_count - 1) * lwe_size], + &input_1[(input_lwe_ciphertext_count - 1) * lwe_size], + &input_2[(input_lwe_ciphertext_count - 1) * lwe_size], lwe_size, + message_modulus); + check_cuda_error(cudaGetLastError()); +} + template __global__ void subtraction(T *output, T const *input_1, T const *input_2, uint32_t num_entries) { diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap.cuh index 9215bc044e..7209aba72e 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap.cuh @@ -92,7 +92,7 @@ void execute_pbs_async(cudaStream_t const *streams, uint32_t const *gpu_indexes, uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, uint32_t grouping_factor, uint32_t input_lwe_ciphertext_count, PBS_TYPE pbs_type, - uint32_t lut_count, uint32_t lut_stride) { + uint32_t num_many_lut, uint32_t lut_stride) { switch (sizeof(Torus)) { case sizeof(uint32_t): @@ -126,7 +126,7 @@ void execute_pbs_async(cudaStream_t const *streams, uint32_t const *gpu_indexes, current_lwe_array_in, current_lwe_input_indexes, bootstrapping_keys[i], pbs_buffer[i], lwe_dimension, glwe_dimension, polynomial_size, base_log, level_count, num_inputs_on_gpu, - lut_count, lut_stride); + num_many_lut, lut_stride); } break; default: @@ -165,7 +165,7 @@ void execute_pbs_async(cudaStream_t const *streams, uint32_t const *gpu_indexes, current_lwe_array_in, current_lwe_input_indexes, bootstrapping_keys[i], pbs_buffer[i], lwe_dimension, glwe_dimension, polynomial_size, grouping_factor, base_log, level_count, - num_inputs_on_gpu, lut_count, lut_stride); + num_inputs_on_gpu, num_many_lut, lut_stride); } break; case CLASSICAL: @@ -194,7 +194,7 @@ void execute_pbs_async(cudaStream_t const *streams, uint32_t const *gpu_indexes, current_lwe_array_in, current_lwe_input_indexes, bootstrapping_keys[i], 
pbs_buffer[i], lwe_dimension, glwe_dimension, polynomial_size, base_log, level_count, num_inputs_on_gpu, - lut_count, lut_stride); + num_many_lut, lut_stride); } break; default: diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_classic.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_classic.cuh index 2e5f83d45b..c77b69b353 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_classic.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_classic.cuh @@ -45,7 +45,7 @@ __global__ void device_programmable_bootstrap_cg( const double2 *__restrict__ bootstrapping_key, double2 *join_buffer, uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, int8_t *device_mem, - uint64_t device_memory_size_per_block, uint32_t lut_count, + uint64_t device_memory_size_per_block, uint32_t num_many_lut, uint32_t lut_stride) { grid_group grid = this_grid(); @@ -152,8 +152,8 @@ __global__ void device_programmable_bootstrap_cg( // but we do the computation at block 0 to avoid waiting for extra blocks, // in case they're not synchronized sample_extract_mask(block_lwe_array_out, accumulator); - if (lut_count > 1) { - for (int i = 1; i < lut_count; i++) { + if (num_many_lut > 1) { + for (int i = 1; i < num_many_lut; i++) { auto next_lwe_array_out = lwe_array_out + (i * gridDim.z * (glwe_dimension * polynomial_size + 1)); @@ -168,8 +168,8 @@ __global__ void device_programmable_bootstrap_cg( } } else if (blockIdx.y == glwe_dimension) { sample_extract_body(block_lwe_array_out, accumulator, 0); - if (lut_count > 1) { - for (int i = 1; i < lut_count; i++) { + if (num_many_lut > 1) { + for (int i = 1; i < num_many_lut; i++) { auto next_lwe_array_out = lwe_array_out + @@ -235,7 +235,7 @@ __host__ void host_programmable_bootstrap_cg( pbs_buffer *buffer, uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, uint32_t input_lwe_ciphertext_count, - uint32_t lut_count, uint32_t lut_stride) { + uint32_t num_many_lut, uint32_t lut_stride) { // With SM each block corresponds to either the mask or body, no need to // duplicate data for each @@ -273,7 +273,7 @@ __host__ void host_programmable_bootstrap_cg( kernel_args[10] = &base_log; kernel_args[11] = &level_count; kernel_args[12] = &d_mem; - kernel_args[14] = &lut_count; + kernel_args[14] = &num_many_lut; kernel_args[15] = &lut_stride; if (max_shared_memory < partial_sm) { diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_multibit.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_multibit.cuh index d736534e48..5eb59c5b78 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_multibit.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_multibit.cuh @@ -32,7 +32,8 @@ __global__ void __launch_bounds__(params::degree / params::opt) uint32_t level_count, uint32_t grouping_factor, uint32_t lwe_offset, uint32_t lwe_chunk_size, uint32_t keybundle_size_per_input, int8_t *device_mem, uint64_t device_memory_size_per_block, - uint32_t lut_count, uint32_t lut_stride) { + uint32_t num_many_lut, uint32_t lut_stride) { + grid_group grid = this_grid(); // We use shared memory for the polynomials that are used often during the @@ -134,8 +135,8 @@ __global__ void __launch_bounds__(params::degree / params::opt) // default sample_extract_mask(block_lwe_array_out, accumulator); - if (lut_count > 1) { - for (int i = 
1; i < lut_count; i++) { + if (num_many_lut > 1) { + for (int i = 1; i < num_many_lut; i++) { auto next_lwe_array_out = lwe_array_out + (i * gridDim.z * (glwe_dimension * polynomial_size + 1)); @@ -153,8 +154,8 @@ __global__ void __launch_bounds__(params::degree / params::opt) sample_extract_body(block_lwe_array_out, accumulator, 0); - if (lut_count > 1) { - for (int i = 1; i < lut_count; i++) { + if (num_many_lut > 1) { + for (int i = 1; i < num_many_lut; i++) { auto next_lwe_array_out = lwe_array_out + @@ -293,7 +294,7 @@ __host__ void execute_cg_external_product_loop( Torus const *lwe_output_indexes, pbs_buffer *buffer, uint32_t num_samples, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log, - uint32_t level_count, uint32_t lwe_offset, uint32_t lut_count, + uint32_t level_count, uint32_t lwe_offset, uint32_t num_many_lut, uint32_t lut_stride) { uint64_t full_sm = @@ -343,7 +344,7 @@ __host__ void execute_cg_external_product_loop( kernel_args[16] = &chunk_size; kernel_args[17] = &keybundle_size_per_input; kernel_args[18] = &d_mem; - kernel_args[20] = &lut_count; + kernel_args[20] = &num_many_lut; kernel_args[21] = &lut_stride; dim3 grid_accumulate(level_count, glwe_dimension + 1, num_samples); @@ -379,7 +380,7 @@ __host__ void host_cg_multi_bit_programmable_bootstrap( pbs_buffer *buffer, uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log, uint32_t level_count, uint32_t num_samples, - uint32_t lut_count, uint32_t lut_stride) { + uint32_t num_many_lut, uint32_t lut_stride) { auto lwe_chunk_size = buffer->lwe_chunk_size; @@ -397,7 +398,7 @@ __host__ void host_cg_multi_bit_programmable_bootstrap( stream, gpu_index, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, lwe_array_out, lwe_output_indexes, buffer, num_samples, lwe_dimension, glwe_dimension, polynomial_size, - grouping_factor, base_log, level_count, lwe_offset, lut_count, + grouping_factor, base_log, level_count, lwe_offset, num_many_lut, lut_stride); } } diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cu b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cu index dd3d446204..a77db81e19 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cu +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cu @@ -123,7 +123,7 @@ void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector( Torus const *lwe_input_indexes, double2 const *bootstrapping_key, pbs_buffer *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, - uint32_t level_count, uint32_t num_samples, uint32_t lut_count, + uint32_t level_count, uint32_t num_samples, uint32_t num_many_lut, uint32_t lut_stride) { switch (polynomial_size) { @@ -133,7 +133,7 @@ void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, num_samples, - lut_count, lut_stride); + num_many_lut, lut_stride); break; case 512: host_programmable_bootstrap_tbc>( @@ -141,7 +141,7 @@ void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, 
num_samples, - lut_count, lut_stride); + num_many_lut, lut_stride); break; case 1024: host_programmable_bootstrap_tbc>( @@ -149,7 +149,7 @@ void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, num_samples, - lut_count, lut_stride); + num_many_lut, lut_stride); break; case 2048: host_programmable_bootstrap_tbc>( @@ -157,7 +157,7 @@ void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, num_samples, - lut_count, lut_stride); + num_many_lut, lut_stride); break; case 4096: host_programmable_bootstrap_tbc>( @@ -165,7 +165,7 @@ void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, num_samples, - lut_count, lut_stride); + num_many_lut, lut_stride); break; case 8192: host_programmable_bootstrap_tbc>( @@ -173,7 +173,7 @@ void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, num_samples, - lut_count, lut_stride); + num_many_lut, lut_stride); break; case 16384: host_programmable_bootstrap_tbc>( @@ -181,7 +181,7 @@ void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, num_samples, - lut_count, lut_stride); + num_many_lut, lut_stride); break; default: PANIC("Cuda error (classical PBS): unsupported polynomial size. 
" @@ -380,7 +380,7 @@ void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector( Torus const *lwe_input_indexes, double2 const *bootstrapping_key, pbs_buffer *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, - uint32_t level_count, uint32_t num_samples, uint32_t lut_count, + uint32_t level_count, uint32_t num_samples, uint32_t num_many_lut, uint32_t lut_stride) { switch (polynomial_size) { @@ -390,7 +390,7 @@ void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, num_samples, - lut_count, lut_stride); + num_many_lut, lut_stride); break; case 512: host_programmable_bootstrap_cg>( @@ -398,7 +398,7 @@ void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, num_samples, - lut_count, lut_stride); + num_many_lut, lut_stride); break; case 1024: host_programmable_bootstrap_cg>( @@ -406,7 +406,7 @@ void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, num_samples, - lut_count, lut_stride); + num_many_lut, lut_stride); break; case 2048: host_programmable_bootstrap_cg>( @@ -414,7 +414,7 @@ void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, num_samples, - lut_count, lut_stride); + num_many_lut, lut_stride); break; case 4096: host_programmable_bootstrap_cg>( @@ -422,7 +422,7 @@ void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, num_samples, - lut_count, lut_stride); + num_many_lut, lut_stride); break; case 8192: host_programmable_bootstrap_cg>( @@ -430,7 +430,7 @@ void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, num_samples, - lut_count, lut_stride); + num_many_lut, lut_stride); break; case 16384: host_programmable_bootstrap_cg>( @@ -438,7 +438,7 @@ void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, num_samples, - lut_count, lut_stride); + num_many_lut, lut_stride); break; default: PANIC("Cuda error (classical PBS): unsupported polynomial size. 
" @@ -455,7 +455,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector( Torus const *lwe_input_indexes, double2 const *bootstrapping_key, pbs_buffer *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, - uint32_t level_count, uint32_t num_samples, uint32_t lut_count, + uint32_t level_count, uint32_t num_samples, uint32_t num_many_lut, uint32_t lut_stride) { switch (polynomial_size) { @@ -465,7 +465,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, num_samples, - lut_count, lut_stride); + num_many_lut, lut_stride); break; case 512: host_programmable_bootstrap>( @@ -473,7 +473,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, num_samples, - lut_count, lut_stride); + num_many_lut, lut_stride); break; case 1024: host_programmable_bootstrap>( @@ -481,7 +481,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, num_samples, - lut_count, lut_stride); + num_many_lut, lut_stride); break; case 2048: host_programmable_bootstrap>( @@ -489,7 +489,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, num_samples, - lut_count, lut_stride); + num_many_lut, lut_stride); break; case 4096: host_programmable_bootstrap>( @@ -497,7 +497,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, num_samples, - lut_count, lut_stride); + num_many_lut, lut_stride); break; case 8192: host_programmable_bootstrap>( @@ -505,7 +505,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, num_samples, - lut_count, lut_stride); + num_many_lut, lut_stride); break; case 16384: host_programmable_bootstrap>( @@ -513,7 +513,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, num_samples, - lut_count, lut_stride); + num_many_lut, lut_stride); break; default: PANIC("Cuda error (classical PBS): unsupported polynomial size. 
" @@ -531,7 +531,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_32( void const *lwe_input_indexes, void const *bootstrapping_key, int8_t *mem_ptr, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, - uint32_t num_samples, uint32_t lut_count, uint32_t lut_stride) { + uint32_t num_samples, uint32_t num_many_lut, uint32_t lut_stride) { if (base_log > 32) PANIC("Cuda error (classical PBS): base log should be <= 32") @@ -551,7 +551,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_32( static_cast(lwe_input_indexes), static_cast(bootstrapping_key), buffer, lwe_dimension, glwe_dimension, polynomial_size, base_log, level_count, num_samples, - lut_count, lut_stride); + num_many_lut, lut_stride); break; #else PANIC("Cuda error (PBS): TBC pbs is not supported.") @@ -566,7 +566,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_32( static_cast(lwe_input_indexes), static_cast(bootstrapping_key), buffer, lwe_dimension, glwe_dimension, polynomial_size, base_log, level_count, num_samples, - lut_count, lut_stride); + num_many_lut, lut_stride); break; case DEFAULT: cuda_programmable_bootstrap_lwe_ciphertext_vector( @@ -578,7 +578,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_32( static_cast(lwe_input_indexes), static_cast(bootstrapping_key), buffer, lwe_dimension, glwe_dimension, polynomial_size, base_log, level_count, num_samples, - lut_count, lut_stride); + num_many_lut, lut_stride); break; default: PANIC("Cuda error (PBS): unknown pbs variant.") @@ -653,7 +653,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_64( void const *lwe_input_indexes, void const *bootstrapping_key, int8_t *mem_ptr, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, - uint32_t num_samples, uint32_t lut_count, uint32_t lut_stride) { + uint32_t num_samples, uint32_t num_many_lut, uint32_t lut_stride) { if (base_log > 64) PANIC("Cuda error (classical PBS): base log should be <= 64") @@ -672,7 +672,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_64( static_cast(lwe_input_indexes), static_cast(bootstrapping_key), buffer, lwe_dimension, glwe_dimension, polynomial_size, base_log, level_count, num_samples, - lut_count, lut_stride); + num_many_lut, lut_stride); break; #else PANIC("Cuda error (PBS): TBC pbs is not supported.") @@ -687,7 +687,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_64( static_cast(lwe_input_indexes), static_cast(bootstrapping_key), buffer, lwe_dimension, glwe_dimension, polynomial_size, base_log, level_count, num_samples, - lut_count, lut_stride); + num_many_lut, lut_stride); break; case PBS_VARIANT::DEFAULT: cuda_programmable_bootstrap_lwe_ciphertext_vector( @@ -699,7 +699,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_64( static_cast(lwe_input_indexes), static_cast(bootstrapping_key), buffer, lwe_dimension, glwe_dimension, polynomial_size, base_log, level_count, num_samples, - lut_count, lut_stride); + num_many_lut, lut_stride); break; default: PANIC("Cuda error (PBS): unknown pbs variant.") @@ -727,7 +727,7 @@ template void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector( uint64_t const *lwe_input_indexes, double2 const *bootstrapping_key, pbs_buffer *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, - uint32_t level_count, uint32_t num_samples, uint32_t lut_count, + uint32_t level_count, uint32_t num_samples, uint32_t num_many_lut, 
uint32_t lut_stride); template void cuda_programmable_bootstrap_lwe_ciphertext_vector( @@ -737,7 +737,7 @@ template void cuda_programmable_bootstrap_lwe_ciphertext_vector( uint64_t const *lwe_input_indexes, double2 const *bootstrapping_key, pbs_buffer *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, - uint32_t level_count, uint32_t num_samples, uint32_t lut_count, + uint32_t level_count, uint32_t num_samples, uint32_t num_many_lut, uint32_t lut_stride); template void scratch_cuda_programmable_bootstrap_cg( @@ -758,7 +758,7 @@ template void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector( uint32_t const *lwe_input_indexes, double2 const *bootstrapping_key, pbs_buffer *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, - uint32_t level_count, uint32_t num_samples, uint32_t lut_count, + uint32_t level_count, uint32_t num_samples, uint32_t num_many_lut, uint32_t lut_stride); template void cuda_programmable_bootstrap_lwe_ciphertext_vector( @@ -768,7 +768,7 @@ template void cuda_programmable_bootstrap_lwe_ciphertext_vector( uint32_t const *lwe_input_indexes, double2 const *bootstrapping_key, pbs_buffer *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, - uint32_t level_count, uint32_t num_samples, uint32_t lut_count, + uint32_t level_count, uint32_t num_samples, uint32_t num_many_lut, uint32_t lut_stride); template void scratch_cuda_programmable_bootstrap_cg( @@ -797,7 +797,7 @@ template void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector( uint32_t const *lwe_input_indexes, double2 const *bootstrapping_key, pbs_buffer *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, - uint32_t level_count, uint32_t num_samples, uint32_t lut_count, + uint32_t level_count, uint32_t num_samples, uint32_t num_many_lut, uint32_t lut_stride); template void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector( void *stream, uint32_t gpu_index, uint64_t *lwe_array_out, @@ -806,7 +806,7 @@ template void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector( uint64_t const *lwe_input_indexes, double2 const *bootstrapping_key, pbs_buffer *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, - uint32_t level_count, uint32_t num_samples, uint32_t lut_count, + uint32_t level_count, uint32_t num_samples, uint32_t num_many_lut, uint32_t lut_stride); template void scratch_cuda_programmable_bootstrap_tbc( void *stream, uint32_t gpu_index, diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cuh index 31f1e9487f..25701aca92 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cuh @@ -142,7 +142,7 @@ __global__ void __launch_bounds__(params::degree / params::opt) uint32_t lwe_iteration, uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, int8_t *device_mem, uint64_t device_memory_size_per_block, - uint32_t lut_count, uint32_t lut_stride) { + uint32_t num_many_lut, uint32_t lut_stride) { // We use shared memory for the polynomials that are used often during the // bootstrap, since shared memory is kept in L1 cache and accessing it is @@ -217,8 +217,8 @@ __global__ void 
__launch_bounds__(params::degree / params::opt) // but we do the computation at block 0 to avoid waiting for extra blocks, // in case they're not synchronized sample_extract_mask(block_lwe_array_out, accumulator); - if (lut_count > 1) { - for (int i = 1; i < lut_count; i++) { + if (num_many_lut > 1) { + for (int i = 1; i < num_many_lut; i++) { auto next_lwe_array_out = lwe_array_out + (i * gridDim.x * (glwe_dimension * polynomial_size + 1)); @@ -233,8 +233,8 @@ __global__ void __launch_bounds__(params::degree / params::opt) } } else if (blockIdx.y == glwe_dimension) { sample_extract_body(block_lwe_array_out, accumulator, 0); - if (lut_count > 1) { - for (int i = 1; i < lut_count; i++) { + if (num_many_lut > 1) { + for (int i = 1; i < num_many_lut; i++) { auto next_lwe_array_out = lwe_array_out + @@ -412,8 +412,8 @@ __host__ void execute_step_two( uint32_t input_lwe_ciphertext_count, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, int8_t *d_mem, int lwe_iteration, uint64_t partial_sm, - uint64_t partial_dm, uint64_t full_sm, uint64_t full_dm, uint32_t lut_count, - uint32_t lut_stride) { + uint64_t partial_dm, uint64_t full_sm, uint64_t full_dm, + uint32_t num_many_lut, uint32_t lut_stride) { int max_shared_memory = cuda_get_max_shared_memory(0); cudaSetDevice(gpu_index); @@ -426,21 +426,21 @@ __host__ void execute_step_two( lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes, bootstrapping_key, global_accumulator, global_join_buffer, lwe_iteration, lwe_dimension, polynomial_size, base_log, - level_count, d_mem, full_dm, lut_count, lut_stride); + level_count, d_mem, full_dm, num_many_lut, lut_stride); } else if (max_shared_memory < full_sm) { device_programmable_bootstrap_step_two <<>>( lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes, bootstrapping_key, global_accumulator, global_join_buffer, lwe_iteration, lwe_dimension, polynomial_size, base_log, - level_count, d_mem, partial_dm, lut_count, lut_stride); + level_count, d_mem, partial_dm, num_many_lut, lut_stride); } else { device_programmable_bootstrap_step_two <<>>( lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes, bootstrapping_key, global_accumulator, global_join_buffer, lwe_iteration, lwe_dimension, polynomial_size, base_log, - level_count, d_mem, 0, lut_count, lut_stride); + level_count, d_mem, 0, num_many_lut, lut_stride); } check_cuda_error(cudaGetLastError()); } @@ -456,7 +456,7 @@ __host__ void host_programmable_bootstrap( pbs_buffer *pbs_buffer, uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, uint32_t input_lwe_ciphertext_count, - uint32_t lut_count, uint32_t lut_stride) { + uint32_t num_many_lut, uint32_t lut_stride) { cudaSetDevice(gpu_index); // With SM each block corresponds to either the mask or body, no need to @@ -493,7 +493,7 @@ __host__ void host_programmable_bootstrap( global_join_buffer, input_lwe_ciphertext_count, lwe_dimension, glwe_dimension, polynomial_size, base_log, level_count, d_mem, i, partial_sm, partial_dm_step_two, full_sm_step_two, full_dm_step_two, - lut_count, lut_stride); + num_many_lut, lut_stride); } } diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cu b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cu index 72b8982549..b2a7f214e2 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cu +++ 
b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cu @@ -67,7 +67,7 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( pbs_buffer *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log, uint32_t level_count, uint32_t num_samples, - uint32_t lut_count, uint32_t lut_stride) { + uint32_t num_many_lut, uint32_t lut_stride) { switch (polynomial_size) { case 256: @@ -76,7 +76,7 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, grouping_factor, base_log, level_count, - num_samples, lut_count, lut_stride); + num_samples, num_many_lut, lut_stride); break; case 512: host_cg_multi_bit_programmable_bootstrap>( @@ -84,7 +84,7 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, grouping_factor, base_log, level_count, - num_samples, lut_count, lut_stride); + num_samples, num_many_lut, lut_stride); break; case 1024: host_cg_multi_bit_programmable_bootstrap>( @@ -92,7 +92,7 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, grouping_factor, base_log, level_count, - num_samples, lut_count, lut_stride); + num_samples, num_many_lut, lut_stride); break; case 2048: host_cg_multi_bit_programmable_bootstrap>( @@ -100,7 +100,7 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, grouping_factor, base_log, level_count, - num_samples, lut_count, lut_stride); + num_samples, num_many_lut, lut_stride); break; case 4096: host_cg_multi_bit_programmable_bootstrap>( @@ -108,7 +108,7 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, grouping_factor, base_log, level_count, - num_samples, lut_count, lut_stride); + num_samples, num_many_lut, lut_stride); break; case 8192: host_cg_multi_bit_programmable_bootstrap>( @@ -116,7 +116,7 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, grouping_factor, base_log, level_count, - num_samples, lut_count, lut_stride); + num_samples, num_many_lut, lut_stride); break; case 16384: host_cg_multi_bit_programmable_bootstrap>( @@ -124,7 +124,7 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, grouping_factor, base_log, level_count, - num_samples, lut_count, lut_stride); + num_samples, num_many_lut, lut_stride); break; default: PANIC("Cuda error (multi-bit PBS): unsupported polynomial size. 
Supported " @@ -142,7 +142,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( pbs_buffer *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log, uint32_t level_count, uint32_t num_samples, - uint32_t lut_count, uint32_t lut_stride) { + uint32_t num_many_lut, uint32_t lut_stride) { switch (polynomial_size) { case 256: @@ -151,7 +151,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, grouping_factor, base_log, level_count, - num_samples, lut_count, lut_stride); + num_samples, num_many_lut, lut_stride); break; case 512: host_multi_bit_programmable_bootstrap>( @@ -159,7 +159,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, grouping_factor, base_log, level_count, - num_samples, lut_count, lut_stride); + num_samples, num_many_lut, lut_stride); break; case 1024: host_multi_bit_programmable_bootstrap>( @@ -167,7 +167,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, grouping_factor, base_log, level_count, - num_samples, lut_count, lut_stride); + num_samples, num_many_lut, lut_stride); break; case 2048: host_multi_bit_programmable_bootstrap>( @@ -175,7 +175,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, grouping_factor, base_log, level_count, - num_samples, lut_count, lut_stride); + num_samples, num_many_lut, lut_stride); break; case 4096: host_multi_bit_programmable_bootstrap>( @@ -183,7 +183,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, grouping_factor, base_log, level_count, - num_samples, lut_count, lut_stride); + num_samples, num_many_lut, lut_stride); break; case 8192: host_multi_bit_programmable_bootstrap>( @@ -191,7 +191,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, grouping_factor, base_log, level_count, - num_samples, lut_count, lut_stride); + num_samples, num_many_lut, lut_stride); break; case 16384: host_multi_bit_programmable_bootstrap>( @@ -199,7 +199,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, grouping_factor, base_log, level_count, - num_samples, lut_count, lut_stride); + num_samples, num_many_lut, lut_stride); break; default: PANIC("Cuda error (multi-bit PBS): unsupported polynomial size. 
Supported " @@ -215,7 +215,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64( void const *lwe_input_indexes, void const *bootstrapping_key, int8_t *mem_ptr, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log, - uint32_t level_count, uint32_t num_samples, uint32_t lut_count, + uint32_t level_count, uint32_t num_samples, uint32_t num_many_lut, uint32_t lut_stride) { if (base_log > 64) @@ -236,7 +236,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64( static_cast(lwe_input_indexes), static_cast(bootstrapping_key), buffer, lwe_dimension, glwe_dimension, polynomial_size, grouping_factor, base_log, level_count, - num_samples, lut_count, lut_stride); + num_samples, num_many_lut, lut_stride); break; #else PANIC("Cuda error (multi-bit PBS): TBC pbs is not supported.") @@ -251,7 +251,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64( static_cast(lwe_input_indexes), static_cast(bootstrapping_key), buffer, lwe_dimension, glwe_dimension, polynomial_size, grouping_factor, base_log, level_count, - num_samples, lut_count, lut_stride); + num_samples, num_many_lut, lut_stride); break; case PBS_VARIANT::DEFAULT: cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( @@ -263,7 +263,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64( static_cast(lwe_input_indexes), static_cast(bootstrapping_key), buffer, lwe_dimension, glwe_dimension, polynomial_size, grouping_factor, base_log, level_count, - num_samples, lut_count, lut_stride); + num_samples, num_many_lut, lut_stride); break; default: PANIC("Cuda error (multi-bit PBS): unsupported implementation variant.") @@ -499,7 +499,7 @@ cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( pbs_buffer *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log, uint32_t level_count, uint32_t num_samples, - uint32_t lut_count, uint32_t lut_stride); + uint32_t num_many_lut, uint32_t lut_stride); template void scratch_cuda_cg_multi_bit_programmable_bootstrap( void *stream, uint32_t gpu_index, @@ -516,7 +516,7 @@ cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( pbs_buffer *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log, uint32_t level_count, uint32_t num_samples, - uint32_t lut_count, uint32_t lut_stride); + uint32_t num_many_lut, uint32_t lut_stride); template bool has_support_to_cuda_programmable_bootstrap_tbc_multi_bit( @@ -588,7 +588,7 @@ void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( pbs_buffer *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log, uint32_t level_count, uint32_t num_samples, - uint32_t lut_count, uint32_t lut_stride) { + uint32_t num_many_lut, uint32_t lut_stride) { if (base_log > 32) PANIC("Cuda error (multi-bit PBS): base log should be <= 32") @@ -600,7 +600,7 @@ void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, grouping_factor, base_log, level_count, - num_samples, lut_count, lut_stride); + num_samples, num_many_lut, lut_stride); break; case 512: host_tbc_multi_bit_programmable_bootstrap>( @@ -608,7 +608,7 @@ void 
cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, grouping_factor, base_log, level_count, - num_samples, lut_count, lut_stride); + num_samples, num_many_lut, lut_stride); break; case 1024: host_tbc_multi_bit_programmable_bootstrap>( @@ -616,7 +616,7 @@ void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, grouping_factor, base_log, level_count, - num_samples, lut_count, lut_stride); + num_samples, num_many_lut, lut_stride); break; case 2048: { int num_sms = 0; @@ -629,14 +629,14 @@ void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, grouping_factor, base_log, - level_count, num_samples, lut_count, lut_stride); + level_count, num_samples, num_many_lut, lut_stride); else host_tbc_multi_bit_programmable_bootstrap>( static_cast(stream), gpu_index, lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, grouping_factor, base_log, - level_count, num_samples, lut_count, lut_stride); + level_count, num_samples, num_many_lut, lut_stride); break; } @@ -646,7 +646,7 @@ void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, grouping_factor, base_log, level_count, - num_samples, lut_count, lut_stride); + num_samples, num_many_lut, lut_stride); break; case 8192: host_tbc_multi_bit_programmable_bootstrap>( @@ -654,7 +654,7 @@ void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, grouping_factor, base_log, level_count, - num_samples, lut_count, lut_stride); + num_samples, num_many_lut, lut_stride); break; case 16384: host_tbc_multi_bit_programmable_bootstrap>( @@ -662,7 +662,7 @@ void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, grouping_factor, base_log, level_count, - num_samples, lut_count, lut_stride); + num_samples, num_many_lut, lut_stride); break; default: PANIC("Cuda error (multi-bit PBS): unsupported polynomial size. 
Supported " @@ -685,5 +685,5 @@ cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( pbs_buffer *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log, uint32_t level_count, uint32_t num_samples, - uint32_t lut_count, uint32_t lut_stride); + uint32_t num_many_lut, uint32_t lut_stride); #endif diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cuh index a58647185b..ba73d29bf7 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cuh @@ -253,7 +253,7 @@ __global__ void __launch_bounds__(params::degree / params::opt) uint32_t polynomial_size, uint32_t level_count, uint32_t grouping_factor, uint32_t iteration, uint32_t lwe_offset, uint32_t lwe_chunk_size, int8_t *device_mem, - uint64_t device_memory_size_per_block, uint32_t lut_count, + uint64_t device_memory_size_per_block, uint32_t num_many_lut, uint32_t lut_stride) { // We use shared memory for the polynomials that are used often during the // bootstrap, since shared memory is kept in L1 cache and accessing it is @@ -326,8 +326,8 @@ __global__ void __launch_bounds__(params::degree / params::opt) // but we do the computation at block 0 to avoid waiting for extra blocks, // in case they're not synchronized sample_extract_mask(block_lwe_array_out, global_slice); - if (lut_count > 1) { - for (int i = 1; i < lut_count; i++) { + if (num_many_lut > 1) { + for (int i = 1; i < num_many_lut; i++) { auto next_lwe_array_out = lwe_array_out + (i * gridDim.x * (glwe_dimension * polynomial_size + 1)); @@ -342,8 +342,8 @@ __global__ void __launch_bounds__(params::degree / params::opt) } } else if (blockIdx.y == glwe_dimension) { sample_extract_body(block_lwe_array_out, global_slice, 0); - if (lut_count > 1) { - for (int i = 1; i < lut_count; i++) { + if (num_many_lut > 1) { + for (int i = 1; i < num_many_lut; i++) { auto next_lwe_array_out = lwe_array_out + @@ -591,12 +591,14 @@ execute_step_one(cudaStream_t stream, uint32_t gpu_index, } template -__host__ void execute_step_two( - cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out, - Torus const *lwe_output_indexes, pbs_buffer *buffer, - uint32_t num_samples, uint32_t lwe_dimension, uint32_t glwe_dimension, - uint32_t polynomial_size, int32_t grouping_factor, uint32_t level_count, - uint32_t j, uint32_t lwe_offset, uint32_t lut_count, uint32_t lut_stride) { +__host__ void +execute_step_two(cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out, + Torus const *lwe_output_indexes, + pbs_buffer *buffer, uint32_t num_samples, + uint32_t lwe_dimension, uint32_t glwe_dimension, + uint32_t polynomial_size, int32_t grouping_factor, + uint32_t level_count, uint32_t j, uint32_t lwe_offset, + uint32_t num_many_lut, uint32_t lut_stride) { auto lwe_chunk_size = buffer->lwe_chunk_size; uint64_t full_sm_accumulate_step_two = @@ -621,7 +623,7 @@ __host__ void execute_step_two( global_accumulator, global_accumulator_fft, lwe_dimension, glwe_dimension, polynomial_size, level_count, grouping_factor, j, lwe_offset, lwe_chunk_size, d_mem, full_sm_accumulate_step_two, - lut_count, lut_stride); + num_many_lut, lut_stride); else device_multi_bit_programmable_bootstrap_accumulate_step_two @@ -630,7 +632,7 @@ __host__ void execute_step_two( global_accumulator, global_accumulator_fft, lwe_dimension, 
glwe_dimension, polynomial_size, level_count, grouping_factor, j, lwe_offset, lwe_chunk_size, d_mem, 0, - lut_count, lut_stride); + num_many_lut, lut_stride); check_cuda_error(cudaGetLastError()); } @@ -643,7 +645,7 @@ __host__ void host_multi_bit_programmable_bootstrap( pbs_buffer *buffer, uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log, uint32_t level_count, uint32_t num_samples, - uint32_t lut_count, uint32_t lut_stride) { + uint32_t num_many_lut, uint32_t lut_stride) { auto lwe_chunk_size = buffer->lwe_chunk_size; @@ -667,7 +669,8 @@ __host__ void host_multi_bit_programmable_bootstrap( execute_step_two( stream, gpu_index, lwe_array_out, lwe_output_indexes, buffer, num_samples, lwe_dimension, glwe_dimension, polynomial_size, - grouping_factor, level_count, j, lwe_offset, lut_count, lut_stride); + grouping_factor, level_count, j, lwe_offset, num_many_lut, + lut_stride); } } } diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_classic.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_classic.cuh index b7dc557e3a..bbbf6f2eee 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_classic.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_classic.cuh @@ -45,8 +45,8 @@ __global__ void device_programmable_bootstrap_tbc( const double2 *__restrict__ bootstrapping_key, double2 *join_buffer, uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, int8_t *device_mem, - uint64_t device_memory_size_per_block, bool support_dsm, uint32_t lut_count, - uint32_t lut_stride) { + uint64_t device_memory_size_per_block, bool support_dsm, + uint32_t num_many_lut, uint32_t lut_stride) { cluster_group cluster = this_cluster(); @@ -158,8 +158,8 @@ __global__ void device_programmable_bootstrap_tbc( // in case they're not synchronized sample_extract_mask(block_lwe_array_out, accumulator); - if (lut_count > 1) { - for (int i = 1; i < lut_count; i++) { + if (num_many_lut > 1) { + for (int i = 1; i < num_many_lut; i++) { auto next_lwe_array_out = lwe_array_out + (i * gridDim.z * (glwe_dimension * polynomial_size + 1)); @@ -175,8 +175,8 @@ __global__ void device_programmable_bootstrap_tbc( } else if (blockIdx.y == glwe_dimension) { sample_extract_body(block_lwe_array_out, accumulator, 0); - if (lut_count > 1) { - for (int i = 1; i < lut_count; i++) { + if (num_many_lut > 1) { + for (int i = 1; i < num_many_lut; i++) { auto next_lwe_array_out = lwe_array_out + @@ -261,7 +261,7 @@ __host__ void host_programmable_bootstrap_tbc( pbs_buffer *buffer, uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, uint32_t input_lwe_ciphertext_count, - uint32_t lut_count, uint32_t lut_stride) { + uint32_t num_many_lut, uint32_t lut_stride) { auto supports_dsm = supports_distributed_shared_memory_on_classic_programmable_bootstrap< @@ -317,7 +317,7 @@ __host__ void host_programmable_bootstrap_tbc( lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer_fft, lwe_dimension, polynomial_size, base_log, level_count, d_mem, full_dm, - supports_dsm, lut_count, lut_stride)); + supports_dsm, num_many_lut, lut_stride)); } else if (max_shared_memory < full_sm + minimum_sm_tbc) { config.dynamicSmemBytes = partial_sm + minimum_sm_tbc; @@ -326,7 +326,7 @@ __host__ void host_programmable_bootstrap_tbc( lwe_array_out, 
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer_fft, lwe_dimension, polynomial_size, base_log, level_count, d_mem, - partial_dm, supports_dsm, lut_count, lut_stride)); + partial_dm, supports_dsm, num_many_lut, lut_stride)); } else { config.dynamicSmemBytes = full_sm + minimum_sm_tbc; @@ -335,7 +335,7 @@ __host__ void host_programmable_bootstrap_tbc( lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer_fft, lwe_dimension, polynomial_size, base_log, level_count, d_mem, 0, - supports_dsm, lut_count, lut_stride)); + supports_dsm, num_many_lut, lut_stride)); } } diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_multibit.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_multibit.cuh index 22b6f4e196..701f80379b 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_multibit.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_multibit.cuh @@ -32,7 +32,7 @@ __global__ void __launch_bounds__(params::degree / params::opt) uint32_t level_count, uint32_t grouping_factor, uint32_t lwe_offset, uint32_t lwe_chunk_size, uint32_t keybundle_size_per_input, int8_t *device_mem, uint64_t device_memory_size_per_block, - bool support_dsm, uint32_t lut_count, uint32_t lut_stride) { + bool support_dsm, uint32_t num_many_lut, uint32_t lut_stride) { cluster_group cluster = this_cluster(); @@ -141,8 +141,8 @@ __global__ void __launch_bounds__(params::degree / params::opt) // blocks, in case they're not synchronized sample_extract_mask(block_lwe_array_out, accumulator); - if (lut_count > 1) { - for (int i = 1; i < lut_count; i++) { + if (num_many_lut > 1) { + for (int i = 1; i < num_many_lut; i++) { auto next_lwe_array_out = lwe_array_out + (i * gridDim.z * (glwe_dimension * polynomial_size + 1)); @@ -157,8 +157,8 @@ __global__ void __launch_bounds__(params::degree / params::opt) } } else if (blockIdx.y == glwe_dimension) { sample_extract_body(block_lwe_array_out, accumulator, 0); - if (lut_count > 1) { - for (int i = 1; i < lut_count; i++) { + if (num_many_lut > 1) { + for (int i = 1; i < num_many_lut; i++) { auto next_lwe_array_out = lwe_array_out + @@ -299,7 +299,7 @@ __host__ void execute_tbc_external_product_loop( Torus const *lwe_output_indexes, pbs_buffer *buffer, uint32_t num_samples, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log, - uint32_t level_count, uint32_t lwe_offset, uint32_t lut_count, + uint32_t level_count, uint32_t lwe_offset, uint32_t num_many_lut, uint32_t lut_stride) { auto lwe_chunk_size = buffer->lwe_chunk_size; @@ -363,7 +363,7 @@ __host__ void execute_tbc_external_product_loop( lwe_array_in, lwe_input_indexes, keybundle_fft, buffer_fft, global_accumulator, lwe_dimension, glwe_dimension, polynomial_size, base_log, level_count, grouping_factor, lwe_offset, chunk_size, - keybundle_size_per_input, d_mem, full_dm, supports_dsm, lut_count, + keybundle_size_per_input, d_mem, full_dm, supports_dsm, num_many_lut, lut_stride)); } else if (max_shared_memory < full_dm + minimum_dm) { config.dynamicSmemBytes = partial_dm + minimum_dm; @@ -375,7 +375,7 @@ __host__ void execute_tbc_external_product_loop( lwe_array_in, lwe_input_indexes, keybundle_fft, buffer_fft, global_accumulator, lwe_dimension, glwe_dimension, polynomial_size, base_log, level_count, grouping_factor, lwe_offset, chunk_size, - 
keybundle_size_per_input, d_mem, partial_dm, supports_dsm, lut_count, + keybundle_size_per_input, d_mem, partial_dm, supports_dsm, num_many_lut, lut_stride)); } else { config.dynamicSmemBytes = full_dm + minimum_dm; @@ -387,7 +387,7 @@ __host__ void execute_tbc_external_product_loop( lwe_array_in, lwe_input_indexes, keybundle_fft, buffer_fft, global_accumulator, lwe_dimension, glwe_dimension, polynomial_size, base_log, level_count, grouping_factor, lwe_offset, chunk_size, - keybundle_size_per_input, d_mem, 0, supports_dsm, lut_count, + keybundle_size_per_input, d_mem, 0, supports_dsm, num_many_lut, lut_stride)); } } @@ -401,7 +401,7 @@ __host__ void host_tbc_multi_bit_programmable_bootstrap( pbs_buffer *buffer, uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log, uint32_t level_count, uint32_t num_samples, - uint32_t lut_count, uint32_t lut_stride) { + uint32_t num_many_lut, uint32_t lut_stride) { cudaSetDevice(gpu_index); auto lwe_chunk_size = buffer->lwe_chunk_size; @@ -419,7 +419,7 @@ __host__ void host_tbc_multi_bit_programmable_bootstrap( stream, gpu_index, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, lwe_array_out, lwe_output_indexes, buffer, num_samples, lwe_dimension, glwe_dimension, polynomial_size, - grouping_factor, base_log, level_count, lwe_offset, lut_count, + grouping_factor, base_log, level_count, lwe_offset, num_many_lut, lut_stride); } } diff --git a/backends/tfhe-cuda-backend/cuda/src/utils/helper_multi_gpu.cuh b/backends/tfhe-cuda-backend/cuda/src/utils/helper_multi_gpu.cuh index eb09d17e52..8de4cc197e 100644 --- a/backends/tfhe-cuda-backend/cuda/src/utils/helper_multi_gpu.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/utils/helper_multi_gpu.cuh @@ -46,6 +46,24 @@ void multi_gpu_alloc_lwe_async(cudaStream_t const *streams, } } +/// Allocates the input/output vector for all devices +/// Initializes also the related indexing and initializes it to the trivial +/// index +template +void multi_gpu_alloc_lwe_many_lut_output_async( + cudaStream_t const *streams, uint32_t const *gpu_indexes, + uint32_t gpu_count, std::vector &dest, uint32_t num_inputs, + uint32_t num_many_lut, uint32_t lwe_size) { + dest.resize(gpu_count); + for (uint i = 0; i < gpu_count; i++) { + auto inputs_on_gpu = get_num_inputs_on_gpu(num_inputs, i, gpu_count); + Torus *d_array = (Torus *)cuda_malloc_async(num_many_lut * inputs_on_gpu * + lwe_size * sizeof(Torus), + streams[i], gpu_indexes[i]); + dest[i] = d_array; + } +} + /// Load an array residing on one GPU to all active gpus /// and split the array among them. /// The input indexing logic is given by an index array. 
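Note on the hunk above: the new multi_gpu_alloc_lwe_many_lut_output_async helper sizes each GPU's output vector so that every input assigned to that GPU keeps room for all of its LUT results. The host-side sketch below only restates that sizing arithmetic for illustration; it is not part of the change, and split_inputs_evenly is a hypothetical stand-in for get_num_inputs_on_gpu under the simplifying assumption of an even input split.

#include <cstddef>
#include <cstdint>
#include <vector>

// Hypothetical stand-in for get_num_inputs_on_gpu: assumes inputs are split
// as evenly as possible across the GPUs (illustration only).
static uint32_t split_inputs_evenly(uint32_t num_inputs, uint32_t gpu_id,
                                    uint32_t gpu_count) {
  uint32_t base = num_inputs / gpu_count;
  uint32_t remainder = num_inputs % gpu_count;
  return base + (gpu_id < remainder ? 1u : 0u);
}

// Bytes to allocate on each GPU for the many-LUT output vector: every input
// assigned to a GPU stores num_many_lut LWE ciphertexts of lwe_size words.
template <typename Torus>
std::vector<size_t> many_lut_output_bytes_per_gpu(uint32_t gpu_count,
                                                  uint32_t num_inputs,
                                                  uint32_t num_many_lut,
                                                  uint32_t lwe_size) {
  std::vector<size_t> bytes(gpu_count);
  for (uint32_t i = 0; i < gpu_count; i++) {
    size_t inputs_on_gpu = split_inputs_evenly(num_inputs, i, gpu_count);
    bytes[i] = static_cast<size_t>(num_many_lut) * inputs_on_gpu * lwe_size *
               sizeof(Torus);
  }
  return bytes;
}

For example, with 2 GPUs, 5 inputs, num_many_lut = 2 and lwe_size = 2049, the split is 3 and 2 inputs, giving 2 * 3 * 2049 and 2 * 2 * 2049 Torus words per device.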
@@ -126,6 +144,49 @@ void multi_gpu_gather_lwe_async(cudaStream_t const *streams, } } +/// Copy data from multiple GPUs back to GPU 0 following the indexing given in +/// dest_indexes +/// The input indexing should be the trivial one +template +void multi_gpu_gather_many_lut_lwe_async( + cudaStream_t const *streams, uint32_t const *gpu_indexes, + uint32_t gpu_count, Torus *dest, const std::vector &src, + Torus *h_dest_indexes, bool is_trivial_index, uint32_t num_inputs, + uint32_t lwe_size, uint32_t num_many_lut) { + + for (uint lut_id = 0; lut_id < num_many_lut; lut_id++) { + for (uint i = 0; i < gpu_count; i++) { + auto inputs_on_gpu = get_num_inputs_on_gpu(num_inputs, i, gpu_count); + auto gpu_offset = 0; + for (uint j = 0; j < i; j++) { + gpu_offset += get_num_inputs_on_gpu(num_inputs, j, gpu_count); + } + + if (is_trivial_index) { + auto d_dest = + dest + gpu_offset * lwe_size + lut_id * num_inputs * lwe_size; + auto d_src = src[i] + lut_id * inputs_on_gpu * lwe_size; + + cuda_memcpy_async_gpu_to_gpu(d_dest, d_src, + inputs_on_gpu * lwe_size * sizeof(Torus), + streams[i], gpu_indexes[i]); + } else { + auto dest_indexes = h_dest_indexes + gpu_offset; + + for (uint j = 0; j < inputs_on_gpu; j++) { + auto d_dest = dest + dest_indexes[j] * lwe_size + + lut_id * num_inputs * lwe_size; + auto d_src = + src[i] + j * lwe_size + lut_id * inputs_on_gpu * lwe_size; + + cuda_memcpy_async_gpu_to_gpu(d_dest, d_src, lwe_size * sizeof(Torus), + streams[i], gpu_indexes[i]); + } + } + } + } +} + template void multi_gpu_release_async(cudaStream_t const *streams, uint32_t const *gpu_indexes, diff --git a/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/benchmarks/benchmark_pbs.cpp b/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/benchmarks/benchmark_pbs.cpp index ae6ec59f31..7b4cbe9802 100644 --- a/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/benchmarks/benchmark_pbs.cpp +++ b/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/benchmarks/benchmark_pbs.cpp @@ -177,7 +177,7 @@ BENCHMARK_DEFINE_F(MultiBitBootstrap_u64, TbcMultiBit) stream, gpu_index, (pbs_buffer **)&buffer, glwe_dimension, polynomial_size, pbs_level, input_lwe_ciphertext_count, true); - uint32_t lut_count = 1; + uint32_t num_many_lut = 1; uint32_t lut_stride = 0; for (auto _ : st) { // Execute PBS @@ -186,7 +186,7 @@ BENCHMARK_DEFINE_F(MultiBitBootstrap_u64, TbcMultiBit) d_lut_pbs_identity, d_lut_pbs_indexes, d_lwe_ct_in_array, d_lwe_input_indexes, d_bsk, (pbs_buffer *)buffer, lwe_dimension, glwe_dimension, polynomial_size, grouping_factor, - pbs_base_log, pbs_level, input_lwe_ciphertext_count, lut_count, + pbs_base_log, pbs_level, input_lwe_ciphertext_count, num_many_lut, lut_stride); cuda_synchronize_stream(stream, gpu_index); } @@ -208,7 +208,7 @@ BENCHMARK_DEFINE_F(MultiBitBootstrap_u64, CgMultiBit) stream, gpu_index, (pbs_buffer **)&buffer, glwe_dimension, polynomial_size, pbs_level, input_lwe_ciphertext_count, true); - uint32_t lut_count = 1; + uint32_t num_many_lut = 1; uint32_t lut_stride = 0; for (auto _ : st) { // Execute PBS @@ -221,7 +221,7 @@ BENCHMARK_DEFINE_F(MultiBitBootstrap_u64, CgMultiBit) (const uint64_t *)d_lwe_input_indexes, (const uint64_t *)d_bsk, (pbs_buffer *)buffer, lwe_dimension, glwe_dimension, polynomial_size, grouping_factor, pbs_base_log, - pbs_level, input_lwe_ciphertext_count, lut_count, lut_stride); + pbs_level, input_lwe_ciphertext_count, num_many_lut, lut_stride); cuda_synchronize_stream(stream, gpu_index); } @@ -234,7 +234,7 @@ BENCHMARK_DEFINE_F(MultiBitBootstrap_u64, 
DefaultMultiBit) stream, gpu_index, (pbs_buffer **)&buffer, glwe_dimension, polynomial_size, pbs_level, input_lwe_ciphertext_count, true); - uint32_t lut_count = 1; + uint32_t num_many_lut = 1; uint32_t lut_stride = 0; for (auto _ : st) { // Execute PBS @@ -243,7 +243,7 @@ BENCHMARK_DEFINE_F(MultiBitBootstrap_u64, DefaultMultiBit) d_lut_pbs_identity, d_lut_pbs_indexes, d_lwe_ct_in_array, d_lwe_input_indexes, d_bsk, (pbs_buffer *)buffer, lwe_dimension, glwe_dimension, polynomial_size, grouping_factor, - pbs_base_log, pbs_level, input_lwe_ciphertext_count, lut_count, + pbs_base_log, pbs_level, input_lwe_ciphertext_count, num_many_lut, lut_stride); cuda_synchronize_stream(stream, gpu_index); } @@ -265,7 +265,7 @@ BENCHMARK_DEFINE_F(ClassicalBootstrap_u64, TbcPBC) stream, gpu_index, (pbs_buffer **)&buffer, glwe_dimension, polynomial_size, pbs_level, input_lwe_ciphertext_count, true); - uint32_t lut_count = 1; + uint32_t num_many_lut = 1; uint32_t lut_stride = 0; for (auto _ : st) { // Execute PBS @@ -276,7 +276,7 @@ BENCHMARK_DEFINE_F(ClassicalBootstrap_u64, TbcPBC) (uint64_t *)d_lwe_input_indexes, (double2 *)d_fourier_bsk, (pbs_buffer *)buffer, lwe_dimension, glwe_dimension, polynomial_size, pbs_base_log, pbs_level, - input_lwe_ciphertext_count, lut_count, lut_stride); + input_lwe_ciphertext_count, num_many_lut, lut_stride); cuda_synchronize_stream(stream, gpu_index); } @@ -297,7 +297,7 @@ BENCHMARK_DEFINE_F(ClassicalBootstrap_u64, CgPBS) stream, gpu_index, (pbs_buffer **)&buffer, glwe_dimension, polynomial_size, pbs_level, input_lwe_ciphertext_count, true); - uint32_t lut_count = 1; + uint32_t num_many_lut = 1; uint32_t lut_stride = 0; for (auto _ : st) { // Execute PBS @@ -308,7 +308,7 @@ BENCHMARK_DEFINE_F(ClassicalBootstrap_u64, CgPBS) (uint64_t *)d_lwe_input_indexes, (double2 *)d_fourier_bsk, (pbs_buffer *)buffer, lwe_dimension, glwe_dimension, polynomial_size, pbs_base_log, pbs_level, - input_lwe_ciphertext_count, lut_count, lut_stride); + input_lwe_ciphertext_count, num_many_lut, lut_stride); cuda_synchronize_stream(stream, gpu_index); } @@ -322,7 +322,7 @@ BENCHMARK_DEFINE_F(ClassicalBootstrap_u64, DefaultPBS) stream, gpu_index, (pbs_buffer **)&buffer, glwe_dimension, polynomial_size, pbs_level, input_lwe_ciphertext_count, true); - uint32_t lut_count = 1; + uint32_t num_many_lut = 1; uint32_t lut_stride = 0; for (auto _ : st) { // Execute PBS @@ -333,7 +333,7 @@ BENCHMARK_DEFINE_F(ClassicalBootstrap_u64, DefaultPBS) (uint64_t *)d_lwe_input_indexes, (double2 *)d_fourier_bsk, (pbs_buffer *)buffer, lwe_dimension, glwe_dimension, polynomial_size, pbs_base_log, pbs_level, - input_lwe_ciphertext_count, lut_count, lut_stride); + input_lwe_ciphertext_count, num_many_lut, lut_stride); cuda_synchronize_stream(stream, gpu_index); } diff --git a/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/tests/test_classical_pbs.cpp b/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/tests/test_classical_pbs.cpp index 11e4dd3122..5a5223d4fe 100644 --- a/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/tests/test_classical_pbs.cpp +++ b/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/tests/test_classical_pbs.cpp @@ -173,7 +173,7 @@ TEST_P(ClassicalProgrammableBootstrapTestPrimitives_u64, bootstrap) { cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0); int bsk_size = (glwe_dimension + 1) * (glwe_dimension + 1) * pbs_level * polynomial_size * (lwe_dimension + 1); - uint32_t lut_count = 1; + uint32_t num_many_lut = 1; uint32_t lut_stride = 0; // Here execute the PBS for 
(int r = 0; r < repetitions; r++) { @@ -192,7 +192,7 @@ TEST_P(ClassicalProgrammableBootstrapTestPrimitives_u64, bootstrap) { (void *)d_lut_pbs_indexes, (void *)d_lwe_ct_in, (void *)d_lwe_input_indexes, (void *)d_fourier_bsk, pbs_buffer, lwe_dimension, glwe_dimension, polynomial_size, pbs_base_log, - pbs_level, number_of_inputs, lut_count, lut_stride); + pbs_level, number_of_inputs, num_many_lut, lut_stride); // Copy result back cuda_memcpy_async_to_cpu(lwe_ct_out_array, d_lwe_ct_out_array, (glwe_dimension * polynomial_size + 1) * diff --git a/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/tests/test_multibit_pbs.cpp b/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/tests/test_multibit_pbs.cpp index eec69ddd38..a621233454 100644 --- a/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/tests/test_multibit_pbs.cpp +++ b/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/tests/test_multibit_pbs.cpp @@ -119,7 +119,7 @@ TEST_P(MultiBitProgrammableBootstrapTestPrimitives_u64, (glwe_dimension + 1) * (glwe_dimension + 1) * polynomial_size * (1 << grouping_factor); - uint32_t lut_count = 1; + uint32_t num_many_lut = 1; uint32_t lut_stride = 0; for (int r = 0; r < repetitions; r++) { uint64_t *d_bsk = d_bsk_array + (ptrdiff_t)(bsk_size * r); @@ -137,7 +137,7 @@ TEST_P(MultiBitProgrammableBootstrapTestPrimitives_u64, (void *)d_lut_pbs_indexes, (void *)d_lwe_ct_in, (void *)d_lwe_input_indexes, (void *)d_bsk, pbs_buffer, lwe_dimension, glwe_dimension, polynomial_size, grouping_factor, pbs_base_log, - pbs_level, number_of_inputs, lut_count, lut_stride); + pbs_level, number_of_inputs, num_many_lut, lut_stride); // Copy result to the host memory cuda_memcpy_async_to_cpu(lwe_ct_out_array, d_lwe_ct_out_array, diff --git a/backends/tfhe-cuda-backend/src/bindings.rs b/backends/tfhe-cuda-backend/src/bindings.rs index df0f91ab23..d6bf96fe14 100644 --- a/backends/tfhe-cuda-backend/src/bindings.rs +++ b/backends/tfhe-cuda-backend/src/bindings.rs @@ -721,6 +721,32 @@ extern "C" { message_modulus: u32, carry_modulus: u32, pbs_type: PBS_TYPE, + requested_flag: u32, + uses_carry: u32, + allocate_gpu_memory: bool, + ); +} +extern "C" { + pub fn scratch_cuda_add_and_propagate_single_carry_kb_64_inplace( + streams: *const *mut ffi::c_void, + gpu_indexes: *const u32, + gpu_count: u32, + mem_ptr: *mut *mut i8, + glwe_dimension: u32, + polynomial_size: u32, + big_lwe_dimension: u32, + small_lwe_dimension: u32, + ks_level: u32, + ks_base_log: u32, + pbs_level: u32, + pbs_base_log: u32, + grouping_factor: u32, + num_blocks: u32, + message_modulus: u32, + carry_modulus: u32, + pbs_type: PBS_TYPE, + requested_flag: u32, + uses_carry: u32, allocate_gpu_memory: bool, ); } @@ -731,24 +757,30 @@ extern "C" { gpu_count: u32, lwe_array: *mut ffi::c_void, carry_out: *mut ffi::c_void, + carry_in: *const ffi::c_void, mem_ptr: *mut i8, bsks: *const *mut ffi::c_void, ksks: *const *mut ffi::c_void, num_blocks: u32, + requested_flag: u32, + uses_carry: u32, ); } extern "C" { - pub fn cuda_propagate_single_carry_get_input_carries_kb_64_inplace( + pub fn cuda_add_and_propagate_single_carry_kb_64_inplace( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, gpu_count: u32, - lwe_array: *mut ffi::c_void, + lhs_array: *mut ffi::c_void, + rhs_array: *const ffi::c_void, carry_out: *mut ffi::c_void, - input_carries: *mut ffi::c_void, + carry_in: *const ffi::c_void, mem_ptr: *mut i8, bsks: *const *mut ffi::c_void, ksks: *const *mut ffi::c_void, num_blocks: u32, + requested_flag: u32, + uses_carry: u32, ); } extern "C" { @@ 
-760,43 +792,55 @@ extern "C" { ); } extern "C" { - pub fn scratch_cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64( + pub fn cleanup_cuda_add_and_propagate_single_carry( + streams: *const *mut ffi::c_void, + gpu_indexes: *const u32, + gpu_count: u32, + mem_ptr_void: *mut *mut i8, + ); +} +extern "C" { + pub fn scratch_cuda_integer_overflowing_sub_kb_64_inplace( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, gpu_count: u32, mem_ptr: *mut *mut i8, glwe_dimension: u32, polynomial_size: u32, - lwe_dimension: u32, + big_lwe_dimension: u32, + small_lwe_dimension: u32, ks_level: u32, ks_base_log: u32, pbs_level: u32, pbs_base_log: u32, grouping_factor: u32, - num_blocks_in_radix: u32, - max_num_radix_in_vec: u32, + num_blocks: u32, message_modulus: u32, carry_modulus: u32, pbs_type: PBS_TYPE, + compute_overflow: u32, allocate_gpu_memory: bool, ); } extern "C" { - pub fn cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64( + pub fn cuda_integer_overflowing_sub_kb_64_inplace( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, gpu_count: u32, - radix_lwe_out: *mut ffi::c_void, - radix_lwe_vec: *mut ffi::c_void, - num_radix_in_vec: u32, + lhs_array: *mut ffi::c_void, + rhs_array: *const ffi::c_void, + overflow_block: *mut ffi::c_void, + input_borrow: *const ffi::c_void, mem_ptr: *mut i8, bsks: *const *mut ffi::c_void, ksks: *const *mut ffi::c_void, - num_blocks_in_radix: u32, + num_blocks: u32, + compute_overflow: u32, + uses_input_borrow: u32, ); } extern "C" { - pub fn cleanup_cuda_integer_radix_partial_sum_ciphertexts_vec( + pub fn cleanup_cuda_integer_overflowing_sub( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, gpu_count: u32, @@ -804,21 +848,21 @@ extern "C" { ); } extern "C" { - pub fn scratch_cuda_integer_radix_overflowing_sub_kb_64( + pub fn scratch_cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, gpu_count: u32, mem_ptr: *mut *mut i8, glwe_dimension: u32, polynomial_size: u32, - big_lwe_dimension: u32, - small_lwe_dimension: u32, + lwe_dimension: u32, ks_level: u32, ks_base_log: u32, pbs_level: u32, pbs_base_log: u32, grouping_factor: u32, - num_blocks: u32, + num_blocks_in_radix: u32, + max_num_radix_in_vec: u32, message_modulus: u32, carry_modulus: u32, pbs_type: PBS_TYPE, @@ -826,14 +870,13 @@ extern "C" { ); } extern "C" { - pub fn cuda_integer_radix_overflowing_sub_kb_64( + pub fn cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, gpu_count: u32, radix_lwe_out: *mut ffi::c_void, - radix_lwe_overflowed: *mut ffi::c_void, - radix_lwe_left: *const ffi::c_void, - radix_lwe_right: *const ffi::c_void, + radix_lwe_vec: *mut ffi::c_void, + num_radix_in_vec: u32, mem_ptr: *mut i8, bsks: *const *mut ffi::c_void, ksks: *const *mut ffi::c_void, @@ -841,7 +884,7 @@ extern "C" { ); } extern "C" { - pub fn cleanup_cuda_integer_radix_overflowing_sub( + pub fn cleanup_cuda_integer_radix_partial_sum_ciphertexts_vec( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, gpu_count: u32, @@ -942,52 +985,6 @@ extern "C" { mem_ptr_void: *mut *mut i8, ); } -extern "C" { - pub fn scratch_cuda_signed_overflowing_add_or_sub_radix_ciphertext_kb_64( - streams: *const *mut ffi::c_void, - gpu_indexes: *const u32, - gpu_count: u32, - mem_ptr: *mut *mut i8, - glwe_dimension: u32, - polynomial_size: u32, - big_lwe_dimension: u32, - small_lwe_dimension: u32, - ks_level: u32, - ks_base_log: u32, - pbs_level: u32, - pbs_base_log: u32, - grouping_factor: 
u32, - num_blocks: u32, - signed_operation: i8, - message_modulus: u32, - carry_modulus: u32, - pbs_type: PBS_TYPE, - allocate_gpu_memory: bool, - ); -} -extern "C" { - pub fn cuda_signed_overflowing_add_or_sub_radix_ciphertext_kb_64( - streams: *const *mut ffi::c_void, - gpu_indexes: *const u32, - gpu_count: u32, - lhs: *mut ffi::c_void, - rhs: *const ffi::c_void, - overflowed: *mut ffi::c_void, - signed_operation: i8, - mem_ptr: *mut i8, - bsks: *const *mut ffi::c_void, - ksks: *const *mut ffi::c_void, - num_blocks_in_radix: u32, - ); -} -extern "C" { - pub fn cleanup_signed_overflowing_add_or_sub( - streams: *const *mut ffi::c_void, - gpu_indexes: *const u32, - gpu_count: u32, - mem_ptr_void: *mut *mut i8, - ); -} extern "C" { pub fn scratch_cuda_integer_compute_prefix_sum_hillis_steele_64( streams: *const *mut ffi::c_void, @@ -1384,7 +1381,7 @@ extern "C" { base_log: u32, level_count: u32, num_samples: u32, - lut_count: u32, + num_many_lut: u32, lut_stride: u32, ); } @@ -1406,7 +1403,7 @@ extern "C" { base_log: u32, level_count: u32, num_samples: u32, - lut_count: u32, + num_many_lut: u32, lut_stride: u32, ); } @@ -1469,7 +1466,7 @@ extern "C" { base_log: u32, level_count: u32, num_samples: u32, - lut_count: u32, + num_many_lut: u32, lut_stride: u32, ); } diff --git a/tfhe/src/core_crypto/gpu/mod.rs b/tfhe/src/core_crypto/gpu/mod.rs index b25cde608f..8fd40719b7 100644 --- a/tfhe/src/core_crypto/gpu/mod.rs +++ b/tfhe/src/core_crypto/gpu/mod.rs @@ -111,7 +111,7 @@ pub unsafe fn programmable_bootstrap_async( level: DecompositionLevelCount, num_samples: u32, ) { - let lut_count = 1u32; + let num_many_lut = 1u32; let lut_stride = 0u32; let mut pbs_buffer: *mut i8 = std::ptr::null_mut(); scratch_cuda_programmable_bootstrap_64( @@ -141,7 +141,7 @@ pub unsafe fn programmable_bootstrap_async( base_log.0 as u32, level.0 as u32, num_samples, - lut_count, + num_many_lut, lut_stride, ); cleanup_cuda_programmable_bootstrap( @@ -175,7 +175,7 @@ pub unsafe fn programmable_bootstrap_multi_bit_async( grouping_factor: LweBskGroupingFactor, num_samples: u32, ) { - let lut_count = 1u32; + let num_many_lut = 1u32; let lut_stride = 0u32; let mut pbs_buffer: *mut i8 = std::ptr::null_mut(); scratch_cuda_multi_bit_programmable_bootstrap_64( @@ -206,7 +206,7 @@ pub unsafe fn programmable_bootstrap_multi_bit_async( base_log.0 as u32, level.0 as u32, num_samples, - lut_count, + num_many_lut, lut_stride, ); cleanup_cuda_multi_bit_programmable_bootstrap( diff --git a/tfhe/src/integer/gpu/ciphertext/info.rs b/tfhe/src/integer/gpu/ciphertext/info.rs index a2970b40fc..8770c11e6b 100644 --- a/tfhe/src/integer/gpu/ciphertext/info.rs +++ b/tfhe/src/integer/gpu/ciphertext/info.rs @@ -310,21 +310,6 @@ impl CudaRadixCiphertextInfo { .collect(), } } - pub(crate) fn after_bitnot(&self) -> Self { - Self { - blocks: self - .blocks - .iter() - .map(|left| CudaBlockInfo { - degree: Degree::new(left.message_modulus.0 - 1), - message_modulus: left.message_modulus, - carry_modulus: left.carry_modulus, - pbs_order: left.pbs_order, - noise_level: NoiseLevel::NOMINAL, - }) - .collect(), - } - } pub(crate) fn after_scalar_bitand(&self, scalar: T) -> Self where T: DecomposableInto, diff --git a/tfhe/src/integer/gpu/mod.rs b/tfhe/src/integer/gpu/mod.rs index de2038d9b4..1fead5183c 100644 --- a/tfhe/src/integer/gpu/mod.rs +++ b/tfhe/src/integer/gpu/mod.rs @@ -15,6 +15,7 @@ use crate::shortint::{CarryModulus, MessageModulus}; pub use server_key::CudaServerKey; use std::cmp::min; +use crate::integer::server_key::radix_parallel::OutputFlag; 
use tfhe_cuda_backend::bindings::*; use tfhe_cuda_backend::cuda_bind::*; @@ -1016,10 +1017,11 @@ pub unsafe fn full_propagate_assign_async( /// /// - [CudaStreams::synchronize] __must__ be called after this function as soon as synchronization /// is required -pub unsafe fn propagate_single_carry_assign_async( +pub(crate) unsafe fn propagate_single_carry_assign_async( streams: &CudaStreams, radix_lwe_input: &mut CudaVec, carry_out: &mut CudaVec, + carry_in: &CudaVec, bootstrapping_key: &CudaVec, keyswitch_key: &CudaVec, lwe_dimension: LweDimension, @@ -1034,6 +1036,8 @@ pub unsafe fn propagate_single_carry_assign_async( +pub(crate) unsafe fn add_and_propagate_single_carry_assign_async( streams: &CudaStreams, - radix_lwe_input: &mut CudaVec, + radix_lwe_lhs_input: &mut CudaVec, + radix_lwe_rhs_input: &CudaVec, carry_out: &mut CudaVec, - input_carries: &mut CudaVec, + carry_in: &CudaVec, bootstrapping_key: &CudaVec, keyswitch_key: &CudaVec, lwe_dimension: LweDimension, @@ -1118,10 +1125,17 @@ pub unsafe fn propagate_single_carry_get_input_carries_assign_async< carry_modulus: CarryModulus, pbs_type: PBSType, grouping_factor: LweBskGroupingFactor, + requested_flag: OutputFlag, + uses_carry: u32, ) { assert_eq!( streams.gpu_indexes[0], - radix_lwe_input.gpu_index(0), + radix_lwe_lhs_input.gpu_index(0), + "GPU error: all data should reside on the same GPU." + ); + assert_eq!( + streams.gpu_indexes[0], + radix_lwe_rhs_input.gpu_index(0), "GPU error: all data should reside on the same GPU." ); assert_eq!( @@ -1136,7 +1150,7 @@ pub unsafe fn propagate_single_carry_get_input_carries_assign_async< ); let mut mem_ptr: *mut i8 = std::ptr::null_mut(); let big_lwe_dimension: u32 = glwe_dimension.0 as u32 * polynomial_size.0 as u32; - scratch_cuda_propagate_single_carry_kb_64_inplace( + scratch_cuda_add_and_propagate_single_carry_kb_64_inplace( streams.ptr.as_ptr(), streams.gpu_indexes.as_ptr(), streams.len() as u32, @@ -1154,21 +1168,26 @@ pub unsafe fn propagate_single_carry_get_input_carries_assign_async< message_modulus.0 as u32, carry_modulus.0 as u32, pbs_type as u32, + requested_flag as u32, + uses_carry, true, ); - cuda_propagate_single_carry_get_input_carries_kb_64_inplace( + cuda_add_and_propagate_single_carry_kb_64_inplace( streams.ptr.as_ptr(), streams.gpu_indexes.as_ptr(), streams.len() as u32, - radix_lwe_input.as_mut_c_ptr(0), + radix_lwe_lhs_input.as_mut_c_ptr(0), + radix_lwe_rhs_input.as_c_ptr(0), carry_out.as_mut_c_ptr(0), - input_carries.as_mut_c_ptr(0), + carry_in.as_c_ptr(0), mem_ptr, bootstrapping_key.ptr.as_ptr(), keyswitch_key.ptr.as_ptr(), num_blocks, + requested_flag as u32, + uses_carry, ); - cleanup_cuda_propagate_single_carry( + cleanup_cuda_add_and_propagate_single_carry( streams.ptr.as_ptr(), streams.gpu_indexes.as_ptr(), streams.len() as u32, @@ -2145,108 +2164,6 @@ pub unsafe fn unchecked_partial_sum_ciphertexts_integer_radix_kb_assign_async< ); } -#[allow(clippy::too_many_arguments)] -/// # Safety -/// -/// - [CudaStreams::synchronize] __must__ be called after this function as soon as synchronization -/// is required -pub unsafe fn unchecked_unsigned_overflowing_sub_integer_radix_kb_assign_async< - T: UnsignedInteger, - B: Numeric, ->( - streams: &CudaStreams, - ct_res: &mut CudaVec, - ct_overflowed: &mut CudaVec, - lhs: &CudaVec, - rhs: &CudaVec, - bootstrapping_key: &CudaVec, - keyswitch_key: &CudaVec, - message_modulus: MessageModulus, - carry_modulus: CarryModulus, - glwe_dimension: GlweDimension, - polynomial_size: PolynomialSize, - big_lwe_dimension: LweDimension, - 
small_lwe_dimension: LweDimension, - ks_level: DecompositionLevelCount, - ks_base_log: DecompositionBaseLog, - pbs_level: DecompositionLevelCount, - pbs_base_log: DecompositionBaseLog, - num_blocks: u32, - pbs_type: PBSType, - grouping_factor: LweBskGroupingFactor, -) { - assert_eq!( - streams.gpu_indexes[0], - ct_res.gpu_index(0), - "GPU error: all data should reside on the same GPU." - ); - assert_eq!( - streams.gpu_indexes[0], - ct_overflowed.gpu_index(0), - "GPU error: all data should reside on the same GPU." - ); - assert_eq!( - streams.gpu_indexes[0], - lhs.gpu_index(0), - "GPU error: all data should reside on the same GPU." - ); - assert_eq!( - streams.gpu_indexes[0], - rhs.gpu_index(0), - "GPU error: all data should reside on the same GPU." - ); - assert_eq!( - streams.gpu_indexes[0], - bootstrapping_key.gpu_index(0), - "GPU error: all data should reside on the same GPU." - ); - assert_eq!( - streams.gpu_indexes[0], - keyswitch_key.gpu_index(0), - "GPU error: all data should reside on the same GPU." - ); - let mut mem_ptr: *mut i8 = std::ptr::null_mut(); - scratch_cuda_integer_radix_overflowing_sub_kb_64( - streams.ptr.as_ptr(), - streams.gpu_indexes.as_ptr(), - streams.len() as u32, - std::ptr::addr_of_mut!(mem_ptr), - glwe_dimension.0 as u32, - polynomial_size.0 as u32, - big_lwe_dimension.0 as u32, - small_lwe_dimension.0 as u32, - ks_level.0 as u32, - ks_base_log.0 as u32, - pbs_level.0 as u32, - pbs_base_log.0 as u32, - grouping_factor.0 as u32, - num_blocks, - message_modulus.0 as u32, - carry_modulus.0 as u32, - pbs_type as u32, - true, - ); - cuda_integer_radix_overflowing_sub_kb_64( - streams.ptr.as_ptr(), - streams.gpu_indexes.as_ptr(), - streams.len() as u32, - ct_res.as_mut_c_ptr(0), - ct_overflowed.as_mut_c_ptr(0), - lhs.as_c_ptr(0), - rhs.as_c_ptr(0), - mem_ptr, - bootstrapping_key.ptr.as_ptr(), - keyswitch_key.ptr.as_ptr(), - num_blocks, - ); - cleanup_cuda_integer_radix_overflowing_sub( - streams.ptr.as_ptr(), - streams.gpu_indexes.as_ptr(), - streams.len() as u32, - std::ptr::addr_of_mut!(mem_ptr), - ); -} - #[allow(clippy::too_many_arguments)] /// # Safety /// @@ -2356,7 +2273,7 @@ pub unsafe fn apply_many_univariate_lut_kb_async carry_modulus: CarryModulus, pbs_type: PBSType, grouping_factor: LweBskGroupingFactor, - lut_count: u32, + num_many_lut: u32, lut_stride: u32, ) { assert_eq!( @@ -2410,7 +2327,7 @@ pub unsafe fn apply_many_univariate_lut_kb_async keyswitch_key.ptr.as_ptr(), bootstrapping_key.ptr.as_ptr(), num_blocks, - lut_count, + num_many_lut, lut_stride, ); cleanup_cuda_apply_univariate_lut_kb_64( @@ -2592,67 +2509,83 @@ pub unsafe fn unchecked_div_rem_integer_radix_kb_assign_async( +pub unsafe fn compute_prefix_sum_hillis_steele_async( streams: &CudaStreams, - lhs: &mut CudaVec, - rhs: &CudaVec, - overflowed: &mut CudaVec, - signed_operation: i8, + radix_lwe_output: &mut CudaSliceMut, + generates_or_propagates: &mut CudaSliceMut, + input_lut: &[T], bootstrapping_key: &CudaVec, keyswitch_key: &CudaVec, - message_modulus: MessageModulus, - carry_modulus: CarryModulus, + lwe_dimension: LweDimension, glwe_dimension: GlweDimension, polynomial_size: PolynomialSize, - big_lwe_dimension: LweDimension, - small_lwe_dimension: LweDimension, ks_level: DecompositionLevelCount, ks_base_log: DecompositionBaseLog, pbs_level: DecompositionLevelCount, pbs_base_log: DecompositionBaseLog, num_blocks: u32, + message_modulus: MessageModulus, + carry_modulus: CarryModulus, pbs_type: PBSType, grouping_factor: LweBskGroupingFactor, + shift: u32, ) { + assert_eq!( + 
streams.gpu_indexes[0], + generates_or_propagates.gpu_index(0), + "GPU error: all data should reside on the same GPU." + ); + assert_eq!( + streams.gpu_indexes[0], + radix_lwe_output.gpu_index(0), + "GPU error: all data should reside on the same GPU." + ); + assert_eq!( + streams.gpu_indexes[0], + bootstrapping_key.gpu_index(0), + "GPU error: all data should reside on the same GPU." + ); + assert_eq!( + streams.gpu_indexes[0], + keyswitch_key.gpu_index(0), + "GPU error: all data should reside on the same GPU." + ); let mut mem_ptr: *mut i8 = std::ptr::null_mut(); - scratch_cuda_signed_overflowing_add_or_sub_radix_ciphertext_kb_64( + scratch_cuda_integer_compute_prefix_sum_hillis_steele_64( streams.ptr.as_ptr(), streams.gpu_indexes.as_ptr(), streams.len() as u32, std::ptr::addr_of_mut!(mem_ptr), + input_lut.as_ptr().cast(), + lwe_dimension.0 as u32, glwe_dimension.0 as u32, polynomial_size.0 as u32, - big_lwe_dimension.0 as u32, - small_lwe_dimension.0 as u32, ks_level.0 as u32, ks_base_log.0 as u32, pbs_level.0 as u32, pbs_base_log.0 as u32, grouping_factor.0 as u32, num_blocks, - signed_operation, message_modulus.0 as u32, carry_modulus.0 as u32, pbs_type as u32, true, ); - cuda_signed_overflowing_add_or_sub_radix_ciphertext_kb_64( + + cuda_integer_compute_prefix_sum_hillis_steele_64( streams.ptr.as_ptr(), streams.gpu_indexes.as_ptr(), streams.len() as u32, - lhs.as_mut_c_ptr(0), - rhs.as_c_ptr(0), - overflowed.as_mut_c_ptr(0), - signed_operation, + radix_lwe_output.as_mut_c_ptr(0), + generates_or_propagates.as_mut_c_ptr(0), mem_ptr, - bootstrapping_key.ptr.as_ptr(), keyswitch_key.ptr.as_ptr(), + bootstrapping_key.ptr.as_ptr(), num_blocks, + shift, ); - cleanup_signed_overflowing_add_or_sub( + + cleanup_cuda_integer_compute_prefix_sum_hillis_steele_64( streams.ptr.as_ptr(), streams.gpu_indexes.as_ptr(), streams.len() as u32, @@ -2665,11 +2598,43 @@ pub unsafe fn unchecked_signed_overflowing_add_or_sub_radix_kb_assign_async< /// /// - [CudaStreams::synchronize] __must__ be called after this function as soon as synchronization /// is required -pub unsafe fn compute_prefix_sum_hillis_steele_async( +pub unsafe fn reverse_blocks_inplace_async( streams: &CudaStreams, radix_lwe_output: &mut CudaSliceMut, - generates_or_propagates: &mut CudaSliceMut, - input_lut: &[T], + num_blocks: u32, + lwe_size: u32, +) { + assert_eq!( + streams.gpu_indexes[0], + radix_lwe_output.gpu_index(0), + "GPU error: all data should reside on the same GPU." + ); + if num_blocks > 1 { + cuda_integer_reverse_blocks_64_inplace( + streams.ptr.as_ptr(), + streams.gpu_indexes.as_ptr(), + streams.len() as u32, + radix_lwe_output.as_mut_c_ptr(0), + num_blocks, + lwe_size, + ); + } +} + +#[allow(clippy::too_many_arguments)] +/// # Safety +/// +/// - [CudaStreams::synchronize] __must__ be called after this function as soon as synchronization +/// is required +pub(crate) unsafe fn unchecked_unsigned_overflowing_sub_integer_radix_kb_assign_async< + T: UnsignedInteger, + B: Numeric, +>( + streams: &CudaStreams, + radix_lwe_input: &mut CudaVec, + radix_rhs_input: &CudaVec, + carry_out: &mut CudaVec, + carry_in: &CudaVec, bootstrapping_key: &CudaVec, keyswitch_key: &CudaVec, lwe_dimension: LweDimension, @@ -2684,16 +2649,12 @@ pub unsafe fn compute_prefix_sum_hillis_steele_async( - streams: &CudaStreams, - radix_lwe_output: &mut CudaSliceMut, - num_blocks: u32, - lwe_size: u32, -) { - assert_eq!( - streams.gpu_indexes[0], - radix_lwe_output.gpu_index(0), - "GPU error: all data should reside on the same GPU." 
- ); - if num_blocks > 1 { - cuda_integer_reverse_blocks_64_inplace( - streams.ptr.as_ptr(), - streams.gpu_indexes.as_ptr(), - streams.len() as u32, - radix_lwe_output.as_mut_c_ptr(0), - num_blocks, - lwe_size, - ); - } -} - #[allow(clippy::too_many_arguments)] /// # Safety /// diff --git a/tfhe/src/integer/gpu/server_key/radix/add.rs b/tfhe/src/integer/gpu/server_key/radix/add.rs index 3d746230a6..797de9836f 100644 --- a/tfhe/src/integer/gpu/server_key/radix/add.rs +++ b/tfhe/src/integer/gpu/server_key/radix/add.rs @@ -8,17 +8,11 @@ use crate::integer::gpu::ciphertext::{ use crate::integer::gpu::server_key::{CudaBootstrappingKey, CudaServerKey}; use crate::integer::gpu::{ unchecked_add_integer_radix_assign_async, - unchecked_partial_sum_ciphertexts_integer_radix_kb_assign_async, - unchecked_signed_overflowing_add_or_sub_radix_kb_assign_async, PBSType, + unchecked_partial_sum_ciphertexts_integer_radix_kb_assign_async, PBSType, }; +use crate::integer::server_key::radix_parallel::OutputFlag; use crate::shortint::ciphertext::NoiseLevel; -#[derive(Copy, Clone, PartialEq, Eq)] -pub(crate) enum SignedOperation { - Addition, - Subtraction, -} - impl CudaServerKey { /// Computes homomorphically an addition between two ciphertexts encrypting integer values. /// @@ -114,8 +108,14 @@ impl CudaServerKey { (ct_left, &tmp_rhs) } }; - self.unchecked_add_assign_async(lhs, rhs, streams); - let _carry = self.propagate_single_carry_assign_async(lhs, streams); + + let _carry = self.add_and_propagate_single_carry_assign_async( + lhs, + rhs, + streams, + None, + OutputFlag::None, + ); } pub fn add_assign( @@ -348,7 +348,7 @@ impl CudaServerKey { .unchecked_partial_sum_ciphertexts_async(ciphertexts, streams) .unwrap(); - self.propagate_single_carry_assign_async(&mut result, streams); + self.propagate_single_carry_assign_async(&mut result, streams, None, OutputFlag::None); assert!(result.block_carries_are_empty()); result } @@ -535,8 +535,58 @@ impl CudaServerKey { rhs: &CudaUnsignedRadixCiphertext, stream: &CudaStreams, ) -> (CudaUnsignedRadixCiphertext, CudaBooleanBlock) { - let mut ct_res = self.unchecked_add(lhs, rhs, stream); - let mut carry_out = self.propagate_single_carry_assign_async(&mut ct_res, stream); + let output_flag = OutputFlag::from_signedness(CudaUnsignedRadixCiphertext::IS_SIGNED); + + let mut ct_res = lhs.duplicate_async(stream); + let mut carry_out: CudaUnsignedRadixCiphertext = self + .add_and_propagate_single_carry_assign_async( + &mut ct_res, + rhs, + stream, + None, + output_flag, + ); + + ct_res.as_mut().info = ct_res + .as_ref() + .info + .after_overflowing_add(&rhs.as_ref().info); + + if lhs.as_ref().info.blocks.last().unwrap().noise_level == NoiseLevel::ZERO + && rhs.as_ref().info.blocks.last().unwrap().noise_level == NoiseLevel::ZERO + { + carry_out.as_mut().info = carry_out.as_ref().info.boolean_info(NoiseLevel::ZERO); + } else { + carry_out.as_mut().info = carry_out.as_ref().info.boolean_info(NoiseLevel::NOMINAL); + } + + let ct_overflowed = CudaBooleanBlock::from_cuda_radix_ciphertext(carry_out.ciphertext); + + (ct_res, ct_overflowed) + } + + /// # Safety + /// + /// - `stream` __must__ be synchronized to guarantee computation has finished, and inputs must + /// not be dropped until stream is synchronised + pub unsafe fn unchecked_signed_overflowing_add_async( + &self, + lhs: &CudaSignedRadixCiphertext, + rhs: &CudaSignedRadixCiphertext, + input_carry: Option<&CudaBooleanBlock>, + stream: &CudaStreams, + ) -> (CudaSignedRadixCiphertext, CudaBooleanBlock) { + let output_flag = 
OutputFlag::from_signedness(CudaSignedRadixCiphertext::IS_SIGNED); + + let mut ct_res = lhs.duplicate_async(stream); + let mut carry_out: CudaSignedRadixCiphertext = self + .add_and_propagate_single_carry_assign_async( + &mut ct_res, + rhs, + stream, + input_carry, + output_flag, + ); ct_res.as_mut().info = ct_res .as_ref() @@ -655,141 +705,13 @@ impl CudaServerKey { "inputs cannot be empty" ); - self.unchecked_signed_overflowing_add_or_sub( - ct_left, - ct_right, - SignedOperation::Addition, - stream, - ) - } - - pub(crate) fn unchecked_signed_overflowing_add_or_sub( - &self, - lhs: &CudaSignedRadixCiphertext, - rhs: &CudaSignedRadixCiphertext, - signed_operation: SignedOperation, - streams: &CudaStreams, - ) -> (CudaSignedRadixCiphertext, CudaBooleanBlock) { - assert!(self.message_modulus.0 >= 4 && self.carry_modulus.0 >= 4); - - let mut result: CudaSignedRadixCiphertext; + let result; + let overflowed; unsafe { - result = lhs.duplicate_async(streams); - } - let carry_out: CudaSignedRadixCiphertext = - unsafe { self.create_trivial_zero_radix_async(1, streams) }; - let mut overflowed = CudaBooleanBlock::from_cuda_radix_ciphertext(carry_out.ciphertext); - - unsafe { - self.unchecked_signed_overflowing_add_or_sub_assign_async( - &mut result, - rhs, - &mut overflowed, - signed_operation, - streams, - ); - } - streams.synchronize(); - - (result, overflowed) - } - - /// # Safety - /// - /// - `stream` __must__ be synchronized to guarantee computation has finished, and inputs must - /// not be dropped until stream is synchronized - pub(crate) unsafe fn unchecked_signed_overflowing_add_or_sub_assign_async( - &self, - lhs: &mut CudaSignedRadixCiphertext, - rhs: &CudaSignedRadixCiphertext, - overflowed: &mut CudaBooleanBlock, - signed_operation: SignedOperation, - streams: &CudaStreams, - ) { - if lhs.as_ref().info.blocks.last().unwrap().noise_level == NoiseLevel::ZERO - && rhs.as_ref().info.blocks.last().unwrap().noise_level == NoiseLevel::ZERO - { - overflowed.as_mut().ciphertext.info = overflowed - .as_ref() - .ciphertext - .info - .boolean_info(NoiseLevel::ZERO); - } else { - overflowed.as_mut().ciphertext.info = overflowed - .as_ref() - .ciphertext - .info - .boolean_info(NoiseLevel::NOMINAL); - } - let num_blocks = lhs.as_ref().d_blocks.lwe_ciphertext_count().0 as u32; - let signed_operation_numeric: i8 = - if matches!(signed_operation, SignedOperation::Subtraction) { - -1 - } else { - 1 - }; - match &self.bootstrapping_key { - CudaBootstrappingKey::Classic(d_bsk) => { - unchecked_signed_overflowing_add_or_sub_radix_kb_assign_async( - streams, - &mut lhs.as_mut().d_blocks.0.d_vec, - &rhs.as_ref().d_blocks.0.d_vec, - &mut overflowed.as_mut().ciphertext.d_blocks.0.d_vec, - signed_operation_numeric, - &d_bsk.d_vec, - &self.key_switching_key.d_vec, - self.message_modulus, - self.carry_modulus, - d_bsk.glwe_dimension, - d_bsk.polynomial_size, - self.key_switching_key - .input_key_lwe_size() - .to_lwe_dimension(), - self.key_switching_key - .output_key_lwe_size() - .to_lwe_dimension(), - self.key_switching_key.decomposition_level_count(), - self.key_switching_key.decomposition_base_log(), - d_bsk.decomp_level_count, - d_bsk.decomp_base_log, - num_blocks, - PBSType::Classical, - LweBskGroupingFactor(0), - ); - } - CudaBootstrappingKey::MultiBit(d_multibit_bsk) => { - unchecked_signed_overflowing_add_or_sub_radix_kb_assign_async( - streams, - &mut lhs.as_mut().d_blocks.0.d_vec, - &rhs.as_ref().d_blocks.0.d_vec, - &mut overflowed.as_mut().ciphertext.d_blocks.0.d_vec, - signed_operation_numeric, - 
&d_multibit_bsk.d_vec, - &self.key_switching_key.d_vec, - self.message_modulus, - self.carry_modulus, - d_multibit_bsk.glwe_dimension, - d_multibit_bsk.polynomial_size, - self.key_switching_key - .input_key_lwe_size() - .to_lwe_dimension(), - self.key_switching_key - .output_key_lwe_size() - .to_lwe_dimension(), - self.key_switching_key.decomposition_level_count(), - self.key_switching_key.decomposition_base_log(), - d_multibit_bsk.decomp_level_count, - d_multibit_bsk.decomp_base_log, - num_blocks, - PBSType::MultiBit, - d_multibit_bsk.grouping_factor, - ); - } + (result, overflowed) = + self.unchecked_signed_overflowing_add_async(ct_left, ct_right, None, stream); }; - - lhs.as_mut().info = lhs - .as_ref() - .info - .after_overflowing_add(&rhs.ciphertext.info); + stream.synchronize(); + (result, overflowed) } } diff --git a/tfhe/src/integer/gpu/server_key/radix/bitwise_op.rs b/tfhe/src/integer/gpu/server_key/radix/bitwise_op.rs index 43e82bcd79..71006cb289 100644 --- a/tfhe/src/integer/gpu/server_key/radix/bitwise_op.rs +++ b/tfhe/src/integer/gpu/server_key/radix/bitwise_op.rs @@ -90,7 +90,6 @@ impl CudaServerKey { &d_decomposed_scalar, streams, ); - ct.as_mut().info = ct.as_ref().info.after_bitnot(); } pub fn unchecked_bitnot_assign( diff --git a/tfhe/src/integer/gpu/server_key/radix/mod.rs b/tfhe/src/integer/gpu/server_key/radix/mod.rs index f7571bf7d0..08e15a1318 100644 --- a/tfhe/src/integer/gpu/server_key/radix/mod.rs +++ b/tfhe/src/integer/gpu/server_key/radix/mod.rs @@ -6,6 +6,7 @@ use crate::core_crypto::prelude::{ ContiguousEntityContainerMut, LweBskGroupingFactor, LweCiphertextCount, }; use crate::integer::block_decomposition::{BlockDecomposer, DecomposableInto}; +use crate::integer::gpu::ciphertext::boolean_value::CudaBooleanBlock; use crate::integer::gpu::ciphertext::info::{CudaBlockInfo, CudaRadixCiphertextInfo}; use crate::integer::gpu::ciphertext::{ CudaIntegerRadixCiphertext, CudaRadixCiphertext, CudaSignedRadixCiphertext, @@ -13,10 +14,11 @@ use crate::integer::gpu::ciphertext::{ }; use crate::integer::gpu::server_key::CudaBootstrappingKey; use crate::integer::gpu::{ - apply_many_univariate_lut_kb_async, apply_univariate_lut_kb_async, full_propagate_assign_async, - propagate_single_carry_assign_async, propagate_single_carry_get_input_carries_assign_async, - CudaServerKey, PBSType, + add_and_propagate_single_carry_assign_async, apply_many_univariate_lut_kb_async, + apply_univariate_lut_kb_async, full_propagate_assign_async, + propagate_single_carry_assign_async, CudaServerKey, PBSType, }; +use crate::integer::server_key::radix_parallel::OutputFlag; use crate::shortint::ciphertext::{Degree, NoiseLevel}; use crate::shortint::engine::{fill_accumulator, fill_many_lut_accumulator}; use crate::shortint::server_key::{ @@ -203,6 +205,8 @@ impl CudaServerKey { &self, ct: &mut T, streams: &CudaStreams, + input_carry: Option<&CudaBooleanBlock>, + requested_flag: OutputFlag, ) -> T where T: CudaIntegerRadixCiphertext, @@ -210,12 +214,20 @@ impl CudaServerKey { let mut carry_out: T = self.create_trivial_zero_radix(1, streams); let ciphertext = ct.as_mut(); let num_blocks = ciphertext.d_blocks.lwe_ciphertext_count().0 as u32; + let uses_carry = input_carry.map_or(0u32, |_block| 1u32); + let mut aux_block: T = self.create_trivial_zero_radix(1, streams); + let in_carry_dvec = input_carry.map_or_else( + || &aux_block.as_mut().d_blocks.0.d_vec, + |block| &block.0.ciphertext.d_blocks.0.d_vec, + ); + match &self.bootstrapping_key { CudaBootstrappingKey::Classic(d_bsk) => { 
propagate_single_carry_assign_async( streams, &mut ciphertext.d_blocks.0.d_vec, &mut carry_out.as_mut().d_blocks.0.d_vec, + in_carry_dvec, &d_bsk.d_vec, &self.key_switching_key.d_vec, d_bsk.input_lwe_dimension(), @@ -230,6 +242,8 @@ impl CudaServerKey { ciphertext.info.blocks.first().unwrap().carry_modulus, PBSType::Classical, LweBskGroupingFactor(0), + requested_flag, + uses_carry, ); } CudaBootstrappingKey::MultiBit(d_multibit_bsk) => { @@ -237,6 +251,7 @@ impl CudaServerKey { streams, &mut ciphertext.d_blocks.0.d_vec, &mut carry_out.as_mut().d_blocks.0.d_vec, + in_carry_dvec, &d_multibit_bsk.d_vec, &self.key_switching_key.d_vec, d_multibit_bsk.input_lwe_dimension(), @@ -251,6 +266,8 @@ impl CudaServerKey { ciphertext.info.blocks.first().unwrap().carry_modulus, PBSType::MultiBit, d_multibit_bsk.grouping_factor, + requested_flag, + uses_carry, ); } }; @@ -269,26 +286,35 @@ impl CudaServerKey { /// /// - `streams` __must__ be synchronized to guarantee computation has finished, and inputs must /// not be dropped until streams is synchronized - #[allow(dead_code)] - pub(crate) unsafe fn propagate_single_carry_get_input_carries_assign_async( + pub(crate) unsafe fn add_and_propagate_single_carry_assign_async( &self, - ct: &mut T, - input_carries: &mut T, + lhs: &mut T, + rhs: &T, streams: &CudaStreams, + input_carry: Option<&CudaBooleanBlock>, + requested_flag: OutputFlag, ) -> T where T: CudaIntegerRadixCiphertext, { let mut carry_out: T = self.create_trivial_zero_radix(1, streams); - let ciphertext = ct.as_mut(); - let num_blocks = ciphertext.d_blocks.lwe_ciphertext_count().0 as u32; + + let num_blocks = lhs.as_mut().d_blocks.lwe_ciphertext_count().0 as u32; + let uses_carry = input_carry.map_or(0u32, |_block| 1u32); + let mut aux_block: T = self.create_trivial_zero_radix(1, streams); + let in_carry_dvec = input_carry.map_or_else( + || &aux_block.as_mut().d_blocks.0.d_vec, + |block| &block.0.ciphertext.d_blocks.0.d_vec, + ); + match &self.bootstrapping_key { CudaBootstrappingKey::Classic(d_bsk) => { - propagate_single_carry_get_input_carries_assign_async( + add_and_propagate_single_carry_assign_async( streams, - &mut ciphertext.d_blocks.0.d_vec, + &mut lhs.as_mut().d_blocks.0.d_vec, + &rhs.as_ref().d_blocks.0.d_vec, &mut carry_out.as_mut().d_blocks.0.d_vec, - &mut input_carries.as_mut().d_blocks.0.d_vec, + in_carry_dvec, &d_bsk.d_vec, &self.key_switching_key.d_vec, d_bsk.input_lwe_dimension(), @@ -299,18 +325,21 @@ impl CudaServerKey { d_bsk.decomp_level_count(), d_bsk.decomp_base_log(), num_blocks, - ciphertext.info.blocks.first().unwrap().message_modulus, - ciphertext.info.blocks.first().unwrap().carry_modulus, + self.message_modulus, + self.carry_modulus, PBSType::Classical, LweBskGroupingFactor(0), + requested_flag, + uses_carry, ); } CudaBootstrappingKey::MultiBit(d_multibit_bsk) => { - propagate_single_carry_get_input_carries_assign_async( + add_and_propagate_single_carry_assign_async( streams, - &mut ciphertext.d_blocks.0.d_vec, + &mut lhs.as_mut().d_blocks.0.d_vec, + &rhs.as_ref().d_blocks.0.d_vec, &mut carry_out.as_mut().d_blocks.0.d_vec, - &mut input_carries.as_mut().d_blocks.0.d_vec, + in_carry_dvec, &d_multibit_bsk.d_vec, &self.key_switching_key.d_vec, d_multibit_bsk.input_lwe_dimension(), @@ -321,14 +350,16 @@ impl CudaServerKey { d_multibit_bsk.decomp_level_count(), d_multibit_bsk.decomp_base_log(), num_blocks, - ciphertext.info.blocks.first().unwrap().message_modulus, - ciphertext.info.blocks.first().unwrap().carry_modulus, + self.message_modulus, + self.carry_modulus, 
PBSType::MultiBit, d_multibit_bsk.grouping_factor, + requested_flag, + uses_carry, ); } }; - ciphertext.info.blocks.iter_mut().for_each(|b| { + lhs.as_mut().info.blocks.iter_mut().for_each(|b| { b.degree = Degree::new(b.message_modulus.0 - 1); b.noise_level = NoiseLevel::NOMINAL; }); diff --git a/tfhe/src/integer/gpu/server_key/radix/neg.rs b/tfhe/src/integer/gpu/server_key/radix/neg.rs index d7156919cf..ddae4c8620 100644 --- a/tfhe/src/integer/gpu/server_key/radix/neg.rs +++ b/tfhe/src/integer/gpu/server_key/radix/neg.rs @@ -1,6 +1,7 @@ use crate::core_crypto::gpu::{negate_integer_radix_async, CudaStreams}; use crate::integer::gpu::ciphertext::CudaIntegerRadixCiphertext; use crate::integer::gpu::server_key::CudaServerKey; +use crate::integer::server_key::radix_parallel::OutputFlag; impl CudaServerKey { /// Homomorphically computes the opposite of a ciphertext encrypting an integer message. @@ -144,7 +145,8 @@ impl CudaServerKey { }; let mut res = self.unchecked_neg_async(ct, streams); - let _carry = self.propagate_single_carry_assign_async(&mut res, streams); + let _carry = + self.propagate_single_carry_assign_async(&mut res, streams, None, OutputFlag::None); res } } diff --git a/tfhe/src/integer/gpu/server_key/radix/scalar_add.rs b/tfhe/src/integer/gpu/server_key/radix/scalar_add.rs index 7c30c789bb..68def18dec 100644 --- a/tfhe/src/integer/gpu/server_key/radix/scalar_add.rs +++ b/tfhe/src/integer/gpu/server_key/radix/scalar_add.rs @@ -8,6 +8,7 @@ use crate::integer::gpu::ciphertext::{ }; use crate::integer::gpu::scalar_addition_integer_radix_assign_async; use crate::integer::gpu::server_key::CudaServerKey; +use crate::integer::server_key::radix_parallel::OutputFlag; use crate::prelude::CastInto; use crate::shortint::ciphertext::NoiseLevel; @@ -186,7 +187,7 @@ impl CudaServerKey { }; self.unchecked_scalar_add_assign_async(ct, scalar, streams); - let _carry = self.propagate_single_carry_assign_async(ct, streams); + let _carry = self.propagate_single_carry_assign_async(ct, streams, None, OutputFlag::None); } pub fn scalar_add_assign(&self, ct: &mut T, scalar: Scalar, streams: &CudaStreams) @@ -264,7 +265,8 @@ impl CudaServerKey { self.unchecked_scalar_add_assign(ct_left, scalar, stream); let mut carry_out; unsafe { - carry_out = self.propagate_single_carry_assign_async(ct_left, stream); + carry_out = + self.propagate_single_carry_assign_async(ct_left, stream, None, OutputFlag::Carry); } stream.synchronize(); diff --git a/tfhe/src/integer/gpu/server_key/radix/scalar_sub.rs b/tfhe/src/integer/gpu/server_key/radix/scalar_sub.rs index d117927343..ff5e2801c7 100644 --- a/tfhe/src/integer/gpu/server_key/radix/scalar_sub.rs +++ b/tfhe/src/integer/gpu/server_key/radix/scalar_sub.rs @@ -4,6 +4,7 @@ use crate::integer::block_decomposition::{BlockDecomposer, DecomposableInto}; use crate::integer::gpu::ciphertext::boolean_value::CudaBooleanBlock; use crate::integer::gpu::ciphertext::{CudaIntegerRadixCiphertext, CudaSignedRadixCiphertext}; use crate::integer::gpu::server_key::CudaServerKey; +use crate::integer::server_key::radix_parallel::OutputFlag; use crate::integer::server_key::TwosComplementNegation; use crate::prelude::CastInto; @@ -151,7 +152,7 @@ impl CudaServerKey { }; self.unchecked_scalar_sub_assign_async(ct, scalar, stream); - let _carry = self.propagate_single_carry_assign_async(ct, stream); + let _carry = self.propagate_single_carry_assign_async(ct, stream, None, OutputFlag::None); } pub fn scalar_sub_assign(&self, ct: &mut T, scalar: Scalar, stream: &CudaStreams) diff --git 
a/tfhe/src/integer/gpu/server_key/radix/sub.rs b/tfhe/src/integer/gpu/server_key/radix/sub.rs index 8e784a3686..b44ae35710 100644 --- a/tfhe/src/integer/gpu/server_key/radix/sub.rs +++ b/tfhe/src/integer/gpu/server_key/radix/sub.rs @@ -1,18 +1,17 @@ -use super::add::SignedOperation; -use crate::core_crypto::gpu::lwe_ciphertext_list::CudaLweCiphertextList; use crate::core_crypto::gpu::CudaStreams; -use crate::core_crypto::prelude::{CiphertextModulus, LweBskGroupingFactor, LweCiphertextCount}; use crate::integer::gpu::ciphertext::boolean_value::CudaBooleanBlock; -use crate::integer::gpu::ciphertext::info::CudaRadixCiphertextInfo; use crate::integer::gpu::ciphertext::{ - CudaIntegerRadixCiphertext, CudaRadixCiphertext, CudaSignedRadixCiphertext, - CudaUnsignedRadixCiphertext, + CudaIntegerRadixCiphertext, CudaSignedRadixCiphertext, CudaUnsignedRadixCiphertext, }; -use crate::integer::gpu::server_key::{CudaBootstrappingKey, CudaServerKey}; +use crate::integer::gpu::server_key::CudaServerKey; + +use crate::integer::gpu::server_key::CudaBootstrappingKey; use crate::integer::gpu::{ unchecked_unsigned_overflowing_sub_integer_radix_kb_assign_async, PBSType, }; +use crate::integer::server_key::radix_parallel::OutputFlag; use crate::shortint::ciphertext::NoiseLevel; +use crate::shortint::parameters::{Degree, LweBskGroupingFactor}; impl CudaServerKey { /// Computes homomorphically a subtraction between two ciphertexts encrypting integer values. @@ -271,8 +270,14 @@ impl CudaServerKey { } }; - self.unchecked_sub_assign_async(lhs, rhs, streams); - let _carry = self.propagate_single_carry_assign_async(lhs, streams); + let neg_rhs = self.unchecked_neg_async(rhs, streams); + let _carry = self.add_and_propagate_single_carry_assign_async( + lhs, + &neg_rhs, + streams, + None, + OutputFlag::None, + ); } pub fn unsigned_overflowing_sub( @@ -353,87 +358,102 @@ impl CudaServerKey { rhs: &CudaUnsignedRadixCiphertext, stream: &CudaStreams, ) -> (CudaUnsignedRadixCiphertext, CudaBooleanBlock) { - let num_blocks = lhs.as_ref().d_blocks.lwe_ciphertext_count().0 as u32; - let mut tmp: CudaUnsignedRadixCiphertext = self.create_trivial_zero_radix(1, stream); - if lhs.as_ref().info.blocks.last().unwrap().noise_level == NoiseLevel::ZERO - && rhs.as_ref().info.blocks.last().unwrap().noise_level == NoiseLevel::ZERO - { - tmp.as_mut().info = tmp.as_ref().info.boolean_info(NoiseLevel::ZERO); - } else { - tmp.as_mut().info = tmp.as_ref().info.boolean_info(NoiseLevel::NOMINAL); - } let mut ct_res = lhs.duplicate_async(stream); - let block = CudaLweCiphertextList::new( - tmp.as_ref().d_blocks.lwe_dimension(), - LweCiphertextCount(1), - CiphertextModulus::new_native(), - stream, - ); - let block_info = tmp.as_ref().info.blocks[0]; - let ct_info = vec![block_info]; - let ct_info = CudaRadixCiphertextInfo { blocks: ct_info }; - let mut ct_overflowed = - CudaBooleanBlock::from_cuda_radix_ciphertext(CudaRadixCiphertext::new(block, ct_info)); + let compute_overflow = true; + const INPUT_BORROW: Option<&CudaBooleanBlock> = None; + + let mut overflow_block: CudaUnsignedRadixCiphertext = + self.create_trivial_zero_radix(1, stream); + let ciphertext = ct_res.as_mut(); + let num_blocks = ciphertext.d_blocks.lwe_ciphertext_count().0 as u32; + let uses_input_borrow = INPUT_BORROW.map_or(0u32, |_block| 1u32); + + let mut aux_block: CudaUnsignedRadixCiphertext = self.create_trivial_zero_radix(1, stream); + let in_carry_dvec = INPUT_BORROW.map_or_else( + || &aux_block.as_mut().d_blocks.0.d_vec, + |block| &block.0.ciphertext.d_blocks.0.d_vec, + 
); match &self.bootstrapping_key { CudaBootstrappingKey::Classic(d_bsk) => { unchecked_unsigned_overflowing_sub_integer_radix_kb_assign_async( stream, - &mut ct_res.as_mut().d_blocks.0.d_vec, - &mut ct_overflowed.as_mut().ciphertext.d_blocks.0.d_vec, - &lhs.as_ref().d_blocks.0.d_vec, + &mut ciphertext.d_blocks.0.d_vec, &rhs.as_ref().d_blocks.0.d_vec, + &mut overflow_block.as_mut().d_blocks.0.d_vec, + in_carry_dvec, &d_bsk.d_vec, &self.key_switching_key.d_vec, - self.message_modulus, - self.carry_modulus, - d_bsk.glwe_dimension, - d_bsk.polynomial_size, - self.key_switching_key - .input_key_lwe_size() - .to_lwe_dimension(), - self.key_switching_key - .output_key_lwe_size() - .to_lwe_dimension(), + d_bsk.input_lwe_dimension(), + d_bsk.glwe_dimension(), + d_bsk.polynomial_size(), self.key_switching_key.decomposition_level_count(), self.key_switching_key.decomposition_base_log(), - d_bsk.decomp_level_count, - d_bsk.decomp_base_log, + d_bsk.decomp_level_count(), + d_bsk.decomp_base_log(), num_blocks, + ciphertext.info.blocks.first().unwrap().message_modulus, + ciphertext.info.blocks.first().unwrap().carry_modulus, PBSType::Classical, LweBskGroupingFactor(0), + compute_overflow, + uses_input_borrow, ); } CudaBootstrappingKey::MultiBit(d_multibit_bsk) => { unchecked_unsigned_overflowing_sub_integer_radix_kb_assign_async( stream, - &mut ct_res.as_mut().d_blocks.0.d_vec, - &mut ct_overflowed.as_mut().ciphertext.d_blocks.0.d_vec, - &lhs.as_ref().d_blocks.0.d_vec, + &mut ciphertext.d_blocks.0.d_vec, &rhs.as_ref().d_blocks.0.d_vec, + &mut overflow_block.as_mut().d_blocks.0.d_vec, + in_carry_dvec, &d_multibit_bsk.d_vec, &self.key_switching_key.d_vec, - self.message_modulus, - self.carry_modulus, - d_multibit_bsk.glwe_dimension, - d_multibit_bsk.polynomial_size, - self.key_switching_key - .input_key_lwe_size() - .to_lwe_dimension(), - self.key_switching_key - .output_key_lwe_size() - .to_lwe_dimension(), + d_multibit_bsk.input_lwe_dimension(), + d_multibit_bsk.glwe_dimension(), + d_multibit_bsk.polynomial_size(), self.key_switching_key.decomposition_level_count(), self.key_switching_key.decomposition_base_log(), - d_multibit_bsk.decomp_level_count, - d_multibit_bsk.decomp_base_log, + d_multibit_bsk.decomp_level_count(), + d_multibit_bsk.decomp_base_log(), num_blocks, + ciphertext.info.blocks.first().unwrap().message_modulus, + ciphertext.info.blocks.first().unwrap().carry_modulus, PBSType::MultiBit, d_multibit_bsk.grouping_factor, + compute_overflow, + uses_input_borrow, ); } }; + ciphertext.info.blocks.iter_mut().for_each(|b| { + b.degree = Degree::new(b.message_modulus.0 - 1); + b.noise_level = NoiseLevel::NOMINAL; + }); + overflow_block + .as_mut() + .info + .blocks + .iter_mut() + .for_each(|b| { + b.degree = Degree::new(1); + b.noise_level = NoiseLevel::ZERO; + }); + + if lhs.as_ref().info.blocks.last().unwrap().noise_level == NoiseLevel::ZERO + && rhs.as_ref().info.blocks.last().unwrap().noise_level == NoiseLevel::ZERO + { + overflow_block.as_mut().info = + overflow_block.as_ref().info.boolean_info(NoiseLevel::ZERO); + } else { + overflow_block.as_mut().info = overflow_block + .as_ref() + .info + .boolean_info(NoiseLevel::NOMINAL); + } + + let ct_overflowed = CudaBooleanBlock::from_cuda_radix_ciphertext(overflow_block.ciphertext); ct_res.as_mut().info = ct_res .as_ref() @@ -541,11 +561,34 @@ impl CudaServerKey { ct_left.as_ref().d_blocks.lwe_ciphertext_count().0 > 0, "inputs cannot be empty" ); + let result; + let overflowed; + unsafe { + (result, overflowed) = + 
+                self.unchecked_signed_overflowing_sub_async(ct_left, ct_right, stream);
+        };
+        stream.synchronize();
+        (result, overflowed)
+    }
+    /// # Safety
+    ///
+    /// - `stream` __must__ be synchronized to guarantee computation has finished, and inputs must
+    ///   not be dropped until stream is synchronized
+    pub unsafe fn unchecked_signed_overflowing_sub_async(
+        &self,
+        ct_left: &CudaSignedRadixCiphertext,
+        ct_right: &CudaSignedRadixCiphertext,
+        stream: &CudaStreams,
+    ) -> (CudaSignedRadixCiphertext, CudaBooleanBlock) {
+        let flipped_rhs = self.bitnot(ct_right, stream);
+        let ct_input_carry: CudaUnsignedRadixCiphertext =
+            self.create_trivial_radix_async(1, 1, stream);
+        let input_carry = CudaBooleanBlock::from_cuda_radix_ciphertext(ct_input_carry.ciphertext);
-        self.unchecked_signed_overflowing_add_or_sub(
+        self.unchecked_signed_overflowing_add_async(
             ct_left,
-            ct_right,
-            SignedOperation::Subtraction,
+            &flipped_rhs,
+            Some(&input_carry),
             stream,
         )
     }
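
Note on the carry/borrow plumbing introduced above: the sketch below is a minimal clear-value analogue (plain Rust, no GPU types; the helper names, the `u8` block layout and `main` are illustrative assumptions, not tfhe-rs API) of what the unified entry points compute. `unchecked_signed_overflowing_sub_async` rewrites `lhs - rhs` as `lhs + bitnot(rhs)` with a trivial input carry of 1, and the add-and-propagate path then ripples a single carry across the radix blocks, optionally returning it as a carry/overflow block depending on `requested_flag` (`FLAG_NONE` / `FLAG_OVERFLOW` / `FLAG_CARRY`).

// Clear-value sketch: subtraction as addition of the block-wise complement
// with an input carry of 1, plus single-carry propagation across blocks.
// Blocks are little-endian and hold values < message_modulus, as in a clean
// radix ciphertext. Everything here is illustrative, not tfhe-rs API.
fn add_and_propagate_single_carry(
    lhs: &mut [u8],        // radix blocks, least significant first
    rhs: &[u8],
    mut carry: u8,         // the optional input carry (0 or 1), cf. `uses_carry`
    message_modulus: u8,
) -> u8 {
    for (l, r) in lhs.iter_mut().zip(rhs) {
        let sum = *l + *r + carry;     // stays within a block's carry space
        *l = sum % message_modulus;    // message part stays in the block
        carry = sum / message_modulus; // single carry moves to the next block
    }
    carry // what a FLAG_CARRY-style output block would encrypt
}

fn overflowing_sub(lhs: &mut [u8], rhs: &[u8], message_modulus: u8) -> bool {
    // lhs - rhs == lhs + !rhs + 1 on the radix representation
    let flipped: Vec<u8> = rhs
        .iter()
        .map(|r| (message_modulus - 1) - *r) // block-wise "bitnot"
        .collect();
    let carry_out = add_and_propagate_single_carry(lhs, &flipped, 1, message_modulus);
    carry_out == 0 // no carry out of the last block <=> the subtraction borrowed
}

fn main() {
    let message_modulus = 4u8;        // 2-bit message blocks
    let mut lhs = vec![1u8, 0, 0, 0]; // encodes 1 over four blocks (8 bits total)
    let rhs = vec![2u8, 0, 0, 0];     // encodes 2
    let overflowed = overflowing_sub(&mut lhs, &rhs, message_modulus);
    assert!(overflowed);                  // 1 - 2 underflows for an unsigned radix value
    assert_eq!(lhs, vec![3u8, 3, 3, 3]);  // wrapped result: 255 = (1 - 2) mod 256
}

The flag computed here is the unsigned borrow, i.e. the FLAG_CARRY-style output used by the unsigned overflowing-sub path; the signed paths pick their flag via `OutputFlag::from_signedness`, which for a signed ciphertext presumably maps to the overflow flag (in two's complement, signed overflow is detected from the carries around the most significant block rather than from the raw carry out). Folding the addition into the carry-propagation kernel is also what lets the `add_assign` and `sub_assign` paths above issue a single `add_and_propagate_single_carry_assign_async` call (for subtraction, on the negated rhs) instead of an unchecked add followed by `propagate_single_carry_assign_async`.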