From 6fabe6bab0d94f7adf19c9147b269774fe0d8cf8 Mon Sep 17 00:00:00 2001 From: Agnes Leroy Date: Tue, 10 Sep 2024 11:16:15 +0200 Subject: [PATCH] chore(gpu): fix templates and refactor radix negation --- .../tfhe-cuda-backend/cuda/include/integer.h | 9 +- .../cuda/src/crypto/keyswitch.cu | 6 +- backends/tfhe-cuda-backend/cuda/src/device.cu | 4 +- .../cuda/src/integer/addition.cuh | 32 +- .../cuda/src/integer/cmux.cu | 2 +- .../cuda/src/integer/cmux.cuh | 29 +- .../cuda/src/integer/comparison.cuh | 111 ++++--- .../src/integer/compression/compression.cu | 4 +- .../src/integer/compression/compression.cuh | 42 +-- .../cuda/src/integer/div_rem.cuh | 50 +-- .../cuda/src/integer/integer.cu | 2 +- .../cuda/src/integer/integer.cuh | 86 ++--- .../cuda/src/integer/multiplication.cuh | 9 +- .../cuda/src/integer/negation.cu | 16 +- .../cuda/src/integer/negation.cuh | 13 +- .../cuda/src/integer/scalar_addition.cu | 2 +- .../cuda/src/integer/scalar_addition.cuh | 20 +- .../cuda/src/integer/scalar_comparison.cuh | 304 +++++++++--------- .../cuda/src/integer/scalar_mul.cuh | 9 +- .../cuda/src/integer/scalar_rotate.cuh | 23 +- .../cuda/src/integer/scalar_shifts.cuh | 22 +- .../cuda/src/integer/shift_and_rotate.cuh | 47 +-- .../cuda/src/linearalgebra/addition.cu | 42 +-- .../cuda/src/linearalgebra/addition.cuh | 12 +- .../cuda/src/linearalgebra/multiplication.cu | 4 +- .../cuda/src/linearalgebra/multiplication.cuh | 4 +- .../cuda/src/linearalgebra/negation.cu | 16 +- .../cuda/src/linearalgebra/negation.cuh | 2 +- backends/tfhe-cuda-backend/src/cuda_bind.rs | 5 +- tfhe/src/core_crypto/gpu/mod.rs | 10 +- tfhe/src/integer/gpu/server_key/radix/neg.rs | 77 ++--- 31 files changed, 503 insertions(+), 511 deletions(-) diff --git a/backends/tfhe-cuda-backend/cuda/include/integer.h b/backends/tfhe-cuda-backend/cuda/include/integer.h index 99860fd1c7..9c2bfacdfa 100644 --- a/backends/tfhe-cuda-backend/cuda/include/integer.h +++ b/backends/tfhe-cuda-backend/cuda/include/integer.h @@ -112,10 +112,11 @@ void cuda_integer_mult_radix_ciphertext_kb_64( void cleanup_cuda_integer_mult(void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr_void); -void cuda_negate_integer_radix_ciphertext_64_inplace( - void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, void *lwe_array, - uint32_t lwe_dimension, uint32_t lwe_ciphertext_count, - uint32_t message_modulus, uint32_t carry_modulus); +void cuda_negate_integer_radix_ciphertext_64( + void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, + void *lwe_array_out, void *lwe_array_in, uint32_t lwe_dimension, + uint32_t lwe_ciphertext_count, uint32_t message_modulus, + uint32_t carry_modulus); void cuda_scalar_addition_integer_radix_ciphertext_64_inplace( void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, void *lwe_array, diff --git a/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cu b/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cu index 4f6761d6c6..20897b141b 100644 --- a/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cu +++ b/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cu @@ -10,7 +10,7 @@ void cuda_keyswitch_lwe_ciphertext_vector_32( void *lwe_output_indexes, void *lwe_array_in, void *lwe_input_indexes, void *ksk, uint32_t lwe_dimension_in, uint32_t lwe_dimension_out, uint32_t base_log, uint32_t level_count, uint32_t num_samples) { - host_keyswitch_lwe_ciphertext_vector( + host_keyswitch_lwe_ciphertext_vector( static_cast(stream), gpu_index, static_cast(lwe_array_out), static_cast(lwe_output_indexes), @@ -40,7 +40,7 @@ void cuda_keyswitch_lwe_ciphertext_vector_64( void *lwe_output_indexes, void *lwe_array_in, void *lwe_input_indexes, void *ksk, uint32_t lwe_dimension_in, uint32_t lwe_dimension_out, uint32_t base_log, uint32_t level_count, uint32_t num_samples) { - host_keyswitch_lwe_ciphertext_vector( + host_keyswitch_lwe_ciphertext_vector( static_cast(stream), gpu_index, static_cast(lwe_array_out), static_cast(lwe_output_indexes), @@ -66,7 +66,7 @@ void cuda_packing_keyswitch_lwe_list_to_glwe_64( uint32_t output_glwe_dimension, uint32_t output_polynomial_size, uint32_t base_log, uint32_t level_count, uint32_t num_lwes) { - host_packing_keyswitch_lwe_list_to_glwe( + host_packing_keyswitch_lwe_list_to_glwe( static_cast(stream), gpu_index, static_cast(glwe_array_out), static_cast(lwe_array_in), diff --git a/backends/tfhe-cuda-backend/cuda/src/device.cu b/backends/tfhe-cuda-backend/cuda/src/device.cu index eb881fedcd..3c520092f2 100644 --- a/backends/tfhe-cuda-backend/cuda/src/device.cu +++ b/backends/tfhe-cuda-backend/cuda/src/device.cu @@ -177,8 +177,8 @@ void cuda_set_value_async(cudaStream_t stream, uint32_t gpu_index, int num_blocks = (n + block_size - 1) / block_size; // Launch the kernel - cuda_set_value_kernel<<>>(d_array, value, - n); + cuda_set_value_kernel + <<>>(d_array, value, n); check_cuda_error(cudaGetLastError()); } } diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/addition.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/addition.cuh index e44cd0ad60..daf90239d3 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/addition.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/integer/addition.cuh @@ -37,12 +37,12 @@ void host_resolve_signed_overflow( streams[0], gpu_indexes[0], x, last_block_output_carry, d_clears, mem->params.big_lwe_dimension, 1); - host_addition(streams[0], gpu_indexes[0], last_block_inner_propagation, - last_block_inner_propagation, x, mem->params.big_lwe_dimension, - 1); - host_addition(streams[0], gpu_indexes[0], last_block_inner_propagation, - last_block_inner_propagation, last_block_input_carry, - mem->params.big_lwe_dimension, 1); + host_addition(streams[0], gpu_indexes[0], last_block_inner_propagation, + last_block_inner_propagation, x, + mem->params.big_lwe_dimension, 1); + host_addition(streams[0], gpu_indexes[0], last_block_inner_propagation, + last_block_inner_propagation, last_block_input_carry, + mem->params.big_lwe_dimension, 1); host_apply_univariate_lut_kb(streams, gpu_indexes, gpu_count, result, last_block_inner_propagation, @@ -94,14 +94,14 @@ __host__ void host_integer_signed_overflowing_add_or_sub_kb( // phase 1 if (op == SIGNED_OPERATION::ADDITION) { - host_addition(streams[0], gpu_indexes[0], result, lhs, rhs, - big_lwe_dimension, num_blocks); + host_addition(streams[0], gpu_indexes[0], result, lhs, rhs, + big_lwe_dimension, num_blocks); } else { - host_integer_radix_negation( + host_integer_radix_negation( streams, gpu_indexes, gpu_count, neg_rhs, rhs, big_lwe_dimension, num_blocks, radix_params.message_modulus, radix_params.carry_modulus); - host_addition(streams[0], gpu_indexes[0], result, lhs, neg_rhs, - big_lwe_dimension, num_blocks); + host_addition(streams[0], gpu_indexes[0], result, lhs, neg_rhs, + big_lwe_dimension, num_blocks); } // phase 2 @@ -109,10 +109,10 @@ __host__ void host_integer_signed_overflowing_add_or_sub_kb( cuda_synchronize_stream(streams[j], gpu_indexes[j]); } - host_propagate_single_carry(mem_ptr->sub_streams_1, gpu_indexes, gpu_count, - result, output_carry, input_carries, - mem_ptr->scp_mem, bsks, ksks, num_blocks); - host_generate_last_block_inner_propagation( + host_propagate_single_carry( + mem_ptr->sub_streams_1, gpu_indexes, gpu_count, result, output_carry, + input_carries, mem_ptr->scp_mem, bsks, ksks, num_blocks); + host_generate_last_block_inner_propagation( mem_ptr->sub_streams_2, gpu_indexes, gpu_count, last_block_inner_propagation, &lhs[(num_blocks - 1) * big_lwe_size], &rhs[(num_blocks - 1) * big_lwe_size], mem_ptr->las_block_prop_mem, bsks, @@ -126,7 +126,7 @@ __host__ void host_integer_signed_overflowing_add_or_sub_kb( // phase 3 auto input_carry = &input_carries[(num_blocks - 1) * big_lwe_size]; - host_resolve_signed_overflow( + host_resolve_signed_overflow( streams, gpu_indexes, gpu_count, overflowed, last_block_inner_propagation, input_carry, output_carry, mem_ptr->resolve_overflow_mem, bsks, ksks); diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/cmux.cu b/backends/tfhe-cuda-backend/cuda/src/integer/cmux.cu index 28f685b6e4..68a501eac6 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/cmux.cu +++ b/backends/tfhe-cuda-backend/cuda/src/integer/cmux.cu @@ -17,7 +17,7 @@ void scratch_cuda_integer_radix_cmux_kb_64( std::function predicate_lut_f = [](uint64_t x) -> uint64_t { return x == 1; }; - scratch_cuda_integer_radix_cmux_kb( + scratch_cuda_integer_radix_cmux_kb( (cudaStream_t *)(streams), gpu_indexes, gpu_count, (int_cmux_buffer **)mem_ptr, predicate_lut_f, lwe_ciphertext_count, params, allocate_gpu_memory); diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/cmux.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/cmux.cuh index 3082d5998c..9b72407f17 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/cmux.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/integer/cmux.cuh @@ -27,10 +27,11 @@ __host__ void zero_out_if(cudaStream_t *streams, uint32_t *gpu_indexes, auto lwe_array_out_block = tmp_lwe_array_input + i * big_lwe_size; auto lwe_array_input_block = lwe_array_input + i * big_lwe_size; - device_pack_bivariate_blocks<<>>( - lwe_array_out_block, predicate->lwe_indexes_in, lwe_array_input_block, - lwe_condition, predicate->lwe_indexes_in, params.big_lwe_dimension, - params.message_modulus, 1); + device_pack_bivariate_blocks + <<>>( + lwe_array_out_block, predicate->lwe_indexes_in, + lwe_array_input_block, lwe_condition, predicate->lwe_indexes_in, + params.big_lwe_dimension, params.message_modulus, 1); check_cuda_error(cudaGetLastError()); } @@ -57,13 +58,15 @@ __host__ void host_integer_radix_cmux_kb( } auto mem_true = mem_ptr->zero_if_true_buffer; - zero_out_if(true_streams, gpu_indexes, gpu_count, mem_ptr->tmp_true_ct, - lwe_array_true, lwe_condition, mem_true, - mem_ptr->inverted_predicate_lut, bsks, ksks, num_radix_blocks); + zero_out_if(true_streams, gpu_indexes, gpu_count, mem_ptr->tmp_true_ct, + lwe_array_true, lwe_condition, mem_true, + mem_ptr->inverted_predicate_lut, bsks, ksks, + num_radix_blocks); auto mem_false = mem_ptr->zero_if_false_buffer; - zero_out_if(false_streams, gpu_indexes, gpu_count, mem_ptr->tmp_false_ct, - lwe_array_false, lwe_condition, mem_false, mem_ptr->predicate_lut, - bsks, ksks, num_radix_blocks); + zero_out_if(false_streams, gpu_indexes, gpu_count, + mem_ptr->tmp_false_ct, lwe_array_false, lwe_condition, + mem_false, mem_ptr->predicate_lut, bsks, ksks, + num_radix_blocks); for (uint j = 0; j < mem_ptr->zero_if_true_buffer->active_gpu_count; j++) { cuda_synchronize_stream(true_streams[j], gpu_indexes[j]); } @@ -75,9 +78,9 @@ __host__ void host_integer_radix_cmux_kb( // will be 0 If the condition was false, true_ct will be 0 and false_ct will // have kept its value auto added_cts = mem_ptr->tmp_true_ct; - host_addition(streams[0], gpu_indexes[0], added_cts, mem_ptr->tmp_true_ct, - mem_ptr->tmp_false_ct, params.big_lwe_dimension, - num_radix_blocks); + host_addition(streams[0], gpu_indexes[0], added_cts, + mem_ptr->tmp_true_ct, mem_ptr->tmp_false_ct, + params.big_lwe_dimension, num_radix_blocks); integer_radix_apply_univariate_lookup_table_kb( streams, gpu_indexes, gpu_count, lwe_array_out, added_cts, bsks, ksks, diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/comparison.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/comparison.cuh index d56d6e3297..ed2a3bbef5 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/comparison.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/integer/comparison.cuh @@ -43,7 +43,7 @@ __host__ void accumulate_all_blocks(cudaStream_t stream, uint32_t gpu_index, int num_entries = (lwe_dimension + 1); getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads); // Add all blocks and store in sum - device_accumulate_all_blocks<<>>( + device_accumulate_all_blocks<<>>( output, input, lwe_dimension, num_radix_blocks); check_cuda_error(cudaGetLastError()); } @@ -62,7 +62,6 @@ __host__ void are_all_comparisons_block_true( int_comparison_buffer *mem_ptr, void **bsks, Torus **ksks, uint32_t num_radix_blocks) { - cudaSetDevice(gpu_indexes[0]); auto params = mem_ptr->params; auto big_lwe_dimension = params.big_lwe_dimension; auto glwe_dimension = params.glwe_dimension; @@ -96,8 +95,9 @@ __host__ void are_all_comparisons_block_true( auto is_equal_to_num_blocks_map = &are_all_block_true_buffer->is_equal_to_lut_map; for (int i = 0; i < num_chunks; i++) { - accumulate_all_blocks(streams[0], gpu_indexes[0], accumulator, - input_blocks, big_lwe_dimension, chunk_length); + accumulate_all_blocks(streams[0], gpu_indexes[0], accumulator, + input_blocks, big_lwe_dimension, + chunk_length); accumulator += (big_lwe_dimension + 1); remaining_blocks -= (chunk_length - 1); @@ -165,7 +165,6 @@ __host__ void is_at_least_one_comparisons_block_true( int_comparison_buffer *mem_ptr, void **bsks, Torus **ksks, uint32_t num_radix_blocks) { - cudaSetDevice(gpu_indexes[0]); auto params = mem_ptr->params; auto big_lwe_dimension = params.big_lwe_dimension; auto message_modulus = params.message_modulus; @@ -192,8 +191,9 @@ __host__ void is_at_least_one_comparisons_block_true( auto input_blocks = mem_ptr->tmp_lwe_array_out; auto accumulator = buffer->tmp_block_accumulated; for (int i = 0; i < num_chunks; i++) { - accumulate_all_blocks(streams[0], gpu_indexes[0], accumulator, - input_blocks, big_lwe_dimension, chunk_length); + accumulate_all_blocks(streams[0], gpu_indexes[0], accumulator, + input_blocks, big_lwe_dimension, + chunk_length); accumulator += (big_lwe_dimension + 1); remaining_blocks -= (chunk_length - 1); @@ -280,8 +280,8 @@ __host__ void host_compare_with_zero_equality( uint32_t chunk_size = std::min(remainder_blocks, num_elements_to_fill_carry); - accumulate_all_blocks(streams[0], gpu_indexes[0], sum_i, chunk, - big_lwe_dimension, chunk_size); + accumulate_all_blocks(streams[0], gpu_indexes[0], sum_i, chunk, + big_lwe_dimension, chunk_size); num_sum_blocks++; remainder_blocks -= (chunk_size - 1); @@ -295,8 +295,9 @@ __host__ void host_compare_with_zero_equality( integer_radix_apply_univariate_lookup_table_kb( streams, gpu_indexes, gpu_count, sum, sum, bsks, ksks, num_sum_blocks, zero_comparison); - are_all_comparisons_block_true(streams, gpu_indexes, gpu_count, lwe_array_out, - sum, mem_ptr, bsks, ksks, num_sum_blocks); + are_all_comparisons_block_true(streams, gpu_indexes, gpu_count, + lwe_array_out, sum, mem_ptr, bsks, ksks, + num_sum_blocks); } template @@ -310,7 +311,7 @@ __host__ void host_integer_radix_equality_check_kb( // Applies the LUT for the comparison operation auto comparisons = mem_ptr->tmp_block_comparisons; - integer_radix_apply_bivariate_lookup_table_kb( + integer_radix_apply_bivariate_lookup_table_kb( streams, gpu_indexes, gpu_count, comparisons, lwe_array_1, lwe_array_2, bsks, ksks, num_radix_blocks, eq_buffer->operator_lut, eq_buffer->operator_lut->params.message_modulus); @@ -319,9 +320,9 @@ __host__ void host_integer_radix_equality_check_kb( // // It returns a block encrypting 1 if all input blocks are 1 // otherwise the block encrypts 0 - are_all_comparisons_block_true(streams, gpu_indexes, gpu_count, lwe_array_out, - comparisons, mem_ptr, bsks, ksks, - num_radix_blocks); + are_all_comparisons_block_true(streams, gpu_indexes, gpu_count, + lwe_array_out, comparisons, mem_ptr, + bsks, ksks, num_radix_blocks); } template @@ -352,19 +353,20 @@ compare_radix_blocks_kb(cudaStream_t *streams, uint32_t *gpu_indexes, // Subtract // Here we need the true lwe sub, not the one that comes from shortint. - host_subtraction(streams[0], gpu_indexes[0], lwe_array_out, lwe_array_left, - lwe_array_right, big_lwe_dimension, num_radix_blocks); + host_subtraction(streams[0], gpu_indexes[0], lwe_array_out, + lwe_array_left, lwe_array_right, big_lwe_dimension, + num_radix_blocks); // Apply LUT to compare to 0 auto is_non_zero_lut = mem_ptr->eq_buffer->is_non_zero_lut; - integer_radix_apply_univariate_lookup_table_kb( + integer_radix_apply_univariate_lookup_table_kb( streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_out, bsks, ksks, num_radix_blocks, is_non_zero_lut); // Add one // Here Lhs can have the following values: (-1) % (message modulus * carry // modulus), 0, 1 So the output values after the addition will be: 0, 1, 2 - host_integer_radix_add_scalar_one_inplace( + host_integer_radix_add_scalar_one_inplace( streams, gpu_indexes, gpu_count, lwe_array_out, big_lwe_dimension, num_radix_blocks, message_modulus, carry_modulus); } @@ -406,8 +408,8 @@ tree_sign_reduction(cudaStream_t *streams, uint32_t *gpu_indexes, auto inner_tree_leaf = tree_buffer->tree_inner_leaf_lut; while (partial_block_count > 2) { - pack_blocks(streams[0], gpu_indexes[0], y, x, big_lwe_dimension, - partial_block_count, 4); + pack_blocks(streams[0], gpu_indexes[0], y, x, big_lwe_dimension, + partial_block_count, 4); integer_radix_apply_univariate_lookup_table_kb( streams, gpu_indexes, gpu_count, x, y, bsks, ksks, @@ -433,8 +435,8 @@ tree_sign_reduction(cudaStream_t *streams, uint32_t *gpu_indexes, std::function f; if (partial_block_count == 2) { - pack_blocks(streams[0], gpu_indexes[0], y, x, big_lwe_dimension, - partial_block_count, 4); + pack_blocks(streams[0], gpu_indexes[0], y, x, big_lwe_dimension, + partial_block_count, 4); f = [block_selector_f, sign_handler_f](Torus x) -> Torus { int msb = (x >> 2) & 3; @@ -454,9 +456,9 @@ tree_sign_reduction(cudaStream_t *streams, uint32_t *gpu_indexes, last_lut->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]); // Last leaf - integer_radix_apply_univariate_lookup_table_kb(streams, gpu_indexes, - gpu_count, lwe_array_out, y, - bsks, ksks, 1, last_lut); + integer_radix_apply_univariate_lookup_table_kb( + streams, gpu_indexes, gpu_count, lwe_array_out, y, bsks, ksks, 1, + last_lut); } template @@ -488,19 +490,21 @@ __host__ void host_integer_radix_difference_check_kb( if (mem_ptr->is_signed) { packed_num_radix_blocks -= 2; } - pack_blocks(streams[0], gpu_indexes[0], packed_left, lwe_array_left, - big_lwe_dimension, packed_num_radix_blocks, message_modulus); - pack_blocks(streams[0], gpu_indexes[0], packed_right, lwe_array_right, - big_lwe_dimension, packed_num_radix_blocks, message_modulus); + pack_blocks(streams[0], gpu_indexes[0], packed_left, lwe_array_left, + big_lwe_dimension, packed_num_radix_blocks, + message_modulus); + pack_blocks(streams[0], gpu_indexes[0], packed_right, + lwe_array_right, big_lwe_dimension, + packed_num_radix_blocks, message_modulus); // From this point we have half number of blocks packed_num_radix_blocks /= 2; // Clean noise auto identity_lut = mem_ptr->identity_lut; - integer_radix_apply_univariate_lookup_table_kb( + integer_radix_apply_univariate_lookup_table_kb( streams, gpu_indexes, gpu_count, packed_left, packed_left, bsks, ksks, packed_num_radix_blocks, identity_lut); - integer_radix_apply_univariate_lookup_table_kb( + integer_radix_apply_univariate_lookup_table_kb( streams, gpu_indexes, gpu_count, packed_right, packed_right, bsks, ksks, packed_num_radix_blocks, identity_lut); @@ -517,16 +521,17 @@ __host__ void host_integer_radix_difference_check_kb( if (!mem_ptr->is_signed) { // Compare packed blocks, or simply the total number of radix blocks in the // inputs - compare_radix_blocks_kb(streams, gpu_indexes, gpu_count, comparisons, lhs, - rhs, mem_ptr, bsks, ksks, packed_num_radix_blocks); + compare_radix_blocks_kb(streams, gpu_indexes, gpu_count, comparisons, + lhs, rhs, mem_ptr, bsks, ksks, + packed_num_radix_blocks); num_comparisons = packed_num_radix_blocks; } else { // Packing is possible if (carry_modulus >= message_modulus) { // Compare (num_radix_blocks - 2) / 2 packed blocks - compare_radix_blocks_kb(streams, gpu_indexes, gpu_count, comparisons, lhs, - rhs, mem_ptr, bsks, ksks, - packed_num_radix_blocks); + compare_radix_blocks_kb(streams, gpu_indexes, gpu_count, + comparisons, lhs, rhs, mem_ptr, bsks, ksks, + packed_num_radix_blocks); // Compare the last block before the sign block separately auto identity_lut = mem_ptr->identity_lut; @@ -535,21 +540,21 @@ __host__ void host_integer_radix_difference_check_kb( Torus *last_right_block_before_sign_block = diff_buffer->tmp_packed_right + packed_num_radix_blocks * big_lwe_size; - integer_radix_apply_univariate_lookup_table_kb( + integer_radix_apply_univariate_lookup_table_kb( streams, gpu_indexes, gpu_count, last_left_block_before_sign_block, lwe_array_left + (num_radix_blocks - 2) * big_lwe_size, bsks, ksks, 1, identity_lut); - integer_radix_apply_univariate_lookup_table_kb( + integer_radix_apply_univariate_lookup_table_kb( streams, gpu_indexes, gpu_count, last_right_block_before_sign_block, lwe_array_right + (num_radix_blocks - 2) * big_lwe_size, bsks, ksks, 1, identity_lut); - compare_radix_blocks_kb( + compare_radix_blocks_kb( streams, gpu_indexes, gpu_count, comparisons + packed_num_radix_blocks * big_lwe_size, last_left_block_before_sign_block, last_right_block_before_sign_block, mem_ptr, bsks, ksks, 1); // Compare the sign block separately - integer_radix_apply_bivariate_lookup_table_kb( + integer_radix_apply_bivariate_lookup_table_kb( streams, gpu_indexes, gpu_count, comparisons + (packed_num_radix_blocks + 1) * big_lwe_size, lwe_array_left + (num_radix_blocks - 1) * big_lwe_size, @@ -558,11 +563,11 @@ __host__ void host_integer_radix_difference_check_kb( num_comparisons = packed_num_radix_blocks + 2; } else { - compare_radix_blocks_kb(streams, gpu_indexes, gpu_count, comparisons, - lwe_array_left, lwe_array_right, mem_ptr, bsks, - ksks, num_radix_blocks - 1); + compare_radix_blocks_kb( + streams, gpu_indexes, gpu_count, comparisons, lwe_array_left, + lwe_array_right, mem_ptr, bsks, ksks, num_radix_blocks - 1); // Compare the sign block separately - integer_radix_apply_bivariate_lookup_table_kb( + integer_radix_apply_bivariate_lookup_table_kb( streams, gpu_indexes, gpu_count, comparisons + (num_radix_blocks - 1) * big_lwe_size, lwe_array_left + (num_radix_blocks - 1) * big_lwe_size, @@ -575,9 +580,9 @@ __host__ void host_integer_radix_difference_check_kb( // Reduces a vec containing radix blocks that encrypts a sign // (inferior, equal, superior) to one single radix block containing the // final sign - tree_sign_reduction(streams, gpu_indexes, gpu_count, lwe_array_out, - comparisons, mem_ptr->diff_buffer->tree_buffer, - reduction_lut_f, bsks, ksks, num_comparisons); + tree_sign_reduction(streams, gpu_indexes, gpu_count, lwe_array_out, + comparisons, mem_ptr->diff_buffer->tree_buffer, + reduction_lut_f, bsks, ksks, num_comparisons); } template @@ -601,16 +606,16 @@ host_integer_radix_maxmin_kb(cudaStream_t *streams, uint32_t *gpu_indexes, Torus **ksks, uint32_t total_num_radix_blocks) { // Compute the sign - host_integer_radix_difference_check_kb( + host_integer_radix_difference_check_kb( streams, gpu_indexes, gpu_count, mem_ptr->tmp_lwe_array_out, lwe_array_left, lwe_array_right, mem_ptr, mem_ptr->identity_lut_f, bsks, ksks, total_num_radix_blocks); // Selector - host_integer_radix_cmux_kb(streams, gpu_indexes, gpu_count, lwe_array_out, - mem_ptr->tmp_lwe_array_out, lwe_array_left, - lwe_array_right, mem_ptr->cmux_buffer, bsks, ksks, - total_num_radix_blocks); + host_integer_radix_cmux_kb( + streams, gpu_indexes, gpu_count, lwe_array_out, + mem_ptr->tmp_lwe_array_out, lwe_array_left, lwe_array_right, + mem_ptr->cmux_buffer, bsks, ksks, total_num_radix_blocks); } #endif diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cu b/backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cu index 841041f27b..43c35dc435 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cu +++ b/backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cu @@ -14,7 +14,7 @@ void scratch_cuda_integer_compress_radix_ciphertext_64( lwe_dimension, ks_level, ks_base_log, 0, 0, 0, message_modulus, carry_modulus); - scratch_cuda_compress_integer_radix_ciphertext_64( + scratch_cuda_compress_integer_radix_ciphertext( (cudaStream_t *)(streams), gpu_indexes, gpu_count, (int_compression **)mem_ptr, num_lwes, compression_params, lwe_per_glwe, storage_log_modulus, allocate_gpu_memory); @@ -39,7 +39,7 @@ void scratch_cuda_integer_decompress_radix_ciphertext_64( lwe_dimension, compression_glwe_dimension * compression_polynomial_size, 0, 0, pbs_level, pbs_base_log, 0, message_modulus, carry_modulus); - scratch_cuda_integer_decompress_radix_ciphertext_64( + scratch_cuda_integer_decompress_radix_ciphertext( (cudaStream_t *)(streams), gpu_indexes, gpu_count, (int_decompression **)mem_ptr, num_lwes, body_count, encryption_params, compression_params, storage_log_modulus, diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cuh index 26bd6befed..164d1e908c 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cuh @@ -54,8 +54,8 @@ __host__ void host_pack(cudaStream_t stream, uint32_t gpu_index, dim3 grid(num_blocks); dim3 threads(num_threads); - pack<<>>(array_out, array_in, log_modulus, in_len, - len); + pack<<>>(array_out, array_in, log_modulus, + in_len, len); } template @@ -70,10 +70,10 @@ __host__ void host_integer_compress(cudaStream_t *streams, // Shift auto lwe_shifted = mem_ptr->tmp_lwe; - host_cleartext_multiplication(streams[0], gpu_indexes[0], lwe_shifted, - lwe_array_in, - (uint64_t)compression_params.message_modulus, - input_lwe_dimension, num_lwes); + host_cleartext_multiplication( + streams[0], gpu_indexes[0], lwe_shifted, lwe_array_in, + (uint64_t)compression_params.message_modulus, input_lwe_dimension, + num_lwes); uint32_t lwe_in_size = input_lwe_dimension + 1; uint32_t glwe_out_size = (compression_params.glwe_dimension + 1) * @@ -88,7 +88,7 @@ __host__ void host_integer_compress(cudaStream_t *streams, auto lwe_subset = lwe_shifted + i * lwe_in_size; auto glwe_out = tmp_glwe_array_out + i * glwe_out_size; - host_packing_keyswitch_lwe_list_to_glwe( + host_packing_keyswitch_lwe_list_to_glwe( streams[0], gpu_indexes[0], glwe_out, lwe_subset, fp_ksk[0], fp_ks_buffer, input_lwe_dimension, compression_params.glwe_dimension, compression_params.polynomial_size, compression_params.ks_base_log, @@ -96,16 +96,16 @@ __host__ void host_integer_compress(cudaStream_t *streams, } // Modulus switch - host_modulus_switch_inplace(streams[0], gpu_indexes[0], tmp_glwe_array_out, - num_glwes * - (compression_params.glwe_dimension * - compression_params.polynomial_size + - body_count), - mem_ptr->storage_log_modulus); + host_modulus_switch_inplace( + streams[0], gpu_indexes[0], tmp_glwe_array_out, + num_glwes * (compression_params.glwe_dimension * + compression_params.polynomial_size + + body_count), + mem_ptr->storage_log_modulus); check_cuda_error(cudaGetLastError()); - host_pack(streams[0], gpu_indexes[0], glwe_array_out, tmp_glwe_array_out, - num_glwes, body_count, mem_ptr); + host_pack(streams[0], gpu_indexes[0], glwe_array_out, + tmp_glwe_array_out, num_glwes, body_count, mem_ptr); } template @@ -167,8 +167,8 @@ __host__ void host_extract(cudaStream_t stream, uint32_t gpu_index, getNumBlocksAndThreads(initial_out_len, 128, num_blocks, num_threads); dim3 grid(num_blocks); dim3 threads(num_threads); - extract<<>>(glwe_array_out, array_in, glwe_index, - log_modulus, initial_out_len); + extract<<>>( + glwe_array_out, array_in, glwe_index, log_modulus, initial_out_len); check_cuda_error(cudaGetLastError()); } @@ -182,8 +182,8 @@ host_integer_decompress(cudaStream_t *streams, uint32_t *gpu_indexes, auto extracted_glwe = mem_ptr->tmp_extracted_glwe; auto compression_params = mem_ptr->compression_params; - host_extract(streams[0], gpu_indexes[0], extracted_glwe, packed_glwe_in, 0, - mem_ptr); + host_extract(streams[0], gpu_indexes[0], extracted_glwe, + packed_glwe_in, 0, mem_ptr); auto num_lwes = mem_ptr->num_lwes; @@ -252,7 +252,7 @@ host_integer_decompress(cudaStream_t *streams, uint32_t *gpu_indexes, } template -__host__ void scratch_cuda_compress_integer_radix_ciphertext_64( +__host__ void scratch_cuda_compress_integer_radix_ciphertext( cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count, int_compression **mem_ptr, uint32_t num_lwes, int_radix_params compression_params, uint32_t lwe_per_glwe, @@ -264,7 +264,7 @@ __host__ void scratch_cuda_compress_integer_radix_ciphertext_64( } template -__host__ void scratch_cuda_integer_decompress_radix_ciphertext_64( +__host__ void scratch_cuda_integer_decompress_radix_ciphertext( cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count, int_decompression **mem_ptr, uint32_t num_lwes, uint32_t body_count, int_radix_params encryption_params, int_radix_params compression_params, diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/div_rem.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/div_rem.cuh index e2b95a0203..9e68cb408d 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/div_rem.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/integer/div_rem.cuh @@ -282,7 +282,7 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes, // Shift the mask so that we will only keep bits we should uint32_t shifted_mask = full_message_mask >> shift_amount; - integer_radix_apply_univariate_lookup_table_kb( + integer_radix_apply_univariate_lookup_table_kb( streams, gpu_indexes, gpu_count, interesting_divisor.last_block(), interesting_divisor.last_block(), bsks, ksks, 1, mem_ptr->masking_luts_1[shifted_mask]); @@ -310,7 +310,7 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes, // the estimated degree of the output is < msg_modulus shifted_mask = shifted_mask & full_message_mask; - integer_radix_apply_univariate_lookup_table_kb( + integer_radix_apply_univariate_lookup_table_kb( streams, gpu_indexes, gpu_count, divisor_ms_blocks.first_block(), divisor_ms_blocks.first_block(), bsks, ksks, 1, mem_ptr->masking_luts_2[shifted_mask]); @@ -334,7 +334,7 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes, interesting_remainder1.insert(0, numerator_block_1.first_block(), streams[0], gpu_indexes[0]); - host_integer_radix_logical_scalar_shift_kb_inplace( + host_integer_radix_logical_scalar_shift_kb_inplace( streams, gpu_indexes, gpu_count, interesting_remainder1.data, 1, mem_ptr->shift_mem_1, bsks, ksks, interesting_remainder1.len); @@ -342,7 +342,7 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes, interesting_remainder1.len - 1, streams[0], gpu_indexes[0]); - host_radix_blocks_rotate_left( + host_radix_blocks_rotate_left( streams, gpu_indexes, gpu_count, interesting_remainder1.data, tmp_radix.data, 1, interesting_remainder1.len, big_lwe_size); @@ -363,7 +363,7 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes, auto left_shift_interesting_remainder2 = [&](cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count) { - host_integer_radix_logical_scalar_shift_kb_inplace( + host_integer_radix_logical_scalar_shift_kb_inplace( streams, gpu_indexes, gpu_count, interesting_remainder2.data, 1, mem_ptr->shift_mem_2, bsks, ksks, interesting_remainder2.len); }; // left_shift_interesting_remainder2 @@ -396,10 +396,10 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes, // but in that position, interesting_remainder2 always has a 0 auto &merged_interesting_remainder = interesting_remainder1; - host_addition(streams[0], gpu_indexes[0], merged_interesting_remainder.data, - merged_interesting_remainder.data, - interesting_remainder2.data, radix_params.big_lwe_dimension, - merged_interesting_remainder.len); + host_addition( + streams[0], gpu_indexes[0], merged_interesting_remainder.data, + merged_interesting_remainder.data, interesting_remainder2.data, + radix_params.big_lwe_dimension, merged_interesting_remainder.len); // after create_clean_version_of_merged_remainder // `merged_interesting_remainder` will be reused as @@ -439,7 +439,7 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes, // We could call unchecked_scalar_ne // But we are in the special case where scalar == 0 // So we can skip some stuff - host_compare_with_zero_equality( + host_compare_with_zero_equality( streams, gpu_indexes, gpu_count, tmp_1.data, trivial_blocks.data, mem_ptr->comparison_buffer, bsks, ksks, trivial_blocks.len, mem_ptr->comparison_buffer->eq_buffer->is_non_zero_lut); @@ -447,7 +447,7 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes, tmp_1.len = ceil_div(trivial_blocks.len, message_modulus * carry_modulus - 1); - is_at_least_one_comparisons_block_true( + is_at_least_one_comparisons_block_true( streams, gpu_indexes, gpu_count, at_least_one_upper_block_is_non_zero.data, tmp_1.data, mem_ptr->comparison_buffer, bsks, ksks, tmp_1.len); @@ -460,7 +460,7 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes, // `cleaned_merged_interesting_remainder` - radix ciphertext auto create_clean_version_of_merged_remainder = [&](cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count) { - integer_radix_apply_univariate_lookup_table_kb( + integer_radix_apply_univariate_lookup_table_kb( streams, gpu_indexes, gpu_count, cleaned_merged_interesting_remainder.data, cleaned_merged_interesting_remainder.data, bsks, ksks, @@ -486,10 +486,10 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes, cuda_synchronize_stream(mem_ptr->sub_streams_3[j], gpu_indexes[j]); } - host_addition(streams[0], gpu_indexes[0], overflow_sum.data, - subtraction_overflowed.data, - at_least_one_upper_block_is_non_zero.data, - radix_params.big_lwe_dimension, 1); + host_addition(streams[0], gpu_indexes[0], overflow_sum.data, + subtraction_overflowed.data, + at_least_one_upper_block_is_non_zero.data, + radix_params.big_lwe_dimension, 1); int factor = (i) ? 3 : 2; int factor_lut_id = factor - 2; @@ -528,10 +528,10 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes, mem_ptr->merge_overflow_flags_luts[pos_in_block] ->params.message_modulus); - host_addition(streams[0], gpu_indexes[0], - "ient[block_of_bit * big_lwe_size], - "ient[block_of_bit * big_lwe_size], - did_not_overflow.data, radix_params.big_lwe_dimension, 1); + host_addition( + streams[0], gpu_indexes[0], "ient[block_of_bit * big_lwe_size], + "ient[block_of_bit * big_lwe_size], did_not_overflow.data, + radix_params.big_lwe_dimension, 1); }; for (uint j = 0; j < gpu_count; j++) { @@ -564,17 +564,17 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes, // Clean the quotient and remainder // as even though they have no carries, they are not at nominal noise level - host_addition(streams[0], gpu_indexes[0], remainder, remainder1.data, - remainder2.data, radix_params.big_lwe_dimension, - remainder1.len); + host_addition(streams[0], gpu_indexes[0], remainder, remainder1.data, + remainder2.data, radix_params.big_lwe_dimension, + remainder1.len); for (uint j = 0; j < gpu_count; j++) { cuda_synchronize_stream(streams[j], gpu_indexes[j]); } - integer_radix_apply_univariate_lookup_table_kb( + integer_radix_apply_univariate_lookup_table_kb( mem_ptr->sub_streams_1, gpu_indexes, gpu_count, remainder, remainder, bsks, ksks, num_blocks, mem_ptr->message_extract_lut_1); - integer_radix_apply_univariate_lookup_table_kb( + integer_radix_apply_univariate_lookup_table_kb( mem_ptr->sub_streams_2, gpu_indexes, gpu_count, quotient, quotient, bsks, ksks, num_blocks, mem_ptr->message_extract_lut_2); for (uint j = 0; j < mem_ptr->active_gpu_count; j++) { diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cu b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cu index e5b4b2e742..b9f76c0ec6 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cu +++ b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cu @@ -53,7 +53,7 @@ void scratch_cuda_propagate_single_carry_kb_64_inplace( ks_base_log, pbs_level, pbs_base_log, grouping_factor, message_modulus, carry_modulus); - scratch_cuda_propagate_single_carry_kb_inplace( + scratch_cuda_propagate_single_carry_kb_inplace( (cudaStream_t *)(streams), gpu_indexes, gpu_count, (int_sc_prop_memory **)mem_ptr, num_blocks, params, allocate_gpu_memory); diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh index cd7640d00b..b23f6e5ab2 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh @@ -78,7 +78,7 @@ host_radix_blocks_rotate_right(cudaStream_t *streams, uint32_t *gpu_indexes, "pointers should be different"); } cudaSetDevice(gpu_indexes[0]); - radix_blocks_rotate_right<<>>( + radix_blocks_rotate_right<<>>( dst, src, value, blocks_count, lwe_size); } @@ -95,7 +95,7 @@ host_radix_blocks_rotate_left(cudaStream_t *streams, uint32_t *gpu_indexes, "pointers should be different"); } cudaSetDevice(gpu_indexes[0]); - radix_blocks_rotate_left<<>>( + radix_blocks_rotate_left<<>>( dst, src, value, blocks_count, lwe_size); } @@ -124,8 +124,8 @@ host_radix_blocks_reverse_inplace(cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t lwe_size) { cudaSetDevice(gpu_indexes[0]); int num_blocks = blocks_count / 2, num_threads = 1024; - radix_blocks_reverse_lwe_inplace<<>>( - src, blocks_count, lwe_size); + radix_blocks_reverse_lwe_inplace + <<>>(src, blocks_count, lwe_size); } // polynomial_size threads @@ -164,9 +164,10 @@ __host__ void pack_bivariate_blocks(cudaStream_t *streams, int num_blocks = 0, num_threads = 0; int num_entries = num_radix_blocks * (lwe_dimension + 1); getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads); - device_pack_bivariate_blocks<<>>( - lwe_array_out, lwe_indexes_out, lwe_array_1, lwe_array_2, lwe_indexes_in, - lwe_dimension, shift, num_radix_blocks); + device_pack_bivariate_blocks + <<>>( + lwe_array_out, lwe_indexes_out, lwe_array_1, lwe_array_2, + lwe_indexes_in, lwe_dimension, shift, num_radix_blocks); check_cuda_error(cudaGetLastError()); } @@ -273,10 +274,10 @@ __host__ void integer_radix_apply_bivariate_lookup_table_kb( // Left message is shifted auto lwe_array_pbs_in = lut->tmp_lwe_before_ks; - pack_bivariate_blocks(streams, gpu_indexes, gpu_count, lwe_array_pbs_in, - lut->lwe_trivial_indexes, lwe_array_1, lwe_array_2, - lut->lwe_indexes_in, big_lwe_dimension, shift, - num_radix_blocks); + pack_bivariate_blocks(streams, gpu_indexes, gpu_count, + lwe_array_pbs_in, lut->lwe_trivial_indexes, + lwe_array_1, lwe_array_2, lut->lwe_indexes_in, + big_lwe_dimension, shift, num_radix_blocks); check_cuda_error(cudaGetLastError()); /// For multi GPU execution we create vectors of pointers for inputs and @@ -380,7 +381,7 @@ void generate_lookup_table(Torus *acc, uint32_t glwe_dimension, body[i] = -body[i]; } - rotate_left(body, half_box_size, polynomial_size); + rotate_left(body, half_box_size, polynomial_size); } template @@ -590,13 +591,13 @@ void host_propagate_single_carry(cudaStream_t *streams, uint32_t *gpu_indexes, ksks, num_blocks, luts_array); // compute prefix sum with hillis&steele - host_compute_prefix_sum_hillis_steele( + host_compute_prefix_sum_hillis_steele( streams, gpu_indexes, gpu_count, step_output, generates_or_propagates, params, luts_carry_propagation_sum, bsks, ksks, num_blocks); - host_radix_blocks_rotate_right(streams, gpu_indexes, gpu_count, step_output, - generates_or_propagates, 1, num_blocks, - big_lwe_size); + host_radix_blocks_rotate_right(streams, gpu_indexes, gpu_count, + step_output, generates_or_propagates, 1, + num_blocks, big_lwe_size); if (carry_out != nullptr) { cuda_memcpy_async_gpu_to_gpu(carry_out, step_output, big_lwe_size_bytes, streams[0], gpu_indexes[0]); @@ -610,8 +611,9 @@ void host_propagate_single_carry(cudaStream_t *streams, uint32_t *gpu_indexes, gpu_indexes[0]); } - host_addition(streams[0], gpu_indexes[0], lwe_array, lwe_array, step_output, - glwe_dimension * polynomial_size, num_blocks); + host_addition(streams[0], gpu_indexes[0], lwe_array, lwe_array, + step_output, glwe_dimension * polynomial_size, + num_blocks); integer_radix_apply_univariate_lookup_table_kb( streams, gpu_indexes, gpu_count, lwe_array, lwe_array, bsks, ksks, @@ -664,14 +666,15 @@ void host_propagate_single_sub_borrow(cudaStream_t *streams, overflowed, &generates_or_propagates[big_lwe_size * (num_blocks - 1)], big_lwe_size_bytes, streams[0], gpu_indexes[0]); - host_radix_blocks_rotate_right(streams, gpu_indexes, gpu_count, step_output, - generates_or_propagates, 1, num_blocks, - big_lwe_size); + host_radix_blocks_rotate_right(streams, gpu_indexes, gpu_count, + step_output, generates_or_propagates, 1, + num_blocks, big_lwe_size); cuda_memset_async(step_output, 0, big_lwe_size_bytes, streams[0], gpu_indexes[0]); - host_subtraction(streams[0], gpu_indexes[0], lwe_array, lwe_array, - step_output, glwe_dimension * polynomial_size, num_blocks); + host_subtraction(streams[0], gpu_indexes[0], lwe_array, lwe_array, + step_output, glwe_dimension * polynomial_size, + num_blocks); integer_radix_apply_univariate_lookup_table_kb( streams, gpu_indexes, gpu_count, lwe_array, lwe_array, bsks, ksks, @@ -727,10 +730,10 @@ void host_full_propagate_inplace(cudaStream_t *streams, uint32_t *gpu_indexes, if (i < num_blocks - 1) { auto next_input_block = &input_blocks[(i + 1) * big_lwe_size]; - host_addition(streams[0], gpu_indexes[0], next_input_block, - next_input_block, - &mem_ptr->tmp_big_lwe_vector[big_lwe_size], - params.big_lwe_dimension, 1); + host_addition(streams[0], gpu_indexes[0], next_input_block, + next_input_block, + &mem_ptr->tmp_big_lwe_vector[big_lwe_size], + params.big_lwe_dimension, 1); } } } @@ -794,7 +797,7 @@ __host__ void pack_blocks(cudaStream_t stream, uint32_t gpu_index, int num_blocks = 0, num_threads = 0; int num_entries = (lwe_dimension + 1); getNumBlocksAndThreads(num_entries, 1024, num_blocks, num_threads); - device_pack_blocks<<>>( + device_pack_blocks<<>>( lwe_array_out, lwe_array_in, lwe_dimension, num_radix_blocks, factor); } @@ -840,7 +843,7 @@ create_trivial_radix(cudaStream_t stream, uint32_t gpu_index, // this uint64_t delta = ((uint64_t)1 << 63) / (message_modulus * carry_modulus); - device_create_trivial_radix<<>>( + device_create_trivial_radix<<>>( lwe_array_out, scalar_array, num_scalar_blocks, lwe_dimension, delta); check_cuda_error(cudaGetLastError()); } @@ -857,7 +860,7 @@ __host__ void extract_n_bits(cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t num_radix_blocks, uint32_t bits_per_block, int_bit_extract_luts_buffer *bit_extract) { - integer_radix_apply_univariate_lookup_table_kb( + integer_radix_apply_univariate_lookup_table_kb( streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_in, bsks, ksks, num_radix_blocks * bits_per_block, bit_extract->lut); } @@ -870,7 +873,6 @@ reduce_signs(cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count, std::function sign_handler_f, void **bsks, Torus **ksks, uint32_t num_sign_blocks) { - cudaSetDevice(gpu_indexes[0]); auto diff_buffer = mem_ptr->diff_buffer; auto params = mem_ptr->params; @@ -904,9 +906,9 @@ reduce_signs(cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count, lut->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]); while (num_sign_blocks > 2) { - pack_blocks(streams[0], gpu_indexes[0], signs_b, signs_a, - big_lwe_dimension, num_sign_blocks, 4); - integer_radix_apply_univariate_lookup_table_kb( + pack_blocks(streams[0], gpu_indexes[0], signs_b, signs_a, + big_lwe_dimension, num_sign_blocks, 4); + integer_radix_apply_univariate_lookup_table_kb( streams, gpu_indexes, gpu_count, signs_a, signs_b, bsks, ksks, num_sign_blocks / 2, lut); @@ -937,11 +939,11 @@ reduce_signs(cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count, final_lut_f); lut->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]); - pack_blocks(streams[0], gpu_indexes[0], signs_b, signs_a, big_lwe_dimension, - 2, 4); - integer_radix_apply_univariate_lookup_table_kb(streams, gpu_indexes, - gpu_count, signs_array_out, - signs_b, bsks, ksks, 1, lut); + pack_blocks(streams[0], gpu_indexes[0], signs_b, signs_a, + big_lwe_dimension, 2, 4); + integer_radix_apply_univariate_lookup_table_kb( + streams, gpu_indexes, gpu_count, signs_array_out, signs_b, bsks, ksks, + 1, lut); } else { @@ -957,9 +959,9 @@ reduce_signs(cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count, final_lut_f); lut->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]); - integer_radix_apply_univariate_lookup_table_kb(streams, gpu_indexes, - gpu_count, signs_array_out, - signs_a, bsks, ksks, 1, lut); + integer_radix_apply_univariate_lookup_table_kb( + streams, gpu_indexes, gpu_count, signs_array_out, signs_a, bsks, ksks, + 1, lut); } } diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh index ba831d1f9f..28045096bb 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh @@ -271,7 +271,6 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb( if (!ch_amount) ch_amount++; dim3 add_grid(ch_amount, num_blocks, 1); - size_t sm_size = big_lwe_size * sizeof(Torus); cudaSetDevice(gpu_indexes[0]); tree_add_chunks<<>>( @@ -303,7 +302,7 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb( // inside d_smart_copy_in there are only -1 values // it's fine to call smart_copy with same pointer // as source and destination - smart_copy<<>>( + smart_copy<<>>( new_blocks, new_blocks, d_smart_copy_out, d_smart_copy_in, big_lwe_size); check_cuda_error(cudaGetLastError()); @@ -422,9 +421,9 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb( luts_message_carry->release(streams, gpu_indexes, gpu_count); delete (luts_message_carry); - host_addition(streams[0], gpu_indexes[0], radix_lwe_out, old_blocks, - &old_blocks[num_blocks * big_lwe_size], big_lwe_dimension, - num_blocks); + host_addition(streams[0], gpu_indexes[0], radix_lwe_out, old_blocks, + &old_blocks[num_blocks * big_lwe_size], + big_lwe_dimension, num_blocks); } template diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/negation.cu b/backends/tfhe-cuda-backend/cuda/src/integer/negation.cu index 3ed45f89e1..67ae11be96 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/negation.cu +++ b/backends/tfhe-cuda-backend/cuda/src/integer/negation.cu @@ -1,14 +1,16 @@ #include "integer/negation.cuh" -void cuda_negate_integer_radix_ciphertext_64_inplace( - void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, void *lwe_array, - uint32_t lwe_dimension, uint32_t lwe_ciphertext_count, - uint32_t message_modulus, uint32_t carry_modulus) { +void cuda_negate_integer_radix_ciphertext_64( + void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, + void *lwe_array_out, void *lwe_array_in, uint32_t lwe_dimension, + uint32_t lwe_ciphertext_count, uint32_t message_modulus, + uint32_t carry_modulus) { - host_integer_radix_negation( + host_integer_radix_negation( (cudaStream_t *)(streams), gpu_indexes, gpu_count, - static_cast(lwe_array), static_cast(lwe_array), - lwe_dimension, lwe_ciphertext_count, message_modulus, carry_modulus); + static_cast(lwe_array_out), + static_cast(lwe_array_in), lwe_dimension, + lwe_ciphertext_count, message_modulus, carry_modulus); } void scratch_cuda_integer_radix_overflowing_sub_kb_64( diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/negation.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/negation.cuh index 2a18793ba7..eb8307c93a 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/negation.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/integer/negation.cuh @@ -25,14 +25,13 @@ template __global__ void device_integer_radix_negation(Torus *output, Torus *input, int32_t num_blocks, uint64_t lwe_dimension, uint64_t message_modulus, - uint64_t carry_modulus, uint64_t delta) { + uint64_t delta) { int tid = blockIdx.x * blockDim.x + threadIdx.x; if (tid < lwe_dimension + 1) { bool is_body = (tid == lwe_dimension); // z = ceil( degree / 2^p ) * 2^p uint64_t z = (2 * message_modulus - 1) / message_modulus; - __syncthreads(); z *= message_modulus; // (0,Delta*z) - ct @@ -47,12 +46,9 @@ device_integer_radix_negation(Torus *output, Torus *input, int32_t num_blocks, uint64_t encoded_zb = zb * delta; - __syncthreads(); - // (0,Delta*z) - ct output[tid] = (is_body ? z * delta - (input[tid] + encoded_zb) : -input[tid]); - __syncthreads(); } } } @@ -75,16 +71,15 @@ host_integer_radix_negation(cudaStream_t *streams, uint32_t *gpu_indexes, getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads); dim3 grid(num_blocks, 1, 1); dim3 thds(num_threads, 1, 1); - uint64_t shared_mem = input_lwe_ciphertext_count * sizeof(uint32_t); // Value of the shift we multiply our messages by // If message_modulus and carry_modulus are always powers of 2 we can simplify // this uint64_t delta = ((uint64_t)1 << 63) / (message_modulus * carry_modulus); - device_integer_radix_negation<<>>( + device_integer_radix_negation<<>>( output, input, input_lwe_ciphertext_count, lwe_dimension, message_modulus, - carry_modulus, delta); + delta); check_cuda_error(cudaGetLastError()); } @@ -107,7 +102,7 @@ __host__ void host_integer_overflowing_sub_kb( auto radix_params = mem_ptr->params; - host_unchecked_sub_with_correcting_term( + host_unchecked_sub_with_correcting_term( streams[0], gpu_indexes[0], radix_lwe_out, radix_lwe_left, radix_lwe_right, radix_params.big_lwe_dimension, num_blocks, radix_params.message_modulus, radix_params.carry_modulus, diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_addition.cu b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_addition.cu index d1b65c5e71..153a6e5e86 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_addition.cu +++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_addition.cu @@ -5,7 +5,7 @@ void cuda_scalar_addition_integer_radix_ciphertext_64_inplace( void *scalar_input, uint32_t lwe_dimension, uint32_t lwe_ciphertext_count, uint32_t message_modulus, uint32_t carry_modulus) { - host_integer_radix_scalar_addition_inplace( + host_integer_radix_scalar_addition_inplace( (cudaStream_t *)(streams), gpu_indexes, gpu_count, static_cast(lwe_array), static_cast(scalar_input), lwe_dimension, lwe_ciphertext_count, message_modulus, carry_modulus); diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_addition.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_addition.cuh index 79a7c88072..1595651f06 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_addition.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_addition.cuh @@ -45,9 +45,10 @@ __host__ void host_integer_radix_scalar_addition_inplace( // this uint64_t delta = ((uint64_t)1 << 63) / (message_modulus * carry_modulus); - device_integer_radix_scalar_addition_inplace<<>>( - lwe_array, scalar_input, input_lwe_ciphertext_count, lwe_dimension, - delta); + device_integer_radix_scalar_addition_inplace + <<>>(lwe_array, scalar_input, + input_lwe_ciphertext_count, lwe_dimension, + delta); check_cuda_error(cudaGetLastError()); } @@ -83,8 +84,9 @@ __host__ void host_integer_radix_add_scalar_one_inplace( // this uint64_t delta = ((uint64_t)1 << 63) / (message_modulus * carry_modulus); - device_integer_radix_add_scalar_one_inplace<<>>( - lwe_array, input_lwe_ciphertext_count, lwe_dimension, delta); + device_integer_radix_add_scalar_one_inplace + <<>>(lwe_array, input_lwe_ciphertext_count, + lwe_dimension, delta); check_cuda_error(cudaGetLastError()); } @@ -122,10 +124,10 @@ __host__ void host_integer_radix_scalar_subtraction_inplace( // this uint64_t delta = ((uint64_t)1 << 63) / (message_modulus * carry_modulus); - device_integer_radix_scalar_subtraction_inplace<<>>( - lwe_array, scalar_input, input_lwe_ciphertext_count, lwe_dimension, - delta); + device_integer_radix_scalar_subtraction_inplace + <<>>(lwe_array, scalar_input, + input_lwe_ciphertext_count, lwe_dimension, + delta); check_cuda_error(cudaGetLastError()); } #endif diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_comparison.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_comparison.cuh index 7cc3e6cec0..a8cd292e7a 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_comparison.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_comparison.cuh @@ -3,6 +3,58 @@ #include "integer/comparison.cuh" +template +__host__ void scalar_compare_radix_blocks_kb( + cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count, + Torus *lwe_array_out, Torus *lwe_array_in, Torus *scalar_blocks, + int_comparison_buffer *mem_ptr, void **bsks, Torus **ksks, + uint32_t num_radix_blocks) { + + if (num_radix_blocks == 0) + return; + auto params = mem_ptr->params; + auto big_lwe_dimension = params.big_lwe_dimension; + auto message_modulus = params.message_modulus; + auto carry_modulus = params.carry_modulus; + + // When rhs > lhs, the subtraction will overflow, and the bit of padding will + // be set to 1 + // meaning that the output of the pbs will be the negative (modulo message + // space) + // + // Example: + // lhs: 1, rhs: 3, message modulus: 4, carry modulus 4 + // lhs - rhs = -2 % (4 * 4) = 14 = 1|1110 (padding_bit|b4b3b2b1) + // Since there was an overflow the bit of padding is 1 and not 0. + // When applying the LUT for an input value of 14 we would expect 1, + // but since the bit of padding is 1, we will get -1 modulus our message + // space, so (-1) % (4 * 4) = 15 = 1|1111 We then add one and get 0 = 0|0000 + + auto subtracted_blocks = mem_ptr->tmp_block_comparisons; + cuda_memcpy_async_gpu_to_gpu(subtracted_blocks, lwe_array_in, + num_radix_blocks * (big_lwe_dimension + 1) * + sizeof(Torus), + streams[0], gpu_indexes[0]); + // Subtract + // Here we need the true lwe sub, not the one that comes from shortint. + host_integer_radix_scalar_subtraction_inplace( + streams, gpu_indexes, gpu_count, subtracted_blocks, scalar_blocks, + big_lwe_dimension, num_radix_blocks, message_modulus, carry_modulus); + + // Apply LUT to compare to 0 + auto sign_lut = mem_ptr->eq_buffer->is_non_zero_lut; + integer_radix_apply_univariate_lookup_table_kb( + streams, gpu_indexes, gpu_count, lwe_array_out, subtracted_blocks, bsks, + ksks, num_radix_blocks, sign_lut); + + // Add one + // Here Lhs can have the following values: (-1) % (message modulus * carry + // modulus), 0, 1 So the output values after the addition will be: 0, 1, 2 + host_integer_radix_add_scalar_one_inplace( + streams, gpu_indexes, gpu_count, lwe_array_out, big_lwe_dimension, + num_radix_blocks, message_modulus, carry_modulus); +} + template __host__ void integer_radix_unsigned_scalar_difference_check_kb( cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count, @@ -45,10 +97,10 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb( if (total_num_scalar_blocks == 0) { // We only have to compare blocks with zero // means scalar is zero - host_compare_with_zero_equality(streams, gpu_indexes, gpu_count, - mem_ptr->tmp_lwe_array_out, lwe_array_in, - mem_ptr, bsks, ksks, total_num_radix_blocks, - mem_ptr->is_zero_lut); + host_compare_with_zero_equality( + streams, gpu_indexes, gpu_count, mem_ptr->tmp_lwe_array_out, + lwe_array_in, mem_ptr, bsks, ksks, total_num_radix_blocks, + mem_ptr->is_zero_lut); auto scalar_last_leaf_lut_f = [sign_handler_f](Torus x) -> Torus { x = (x == 1 ? IS_EQUAL : IS_SUPERIOR); @@ -91,10 +143,11 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb( Torus *lhs = diff_buffer->tmp_packed_left; Torus *rhs = diff_buffer->tmp_packed_right; - pack_blocks(lsb_streams[0], gpu_indexes[0], lhs, lwe_array_in, - big_lwe_dimension, num_lsb_radix_blocks, message_modulus); - pack_blocks(lsb_streams[0], gpu_indexes[0], rhs, scalar_blocks, 0, - total_num_scalar_blocks, message_modulus); + pack_blocks(lsb_streams[0], gpu_indexes[0], lhs, lwe_array_in, + big_lwe_dimension, num_lsb_radix_blocks, + message_modulus); + pack_blocks(lsb_streams[0], gpu_indexes[0], rhs, scalar_blocks, 0, + total_num_scalar_blocks, message_modulus); // From this point we have half number of blocks num_lsb_radix_blocks /= 2; @@ -106,22 +159,22 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb( // - 2 if lhs > rhs auto comparisons = mem_ptr->tmp_block_comparisons; - scalar_compare_radix_blocks_kb(lsb_streams, gpu_indexes, gpu_count, - comparisons, lhs, rhs, mem_ptr, bsks, ksks, - num_lsb_radix_blocks); + scalar_compare_radix_blocks_kb(lsb_streams, gpu_indexes, gpu_count, + comparisons, lhs, rhs, mem_ptr, bsks, + ksks, num_lsb_radix_blocks); // Reduces a vec containing radix blocks that encrypts a sign // (inferior, equal, superior) to one single radix block containing the // final sign - tree_sign_reduction(lsb_streams, gpu_indexes, gpu_count, lwe_array_lsb_out, - comparisons, mem_ptr->diff_buffer->tree_buffer, - mem_ptr->identity_lut_f, bsks, ksks, - num_lsb_radix_blocks); + tree_sign_reduction( + lsb_streams, gpu_indexes, gpu_count, lwe_array_lsb_out, comparisons, + mem_ptr->diff_buffer->tree_buffer, mem_ptr->identity_lut_f, bsks, ksks, + num_lsb_radix_blocks); ////////////// // msb - host_compare_with_zero_equality(msb_streams, gpu_indexes, gpu_count, - lwe_array_msb_out, msb, mem_ptr, bsks, ksks, - num_msb_radix_blocks, mem_ptr->is_zero_lut); + host_compare_with_zero_equality( + msb_streams, gpu_indexes, gpu_count, lwe_array_msb_out, msb, mem_ptr, + bsks, ksks, num_msb_radix_blocks, mem_ptr->is_zero_lut); for (uint j = 0; j < mem_ptr->active_gpu_count; j++) { cuda_synchronize_stream(lsb_streams[j], gpu_indexes[j]); cuda_synchronize_stream(msb_streams[j], gpu_indexes[j]); @@ -145,7 +198,7 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb( scalar_bivariate_last_leaf_lut_f); lut->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]); - integer_radix_apply_bivariate_lookup_table_kb( + integer_radix_apply_bivariate_lookup_table_kb( streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_lsb_out, lwe_array_msb_out, bsks, ksks, 1, lut, lut->params.message_modulus); @@ -159,10 +212,11 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb( Torus *lhs = diff_buffer->tmp_packed_left; Torus *rhs = diff_buffer->tmp_packed_right; - pack_blocks(streams[0], gpu_indexes[0], lhs, lwe_array_in, - big_lwe_dimension, num_lsb_radix_blocks, message_modulus); - pack_blocks(streams[0], gpu_indexes[0], rhs, scalar_blocks, 0, - num_scalar_blocks, message_modulus); + pack_blocks(streams[0], gpu_indexes[0], lhs, lwe_array_in, + big_lwe_dimension, num_lsb_radix_blocks, + message_modulus); + pack_blocks(streams[0], gpu_indexes[0], rhs, scalar_blocks, 0, + num_scalar_blocks, message_modulus); // From this point we have half number of blocks num_lsb_radix_blocks /= 2; @@ -173,16 +227,17 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb( // - 1 if lhs == rhs // - 2 if lhs > rhs auto comparisons = mem_ptr->tmp_lwe_array_out; - scalar_compare_radix_blocks_kb(streams, gpu_indexes, gpu_count, comparisons, - lhs, rhs, mem_ptr, bsks, ksks, - num_lsb_radix_blocks); + scalar_compare_radix_blocks_kb(streams, gpu_indexes, gpu_count, + comparisons, lhs, rhs, mem_ptr, bsks, + ksks, num_lsb_radix_blocks); // Reduces a vec containing radix blocks that encrypts a sign // (inferior, equal, superior) to one single radix block containing the // final sign - tree_sign_reduction(streams, gpu_indexes, gpu_count, lwe_array_out, - comparisons, mem_ptr->diff_buffer->tree_buffer, - sign_handler_f, bsks, ksks, num_lsb_radix_blocks); + tree_sign_reduction(streams, gpu_indexes, gpu_count, lwe_array_out, + comparisons, mem_ptr->diff_buffer->tree_buffer, + sign_handler_f, bsks, ksks, + num_lsb_radix_blocks); } } @@ -229,7 +284,7 @@ __host__ void integer_radix_signed_scalar_difference_check_kb( // We only have to compare blocks with zero // means scalar is zero Torus *are_all_msb_zeros = mem_ptr->tmp_lwe_array_out; - host_compare_with_zero_equality( + host_compare_with_zero_equality( streams, gpu_indexes, gpu_count, are_all_msb_zeros, lwe_array_in, mem_ptr, bsks, ksks, total_num_radix_blocks, mem_ptr->is_zero_lut); Torus *sign_block = @@ -277,7 +332,7 @@ __host__ void integer_radix_signed_scalar_difference_check_kb( scalar_bivariate_last_leaf_lut_f); lut->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]); - integer_radix_apply_bivariate_lookup_table_kb( + integer_radix_apply_bivariate_lookup_table_kb( streams, gpu_indexes, gpu_count, lwe_array_out, are_all_msb_zeros, sign_block, bsks, ksks, 1, lut, lut->params.message_modulus); @@ -304,10 +359,11 @@ __host__ void integer_radix_signed_scalar_difference_check_kb( Torus *lhs = diff_buffer->tmp_packed_left; Torus *rhs = diff_buffer->tmp_packed_right; - pack_blocks(lsb_streams[0], gpu_indexes[0], lhs, lwe_array_in, - big_lwe_dimension, num_lsb_radix_blocks, message_modulus); - pack_blocks(lsb_streams[0], gpu_indexes[0], rhs, scalar_blocks, 0, - total_num_scalar_blocks, message_modulus); + pack_blocks(lsb_streams[0], gpu_indexes[0], lhs, lwe_array_in, + big_lwe_dimension, num_lsb_radix_blocks, + message_modulus); + pack_blocks(lsb_streams[0], gpu_indexes[0], rhs, scalar_blocks, 0, + total_num_scalar_blocks, message_modulus); // From this point we have half number of blocks num_lsb_radix_blocks /= 2; @@ -319,24 +375,24 @@ __host__ void integer_radix_signed_scalar_difference_check_kb( // - 2 if lhs > rhs auto comparisons = mem_ptr->tmp_block_comparisons; - scalar_compare_radix_blocks_kb(lsb_streams, gpu_indexes, gpu_count, - comparisons, lhs, rhs, mem_ptr, bsks, ksks, - num_lsb_radix_blocks); + scalar_compare_radix_blocks_kb(lsb_streams, gpu_indexes, gpu_count, + comparisons, lhs, rhs, mem_ptr, bsks, + ksks, num_lsb_radix_blocks); // Reduces a vec containing radix blocks that encrypts a sign // (inferior, equal, superior) to one single radix block containing the // final sign - tree_sign_reduction(lsb_streams, gpu_indexes, gpu_count, lwe_array_lsb_out, - comparisons, mem_ptr->diff_buffer->tree_buffer, - mem_ptr->identity_lut_f, bsks, ksks, - num_lsb_radix_blocks); + tree_sign_reduction( + lsb_streams, gpu_indexes, gpu_count, lwe_array_lsb_out, comparisons, + mem_ptr->diff_buffer->tree_buffer, mem_ptr->identity_lut_f, bsks, ksks, + num_lsb_radix_blocks); ////////////// // msb // We remove the last block (which is the sign) Torus *are_all_msb_zeros = lwe_array_msb_out; - host_compare_with_zero_equality(msb_streams, gpu_indexes, gpu_count, - are_all_msb_zeros, msb, mem_ptr, bsks, ksks, - num_msb_radix_blocks, mem_ptr->is_zero_lut); + host_compare_with_zero_equality( + msb_streams, gpu_indexes, gpu_count, are_all_msb_zeros, msb, mem_ptr, + bsks, ksks, num_msb_radix_blocks, mem_ptr->is_zero_lut); auto sign_bit_pos = (int)log2(message_modulus) - 1; @@ -371,7 +427,7 @@ __host__ void integer_radix_signed_scalar_difference_check_kb( signed_msb_lut->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]); Torus *sign_block = msb + (num_msb_radix_blocks - 1) * big_lwe_size; - integer_radix_apply_bivariate_lookup_table_kb( + integer_radix_apply_bivariate_lookup_table_kb( msb_streams, gpu_indexes, gpu_count, lwe_array_msb_out, sign_block, are_all_msb_zeros, bsks, ksks, 1, signed_msb_lut, signed_msb_lut->params.message_modulus); @@ -382,8 +438,9 @@ __host__ void integer_radix_signed_scalar_difference_check_kb( ////////////// // Reduce the two blocks into one final - reduce_signs(streams, gpu_indexes, gpu_count, lwe_array_out, - lwe_array_lsb_out, mem_ptr, sign_handler_f, bsks, ksks, 2); + reduce_signs(streams, gpu_indexes, gpu_count, lwe_array_out, + lwe_array_lsb_out, mem_ptr, sign_handler_f, bsks, ksks, + 2); } else { // We only have to do the regular comparison @@ -403,10 +460,11 @@ __host__ void integer_radix_signed_scalar_difference_check_kb( Torus *lhs = diff_buffer->tmp_packed_left; Torus *rhs = diff_buffer->tmp_packed_right; - pack_blocks(lsb_streams[0], gpu_indexes[0], lhs, lwe_array_in, - big_lwe_dimension, num_lsb_radix_blocks - 1, message_modulus); - pack_blocks(lsb_streams[0], gpu_indexes[0], rhs, scalar_blocks, 0, - num_lsb_radix_blocks - 1, message_modulus); + pack_blocks(lsb_streams[0], gpu_indexes[0], lhs, lwe_array_in, + big_lwe_dimension, num_lsb_radix_blocks - 1, + message_modulus); + pack_blocks(lsb_streams[0], gpu_indexes[0], rhs, scalar_blocks, 0, + num_lsb_radix_blocks - 1, message_modulus); // From this point we have half number of blocks num_lsb_radix_blocks /= 2; @@ -415,19 +473,19 @@ __host__ void integer_radix_signed_scalar_difference_check_kb( // - 0 if lhs < rhs // - 1 if lhs == rhs // - 2 if lhs > rhs - scalar_compare_radix_blocks_kb(lsb_streams, gpu_indexes, gpu_count, - lwe_array_ct_out, lhs, rhs, mem_ptr, bsks, - ksks, num_lsb_radix_blocks); + scalar_compare_radix_blocks_kb(lsb_streams, gpu_indexes, gpu_count, + lwe_array_ct_out, lhs, rhs, mem_ptr, + bsks, ksks, num_lsb_radix_blocks); Torus *encrypted_sign_block = lwe_array_in + (total_num_radix_blocks - 1) * big_lwe_size; Torus *scalar_sign_block = scalar_blocks + (total_num_scalar_blocks - 1); auto trivial_sign_block = mem_ptr->tmp_trivial_sign_block; - create_trivial_radix(msb_streams[0], gpu_indexes[0], trivial_sign_block, - scalar_sign_block, big_lwe_dimension, 1, 1, - message_modulus, carry_modulus); + create_trivial_radix( + msb_streams[0], gpu_indexes[0], trivial_sign_block, scalar_sign_block, + big_lwe_dimension, 1, 1, message_modulus, carry_modulus); - integer_radix_apply_bivariate_lookup_table_kb( + integer_radix_apply_bivariate_lookup_table_kb( msb_streams, gpu_indexes, gpu_count, lwe_array_sign_out, encrypted_sign_block, trivial_sign_block, bsks, ksks, 1, mem_ptr->signed_lut, mem_ptr->signed_lut->params.message_modulus); @@ -439,9 +497,9 @@ __host__ void integer_radix_signed_scalar_difference_check_kb( // Reduces a vec containing radix blocks that encrypts a sign // (inferior, equal, superior) to one single radix block containing the // final sign - reduce_signs(streams, gpu_indexes, gpu_count, lwe_array_out, - lwe_array_ct_out, mem_ptr, sign_handler_f, bsks, ksks, - num_lsb_radix_blocks + 1); + reduce_signs(streams, gpu_indexes, gpu_count, lwe_array_out, + lwe_array_ct_out, mem_ptr, sign_handler_f, bsks, ksks, + num_lsb_radix_blocks + 1); } } @@ -452,14 +510,13 @@ __host__ void integer_radix_signed_scalar_maxmin_kb( int_comparison_buffer *mem_ptr, void **bsks, Torus **ksks, uint32_t total_num_radix_blocks, uint32_t total_num_scalar_blocks) { - cudaSetDevice(gpu_indexes[0]); auto params = mem_ptr->params; // Calculates the difference sign between the ciphertext and the scalar // - 0 if lhs < rhs // - 1 if lhs == rhs // - 2 if lhs > rhs auto sign = mem_ptr->tmp_lwe_array_out; - integer_radix_signed_scalar_difference_check_kb( + integer_radix_signed_scalar_difference_check_kb( streams, gpu_indexes, gpu_count, sign, lwe_array_in, scalar_blocks, mem_ptr, mem_ptr->identity_lut_f, bsks, ksks, total_num_radix_blocks, total_num_scalar_blocks); @@ -469,17 +526,17 @@ __host__ void integer_radix_signed_scalar_maxmin_kb( auto lwe_array_left = lwe_array_in; auto lwe_array_right = mem_ptr->tmp_block_comparisons; - create_trivial_radix(streams[0], gpu_indexes[0], lwe_array_right, - scalar_blocks, params.big_lwe_dimension, - total_num_radix_blocks, total_num_scalar_blocks, - params.message_modulus, params.carry_modulus); + create_trivial_radix(streams[0], gpu_indexes[0], lwe_array_right, + scalar_blocks, params.big_lwe_dimension, + total_num_radix_blocks, total_num_scalar_blocks, + params.message_modulus, params.carry_modulus); // Selector // CMUX for Max or Min - host_integer_radix_cmux_kb(streams, gpu_indexes, gpu_count, lwe_array_out, - sign, lwe_array_left, lwe_array_right, - mem_ptr->cmux_buffer, bsks, ksks, - total_num_radix_blocks); + host_integer_radix_cmux_kb(streams, gpu_indexes, gpu_count, + lwe_array_out, sign, lwe_array_left, + lwe_array_right, mem_ptr->cmux_buffer, bsks, + ksks, total_num_radix_blocks); } template @@ -492,12 +549,12 @@ __host__ void host_integer_radix_scalar_difference_check_kb( if (mem_ptr->is_signed) { // is signed and scalar is positive - integer_radix_signed_scalar_difference_check_kb( + integer_radix_signed_scalar_difference_check_kb( streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_in, scalar_blocks, mem_ptr, sign_handler_f, bsks, ksks, total_num_radix_blocks, total_num_scalar_blocks); } else { - integer_radix_unsigned_scalar_difference_check_kb( + integer_radix_unsigned_scalar_difference_check_kb( streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_in, scalar_blocks, mem_ptr, sign_handler_f, bsks, ksks, total_num_radix_blocks, total_num_scalar_blocks); @@ -513,70 +570,16 @@ __host__ void host_integer_radix_signed_scalar_maxmin_kb( if (mem_ptr->is_signed) { // is signed and scalar is positive - integer_radix_signed_scalar_maxmin_kb( + integer_radix_signed_scalar_maxmin_kb( streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_in, scalar_blocks, mem_ptr, bsks, ksks, total_num_radix_blocks, total_num_scalar_blocks); } else { - integer_radix_unsigned_scalar_maxmin_kb( - streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_in, - scalar_blocks, mem_ptr, bsks, ksks, total_num_radix_blocks, - total_num_scalar_blocks); + PANIC("Cuda error: only signed scalar maxmin can be called in signed " + "scalar comparison") } } -template -__host__ void scalar_compare_radix_blocks_kb( - cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count, - Torus *lwe_array_out, Torus *lwe_array_in, Torus *scalar_blocks, - int_comparison_buffer *mem_ptr, void **bsks, Torus **ksks, - uint32_t num_radix_blocks) { - - if (num_radix_blocks == 0) - return; - auto params = mem_ptr->params; - auto big_lwe_dimension = params.big_lwe_dimension; - auto message_modulus = params.message_modulus; - auto carry_modulus = params.carry_modulus; - - // When rhs > lhs, the subtraction will overflow, and the bit of padding will - // be set to 1 - // meaning that the output of the pbs will be the negative (modulo message - // space) - // - // Example: - // lhs: 1, rhs: 3, message modulus: 4, carry modulus 4 - // lhs - rhs = -2 % (4 * 4) = 14 = 1|1110 (padding_bit|b4b3b2b1) - // Since there was an overflow the bit of padding is 1 and not 0. - // When applying the LUT for an input value of 14 we would expect 1, - // but since the bit of padding is 1, we will get -1 modulus our message - // space, so (-1) % (4 * 4) = 15 = 1|1111 We then add one and get 0 = 0|0000 - - auto subtracted_blocks = mem_ptr->tmp_block_comparisons; - cuda_memcpy_async_gpu_to_gpu(subtracted_blocks, lwe_array_in, - num_radix_blocks * (big_lwe_dimension + 1) * - sizeof(Torus), - streams[0], gpu_indexes[0]); - // Subtract - // Here we need the true lwe sub, not the one that comes from shortint. - host_integer_radix_scalar_subtraction_inplace( - streams, gpu_indexes, gpu_count, subtracted_blocks, scalar_blocks, - big_lwe_dimension, num_radix_blocks, message_modulus, carry_modulus); - - // Apply LUT to compare to 0 - auto sign_lut = mem_ptr->eq_buffer->is_non_zero_lut; - integer_radix_apply_univariate_lookup_table_kb( - streams, gpu_indexes, gpu_count, lwe_array_out, subtracted_blocks, bsks, - ksks, num_radix_blocks, sign_lut); - - // Add one - // Here Lhs can have the following values: (-1) % (message modulus * carry - // modulus), 0, 1 So the output values after the addition will be: 0, 1, 2 - host_integer_radix_add_scalar_one_inplace( - streams, gpu_indexes, gpu_count, lwe_array_out, big_lwe_dimension, - num_radix_blocks, message_modulus, carry_modulus); -} - template __host__ void host_integer_radix_scalar_maxmin_kb( cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count, @@ -591,7 +594,7 @@ __host__ void host_integer_radix_scalar_maxmin_kb( // - 1 if lhs == rhs // - 2 if lhs > rhs auto sign = mem_ptr->tmp_lwe_array_out; - host_integer_radix_scalar_difference_check_kb( + host_integer_radix_scalar_difference_check_kb( streams, gpu_indexes, gpu_count, sign, lwe_array_in, scalar_blocks, mem_ptr, mem_ptr->identity_lut_f, bsks, ksks, total_num_radix_blocks, total_num_scalar_blocks); @@ -601,17 +604,17 @@ __host__ void host_integer_radix_scalar_maxmin_kb( auto lwe_array_left = lwe_array_in; auto lwe_array_right = mem_ptr->tmp_block_comparisons; - create_trivial_radix(streams[0], gpu_indexes[0], lwe_array_right, - scalar_blocks, params.big_lwe_dimension, - total_num_radix_blocks, total_num_scalar_blocks, - params.message_modulus, params.carry_modulus); + create_trivial_radix(streams[0], gpu_indexes[0], lwe_array_right, + scalar_blocks, params.big_lwe_dimension, + total_num_radix_blocks, total_num_scalar_blocks, + params.message_modulus, params.carry_modulus); // Selector // CMUX for Max or Min - host_integer_radix_cmux_kb(streams, gpu_indexes, gpu_count, lwe_array_out, - mem_ptr->tmp_lwe_array_out, lwe_array_left, - lwe_array_right, mem_ptr->cmux_buffer, bsks, ksks, - total_num_radix_blocks); + host_integer_radix_cmux_kb( + streams, gpu_indexes, gpu_count, lwe_array_out, + mem_ptr->tmp_lwe_array_out, lwe_array_left, lwe_array_right, + mem_ptr->cmux_buffer, bsks, ksks, total_num_radix_blocks); } template @@ -659,10 +662,11 @@ __host__ void host_integer_radix_scalar_equality_check_kb( auto packed_scalar = packed_blocks + big_lwe_size * num_halved_lsb_radix_blocks; - pack_blocks(lsb_streams[0], gpu_indexes[0], packed_blocks, lsb, - big_lwe_dimension, num_lsb_radix_blocks, message_modulus); - pack_blocks(lsb_streams[0], gpu_indexes[0], packed_scalar, scalar_blocks, 0, - num_scalar_blocks, message_modulus); + pack_blocks(lsb_streams[0], gpu_indexes[0], packed_blocks, lsb, + big_lwe_dimension, num_lsb_radix_blocks, + message_modulus); + pack_blocks(lsb_streams[0], gpu_indexes[0], packed_scalar, + scalar_blocks, 0, num_scalar_blocks, message_modulus); cuda_memcpy_async_gpu_to_gpu( scalar_comparison_luts->get_lut_indexes(gpu_indexes[0], 0), @@ -670,7 +674,7 @@ __host__ void host_integer_radix_scalar_equality_check_kb( gpu_indexes[0]); scalar_comparison_luts->broadcast_lut(lsb_streams, gpu_indexes, 0); - integer_radix_apply_univariate_lookup_table_kb( + integer_radix_apply_univariate_lookup_table_kb( lsb_streams, gpu_indexes, gpu_count, lwe_array_lsb_out, packed_blocks, bsks, ksks, num_halved_lsb_radix_blocks, scalar_comparison_luts); } @@ -689,9 +693,9 @@ __host__ void host_integer_radix_scalar_equality_check_kb( PANIC("Cuda error: integer operation not supported") } - host_compare_with_zero_equality(msb_streams, gpu_indexes, gpu_count, - lwe_array_msb_out, msb, mem_ptr, bsks, ksks, - num_msb_radix_blocks, msb_lut); + host_compare_with_zero_equality( + msb_streams, gpu_indexes, gpu_count, lwe_array_msb_out, msb, mem_ptr, + bsks, ksks, num_msb_radix_blocks, msb_lut); } for (uint j = 0; j < mem_ptr->active_gpu_count; j++) { @@ -701,13 +705,13 @@ __host__ void host_integer_radix_scalar_equality_check_kb( switch (mem_ptr->op) { case COMPARISON_TYPE::EQ: - are_all_comparisons_block_true( + are_all_comparisons_block_true( streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_lsb_out, mem_ptr, bsks, ksks, num_halved_scalar_blocks + (num_msb_radix_blocks > 0)); break; case COMPARISON_TYPE::NE: - is_at_least_one_comparisons_block_true( + is_at_least_one_comparisons_block_true( streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_lsb_out, mem_ptr, bsks, ksks, num_halved_scalar_blocks + (num_msb_radix_blocks > 0)); diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_mul.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_mul.cuh index 8347945532..ad71ea5cb0 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_mul.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_mul.cuh @@ -65,7 +65,7 @@ __host__ void host_integer_scalar_mul_radix( cuda_memcpy_async_gpu_to_gpu(ptr, lwe_array, lwe_size_bytes * num_radix_blocks, streams[0], gpu_indexes[0]); - host_integer_radix_logical_scalar_shift_kb_inplace( + host_integer_radix_logical_scalar_shift_kb_inplace( streams, gpu_indexes, gpu_count, ptr, shift_amount, mem->logical_scalar_shift_buffer, bsks, ksks, num_radix_blocks); } else { @@ -82,15 +82,16 @@ __host__ void host_integer_scalar_mul_radix( preshifted_buffer + (i % msg_bits) * num_radix_blocks * lwe_size; T *block_shift_buffer = all_shifted_buffer + j * num_radix_blocks * lwe_size; - host_radix_blocks_rotate_right(streams, gpu_indexes, gpu_count, - block_shift_buffer, preshifted_radix_ct, - i / msg_bits, num_radix_blocks, lwe_size); + host_radix_blocks_rotate_right( + streams, gpu_indexes, gpu_count, block_shift_buffer, + preshifted_radix_ct, i / msg_bits, num_radix_blocks, lwe_size); // create trivial assign for value = 0 cuda_memset_async(block_shift_buffer, 0, (i / msg_bits) * lwe_size_bytes, streams[0], gpu_indexes[0]); j++; } } + cuda_synchronize_stream(streams[0], gpu_indexes[0]); cuda_drop_async(preshifted_buffer, streams[0], gpu_indexes[0]); mem->logical_scalar_shift_buffer->release(streams, gpu_indexes, gpu_count); diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_rotate.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_rotate.cuh index e951d2b2a6..02a0a77a40 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_rotate.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_rotate.cuh @@ -56,9 +56,9 @@ __host__ void host_integer_radix_scalar_rotate_kb_inplace( // one block is responsible to process single lwe ciphertext if (mem->shift_type == LEFT_SHIFT) { // rotate right as the blocks are from LSB to MSB - host_radix_blocks_rotate_right(streams, gpu_indexes, gpu_count, - rotated_buffer, lwe_array, rotations, - num_blocks, big_lwe_size); + host_radix_blocks_rotate_right(streams, gpu_indexes, gpu_count, + rotated_buffer, lwe_array, rotations, + num_blocks, big_lwe_size); cuda_memcpy_async_gpu_to_gpu(lwe_array, rotated_buffer, num_blocks * big_lwe_size_bytes, streams[0], @@ -70,9 +70,9 @@ __host__ void host_integer_radix_scalar_rotate_kb_inplace( auto receiver_blocks = lwe_array; auto giver_blocks = rotated_buffer; - host_radix_blocks_rotate_right(streams, gpu_indexes, gpu_count, - giver_blocks, lwe_array, 1, num_blocks, - big_lwe_size); + host_radix_blocks_rotate_right(streams, gpu_indexes, gpu_count, + giver_blocks, lwe_array, 1, + num_blocks, big_lwe_size); auto lut_bivariate = mem->lut_buffers_bivariate[shift_within_block - 1]; @@ -83,9 +83,9 @@ __host__ void host_integer_radix_scalar_rotate_kb_inplace( } else { // rotate left as the blocks are from LSB to MSB - host_radix_blocks_rotate_left(streams, gpu_indexes, gpu_count, - rotated_buffer, lwe_array, rotations, - num_blocks, big_lwe_size); + host_radix_blocks_rotate_left(streams, gpu_indexes, gpu_count, + rotated_buffer, lwe_array, rotations, + num_blocks, big_lwe_size); cuda_memcpy_async_gpu_to_gpu(lwe_array, rotated_buffer, num_blocks * big_lwe_size_bytes, streams[0], @@ -97,8 +97,9 @@ __host__ void host_integer_radix_scalar_rotate_kb_inplace( auto receiver_blocks = lwe_array; auto giver_blocks = rotated_buffer; - host_radix_blocks_rotate_left(streams, gpu_indexes, gpu_count, giver_blocks, - lwe_array, 1, num_blocks, big_lwe_size); + host_radix_blocks_rotate_left(streams, gpu_indexes, gpu_count, + giver_blocks, lwe_array, 1, num_blocks, + big_lwe_size); auto lut_bivariate = mem->lut_buffers_bivariate[shift_within_block - 1]; diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_shifts.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_shifts.cuh index 348655d1db..6555fad9f1 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_shifts.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_shifts.cuh @@ -53,9 +53,9 @@ __host__ void host_integer_radix_logical_scalar_shift_kb_inplace( if (mem->shift_type == LEFT_SHIFT) { // rotate right as the blocks are from LSB to MSB - host_radix_blocks_rotate_right(streams, gpu_indexes, gpu_count, - rotated_buffer, lwe_array, rotations, - num_blocks, big_lwe_size); + host_radix_blocks_rotate_right(streams, gpu_indexes, gpu_count, + rotated_buffer, lwe_array, rotations, + num_blocks, big_lwe_size); // create trivial assign for value = 0 cuda_memset_async(rotated_buffer, 0, rotations * big_lwe_size_bytes, @@ -83,9 +83,9 @@ __host__ void host_integer_radix_logical_scalar_shift_kb_inplace( } else { // right shift - host_radix_blocks_rotate_left(streams, gpu_indexes, gpu_count, - rotated_buffer, lwe_array, rotations, - num_blocks, big_lwe_size); + host_radix_blocks_rotate_left(streams, gpu_indexes, gpu_count, + rotated_buffer, lwe_array, rotations, + num_blocks, big_lwe_size); // rotate left as the blocks are from LSB to MSB // create trivial assign for value = 0 @@ -156,9 +156,9 @@ __host__ void host_integer_radix_arithmetic_scalar_shift_kb_inplace( Torus *last_block_copy = &padding_block[big_lwe_size]; if (mem->shift_type == RIGHT_SHIFT) { - host_radix_blocks_rotate_left(streams, gpu_indexes, gpu_count, - rotated_buffer, lwe_array, rotations, - num_blocks, big_lwe_size); + host_radix_blocks_rotate_left(streams, gpu_indexes, gpu_count, + rotated_buffer, lwe_array, rotations, + num_blocks, big_lwe_size); cuda_memcpy_async_gpu_to_gpu(lwe_array, rotated_buffer, num_blocks * big_lwe_size_bytes, streams[0], gpu_indexes[0]); @@ -213,7 +213,7 @@ __host__ void host_integer_radix_arithmetic_scalar_shift_kb_inplace( } auto lut_univariate_padding_block = mem->lut_buffers_univariate[num_bits_in_block - 1]; - integer_radix_apply_univariate_lookup_table_kb( + integer_radix_apply_univariate_lookup_table_kb( mem->local_streams_1, gpu_indexes, gpu_count, padding_block, last_block_copy, bsks, ksks, 1, lut_univariate_padding_block); // Replace blocks 'pulled' from the left with the correct padding @@ -227,7 +227,7 @@ __host__ void host_integer_radix_arithmetic_scalar_shift_kb_inplace( if (shift_within_block != 0) { auto lut_univariate_shift_last_block = mem->lut_buffers_univariate[shift_within_block - 1]; - integer_radix_apply_univariate_lookup_table_kb( + integer_radix_apply_univariate_lookup_table_kb( mem->local_streams_2, gpu_indexes, gpu_count, last_block, last_block_copy, bsks, ksks, 1, lut_univariate_shift_last_block); } diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/shift_and_rotate.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/shift_and_rotate.cuh index 801c0f131a..ffd70a75e5 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/shift_and_rotate.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/integer/shift_and_rotate.cuh @@ -88,9 +88,9 @@ __host__ void host_integer_radix_shift_and_rotate_kb_inplace( switch (mem->shift_type) { case LEFT_SHIFT: // rotate right as the blocks are from LSB to MSB - host_radix_blocks_rotate_right(streams, gpu_indexes, gpu_count, - rotated_input, input_bits_b, rotations, - total_nb_bits, big_lwe_size); + host_radix_blocks_rotate_right( + streams, gpu_indexes, gpu_count, rotated_input, input_bits_b, + rotations, total_nb_bits, big_lwe_size); if (mem->is_signed && mem->shift_type == RIGHT_SHIFT) for (int i = 0; i < rotations; i++) @@ -103,9 +103,9 @@ __host__ void host_integer_radix_shift_and_rotate_kb_inplace( break; case RIGHT_SHIFT: // rotate left as the blocks are from LSB to MSB - host_radix_blocks_rotate_left(streams, gpu_indexes, gpu_count, - rotated_input, input_bits_b, rotations, - total_nb_bits, big_lwe_size); + host_radix_blocks_rotate_left( + streams, gpu_indexes, gpu_count, rotated_input, input_bits_b, + rotations, total_nb_bits, big_lwe_size); if (mem->is_signed) for (int i = 0; i < rotations; i++) @@ -119,15 +119,15 @@ __host__ void host_integer_radix_shift_and_rotate_kb_inplace( break; case LEFT_ROTATE: // rotate right as the blocks are from LSB to MSB - host_radix_blocks_rotate_right(streams, gpu_indexes, gpu_count, - rotated_input, input_bits_b, rotations, - total_nb_bits, big_lwe_size); + host_radix_blocks_rotate_right( + streams, gpu_indexes, gpu_count, rotated_input, input_bits_b, + rotations, total_nb_bits, big_lwe_size); break; case RIGHT_ROTATE: // rotate left as the blocks are from LSB to MSB - host_radix_blocks_rotate_left(streams, gpu_indexes, gpu_count, - rotated_input, input_bits_b, rotations, - total_nb_bits, big_lwe_size); + host_radix_blocks_rotate_left( + streams, gpu_indexes, gpu_count, rotated_input, input_bits_b, + rotations, total_nb_bits, big_lwe_size); break; default: PANIC("Unknown operation") @@ -137,20 +137,21 @@ __host__ void host_integer_radix_shift_and_rotate_kb_inplace( // control_bit|b|a cuda_memset_async(mux_inputs, 0, total_nb_bits * big_lwe_size_bytes, streams[0], gpu_indexes[0]); // Do we need this? - pack_bivariate_blocks(streams, gpu_indexes, gpu_count, mux_inputs, - mux_lut->lwe_indexes_out, rotated_input, input_bits_a, - mux_lut->lwe_indexes_in, big_lwe_dimension, 2, - total_nb_bits); + pack_bivariate_blocks(streams, gpu_indexes, gpu_count, mux_inputs, + mux_lut->lwe_indexes_out, rotated_input, + input_bits_a, mux_lut->lwe_indexes_in, + big_lwe_dimension, 2, total_nb_bits); // The shift bit is already properly aligned/positioned for (int i = 0; i < total_nb_bits; i++) - host_addition(streams[0], gpu_indexes[0], mux_inputs + i * big_lwe_size, - mux_inputs + i * big_lwe_size, shift_bit, - mem->params.big_lwe_dimension, 1); + host_addition(streams[0], gpu_indexes[0], + mux_inputs + i * big_lwe_size, + mux_inputs + i * big_lwe_size, shift_bit, + mem->params.big_lwe_dimension, 1); // we have // control_bit|b|a - integer_radix_apply_univariate_lookup_table_kb( + integer_radix_apply_univariate_lookup_table_kb( streams, gpu_indexes, gpu_count, input_bits_a, mux_inputs, bsks, ksks, total_nb_bits, mux_lut); } @@ -179,8 +180,8 @@ __host__ void host_integer_radix_shift_and_rotate_kb_inplace( auto bit_to_add = input_bits_a + i * big_lwe_size; for (int j = 0; j < num_radix_blocks; j++) { - host_addition(streams[0], gpu_indexes[0], block, block, bit_to_add, - big_lwe_dimension, 1); + host_addition(streams[0], gpu_indexes[0], block, block, bit_to_add, + big_lwe_dimension, 1); block += big_lwe_size; bit_to_add += bits_per_block * big_lwe_size; @@ -188,7 +189,7 @@ __host__ void host_integer_radix_shift_and_rotate_kb_inplace( // To give back a clean ciphertext auto cleaning_lut = mem->cleaning_lut; - integer_radix_apply_univariate_lookup_table_kb( + integer_radix_apply_univariate_lookup_table_kb( streams, gpu_indexes, gpu_count, lwe_last_out, lwe_last_out, bsks, ksks, num_radix_blocks, cleaning_lut); } diff --git a/backends/tfhe-cuda-backend/cuda/src/linearalgebra/addition.cu b/backends/tfhe-cuda-backend/cuda/src/linearalgebra/addition.cu index a9d590b93b..17c06326f6 100644 --- a/backends/tfhe-cuda-backend/cuda/src/linearalgebra/addition.cu +++ b/backends/tfhe-cuda-backend/cuda/src/linearalgebra/addition.cu @@ -11,11 +11,11 @@ void cuda_add_lwe_ciphertext_vector_32(void *stream, uint32_t gpu_index, uint32_t input_lwe_dimension, uint32_t input_lwe_ciphertext_count) { - host_addition(static_cast(stream), gpu_index, - static_cast(lwe_array_out), - static_cast(lwe_array_in_1), - static_cast(lwe_array_in_2), input_lwe_dimension, - input_lwe_ciphertext_count); + host_addition(static_cast(stream), gpu_index, + static_cast(lwe_array_out), + static_cast(lwe_array_in_1), + static_cast(lwe_array_in_2), + input_lwe_dimension, input_lwe_ciphertext_count); } /* @@ -51,11 +51,11 @@ void cuda_add_lwe_ciphertext_vector_64(void *stream, uint32_t gpu_index, uint32_t input_lwe_dimension, uint32_t input_lwe_ciphertext_count) { - host_addition(static_cast(stream), gpu_index, - static_cast(lwe_array_out), - static_cast(lwe_array_in_1), - static_cast(lwe_array_in_2), input_lwe_dimension, - input_lwe_ciphertext_count); + host_addition(static_cast(stream), gpu_index, + static_cast(lwe_array_out), + static_cast(lwe_array_in_1), + static_cast(lwe_array_in_2), + input_lwe_dimension, input_lwe_ciphertext_count); } /* * Perform the addition of a u32 input LWE ciphertext vector with a u32 @@ -66,11 +66,12 @@ void cuda_add_lwe_ciphertext_vector_plaintext_vector_32( void *plaintext_array_in, uint32_t input_lwe_dimension, uint32_t input_lwe_ciphertext_count) { - host_addition_plaintext(static_cast(stream), gpu_index, - static_cast(lwe_array_out), - static_cast(lwe_array_in), - static_cast(plaintext_array_in), - input_lwe_dimension, input_lwe_ciphertext_count); + host_addition_plaintext( + static_cast(stream), gpu_index, + static_cast(lwe_array_out), + static_cast(lwe_array_in), + static_cast(plaintext_array_in), input_lwe_dimension, + input_lwe_ciphertext_count); } /* * Perform the addition of a u64 input LWE ciphertext vector with a u64 input @@ -105,9 +106,10 @@ void cuda_add_lwe_ciphertext_vector_plaintext_vector_64( void *plaintext_array_in, uint32_t input_lwe_dimension, uint32_t input_lwe_ciphertext_count) { - host_addition_plaintext(static_cast(stream), gpu_index, - static_cast(lwe_array_out), - static_cast(lwe_array_in), - static_cast(plaintext_array_in), - input_lwe_dimension, input_lwe_ciphertext_count); + host_addition_plaintext( + static_cast(stream), gpu_index, + static_cast(lwe_array_out), + static_cast(lwe_array_in), + static_cast(plaintext_array_in), input_lwe_dimension, + input_lwe_ciphertext_count); } diff --git a/backends/tfhe-cuda-backend/cuda/src/linearalgebra/addition.cuh b/backends/tfhe-cuda-backend/cuda/src/linearalgebra/addition.cuh index 7e7695989f..8e0b627446 100644 --- a/backends/tfhe-cuda-backend/cuda/src/linearalgebra/addition.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/linearalgebra/addition.cuh @@ -43,7 +43,7 @@ host_addition_plaintext(cudaStream_t stream, uint32_t gpu_index, T *output, cuda_memcpy_async_gpu_to_gpu(output, lwe_input, (lwe_dimension + 1) * lwe_ciphertext_count, stream, gpu_index); - plaintext_addition<<>>( + plaintext_addition<<>>( output, lwe_input, plaintext_input, lwe_dimension, num_entries); check_cuda_error(cudaGetLastError()); } @@ -78,7 +78,7 @@ __host__ void host_addition(cudaStream_t stream, uint32_t gpu_index, T *output, dim3 grid(num_blocks, 1, 1); dim3 thds(num_threads, 1, 1); - addition<<>>(output, input_1, input_2, num_entries); + addition<<>>(output, input_1, input_2, num_entries); check_cuda_error(cudaGetLastError()); } @@ -112,7 +112,8 @@ __host__ void host_subtraction(cudaStream_t stream, uint32_t gpu_index, dim3 grid(num_blocks, 1, 1); dim3 thds(num_threads, 1, 1); - subtraction<<>>(output, input_1, input_2, num_entries); + subtraction + <<>>(output, input_1, input_2, num_entries); check_cuda_error(cudaGetLastError()); } @@ -150,7 +151,7 @@ __host__ void host_subtraction_plaintext(cudaStream_t stream, (input_lwe_dimension + 1) * sizeof(T), stream, gpu_index); - radix_body_subtraction_inplace<<>>( + radix_body_subtraction_inplace<<>>( output, plaintext_input, input_lwe_dimension, num_entries); check_cuda_error(cudaGetLastError()); } @@ -176,7 +177,6 @@ __global__ void unchecked_sub_with_correcting_term( } } template - __host__ void host_unchecked_sub_with_correcting_term( cudaStream_t stream, uint32_t gpu_index, T *output, T *input_1, T *input_2, uint32_t input_lwe_dimension, uint32_t input_lwe_ciphertext_count, @@ -193,7 +193,7 @@ __host__ void host_unchecked_sub_with_correcting_term( dim3 grid(num_blocks, 1, 1); dim3 thds(num_threads, 1, 1); - unchecked_sub_with_correcting_term<<>>( + unchecked_sub_with_correcting_term<<>>( output, input_1, input_2, num_entries, lwe_size, message_modulus, carry_modulus, degree); check_cuda_error(cudaGetLastError()); diff --git a/backends/tfhe-cuda-backend/cuda/src/linearalgebra/multiplication.cu b/backends/tfhe-cuda-backend/cuda/src/linearalgebra/multiplication.cu index a64c15378d..1c424b336c 100644 --- a/backends/tfhe-cuda-backend/cuda/src/linearalgebra/multiplication.cu +++ b/backends/tfhe-cuda-backend/cuda/src/linearalgebra/multiplication.cu @@ -9,7 +9,7 @@ void cuda_mult_lwe_ciphertext_vector_cleartext_vector_32( void *cleartext_array_in, uint32_t input_lwe_dimension, uint32_t input_lwe_ciphertext_count) { - host_cleartext_vec_multiplication( + host_cleartext_vec_multiplication( static_cast(stream), gpu_index, static_cast(lwe_array_out), static_cast(lwe_array_in), @@ -49,7 +49,7 @@ void cuda_mult_lwe_ciphertext_vector_cleartext_vector_64( void *cleartext_array_in, uint32_t input_lwe_dimension, uint32_t input_lwe_ciphertext_count) { - host_cleartext_vec_multiplication( + host_cleartext_vec_multiplication( static_cast(stream), gpu_index, static_cast(lwe_array_out), static_cast(lwe_array_in), diff --git a/backends/tfhe-cuda-backend/cuda/src/linearalgebra/multiplication.cuh b/backends/tfhe-cuda-backend/cuda/src/linearalgebra/multiplication.cuh index b9864973a0..9f9d396ed4 100644 --- a/backends/tfhe-cuda-backend/cuda/src/linearalgebra/multiplication.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/linearalgebra/multiplication.cuh @@ -46,7 +46,7 @@ host_cleartext_vec_multiplication(cudaStream_t stream, uint32_t gpu_index, dim3 grid(num_blocks, 1, 1); dim3 thds(num_threads, 1, 1); - cleartext_vec_multiplication<<>>( + cleartext_vec_multiplication<<>>( output, lwe_input, cleartext_input, input_lwe_dimension, num_entries); check_cuda_error(cudaGetLastError()); } @@ -82,7 +82,7 @@ host_cleartext_multiplication(cudaStream_t stream, uint32_t gpu_index, dim3 grid(num_blocks, 1, 1); dim3 thds(num_threads, 1, 1); - cleartext_multiplication<<>>( + cleartext_multiplication<<>>( output, lwe_input, cleartext_input, input_lwe_dimension, num_entries); check_cuda_error(cudaGetLastError()); } diff --git a/backends/tfhe-cuda-backend/cuda/src/linearalgebra/negation.cu b/backends/tfhe-cuda-backend/cuda/src/linearalgebra/negation.cu index b91115fca3..a1465e390f 100644 --- a/backends/tfhe-cuda-backend/cuda/src/linearalgebra/negation.cu +++ b/backends/tfhe-cuda-backend/cuda/src/linearalgebra/negation.cu @@ -10,10 +10,10 @@ void cuda_negate_lwe_ciphertext_vector_32(void *stream, uint32_t gpu_index, uint32_t input_lwe_dimension, uint32_t input_lwe_ciphertext_count) { - host_negation(static_cast(stream), gpu_index, - static_cast(lwe_array_out), - static_cast(lwe_array_in), input_lwe_dimension, - input_lwe_ciphertext_count); + host_negation(static_cast(stream), gpu_index, + static_cast(lwe_array_out), + static_cast(lwe_array_in), + input_lwe_dimension, input_lwe_ciphertext_count); } /* @@ -44,8 +44,8 @@ void cuda_negate_lwe_ciphertext_vector_64(void *stream, uint32_t gpu_index, uint32_t input_lwe_dimension, uint32_t input_lwe_ciphertext_count) { - host_negation(static_cast(stream), gpu_index, - static_cast(lwe_array_out), - static_cast(lwe_array_in), input_lwe_dimension, - input_lwe_ciphertext_count); + host_negation(static_cast(stream), gpu_index, + static_cast(lwe_array_out), + static_cast(lwe_array_in), + input_lwe_dimension, input_lwe_ciphertext_count); } diff --git a/backends/tfhe-cuda-backend/cuda/src/linearalgebra/negation.cuh b/backends/tfhe-cuda-backend/cuda/src/linearalgebra/negation.cuh index b1faeb6156..e8c4ba0a69 100644 --- a/backends/tfhe-cuda-backend/cuda/src/linearalgebra/negation.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/linearalgebra/negation.cuh @@ -37,7 +37,7 @@ __host__ void host_negation(cudaStream_t stream, uint32_t gpu_index, T *output, dim3 grid(num_blocks, 1, 1); dim3 thds(num_threads, 1, 1); - negation<<>>(output, input, num_entries); + negation<<>>(output, input, num_entries); check_cuda_error(cudaGetLastError()); } diff --git a/backends/tfhe-cuda-backend/src/cuda_bind.rs b/backends/tfhe-cuda-backend/src/cuda_bind.rs index 3ce55b8503..0d1e1c12c6 100644 --- a/backends/tfhe-cuda-backend/src/cuda_bind.rs +++ b/backends/tfhe-cuda-backend/src/cuda_bind.rs @@ -323,11 +323,12 @@ extern "C" { mem_ptr: *mut *mut i8, ); - pub fn cuda_negate_integer_radix_ciphertext_64_inplace( + pub fn cuda_negate_integer_radix_ciphertext_64( streams: *const *mut c_void, gpu_indexes: *const u32, gpu_count: u32, - lwe_array: *mut c_void, + lwe_array_out: *mut c_void, + lwe_array_in: *const c_void, lwe_dimension: u32, lwe_ciphertext_count: u32, message_modulus: u32, diff --git a/tfhe/src/core_crypto/gpu/mod.rs b/tfhe/src/core_crypto/gpu/mod.rs index d95cfa2bac..adc6717971 100644 --- a/tfhe/src/core_crypto/gpu/mod.rs +++ b/tfhe/src/core_crypto/gpu/mod.rs @@ -551,19 +551,21 @@ pub unsafe fn negate_lwe_ciphertext_vector_assign_async( /// /// [CudaStreams::synchronize] __must__ be called as soon as synchronization is /// required -pub unsafe fn negate_integer_radix_assign_async( +pub unsafe fn negate_integer_radix_async( streams: &CudaStreams, - lwe_array: &mut CudaVec, + lwe_array_out: &mut CudaVec, + lwe_array_in: &CudaVec, lwe_dimension: LweDimension, num_samples: u32, message_modulus: u32, carry_modulus: u32, ) { - cuda_negate_integer_radix_ciphertext_64_inplace( + cuda_negate_integer_radix_ciphertext_64( streams.ptr.as_ptr(), streams.gpu_indexes.as_ptr(), streams.len() as u32, - lwe_array.as_mut_c_ptr(0), + lwe_array_out.as_mut_c_ptr(0), + lwe_array_in.as_c_ptr(0), lwe_dimension.0 as u32, num_samples, message_modulus, diff --git a/tfhe/src/integer/gpu/server_key/radix/neg.rs b/tfhe/src/integer/gpu/server_key/radix/neg.rs index f3fd1d60c5..6af4269916 100644 --- a/tfhe/src/integer/gpu/server_key/radix/neg.rs +++ b/tfhe/src/integer/gpu/server_key/radix/neg.rs @@ -1,4 +1,4 @@ -use crate::core_crypto::gpu::{negate_integer_radix_assign_async, CudaStreams}; +use crate::core_crypto::gpu::{negate_integer_radix_async, CudaStreams}; use crate::integer::gpu::ciphertext::CudaIntegerRadixCiphertext; use crate::integer::gpu::server_key::CudaServerKey; @@ -58,49 +58,26 @@ impl CudaServerKey { pub unsafe fn unchecked_neg_async( &self, ctxt: &T, - stream: &CudaStreams, + streams: &CudaStreams, ) -> T { - let mut result = ctxt.duplicate_async(stream); - self.unchecked_neg_assign_async(&mut result, stream); - result - } + let mut ciphertext_out = ctxt.duplicate_async(streams); + let lwe_dimension = ctxt.as_ref().d_blocks.lwe_dimension(); + let lwe_ciphertext_count = ctxt.as_ref().d_blocks.lwe_ciphertext_count(); - /// # Safety - /// - /// - `stream` __must__ be synchronized to guarantee computation has finished, and inputs must - /// not be dropped until stream is synchronised - pub unsafe fn unchecked_neg_assign_async( - &self, - ctxt: &mut T, - stream: &CudaStreams, - ) { - let ciphertext = ctxt.as_mut(); - let lwe_dimension = ciphertext.d_blocks.lwe_dimension(); - let lwe_ciphertext_count = ciphertext.d_blocks.lwe_ciphertext_count(); + let info = ctxt.as_ref().info.blocks.first().unwrap(); - let info = ciphertext.info.blocks.first().unwrap(); - - negate_integer_radix_assign_async( - stream, - &mut ciphertext.d_blocks.0.d_vec, + negate_integer_radix_async( + streams, + &mut ciphertext_out.as_mut().d_blocks.0.d_vec, + &ctxt.as_ref().d_blocks.0.d_vec, lwe_dimension, lwe_ciphertext_count.0 as u32, info.message_modulus.0 as u32, info.carry_modulus.0 as u32, ); - ciphertext.info = ciphertext.info.after_neg(); - } - - pub fn unchecked_neg_assign( - &self, - ctxt: &mut T, - stream: &CudaStreams, - ) { - unsafe { - self.unchecked_neg_assign_async(ctxt, stream); - } - stream.synchronize(); + ciphertext_out.as_mut().info = ctxt.as_ref().info.after_neg(); + ciphertext_out } /// Homomorphically computes the opposite of a ciphertext encrypting an integer message. @@ -141,9 +118,9 @@ impl CudaServerKey { /// let dec: u64 = cks.decrypt(&res); /// assert_eq!(modulus - msg, dec); /// ``` - pub fn neg(&self, ctxt: &T, stream: &CudaStreams) -> T { - let mut result = unsafe { ctxt.duplicate_async(stream) }; - self.neg_assign(&mut result, stream); + pub fn neg(&self, ctxt: &T, streams: &CudaStreams) -> T { + let result = unsafe { self.neg_async(ctxt, streams) }; + streams.synchronize(); result } @@ -151,29 +128,23 @@ impl CudaServerKey { /// /// - `stream` __must__ be synchronized to guarantee computation has finished, and inputs must /// not be dropped until stream is synchronised - pub unsafe fn neg_assign_async( + pub unsafe fn neg_async( &self, - ctxt: &mut T, - stream: &CudaStreams, - ) { + ctxt: &T, + streams: &CudaStreams, + ) -> T { let mut tmp_ctxt; let ct = if ctxt.block_carries_are_empty() { ctxt } else { - tmp_ctxt = ctxt.duplicate_async(stream); - self.full_propagate_assign_async(&mut tmp_ctxt, stream); + tmp_ctxt = ctxt.duplicate_async(streams); + self.full_propagate_assign_async(&mut tmp_ctxt, streams); &mut tmp_ctxt }; - self.unchecked_neg_assign_async(ct, stream); - let _carry = self.propagate_single_carry_assign_async(ct, stream); - } - - pub fn neg_assign(&self, ctxt: &mut T, stream: &CudaStreams) { - unsafe { - self.neg_assign_async(ctxt, stream); - } - stream.synchronize(); + let mut res = self.unchecked_neg_async(ct, streams); + let _carry = self.propagate_single_carry_assign_async(&mut res, streams); + res } }