From 6bb0bf789602c274c4b350e9d5e52c0c0b6617f0 Mon Sep 17 00:00:00 2001 From: Agnes Leroy Date: Fri, 13 Dec 2024 16:23:58 +0100 Subject: [PATCH] chore(gpu): run pbs in parallel in difference_check --- .../cuda/include/integer/integer_utilities.h | 13 ++++-------- .../cuda/src/integer/comparison.cuh | 18 ++++++++--------- .../cuda/src/integer/scalar_comparison.cuh | 20 +++++++++++-------- 3 files changed, 25 insertions(+), 26 deletions(-) diff --git a/backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h b/backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h index fd9f2c46eb..bb0f9c1d74 100644 --- a/backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h +++ b/backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h @@ -3273,8 +3273,7 @@ template struct int_comparison_diff_buffer { int_radix_params params; COMPARISON_TYPE op; - Torus *tmp_packed_left; - Torus *tmp_packed_right; + Torus *tmp_packed; std::function operator_f; @@ -3311,11 +3310,8 @@ template struct int_comparison_diff_buffer { Torus big_size = (params.big_lwe_dimension + 1) * sizeof(Torus); - tmp_packed_left = (Torus *)cuda_malloc_async( - big_size * (num_radix_blocks / 2), streams[0], gpu_indexes[0]); - - tmp_packed_right = (Torus *)cuda_malloc_async( - big_size * (num_radix_blocks / 2), streams[0], gpu_indexes[0]); + tmp_packed = (Torus *)cuda_malloc_async(big_size * num_radix_blocks, + streams[0], gpu_indexes[0]); tree_buffer = new int_tree_sign_reduction_buffer( streams, gpu_indexes, gpu_count, operator_f, params, num_radix_blocks, @@ -3338,8 +3334,7 @@ template struct int_comparison_diff_buffer { reduce_signs_lut->release(streams, gpu_indexes, gpu_count); delete reduce_signs_lut; - cuda_drop_async(tmp_packed_left, streams[0], gpu_indexes[0]); - cuda_drop_async(tmp_packed_right, streams[0], gpu_indexes[0]); + cuda_drop_async(tmp_packed, streams[0], gpu_indexes[0]); cuda_drop_async(tmp_signs_a, streams[0], gpu_indexes[0]); cuda_drop_async(tmp_signs_b, streams[0], gpu_indexes[0]); } diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/comparison.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/comparison.cuh index 80205eeff3..2535aa26d1 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/comparison.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/integer/comparison.cuh @@ -490,8 +490,9 @@ __host__ void host_integer_radix_difference_check_kb( if (carry_modulus >= message_modulus) { // Packing is possible // Pack inputs - Torus *packed_left = diff_buffer->tmp_packed_left; - Torus *packed_right = diff_buffer->tmp_packed_right; + Torus *packed_left = diff_buffer->tmp_packed; + Torus *packed_right = + diff_buffer->tmp_packed + num_radix_blocks / 2 * big_lwe_size; // In case the ciphertext is signed, the sign block and the one before it // are handled separately if (mem_ptr->is_signed) { @@ -510,10 +511,7 @@ __host__ void host_integer_radix_difference_check_kb( auto identity_lut = mem_ptr->identity_lut; integer_radix_apply_univariate_lookup_table_kb( streams, gpu_indexes, gpu_count, packed_left, packed_left, bsks, ksks, - packed_num_radix_blocks, identity_lut); - integer_radix_apply_univariate_lookup_table_kb( - streams, gpu_indexes, gpu_count, packed_right, packed_right, bsks, ksks, - packed_num_radix_blocks, identity_lut); + 2 * packed_num_radix_blocks, identity_lut); lhs = packed_left; rhs = packed_right; @@ -542,11 +540,13 @@ __host__ void host_integer_radix_difference_check_kb( // Compare the last block before the sign block separately auto identity_lut = mem_ptr->identity_lut; + Torus *packed_left = diff_buffer->tmp_packed; + Torus *packed_right = + diff_buffer->tmp_packed + num_radix_blocks / 2 * big_lwe_size; Torus *last_left_block_before_sign_block = - diff_buffer->tmp_packed_left + packed_num_radix_blocks * big_lwe_size; + packed_left + packed_num_radix_blocks * big_lwe_size; Torus *last_right_block_before_sign_block = - diff_buffer->tmp_packed_right + - packed_num_radix_blocks * big_lwe_size; + packed_right + packed_num_radix_blocks * big_lwe_size; integer_radix_apply_univariate_lookup_table_kb( streams, gpu_indexes, gpu_count, last_left_block_before_sign_block, lwe_array_left + (num_radix_blocks - 2) * big_lwe_size, bsks, ksks, 1, diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_comparison.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_comparison.cuh index 10301b5055..4b79a24cec 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_comparison.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_comparison.cuh @@ -141,8 +141,9 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb( ////////////// // lsb - Torus *lhs = diff_buffer->tmp_packed_left; - Torus *rhs = diff_buffer->tmp_packed_right; + Torus *lhs = diff_buffer->tmp_packed; + Torus *rhs = + diff_buffer->tmp_packed + total_num_radix_blocks / 2 * big_lwe_size; pack_blocks(lsb_streams[0], gpu_indexes[0], lhs, lwe_array_in, big_lwe_dimension, num_lsb_radix_blocks, @@ -210,8 +211,9 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb( uint32_t num_lsb_radix_blocks = total_num_radix_blocks; uint32_t num_scalar_blocks = total_num_scalar_blocks; - Torus *lhs = diff_buffer->tmp_packed_left; - Torus *rhs = diff_buffer->tmp_packed_right; + Torus *lhs = diff_buffer->tmp_packed; + Torus *rhs = + diff_buffer->tmp_packed + total_num_radix_blocks / 2 * big_lwe_size; pack_blocks(streams[0], gpu_indexes[0], lhs, lwe_array_in, big_lwe_dimension, num_lsb_radix_blocks, @@ -358,8 +360,9 @@ __host__ void integer_radix_signed_scalar_difference_check_kb( ////////////// // lsb - Torus *lhs = diff_buffer->tmp_packed_left; - Torus *rhs = diff_buffer->tmp_packed_right; + Torus *lhs = diff_buffer->tmp_packed; + Torus *rhs = + diff_buffer->tmp_packed + total_num_radix_blocks / 2 * big_lwe_size; pack_blocks(lsb_streams[0], gpu_indexes[0], lhs, lwe_array_in, big_lwe_dimension, num_lsb_radix_blocks, @@ -458,8 +461,9 @@ __host__ void integer_radix_signed_scalar_difference_check_kb( auto lwe_array_ct_out = mem_ptr->tmp_lwe_array_out; auto lwe_array_sign_out = lwe_array_ct_out + (num_lsb_radix_blocks / 2) * big_lwe_size; - Torus *lhs = diff_buffer->tmp_packed_left; - Torus *rhs = diff_buffer->tmp_packed_right; + Torus *lhs = diff_buffer->tmp_packed; + Torus *rhs = + diff_buffer->tmp_packed + total_num_radix_blocks / 2 * big_lwe_size; pack_blocks(lsb_streams[0], gpu_indexes[0], lhs, lwe_array_in, big_lwe_dimension, num_lsb_radix_blocks - 1,