From d6e45858c1bd2d52d0ffbaaa3f908b8452dc53dd Mon Sep 17 00:00:00 2001
From: Agnes Leroy
Date: Wed, 18 Dec 2024 13:37:51 +0100
Subject: [PATCH] chore(gpu): rework single carry prop to avoid using local streams

---
 .../cuda/include/integer/integer_utilities.h |  88 +++++++-------
 .../cuda/src/integer/integer.cuh             | 107 ++++++++----------
 2 files changed, 89 insertions(+), 106 deletions(-)

diff --git a/backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h b/backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h
index 3ad91ede25..6adede4aeb 100644
--- a/backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h
@@ -1539,10 +1539,12 @@ template <typename Torus> struct int_prop_simu_group_carries_memory {
     cuda_memset_async(grouping_pgns, 0, num_groups * big_lwe_size_bytes,
                       streams[0], gpu_indexes[0]);
 
-    prepared_blocks = (Torus *)cuda_malloc_async(
-        num_radix_blocks * big_lwe_size_bytes, streams[0], gpu_indexes[0]);
-    cuda_memset_async(prepared_blocks, 0, num_radix_blocks * big_lwe_size_bytes,
-                      streams[0], gpu_indexes[0]);
+    prepared_blocks =
+        (Torus *)cuda_malloc_async((num_radix_blocks + 1) * big_lwe_size_bytes,
+                                   streams[0], gpu_indexes[0]);
+    cuda_memset_async(prepared_blocks, 0,
+                      (num_radix_blocks + 1) * big_lwe_size_bytes, streams[0],
+                      gpu_indexes[0]);
 
     resolved_carries = (Torus *)cuda_malloc_async(
         (num_groups + 1) * big_lwe_size_bytes, streams[0], gpu_indexes[0]);
@@ -1772,7 +1774,6 @@ template <typename Torus> struct int_sc_prop_memory {
   uint32_t num_many_lut;
   uint32_t lut_stride;
 
-  uint32_t group_size;
   uint32_t num_groups;
   Torus *output_flag;
   Torus *last_lhs;
@@ -1780,8 +1781,6 @@ template <typename Torus> struct int_sc_prop_memory {
   int_radix_lut<Torus> *lut_message_extract;
   int_radix_lut<Torus> *lut_overflow_flag_prep;
-  int_radix_lut<Torus> *lut_overflow_flag_last;
-  int_radix_lut<Torus> *lut_carry_flag_last;
 
   int_shifted_blocks_and_states_memory<Torus> *shifted_blocks_state_mem;
   int_prop_simu_group_carries_memory<Torus> *prop_simu_group_carries_mem;
 
@@ -1791,8 +1790,6 @@ template <typename Torus> struct int_sc_prop_memory {
   uint32_t requested_flag;
 
   uint32_t active_gpu_count;
-  cudaStream_t *sub_streams_1;
-  cudaStream_t *sub_streams_2;
 
   cudaEvent_t *incoming_events1;
   cudaEvent_t *incoming_events2;
@@ -1817,7 +1814,6 @@ template <typename Torus> struct int_sc_prop_memory {
     uint32_t block_modulus = message_modulus * carry_modulus;
     uint32_t num_bits_in_block = std::log2(block_modulus);
     uint32_t grouping_size = num_bits_in_block;
-    group_size = grouping_size;
     num_groups = (num_radix_blocks + grouping_size - 1) / grouping_size;
 
     num_many_lut = 2; // many luts apply 2 luts
@@ -1834,8 +1830,8 @@ template <typename Torus> struct int_sc_prop_memory {
     // Step 3 elements
     lut_message_extract =
-        new int_radix_lut<Torus>(streams, gpu_indexes, gpu_count, params, 1,
-                                 num_radix_blocks, allocate_gpu_memory);
+        new int_radix_lut<Torus>(streams, gpu_indexes, gpu_count, params, 2,
+                                 num_radix_blocks + 1, allocate_gpu_memory);
     // lut for the first block in the first grouping
     auto f_message_extract = [message_modulus](Torus block) -> Torus {
       return (block >> 1) % message_modulus;
     };
@@ -1851,8 +1847,9 @@ template <typename Torus> struct int_sc_prop_memory {
 
     // This store a single block that with be used to store the overflow or
    // carry results
-    output_flag = (Torus *)cuda_malloc_async(big_lwe_size_bytes, streams[0],
-                                             gpu_indexes[0]);
+    output_flag =
+        (Torus *)cuda_malloc_async(big_lwe_size_bytes * (num_radix_blocks + 1),
+                                   streams[0], gpu_indexes[0]);
 
     cuda_memset_async(output_flag, 0, big_lwe_size_bytes, streams[0],
                       gpu_indexes[0]);
@@ -1911,9 +1908,6 @@ template <typename Torus> struct int_sc_prop_memory {
     // It seems that this lut could be apply together with the other one but for
     // now we won't do it
     if (requested_flag == outputFlag::FLAG_OVERFLOW) { // Overflow case
-      lut_overflow_flag_last = new int_radix_lut<Torus>(
-          streams, gpu_indexes, gpu_count, params, 1, 1, allocate_gpu_memory);
-
       auto f_overflow_last = [num_radix_blocks,
                              requested_flag_in](Torus block) -> Torus {
         uint32_t position = (num_radix_blocks == 1 &&
@@ -1929,39 +1923,57 @@ template <typename Torus> struct int_sc_prop_memory {
           return does_overflow_if_carry_is_0;
         }
       };
-      auto overflow_flag_last = lut_overflow_flag_last->get_lut(0, 0);
+      auto overflow_flag_last = lut_message_extract->get_lut(0, 1);
 
       generate_device_accumulator<Torus>(
          streams[0], gpu_indexes[0], overflow_flag_last, glwe_dimension,
          polynomial_size, message_modulus, carry_modulus, f_overflow_last);
-      lut_overflow_flag_last->broadcast_lut(streams, gpu_indexes, 0);
+
+      Torus *h_lut_indexes =
+          (Torus *)malloc((num_radix_blocks + 1) * sizeof(Torus));
+      for (int index = 0; index < num_radix_blocks + 1; index++) {
+        if (index < num_radix_blocks) {
+          h_lut_indexes[index] = 0;
+        } else {
+          h_lut_indexes[index] = 1;
+        }
+      }
+      cuda_memcpy_async_to_gpu(
+          lut_message_extract->get_lut_indexes(0, 0), h_lut_indexes,
+          (num_radix_blocks + 1) * sizeof(Torus), streams[0], gpu_indexes[0]);
+
+      lut_message_extract->broadcast_lut(streams, gpu_indexes, 0);
+      free(h_lut_indexes);
     }
 
     if (requested_flag == outputFlag::FLAG_CARRY) { // Carry case
-      lut_carry_flag_last = new int_radix_lut<Torus>(
-          streams, gpu_indexes, gpu_count, params, 1, 1, allocate_gpu_memory);
 
       auto f_carry_last = [](Torus block) -> Torus { return ((block >> 2) & 1); };
-      auto carry_flag_last = lut_carry_flag_last->get_lut(0, 0);
+      auto carry_flag_last = lut_message_extract->get_lut(0, 1);
 
       generate_device_accumulator<Torus>(
          streams[0], gpu_indexes[0], carry_flag_last, glwe_dimension,
          polynomial_size, message_modulus, carry_modulus, f_carry_last);
-      lut_carry_flag_last->broadcast_lut(streams, gpu_indexes, 0);
+
+      Torus *h_lut_indexes =
+          (Torus *)malloc((num_radix_blocks + 1) * sizeof(Torus));
+      for (int index = 0; index < num_radix_blocks + 1; index++) {
+        if (index < num_radix_blocks) {
+          h_lut_indexes[index] = 0;
+        } else {
+          h_lut_indexes[index] = 1;
+        }
+      }
+      cuda_memcpy_async_to_gpu(
+          lut_message_extract->get_lut_indexes(0, 0), h_lut_indexes,
+          (num_radix_blocks + 1) * sizeof(Torus), streams[0], gpu_indexes[0]);
+
+      lut_message_extract->broadcast_lut(streams, gpu_indexes, 0);
+      free(h_lut_indexes);
     }
 
     active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
-    sub_streams_1 =
-        (cudaStream_t *)malloc(active_gpu_count * sizeof(cudaStream_t));
-    sub_streams_2 =
-        (cudaStream_t *)malloc(active_gpu_count * sizeof(cudaStream_t));
-    for (uint j = 0; j < active_gpu_count; j++) {
-      sub_streams_1[j] = cuda_create_stream(gpu_indexes[j]);
-      sub_streams_2[j] = cuda_create_stream(gpu_indexes[j]);
-    }
 
     incoming_events1 =
         (cudaEvent_t *)malloc(active_gpu_count * sizeof(cudaEvent_t));
@@ -1997,24 +2009,10 @@ template <typename Torus> struct int_sc_prop_memory {
     if (requested_flag == outputFlag::FLAG_OVERFLOW) { // In case of overflow
       lut_overflow_flag_prep->release(streams, gpu_indexes, gpu_count);
-      lut_overflow_flag_last->release(streams, gpu_indexes, gpu_count);
       delete lut_overflow_flag_prep;
-      delete lut_overflow_flag_last;
       cuda_drop_async(last_lhs, streams[0], gpu_indexes[0]);
       cuda_drop_async(last_rhs, streams[0], gpu_indexes[0]);
     }
-    if (requested_flag == outputFlag::FLAG_CARRY) { // In case of carry
-      lut_carry_flag_last->release(streams, gpu_indexes, gpu_count);
-      delete lut_carry_flag_last;
-    }
-
-    // release sub streams
-    for (uint i = 0; i < active_gpu_count; i++) {
-      cuda_destroy_stream(sub_streams_1[i], gpu_indexes[i]);
-      cuda_destroy_stream(sub_streams_2[i], gpu_indexes[i]);
-    }
-    free(sub_streams_1);
-    free(sub_streams_2);
 
     // release events
     for (uint j = 0; j < active_gpu_count; j++) {
diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
index 45348fac20..02b80a5739 100644
--- a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
@@ -1624,13 +1624,12 @@ void host_propagate_single_carry(cudaStream_t const *streams,
   auto params = mem->params;
   auto glwe_dimension = params.glwe_dimension;
   auto polynomial_size = params.polynomial_size;
-  auto message_modulus = params.message_modulus;
-  auto carry_modulus = params.carry_modulus;
   uint32_t big_lwe_size = glwe_dimension * polynomial_size + 1;
   auto big_lwe_size_bytes = big_lwe_size * sizeof(Torus);
   auto big_lwe_dimension = big_lwe_size - 1; // For host addition
   auto lut_stride = mem->lut_stride;
   auto num_many_lut = mem->num_many_lut;
+  auto output_flag = mem->output_flag + big_lwe_size * num_radix_blocks;
   if (requested_flag == outputFlag::FLAG_OVERFLOW)
     PANIC("Cuda error: single carry propagation is not supported for overflow, "
           "try using add_and_propagate_single_carry")
@@ -1647,7 +1646,7 @@ void host_propagate_single_carry(cudaStream_t const *streams,
 
   if (requested_flag == outputFlag::FLAG_CARRY) {
     cuda_memcpy_async_gpu_to_gpu(
-        mem->output_flag, block_states + (num_radix_blocks - 1) * big_lwe_size,
+        output_flag, block_states + (num_radix_blocks - 1) * big_lwe_size,
         big_lwe_size_bytes, streams[0], gpu_indexes[0]);
   }
   // Step 2
@@ -1667,45 +1666,40 @@ void host_propagate_single_carry(cudaStream_t const *streams,
 
   if (requested_flag == outputFlag::FLAG_OVERFLOW ||
       requested_flag == outputFlag::FLAG_CARRY) {
-    host_addition(streams[0], gpu_indexes[0], mem->output_flag,
-                  mem->output_flag,
+    host_addition(streams[0], gpu_indexes[0], output_flag, output_flag,
                   mem->prop_simu_group_carries_mem->simulators +
                       (num_radix_blocks - 1) * big_lwe_size,
                   big_lwe_dimension, 1);
   }
-  cuda_synchronize_stream(streams[0], gpu_indexes[0]);
-
-  // Step 3
-  // Add carries and cleanup OutputFlag::None
   host_radix_sum_in_groups(
-      mem->sub_streams_1[0], gpu_indexes[0], prepared_blocks, prepared_blocks,
+      streams[0], gpu_indexes[0], prepared_blocks, prepared_blocks,
       mem->prop_simu_group_carries_mem->resolved_carries, num_radix_blocks,
       big_lwe_size, group_size);
-
-  auto message_extract = mem->lut_message_extract;
-  integer_radix_apply_univariate_lookup_table_kb(
-      mem->sub_streams_1, gpu_indexes, gpu_count, lwe_array, prepared_blocks,
-      bsks, ksks, num_radix_blocks, message_extract);
-
   if (requested_flag == outputFlag::FLAG_CARRY) {
-    host_addition(mem->sub_streams_2[0], gpu_indexes[0],
-                  mem->output_flag, mem->output_flag,
+    host_addition(streams[0], gpu_indexes[0], output_flag, output_flag,
                   mem->prop_simu_group_carries_mem->resolved_carries +
                       (mem->num_groups - 1) * big_lwe_size,
                   big_lwe_dimension, 1);
+    cuda_memcpy_async_gpu_to_gpu(
+        prepared_blocks + num_radix_blocks * big_lwe_size, output_flag,
+        big_lwe_size_bytes, streams[0], gpu_indexes[0]);
 
     integer_radix_apply_univariate_lookup_table_kb(
-        mem->sub_streams_2, gpu_indexes, gpu_count, mem->output_flag,
-        mem->output_flag, bsks, ksks, 1, mem->lut_carry_flag_last);
+        streams, gpu_indexes, gpu_count, mem->output_flag, prepared_blocks,
+        bsks, ksks, num_radix_blocks + 1, mem->lut_message_extract);
 
-    cuda_memcpy_async_gpu_to_gpu(carry_out, mem->output_flag,
-                                 big_lwe_size_bytes, mem->sub_streams_2[0],
-                                 gpu_indexes[0]);
-  }
-  for (int j = 0; j < mem->active_gpu_count; j++) {
-    cuda_synchronize_stream(mem->sub_streams_1[j], gpu_indexes[j]);
-    cuda_synchronize_stream(mem->sub_streams_2[j], gpu_indexes[j]);
+    cuda_memcpy_async_gpu_to_gpu(lwe_array, mem->output_flag,
+                                 big_lwe_size_bytes * num_radix_blocks,
+                                 streams[0], gpu_indexes[0]);
+    cuda_memcpy_async_gpu_to_gpu(
+        carry_out, mem->output_flag + num_radix_blocks * big_lwe_size,
+        big_lwe_size_bytes, streams[0], gpu_indexes[0]);
+  } else {
+    auto message_extract = mem->lut_message_extract;
+    integer_radix_apply_univariate_lookup_table_kb(
+        streams, gpu_indexes, gpu_count, lwe_array, prepared_blocks, bsks, ksks,
+        num_radix_blocks, message_extract);
   }
 }
 
@@ -1721,13 +1715,12 @@ void host_add_and_propagate_single_carry(
   auto params = mem->params;
   auto glwe_dimension = params.glwe_dimension;
   auto polynomial_size = params.polynomial_size;
-  auto message_modulus = params.message_modulus;
-  auto carry_modulus = params.carry_modulus;
   uint32_t big_lwe_size = glwe_dimension * polynomial_size + 1;
   auto big_lwe_size_bytes = big_lwe_size * sizeof(Torus);
   auto big_lwe_dimension = big_lwe_size - 1; // For host addition
   auto lut_stride = mem->lut_stride;
   auto num_many_lut = mem->num_many_lut;
+  auto output_flag = mem->output_flag + big_lwe_size * num_radix_blocks;
 
   if (requested_flag == outputFlag::FLAG_OVERFLOW) {
     cuda_memcpy_async_gpu_to_gpu(
@@ -1754,12 +1747,12 @@ void host_add_and_propagate_single_carry(
   if (requested_flag == outputFlag::FLAG_OVERFLOW) {
     auto lut_overflow_prep = mem->lut_overflow_flag_prep;
     integer_radix_apply_bivariate_lookup_table_kb(
-        streams, gpu_indexes, gpu_count, mem->output_flag, mem->last_lhs,
+        streams, gpu_indexes, gpu_count, output_flag, mem->last_lhs,
         mem->last_rhs, bsks, ksks, 1, lut_overflow_prep,
         lut_overflow_prep->params.message_modulus);
   } else if (requested_flag == outputFlag::FLAG_CARRY) {
     cuda_memcpy_async_gpu_to_gpu(
-        mem->output_flag, block_states + (num_radix_blocks - 1) * big_lwe_size,
+        output_flag, block_states + (num_radix_blocks - 1) * big_lwe_size,
         big_lwe_size_bytes, streams[0], gpu_indexes[0]);
   }
@@ -1780,58 +1773,50 @@ void host_add_and_propagate_single_carry(
 
   if (requested_flag == outputFlag::FLAG_OVERFLOW ||
       requested_flag == outputFlag::FLAG_CARRY) {
-    host_addition(streams[0], gpu_indexes[0], mem->output_flag,
-                  mem->output_flag,
+    host_addition(streams[0], gpu_indexes[0], output_flag, output_flag,
                   mem->prop_simu_group_carries_mem->simulators +
                       (num_radix_blocks - 1) * big_lwe_size,
                   big_lwe_dimension, 1);
   }
-  cuda_synchronize_stream(streams[0], gpu_indexes[0]);
 
   // Step 3
   // Add carries and cleanup OutputFlag::None
   host_radix_sum_in_groups(
-      mem->sub_streams_1[0], gpu_indexes[0], prepared_blocks, prepared_blocks,
+      streams[0], gpu_indexes[0], prepared_blocks, prepared_blocks,
       mem->prop_simu_group_carries_mem->resolved_carries, num_radix_blocks,
       big_lwe_size, group_size);
 
-  auto message_extract = mem->lut_message_extract;
-  integer_radix_apply_univariate_lookup_table_kb(
-      mem->sub_streams_1, gpu_indexes, gpu_count, lhs_array, prepared_blocks,
-      bsks, ksks, num_radix_blocks, message_extract);
-
   if (requested_flag == outputFlag::FLAG_OVERFLOW ||
       requested_flag == outputFlag::FLAG_CARRY) {
     if (num_radix_blocks == 1 && requested_flag == outputFlag::FLAG_OVERFLOW &&
         uses_carry == 1) {
-      host_addition(mem->sub_streams_2[0], gpu_indexes[0],
-                    mem->output_flag, mem->output_flag, input_carries,
-                    big_lwe_dimension, 1);
+      host_addition(streams[0], gpu_indexes[0], output_flag, output_flag,
+                    input_carries, big_lwe_dimension, 1);
     } else {
-      host_addition(mem->sub_streams_2[0], gpu_indexes[0],
-                    mem->output_flag, mem->output_flag,
+      host_addition(streams[0], gpu_indexes[0], output_flag, output_flag,
                     mem->prop_simu_group_carries_mem->resolved_carries +
                         (mem->num_groups - 1) * big_lwe_size,
                     big_lwe_dimension, 1);
     }
-    if (requested_flag == outputFlag::FLAG_OVERFLOW) {
-      integer_radix_apply_univariate_lookup_table_kb(
-          mem->sub_streams_2, gpu_indexes, gpu_count, mem->output_flag,
-          mem->output_flag, bsks, ksks, 1, mem->lut_overflow_flag_last);
-    } else {
-      integer_radix_apply_univariate_lookup_table_kb(
-          mem->sub_streams_2, gpu_indexes, gpu_count, mem->output_flag,
-          mem->output_flag, bsks, ksks, 1, mem->lut_carry_flag_last);
-    }
-    cuda_memcpy_async_gpu_to_gpu(carry_out, mem->output_flag,
-                                 big_lwe_size_bytes, mem->sub_streams_2[0],
-                                 gpu_indexes[0]);
-  }
-  for (int j = 0; j < mem->active_gpu_count; j++) {
-    cuda_synchronize_stream(mem->sub_streams_1[j], gpu_indexes[j]);
-    cuda_synchronize_stream(mem->sub_streams_2[j], gpu_indexes[j]);
+    cuda_memcpy_async_gpu_to_gpu(
+        prepared_blocks + num_radix_blocks * big_lwe_size, output_flag,
+        big_lwe_size_bytes, streams[0], gpu_indexes[0]);
+    integer_radix_apply_univariate_lookup_table_kb(
+        streams, gpu_indexes, gpu_count, mem->output_flag, prepared_blocks,
+        bsks, ksks, num_radix_blocks + 1, mem->lut_message_extract);
+
+    cuda_memcpy_async_gpu_to_gpu(lhs_array, mem->output_flag,
+                                 big_lwe_size_bytes * num_radix_blocks,
+                                 streams[0], gpu_indexes[0]);
+    cuda_memcpy_async_gpu_to_gpu(
+        carry_out, mem->output_flag + num_radix_blocks * big_lwe_size,
+        big_lwe_size_bytes, streams[0], gpu_indexes[0]);
+  } else {
+    integer_radix_apply_univariate_lookup_table_kb(
+        streams, gpu_indexes, gpu_count, lhs_array, prepared_blocks, bsks, ksks,
+        num_radix_blocks, mem->lut_message_extract);
  }
 }
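---

Background on the change, for review: instead of running lut_message_extract on sub_streams_1 and a dedicated one-block lut_overflow_flag_last / lut_carry_flag_last on sub_streams_2, the flag ciphertext is appended to prepared_blocks as block num_radix_blocks, and lut_message_extract now carries two accumulators plus a LUT-index vector: indexes 0..num_radix_blocks - 1 select LUT 0 (message extract) and index num_radix_blocks selects LUT 1 (overflow/carry extract). A single integer_radix_apply_univariate_lookup_table_kb call over num_radix_blocks + 1 blocks on the main stream then replaces the two concurrent applies and the synchronization around them. The following standalone C++ sketch illustrates only the routing idea (plain host code with toy values, not the backend API; f_message and f_carry stand in for the accumulators built with generate_device_accumulator):

// lut_routing_sketch.cpp - per-block LUT selection as used by the patch.
#include <cstdint>
#include <iostream>
#include <vector>

int main() {
  const uint32_t num_radix_blocks = 4;
  const uint32_t message_modulus = 4;

  // Stand-ins for the two accumulators of lut_message_extract:
  // LUT 0 extracts the message from a shifted block, LUT 1 the carry bit.
  auto f_message = [=](uint64_t block) { return (block >> 1) % message_modulus; };
  auto f_carry = [](uint64_t block) { return (block >> 2) & 1; };

  // Same construction as h_lut_indexes in the patch: blocks 0..n-1 use
  // LUT 0, the flag block appended at index n uses LUT 1.
  std::vector<uint64_t> lut_indexes(num_radix_blocks + 1);
  for (uint32_t i = 0; i < num_radix_blocks + 1; i++)
    lut_indexes[i] = (i < num_radix_blocks) ? 0 : 1;

  // "prepared_blocks" with the flag block appended at the end (toy values).
  std::vector<uint64_t> prepared_blocks = {5, 6, 7, 4, 4};

  // One pass over num_radix_blocks + 1 blocks applies the right LUT to
  // each block, which is what the single apply call does on the GPU.
  for (uint32_t i = 0; i < prepared_blocks.size(); i++) {
    uint64_t out = (lut_indexes[i] == 0) ? f_message(prepared_blocks[i])
                                         : f_carry(prepared_blocks[i]);
    std::cout << "block " << i << " -> " << out << "\n";
  }
  return 0;
}

Built with, e.g., g++ -std=c++17 lut_routing_sketch.cpp, the sketch prints the cleaned message blocks followed by the extracted flag bit, mirroring how the patch copies blocks 0..num_radix_blocks - 1 of mem->output_flag into lwe_array / lhs_array and block num_radix_blocks into carry_out.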