chore(gpu): rework single carry prop to avoid using local streams
agnesLeroy committed Dec 19, 2024
1 parent ae832c1 commit d6e4585
Showing 2 changed files with 89 additions and 106 deletions.
88 changes: 43 additions & 45 deletions backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h
@@ -1539,10 +1539,12 @@ template <typename Torus> struct int_prop_simu_group_carries_memory {
cuda_memset_async(grouping_pgns, 0, num_groups * big_lwe_size_bytes,
streams[0], gpu_indexes[0]);

- prepared_blocks = (Torus *)cuda_malloc_async(
- num_radix_blocks * big_lwe_size_bytes, streams[0], gpu_indexes[0]);
- cuda_memset_async(prepared_blocks, 0, num_radix_blocks * big_lwe_size_bytes,
- streams[0], gpu_indexes[0]);
+ prepared_blocks =
+ (Torus *)cuda_malloc_async((num_radix_blocks + 1) * big_lwe_size_bytes,
+ streams[0], gpu_indexes[0]);
+ cuda_memset_async(prepared_blocks, 0,
+ (num_radix_blocks + 1) * big_lwe_size_bytes, streams[0],
+ gpu_indexes[0]);

resolved_carries = (Torus *)cuda_malloc_async(
(num_groups + 1) * big_lwe_size_bytes, streams[0], gpu_indexes[0]);
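Aside: the `+ 1` in the `prepared_blocks` allocation above (mirrored by `output_flag` below) reserves one extra LWE slot past the radix blocks; the propagation now appends the carry/overflow flag there so a single LUT pass can cover every block. A cleartext sketch of the layout, with illustrative sizes and `Torus` assumed to be `uint64_t` (not code from this commit):

```cpp
#include <cstddef>
#include <cstdint>
#include <vector>

using Torus = uint64_t;

int main() {
  // Illustrative parameters; the real values come from the crypto parameters.
  const size_t glwe_dimension = 1, polynomial_size = 2048;
  const size_t num_radix_blocks = 8;
  const size_t big_lwe_size = glwe_dimension * polynomial_size + 1;

  // num_radix_blocks + 1 LWE ciphertexts: slots 0..num_radix_blocks-1 hold
  // the radix blocks, the final slot holds the carry/overflow flag block.
  std::vector<Torus> prepared_blocks((num_radix_blocks + 1) * big_lwe_size, 0);
  Torus *flag_slot = prepared_blocks.data() + num_radix_blocks * big_lwe_size;
  (void)flag_slot; // the device code addresses the flag block the same way
  return 0;
}
```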
@@ -1772,16 +1774,13 @@ template <typename Torus> struct int_sc_prop_memory {
uint32_t num_many_lut;
uint32_t lut_stride;

- uint32_t group_size;
uint32_t num_groups;
Torus *output_flag;
Torus *last_lhs;
Torus *last_rhs;
int_radix_lut<Torus> *lut_message_extract;

int_radix_lut<Torus> *lut_overflow_flag_prep;
- int_radix_lut<Torus> *lut_overflow_flag_last;
- int_radix_lut<Torus> *lut_carry_flag_last;

int_shifted_blocks_and_states_memory<Torus> *shifted_blocks_state_mem;
int_prop_simu_group_carries_memory<Torus> *prop_simu_group_carries_mem;
@@ -1791,8 +1790,6 @@ template <typename Torus> struct int_sc_prop_memory {
uint32_t requested_flag;

uint32_t active_gpu_count;
- cudaStream_t *sub_streams_1;
- cudaStream_t *sub_streams_2;

cudaEvent_t *incoming_events1;
cudaEvent_t *incoming_events2;
@@ -1817,7 +1814,6 @@ template <typename Torus> struct int_sc_prop_memory {
uint32_t block_modulus = message_modulus * carry_modulus;
uint32_t num_bits_in_block = std::log2(block_modulus);
uint32_t grouping_size = num_bits_in_block;
- group_size = grouping_size;
num_groups = (num_radix_blocks + grouping_size - 1) / grouping_size;

num_many_lut = 2; // many luts apply 2 luts
@@ -1834,8 +1830,8 @@ template <typename Torus> struct int_sc_prop_memory {

// Step 3 elements
lut_message_extract =
- new int_radix_lut<Torus>(streams, gpu_indexes, gpu_count, params, 1,
- num_radix_blocks, allocate_gpu_memory);
+ new int_radix_lut<Torus>(streams, gpu_indexes, gpu_count, params, 2,
+ num_radix_blocks + 1, allocate_gpu_memory);
// lut for the first block in the first grouping
auto f_message_extract = [message_modulus](Torus block) -> Torus {
return (block >> 1) % message_modulus;
@@ -1851,8 +1847,9 @@ template <typename Torus> struct int_sc_prop_memory {

// This store a single block that with be used to store the overflow or
// carry results
- output_flag = (Torus *)cuda_malloc_async(big_lwe_size_bytes, streams[0],
- gpu_indexes[0]);
+ output_flag =
+ (Torus *)cuda_malloc_async(big_lwe_size_bytes * (num_radix_blocks + 1),
+ streams[0], gpu_indexes[0]);
cuda_memset_async(output_flag, 0, big_lwe_size_bytes, streams[0],
gpu_indexes[0]);

@@ -1911,9 +1908,6 @@ template <typename Torus> struct int_sc_prop_memory {
// It seems that this lut could be apply together with the other one but for
// now we won't do it
if (requested_flag == outputFlag::FLAG_OVERFLOW) { // Overflow case
- lut_overflow_flag_last = new int_radix_lut<Torus>(
- streams, gpu_indexes, gpu_count, params, 1, 1, allocate_gpu_memory);
-
auto f_overflow_last = [num_radix_blocks,
requested_flag_in](Torus block) -> Torus {
uint32_t position = (num_radix_blocks == 1 &&
@@ -1929,39 +1923,57 @@ template <typename Torus> struct int_sc_prop_memory {
return does_overflow_if_carry_is_0;
}
};
- auto overflow_flag_last = lut_overflow_flag_last->get_lut(0, 0);
+ auto overflow_flag_last = lut_message_extract->get_lut(0, 1);

generate_device_accumulator<Torus>(
streams[0], gpu_indexes[0], overflow_flag_last, glwe_dimension,
polynomial_size, message_modulus, carry_modulus, f_overflow_last);

- lut_overflow_flag_last->broadcast_lut(streams, gpu_indexes, 0);
+ Torus *h_lut_indexes =
+ (Torus *)malloc((num_radix_blocks + 1) * sizeof(Torus));
+ for (int index = 0; index < num_radix_blocks + 1; index++) {
+ if (index < num_radix_blocks) {
+ h_lut_indexes[index] = 0;
+ } else {
+ h_lut_indexes[index] = 1;
+ }
+ }
+ cuda_memcpy_async_to_gpu(
+ lut_message_extract->get_lut_indexes(0, 0), h_lut_indexes,
+ (num_radix_blocks + 1) * sizeof(Torus), streams[0], gpu_indexes[0]);
+
+ lut_message_extract->broadcast_lut(streams, gpu_indexes, 0);
+ free(h_lut_indexes);
}
if (requested_flag == outputFlag::FLAG_CARRY) { // Carry case
- lut_carry_flag_last = new int_radix_lut<Torus>(
- streams, gpu_indexes, gpu_count, params, 1, 1, allocate_gpu_memory);
-
auto f_carry_last = [](Torus block) -> Torus {
return ((block >> 2) & 1);
};
- auto carry_flag_last = lut_carry_flag_last->get_lut(0, 0);
+ auto carry_flag_last = lut_message_extract->get_lut(0, 1);

generate_device_accumulator<Torus>(
streams[0], gpu_indexes[0], carry_flag_last, glwe_dimension,
polynomial_size, message_modulus, carry_modulus, f_carry_last);

- lut_carry_flag_last->broadcast_lut(streams, gpu_indexes, 0);
+ Torus *h_lut_indexes =
+ (Torus *)malloc((num_radix_blocks + 1) * sizeof(Torus));
+ for (int index = 0; index < num_radix_blocks + 1; index++) {
+ if (index < num_radix_blocks) {
+ h_lut_indexes[index] = 0;
+ } else {
+ h_lut_indexes[index] = 1;
+ }
+ }
+ cuda_memcpy_async_to_gpu(
+ lut_message_extract->get_lut_indexes(0, 0), h_lut_indexes,
+ (num_radix_blocks + 1) * sizeof(Torus), streams[0], gpu_indexes[0]);
+
+ lut_message_extract->broadcast_lut(streams, gpu_indexes, 0);
+ free(h_lut_indexes);
}

active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
- sub_streams_1 =
- (cudaStream_t *)malloc(active_gpu_count * sizeof(cudaStream_t));
- sub_streams_2 =
- (cudaStream_t *)malloc(active_gpu_count * sizeof(cudaStream_t));
- for (uint j = 0; j < active_gpu_count; j++) {
- sub_streams_1[j] = cuda_create_stream(gpu_indexes[j]);
- sub_streams_2[j] = cuda_create_stream(gpu_indexes[j]);
- }

incoming_events1 =
(cudaEvent_t *)malloc(active_gpu_count * sizeof(cudaEvent_t));
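The `h_lut_indexes` vector built in both branches above is what lets the two LUTs packed into `lut_message_extract` coexist: each of the `num_radix_blocks + 1` input ciphertexts is routed to LUT 0 (message extraction) except the appended flag block, which goes to LUT 1. A standalone sketch of the same construction, assuming `Torus` is `uint64_t`:

```cpp
#include <cstddef>
#include <cstdint>
#include <vector>

using Torus = uint64_t;

// Blocks 0..n-1 select LUT 0 (message extract); block n selects LUT 1
// (carry/overflow flag extract).
std::vector<Torus> make_lut_indexes(size_t num_radix_blocks) {
  std::vector<Torus> indexes(num_radix_blocks + 1, 0);
  indexes[num_radix_blocks] = 1;
  return indexes;
}
```

The host vector is then copied into the device region returned by `get_lut_indexes(0, 0)` and broadcast to the active GPUs, exactly as the added lines do.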
@@ -1997,24 +2009,10 @@ template <typename Torus> struct int_sc_prop_memory {

if (requested_flag == outputFlag::FLAG_OVERFLOW) { // In case of overflow
lut_overflow_flag_prep->release(streams, gpu_indexes, gpu_count);
- lut_overflow_flag_last->release(streams, gpu_indexes, gpu_count);
delete lut_overflow_flag_prep;
- delete lut_overflow_flag_last;
cuda_drop_async(last_lhs, streams[0], gpu_indexes[0]);
cuda_drop_async(last_rhs, streams[0], gpu_indexes[0]);
}
- if (requested_flag == outputFlag::FLAG_CARRY) { // In case of carry
- lut_carry_flag_last->release(streams, gpu_indexes, gpu_count);
- delete lut_carry_flag_last;
- }
-
- // release sub streams
- for (uint i = 0; i < active_gpu_count; i++) {
- cuda_destroy_stream(sub_streams_1[i], gpu_indexes[i]);
- cuda_destroy_stream(sub_streams_2[i], gpu_indexes[i]);
- }
- free(sub_streams_1);
- free(sub_streams_2);

// release events
for (uint j = 0; j < active_gpu_count; j++) {
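Dropping `sub_streams_1`/`sub_streams_2` (and their teardown above) leans on a basic CUDA guarantee: work enqueued on one stream executes in issue order, so the old fork/join across two local streams, with its events and `cuda_synchronize_stream` calls, collapses into sequential launches on `streams[0]`. A minimal generic-CUDA illustration of that guarantee (not code from this commit):

```cpp
#include <cuda_runtime.h>

int main() {
  cudaStream_t s;
  cudaStreamCreate(&s);

  float *a = nullptr, *b = nullptr;
  cudaMallocAsync((void **)&a, 1024 * sizeof(float), s); // CUDA >= 11.2
  cudaMallocAsync((void **)&b, 1024 * sizeof(float), s);

  // Same stream, so the copy is guaranteed to see the completed memset;
  // no event or cross-stream synchronization is required.
  cudaMemsetAsync(a, 0, 1024 * sizeof(float), s);
  cudaMemcpyAsync(b, a, 1024 * sizeof(float), cudaMemcpyDeviceToDevice, s);

  cudaStreamSynchronize(s);
  cudaFreeAsync(a, s);
  cudaFreeAsync(b, s);
  cudaStreamDestroy(s);
  return 0;
}
```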
107 changes: 46 additions & 61 deletions backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
@@ -1624,13 +1624,12 @@ void host_propagate_single_carry(cudaStream_t const *streams,
auto params = mem->params;
auto glwe_dimension = params.glwe_dimension;
auto polynomial_size = params.polynomial_size;
- auto message_modulus = params.message_modulus;
- auto carry_modulus = params.carry_modulus;
uint32_t big_lwe_size = glwe_dimension * polynomial_size + 1;
auto big_lwe_size_bytes = big_lwe_size * sizeof(Torus);
auto big_lwe_dimension = big_lwe_size - 1; // For host addition
auto lut_stride = mem->lut_stride;
auto num_many_lut = mem->num_many_lut;
+ auto output_flag = mem->output_flag + big_lwe_size * num_radix_blocks;
if (requested_flag == outputFlag::FLAG_OVERFLOW)
PANIC("Cuda error: single carry propagation is not supported for overflow, "
"try using add_and_propagate_single_carry");
@@ -1647,7 +1646,7 @@ void host_propagate_single_carry(cudaStream_t const *streams,

if (requested_flag == outputFlag::FLAG_CARRY) {
cuda_memcpy_async_gpu_to_gpu(
- mem->output_flag, block_states + (num_radix_blocks - 1) * big_lwe_size,
+ output_flag, block_states + (num_radix_blocks - 1) * big_lwe_size,
big_lwe_size_bytes, streams[0], gpu_indexes[0]);
}
// Step 2
@@ -1667,45 +1666,40 @@ void host_propagate_single_carry(cudaStream_t const *streams,

if (requested_flag == outputFlag::FLAG_OVERFLOW ||
requested_flag == outputFlag::FLAG_CARRY) {
- host_addition<Torus>(streams[0], gpu_indexes[0], mem->output_flag,
- mem->output_flag,
+ host_addition<Torus>(streams[0], gpu_indexes[0], output_flag, output_flag,
mem->prop_simu_group_carries_mem->simulators +
(num_radix_blocks - 1) * big_lwe_size,
big_lwe_dimension, 1);
}

- cuda_synchronize_stream(streams[0], gpu_indexes[0]);

// Step 3
// Add carries and cleanup OutputFlag::None
host_radix_sum_in_groups<Torus>(
- mem->sub_streams_1[0], gpu_indexes[0], prepared_blocks, prepared_blocks,
+ streams[0], gpu_indexes[0], prepared_blocks, prepared_blocks,
mem->prop_simu_group_carries_mem->resolved_carries, num_radix_blocks,
big_lwe_size, group_size);

- auto message_extract = mem->lut_message_extract;
- integer_radix_apply_univariate_lookup_table_kb<Torus>(
- mem->sub_streams_1, gpu_indexes, gpu_count, lwe_array, prepared_blocks,
- bsks, ksks, num_radix_blocks, message_extract);
-
if (requested_flag == outputFlag::FLAG_CARRY) {
- host_addition<Torus>(mem->sub_streams_2[0], gpu_indexes[0],
- mem->output_flag, mem->output_flag,
+ host_addition<Torus>(streams[0], gpu_indexes[0], output_flag, output_flag,
mem->prop_simu_group_carries_mem->resolved_carries +
(mem->num_groups - 1) * big_lwe_size,
big_lwe_dimension, 1);

+ cuda_memcpy_async_gpu_to_gpu(
+ prepared_blocks + num_radix_blocks * big_lwe_size, output_flag,
+ big_lwe_size_bytes, streams[0], gpu_indexes[0]);
integer_radix_apply_univariate_lookup_table_kb<Torus>(
- mem->sub_streams_2, gpu_indexes, gpu_count, mem->output_flag,
- mem->output_flag, bsks, ksks, 1, mem->lut_carry_flag_last);
+ streams, gpu_indexes, gpu_count, mem->output_flag, prepared_blocks,
+ bsks, ksks, num_radix_blocks + 1, mem->lut_message_extract);

- cuda_memcpy_async_gpu_to_gpu(carry_out, mem->output_flag,
- big_lwe_size_bytes, mem->sub_streams_2[0],
- gpu_indexes[0]);
- }
- for (int j = 0; j < mem->active_gpu_count; j++) {
- cuda_synchronize_stream(mem->sub_streams_1[j], gpu_indexes[j]);
- cuda_synchronize_stream(mem->sub_streams_2[j], gpu_indexes[j]);
+ cuda_memcpy_async_gpu_to_gpu(lwe_array, mem->output_flag,
+ big_lwe_size_bytes * num_radix_blocks,
+ streams[0], gpu_indexes[0]);
+ cuda_memcpy_async_gpu_to_gpu(
+ carry_out, mem->output_flag + num_radix_blocks * big_lwe_size,
+ big_lwe_size_bytes, streams[0], gpu_indexes[0]);
+ } else {
+ auto message_extract = mem->lut_message_extract;
+ integer_radix_apply_univariate_lookup_table_kb<Torus>(
+ streams, gpu_indexes, gpu_count, lwe_array, prepared_blocks, bsks, ksks,
+ num_radix_blocks, message_extract);
}
}
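In the carry case, the fused pass above computes, per block, the plaintext functions defined in `integer_utilities.h` (`f_message_extract` for the radix blocks, `f_carry_last` for the appended flag block). A cleartext model of that single pass, assuming `Torus` is `uint64_t` (explanatory sketch, not commit code):

```cpp
#include <cstddef>
#include <cstdint>
#include <vector>

using Torus = uint64_t;

// Cleartext analogue of the fused LUT pass over num_radix_blocks + 1 blocks.
std::vector<Torus> fused_extract(const std::vector<Torus> &prepared,
                                 Torus message_modulus) {
  std::vector<Torus> out(prepared.size());
  const size_t n = prepared.size() - 1;            // last slot = flag block
  for (size_t i = 0; i < n; ++i)
    out[i] = (prepared[i] >> 1) % message_modulus; // f_message_extract
  out[n] = (prepared[n] >> 2) & 1;                 // f_carry_last
  return out;
}
```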

@@ -1721,13 +1715,12 @@ void host_add_and_propagate_single_carry(
auto params = mem->params;
auto glwe_dimension = params.glwe_dimension;
auto polynomial_size = params.polynomial_size;
- auto message_modulus = params.message_modulus;
- auto carry_modulus = params.carry_modulus;
uint32_t big_lwe_size = glwe_dimension * polynomial_size + 1;
auto big_lwe_size_bytes = big_lwe_size * sizeof(Torus);
auto big_lwe_dimension = big_lwe_size - 1; // For host addition
auto lut_stride = mem->lut_stride;
auto num_many_lut = mem->num_many_lut;
+ auto output_flag = mem->output_flag + big_lwe_size * num_radix_blocks;

if (requested_flag == outputFlag::FLAG_OVERFLOW) {
cuda_memcpy_async_gpu_to_gpu(
@@ -1754,12 +1747,12 @@ void host_add_and_propagate_single_carry(
if (requested_flag == outputFlag::FLAG_OVERFLOW) {
auto lut_overflow_prep = mem->lut_overflow_flag_prep;
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
- streams, gpu_indexes, gpu_count, mem->output_flag, mem->last_lhs,
+ streams, gpu_indexes, gpu_count, output_flag, mem->last_lhs,
mem->last_rhs, bsks, ksks, 1, lut_overflow_prep,
lut_overflow_prep->params.message_modulus);
} else if (requested_flag == outputFlag::FLAG_CARRY) {
cuda_memcpy_async_gpu_to_gpu(
- mem->output_flag, block_states + (num_radix_blocks - 1) * big_lwe_size,
+ output_flag, block_states + (num_radix_blocks - 1) * big_lwe_size,
big_lwe_size_bytes, streams[0], gpu_indexes[0]);
}

@@ -1780,58 +1773,50 @@ void host_add_and_propagate_single_carry(

if (requested_flag == outputFlag::FLAG_OVERFLOW ||
requested_flag == outputFlag::FLAG_CARRY) {
- host_addition<Torus>(streams[0], gpu_indexes[0], mem->output_flag,
- mem->output_flag,
+ host_addition<Torus>(streams[0], gpu_indexes[0], output_flag, output_flag,
mem->prop_simu_group_carries_mem->simulators +
(num_radix_blocks - 1) * big_lwe_size,
big_lwe_dimension, 1);
}

- cuda_synchronize_stream(streams[0], gpu_indexes[0]);
// Step 3
// Add carries and cleanup OutputFlag::None
host_radix_sum_in_groups<Torus>(
- mem->sub_streams_1[0], gpu_indexes[0], prepared_blocks, prepared_blocks,
+ streams[0], gpu_indexes[0], prepared_blocks, prepared_blocks,
mem->prop_simu_group_carries_mem->resolved_carries, num_radix_blocks,
big_lwe_size, group_size);

- auto message_extract = mem->lut_message_extract;
- integer_radix_apply_univariate_lookup_table_kb<Torus>(
- mem->sub_streams_1, gpu_indexes, gpu_count, lhs_array, prepared_blocks,
- bsks, ksks, num_radix_blocks, message_extract);
-
if (requested_flag == outputFlag::FLAG_OVERFLOW ||
requested_flag == outputFlag::FLAG_CARRY) {
if (num_radix_blocks == 1 && requested_flag == outputFlag::FLAG_OVERFLOW &&
uses_carry == 1) {
- host_addition<Torus>(mem->sub_streams_2[0], gpu_indexes[0],
- mem->output_flag, mem->output_flag, input_carries,
- big_lwe_dimension, 1);
+ host_addition<Torus>(streams[0], gpu_indexes[0], output_flag, output_flag,
+ input_carries, big_lwe_dimension, 1);

} else {

- host_addition<Torus>(mem->sub_streams_2[0], gpu_indexes[0],
- mem->output_flag, mem->output_flag,
+ host_addition<Torus>(streams[0], gpu_indexes[0], output_flag, output_flag,
mem->prop_simu_group_carries_mem->resolved_carries +
(mem->num_groups - 1) * big_lwe_size,
big_lwe_dimension, 1);
}
- if (requested_flag == outputFlag::FLAG_OVERFLOW) {
- integer_radix_apply_univariate_lookup_table_kb<Torus>(
- mem->sub_streams_2, gpu_indexes, gpu_count, mem->output_flag,
- mem->output_flag, bsks, ksks, 1, mem->lut_overflow_flag_last);
- } else {
- integer_radix_apply_univariate_lookup_table_kb<Torus>(
- mem->sub_streams_2, gpu_indexes, gpu_count, mem->output_flag,
- mem->output_flag, bsks, ksks, 1, mem->lut_carry_flag_last);
- }
- cuda_memcpy_async_gpu_to_gpu(carry_out, mem->output_flag,
- big_lwe_size_bytes, mem->sub_streams_2[0],
- gpu_indexes[0]);
- }
- for (int j = 0; j < mem->active_gpu_count; j++) {
- cuda_synchronize_stream(mem->sub_streams_1[j], gpu_indexes[j]);
- cuda_synchronize_stream(mem->sub_streams_2[j], gpu_indexes[j]);
+ cuda_memcpy_async_gpu_to_gpu(
+ prepared_blocks + num_radix_blocks * big_lwe_size, output_flag,
+ big_lwe_size_bytes, streams[0], gpu_indexes[0]);
+ integer_radix_apply_univariate_lookup_table_kb<Torus>(
+ streams, gpu_indexes, gpu_count, mem->output_flag, prepared_blocks,
+ bsks, ksks, num_radix_blocks + 1, mem->lut_message_extract);
+
+ cuda_memcpy_async_gpu_to_gpu(lhs_array, mem->output_flag,
+ big_lwe_size_bytes * num_radix_blocks,
+ streams[0], gpu_indexes[0]);
+ cuda_memcpy_async_gpu_to_gpu(
+ carry_out, mem->output_flag + num_radix_blocks * big_lwe_size,
+ big_lwe_size_bytes, streams[0], gpu_indexes[0]);
+ } else {
+ integer_radix_apply_univariate_lookup_table_kb<Torus>(
+ streams, gpu_indexes, gpu_count, lhs_array, prepared_blocks, bsks, ksks,
+ num_radix_blocks, mem->lut_message_extract);
}
}
