Skip to content

Commit

Permalink
feat(gpu): improve full propagation in sum and sub
Browse files Browse the repository at this point in the history
  • Loading branch information
guillermo-oyarzun authored and agnesLeroy committed Nov 25, 2024
1 parent 100c3ae commit 81e11a6
Show file tree
Hide file tree
Showing 46 changed files with 3,335 additions and 1,282 deletions.
9 changes: 9 additions & 0 deletions backends/tfhe-cuda-backend/cuda/include/device.h
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,15 @@ inline void cuda_error(cudaError_t code, const char *file, int line) {
std::abort(); \
}

cudaEvent_t cuda_create_event(uint32_t gpu_index);

void cuda_event_record(cudaEvent_t event, cudaStream_t stream,
uint32_t gpu_index);
void cuda_stream_wait_event(cudaStream_t stream, cudaEvent_t event,
uint32_t gpu_index);

void cuda_event_destroy(cudaEvent_t event, uint32_t gpu_index);

cudaStream_t cuda_create_stream(uint32_t gpu_index);

void cuda_destroy_stream(cudaStream_t stream, uint32_t gpu_index);
Expand Down
91 changes: 46 additions & 45 deletions backends/tfhe-cuda-backend/cuda/include/integer/integer.h
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,8 @@ enum CMP_ORDERING { IS_INFERIOR = 0, IS_EQUAL = 1, IS_SUPERIOR = 2 };

enum SIGNED_OPERATION { ADDITION = 1, SUBTRACTION = -1 };

enum outputFlag { FLAG_NONE = 0, FLAG_OVERFLOW = 1, FLAG_CARRY = 2 };

extern "C" {
void scratch_cuda_apply_univariate_lut_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
Expand Down Expand Up @@ -282,23 +284,61 @@ void scratch_cuda_propagate_single_carry_kb_64_inplace(
uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory);
uint32_t carry_modulus, PBS_TYPE pbs_type, uint32_t requested_flag,
uint32_t uses_carry, bool allocate_gpu_memory);

void scratch_cuda_add_and_propagate_single_carry_kb_64_inplace(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
uint32_t carry_modulus, PBS_TYPE pbs_type, uint32_t requested_flag,
uint32_t uses_carry, bool allocate_gpu_memory);

void cuda_propagate_single_carry_kb_64_inplace(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
void *lwe_array, void *carry_out, int8_t *mem_ptr, void *const *bsks,
void *const *ksks, uint32_t num_blocks);
void *lwe_array, void *carry_out, const void *carry_in, int8_t *mem_ptr,
void *const *bsks, void *const *ksks, uint32_t num_blocks,
uint32_t requested_flag, uint32_t uses_carry);

void cuda_propagate_single_carry_get_input_carries_kb_64_inplace(
void cuda_add_and_propagate_single_carry_kb_64_inplace(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
void *lwe_array, void *carry_out, void *input_carries, int8_t *mem_ptr,
void *const *bsks, void *const *ksks, uint32_t num_blocks);
void *lhs_array, const void *rhs_array, void *carry_out,
const void *carry_in, int8_t *mem_ptr, void *const *bsks, void *const *ksks,
uint32_t num_blocks, uint32_t requested_flag, uint32_t uses_carry);

void cleanup_cuda_propagate_single_carry(void *const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count,
int8_t **mem_ptr_void);

void cleanup_cuda_add_and_propagate_single_carry(void *const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count,
int8_t **mem_ptr_void);

void scratch_cuda_integer_overflowing_sub_kb_64_inplace(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
uint32_t carry_modulus, PBS_TYPE pbs_type, uint32_t compute_overflow,
bool allocate_gpu_memory);

void cuda_integer_overflowing_sub_kb_64_inplace(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
void *lhs_array, const void *rhs_array, void *overflow_block,
const void *input_borrow, int8_t *mem_ptr, void *const *bsks,
void *const *ksks, uint32_t num_blocks, uint32_t compute_overflow,
uint32_t uses_input_borrow);

void cleanup_cuda_integer_overflowing_sub(void *const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count,
int8_t **mem_ptr_void);

void scratch_cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
Expand All @@ -318,25 +358,6 @@ void cleanup_cuda_integer_radix_partial_sum_ciphertexts_vec(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr_void);

void scratch_cuda_integer_radix_overflowing_sub_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory);

void cuda_integer_radix_overflowing_sub_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
void *radix_lwe_out, void *radix_lwe_overflowed, void const *radix_lwe_left,
void const *radix_lwe_right, int8_t *mem_ptr, void *const *bsks,
void *const *ksks, uint32_t num_blocks_in_radix);

void cleanup_cuda_integer_radix_overflowing_sub(void *const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count,
int8_t **mem_ptr_void);

void scratch_cuda_integer_scalar_mul_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
Expand Down Expand Up @@ -376,26 +397,6 @@ void cleanup_cuda_integer_div_rem(void *const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count, int8_t **mem_ptr_void);

void scratch_cuda_signed_overflowing_add_or_sub_radix_ciphertext_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
uint32_t grouping_factor, uint32_t num_blocks, int8_t signed_operation,
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
bool allocate_gpu_memory);

void cuda_signed_overflowing_add_or_sub_radix_ciphertext_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
void *lhs, void const *rhs, void *overflowed, int8_t signed_operation,
int8_t *mem_ptr, void *const *bsks, void *const *ksks,
uint32_t num_blocks_in_radix);

void cleanup_signed_overflowing_add_or_sub(void *const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count,
int8_t **mem_ptr_void);

void scratch_cuda_integer_compute_prefix_sum_hillis_steele_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, void const *input_lut, uint32_t lwe_dimension,
Expand Down
Loading

0 comments on commit 81e11a6

Please sign in to comment.