Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(div_rem): implement div_rem, draft pr. #1046

Closed
wants to merge 4 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 39 additions & 1 deletion backends/tfhe-cuda-backend/cuda/include/integer.h
Original file line number Diff line number Diff line change
Expand Up @@ -277,7 +277,23 @@ void cuda_scalar_multiplication_integer_radix_ciphertext_64_inplace(

void cleanup_cuda_integer_radix_scalar_mul(cuda_stream_t *stream,
int8_t **mem_ptr_void);
}

void scratch_cuda_integer_div_rem_radix_ciphertext_kb_64(
cuda_stream_t *stream, int8_t **mem_ptr, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t big_lwe_dimension,
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
PBS_TYPE pbs_type, bool allocate_gpu_memory);

void cuda_integer_div_rem_radix_ciphertext_kb_64(
cuda_stream_t *stream, void *quotient, void *remainder, void *numerator,
void *divisor, int8_t *mem_ptr, void *bsk, void *ksk,
uint32_t num_blocks_in_radix);

void cleanup_cuda_integer_div_rem(cuda_stream_t *stream, int8_t **mem_ptr_void);

} // extern C

template <typename Torus>
__global__ void radix_blocks_rotate_right(Torus *dst, Torus *src,
Expand Down Expand Up @@ -1582,6 +1598,28 @@ template <typename Torus> struct int_arithmetic_scalar_shift_buffer {
}
};

template <typename Torus> struct int_div_rem_memory {
int_radix_params params;
bool mem_reuse = false;
int_logical_scalar_shift_buffer<Torus> *shift_mem;
int_overflowing_sub_memory<Torus> *overflow_sub_mem;
int_div_rem_memory(cuda_stream_t *stream, int_radix_params params,
uint32_t num_blocks, bool allocate_gpu_memory) {
this->params = params;
shift_mem = new int_logical_scalar_shift_buffer<Torus>(
stream, SHIFT_OR_ROTATE_TYPE::LEFT_SHIFT, params, num_blocks, true);
overflow_sub_mem =
new int_overflowing_sub_memory<Torus>(stream, params, num_blocks, true);
}
void release(cuda_stream_t *stream) {
shift_mem->release(stream);
overflow_sub_mem->release(stream);

delete shift_mem;
delete overflow_sub_mem;
}
};

template <typename Torus> struct int_zero_out_if_buffer {

int_radix_params params;
Expand Down
82 changes: 82 additions & 0 deletions backends/tfhe-cuda-backend/cuda/src/integer/div_rem.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
#include "integer/div_rem.cuh"

void scratch_cuda_integer_div_rem_radix_ciphertext_kb_64(
cuda_stream_t *stream, int8_t **mem_ptr, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t big_lwe_dimension,
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
PBS_TYPE pbs_type, bool allocate_gpu_memory) {

int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
big_lwe_dimension, small_lwe_dimension, ks_level,
ks_base_log, pbs_level, pbs_base_log, grouping_factor,
message_modulus, carry_modulus);

scratch_cuda_integer_div_rem_kb<uint64_t>(
stream, (int_div_rem_memory<uint64_t> **)mem_ptr, num_blocks, params,
allocate_gpu_memory);
}

void cuda_integer_div_rem_radix_ciphertext_kb_64(
cuda_stream_t *stream, void *quotient, void *remainder, void *numerator,
void *divisor, int8_t *mem_ptr, void *bsk, void *ksk, uint32_t num_blocks) {

auto mem = (int_div_rem_memory<uint64_t> *)mem_ptr;

switch (mem->params.polynomial_size) {
case 512:
host_integer_div_rem_kb<uint64_t, Degree<512>>(
stream, static_cast<uint64_t *>(quotient),
static_cast<uint64_t *>(remainder), static_cast<uint64_t *>(numerator),
static_cast<uint64_t *>(divisor), bsk, static_cast<uint64_t *>(ksk),
mem, num_blocks);
break;
case 1024:
host_integer_div_rem_kb<uint64_t, Degree<1024>>(
stream, static_cast<uint64_t *>(quotient),
static_cast<uint64_t *>(remainder), static_cast<uint64_t *>(numerator),
static_cast<uint64_t *>(divisor), bsk, static_cast<uint64_t *>(ksk),
mem, num_blocks);
break;
case 2048:
host_integer_div_rem_kb<uint64_t, Degree<2048>>(
stream, static_cast<uint64_t *>(quotient),
static_cast<uint64_t *>(remainder), static_cast<uint64_t *>(numerator),
static_cast<uint64_t *>(divisor), bsk, static_cast<uint64_t *>(ksk),
mem, num_blocks);
break;
case 4096:
host_integer_div_rem_kb<uint64_t, Degree<4096>>(
stream, static_cast<uint64_t *>(quotient),
static_cast<uint64_t *>(remainder), static_cast<uint64_t *>(numerator),
static_cast<uint64_t *>(divisor), bsk, static_cast<uint64_t *>(ksk),
mem, num_blocks);
break;
case 8192:
host_integer_div_rem_kb<uint64_t, Degree<8192>>(
stream, static_cast<uint64_t *>(quotient),
static_cast<uint64_t *>(remainder), static_cast<uint64_t *>(numerator),
static_cast<uint64_t *>(divisor), bsk, static_cast<uint64_t *>(ksk),
mem, num_blocks);
break;
case 16384:
host_integer_div_rem_kb<uint64_t, Degree<16384>>(
stream, static_cast<uint64_t *>(quotient),
static_cast<uint64_t *>(remainder), static_cast<uint64_t *>(numerator),
static_cast<uint64_t *>(divisor), bsk, static_cast<uint64_t *>(ksk),
mem, num_blocks);
break;
default:
PANIC("Cuda error (integer div_rem): unsupported polynomial size. "
"Only N = 512, 1024, 2048, 4096, 8192, 16384 is supported")
}
}

void cleanup_cuda_integer_div_rem(cuda_stream_t *stream,
int8_t **mem_ptr_void) {
int_div_rem_memory<uint64_t> *mem_ptr =
(int_div_rem_memory<uint64_t> *)(*mem_ptr_void);

mem_ptr->release(stream);
}
Loading
Loading