chore(gpu): remove lwe chunk size argument from the multi-bit PBS #1445

Merged: 1 commit, Aug 2, 2024
@@ -19,16 +19,15 @@ void scratch_cuda_multi_bit_programmable_bootstrap_64(
    void *stream, uint32_t gpu_index, int8_t **pbs_buffer,
    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t level_count, uint32_t grouping_factor,
-   uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory,
-   uint32_t chunk_size = 0);
+   uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);

void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64(
    void *stream, uint32_t gpu_index, void *lwe_array_out,
    void *lwe_output_indexes, void *lut_vector, void *lut_vector_indexes,
    void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key,
    int8_t *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log,
-   uint32_t level_count, uint32_t num_samples, uint32_t lwe_chunk_size = 0);
+   uint32_t level_count, uint32_t num_samples);

void cleanup_cuda_multi_bit_programmable_bootstrap(void *stream,
                                                   uint32_t gpu_index,
@@ -51,8 +50,7 @@ void scratch_cuda_tbc_multi_bit_programmable_bootstrap(
    void *stream, uint32_t gpu_index, pbs_buffer<Torus, MULTI_BIT> **buffer,
    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t level_count, uint32_t grouping_factor,
-   uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory,
-   uint32_t lwe_chunk_size);
+   uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);

template <typename Torus>
void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
@@ -61,24 +59,21 @@ void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
    Torus *lwe_array_in, Torus *lwe_input_indexes, Torus *bootstrapping_key,
    pbs_buffer<Torus, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
-   uint32_t base_log, uint32_t level_count, uint32_t num_samples,
-   uint32_t lwe_chunk_size);
+   uint32_t base_log, uint32_t level_count, uint32_t num_samples);
#endif

template <typename Torus>
void scratch_cuda_cg_multi_bit_programmable_bootstrap(
    void *stream, uint32_t gpu_index, pbs_buffer<Torus, MULTI_BIT> **pbs_buffer,
    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t level_count, uint32_t grouping_factor,
-   uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory,
-   uint32_t lwe_chunk_size = 0);
+   uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);

template <typename Torus>
void scratch_cuda_cg_multi_bit_programmable_bootstrap(
    void *stream, uint32_t gpu_index, pbs_buffer<Torus, MULTI_BIT> **pbs_buffer,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
-   uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory,
-   uint32_t lwe_chunk_size = 0);
+   uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);

template <typename Torus>
void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
@@ -87,16 +82,14 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
    Torus *lwe_array_in, Torus *lwe_input_indexes, Torus *bootstrapping_key,
    pbs_buffer<Torus, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
-   uint32_t base_log, uint32_t level_count, uint32_t num_samples,
-   uint32_t lwe_chunk_size = 0);
+   uint32_t base_log, uint32_t level_count, uint32_t num_samples);

template <typename Torus>
void scratch_cuda_multi_bit_programmable_bootstrap(
    void *stream, uint32_t gpu_index, pbs_buffer<Torus, MULTI_BIT> **pbs_buffer,
    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t level_count, uint32_t grouping_factor,
-   uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory,
-   uint32_t lwe_chunk_size = 0);
+   uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);

template <typename Torus>
void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
@@ -105,8 +98,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
    Torus *lwe_array_in, Torus *lwe_input_indexes, Torus *bootstrapping_key,
    pbs_buffer<Torus, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
-   uint32_t base_log, uint32_t level_count, uint32_t num_samples,
-   uint32_t lwe_chunk_size = 0);
+   uint32_t base_log, uint32_t level_count, uint32_t num_samples);

template <typename Torus>
__host__ __device__ uint64_t
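For context, the header hunks above remove the trailing `lwe_chunk_size` parameter (previously defaulted to 0, meaning "choose automatically") from every multi-bit PBS entry point, so the chunk size is now always selected inside the backend. The stand-alone sketch below is not code from the repository; `pick_chunk_size`, `process_old`, and `process_new` are hypothetical names used only to contrast the removed "0 means auto" pattern with the new internal-only selection.

```cpp
#include <cstdint>
#include <cstdio>

// Stand-in for a chunk-size heuristic such as get_lwe_chunk_size.
static uint32_t pick_chunk_size(uint32_t num_samples) {
  return num_samples < 64 ? 1 : 4;
}

// Removed pattern: an optional parameter where 0 meant "choose automatically".
static void process_old(uint32_t num_samples, uint32_t lwe_chunk_size = 0) {
  if (!lwe_chunk_size)
    lwe_chunk_size = pick_chunk_size(num_samples);
  std::printf("old API: chunk size %u\n", lwe_chunk_size);
}

// New pattern: the chunk size is always computed internally.
static void process_new(uint32_t num_samples) {
  auto lwe_chunk_size = pick_chunk_size(num_samples);
  std::printf("new API: chunk size %u\n", lwe_chunk_size);
}

int main() {
  process_old(128); // a caller could previously pass an explicit chunk size
  process_new(128); // the heuristic is now the only source of the chunk size
  return 0;
}
```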
@@ -176,8 +176,7 @@ __host__ void scratch_cg_multi_bit_programmable_bootstrap(
    cudaStream_t stream, uint32_t gpu_index,
    pbs_buffer<Torus, MULTI_BIT> **buffer, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t level_count,
-   uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory,
-   uint32_t lwe_chunk_size = 0) {
+   uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) {

  uint64_t full_sm_keybundle =
      get_buffer_size_full_sm_multibit_programmable_bootstrap_keybundle<Torus>(
@@ -242,9 +241,8 @@ __host__ void scratch_cg_multi_bit_programmable_bootstrap(
    check_cuda_error(cudaGetLastError());
  }

-  if (!lwe_chunk_size)
-    lwe_chunk_size = get_lwe_chunk_size<Torus, params>(
-        gpu_index, input_lwe_ciphertext_count, polynomial_size);
+  auto lwe_chunk_size = get_lwe_chunk_size<Torus, params>(
+      gpu_index, input_lwe_ciphertext_count, polynomial_size);
  *buffer = new pbs_buffer<Torus, MULTI_BIT>(
      stream, gpu_index, glwe_dimension, polynomial_size, level_count,
      input_lwe_ciphertext_count, lwe_chunk_size, PBS_VARIANT::CG,
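The hunk above makes `scratch_cg_multi_bit_programmable_bootstrap` always call `get_lwe_chunk_size` before allocating the `pbs_buffer`, rather than honoring a caller-supplied value. The diff does not show how `get_lwe_chunk_size` picks its result; the sketch below is a purely hypothetical heuristic (the name `pick_lwe_chunk_size_hypothetical` and the formula are invented for illustration, and the real helper may differ entirely) showing the kind of device-dependent choice such a function could make with the standard CUDA runtime API.

```cpp
#include <cuda_runtime.h>
#include <algorithm>
#include <cstdint>

// Hypothetical heuristic, NOT the library's get_lwe_chunk_size: scale the
// chunk size with the number of SMs available per batched sample.
static uint32_t pick_lwe_chunk_size_hypothetical(uint32_t gpu_index,
                                                 uint32_t num_samples) {
  cudaDeviceProp prop;
  cudaGetDeviceProperties(&prop, gpu_index);
  uint32_t sms = static_cast<uint32_t>(prop.multiProcessorCount);
  uint32_t per_sample = std::max(1u, sms / std::max(1u, num_samples));
  // Cap the chunk size so the keybundle working set stays bounded.
  return std::min(64u, per_sample);
}
```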
@@ -336,12 +334,10 @@ __host__ void host_cg_multi_bit_programmable_bootstrap(
    Torus *lwe_array_in, Torus *lwe_input_indexes, uint64_t *bootstrapping_key,
    pbs_buffer<Torus, MULTI_BIT> *buffer, uint32_t glwe_dimension,
    uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
-   uint32_t base_log, uint32_t level_count, uint32_t num_samples,
-   uint32_t lwe_chunk_size = 0) {
+   uint32_t base_log, uint32_t level_count, uint32_t num_samples) {

-  if (!lwe_chunk_size)
-    lwe_chunk_size = get_lwe_chunk_size<Torus, params>(gpu_index, num_samples,
-                                                       polynomial_size);
+  auto lwe_chunk_size = get_lwe_chunk_size<Torus, params>(
+      gpu_index, num_samples, polynomial_size);

  for (uint32_t lwe_offset = 0; lwe_offset < (lwe_dimension / grouping_factor);
       lwe_offset += lwe_chunk_size) {
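Finally, `host_cg_multi_bit_programmable_bootstrap` now derives `lwe_chunk_size` the same way and then walks the `lwe_dimension / grouping_factor` key groups in steps of that size, as the loop above shows. The self-contained sketch below uses illustrative parameter values and hypothetical local names (`num_groups`, `chunk`) only to show how such a chunked iteration covers the range, including a final partial chunk.

```cpp
#include <algorithm>
#include <cstdint>
#include <cstdio>

int main() {
  // Illustrative values only; the real ones come from the cryptographic
  // parameter set and from the chunk-size heuristic.
  uint32_t lwe_dimension = 742, grouping_factor = 2, lwe_chunk_size = 64;
  uint32_t num_groups = lwe_dimension / grouping_factor;

  for (uint32_t lwe_offset = 0; lwe_offset < num_groups;
       lwe_offset += lwe_chunk_size) {
    // The last iteration may cover fewer than lwe_chunk_size key groups.
    uint32_t chunk = std::min(lwe_chunk_size, num_groups - lwe_offset);
    std::printf("lwe_offset %3u -> processing %2u key groups\n", lwe_offset,
                chunk);
  }
  return 0;
}
```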