Skip to content

Commit

Permalink
feat(gpu): generate and apply many luts
Browse files Browse the repository at this point in the history
  • Loading branch information
guillermo-oyarzun committed Sep 13, 2024
1 parent 95ab73c commit 1f159d2
Show file tree
Hide file tree
Showing 26 changed files with 896 additions and 169 deletions.
5 changes: 5 additions & 0 deletions backends/tfhe-cuda-backend/cuda/include/integer.h
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,11 @@ void cleanup_cuda_apply_bivariate_lut_kb_64(void **streams,
uint32_t gpu_count,
int8_t **mem_ptr_void);

/// C entry point applying many univariate LUTs to a 64-bit radix ciphertext.
///
/// The definition reinterprets the opaque pointers (`streams` as
/// `cudaStream_t *`, the two LWE arrays as `uint64_t *`, `mem_ptr` as
/// `int_radix_lut<uint64_t> *`, `ksks` as `uint64_t **`) and forwards all
/// arguments to `host_apply_many_univariate_lut_kb<uint64_t>`.
///
/// - streams, gpu_indexes, gpu_count: streams and GPU indexes to run on.
/// - output_radix_lwe / input_radix_lwe: output and input LWE arrays.
/// - mem_ptr: LUT object to apply (presumably created by the matching
///   univariate-LUT scratch function -- confirm against the caller).
/// - ksks / bsks: keyswitching and bootstrapping keys.
/// - num_blocks: number of radix blocks to process.
/// - lut_count, lut_stride: forwarded unchanged down to the PBS; the
///   single-LUT code paths in this backend pass lut_count = 1, lut_stride = 0.
void cuda_apply_many_univariate_lut_kb_64(
    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
    void *output_radix_lwe, void *input_radix_lwe, int8_t *mem_ptr, void **ksks,
    void **bsks, uint32_t num_blocks, uint32_t lut_count, uint32_t lut_stride);

void scratch_cuda_full_propagation_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
Expand Down
13 changes: 8 additions & 5 deletions backends/tfhe-cuda-backend/cuda/include/programmable_bootstrap.h
Original file line number Diff line number Diff line change
Expand Up @@ -69,15 +69,15 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_32(
void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key,
int8_t *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
uint32_t num_samples);
uint32_t num_samples, uint32_t lut_count, uint32_t lut_stride);

void cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
void *stream, uint32_t gpu_index, void *lwe_array_out,
void *lwe_output_indexes, void *lut_vector, void *lut_vector_indexes,
void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key,
int8_t *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
uint32_t num_samples);
uint32_t num_samples, uint32_t lut_count, uint32_t lut_stride);

void cleanup_cuda_programmable_bootstrap(void *stream, uint32_t gpu_index,
int8_t **pbs_buffer);
Expand Down Expand Up @@ -331,7 +331,8 @@ void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector(
Torus *lwe_array_in, Torus *lwe_input_indexes, double2 *bootstrapping_key,
pbs_buffer<Torus, CLASSICAL> *buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, uint32_t num_samples);
uint32_t level_count, uint32_t num_samples, uint32_t lut_count,
uint32_t lut_stride);

template <typename Torus>
void cuda_programmable_bootstrap_lwe_ciphertext_vector(
Expand All @@ -340,7 +341,8 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector(
Torus *lwe_array_in, Torus *lwe_input_indexes, double2 *bootstrapping_key,
pbs_buffer<Torus, CLASSICAL> *buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, uint32_t num_samples);
uint32_t level_count, uint32_t num_samples, uint32_t lut_count,
uint32_t lut_stride);

#if (CUDA_ARCH >= 900)
template <typename Torus>
Expand All @@ -350,7 +352,8 @@ void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector(
Torus *lwe_array_in, Torus *lwe_input_indexes, double2 *bootstrapping_key,
pbs_buffer<Torus, CLASSICAL> *buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, uint32_t num_samples);
uint32_t level_count, uint32_t num_samples, uint32_t lut_count,
uint32_t lut_stride);

template <typename Torus>
void scratch_cuda_programmable_bootstrap_tbc(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,8 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64(
void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key,
int8_t *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log,
uint32_t level_count, uint32_t num_samples);
uint32_t level_count, uint32_t num_samples, uint32_t lut_count,
uint32_t lut_stride);

void cleanup_cuda_multi_bit_programmable_bootstrap(void *stream,
uint32_t gpu_index,
Expand Down Expand Up @@ -58,7 +59,8 @@ void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
Torus *lwe_array_in, Torus *lwe_input_indexes, Torus *bootstrapping_key,
pbs_buffer<Torus, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
uint32_t base_log, uint32_t level_count, uint32_t num_samples);
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
uint32_t lut_count, uint32_t lut_stride);
#endif

template <typename Torus>
Expand All @@ -74,7 +76,8 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
Torus *lwe_array_in, Torus *lwe_input_indexes, Torus *bootstrapping_key,
pbs_buffer<Torus, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
uint32_t base_log, uint32_t level_count, uint32_t num_samples);
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
uint32_t lut_count, uint32_t lut_stride);

template <typename Torus>
void scratch_cuda_multi_bit_programmable_bootstrap(
Expand All @@ -90,7 +93,8 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
Torus *lwe_array_in, Torus *lwe_input_indexes, Torus *bootstrapping_key,
pbs_buffer<Torus, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
uint32_t base_log, uint32_t level_count, uint32_t num_samples);
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
uint32_t lut_count, uint32_t lut_stride);

template <typename Torus>
uint64_t get_buffer_size_full_sm_multibit_programmable_bootstrap_keybundle(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -194,12 +194,16 @@ host_integer_decompress(cudaStream_t *streams, uint32_t *gpu_indexes,
compression_params.glwe_dimension,
compression_params.polynomial_size);

// When only a single LWE is extracted, these parameters are dummy values
uint32_t lut_count = 1;
uint32_t lut_stride = 0;
/// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
/// dimension to a big LWE dimension
auto encryption_params = mem_ptr->encryption_params;
auto lut = mem_ptr->carry_extract_lut;
auto active_gpu_count = get_active_gpu_count(num_lwes, gpu_count);
if (active_gpu_count == 1) {

execute_pbs_async<Torus>(
streams, gpu_indexes, active_gpu_count, lwe_array_out,
lut->lwe_indexes_out, lut->lut_vec, lut->lut_indexes_vec, extracted_lwe,
Expand All @@ -208,7 +212,7 @@ host_integer_decompress(cudaStream_t *streams, uint32_t *gpu_indexes,
compression_params.small_lwe_dimension,
encryption_params.polynomial_size, encryption_params.pbs_base_log,
encryption_params.pbs_level, encryption_params.grouping_factor,
num_lwes, encryption_params.pbs_type);
num_lwes, encryption_params.pbs_type, lut_count, lut_stride);
} else {
/// For multi GPU execution we create vectors of pointers for inputs and
/// outputs
Expand All @@ -235,7 +239,7 @@ host_integer_decompress(cudaStream_t *streams, uint32_t *gpu_indexes,
compression_params.small_lwe_dimension,
encryption_params.polynomial_size, encryption_params.pbs_base_log,
encryption_params.pbs_level, encryption_params.grouping_factor,
num_lwes, encryption_params.pbs_type);
num_lwes, encryption_params.pbs_type, lut_count, lut_stride);

/// Copy data back to GPU 0 and release vecs
multi_gpu_gather_lwe_async<Torus>(streams, gpu_indexes, active_gpu_count,
Expand Down
13 changes: 13 additions & 0 deletions backends/tfhe-cuda-backend/cuda/src/integer/integer.cu
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,19 @@ void cleanup_cuda_apply_univariate_lut_kb_64(void **streams,
mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
}

/// C entry point for the many-LUT univariate application on 64-bit radix
/// ciphertexts. It only recovers typed views of the opaque pointers received
/// through the C interface and hands everything over to
/// host_apply_many_univariate_lut_kb<uint64_t>.
void cuda_apply_many_univariate_lut_kb_64(
    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
    void *output_radix_lwe, void *input_radix_lwe, int8_t *mem_ptr, void **ksks,
    void **bsks, uint32_t num_blocks, uint32_t lut_count, uint32_t lut_stride) {

  // Typed views of the untyped arguments; no data is copied or modified here.
  auto *cuda_streams = reinterpret_cast<cudaStream_t *>(streams);
  auto *lwe_out = static_cast<uint64_t *>(output_radix_lwe);
  auto *lwe_in = static_cast<uint64_t *>(input_radix_lwe);
  auto *radix_lut = reinterpret_cast<int_radix_lut<uint64_t> *>(mem_ptr);
  auto **keyswitch_keys = reinterpret_cast<uint64_t **>(ksks);

  // bsks stays untyped (void **): the callee receives it as-is.
  host_apply_many_univariate_lut_kb<uint64_t>(
      cuda_streams, gpu_indexes, gpu_count, lwe_out, lwe_in, radix_lut,
      keyswitch_keys, bsks, num_blocks, lut_count, lut_stride);
}

void scratch_cuda_apply_bivariate_lut_kb_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
void *input_lut, uint32_t lwe_dimension, uint32_t glwe_dimension,
Expand Down
118 changes: 113 additions & 5 deletions backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -189,6 +189,9 @@ __host__ void integer_radix_apply_univariate_lookup_table_kb(
auto polynomial_size = params.polynomial_size;
auto grouping_factor = params.grouping_factor;

// When only a single LWE is extracted, these parameters are dummy values
uint32_t lut_count = 1;
uint32_t lut_stride = 0;
/// For multi GPU execution we create vectors of pointers for inputs and
/// outputs
std::vector<Torus *> lwe_array_in_vec = lut->lwe_array_in_vec;
Expand All @@ -211,7 +214,7 @@ __host__ void integer_radix_apply_univariate_lookup_table_kb(
lut->lut_vec, lut->lut_indexes_vec, lwe_after_ks_vec[0],
lwe_trivial_indexes_vec[0], bsks, lut->buffer, glwe_dimension,
small_lwe_dimension, polynomial_size, pbs_base_log, pbs_level,
grouping_factor, num_radix_blocks, pbs_type);
grouping_factor, num_radix_blocks, pbs_type, lut_count, lut_stride);
} else {
/// Make sure all data that should be on GPU 0 is indeed there
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
Expand All @@ -237,7 +240,92 @@ __host__ void integer_radix_apply_univariate_lookup_table_kb(
lwe_trivial_indexes_vec, lut->lut_vec, lut->lut_indexes_vec,
lwe_after_ks_vec, lwe_trivial_indexes_vec, bsks, lut->buffer,
glwe_dimension, small_lwe_dimension, polynomial_size, pbs_base_log,
pbs_level, grouping_factor, num_radix_blocks, pbs_type);
pbs_level, grouping_factor, num_radix_blocks, pbs_type, lut_count,
lut_stride);

/// Copy data back to GPU 0 and release vecs
multi_gpu_gather_lwe_async<Torus>(streams, gpu_indexes, active_gpu_count,
lwe_array_out, lwe_after_pbs_vec,
lut->h_lwe_indexes_out,
lut->using_trivial_lwe_indexes,
num_radix_blocks, big_lwe_dimension + 1);

/// Synchronize all GPUs
for (uint i = 0; i < active_gpu_count; i++) {
cuda_synchronize_stream(streams[i], gpu_indexes[i]);
}
}
}

template <typename Torus>
__host__ void integer_radix_apply_many_univariate_lookup_table_kb(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
Torus *lwe_array_out, Torus *lwe_array_in, void **bsks, Torus **ksks,
uint32_t num_radix_blocks, int_radix_lut<Torus> *lut, uint32_t lut_count,
uint32_t lut_stride) {
// apply_lookup_table
auto params = lut->params;
auto pbs_type = params.pbs_type;
auto big_lwe_dimension = params.big_lwe_dimension;
auto small_lwe_dimension = params.small_lwe_dimension;
auto ks_level = params.ks_level;
auto ks_base_log = params.ks_base_log;
auto pbs_level = params.pbs_level;
auto pbs_base_log = params.pbs_base_log;
auto glwe_dimension = params.glwe_dimension;
auto polynomial_size = params.polynomial_size;
auto grouping_factor = params.grouping_factor;

/// For multi GPU execution we create vectors of pointers for inputs and
/// outputs
std::vector<Torus *> lwe_array_in_vec = lut->lwe_array_in_vec;
std::vector<Torus *> lwe_after_ks_vec = lut->lwe_after_ks_vec;
std::vector<Torus *> lwe_after_pbs_vec = lut->lwe_after_pbs_vec;
std::vector<Torus *> lwe_trivial_indexes_vec = lut->lwe_trivial_indexes_vec;

auto active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
if (active_gpu_count == 1) {
execute_keyswitch_async<Torus>(streams, gpu_indexes, 1, lwe_after_ks_vec[0],
lwe_trivial_indexes_vec[0], lwe_array_in,
lut->lwe_indexes_in, ksks, big_lwe_dimension,
small_lwe_dimension, ks_base_log, ks_level,
num_radix_blocks);

/// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
/// dimension to a big LWE dimension
execute_pbs_async<Torus>(
streams, gpu_indexes, 1, lwe_array_out, lut->lwe_indexes_out,
lut->lut_vec, lut->lut_indexes_vec, lwe_after_ks_vec[0],
lwe_trivial_indexes_vec[0], bsks, lut->buffer, glwe_dimension,
small_lwe_dimension, polynomial_size, pbs_base_log, pbs_level,
grouping_factor, num_radix_blocks, pbs_type, lut_count, lut_stride);
} else {
/// Make sure all data that should be on GPU 0 is indeed there
cuda_synchronize_stream(streams[0], gpu_indexes[0]);

/// With multiple GPUs we push to the vectors on each GPU then when we
/// gather data to GPU 0 we can copy back to the original indexing
multi_gpu_scatter_lwe_async<Torus>(
streams, gpu_indexes, active_gpu_count, lwe_array_in_vec, lwe_array_in,
lut->h_lwe_indexes_in, lut->using_trivial_lwe_indexes, num_radix_blocks,
big_lwe_dimension + 1);

/// Apply KS to go from a big LWE dimension to a small LWE dimension
execute_keyswitch_async<Torus>(streams, gpu_indexes, active_gpu_count,
lwe_after_ks_vec, lwe_trivial_indexes_vec,
lwe_array_in_vec, lwe_trivial_indexes_vec,
ksks, big_lwe_dimension, small_lwe_dimension,
ks_base_log, ks_level, num_radix_blocks);

/// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
/// dimension to a big LWE dimension
execute_pbs_async<Torus>(
streams, gpu_indexes, active_gpu_count, lwe_after_pbs_vec,
lwe_trivial_indexes_vec, lut->lut_vec, lut->lut_indexes_vec,
lwe_after_ks_vec, lwe_trivial_indexes_vec, bsks, lut->buffer,
glwe_dimension, small_lwe_dimension, polynomial_size, pbs_base_log,
pbs_level, grouping_factor, num_radix_blocks, pbs_type, lut_count,
lut_stride);

/// Copy data back to GPU 0 and release vecs
multi_gpu_gather_lwe_async<Torus>(streams, gpu_indexes, active_gpu_count,
Expand Down Expand Up @@ -272,6 +360,10 @@ __host__ void integer_radix_apply_bivariate_lookup_table_kb(
auto polynomial_size = params.polynomial_size;
auto grouping_factor = params.grouping_factor;

// When only a single LWE is extracted, these parameters are dummy values
uint32_t lut_count = 1;
uint32_t lut_stride = 0;

// Left message is shifted
auto lwe_array_pbs_in = lut->tmp_lwe_before_ks;
pack_bivariate_blocks<Torus>(streams, gpu_indexes, gpu_count,
Expand Down Expand Up @@ -302,7 +394,7 @@ __host__ void integer_radix_apply_bivariate_lookup_table_kb(
lut->lut_vec, lut->lut_indexes_vec, lwe_after_ks_vec[0],
lwe_trivial_indexes_vec[0], bsks, lut->buffer, glwe_dimension,
small_lwe_dimension, polynomial_size, pbs_base_log, pbs_level,
grouping_factor, num_radix_blocks, pbs_type);
grouping_factor, num_radix_blocks, pbs_type, lut_count, lut_stride);
} else {
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
multi_gpu_scatter_lwe_async<Torus>(
Expand All @@ -324,7 +416,8 @@ __host__ void integer_radix_apply_bivariate_lookup_table_kb(
lwe_trivial_indexes_vec, lut->lut_vec, lut->lut_indexes_vec,
lwe_after_ks_vec, lwe_trivial_indexes_vec, bsks, lut->buffer,
glwe_dimension, small_lwe_dimension, polynomial_size, pbs_base_log,
pbs_level, grouping_factor, num_radix_blocks, pbs_type);
pbs_level, grouping_factor, num_radix_blocks, pbs_type, lut_count,
lut_stride);

/// Copy data back to GPU 0 and release vecs
multi_gpu_gather_lwe_async<Torus>(streams, gpu_indexes, active_gpu_count,
Expand Down Expand Up @@ -700,6 +793,9 @@ void host_full_propagate_inplace(cudaStream_t *streams, uint32_t *gpu_indexes,
int big_lwe_size = (params.glwe_dimension * params.polynomial_size + 1);
int small_lwe_size = (params.small_lwe_dimension + 1);

// When only a single LWE is extracted, these parameters are dummy values
uint32_t lut_count = 1;
uint32_t lut_stride = 0;
for (int i = 0; i < num_blocks; i++) {
auto cur_input_block = &input_blocks[i * big_lwe_size];

Expand All @@ -722,7 +818,7 @@ void host_full_propagate_inplace(cudaStream_t *streams, uint32_t *gpu_indexes,
mem_ptr->lut->lwe_trivial_indexes, bsks, mem_ptr->lut->buffer,
params.glwe_dimension, params.small_lwe_dimension,
params.polynomial_size, params.pbs_base_log, params.pbs_level,
params.grouping_factor, 2, params.pbs_type);
params.grouping_factor, 2, params.pbs_type, lut_count, lut_stride);

cuda_memcpy_async_gpu_to_gpu(cur_input_block, mem_ptr->tmp_big_lwe_vector,
big_lwe_size * sizeof(Torus), streams[0],
Expand Down Expand Up @@ -994,6 +1090,18 @@ void host_apply_univariate_lut_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
num_blocks, mem);
}

/// Host wrapper around
/// integer_radix_apply_many_univariate_lookup_table_kb<Torus>.
///
/// It adds no logic of its own: every argument is passed through, only in the
/// order the callee expects. `mem` is the LUT object to apply, and `lut_count`
/// / `lut_stride` are forwarded untouched.
template <typename Torus>
void host_apply_many_univariate_lut_kb(
    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
    Torus *radix_lwe_out, Torus *radix_lwe_in, int_radix_lut<Torus> *mem,
    Torus **ksks, void **bsks, uint32_t num_blocks, uint32_t lut_count,
    uint32_t lut_stride) {

  // The callee takes the keys as (bsks, ksks) and the LUT object after the
  // block count, hence the reordering relative to this function's signature.
  integer_radix_apply_many_univariate_lookup_table_kb<Torus>(
      streams, gpu_indexes, gpu_count, // execution resources
      radix_lwe_out, radix_lwe_in,     // output / input LWE arrays
      bsks, ksks,                      // bootstrapping, then keyswitching keys
      num_blocks, mem,                 // block count, then LUT object
      lut_count, lut_stride);
}

template <typename Torus>
void scratch_cuda_apply_bivariate_lut_kb(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -206,6 +206,10 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(
auto small_lwe_dimension = mem_ptr->params.small_lwe_dimension;
auto small_lwe_size = small_lwe_dimension + 1;

// When only a single LWE is extracted, these parameters are dummy values
uint32_t lut_count = 1;
uint32_t lut_stride = 0;

if (num_radix_in_vec == 0)
return;
if (num_radix_in_vec == 1) {
Expand Down Expand Up @@ -364,7 +368,7 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(
glwe_dimension, small_lwe_dimension, polynomial_size,
mem_ptr->params.pbs_base_log, mem_ptr->params.pbs_level,
mem_ptr->params.grouping_factor, total_count,
mem_ptr->params.pbs_type);
mem_ptr->params.pbs_type, lut_count, lut_stride);
} else {
cuda_synchronize_stream(streams[0], gpu_indexes[0]);

Expand Down Expand Up @@ -412,7 +416,7 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(
glwe_dimension, small_lwe_dimension, polynomial_size,
mem_ptr->params.pbs_base_log, mem_ptr->params.pbs_level,
mem_ptr->params.grouping_factor, total_count,
mem_ptr->params.pbs_type);
mem_ptr->params.pbs_type, lut_count, lut_stride);

multi_gpu_gather_lwe_async<Torus>(
streams, gpu_indexes, active_gpu_count, new_blocks, lwe_after_pbs_vec,
Expand Down
Loading

0 comments on commit 1f159d2

Please sign in to comment.