Skip to content

Commit

Permalink
refactor(gpu): remove max_shared_memory from pbs arguments
Browse files Browse the repository at this point in the history
Always use max shared memory from device 0 to configure the
kernels, to avoid bugs with multi-GPU configurations
  • Loading branch information
kc1212 authored and agnesLeroy committed Aug 1, 2024
1 parent 351fc47 commit 5547d92
Show file tree
Hide file tree
Showing 24 changed files with 489 additions and 622 deletions.
3 changes: 1 addition & 2 deletions backends/tfhe-cuda-backend/cuda/include/integer.h
Original file line number Diff line number Diff line change
Expand Up @@ -526,8 +526,7 @@ template <typename Torus> struct int_radix_lut {
execute_scratch_pbs<Torus>(
streams[i], gpu_indexes[i], &gpu_pbs_buffer, params.glwe_dimension,
params.small_lwe_dimension, params.polynomial_size, params.pbs_level,
params.grouping_factor, num_blocks_on_gpu,
cuda_get_max_shared_memory(gpu_indexes[i]), params.pbs_type,
params.grouping_factor, num_blocks_on_gpu, params.pbs_type,
allocate_gpu_memory);
cuda_synchronize_stream(streams[i], gpu_indexes[i]);
buffer.push_back(gpu_pbs_buffer);
Expand Down
57 changes: 24 additions & 33 deletions backends/tfhe-cuda-backend/cuda/include/programmable_bootstrap.h
Original file line number Diff line number Diff line change
Expand Up @@ -26,30 +26,28 @@ void cuda_convert_lwe_programmable_bootstrap_key_64(
void scratch_cuda_programmable_bootstrap_amortized_32(
void *stream, uint32_t gpu_index, int8_t **pbs_buffer,
uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
bool allocate_gpu_memory);
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);

void scratch_cuda_programmable_bootstrap_amortized_64(
void *stream, uint32_t gpu_index, int8_t **pbs_buffer,
uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
bool allocate_gpu_memory);
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);

void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_32(
void *stream, uint32_t gpu_index, void *lwe_array_out,
void *lwe_output_indexes, void *lut_vector, void *lut_vector_indexes,
void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key,
int8_t *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
uint32_t num_samples, uint32_t max_shared_memory);
uint32_t num_samples);

void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_64(
void *stream, uint32_t gpu_index, void *lwe_array_out,
void *lwe_output_indexes, void *lut_vector, void *lut_vector_indexes,
void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key,
int8_t *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
uint32_t num_samples, uint32_t max_shared_memory);
uint32_t num_samples);

void cleanup_cuda_programmable_bootstrap_amortized(void *stream,
uint32_t gpu_index,
Expand All @@ -58,41 +56,39 @@ void cleanup_cuda_programmable_bootstrap_amortized(void *stream,
void scratch_cuda_programmable_bootstrap_32(
void *stream, uint32_t gpu_index, int8_t **buffer, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
bool allocate_gpu_memory);
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);

void scratch_cuda_programmable_bootstrap_64(
void *stream, uint32_t gpu_index, int8_t **buffer, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
bool allocate_gpu_memory);
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);

void cuda_programmable_bootstrap_lwe_ciphertext_vector_32(
void *stream, uint32_t gpu_index, void *lwe_array_out,
void *lwe_output_indexes, void *lut_vector, void *lut_vector_indexes,
void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key,
int8_t *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
uint32_t num_samples, uint32_t max_shared_memory);
uint32_t num_samples);

void cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
void *stream, uint32_t gpu_index, void *lwe_array_out,
void *lwe_output_indexes, void *lut_vector, void *lut_vector_indexes,
void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key,
int8_t *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
uint32_t num_samples, uint32_t max_shared_memory);
uint32_t num_samples);

void cleanup_cuda_programmable_bootstrap(void *stream, uint32_t gpu_index,
int8_t **pbs_buffer);

uint64_t get_buffer_size_programmable_bootstrap_amortized_64(
uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory);
uint32_t input_lwe_ciphertext_count);

uint64_t get_buffer_size_programmable_bootstrap_64(
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory);
uint32_t input_lwe_ciphertext_count);
}

template <typename Torus>
Expand Down Expand Up @@ -155,7 +151,7 @@ get_buffer_size_partial_sm_programmable_bootstrap_cg(uint32_t polynomial_size) {
template <typename Torus>
__host__ bool
supports_distributed_shared_memory_on_classic_programmable_bootstrap(
uint32_t polynomial_size, uint32_t max_shared_memory);
uint32_t polynomial_size);

template <typename Torus, PBS_TYPE pbs_type> struct pbs_buffer;

Expand All @@ -174,7 +170,7 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::CLASSICAL> {

this->pbs_variant = pbs_variant;

auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
auto max_shared_memory = cuda_get_max_shared_memory(0);

if (allocate_gpu_memory) {
switch (pbs_variant) {
Expand Down Expand Up @@ -251,7 +247,7 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::CLASSICAL> {

bool supports_dsm =
supports_distributed_shared_memory_on_classic_programmable_bootstrap<
Torus>(polynomial_size, max_shared_memory);
Torus>(polynomial_size);

uint64_t full_sm =
get_buffer_size_full_sm_programmable_bootstrap_tbc<Torus>(
Expand Down Expand Up @@ -310,10 +306,10 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::CLASSICAL> {
};

template <typename Torus>
__host__ __device__ uint64_t get_buffer_size_programmable_bootstrap_cg(
__host__ uint64_t get_buffer_size_programmable_bootstrap_cg(
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory) {

uint32_t input_lwe_ciphertext_count) {
int max_shared_memory = cuda_get_max_shared_memory(0);
uint64_t full_sm =
get_buffer_size_full_sm_programmable_bootstrap_cg<Torus>(polynomial_size);
uint64_t partial_sm =
Expand All @@ -339,8 +335,7 @@ template <typename Torus>
bool has_support_to_cuda_programmable_bootstrap_cg(uint32_t glwe_dimension,
uint32_t polynomial_size,
uint32_t level_count,
uint32_t num_samples,
uint32_t max_shared_memory);
uint32_t num_samples);

template <typename Torus>
void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector(
Expand All @@ -349,7 +344,7 @@ void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector(
Torus *lwe_array_in, Torus *lwe_input_indexes, double2 *bootstrapping_key,
pbs_buffer<Torus, CLASSICAL> *buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, uint32_t num_samples, uint32_t max_shared_memory);
uint32_t level_count, uint32_t num_samples);

template <typename Torus>
void cuda_programmable_bootstrap_lwe_ciphertext_vector(
Expand All @@ -358,7 +353,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector(
Torus *lwe_array_in, Torus *lwe_input_indexes, double2 *bootstrapping_key,
pbs_buffer<Torus, CLASSICAL> *buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, uint32_t num_samples, uint32_t max_shared_memory);
uint32_t level_count, uint32_t num_samples);

#if (CUDA_ARCH >= 900)
template <typename Torus>
Expand All @@ -368,36 +363,32 @@ void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector(
Torus *lwe_array_in, Torus *lwe_input_indexes, double2 *bootstrapping_key,
pbs_buffer<Torus, CLASSICAL> *buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, uint32_t num_samples, uint32_t max_shared_memory);
uint32_t level_count, uint32_t num_samples);

template <typename Torus>
void scratch_cuda_programmable_bootstrap_tbc(
void *stream, uint32_t gpu_index, pbs_buffer<Torus, CLASSICAL> **pbs_buffer,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
bool allocate_gpu_memory);
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);
#endif

template <typename Torus>
void scratch_cuda_programmable_bootstrap_cg(
void *stream, uint32_t gpu_index, pbs_buffer<Torus, CLASSICAL> **pbs_buffer,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
bool allocate_gpu_memory);
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);

template <typename Torus>
void scratch_cuda_programmable_bootstrap(
void *stream, uint32_t gpu_index, pbs_buffer<Torus, CLASSICAL> **buffer,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
bool allocate_gpu_memory);
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);

template <typename Torus>
bool has_support_to_cuda_programmable_bootstrap_tbc(uint32_t num_samples,
uint32_t glwe_dimension,
uint32_t polynomial_size,
uint32_t level_count,
uint32_t max_shared_memory);
uint32_t level_count);

#ifdef __CUDACC__
__device__ inline int get_start_ith_ggsw(int i, uint32_t polynomial_size,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ extern "C" {

bool has_support_to_cuda_programmable_bootstrap_cg_multi_bit(
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t num_samples, uint32_t max_shared_memory);
uint32_t num_samples);

void cuda_convert_lwe_multi_bit_programmable_bootstrap_key_64(
void *stream, uint32_t gpu_index, void *dest, void *src,
Expand All @@ -19,17 +19,16 @@ void scratch_cuda_multi_bit_programmable_bootstrap_64(
void *stream, uint32_t gpu_index, int8_t **pbs_buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t level_count, uint32_t grouping_factor,
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
bool allocate_gpu_memory, uint32_t chunk_size = 0);
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory,
uint32_t chunk_size = 0);

void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64(
void *stream, uint32_t gpu_index, void *lwe_array_out,
void *lwe_output_indexes, void *lut_vector, void *lut_vector_indexes,
void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key,
int8_t *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log,
uint32_t level_count, uint32_t num_samples, uint32_t max_shared_memory,
uint32_t lwe_chunk_size = 0);
uint32_t level_count, uint32_t num_samples, uint32_t lwe_chunk_size = 0);

void cleanup_cuda_multi_bit_programmable_bootstrap(void *stream,
uint32_t gpu_index,
Expand All @@ -39,21 +38,21 @@ void cleanup_cuda_multi_bit_programmable_bootstrap(void *stream,
template <typename Torus>
__host__ bool
supports_distributed_shared_memory_on_multibit_programmable_bootstrap(
uint32_t polynomial_size, uint32_t max_shared_memory);
uint32_t polynomial_size);

template <typename Torus>
bool has_support_to_cuda_programmable_bootstrap_tbc_multi_bit(
uint32_t num_samples, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t level_count, uint32_t max_shared_memory);
uint32_t level_count);

#if CUDA_ARCH >= 900
template <typename Torus>
void scratch_cuda_tbc_multi_bit_programmable_bootstrap(
void *stream, uint32_t gpu_index, pbs_buffer<Torus, MULTI_BIT> **buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t level_count, uint32_t grouping_factor,
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
bool allocate_gpu_memory, uint32_t lwe_chunk_size);
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory,
uint32_t lwe_chunk_size);

template <typename Torus>
void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
Expand All @@ -63,23 +62,23 @@ void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
pbs_buffer<Torus, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
uint32_t max_shared_memory, uint32_t lwe_chunk_size);
uint32_t lwe_chunk_size);
#endif

template <typename Torus>
void scratch_cuda_cg_multi_bit_programmable_bootstrap(
void *stream, uint32_t gpu_index, pbs_buffer<Torus, MULTI_BIT> **pbs_buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t level_count, uint32_t grouping_factor,
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
bool allocate_gpu_memory, uint32_t lwe_chunk_size = 0);
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory,
uint32_t lwe_chunk_size = 0);

template <typename Torus>
void scratch_cuda_cg_multi_bit_programmable_bootstrap(
void *stream, uint32_t gpu_index, pbs_buffer<Torus, MULTI_BIT> **pbs_buffer,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
bool allocate_gpu_memory, uint32_t lwe_chunk_size = 0);
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory,
uint32_t lwe_chunk_size = 0);

template <typename Torus>
void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
Expand All @@ -89,15 +88,15 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
pbs_buffer<Torus, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
uint32_t max_shared_memory, uint32_t lwe_chunk_size = 0);
uint32_t lwe_chunk_size = 0);

template <typename Torus>
void scratch_cuda_multi_bit_programmable_bootstrap(
void *stream, uint32_t gpu_index, pbs_buffer<Torus, MULTI_BIT> **pbs_buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t level_count, uint32_t grouping_factor,
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
bool allocate_gpu_memory, uint32_t lwe_chunk_size = 0);
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory,
uint32_t lwe_chunk_size = 0);

template <typename Torus>
void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
Expand All @@ -107,7 +106,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
pbs_buffer<Torus, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
uint32_t max_shared_memory, uint32_t lwe_chunk_size = 0);
uint32_t lwe_chunk_size = 0);

template <typename Torus>
__host__ __device__ uint64_t
Expand Down Expand Up @@ -314,7 +313,6 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::MULTI_BIT> {

template <typename Torus, class params>
__host__ uint32_t get_lwe_chunk_size(uint32_t gpu_index, uint32_t max_num_pbs,
uint32_t polynomial_size,
uint32_t max_shared_memory);
uint32_t polynomial_size);

#endif // CUDA_MULTI_BIT_H
1 change: 0 additions & 1 deletion backends/tfhe-cuda-backend/cuda/src/device.cu
Original file line number Diff line number Diff line change
Expand Up @@ -243,7 +243,6 @@ void cuda_drop_async(void *ptr, cudaStream_t stream, uint32_t gpu_index) {

/// Get the maximum size for the shared memory
int cuda_get_max_shared_memory(uint32_t gpu_index) {
check_cuda_error(cudaSetDevice(gpu_index));
int max_shared_memory = 0;
cudaDeviceGetAttribute(&max_shared_memory, cudaDevAttrMaxSharedMemoryPerBlock,
gpu_index);
Expand Down
15 changes: 5 additions & 10 deletions backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -177,8 +177,7 @@ __host__ void integer_radix_apply_univariate_lookup_table_kb(
lut->lut_vec, lut->lut_indexes_vec, lwe_after_ks_vec[0],
lwe_trivial_indexes_vec[0], bsks, lut->buffer, glwe_dimension,
small_lwe_dimension, polynomial_size, pbs_base_log, pbs_level,
grouping_factor, num_radix_blocks,
cuda_get_max_shared_memory(gpu_indexes[0]), pbs_type);
grouping_factor, num_radix_blocks, pbs_type);
} else {
/// Make sure all data that should be on GPU 0 is indeed there
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
Expand All @@ -204,8 +203,7 @@ __host__ void integer_radix_apply_univariate_lookup_table_kb(
lwe_trivial_indexes_vec, lut->lut_vec, lut->lut_indexes_vec,
lwe_after_ks_vec, lwe_trivial_indexes_vec, bsks, lut->buffer,
glwe_dimension, small_lwe_dimension, polynomial_size, pbs_base_log,
pbs_level, grouping_factor, num_radix_blocks,
cuda_get_max_shared_memory(gpu_indexes[0]), pbs_type);
pbs_level, grouping_factor, num_radix_blocks, pbs_type);

/// Copy data back to GPU 0 and release vecs
multi_gpu_gather_lwe_async<Torus>(streams, gpu_indexes, active_gpu_count,
Expand Down Expand Up @@ -270,8 +268,7 @@ __host__ void integer_radix_apply_bivariate_lookup_table_kb(
lut->lut_vec, lut->lut_indexes_vec, lwe_after_ks_vec[0],
lwe_trivial_indexes_vec[0], bsks, lut->buffer, glwe_dimension,
small_lwe_dimension, polynomial_size, pbs_base_log, pbs_level,
grouping_factor, num_radix_blocks,
cuda_get_max_shared_memory(gpu_indexes[0]), pbs_type);
grouping_factor, num_radix_blocks, pbs_type);
} else {
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
multi_gpu_scatter_lwe_async<Torus>(
Expand All @@ -293,8 +290,7 @@ __host__ void integer_radix_apply_bivariate_lookup_table_kb(
lwe_trivial_indexes_vec, lut->lut_vec, lut->lut_indexes_vec,
lwe_after_ks_vec, lwe_trivial_indexes_vec, bsks, lut->buffer,
glwe_dimension, small_lwe_dimension, polynomial_size, pbs_base_log,
pbs_level, grouping_factor, num_radix_blocks,
cuda_get_max_shared_memory(gpu_indexes[0]), pbs_type);
pbs_level, grouping_factor, num_radix_blocks, pbs_type);

/// Copy data back to GPU 0 and release vecs
multi_gpu_gather_lwe_async<Torus>(streams, gpu_indexes, active_gpu_count,
Expand Down Expand Up @@ -696,8 +692,7 @@ void host_full_propagate_inplace(cudaStream_t *streams, uint32_t *gpu_indexes,
mem_ptr->lut->lwe_trivial_indexes, bsks, mem_ptr->lut->buffer,
params.glwe_dimension, params.small_lwe_dimension,
params.polynomial_size, params.pbs_base_log, params.pbs_level,
params.grouping_factor, 2, cuda_get_max_shared_memory(gpu_indexes[0]),
params.pbs_type);
params.grouping_factor, 2, params.pbs_type);

cuda_memcpy_async_gpu_to_gpu(cur_input_block, mem_ptr->tmp_big_lwe_vector,
big_lwe_size * sizeof(Torus), streams[0],
Expand Down
Loading

0 comments on commit 5547d92

Please sign in to comment.