From 601baafc013b55ba767e54b6eb028a000a8cce4d Mon Sep 17 00:00:00 2001 From: Pedro Alves Date: Thu, 14 Mar 2024 17:19:40 -0300 Subject: [PATCH] feat(gpu): Implements a classical PBS variant that uses thread block cluster and distributed shared memory --- .../tfhe-cuda-backend/cuda/CMakeLists.txt | 4 +- .../tfhe-cuda-backend/cuda/include/device.h | 1 - .../cuda/include/programmable_bootstrap.h | 97 +++- .../include/programmable_bootstrap_multibit.h | 7 +- .../cuda/src/pbs/programmable_bootstrap.cu | 52 ++- .../cuda/src/pbs/programmable_bootstrap.cuh | 31 +- .../src/pbs/programmable_bootstrap_classic.cu | 269 ++++++++++- .../pbs/programmable_bootstrap_multibit.cu | 67 ++- .../programmable_bootstrap_tbc_classic.cuh | 424 ++++++++++++++++++ .../programmable_bootstrap_tbc_multibit.cuh | 115 +++-- .../benchmarks/benchmark_pbs.cpp | 75 +++- .../tests/test_classical_pbs.cpp | 4 +- .../tests/test_multibit_pbs.cpp | 19 +- 13 files changed, 1059 insertions(+), 106 deletions(-) create mode 100644 backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_classic.cuh diff --git a/backends/tfhe-cuda-backend/cuda/CMakeLists.txt b/backends/tfhe-cuda-backend/cuda/CMakeLists.txt index ad452451ed..61404d0400 100644 --- a/backends/tfhe-cuda-backend/cuda/CMakeLists.txt +++ b/backends/tfhe-cuda-backend/cuda/CMakeLists.txt @@ -61,8 +61,8 @@ if(${CUDA_SUCCESS}) string(REPLACE "-arch=sm_" "" CUDA_ARCH "${ARCH}") set(CUDA_ARCH "${CUDA_ARCH}0") else() - set(CMAKE_CUDA_ARCHITECTURES 70) - set(CUDA_ARCH "700") + set(CMAKE_CUDA_ARCHITECTURES 90) + set(CUDA_ARCH "900") endif() add_compile_definitions(CUDA_ARCH=${CUDA_ARCH}) diff --git a/backends/tfhe-cuda-backend/cuda/include/device.h b/backends/tfhe-cuda-backend/cuda/include/device.h index 58ab4a922a..1501b07f2e 100644 --- a/backends/tfhe-cuda-backend/cuda/include/device.h +++ b/backends/tfhe-cuda-backend/cuda/include/device.h @@ -8,7 +8,6 @@ #include #define synchronize_threads_in_block() __syncthreads() - extern "C" { #define check_cuda_error(ans) \ diff --git a/backends/tfhe-cuda-backend/cuda/include/programmable_bootstrap.h b/backends/tfhe-cuda-backend/cuda/include/programmable_bootstrap.h index 941f27ac75..41e54a6d56 100644 --- a/backends/tfhe-cuda-backend/cuda/include/programmable_bootstrap.h +++ b/backends/tfhe-cuda-backend/cuda/include/programmable_bootstrap.h @@ -111,6 +111,28 @@ get_buffer_size_partial_sm_programmable_bootstrap(uint32_t polynomial_size) { return sizeof(double2) * polynomial_size / 2; // accumulator fft } +template +__host__ __device__ uint64_t +get_buffer_size_full_sm_programmable_bootstrap_tbc(uint32_t polynomial_size) { + return sizeof(Torus) * polynomial_size + // accumulator_rotated + sizeof(Torus) * polynomial_size + // accumulator + sizeof(double2) * polynomial_size / 2; // accumulator fft +} + +template +__host__ __device__ uint64_t +get_buffer_size_partial_sm_programmable_bootstrap_tbc( + uint32_t polynomial_size) { + return sizeof(double2) * polynomial_size / 2; // accumulator fft mask & body +} + +template +__host__ __device__ uint64_t +get_buffer_size_sm_dsm_plus_tbc_classic_programmable_bootstrap( + uint32_t polynomial_size) { + return sizeof(double2) * polynomial_size / 2; // tbc +} + template __host__ __device__ uint64_t get_buffer_size_full_sm_programmable_bootstrap_cg(uint32_t polynomial_size) { @@ -125,6 +147,11 @@ get_buffer_size_partial_sm_programmable_bootstrap_cg(uint32_t polynomial_size) { return sizeof(double2) * polynomial_size / 2; // accumulator fft mask & body } +template +__host__ bool 
+supports_distributed_shared_memory_on_classic_programmable_bootstrap( + uint32_t polynomial_size, uint32_t max_shared_memory); + template struct pbs_buffer; template struct pbs_buffer { @@ -213,6 +240,46 @@ template struct pbs_buffer { polynomial_size / 2 * sizeof(double2), stream); } break; +#if CUDA_ARCH >= 900 + case PBS_VARIANT::TBC: { + + bool supports_dsm = + supports_distributed_shared_memory_on_classic_programmable_bootstrap< + Torus>(polynomial_size, max_shared_memory); + + uint64_t full_sm = + get_buffer_size_full_sm_programmable_bootstrap_tbc( + polynomial_size); + uint64_t partial_sm = + get_buffer_size_partial_sm_programmable_bootstrap_tbc( + polynomial_size); + uint64_t minimum_sm_tbc = 0; + if (supports_dsm) + minimum_sm_tbc = + get_buffer_size_sm_dsm_plus_tbc_classic_programmable_bootstrap< + Torus>(polynomial_size); + + uint64_t partial_dm = full_sm - partial_sm; + uint64_t full_dm = full_sm; + uint64_t device_mem = 0; + + if (max_shared_memory < partial_sm + minimum_sm_tbc) { + device_mem = full_dm * input_lwe_ciphertext_count * level_count * + (glwe_dimension + 1); + } else if (max_shared_memory < full_sm + minimum_sm_tbc) { + device_mem = partial_dm * input_lwe_ciphertext_count * level_count * + (glwe_dimension + 1); + } + + // Otherwise, both kernels run all in shared memory + d_mem = (int8_t *)cuda_malloc_async(device_mem, stream); + + global_accumulator_fft = (double2 *)cuda_malloc_async( + (glwe_dimension + 1) * level_count * input_lwe_ciphertext_count * + polynomial_size / 2 * sizeof(double2), + stream); + } break; +#endif default: PANIC("Cuda error (PBS): unsupported implementation variant.") } @@ -281,6 +348,25 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector( uint32_t level_count, uint32_t num_samples, uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory); +#if (CUDA_ARCH >= 900) +template +void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector( + cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_output_indexes, + Torus *lut_vector, Torus *lut_vector_indexes, Torus *lwe_array_in, + Torus *lwe_input_indexes, double2 *bootstrapping_key, + pbs_buffer *buffer, uint32_t lwe_dimension, + uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, + uint32_t level_count, uint32_t num_samples, uint32_t num_luts, + uint32_t lwe_idx, uint32_t max_shared_memory); + +template +void scratch_cuda_programmable_bootstrap_tbc( + cuda_stream_t *stream, pbs_buffer **pbs_buffer, + uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count, + uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory, + bool allocate_gpu_memory); +#endif + template void scratch_cuda_programmable_bootstrap_cg( cuda_stream_t *stream, pbs_buffer **pbs_buffer, @@ -295,11 +381,12 @@ void scratch_cuda_programmable_bootstrap( uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory, bool allocate_gpu_memory); -template -__device__ int get_this_block_rank(G &group, bool support_dsm); -template -__device__ double2 *get_join_buffer_element(int i, G &group, bool support_dsm, - double2 *global_memory_buffer); +template +bool has_support_to_cuda_programmable_bootstrap_tbc(uint32_t num_samples, + uint32_t glwe_dimension, + uint32_t polynomial_size, + uint32_t level_count, + uint32_t max_shared_memory); #ifdef __CUDACC__ __device__ inline int get_start_ith_ggsw(int i, uint32_t polynomial_size, diff --git a/backends/tfhe-cuda-backend/cuda/include/programmable_bootstrap_multibit.h 
b/backends/tfhe-cuda-backend/cuda/include/programmable_bootstrap_multibit.h index a83cd69c6b..6fa788e2d6 100644 --- a/backends/tfhe-cuda-backend/cuda/include/programmable_bootstrap_multibit.h +++ b/backends/tfhe-cuda-backend/cuda/include/programmable_bootstrap_multibit.h @@ -58,8 +58,10 @@ supports_distributed_shared_memory_on_multibit_programmable_bootstrap( template bool has_support_to_cuda_programmable_bootstrap_tbc_multi_bit( - uint32_t polynomial_size, uint32_t max_shared_memory); + uint32_t num_samples, uint32_t glwe_dimension, uint32_t polynomial_size, + uint32_t level_count, uint32_t max_shared_memory); +#if CUDA_ARCH >= 900 template void scratch_cuda_tbc_multi_bit_programmable_bootstrap( cuda_stream_t *stream, pbs_buffer **buffer, @@ -78,6 +80,7 @@ void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( uint32_t base_log, uint32_t level_count, uint32_t num_samples, uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory, uint32_t lwe_chunk_size); +#endif template void scratch_cuda_cg_multi_bit_programmable_bootstrap( @@ -306,7 +309,7 @@ template struct pbs_buffer { }; template -__host__ uint32_t get_lwe_chunk_size(int gpu_index, uint32_t max_num_pbs, +__host__ uint32_t get_lwe_chunk_size(uint32_t gpu_index, uint32_t max_num_pbs, uint32_t polynomial_size, uint32_t max_shared_memory); diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap.cu b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap.cu index abc7eff345..9ea889dfc7 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap.cu +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap.cu @@ -1,40 +1,44 @@ #include "programmable_bootstrap.cuh" - -template <> __device__ int get_this_block_rank(grid_group &group, bool support_dsm) { +template <> +__device__ int get_this_block_rank(grid_group &group, bool support_dsm) { return blockIdx.y; } -template <> __device__ int get_this_block_rank(cluster_group &cluster, bool support_dsm) { +template <> +__device__ double2 * +get_join_buffer_element(int level_id, int glwe_id, grid_group &group, + double2 *global_memory_buffer, uint32_t polynomial_size, + uint32_t glwe_dimension, bool support_dsm) { + double2 *buffer_slice = + global_memory_buffer + + (glwe_id + level_id * (glwe_dimension + 1)) * polynomial_size / 2; + return buffer_slice; +} + +#if CUDA_ARCH >= 900 +template <> +__device__ int get_this_block_rank(cluster_group &cluster, bool support_dsm) { if (support_dsm) return cluster.block_rank(); else return blockIdx.y; } - -template<> __device__ double2 *get_join_buffer_element(int i, grid_group &group, - bool support_dsm, - double2 *global_memory_buffer, uint32_t - polynomial_size) { - double2 *buffer_slice = global_memory_buffer + i * polynomial_size / 2; - return buffer_slice; -} - -template<> __device__ double2 *get_join_buffer_element(int i, cluster_group &cluster, - bool support_dsm, - double2 *global_memory_buffer, uint32_t - polynomial_size) { -#if CUDA_ARCH < 900 - double2 *buffer_slice = - global_memory_buffer + blockIdx.y * polynomial_size / 2; -#else +template <> +__device__ double2 * +get_join_buffer_element(int level_id, int glwe_id, cluster_group &cluster, + double2 *global_memory_buffer, uint32_t polynomial_size, + uint32_t glwe_dimension, bool support_dsm) { double2 *buffer_slice; if (support_dsm) { extern __shared__ double2 smem[]; - buffer_slice = cluster.map_shared_rank(smem, i); + buffer_slice = cluster.map_shared_rank( + smem, glwe_id + level_id * (glwe_dimension + 1)); } else { 
- buffer_slice = global_memory_buffer + i * polynomial_size / 2; + buffer_slice = + global_memory_buffer + + (glwe_id + level_id * (glwe_dimension + 1)) * polynomial_size / 2; } -#endif return buffer_slice; -} \ No newline at end of file +} +#endif diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap.cuh index 5b39caaed5..4fb779b40b 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap.cuh @@ -15,8 +15,10 @@ template __device__ int get_this_block_rank(G &group, bool support_dsm); template -__device__ double2 *get_join_buffer_element(int i, G &group, bool support_dsm, - double2 *global_memory_buffer, uint32_t polynomial_size); +__device__ double2 * +get_join_buffer_element(int level_id, int glwe_id, G &group, + double2 *global_memory_buffer, uint32_t polynomial_size, + uint32_t glwe_dimension, bool support_dsm); template __device__ void mul_ggsw_glwe(Torus *accumulator, double2 *fft, @@ -38,18 +40,15 @@ __device__ void mul_ggsw_glwe(Torus *accumulator, double2 *fft, bootstrapping_key, iteration, blockIdx.y, blockIdx.x, polynomial_size, glwe_dimension, level_count); - // Selects all GLWEs in a particular decomposition level - auto level_join_buffer = - join_buffer + blockIdx.x * (glwe_dimension + 1) * params::degree / 2; - // Perform the matrix multiplication between the GGSW and the GLWE, // each block operating on a single level for mask and body // The first product is used to initialize level_join_buffer auto bsk_poly = bsk_slice + blockIdx.y * params::degree / 2; auto this_block_rank = get_this_block_rank(group, support_dsm); - auto buffer_slice = get_join_buffer_element( - this_block_rank, group, support_dsm, level_join_buffer, polynomial_size); + auto buffer_slice = + get_join_buffer_element(blockIdx.x, blockIdx.y, group, join_buffer, + polynomial_size, glwe_dimension, support_dsm); int tid = threadIdx.x; for (int i = 0; i < params::opt / 2; i++) { @@ -65,8 +64,9 @@ __device__ void mul_ggsw_glwe(Torus *accumulator, double2 *fft, int idx = (j + this_block_rank) % (glwe_dimension + 1); auto bsk_poly = bsk_slice + idx * params::degree / 2; - auto buffer_slice = get_join_buffer_element( - idx, group, support_dsm, level_join_buffer, polynomial_size); + auto buffer_slice = get_join_buffer_element(blockIdx.x, idx, group, + join_buffer, polynomial_size, + glwe_dimension, support_dsm); int tid = threadIdx.x; for (int i = 0; i < params::opt / 2; i++) { @@ -80,8 +80,9 @@ __device__ void mul_ggsw_glwe(Torus *accumulator, double2 *fft, // All blocks are synchronized here; after this sync, level_join_buffer has // the values needed from every other block - auto src_acc = get_join_buffer_element(blockIdx.y, group, - support_dsm, join_buffer, polynomial_size); + auto src_acc = + get_join_buffer_element(0, blockIdx.y, group, join_buffer, + polynomial_size, glwe_dimension, support_dsm); // copy first product into fft buffer tid = threadIdx.x; @@ -93,7 +94,9 @@ __device__ void mul_ggsw_glwe(Torus *accumulator, double2 *fft, // accumulate rest of the products into fft buffer for (int l = 1; l < gridDim.x; l++) { - auto cur_src_acc = &src_acc[l * (glwe_dimension + 1) * params::degree / 2]; + auto cur_src_acc = get_join_buffer_element(l, blockIdx.y, group, + join_buffer, polynomial_size, + glwe_dimension, support_dsm); tid = threadIdx.x; for (int i = 0; i < params::opt / 2; i++) { fft[tid] += cur_src_acc[tid]; @@ -222,4 
+225,4 @@ void execute_scratch_pbs(cuda_stream_t *stream, int8_t **pbs_buffer, } } -#endif \ No newline at end of file +#endif diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cu b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cu index 12bb26c2eb..0682071563 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cu +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cu @@ -1,5 +1,8 @@ #include "programmable_bootstrap_cg_classic.cuh" #include "programmable_bootstrap_classic.cuh" +#if (CUDA_ARCH >= 900) +#include "programmable_bootstrap_tbc_classic.cuh" +#endif template bool has_support_to_cuda_programmable_bootstrap_cg(uint32_t glwe_dimension, @@ -12,6 +15,176 @@ bool has_support_to_cuda_programmable_bootstrap_cg(uint32_t glwe_dimension, max_shared_memory); } +template +bool has_support_to_cuda_programmable_bootstrap_tbc( + uint32_t num_samples, uint32_t glwe_dimension, uint32_t polynomial_size, + uint32_t level_count, uint32_t max_shared_memory) { +#if CUDA_ARCH >= 900 + switch (polynomial_size) { + case 256: + return supports_thread_block_clusters_on_classic_programmable_bootstrap< + Torus, AmortizedDegree<256>>(num_samples, glwe_dimension, + polynomial_size, level_count, + max_shared_memory); + case 512: + return supports_thread_block_clusters_on_classic_programmable_bootstrap< + Torus, AmortizedDegree<512>>(num_samples, glwe_dimension, + polynomial_size, level_count, + max_shared_memory); + case 1024: + return supports_thread_block_clusters_on_classic_programmable_bootstrap< + Torus, AmortizedDegree<1024>>(num_samples, glwe_dimension, + polynomial_size, level_count, + max_shared_memory); + case 2048: + return supports_thread_block_clusters_on_classic_programmable_bootstrap< + Torus, AmortizedDegree<2048>>(num_samples, glwe_dimension, + polynomial_size, level_count, + max_shared_memory); + case 4096: + return supports_thread_block_clusters_on_classic_programmable_bootstrap< + Torus, AmortizedDegree<4096>>(num_samples, glwe_dimension, + polynomial_size, level_count, + max_shared_memory); + case 8192: + return supports_thread_block_clusters_on_classic_programmable_bootstrap< + Torus, AmortizedDegree<8192>>(num_samples, glwe_dimension, + polynomial_size, level_count, + max_shared_memory); + case 16384: + return supports_thread_block_clusters_on_classic_programmable_bootstrap< + Torus, AmortizedDegree<16384>>(num_samples, glwe_dimension, + polynomial_size, level_count, + max_shared_memory); + default: + PANIC("Cuda error (multi-bit PBS): unsupported polynomial size. 
Supported " + "N's are powers of two" + " in the interval [256..16384].") + } +#else + return false; +#endif +} + +#if (CUDA_ARCH >= 900) +template +void scratch_cuda_programmable_bootstrap_tbc( + cuda_stream_t *stream, pbs_buffer **pbs_buffer, + uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count, + uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory, + bool allocate_gpu_memory) { + + switch (polynomial_size) { + case 256: + scratch_programmable_bootstrap_tbc>( + stream, pbs_buffer, glwe_dimension, polynomial_size, level_count, + input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory); + break; + case 512: + scratch_programmable_bootstrap_tbc>( + stream, pbs_buffer, glwe_dimension, polynomial_size, level_count, + input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory); + break; + case 1024: + scratch_programmable_bootstrap_tbc>( + stream, pbs_buffer, glwe_dimension, polynomial_size, level_count, + input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory); + break; + case 2048: + scratch_programmable_bootstrap_tbc>( + stream, pbs_buffer, glwe_dimension, polynomial_size, level_count, + input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory); + break; + case 4096: + scratch_programmable_bootstrap_tbc>( + stream, pbs_buffer, glwe_dimension, polynomial_size, level_count, + input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory); + break; + case 8192: + scratch_programmable_bootstrap_tbc>( + stream, pbs_buffer, glwe_dimension, polynomial_size, level_count, + input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory); + break; + case 16384: + scratch_programmable_bootstrap_tbc>( + stream, pbs_buffer, glwe_dimension, polynomial_size, level_count, + input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory); + break; + default: + PANIC("Cuda error (classical PBS): unsupported polynomial size. 
" + "Supported N's are powers of two" + " in the interval [256..16384].") + } +} + +template +void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector( + cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_output_indexes, + Torus *lut_vector, Torus *lut_vector_indexes, Torus *lwe_array_in, + Torus *lwe_input_indexes, double2 *bootstrapping_key, + pbs_buffer *buffer, uint32_t lwe_dimension, + uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, + uint32_t level_count, uint32_t num_samples, uint32_t num_luts, + uint32_t lwe_idx, uint32_t max_shared_memory) { + + switch (polynomial_size) { + case 256: + host_programmable_bootstrap_tbc>( + stream, lwe_array_out, lwe_output_indexes, lut_vector, + lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, + buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, + level_count, num_samples, num_luts, max_shared_memory); + break; + case 512: + host_programmable_bootstrap_tbc>( + stream, lwe_array_out, lwe_output_indexes, lut_vector, + lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, + buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, + level_count, num_samples, num_luts, max_shared_memory); + break; + case 1024: + host_programmable_bootstrap_tbc>( + stream, lwe_array_out, lwe_output_indexes, lut_vector, + lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, + buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, + level_count, num_samples, num_luts, max_shared_memory); + break; + case 2048: + host_programmable_bootstrap_tbc>( + stream, lwe_array_out, lwe_output_indexes, lut_vector, + lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, + buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, + level_count, num_samples, num_luts, max_shared_memory); + break; + case 4096: + host_programmable_bootstrap_tbc>( + stream, lwe_array_out, lwe_output_indexes, lut_vector, + lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, + buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, + level_count, num_samples, num_luts, max_shared_memory); + break; + case 8192: + host_programmable_bootstrap_tbc>( + stream, lwe_array_out, lwe_output_indexes, lut_vector, + lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, + buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, + level_count, num_samples, num_luts, max_shared_memory); + break; + case 16384: + host_programmable_bootstrap_tbc>( + stream, lwe_array_out, lwe_output_indexes, lut_vector, + lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, + buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, + level_count, num_samples, num_luts, max_shared_memory); + break; + default: + PANIC("Cuda error (classical PBS): unsupported polynomial size. 
" + "Supported N's are powers of two" + " in the interval [256..16384].") + } +} +#endif + /* * Returns the buffer size for 64 bits executions */ @@ -143,9 +316,19 @@ void scratch_cuda_programmable_bootstrap_32( uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory, bool allocate_gpu_memory) { - if (has_support_to_cuda_programmable_bootstrap_cg( - glwe_dimension, polynomial_size, level_count, - input_lwe_ciphertext_count, max_shared_memory)) +#if (CUDA_ARCH >= 900) + if (has_support_to_cuda_programmable_bootstrap_tbc( + input_lwe_ciphertext_count, glwe_dimension, polynomial_size, + level_count, max_shared_memory)) + scratch_cuda_programmable_bootstrap_tbc( + stream, (pbs_buffer **)buffer, glwe_dimension, + polynomial_size, level_count, input_lwe_ciphertext_count, + max_shared_memory, allocate_gpu_memory); + else +#endif + if (has_support_to_cuda_programmable_bootstrap_cg( + glwe_dimension, polynomial_size, level_count, + input_lwe_ciphertext_count, max_shared_memory)) scratch_cuda_programmable_bootstrap_cg( stream, (pbs_buffer **)buffer, glwe_dimension, polynomial_size, level_count, input_lwe_ciphertext_count, @@ -168,9 +351,19 @@ void scratch_cuda_programmable_bootstrap_64( uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory, bool allocate_gpu_memory) { - if (has_support_to_cuda_programmable_bootstrap_cg( - glwe_dimension, polynomial_size, level_count, - input_lwe_ciphertext_count, max_shared_memory)) +#if (CUDA_ARCH >= 900) + if (has_support_to_cuda_programmable_bootstrap_tbc( + input_lwe_ciphertext_count, glwe_dimension, polynomial_size, + level_count, max_shared_memory)) + scratch_cuda_programmable_bootstrap_tbc( + stream, (pbs_buffer **)buffer, glwe_dimension, + polynomial_size, level_count, input_lwe_ciphertext_count, + max_shared_memory, allocate_gpu_memory); + else +#endif + if (has_support_to_cuda_programmable_bootstrap_cg( + glwe_dimension, polynomial_size, level_count, + input_lwe_ciphertext_count, max_shared_memory)) scratch_cuda_programmable_bootstrap_cg( stream, (pbs_buffer **)buffer, glwe_dimension, polynomial_size, level_count, input_lwe_ciphertext_count, @@ -334,6 +527,19 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_32( (pbs_buffer *)mem_ptr; switch (buffer->pbs_variant) { + case TBC: + cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector( + stream, static_cast(lwe_array_out), + static_cast(lwe_output_indexes), + static_cast(lut_vector), + static_cast(lut_vector_indexes), + static_cast(lwe_array_in), + static_cast(lwe_input_indexes), + static_cast(bootstrapping_key), + (pbs_buffer *)buffer, lwe_dimension, + glwe_dimension, polynomial_size, base_log, level_count, num_samples, + num_luts, lwe_idx, max_shared_memory); + break; case CG: cuda_programmable_bootstrap_cg_lwe_ciphertext_vector( stream, static_cast(lwe_array_out), @@ -452,6 +658,19 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_64( (pbs_buffer *)mem_ptr; switch (buffer->pbs_variant) { + case TBC: + cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector( + stream, static_cast(lwe_array_out), + static_cast(lwe_output_indexes), + static_cast(lut_vector), + static_cast(lut_vector_indexes), + static_cast(lwe_array_in), + static_cast(lwe_input_indexes), + static_cast(bootstrapping_key), + (pbs_buffer *)buffer, lwe_dimension, + glwe_dimension, polynomial_size, base_log, level_count, num_samples, + num_luts, lwe_idx, max_shared_memory); + break; case CG: cuda_programmable_bootstrap_cg_lwe_ciphertext_vector( stream, static_cast(lwe_array_out), @@ -560,3 +779,41 @@ template void 
scratch_cuda_programmable_bootstrap( uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count, uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory, bool allocate_gpu_memory); + +template bool has_support_to_cuda_programmable_bootstrap_tbc( + uint32_t num_samples, uint32_t glwe_dimension, uint32_t polynomial_size, + uint32_t level_count, uint32_t max_shared_memory); +template bool has_support_to_cuda_programmable_bootstrap_tbc( + uint32_t num_samples, uint32_t glwe_dimension, uint32_t polynomial_size, + uint32_t level_count, uint32_t max_shared_memory); + +#if CUDA_ARCH >= 900 +template void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector( + cuda_stream_t *stream, uint32_t *lwe_array_out, + uint32_t *lwe_output_indexes, uint32_t *lut_vector, + uint32_t *lut_vector_indexes, uint32_t *lwe_array_in, + uint32_t *lwe_input_indexes, double2 *bootstrapping_key, + pbs_buffer *buffer, uint32_t lwe_dimension, + uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, + uint32_t level_count, uint32_t num_samples, uint32_t num_luts, + uint32_t lwe_idx, uint32_t max_shared_memory); +template void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector( + cuda_stream_t *stream, uint64_t *lwe_array_out, + uint64_t *lwe_output_indexes, uint64_t *lut_vector, + uint64_t *lut_vector_indexes, uint64_t *lwe_array_in, + uint64_t *lwe_input_indexes, double2 *bootstrapping_key, + pbs_buffer *buffer, uint32_t lwe_dimension, + uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, + uint32_t level_count, uint32_t num_samples, uint32_t num_luts, + uint32_t lwe_idx, uint32_t max_shared_memory); +template void scratch_cuda_programmable_bootstrap_tbc( + cuda_stream_t *stream, pbs_buffer **pbs_buffer, + uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count, + uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory, + bool allocate_gpu_memory); +template void scratch_cuda_programmable_bootstrap_tbc( + cuda_stream_t *stream, pbs_buffer **pbs_buffer, + uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count, + uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory, + bool allocate_gpu_memory); +#endif diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cu b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cu index 02c9cf4ac8..f6a07bf480 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cu +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cu @@ -3,7 +3,7 @@ #include "programmable_bootstrap_multibit.cuh" #include "programmable_bootstrap_multibit.h" -#if CUDA_ARCH >= 900 +#if (CUDA_ARCH >= 900) #include "programmable_bootstrap_tbc_multibit.cuh" #endif @@ -17,8 +17,53 @@ bool has_support_to_cuda_programmable_bootstrap_cg_multi_bit( template bool has_support_to_cuda_programmable_bootstrap_tbc_multi_bit( - uint32_t polynomial_size, uint32_t max_shared_memory) { - return cuda_check_support_thread_block_clusters(); + uint32_t num_samples, uint32_t glwe_dimension, uint32_t polynomial_size, + uint32_t level_count, uint32_t max_shared_memory) { +#if CUDA_ARCH >= 900 + switch (polynomial_size) { + case 256: + return supports_thread_block_clusters_on_multibit_programmable_bootstrap< + Torus, AmortizedDegree<256>>(num_samples, glwe_dimension, + polynomial_size, level_count, + max_shared_memory); + case 512: + return supports_thread_block_clusters_on_multibit_programmable_bootstrap< + Torus, 
AmortizedDegree<512>>(num_samples, glwe_dimension, + polynomial_size, level_count, + max_shared_memory); + case 1024: + return supports_thread_block_clusters_on_multibit_programmable_bootstrap< + Torus, AmortizedDegree<1024>>(num_samples, glwe_dimension, + polynomial_size, level_count, + max_shared_memory); + case 2048: + return supports_thread_block_clusters_on_multibit_programmable_bootstrap< + Torus, AmortizedDegree<2048>>(num_samples, glwe_dimension, + polynomial_size, level_count, + max_shared_memory); + case 4096: + return supports_thread_block_clusters_on_multibit_programmable_bootstrap< + Torus, AmortizedDegree<4096>>(num_samples, glwe_dimension, + polynomial_size, level_count, + max_shared_memory); + case 8192: + return supports_thread_block_clusters_on_multibit_programmable_bootstrap< + Torus, AmortizedDegree<8192>>(num_samples, glwe_dimension, + polynomial_size, level_count, + max_shared_memory); + case 16384: + return supports_thread_block_clusters_on_multibit_programmable_bootstrap< + Torus, AmortizedDegree<16384>>(num_samples, glwe_dimension, + polynomial_size, level_count, + max_shared_memory); + default: + PANIC("Cuda error (multi-bit PBS): unsupported polynomial size. Supported " + "N's are powers of two" + " in the interval [256..16384].") + } +#else + return false; +#endif } template @@ -206,7 +251,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64( switch (buffer->pbs_variant) { #if CUDA_ARCH >= 900 - case PBS_VARIANT::TBC: + case PBS_VARIANT::TBC: cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( stream, static_cast(lwe_array_out), static_cast(lwe_output_indexes), @@ -386,9 +431,10 @@ void scratch_cuda_multi_bit_programmable_bootstrap_64( uint32_t max_shared_memory, bool allocate_gpu_memory, uint32_t lwe_chunk_size) { -#if CUDA_ARCH >= 900 +#if (CUDA_ARCH >= 900) if (has_support_to_cuda_programmable_bootstrap_tbc_multi_bit( - polynomial_size, max_shared_memory)) + input_lwe_ciphertext_count, glwe_dimension, polynomial_size, + level_count, max_shared_memory)) scratch_cuda_tbc_multi_bit_programmable_bootstrap( stream, (pbs_buffer **)buffer, lwe_dimension, glwe_dimension, polynomial_size, level_count, grouping_factor, @@ -420,7 +466,7 @@ void cleanup_cuda_multi_bit_programmable_bootstrap(cuda_stream_t *stream, // Returns a chunk size that is not optimal but close to template -__host__ uint32_t get_lwe_chunk_size(int gpu_index, uint32_t max_num_pbs, +__host__ uint32_t get_lwe_chunk_size(uint32_t gpu_index, uint32_t max_num_pbs, uint32_t polynomial_size, uint32_t max_shared_memory) { @@ -504,9 +550,10 @@ cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( template bool has_support_to_cuda_programmable_bootstrap_tbc_multi_bit( - uint32_t polynomial_size, uint32_t max_shared_memory); + uint32_t num_samples, uint32_t glwe_dimension, uint32_t polynomial_size, + uint32_t level_count, uint32_t max_shared_memory); -#if CUDA_ARCH >= 900 +#if (CUDA_ARCH >= 900) template void scratch_cuda_tbc_multi_bit_programmable_bootstrap( cuda_stream_t *stream, pbs_buffer **buffer, @@ -676,4 +723,4 @@ cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( uint32_t base_log, uint32_t level_count, uint32_t num_samples, uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory, uint32_t lwe_chunk_size); -#endif \ No newline at end of file +#endif diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_classic.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_classic.cuh new file mode 
100644 index 0000000000..4b5e7b7f27 --- /dev/null +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_classic.cuh @@ -0,0 +1,424 @@ +#ifndef CUDA_TBC_PBS_CUH +#define CUDA_TBC_PBS_CUH + +#ifdef __CDT_PARSER__ +#undef __CUDA_RUNTIME_H__ +#include +#endif + +#include "cooperative_groups.h" +#include "crypto/gadget.cuh" +#include "crypto/torus.cuh" +#include "device.h" +#include "fft/bnsmfft.cuh" +#include "fft/twiddles.cuh" +#include "polynomial/parameters.cuh" +#include "polynomial/polynomial_math.cuh" +#include "programmable_bootstrap.cuh" +#include "programmable_bootstrap.h" +#include "types/complex/operations.cuh" + +using namespace cooperative_groups; +namespace cg = cooperative_groups; + +/* + * Kernel that computes the classical PBS using cooperative groups + * + * - lwe_array_out: vector of output lwe s, with length + * (glwe_dimension * polynomial_size+1)*num_samples + * - lut_vector: vector of look up tables with + * length (glwe_dimension+1) * polynomial_size * num_samples + * - lut_vector_indexes: mapping between lwe_array_in and lut_vector + * lwe_array_in: vector of lwe inputs with length (lwe_dimension + 1) * + * num_samples + * + * Each y-block computes one element of the lwe_array_out. + */ +template +__global__ void device_programmable_bootstrap_tbc( + Torus *lwe_array_out, Torus *lwe_output_indexes, Torus *lut_vector, + Torus *lut_vector_indexes, Torus *lwe_array_in, Torus *lwe_input_indexes, + double2 *bootstrapping_key, double2 *join_buffer, uint32_t lwe_dimension, + uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, + int8_t *device_mem, uint64_t device_memory_size_per_block, + bool support_dsm) { + + cluster_group cluster = this_cluster(); + + // We use shared memory for the polynomials that are used often during the + // bootstrap, since shared memory is kept in L1 cache and accessing it is + // much faster than global memory + extern __shared__ int8_t sharedmem[]; + int8_t *selected_memory; + uint32_t glwe_dimension = gridDim.y - 1; + + if constexpr (SMD == FULLSM) { + selected_memory = sharedmem; + if (support_dsm) + selected_memory += sizeof(Torus) * polynomial_size; + } else { + int block_index = blockIdx.x + blockIdx.y * gridDim.x + + blockIdx.z * gridDim.x * gridDim.y; + selected_memory = &device_mem[block_index * device_memory_size_per_block]; + } + + Torus *accumulator = (Torus *)selected_memory; + Torus *accumulator_rotated = + (Torus *)accumulator + (ptrdiff_t)polynomial_size; + double2 *accumulator_fft = + (double2 *)accumulator_rotated + + (ptrdiff_t)(sizeof(Torus) * polynomial_size / sizeof(double2)); + + if constexpr (SMD == PARTIALSM) { + accumulator_fft = (double2 *)sharedmem; + if (support_dsm) + accumulator_fft += (ptrdiff_t)(polynomial_size / 2); + } + + // The third dimension of the block is used to determine on which ciphertext + // this block is operating, in the case of batch bootstraps + Torus *block_lwe_array_in = + &lwe_array_in[lwe_input_indexes[blockIdx.z] * (lwe_dimension + 1)]; + + Torus *block_lut_vector = &lut_vector[lut_vector_indexes[blockIdx.z] * + params::degree * (glwe_dimension + 1)]; + + double2 *block_join_buffer = + &join_buffer[blockIdx.z * level_count * (glwe_dimension + 1) * + params::degree / 2]; + // Since the space is L1 cache is small, we use the same memory location for + // the rotated accumulator and the fft accumulator, since we know that the + // rotated array is not in use anymore by the time we perform the fft + + // Put "b" in [0, 2N[ + Torus b_hat = 0; + 
rescale_torus_element(block_lwe_array_in[lwe_dimension], b_hat,
+                        2 * params::degree);
+
+  divide_by_monomial_negacyclic_inplace(
+      accumulator, &block_lut_vector[blockIdx.y * params::degree], b_hat,
+      false);
+
+  for (int i = 0; i < lwe_dimension; i++) {
+    synchronize_threads_in_block();
+
+    // Put "a" in [0, 2N[
+    Torus a_hat = 0;
+    rescale_torus_element(block_lwe_array_in[i], a_hat,
+                          2 * params::degree); // 2 * params::log2_degree + 1);
+
+    // Perform ACC * (X^ä - 1)
+    multiply_by_monomial_negacyclic_and_sub_polynomial<
+        Torus, params::opt, params::degree / params::opt>(
+        accumulator, accumulator_rotated, a_hat);
+
+    // Perform a rounding to increase the accuracy of the
+    // bootstrapped ciphertext
+    round_to_closest_multiple_inplace(
+        accumulator_rotated, base_log, level_count);
+
+    synchronize_threads_in_block();
+
+    // Decompose the accumulator. Each block gets one level of the
+    // decomposition, for the mask and the body (so block 0 will have the
+    // accumulator decomposed at level 0, 1 at 1, etc.)
+    GadgetMatrix gadget_acc(base_log, level_count,
+                            accumulator_rotated);
+    gadget_acc.decompose_and_compress_level(accumulator_fft, blockIdx.x);
+
+    // We are using the same memory space for accumulator_fft and
+    // accumulator_rotated, so we need to synchronize here to make sure they
+    // don't modify the same memory space at the same time
+    synchronize_threads_in_block();
+
+    // Perform G^-1(ACC) * GGSW -> GLWE
+    mul_ggsw_glwe(
+        accumulator, accumulator_fft, block_join_buffer, bootstrapping_key,
+        polynomial_size, glwe_dimension, level_count, i, cluster, support_dsm);
+
+    synchronize_threads_in_block();
+  }
+
+  auto block_lwe_array_out =
+      &lwe_array_out[lwe_output_indexes[blockIdx.z] *
+                         (glwe_dimension * polynomial_size + 1) +
+                     blockIdx.y * polynomial_size];
+
+  if (blockIdx.x == 0 && blockIdx.y < glwe_dimension) {
+    // Perform a sample extract.
At this point, all blocks have the result, but + // we do the computation at block 0 to avoid waiting for extra blocks, in + // case they're not synchronized + sample_extract_mask(block_lwe_array_out, accumulator); + } else if (blockIdx.x == 0 && blockIdx.y == glwe_dimension) { + sample_extract_body(block_lwe_array_out, accumulator, 0); + } +} + +template +__host__ void scratch_programmable_bootstrap_tbc( + cuda_stream_t *stream, pbs_buffer **buffer, + uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count, + uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory, + bool allocate_gpu_memory) { + cudaSetDevice(stream->gpu_index); + + bool supports_dsm = + supports_distributed_shared_memory_on_classic_programmable_bootstrap< + Torus>(polynomial_size, max_shared_memory); + + uint64_t full_sm = get_buffer_size_full_sm_programmable_bootstrap_tbc( + polynomial_size); + uint64_t partial_sm = + get_buffer_size_partial_sm_programmable_bootstrap_tbc( + polynomial_size); + uint64_t minimum_sm_tbc = 0; + if (supports_dsm) + minimum_sm_tbc = + get_buffer_size_sm_dsm_plus_tbc_classic_programmable_bootstrap( + polynomial_size); + + if (max_shared_memory >= full_sm + minimum_sm_tbc) { + check_cuda_error(cudaFuncSetAttribute( + device_programmable_bootstrap_tbc, + cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm + minimum_sm_tbc)); + cudaFuncSetCacheConfig( + device_programmable_bootstrap_tbc, + cudaFuncCachePreferShared); + check_cuda_error(cudaGetLastError()); + } else if (max_shared_memory >= partial_sm + minimum_sm_tbc) { + check_cuda_error(cudaFuncSetAttribute( + device_programmable_bootstrap_tbc, + cudaFuncAttributeMaxDynamicSharedMemorySize, + partial_sm + minimum_sm_tbc)); + cudaFuncSetCacheConfig( + device_programmable_bootstrap_tbc, + cudaFuncCachePreferShared); + check_cuda_error(cudaGetLastError()); + } else { + check_cuda_error(cudaFuncSetAttribute( + device_programmable_bootstrap_tbc, + cudaFuncAttributeMaxDynamicSharedMemorySize, minimum_sm_tbc)); + cudaFuncSetCacheConfig( + device_programmable_bootstrap_tbc, + cudaFuncCachePreferShared); + check_cuda_error(cudaGetLastError()); + } + + *buffer = new pbs_buffer( + stream, glwe_dimension, polynomial_size, level_count, + input_lwe_ciphertext_count, PBS_VARIANT::TBC, allocate_gpu_memory); +} + +/* + * Host wrapper + */ +template +__host__ void host_programmable_bootstrap_tbc( + cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_output_indexes, + Torus *lut_vector, Torus *lut_vector_indexes, Torus *lwe_array_in, + Torus *lwe_input_indexes, double2 *bootstrapping_key, + pbs_buffer *buffer, uint32_t glwe_dimension, + uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log, + uint32_t level_count, uint32_t input_lwe_ciphertext_count, + uint32_t num_luts, uint32_t max_shared_memory) { + cudaSetDevice(stream->gpu_index); + + auto supports_dsm = + supports_distributed_shared_memory_on_classic_programmable_bootstrap< + Torus>(polynomial_size, max_shared_memory); + + // With SM each block corresponds to either the mask or body, no need to + // duplicate data for each + uint64_t full_sm = get_buffer_size_full_sm_programmable_bootstrap_tbc( + polynomial_size); + uint64_t partial_sm = + get_buffer_size_partial_sm_programmable_bootstrap_tbc( + polynomial_size); + uint64_t minimum_sm_tbc = 0; + if (supports_dsm) + minimum_sm_tbc = + get_buffer_size_sm_dsm_plus_tbc_classic_programmable_bootstrap( + polynomial_size); + + uint64_t full_dm = full_sm; + + uint64_t partial_dm = full_dm - partial_sm; + + int8_t *d_mem = 
buffer->d_mem; + double2 *buffer_fft = buffer->global_accumulator_fft; + + int thds = polynomial_size / params::opt; + dim3 grid(level_count, glwe_dimension + 1, input_lwe_ciphertext_count); + + cudaLaunchConfig_t config = {0}; + // The grid dimension is not affected by cluster launch, and is still + // enumerated using number of blocks. The grid dimension should be a multiple + // of cluster size. + config.gridDim = grid; + config.blockDim = thds; + + cudaLaunchAttribute attribute[1]; + attribute[0].id = cudaLaunchAttributeClusterDimension; + attribute[0].val.clusterDim.x = level_count; // Cluster size in X-dimension + attribute[0].val.clusterDim.y = (glwe_dimension + 1); + attribute[0].val.clusterDim.z = 1; + config.attrs = attribute; + config.numAttrs = 1; + config.stream = stream->stream; + + if (max_shared_memory < partial_sm + minimum_sm_tbc) { + config.dynamicSmemBytes = minimum_sm_tbc; + + check_cuda_error(cudaLaunchKernelEx( + &config, device_programmable_bootstrap_tbc, + lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes, + lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer_fft, + lwe_dimension, polynomial_size, base_log, level_count, d_mem, full_dm, + supports_dsm)); + } else if (max_shared_memory < full_sm + minimum_sm_tbc) { + config.dynamicSmemBytes = partial_sm + minimum_sm_tbc; + + check_cuda_error(cudaLaunchKernelEx( + &config, device_programmable_bootstrap_tbc, + lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes, + lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer_fft, + lwe_dimension, polynomial_size, base_log, level_count, d_mem, + partial_dm, supports_dsm)); + } else { + config.dynamicSmemBytes = full_sm + minimum_sm_tbc; + + check_cuda_error(cudaLaunchKernelEx( + &config, device_programmable_bootstrap_tbc, + lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes, + lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer_fft, + lwe_dimension, polynomial_size, base_log, level_count, d_mem, 0, + supports_dsm)); + } +} + +// Verify if the grid size satisfies the cooperative group constraints +template +__host__ bool verify_cuda_programmable_bootstrap_tbc_grid_size( + int glwe_dimension, int level_count, int num_samples, + uint32_t max_shared_memory) { + + // If Cooperative Groups is not supported, no need to check anything else + if (!cuda_check_support_cooperative_groups()) + return false; + + // Calculate the dimension of the kernel + uint64_t full_sm = + get_buffer_size_full_sm_programmable_bootstrap_tbc(params::degree); + + uint64_t partial_sm = + get_buffer_size_partial_sm_programmable_bootstrap_tbc( + params::degree); + + int thds = params::degree / params::opt; + + // Get the maximum number of active blocks per streaming multiprocessors + int number_of_blocks = level_count * (glwe_dimension + 1) * num_samples; + int max_active_blocks_per_sm; + + if (max_shared_memory < partial_sm) { + cudaOccupancyMaxActiveBlocksPerMultiprocessor( + &max_active_blocks_per_sm, + (void *)device_programmable_bootstrap_tbc, thds, + 0); + } else if (max_shared_memory < full_sm) { + cudaOccupancyMaxActiveBlocksPerMultiprocessor( + &max_active_blocks_per_sm, + (void *)device_programmable_bootstrap_tbc, + thds, partial_sm); + } else { + cudaOccupancyMaxActiveBlocksPerMultiprocessor( + &max_active_blocks_per_sm, + (void *)device_programmable_bootstrap_tbc, thds, + full_sm); + } + + // Get the number of streaming multiprocessors + int number_of_sm = 0; + cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0); + 
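+  // Accept this configuration only if every block of the
+  // (level_count, glwe_dimension + 1, num_samples) grid can be resident on
+  // the device at once, mirroring the cooperative-launch requirement.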
return number_of_blocks <= max_active_blocks_per_sm * number_of_sm; +} + +template +__host__ bool +supports_distributed_shared_memory_on_classic_programmable_bootstrap( + uint32_t polynomial_size, uint32_t max_shared_memory) { + uint64_t minimum_sm = + get_buffer_size_sm_dsm_plus_tbc_classic_programmable_bootstrap( + polynomial_size); + + if (max_shared_memory < minimum_sm) { + // If we cannot store a single polynomial in a block shared memory we cannot + // use TBC + return false; + } else { + return cuda_check_support_thread_block_clusters(); + } +} + +template +__host__ bool supports_thread_block_clusters_on_classic_programmable_bootstrap( + uint32_t num_samples, uint32_t glwe_dimension, uint32_t polynomial_size, + uint32_t level_count, uint32_t max_shared_memory) { + + if (!cuda_check_support_thread_block_clusters() || num_samples > 128) + return false; + + uint64_t full_sm = get_buffer_size_full_sm_programmable_bootstrap_tbc( + polynomial_size); + uint64_t partial_sm = + get_buffer_size_partial_sm_programmable_bootstrap_tbc( + polynomial_size); + uint64_t minimum_sm_tbc = 0; + if (supports_distributed_shared_memory_on_classic_programmable_bootstrap< + Torus>(polynomial_size, max_shared_memory)) + minimum_sm_tbc = + get_buffer_size_sm_dsm_plus_tbc_classic_programmable_bootstrap( + polynomial_size); + + int cluster_size; + + dim3 grid_accumulate(level_count, glwe_dimension + 1, num_samples); + dim3 thds(polynomial_size / params::opt, 1, 1); + + cudaLaunchConfig_t config = {0}; + // The grid dimension is not affected by cluster launch, and is still + // enumerated using number of blocks. The grid dimension should be a multiple + // of cluster size. + config.gridDim = grid_accumulate; + config.blockDim = thds; + config.numAttrs = 0; + + if (max_shared_memory < partial_sm + minimum_sm_tbc) { + check_cuda_error(cudaFuncSetAttribute( + device_programmable_bootstrap_tbc, + cudaFuncAttributeNonPortableClusterSizeAllowed, true)); + check_cuda_error(cudaOccupancyMaxPotentialClusterSize( + &cluster_size, device_programmable_bootstrap_tbc, + &config)); + } else if (max_shared_memory < full_sm + minimum_sm_tbc) { + check_cuda_error(cudaFuncSetAttribute( + device_programmable_bootstrap_tbc, + cudaFuncAttributeNonPortableClusterSizeAllowed, true)); + check_cuda_error(cudaOccupancyMaxPotentialClusterSize( + &cluster_size, + device_programmable_bootstrap_tbc, &config)); + } else { + check_cuda_error(cudaFuncSetAttribute( + device_programmable_bootstrap_tbc, + cudaFuncAttributeNonPortableClusterSizeAllowed, true)); + check_cuda_error(cudaOccupancyMaxPotentialClusterSize( + &cluster_size, device_programmable_bootstrap_tbc, + &config)); + } + + return cluster_size >= level_count * (glwe_dimension + 1); +} + +#endif // CG_PBS_H diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_multibit.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_multibit.cuh index db69a56356..e2e1a0883a 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_multibit.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_multibit.cuh @@ -40,8 +40,8 @@ __global__ void device_multi_bit_programmable_bootstrap_tbc_accumulate( // The first (polynomial_size/2) * sizeof(double2) bytes are reserved for // external product using distributed shared memory selected_memory = sharedmem; - if(support_dsm) - selected_memory += sizeof(Torus) * polynomial_size; + if (support_dsm) + selected_memory += sizeof(Torus) * polynomial_size; } else { int 
block_index = blockIdx.x + blockIdx.y * gridDim.x + blockIdx.z * gridDim.x * gridDim.y; @@ -53,10 +53,10 @@ __global__ void device_multi_bit_programmable_bootstrap_tbc_accumulate( (double2 *)accumulator + (ptrdiff_t)(sizeof(Torus) * polynomial_size / sizeof(double2)); - if constexpr (SMD == PARTIALSM){ + if constexpr (SMD == PARTIALSM) { accumulator_fft = (double2 *)sharedmem; - if(support_dsm) - accumulator_fft += sizeof(double2) * (polynomial_size/2); + if (support_dsm) + accumulator_fft += sizeof(double2) * (polynomial_size / 2); } // The third dimension of the block is used to determine on which ciphertext @@ -249,8 +249,9 @@ __host__ void scratch_tbc_multi_bit_programmable_bootstrap( } if (!lwe_chunk_size) - lwe_chunk_size = get_average_lwe_chunk_size( - lwe_dimension, level_count, glwe_dimension, input_lwe_ciphertext_count); + lwe_chunk_size = get_lwe_chunk_size( + stream->gpu_index, input_lwe_ciphertext_count, polynomial_size, + max_shared_memory); *buffer = new pbs_buffer( stream, glwe_dimension, polynomial_size, level_count, input_lwe_ciphertext_count, lwe_chunk_size, PBS_VARIANT::TBC, @@ -307,7 +308,7 @@ __host__ void execute_tbc_external_product_loop( cudaLaunchAttribute attribute[1]; attribute[0].id = cudaLaunchAttributeClusterDimension; - attribute[0].val.clusterDim.x = 1; // Cluster size in X-dimension + attribute[0].val.clusterDim.x = level_count; // Cluster size in X-dimension attribute[0].val.clusterDim.y = (glwe_dimension + 1); attribute[0].val.clusterDim.z = 1; config.attrs = attribute; @@ -363,8 +364,8 @@ __host__ void host_tbc_multi_bit_programmable_bootstrap( cudaSetDevice(stream->gpu_index); if (!lwe_chunk_size) - lwe_chunk_size = get_average_lwe_chunk_size(lwe_dimension, level_count, - glwe_dimension, num_samples); + lwe_chunk_size = get_lwe_chunk_size( + stream->gpu_index, num_samples, polynomial_size, max_shared_memory); for (uint32_t lwe_offset = 0; lwe_offset < (lwe_dimension / grouping_factor); lwe_offset += lwe_chunk_size) { @@ -389,27 +390,91 @@ template __host__ bool supports_distributed_shared_memory_on_multibit_programmable_bootstrap( uint32_t polynomial_size, uint32_t max_shared_memory) { - uint64_t minimum_sm = + uint64_t minimum_sm = + get_buffer_size_sm_dsm_plus_tbc_multibit_programmable_bootstrap( + polynomial_size); + + if (max_shared_memory <= minimum_sm) { + // If we cannot store a single polynomial in a block shared memory we + // cannot use TBC + return false; + } else { + return cuda_check_support_thread_block_clusters(); + } +} + +template +__host__ bool supports_thread_block_clusters_on_multibit_programmable_bootstrap( + uint32_t num_samples, uint32_t glwe_dimension, uint32_t polynomial_size, + uint32_t level_count, uint32_t max_shared_memory) { + + if (!cuda_check_support_thread_block_clusters()) + return false; + + uint64_t full_sm_tbc_accumulate = + get_buffer_size_full_sm_tbc_multibit_programmable_bootstrap( + polynomial_size); + uint64_t partial_sm_tbc_accumulate = + get_buffer_size_partial_sm_tbc_multibit_programmable_bootstrap( + polynomial_size); + uint64_t minimum_sm_tbc_accumulate = 0; + if (supports_distributed_shared_memory_on_multibit_programmable_bootstrap< + Torus>(polynomial_size, max_shared_memory)) + minimum_sm_tbc_accumulate = get_buffer_size_sm_dsm_plus_tbc_multibit_programmable_bootstrap( polynomial_size); - if (max_shared_memory <= minimum_sm) { - // If we cannot store a single polynomial in a block shared memory we -// cannot use TBC - return false; - } else { - return cuda_check_support_thread_block_clusters(); - } -} + 
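+  // cudaOccupancyMaxPotentialClusterSize below reports the largest cluster
+  // this kernel can be launched with under the chosen shared-memory budget;
+  // the TBC path is used only when a full level_count * (glwe_dimension + 1)
+  // cluster fits.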
int cluster_size; -template -__host__ bool -supports_thread_block_clusters_on_multibit_programmable_bootstrap() { - return cuda_check_support_thread_block_clusters(); + dim3 grid_accumulate(level_count, glwe_dimension + 1, num_samples); + dim3 thds(polynomial_size / params::opt, 1, 1); + + cudaLaunchConfig_t config = {0}; + // The grid dimension is not affected by cluster launch, and is still + // enumerated using number of blocks. The grid dimension should be a multiple + // of cluster size. + config.gridDim = grid_accumulate; + config.blockDim = thds; + config.numAttrs = 0; + + if (max_shared_memory < + partial_sm_tbc_accumulate + minimum_sm_tbc_accumulate) { + check_cuda_error(cudaFuncSetAttribute( + device_multi_bit_programmable_bootstrap_tbc_accumulate, + cudaFuncAttributeNonPortableClusterSizeAllowed, true)); + check_cuda_error(cudaOccupancyMaxPotentialClusterSize( + &cluster_size, + device_multi_bit_programmable_bootstrap_tbc_accumulate, + &config)); + } else if (max_shared_memory < + full_sm_tbc_accumulate + minimum_sm_tbc_accumulate) { + check_cuda_error(cudaFuncSetAttribute( + device_multi_bit_programmable_bootstrap_tbc_accumulate, + cudaFuncAttributeNonPortableClusterSizeAllowed, true)); + check_cuda_error(cudaOccupancyMaxPotentialClusterSize( + &cluster_size, + device_multi_bit_programmable_bootstrap_tbc_accumulate, + &config)); + } else { + check_cuda_error(cudaFuncSetAttribute( + device_multi_bit_programmable_bootstrap_tbc_accumulate, + cudaFuncAttributeNonPortableClusterSizeAllowed, true)); + check_cuda_error(cudaOccupancyMaxPotentialClusterSize( + &cluster_size, + device_multi_bit_programmable_bootstrap_tbc_accumulate, + &config)); + } + + return cluster_size >= level_count * (glwe_dimension + 1); } -template -__host__ bool +template __host__ bool supports_distributed_shared_memory_on_multibit_programmable_bootstrap( uint32_t polynomial_size, uint32_t max_shared_memory); #endif // FASTMULTIBIT_PBS_H diff --git a/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/benchmarks/benchmark_pbs.cpp b/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/benchmarks/benchmark_pbs.cpp index 17cd66b340..f98a611cfe 100644 --- a/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/benchmarks/benchmark_pbs.cpp +++ b/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/benchmarks/benchmark_pbs.cpp @@ -83,10 +83,10 @@ class MultiBitBootstrap_u64 : public benchmark::Fixture { stream, &seed, &lwe_sk_in_array, &lwe_sk_out_array, &d_bsk, &plaintexts, &d_lut_pbs_identity, &d_lut_pbs_indexes, &d_lwe_ct_in_array, &d_lwe_input_indexes, &d_lwe_ct_out_array, &d_lwe_output_indexes, - lwe_dimension, glwe_dimension, polynomial_size, - grouping_factor, lwe_modular_variance, glwe_modular_variance, - pbs_base_log, pbs_level, message_modulus, carry_modulus, - &payload_modulus, &delta, input_lwe_ciphertext_count, 1, 1); + lwe_dimension, glwe_dimension, polynomial_size, grouping_factor, + lwe_modular_variance, glwe_modular_variance, pbs_base_log, pbs_level, + message_modulus, carry_modulus, &payload_modulus, &delta, + input_lwe_ciphertext_count, 1, 1); } void TearDown(const ::benchmark::State &state) { @@ -171,7 +171,8 @@ class ClassicalBootstrap_u64 : public benchmark::Fixture { BENCHMARK_DEFINE_F(MultiBitBootstrap_u64, TbcMultiBit) (benchmark::State &st) { if (!has_support_to_cuda_programmable_bootstrap_tbc_multi_bit( - polynomial_size, cuda_get_max_shared_memory(stream->gpu_index))) { + input_lwe_ciphertext_count, glwe_dimension, polynomial_size, + pbs_level, 
cuda_get_max_shared_memory(stream->gpu_index))) { st.SkipWithError("Configuration not supported for tbc operation"); return; } @@ -252,6 +253,39 @@ BENCHMARK_DEFINE_F(MultiBitBootstrap_u64, DefaultMultiBit) cleanup_cuda_multi_bit_programmable_bootstrap(stream, &buffer); } +#if CUDA_ARCH >= 900 +BENCHMARK_DEFINE_F(ClassicalBootstrap_u64, TbcPBC) +(benchmark::State &st) { + if (!has_support_to_cuda_programmable_bootstrap_tbc( + input_lwe_ciphertext_count, glwe_dimension, polynomial_size, + pbs_level, cuda_get_max_shared_memory(stream->gpu_index))) { + st.SkipWithError("Configuration not supported for tbc operation"); + return; + } + + scratch_cuda_programmable_bootstrap_tbc( + stream, (pbs_buffer **)&buffer, glwe_dimension, + polynomial_size, pbs_level, input_lwe_ciphertext_count, + cuda_get_max_shared_memory(stream->gpu_index), true); + + for (auto _ : st) { + // Execute PBS + cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector( + stream, (uint64_t *)d_lwe_ct_out_array, + (uint64_t *)d_lwe_output_indexes, (uint64_t *)d_lut_pbs_identity, + (uint64_t *)d_lut_pbs_indexes, (uint64_t *)d_lwe_ct_in_array, + (uint64_t *)d_lwe_input_indexes, (double2 *)d_fourier_bsk, + (pbs_buffer *)buffer, lwe_dimension, + glwe_dimension, polynomial_size, pbs_base_log, pbs_level, + input_lwe_ciphertext_count, 1, 0, + cuda_get_max_shared_memory(stream->gpu_index)); + cuda_synchronize_stream(stream); + } + + cleanup_cuda_programmable_bootstrap(stream, &buffer); +} +#endif + BENCHMARK_DEFINE_F(ClassicalBootstrap_u64, CgPBS) (benchmark::State &st) { if (!has_support_to_cuda_programmable_bootstrap_cg( @@ -366,6 +400,25 @@ MultiBitPBSBenchmarkGenerateParams(benchmark::internal::Benchmark *b) { } } +static void +CGBootstrapBenchmarkGenerateParams(benchmark::internal::Benchmark *b) { + // Define the parameters to benchmark + // lwe_dimension, glwe_dimension, polynomial_size, pbs_base_log, pbs_level, + // input_lwe_ciphertext_count + + // PARAM_MESSAGE_2_CARRY_2_KS_PBS + std::vector params = { + (BootstrapBenchmarkParams){742, 1, 2048, 23, 1, 1}, + }; + + // Add to the list of parameters to benchmark + for (int num_samples = 1; num_samples <= 4096; num_samples *= 2) + for (auto x : params) { + b->Args({x.lwe_dimension, x.glwe_dimension, x.polynomial_size, + x.pbs_base_log, x.pbs_level, num_samples}); + } +} + static void BootstrapBenchmarkGenerateParams(benchmark::internal::Benchmark *b) { // Define the parameters to benchmark @@ -405,11 +458,23 @@ BENCHMARK_REGISTER_F(MultiBitBootstrap_u64, DefaultMultiBit) "pbs_base_log", "pbs_level", "input_lwe_ciphertext_count", "grouping_factor", "chunk_size"}); +#if CUDA_ARCH >= 900 +BENCHMARK_REGISTER_F(ClassicalBootstrap_u64, TbcPBC) + ->Apply(BootstrapBenchmarkGenerateParams) + ->ArgNames({"lwe_dimension", "glwe_dimension", "polynomial_size", + "pbs_base_log", "pbs_level", "input_lwe_ciphertext_count"}); +#endif + BENCHMARK_REGISTER_F(ClassicalBootstrap_u64, DefaultPBS) ->Apply(BootstrapBenchmarkGenerateParams) ->ArgNames({"lwe_dimension", "glwe_dimension", "polynomial_size", "pbs_base_log", "pbs_level", "input_lwe_ciphertext_count"}); +BENCHMARK_REGISTER_F(ClassicalBootstrap_u64, CgPBS) + ->Apply(BootstrapBenchmarkGenerateParams) + ->ArgNames({"lwe_dimension", "glwe_dimension", "polynomial_size", + "pbs_base_log", "pbs_level", "input_lwe_ciphertext_count"}); + BENCHMARK_REGISTER_F(ClassicalBootstrap_u64, AmortizedPBS) ->Apply(BootstrapBenchmarkGenerateParams) ->ArgNames({"lwe_dimension", "glwe_dimension", "polynomial_size", diff --git 
a/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/tests/test_classical_pbs.cpp b/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/tests/test_classical_pbs.cpp index 895237c77e..4126459df5 100644 --- a/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/tests/test_classical_pbs.cpp +++ b/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/tests/test_classical_pbs.cpp @@ -205,7 +205,7 @@ TEST_P(ClassicalProgrammableBootstrapTestPrimitives_u64, bootstrap) { uint64_t decrypted = 0; core_crypto_lwe_decrypt(&decrypted, result, lwe_sk_out, glwe_dimension * polynomial_size); - EXPECT_NE(decrypted, plaintext); + ASSERT_NE(decrypted, plaintext); // let err = (decrypted >= plaintext) ? decrypted - plaintext : // plaintext // - decrypted; @@ -216,7 +216,7 @@ TEST_P(ClassicalProgrammableBootstrapTestPrimitives_u64, bootstrap) { // Compute the rounding bit uint64_t rounding = (decrypted & rounding_bit) << 1; uint64_t decoded = (decrypted + rounding) / delta; - EXPECT_EQ(decoded, plaintext / delta); + ASSERT_EQ(decoded, plaintext / delta); } } } diff --git a/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/tests/test_multibit_pbs.cpp b/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/tests/test_multibit_pbs.cpp index 9859573012..f649f02046 100644 --- a/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/tests/test_multibit_pbs.cpp +++ b/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/tests/test_multibit_pbs.cpp @@ -86,16 +86,15 @@ class MultiBitProgrammableBootstrapTestPrimitives_u64 stream, &seed, &lwe_sk_in_array, &lwe_sk_out_array, &d_bsk_array, &plaintexts, &d_lut_pbs_identity, &d_lut_pbs_indexes, &d_lwe_ct_in_array, &d_lwe_input_indexes, &d_lwe_ct_out_array, - &d_lwe_output_indexes, lwe_dimension, glwe_dimension, - polynomial_size, grouping_factor, lwe_noise_distribution, - glwe_noise_distribution, pbs_base_log, pbs_level, message_modulus, - carry_modulus, &payload_modulus, &delta, number_of_inputs, repetitions, - samples); - - scratch_cuda_multi_bit_programmable_bootstrap_64( - stream, &pbs_buffer, lwe_dimension, glwe_dimension, polynomial_size, - pbs_level, grouping_factor, number_of_inputs, - cuda_get_max_shared_memory(stream->gpu_index), true); + &d_lwe_output_indexes, lwe_dimension, glwe_dimension, polynomial_size, + grouping_factor, lwe_noise_distribution, glwe_noise_distribution, + pbs_base_log, pbs_level, message_modulus, carry_modulus, + &payload_modulus, &delta, number_of_inputs, repetitions, samples); + + scratch_cuda_multi_bit_programmable_bootstrap_64( + stream, &pbs_buffer, lwe_dimension, glwe_dimension, polynomial_size, + pbs_level, grouping_factor, number_of_inputs, + cuda_get_max_shared_memory(stream->gpu_index), true); lwe_ct_out_array = (uint64_t *)malloc((glwe_dimension * polynomial_size + 1) *