From 256378f7b15ad62ea74ba7bd86a4b20dee10695e Mon Sep 17 00:00:00 2001
From: Agnes Leroy
Date: Thu, 3 Oct 2024 10:32:19 +0200
Subject: [PATCH] chore(gpu): increase max sm size again to gain performance
 on 3_3 params on H100

This reverts commit 1c0b6fbbd466b31e5594581218fcfa52355ec1c7.
---
 backends/tfhe-cuda-backend/cuda/src/device.cu |  9 ++
 .../cuda/src/pbs/bootstrapping_key.cuh        | 98 +++++++++++++++++++
 .../pbs/programmable_bootstrap_amortized.cuh  | 22 +++++
 .../pbs/programmable_bootstrap_cg_classic.cuh | 24 +++++
 .../programmable_bootstrap_cg_multibit.cuh    | 63 ++++++++++++
 .../pbs/programmable_bootstrap_classic.cuh    | 49 ++++++++++
 .../pbs/programmable_bootstrap_multibit.cuh   | 91 +++++++++++++++++
 .../programmable_bootstrap_tbc_classic.cuh    | 43 ++++++++
 .../programmable_bootstrap_tbc_multibit.cuh   | 78 +++++++++++++++
 9 files changed, 477 insertions(+)

diff --git a/backends/tfhe-cuda-backend/cuda/src/device.cu b/backends/tfhe-cuda-backend/cuda/src/device.cu
index 9a880ad3cd..afad0b21fb 100644
--- a/backends/tfhe-cuda-backend/cuda/src/device.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/device.cu
@@ -271,5 +271,14 @@ int cuda_get_max_shared_memory(uint32_t gpu_index) {
   cudaDeviceGetAttribute(&max_shared_memory,
                          cudaDevAttrMaxSharedMemoryPerBlock, gpu_index);
   check_cuda_error(cudaGetLastError());
+#if CUDA_ARCH == 900
+  max_shared_memory = 226000;
+#elif CUDA_ARCH == 890
+  max_shared_memory = 127000;
+#elif CUDA_ARCH == 800
+  max_shared_memory = 163000;
+#elif CUDA_ARCH == 700
+  max_shared_memory = 95000;
+#endif
   return max_shared_memory;
 }
diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/bootstrapping_key.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/bootstrapping_key.cuh
index 7c0a7d6241..13f878f055 100644
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/bootstrapping_key.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/bootstrapping_key.cuh
@@ -115,6 +115,12 @@ void cuda_convert_lwe_programmable_bootstrap_key(cudaStream_t stream,
   switch (polynomial_size) {
   case 256:
     if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
+      check_cuda_error(cudaFuncSetAttribute(
+          batch_NSMFFT<FFTDegree<AmortizedDegree<256>, ForwardFFT>, FULLSM>,
+          cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
+      check_cuda_error(cudaFuncSetCacheConfig(
+          batch_NSMFFT<FFTDegree<AmortizedDegree<256>, ForwardFFT>, FULLSM>,
+          cudaFuncCachePreferShared));
       batch_NSMFFT<FFTDegree<AmortizedDegree<256>, ForwardFFT>, FULLSM>
           <<<gridSize, blockSize, shared_memory_size, stream>>>(d_bsk, dest,
                                                                 buffer);
@@ -127,6 +133,12 @@ void cuda_convert_lwe_programmable_bootstrap_key(cudaStream_t stream,
     break;
   case 512:
     if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
+      check_cuda_error(cudaFuncSetAttribute(
+          batch_NSMFFT<FFTDegree<AmortizedDegree<512>, ForwardFFT>, FULLSM>,
+          cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
+      check_cuda_error(cudaFuncSetCacheConfig(
+          batch_NSMFFT<FFTDegree<AmortizedDegree<512>, ForwardFFT>, FULLSM>,
+          cudaFuncCachePreferShared));
       batch_NSMFFT<FFTDegree<AmortizedDegree<512>, ForwardFFT>, FULLSM>
           <<<gridSize, blockSize, shared_memory_size, stream>>>(d_bsk, dest,
                                                                 buffer);
@@ -139,6 +151,12 @@ void cuda_convert_lwe_programmable_bootstrap_key(cudaStream_t stream,
     break;
   case 1024:
     if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
+      check_cuda_error(cudaFuncSetAttribute(
+          batch_NSMFFT<FFTDegree<AmortizedDegree<1024>, ForwardFFT>, FULLSM>,
+          cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
+      check_cuda_error(cudaFuncSetCacheConfig(
+          batch_NSMFFT<FFTDegree<AmortizedDegree<1024>, ForwardFFT>, FULLSM>,
+          cudaFuncCachePreferShared));
       batch_NSMFFT<FFTDegree<AmortizedDegree<1024>, ForwardFFT>, FULLSM>
           <<<gridSize, blockSize, shared_memory_size, stream>>>(d_bsk, dest,
                                                                 buffer);
@@ -151,6 +169,12 @@ void cuda_convert_lwe_programmable_bootstrap_key(cudaStream_t stream,
     break;
   case 2048:
     if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
+      check_cuda_error(cudaFuncSetAttribute(
+          batch_NSMFFT<FFTDegree<AmortizedDegree<2048>, ForwardFFT>, FULLSM>,
+          cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
+      check_cuda_error(cudaFuncSetCacheConfig(
+          batch_NSMFFT<FFTDegree<AmortizedDegree<2048>, ForwardFFT>, FULLSM>,
+          cudaFuncCachePreferShared));
       batch_NSMFFT<FFTDegree<AmortizedDegree<2048>, ForwardFFT>, FULLSM>
           <<<gridSize, blockSize, shared_memory_size, stream>>>(d_bsk, dest,
                                                                 buffer);
@@ -163,6 +187,12 @@ void cuda_convert_lwe_programmable_bootstrap_key(cudaStream_t stream,
     break;
   case 4096:
     if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
+      check_cuda_error(cudaFuncSetAttribute(
+          batch_NSMFFT<FFTDegree<AmortizedDegree<4096>, ForwardFFT>, FULLSM>,
+          cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
+      check_cuda_error(cudaFuncSetCacheConfig(
+          batch_NSMFFT<FFTDegree<AmortizedDegree<4096>, ForwardFFT>, FULLSM>,
+          cudaFuncCachePreferShared));
       batch_NSMFFT<FFTDegree<AmortizedDegree<4096>, ForwardFFT>, FULLSM>
           <<<gridSize, blockSize, shared_memory_size, stream>>>(d_bsk, dest,
                                                                 buffer);
@@ -175,6 +205,12 @@ void cuda_convert_lwe_programmable_bootstrap_key(cudaStream_t stream,
     break;
   case 8192:
     if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
+      check_cuda_error(cudaFuncSetAttribute(
+          batch_NSMFFT<FFTDegree<AmortizedDegree<8192>, ForwardFFT>, FULLSM>,
+          cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
+      check_cuda_error(cudaFuncSetCacheConfig(
+          batch_NSMFFT<FFTDegree<AmortizedDegree<8192>, ForwardFFT>, FULLSM>,
+          cudaFuncCachePreferShared));
       batch_NSMFFT<FFTDegree<AmortizedDegree<8192>, ForwardFFT>, FULLSM>
           <<<gridSize, blockSize, shared_memory_size, stream>>>(d_bsk, dest,
                                                                 buffer);
@@ -187,6 +223,12 @@ void cuda_convert_lwe_programmable_bootstrap_key(cudaStream_t stream,
     break;
   case 16384:
     if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
+      check_cuda_error(cudaFuncSetAttribute(
+          batch_NSMFFT<FFTDegree<AmortizedDegree<16384>, ForwardFFT>, FULLSM>,
+          cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
+      check_cuda_error(cudaFuncSetCacheConfig(
+          batch_NSMFFT<FFTDegree<AmortizedDegree<16384>, ForwardFFT>, FULLSM>,
+          cudaFuncCachePreferShared));
       batch_NSMFFT<FFTDegree<AmortizedDegree<16384>, ForwardFFT>, FULLSM>
           <<<gridSize, blockSize, shared_memory_size, stream>>>(d_bsk, dest,
                                                                 buffer);
@@ -227,6 +269,14 @@ void cuda_fourier_polynomial_mul(cudaStream_t stream, uint32_t gpu_index,
   case 256:
     if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
       buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
+      check_cuda_error(cudaFuncSetAttribute(
+          batch_polynomial_mul<FFTDegree<AmortizedDegree<256>, ForwardFFT>,
+                               FULLSM>,
+          cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
+      check_cuda_error(cudaFuncSetCacheConfig(
+          batch_polynomial_mul<FFTDegree<AmortizedDegree<256>, ForwardFFT>,
+                               FULLSM>,
+          cudaFuncCachePreferShared));
       batch_polynomial_mul<FFTDegree<AmortizedDegree<256>, ForwardFFT>, FULLSM>
           <<<gridSize, blockSize, shared_memory_size, stream>>>(input1, input2,
                                                                 output, buffer);
@@ -240,6 +290,14 @@ void cuda_fourier_polynomial_mul(cudaStream_t stream, uint32_t gpu_index,
   case 512:
     if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
       buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
+      check_cuda_error(cudaFuncSetAttribute(
+          batch_polynomial_mul<FFTDegree<AmortizedDegree<512>, ForwardFFT>,
+                               FULLSM>,
+          cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
+      check_cuda_error(cudaFuncSetCacheConfig(
+          batch_polynomial_mul<FFTDegree<AmortizedDegree<512>, ForwardFFT>,
+                               FULLSM>,
+          cudaFuncCachePreferShared));
       batch_polynomial_mul<FFTDegree<AmortizedDegree<512>, ForwardFFT>, FULLSM>
           <<<gridSize, blockSize, shared_memory_size, stream>>>(input1, input2,
                                                                 output, buffer);
@@ -253,6 +311,14 @@ void cuda_fourier_polynomial_mul(cudaStream_t stream, uint32_t gpu_index,
   case 1024:
     if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
       buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
+      check_cuda_error(cudaFuncSetAttribute(
+          batch_polynomial_mul<FFTDegree<AmortizedDegree<1024>, ForwardFFT>,
+                               FULLSM>,
+          cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
+      check_cuda_error(cudaFuncSetCacheConfig(
+          batch_polynomial_mul<FFTDegree<AmortizedDegree<1024>, ForwardFFT>,
+                               FULLSM>,
+          cudaFuncCachePreferShared));
       batch_polynomial_mul<FFTDegree<AmortizedDegree<1024>, ForwardFFT>, FULLSM>
           <<<gridSize, blockSize, shared_memory_size, stream>>>(input1, input2,
                                                                 output, buffer);
@@ -266,6 +332,14 @@ void cuda_fourier_polynomial_mul(cudaStream_t stream, uint32_t gpu_index,
   case 2048:
     if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
       buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
+      check_cuda_error(cudaFuncSetAttribute(
+          batch_polynomial_mul<FFTDegree<AmortizedDegree<2048>, ForwardFFT>,
+                               FULLSM>,
+          cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
+      check_cuda_error(cudaFuncSetCacheConfig(
+          batch_polynomial_mul<FFTDegree<AmortizedDegree<2048>, ForwardFFT>,
+                               FULLSM>,
+          cudaFuncCachePreferShared));
       batch_polynomial_mul<FFTDegree<AmortizedDegree<2048>, ForwardFFT>, FULLSM>
           <<<gridSize, blockSize, shared_memory_size, stream>>>(input1, input2,
                                                                 output, buffer);
@@ -279,6 +353,14 @@ void cuda_fourier_polynomial_mul(cudaStream_t stream, uint32_t gpu_index,
   case 4096:
     if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
       buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
+      check_cuda_error(cudaFuncSetAttribute(
+          batch_polynomial_mul<FFTDegree<AmortizedDegree<4096>, ForwardFFT>,
+                               FULLSM>,
+          cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
+      check_cuda_error(cudaFuncSetCacheConfig(
+          batch_polynomial_mul<FFTDegree<AmortizedDegree<4096>, ForwardFFT>,
+                               FULLSM>,
+          cudaFuncCachePreferShared));
       batch_polynomial_mul<FFTDegree<AmortizedDegree<4096>, ForwardFFT>, FULLSM>
           <<<gridSize, blockSize, shared_memory_size, stream>>>(input1, input2,
                                                                 output, buffer);
@@ -292,6 +374,14 @@ void cuda_fourier_polynomial_mul(cudaStream_t stream, uint32_t gpu_index,
   case 8192:
     if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
       buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
+      check_cuda_error(cudaFuncSetAttribute(
+          batch_polynomial_mul<FFTDegree<AmortizedDegree<8192>, ForwardFFT>,
+                               FULLSM>,
+          cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
+      check_cuda_error(cudaFuncSetCacheConfig(
+          batch_polynomial_mul<FFTDegree<AmortizedDegree<8192>, ForwardFFT>,
+                               FULLSM>,
+          cudaFuncCachePreferShared));
       batch_polynomial_mul<FFTDegree<AmortizedDegree<8192>, ForwardFFT>, FULLSM>
           <<<gridSize, blockSize, shared_memory_size, stream>>>(input1, input2,
                                                                 output, buffer);
@@ -305,6 +395,14 @@ void cuda_fourier_polynomial_mul(cudaStream_t stream, uint32_t gpu_index,
   case 16384:
     if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
       buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
+      check_cuda_error(cudaFuncSetAttribute(
+          batch_polynomial_mul<FFTDegree<AmortizedDegree<16384>, ForwardFFT>,
+                               FULLSM>,
+          cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
+      check_cuda_error(cudaFuncSetCacheConfig(
+          batch_polynomial_mul<FFTDegree<AmortizedDegree<16384>, ForwardFFT>,
+                               FULLSM>,
+          cudaFuncCachePreferShared));
       batch_polynomial_mul<FFTDegree<AmortizedDegree<16384>, ForwardFFT>, FULLSM>
           <<<gridSize, blockSize, shared_memory_size, stream>>>(input1, input2,
                                                                 output, buffer);
diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_amortized.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_amortized.cuh
index 250f147988..6a60a0f6d1 100644
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_amortized.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_amortized.cuh
@@ -258,6 +258,28 @@ __host__ void scratch_programmable_bootstrap_amortized(
     uint32_t glwe_dimension, uint32_t polynomial_size,
     uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) {
+  uint64_t full_sm =
+      get_buffer_size_full_sm_programmable_bootstrap_amortized<Torus>(
+          polynomial_size, glwe_dimension);
+  uint64_t partial_sm =
+      get_buffer_size_partial_sm_programmable_bootstrap_amortized<Torus>(
+          polynomial_size);
+  int max_shared_memory = cuda_get_max_shared_memory(0);
+  if (max_shared_memory >= partial_sm && max_shared_memory < full_sm) {
+    check_cuda_error(cudaFuncSetAttribute(
+        device_programmable_bootstrap_amortized<Torus, params, PARTIALSM>,
+        cudaFuncAttributeMaxDynamicSharedMemorySize, partial_sm));
+    check_cuda_error(cudaFuncSetCacheConfig(
+        device_programmable_bootstrap_amortized<Torus, params, PARTIALSM>,
+        cudaFuncCachePreferShared));
+  } else if (max_shared_memory >= partial_sm) {
+    check_cuda_error(cudaFuncSetAttribute(
+        device_programmable_bootstrap_amortized<Torus, params, FULLSM>,
+        cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm));
+    check_cuda_error(cudaFuncSetCacheConfig(
+        device_programmable_bootstrap_amortized<Torus, params, FULLSM>,
+        cudaFuncCachePreferShared));
+  }
 
   if (allocate_gpu_memory) {
     uint64_t buffer_size = get_buffer_size_programmable_bootstrap_amortized<Torus>(
diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_classic.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_classic.cuh
index 668cf2e0ae..04ff5348c5 100644
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_classic.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_classic.cuh
@@ -194,6 +194,30 @@ __host__ void scratch_programmable_bootstrap_cg(
     uint32_t polynomial_size, uint32_t level_count,
     uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) {
+  uint64_t full_sm =
+      get_buffer_size_full_sm_programmable_bootstrap_cg<Torus>(polynomial_size);
+  uint64_t partial_sm =
+      get_buffer_size_partial_sm_programmable_bootstrap_cg<Torus>(
+          polynomial_size);
+  int max_shared_memory = cuda_get_max_shared_memory(0);
+  if (max_shared_memory >= partial_sm && max_shared_memory < full_sm) {
+    check_cuda_error(cudaFuncSetAttribute(
+        device_programmable_bootstrap_cg<Torus, params, PARTIALSM>,
+        cudaFuncAttributeMaxDynamicSharedMemorySize, partial_sm));
+    cudaFuncSetCacheConfig(
+        device_programmable_bootstrap_cg<Torus, params, PARTIALSM>,
+        cudaFuncCachePreferShared);
+    check_cuda_error(cudaGetLastError());
+  } else if (max_shared_memory >= partial_sm) {
+    check_cuda_error(cudaFuncSetAttribute(
+        device_programmable_bootstrap_cg<Torus, params, FULLSM>,
+        cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm));
+    cudaFuncSetCacheConfig(
+        device_programmable_bootstrap_cg<Torus, params, FULLSM>,
+        cudaFuncCachePreferShared);
+    check_cuda_error(cudaGetLastError());
+  }
+
   *buffer = new pbs_buffer<Torus, CLASSICAL>(
       stream, gpu_index, glwe_dimension, polynomial_size, level_count,
       input_lwe_ciphertext_count, PBS_VARIANT::CG, allocate_gpu_memory);
diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_multibit.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_multibit.cuh
index f26f45b810..29d8ee9590 100644
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_multibit.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_multibit.cuh
@@ -213,6 +213,69 @@ __host__ void scratch_cg_multi_bit_programmable_bootstrap(
     uint32_t polynomial_size, uint32_t level_count,
     uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) {
+  uint64_t full_sm_keybundle =
+      get_buffer_size_full_sm_multibit_programmable_bootstrap_keybundle<Torus>(
+          polynomial_size);
+  uint64_t full_sm_cg_accumulate =
+      get_buffer_size_full_sm_cg_multibit_programmable_bootstrap<Torus>(
+          polynomial_size);
+  uint64_t partial_sm_cg_accumulate =
+      get_buffer_size_partial_sm_cg_multibit_programmable_bootstrap<Torus>(
+          polynomial_size);
+
+  int max_shared_memory = cuda_get_max_shared_memory(0);
+  if (max_shared_memory < full_sm_keybundle) {
+    check_cuda_error(cudaFuncSetAttribute(
+        device_multi_bit_programmable_bootstrap_keybundle<Torus, params, NOSM>,
+        cudaFuncAttributeMaxDynamicSharedMemorySize, 0));
+    cudaFuncSetCacheConfig(
+        device_multi_bit_programmable_bootstrap_keybundle<Torus, params, NOSM>,
+        cudaFuncCachePreferShared);
+    check_cuda_error(cudaGetLastError());
+  } else {
+    check_cuda_error(cudaFuncSetAttribute(
+        device_multi_bit_programmable_bootstrap_keybundle<Torus, params,
+                                                          FULLSM>,
+        cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm_keybundle));
+    cudaFuncSetCacheConfig(
+        device_multi_bit_programmable_bootstrap_keybundle<Torus, params,
+                                                          FULLSM>,
+        cudaFuncCachePreferShared);
+    check_cuda_error(cudaGetLastError());
+  }
+
+  if (max_shared_memory < partial_sm_cg_accumulate) {
+    check_cuda_error(cudaFuncSetAttribute(
+        device_multi_bit_programmable_bootstrap_cg_accumulate<Torus, params,
+                                                              NOSM>,
+        cudaFuncAttributeMaxDynamicSharedMemorySize, 0));
+    cudaFuncSetCacheConfig(
+        device_multi_bit_programmable_bootstrap_cg_accumulate<Torus, params,
+                                                              NOSM>,
+        cudaFuncCachePreferShared);
+    check_cuda_error(cudaGetLastError());
+  } else if (max_shared_memory < full_sm_cg_accumulate) {
+    check_cuda_error(cudaFuncSetAttribute(
+        device_multi_bit_programmable_bootstrap_cg_accumulate<Torus, params,
+                                                              PARTIALSM>,
+        cudaFuncAttributeMaxDynamicSharedMemorySize, partial_sm_cg_accumulate));
+    cudaFuncSetCacheConfig(
+        device_multi_bit_programmable_bootstrap_cg_accumulate<Torus, params,
+                                                              PARTIALSM>,
+        cudaFuncCachePreferShared);
+    check_cuda_error(cudaGetLastError());
+  } else {
+    check_cuda_error(cudaFuncSetAttribute(
+        device_multi_bit_programmable_bootstrap_cg_accumulate<Torus, params,
+                                                              FULLSM>,
+        cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm_cg_accumulate));
+    cudaFuncSetCacheConfig(
+        device_multi_bit_programmable_bootstrap_cg_accumulate<Torus, params,
+                                                              FULLSM>,
+        cudaFuncCachePreferShared);
+    check_cuda_error(cudaGetLastError());
+  }
+
   auto lwe_chunk_size = get_lwe_chunk_size<Torus, params>(
       gpu_index, input_lwe_ciphertext_count, polynomial_size);
   *buffer = new pbs_buffer<Torus, MULTI_BIT>(
diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cuh
index d4966f1cdc..b9dfdf415c 100644
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cuh
@@ -309,6 +309,55 @@ __host__ void scratch_programmable_bootstrap(
     uint32_t polynomial_size, uint32_t level_count,
     uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) {
+  uint64_t full_sm_step_one =
+      get_buffer_size_full_sm_programmable_bootstrap_step_one<Torus>(
+          polynomial_size);
+  uint64_t full_sm_step_two =
+      get_buffer_size_full_sm_programmable_bootstrap_step_two<Torus>(
+          polynomial_size);
+  uint64_t partial_sm =
+      get_buffer_size_partial_sm_programmable_bootstrap<Torus>(polynomial_size);
+
+  int max_shared_memory = cuda_get_max_shared_memory(0);
+
+  // Configure step one
+  if (max_shared_memory >= partial_sm && max_shared_memory < full_sm_step_one) {
+    check_cuda_error(cudaFuncSetAttribute(
+        device_programmable_bootstrap_step_one<Torus, params, PARTIALSM>,
+        cudaFuncAttributeMaxDynamicSharedMemorySize, partial_sm));
+    cudaFuncSetCacheConfig(
+        device_programmable_bootstrap_step_one<Torus, params, PARTIALSM>,
+        cudaFuncCachePreferShared);
+    check_cuda_error(cudaGetLastError());
+  } else if (max_shared_memory >= partial_sm) {
+    check_cuda_error(cudaFuncSetAttribute(
+        device_programmable_bootstrap_step_one<Torus, params, FULLSM>,
+        cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm_step_one));
+    cudaFuncSetCacheConfig(
+        device_programmable_bootstrap_step_one<Torus, params, FULLSM>,
+        cudaFuncCachePreferShared);
+    check_cuda_error(cudaGetLastError());
+  }
+
+  // Configure step two
+  if (max_shared_memory >= partial_sm && max_shared_memory < full_sm_step_two) {
+    check_cuda_error(cudaFuncSetAttribute(
+        device_programmable_bootstrap_step_two<Torus, params, PARTIALSM>,
+        cudaFuncAttributeMaxDynamicSharedMemorySize, partial_sm));
+    cudaFuncSetCacheConfig(
+        device_programmable_bootstrap_step_two<Torus, params, PARTIALSM>,
+        cudaFuncCachePreferShared);
+    check_cuda_error(cudaGetLastError());
+  } else if (max_shared_memory >= partial_sm) {
+    check_cuda_error(cudaFuncSetAttribute(
+        device_programmable_bootstrap_step_two<Torus, params, FULLSM>,
+        cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm_step_two));
+    cudaFuncSetCacheConfig(
+        device_programmable_bootstrap_step_two<Torus, params, FULLSM>,
+        cudaFuncCachePreferShared);
+    check_cuda_error(cudaGetLastError());
+  }
+
   *buffer = new pbs_buffer<Torus, CLASSICAL>(
       stream, gpu_index, glwe_dimension, polynomial_size, level_count,
       input_lwe_ciphertext_count, PBS_VARIANT::DEFAULT, allocate_gpu_memory);
diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cuh
index 455233f057..0f0ddaf79d 100644
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cuh
@@ -389,6 +389,97 @@ __host__ void scratch_multi_bit_programmable_bootstrap(
     uint32_t polynomial_size, uint32_t level_count,
     uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) {
+  int max_shared_memory = cuda_get_max_shared_memory(0);
+  uint64_t full_sm_keybundle =
+      get_buffer_size_full_sm_multibit_programmable_bootstrap_keybundle<Torus>(
+          polynomial_size);
+  uint64_t full_sm_accumulate_step_one =
+      get_buffer_size_full_sm_multibit_programmable_bootstrap_step_one<Torus>(
+          polynomial_size);
+  uint64_t full_sm_accumulate_step_two =
+      get_buffer_size_full_sm_multibit_programmable_bootstrap_step_two<Torus>(
+          polynomial_size);
+  uint64_t partial_sm_accumulate_step_one =
+      get_buffer_size_partial_sm_multibit_programmable_bootstrap_step_one<
+          Torus>(polynomial_size);
+
+  if (max_shared_memory < full_sm_keybundle) {
+    check_cuda_error(cudaFuncSetAttribute(
+        device_multi_bit_programmable_bootstrap_keybundle<Torus, params, NOSM>,
+        cudaFuncAttributeMaxDynamicSharedMemorySize, 0));
+    cudaFuncSetCacheConfig(
+        device_multi_bit_programmable_bootstrap_keybundle<Torus, params, NOSM>,
+        cudaFuncCachePreferShared);
+    check_cuda_error(cudaGetLastError());
+  } else {
+    check_cuda_error(cudaFuncSetAttribute(
+        device_multi_bit_programmable_bootstrap_keybundle<Torus, params,
+                                                          FULLSM>,
+        cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm_keybundle));
+    cudaFuncSetCacheConfig(
+        device_multi_bit_programmable_bootstrap_keybundle<Torus, params,
+                                                          FULLSM>,
+        cudaFuncCachePreferShared);
+    check_cuda_error(cudaGetLastError());
+  }
+
+  if (max_shared_memory < partial_sm_accumulate_step_one) {
+    check_cuda_error(cudaFuncSetAttribute(
+        device_multi_bit_programmable_bootstrap_accumulate_step_one<
+            Torus, params, NOSM>,
+        cudaFuncAttributeMaxDynamicSharedMemorySize, 0));
+    cudaFuncSetCacheConfig(
+        device_multi_bit_programmable_bootstrap_accumulate_step_one<
+            Torus, params, NOSM>,
+        cudaFuncCachePreferShared);
+    check_cuda_error(cudaGetLastError());
+  } else if (max_shared_memory < full_sm_accumulate_step_one) {
+    check_cuda_error(cudaFuncSetAttribute(
+        device_multi_bit_programmable_bootstrap_accumulate_step_one<
+            Torus, params, PARTIALSM>,
+        cudaFuncAttributeMaxDynamicSharedMemorySize,
+        partial_sm_accumulate_step_one));
+    cudaFuncSetCacheConfig(
+        device_multi_bit_programmable_bootstrap_accumulate_step_one<
+            Torus, params, PARTIALSM>,
+        cudaFuncCachePreferShared);
+    check_cuda_error(cudaGetLastError());
+  } else {
+    check_cuda_error(cudaFuncSetAttribute(
+        device_multi_bit_programmable_bootstrap_accumulate_step_one<
+            Torus, params, FULLSM>,
+        cudaFuncAttributeMaxDynamicSharedMemorySize,
+        full_sm_accumulate_step_one));
+    cudaFuncSetCacheConfig(
+        device_multi_bit_programmable_bootstrap_accumulate_step_one<
+            Torus, params, FULLSM>,
+        cudaFuncCachePreferShared);
+    check_cuda_error(cudaGetLastError());
+  }
+
+  if (max_shared_memory < full_sm_accumulate_step_two) {
+    check_cuda_error(cudaFuncSetAttribute(
+        device_multi_bit_programmable_bootstrap_accumulate_step_two<
+            Torus, params, NOSM>,
+        cudaFuncAttributeMaxDynamicSharedMemorySize, 0));
+    cudaFuncSetCacheConfig(
+        device_multi_bit_programmable_bootstrap_accumulate_step_two<
+            Torus, params, NOSM>,
+        cudaFuncCachePreferShared);
+    check_cuda_error(cudaGetLastError());
+  } else {
+    check_cuda_error(cudaFuncSetAttribute(
+        device_multi_bit_programmable_bootstrap_accumulate_step_two<
+            Torus, params, FULLSM>,
+        cudaFuncAttributeMaxDynamicSharedMemorySize,
+        full_sm_accumulate_step_two));
+    cudaFuncSetCacheConfig(
+        device_multi_bit_programmable_bootstrap_accumulate_step_two<
+            Torus, params, FULLSM>,
+        cudaFuncCachePreferShared);
+    check_cuda_error(cudaGetLastError());
+  }
+
   auto lwe_chunk_size = get_lwe_chunk_size<Torus, params>(
       gpu_index, input_lwe_ciphertext_count, polynomial_size);
   *buffer = new pbs_buffer<Torus, MULTI_BIT>(
diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_classic.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_classic.cuh
index 515527ade1..5dccab3606 100644
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_classic.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_classic.cuh
@@ -200,6 +200,49 @@ __host__ void scratch_programmable_bootstrap_tbc(
     uint32_t polynomial_size, uint32_t level_count,
     uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) {
+  bool supports_dsm =
+      supports_distributed_shared_memory_on_classic_programmable_bootstrap<
+          Torus>(polynomial_size);
+
+  uint64_t full_sm = get_buffer_size_full_sm_programmable_bootstrap_tbc<Torus>(
+      polynomial_size);
+  uint64_t partial_sm =
+      get_buffer_size_partial_sm_programmable_bootstrap_tbc<Torus>(
+          polynomial_size);
+  uint64_t minimum_sm_tbc = 0;
+  if (supports_dsm)
+    minimum_sm_tbc =
+        get_buffer_size_sm_dsm_plus_tbc_classic_programmable_bootstrap<Torus>(
+            polynomial_size);
+  int max_shared_memory = cuda_get_max_shared_memory(0);
+
+  if (max_shared_memory >= full_sm + minimum_sm_tbc) {
+    check_cuda_error(cudaFuncSetAttribute(
+        device_programmable_bootstrap_tbc<Torus, params, FULLSM>,
+        cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm + minimum_sm_tbc));
+    cudaFuncSetCacheConfig(
+        device_programmable_bootstrap_tbc<Torus, params, FULLSM>,
+        cudaFuncCachePreferShared);
+    check_cuda_error(cudaGetLastError());
+  } else if (max_shared_memory >= partial_sm + minimum_sm_tbc) {
+    check_cuda_error(cudaFuncSetAttribute(
+        device_programmable_bootstrap_tbc<Torus, params, PARTIALSM>,
+        cudaFuncAttributeMaxDynamicSharedMemorySize,
+        partial_sm + minimum_sm_tbc));
+    cudaFuncSetCacheConfig(
+        device_programmable_bootstrap_tbc<Torus, params, PARTIALSM>,
+        cudaFuncCachePreferShared);
+    check_cuda_error(cudaGetLastError());
+  } else {
+    check_cuda_error(cudaFuncSetAttribute(
+        device_programmable_bootstrap_tbc<Torus, params, NOSM>,
+        cudaFuncAttributeMaxDynamicSharedMemorySize, minimum_sm_tbc));
+    cudaFuncSetCacheConfig(
+        device_programmable_bootstrap_tbc<Torus, params, NOSM>,
+        cudaFuncCachePreferShared);
+    check_cuda_error(cudaGetLastError());
+  }
+
   *buffer = new pbs_buffer<Torus, CLASSICAL>(
       stream, gpu_index, glwe_dimension, polynomial_size, level_count,
       input_lwe_ciphertext_count, PBS_VARIANT::TBC, allocate_gpu_memory);
diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_multibit.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_multibit.cuh
index 1e839e6c5b..06223abc3c 100644
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_multibit.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_multibit.cuh
@@ -203,6 +203,84 @@ __host__ void
 scratch_tbc_multi_bit_programmable_bootstrap(
     uint32_t polynomial_size, uint32_t level_count,
     uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) {
+  bool supports_dsm =
+      supports_distributed_shared_memory_on_multibit_programmable_bootstrap<
+          Torus>(polynomial_size);
+
+  uint64_t full_sm_keybundle =
+      get_buffer_size_full_sm_multibit_programmable_bootstrap_keybundle<Torus>(
+          polynomial_size);
+  uint64_t full_sm_tbc_accumulate =
+      get_buffer_size_full_sm_tbc_multibit_programmable_bootstrap<Torus>(
+          polynomial_size);
+  uint64_t partial_sm_tbc_accumulate =
+      get_buffer_size_partial_sm_tbc_multibit_programmable_bootstrap<Torus>(
+          polynomial_size);
+  uint64_t minimum_sm_tbc_accumulate = 0;
+  if (supports_dsm)
+    minimum_sm_tbc_accumulate =
+        get_buffer_size_sm_dsm_plus_tbc_multibit_programmable_bootstrap<Torus>(
+            polynomial_size);
+
+  int max_shared_memory = cuda_get_max_shared_memory(0);
+
+  if (max_shared_memory < full_sm_keybundle) {
+    check_cuda_error(cudaFuncSetAttribute(
+        device_multi_bit_programmable_bootstrap_keybundle<Torus, params, NOSM>,
+        cudaFuncAttributeMaxDynamicSharedMemorySize, 0));
+    cudaFuncSetCacheConfig(
+        device_multi_bit_programmable_bootstrap_keybundle<Torus, params, NOSM>,
+        cudaFuncCachePreferShared);
+    check_cuda_error(cudaGetLastError());
+  } else {
+    check_cuda_error(cudaFuncSetAttribute(
+        device_multi_bit_programmable_bootstrap_keybundle<Torus, params,
+                                                          FULLSM>,
+        cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm_keybundle));
+    cudaFuncSetCacheConfig(
+        device_multi_bit_programmable_bootstrap_keybundle<Torus, params,
+                                                          FULLSM>,
+        cudaFuncCachePreferShared);
+    check_cuda_error(cudaGetLastError());
+  }
+
+  if (max_shared_memory <
+      partial_sm_tbc_accumulate + minimum_sm_tbc_accumulate) {
+    check_cuda_error(cudaFuncSetAttribute(
+        device_multi_bit_programmable_bootstrap_tbc_accumulate<Torus, params,
+                                                               NOSM>,
+        cudaFuncAttributeMaxDynamicSharedMemorySize,
+        minimum_sm_tbc_accumulate));
+    cudaFuncSetCacheConfig(
+        device_multi_bit_programmable_bootstrap_tbc_accumulate<Torus, params,
+                                                               NOSM>,
+        cudaFuncCachePreferShared);
+    check_cuda_error(cudaGetLastError());
+  } else if (max_shared_memory <
+             full_sm_tbc_accumulate + minimum_sm_tbc_accumulate) {
+    check_cuda_error(cudaFuncSetAttribute(
+        device_multi_bit_programmable_bootstrap_tbc_accumulate<Torus, params,
+                                                               PARTIALSM>,
+        cudaFuncAttributeMaxDynamicSharedMemorySize,
+        partial_sm_tbc_accumulate + minimum_sm_tbc_accumulate));
+    cudaFuncSetCacheConfig(
+        device_multi_bit_programmable_bootstrap_tbc_accumulate<Torus, params,
+                                                               PARTIALSM>,
+        cudaFuncCachePreferShared);
+    check_cuda_error(cudaGetLastError());
+  } else {
+    check_cuda_error(cudaFuncSetAttribute(
+        device_multi_bit_programmable_bootstrap_tbc_accumulate<Torus, params,
+                                                               FULLSM>,
+        cudaFuncAttributeMaxDynamicSharedMemorySize,
+        full_sm_tbc_accumulate + minimum_sm_tbc_accumulate));
+    cudaFuncSetCacheConfig(
+        device_multi_bit_programmable_bootstrap_tbc_accumulate<Torus, params,
+                                                               FULLSM>,
+        cudaFuncCachePreferShared);
+    check_cuda_error(cudaGetLastError());
+  }
+
   auto lwe_chunk_size = get_lwe_chunk_size<Torus, params>(
       gpu_index, input_lwe_ciphertext_count, polynomial_size);
   *buffer = new pbs_buffer<Torus, MULTI_BIT>(
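
Note on the pattern above: `cudaDevAttrMaxSharedMemoryPerBlock` reports only the default 48 KB per-block limit, which is why `cuda_get_max_shared_memory` overrides it with hand-tuned per-architecture constants. A kernel can only receive more than 48 KB of dynamic shared memory after opting in through `cudaFuncSetAttribute` with `cudaFuncAttributeMaxDynamicSharedMemorySize`, which is exactly what each hunk adds before the corresponding launch. A minimal, self-contained sketch of that opt-in pattern follows; the kernel name and the 64 KB figure are hypothetical, not taken from the patch:

// Sketch only: opting a kernel into more than the default 48 KB of
// dynamic shared memory.
#include <cstdio>
#include <cuda_runtime.h>

__global__ void wide_tile_kernel(float *out, int n) {
  extern __shared__ float tile[]; // dynamic shared memory
  int i = threadIdx.x;
  if (i < n) {
    tile[i] = static_cast<float>(i);
    __syncthreads();
    out[i] = tile[i];
  }
}

int main() {
  int dflt = 0, optin = 0;
  // Default per-block limit (48 KB) vs. the larger opt-in limit.
  cudaDeviceGetAttribute(&dflt, cudaDevAttrMaxSharedMemoryPerBlock, 0);
  cudaDeviceGetAttribute(&optin, cudaDevAttrMaxSharedMemoryPerBlockOptin, 0);
  printf("default: %d bytes, opt-in: %d bytes\n", dflt, optin);

  size_t shmem = 64 * 1024; // above 48 KB, so the opt-in below is required
  if (cudaFuncSetAttribute(wide_tile_kernel,
                           cudaFuncAttributeMaxDynamicSharedMemorySize,
                           (int)shmem) != cudaSuccess) {
    printf("opt-in failed: requested size exceeds this arch's limit\n");
    return 1;
  }
  float *out;
  cudaMalloc(&out, 256 * sizeof(float));
  wide_tile_kernel<<<1, 256, shmem>>>(out, 256);
  printf("launch: %s\n", cudaGetErrorString(cudaGetLastError()));
  cudaFree(out);
  return 0;
}

The `cudaFuncSetCacheConfig(..., cudaFuncCachePreferShared)` calls in the patch additionally hint the driver to favor shared memory over L1 cache for the kernels that were just opted in.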