chore(gpu): increase max sm size again to gain performance on 3_3 params on H100

This reverts commit 1c0b6fb.
agnesLeroy authored and pdroalves committed Oct 3, 2024
1 parent 51cae3d commit 256378f
Showing 9 changed files with 477 additions and 0 deletions.
9 changes: 9 additions & 0 deletions backends/tfhe-cuda-backend/cuda/src/device.cu
@@ -271,5 +271,14 @@ int cuda_get_max_shared_memory(uint32_t gpu_index) {
cudaDeviceGetAttribute(&max_shared_memory, cudaDevAttrMaxSharedMemoryPerBlock,
gpu_index);
check_cuda_error(cudaGetLastError());
#if CUDA_ARCH == 900
max_shared_memory = 226000;
#elif CUDA_ARCH == 890
max_shared_memory = 127000;
#elif CUDA_ARCH == 800
max_shared_memory = 163000;
#elif CUDA_ARCH == 700
max_shared_memory = 95000;
#endif
return max_shared_memory;
}
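The constants above pin the usable dynamic shared memory per architecture (sm_90, sm_89, sm_80, sm_70). As a point of comparison, CUDA also exposes the opt-in per-block limit directly, so the cap could in principle be queried at runtime rather than switched on CUDA_ARCH. A minimal sketch, not part of this commit; cuda_get_max_shared_memory_opt_in is a hypothetical name:

// Sketch: query the opt-in per-block dynamic shared memory limit at
// runtime instead of hardcoding per-architecture constants.
int cuda_get_max_shared_memory_opt_in(uint32_t gpu_index) {
  int max_shared_memory = 0;
  cudaDeviceGetAttribute(&max_shared_memory,
                         cudaDevAttrMaxSharedMemoryPerBlockOptin, gpu_index);
  check_cuda_error(cudaGetLastError());
  return max_shared_memory;
}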
98 changes: 98 additions & 0 deletions backends/tfhe-cuda-backend/cuda/src/pbs/bootstrapping_key.cuh
@@ -115,6 +115,12 @@ void cuda_convert_lwe_programmable_bootstrap_key(cudaStream_t stream,
switch (polynomial_size) {
case 256:
if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
check_cuda_error(cudaFuncSetAttribute(
batch_NSMFFT<FFTDegree<AmortizedDegree<256>, ForwardFFT>, FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
check_cuda_error(cudaFuncSetCacheConfig(
batch_NSMFFT<FFTDegree<AmortizedDegree<256>, ForwardFFT>, FULLSM>,
cudaFuncCachePreferShared));
batch_NSMFFT<FFTDegree<AmortizedDegree<256>, ForwardFFT>, FULLSM>
<<<gridSize, blockSize, shared_memory_size, stream>>>(d_bsk, dest,
buffer);
@@ -127,6 +133,12 @@ void cuda_convert_lwe_programmable_bootstrap_key(cudaStream_t stream,
break;
case 512:
if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
check_cuda_error(cudaFuncSetAttribute(
batch_NSMFFT<FFTDegree<AmortizedDegree<512>, ForwardFFT>, FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
check_cuda_error(cudaFuncSetCacheConfig(
batch_NSMFFT<FFTDegree<AmortizedDegree<512>, ForwardFFT>, FULLSM>,
cudaFuncCachePreferShared));
batch_NSMFFT<FFTDegree<AmortizedDegree<512>, ForwardFFT>, FULLSM>
<<<gridSize, blockSize, shared_memory_size, stream>>>(d_bsk, dest,
buffer);
@@ -139,6 +151,12 @@ void cuda_convert_lwe_programmable_bootstrap_key(cudaStream_t stream,
break;
case 1024:
if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
check_cuda_error(cudaFuncSetAttribute(
batch_NSMFFT<FFTDegree<AmortizedDegree<1024>, ForwardFFT>, FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
check_cuda_error(cudaFuncSetCacheConfig(
batch_NSMFFT<FFTDegree<AmortizedDegree<1024>, ForwardFFT>, FULLSM>,
cudaFuncCachePreferShared));
batch_NSMFFT<FFTDegree<AmortizedDegree<1024>, ForwardFFT>, FULLSM>
<<<gridSize, blockSize, shared_memory_size, stream>>>(d_bsk, dest,
buffer);
@@ -151,6 +169,12 @@ void cuda_convert_lwe_programmable_bootstrap_key(cudaStream_t stream,
break;
case 2048:
if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
check_cuda_error(cudaFuncSetAttribute(
batch_NSMFFT<FFTDegree<AmortizedDegree<2048>, ForwardFFT>, FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
check_cuda_error(cudaFuncSetCacheConfig(
batch_NSMFFT<FFTDegree<AmortizedDegree<2048>, ForwardFFT>, FULLSM>,
cudaFuncCachePreferShared));
batch_NSMFFT<FFTDegree<AmortizedDegree<2048>, ForwardFFT>, FULLSM>
<<<gridSize, blockSize, shared_memory_size, stream>>>(d_bsk, dest,
buffer);
@@ -163,6 +187,12 @@ void cuda_convert_lwe_programmable_bootstrap_key(cudaStream_t stream,
break;
case 4096:
if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
check_cuda_error(cudaFuncSetAttribute(
batch_NSMFFT<FFTDegree<AmortizedDegree<4096>, ForwardFFT>, FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
check_cuda_error(cudaFuncSetCacheConfig(
batch_NSMFFT<FFTDegree<AmortizedDegree<4096>, ForwardFFT>, FULLSM>,
cudaFuncCachePreferShared));
batch_NSMFFT<FFTDegree<AmortizedDegree<4096>, ForwardFFT>, FULLSM>
<<<gridSize, blockSize, shared_memory_size, stream>>>(d_bsk, dest,
buffer);
@@ -175,6 +205,12 @@ void cuda_convert_lwe_programmable_bootstrap_key(cudaStream_t stream,
break;
case 8192:
if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
check_cuda_error(cudaFuncSetAttribute(
batch_NSMFFT<FFTDegree<AmortizedDegree<8192>, ForwardFFT>, FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
check_cuda_error(cudaFuncSetCacheConfig(
batch_NSMFFT<FFTDegree<AmortizedDegree<8192>, ForwardFFT>, FULLSM>,
cudaFuncCachePreferShared));
batch_NSMFFT<FFTDegree<AmortizedDegree<8192>, ForwardFFT>, FULLSM>
<<<gridSize, blockSize, shared_memory_size, stream>>>(d_bsk, dest,
buffer);
@@ -187,6 +223,12 @@ void cuda_convert_lwe_programmable_bootstrap_key(cudaStream_t stream,
break;
case 16384:
if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
check_cuda_error(cudaFuncSetAttribute(
batch_NSMFFT<FFTDegree<AmortizedDegree<16384>, ForwardFFT>, FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
check_cuda_error(cudaFuncSetCacheConfig(
batch_NSMFFT<FFTDegree<AmortizedDegree<16384>, ForwardFFT>, FULLSM>,
cudaFuncCachePreferShared));
batch_NSMFFT<FFTDegree<AmortizedDegree<16384>, ForwardFFT>, FULLSM>
<<<gridSize, blockSize, shared_memory_size, stream>>>(d_bsk, dest,
buffer);
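Every case above repeats the same three steps for a different template instantiation: raise the kernel's dynamic shared memory limit, prefer the shared-memory cache split, then launch. A minimal sketch of that pattern factored into one helper — launch_nsmfft_full_sm is a hypothetical name, and the double2 * argument types are assumed from the launches above:

// Hypothetical helper sketching the repeated opt-in-then-launch pattern;
// FFT stands for an instantiation such as
// FFTDegree<AmortizedDegree<1024>, ForwardFFT>.
template <class FFT>
void launch_nsmfft_full_sm(cudaStream_t stream, dim3 grid, dim3 block,
                           size_t shared_memory_size, double2 *d_bsk,
                           double2 *dest, double2 *buffer) {
  auto kernel = batch_NSMFFT<FFT, FULLSM>;
  check_cuda_error(cudaFuncSetAttribute(
      kernel, cudaFuncAttributeMaxDynamicSharedMemorySize,
      shared_memory_size));
  check_cuda_error(cudaFuncSetCacheConfig(kernel, cudaFuncCachePreferShared));
  kernel<<<grid, block, shared_memory_size, stream>>>(d_bsk, dest, buffer);
}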
@@ -227,6 +269,14 @@ void cuda_fourier_polynomial_mul(cudaStream_t stream, uint32_t gpu_index,
case 256:
if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
check_cuda_error(cudaFuncSetAttribute(
batch_polynomial_mul<FFTDegree<AmortizedDegree<256>, ForwardFFT>,
FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
check_cuda_error(cudaFuncSetCacheConfig(
batch_polynomial_mul<FFTDegree<AmortizedDegree<256>, ForwardFFT>,
FULLSM>,
cudaFuncCachePreferShared));
batch_polynomial_mul<FFTDegree<AmortizedDegree<256>, ForwardFFT>, FULLSM>
<<<gridSize, blockSize, shared_memory_size, stream>>>(input1, input2,
output, buffer);
@@ -240,6 +290,14 @@ void cuda_fourier_polynomial_mul(cudaStream_t stream, uint32_t gpu_index,
case 512:
if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
check_cuda_error(cudaFuncSetAttribute(
batch_polynomial_mul<FFTDegree<AmortizedDegree<512>, ForwardFFT>,
FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
check_cuda_error(cudaFuncSetCacheConfig(
batch_polynomial_mul<FFTDegree<AmortizedDegree<512>, ForwardFFT>,
FULLSM>,
cudaFuncCachePreferShared));
batch_polynomial_mul<FFTDegree<AmortizedDegree<512>, ForwardFFT>, FULLSM>
<<<gridSize, blockSize, shared_memory_size, stream>>>(input1, input2,
output, buffer);
@@ -253,6 +311,14 @@ void cuda_fourier_polynomial_mul(cudaStream_t stream, uint32_t gpu_index,
case 1024:
if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
check_cuda_error(cudaFuncSetAttribute(
batch_polynomial_mul<FFTDegree<AmortizedDegree<1024>, ForwardFFT>,
FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
check_cuda_error(cudaFuncSetCacheConfig(
batch_polynomial_mul<FFTDegree<AmortizedDegree<1024>, ForwardFFT>,
FULLSM>,
cudaFuncCachePreferShared));
batch_polynomial_mul<FFTDegree<AmortizedDegree<1024>, ForwardFFT>, FULLSM>
<<<gridSize, blockSize, shared_memory_size, stream>>>(input1, input2,
output, buffer);
@@ -266,6 +332,14 @@ void cuda_fourier_polynomial_mul(cudaStream_t stream, uint32_t gpu_index,
case 2048:
if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
check_cuda_error(cudaFuncSetAttribute(
batch_polynomial_mul<FFTDegree<AmortizedDegree<2048>, ForwardFFT>,
FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
check_cuda_error(cudaFuncSetCacheConfig(
batch_polynomial_mul<FFTDegree<AmortizedDegree<2048>, ForwardFFT>,
FULLSM>,
cudaFuncCachePreferShared));
batch_polynomial_mul<FFTDegree<AmortizedDegree<2048>, ForwardFFT>, FULLSM>
<<<gridSize, blockSize, shared_memory_size, stream>>>(input1, input2,
output, buffer);
@@ -279,6 +353,14 @@ void cuda_fourier_polynomial_mul(cudaStream_t stream, uint32_t gpu_index,
case 4096:
if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
check_cuda_error(cudaFuncSetAttribute(
batch_polynomial_mul<FFTDegree<AmortizedDegree<4096>, ForwardFFT>,
FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
check_cuda_error(cudaFuncSetCacheConfig(
batch_polynomial_mul<FFTDegree<AmortizedDegree<4096>, ForwardFFT>,
FULLSM>,
cudaFuncCachePreferShared));
batch_polynomial_mul<FFTDegree<AmortizedDegree<4096>, ForwardFFT>, FULLSM>
<<<gridSize, blockSize, shared_memory_size, stream>>>(input1, input2,
output, buffer);
@@ -292,6 +374,14 @@ void cuda_fourier_polynomial_mul(cudaStream_t stream, uint32_t gpu_index,
case 8192:
if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
check_cuda_error(cudaFuncSetAttribute(
batch_polynomial_mul<FFTDegree<AmortizedDegree<8192>, ForwardFFT>,
FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
check_cuda_error(cudaFuncSetCacheConfig(
batch_polynomial_mul<FFTDegree<AmortizedDegree<8192>, ForwardFFT>,
FULLSM>,
cudaFuncCachePreferShared));
batch_polynomial_mul<FFTDegree<AmortizedDegree<8192>, ForwardFFT>, FULLSM>
<<<gridSize, blockSize, shared_memory_size, stream>>>(input1, input2,
output, buffer);
@@ -305,6 +395,14 @@ void cuda_fourier_polynomial_mul(cudaStream_t stream, uint32_t gpu_index,
case 16384:
if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
check_cuda_error(cudaFuncSetAttribute(
batch_polynomial_mul<FFTDegree<AmortizedDegree<16384>, ForwardFFT>,
FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
check_cuda_error(cudaFuncSetCacheConfig(
batch_polynomial_mul<FFTDegree<AmortizedDegree<16384>, ForwardFFT>,
FULLSM>,
cudaFuncCachePreferShared));
batch_polynomial_mul<FFTDegree<AmortizedDegree<16384>, ForwardFFT>,
FULLSM>
<<<gridSize, blockSize, shared_memory_size, stream>>>(input1, input2,
@@ -258,6 +258,28 @@ __host__ void scratch_programmable_bootstrap_amortized(
uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) {

uint64_t full_sm =
get_buffer_size_full_sm_programmable_bootstrap_amortized<Torus>(
polynomial_size, glwe_dimension);
uint64_t partial_sm =
get_buffer_size_partial_sm_programmable_bootstrap_amortized<Torus>(
polynomial_size);
int max_shared_memory = cuda_get_max_shared_memory(0);
if (max_shared_memory >= partial_sm && max_shared_memory < full_sm) {
cudaFuncSetAttribute(
device_programmable_bootstrap_amortized<Torus, params, PARTIALSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, partial_sm);
cudaFuncSetCacheConfig(
device_programmable_bootstrap_amortized<Torus, params, PARTIALSM>,
cudaFuncCachePreferShared);
} else if (max_shared_memory >= partial_sm) {
check_cuda_error(cudaFuncSetAttribute(
device_programmable_bootstrap_amortized<Torus, params, FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm));
check_cuda_error(cudaFuncSetCacheConfig(
device_programmable_bootstrap_amortized<Torus, params, FULLSM>,
cudaFuncCachePreferShared));
}
if (allocate_gpu_memory) {
uint64_t buffer_size =
get_buffer_size_programmable_bootstrap_amortized<Torus>(
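Here and in the scratch functions below, the variant choice follows the same ladder: FULLSM when the whole working set fits under the device cap, PARTIALSM when only the reduced buffer does, NOSM otherwise. A self-contained sketch of that selection, with hypothetical names not taken from this codebase:

// Hypothetical sketch of the shared-memory variant ladder used by the
// scratch functions in this commit.
enum class SmVariant { NOSM, PARTIALSM, FULLSM };

inline SmVariant choose_sm_variant(uint64_t max_shared_memory,
                                   uint64_t partial_sm, uint64_t full_sm) {
  if (max_shared_memory >= full_sm)
    return SmVariant::FULLSM;    // everything fits in shared memory
  if (max_shared_memory >= partial_sm)
    return SmVariant::PARTIALSM; // only the reduced buffer fits
  return SmVariant::NOSM;        // fall back to global memory
}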
@@ -194,6 +194,30 @@ __host__ void scratch_programmable_bootstrap_cg(
uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) {

uint64_t full_sm =
get_buffer_size_full_sm_programmable_bootstrap_cg<Torus>(polynomial_size);
uint64_t partial_sm =
get_buffer_size_partial_sm_programmable_bootstrap_cg<Torus>(
polynomial_size);
int max_shared_memory = cuda_get_max_shared_memory(0);
if (max_shared_memory >= partial_sm && max_shared_memory < full_sm) {
check_cuda_error(cudaFuncSetAttribute(
device_programmable_bootstrap_cg<Torus, params, PARTIALSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, partial_sm));
cudaFuncSetCacheConfig(
device_programmable_bootstrap_cg<Torus, params, PARTIALSM>,
cudaFuncCachePreferShared);
check_cuda_error(cudaGetLastError());
} else if (max_shared_memory >= partial_sm) {
check_cuda_error(cudaFuncSetAttribute(
device_programmable_bootstrap_cg<Torus, params, FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm));
cudaFuncSetCacheConfig(
device_programmable_bootstrap_cg<Torus, params, FULLSM>,
cudaFuncCachePreferShared);
check_cuda_error(cudaGetLastError());
}

*buffer = new pbs_buffer<Torus, CLASSICAL>(
stream, gpu_index, glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, PBS_VARIANT::CG, allocate_gpu_memory);
@@ -213,6 +213,69 @@ __host__ void scratch_cg_multi_bit_programmable_bootstrap(
uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) {

uint64_t full_sm_keybundle =
get_buffer_size_full_sm_multibit_programmable_bootstrap_keybundle<Torus>(
polynomial_size);
uint64_t full_sm_cg_accumulate =
get_buffer_size_full_sm_cg_multibit_programmable_bootstrap<Torus>(
polynomial_size);
uint64_t partial_sm_cg_accumulate =
get_buffer_size_partial_sm_cg_multibit_programmable_bootstrap<Torus>(
polynomial_size);

int max_shared_memory = cuda_get_max_shared_memory(0);
if (max_shared_memory < full_sm_keybundle) {
check_cuda_error(cudaFuncSetAttribute(
device_multi_bit_programmable_bootstrap_keybundle<Torus, params, NOSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, 0));
cudaFuncSetCacheConfig(
device_multi_bit_programmable_bootstrap_keybundle<Torus, params, NOSM>,
cudaFuncCachePreferShared);
check_cuda_error(cudaGetLastError());
} else {
check_cuda_error(cudaFuncSetAttribute(
device_multi_bit_programmable_bootstrap_keybundle<Torus, params,
FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm_keybundle));
cudaFuncSetCacheConfig(
device_multi_bit_programmable_bootstrap_keybundle<Torus, params,
FULLSM>,
cudaFuncCachePreferShared);
check_cuda_error(cudaGetLastError());
}

if (max_shared_memory < partial_sm_cg_accumulate) {
check_cuda_error(cudaFuncSetAttribute(
device_multi_bit_programmable_bootstrap_cg_accumulate<Torus, params,
NOSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, 0));
cudaFuncSetCacheConfig(
device_multi_bit_programmable_bootstrap_cg_accumulate<Torus, params,
NOSM>,
cudaFuncCachePreferShared);
check_cuda_error(cudaGetLastError());
} else if (max_shared_memory < full_sm_cg_accumulate) {
check_cuda_error(cudaFuncSetAttribute(
device_multi_bit_programmable_bootstrap_cg_accumulate<Torus, params,
PARTIALSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, partial_sm_cg_accumulate));
cudaFuncSetCacheConfig(
device_multi_bit_programmable_bootstrap_cg_accumulate<Torus, params,
PARTIALSM>,
cudaFuncCachePreferShared);
check_cuda_error(cudaGetLastError());
} else {
check_cuda_error(cudaFuncSetAttribute(
device_multi_bit_programmable_bootstrap_cg_accumulate<Torus, params,
FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm_cg_accumulate));
cudaFuncSetCacheConfig(
device_multi_bit_programmable_bootstrap_cg_accumulate<Torus, params,
FULLSM>,
cudaFuncCachePreferShared);
check_cuda_error(cudaGetLastError());
}

auto lwe_chunk_size = get_lwe_chunk_size<Torus, params>(
gpu_index, input_lwe_ciphertext_count, polynomial_size);
*buffer = new pbs_buffer<Torus, MULTI_BIT>(
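Note that the keybundle kernel only has NOSM and FULLSM variants, while the cg-accumulate kernel uses all three tiers. In terms of the choose_sm_variant sketch above, the accumulate branching amounts to the following hypothetical usage; the names mirror the local variables in this hunk:

// Hypothetical usage of the choose_sm_variant sketch for the accumulate
// kernel's three-way branch above.
SmVariant v = choose_sm_variant(cuda_get_max_shared_memory(0),
                                partial_sm_cg_accumulate,
                                full_sm_cg_accumulate);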