fix(gpu): implements a NOSM mode to tree_add_chunks()
pdroalves committed Apr 24, 2024
1 parent e28723e commit b34e148
Showing 3 changed files with 69 additions and 22 deletions.
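
This commit templates tree_add_chunks() on a shared-memory degree: FULLSM keeps the existing behavior of staging each block's accumulator in dynamic shared memory, while the new NOSM mode accumulates directly in global memory for devices where the required buffer does not fit. As a reading aid, a sketch of the selector — its definition lives elsewhere in the backend, so this is an assumption consistent with the two members the diff uses:

// Assumed shape of the shared-memory selector; only the FULLSM and NOSM
// members appear in this diff.
enum sharedMemDegree {
  NOSM,  // no shared memory: accumulate directly in global memory
  FULLSM // full shared memory: stage the accumulator in dynamic shared memory
};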
3 changes: 3 additions & 0 deletions backends/tfhe-cuda-backend/cuda/include/integer.h
@@ -2226,6 +2226,9 @@ template <typename Torus> struct int_scalar_mul_buffer {
all_shifted_buffer = (Torus *)cuda_malloc_async(
num_ciphertext_bits * num_radix_blocks * lwe_size_bytes, stream);

printf("preshift_buffer: %d bytes, all_shifted_bufferd: %d bytes\n",
num_ciphertext_bits * lwe_size_bytes,
num_ciphertext_bits * num_radix_blocks * lwe_size_bytes);
cuda_memset_async(preshifted_buffer, 0,
num_ciphertext_bits * lwe_size_bytes, stream);

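One nit on the new trace above: the printed sizes are products involving lwe_size_bytes, which is typically a size_t expression in this backend; if so, %d is the wrong conversion specifier. A hedged corrected form, assuming size_t operands:

printf("preshifted_buffer: %zu bytes, all_shifted_buffer: %zu bytes\n",
       (size_t)num_ciphertext_bits * lwe_size_bytes,
       (size_t)num_ciphertext_bits * num_radix_blocks * lwe_size_bytes);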
69 changes: 52 additions & 17 deletions backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh
@@ -91,12 +91,15 @@ all_shifted_lhs_rhs(Torus *radix_lwe_left, Torus *lsb_ciphertext,
}
}

template <typename Torus>
template <typename Torus, sharedMemDegree SMD>
__global__ void tree_add_chunks(Torus *result_blocks, Torus *input_blocks,
uint32_t chunk_size, uint32_t block_size,
uint32_t num_blocks) {

extern __shared__ Torus result[];
extern __shared__ int8_t sharedmem[];

Torus *result = (Torus *)sharedmem;

size_t stride = blockDim.x;
size_t chunk_id = blockIdx.x;
size_t chunk_elem_size = chunk_size * num_blocks * block_size;
@@ -106,6 +109,9 @@ __global__ void tree_add_chunks(Torus *result_blocks, Torus *input_blocks,
size_t block_stride = blockIdx.y * block_size;
auto dst_block = &dst_radix[block_stride];

if constexpr (SMD == NOSM)
result = dst_block;
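// in NOSM mode `result` aliases `dst_block`, so the additions below land
// directly in global memory and the final copy-out is compiled away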

// init shared mem with first radix of chunk
size_t tid = threadIdx.x;
for (int i = tid; i < block_size; i += stride) {
@@ -121,9 +127,9 @@ __global__ void tree_add_chunks(Torus *result_blocks, Torus *input_blocks,
}

// put result from shared mem to global mem
for (int i = tid; i < block_size; i += stride) {
dst_block[i] = result[i];
}
if constexpr (SMD == FULLSM)
for (int i = tid; i < block_size; i += stride)
dst_block[i] = result[i];
}

template <typename Torus, class params>
@@ -181,11 +187,20 @@ __host__ void scratch_cuda_integer_sum_ciphertexts_vec_kb(

cudaSetDevice(stream->gpu_index);
size_t sm_size = (params.big_lwe_dimension + 1) * sizeof(Torus);
check_cuda_error(cudaFuncSetAttribute(
tree_add_chunks<Torus>, cudaFuncAttributeMaxDynamicSharedMemorySize,
sm_size));
cudaFuncSetCacheConfig(tree_add_chunks<Torus>, cudaFuncCachePreferShared);
check_cuda_error(cudaGetLastError());
if (sm_size < cuda_get_max_shared_memory(stream->gpu_index)) {
check_cuda_error(cudaFuncSetAttribute(
tree_add_chunks<Torus, FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, sm_size));
cudaFuncSetCacheConfig(tree_add_chunks<Torus, FULLSM>,
cudaFuncCachePreferShared);
check_cuda_error(cudaGetLastError());
} else {
check_cuda_error(
cudaFuncSetAttribute(tree_add_chunks<Torus, NOSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, 0));
cudaFuncSetCacheConfig(tree_add_chunks<Torus, NOSM>, cudaFuncCachePreferL1);
check_cuda_error(cudaGetLastError());
}
*mem_ptr = new int_sum_ciphertexts_vec_memory<Torus>(
stream, params, num_blocks_in_radix, max_num_radix_in_vec,
allocate_gpu_memory);
@@ -242,8 +257,16 @@ __host__ void host_integer_sum_ciphertexts_vec_kb(
dim3 add_grid(ch_amount, num_blocks, 1);
size_t sm_size = big_lwe_size * sizeof(Torus);

tree_add_chunks<Torus><<<add_grid, 512, sm_size, stream->stream>>>(
new_blocks, old_blocks, min(r, chunk_size), big_lwe_size, num_blocks);
if (sm_size < max_shared_memory)
tree_add_chunks<Torus, FULLSM>
<<<add_grid, 512, sm_size, stream->stream>>>(
new_blocks, old_blocks, min(r, chunk_size), big_lwe_size,
num_blocks);
else
tree_add_chunks<Torus, NOSM><<<add_grid, 512, 0, stream->stream>>>(
new_blocks, old_blocks, min(r, chunk_size), big_lwe_size, num_blocks);

check_cuda_error(cudaGetLastError());

size_t total_count = 0;
size_t message_count = 0;
@@ -295,6 +318,7 @@ __host__ void host_integer_sum_ciphertexts_vec_kb(
smart_copy<<<sm_copy_count, 256, 0, stream->stream>>>(
new_blocks, new_blocks, d_smart_copy_out, d_smart_copy_in,
big_lwe_size);
check_cuda_error(cudaGetLastError());

if (carry_count > 0)
cuda_set_value_async<Torus>(
@@ -411,6 +435,7 @@ __host__ void host_integer_mult_radix_kb(
all_shifted_lhs_rhs<Torus, params><<<grid, thds, 0, stream->stream>>>(
radix_lwe_left, vector_result_lsb, vector_result_msb, radix_lwe_right,
vector_lsb_rhs, vector_msb_rhs, num_blocks);
check_cuda_error(cudaGetLastError());

integer_radix_apply_bivariate_lookup_table_kb<Torus>(
stream, block_mul_res, block_mul_res, vector_result_sb, bsk, ksk,
@@ -426,6 +451,7 @@ __host__ void host_integer_mult_radix_kb(
vector_result_msb, glwe_dimension,
lsb_vector_block_count, msb_vector_block_count,
num_blocks);
check_cuda_error(cudaGetLastError());

int terms_degree[2 * num_blocks * num_blocks];
for (int i = 0; i < num_blocks * num_blocks; i++) {
@@ -452,11 +478,20 @@ __host__ void scratch_cuda_integer_mult_radix_ciphertext_kb(
bool allocate_gpu_memory) {
cudaSetDevice(stream->gpu_index);
size_t sm_size = (params.big_lwe_dimension + 1) * sizeof(Torus);
check_cuda_error(cudaFuncSetAttribute(
tree_add_chunks<Torus>, cudaFuncAttributeMaxDynamicSharedMemorySize,
sm_size));
cudaFuncSetCacheConfig(tree_add_chunks<Torus>, cudaFuncCachePreferShared);
check_cuda_error(cudaGetLastError());
if (sm_size < cuda_get_max_shared_memory(stream->gpu_index)) {
check_cuda_error(cudaFuncSetAttribute(
tree_add_chunks<Torus, FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, sm_size));
cudaFuncSetCacheConfig(tree_add_chunks<Torus, FULLSM>,
cudaFuncCachePreferShared);
check_cuda_error(cudaGetLastError());
} else {
check_cuda_error(
cudaFuncSetAttribute(tree_add_chunks<Torus, NOSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, 0));
cudaFuncSetCacheConfig(tree_add_chunks<Torus, NOSM>, cudaFuncCachePreferL1);
check_cuda_error(cudaGetLastError());
}

*mem_ptr = new int_mul_memory<Torus>(stream, params, num_radix_blocks,
allocate_gpu_memory);
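For reference, here is the FULLSM/NOSM kernel pattern in isolation: a self-contained toy analogue of tree_add_chunks (not part of this commit), assuming the sharedMemDegree enum sketched above. Each block sums chunk_size vectors of length len; under FULLSM the running sum sits in dynamic shared memory, under NOSM the accumulator pointer is simply aliased to the destination in global memory, so one loop body serves both modes.

#include <cstdint>

enum sharedMemDegree { NOSM, FULLSM }; // assumed, as above

template <typename T, sharedMemDegree SMD>
__global__ void chunk_sum(T *out, const T *in, int chunk_size, int len) {
  extern __shared__ int8_t sharedmem[];
  T *acc = (T *)sharedmem;

  T *dst = &out[(size_t)blockIdx.x * len];
  if constexpr (SMD == NOSM)
    acc = dst; // no shared memory: accumulate in place in global memory

  const T *chunk = &in[(size_t)blockIdx.x * chunk_size * len];

  // each thread owns the indices i = tid, tid + blockDim.x, ..., so no
  // synchronization is needed between iterations
  for (int i = threadIdx.x; i < len; i += blockDim.x)
    acc[i] = chunk[i];
  for (int v = 1; v < chunk_size; v++)
    for (int i = threadIdx.x; i < len; i += blockDim.x)
      acc[i] += chunk[(size_t)v * len + i];

  // FULLSM only: flush the shared-memory accumulator to global memory
  if constexpr (SMD == FULLSM)
    for (int i = threadIdx.x; i < len; i += blockDim.x)
      dst[i] = acc[i];
}

// Launch with dynamic shared memory only in FULLSM mode, mirroring the
// dispatch in host_integer_sum_ciphertexts_vec_kb:
//   chunk_sum<uint64_t, FULLSM><<<blocks, 256, len * sizeof(uint64_t)>>>(...);
//   chunk_sum<uint64_t, NOSM><<<blocks, 256, 0>>>(...);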
19 changes: 14 additions & 5 deletions backends/tfhe-cuda-backend/cuda/src/integer/scalar_mul.cuh
@@ -35,11 +35,20 @@ __host__ void scratch_cuda_integer_radix_scalar_mul_kb(

cudaSetDevice(stream->gpu_index);
size_t sm_size = (params.big_lwe_dimension + 1) * sizeof(T);
check_cuda_error(cudaFuncSetAttribute(
tree_add_chunks<T>, cudaFuncAttributeMaxDynamicSharedMemorySize,
sm_size));
cudaFuncSetCacheConfig(tree_add_chunks<T>, cudaFuncCachePreferShared);
check_cuda_error(cudaGetLastError());
if (sm_size < cuda_get_max_shared_memory(stream->gpu_index)) {
check_cuda_error(cudaFuncSetAttribute(
tree_add_chunks<T, FULLSM>, cudaFuncAttributeMaxDynamicSharedMemorySize,
sm_size));
cudaFuncSetCacheConfig(tree_add_chunks<T, FULLSM>,
cudaFuncCachePreferShared);
check_cuda_error(cudaGetLastError());
} else {
check_cuda_error(
cudaFuncSetAttribute(tree_add_chunks<T, NOSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, 0));
cudaFuncSetCacheConfig(tree_add_chunks<T, NOSM>, cudaFuncCachePreferL1);
check_cuda_error(cudaGetLastError());
}

*mem_ptr = new int_scalar_mul_buffer<T>(stream, params, num_radix_blocks,
allocate_gpu_memory);
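The FULLSM-or-NOSM attribute setup above now appears verbatim in three scratch functions (sum_ciphertexts_vec, mult_radix, scalar_mul). A minimal consolidation sketch — hypothetical, not part of this commit — reusing cuda_get_max_shared_memory and check_cuda_error from the surrounding code; both template instantiations share a signature, so one function-pointer type covers them:

// Hypothetical helper (not in this commit): configure the kernel pair
// templated on sharedMemDegree, given the dynamic shared memory the FULLSM
// variant needs on this GPU.
template <typename KernelT>
void configure_tree_add_chunks(KernelT *kernel_fullsm, KernelT *kernel_nosm,
                               size_t sm_size, uint32_t gpu_index) {
  if (sm_size < cuda_get_max_shared_memory(gpu_index)) {
    // FULLSM fits: opt in to the required dynamic shared memory and prefer
    // the shared-memory-heavy cache split
    check_cuda_error(cudaFuncSetAttribute(
        kernel_fullsm, cudaFuncAttributeMaxDynamicSharedMemorySize, sm_size));
    check_cuda_error(
        cudaFuncSetCacheConfig(kernel_fullsm, cudaFuncCachePreferShared));
  } else {
    // fall back to NOSM: no dynamic shared memory, prefer L1 instead
    check_cuda_error(cudaFuncSetAttribute(
        kernel_nosm, cudaFuncAttributeMaxDynamicSharedMemorySize, 0));
    check_cuda_error(
        cudaFuncSetCacheConfig(kernel_nosm, cudaFuncCachePreferL1));
  }
}

// e.g. in scratch_cuda_integer_radix_scalar_mul_kb:
//   configure_tree_add_chunks(tree_add_chunks<T, FULLSM>,
//                             tree_add_chunks<T, NOSM>, sm_size,
//                             stream->gpu_index);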
