diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
index 9e7639e355..00a633fbbf 100644
--- a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
@@ -22,18 +22,20 @@ template <typename Torus>
 __global__ void radix_blocks_rotate_right(Torus *dst, Torus *src,
                                           uint32_t value, uint32_t blocks_count,
                                           uint32_t lwe_size) {
-  value %= blocks_count;
-  size_t tid = threadIdx.x;
-  size_t src_block_id = blockIdx.x;
-  size_t dst_block_id = (src_block_id + value) % blocks_count;
-  size_t stride = blockDim.x;
+  size_t tid = threadIdx.x;
+  if (tid < lwe_size) {
+    value %= blocks_count;
+    size_t src_block_id = blockIdx.x;
+    size_t dst_block_id = (src_block_id + value) % blocks_count;
+    size_t stride = blockDim.x;

-  auto cur_src_block = &src[src_block_id * lwe_size];
-  auto cur_dst_block = &dst[dst_block_id * lwe_size];
+    auto cur_src_block = &src[src_block_id * lwe_size];
+    auto cur_dst_block = &dst[dst_block_id * lwe_size];

-  for (size_t i = tid; i < lwe_size; i += stride) {
-    cur_dst_block[i] = cur_src_block[i];
+    for (size_t i = tid; i < lwe_size; i += stride) {
+      cur_dst_block[i] = cur_src_block[i];
+    }
   }
 }

@@ -44,25 +45,29 @@ template <typename Torus>
 __global__ void radix_blocks_rotate_left(Torus *dst, Torus *src,
                                          uint32_t value, uint32_t blocks_count,
                                          uint32_t lwe_size) {
-  value %= blocks_count;
-  size_t src_block_id = blockIdx.x;
-  size_t tid = threadIdx.x;
-  size_t dst_block_id = (src_block_id >= value)
-                            ? src_block_id - value
-                            : src_block_id - value + blocks_count;
-  size_t stride = blockDim.x;
+  size_t tid = threadIdx.x;
+  if (tid < lwe_size) {
+    value %= blocks_count;
+    size_t src_block_id = blockIdx.x;

-  auto cur_src_block = &src[src_block_id * lwe_size];
-  auto cur_dst_block = &dst[dst_block_id * lwe_size];
+    size_t dst_block_id = (src_block_id >= value)
+                              ? src_block_id - value
+                              : src_block_id - value + blocks_count;
+    size_t stride = blockDim.x;

-  for (size_t i = tid; i < lwe_size; i += stride) {
-    cur_dst_block[i] = cur_src_block[i];
+    auto cur_src_block = &src[src_block_id * lwe_size];
+    auto cur_dst_block = &dst[dst_block_id * lwe_size];
+
+    for (size_t i = tid; i < lwe_size; i += stride) {
+      cur_dst_block[i] = cur_src_block[i];
+    }
   }
 }

 // rotate radix ciphertext right with specific value
 // calculation is not inplace, so `dst` and `src` must not be the same
+// one block is responsible to process single lwe ciphertext
 template <typename Torus>
 __host__ void host_radix_blocks_rotate_right(cudaStream_t *streams,
                                              uint32_t *gpu_indexes,
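Both rotate kernels keep the launch geometry spelled out by the new comment on host_radix_blocks_rotate_right: one CUDA block per LWE ciphertext (radix block), with that block's threads doing a strided copy over the lwe_size coefficients, and the added `tid < lwe_size` guard only makes threads past the ciphertext length idle. The following standalone sketch shows the same mapping on plain uint64_t buffers; the name rotate_right_demo, the toy sizes, and the 256-thread launch are illustrative assumptions, not code from this patch.

// Standalone sketch (not part of the patch): same block-per-ciphertext mapping
// as radix_blocks_rotate_right, on plain uint64_t buffers.
#include <cstdint>
#include <cstdio>
#include <cuda_runtime.h>

__global__ void rotate_right_demo(uint64_t *dst, const uint64_t *src,
                                  uint32_t value, uint32_t blocks_count,
                                  uint32_t lwe_size) {
  size_t tid = threadIdx.x;
  if (tid < lwe_size) {              // threads beyond the ciphertext length do nothing
    value %= blocks_count;
    size_t src_block_id = blockIdx.x;                        // one CUDA block per LWE ciphertext
    size_t dst_block_id = (src_block_id + value) % blocks_count;
    for (size_t i = tid; i < lwe_size; i += blockDim.x)      // strided copy of one ciphertext
      dst[dst_block_id * lwe_size + i] = src[src_block_id * lwe_size + i];
  }
}

int main() {
  constexpr uint32_t blocks_count = 4, lwe_size = 8, value = 1;
  uint64_t h_src[blocks_count * lwe_size], h_dst[blocks_count * lwe_size];
  for (uint32_t i = 0; i < blocks_count * lwe_size; ++i)
    h_src[i] = i / lwe_size;                                 // fill each block with its own index
  uint64_t *d_src, *d_dst;
  cudaMalloc(&d_src, sizeof h_src);
  cudaMalloc(&d_dst, sizeof h_dst);
  cudaMemcpy(d_src, h_src, sizeof h_src, cudaMemcpyHostToDevice);
  // grid: one CUDA block per radix block; the thread count may exceed lwe_size
  rotate_right_demo<<<blocks_count, 256>>>(d_dst, d_src, value, blocks_count, lwe_size);
  cudaMemcpy(h_dst, d_dst, sizeof h_dst, cudaMemcpyDeviceToHost);
  for (uint32_t b = 0; b < blocks_count; ++b)                // prints 3, 0, 1, 2 for value = 1
    printf("dst block %u holds src block %llu\n", b, (unsigned long long)h_dst[b * lwe_size]);
  cudaFree(d_src);
  cudaFree(d_dst);
  return 0;
}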
diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh
index fc81a409c5..1165f132ee 100644
--- a/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh
@@ -93,15 +93,11 @@ all_shifted_lhs_rhs(Torus *radix_lwe_left, Torus *lsb_ciphertext,
   }
 }

-template <typename Torus, sharedMemDegree SMD>
+template <typename Torus>
 __global__ void tree_add_chunks(Torus *result_blocks, Torus *input_blocks,
                                 uint32_t chunk_size, uint32_t block_size,
                                 uint32_t num_blocks) {

-  extern __shared__ int8_t sharedmem[];
-
-  Torus *result = (Torus *)sharedmem;
-
   size_t stride = blockDim.x;
   size_t chunk_id = blockIdx.x;
   size_t chunk_elem_size = chunk_size * num_blocks * block_size;
@@ -109,10 +105,7 @@ __global__ void tree_add_chunks(Torus *result_blocks, Torus *input_blocks,
   auto src_chunk = &input_blocks[chunk_id * chunk_elem_size];
   auto dst_radix = &result_blocks[chunk_id * radix_elem_size];
   size_t block_stride = blockIdx.y * block_size;
-  auto dst_block = &dst_radix[block_stride];
-
-  if constexpr (SMD == NOSM)
-    result = dst_block;
+  auto result = &dst_radix[block_stride];

   // init shared mem with first radix of chunk
   size_t tid = threadIdx.x;
@@ -127,11 +120,6 @@ __global__ void tree_add_chunks(Torus *result_blocks, Torus *input_blocks,
       result[i] += cur_src_radix[block_stride + i];
     }
   }
-
-  // put result from shared mem to global mem
-  if constexpr (SMD == FULLSM)
-    for (int i = tid; i < block_size; i += stride)
-      dst_block[i] = result[i];
 }

 template <typename Torus>
@@ -281,12 +269,8 @@ __host__ void host_integer_sum_ciphertexts_vec_kb(

   size_t sm_size = big_lwe_size * sizeof(Torus);
   cudaSetDevice(gpu_indexes[0]);
-  if (sm_size < max_shared_memory)
-    tree_add_chunks<Torus, FULLSM><<<add_grid, 512, sm_size, streams[0]>>>(
-        new_blocks, old_blocks, min(r, chunk_size), big_lwe_size, num_blocks);
-  else
-    tree_add_chunks<Torus, NOSM><<<add_grid, 512, 0, streams[0]>>>(
-        new_blocks, old_blocks, min(r, chunk_size), big_lwe_size, num_blocks);
+  tree_add_chunks<Torus><<<add_grid, 512, 0, streams[0]>>>(
+      new_blocks, old_blocks, min(r, chunk_size), big_lwe_size, num_blocks);

   check_cuda_error(cudaGetLastError());

@@ -299,7 +283,7 @@ __host__ void host_integer_sum_ciphertexts_vec_kb(
       terms_degree, h_lwe_idx_in, h_lwe_idx_out, h_smart_copy_in,
       h_smart_copy_out, ch_amount, r, num_blocks, chunk_size, message_max,
       total_count, message_count, carry_count, sm_copy_count);
-
+  cuda_synchronize_stream(streams[0], gpu_indexes[0]);
   auto lwe_indexes_in = luts_message_carry->lwe_indexes_in;
   auto lwe_indexes_out = luts_message_carry->lwe_indexes_out;
   luts_message_carry->set_lwe_indexes(streams[0], gpu_indexes[0],
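With the shared-memory specialization removed, tree_add_chunks writes each output block directly: blockIdx.x selects the chunk, blockIdx.y selects the output block, and that block's threads sweep the block_size coefficients, starting from the chunk's first radix and accumulating the remaining ones. Below is a self-contained sketch of that access pattern on plain uint64_t data; add_chunks_demo, the toy dimensions, and the 128-thread launch are assumptions made for illustration. A per-thread register accumulator stands in for the old shared-memory staging buffer; since exactly one thread owns each output coefficient, no synchronization is needed.

// Standalone sketch (not part of the patch): chunk accumulation without shared
// memory. blockIdx.x selects the chunk, blockIdx.y the output block, and the
// chunk's radix blocks are summed straight into global memory.
#include <cstdint>
#include <cstdio>
#include <cuda_runtime.h>

__global__ void add_chunks_demo(uint64_t *result_blocks, const uint64_t *input_blocks,
                                uint32_t chunk_size, uint32_t block_size,
                                uint32_t num_blocks) {
  size_t chunk_id = blockIdx.x;
  size_t radix_elem_size = (size_t)num_blocks * block_size;
  size_t chunk_elem_size = (size_t)chunk_size * radix_elem_size;
  const uint64_t *src_chunk = &input_blocks[chunk_id * chunk_elem_size];
  uint64_t *result = &result_blocks[chunk_id * radix_elem_size + blockIdx.y * block_size];

  // initialise with the chunk's first radix, then accumulate the others
  for (size_t i = threadIdx.x; i < block_size; i += blockDim.x) {
    uint64_t acc = src_chunk[blockIdx.y * block_size + i];
    for (uint32_t r = 1; r < chunk_size; r++)
      acc += src_chunk[r * radix_elem_size + blockIdx.y * block_size + i];
    result[i] = acc;
  }
}

int main() {
  constexpr uint32_t chunks = 2, chunk_size = 3, num_blocks = 4, block_size = 8;
  constexpr size_t in_len = (size_t)chunks * chunk_size * num_blocks * block_size;
  constexpr size_t out_len = (size_t)chunks * num_blocks * block_size;
  uint64_t h_in[in_len], h_out[out_len];
  for (size_t i = 0; i < in_len; ++i)
    h_in[i] = 1;                                      // every coefficient is 1
  uint64_t *d_in, *d_out;
  cudaMalloc(&d_in, sizeof h_in);
  cudaMalloc(&d_out, sizeof h_out);
  cudaMemcpy(d_in, h_in, sizeof h_in, cudaMemcpyHostToDevice);
  dim3 grid(chunks, num_blocks, 1);                   // (chunk, output block) per CUDA block
  add_chunks_demo<<<grid, 128>>>(d_out, d_in, chunk_size, block_size, num_blocks);
  cudaMemcpy(h_out, d_out, sizeof h_out, cudaMemcpyDeviceToHost);
  printf("each output coefficient = %llu (expected %u)\n",
         (unsigned long long)h_out[0], chunk_size);
  cudaFree(d_in);
  cudaFree(d_out);
  return 0;
}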
diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_shifts.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_shifts.cuh
index 47eea7b426..1d5fa20a15 100644
--- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_shifts.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_shifts.cuh
@@ -52,11 +52,6 @@ __host__ void host_integer_radix_logical_scalar_shift_kb_inplace(
   Torus *full_rotated_buffer = mem->tmp_rotated;
   Torus *rotated_buffer = &full_rotated_buffer[big_lwe_size];

-  // rotate right all the blocks in radix ciphertext
-  // copy result in new buffer
-  // 1024 threads are used in every block
-  // block_count blocks will be used in the grid
-  // one block is responsible to process single lwe ciphertext
   if (mem->shift_type == LEFT_SHIFT) {
     // rotate right as the blocks are from LSB to MSB
     host_radix_blocks_rotate_right(streams, gpu_indexes, gpu_count,
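The deleted comment block repeated launch details at the call site; the one point worth keeping, that a single CUDA block processes a single LWE ciphertext, now sits on host_radix_blocks_rotate_right itself (see the integer.cuh hunk above). The remaining comment captures the less obvious part: because the radix blocks are stored LSB to MSB, a logical left shift of the integer starts with a rotate-right of the block array. The host-only sketch below walks through that index arithmetic on plain integers; the sample values and the explicit zero fill are illustrative, and in the actual routine the vacated low blocks are cleared by code outside this hunk.

// Host-only sketch (not from the patch) of why a logical left shift on an
// LSB-to-MSB radix layout begins with a rotate-right of the blocks: after the
// rotation, block i holds what block i - shift held, and the vacated low
// blocks are then cleared (zeros here; blocks encrypting zero in the ciphertext case).
#include <algorithm>
#include <cstdio>
#include <vector>

int main() {
  // four radix blocks, least significant first: value = 1 + 2*B + 3*B^2 + 4*B^3
  std::vector<int> blocks = {1, 2, 3, 4};
  size_t shift = 1;

  // rotate right by `shift`: dst[(i + shift) % n] = src[i], as in radix_blocks_rotate_right
  std::vector<int> rotated(blocks.size());
  for (size_t i = 0; i < blocks.size(); ++i)
    rotated[(i + shift) % blocks.size()] = blocks[i];

  // clear the low blocks that the shift vacated
  std::fill(rotated.begin(), rotated.begin() + shift, 0);

  for (int b : rotated)
    printf("%d ", b);   // prints: 0 1 2 3  -> the original value multiplied by B^shift
  printf("\n");
  return 0;
}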