fix(gpu): fix vec with device other than 0
agnesLeroy committed Nov 27, 2024
1 parent 9758167 commit 6e36e67
Showing 9 changed files with 228 additions and 308 deletions.
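What this commit fixes: `CudaVec` and `CudaStreams` keep one raw pointer per GPU in plain vectors indexed by *position* (0..n-1), while `gpu_indexes` records the actual CUDA *device ordinals* the caller selected. The old loops used the device ordinal for both jobs, which only works when the selected devices happen to be `[0, 1, ...]`; with any other device set (e.g. only device 1) the vector access goes out of bounds. A minimal sketch of the failure mode and of the `enumerate()` fix applied throughout the diff — `DeviceVec` is a hypothetical stand-in for `CudaVec`, not the real type:

```rust
// Simplified model of the bug fixed here; `DeviceVec` stands in for CudaVec.
struct DeviceVec {
    ptr: Vec<usize>,       // one (mock) device pointer per GPU, indexed by position
    gpu_indexes: Vec<u32>, // actual CUDA device ordinals, e.g. [1] or [2, 3]
}

impl DeviceVec {
    #[allow(dead_code)]
    fn clone_buggy(&self) {
        for &gpu_index in self.gpu_indexes.iter() {
            // BUG: a device ordinal is used as a vector position. With
            // gpu_indexes = [1], self.ptr has length 1 and this panics.
            let _src = self.ptr[gpu_index as usize];
        }
    }

    fn clone_fixed(&self) {
        for (index, &gpu_index) in self.gpu_indexes.iter().enumerate() {
            // Position `index` addresses the vectors; `gpu_index` is only
            // ever handed to CUDA calls that expect a device ordinal.
            let _src = self.ptr[index];
            let _device_ordinal = gpu_index;
        }
    }
}

fn main() {
    // One buffer, living on device 1: valid after the fix, a panic before it.
    let v = DeviceVec { ptr: vec![0xdead], gpu_indexes: vec![1] };
    v.clone_fixed();
    // v.clone_buggy(); // would panic: index out of bounds (1 >= len 1)
}
```

The same position-vs-ordinal split drives every hunk below: `get_lut`, `get_lut_indexes` and `broadcast_lut` switch to position arguments, and the Rust loops switch to `enumerate()`.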
@@ -102,13 +102,12 @@ template <typename Torus> struct int_decompression {
   };

   generate_device_accumulator<Torus>(
-      streams[0], gpu_indexes[0],
-      carry_extract_lut->get_lut(gpu_indexes[0], 0),
+      streams[0], gpu_indexes[0], carry_extract_lut->get_lut(0, 0),
       encryption_params.glwe_dimension, encryption_params.polynomial_size,
       encryption_params.message_modulus, encryption_params.carry_modulus,
       carry_extract_f);

-  carry_extract_lut->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]);
+  carry_extract_lut->broadcast_lut(streams, gpu_indexes, 0);
   }
 }
 void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
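Accordingly, the C++ LUT accessors change convention across the backend: the first argument of `get_lut`/`get_lut_indexes` and the last argument of `broadcast_lut` are now the GPU's *position* in the active list (`0` = the first listed GPU, whatever its device ordinal), not a device ordinal. A hedged Rust model of that convention — the real container is the C++ `int_radix_lut` in `integer_utilities.h`, so the names and fields here are illustrative only:

```rust
// Illustrative model: one LUT buffer per participating GPU, indexed by position.
struct LutSet {
    lut_vec: Vec<Vec<u64>>, // lut_vec[position] holds all LUTs for that GPU
    lut_size: usize,        // words per individual LUT
}

impl LutSet {
    // `position` indexes the active-GPU list; it is NOT a device ordinal.
    fn get_lut(&mut self, position: usize, lut_id: usize) -> &mut [u64] {
        let start = lut_id * self.lut_size;
        &mut self.lut_vec[position][start..start + self.lut_size]
    }

    // Mirror the LUTs generated on `src_position` (typically 0, the first GPU)
    // onto every other GPU's slot.
    fn broadcast_lut(&mut self, src_position: usize) {
        let src = self.lut_vec[src_position].clone();
        for (position, dst) in self.lut_vec.iter_mut().enumerate() {
            if position != src_position {
                dst.copy_from_slice(&src);
            }
        }
    }
}

fn main() {
    let mut luts = LutSet { lut_vec: vec![vec![0; 8]; 2], lut_size: 4 };
    luts.get_lut(0, 1).fill(7); // generate LUT #1 in the first GPU's slot
    luts.broadcast_lut(0);      // then copy the first GPU's buffers everywhere
    assert_eq!(luts.lut_vec[1][4..8], [7, 7, 7, 7]);
}
```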
342 changes: 150 additions & 192 deletions backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h

Large diffs are not rendered by default.

12 changes: 6 additions & 6 deletions backends/tfhe-cuda-backend/cuda/src/integer/comparison.cuh
@@ -125,11 +125,11 @@ __host__ void are_all_comparisons_block_true(
       return x == chunk_length;
     };
     generate_device_accumulator<Torus>(
-        streams[0], gpu_indexes[0], new_lut->get_lut(gpu_indexes[0], 0),
-        glwe_dimension, polynomial_size, message_modulus, carry_modulus,
+        streams[0], gpu_indexes[0], new_lut->get_lut(0, 0), glwe_dimension,
+        polynomial_size, message_modulus, carry_modulus,
         is_equal_to_num_blocks_lut_f);

-    new_lut->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]);
+    new_lut->broadcast_lut(streams, gpu_indexes, 0);

     (*is_equal_to_num_blocks_map)[chunk_length] = new_lut;
     lut = new_lut;
@@ -449,9 +449,9 @@ __host__ void tree_sign_reduction(
     f = sign_handler_f;
   }
   generate_device_accumulator<Torus>(
-      streams[0], gpu_indexes[0], last_lut->get_lut(gpu_indexes[0], 0),
-      glwe_dimension, polynomial_size, message_modulus, carry_modulus, f);
-  last_lut->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]);
+      streams[0], gpu_indexes[0], last_lut->get_lut(0, 0), glwe_dimension,
+      polynomial_size, message_modulus, carry_modulus, f);
+  last_lut->broadcast_lut(streams, gpu_indexes, 0);

   // Last leaf
   integer_radix_apply_univariate_lookup_table_kb<Torus>(
40 changes: 19 additions & 21 deletions backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
@@ -1463,10 +1463,10 @@ reduce_signs(cudaStream_t const *streams, uint32_t const *gpu_indexes,
   if (num_sign_blocks > 2) {
     auto lut = diff_buffer->reduce_signs_lut;
     generate_device_accumulator<Torus>(
-        streams[0], gpu_indexes[0], lut->get_lut(gpu_indexes[0], 0),
-        glwe_dimension, polynomial_size, message_modulus, carry_modulus,
+        streams[0], gpu_indexes[0], lut->get_lut(0, 0), glwe_dimension,
+        polynomial_size, message_modulus, carry_modulus,
         reduce_two_orderings_function);
-    lut->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]);
+    lut->broadcast_lut(streams, gpu_indexes, 0);

     while (num_sign_blocks > 2) {
       pack_blocks<Torus>(streams[0], gpu_indexes[0], signs_b, signs_a,
@@ -1497,10 +1497,9 @@ reduce_signs(cudaStream_t const *streams, uint32_t const *gpu_indexes,

     auto lut = diff_buffer->reduce_signs_lut;
     generate_device_accumulator<Torus>(
-        streams[0], gpu_indexes[0], lut->get_lut(gpu_indexes[0], 0),
-        glwe_dimension, polynomial_size, message_modulus, carry_modulus,
-        final_lut_f);
-    lut->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]);
+        streams[0], gpu_indexes[0], lut->get_lut(0, 0), glwe_dimension,
+        polynomial_size, message_modulus, carry_modulus, final_lut_f);
+    lut->broadcast_lut(streams, gpu_indexes, 0);

     pack_blocks<Torus>(streams[0], gpu_indexes[0], signs_b, signs_a,
                        big_lwe_dimension, 2, 4);
@@ -1517,10 +1516,9 @@ reduce_signs(cudaStream_t const *streams, uint32_t const *gpu_indexes,

     auto lut = mem_ptr->diff_buffer->reduce_signs_lut;
     generate_device_accumulator<Torus>(
-        streams[0], gpu_indexes[0], lut->get_lut(gpu_indexes[0], 0),
-        glwe_dimension, polynomial_size, message_modulus, carry_modulus,
-        final_lut_f);
-    lut->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]);
+        streams[0], gpu_indexes[0], lut->get_lut(0, 0), glwe_dimension,
+        polynomial_size, message_modulus, carry_modulus, final_lut_f);
+    lut->broadcast_lut(streams, gpu_indexes, 0);

     integer_radix_apply_univariate_lookup_table_kb<Torus>(
         streams, gpu_indexes, gpu_count, signs_array_out, signs_a, bsks, ksks,
@@ -1539,11 +1537,11 @@ void scratch_cuda_apply_univariate_lut_kb(
       1, num_radix_blocks, allocate_gpu_memory);
   // It is safe to do this copy on GPU 0, because all LUTs always reside on GPU
   // 0
-  cuda_memcpy_async_to_gpu(
-      (*mem_ptr)->get_lut(gpu_indexes[0], 0), (void *)input_lut,
-      (params.glwe_dimension + 1) * params.polynomial_size * sizeof(Torus),
-      streams[0], gpu_indexes[0]);
-  (*mem_ptr)->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]);
+  cuda_memcpy_async_to_gpu((*mem_ptr)->get_lut(0, 0), (void *)input_lut,
+                           (params.glwe_dimension + 1) *
+                               params.polynomial_size * sizeof(Torus),
+                           streams[0], gpu_indexes[0]);
+  (*mem_ptr)->broadcast_lut(streams, gpu_indexes, 0);
 }

 template <typename Torus>
@@ -1582,11 +1580,11 @@ void scratch_cuda_apply_bivariate_lut_kb(
       1, num_radix_blocks, allocate_gpu_memory);
   // It is safe to do this copy on GPU 0, because all LUTs always reside on GPU
   // 0
-  cuda_memcpy_async_to_gpu(
-      (*mem_ptr)->get_lut(gpu_indexes[0], 0), (void *)input_lut,
-      (params.glwe_dimension + 1) * params.polynomial_size * sizeof(Torus),
-      streams[0], gpu_indexes[0]);
-  (*mem_ptr)->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]);
+  cuda_memcpy_async_to_gpu((*mem_ptr)->get_lut(0, 0), (void *)input_lut,
+                           (params.glwe_dimension + 1) *
+                               params.polynomial_size * sizeof(Torus),
+                           streams[0], gpu_indexes[0]);
+  (*mem_ptr)->broadcast_lut(streams, gpu_indexes, 0);
 }

 template <typename Torus>
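For scale: the buffer each scratch function copies before broadcasting is one GLWE accumulator of `(params.glwe_dimension + 1) * params.polynomial_size` words of `Torus`. A quick size check with illustrative 64-bit parameters (the concrete values are assumptions, not fixed by this diff):

```rust
fn main() {
    // Illustrative parameters: glwe_dimension = 1, polynomial_size = 2048, Torus = u64.
    // This mirrors the byte count passed to cuda_memcpy_async_to_gpu above.
    let glwe_dimension: usize = 1;
    let polynomial_size: usize = 2048;
    let lut_bytes = (glwe_dimension + 1) * polynomial_size * std::mem::size_of::<u64>();
    assert_eq!(lut_bytes, 32_768); // 2 * 2048 * 8 bytes = 32 KiB per LUT
    println!("one LUT accumulator = {lut_bytes} bytes");
}
```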
10 changes: 5 additions & 5 deletions backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh
@@ -267,8 +267,8 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(
         streams, gpu_indexes, gpu_count, mem_ptr->params, 2,
         2 * ch_amount * num_blocks, reused_lut);
   }
-  auto message_acc = luts_message_carry->get_lut(gpu_indexes[0], 0);
-  auto carry_acc = luts_message_carry->get_lut(gpu_indexes[0], 1);
+  auto message_acc = luts_message_carry->get_lut(0, 0);
+  auto carry_acc = luts_message_carry->get_lut(0, 1);

   // define functions for each accumulator
   auto lut_f_message = [message_modulus](Torus x) -> Torus {
@@ -285,7 +285,7 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(
   generate_device_accumulator<Torus>(
       streams[0], gpu_indexes[0], carry_acc, glwe_dimension, polynomial_size,
       message_modulus, carry_modulus, lut_f_carry);
-  luts_message_carry->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]);
+  luts_message_carry->broadcast_lut(streams, gpu_indexes, 0);

   while (r > 2) {
     size_t cur_total_blocks = r * num_blocks;
@@ -334,10 +334,10 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(
   if (carry_count > 0)
     cuda_set_value_async<Torus>(
         streams[0], gpu_indexes[0],
-        luts_message_carry->get_lut_indexes(gpu_indexes[0], message_count), 1,
+        luts_message_carry->get_lut_indexes(0, message_count), 1,
         carry_count);

-  luts_message_carry->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]);
+  luts_message_carry->broadcast_lut(streams, gpu_indexes, 0);

   /// For multi GPU execution we create vectors of pointers for inputs and
   /// outputs
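The `get_lut_indexes(0, message_count)` call above patches the per-block LUT-selector array on the first GPU: blocks `[0, message_count)` keep selector 0 (the message accumulator) and the next `carry_count` entries are set to 1 (the carry accumulator) before the whole set is broadcast by position. A plain-Rust, host-side sketch of what that `cuda_set_value_async` call achieves (`build_lut_indexes` is a hypothetical helper, not backend API):

```rust
// One selector per radix block: 0 routes the block through the message LUT,
// 1 through the carry LUT.
fn build_lut_indexes(message_count: usize, carry_count: usize) -> Vec<u64> {
    let mut lut_indexes = vec![0u64; message_count + carry_count];
    // Host-side equivalent of:
    // cuda_set_value_async(..., get_lut_indexes(0, message_count), 1, carry_count)
    for selector in &mut lut_indexes[message_count..] {
        *selector = 1;
    }
    lut_indexes
}

fn main() {
    assert_eq!(build_lut_indexes(3, 2), vec![0, 0, 0, 1, 1]);
}
```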
8 changes: 4 additions & 4 deletions backends/tfhe-cuda-backend/cuda/src/integer/scalar_bitops.cuh
@@ -31,10 +31,10 @@ __host__ void host_integer_radix_scalar_bitop_kb(
   } else {
     // We have all possible LUTs pre-computed and we use the decomposed scalar
     // as index to recover the right one
-    cuda_memcpy_async_gpu_to_gpu(lut->get_lut_indexes(gpu_indexes[0], 0),
-                                 clear_blocks, num_clear_blocks * sizeof(Torus),
-                                 streams[0], gpu_indexes[0]);
-    lut->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]);
+    cuda_memcpy_async_gpu_to_gpu(lut->get_lut_indexes(0, 0), clear_blocks,
+                                 num_clear_blocks * sizeof(Torus), streams[0],
+                                 gpu_indexes[0]);
+    lut->broadcast_lut(streams, gpu_indexes, 0);

     integer_radix_apply_univariate_lookup_table_kb<Torus>(
         streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_input, bsks,
39 changes: 19 additions & 20 deletions backends/tfhe-cuda-backend/cuda/src/integer/scalar_comparison.cuh
@@ -110,11 +110,11 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
       };

       auto lut = mem_ptr->diff_buffer->tree_buffer->tree_last_leaf_scalar_lut;
-      generate_device_accumulator<Torus>(
-          streams[0], gpu_indexes[0], lut->get_lut(gpu_indexes[0], 0),
-          glwe_dimension, polynomial_size, message_modulus, carry_modulus,
-          scalar_last_leaf_lut_f);
-      lut->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]);
+      generate_device_accumulator<Torus>(streams[0], gpu_indexes[0],
+                                         lut->get_lut(0, 0), glwe_dimension,
+                                         polynomial_size, message_modulus,
+                                         carry_modulus, scalar_last_leaf_lut_f);
+      lut->broadcast_lut(streams, gpu_indexes, 0);

       integer_radix_apply_univariate_lookup_table_kb<Torus>(
           streams, gpu_indexes, gpu_count, lwe_array_out,
@@ -194,10 +194,10 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(

       auto lut = diff_buffer->tree_buffer->tree_last_leaf_scalar_lut;
       generate_device_accumulator_bivariate<Torus>(
-          streams[0], gpu_indexes[0], lut->get_lut(gpu_indexes[0], 0),
-          glwe_dimension, polynomial_size, message_modulus, carry_modulus,
+          streams[0], gpu_indexes[0], lut->get_lut(0, 0), glwe_dimension,
+          polynomial_size, message_modulus, carry_modulus,
           scalar_bivariate_last_leaf_lut_f);
-      lut->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]);
+      lut->broadcast_lut(streams, gpu_indexes, 0);

       integer_radix_apply_bivariate_lookup_table_kb<Torus>(
           streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_lsb_out,
@@ -329,10 +329,10 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(

       auto lut = mem_ptr->diff_buffer->tree_buffer->tree_last_leaf_scalar_lut;
       generate_device_accumulator_bivariate<Torus>(
-          streams[0], gpu_indexes[0], lut->get_lut(gpu_indexes[0], 0),
-          glwe_dimension, polynomial_size, message_modulus, carry_modulus,
+          streams[0], gpu_indexes[0], lut->get_lut(0, 0), glwe_dimension,
+          polynomial_size, message_modulus, carry_modulus,
           scalar_bivariate_last_leaf_lut_f);
-      lut->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]);
+      lut->broadcast_lut(streams, gpu_indexes, 0);

       integer_radix_apply_bivariate_lookup_table_kb<Torus>(
           streams, gpu_indexes, gpu_count, lwe_array_out, are_all_msb_zeros,
@@ -422,11 +422,10 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(

     auto signed_msb_lut = mem_ptr->signed_msb_lut;
     generate_device_accumulator_bivariate<Torus>(
-        msb_streams[0], gpu_indexes[0],
-        signed_msb_lut->get_lut(gpu_indexes[0], 0), params.glwe_dimension,
-        params.polynomial_size, params.message_modulus, params.carry_modulus,
-        lut_f);
-    signed_msb_lut->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]);
+        msb_streams[0], gpu_indexes[0], signed_msb_lut->get_lut(0, 0),
+        params.glwe_dimension, params.polynomial_size, params.message_modulus,
+        params.carry_modulus, lut_f);
+    signed_msb_lut->broadcast_lut(streams, gpu_indexes, 0);

     Torus const *sign_block = msb + (num_msb_radix_blocks - 1) * big_lwe_size;
     integer_radix_apply_bivariate_lookup_table_kb<Torus>(
@@ -676,10 +675,10 @@ __host__ void host_integer_radix_scalar_equality_check_kb(
     pack_blocks<Torus>(lsb_streams[0], gpu_indexes[0], packed_scalar,
                        scalar_blocks, 0, num_scalar_blocks, message_modulus);

-    cuda_memcpy_async_gpu_to_gpu(
-        scalar_comparison_luts->get_lut_indexes(gpu_indexes[0], 0),
-        packed_scalar, num_halved_scalar_blocks * sizeof(Torus), lsb_streams[0],
-        gpu_indexes[0]);
+    cuda_memcpy_async_gpu_to_gpu(scalar_comparison_luts->get_lut_indexes(0, 0),
+                                 packed_scalar,
+                                 num_halved_scalar_blocks * sizeof(Torus),
+                                 lsb_streams[0], gpu_indexes[0]);
     scalar_comparison_luts->broadcast_lut(lsb_streams, gpu_indexes, 0);

     integer_radix_apply_univariate_lookup_table_kb<Torus>(
16 changes: 8 additions & 8 deletions tfhe/src/core_crypto/gpu/mod.rs
@@ -334,12 +334,12 @@ pub unsafe fn convert_lwe_programmable_bootstrap_key_async<T: UnsignedInteger>(
     polynomial_size: PolynomialSize,
 ) {
     let size = std::mem::size_of_val(src);
-    for &gpu_index in streams.gpu_indexes.iter() {
+    for (gpu_index, &stream) in streams.ptr.iter().enumerate() {
         assert_eq!(dest.len() * std::mem::size_of::<T>(), size);
         cuda_convert_lwe_programmable_bootstrap_key_64(
-            streams.ptr[gpu_index as usize],
-            streams.gpu_indexes[gpu_index as usize],
-            dest.get_mut_c_ptr(gpu_index),
+            stream,
+            streams.gpu_indexes[gpu_index],
+            dest.get_mut_c_ptr(gpu_index as u32),
             src.as_ptr().cast(),
             input_lwe_dim.0 as u32,
             glwe_dim.0 as u32,
@@ -367,12 +367,12 @@ pub unsafe fn convert_lwe_multi_bit_programmable_bootstrap_key_async<T: UnsignedInteger>(
     grouping_factor: LweBskGroupingFactor,
 ) {
     let size = std::mem::size_of_val(src);
-    for &gpu_index in streams.gpu_indexes.iter() {
+    for (gpu_index, &stream) in streams.ptr.iter().enumerate() {
         assert_eq!(dest.len() * std::mem::size_of::<T>(), size);
         cuda_convert_lwe_multi_bit_programmable_bootstrap_key_64(
-            streams.ptr[gpu_index as usize],
-            streams.gpu_indexes[gpu_index as usize],
-            dest.as_mut_c_ptr(gpu_index),
+            stream,
+            streams.gpu_indexes[gpu_index],
+            dest.as_mut_c_ptr(gpu_index as u32),
             src.as_ptr().cast(),
             input_lwe_dim.0 as u32,
             glwe_dim.0 as u32,
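One naming wrinkle worth noting in the two loops above: after the change, the variable still called `gpu_index` is really the *position* produced by `enumerate()`, and the device ordinal is looked up as `streams.gpu_indexes[gpu_index]`. A reduced sketch of the corrected loop shape with mock types in place of the CUDA FFI (`Streams`, `StreamPtr` and `convert_key_on_all_gpus` are hypothetical stand-ins):

```rust
type StreamPtr = *mut std::ffi::c_void; // mock CUDA stream handle

struct Streams {
    ptr: Vec<StreamPtr>,   // one stream per GPU, indexed by position
    gpu_indexes: Vec<u32>, // device ordinals, parallel to `ptr`
}

fn convert_key_on_all_gpus(streams: &Streams) {
    // `gpu_index` is the position; the ordinal rides along from the parallel vector.
    for (gpu_index, &stream) in streams.ptr.iter().enumerate() {
        let device_ordinal = streams.gpu_indexes[gpu_index];
        // In the real code, this is where
        // cuda_convert_lwe_programmable_bootstrap_key_64(stream, device_ordinal, ...)
        // receives a stream and its matching device, whatever the device set.
        let _ = (stream, device_ordinal);
    }
}

fn main() {
    let streams = Streams { ptr: vec![std::ptr::null_mut()], gpu_indexes: vec![1] };
    convert_key_on_all_gpus(&streams);
}
```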
64 changes: 15 additions & 49 deletions tfhe/src/core_crypto/gpu/vec.rs
@@ -34,16 +34,11 @@ impl<T: Numeric> Clone for CudaVec<T> {
     fn clone(&self) -> Self {
         let size = self.len as u64 * std::mem::size_of::<T>() as u64;
         let mut cloned_vec = Vec::with_capacity(self.ptr.len());
-        for &index in self.gpu_indexes.iter() {
+        for (index, &gpu_index) in self.gpu_indexes.iter().enumerate() {
             unsafe {
-                cuda_synchronize_device(index);
-                let ptr = cuda_malloc(size, self.gpu_indexes[index as usize]);
-                cuda_memcpy_gpu_to_gpu(
-                    ptr,
-                    self.ptr[index as usize],
-                    size,
-                    self.gpu_indexes[index as usize],
-                );
+                cuda_synchronize_device(gpu_index);
+                let ptr = cuda_malloc(size, gpu_index);
+                cuda_memcpy_gpu_to_gpu(ptr, self.ptr[index], size, gpu_index);
                 cloned_vec.push(ptr);
             }
         }
@@ -95,24 +90,18 @@
     pub fn new_multi_gpu(len: usize, streams: &CudaStreams) -> Self {
         let size = len as u64 * std::mem::size_of::<T>() as u64;
         let mut ptrs = Vec::with_capacity(streams.len());
-        for &gpu_index in streams.gpu_indexes.iter() {
-            let ptr = unsafe {
-                cuda_malloc_async(
-                    size,
-                    streams.ptr[gpu_index as usize],
-                    streams.gpu_indexes[gpu_index as usize],
-                )
-            };
+        for (index, &stream) in streams.ptr.iter().enumerate() {
+            let ptr = unsafe { cuda_malloc_async(size, stream, index as u32) };
             unsafe {
                 cuda_memset_async(
                     ptr,
                     0u64,
                     size,
-                    streams.ptr[gpu_index as usize],
-                    streams.gpu_indexes[gpu_index as usize],
+                    streams.ptr[index],
+                    streams.gpu_indexes[index],
                 );
             }
-            streams.synchronize_one(gpu_index);
+            streams.synchronize_one(index as u32);
             ptrs.push(ptr);
         }

@@ -171,29 +160,6 @@
         }
     }

-    /// # Safety
-    ///
-    /// - `streams` __must__ be synchronized to guarantee computation has finished, and inputs must
-    ///   not be dropped until streams is synchronised
-    pub unsafe fn memset_multi_gpu_async(&mut self, value: T, streams: &CudaStreams)
-    where
-        T: Into<u64>,
-    {
-        for &gpu_index in self.gpu_indexes.clone().iter() {
-            let size = self.len() * std::mem::size_of::<T>();
-            // We check that self is not empty to avoid invalid pointers
-            if size > 0 {
-                cuda_memset_async(
-                    self.as_mut_c_ptr(gpu_index),
-                    value.into(),
-                    size as u64,
-                    streams.ptr[gpu_index as usize],
-                    streams.gpu_indexes[gpu_index as usize],
-                );
-            }
-        }
-    }
-
     /// Copies data from slice into `CudaVec`
     ///
     /// # Safety
@@ -234,19 +200,19 @@
     where
         T: Numeric,
     {
-        for &gpu_index in streams.gpu_indexes.iter() {
+        for (gpu_index, &stream) in streams.ptr.iter().enumerate() {
             assert!(self.len() >= src.len());
             let size = std::mem::size_of_val(src);

             // We have to check that src is not empty, because Rust slice with size 0 results in an
             // invalid pointer being passed to copy_to_gpu_async
             if size > 0 {
                 cuda_memcpy_async_to_gpu(
-                    self.get_mut_c_ptr(gpu_index),
+                    self.get_mut_c_ptr(gpu_index as u32),
                     src.as_ptr().cast(),
                     size as u64,
-                    streams.ptr[gpu_index as usize],
-                    streams.gpu_indexes[gpu_index as usize],
+                    stream,
+                    streams.gpu_indexes[gpu_index],
                 );
             }
         }
@@ -473,10 +439,10 @@ unsafe impl<T> Sync for CudaVec<T> where T: Sync + Numeric {}
 impl<T: Numeric> Drop for CudaVec<T> {
     /// Free memory for pointer `ptr` synchronously
     fn drop(&mut self) {
-        for &gpu_index in self.gpu_indexes.iter() {
+        for (index, &gpu_index) in self.gpu_indexes.iter().enumerate() {
             // Synchronizes the device to be sure no stream is still using this pointer
             synchronize_device(gpu_index);
-            unsafe { cuda_drop(self.get_mut_c_ptr(gpu_index), gpu_index) };
+            unsafe { cuda_drop(self.get_mut_c_ptr(index as u32), gpu_index) };
         }
     }
 }
