From e087840ff593cc62ffb289c51f33898dc315fffa Mon Sep 17 00:00:00 2001 From: Pedro Alves Date: Fri, 20 Dec 2024 12:22:43 -0300 Subject: [PATCH 1/2] chore(gpu): port fix to compression encoding - Modifies the generation of the LUT used in decompression so that the delta is calculated with a different precision, as in the CPU implementation --- .../compression/compression_utilities.h | 32 ++++--- .../cuda/include/integer/integer_utilities.h | 9 ++ .../src/integer/compression/compression.cuh | 2 +- .../cuda/src/integer/integer.cuh | 94 ++++++++++++++----- .../ciphertext/compressed_ciphertext_list.rs | 2 +- .../gpu/list_compression/server_keys.rs | 22 ++++- 6 files changed, 120 insertions(+), 41 deletions(-) diff --git a/backends/tfhe-cuda-backend/cuda/include/integer/compression/compression_utilities.h b/backends/tfhe-cuda-backend/cuda/include/integer/compression/compression_utilities.h index db4aeb4efc..632539b1d4 100644 --- a/backends/tfhe-cuda-backend/cuda/include/integer/compression/compression_utilities.h +++ b/backends/tfhe-cuda-backend/cuda/include/integer/compression/compression_utilities.h @@ -64,7 +64,7 @@ template struct int_decompression { Torus *tmp_extracted_lwe; uint32_t *tmp_indexes_array; - int_radix_lut *carry_extract_lut; + int_radix_lut *decompression_rescale_lut; int_decompression(cudaStream_t const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count, int_radix_params encryption_params, @@ -83,7 +83,7 @@ template struct int_decompression { Torus lwe_accumulator_size = (compression_params.glwe_dimension * compression_params.polynomial_size + 1); - carry_extract_lut = new int_radix_lut( + decompression_rescale_lut = new int_radix_lut( streams, gpu_indexes, gpu_count, encryption_params, 1, num_radix_blocks, allocate_gpu_memory); @@ -96,18 +96,28 @@ template struct int_decompression { num_radix_blocks * lwe_accumulator_size * sizeof(Torus), streams[0], gpu_indexes[0]); - // Carry extract LUT - auto carry_extract_f = [encryption_params](Torus x) -> Torus { - return x / encryption_params.message_modulus; + // Rescale is done using an identity LUT + // Here we do not divide by message_modulus + // Example: in the 2_2 case we are mapping a 2 bits message onto a 4 bits + // space, we want to keep the original 2 bits value in the 4 bits space, + // so we apply the identity and the encoding will rescale it for us. + auto decompression_rescale_f = [encryption_params](Torus x) -> Torus { + return x; }; - generate_device_accumulator( - streams[0], gpu_indexes[0], carry_extract_lut->get_lut(0, 0), + auto effective_compression_message_modulus = + encryption_params.carry_modulus; + auto effective_compression_carry_modulus = 1; + + generate_device_accumulator_with_encoding( + streams[0], gpu_indexes[0], decompression_rescale_lut->get_lut(0, 0), encryption_params.glwe_dimension, encryption_params.polynomial_size, + effective_compression_message_modulus, + effective_compression_carry_modulus, encryption_params.message_modulus, encryption_params.carry_modulus, - carry_extract_f); + decompression_rescale_f); - carry_extract_lut->broadcast_lut(streams, gpu_indexes, 0); + decompression_rescale_lut->broadcast_lut(streams, gpu_indexes, 0); } } void release(cudaStream_t const *streams, uint32_t const *gpu_indexes, @@ -116,8 +126,8 @@ template struct int_decompression { cuda_drop_async(tmp_extracted_lwe, streams[0], gpu_indexes[0]); cuda_drop_async(tmp_indexes_array, streams[0], gpu_indexes[0]); - carry_extract_lut->release(streams, gpu_indexes, gpu_count); - delete carry_extract_lut; + decompression_rescale_lut->release(streams, gpu_indexes, gpu_count); + delete decompression_rescale_lut; } }; #endif diff --git a/backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h b/backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h index 5a35868c54..186b8c7ce2 100644 --- a/backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h +++ b/backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h @@ -38,6 +38,15 @@ void generate_device_accumulator_bivariate_with_factor( cudaStream_t stream, uint32_t gpu_index, Torus *acc_bivariate, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t message_modulus, uint32_t carry_modulus, std::function f, int factor); + +template +void generate_device_accumulator_with_encoding( + cudaStream_t stream, uint32_t gpu_index, Torus *acc, + uint32_t glwe_dimension, uint32_t polynomial_size, + uint32_t input_message_modulus, uint32_t input_carry_modulus, + uint32_t output_message_modulus, uint32_t output_carry_modulus, + std::function f); + /* * generate univariate accumulator (lut) for device pointer * stream - cuda stream diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cuh index 097dc47f32..86d0065ded 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cuh @@ -300,7 +300,7 @@ __host__ void host_integer_decompress( /// Apply PBS to apply a LUT, reduce the noise and go from a small LWE /// dimension to a big LWE dimension auto encryption_params = h_mem_ptr->encryption_params; - auto lut = h_mem_ptr->carry_extract_lut; + auto lut = h_mem_ptr->decompression_rescale_lut; auto active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count); if (active_gpu_count == 1) { execute_pbs_async( diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh index dfb80e1ad9..b5319ba9f5 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh @@ -627,26 +627,46 @@ void rotate_left(Torus *buffer, int mid, uint32_t array_length) { std::rotate(buffer, buffer + mid, buffer + array_length); } +/// Caller needs to ensure that the operation applied is coherent from an +/// encoding perspective. +/// +/// For example: +/// +/// Input encoding has 2 bits and output encoding has 4 bits, applying the +/// identity lut would map the following: +/// +/// 0|00|xx -> 0|00|00 +/// 0|01|xx -> 0|00|01 +/// 0|10|xx -> 0|00|10 +/// 0|11|xx -> 0|00|11 +/// +/// The reason is the identity function is computed in the input space but the +/// scaling is done in the output space, as there are more bits in the output +/// space, the delta is smaller hence the apparent "division" happening. template -void generate_lookup_table(Torus *acc, uint32_t glwe_dimension, - uint32_t polynomial_size, uint32_t message_modulus, - uint32_t carry_modulus, - std::function f) { - - uint32_t modulus_sup = message_modulus * carry_modulus; - uint32_t box_size = polynomial_size / modulus_sup; - Torus delta = (1ul << 63) / modulus_sup; +void generate_lookup_table_with_encoding(Torus *acc, uint32_t glwe_dimension, + uint32_t polynomial_size, + uint32_t input_message_modulus, + uint32_t input_carry_modulus, + uint32_t output_message_modulus, + uint32_t output_carry_modulus, + std::function f) { + + uint32_t input_modulus_sup = input_message_modulus * input_carry_modulus; + uint32_t output_modulus_sup = output_message_modulus * output_carry_modulus; + uint32_t box_size = polynomial_size / input_modulus_sup; + Torus output_delta = (1ul << 63) / output_modulus_sup; memset(acc, 0, glwe_dimension * polynomial_size * sizeof(Torus)); auto body = &acc[glwe_dimension * polynomial_size]; // This accumulator extracts the carry bits - for (int i = 0; i < modulus_sup; i++) { + for (int i = 0; i < input_modulus_sup; i++) { int index = i * box_size; for (int j = index; j < index + box_size; j++) { auto f_eval = f(i); - body[j] = f_eval * delta; + body[j] = f_eval * output_delta; } } @@ -660,6 +680,16 @@ void generate_lookup_table(Torus *acc, uint32_t glwe_dimension, rotate_left(body, half_box_size, polynomial_size); } +template +void generate_lookup_table(Torus *acc, uint32_t glwe_dimension, + uint32_t polynomial_size, uint32_t message_modulus, + uint32_t carry_modulus, + std::function f) { + generate_lookup_table_with_encoding(acc, glwe_dimension, polynomial_size, + message_modulus, carry_modulus, + message_modulus, carry_modulus, f); +} + template void generate_many_lookup_table( Torus *acc, uint32_t glwe_dimension, uint32_t polynomial_size, @@ -803,28 +833,22 @@ void generate_device_accumulator_bivariate_with_factor( free(h_lut); } -/* - * generate accumulator for device pointer - * v_stream - cuda stream - * acc - device pointer for accumulator - * ... - * f - evaluating function with one Torus input - */ template -void generate_device_accumulator(cudaStream_t stream, uint32_t gpu_index, - Torus *acc, uint32_t glwe_dimension, - uint32_t polynomial_size, - uint32_t message_modulus, - uint32_t carry_modulus, - std::function f) { +void generate_device_accumulator_with_encoding( + cudaStream_t stream, uint32_t gpu_index, Torus *acc, + uint32_t glwe_dimension, uint32_t polynomial_size, + uint32_t input_message_modulus, uint32_t input_carry_modulus, + uint32_t output_message_modulus, uint32_t output_carry_modulus, + std::function f) { // host lut Torus *h_lut = (Torus *)malloc((glwe_dimension + 1) * polynomial_size * sizeof(Torus)); // fill accumulator - generate_lookup_table(h_lut, glwe_dimension, polynomial_size, - message_modulus, carry_modulus, f); + generate_lookup_table_with_encoding( + h_lut, glwe_dimension, polynomial_size, input_message_modulus, + input_carry_modulus, output_message_modulus, output_carry_modulus, f); // copy host lut and lut_indexes_vec to device cuda_memcpy_async_to_gpu( @@ -835,6 +859,26 @@ void generate_device_accumulator(cudaStream_t stream, uint32_t gpu_index, free(h_lut); } +/* + * generate accumulator for device pointer + * v_stream - cuda stream + * acc - device pointer for accumulator + * ... + * f - evaluating function with one Torus input + */ +template +void generate_device_accumulator(cudaStream_t stream, uint32_t gpu_index, + Torus *acc, uint32_t glwe_dimension, + uint32_t polynomial_size, + uint32_t message_modulus, + uint32_t carry_modulus, + std::function f) { + + generate_device_accumulator_with_encoding( + stream, gpu_index, acc, glwe_dimension, polynomial_size, message_modulus, + carry_modulus, message_modulus, carry_modulus, f); +} + /* * generate many lut accumulator for device pointer * v_stream - cuda stream diff --git a/tfhe/src/integer/gpu/ciphertext/compressed_ciphertext_list.rs b/tfhe/src/integer/gpu/ciphertext/compressed_ciphertext_list.rs index ef4f88e13f..d6c6714f19 100644 --- a/tfhe/src/integer/gpu/ciphertext/compressed_ciphertext_list.rs +++ b/tfhe/src/integer/gpu/ciphertext/compressed_ciphertext_list.rs @@ -103,7 +103,7 @@ impl CudaCompressedCiphertextList { start_block_index, end_block_index, streams, - ), + ).unwrap(), current_info, )) } diff --git a/tfhe/src/integer/gpu/list_compression/server_keys.rs b/tfhe/src/integer/gpu/list_compression/server_keys.rs index 3faa876d57..cafef0da98 100644 --- a/tfhe/src/integer/gpu/list_compression/server_keys.rs +++ b/tfhe/src/integer/gpu/list_compression/server_keys.rs @@ -195,7 +195,23 @@ impl CudaDecompressionKey { start_block_index: usize, end_block_index: usize, streams: &CudaStreams, - ) -> CudaRadixCiphertext { + ) -> Result { + if self.message_modulus.0 != self.carry_modulus.0 { + return Err(crate::Error::new(format!( + "Tried to unpack values from a list where message modulus \ + ({:?}) is != carry modulus ({:?}), this is not supported.", + self.message_modulus, self.carry_modulus, + ))); + } + + if end_block_index >= packed_list.bodies_count { + return Err(crate::Error::new(format!( + "Tried getting index {end_block_index} for CompressedCiphertextList \ + with {} elements, out of bound access.", + packed_list.bodies_count + ))); + } + let indexes_array = (start_block_index..=end_block_index) .map(|x| x as u32) .collect_vec(); @@ -264,10 +280,10 @@ impl CudaDecompressionKey { let blocks = vec![first_block_info; output_lwe.0.lwe_ciphertext_count.0]; - CudaRadixCiphertext { + Ok(CudaRadixCiphertext { d_blocks: output_lwe, info: CudaRadixCiphertextInfo { blocks }, - } + }) } CudaBootstrappingKey::MultiBit(_) => { panic! {"Compression is currently not compatible with Multi-Bit PBS"} From 3492e81a876a15b544896de02079d6f1235078ab Mon Sep 17 00:00:00 2001 From: Pedro Alves Date: Mon, 6 Jan 2025 08:50:02 -0300 Subject: [PATCH 2/2] fix(gpu): fix delta calculation when Torus is not a 64-bit type --- .../cuda/src/integer/integer.cuh | 18 ++++++++++++------ .../ciphertext/compressed_ciphertext_list.rs | 16 +++++++++------- 2 files changed, 21 insertions(+), 13 deletions(-) diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh index b5319ba9f5..f6e53b5050 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh @@ -655,7 +655,9 @@ void generate_lookup_table_with_encoding(Torus *acc, uint32_t glwe_dimension, uint32_t input_modulus_sup = input_message_modulus * input_carry_modulus; uint32_t output_modulus_sup = output_message_modulus * output_carry_modulus; uint32_t box_size = polynomial_size / input_modulus_sup; - Torus output_delta = (1ul << 63) / output_modulus_sup; + auto nbits = sizeof(Torus) * 8; + Torus output_delta = + (static_cast(1) << (nbits - 1)) / output_modulus_sup; memset(acc, 0, glwe_dimension * polynomial_size * sizeof(Torus)); @@ -698,7 +700,8 @@ void generate_many_lookup_table( uint32_t modulus_sup = message_modulus * carry_modulus; uint32_t box_size = polynomial_size / modulus_sup; - Torus delta = (1ul << 63) / modulus_sup; + auto nbits = sizeof(Torus) * 8; + Torus delta = (static_cast(1) << (nbits - 1)) / modulus_sup; memset(acc, 0, glwe_dimension * polynomial_size * sizeof(Torus)); @@ -1099,7 +1102,8 @@ void host_compute_propagation_simulators_and_group_carries( message_modulus, carry_modulus); uint32_t modulus_sup = message_modulus * carry_modulus; - Torus delta = (1ull << 63) / modulus_sup; + auto nbits = sizeof(Torus) * 8; + Torus delta = (static_cast(1) << (nbits - 1)) / modulus_sup; auto simulators = mem->simulators; auto grouping_pgns = mem->grouping_pgns; host_radix_split_simulators_and_grouping_pgns( @@ -1426,8 +1430,8 @@ __host__ void create_trivial_radix(cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out, Torus const *scalar_array, uint32_t lwe_dimension, uint32_t num_radix_blocks, - uint32_t num_scalar_blocks, uint64_t message_modulus, - uint64_t carry_modulus) { + uint32_t num_scalar_blocks, Torus message_modulus, + Torus carry_modulus) { cudaSetDevice(gpu_index); size_t radix_size = (lwe_dimension + 1) * num_radix_blocks; @@ -1447,7 +1451,9 @@ create_trivial_radix(cudaStream_t stream, uint32_t gpu_index, // Value of the shift we multiply our messages by // If message_modulus and carry_modulus are always powers of 2 we can simplify // this - uint64_t delta = ((uint64_t)1 << 63) / (message_modulus * carry_modulus); + auto nbits = sizeof(Torus) * 8; + Torus delta = (static_cast(1) << (nbits - 1)) / + (message_modulus * carry_modulus); device_create_trivial_radix<<>>( lwe_array_out, scalar_array, num_scalar_blocks, lwe_dimension, delta); diff --git a/tfhe/src/integer/gpu/ciphertext/compressed_ciphertext_list.rs b/tfhe/src/integer/gpu/ciphertext/compressed_ciphertext_list.rs index d6c6714f19..9d17f0f504 100644 --- a/tfhe/src/integer/gpu/ciphertext/compressed_ciphertext_list.rs +++ b/tfhe/src/integer/gpu/ciphertext/compressed_ciphertext_list.rs @@ -97,13 +97,15 @@ impl CudaCompressedCiphertextList { let end_block_index = start_block_index + current_info.num_blocks() - 1; Some(( - decomp_key.unpack( - &self.packed_list, - current_info, - start_block_index, - end_block_index, - streams, - ).unwrap(), + decomp_key + .unpack( + &self.packed_list, + current_info, + start_block_index, + end_block_index, + streams, + ) + .unwrap(), current_info, )) }