From 19dc0f02f97bf02a62839884f0cc9145274e60e2 Mon Sep 17 00:00:00 2001 From: Pedro Alves Date: Wed, 24 Jul 2024 11:06:39 -0300 Subject: [PATCH] refactor(gpu): refactor sample extract and modulus switch to match CPU's version --- .../cuda/src/crypto/torus.cuh | 37 +++++------------- .../pbs/programmable_bootstrap_amortized.cuh | 8 ++-- .../pbs/programmable_bootstrap_cg_classic.cuh | 7 ++-- .../programmable_bootstrap_cg_multibit.cuh | 4 +- .../pbs/programmable_bootstrap_classic.cuh | 8 ++-- .../pbs/programmable_bootstrap_multibit.cuh | 7 ++-- .../programmable_bootstrap_tbc_classic.cuh | 8 ++-- .../programmable_bootstrap_tbc_multibit.cuh | 4 +- .../cuda/src/polynomial/functions.cuh | 38 +++++++------------ 9 files changed, 45 insertions(+), 76 deletions(-) diff --git a/backends/tfhe-cuda-backend/cuda/src/crypto/torus.cuh b/backends/tfhe-cuda-backend/cuda/src/crypto/torus.cuh index 8fce461ca7..177892e5a9 100644 --- a/backends/tfhe-cuda-backend/cuda/src/crypto/torus.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/crypto/torus.cuh @@ -39,36 +39,19 @@ __device__ inline T round_to_closest_multiple(T x, uint32_t base_log, } template -__device__ __forceinline__ void rescale_torus_element(T element, T &output, - uint32_t log_shift) { - output = - round((double)element / (double(std::numeric_limits::max()) + 1.0) * - (double)log_shift); -} +__device__ __forceinline__ void modulus_switch(T input, T &output, + uint32_t log_modulus) { + constexpr uint32_t BITS = sizeof(T) * 8; -template -__device__ __forceinline__ T rescale_torus_element(T element, - uint32_t log_shift) { - return round((double)element / (double(std::numeric_limits::max()) + 1.0) * - (double)log_shift); + output = input + (((T)1) << (BITS - log_modulus - 1)); + output >>= (BITS - log_modulus); } -template <> -__device__ __forceinline__ void -rescale_torus_element(uint32_t element, uint32_t &output, - uint32_t log_shift) { - output = - round(__uint2double_rn(element) / - (__uint2double_rn(std::numeric_limits::max()) + 1.0) * - __uint2double_rn(log_shift)); +template +__device__ __forceinline__ T modulus_switch(T input, uint32_t log_modulus) { + T output; + modulus_switch(input, output, log_modulus); + return output; } -template <> -__device__ __forceinline__ void -rescale_torus_element(uint64_t element, uint64_t &output, - uint32_t log_shift) { - output = round(__ull2double_rn(element) / - (__ull2double_rn(std::numeric_limits::max()) + 1.0) * - __uint2double_rn(log_shift)); -} #endif // CNCRT_TORUS_H diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_amortized.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_amortized.cuh index 48536f1373..668883e044 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_amortized.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_amortized.cuh @@ -89,8 +89,8 @@ __global__ void device_programmable_bootstrap_amortized( // Put "b", the body, in [0, 2N[ Torus b_hat = 0; - rescale_torus_element(block_lwe_array_in[lwe_dimension], b_hat, - 2 * params::degree); // 2 * params::log2_degree + 1); + modulus_switch(block_lwe_array_in[lwe_dimension], b_hat, + params::log2_degree + 1); divide_by_monomial_negacyclic_inplace( @@ -105,8 +105,8 @@ __global__ void device_programmable_bootstrap_amortized( // Put "a" in [0, 2N[ instead of Zq Torus a_hat = 0; - rescale_torus_element(block_lwe_array_in[iteration], a_hat, - 2 * params::degree); // 2 * params::log2_degree + 1); + modulus_switch(block_lwe_array_in[iteration], a_hat, + params::log2_degree + 1); // Perform ACC * (X^ä - 1) multiply_by_monomial_negacyclic_and_sub_polynomial< diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_classic.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_classic.cuh index a7a14d9967..3b0eecce7b 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_classic.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_classic.cuh @@ -93,8 +93,8 @@ __global__ void device_programmable_bootstrap_cg( // Put "b" in [0, 2N[ Torus b_hat = 0; - rescale_torus_element(block_lwe_array_in[lwe_dimension], b_hat, - 2 * params::degree); + modulus_switch(block_lwe_array_in[lwe_dimension], b_hat, + params::log2_degree + 1); divide_by_monomial_negacyclic_inplace( @@ -106,8 +106,7 @@ __global__ void device_programmable_bootstrap_cg( // Put "a" in [0, 2N[ Torus a_hat = 0; - rescale_torus_element(block_lwe_array_in[i], a_hat, - 2 * params::degree); // 2 * params::log2_degree + 1); + modulus_switch(block_lwe_array_in[i], a_hat, params::log2_degree + 1); // Perform ACC * (X^ä - 1) multiply_by_monomial_negacyclic_and_sub_polynomial< diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_multibit.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_multibit.cuh index 59dfd2abfa..d8e17c936f 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_multibit.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_multibit.cuh @@ -79,8 +79,8 @@ __global__ void device_multi_bit_programmable_bootstrap_cg_accumulate( if (lwe_offset == 0) { // Put "b" in [0, 2N[ Torus b_hat = 0; - rescale_torus_element(block_lwe_array_in[lwe_dimension], b_hat, - 2 * params::degree); + modulus_switch(block_lwe_array_in[lwe_dimension], b_hat, + params::log2_degree + 1); divide_by_monomial_negacyclic_inplace( diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cuh index 2413709066..8a60c6b778 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cuh @@ -74,8 +74,8 @@ __global__ void device_programmable_bootstrap_step_one( // First iteration // Put "b" in [0, 2N[ Torus b_hat = 0; - rescale_torus_element(block_lwe_array_in[lwe_dimension], b_hat, - 2 * params::degree); + modulus_switch(block_lwe_array_in[lwe_dimension], b_hat, + params::log2_degree + 1); // The y-dimension is used to select the element of the GLWE this block will // compute divide_by_monomial_negacyclic_inplace @@ -204,8 +203,8 @@ __global__ void device_multi_bit_programmable_bootstrap_accumulate_step_one( // Initializes the accumulator with the body of LWE // Put "b" in [0, 2N[ Torus b_hat = 0; - rescale_torus_element(block_lwe_array_in[lwe_dimension], b_hat, - 2 * params::degree); + modulus_switch(block_lwe_array_in[lwe_dimension], b_hat, + params::log2_degree + 1); divide_by_monomial_negacyclic_inplace( diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_classic.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_classic.cuh index ff56518064..b19cde44d9 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_classic.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_classic.cuh @@ -96,8 +96,8 @@ __global__ void device_programmable_bootstrap_tbc( // Put "b" in [0, 2N[ Torus b_hat = 0; - rescale_torus_element(block_lwe_array_in[lwe_dimension], b_hat, - 2 * params::degree); + modulus_switch(block_lwe_array_in[lwe_dimension], b_hat, + params::log2_degree + 1); divide_by_monomial_negacyclic_inplace( @@ -109,8 +109,8 @@ __global__ void device_programmable_bootstrap_tbc( // Put "a" in [0, 2N[ Torus a_hat = 0; - rescale_torus_element(block_lwe_array_in[i], a_hat, - 2 * params::degree); // 2 * params::log2_degree + 1); + modulus_switch(block_lwe_array_in[i], a_hat, + params::log2_degree + 1); // 2 * params::log2_degree + 1); // Perform ACC * (X^ä - 1) multiply_by_monomial_negacyclic_and_sub_polynomial< diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_multibit.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_multibit.cuh index 0aa7115436..fd143e494e 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_multibit.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_multibit.cuh @@ -86,8 +86,8 @@ __global__ void device_multi_bit_programmable_bootstrap_tbc_accumulate( if (lwe_offset == 0) { // Put "b" in [0, 2N[ Torus b_hat = 0; - rescale_torus_element(block_lwe_array_in[lwe_dimension], b_hat, - 2 * params::degree); + modulus_switch(block_lwe_array_in[lwe_dimension], b_hat, + params::log2_degree + 1); divide_by_monomial_negacyclic_inplace( diff --git a/backends/tfhe-cuda-backend/cuda/src/polynomial/functions.cuh b/backends/tfhe-cuda-backend/cuda/src/polynomial/functions.cuh index d9c353721b..b2384769ef 100644 --- a/backends/tfhe-cuda-backend/cuda/src/polynomial/functions.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/polynomial/functions.cuh @@ -189,37 +189,28 @@ __device__ void add_to_torus(double2 *m_values, Torus *result, } } -// Extracts the body of a GLWE. -// k is the offset to find the body element / polynomial in the lwe_array_out / -// accumulator +// Extracts the body of the nth-LWE in a GLWE. template __device__ void sample_extract_body(Torus *lwe_array_out, Torus *accumulator, - uint32_t k) { + uint32_t glwe_dimension, uint32_t nth = 0) { // Set first coefficient of the accumulator as the body of the LWE sample - lwe_array_out[k * params::degree] = accumulator[k * params::degree]; + lwe_array_out[glwe_dimension * params::degree] = + accumulator[glwe_dimension * params::degree + nth]; } -// Extracts the mask from num_poly polynomials individually +// Extracts the mask from the nth-LWE in a GLWE. template __device__ void sample_extract_mask(Torus *lwe_array_out, Torus *accumulator, - uint32_t num_poly = 1) { + uint32_t num_poly = 1, uint32_t nth = 0) { for (int z = 0; z < num_poly; z++) { Torus *lwe_array_out_slice = (Torus *)lwe_array_out + (ptrdiff_t)(z * params::degree); Torus *accumulator_slice = (Torus *)accumulator + (ptrdiff_t)(z * params::degree); - // Set ACC = -ACC - int tid = threadIdx.x; -#pragma unroll - for (int i = 0; i < params::opt; i++) { - accumulator_slice[tid] = -accumulator_slice[tid]; - tid = tid + params::degree / params::opt; - } synchronize_threads_in_block(); - // Reverse the accumulator - tid = threadIdx.x; + int tid = threadIdx.x; Torus result[params::opt]; #pragma unroll for (int i = 0; i < params::opt; i++) { @@ -227,10 +218,13 @@ __device__ void sample_extract_mask(Torus *lwe_array_out, Torus *accumulator, tid = tid + params::degree / params::opt; } synchronize_threads_in_block(); + + // Set ACC = -ACC tid = threadIdx.x; #pragma unroll for (int i = 0; i < params::opt; i++) { - accumulator_slice[tid] = result[i]; + accumulator_slice[tid] = + SEL(-result[i], result[i], tid >= params::degree - nth); tid = tid + params::degree / params::opt; } synchronize_threads_in_block(); @@ -244,23 +238,17 @@ __device__ void sample_extract_mask(Torus *lwe_array_out, Torus *accumulator, // result[i] = -accumulator_slice[tid - 1 + params::degree]; // else // result[i] = accumulator_slice[tid - 1]; - int x = tid - 1 + SEL(0, params::degree, tid < 1); + int x = tid - 1 + SEL(0, params::degree - nth, tid < 1); result[i] = SEL(1, -1, tid < 1) * accumulator_slice[x]; tid += params::degree / params::opt; } synchronize_threads_in_block(); - tid = threadIdx.x; - for (int i = 0; i < params::opt; i++) { - accumulator_slice[tid] = result[i]; - tid += params::degree / params::opt; - } - synchronize_threads_in_block(); // Copy to the mask of the LWE sample tid = threadIdx.x; #pragma unroll for (int i = 0; i < params::opt; i++) { - lwe_array_out_slice[tid] = accumulator_slice[tid]; + lwe_array_out_slice[tid] = result[i]; tid = tid + params::degree / params::opt; } }