From 04d2646ebbf941e5183fe073531f9b030c7802ae Mon Sep 17 00:00:00 2001
From: Guillermo Oyarzun
Date: Thu, 8 Aug 2024 12:30:46 +0200
Subject: [PATCH] refactor(gpu): avoid synchronizations in the keybundle

---
 .../pbs/programmable_bootstrap_multibit.cuh   | 78 +++++++++----------
 .../cuda/src/polynomial/functions.cuh         |  7 ++
 .../cuda/src/polynomial/polynomial_math.cuh   | 25 ++++++
 3 files changed, 71 insertions(+), 39 deletions(-)

diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cuh
index b5f5e43f1e..501c023b0f 100644
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cuh
@@ -18,9 +18,9 @@
 #include
 
 template <typename Torus, class params>
-__device__ Torus calculates_monomial_degree(const Torus *lwe_array_group,
-                                            uint32_t ggsw_idx,
-                                            uint32_t grouping_factor) {
+__device__ uint32_t calculates_monomial_degree(const Torus *lwe_array_group,
+                                               uint32_t ggsw_idx,
+                                               uint32_t grouping_factor) {
   Torus x = 0;
   for (int i = 0; i < grouping_factor; i++) {
     uint32_t mask_position = grouping_factor - (i + 1);
@@ -31,6 +31,13 @@ __device__ Torus calculates_monomial_degree(const Torus *lwe_array_group,
   return modulus_switch(x, params::log2_degree + 1);
 }
 
+__device__ __forceinline__ int
+get_start_ith_ggsw_offset(uint32_t polynomial_size, int glwe_dimension,
+                          uint32_t level_count) {
+  return polynomial_size * (glwe_dimension + 1) * (glwe_dimension + 1) *
+         level_count;
+}
+
 template
 __global__ void device_multi_bit_programmable_bootstrap_keybundle(
     const Torus *__restrict__ lwe_array_in,
@@ -60,8 +67,6 @@ __global__ void device_multi_bit_programmable_bootstrap_keybundle(
   uint32_t input_idx = blockIdx.x / lwe_chunk_size;
 
   if (lwe_iteration < (lwe_dimension / grouping_factor)) {
-    //
-    Torus *accumulator = (Torus *)selected_memory;
 
     const Torus *block_lwe_array_in =
         &lwe_array_in[lwe_input_indexes[input_idx] * (lwe_dimension + 1)];
@@ -81,57 +86,52 @@ __global__ void device_multi_bit_programmable_bootstrap_keybundle(
     const Torus *bsk_slice = get_multi_bit_ith_lwe_gth_group_kth_block(
         bootstrapping_key, 0, rev_lwe_iteration, glwe_id, level_id,
         grouping_factor, 2 * polynomial_size, glwe_dimension, level_count);
-    const Torus *bsk_poly = bsk_slice + poly_id * params::degree;
+    const Torus *bsk_poly_ini = bsk_slice + poly_id * params::degree;
 
-    copy_polynomial<Torus, params::opt, params::degree / params::opt>(
-        bsk_poly, accumulator);
+    Torus reg_acc[params::opt];
 
-    // Accumulate the other terms
-    for (int g = 1; g < (1 << grouping_factor); g++) {
+    copy_polynomial_in_regs<Torus, params::opt, params::degree / params::opt>(
+        bsk_poly_ini, reg_acc);
 
-      const Torus *bsk_slice = get_multi_bit_ith_lwe_gth_group_kth_block(
-          bootstrapping_key, g, rev_lwe_iteration, glwe_id, level_id,
-          grouping_factor, 2 * polynomial_size, glwe_dimension, level_count);
-      const Torus *bsk_poly = bsk_slice + poly_id * params::degree;
+    int offset =
+        get_start_ith_ggsw_offset(polynomial_size, glwe_dimension, level_count);
 
-      // Calculates the monomial degree
+    // Precalculate the monomial degrees and store them in shared memory
+    uint32_t *monomial_degrees = (uint32_t *)selected_memory;
+    if (threadIdx.x < (1 << grouping_factor)) {
       const Torus *lwe_array_group =
          block_lwe_array_in + rev_lwe_iteration * grouping_factor;
-      uint32_t monomial_degree = calculates_monomial_degree<Torus, params>(
-          lwe_array_group, g, grouping_factor);
-
-      synchronize_threads_in_block();
-      // Multiply by the bsk element
-      polynomial_accumulate_monic_monomial_mul(
-          accumulator, bsk_poly, monomial_degree, threadIdx.x, params::degree,
-          params::opt, false);
+      monomial_degrees[threadIdx.x] = calculates_monomial_degree<Torus, params>(
+          lwe_array_group, threadIdx.x, grouping_factor);
     }
-    synchronize_threads_in_block();
 
-    // Move accumulator to local memory
-    double2 temp[params::opt / 2];
-    int tid = threadIdx.x;
-#pragma unroll
-    for (int i = 0; i < params::opt / 2; i++) {
-      temp[i].x = __ll2double_rn((int64_t)accumulator[tid]);
-      temp[i].y =
-          __ll2double_rn((int64_t)accumulator[tid + params::degree / 2]);
-      temp[i].x /= (double)std::numeric_limits<Torus>::max();
-      temp[i].y /= (double)std::numeric_limits<Torus>::max();
-      tid += params::degree / params::opt;
+    // Accumulate the other terms
+    for (int g = 1; g < (1 << grouping_factor); g++) {
+
+      uint32_t monomial_degree = monomial_degrees[g];
+
+      const Torus *bsk_poly = bsk_poly_ini + g * offset;
+      // Multiply by the bsk element
+      polynomial_product_accumulate_by_monomial_nosync<Torus, params>(
+          reg_acc, bsk_poly, monomial_degree);
     }
+    synchronize_threads_in_block(); // needed because we are going to reuse the
+                                    // shared memory for the fft
 
-    synchronize_threads_in_block();
     // Move from local memory back to shared memory but as complex
-    tid = threadIdx.x;
+    int tid = threadIdx.x;
     double2 *fft = (double2 *)selected_memory;
 #pragma unroll
     for (int i = 0; i < params::opt / 2; i++) {
-      fft[tid] = temp[i];
+      fft[tid] =
+          make_double2(__ll2double_rn((int64_t)reg_acc[i]) /
+                           (double)std::numeric_limits<Torus>::max(),
+                       __ll2double_rn((int64_t)reg_acc[i + params::opt / 2]) /
+                           (double)std::numeric_limits<Torus>::max());
       tid += params::degree / params::opt;
     }
-    synchronize_threads_in_block();
+
     NSMFFT_direct<HalfDegree<params>>(fft);
 
     // lwe iteration
diff --git a/backends/tfhe-cuda-backend/cuda/src/polynomial/functions.cuh b/backends/tfhe-cuda-backend/cuda/src/polynomial/functions.cuh
index f94c8e5848..64d328b1e3 100644
--- a/backends/tfhe-cuda-backend/cuda/src/polynomial/functions.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/polynomial/functions.cuh
@@ -31,6 +31,13 @@ __device__ void copy_polynomial(const T *__restrict__ source, T *dst) {
     tid = tid + block_size;
   }
 }
+template <typename T, int elems_per_thread, int block_size>
+__device__ void copy_polynomial_in_regs(const T *__restrict__ source, T *dst) {
+#pragma unroll
+  for (int i = 0; i < elems_per_thread; i++) {
+    dst[i] = source[threadIdx.x + i * block_size];
+  }
+}
 
 /*
  * Receives num_poly concatenated polynomials of type T. For each:
diff --git a/backends/tfhe-cuda-backend/cuda/src/polynomial/polynomial_math.cuh b/backends/tfhe-cuda-backend/cuda/src/polynomial/polynomial_math.cuh
index 37a1135346..c054926f70 100644
--- a/backends/tfhe-cuda-backend/cuda/src/polynomial/polynomial_math.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/polynomial/polynomial_math.cuh
@@ -83,4 +83,29 @@ __device__ void polynomial_accumulate_monic_monomial_mul(
   }
 }
 
+template <typename T, class params>
+__device__ void polynomial_product_accumulate_by_monomial_nosync(
+    T *result, const T *__restrict__ poly, uint32_t monomial_degree) {
+  // monomial_degree \in [0, 2 * params::degree)
+  int full_cycles_count = monomial_degree / params::degree;
+  int remainder_degrees = monomial_degree % params::degree;
+
+// Every thread has a fixed position to track instead of "chasing" the
+// position
+#pragma unroll
+  for (int i = 0; i < params::opt; i++) {
+    int pos =
+        (threadIdx.x + i * (params::degree / params::opt) - monomial_degree) &
+        (params::degree - 1);
+
+    T element = poly[pos];
+    T x = SEL(element, -element, full_cycles_count % 2);
+    x = SEL(-x, x,
+            threadIdx.x + i * (params::degree / params::opt) >=
+                remainder_degrees);
+
+    result[i] += x;
+  }
+}
+
 #endif // CNCRT_POLYNOMIAL_MATH_H
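For reference, polynomial_product_accumulate_by_monomial_nosync relies only on the negacyclic identity X^N = -1 in Z[X]/(X^N + 1): multiplying a bootstrapping-key polynomial by the monic monomial X^d with d in [0, 2N) just rotates indices and flips signs, so every output coefficient reads exactly one input coefficient. That is why each thread can hold its params::opt coefficients in registers (reg_acc) and accumulate the remaining 2^grouping_factor - 1 terms with no block-wide barrier; the only synchronization left in this part of the kernel is the one guarding the reuse of shared memory for the FFT. A minimal standalone sketch of the same index/sign rule, written as a scalar C++ loop with illustrative names (accumulate_monomial_product is not a function from the repository):

#include <cstdint>
#include <cstdio>
#include <vector>

// acc[j] += (X^degree * poly)[j] over Z[X]/(X^N + 1), with degree in [0, 2 * N).
void accumulate_monomial_product(std::vector<int64_t> &acc,
                                 const std::vector<int64_t> &poly,
                                 uint32_t degree) {
  const size_t N = poly.size(); // assumed to be a power of two
  const uint32_t full_cycles = degree / N;
  const uint32_t remainder = degree % N;
  for (size_t j = 0; j < N; j++) {
    // Source coefficient that lands at position j after multiplying by X^degree.
    const size_t pos = (j + 2 * N - degree) & (N - 1);
    int64_t x = poly[pos];
    if (full_cycles % 2) // crossed X^N == -1 an odd number of times already
      x = -x;
    if (j < remainder)   // this coefficient wrapped around X^N once more
      x = -x;
    acc[j] += x;
  }
}

int main() {
  // (1 + 2X) * X^3 in Z[X]/(X^4 + 1) equals X^3 + 2X^4 = -2 + X^3.
  std::vector<int64_t> acc(4, 0), poly = {1, 2, 0, 0};
  accumulate_monomial_product(acc, poly, 3);
  for (int64_t c : acc)
    std::printf("%lld ", (long long)c); // prints: -2 0 0 1
  std::printf("\n");
  return 0;
}

The GPU helper distributes the j loop across threads as j = threadIdx.x + i * (params::degree / params::opt), which is exactly the fixed per-thread position mentioned in the comment inside the new device function.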