From 04d2646ebbf941e5183fe073531f9b030c7802ae Mon Sep 17 00:00:00 2001
From: Guillermo Oyarzun
Date: Thu, 8 Aug 2024 12:30:46 +0200
Subject: [PATCH] refactor(gpu): avoid synchronizations in the keybundle

---
 .../pbs/programmable_bootstrap_multibit.cuh   | 78 +++++++++----------
 .../cuda/src/polynomial/functions.cuh         |  7 ++
 .../cuda/src/polynomial/polynomial_math.cuh   | 25 ++++++
 3 files changed, 71 insertions(+), 39 deletions(-)

diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cuh
index b5f5e43f1e..501c023b0f 100644
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cuh
@@ -18,9 +18,9 @@
 #include
 
 template <typename Torus, class params>
-__device__ Torus calculates_monomial_degree(const Torus *lwe_array_group,
-                                            uint32_t ggsw_idx,
-                                            uint32_t grouping_factor) {
+__device__ uint32_t calculates_monomial_degree(const Torus *lwe_array_group,
+                                               uint32_t ggsw_idx,
+                                               uint32_t grouping_factor) {
   Torus x = 0;
   for (int i = 0; i < grouping_factor; i++) {
     uint32_t mask_position = grouping_factor - (i + 1);
@@ -31,6 +31,13 @@ __device__ Torus calculates_monomial_degree(const Torus *lwe_array_group,
   return modulus_switch(x, params::log2_degree + 1);
 }
 
+__device__ __forceinline__ int
+get_start_ith_ggsw_offset(uint32_t polynomial_size, int glwe_dimension,
+                          uint32_t level_count) {
+  return polynomial_size * (glwe_dimension + 1) * (glwe_dimension + 1) *
+         level_count;
+}
+
 template
 __global__ void device_multi_bit_programmable_bootstrap_keybundle(
     const Torus *__restrict__ lwe_array_in,
@@ -60,8 +67,6 @@ __global__ void device_multi_bit_programmable_bootstrap_keybundle(
   uint32_t input_idx = blockIdx.x / lwe_chunk_size;
 
   if (lwe_iteration < (lwe_dimension / grouping_factor)) {
-    //
-    Torus *accumulator = (Torus *)selected_memory;
 
     const Torus *block_lwe_array_in =
         &lwe_array_in[lwe_input_indexes[input_idx] * (lwe_dimension + 1)];
@@ -81,57 +86,52 @@ __global__ void device_multi_bit_programmable_bootstrap_keybundle(
     const Torus *bsk_slice = get_multi_bit_ith_lwe_gth_group_kth_block(
         bootstrapping_key, 0, rev_lwe_iteration, glwe_id, level_id,
         grouping_factor, 2 * polynomial_size, glwe_dimension, level_count);
-    const Torus *bsk_poly = bsk_slice + poly_id * params::degree;
+    const Torus *bsk_poly_ini = bsk_slice + poly_id * params::degree;
 
-    copy_polynomial<Torus, params::opt, params::degree / params::opt>(
-        bsk_poly, accumulator);
+    Torus reg_acc[params::opt];
 
-    // Accumulate the other terms
-    for (int g = 1; g < (1 << grouping_factor); g++) {
+    copy_polynomial_in_regs<Torus, params::opt, params::degree / params::opt>(
+        bsk_poly_ini, reg_acc);
 
-      const Torus *bsk_slice = get_multi_bit_ith_lwe_gth_group_kth_block(
-          bootstrapping_key, g, rev_lwe_iteration, glwe_id, level_id,
-          grouping_factor, 2 * polynomial_size, glwe_dimension, level_count);
-      const Torus *bsk_poly = bsk_slice + poly_id * params::degree;
+    int offset =
+        get_start_ith_ggsw_offset(polynomial_size, glwe_dimension, level_count);
 
-      // Calculates the monomial degree
+    // Precalculate the monomial degrees and store them in shared memory
+    uint32_t *monomial_degrees = (uint32_t *)selected_memory;
+    if (threadIdx.x < (1 << grouping_factor)) {
       const Torus *lwe_array_group =
          block_lwe_array_in + rev_lwe_iteration * grouping_factor;
-      uint32_t monomial_degree = calculates_monomial_degree<Torus, params>(
-          lwe_array_group, g, grouping_factor);
-
-      synchronize_threads_in_block();
-      // Multiply by the bsk element
-      polynomial_accumulate_monic_monomial_mul(
-          accumulator, bsk_poly, monomial_degree, threadIdx.x, params::degree,
-          params::opt, false);
+      monomial_degrees[threadIdx.x] = calculates_monomial_degree<Torus, params>(
+          lwe_array_group, threadIdx.x, grouping_factor);
     }
-    synchronize_threads_in_block();
 
-    // Move accumulator to local memory
-    double2 temp[params::opt / 2];
-    int tid = threadIdx.x;
-#pragma unroll
-    for (int i = 0; i < params::opt / 2; i++) {
-      temp[i].x = __ll2double_rn((int64_t)accumulator[tid]);
-      temp[i].y =
-          __ll2double_rn((int64_t)accumulator[tid + params::degree / 2]);
-      temp[i].x /= (double)std::numeric_limits<Torus>::max();
-      temp[i].y /= (double)std::numeric_limits<Torus>::max();
-      tid += params::degree / params::opt;
+    // Accumulate the other terms
+    for (int g = 1; g < (1 << grouping_factor); g++) {
+
+      uint32_t monomial_degree = monomial_degrees[g];
+
+      const Torus *bsk_poly = bsk_poly_ini + g * offset;
+      // Multiply by the bsk element
+      polynomial_product_accumulate_by_monomial_nosync<Torus, params>(
+          reg_acc, bsk_poly, monomial_degree);
     }
+    synchronize_threads_in_block(); // needed because we are going to reuse the
+                                    // shared memory for the fft
 
-    synchronize_threads_in_block();
     // Move from local memory back to shared memory but as complex
-    tid = threadIdx.x;
+    int tid = threadIdx.x;
     double2 *fft = (double2 *)selected_memory;
 #pragma unroll
     for (int i = 0; i < params::opt / 2; i++) {
-      fft[tid] = temp[i];
+      fft[tid] =
+          make_double2(__ll2double_rn((int64_t)reg_acc[i]) /
+                           (double)std::numeric_limits<Torus>::max(),
+                       __ll2double_rn((int64_t)reg_acc[i + params::opt / 2]) /
+                           (double)std::numeric_limits<Torus>::max());
       tid += params::degree / params::opt;
     }
-    synchronize_threads_in_block();
+
     NSMFFT_direct<HalfDegree<params>>(fft);
 
     // lwe iteration
diff --git a/backends/tfhe-cuda-backend/cuda/src/polynomial/functions.cuh b/backends/tfhe-cuda-backend/cuda/src/polynomial/functions.cuh
index f94c8e5848..64d328b1e3 100644
--- a/backends/tfhe-cuda-backend/cuda/src/polynomial/functions.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/polynomial/functions.cuh
@@ -31,6 +31,13 @@ __device__ void copy_polynomial(const T *__restrict__ source, T *dst) {
     tid = tid + block_size;
   }
 }
+template <typename T, int elems_per_thread, int block_size>
+__device__ void copy_polynomial_in_regs(const T *__restrict__ source, T *dst) {
+#pragma unroll
+  for (int i = 0; i < elems_per_thread; i++) {
+    dst[i] = source[threadIdx.x + i * block_size];
+  }
+}
 
 /*
  * Receives num_poly concatenated polynomials of type T. For each:
diff --git a/backends/tfhe-cuda-backend/cuda/src/polynomial/polynomial_math.cuh b/backends/tfhe-cuda-backend/cuda/src/polynomial/polynomial_math.cuh
index 37a1135346..c054926f70 100644
--- a/backends/tfhe-cuda-backend/cuda/src/polynomial/polynomial_math.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/polynomial/polynomial_math.cuh
@@ -83,4 +83,29 @@ __device__ void polynomial_accumulate_monic_monomial_mul(
   }
 }
 
+template <typename T, class params>
+__device__ void polynomial_product_accumulate_by_monomial_nosync(
+    T *result, const T *__restrict__ poly, uint32_t monomial_degree) {
+  // monomial_degree \in [0, 2 * params::degree)
+  int full_cycles_count = monomial_degree / params::degree;
+  int remainder_degrees = monomial_degree % params::degree;
+
+// Every thread has a fixed position to track instead of "chasing" the
+// position
+#pragma unroll
+  for (int i = 0; i < params::opt; i++) {
+    int pos =
+        (threadIdx.x + i * (params::degree / params::opt) - monomial_degree) &
+        (params::degree - 1);
+
+    T element = poly[pos];
+    T x = SEL(element, -element, full_cycles_count % 2);
+    x = SEL(-x, x,
+            threadIdx.x + i * (params::degree / params::opt) >=
+                remainder_degrees);
+
+    result[i] += x;
+  }
+}
+
 #endif // CNCRT_POLYNOMIAL_MATH_H
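For reference, polynomial_product_accumulate_by_monomial_nosync relies only on the negacyclic identity X^N = -1 in Z[X]/(X^N + 1): multiplying a bootstrapping-key polynomial by the monic monomial X^d with d in [0, 2N) just rotates indices and flips signs, so every output coefficient reads exactly one input coefficient. That is why each thread can hold its params::opt coefficients in registers (reg_acc) and accumulate the remaining 2^grouping_factor - 1 terms with no block-wide barrier; the only synchronization left in this part of the kernel is the one guarding the reuse of shared memory for the FFT. A minimal standalone sketch of the same index/sign rule, written as a scalar C++ loop with illustrative names (accumulate_monomial_product is not a function from the repository):

#include <cstdint>
#include <cstdio>
#include <vector>

// acc[j] += (X^degree * poly)[j] over Z[X]/(X^N + 1), with degree in [0, 2 * N).
void accumulate_monomial_product(std::vector<int64_t> &acc,
                                 const std::vector<int64_t> &poly,
                                 uint32_t degree) {
  const size_t N = poly.size(); // assumed to be a power of two
  const uint32_t full_cycles = degree / N;
  const uint32_t remainder = degree % N;
  for (size_t j = 0; j < N; j++) {
    // Source coefficient that lands at position j after multiplying by X^degree.
    const size_t pos = (j + 2 * N - degree) & (N - 1);
    int64_t x = poly[pos];
    if (full_cycles % 2) // crossed X^N == -1 an odd number of times already
      x = -x;
    if (j < remainder)   // this coefficient wrapped around X^N once more
      x = -x;
    acc[j] += x;
  }
}

int main() {
  // (1 + 2X) * X^3 in Z[X]/(X^4 + 1) equals X^3 + 2X^4 = -2 + X^3.
  std::vector<int64_t> acc(4, 0), poly = {1, 2, 0, 0};
  accumulate_monomial_product(acc, poly, 3);
  for (int64_t c : acc)
    std::printf("%lld ", (long long)c); // prints: -2 0 0 1
  std::printf("\n");
  return 0;
}

The GPU helper distributes the j loop across threads as j = threadIdx.x + i * (params::degree / params::opt), which is exactly the fixed per-thread position mentioned in the comment inside the new device function.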