Skip to content

Commit

Permalink
feat(gpu): implement signed scalar ge, gt, le, lt, max, and min
Browse files Browse the repository at this point in the history
  • Loading branch information
agnesLeroy authored and pdroalves committed Apr 8, 2024
1 parent b6868e0 commit 3972e62
Show file tree
Hide file tree
Showing 18 changed files with 1,229 additions and 249 deletions.
60 changes: 41 additions & 19 deletions backends/tfhe-cuda-backend/cuda/include/integer.h
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ enum COMPARISON_TYPE {
MAX = 6,
MIN = 7,
};
enum IS_RELATIONSHIP { IS_INFERIOR = 0, IS_EQUAL = 1, IS_SUPERIOR = 2 };
enum CMP_ORDERING { IS_INFERIOR = 0, IS_EQUAL = 1, IS_SUPERIOR = 2 };

extern "C" {
void scratch_cuda_full_propagation_64(
Expand Down Expand Up @@ -1846,6 +1846,8 @@ template <typename Torus> struct int_tree_sign_reduction_buffer {
bool allocate_gpu_memory) {
this->params = params;

Torus big_size = (params.big_lwe_dimension + 1) * sizeof(Torus);

block_selector_f = [](Torus msb, Torus lsb) -> Torus {
if (msb == IS_EQUAL) // EQUAL
return lsb;
Expand All @@ -1854,13 +1856,8 @@ template <typename Torus> struct int_tree_sign_reduction_buffer {
};

if (allocate_gpu_memory) {
tmp_x = (Torus *)cuda_malloc_async((params.big_lwe_dimension + 1) *
num_radix_blocks * sizeof(Torus),
stream);
tmp_y = (Torus *)cuda_malloc_async((params.big_lwe_dimension + 1) *
num_radix_blocks * sizeof(Torus),
stream);

tmp_x = (Torus *)cuda_malloc_async(big_size * num_radix_blocks, stream);
tmp_y = (Torus *)cuda_malloc_async(big_size * num_radix_blocks, stream);
// LUTs
tree_inner_leaf_lut = new int_radix_lut<Torus>(
stream, params, 1, num_radix_blocks, allocate_gpu_memory);
Expand Down Expand Up @@ -1901,6 +1898,10 @@ template <typename Torus> struct int_comparison_diff_buffer {

int_tree_sign_reduction_buffer<Torus> *tree_buffer;

Torus *tmp_signs_a;
Torus *tmp_signs_b;
int_radix_lut<Torus> *reduce_signs_lut;

int_comparison_diff_buffer(cuda_stream_t *stream, COMPARISON_TYPE op,
int_radix_params params, uint32_t num_radix_blocks,
bool allocate_gpu_memory) {
Expand All @@ -1922,7 +1923,6 @@ template <typename Torus> struct int_comparison_diff_buffer {
return 42;
}
};

if (allocate_gpu_memory) {

Torus big_size = (params.big_lwe_dimension + 1) * sizeof(Torus);
Expand All @@ -1935,15 +1935,26 @@ template <typename Torus> struct int_comparison_diff_buffer {

tree_buffer = new int_tree_sign_reduction_buffer<Torus>(
stream, operator_f, params, num_radix_blocks, allocate_gpu_memory);
tmp_signs_a =
(Torus *)cuda_malloc_async(big_size * num_radix_blocks, stream);
tmp_signs_b =
(Torus *)cuda_malloc_async(big_size * num_radix_blocks, stream);
// LUTs
reduce_signs_lut = new int_radix_lut<Torus>(
stream, params, 1, num_radix_blocks, allocate_gpu_memory);
}
}

void release(cuda_stream_t *stream) {
tree_buffer->release(stream);
delete tree_buffer;
reduce_signs_lut->release(stream);
delete reduce_signs_lut;

cuda_drop_async(tmp_packed_left, stream);
cuda_drop_async(tmp_packed_right, stream);
cuda_drop_async(tmp_signs_a, stream);
cuda_drop_async(tmp_signs_b, stream);
}
};

Expand All @@ -1963,6 +1974,7 @@ template <typename Torus> struct int_comparison_buffer {

Torus *tmp_block_comparisons;
Torus *tmp_lwe_array_out;
Torus *tmp_trivial_sign_block;

// Scalar EQ / NE
Torus *tmp_packed_input;
Expand All @@ -1975,6 +1987,7 @@ template <typename Torus> struct int_comparison_buffer {
bool is_signed;

// Used for scalar comparisons
int_radix_lut<Torus> *signed_msb_lut;
cuda_stream_t *lsb_stream;
cuda_stream_t *msb_stream;

Expand All @@ -1987,22 +2000,22 @@ template <typename Torus> struct int_comparison_buffer {

identity_lut_f = [](Torus x) -> Torus { return x; };

auto big_lwe_size = params.big_lwe_dimension + 1;

if (allocate_gpu_memory) {
lsb_stream = cuda_create_stream(stream->gpu_index);
msb_stream = cuda_create_stream(stream->gpu_index);

// +1 to have space for signed comparison
tmp_lwe_array_out = (Torus *)cuda_malloc_async(
(params.big_lwe_dimension + 1) * num_radix_blocks * sizeof(Torus),
stream);
big_lwe_size * (num_radix_blocks + 1) * sizeof(Torus), stream);

tmp_packed_input = (Torus *)cuda_malloc_async(
(params.big_lwe_dimension + 1) * 2 * num_radix_blocks * sizeof(Torus),
stream);
big_lwe_size * 2 * num_radix_blocks * sizeof(Torus), stream);

// Block comparisons
tmp_block_comparisons = (Torus *)cuda_malloc_async(
(params.big_lwe_dimension + 1) * num_radix_blocks * sizeof(Torus),
stream);
big_lwe_size * num_radix_blocks * sizeof(Torus), stream);

// Cleaning LUT
identity_lut = new int_radix_lut<Torus>(
Expand Down Expand Up @@ -2054,13 +2067,19 @@ template <typename Torus> struct int_comparison_buffer {
}

if (is_signed) {

tmp_trivial_sign_block =
(Torus *)cuda_malloc_async(big_lwe_size * sizeof(Torus), stream);

signed_lut =
new int_radix_lut<Torus>(stream, params, 1, 1, allocate_gpu_memory);
signed_msb_lut =
new int_radix_lut<Torus>(stream, params, 1, 1, allocate_gpu_memory);

auto message_modulus = (int)params.message_modulus;
uint32_t sign_bit_pos = log2(message_modulus) - 1;
std::function<Torus(Torus, Torus)> signed_lut_f;
signed_lut_f = [sign_bit_pos](Torus x, Torus y) -> Torus {
std::function<Torus(Torus, Torus)> signed_lut_f =
[sign_bit_pos](Torus x, Torus y) -> Torus {
auto x_sign_bit = x >> sign_bit_pos;
auto y_sign_bit = y >> sign_bit_pos;

Expand All @@ -2076,14 +2095,14 @@ template <typename Torus> struct int_comparison_buffer {
return (Torus)(IS_INFERIOR);
else if (x == y)
return (Torus)(IS_EQUAL);
else if (x > y)
else
return (Torus)(IS_SUPERIOR);
} else {
if (x < y)
return (Torus)(IS_SUPERIOR);
else if (x == y)
return (Torus)(IS_EQUAL);
else if (x > y)
else
return (Torus)(IS_INFERIOR);
}
PANIC("Cuda error: sign_lut creation failed due to wrong function.")
Expand Down Expand Up @@ -2126,8 +2145,11 @@ template <typename Torus> struct int_comparison_buffer {
cuda_drop_async(tmp_packed_input, stream);

if (is_signed) {
cuda_drop_async(tmp_trivial_sign_block, stream);
signed_lut->release(stream);
delete (signed_lut);
signed_msb_lut->release(stream);
delete (signed_msb_lut);
}
cuda_destroy_stream(lsb_stream);
cuda_destroy_stream(msb_stream);
Expand Down
2 changes: 1 addition & 1 deletion backends/tfhe-cuda-backend/cuda/src/integer/comparison.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -273,7 +273,7 @@ __host__ void host_compare_with_zero_equality(
remainder_blocks -= (chunk_size - 1);

// Update operands
chunk += chunk_size * big_lwe_size;
chunk += (chunk_size - 1) * big_lwe_size;
sum_i += big_lwe_size;
}
}
Expand Down
119 changes: 118 additions & 1 deletion backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -587,7 +587,7 @@ __global__ void device_pack_blocks(Torus *lwe_array_out, Torus *lwe_array_in,
packed_block[tid] = lsb_block[tid] + factor * msb_block[tid];
}

if (num_radix_blocks % 2 != 0) {
if (num_radix_blocks % 2 == 1) {
// We couldn't pack the last block, so we just copy it
Torus *lsb_block =
lwe_array_in + (num_radix_blocks - 1) * (lwe_dimension + 1);
Expand All @@ -599,6 +599,36 @@ __global__ void device_pack_blocks(Torus *lwe_array_out, Torus *lwe_array_in,
}
}

// template <typename Torus>
//__global__ void device_pack_blocks(Torus *lwe_array_out, Torus *lwe_array_in,
// uint32_t lwe_dimension,
// uint32_t num_radix_blocks, uint32_t
// factor) {
// int tid = threadIdx.x + blockIdx.x * blockDim.x;
// int bid = tid / (lwe_dimension + 1);
// int total_blocks = (num_radix_blocks / 2) + (num_radix_blocks % 2);
//
// if (tid < total_blocks * (lwe_dimension + 1)) {
//
// if (bid < num_radix_blocks / 2) {
// Torus *lsb_block = lwe_array_in + (2 * bid) * (lwe_dimension + 1);
// Torus *msb_block = lsb_block + (lwe_dimension + 1);
//
// Torus *packed_block = lwe_array_out + bid * (lwe_dimension + 1);
//
// packed_block[tid] = lsb_block[tid] + factor * msb_block[tid];
// }else if (bid == num_radix_blocks / 2) {
// // We can't pack the last block, so we just copy it
// Torus *lsb_block =
// lwe_array_in + (num_radix_blocks - 1) * (lwe_dimension + 1);
// Torus *last_block =
// lwe_array_out + (num_radix_blocks / 2) * (lwe_dimension + 1);
//
// last_block[tid] = lsb_block[tid];
// }
// }
// }

// Packs the low ciphertext in the message parts of the high ciphertext
// and moves the high ciphertext into the carry part.
//
Expand Down Expand Up @@ -684,4 +714,91 @@ __host__ void extract_n_bits(cuda_stream_t *stream, Torus *lwe_array_out,
num_radix_blocks * bits_per_block, bit_extract->lut);
}

// Reduces an array of `num_sign_blocks` encrypted sign blocks (each holding a
// CMP_ORDERING value: IS_INFERIOR / IS_EQUAL / IS_SUPERIOR, see integer.h)
// down to a single output block, then maps that final ordering through
// `sign_handler_f` (e.g. to produce the boolean result of ge/gt/le/lt).
//
// Strategy: repeatedly pack pairs of adjacent sign blocks into one block
// (msb*4 + lsb, via pack_blocks with factor 4) and collapse each packed pair
// with a LUT built from the tree buffer's block_selector_f, halving the block
// count per pass. An odd leftover block is carried over unchanged. The last
// 1 or 2 blocks are folded with a final LUT that also applies sign_handler_f.
//
// All device work is enqueued on `stream` (async, stream-ordered); the result
// is not valid on the host until the stream is synchronized by the caller.
//
// - signs_array_out: output radix block (1 block) receiving the final value
// - signs_array_in:  input array of `num_sign_blocks` sign blocks
// - mem_ptr:         comparison buffer providing scratch (tmp_signs_a/b) and
//                    the reduce_signs_lut; must have been allocated with at
//                    least `num_sign_blocks` blocks of scratch
// - sign_handler_f:  plaintext function applied to the final CMP_ORDERING
// - bsk / ksk:       bootstrapping and keyswitching keys for the PBS
// - num_sign_blocks: number of input sign blocks (assumed >= 1 — TODO confirm
//                    callers never pass 0; the code below would then apply the
//                    final LUT to uninitialized scratch)
template <typename Torus>
__host__ void reduce_signs(cuda_stream_t *stream, Torus *signs_array_out,
                           Torus *signs_array_in,
                           int_comparison_buffer<Torus> *mem_ptr,
                           std::function<Torus(Torus)> sign_handler_f,
                           void *bsk, Torus *ksk, uint32_t num_sign_blocks) {

  auto diff_buffer = mem_ptr->diff_buffer;

  auto params = mem_ptr->params;
  auto big_lwe_dimension = params.big_lwe_dimension;
  auto glwe_dimension = params.glwe_dimension;
  auto polynomial_size = params.polynomial_size;
  auto message_modulus = params.message_modulus;
  auto carry_modulus = params.carry_modulus;

  // Unpacks a packed pair of orderings (msb in bits 3..2, lsb in bits 1..0)
  // and selects one of them: msb wins unless it is IS_EQUAL, in which case
  // the lsb decides (see block_selector_f in int_tree_sign_reduction_buffer).
  // NOTE(review): `sign_handler_f` is captured here but never used by this
  // lambda — the capture looks removable; confirm before cleaning up.
  std::function<Torus(Torus)> reduce_two_orderings_function =
      [diff_buffer, sign_handler_f](Torus x) -> Torus {
    int msb = (x >> 2) & 3;
    int lsb = x & 3;

    return diff_buffer->tree_buffer->block_selector_f(msb, lsb);
  };

  // Scratch ping-pong buffers owned by the diff buffer; both hold up to
  // num_radix_blocks big-LWE ciphertexts.
  auto signs_a = diff_buffer->tmp_signs_a;
  auto signs_b = diff_buffer->tmp_signs_b;

  // Work on a scratch copy so signs_array_in is left untouched.
  cuda_memcpy_async_gpu_to_gpu(
      signs_a, signs_array_in,
      (big_lwe_dimension + 1) * num_sign_blocks * sizeof(Torus), stream);
  if (num_sign_blocks > 2) {
    // Build the pairwise-reduction LUT once; it is reused on every pass of
    // the halving loop below.
    auto lut = diff_buffer->reduce_signs_lut;
    generate_device_accumulator<Torus>(
        stream, lut->lut, glwe_dimension, polynomial_size, message_modulus,
        carry_modulus, reduce_two_orderings_function);

    // Each pass: pack adjacent blocks in pairs (signs_a -> signs_b), then
    // apply the LUT to collapse each packed pair back into signs_a,
    // halving num_sign_blocks (rounding up for an odd leftover).
    while (num_sign_blocks > 2) {
      pack_blocks(stream, signs_b, signs_a, big_lwe_dimension, num_sign_blocks,
                  4);
      integer_radix_apply_univariate_lookup_table_kb(
          stream, signs_a, signs_b, bsk, ksk, num_sign_blocks / 2, lut);

      // With an odd count, pack_blocks copies the unpaired last block
      // verbatim into signs_b; carry it into signs_a after the reduced pairs.
      auto last_block_signs_b =
          signs_b + (num_sign_blocks / 2) * (big_lwe_dimension + 1);
      auto last_block_signs_a =
          signs_a + (num_sign_blocks / 2) * (big_lwe_dimension + 1);
      if (num_sign_blocks % 2 == 1)
        cuda_memcpy_async_gpu_to_gpu(last_block_signs_a, last_block_signs_b,
                                     (big_lwe_dimension + 1) * sizeof(Torus),
                                     stream);

      num_sign_blocks = (num_sign_blocks / 2) + (num_sign_blocks % 2);
    }
  }

  if (num_sign_blocks == 2) {
    // Final step for two remaining blocks: one LUT that both collapses the
    // packed pair and applies the caller's sign handler.
    std::function<Torus(Torus)> final_lut_f =
        [reduce_two_orderings_function, sign_handler_f](Torus x) -> Torus {
      Torus final_sign = reduce_two_orderings_function(x);
      return sign_handler_f(final_sign);
    };

    // NOTE(review): reduce_signs_lut is regenerated in-place here; safe only
    // because the previous loop's uses of it are already enqueued on the same
    // stream ahead of this — TODO confirm generate_device_accumulator is
    // stream-ordered with respect to the PBS launches.
    auto lut = diff_buffer->reduce_signs_lut;
    generate_device_accumulator<Torus>(stream, lut->lut, glwe_dimension,
                                       polynomial_size, message_modulus,
                                       carry_modulus, final_lut_f);

    pack_blocks(stream, signs_b, signs_a, big_lwe_dimension, 2, 4);
    integer_radix_apply_univariate_lookup_table_kb(stream, signs_array_out,
                                                   signs_b, bsk, ksk, 1, lut);

  } else {

    // Single remaining block: apply the sign handler directly to the ordering
    // stored in its low 2 bits.
    // NOTE(review): `mem_ptr` is captured but unused by this lambda.
    std::function<Torus(Torus)> final_lut_f =
        [mem_ptr, sign_handler_f](Torus x) -> Torus {
      return sign_handler_f(x & 3);
    };

    auto lut = mem_ptr->diff_buffer->reduce_signs_lut;
    generate_device_accumulator<Torus>(stream, lut->lut, glwe_dimension,
                                       polynomial_size, message_modulus,
                                       carry_modulus, final_lut_f);

    integer_radix_apply_univariate_lookup_table_kb(stream, signs_array_out,
                                                   signs_a, bsk, ksk, 1, lut);
  }
}
#endif // TFHE_RS_INTERNAL_INTEGER_CUH
Loading

0 comments on commit 3972e62

Please sign in to comment.