chore(gpu): abort when trying to launch 32 bit multi-bit PBS

zama-ai · Feb 5, 2024 · a50e90b · a50e90b
1 parent 97feefe
commit a50e90b
Show file tree

Hide file tree

Showing 8 changed files with 233 additions and 209 deletions.
diff --git a/backends/tfhe-cuda-backend/cuda/include/integer.h b/backends/tfhe-cuda-backend/cuda/include/integer.h
@@ -3,6 +3,7 @@
 
 #include "bootstrap.h"
 #include "bootstrap_multibit.h"
+#include "pbs/bootstrap.cuh"
 #include <cassert>
 #include <cmath>
 #include <functional>
@@ -297,31 +298,11 @@ template <typename Torus> struct int_radix_lut {
         (params.glwe_dimension + 1) * params.polynomial_size * sizeof(Torus);
 
     ///////////////
-    // PBS
-    if (params.pbs_type == MULTI_BIT) {
-      // Only 64 bits is supported
-      static_assert(
-          sizeof(Torus) == 8,
-          "Error (GPU multi bit PBS): only 64 bits Torus is supported");
-      scratch_cuda_multi_bit_pbs_64(
-          stream, &pbs_buffer, params.small_lwe_dimension,
-          params.glwe_dimension, params.polynomial_size, params.pbs_level,
-          params.grouping_factor, num_radix_blocks,
-          cuda_get_max_shared_memory(stream->gpu_index), allocate_gpu_memory);
-    } else {
-      // Classic
-      // We only use low latency for classic mode
-      if (sizeof(Torus) == sizeof(uint32_t))
-        scratch_cuda_bootstrap_low_latency_32(
-            stream, &pbs_buffer, params.glwe_dimension, params.polynomial_size,
-            params.pbs_level, num_radix_blocks,
-            cuda_get_max_shared_memory(stream->gpu_index), allocate_gpu_memory);
-      else
-        scratch_cuda_bootstrap_low_latency_64(
-            stream, &pbs_buffer, params.glwe_dimension, params.polynomial_size,
-            params.pbs_level, num_radix_blocks,
-            cuda_get_max_shared_memory(stream->gpu_index), allocate_gpu_memory);
-    }
+    execute_scratch_pbs<Torus>(
+        stream, &pbs_buffer, params.glwe_dimension, params.small_lwe_dimension,
+        params.polynomial_size, params.pbs_level, params.grouping_factor,
+        num_radix_blocks, cuda_get_max_shared_memory(stream->gpu_index),
+        params.pbs_type, allocate_gpu_memory);
 
     if (allocate_gpu_memory) {
       // Allocate LUT

diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
@@ -1,88 +1,17 @@
 #ifndef CUDA_INTEGER_CUH
 #define CUDA_INTEGER_CUH
 
+#include "bootstrap.h"
 #include "crypto/keyswitch.cuh"
 #include "device.h"
 #include "integer.h"
 #include "integer/scalar_addition.cuh"
 #include "linear_algebra.h"
 #include "linearalgebra/addition.cuh"
-#include "pbs/bootstrap_low_latency.cuh"
-#include "pbs/bootstrap_multibit.cuh"
 #include "polynomial/functions.cuh"
 #include "utils/kernel_dimensions.cuh"
 #include <functional>
 
-template <typename Torus>
-void execute_pbs(cuda_stream_t *stream, Torus *lwe_array_out,
-                 Torus *lwe_output_indexes, Torus *lut_vector,
-                 Torus *lut_vector_indexes, Torus *lwe_array_in,
-                 Torus *lwe_input_indexes, void *bootstrapping_key,
-                 int8_t *pbs_buffer, uint32_t glwe_dimension,
-                 uint32_t lwe_dimension, uint32_t polynomial_size,
-                 uint32_t base_log, uint32_t level_count,
-                 uint32_t grouping_factor, uint32_t input_lwe_ciphertext_count,
-                 uint32_t num_luts, uint32_t lwe_idx,
-                 uint32_t max_shared_memory, PBS_TYPE pbs_type) {
-  if (sizeof(Torus) == sizeof(uint32_t)) {
-    // 32 bits
-    switch (pbs_type) {
-    case MULTI_BIT:
-      printf("multibit\n");
-      printf("Error: 32-bit multibit PBS is not supported.\n");
-      break;
-    case LOW_LAT:
-      cuda_bootstrap_low_latency_lwe_ciphertext_vector_32(
-          stream, lwe_array_out, lwe_output_indexes, lut_vector,
-          lut_vector_indexes, lwe_array_in, lwe_input_indexes,
-          bootstrapping_key, pbs_buffer, lwe_dimension, glwe_dimension,
-          polynomial_size, base_log, level_count, input_lwe_ciphertext_count,
-          num_luts, lwe_idx, max_shared_memory);
-      break;
-    case AMORTIZED:
-      cuda_bootstrap_amortized_lwe_ciphertext_vector_32(
-          stream, lwe_array_out, lwe_output_indexes, lut_vector,
-          lut_vector_indexes, lwe_array_in, lwe_input_indexes,
-          bootstrapping_key, pbs_buffer, lwe_dimension, glwe_dimension,
-          polynomial_size, base_log, level_count, input_lwe_ciphertext_count,
-          num_luts, lwe_idx, max_shared_memory);
-      break;
-    default:
-      break;
-    }
-  } else {
-    // 64 bits
-    switch (pbs_type) {
-    case MULTI_BIT:
-      cuda_multi_bit_pbs_lwe_ciphertext_vector_64(
-          stream, lwe_array_out, lwe_output_indexes, lut_vector,
-          lut_vector_indexes, lwe_array_in, lwe_input_indexes,
-          bootstrapping_key, pbs_buffer, lwe_dimension, glwe_dimension,
-          polynomial_size, grouping_factor, base_log, level_count,
-          input_lwe_ciphertext_count, num_luts, lwe_idx, max_shared_memory);
-      break;
-    case LOW_LAT:
-      cuda_bootstrap_low_latency_lwe_ciphertext_vector_64(
-          stream, lwe_array_out, lwe_output_indexes, lut_vector,
-          lut_vector_indexes, lwe_array_in, lwe_input_indexes,
-          bootstrapping_key, pbs_buffer, lwe_dimension, glwe_dimension,
-          polynomial_size, base_log, level_count, input_lwe_ciphertext_count,
-          num_luts, lwe_idx, max_shared_memory);
-      break;
-    case AMORTIZED:
-      cuda_bootstrap_amortized_lwe_ciphertext_vector_64(
-          stream, lwe_array_out, lwe_output_indexes, lut_vector,
-          lut_vector_indexes, lwe_array_in, lwe_input_indexes,
-          bootstrapping_key, pbs_buffer, lwe_dimension, glwe_dimension,
-          polynomial_size, base_log, level_count, input_lwe_ciphertext_count,
-          num_luts, lwe_idx, max_shared_memory);
-      break;
-    default:
-      break;
-    }
-  }
-}
-
 // function rotates right  radix ciphertext with specific value
 // grid is one dimensional
 // blockIdx.x represents x_th block of radix ciphertext
@@ -187,12 +116,12 @@ __host__ void integer_radix_apply_univariate_lookup_table_kb(
       lut->lwe_indexes, ksk, big_lwe_dimension, small_lwe_dimension,
       ks_base_log, ks_level, num_radix_blocks);
 
-  execute_pbs(stream, lwe_array_out, lut->lwe_indexes, lut->lut,
-              lut->lut_indexes, lut->tmp_lwe_after_ks, lut->lwe_indexes, bsk,
-              lut->pbs_buffer, glwe_dimension, small_lwe_dimension,
-              polynomial_size, pbs_base_log, pbs_level, grouping_factor,
-              num_radix_blocks, 1, 0,
-              cuda_get_max_shared_memory(stream->gpu_index), pbs_type);
+  execute_pbs<Torus>(stream, lwe_array_out, lut->lwe_indexes, lut->lut,
+                     lut->lut_indexes, lut->tmp_lwe_after_ks, lut->lwe_indexes,
+                     bsk, lut->pbs_buffer, glwe_dimension, small_lwe_dimension,
+                     polynomial_size, pbs_base_log, pbs_level, grouping_factor,
+                     num_radix_blocks, 1, 0,
+                     cuda_get_max_shared_memory(stream->gpu_index), pbs_type);
 }
 
 template <typename Torus>
@@ -471,31 +400,12 @@ void scratch_cuda_full_propagation(
     uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
     bool allocate_gpu_memory) {
 
-  // PBS
   int8_t *pbs_buffer;
-  if (pbs_type == MULTI_BIT) {
-    uint32_t lwe_chunk_size = get_average_lwe_chunk_size(
-        lwe_dimension, pbs_level, glwe_dimension, num_radix_blocks);
-    // Only 64 bits is supported
-    scratch_cuda_multi_bit_pbs_64(stream, &pbs_buffer, lwe_dimension,
-                                  glwe_dimension, polynomial_size, pbs_level,
-                                  grouping_factor, num_radix_blocks,
-                                  cuda_get_max_shared_memory(stream->gpu_index),
-                                  allocate_gpu_memory, lwe_chunk_size);
-  } else {
-    // Classic
-    // We only use low latency for classic mode
-    if (sizeof(Torus) == sizeof(uint32_t))
-      scratch_cuda_bootstrap_low_latency_32(
-          stream, &pbs_buffer, glwe_dimension, polynomial_size, pbs_level,
-          num_radix_blocks, cuda_get_max_shared_memory(stream->gpu_index),
-          allocate_gpu_memory);
-    else
-      scratch_cuda_bootstrap_low_latency_64(
-          stream, &pbs_buffer, glwe_dimension, polynomial_size, pbs_level,
-          num_radix_blocks, cuda_get_max_shared_memory(stream->gpu_index),
-          allocate_gpu_memory);
-  }
+  execute_scratch_pbs<Torus>(stream, &pbs_buffer, glwe_dimension, lwe_dimension,
+                             polynomial_size, pbs_level, grouping_factor,
+                             num_radix_blocks,
+                             cuda_get_max_shared_memory(stream->gpu_index),
+                             pbs_type, allocate_gpu_memory);
 
   // LUT
   Torus *lut_buffer;

diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh
@@ -7,15 +7,11 @@
 #endif
 
 #include "bootstrap.h"
-#include "bootstrap_multibit.h"
 #include "crypto/keyswitch.cuh"
 #include "device.h"
 #include "integer.h"
 #include "integer/integer.cuh"
 #include "linear_algebra.h"
-#include "pbs/bootstrap_amortized.cuh"
-#include "pbs/bootstrap_low_latency.cuh"
-#include "pbs/bootstrap_multibit.cuh"
 #include "utils/helper.cuh"
 #include "utils/kernel_dimensions.cuh"
 #include <fstream>

diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/bootstrap.cu b/backends/tfhe-cuda-backend/cuda/src/pbs/bootstrap.cu
@@ -1 +1 @@
-#include "bootstrapping_key.cuh"
+#include "bootstrap.cuh"
diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/bootstrap.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/bootstrap.cuh
@@ -0,0 +1,136 @@
+#include "../../include/bootstrap.h"
+#include "../../include/device.h"
+#include "../include/device.h"
+#include "bootstrap_low_latency.cuh"
+#include "bootstrap_multibit.cuh"
+
+template <typename Torus>
+void execute_pbs(cuda_stream_t *stream, Torus *lwe_array_out,
+                 Torus *lwe_output_indexes, Torus *lut_vector,
+                 Torus *lut_vector_indexes, Torus *lwe_array_in,
+                 Torus *lwe_input_indexes, void *bootstrapping_key,
+                 int8_t *pbs_buffer, uint32_t glwe_dimension,
+                 uint32_t lwe_dimension, uint32_t polynomial_size,
+                 uint32_t base_log, uint32_t level_count,
+                 uint32_t grouping_factor, uint32_t input_lwe_ciphertext_count,
+                 uint32_t num_luts, uint32_t lwe_idx,
+                 uint32_t max_shared_memory, PBS_TYPE pbs_type) {
+  switch (sizeof(Torus)) {
+  case sizeof(uint32_t):
+    // 32 bits
+    switch (pbs_type) {
+    case MULTI_BIT:
+      PANIC("Error: 32-bit multibit PBS is not supported.\n");
+    case LOW_LAT:
+      cuda_bootstrap_low_latency_lwe_ciphertext_vector_32(
+          stream, lwe_array_out, lwe_output_indexes, lut_vector,
+          lut_vector_indexes, lwe_array_in, lwe_input_indexes,
+          bootstrapping_key, pbs_buffer, lwe_dimension, glwe_dimension,
+          polynomial_size, base_log, level_count, input_lwe_ciphertext_count,
+          num_luts, lwe_idx, max_shared_memory);
+      break;
+    case AMORTIZED:
+      cuda_bootstrap_amortized_lwe_ciphertext_vector_32(
+          stream, lwe_array_out, lwe_output_indexes, lut_vector,
+          lut_vector_indexes, lwe_array_in, lwe_input_indexes,
+          bootstrapping_key, pbs_buffer, lwe_dimension, glwe_dimension,
+          polynomial_size, base_log, level_count, input_lwe_ciphertext_count,
+          num_luts, lwe_idx, max_shared_memory);
+      break;
+    default:
+      break;
+    }
+    break;
+  case sizeof(uint64_t):
+    // 64 bits
+    switch (pbs_type) {
+    case MULTI_BIT:
+      cuda_multi_bit_pbs_lwe_ciphertext_vector_64(
+          stream, lwe_array_out, lwe_output_indexes, lut_vector,
+          lut_vector_indexes, lwe_array_in, lwe_input_indexes,
+          bootstrapping_key, pbs_buffer, lwe_dimension, glwe_dimension,
+          polynomial_size, grouping_factor, base_log, level_count,
+          input_lwe_ciphertext_count, num_luts, lwe_idx, max_shared_memory);
+      break;
+    case LOW_LAT:
+      cuda_bootstrap_low_latency_lwe_ciphertext_vector_64(
+          stream, lwe_array_out, lwe_output_indexes, lut_vector,
+          lut_vector_indexes, lwe_array_in, lwe_input_indexes,
+          bootstrapping_key, pbs_buffer, lwe_dimension, glwe_dimension,
+          polynomial_size, base_log, level_count, input_lwe_ciphertext_count,
+          num_luts, lwe_idx, max_shared_memory);
+      break;
+    case AMORTIZED:
+      cuda_bootstrap_amortized_lwe_ciphertext_vector_64(
+          stream, lwe_array_out, lwe_output_indexes, lut_vector,
+          lut_vector_indexes, lwe_array_in, lwe_input_indexes,
+          bootstrapping_key, pbs_buffer, lwe_dimension, glwe_dimension,
+          polynomial_size, base_log, level_count, input_lwe_ciphertext_count,
+          num_luts, lwe_idx, max_shared_memory);
+      break;
+    default:
+      PANIC("Error: unsupported cuda PBS type.");
+    }
+    break;
+  default:
+    PANIC("Cuda error: unsupported modulus size: only 32 and 64 bit integer "
+          "moduli are supported.");
+  }
+}
+
+template <typename Torus>
+void execute_scratch_pbs(cuda_stream_t *stream, int8_t **pbs_buffer,
+                         uint32_t glwe_dimension, uint32_t lwe_dimension,
+                         uint32_t polynomial_size, uint32_t level_count,
+                         uint32_t grouping_factor,
+                         uint32_t input_lwe_ciphertext_count,
+                         uint32_t max_shared_memory, PBS_TYPE pbs_type,
+                         bool allocate_gpu_memory) {
+  switch (sizeof(Torus)) {
+  case sizeof(uint32_t):
+    // 32 bits
+    switch (pbs_type) {
+    case MULTI_BIT:
+      PANIC("Error: 32-bit multibit PBS is not supported.\n");
+    case LOW_LAT:
+      scratch_cuda_bootstrap_low_latency_32(
+          stream, pbs_buffer, glwe_dimension, polynomial_size, level_count,
+          input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
+      break;
+    case AMORTIZED:
+      scratch_cuda_bootstrap_amortized_32(
+          stream, pbs_buffer, glwe_dimension, polynomial_size,
+          input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
+      break;
+    default:
+      PANIC("Error: unsupported cuda PBS type.");
+    }
+    break;
+  case sizeof(uint64_t):
+    // 64 bits
+    switch (pbs_type) {
+    case MULTI_BIT:
+      scratch_cuda_multi_bit_pbs_64(
+          stream, pbs_buffer, lwe_dimension, glwe_dimension, polynomial_size,
+          level_count, grouping_factor, input_lwe_ciphertext_count,
+          max_shared_memory, allocate_gpu_memory);
+      break;
+    case LOW_LAT:
+      scratch_cuda_bootstrap_low_latency_64(
+          stream, pbs_buffer, glwe_dimension, polynomial_size, level_count,
+          input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
+      break;
+    case AMORTIZED:
+      scratch_cuda_bootstrap_amortized_64(
+          stream, pbs_buffer, glwe_dimension, polynomial_size,
+          input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
+      break;
+    default:
+      PANIC("Error: unsupported cuda PBS type.");
+    }
+    break;
+  default:
+    PANIC("Cuda error: unsupported modulus size: only 32 and 64 bit integer "
+          "moduli are supported.");
+  }
+}
diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/bootstrap_fast_multibit.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/bootstrap_fast_multibit.cuh
@@ -3,7 +3,6 @@
 
 #include "bootstrap.h"
 #include "bootstrap_multibit.cuh"
-#include "bootstrap_multibit.h"
 #include "cooperative_groups.h"
 #include "crypto/gadget.cuh"
 #include "crypto/ggsw.cuh"
Original file line number	Diff line number	Diff line change
		@@ -1 +1 @@
		#include "bootstrapping_key.cuh"
		#include "bootstrap.cuh"