Skip to content

Commit

Permalink
chore(gpu): abort when trying to launch 32 bit multi-bit PBS
Browse files Browse the repository at this point in the history
  • Loading branch information
agnesLeroy committed Feb 5, 2024
1 parent 97feefe commit a50e90b
Show file tree
Hide file tree
Showing 8 changed files with 233 additions and 209 deletions.
31 changes: 6 additions & 25 deletions backends/tfhe-cuda-backend/cuda/include/integer.h
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

#include "bootstrap.h"
#include "bootstrap_multibit.h"
#include "pbs/bootstrap.cuh"
#include <cassert>
#include <cmath>
#include <functional>
Expand Down Expand Up @@ -297,31 +298,11 @@ template <typename Torus> struct int_radix_lut {
(params.glwe_dimension + 1) * params.polynomial_size * sizeof(Torus);

///////////////
// PBS
if (params.pbs_type == MULTI_BIT) {
// Only 64 bits is supported
static_assert(
sizeof(Torus) == 8,
"Error (GPU multi bit PBS): only 64 bits Torus is supported");
scratch_cuda_multi_bit_pbs_64(
stream, &pbs_buffer, params.small_lwe_dimension,
params.glwe_dimension, params.polynomial_size, params.pbs_level,
params.grouping_factor, num_radix_blocks,
cuda_get_max_shared_memory(stream->gpu_index), allocate_gpu_memory);
} else {
// Classic
// We only use low latency for classic mode
if (sizeof(Torus) == sizeof(uint32_t))
scratch_cuda_bootstrap_low_latency_32(
stream, &pbs_buffer, params.glwe_dimension, params.polynomial_size,
params.pbs_level, num_radix_blocks,
cuda_get_max_shared_memory(stream->gpu_index), allocate_gpu_memory);
else
scratch_cuda_bootstrap_low_latency_64(
stream, &pbs_buffer, params.glwe_dimension, params.polynomial_size,
params.pbs_level, num_radix_blocks,
cuda_get_max_shared_memory(stream->gpu_index), allocate_gpu_memory);
}
execute_scratch_pbs<Torus>(
stream, &pbs_buffer, params.glwe_dimension, params.small_lwe_dimension,
params.polynomial_size, params.pbs_level, params.grouping_factor,
num_radix_blocks, cuda_get_max_shared_memory(stream->gpu_index),
params.pbs_type, allocate_gpu_memory);

if (allocate_gpu_memory) {
// Allocate LUT
Expand Down
114 changes: 12 additions & 102 deletions backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
Original file line number Diff line number Diff line change
@@ -1,88 +1,17 @@
#ifndef CUDA_INTEGER_CUH
#define CUDA_INTEGER_CUH

#include "bootstrap.h"
#include "crypto/keyswitch.cuh"
#include "device.h"
#include "integer.h"
#include "integer/scalar_addition.cuh"
#include "linear_algebra.h"
#include "linearalgebra/addition.cuh"
#include "pbs/bootstrap_low_latency.cuh"
#include "pbs/bootstrap_multibit.cuh"
#include "polynomial/functions.cuh"
#include "utils/kernel_dimensions.cuh"
#include <functional>

template <typename Torus>
void execute_pbs(cuda_stream_t *stream, Torus *lwe_array_out,
Torus *lwe_output_indexes, Torus *lut_vector,
Torus *lut_vector_indexes, Torus *lwe_array_in,
Torus *lwe_input_indexes, void *bootstrapping_key,
int8_t *pbs_buffer, uint32_t glwe_dimension,
uint32_t lwe_dimension, uint32_t polynomial_size,
uint32_t base_log, uint32_t level_count,
uint32_t grouping_factor, uint32_t input_lwe_ciphertext_count,
uint32_t num_luts, uint32_t lwe_idx,
uint32_t max_shared_memory, PBS_TYPE pbs_type) {
if (sizeof(Torus) == sizeof(uint32_t)) {
// 32 bits
switch (pbs_type) {
case MULTI_BIT:
printf("multibit\n");
printf("Error: 32-bit multibit PBS is not supported.\n");
break;
case LOW_LAT:
cuda_bootstrap_low_latency_lwe_ciphertext_vector_32(
stream, lwe_array_out, lwe_output_indexes, lut_vector,
lut_vector_indexes, lwe_array_in, lwe_input_indexes,
bootstrapping_key, pbs_buffer, lwe_dimension, glwe_dimension,
polynomial_size, base_log, level_count, input_lwe_ciphertext_count,
num_luts, lwe_idx, max_shared_memory);
break;
case AMORTIZED:
cuda_bootstrap_amortized_lwe_ciphertext_vector_32(
stream, lwe_array_out, lwe_output_indexes, lut_vector,
lut_vector_indexes, lwe_array_in, lwe_input_indexes,
bootstrapping_key, pbs_buffer, lwe_dimension, glwe_dimension,
polynomial_size, base_log, level_count, input_lwe_ciphertext_count,
num_luts, lwe_idx, max_shared_memory);
break;
default:
break;
}
} else {
// 64 bits
switch (pbs_type) {
case MULTI_BIT:
cuda_multi_bit_pbs_lwe_ciphertext_vector_64(
stream, lwe_array_out, lwe_output_indexes, lut_vector,
lut_vector_indexes, lwe_array_in, lwe_input_indexes,
bootstrapping_key, pbs_buffer, lwe_dimension, glwe_dimension,
polynomial_size, grouping_factor, base_log, level_count,
input_lwe_ciphertext_count, num_luts, lwe_idx, max_shared_memory);
break;
case LOW_LAT:
cuda_bootstrap_low_latency_lwe_ciphertext_vector_64(
stream, lwe_array_out, lwe_output_indexes, lut_vector,
lut_vector_indexes, lwe_array_in, lwe_input_indexes,
bootstrapping_key, pbs_buffer, lwe_dimension, glwe_dimension,
polynomial_size, base_log, level_count, input_lwe_ciphertext_count,
num_luts, lwe_idx, max_shared_memory);
break;
case AMORTIZED:
cuda_bootstrap_amortized_lwe_ciphertext_vector_64(
stream, lwe_array_out, lwe_output_indexes, lut_vector,
lut_vector_indexes, lwe_array_in, lwe_input_indexes,
bootstrapping_key, pbs_buffer, lwe_dimension, glwe_dimension,
polynomial_size, base_log, level_count, input_lwe_ciphertext_count,
num_luts, lwe_idx, max_shared_memory);
break;
default:
break;
}
}
}

// function rotates right radix ciphertext with specific value
// grid is one dimensional
// blockIdx.x represents x_th block of radix ciphertext
Expand Down Expand Up @@ -187,12 +116,12 @@ __host__ void integer_radix_apply_univariate_lookup_table_kb(
lut->lwe_indexes, ksk, big_lwe_dimension, small_lwe_dimension,
ks_base_log, ks_level, num_radix_blocks);

execute_pbs(stream, lwe_array_out, lut->lwe_indexes, lut->lut,
lut->lut_indexes, lut->tmp_lwe_after_ks, lut->lwe_indexes, bsk,
lut->pbs_buffer, glwe_dimension, small_lwe_dimension,
polynomial_size, pbs_base_log, pbs_level, grouping_factor,
num_radix_blocks, 1, 0,
cuda_get_max_shared_memory(stream->gpu_index), pbs_type);
execute_pbs<Torus>(stream, lwe_array_out, lut->lwe_indexes, lut->lut,
lut->lut_indexes, lut->tmp_lwe_after_ks, lut->lwe_indexes,
bsk, lut->pbs_buffer, glwe_dimension, small_lwe_dimension,
polynomial_size, pbs_base_log, pbs_level, grouping_factor,
num_radix_blocks, 1, 0,
cuda_get_max_shared_memory(stream->gpu_index), pbs_type);
}

template <typename Torus>
Expand Down Expand Up @@ -471,31 +400,12 @@ void scratch_cuda_full_propagation(
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
bool allocate_gpu_memory) {

// PBS
int8_t *pbs_buffer;
if (pbs_type == MULTI_BIT) {
uint32_t lwe_chunk_size = get_average_lwe_chunk_size(
lwe_dimension, pbs_level, glwe_dimension, num_radix_blocks);
// Only 64 bits is supported
scratch_cuda_multi_bit_pbs_64(stream, &pbs_buffer, lwe_dimension,
glwe_dimension, polynomial_size, pbs_level,
grouping_factor, num_radix_blocks,
cuda_get_max_shared_memory(stream->gpu_index),
allocate_gpu_memory, lwe_chunk_size);
} else {
// Classic
// We only use low latency for classic mode
if (sizeof(Torus) == sizeof(uint32_t))
scratch_cuda_bootstrap_low_latency_32(
stream, &pbs_buffer, glwe_dimension, polynomial_size, pbs_level,
num_radix_blocks, cuda_get_max_shared_memory(stream->gpu_index),
allocate_gpu_memory);
else
scratch_cuda_bootstrap_low_latency_64(
stream, &pbs_buffer, glwe_dimension, polynomial_size, pbs_level,
num_radix_blocks, cuda_get_max_shared_memory(stream->gpu_index),
allocate_gpu_memory);
}
execute_scratch_pbs<Torus>(stream, &pbs_buffer, glwe_dimension, lwe_dimension,
polynomial_size, pbs_level, grouping_factor,
num_radix_blocks,
cuda_get_max_shared_memory(stream->gpu_index),
pbs_type, allocate_gpu_memory);

// LUT
Torus *lut_buffer;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,15 +7,11 @@
#endif

#include "bootstrap.h"
#include "bootstrap_multibit.h"
#include "crypto/keyswitch.cuh"
#include "device.h"
#include "integer.h"
#include "integer/integer.cuh"
#include "linear_algebra.h"
#include "pbs/bootstrap_amortized.cuh"
#include "pbs/bootstrap_low_latency.cuh"
#include "pbs/bootstrap_multibit.cuh"
#include "utils/helper.cuh"
#include "utils/kernel_dimensions.cuh"
#include <fstream>
Expand Down
2 changes: 1 addition & 1 deletion backends/tfhe-cuda-backend/cuda/src/pbs/bootstrap.cu
Original file line number Diff line number Diff line change
@@ -1 +1 @@
#include "bootstrapping_key.cuh"
#include "bootstrap.cuh"
136 changes: 136 additions & 0 deletions backends/tfhe-cuda-backend/cuda/src/pbs/bootstrap.cuh
Original file line number Diff line number Diff line change
@@ -0,0 +1,136 @@
#include "../../include/bootstrap.h"
#include "../../include/device.h"
#include "../include/device.h"
#include "bootstrap_low_latency.cuh"
#include "bootstrap_multibit.cuh"

template <typename Torus>
void execute_pbs(cuda_stream_t *stream, Torus *lwe_array_out,
Torus *lwe_output_indexes, Torus *lut_vector,
Torus *lut_vector_indexes, Torus *lwe_array_in,
Torus *lwe_input_indexes, void *bootstrapping_key,
int8_t *pbs_buffer, uint32_t glwe_dimension,
uint32_t lwe_dimension, uint32_t polynomial_size,
uint32_t base_log, uint32_t level_count,
uint32_t grouping_factor, uint32_t input_lwe_ciphertext_count,
uint32_t num_luts, uint32_t lwe_idx,
uint32_t max_shared_memory, PBS_TYPE pbs_type) {
switch (sizeof(Torus)) {
case sizeof(uint32_t):
// 32 bits
switch (pbs_type) {
case MULTI_BIT:
PANIC("Error: 32-bit multibit PBS is not supported.\n");
case LOW_LAT:
cuda_bootstrap_low_latency_lwe_ciphertext_vector_32(
stream, lwe_array_out, lwe_output_indexes, lut_vector,
lut_vector_indexes, lwe_array_in, lwe_input_indexes,
bootstrapping_key, pbs_buffer, lwe_dimension, glwe_dimension,
polynomial_size, base_log, level_count, input_lwe_ciphertext_count,
num_luts, lwe_idx, max_shared_memory);
break;
case AMORTIZED:
cuda_bootstrap_amortized_lwe_ciphertext_vector_32(
stream, lwe_array_out, lwe_output_indexes, lut_vector,
lut_vector_indexes, lwe_array_in, lwe_input_indexes,
bootstrapping_key, pbs_buffer, lwe_dimension, glwe_dimension,
polynomial_size, base_log, level_count, input_lwe_ciphertext_count,
num_luts, lwe_idx, max_shared_memory);
break;
default:
break;
}
break;
case sizeof(uint64_t):
// 64 bits
switch (pbs_type) {
case MULTI_BIT:
cuda_multi_bit_pbs_lwe_ciphertext_vector_64(
stream, lwe_array_out, lwe_output_indexes, lut_vector,
lut_vector_indexes, lwe_array_in, lwe_input_indexes,
bootstrapping_key, pbs_buffer, lwe_dimension, glwe_dimension,
polynomial_size, grouping_factor, base_log, level_count,
input_lwe_ciphertext_count, num_luts, lwe_idx, max_shared_memory);
break;
case LOW_LAT:
cuda_bootstrap_low_latency_lwe_ciphertext_vector_64(
stream, lwe_array_out, lwe_output_indexes, lut_vector,
lut_vector_indexes, lwe_array_in, lwe_input_indexes,
bootstrapping_key, pbs_buffer, lwe_dimension, glwe_dimension,
polynomial_size, base_log, level_count, input_lwe_ciphertext_count,
num_luts, lwe_idx, max_shared_memory);
break;
case AMORTIZED:
cuda_bootstrap_amortized_lwe_ciphertext_vector_64(
stream, lwe_array_out, lwe_output_indexes, lut_vector,
lut_vector_indexes, lwe_array_in, lwe_input_indexes,
bootstrapping_key, pbs_buffer, lwe_dimension, glwe_dimension,
polynomial_size, base_log, level_count, input_lwe_ciphertext_count,
num_luts, lwe_idx, max_shared_memory);
break;
default:
PANIC("Error: unsupported cuda PBS type.");
}
break;
default:
PANIC("Cuda error: unsupported modulus size: only 32 and 64 bit integer "
"moduli are supported.");
}
}

template <typename Torus>
void execute_scratch_pbs(cuda_stream_t *stream, int8_t **pbs_buffer,
uint32_t glwe_dimension, uint32_t lwe_dimension,
uint32_t polynomial_size, uint32_t level_count,
uint32_t grouping_factor,
uint32_t input_lwe_ciphertext_count,
uint32_t max_shared_memory, PBS_TYPE pbs_type,
bool allocate_gpu_memory) {
switch (sizeof(Torus)) {
case sizeof(uint32_t):
// 32 bits
switch (pbs_type) {
case MULTI_BIT:
PANIC("Error: 32-bit multibit PBS is not supported.\n");
case LOW_LAT:
scratch_cuda_bootstrap_low_latency_32(
stream, pbs_buffer, glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
break;
case AMORTIZED:
scratch_cuda_bootstrap_amortized_32(
stream, pbs_buffer, glwe_dimension, polynomial_size,
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
break;
default:
PANIC("Error: unsupported cuda PBS type.");
}
break;
case sizeof(uint64_t):
// 64 bits
switch (pbs_type) {
case MULTI_BIT:
scratch_cuda_multi_bit_pbs_64(
stream, pbs_buffer, lwe_dimension, glwe_dimension, polynomial_size,
level_count, grouping_factor, input_lwe_ciphertext_count,
max_shared_memory, allocate_gpu_memory);
break;
case LOW_LAT:
scratch_cuda_bootstrap_low_latency_64(
stream, pbs_buffer, glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
break;
case AMORTIZED:
scratch_cuda_bootstrap_amortized_64(
stream, pbs_buffer, glwe_dimension, polynomial_size,
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
break;
default:
PANIC("Error: unsupported cuda PBS type.");
}
break;
default:
PANIC("Cuda error: unsupported modulus size: only 32 and 64 bit integer "
"moduli are supported.");
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@

#include "bootstrap.h"
#include "bootstrap_multibit.cuh"
#include "bootstrap_multibit.h"
#include "cooperative_groups.h"
#include "crypto/gadget.cuh"
#include "crypto/ggsw.cuh"
Expand Down
Loading

0 comments on commit a50e90b

Please sign in to comment.