Skip to content

Commit

Permalink
feat(gpu): implement CUDA-based Radix Integer compression and public …
Browse files Browse the repository at this point in the history
…functional packing keyswitch
  • Loading branch information
pdroalves committed Aug 6, 2024
1 parent a26e68c commit c2fc8f7
Show file tree
Hide file tree
Showing 41 changed files with 2,346 additions and 225 deletions.
117 changes: 115 additions & 2 deletions backends/tfhe-cuda-backend/cuda/include/integer.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@ enum SHIFT_OR_ROTATE_TYPE {
LEFT_ROTATE = 2,
RIGHT_ROTATE = 3
};
enum LUT_TYPE { OPERATOR = 0, MAXVALUE = 1, ISNONZERO = 2, BLOCKSLEN = 3 };
enum BITOP_TYPE {
BITAND = 0,
BITOR = 1,
Expand All @@ -36,6 +35,11 @@ enum COMPARISON_TYPE {
MIN = 7,
};

enum COMPRESSION_MODE {
COMPRESS = 0,
DECOMPRESS = 1,
};

enum CMP_ORDERING { IS_INFERIOR = 0, IS_EQUAL = 1, IS_SUPERIOR = 2 };

enum SIGNED_OPERATION { ADDITION = 1, SUBTRACTION = -1 };
Expand Down Expand Up @@ -202,6 +206,30 @@ void cuda_scalar_comparison_integer_radix_ciphertext_kb_64(
void cleanup_cuda_integer_comparison(void **streams, uint32_t *gpu_indexes,
uint32_t gpu_count, int8_t **mem_ptr_void);

void scratch_cuda_compression_integer_radix_ciphertext_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
uint32_t encryption_glwe_dimension, uint32_t encryption_polynomial_size,
uint32_t compression_glwe_dimension, uint32_t compression_polynomial_size,
uint32_t lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
uint32_t num_lwes, uint32_t message_modulus, uint32_t carry_modulus,
PBS_TYPE pbs_type, uint32_t lwe_per_glwe, uint32_t storage_log_modulus,
COMPRESSION_MODE mode, bool allocate_gpu_memory);

void cuda_compression_compress_integer_radix_ciphertext_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
void *glwe_array_out, void *lwe_array_in, void **fp_ksk, uint32_t num_nths,
int8_t *mem_ptr);

void cuda_compression_decompress_integer_radix_ciphertext_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
void *lwe_array_out, void *glwe_in, void *indexes_array,
uint32_t indexes_array_size, void **bsks, int8_t *mem_ptr);

void cleanup_cuda_compression_integer_radix_ciphertext_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr_void);

void scratch_cuda_integer_radix_bitop_kb_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
uint32_t glwe_dimension, uint32_t polynomial_size,
Expand Down Expand Up @@ -452,7 +480,8 @@ struct int_radix_params {
message_modulus(message_modulus), carry_modulus(carry_modulus){};

void print() {
printf("pbs_type: %u, glwe_dimension: %u, polynomial_size: %u, "
printf("pbs_type: %u, glwe_dimension: %u, "
"polynomial_size: %u, "
"big_lwe_dimension: %u, "
"small_lwe_dimension: %u, ks_level: %u, ks_base_log: %u, pbs_level: "
"%u, pbs_base_log: "
Expand Down Expand Up @@ -790,6 +819,90 @@ template <typename Torus> struct int_radix_lut {
}
};

template <typename Torus> struct int_compression {
COMPRESSION_MODE mode;
int_radix_params encryption_params;
int_radix_params compression_params;
uint32_t storage_log_modulus;
uint32_t lwe_per_glwe;

uint32_t body_count;

// Compression
Torus *tmp_lwe;
Torus *tmp_glwe_array_out;

// Decompression
Torus *tmp_extracted_glwe;
Torus *tmp_extracted_lwe;
int_radix_lut<Torus> *carry_extract_lut;

int_compression(cudaStream_t *streams, uint32_t *gpu_indexes,
uint32_t gpu_count, int_radix_params encryption_params,
int_radix_params compression_params,
uint32_t num_radix_blocks, uint32_t lwe_per_glwe,
uint32_t storage_log_modulus, COMPRESSION_MODE mode,
bool allocate_gpu_memory) {
this->mode = mode;
this->encryption_params = encryption_params;
this->compression_params = compression_params;
this->lwe_per_glwe = lwe_per_glwe;
this->storage_log_modulus = storage_log_modulus;
this->body_count = num_radix_blocks;

if (allocate_gpu_memory) {
Torus glwe_accumulator_size = (compression_params.glwe_dimension + 1) *
compression_params.polynomial_size;

tmp_lwe = (Torus *)cuda_malloc_async(
num_radix_blocks * (compression_params.small_lwe_dimension + 1) *
sizeof(Torus),
streams[0], gpu_indexes[0]);
tmp_glwe_array_out = (Torus *)cuda_malloc_async(
glwe_accumulator_size * sizeof(Torus), streams[0], gpu_indexes[0]);

if (mode == COMPRESSION_MODE::DECOMPRESS) {
carry_extract_lut = new int_radix_lut<Torus>(
streams, gpu_indexes, gpu_count, encryption_params, 1,
num_radix_blocks, allocate_gpu_memory);

tmp_extracted_glwe = (Torus *)cuda_malloc_async(
glwe_accumulator_size * sizeof(Torus), streams[0], gpu_indexes[0]);
tmp_extracted_lwe = (Torus *)cuda_malloc_async(
num_radix_blocks *
(compression_params.glwe_dimension *
compression_params.polynomial_size +
1) *
sizeof(Torus),
streams[0], gpu_indexes[0]);
// Decompression
// Carry extract LUT
auto carry_extract_f = [encryption_params](Torus x) -> Torus {
return x / encryption_params.message_modulus;
};

generate_device_accumulator<Torus>(
streams[0], gpu_indexes[0],
carry_extract_lut->get_lut(gpu_indexes[0], 0),
encryption_params.glwe_dimension, encryption_params.polynomial_size,
encryption_params.message_modulus, encryption_params.carry_modulus,
carry_extract_f);

carry_extract_lut->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]);
}
}
}
void release(cudaStream_t *streams, uint32_t *gpu_indexes,
uint32_t gpu_count) {
cuda_drop_async(tmp_lwe, streams[0], gpu_indexes[0]);
cuda_drop_async(tmp_glwe_array_out, streams[0], gpu_indexes[0]);
if (mode == COMPRESSION_MODE::DECOMPRESS) {
carry_extract_lut->release(streams, gpu_indexes, gpu_count);
cuda_drop_async(tmp_extracted_glwe, streams[0], gpu_indexes[0]);
cuda_drop_async(tmp_extracted_lwe, streams[0], gpu_indexes[0]);
}
}
};
template <typename Torus> struct int_bit_extract_luts_buffer {
int_radix_params params;
int_radix_lut<Torus> *lut;
Expand Down
14 changes: 14 additions & 0 deletions backends/tfhe-cuda-backend/cuda/include/keyswitch.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,20 @@ void cuda_keyswitch_lwe_ciphertext_vector_64(
void *lwe_output_indexes, void *lwe_array_in, void *lwe_input_indexes,
void *ksk, uint32_t lwe_dimension_in, uint32_t lwe_dimension_out,
uint32_t base_log, uint32_t level_count, uint32_t num_samples);

void cuda_fp_keyswitch_lwe_to_glwe_64(void *v_stream, uint32_t gpu_index,
void *glwe_array_out, void *lwe_array_in,
void *fp_ksk_array,
uint32_t input_lwe_dimension,
uint32_t output_glwe_dimension,
uint32_t output_polynomial_size,
uint32_t base_log, uint32_t level_count);

void cuda_fp_keyswitch_lwe_list_to_glwe_64(
void *stream, uint32_t gpu_index, void *glwe_array_out, void *lwe_array_in,
void *fp_ksk_array, uint32_t input_lwe_dimension,
uint32_t output_glwe_dimension, uint32_t output_polynomial_size,
uint32_t base_log, uint32_t level_count, uint32_t num_lwes);
}

#endif // CNCRT_KS_H_
Original file line number Diff line number Diff line change
Expand Up @@ -19,15 +19,16 @@ void scratch_cuda_multi_bit_programmable_bootstrap_64(
void *stream, uint32_t gpu_index, int8_t **pbs_buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t level_count, uint32_t grouping_factor,
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory,
uint32_t chunk_size = 0);

void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64(
void *stream, uint32_t gpu_index, void *lwe_array_out,
void *lwe_output_indexes, void *lut_vector, void *lut_vector_indexes,
void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key,
int8_t *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log,
uint32_t level_count, uint32_t num_samples);
uint32_t level_count, uint32_t num_samples, uint32_t lwe_chunk_size = 0);

void cleanup_cuda_multi_bit_programmable_bootstrap(void *stream,
uint32_t gpu_index,
Expand All @@ -49,7 +50,8 @@ void scratch_cuda_tbc_multi_bit_programmable_bootstrap(
void *stream, uint32_t gpu_index, pbs_buffer<Torus, MULTI_BIT> **buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t level_count, uint32_t grouping_factor,
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory,
uint32_t lwe_chunk_size);

template <typename Torus>
void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
Expand All @@ -58,14 +60,16 @@ void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
Torus *lwe_array_in, Torus *lwe_input_indexes, Torus *bootstrapping_key,
pbs_buffer<Torus, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
uint32_t base_log, uint32_t level_count, uint32_t num_samples);
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
uint32_t lwe_chunk_size);
#endif

template <typename Torus>
void scratch_cuda_cg_multi_bit_programmable_bootstrap(
void *stream, uint32_t gpu_index, pbs_buffer<Torus, MULTI_BIT> **pbs_buffer,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory,
uint32_t lwe_chunk_size = 0);

template <typename Torus>
void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
Expand All @@ -74,14 +78,16 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
Torus *lwe_array_in, Torus *lwe_input_indexes, Torus *bootstrapping_key,
pbs_buffer<Torus, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
uint32_t base_log, uint32_t level_count, uint32_t num_samples);
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
uint32_t lwe_chunk_size = 0);

template <typename Torus>
void scratch_cuda_multi_bit_programmable_bootstrap(
void *stream, uint32_t gpu_index, pbs_buffer<Torus, MULTI_BIT> **pbs_buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t level_count, uint32_t grouping_factor,
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory,
uint32_t lwe_chunk_size = 0);

template <typename Torus>
void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
Expand All @@ -90,7 +96,8 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
Torus *lwe_array_in, Torus *lwe_input_indexes, Torus *bootstrapping_key,
pbs_buffer<Torus, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
uint32_t base_log, uint32_t level_count, uint32_t num_samples);
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
uint32_t lwe_chunk_size = 0);

template <typename Torus>
uint64_t get_buffer_size_full_sm_multibit_programmable_bootstrap_keybundle(
Expand Down
14 changes: 0 additions & 14 deletions backends/tfhe-cuda-backend/cuda/src/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,17 +1,3 @@
set(SOURCES
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/bit_extraction.h
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/bitwise_ops.h
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/bootstrap.h
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/bootstrap_multibit.h
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/ciphertext.h
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/circuit_bootstrap.h
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/device.h
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/integer.h
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/keyswitch.h
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/linear_algebra.h
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/shifts.h
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/vertical_packing.h
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/helper_multi_gpu.h)
file(GLOB_RECURSE SOURCES "*.cu")
add_library(tfhe_cuda_backend STATIC ${SOURCES})
set_target_properties(tfhe_cuda_backend PROPERTIES CUDA_SEPARABLE_COMPILATION ON CUDA_RESOLVE_DEVICE_SYMBOLS ON)
Expand Down
8 changes: 4 additions & 4 deletions backends/tfhe-cuda-backend/cuda/src/crypto/ciphertext.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -38,8 +38,8 @@ __global__ void sample_extract(Torus *lwe_array_out, Torus *glwe_array_in,
auto lwe_out = lwe_array_out + input_id * lwe_output_size;

// We assume each GLWE will store the first polynomial_size inputs
uint32_t nth_per_glwe = params::degree;
auto glwe_in = glwe_array_in + (input_id / nth_per_glwe) * glwe_input_size;
uint32_t lwe_per_glwe = params::degree;
auto glwe_in = glwe_array_in + (input_id / lwe_per_glwe) * glwe_input_size;

auto nth = nth_array[input_id];

Expand All @@ -50,11 +50,11 @@ __global__ void sample_extract(Torus *lwe_array_out, Torus *glwe_array_in,
template <typename Torus, class params>
__host__ void host_sample_extract(cudaStream_t stream, uint32_t gpu_index,
Torus *lwe_array_out, Torus *glwe_array_in,
uint32_t *nth_array, uint32_t num_glwes,
uint32_t *nth_array, uint32_t num_nths,
uint32_t glwe_dimension) {
cudaSetDevice(gpu_index);

dim3 grid(num_glwes);
dim3 grid(num_nths);
dim3 thds(params::degree / params::opt);
sample_extract<Torus, params><<<grid, thds, 0, stream>>>(
lwe_array_out, glwe_array_in, nth_array, glwe_dimension);
Expand Down
9 changes: 5 additions & 4 deletions backends/tfhe-cuda-backend/cuda/src/crypto/gadget.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,11 @@

/**
* GadgetMatrix implements the iterator design pattern to decompose a set of
* num_poly consecutive polynomials with degree params::degree. A total of
* level_count levels is expected and each call to decompose_and_compress_next()
* writes to the result the next level. It is also possible to advance an
* arbitrary amount of levels by using decompose_and_compress_level().
* num_poly consecutive polynomials with degree compression_params::degree. A
* total of level_count levels is expected and each call to
* decompose_and_compress_next() writes to the result the next level. It is also
* possible to advance an arbitrary amount of levels by using
* decompose_and_compress_level().
*
* This class always decomposes the entire set of num_poly polynomials.
* By default, it works on a single polynomial.
Expand Down
56 changes: 54 additions & 2 deletions backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cu
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ void cuda_keyswitch_lwe_ciphertext_vector_32(
void *lwe_output_indexes, void *lwe_array_in, void *lwe_input_indexes,
void *ksk, uint32_t lwe_dimension_in, uint32_t lwe_dimension_out,
uint32_t base_log, uint32_t level_count, uint32_t num_samples) {
cuda_keyswitch_lwe_ciphertext_vector(
host_keyswitch_lwe_ciphertext_vector(
static_cast<cudaStream_t>(stream), gpu_index,
static_cast<uint32_t *>(lwe_array_out),
static_cast<uint32_t *>(lwe_output_indexes),
Expand Down Expand Up @@ -40,11 +40,63 @@ void cuda_keyswitch_lwe_ciphertext_vector_64(
void *lwe_output_indexes, void *lwe_array_in, void *lwe_input_indexes,
void *ksk, uint32_t lwe_dimension_in, uint32_t lwe_dimension_out,
uint32_t base_log, uint32_t level_count, uint32_t num_samples) {
cuda_keyswitch_lwe_ciphertext_vector(
host_keyswitch_lwe_ciphertext_vector(
static_cast<cudaStream_t>(stream), gpu_index,
static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_output_indexes),
static_cast<uint64_t *>(lwe_array_in),
static_cast<uint64_t *>(lwe_input_indexes), static_cast<uint64_t *>(ksk),
lwe_dimension_in, lwe_dimension_out, base_log, level_count, num_samples);
}

/* Perform functional packing keyswitch on a batch of 64 bits input LWE
* ciphertexts.
*
* - `v_stream` is a void pointer to the Cuda stream to be used in the kernel
* launch
* - `gpu_index` is the index of the GPU to be used in the kernel launch
* - `glwe_array_out`: output batch of keyswitched ciphertexts
* - `lwe_array_in`: input batch of num_samples LWE ciphertexts, containing
* lwe_dimension_in mask values + 1 body value
* - `fp_ksk_array`: the functional packing keyswitch keys to be used in the
* operation
* - `base log`: the log of the base used in the decomposition (should be the
* one used to create the ksk)
* - `level_count`: the number of levels used in the decomposition (should be
* the one used to create the fp_ksks).
* - `number_of_input_lwe`: the number of inputs
* - `number_of_keys`: the number of fp_ksks
*
* This function calls a wrapper to a device kernel that performs the functional
* packing keyswitch.
*/
void cuda_fp_keyswitch_lwe_to_glwe_64(void *stream, uint32_t gpu_index,
void *glwe_array_out, void *lwe_array_in,
void *fp_ksk_array,
uint32_t input_lwe_dimension,
uint32_t output_glwe_dimension,
uint32_t output_polynomial_size,
uint32_t base_log, uint32_t level_count) {

host_fp_keyswitch_lwe_to_glwe(static_cast<cudaStream_t>(stream), gpu_index,
static_cast<uint64_t *>(glwe_array_out),
static_cast<uint64_t *>(lwe_array_in),
static_cast<uint64_t *>(fp_ksk_array),
input_lwe_dimension, output_glwe_dimension,
output_polynomial_size, base_log, level_count);
}

void cuda_fp_keyswitch_lwe_list_to_glwe_64(
void *stream, uint32_t gpu_index, void *glwe_array_out, void *lwe_array_in,
void *fp_ksk_array, uint32_t input_lwe_dimension,
uint32_t output_glwe_dimension, uint32_t output_polynomial_size,
uint32_t base_log, uint32_t level_count, uint32_t num_lwes) {

host_fp_keyswitch_lwe_list_to_glwe(
static_cast<cudaStream_t>(stream), gpu_index,
static_cast<uint64_t *>(glwe_array_out),
static_cast<uint64_t *>(lwe_array_in),
static_cast<uint64_t *>(fp_ksk_array), input_lwe_dimension,
output_glwe_dimension, output_polynomial_size, base_log, level_count,
num_lwes);
}
Loading

0 comments on commit c2fc8f7

Please sign in to comment.