Skip to content

Commit

Permalink
chore(gpu): rename the low-latency PBS to just PBS and the fast varia…
Browse files Browse the repository at this point in the history
…nts to cg
  • Loading branch information
pdroalves committed Mar 11, 2024
1 parent ba066a3 commit 39b1809
Show file tree
Hide file tree
Showing 41 changed files with 779 additions and 806 deletions.
2 changes: 1 addition & 1 deletion backends/tfhe-cuda-backend/cuda/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ endif()

# in production, should use -arch=sm_70 --ptxas-options=-v to see register spills -lineinfo for better debugging
set(CMAKE_CUDA_FLAGS
"${CMAKE_CUDA_FLAGS} -ccbin ${CMAKE_CXX_COMPILER} -O0 -G -g \
"${CMAKE_CUDA_FLAGS} -ccbin ${CMAKE_CXX_COMPILER} -O3 \
-std=c++17 --no-exceptions --expt-relaxed-constexpr -rdc=true \
--use_fast_math -Xcompiler -fPIC")

Expand Down
32 changes: 15 additions & 17 deletions backends/tfhe-cuda-backend/cuda/include/integer.h
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
#ifndef CUDA_INTEGER_H
#define CUDA_INTEGER_H

#include "bootstrap.h"
#include "bootstrap_multibit.h"
#include "pbs/bootstrap.cuh"
#include "pbs/programmable_bootstrap.cuh"
#include "programmable_bootstrap.h"
#include "programmable_bootstrap_multibit.h"
#include <cassert>
#include <cmath>
#include <functional>
Expand Down Expand Up @@ -183,20 +183,21 @@ void cuda_integer_radix_scalar_rotate_kb_64_inplace(cuda_stream_t *stream,
void cleanup_cuda_integer_radix_scalar_rotate(cuda_stream_t *stream,
int8_t **mem_ptr_void);

void scratch_cuda_propagate_single_carry_low_latency_kb_64_inplace(
void scratch_cuda_propagate_single_carry_kb_64_inplace(
cuda_stream_t *stream, int8_t **mem_ptr, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t big_lwe_dimension,
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
PBS_TYPE pbs_type, bool allocate_gpu_memory);

void cuda_propagate_single_carry_low_latency_kb_64_inplace(
cuda_stream_t *stream, void *lwe_array, int8_t *mem_ptr, void *bsk,
void *ksk, uint32_t num_blocks);
void cuda_propagate_single_carry_kb_64_inplace(cuda_stream_t *stream,
void *lwe_array, int8_t *mem_ptr,
void *bsk, void *ksk,
uint32_t num_blocks);

void cleanup_cuda_propagate_single_carry_low_latency(cuda_stream_t *stream,
int8_t **mem_ptr_void);
void cleanup_cuda_propagate_single_carry(cuda_stream_t *stream,
int8_t **mem_ptr_void);
}

/*
Expand Down Expand Up @@ -396,34 +397,31 @@ template <typename Torus> struct int_radix_lut {
case MULTI_BIT:
switch (sizeof(Torus)) {
case sizeof(uint32_t):
cleanup_cuda_multi_bit_pbs_32(stream, &buffer);
cleanup_cuda_multi_bit_programmable_bootstrap_32(stream, &buffer);
break;
case sizeof(uint64_t):
cleanup_cuda_multi_bit_pbs_64(stream, &buffer);
cleanup_cuda_multi_bit_programmable_bootstrap_64(stream, &buffer);
break;
default:
PANIC("Cuda error: unsupported modulus size: only 32 and 64 bit "
"integer "
"moduli are supported.")
}
break;
case LOW_LAT:
case CLASSICAL:
switch (sizeof(Torus)) {
case sizeof(uint32_t):
cleanup_cuda_bootstrap_low_latency_32(stream, &buffer);
cleanup_cuda_programmable_bootstrap_32(stream, &buffer);
break;
case sizeof(uint64_t):
cleanup_cuda_bootstrap_low_latency_64(stream, &buffer);
cleanup_cuda_programmable_bootstrap_64(stream, &buffer);
break;
default:
PANIC("Cuda error: unsupported modulus size: only 32 and 64 bit "
"integer "
"moduli are supported.")
}
break;
case AMORTIZED:
cleanup_cuda_bootstrap_amortized(stream, &buffer);
break;
default:
PANIC("Cuda error (PBS): unknown PBS type. ")
}
Expand Down
2 changes: 1 addition & 1 deletion backends/tfhe-cuda-backend/cuda/include/linear_algebra.h
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#ifndef CUDA_LINALG_H_
#define CUDA_LINALG_H_

#include "bootstrap.h"
#include "programmable_bootstrap.h"
#include <cstdint>
#include <device.h>

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,138 +4,133 @@
#include "device.h"
#include <cstdint>

enum PBS_TYPE { MULTI_BIT = 0, LOW_LAT = 1, AMORTIZED = 2 };
enum PBS_VARIANT { DEFAULT = 0, FAST = 1 };
enum PBS_TYPE { MULTI_BIT = 0, CLASSICAL = 1 };
enum PBS_VARIANT { DEFAULT = 0, CG = 1 };

extern "C" {
void cuda_fourier_polynomial_mul(void *input1, void *input2, void *output,
cuda_stream_t *stream,
uint32_t polynomial_size,
uint32_t total_polynomials);

void cuda_convert_lwe_bootstrap_key_32(void *dest, void *src,
cuda_stream_t *stream,
uint32_t input_lwe_dim,
uint32_t glwe_dim, uint32_t level_count,
uint32_t polynomial_size);
void cuda_convert_lwe_programmable_bootstrap_key_32(
void *dest, void *src, cuda_stream_t *stream, uint32_t input_lwe_dim,
uint32_t glwe_dim, uint32_t level_count, uint32_t polynomial_size);

void cuda_convert_lwe_bootstrap_key_64(void *dest, void *src,
cuda_stream_t *stream,
uint32_t input_lwe_dim,
uint32_t glwe_dim, uint32_t level_count,
uint32_t polynomial_size);
void cuda_convert_lwe_programmable_bootstrap_key_64(
void *dest, void *src, cuda_stream_t *stream, uint32_t input_lwe_dim,
uint32_t glwe_dim, uint32_t level_count, uint32_t polynomial_size);

void scratch_cuda_bootstrap_amortized_32(
void scratch_cuda_programmable_bootstrap_amortized_32(
cuda_stream_t *stream, int8_t **pbs_buffer, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t input_lwe_ciphertext_count,
uint32_t max_shared_memory, bool allocate_gpu_memory);

void scratch_cuda_bootstrap_amortized_64(
void scratch_cuda_programmable_bootstrap_amortized_64(
cuda_stream_t *stream, int8_t **pbs_buffer, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t input_lwe_ciphertext_count,
uint32_t max_shared_memory, bool allocate_gpu_memory);

void cuda_bootstrap_amortized_lwe_ciphertext_vector_32(
void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_32(
cuda_stream_t *stream, void *lwe_array_out, void *lwe_output_indexes,
void *lut_vector, void *lut_vector_indexes, void *lwe_array_in,
void *lwe_input_indexes, void *bootstrapping_key, int8_t *pbs_buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory);

void cuda_bootstrap_amortized_lwe_ciphertext_vector_64(
void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_64(
cuda_stream_t *stream, void *lwe_array_out, void *lwe_output_indexes,
void *lut_vector, void *lut_vector_indexes, void *lwe_array_in,
void *lwe_input_indexes, void *bootstrapping_key, int8_t *pbs_buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory);

void cleanup_cuda_bootstrap_amortized(cuda_stream_t *stream,
int8_t **pbs_buffer);
void cleanup_cuda_programmable_bootstrap_amortized(cuda_stream_t *stream,
int8_t **pbs_buffer);

void scratch_cuda_bootstrap_low_latency_32(
void scratch_cuda_programmable_bootstrap_32(
cuda_stream_t *stream, int8_t **buffer, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
bool allocate_gpu_memory);

void scratch_cuda_bootstrap_low_latency_64(
void scratch_cuda_programmable_bootstrap_64(
cuda_stream_t *stream, int8_t **buffer, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
bool allocate_gpu_memory);

void cuda_bootstrap_low_latency_lwe_ciphertext_vector_32(
void cuda_programmable_bootstrap_lwe_ciphertext_vector_32(
cuda_stream_t *stream, void *lwe_array_out, void *lwe_output_indexes,
void *lut_vector, void *lut_vector_indexes, void *lwe_array_in,
void *lwe_input_indexes, void *bootstrapping_key, int8_t *buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory);

void cuda_bootstrap_low_latency_lwe_ciphertext_vector_64(
void cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
cuda_stream_t *stream, void *lwe_array_out, void *lwe_output_indexes,
void *lut_vector, void *lut_vector_indexes, void *lwe_array_in,
void *lwe_input_indexes, void *bootstrapping_key, int8_t *buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory);

void cleanup_cuda_bootstrap_low_latency_32(cuda_stream_t *stream,
int8_t **pbs_buffer);
void cleanup_cuda_programmable_bootstrap_32(cuda_stream_t *stream,
int8_t **pbs_buffer);

void cleanup_cuda_bootstrap_low_latency_64(cuda_stream_t *stream,
int8_t **pbs_buffer);
void cleanup_cuda_programmable_bootstrap_64(cuda_stream_t *stream,
int8_t **pbs_buffer);

uint64_t get_buffer_size_bootstrap_amortized_64(
uint64_t get_buffer_size_programmable_bootstrap_amortized_64(
uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory);

uint64_t get_buffer_size_bootstrap_low_latency_64(
uint64_t get_buffer_size_programmable_bootstrap_64(
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory);
}

template <typename Torus>
__host__ __device__ uint64_t
get_buffer_size_full_sm_bootstrap_low_latency_step_one(
get_buffer_size_full_sm_programmable_bootstrap_step_one(
uint32_t polynomial_size) {
return sizeof(Torus) * polynomial_size + // accumulator_rotated
sizeof(double2) * polynomial_size / 2; // accumulator fft
}
template <typename Torus>
__host__ __device__ uint64_t
get_buffer_size_full_sm_bootstrap_low_latency_step_two(
get_buffer_size_full_sm_programmable_bootstrap_step_two(
uint32_t polynomial_size) {
return sizeof(Torus) * polynomial_size + // accumulator
sizeof(double2) * polynomial_size / 2; // accumulator fft
}

template <typename Torus>
__host__ __device__ uint64_t
get_buffer_size_partial_sm_bootstrap_low_latency(uint32_t polynomial_size) {
get_buffer_size_partial_sm_programmable_bootstrap(uint32_t polynomial_size) {
return sizeof(double2) * polynomial_size / 2; // accumulator fft
}

template <typename Torus>
__host__ __device__ uint64_t
get_buffer_size_full_sm_bootstrap_fast_low_latency(uint32_t polynomial_size) {
get_buffer_size_full_sm_programmable_bootstrap_cg(uint32_t polynomial_size) {
return sizeof(Torus) * polynomial_size + // accumulator_rotated
sizeof(Torus) * polynomial_size + // accumulator
sizeof(double2) * polynomial_size / 2; // accumulator fft
}

template <typename Torus>
__host__ __device__ uint64_t
get_buffer_size_partial_sm_bootstrap_fast_low_latency(
uint32_t polynomial_size) {
get_buffer_size_partial_sm_programmable_bootstrap_cg(uint32_t polynomial_size) {
return sizeof(double2) * polynomial_size / 2; // accumulator fft mask & body
}

template <typename Torus, PBS_TYPE pbs_type> struct pbs_buffer;

template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::LOW_LAT> {
template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::CLASSICAL> {
int8_t *d_mem;

Torus *global_accumulator;
Expand All @@ -155,13 +150,13 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::LOW_LAT> {
switch (pbs_variant) {
case PBS_VARIANT::DEFAULT: {
uint64_t full_sm_step_one =
get_buffer_size_full_sm_bootstrap_low_latency_step_one<Torus>(
get_buffer_size_full_sm_programmable_bootstrap_step_one<Torus>(
polynomial_size);
uint64_t full_sm_step_two =
get_buffer_size_full_sm_bootstrap_low_latency_step_two<Torus>(
get_buffer_size_full_sm_programmable_bootstrap_step_two<Torus>(
polynomial_size);
uint64_t partial_sm =
get_buffer_size_partial_sm_bootstrap_low_latency<Torus>(
get_buffer_size_partial_sm_programmable_bootstrap<Torus>(
polynomial_size);

uint64_t partial_dm_step_one = full_sm_step_one - partial_sm;
Expand Down Expand Up @@ -193,12 +188,12 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::LOW_LAT> {
polynomial_size * sizeof(Torus),
stream);
} break;
case PBS_VARIANT::FAST: {
case PBS_VARIANT::CG: {
uint64_t full_sm =
get_buffer_size_full_sm_bootstrap_fast_low_latency<Torus>(
get_buffer_size_full_sm_programmable_bootstrap_cg<Torus>(
polynomial_size);
uint64_t partial_sm =
get_buffer_size_partial_sm_bootstrap_fast_low_latency<Torus>(
get_buffer_size_partial_sm_programmable_bootstrap_cg<Torus>(
polynomial_size);

uint64_t partial_dm = full_sm - partial_sm;
Expand Down Expand Up @@ -237,14 +232,14 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::LOW_LAT> {
};

template <typename Torus>
__host__ __device__ uint64_t get_buffer_size_bootstrap_fast_low_latency(
__host__ __device__ uint64_t get_buffer_size_programmable_bootstrap_cg(
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory) {

uint64_t full_sm = get_buffer_size_full_sm_bootstrap_fast_low_latency<Torus>(
polynomial_size);
uint64_t full_sm =
get_buffer_size_full_sm_programmable_bootstrap_cg<Torus>(polynomial_size);
uint64_t partial_sm =
get_buffer_size_partial_sm_bootstrap_fast_low_latency<Torus>(
get_buffer_size_partial_sm_programmable_bootstrap_cg<Torus>(
polynomial_size);
uint64_t partial_dm = full_sm - partial_sm;
uint64_t full_dm = full_sm;
Expand All @@ -263,42 +258,42 @@ __host__ __device__ uint64_t get_buffer_size_bootstrap_fast_low_latency(
}

template <typename Torus>
bool has_support_to_cuda_bootstrap_fast_low_latency(uint32_t glwe_dimension,
uint32_t polynomial_size,
uint32_t level_count,
uint32_t num_samples,
uint32_t max_shared_memory);
bool has_support_to_cuda_programmable_bootstrap_cg(uint32_t glwe_dimension,
uint32_t polynomial_size,
uint32_t level_count,
uint32_t num_samples,
uint32_t max_shared_memory);

template <typename Torus>
void cuda_bootstrap_fast_low_latency_lwe_ciphertext_vector(
void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector(
cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_output_indexes,
Torus *lut_vector, Torus *lut_vector_indexes, Torus *lwe_array_in,
Torus *lwe_input_indexes, double2 *bootstrapping_key,
pbs_buffer<Torus, LOW_LAT> *buffer, uint32_t lwe_dimension,
pbs_buffer<Torus, CLASSICAL> *buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, uint32_t num_samples, uint32_t num_luts,
uint32_t lwe_idx, uint32_t max_shared_memory);

template <typename Torus>
void cuda_bootstrap_low_latency_lwe_ciphertext_vector(
void cuda_programmable_bootstrap_lwe_ciphertext_vector(
cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_output_indexes,
Torus *lut_vector, Torus *lut_vector_indexes, Torus *lwe_array_in,
Torus *lwe_input_indexes, double2 *bootstrapping_key,
pbs_buffer<Torus, LOW_LAT> *buffer, uint32_t lwe_dimension,
pbs_buffer<Torus, CLASSICAL> *buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, uint32_t num_samples, uint32_t num_luts,
uint32_t lwe_idx, uint32_t max_shared_memory);

template <typename Torus, typename STorus>
void scratch_cuda_fast_bootstrap_low_latency(
cuda_stream_t *stream, pbs_buffer<Torus, LOW_LAT> **pbs_buffer,
void scratch_cuda_programmable_bootstrap_cg(
cuda_stream_t *stream, pbs_buffer<Torus, CLASSICAL> **pbs_buffer,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
bool allocate_gpu_memory);

template <typename Torus, typename STorus>
void scratch_cuda_bootstrap_low_latency(
cuda_stream_t *stream, pbs_buffer<Torus, LOW_LAT> **buffer,
void scratch_cuda_programmable_bootstrap(
cuda_stream_t *stream, pbs_buffer<Torus, CLASSICAL> **buffer,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
bool allocate_gpu_memory);
Expand Down
Loading

0 comments on commit 39b1809

Please sign in to comment.