Skip to content

Commit

Permalink
chore(gpu): remove some host decoration and duplicated def
Browse files Browse the repository at this point in the history
  • Loading branch information
agnesLeroy committed Aug 6, 2024
1 parent 0dd622e commit a26e68c
Show file tree
Hide file tree
Showing 9 changed files with 48 additions and 102 deletions.
25 changes: 10 additions & 15 deletions backends/tfhe-cuda-backend/cuda/include/programmable_bootstrap.h
Original file line number Diff line number Diff line change
Expand Up @@ -92,65 +92,60 @@ uint64_t get_buffer_size_programmable_bootstrap_64(
}

template <typename Torus>
__host__ __device__ uint64_t
get_buffer_size_full_sm_programmable_bootstrap_step_one(
uint64_t get_buffer_size_full_sm_programmable_bootstrap_step_one(
uint32_t polynomial_size) {
return sizeof(Torus) * polynomial_size + // accumulator_rotated
sizeof(double2) * polynomial_size / 2; // accumulator fft
}
template <typename Torus>
__host__ __device__ uint64_t
get_buffer_size_full_sm_programmable_bootstrap_step_two(
uint64_t get_buffer_size_full_sm_programmable_bootstrap_step_two(
uint32_t polynomial_size) {
return sizeof(Torus) * polynomial_size + // accumulator
sizeof(double2) * polynomial_size / 2; // accumulator fft
}

template <typename Torus>
__host__ __device__ uint64_t
uint64_t
get_buffer_size_partial_sm_programmable_bootstrap(uint32_t polynomial_size) {
return sizeof(double2) * polynomial_size / 2; // accumulator fft
}

template <typename Torus>
__host__ __device__ uint64_t
uint64_t
get_buffer_size_full_sm_programmable_bootstrap_tbc(uint32_t polynomial_size) {
return sizeof(Torus) * polynomial_size + // accumulator_rotated
sizeof(Torus) * polynomial_size + // accumulator
sizeof(double2) * polynomial_size / 2; // accumulator fft
}

template <typename Torus>
__host__ __device__ uint64_t
get_buffer_size_partial_sm_programmable_bootstrap_tbc(
uint64_t get_buffer_size_partial_sm_programmable_bootstrap_tbc(
uint32_t polynomial_size) {
return sizeof(double2) * polynomial_size / 2; // accumulator fft mask & body
}

template <typename Torus>
__host__ __device__ uint64_t
get_buffer_size_sm_dsm_plus_tbc_classic_programmable_bootstrap(
uint64_t get_buffer_size_sm_dsm_plus_tbc_classic_programmable_bootstrap(
uint32_t polynomial_size) {
return sizeof(double2) * polynomial_size / 2; // tbc
}

template <typename Torus>
__host__ __device__ uint64_t
uint64_t
get_buffer_size_full_sm_programmable_bootstrap_cg(uint32_t polynomial_size) {
return sizeof(Torus) * polynomial_size + // accumulator_rotated
sizeof(Torus) * polynomial_size + // accumulator
sizeof(double2) * polynomial_size / 2; // accumulator fft
}

template <typename Torus>
__host__ __device__ uint64_t
uint64_t
get_buffer_size_partial_sm_programmable_bootstrap_cg(uint32_t polynomial_size) {
return sizeof(double2) * polynomial_size / 2; // accumulator fft mask & body
}

template <typename Torus>
__host__ bool
supports_distributed_shared_memory_on_classic_programmable_bootstrap(
bool supports_distributed_shared_memory_on_classic_programmable_bootstrap(
uint32_t polynomial_size);

template <typename Torus, PBS_TYPE pbs_type> struct pbs_buffer;
Expand Down Expand Up @@ -306,7 +301,7 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::CLASSICAL> {
};

template <typename Torus>
__host__ uint64_t get_buffer_size_programmable_bootstrap_cg(
uint64_t get_buffer_size_programmable_bootstrap_cg(
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count) {
int max_shared_memory = cuda_get_max_shared_memory(0);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,8 +35,7 @@ void cleanup_cuda_multi_bit_programmable_bootstrap(void *stream,
}

template <typename Torus>
__host__ bool
supports_distributed_shared_memory_on_multibit_programmable_bootstrap(
bool supports_distributed_shared_memory_on_multibit_programmable_bootstrap(
uint32_t polynomial_size);

template <typename Torus>
Expand All @@ -62,13 +61,6 @@ void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
uint32_t base_log, uint32_t level_count, uint32_t num_samples);
#endif

template <typename Torus>
void scratch_cuda_cg_multi_bit_programmable_bootstrap(
void *stream, uint32_t gpu_index, pbs_buffer<Torus, MULTI_BIT> **pbs_buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t level_count, uint32_t grouping_factor,
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);

template <typename Torus>
void scratch_cuda_cg_multi_bit_programmable_bootstrap(
void *stream, uint32_t gpu_index, pbs_buffer<Torus, MULTI_BIT> **pbs_buffer,
Expand Down Expand Up @@ -101,40 +93,31 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
uint32_t base_log, uint32_t level_count, uint32_t num_samples);

template <typename Torus>
__host__ __device__ uint64_t
get_buffer_size_full_sm_multibit_programmable_bootstrap_keybundle(
uint64_t get_buffer_size_full_sm_multibit_programmable_bootstrap_keybundle(
uint32_t polynomial_size);
template <typename Torus>
__host__ __device__ uint64_t
get_buffer_size_full_sm_multibit_programmable_bootstrap_step_one(
uint64_t get_buffer_size_full_sm_multibit_programmable_bootstrap_step_one(
uint32_t polynomial_size);
template <typename Torus>
__host__ __device__ uint64_t
get_buffer_size_full_sm_multibit_programmable_bootstrap_step_two(
uint64_t get_buffer_size_full_sm_multibit_programmable_bootstrap_step_two(
uint32_t polynomial_size);
template <typename Torus>
__host__ __device__ uint64_t
get_buffer_size_partial_sm_multibit_programmable_bootstrap_step_one(
uint64_t get_buffer_size_partial_sm_multibit_programmable_bootstrap_step_one(
uint32_t polynomial_size);
template <typename Torus>
__host__ __device__ uint64_t
get_buffer_size_full_sm_cg_multibit_programmable_bootstrap(
uint64_t get_buffer_size_full_sm_cg_multibit_programmable_bootstrap(
uint32_t polynomial_size);
template <typename Torus>
__host__ __device__ uint64_t
get_buffer_size_partial_sm_cg_multibit_programmable_bootstrap(
uint64_t get_buffer_size_partial_sm_cg_multibit_programmable_bootstrap(
uint32_t polynomial_size);
template <typename Torus>
__host__ __device__ uint64_t
get_buffer_size_sm_dsm_plus_tbc_multibit_programmable_bootstrap(
uint64_t get_buffer_size_sm_dsm_plus_tbc_multibit_programmable_bootstrap(
uint32_t polynomial_size);
template <typename Torus>
__host__ __device__ uint64_t
get_buffer_size_partial_sm_tbc_multibit_programmable_bootstrap(
uint64_t get_buffer_size_partial_sm_tbc_multibit_programmable_bootstrap(
uint32_t polynomial_size);
template <typename Torus>
__host__ __device__ uint64_t
get_buffer_size_full_sm_tbc_multibit_programmable_bootstrap(
uint64_t get_buffer_size_full_sm_tbc_multibit_programmable_bootstrap(
uint32_t polynomial_size);

template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::MULTI_BIT> {
Expand Down Expand Up @@ -304,7 +287,7 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::MULTI_BIT> {
};

template <typename Torus, class params>
__host__ uint32_t get_lwe_chunk_size(uint32_t gpu_index, uint32_t max_num_pbs,
uint32_t polynomial_size);
uint32_t get_lwe_chunk_size(uint32_t gpu_index, uint32_t max_num_pbs,
uint32_t polynomial_size);

#endif // CUDA_MULTI_BIT_H
Original file line number Diff line number Diff line change
Expand Up @@ -213,8 +213,7 @@ __global__ void device_programmable_bootstrap_amortized(
}

template <typename Torus>
__host__ __device__ uint64_t
get_buffer_size_full_sm_programmable_bootstrap_amortized(
uint64_t get_buffer_size_full_sm_programmable_bootstrap_amortized(
uint32_t polynomial_size, uint32_t glwe_dimension) {
return sizeof(Torus) * polynomial_size * (glwe_dimension + 1) + // accumulator
sizeof(Torus) * polynomial_size *
Expand All @@ -225,14 +224,13 @@ get_buffer_size_full_sm_programmable_bootstrap_amortized(
}

template <typename Torus>
__host__ __device__ uint64_t
get_buffer_size_partial_sm_programmable_bootstrap_amortized(
uint64_t get_buffer_size_partial_sm_programmable_bootstrap_amortized(
uint32_t polynomial_size) {
return sizeof(double2) * polynomial_size / 2; // accumulator fft
}

template <typename Torus>
__host__ uint64_t get_buffer_size_programmable_bootstrap_amortized(
uint64_t get_buffer_size_programmable_bootstrap_amortized(
uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t input_lwe_ciphertext_count) {

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -141,20 +141,18 @@ __global__ void __launch_bounds__(params::degree / params::opt)
}

template <typename Torus>
__host__ __device__ uint64_t
get_buffer_size_partial_sm_cg_multibit_programmable_bootstrap(
uint64_t get_buffer_size_partial_sm_cg_multibit_programmable_bootstrap(
uint32_t polynomial_size) {
return sizeof(Torus) * polynomial_size; // accumulator
}
template <typename Torus>
__host__ __device__ uint64_t
get_buffer_size_full_sm_cg_multibit_programmable_bootstrap(
uint64_t get_buffer_size_full_sm_cg_multibit_programmable_bootstrap(
uint32_t polynomial_size) {
return sizeof(Torus) * polynomial_size * 2; // accumulator
}

template <typename Torus>
__host__ __device__ uint64_t get_buffer_size_cg_multibit_programmable_bootstrap(
uint64_t get_buffer_size_cg_multibit_programmable_bootstrap(
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t level_count, uint32_t input_lwe_ciphertext_count,
uint32_t grouping_factor, uint32_t lwe_chunk_size) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -230,7 +230,7 @@ __global__ void __launch_bounds__(params::degree / params::opt)
}

template <typename Torus>
__host__ __device__ uint64_t get_buffer_size_programmable_bootstrap(
uint64_t get_buffer_size_programmable_bootstrap(
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count) {

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -431,16 +431,15 @@ void cleanup_cuda_multi_bit_programmable_bootstrap(void *stream,
* benchmarking on an RTX 4090 GPU, balancing performance and resource use.
*/
template <typename Torus, class params>
__host__ uint32_t get_lwe_chunk_size(uint32_t gpu_index, uint32_t max_num_pbs,
uint32_t polynomial_size) {
uint32_t get_lwe_chunk_size(uint32_t gpu_index, uint32_t max_num_pbs,
uint32_t polynomial_size) {

uint64_t full_sm_keybundle =
get_buffer_size_full_sm_multibit_programmable_bootstrap_keybundle<Torus>(
polynomial_size);

int max_blocks_per_sm;
int max_shared_memory = cuda_get_max_shared_memory(0);
cudaSetDevice(gpu_index);
if (max_shared_memory < full_sm_keybundle)
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&max_blocks_per_sm,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -37,9 +37,9 @@ __global__ void device_multi_bit_programmable_bootstrap_keybundle(
const Torus *__restrict__ lwe_input_indexes, double2 *keybundle_array,
const Torus *__restrict__ bootstrapping_key, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
uint32_t base_log, uint32_t level_count, uint32_t lwe_offset,
uint32_t lwe_chunk_size, uint32_t keybundle_size_per_input,
int8_t *device_mem, uint64_t device_memory_size_per_block) {
uint32_t level_count, uint32_t lwe_offset, uint32_t lwe_chunk_size,
uint32_t keybundle_size_per_input, int8_t *device_mem,
uint64_t device_memory_size_per_block) {

extern __shared__ int8_t sharedmem[];
int8_t *selected_memory = sharedmem;
Expand Down Expand Up @@ -330,48 +330,26 @@ __global__ void __launch_bounds__(params::degree / params::opt)
}
}
template <typename Torus>
__host__ __device__ uint64_t
get_buffer_size_full_sm_multibit_programmable_bootstrap_keybundle(
uint64_t get_buffer_size_full_sm_multibit_programmable_bootstrap_keybundle(
uint32_t polynomial_size) {
return sizeof(double2) * polynomial_size / 2; // accumulator
}
template <typename Torus>
__host__ __device__ uint64_t
get_buffer_size_full_sm_multibit_programmable_bootstrap_step_one(
uint64_t get_buffer_size_full_sm_multibit_programmable_bootstrap_step_one(
uint32_t polynomial_size) {
return sizeof(Torus) * polynomial_size * 2; // accumulator
}
template <typename Torus>
__host__ __device__ uint64_t
get_buffer_size_partial_sm_multibit_programmable_bootstrap_step_one(
uint64_t get_buffer_size_partial_sm_multibit_programmable_bootstrap_step_one(
uint32_t polynomial_size) {
return sizeof(Torus) * polynomial_size; // accumulator
}
template <typename Torus>
__host__ __device__ uint64_t
get_buffer_size_full_sm_multibit_programmable_bootstrap_step_two(
uint64_t get_buffer_size_full_sm_multibit_programmable_bootstrap_step_two(
uint32_t polynomial_size) {
return sizeof(Torus) * polynomial_size; // accumulator
}

template <typename Torus>
__host__ __device__ uint64_t get_buffer_size_multibit_programmable_bootstrap(
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, uint32_t lwe_chunk_size) {

uint64_t buffer_size = 0;
buffer_size += input_lwe_ciphertext_count * lwe_chunk_size * level_count *
(glwe_dimension + 1) * (glwe_dimension + 1) *
(polynomial_size / 2) * sizeof(double2); // keybundle fft
buffer_size += input_lwe_ciphertext_count * (glwe_dimension + 1) *
level_count * (polynomial_size / 2) *
sizeof(double2); // global_accumulator_fft
buffer_size += input_lwe_ciphertext_count * (glwe_dimension + 1) *
polynomial_size * sizeof(Torus); // global_accumulator

return buffer_size + buffer_size % sizeof(double2);
}

template <typename Torus, typename params>
__host__ void scratch_multi_bit_programmable_bootstrap(
cudaStream_t stream, uint32_t gpu_index,
Expand Down Expand Up @@ -514,15 +492,15 @@ __host__ void execute_compute_keybundle(
<<<grid_keybundle, thds, 0, stream>>>(
lwe_array_in, lwe_input_indexes, keybundle_fft, bootstrapping_key,
lwe_dimension, glwe_dimension, polynomial_size, grouping_factor,
base_log, level_count, lwe_offset, chunk_size,
keybundle_size_per_input, d_mem, full_sm_keybundle);
level_count, lwe_offset, chunk_size, keybundle_size_per_input,
d_mem, full_sm_keybundle);
else
device_multi_bit_programmable_bootstrap_keybundle<Torus, params, FULLSM>
<<<grid_keybundle, thds, full_sm_keybundle, stream>>>(
lwe_array_in, lwe_input_indexes, keybundle_fft, bootstrapping_key,
lwe_dimension, glwe_dimension, polynomial_size, grouping_factor,
base_log, level_count, lwe_offset, chunk_size,
keybundle_size_per_input, d_mem, 0);
level_count, lwe_offset, chunk_size, keybundle_size_per_input,
d_mem, 0);
check_cuda_error(cudaGetLastError());
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -350,8 +350,7 @@ __host__ bool verify_cuda_programmable_bootstrap_tbc_grid_size(
}

template <typename Torus>
__host__ bool
supports_distributed_shared_memory_on_classic_programmable_bootstrap(
bool supports_distributed_shared_memory_on_classic_programmable_bootstrap(
uint32_t polynomial_size) {
uint64_t minimum_sm =
get_buffer_size_sm_dsm_plus_tbc_classic_programmable_bootstrap<Torus>(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -149,21 +149,18 @@ __global__ void __launch_bounds__(params::degree / params::opt)
}

template <typename Torus>
__host__ __device__ uint64_t
get_buffer_size_sm_dsm_plus_tbc_multibit_programmable_bootstrap(
uint64_t get_buffer_size_sm_dsm_plus_tbc_multibit_programmable_bootstrap(
uint32_t polynomial_size) {
return sizeof(Torus) * polynomial_size; // distributed shared memory
}

template <typename Torus>
__host__ __device__ uint64_t
get_buffer_size_partial_sm_tbc_multibit_programmable_bootstrap(
uint64_t get_buffer_size_partial_sm_tbc_multibit_programmable_bootstrap(
uint32_t polynomial_size) {
return sizeof(Torus) * polynomial_size; // accumulator
}
template <typename Torus>
__host__ __device__ uint64_t
get_buffer_size_full_sm_tbc_multibit_programmable_bootstrap(
uint64_t get_buffer_size_full_sm_tbc_multibit_programmable_bootstrap(
uint32_t polynomial_size) {
return sizeof(Torus) * polynomial_size * 2; // accumulator
}
Expand Down Expand Up @@ -390,8 +387,7 @@ __host__ void host_tbc_multi_bit_programmable_bootstrap(
}

template <typename Torus>
__host__ bool
supports_distributed_shared_memory_on_multibit_programmable_bootstrap(
bool supports_distributed_shared_memory_on_multibit_programmable_bootstrap(
uint32_t polynomial_size) {
uint64_t minimum_sm =
get_buffer_size_sm_dsm_plus_tbc_multibit_programmable_bootstrap<Torus>(
Expand Down Expand Up @@ -484,7 +480,7 @@ __host__ bool supports_thread_block_clusters_on_multibit_programmable_bootstrap(
return cluster_size >= level_count * (glwe_dimension + 1);
}

template __host__ bool
template bool
supports_distributed_shared_memory_on_multibit_programmable_bootstrap<uint64_t>(
uint32_t polynomial_size);
#endif // FASTMULTIBIT_PBS_H

0 comments on commit a26e68c

Please sign in to comment.