refactor(gpu): Specify launch bounds on kernels
guillermo-oyarzun authored and agnesLeroy committed Aug 5, 2024
1 parent b03921f commit bc4cd08
Showing 4 changed files with 67 additions and 60 deletions.
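
The change in each file follows the same pattern: the CUDA __launch_bounds__ qualifier is added between __global__ and the kernel name, telling the compiler that the kernel will never be launched with more than params::degree / params::opt threads per block, so it can budget registers for that block size. Below is a minimal, self-contained sketch of this pattern; the struct, kernel, and launch configuration are hypothetical and are not code from this commit.

// Minimal sketch of the __launch_bounds__ pattern used in this commit.
// ExampleParams, example_kernel, and the launch below are hypothetical.
#include <cstdio>
#include <cuda_runtime.h>

struct ExampleParams {
  static constexpr unsigned degree = 1024; // polynomial size
  static constexpr unsigned opt = 4;       // coefficients handled per thread
};

// The bound (degree / opt = 256 threads) lets the compiler cap register
// usage so that a 256-thread block is always able to launch.
template <class params>
__global__ void __launch_bounds__(params::degree / params::opt)
    example_kernel(float *out) {
  unsigned idx = blockIdx.x * blockDim.x + threadIdx.x;
  out[idx] = static_cast<float>(idx);
}

int main() {
  constexpr unsigned threads = ExampleParams::degree / ExampleParams::opt;
  float *d_out = nullptr;
  cudaMalloc(&d_out, threads * sizeof(float));
  // blockDim.x must not exceed the compile-time bound, or the launch fails.
  example_kernel<ExampleParams><<<1, threads>>>(d_out);
  cudaDeviceSynchronize();
  cudaFree(d_out);
  printf("done\n");
  return 0;
}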
@@ -18,18 +18,19 @@
 #include <vector>
 
 template <typename Torus, class params, sharedMemDegree SMD>
-__global__ void device_multi_bit_programmable_bootstrap_cg_accumulate(
-    Torus *lwe_array_out, const Torus *__restrict__ lwe_output_indexes,
-    const Torus *__restrict__ lut_vector,
-    const Torus *__restrict__ lut_vector_indexes,
-    const Torus *__restrict__ lwe_array_in,
-    const Torus *__restrict__ lwe_input_indexes,
-    const double2 *__restrict__ keybundle_array, double2 *join_buffer,
-    Torus *global_accumulator, uint32_t lwe_dimension, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
-    uint32_t grouping_factor, uint32_t lwe_offset, uint32_t lwe_chunk_size,
-    uint32_t keybundle_size_per_input, int8_t *device_mem,
-    uint64_t device_memory_size_per_block) {
+__global__ void __launch_bounds__(params::degree / params::opt)
+    device_multi_bit_programmable_bootstrap_cg_accumulate(
+        Torus *lwe_array_out, const Torus *__restrict__ lwe_output_indexes,
+        const Torus *__restrict__ lut_vector,
+        const Torus *__restrict__ lut_vector_indexes,
+        const Torus *__restrict__ lwe_array_in,
+        const Torus *__restrict__ lwe_input_indexes,
+        const double2 *__restrict__ keybundle_array, double2 *join_buffer,
+        Torus *global_accumulator, uint32_t lwe_dimension,
+        uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
+        uint32_t level_count, uint32_t grouping_factor, uint32_t lwe_offset,
+        uint32_t lwe_chunk_size, uint32_t keybundle_size_per_input,
+        int8_t *device_mem, uint64_t device_memory_size_per_block) {
 
   grid_group grid = this_grid();
 
@@ -17,16 +17,17 @@
 #include "types/complex/operations.cuh"
 
 template <typename Torus, class params, sharedMemDegree SMD>
-__global__ void device_programmable_bootstrap_step_one(
-    const Torus *__restrict__ lut_vector,
-    const Torus *__restrict__ lut_vector_indexes,
-    const Torus *__restrict__ lwe_array_in,
-    const Torus *__restrict__ lwe_input_indexes,
-    const double2 *__restrict__ bootstrapping_key, Torus *global_accumulator,
-    double2 *global_accumulator_fft, uint32_t lwe_iteration,
-    uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log,
-    uint32_t level_count, int8_t *device_mem,
-    uint64_t device_memory_size_per_block) {
+__global__ void __launch_bounds__(params::degree / params::opt)
+    device_programmable_bootstrap_step_one(
+        const Torus *__restrict__ lut_vector,
+        const Torus *__restrict__ lut_vector_indexes,
+        const Torus *__restrict__ lwe_array_in,
+        const Torus *__restrict__ lwe_input_indexes,
+        const double2 *__restrict__ bootstrapping_key,
+        Torus *global_accumulator, double2 *global_accumulator_fft,
+        uint32_t lwe_iteration, uint32_t lwe_dimension,
+        uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
+        int8_t *device_mem, uint64_t device_memory_size_per_block) {
 
   // We use shared memory for the polynomials that are used often during the
   // bootstrap, since shared memory is kept in L1 cache and accessing it is
@@ -131,15 +132,16 @@ __global__ void device_programmable_bootstrap_step_one(
 }
 
 template <typename Torus, class params, sharedMemDegree SMD>
-__global__ void device_programmable_bootstrap_step_two(
-    Torus *lwe_array_out, const Torus *__restrict__ lwe_output_indexes,
-    const Torus *__restrict__ lut_vector,
-    const Torus *__restrict__ lut_vector_indexes,
-    const double2 *__restrict__ bootstrapping_key, Torus *global_accumulator,
-    double2 *global_accumulator_fft, uint32_t lwe_iteration,
-    uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log,
-    uint32_t level_count, int8_t *device_mem,
-    uint64_t device_memory_size_per_block) {
+__global__ void __launch_bounds__(params::degree / params::opt)
+    device_programmable_bootstrap_step_two(
+        Torus *lwe_array_out, const Torus *__restrict__ lwe_output_indexes,
+        const Torus *__restrict__ lut_vector,
+        const Torus *__restrict__ lut_vector_indexes,
+        const double2 *__restrict__ bootstrapping_key,
+        Torus *global_accumulator, double2 *global_accumulator_fft,
+        uint32_t lwe_iteration, uint32_t lwe_dimension,
+        uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
+        int8_t *device_mem, uint64_t device_memory_size_per_block) {
 
   // We use shared memory for the polynomials that are used often during the
   // bootstrap, since shared memory is kept in L1 cache and accessing it is
@@ -145,15 +145,16 @@ __global__ void device_multi_bit_programmable_bootstrap_keybundle(
 }
 
 template <typename Torus, class params, sharedMemDegree SMD>
-__global__ void device_multi_bit_programmable_bootstrap_accumulate_step_one(
-    const Torus *__restrict__ lwe_array_in,
-    const Torus *__restrict__ lwe_input_indexes,
-    const Torus *__restrict__ lut_vector,
-    const Torus *__restrict__ lut_vector_indexes, Torus *global_accumulator,
-    double2 *global_accumulator_fft, uint32_t lwe_dimension,
-    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
-    uint32_t level_count, uint32_t lwe_iteration, int8_t *device_mem,
-    uint64_t device_memory_size_per_block) {
+__global__ void __launch_bounds__(params::degree / params::opt)
+    device_multi_bit_programmable_bootstrap_accumulate_step_one(
+        const Torus *__restrict__ lwe_array_in,
+        const Torus *__restrict__ lwe_input_indexes,
+        const Torus *__restrict__ lut_vector,
+        const Torus *__restrict__ lut_vector_indexes, Torus *global_accumulator,
+        double2 *global_accumulator_fft, uint32_t lwe_dimension,
+        uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
+        uint32_t level_count, uint32_t lwe_iteration, int8_t *device_mem,
+        uint64_t device_memory_size_per_block) {
 
   // We use shared memory for the polynomials that are used often during the
   // bootstrap, since shared memory is kept in L1 cache and accessing it is
@@ -242,14 +243,15 @@ __global__ void device_multi_bit_programmable_bootstrap_accumulate_step_one(
 }
 
 template <typename Torus, class params, sharedMemDegree SMD>
-__global__ void device_multi_bit_programmable_bootstrap_accumulate_step_two(
-    Torus *lwe_array_out, const Torus *__restrict__ lwe_output_indexes,
-    const double2 *__restrict__ keybundle_array, Torus *global_accumulator,
-    double2 *global_accumulator_fft, uint32_t lwe_dimension,
-    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
-    uint32_t grouping_factor, uint32_t iteration, uint32_t lwe_offset,
-    uint32_t lwe_chunk_size, int8_t *device_mem,
-    uint64_t device_memory_size_per_block) {
+__global__ void __launch_bounds__(params::degree / params::opt)
+    device_multi_bit_programmable_bootstrap_accumulate_step_two(
+        Torus *lwe_array_out, const Torus *__restrict__ lwe_output_indexes,
+        const double2 *__restrict__ keybundle_array, Torus *global_accumulator,
+        double2 *global_accumulator_fft, uint32_t lwe_dimension,
+        uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
+        uint32_t grouping_factor, uint32_t iteration, uint32_t lwe_offset,
+        uint32_t lwe_chunk_size, int8_t *device_mem,
+        uint64_t device_memory_size_per_block) {
   // We use shared memory for the polynomials that are used often during the
   // bootstrap, since shared memory is kept in L1 cache and accessing it is
   // much faster than global memory
@@ -18,18 +18,20 @@
 #include <vector>
 
 template <typename Torus, class params, sharedMemDegree SMD>
-__global__ void device_multi_bit_programmable_bootstrap_tbc_accumulate(
-    Torus *lwe_array_out, const Torus *__restrict__ lwe_output_indexes,
-    const Torus *__restrict__ lut_vector,
-    const Torus *__restrict__ lut_vector_indexes,
-    const Torus *__restrict__ lwe_array_in,
-    const Torus *__restrict__ lwe_input_indexes,
-    const double2 *__restrict__ keybundle_array, double2 *join_buffer,
-    Torus *global_accumulator, uint32_t lwe_dimension, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
-    uint32_t grouping_factor, uint32_t lwe_offset, uint32_t lwe_chunk_size,
-    uint32_t keybundle_size_per_input, int8_t *device_mem,
-    uint64_t device_memory_size_per_block, bool support_dsm) {
+__global__ void __launch_bounds__(params::degree / params::opt)
+    device_multi_bit_programmable_bootstrap_tbc_accumulate(
+        Torus *lwe_array_out, const Torus *__restrict__ lwe_output_indexes,
+        const Torus *__restrict__ lut_vector,
+        const Torus *__restrict__ lut_vector_indexes,
+        const Torus *__restrict__ lwe_array_in,
+        const Torus *__restrict__ lwe_input_indexes,
+        const double2 *__restrict__ keybundle_array, double2 *join_buffer,
+        Torus *global_accumulator, uint32_t lwe_dimension,
+        uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
+        uint32_t level_count, uint32_t grouping_factor, uint32_t lwe_offset,
+        uint32_t lwe_chunk_size, uint32_t keybundle_size_per_input,
+        int8_t *device_mem, uint64_t device_memory_size_per_block,
+        bool support_dsm) {
 
   cluster_group cluster = this_cluster();
 
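One usage note, for illustration only: __launch_bounds__ also accepts an optional second argument, the minimum number of blocks the kernel should be able to keep resident per multiprocessor, which constrains register usage further. The variant below is hypothetical and is not part of this commit.

// Hypothetical variant (not in this commit): in addition to capping the
// block size, ask the compiler to size register usage so that at least
// two blocks can stay resident on each SM.
template <class params>
__global__ void __launch_bounds__(params::degree / params::opt, 2)
    example_kernel_min_blocks(float *out) {
  out[blockIdx.x * blockDim.x + threadIdx.x] = 0.0f;
}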
