diff --git a/backends/tfhe-cuda-backend/cuda/include/integer.h b/backends/tfhe-cuda-backend/cuda/include/integer.h index f5f8124aee..c7e07c9efb 100644 --- a/backends/tfhe-cuda-backend/cuda/include/integer.h +++ b/backends/tfhe-cuda-backend/cuda/include/integer.h @@ -3,6 +3,7 @@ #include "bootstrap.h" #include "bootstrap_multibit.h" +#include "pbs/bootstrap.cuh" #include #include #include @@ -297,31 +298,11 @@ template struct int_radix_lut { (params.glwe_dimension + 1) * params.polynomial_size * sizeof(Torus); /////////////// - // PBS - if (params.pbs_type == MULTI_BIT) { - // Only 64 bits is supported - static_assert( - sizeof(Torus) == 8, - "Error (GPU multi bit PBS): only 64 bits Torus is supported"); - scratch_cuda_multi_bit_pbs_64( - stream, &pbs_buffer, params.small_lwe_dimension, - params.glwe_dimension, params.polynomial_size, params.pbs_level, - params.grouping_factor, num_radix_blocks, - cuda_get_max_shared_memory(stream->gpu_index), allocate_gpu_memory); - } else { - // Classic - // We only use low latency for classic mode - if (sizeof(Torus) == sizeof(uint32_t)) - scratch_cuda_bootstrap_low_latency_32( - stream, &pbs_buffer, params.glwe_dimension, params.polynomial_size, - params.pbs_level, num_radix_blocks, - cuda_get_max_shared_memory(stream->gpu_index), allocate_gpu_memory); - else - scratch_cuda_bootstrap_low_latency_64( - stream, &pbs_buffer, params.glwe_dimension, params.polynomial_size, - params.pbs_level, num_radix_blocks, - cuda_get_max_shared_memory(stream->gpu_index), allocate_gpu_memory); - } + execute_scratch_pbs( + stream, &pbs_buffer, params.glwe_dimension, params.small_lwe_dimension, + params.polynomial_size, params.pbs_level, params.grouping_factor, + num_radix_blocks, cuda_get_max_shared_memory(stream->gpu_index), + params.pbs_type, allocate_gpu_memory); if (allocate_gpu_memory) { // Allocate LUT diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh index 
f257eeaab8..8da2b2aaa3 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh @@ -1,88 +1,17 @@ #ifndef CUDA_INTEGER_CUH #define CUDA_INTEGER_CUH +#include "bootstrap.h" #include "crypto/keyswitch.cuh" #include "device.h" #include "integer.h" #include "integer/scalar_addition.cuh" #include "linear_algebra.h" #include "linearalgebra/addition.cuh" -#include "pbs/bootstrap_low_latency.cuh" -#include "pbs/bootstrap_multibit.cuh" #include "polynomial/functions.cuh" #include "utils/kernel_dimensions.cuh" #include -template -void execute_pbs(cuda_stream_t *stream, Torus *lwe_array_out, - Torus *lwe_output_indexes, Torus *lut_vector, - Torus *lut_vector_indexes, Torus *lwe_array_in, - Torus *lwe_input_indexes, void *bootstrapping_key, - int8_t *pbs_buffer, uint32_t glwe_dimension, - uint32_t lwe_dimension, uint32_t polynomial_size, - uint32_t base_log, uint32_t level_count, - uint32_t grouping_factor, uint32_t input_lwe_ciphertext_count, - uint32_t num_luts, uint32_t lwe_idx, - uint32_t max_shared_memory, PBS_TYPE pbs_type) { - if (sizeof(Torus) == sizeof(uint32_t)) { - // 32 bits - switch (pbs_type) { - case MULTI_BIT: - printf("multibit\n"); - printf("Error: 32-bit multibit PBS is not supported.\n"); - break; - case LOW_LAT: - cuda_bootstrap_low_latency_lwe_ciphertext_vector_32( - stream, lwe_array_out, lwe_output_indexes, lut_vector, - lut_vector_indexes, lwe_array_in, lwe_input_indexes, - bootstrapping_key, pbs_buffer, lwe_dimension, glwe_dimension, - polynomial_size, base_log, level_count, input_lwe_ciphertext_count, - num_luts, lwe_idx, max_shared_memory); - break; - case AMORTIZED: - cuda_bootstrap_amortized_lwe_ciphertext_vector_32( - stream, lwe_array_out, lwe_output_indexes, lut_vector, - lut_vector_indexes, lwe_array_in, lwe_input_indexes, - bootstrapping_key, pbs_buffer, lwe_dimension, glwe_dimension, - polynomial_size, base_log, level_count, input_lwe_ciphertext_count, - num_luts, 
lwe_idx, max_shared_memory); - break; - default: - break; - } - } else { - // 64 bits - switch (pbs_type) { - case MULTI_BIT: - cuda_multi_bit_pbs_lwe_ciphertext_vector_64( - stream, lwe_array_out, lwe_output_indexes, lut_vector, - lut_vector_indexes, lwe_array_in, lwe_input_indexes, - bootstrapping_key, pbs_buffer, lwe_dimension, glwe_dimension, - polynomial_size, grouping_factor, base_log, level_count, - input_lwe_ciphertext_count, num_luts, lwe_idx, max_shared_memory); - break; - case LOW_LAT: - cuda_bootstrap_low_latency_lwe_ciphertext_vector_64( - stream, lwe_array_out, lwe_output_indexes, lut_vector, - lut_vector_indexes, lwe_array_in, lwe_input_indexes, - bootstrapping_key, pbs_buffer, lwe_dimension, glwe_dimension, - polynomial_size, base_log, level_count, input_lwe_ciphertext_count, - num_luts, lwe_idx, max_shared_memory); - break; - case AMORTIZED: - cuda_bootstrap_amortized_lwe_ciphertext_vector_64( - stream, lwe_array_out, lwe_output_indexes, lut_vector, - lut_vector_indexes, lwe_array_in, lwe_input_indexes, - bootstrapping_key, pbs_buffer, lwe_dimension, glwe_dimension, - polynomial_size, base_log, level_count, input_lwe_ciphertext_count, - num_luts, lwe_idx, max_shared_memory); - break; - default: - break; - } - } -} - // function rotates right radix ciphertext with specific value // grid is one dimensional // blockIdx.x represents x_th block of radix ciphertext @@ -187,12 +116,12 @@ __host__ void integer_radix_apply_univariate_lookup_table_kb( lut->lwe_indexes, ksk, big_lwe_dimension, small_lwe_dimension, ks_base_log, ks_level, num_radix_blocks); - execute_pbs(stream, lwe_array_out, lut->lwe_indexes, lut->lut, - lut->lut_indexes, lut->tmp_lwe_after_ks, lut->lwe_indexes, bsk, - lut->pbs_buffer, glwe_dimension, small_lwe_dimension, - polynomial_size, pbs_base_log, pbs_level, grouping_factor, - num_radix_blocks, 1, 0, - cuda_get_max_shared_memory(stream->gpu_index), pbs_type); + execute_pbs(stream, lwe_array_out, lut->lwe_indexes, lut->lut, + 
lut->lut_indexes, lut->tmp_lwe_after_ks, lut->lwe_indexes, + bsk, lut->pbs_buffer, glwe_dimension, small_lwe_dimension, + polynomial_size, pbs_base_log, pbs_level, grouping_factor, + num_radix_blocks, 1, 0, + cuda_get_max_shared_memory(stream->gpu_index), pbs_type); } template @@ -471,31 +400,12 @@ void scratch_cuda_full_propagation( uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory) { - // PBS int8_t *pbs_buffer; - if (pbs_type == MULTI_BIT) { - uint32_t lwe_chunk_size = get_average_lwe_chunk_size( - lwe_dimension, pbs_level, glwe_dimension, num_radix_blocks); - // Only 64 bits is supported - scratch_cuda_multi_bit_pbs_64(stream, &pbs_buffer, lwe_dimension, - glwe_dimension, polynomial_size, pbs_level, - grouping_factor, num_radix_blocks, - cuda_get_max_shared_memory(stream->gpu_index), - allocate_gpu_memory, lwe_chunk_size); - } else { - // Classic - // We only use low latency for classic mode - if (sizeof(Torus) == sizeof(uint32_t)) - scratch_cuda_bootstrap_low_latency_32( - stream, &pbs_buffer, glwe_dimension, polynomial_size, pbs_level, - num_radix_blocks, cuda_get_max_shared_memory(stream->gpu_index), - allocate_gpu_memory); - else - scratch_cuda_bootstrap_low_latency_64( - stream, &pbs_buffer, glwe_dimension, polynomial_size, pbs_level, - num_radix_blocks, cuda_get_max_shared_memory(stream->gpu_index), - allocate_gpu_memory); - } + execute_scratch_pbs(stream, &pbs_buffer, glwe_dimension, lwe_dimension, + polynomial_size, pbs_level, grouping_factor, + num_radix_blocks, + cuda_get_max_shared_memory(stream->gpu_index), + pbs_type, allocate_gpu_memory); // LUT Torus *lut_buffer; diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh index 9303e213a4..f02d42b921 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh @@ -7,15 +7,11 @@ #endif 
#include "bootstrap.h" -#include "bootstrap_multibit.h" #include "crypto/keyswitch.cuh" #include "device.h" #include "integer.h" #include "integer/integer.cuh" #include "linear_algebra.h" -#include "pbs/bootstrap_amortized.cuh" -#include "pbs/bootstrap_low_latency.cuh" -#include "pbs/bootstrap_multibit.cuh" #include "utils/helper.cuh" #include "utils/kernel_dimensions.cuh" #include diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/bootstrap.cu b/backends/tfhe-cuda-backend/cuda/src/pbs/bootstrap.cu index 73ade26d94..81cc429136 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/bootstrap.cu +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/bootstrap.cu @@ -1 +1 @@ -#include "bootstrapping_key.cuh" +#include "bootstrap.cuh" diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/bootstrap.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/bootstrap.cuh new file mode 100644 index 0000000000..6da130412a --- /dev/null +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/bootstrap.cuh @@ -0,0 +1,136 @@ +#include "../../include/bootstrap.h" +#include "../../include/device.h" +#include "../include/device.h" +#include "bootstrap_low_latency.cuh" +#include "bootstrap_multibit.cuh" + +template +void execute_pbs(cuda_stream_t *stream, Torus *lwe_array_out, + Torus *lwe_output_indexes, Torus *lut_vector, + Torus *lut_vector_indexes, Torus *lwe_array_in, + Torus *lwe_input_indexes, void *bootstrapping_key, + int8_t *pbs_buffer, uint32_t glwe_dimension, + uint32_t lwe_dimension, uint32_t polynomial_size, + uint32_t base_log, uint32_t level_count, + uint32_t grouping_factor, uint32_t input_lwe_ciphertext_count, + uint32_t num_luts, uint32_t lwe_idx, + uint32_t max_shared_memory, PBS_TYPE pbs_type) { + switch (sizeof(Torus)) { + case sizeof(uint32_t): + // 32 bits + switch (pbs_type) { + case MULTI_BIT: + PANIC("Error: 32-bit multibit PBS is not supported.\n"); + case LOW_LAT: + cuda_bootstrap_low_latency_lwe_ciphertext_vector_32( + stream, lwe_array_out, lwe_output_indexes, lut_vector, + 
lut_vector_indexes, lwe_array_in, lwe_input_indexes, + bootstrapping_key, pbs_buffer, lwe_dimension, glwe_dimension, + polynomial_size, base_log, level_count, input_lwe_ciphertext_count, + num_luts, lwe_idx, max_shared_memory); + break; + case AMORTIZED: + cuda_bootstrap_amortized_lwe_ciphertext_vector_32( + stream, lwe_array_out, lwe_output_indexes, lut_vector, + lut_vector_indexes, lwe_array_in, lwe_input_indexes, + bootstrapping_key, pbs_buffer, lwe_dimension, glwe_dimension, + polynomial_size, base_log, level_count, input_lwe_ciphertext_count, + num_luts, lwe_idx, max_shared_memory); + break; + default: + break; + } + break; + case sizeof(uint64_t): + // 64 bits + switch (pbs_type) { + case MULTI_BIT: + cuda_multi_bit_pbs_lwe_ciphertext_vector_64( + stream, lwe_array_out, lwe_output_indexes, lut_vector, + lut_vector_indexes, lwe_array_in, lwe_input_indexes, + bootstrapping_key, pbs_buffer, lwe_dimension, glwe_dimension, + polynomial_size, grouping_factor, base_log, level_count, + input_lwe_ciphertext_count, num_luts, lwe_idx, max_shared_memory); + break; + case LOW_LAT: + cuda_bootstrap_low_latency_lwe_ciphertext_vector_64( + stream, lwe_array_out, lwe_output_indexes, lut_vector, + lut_vector_indexes, lwe_array_in, lwe_input_indexes, + bootstrapping_key, pbs_buffer, lwe_dimension, glwe_dimension, + polynomial_size, base_log, level_count, input_lwe_ciphertext_count, + num_luts, lwe_idx, max_shared_memory); + break; + case AMORTIZED: + cuda_bootstrap_amortized_lwe_ciphertext_vector_64( + stream, lwe_array_out, lwe_output_indexes, lut_vector, + lut_vector_indexes, lwe_array_in, lwe_input_indexes, + bootstrapping_key, pbs_buffer, lwe_dimension, glwe_dimension, + polynomial_size, base_log, level_count, input_lwe_ciphertext_count, + num_luts, lwe_idx, max_shared_memory); + break; + default: + PANIC("Error: unsupported cuda PBS type."); + } + break; + default: + PANIC("Cuda error: unsupported modulus size: only 32 and 64 bit integer " + "moduli are supported."); + } 
+} + +template +void execute_scratch_pbs(cuda_stream_t *stream, int8_t **pbs_buffer, + uint32_t glwe_dimension, uint32_t lwe_dimension, + uint32_t polynomial_size, uint32_t level_count, + uint32_t grouping_factor, + uint32_t input_lwe_ciphertext_count, + uint32_t max_shared_memory, PBS_TYPE pbs_type, + bool allocate_gpu_memory) { + switch (sizeof(Torus)) { + case sizeof(uint32_t): + // 32 bits + switch (pbs_type) { + case MULTI_BIT: + PANIC("Error: 32-bit multibit PBS is not supported.\n"); + case LOW_LAT: + scratch_cuda_bootstrap_low_latency_32( + stream, pbs_buffer, glwe_dimension, polynomial_size, level_count, + input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory); + break; + case AMORTIZED: + scratch_cuda_bootstrap_amortized_32( + stream, pbs_buffer, glwe_dimension, polynomial_size, + input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory); + break; + default: + PANIC("Error: unsupported cuda PBS type."); + } + break; + case sizeof(uint64_t): + // 64 bits + switch (pbs_type) { + case MULTI_BIT: + scratch_cuda_multi_bit_pbs_64( + stream, pbs_buffer, lwe_dimension, glwe_dimension, polynomial_size, + level_count, grouping_factor, input_lwe_ciphertext_count, + max_shared_memory, allocate_gpu_memory); + break; + case LOW_LAT: + scratch_cuda_bootstrap_low_latency_64( + stream, pbs_buffer, glwe_dimension, polynomial_size, level_count, + input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory); + break; + case AMORTIZED: + scratch_cuda_bootstrap_amortized_64( + stream, pbs_buffer, glwe_dimension, polynomial_size, + input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory); + break; + default: + PANIC("Error: unsupported cuda PBS type."); + } + break; + default: + PANIC("Cuda error: unsupported modulus size: only 32 and 64 bit integer " + "moduli are supported."); + } +} diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/bootstrap_fast_multibit.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/bootstrap_fast_multibit.cuh 
index 2b10bd6ec8..b74eb989e1 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/bootstrap_fast_multibit.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/bootstrap_fast_multibit.cuh @@ -3,7 +3,6 @@ #include "bootstrap.h" #include "bootstrap_multibit.cuh" -#include "bootstrap_multibit.h" #include "cooperative_groups.h" #include "crypto/gadget.cuh" #include "crypto/ggsw.cuh" diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/bootstrapping_key.cu b/backends/tfhe-cuda-backend/cuda/src/pbs/bootstrapping_key.cu new file mode 100644 index 0000000000..821e12b6f8 --- /dev/null +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/bootstrapping_key.cu @@ -0,0 +1,78 @@ +#include "bootstrapping_key.cuh" + +void cuda_convert_lwe_bootstrap_key_32(void *dest, void *src, + cuda_stream_t *stream, + uint32_t input_lwe_dim, + uint32_t glwe_dim, uint32_t level_count, + uint32_t polynomial_size) { + uint32_t total_polynomials = + input_lwe_dim * (glwe_dim + 1) * (glwe_dim + 1) * level_count; + cuda_convert_lwe_bootstrap_key( + (double2 *)dest, (int32_t *)src, stream, input_lwe_dim, glwe_dim, + level_count, polynomial_size, total_polynomials); +} + +void cuda_convert_lwe_bootstrap_key_64(void *dest, void *src, + cuda_stream_t *stream, + uint32_t input_lwe_dim, + uint32_t glwe_dim, uint32_t level_count, + uint32_t polynomial_size) { + uint32_t total_polynomials = + input_lwe_dim * (glwe_dim + 1) * (glwe_dim + 1) * level_count; + cuda_convert_lwe_bootstrap_key( + (double2 *)dest, (int64_t *)src, stream, input_lwe_dim, glwe_dim, + level_count, polynomial_size, total_polynomials); +} + +void cuda_convert_lwe_multi_bit_bootstrap_key_64( + void *dest, void *src, cuda_stream_t *stream, uint32_t input_lwe_dim, + uint32_t glwe_dim, uint32_t level_count, uint32_t polynomial_size, + uint32_t grouping_factor) { + uint32_t total_polynomials = input_lwe_dim * (glwe_dim + 1) * (glwe_dim + 1) * + level_count * (1 << grouping_factor) / + grouping_factor; + size_t buffer_size = total_polynomials *
polynomial_size * sizeof(uint64_t); + + cuda_memcpy_async_to_gpu((uint64_t *)dest, (uint64_t *)src, buffer_size, + stream); +} + +// We need these lines so the compiler knows how to specialize these functions +template __device__ uint64_t *get_ith_mask_kth_block(uint64_t *ptr, int i, + int k, int level, + uint32_t polynomial_size, + int glwe_dimension, + uint32_t level_count); +template __device__ uint32_t *get_ith_mask_kth_block(uint32_t *ptr, int i, + int k, int level, + uint32_t polynomial_size, + int glwe_dimension, + uint32_t level_count); +template __device__ double2 *get_ith_mask_kth_block(double2 *ptr, int i, int k, + int level, + uint32_t polynomial_size, + int glwe_dimension, + uint32_t level_count); +template __device__ uint64_t *get_ith_body_kth_block(uint64_t *ptr, int i, + int k, int level, + uint32_t polynomial_size, + int glwe_dimension, + uint32_t level_count); +template __device__ uint32_t *get_ith_body_kth_block(uint32_t *ptr, int i, + int k, int level, + uint32_t polynomial_size, + int glwe_dimension, + uint32_t level_count); +template __device__ double2 *get_ith_body_kth_block(double2 *ptr, int i, int k, + int level, + uint32_t polynomial_size, + int glwe_dimension, + uint32_t level_count); + +template __device__ uint64_t *get_multi_bit_ith_lwe_gth_group_kth_block( + uint64_t *ptr, int g, int i, int k, int level, uint32_t grouping_factor, + uint32_t polynomial_size, uint32_t glwe_dimension, uint32_t level_count); + +template __device__ double2 *get_multi_bit_ith_lwe_gth_group_kth_block( + double2 *ptr, int g, int i, int k, int level, uint32_t grouping_factor, + uint32_t polynomial_size, uint32_t glwe_dimension, uint32_t level_count); diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/bootstrapping_key.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/bootstrapping_key.cuh index d009b6e7c6..7c1a3a9f5f 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/bootstrapping_key.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/bootstrapping_key.cuh @@ 
-244,43 +244,6 @@ void cuda_convert_lwe_bootstrap_key(double2 *dest, ST *src, free(h_bsk); } -void cuda_convert_lwe_bootstrap_key_32(void *dest, void *src, - cuda_stream_t *stream, - uint32_t input_lwe_dim, - uint32_t glwe_dim, uint32_t level_count, - uint32_t polynomial_size) { - uint32_t total_polynomials = - input_lwe_dim * (glwe_dim + 1) * (glwe_dim + 1) * level_count; - cuda_convert_lwe_bootstrap_key( - (double2 *)dest, (int32_t *)src, stream, input_lwe_dim, glwe_dim, - level_count, polynomial_size, total_polynomials); -} - -void cuda_convert_lwe_bootstrap_key_64(void *dest, void *src, - cuda_stream_t *stream, - uint32_t input_lwe_dim, - uint32_t glwe_dim, uint32_t level_count, - uint32_t polynomial_size) { - uint32_t total_polynomials = - input_lwe_dim * (glwe_dim + 1) * (glwe_dim + 1) * level_count; - cuda_convert_lwe_bootstrap_key( - (double2 *)dest, (int64_t *)src, stream, input_lwe_dim, glwe_dim, - level_count, polynomial_size, total_polynomials); -} - -void cuda_convert_lwe_multi_bit_bootstrap_key_64( - void *dest, void *src, cuda_stream_t *stream, uint32_t input_lwe_dim, - uint32_t glwe_dim, uint32_t level_count, uint32_t polynomial_size, - uint32_t grouping_factor) { - uint32_t total_polynomials = input_lwe_dim * (glwe_dim + 1) * (glwe_dim + 1) * - level_count * (1 << grouping_factor) / - grouping_factor; - size_t buffer_size = total_polynomials * polynomial_size * sizeof(uint64_t); - - cuda_memcpy_async_to_gpu((uint64_t *)dest, (uint64_t *)src, buffer_size, - stream); -} - void cuda_fourier_polynomial_mul(void *_input1, void *_input2, void *_output, cuda_stream_t *stream, uint32_t polynomial_size, @@ -458,43 +421,4 @@ void cuda_fourier_polynomial_mul(void *_input1, void *_input2, void *_output, cuda_drop_async(buffer, stream); } -// We need these lines so the compiler knows how to specialize these functions -template __device__ uint64_t *get_ith_mask_kth_block(uint64_t *ptr, int i, - int k, int level, - uint32_t polynomial_size, - int glwe_dimension, 
- uint32_t level_count); -template __device__ uint32_t *get_ith_mask_kth_block(uint32_t *ptr, int i, - int k, int level, - uint32_t polynomial_size, - int glwe_dimension, - uint32_t level_count); -template __device__ double2 *get_ith_mask_kth_block(double2 *ptr, int i, int k, - int level, - uint32_t polynomial_size, - int glwe_dimension, - uint32_t level_count); -template __device__ uint64_t *get_ith_body_kth_block(uint64_t *ptr, int i, - int k, int level, - uint32_t polynomial_size, - int glwe_dimension, - uint32_t level_count); -template __device__ uint32_t *get_ith_body_kth_block(uint32_t *ptr, int i, - int k, int level, - uint32_t polynomial_size, - int glwe_dimension, - uint32_t level_count); -template __device__ double2 *get_ith_body_kth_block(double2 *ptr, int i, int k, - int level, - uint32_t polynomial_size, - int glwe_dimension, - uint32_t level_count); - -template __device__ uint64_t *get_multi_bit_ith_lwe_gth_group_kth_block( - uint64_t *ptr, int g, int i, int k, int level, uint32_t grouping_factor, - uint32_t polynomial_size, uint32_t glwe_dimension, uint32_t level_count); - -template __device__ double2 *get_multi_bit_ith_lwe_gth_group_kth_block( - double2 *ptr, int g, int i, int k, int level, uint32_t grouping_factor, - uint32_t polynomial_size, uint32_t glwe_dimension, uint32_t level_count); #endif // CNCRT_BSK_H