Skip to content

Commit

Permalink
feat(gpu): Implement benchmark for low latency and amortized PBS in a…
Browse files Browse the repository at this point in the history
…ll variants and the FFT
  • Loading branch information
pdroalves committed Mar 8, 2024
1 parent dd98724 commit 89928fa
Show file tree
Hide file tree
Showing 17 changed files with 1,343 additions and 1,152 deletions.
9 changes: 8 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -370,6 +370,13 @@ build_concrete_csprng: install_rs_build_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) build --profile $(CARGO_PROFILE) \
--features=$(TARGET_ARCH_FEATURE) -p concrete-csprng --all-targets

.PHONY: build_cuda_backend # Build the full CUDA backend, including custom tools
build_cuda_backend:
mkdir -p "$(TFHECUDA_BUILD)" && \
cd "$(TFHECUDA_BUILD)" && \
cmake .. -DCMAKE_BUILD_TYPE=Release -DTFHE_CUDA_BACKEND_BUILD_TESTS=ON -DTFHE_CUDA_BACKEND_BUILD_BENCHMARKS=ON && \
make -j

.PHONY: test_core_crypto # Run the tests of the core_crypto module including experimental ones
test_core_crypto: install_rs_build_toolchain install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
Expand Down Expand Up @@ -893,7 +900,7 @@ sha256_bool: install_rs_check_toolchain
pcc: no_tfhe_typo no_dbg_log check_fmt lint_doc clippy_all check_compile_tests

.PHONY: pcc_gpu # pcc stands for pre commit checks for GPU compilation
pcc_gpu: clippy_gpu clippy_cuda_backend check_compile_tests_gpu
pcc_gpu: clippy_gpu clippy_cuda_backend check_compile_tests_gpu build_cuda_backend

.PHONY: fpcc # pcc stands for pre commit checks, the f stands for fast
fpcc: no_tfhe_typo no_dbg_log check_fmt lint_doc clippy_fast check_compile_tests
Expand Down
8 changes: 4 additions & 4 deletions backends/tfhe-cuda-backend/cuda/format_tfhe_cuda_backend.sh
Original file line number Diff line number Diff line change
Expand Up @@ -6,14 +6,14 @@ while getopts ":c" option; do
case $option in
c)
# code to execute when flag1 is provided
find ./{include,src,tests/include,tests/src} -iregex '^.*\.\(cpp\|cu\|h\|cuh\)$' -print | xargs clang-format-15 -i -style='file' --dry-run --Werror
find ./{include,src,tests_and_benchmarks/tests,tests_and_benchmarks/benchmarks} -iregex '^.*\.\(cpp\|cu\|h\|cuh\)$' -print | xargs clang-format-15 -i -style='file' --dry-run --Werror
cmake-format -i CMakeLists.txt -c .cmake-format-config.py
find ./{include,src,tests/include,tests/src} -type f -name "CMakeLists.txt" | xargs -I % sh -c 'cmake-format -i % -c .cmake-format-config.py'
find ./{include,src,tests_and_benchmarks/tests,tests_and_benchmarks/benchmarks} -type f -name "CMakeLists.txt" | xargs -I % sh -c 'cmake-format -i % -c .cmake-format-config.py'
git diff --exit-code
exit
;;
esac
done
find ./{include,src,tests/include,tests/src} -iregex '^.*\.\(cpp\|cu\|h\|cuh\)$' -print | xargs clang-format-15 -i -style='file'
find ./{include,src,tests_and_benchmarks/tests,tests_and_benchmarks/benchmarks} -iregex '^.*\.\(cpp\|cu\|h\|cuh\)$' -print | xargs clang-format-15 -i -style='file'
cmake-format -i CMakeLists.txt -c .cmake-format-config.py
find ./{include,src,tests/include,tests/src} -type f -name "CMakeLists.txt" | xargs -I % sh -c 'cmake-format -i % -c .cmake-format-config.py'
find ./{include,src,tests_and_benchmarks/tests,tests_and_benchmarks/benchmarks} -type f -name "CMakeLists.txt" | xargs -I % sh -c 'cmake-format -i % -c .cmake-format-config.py'
39 changes: 39 additions & 0 deletions backends/tfhe-cuda-backend/cuda/include/bootstrap.h
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,45 @@ uint64_t get_buffer_size_bootstrap_low_latency_64(
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory);
}

template <typename Torus>
bool has_support_to_cuda_bootstrap_fast_low_latency(uint32_t glwe_dimension,
uint32_t polynomial_size,
uint32_t level_count,
uint32_t num_samples,
uint32_t max_shared_memory);

template <typename Torus>
void cuda_bootstrap_fast_low_latency_lwe_ciphertext_vector(
cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_output_indexes,
Torus *lut_vector, Torus *lut_vector_indexes, Torus *lwe_array_in,
Torus *lwe_input_indexes, double2 *bootstrapping_key, int8_t *pbs_buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory);

template <typename Torus>
void cuda_bootstrap_low_latency_lwe_ciphertext_vector(
cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_output_indexes,
Torus *lut_vector, Torus *lut_vector_indexes, Torus *lwe_array_in,
Torus *lwe_input_indexes, double2 *bootstrapping_key, int8_t *pbs_buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory);

template <typename Torus, typename STorus>
void scratch_cuda_fast_bootstrap_low_latency(
cuda_stream_t *stream, int8_t **pbs_buffer, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
bool allocate_gpu_memory);

template <typename Torus, typename STorus>
void scratch_cuda_bootstrap_low_latency(
cuda_stream_t *stream, int8_t **pbs_buffer, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
bool allocate_gpu_memory);

#ifdef __CUDACC__
__device__ inline int get_start_ith_ggsw(int i, uint32_t polynomial_size,
int glwe_dimension,
Expand Down
54 changes: 50 additions & 4 deletions backends/tfhe-cuda-backend/cuda/include/bootstrap_multibit.h
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,25 @@
#include <cstdint>

extern "C" {

bool has_support_to_cuda_bootstrap_fast_multi_bit(uint32_t glwe_dimension,
uint32_t polynomial_size,
uint32_t level_count,
uint32_t num_samples,
uint32_t max_shared_memory);

void cuda_convert_lwe_multi_bit_bootstrap_key_64(
void *dest, void *src, cuda_stream_t *stream, uint32_t input_lwe_dim,
uint32_t glwe_dim, uint32_t level_count, uint32_t polynomial_size,
uint32_t grouping_factor);

void scratch_cuda_multi_bit_pbs_64(
cuda_stream_t *stream, int8_t **pbs_buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t grouping_factor, uint32_t input_lwe_ciphertext_count,
uint32_t max_shared_memory, bool allocate_gpu_memory,
uint32_t chunk_size = 0);

void cuda_multi_bit_pbs_lwe_ciphertext_vector_64(
cuda_stream_t *stream, void *lwe_array_out, void *lwe_output_indexes,
void *lut_vector, void *lut_vector_indexes, void *lwe_array_in,
Expand All @@ -18,15 +32,47 @@ void cuda_multi_bit_pbs_lwe_ciphertext_vector_64(
uint32_t num_samples, uint32_t num_luts, uint32_t lwe_idx,
uint32_t max_shared_memory, uint32_t chunk_size = 0);

void scratch_cuda_multi_bit_pbs_64(
void cleanup_cuda_multi_bit_pbs(cuda_stream_t *stream, int8_t **pbs_buffer);
}

template <typename Torus, typename STorus>
void scratch_cuda_fast_multi_bit_pbs(
cuda_stream_t *stream, int8_t **pbs_buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t grouping_factor, uint32_t input_lwe_ciphertext_count,
uint32_t max_shared_memory, bool allocate_gpu_memory,
uint32_t chunk_size = 0);
uint32_t lwe_chunk_size = 0);

template <typename Torus>
void cuda_fast_multi_bit_pbs_lwe_ciphertext_vector(
cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_output_indexes,
Torus *lut_vector, Torus *lut_vector_indexes, Torus *lwe_array_in,
Torus *lwe_input_indexes, Torus *bootstrapping_key, int8_t *pbs_buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t grouping_factor, uint32_t base_log, uint32_t level_count,
uint32_t num_samples, uint32_t num_luts, uint32_t lwe_idx,
uint32_t max_shared_memory, uint32_t lwe_chunk_size = 0);

template <typename Torus, typename STorus>
void scratch_cuda_multi_bit_pbs(cuda_stream_t *stream, int8_t **pbs_buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t level_count,
uint32_t grouping_factor,
uint32_t input_lwe_ciphertext_count,
uint32_t max_shared_memory,
bool allocate_gpu_memory,
uint32_t lwe_chunk_size = 0);

template <typename Torus>
void cuda_multi_bit_pbs_lwe_ciphertext_vector(
cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_output_indexes,
Torus *lut_vector, Torus *lut_vector_indexes, Torus *lwe_array_in,
Torus *lwe_input_indexes, Torus *bootstrapping_key, int8_t *pbs_buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t grouping_factor, uint32_t base_log, uint32_t level_count,
uint32_t num_samples, uint32_t num_luts, uint32_t lwe_idx,
uint32_t max_shared_memory, uint32_t lwe_chunk_size = 0);

void cleanup_cuda_multi_bit_pbs(cuda_stream_t *stream, int8_t **pbs_buffer);
}
#ifdef __CUDACC__
__host__ uint32_t get_lwe_chunk_size(uint32_t lwe_dimension,
uint32_t level_count,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -436,12 +436,12 @@ __host__ bool verify_cuda_bootstrap_fast_low_latency_grid_size(
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&max_active_blocks_per_sm,
(void *)device_bootstrap_fast_low_latency<Torus, params, PARTIALSM>,
thds, 0);
thds, partial_sm);
} else {
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&max_active_blocks_per_sm,
(void *)device_bootstrap_fast_low_latency<Torus, params, FULLSM>, thds,
0);
full_sm);
}

// Get the number of streaming multiprocessors
Expand All @@ -450,4 +450,46 @@ __host__ bool verify_cuda_bootstrap_fast_low_latency_grid_size(
return number_of_blocks <= max_active_blocks_per_sm * number_of_sm;
}

// Verify if the grid size for the low latency kernel satisfies the cooperative
// group constraints
template <typename Torus>
__host__ bool supports_cooperative_groups_on_lowlat_pbs(
int glwe_dimension, int polynomial_size, int level_count, int num_samples,
uint32_t max_shared_memory) {
switch (polynomial_size) {
case 256:
return verify_cuda_bootstrap_fast_low_latency_grid_size<
Torus, AmortizedDegree<256>>(glwe_dimension, level_count, num_samples,
max_shared_memory);
case 512:
return verify_cuda_bootstrap_fast_low_latency_grid_size<
Torus, AmortizedDegree<512>>(glwe_dimension, level_count, num_samples,
max_shared_memory);
case 1024:
return verify_cuda_bootstrap_fast_low_latency_grid_size<
Torus, AmortizedDegree<1024>>(glwe_dimension, level_count, num_samples,
max_shared_memory);
case 2048:
return verify_cuda_bootstrap_fast_low_latency_grid_size<
Torus, AmortizedDegree<2048>>(glwe_dimension, level_count, num_samples,
max_shared_memory);
case 4096:
return verify_cuda_bootstrap_fast_low_latency_grid_size<
Torus, AmortizedDegree<4096>>(glwe_dimension, level_count, num_samples,
max_shared_memory);
case 8192:
return verify_cuda_bootstrap_fast_low_latency_grid_size<
Torus, AmortizedDegree<8192>>(glwe_dimension, level_count, num_samples,
max_shared_memory);
case 16384:
return verify_cuda_bootstrap_fast_low_latency_grid_size<
Torus, AmortizedDegree<16384>>(glwe_dimension, level_count, num_samples,
max_shared_memory);
default:
PANIC("Cuda error (low latency PBS): unsupported polynomial size. "
"Supported N's are powers of two"
" in the interval [256..16384].")
}
}

#endif // LOWLAT_FAST_PBS_H
Original file line number Diff line number Diff line change
Expand Up @@ -318,4 +318,46 @@ verify_cuda_bootstrap_fast_multi_bit_grid_size(int glwe_dimension,
cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0);
return number_of_blocks <= max_active_blocks_per_sm * number_of_sm;
}

// Verify if the grid size for the multi-bit kernel satisfies the cooperative
// group constraints
template <typename Torus>
__host__ bool supports_cooperative_groups_on_multibit_pbs(
int glwe_dimension, int polynomial_size, int level_count, int num_samples,
uint32_t max_shared_memory) {
switch (polynomial_size) {
case 256:
return verify_cuda_bootstrap_fast_multi_bit_grid_size<Torus,
AmortizedDegree<256>>(
glwe_dimension, level_count, num_samples, max_shared_memory);
case 512:
return verify_cuda_bootstrap_fast_multi_bit_grid_size<Torus,
AmortizedDegree<512>>(
glwe_dimension, level_count, num_samples, max_shared_memory);
case 1024:
return verify_cuda_bootstrap_fast_multi_bit_grid_size<
Torus, AmortizedDegree<1024>>(glwe_dimension, level_count, num_samples,
max_shared_memory);
case 2048:
return verify_cuda_bootstrap_fast_multi_bit_grid_size<
Torus, AmortizedDegree<2048>>(glwe_dimension, level_count, num_samples,
max_shared_memory);
case 4096:
return verify_cuda_bootstrap_fast_multi_bit_grid_size<
Torus, AmortizedDegree<4096>>(glwe_dimension, level_count, num_samples,
max_shared_memory);
case 8192:
return verify_cuda_bootstrap_fast_multi_bit_grid_size<
Torus, AmortizedDegree<8192>>(glwe_dimension, level_count, num_samples,
max_shared_memory);
case 16384:
return verify_cuda_bootstrap_fast_multi_bit_grid_size<
Torus, AmortizedDegree<16384>>(glwe_dimension, level_count, num_samples,
max_shared_memory);
default:
PANIC("Cuda error (multi-bit PBS): unsupported polynomial size. Supported "
"N's are powers of two"
" in the interval [256..16384].")
}
}
#endif // FASTMULTIBIT_PBS_H
Loading

0 comments on commit 89928fa

Please sign in to comment.