-
Notifications
You must be signed in to change notification settings - Fork 156
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat(gpu): implement custom benchmarks
- Loading branch information
Showing
16 changed files
with
289 additions
and
11 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file was deleted.
Oops, something went wrong.
12 changes: 12 additions & 0 deletions
12
backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/CMakeLists.txt
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,12 @@ | ||
option(TFHE_CUDA_BACKEND_BUILD_TESTS "Build the test tool" OFF) | ||
option(TFHE_CUDA_BACKEND_BUILD_BENCHMARKS "Build the benchmark tool" OFF) | ||
|
||
if(TFHE_CUDA_BACKEND_BUILD_TESTS) | ||
message(STATUS "Building the test tool") | ||
add_subdirectory(tests) | ||
endif() | ||
|
||
if(TFHE_CUDA_BACKEND_BUILD_BENCHMARKS) | ||
message(STATUS "Building the benchmark tool") | ||
add_subdirectory(benchmarks) | ||
endif() |
73 changes: 73 additions & 0 deletions
73
backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/benchmarks/CMakeLists.txt
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,73 @@ | ||
find_package(CUDA REQUIRED) | ||
find_package(CUDAToolkit REQUIRED) | ||
|
||
if(NOT CMAKE_BUILD_TYPE) | ||
set(CMAKE_BUILD_TYPE Release) | ||
endif() | ||
|
||
# Disable the Google Benchmark requirement on Google Test | ||
set(BENCHMARK_ENABLE_GTEST_TESTS OFF) | ||
set(BENCHMARK_ENABLE_TESTING OFF) | ||
|
||
include(FetchContent) | ||
FetchContent_Declare( | ||
googlebenchmark | ||
GIT_REPOSITORY https://github.com/google/benchmark.git | ||
GIT_TAG v1.7.1) | ||
FetchContent_MakeAvailable(googlebenchmark) | ||
|
||
# Enable ExternalProject CMake module | ||
include(ExternalProject) | ||
|
||
set(CONCRETE_CUDA_SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../../") | ||
|
||
# Enable ExternalProject CMake module | ||
include(ExternalProject) | ||
set(TFHE_RS_SOURCE_DIR "${CMAKE_BINARY_DIR}/../../../../") | ||
set(TFHE_RS_BINARY_DIR "${TFHE_RS_SOURCE_DIR}/target/release") | ||
|
||
if(NOT TARGET tfhe-rs) | ||
ExternalProject_Add( | ||
tfhe-rs | ||
SOURCE_DIR ${TFHE_RS_SOURCE_DIR} | ||
BUILD_IN_SOURCE 1 | ||
BUILD_ALWAYS 1 | ||
UPDATE_COMMAND "" | ||
CONFIGURE_COMMAND "" | ||
DOWNLOAD_COMMAND "" | ||
BUILD_COMMAND make build_c_api | ||
INSTALL_COMMAND "" | ||
LOG_BUILD ON) | ||
endif() | ||
|
||
include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../include) | ||
include_directories(${CONCRETE_CUDA_SOURCE_DIR}/include) | ||
include_directories(${CONCRETE_CUDA_SOURCE_DIR}/src) | ||
include_directories(${TFHE_RS_BINARY_DIR}) | ||
include_directories(${TFHE_RS_BINARY_DIR}/deps) | ||
include_directories("${CUDA_INCLUDE_DIRS}" "${CMAKE_CURRENT_SOURCE_DIR}") | ||
|
||
find_package(OpenMP REQUIRED) | ||
# Add the OpenMP flag to the compiler flags | ||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}") | ||
|
||
add_library(tfhe_rs_lib STATIC IMPORTED) | ||
add_dependencies(tfhe_rs_lib tfhe-rs) | ||
set_target_properties(tfhe_rs_lib PROPERTIES IMPORTED_LOCATION ${TFHE_RS_BINARY_DIR}/libtfhe.a) | ||
|
||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wl,--no-as-needed -ldl") | ||
|
||
set(BINARY benchmark_tfhe_cuda_backend) | ||
|
||
file( | ||
GLOB_RECURSE BENCH_SOURCES | ||
LIST_DIRECTORIES false | ||
benchmark*.cpp main.cpp) | ||
|
||
add_executable(${BINARY} ${BENCH_SOURCES} ../utils.cpp ../setup_and_teardown.cpp) | ||
|
||
set_target_properties(benchmark_tfhe_cuda_backend PROPERTIES CUDA_SEPARABLE_COMPILATION ON CUDA_RESOLVE_DEVICE_SYMBOLS ON) | ||
target_link_libraries( | ||
benchmark_tfhe_cuda_backend | ||
PUBLIC benchmark::benchmark tfhe_rs_lib tfhe_cuda_backend OpenMP::OpenMP_CXX | ||
PRIVATE CUDA::cudart) |
196 changes: 196 additions & 0 deletions
196
backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/benchmarks/benchmark_multibit_pbs.cpp
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,196 @@ | ||
#include <benchmark/benchmark.h> | ||
#include <cstdint> | ||
#include <cstdio> | ||
#include <cstdlib> | ||
#include <omp.h> | ||
#include <setup_and_teardown.h> | ||
|
||
typedef struct { | ||
int lwe_dimension; | ||
int glwe_dimension; | ||
int polynomial_size; | ||
int pbs_base_log; | ||
int pbs_level; | ||
int input_lwe_ciphertext_count; | ||
int grouping_factor; | ||
int chunk_size; | ||
} MultiBitPBSBenchmarkParams; | ||
|
||
class MultiBitBootstrap_u64 : public benchmark::Fixture { | ||
protected: | ||
int lwe_dimension; | ||
int glwe_dimension; | ||
int polynomial_size; | ||
int input_lwe_ciphertext_count; | ||
int input_lwe_ciphertext_count_per_gpu; | ||
int grouping_factor; | ||
double lwe_modular_variance = 0.000007069849454709433; | ||
double glwe_modular_variance = 0.00000000000000029403601535432533; | ||
int pbs_base_log; | ||
int pbs_level; | ||
int message_modulus = 4; | ||
int carry_modulus = 4; | ||
int payload_modulus; | ||
uint64_t delta; | ||
std::vector<uint64_t *> d_bsk_array; | ||
std::vector<uint64_t *> d_lut_pbs_identity; | ||
std::vector<uint64_t *> d_lut_pbs_indexes; | ||
std::vector<uint64_t *> d_lwe_ct_in_array; | ||
std::vector<uint64_t *> d_lwe_ct_out_array; | ||
std::vector<uint64_t *> d_lwe_input_indexes; | ||
std::vector<uint64_t *> d_lwe_output_indexes; | ||
uint64_t *lwe_sk_in_array; | ||
uint64_t *lwe_sk_out_array; | ||
uint64_t *plaintexts; | ||
std::vector<int8_t *> pbs_buffer; | ||
|
||
int chunk_size; | ||
|
||
int num_gpus; | ||
std::vector<cuda_stream_t *> streams; | ||
|
||
public: | ||
void SetUp(const ::benchmark::State &state) { | ||
|
||
lwe_dimension = state.range(0); | ||
glwe_dimension = state.range(1); | ||
polynomial_size = state.range(2); | ||
pbs_base_log = state.range(3); | ||
pbs_level = state.range(4); | ||
input_lwe_ciphertext_count = state.range(5); | ||
grouping_factor = state.range(6); | ||
chunk_size = state.range(7); | ||
|
||
num_gpus = std::min(cuda_get_number_of_gpus(), input_lwe_ciphertext_count); | ||
|
||
assert(input_lwe_ciphertext_count % num_gpus == 0); | ||
input_lwe_ciphertext_count_per_gpu = | ||
std::max(1, input_lwe_ciphertext_count / num_gpus); | ||
|
||
Seed seed; | ||
init_seed(&seed); | ||
|
||
// Create streams | ||
for (int device = 0; device < num_gpus; device++) { | ||
cudaSetDevice(device); | ||
cuda_stream_t *stream = cuda_create_stream(device); | ||
streams.push_back(stream); | ||
|
||
uint64_t *d_bsk_array_per_gpu; | ||
uint64_t *d_lut_pbs_identity_per_gpu; | ||
uint64_t *d_lut_pbs_indexes_per_gpu; | ||
uint64_t *d_lwe_ct_in_array_per_gpu; | ||
uint64_t *d_lwe_ct_out_array_per_gpu; | ||
uint64_t *d_lwe_input_indexes_per_gpu; | ||
uint64_t *d_lwe_output_indexes_per_gpu; | ||
int8_t *pbs_buffer_per_gpu; | ||
|
||
bootstrap_multibit_setup( | ||
stream, &seed, &lwe_sk_in_array, &lwe_sk_out_array, | ||
&d_bsk_array_per_gpu, &plaintexts, &d_lut_pbs_identity_per_gpu, | ||
&d_lut_pbs_indexes_per_gpu, &d_lwe_ct_in_array_per_gpu, | ||
&d_lwe_input_indexes_per_gpu, &d_lwe_ct_out_array_per_gpu, | ||
&d_lwe_output_indexes_per_gpu, &pbs_buffer_per_gpu, lwe_dimension, | ||
glwe_dimension, polynomial_size, grouping_factor, | ||
lwe_modular_variance, glwe_modular_variance, pbs_base_log, pbs_level, | ||
message_modulus, carry_modulus, &payload_modulus, &delta, | ||
input_lwe_ciphertext_count_per_gpu, 1, 1, chunk_size); | ||
|
||
d_bsk_array.push_back(d_bsk_array_per_gpu); | ||
d_lut_pbs_identity.push_back(d_lut_pbs_identity_per_gpu); | ||
d_lut_pbs_indexes.push_back(d_lut_pbs_indexes_per_gpu); | ||
d_lwe_ct_in_array.push_back(d_lwe_ct_in_array_per_gpu); | ||
d_lwe_ct_out_array.push_back(d_lwe_ct_out_array_per_gpu); | ||
d_lwe_input_indexes.push_back(d_lwe_input_indexes_per_gpu); | ||
d_lwe_output_indexes.push_back(d_lwe_output_indexes_per_gpu); | ||
pbs_buffer.push_back(pbs_buffer_per_gpu); | ||
} | ||
} | ||
|
||
void TearDown(const ::benchmark::State &state) { | ||
free(lwe_sk_in_array); | ||
free(lwe_sk_out_array); | ||
free(plaintexts); | ||
|
||
for (int device = 0; device < num_gpus; device++) { | ||
cudaSetDevice(device); | ||
cleanup_cuda_multi_bit_pbs(streams[device], &pbs_buffer[device]); | ||
cuda_drop_async(d_bsk_array[device], streams[device]); | ||
cuda_drop_async(d_lut_pbs_identity[device], streams[device]); | ||
cuda_drop_async(d_lut_pbs_indexes[device], streams[device]); | ||
cuda_drop_async(d_lwe_ct_in_array[device], streams[device]); | ||
cuda_drop_async(d_lwe_ct_out_array[device], streams[device]); | ||
cuda_drop_async(d_lwe_input_indexes[device], streams[device]); | ||
cuda_drop_async(d_lwe_output_indexes[device], streams[device]); | ||
cuda_synchronize_stream(streams[device]); | ||
cuda_destroy_stream(streams[device]); | ||
} | ||
d_bsk_array.clear(); | ||
d_lut_pbs_identity.clear(); | ||
d_lut_pbs_indexes.clear(); | ||
d_lwe_ct_in_array.clear(); | ||
d_lwe_ct_out_array.clear(); | ||
d_lwe_input_indexes.clear(); | ||
d_lwe_output_indexes.clear(); | ||
pbs_buffer.clear(); | ||
streams.clear(); | ||
cudaDeviceReset(); | ||
} | ||
}; | ||
|
||
BENCHMARK_DEFINE_F(MultiBitBootstrap_u64, ConcreteCuda_MultiBit) | ||
(benchmark::State &st) { | ||
|
||
for (auto _ : st) { | ||
#pragma omp parallel for num_threads(num_gpus) | ||
for (int device = 0; device < num_gpus; device++) { | ||
cudaSetDevice(device); | ||
// Execute PBS | ||
cuda_multi_bit_pbs_lwe_ciphertext_vector_64( | ||
streams[device], (void *)d_lwe_ct_out_array[device], | ||
(void *)d_lwe_output_indexes[device], | ||
(void *)d_lut_pbs_identity[device], (void *)d_lut_pbs_indexes[device], | ||
(void *)d_lwe_ct_in_array[device], | ||
(void *)d_lwe_input_indexes[device], (void *)d_bsk_array[device], | ||
pbs_buffer[device], lwe_dimension, glwe_dimension, polynomial_size, | ||
grouping_factor, pbs_base_log, pbs_level, | ||
input_lwe_ciphertext_count_per_gpu, 1, 0, | ||
cuda_get_max_shared_memory(device), chunk_size); | ||
} | ||
|
||
for (int device = 0; device < num_gpus; device++) { | ||
cudaSetDevice(device); | ||
cuda_synchronize_stream(streams[device]); | ||
} | ||
} | ||
} | ||
|
||
static void | ||
MultiBitPBSBenchmarkGenerateParams(benchmark::internal::Benchmark *b) { | ||
// Define the parameters to benchmark | ||
// lwe_dimension, glwe_dimension, polynomial_size, pbs_base_log, pbs_level, | ||
// input_lwe_ciphertext_count | ||
std::vector<MultiBitPBSBenchmarkParams> params = { | ||
// 4_bits_multi_bit_group_2 | ||
(MultiBitPBSBenchmarkParams){818, 1, 2048, 22, 1, 1, 2}, | ||
// 4_bits_multi_bit_group_3 | ||
(MultiBitPBSBenchmarkParams){888, 1, 2048, 21, 1, 1, 3}, | ||
}; | ||
|
||
// Add to the list of parameters to benchmark | ||
for (auto x : params) { | ||
for(int lwe_chunk_size = 1; lwe_chunk_size <= x.lwe_dimension / x.grouping_factor; | ||
lwe_chunk_size *= 2) | ||
for (int input_lwe_ciphertext_count = 1; | ||
input_lwe_ciphertext_count <= 16384; input_lwe_ciphertext_count *= 2) | ||
b->Args({x.lwe_dimension, x.glwe_dimension, x.polynomial_size, | ||
x.pbs_base_log, x.pbs_level, input_lwe_ciphertext_count, | ||
x.grouping_factor, lwe_chunk_size}); | ||
} | ||
} | ||
|
||
BENCHMARK_REGISTER_F(MultiBitBootstrap_u64, ConcreteCuda_MultiBit) | ||
->Apply(MultiBitPBSBenchmarkGenerateParams) | ||
->ArgNames({"lwe_dimension", "glwe_dimension", "polynomial_size", | ||
"pbs_base_log", "pbs_level", "input_lwe_ciphertext_count", | ||
"grouping_factor", "chunk_size"}); |
3 changes: 3 additions & 0 deletions
3
backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/benchmarks/main.cpp
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
#include <benchmark/benchmark.h> | ||
|
||
BENCHMARK_MAIN(); |
File renamed without changes.
File renamed without changes.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.