feat(gpu): implement custom benchmarks
pdroalves committed Mar 7, 2024
1 parent 053d56a commit dd98724
Showing 16 changed files with 289 additions and 11 deletions.
2 changes: 1 addition & 1 deletion backends/tfhe-cuda-backend/cuda/CMakeLists.txt
@@ -72,7 +72,7 @@ set(INCLUDE_DIR include)

add_subdirectory(src)
enable_testing()
-add_subdirectory(tests)
+add_subdirectory(tests_and_benchmarks)
target_include_directories(tfhe_cuda_backend PRIVATE ${INCLUDE_DIR})

# This is required for rust cargo build
6 changes: 0 additions & 6 deletions backends/tfhe-cuda-backend/cuda/tests/CMakeLists.txt

This file was deleted.

@@ -0,0 +1,12 @@
option(TFHE_CUDA_BACKEND_BUILD_TESTS "Build the test tool" OFF)
option(TFHE_CUDA_BACKEND_BUILD_BENCHMARKS "Build the benchmark tool" OFF)

if(TFHE_CUDA_BACKEND_BUILD_TESTS)
message(STATUS "Building the test tool")
add_subdirectory(tests)
endif()

if(TFHE_CUDA_BACKEND_BUILD_BENCHMARKS)
message(STATUS "Building the benchmark tool")
add_subdirectory(benchmarks)
endif()
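With both options defaulting to OFF, the new targets presumably have to be enabled explicitly at configure time, e.g. by passing -DTFHE_CUDA_BACKEND_BUILD_TESTS=ON or -DTFHE_CUDA_BACKEND_BUILD_BENCHMARKS=ON when invoking CMake.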
@@ -0,0 +1,73 @@
find_package(CUDA REQUIRED)
find_package(CUDAToolkit REQUIRED)

if(NOT CMAKE_BUILD_TYPE)
  set(CMAKE_BUILD_TYPE Release)
endif()

# Disable the Google Benchmark requirement on Google Test
set(BENCHMARK_ENABLE_GTEST_TESTS OFF)
set(BENCHMARK_ENABLE_TESTING OFF)

include(FetchContent)
FetchContent_Declare(
  googlebenchmark
  GIT_REPOSITORY https://github.com/google/benchmark.git
  GIT_TAG v1.7.1)
FetchContent_MakeAvailable(googlebenchmark)

# Enable ExternalProject CMake module
include(ExternalProject)

set(CONCRETE_CUDA_SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../../")

# Enable ExternalProject CMake module
include(ExternalProject)
set(TFHE_RS_SOURCE_DIR "${CMAKE_BINARY_DIR}/../../../../")
set(TFHE_RS_BINARY_DIR "${TFHE_RS_SOURCE_DIR}/target/release")

if(NOT TARGET tfhe-rs)
  ExternalProject_Add(
    tfhe-rs
    SOURCE_DIR ${TFHE_RS_SOURCE_DIR}
    BUILD_IN_SOURCE 1
    BUILD_ALWAYS 1
    UPDATE_COMMAND ""
    CONFIGURE_COMMAND ""
    DOWNLOAD_COMMAND ""
    BUILD_COMMAND make build_c_api
    INSTALL_COMMAND ""
    LOG_BUILD ON)
endif()

include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../include)
include_directories(${CONCRETE_CUDA_SOURCE_DIR}/include)
include_directories(${CONCRETE_CUDA_SOURCE_DIR}/src)
include_directories(${TFHE_RS_BINARY_DIR})
include_directories(${TFHE_RS_BINARY_DIR}/deps)
include_directories("${CUDA_INCLUDE_DIRS}" "${CMAKE_CURRENT_SOURCE_DIR}")

find_package(OpenMP REQUIRED)
# Add the OpenMP flag to the compiler flags
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")

add_library(tfhe_rs_lib STATIC IMPORTED)
add_dependencies(tfhe_rs_lib tfhe-rs)
set_target_properties(tfhe_rs_lib PROPERTIES IMPORTED_LOCATION ${TFHE_RS_BINARY_DIR}/libtfhe.a)

set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wl,--no-as-needed -ldl")

set(BINARY benchmark_tfhe_cuda_backend)

file(
  GLOB_RECURSE BENCH_SOURCES
  LIST_DIRECTORIES false
  benchmark*.cpp main.cpp)

add_executable(${BINARY} ${BENCH_SOURCES} ../utils.cpp ../setup_and_teardown.cpp)

set_target_properties(benchmark_tfhe_cuda_backend PROPERTIES CUDA_SEPARABLE_COMPILATION ON CUDA_RESOLVE_DEVICE_SYMBOLS ON)
target_link_libraries(
  benchmark_tfhe_cuda_backend
  PUBLIC benchmark::benchmark tfhe_rs_lib tfhe_cuda_backend OpenMP::OpenMP_CXX
  PRIVATE CUDA::cudart)
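Since BENCH_SOURCES is collected with a benchmark*.cpp glob, any additional translation unit whose file name starts with benchmark is picked up automatically. A minimal sketch of such a file, using only the standard Google Benchmark API (the benchmark name and buffer size below are illustrative, not part of this commit):

#include <benchmark/benchmark.h>
#include <cstdint>
#include <vector>

// Illustrative CPU-only micro-benchmark: fills a buffer so the harness wiring
// can be exercised without touching the CUDA backend.
static void BM_FillBuffer(benchmark::State &state) {
  std::vector<uint64_t> buffer(state.range(0));
  for (auto _ : state) {
    for (auto &x : buffer)
      x = 42;
    benchmark::DoNotOptimize(buffer.data());
  }
  state.SetBytesProcessed(int64_t(state.iterations()) * buffer.size() *
                          sizeof(uint64_t));
}
BENCHMARK(BM_FillBuffer)->Arg(1 << 10)->Arg(1 << 20);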
@@ -0,0 +1,196 @@
#include <benchmark/benchmark.h>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <omp.h>
#include <setup_and_teardown.h>

typedef struct {
  int lwe_dimension;
  int glwe_dimension;
  int polynomial_size;
  int pbs_base_log;
  int pbs_level;
  int input_lwe_ciphertext_count;
  int grouping_factor;
  int chunk_size;
} MultiBitPBSBenchmarkParams;

class MultiBitBootstrap_u64 : public benchmark::Fixture {
protected:
  int lwe_dimension;
  int glwe_dimension;
  int polynomial_size;
  int input_lwe_ciphertext_count;
  int input_lwe_ciphertext_count_per_gpu;
  int grouping_factor;
  double lwe_modular_variance = 0.000007069849454709433;
  double glwe_modular_variance = 0.00000000000000029403601535432533;
  int pbs_base_log;
  int pbs_level;
  int message_modulus = 4;
  int carry_modulus = 4;
  int payload_modulus;
  uint64_t delta;
  std::vector<uint64_t *> d_bsk_array;
  std::vector<uint64_t *> d_lut_pbs_identity;
  std::vector<uint64_t *> d_lut_pbs_indexes;
  std::vector<uint64_t *> d_lwe_ct_in_array;
  std::vector<uint64_t *> d_lwe_ct_out_array;
  std::vector<uint64_t *> d_lwe_input_indexes;
  std::vector<uint64_t *> d_lwe_output_indexes;
  uint64_t *lwe_sk_in_array;
  uint64_t *lwe_sk_out_array;
  uint64_t *plaintexts;
  std::vector<int8_t *> pbs_buffer;

  int chunk_size;

  int num_gpus;
  std::vector<cuda_stream_t *> streams;

public:
  void SetUp(const ::benchmark::State &state) {

    lwe_dimension = state.range(0);
    glwe_dimension = state.range(1);
    polynomial_size = state.range(2);
    pbs_base_log = state.range(3);
    pbs_level = state.range(4);
    input_lwe_ciphertext_count = state.range(5);
    grouping_factor = state.range(6);
    chunk_size = state.range(7);

    num_gpus = std::min(cuda_get_number_of_gpus(), input_lwe_ciphertext_count);

    assert(input_lwe_ciphertext_count % num_gpus == 0);
    input_lwe_ciphertext_count_per_gpu =
        std::max(1, input_lwe_ciphertext_count / num_gpus);

    Seed seed;
    init_seed(&seed);

    // Create streams
    for (int device = 0; device < num_gpus; device++) {
      cudaSetDevice(device);
      cuda_stream_t *stream = cuda_create_stream(device);
      streams.push_back(stream);

      uint64_t *d_bsk_array_per_gpu;
      uint64_t *d_lut_pbs_identity_per_gpu;
      uint64_t *d_lut_pbs_indexes_per_gpu;
      uint64_t *d_lwe_ct_in_array_per_gpu;
      uint64_t *d_lwe_ct_out_array_per_gpu;
      uint64_t *d_lwe_input_indexes_per_gpu;
      uint64_t *d_lwe_output_indexes_per_gpu;
      int8_t *pbs_buffer_per_gpu;

      bootstrap_multibit_setup(
          stream, &seed, &lwe_sk_in_array, &lwe_sk_out_array,
          &d_bsk_array_per_gpu, &plaintexts, &d_lut_pbs_identity_per_gpu,
          &d_lut_pbs_indexes_per_gpu, &d_lwe_ct_in_array_per_gpu,
          &d_lwe_input_indexes_per_gpu, &d_lwe_ct_out_array_per_gpu,
          &d_lwe_output_indexes_per_gpu, &pbs_buffer_per_gpu, lwe_dimension,
          glwe_dimension, polynomial_size, grouping_factor,
          lwe_modular_variance, glwe_modular_variance, pbs_base_log, pbs_level,
          message_modulus, carry_modulus, &payload_modulus, &delta,
          input_lwe_ciphertext_count_per_gpu, 1, 1, chunk_size);

      d_bsk_array.push_back(d_bsk_array_per_gpu);
      d_lut_pbs_identity.push_back(d_lut_pbs_identity_per_gpu);
      d_lut_pbs_indexes.push_back(d_lut_pbs_indexes_per_gpu);
      d_lwe_ct_in_array.push_back(d_lwe_ct_in_array_per_gpu);
      d_lwe_ct_out_array.push_back(d_lwe_ct_out_array_per_gpu);
      d_lwe_input_indexes.push_back(d_lwe_input_indexes_per_gpu);
      d_lwe_output_indexes.push_back(d_lwe_output_indexes_per_gpu);
      pbs_buffer.push_back(pbs_buffer_per_gpu);
    }
  }

  void TearDown(const ::benchmark::State &state) {
    free(lwe_sk_in_array);
    free(lwe_sk_out_array);
    free(plaintexts);

    for (int device = 0; device < num_gpus; device++) {
      cudaSetDevice(device);
      cleanup_cuda_multi_bit_pbs(streams[device], &pbs_buffer[device]);
      cuda_drop_async(d_bsk_array[device], streams[device]);
      cuda_drop_async(d_lut_pbs_identity[device], streams[device]);
      cuda_drop_async(d_lut_pbs_indexes[device], streams[device]);
      cuda_drop_async(d_lwe_ct_in_array[device], streams[device]);
      cuda_drop_async(d_lwe_ct_out_array[device], streams[device]);
      cuda_drop_async(d_lwe_input_indexes[device], streams[device]);
      cuda_drop_async(d_lwe_output_indexes[device], streams[device]);
      cuda_synchronize_stream(streams[device]);
      cuda_destroy_stream(streams[device]);
    }
    d_bsk_array.clear();
    d_lut_pbs_identity.clear();
    d_lut_pbs_indexes.clear();
    d_lwe_ct_in_array.clear();
    d_lwe_ct_out_array.clear();
    d_lwe_input_indexes.clear();
    d_lwe_output_indexes.clear();
    pbs_buffer.clear();
    streams.clear();
    cudaDeviceReset();
  }
};

BENCHMARK_DEFINE_F(MultiBitBootstrap_u64, ConcreteCuda_MultiBit)
(benchmark::State &st) {

  for (auto _ : st) {
#pragma omp parallel for num_threads(num_gpus)
    for (int device = 0; device < num_gpus; device++) {
      cudaSetDevice(device);
      // Execute PBS
      cuda_multi_bit_pbs_lwe_ciphertext_vector_64(
          streams[device], (void *)d_lwe_ct_out_array[device],
          (void *)d_lwe_output_indexes[device],
          (void *)d_lut_pbs_identity[device], (void *)d_lut_pbs_indexes[device],
          (void *)d_lwe_ct_in_array[device],
          (void *)d_lwe_input_indexes[device], (void *)d_bsk_array[device],
          pbs_buffer[device], lwe_dimension, glwe_dimension, polynomial_size,
          grouping_factor, pbs_base_log, pbs_level,
          input_lwe_ciphertext_count_per_gpu, 1, 0,
          cuda_get_max_shared_memory(device), chunk_size);
    }

    for (int device = 0; device < num_gpus; device++) {
      cudaSetDevice(device);
      cuda_synchronize_stream(streams[device]);
    }
  }
}

static void
MultiBitPBSBenchmarkGenerateParams(benchmark::internal::Benchmark *b) {
  // Define the parameters to benchmark
  // lwe_dimension, glwe_dimension, polynomial_size, pbs_base_log, pbs_level,
  // input_lwe_ciphertext_count, grouping_factor
  std::vector<MultiBitPBSBenchmarkParams> params = {
      // 4_bits_multi_bit_group_2
      (MultiBitPBSBenchmarkParams){818, 1, 2048, 22, 1, 1, 2},
      // 4_bits_multi_bit_group_3
      (MultiBitPBSBenchmarkParams){888, 1, 2048, 21, 1, 1, 3},
  };

  // Add to the list of parameters to benchmark
  for (auto x : params) {
    for (int lwe_chunk_size = 1;
         lwe_chunk_size <= x.lwe_dimension / x.grouping_factor;
         lwe_chunk_size *= 2)
      for (int input_lwe_ciphertext_count = 1;
           input_lwe_ciphertext_count <= 16384; input_lwe_ciphertext_count *= 2)
        b->Args({x.lwe_dimension, x.glwe_dimension, x.polynomial_size,
                 x.pbs_base_log, x.pbs_level, input_lwe_ciphertext_count,
                 x.grouping_factor, lwe_chunk_size});
  }
}

BENCHMARK_REGISTER_F(MultiBitBootstrap_u64, ConcreteCuda_MultiBit)
    ->Apply(MultiBitPBSBenchmarkGenerateParams)
    ->ArgNames({"lwe_dimension", "glwe_dimension", "polynomial_size",
                "pbs_base_log", "pbs_level", "input_lwe_ciphertext_count",
                "grouping_factor", "chunk_size"});
@@ -0,0 +1,3 @@
#include <benchmark/benchmark.h>

BENCHMARK_MAIN();
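For reference, BENCHMARK_MAIN() is roughly equivalent to the hand-written entry point below (a sketch based on the Google Benchmark documentation; the exact macro expansion in v1.7.1 may differ slightly):

#include <benchmark/benchmark.h>

int main(int argc, char **argv) {
  ::benchmark::Initialize(&argc, argv);
  if (::benchmark::ReportUnrecognizedArguments(argc, argv))
    return 1;
  ::benchmark::RunSpecifiedBenchmarks();
  ::benchmark::Shutdown();
  return 0;
}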
@@ -125,7 +125,7 @@ void bootstrap_classical_teardown(
  cuda_drop_async(d_lwe_input_indexes, stream);
  cuda_drop_async(d_lwe_output_indexes, stream);
  stream->synchronize();
-  cuda_destroy_stream(stream);
+  stream->release();
}

void bootstrap_multibit_setup(
@@ -260,7 +260,7 @@ void bootstrap_multibit_teardown(
  cuda_drop_async(d_lwe_input_indexes, stream);
  cuda_drop_async(d_lwe_output_indexes, stream);
  stream->synchronize();
-  cuda_destroy_stream(stream);
+  stream->release();
}

void keyswitch_setup(cuda_stream_t *stream, Seed *seed,
@@ -365,7 +365,7 @@ void keyswitch_teardown(cuda_stream_t *stream, uint64_t *lwe_sk_in_array,
  cuda_drop_async(d_lwe_input_indexes, stream);
  cuda_drop_async(d_lwe_output_indexes, stream);
  stream->synchronize();
-  cuda_destroy_stream(stream);
+  stream->release();
}


@@ -438,5 +438,5 @@ void fft_teardown(cuda_stream_t *stream, double *poly1, double *poly2,
  cuda_drop_async(d_cpoly1, stream);
  cuda_drop_async(d_cpoly2, stream);
  stream->synchronize();
-  cuda_destroy_stream(stream);
+  stream->release();
}
