feat(gpu): implement custom benchmarks

zama-ai · Mar 7, 2024 · dd98724 · dd98724
1 parent 053d56a
commit dd98724
Show file tree

Hide file tree

Showing 16 changed files with 289 additions and 11 deletions.
diff --git a/backends/tfhe-cuda-backend/cuda/CMakeLists.txt b/backends/tfhe-cuda-backend/cuda/CMakeLists.txt
@@ -72,7 +72,7 @@ set(INCLUDE_DIR include)
 
 add_subdirectory(src)
 enable_testing()
-add_subdirectory(tests)
+add_subdirectory(tests_and_benchmarks)
 target_include_directories(tfhe_cuda_backend PRIVATE ${INCLUDE_DIR})
 
 # This is required for rust cargo build

diff --git a/backends/tfhe-cuda-backend/cuda/tests/CMakeLists.txt b/backends/tfhe-cuda-backend/cuda/tests/CMakeLists.txt
diff --git a/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/CMakeLists.txt b/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/CMakeLists.txt
@@ -0,0 +1,12 @@
+option(TFHE_CUDA_BACKEND_BUILD_TESTS "Build the test tool" OFF)
+option(TFHE_CUDA_BACKEND_BUILD_BENCHMARKS "Build the benchmark tool" OFF)
+
+if(TFHE_CUDA_BACKEND_BUILD_TESTS)
+  message(STATUS "Building the test tool")
+  add_subdirectory(tests)
+endif()
+
+if(TFHE_CUDA_BACKEND_BUILD_BENCHMARKS)
+  message(STATUS "Building the benchmark tool")
+  add_subdirectory(benchmarks)
+endif()
diff --git a/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/benchmarks/CMakeLists.txt b/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/benchmarks/CMakeLists.txt
@@ -0,0 +1,73 @@
+find_package(CUDA REQUIRED)
+find_package(CUDAToolkit REQUIRED)
+
+if(NOT CMAKE_BUILD_TYPE)
+  set(CMAKE_BUILD_TYPE Release)
+endif()
+
+# Disable the Google Benchmark requirement on Google Test
+set(BENCHMARK_ENABLE_GTEST_TESTS OFF)
+set(BENCHMARK_ENABLE_TESTING OFF)
+
+include(FetchContent)
+FetchContent_Declare(
+  googlebenchmark
+  GIT_REPOSITORY https://github.com/google/benchmark.git
+  GIT_TAG v1.7.1)
+FetchContent_MakeAvailable(googlebenchmark)
+
+# Enable ExternalProject CMake module
+include(ExternalProject)
+
+set(CONCRETE_CUDA_SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../../")
+
+# Enable ExternalProject CMake module
+include(ExternalProject)
+set(TFHE_RS_SOURCE_DIR "${CMAKE_BINARY_DIR}/../../../../")
+set(TFHE_RS_BINARY_DIR "${TFHE_RS_SOURCE_DIR}/target/release")
+
+if(NOT TARGET tfhe-rs)
+  ExternalProject_Add(
+    tfhe-rs
+    SOURCE_DIR ${TFHE_RS_SOURCE_DIR}
+    BUILD_IN_SOURCE 1
+    BUILD_ALWAYS 1
+    UPDATE_COMMAND ""
+    CONFIGURE_COMMAND ""
+    DOWNLOAD_COMMAND ""
+    BUILD_COMMAND make build_c_api
+    INSTALL_COMMAND ""
+    LOG_BUILD ON)
+endif()
+
+include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../include)
+include_directories(${CONCRETE_CUDA_SOURCE_DIR}/include)
+include_directories(${CONCRETE_CUDA_SOURCE_DIR}/src)
+include_directories(${TFHE_RS_BINARY_DIR})
+include_directories(${TFHE_RS_BINARY_DIR}/deps)
+include_directories("${CUDA_INCLUDE_DIRS}" "${CMAKE_CURRENT_SOURCE_DIR}")
+
+find_package(OpenMP REQUIRED)
+# Add the OpenMP flag to the compiler flags
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
+
+add_library(tfhe_rs_lib STATIC IMPORTED)
+add_dependencies(tfhe_rs_lib tfhe-rs)
+set_target_properties(tfhe_rs_lib PROPERTIES IMPORTED_LOCATION ${TFHE_RS_BINARY_DIR}/libtfhe.a)
+
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wl,--no-as-needed -ldl")
+
+set(BINARY benchmark_tfhe_cuda_backend)
+
+file(
+  GLOB_RECURSE BENCH_SOURCES
+  LIST_DIRECTORIES false
+  benchmark*.cpp main.cpp)
+
+add_executable(${BINARY} ${BENCH_SOURCES} ../utils.cpp ../setup_and_teardown.cpp)
+
+set_target_properties(benchmark_tfhe_cuda_backend PROPERTIES CUDA_SEPARABLE_COMPILATION ON CUDA_RESOLVE_DEVICE_SYMBOLS ON)
+target_link_libraries(
+  benchmark_tfhe_cuda_backend
+  PUBLIC benchmark::benchmark tfhe_rs_lib tfhe_cuda_backend OpenMP::OpenMP_CXX
+  PRIVATE CUDA::cudart)
diff --git a/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/benchmarks/benchmark_multibit_pbs.cpp b/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/benchmarks/benchmark_multibit_pbs.cpp
@@ -0,0 +1,196 @@
+#include <benchmark/benchmark.h>
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+#include <omp.h>
+#include <setup_and_teardown.h>
+
+typedef struct {
+  int lwe_dimension;
+  int glwe_dimension;
+  int polynomial_size;
+  int pbs_base_log;
+  int pbs_level;
+  int input_lwe_ciphertext_count;
+  int grouping_factor;
+  int chunk_size;
+} MultiBitPBSBenchmarkParams;
+
+class MultiBitBootstrap_u64 : public benchmark::Fixture {
+protected:
+  int lwe_dimension;
+  int glwe_dimension;
+  int polynomial_size;
+  int input_lwe_ciphertext_count;
+  int input_lwe_ciphertext_count_per_gpu;
+  int grouping_factor;
+  double lwe_modular_variance = 0.000007069849454709433;
+  double glwe_modular_variance = 0.00000000000000029403601535432533;
+  int pbs_base_log;
+  int pbs_level;
+  int message_modulus = 4;
+  int carry_modulus = 4;
+  int payload_modulus;
+  uint64_t delta;
+  std::vector<uint64_t *> d_bsk_array;
+  std::vector<uint64_t *> d_lut_pbs_identity;
+  std::vector<uint64_t *> d_lut_pbs_indexes;
+  std::vector<uint64_t *> d_lwe_ct_in_array;
+  std::vector<uint64_t *> d_lwe_ct_out_array;
+  std::vector<uint64_t *> d_lwe_input_indexes;
+  std::vector<uint64_t *> d_lwe_output_indexes;
+  uint64_t *lwe_sk_in_array;
+  uint64_t *lwe_sk_out_array;
+  uint64_t *plaintexts;
+  std::vector<int8_t *> pbs_buffer;
+
+  int chunk_size;
+
+  int num_gpus;
+  std::vector<cuda_stream_t *> streams;
+
+public:
+  void SetUp(const ::benchmark::State &state) {
+
+    lwe_dimension = state.range(0);
+    glwe_dimension = state.range(1);
+    polynomial_size = state.range(2);
+    pbs_base_log = state.range(3);
+    pbs_level = state.range(4);
+    input_lwe_ciphertext_count = state.range(5);
+    grouping_factor = state.range(6);
+    chunk_size = state.range(7);
+
+    num_gpus = std::min(cuda_get_number_of_gpus(), input_lwe_ciphertext_count);
+
+    assert(input_lwe_ciphertext_count % num_gpus == 0);
+    input_lwe_ciphertext_count_per_gpu =
+        std::max(1, input_lwe_ciphertext_count / num_gpus);
+
+    Seed seed;
+    init_seed(&seed);
+
+    // Create streams
+    for (int device = 0; device < num_gpus; device++) {
+      cudaSetDevice(device);
+      cuda_stream_t *stream = cuda_create_stream(device);
+      streams.push_back(stream);
+
+      uint64_t *d_bsk_array_per_gpu;
+      uint64_t *d_lut_pbs_identity_per_gpu;
+      uint64_t *d_lut_pbs_indexes_per_gpu;
+      uint64_t *d_lwe_ct_in_array_per_gpu;
+      uint64_t *d_lwe_ct_out_array_per_gpu;
+      uint64_t *d_lwe_input_indexes_per_gpu;
+      uint64_t *d_lwe_output_indexes_per_gpu;
+      int8_t *pbs_buffer_per_gpu;
+
+      bootstrap_multibit_setup(
+          stream, &seed, &lwe_sk_in_array, &lwe_sk_out_array,
+          &d_bsk_array_per_gpu, &plaintexts, &d_lut_pbs_identity_per_gpu,
+          &d_lut_pbs_indexes_per_gpu, &d_lwe_ct_in_array_per_gpu,
+          &d_lwe_input_indexes_per_gpu, &d_lwe_ct_out_array_per_gpu,
+          &d_lwe_output_indexes_per_gpu, &pbs_buffer_per_gpu, lwe_dimension,
+          glwe_dimension, polynomial_size, grouping_factor,
+          lwe_modular_variance, glwe_modular_variance, pbs_base_log, pbs_level,
+          message_modulus, carry_modulus, &payload_modulus, &delta,
+          input_lwe_ciphertext_count_per_gpu, 1, 1, chunk_size);
+
+      d_bsk_array.push_back(d_bsk_array_per_gpu);
+      d_lut_pbs_identity.push_back(d_lut_pbs_identity_per_gpu);
+      d_lut_pbs_indexes.push_back(d_lut_pbs_indexes_per_gpu);
+      d_lwe_ct_in_array.push_back(d_lwe_ct_in_array_per_gpu);
+      d_lwe_ct_out_array.push_back(d_lwe_ct_out_array_per_gpu);
+      d_lwe_input_indexes.push_back(d_lwe_input_indexes_per_gpu);
+      d_lwe_output_indexes.push_back(d_lwe_output_indexes_per_gpu);
+      pbs_buffer.push_back(pbs_buffer_per_gpu);
+    }
+  }
+
+  void TearDown(const ::benchmark::State &state) {
+    free(lwe_sk_in_array);
+    free(lwe_sk_out_array);
+    free(plaintexts);
+
+    for (int device = 0; device < num_gpus; device++) {
+      cudaSetDevice(device);
+      cleanup_cuda_multi_bit_pbs(streams[device], &pbs_buffer[device]);
+      cuda_drop_async(d_bsk_array[device], streams[device]);
+      cuda_drop_async(d_lut_pbs_identity[device], streams[device]);
+      cuda_drop_async(d_lut_pbs_indexes[device], streams[device]);
+      cuda_drop_async(d_lwe_ct_in_array[device], streams[device]);
+      cuda_drop_async(d_lwe_ct_out_array[device], streams[device]);
+      cuda_drop_async(d_lwe_input_indexes[device], streams[device]);
+      cuda_drop_async(d_lwe_output_indexes[device], streams[device]);
+      cuda_synchronize_stream(streams[device]);
+      cuda_destroy_stream(streams[device]);
+    }
+    d_bsk_array.clear();
+    d_lut_pbs_identity.clear();
+    d_lut_pbs_indexes.clear();
+    d_lwe_ct_in_array.clear();
+    d_lwe_ct_out_array.clear();
+    d_lwe_input_indexes.clear();
+    d_lwe_output_indexes.clear();
+    pbs_buffer.clear();
+    streams.clear();
+    cudaDeviceReset();
+  }
+};
+
+BENCHMARK_DEFINE_F(MultiBitBootstrap_u64, ConcreteCuda_MultiBit)
+(benchmark::State &st) {
+
+  for (auto _ : st) {
+#pragma omp parallel for num_threads(num_gpus)
+    for (int device = 0; device < num_gpus; device++) {
+      cudaSetDevice(device);
+      // Execute PBS
+      cuda_multi_bit_pbs_lwe_ciphertext_vector_64(
+          streams[device], (void *)d_lwe_ct_out_array[device],
+          (void *)d_lwe_output_indexes[device],
+          (void *)d_lut_pbs_identity[device], (void *)d_lut_pbs_indexes[device],
+          (void *)d_lwe_ct_in_array[device],
+          (void *)d_lwe_input_indexes[device], (void *)d_bsk_array[device],
+          pbs_buffer[device], lwe_dimension, glwe_dimension, polynomial_size,
+          grouping_factor, pbs_base_log, pbs_level,
+          input_lwe_ciphertext_count_per_gpu, 1, 0,
+          cuda_get_max_shared_memory(device), chunk_size);
+    }
+
+    for (int device = 0; device < num_gpus; device++) {
+      cudaSetDevice(device);
+      cuda_synchronize_stream(streams[device]);
+    }
+  }
+}
+
+static void
+MultiBitPBSBenchmarkGenerateParams(benchmark::internal::Benchmark *b) {
+  // Define the parameters to benchmark
+  // lwe_dimension, glwe_dimension, polynomial_size, pbs_base_log, pbs_level,
+  // input_lwe_ciphertext_count
+  std::vector<MultiBitPBSBenchmarkParams> params = {
+      // 4_bits_multi_bit_group_2
+      (MultiBitPBSBenchmarkParams){818, 1, 2048, 22, 1, 1, 2},
+      // 4_bits_multi_bit_group_3
+      (MultiBitPBSBenchmarkParams){888, 1, 2048, 21, 1, 1, 3},
+  };
+
+  // Add to the list of parameters to benchmark
+  for (auto x : params) {
+      for(int lwe_chunk_size = 1; lwe_chunk_size <= x.lwe_dimension / x.grouping_factor;
+      lwe_chunk_size *= 2)
+    for (int input_lwe_ciphertext_count = 1;
+         input_lwe_ciphertext_count <= 16384; input_lwe_ciphertext_count *= 2)
+      b->Args({x.lwe_dimension, x.glwe_dimension, x.polynomial_size,
+               x.pbs_base_log, x.pbs_level, input_lwe_ciphertext_count,
+               x.grouping_factor, lwe_chunk_size});
+  }
+}
+
+BENCHMARK_REGISTER_F(MultiBitBootstrap_u64, ConcreteCuda_MultiBit)
+    ->Apply(MultiBitPBSBenchmarkGenerateParams)
+    ->ArgNames({"lwe_dimension", "glwe_dimension", "polynomial_size",
+                "pbs_base_log", "pbs_level", "input_lwe_ciphertext_count",
+                "grouping_factor", "chunk_size"});
diff --git a/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/benchmarks/main.cpp b/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/benchmarks/main.cpp
@@ -0,0 +1,3 @@
+#include <benchmark/benchmark.h>
+
+BENCHMARK_MAIN();
diff --git a/...d/cuda/tests/include/setup_and_teardown.h → ...d_benchmarks/include/setup_and_teardown.h b/...d/cuda/tests/include/setup_and_teardown.h → ...d_benchmarks/include/setup_and_teardown.h
diff --git a/...e-cuda-backend/cuda/tests/include/utils.h → ...cuda/tests_and_benchmarks/include/utils.h b/...e-cuda-backend/cuda/tests/include/utils.h → ...cuda/tests_and_benchmarks/include/utils.h
diff --git a/...backend/cuda/tests/setup_and_teardown.cpp → ...sts_and_benchmarks/setup_and_teardown.cpp b/...backend/cuda/tests/setup_and_teardown.cpp → ...sts_and_benchmarks/setup_and_teardown.cpp
@@ -125,7 +125,7 @@ void bootstrap_classical_teardown(
   cuda_drop_async(d_lwe_input_indexes, stream);
   cuda_drop_async(d_lwe_output_indexes, stream);
   stream->synchronize();
-  cuda_destroy_stream(stream);
+  stream->release();
 }
 
 void bootstrap_multibit_setup(
@@ -260,7 +260,7 @@ void bootstrap_multibit_teardown(
   cuda_drop_async(d_lwe_input_indexes, stream);
   cuda_drop_async(d_lwe_output_indexes, stream);
   stream->synchronize();
-  cuda_destroy_stream(stream);
+  stream->release();
 }
 
 void keyswitch_setup(cuda_stream_t *stream, Seed *seed,
@@ -365,7 +365,7 @@ void keyswitch_teardown(cuda_stream_t *stream, uint64_t *lwe_sk_in_array,
   cuda_drop_async(d_lwe_input_indexes, stream);
   cuda_drop_async(d_lwe_output_indexes, stream);
   stream->synchronize();
-  cuda_destroy_stream(stream);
+  stream->release();
 }
 
 
@@ -438,5 +438,5 @@ void fft_teardown(cuda_stream_t *stream, double *poly1, double *poly2,
   cuda_drop_async(d_cpoly1, stream);
   cuda_drop_async(d_cpoly2, stream);
   stream->synchronize();
-  cuda_destroy_stream(stream);
+  stream->release();
 }
diff --git a/...uda-backend/cuda/tests/src/CMakeLists.txt → ...tests_and_benchmarks/tests/CMakeLists.txt b/...uda-backend/cuda/tests/src/CMakeLists.txt → ...tests_and_benchmarks/tests/CMakeLists.txt
diff --git a/...fhe-cuda-backend/cuda/tests/src/README.md → ...cuda/tests_and_benchmarks/tests/README.md b/...fhe-cuda-backend/cuda/tests/src/README.md → ...cuda/tests_and_benchmarks/tests/README.md
diff --git a/...end/cuda/tests/src/test_classical_pbs.cpp → ...d_benchmarks/tests/test_classical_pbs.cpp b/...end/cuda/tests/src/test_classical_pbs.cpp → ...d_benchmarks/tests/test_classical_pbs.cpp
diff --git a/...-cuda-backend/cuda/tests/src/test_fft.cpp → ...a/tests_and_benchmarks/tests/test_fft.cpp b/...-cuda-backend/cuda/tests/src/test_fft.cpp → ...a/tests_and_benchmarks/tests/test_fft.cpp
diff --git a/...backend/cuda/tests/src/test_keyswitch.cpp → ...s_and_benchmarks/tests/test_keyswitch.cpp b/...backend/cuda/tests/src/test_keyswitch.cpp → ...s_and_benchmarks/tests/test_keyswitch.cpp
diff --git a/...kend/cuda/tests/src/test_multibit_pbs.cpp → ...nd_benchmarks/tests/test_multibit_pbs.cpp b/...kend/cuda/tests/src/test_multibit_pbs.cpp → ...nd_benchmarks/tests/test_multibit_pbs.cpp
diff --git a/...ds/tfhe-cuda-backend/cuda/tests/utils.cpp → ...ckend/cuda/tests_and_benchmarks/utils.cpp b/...ds/tfhe-cuda-backend/cuda/tests/utils.cpp → ...ckend/cuda/tests_and_benchmarks/utils.cpp
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,3 @@
		#include <benchmark/benchmark.h>

		BENCHMARK_MAIN();