From af9a341d8f1f58ef2a655f17c7b05ff01f9ec4c5 Mon Sep 17 00:00:00 2001 From: Agnes Leroy Date: Thu, 1 Feb 2024 10:20:50 +0100 Subject: [PATCH] chore(gpu): add fmt and clippy checks in tfhe-cuda-backend --- .github/workflows/aws_tfhe_gpu_tests.yml | 4 ++ Makefile | 14 ++++- .../cuda/format_tfhe_cuda_backend.sh | 13 ++++- .../tfhe-cuda-backend/cuda/include/device.h | 19 +++---- .../tfhe-cuda-backend/cuda/include/integer.h | 33 +++++------ backends/tfhe-cuda-backend/cuda/src/device.cu | 25 ++++----- .../cuda/src/integer/integer.cuh | 10 ++-- .../cuda/src/integer/multiplication.cuh | 21 ++++--- .../cuda/src/integer/scalar_comparison.cuh | 7 +-- .../cuda/src/integer/scalar_shifts.cuh | 6 +- .../cuda/src/pbs/bootstrap_amortized.cu | 56 +++++++++---------- .../cuda/src/pbs/bootstrap_amortized.cuh | 4 +- 12 files changed, 114 insertions(+), 98 deletions(-) diff --git a/.github/workflows/aws_tfhe_gpu_tests.yml b/.github/workflows/aws_tfhe_gpu_tests.yml index 2e6d1ac097..a1cd364505 100644 --- a/.github/workflows/aws_tfhe_gpu_tests.yml +++ b/.github/workflows/aws_tfhe_gpu_tests.yml @@ -95,6 +95,10 @@ jobs: echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}" >> "${GITHUB_ENV}" echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}" + - name: Run fmt checks + run: | + make check_fmt_gpu + - name: Run clippy checks run: | make clippy_gpu diff --git a/Makefile b/Makefile index 72480ff64e..bb0731b745 100644 --- a/Makefile +++ b/Makefile @@ -150,8 +150,13 @@ fmt_gpu: install_rs_check_toolchain check_fmt: install_rs_check_toolchain cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" fmt --check -.PHONY: clippy_gpu # Run clippy lints on the gpu backend -clippy_gpu: install_rs_check_toolchain +.PHONY: check_fmt_gpu # Check rust and cuda code format +check_fmt_gpu: install_rs_check_toolchain + cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" fmt --check + cd "$(TFHECUDA_SRC)" && ./format_tfhe_cuda_backend.sh -c + +.PHONY: clippy_gpu # Run clippy lints on tfhe with "gpu" enabled +clippy_gpu: install_rs_check_toolchain clippy_cuda_backend RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \ --features=$(TARGET_ARCH_FEATURE),integer,shortint,gpu \ -p tfhe -- --no-deps -D warnings @@ -239,6 +244,11 @@ clippy_js_wasm_api clippy_tasks clippy_core clippy_concrete_csprng clippy_triviu clippy_fast: clippy clippy_all_targets clippy_c_api clippy_js_wasm_api clippy_tasks clippy_core \ clippy_concrete_csprng +.PHONY: clippy_cuda_backend # Run clippy lints on the tfhe-cuda-backend +clippy_cuda_backend: install_rs_check_toolchain + RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \ + -p tfhe-cuda-backend -- --no-deps -D warnings + .PHONY: build_core # Build core_crypto without experimental features build_core: install_rs_build_toolchain install_rs_check_toolchain RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) build --profile $(CARGO_PROFILE) \ diff --git a/backends/tfhe-cuda-backend/cuda/format_tfhe_cuda_backend.sh b/backends/tfhe-cuda-backend/cuda/format_tfhe_cuda_backend.sh index 12719a8cff..9c68095a8e 100755 --- a/backends/tfhe-cuda-backend/cuda/format_tfhe_cuda_backend.sh +++ b/backends/tfhe-cuda-backend/cuda/format_tfhe_cuda_backend.sh @@ -1,6 +1,17 @@ #!/bin/bash +while getopts ":c" option; do + case $option in + c) + # code to execute when flag1 is provided + find ./{include,src} -iregex '^.*\.\(cpp\|cu\|h\|cuh\)$' -print | xargs clang-format-15 -i -style='file' --dry-run --Werror + cmake-format -i CMakeLists.txt -c .cmake-format-config.py + find ./{include,src} -type f -name "CMakeLists.txt" | xargs -I % sh -c 'cmake-format -i % -c .cmake-format-config.py' + git diff --exit-code + exit + ;; + esac +done find ./{include,src} -iregex '^.*\.\(cpp\|cu\|h\|cuh\)$' -print | xargs clang-format-15 -i -style='file' cmake-format -i CMakeLists.txt -c .cmake-format-config.py - find ./{include,src} -type f -name "CMakeLists.txt" | xargs -I % sh -c 'cmake-format -i % -c .cmake-format-config.py' diff --git a/backends/tfhe-cuda-backend/cuda/include/device.h b/backends/tfhe-cuda-backend/cuda/include/device.h index 0dc28684b5..3cd843b4d6 100644 --- a/backends/tfhe-cuda-backend/cuda/include/device.h +++ b/backends/tfhe-cuda-backend/cuda/include/device.h @@ -15,15 +15,15 @@ extern "C" { { cuda_error((ans), __FILE__, __LINE__); } inline void cuda_error(cudaError_t code, const char *file, int line) { if (code != cudaSuccess) { - std::fprintf(stderr, "Cuda error: %s %s %d\n", cudaGetErrorString(code), - file, line); + std::fprintf(stderr, "Cuda error: %s %s %d\n", cudaGetErrorString(code), + file, line); std::abort(); } } #define PANIC(format, ...) \ { \ std::fprintf(stderr, "%s::%d::%s: panic.\n" format "\n", __FILE__, \ - __LINE__, __func__, ##__VA_ARGS__); \ + __LINE__, __func__, ##__VA_ARGS__); \ std::abort(); \ } @@ -42,9 +42,7 @@ struct cuda_stream_t { check_cuda_error(cudaStreamDestroy(stream)); } - void synchronize() { - check_cuda_error(cudaStreamSynchronize(stream)); - } + void synchronize() { check_cuda_error(cudaStreamSynchronize(stream)); } }; cuda_stream_t *cuda_create_stream(uint32_t gpu_index); @@ -62,18 +60,18 @@ bool cuda_check_support_cooperative_groups(); void cuda_memcpy_to_cpu(void *dest, const void *src, uint64_t size); void cuda_memcpy_async_to_gpu(void *dest, void *src, uint64_t size, - cuda_stream_t *stream); + cuda_stream_t *stream); void cuda_memcpy_async_gpu_to_gpu(void *dest, void *src, uint64_t size, - cuda_stream_t *stream); + cuda_stream_t *stream); void cuda_memcpy_to_gpu(void *dest, void *src, uint64_t size); void cuda_memcpy_async_to_cpu(void *dest, const void *src, uint64_t size, - cuda_stream_t *stream); + cuda_stream_t *stream); void cuda_memset_async(void *dest, uint64_t val, uint64_t size, - cuda_stream_t *stream); + cuda_stream_t *stream); int cuda_get_number_of_gpus(); @@ -86,7 +84,6 @@ void cuda_drop_async(void *ptr, cuda_stream_t *stream); int cuda_get_max_shared_memory(uint32_t gpu_index); void cuda_synchronize_stream(cuda_stream_t *stream); - } template diff --git a/backends/tfhe-cuda-backend/cuda/include/integer.h b/backends/tfhe-cuda-backend/cuda/include/integer.h index b822deaf6e..097ca85826 100644 --- a/backends/tfhe-cuda-backend/cuda/include/integer.h +++ b/backends/tfhe-cuda-backend/cuda/include/integer.h @@ -484,8 +484,8 @@ template struct int_sc_prop_memory { }; // create lut objects - luts_array = new int_radix_lut( - stream, params, 2, num_radix_blocks, allocate_gpu_memory); + luts_array = new int_radix_lut(stream, params, 2, num_radix_blocks, + allocate_gpu_memory); luts_carry_propagation_sum = new struct int_radix_lut( stream, params, 1, num_radix_blocks, allocate_gpu_memory); message_acc = new struct int_radix_lut( @@ -507,8 +507,9 @@ template struct int_sc_prop_memory { num_radix_blocks - 1); generate_device_accumulator_bivariate( - stream, luts_carry_propagation_sum->lut, glwe_dimension, polynomial_size, - message_modulus, carry_modulus, f_luts_carry_propagation_sum); + stream, luts_carry_propagation_sum->lut, glwe_dimension, + polynomial_size, message_modulus, carry_modulus, + f_luts_carry_propagation_sum); generate_device_accumulator(stream, message_acc->lut, glwe_dimension, polynomial_size, message_modulus, @@ -575,12 +576,12 @@ template struct int_mul_memory { // create int_radix_lut objects for lsb, msb, message, carry // luts_array -> lut = {lsb_acc, msb_acc} - luts_array = new int_radix_lut( - stream, params, 2, total_block_count, allocate_gpu_memory); - luts_message = new int_radix_lut( - stream, params, 1, total_block_count, luts_array); - luts_carry = new int_radix_lut( - stream, params, 1, total_block_count, luts_array); + luts_array = new int_radix_lut(stream, params, 2, total_block_count, + allocate_gpu_memory); + luts_message = new int_radix_lut(stream, params, 1, + total_block_count, luts_array); + luts_carry = new int_radix_lut(stream, params, 1, total_block_count, + luts_array); auto lsb_acc = luts_array->get_lut(0); auto msb_acc = luts_array->get_lut(1); @@ -674,9 +675,9 @@ template struct int_shift_buffer { // here we generate 'num_bits_in_block' times lut // one for each 'shift_within_block' = 'shift' % 'num_bits_in_block' // even though lut_left contains 'num_bits_in_block' lut - // lut_indexes will have indexes for single lut only and those indexes will be 0 - // it means for pbs corresponding lut should be selected and pass along - // lut_indexes filled with zeros + // lut_indexes will have indexes for single lut only and those indexes + // will be 0 it means for pbs corresponding lut should be selected and + // pass along lut_indexes filled with zeros // calculate bivariate lut for each 'shift_within_block' for (int s_w_b = 1; s_w_b < num_bits_in_block; s_w_b++) { @@ -730,9 +731,9 @@ template struct int_shift_buffer { // here we generate 'message_modulus' times lut // one for each 'shift' - // lut_indexes will have indexes for single lut only and those indexes will be 0 - // it means for pbs corresponding lut should be selected and pass along - // lut_indexes filled with zeros + // lut_indexes will have indexes for single lut only and those indexes + // will be 0 it means for pbs corresponding lut should be selected and + // pass along lut_indexes filled with zeros // calculate lut for each 'shift' for (int shift = 0; shift < params.message_modulus; shift++) { diff --git a/backends/tfhe-cuda-backend/cuda/src/device.cu b/backends/tfhe-cuda-backend/cuda/src/device.cu index d9ec4c81c9..6524c636f0 100644 --- a/backends/tfhe-cuda-backend/cuda/src/device.cu +++ b/backends/tfhe-cuda-backend/cuda/src/device.cu @@ -10,9 +10,7 @@ cuda_stream_t *cuda_create_stream(uint32_t gpu_index) { } /// Unsafe function to destroy CUDA stream, must check first the GPU exists -void cuda_destroy_stream(cuda_stream_t *stream) { - stream->release(); -} +void cuda_destroy_stream(cuda_stream_t *stream) { stream->release(); } /// Unsafe function that will try to allocate even if gpu_index is invalid /// or if there's not enough memory. A safe wrapper around it must call @@ -57,7 +55,8 @@ void cuda_check_valid_malloc(uint64_t size, uint32_t gpu_index) { check_cuda_error(cudaMemGetInfo(&free_mem, &total_mem)); if (size > free_mem) { PANIC("Cuda error: not enough memory on device. " - "Available: %zu vs Requested: %lu", free_mem, size); + "Available: %zu vs Requested: %lu", + free_mem, size); } } @@ -67,14 +66,14 @@ void cuda_check_valid_malloc(uint64_t size, uint32_t gpu_index) { bool cuda_check_support_cooperative_groups() { int cooperative_groups_supported = 0; check_cuda_error(cudaDeviceGetAttribute(&cooperative_groups_supported, - cudaDevAttrCooperativeLaunch, 0)); + cudaDevAttrCooperativeLaunch, 0)); return cooperative_groups_supported > 0; } /// Copy memory to the GPU asynchronously void cuda_memcpy_async_to_gpu(void *dest, void *src, uint64_t size, - cuda_stream_t *stream) { + cuda_stream_t *stream) { cudaPointerAttributes attr; check_cuda_error(cudaPointerGetAttributes(&attr, dest)); if (attr.device != stream->gpu_index && attr.type != cudaMemoryTypeDevice) { @@ -108,7 +107,7 @@ void cuda_memcpy_to_cpu(void *dest, void *src, uint64_t size) { /// Copy memory within a GPU asynchronously void cuda_memcpy_async_gpu_to_gpu(void *dest, void *src, uint64_t size, - cuda_stream_t *stream) { + cuda_stream_t *stream) { cudaPointerAttributes attr_dest; check_cuda_error(cudaPointerGetAttributes(&attr_dest, dest)); if (attr_dest.device != stream->gpu_index && @@ -137,7 +136,7 @@ void cuda_synchronize_device(uint32_t gpu_index) { } void cuda_memset_async(void *dest, uint64_t val, uint64_t size, - cuda_stream_t *stream) { + cuda_stream_t *stream) { cudaPointerAttributes attr; check_cuda_error(cudaPointerGetAttributes(&attr, dest)); if (size == 0) { @@ -164,8 +163,8 @@ void cuda_set_value_async(cudaStream_t *stream, Torus *d_array, Torus value, int num_blocks = (n + block_size - 1) / block_size; // Launch the kernel - cuda_set_value_kernel<<>> - (d_array, value, n); + cuda_set_value_kernel<<>>(d_array, value, + n); check_cuda_error(cudaGetLastError()); } @@ -177,7 +176,7 @@ template void cuda_set_value_async(cudaStream_t *stream, uint32_t *d_array, /// Copy memory to the CPU asynchronously void cuda_memcpy_async_to_cpu(void *dest, const void *src, uint64_t size, - cuda_stream_t *stream) { + cuda_stream_t *stream) { cudaPointerAttributes attr; check_cuda_error(cudaPointerGetAttributes(&attr, src)); if (attr.device != stream->gpu_index && attr.type != cudaMemoryTypeDevice) { @@ -238,6 +237,4 @@ int cuda_get_max_shared_memory(uint32_t gpu_index) { return max_shared_memory; } -void cuda_synchronize_stream(cuda_stream_t *stream) { - stream->synchronize(); -} +void cuda_synchronize_stream(cuda_stream_t *stream) { stream->synchronize(); } diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh index fcc7a82f7a..f257eeaab8 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh @@ -59,8 +59,7 @@ void execute_pbs(cuda_stream_t *stream, Torus *lwe_array_out, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer, lwe_dimension, glwe_dimension, polynomial_size, grouping_factor, base_log, level_count, - input_lwe_ciphertext_count, num_luts, lwe_idx, - max_shared_memory); + input_lwe_ciphertext_count, num_luts, lwe_idx, max_shared_memory); break; case LOW_LAT: cuda_bootstrap_low_latency_lwe_ciphertext_vector_64( @@ -413,10 +412,9 @@ void host_propagate_single_carry_low_latency(cuda_stream_t *stream, /* * input_blocks: input radix ciphertext propagation will happen inplace * acc_message_carry: list of two lut s, [(message_acc), (carry_acc)] - * lut_indexes_message_carry: lut_indexes for message and carry, should always be {0, 1} - * small_lwe_vector: output of keyswitch should have - * size = 2 * (lwe_dimension + 1) * sizeof(Torus) - * big_lwe_vector: output of pbs should have + * lut_indexes_message_carry: lut_indexes for message and carry, should always + * be {0, 1} small_lwe_vector: output of keyswitch should have size = 2 * + * (lwe_dimension + 1) * sizeof(Torus) big_lwe_vector: output of pbs should have * size = 2 * (glwe_dimension * polynomial_size + 1) * sizeof(Torus) */ template diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh index e6b9f18236..9303e213a4 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh @@ -361,10 +361,9 @@ __host__ void host_integer_mult_radix_kb( size_t sm_size = big_lwe_size * sizeof(Torus); if (ch_amount != 0) { // cuda_memset with size 0 is invalid, so avoid it - cuda_memset_async( - new_blocks, 0, - ch_amount * num_blocks * big_lwe_size * sizeof(Torus), - stream); + cuda_memset_async(new_blocks, 0, + ch_amount * num_blocks * big_lwe_size * sizeof(Torus), + stream); } tree_add_chunks<<stream>>>( @@ -420,8 +419,8 @@ __host__ void host_integer_mult_radix_kb( execute_pbs(stream, carry_blocks_vector, lwe_indexes, luts_carry->lut, luts_carry->lut_indexes, &small_lwe_vector[message_count * (lwe_dimension + 1)], - lwe_indexes, bsk, luts_carry->pbs_buffer, - glwe_dimension, lwe_dimension, polynomial_size, + lwe_indexes, bsk, luts_carry->pbs_buffer, glwe_dimension, + lwe_dimension, polynomial_size, mem_ptr->params.pbs_base_log, mem_ptr->params.pbs_level, mem_ptr->params.grouping_factor, carry_count, 1, 0, max_shared_memory, mem_ptr->params.pbs_type); @@ -449,7 +448,7 @@ __host__ void host_integer_mult_radix_kb( dim3 add_grid(1, num_blocks, 1); size_t sm_size = big_lwe_size * sizeof(Torus); - cuda_memset_async(radix_lwe_out, 0, num_blocks * big_lwe_size * sizeof(Torus), + cuda_memset_async(radix_lwe_out, 0, num_blocks * big_lwe_size * sizeof(Torus), stream); tree_add_chunks<<stream>>>( radix_lwe_out, old_blocks, r, num_blocks); @@ -548,13 +547,13 @@ void apply_lookup_table(Torus *input_ciphertexts, Torus *output_ciphertexts, Torus *cur_lut_indexes; if (lsb_msb_mode) { cur_lut_indexes = (big_lwe_start_index < lsb_message_blocks_count) - ? mem_ptr->lut_indexes_lsb_multi_gpu[i] - : mem_ptr->lut_indexes_msb_multi_gpu[i]; + ? mem_ptr->lut_indexes_lsb_multi_gpu[i] + : mem_ptr->lut_indexes_msb_multi_gpu[i]; } else { cur_lut_indexes = (big_lwe_start_index < lsb_message_blocks_count) - ? mem_ptr->lut_indexes_message_multi_gpu[i] - : mem_ptr->lut_indexes_carry_multi_gpu[i]; + ? mem_ptr->lut_indexes_message_multi_gpu[i] + : mem_ptr->lut_indexes_carry_multi_gpu[i]; } // execute keyswitch on a current gpu with corresponding input and output diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_comparison.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_comparison.cuh index 5d4be87597..6bd87346f7 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_comparison.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_comparison.cuh @@ -68,10 +68,9 @@ __host__ void host_integer_radix_scalar_difference_check_kb( // garbage. if (total_num_radix_blocks > 1) { // cuda_memset with size 0 is invalid, so avoid it - cuda_memset_async( - lwe_array_out + big_lwe_size, 0, - big_lwe_size_bytes * (total_num_radix_blocks - 1), - stream); + cuda_memset_async(lwe_array_out + big_lwe_size, 0, + big_lwe_size_bytes * (total_num_radix_blocks - 1), + stream); } } else if (total_num_scalar_blocks < total_num_radix_blocks) { diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_shifts.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_shifts.cuh index e271310e95..b73db8fe0a 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_shifts.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_shifts.cuh @@ -98,9 +98,9 @@ __host__ void host_integer_radix_scalar_shift_kb_inplace( // create trivial assign for value = 0 if (rotations > 0) { // cuda_memset with size 0 is invalid, so avoid it - cuda_memset_async( - rotated_buffer + (num_blocks - rotations) * big_lwe_size, - 0, rotations * big_lwe_size_bytes, stream); + cuda_memset_async(rotated_buffer + + (num_blocks - rotations) * big_lwe_size, + 0, rotations * big_lwe_size_bytes, stream); } cuda_memcpy_async_gpu_to_gpu(lwe_array, rotated_buffer, num_blocks * big_lwe_size_bytes, stream); diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/bootstrap_amortized.cu b/backends/tfhe-cuda-backend/cuda/src/pbs/bootstrap_amortized.cu index 2b0fe42305..7b4ebee024 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/bootstrap_amortized.cu +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/bootstrap_amortized.cu @@ -159,8 +159,8 @@ void cuda_bootstrap_amortized_lwe_ciphertext_vector_32( (uint32_t *)lut_vector, (uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in, (uint32_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, - polynomial_size, base_log, level_count, num_samples, num_luts, - lwe_idx, max_shared_memory); + polynomial_size, base_log, level_count, num_samples, num_luts, lwe_idx, + max_shared_memory); break; case 512: host_bootstrap_amortized>( @@ -168,8 +168,8 @@ void cuda_bootstrap_amortized_lwe_ciphertext_vector_32( (uint32_t *)lut_vector, (uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in, (uint32_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, - polynomial_size, base_log, level_count, num_samples, num_luts, - lwe_idx, max_shared_memory); + polynomial_size, base_log, level_count, num_samples, num_luts, lwe_idx, + max_shared_memory); break; case 1024: host_bootstrap_amortized>( @@ -177,8 +177,8 @@ void cuda_bootstrap_amortized_lwe_ciphertext_vector_32( (uint32_t *)lut_vector, (uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in, (uint32_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, - polynomial_size, base_log, level_count, num_samples, num_luts, - lwe_idx, max_shared_memory); + polynomial_size, base_log, level_count, num_samples, num_luts, lwe_idx, + max_shared_memory); break; case 2048: host_bootstrap_amortized>( @@ -186,8 +186,8 @@ void cuda_bootstrap_amortized_lwe_ciphertext_vector_32( (uint32_t *)lut_vector, (uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in, (uint32_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, - polynomial_size, base_log, level_count, num_samples, num_luts, - lwe_idx, max_shared_memory); + polynomial_size, base_log, level_count, num_samples, num_luts, lwe_idx, + max_shared_memory); break; case 4096: host_bootstrap_amortized>( @@ -195,8 +195,8 @@ void cuda_bootstrap_amortized_lwe_ciphertext_vector_32( (uint32_t *)lut_vector, (uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in, (uint32_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, - polynomial_size, base_log, level_count, num_samples, num_luts, - lwe_idx, max_shared_memory); + polynomial_size, base_log, level_count, num_samples, num_luts, lwe_idx, + max_shared_memory); break; case 8192: host_bootstrap_amortized>( @@ -204,8 +204,8 @@ void cuda_bootstrap_amortized_lwe_ciphertext_vector_32( (uint32_t *)lut_vector, (uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in, (uint32_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, - polynomial_size, base_log, level_count, num_samples, num_luts, - lwe_idx, max_shared_memory); + polynomial_size, base_log, level_count, num_samples, num_luts, lwe_idx, + max_shared_memory); break; case 16384: host_bootstrap_amortized>( @@ -213,8 +213,8 @@ void cuda_bootstrap_amortized_lwe_ciphertext_vector_32( (uint32_t *)lut_vector, (uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in, (uint32_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, - polynomial_size, base_log, level_count, num_samples, num_luts, - lwe_idx, max_shared_memory); + polynomial_size, base_log, level_count, num_samples, num_luts, lwe_idx, + max_shared_memory); break; default: break; @@ -303,8 +303,8 @@ void cuda_bootstrap_amortized_lwe_ciphertext_vector_64( (uint64_t *)lut_vector, (uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_in, (uint64_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, - polynomial_size, base_log, level_count, num_samples, num_luts, - lwe_idx, max_shared_memory); + polynomial_size, base_log, level_count, num_samples, num_luts, lwe_idx, + max_shared_memory); break; case 512: host_bootstrap_amortized>( @@ -312,8 +312,8 @@ void cuda_bootstrap_amortized_lwe_ciphertext_vector_64( (uint64_t *)lut_vector, (uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_in, (uint64_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, - polynomial_size, base_log, level_count, num_samples, num_luts, - lwe_idx, max_shared_memory); + polynomial_size, base_log, level_count, num_samples, num_luts, lwe_idx, + max_shared_memory); break; case 1024: host_bootstrap_amortized>( @@ -321,8 +321,8 @@ void cuda_bootstrap_amortized_lwe_ciphertext_vector_64( (uint64_t *)lut_vector, (uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_in, (uint64_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, - polynomial_size, base_log, level_count, num_samples, num_luts, - lwe_idx, max_shared_memory); + polynomial_size, base_log, level_count, num_samples, num_luts, lwe_idx, + max_shared_memory); break; case 2048: host_bootstrap_amortized>( @@ -330,8 +330,8 @@ void cuda_bootstrap_amortized_lwe_ciphertext_vector_64( (uint64_t *)lut_vector, (uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_in, (uint64_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, - polynomial_size, base_log, level_count, num_samples, num_luts, - lwe_idx, max_shared_memory); + polynomial_size, base_log, level_count, num_samples, num_luts, lwe_idx, + max_shared_memory); break; case 4096: host_bootstrap_amortized>( @@ -339,8 +339,8 @@ void cuda_bootstrap_amortized_lwe_ciphertext_vector_64( (uint64_t *)lut_vector, (uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_in, (uint64_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, - polynomial_size, base_log, level_count, num_samples, num_luts, - lwe_idx, max_shared_memory); + polynomial_size, base_log, level_count, num_samples, num_luts, lwe_idx, + max_shared_memory); break; case 8192: host_bootstrap_amortized>( @@ -348,8 +348,8 @@ void cuda_bootstrap_amortized_lwe_ciphertext_vector_64( (uint64_t *)lut_vector, (uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_in, (uint64_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, - polynomial_size, base_log, level_count, num_samples, num_luts, - lwe_idx, max_shared_memory); + polynomial_size, base_log, level_count, num_samples, num_luts, lwe_idx, + max_shared_memory); break; case 16384: host_bootstrap_amortized>( @@ -357,8 +357,8 @@ void cuda_bootstrap_amortized_lwe_ciphertext_vector_64( (uint64_t *)lut_vector, (uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_in, (uint64_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, - polynomial_size, base_log, level_count, num_samples, num_luts, - lwe_idx, max_shared_memory); + polynomial_size, base_log, level_count, num_samples, num_luts, lwe_idx, + max_shared_memory); break; default: break; diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/bootstrap_amortized.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/bootstrap_amortized.cuh index 5389576c78..7f1d24512f 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/bootstrap_amortized.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/bootstrap_amortized.cuh @@ -288,8 +288,8 @@ __host__ void host_bootstrap_amortized( Torus *lwe_input_indexes, double2 *bootstrapping_key, int8_t *pbs_buffer, uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, - uint32_t input_lwe_ciphertext_count, uint32_t num_luts, - uint32_t lwe_idx, uint32_t max_shared_memory) { + uint32_t input_lwe_ciphertext_count, uint32_t num_luts, uint32_t lwe_idx, + uint32_t max_shared_memory) { cudaSetDevice(stream->gpu_index); uint64_t SM_FULL = get_buffer_size_full_sm_bootstrap_amortized(