Skip to content

Commit

Permalink
chore(gpu): add fmt and clippy checks in tfhe-cuda-backend
Browse files Browse the repository at this point in the history
  • Loading branch information
agnesLeroy committed Feb 1, 2024
1 parent 253062c commit bce3bf1
Show file tree
Hide file tree
Showing 12 changed files with 114 additions and 98 deletions.
4 changes: 4 additions & 0 deletions .github/workflows/aws_tfhe_gpu_tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,10 @@ jobs:
echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}" >> "${GITHUB_ENV}"
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Run fmt checks
run: |
make check_fmt_gpu
- name: Run clippy checks
run: |
make clippy_gpu
Expand Down
14 changes: 12 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -157,8 +157,13 @@ fmt_gpu: install_rs_check_toolchain
check_fmt: install_rs_check_toolchain
cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" fmt --check

.PHONY: clippy_gpu # Run clippy lints on the gpu backend
clippy_gpu: install_rs_check_toolchain
.PHONY: check_fmt_gpu # Check rust and cuda code format
check_fmt_gpu: install_rs_check_toolchain
cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" fmt --check
cd "$(TFHECUDA_SRC)" && ./format_tfhe_cuda_backend.sh -c

.PHONY: clippy_gpu # Run clippy lints on tfhe with "gpu" enabled
clippy_gpu: install_rs_check_toolchain clippy_cuda_backend
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
--features=$(TARGET_ARCH_FEATURE),integer,shortint,gpu \
-p $(TFHE_SPEC) -- --no-deps -D warnings
Expand Down Expand Up @@ -246,6 +251,11 @@ clippy_js_wasm_api clippy_tasks clippy_core clippy_concrete_csprng clippy_triviu
clippy_fast: clippy clippy_all_targets clippy_c_api clippy_js_wasm_api clippy_tasks clippy_core \
clippy_concrete_csprng

.PHONY: clippy_cuda_backend # Run clippy lints on the tfhe-cuda-backend
clippy_cuda_backend: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
-p tfhe-cuda-backend -- --no-deps -D warnings

.PHONY: build_core # Build core_crypto without experimental features
build_core: install_rs_build_toolchain install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) build --profile $(CARGO_PROFILE) \
Expand Down
13 changes: 12 additions & 1 deletion backends/tfhe-cuda-backend/cuda/format_tfhe_cuda_backend.sh
Original file line number Diff line number Diff line change
@@ -1,6 +1,17 @@
#!/bin/bash

while getopts ":c" option; do
case $option in
c)
# code to execute when flag1 is provided
find ./{include,src} -iregex '^.*\.\(cpp\|cu\|h\|cuh\)$' -print | xargs clang-format-15 -i -style='file' --dry-run --Werror
cmake-format -i CMakeLists.txt -c .cmake-format-config.py
find ./{include,src} -type f -name "CMakeLists.txt" | xargs -I % sh -c 'cmake-format -i % -c .cmake-format-config.py'
git diff --exit-code
exit
;;
esac
done
find ./{include,src} -iregex '^.*\.\(cpp\|cu\|h\|cuh\)$' -print | xargs clang-format-15 -i -style='file'
cmake-format -i CMakeLists.txt -c .cmake-format-config.py

find ./{include,src} -type f -name "CMakeLists.txt" | xargs -I % sh -c 'cmake-format -i % -c .cmake-format-config.py'
19 changes: 8 additions & 11 deletions backends/tfhe-cuda-backend/cuda/include/device.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,15 +15,15 @@ extern "C" {
{ cuda_error((ans), __FILE__, __LINE__); }
inline void cuda_error(cudaError_t code, const char *file, int line) {
if (code != cudaSuccess) {
std::fprintf(stderr, "Cuda error: %s %s %d\n", cudaGetErrorString(code),
file, line);
std::fprintf(stderr, "Cuda error: %s %s %d\n", cudaGetErrorString(code),
file, line);
std::abort();
}
}
#define PANIC(format, ...) \
{ \
std::fprintf(stderr, "%s::%d::%s: panic.\n" format "\n", __FILE__, \
__LINE__, __func__, ##__VA_ARGS__); \
__LINE__, __func__, ##__VA_ARGS__); \
std::abort(); \
}

Expand All @@ -42,9 +42,7 @@ struct cuda_stream_t {
check_cuda_error(cudaStreamDestroy(stream));
}

void synchronize() {
check_cuda_error(cudaStreamSynchronize(stream));
}
void synchronize() { check_cuda_error(cudaStreamSynchronize(stream)); }
};

cuda_stream_t *cuda_create_stream(uint32_t gpu_index);
Expand All @@ -62,18 +60,18 @@ bool cuda_check_support_cooperative_groups();
void cuda_memcpy_to_cpu(void *dest, const void *src, uint64_t size);

void cuda_memcpy_async_to_gpu(void *dest, void *src, uint64_t size,
cuda_stream_t *stream);
cuda_stream_t *stream);

void cuda_memcpy_async_gpu_to_gpu(void *dest, void *src, uint64_t size,
cuda_stream_t *stream);
cuda_stream_t *stream);

void cuda_memcpy_to_gpu(void *dest, void *src, uint64_t size);

void cuda_memcpy_async_to_cpu(void *dest, const void *src, uint64_t size,
cuda_stream_t *stream);
cuda_stream_t *stream);

void cuda_memset_async(void *dest, uint64_t val, uint64_t size,
cuda_stream_t *stream);
cuda_stream_t *stream);

int cuda_get_number_of_gpus();

Expand All @@ -86,7 +84,6 @@ void cuda_drop_async(void *ptr, cuda_stream_t *stream);
int cuda_get_max_shared_memory(uint32_t gpu_index);

void cuda_synchronize_stream(cuda_stream_t *stream);

}

template <typename Torus>
Expand Down
33 changes: 17 additions & 16 deletions backends/tfhe-cuda-backend/cuda/include/integer.h
Original file line number Diff line number Diff line change
Expand Up @@ -484,8 +484,8 @@ template <typename Torus> struct int_sc_prop_memory {
};

// create lut objects
luts_array = new int_radix_lut<Torus>(
stream, params, 2, num_radix_blocks, allocate_gpu_memory);
luts_array = new int_radix_lut<Torus>(stream, params, 2, num_radix_blocks,
allocate_gpu_memory);
luts_carry_propagation_sum = new struct int_radix_lut<Torus>(
stream, params, 1, num_radix_blocks, allocate_gpu_memory);
message_acc = new struct int_radix_lut<Torus>(
Expand All @@ -507,8 +507,9 @@ template <typename Torus> struct int_sc_prop_memory {
num_radix_blocks - 1);

generate_device_accumulator_bivariate<Torus>(
stream, luts_carry_propagation_sum->lut, glwe_dimension, polynomial_size,
message_modulus, carry_modulus, f_luts_carry_propagation_sum);
stream, luts_carry_propagation_sum->lut, glwe_dimension,
polynomial_size, message_modulus, carry_modulus,
f_luts_carry_propagation_sum);

generate_device_accumulator<Torus>(stream, message_acc->lut, glwe_dimension,
polynomial_size, message_modulus,
Expand Down Expand Up @@ -575,12 +576,12 @@ template <typename Torus> struct int_mul_memory {

// create int_radix_lut objects for lsb, msb, message, carry
// luts_array -> lut = {lsb_acc, msb_acc}
luts_array = new int_radix_lut<Torus>(
stream, params, 2, total_block_count, allocate_gpu_memory);
luts_message = new int_radix_lut<Torus>(
stream, params, 1, total_block_count, luts_array);
luts_carry = new int_radix_lut<Torus>(
stream, params, 1, total_block_count, luts_array);
luts_array = new int_radix_lut<Torus>(stream, params, 2, total_block_count,
allocate_gpu_memory);
luts_message = new int_radix_lut<Torus>(stream, params, 1,
total_block_count, luts_array);
luts_carry = new int_radix_lut<Torus>(stream, params, 1, total_block_count,
luts_array);

auto lsb_acc = luts_array->get_lut(0);
auto msb_acc = luts_array->get_lut(1);
Expand Down Expand Up @@ -674,9 +675,9 @@ template <typename Torus> struct int_shift_buffer {
// here we generate 'num_bits_in_block' times lut
// one for each 'shift_within_block' = 'shift' % 'num_bits_in_block'
// even though lut_left contains 'num_bits_in_block' lut
// lut_indexes will have indexes for single lut only and those indexes will be 0
// it means for pbs corresponding lut should be selected and pass along
// lut_indexes filled with zeros
// lut_indexes will have indexes for single lut only and those indexes
// will be 0 it means for pbs corresponding lut should be selected and
// pass along lut_indexes filled with zeros

// calculate bivariate lut for each 'shift_within_block'
for (int s_w_b = 1; s_w_b < num_bits_in_block; s_w_b++) {
Expand Down Expand Up @@ -730,9 +731,9 @@ template <typename Torus> struct int_shift_buffer {

// here we generate 'message_modulus' times lut
// one for each 'shift'
// lut_indexes will have indexes for single lut only and those indexes will be 0
// it means for pbs corresponding lut should be selected and pass along
// lut_indexes filled with zeros
// lut_indexes will have indexes for single lut only and those indexes
// will be 0 it means for pbs corresponding lut should be selected and
// pass along lut_indexes filled with zeros

// calculate lut for each 'shift'
for (int shift = 0; shift < params.message_modulus; shift++) {
Expand Down
25 changes: 11 additions & 14 deletions backends/tfhe-cuda-backend/cuda/src/device.cu
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,7 @@ cuda_stream_t *cuda_create_stream(uint32_t gpu_index) {
}

/// Unsafe function to destroy CUDA stream, must check first the GPU exists
void cuda_destroy_stream(cuda_stream_t *stream) {
stream->release();
}
void cuda_destroy_stream(cuda_stream_t *stream) { stream->release(); }

/// Unsafe function that will try to allocate even if gpu_index is invalid
/// or if there's not enough memory. A safe wrapper around it must call
Expand Down Expand Up @@ -57,7 +55,8 @@ void cuda_check_valid_malloc(uint64_t size, uint32_t gpu_index) {
check_cuda_error(cudaMemGetInfo(&free_mem, &total_mem));
if (size > free_mem) {
PANIC("Cuda error: not enough memory on device. "
"Available: %zu vs Requested: %lu", free_mem, size);
"Available: %zu vs Requested: %lu",
free_mem, size);
}
}

Expand All @@ -67,14 +66,14 @@ void cuda_check_valid_malloc(uint64_t size, uint32_t gpu_index) {
bool cuda_check_support_cooperative_groups() {
int cooperative_groups_supported = 0;
check_cuda_error(cudaDeviceGetAttribute(&cooperative_groups_supported,
cudaDevAttrCooperativeLaunch, 0));
cudaDevAttrCooperativeLaunch, 0));

return cooperative_groups_supported > 0;
}

/// Copy memory to the GPU asynchronously
void cuda_memcpy_async_to_gpu(void *dest, void *src, uint64_t size,
cuda_stream_t *stream) {
cuda_stream_t *stream) {
cudaPointerAttributes attr;
check_cuda_error(cudaPointerGetAttributes(&attr, dest));
if (attr.device != stream->gpu_index && attr.type != cudaMemoryTypeDevice) {
Expand Down Expand Up @@ -108,7 +107,7 @@ void cuda_memcpy_to_cpu(void *dest, void *src, uint64_t size) {

/// Copy memory within a GPU asynchronously
void cuda_memcpy_async_gpu_to_gpu(void *dest, void *src, uint64_t size,
cuda_stream_t *stream) {
cuda_stream_t *stream) {
cudaPointerAttributes attr_dest;
check_cuda_error(cudaPointerGetAttributes(&attr_dest, dest));
if (attr_dest.device != stream->gpu_index &&
Expand Down Expand Up @@ -137,7 +136,7 @@ void cuda_synchronize_device(uint32_t gpu_index) {
}

void cuda_memset_async(void *dest, uint64_t val, uint64_t size,
cuda_stream_t *stream) {
cuda_stream_t *stream) {
cudaPointerAttributes attr;
check_cuda_error(cudaPointerGetAttributes(&attr, dest));
if (size == 0) {
Expand All @@ -164,8 +163,8 @@ void cuda_set_value_async(cudaStream_t *stream, Torus *d_array, Torus value,
int num_blocks = (n + block_size - 1) / block_size;

// Launch the kernel
cuda_set_value_kernel<<<num_blocks, block_size, 0, *stream>>>
(d_array, value, n);
cuda_set_value_kernel<<<num_blocks, block_size, 0, *stream>>>(d_array, value,
n);
check_cuda_error(cudaGetLastError());
}

Expand All @@ -177,7 +176,7 @@ template void cuda_set_value_async(cudaStream_t *stream, uint32_t *d_array,

/// Copy memory to the CPU asynchronously
void cuda_memcpy_async_to_cpu(void *dest, const void *src, uint64_t size,
cuda_stream_t *stream) {
cuda_stream_t *stream) {
cudaPointerAttributes attr;
check_cuda_error(cudaPointerGetAttributes(&attr, src));
if (attr.device != stream->gpu_index && attr.type != cudaMemoryTypeDevice) {
Expand Down Expand Up @@ -238,6 +237,4 @@ int cuda_get_max_shared_memory(uint32_t gpu_index) {
return max_shared_memory;
}

void cuda_synchronize_stream(cuda_stream_t *stream) {
stream->synchronize();
}
void cuda_synchronize_stream(cuda_stream_t *stream) { stream->synchronize(); }
10 changes: 4 additions & 6 deletions backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -59,8 +59,7 @@ void execute_pbs(cuda_stream_t *stream, Torus *lwe_array_out,
lut_vector_indexes, lwe_array_in, lwe_input_indexes,
bootstrapping_key, pbs_buffer, lwe_dimension, glwe_dimension,
polynomial_size, grouping_factor, base_log, level_count,
input_lwe_ciphertext_count, num_luts, lwe_idx,
max_shared_memory);
input_lwe_ciphertext_count, num_luts, lwe_idx, max_shared_memory);
break;
case LOW_LAT:
cuda_bootstrap_low_latency_lwe_ciphertext_vector_64(
Expand Down Expand Up @@ -413,10 +412,9 @@ void host_propagate_single_carry_low_latency(cuda_stream_t *stream,
/*
* input_blocks: input radix ciphertext propagation will happen inplace
* acc_message_carry: list of two lut s, [(message_acc), (carry_acc)]
* lut_indexes_message_carry: lut_indexes for message and carry, should always be {0, 1}
* small_lwe_vector: output of keyswitch should have
* size = 2 * (lwe_dimension + 1) * sizeof(Torus)
* big_lwe_vector: output of pbs should have
* lut_indexes_message_carry: lut_indexes for message and carry, should always
* be {0, 1} small_lwe_vector: output of keyswitch should have size = 2 *
* (lwe_dimension + 1) * sizeof(Torus) big_lwe_vector: output of pbs should have
* size = 2 * (glwe_dimension * polynomial_size + 1) * sizeof(Torus)
*/
template <typename Torus, typename STorus, class params>
Expand Down
21 changes: 10 additions & 11 deletions backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -361,10 +361,9 @@ __host__ void host_integer_mult_radix_kb(
size_t sm_size = big_lwe_size * sizeof(Torus);
if (ch_amount != 0) {
// cuda_memset with size 0 is invalid, so avoid it
cuda_memset_async(
new_blocks, 0,
ch_amount * num_blocks * big_lwe_size * sizeof(Torus),
stream);
cuda_memset_async(new_blocks, 0,
ch_amount * num_blocks * big_lwe_size * sizeof(Torus),
stream);
}

tree_add_chunks<Torus, params><<<add_grid, 256, sm_size, stream->stream>>>(
Expand Down Expand Up @@ -420,8 +419,8 @@ __host__ void host_integer_mult_radix_kb(
execute_pbs<Torus>(stream, carry_blocks_vector, lwe_indexes,
luts_carry->lut, luts_carry->lut_indexes,
&small_lwe_vector[message_count * (lwe_dimension + 1)],
lwe_indexes, bsk, luts_carry->pbs_buffer,
glwe_dimension, lwe_dimension, polynomial_size,
lwe_indexes, bsk, luts_carry->pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size,
mem_ptr->params.pbs_base_log, mem_ptr->params.pbs_level,
mem_ptr->params.grouping_factor, carry_count, 1, 0,
max_shared_memory, mem_ptr->params.pbs_type);
Expand Down Expand Up @@ -449,7 +448,7 @@ __host__ void host_integer_mult_radix_kb(

dim3 add_grid(1, num_blocks, 1);
size_t sm_size = big_lwe_size * sizeof(Torus);
cuda_memset_async(radix_lwe_out, 0, num_blocks * big_lwe_size * sizeof(Torus),
cuda_memset_async(radix_lwe_out, 0, num_blocks * big_lwe_size * sizeof(Torus),
stream);
tree_add_chunks<Torus, params><<<add_grid, 256, sm_size, stream->stream>>>(
radix_lwe_out, old_blocks, r, num_blocks);
Expand Down Expand Up @@ -548,13 +547,13 @@ void apply_lookup_table(Torus *input_ciphertexts, Torus *output_ciphertexts,
Torus *cur_lut_indexes;
if (lsb_msb_mode) {
cur_lut_indexes = (big_lwe_start_index < lsb_message_blocks_count)
? mem_ptr->lut_indexes_lsb_multi_gpu[i]
: mem_ptr->lut_indexes_msb_multi_gpu[i];
? mem_ptr->lut_indexes_lsb_multi_gpu[i]
: mem_ptr->lut_indexes_msb_multi_gpu[i];

} else {
cur_lut_indexes = (big_lwe_start_index < lsb_message_blocks_count)
? mem_ptr->lut_indexes_message_multi_gpu[i]
: mem_ptr->lut_indexes_carry_multi_gpu[i];
? mem_ptr->lut_indexes_message_multi_gpu[i]
: mem_ptr->lut_indexes_carry_multi_gpu[i];
}

// execute keyswitch on a current gpu with corresponding input and output
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -68,10 +68,9 @@ __host__ void host_integer_radix_scalar_difference_check_kb(
// garbage.
if (total_num_radix_blocks > 1) {
// cuda_memset with size 0 is invalid, so avoid it
cuda_memset_async(
lwe_array_out + big_lwe_size, 0,
big_lwe_size_bytes * (total_num_radix_blocks - 1),
stream);
cuda_memset_async(lwe_array_out + big_lwe_size, 0,
big_lwe_size_bytes * (total_num_radix_blocks - 1),
stream);
}

} else if (total_num_scalar_blocks < total_num_radix_blocks) {
Expand Down
6 changes: 3 additions & 3 deletions backends/tfhe-cuda-backend/cuda/src/integer/scalar_shifts.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -98,9 +98,9 @@ __host__ void host_integer_radix_scalar_shift_kb_inplace(
// create trivial assign for value = 0
if (rotations > 0) {
// cuda_memset with size 0 is invalid, so avoid it
cuda_memset_async(
rotated_buffer + (num_blocks - rotations) * big_lwe_size,
0, rotations * big_lwe_size_bytes, stream);
cuda_memset_async(rotated_buffer +
(num_blocks - rotations) * big_lwe_size,
0, rotations * big_lwe_size_bytes, stream);
}
cuda_memcpy_async_gpu_to_gpu(lwe_array, rotated_buffer,
num_blocks * big_lwe_size_bytes, stream);
Expand Down
Loading

0 comments on commit bce3bf1

Please sign in to comment.