Skip to content

Commit

Permalink
feat(gpu): implement signed scalar ge, gt, le, lt, max, and min
Browse files Browse the repository at this point in the history
  • Loading branch information
agnesLeroy authored and pdroalves committed Apr 8, 2024
1 parent b6868e0 commit 3972e62
Show file tree
Hide file tree
Showing 18 changed files with 1,229 additions and 249 deletions.
60 changes: 41 additions & 19 deletions backends/tfhe-cuda-backend/cuda/include/integer.h
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ enum COMPARISON_TYPE {
MAX = 6,
MIN = 7,
};
enum IS_RELATIONSHIP { IS_INFERIOR = 0, IS_EQUAL = 1, IS_SUPERIOR = 2 };
enum CMP_ORDERING { IS_INFERIOR = 0, IS_EQUAL = 1, IS_SUPERIOR = 2 };

extern "C" {
void scratch_cuda_full_propagation_64(
Expand Down Expand Up @@ -1846,6 +1846,8 @@ template <typename Torus> struct int_tree_sign_reduction_buffer {
bool allocate_gpu_memory) {
this->params = params;

Torus big_size = (params.big_lwe_dimension + 1) * sizeof(Torus);

block_selector_f = [](Torus msb, Torus lsb) -> Torus {
if (msb == IS_EQUAL) // EQUAL
return lsb;
Expand All @@ -1854,13 +1856,8 @@ template <typename Torus> struct int_tree_sign_reduction_buffer {
};

if (allocate_gpu_memory) {
tmp_x = (Torus *)cuda_malloc_async((params.big_lwe_dimension + 1) *
num_radix_blocks * sizeof(Torus),
stream);
tmp_y = (Torus *)cuda_malloc_async((params.big_lwe_dimension + 1) *
num_radix_blocks * sizeof(Torus),
stream);

tmp_x = (Torus *)cuda_malloc_async(big_size * num_radix_blocks, stream);
tmp_y = (Torus *)cuda_malloc_async(big_size * num_radix_blocks, stream);
// LUTs
tree_inner_leaf_lut = new int_radix_lut<Torus>(
stream, params, 1, num_radix_blocks, allocate_gpu_memory);
Expand Down Expand Up @@ -1901,6 +1898,10 @@ template <typename Torus> struct int_comparison_diff_buffer {

int_tree_sign_reduction_buffer<Torus> *tree_buffer;

Torus *tmp_signs_a;
Torus *tmp_signs_b;
int_radix_lut<Torus> *reduce_signs_lut;

int_comparison_diff_buffer(cuda_stream_t *stream, COMPARISON_TYPE op,
int_radix_params params, uint32_t num_radix_blocks,
bool allocate_gpu_memory) {
Expand All @@ -1922,7 +1923,6 @@ template <typename Torus> struct int_comparison_diff_buffer {
return 42;
}
};

if (allocate_gpu_memory) {

Torus big_size = (params.big_lwe_dimension + 1) * sizeof(Torus);
Expand All @@ -1935,15 +1935,26 @@ template <typename Torus> struct int_comparison_diff_buffer {

tree_buffer = new int_tree_sign_reduction_buffer<Torus>(
stream, operator_f, params, num_radix_blocks, allocate_gpu_memory);
tmp_signs_a =
(Torus *)cuda_malloc_async(big_size * num_radix_blocks, stream);
tmp_signs_b =
(Torus *)cuda_malloc_async(big_size * num_radix_blocks, stream);
// LUTs
reduce_signs_lut = new int_radix_lut<Torus>(
stream, params, 1, num_radix_blocks, allocate_gpu_memory);
}
}

void release(cuda_stream_t *stream) {
tree_buffer->release(stream);
delete tree_buffer;
reduce_signs_lut->release(stream);
delete reduce_signs_lut;

cuda_drop_async(tmp_packed_left, stream);
cuda_drop_async(tmp_packed_right, stream);
cuda_drop_async(tmp_signs_a, stream);
cuda_drop_async(tmp_signs_b, stream);
}
};

Expand All @@ -1963,6 +1974,7 @@ template <typename Torus> struct int_comparison_buffer {

Torus *tmp_block_comparisons;
Torus *tmp_lwe_array_out;
Torus *tmp_trivial_sign_block;

// Scalar EQ / NE
Torus *tmp_packed_input;
Expand All @@ -1975,6 +1987,7 @@ template <typename Torus> struct int_comparison_buffer {
bool is_signed;

// Used for scalar comparisons
int_radix_lut<Torus> *signed_msb_lut;
cuda_stream_t *lsb_stream;
cuda_stream_t *msb_stream;

Expand All @@ -1987,22 +2000,22 @@ template <typename Torus> struct int_comparison_buffer {

identity_lut_f = [](Torus x) -> Torus { return x; };

auto big_lwe_size = params.big_lwe_dimension + 1;

if (allocate_gpu_memory) {
lsb_stream = cuda_create_stream(stream->gpu_index);
msb_stream = cuda_create_stream(stream->gpu_index);

// +1 to have space for signed comparison
tmp_lwe_array_out = (Torus *)cuda_malloc_async(
(params.big_lwe_dimension + 1) * num_radix_blocks * sizeof(Torus),
stream);
big_lwe_size * (num_radix_blocks + 1) * sizeof(Torus), stream);

tmp_packed_input = (Torus *)cuda_malloc_async(
(params.big_lwe_dimension + 1) * 2 * num_radix_blocks * sizeof(Torus),
stream);
big_lwe_size * 2 * num_radix_blocks * sizeof(Torus), stream);

// Block comparisons
tmp_block_comparisons = (Torus *)cuda_malloc_async(
(params.big_lwe_dimension + 1) * num_radix_blocks * sizeof(Torus),
stream);
big_lwe_size * num_radix_blocks * sizeof(Torus), stream);

// Cleaning LUT
identity_lut = new int_radix_lut<Torus>(
Expand Down Expand Up @@ -2054,13 +2067,19 @@ template <typename Torus> struct int_comparison_buffer {
}

if (is_signed) {

tmp_trivial_sign_block =
(Torus *)cuda_malloc_async(big_lwe_size * sizeof(Torus), stream);

signed_lut =
new int_radix_lut<Torus>(stream, params, 1, 1, allocate_gpu_memory);
signed_msb_lut =
new int_radix_lut<Torus>(stream, params, 1, 1, allocate_gpu_memory);

auto message_modulus = (int)params.message_modulus;
uint32_t sign_bit_pos = log2(message_modulus) - 1;
std::function<Torus(Torus, Torus)> signed_lut_f;
signed_lut_f = [sign_bit_pos](Torus x, Torus y) -> Torus {
std::function<Torus(Torus, Torus)> signed_lut_f =
[sign_bit_pos](Torus x, Torus y) -> Torus {
auto x_sign_bit = x >> sign_bit_pos;
auto y_sign_bit = y >> sign_bit_pos;

Expand All @@ -2076,14 +2095,14 @@ template <typename Torus> struct int_comparison_buffer {
return (Torus)(IS_INFERIOR);
else if (x == y)
return (Torus)(IS_EQUAL);
else if (x > y)
else
return (Torus)(IS_SUPERIOR);
} else {
if (x < y)
return (Torus)(IS_SUPERIOR);
else if (x == y)
return (Torus)(IS_EQUAL);
else if (x > y)
else
return (Torus)(IS_INFERIOR);
}
PANIC("Cuda error: sign_lut creation failed due to wrong function.")
Expand Down Expand Up @@ -2126,8 +2145,11 @@ template <typename Torus> struct int_comparison_buffer {
cuda_drop_async(tmp_packed_input, stream);

if (is_signed) {
cuda_drop_async(tmp_trivial_sign_block, stream);
signed_lut->release(stream);
delete (signed_lut);
signed_msb_lut->release(stream);
delete (signed_msb_lut);
}
cuda_destroy_stream(lsb_stream);
cuda_destroy_stream(msb_stream);
Expand Down
2 changes: 1 addition & 1 deletion backends/tfhe-cuda-backend/cuda/src/integer/comparison.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -273,7 +273,7 @@ __host__ void host_compare_with_zero_equality(
remainder_blocks -= (chunk_size - 1);

// Update operands
chunk += chunk_size * big_lwe_size;
chunk += (chunk_size - 1) * big_lwe_size;
sum_i += big_lwe_size;
}
}
Expand Down
119 changes: 118 additions & 1 deletion backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -587,7 +587,7 @@ __global__ void device_pack_blocks(Torus *lwe_array_out, Torus *lwe_array_in,
packed_block[tid] = lsb_block[tid] + factor * msb_block[tid];
}

if (num_radix_blocks % 2 != 0) {
if (num_radix_blocks % 2 == 1) {
// We couldn't pack the last block, so we just copy it
Torus *lsb_block =
lwe_array_in + (num_radix_blocks - 1) * (lwe_dimension + 1);
Expand All @@ -599,6 +599,36 @@ __global__ void device_pack_blocks(Torus *lwe_array_out, Torus *lwe_array_in,
}
}

// template <typename Torus>
//__global__ void device_pack_blocks(Torus *lwe_array_out, Torus *lwe_array_in,
// uint32_t lwe_dimension,
// uint32_t num_radix_blocks, uint32_t
// factor) {
// int tid = threadIdx.x + blockIdx.x * blockDim.x;
// int bid = tid / (lwe_dimension + 1);
// int total_blocks = (num_radix_blocks / 2) + (num_radix_blocks % 2);
//
// if (tid < total_blocks * (lwe_dimension + 1)) {
//
// if (bid < num_radix_blocks / 2) {
// Torus *lsb_block = lwe_array_in + (2 * bid) * (lwe_dimension + 1);
// Torus *msb_block = lsb_block + (lwe_dimension + 1);
//
// Torus *packed_block = lwe_array_out + bid * (lwe_dimension + 1);
//
// packed_block[tid] = lsb_block[tid] + factor * msb_block[tid];
// }else if (bid == num_radix_blocks / 2) {
// // We can't pack the last block, so we just copy it
// Torus *lsb_block =
// lwe_array_in + (num_radix_blocks - 1) * (lwe_dimension + 1);
// Torus *last_block =
// lwe_array_out + (num_radix_blocks / 2) * (lwe_dimension + 1);
//
// last_block[tid] = lsb_block[tid];
// }
// }
// }

// Packs the low ciphertext in the message parts of the high ciphertext
// and moves the high ciphertext into the carry part.
//
Expand Down Expand Up @@ -684,4 +714,91 @@ __host__ void extract_n_bits(cuda_stream_t *stream, Torus *lwe_array_out,
num_radix_blocks * bits_per_block, bit_extract->lut);
}

// Reduces an array of `num_sign_blocks` encrypted sign blocks (each holding a
// CMP_ORDERING value: IS_INFERIOR / IS_EQUAL / IS_SUPERIOR, see integer.h)
// down to a single output block, then maps that final ordering through
// `sign_handler_f` (e.g. to produce the boolean result of ge/gt/le/lt).
//
// Strategy: repeatedly pack pairs of adjacent sign blocks into one block
// (msb*4 + lsb, via pack_blocks with factor 4) and collapse each packed pair
// with a LUT built from the tree buffer's block_selector_f, halving the block
// count per pass. An odd leftover block is carried over unchanged. The last
// 1 or 2 blocks are folded with a final LUT that also applies sign_handler_f.
//
// All device work is enqueued on `stream` (async, stream-ordered); the result
// is not valid on the host until the stream is synchronized by the caller.
//
// - signs_array_out: output radix block (1 block) receiving the final value
// - signs_array_in:  input array of `num_sign_blocks` sign blocks
// - mem_ptr:         comparison buffer providing scratch (tmp_signs_a/b) and
//                    the reduce_signs_lut; must have been allocated with at
//                    least `num_sign_blocks` blocks of scratch
// - sign_handler_f:  plaintext function applied to the final CMP_ORDERING
// - bsk / ksk:       bootstrapping and keyswitching keys for the PBS
// - num_sign_blocks: number of input sign blocks (assumed >= 1 — TODO confirm
//                    callers never pass 0; the code below would then apply the
//                    final LUT to uninitialized scratch)
template <typename Torus>
__host__ void reduce_signs(cuda_stream_t *stream, Torus *signs_array_out,
                           Torus *signs_array_in,
                           int_comparison_buffer<Torus> *mem_ptr,
                           std::function<Torus(Torus)> sign_handler_f,
                           void *bsk, Torus *ksk, uint32_t num_sign_blocks) {

  auto diff_buffer = mem_ptr->diff_buffer;

  auto params = mem_ptr->params;
  auto big_lwe_dimension = params.big_lwe_dimension;
  auto glwe_dimension = params.glwe_dimension;
  auto polynomial_size = params.polynomial_size;
  auto message_modulus = params.message_modulus;
  auto carry_modulus = params.carry_modulus;

  // Unpacks a packed pair of orderings (msb in bits 3..2, lsb in bits 1..0)
  // and selects one of them: msb wins unless it is IS_EQUAL, in which case
  // the lsb decides (see block_selector_f in int_tree_sign_reduction_buffer).
  // NOTE(review): `sign_handler_f` is captured here but never used by this
  // lambda — the capture looks removable; confirm before cleaning up.
  std::function<Torus(Torus)> reduce_two_orderings_function =
      [diff_buffer, sign_handler_f](Torus x) -> Torus {
    int msb = (x >> 2) & 3;
    int lsb = x & 3;

    return diff_buffer->tree_buffer->block_selector_f(msb, lsb);
  };

  // Scratch ping-pong buffers owned by the diff buffer; both hold up to
  // num_radix_blocks big-LWE ciphertexts.
  auto signs_a = diff_buffer->tmp_signs_a;
  auto signs_b = diff_buffer->tmp_signs_b;

  // Work on a scratch copy so signs_array_in is left untouched.
  cuda_memcpy_async_gpu_to_gpu(
      signs_a, signs_array_in,
      (big_lwe_dimension + 1) * num_sign_blocks * sizeof(Torus), stream);
  if (num_sign_blocks > 2) {
    // Build the pairwise-reduction LUT once; it is reused on every pass of
    // the halving loop below.
    auto lut = diff_buffer->reduce_signs_lut;
    generate_device_accumulator<Torus>(
        stream, lut->lut, glwe_dimension, polynomial_size, message_modulus,
        carry_modulus, reduce_two_orderings_function);

    // Each pass: pack adjacent blocks in pairs (signs_a -> signs_b), then
    // apply the LUT to collapse each packed pair back into signs_a,
    // halving num_sign_blocks (rounding up for an odd leftover).
    while (num_sign_blocks > 2) {
      pack_blocks(stream, signs_b, signs_a, big_lwe_dimension, num_sign_blocks,
                  4);
      integer_radix_apply_univariate_lookup_table_kb(
          stream, signs_a, signs_b, bsk, ksk, num_sign_blocks / 2, lut);

      // With an odd count, pack_blocks copies the unpaired last block
      // verbatim into signs_b; carry it into signs_a after the reduced pairs.
      auto last_block_signs_b =
          signs_b + (num_sign_blocks / 2) * (big_lwe_dimension + 1);
      auto last_block_signs_a =
          signs_a + (num_sign_blocks / 2) * (big_lwe_dimension + 1);
      if (num_sign_blocks % 2 == 1)
        cuda_memcpy_async_gpu_to_gpu(last_block_signs_a, last_block_signs_b,
                                     (big_lwe_dimension + 1) * sizeof(Torus),
                                     stream);

      num_sign_blocks = (num_sign_blocks / 2) + (num_sign_blocks % 2);
    }
  }

  if (num_sign_blocks == 2) {
    // Final step for two remaining blocks: one LUT that both collapses the
    // packed pair and applies the caller's sign handler.
    std::function<Torus(Torus)> final_lut_f =
        [reduce_two_orderings_function, sign_handler_f](Torus x) -> Torus {
      Torus final_sign = reduce_two_orderings_function(x);
      return sign_handler_f(final_sign);
    };

    // NOTE(review): reduce_signs_lut is regenerated in-place here; safe only
    // because the previous loop's uses of it are already enqueued on the same
    // stream ahead of this — TODO confirm generate_device_accumulator is
    // stream-ordered with respect to the PBS launches.
    auto lut = diff_buffer->reduce_signs_lut;
    generate_device_accumulator<Torus>(stream, lut->lut, glwe_dimension,
                                       polynomial_size, message_modulus,
                                       carry_modulus, final_lut_f);

    pack_blocks(stream, signs_b, signs_a, big_lwe_dimension, 2, 4);
    integer_radix_apply_univariate_lookup_table_kb(stream, signs_array_out,
                                                   signs_b, bsk, ksk, 1, lut);

  } else {

    // Single remaining block: apply the sign handler directly to the ordering
    // stored in its low 2 bits.
    // NOTE(review): `mem_ptr` is captured but unused by this lambda.
    std::function<Torus(Torus)> final_lut_f =
        [mem_ptr, sign_handler_f](Torus x) -> Torus {
      return sign_handler_f(x & 3);
    };

    auto lut = mem_ptr->diff_buffer->reduce_signs_lut;
    generate_device_accumulator<Torus>(stream, lut->lut, glwe_dimension,
                                       polynomial_size, message_modulus,
                                       carry_modulus, final_lut_f);

    integer_radix_apply_univariate_lookup_table_kb(stream, signs_array_out,
                                                   signs_a, bsk, ksk, 1, lut);
  }
}
#endif // TFHE_RS_INTERNAL_INTEGER_CUH
Loading

0 comments on commit 3972e62

Please sign in to comment.