Skip to content

Commit

Permalink
fix(gpu): fix some edge-cases (and booleans) on compression
Browse files Browse the repository at this point in the history
  • Loading branch information
pdroalves authored and agnesLeroy committed Sep 10, 2024
1 parent 39c424b commit 2a4026c
Show file tree
Hide file tree
Showing 11 changed files with 150 additions and 96 deletions.
1 change: 1 addition & 0 deletions .github/workflows/gpu_fast_h100_tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -147,6 +147,7 @@ jobs:
- name: Run core crypto and internal CUDA backend tests
run: |
BIG_TESTS_INSTANCE=TRUE make test_core_crypto_gpu
BIG_TESTS_INSTANCE=TRUE make test_integer_compression_gpu
BIG_TESTS_INSTANCE=TRUE make test_cuda_backend
- name: Run user docs tests
Expand Down
1 change: 1 addition & 0 deletions .github/workflows/gpu_fast_tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,7 @@ jobs:
- name: Run core crypto and internal CUDA backend tests
run: |
make test_core_crypto_gpu
make test_integer_compression_gpu
make test_cuda_backend
- name: Run user docs tests
Expand Down
4 changes: 4 additions & 0 deletions .github/workflows/gpu_full_multi_gpu_tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,10 @@ jobs:
if: ${{ !cancelled() }}
run: nvidia-smi

- name: Run multi-bit CUDA integer compression tests
run: |
BIG_TESTS_INSTANCE=TRUE make test_integer_compression_gpu
# No need to test core_crypto and classic PBS in integer since it's already tested on single GPU.
- name: Run multi-bit CUDA integer tests
run: |
Expand Down
13 changes: 13 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -481,6 +481,13 @@ test_integer_gpu: install_rs_build_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --doc --profile $(CARGO_PROFILE) \
--features=$(TARGET_ARCH_FEATURE),integer,gpu -p $(TFHE_SPEC) -- integer::gpu::server_key::

.PHONY: test_integer_compression_gpu
test_integer_compression_gpu: install_rs_build_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
--features=$(TARGET_ARCH_FEATURE),integer,gpu -p $(TFHE_SPEC) -- integer::gpu::ciphertext::compressed_ciphertext_list::tests::test_gpu_ciphertext_compression
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --doc --profile $(CARGO_PROFILE) \
--features=$(TARGET_ARCH_FEATURE),integer,gpu -p $(TFHE_SPEC) -- integer::gpu::ciphertext::compress

.PHONY: test_integer_gpu_ci # Run the tests for integer ci on gpu backend
test_integer_gpu_ci: install_rs_check_toolchain install_cargo_nextest
BIG_TESTS_INSTANCE="$(BIG_TESTS_INSTANCE)" \
Expand Down Expand Up @@ -883,6 +890,12 @@ bench_integer_gpu: install_rs_check_toolchain
--bench integer-bench \
--features=$(TARGET_ARCH_FEATURE),integer,gpu,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) --

.PHONY: bench_integer_compression_gpu
bench_integer_compression_gpu: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
--bench glwe_packing_compression-integer-bench \
--features=$(TARGET_ARCH_FEATURE),integer,internal-keycache,gpu -p $(TFHE_SPEC) --

.PHONY: bench_integer_multi_bit # Run benchmarks for unsigned integer using multi-bit parameters
bench_integer_multi_bit: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=MULTI_BIT \
Expand Down
11 changes: 7 additions & 4 deletions backends/tfhe-cuda-backend/cuda/include/compression.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,8 @@ void scratch_cuda_integer_decompress_radix_ciphertext_64(
uint32_t compression_glwe_dimension, uint32_t compression_polynomial_size,
uint32_t lwe_dimension, uint32_t pbs_level, uint32_t pbs_base_log,
uint32_t num_lwes, uint32_t message_modulus, uint32_t carry_modulus,
PBS_TYPE pbs_type, uint32_t storage_log_modulus, bool allocate_gpu_memory);
PBS_TYPE pbs_type, uint32_t storage_log_modulus, uint32_t body_count,
bool allocate_gpu_memory);

void cuda_integer_compress_radix_ciphertext_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
Expand Down Expand Up @@ -94,6 +95,7 @@ template <typename Torus> struct int_decompression {

uint32_t storage_log_modulus;

uint32_t num_lwes;
uint32_t body_count;

Torus *tmp_extracted_glwe;
Expand All @@ -104,12 +106,13 @@ template <typename Torus> struct int_decompression {
int_decompression(cudaStream_t *streams, uint32_t *gpu_indexes,
uint32_t gpu_count, int_radix_params encryption_params,
int_radix_params compression_params,
uint32_t num_radix_blocks, uint32_t storage_log_modulus,
bool allocate_gpu_memory) {
uint32_t num_radix_blocks, uint32_t body_count,
uint32_t storage_log_modulus, bool allocate_gpu_memory) {
this->encryption_params = encryption_params;
this->compression_params = compression_params;
this->storage_log_modulus = storage_log_modulus;
this->body_count = num_radix_blocks;
this->num_lwes = num_radix_blocks;
this->body_count = body_count;

if (allocate_gpu_memory) {
Torus glwe_accumulator_size = (compression_params.glwe_dimension + 1) *
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,24 +25,25 @@ void scratch_cuda_integer_decompress_radix_ciphertext_64(
uint32_t compression_glwe_dimension, uint32_t compression_polynomial_size,
uint32_t lwe_dimension, uint32_t pbs_level, uint32_t pbs_base_log,
uint32_t num_lwes, uint32_t message_modulus, uint32_t carry_modulus,
PBS_TYPE pbs_type, uint32_t storage_log_modulus, bool allocate_gpu_memory) {
PBS_TYPE pbs_type, uint32_t storage_log_modulus, uint32_t body_count,
bool allocate_gpu_memory) {

// Decompression doesn't keyswitch, so big and small dimensions are the same
int_radix_params encryption_params(
pbs_type, encryption_glwe_dimension, encryption_polynomial_size,
(encryption_glwe_dimension + 1) * encryption_polynomial_size,
lwe_dimension, 0, 0, pbs_level, pbs_base_log, 0, message_modulus,
carry_modulus);
lwe_dimension, lwe_dimension, 0, 0, pbs_level, pbs_base_log, 0,
message_modulus, carry_modulus);

int_radix_params compression_params(
pbs_type, compression_glwe_dimension, compression_polynomial_size,
(compression_glwe_dimension + 1) * compression_polynomial_size,
lwe_dimension, 0, 0, pbs_level, pbs_base_log, 0, message_modulus,
carry_modulus);
lwe_dimension, compression_glwe_dimension * compression_polynomial_size,
0, 0, pbs_level, pbs_base_log, 0, message_modulus, carry_modulus);

scratch_cuda_integer_decompress_radix_ciphertext_64(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
(int_decompression<uint64_t> **)mem_ptr, num_lwes, encryption_params,
compression_params, storage_log_modulus, allocate_gpu_memory);
(int_decompression<uint64_t> **)mem_ptr, num_lwes, body_count,
encryption_params, compression_params, storage_log_modulus,
allocate_gpu_memory);
}
void cuda_integer_compress_radix_ciphertext_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,6 @@ __host__ void host_pack(cudaStream_t stream, uint32_t gpu_index,
auto log_modulus = mem_ptr->storage_log_modulus;
auto in_len = params.glwe_dimension * params.polynomial_size + body_count;
auto number_bits_to_pack = in_len * log_modulus;

auto nbits = sizeof(Torus) * 8;
// number_bits_to_pack.div_ceil(Scalar::BITS)
auto len = (number_bits_to_pack + nbits - 1) / nbits;
Expand Down Expand Up @@ -80,6 +79,7 @@ __host__ void host_integer_compress(cudaStream_t *streams,
uint32_t glwe_out_size = (compression_params.glwe_dimension + 1) *
compression_params.polynomial_size;
uint32_t num_glwes = num_lwes / mem_ptr->lwe_per_glwe + 1;
auto body_count = min(num_lwes, mem_ptr->lwe_per_glwe);

// Keyswitch LWEs to GLWE
auto tmp_glwe_array_out = mem_ptr->tmp_glwe_array_out;
Expand All @@ -92,11 +92,9 @@ __host__ void host_integer_compress(cudaStream_t *streams,
streams[0], gpu_indexes[0], glwe_out, lwe_subset, fp_ksk[0],
fp_ks_buffer, input_lwe_dimension, compression_params.glwe_dimension,
compression_params.polynomial_size, compression_params.ks_base_log,
compression_params.ks_level, min(num_lwes, mem_ptr->lwe_per_glwe));
compression_params.ks_level, body_count);
}

auto body_count = min(num_lwes, mem_ptr->lwe_per_glwe);

// Modulus switch
host_modulus_switch_inplace(streams[0], gpu_indexes[0], tmp_glwe_array_out,
num_glwes *
Expand Down Expand Up @@ -156,15 +154,15 @@ __host__ void host_extract(cudaStream_t stream, uint32_t gpu_index,
auto log_modulus = mem_ptr->storage_log_modulus;

uint32_t body_count = mem_ptr->body_count;

auto initial_out_len =
params.glwe_dimension * params.polynomial_size + body_count * body_count;
params.glwe_dimension * params.polynomial_size + body_count;

// We ensure the tail of the glwe is zeroed
auto zeroed_slice =
glwe_array_out + params.glwe_dimension * params.polynomial_size;
cuda_memset_async(zeroed_slice, 0, params.polynomial_size * sizeof(Torus),
auto zeroed_slice = glwe_array_out + initial_out_len;
cuda_memset_async(zeroed_slice, 0,
(params.polynomial_size - body_count) * sizeof(Torus),
stream, gpu_index);

int num_blocks = 0, num_threads = 0;
getNumBlocksAndThreads(initial_out_len, 128, num_blocks, num_threads);
dim3 grid(num_blocks);
Expand All @@ -187,7 +185,7 @@ host_integer_decompress(cudaStream_t *streams, uint32_t *gpu_indexes,
host_extract(streams[0], gpu_indexes[0], extracted_glwe, packed_glwe_in, 0,
mem_ptr);

auto num_lwes = mem_ptr->body_count;
auto num_lwes = mem_ptr->num_lwes;

// Sample extract
auto extracted_lwe = mem_ptr->tmp_extracted_lwe;
Expand All @@ -199,17 +197,58 @@ host_integer_decompress(cudaStream_t *streams, uint32_t *gpu_indexes,
/// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
/// dimension to a big LWE dimension
auto encryption_params = mem_ptr->encryption_params;
auto carry_extract_lut = mem_ptr->carry_extract_lut;
execute_pbs_async<Torus>(
streams, gpu_indexes, gpu_count, lwe_array_out,
carry_extract_lut->lwe_indexes_out, carry_extract_lut->lut_vec,
carry_extract_lut->lut_indexes_vec, extracted_lwe,
carry_extract_lut->lwe_indexes_in, bsks, carry_extract_lut->buffer,
encryption_params.glwe_dimension,
compression_params.glwe_dimension * compression_params.polynomial_size,
encryption_params.polynomial_size, encryption_params.pbs_base_log,
encryption_params.pbs_level, encryption_params.grouping_factor, num_lwes,
encryption_params.pbs_type);
auto lut = mem_ptr->carry_extract_lut;
auto active_gpu_count = get_active_gpu_count(num_lwes, gpu_count);
if (active_gpu_count == 1) {
execute_pbs_async<Torus>(
streams, gpu_indexes, active_gpu_count, lwe_array_out,
lut->lwe_indexes_out, lut->lut_vec, lut->lut_indexes_vec, extracted_lwe,
lut->lwe_indexes_in, bsks, lut->buffer,
encryption_params.glwe_dimension,
compression_params.small_lwe_dimension,
encryption_params.polynomial_size, encryption_params.pbs_base_log,
encryption_params.pbs_level, encryption_params.grouping_factor,
num_lwes, encryption_params.pbs_type);
} else {
/// For multi GPU execution we create vectors of pointers for inputs and
/// outputs
std::vector<Torus *> lwe_array_in_vec = lut->lwe_array_in_vec;
std::vector<Torus *> lwe_after_pbs_vec = lut->lwe_after_pbs_vec;
std::vector<Torus *> lwe_trivial_indexes_vec = lut->lwe_trivial_indexes_vec;

/// Make sure all data that should be on GPU 0 is indeed there
cuda_synchronize_stream(streams[0], gpu_indexes[0]);

/// With multiple GPUs we push to the vectors on each GPU then when we
/// gather data to GPU 0 we can copy back to the original indexing
multi_gpu_scatter_lwe_async<Torus>(
streams, gpu_indexes, active_gpu_count, lwe_array_in_vec, extracted_lwe,
lut->h_lwe_indexes_in, lut->using_trivial_lwe_indexes, num_lwes,
compression_params.small_lwe_dimension + 1);

/// Apply PBS
execute_pbs_async<Torus>(
streams, gpu_indexes, active_gpu_count, lwe_after_pbs_vec,
lwe_trivial_indexes_vec, lut->lut_vec, lut->lut_indexes_vec,
lwe_array_in_vec, lwe_trivial_indexes_vec, bsks, lut->buffer,
encryption_params.glwe_dimension,
compression_params.small_lwe_dimension,
encryption_params.polynomial_size, encryption_params.pbs_base_log,
encryption_params.pbs_level, encryption_params.grouping_factor,
num_lwes, encryption_params.pbs_type);

/// Copy data back to GPU 0 and release vecs
multi_gpu_gather_lwe_async<Torus>(streams, gpu_indexes, active_gpu_count,
lwe_array_out, lwe_after_pbs_vec,
lut->h_lwe_indexes_out,
lut->using_trivial_lwe_indexes, num_lwes,
encryption_params.big_lwe_dimension + 1);

/// Synchronize all GPUs
for (uint i = 0; i < active_gpu_count; i++) {
cuda_synchronize_stream(streams[i], gpu_indexes[i]);
}
}
}

template <typename Torus>
Expand All @@ -227,12 +266,12 @@ __host__ void scratch_cuda_compress_integer_radix_ciphertext_64(
template <typename Torus>
__host__ void scratch_cuda_integer_decompress_radix_ciphertext_64(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
int_decompression<Torus> **mem_ptr, uint32_t num_lwes,
int_decompression<Torus> **mem_ptr, uint32_t num_lwes, uint32_t body_count,
int_radix_params encryption_params, int_radix_params compression_params,
uint32_t storage_log_modulus, bool allocate_gpu_memory) {

*mem_ptr = new int_decompression<Torus>(
streams, gpu_indexes, gpu_count, encryption_params, compression_params,
num_lwes, storage_log_modulus, allocate_gpu_memory);
num_lwes, body_count, storage_log_modulus, allocate_gpu_memory);
}
#endif
1 change: 1 addition & 0 deletions backends/tfhe-cuda-backend/src/cuda_bind.rs
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,7 @@ extern "C" {
carry_modulus: u32,
pbs_type: u32,
storage_log_modulus: u32,
bodies_count: u32,
allocate_gpu_memory: bool,
);

Expand Down
103 changes: 44 additions & 59 deletions tfhe/src/integer/gpu/ciphertext/compressed_ciphertext_list.rs
Original file line number Diff line number Diff line change
Expand Up @@ -138,79 +138,64 @@ mod tests {
use super::*;
use crate::integer::gpu::gen_keys_radix_gpu;
use crate::integer::ClientKey;
use crate::shortint::parameters::list_compression::COMP_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64;
use crate::shortint::parameters::PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64;
use crate::shortint::parameters::list_compression::COMP_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M64;
use crate::shortint::parameters::PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M64;

const NB_TESTS: usize = 10;
#[test]
fn test_gpu_ciphertext_compression() {
let cks = ClientKey::new(PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64);
let cks = ClientKey::new(PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M64);

let private_compression_key =
cks.new_compression_private_key(COMP_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64);
cks.new_compression_private_key(COMP_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M64);

let streams = CudaStreams::new_multi_gpu();

let num_blocks = 4;
let num_blocks = 32;
let (radix_cks, _) = gen_keys_radix_gpu(
PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64,
PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M64,
num_blocks,
&streams,
);

let (cuda_compression_key, cuda_decompression_key) =
radix_cks.new_cuda_compression_decompression_keys(&private_compression_key, &streams);

let ct1 = radix_cks.encrypt(3_u32);
let ct2 = radix_cks.encrypt(2_u32);
let ct3 = radix_cks.encrypt_signed(-2);
let ct4 = cks.encrypt_bool(true);

// Copy to GPU
let d_ct1 = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct1, &streams);
let d_ct2 = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct2, &streams);
let d_ct3 = CudaSignedRadixCiphertext::from_signed_radix_ciphertext(&ct3, &streams);
let d_ct4 = CudaBooleanBlock::from_boolean_block(&ct4, &streams);

let cuda_compressed = CudaCompressedCiphertextListBuilder::new()
.push(d_ct1, &streams)
.push(d_ct2, &streams)
.push(d_ct3, &streams)
.push(d_ct4, &streams)
.build(&cuda_compression_key, &streams);

let d_decompressed1 = CudaUnsignedRadixCiphertext {
ciphertext: cuda_compressed.get(0, &cuda_decompression_key, &streams),
};

let decompressed1 = d_decompressed1.to_radix_ciphertext(&streams);
let decrypted: u32 = radix_cks.decrypt(&decompressed1);

assert_eq!(decrypted, 3_u32);
let d_decompressed2 = CudaUnsignedRadixCiphertext {
ciphertext: cuda_compressed.get(1, &cuda_decompression_key, &streams),
};

let decompressed2 = d_decompressed2.to_radix_ciphertext(&streams);
let decrypted: u32 = radix_cks.decrypt(&decompressed2);

assert_eq!(decrypted, 2_u32);
let d_decompressed3 = CudaSignedRadixCiphertext {
ciphertext: cuda_compressed.get(2, &cuda_decompression_key, &streams),
};

let decompressed3 = d_decompressed3.to_signed_radix_ciphertext(&streams);
let decrypted: i32 = radix_cks.decrypt_signed(&decompressed3);

assert_eq!(decrypted, -2);
let d_decompressed4 = CudaBooleanBlock::from_cuda_radix_ciphertext(cuda_compressed.get(
3,
&cuda_decompression_key,
&streams,
));

let decompressed4 = d_decompressed4.to_boolean_block(&streams);
let decrypted = radix_cks.decrypt_bool(&decompressed4);

assert!(decrypted);
for _ in 0..NB_TESTS {
let ct1 = radix_cks.encrypt(3_u32);
let ct2 = radix_cks.encrypt_signed(-2);
let ct3 = radix_cks.encrypt_bool(true);

// Copy to GPU
let d_ct1 = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct1, &streams);
let d_ct2 = CudaSignedRadixCiphertext::from_signed_radix_ciphertext(&ct2, &streams);
let d_ct3 = CudaBooleanBlock::from_boolean_block(&ct3, &streams);

let cuda_compressed = CudaCompressedCiphertextListBuilder::new()
.push(d_ct1, &streams)
.push(d_ct2, &streams)
.push(d_ct3, &streams)
.build(&cuda_compression_key, &streams);

let d_decompressed1 = CudaUnsignedRadixCiphertext {
ciphertext: cuda_compressed.get(0, &cuda_decompression_key, &streams),
};
let decompressed1 = d_decompressed1.to_radix_ciphertext(&streams);
let decrypted: u32 = radix_cks.decrypt(&decompressed1);
assert_eq!(decrypted, 3_u32);

let d_decompressed2 = CudaSignedRadixCiphertext {
ciphertext: cuda_compressed.get(1, &cuda_decompression_key, &streams),
};
let decompressed2 = d_decompressed2.to_signed_radix_ciphertext(&streams);
let decrypted: i32 = radix_cks.decrypt_signed(&decompressed2);
assert_eq!(decrypted, -2);

let d_decompressed3 = CudaBooleanBlock::from_cuda_radix_ciphertext(
cuda_compressed.get(2, &cuda_decompression_key, &streams),
);
let decompressed3 = d_decompressed3.to_boolean_block(&streams);
let decrypted = radix_cks.decrypt_bool(&decompressed3);
assert!(decrypted);
}
}
}
Loading

0 comments on commit 2a4026c

Please sign in to comment.