From 1e2d237c90af6dda2ae78c2c270d7d76fa19d90d Mon Sep 17 00:00:00 2001 From: tmontaigu Date: Mon, 2 Sep 2024 16:07:31 +0200 Subject: [PATCH] feat(integer): add count_ones/zeros The non-naive version made for 2_2 parameters only brings a slight improvement (10-15%) for some small sizes (64, 128, 256 bits), but it reduces the number of PBS. The place where it brings the best improvement is for very large numbers (e.g. 6400 blocks: 1.8s for naive, 1.1s for non-naive) --- tfhe/benches/integer/bench.rs | 4 + tfhe/benches/integer/signed_bench.rs | 4 + .../high_level_api/integers/signed/base.rs | 74 +++ .../high_level_api/integers/unsigned/base.rs | 74 +++ tfhe/src/integer/server_key/mod.rs | 20 +- .../radix_parallel/count_zeros_ones.rs | 543 ++++++++++++++++++ .../integer/server_key/radix_parallel/mod.rs | 1 + .../radix_parallel/tests_signed/mod.rs | 1 + .../tests_signed/test_count_zeros_ones.rs | 179 ++++++ .../radix_parallel/tests_unsigned/mod.rs | 1 + .../tests_unsigned/test_count_zeros_ones.rs | 153 +++++ 11 files changed, 1047 insertions(+), 7 deletions(-) create mode 100644 tfhe/src/integer/server_key/radix_parallel/count_zeros_ones.rs create mode 100644 tfhe/src/integer/server_key/radix_parallel/tests_signed/test_count_zeros_ones.rs create mode 100644 tfhe/src/integer/server_key/radix_parallel/tests_unsigned/test_count_zeros_ones.rs diff --git a/tfhe/benches/integer/bench.rs b/tfhe/benches/integer/bench.rs index 6b8b7c1799..11c0b387b2 100644 --- a/tfhe/benches/integer/bench.rs +++ b/tfhe/benches/integer/bench.rs @@ -1011,6 +1011,8 @@ define_server_key_bench_unary_default_fn!(method_name: leading_ones_parallelized define_server_key_bench_unary_default_fn!(method_name: trailing_zeros_parallelized, display_name: trailing_zeros); define_server_key_bench_unary_default_fn!(method_name: trailing_ones_parallelized, display_name: trailing_ones); define_server_key_bench_unary_default_fn!(method_name: ilog2_parallelized, display_name: ilog2); 
+define_server_key_bench_unary_default_fn!(method_name: count_ones_parallelized, display_name: count_ones); +define_server_key_bench_unary_default_fn!(method_name: count_zeros_parallelized, display_name: count_zeros); define_server_key_bench_unary_default_fn!(method_name: checked_ilog2_parallelized, display_name: checked_ilog2); define_server_key_bench_unary_default_fn!(method_name: unchecked_abs_parallelized, display_name: abs); @@ -2227,6 +2229,8 @@ criterion_group!( trailing_ones_parallelized, ilog2_parallelized, checked_ilog2_parallelized, + count_zeros_parallelized, + count_ones_parallelized, ); criterion_group!( diff --git a/tfhe/benches/integer/signed_bench.rs b/tfhe/benches/integer/signed_bench.rs index 1967b3a753..96f52d176c 100644 --- a/tfhe/benches/integer/signed_bench.rs +++ b/tfhe/benches/integer/signed_bench.rs @@ -286,6 +286,8 @@ define_server_key_bench_unary_signed_clean_input_fn!(method_name: leading_ones_p define_server_key_bench_unary_signed_clean_input_fn!(method_name: trailing_zeros_parallelized, display_name: trailing_zeros); define_server_key_bench_unary_signed_clean_input_fn!(method_name: trailing_ones_parallelized, display_name: trailing_ones); define_server_key_bench_unary_signed_clean_input_fn!(method_name: ilog2_parallelized, display_name: ilog2); +define_server_key_bench_unary_signed_clean_input_fn!(method_name: count_zeros_parallelized, display_name: count_zeros); +define_server_key_bench_unary_signed_clean_input_fn!(method_name: count_ones_parallelized, display_name: count_ones); define_server_key_bench_unary_signed_clean_input_fn!(method_name: checked_ilog2_parallelized, display_name: checked_ilog2); define_server_key_bench_binary_signed_clean_inputs_fn!( @@ -448,6 +450,8 @@ criterion_group!( trailing_ones_parallelized, ilog2_parallelized, checked_ilog2_parallelized, + count_ones_parallelized, + count_zeros_parallelized, ); criterion_group!( diff --git a/tfhe/src/high_level_api/integers/signed/base.rs 
b/tfhe/src/high_level_api/integers/signed/base.rs index 4d735898d7..de6e492f42 100644 --- a/tfhe/src/high_level_api/integers/signed/base.rs +++ b/tfhe/src/high_level_api/integers/signed/base.rs @@ -374,6 +374,80 @@ where }) } + /// Returns the number of ones in the binary representation of self. + /// + /// # Example + /// + /// ```rust + /// use tfhe::prelude::*; + /// use tfhe::{generate_keys, set_server_key, ConfigBuilder, FheBool, FheInt16}; + /// + /// let (client_key, server_key) = generate_keys(ConfigBuilder::default()); + /// set_server_key(server_key); + /// + /// let clear_a = 0b0000000_0110111i16; + /// let a = FheInt16::encrypt(clear_a, &client_key); + /// + /// let result = a.count_ones(); + /// let decrypted: u32 = result.decrypt(&client_key); + /// assert_eq!(decrypted, clear_a.count_ones()); + /// ``` + pub fn count_ones(&self) -> crate::FheUint32 { + global_state::with_internal_keys(|key| match key { + InternalServerKey::Cpu(cpu_key) => { + let result = cpu_key + .pbs_key() + .count_ones_parallelized(&*self.ciphertext.on_cpu()); + let result = cpu_key.pbs_key().cast_to_unsigned( + result, + crate::FheUint32Id::num_blocks(cpu_key.pbs_key().message_modulus()), + ); + crate::FheUint32::new(result) + } + #[cfg(feature = "gpu")] + InternalServerKey::Cuda(_) => { + panic!("Cuda devices do not support count_ones yet"); + } + }) + } + + /// Returns the number of zeros in the binary representation of self. 
+ /// + /// # Example + /// + /// ```rust + /// use tfhe::prelude::*; + /// use tfhe::{generate_keys, set_server_key, ConfigBuilder, FheBool, FheInt16}; + /// + /// let (client_key, server_key) = generate_keys(ConfigBuilder::default()); + /// set_server_key(server_key); + /// + /// let clear_a = 0b0000000_0110111i16; + /// let a = FheInt16::encrypt(clear_a, &client_key); + /// + /// let result = a.count_zeros(); + /// let decrypted: u32 = result.decrypt(&client_key); + /// assert_eq!(decrypted, clear_a.count_zeros()); + /// ``` + pub fn count_zeros(&self) -> crate::FheUint32 { + global_state::with_internal_keys(|key| match key { + InternalServerKey::Cpu(cpu_key) => { + let result = cpu_key + .pbs_key() + .count_zeros_parallelized(&*self.ciphertext.on_cpu()); + let result = cpu_key.pbs_key().cast_to_unsigned( + result, + crate::FheUint32Id::num_blocks(cpu_key.pbs_key().message_modulus()), + ); + crate::FheUint32::new(result) + } + #[cfg(feature = "gpu")] + InternalServerKey::Cuda(_) => { + panic!("Cuda devices do not support count_zeros yet"); + } + }) + } + /// Returns the base 2 logarithm of the number, rounded down. /// /// Result has no meaning if self encrypts a value <= 0. See [Self::checked_ilog2] diff --git a/tfhe/src/high_level_api/integers/unsigned/base.rs b/tfhe/src/high_level_api/integers/unsigned/base.rs index 69bc5a1b6f..6f8c5e730e 100644 --- a/tfhe/src/high_level_api/integers/unsigned/base.rs +++ b/tfhe/src/high_level_api/integers/unsigned/base.rs @@ -476,6 +476,80 @@ where }) } + /// Returns the number of ones in the binary representation of self. 
+ /// + /// # Example + /// + /// ```rust + /// use tfhe::prelude::*; + /// use tfhe::{generate_keys, set_server_key, ConfigBuilder, FheBool, FheUint16}; + /// + /// let (client_key, server_key) = generate_keys(ConfigBuilder::default()); + /// set_server_key(server_key); + /// + /// let clear_a = 0b0000000_0110111u16; + /// let a = FheUint16::encrypt(clear_a, &client_key); + /// + /// let result = a.count_ones(); + /// let decrypted: u32 = result.decrypt(&client_key); + /// assert_eq!(decrypted, clear_a.count_ones()); + /// ``` + pub fn count_ones(&self) -> super::FheUint32 { + global_state::with_internal_keys(|key| match key { + InternalServerKey::Cpu(cpu_key) => { + let result = cpu_key + .pbs_key() + .count_ones_parallelized(&*self.ciphertext.on_cpu()); + let result = cpu_key.pbs_key().cast_to_unsigned( + result, + super::FheUint32Id::num_blocks(cpu_key.pbs_key().message_modulus()), + ); + super::FheUint32::new(result) + } + #[cfg(feature = "gpu")] + InternalServerKey::Cuda(_) => { + panic!("Cuda devices do not support count_ones yet"); + } + }) + } + + /// Returns the number of zeros in the binary representation of self. 
+ /// + /// # Example + /// + /// ```rust + /// use tfhe::prelude::*; + /// use tfhe::{generate_keys, set_server_key, ConfigBuilder, FheBool, FheUint16}; + /// + /// let (client_key, server_key) = generate_keys(ConfigBuilder::default()); + /// set_server_key(server_key); + /// + /// let clear_a = 0b0000000_0110111u16; + /// let a = FheUint16::encrypt(clear_a, &client_key); + /// + /// let result = a.count_zeros(); + /// let decrypted: u32 = result.decrypt(&client_key); + /// assert_eq!(decrypted, clear_a.count_zeros()); + /// ``` + pub fn count_zeros(&self) -> super::FheUint32 { + global_state::with_internal_keys(|key| match key { + InternalServerKey::Cpu(cpu_key) => { + let result = cpu_key + .pbs_key() + .count_zeros_parallelized(&*self.ciphertext.on_cpu()); + let result = cpu_key.pbs_key().cast_to_unsigned( + result, + super::FheUint32Id::num_blocks(cpu_key.pbs_key().message_modulus()), + ); + super::FheUint32::new(result) + } + #[cfg(feature = "gpu")] + InternalServerKey::Cuda(_) => { + panic!("Cuda devices do not support count_zeros yet"); + } + }) + } + /// Returns the base 2 logarithm of the number, rounded down. /// /// Result has no meaning if self encrypts 0. 
See [Self::checked_ilog2] diff --git a/tfhe/src/integer/server_key/mod.rs b/tfhe/src/integer/server_key/mod.rs index bfe5d8fdd4..f7eceef0e6 100644 --- a/tfhe/src/integer/server_key/mod.rs +++ b/tfhe/src/integer/server_key/mod.rs @@ -33,7 +33,7 @@ use super::backward_compatibility::server_key::{CompressedServerKeyVersions, Ser #[derive(Serialize, Deserialize, Clone, Versionize)] #[versionize(ServerKeyVersions)] pub struct ServerKey { - pub(crate) key: crate::shortint::ServerKey, + pub key: crate::shortint::ServerKey, } impl From for crate::shortint::ServerKey { @@ -216,19 +216,25 @@ impl ServerKey { self.key.carry_modulus } - /// Returns how many blocks a radix ciphertext should have to - /// be able to represent the given unsigned integer - pub fn num_blocks_to_represent_unsigned_value(&self, clear: Clear) -> usize + pub fn num_bits_to_represent_unsigned_value(&self, clear: Clear) -> usize where Clear: UnsignedInteger, { - let num_bits_in_message = self.message_modulus().0.ilog2(); - let num_bits_to_represent_output_value = if clear == Clear::MAX { + if clear == Clear::MAX { Clear::BITS } else { (clear + Clear::ONE).ceil_ilog2() as usize - }; + } + } + /// Returns how many blocks a radix ciphertext should have to + /// be able to represent the given unsigned integer + pub fn num_blocks_to_represent_unsigned_value(&self, clear: Clear) -> usize + where + Clear: UnsignedInteger, + { + let num_bits_to_represent_output_value = self.num_bits_to_represent_unsigned_value(clear); + let num_bits_in_message = self.message_modulus().0.ilog2(); num_bits_to_represent_output_value.div_ceil(num_bits_in_message as usize) } } diff --git a/tfhe/src/integer/server_key/radix_parallel/count_zeros_ones.rs b/tfhe/src/integer/server_key/radix_parallel/count_zeros_ones.rs new file mode 100644 index 0000000000..50e9c78bb4 --- /dev/null +++ b/tfhe/src/integer/server_key/radix_parallel/count_zeros_ones.rs @@ -0,0 +1,543 @@ +use super::ServerKey; +use crate::integer::{IntegerRadixCiphertext, 
RadixCiphertext, SignedRadixCiphertext}; + +use rayon::prelude::*; + +#[derive(Copy, Clone, PartialEq, Eq)] +enum BitCountKind { + Zero, + One, +} + +impl BitCountKind { + fn is_ok(self, bit_value: u64) -> u64 { + match self { + Self::Zero => u64::from(bit_value == 0), + Self::One => u64::from(bit_value == 1), + } + } +} + +impl ServerKey { + /// Returns the number of ones in the binary representation of `ct` + /// + /// * ct must not have any carries + /// * The returned result has enough blocks to encrypt 32bits (e.g. 1_1 parameters -> 32 blocks, + /// 3_3 parameters -> 11 blocks == 33 bits) + pub fn unchecked_count_ones_parallelized(&self, ct: &T) -> RadixCiphertext + where + T: IntegerRadixCiphertext, + { + self.unchecked_count_bits_parallelized(ct, BitCountKind::One) + } + + /// Returns the number of zeros in the binary representation of `ct` + /// + /// * ct must not have any carries + /// * The returned result has enough blocks to encrypt 32bits (e.g. 1_1 parameters -> 32 blocks, + /// 3_3 parameters -> 11 blocks == 33 bits) + pub fn unchecked_count_zeros_parallelized(&self, ct: &T) -> RadixCiphertext + where + T: IntegerRadixCiphertext, + { + self.unchecked_count_bits_parallelized(ct, BitCountKind::Zero) + } + + fn unchecked_count_bits_parallelized(&self, ct: &T, kind: BitCountKind) -> RadixCiphertext + where + T: IntegerRadixCiphertext, + { + if self.message_modulus().0 == 4 && self.carry_modulus().0 >= 4 { + self.count_bits_2_2(ct, kind) + } else { + self.count_bits_naive(ct, kind) + } + } + + /// Returns the number of ones in the binary representation of `ct` + /// + /// * The returned result has enough blocks to encrypt 32bits (e.g. 
1_1 parameters -> 32 blocks, + /// 3_3 parameters -> 11 blocks == 33 bits) + pub fn smart_count_ones_parallelized(&self, ct: &mut T) -> RadixCiphertext + where + T: IntegerRadixCiphertext, + { + self.smart_count_bits_parallelized(ct, BitCountKind::One) + } + + /// Returns the number of zeros in the binary representation of `ct` + /// + /// * The returned result has enough blocks to encrypt 32bits (e.g. 1_1 parameters -> 32 blocks, + /// 3_3 parameters -> 11 blocks == 33 bits) + pub fn smart_count_zeros_parallelized(&self, ct: &mut T) -> RadixCiphertext + where + T: IntegerRadixCiphertext, + { + self.smart_count_bits_parallelized(ct, BitCountKind::Zero) + } + + fn smart_count_bits_parallelized(&self, ct: &mut T, kind: BitCountKind) -> RadixCiphertext + where + T: IntegerRadixCiphertext, + { + if !ct.block_carries_are_empty() { + self.full_propagate_parallelized(ct); + }; + + self.unchecked_count_bits_parallelized(ct, kind) + } + + /// Returns the number of ones in the binary representation of `ct` + /// + /// * The returned result has enough blocks to encrypt 32bits (e.g. 1_1 parameters -> 32 blocks, + /// 3_3 parameters -> 11 blocks == 33 bits) + pub fn count_ones_parallelized(&self, ct: &T) -> RadixCiphertext + where + T: IntegerRadixCiphertext, + { + self.count_bits_parallelized(ct, BitCountKind::One) + } + + /// Returns the number of zeros in the binary representation of `ct` + /// + /// * The returned result has enough blocks to encrypt 32bits (e.g. 
1_1 parameters -> 32 blocks, + /// 3_3 parameters -> 11 blocks == 33 bits) + pub fn count_zeros_parallelized(&self, ct: &T) -> RadixCiphertext + where + T: IntegerRadixCiphertext, + { + self.count_bits_parallelized(ct, BitCountKind::Zero) + } + + fn count_bits_parallelized(&self, ct: &T, kind: BitCountKind) -> RadixCiphertext + where + T: IntegerRadixCiphertext, + { + let mut tmp_ct; + let ct = if ct.block_carries_are_empty() { + ct + } else { + tmp_ct = ct.clone(); + self.full_propagate_parallelized(&mut tmp_ct); + &tmp_ct + }; + + self.unchecked_count_bits_parallelized(ct, kind) + } + + /// 'Naive' implementation of count zeros/ones + /// + /// * It will work for all parameters + /// * ct must not have any carries + /// * The returned result has enough blocks to encrypt 32bits (e.g. 1_1 parameters -> 32 blocks, + /// 3_3 parameters -> 11 blocks == 33 bits) + fn count_bits_naive(&self, ct: &T, count_kind: BitCountKind) -> RadixCiphertext + where + T: IntegerRadixCiphertext, + { + let min_num_blocks_to_have_32_bits = 32u32.div_ceil(self.message_modulus().0.ilog2()); + if ct.blocks().is_empty() { + return self.create_trivial_zero_radix(min_num_blocks_to_have_32_bits as usize); + } + let num_bits_in_block = self.message_modulus().0.ilog2(); + + let lut_count_bits = self.key.generate_lookup_table(|x| { + let mut count = 0u64; + for i in 0..(num_bits_in_block * 2) { + count += (x >> i) & 1; + } + count + }); + + // We can pack the block if the carry space allow it, but more importantly, + // if the number of bits in 2 blocks does not exceed the message modulus + // e.g. 
1_1 -> 1 bit in one block -> 2 blocks = 2 bits -> 2 >= 2**1 (2) -> can't pack + // 3_3 -> 3 bits in one block -> 2 blocks = 6 bits -> 6 < 2**3 (8) -> can pack + let can_pack = self.carry_modulus().0 >= self.message_modulus().0 + && (num_bits_in_block * 2) < (self.message_modulus().0 as u32); + let pre_count = if can_pack { + ct.blocks() + .par_chunks(2) + .map(|chunk_of_two| { + let mut packed = self.pack_block_chunk(chunk_of_two); + self.key + .apply_lookup_table_assign(&mut packed, &lut_count_bits); + RadixCiphertext::from(vec![packed]) + }) + .collect::>() + } else if num_bits_in_block > 1 { + // This is a bit suboptimal for 2_2, but there is a specialized algorithm for that + ct.blocks() + .par_iter() + .map(|block| { + let mut block = self.key.apply_lookup_table(block, &lut_count_bits); + // We used a LUT that spans 2*num_bits_in_block, however there was only one + // block, so the estimated degree is not correct, we set it, otherwise + // a spurious full propagation would happen later + block.degree = + crate::shortint::ciphertext::Degree::new(num_bits_in_block as usize); + RadixCiphertext::from(vec![block]) + }) + .collect::>() + } else { + // For 1_1, no need to do a PBS to count bits + ct.blocks() + .iter() + .cloned() + .map(|block| RadixCiphertext::from(vec![block])) + .collect::>() + }; + + let max_possible_bit_count = num_bits_in_block + .checked_mul(ct.blocks().len() as u32) + .expect("Number of bits exceed u32::MAX"); + let num_unsigned_blocks = + self.num_blocks_to_represent_unsigned_value(max_possible_bit_count); + if count_kind == BitCountKind::One { + let things_to_sum = pre_count + .into_iter() + .map(|ct| self.cast_to_unsigned(ct, num_unsigned_blocks)) + .collect::>(); + + let result = self + .unchecked_sum_ciphertexts_vec_parallelized(things_to_sum) + .unwrap_or_else(|| { + self.create_trivial_zero_radix(min_num_blocks_to_have_32_bits as usize) + }); + + self.cast_to_unsigned(result, min_num_blocks_to_have_32_bits as usize) + } else { + // This 
is like the ilog2 idea + // + // num_zeros = num_bits - num_ones + // num_zeros = -(-(num_bits - num_ones)) + // -num_zeros = -(num_bits - num_ones) + // -num_zeros = -num_bits + num_ones + // + // doing `-num_bits` is easy + // + // We could technically have done a LUT that counted zeros instead of ones in the + // step above. + // But in the case of 1_X parameters, counting ones does not require to have + // a LUT done on each block to count the number of ones, and to avoid having to do a + // LUT to count zeros we prefer to change a bit the sum + let num_bits_needed = + self.num_bits_to_represent_unsigned_value(max_possible_bit_count) + 1; + let num_signed_blocks = num_bits_needed.div_ceil(num_bits_in_block as usize); + assert!(num_signed_blocks >= num_unsigned_blocks); + + let mut things_to_sum = pre_count + .into_iter() + .map(|ct| self.cast_to_signed(ct, num_signed_blocks)) + .collect::>(); + + things_to_sum.push( + self.create_trivial_radix(-i64::from(max_possible_bit_count), num_signed_blocks), + ); + let result = self + .unchecked_partial_sum_ciphertexts_vec_parallelized(things_to_sum) + .expect("internal error, empty ciphertext count"); + let (message_blocks, carry_blocks) = rayon::join( + || { + let lut = self.key.generate_lookup_table(|x| { + // extract message + let x = x % self.key.message_modulus.0 as u64; + // bitnot the message + (!x) % self.key.message_modulus.0 as u64 + }); + result + .blocks + .par_iter() + .map(|block| self.key.apply_lookup_table(block, &lut)) + .collect::>() + }, + || { + let lut = self.key.generate_lookup_table(|x| { + // extract carry + let x = x / self.key.message_modulus.0 as u64; + // bitnot the carry + (!x) % self.key.message_modulus.0 as u64 + }); + let mut carry_blocks = Vec::with_capacity(num_unsigned_blocks); + result.blocks[..num_signed_blocks - 1] // last carry is not interesting + .par_iter() + .map(|block| self.key.apply_lookup_table(block, &lut)) + .collect_into_vec(&mut carry_blocks); + // Normally this would 
be 0, but we want the bitnot of 0, which is msg_mod-1 + carry_blocks.insert( + 0, + self.key + .create_trivial((self.message_modulus().0 - 1) as u64), + ); + carry_blocks + }, + ); + let message = SignedRadixCiphertext::from(message_blocks); + let carry = SignedRadixCiphertext::from(carry_blocks); + let result = self + .sum_ciphertexts_parallelized( + [ + message, + carry, + self.create_trivial_radix(2u32, num_signed_blocks), + ] + .iter(), + ) + // Go back to unsigned world because we know the value cannot be negative + // but casting from signed to unsigned may require to look at the sign bit + // which we know is not set + .map(|ct| RadixCiphertext::from(ct.blocks)) + .unwrap(); + + self.cast_to_unsigned(result, min_num_blocks_to_have_32_bits as usize) + } + } + + /// More complex implementation of count zeros/ones meant for 2_2 parameters + /// + /// * It will only work for 2_2 parameters + /// * ct must not have any carries + /// * The returned result has enough blocks to encrypt 32bits (e.g. 1_1 parameters -> 32 blocks, + /// 3_3 parameters -> 11 blocks == 33 bits) + fn count_bits_2_2(&self, ct: &T, count_kind: BitCountKind) -> RadixCiphertext + where + T: IntegerRadixCiphertext, + { + let num_bits_in_block = self.message_modulus().0.ilog2(); + let num_blocks = ct.blocks().len(); + let min_num_blocks_to_have_32_bits = + 32u32.div_ceil(self.message_modulus().0.ilog2()) as usize; + + if num_blocks == 0 { + return self.create_trivial_zero_radix(min_num_blocks_to_have_32_bits); + } + + // In 2_2, each block may have between 0 and 2 bits set. + // 2_2 also allows 5 additions maximum (noise wise) + // 2 * 5 = 10 which is less than the max value storable (15 = (2**4) -1) + // + // Since in 2_2 bivariate PBS is possible, we can actually group blocks by two. 
+ // Each pair of blocks may have between 0 and 4 bits set, meaning we could add 3 of those + // counts to stay <= 15 + // Degree: 4 * 3 == 12 which is <= 15 + // NoiseLevel: 3 + // + // Now, to go further, with 3 blocks, which is 6 bits, we can do 2 bivariate PBS, to split + // the count in two blocks with value in 0..=3 + // [b0,b1] [b2, b3] [b4, b5] + // PBS 1 -> [b0, b1, b2, b3] -> count(b0, b1, b2) + // PBS 2 -> [b2, b3, b4, b5] -> count(b3, b4, b5) + // This also means 2 PBS for 3 blocks, instead of 3 for 3 + // + // Each of these blocks could be added to pairs described above, so the + // degree would become (4 * 3) + 3 = 12 + 3 = 15, + // and the noise level would be 3 + 1 = 4 + + // As described, 3 pairs form a chunk, so we split the input blocks in chunks of + // `3 * 2 = 6` blocks + // + // non_full_chunks are chunks with degree 12, and noise level 3 + // non_chunked are single blocks not belonging to any chunk + let (mut num_non_full_chunks, mut num_non_chunked) = (num_blocks / 6, num_blocks % 6); + let mut num_full_chunks = 0; + + // 'Dispatch' some of the non chunked blocks to complete a chunk + // + // 3 blocks can be used to fill 2 chunks + // We know num_non_chunked < 6, that's why this is an if, not a loop + let mut num_duo_completer_blocks = 0; + if num_non_full_chunks >= 2 && num_non_chunked >= 3 { + num_non_chunked -= 3; + num_non_full_chunks -= 2; + num_full_chunks += 2; + num_duo_completer_blocks += 3; + } + + // The rest of non chunked blocks are simply going to complete + // chunk each by adding their block count (that is in range 0..=2) + // such complete chunk will have a degree = (4 * 3) + 2 = 14 + // + // But that's as long as there are chunks to complete + let num_single_completer_blocks = num_non_chunked.min(num_non_full_chunks); + num_non_full_chunks -= num_single_completer_blocks; + num_full_chunks += num_single_completer_blocks; + num_non_chunked -= num_single_completer_blocks; + + // Now we go a bit beyond again + // + // A non-full 
chunk has 3 packed blocks, so 6 ciphertexts + // 3 ciphertexts can be split into 2 blocks, to complete 2 non-full chunks + // so, this means that with 6 blocks we can complete 4 chunks + // + // So, a non-full chunk can be deconstructed to fill 4 other non-full chunks. + // So for every 4 chunks non-full chunk we deconstruct one to fill them + // + // This lightly increases the number of PBS done at this stage, + // but it gets compensated by reducing the number of PBS done at later stages + // and reduces the number of ciphertexts to sum together + // + // This will actually start to happen for rather num_blocks >= 30 (aka 60 bits) + let mut num_to_deconstruct = 0; + let mut n = num_non_full_chunks; + // >= 5 because to complete 4 chunks we need one chunk + while n >= 5 { + num_to_deconstruct += 1; + num_full_chunks += 4; + num_non_full_chunks -= 5; + n -= 5; // 4 chunks are full because we deconstructed one + } + + // We have 3 slices + // * one with blocks to pack and apply the full bit-count on + // * one with blocks to pack in a way that we can then apply bit-count on 3 bits + // * one where we apply a bit count on 2 bits // the rest + + num_duo_completer_blocks += 6 * num_to_deconstruct; + let num_single_blocks = num_non_chunked + num_single_completer_blocks; + let num_regular_blocks = (num_full_chunks + num_non_full_chunks) * 6; + + // Make sure this span the whole input slice + assert_eq!( + num_regular_blocks + num_duo_completer_blocks + num_single_blocks, + num_blocks + ); + + let regular_blocks = &ct.blocks()[..num_regular_blocks]; + let duo_completer_blocks = + &ct.blocks()[num_regular_blocks..num_regular_blocks + num_duo_completer_blocks]; + let single_completer_blocks = &ct.blocks()[num_regular_blocks + num_duo_completer_blocks..]; + // Since we took the rest, make sure it has the len we expect, otherwise + // result won't be correct + assert_eq!(single_completer_blocks.len(), num_single_blocks); + // must be chunk_exact by 3 otherwise there was an 
error earlier + assert_eq!(duo_completer_blocks.len() % 3, 0); + + let lut_count_bits_full_range = self.key.generate_lookup_table(|x| { + let mut count = 0u64; + for i in 0..(num_bits_in_block * 2) { + count += count_kind.is_ok((x >> i) & 1); + } + count + }); + + let lut_count_bits_half_range = self.key.generate_lookup_table(|x| { + let mut count = 0u64; + for i in 0..num_bits_in_block { + count += count_kind.is_ok((x >> i) & 1); + } + count + }); + + let (bit_count_of_packed_blocks, mut bit_counts_of_completer_blocks) = rayon::join( + || { + regular_blocks + .par_chunks_exact(2) + .map(|chunk_of_two| { + let mut packed = self.pack_block_chunk(chunk_of_two); + self.key + .apply_lookup_table_assign(&mut packed, &lut_count_bits_full_range); + packed + }) + .collect::>() + }, + || { + let luts = [ + self.key.generate_lookup_table(|x| { + let mut count = 0u64; + for i in 0..num_bits_in_block + 1 { + count += count_kind.is_ok((x >> i) & 1); + } + count + }), + self.key.generate_lookup_table(|x| { + let mut count = 0u64; + for i in 1..num_bits_in_block + 2 { + count += count_kind.is_ok((x >> i) & 1); + } + count + }), + ]; + + let mut packed_blocks = Vec::new(); + for chunk_of_3 in duo_completer_blocks.chunks_exact(3) { + packed_blocks.push(self.pack_block_chunk(&chunk_of_3[..2])); + packed_blocks.push(self.pack_block_chunk(&chunk_of_3[1..3])); + } + + packed_blocks + .par_iter() + .enumerate() + .map(|(i, packed_block)| { + self.key.apply_lookup_table(packed_block, &luts[(i) % 2]) + }) + .chain(single_completer_blocks.par_iter().map(|block| { + self.key + .apply_lookup_table(block, &lut_count_bits_half_range) + })) + .collect::>() + }, + ); + + let num_sum = ((self.message_modulus().0 * self.carry_modulus().0) - 1) + / (self.message_modulus().0 * self.carry_modulus().0).ilog2() as usize; + + let mut pre_count = bit_count_of_packed_blocks + .chunks_exact(num_sum) + .map(|chunk| { + let mut result = chunk[0].clone(); + for s in &chunk[1..] 
{ + self.key.unchecked_add_assign(&mut result, s); + } + result + }) + .collect::>(); + + // Complete the chunks to maximize degree and minimize sum depths + for (p, c) in pre_count + .iter_mut() + .zip(bit_counts_of_completer_blocks.iter()) + { + self.key.unchecked_add_assign(p, c); + } + + let mut pre_count = pre_count + .par_iter() + .map(|block| { + if block.degree.get() >= self.message_modulus().0 { + let (msg, carry) = rayon::join( + || self.key.message_extract(block), + || self.key.carry_extract(block), + ); + RadixCiphertext::from(vec![msg, carry]) + } else { + let msg = self.key.message_extract(block); + RadixCiphertext::from(vec![msg]) + } + }) + .collect::>(); + + if pre_count.len() < bit_counts_of_completer_blocks.len() { + // Then not all blocks were consumed + // we not forget to add them to the sum list + for b in bit_counts_of_completer_blocks.drain(pre_count.len()..) { + pre_count.push(RadixCiphertext::from(vec![b])); + } + } + + let max_possible_bit_count = num_bits_in_block + .checked_mul(ct.blocks().len() as u32) + .expect("Number of bits exceed u32::MAX"); + let num_blocks = self.num_blocks_to_represent_unsigned_value(max_possible_bit_count); + + let things_to_sum = pre_count + .into_iter() + .map(|ct| self.cast_to_unsigned(ct, num_blocks)) + .collect::>(); + + let result = self + .unchecked_sum_ciphertexts_vec_parallelized(things_to_sum) + .unwrap_or_else(|| self.create_trivial_zero_radix(min_num_blocks_to_have_32_bits)); + + self.cast_to_unsigned(result, min_num_blocks_to_have_32_bits) + } +} diff --git a/tfhe/src/integer/server_key/radix_parallel/mod.rs b/tfhe/src/integer/server_key/radix_parallel/mod.rs index 4739c9e88f..8f1fa16ddc 100644 --- a/tfhe/src/integer/server_key/radix_parallel/mod.rs +++ b/tfhe/src/integer/server_key/radix_parallel/mod.rs @@ -21,6 +21,7 @@ mod shift; pub(crate) mod sub; mod sum; +mod count_zeros_ones; pub(crate) mod ilog2; mod reverse_bits; mod slice; diff --git 
a/tfhe/src/integer/server_key/radix_parallel/tests_signed/mod.rs b/tfhe/src/integer/server_key/radix_parallel/tests_signed/mod.rs index 16e356ba5b..08921472b2 100644 --- a/tfhe/src/integer/server_key/radix_parallel/tests_signed/mod.rs +++ b/tfhe/src/integer/server_key/radix_parallel/tests_signed/mod.rs @@ -3,6 +3,7 @@ pub(crate) mod test_add; pub(crate) mod test_bitwise_op; pub(crate) mod test_cmux; pub(crate) mod test_comparison; +mod test_count_zeros_ones; pub(crate) mod test_ilog2; pub(crate) mod test_mul; pub(crate) mod test_neg; diff --git a/tfhe/src/integer/server_key/radix_parallel/tests_signed/test_count_zeros_ones.rs b/tfhe/src/integer/server_key/radix_parallel/tests_signed/test_count_zeros_ones.rs new file mode 100644 index 0000000000..b7c8a153e6 --- /dev/null +++ b/tfhe/src/integer/server_key/radix_parallel/tests_signed/test_count_zeros_ones.rs @@ -0,0 +1,179 @@ +use crate::integer::keycache::KEY_CACHE; +use crate::integer::server_key::radix_parallel::tests_cases_unsigned::FunctionExecutor; +use crate::integer::server_key::radix_parallel::tests_unsigned::{ + nb_tests_smaller_for_params, CpuFunctionExecutor, MAX_NB_CTXT, NB_CTXT, +}; +use crate::integer::tests::create_parametrized_test; +use crate::integer::{ + IntegerKeyKind, RadixCiphertext, RadixClientKey, ServerKey, SignedRadixCiphertext, +}; +use crate::shortint::parameters::*; +use crate::shortint::PBSParameters; +use rand::Rng; +use std::sync::Arc; + +create_parametrized_test!(integer_extensive_trivial_signed_default_count_zeros_ones); +create_parametrized_test!(integer_signed_default_count_zeros_ones); + +fn integer_extensive_trivial_signed_default_count_zeros_ones

(param: P) +where + P: Into, +{ + let count_zeros_executor = CpuFunctionExecutor::new(&ServerKey::count_zeros_parallelized); + let count_ones_executor = CpuFunctionExecutor::new(&ServerKey::count_ones_parallelized); + extensive_trivial_signed_default_count_zeros_ones_test( + param, + count_zeros_executor, + count_ones_executor, + ); +} + +fn integer_signed_default_count_zeros_ones

(param: P) +where + P: Into, +{ + let count_zeros_executor = CpuFunctionExecutor::new(&ServerKey::count_zeros_parallelized); + let count_ones_executor = CpuFunctionExecutor::new(&ServerKey::count_ones_parallelized); + signed_default_count_zeros_ones_test(param, count_zeros_executor, count_ones_executor); +} + +pub(crate) fn signed_default_count_zeros_ones_test( + param: P, + mut count_zeros_executor: E1, + mut count_ones_executor: E2, +) where + P: Into, + E1: for<'a> FunctionExecutor<&'a SignedRadixCiphertext, RadixCiphertext>, + E2: for<'a> FunctionExecutor<&'a SignedRadixCiphertext, RadixCiphertext>, +{ + let param = param.into(); + let nb_tests = nb_tests_smaller_for_params(param); + let (cks, mut sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix); + let cks = RadixClientKey::from((cks, NB_CTXT)); + + sks.set_deterministic_pbs_execution(true); + let sks = Arc::new(sks); + + let mut rng = rand::thread_rng(); + + count_zeros_executor.setup(&cks, sks.clone()); + count_ones_executor.setup(&cks, sks); + + let cks: crate::integer::ClientKey = cks.into(); + + for num_blocks in 1..=MAX_NB_CTXT { + let Some(modulus) = + (cks.parameters().message_modulus().0 as u128).checked_pow(num_blocks as u32) + else { + break; + }; + if modulus > i128::MAX as u128 || modulus <= 2 { + break; + } + let half_modulus = modulus / 2; + + for _ in 0..nb_tests { + let clear_a = rng.gen_range(-(half_modulus as i128)..half_modulus as i128); + + // Set all bits above the modulus to 0, so the count_ones does not count them + // mask looks like `000000000000001111111` + // ^ modulus.ilog2() + // This has to be done for signed numbers because if clear_a < 0 + // then bits above 2**modulus are all `1`, thus the clear_a.count_one() is not correct + let mask = (half_modulus as i128 * 2) - 1; + let clear_a = mask & clear_a; + + let a: SignedRadixCiphertext = cks.encrypt_signed_radix(clear_a, num_blocks); + + let encrypted = count_ones_executor.execute(&a); + let decrypted: u32 = 
cks.decrypt_radix(&encrypted); + assert_eq!( + decrypted, + clear_a.count_ones(), + "Invalid count_ones for input {clear_a}" + ); + + // Set all bits above the modulus to 1, so the count_zeros does not count them + // mask looks like `111111111111110000000` + // ^ modulus.ilog2() + let mask = -1i128.wrapping_mul(modulus as i128); + let clear_a = mask | clear_a; + let encrypted = count_zeros_executor.execute(&a); + let decrypted: u32 = cks.decrypt_radix(&encrypted); + assert_eq!( + decrypted, + clear_a.count_zeros(), + "Invalid count_zeros for input {clear_a}" + ); + } + } +} + +pub(crate) fn extensive_trivial_signed_default_count_zeros_ones_test( + param: P, + mut count_zeros_executor: E1, + mut count_ones_executor: E2, +) where + P: Into, + E1: for<'a> FunctionExecutor<&'a SignedRadixCiphertext, RadixCiphertext>, + E2: for<'a> FunctionExecutor<&'a SignedRadixCiphertext, RadixCiphertext>, +{ + let param = param.into(); + let (cks, mut sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix); + let cks = RadixClientKey::from((cks, NB_CTXT)); + + sks.set_deterministic_pbs_execution(true); + let sks = Arc::new(sks); + + let mut rng = rand::thread_rng(); + + count_zeros_executor.setup(&cks, sks.clone()); + count_ones_executor.setup(&cks, sks.clone()); + + let cks: crate::integer::ClientKey = cks.into(); + + for num_blocks in 1..=64 { + let Some(modulus) = (cks.parameters().message_modulus().0 as u128).checked_pow(num_blocks) + else { + break; + }; + if modulus > i128::MAX as u128 || modulus <= 2 { + break; + } + let half_modulus = modulus / 2; + for _ in 0..50 { + let clear_a = rng.gen_range(-(half_modulus as i128)..half_modulus as i128); + + // Set all bits above the modulus to 0, so the count_ones does not count them + // mask looks like `000000000000001111111` + // ^ modulus.ilog2() + // This has to be done for signed numbers because if clear_a < 0 + // then bits above the modulus are all `1`, thus clear_a.count_ones() is not correct + let mask = 
(half_modulus as i128 * 2) - 1; + let clear_a = mask & clear_a; + + let a: SignedRadixCiphertext = sks.create_trivial_radix(clear_a, num_blocks as usize); + + let encrypted = count_ones_executor.execute(&a); + let decrypted: u32 = cks.decrypt_radix(&encrypted); + assert_eq!( + decrypted, + clear_a.count_ones(), + "Invalid count_ones for input {clear_a}" + ); + + // Set all bits above the modulus to 1, so the count_zeros does not count them + // mask looks like `111111111111110000000` + // ^ modulus.ilog2() + let mask = -1i128.wrapping_mul(modulus as i128); + let clear_a = mask | clear_a; + let encrypted = count_zeros_executor.execute(&a); + let decrypted: u32 = cks.decrypt_radix(&encrypted); + assert_eq!( + decrypted, + clear_a.count_zeros(), + "Invalid count_zeros for input {clear_a}" + ); + } + } +} diff --git a/tfhe/src/integer/server_key/radix_parallel/tests_unsigned/mod.rs b/tfhe/src/integer/server_key/radix_parallel/tests_unsigned/mod.rs index 00886fd9b2..36f8584662 100644 --- a/tfhe/src/integer/server_key/radix_parallel/tests_unsigned/mod.rs +++ b/tfhe/src/integer/server_key/radix_parallel/tests_unsigned/mod.rs @@ -3,6 +3,7 @@ pub(crate) mod test_add; pub(crate) mod test_bitwise_op; pub(crate) mod test_cmux; pub(crate) mod test_comparison; +mod test_count_zeros_ones; pub(crate) mod test_div_mod; pub(crate) mod test_ilog2; pub(crate) mod test_mul; diff --git a/tfhe/src/integer/server_key/radix_parallel/tests_unsigned/test_count_zeros_ones.rs b/tfhe/src/integer/server_key/radix_parallel/tests_unsigned/test_count_zeros_ones.rs new file mode 100644 index 0000000000..b0f9e2a048 --- /dev/null +++ b/tfhe/src/integer/server_key/radix_parallel/tests_unsigned/test_count_zeros_ones.rs @@ -0,0 +1,153 @@ +use crate::integer::keycache::KEY_CACHE; +use crate::integer::server_key::radix_parallel::tests_cases_unsigned::FunctionExecutor; +use crate::integer::server_key::radix_parallel::tests_unsigned::{ + nb_tests_smaller_for_params, CpuFunctionExecutor, MAX_NB_CTXT, NB_CTXT, 
+}; +use crate::integer::tests::create_parametrized_test; +use crate::integer::{IntegerKeyKind, RadixCiphertext, RadixClientKey, ServerKey}; +use crate::shortint::PBSParameters; +use rand::Rng; +use std::sync::Arc; + +use crate::shortint::parameters::*; + +create_parametrized_test!(integer_extensive_trivial_default_count_zeros_ones); +create_parametrized_test!(integer_default_count_zeros_ones); + +fn integer_extensive_trivial_default_count_zeros_ones

(param: P) +where + P: Into, +{ + let count_zeros_executor = CpuFunctionExecutor::new(&ServerKey::count_zeros_parallelized); + let count_ones_executor = CpuFunctionExecutor::new(&ServerKey::count_ones_parallelized); + extensive_trivial_default_count_zeros_ones_test( + param, + count_zeros_executor, + count_ones_executor, + ); +} + +fn integer_default_count_zeros_ones

(param: P) +where + P: Into, +{ + let count_zeros_executor = CpuFunctionExecutor::new(&ServerKey::count_zeros_parallelized); + let count_ones_executor = CpuFunctionExecutor::new(&ServerKey::count_ones_parallelized); + default_count_zeros_ones_test(param, count_zeros_executor, count_ones_executor); +} + +pub(crate) fn default_count_zeros_ones_test( + param: P, + mut count_zeros_executor: E1, + mut count_ones_executor: E2, +) where + P: Into, + E1: for<'a> FunctionExecutor<&'a RadixCiphertext, RadixCiphertext>, + E2: for<'a> FunctionExecutor<&'a RadixCiphertext, RadixCiphertext>, +{ + let param = param.into(); + let nb_tests = nb_tests_smaller_for_params(param); + let (cks, mut sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix); + let cks = RadixClientKey::from((cks, NB_CTXT)); + + sks.set_deterministic_pbs_execution(true); + let sks = Arc::new(sks); + + let mut rng = rand::thread_rng(); + + count_zeros_executor.setup(&cks, sks.clone()); + count_ones_executor.setup(&cks, sks); + + let cks: crate::integer::ClientKey = cks.into(); + + for num_blocks in 1..=MAX_NB_CTXT { + let Some(modulus) = + (cks.parameters().message_modulus().0 as u128).checked_pow(num_blocks as u32) + else { + break; + }; + for _ in 0..nb_tests { + let clear_a = rng.gen::() % modulus; + + let a: RadixCiphertext = cks.encrypt_radix(clear_a, num_blocks); + + let encrypted = count_ones_executor.execute(&a); + let decrypted: u32 = cks.decrypt_radix(&encrypted); + assert_eq!( + decrypted, + clear_a.count_ones(), + "Invalid count_ones for input {clear_a}" + ); + + // Set all bits above the modulus to 1, so the count_zeros does not count them + // mask looks like `111111111111110000000` + // ^ modulus.ilog2() + let mask = u128::MAX.wrapping_mul(modulus); + let clear_a = mask | clear_a; + let encrypted = count_zeros_executor.execute(&a); + let decrypted: u32 = cks.decrypt_radix(&encrypted); + assert_eq!( + decrypted, + clear_a.count_zeros(), + "Invalid count_zeros for input {clear_a}" + ); + } + 
} +} + +pub(crate) fn extensive_trivial_default_count_zeros_ones_test( + param: P, + mut count_zeros_executor: E1, + mut count_ones_executor: E2, +) where + P: Into, + E1: for<'a> FunctionExecutor<&'a RadixCiphertext, RadixCiphertext>, + E2: for<'a> FunctionExecutor<&'a RadixCiphertext, RadixCiphertext>, +{ + let param = param.into(); + let (cks, mut sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix); + let cks = RadixClientKey::from((cks, NB_CTXT)); + + sks.set_deterministic_pbs_execution(true); + let sks = Arc::new(sks); + + let mut rng = rand::thread_rng(); + + count_zeros_executor.setup(&cks, sks.clone()); + count_ones_executor.setup(&cks, sks.clone()); + + let cks: crate::integer::ClientKey = cks.into(); + + for num_blocks in 1..=64 { + let Some(modulus) = (cks.parameters().message_modulus().0 as u128).checked_pow(num_blocks) + else { + break; + }; + for _ in 0..50 { + let clear_a = rng.gen::() % modulus; + + let a: RadixCiphertext = sks.create_trivial_radix(clear_a, num_blocks as usize); + + let encrypted = count_ones_executor.execute(&a); + let decrypted: u32 = cks.decrypt_radix(&encrypted); + assert_eq!( + decrypted, + clear_a.count_ones(), + "Invalid count_ones for input {clear_a}" + ); + + // Set all bits above the modulus to 1, so the count_zeros does not count them + // mask looks like `111111111111110000000` + // ^ modulus.ilog2() + let mask = u128::MAX.wrapping_mul(modulus); + let clear_a = mask | clear_a; + let encrypted = count_zeros_executor.execute(&a); + let decrypted: u32 = cks.decrypt_radix(&encrypted); + assert_eq!( + decrypted, + clear_a.count_zeros(), + "Invalid count_zeros for input {clear_a}" + ); + } + } +}