From 9be1e4b1cd5abe26e1970a15d4b0e115eec44a18 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 5 Jun 2024 11:33:36 -0400 Subject: [PATCH 01/12] First impl trying to iter on local containers --- src/bitmap/mod.rs | 3 +- src/bitmap/ops_with_serialized.rs | 145 ++++++++++++++++++++++++++++++ src/bitmap/serialization.rs | 10 +-- 3 files changed, 152 insertions(+), 6 deletions(-) create mode 100644 src/bitmap/ops_with_serialized.rs diff --git a/src/bitmap/mod.rs b/src/bitmap/mod.rs index 4aebfc37..1d481afa 100644 --- a/src/bitmap/mod.rs +++ b/src/bitmap/mod.rs @@ -12,10 +12,11 @@ mod cmp; mod inherent; mod iter; mod ops; +mod ops_with_serialized; #[cfg(feature = "serde")] mod serde; #[cfg(feature = "std")] -mod serialization; +pub(crate) mod serialization; use self::cmp::Pairs; pub use self::iter::IntoIter; diff --git a/src/bitmap/ops_with_serialized.rs b/src/bitmap/ops_with_serialized.rs new file mode 100644 index 00000000..2b7c367e --- /dev/null +++ b/src/bitmap/ops_with_serialized.rs @@ -0,0 +1,145 @@ +use bytemuck::{cast_slice_mut, pod_collect_to_vec}; +use byteorder::{LittleEndian, ReadBytesExt}; +use core::convert::Infallible; +use core::ops::{BitAndAssign, RangeInclusive}; +use std::error::Error; +use std::io; + +use crate::bitmap::container::Container; +use crate::bitmap::serialization::{ + DESCRIPTION_BYTES, NO_OFFSET_THRESHOLD, OFFSET_BYTES, SERIAL_COOKIE, + SERIAL_COOKIE_NO_RUNCONTAINER, +}; +use crate::RoaringBitmap; + +#[cfg(not(feature = "std"))] +use alloc::vec::Vec; + +use super::container::ARRAY_LIMIT; +use super::store::{ArrayStore, BitmapStore, Store, BITMAP_LENGTH}; + +impl RoaringBitmap { + /// Computes the len of the intersection with the specified other bitmap without creating a + /// new bitmap. + /// + /// This is faster and more space efficient when you're only interested in the cardinality of + /// the intersection. + /// + /// # Examples + /// + /// ```rust + /// use roaring::RoaringBitmap; + /// + /// let rb1: RoaringBitmap = (1..4).collect(); + /// let rb2: RoaringBitmap = (3..5).collect(); + /// + /// + /// assert_eq!(rb1.intersection_len(&rb2), (rb1 & rb2).len()); + /// ``` + // TODO convert this into a trait + pub fn intersection_with_serialized_unchecked(&self, other: R) -> io::Result + where + R: io::Read, + { + RoaringBitmap::intersection_with_serialized_impl::( + self, + other, + |values| Ok(ArrayStore::from_vec_unchecked(values)), + |len, values| Ok(BitmapStore::from_unchecked(len, values)), + ) + } + + fn intersection_with_serialized_impl( + &self, + mut other: R, + a: A, + b: B, + ) -> io::Result + where + R: io::Read, + A: Fn(Vec) -> Result, + AErr: Error + Send + Sync + 'static, + B: Fn(u64, Box<[u64; 1024]>) -> Result, + BErr: Error + Send + Sync + 'static, + { + // First read the cookie to determine which version of the format we are reading + let (size, has_offsets, has_run_containers) = { + let cookie = other.read_u32::()?; + if cookie == SERIAL_COOKIE_NO_RUNCONTAINER { + (other.read_u32::()? as usize, true, false) + } else if (cookie as u16) == SERIAL_COOKIE { + let size = ((cookie >> 16) + 1) as usize; + (size, size >= NO_OFFSET_THRESHOLD, true) + } else { + return Err(io::Error::new(io::ErrorKind::Other, "unknown cookie value")); + } + }; + + // Read the run container bitmap if necessary + let run_container_bitmap = if has_run_containers { + let mut bitmap = vec![0u8; (size + 7) / 8]; + other.read_exact(&mut bitmap)?; + Some(bitmap) + } else { + None + }; + + if size > u16::MAX as usize + 1 { + return Err(io::Error::new(io::ErrorKind::Other, "size is greater than supported")); + } + + // Read the container descriptions + let mut description_bytes = vec![0u8; size * DESCRIPTION_BYTES]; + other.read_exact(&mut description_bytes)?; + let mut description_bytes: Vec<[u16; 2]> = pod_collect_to_vec(&description_bytes); + description_bytes.iter_mut().for_each(|[ref mut k, ref mut c]| { + *k = u16::from_le(*k); + *c = u16::from_le(*c); + }); + + if has_offsets { + let mut offsets = vec![0u8; size * OFFSET_BYTES]; + other.read_exact(&mut offsets)?; + drop(offsets); // Not useful when deserializing into memory + } + + let mut containers = Vec::with_capacity(size); + for container in &self.containers { + let (i, key, cardinality) = + match description_bytes.binary_search_by_key(&container.key, |[k, _]| *k) { + Ok(index) => { + let [key, cardinality] = description_bytes[index]; + (index, key, u64::from(cardinality) + 1) + } + Err(_) => continue, + }; + + // If the run container bitmap is present, check if this container is a run container + let is_run_container = + run_container_bitmap.as_ref().map_or(false, |bm| bm[i / 8] & (1 << (i % 8)) != 0); + + let mut store = if is_run_container { + todo!("support run containers") + } else if cardinality <= ARRAY_LIMIT { + let mut values = vec![0; cardinality as usize]; + other.read_exact(cast_slice_mut(&mut values))?; + values.iter_mut().for_each(|n| *n = u16::from_le(*n)); + let array = a(values).map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?; + Store::Array(array) + } else { + let mut values = Box::new([0; BITMAP_LENGTH]); + other.read_exact(cast_slice_mut(&mut values[..]))?; + values.iter_mut().for_each(|n| *n = u64::from_le(*n)); + let bitmap = b(cardinality, values) + .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?; + Store::Bitmap(bitmap) + }; + + store &= &container.store; + + containers.push(Container { key, store }); + } + + Ok(RoaringBitmap { containers }) + } +} diff --git a/src/bitmap/serialization.rs b/src/bitmap/serialization.rs index 790e2d28..f04f5039 100644 --- a/src/bitmap/serialization.rs +++ b/src/bitmap/serialization.rs @@ -9,13 +9,13 @@ use crate::bitmap::container::{Container, ARRAY_LIMIT}; use crate::bitmap::store::{ArrayStore, BitmapStore, Store, BITMAP_LENGTH}; use crate::RoaringBitmap; -const SERIAL_COOKIE_NO_RUNCONTAINER: u32 = 12346; -const SERIAL_COOKIE: u16 = 12347; -const NO_OFFSET_THRESHOLD: usize = 4; +pub const SERIAL_COOKIE_NO_RUNCONTAINER: u32 = 12346; +pub const SERIAL_COOKIE: u16 = 12347; +pub const NO_OFFSET_THRESHOLD: usize = 4; // Sizes of header structures -const DESCRIPTION_BYTES: usize = 4; -const OFFSET_BYTES: usize = 4; +pub const DESCRIPTION_BYTES: usize = 4; +pub const OFFSET_BYTES: usize = 4; impl RoaringBitmap { /// Return the size in bytes of the serialized output. From 9ba1ca1dad336ea262aa0cd3e44ed044303a33b6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Tue, 4 Jun 2024 16:20:46 -0400 Subject: [PATCH 02/12] Prefer reading the content store by store --- src/bitmap/ops_with_serialized.rs | 75 ++++++++++++++++++++----------- 1 file changed, 48 insertions(+), 27 deletions(-) diff --git a/src/bitmap/ops_with_serialized.rs b/src/bitmap/ops_with_serialized.rs index 2b7c367e..ec9d31d3 100644 --- a/src/bitmap/ops_with_serialized.rs +++ b/src/bitmap/ops_with_serialized.rs @@ -51,7 +51,7 @@ impl RoaringBitmap { fn intersection_with_serialized_impl( &self, - mut other: R, + mut reader: R, a: A, b: B, ) -> io::Result @@ -64,9 +64,9 @@ impl RoaringBitmap { { // First read the cookie to determine which version of the format we are reading let (size, has_offsets, has_run_containers) = { - let cookie = other.read_u32::()?; + let cookie = reader.read_u32::()?; if cookie == SERIAL_COOKIE_NO_RUNCONTAINER { - (other.read_u32::()? as usize, true, false) + (reader.read_u32::()? as usize, true, false) } else if (cookie as u16) == SERIAL_COOKIE { let size = ((cookie >> 16) + 1) as usize; (size, size >= NO_OFFSET_THRESHOLD, true) @@ -78,7 +78,7 @@ impl RoaringBitmap { // Read the run container bitmap if necessary let run_container_bitmap = if has_run_containers { let mut bitmap = vec![0u8; (size + 7) / 8]; - other.read_exact(&mut bitmap)?; + reader.read_exact(&mut bitmap)?; Some(bitmap) } else { None @@ -90,54 +90,75 @@ impl RoaringBitmap { // Read the container descriptions let mut description_bytes = vec![0u8; size * DESCRIPTION_BYTES]; - other.read_exact(&mut description_bytes)?; - let mut description_bytes: Vec<[u16; 2]> = pod_collect_to_vec(&description_bytes); - description_bytes.iter_mut().for_each(|[ref mut k, ref mut c]| { - *k = u16::from_le(*k); - *c = u16::from_le(*c); - }); + reader.read_exact(&mut description_bytes)?; + let mut description_bytes = &description_bytes[..]; if has_offsets { let mut offsets = vec![0u8; size * OFFSET_BYTES]; - other.read_exact(&mut offsets)?; - drop(offsets); // Not useful when deserializing into memory + reader.read_exact(&mut offsets)?; + drop(offsets); // We could use these offsets but we are lazy } let mut containers = Vec::with_capacity(size); - for container in &self.containers { - let (i, key, cardinality) = - match description_bytes.binary_search_by_key(&container.key, |[k, _]| *k) { - Ok(index) => { - let [key, cardinality] = description_bytes[index]; - (index, key, u64::from(cardinality) + 1) - } - Err(_) => continue, - }; + + // Read each container + for i in 0..size { + let key = description_bytes.read_u16::()?; + let container = match self.containers.binary_search_by_key(&key, |c| c.key) { + Ok(index) => self.containers.get(index), + Err(_) => None, + }; + let cardinality = u64::from(description_bytes.read_u16::()?) + 1; // If the run container bitmap is present, check if this container is a run container let is_run_container = run_container_bitmap.as_ref().map_or(false, |bm| bm[i / 8] & (1 << (i % 8)) != 0); let mut store = if is_run_container { - todo!("support run containers") + let runs = reader.read_u16::()?; + let mut intervals = vec![[0, 0]; runs as usize]; + reader.read_exact(cast_slice_mut(&mut intervals))?; + if container.is_none() { + continue; + } + intervals.iter_mut().for_each(|[s, len]| { + *s = u16::from_le(*s); + *len = u16::from_le(*len); + }); + + let cardinality = intervals.iter().map(|[_, len]| *len as usize).sum(); + let mut store = Store::with_capacity(cardinality); + intervals.into_iter().try_for_each(|[s, len]| -> Result<(), io::ErrorKind> { + let end = s.checked_add(len).ok_or(io::ErrorKind::InvalidData)?; + store.insert_range(RangeInclusive::new(s, end)); + Ok(()) + })?; + store } else if cardinality <= ARRAY_LIMIT { let mut values = vec![0; cardinality as usize]; - other.read_exact(cast_slice_mut(&mut values))?; + reader.read_exact(cast_slice_mut(&mut values))?; + if container.is_none() { + continue; + } values.iter_mut().for_each(|n| *n = u16::from_le(*n)); let array = a(values).map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?; Store::Array(array) } else { let mut values = Box::new([0; BITMAP_LENGTH]); - other.read_exact(cast_slice_mut(&mut values[..]))?; + reader.read_exact(cast_slice_mut(&mut values[..]))?; + if container.is_none() { + continue; + } values.iter_mut().for_each(|n| *n = u64::from_le(*n)); let bitmap = b(cardinality, values) .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?; Store::Bitmap(bitmap) }; - store &= &container.store; - - containers.push(Container { key, store }); + if let Some(container) = container { + store &= &container.store; + containers.push(Container { key, store }); + } } Ok(RoaringBitmap { containers }) From 95432ece20e966b40f86b2b4ec648fbe3b3606b2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 5 Jun 2024 10:56:39 -0400 Subject: [PATCH 03/12] Do the intersections on containers to ensure correct store type --- src/bitmap/container.rs | 4 + src/bitmap/ops_with_serialized.rs | 130 +++++++++++++++++++--------- src/bitmap/store/array_store/mod.rs | 4 + src/bitmap/store/bitmap_store.rs | 4 + src/bitmap/store/mod.rs | 7 ++ 5 files changed, 108 insertions(+), 41 deletions(-) diff --git a/src/bitmap/container.rs b/src/bitmap/container.rs index a28c7c67..3ab1e5ad 100644 --- a/src/bitmap/container.rs +++ b/src/bitmap/container.rs @@ -37,6 +37,10 @@ impl Container { self.store.len() } + pub fn is_empty(&self) -> bool { + self.store.is_empty() + } + pub fn insert(&mut self, index: u16) -> bool { if self.store.insert(index) { self.ensure_correct_store(); diff --git a/src/bitmap/ops_with_serialized.rs b/src/bitmap/ops_with_serialized.rs index ec9d31d3..e0eb06a4 100644 --- a/src/bitmap/ops_with_serialized.rs +++ b/src/bitmap/ops_with_serialized.rs @@ -1,9 +1,10 @@ -use bytemuck::{cast_slice_mut, pod_collect_to_vec}; +use bytemuck::cast_slice_mut; use byteorder::{LittleEndian, ReadBytesExt}; use core::convert::Infallible; -use core::ops::{BitAndAssign, RangeInclusive}; +use core::mem; +use core::ops::RangeInclusive; use std::error::Error; -use std::io; +use std::io::{self, SeekFrom}; use crate::bitmap::container::Container; use crate::bitmap::serialization::{ @@ -39,7 +40,7 @@ impl RoaringBitmap { // TODO convert this into a trait pub fn intersection_with_serialized_unchecked(&self, other: R) -> io::Result where - R: io::Read, + R: io::Read + io::Seek, { RoaringBitmap::intersection_with_serialized_impl::( self, @@ -56,7 +57,7 @@ impl RoaringBitmap { b: B, ) -> io::Result where - R: io::Read, + R: io::Read + io::Seek, A: Fn(Vec) -> Result, AErr: Error + Send + Sync + 'static, B: Fn(u64, Box<[u64; 1024]>) -> Result, @@ -101,7 +102,7 @@ impl RoaringBitmap { let mut containers = Vec::with_capacity(size); - // Read each container + // Read each container and skip the useless ones for i in 0..size { let key = description_bytes.read_u16::()?; let container = match self.containers.binary_search_by_key(&key, |c| c.key) { @@ -114,53 +115,100 @@ impl RoaringBitmap { let is_run_container = run_container_bitmap.as_ref().map_or(false, |bm| bm[i / 8] & (1 << (i % 8)) != 0); - let mut store = if is_run_container { + let store = if is_run_container { let runs = reader.read_u16::()?; - let mut intervals = vec![[0, 0]; runs as usize]; - reader.read_exact(cast_slice_mut(&mut intervals))?; - if container.is_none() { - continue; + match container { + Some(_) => { + let mut intervals = vec![[0, 0]; runs as usize]; + reader.read_exact(cast_slice_mut(&mut intervals))?; + intervals.iter_mut().for_each(|[s, len]| { + *s = u16::from_le(*s); + *len = u16::from_le(*len); + }); + + let cardinality = intervals.iter().map(|[_, len]| *len as usize).sum(); + let mut store = Store::with_capacity(cardinality); + intervals.into_iter().try_for_each( + |[s, len]| -> Result<(), io::ErrorKind> { + let end = s.checked_add(len).ok_or(io::ErrorKind::InvalidData)?; + store.insert_range(RangeInclusive::new(s, end)); + Ok(()) + }, + )?; + store + } + None => { + let runs_size = mem::size_of::() * 2 * runs as usize; + reader.seek(SeekFrom::Current(runs_size as i64))?; + continue; + } } - intervals.iter_mut().for_each(|[s, len]| { - *s = u16::from_le(*s); - *len = u16::from_le(*len); - }); - - let cardinality = intervals.iter().map(|[_, len]| *len as usize).sum(); - let mut store = Store::with_capacity(cardinality); - intervals.into_iter().try_for_each(|[s, len]| -> Result<(), io::ErrorKind> { - let end = s.checked_add(len).ok_or(io::ErrorKind::InvalidData)?; - store.insert_range(RangeInclusive::new(s, end)); - Ok(()) - })?; - store } else if cardinality <= ARRAY_LIMIT { - let mut values = vec![0; cardinality as usize]; - reader.read_exact(cast_slice_mut(&mut values))?; - if container.is_none() { - continue; + match container { + Some(_) => { + let mut values = vec![0; cardinality as usize]; + reader.read_exact(cast_slice_mut(&mut values))?; + values.iter_mut().for_each(|n| *n = u16::from_le(*n)); + let array = + a(values).map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?; + Store::Array(array) + } + None => { + let array_size = mem::size_of::() * cardinality as usize; + reader.seek(SeekFrom::Current(array_size as i64))?; + continue; + } } - values.iter_mut().for_each(|n| *n = u16::from_le(*n)); - let array = a(values).map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?; - Store::Array(array) } else { - let mut values = Box::new([0; BITMAP_LENGTH]); - reader.read_exact(cast_slice_mut(&mut values[..]))?; - if container.is_none() { - continue; + match container { + Some(_) => { + let mut values = Box::new([0; BITMAP_LENGTH]); + reader.read_exact(cast_slice_mut(&mut values[..]))?; + values.iter_mut().for_each(|n| *n = u64::from_le(*n)); + let bitmap = b(cardinality, values) + .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?; + Store::Bitmap(bitmap) + } + None => { + let bitmap_size = mem::size_of::() * BITMAP_LENGTH; + reader.seek(SeekFrom::Current(bitmap_size as i64))?; + continue; + } } - values.iter_mut().for_each(|n| *n = u64::from_le(*n)); - let bitmap = b(cardinality, values) - .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?; - Store::Bitmap(bitmap) }; if let Some(container) = container { - store &= &container.store; - containers.push(Container { key, store }); + let mut tmp_container = Container { key, store }; + tmp_container &= container; + if !tmp_container.is_empty() { + containers.push(tmp_container); + } } } Ok(RoaringBitmap { containers }) } } + +#[cfg(test)] +#[cfg(feature = "std")] // We need this to serialize bitmaps +mod test { + use crate::RoaringBitmap; + use proptest::prelude::*; + use std::io::Cursor; + + // fast count tests + proptest! { + #[test] + fn intersection_with_serialized_eq_materialized_intersection( + a in RoaringBitmap::arbitrary(), + b in RoaringBitmap::arbitrary() + ) { + let mut serialized_bytes_b = Vec::new(); + b.serialize_into(&mut serialized_bytes_b).unwrap(); + let serialized_bytes_b = &serialized_bytes_b[..]; + + prop_assert_eq!(a.intersection_with_serialized_unchecked(Cursor::new(serialized_bytes_b)).unwrap(), a & b); + } + } +} diff --git a/src/bitmap/store/array_store/mod.rs b/src/bitmap/store/array_store/mod.rs index 6c41aadb..883db31f 100644 --- a/src/bitmap/store/array_store/mod.rs +++ b/src/bitmap/store/array_store/mod.rs @@ -200,6 +200,10 @@ impl ArrayStore { self.vec.len() as u64 } + pub fn is_empty(&self) -> bool { + self.vec.is_empty() + } + pub fn min(&self) -> Option { self.vec.first().copied() } diff --git a/src/bitmap/store/bitmap_store.rs b/src/bitmap/store/bitmap_store.rs index 731fc929..f349a2aa 100644 --- a/src/bitmap/store/bitmap_store.rs +++ b/src/bitmap/store/bitmap_store.rs @@ -241,6 +241,10 @@ impl BitmapStore { self.len } + pub fn is_empty(&self) -> bool { + self.len == 0 + } + pub fn min(&self) -> Option { self.bits .iter() diff --git a/src/bitmap/store/mod.rs b/src/bitmap/store/mod.rs index 25426295..bb0d5822 100644 --- a/src/bitmap/store/mod.rs +++ b/src/bitmap/store/mod.rs @@ -177,6 +177,13 @@ impl Store { } } + pub fn is_empty(&self) -> bool { + match self { + Array(vec) => vec.is_empty(), + Bitmap(bits) => bits.is_empty(), + } + } + pub fn min(&self) -> Option { match self { Array(vec) => vec.min(), From 77166fa114edda698f1f3f42a682f25fa7dea059 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 5 Jun 2024 13:50:36 -0400 Subject: [PATCH 04/12] Use the container and store is_empty methods --- src/bitmap/inherent.rs | 4 ++-- src/bitmap/multiops.rs | 4 ++-- src/bitmap/ops.rs | 16 ++++++++-------- 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/src/bitmap/inherent.rs b/src/bitmap/inherent.rs index 57c3ffec..559b7579 100644 --- a/src/bitmap/inherent.rs +++ b/src/bitmap/inherent.rs @@ -208,7 +208,7 @@ impl RoaringBitmap { match self.containers.binary_search_by_key(&key, |c| c.key) { Ok(loc) => { if self.containers[loc].remove(index) { - if self.containers[loc].len() == 0 { + if self.containers[loc].is_empty() { self.containers.remove(loc); } true @@ -253,7 +253,7 @@ impl RoaringBitmap { let a = if key == start_container_key { start_index } else { 0 }; let b = if key == end_container_key { end_index } else { u16::MAX }; removed += self.containers[index].remove_range(a..=b); - if self.containers[index].len() == 0 { + if self.containers[index].is_empty() { self.containers.remove(index); continue; } diff --git a/src/bitmap/multiops.rs b/src/bitmap/multiops.rs index 1982e4f3..66a4e085 100644 --- a/src/bitmap/multiops.rs +++ b/src/bitmap/multiops.rs @@ -232,7 +232,7 @@ fn try_multi_or_owned( } containers.retain_mut(|container| { - if container.len() > 0 { + if !container.is_empty() { container.ensure_correct_store(); true } else { @@ -258,7 +258,7 @@ fn try_multi_xor_owned( } containers.retain_mut(|container| { - if container.len() > 0 { + if !container.is_empty() { container.ensure_correct_store(); true } else { diff --git a/src/bitmap/ops.rs b/src/bitmap/ops.rs index f99a376b..4337dbdb 100644 --- a/src/bitmap/ops.rs +++ b/src/bitmap/ops.rs @@ -223,7 +223,7 @@ impl BitAnd<&RoaringBitmap> for &RoaringBitmap { for pair in Pairs::new(&self.containers, &rhs.containers) { if let (Some(lhs), Some(rhs)) = pair { let container = BitAnd::bitand(lhs, rhs); - if container.len() != 0 { + if !container.is_empty() { containers.push(container); } } @@ -248,7 +248,7 @@ impl BitAndAssign for RoaringBitmap { let rhs_cont = &mut rhs.containers[loc]; let rhs_cont = mem::replace(rhs_cont, Container::new(rhs_cont.key)); BitAndAssign::bitand_assign(cont, rhs_cont); - cont.len() != 0 + !cont.is_empty() } Err(_) => false, } @@ -264,7 +264,7 @@ impl BitAndAssign<&RoaringBitmap> for RoaringBitmap { match rhs.containers.binary_search_by_key(&key, |c| c.key) { Ok(loc) => { BitAndAssign::bitand_assign(cont, &rhs.containers[loc]); - cont.len() != 0 + !cont.is_empty() } Err(_) => false, } @@ -314,7 +314,7 @@ impl Sub<&RoaringBitmap> for &RoaringBitmap { (None, Some(_)) => (), (Some(lhs), Some(rhs)) => { let container = Sub::sub(lhs, rhs); - if container.len() != 0 { + if !container.is_empty() { containers.push(container); } } @@ -340,7 +340,7 @@ impl SubAssign<&RoaringBitmap> for RoaringBitmap { match rhs.containers.binary_search_by_key(&cont.key, |c| c.key) { Ok(loc) => { SubAssign::sub_assign(cont, &rhs.containers[loc]); - cont.len() != 0 + !cont.is_empty() } Err(_) => true, } @@ -390,7 +390,7 @@ impl BitXor<&RoaringBitmap> for &RoaringBitmap { (None, Some(rhs)) => containers.push(rhs.clone()), (Some(lhs), Some(rhs)) => { let container = BitXor::bitxor(lhs, rhs); - if container.len() != 0 { + if !container.is_empty() { containers.push(container); } } @@ -409,7 +409,7 @@ impl BitXorAssign for RoaringBitmap { match pair { (Some(mut lhs), Some(rhs)) => { BitXorAssign::bitxor_assign(&mut lhs, rhs); - if lhs.len() != 0 { + if !lhs.is_empty() { self.containers.push(lhs); } } @@ -428,7 +428,7 @@ impl BitXorAssign<&RoaringBitmap> for RoaringBitmap { match pair { (Some(mut lhs), Some(rhs)) => { BitXorAssign::bitxor_assign(&mut lhs, rhs); - if lhs.len() != 0 { + if !lhs.is_empty() { self.containers.push(lhs); } } From 4466ae0104ed44a8cf41d187d9359483fe190701 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 5 Jun 2024 15:01:13 -0400 Subject: [PATCH 05/12] Fix building with no-std --- src/bitmap/mod.rs | 1 + src/bitmap/ops_with_serialized.rs | 7 ++----- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/src/bitmap/mod.rs b/src/bitmap/mod.rs index 1d481afa..ed5567d5 100644 --- a/src/bitmap/mod.rs +++ b/src/bitmap/mod.rs @@ -12,6 +12,7 @@ mod cmp; mod inherent; mod iter; mod ops; +#[cfg(feature = "std")] mod ops_with_serialized; #[cfg(feature = "serde")] mod serde; diff --git a/src/bitmap/ops_with_serialized.rs b/src/bitmap/ops_with_serialized.rs index e0eb06a4..9d33c560 100644 --- a/src/bitmap/ops_with_serialized.rs +++ b/src/bitmap/ops_with_serialized.rs @@ -1,10 +1,10 @@ use bytemuck::cast_slice_mut; use byteorder::{LittleEndian, ReadBytesExt}; use core::convert::Infallible; -use core::mem; -use core::ops::RangeInclusive; use std::error::Error; use std::io::{self, SeekFrom}; +use std::mem; +use std::ops::RangeInclusive; use crate::bitmap::container::Container; use crate::bitmap::serialization::{ @@ -13,9 +13,6 @@ use crate::bitmap::serialization::{ }; use crate::RoaringBitmap; -#[cfg(not(feature = "std"))] -use alloc::vec::Vec; - use super::container::ARRAY_LIMIT; use super::store::{ArrayStore, BitmapStore, Store, BITMAP_LENGTH}; From add2002f5c3ecd3b2419d0d0a8c2400b3d340474 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 5 Jun 2024 22:09:19 -0400 Subject: [PATCH 06/12] Avoid allocating too much containers --- src/bitmap/ops_with_serialized.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/bitmap/ops_with_serialized.rs b/src/bitmap/ops_with_serialized.rs index 9d33c560..c2811f48 100644 --- a/src/bitmap/ops_with_serialized.rs +++ b/src/bitmap/ops_with_serialized.rs @@ -97,7 +97,7 @@ impl RoaringBitmap { drop(offsets); // We could use these offsets but we are lazy } - let mut containers = Vec::with_capacity(size); + let mut containers = Vec::new(); // Read each container and skip the useless ones for i in 0..size { From 7b4df74555ffe25061a0670aac807722c0bfbf2c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Thu, 6 Jun 2024 09:25:32 -0400 Subject: [PATCH 07/12] Skip offsets instead of copying and dropping them --- src/bitmap/ops_with_serialized.rs | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/src/bitmap/ops_with_serialized.rs b/src/bitmap/ops_with_serialized.rs index c2811f48..1a174598 100644 --- a/src/bitmap/ops_with_serialized.rs +++ b/src/bitmap/ops_with_serialized.rs @@ -92,9 +92,8 @@ impl RoaringBitmap { let mut description_bytes = &description_bytes[..]; if has_offsets { - let mut offsets = vec![0u8; size * OFFSET_BYTES]; - reader.read_exact(&mut offsets)?; - drop(offsets); // We could use these offsets but we are lazy + // We could use these offsets but we are lazy + reader.seek(SeekFrom::Current((size * OFFSET_BYTES) as i64))?; } let mut containers = Vec::new(); @@ -175,10 +174,10 @@ impl RoaringBitmap { }; if let Some(container) = container { - let mut tmp_container = Container { key, store }; - tmp_container &= container; - if !tmp_container.is_empty() { - containers.push(tmp_container); + let mut other_container = Container { key, store }; + other_container &= container; + if !other_container.is_empty() { + containers.push(other_container); } } } From 88ceb7de4d76c4f47bb0a14a13506a74db09c22e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Thu, 6 Jun 2024 09:44:12 -0400 Subject: [PATCH 08/12] Improve the documentation of the function --- src/bitmap/ops_with_serialized.rs | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/src/bitmap/ops_with_serialized.rs b/src/bitmap/ops_with_serialized.rs index 1a174598..cb27dd45 100644 --- a/src/bitmap/ops_with_serialized.rs +++ b/src/bitmap/ops_with_serialized.rs @@ -17,24 +17,31 @@ use super::container::ARRAY_LIMIT; use super::store::{ArrayStore, BitmapStore, Store, BITMAP_LENGTH}; impl RoaringBitmap { - /// Computes the len of the intersection with the specified other bitmap without creating a - /// new bitmap. + /// Computes the intersection between a materialized [`RoaringBitmap`] and a serialized one. /// - /// This is faster and more space efficient when you're only interested in the cardinality of - /// the intersection. + /// This is faster and more space efficient when you only need the intersection result. + /// It reduces the number of deserialized internal container and therefore + /// the number of allocations and copies of bytes. /// /// # Examples /// /// ```rust /// use roaring::RoaringBitmap; + /// use std::io::Cursor; /// /// let rb1: RoaringBitmap = (1..4).collect(); /// let rb2: RoaringBitmap = (3..5).collect(); /// + /// // Let's say the rb2 bitmap is serialized + /// let mut bytes = Vec::new(); + /// rb2.serialize_into(&mut bytes).unwrap(); + /// let rb2_bytes = Cursor::new(bytes); /// - /// assert_eq!(rb1.intersection_len(&rb2), (rb1 & rb2).len()); + /// assert_eq!( + /// rb1.intersection_with_serialized_unchecked(rb2_bytes).unwrap(), + /// rb1 & rb2, + /// ); /// ``` - // TODO convert this into a trait pub fn intersection_with_serialized_unchecked(&self, other: R) -> io::Result where R: io::Read + io::Seek, @@ -92,7 +99,7 @@ impl RoaringBitmap { let mut description_bytes = &description_bytes[..]; if has_offsets { - // We could use these offsets but we are lazy + // I could use these offsets but I am a lazy developer (for now) reader.seek(SeekFrom::Current((size * OFFSET_BYTES) as i64))?; } From b92f110c7363901b2bc170585c4fc7e98538c79a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Thu, 6 Jun 2024 10:09:03 -0400 Subject: [PATCH 09/12] Deserialize the description info at the beginning --- src/bitmap/ops_with_serialized.rs | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/src/bitmap/ops_with_serialized.rs b/src/bitmap/ops_with_serialized.rs index cb27dd45..e9404e9d 100644 --- a/src/bitmap/ops_with_serialized.rs +++ b/src/bitmap/ops_with_serialized.rs @@ -8,8 +8,7 @@ use std::ops::RangeInclusive; use crate::bitmap::container::Container; use crate::bitmap::serialization::{ - DESCRIPTION_BYTES, NO_OFFSET_THRESHOLD, OFFSET_BYTES, SERIAL_COOKIE, - SERIAL_COOKIE_NO_RUNCONTAINER, + NO_OFFSET_THRESHOLD, OFFSET_BYTES, SERIAL_COOKIE, SERIAL_COOKIE_NO_RUNCONTAINER, }; use crate::RoaringBitmap; @@ -94,25 +93,26 @@ impl RoaringBitmap { } // Read the container descriptions - let mut description_bytes = vec![0u8; size * DESCRIPTION_BYTES]; - reader.read_exact(&mut description_bytes)?; - let mut description_bytes = &description_bytes[..]; + let mut description_bytes = vec![[0u16; 2]; size]; + reader.read_exact(cast_slice_mut(&mut description_bytes))?; + description_bytes.iter_mut().for_each(|[ref mut key, ref mut len]| { + *key = u16::from_le(*key); + *len = u16::from_le(*len); + }); + if has_offsets { // I could use these offsets but I am a lazy developer (for now) reader.seek(SeekFrom::Current((size * OFFSET_BYTES) as i64))?; } - let mut containers = Vec::new(); - // Read each container and skip the useless ones - for i in 0..size { - let key = description_bytes.read_u16::()?; + for (i, &[key, len_minus_one]) in description_bytes.iter().enumerate() { let container = match self.containers.binary_search_by_key(&key, |c| c.key) { Ok(index) => self.containers.get(index), Err(_) => None, }; - let cardinality = u64::from(description_bytes.read_u16::()?) + 1; + let cardinality = u64::from(len_minus_one) + 1; // If the run container bitmap is present, check if this container is a run container let is_run_container = From 88b848b84cf7c8cc8d2ea02dfff77b5a54d822ec Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Thu, 6 Jun 2024 10:37:41 -0400 Subject: [PATCH 10/12] Use containers offsets when available --- src/bitmap/ops_with_serialized.rs | 105 +++++++++++++++++++++++++++--- 1 file changed, 97 insertions(+), 8 deletions(-) diff --git a/src/bitmap/ops_with_serialized.rs b/src/bitmap/ops_with_serialized.rs index e9404e9d..6711b1b0 100644 --- a/src/bitmap/ops_with_serialized.rs +++ b/src/bitmap/ops_with_serialized.rs @@ -8,7 +8,7 @@ use std::ops::RangeInclusive; use crate::bitmap::container::Container; use crate::bitmap::serialization::{ - NO_OFFSET_THRESHOLD, OFFSET_BYTES, SERIAL_COOKIE, SERIAL_COOKIE_NO_RUNCONTAINER, + NO_OFFSET_THRESHOLD, SERIAL_COOKIE, SERIAL_COOKIE_NO_RUNCONTAINER, }; use crate::RoaringBitmap; @@ -93,21 +93,35 @@ impl RoaringBitmap { } // Read the container descriptions - let mut description_bytes = vec![[0u16; 2]; size]; - reader.read_exact(cast_slice_mut(&mut description_bytes))?; - description_bytes.iter_mut().for_each(|[ref mut key, ref mut len]| { + let mut descriptions = vec![[0; 2]; size]; + reader.read_exact(cast_slice_mut(&mut descriptions))?; + descriptions.iter_mut().for_each(|[ref mut key, ref mut len]| { *key = u16::from_le(*key); *len = u16::from_le(*len); }); - if has_offsets { - // I could use these offsets but I am a lazy developer (for now) - reader.seek(SeekFrom::Current((size * OFFSET_BYTES) as i64))?; + let mut offsets = vec![0; size]; + reader.read_exact(cast_slice_mut(&mut offsets))?; + offsets.iter_mut().for_each(|offset| *offset = u32::from_le(*offset)); + + // Loop on the materialized containers if there + // are less or as many of them than serialized ones. + if self.containers.len() <= size { + return self.intersection_with_serialized_impl_with_offsets( + reader, + a, + b, + &descriptions, + &offsets, + run_container_bitmap.as_deref(), + ); + } } // Read each container and skip the useless ones - for (i, &[key, len_minus_one]) in description_bytes.iter().enumerate() { + let mut containers = Vec::new(); + for (i, &[key, len_minus_one]) in descriptions.iter().enumerate() { let container = match self.containers.binary_search_by_key(&key, |c| c.key) { Ok(index) => self.containers.get(index), Err(_) => None, @@ -191,6 +205,81 @@ impl RoaringBitmap { Ok(RoaringBitmap { containers }) } + + fn intersection_with_serialized_impl_with_offsets( + &self, + mut reader: R, + a: A, + b: B, + descriptions: &[[u16; 2]], + offsets: &[u32], + run_container_bitmap: Option<&[u8]>, + ) -> io::Result + where + R: io::Read + io::Seek, + A: Fn(Vec) -> Result, + AErr: Error + Send + Sync + 'static, + B: Fn(u64, Box<[u64; 1024]>) -> Result, + BErr: Error + Send + Sync + 'static, + { + let mut containers = Vec::new(); + for container in &self.containers { + let i = match descriptions.binary_search_by_key(&container.key, |[k, _]| *k) { + Ok(index) => index, + Err(_) => continue, + }; + + // Seek to the bytes of the container we want. + reader.seek(SeekFrom::Start(offsets[i] as u64))?; + + let [key, len_minus_one] = descriptions[i]; + let cardinality = u64::from(len_minus_one) + 1; + + // If the run container bitmap is present, check if this container is a run container + let is_run_container = + run_container_bitmap.as_ref().map_or(false, |bm| bm[i / 8] & (1 << (i % 8)) != 0); + + let store = if is_run_container { + let runs = reader.read_u16::().unwrap(); + let mut intervals = vec![[0, 0]; runs as usize]; + reader.read_exact(cast_slice_mut(&mut intervals)).unwrap(); + intervals.iter_mut().for_each(|[s, len]| { + *s = u16::from_le(*s); + *len = u16::from_le(*len); + }); + + let cardinality = intervals.iter().map(|[_, len]| *len as usize).sum(); + let mut store = Store::with_capacity(cardinality); + intervals.into_iter().try_for_each(|[s, len]| -> Result<(), io::ErrorKind> { + let end = s.checked_add(len).ok_or(io::ErrorKind::InvalidData)?; + store.insert_range(RangeInclusive::new(s, end)); + Ok(()) + })?; + store + } else if cardinality <= ARRAY_LIMIT { + let mut values = vec![0; cardinality as usize]; + reader.read_exact(cast_slice_mut(&mut values)).unwrap(); + values.iter_mut().for_each(|n| *n = u16::from_le(*n)); + let array = a(values).map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?; + Store::Array(array) + } else { + let mut values = Box::new([0; BITMAP_LENGTH]); + reader.read_exact(cast_slice_mut(&mut values[..])).unwrap(); + values.iter_mut().for_each(|n| *n = u64::from_le(*n)); + let bitmap = b(cardinality, values) + .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?; + Store::Bitmap(bitmap) + }; + + let mut other_container = Container { key, store }; + other_container &= container; + if !other_container.is_empty() { + containers.push(other_container); + } + } + + Ok(RoaringBitmap { containers }) + } } #[cfg(test)] From e417314f0774a44bbb4bc2578a3bf457413134ed Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Thu, 6 Jun 2024 10:49:13 -0400 Subject: [PATCH 11/12] Always use container offsets when available --- src/bitmap/ops_with_serialized.rs | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) diff --git a/src/bitmap/ops_with_serialized.rs b/src/bitmap/ops_with_serialized.rs index 6711b1b0..f19ba53c 100644 --- a/src/bitmap/ops_with_serialized.rs +++ b/src/bitmap/ops_with_serialized.rs @@ -104,19 +104,14 @@ impl RoaringBitmap { let mut offsets = vec![0; size]; reader.read_exact(cast_slice_mut(&mut offsets))?; offsets.iter_mut().for_each(|offset| *offset = u32::from_le(*offset)); - - // Loop on the materialized containers if there - // are less or as many of them than serialized ones. - if self.containers.len() <= size { - return self.intersection_with_serialized_impl_with_offsets( - reader, - a, - b, - &descriptions, - &offsets, - run_container_bitmap.as_deref(), - ); - } + return self.intersection_with_serialized_impl_with_offsets( + reader, + a, + b, + &descriptions, + &offsets, + run_container_bitmap.as_deref(), + ); } // Read each container and skip the useless ones From 457b2c91c125c4319730776203b81c6867300365 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Sat, 8 Jun 2024 00:12:24 +0200 Subject: [PATCH 12/12] Remove useless std feature gate Co-authored-by: Tamo --- src/bitmap/ops_with_serialized.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/src/bitmap/ops_with_serialized.rs b/src/bitmap/ops_with_serialized.rs index f19ba53c..e3808803 100644 --- a/src/bitmap/ops_with_serialized.rs +++ b/src/bitmap/ops_with_serialized.rs @@ -278,7 +278,6 @@ impl RoaringBitmap { } #[cfg(test)] -#[cfg(feature = "std")] // We need this to serialize bitmaps mod test { use crate::RoaringBitmap; use proptest::prelude::*;