From 0a11dd0192878e49b0d8a350f8ff1242c6281ea2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Thu, 6 Jun 2024 10:37:41 -0400 Subject: [PATCH] Use containers offsets when available --- src/bitmap/ops_with_serialized.rs | 81 ++++++++++++++++++++++++++++--- 1 file changed, 74 insertions(+), 7 deletions(-) diff --git a/src/bitmap/ops_with_serialized.rs b/src/bitmap/ops_with_serialized.rs index e9404e9d..ca4f9104 100644 --- a/src/bitmap/ops_with_serialized.rs +++ b/src/bitmap/ops_with_serialized.rs @@ -8,7 +8,7 @@ use std::ops::RangeInclusive; use crate::bitmap::container::Container; use crate::bitmap::serialization::{ - NO_OFFSET_THRESHOLD, OFFSET_BYTES, SERIAL_COOKIE, SERIAL_COOKIE_NO_RUNCONTAINER, + NO_OFFSET_THRESHOLD, SERIAL_COOKIE, SERIAL_COOKIE_NO_RUNCONTAINER, }; use crate::RoaringBitmap; @@ -93,21 +93,88 @@ impl RoaringBitmap { } // Read the container descriptions - let mut description_bytes = vec![[0u16; 2]; size]; - reader.read_exact(cast_slice_mut(&mut description_bytes))?; - description_bytes.iter_mut().for_each(|[ref mut key, ref mut len]| { + let mut descriptions = vec![[0; 2]; size]; + reader.read_exact(cast_slice_mut(&mut descriptions))?; + descriptions.iter_mut().for_each(|[ref mut key, ref mut len]| { *key = u16::from_le(*key); *len = u16::from_le(*len); }); + let mut containers = Vec::new(); if has_offsets { - // I could use these offsets but I am a lazy developer (for now) - reader.seek(SeekFrom::Current((size * OFFSET_BYTES) as i64))?; + let mut offsets = vec![0; size]; + reader.read_exact(cast_slice_mut(&mut offsets))?; + offsets.iter_mut().for_each(|offset| *offset = u32::from_le(*offset)); + + // Loop on the materialized containers if there + // are less or as many of them than serialized ones. + if self.containers.len() <= size { + for container in &self.containers { + let i = match descriptions.binary_search_by_key(&container.key, |[k, _]| *k) { + Ok(index) => index, + Err(_) => continue, + }; + + // Seek to the bytes of the container we want. + reader.seek(SeekFrom::Start(offsets[i] as u64))?; + + let [key, len_minus_one] = descriptions[i]; + let cardinality = u64::from(len_minus_one) + 1; + + // If the run container bitmap is present, check if this container is a run container + let is_run_container = run_container_bitmap + .as_ref() + .map_or(false, |bm| bm[i / 8] & (1 << (i % 8)) != 0); + + let store = if is_run_container { + let runs = reader.read_u16::().unwrap(); + let mut intervals = vec![[0, 0]; runs as usize]; + reader.read_exact(cast_slice_mut(&mut intervals)).unwrap(); + intervals.iter_mut().for_each(|[s, len]| { + *s = u16::from_le(*s); + *len = u16::from_le(*len); + }); + + let cardinality = intervals.iter().map(|[_, len]| *len as usize).sum(); + let mut store = Store::with_capacity(cardinality); + intervals.into_iter().try_for_each( + |[s, len]| -> Result<(), io::ErrorKind> { + let end = s.checked_add(len).ok_or(io::ErrorKind::InvalidData)?; + store.insert_range(RangeInclusive::new(s, end)); + Ok(()) + }, + )?; + store + } else if cardinality <= ARRAY_LIMIT { + let mut values = vec![0; cardinality as usize]; + reader.read_exact(cast_slice_mut(&mut values)).unwrap(); + values.iter_mut().for_each(|n| *n = u16::from_le(*n)); + let array = + a(values).map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?; + Store::Array(array) + } else { + let mut values = Box::new([0; BITMAP_LENGTH]); + reader.read_exact(cast_slice_mut(&mut values[..])).unwrap(); + values.iter_mut().for_each(|n| *n = u64::from_le(*n)); + let bitmap = b(cardinality, values) + .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?; + Store::Bitmap(bitmap) + }; + + let mut other_container = Container { key, store }; + other_container &= container; + if !other_container.is_empty() { + containers.push(other_container); + } + } + + return Ok(RoaringBitmap { containers }); + } } // Read each container and skip the useless ones - for (i, &[key, len_minus_one]) in description_bytes.iter().enumerate() { + for (i, &[key, len_minus_one]) in descriptions.iter().enumerate() { let container = match self.containers.binary_search_by_key(&key, |c| c.key) { Ok(index) => self.containers.get(index), Err(_) => None,