Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Union with serialized bitmap #263

Draft
wants to merge 2 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions src/bitmap/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ mod cmp;
mod inherent;
mod iter;
mod ops;
mod ops_with_serialized;
#[cfg(feature = "serde")]
mod serde;
mod serialization;
Expand Down
191 changes: 191 additions & 0 deletions src/bitmap/ops_with_serialized.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,191 @@
use bytemuck::cast_slice_mut;
use byteorder::{LittleEndian, ReadBytesExt};
use std::{
io::{self, Read},
mem::size_of,
};

use crate::RoaringBitmap;

use super::{
container::Container,
store::{ArrayStore, BitmapStore, Store},
};

const SERIAL_COOKIE_NO_RUNCONTAINER: u32 = 12346;
const SERIAL_COOKIE: u16 = 12347;

impl RoaringBitmap {
/// Returns the union between a roaring bitmap and a serialized roaring bitmap.
///
/// Example
/// =======
///
/// ```
/// use roaring::RoaringBitmap;
///
/// let mut a = RoaringBitmap::from_sorted_iter(0..3000).unwrap();
/// let b = RoaringBitmap::from_sorted_iter(2000..6000).unwrap();
/// let union = &a | &b;
///
/// let mut b_ser = Vec::new();
/// b.serialize_into(&mut b_ser).unwrap();
/// a.union_with_serialized_unchecked(&*b_ser).unwrap();
///
/// assert_eq!(a, union);
/// ```
pub fn union_with_serialized_unchecked(&mut self, mut reader: impl Read) -> io::Result<()> {
let (size, has_offsets) = {
let cookie = reader.read_u32::<LittleEndian>()?;
if cookie == SERIAL_COOKIE_NO_RUNCONTAINER {
(reader.read_u32::<LittleEndian>()? as usize, true)
} else if (cookie as u16) == SERIAL_COOKIE {
return Err(io::Error::new(io::ErrorKind::Other, "run containers are unsupported"));
} else {
return Err(io::Error::new(io::ErrorKind::Other, "unknown cookie value"));
}
};

if size > u16::MAX as usize + 1 {
return Err(io::Error::new(io::ErrorKind::Other, "size is greater than supported"));
}

let mut description_bytes = vec![0u8; size * 4];
reader.read_exact(&mut description_bytes)?;
let mut description_bytes = &description_bytes[..];

if has_offsets {
let mut offsets = vec![0u8; size * 4];
reader.read_exact(&mut offsets)?;
drop(offsets); // Not useful when deserializing into memory
}

for _ in 0..size {
let key = description_bytes.read_u16::<LittleEndian>()?;
let len = u64::from(description_bytes.read_u16::<LittleEndian>()?) + 1;

if len <= 4096 {
match self.containers.binary_search_by_key(&key, |c| c.key) {
Ok(loc) => {
let container = &mut self.containers[loc];

for _ in 0..len {
let mut value = [0u8; size_of::<u16>()];
reader.read_exact(value.as_mut())?;
// TODO: since this is sorted it could probably be faster
let value = u16::from_le_bytes(value);
container.insert(value);
}
}
Err(loc) => {
let mut values = vec![0u16; len as usize];
reader.read_exact(cast_slice_mut(&mut values))?;
values.iter_mut().for_each(|n| *n = u16::from_le(*n));

let array = ArrayStore::from_vec_unchecked(values);
let mut container = Container::new(key);
container.store = Store::Array(array);
self.containers.insert(loc, container);
}
}
} else {
match self.containers.binary_search_by_key(&key, |c| c.key) {
Ok(loc) => {
let current_store = std::mem::take(&mut self.containers[loc].store);

let mut values = Box::new([0; 1024]);
reader.read_exact(cast_slice_mut(&mut values[..]))?;
values.iter_mut().for_each(|n| *n = u64::from_le(*n));

let mut store = BitmapStore::from_unchecked(len, values);

match current_store {
Store::Array(array) => array.into_iter().for_each(|el| {
store.insert(el);
}),
Store::Bitmap(bitmap_store) => store |= &bitmap_store,
};

self.containers[loc].store = Store::Bitmap(store);
}
Err(loc) => {
let mut values = Box::new([0; 1024]);
reader.read_exact(cast_slice_mut(&mut values[..]))?;
values.iter_mut().for_each(|n| *n = u64::from_le(*n));

let array = BitmapStore::from_unchecked(len, values);
let mut container = Container::new(key);
container.store = Store::Bitmap(array);
self.containers.insert(loc, container);
}
}
};
}
Ok(())
}
}

#[cfg(test)]
mod test {
use crate::RoaringBitmap;
use proptest::prelude::*;

proptest! {
#[test]
fn prop_or_with_serialized(
mut a in RoaringBitmap::arbitrary(),
b in RoaringBitmap::arbitrary()
) {
let union = &a | &b;

let mut b_ser = Vec::new();
b.serialize_into(&mut b_ser).unwrap();
a.union_with_serialized_unchecked(&*b_ser).unwrap();

prop_assert_eq!(a, union);
}
}

#[test]
fn or_with_serialized() {
let unions = [
(RoaringBitmap::new(), RoaringBitmap::new()),
(RoaringBitmap::from_sorted_iter([0]).unwrap(), RoaringBitmap::new()),
(RoaringBitmap::new(), RoaringBitmap::from_sorted_iter([0]).unwrap()),
(
RoaringBitmap::from_sorted_iter([0]).unwrap(),
RoaringBitmap::from_sorted_iter([0]).unwrap(),
),
(
RoaringBitmap::from_sorted_iter([0]).unwrap(),
RoaringBitmap::from_sorted_iter([1]).unwrap(),
),
(
RoaringBitmap::from_sorted_iter([0]).unwrap(),
RoaringBitmap::from_sorted_iter(0..3000).unwrap(),
),
(
RoaringBitmap::from_sorted_iter([]).unwrap(),
RoaringBitmap::from_sorted_iter(0..3000).unwrap(),
),
(
RoaringBitmap::from_sorted_iter(0..3000).unwrap(),
RoaringBitmap::from_sorted_iter([3001]).unwrap(),
),
(
RoaringBitmap::from_sorted_iter(0..3000).unwrap(),
RoaringBitmap::from_sorted_iter(3000..6000).unwrap(),
),
];

for (mut a, b) in unions {
let union = &a | &b;

let mut b_ser = Vec::new();
b.serialize_into(&mut b_ser).unwrap();
a.union_with_serialized_unchecked(&*b_ser).unwrap();

assert_eq!(a, union, "When testing: {a:?} | {b:?}");
}
}
}
Loading