Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

WIP: Run container #56

Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 22 additions & 2 deletions src/bitmap/container.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,8 @@ use std::fmt;
use super::store::{self, Store};
use super::util;

const ARRAY_LIMIT: u64 = 4096;
/// Cardinality threshold between the array and bitmap stores: a container
/// holding at most this many values is kept as an array, anything larger as a
/// bitmap (see `Container::ensure_correct_store`).
pub const ARRAY_LIMIT: u64 = 4096;
/// Largest number of runs for which `Container::optimize` will still convert a
/// container to a run store.
pub const RUN_MAX_SIZE: u64 = 2048;

#[derive(PartialEq, Clone)]
pub struct Container {
Expand Down Expand Up @@ -103,14 +104,33 @@ impl Container {
self.store.max()
}

fn ensure_correct_store(&mut self) {
fn ensure_correct_store(&mut self) -> bool {
let new_store = match (&self.store, self.len) {
(store @ &Store::Bitmap(..), len) if len <= ARRAY_LIMIT => Some(store.to_array()),
(store @ &Store::Array(..), len) if len > ARRAY_LIMIT => Some(store.to_bitmap()),
_ => None,
};
if let Some(new_store) = new_store {
self.store = new_store;
true
} else {
false
}
}

/// Picks a more suitable store for this container.
///
/// An array or bitmap store is converted to a run store when
/// `Store::count_runs` reports no more than `RUN_MAX_SIZE` runs and no more
/// than half the container's cardinality. In every other case the decision is
/// delegated to `ensure_correct_store`.
///
/// Returns `true` when the store was replaced, `false` otherwise.
pub fn optimize(&mut self) -> bool {
    // Run stores are never re-encoded here; defer to the array/bitmap check.
    if let Store::Run(..) = self.store {
        return self.ensure_correct_store();
    }

    let num_runs = self.store.count_runs();
    if num_runs > RUN_MAX_SIZE || num_runs > self.len / 2 {
        // Too many runs for a run store to be chosen.
        return self.ensure_correct_store();
    }

    self.store = self.store.to_run();
    true
}
}
Expand Down
5 changes: 3 additions & 2 deletions src/bitmap/fmt.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,11 @@ impl fmt::Debug for RoaringBitmap {
} else {
write!(
f,
"RoaringBitmap<{:?} values between {:?} and {:?}>",
"RoaringBitmap<{:?} values between {:?} and {:?} in {:?} containers>",
self.len(),
self.min().unwrap(),
self.max().unwrap()
self.max().unwrap(),
self.containers.len(),
)
}
}
Expand Down
18 changes: 18 additions & 0 deletions src/bitmap/inherent.rs
Original file line number Diff line number Diff line change
Expand Up @@ -247,6 +247,24 @@ impl RoaringBitmap {
.last()
.map(|tail| util::join(tail.key, tail.max()))
}

// TODO(jpg) actually come up with example that illustrates creation of run containers
/// Optimizes the container storage for this bitmap.
///
/// Every container is given the chance to switch to a different store via
/// `Container::optimize`.
///
/// Returns `true` if the storage of at least one container was modified,
/// `false` if nothing changed.
///
/// # Examples
///
/// ```rust
/// use roaring::RoaringBitmap;
///
/// let mut rb: RoaringBitmap = (1000..100000).collect();
/// rb.optimize();
/// ```
pub fn optimize(&mut self) -> bool {
    let mut changed = false;
    for container in &mut self.containers {
        // `|=` does not short-circuit, so every container is always visited.
        changed |= container.optimize();
    }
    changed
}
}

impl Default for RoaringBitmap {
Expand Down
171 changes: 140 additions & 31 deletions src/bitmap/serialization.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,25 @@ use std::io;

use super::container::Container;
use super::store::Store;
use crate::bitmap::container::ARRAY_LIMIT;
use crate::bitmap::store::{Interval, BITMAP_LENGTH};
use crate::RoaringBitmap;

// Cookie written at the start of the stream when no container uses a run
// store; it is followed by the container count as a separate u32.
const SERIAL_COOKIE_NO_RUNCONTAINER: u32 = 12346;
// Low 16 bits of the cookie when at least one run container is present; the
// high 16 bits then carry the container count minus one.
const SERIAL_COOKIE: u16 = 12347;
// In the run-container format the offset table is only present when the
// bitmap has at least this many containers.
const NO_OFFSET_THRESHOLD: usize = 4;

// Sizes of header structures
// The 32-bit cookie word.
const COOKIE_BYTES: usize = 4;
// The 32-bit container count that follows the cookie in the old format.
const SIZE_BYTES: usize = 4;
// One container description: u16 key followed by u16 (cardinality - 1).
const DESCRIPTION_BYTES: usize = 4;
// One u32 entry of the offset table.
const OFFSET_BYTES: usize = 4;

// Sizes of container structures
// A bitmap store is written as BITMAP_LENGTH little-endian u64 words.
const BITMAP_BYTES: usize = BITMAP_LENGTH * 8;
// Each value of an array store is written as a u16.
const ARRAY_ELEMENT_BYTES: usize = 2;
// A run store starts with its number of runs as a u16.
const RUN_NUM_BYTES: usize = 2;
// Each run is written as two u16s: the start and the length (end - start).
const RUN_ELEMENT_BYTES: usize = 4;

impl RoaringBitmap {
/// Return the size in bytes of the serialized output.
Expand All @@ -27,17 +40,23 @@ impl RoaringBitmap {
/// assert_eq!(rb1, rb2);
/// ```
pub fn serialized_size(&self) -> usize {
    // The header layout depends on whether any container uses a run store, so
    // determine that up front (same check as `serialize_into`).
    let has_run_containers = self.containers.iter().any(|container| match container.store {
        Store::Run(..) => true,
        _ => false,
    });

    // Payload bytes of every container, excluding all header structures
    // (descriptions and offsets are accounted for by `header_size`).
    let container_sizes: usize = self
        .containers
        .iter()
        .map(|container| match container.store {
            Store::Array(ref values) => values.len() * ARRAY_ELEMENT_BYTES,
            Store::Bitmap(..) => BITMAP_BYTES,
            // Run count prefix followed by (start, length) pairs.
            Store::Run(ref intervals) => RUN_NUM_BYTES + RUN_ELEMENT_BYTES * intervals.len(),
        })
        .sum();

    // header + container sizes
    header_size(self.containers.len(), has_run_containers) + container_sizes
}

/// Serialize this bitmap into [the standard Roaring on-disk format][format].
Expand All @@ -58,27 +77,61 @@ impl RoaringBitmap {
/// assert_eq!(rb1, rb2);
/// ```
pub fn serialize_into<W: io::Write>(&self, mut writer: W) -> io::Result<()> {
writer.write_u32::<LittleEndian>(SERIAL_COOKIE_NO_RUNCONTAINER)?;
writer.write_u32::<LittleEndian>(self.containers.len() as u32)?;
let has_run_containers = self.containers.iter().any(|c| {
if let Store::Run(_) = c.store {
true
} else {
false
}
});
let size = self.containers.len();

// Depending on if run containers are present or not write the appropriate header
if has_run_containers {
// The new format stores the container count in the most significant bits of the header
let cookie = SERIAL_COOKIE as u32 | ((size as u32 - 1) << 16);
writer.write_u32::<LittleEndian>(cookie)?;
// It is then followed by a bitset indicating which containers are run containers
let run_container_bitmap_size = (size + 7) / 8;
let mut run_container_bitmap = vec![0; run_container_bitmap_size];
for (i, container) in self.containers.iter().enumerate() {
if let Store::Run(_) = container.store {
run_container_bitmap[i / 8] |= 1 << (i % 8);
}
}
writer.write_all(&run_container_bitmap)?;
} else {
// Write old format, cookie followed by container count
writer.write_u32::<LittleEndian>(SERIAL_COOKIE_NO_RUNCONTAINER)?;
writer.write_u32::<LittleEndian>(size as u32)?;
}

// Write the container descriptions
for container in &self.containers {
writer.write_u16::<LittleEndian>(container.key)?;
writer.write_u16::<LittleEndian>((container.len - 1) as u16)?;
}

let mut offset = 8 + 8 * self.containers.len() as u32;
for container in &self.containers {
writer.write_u32::<LittleEndian>(offset)?;
match container.store {
Store::Array(ref values) => {
offset += values.len() as u32 * 2;
}
Store::Bitmap(..) => {
offset += 8 * 1024;
// Write offsets if there are no runs or NO_OFFSET_THRESHOLD containers is reached
if !has_run_containers || size >= NO_OFFSET_THRESHOLD {
let mut offset = header_size(size, has_run_containers) as u32;
for container in &self.containers {
writer.write_u32::<LittleEndian>(offset)?;
match container.store {
Store::Array(ref values) => {
offset += (values.len() * ARRAY_ELEMENT_BYTES) as u32;
}
Store::Bitmap(..) => {
offset += BITMAP_BYTES as u32;
}
Store::Run(ref intervals) => {
offset += (RUN_NUM_BYTES + (intervals.len() * RUN_ELEMENT_BYTES)) as u32;
}
}
}
}

// Finally serialize each of the containers
for container in &self.containers {
match container.store {
Store::Array(ref values) => {
Expand All @@ -91,6 +144,13 @@ impl RoaringBitmap {
writer.write_u64::<LittleEndian>(value)?;
}
}
Store::Run(ref intervals) => {
writer.write_u16::<LittleEndian>(intervals.len() as u16)?;
for iv in intervals {
writer.write_u16::<LittleEndian>(iv.start)?;
writer.write_u16::<LittleEndian>(iv.end - iv.start)?;
}
}
}
}

Expand All @@ -116,60 +176,109 @@ impl RoaringBitmap {
/// assert_eq!(rb1, rb2);
/// ```
pub fn deserialize_from<R: io::Read>(mut reader: R) -> io::Result<RoaringBitmap> {
let (size, has_offsets) = {
// First read the cookie to determine which version of the format we are reading
let (size, has_offsets, has_run_containers) = {
let cookie = reader.read_u32::<LittleEndian>()?;
if cookie == SERIAL_COOKIE_NO_RUNCONTAINER {
(reader.read_u32::<LittleEndian>()? as usize, true)
(reader.read_u32::<LittleEndian>()? as usize, true, false)
} else if (cookie as u16) == SERIAL_COOKIE {
return Err(io::Error::new(
io::ErrorKind::Other,
"run containers are unsupported",
));
let size = ((cookie >> 16) + 1) as usize;
(size, size >= NO_OFFSET_THRESHOLD, true)
} else {
return Err(io::Error::new(io::ErrorKind::Other, "unknown cookie value"));
}
};

// Read the run container bitmap if necessary
let run_container_bitmap = if has_run_containers {
let mut bitmap = vec![0u8; (size + 7) / 8];
reader.read_exact(&mut bitmap)?;
Some(bitmap)
} else {
None
};

if size > u16::max_value() as usize {
return Err(io::Error::new(
io::ErrorKind::Other,
"size is greater than supported",
));
}

let mut description_bytes = vec![0u8; size * 4];
// Read the container descriptions
let mut description_bytes = vec![0u8; size * DESCRIPTION_BYTES];
reader.read_exact(&mut description_bytes)?;
let description_bytes = &mut &description_bytes[..];

// Read the offsets if present
if has_offsets {
let mut offsets = vec![0u8; size * 4];
let mut offsets = vec![0u8; size * OFFSET_BYTES];
reader.read_exact(&mut offsets)?;
drop(offsets); // Not useful when deserializing into memory
}

let mut containers = Vec::with_capacity(size);

for _ in 0..size {
// Read each of the containers
for i in 0..size {
let key = description_bytes.read_u16::<LittleEndian>()?;
let len = u64::from(description_bytes.read_u16::<LittleEndian>()?) + 1;
let cardinality = u64::from(description_bytes.read_u16::<LittleEndian>()?) + 1;

let store = if len < 4096 {
let mut values = Vec::with_capacity(len as usize);
for _ in 0..len {
// If the run container bitmap is present, check if this container is a run container
let is_run_container = match run_container_bitmap {
Some(ref bm) => bm[i / 8] & (1 << (i % 8)) != 0,
None => false,
};

let store = if is_run_container {
let runs = reader.read_u16::<LittleEndian>()?;
let mut intervals = Vec::with_capacity(runs as usize);
for _ in 0..runs {
let start = reader.read_u16::<LittleEndian>()?;
let run_len = reader.read_u16::<LittleEndian>()?;
let end = start + run_len;
intervals.push(Interval { start, end })
}
Store::Run(intervals)
} else if cardinality < ARRAY_LIMIT {
let mut values = Vec::with_capacity(cardinality as usize);
for _ in 0..cardinality {
values.push(reader.read_u16::<LittleEndian>()?);
}
Store::Array(values)
} else {
let mut values = Box::new([0; 1024]);
let mut values = Box::new([0; BITMAP_LENGTH]);
for value in values.iter_mut() {
*value = reader.read_u64::<LittleEndian>()?;
}
Store::Bitmap(values)
};

containers.push(Container { key, len, store });
containers.push(Container {
key,
len: cardinality,
store,
});
}

Ok(RoaringBitmap { containers })
}
}

/// Number of bytes that precede the container payloads in the serialized form.
///
/// `size` is the number of containers; `has_run_containers` selects between
/// the old layout (cookie, container count, descriptions, offsets) and the new
/// layout (cookie with embedded count, run-container bitmap, descriptions and,
/// only from `NO_OFFSET_THRESHOLD` containers on, offsets).
fn header_size(size: usize, has_run_containers: bool) -> usize {
    if !has_run_containers {
        // Old format: separate count word, and the offsets are always included.
        return COOKIE_BYTES + SIZE_BYTES + ((DESCRIPTION_BYTES + OFFSET_BYTES) * size);
    }

    // New format: one flag bit per container, rounded up to whole bytes.
    let run_flag_bytes = (size + 7) / 8;
    // Offsets are only part of the header for 4 or more containers.
    let bytes_per_container = if size >= NO_OFFSET_THRESHOLD {
        DESCRIPTION_BYTES + OFFSET_BYTES
    } else {
        DESCRIPTION_BYTES
    };

    COOKIE_BYTES + (bytes_per_container * size) + run_flag_bytes
}
Loading