Skip to content

Commit

Permalink
WIP: Run container
Browse files Browse the repository at this point in the history
  • Loading branch information
josephglanville committed Apr 2, 2020
1 parent adc8824 commit 58a89ec
Show file tree
Hide file tree
Showing 9 changed files with 620 additions and 67 deletions.
24 changes: 22 additions & 2 deletions src/bitmap/container.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,8 @@ use std::fmt;
use super::store::{self, Store};
use super::util;

const ARRAY_LIMIT: u64 = 4096;
pub const ARRAY_LIMIT: u64 = 4096;
pub const RUN_MAX_SIZE: u64 = 2048;

#[derive(PartialEq, Clone)]
pub struct Container {
Expand Down Expand Up @@ -103,14 +104,33 @@ impl Container {
self.store.max()
}

fn ensure_correct_store(&mut self) {
fn ensure_correct_store(&mut self) -> bool {
let new_store = match (&self.store, self.len) {
(store @ &Store::Bitmap(..), len) if len <= ARRAY_LIMIT => Some(store.to_array()),
(store @ &Store::Array(..), len) if len > ARRAY_LIMIT => Some(store.to_bitmap()),
_ => None,
};
if let Some(new_store) = new_store {
self.store = new_store;
true
} else {
false
}
}

/// Picks the preferred store for this container.
///
/// An array or bitmap store is converted to a run store when its run count
/// is at most `RUN_MAX_SIZE` and at most half of the container's length.
/// In every other case the decision is delegated to `ensure_correct_store`.
///
/// Returns `true` if the store was replaced.
pub fn optimize(&mut self) -> bool {
    if let Store::Run(..) = self.store {
        return self.ensure_correct_store();
    }

    let run_count = self.store.count_runs();
    let worth_converting = run_count <= RUN_MAX_SIZE && run_count <= self.len / 2;
    if !worth_converting {
        return self.ensure_correct_store();
    }

    // convert to run container
    self.store = self.store.to_run();
    true
}
}
Expand Down
5 changes: 3 additions & 2 deletions src/bitmap/fmt.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,11 @@ impl fmt::Debug for RoaringBitmap {
} else {
write!(
f,
"RoaringBitmap<{:?} values between {:?} and {:?}>",
"RoaringBitmap<{:?} values between {:?} and {:?} in {:?} containers>",
self.len(),
self.min().unwrap(),
self.max().unwrap()
self.max().unwrap(),
self.containers.len(),
)
}
}
Expand Down
18 changes: 18 additions & 0 deletions src/bitmap/inherent.rs
Original file line number Diff line number Diff line change
Expand Up @@ -247,6 +247,24 @@ impl RoaringBitmap {
.last()
.map(|tail| util::join(tail.key, tail.max()))
}

// TODO(jpg) actually come up with example that illustrates creation of run containers
/// Optimizes the container storage for this bitmap.
///
/// Every container is asked to optimize itself in turn.
/// Returns `true` if the storage of at least one container was modified,
/// `false` if nothing changed.
///
/// # Examples
///
/// ```rust
/// use roaring::RoaringBitmap;
/// use std::iter::FromIterator;
///
/// let mut rb = RoaringBitmap::from_iter(1000..100000);
/// rb.optimize();
/// ```
pub fn optimize(&mut self) -> bool {
    let mut changed = false;
    // Deliberately not `Iterator::any`: that would short-circuit after the
    // first modified container and leave the remaining ones untouched.
    for container in &mut self.containers {
        changed |= container.optimize();
    }
    changed
}
}

impl Default for RoaringBitmap {
Expand Down
171 changes: 140 additions & 31 deletions src/bitmap/serialization.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,25 @@ use std::io;

use super::container::Container;
use super::store::Store;
use crate::bitmap::container::ARRAY_LIMIT;
use crate::bitmap::store::{Interval, BITMAP_LENGTH};
use crate::RoaringBitmap;

const SERIAL_COOKIE_NO_RUNCONTAINER: u32 = 12346;
const SERIAL_COOKIE: u16 = 12347;
// TODO: Need this once run containers are supported
// const NO_OFFSET_THRESHOLD: u8 = 4;
const NO_OFFSET_THRESHOLD: usize = 4;

// Sizes of header structures
const COOKIE_BYTES: usize = 4;
const SIZE_BYTES: usize = 4;
const DESCRIPTION_BYTES: usize = 4;
const OFFSET_BYTES: usize = 4;

// Sizes of container structures
const BITMAP_BYTES: usize = BITMAP_LENGTH * 8;
const ARRAY_ELEMENT_BYTES: usize = 2;
const RUN_NUM_BYTES: usize = 2;
const RUN_ELEMENT_BYTES: usize = 4;

impl RoaringBitmap {
/// Return the size in bytes of the serialized output.
Expand All @@ -27,17 +40,23 @@ impl RoaringBitmap {
/// assert_eq!(rb1, rb2);
/// ```
pub fn serialized_size(&self) -> usize {
    // Whether any run container is present decides which header layout is used.
    let mut has_run_containers = false;
    let mut payload_bytes: usize = 0;

    for container in &self.containers {
        payload_bytes += match container.store {
            Store::Array(ref values) => values.len() * ARRAY_ELEMENT_BYTES,
            Store::Bitmap(..) => BITMAP_BYTES,
            Store::Run(ref intervals) => {
                has_run_containers = true;
                // Run count prefix followed by one fixed-size entry per interval.
                RUN_NUM_BYTES + (RUN_ELEMENT_BYTES * intervals.len())
            }
        };
    }

    // header + container sizes
    header_size(self.containers.len(), has_run_containers) + payload_bytes
}

/// Serialize this bitmap into [the standard Roaring on-disk format][format].
Expand All @@ -58,27 +77,61 @@ impl RoaringBitmap {
/// assert_eq!(rb1, rb2);
/// ```
pub fn serialize_into<W: io::Write>(&self, mut writer: W) -> io::Result<()> {
writer.write_u32::<LittleEndian>(SERIAL_COOKIE_NO_RUNCONTAINER)?;
writer.write_u32::<LittleEndian>(self.containers.len() as u32)?;
let has_run_containers = self.containers.iter().any(|c| {
if let Store::Run(_) = c.store {
true
} else {
false
}
});
let size = self.containers.len();

// Depending on if run containers are present or not write the appropriate header
if has_run_containers {
// The new format stores the container count in the most significant bits of the header
let cookie = SERIAL_COOKIE as u32 | ((size as u32 - 1) << 16);
writer.write_u32::<LittleEndian>(cookie)?;
// It is then followed by a bitset indicating which containers are run containers
let run_container_bitmap_size = (size + 7) / 8;
let mut run_container_bitmap = vec![0; run_container_bitmap_size];
for (i, container) in self.containers.iter().enumerate() {
if let Store::Run(_) = container.store {
run_container_bitmap[i / 8] |= 1 << (i % 8);
}
}
writer.write_all(&run_container_bitmap)?;
} else {
// Write old format, cookie followed by container count
writer.write_u32::<LittleEndian>(SERIAL_COOKIE_NO_RUNCONTAINER)?;
writer.write_u32::<LittleEndian>(size as u32)?;
}

// Write the container descriptions
for container in &self.containers {
writer.write_u16::<LittleEndian>(container.key)?;
writer.write_u16::<LittleEndian>((container.len - 1) as u16)?;
}

let mut offset = 8 + 8 * self.containers.len() as u32;
for container in &self.containers {
writer.write_u32::<LittleEndian>(offset)?;
match container.store {
Store::Array(ref values) => {
offset += values.len() as u32 * 2;
}
Store::Bitmap(..) => {
offset += 8 * 1024;
// Write offsets if there are no runs or NO_OFFSET_THRESHOLD containers is reached
if !has_run_containers || size >= NO_OFFSET_THRESHOLD {
let mut offset = header_size(size, has_run_containers) as u32;
for container in &self.containers {
writer.write_u32::<LittleEndian>(offset)?;
match container.store {
Store::Array(ref values) => {
offset += (values.len() * ARRAY_ELEMENT_BYTES) as u32;
}
Store::Bitmap(..) => {
offset += BITMAP_BYTES as u32;
}
Store::Run(ref intervals) => {
offset += (RUN_NUM_BYTES + (intervals.len() * RUN_ELEMENT_BYTES)) as u32;
}
}
}
}

// Finally serialize each of the containers
for container in &self.containers {
match container.store {
Store::Array(ref values) => {
Expand All @@ -91,6 +144,13 @@ impl RoaringBitmap {
writer.write_u64::<LittleEndian>(value)?;
}
}
Store::Run(ref intervals) => {
writer.write_u16::<LittleEndian>(intervals.len() as u16)?;
for iv in intervals {
writer.write_u16::<LittleEndian>(iv.start)?;
writer.write_u16::<LittleEndian>(iv.end - iv.start)?;
}
}
}
}

Expand All @@ -116,60 +176,109 @@ impl RoaringBitmap {
/// assert_eq!(rb1, rb2);
/// ```
pub fn deserialize_from<R: io::Read>(mut reader: R) -> io::Result<RoaringBitmap> {
let (size, has_offsets) = {
// First read the cookie to determine which version of the format we are reading
let (size, has_offsets, has_run_containers) = {
let cookie = reader.read_u32::<LittleEndian>()?;
if cookie == SERIAL_COOKIE_NO_RUNCONTAINER {
(reader.read_u32::<LittleEndian>()? as usize, true)
(reader.read_u32::<LittleEndian>()? as usize, true, false)
} else if (cookie as u16) == SERIAL_COOKIE {
return Err(io::Error::new(
io::ErrorKind::Other,
"run containers are unsupported",
));
let size = ((cookie >> 16) + 1) as usize;
(size, size >= NO_OFFSET_THRESHOLD, true)
} else {
return Err(io::Error::new(io::ErrorKind::Other, "unknown cookie value"));
}
};

// Read the run container bitmap if necessary
let run_container_bitmap = if has_run_containers {
let mut bitmap = vec![0u8; (size + 7) / 8];
reader.read_exact(&mut bitmap)?;
Some(bitmap)
} else {
None
};

if size > u16::max_value() as usize {
return Err(io::Error::new(
io::ErrorKind::Other,
"size is greater than supported",
));
}

let mut description_bytes = vec![0u8; size * 4];
// Read the container descriptions
let mut description_bytes = vec![0u8; size * DESCRIPTION_BYTES];
reader.read_exact(&mut description_bytes)?;
let description_bytes = &mut &description_bytes[..];

// Read the offsets if present
if has_offsets {
let mut offsets = vec![0u8; size * 4];
let mut offsets = vec![0u8; size * OFFSET_BYTES];
reader.read_exact(&mut offsets)?;
drop(offsets); // Not useful when deserializing into memory
}

let mut containers = Vec::with_capacity(size);

for _ in 0..size {
// Read each of the containers
for i in 0..size {
let key = description_bytes.read_u16::<LittleEndian>()?;
let len = u64::from(description_bytes.read_u16::<LittleEndian>()?) + 1;
let cardinality = u64::from(description_bytes.read_u16::<LittleEndian>()?) + 1;

let store = if len < 4096 {
let mut values = Vec::with_capacity(len as usize);
for _ in 0..len {
// If the run container bitmap is present, check if this container is a run container
let is_run_container = match run_container_bitmap {
Some(ref bm) => bm[i / 8] & (1 << (i % 8)) != 0,
None => false,
};

let store = if is_run_container {
let runs = reader.read_u16::<LittleEndian>()?;
let mut intervals = Vec::with_capacity(runs as usize);
for _ in 0..runs {
let start = reader.read_u16::<LittleEndian>()?;
let run_len = reader.read_u16::<LittleEndian>()?;
let end = start + run_len;
intervals.push(Interval { start, end })
}
Store::Run(intervals)
} else if cardinality < ARRAY_LIMIT {
let mut values = Vec::with_capacity(cardinality as usize);
for _ in 0..cardinality {
values.push(reader.read_u16::<LittleEndian>()?);
}
Store::Array(values)
} else {
let mut values = Box::new([0; 1024]);
let mut values = Box::new([0; BITMAP_LENGTH]);
for value in values.iter_mut() {
*value = reader.read_u64::<LittleEndian>()?;
}
Store::Bitmap(values)
};

containers.push(Container { key, len, store });
containers.push(Container {
key,
len: cardinality,
store,
});
}

Ok(RoaringBitmap { containers })
}
}

/// Returns the number of header bytes that precede the container payloads
/// for a bitmap with `size` containers.
///
/// `has_run_containers` selects between the two header layouts:
/// * without run containers: cookie, container count, then a description and
///   an offset for every container;
/// * with run containers: cookie (which also carries the container count),
///   a one-bit-per-container run marker bitmap, a description per container,
///   and offsets only when there are at least `NO_OFFSET_THRESHOLD` containers.
fn header_size(size: usize, has_run_containers: bool) -> usize {
    let descriptions = DESCRIPTION_BYTES * size;
    let offsets = OFFSET_BYTES * size;

    if !has_run_containers {
        // Old format: the offsets are always present.
        return COOKIE_BYTES + SIZE_BYTES + descriptions + offsets;
    }

    // One marker bit per container, rounded up to whole bytes.
    let run_marker_bytes = (size + 7) / 8;
    let without_offsets = COOKIE_BYTES + descriptions + run_marker_bytes;

    if size >= NO_OFFSET_THRESHOLD {
        without_offsets + offsets
    } else {
        without_offsets
    }
}
Loading

0 comments on commit 58a89ec

Please sign in to comment.