diff --git a/Cargo.toml b/Cargo.toml
index 118fef79..2e340d2f 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -22,6 +22,8 @@ byteorder = "1.0"
 
 [dev-dependencies]
 criterion = "0.3"
+quickcheck = "0.9"
+quickcheck_macros = "0.9"
 
 [[bench]]
 name = "lib"
diff --git a/benches/lib.rs b/benches/lib.rs
index d7c8a7f0..ba7218b2 100644
--- a/benches/lib.rs
+++ b/benches/lib.rs
@@ -206,6 +206,31 @@ fn remove_range_bitmap(c: &mut Criterion) {
     });
 }
 
+/// Benchmarks `RoaringBitmap::insert_range` on empty and pre-populated bitmaps.
+fn insert_range_bitmap(c: &mut Criterion) {
+    for &size in &[10, 100, 1_000, 5_000, 10_000, 20_000] {
+        let mut group = c.benchmark_group("insert_range");
+        group.throughput(criterion::Throughput::Elements(size));
+        group.bench_function(format!("from_empty_{}", size), |b| {
+            let bm = RoaringBitmap::new();
+            b.iter_batched(
+                || bm.clone(),
+                |mut bm| black_box(bm.insert_range(0..size)),
+                criterion::BatchSize::SmallInput,
+            )
+        });
+        group.bench_function(format!("pre_populated_{}", size), |b| {
+            let mut bm = RoaringBitmap::new();
+            bm.insert_range(0..size);
+            b.iter_batched(
+                || bm.clone(),
+                |mut bm| black_box(bm.insert_range(0..size)),
+                criterion::BatchSize::SmallInput,
+            )
+        });
+    }
+}
+
 fn iter(c: &mut Criterion) {
     c.bench_function("iter", |b| {
         let bitmap: RoaringBitmap = (1..10_000).collect();
@@ -300,6 +325,7 @@ criterion_group!(
     is_subset,
     remove,
     remove_range_bitmap,
+    insert_range_bitmap,
     iter,
     is_empty,
     serialize,
diff --git a/src/bitmap/container.rs b/src/bitmap/container.rs
index a06d2edf..89b97156 100644
--- a/src/bitmap/container.rs
+++ b/src/bitmap/container.rs
@@ -1,4 +1,4 @@
-use std::fmt;
+use std::{fmt, ops::Range};
 
 use super::store::{self, Store};
 use super::util;
@@ -38,6 +38,25 @@ impl Container {
         }
     }
 
+    /// Inserts every value in `range` (exclusive upper bound) into this
+    /// container and returns how many of them were not already present.
+    pub fn insert_range(&mut self, range: Range<u16>) -> u64 {
+        // `saturating_sub` keeps a reversed (empty) range from underflowing;
+        // the store treats such a range as a no-op.
+        let span = range.end.saturating_sub(range.start);
+
+        // If the range is larger than the array limit, skip populating the
+        // array to then have to convert it to a bitmap anyway.
+        if matches!(self.store, Store::Array(_)) && span > ARRAY_LIMIT as u16 {
+            self.store = self.store.to_bitmap();
+        }
+
+        let inserted = self.store.insert_range(range);
+        self.len += inserted;
+        self.ensure_correct_store();
+        inserted
+    }
+
     pub fn push(&mut self, index: u16) {
         if self.store.push(index) {
             self.len += 1;
diff --git a/src/bitmap/inherent.rs b/src/bitmap/inherent.rs
index a8076d2f..d72ef201 100644
--- a/src/bitmap/inherent.rs
+++ b/src/bitmap/inherent.rs
@@ -43,6 +43,81 @@ impl RoaringBitmap {
         container.insert(index)
     }
 
+    /// Inserts the values in `range`, given as `[start..end)`, into the set.
+    /// Returns the number of values that were not already present.
+    ///
+    /// Note that due to the exclusive end this function takes indexes as u64
+    /// but you still can't index past 2**32 (u32::MAX + 1).
+    ///
+    /// # Panics
+    ///
+    /// This function panics if the range upper bound exceeds `u32::MAX + 1`.
+    ///
+    /// # Examples
+    ///
+    /// ```rust
+    /// use roaring::RoaringBitmap;
+    ///
+    /// let mut rb = RoaringBitmap::new();
+    /// assert_eq!(rb.insert_range(2..4), 2);
+    /// assert!(rb.contains(2));
+    /// assert!(rb.contains(3));
+    /// assert!(!rb.contains(4));
+    ///
+    /// // Values that are already present are not counted again.
+    /// assert_eq!(rb.insert_range(0..100_000), 99_998);
+    /// assert!(rb.contains(65_535));
+    /// assert!(rb.contains(99_999));
+    /// ```
+    pub fn insert_range(&mut self, range: Range<u64>) -> u64 {
+        assert!(
+            range.end <= u64::from(u32::max_value()) + 1,
+            "can't index past 2**32"
+        );
+        if range.is_empty() {
+            return 0;
+        }
+
+        // Switch to an inclusive upper bound: `range.end` may be 2**32, which
+        // does not fit in a u32, whereas `range.end - 1` always does here.
+        let (start_key, start_index) = util::split(range.start as u32);
+        let (end_key, end_index) = util::split((range.end - 1) as u32);
+
+        // `containers` is kept sorted by key, so one binary search locates
+        // the slot of the first container; later keys sit in the slots that
+        // follow it.
+        let mut i = self
+            .containers
+            .binary_search_by_key(&start_key, |c| c.key)
+            .unwrap_or_else(|loc| loc);
+
+        let mut inserted = 0;
+        for key in start_key..=end_key {
+            // Create the container if slot `i` does not already hold `key`.
+            match self.containers.get(i) {
+                Some(c) if c.key == key => {}
+                _ => self.containers.insert(i, Container::new(key)),
+            }
+            let container = &mut self.containers[i];
+
+            // Only the first and last containers are partially covered;
+            // every container in between is filled completely.
+            let low = if key == start_key { start_index } else { 0 };
+            let high = if key == end_key { end_index } else { u16::MAX };
+
+            // `Container::insert_range` excludes its upper bound and so can
+            // never reach u16::MAX; insert the inclusive `high` on its own.
+            inserted += container.insert_range(low..high);
+            if container.insert(high) {
+                inserted += 1;
+            }
+
+            i += 1;
+        }
+
+        inserted
+    }
+
     /// Adds a value to the set.
     /// The value **must** be greater or equal to the maximum value in the set.
/// @@ -131,7 +227,7 @@ impl RoaringBitmap { range.end <= u64::from(u32::max_value()) + 1, "can't index past 2**32" ); - if range.start == range.end { + if range.is_empty() { return 0; } // inclusive bounds for start and end @@ -292,3 +388,61 @@ impl Default for RoaringBitmap { RoaringBitmap::new() } } + +#[cfg(test)] +mod tests { + use super::*; + use quickcheck_macros::quickcheck; + + #[quickcheck] + fn insert_range(r: Range, checks: Vec) { + let r: Range = u64::from(r.start)..u64::from(r.end); + + let mut b = RoaringBitmap::new(); + let inserted = b.insert_range(r.clone()); + if r.end > r.start { + assert_eq!(inserted, r.end - r.start); + } else { + assert_eq!(inserted, 0); + } + + // Assert all values in the range are present + for i in r.clone() { + assert!(b.contains(i as u32), format!("does not contain {}", i)); + } + + // Run the check values looking for any false positives + for i in checks { + let bitmap_has = b.contains(i); + let range_has = r.contains(&u64::from(i)); + assert!( + bitmap_has == range_has, + format!( + "value {} in bitmap={} and range={}", + i, bitmap_has, range_has + ) + ); + } + } + + #[test] + fn test_insert_range_same_container() { + let mut b = RoaringBitmap::new(); + let inserted = b.insert_range(1..5); + assert_eq!(inserted, 4); + + for i in 1..5 { + assert!(b.contains(i)); + } + } + + #[test] + fn test_insert_range_pre_populated() { + let mut b = RoaringBitmap::new(); + let inserted = b.insert_range(1..20_000); + assert_eq!(inserted, 19_999); + + let inserted = b.insert_range(1..20_000); + assert_eq!(inserted, 0); + } +} diff --git a/src/bitmap/store.rs b/src/bitmap/store.rs index d834c190..2bf6cf83 100644 --- a/src/bitmap/store.rs +++ b/src/bitmap/store.rs @@ -1,7 +1,7 @@ -use std::borrow::Borrow; use std::cmp::Ordering::{Equal, Greater, Less}; use std::slice; use std::vec; +use std::{borrow::Borrow, ops::Range}; const BITMAP_LENGTH: usize = 1024; @@ -43,6 +43,67 @@ impl Store { } } + pub fn insert_range(&mut self, range: Range) 
-> u64 { + // A Range is defined as being of size 0 if start >= end. + if range.is_empty() { + return 0; + } + + match *self { + Array(ref mut vec) => { + // Figure out the starting/ending position in the vec + let pos_start = vec.binary_search(&range.start).unwrap_or_else(|x| x); + let pos_end = vec.binary_search(&range.end).unwrap_or_else(|x| x); + + // Overwrite the range in the middle - there's no need to take + // into account any existing elements between start and end, as + // they're all being added to the set. + let dropped = vec.splice(pos_start..pos_end, range.clone()); + + u64::from(range.end - range.start) - dropped.len() as u64 + } + Bitmap(ref mut bits) => { + let (start_key, start_bit) = (key(range.start), bit(range.start)); + let (end_key, end_bit) = (key(range.end), bit(range.end)); + + if start_key == end_key { + // Set the end_bit -> LSB to 1 + let mut mask = (1 << end_bit) - 1; + // Set start_bit -> LSB to 0 + mask &= !((1 << start_bit) - 1); + // Leaving end_bit -> start_bit set to 1 + + let existed = (bits[start_key] & mask).count_ones(); + bits[start_key] |= mask; + + return u64::from(range.end - range.start) - u64::from(existed); + } + + // Mask off the left-most bits (MSB -> start_bit) + let mask = !((1 << start_bit) - 1); + + // Keep track of the number of bits that were already set to + // return how many new bits were set later + let mut existed = (bits[start_key] & mask).count_ones(); + + bits[start_key] |= mask; + + // Set the full blocks, tracking the number of set bits + for i in (start_key + 1)..end_key { + existed += bits[i].count_ones(); + bits[i] = u64::MAX; + } + + // Set the end bits in the last chunk (MSB -> end_bit) + let mask = (1 << end_bit) - 1; + existed += (bits[end_key] & mask).count_ones(); + bits[end_key] |= mask; + + u64::from(range.end - range.start) - u64::from(existed) + } + } + } + /// Push the value that must be the new max of the set. 
/// /// This function returns whether the value is equal to the @@ -536,3 +597,149 @@ fn key(index: u16) -> usize { fn bit(index: u16) -> usize { index as usize % 64 } + +#[cfg(test)] +mod tests { + use super::*; + + fn as_vec(s: Store) -> Vec { + if let Store::Array(v) = s { + return v; + } + as_vec(s.to_array()) + } + + #[test] + #[allow(clippy::reversed_empty_ranges)] + fn test_array_insert_invalid_range() { + let mut store = Store::Array(vec![1, 2, 8, 9]); + + // Insert a range with start > end. + let new = store.insert_range(6..1); + assert_eq!(new, 0); + + assert_eq!(as_vec(store), vec![1, 2, 8, 9]); + } + + #[test] + fn test_array_insert_range() { + let mut store = Store::Array(vec![1, 2, 8, 9]); + + let new = store.insert_range(4..6); + assert_eq!(new, 2); + + assert_eq!(as_vec(store), vec![1, 2, 4, 5, 8, 9]); + } + + #[test] + fn test_array_insert_range_left_overlap() { + let mut store = Store::Array(vec![1, 2, 8, 9]); + + let new = store.insert_range(2..6); + assert_eq!(new, 3); + + assert_eq!(as_vec(store), vec![1, 2, 3, 4, 5, 8, 9]); + } + + #[test] + fn test_array_insert_range_right_overlap() { + let mut store = Store::Array(vec![1, 2, 8, 9]); + + let new = store.insert_range(4..9); + assert_eq!(new, 4); + + assert_eq!(as_vec(store), vec![1, 2, 4, 5, 6, 7, 8, 9]); + } + + #[test] + fn test_array_insert_range_full_overlap() { + let mut store = Store::Array(vec![1, 2, 8, 9]); + + let new = store.insert_range(1..10); + assert_eq!(new, 5); + + assert_eq!(as_vec(store), vec![1, 2, 3, 4, 5, 6, 7, 8, 9]); + } + + #[test] + #[allow(clippy::reversed_empty_ranges)] + fn test_bitmap_insert_invalid_range() { + let store = Store::Array(vec![1, 2, 8, 9]); + let mut store = store.to_bitmap(); + + // Insert a range with start > end. 
+ let new = store.insert_range(6..1); + assert_eq!(new, 0); + + assert_eq!(as_vec(store), vec![1, 2, 8, 9]); + } + + #[test] + fn test_bitmap_insert_same_key_overlap() { + let store = Store::Array(vec![1, 2, 3, 62, 63]); + let mut store = store.to_bitmap(); + + let new = store.insert_range(1..63); + assert_eq!(new, 58); + + assert_eq!(as_vec(store), (1..64).collect::>()); + } + + #[test] + fn test_bitmap_insert_range() { + let store = Store::Array(vec![1, 2, 130]); + let mut store = store.to_bitmap(); + + let new = store.insert_range(4..129); + assert_eq!(new, 125); + + let mut want = vec![1, 2]; + want.extend(4..129); + want.extend(&[130]); + + assert_eq!(as_vec(store), want); + } + + #[test] + fn test_bitmap_insert_range_left_overlap() { + let store = Store::Array(vec![1, 2, 130]); + let mut store = store.to_bitmap(); + + let new = store.insert_range(1..129); + assert_eq!(new, 126); + + let mut want = Vec::new(); + want.extend(1..129); + want.extend(&[130]); + + assert_eq!(as_vec(store), want); + } + + #[test] + fn test_bitmap_insert_range_right_overlap() { + let store = Store::Array(vec![1, 2, 130]); + let mut store = store.to_bitmap(); + + let new = store.insert_range(4..133); + assert_eq!(new, 128); + + let mut want = vec![1, 2]; + want.extend(4..133); + + assert_eq!(as_vec(store), want); + } + + #[test] + fn test_bitmap_insert_range_full_overlap() { + let store = Store::Array(vec![1, 2, 130]); + let mut store = store.to_bitmap(); + + let new = store.insert_range(1..135); + assert_eq!(new, 131); + + let mut want = Vec::new(); + want.extend(1..135); + + assert_eq!(as_vec(store), want); + } +}