From a5c1a15fabb9a1bcd156f7638257da0f7904124e Mon Sep 17 00:00:00 2001 From: Moritz Hoffmann Date: Fri, 9 Feb 2024 09:52:24 -0500 Subject: [PATCH] Deduplication Signed-off-by: Moritz Hoffmann --- src/impls/dedup_one.rs | 352 ++++++++++++++++++++++++++++++++++++++++ src/impls/mod.rs | 1 + src/impls/slice_copy.rs | 8 +- src/lib.rs | 76 ++++++--- 4 files changed, 411 insertions(+), 26 deletions(-) create mode 100644 src/impls/dedup_one.rs diff --git a/src/impls/dedup_one.rs b/src/impls/dedup_one.rs new file mode 100644 index 0000000..2c0382a --- /dev/null +++ b/src/impls/dedup_one.rs @@ -0,0 +1,352 @@ +//! Simple deduplication of equal consecutive items. + +use crate::{CopyOnto, Region}; + +/// A region to deduplicate consecutive equal items. +#[derive(Debug)] +pub struct Deduplicating { + inner: R, + last_index: Option, +} + +impl Default for Deduplicating { + fn default() -> Self { + Self { + inner: R::default(), + last_index: None, + } + } +} + +impl Region for Deduplicating +where + for<'a, 'b> R::ReadItem<'a>: PartialEq>, +{ + type ReadItem<'a> = R::ReadItem<'a> where Self: 'a; + type Index = R::Index; + + fn index(&self, index: Self::Index) -> Self::ReadItem<'_> { + self.inner.index(index) + } + + fn reserve_regions<'a, I>(&mut self, regions: I) + where + Self: 'a, + I: Iterator + Clone, + { + self.inner.reserve_regions(regions.map(|r| &r.inner)); + } + + fn clear(&mut self) { + self.inner.clear(); + self.last_index = None; + } +} + +impl> CopyOnto> for T +where + for<'a> R::ReadItem<'a>: PartialEq, + for<'a, 'b> R::ReadItem<'a>: PartialEq>, +{ + fn copy_onto(self, target: &mut Deduplicating) -> as Region>::Index { + if let Some(last_index) = target.last_index { + if target.inner.index(last_index) == self { + return last_index; + } + } + let index = self.copy_onto(&mut target.inner); + target.last_index = Some(index); + index + } +} + +/// TODO +pub trait OffsetContainer: Default { + /// Accepts a newly pushed element. + fn push(&mut self, item: usize); + + /// Lookup an index + fn index(&self, index: usize) -> usize; + + /// Clear all contents. + fn clear(&mut self); + + /// Returns the number of elements. + fn len(&self) -> usize; +} + +#[derive(Debug, Default)] +enum OffsetStride { + #[default] + Empty, + Zero, + Striding(usize, usize), + Saturated(usize, usize, usize), +} + +impl OffsetStride { + /// Accepts or rejects a newly pushed element. + fn push(&mut self, item: usize) -> bool { + match self { + OffsetStride::Empty => { + if item == 0 { + *self = OffsetStride::Zero; + true + } else { + false + } + } + OffsetStride::Zero => { + *self = OffsetStride::Striding(item, 2); + true + } + OffsetStride::Striding(stride, count) => { + if item == *stride * *count { + *count += 1; + true + } else if item == *stride * (*count - 1) { + *self = OffsetStride::Saturated(*stride, *count, 1); + true + } else { + false + } + } + OffsetStride::Saturated(stride, count, reps) => { + if item == *stride * (*count - 1) { + *reps += 1; + true + } else { + false + } + } + } + } + + fn index(&self, index: usize) -> usize { + match self { + OffsetStride::Empty => { + panic!("Empty OffsetStride") + } + OffsetStride::Zero => 0, + OffsetStride::Striding(stride, _steps) => *stride * index, + OffsetStride::Saturated(stride, steps, _reps) => { + if index < *steps { + *stride * index + } else { + *stride * (*steps - 1) + } + } + } + } + + fn len(&self) -> usize { + match self { + OffsetStride::Empty => 0, + OffsetStride::Zero => 1, + OffsetStride::Striding(_stride, steps) => *steps, + OffsetStride::Saturated(_stride, steps, reps) => *steps + *reps, + } + } +} + +/// A list of unsigned integers that uses `u32` elements as long as they are small enough, and switches to `u64` once they are not. +#[derive(Eq, PartialEq, Ord, PartialOrd, Clone, Debug, Default)] +pub struct OffsetList { + /// Length of a prefix of zero elements. + pub zero_prefix: usize, + /// Offsets that fit within a `u32`. + pub smol: Vec, + /// Offsets that either do not fit in a `u32`, or are inserted after some offset that did not fit. + pub chonk: Vec, +} + +impl OffsetList { + /// Allocate a new list with a specified capacity. + pub fn with_capacity(cap: usize) -> Self { + Self { + zero_prefix: 0, + smol: Vec::with_capacity(cap), + chonk: Vec::new(), + } + } + /// Inserts the offset, as a `u32` if that is still on the table. + pub fn push(&mut self, offset: usize) { + if self.smol.is_empty() && self.chonk.is_empty() && offset == 0 { + self.zero_prefix += 1; + } else if self.chonk.is_empty() { + if let Ok(smol) = offset.try_into() { + self.smol.push(smol); + } else { + self.chonk.push(offset.try_into().unwrap()) + } + } else { + self.chonk.push(offset.try_into().unwrap()) + } + } + /// Like `std::ops::Index`, which we cannot implement as it must return a `&usize`. + pub fn index(&self, index: usize) -> usize { + if index < self.zero_prefix { + 0 + } else if index - self.zero_prefix < self.smol.len() { + self.smol[index - self.zero_prefix].try_into().unwrap() + } else { + self.chonk[index - self.zero_prefix - self.smol.len()] + .try_into() + .unwrap() + } + } + /// The number of offsets in the list. + pub fn len(&self) -> usize { + self.zero_prefix + self.smol.len() + self.chonk.len() + } + + /// Returns `true` if this list contains no elements. + pub fn is_empty(&self) -> bool { + self.len() == 0 + } +} + +/// TODO +#[derive(Default, Debug)] +pub struct OffsetOptimized { + strided: OffsetStride, + spilled: OffsetList, +} + +impl OffsetContainer for OffsetOptimized { + fn push(&mut self, item: usize) { + if !self.spilled.is_empty() { + self.spilled.push(item); + } else { + let inserted = self.strided.push(item); + if !inserted { + self.spilled.push(item); + } + } + } + + fn index(&self, index: usize) -> usize { + if index < self.strided.len() { + self.strided.index(index) + } else { + self.spilled.index(index - self.strided.len()) + } + } + + fn clear(&mut self) { + todo!() + } + + fn len(&self) -> usize { + self.strided.len() + self.spilled.len() + } +} + +/// TODO +#[derive(Debug)] +pub struct ConsecutiveOffsetPairs, O: OffsetContainer> { + inner: R, + offsets: O, + last_index: usize, +} + +impl, O: OffsetContainer> Default + for ConsecutiveOffsetPairs +{ + fn default() -> Self { + let mut d = Self { + inner: Default::default(), + offsets: Default::default(), + last_index: 0, + }; + d.offsets.push(0); + d + } +} + +impl, O: OffsetContainer> Region + for ConsecutiveOffsetPairs +{ + type ReadItem<'a> = R::ReadItem<'a> + where + Self: 'a; + + type Index = usize; + + fn index(&self, index: Self::Index) -> Self::ReadItem<'_> { + self.inner.index((self.offsets.index(index - 1), self.offsets.index(index))) + } + + fn reserve_regions<'a, I>(&mut self, regions: I) + where + Self: 'a, + I: Iterator + Clone, + { + self.inner.reserve_regions(regions.map(|r| &r.inner)); + } + + fn clear(&mut self) { + self.inner.clear(); + self.offsets.clear(); + } +} + +impl, O: OffsetContainer, T: CopyOnto> + CopyOnto> for T +{ + fn copy_onto( + self, + target: &mut ConsecutiveOffsetPairs, + ) -> as Region>::Index { + let index = self.copy_onto(&mut target.inner); + assert_eq!(index.0, target.last_index); + target.offsets.push(index.1); + target.offsets.len() - 2 + } +} + +#[cfg(test)] +mod tests { + use crate::impls::dedup_one::{ConsecutiveOffsetPairs, Deduplicating, OffsetOptimized}; + use crate::{CopyOnto, FlatStack, Region, StringRegion}; + + #[test] + fn test_dedup_flatstack() { + let mut fs = FlatStack::>::default(); + + fs.copy("abc"); + fs.copy("abc"); + + println!("{fs:?}"); + println!("{:?}", fs.as_parts()); + } + + #[test] + fn test_dedup_region() { + let mut r = Deduplicating::::default(); + + fn copy(r: &mut R, item: impl CopyOnto) -> R::Index { + item.copy_onto(r) + } + + assert_eq!(copy(&mut r, "abc"), copy(&mut r, "abc")); + + println!("{r:?}"); + } + + #[test] + fn test_offset_optimized() { + let mut r = ConsecutiveOffsetPairs::, OffsetOptimized>::default(); + + fn copy(r: &mut R, item: impl CopyOnto) -> R::Index { + item.copy_onto(r) + } + + for _ in 0..1000 { + copy(&mut r, "abc"); + } + + println!("{r:?}"); + + } +} diff --git a/src/impls/mod.rs b/src/impls/mod.rs index 4ad472e..3066a18 100644 --- a/src/impls/mod.rs +++ b/src/impls/mod.rs @@ -1,5 +1,6 @@ //! Various region implementations. +pub mod dedup_one; pub mod mirror; pub mod option; pub mod result; diff --git a/src/impls/slice_copy.rs b/src/impls/slice_copy.rs index db66276..25c74ea 100644 --- a/src/impls/slice_copy.rs +++ b/src/impls/slice_copy.rs @@ -63,6 +63,7 @@ impl CopyOnto> for &[T] where T: Copy, { + #[inline] fn copy_onto(self, target: &mut CopyRegion) -> as Region>::Index { let start = target.slices.len(); target.slices.extend_from_slice(self); @@ -83,10 +84,9 @@ impl CopyOnto> for &Vec where T: Copy, { + #[inline] fn copy_onto(self, target: &mut CopyRegion) -> as Region>::Index { - let start = target.slices.len(); - target.slices.extend_from_slice(self); - (start, target.slices.len()) + self.as_slice().copy_onto(target) } } @@ -95,6 +95,6 @@ impl ReserveItems> for &Vec { where I: Iterator + Clone, { - target.slices.reserve(items.clone().map(|i| i.len()).sum()); + ReserveItems::reserve_items(target, items.map(Vec::as_slice)) } } diff --git a/src/lib.rs b/src/lib.rs index 5c5ee07..2db0c2f 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -99,18 +99,31 @@ pub trait CopyOnto { /// Copy self into the target container, returning an index that allows to /// look up the corresponding read item. fn copy_onto(self, target: &mut C) -> C::Index; + + /// Copies an iterator of self into the target container, returning an iterator + /// over indexes. + #[inline] + fn copy_iter( + iter: impl IntoIterator, + target: &mut C, + ) -> impl Iterator + where + Self: Sized, + { + iter.into_iter().map(|item| item.copy_onto(target)) + } } // Blanket implementation for `Box`. This might be a bad idea because it precludes blanket // implementations. -impl CopyOnto for Box -where - for<'a> &'a T: CopyOnto, -{ - fn copy_onto(self, target: &mut R) -> R::Index { - self.as_ref().copy_onto(target) - } -} +// impl CopyOnto for Box +// where +// for<'a> &'a T: CopyOnto, +// { +// fn copy_onto(self, target: &mut R) -> R::Index { +// self.as_ref().copy_onto(target) +// } +// } /// Reserve space in the receiving region. pub trait ReserveItems { @@ -223,6 +236,18 @@ impl FlatStack { pub fn iter(&self) -> Iter<'_, R> { self.into_iter() } + + /// TODO + pub fn as_parts(&self) -> (&[R::Index], &R) { + (self.indices.as_slice(), &self.region) + } +} + +impl<'a, T: CopyOnto, R: Region> Extend for FlatStack { + fn extend>(&mut self, iter: I) { + self.indices + .extend(CopyOnto::::copy_iter(iter, &mut self.region)); + } } impl<'a, R: Region> IntoIterator for &'a FlatStack { @@ -262,8 +287,15 @@ impl<'a, R: Region> ExactSizeIterator for Iter<'a, R> {} #[cfg(test)] mod tests { + use crate::impls::tuple::TupleARegion; + use super::*; + fn copy(r: &mut R, item: impl CopyOnto) -> R::Index { + item.copy_onto(r) + } + + #[test] fn test_readme() { let r: Result<_, u16> = Ok("abc"); @@ -292,7 +324,7 @@ mod tests { #[test] fn test_vec() { - let mut c = SliceRegion::default(); + let mut c = SliceRegion::>::default(); let slice = &[1u8, 2, 3]; let idx = slice.copy_onto(&mut c); assert_eq!(slice, c.index(idx).1) @@ -430,8 +462,8 @@ mod tests { #[test] fn test_result() { let r: Result<_, u16> = Ok("abc"); - let mut c = ResultRegion::default(); - let idx = r.copy_onto(&mut c); + let mut c = ResultRegion::>::default(); + let idx = copy(&mut c, r); assert_eq!(r, c.index(idx)); } @@ -500,20 +532,20 @@ mod tests { test_copy::<_, MirrorRegion<_>>(std::num::Wrapping(0isize)); test_copy::<_, MirrorRegion<_>>(&std::num::Wrapping(0isize)); - test_copy::<_, ResultRegion<_, _>>(Result::::Ok(0)); - test_copy::<_, ResultRegion<_, _>>(&Result::::Ok(0)); + test_copy::<_, ResultRegion, MirrorRegion<_>>>(Result::::Ok(0)); + test_copy::<_, ResultRegion, MirrorRegion<_>>>(&Result::::Ok(0)); - test_copy::<_, SliceRegion<_>>([0u8].as_slice()); - test_copy::<_, SliceRegion<_>>(vec![0u8]); - test_copy::<_, SliceRegion<_>>(&vec![0u8]); + test_copy::<_, SliceRegion>>([0u8].as_slice()); + test_copy::<_, SliceRegion>>(vec![0u8]); + test_copy::<_, SliceRegion>>(&vec![0u8]); - test_copy::<_, SliceRegion<_>>(["a"].as_slice()); - test_copy::<_, SliceRegion<_>>(vec!["a"]); - test_copy::<_, SliceRegion<_>>(&vec!["a"]); + test_copy::<_, SliceRegion>(["a"].as_slice()); + test_copy::<_, SliceRegion>(vec!["a"]); + test_copy::<_, SliceRegion>(&vec!["a"]); - test_copy::<_, SliceRegion<_>>([("a",)].as_slice()); - test_copy::<_, SliceRegion<_>>(vec![("a",)]); - test_copy::<_, SliceRegion<_>>(&vec![("a",)]); + test_copy::<_, SliceRegion>>([("a",)].as_slice()); + test_copy::<_, SliceRegion>>(vec![("a",)]); + test_copy::<_, SliceRegion>>(&vec![("a",)]); test_copy::<_, CopyRegion<_>>([0u8].as_slice());