-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Signed-off-by: Moritz Hoffmann <[email protected]>
- Loading branch information
Showing
4 changed files
with
411 additions
and
26 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,352 @@ | ||
//! Simple deduplication of equal consecutive items. | ||
use crate::{CopyOnto, Region}; | ||
|
||
/// A region to deduplicate consecutive equal items. | ||
#[derive(Debug)] | ||
pub struct Deduplicating<R: Region> { | ||
inner: R, | ||
last_index: Option<R::Index>, | ||
} | ||
|
||
impl<R: Region> Default for Deduplicating<R> { | ||
fn default() -> Self { | ||
Self { | ||
inner: R::default(), | ||
last_index: None, | ||
} | ||
} | ||
} | ||
|
||
impl<R: Region> Region for Deduplicating<R> | ||
where | ||
for<'a, 'b> R::ReadItem<'a>: PartialEq<R::ReadItem<'b>>, | ||
{ | ||
type ReadItem<'a> = R::ReadItem<'a> where Self: 'a; | ||
type Index = R::Index; | ||
|
||
fn index(&self, index: Self::Index) -> Self::ReadItem<'_> { | ||
self.inner.index(index) | ||
} | ||
|
||
fn reserve_regions<'a, I>(&mut self, regions: I) | ||
where | ||
Self: 'a, | ||
I: Iterator<Item = &'a Self> + Clone, | ||
{ | ||
self.inner.reserve_regions(regions.map(|r| &r.inner)); | ||
} | ||
|
||
fn clear(&mut self) { | ||
self.inner.clear(); | ||
self.last_index = None; | ||
} | ||
} | ||
|
||
impl<R: Region, T: CopyOnto<R>> CopyOnto<Deduplicating<R>> for T | ||
where | ||
for<'a> R::ReadItem<'a>: PartialEq<T>, | ||
for<'a, 'b> R::ReadItem<'a>: PartialEq<R::ReadItem<'b>>, | ||
{ | ||
fn copy_onto(self, target: &mut Deduplicating<R>) -> <Deduplicating<R> as Region>::Index { | ||
if let Some(last_index) = target.last_index { | ||
if target.inner.index(last_index) == self { | ||
return last_index; | ||
} | ||
} | ||
let index = self.copy_onto(&mut target.inner); | ||
target.last_index = Some(index); | ||
index | ||
} | ||
} | ||
|
||
/// A container of `usize` offsets that supports appending, positional lookup,
/// clearing and length queries.
pub trait OffsetContainer: Default {
    /// Accepts a newly pushed element, appending it to the container.
    fn push(&mut self, item: usize);

    /// Returns the element stored at position `index`.
    fn index(&self, index: usize) -> usize;

    /// Clear all contents.
    fn clear(&mut self);

    /// Returns the number of elements.
    fn len(&self) -> usize;

    /// Returns `true` if the container holds no elements.
    ///
    /// Provided in terms of [`len`](Self::len); implementors may override it
    /// when a cheaper check is available.
    fn is_empty(&self) -> bool {
        self.len() == 0
    }
}
|
||
/// Compact description of an offset sequence that starts at zero and grows by
/// a fixed stride, optionally followed by repetitions of its final value.
#[derive(Debug, Default)]
enum OffsetStride {
    /// No elements.
    #[default]
    Empty,
    /// Exactly one element, the value `0`.
    Zero,
    /// `Striding(stride, count)`: the `count` elements `0, stride, 2 * stride, ...`.
    Striding(usize, usize),
    /// `Saturated(stride, count, reps)`: the `count` striding elements followed
    /// by `reps` further copies of the last one, `stride * (count - 1)`.
    Saturated(usize, usize, usize),
}

impl OffsetStride {
    /// Tries to append `item`.
    ///
    /// Returns `true` and updates the description if `item` continues the
    /// pattern; returns `false` and leaves `self` untouched otherwise.
    fn push(&mut self, item: usize) -> bool {
        let next = match *self {
            // An empty sequence can only start with zero.
            OffsetStride::Empty if item == 0 => OffsetStride::Zero,
            // The second element defines the stride, whatever it is.
            OffsetStride::Zero => OffsetStride::Striding(item, 2),
            // The next multiple of the stride extends the run.
            OffsetStride::Striding(stride, count) if item == stride * count => {
                OffsetStride::Striding(stride, count + 1)
            }
            // A repeat of the last value switches to the saturated form.
            OffsetStride::Striding(stride, count) if item == stride * (count - 1) => {
                OffsetStride::Saturated(stride, count, 1)
            }
            // Once saturated, only further repeats of the last value fit.
            OffsetStride::Saturated(stride, count, reps) if item == stride * (count - 1) => {
                OffsetStride::Saturated(stride, count, reps + 1)
            }
            OffsetStride::Empty | OffsetStride::Striding(..) | OffsetStride::Saturated(..) => {
                return false;
            }
        };
        *self = next;
        true
    }

    /// Returns the element at position `index`.
    ///
    /// `index` is not checked against [`len`](Self::len); the value is simply
    /// computed from the pattern.
    ///
    /// # Panics
    ///
    /// Panics if the sequence is `Empty`.
    fn index(&self, index: usize) -> usize {
        match *self {
            OffsetStride::Empty => {
                panic!("Empty OffsetStride")
            }
            OffsetStride::Zero => 0,
            OffsetStride::Striding(stride, _) => stride * index,
            // Positions past the striding part all hold the last striding value.
            OffsetStride::Saturated(stride, steps, _) => stride * index.min(steps - 1),
        }
    }

    /// Returns the number of elements described.
    fn len(&self) -> usize {
        match *self {
            OffsetStride::Empty => 0,
            OffsetStride::Zero => 1,
            OffsetStride::Striding(_, steps) => steps,
            OffsetStride::Saturated(_, steps, reps) => steps + reps,
        }
    }
}
|
||
/// A list of unsigned integers that stores elements as `u32` while they are
/// small enough, and switches to `u64` from the first element that is not.
///
/// A leading run of zeros is not stored at all, only counted.
#[derive(Eq, PartialEq, Ord, PartialOrd, Clone, Debug, Default)]
pub struct OffsetList {
    /// Length of a prefix of zero elements.
    pub zero_prefix: usize,
    /// Offsets that fit within a `u32`.
    pub smol: Vec<u32>,
    /// Offsets that either do not fit in a `u32`, or are inserted after some offset that did not fit.
    pub chonk: Vec<u64>,
}

impl OffsetList {
    /// Allocate a new list whose `u32` storage has room for `cap` elements.
    pub fn with_capacity(cap: usize) -> Self {
        let smol = Vec::with_capacity(cap);
        Self {
            zero_prefix: 0,
            smol,
            chonk: Vec::new(),
        }
    }

    /// Inserts the offset, as a `u32` if that is still on the table.
    pub fn push(&mut self, offset: usize) {
        // Zeros before the first stored element only bump the prefix counter.
        let nothing_stored = self.smol.is_empty() && self.chonk.is_empty();
        if nothing_stored && offset == 0 {
            self.zero_prefix += 1;
            return;
        }
        // The narrow storage is usable only until the first wide element.
        if self.chonk.is_empty() {
            if let Ok(small) = offset.try_into() {
                self.smol.push(small);
                return;
            }
        }
        self.chonk.push(offset.try_into().unwrap());
    }

    /// Like `std::ops::Index`, which we cannot implement as it must return a `&usize`.
    ///
    /// # Panics
    ///
    /// Panics if `index` is not less than [`len`](Self::len).
    pub fn index(&self, index: usize) -> usize {
        match index.checked_sub(self.zero_prefix) {
            // Still inside the implicit zero prefix.
            None => 0,
            Some(rest) => match self.smol.get(rest) {
                Some(&small) => small.try_into().unwrap(),
                None => self.chonk[rest - self.smol.len()].try_into().unwrap(),
            },
        }
    }

    /// The number of offsets in the list.
    pub fn len(&self) -> usize {
        let stored = self.smol.len() + self.chonk.len();
        self.zero_prefix + stored
    }

    /// Returns `true` if this list contains no elements.
    pub fn is_empty(&self) -> bool {
        self.len() == 0
    }
}
|
||
/// TODO | ||
#[derive(Default, Debug)] | ||
pub struct OffsetOptimized { | ||
strided: OffsetStride, | ||
spilled: OffsetList, | ||
} | ||
|
||
impl OffsetContainer for OffsetOptimized { | ||
fn push(&mut self, item: usize) { | ||
if !self.spilled.is_empty() { | ||
self.spilled.push(item); | ||
} else { | ||
let inserted = self.strided.push(item); | ||
if !inserted { | ||
self.spilled.push(item); | ||
} | ||
} | ||
} | ||
|
||
fn index(&self, index: usize) -> usize { | ||
if index < self.strided.len() { | ||
self.strided.index(index) | ||
} else { | ||
self.spilled.index(index - self.strided.len()) | ||
} | ||
} | ||
|
||
fn clear(&mut self) { | ||
todo!() | ||
} | ||
|
||
fn len(&self) -> usize { | ||
self.strided.len() + self.spilled.len() | ||
} | ||
} | ||
|
||
/// TODO | ||
#[derive(Debug)] | ||
pub struct ConsecutiveOffsetPairs<R: Region<Index = (usize, usize)>, O: OffsetContainer> { | ||
inner: R, | ||
offsets: O, | ||
last_index: usize, | ||
} | ||
|
||
impl<R: Region<Index = (usize, usize)>, O: OffsetContainer> Default | ||
for ConsecutiveOffsetPairs<R, O> | ||
{ | ||
fn default() -> Self { | ||
let mut d = Self { | ||
inner: Default::default(), | ||
offsets: Default::default(), | ||
last_index: 0, | ||
}; | ||
d.offsets.push(0); | ||
d | ||
} | ||
} | ||
|
||
impl<R: Region<Index = (usize, usize)>, O: OffsetContainer> Region | ||
for ConsecutiveOffsetPairs<R, O> | ||
{ | ||
type ReadItem<'a> = R::ReadItem<'a> | ||
where | ||
Self: 'a; | ||
|
||
type Index = usize; | ||
|
||
fn index(&self, index: Self::Index) -> Self::ReadItem<'_> { | ||
self.inner.index((self.offsets.index(index - 1), self.offsets.index(index))) | ||
} | ||
|
||
fn reserve_regions<'a, I>(&mut self, regions: I) | ||
where | ||
Self: 'a, | ||
I: Iterator<Item = &'a Self> + Clone, | ||
{ | ||
self.inner.reserve_regions(regions.map(|r| &r.inner)); | ||
} | ||
|
||
fn clear(&mut self) { | ||
self.inner.clear(); | ||
self.offsets.clear(); | ||
} | ||
} | ||
|
||
impl<R: Region<Index = (usize, usize)>, O: OffsetContainer, T: CopyOnto<R>> | ||
CopyOnto<ConsecutiveOffsetPairs<R, O>> for T | ||
{ | ||
fn copy_onto( | ||
self, | ||
target: &mut ConsecutiveOffsetPairs<R, O>, | ||
) -> <ConsecutiveOffsetPairs<R, O> as Region>::Index { | ||
let index = self.copy_onto(&mut target.inner); | ||
assert_eq!(index.0, target.last_index); | ||
target.offsets.push(index.1); | ||
target.offsets.len() - 2 | ||
} | ||
} | ||
|
||
#[cfg(test)]
mod tests {
    use crate::impls::dedup_one::{ConsecutiveOffsetPairs, Deduplicating, OffsetOptimized};
    use crate::{CopyOnto, FlatStack, Region, StringRegion};

    /// Copies `item` into `region` and returns the index the region hands back.
    fn copy<R: Region>(region: &mut R, item: impl CopyOnto<R>) -> R::Index {
        item.copy_onto(region)
    }

    /// Copies the same string twice into a deduplicating stack and prints the
    /// result.
    #[test]
    fn test_dedup_flatstack() {
        let mut fs = FlatStack::<Deduplicating<StringRegion>>::default();

        for _ in 0..2 {
            fs.copy("abc");
        }

        println!("{fs:?}");
        println!("{:?}", fs.as_parts());
    }

    /// Copying the same string twice in a row must yield the same index.
    #[test]
    fn test_dedup_region() {
        let mut r = Deduplicating::<StringRegion>::default();

        let first = copy(&mut r, "abc");
        let second = copy(&mut r, "abc");
        assert_eq!(first, second);

        println!("{r:?}");
    }

    /// Copies the same string many times through the offset-pair wrapper and
    /// prints the resulting region.
    #[test]
    fn test_offset_optimized() {
        let mut r =
            ConsecutiveOffsetPairs::<Deduplicating<StringRegion>, OffsetOptimized>::default();

        (0..1000).for_each(|_| {
            copy(&mut r, "abc");
        });

        println!("{r:?}");
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,5 +1,6 @@ | ||
//! Various region implementations. | ||
pub mod dedup_one; | ||
pub mod mirror; | ||
pub mod option; | ||
pub mod result; | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.