Skip to content

Commit

Permalink
Deduplication
Browse files Browse the repository at this point in the history
Signed-off-by: Moritz Hoffmann <[email protected]>
  • Loading branch information
antiguru committed Feb 9, 2024
1 parent 949b886 commit a5c1a15
Show file tree
Hide file tree
Showing 4 changed files with 411 additions and 26 deletions.
352 changes: 352 additions & 0 deletions src/impls/dedup_one.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,352 @@
//! Simple deduplication of equal consecutive items.
use crate::{CopyOnto, Region};

/// A region to deduplicate consecutive equal items.
///
/// Wraps an inner region `R` and remembers the index of the item most recently
/// stored in it, so that copying an item equal to that one can hand back the
/// remembered index instead of storing the item a second time.
#[derive(Debug)]
pub struct Deduplicating<R: Region> {
    /// The wrapped region that actually stores the items.
    inner: R,
    /// Index of the item most recently copied into `inner`; `None` until the
    /// first item is stored, and again after `clear`.
    last_index: Option<R::Index>,
}

impl<R: Region> Default for Deduplicating<R> {
    /// Creates an empty deduplicating region around a default-constructed
    /// inner region, with no previously stored item to compare against.
    fn default() -> Self {
        let inner = R::default();
        Self {
            inner,
            last_index: None,
        }
    }
}

impl<R: Region> Region for Deduplicating<R>
where
    for<'a, 'b> R::ReadItem<'a>: PartialEq<R::ReadItem<'b>>,
{
    type ReadItem<'a> = R::ReadItem<'a> where Self: 'a;
    type Index = R::Index;

    /// Reads the item stored at `index` by delegating to the inner region.
    fn index(&self, index: Self::Index) -> Self::ReadItem<'_> {
        let stored = &self.inner;
        stored.index(index)
    }

    /// Forwards the reservation request to the inner region, passing along
    /// the inner regions of all the given deduplicating regions.
    fn reserve_regions<'a, I>(&mut self, regions: I)
    where
        Self: 'a,
        I: Iterator<Item = &'a Self> + Clone,
    {
        let inner_regions = regions.map(|region| &region.inner);
        self.inner.reserve_regions(inner_regions);
    }

    /// Clears the inner region and forgets the last stored index, so the next
    /// copied item is never treated as a duplicate.
    fn clear(&mut self) {
        self.inner.clear();
        self.last_index = None;
    }
}

impl<R: Region, T: CopyOnto<R>> CopyOnto<Deduplicating<R>> for T
where
    for<'a> R::ReadItem<'a>: PartialEq<T>,
    for<'a, 'b> R::ReadItem<'a>: PartialEq<R::ReadItem<'b>>,
{
    /// Copies `self` into the deduplicating region.
    ///
    /// When the most recently stored item compares equal to `self`, nothing is
    /// written and the index of that item is returned. Otherwise `self` is
    /// copied into the inner region and its new index is remembered for the
    /// next comparison.
    fn copy_onto(self, target: &mut Deduplicating<R>) -> <Deduplicating<R> as Region>::Index {
        let previous = target.last_index;
        match previous {
            // Only the immediately preceding item is considered a candidate.
            Some(prev) if target.inner.index(prev) == self => prev,
            _ => {
                let stored = self.copy_onto(&mut target.inner);
                target.last_index = Some(stored);
                stored
            }
        }
    }
}

/// A container of `usize` offsets that is appended to and read by position.
///
/// Implementors decide how the offsets are represented internally; users only
/// push values, look them up by position, clear the container and query its
/// length.
pub trait OffsetContainer: Default {
    /// Accepts a newly pushed element.
    fn push(&mut self, item: usize);

    /// Looks up the element stored at position `index`.
    fn index(&self, index: usize) -> usize;

    /// Clear all contents.
    fn clear(&mut self);

    /// Returns the number of elements.
    fn len(&self) -> usize;
}

/// A compact description of an offset sequence that follows a regular stride.
///
/// The represented sequence always starts at `0`, grows by a fixed stride and
/// may end in a run of repetitions of its final value.
#[derive(Debug, Default)]
enum OffsetStride {
    /// No offsets recorded yet.
    #[default]
    Empty,
    /// Exactly one offset recorded: `0`.
    Zero,
    /// `Striding(stride, count)`: the `count` offsets
    /// `0, stride, ..., stride * (count - 1)`.
    Striding(usize, usize),
    /// `Saturated(stride, count, reps)`: the `count` strided offsets followed
    /// by `reps` further copies of the last one, `stride * (count - 1)`.
    Saturated(usize, usize, usize),
}

impl OffsetStride {
    /// Accepts or rejects a newly pushed element.
    ///
    /// Returns `true` if `item` continues the pattern and was recorded, and
    /// `false` if it does not fit, in which case `self` is left unchanged and
    /// the caller has to store the item elsewhere.
    fn push(&mut self, item: usize) -> bool {
        match self {
            OffsetStride::Empty => {
                // The sequence must start at zero.
                if item == 0 {
                    *self = OffsetStride::Zero;
                    true
                } else {
                    false
                }
            }
            OffsetStride::Zero => {
                // The second element defines the stride, whatever it is.
                *self = OffsetStride::Striding(item, 2);
                true
            }
            OffsetStride::Striding(stride, count) => {
                // `stride * count` can exceed `usize::MAX` for large strides;
                // an overflowing successor can never equal `item`, so reject
                // it instead of panicking on the multiplication.
                if stride.checked_mul(*count) == Some(item) {
                    *count += 1;
                    true
                } else if item == *stride * (*count - 1) {
                    // A repeat of the last element; this product was already
                    // accepted earlier and therefore cannot overflow.
                    *self = OffsetStride::Saturated(*stride, *count, 1);
                    true
                } else {
                    false
                }
            }
            OffsetStride::Saturated(stride, count, reps) => {
                // Once saturated, only further repeats of the last element fit.
                if item == *stride * (*count - 1) {
                    *reps += 1;
                    true
                } else {
                    false
                }
            }
        }
    }

    /// Returns the offset at position `index`.
    ///
    /// The position is not checked against [`OffsetStride::len`]; callers are
    /// expected to pass a position below it.
    ///
    /// # Panics
    ///
    /// Panics if the sequence is [`OffsetStride::Empty`].
    fn index(&self, index: usize) -> usize {
        match self {
            OffsetStride::Empty => {
                panic!("Empty OffsetStride")
            }
            OffsetStride::Zero => 0,
            OffsetStride::Striding(stride, _steps) => *stride * index,
            OffsetStride::Saturated(stride, steps, _reps) => {
                if index < *steps {
                    *stride * index
                } else {
                    // Every position past the strided part repeats the last value.
                    *stride * (*steps - 1)
                }
            }
        }
    }

    /// Returns the number of offsets described by this sequence.
    fn len(&self) -> usize {
        match self {
            OffsetStride::Empty => 0,
            OffsetStride::Zero => 1,
            OffsetStride::Striding(_stride, steps) => *steps,
            OffsetStride::Saturated(_stride, steps, reps) => *steps + *reps,
        }
    }
}

/// A list of unsigned integers that stores elements as `u32` while they are small enough, and
/// switches to `u64` storage once one of them is not.
///
/// A leading run of zeros is not stored at all but merely counted.
#[derive(Eq, PartialEq, Ord, PartialOrd, Clone, Debug, Default)]
pub struct OffsetList {
    /// Length of a prefix of zero elements.
    pub zero_prefix: usize,
    /// Offsets that fit within a `u32`.
    pub smol: Vec<u32>,
    /// Offsets that either do not fit in a `u32`, or are inserted after some offset that did not fit.
    pub chonk: Vec<u64>,
}

impl OffsetList {
    /// Creates an empty list whose `u32` storage has room for `cap` elements.
    pub fn with_capacity(cap: usize) -> Self {
        Self {
            smol: Vec::with_capacity(cap),
            ..Self::default()
        }
    }

    /// Appends `offset`, keeping it as a `u32` if that is still possible.
    ///
    /// Zeros pushed before any other element only extend the zero prefix.
    /// As soon as one element had to go to the `u64` storage, all later
    /// elements follow it there.
    pub fn push(&mut self, offset: usize) {
        let nothing_stored = self.smol.is_empty() && self.chonk.is_empty();
        if nothing_stored && offset == 0 {
            self.zero_prefix += 1;
            return;
        }
        if self.chonk.is_empty() {
            if let Ok(small) = u32::try_from(offset) {
                self.smol.push(small);
                return;
            }
        }
        self.chonk.push(u64::try_from(offset).unwrap());
    }

    /// Returns the element at position `index`.
    ///
    /// This mirrors `std::ops::Index`, which cannot be implemented here as it
    /// would have to return a `&usize`.
    pub fn index(&self, index: usize) -> usize {
        // Positions inside the zero prefix are not backed by storage.
        let Some(past_prefix) = index.checked_sub(self.zero_prefix) else {
            return 0;
        };
        match self.smol.get(past_prefix) {
            Some(&small) => usize::try_from(small).unwrap(),
            None => {
                let large = self.chonk[past_prefix - self.smol.len()];
                usize::try_from(large).unwrap()
            }
        }
    }

    /// The number of offsets in the list, including the zero prefix.
    pub fn len(&self) -> usize {
        let prefix_and_small = self.zero_prefix + self.smol.len();
        prefix_and_small + self.chonk.len()
    }

    /// Returns `true` if this list contains no elements.
    pub fn is_empty(&self) -> bool {
        matches!(self.len(), 0)
    }
}

/// An offset container that describes its leading offsets as a regular stride
/// and stores everything after the first offset that breaks the pattern in an
/// explicit list.
#[derive(Default, Debug)]
pub struct OffsetOptimized {
    /// The leading offsets that follow a stride pattern.
    strided: OffsetStride,
    /// All offsets pushed from the first one the stride pattern rejected.
    spilled: OffsetList,
}

impl OffsetContainer for OffsetOptimized {
fn push(&mut self, item: usize) {
if !self.spilled.is_empty() {
self.spilled.push(item);
} else {
let inserted = self.strided.push(item);
if !inserted {
self.spilled.push(item);
}
}
}

fn index(&self, index: usize) -> usize {
if index < self.strided.len() {
self.strided.index(index)
} else {
self.spilled.index(index - self.strided.len())
}
}

fn clear(&mut self) {
todo!()
}

fn len(&self) -> usize {
self.strided.len() + self.spilled.len()
}
}

/// A region wrapper that replaces `(usize, usize)` indexes by a single `usize`.
///
/// It wraps a region whose indexes are `(usize, usize)` pairs, records only the
/// second component of each pair in an offset container, and hands out a
/// position into that container as the index instead.
#[derive(Debug)]
pub struct ConsecutiveOffsetPairs<R: Region<Index = (usize, usize)>, O: OffsetContainer> {
    /// The wrapped region that stores the items.
    inner: R,
    /// The recorded offsets; seeded with a single `0` on construction.
    offsets: O,
    /// The value the first component of the next inner index is asserted to equal.
    last_index: usize,
}

impl<R: Region<Index = (usize, usize)>, O: OffsetContainer> Default
    for ConsecutiveOffsetPairs<R, O>
{
    /// Creates an empty region whose offset container already holds the
    /// leading offset `0`.
    fn default() -> Self {
        let inner = R::default();
        let mut offsets = O::default();
        // The first item starts at offset zero.
        offsets.push(0);
        Self {
            inner,
            offsets,
            last_index: 0,
        }
    }
}

impl<R: Region<Index = (usize, usize)>, O: OffsetContainer> Region
    for ConsecutiveOffsetPairs<R, O>
{
    type ReadItem<'a> = R::ReadItem<'a>
    where
        Self: 'a;

    type Index = usize;

    /// Reads the item with the given index.
    ///
    /// Copying an item appends its end offset and returns the container
    /// length minus two, so item `index` is delimited by the offsets at
    /// positions `index` and `index + 1`.
    fn index(&self, index: Self::Index) -> Self::ReadItem<'_> {
        let start = self.offsets.index(index);
        let end = self.offsets.index(index + 1);
        self.inner.index((start, end))
    }

    /// Forwards the reservation request to the inner region, passing along
    /// the inner regions of all the given regions.
    fn reserve_regions<'a, I>(&mut self, regions: I)
    where
        Self: 'a,
        I: Iterator<Item = &'a Self> + Clone,
    {
        self.inner.reserve_regions(regions.map(|r| &r.inner));
    }

    /// Removes all items and returns the region to the state produced by
    /// `Default`: no items, a single leading offset `0`, and an expected
    /// next start of `0`.
    fn clear(&mut self) {
        self.last_index = 0;
        self.inner.clear();
        self.offsets.clear();
        // Re-seed the leading offset so the first item after clearing is
        // delimited exactly like the first item of a fresh region.
        self.offsets.push(0);
    }
}

impl<R: Region<Index = (usize, usize)>, O: OffsetContainer, T: CopyOnto<R>>
    CopyOnto<ConsecutiveOffsetPairs<R, O>> for T
{
    /// Copies `self` into the inner region and records only the end offset of
    /// the returned `(start, end)` pair.
    ///
    /// Returns the position of the item, counted from zero in insertion order.
    ///
    /// # Panics
    ///
    /// Panics if the inner region returns an index whose start differs from
    /// the end of the previously copied item (or from `0` for the first
    /// item), because the single recorded offset per item would then no
    /// longer describe the item's extent.
    fn copy_onto(
        self,
        target: &mut ConsecutiveOffsetPairs<R, O>,
    ) -> <ConsecutiveOffsetPairs<R, O> as Region>::Index {
        let index = self.copy_onto(&mut target.inner);
        assert_eq!(index.0, target.last_index);
        // Remember where this item ended; the next item has to start there.
        target.last_index = index.1;
        target.offsets.push(index.1);
        // The container holds one leading offset plus one per item, so the
        // item just added sits at position `len - 2`.
        target.offsets.len() - 2
    }
}

#[cfg(test)]
mod tests {
    use crate::impls::dedup_one::{ConsecutiveOffsetPairs, Deduplicating, OffsetOptimized};
    use crate::{CopyOnto, FlatStack, Region, StringRegion};

    #[test]
    fn test_dedup_flatstack() {
        let mut fs = FlatStack::<Deduplicating<StringRegion>>::default();

        fs.copy("abc");
        fs.copy("abc");

        println!("{fs:?}");
        println!("{:?}", fs.as_parts());
    }

    #[test]
    fn test_dedup_region() {
        let mut r = Deduplicating::<StringRegion>::default();

        fn copy<R: Region>(r: &mut R, item: impl CopyOnto<R>) -> R::Index {
            item.copy_onto(r)
        }

        assert_eq!(copy(&mut r, "abc"), copy(&mut r, "abc"));

        println!("{r:?}");
    }

    #[test]
    fn test_offset_optimized() {
        // The inner region must hand out consecutive `(start, end)` pairs, so
        // it is used directly rather than behind a deduplicating wrapper,
        // which would return the same pair for every repeated item.
        let mut r = ConsecutiveOffsetPairs::<StringRegion, OffsetOptimized>::default();

        fn copy<R: Region>(r: &mut R, item: impl CopyOnto<R>) -> R::Index {
            item.copy_onto(r)
        }

        for expected in 0..1000 {
            assert_eq!(copy(&mut r, "abc"), expected);
        }

        println!("{r:?}");
    }
}
1 change: 1 addition & 0 deletions src/impls/mod.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
//! Various region implementations.
pub mod dedup_one;
pub mod mirror;
pub mod option;
pub mod result;
Expand Down
8 changes: 4 additions & 4 deletions src/impls/slice_copy.rs
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@ impl<T> CopyOnto<CopyRegion<T>> for &[T]
where
T: Copy,
{
#[inline]
fn copy_onto(self, target: &mut CopyRegion<T>) -> <CopyRegion<T> as Region>::Index {
let start = target.slices.len();
target.slices.extend_from_slice(self);
Expand All @@ -83,10 +84,9 @@ impl<T> CopyOnto<CopyRegion<T>> for &Vec<T>
where
T: Copy,
{
#[inline]
fn copy_onto(self, target: &mut CopyRegion<T>) -> <CopyRegion<T> as Region>::Index {
let start = target.slices.len();
target.slices.extend_from_slice(self);
(start, target.slices.len())
self.as_slice().copy_onto(target)
}
}

Expand All @@ -95,6 +95,6 @@ impl<T: Copy> ReserveItems<CopyRegion<T>> for &Vec<T> {
where
I: Iterator<Item = Self> + Clone,
{
target.slices.reserve(items.clone().map(|i| i.len()).sum());
ReserveItems::reserve_items(target, items.map(Vec::as_slice))
}
}
Loading

0 comments on commit a5c1a15

Please sign in to comment.