Skip to content

Commit

Permalink
extract Lookup and SlidingWindow
Browse files Browse the repository at this point in the history
It is easier (for me) to track smaller files.
  • Loading branch information
ianic committed Feb 6, 2024
1 parent 92cebce commit bbf5e27
Show file tree
Hide file tree
Showing 3 changed files with 299 additions and 290 deletions.
125 changes: 125 additions & 0 deletions src/Lookup.zig
Original file line number Diff line number Diff line change
@@ -0,0 +1,125 @@
/// Lookup of the previous locations of the same 4 bytes of data. Works on a
/// hash of the 4 bytes. Head contains, for each hash, the most recently added
/// position (the first match candidate). Chain points to the previous position
/// with the same hash, given the current position.
///
const std = @import("std");
const testing = std.testing;
const expect = testing.expect;
const consts = @import("consts.zig");

const Self = @This();

// Multiplier used by `hashu` to spread 4-byte values over the hash range.
const prime4 = 0x9E3779B1; // 4 bytes prime number 2654435761
// Number of entries in `chain`: twice the history length.
const chain_len = 2 * consts.history.len;

// Maps hash => most recently added position with that hash.
// Entries start at 0, so 0 is also what an unused slot reads as.
head: [consts.lookup.len]u16 = [_]u16{0} ** consts.lookup.len,
// Maps position => previous position that had the same hash value.
// Following chain[pos] repeatedly walks back through older positions.
chain: [chain_len]u16 = [_]u16{0} ** (chain_len),

// Hashes the leading 4 bytes of `data` and records `pos` as the newest
// position for that hash in the lookup tables.
// Returns the position previously stored for the same hash. Returns 0, without
// inserting anything, when `data` holds fewer than 4 bytes.
pub fn add(self: *Self, data: []const u8, pos: u16) u16 {
    if (data.len >= 4) {
        const key = hash(data[0..4]);
        return self.set(key, pos);
    }
    return 0;
}

// Returns the chain entry for `pos`: the previous location recorded with the
// same hash value as the one stored at `pos`.
pub inline fn prev(self: *Self, pos: u16) u16 {
    const earlier = self.chain[pos];
    return earlier;
}

// Makes `pos` the head entry for hash `h` and stores the entry it replaces in
// the chain slot of `pos`. Returns the replaced head entry.
inline fn set(self: *Self, h: u32, pos: u16) u16 {
    const head_slot = &self.head[h];
    const replaced = head_slot.*;
    head_slot.* = pos;
    self.chain[pos] = replaced;
    return replaced;
}

// Shifts every stored position down by `n`.
// Each head entry is reduced by `n`, saturating at 0. The first `n` chain
// slots are then refilled from the slots `n` places further on, with the same
// saturating reduction applied to the copied values.
pub fn slide(self: *Self, n: u16) void {
    for (0..self.head.len) |k| {
        self.head[k] = self.head[k] -| n;
    }

    const count: usize = n;
    for (0..count) |dst| {
        const src = dst + n;
        self.chain[dst] = self.chain[src] -| n;
    }
}

// Adds up to `len` consecutive 4-byte hashes from `data` into the lookup.
// The hash of `data[0..4]` is stored for position `pos`, the hash of
// `data[1..5]` for `pos + 1`, and so on. Fewer than `len` hashes are added
// when `data` runs out of complete 4-byte groups.
// Does nothing when `len` is 0 or `data` is shorter than
// `consts.match.min_length`.
pub fn bulkAdd(self: *Self, data: []const u8, len: u16, pos: u16) void {
    if (len == 0 or data.len < consts.match.min_length) {
        return;
    }
    // Big-endian packing of the first 4 bytes, the same value `hash` builds.
    var hb =
        @as(u32, data[3]) |
        @as(u32, data[2]) << 8 |
        @as(u32, data[1]) << 16 |
        @as(u32, data[0]) << 24;
    _ = self.set(hashu(hb), pos);

    // Index one past the last byte to consume. `len` is widened to usize
    // before the addition: `len + 3` evaluated as u16 overflows for any
    // `len` above 65532.
    const end = @min(@as(usize, len) + 3, data.len);
    var i = pos;
    for (4..end) |j| {
        // Roll the 4-byte window forward: the oldest byte is shifted out and
        // data[j] becomes the lowest byte.
        hb = (hb << 8) | @as(u32, data[j]);
        i += 1;
        _ = self.set(hashu(hb), i);
    }
}

// Computes the hash of the 4 bytes in `b`.
// The bytes are combined big-endian (b[0] is the most significant byte) and
// the resulting u32 is passed through `hashu`.
inline fn hash(b: *const [4]u8) u32 {
    const b0: u32 = b[0];
    const b1: u32 = b[1];
    const b2: u32 = b[2];
    const b3: u32 = b[3];
    return hashu((b0 << 24) | (b1 << 16) | (b2 << 8) | b3);
}

// Multiplicative hash of a u32: wrapping multiply by `prime4`, then shift
// right by `consts.lookup.shift` so only the upper bits of the product remain.
inline fn hashu(v: u32) u32 {
    const mixed = v *% prime4;
    const bucket = mixed >> consts.lookup.shift;
    return @intCast(bucket);
}

test "Lookup add/prev" {
    // An 8-byte pattern repeated three times, followed by its first 3 bytes.
    const data = [_]u8{
        0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08,
        0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08,
        0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08,
        0x01, 0x02, 0x03,
    };

    var lookup: Self = .{};
    for (0..data.len) |i| {
        const found = lookup.add(data[i..], @intCast(i));
        // Positions 8..23 repeat the 4 bytes added 8 positions earlier;
        // everywhere else `add` is expected to report 0.
        const want: usize = if (i >= 8 and i < 24) i - 8 else 0;
        try expect(found == want);
    }

    // The 4 bytes at position 2 also occur at 10 and 18: head holds the
    // latest position and chain links back through the earlier ones.
    const key = Self.hash(data[2 .. 2 + 4]);
    try expect(lookup.head[key] == 2 + 16);
    try expect(lookup.chain[2 + 16] == 2 + 8);
    try expect(lookup.chain[2 + 8] == 2);
}

test "Lookup bulkAdd" {
    const data = "Lorem ipsum dolor sit amet, consectetur adipiscing elit.";

    // Fill one lookup with a single bulk call.
    var bulk: Self = .{};
    bulk.bulkAdd(data, data.len, 0);

    // Fill another lookup one position at a time.
    var single: Self = .{};
    for (0..data.len) |i| {
        _ = single.add(data[i..], @intCast(i));
    }

    // Both ways of inserting must leave identical tables.
    try testing.expectEqualSlices(u16, &single.head, &bulk.head);
    try testing.expectEqualSlices(u16, &single.chain, &bulk.chain);
}
147 changes: 147 additions & 0 deletions src/SlidingWindow.zig
Original file line number Diff line number Diff line change
@@ -0,0 +1,147 @@
const std = @import("std");
const consts = @import("consts.zig");

const expect = testing.expect;
const assert = std.debug.assert;
const testing = std.testing;

// Buffer of history data.

// Length of the history that `slide` drops from the front of the buffer.
const hist_len = consts.history.len;
// Total buffer size: two history lengths.
const buffer_len = 2 * hist_len;
// Lookahead that `activeLookahead` requires to be exceeded when not flushing.
const min_lookahead = consts.match.min_length + consts.match.max_length;
// Once the read position reaches this value, `write` refuses data and the
// buffer has to be slid.
const max_rp = buffer_len - min_lookahead;

const Self = @This();

// Backing storage; only the range [0, wp) holds written data.
buffer: [buffer_len]u8 = undefined,
wp: usize = 0, // write position
rp: usize = 0, // read position
// Signed because `slide` subtracts hist_len from it, which can make it negative.
fp: isize = 0, // flush position, tokens are built from fp..rp

// Copies as much of `buf` as fits into the free space after the write
// position and advances the write position by that amount.
// Returns the number of bytes copied. Returns 0 without copying when the read
// position has reached `max_rp`, meaning the buffer must be slid first.
pub fn write(self: *Self, buf: []const u8) usize {
    if (self.rp < max_rp) {
        const free = self.buffer[self.wp..];
        const count = @min(buf.len, free.len);
        @memcpy(free[0..count], buf[0..count]);
        self.wp += count;
        return count;
    }
    return 0; // need to slide
}

// Slides the buffer by hist_len.
// The first hist_len bytes are dropped; the bytes in hist_len..wp are moved to
// the start of the buffer, and rp, wp and fp are each reduced by hist_len.
// Requires rp >= max_rp and wp >= rp.
// Returns wp - hist_len (taken before the adjustment), i.e. the number of
// bytes that were moved to the front.
pub fn slide(self: *Self) u16 {
    assert(self.rp >= max_rp);
    assert(self.wp >= self.rp);

    const tail = self.buffer[hist_len..self.wp];
    const kept = tail.len;
    @memcpy(self.buffer[0..kept], tail);

    self.fp -= hist_len;
    self.wp -= hist_len;
    self.rp -= hist_len;
    return @intCast(kept);
}

// Returns the lookahead (rp..wp) if there is enough of it to process.
// With `flush` set, any non-empty lookahead is returned. Otherwise the
// lookahead must be longer than `min_lookahead`, so that enough data remains
// for the longest match.
// Returns null if there is not enough data.
pub fn activeLookahead(self: *Self, flush: bool) ?[]const u8 {
    const pending = self.lookahead();
    const threshold: usize = if (flush) 0 else min_lookahead;
    if (pending.len <= threshold) return null;
    return pending;
}

// Returns the bytes between the read and write positions.
// Asserts that the write position is not behind the read position.
pub inline fn lookahead(self: *Self) []const u8 {
    const start = self.rp;
    const end = self.wp;
    assert(end >= start);
    return self.buffer[start..end];
}

// Returns the part of the buffer from the write position to its end.
pub fn writable(self: *Self) []u8 {
    const start = self.wp;
    return self.buffer[start..];
}

// Moves the write position forward by `n` bytes.
pub fn written(self: *Self, n: usize) void {
    self.wp = self.wp + n;
}

// Moves the read position forward by `n` bytes.
// Asserts that the new read position does not pass the write position.
pub fn advance(self: *Self, n: u16) void {
    const next = self.rp + n;
    assert(self.wp >= next);
    self.rp = next;
}

// Finds the match length between the data at `prev_pos` and at `curr_pos`.
// At most `consts.match.max_length` bytes, and never past the write position
// counted from `curr_pos`, are compared.
// `min_len` is the length of a match that is already known; only a longer
// match is of interest. Returns 0 when no such match exists or when the match
// is shorter than `consts.match.min_length`, otherwise the match length.
pub fn match(self: *Self, prev_pos: u16, curr_pos: u16, min_len: u16) u16 {
    const limit: usize = @min(self.wp - curr_pos, consts.match.max_length);
    // Equal-length views starting at the previous and current positions.
    const older = self.buffer[prev_pos..][0..limit];
    const newer = self.buffer[curr_pos..][0..limit];

    if (min_len > 0) {
        // A match longer than min_len is impossible within `limit` bytes.
        if (limit <= min_len) return 0;
        // Check index min_len first and then walk down to index 0: a
        // difference is more likely to show up near min_len than at the
        // first bytes, so a losing candidate is rejected sooner.
        var k: usize = @as(usize, min_len) + 1;
        while (k > 0) {
            k -= 1;
            if (older[k] != newer[k]) return 0;
        }
    }

    // Extend forward from min_len until the first difference or the limit.
    var len: usize = min_len;
    while (len < limit and older[len] == newer[len]) {
        len += 1;
    }
    return if (len >= consts.match.min_length) @intCast(len) else 0;
}

// Returns the current read position as a u16.
pub fn pos(self: *Self) u16 {
    const read_pos: u16 = @intCast(self.rp);
    return read_pos;
}

// Sets the flush position to the current read position.
pub fn flushed(self: *Self) void {
    const read_pos: isize = @intCast(self.rp);
    self.fp = read_pos;
}

// Returns the bytes between the flush position and the read position.
// Returns null when the flush position is negative, i.e. the start of that
// range is no longer inside the buffer.
// Asserts that the flush position is not ahead of the read position.
pub fn tokensBuffer(self: *Self) ?[]const u8 {
    assert(self.fp <= self.rp);
    if (self.fp >= 0) {
        const start: usize = @intCast(self.fp);
        return self.buffer[start..self.rp];
    }
    return null;
}

test "SlidingWindow match" {
    const data = "Blah blah blah blah blah!";
    var win: Self = .{};
    try expect(win.write(data) == data.len);
    try expect(win.wp == data.len);
    try expect(win.rp == 0);

    const Case = struct { prev: u16, curr: u16, min_len: u16, want: u16 };
    const cases = [_]Case{
        // Match lengths between the positions of the 'l' symbols.
        .{ .prev = 1, .curr = 6, .min_len = 0, .want = 18 },
        .{ .prev = 1, .curr = 11, .min_len = 0, .want = 13 },
        .{ .prev = 1, .curr = 16, .min_len = 0, .want = 8 },
        .{ .prev = 1, .curr = 21, .min_len = 0, .want = 0 },
        // position 15 = "blah blah!"
        // position 20 = "blah!"
        .{ .prev = 15, .curr = 20, .min_len = 0, .want = 4 },
        .{ .prev = 15, .curr = 20, .min_len = 3, .want = 4 },
        .{ .prev = 15, .curr = 20, .min_len = 4, .want = 0 },
    };
    for (cases) |c| {
        try expect(win.match(c.prev, c.curr, c.min_len) == c.want);
    }
}

test "SlidingWindow slide" {
    const marker: u8 = 0xab;

    // Window with 100 bytes of lookahead ending 11 bytes before the buffer end.
    var win: Self = .{};
    win.rp = Self.buffer_len - 111;
    win.wp = Self.buffer_len - 11;
    win.buffer[win.rp] = marker;
    try expect(win.lookahead().len == 100);
    try expect(win.tokensBuffer().?.len == win.rp);

    const moved = win.slide();
    try expect(moved == 32757);
    // Positions moved back by hist_len; the marked byte is still at rp.
    try expect(win.rp == Self.hist_len - 111);
    try expect(win.wp == Self.hist_len - 11);
    try expect(win.buffer[win.rp] == marker);
    try expect(win.lookahead().len == 100);
    // The flush position is negative after the slide.
    try expect(win.tokensBuffer() == null);
}
Loading

0 comments on commit bbf5e27

Please sign in to comment.