extract Lookup and SlidingWindow

It is easier (for me) to track smaller files.
ianic · Feb 6, 2024 · bbf5e27 · bbf5e27
1 parent 92cebce
commit bbf5e27
Show file tree

Hide file tree

Showing 3 changed files with 299 additions and 290 deletions.
diff --git a/src/Lookup.zig b/src/Lookup.zig
@@ -0,0 +1,125 @@
+//! Lookup of the previous locations of the same 4 bytes of data. Works on a
+//! hash of the 4 bytes. `head` holds the most recently added position for each
+//! hash. `chain` maps a position to the previous position with the same hash.
+const std = @import("std");
+const testing = std.testing;
+const expect = testing.expect;
+const consts = @import("consts.zig");
+
+const Self = @This();
+
+// Multiplier used by `hashu` to spread the 4 input bytes over the hash bits.
+const prime4 = 0x9E3779B1; // 4 bytes prime number 2654435761
+// Number of entries in `chain`: twice the history length.
+const chain_len = 2 * consts.history.len;
+
+// Maps hash => most recently added position for that hash (0 by default).
+head: [consts.lookup.len]u16 = [_]u16{0} ** consts.lookup.len,
+// Maps position => previous position with the same hash value (0 by default).
+chain: [chain_len]u16 = [_]u16{0} ** (chain_len),
+
+// Hashes the first 4 bytes of `data` and records `pos` as the newest position
+// for that hash.
+// Returns the position previously stored for the hash. Returns 0 without
+// touching the tables when `data` holds fewer than 4 bytes.
+pub fn add(self: *Self, data: []const u8, pos: u16) u16 {
+    if (data.len >= 4) {
+        return self.set(hash(data[0..4]), pos);
+    }
+    return 0;
+}
+
+// Returns the previous location with the same hash value as the one recorded
+// at `pos`, i.e. the `chain` entry for `pos`.
+pub inline fn prev(self: *Self, pos: u16) u16 {
+    const previous = self.chain[pos];
+    return previous;
+}
+
+// Makes `pos` the newest position for hash `h` and links it in `chain` to the
+// position that was stored for `h` before. Returns that earlier position.
+inline fn set(self: *Self, h: u32, pos: u16) u16 {
+    const head_slot = &self.head[h];
+    const earlier = head_slot.*;
+    head_slot.* = pos;
+    self.chain[pos] = earlier;
+    return earlier;
+}
+
+// Slides all positions in head and chain down by `n`.
+// Every `head` entry is reduced by `n`; the `chain` entries n..2n are reduced
+// by `n` and moved to 0..n. The subtraction saturates at zero.
+pub fn slide(self: *Self, n: u16) void {
+    for (&self.head) |*entry| {
+        entry.* = entry.* -| n;
+    }
+    const count: usize = n;
+    for (0..count) |i| {
+        const moved = self.chain[i + count];
+        self.chain[i] = moved -| n;
+    }
+}
+
+// Adds `len` consecutive 4 byte hashes from `data` into the lookup.
+// The hash of data[0..4] is stored for position `pos`, the hash of data[1..5]
+// for `pos + 1`, and so on. Hashing stops early when `data` runs out of bytes.
+// Does nothing when `len` is 0 or `data` is too short to hold a single hash.
+pub fn bulkAdd(self: *Self, data: []const u8, len: u16, pos: u16) void {
+    // The first hash reads data[0..4], so require 4 bytes (as `add` does) in
+    // addition to the configured minimum match length.
+    if (len == 0 or data.len < 4 or data.len < consts.match.min_length) {
+        return;
+    }
+    var hb =
+        @as(u32, data[3]) |
+        @as(u32, data[2]) << 8 |
+        @as(u32, data[1]) << 16 |
+        @as(u32, data[0]) << 24;
+    _ = self.set(hashu(hb), pos);
+
+    // Widen before adding: `len + 3` evaluated as u16 overflows for
+    // len > 65532.
+    const end = @min(@as(usize, len) + 3, data.len);
+    var i = pos;
+    for (4..end) |j| {
+        // Rolling update: shift out the oldest byte, append the next one.
+        hb = (hb << 8) | @as(u32, data[j]);
+        i += 1;
+        _ = self.set(hashu(hb), i);
+    }
+}
+
+// Calculates the hash of the 4 bytes in `b`. The bytes are combined into one
+// u32 with b[0] as the most significant byte and then passed to `hashu`.
+inline fn hash(b: *const [4]u8) u32 {
+    var word: u32 = 0;
+    for (b) |byte| {
+        word = (word << 8) | byte;
+    }
+    return hashu(word);
+}
+
+// Multiplies `v` by `prime4` with wrapping arithmetic and returns what is left
+// after shifting the product right by `consts.lookup.shift` bits.
+inline fn hashu(v: u32) u32 {
+    const product = v *% prime4;
+    return @intCast(product >> consts.lookup.shift);
+}
+
+test "Lookup add/prev" {
+    // Three repetitions of an 8 byte pattern followed by 3 more bytes.
+    const data = [_]u8{
+        0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08,
+        0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08,
+        0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08,
+        0x01, 0x02, 0x03,
+    };
+
+    var lookup: Self = .{};
+    var i: usize = 0;
+    while (i < data.len) : (i += 1) {
+        const p = lookup.add(data[i..], @intCast(i));
+        // Positions 8..23 repeat the 4 bytes seen 8 positions earlier. The
+        // first 8 positions have no predecessor and the last 3 positions have
+        // fewer than 4 bytes left, so `add` returns 0 for them.
+        const want: usize = if (i >= 8 and i < 24) i - 8 else 0;
+        try expect(p == want);
+    }
+
+    // Follow the chain for the 4 bytes starting at offset 2.
+    const v = Self.hash(data[2 .. 2 + 4]);
+    const newest = lookup.head[v];
+    try expect(newest == 2 + 16);
+    const middle = lookup.chain[2 + 16];
+    try expect(middle == 2 + 8);
+    const oldest = lookup.chain[2 + 8];
+    try expect(oldest == 2);
+}
+
+test "Lookup bulkAdd" {
+    const data = "Lorem ipsum dolor sit amet, consectetur adipiscing elit.";
+
+    // Reference tables: every position added individually.
+    var one_by_one: Self = .{};
+    var i: usize = 0;
+    while (i < data.len) : (i += 1) {
+        _ = one_by_one.add(data[i..], @intCast(i));
+    }
+
+    // The same input added with a single bulkAdd call.
+    var bulk: Self = .{};
+    bulk.bulkAdd(data, data.len, 0);
+
+    // Both ways have to produce identical tables.
+    try testing.expectEqualSlices(u16, &one_by_one.head, &bulk.head);
+    try testing.expectEqualSlices(u16, &one_by_one.chain, &bulk.chain);
+}
diff --git a/src/SlidingWindow.zig b/src/SlidingWindow.zig
@@ -0,0 +1,147 @@
+const std = @import("std");
+const consts = @import("consts.zig");
+
+const expect = testing.expect;
+const assert = std.debug.assert;
+const testing = std.testing;
+
+// Buffer of history data.
+
+// Length of one history window; `slide` moves the buffer by this amount.
+const hist_len = consts.history.len;
+// The buffer holds two history windows.
+const buffer_len = 2 * hist_len;
+// Without flush `activeLookahead` returns data only while more than this many
+// bytes are waiting between rp and wp.
+const min_lookahead = consts.match.min_length + consts.match.max_length;
+// Once rp reaches this value `write` returns 0 and the buffer has to slide.
+const max_rp = buffer_len - min_lookahead;
+
+const Self = @This();
+
+// Contents are undefined until written.
+buffer: [buffer_len]u8 = undefined,
+wp: usize = 0, // write position
+rp: usize = 0, // read position
+// Signed because `slide` subtracts hist_len, which can make it negative.
+fp: isize = 0, // flush position, tokens are built from fp..rp
+
+// Copies as much of `buf` as fits behind the write position.
+// Returns the number of bytes copied. Returns 0 without copying once the read
+// position has reached `max_rp`; the buffer has to slide first.
+pub fn write(self: *Self, buf: []const u8) usize {
+    const need_slide = self.rp >= max_rp;
+    if (need_slide) return 0;
+
+    const free = self.buffer[self.wp..];
+    const n = @min(buf.len, free.len);
+    @memcpy(free[0..n], buf[0..n]);
+    self.wp += n;
+    return n;
+}
+
+// Slides the buffer by hist_len.
+// Drops the first hist_len bytes: the bytes from hist_len up to the write
+// position move to the front, and rp, wp and fp are reduced by hist_len (fp
+// may become negative).
+// Asserts that rp >= max_rp and wp >= rp.
+// Returns the number of bytes moved to the front, which is wp - hist_len as
+// measured before the slide.
+pub fn slide(self: *Self) u16 {
+    assert(self.rp >= max_rp and self.wp >= self.rp);
+    const kept = self.buffer[hist_len..self.wp];
+    @memcpy(self.buffer[0..kept.len], kept);
+    self.fp -= hist_len;
+    self.wp -= hist_len;
+    self.rp -= hist_len;
+    return @intCast(kept.len);
+}
+
+// Returns the lookahead when there is enough of it to process, null otherwise.
+// With `flush` set any non-empty lookahead is returned, so all data in the
+// window gets processed. Without it the lookahead has to be longer than
+// `min_lookahead`, which preserves enough data for the longest match.
+pub fn activeLookahead(self: *Self, flush: bool) ?[]const u8 {
+    const lh = self.lookahead();
+    const threshold: usize = if (flush) 0 else min_lookahead;
+    if (lh.len <= threshold) return null;
+    return lh;
+}
+
+// Returns the bytes between the read and the write position.
+// Asserts that the read position is not ahead of the write position.
+pub inline fn lookahead(self: *Self) []const u8 {
+    assert(self.rp <= self.wp);
+    const unread = self.buffer[self.rp..self.wp];
+    return unread;
+}
+
+// Returns the unused tail of the buffer, starting at the write position.
+pub fn writable(self: *Self) []u8 {
+    const start = self.wp;
+    return self.buffer[start..];
+}
+
+// Moves the write position forward by `n`.
+// NOTE(review): presumably called after `n` bytes were stored into the slice
+// returned by `writable` - confirm against callers.
+pub fn written(self: *Self, n: usize) void {
+    self.wp = self.wp + n;
+}
+
+// Moves the read position forward by `n`.
+// Asserts that the new read position is not ahead of the write position.
+pub fn advance(self: *Self, n: u16) void {
+    const next_rp = self.rp + n;
+    assert(next_rp <= self.wp);
+    self.rp = next_rp;
+}
+
+// Finds the match length between the data at the previous and the current
+// position. Returns 0 when the match is shorter than consts.match.min_length.
+// The comparison never goes past the write position and never covers more
+// than consts.match.max_length bytes.
+// `min_len` is the length of an already known match. Unless it is 0, the
+// result is 0 whenever this match is not longer than `min_len`.
+pub fn match(self: *Self, prev_pos: u16, curr_pos: u16, min_len: u16) u16 {
+    const limit: usize = @min(self.wp - curr_pos, consts.match.max_length);
+    // lookahead buffers from previous and current positions
+    const older = self.buffer[prev_pos..][0..limit];
+    const newer = self.buffer[curr_pos..][0..limit];
+
+    const known: usize = min_len;
+    if (known > 0) {
+        // A longer match needs at least one byte more than the known one.
+        if (limit <= known) return 0;
+        // Test the byte right above the known length first and then walk back
+        // to index 0. A difference is more likely to be found there than by
+        // looping from the first bytes.
+        var k: usize = known + 1;
+        while (k > 0) {
+            k -= 1;
+            if (older[k] != newer[k]) return 0;
+        }
+    }
+
+    // Extend the match forward, starting at the known length.
+    var len: usize = known;
+    while (len < limit and older[len] == newer[len]) {
+        len += 1;
+    }
+    return if (len >= consts.match.min_length) @intCast(len) else 0;
+}
+
+// Returns the read position as u16.
+pub fn pos(self: *Self) u16 {
+    const read_pos: u16 = @intCast(self.rp);
+    return read_pos;
+}
+
+// Moves the flush position up to the current read position.
+pub fn flushed(self: *Self) void {
+    const read_pos: isize = @intCast(self.rp);
+    self.fp = read_pos;
+}
+
+// Returns the bytes between the flush and the read position.
+// Returns null when the flush position is negative.
+// Asserts that the flush position is not ahead of the read position.
+pub fn tokensBuffer(self: *Self) ?[]const u8 {
+    assert(self.fp <= self.rp);
+    if (self.fp >= 0) {
+        const from: usize = @intCast(self.fp);
+        return self.buffer[from..self.rp];
+    }
+    return null;
+}
+
+test "SlidingWindow match" {
+    const data = "Blah blah blah blah blah!";
+    var win: Self = .{};
+    try expect(win.write(data) == data.len);
+    try expect(win.wp == data.len);
+    try expect(win.rp == 0);
+
+    const Case = struct { prev: u16, curr: u16, min_len: u16, want: u16 };
+    const cases = [_]Case{
+        // length between l symbols
+        .{ .prev = 1, .curr = 6, .min_len = 0, .want = 18 },
+        .{ .prev = 1, .curr = 11, .min_len = 0, .want = 13 },
+        .{ .prev = 1, .curr = 16, .min_len = 0, .want = 8 },
+        .{ .prev = 1, .curr = 21, .min_len = 0, .want = 0 },
+        // position 15 = "blah blah!"
+        // position 20 = "blah!"
+        .{ .prev = 15, .curr = 20, .min_len = 0, .want = 4 },
+        .{ .prev = 15, .curr = 20, .min_len = 3, .want = 4 },
+        .{ .prev = 15, .curr = 20, .min_len = 4, .want = 0 },
+    };
+    for (cases) |c| {
+        try expect(win.match(c.prev, c.curr, c.min_len) == c.want);
+    }
+}
+
+test "SlidingWindow slide" {
+    const marker: u8 = 0xab;
+
+    // 100 bytes of lookahead that end 11 bytes before the end of the buffer.
+    var w: Self = .{};
+    w.rp = Self.buffer_len - 111;
+    w.wp = w.rp + 100;
+    w.buffer[w.rp] = marker;
+    try expect(w.lookahead().len == 100);
+    try expect(w.tokensBuffer().?.len == w.rp);
+
+    const n = w.slide();
+    try expect(n == 32757);
+    // The marker moved together with the read position.
+    try expect(w.buffer[w.rp] == marker);
+    try expect(w.rp == Self.hist_len - 111);
+    try expect(w.wp == Self.hist_len - 11);
+    try expect(w.lookahead().len == 100);
+    // The flush position is negative now.
+    try expect(w.tokensBuffer() == null);
+}