From bbf5e276d239b6544a7f67e00de9a35d6bcebd18 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Igor=20Anic=CC=81?= <igor.anic@gmail.com>
Date: Tue, 6 Feb 2024 17:31:47 +0100
Subject: [PATCH] extract Lookup and SlidingWindow

It is easier (for me) to track smaller files.
---
 src/Lookup.zig        | 125 +++++++++++++++++
 src/SlidingWindow.zig | 147 ++++++++++++++++++++
 src/deflate.zig       | 317 ++++--------------------------------------
 3 files changed, 299 insertions(+), 290 deletions(-)
 create mode 100644 src/Lookup.zig
 create mode 100644 src/SlidingWindow.zig

diff --git a/src/Lookup.zig b/src/Lookup.zig
new file mode 100644
index 0000000..c5c21d1
--- /dev/null
+++ b/src/Lookup.zig
@@ -0,0 +1,125 @@
+/// Lookup of the previous locations for the same 4 byte data. Works on hash of
+/// 4 bytes data. Head contains position of the first match for each hash. Chain
+/// points to the previous position of the same hash given the current location.
+///
+const std = @import("std");
+const testing = std.testing;
+const expect = testing.expect;
+const consts = @import("consts.zig");
+
+const Self = @This();
+
+const prime4 = 0x9E3779B1; // 4 bytes prime number 2654435761
+const chain_len = 2 * consts.history.len;
+
+// Maps hash => first position
+head: [consts.lookup.len]u16 = [_]u16{0} ** consts.lookup.len,
+// Maps position => previous positions for the same hash value
+chain: [chain_len]u16 = [_]u16{0} ** (chain_len),
+
+// Calculates hash of the 4 bytes from data.
+// Inserts `pos` position of that hash in the lookup tables.
+// Returns previous location with the same hash value.
+pub fn add(self: *Self, data: []const u8, pos: u16) u16 {
+    if (data.len < 4) return 0;
+    const h = hash(data[0..4]);
+    return self.set(h, pos);
+}
+
+// Retruns previous location with the same hash value given the current
+// position.
+pub inline fn prev(self: *Self, pos: u16) u16 {
+    return self.chain[pos];
+}
+
+inline fn set(self: *Self, h: u32, pos: u16) u16 {
+    const p = self.head[h];
+    self.head[h] = pos;
+    self.chain[pos] = p;
+    return p;
+}
+
+// Slide all positions in head and chain for `n`
+pub fn slide(self: *Self, n: u16) void {
+    for (&self.head) |*v| {
+        v.* -|= n;
+    }
+    var i: usize = 0;
+    while (i < n) : (i += 1) {
+        self.chain[i] = self.chain[i + n] -| n;
+    }
+}
+
+// Add `len` 4 bytes hashes from `data` into lookup.
+// Position of the first byte is `pos`.
+pub fn bulkAdd(self: *Self, data: []const u8, len: u16, pos: u16) void {
+    if (len == 0 or data.len < consts.match.min_length) {
+        return;
+    }
+    var hb =
+        @as(u32, data[3]) |
+        @as(u32, data[2]) << 8 |
+        @as(u32, data[1]) << 16 |
+        @as(u32, data[0]) << 24;
+    _ = self.set(hashu(hb), pos);
+
+    var i = pos;
+    for (4..@min(len + 3, data.len)) |j| {
+        hb = (hb << 8) | @as(u32, data[j]);
+        i += 1;
+        _ = self.set(hashu(hb), i);
+    }
+}
+
+// Calculates hash of the first 4 bytes of `b`.
+inline fn hash(b: *const [4]u8) u32 {
+    return hashu(@as(u32, b[3]) |
+        @as(u32, b[2]) << 8 |
+        @as(u32, b[1]) << 16 |
+        @as(u32, b[0]) << 24);
+}
+
+inline fn hashu(v: u32) u32 {
+    return @intCast((v *% prime4) >> consts.lookup.shift);
+}
+
+test "Lookup add/prev" {
+    const data = [_]u8{
+        0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08,
+        0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08,
+        0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08,
+        0x01, 0x02, 0x03,
+    };
+
+    var h: Self = .{};
+    for (data, 0..) |_, i| {
+        const p = h.add(data[i..], @intCast(i));
+        if (i >= 8 and i < 24) {
+            try expect(p == i - 8);
+        } else {
+            try expect(p == 0);
+        }
+    }
+
+    const v = Self.hash(data[2 .. 2 + 4]);
+    try expect(h.head[v] == 2 + 16);
+    try expect(h.chain[2 + 16] == 2 + 8);
+    try expect(h.chain[2 + 8] == 2);
+}
+
+test "Lookup bulkAdd" {
+    const data = "Lorem ipsum dolor sit amet, consectetur adipiscing elit.";
+
+    // one by one
+    var h: Self = .{};
+    for (data, 0..) |_, i| {
+        _ = h.add(data[i..], @intCast(i));
+    }
+
+    // in bulk
+    var bh: Self = .{};
+    bh.bulkAdd(data, data.len, 0);
+
+    try testing.expectEqualSlices(u16, &h.head, &bh.head);
+    try testing.expectEqualSlices(u16, &h.chain, &bh.chain);
+}
diff --git a/src/SlidingWindow.zig b/src/SlidingWindow.zig
new file mode 100644
index 0000000..48612a7
--- /dev/null
+++ b/src/SlidingWindow.zig
@@ -0,0 +1,147 @@
+const std = @import("std");
+const consts = @import("consts.zig");
+
+const expect = testing.expect;
+const assert = std.debug.assert;
+const testing = std.testing;
+
+// Buffer of history data.
+
+const hist_len = consts.history.len;
+const buffer_len = 2 * hist_len;
+const min_lookahead = consts.match.min_length + consts.match.max_length;
+const max_rp = buffer_len - min_lookahead;
+
+const Self = @This();
+
+buffer: [buffer_len]u8 = undefined,
+wp: usize = 0, // write position
+rp: usize = 0, // read position
+fp: isize = 0, // flush position, tokens are build from fp..rp
+
+// Returns number of bytes written, or 0 if buffer is full and need to slide.
+pub fn write(self: *Self, buf: []const u8) usize {
+    if (self.rp >= max_rp) return 0; // need to slide
+
+    const n = @min(buf.len, buffer_len - self.wp);
+    @memcpy(self.buffer[self.wp .. self.wp + n], buf[0..n]);
+    self.wp += n;
+    return n;
+}
+
+// Slide buffer for hist_len.
+// Drops old history, preserves bwtween hist_len and hist_len - min_lookahead.
+// Returns number of bytes removed.
+pub fn slide(self: *Self) u16 {
+    assert(self.rp >= max_rp and self.wp >= self.rp);
+    const n = self.wp - hist_len;
+    @memcpy(self.buffer[0..n], self.buffer[hist_len..self.wp]);
+    self.rp -= hist_len;
+    self.wp -= hist_len;
+    self.fp -= hist_len;
+    return @intCast(n);
+}
+
+// flush - process all data from window
+// If not flush preserve enough data for the loghest match.
+// Returns null if there is not enough data.
+pub fn activeLookahead(self: *Self, flush: bool) ?[]const u8 {
+    const min: usize = if (flush) 0 else min_lookahead;
+    const lh = self.lookahead();
+    return if (lh.len > min) lh else null;
+}
+
+pub inline fn lookahead(self: *Self) []const u8 {
+    assert(self.wp >= self.rp);
+    return self.buffer[self.rp..self.wp];
+}
+
+pub fn writable(self: *Self) []u8 {
+    return self.buffer[self.wp..];
+}
+
+pub fn written(self: *Self, n: usize) void {
+    self.wp += n;
+}
+
+pub fn advance(self: *Self, n: u16) void {
+    assert(self.wp >= self.rp + n);
+    self.rp += n;
+}
+
+// Finds match length between previous and current position.
+pub fn match(self: *Self, prev_pos: u16, curr_pos: u16, min_len: u16) u16 {
+    const max_len: usize = @min(self.wp - curr_pos, consts.match.max_length);
+    // lookahead buffers from previous and current positions
+    const prev_lh = self.buffer[prev_pos..][0..max_len];
+    const curr_lh = self.buffer[curr_pos..][0..max_len];
+
+    // If we alread have match (min_len > 0),
+    // test the first byte above previous len a[min_len] != b[min_len]
+    // and then all the bytes from that position to zero.
+    // That is likely positions to find difference than looping from first bytes.
+    var i: usize = min_len;
+    if (i > 0) {
+        if (max_len <= i) return 0;
+        while (true) {
+            if (prev_lh[i] != curr_lh[i]) return 0;
+            if (i == 0) break;
+            i -= 1;
+        }
+        i = min_len;
+    }
+    while (i < max_len) : (i += 1)
+        if (prev_lh[i] != curr_lh[i]) break;
+    return if (i >= consts.match.min_length) @intCast(i) else 0;
+}
+
+pub fn pos(self: *Self) u16 {
+    return @intCast(self.rp);
+}
+
+pub fn flushed(self: *Self) void {
+    self.fp = @intCast(self.rp);
+}
+
+pub fn tokensBuffer(self: *Self) ?[]const u8 {
+    assert(self.fp <= self.rp);
+    if (self.fp < 0) return null;
+    return self.buffer[@intCast(self.fp)..self.rp];
+}
+
+test "SlidingWindow match" {
+    const data = "Blah blah blah blah blah!";
+    var win: Self = .{};
+    try expect(win.write(data) == data.len);
+    try expect(win.wp == data.len);
+    try expect(win.rp == 0);
+
+    // length between l symbols
+    try expect(win.match(1, 6, 0) == 18);
+    try expect(win.match(1, 11, 0) == 13);
+    try expect(win.match(1, 16, 0) == 8);
+    try expect(win.match(1, 21, 0) == 0);
+
+    // position 15 = "blah blah!"
+    // position 20 = "blah!"
+    try expect(win.match(15, 20, 0) == 4);
+    try expect(win.match(15, 20, 3) == 4);
+    try expect(win.match(15, 20, 4) == 0);
+}
+
+test "SlidingWindow slide" {
+    var win: Self = .{};
+    win.wp = Self.buffer_len - 11;
+    win.rp = Self.buffer_len - 111;
+    win.buffer[win.rp] = 0xab;
+    try expect(win.lookahead().len == 100);
+    try expect(win.tokensBuffer().?.len == win.rp);
+
+    const n = win.slide();
+    try expect(n == 32757);
+    try expect(win.buffer[win.rp] == 0xab);
+    try expect(win.rp == Self.hist_len - 111);
+    try expect(win.wp == Self.hist_len - 11);
+    try expect(win.lookahead().len == 100);
+    try expect(win.tokensBuffer() == null);
+}
diff --git a/src/deflate.zig b/src/deflate.zig
index f460ed6..1c500e8 100644
--- a/src/deflate.zig
+++ b/src/deflate.zig
@@ -9,6 +9,8 @@ const Token = @import("Token.zig");
 const consts = @import("consts.zig");
 const BlockWriter = @import("block_writer.zig").BlockWriter;
 const Container = @import("container.zig").Container;
+const SlidingWindow = @import("SlidingWindow.zig");
+const Lookup = @import("Lookup.zig");
 
 pub const Level = enum(u4) {
     // zig fmt: off
@@ -79,7 +81,7 @@ pub fn Compressor(comptime container: Container, comptime WriterType: type) type
 fn Deflate(comptime container: Container, comptime WriterType: type, comptime BlockWriterType: type) type {
     return struct {
         lookup: Lookup = .{},
-        win: Window = .{},
+        win: SlidingWindow = .{},
         tokens: Tokens = .{},
         wrt: WriterType,
         block_writer: BlockWriterType,
@@ -291,6 +293,29 @@ fn Deflate(comptime container: Container, comptime WriterType: type, comptime Bl
     };
 }
 
+// Tokens store
+const Tokens = struct {
+    list: [consts.deflate.tokens]Token = undefined,
+    pos: usize = 0,
+
+    fn add(self: *Tokens, t: Token) void {
+        self.list[self.pos] = t;
+        self.pos += 1;
+    }
+
+    fn full(self: *Tokens) bool {
+        return self.pos == self.list.len;
+    }
+
+    fn reset(self: *Tokens) void {
+        self.pos = 0;
+    }
+
+    fn tokens(self: *Tokens) []const Token {
+        return self.list[0..self.pos];
+    }
+};
+
 pub fn huffmanOnlyCompressor(comptime container: Container, writer: anytype) !HuffmanOnlyCompressor(
     container,
     @TypeOf(writer),
@@ -403,146 +428,6 @@ const TestTokenWriter = struct {
     pub fn flush(_: *Self) !void {}
 };
 
-// Buffer of history data.
-const Window = struct {
-    const hist_len = consts.history.len;
-    const buffer_len = 2 * hist_len;
-    const min_lookahead = consts.match.min_length + consts.match.max_length;
-    const max_rp = buffer_len - min_lookahead;
-
-    buffer: [buffer_len]u8 = undefined,
-    wp: usize = 0, // write position
-    rp: usize = 0, // read position
-    fp: isize = 0, // flush position, tokens are build from fp..rp
-
-    // Returns number of bytes written, or 0 if buffer is full and need to slide.
-    pub fn write(self: *Window, buf: []const u8) usize {
-        if (self.rp >= max_rp) return 0; // need to slide
-
-        const n = @min(buf.len, buffer_len - self.wp);
-        @memcpy(self.buffer[self.wp .. self.wp + n], buf[0..n]);
-        self.wp += n;
-        return n;
-    }
-
-    // Slide buffer for hist_len.
-    // Drops old history, preserves bwtween hist_len and hist_len - min_lookahead.
-    // Returns number of bytes removed.
-    pub fn slide(self: *Window) u16 {
-        assert(self.rp >= max_rp and self.wp >= self.rp);
-        const n = self.wp - hist_len;
-        @memcpy(self.buffer[0..n], self.buffer[hist_len..self.wp]);
-        self.rp -= hist_len;
-        self.wp -= hist_len;
-        self.fp -= hist_len;
-        return @intCast(n);
-    }
-
-    // flush - process all data from window
-    // If not flush preserve enough data for the loghest match.
-    // Returns null if there is not enough data.
-    pub fn activeLookahead(self: *Window, flush: bool) ?[]const u8 {
-        const min: usize = if (flush) 0 else min_lookahead;
-        const lh = self.lookahead();
-        return if (lh.len > min) lh else null;
-    }
-
-    pub inline fn lookahead(self: *Window) []const u8 {
-        assert(self.wp >= self.rp);
-        return self.buffer[self.rp..self.wp];
-    }
-
-    pub fn writable(self: *Window) []u8 {
-        return self.buffer[self.wp..];
-    }
-
-    pub fn written(self: *Window, n: usize) void {
-        self.wp += n;
-    }
-
-    pub fn advance(self: *Window, n: u16) void {
-        assert(self.wp >= self.rp + n);
-        self.rp += n;
-    }
-
-    // Finds match length between previous and current position.
-    pub fn match(self: *Window, prev_pos: u16, curr_pos: u16, min_len: u16) u16 {
-        const max_len: usize = @min(self.wp - curr_pos, consts.match.max_length);
-        // lookahead buffers from previous and current positions
-        const prev_lh = self.buffer[prev_pos..][0..max_len];
-        const curr_lh = self.buffer[curr_pos..][0..max_len];
-
-        // If we alread have match (min_len > 0),
-        // test the first byte above previous len a[min_len] != b[min_len]
-        // and then all the bytes from that position to zero.
-        // That is likely positions to find difference than looping from first bytes.
-        var i: usize = min_len;
-        if (i > 0) {
-            if (max_len <= i) return 0;
-            while (true) {
-                if (prev_lh[i] != curr_lh[i]) return 0;
-                if (i == 0) break;
-                i -= 1;
-            }
-            i = min_len;
-        }
-        while (i < max_len) : (i += 1)
-            if (prev_lh[i] != curr_lh[i]) break;
-        return if (i >= consts.match.min_length) @intCast(i) else 0;
-    }
-
-    pub fn pos(self: *Window) u16 {
-        return @intCast(self.rp);
-    }
-
-    pub fn flushed(self: *Window) void {
-        self.fp = @intCast(self.rp);
-    }
-
-    pub fn tokensBuffer(self: *Window) ?[]const u8 {
-        assert(self.fp <= self.rp);
-        if (self.fp < 0) return null;
-        return self.buffer[@intCast(self.fp)..self.rp];
-    }
-};
-
-test "Window match" {
-    const data = "Blah blah blah blah blah!";
-    var win: Window = .{};
-    try expect(win.write(data) == data.len);
-    try expect(win.wp == data.len);
-    try expect(win.rp == 0);
-
-    // length between l symbols
-    try expect(win.match(1, 6, 0) == 18);
-    try expect(win.match(1, 11, 0) == 13);
-    try expect(win.match(1, 16, 0) == 8);
-    try expect(win.match(1, 21, 0) == 0);
-
-    // position 15 = "blah blah!"
-    // position 20 = "blah!"
-    try expect(win.match(15, 20, 0) == 4);
-    try expect(win.match(15, 20, 3) == 4);
-    try expect(win.match(15, 20, 4) == 0);
-}
-
-test "Window slide" {
-    var win: Window = .{};
-    win.wp = Window.buffer_len - 11;
-    win.rp = Window.buffer_len - 111;
-    win.buffer[win.rp] = 0xab;
-    try expect(win.lookahead().len == 100);
-    try expect(win.tokensBuffer().?.len == win.rp);
-
-    const n = win.slide();
-    try expect(n == 32757);
-    try expect(win.buffer[win.rp] == 0xab);
-    try expect(win.rp == Window.hist_len - 111);
-    try expect(win.wp == Window.hist_len - 11);
-    try expect(win.lookahead().len == 100);
-    try expect(win.tokensBuffer() == null);
-}
-
 test "check struct sizes" {
     try expect(@sizeOf(Token) == 4);
 
@@ -556,7 +441,7 @@ test "check struct sizes" {
 
     // buffer: (32k * 2), wp: 8, rp: 8, fp: 8
     const window_size = 64 * 1024 + 8 + 8 + 8;
-    try expect(@sizeOf(Window) == window_size);
+    try expect(@sizeOf(SlidingWindow) == window_size);
 
     const Bw = BlockWriter(@TypeOf(io.null_writer));
     // huffman bit writer internal: 11480
@@ -579,38 +464,6 @@ test "check struct sizes" {
     // defer cmp.deinit();
 }
 
-// Tokens store
-const Tokens = struct {
-    list: [consts.deflate.tokens]Token = undefined,
-    pos: usize = 0,
-
-    fn add(self: *Tokens, t: Token) void {
-        self.list[self.pos] = t;
-        self.pos += 1;
-    }
-
-    fn len(self: *Tokens) usize {
-        return self.pos;
-    }
-
-    fn full(self: *Tokens) bool {
-        return self.pos == consts.deflate.tokens;
-    }
-
-    fn reset(self: *Tokens) void {
-        self.pos = 0;
-    }
-
-    fn at(self: *Tokens, n: usize) Token {
-        assert(n < self.pos);
-        return self.list[n];
-    }
-
-    fn tokens(self: *Tokens) []const Token {
-        return self.list[0..self.pos];
-    }
-};
-
 test "deflate file tokenization" {
     const levels = [_]Level{ .level_4, .level_5, .level_6, .level_7, .level_8, .level_9 };
     const cases = [_]struct {
@@ -709,119 +562,3 @@ fn TokenDecoder(comptime WriterType: type) type {
         pub fn flush(_: *Self) !void {}
     };
 }
-
-const Lookup = struct {
-    const prime4 = 0x9E3779B1; // 4 bytes prime number 2654435761
-    const chain_len = Window.buffer_len;
-
-    // hash => location lookup
-    head: [consts.lookup.len]u16 = [_]u16{0} ** consts.lookup.len,
-    // location => prev location for the same hash value
-    chain: [chain_len]u16 = [_]u16{0} ** (chain_len),
-
-    // Calculates hash of the 4 bytes from data.
-    // Inserts idx location of that hash in the lookup tables.
-    // Resturns previous location with the same hash value.
-    pub fn add(self: *Lookup, data: []const u8, idx: u16) u16 {
-        if (data.len < 4) return 0;
-        const h = hash(data[0..4]);
-        return self.set(h, idx);
-    }
-
-    // Previous location with the same hash value.
-    pub inline fn prev(self: *Lookup, idx: u16) u16 {
-        return self.chain[idx];
-    }
-
-    inline fn set(self: *Lookup, h: u32, idx: u16) u16 {
-        const p = self.head[h];
-        self.head[h] = idx;
-        self.chain[idx] = p;
-        return p;
-    }
-
-    // Slide all positions in head and chain for n.
-    pub fn slide(self: *Lookup, n: u16) void {
-        for (&self.head) |*v| {
-            v.* -|= n;
-        }
-        var i: usize = 0;
-        while (i < n) : (i += 1) {
-            self.chain[i] = self.chain[i + n] -| n;
-        }
-    }
-
-    // Add `len` 4 bytes hashes from `data` into lookup.
-    // Position of the first byte is `idx`.
-    pub fn bulkAdd(self: *Lookup, data: []const u8, len: u16, idx: u16) void {
-        if (len == 0 or data.len < consts.match.min_length) {
-            return;
-        }
-        var hb =
-            @as(u32, data[3]) |
-            @as(u32, data[2]) << 8 |
-            @as(u32, data[1]) << 16 |
-            @as(u32, data[0]) << 24;
-        _ = self.set(hashu(hb), idx);
-
-        var i = idx;
-        for (4..@min(len + 3, data.len)) |j| {
-            hb = (hb << 8) | @as(u32, data[j]);
-            i += 1;
-            _ = self.set(hashu(hb), i);
-        }
-    }
-
-    // Calculates hash of the first 4 bytes of `b`.
-    inline fn hash(b: *const [4]u8) u32 {
-        return hashu(@as(u32, b[3]) |
-            @as(u32, b[2]) << 8 |
-            @as(u32, b[1]) << 16 |
-            @as(u32, b[0]) << 24);
-    }
-
-    inline fn hashu(v: u32) u32 {
-        return @intCast((v *% prime4) >> consts.lookup.shift);
-    }
-};
-
-test "Lookup add/prev" {
-    const data = [_]u8{
-        0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08,
-        0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08,
-        0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08,
-        0x01, 0x02, 0x03,
-    };
-
-    var h: Lookup = .{};
-    for (data, 0..) |_, i| {
-        const prev = h.add(data[i..], @intCast(i));
-        if (i >= 8 and i < 24) {
-            try expect(prev == i - 8);
-        } else {
-            try expect(prev == 0);
-        }
-    }
-
-    const v = Lookup.hash(data[2 .. 2 + 4]);
-    try expect(h.head[v] == 2 + 16);
-    try expect(h.chain[2 + 16] == 2 + 8);
-    try expect(h.chain[2 + 8] == 2);
-}
-
-test "Lookup bulkAdd" {
-    const data = "Lorem ipsum dolor sit amet, consectetur adipiscing elit.";
-
-    // one by one
-    var h: Lookup = .{};
-    for (data, 0..) |_, i| {
-        _ = h.add(data[i..], @intCast(i));
-    }
-
-    // in bulk
-    var bh: Lookup = .{};
-    bh.bulkAdd(data, data.len, 0);
-
-    try testing.expectEqualSlices(u16, &h.head, &bh.head);
-    try testing.expectEqualSlices(u16, &h.chain, &bh.chain);
-}