-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
It is easier (for me) to track smaller files.
- Loading branch information
Showing
3 changed files
with
299 additions
and
290 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,125 @@ | ||
/// Lookup of the previous locations for the same 4 byte data. Works on hash of | ||
/// 4 bytes data. Head contains position of the first match for each hash. Chain | ||
/// points to the previous position of the same hash given the current location. | ||
/// | ||
const std = @import("std"); | ||
const testing = std.testing; | ||
const expect = testing.expect; | ||
const consts = @import("consts.zig"); | ||
|
||
const Self = @This(); | ||
|
||
// Multiplier applied (with wrapping) to the 4 byte value in `hashu`.
const prime4 = 0x9E3779B1; // 4 bytes prime number 2654435761
// Number of entries in `chain`: twice the history length.
const chain_len = 2 * consts.history.len;

// Maps hash => first position
// Indexed by the value returned from `hashu`. Every entry starts at 0.
head: [consts.lookup.len]u16 = [_]u16{0} ** consts.lookup.len,
// Maps position => previous positions for the same hash value
// Indexed by position. Every entry starts at 0.
chain: [chain_len]u16 = [_]u16{0} ** (chain_len),
|
||
// Hashes the leading 4 bytes of `data` and registers `pos` as the newest
// position for that hash.
// Returns the position that was registered for the same hash before this call.
// When `data` holds fewer than 4 bytes nothing is registered and 0 is returned.
pub fn add(self: *Self, data: []const u8, pos: u16) u16 {
    return if (data.len >= 4) self.set(hash(data[0..4]), pos) else 0;
}
|
||
// Returns the chain entry stored for `pos`: the earlier position that was
// registered with the same hash value.
pub inline fn prev(self: *Self, pos: u16) u16 {
    const earlier = self.chain[pos];
    return earlier;
}
|
||
// Makes `pos` the head entry for hash `h` and stores the former head entry
// in `chain[pos]`. Returns the former head entry.
inline fn set(self: *Self, h: u32, pos: u16) u16 {
    const head_slot = &self.head[h];
    const former = head_slot.*;
    head_slot.* = pos;
    self.chain[pos] = former;
    return former;
}
|
||
// Shifts stored positions down by `n`.
// Every head entry is reduced by `n`, saturating at 0. Chain entries
// `n..2n` are reduced the same way and moved to `0..n`; chain entries at
// index `n` and above are left as they were.
pub fn slide(self: *Self, n: u16) void {
    for (&self.head) |*entry| entry.* = entry.* -| n;
    for (0..n) |dst| {
        self.chain[dst] = self.chain[dst + n] -| n;
    }
}
|
||
// Adds up to `len` consecutive 4 byte hashes from `data` into the lookup.
// The hash that starts at `data[0]` is registered at `pos`, the one that
// starts at `data[1]` at `pos + 1`, and so on. Hashing stops early when fewer
// than 4 bytes remain in `data`.
// Does nothing when `len` is 0 or `data` is shorter than
// `consts.match.min_length`.
// NOTE(review): `data[3]` is read right after that guard, so this presumably
// relies on `consts.match.min_length >= 4` - confirm against consts.zig.
pub fn bulkAdd(self: *Self, data: []const u8, len: u16, pos: u16) void {
    if (len == 0 or data.len < consts.match.min_length) {
        return;
    }
    // Big-endian load of the first 4 bytes, same layout as `hash`.
    var hb =
        @as(u32, data[3]) |
        @as(u32, data[2]) << 8 |
        @as(u32, data[1]) << 16 |
        @as(u32, data[0]) << 24;
    _ = self.set(hashu(hb), pos);

    // Index one past the last byte that completes a hash. Widened to usize
    // before adding: `len + 3` evaluated in u16 overflows for len > 65532.
    const end = @min(@as(usize, len) + 3, data.len);

    var i = pos;
    for (4..end) |j| {
        // Rolling update: drop the oldest byte, append the next one.
        hb = (hb << 8) | @as(u32, data[j]);
        i += 1;
        _ = self.set(hashu(hb), i);
    }
}
|
||
// Hashes the 4 bytes in `b`. The bytes are combined big-endian (b[0] is the
// most significant byte) before being passed to `hashu`.
inline fn hash(b: *const [4]u8) u32 {
    var word: u32 = 0;
    for (b.*) |byte| {
        word = (word << 8) | byte;
    }
    return hashu(word);
}
|
||
// Multiplicative hash: wrapping multiply by `prime4`, then drop the low
// `consts.lookup.shift` bits.
inline fn hashu(v: u32) u32 {
    const mixed: u32 = v *% prime4;
    return @intCast(mixed >> consts.lookup.shift);
}
|
||
test "Lookup add/prev" {
    // The 8 byte pattern 01..08 three times, followed by 01 02 03.
    const data = [_]u8{
        0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08,
        0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08,
        0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08,
        0x01, 0x02, 0x03,
    };

    var lookup: Self = .{};
    var i: usize = 0;
    while (i < data.len) : (i += 1) {
        const previous = lookup.add(data[i..], @intCast(i));
        // Positions 8..23 repeat the 4 bytes seen 8 positions earlier.
        // Everything else is either a first occurrence or too short to hash.
        const expected: usize = if (i >= 8 and i < 24) i - 8 else 0;
        try expect(previous == expected);
    }

    // Walk the chain for the 4 bytes that start at position 2.
    const slot = Self.hash(data[2 .. 2 + 4]);
    try expect(lookup.head[slot] == 2 + 16);
    try expect(lookup.chain[2 + 16] == 2 + 8);
    try expect(lookup.chain[2 + 8] == 2);
}
|
||
test "Lookup bulkAdd" {
    const text = "Lorem ipsum dolor sit amet, consectetur adipiscing elit.";

    // Whole input in a single call.
    var bulk: Self = .{};
    bulk.bulkAdd(text, text.len, 0);

    // Same input, one position at a time.
    var single: Self = .{};
    var i: usize = 0;
    while (i < text.len) : (i += 1) {
        _ = single.add(text[i..], @intCast(i));
    }

    // Both ways must leave identical tables behind.
    try testing.expectEqualSlices(u16, &single.head, &bulk.head);
    try testing.expectEqualSlices(u16, &single.chain, &bulk.chain);
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,147 @@ | ||
const std = @import("std"); | ||
const consts = @import("consts.zig"); | ||
|
||
const expect = testing.expect; | ||
const assert = std.debug.assert; | ||
const testing = std.testing; | ||
|
||
// Buffer of history data.

// Length of one history span; `slide` moves the buffer content by this much.
const hist_len = consts.history.len;
// Total buffer size: two history spans.
const buffer_len = 2 * hist_len;
// Lookahead kept back by `activeLookahead` when not flushing.
const min_lookahead = consts.match.min_length + consts.match.max_length;
// Once `rp` reaches this value `write` returns 0 and `slide` may be called.
const max_rp = buffer_len - min_lookahead;

const Self = @This();

// Backing storage; only `buffer[0..wp]` has been written.
buffer: [buffer_len]u8 = undefined,
wp: usize = 0, // write position
rp: usize = 0, // read position
// Signed because `slide` subtracts hist_len from it; `tokensBuffer` returns
// null while it is negative.
fp: isize = 0, // flush position, tokens are built from fp..rp
|
||
// Copies as much of `buf` as fits behind the write position.
// Returns the number of bytes copied. Returns 0 without copying when the read
// position has reached `max_rp`, which means the window has to be slid first.
pub fn write(self: *Self, buf: []const u8) usize {
    if (self.rp >= max_rp) return 0; // need to slide

    const free = buffer_len - self.wp;
    const count = @min(buf.len, free);
    const dest = self.buffer[self.wp..][0..count];
    @memcpy(dest, buf[0..count]);
    self.wp += count;
    return count;
}
|
||
// Slides the buffer by hist_len.
// The bytes in `hist_len..wp` are moved to the start of the buffer and the
// first hist_len bytes are dropped. `rp`, `wp` and `fp` are all reduced by
// hist_len, so `fp` may become negative.
// Returns the number of bytes moved to the front (`wp - hist_len`).
pub fn slide(self: *Self) u16 {
    assert(self.rp >= max_rp);
    assert(self.wp >= self.rp);

    const kept = self.wp - hist_len;
    @memcpy(self.buffer[0..kept], self.buffer[hist_len..][0..kept]);

    self.fp -= hist_len;
    self.wp -= hist_len;
    self.rp -= hist_len;
    return @intCast(kept);
}
|
||
// Returns the current lookahead when it is long enough to work on.
// When `flush` is true any non-empty lookahead qualifies. Otherwise the
// lookahead must be longer than `min_lookahead`, so that enough data for the
// longest match stays available.
// Returns null if there is not enough data.
pub fn activeLookahead(self: *Self, flush: bool) ?[]const u8 {
    const pending = self.lookahead();
    const reserve: usize = if (flush) 0 else min_lookahead;
    if (pending.len <= reserve) return null;
    return pending;
}
|
||
// Returns the bytes between the read and the write position.
pub inline fn lookahead(self: *Self) []const u8 {
    assert(self.rp <= self.wp);
    const unread = self.buffer[self.rp..self.wp];
    return unread;
}
|
||
// Returns the part of the buffer from the write position to its end.
pub fn writable(self: *Self) []u8 {
    const free = self.buffer[self.wp..];
    return free;
}
|
||
// Moves the write position forward by `n` bytes.
pub fn written(self: *Self, n: usize) void {
    self.wp = self.wp + n;
}
|
||
// Moves the read position forward by `n` bytes. The new read position must
// not pass the write position.
pub fn advance(self: *Self, n: u16) void {
    const next_rp = self.rp + n;
    assert(next_rp <= self.wp);
    self.rp = next_rp;
}
|
||
// Finds the match length between the data at `prev_pos` and at `curr_pos`.
// At most `consts.match.max_length` bytes are compared, and never past the
// write position counted from `curr_pos`.
// `min_len` is the length of a match that is already known. Only a longer
// match is of interest, so 0 is returned when bytes `0..min_len` (inclusive)
// do not all agree.
// Returns the match length, or 0 when it is below `consts.match.min_length`.
pub fn match(self: *Self, prev_pos: u16, curr_pos: u16, min_len: u16) u16 {
    const limit: usize = @min(self.wp - curr_pos, consts.match.max_length);
    // Equal-length views starting at the previous and the current position.
    const older = self.buffer[prev_pos..][0..limit];
    const newer = self.buffer[curr_pos..][0..limit];

    var matched: usize = min_len;
    if (min_len > 0) {
        // Not enough data to beat the match we already have.
        if (limit <= min_len) return 0;
        // Compare from index `min_len` down to 0. A difference is more likely
        // to show up there than when scanning from the first byte.
        var k: usize = matched + 1;
        while (k > 0) {
            k -= 1;
            if (older[k] != newer[k]) return 0;
        }
    }
    // Extend forward for as long as the bytes keep matching.
    while (matched < limit and older[matched] == newer[matched]) matched += 1;
    return if (matched < consts.match.min_length) 0 else @intCast(matched);
}
|
||
// Returns the read position as u16.
pub fn pos(self: *Self) u16 {
    const rp16: u16 = @intCast(self.rp);
    return rp16;
}
|
||
// Moves the flush position up to the current read position.
pub fn flushed(self: *Self) void {
    const rp_signed: isize = @intCast(self.rp);
    self.fp = rp_signed;
}
|
||
// Returns the bytes between the flush and the read position.
// Returns null when the flush position is negative.
pub fn tokensBuffer(self: *Self) ?[]const u8 {
    assert(self.fp <= self.rp);
    if (self.fp >= 0) {
        const start: usize = @intCast(self.fp);
        return self.buffer[start..self.rp];
    }
    return null;
}
|
||
test "SlidingWindow match" {
    const data = "Blah blah blah blah blah!";
    var win: Self = .{};
    try expect(win.write(data) == data.len);
    try expect(win.wp == data.len);
    try expect(win.rp == 0);

    const Case = struct { prev: u16, curr: u16, min: u16, want: u16 };
    const cases = [_]Case{
        // length between l symbols
        .{ .prev = 1, .curr = 6, .min = 0, .want = 18 },
        .{ .prev = 1, .curr = 11, .min = 0, .want = 13 },
        .{ .prev = 1, .curr = 16, .min = 0, .want = 8 },
        .{ .prev = 1, .curr = 21, .min = 0, .want = 0 },
        // position 15 = "blah blah!"
        // position 20 = "blah!"
        .{ .prev = 15, .curr = 20, .min = 0, .want = 4 },
        .{ .prev = 15, .curr = 20, .min = 3, .want = 4 },
        .{ .prev = 15, .curr = 20, .min = 4, .want = 0 },
    };
    for (cases) |c| {
        try expect(win.match(c.prev, c.curr, c.min) == c.want);
    }
}
|
||
test "SlidingWindow slide" {
    const marker: u8 = 0xab;

    // 100 bytes of lookahead, ending 11 bytes before the end of the buffer.
    var win: Self = .{};
    win.wp = buffer_len - 11;
    win.rp = buffer_len - 111;
    win.buffer[win.rp] = marker;
    try expect(win.lookahead().len == 100);
    try expect(win.tokensBuffer().?.len == win.rp);

    const moved = win.slide();
    try expect(moved == 32757);
    // The marker byte moved together with the read position.
    try expect(win.buffer[win.rp] == marker);
    try expect(win.rp == hist_len - 111);
    try expect(win.wp == hist_len - 11);
    try expect(win.lookahead().len == 100);
    // The flush position is negative after the slide.
    try expect(win.tokensBuffer() == null);
}
Oops, something went wrong.