split processWindow in separate functions
Implement proper bulkHash method.
ianic committed Jan 19, 2024
1 parent 5484f5a commit de45bc5
Showing 1 changed file with 78 additions and 62 deletions.
src/deflate.zig (140 changes: 78 additions & 62 deletions)
@@ -60,70 +60,78 @@ pub fn Deflate(comptime WriterType: type) type {
             // flush - process all data from window
             const flsh = (opt != .none);
 
-            var match: ?Token = self.prev_match;
-            var literal: ?u8 = self.prev_literal;
-
             // While there is data in active lookahead buffer.
             while (self.win.activeLookahead(flsh)) |lh| {
                 var step: usize = 1; // 1 in the case of literal, match length otherwise
                 const pos: usize = self.win.pos();
-                const min_len: u16 = if (match) |m| m.length() else 4;
+                const literal = lh[0]; // literal at current position
+                const min_len: u16 = if (self.prev_match) |m| m.length() else consts.match.min_length;
 
                 // Try to find match at least min_len long.
-                if (self.findMatch(pos, lh, min_len)) |token| {
+                if (self.findMatch(pos, lh, min_len)) |match| {
                     // Found better match than previous.
-                    // Write previous literal (if any).
-                    if (literal) |l| try self.addLiteral(l);
-
-                    const tl = token.length();
-                    if (tl >= level.lazy) {
+                    try self.addPrevLiteral();
+
+                    // Is the found match length good enough?
+                    if (match.length() >= level.lazy) {
                         // Don't try to lazily find a better match, use this one.
-                        try self.addToken(token);
-                        step = tl;
-                        literal = null;
-                        match = null;
+                        step = try self.addMatch(match);
                     } else {
                         // Store this match.
-                        literal = lh[0];
-                        match = token;
+                        self.prev_literal = literal;
+                        self.prev_match = match;
                     }
                 } else {
-                    // There is no better match at current pos the it was previous.
+                    // There is no better match at the current pos than at the previous one.
                     // Write previous match or literal.
                     // If there is no previous match we are advancing 1 step, so remember this literal.
-                    if (match) |m| {
+                    if (self.prev_match) |m| {
                         // Write match from previous position.
-                        try self.addToken(m);
-                        step = m.length() - 1;
-                        literal = null;
-                        match = null;
+                        step = try self.addMatch(m) - 1; // we already advanced 1 from the previous position
                     } else {
                         // No match at previous position.
                         // Write previous literal if any, and remember this literal.
-                        if (literal) |l| try self.addLiteral(l);
-                        literal = lh[0];
+                        try self.addPrevLiteral();
+                        self.prev_literal = literal;
                     }
                 }
                 // Advance window and add hashes.
-                self.hasher.bulkAdd(lh[1..], step - 1, @intCast(pos + 1));
-                self.win.advance(step);
+                self.windowAdvance(step);
             }
 
             if (flsh) {
                 // In the case of flushing, the last few lookahead buffers were smaller than the min match len,
                 // so only the last literal can be unwritten.
-                assert(match == null);
-                if (literal) |l| try self.addLiteral(l);
-                literal = null;
+                assert(self.prev_match == null);
+                try self.addPrevLiteral();
+                self.prev_literal = null;
             }
-            self.prev_literal = literal;
-            self.prev_match = match;
 
             if (flsh) try self.flushTokens(opt == .final);
         }
 
-        inline fn addLiteral(self: *Self, lit: u8) !void {
-            try self.addToken(Token.initLiteral(lit));
+        inline fn windowAdvance(self: *Self, step: usize) void {
+            // Assumes the hash for the current position was already added in findMatch.
+            if (step > 1) {
+                const lh = self.win.lookahead();
+                const pos = self.win.pos();
+                self.hasher.bulkAdd(lh[1..], step - 1, @intCast(pos + 1));
+            }
+            self.win.advance(step);
         }
 
+        // Add previous literal (if any) to the tokens list.
+        inline fn addPrevLiteral(self: *Self) !void {
+            if (self.prev_literal) |l| try self.addToken(Token.initLiteral(l));
+        }
+
+        // Add match to the tokens list, reset prev pointers.
+        // Returns length of the added match.
+        inline fn addMatch(self: *Self, m: Token) !u16 {
+            try self.addToken(m);
+            self.prev_literal = null;
+            self.prev_match = null;
+            return m.length();
+        }
+
         inline fn addToken(self: *Self, token: Token) !void {
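The hunk above keeps the lazy-matching bookkeeping in self.prev_literal and self.prev_match: findMatch is called with min_len set to the previous match's length, so a match found at the current position wins only when it improves on the remembered one; otherwise the previous match is written and the window advances length() - 1 further steps. A minimal sketch of that decision rule, assuming a strict longer-wins comparison; Decision and lazyChoice are hypothetical names for illustration, not part of deflate.zig:

    const std = @import("std");

    const Decision = enum { take_prev_match, literal_then_new_match };

    // Toy stand-in for the prev_match bookkeeping in processWindow: a match
    // at the current position is preferred only if it is longer than the
    // match remembered from the previous position (assumed comparison).
    fn lazyChoice(prev_len: u16, cur_len: u16) Decision {
        return if (cur_len > prev_len) .literal_then_new_match else .take_prev_match;
    }

    test "prefer the longer, later match" {
        // A 3-byte match at the previous position loses to a 5-byte match
        // starting one byte later; the skipped byte is emitted as a literal.
        try std.testing.expectEqual(Decision.literal_then_new_match, lazyChoice(3, 5));
        // Otherwise the previous match is written, and since the window has
        // already advanced one position, only length - 1 steps remain.
        try std.testing.expectEqual(Decision.take_prev_match, lazyChoice(5, 3));
    }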
@@ -429,43 +437,34 @@ const Hasher = struct {
         }
     }
 
-    fn bulkAdd(self: *Hasher, data: []const u8, len: usize, idx: u32) void {
-        if (len == 0) return;
-        // TOOD: use bulk alg from below
-        var i: u32 = idx;
-        for (0..len) |j| {
-            const d = data[j..];
-            if (d.len < consts.match.min_length) return;
-            _ = self.add(d, i);
-            i += 1;
-        }
-    }
-
-    fn hash(b: *const [4]u8) u32 {
-        return (((@as(u32, b[3]) |
-            @as(u32, b[2]) << 8 |
-            @as(u32, b[1]) << 16 |
-            @as(u32, b[0]) << 24) *% mul) >> consts.hash.shift) & consts.hash.mask;
-    }
-
-    fn bulk(b: []u8, dst: []u32) u32 {
-        if (b.len < consts.match.min_length) {
-            return 0;
+    fn bulkAdd(self: *Hasher, b: []const u8, len: usize, idx: u32) void {
+        if (len == 0 or b.len < consts.match.min_length) {
+            return;
         }
         var hb =
             @as(u32, b[3]) |
             @as(u32, b[2]) << 8 |
             @as(u32, b[1]) << 16 |
             @as(u32, b[0]) << 24;
+        _ = self.set(hashu(hb), idx);
 
-        dst[0] = (hb *% mul) >> consts.hash.shift;
-        const end = b.len - consts.match.min_length + 1;
-        var i: u32 = 1;
-        while (i < end) : (i += 1) {
-            hb = (hb << 8) | @as(u32, b[i + 3]);
-            dst[i] = (hb *% mul) >> consts.hash.shift;
+        var i = idx;
+        for (4..@min(len + 3, b.len)) |j| {
+            hb = (hb << 8) | @as(u32, b[j]);
+            i += 1;
+            _ = self.set(hashu(hb), i);
         }
-        return hb;
     }
+
+    inline fn hash(b: *const [4]u8) u32 {
+        return hashu(@as(u32, b[3]) |
+            @as(u32, b[2]) << 8 |
+            @as(u32, b[1]) << 16 |
+            @as(u32, b[0]) << 24);
+    }
+
+    inline fn hashu(v: u32) u32 {
+        return (v *% mul) >> consts.hash.shift;
+    }
 };

@@ -493,6 +492,23 @@ test "Hasher add/prev" {
     try testing.expect(h.chain[2 + 8] == 2);
 }
 
+test "Hasher bulkAdd" {
+    const data = "Lorem ipsum dolor sit amet, consectetur adipiscing elit.";
+
+    // one by one
+    var h: Hasher = .{};
+    for (data, 0..) |_, i| {
+        _ = h.add(data[i..], @intCast(i));
+    }
+
+    // in bulk
+    var bh: Hasher = .{};
+    bh.bulkAdd(data, data.len, 0);
+
+    try testing.expectEqualSlices(u32, &h.head, &bh.head);
+    try testing.expectEqualSlices(u32, &h.chain, &bh.chain);
+}
+
 test "Token size" {
     // // TODO: remove this
     // print("size of Tokens {d}\n", .{
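The rewritten bulkAdd leans on the rolling property of the 4-byte hash: once hb holds four bytes in big-endian order, shifting the next byte in from the right reproduces exactly what hash/hashu would compute one position later, since the oldest byte falls off the 32-bit left edge. A standalone sketch of that equivalence, runnable with zig test; mul and shift below are assumed stand-ins for the package's consts.hash values, not the real constants:

    const std = @import("std");

    // Assumed stand-ins for consts.hash.* in src/deflate.zig.
    const mul: u32 = 0x1e35a7bd;
    const shift = 17;

    fn hashu(v: u32) u32 {
        return (v *% mul) >> shift;
    }

    // Hash of 4 bytes read big-endian, mirroring Hasher.hash.
    fn hash4(b: *const [4]u8) u32 {
        return hashu(@as(u32, b[3]) |
            @as(u32, b[2]) << 8 |
            @as(u32, b[1]) << 16 |
            @as(u32, b[0]) << 24);
    }

    test "rolling update matches recomputing the hash" {
        const data = "abcdeabc";

        // Seed the rolling value with the first 4 bytes, as bulkAdd does.
        var hb: u32 = @as(u32, data[3]) |
            @as(u32, data[2]) << 8 |
            @as(u32, data[1]) << 16 |
            @as(u32, data[0]) << 24;
        try std.testing.expectEqual(hash4(data[0..4]), hashu(hb));

        // Shifting the next byte in is equivalent to re-reading 4 bytes at
        // the new position; Zig's << discards the bits shifted out of u32.
        for (4..data.len) |j| {
            hb = (hb << 8) | @as(u32, data[j]);
            const i = j - 3; // the 4-byte window now starts at position i
            try std.testing.expectEqual(hash4(data[i..][0..4]), hashu(hb));
        }
    }

This is the same invariant the Hasher bulkAdd test checks end-to-end, by comparing head and chain tables built one position at a time against tables built in bulk.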
