refactor deflate
ianic committed Jan 15, 2024
1 parent 6d807b3 commit 8079ef7
177 changes: 84 additions & 93 deletions src/deflate.zig
@@ -37,6 +37,7 @@ pub fn deflate(reader: anytype, writer: anytype) !void {
const tw = try tokenWriter(writer);
var df = Deflate(@TypeOf(tw)).init(tw);
try df.compress(reader);
try df.close();
}
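
The new entry point drives the whole pipeline: wrap the writer in a token writer, feed the reader through compress, then close to emit the final block. A minimal usage sketch, assuming std's stream wrappers (the fixedBufferStream input and ArrayList output are illustrative, not from this commit):

const std = @import("std");

test "deflate usage sketch" {
    var in = std.io.fixedBufferStream("Hello world, Hello world");
    var out = std.ArrayList(u8).init(std.testing.allocator);
    defer out.deinit();
    // Reads everything from `in`, writes a complete deflate stream to `out`.
    try deflate(in.reader(), out.writer());
}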

pub fn Deflate(comptime WriterType: type) type {
@@ -51,69 +52,77 @@ pub fn Deflate(comptime WriterType: type) type {
return .{ .token_writer = w };
}

fn tokenize(self: *Self) bool {
const L = Token.literal;
const M = Token.match;

while (!self.tokens.full()) {
const lh = self.win.lookahead();
if (lh.len == 0) return false;

var token = L(lh[0]);
var length: usize = 1;

const pos = self.win.pos();
var prev = self.hasher.add(lh, @intCast(pos));
var tries: usize = 128;
while (prev != Hasher.not_found and tries > 0) : (tries -= 1) {
const d = pos - prev;
//print("prev: {d} {d} {d}\n", .{ pos, prev, d });
if (d > limits.match.max_distance) break;
const l = self.win.match(prev, pos);
if (l > length) {
token = M(d, l);
length = l;
}
prev = self.hasher.prev(prev);
fn nextToken(self: *Self, min_lookahead: usize) ?Token {
const lh = self.win.lookahead();
if (lh.len <= min_lookahead) return null;

var token = Token.literal(lh[0]);
var length: usize = 1;

const curr_pos = self.win.pos();
var match_pos = self.hasher.add(lh, @intCast(curr_pos)); // TODO: rethink intCast

var tries: usize = 128; // TODO: this is just a hack
while (match_pos != Hasher.not_found and tries > 0) : (tries -= 1) {
const distance = curr_pos - match_pos;
if (distance > limits.match.max_distance or
match_pos < self.win.offset) break;
const match_length = self.win.match(match_pos, curr_pos);
if (match_length > length) {
token = Token.match(distance, match_length);
length = match_length;
}
match_pos = self.hasher.prev(match_pos);
}

self.win.advance(length);
if (length > 1)
self.hasher.bulkAdd(lh[1..], length - 1, @intCast(curr_pos + 1));

return token;
}
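
The loop above walks the hash chain backwards from the most recent candidate (hasher.add returns the previous position with the same hash, hasher.prev steps further back), keeping the longest match seen; the real loop also caps the distance and visits at most 128 chain entries. A brute-force equivalent of that search, as a self-contained sketch (the helper name and return shape are illustrative):

const std = @import("std");

// Finds the longest earlier occurrence of the bytes starting at `pos`,
// scanning every candidate instead of following hash chains.
fn longestMatch(data: []const u8, pos: usize) struct { distance: usize, length: usize } {
    var best_len: usize = 0;
    var best_dist: usize = 0;
    var cand: usize = 0;
    while (cand < pos) : (cand += 1) {
        var l: usize = 0;
        while (pos + l < data.len and data[cand + l] == data[pos + l]) : (l += 1) {}
        if (l > best_len) {
            best_len = l;
            best_dist = pos - cand;
        }
    }
    return .{ .distance = best_dist, .length = best_len };
}

test "longestMatch sketch" {
    // "abc" repeats at distance 3, giving a length-3 match.
    const m = longestMatch("abcabc", 3);
    try std.testing.expect(m.distance == 3 and m.length == 3);
}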

const ProcessOption = enum { none, flush, final };

self.win.advance(length);
if (length > 1)
self.hasher.bulkAdd(lh[1..], length - 1, @intCast(pos + 1));
// Process data in the window and create tokens.
// If the token buffer is full, flush tokens to the token writer.
fn processWindow(self: *Self, opt: ProcessOption) !void {
const min_lookahead: usize = if (opt == .none) limits.match.max_length else 0;

while (self.nextToken(min_lookahead)) |token| {
self.tokens.add(token);
if (self.tokens.full()) try self.flushTokens(false);
}
return true;
}

fn step(self: *Self) !void {
while (self.tokenize())
try self.flush();
if (opt != .none) try self.flushTokens(opt == .final);
}
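
The min_lookahead choice is what separates the three modes: during normal writes (.none) at least limits.match.max_length bytes stay in the lookahead, so a match is never cut short at a buffer boundary, while .flush and .final drop that floor to zero and drain everything. In summary (restating the code above, no new behavior):

// ProcessOption | min_lookahead           | tokens flushed at the end?
// .none         | limits.match.max_length | only when the token buffer fills
// .flush        | 0                       | yes, as a non-final block
// .final        | 0                       | yes, as the final block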

fn flush(self: *Self) !void {
try self.token_writer.write(self.tokens.tokens());
fn flushTokens(self: *Self, final: bool) !void {
try self.token_writer.write(self.tokens.tokens(), final);
self.tokens.reset();
}

pub fn flush(self: *Self) !void {
try self.processWindow(.flush);
}

pub fn close(self: *Self) !void {
try self.step();
try self.flush();
try self.token_writer.close();
try self.processWindow(.final);
}

pub fn write(self: *Self, input: []const u8) !usize {
var buf = input;

while (buf.len > 0) {
try self.step();
const n = self.win.write(buf);
if (n == 0) {
try self.processWindow(.none);
self.slide();
continue;
}
buf = buf[n..];
}
try self.processWindow(.none);

return input.len;
}
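
After this refactor the streaming protocol is: call write any number of times, optionally flush, then close exactly once; the final flag now travels with the tokens instead of through a separate token_writer.close(). A hypothetical caller, reusing the names from this diff:

var df = Deflate(@TypeOf(tw)).init(tw); // tw: a token writer, as in deflate() above
_ = try df.write("first chunk");
_ = try df.write("second chunk");
try df.flush(); // emit buffered tokens as a non-final block
_ = try df.write("last chunk");
try df.close(); // emits the final block; no separate token_writer.close() needed
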
@@ -136,11 +145,10 @@ pub fn Deflate(comptime WriterType: type) type {
const n = try rdr.readAll(buf);
self.win.written(n);
// process window
try self.step();
try self.processWindow(.none);
// no more data in reader
if (n < buf.len) break;
}
try self.close();
}

// Writer interface
@@ -154,40 +162,7 @@ pub fn Deflate(comptime WriterType: type) type {
};
}

fn tokenize(src: []const u8, tokens: *Tokens) void {
const L = Token.literal;
const M = Token.match;

var hasher: Hasher = .{};
var win: StreamWindow = .{};
assert(win.write(src) == src.len);

while (true) {
const lh = win.lookahead();
if (lh.len == 0) break;

var token = L(lh[0]);
var length: usize = 1;

const pos = win.pos();
var prev = hasher.add(lh, @intCast(pos));
while (prev != Hasher.not_found) {
const l = win.match(prev, pos);
if (l > length) {
token = M(pos - prev, l);
length = l;
}
prev = hasher.prev(prev);
}

tokens.add(token);
win.advance(length);
if (length > 0)
hasher.bulkAdd(lh[1..], length - 1, @intCast(pos + 1));
}
}

test "deflate" {
test "deflate: tokenization" {
const L = Token.literal;
const M = Token.match;

@@ -202,15 +177,30 @@
};

for (cases) |c| {
var tokens: Tokens = .{};
tokenize(c.data, &tokens);
var fbs = std.io.fixedBufferStream(c.data);
var nw: TestTokenWriter = .{
.expected = c.tokens,
};
var df = deflateWriter(&nw);
try df.compress(fbs.reader());
try df.close();
try expect(nw.pos == c.tokens.len);
}
}

try expect(tokens.len() == c.tokens.len);
for (c.tokens, 0..) |t, i| {
try expect(t.eql(tokens.at(i)));
// Tests that tokens written are equal to the expected token list.
const TestTokenWriter = struct {
const Self = @This();
expected: []const Token,
pos: usize = 0,

pub fn write(self: *Self, tokens: []const Token, _: bool) !void {
for (tokens) |t| {
try expect(t.eql(self.expected[self.pos]));
self.pos += 1;
}
}
}
};

fn matchLength(src: []const u8, prev: usize, pos: usize) u16 {
assert(prev < pos);
@@ -228,7 +218,7 @@ const StreamWindow = struct {
const hist_len = limits.window.size;
const buffer_len = 2 * hist_len;
const max_rp = buffer_len - (limits.match.min_length + limits.match.max_length);
const max_offset = (1 << 32) - (4 * limits.window.size);
const max_offset = (1 << 32) - (2 * buffer_len);

buffer: [buffer_len]u8 = undefined,
wp: usize = 0, // write position
@@ -284,7 +274,11 @@ const StreamWindow = struct {

// Finds match length between previous and current position.
pub fn match(self: *StreamWindow, prev: usize, curr: usize) usize {
assert(prev > self.offset and curr > prev);
assert(prev >= self.offset and curr > prev);
var p1: usize = prev - self.offset;
var p2: usize = curr - self.offset;
var n: usize = 0;
@@ -409,7 +403,7 @@ const Hasher = struct {
}
};

test "Hasher" {
test "Hasher add/prev" {
const data = [_]u8{
0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08,
0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08,
@@ -492,7 +486,8 @@ const Token = packed struct {
test "Token size" {
try expect(@sizeOf(Token) == 4);
try expect(@bitSizeOf(Token) == 26);
print("size of Hasher {d}\n", .{@sizeOf(Hasher) / 1024});
try expect(@sizeOf(Hasher) == 655_360);
}
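
The packed Token body is collapsed in this diff, but the two size checks pin it down: 26 bits of payload stored in 4 bytes. One layout consistent with those numbers and with deflate's value ranges, purely a guess for illustration (not the actual field order):

const TokenSketch = packed struct {
    kind: enum(u2) { literal, match }, // 2 bits
    symbol: u8, // literal byte, or match length - 3 (3..258 fits in 8 bits)
    distance: u16, // match distance - 1 (1..32768 fits in 16 bits)
}; // 2 + 8 + 16 = 26 bits; @sizeOf rounds up to a 4-byte integer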

const Tokens = struct {
@@ -536,7 +531,7 @@ const Tokens = struct {
}
};

test "zig tar" {
test "deflate compress file to stdout" {
if (true) return error.SkipZigTest;

const file_name = "testdata/2600.txt.utf-8";
@@ -595,7 +590,7 @@ const StdoutTokenWriter = struct {
const std_token = @import("std/token.zig");
const hm_bw = @import("std/huffman_bit_writer.zig");

test "compress file" {
test "deflate compress file" {
const input_file_name = "testdata/2600.txt.utf-8";
var input = try std.fs.cwd().openFile(input_file_name, .{});
defer input.close();
@@ -638,7 +633,7 @@ fn TokenWriter(comptime WriterType: type) type {
return .{ .hw_bw = try hm_bw.huffmanBitWriter(allocator, writer) };
}

pub fn write(self: *Self, tokens: []const Token) !void {
pub fn write(self: *Self, tokens: []const Token, final: bool) !void {
for (tokens, 0..) |t, i| {
self.tokens[i] = switch (t.kind) {
.literal => std_token.literalToken(t.symbol()),
@@ -647,12 +642,8 @@ };
};
}
const std_tokens = self.tokens[0..tokens.len];
try self.hw_bw.writeBlock(std_tokens, false, null);
}

pub fn close(self: *Self) !void {
try self.hw_bw.writeStoredHeader(0, true);
try self.hw_bw.flush();
try self.hw_bw.writeBlock(std_tokens, final, null);
if (final) try self.hw_bw.flush();
}

pub fn deinit(self: *Self) void {
