diff --git a/src/consts.zig b/src/consts.zig new file mode 100644 index 0000000..f86e357 --- /dev/null +++ b/src/consts.zig @@ -0,0 +1,25 @@ +pub const block = struct { + pub const tokens = 1 << 14; +}; + +pub const match = struct { + pub const base_length = 3; // smallest match length per the RFC section 3.2.5 + pub const min_length = 4; // min length used in this algorithm + pub const max_length = 258; + + pub const min_distance = 1; + pub const max_distance = 32768; +}; + +pub const window = struct { // TODO: consider renaming this into history + pub const bits = 15; + pub const size = 1 << bits; + pub const mask = size - 1; +}; + +pub const hash = struct { + pub const bits = 17; + pub const size = 1 << bits; + pub const mask = size - 1; + pub const shift = 32 - bits; +}; diff --git a/src/deflate.zig b/src/deflate.zig index efa96a0..e7133e8 100644 --- a/src/deflate.zig +++ b/src/deflate.zig @@ -3,32 +3,8 @@ const assert = std.debug.assert; const testing = std.testing; const expect = testing.expect; const print = std.debug.print; -const Token = @import("std/token.zig").Token; - -const limits = struct { - const block = struct { - const tokens = 1 << 14; - }; - const match = struct { - const base_length = 3; // smallest match length per the RFC section 3.2.5 - const min_length = 4; // min length used in this algorithm - const max_length = 258; - - const min_distance = 1; - const max_distance = 32768; - }; - const window = struct { // TODO: consider renaming this into history - const bits = 15; - const size = 1 << bits; - const mask = size - 1; - }; - const hash = struct { - const bits = 17; - const size = 1 << bits; - const mask = size - 1; - const shift = 32 - bits; - }; -}; +const Token = @import("token.zig").Token; +const consts = @import("consts.zig"); pub fn deflateWriter(writer: anytype) Deflate(@TypeOf(writer)) { return Deflate(@TypeOf(writer)).init(writer); @@ -66,11 +42,11 @@ pub fn Deflate(comptime WriterType: type) type { var tries: usize = 128; // TODO: this is just hack while (match_pos != Hasher.not_found and tries > 0) : (tries -= 1) { const distance = curr_pos - match_pos; - if (distance > limits.match.max_distance or + if (distance > consts.match.max_distance or match_pos < self.win.offset) break; const match_length = self.win.match(match_pos, curr_pos); if (match_length > length) { - token = Token.initMatch(distance, match_length); + token = Token.initMatch(@intCast(distance), match_length); length = match_length; } match_pos = self.hasher.prev(match_pos); @@ -88,7 +64,7 @@ pub fn Deflate(comptime WriterType: type) type { // Process data in window and create tokens. // If token buffer is full flush tokens to the token writer. fn processWindow(self: *Self, opt: ProcessOption) !void { - const min_lookahead: usize = if (opt == .none) limits.match.max_length else 0; + const min_lookahead: usize = if (opt == .none) consts.match.max_length else 0; while (self.nextToken(min_lookahead)) |token| { self.tokens.add(token); @@ -216,9 +192,9 @@ fn matchLength(src: []const u8, prev: usize, pos: usize) u16 { } const StreamWindow = struct { - const hist_len = limits.window.size; + const hist_len = consts.window.size; const buffer_len = 2 * hist_len; - const max_rp = buffer_len - (limits.match.min_length + limits.match.max_length); + const max_rp = buffer_len - (consts.match.min_length + consts.match.max_length); const max_offset = (1 << 32) - (2 * buffer_len); buffer: [buffer_len]u8 = undefined, @@ -274,7 +250,7 @@ const StreamWindow = struct { } // Finds match length between previous and current position. - pub fn match(self: *StreamWindow, prev: usize, curr: usize) usize { + pub fn match(self: *StreamWindow, prev: usize, curr: usize) u16 { //if (!(prev > self.offset and curr > prev)) { //if (self.offset > 0) // print("match prev: {d}, self.offset: {d}, curr: {d}\n", .{ prev, self.offset, curr }); @@ -282,13 +258,13 @@ const StreamWindow = struct { assert(prev >= self.offset and curr > prev); var p1: usize = prev - self.offset; var p2: usize = curr - self.offset; - var n: usize = 0; - while (p2 < self.wp and self.buffer[p1] == self.buffer[p2] and n < limits.match.max_length) { + var n: u16 = 0; + while (p2 < self.wp and self.buffer[p1] == self.buffer[p2] and n < consts.match.max_length) { n += 1; p1 += 1; p2 += 1; } - return if (n > limits.match.min_length) n else 0; + return if (n > consts.match.min_length) n else 0; } pub fn pos(self: *StreamWindow) usize { @@ -330,10 +306,10 @@ test "StreamWindow slide" { const Hasher = struct { const mul = 0x1e35a7bd; const not_found = (1 << 32) - 1; - const mask = limits.window.mask; + const mask = consts.window.mask; - head: [limits.hash.size]u32 = [_]u32{not_found} ** limits.hash.size, - chain: [limits.window.size]u32 = [_]u32{not_found} ** (limits.window.size), + head: [consts.hash.size]u32 = [_]u32{not_found} ** consts.hash.size, + chain: [consts.window.size]u32 = [_]u32{not_found} ** (consts.window.size), fn add(self: *Hasher, data: []const u8, idx: u32) u32 { if (data.len < 4) return not_found; @@ -370,7 +346,7 @@ const Hasher = struct { var i: u32 = idx; for (0..len) |j| { const d = data[j..]; - if (d.len < limits.match.min_length) return; + if (d.len < consts.match.min_length) return; _ = self.add(d, i); i += 1; } @@ -380,11 +356,11 @@ const Hasher = struct { return (((@as(u32, b[3]) | @as(u32, b[2]) << 8 | @as(u32, b[1]) << 16 | - @as(u32, b[0]) << 24) *% mul) >> limits.hash.shift) & limits.hash.mask; + @as(u32, b[0]) << 24) *% mul) >> consts.hash.shift) & consts.hash.mask; } fn bulk(b: []u8, dst: []u32) u32 { - if (b.len < limits.match.min_length) { + if (b.len < consts.match.min_length) { return 0; } var hb = @@ -393,12 +369,12 @@ const Hasher = struct { @as(u32, b[1]) << 16 | @as(u32, b[0]) << 24; - dst[0] = (hb *% mul) >> limits.hash.shift; - const end = b.len - limits.match.min_length + 1; + dst[0] = (hb *% mul) >> consts.hash.shift; + const end = b.len - consts.match.min_length + 1; var i: u32 = 1; while (i < end) : (i += 1) { hb = (hb << 8) | @as(u32, b[i + 3]); - dst[i] = (hb *% mul) >> limits.hash.shift; + dst[i] = (hb *% mul) >> consts.hash.shift; } return hb; } @@ -443,7 +419,7 @@ test "Token size" { } const Tokens = struct { - list: [limits.block.tokens]Token = undefined, + list: [consts.block.tokens]Token = undefined, pos: usize = 0, fn add(self: *Tokens, t: Token) void { @@ -456,7 +432,7 @@ const Tokens = struct { } fn full(self: *Tokens) bool { - return self.pos == limits.block.tokens; + return self.pos == consts.block.tokens; } fn reset(self: *Tokens) void { @@ -471,16 +447,6 @@ const Tokens = struct { fn tokens(self: *Tokens) []const Token { return self.list[0..self.pos]; } - - fn toStd(self: *Tokens, s: []std_token.Token) void { - for (self.tokens(), 0..) |t, i| { - s[i] = switch (t.kind) { - .literal => std_token.literalToken(t.symbol()), - .match => std_token.matchToken(t.length(), t.distance()), - else => unreachable, - }; - } - } }; test "deflate compress file to stdout" { @@ -539,7 +505,6 @@ const StdoutTokenWriter = struct { } }; -const std_token = @import("std/token.zig"); const hm_bw = @import("std/huffman_bit_writer.zig"); test "deflate compress file" { @@ -576,7 +541,6 @@ pub fn tokenWriter(writer: anytype) TokenWriter(@TypeOf(writer)) { fn TokenWriter(comptime WriterType: type) type { return struct { hw_bw: hm_bw.HuffmanBitWriter(WriterType), - tokens: [limits.block.tokens]std_token.Token = undefined, const Self = @This(); diff --git a/src/std/huffman_bit_writer.zig b/src/std/huffman_bit_writer.zig index af5da44..4558d15 100644 --- a/src/std/huffman_bit_writer.zig +++ b/src/std/huffman_bit_writer.zig @@ -3,7 +3,7 @@ const io = std.io; const deflate_const = @import("deflate_const.zig"); const hm_code = @import("huffman_code.zig"); -const Token = @import("token.zig").Token; +const Token = @import("../token.zig").Token; // The first length code. const length_codes_start = 257; @@ -24,7 +24,7 @@ const buffer_flush_size = 240; const buffer_size = buffer_flush_size + 8; // The number of extra bits needed by length code X - LENGTH_CODES_START. -var length_extra_bits = [_]u8{ +var length_extra_bits = [_]u8{ // TODO: why not const 0, 0, 0, // 257 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, // 260 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, // 270 @@ -605,7 +605,7 @@ pub fn HuffmanBitWriter(comptime WriterType: type) type { for (tokens) |t| { if (t.kind == Token.Kind.literal) { - self.literal_freq[t.symbol()] += 1; + self.literal_freq[t.literal()] += 1; continue; } self.literal_freq[length_codes_start + t.lengthCode()] += 1; @@ -651,11 +651,11 @@ pub fn HuffmanBitWriter(comptime WriterType: type) type { } for (tokens) |t| { if (t.kind == Token.Kind.literal) { - try self.writeCode(le_codes[t.symbol()]); + try self.writeCode(le_codes[t.literal()]); continue; } // Write the length - const length = t.lc_sym; // TODO: napravi nesto + const length = t.length(); const length_code = t.lengthCode(); try self.writeCode(le_codes[length_code + length_codes_start]); const extra_length_bits = @as(u32, @intCast(length_extra_bits[length_code])); @@ -664,7 +664,7 @@ pub fn HuffmanBitWriter(comptime WriterType: type) type { try self.writeBits(extra_length, extra_length_bits); } // Write the offset - const offset = t.dc; // TODO: + const offset = t.offset(); const offset_code = t.offsetCode(); try self.writeCode(oe_codes[offset_code]); const extra_offset_bits = @as(u32, @intCast(offset_extra_bits[offset_code])); diff --git a/src/std/token.zig b/src/token.zig similarity index 60% rename from src/std/token.zig rename to src/token.zig index 66a54d7..4bca9ec 100644 --- a/src/std/token.zig +++ b/src/token.zig @@ -1,35 +1,10 @@ const std = @import("std"); const assert = std.debug.assert; - -// TODO: remove to common place -const limits = struct { - const block = struct { - const tokens = 1 << 14; - }; - const match = struct { - const base_length = 3; // smallest match length per the RFC section 3.2.5 - const min_length = 4; // min length used in this algorithm - const max_length = 258; - - const min_distance = 1; - const max_distance = 32768; - }; - const window = struct { // TODO: consider renaming this into history - const bits = 15; - const size = 1 << bits; - const mask = size - 1; - }; - const hash = struct { - const bits = 17; - const size = 1 << bits; - const mask = size - 1; - const shift = 32 - bits; - }; -}; +const consts = @import("consts.zig"); // The length code for length X (MIN_MATCH_LENGTH <= X <= MAX_MATCH_LENGTH) // is length_codes[length - MIN_MATCH_LENGTH] -const length_codes = [_]u32{ +const length_codes = [_]u32{ // TODO: why u32 0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 12, 12, 13, 13, 13, 13, 14, 14, 14, 14, 15, 15, @@ -58,7 +33,7 @@ const length_codes = [_]u32{ 27, 27, 27, 27, 27, 28, }; -const offset_codes = [_]u32{ +const offset_codes = [_]u32{ // TODO: why u32 0, 1, 2, 3, 4, 4, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 9, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, @@ -78,67 +53,56 @@ const offset_codes = [_]u32{ }; pub const Token = struct { - pub const Kind = enum(u2) { + pub const Kind = enum(u1) { literal, match, - end_of_block, }; - dc: u16 = 0, // distance code: (1 - 32768) - 1 - lc_sym: u8 = 0, // length code: (3 - 258) - 3, or symbol + off: u15 = 0, // offset: (1 - 32768) - 1 + len_lit: u8 = 0, // length: (3 - 258) - 3, or literal kind: Kind = .literal, - pub fn symbol(t: Token) u8 { - return t.lc_sym; + pub fn literal(t: Token) u8 { + return t.len_lit; } - pub fn distance(t: Token) u16 { - return if (t.kind == .match) @as(u16, t.dc) + limits.match.min_distance else 0; + pub fn offset(t: Token) u16 { + return t.off; } pub fn length(t: Token) u16 { - return if (t.kind == .match) @as(u16, t.lc_sym) + limits.match.base_length else 1; + return t.len_lit; } - pub fn initLiteral(sym: u8) Token { - return .{ .kind = .literal, .lc_sym = sym }; + pub fn initLiteral(lit: u8) Token { + return .{ .kind = .literal, .len_lit = lit }; } - pub fn initMatch(dis: usize, len: usize) Token { - assert(len >= limits.match.min_length and len <= limits.match.max_length); - assert(dis >= limits.match.min_distance and dis <= limits.match.max_distance); + // offset range 1 - 32768, stored in off as 0 - 32767 (u16) + // length range 3 - 258, stored in len_lit as 0 - 255 (u8) + pub fn initMatch(off: u16, len: u16) Token { + assert(len >= consts.match.min_length and len <= consts.match.max_length); + assert(off >= consts.match.min_distance and off <= consts.match.max_distance); return .{ .kind = .match, - .dc = @intCast(dis - limits.match.min_distance), - .lc_sym = @intCast(len - limits.match.base_length), + .off = @intCast(off - consts.match.min_distance), + .len_lit = @intCast(len - consts.match.base_length), }; } - pub fn endOfBlock() Token { - return .{ .kind = .end_of_block }; - } - pub fn eql(t: Token, o: Token) bool { return t.kind == o.kind and - t.dc == o.dc and - t.lc_sym == o.lc_sym; - } - - pub fn string(t: Token) void { - switch (t.kind) { - .literal => std.debug.print("L('{c}') \n", .{t.symbol()}), - .match => std.debug.print("R({d}, {d}) \n", .{ t.distance(), t.length() }), - .end_of_block => std.debug.print("E()", .{}), - } + t.off == o.off and + t.len_lit == o.len_lit; } pub fn lengthCode(t: Token) u32 { - return length_codes[t.lc_sym]; + return length_codes[t.len_lit]; } // Returns the offset code corresponding to a specific offset pub fn offsetCode(t: Token) u32 { - var off: u32 = t.dc; + var off: u32 = t.off; if (off < @as(u32, @intCast(offset_codes.len))) { return offset_codes[off]; } @@ -150,3 +114,15 @@ pub const Token = struct { return offset_codes[off] + 28; } }; + +const print = std.debug.print; + +test "Token size" { + print("bit_offset: {d} {d} {d} size of: {d}\n", .{ + @bitOffsetOf(Token, "off"), + @bitOffsetOf(Token, "len_lit"), + @bitOffsetOf(Token, "kind"), + @sizeOf(Token), + }); + //try expect(@sizeOf(Token) == 4); +}