compress file to valid gzip archive
ianic committed Jan 6, 2024
1 parent 51ff48f commit 82f419b
Showing 6 changed files with 2,447 additions and 18 deletions.
181 changes: 163 additions & 18 deletions src/deflate.zig
@@ -9,12 +9,14 @@ const limits = struct {
const tokens = 1 << 14;
};
const match = struct {
const min_length = 4;
const base_length = 3; // smallest match length per the RFC section 3.2.5
const min_length = 4; // min length used in this algorithm
const max_length = 258;

const min_distance = 1;
const max_distance = 32768;
};
const window = struct {
const window = struct { // TODO: consider renaming this to history
const bits = 15;
const size = 1 << bits;
const mask = size - 1;
@@ -45,7 +47,7 @@ pub fn Deflate(comptime WriterType: type) type {

fn tokenize(self: *Self) bool {
const L = Token.literal;
const R = Token.backreference;
const M = Token.match;

while (!self.tokens.full()) {
const lh = self.win.lookahead();
@@ -61,7 +63,7 @@ pub fn Deflate(comptime WriterType: type) type {
if (d > limits.match.max_distance) break;
const l = self.win.match(prev, pos);
if (l > length) {
token = R(d, l);
token = M(d, l);
length = l;
}
prev = self.hasher.prev(prev);
@@ -90,6 +92,7 @@ pub fn Deflate(comptime WriterType: type) type {
pub fn close(self: *Self) !void {
try self.step();
try self.flush();
try self.token_writer.close();
}

pub fn write(self: *Self, input: []const u8) !usize {
@@ -109,6 +112,16 @@
return input.len;
}

pub fn compress(self: *Self, rdr: anytype) !void {
var buf: [4096]u8 = undefined;
while (true) {
const n = try rdr.readAll(&buf);
_ = try self.write(buf[0..n]);
if (n < buf.len) break;
}
try self.close();
}
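// Usage sketch: `try df.compress(file.reader())` pulls the whole input
// through the 4 KiB buffer; a short read marks end of input, after
// which close() finishes the stream.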

// Writer interface

pub const Writer = std.io.Writer(*Self, Error, write);
@@ -122,7 +135,7 @@

fn tokenize(src: []const u8, tokens: *Tokens) void {
const L = Token.literal;
const R = Token.backreference;
const M = Token.match;

var hasher: Hasher = .{};
var win: StreamWindow = .{};
@@ -140,7 +153,7 @@ fn tokenize(src: []const u8, tokens: *Tokens) void {
while (prev != Hasher.not_found) {
const l = win.match(prev, pos);
if (l > length) {
token = R(pos - prev, l);
token = M(pos - prev, l);
length = l;
}
prev = hasher.prev(prev);
@@ -155,15 +168,15 @@ fn tokenize(src: []const u8, tokens: *Tokens) void {

test "deflate" {
const L = Token.literal;
const R = Token.backreference;
const M = Token.match;

const cases = [_]struct {
data: []const u8,
tokens: []const Token,
}{
.{
.data = "Blah blah blah blah blah!",
.tokens = &[_]Token{ L('B'), L('l'), L('a'), L('h'), L(' '), L('b'), R(5, 18), L('!') },
.tokens = &[_]Token{ L('B'), L('l'), L('a'), L('h'), L(' '), L('b'), M(5, 18), L('!') },
},
};

@@ -380,37 +393,37 @@ test "Hasher" {
const Token = packed struct {
const Kind = enum(u2) {
literal,
match,
end_of_block,
backreference,
};

dc: u16 = 0, // distance code: 1 - 32768
lc_sym: u8 = 0, // length code: 3 - 258, or symbol
dc: u16 = 0, // distance code: (1 - 32768) - 1
lc_sym: u8 = 0, // length code: (3 - 258) - 3, or symbol
kind: Kind = .literal,
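// Example: match(1, 4) is stored as dc = 0 and lc_sym = 1; distance()
// and length() add the bases back and return 1 and 4.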

pub fn symbol(t: Token) u8 {
return t.lc_sym;
}

pub fn distance(t: Token) u16 {
return if (t.kind == .backreference) @as(u16, t.dc) + limits.match.min_distance else 0;
return if (t.kind == .match) @as(u16, t.dc) + limits.match.min_distance else 0;
}

pub fn length(t: Token) u16 {
return if (t.kind == .backreference) @as(u16, t.lc_sym) + limits.match.min_length else 1;
return if (t.kind == .match) @as(u16, t.lc_sym) + limits.match.base_length else 1;
}

pub fn literal(sym: u8) Token {
return .{ .kind = .literal, .lc_sym = sym };
}

pub fn backreference(dis: usize, len: usize) Token {
pub fn match(dis: usize, len: usize) Token {
assert(len >= limits.match.min_length and len <= limits.match.max_length);
assert(dis >= limits.match.min_distance and dis <= limits.match.max_distance);
return .{
.kind = .backreference,
.kind = .match,
.dc = @intCast(dis - limits.match.min_distance),
.lc_sym = @intCast(len - limits.match.min_length),
.lc_sym = @intCast(len - limits.match.base_length),
};
}

@@ -427,7 +440,7 @@ const Token = packed struct {
pub fn string(t: Token) void {
switch (t.kind) {
.literal => std.debug.print("L('{c}') \n", .{t.symbol()}),
.backreference => std.debug.print("R({d}, {d}) \n", .{ t.distance(), t.length() }),
.match => std.debug.print("M({d}, {d}) \n", .{ t.distance(), t.length() }),
.end_of_block => std.debug.print("E()", .{}),
}
}
@@ -468,6 +481,16 @@ const Tokens = struct {
fn tokens(self: *Tokens) []const Token {
return self.list[0..self.pos];
}

fn toStd(self: *Tokens, s: []std_token.Token) void {
for (self.tokens(), 0..) |t, i| {
s[i] = switch (t.kind) {
.literal => std_token.literalToken(t.symbol()),
.match => std_token.matchToken(t.length(), t.distance()),
else => unreachable,
};
}
}
};

test "zig tar" {
@@ -490,6 +513,7 @@ test "zig tar" {
}

const SlidingWindow = @import("sliding_window.zig").SlidingWindow;

const StdoutTokenWriter = struct {
win: SlidingWindow = .{},

@@ -499,7 +523,7 @@ const StdoutTokenWriter = struct {
for (tokens) |t| {
switch (t.kind) {
.literal => self.win.write(t.symbol()),
.backreference => self.win.writeCopy(t.length(), t.distance()),
.match => self.win.writeCopy(t.length(), t.distance()),
else => unreachable,
}
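// Drain the window before it can overflow; 285 is a safe upper bound on
// how many bytes a single token can expand to (max match length is 258).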
if (self.win.free() < 285) {
@@ -517,4 +541,125 @@ const StdoutTokenWriter = struct {
try stdout.writeAll(buf);
}
}

pub fn close(self: *StdoutTokenWriter) !void {
_ = self;
}
};

const std_token = @import("std/token.zig");
const hm_bw = @import("std/huffman_bit_writer.zig");

test "compress file" {
const input_file_name = "testdata/2600.txt.utf-8";
//const input_file_name = "testdata/h";
var input = try std.fs.cwd().openFile(input_file_name, .{});
defer input.close();

const output_file_name = "testdata/output.gz";
var output = try std.fs.cwd().createFile(output_file_name, .{ .truncate = true });
//var in_file = try std.fs.cwd().openFile(in_file_name, .{ .mode = .write_only });
defer output.close();

//var gz = try gzipHeaderFooterWriter(output.writer());

var tw = try tokenWriter(output.writer());
var df = deflate(&tw);

const gzipHeader = [_]u8{ 0x1f, 0x8b, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 };
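// Fixed 10-byte header per RFC 1952: ID1 = 0x1f, ID2 = 0x8b (magic),
// CM = 8 (deflate), FLG = 0, MTIME = 0 (4 bytes), XFL = 0, OS = 0.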
try output.writeAll(&gzipHeader);

var cr = chksumReader(input.reader());
try df.compress(cr.reader());
// var buf: [4096]u8 = undefined;
// while (true) {
// const n = try rdr.readAll(&buf);
// _ = try df.write(buf[0..n]);
// crc.update(buf[0..n]);
// len += n;
// if (n < buf.len) break;
// }
// try df.close();

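// gzip trailer per RFC 1952: CRC-32 of the uncompressed data, then the
// uncompressed length mod 2^32, both little-endian.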
var bits: [4]u8 = undefined;
std.mem.writeInt(u32, &bits, cr.chksum(), .little);
try output.writeAll(&bits);

std.mem.writeInt(u32, &bits, cr.bytesRead(), .little);
try output.writeAll(&bits);
}

pub fn tokenWriter(writer: anytype) !TokenWriter(@TypeOf(writer)) {
return try TokenWriter(@TypeOf(writer)).init(writer);
}

fn TokenWriter(comptime WriterType: type) type {
return struct {
hw_bw: hm_bw.HuffmanBitWriter(WriterType),
tokens: [limits.block.tokens]std_token.Token = undefined,

const Self = @This();

pub fn init(writer: WriterType) !Self {
const allocator = std.heap.page_allocator;
return .{ .hw_bw = try hm_bw.huffmanBitWriter(allocator, writer) };
}

pub fn write(self: *Self, tokens: []const Token) !void {
for (tokens, 0..) |t, i| {
self.tokens[i] = switch (t.kind) {
.literal => std_token.literalToken(t.symbol()),
.match => std_token.matchToken(t.lc_sym, t.dc),
else => unreachable,
};
}
const std_tokens = self.tokens[0..tokens.len];
try self.hw_bw.writeBlock(std_tokens, false, null);
}

pub fn close(self: *Self) !void {
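// An empty stored block with the final-block bit set terminates the
// deflate stream; flush then writes out any remaining buffered bits.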
try self.hw_bw.writeStoredHeader(0, true);
try self.hw_bw.flush();
}

pub fn deinit(self: *Self) void {
self.hw_bw.deinit();
}
};
}

pub fn ChksumReader(comptime ReaderType: type) type {
return struct {
rdr: ReaderType,
bytes: usize = 0,
hasher: std.hash.Crc32 = std.hash.Crc32.init(),

const Self = @This();

pub const Error = ReaderType.Error;
pub const Reader = std.io.Reader(*Self, Error, read);

pub fn reader(self: *Self) Reader {
return .{ .context = self };
}

pub fn read(self: *Self, buf: []u8) Error!usize {
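// Pass-through read: forward to the inner reader, then fold the bytes
// actually read into the running CRC32 and byte count.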
const n = try self.rdr.read(buf);
self.hasher.update(buf[0..n]);
self.bytes += n;
return n;
}

pub fn chksum(self: *Self) u32 {
return self.hasher.final();
}

pub fn bytesRead(self: *Self) u32 {
return @intCast(self.bytes);
}
};
}

pub fn chksumReader(reader: anytype) ChksumReader(@TypeOf(reader)) {
return .{ .rdr = reader };
}
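Taken together, the new pieces compose into a gzip encoder. A minimal sketch of the flow, assuming it lives in src/deflate.zig next to deflate, tokenWriter and chksumReader (gzipFile is a hypothetical name, not part of this commit):

fn gzipFile(input: std.fs.File, output: std.fs.File) !void {
    // Fixed gzip header: magic, CM = 8 (deflate), no flags, no mtime.
    try output.writeAll(&[_]u8{ 0x1f, 0x8b, 0x08, 0, 0, 0, 0, 0, 0, 0 });

    // Deflate blocks go through the Huffman token writer...
    var tw = try tokenWriter(output.writer());
    var df = deflate(&tw);

    // ...while the checksum reader collects CRC32 and byte count.
    var cr = chksumReader(input.reader());
    try df.compress(cr.reader());

    // Trailer: CRC-32, then uncompressed size mod 2^32, little-endian.
    var bits: [4]u8 = undefined;
    std.mem.writeInt(u32, &bits, cr.chksum(), .little);
    try output.writeAll(&bits);
    std.mem.writeInt(u32, &bits, cr.bytesRead(), .little);
    try output.writeAll(&bits);
}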
33 changes: 33 additions & 0 deletions src/std/bits_utils.zig
@@ -0,0 +1,33 @@
const math = @import("std").math;

// Reverse bit-by-bit an N-bit code.
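// E.g. bitReverse(u16, 1, 5): @bitReverse gives 0x8000; shifting right
// by 16 - 5 = 11 yields 0b10000 = 16.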
pub fn bitReverse(comptime T: type, value: T, N: usize) T {
const r = @bitReverse(value);
return r >> @as(math.Log2Int(T), @intCast(@typeInfo(T).Int.bits - N));
}

test "bitReverse" {
const std = @import("std");

const ReverseBitsTest = struct {
in: u16,
bit_count: u5,
out: u16,
};

const reverse_bits_tests = [_]ReverseBitsTest{
.{ .in = 1, .bit_count = 1, .out = 1 },
.{ .in = 1, .bit_count = 2, .out = 2 },
.{ .in = 1, .bit_count = 3, .out = 4 },
.{ .in = 1, .bit_count = 4, .out = 8 },
.{ .in = 1, .bit_count = 5, .out = 16 },
.{ .in = 17, .bit_count = 5, .out = 17 },
.{ .in = 257, .bit_count = 9, .out = 257 },
.{ .in = 29, .bit_count = 5, .out = 23 },
};

for (reverse_bits_tests) |h| {
const v = bitReverse(u16, h.in, h.bit_count);
try std.testing.expectEqual(h.out, v);
}
}
28 changes: 28 additions & 0 deletions src/std/deflate_const.zig
@@ -0,0 +1,28 @@
// Deflate

// Biggest block size for uncompressed block.
pub const max_store_block_size = 65535;
// The special code used to mark the end of a block.
pub const end_block_marker = 256;

// LZ77

// The smallest match length per the RFC section 3.2.5
pub const base_match_length = 3;
// The smallest match offset.
pub const base_match_offset = 1;
// The largest match length.
pub const max_match_length = 258;
// The largest match offset.
pub const max_match_offset = 1 << 15;

// Huffman Codes

// The largest offset code.
pub const offset_code_count = 30;
// Max number of frequencies used for a Huffman Code
// Possible lengths are codegenCodeCount (19), offset_code_count (30) and max_num_lit (286).
// The largest of these is max_num_lit.
pub const max_num_frequencies = max_num_lit;
// Maximum number of literals.
pub const max_num_lit = 286;