From a6035227c0d529f05edb042d928d3d696a22fddc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Igor=20Anic=CC=81?= Date: Fri, 9 Feb 2024 22:22:13 +0100 Subject: [PATCH] enable writing store block types Refactor previous huffman only compressor to be huffman or store. Internal logic is the same. Expose huffmanCompress and storeCompress API methods. --- bin/deflate_bench.zig | 18 +++-- src/block_writer.zig | 45 ++++++------ src/deflate.zig | 155 ++++++++++++++++++++++++++++++++++++------ src/root.zig | 74 ++++++++++++++------ 4 files changed, 222 insertions(+), 70 deletions(-) diff --git a/bin/deflate_bench.zig b/bin/deflate_bench.zig index 72f71f2..178be44 100644 --- a/bin/deflate_bench.zig +++ b/bin/deflate_bench.zig @@ -40,9 +40,17 @@ pub fn run(output: anytype, opt: Options) !void { } else { if (opt.level == 0) { switch (opt.alg) { - .deflate => try raw.compressHuffmanOnly(input, output), - .zlib => try zlib.compressHuffmanOnly(input, output), - .gzip => try gzip.compressHuffmanOnly(input, output), + .deflate => try raw.storeCompress(input, output), + .zlib => try zlib.storeCompress(input, output), + .gzip => try gzip.storeCompress(input, output), + } + return; + } + if (opt.level == 1) { + switch (opt.alg) { + .deflate => try raw.huffmanCompress(input, output), + .zlib => try zlib.huffmanCompress(input, output), + .gzip => try gzip.huffmanCompress(input, output), } return; } @@ -136,8 +144,8 @@ pub fn readArgs() !?Options { if (std.mem.eql(u8, a, "-l")) { if (args.next()) |i| { opt.level = try std.fmt.parseInt(u8, i, 10); - if (!(opt.level == 0 or (opt.level >= 4 and opt.level <= 9))) { - print("Compression level must be in range 4-9 or 0 for huffman only!\n", .{}); + if (!(opt.level == 0 or opt.level == 1 or (opt.level >= 4 and opt.level <= 9))) { + print("Compression level must be in range 4-9 or 0 for store, 1 for huffman only!\n", .{}); return error.InvalidArgs; } } else { diff --git a/src/block_writer.zig b/src/block_writer.zig index 2efc917..26dc922 100644 --- 
a/src/block_writer.zig +++ b/src/block_writer.zig @@ -1,5 +1,6 @@ const std = @import("std"); const io = std.io; +const assert = std.debug.assert; const hc = @import("huffman_encoder.zig"); const consts = @import("consts.zig").huffman; @@ -227,7 +228,7 @@ pub fn BlockWriter(comptime WriterType: type) type { // num_distances: The number of distances specified in codegen // num_codegens: The number of codegens used in codegen // eof: Is it the end-of-file? (end of stream) - fn writeDynamicHeader( + fn dynamicHeader( self: *Self, num_literals: u32, num_distances: u32, @@ -273,7 +274,8 @@ pub fn BlockWriter(comptime WriterType: type) type { } } - fn writeStoredHeader(self: *Self, length: usize, eof: bool) Error!void { + fn storedHeader(self: *Self, length: usize, eof: bool) Error!void { + assert(length <= 65535); const flag: u32 = if (eof) 1 else 0; try self.bit_writer.writeBits(flag, 3); try self.flush(); @@ -282,7 +284,7 @@ pub fn BlockWriter(comptime WriterType: type) type { try self.bit_writer.writeBits(~l, 16); } - fn writeFixedHeader(self: *Self, eof: bool) Error!void { + fn fixedHeader(self: *Self, eof: bool) Error!void { // Indicate that we are a fixed Huffman block var value: u32 = 2; if (eof) { @@ -291,17 +293,12 @@ pub fn BlockWriter(comptime WriterType: type) type { try self.bit_writer.writeBits(value, 3); } - // Write a block of tokens with the smallest encoding. + // Write a block of tokens with the smallest encoding. Will choose block type. // The original input can be supplied, and if the huffman encoded data // is larger than the original bytes, the data will be written as a // stored block. // If the input is null, the tokens will always be Huffman encoded. 
- pub fn writeBlock( - self: *Self, - tokens: []const Token, - eof: bool, - input: ?[]const u8, - ) Error!void { + pub fn write(self: *Self, tokens: []const Token, eof: bool, input: ?[]const u8) Error!void { const lit_and_dist = self.indexTokens(tokens); const num_literals = lit_and_dist.num_literals; const num_distances = lit_and_dist.num_distances; @@ -364,23 +361,23 @@ pub fn BlockWriter(comptime WriterType: type) type { // Stored bytes? if (storable and stored_size < size) { - try self.writeBlockStored(input.?, eof); + try self.storedBlock(input.?, eof); return; } // Huffman. if (@intFromPtr(literal_encoding) == @intFromPtr(&self.fixed_literal_encoding)) { - try self.writeFixedHeader(eof); + try self.fixedHeader(eof); } else { - try self.writeDynamicHeader(num_literals, num_distances, num_codegens, eof); + try self.dynamicHeader(num_literals, num_distances, num_codegens, eof); } // Write the tokens. try self.writeTokens(tokens, &literal_encoding.codes, &distance_encoding.codes); } - pub fn writeBlockStored(self: *Self, input: []const u8, eof: bool) Error!void { - try self.writeStoredHeader(input.len, eof); + pub fn storedBlock(self: *Self, input: []const u8, eof: bool) Error!void { + try self.storedHeader(input.len, eof); try self.bit_writer.writeBytes(input); } @@ -389,7 +386,7 @@ pub fn BlockWriter(comptime WriterType: type) type { // histogram distribution. // If input is supplied and the compression savings are below 1/16th of the // input size the block is stored. - fn writeBlockDynamic( + fn dynamicBlock( self: *Self, tokens: []const Token, eof: bool, @@ -418,12 +415,12 @@ pub fn BlockWriter(comptime WriterType: type) type { const ssize = stored_size.size; const storable = stored_size.storable; if (storable and ssize < (size + (size >> 4))) { - try self.writeBlockStored(input.?, eof); + try self.storedBlock(input.?, eof); return; } // Write Huffman table. 
- try self.writeDynamicHeader(num_literals, num_distances, num_codegens, eof); + try self.dynamicHeader(num_literals, num_distances, num_codegens, eof); // Write the tokens. try self.writeTokens(tokens, &self.literal_encoding.codes, &self.distance_encoding.codes); @@ -518,7 +515,7 @@ pub fn BlockWriter(comptime WriterType: type) type { // Encodes a block of bytes as either Huffman encoded literals or uncompressed bytes // if the results only gains very little from compression. - pub fn writeBlockHuff(self: *Self, eof: bool, input: []const u8) Error!void { + pub fn huffmanBlock(self: *Self, input: []const u8, eof: bool) Error!void { // Add everything as literals histogram(input, &self.literal_freq); @@ -553,12 +550,12 @@ pub fn BlockWriter(comptime WriterType: type) type { const storable = stored_size_ret.storable; if (storable and ssize < (size + (size >> 4))) { - try self.writeBlockStored(input, eof); + try self.storedBlock(input, eof); return; } // Huffman. - try self.writeDynamicHeader(num_literals, num_distances, num_codegens, eof); + try self.dynamicHeader(num_literals, num_distances, num_codegens, eof); const encoding = self.literal_encoding.codes[0..257]; for (input) |t| { @@ -638,9 +635,9 @@ const TestFn = enum { final: bool, ) !void { switch (self) { - .write_block => try bw.writeBlock(tok, final, input), - .write_dyn_block => try bw.writeBlockDynamic(tok, final, input), - .write_huffman_block => try bw.writeBlockHuff(final, input.?), + .write_block => try bw.write(tok, final, input), + .write_dyn_block => try bw.dynamicBlock(tok, final, input), + .write_huffman_block => try bw.huffmanBlock(input.?, final), } try bw.flush(); } diff --git a/src/deflate.zig b/src/deflate.zig index f9be943..ff0a4df 100644 --- a/src/deflate.zig +++ b/src/deflate.zig @@ -261,7 +261,7 @@ fn Deflate(comptime container: Container, comptime WriterType: type, comptime Bl } fn flushTokens(self: *Self, final: bool) !void { - try self.block_writer.writeBlock(self.tokens.tokens(), 
final, self.win.tokensBuffer()); + try self.block_writer.write(self.tokens.tokens(), final, self.win.tokensBuffer()); try self.block_writer.flush(); self.tokens.reset(); self.win.flush(); @@ -353,22 +353,65 @@ const Tokens = struct { } }; -pub fn huffmanOnlyCompressor(comptime container: Container, writer: anytype) !HuffmanOnlyCompressor( - container, - @TypeOf(writer), -) { - return try HuffmanOnlyCompressor(container, @TypeOf(writer)).init(writer); -} - /// Creates huffman only deflate blocks. Disables Lempel-Ziv match searching and /// only performs Huffman entropy encoding. Results in faster compression, much /// less memory requirements during compression but bigger compressed sizes. /// -/// Allocates ~11.2K +pub fn HuffmanCompressor(comptime container: Container, comptime WriterType: type) type { + return SimpleCompressor(.huffman, container, WriterType); +} + +pub fn huffmanCompressor(comptime container: Container, writer: anytype) !HuffmanCompressor(container, @TypeOf(writer)) { + return try HuffmanCompressor(container, @TypeOf(writer)).init(writer); +} + +pub fn huffmanCompress(comptime container: Container, reader: anytype, writer: anytype) !void { + var c = try huffmanCompressor(container, writer); + try c.compress(reader); + try c.close(); +} + +/// Creates store blocks only. Data is not compressed, only packed into deflate +/// store blocks. That adds 9 bytes of header for each block. Max stored block +/// size is 64K. Block is emitted when flush or close is called.
/// -pub fn HuffmanOnlyCompressor(comptime container: Container, comptime WriterType: type) type { +pub fn StoreCompressor(comptime container: Container, comptime WriterType: type) type { + return SimpleCompressor(.store, container, WriterType); +} + +pub fn storeCompressor(comptime container: Container, writer: anytype) !StoreCompressor(container, @TypeOf(writer)) { + return try StoreCompressor(container, @TypeOf(writer)).init(writer); +} + +pub fn storeCompress(comptime container: Container, reader: anytype, writer: anytype) !void { + var c = try storeCompressor(container, writer); + try c.compress(reader); + try c.close(); +} + +const SimpleCompressorKind = enum { + huffman, + store, +}; + +fn simpleCompressor( + comptime kind: SimpleCompressorKind, + comptime container: Container, + writer: anytype, +) !SimpleCompressor(kind, container, @TypeOf(writer)) { + return try SimpleCompressor(kind, container, @TypeOf(writer)).init(writer); +} + +fn SimpleCompressor( + comptime kind: SimpleCompressorKind, + comptime container: Container, + comptime WriterType: type, +) type { const BlockWriterType = BlockWriter(WriterType); return struct { + buffer: [65535]u8 = undefined, // because store blocks are limited to 65535 bytes + wp: usize = 0, + wrt: WriterType, block_writer: BlockWriterType, hasher: container.Hasher() = .{}, @@ -384,15 +427,57 @@ pub fn HuffmanOnlyCompressor(comptime container: Container, comptime WriterType: return self; } + pub fn flush(self: *Self) !void { + try self.flushBuffer(false); + } + pub fn close(self: *Self) !void { - try self.block_writer.writeBlockStored("", true); - try self.block_writer.flush(); + try self.flushBuffer(true); try container.writeFooter(&self.hasher, self.wrt); } - pub fn writeBlock(self: *Self, input: []const u8) !void { - self.hasher.update(input); - try self.block_writer.writeBlockHuff(false, input); + fn flushBuffer(self: *Self, final: bool) !void { + const buf = self.buffer[0..self.wp]; + switch (kind) { + .huffman => try 
self.block_writer.huffmanBlock(buf, final), + .store => try self.block_writer.storedBlock(buf, final), + } + try self.block_writer.flush(); + self.wp = 0; + } + + // Writes all data from the input reader of uncompressed data. + // It is up to the caller to call flush or close if there is need to + // output compressed blocks. + pub fn compress(self: *Self, reader: anytype) !void { + while (true) { + // read from rdr into buffer + const buf = self.buffer[self.wp..]; + if (buf.len == 0) { + try self.flushBuffer(false); + continue; + } + const n = try reader.readAll(buf); + self.hasher.update(buf[0..n]); + self.wp += n; + if (n < buf.len) break; // no more data in reader + } + } + + // Writer interface + + pub const Writer = io.Writer(*Self, Error, write); + pub const Error = BlockWriterType.Error; + + // Write `input` of uncompressed data. + pub fn write(self: *Self, input: []const u8) !usize { + var fbs = io.fixedBufferStream(input); + try self.compress(fbs.reader()); + return input.len; + } + + pub fn writer(self: *Self) Writer { + return .{ .context = self }; } }; } @@ -448,7 +533,7 @@ const TestTokenWriter = struct { pub fn init(_: anytype) Self { return .{}; } - pub fn writeBlock(self: *Self, tokens: []const Token, _: bool, _: ?[]const u8) !void { + pub fn write(self: *Self, tokens: []const Token, _: bool, _: ?[]const u8) !void { for (tokens) |t| { self.actual[self.pos] = t; self.pos += 1; @@ -503,9 +588,12 @@ test "check struct sizes" { // var cmp = try std.compress.deflate.compressor(allocator, io.null_writer, .{}); // defer cmp.deinit(); - const HOC = HuffmanOnlyCompressor(.raw, @TypeOf(io.null_writer)); - try expect(@sizeOf(HOC) == 11480); + const HOC = HuffmanCompressor(.raw, @TypeOf(io.null_writer)); //print("size of HOC {d}\n", .{@sizeOf(HOC)}); + try expect(@sizeOf(HOC) == 77024); + // 64K buffer + // 11480 huffman_encoded + // 8 buffer write pointer } test "deflate file tokenization" { @@ -583,7 +671,7 @@ fn TokenDecoder(comptime WriterType: type) type 
{ return .{ .wrt = wrt }; } - pub fn writeBlock(self: *Self, tokens: []const Token, _: bool, _: ?[]const u8) !void { + pub fn write(self: *Self, tokens: []const Token, _: bool, _: ?[]const u8) !void { self.tokens_count += tokens.len; for (tokens) |t| { switch (t.kind) { @@ -606,3 +694,32 @@ fn TokenDecoder(comptime WriterType: type) type { pub fn flush(_: *Self) !void {} }; } + +test "store simple compressor" { + const data = "Hello world!"; + const expected = [_]u8{ + 0x1, // block type 0, final bit set + 0xc, 0x0, // len = 12 + 0xf3, 0xff, // ~len + 'H', 'e', 'l', 'l', 'o', ' ', 'w', 'o', 'r', 'l', 'd', '!', // + //0x48, 0x65, 0x6c, 0x6c, 0x6f, 0x20, 0x77, 0x6f, 0x72, 0x6c, 0x64, 0x21, + }; + + var fbs = std.io.fixedBufferStream(data); + var al = std.ArrayList(u8).init(testing.allocator); + defer al.deinit(); + + var cmp = try storeCompressor(.raw, al.writer()); + try cmp.compress(fbs.reader()); + try cmp.close(); + try testing.expectEqualSlices(u8, &expected, al.items); + + fbs.reset(); + try al.resize(0); + + // huffman only compressor will also emit store block for this small sample + var hc = try huffmanCompressor(.raw, al.writer()); + try hc.compress(fbs.reader()); + try hc.close(); + try testing.expectEqualSlices(u8, &expected, al.items); +} diff --git a/src/root.zig b/src/root.zig index dd26db3..3ffd68f 100644 --- a/src/root.zig +++ b/src/root.zig @@ -45,29 +45,34 @@ fn byContainer(comptime container: Container) type { return try deflate.compressor(container, writer, level); } - pub fn HuffmanOnlyCompressor(comptime WriterType: type) type { - return deflate.HuffmanOnlyCompressor(container, WriterType); + pub fn HuffmanCompressor(comptime WriterType: type) type { + return deflate.HuffmanCompressor(container, WriterType); + } + + pub fn huffmanCompress(reader: anytype, writer: anytype) !void { + try deflate.huffmanCompress(container, reader, writer); } /// Disables Lempel-Ziv match searching and only performs Huffman /// entropy encoding.
Results in faster compression, much less memory /// requirements during compression but bigger compressed sizes. - pub fn huffmanOnlyCompressor(writer: anytype) !HuffmanOnlyCompressor(@TypeOf(writer)) { - return deflate.huffmanOnlyCompressor(container, writer); + pub fn huffmanCompressor(writer: anytype) !HuffmanCompressor(@TypeOf(writer)) { + return deflate.huffmanCompressor(container, writer); } - /// Compress plain data from reader and write them to the writer using - /// huffman only compression algorithm. - pub fn compressHuffmanOnly(reader: anytype, writer: anytype) !void { - var cmp = try huffmanOnlyCompressor(writer); - var buf: [1024 * 64]u8 = undefined; - while (true) { - const n = try reader.readAll(&buf); - if (n == 0) break; - try cmp.writeBlock(buf[0..n]); - if (n < buf.len) break; - } - try cmp.close(); + pub fn StoreCompressor(comptime WriterType: type) type { + return deflate.StoreCompressor(container, WriterType); + } + + /// Does not compress data, only packs it into deflate store blocks. + /// Results in the fastest processing with minimal memory use, but the + /// output is slightly larger than the input.
+ pub fn storeCompressor(writer: anytype) !StoreCompressor(@TypeOf(writer)) { + return deflate.storeCompressor(container, writer); + } + + pub fn storeCompress(reader: anytype, writer: anytype) !void { + try deflate.storeCompress(container, reader, writer); } }; } @@ -165,7 +170,7 @@ test "decompress" { test "compress/decompress" { const fixedBufferStream = std.io.fixedBufferStream; - var cmp_buf: [32 * 1024]u8 = undefined; // compressed data buffer + var cmp_buf: [64 * 1024]u8 = undefined; // compressed data buffer var dcm_buf: [64 * 1024]u8 = undefined; // decompressed data buffer const levels = [_]deflate.Level{ .level_4, .level_5, .level_6, .level_7, .level_8, .level_9 }; @@ -173,11 +178,13 @@ test "compress/decompress" { data: []const u8, // uncompressed content gzip_sizes: [levels.len]usize, // compressed data sizes per level 4-9 huffman_only_size: usize, + store_size: usize, }{ .{ .data = @embedFile("testdata/rfc1951.txt"), .gzip_sizes = [_]usize{ 11513, 11217, 11139, 11126, 11122, 11119 }, - .huffman_only_size = 20291, + .huffman_only_size = 20287, + .store_size = 36967, }, }; @@ -241,16 +248,39 @@ test "compress/decompress" { } // huffman only compression { - const gzip_size = case.huffman_only_size; + inline for (Container.list) |container| { // for each wrapping + const compressed_size = case.huffman_only_size - Container.gzip.size() + container.size(); + // compress original stream to compressed stream + { + var original = fixedBufferStream(data); + var compressed = fixedBufferStream(&cmp_buf); + var cmp = try deflate.huffmanCompressor(container, compressed.writer()); + try cmp.compress(original.reader()); + try cmp.close(); + try testing.expectEqual(compressed_size, compressed.pos); + } + // decompress compressed stream to decompressed stream + { + var compressed = fixedBufferStream(cmp_buf[0..compressed_size]); + var decompressed = fixedBufferStream(&dcm_buf); + try inflate.decompress(container, compressed.reader(), decompressed.writer()); + try 
testing.expectEqualSlices(u8, data, decompressed.getWritten()); + } + } + } + + // store only + { inline for (Container.list) |container| { // for each wrapping - const compressed_size = gzip_size - Container.gzip.size() + container.size(); + const compressed_size = case.store_size - Container.gzip.size() + container.size(); // compress original stream to compressed stream { + var original = fixedBufferStream(data); var compressed = fixedBufferStream(&cmp_buf); - var cmp = try deflate.huffmanOnlyCompressor(container, compressed.writer()); - try cmp.writeBlock(case.data); + var cmp = try deflate.storeCompressor(container, compressed.writer()); + try cmp.compress(original.reader()); try cmp.close(); try testing.expectEqual(compressed_size, compressed.pos); }