From a6035227c0d529f05edb042d928d3d696a22fddc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Igor=20Anic=CC=81?= Date: Fri, 9 Feb 2024 22:22:13 +0100 Subject: [PATCH] enable writing store block types Refactor previous huffman only compressor to be huffman or store. Internal logic is the same. Expose huffmanCompress and storeCompress API methods. --- bin/deflate_bench.zig | 18 +++-- src/block_writer.zig | 45 ++++++------ src/deflate.zig | 155 ++++++++++++++++++++++++++++++++++++------ src/root.zig | 74 ++++++++++++++------ 4 files changed, 222 insertions(+), 70 deletions(-) diff --git a/bin/deflate_bench.zig b/bin/deflate_bench.zig index 72f71f2..178be44 100644 --- a/bin/deflate_bench.zig +++ b/bin/deflate_bench.zig @@ -40,9 +40,17 @@ pub fn run(output: anytype, opt: Options) !void { } else { if (opt.level == 0) { switch (opt.alg) { - .deflate => try raw.compressHuffmanOnly(input, output), - .zlib => try zlib.compressHuffmanOnly(input, output), - .gzip => try gzip.compressHuffmanOnly(input, output), + .deflate => try raw.storeCompress(input, output), + .zlib => try zlib.storeCompress(input, output), + .gzip => try gzip.storeCompress(input, output), + } + return; + } + if (opt.level == 1) { + switch (opt.alg) { + .deflate => try raw.huffmanCompress(input, output), + .zlib => try zlib.huffmanCompress(input, output), + .gzip => try gzip.huffmanCompress(input, output), } return; } @@ -136,8 +144,8 @@ pub fn readArgs() !?Options { if (std.mem.eql(u8, a, "-l")) { if (args.next()) |i| { opt.level = try std.fmt.parseInt(u8, i, 10); - if (!(opt.level == 0 or (opt.level >= 4 and opt.level <= 9))) { - print("Compression level must be in range 4-9 or 0 for huffman only!\n", .{}); + if (!(opt.level == 0 or opt.level == 1 or (opt.level >= 4 and opt.level <= 9))) { + print("Compression level must be in range 4-9 or 0 for store, 1 for huffman only!\n", .{}); return error.InvalidArgs; } } else { diff --git a/src/block_writer.zig b/src/block_writer.zig index 2efc917..26dc922 100644 --- 
a/src/block_writer.zig +++ b/src/block_writer.zig @@ -1,5 +1,6 @@ const std = @import("std"); const io = std.io; +const assert = std.debug.assert; const hc = @import("huffman_encoder.zig"); const consts = @import("consts.zig").huffman; @@ -227,7 +228,7 @@ pub fn BlockWriter(comptime WriterType: type) type { // num_distances: The number of distances specified in codegen // num_codegens: The number of codegens used in codegen // eof: Is it the end-of-file? (end of stream) - fn writeDynamicHeader( + fn dynamicHeader( self: *Self, num_literals: u32, num_distances: u32, @@ -273,7 +274,8 @@ pub fn BlockWriter(comptime WriterType: type) type { } } - fn writeStoredHeader(self: *Self, length: usize, eof: bool) Error!void { + fn storedHeader(self: *Self, length: usize, eof: bool) Error!void { + assert(length <= 65535); const flag: u32 = if (eof) 1 else 0; try self.bit_writer.writeBits(flag, 3); try self.flush(); @@ -282,7 +284,7 @@ pub fn BlockWriter(comptime WriterType: type) type { try self.bit_writer.writeBits(~l, 16); } - fn writeFixedHeader(self: *Self, eof: bool) Error!void { + fn fixedHeader(self: *Self, eof: bool) Error!void { // Indicate that we are a fixed Huffman block var value: u32 = 2; if (eof) { @@ -291,17 +293,12 @@ pub fn BlockWriter(comptime WriterType: type) type { try self.bit_writer.writeBits(value, 3); } - // Write a block of tokens with the smallest encoding. + // Write a block of tokens with the smallest encoding. Will choose block type. // The original input can be supplied, and if the huffman encoded data // is larger than the original bytes, the data will be written as a // stored block. // If the input is null, the tokens will always be Huffman encoded. 
- pub fn writeBlock( - self: *Self, - tokens: []const Token, - eof: bool, - input: ?[]const u8, - ) Error!void { + pub fn write(self: *Self, tokens: []const Token, eof: bool, input: ?[]const u8) Error!void { const lit_and_dist = self.indexTokens(tokens); const num_literals = lit_and_dist.num_literals; const num_distances = lit_and_dist.num_distances; @@ -364,23 +361,23 @@ pub fn BlockWriter(comptime WriterType: type) type { // Stored bytes? if (storable and stored_size < size) { - try self.writeBlockStored(input.?, eof); + try self.storedBlock(input.?, eof); return; } // Huffman. if (@intFromPtr(literal_encoding) == @intFromPtr(&self.fixed_literal_encoding)) { - try self.writeFixedHeader(eof); + try self.fixedHeader(eof); } else { - try self.writeDynamicHeader(num_literals, num_distances, num_codegens, eof); + try self.dynamicHeader(num_literals, num_distances, num_codegens, eof); } // Write the tokens. try self.writeTokens(tokens, &literal_encoding.codes, &distance_encoding.codes); } - pub fn writeBlockStored(self: *Self, input: []const u8, eof: bool) Error!void { - try self.writeStoredHeader(input.len, eof); + pub fn storedBlock(self: *Self, input: []const u8, eof: bool) Error!void { + try self.storedHeader(input.len, eof); try self.bit_writer.writeBytes(input); } @@ -389,7 +386,7 @@ pub fn BlockWriter(comptime WriterType: type) type { // histogram distribution. // If input is supplied and the compression savings are below 1/16th of the // input size the block is stored. - fn writeBlockDynamic( + fn dynamicBlock( self: *Self, tokens: []const Token, eof: bool, @@ -418,12 +415,12 @@ pub fn BlockWriter(comptime WriterType: type) type { const ssize = stored_size.size; const storable = stored_size.storable; if (storable and ssize < (size + (size >> 4))) { - try self.writeBlockStored(input.?, eof); + try self.storedBlock(input.?, eof); return; } // Write Huffman table. 
- try self.writeDynamicHeader(num_literals, num_distances, num_codegens, eof); + try self.dynamicHeader(num_literals, num_distances, num_codegens, eof); // Write the tokens. try self.writeTokens(tokens, &self.literal_encoding.codes, &self.distance_encoding.codes); @@ -518,7 +515,7 @@ pub fn BlockWriter(comptime WriterType: type) type { // Encodes a block of bytes as either Huffman encoded literals or uncompressed bytes // if the results only gains very little from compression. - pub fn writeBlockHuff(self: *Self, eof: bool, input: []const u8) Error!void { + pub fn huffmanBlock(self: *Self, input: []const u8, eof: bool) Error!void { // Add everything as literals histogram(input, &self.literal_freq); @@ -553,12 +550,12 @@ pub fn BlockWriter(comptime WriterType: type) type { const storable = stored_size_ret.storable; if (storable and ssize < (size + (size >> 4))) { - try self.writeBlockStored(input, eof); + try self.storedBlock(input, eof); return; } // Huffman. - try self.writeDynamicHeader(num_literals, num_distances, num_codegens, eof); + try self.dynamicHeader(num_literals, num_distances, num_codegens, eof); const encoding = self.literal_encoding.codes[0..257]; for (input) |t| { @@ -638,9 +635,9 @@ const TestFn = enum { final: bool, ) !void { switch (self) { - .write_block => try bw.writeBlock(tok, final, input), - .write_dyn_block => try bw.writeBlockDynamic(tok, final, input), - .write_huffman_block => try bw.writeBlockHuff(final, input.?), + .write_block => try bw.write(tok, final, input), + .write_dyn_block => try bw.dynamicBlock(tok, final, input), + .write_huffman_block => try bw.huffmanBlock(input.?, final), } try bw.flush(); } diff --git a/src/deflate.zig b/src/deflate.zig index f9be943..ff0a4df 100644 --- a/src/deflate.zig +++ b/src/deflate.zig @@ -261,7 +261,7 @@ fn Deflate(comptime container: Container, comptime WriterType: type, comptime Bl } fn flushTokens(self: *Self, final: bool) !void { - try self.block_writer.writeBlock(self.tokens.tokens(), 
final, self.win.tokensBuffer()); + try self.block_writer.write(self.tokens.tokens(), final, self.win.tokensBuffer()); try self.block_writer.flush(); self.tokens.reset(); self.win.flush(); @@ -353,22 +353,65 @@ const Tokens = struct { } }; -pub fn huffmanOnlyCompressor(comptime container: Container, writer: anytype) !HuffmanOnlyCompressor( - container, - @TypeOf(writer), -) { - return try HuffmanOnlyCompressor(container, @TypeOf(writer)).init(writer); -} - /// Creates huffman only deflate blocks. Disables Lempel-Ziv match searching and /// only performs Huffman entropy encoding. Results in faster compression, much /// less memory requirements during compression but bigger compressed sizes. /// -/// Allocates ~11.2K +pub fn HuffmanCompressor(comptime container: Container, comptime WriterType: type) type { + return SimpleCompressor(.huffman, container, WriterType); +} + +pub fn huffmanCompressor(comptime container: Container, writer: anytype) !HuffmanCompressor(container, @TypeOf(writer)) { + return try HuffmanCompressor(container, @TypeOf(writer)).init(writer); +} + +pub fn huffmanCompress(comptime container: Container, reader: anytype, writer: anytype) !void { + var c = try huffmanCompressor(container, writer); + try c.compress(reader); + try c.close(); +} + +/// Creates store blocks only. Data is not compressed, only packed into deflate +/// store blocks. That adds 9 bytes of header for each block. Max stored block +/// size is 64K. Block is emitted when flush or close is called.
/// -pub fn HuffmanOnlyCompressor(comptime container: Container, comptime WriterType: type) type { +pub fn StoreCompressor(comptime container: Container, comptime WriterType: type) type { + return SimpleCompressor(.store, container, WriterType); +} + +pub fn storeCompressor(comptime container: Container, writer: anytype) !StoreCompressor(container, @TypeOf(writer)) { + return try StoreCompressor(container, @TypeOf(writer)).init(writer); +} + +pub fn storeCompress(comptime container: Container, reader: anytype, writer: anytype) !void { + var c = try storeCompressor(container, writer); + try c.compress(reader); + try c.close(); +} + +const SimpleCompressorKind = enum { + huffman, + store, +}; + +fn simpleCompressor( + comptime kind: SimpleCompressorKind, + comptime container: Container, + writer: anytype, +) !SimpleCompressor(kind, container, @TypeOf(writer)) { + return try SimpleCompressor(kind, container, @TypeOf(writer)).init(writer); +} + +fn SimpleCompressor( + comptime kind: SimpleCompressorKind, + comptime container: Container, + comptime WriterType: type, +) type { const BlockWriterType = BlockWriter(WriterType); return struct { + buffer: [65535]u8 = undefined, // because store blocks are limited to 65535 bytes + wp: usize = 0, + wrt: WriterType, block_writer: BlockWriterType, hasher: container.Hasher() = .{}, @@ -384,15 +427,57 @@ pub fn HuffmanOnlyCompressor(comptime container: Container, comptime WriterType: return self; } + pub fn flush(self: *Self) !void { + try self.flushBuffer(false); + } + pub fn close(self: *Self) !void { - try self.block_writer.writeBlockStored("", true); - try self.block_writer.flush(); + try self.flushBuffer(true); try container.writeFooter(&self.hasher, self.wrt); } - pub fn writeBlock(self: *Self, input: []const u8) !void { - self.hasher.update(input); - try self.block_writer.writeBlockHuff(false, input); + fn flushBuffer(self: *Self, final: bool) !void { + const buf = self.buffer[0..self.wp]; + switch (kind) { + .huffman => try 
self.block_writer.huffmanBlock(buf, final), + .store => try self.block_writer.storedBlock(buf, final), + } + try self.block_writer.flush(); + self.wp = 0; + } + + // Writes all data from the input reader of uncompressed data. + // It is up to the caller to call flush or close if there is need to + // output compressed blocks. + pub fn compress(self: *Self, reader: anytype) !void { + while (true) { + // read from rdr into buffer + const buf = self.buffer[self.wp..]; + if (buf.len == 0) { + try self.flushBuffer(false); + continue; + } + const n = try reader.readAll(buf); + self.hasher.update(buf[0..n]); + self.wp += n; + if (n < buf.len) break; // no more data in reader + } + } + + // Writer interface + + pub const Writer = io.Writer(*Self, Error, write); + pub const Error = BlockWriterType.Error; + + // Write `input` of uncompressed data. + pub fn write(self: *Self, input: []const u8) !usize { + var fbs = io.fixedBufferStream(input); + try self.compress(fbs.reader()); + return input.len; + } + + pub fn writer(self: *Self) Writer { + return .{ .context = self }; } }; } @@ -448,7 +533,7 @@ const TestTokenWriter = struct { pub fn init(_: anytype) Self { return .{}; } - pub fn writeBlock(self: *Self, tokens: []const Token, _: bool, _: ?[]const u8) !void { + pub fn write(self: *Self, tokens: []const Token, _: bool, _: ?[]const u8) !void { for (tokens) |t| { self.actual[self.pos] = t; self.pos += 1; @@ -503,9 +588,12 @@ test "check struct sizes" { // var cmp = try std.compress.deflate.compressor(allocator, io.null_writer, .{}); // defer cmp.deinit(); - const HOC = HuffmanOnlyCompressor(.raw, @TypeOf(io.null_writer)); - try expect(@sizeOf(HOC) == 11480); + const HOC = HuffmanCompressor(.raw, @TypeOf(io.null_writer)); //print("size of HOC {d}\n", .{@sizeOf(HOC)}); + try expect(@sizeOf(HOC) == 77024); + // 64K buffer + // 11480 huffman_encoded + // 8 buffer write pointer } test "deflate file tokenization" { @@ -583,7 +671,7 @@ fn TokenDecoder(comptime WriterType: type) type 
{ return .{ .wrt = wrt }; } - pub fn writeBlock(self: *Self, tokens: []const Token, _: bool, _: ?[]const u8) !void { + pub fn write(self: *Self, tokens: []const Token, _: bool, _: ?[]const u8) !void { self.tokens_count += tokens.len; for (tokens) |t| { switch (t.kind) { @@ -606,3 +694,32 @@ fn TokenDecoder(comptime WriterType: type) type { pub fn flush(_: *Self) !void {} }; } + +test "store simple compressor" { + const data = "Hello world!"; + const expected = [_]u8{ + 0x1, // block type 0, final bit set + 0xc, 0x0, // len = 12 + 0xf3, 0xff, // ~len + 'H', 'e', 'l', 'l', 'o', ' ', 'w', 'o', 'r', 'l', 'd', '!', // + //0x48, 0x65, 0x6c, 0x6c, 0x6f, 0x20, 0x77, 0x6f, 0x72, 0x6c, 0x64, 0x21, + }; + + var fbs = std.io.fixedBufferStream(data); + var al = std.ArrayList(u8).init(testing.allocator); + defer al.deinit(); + + var cmp = try storeCompressor(.raw, al.writer()); + try cmp.compress(fbs.reader()); + try cmp.close(); + try testing.expectEqualSlices(u8, &expected, al.items); + + fbs.reset(); + try al.resize(0); + + // huffman only compressor will also emit store block for this small sample + var hc = try huffmanCompressor(.raw, al.writer()); + try hc.compress(fbs.reader()); + try hc.close(); + try testing.expectEqualSlices(u8, &expected, al.items); +} diff --git a/src/root.zig b/src/root.zig index dd26db3..3ffd68f 100644 --- a/src/root.zig +++ b/src/root.zig @@ -45,29 +45,34 @@ fn byContainer(comptime container: Container) type { return try deflate.compressor(container, writer, level); } - pub fn HuffmanOnlyCompressor(comptime WriterType: type) type { - return deflate.HuffmanOnlyCompressor(container, WriterType); + pub fn HuffmanCompressor(comptime WriterType: type) type { + return deflate.HuffmanCompressor(container, WriterType); + } + + pub fn huffmanCompress(reader: anytype, writer: anytype) !void { + try deflate.huffmanCompress(container, reader, writer); } /// Disables Lempel-Ziv match searching and only performs Huffman /// entropy encoding.
Results in faster compression, much less memory /// requirements during compression but bigger compressed sizes. - pub fn huffmanOnlyCompressor(writer: anytype) !HuffmanOnlyCompressor(@TypeOf(writer)) { - return deflate.huffmanOnlyCompressor(container, writer); + pub fn huffmanCompressor(writer: anytype) !HuffmanCompressor(@TypeOf(writer)) { + return deflate.huffmanCompressor(container, writer); } - /// Compress plain data from reader and write them to the writer using - /// huffman only compression algorithm. - pub fn compressHuffmanOnly(reader: anytype, writer: anytype) !void { - var cmp = try huffmanOnlyCompressor(writer); - var buf: [1024 * 64]u8 = undefined; - while (true) { - const n = try reader.readAll(&buf); - if (n == 0) break; - try cmp.writeBlock(buf[0..n]); - if (n < buf.len) break; - } - try cmp.close(); + pub fn StoreCompressor(comptime WriterType: type) type { + return deflate.StoreCompressor(container, WriterType); + } + + /// Does not compress data, only packs it into deflate store blocks. + /// Results in the fastest processing with minimal memory use, but the + /// output is slightly larger than the input.
+ pub fn storeCompressor(writer: anytype) !StoreCompressor(@TypeOf(writer)) { + return deflate.storeCompressor(container, writer); + } + + pub fn storeCompress(reader: anytype, writer: anytype) !void { + try deflate.storeCompress(container, reader, writer); } }; } @@ -165,7 +170,7 @@ test "decompress" { test "compress/decompress" { const fixedBufferStream = std.io.fixedBufferStream; - var cmp_buf: [32 * 1024]u8 = undefined; // compressed data buffer + var cmp_buf: [64 * 1024]u8 = undefined; // compressed data buffer var dcm_buf: [64 * 1024]u8 = undefined; // decompressed data buffer const levels = [_]deflate.Level{ .level_4, .level_5, .level_6, .level_7, .level_8, .level_9 }; @@ -173,11 +178,13 @@ test "compress/decompress" { data: []const u8, // uncompressed content gzip_sizes: [levels.len]usize, // compressed data sizes per level 4-9 huffman_only_size: usize, + store_size: usize, }{ .{ .data = @embedFile("testdata/rfc1951.txt"), .gzip_sizes = [_]usize{ 11513, 11217, 11139, 11126, 11122, 11119 }, - .huffman_only_size = 20291, + .huffman_only_size = 20287, + .store_size = 36967, }, }; @@ -241,16 +248,39 @@ test "compress/decompress" { } // huffman only compression { - const gzip_size = case.huffman_only_size; + inline for (Container.list) |container| { // for each wrapping + const compressed_size = case.huffman_only_size - Container.gzip.size() + container.size(); + // compress original stream to compressed stream + { + var original = fixedBufferStream(data); + var compressed = fixedBufferStream(&cmp_buf); + var cmp = try deflate.huffmanCompressor(container, compressed.writer()); + try cmp.compress(original.reader()); + try cmp.close(); + try testing.expectEqual(compressed_size, compressed.pos); + } + // decompress compressed stream to decompressed stream + { + var compressed = fixedBufferStream(cmp_buf[0..compressed_size]); + var decompressed = fixedBufferStream(&dcm_buf); + try inflate.decompress(container, compressed.reader(), decompressed.writer()); + try 
testing.expectEqualSlices(u8, data, decompressed.getWritten()); + } + } + } + + // store only + { inline for (Container.list) |container| { // for each wrapping - const compressed_size = gzip_size - Container.gzip.size() + container.size(); + const compressed_size = case.store_size - Container.gzip.size() + container.size(); // compress original stream to compressed stream { + var original = fixedBufferStream(data); var compressed = fixedBufferStream(&cmp_buf); - var cmp = try deflate.huffmanOnlyCompressor(container, compressed.writer()); - try cmp.writeBlock(case.data); + var cmp = try deflate.storeCompressor(container, compressed.writer()); + try cmp.compress(original.reader()); try cmp.close(); try testing.expectEqual(compressed_size, compressed.pos); }