enable writing store block types
Refactor the previous Huffman-only compressor so it can emit either Huffman or store blocks.
The internal logic is unchanged.
Expose huffmanCompress and storeCompress API methods.
ianic committed Feb 9, 2024
1 parent 05d294e commit a603522
Showing 4 changed files with 222 additions and 70 deletions.
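For context, a minimal usage sketch of the two new one-shot helpers (the flate module name and the gzip namespace are assumptions based on how bin/deflate_bench.zig consumes the API):

const std = @import("std");
const flate = @import("flate"); // module name is an assumption

test "one-shot store and huffman compression (sketch)" {
    var in = std.io.fixedBufferStream("data to pack");
    var out = std.ArrayList(u8).init(std.testing.allocator);
    defer out.deinit();

    // Store: no compression, input packed into store blocks (max 64K each).
    try flate.gzip.storeCompress(in.reader(), out.writer());

    // Huffman-only: entropy coding without Lempel-Ziv match searching.
    in.reset();
    out.clearRetainingCapacity();
    try flate.gzip.huffmanCompress(in.reader(), out.writer());
}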
18 changes: 13 additions & 5 deletions bin/deflate_bench.zig
@@ -40,9 +40,17 @@ pub fn run(output: anytype, opt: Options) !void {
} else {
if (opt.level == 0) {
switch (opt.alg) {
.deflate => try raw.compressHuffmanOnly(input, output),
.zlib => try zlib.compressHuffmanOnly(input, output),
.gzip => try gzip.compressHuffmanOnly(input, output),
.deflate => try raw.storeCompress(input, output),
.zlib => try zlib.storeCompress(input, output),
.gzip => try gzip.storeCompress(input, output),
}
return;
}
if (opt.level == 1) {
switch (opt.alg) {
.deflate => try raw.huffmanCompress(input, output),
.zlib => try zlib.huffmanCompress(input, output),
.gzip => try gzip.huffmanCompress(input, output),
}
return;
}
@@ -136,8 +144,8 @@ pub fn readArgs() !?Options {
if (std.mem.eql(u8, a, "-l")) {
if (args.next()) |i| {
opt.level = try std.fmt.parseInt(u8, i, 10);
if (!(opt.level == 0 or (opt.level >= 4 and opt.level <= 9))) {
print("Compression level must be in range 4-9 or 0 for huffman only!\n", .{});
if (!(opt.level == 0 or opt.level == 1 or (opt.level >= 4 and opt.level <= 9))) {
print("Compression level must be in range 4-9 or 0 for store, 1 for huffman only!\n", .{});
return error.InvalidArgs;
}
} else {
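With this change the bench maps compression levels to algorithms: 0 selects the store compressor, 1 the Huffman-only compressor, and 4-9 regular deflate. Illustrative invocations (the binary name is assumed from bin/deflate_bench.zig; only the -l flag appears in this diff):

deflate_bench -l 0   # store blocks only, no compression
deflate_bench -l 1   # Huffman-only entropy coding
deflate_bench -l 6   # regular deflate, levels 4-9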
45 changes: 21 additions & 24 deletions src/block_writer.zig
@@ -1,5 +1,6 @@
const std = @import("std");
const io = std.io;
const assert = std.debug.assert;

const hc = @import("huffman_encoder.zig");
const consts = @import("consts.zig").huffman;
@@ -227,7 +228,7 @@ pub fn BlockWriter(comptime WriterType: type) type {
// num_distances: The number of distances specified in codegen
// num_codegens: The number of codegens used in codegen
// eof: Is it the end-of-file? (end of stream)
fn writeDynamicHeader(
fn dynamicHeader(
self: *Self,
num_literals: u32,
num_distances: u32,
@@ -273,7 +274,8 @@ pub fn BlockWriter(comptime WriterType: type) type {
}
}

fn writeStoredHeader(self: *Self, length: usize, eof: bool) Error!void {
fn storedHeader(self: *Self, length: usize, eof: bool) Error!void {
assert(length <= 65535);
const flag: u32 = if (eof) 1 else 0;
try self.bit_writer.writeBits(flag, 3);
try self.flush();
@@ -282,7 +284,7 @@ pub fn BlockWriter(comptime WriterType: type) type {
try self.bit_writer.writeBits(~l, 16);
}

fn writeFixedHeader(self: *Self, eof: bool) Error!void {
fn fixedHeader(self: *Self, eof: bool) Error!void {
// Indicate that we are a fixed Huffman block
var value: u32 = 2;
if (eof) {
@@ -291,17 +293,12 @@ pub fn BlockWriter(comptime WriterType: type) type {
try self.bit_writer.writeBits(value, 3);
}

// Write a block of tokens with the smallest encoding.
// Write a block of tokens with the smallest encoding. Chooses the block type
// (stored, fixed or dynamic Huffman) accordingly.
// The original input can be supplied, and if the huffman encoded data
// is larger than the original bytes, the data will be written as a
// stored block.
// If the input is null, the tokens will always be Huffman encoded.
pub fn writeBlock(
self: *Self,
tokens: []const Token,
eof: bool,
input: ?[]const u8,
) Error!void {
pub fn write(self: *Self, tokens: []const Token, eof: bool, input: ?[]const u8) Error!void {
const lit_and_dist = self.indexTokens(tokens);
const num_literals = lit_and_dist.num_literals;
const num_distances = lit_and_dist.num_distances;
@@ -364,23 +361,23 @@ pub fn BlockWriter(comptime WriterType: type) type {

// Stored bytes?
if (storable and stored_size < size) {
try self.writeBlockStored(input.?, eof);
try self.storedBlock(input.?, eof);
return;
}

// Huffman.
if (@intFromPtr(literal_encoding) == @intFromPtr(&self.fixed_literal_encoding)) {
try self.writeFixedHeader(eof);
try self.fixedHeader(eof);
} else {
try self.writeDynamicHeader(num_literals, num_distances, num_codegens, eof);
try self.dynamicHeader(num_literals, num_distances, num_codegens, eof);
}

// Write the tokens.
try self.writeTokens(tokens, &literal_encoding.codes, &distance_encoding.codes);
}

pub fn writeBlockStored(self: *Self, input: []const u8, eof: bool) Error!void {
try self.writeStoredHeader(input.len, eof);
pub fn storedBlock(self: *Self, input: []const u8, eof: bool) Error!void {
try self.storedHeader(input.len, eof);
try self.bit_writer.writeBytes(input);
}

@@ -389,7 +386,7 @@ pub fn BlockWriter(comptime WriterType: type) type {
// histogram distribution.
// If input is supplied and the compression savings are below 1/16th of the
// input size, the block is stored.
fn writeBlockDynamic(
fn dynamicBlock(
self: *Self,
tokens: []const Token,
eof: bool,
@@ -418,12 +415,12 @@ pub fn BlockWriter(comptime WriterType: type) type {
const ssize = stored_size.size;
const storable = stored_size.storable;
if (storable and ssize < (size + (size >> 4))) {
try self.writeBlockStored(input.?, eof);
try self.storedBlock(input.?, eof);
return;
}

// Write Huffman table.
try self.writeDynamicHeader(num_literals, num_distances, num_codegens, eof);
try self.dynamicHeader(num_literals, num_distances, num_codegens, eof);

// Write the tokens.
try self.writeTokens(tokens, &self.literal_encoding.codes, &self.distance_encoding.codes);
@@ -518,7 +515,7 @@ pub fn BlockWriter(comptime WriterType: type) type {

// Encodes a block of bytes as either Huffman-encoded literals or uncompressed bytes,
// choosing the latter when compression gains very little.
pub fn writeBlockHuff(self: *Self, eof: bool, input: []const u8) Error!void {
pub fn huffmanBlock(self: *Self, input: []const u8, eof: bool) Error!void {
// Add everything as literals
histogram(input, &self.literal_freq);

@@ -553,12 +550,12 @@ pub fn BlockWriter(comptime WriterType: type) type {
const storable = stored_size_ret.storable;

if (storable and ssize < (size + (size >> 4))) {
try self.writeBlockStored(input, eof);
try self.storedBlock(input, eof);
return;
}

// Huffman.
try self.writeDynamicHeader(num_literals, num_distances, num_codegens, eof);
try self.dynamicHeader(num_literals, num_distances, num_codegens, eof);
const encoding = self.literal_encoding.codes[0..257];

for (input) |t| {
@@ -638,9 +635,9 @@ const TestFn = enum {
final: bool,
) !void {
switch (self) {
.write_block => try bw.writeBlock(tok, final, input),
.write_dyn_block => try bw.writeBlockDynamic(tok, final, input),
.write_huffman_block => try bw.writeBlockHuff(final, input.?),
.write_block => try bw.write(tok, final, input),
.write_dyn_block => try bw.dynamicBlock(tok, final, input),
.write_huffman_block => try bw.huffmanBlock(input.?, final),
}
try bw.flush();
}
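To summarize the renames in this file (a sketch; bw stands for an already constructed BlockWriter, whose construction is outside this diff):

// writeBlock        -> write         (chooses stored/fixed/dynamic, smallest wins)
// writeBlockStored  -> storedBlock
// writeBlockHuff    -> huffmanBlock  (note: parameter order is now input, eof)
// writeBlockDynamic -> dynamicBlock
// write*Header      -> storedHeader / fixedHeader / dynamicHeader
try bw.write(tokens, final, input);
try bw.huffmanBlock(input, final);
try bw.storedBlock(input, final);
try bw.flush();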
155 changes: 136 additions & 19 deletions src/deflate.zig
@@ -261,7 +261,7 @@ fn Deflate(comptime container: Container, comptime WriterType: type, comptime Bl
}

fn flushTokens(self: *Self, final: bool) !void {
try self.block_writer.writeBlock(self.tokens.tokens(), final, self.win.tokensBuffer());
try self.block_writer.write(self.tokens.tokens(), final, self.win.tokensBuffer());
try self.block_writer.flush();
self.tokens.reset();
self.win.flush();
@@ -353,22 +353,65 @@ const Tokens = struct {
}
};

pub fn huffmanOnlyCompressor(comptime container: Container, writer: anytype) !HuffmanOnlyCompressor(
container,
@TypeOf(writer),
) {
return try HuffmanOnlyCompressor(container, @TypeOf(writer)).init(writer);
}

/// Creates Huffman-only deflate blocks. Disables Lempel-Ziv match searching and
/// only performs Huffman entropy encoding. Results in faster compression and much
/// lower memory requirements during compression, but bigger compressed sizes.
///
/// Allocates ~11.2K
pub fn HuffmanCompressor(comptime container: Container, comptime WriterType: type) type {
return SimpleCompressor(.huffman, container, WriterType);
}

pub fn huffmanCompressor(comptime container: Container, writer: anytype) !HuffmanCompressor(container, @TypeOf(writer)) {
return try HuffmanCompressor(container, @TypeOf(writer)).init(writer);
}

pub fn huffmanCompress(comptime container: Container, reader: anytype, writer: anytype) !void {
var c = try huffmanCompressor(container, writer);
try c.compress(reader);
try c.close();
}

/// Creates store blocks only. Data is not compressed, only packed into deflate
/// store blocks, which adds a 5-byte header per block. Max stored block
/// size is 64K. A block is emitted when flush is called or on close.
///
pub fn HuffmanOnlyCompressor(comptime container: Container, comptime WriterType: type) type {
pub fn StoreCompressor(comptime container: Container, comptime WriterType: type) type {
return SimpleCompressor(.store, container, WriterType);
}

pub fn storeCompressor(comptime container: Container, writer: anytype) !StoreCompressor(container, @TypeOf(writer)) {
return try StoreCompressor(container, @TypeOf(writer)).init(writer);
}

pub fn storeCompress(comptime container: Container, reader: anytype, writer: anytype) !void {
var c = try storeCompressor(container, writer);
try c.compress(reader);
try c.close();
}
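As a back-of-the-envelope check of the overhead described above (a sketch; it assumes this file's std import and the 5-byte header demonstrated by the stored-block test at the end of this file):

test "store block overhead (sketch)" {
    // Each stored block costs 1 byte BFINAL/BTYPE (byte-aligned) plus
    // 2 bytes LEN and 2 bytes ~LEN, and carries at most 65535 payload bytes.
    const header_len: usize = 5;
    const max_block: usize = 65535;
    const n: usize = 150_000; // hypothetical input size
    const blocks = (n + max_block - 1) / max_block; // ceiling division
    try std.testing.expectEqual(@as(usize, 3), blocks);
    try std.testing.expectEqual(@as(usize, 15), blocks * header_len);
}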

const SimpleCompressorKind = enum {
huffman,
store,
};

fn simpleCompressor(
comptime kind: SimpleCompressorKind,
comptime container: Container,
writer: anytype,
) !SimpleCompressor(kind, container, @TypeOf(writer)) {
return try SimpleCompressor(kind, container, @TypeOf(writer)).init(writer);
}

fn SimpleCompressor(
comptime kind: SimpleCompressorKind,
comptime container: Container,
comptime WriterType: type,
) type {
const BlockWriterType = BlockWriter(WriterType);
return struct {
buffer: [65535]u8 = undefined, // because store blocks are limited to 65535 bytes
wp: usize = 0,

wrt: WriterType,
block_writer: BlockWriterType,
hasher: container.Hasher() = .{},
@@ -384,15 +427,57 @@ pub fn HuffmanOnlyCompressor(comptime container: Container, comptime WriterType:
return self;
}

pub fn flush(self: *Self) !void {
try self.flushBuffer(false);
}

pub fn close(self: *Self) !void {
try self.block_writer.writeBlockStored("", true);
try self.block_writer.flush();
try self.flushBuffer(true);
try container.writeFooter(&self.hasher, self.wrt);
}

pub fn writeBlock(self: *Self, input: []const u8) !void {
self.hasher.update(input);
try self.block_writer.writeBlockHuff(false, input);
fn flushBuffer(self: *Self, final: bool) !void {
const buf = self.buffer[0..self.wp];
switch (kind) {
.huffman => try self.block_writer.huffmanBlock(buf, final),
.store => try self.block_writer.storedBlock(buf, final),
}
try self.block_writer.flush();
self.wp = 0;
}

// Compresses all data from the input reader. Blocks are emitted as the
// internal buffer fills; it is up to the caller to call flush or close
// to output any remaining buffered data.
pub fn compress(self: *Self, reader: anytype) !void {
while (true) {
// read from the reader into the internal buffer
const buf = self.buffer[self.wp..];
if (buf.len == 0) {
try self.flushBuffer(false);
continue;
}
const n = try reader.readAll(buf);
self.hasher.update(buf[0..n]);
self.wp += n;
if (n < buf.len) break; // no more data in reader
}
}

// Writer interface

pub const Writer = io.Writer(*Self, Error, write);
pub const Error = BlockWriterType.Error;

// Writes `input` bytes of uncompressed data.
pub fn write(self: *Self, input: []const u8) !usize {
var fbs = io.fixedBufferStream(input);
try self.compress(fbs.reader());
return input.len;
}

pub fn writer(self: *Self) Writer {
return .{ .context = self };
}
};
}
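A minimal sketch of the writer interface exposed above, mirroring the stored-block test later in this file:

test "writer interface (sketch)" {
    var out = std.ArrayList(u8).init(testing.allocator);
    defer out.deinit();

    var cmp = try storeCompressor(.raw, out.writer());
    // write() buffers input and emits a store block whenever 64K accumulates.
    try cmp.writer().writeAll("Hello world!");
    // close() flushes the remaining buffered bytes as the final block.
    try cmp.close();
}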
@@ -448,7 +533,7 @@ const TestTokenWriter = struct {
pub fn init(_: anytype) Self {
return .{};
}
pub fn writeBlock(self: *Self, tokens: []const Token, _: bool, _: ?[]const u8) !void {
pub fn write(self: *Self, tokens: []const Token, _: bool, _: ?[]const u8) !void {
for (tokens) |t| {
self.actual[self.pos] = t;
self.pos += 1;
@@ -503,9 +588,12 @@ test "check struct sizes" {
// var cmp = try std.compress.deflate.compressor(allocator, io.null_writer, .{});
// defer cmp.deinit();

const HOC = HuffmanOnlyCompressor(.raw, @TypeOf(io.null_writer));
try expect(@sizeOf(HOC) == 11480);
const HOC = HuffmanCompressor(.raw, @TypeOf(io.null_writer));
//print("size of HOC {d}\n", .{@sizeOf(HOC)});
try expect(@sizeOf(HOC) == 77024);
// 65535-byte buffer + 11480-byte huffman encoder state
// + 8-byte buffer write pointer (+ alignment) = 77024
}

test "deflate file tokenization" {
@@ -583,7 +671,7 @@ fn TokenDecoder(comptime WriterType: type) type {
return .{ .wrt = wrt };
}

pub fn writeBlock(self: *Self, tokens: []const Token, _: bool, _: ?[]const u8) !void {
pub fn write(self: *Self, tokens: []const Token, _: bool, _: ?[]const u8) !void {
self.tokens_count += tokens.len;
for (tokens) |t| {
switch (t.kind) {
@@ -606,3 +694,32 @@ fn TokenDecoder(comptime WriterType: type) type {
pub fn flush(_: *Self) !void {}
};
}

test "store simple compressor" {
const data = "Hello world!";
const expected = [_]u8{
0x1, // block type 0, final bit set
0xc, 0x0, // len = 12
0xf3, 0xff, // ~len
'H', 'e', 'l', 'l', 'o', ' ', 'w', 'o', 'r', 'l', 'd', '!', //
//0x48, 0x65, 0x6c, 0x6c, 0x6f, 0x20, 0x77, 0x6f, 0x72, 0x6c, 0x64, 0x21,
};

var fbs = std.io.fixedBufferStream(data);
var al = std.ArrayList(u8).init(testing.allocator);
defer al.deinit();

var cmp = try storeCompressor(.raw, al.writer());
try cmp.compress(fbs.reader());
try cmp.close();
try testing.expectEqualSlices(u8, &expected, al.items);

fbs.reset();
try al.resize(0);

// the huffman-only compressor will also emit a store block for this small sample
var hc = try huffmanCompressor(.raw, al.writer());
try hc.compress(fbs.reader());
try hc.close();
try testing.expectEqualSlices(u8, &expected, al.items);
}
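For reference, the expected bytes in the test above decode as follows (per RFC 1951, section 3.2.4, non-compressed blocks):

// 0x01      -> 0b001: BFINAL = 1 (last block), BTYPE = 00 (stored);
//              the remaining five bits are byte-alignment padding.
// 0x0c 0x00 -> LEN = 12, little endian ("Hello world!" is 12 bytes).
// 0xf3 0xff -> NLEN = ~LEN = 0xfff3, the one's-complement integrity check.
// The 12 literal bytes then follow uncompressed.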
74 changes: 52 additions & 22 deletions src/root.zig
@@ -45,29 +45,34 @@ fn byContainer(comptime container: Container) type {
return try deflate.compressor(container, writer, level);
}

pub fn HuffmanOnlyCompressor(comptime WriterType: type) type {
return deflate.HuffmanOnlyCompressor(container, WriterType);
pub fn HuffmanCompressor(comptime WriterType: type) type {
return deflate.HuffmanCompressor(container, WriterType);
}

pub fn huffmanCompress(reader: anytype, writer: anytype) !void {
try deflate.huffmanCompress(container, reader, writer);
}

/// Disables Lempel-Ziv match searching and only performs Huffman
/// entropy encoding. Results in faster compression and much lower memory
/// requirements during compression, but bigger compressed sizes.
pub fn huffmanOnlyCompressor(writer: anytype) !HuffmanOnlyCompressor(@TypeOf(writer)) {
return deflate.huffmanOnlyCompressor(container, writer);
pub fn huffmanCompressor(writer: anytype) !HuffmanCompressor(@TypeOf(writer)) {
return deflate.huffmanCompressor(container, writer);
}

/// Compress plain data from reader and write them to the writer using
/// huffman only compression algorithm.
pub fn compressHuffmanOnly(reader: anytype, writer: anytype) !void {
var cmp = try huffmanOnlyCompressor(writer);
var buf: [1024 * 64]u8 = undefined;
while (true) {
const n = try reader.readAll(&buf);
if (n == 0) break;
try cmp.writeBlock(buf[0..n]);
if (n < buf.len) break;
}
try cmp.close();
pub fn StoreCompressor(comptime WriterType: type) type {
return deflate.StoreCompressor(container, WriterType);
}

/// Does not compress at all; only packs data into deflate store blocks.
/// Fastest option with minimal memory requirements, at the cost of output
/// slightly larger than the input.
pub fn storeCompressor(writer: anytype) !StoreCompressor(@TypeOf(writer)) {
return deflate.storeCompressor(container, writer);
}

pub fn storeCompress(reader: anytype, writer: anytype) !void {
try deflate.storeCompress(container, reader, writer);
}
};
}
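These per-container namespaces are presumably instantiated once each in root.zig, along these lines (an assumption; the declarations are outside this diff, but bin/deflate_bench.zig imports raw, zlib and gzip):

pub const raw = byContainer(.raw);
pub const zlib = byContainer(.zlib);
pub const gzip = byContainer(.gzip);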
@@ -165,19 +170,21 @@ test "decompress" {
test "compress/decompress" {
const fixedBufferStream = std.io.fixedBufferStream;

var cmp_buf: [32 * 1024]u8 = undefined; // compressed data buffer
var cmp_buf: [64 * 1024]u8 = undefined; // compressed data buffer
var dcm_buf: [64 * 1024]u8 = undefined; // decompressed data buffer

const levels = [_]deflate.Level{ .level_4, .level_5, .level_6, .level_7, .level_8, .level_9 };
const cases = [_]struct {
data: []const u8, // uncompressed content
gzip_sizes: [levels.len]usize, // compressed data sizes per level 4-9
huffman_only_size: usize,
store_size: usize,
}{
.{
.data = @embedFile("testdata/rfc1951.txt"),
.gzip_sizes = [_]usize{ 11513, 11217, 11139, 11126, 11122, 11119 },
.huffman_only_size = 20291,
.huffman_only_size = 20287,
.store_size = 36967,
},
};

@@ -241,16 +248,39 @@ test "compress/decompress" {
}
// huffman only compression
{
const gzip_size = case.huffman_only_size;
inline for (Container.list) |container| { // for each wrapping
const compressed_size = case.huffman_only_size - Container.gzip.size() + container.size();

// compress original stream to compressed stream
{
var original = fixedBufferStream(data);
var compressed = fixedBufferStream(&cmp_buf);
var cmp = try deflate.huffmanCompressor(container, compressed.writer());
try cmp.compress(original.reader());
try cmp.close();
try testing.expectEqual(compressed_size, compressed.pos);
}
// decompress compressed stream to decompressed stream
{
var compressed = fixedBufferStream(cmp_buf[0..compressed_size]);
var decompressed = fixedBufferStream(&dcm_buf);
try inflate.decompress(container, compressed.reader(), decompressed.writer());
try testing.expectEqualSlices(u8, data, decompressed.getWritten());
}
}
}

// store only
{
inline for (Container.list) |container| { // for each wrapping
const compressed_size = gzip_size - Container.gzip.size() + container.size();
const compressed_size = case.store_size - Container.gzip.size() + container.size();

// compress original stream to compressed stream
{
var original = fixedBufferStream(data);
var compressed = fixedBufferStream(&cmp_buf);
var cmp = try deflate.huffmanOnlyCompressor(container, compressed.writer());
try cmp.writeBlock(case.data);
var cmp = try deflate.storeCompressor(container, compressed.writer());
try cmp.compress(original.reader());
try cmp.close();
try testing.expectEqual(compressed_size, compressed.pos);
}
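As a sanity check on the new store_size constant (a sketch; the 36944-byte file length is inferred from the constants in this test, not measured):

test "store_size accounting (sketch)" {
    // rfc1951.txt fits in a single store block (36944 <= 65535).
    const file_len: usize = 36944;
    const store_header: usize = 5; // one final, byte-aligned stored block
    const gzip_wrap: usize = 18; // 10-byte gzip header + 8-byte footer (CRC32 + ISIZE)
    try testing.expectEqual(@as(usize, 36967), file_len + store_header + gzip_wrap);
}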
