From 118d9c2adbcc1808def816befedd0c7eb093ce85 Mon Sep 17 00:00:00 2001
From: Stefaan Lippens
Date: Thu, 2 Dec 2021 23:25:36 +0100
Subject: [PATCH] Fix/Improve handling of codeword issues

---
Review notes (not part of the commit message):

* The codeword range check uses `value.bit_length() <= bits` instead of
  `math.log2(max(value, 1)) < bits`. math.log2 converts the integer to a
  float, so for large values the result rounds up to a whole number
  (e.g. math.log2(2**64 - 1) == 64.0) and a valid 64-bit codeword would be
  rejected with ValueError. The integer check is exact for any size.
* `import math` is left in place so that the hunk headers and the diffstat
  stay unchanged; it is no longer referenced by the lines in this patch.
* Line breaks and indentation were reconstructed from a whitespace-collapsed
  copy of this patch. Confirm the column padding in the expected output of
  test_print against CodeTable.print before applying.

 dahuffman/huffmancodec.py | 19 +++++++++++++++----
 tests/test_dahuffman.py   | 34 +++++++++++++++++++++++++++++++++-
 2 files changed, 48 insertions(+), 5 deletions(-)

diff --git a/dahuffman/huffmancodec.py b/dahuffman/huffmancodec.py
index 7e08de9..1116ef1 100644
--- a/dahuffman/huffmancodec.py
+++ b/dahuffman/huffmancodec.py
@@ -1,5 +1,6 @@
 import collections
 import itertools
+import math
 from io import IOBase
 import sys
 from heapq import heappush, heappop, heapify
@@ -67,20 +68,30 @@ def ensure_dir(path: Union[str, Path]) -> Path:
 
 class CodeTable:
     """
-    Code table: mapping a symbol to codes (and vice versa).
+    Code table: mapping a symbol to codewords (and vice versa).
 
     The symbols are the things you want to encode, usually characters in a string
     or byte sequence, but it can be anything hashable.
-    The codes are the corresponding bit sequences, represented as a tuple (bits, value)
+    The codewords are the corresponding bit sequences, represented as a tuple (bits, value)
     where `bits` is the number of bits and `value` the integer interpretation of these bits.
     """
 
+    # TODO: use something like namedtuple or class with slots for codewords instead of tuples?
+
     def __init__(self, symbol_code_map: dict):
         self._symbol_map = {}
         self._code_map = {}
         for symbol, (bits, value) in symbol_code_map.items():
-            assert isinstance(bits, int) and bits >= 1, f"Invalid bit count {bits}"
-            assert isinstance(value, int) and value >= 0, f"Invalid code value {value}"
+            if not (
+                isinstance(bits, int)
+                and bits >= 1
+                and isinstance(value, int)
+                and value >= 0
+                and value.bit_length() <= bits  # exact for any int size
+            ):
+                raise ValueError(
+                    "Invalid code: {b} bits, value {v}".format(b=bits, v=value)
+                )
             self._symbol_map[symbol] = (bits, value)
             self._code_map[(bits, value)] = symbol
             # TODO check if code table is actually a prefix code
diff --git a/tests/test_dahuffman.py b/tests/test_dahuffman.py
index 99073cc..3a3f7a4 100644
--- a/tests/test_dahuffman.py
+++ b/tests/test_dahuffman.py
@@ -1,18 +1,50 @@
 # coding=utf-8
 import io
 import re
+import textwrap
 from io import StringIO
 from pathlib import Path
 
 import pytest
 
 from dahuffman import HuffmanCodec
-from dahuffman.huffmancodec import PrefixCodec, _EOF
+from dahuffman.huffmancodec import PrefixCodec, _EOF, CodeTable
 
 
 # TODO test streaming
 
 
+class TestCodeTable:
+    def test_basic(self):
+        table = CodeTable({"a": (1, 0), "b": (1, 1)})
+        assert table.get_code("a") == (1, 0)
+        assert table.get_code("b") == (1, 1)
+        assert table.get_symbol(1, 0) == "a"
+        assert table.get_symbol(1, 1) == "b"
+        assert len(table) == 2
+
+    @pytest.mark.parametrize("codes", [
+        {"a": (0, 0), "b": (1, 1)},
+        {"a": (1, 0), "b": (1, -1)},
+        {"a": (1, 2), "b": (1, 1)},
+    ])
+    def test_invalid(self, codes):
+        with pytest.raises(ValueError):
+            CodeTable(codes)
+
+    def test_print(self):
+        table = CodeTable({"a": (2, 0), "b": (3, 7)})
+        out = StringIO()
+        table.print(out)
+        assert out.getvalue() == textwrap.dedent(
+            """\
+            Bits Code Value Symbol
+               2 00       0 'a'
+               3 111      7 'b'
+            """
+        )
+
+
 def test_prefix_codec():
     code_table = {"A": (2, 0), "B": (2, 1), _EOF: (2, 3)}
     codec = PrefixCodec(code_table)