Skip to content

Commit

Permalink
Fix/Improve handling of codeword issues
Browse files Browse the repository at this point in the history
  • Loading branch information
soxofaan committed Dec 2, 2021
1 parent a4d705f commit 118d9c2
Show file tree
Hide file tree
Showing 2 changed files with 48 additions and 5 deletions.
19 changes: 15 additions & 4 deletions dahuffman/huffmancodec.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import collections
import itertools
import math
from io import IOBase
import sys
from heapq import heappush, heappop, heapify
Expand Down Expand Up @@ -67,20 +68,30 @@ def ensure_dir(path: Union[str, Path]) -> Path:

class CodeTable:
"""
Code table: mapping a symbol to codes (and vice versa).
Code table: mapping a symbol to codewords (and vice versa).
The symbols are the things you want to encode, usually characters in a string
or byte sequence, but it can be anything hashable.
The codes are the corresponding bit sequences, represented as a tuple (bits, value)
The codewords are the corresponding bit sequences, represented as a tuple (bits, value)
where `bits` is the number of bits and `value` the integer interpretation of these bits.
"""

# TODO: use something like namedtuple or class with slots for codewords instead of tuples?

def __init__(self, symbol_code_map: dict):
self._symbol_map = {}
self._code_map = {}
for symbol, (bits, value) in symbol_code_map.items():
assert isinstance(bits, int) and bits >= 1, f"Invalid bit count {bits}"
assert isinstance(value, int) and value >= 0, f"Invalid code value {value}"
if not (
isinstance(bits, int)
and bits >= 1
and isinstance(value, int)
and value >= 0
and math.log2(max(value, 1)) < bits
):
raise ValueError(
"Invalid code: {b} bits, value {v}".format(b=bits, v=value)
)
self._symbol_map[symbol] = (bits, value)
self._code_map[(bits, value)] = symbol
# TODO check if code table is actually a prefix code
Expand Down
34 changes: 33 additions & 1 deletion tests/test_dahuffman.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,50 @@
# coding=utf-8
import io
import re
import textwrap
from io import StringIO
from pathlib import Path

import pytest

from dahuffman import HuffmanCodec
from dahuffman.huffmancodec import PrefixCodec, _EOF
from dahuffman.huffmancodec import PrefixCodec, _EOF, CodeTable


# TODO test streaming


class TestCodeTable:
    """Tests for building, validating and printing a `CodeTable`."""

    def test_basic(self):
        """Codewords and symbols can be looked up in both directions."""
        expected_pairs = [("a", (1, 0)), ("b", (1, 1))]
        code_table = CodeTable({"a": (1, 0), "b": (1, 1)})
        for symbol, codeword in expected_pairs:
            assert code_table.get_code(symbol) == codeword
        for symbol, codeword in expected_pairs:
            assert code_table.get_symbol(*codeword) == symbol
        assert len(code_table) == 2

    @pytest.mark.parametrize(
        "codes",
        [
            {"a": (0, 0), "b": (1, 1)},
            {"a": (1, 0), "b": (1, -1)},
            {"a": (1, 2), "b": (1, 1)},
        ],
    )
    def test_invalid(self, codes):
        """Constructing a table from a malformed codeword raises ValueError."""
        with pytest.raises(ValueError):
            CodeTable(codes)

    def test_print(self):
        """The table written to a stream matches the expected listing."""
        buffer = StringIO()
        CodeTable({"a": (2, 0), "b": (3, 7)}).print(buffer)
        # The literal lines are kept flush-left exactly as given, so
        # `textwrap.dedent` leaves the expected text unchanged.
        expected = textwrap.dedent(
            """\
Bits Code Value Symbol
2 00 0 'a'
3 111 7 'b'
"""
        )
        assert buffer.getvalue() == expected


def test_prefix_codec():
code_table = {"A": (2, 0), "B": (2, 1), _EOF: (2, 3)}
codec = PrefixCodec(code_table)
Expand Down

0 comments on commit 118d9c2

Please sign in to comment.