From 2a370f74c07d9b7c1b05d803aaf588879767df43 Mon Sep 17 00:00:00 2001 From: Stefaan Lippens Date: Sat, 13 Jul 2024 22:25:06 +0200 Subject: [PATCH] Issue #17 initial steps to decouple code table IO from PrefixCodec - Refactor out pickle save/load from PrefixCode - Initial JSON based code table storage --- dahuffman/codecs/__init__.py | 3 +- dahuffman/codecs/json-compact.json | 1 + dahuffman/codecs/json.json | 1 + dahuffman/codecs/shakespeare-lower.json | 1 + dahuffman/codecs/shakespeare-raw.json | 1 + dahuffman/codecs/shakespeare.json | 1 + dahuffman/codecs/xml.json | 1 + dahuffman/codetableio.py | 132 ++++++++++++++++++++++++ dahuffman/huffmancodec.py | 52 +++------- tests/test_codetableio.py | 63 +++++++++++ tests/test_dahuffman.py | 2 +- train/json-data.py | 23 ++++- train/shakespeare.py | 34 +++++- train/train_utils.py | 4 +- train/xml-data.py | 8 +- 15 files changed, 277 insertions(+), 50 deletions(-) create mode 100644 dahuffman/codecs/json-compact.json create mode 100644 dahuffman/codecs/json.json create mode 100644 dahuffman/codecs/shakespeare-lower.json create mode 100644 dahuffman/codecs/shakespeare-raw.json create mode 100644 dahuffman/codecs/shakespeare.json create mode 100644 dahuffman/codecs/xml.json create mode 100644 dahuffman/codetableio.py create mode 100644 tests/test_codetableio.py diff --git a/dahuffman/codecs/__init__.py b/dahuffman/codecs/__init__.py index d8c53c8..9533519 100644 --- a/dahuffman/codecs/__init__.py +++ b/dahuffman/codecs/__init__.py @@ -9,6 +9,7 @@ from functools import partial from pathlib import Path +import dahuffman.codetableio from dahuffman.huffmancodec import PrefixCodec @@ -22,7 +23,7 @@ def load(name: str) -> PrefixCodec: if not name.endswith(".pickle"): name = name + ".pickle" with importlib.resources.path("dahuffman.codecs", resource=name) as path: - return PrefixCodec.load(path) + return dahuffman.codetableio.pickle_load(path) load_shakespeare = partial(load, "shakespeare") diff --git 
a/dahuffman/codecs/json-compact.json b/dahuffman/codecs/json-compact.json new file mode 100644 index 0000000..56d9b69 --- /dev/null +++ b/dahuffman/codecs/json-compact.json @@ -0,0 +1 @@ +{"type":"dahuffman code table","version":1,"code_table":[["-",5,0],["5",5,1],["{",7,8],["y",7,9],[".",6,5],["t",5,3],["9",5,4],["a",5,5],["D",7,24],["I",8,50],["z",9,102],["x",9,103],["d",6,13],["B",7,28],["E",7,29],["p",7,30],["J",11,496],["%",12,994],["'",14,3980],["Z",14,3981],[">",13,1991],["Y",10,249],["T",9,125],["O",9,126],["M",9,127],["2",5,8],["l",5,9],["3",5,10],["4",5,11],["\"",3,3],[":",6,32],["h",7,66],["q",10,536],["G",11,1074],["K",11,1075],["/",9,269],["P",9,270],["L",9,271],["e",5,17],["n",5,18],["v",8,152],["_",8,153],["H",9,308],["k",9,309],["\\",8,155],["s",6,39],[" ",5,20],["C",7,84],["b",8,170],["S",8,171],["]",8,172],["[",8,173],["A",7,87],["0",4,11],[",",4,12],["u",6,52],["~",10,848],["X",11,1698],["*",13,6796],["=",14,13594],["+",16,54380],[";",16,54381],["&",16,54382],["?",19,435065],["#",18,217533],["<",17,108767],["V",12,3399],[")",10,850],["(",10,851],["f",8,213],["m",7,107],["1",5,27],["r",6,56],["g",8,228],["w",8,229],["N",8,230],["j",10,924],["Q",12,3700],["@",12,3701],["W",11,1851],["R",10,926],["U",10,927],["i",6,58],["8",6,59],["o",6,60],["7",6,61],["6",6,62],["F",8,252],["}",8,253],["c",7,127]],"eof_code":[19,435064],"metadata":{"frequencies":{"{":7773,"\"":167778,"m":12999,"e":40821,"t":34222,"a":34406,":":19344,"v":5582,"i":29239,"w":6928,"d":17319,"k":3013,"u":25503,"6":30930,"-":31180,"n":41836,"x":2274,",":101928,"D":8359,"o":29553,"g":6808,"r":27010,"p":9126,"h":10043,"c":15557," 
":46824,"S":6054,"s":23004,"B":8840,"y":7924,"Z":84,"C":11851,"b":5917,"f":6500,"Y":1210,"l":37539,"(":1582,")":1581,"R":1864,"0":99430,"G":642,"A":12529,"1":52881,"3":38723,"7":30603,"5":32443,"4":39551,"z":2211,"T":2361,"8":29389,"2":36755,"F":7721,"J":545,"U":1887,"9":34397,"O":2411,"E":9084,"I":4262,"L":2738,"M":2483,"[":6124,"N":7237,"_":5684,"}":7761,"]":6104,"j":1773,".":16877,"P":2724,"K":710,"W":936,"H":2761,"V":403,"/":2641,"q":1246,"?":3,"~":1517,"\\":5915,";":24,"X":762,"Q":419,"'":60,"*":180,"@":441,"&":27,"%":265,"<":18,">":146,"+":20,"=":92,"#":9}},"concat":"str_join"} diff --git a/dahuffman/codecs/json.json b/dahuffman/codecs/json.json new file mode 100644 index 0000000..f62d5b6 --- /dev/null +++ b/dahuffman/codecs/json.json @@ -0,0 +1 @@ +{"type":"dahuffman code table","version":1,"code_table":[[" ",2,0],["4",5,8],["p",7,36],["J",11,592],["'",14,4744],[";",15,9490],["&",15,9491],[">",13,2373],["\u2019",16,18992],["<",16,18993],["\u00b3",20,303905],["\u00b5",20,303906],["\u2018",20,303907],["\u201c",20,303908],["\u201d",20,303909],["?",19,151955],["#",17,37989],["+",16,18995],["\u2013",14,4749],["*",13,2375],["Y",10,297],["T",9,149],["O",9,150],["M",9,151],["h",7,38],["q",10,312],["G",11,626],["K",11,627],["P",9,157],["\\",8,79],["n",5,10],["e",5,11],[":",6,24],["H",9,200],["L",9,201],["v",8,101],["C",7,51],["s",6,26],["/",9,216],["~",10,434],["(",10,435],["_",8,109],["]",8,110],["[",8,111],["0",4,7],[",",4,8],["S",8,144],["k",9,290],[")",10,582],["X",11,1166],["V",12,2334],["=",14,9340],["Z",14,9341],["%",13,4671],["A",7,73],["u",6,37],["1",5,19],["b",8,160],["f",8,161],["m",7,81],["r",6,41],["8",6,42],["w",8,172],["g",8,173],["N",8,174],["j",10,700],["Q",12,2804],["@",12,2805],["W",11,1403],["R",10,702],["U",10,703],["7",6,44],["6",6,45],["i",6,46],["-",6,47],["o",6,48],["5",6,49],["F",8,200],["y",8,201],["c",7,101],["}",8,204],["{",8,205],[".",7,103],["9",6,52],["D",8,212],["B",8,213],["I",9,428],["z",10,858],["x",10,859],["E",8,215],["a",6,54],["
2",6,55],["t",6,56],["d",7,114],["\n",7,115],["l",6,58],["3",6,59],["\"",4,15]],"eof_code":[20,303904],"metadata":{"frequencies":{"{":7221,"\n":16612," ":274544,"\"":139628,"m":12060,"e":37559,"t":31650,"a":31095,":":18551,"v":4984,"i":26909,"w":6193,"d":15858,"k":2694,"u":22373,"6":26443,"-":26960,"n":37383,"x":2007,",":83974,"D":7319,"o":27011,"g":6197,"r":24175,"p":8411,"h":8986,"c":14405,"S":5342,"s":20773,"B":7595,"y":7193,"Z":83,"C":10275,"b":5411,"f":5884,"Y":1049,"l":32957,"(":1348,")":1348,"R":1633,"0":80916,"G":590,"A":10839,"1":45087,"3":33133,"7":26092,"5":27695,"4":34520,"z":1919,"T":2119,"8":25079,"2":31617,"F":6684,"J":459,"U":1711,"9":29803,"O":2178,"E":7787,"I":3761,"L":2424,"M":2202,"[":5294,"N":6502,"_":5261,"}":7207,"]":5271,"j":1582,".":14620,"P":2345,"K":656,"W":843,"H":2404,"V":338,"/":2499,"q":1077,"?":3,"~":1311,"\\":4819,"\u2013":73,";":24,"X":668,"Q":362,"'":50,"*":149,"@":423,"&":27,"%":198,"\u00b5":1,"\u00b3":1,"<":17,">":125,"+":20,"=":74,"#":9,"\u2019":10,"\u2018":1,"\u201c":1,"\u201d":1}},"concat":"str_join"} diff --git a/dahuffman/codecs/shakespeare-lower.json b/dahuffman/codecs/shakespeare-lower.json new file mode 100644 index 0000000..d71dbc5 --- /dev/null +++ b/dahuffman/codecs/shakespeare-lower.json @@ -0,0 +1 @@ +{"type":"dahuffman code table","version":1,"code_table":[["r",4,0],["h",4,1],["n",4,2],["s",4,3],["b",6,16],["\u2019",8,68],["!",9,138],["z",11,556],["2",14,4456],["5",15,8914],["&",16,17830],["|",17,35662],["*",17,35663],["\u201d",14,4458],[")",14,4459],["(",14,4460],["8",15,8922],["6",15,8923],["\"",15,8924],["0",15,8925],["\u201c",14,4463],[":",10,279],["'",8,70],[";",8,71],["u",5,9],["i",4,5],["\n",5,12],["g",6,26],["k",7,54],["v",7,55],["a",4,7],["d",5,16],[".",6,34],["f",6,35],["o",4,9],["t",4,10],["l",5,22],["c",6,46],[",",6,47],[" 
",3,6],["w",6,56],["y",6,57],["_",10,928],["j",10,929],["?",9,465],["x",10,932],["\u2014",12,3732],["\u2018",14,14932],["\u00e8",17,119464],["\u00e6",17,119465],["7",16,59733],["3",15,29867],["1",14,14934],["\u00e9",17,119480],["/",19,477924],["\u00e0",19,477925],["\u0153",21,1911704],["\u00ee",22,3823410],["#",23,7646823],["\t",22,3823412],["$",22,3823413],["%",23,7646828],["@",23,7646829],["\\",23,7646830],["`",23,7646831],["}",22,3823416],["\u00e2",22,3823417],["\u00ea",21,1911709],["\u00e7",20,955855],["9",16,59741],["4",15,29871],["]",11,1867],["-",10,934],["[",11,1870],["q",11,1871],["p",7,117],["m",6,59],["e",4,15]],"eof_code":[23,7646822],"metadata":{"frequencies":{"\n":138037,"p":61600,"r":252082,"o":332873,"j":4910,"e":481144,"c":92002,"t":354271," ":823018,"g":72877,"u":137495,"n":260496,"b":64105,"\u2019":14526,"s":266719,"h":255777,"m":117542,"l":180842,"w":96316,"k":37816,"f":86188,"i":269305,"a":309773,",":92277,"y":99531,"d":158820,"v":40214,".":83846,"-":6324,":":4523,"*":38,"(":294,"2":247,"0":173,"1":390,"7":94,")":293,"9":107,"4":218,"[":3333,"#":1,"]":3324,"3":188,"8":151,"x":5330,";":17964,"z":1840,"\u2018":361,"?":11061,"q":3953,"5":122,"6":158,"!":8591,"\u00e6":43,"&":62,"\u2014":1412,"\u201c":353,"\u201d":284,"_":4651,"\"":170,"'":17806,"|":32,"\u0153":2,"\u00e0":13,"\u00e9":45,"\u00e8":38,"\u00e2":2,"\u00e7":9,"\u00ee":1,"\u00ea":5,"`":1,"\t":2,"}":2,"\\":1,"/":12,"%":1,"@":1,"$":2}},"concat":"str_join"} diff --git a/dahuffman/codecs/shakespeare-raw.json b/dahuffman/codecs/shakespeare-raw.json new file mode 100644 index 0000000..385f671 --- /dev/null +++ b/dahuffman/codecs/shakespeare-raw.json @@ -0,0 +1 @@ +{"type":"dahuffman code table","version":1,"code_table":[[" 
",2,0],["a",4,4],["d",5,10],["c",6,22],["E",7,46],["v",7,47],["o",4,6],["f",6,28],["T",7,58],["C",8,118],["x",10,476],["(",14,7632],["8",15,15266],["6",15,15267],["\"",15,15268],["%",22,1954432],["@",22,1954433],["\\",22,1954434],["`",22,1954435],["}",21,977218],["\u00e2",21,977219],["\u00e7",19,244305],["\u00c9",20,488612],["\u00ea",20,488613],["/",19,244307],["\u00e6",17,61077],["7",16,30539],["\u201c",14,7635],["\u2014",12,1909],["q",11,955],["G",9,239],["l",5,15],["t",4,8],["w",6,36],[".",6,37],["\n",5,19],["P",9,320],["?",9,321],["L",8,161],["A",7,81],["y",6,41],[",",6,42],["F",9,344],["K",10,690],["j",11,1382],["V",11,1383],["R",8,173],["b",7,87],["p",7,88],["N",8,178],["-",10,716],["]",11,1434],["[",11,1435],["U",9,359],["m",6,45],["i",5,23],["e",4,12],["r",5,26],["I",7,108],["D",9,436],["B",9,437],["O",8,219],["\u2019",9,440],["M",9,441],["S",8,221],["g",7,111],["n",5,28],["s",5,29],["h",5,30],["u",6,62],["k",8,252],["Y",10,1012],["0",15,32416],["3",15,32417],["\u2018",14,16209],["X",14,16210],["1",14,16211],["J",12,4053],[":",11,2027],["W",9,507],["H",9,508],["'",9,509],["!",10,1020],["\u00e9",17,130688],["\u0153",21,2091024],["\u00c6",21,2091025],["\u00ee",22,4182052],["#",23,8364107],["\t",22,4182054],["$",22,4182055],["\u00e0",19,522757],["|",18,261379],["9",16,65345],["4",15,32673],["2",15,32674],["5",16,65350],["&",17,130702],["*",18,261406],["\u00e8",18,261407],["\u201d",15,32676],[")",15,32677],["Z",14,16339],["Q",13,8170],["z",13,8171],["_",11,2043],[";",9,511]],"eof_code":[23,8364106],"metadata":{"frequencies":{"\n":164202,"P":10725,"r":227404,"o":304523,"j":3050,"e":444276,"c":73035,"t":315370," 
":1104779,"G":10529,"u":124306,"n":235066,"b":50694,"g":62348,"\u2019":14526,"s":235725,"T":38901,"h":238755,"C":18967,"m":102883,"p":50875,"l":158708,"W":16698,"k":31929,"f":74755,"i":217354,"a":264813,"S":30994,",":92277,"y":92141,"B":13411,"w":79618,"U":13189,"d":145498,"v":37036,".":83846,"Y":7390,"-":6324,"L":22134,"I":51951,":":4523,"*":38,"O":28350,"N":25430,"E":36868,"(":294,"2":247,"0":173,"1":390,"7":94,")":293,"A":44960,"R":24678,"D":13322,"J":1860,"9":107,"4":218,"[":3333,"#":1,"]":3324,"M":14659,"3":188,"F":11433,"8":151,"H":17022,"K":5887,"X":382,"V":3178,";":17964,"z":1240,"\u2018":361,"x":4948,"?":11061,"q":2725,"5":122,"6":158,"!":8591,"\u00e6":40,"&":62,"\u2014":1412,"\u201c":353,"\u201d":284,"_":4651,"Q":1228,"\"":170,"'":17806,"|":32,"Z":600,"\u0153":2,"\u00c6":3,"\u00e0":13,"\u00c9":5,"\u00e9":40,"\u00e8":38,"\u00e2":2,"\u00e7":9,"\u00ee":1,"\u00ea":5,"`":1,"\t":2,"}":2,"\\":1,"/":12,"%":1,"@":1,"$":2}},"concat":"str_join"} diff --git a/dahuffman/codecs/shakespeare.json b/dahuffman/codecs/shakespeare.json new file mode 100644 index 0000000..d59f20c --- /dev/null +++ b/dahuffman/codecs/shakespeare.json @@ -0,0 +1 @@ +{"type":"dahuffman code 
table","version":1,"code_table":[["n",4,0],["s",4,1],["h",4,2],["u",5,6],["k",7,28],["Y",9,116],["0",14,3744],["3",14,3745],["\u2018",13,1873],["X",13,1874],["1",13,1875],["J",11,469],[":",10,235],["W",8,59],["H",8,60],["'",8,61],["!",9,124],["\u00e9",16,16000],["\u0153",20,256016],["\u00c6",20,256017],["\u00ee",21,512036],["#",22,1024075],["\t",21,512038],["$",21,512039],["\u00e0",18,64005],["|",17,32003],["9",15,8001],["4",14,4001],["2",14,4002],["5",15,8006],["&",16,16014],["*",17,32030],["\u00e8",17,32031],["\u201d",14,4004],[")",14,4005],["Z",13,2003],["Q",12,1002],["z",12,1003],["_",10,251],[";",8,63],["a",4,4],["\n",5,10],["d",5,11],["c",6,24],["E",7,50],["v",7,51],["f",6,26],["T",7,54],["C",8,110],["x",10,444],["(",14,7120],["8",15,14242],["6",15,14243],["\"",15,14244],["%",22,1823360],["@",22,1823361],["\\",22,1823362],["`",22,1823363],["}",21,911682],["\u00e2",21,911683],["\u00e7",19,227921],["\u00c9",20,455844],["\u00ea",20,455845],["/",19,227923],["\u00e6",17,56981],["7",16,28491],["\u201c",14,7123],["\u2014",12,1781],["q",11,891],["G",9,223],["o",4,7],["t",4,8],["l",5,18],["w",6,38],[".",6,39],["P",9,320],["?",9,321],["L",8,161],["A",7,81],["y",6,41],[",",6,42],["F",9,344],["K",10,690],["j",11,1382],["V",11,1383],["R",8,173],["b",7,87],["p",7,88],["N",8,178],["-",10,716],["]",11,1434],["[",11,1435],["U",9,359],["m",6,45],["i",5,23],[" ",3,6],["e",4,14],["r",5,30],["I",7,124],["D",9,500],["B",9,501],["O",8,251],["\u2019",9,504],["M",9,505],["S",8,253],["g",7,127]],"eof_code":[22,1024074],"metadata":{"frequencies":{"\n":138037,"P":10725,"r":227404,"o":304523,"j":3050,"e":444276,"c":73035,"t":315370," 
":823018,"G":10529,"u":124306,"n":235066,"b":50694,"g":62348,"\u2019":14526,"s":235725,"T":38901,"h":238755,"C":18967,"m":102883,"p":50875,"l":158708,"W":16698,"k":31929,"f":74755,"i":217354,"a":264813,"S":30994,",":92277,"y":92141,"B":13411,"w":79618,"U":13189,"d":145498,"v":37036,".":83846,"Y":7390,"-":6324,"L":22134,"I":51951,":":4523,"*":38,"O":28350,"N":25430,"E":36868,"(":294,"2":247,"0":173,"1":390,"7":94,")":293,"A":44960,"R":24678,"D":13322,"J":1860,"9":107,"4":218,"[":3333,"#":1,"]":3324,"M":14659,"3":188,"F":11433,"8":151,"H":17022,"K":5887,"X":382,"V":3178,";":17964,"z":1240,"\u2018":361,"x":4948,"?":11061,"q":2725,"5":122,"6":158,"!":8591,"\u00e6":40,"&":62,"\u2014":1412,"\u201c":353,"\u201d":284,"_":4651,"Q":1228,"\"":170,"'":17806,"|":32,"Z":600,"\u0153":2,"\u00c6":3,"\u00e0":13,"\u00c9":5,"\u00e9":40,"\u00e8":38,"\u00e2":2,"\u00e7":9,"\u00ee":1,"\u00ea":5,"`":1,"\t":2,"}":2,"\\":1,"/":12,"%":1,"@":1,"$":2}},"concat":"str_join"} diff --git a/dahuffman/codecs/xml.json b/dahuffman/codecs/xml.json new file mode 100644 index 0000000..c32a9ec --- /dev/null +++ b/dahuffman/codecs/xml.json @@ -0,0 +1 @@ +{"type":"dahuffman code 
table","version":1,"code_table":[["6",6,0],["T",10,16],["U",10,17],["~",9,9],["q",8,5],["N",8,6],["Q",12,112],["\\",12,113],["K",11,57],[")",10,29],["\n",9,15],["9",6,2],["w",6,3],["y",6,4],["z",8,20],[",",8,21],["(",10,88],["%",13,712],["Z",15,2852],["[",15,2853],["\u2013",14,1427],["{",13,714],["}",13,715],["X",11,179],["R",10,90],["Y",11,182],["J",12,366],["*",13,734],["]",15,2940],["\t",15,2941],["|",14,1471],["x",8,23],["g",6,6],[".",6,7],["a",4,2],["u",5,6],["F",8,56],[":",8,57],["v",7,29],["3",6,15],["b",7,32],["B",8,66],["D",8,67],["l",6,17],["/",5,9],["t",4,5],["j",8,96],["E",8,97],["f",7,49],["2",6,25],["m",6,26],["p",6,27],["d",5,14],["c",5,15],["e",4,8],["h",6,36],["I",9,296],["H",10,594],["O",10,595],["P",10,596],["W",12,2388],["V",12,2389],["#",13,4780],["'",14,9562],["?",16,38252],["+",17,76506],["@",18,153014],["\u200c",19,306030],["\ufeff",21,1224125],["\u00f1",20,612063],["!",15,19127],["G",12,2391],["&",10,598],["M",10,599],["=",7,75],["<",5,19],[">",5,20],["1",6,42],["-",6,43],["s",5,22],["\"",6,46],["5",7,94],["L",10,760],[";",10,761],["S",9,381],["C",8,191],["0",5,24],["n",5,25],["_",5,26],["o",5,27],["i",5,28],["8",7,116],["7",7,117],["4",7,118],["A",8,238],["k",8,239],[" ",5,30],["r",5,31]],"eof_code":[21,1224124],"metadata":{"frequencies":{"<":46938,"r":59681,"e":90285,"s":50883,"p":20779,"o":55844,"n":55269,">":46983,"w":14986," 
":57483,"_":55491,"i":57191,"d":41552,"=":12621,"\"":25423,"-":25348,"a":71880,"b":9330,"~":1905,"8":14107,"5":13388,"f":10425,"z":3910,"k":7305,"c":42556,"u":35712,"0":53392,"4":14407,"1":24184,"D":4802,"A":7226,"9":14922,"6":14904,"F":4441,"B":4775,"t":80992,"h":21868,":":4540,"/":38985,".":17228,"y":15221,"x":4391,"j":4977,"m":20687,"l":19812,"2":20584,"g":16871,"3":18132,"7":14377,"v":9120,"q":3682,"E":5051,"C":7077,"N":3713,",":4127,"(":993,")":992,"K":484,"V":320,"I":2495,"*":154,"U":915,"X":530,"Y":541,"S":3627,"W":315,"M":1496,"J":282,"R":1112,"O":1304,"'":86,"G":403,"H":1268,"Q":212,"L":1532,"P":1389,"%":124,"T":827,"Z":30,"\u2013":68,"#":163,"&":1393,";":1584,"!":53,"+":13,"\n":1947,"[":30,"]":30,"?":21,"|":78,"\\":231,"{":137,"}":137,"\t":48,"\u00f1":3,"\u200c":3,"\ufeff":2,"@":7}},"concat":"str_join"} diff --git a/dahuffman/codetableio.py b/dahuffman/codetableio.py new file mode 100644 index 0000000..04bcab3 --- /dev/null +++ b/dahuffman/codetableio.py @@ -0,0 +1,132 @@ +""" +Functionality to save/load a code table to/from a file +""" + +import json +import logging +import pickle +from pathlib import Path +from typing import Any, Optional, Union + +from dahuffman.huffmancodec import _EOF, PrefixCodec + +_log = logging.getLogger(__name__) + + +def ensure_dir(path: Union[str, Path]) -> Path: + path = Path(path) + path.mkdir(parents=True, exist_ok=True) + return path + + +def pickle_save( + codec: PrefixCodec, path: Union[str, Path], metadata: Any = None +) -> None: + """ + Persist the code table to a file. 
+ + :param path: file path to persist to + :param metadata: additional metadata to include + """ + code_table = codec.get_code_table() + data = { + "code_table": code_table, + "type": type(codec), + "concat": codec._concat, + } + if metadata: + data["metadata"] = metadata + path = Path(path) + ensure_dir(path.parent) + with path.open(mode="wb") as f: + pickle.dump(data, file=f) + _log.info( + f"Saved {type(codec).__name__} code table ({len(code_table)} items) to {str(path)!r}" + ) + + +def pickle_load(path: Union[str, Path]) -> PrefixCodec: + """ + Load a persisted PrefixCodec + :param path: path to serialized PrefixCodec code table data. + """ + path = Path(path) + with path.open(mode="rb") as f: + data = pickle.load(f) + cls = data["type"] + assert issubclass(cls, PrefixCodec) + code_table = data["code_table"] + _log.info( + f"Loading {cls.__name__} with {len(code_table)} code table items from {str(path)!r}" + ) + return cls(code_table, concat=data["concat"]) + + +def json_save( + codec: PrefixCodec, path: Union[str, Path], metadata: Optional[dict] = None +) -> None: + """ + Persist the code table as a JSON file. + Requires that all structures in the code table are JSON-serializable. + + :param path: file path to persist to + :param metadata: additional metadata to include in the file. + """ + code_table = dict(codec.get_code_table()) + + # Extract internal _EOF symbol from the copy, leaving the codec's own table intact + if _EOF in code_table: + eof_code = code_table.pop(_EOF) + else: + eof_code = None + + # Transform code table dictionary to a list, to avoid string-coercion of keys in JSON mappings.
+ code_table = [[k, *v] for (k, v) in code_table.items()] + + data = { + "type": "dahuffman code table", + "version": 1, + "code_table": code_table, + } + if eof_code: + data["eof_code"] = eof_code + if metadata: + data["metadata"] = metadata + if codec._concat == list: + data["concat"] = "list" + elif codec._concat == "".join: + data["concat"] = "str_join" + elif codec._concat == bytes: + data["concat"] = "bytes" + else: + _log.warning(f"Unsupported concat callable {codec._concat!r}") + + path = Path(path) + ensure_dir(path.parent) + with path.open("w", encoding="utf8") as f: + json.dump(obj=data, fp=f, indent=None, separators=(",", ":")) + _log.info( + f"Saved {type(codec).__name__} code table ({len(code_table)} items) to {str(path)!r}" + ) + + +def json_load(path: Union[str, Path]) -> PrefixCodec: + path = Path(path) + with path.open(mode="r", encoding="utf8") as f: + data = json.load(fp=f) + + assert data["type"] == "dahuffman code table" + assert data["version"] == 1 + + # Reconstruct code table + code_table = {row[0]: row[1:] for row in data["code_table"]} + + if "eof_code" in data: + code_table[_EOF] = data["eof_code"] + + concat = {"str_join": "".join, "bytes": bytes}.get(data.get("concat"), list) + + _log.info( + f"Loading PrefixCodec with {len(code_table)} code table items from {str(path)!r}" + ) + return PrefixCodec(code_table, concat=concat) diff --git a/dahuffman/huffmancodec.py b/dahuffman/huffmancodec.py index fb28cec..461ed7f 100644 --- a/dahuffman/huffmancodec.py +++ b/dahuffman/huffmancodec.py @@ -1,8 +1,8 @@ import collections import itertools import logging -import pickle import sys +import warnings from heapq import heapify, heappop, heappush from io import IOBase from pathlib import Path @@ -55,14 +55,6 @@ def _guess_concat(data: Any) -> Callable: }.get(type(data), list) -def ensure_dir(path: Union[str, Path]) -> Path: - path = Path(path) - if not path.exists(): - path.mkdir(parents=True) - assert path.is_dir() - return path - - class PrefixCodec:
""" Prefix code codec, using given code table. @@ -218,23 +210,14 @@ def save(self, path: Union[str, Path], metadata: Any = None) -> None: :param metadata: additional metadata :return: """ - code_table = self.get_code_table() - data = { - "code_table": code_table, - "type": type(self), - "concat": self._concat, - } - if metadata: - data["metadata"] = metadata - path = Path(path) - ensure_dir(path.parent) - with path.open(mode="wb") as f: - # TODO also provide JSON option? Requires handling of _EOF and possibly other non-string code table keys. - pickle.dump(data, file=f) - _log.info( - "Saved {c} code table ({l} items) to {p!r}".format( - c=type(self).__name__, l=len(code_table), p=str(path) - ) + warnings.warn( + "`PrefixCodec.save()` is deprecated, use `dahuffman.codetableio` functionality instead", + DeprecationWarning, + ) + import dahuffman.codetableio + + return dahuffman.codetableio.pickle_save( + codec=self, path=path, metadata=metadata ) @staticmethod @@ -244,18 +227,13 @@ def load(path: Union[str, Path]) -> "PrefixCodec": :param path: path to serialized PrefixCodec code table data. 
:return: """ - path = Path(path) - with path.open(mode="rb") as f: - data = pickle.load(f) - cls = data["type"] - assert issubclass(cls, PrefixCodec) - code_table = data["code_table"] - _log.info( - "Loading {c} with {l} code table items from {p!r}".format( - c=cls.__name__, l=len(code_table), p=str(path) - ) + warnings.warn( + "`PrefixCodec.load()` is deprecated, use `dahuffman.codetableio` functionality instead", + DeprecationWarning, ) - return cls(code_table, concat=data["concat"]) + import dahuffman.codetableio + + return dahuffman.codetableio.pickle_load(path=path) class HuffmanCodec(PrefixCodec): diff --git a/tests/test_codetableio.py b/tests/test_codetableio.py new file mode 100644 index 0000000..82b7b55 --- /dev/null +++ b/tests/test_codetableio.py @@ -0,0 +1,63 @@ +from pathlib import Path + +import pytest + +from dahuffman.codetableio import json_load, json_save, pickle_load, pickle_save +from dahuffman.huffmancodec import HuffmanCodec + + +@pytest.mark.parametrize( + ["train_data", "data"], + [ + ("aabcbcdbabdbcbd", "abcdabcd"), + ( + ["FR", "UK", "BE", "IT", "FR", "IT", "GR", "FR", "NL", "BE", "DE"], + ["FR", "IT", "BE", "FR", "UK"], + ), + (b"aabcbcdbabdbcbd", b"abcdabcd"), + ( + [(0, 0), (0, 1), (1, 0), (0, 0), (1, 0), (1, 0)], + [(1, 0), (0, 0), (0, 1), (1, 0)], + ), + ], +) +def test_pickle_save_and_load(tmp_path: Path, train_data, data): + codec1 = HuffmanCodec.from_data(train_data) + encoded1 = codec1.encode(data) + + path = tmp_path / "code-table.pickle" + pickle_save(codec=codec1, path=path) + codec2 = pickle_load(path) + encoded2 = codec2.encode(data) + + assert encoded1 == encoded2 + assert codec1.decode(encoded1) == codec2.decode(encoded2) + + +@pytest.mark.parametrize( + ["train_data", "data"], + [ + ("aabcbcdbabdbcbd", "abcdabcd"), + ( + ["FR", "UK", "BE", "IT", "FR", "IT", "GR", "FR", "NL", "BE", "DE"], + ["FR", "IT", "BE", "FR", "UK"], + ), + (b"aabcbcdbabdbcbd", b"abcdabcd"), + # TODO: + # ( + # [(0, 0), (0, 1), (1, 0), (0, 0), (1, 0), 
(1, 0)], + # [(1, 0), (0, 0), (0, 1), (1, 0)], + # ), + ], +) +def test_json_save_and_load(tmp_path: Path, train_data, data): + codec1 = HuffmanCodec.from_data(train_data) + encoded1 = codec1.encode(data) + + path = tmp_path / "code-table.json" + json_save(codec=codec1, path=path) + codec2 = json_load(path) + encoded2 = codec2.encode(data) + + assert encoded1 == encoded2 + assert codec1.decode(encoded1) == codec2.decode(encoded2) diff --git a/tests/test_dahuffman.py b/tests/test_dahuffman.py index a1a1405..6c50042 100644 --- a/tests/test_dahuffman.py +++ b/tests/test_dahuffman.py @@ -136,7 +136,7 @@ def test_eof_cut_off(): assert data == codec.decode(encoded) -def test_save(tmp_path: Path): +def test_save_and_load(tmp_path: Path): codec1 = HuffmanCodec.from_data("aabcbcdbabdbcbd") path = str(tmp_path / "foo" / "bar.huff") codec1.save(path) diff --git a/train/json-data.py b/train/json-data.py index a3d4573..1062ea1 100644 --- a/train/json-data.py +++ b/train/json-data.py @@ -4,6 +4,7 @@ from collections import Counter from dahuffman import HuffmanCodec +from dahuffman.codetableio import json_save, pickle_save from train.train_utils import CODECS, download _log = logging.getLogger() @@ -54,12 +55,28 @@ def main(): # TODO add more metadata _log.info(f"Frequencies raw {len(frequencies_raw)}: {frequencies_raw}") codec = HuffmanCodec.from_frequencies(frequencies_raw) - codec.save(CODECS / "json.pickle", metadata={"frequencies": frequencies_raw}) + pickle_save( + codec=codec, + path=CODECS / "json.pickle", + metadata={"frequencies": frequencies_raw}, + ) + json_save( + codec=codec, + path=CODECS / "json.json", + metadata={"frequencies": frequencies_raw}, + ) _log.info(f"Frequencies compact {len(frequencies_compact)}: {frequencies_compact}") codec = HuffmanCodec.from_frequencies(frequencies_compact) - codec.save( - CODECS / "json-compact.pickle", metadata={"frequencies": frequencies_compact} + pickle_save( + codec=codec, + path=CODECS / "json-compact.pickle", + 
metadata={"frequencies": frequencies_compact}, + ) + json_save( + codec=codec, + path=CODECS / "json-compact.json", + metadata={"frequencies": frequencies_compact}, ) diff --git a/train/shakespeare.py b/train/shakespeare.py index 84cf3da..083791d 100644 --- a/train/shakespeare.py +++ b/train/shakespeare.py @@ -9,6 +9,7 @@ from collections import Counter from dahuffman import HuffmanCodec +from dahuffman.codetableio import json_save, pickle_save from train.train_utils import CODECS, download _log = logging.getLogger() @@ -27,7 +28,16 @@ def main(): frequencies = Counter(raw) _log.info(f"Frequencies {len(frequencies)}: {frequencies}") codec = HuffmanCodec.from_frequencies(frequencies) - codec.save(CODECS / "shakespeare-raw.pickle", metadata={"frequencies": frequencies}) + pickle_save( + codec=codec, + path=CODECS / "shakespeare-raw.pickle", + metadata={"frequencies": frequencies}, + ) + json_save( + codec=codec, + path=CODECS / "shakespeare-raw.json", + metadata={"frequencies": frequencies}, + ) _log.info("Doing white space clean up") clean = raw @@ -36,15 +46,31 @@ def main(): frequencies = Counter(clean) _log.info(f"Frequencies {len(frequencies)}: {frequencies}") codec = HuffmanCodec.from_frequencies(frequencies) - codec.save(CODECS / "shakespeare.pickle", metadata={"frequencies": frequencies}) + pickle_save( + codec=codec, + path=CODECS / "shakespeare.pickle", + metadata={"frequencies": frequencies}, + ) + json_save( + codec=codec, + path=CODECS / "shakespeare.json", + metadata={"frequencies": frequencies}, + ) _log.info("Only handling lower case") lower = clean.lower() frequencies = Counter(lower) _log.info(f"Frequencies {len(frequencies)}: {frequencies}") codec = HuffmanCodec.from_frequencies(frequencies) - codec.save( - CODECS / "shakespeare-lower.pickle", metadata={"frequencies": frequencies} + pickle_save( + codec=codec, + path=CODECS / "shakespeare-lower.pickle", + metadata={"frequencies": frequencies}, + ) + json_save( + codec=codec, + path=CODECS / 
"shakespeare-lower.json", + metadata={"frequencies": frequencies}, ) diff --git a/train/train_utils.py b/train/train_utils.py index 98a82d6..1ce0ab6 100644 --- a/train/train_utils.py +++ b/train/train_utils.py @@ -3,8 +3,6 @@ import requests -from dahuffman.huffmancodec import ensure_dir - DOWNLOADS = Path(__file__).parent / "data" CODECS = Path(__file__).parent / "codecs" @@ -14,7 +12,7 @@ def download(url: str, path: str) -> Path: path = DOWNLOADS / path if not path.exists(): - ensure_dir(path.parent) + path.parent.mkdir(parents=True, exist_ok=True) _log.info(f"Downloading {url}") with requests.get(url) as r: r.raise_for_status() diff --git a/train/xml-data.py b/train/xml-data.py index 323324d..f85867a 100644 --- a/train/xml-data.py +++ b/train/xml-data.py @@ -3,6 +3,7 @@ from collections import Counter from dahuffman import HuffmanCodec +from dahuffman.codetableio import json_save, pickle_save from train.train_utils import CODECS, download _log = logging.getLogger() @@ -49,7 +50,12 @@ def main(): # TODO add more metadata _log.info(f"Frequencies raw {len(frequencies)}: {frequencies}") codec = HuffmanCodec.from_frequencies(frequencies) - codec.save(CODECS / "xml.pickle", metadata={"frequencies": frequencies}) + pickle_save( + codec=codec, path=CODECS / "xml.pickle", metadata={"frequencies": frequencies} + ) + json_save( + codec=codec, path=CODECS / "xml.json", metadata={"frequencies": frequencies} + ) if __name__ == "__main__":