diff --git a/.gitignore b/.gitignore index f136d47..63833d8 100644 --- a/.gitignore +++ b/.gitignore @@ -13,8 +13,6 @@ docs/_build /build/ /dist/ /pdfminer.six.egg-info/ -tests/*.xml -tests/*.txt .idea/ .tox/ .nox/ diff --git a/TODO.md b/TODO.md index 2d6291e..68cf29f 100644 --- a/TODO.md +++ b/TODO.md @@ -1,5 +1,5 @@ ## PLAYA 0.2.5 -- [ ] implement CMap parsing for CIDs (submit PR to pdfminer) +- [x] implement CMap parsing for Encoding CMaps - [x] add "default" as a synonym of badly-named "user" space - [x] update `pdfplumber` branch and run `pdfplumber` tests in CI - [x] reimplement on top of ContentObject diff --git a/playa/cmapdb.py b/playa/cmapdb.py index 3ce8c09..6764930 100644 --- a/playa/cmapdb.py +++ b/playa/cmapdb.py @@ -9,6 +9,7 @@ """ +from bisect import bisect_left import functools import gzip import logging @@ -31,14 +32,12 @@ cast, ) -from playa.encodingdb import name2unicode from playa.exceptions import PDFSyntaxError from playa.parser import ( KWD, ObjectParser, PDFObject, PSKeyword, - PSLiteral, literal_name, ) from playa.utils import choplist, nunpack @@ -208,15 +207,13 @@ def _load_data(cls, name: str) -> Any: @classmethod def get_cmap(cls, name: str) -> CMapBase: if name == "Identity-H": - return IdentityCMap(WMode=0) - elif name == "Adobe-Identity-UCS": - return IdentityCMap(WMode=0) # FIXME: WMode??? 
+ return IdentityCMap(CMapName=name, WMode=0) elif name == "Identity-V": - return IdentityCMap(WMode=1) + return IdentityCMap(CMapName=name, WMode=1) elif name == "OneByteIdentityH": - return IdentityCMapByte(WMode=0) + return IdentityCMapByte(CMapName=name, WMode=0) elif name == "OneByteIdentityV": - return IdentityCMapByte(WMode=1) + return IdentityCMapByte(CMapName=name, WMode=1) if name in cls._cmap_cache: return cls._cmap_cache[name] data = cls._load_data(name) @@ -260,6 +257,7 @@ def decode_utf16_char(utf16: bytes) -> str: class FileUnicodeMap(UnicodeMap): + """ToUnicode map loaded from a PDF stream""" def add_cid2bytes(self, cid: int, utf16: bytes) -> None: self.add_cid2unichr(cid, decode_utf16_char(utf16)) @@ -267,12 +265,6 @@ def add_cid2code(self, cid: int, code: int) -> None: unichr = chr(code) self.add_cid2unichr(cid, unichr) - def add_cid2lit(self, cid: int, name: PSLiteral) -> None: - # Interpret as an Adobe glyph name. - assert isinstance(name.name, str) - unichr = name2unicode(name.name) - self.add_cid2unichr(cid, unichr) - def add_cid2unichr(self, cid: int, unichr: str) -> None: # A0 = non-breaking space, some weird fonts can have a collision on a cid here. 
assert isinstance(unichr, str) @@ -280,51 +272,29 @@ def add_cid2unichr(self, cid: int, unichr: str) -> None: return self.cid2unichr[cid] = unichr - -def add_cid_range( - cmap: FileUnicodeMap, start_byte: bytes, end_byte: bytes, cid: int -) -> None: - start_prefix = start_byte[:-4] - end_prefix = end_byte[:-4] - if start_prefix != end_prefix: - log.warning( - "The prefix of the start and end byte of " - "begincidrange are not the same.", - ) - return - svar = start_byte[-4:] - evar = end_byte[-4:] - start = nunpack(svar) - end = nunpack(evar) - vlen = len(svar) - for i in range(end - start + 1): - x = start_prefix + struct.pack(">L", start + i)[-vlen:] - cmap.add_cid2bytes(cid + i, x) - - -def add_bf_range( - cmap: FileUnicodeMap, start_byte: bytes, end_byte: bytes, code: PDFObject -) -> None: - start = nunpack(start_byte) - end = nunpack(end_byte) - if isinstance(code, list): - if len(code) != end - start + 1: - log.warning( - "The difference between the start and end " - "offsets does not match the code length.", - ) - for cid, unicode_value in zip(range(start, end + 1), code): - assert isinstance(unicode_value, bytes) - cmap.add_cid2bytes(cid, unicode_value) - else: - assert isinstance(code, bytes) - var = code[-4:] - base = nunpack(var) - prefix = code[:-4] - vlen = len(var) - for i in range(end - start + 1): - x = prefix + struct.pack(">L", base + i)[-vlen:] - cmap.add_cid2bytes(start + i, x) + def add_bf_range( + self, start_byte: bytes, end_byte: bytes, code: PDFObject + ) -> None: + start = nunpack(start_byte) + end = nunpack(end_byte) + if isinstance(code, list): + if len(code) != end - start + 1: + log.warning( + "The difference between the start and end " + "offsets does not match the code length.", + ) + for cid, unicode_value in zip(range(start, end + 1), code): + assert isinstance(unicode_value, bytes) + self.add_cid2bytes(cid, unicode_value) + else: + assert isinstance(code, bytes) + var = code[-4:] + base = nunpack(var) + prefix = code[:-4] + vlen = 
len(var) + for i in range(end - start + 1): + x = prefix + struct.pack(">L", base + i)[-vlen:] + self.add_cid2bytes(start + i, x) def parse_tounicode(data: bytes) -> FileUnicodeMap: @@ -380,6 +350,132 @@ def parse_tounicode(data: bytes) -> FileUnicodeMap: del stack[:] elif obj is KEYWORD_BEGINCIDRANGE: del stack[:] + elif obj is KEYWORD_ENDCIDRANGE: + del stack[:] + elif obj is KEYWORD_BEGINCIDCHAR: + del stack[:] + elif obj is KEYWORD_ENDCIDCHAR: + del stack[:] + elif obj is KEYWORD_BEGINBFRANGE: + del stack[:] + elif obj is KEYWORD_ENDBFRANGE: + for start_byte, end_byte, code in choplist(3, stack): + if not isinstance(start_byte, bytes): + log.warning("The start object is not a byte.") + continue + if not isinstance(end_byte, bytes): + log.warning("The end object is not a byte.") + continue + if len(start_byte) != len(end_byte): + log.warning("The start and end byte have different lengths.") + continue + cmap.add_bf_range(start_byte, end_byte, code) + del stack[:] + elif obj is KEYWORD_BEGINBFCHAR: + del stack[:] + elif obj is KEYWORD_ENDBFCHAR: + for cid, code in choplist(2, stack): + if isinstance(cid, bytes) and isinstance(code, bytes): + cmap.add_cid2bytes(nunpack(cid), code) + del stack[:] + elif obj is KEYWORD_BEGINNOTDEFRANGE: + del stack[:] + elif obj is KEYWORD_ENDNOTDEFRANGE: + del stack[:] + else: + # It's ... 
something else (probably bogus) + stack.append(obj) + return cmap + + +class EncodingCMap(CMap): + """Encoding map loaded from a PDF stream.""" + def __init__(self): + super().__init__() + self.bytes2cid: Dict[bytes, int] = {} + self.code_lengths = [] + + def decode(self, code: bytes) -> Tuple[int, ...]: + idx = 0 + codes = [] + # Match longest substring in bytes2cid + while idx < len(code): + for codelen in self.code_lengths[::-1]: + if code[idx: idx + codelen] in self.bytes2cid: + codes.append(self.bytes2cid[code[idx: idx + codelen]]) + idx += codelen + break + else: + log.warning("Unknown byte sequence %r", code[idx:]) + idx += 1 + return tuple(codes) + + def add_bytes2cid(self, utf16: bytes, cid: int) -> None: + codelen = len(utf16) + pos = bisect_left(self.code_lengths, codelen) + if pos == len(self.code_lengths) or self.code_lengths[pos] != codelen: + self.code_lengths.insert(pos, codelen) + self.bytes2cid[utf16] = cid + + def add_cid_range( + self, start_byte: bytes, end_byte: bytes, cid: int + ) -> None: + start_prefix = start_byte[:-4] + end_prefix = end_byte[:-4] + if start_prefix != end_prefix: + log.warning( + "The prefix of the start and end byte of " + "begincidrange are not the same.", + ) + return + svar = start_byte[-4:] + evar = end_byte[-4:] + start = nunpack(svar) + end = nunpack(evar) + vlen = len(svar) + for i in range(end - start + 1): + x = start_prefix + struct.pack(">L", start + i)[-vlen:] + self.add_bytes2cid(x, cid + i) + + +def parse_encoding(data: bytes) -> EncodingCMap: + """Parse an Encoding CMap.""" + cmap = EncodingCMap() + stack: List[PDFObject] = [] + parser = ObjectParser(data) + + while True: + try: + pos, obj = next(parser) + except PDFSyntaxError as e: + log.debug("Ignoring syntax error: %s", e) + parser.reset() + continue + except StopIteration: + break + + if not isinstance(obj, PSKeyword): + stack.append(obj) + continue + log.debug("keyword: %r (%r)", obj, stack) + + if obj is KEYWORD_DEF: + try: + # Might fail with 
IndexError if the file is corrupted + v = stack.pop() + k = stack.pop() + cmap.set_attr(literal_name(k), v) + except (IndexError, TypeError): + pass + elif obj is KEYWORD_USECMAP: + log.warning("usecmap not supported for EncodingCMap") + del stack[:] + elif obj is KEYWORD_BEGINCODESPACERANGE: + del stack[:] + elif obj is KEYWORD_ENDCODESPACERANGE: + del stack[:] + elif obj is KEYWORD_BEGINCIDRANGE: + del stack[:] elif obj is KEYWORD_ENDCIDRANGE: for start_byte, end_byte, cid in choplist(3, stack): if not isinstance(start_byte, bytes): @@ -396,36 +492,22 @@ def parse_tounicode(data: bytes) -> FileUnicodeMap: "The start and end byte of begincidrange have different lengths.", ) return cmap - add_cid_range(cmap, start_byte, end_byte, cid) + cmap.add_cid_range(start_byte, end_byte, cid) del stack[:] elif obj is KEYWORD_BEGINCIDCHAR: del stack[:] elif obj is KEYWORD_ENDCIDCHAR: - for cid, code in choplist(2, stack): + for code, cid in choplist(2, stack): if isinstance(code, bytes) and isinstance(cid, int): - cmap.add_cid2bytes(cid, code) + cmap.add_bytes2cid(code, cid) del stack[:] elif obj is KEYWORD_BEGINBFRANGE: del stack[:] elif obj is KEYWORD_ENDBFRANGE: - for start_byte, end_byte, code in choplist(3, stack): - if not isinstance(start_byte, bytes): - log.warning("The start object is not a byte.") - continue - if not isinstance(end_byte, bytes): - log.warning("The end object is not a byte.") - continue - if len(start_byte) != len(end_byte): - log.warning("The start and end byte have different lengths.") - continue - add_bf_range(cmap, start_byte, end_byte, code) del stack[:] elif obj is KEYWORD_BEGINBFCHAR: del stack[:] elif obj is KEYWORD_ENDBFCHAR: - for cid, code in choplist(2, stack): - if isinstance(cid, bytes) and isinstance(code, bytes): - cmap.add_cid2bytes(nunpack(cid), code) del stack[:] elif obj is KEYWORD_BEGINNOTDEFRANGE: del stack[:] diff --git a/playa/font.py b/playa/font.py index 018b09a..8ec9cce 100644 --- a/playa/font.py +++ b/playa/font.py @@ -23,6 
+23,7 @@ CMapBase, CMapDB, parse_tounicode, + parse_encoding, FileUnicodeMap, IdentityUnicodeMap, UnicodeMap, @@ -1067,10 +1068,14 @@ def __init__( if isinstance(spec["ToUnicode"], ContentStream): strm = stream_value(spec["ToUnicode"]) self.unicode_map = parse_tounicode(strm.buffer) - if isinstance(spec["Encoding"], ContentStream): + # FIXME: For the moment only replace the cmap if we don't + # have a predefined one (this may or may not be correct) + # FIXME: self.cmap should just be None here, WTF pdfminer.six! + if self.cmap.attrs.get("CMapName") is None and isinstance( + spec["Encoding"], ContentStream + ): strm = stream_value(spec["Encoding"]) - # FIXME: it's not a tounicode, but it plays one on TV - # _ = parse_tounicode(strm.buffer) + self.cmap = parse_encoding(strm.buffer) if self.unicode_map is None: cmap_name = literal_name(spec["ToUnicode"]) diff --git a/playa/page.py b/playa/page.py index bbc1678..93f07ba 100644 --- a/playa/page.py +++ b/playa/page.py @@ -755,6 +755,7 @@ def init_resources(self, page: Page, resources: Dict) -> None: spec = dict_value(spec) self.fontmap[fontid] = doc.get_font(objid, spec) except TypeError: + # FIXME: This is very very wrong! DO NOT WANT! 
log.warning("Broken/missing font spec for %r", fontid) self.fontmap[fontid] = doc.get_font(objid, {}) elif k == "ColorSpace": diff --git a/tests/cmap-encoding.txt b/tests/cmap-encoding.txt new file mode 100644 index 0000000..c0e5b70 --- /dev/null +++ b/tests/cmap-encoding.txt @@ -0,0 +1,261 @@ +%!PS-Adobe-3.0 Resource-CMap +%%DocumentNeededResources: ProcSet (CIDInit) +%%IncludeResource: ProcSet (CIDInit) +%%BeginResource: CMap (Adobe-Identity-UCS) +%%Title: (Adobe-Identity-UCS Adobe Identity 0) +%%Version: 1 +%%EndComments +/CIDInit /ProcSet findresource begin +12 dict begin +begincmap +/CIDSystemInfo +3 dict dup begin +/Registry (Adobe) def +/Ordering (Identity) def +/Supplement 0 def +end def +/CMapName /Adobe-Identity-UCS def +/CMapVersion 1 def +/CMapType 0 def +/WMode 0 def +2 begincodespacerange +<20> <20> +<0000> <19FF> +endcodespacerange +229 begincidchar +<0001> 1 +<0002> 2 +<0003> 3 +<0004> 4 +<0005> 5 +<0006> 6 +<0007> 7 +<0008> 8 +<0009> 9 +<000a> 10 +<000b> 11 +<000c> 12 +<000d> 13 +<000e> 14 +<000f> 15 +<0010> 16 +<0011> 17 +<0012> 18 +<0013> 19 +<0014> 20 +<0015> 21 +<0016> 22 +<0017> 23 +<0018> 24 +<0019> 25 +<001a> 26 +<001b> 27 +<001c> 28 +<001d> 29 +<001e> 30 +<001f> 31 +<0020> 32 +<0021> 33 +<0022> 34 +<0023> 35 +<0024> 36 +<0025> 37 +<0026> 38 +<0027> 39 +<0028> 40 +<0029> 41 +<002a> 42 +<002b> 43 +<002c> 44 +<002d> 45 +<002e> 46 +<002f> 47 +<0030> 48 +<0031> 49 +<0032> 50 +<0033> 51 +<0034> 52 +<0035> 53 +<0036> 54 +<0037> 55 +<0038> 56 +<0039> 57 +<003a> 58 +<003b> 59 +<003c> 60 +<003d> 61 +<003e> 62 +<003f> 63 +<0040> 64 +<0041> 65 +<0042> 66 +<0043> 67 +<0044> 68 +<0045> 69 +<0046> 70 +<0047> 71 +<0048> 72 +<0049> 73 +<004a> 74 +<004b> 75 +<004c> 76 +<004d> 77 +<004e> 78 +<004f> 79 +<0050> 80 +<0051> 81 +<0052> 82 +<0053> 83 +<0054> 84 +<0055> 85 +<0056> 86 +<0057> 87 +<0058> 88 +<0059> 89 +<005a> 90 +<005b> 91 +<005c> 92 +<005d> 93 +<005e> 94 +<005f> 95 +<0060> 96 +<0061> 97 +<0062> 98 +<0063> 99 +<0064> 100 +<0065> 101 +<0066> 102 
+<0067> 103 +<0068> 104 +<0069> 105 +<006a> 106 +<006b> 107 +<006c> 108 +<006d> 109 +<006e> 110 +<006f> 111 +<0070> 112 +<0071> 113 +<0072> 114 +<0073> 115 +<0074> 116 +<0075> 117 +<0076> 118 +<0077> 119 +<0078> 120 +<0079> 121 +<007a> 122 +<007b> 123 +<007c> 124 +<007d> 125 +<007e> 126 +<007f> 127 +<0080> 128 +<0081> 129 +<0082> 130 +<0083> 131 +<0084> 132 +<0085> 133 +<0086> 134 +<0087> 135 +<0088> 136 +<0089> 137 +<008a> 138 +<008b> 139 +<008c> 140 +<008d> 141 +<008e> 142 +<008f> 143 +<0090> 144 +<0091> 145 +<0092> 146 +<0093> 147 +<0094> 148 +<0095> 149 +<0096> 150 +<0097> 151 +<0098> 152 +<0099> 153 +<009a> 154 +<009b> 155 +<009c> 156 +<009d> 157 +<009e> 158 +<009f> 159 +<00a0> 160 +<00a1> 161 +<00a2> 162 +<00a3> 163 +<00a4> 164 +<00a5> 165 +<00a6> 166 +<00a7> 167 +<00a8> 168 +<00a9> 169 +<00aa> 170 +<00ab> 171 +<00ac> 172 +<00ad> 173 +<00ae> 174 +<00af> 175 +<00b0> 176 +<00b1> 177 +<00b2> 178 +<00b3> 179 +<00b4> 180 +<00b5> 181 +<00b6> 182 +<00b7> 183 +<00b8> 184 +<00b9> 185 +<00ba> 186 +<00bb> 187 +<00bc> 188 +<00bd> 189 +<00be> 190 +<00bf> 191 +<00c0> 192 +<00c1> 193 +<00c2> 194 +<00c3> 195 +<00c4> 196 +<00c5> 197 +<00c6> 198 +<00c7> 199 +<00c8> 200 +<00c9> 201 +<00ca> 202 +<00cb> 203 +<00cc> 204 +<00cd> 205 +<00ce> 206 +<00cf> 207 +<00d0> 208 +<00d1> 209 +<00d2> 210 +<00d3> 211 +<00d4> 212 +<00d5> 213 +<00d6> 214 +<00d7> 215 +<00d8> 216 +<00d9> 217 +<00da> 218 +<00db> 219 +<00dc> 220 +<00dd> 221 +<00de> 222 +<00df> 223 +<00e0> 224 +<00e1> 225 +<00e2> 226 +<00e3> 227 +<00e4> 228 +<20> 229 +endcidchar +endcmap +CMapName currentdict /CMap defineresource pop +end +end +%%EndResource +%%EOF diff --git a/tests/cmap-onebyte-encoding.txt b/tests/cmap-onebyte-encoding.txt new file mode 100644 index 0000000..4ad1e30 --- /dev/null +++ b/tests/cmap-onebyte-encoding.txt @@ -0,0 +1,24 @@ +/CIDInit /ProcSet findresource begin +12 dict begin +begincmap +/CIDSystemInfo 3 dict dup begin +/Registry (Adobe) def +/Ordering (Identity) def +/Supplement 0 def +end def +/CMapName 
/OneByteIdentityH def +/CMapVersion 1.000 def +/CMapType 1 def +/UIDOffset 0 def +/XUID [1 10 25404 9999] def +/WMode 0 def +1 begincodespacerange +<00> +endcodespacerange +1 begincidrange +<00> 0 +endcidrange +endcmap +CMapName currentdict /CMap defineresource pop +end +end diff --git a/tests/cmap-tounicode.txt b/tests/cmap-tounicode.txt new file mode 100644 index 0000000..3004818 --- /dev/null +++ b/tests/cmap-tounicode.txt @@ -0,0 +1,25 @@ +/CIDInit/ProcSet findresource begin +12 dict begin +begincmap +/CIDSystemInfo<< +/Registry (Adobe) +/Ordering (UCS) +/Supplement 0 +>> def +/CMapName/Adobe-Identity-UCS def +/CMapType 2 def +1 begincodespacerange +<00> +endcodespacerange +1 beginbfrange +<006F> <0072> [<00E7> <00E9> <00E8> <00EA>] +endbfrange +3 beginbfchar +<01> <0078> +<02> <030C> +<03> <0075> +endbfchar +endcmap +CMapName currentdict /CMap defineresource pop +end +end diff --git a/tests/data.py b/tests/data.py index 405a961..04e879e 100644 --- a/tests/data.py +++ b/tests/data.py @@ -45,8 +45,10 @@ # really rather broken. "issue9418.pdf", "bug1250079.pdf", - # FIXME: These can likely be fixed by correctly parsing CMaps - # (which should also be submitted as a PR to pdfminer.six) + # FIXME: We "accept" these but the Unicode mappings are incorrect. + # Need to see what pdf.js does for them - it seems falling back to + # the string may work, but it might be ASCII, PDFDocEncoding, + # UTF-16BE, or UTF-8 (each of these is different), so... "issue9915_reduced.pdf", "issue2931.pdf", "issue9534_reduced.pdf", diff --git a/tests/test_cmapdb.py b/tests/test_cmapdb.py index d5165a8..533762c 100644 --- a/tests/test_cmapdb.py +++ b/tests/test_cmapdb.py @@ -2,40 +2,18 @@ Inadequately test CMap parsing and such. 
""" -from playa.cmapdb import parse_tounicode +from pathlib import Path + +from playa.cmapdb import parse_tounicode, parse_encoding from playa.font import Type1FontHeaderParser -STREAMDATA = b""" -/CIDInit/ProcSet findresource begin -12 dict begin -begincmap -/CIDSystemInfo<< -/Registry (Adobe) -/Ordering (UCS) -/Supplement 0 ->> def -/CMapName/Adobe-Identity-UCS def -/CMapType 2 def -1 begincodespacerange -<00> -endcodespacerange -1 beginbfrange -<006F> <0072> [<00E7> <00E9> <00E8> <00EA>] -endbfrange -3 beginbfchar -<01> <0078> -<02> <030C> -<03> <0075> -endbfchar -endcmap -CMapName currentdict /CMap defineresource pop -end -end -""" +THISDIR = Path(__file__).parent -def test_cmap_parser(): - cmap = parse_tounicode(STREAMDATA) +def test_parse_tounicode(): + with open(THISDIR / "cmap-tounicode.txt", "rb") as infh: + data = infh.read() + cmap = parse_tounicode(data) assert cmap.cid2unichr == { 1: "x", 2: "̌", @@ -47,6 +25,17 @@ def test_cmap_parser(): } +def test_parse_encoding(): + with open(THISDIR / "cmap-encoding.txt", "rb") as infh: + data = infh.read() + cmap = parse_encoding(data) + cids = list(cmap.decode("hello world".encode("UTF-16-BE"))) + assert cids == [ord(x) for x in "hello world"] + cids = list(cmap.decode(b"\x00W \x00T \x00F")) + assert cids == [87, 229, 84, 229, 70] + + + # Basically the sort of stuff we try to find in a Type 1 font TYPE1DATA = b""" %!PS-AdobeFont-1.0: MyBogusFont 0.1 diff --git a/tests/test_open.py b/tests/test_open.py index 77338f0..3631fa6 100644 --- a/tests/test_open.py +++ b/tests/test_open.py @@ -27,6 +27,16 @@ "issue-1114-dedupe-chars.pdf", "malformed-from-issue-932.pdf", "mcid_example.pdf", + # FIXME: This can be fixed by correctly parsing Encoding CMaps, + # which be submitted as a PR to pdfminer.six + "issue7901.pdf", + # FIXME: These have invalid ToUnicode mappings but can be fixed by + # falling back to the input string (as PDFDocEncoding or UTF-16BE) + "issue9915_reduced.pdf", + "issue2931.pdf", + 
"issue9534_reduced.pdf", + # FIXME: And this here one is just UTF-8 + "issue18117.pdf", }