Implement Encoding CMaps (#27)

* feat: implement Encoding CMaps (some tests still fail due to invalid ToUnicode)
* fix(test): add test files
* fix: do not ignore
* feat: add CMap names, we may want to use them
* doc: describe why these are still XFAILS
dhdaines · Dec 13, 2024 · 402b0fd · 402b0fd
1 parent 64f62a9
commit 402b0fd
Show file tree

Hide file tree

Showing 11 changed files with 511 additions and 114 deletions.
diff --git a/.gitignore b/.gitignore
@@ -13,8 +13,6 @@ docs/_build
 /build/
 /dist/
 /pdfminer.six.egg-info/
-tests/*.xml
-tests/*.txt
 .idea/
 .tox/
 .nox/

diff --git a/TODO.md b/TODO.md
@@ -1,5 +1,5 @@
 ## PLAYA 0.2.5
-- [ ] implement CMap parsing for CIDs (submit PR to pdfminer)
+- [x] implement CMap parsing for Encoding CMaps
 - [x] add "default" as a synonym of badly-named "user" space
 - [x] update `pdfplumber` branch and run `pdfplumber` tests in CI
   - [x] reimplement on top of ContentObject

diff --git a/playa/cmapdb.py b/playa/cmapdb.py
@@ -9,6 +9,7 @@
 
 """
 
+from bisect import bisect_left
 import functools
 import gzip
 import logging
@@ -31,14 +32,12 @@
     cast,
 )
 
-from playa.encodingdb import name2unicode
 from playa.exceptions import PDFSyntaxError
 from playa.parser import (
     KWD,
     ObjectParser,
     PDFObject,
     PSKeyword,
-    PSLiteral,
     literal_name,
 )
 from playa.utils import choplist, nunpack
@@ -208,15 +207,13 @@ def _load_data(cls, name: str) -> Any:
     @classmethod
     def get_cmap(cls, name: str) -> CMapBase:
         if name == "Identity-H":
-            return IdentityCMap(WMode=0)
-        elif name == "Adobe-Identity-UCS":
-            return IdentityCMap(WMode=0)  # FIXME: WMode???
+            return IdentityCMap(CMapName=name, WMode=0)
         elif name == "Identity-V":
-            return IdentityCMap(WMode=1)
+            return IdentityCMap(CMapName=name, WMode=1)
         elif name == "OneByteIdentityH":
-            return IdentityCMapByte(WMode=0)
+            return IdentityCMapByte(CMapName=name, WMode=0)
         elif name == "OneByteIdentityV":
-            return IdentityCMapByte(WMode=1)
+            return IdentityCMapByte(CMapName=name, WMode=1)
         if name in cls._cmap_cache:
             return cls._cmap_cache[name]
         data = cls._load_data(name)
@@ -260,71 +257,44 @@ def decode_utf16_char(utf16: bytes) -> str:
 
 
 class FileUnicodeMap(UnicodeMap):
+    """ToUnicode map loaded from a PDF stream"""
     def add_cid2bytes(self, cid: int, utf16: bytes) -> None:
         self.add_cid2unichr(cid, decode_utf16_char(utf16))
 
     def add_cid2code(self, cid: int, code: int) -> None:
         unichr = chr(code)
         self.add_cid2unichr(cid, unichr)
 
-    def add_cid2lit(self, cid: int, name: PSLiteral) -> None:
-        # Interpret as an Adobe glyph name.
-        assert isinstance(name.name, str)
-        unichr = name2unicode(name.name)
-        self.add_cid2unichr(cid, unichr)
-
     def add_cid2unichr(self, cid: int, unichr: str) -> None:
         # A0 = non-breaking space, some weird fonts can have a collision on a cid here.
         assert isinstance(unichr, str)
         if unichr == "\u00a0" and self.cid2unichr.get(cid) == " ":
             return
         self.cid2unichr[cid] = unichr
 
-
-def add_cid_range(
-    cmap: FileUnicodeMap, start_byte: bytes, end_byte: bytes, cid: int
-) -> None:
-    start_prefix = start_byte[:-4]
-    end_prefix = end_byte[:-4]
-    if start_prefix != end_prefix:
-        log.warning(
-            "The prefix of the start and end byte of "
-            "begincidrange are not the same.",
-        )
-        return
-    svar = start_byte[-4:]
-    evar = end_byte[-4:]
-    start = nunpack(svar)
-    end = nunpack(evar)
-    vlen = len(svar)
-    for i in range(end - start + 1):
-        x = start_prefix + struct.pack(">L", start + i)[-vlen:]
-        cmap.add_cid2bytes(cid + i, x)
-
-
-def add_bf_range(
-    cmap: FileUnicodeMap, start_byte: bytes, end_byte: bytes, code: PDFObject
-) -> None:
-    start = nunpack(start_byte)
-    end = nunpack(end_byte)
-    if isinstance(code, list):
-        if len(code) != end - start + 1:
-            log.warning(
-                "The difference between the start and end "
-                "offsets does not match the code length.",
-            )
-        for cid, unicode_value in zip(range(start, end + 1), code):
-            assert isinstance(unicode_value, bytes)
-            cmap.add_cid2bytes(cid, unicode_value)
-    else:
-        assert isinstance(code, bytes)
-        var = code[-4:]
-        base = nunpack(var)
-        prefix = code[:-4]
-        vlen = len(var)
-        for i in range(end - start + 1):
-            x = prefix + struct.pack(">L", base + i)[-vlen:]
-            cmap.add_cid2bytes(start + i, x)
+    def add_bf_range(
+        self, start_byte: bytes, end_byte: bytes, code: PDFObject
+    ) -> None:
+        start = nunpack(start_byte)
+        end = nunpack(end_byte)
+        if isinstance(code, list):
+            if len(code) != end - start + 1:
+                log.warning(
+                    "The difference between the start and end "
+                    "offsets does not match the code length.",
+                )
+            for cid, unicode_value in zip(range(start, end + 1), code):
+                assert isinstance(unicode_value, bytes)
+                self.add_cid2bytes(cid, unicode_value)
+        else:
+            assert isinstance(code, bytes)
+            var = code[-4:]
+            base = nunpack(var)
+            prefix = code[:-4]
+            vlen = len(var)
+            for i in range(end - start + 1):
+                x = prefix + struct.pack(">L", base + i)[-vlen:]
+                self.add_cid2bytes(start + i, x)
 
 
 def parse_tounicode(data: bytes) -> FileUnicodeMap:
@@ -380,6 +350,132 @@ def parse_tounicode(data: bytes) -> FileUnicodeMap:
             del stack[:]
         elif obj is KEYWORD_BEGINCIDRANGE:
             del stack[:]
+        elif obj is KEYWORD_ENDCIDRANGE:
+            del stack[:]
+        elif obj is KEYWORD_BEGINCIDCHAR:
+            del stack[:]
+        elif obj is KEYWORD_ENDCIDCHAR:
+            del stack[:]
+        elif obj is KEYWORD_BEGINBFRANGE:
+            del stack[:]
+        elif obj is KEYWORD_ENDBFRANGE:
+            for start_byte, end_byte, code in choplist(3, stack):
+                if not isinstance(start_byte, bytes):
+                    log.warning("The start object is not a byte.")
+                    continue
+                if not isinstance(end_byte, bytes):
+                    log.warning("The end object is not a byte.")
+                    continue
+                if len(start_byte) != len(end_byte):
+                    log.warning("The start and end byte have different lengths.")
+                    continue
+                cmap.add_bf_range(start_byte, end_byte, code)
+            del stack[:]
+        elif obj is KEYWORD_BEGINBFCHAR:
+            del stack[:]
+        elif obj is KEYWORD_ENDBFCHAR:
+            for cid, code in choplist(2, stack):
+                if isinstance(cid, bytes) and isinstance(code, bytes):
+                    cmap.add_cid2bytes(nunpack(cid), code)
+            del stack[:]
+        elif obj is KEYWORD_BEGINNOTDEFRANGE:
+            del stack[:]
+        elif obj is KEYWORD_ENDNOTDEFRANGE:
+            del stack[:]
+        else:
+            # It's ... something else (probably bogus)
+            stack.append(obj)
+    return cmap
+
+
+class EncodingCMap(CMap):
+    """Encoding map loaded from a PDF stream."""
+    def __init__(self):
+        super().__init__()
+        self.bytes2cid: Dict[bytes, int] = {}
+        self.code_lengths = []
+
+    def decode(self, code: bytes) -> Tuple[int, ...]:
+        idx = 0
+        codes = []
+        # Match longest substring in bytes2cid
+        while idx < len(code):
+            for codelen in self.code_lengths[::-1]:
+                if code[idx: idx + codelen] in self.bytes2cid:
+                    codes.append(self.bytes2cid[code[idx: idx + codelen]])
+                    idx += codelen
+                    break
+            else:
+                log.warning("Unknown byte sequence %r", code[idx:])
+                idx += 1
+        return tuple(codes)
+
+    def add_bytes2cid(self, utf16: bytes, cid: int) -> None:
+        codelen = len(utf16)
+        pos = bisect_left(self.code_lengths, codelen)
+        if pos == len(self.code_lengths) or self.code_lengths[pos] != codelen:
+            self.code_lengths.insert(pos, codelen)
+        self.bytes2cid[utf16] = cid
+
+    def add_cid_range(
+        self, start_byte: bytes, end_byte: bytes, cid: int
+    ) -> None:
+        start_prefix = start_byte[:-4]
+        end_prefix = end_byte[:-4]
+        if start_prefix != end_prefix:
+            log.warning(
+                "The prefix of the start and end byte of "
+                "begincidrange are not the same.",
+            )
+            return
+        svar = start_byte[-4:]
+        evar = end_byte[-4:]
+        start = nunpack(svar)
+        end = nunpack(evar)
+        vlen = len(svar)
+        for i in range(end - start + 1):
+            x = start_prefix + struct.pack(">L", start + i)[-vlen:]
+            self.add_bytes2cid(x, cid + i)
+
+
+def parse_encoding(data: bytes) -> EncodingCMap:
+    """Parse an Encoding CMap."""
+    cmap = EncodingCMap()
+    stack: List[PDFObject] = []
+    parser = ObjectParser(data)
+
+    while True:
+        try:
+            pos, obj = next(parser)
+        except PDFSyntaxError as e:
+            log.debug("Ignoring syntax error: %s", e)
+            parser.reset()
+            continue
+        except StopIteration:
+            break
+
+        if not isinstance(obj, PSKeyword):
+            stack.append(obj)
+            continue
+        log.debug("keyword: %r (%r)", obj, stack)
+
+        if obj is KEYWORD_DEF:
+            try:
+                # Might fail with IndexError if the file is corrupted
+                v = stack.pop()
+                k = stack.pop()
+                cmap.set_attr(literal_name(k), v)
+            except (IndexError, TypeError):
+                pass
+        elif obj is KEYWORD_USECMAP:
+            log.warning("usecmap not supported for EncodingCMap")
+            del stack[:]
+        elif obj is KEYWORD_BEGINCODESPACERANGE:
+            del stack[:]
+        elif obj is KEYWORD_ENDCODESPACERANGE:
+            del stack[:]
+        elif obj is KEYWORD_BEGINCIDRANGE:
+            del stack[:]
         elif obj is KEYWORD_ENDCIDRANGE:
             for start_byte, end_byte, cid in choplist(3, stack):
                 if not isinstance(start_byte, bytes):
@@ -396,36 +492,22 @@ def parse_tounicode(data: bytes) -> FileUnicodeMap:
                         "The start and end byte of begincidrange have different lengths.",
                     )
                     return cmap
-                add_cid_range(cmap, start_byte, end_byte, cid)
+                cmap.add_cid_range(start_byte, end_byte, cid)
             del stack[:]
         elif obj is KEYWORD_BEGINCIDCHAR:
             del stack[:]
         elif obj is KEYWORD_ENDCIDCHAR:
-            for cid, code in choplist(2, stack):
+            for code, cid in choplist(2, stack):
                 if isinstance(code, bytes) and isinstance(cid, int):
-                    cmap.add_cid2bytes(cid, code)
+                    cmap.add_bytes2cid(code, cid)
             del stack[:]
         elif obj is KEYWORD_BEGINBFRANGE:
             del stack[:]
         elif obj is KEYWORD_ENDBFRANGE:
-            for start_byte, end_byte, code in choplist(3, stack):
-                if not isinstance(start_byte, bytes):
-                    log.warning("The start object is not a byte.")
-                    continue
-                if not isinstance(end_byte, bytes):
-                    log.warning("The end object is not a byte.")
-                    continue
-                if len(start_byte) != len(end_byte):
-                    log.warning("The start and end byte have different lengths.")
-                    continue
-                add_bf_range(cmap, start_byte, end_byte, code)
             del stack[:]
         elif obj is KEYWORD_BEGINBFCHAR:
             del stack[:]
         elif obj is KEYWORD_ENDBFCHAR:
-            for cid, code in choplist(2, stack):
-                if isinstance(cid, bytes) and isinstance(code, bytes):
-                    cmap.add_cid2bytes(nunpack(cid), code)
             del stack[:]
         elif obj is KEYWORD_BEGINNOTDEFRANGE:
             del stack[:]

diff --git a/playa/font.py b/playa/font.py
@@ -23,6 +23,7 @@
     CMapBase,
     CMapDB,
     parse_tounicode,
+    parse_encoding,
     FileUnicodeMap,
     IdentityUnicodeMap,
     UnicodeMap,
@@ -1067,10 +1068,14 @@ def __init__(
             if isinstance(spec["ToUnicode"], ContentStream):
                 strm = stream_value(spec["ToUnicode"])
                 self.unicode_map = parse_tounicode(strm.buffer)
-            if isinstance(spec["Encoding"], ContentStream):
+            # FIXME: For the moment only replace the cmap if we don't
+            # have a predefined one (this may or may not be correct)
+            # FIXME: self.cmap should just be None here, WTF pdfminer.six!
+            if self.cmap.attrs.get("CMapName") is None and isinstance(
+                spec["Encoding"], ContentStream
+            ):
                 strm = stream_value(spec["Encoding"])
-                # FIXME: it's not a tounicode, but it plays one on TV
-                # _ = parse_tounicode(strm.buffer)
+                self.cmap = parse_encoding(strm.buffer)
 
             if self.unicode_map is None:
                 cmap_name = literal_name(spec["ToUnicode"])

diff --git a/playa/page.py b/playa/page.py
@@ -755,6 +755,7 @@ def init_resources(self, page: Page, resources: Dict) -> None:
                         spec = dict_value(spec)
                         self.fontmap[fontid] = doc.get_font(objid, spec)
                     except TypeError:
+                        # FIXME: This is very very wrong! DO NOT WANT!
                         log.warning("Broken/missing font spec for %r", fontid)
                         self.fontmap[fontid] = doc.get_font(objid, {})
             elif k == "ColorSpace":
-Original file line number
+Diff line change
@@ Expand Up / @@ -13,8 +13,6 @@ docs/_build @@
     /build/
     /dist/
     /pdfminer.six.egg-info/
-    tests/*.xml
-    tests/*.txt
     .idea/
     .tox/
     .nox/
@@ Expand Down @@