Skip to content

Commit

Permalink
Implement Encoding CMaps (#27)
Browse files Browse the repository at this point in the history
* feat: implement Encoding CMaps

Some tests still fail due to invalid ToUnicode

* fix(test): add test files

* fix: do not ignore

* feat: add CMap names, we may want to use them

* doc: describe why these are still XFAILS
  • Loading branch information
dhdaines authored Dec 13, 2024
1 parent 64f62a9 commit 402b0fd
Show file tree
Hide file tree
Showing 11 changed files with 511 additions and 114 deletions.
2 changes: 0 additions & 2 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,6 @@ docs/_build
/build/
/dist/
/pdfminer.six.egg-info/
tests/*.xml
tests/*.txt
.idea/
.tox/
.nox/
Expand Down
2 changes: 1 addition & 1 deletion TODO.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
## PLAYA 0.2.5
- [ ] implement CMap parsing for CIDs (submit PR to pdfminer)
- [x] implement CMap parsing for Encoding CMaps
- [x] add "default" as a synonym of badly-named "user" space
- [x] update `pdfplumber` branch and run `pdfplumber` tests in CI
- [x] reimplement on top of ContentObject
Expand Down
234 changes: 158 additions & 76 deletions playa/cmapdb.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
"""

from bisect import bisect_left
import functools
import gzip
import logging
Expand All @@ -31,14 +32,12 @@
cast,
)

from playa.encodingdb import name2unicode
from playa.exceptions import PDFSyntaxError
from playa.parser import (
KWD,
ObjectParser,
PDFObject,
PSKeyword,
PSLiteral,
literal_name,
)
from playa.utils import choplist, nunpack
Expand Down Expand Up @@ -208,15 +207,13 @@ def _load_data(cls, name: str) -> Any:
@classmethod
def get_cmap(cls, name: str) -> CMapBase:
if name == "Identity-H":
return IdentityCMap(WMode=0)
elif name == "Adobe-Identity-UCS":
return IdentityCMap(WMode=0) # FIXME: WMode???
return IdentityCMap(CMapName=name, WMode=0)
elif name == "Identity-V":
return IdentityCMap(WMode=1)
return IdentityCMap(CMapName=name, WMode=1)
elif name == "OneByteIdentityH":
return IdentityCMapByte(WMode=0)
return IdentityCMapByte(CMapName=name, WMode=0)
elif name == "OneByteIdentityV":
return IdentityCMapByte(WMode=1)
return IdentityCMapByte(CMapName=name, WMode=1)
if name in cls._cmap_cache:
return cls._cmap_cache[name]
data = cls._load_data(name)
Expand Down Expand Up @@ -260,71 +257,44 @@ def decode_utf16_char(utf16: bytes) -> str:


class FileUnicodeMap(UnicodeMap):
"""ToUnicode map loaded from a PDF stream"""
def add_cid2bytes(self, cid: int, utf16: bytes) -> None:
self.add_cid2unichr(cid, decode_utf16_char(utf16))

def add_cid2code(self, cid: int, code: int) -> None:
unichr = chr(code)
self.add_cid2unichr(cid, unichr)

def add_cid2lit(self, cid: int, name: PSLiteral) -> None:
# Interpret as an Adobe glyph name.
assert isinstance(name.name, str)
unichr = name2unicode(name.name)
self.add_cid2unichr(cid, unichr)

def add_cid2unichr(self, cid: int, unichr: str) -> None:
# A0 = non-breaking space, some weird fonts can have a collision on a cid here.
assert isinstance(unichr, str)
if unichr == "\u00a0" and self.cid2unichr.get(cid) == " ":
return
self.cid2unichr[cid] = unichr


def add_cid_range(
cmap: FileUnicodeMap, start_byte: bytes, end_byte: bytes, cid: int
) -> None:
start_prefix = start_byte[:-4]
end_prefix = end_byte[:-4]
if start_prefix != end_prefix:
log.warning(
"The prefix of the start and end byte of "
"begincidrange are not the same.",
)
return
svar = start_byte[-4:]
evar = end_byte[-4:]
start = nunpack(svar)
end = nunpack(evar)
vlen = len(svar)
for i in range(end - start + 1):
x = start_prefix + struct.pack(">L", start + i)[-vlen:]
cmap.add_cid2bytes(cid + i, x)


def add_bf_range(
cmap: FileUnicodeMap, start_byte: bytes, end_byte: bytes, code: PDFObject
) -> None:
start = nunpack(start_byte)
end = nunpack(end_byte)
if isinstance(code, list):
if len(code) != end - start + 1:
log.warning(
"The difference between the start and end "
"offsets does not match the code length.",
)
for cid, unicode_value in zip(range(start, end + 1), code):
assert isinstance(unicode_value, bytes)
cmap.add_cid2bytes(cid, unicode_value)
else:
assert isinstance(code, bytes)
var = code[-4:]
base = nunpack(var)
prefix = code[:-4]
vlen = len(var)
for i in range(end - start + 1):
x = prefix + struct.pack(">L", base + i)[-vlen:]
cmap.add_cid2bytes(start + i, x)
def add_bf_range(
self, start_byte: bytes, end_byte: bytes, code: PDFObject
) -> None:
start = nunpack(start_byte)
end = nunpack(end_byte)
if isinstance(code, list):
if len(code) != end - start + 1:
log.warning(
"The difference between the start and end "
"offsets does not match the code length.",
)
for cid, unicode_value in zip(range(start, end + 1), code):
assert isinstance(unicode_value, bytes)
self.add_cid2bytes(cid, unicode_value)
else:
assert isinstance(code, bytes)
var = code[-4:]
base = nunpack(var)
prefix = code[:-4]
vlen = len(var)
for i in range(end - start + 1):
x = prefix + struct.pack(">L", base + i)[-vlen:]
self.add_cid2bytes(start + i, x)


def parse_tounicode(data: bytes) -> FileUnicodeMap:
Expand Down Expand Up @@ -380,6 +350,132 @@ def parse_tounicode(data: bytes) -> FileUnicodeMap:
del stack[:]
elif obj is KEYWORD_BEGINCIDRANGE:
del stack[:]
elif obj is KEYWORD_ENDCIDRANGE:
del stack[:]
elif obj is KEYWORD_BEGINCIDCHAR:
del stack[:]
elif obj is KEYWORD_ENDCIDCHAR:
del stack[:]
elif obj is KEYWORD_BEGINBFRANGE:
del stack[:]
elif obj is KEYWORD_ENDBFRANGE:
for start_byte, end_byte, code in choplist(3, stack):
if not isinstance(start_byte, bytes):
log.warning("The start object is not a byte.")
continue
if not isinstance(end_byte, bytes):
log.warning("The end object is not a byte.")
continue
if len(start_byte) != len(end_byte):
log.warning("The start and end byte have different lengths.")
continue
cmap.add_bf_range(start_byte, end_byte, code)
del stack[:]
elif obj is KEYWORD_BEGINBFCHAR:
del stack[:]
elif obj is KEYWORD_ENDBFCHAR:
for cid, code in choplist(2, stack):
if isinstance(cid, bytes) and isinstance(code, bytes):
cmap.add_cid2bytes(nunpack(cid), code)
del stack[:]
elif obj is KEYWORD_BEGINNOTDEFRANGE:
del stack[:]
elif obj is KEYWORD_ENDNOTDEFRANGE:
del stack[:]
else:
# It's ... something else (probably bogus)
stack.append(obj)
return cmap


class EncodingCMap(CMap):
    """Encoding CMap loaded from a PDF stream.

    Maps variable-length character codes (raw byte strings) to CIDs.
    Codes are registered with `add_bytes2cid` / `add_cid_range` and
    looked up with `decode`.
    """

    def __init__(self) -> None:
        super().__init__()
        # Character code (raw bytes) -> CID.
        self.bytes2cid: Dict[bytes, int] = {}
        # Distinct lengths of the keys in bytes2cid, kept sorted ascending
        # so that decode() can try the longest candidates first.
        self.code_lengths: List[int] = []

    def decode(self, code: bytes) -> Tuple[int, ...]:
        """Translate a byte string into a tuple of CIDs.

        At each position the longest registered code that matches is
        consumed.  If nothing matches, a warning is logged and a single
        byte is skipped.
        """
        cids: List[int] = []
        # Longest first.  A zero-length entry could never consume any input
        # (it would loop forever), so such lengths are never tried.
        lengths = [n for n in reversed(self.code_lengths) if n > 0]
        idx = 0
        end = len(code)
        while idx < end:
            for codelen in lengths:
                chunk = code[idx : idx + codelen]
                if chunk in self.bytes2cid:
                    cids.append(self.bytes2cid[chunk])
                    idx += codelen
                    break
            else:
                # No registered code starts here: report and resynchronise
                # on the next byte.
                log.warning("Unknown byte sequence %r", code[idx:])
                idx += 1
        return tuple(cids)

    def add_bytes2cid(self, utf16: bytes, cid: int) -> None:
        """Register a mapping from one character code to a CID.

        Despite its name, `utf16` is used as an opaque byte-string key.
        Empty codes are ignored with a warning, because a zero-length
        code can never advance the decoder.
        """
        codelen = len(utf16)
        if codelen == 0:
            log.warning("Ignoring empty character code for CID %r", cid)
            return
        # Keep code_lengths sorted and free of duplicates.
        pos = bisect_left(self.code_lengths, codelen)
        if pos == len(self.code_lengths) or self.code_lengths[pos] != codelen:
            self.code_lengths.insert(pos, codelen)
        self.bytes2cid[utf16] = cid

    def add_cid_range(
        self, start_byte: bytes, end_byte: bytes, cid: int
    ) -> None:
        """Register consecutive codes `start_byte`..`end_byte` (inclusive).

        The first code maps to `cid`, the next to `cid + 1`, and so on.
        Only the last (up to) four bytes of the codes vary; any leading
        bytes are a fixed prefix that must be identical in both bounds,
        otherwise the range is skipped with a warning.
        """
        if not start_byte:
            # With an empty start code the variable part has length 0 and
            # the `[-vlen:]` slice below would keep all four packed bytes.
            log.warning("Ignoring begincidrange entry with an empty start code.")
            return
        start_prefix = start_byte[:-4]
        end_prefix = end_byte[:-4]
        if start_prefix != end_prefix:
            log.warning(
                "The prefix of the start and end byte of "
                "begincidrange are not the same.",
            )
            return
        svar = start_byte[-4:]
        evar = end_byte[-4:]
        start = nunpack(svar)
        end = nunpack(evar)
        vlen = len(svar)
        for i in range(end - start + 1):
            # Pack as 4 big-endian bytes and keep only the low `vlen` bytes.
            x = start_prefix + struct.pack(">L", start + i)[-vlen:]
            self.add_bytes2cid(x, cid + i)


def parse_encoding(data: bytes) -> EncodingCMap:
"""Parse an Encoding CMap."""
cmap = EncodingCMap()
stack: List[PDFObject] = []
parser = ObjectParser(data)

while True:
try:
pos, obj = next(parser)
except PDFSyntaxError as e:
log.debug("Ignoring syntax error: %s", e)
parser.reset()
continue
except StopIteration:
break

if not isinstance(obj, PSKeyword):
stack.append(obj)
continue
log.debug("keyword: %r (%r)", obj, stack)

if obj is KEYWORD_DEF:
try:
# Might fail with IndexError if the file is corrupted
v = stack.pop()
k = stack.pop()
cmap.set_attr(literal_name(k), v)
except (IndexError, TypeError):
pass
elif obj is KEYWORD_USECMAP:
log.warning("usecmap not supported for EncodingCMap")
del stack[:]
elif obj is KEYWORD_BEGINCODESPACERANGE:
del stack[:]
elif obj is KEYWORD_ENDCODESPACERANGE:
del stack[:]
elif obj is KEYWORD_BEGINCIDRANGE:
del stack[:]
elif obj is KEYWORD_ENDCIDRANGE:
for start_byte, end_byte, cid in choplist(3, stack):
if not isinstance(start_byte, bytes):
Expand All @@ -396,36 +492,22 @@ def parse_tounicode(data: bytes) -> FileUnicodeMap:
"The start and end byte of begincidrange have different lengths.",
)
return cmap
add_cid_range(cmap, start_byte, end_byte, cid)
cmap.add_cid_range(start_byte, end_byte, cid)
del stack[:]
elif obj is KEYWORD_BEGINCIDCHAR:
del stack[:]
elif obj is KEYWORD_ENDCIDCHAR:
for cid, code in choplist(2, stack):
for code, cid in choplist(2, stack):
if isinstance(code, bytes) and isinstance(cid, int):
cmap.add_cid2bytes(cid, code)
cmap.add_bytes2cid(code, cid)
del stack[:]
elif obj is KEYWORD_BEGINBFRANGE:
del stack[:]
elif obj is KEYWORD_ENDBFRANGE:
for start_byte, end_byte, code in choplist(3, stack):
if not isinstance(start_byte, bytes):
log.warning("The start object is not a byte.")
continue
if not isinstance(end_byte, bytes):
log.warning("The end object is not a byte.")
continue
if len(start_byte) != len(end_byte):
log.warning("The start and end byte have different lengths.")
continue
add_bf_range(cmap, start_byte, end_byte, code)
del stack[:]
elif obj is KEYWORD_BEGINBFCHAR:
del stack[:]
elif obj is KEYWORD_ENDBFCHAR:
for cid, code in choplist(2, stack):
if isinstance(cid, bytes) and isinstance(code, bytes):
cmap.add_cid2bytes(nunpack(cid), code)
del stack[:]
elif obj is KEYWORD_BEGINNOTDEFRANGE:
del stack[:]
Expand Down
11 changes: 8 additions & 3 deletions playa/font.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
CMapBase,
CMapDB,
parse_tounicode,
parse_encoding,
FileUnicodeMap,
IdentityUnicodeMap,
UnicodeMap,
Expand Down Expand Up @@ -1067,10 +1068,14 @@ def __init__(
if isinstance(spec["ToUnicode"], ContentStream):
strm = stream_value(spec["ToUnicode"])
self.unicode_map = parse_tounicode(strm.buffer)
if isinstance(spec["Encoding"], ContentStream):
# FIXME: For the moment only replace the cmap if we don't
# have a predefined one (this may or may not be correct)
# FIXME: self.cmap should just be None here, WTF pdfminer.six!
if self.cmap.attrs.get("CMapName") is None and isinstance(
spec["Encoding"], ContentStream
):
strm = stream_value(spec["Encoding"])
# FIXME: it's not a tounicode, but it plays one on TV
# _ = parse_tounicode(strm.buffer)
self.cmap = parse_encoding(strm.buffer)

if self.unicode_map is None:
cmap_name = literal_name(spec["ToUnicode"])
Expand Down
1 change: 1 addition & 0 deletions playa/page.py
Original file line number Diff line number Diff line change
Expand Up @@ -755,6 +755,7 @@ def init_resources(self, page: Page, resources: Dict) -> None:
spec = dict_value(spec)
self.fontmap[fontid] = doc.get_font(objid, spec)
except TypeError:
# FIXME: This is very very wrong! DO NOT WANT!
log.warning("Broken/missing font spec for %r", fontid)
self.fontmap[fontid] = doc.get_font(objid, {})
elif k == "ColorSpace":
Expand Down
Loading

0 comments on commit 402b0fd

Please sign in to comment.