Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Ongoing deJava.fiCation() #13

Merged
merged 22 commits into from
Nov 12, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
6f4fd02
refactor!: remove some useless typing
dhdaines Nov 6, 2024
f60d2b8
refactor!: UseSomeMoreConciseNaming
dhdaines Nov 6, 2024
23ee3d3
refactor!: PDFParser parses indirect objects, which are now a thing
dhdaines Nov 8, 2024
c004dae
feat: Version could be in catalog (PDF 1.7 sec 7.5.2)
dhdaines Nov 8, 2024
a563e63
refactor!: start making parsers into iterators
dhdaines Nov 11, 2024
c41a828
refactor!: properly separate object and indirect object parsers
dhdaines Nov 11, 2024
9b6478a
refactor!: *_value will throw TypeError (which you can catch)
dhdaines Nov 11, 2024
59c0f34
fix: properly implement and test inline images
dhdaines Nov 11, 2024
3deded0
refactor: remove evil implementation inheritance
dhdaines Nov 11, 2024
776ae5a
refactor: redo using Lexer (FIXME: how to test?)
dhdaines Nov 11, 2024
5647fb8
fix: actually test t1 header parser and fix it
dhdaines Nov 11, 2024
3b0cfbc
fix: mostly fix everything for new parsers finally
dhdaines Nov 11, 2024
15b6b95
chore: ruff
dhdaines Nov 11, 2024
ba69c41
fix: contentstream needs its own objid/genno
dhdaines Nov 11, 2024
c6acdf0
fix: do not clear trailer as this function gets called recursively
dhdaines Nov 12, 2024
2f0ec82
fix: handle null/None better (we changed it upstream)
dhdaines Nov 12, 2024
a2ac391
fix: handle inline images separately (because Reasons)
dhdaines Nov 12, 2024
95f23d9
fix: make cmap parsing more robust and concise
dhdaines Nov 12, 2024
c4b5aef
fix: replicate "fallback" behaviour with unresolved /Length values
dhdaines Nov 12, 2024
1a98821
chore: ruff
dhdaines Nov 12, 2024
eb7ce6d
feat!: remove another use of settings.STRICT
dhdaines Nov 12, 2024
93cd2fa
fix: (re-)enable strict stream parsing when encrypted
dhdaines Nov 12, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions playa/ccitt.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
cast,
)


def get_bytes(data: bytes) -> Iterator[int]:
yield from data

Expand Down Expand Up @@ -83,15 +84,19 @@ def _parse_bit(self, x: object) -> None:
class CCITTException(Exception):
pass


class EOFB(CCITTException):
pass


class InvalidData(CCITTException):
pass


class ByteSkip(CCITTException):
pass


class CCITTG4Parser(BitParser):
MODE = [None, None]
BitParser.add(MODE, 0, "1")
Expand Down
157 changes: 74 additions & 83 deletions playa/cmapdb.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,8 +32,14 @@
)

from playa.encodingdb import name2unicode
from playa.exceptions import PDFSyntaxError
from playa.parser import KWD, Parser, PSKeyword, PSLiteral, literal_name
from playa.parser import (
KWD,
ObjectParser,
PDFObject,
PSKeyword,
PSLiteral,
literal_name,
)
from playa.utils import choplist, nunpack

log = logging.getLogger(__name__)
Expand Down Expand Up @@ -272,83 +278,83 @@ def get_unicode_map(cls, name: str, vertical: bool = False) -> UnicodeMap:
return cls._umap_cache[name][vertical]


class CMapParser(Parser[PSKeyword]):
KEYWORD_BEGINCMAP = KWD(b"begincmap")
KEYWORD_ENDCMAP = KWD(b"endcmap")
KEYWORD_USECMAP = KWD(b"usecmap")
KEYWORD_DEF = KWD(b"def")
KEYWORD_BEGINCODESPACERANGE = KWD(b"begincodespacerange")
KEYWORD_ENDCODESPACERANGE = KWD(b"endcodespacerange")
KEYWORD_BEGINCIDRANGE = KWD(b"begincidrange")
KEYWORD_ENDCIDRANGE = KWD(b"endcidrange")
KEYWORD_BEGINCIDCHAR = KWD(b"begincidchar")
KEYWORD_ENDCIDCHAR = KWD(b"endcidchar")
KEYWORD_BEGINBFRANGE = KWD(b"beginbfrange")
KEYWORD_ENDBFRANGE = KWD(b"endbfrange")
KEYWORD_BEGINBFCHAR = KWD(b"beginbfchar")
KEYWORD_ENDBFCHAR = KWD(b"endbfchar")
KEYWORD_BEGINNOTDEFRANGE = KWD(b"beginnotdefrange")
KEYWORD_ENDNOTDEFRANGE = KWD(b"endnotdefrange")


class CMapParser:
def __init__(self, cmap: CMapBase, data: bytes) -> None:
super().__init__(data)
self.cmap = cmap
self.stack: List[PDFObject] = []
self._parser = ObjectParser(data)
# some ToUnicode maps don't have "begincmap" keyword.
self._in_cmap = True
self._warnings: Set[str] = set()

def run(self) -> None:
next(self, None)

KEYWORD_BEGINCMAP = KWD(b"begincmap")
KEYWORD_ENDCMAP = KWD(b"endcmap")
KEYWORD_USECMAP = KWD(b"usecmap")
KEYWORD_DEF = KWD(b"def")
KEYWORD_BEGINCODESPACERANGE = KWD(b"begincodespacerange")
KEYWORD_ENDCODESPACERANGE = KWD(b"endcodespacerange")
KEYWORD_BEGINCIDRANGE = KWD(b"begincidrange")
KEYWORD_ENDCIDRANGE = KWD(b"endcidrange")
KEYWORD_BEGINCIDCHAR = KWD(b"begincidchar")
KEYWORD_ENDCIDCHAR = KWD(b"endcidchar")
KEYWORD_BEGINBFRANGE = KWD(b"beginbfrange")
KEYWORD_ENDBFRANGE = KWD(b"endbfrange")
KEYWORD_BEGINBFCHAR = KWD(b"beginbfchar")
KEYWORD_ENDBFCHAR = KWD(b"endbfchar")
KEYWORD_BEGINNOTDEFRANGE = KWD(b"beginnotdefrange")
KEYWORD_ENDNOTDEFRANGE = KWD(b"endnotdefrange")
for pos, obj in self._parser:
log.debug("token @ %d: %r", pos, obj)
if isinstance(obj, PSKeyword):
self.do_keyword(pos, obj)
else:
self.stack.append(obj)

def popall(self) -> None:
del self.stack[:]

def do_keyword(self, pos: int, token: PSKeyword) -> None:
"""ToUnicode CMaps

See Section 5.9.2 - ToUnicode CMaps of the PDF Reference.
"""
if token is self.KEYWORD_BEGINCMAP:
log.debug("keyword: %r (%r)", token, self.stack)

# Ignore everything outside begincmap / endcmap
if token is KEYWORD_BEGINCMAP:
self._in_cmap = True
self.popall()
return

elif token is self.KEYWORD_ENDCMAP:
elif token is KEYWORD_ENDCMAP:
self._in_cmap = False
return

if not self._in_cmap:
return

if token is self.KEYWORD_DEF:
if token is KEYWORD_DEF:
try:
((_, k), (_, v)) = self.pop(2)
# Might fail with IndexError if the file is corrupted
v = self.stack.pop()
k = self.stack.pop()
self.cmap.set_attr(literal_name(k), v)
except PDFSyntaxError:
except (IndexError, TypeError):
pass
return

if token is self.KEYWORD_USECMAP:
elif token is KEYWORD_USECMAP:
try:
((_, cmapname),) = self.pop(1)
cmapname = self.stack.pop()
self.cmap.use_cmap(CMapDB.get_cmap(literal_name(cmapname)))
except PDFSyntaxError:
pass
except KeyError:
except (IndexError, TypeError, KeyError):
pass
return

if token is self.KEYWORD_BEGINCODESPACERANGE:
elif token is KEYWORD_BEGINCODESPACERANGE:
self.popall()
return
if token is self.KEYWORD_ENDCODESPACERANGE:
elif token is KEYWORD_ENDCODESPACERANGE:
self.popall()
return

if token is self.KEYWORD_BEGINCIDRANGE:
elif token is KEYWORD_BEGINCIDRANGE:
self.popall()
return

if token is self.KEYWORD_ENDCIDRANGE:
objs = [obj for (__, obj) in self.popall()]
for start_byte, end_byte, cid in choplist(3, objs):
elif token is KEYWORD_ENDCIDRANGE:
for start_byte, end_byte, cid in choplist(3, self.stack):
if not isinstance(start_byte, bytes):
self._warn_once("The start object of begincidrange is not a byte.")
continue
Expand Down Expand Up @@ -380,26 +386,18 @@ def do_keyword(self, pos: int, token: PSKeyword) -> None:
for i in range(end - start + 1):
x = start_prefix + struct.pack(">L", start + i)[-vlen:]
self.cmap.add_cid2unichr(cid + i, x)
return

if token is self.KEYWORD_BEGINCIDCHAR:
self.popall()
return

if token is self.KEYWORD_ENDCIDCHAR:
objs = [obj for (__, obj) in self.popall()]
for cid, code in choplist(2, objs):
elif token is KEYWORD_BEGINCIDCHAR:
self.popall()
elif token is KEYWORD_ENDCIDCHAR:
for cid, code in choplist(2, self.stack):
if isinstance(code, bytes) and isinstance(cid, int):
self.cmap.add_cid2unichr(cid, code)
return

if token is self.KEYWORD_BEGINBFRANGE:
self.popall()
return

if token is self.KEYWORD_ENDBFRANGE:
objs = [obj for (__, obj) in self.popall()]
for start_byte, end_byte, code in choplist(3, objs):
elif token is KEYWORD_BEGINBFRANGE:
self.popall()
elif token is KEYWORD_ENDBFRANGE:
for start_byte, end_byte, code in choplist(3, self.stack):
if not isinstance(start_byte, bytes):
self._warn_once("The start object is not a byte.")
continue
Expand Down Expand Up @@ -428,28 +426,21 @@ def do_keyword(self, pos: int, token: PSKeyword) -> None:
for i in range(end - start + 1):
x = prefix + struct.pack(">L", base + i)[-vlen:]
self.cmap.add_cid2unichr(start + i, x)
return

if token is self.KEYWORD_BEGINBFCHAR:
self.popall()
return

if token is self.KEYWORD_ENDBFCHAR:
objs = [obj for (__, obj) in self.popall()]
for cid, code in choplist(2, objs):
elif token is KEYWORD_BEGINBFCHAR:
self.popall()
elif token is KEYWORD_ENDBFCHAR:
for cid, code in choplist(2, self.stack):
if isinstance(cid, bytes) and isinstance(code, bytes):
self.cmap.add_cid2unichr(nunpack(cid), code)
return

if token is self.KEYWORD_BEGINNOTDEFRANGE:
self.popall()
return

if token is self.KEYWORD_ENDNOTDEFRANGE:
elif token is KEYWORD_BEGINNOTDEFRANGE:
self.popall()
return

self.push((pos, token))
elif token is KEYWORD_ENDNOTDEFRANGE:
self.popall()
else:
# It's ... something else (probably bogus)
self.stack.append(token)

def _warn_once(self, msg: str) -> None:
"""Warn once for each unique message"""
Expand Down
3 changes: 1 addition & 2 deletions playa/color.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
import collections
from typing import Dict, NamedTuple, Union

from playa import settings
from playa.casting import safe_float
from playa.exceptions import PDFInterpreterError
from playa.parser import LIT
Expand Down Expand Up @@ -45,7 +44,7 @@ def __init__(self, name: str, ncomponents: int) -> None:
self.ncomponents = ncomponents

def make_color(self, *components) -> Color:
if settings.STRICT and len(components) != self.ncomponents:
if len(components) != self.ncomponents:
raise PDFInterpreterError(
"%s requires %d components, got %d!"
% (self.name, self.ncomponents, len(components))
Expand Down
Loading