diff --git a/.github/workflows/benchmarks.yml b/.github/workflows/benchmarks.yml new file mode 100644 index 00000000..3aee82de --- /dev/null +++ b/.github/workflows/benchmarks.yml @@ -0,0 +1,21 @@ +name: Benchmark +on: + push: + branches: [ "main" ] + pull_request: + branches: [ "main" ] + +jobs: + benchmark: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: "3.10" + - name: Install Hatch + uses: pypa/hatch@install + - name: Run benchmarks + run: | + hatch run bench:all diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index caa625e6..500b4f9a 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -1,4 +1,4 @@ -name: Run all tests +name: Test on: push: branches: [ "main" ] @@ -17,4 +17,4 @@ jobs: - name: Install Hatch uses: pypa/hatch@install - name: Run tests - run: hatch test + run: hatch test --cover diff --git a/README.md b/README.md index b36cd396..e34972c0 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# PLAYA Ain't a LAYout Analyzer 🏖️ +# **P**LAYA ain't a **LAY**out **A**nalyzer 🏖️ ## About @@ -28,7 +28,110 @@ Notably this does *not* include the largely undocumented heuristic to understand due to a Java-damaged API based on deeply nested class hierarchies, and because layout analysis is best done probabilistically/visually. Also, pdfplumber does its own, much -nicer, layout analysis. +nicer, layout analysis. Also, if you just want to extract text from a +PDF, there are a lot of better and faster tools and libraries out +there, see [benchmarks]() for a summary (TL;DR pypdfium2 is probably +what you want, but pdfplumber does a nice job of converting PDF to +ASCII art). + +## Usage + +Do you want to get stuff out of a PDF? You have come to the right +place! 
Let's open up a PDF and see what's in it: + +```python +pdf = playa.open("my_awesome_document.pdf") +raw_byte_stream = pdf.buffer +a_bunch_of_tokens = list(pdf.tokens) +a_bunch_of_objects = list(pdf) +a_particular_indirect_object = pdf[42] +``` + +The raw PDF tokens and objects are probably not terribly useful to +you, but you might find them interesting. + +It probably has some pages. How many? What are their numbers/labels? +(they could be things like "xviii", "a", or "42", for instance) + +```python +npages = len(pdf.pages) +page_numbers = [page.label for page in pdf.pages] +``` + +What's in the table of contents? + +```python +for entry in pdf.outlines: + ... +``` + +If you are lucky it has a "logical structure tree". The elements here +might even be referenced from the table of contents! (or, they might +not... with PDF you never know) + +```python +structure = pdf.structtree +for element in structure: + for child in element: + ... +``` + +Now perhaps we want to look at a specific page. Okay! +```python +page = pdf.pages[0] # they are numbered from 0 +page = pdf.pages["xviii"] # but you can get them by label +page = pdf.pages["42"] # or "logical" page number (also a label) +a_few_content_streams = list(page.contents) +raw_bytes = b"".join(stream.buffer for stream in page.contents) +``` + +This page probably has text, graphics, etc, etc, in it. Remember that +**P**LAYA ain't a **LAY**out **A**nalyzer! You can either look at the +stream of tokens or mysterious PDF objects: +```python +for token in page.tokens: + ... +for object in page: + ... +``` + +Or you can access individual characters, lines, curves, and rectangles +(if you wanted to, for instance, do layout analysis): +```python +for item in page.layout: + ... +``` + +Do we make you spelunk in a dank class hierarchy to know what these +items are? No, we do not! They are just NamedTuples with a very +helpful field *telling* you what they are, as a string. 
+ +In particular you can also extract all these items into a dataframe +using the library of your choosing (I like [Polars](https://pola.rs/)) and I dunno do +some Artifishul Intelligents or something with them: +```python +``` + +Or just write them to a CSV file: +```python +``` + +Note again that PLAYA doesn't guarantee that these characters come at +you in anything other than the order they occur in the file (but it +does guarantee that). It does, however, put them in (hopefully) the +right absolute positions on the page, and keep track of the clipping +path and the graphics state, so yeah, you *could* "render" them like +`pdfminer.six` pretended to do. + +Certain PDF tools and/or authors are notorious for using "whiteout" +(set the color to the background color) or "scissors" (the clipping +path) to hide arbitrary text that maybe *you* don't want to see +either. PLAYA gives you some rudimentary tools to detect this: +```python +``` + +For everything else, there's pdfplumber, pdfium2, pikepdf, pypdf, +borb, pydyf, etc, etc, etc. 
## Acknowledgement diff --git a/playa/__init__.py b/playa/__init__.py index 20f87e01..1020e0d8 100644 --- a/playa/__init__.py +++ b/playa/__init__.py @@ -10,7 +10,7 @@ from os import PathLike from typing import Union -from playa.pdfdocument import PDFDocument +from playa.document import PDFDocument __version__ = "0.0.1" diff --git a/playa/cmapdb.py b/playa/cmapdb.py index bde0dde3..bd23e428 100644 --- a/playa/cmapdb.py +++ b/playa/cmapdb.py @@ -32,8 +32,8 @@ ) from playa.encodingdb import name2unicode -from playa.exceptions import PSEOF, PDFException, PDFTypeError, PSSyntaxError -from playa.psparser import KWD, PSKeyword, PSLiteral, PSStackParser, literal_name +from playa.exceptions import PDFException, PDFTypeError, PSSyntaxError +from playa.parser import KWD, Parser, PSKeyword, PSLiteral, literal_name from playa.utils import choplist, nunpack log = logging.getLogger(__name__) @@ -275,7 +275,7 @@ def get_unicode_map(cls, name: str, vertical: bool = False) -> UnicodeMap: return cls._umap_cache[name][vertical] -class CMapParser(PSStackParser[PSKeyword]): +class CMapParser(Parser[PSKeyword]): def __init__(self, cmap: CMapBase, data: bytes) -> None: super().__init__(data) self.cmap = cmap @@ -284,10 +284,7 @@ def __init__(self, cmap: CMapBase, data: bytes) -> None: self._warnings: Set[str] = set() def run(self) -> None: - try: - self.nextobject() - except PSEOF: - pass + next(self, None) KEYWORD_BEGINCMAP = KWD(b"begincmap") KEYWORD_ENDCMAP = KWD(b"endcmap") diff --git a/playa/pdfcolor.py b/playa/color.py similarity index 96% rename from playa/pdfcolor.py rename to playa/color.py index b4c2021f..1bc1bcb4 100644 --- a/playa/pdfcolor.py +++ b/playa/color.py @@ -1,7 +1,7 @@ import collections from typing import Dict -from playa.psparser import LIT +from playa.parser import LIT LITERAL_DEVICE_GRAY = LIT("DeviceGray") LITERAL_DEVICE_RGB = LIT("DeviceRGB") diff --git a/playa/pdfdocument.py b/playa/document.py similarity index 91% rename from playa/pdfdocument.py rename to 
playa/document.py index ed609a5c..27a5fab3 100644 --- a/playa/pdfdocument.py +++ b/playa/document.py @@ -1,6 +1,7 @@ import io import itertools import logging +import mmap import re import struct from collections import deque @@ -28,10 +29,8 @@ from playa import settings from playa.arcfour import Arcfour -from playa.cmapdb import CMap, CMapBase, CMapDB from playa.data_structures import NameTree, NumberTree from playa.exceptions import ( - PSEOF, PDFEncryptionError, PDFException, PDFFontError, @@ -45,19 +44,23 @@ PDFTypeError, PSException, ) -from playa.pdffont import ( - PDFCIDFont, - PDFFont, - PDFTrueTypeFont, - PDFType1Font, - PDFType3Font, +from playa.font import PDFCIDFont, PDFFont, PDFTrueTypeFont, PDFType1Font, PDFType3Font +from playa.page import Page +from playa.parser import ( + KEYWORD_OBJ, + KEYWORD_TRAILER, + KEYWORD_XREF, + LIT, + ContentStreamParser, + PDFParser, + PSBaseParserToken, + PSLiteral, + literal_name, ) -from playa.pdfpage import PDFPage -from playa.pdfparser import KEYWORD_XREF, PDFParser, PDFStreamParser from playa.pdftypes import ( + ContentStream, DecipherCallable, - PDFObjRef, - PDFStream, + ObjRef, decipher_all, dict_value, int_value, @@ -67,7 +70,6 @@ stream_value, uint_value, ) -from playa.psparser import KWD, LIT, PSLiteral, literal_name from playa.utils import ( choplist, decode_text, @@ -89,7 +91,6 @@ LITERAL_CATALOG = LIT("Catalog") LITERAL_PAGE = LIT("Page") LITERAL_PAGES = LIT("Pages") -KEYWORD_OBJ = KWD(b"obj") INHERITABLE_PAGE_ATTRS = {"Resources", "MediaBox", "CropBox", "Rotate"} @@ -116,14 +117,11 @@ def __init__(self, parser: PDFParser) -> None: self._load(parser) def _load(self, parser: PDFParser) -> None: - while True: - try: - (pos, line) = parser.nextline() - line = line.strip() - if not line: - continue - except PSEOF: - raise PDFNoValidXRef("Unexpected EOF - file corrupted?") + lines = parser.iter_lines() + for pos, line in lines: + line = line.strip() + if not line: + continue if line.startswith(b"trailer"): 
parser.seek(pos) break @@ -137,11 +135,8 @@ def _load(self, parser: PDFParser) -> None: error_msg = f"Invalid line: {parser!r}: line={line!r}" raise PDFNoValidXRef(error_msg) for objid in range(start, start + nobjs): - try: - (_, line) = parser.nextline() - line = line.strip() - except PSEOF: - raise PDFNoValidXRef("Unexpected EOF - file corrupted?") + _, line = next(lines) + line = line.strip() f = line.split(b" ") if len(f) != 3: error_msg = f"Invalid XRef format: {parser!r}, line={line!r}" @@ -156,9 +151,16 @@ def _load(self, parser: PDFParser) -> None: def _load_trailer(self, parser: PDFParser) -> None: try: (_, kwd) = parser.nexttoken() - assert kwd is KWD(b"trailer"), str(kwd) - (_, dic) = parser.nextobject() - except PSEOF: + if kwd is not KEYWORD_TRAILER: + raise PDFSyntaxError( + "Expected %r, got %r" + % ( + KEYWORD_TRAILER, + kwd, + ) + ) + (_, dic) = next(parser) + except StopIteration: x = parser.pop(1) if not x: raise PDFNoValidXRef("Unexpected EOF - file corrupted") @@ -190,11 +192,7 @@ def __repr__(self) -> str: def _load(self, parser: PDFParser) -> None: parser.seek(0) - while 1: - try: - (pos, line_bytes) = parser.nextline() - except PSEOF: - break + for pos, line_bytes in parser.iter_lines(): if line_bytes.startswith(b"trailer"): parser.seek(pos) self._load_trailer(parser) @@ -210,8 +208,8 @@ def _load(self, parser: PDFParser) -> None: self.offsets[objid] = (None, pos, genno) # expand ObjStm. 
parser.seek(pos) - (_, obj) = parser.nextobject() - if isinstance(obj, PDFStream) and obj.get("Type") is LITERAL_OBJSTM: + (_, obj) = next(parser) + if isinstance(obj, ContentStream) and obj.get("Type") is LITERAL_OBJSTM: stream = stream_value(obj) try: n = stream["N"] @@ -222,14 +220,9 @@ def _load(self, parser: PDFParser) -> None: doc = parser.doc() if doc is None: raise RuntimeError("Document no longer exists!") - parser1 = PDFStreamParser(stream.get_data(), doc) - objs: List[int] = [] - try: - while 1: - (_, obj) = parser1.nextobject() - objs.append(cast(int, obj)) - except PSEOF: - pass + parser1 = ContentStreamParser(stream.get_data(), doc) + objs: List = [obj for _, obj in parser1] + # FIXME: This is choplist n = min(n, len(objs) // 2) for index in range(n): objid1 = objs[index * 2] @@ -255,9 +248,12 @@ def _load(self, parser: PDFParser) -> None: (_, objid) = parser.nexttoken() # ignored (_, genno) = parser.nexttoken() # ignored (_, kwd) = parser.nexttoken() - (_, stream) = parser.nextobject() - if not isinstance(stream, PDFStream) or stream.get("Type") is not LITERAL_XREF: - raise PDFNoValidXRef("Invalid PDF stream spec.") + (_, stream) = next(parser) + if ( + not isinstance(stream, ContentStream) + or stream.get("Type") is not LITERAL_XREF + ): + raise PDFNoValidXRef(f"Invalid PDF stream spec {stream!r}") size = stream["Size"] index_array = stream.get("Index", (0, size)) if len(index_array) % 2 != 0: @@ -704,79 +700,7 @@ class OutlineItem(NamedTuple): # FIXME: Create Destination and Action types dest: Union[PSLiteral, bytes, list, None] action: Union[dict, None] - se: Union[PDFObjRef, None] - - -class PDFResourceManager: - """Repository of shared resources. - - ResourceManager facilitates reuse of shared resources - such as fonts and images so that large objects are not - allocated multiple times. 
- """ - - def __init__(self, caching: bool = True) -> None: - self.caching = caching - self._cached_fonts: Dict[object, PDFFont] = {} - - def get_procset(self, procs: Sequence[object]) -> None: - for proc in procs: - if proc is LITERAL_PDF or proc is LITERAL_TEXT: - pass - else: - pass - - def get_cmap(self, cmapname: str, strict: bool = False) -> CMapBase: - try: - return CMapDB.get_cmap(cmapname) - except CMapDB.CMapNotFound: - if strict: - raise - return CMap() - - def get_font(self, objid: object, spec: Mapping[str, object]) -> PDFFont: - if objid and objid in self._cached_fonts: - font = self._cached_fonts[objid] - else: - log.debug("get_font: create: objid=%r, spec=%r", objid, spec) - if settings.STRICT: - if spec["Type"] is not LITERAL_FONT: - raise PDFFontError("Type is not /Font") - # Create a Font object. - if "Subtype" in spec: - subtype = literal_name(spec["Subtype"]) - else: - if settings.STRICT: - raise PDFFontError("Font Subtype is not specified.") - subtype = "Type1" - if subtype in ("Type1", "MMType1"): - # Type1 Font - font = PDFType1Font(spec) - elif subtype == "TrueType": - # TrueType Font - font = PDFTrueTypeFont(spec) - elif subtype == "Type3": - # Type3 Font - font = PDFType3Font(spec) - elif subtype in ("CIDFontType0", "CIDFontType2"): - # CID Font - font = PDFCIDFont(spec) - elif subtype == "Type0": - # Type0 Font - dfonts = list_value(spec["DescendantFonts"]) - assert dfonts - subspec = dict_value(dfonts[0]).copy() - for k in ("Encoding", "ToUnicode"): - if k in spec: - subspec[k] = resolve1(spec[k]) - font = self.get_font(None, subspec) - else: - if settings.STRICT: - raise PDFFontError("Invalid Font spec: %r" % spec) - font = PDFType1Font(spec) # FIXME: this is so wrong! 
- if objid and self.caching: - self._cached_fonts[objid] = font - return font + se: Union[ObjRef, None] class PDFDocument: @@ -800,7 +724,7 @@ class PDFDocument: """ _fp: Union[BinaryIO, None] = None - _pages: Union[List[PDFPage], None] = None + _pages: Union[List[Page], None] = None def __enter__(self) -> "PDFDocument": return self @@ -823,10 +747,26 @@ def __init__( self.decipher: Optional[DecipherCallable] = None self._cached_objs: Dict[int, Tuple[object, int]] = {} self._parsed_objs: Dict[int, Tuple[List[object], int]] = {} + self._cached_fonts: Dict[object, PDFFont] = {} if isinstance(fp, io.TextIOBase): raise PSException("fp is not a binary file") - self.pdf_version = read_header(fp) - self.parser = PDFParser(fp, self) + # The header is frequently mangled, in which case we will try to read the + # file anyway. + try: + self.pdf_version = read_header(fp) + except PDFSyntaxError: + log.warning("PDF header not found, will try to read the file anyway") + self.pdf_version = "UNKNOWN" + try: + self.buffer: Union[bytes, mmap.mmap] = mmap.mmap( + fp.fileno(), 0, access=mmap.ACCESS_READ + ) + except io.UnsupportedOperation: + log.warning("mmap not supported on %r, reading document into memory", fp) + self.buffer = fp.read() + except ValueError as e: + raise PSException from e + self.parser = PDFParser(self.buffer, self) self.is_printable = self.is_modifiable = self.is_extractable = True # Getting the XRef table and trailer is done non-lazily # because they contain encryption information among other @@ -868,8 +808,6 @@ def __init__( if self.catalog.get("Type") is not LITERAL_CATALOG: if settings.STRICT: raise PDFSyntaxError("Catalog not found!") - # NOTE: This does nearly nothing at all - self.rsrcmgr = PDFResourceManager(True) def _initialize_password(self, password: str = "") -> None: """Initialize the decryption handler with a given password, if any. 
@@ -899,7 +837,20 @@ def _initialize_password(self, password: str = "") -> None: assert self.parser is not None self.parser.fallback = False # need to read streams with exact length - def _getobj_objstm(self, stream: PDFStream, index: int, objid: int) -> object: + def __iter__(self) -> Iterator[Tuple[int, object]]: + """Iterate over (position, object) tuples, raising StopIteration at EOF.""" + # FIXME: Should create a new parser + self.parser.seek(0) + return self.parser + + @property + def tokens(self) -> Iterator[Tuple[int, PSBaseParserToken]]: + """Iterate over (position, token) tuples, raising StopIteration at EOF.""" + # FIXME: Should create a new parser + self.parser.seek(0) + return self.parser.tokens + + def _getobj_objstm(self, stream: ContentStream, index: int, objid: int) -> object: if stream.objid in self._parsed_objs: (objs, n) = self._parsed_objs[stream.objid] else: @@ -913,7 +864,7 @@ def _getobj_objstm(self, stream: PDFStream, index: int, objid: int) -> object: raise PDFSyntaxError("index too big: %r" % index) return obj - def _get_objects(self, stream: PDFStream) -> Tuple[List[object], int]: + def _get_objects(self, stream: ContentStream) -> Tuple[List[object], int]: if stream.get("Type") is not LITERAL_OBJSTM: if settings.STRICT: raise PDFSyntaxError("Not a stream object: %r" % stream) @@ -923,14 +874,8 @@ def _get_objects(self, stream: PDFStream) -> Tuple[List[object], int]: if settings.STRICT: raise PDFSyntaxError("N is not defined: %r" % stream) n = 0 - parser = PDFStreamParser(stream.get_data(), self) - objs: List[object] = [] - try: - while 1: - (_, obj) = parser.nextobject() - objs.append(obj) - except PSEOF: - pass + parser = ContentStreamParser(stream.get_data(), self) + objs: List[object] = [obj for _, obj in parser] return (objs, n) def _getobj_parse(self, pos: int, objid: int) -> object: @@ -951,7 +896,7 @@ def _getobj_parse(self, pos: int, objid: int) -> object: while True: try: (_, token) = self.parser.nexttoken() - except PSEOF: + 
except StopIteration: raise PDFSyntaxError( f"object {objid!r} not found at or after position {pos}" ) @@ -966,7 +911,7 @@ def _getobj_parse(self, pos: int, objid: int) -> object: (_, kwd) = self.parser.nexttoken() if kwd != KEYWORD_OBJ: raise PDFSyntaxError("Invalid object spec: offset=%r" % pos) - (_, obj) = self.parser.nextobject() + (_, obj) = next(self.parser) return obj def __getitem__(self, objid: int) -> object: @@ -996,10 +941,10 @@ def __getitem__(self, objid: int) -> object: if self.decipher: obj = decipher_all(self.decipher, objid, genno, obj) - if isinstance(obj, PDFStream): + if isinstance(obj, ContentStream): obj.set_objid(objid, genno) break - except (PSEOF, PDFSyntaxError): + except (StopIteration, PDFSyntaxError): continue if obj is None: raise IndexError(f"Object with ID {objid} not found") @@ -1007,6 +952,50 @@ def __getitem__(self, objid: int) -> object: self._cached_objs[objid] = (obj, genno) return obj + def get_font(self, objid: object, spec: Mapping[str, object]) -> PDFFont: + if objid and objid in self._cached_fonts: + font = self._cached_fonts[objid] + else: + log.debug("get_font: create: objid=%r, spec=%r", objid, spec) + if settings.STRICT: + if spec["Type"] is not LITERAL_FONT: + raise PDFFontError("Type is not /Font") + # Create a Font object. 
+ if "Subtype" in spec: + subtype = literal_name(spec["Subtype"]) + else: + if settings.STRICT: + raise PDFFontError("Font Subtype is not specified.") + subtype = "Type1" + if subtype in ("Type1", "MMType1"): + # Type1 Font + font = PDFType1Font(spec) + elif subtype == "TrueType": + # TrueType Font + font = PDFTrueTypeFont(spec) + elif subtype == "Type3": + # Type3 Font + font = PDFType3Font(spec) + elif subtype in ("CIDFontType0", "CIDFontType2"): + # CID Font + font = PDFCIDFont(spec) + elif subtype == "Type0": + # Type0 Font + dfonts = list_value(spec["DescendantFonts"]) + assert dfonts + subspec = dict_value(dfonts[0]).copy() + for k in ("Encoding", "ToUnicode"): + if k in spec: + subspec[k] = resolve1(spec[k]) + font = self.get_font(None, subspec) + else: + if settings.STRICT: + raise PDFFontError("Invalid Font spec: %r" % spec) + font = PDFType1Font(spec) # FIXME: this is so wrong! + if objid: + self._cached_fonts[objid] = font + return font + @property def outlines(self) -> Iterator[OutlineItem]: if "Outlines" not in self.catalog: @@ -1080,11 +1069,11 @@ def get_page_objects(self) -> Iterator[Tuple[int, PageType]]: visited = set() while stack: (obj, parent) = stack.pop() - if isinstance(obj, PDFObjRef): + if isinstance(obj, ObjRef): # The PDF specification *requires* both the Pages # element of the catalog and the entries in Kids in # the page tree to be indirect references. - object_id = obj.objid + object_id = int(obj.objid) elif isinstance(obj, int): # Should not happen in a valid PDF, but probably does? 
log.warning("Page tree contains bare integer: %r in %r", obj, parent) @@ -1118,23 +1107,28 @@ def get_page_objects(self) -> Iterator[Tuple[int, PageType]]: log.debug("Page: %r", object_properties) yield object_id, object_properties + # FIXME: Make an object that can be indexed by int or str @property - def pages(self) -> List[PDFPage]: + def pages(self) -> List[Page]: if self._pages is None: try: page_labels: Iterator[Optional[str]] = self.page_labels except PDFNoPageLabels: page_labels = itertools.repeat(None) try: - self._pages = [PDFPage(self, objid, properties, label, page_number + 1) - for page_number, ((objid, properties), label) in enumerate( - zip(self.get_page_objects(), page_labels) - )] + self._pages = [ + Page(self, objid, properties, label, page_idx) + for page_idx, ((objid, properties), label) in enumerate( + zip(self.get_page_objects(), page_labels) + ) + ] except PDFNoPageTree: - self._pages = [PDFPage(self, objid, properties, label, page_number + 1) - for page_number, ((objid, properties), label) in enumerate( - zip(self.get_pages_from_xrefs(), page_labels) - )] + self._pages = [ + Page(self, objid, properties, label, page_idx) + for page_idx, ((objid, properties), label) in enumerate( + zip(self.get_pages_from_xrefs(), page_labels) + ) + ] return self._pages @property @@ -1184,7 +1178,8 @@ def find_xref(self) -> int: prev = b"" # FIXME: This will scan *the whole file* looking for an xref # table, it should maybe give up sooner? 
- for line in self.parser.revreadlines(): + self.parser.seek(self.parser.end) + for line in self.parser.reverse_iter_lines(): line = line.strip() log.debug("find_xref: %r", line) if line == b"startxref": @@ -1210,7 +1205,7 @@ def read_xref_from( self.parser.reset() try: (pos, token) = self.parser.nexttoken() - except PSEOF: + except StopIteration: raise PDFNoValidXRef("Unexpected EOF at {start}") log.debug("read_xref_from: start=%d, token=%r", start, token) if isinstance(token, int): @@ -1220,7 +1215,7 @@ def read_xref_from( xref: PDFXRef = PDFXRefStream(self.parser) else: if token is KEYWORD_XREF: - self.parser.nextline() + next(self.parser.iter_lines()) xref = PDFXRefTable(self.parser) xrefs.append(xref) trailer = xref.trailer diff --git a/playa/encodingdb.py b/playa/encodingdb.py index c44a2742..259f1f14 100644 --- a/playa/encodingdb.py +++ b/playa/encodingdb.py @@ -5,7 +5,7 @@ from playa.exceptions import PDFKeyError from playa.glyphlist import glyphname2unicode from playa.latin_enc import ENCODING -from playa.psparser import PSLiteral +from playa.parser import PSLiteral HEXADECIMAL = re.compile(r"[0-9a-fA-F]+") diff --git a/playa/exceptions.py b/playa/exceptions.py index a886bbf6..7bbf8e78 100644 --- a/playa/exceptions.py +++ b/playa/exceptions.py @@ -7,10 +7,6 @@ class PSException(Exception): pass -class PSEOF(PSException): - pass - - class PSSyntaxError(PSException): pass diff --git a/playa/pdffont.py b/playa/font.py similarity index 98% rename from playa/pdffont.py rename to playa/font.py index d326e21e..c35c5277 100644 --- a/playa/pdffont.py +++ b/playa/font.py @@ -27,7 +27,6 @@ ) from playa.encodingdb import EncodingDB, name2unicode from playa.exceptions import ( - PSEOF, PDFException, PDFFontError, PDFKeyError, @@ -35,8 +34,16 @@ PDFValueError, ) from playa.fontmetrics import FONT_METRICS +from playa.parser import ( + KWD, + LIT, + Parser, + PSKeyword, + PSLiteral, + literal_name, +) from playa.pdftypes import ( - PDFStream, + ContentStream, dict_value, 
int_value, list_value, @@ -45,14 +52,6 @@ resolve_all, stream_value, ) -from playa.psparser import ( - KWD, - LIT, - PSKeyword, - PSLiteral, - PSStackParser, - literal_name, -) from playa.utils import Matrix, Point, Rect, apply_matrix_norm, choplist, nunpack log = logging.getLogger(__name__) @@ -106,8 +105,8 @@ def get_metrics(cls, fontname: str) -> Tuple[Dict[str, object], Dict[str, int]]: return FONT_METRICS[fontname] -# int here means that we're not extending PSStackParser with additional types. -class Type1FontHeaderParser(PSStackParser[int]): +# int here means that we're not extending Parser with additional types. +class Type1FontHeaderParser(Parser[int]): KEYWORD_BEGIN = KWD(b"begin") KEYWORD_END = KWD(b"end") KEYWORD_DEF = KWD(b"def") @@ -137,8 +136,8 @@ def get_encoding(self) -> Dict[int, str]: """ while 1: try: - (cid, name) = self.nextobject() - except PSEOF: + (cid, name) = next(self) + except StopIteration: break try: self._cid2unicode[cid] = name2unicode(cast(str, name)) @@ -1070,7 +1069,7 @@ def __init__( ttf = TrueTypeFont(self.basefont, BytesIO(self.fontfile.get_data())) self.unicode_map: Optional[UnicodeMap] = None if "ToUnicode" in spec: - if isinstance(spec["ToUnicode"], PDFStream): + if isinstance(spec["ToUnicode"], ContentStream): strm = stream_value(spec["ToUnicode"]) self.unicode_map = FileUnicodeMap() CMapParser(self.unicode_map, strm.get_data()).run() @@ -1148,8 +1147,8 @@ def _get_cmap_name(spec: Mapping[str, Any], strict: bool) -> str: if strict: raise PDFFontError("Encoding is unspecified") - if type(cmap_name) is PDFStream: # type: ignore[comparison-overlap] - cmap_name_stream: PDFStream = cast(PDFStream, cmap_name) + if type(cmap_name) is ContentStream: # type: ignore[comparison-overlap] + cmap_name_stream: ContentStream = cast(ContentStream, cmap_name) if "CMapName" in cmap_name_stream: cmap_name = cmap_name_stream.get("CMapName").name elif strict: diff --git a/playa/image.py b/playa/image.py index 714bd2af..5c35cfa4 100644 --- 
a/playa/image.py +++ b/playa/image.py @@ -4,16 +4,16 @@ from io import BytesIO from typing import BinaryIO, Literal, Tuple -from playa.exceptions import PDFValueError -from playa.jbig2 import JBIG2StreamReader, JBIG2StreamWriter -from playa.layout import LTImage -from playa.pdfcolor import ( +from playa.color import ( LITERAL_DEVICE_CMYK, LITERAL_DEVICE_GRAY, LITERAL_DEVICE_RGB, LITERAL_INLINE_DEVICE_GRAY, LITERAL_INLINE_DEVICE_RGB, ) +from playa.exceptions import PDFValueError +from playa.jbig2 import JBIG2StreamReader, JBIG2StreamWriter +from playa.layout import LTImage from playa.pdftypes import ( LITERALS_DCT_DECODE, LITERALS_FLATE_DECODE, diff --git a/playa/layout.py b/playa/layout.py index 0085b12f..ee311284 100644 --- a/playa/layout.py +++ b/playa/layout.py @@ -1,22 +1,18 @@ import logging from typing import ( - Generic, Iterable, Iterator, List, Optional, Tuple, - TypeVar, Union, - cast, ) +from playa.color import PDFColorSpace from playa.exceptions import PDFValueError -from playa.pdfcolor import PDFColorSpace -from playa.pdffont import PDFFont -from playa.pdftypes import PDFStream +from playa.font import PDFFont +from playa.pdftypes import ContentStream from playa.utils import ( - INF, Matrix, PathSegment, Point, @@ -85,31 +81,10 @@ def __repr__(self) -> str: ) -class LTItem: - """Interface for things that can be analyzed""" - - # Any item could be in a marked content section - mcid: Optional[int] = None - # Which could have a tag - tag: Optional[str] = None - - -class LTText: - """Interface for things that have text""" - - def __repr__(self) -> str: - return f"<{self.__class__.__name__} {self.get_text()!r}>" - - def get_text(self) -> str: - """Text contained in this object""" - raise NotImplementedError - - -class LTComponent(LTItem): +class LTComponent: """Object with a bounding box""" def __init__(self, bbox: Rect) -> None: - LTItem.__init__(self) self.set_bbox(bbox) def __repr__(self) -> str: @@ -298,7 +273,7 @@ class LTImage(LTComponent): Embedded 
images can be in JPEG, Bitmap or JBIG2. """ - def __init__(self, name: str, stream: PDFStream, bbox: Rect) -> None: + def __init__(self, name: str, stream: ContentStream, bbox: Rect) -> None: LTComponent.__init__(self, bbox) self.name = name self.stream = stream @@ -313,22 +288,7 @@ def __repr__(self) -> str: return f"<{self.__class__.__name__}({self.name}) {bbox2str(self.bbox)} {self.srcsize!r}>" -class LTAnno(LTItem, LTText): - """Actual letter in the text as a Unicode string. - - Note that, while a LTChar object has actual boundaries, LTAnno objects does - not, as these are "virtual" characters, inserted by a layout analyzer - according to the relationship between two characters (e.g. a space). - """ - - def __init__(self, text: str) -> None: - self._text = text - - def get_text(self) -> str: - return self._text - - -class LTChar(LTComponent, LTText): +class LTChar(LTComponent): """Actual letter in the text as a Unicode string.""" def __init__( @@ -348,7 +308,6 @@ def __init__( stroking_color: Optional[Color] = None, non_stroking_color: Optional[Color] = None, ) -> None: - LTText.__init__(self) self._text = text self.matrix = matrix self.fontname = font.fontname @@ -396,54 +355,7 @@ def get_text(self) -> str: return self._text -LTItemT = TypeVar("LTItemT", bound=LTItem) - - -class LTContainer(LTComponent, Generic[LTItemT]): - """Object that can be extended and analyzed""" - - def __init__(self, bbox: Rect) -> None: - LTComponent.__init__(self, bbox) - self._objs: List[LTItemT] = [] - - def __iter__(self) -> Iterator[LTItemT]: - return iter(self._objs) - - def __len__(self) -> int: - return len(self._objs) - - def add(self, obj: LTItemT) -> None: - self._objs.append(obj) - - def extend(self, objs: Iterable[LTItemT]) -> None: - for obj in objs: - self.add(obj) - - -class LTExpandableContainer(LTContainer[LTItemT]): - def __init__(self) -> None: - LTContainer.__init__(self, (+INF, +INF, -INF, -INF)) - - # Incompatible override: we take an LTComponent (with bounding 
box), but - # super() LTContainer only considers LTItem (no bounding box). - def add(self, obj: LTComponent) -> None: # type: ignore[override] - LTContainer.add(self, cast(LTItemT, obj)) - self.set_bbox( - ( - min(self.x0, obj.x0), - min(self.y0, obj.y0), - max(self.x1, obj.x1), - max(self.y1, obj.y1), - ), - ) - - -class LTLayoutContainer(LTContainer[LTComponent]): - def __init__(self, bbox: Rect) -> None: - LTContainer.__init__(self, bbox) - - -class LTFigure(LTLayoutContainer): +class LTFigure(LTComponent): """Represents an area used by PDF Form objects. PDF Forms can be used to present figures or pictures by embedding yet @@ -457,23 +369,21 @@ def __init__(self, name: str, bbox: Rect, matrix: Matrix) -> None: (x, y, w, h) = bbox bounds = ((x, y), (x + w, y), (x, y + h), (x + w, y + h)) bbox = get_bound(apply_matrix_pt(matrix, (p, q)) for (p, q) in bounds) - LTLayoutContainer.__init__(self, bbox) - - def __repr__(self) -> str: - return f"<{self.__class__.__name__}({self.name}) {bbox2str(self.bbox)} matrix={matrix2str(self.matrix)}>" + LTComponent.__init__(self, bbox) + self._objs: List[LTComponent] = [] + def __iter__(self) -> Iterator[LTComponent]: + return iter(self._objs) -class LTPage(LTLayoutContainer): - """Represents an entire page. + def __len__(self) -> int: + return len(self._objs) - Like any other LTLayoutContainer, an LTPage can be iterated to obtain child - objects like LTTextBox, LTFigure, LTImage, LTRect, LTCurve and LTLine. 
- """ + def add(self, obj: LTComponent) -> None: + self._objs.append(obj) - def __init__(self, pageid: int, bbox: Rect, rotate: float = 0) -> None: - LTLayoutContainer.__init__(self, bbox) - self.pageid = pageid - self.rotate = rotate + def extend(self, objs: Iterable[LTComponent]) -> None: + for obj in objs: + self.add(obj) def __repr__(self) -> str: - return f"<{self.__class__.__name__}({self.pageid!r}) {bbox2str(self.bbox)} rotate={self.rotate!r}>" + return f"<{self.__class__.__name__}({self.name}) {bbox2str(self.bbox)} matrix={matrix2str(self.matrix)}>" diff --git a/playa/pdfpage.py b/playa/page.py similarity index 82% rename from playa/pdfpage.py rename to playa/page.py index fff7a579..73cddd0d 100644 --- a/playa/pdfpage.py +++ b/playa/page.py @@ -5,6 +5,7 @@ TYPE_CHECKING, Dict, Iterable, + Iterator, List, Optional, Sequence, @@ -15,14 +16,15 @@ from playa import settings from playa.casting import safe_float +from playa.color import PREDEFINED_COLORSPACE, PDFColorSpace from playa.exceptions import ( - PSEOF, PDFInterpreterError, PDFSyntaxError, PDFUnicodeNotDefined, PDFValueError, PSTypeError, ) +from playa.font import PDFFont from playa.layout import ( Color, LTChar, @@ -30,36 +32,26 @@ LTCurve, LTFigure, LTImage, - LTLayoutContainer, LTLine, - LTPage, LTRect, PDFGraphicState, ) -from playa.pdfcolor import PREDEFINED_COLORSPACE, PDFColorSpace -from playa.pdffont import ( - PDFFont, -) +from playa.parser import Parser, PDFStackT, PSBaseParserToken from playa.pdftypes import ( - LITERALS_ASCII85_DECODE, - PDFObjRef, - PDFStream, - dict_value, - int_value, - list_value, - resolve1, - stream_value, -) -from playa.psparser import ( KWD, LIT, - PSBaseParserToken, + LITERALS_ASCII85_DECODE, + ContentStream, + ObjRef, PSKeyword, PSLiteral, - PSStackParser, - PSStackType, + dict_value, + int_value, keyword_name, + list_value, literal_name, + resolve1, + stream_value, ) from playa.utils import ( MATRIX_IDENTITY, @@ -77,7 +69,7 @@ ) if TYPE_CHECKING: - from 
playa.pdfdocument import PDFDocument, PDFResourceManager + from playa.document import PDFDocument log = logging.getLogger(__name__) @@ -89,47 +81,48 @@ PDFTextSeq = Iterable[Union[int, float, bytes]] -class PDFPage: +class Page: """An object that holds the information about a page. - A PDFPage object is merely a convenience class that has a set + A Page object is merely a convenience class that has a set of keys and values, which describe the properties of a page and point to its contents. Attributes ---------- - pageid: any Python object that can uniquely identify the page. + pageid: the integer object ID associated with the page in the page tree attrs: a dictionary of page attributes. - contents: a list of PDFStream objects that represents the page content. + contents: a list of ContentStream objects that represents the page content. resources: a dictionary of resources used by the page. mediabox: the physical size of the page. cropbox: the crop rectangle of the page. rotate: the page rotation (in degree). label: the page's label (typically, the logical page number). + page_number: the "physical" page number, indexed from 1. """ def __init__( self, doc: "PDFDocument", - pageid: object, - attrs: object, + pageid: int, + attrs: Dict, label: Optional[str], - page_number: int = 1, + page_idx: int = 0, ) -> None: """Initialize a page object. doc: a PDFDocument object. - pageid: any Python object that can uniquely identify the page. + pageid: the integer PDF object ID associated with the page in the page tree. attrs: a dictionary of page attributes. label: page label string. - page_number: page number (starting from 1) + page_idx: 0-based index of the page in the document. 
""" self.doc = weakref.ref(doc) self.pageid = pageid - self.attrs = dict_value(attrs) + self.attrs = attrs self.label = label - self.page_number = page_number + self.page_idx = page_idx self.lastmod = resolve1(self.attrs.get("LastModified")) self.resources: Dict[object, object] = resolve1( self.attrs.get("Resources", dict()), @@ -162,32 +155,16 @@ def __init__( self.contents = [self.contents] else: self.contents = [] - self._layout: Optional["LTPage"] = None @property - def layout(self) -> "LTPage": - if self._layout is not None: - return self._layout - - doc = self.doc() - if doc is None: - raise RuntimeError("Document no longer exists!") - # Q: How many classes does does it take a Java programmer to - # install a lightbulb? - device = PDFLayoutAnalyzer( - doc.rsrcmgr, - pageno=self.page_number, - ) - interpreter = PDFPageInterpreter(doc.rsrcmgr, device) - interpreter.process_page(self) - assert device.result is not None - self._layout = device.result - return self._layout + def layout(self) -> Iterator[LTComponent]: + return iter(PageInterpreter(self)) def __repr__(self) -> str: - return f"" + return f"" +# FIXME: Make a dataclass or NamedTuple class PDFTextState: matrix: Matrix linematrix: Point @@ -248,7 +225,7 @@ def reset(self) -> None: KEYWORD_EI = KWD(b"EI") -class PDFContentParser(PSStackParser[Union[PSKeyword, PDFStream]]): +class PDFContentParser(Parser[Union[PSKeyword, ContentStream]]): """Parse the concatenation of multiple content streams, as described in the spec (PDF 1.7, p.86): @@ -265,20 +242,21 @@ def __init__(self, streams: Sequence[object]) -> None: self.streamiter = iter(streams) try: stream = stream_value(next(self.streamiter)) + log.debug("PDFContentParser starting stream %r", stream) + super().__init__(stream.get_data()) except StopIteration: - raise PSEOF - log.debug("PDFContentParser starting stream %r", stream) - super().__init__(stream.get_data()) + log.debug("PDFContentParser has no content, returning nothing") + super().__init__(b"") 
- def __next__(self) -> Tuple[int, PSBaseParserToken]: + def nexttoken(self) -> Tuple[int, PSBaseParserToken]: while True: try: - return super().__next__() + return super().nexttoken() except StopIteration: # Will also raise StopIteration if there are no more, # which is exactly what we want stream = stream_value(next(self.streamiter)) - log.debug("PDFContentParser starting stream %r", stream) + log.debug("PDFContentParser starting new stream %r", stream) self.reinit(stream.get_data()) def flush(self) -> None: @@ -323,7 +301,7 @@ def do_keyword(self, pos: int, token: PSKeyword) -> None: (pos, data) = self.get_inline_data(target=eos) if pos == -1: raise PDFSyntaxError("End of inline stream %r not found" % eos) - obj = PDFStream(d, data) + obj = ContentStream(d, data) self.push((pos, obj)) # This was included in the data but we need to "parse" it if eos == b"EI": @@ -335,592 +313,257 @@ def do_keyword(self, pos: int, token: PSKeyword) -> None: self.push((pos, token)) -PDFStackT = PSStackType[PDFStream] -"""Types that may appear on the PDF argument stack.""" +class PageInterpreter: + """Processor for the content of a PDF page + Reference: PDF Reference, Appendix A, Operator Summary + """ -class PDFLayoutAnalyzer: - cur_item: LTLayoutContainer ctm: Matrix cur_mcid: Optional[int] = None cur_tag: Optional[str] = None def __init__( self, - rsrcmgr: "PDFResourceManager", - pageno: int = 1, + page: Page, + resources: Union[Dict, None] = None, + contents: Union[List, None] = None, ) -> None: - self.rsrcmgr = rsrcmgr - self.pageno = pageno - self._stack: List[LTLayoutContainer] = [] - self.result: Optional[LTPage] = None + self.page = page + self.contents = page.contents if contents is None else contents + (x0, y0, x1, y1) = page.mediabox + # FIXME: NO, this is bad, pdfplumber has a bug related to it + # (specifically the translation, the rotation is kind of okay + # it seems) + if page.rotate == 90: + ctm = (0, -1, 1, 0, -y0, x1) + elif page.rotate == 180: + ctm = (-1, 0, 0, 
-1, x1, y1) + elif page.rotate == 270: + ctm = (0, 1, -1, 0, y1, -x0) + else: + ctm = (1, 0, 0, 1, -x0, -y0) + self.init_resources(page, page.resources if resources is None else resources) + self.init_state(ctm) + + def init_resources(self, page: Page, resources: Dict) -> None: + """Prepare the fonts and XObjects listed in the Resource attribute.""" + self.resources = resources + self.fontmap: Dict[object, PDFFont] = {} + self.xobjmap = {} + self.csmap: Dict[str, PDFColorSpace] = PREDEFINED_COLORSPACE.copy() + if not self.resources: + return + doc = page.doc() + if doc is None: + raise RuntimeError("Document no longer exists!") + + def get_colorspace(spec: object) -> Optional[PDFColorSpace]: + if isinstance(spec, list): + name = literal_name(spec[0]) + else: + name = literal_name(spec) + if name == "ICCBased" and isinstance(spec, list) and len(spec) >= 2: + return PDFColorSpace(name, stream_value(spec[1])["N"]) + elif name == "DeviceN" and isinstance(spec, list) and len(spec) >= 2: + return PDFColorSpace(name, len(list_value(spec[1]))) + else: + return PREDEFINED_COLORSPACE.get(name) + + for k, v in dict_value(self.resources).items(): + log.debug("Resource: %r: %r", k, v) + if k == "Font": + for fontid, spec in dict_value(v).items(): + objid = None + if isinstance(spec, ObjRef): + objid = spec.objid + spec = dict_value(spec) + self.fontmap[fontid] = doc.get_font(objid, spec) + elif k == "ColorSpace": + for csid, spec in dict_value(v).items(): + colorspace = get_colorspace(resolve1(spec)) + if colorspace is not None: + self.csmap[csid] = colorspace + elif k == "ProcSet": + pass # called get_procset which did exactly + # nothing. perhaps we want to do something? + elif k == "XObject": + for xobjid, xobjstrm in dict_value(v).items(): + self.xobjmap[xobjid] = xobjstrm - def set_ctm(self, ctm: Matrix) -> None: + def init_state(self, ctm: Matrix) -> None: + """Initialize the text and graphic states for rendering a page.""" + # gstack: stack for graphical states. 
+ self.gstack: List[Tuple[Matrix, PDFTextState, PDFGraphicState]] = [] self.ctm = ctm + self.textstate = PDFTextState() + self.graphicstate = PDFGraphicState() + self.curpath: List[PathSegment] = [] + # argstack: stack for command arguments. + self.argstack: List[PDFStackT] = [] + # set some global states. + self.scs: Optional[PDFColorSpace] = None + self.ncs: Optional[PDFColorSpace] = None + if self.csmap: + self.scs = self.ncs = next(iter(self.csmap.values())) - def begin_page(self, page: PDFPage, ctm: Matrix) -> None: - (x0, y0, x1, y1) = page.mediabox - (x0, y0) = apply_matrix_pt(ctm, (x0, y0)) - (x1, y1) = apply_matrix_pt(ctm, (x1, y1)) - mediabox = (0, 0, abs(x0 - x1), abs(y0 - y1)) - self.cur_item = LTPage(self.pageno, mediabox) - - def end_page(self, page: PDFPage) -> None: - assert not self._stack, str(len(self._stack)) - assert isinstance(self.cur_item, LTPage), str(type(self.cur_item)) - self.pageno += 1 - self.receive_layout(self.cur_item) - - def begin_figure(self, name: str, bbox: Rect, matrix: Matrix) -> None: - self._stack.append(self.cur_item) - self.cur_item = LTFigure(name, bbox, mult_matrix(matrix, self.ctm)) - - def end_figure(self, _: str) -> None: - fig = self.cur_item - assert isinstance(self.cur_item, LTFigure), str(type(self.cur_item)) - self.cur_item = self._stack.pop() - self.cur_item.add(fig) + def __iter__(self) -> Iterator[LTComponent]: + log.debug( + "PageInterpreter: resources=%r, streams=%r, ctm=%r", + self.resources, + self.contents, + self.ctm, + ) + parser = PDFContentParser(self.contents) + for _, obj in parser: + if isinstance(obj, PSKeyword): + name = keyword_name(obj) + method = "do_%s" % name.replace("*", "_a").replace('"', "_w").replace( + "'", + "_q", + ) + if hasattr(self, method): + func = getattr(self, method) + nargs = func.__code__.co_argcount - 1 + if nargs: + args = self.pop(nargs) + log.debug("exec: %s %r", name, args) + if len(args) == nargs: + gen = func(*args) + else: + error_msg = ( + "Insufficient arguments 
(%d) for operator: %r" + % (len(args), name) + ) + raise PDFInterpreterError(error_msg) + else: + log.debug("exec: %s", name) + gen = func() + if gen is not None: + yield from gen + elif settings.STRICT: + error_msg = "Unknown operator: %r" % name + raise PDFInterpreterError(error_msg) + else: + self.push(obj) - def begin_tag(self, tag: PSLiteral, props: Optional[PDFStackT] = None) -> None: - """Handle beginning of tag, setting current MCID if any.""" - self.cur_tag = decode_text(tag.name) - if isinstance(props, dict) and "MCID" in props: - self.cur_mcid = props["MCID"] - else: - self.cur_mcid = None + def push(self, obj: PDFStackT) -> None: + self.argstack.append(obj) - def do_tag(self, tag: PSLiteral, props: Optional["PDFStackT"] = None) -> None: - pass + def pop(self, n: int) -> List[PDFStackT]: + if n == 0: + return [] + x = self.argstack[-n:] + self.argstack = self.argstack[:-n] + return x - def end_tag(self) -> None: - """Handle beginning of tag, clearing current MCID.""" - self.cur_tag = None - self.cur_mcid = None + def get_current_state(self) -> Tuple[Matrix, PDFTextState, PDFGraphicState]: + return (self.ctm, self.textstate.copy(), self.graphicstate.copy()) - def add_item(self, item: LTComponent) -> None: - item.mcid = self.cur_mcid - item.tag = self.cur_tag - self.cur_item.add(item) + def set_current_state( + self, + state: Tuple[Matrix, PDFTextState, PDFGraphicState], + ) -> None: + (self.ctm, self.textstate, self.graphicstate) = state - def render_image(self, name: str, stream: PDFStream) -> None: - assert isinstance(self.cur_item, LTFigure), str(type(self.cur_item)) - item = LTImage( - name, - stream, - (self.cur_item.x0, self.cur_item.y0, self.cur_item.x1, self.cur_item.y1), - ) - self.add_item(item) + def do_q(self) -> None: + """Save graphics state""" + self.gstack.append(self.get_current_state()) - def paint_path( + def do_Q(self) -> None: + """Restore graphics state""" + if self.gstack: + self.set_current_state(self.gstack.pop()) + + def do_cm( 
self, - gstate: PDFGraphicState, - stroke: bool, - fill: bool, - evenodd: bool, - path: Sequence[PathSegment], - ncs: Optional[PDFColorSpace] = None, - scs: Optional[PDFColorSpace] = None, + a1: PDFStackT, + b1: PDFStackT, + c1: PDFStackT, + d1: PDFStackT, + e1: PDFStackT, + f1: PDFStackT, ) -> None: - """Paint paths described in section 4.4 of the PDF reference manual""" - shape = "".join(x[0] for x in path) + """Concatenate matrix to current transformation matrix""" + self.ctm = mult_matrix(cast(Matrix, (a1, b1, c1, d1, e1, f1)), self.ctm) - if shape[:1] != "m": - # Per PDF Reference Section 4.4.1, "path construction operators may - # be invoked in any sequence, but the first one invoked must be m - # or re to begin a new subpath." Since pdfminer.six already - # converts all `re` (rectangle) operators to their equivelent - # `mlllh` representation, paths ingested by `.paint_path(...)` that - # do not begin with the `m` operator are invalid. - pass + def do_w(self, linewidth: PDFStackT) -> None: + """Set line width""" + self.graphicstate.linewidth = cast(float, linewidth) - elif shape.count("m") > 1: - # recurse if there are multiple m's in this shape - for m in re.finditer(r"m[^m]+", shape): - subpath = path[m.start(0) : m.end(0)] - self.paint_path(gstate, stroke, fill, evenodd, subpath, ncs, scs) + def do_J(self, linecap: PDFStackT) -> None: + """Set line cap style""" + self.graphicstate.linecap = linecap - else: - # Although the 'h' command does not not literally provide a - # point-position, its position is (by definition) equal to the - # subpath's starting point. - # - # And, per Section 4.4's Table 4.9, all other path commands place - # their point-position in their final two arguments. (Any preceding - # arguments represent control points on Bézier curves.) 
- raw_pts = [ - cast(Point, p[-2:] if p[0] != "h" else path[0][-2:]) for p in path - ] - pts = [apply_matrix_pt(self.ctm, pt) for pt in raw_pts] + def do_j(self, linejoin: PDFStackT) -> None: + """Set line join style""" + self.graphicstate.linejoin = linejoin - operators = [str(operation[0]) for operation in path] - transformed_points = [ - [ - apply_matrix_pt(self.ctm, (float(operand1), float(operand2))) - for operand1, operand2 in zip(operation[1::2], operation[2::2]) - ] - for operation in path - ] - transformed_path = [ - cast(PathSegment, (o, *p)) - for o, p in zip(operators, transformed_points) - ] + def do_M(self, miterlimit: PDFStackT) -> None: + """Set miter limit""" + self.graphicstate.miterlimit = miterlimit - if shape in {"mlh", "ml"}: - # single line segment - # - # Note: 'ml', in conditional above, is a frequent anomaly - # that we want to support. - line = LTLine( - gstate.linewidth, - pts[0], - pts[1], - stroke, - fill, - evenodd, - gstate.scolor, - gstate.ncolor, - original_path=transformed_path, - dashing_style=gstate.dash, - ncs=ncs, - scs=scs, - ) - self.add_item(line) + def do_d(self, dash: PDFStackT, phase: PDFStackT) -> None: + """Set line dash pattern""" + self.graphicstate.dash = (dash, phase) - elif shape in {"mlllh", "mllll"}: - (x0, y0), (x1, y1), (x2, y2), (x3, y3), _ = pts + def do_ri(self, intent: PDFStackT) -> None: + """Set color rendering intent""" + self.graphicstate.intent = intent - is_closed_loop = pts[0] == pts[4] - has_square_coordinates = ( - x0 == x1 and y1 == y2 and x2 == x3 and y3 == y0 - ) or (y0 == y1 and x1 == x2 and y2 == y3 and x3 == x0) - if is_closed_loop and has_square_coordinates: - rect = LTRect( - gstate.linewidth, - (*pts[0], *pts[2]), - stroke, - fill, - evenodd, - gstate.scolor, - gstate.ncolor, - transformed_path, - gstate.dash, - ncs, - scs, - ) - self.add_item(rect) - else: - curve = LTCurve( - gstate.linewidth, - pts, - stroke, - fill, - evenodd, - gstate.scolor, - gstate.ncolor, - transformed_path, - 
gstate.dash, - ncs, - scs, - ) - self.add_item(curve) - else: - curve = LTCurve( - gstate.linewidth, - pts, - stroke, - fill, - evenodd, - gstate.scolor, - gstate.ncolor, - transformed_path, - gstate.dash, - ncs, - scs, - ) - self.add_item(curve) + def do_i(self, flatness: PDFStackT) -> None: + """Set flatness tolerance""" + self.graphicstate.flatness = flatness - def render_char( - self, - matrix: Matrix, - font: PDFFont, - fontsize: float, - scaling: float, - rise: float, - cid: int, - ncs: PDFColorSpace, - graphicstate: PDFGraphicState, - scs: Optional[PDFColorSpace] = None, - ) -> float: - try: - text = font.to_unichr(cid) - assert isinstance(text, str), str(type(text)) - except PDFUnicodeNotDefined: - text = self.handle_undefined_char(font, cid) - textwidth = font.char_width(cid) - textdisp = font.char_disp(cid) - item = LTChar( - matrix, - font, - fontsize, - scaling, - rise, - text, - textwidth, - textdisp, - ncs, - graphicstate, - scs, - graphicstate.scolor, - graphicstate.ncolor, - ) - self.add_item(item) - return item.adv + def do_gs(self, name: PDFStackT) -> None: + """Set parameters from graphics state parameter dictionary""" + # TODO - def render_string( - self, - textstate: "PDFTextState", - seq: PDFTextSeq, - ncs: PDFColorSpace, - graphicstate: "PDFGraphicState", - scs: Optional[PDFColorSpace] = None, - ) -> None: - assert self.ctm is not None - matrix = mult_matrix(textstate.matrix, self.ctm) - font = textstate.font - fontsize = textstate.fontsize - scaling = textstate.scaling * 0.01 - charspace = textstate.charspace * scaling - wordspace = textstate.wordspace * scaling - rise = textstate.rise - assert font is not None - if font.is_multibyte(): - wordspace = 0 - dxscale = 0.001 * fontsize * scaling - if font.is_vertical(): - textstate.linematrix = self.render_string_vertical( - seq, - matrix, - textstate.linematrix, - font, - fontsize, - scaling, - charspace, - wordspace, - rise, - dxscale, - ncs, - graphicstate, - scs, - ) - else: - 
textstate.linematrix = self.render_string_horizontal( - seq, - matrix, - textstate.linematrix, - font, - fontsize, - scaling, - charspace, - wordspace, - rise, - dxscale, - ncs, - graphicstate, - scs, - ) + def do_m(self, x: PDFStackT, y: PDFStackT) -> None: + """Begin new subpath""" + self.curpath.append(("m", cast(float, x), cast(float, y))) - def render_string_horizontal( - self, - seq: PDFTextSeq, - matrix: Matrix, - pos: Point, - font: PDFFont, - fontsize: float, - scaling: float, - charspace: float, - wordspace: float, - rise: float, - dxscale: float, - ncs: PDFColorSpace, - graphicstate: "PDFGraphicState", - scs: Optional[PDFColorSpace] = None, - ) -> Point: - (x, y) = pos - needcharspace = False - for obj in seq: - if isinstance(obj, (int, float)): - x -= obj * dxscale - needcharspace = True - else: - if isinstance(obj, str): - obj = make_compat_bytes(obj) - if not isinstance(obj, bytes): - continue - for cid in font.decode(obj): - if needcharspace: - x += charspace - x += self.render_char( - translate_matrix(matrix, (x, y)), - font, - fontsize, - scaling, - rise, - cid, - ncs, - graphicstate, - scs, - ) - if cid == 32 and wordspace: - x += wordspace - needcharspace = True - return (x, y) + def do_l(self, x: PDFStackT, y: PDFStackT) -> None: + """Append straight line segment to path""" + self.curpath.append(("l", cast(float, x), cast(float, y))) - def render_string_vertical( + def do_c( self, - seq: PDFTextSeq, - matrix: Matrix, - pos: Point, - font: PDFFont, - fontsize: float, - scaling: float, - charspace: float, - wordspace: float, - rise: float, - dxscale: float, - ncs: PDFColorSpace, - graphicstate: "PDFGraphicState", - scs: Optional[PDFColorSpace] = None, - ) -> Point: - (x, y) = pos - needcharspace = False - for obj in seq: - if isinstance(obj, (int, float)): - y -= obj * dxscale - needcharspace = True - else: - if isinstance(obj, str): - obj = make_compat_bytes(obj) - if not isinstance(obj, bytes): - continue - for cid in font.decode(obj): - if 
needcharspace: - y += charspace - y += self.render_char( - translate_matrix(matrix, (x, y)), - font, - fontsize, - scaling, - rise, - cid, - ncs, - graphicstate, - scs, - ) - if cid == 32 and wordspace: - y += wordspace - needcharspace = True - return (x, y) - - def handle_undefined_char(self, font: PDFFont, cid: int) -> str: - log.debug("undefined: %r, %r", font, cid) - return "(cid:%d)" % cid - - def receive_layout(self, ltpage: LTPage) -> None: - self.result = ltpage - - -class PDFPageInterpreter: - """Processor for the content of a PDF page + x1: PDFStackT, + y1: PDFStackT, + x2: PDFStackT, + y2: PDFStackT, + x3: PDFStackT, + y3: PDFStackT, + ) -> None: + """Append curved segment to path (three control points)""" + self.curpath.append( + ( + "c", + cast(float, x1), + cast(float, y1), + cast(float, x2), + cast(float, y2), + cast(float, x3), + cast(float, y3), + ), + ) - Reference: PDF Reference, Appendix A, Operator Summary - """ + def do_v(self, x2: PDFStackT, y2: PDFStackT, x3: PDFStackT, y3: PDFStackT) -> None: + """Append curved segment to path (initial point replicated)""" + self.curpath.append( + ("v", cast(float, x2), cast(float, y2), cast(float, x3), cast(float, y3)), + ) - def __init__( - self, rsrcmgr: "PDFResourceManager", device: "PDFLayoutAnalyzer" - ) -> None: - self.rsrcmgr = rsrcmgr - self.device = device + def do_y(self, x1: PDFStackT, y1: PDFStackT, x3: PDFStackT, y3: PDFStackT) -> None: + """Append curved segment to path (final point replicated)""" + self.curpath.append( + ("y", cast(float, x1), cast(float, y1), cast(float, x3), cast(float, y3)), + ) - def dup(self) -> "PDFPageInterpreter": - return self.__class__(self.rsrcmgr, self.device) - - def init_resources(self, resources: Dict[object, object]) -> None: - """Prepare the fonts and XObjects listed in the Resource attribute.""" - self.resources = resources - self.fontmap: Dict[object, PDFFont] = {} - self.xobjmap = {} - self.csmap: Dict[str, PDFColorSpace] = PREDEFINED_COLORSPACE.copy() - 
if not resources: - return - - def get_colorspace(spec: object) -> Optional[PDFColorSpace]: - if isinstance(spec, list): - name = literal_name(spec[0]) - else: - name = literal_name(spec) - if name == "ICCBased" and isinstance(spec, list) and len(spec) >= 2: - return PDFColorSpace(name, stream_value(spec[1])["N"]) - elif name == "DeviceN" and isinstance(spec, list) and len(spec) >= 2: - return PDFColorSpace(name, len(list_value(spec[1]))) - else: - return PREDEFINED_COLORSPACE.get(name) - - for k, v in dict_value(resources).items(): - log.debug("Resource: %r: %r", k, v) - if k == "Font": - for fontid, spec in dict_value(v).items(): - objid = None - if isinstance(spec, PDFObjRef): - objid = spec.objid - spec = dict_value(spec) - self.fontmap[fontid] = self.rsrcmgr.get_font(objid, spec) - elif k == "ColorSpace": - for csid, spec in dict_value(v).items(): - colorspace = get_colorspace(resolve1(spec)) - if colorspace is not None: - self.csmap[csid] = colorspace - elif k == "ProcSet": - self.rsrcmgr.get_procset(list_value(v)) - elif k == "XObject": - for xobjid, xobjstrm in dict_value(v).items(): - self.xobjmap[xobjid] = xobjstrm - - def init_state(self, ctm: Matrix) -> None: - """Initialize the text and graphic states for rendering a page.""" - # gstack: stack for graphical states. - self.gstack: List[Tuple[Matrix, PDFTextState, PDFGraphicState]] = [] - self.ctm = ctm - self.device.set_ctm(self.ctm) - self.textstate = PDFTextState() - self.graphicstate = PDFGraphicState() - self.curpath: List[PathSegment] = [] - # argstack: stack for command arguments. - self.argstack: List[PDFStackT] = [] - # set some global states. 
- self.scs: Optional[PDFColorSpace] = None - self.ncs: Optional[PDFColorSpace] = None - if self.csmap: - self.scs = self.ncs = next(iter(self.csmap.values())) - - def push(self, obj: PDFStackT) -> None: - self.argstack.append(obj) - - def pop(self, n: int) -> List[PDFStackT]: - if n == 0: - return [] - x = self.argstack[-n:] - self.argstack = self.argstack[:-n] - return x - - def get_current_state(self) -> Tuple[Matrix, PDFTextState, PDFGraphicState]: - return (self.ctm, self.textstate.copy(), self.graphicstate.copy()) - - def set_current_state( - self, - state: Tuple[Matrix, PDFTextState, PDFGraphicState], - ) -> None: - (self.ctm, self.textstate, self.graphicstate) = state - self.device.set_ctm(self.ctm) - - def do_q(self) -> None: - """Save graphics state""" - self.gstack.append(self.get_current_state()) - - def do_Q(self) -> None: - """Restore graphics state""" - if self.gstack: - self.set_current_state(self.gstack.pop()) - - def do_cm( - self, - a1: PDFStackT, - b1: PDFStackT, - c1: PDFStackT, - d1: PDFStackT, - e1: PDFStackT, - f1: PDFStackT, - ) -> None: - """Concatenate matrix to current transformation matrix""" - self.ctm = mult_matrix(cast(Matrix, (a1, b1, c1, d1, e1, f1)), self.ctm) - self.device.set_ctm(self.ctm) - - def do_w(self, linewidth: PDFStackT) -> None: - """Set line width""" - self.graphicstate.linewidth = cast(float, linewidth) - - def do_J(self, linecap: PDFStackT) -> None: - """Set line cap style""" - self.graphicstate.linecap = linecap - - def do_j(self, linejoin: PDFStackT) -> None: - """Set line join style""" - self.graphicstate.linejoin = linejoin - - def do_M(self, miterlimit: PDFStackT) -> None: - """Set miter limit""" - self.graphicstate.miterlimit = miterlimit - - def do_d(self, dash: PDFStackT, phase: PDFStackT) -> None: - """Set line dash pattern""" - self.graphicstate.dash = (dash, phase) - - def do_ri(self, intent: PDFStackT) -> None: - """Set color rendering intent""" - self.graphicstate.intent = intent - - def do_i(self, 
flatness: PDFStackT) -> None: - """Set flatness tolerance""" - self.graphicstate.flatness = flatness - - def do_gs(self, name: PDFStackT) -> None: - """Set parameters from graphics state parameter dictionary""" - # TODO - - def do_m(self, x: PDFStackT, y: PDFStackT) -> None: - """Begin new subpath""" - self.curpath.append(("m", cast(float, x), cast(float, y))) - - def do_l(self, x: PDFStackT, y: PDFStackT) -> None: - """Append straight line segment to path""" - self.curpath.append(("l", cast(float, x), cast(float, y))) - - def do_c( - self, - x1: PDFStackT, - y1: PDFStackT, - x2: PDFStackT, - y2: PDFStackT, - x3: PDFStackT, - y3: PDFStackT, - ) -> None: - """Append curved segment to path (three control points)""" - self.curpath.append( - ( - "c", - cast(float, x1), - cast(float, y1), - cast(float, x2), - cast(float, y2), - cast(float, x3), - cast(float, y3), - ), - ) - - def do_v(self, x2: PDFStackT, y2: PDFStackT, x3: PDFStackT, y3: PDFStackT) -> None: - """Append curved segment to path (initial point replicated)""" - self.curpath.append( - ("v", cast(float, x2), cast(float, y2), cast(float, x3), cast(float, y3)), - ) - - def do_y(self, x1: PDFStackT, y1: PDFStackT, x3: PDFStackT, y3: PDFStackT) -> None: - """Append curved segment to path (final point replicated)""" - self.curpath.append( - ("y", cast(float, x1), cast(float, y1), cast(float, x3), cast(float, y3)), - ) - - def do_h(self) -> None: - """Close subpath""" - self.curpath.append(("h",)) + def do_h(self) -> None: + """Close subpath""" + self.curpath.append(("h",)) def do_re(self, x: PDFStackT, y: PDFStackT, w: PDFStackT, h: PDFStackT) -> None: """Append rectangle to path""" @@ -934,21 +577,21 @@ def do_re(self, x: PDFStackT, y: PDFStackT, w: PDFStackT, h: PDFStackT) -> None: self.curpath.append(("l", x, y + h)) self.curpath.append(("h",)) - def do_S(self) -> None: + def do_S(self) -> Iterator[LTComponent]: """Stroke path""" - self.device.paint_path( + yield from self.paint_path( self.graphicstate, True, 
False, False, self.curpath, self.ncs, self.scs ) self.curpath = [] - def do_s(self) -> None: + def do_s(self) -> Iterator[LTComponent]: """Close and stroke path""" self.do_h() - self.do_S() + yield from self.do_S() - def do_f(self) -> None: + def do_f(self) -> Iterator[LTComponent]: """Fill path using nonzero winding number rule""" - self.device.paint_path( + yield from self.paint_path( self.graphicstate, False, True, False, self.curpath, self.ncs, self.scs ) self.curpath = [] @@ -956,36 +599,36 @@ def do_f(self) -> None: def do_F(self) -> None: """Fill path using nonzero winding number rule (obsolete)""" - def do_f_a(self) -> None: + def do_f_a(self) -> Iterator[LTComponent]: """Fill path using even-odd rule""" - self.device.paint_path( + yield from self.paint_path( self.graphicstate, False, True, True, self.curpath, self.ncs, self.scs ) self.curpath = [] - def do_B(self) -> None: + def do_B(self) -> Iterator[LTComponent]: """Fill and stroke path using nonzero winding number rule""" - self.device.paint_path( + yield from self.paint_path( self.graphicstate, True, True, False, self.curpath, self.ncs, self.scs ) self.curpath = [] - def do_B_a(self) -> None: + def do_B_a(self) -> Iterator[LTComponent]: """Fill and stroke path using even-odd rule""" - self.device.paint_path( + yield from self.paint_path( self.graphicstate, True, True, True, self.curpath, self.ncs, self.scs ) self.curpath = [] - def do_b(self) -> None: + def do_b(self) -> Iterator[LTComponent]: """Close, fill, and stroke path using nonzero winding number rule""" self.do_h() - self.do_B() + yield from self.do_B() - def do_b_a(self) -> None: + def do_b_a(self) -> Iterator[LTComponent]: """Close, fill, and stroke path using even-odd rule""" self.do_h() - self.do_B_a() + yield from self.do_B_a() def do_n(self) -> None: """End path without filling or stroking""" @@ -1107,23 +750,23 @@ def do_EX(self) -> None: def do_MP(self, tag: PDFStackT) -> None: """Define marked-content point""" - 
self.device.do_tag(cast(PSLiteral, tag)) + self.do_tag(cast(PSLiteral, tag)) def do_DP(self, tag: PDFStackT, props: PDFStackT) -> None: """Define marked-content point with property list""" - self.device.do_tag(cast(PSLiteral, tag), props) + self.do_tag(cast(PSLiteral, tag), props) def do_BMC(self, tag: PDFStackT) -> None: """Begin marked-content sequence""" - self.device.begin_tag(cast(PSLiteral, tag)) + self.begin_tag(cast(PSLiteral, tag)) def do_BDC(self, tag: PDFStackT, props: PDFStackT) -> None: """Begin marked-content sequence with property list""" - self.device.begin_tag(cast(PSLiteral, tag), props) + self.begin_tag(cast(PSLiteral, tag), props) def do_EMC(self) -> None: """End marked-content sequence""" - self.device.end_tag() + self.end_tag() def do_Tc(self, space: PDFStackT) -> None: """Set character spacing. @@ -1171,7 +814,10 @@ def do_Tf(self, fontid: PDFStackT, fontsize: PDFStackT) -> None: except KeyError: if settings.STRICT: raise PDFInterpreterError("Undefined Font id: %r" % fontid) - self.textstate.font = self.rsrcmgr.get_font(None, {}) + doc = self.page.doc() + if doc is None: + raise RuntimeError("Document no longer exists!") + self.textstate.font = doc.get_font(None, {}) self.textstate.fontsize = cast(float, fontsize) def do_Tr(self, render: PDFStackT) -> None: @@ -1252,7 +898,7 @@ def do_T_a(self) -> None: ) self.textstate.linematrix = (0, 0) - def do_TJ(self, seq: PDFStackT) -> None: + def do_TJ(self, seq: PDFStackT) -> Iterator[LTComponent]: """Show text, allowing individual glyph positioning""" if self.textstate.font is None: if settings.STRICT: @@ -1261,7 +907,7 @@ def do_TJ(self, seq: PDFStackT) -> None: # FIXME: Are we sure? 
assert self.ncs is not None assert self.scs is not None - self.device.render_string( + yield from self.render_string( self.textstate, cast(PDFTextSeq, seq), self.ncs, @@ -1269,26 +915,28 @@ def do_TJ(self, seq: PDFStackT) -> None: self.scs, ) - def do_Tj(self, s: PDFStackT) -> None: + def do_Tj(self, s: PDFStackT) -> Iterator[LTComponent]: """Show text""" - self.do_TJ([s]) + yield from self.do_TJ([s]) - def do__q(self, s: PDFStackT) -> None: + def do__q(self, s: PDFStackT) -> Iterator[LTComponent]: """Move to next line and show text The ' (single quote) operator. """ self.do_T_a() - self.do_TJ([s]) + yield from self.do_TJ([s]) - def do__w(self, aw: PDFStackT, ac: PDFStackT, s: PDFStackT) -> None: + def do__w( + self, aw: PDFStackT, ac: PDFStackT, s: PDFStackT + ) -> Iterator[LTComponent]: """Set word and character spacing, move to next line, and show text The " (double quote) operator. """ self.do_Tw(aw) self.do_Tc(ac) - self.do_TJ([s]) + yield from self.do_TJ([s]) def do_BI(self) -> None: """Begin inline image object""" @@ -1296,15 +944,15 @@ def do_BI(self) -> None: def do_ID(self) -> None: """Begin inline image data""" - def do_EI(self, obj: PDFStackT) -> None: + def do_EI(self, obj: PDFStackT) -> Iterator[LTComponent]: """End inline image object""" - if isinstance(obj, PDFStream) and "W" in obj and "H" in obj: + if isinstance(obj, ContentStream) and "W" in obj and "H" in obj: iobjid = str(id(obj)) - self.device.begin_figure(iobjid, (0, 0, 1, 1), MATRIX_IDENTITY) - self.device.render_image(iobjid, obj) - self.device.end_figure(iobjid) + fig = LTFigure(iobjid, (0, 0, 1, 1), self.ctm) + fig.add(self.render_image(iobjid, obj, fig)) + yield fig - def do_Do(self, xobjid_arg: PDFStackT) -> None: + def do_Do(self, xobjid_arg: PDFStackT) -> Iterator[LTComponent]: """Invoke named XObject""" xobjid = literal_name(xobjid_arg) try: @@ -1316,7 +964,6 @@ def do_Do(self, xobjid_arg: PDFStackT) -> None: log.debug("Processing xobj: %r", xobj) subtype = xobj.get("Subtype") if 
subtype is LITERAL_FORM and "BBox" in xobj: - interpreter = self.dup() bbox = cast(Rect, list_value(xobj["BBox"])) matrix = cast(Matrix, list_value(xobj.get("Matrix", MATRIX_IDENTITY))) # According to PDF reference 1.7 section 4.9.1, XObjects in @@ -1324,92 +971,367 @@ def do_Do(self, xobjid_arg: PDFStackT) -> None: # instead of having their own Resources entry. xobjres = xobj.get("Resources") if xobjres: - resources = dict_value(xobjres) + interpreter = PageInterpreter( + self.page, resources=dict_value(xobjres), contents=[xobj] + ) else: - resources = self.resources.copy() - self.device.begin_figure(xobjid, bbox, matrix) - interpreter.render_contents( - resources, - [xobj], - ctm=mult_matrix(matrix, self.ctm), - ) - self.device.end_figure(xobjid) + interpreter = PageInterpreter(self.page, contents=[xobj]) + interpreter.ctm = mult_matrix(matrix, self.ctm) + fig = LTFigure(xobjid, bbox, interpreter.ctm) + for item in interpreter: + fig.add(item) + yield fig elif subtype is LITERAL_IMAGE and "Width" in xobj and "Height" in xobj: - self.device.begin_figure(xobjid, (0, 0, 1, 1), MATRIX_IDENTITY) - self.device.render_image(xobjid, xobj) - self.device.end_figure(xobjid) + fig = LTFigure(xobjid, (0, 0, 1, 1), self.ctm) + fig.add(self.render_image(xobjid, xobj, fig)) + yield fig else: # unsupported xobject type. 
pass - def process_page(self, page: PDFPage) -> None: - log.debug("Processing page: %r", page) - (x0, y0, x1, y1) = page.mediabox - # FIXME: NO, this is bad, pdfplumber has a bug related to it - # (specifically the translation, the rotation is kind of okay - # it seems) - if page.rotate == 90: - ctm = (0, -1, 1, 0, -y0, x1) - elif page.rotate == 180: - ctm = (-1, 0, 0, -1, x1, y1) - elif page.rotate == 270: - ctm = (0, 1, -1, 0, y1, -x0) + def begin_tag(self, tag: PSLiteral, props: Optional[PDFStackT] = None) -> None: + """Handle beginning of tag, setting current MCID if any.""" + self.cur_tag = decode_text(tag.name) + if isinstance(props, dict) and "MCID" in props: + self.cur_mcid = props["MCID"] else: - ctm = (1, 0, 0, 1, -x0, -y0) - self.device.begin_page(page, ctm) - self.render_contents(page.resources, page.contents, ctm=ctm) - self.device.end_page(page) + self.cur_mcid = None - def render_contents( - self, - resources: Dict[object, object], - streams: Sequence[object], - ctm: Matrix = MATRIX_IDENTITY, - ) -> None: - """Render the content streams. + def do_tag(self, tag: PSLiteral, props: Optional["PDFStackT"] = None) -> None: + pass - This method may be called recursively. 
- """ - log.debug( - "render_contents: resources=%r, streams=%r, ctm=%r", - resources, - streams, - ctm, + def end_tag(self) -> None: + """Handle beginning of tag, clearing current MCID.""" + self.cur_tag = None + self.cur_mcid = None + + def render_image( + self, name: str, stream: ContentStream, figure: LTFigure + ) -> LTImage: + return LTImage( + name, + stream, + (figure.x0, figure.y0, figure.x1, figure.y1), ) - self.init_resources(resources) - self.init_state(ctm) - self.execute(list_value(streams)) - def execute(self, streams: Sequence[object]) -> None: - try: - parser = PDFContentParser(streams) - except PSEOF: - # empty page - return - while True: - try: - (_, obj) = parser.nextobject() - except PSEOF: - break - if isinstance(obj, PSKeyword): - name = keyword_name(obj) - method = "do_%s" % name.replace("*", "_a").replace('"', "_w").replace( - "'", - "_q", + def paint_path( + self, + gstate: PDFGraphicState, + stroke: bool, + fill: bool, + evenodd: bool, + path: Sequence[PathSegment], + ncs: Optional[PDFColorSpace] = None, + scs: Optional[PDFColorSpace] = None, + ) -> Iterator[LTComponent]: + """Paint paths described in section 4.4 of the PDF reference manual""" + shape = "".join(x[0] for x in path) + + if shape[:1] != "m": + # Per PDF Reference Section 4.4.1, "path construction operators may + # be invoked in any sequence, but the first one invoked must be m + # or re to begin a new subpath." Since pdfminer.six already + # converts all `re` (rectangle) operators to their equivelent + # `mlllh` representation, paths ingested by `.paint_path(...)` that + # do not begin with the `m` operator are invalid. 
+ pass + + elif shape.count("m") > 1: + # recurse if there are multiple m's in this shape + for m in re.finditer(r"m[^m]+", shape): + subpath = path[m.start(0) : m.end(0)] + yield from self.paint_path( + gstate, stroke, fill, evenodd, subpath, ncs, scs ) - if hasattr(self, method): - func = getattr(self, method) - nargs = func.__code__.co_argcount - 1 - if nargs: - args = self.pop(nargs) - log.debug("exec: %s %r", name, args) - if len(args) == nargs: - func(*args) - else: - log.debug("exec: %s", name) - func() - elif settings.STRICT: - error_msg = "Unknown operator: %r" % name - raise PDFInterpreterError(error_msg) - else: - self.push(obj) + + else: + # Although the 'h' command does not not literally provide a + # point-position, its position is (by definition) equal to the + # subpath's starting point. + # + # And, per Section 4.4's Table 4.9, all other path commands place + # their point-position in their final two arguments. (Any preceding + # arguments represent control points on Bézier curves.) + raw_pts = [ + cast(Point, p[-2:] if p[0] != "h" else path[0][-2:]) for p in path + ] + pts = [apply_matrix_pt(self.ctm, pt) for pt in raw_pts] + + operators = [str(operation[0]) for operation in path] + transformed_points = [ + [ + apply_matrix_pt(self.ctm, (float(operand1), float(operand2))) + for operand1, operand2 in zip(operation[1::2], operation[2::2]) + ] + for operation in path + ] + transformed_path = [ + cast(PathSegment, (o, *p)) + for o, p in zip(operators, transformed_points) + ] + + if shape in {"mlh", "ml"}: + # single line segment + # + # Note: 'ml', in conditional above, is a frequent anomaly + # that we want to support. 
+ line = LTLine( + gstate.linewidth, + pts[0], + pts[1], + stroke, + fill, + evenodd, + gstate.scolor, + gstate.ncolor, + original_path=transformed_path, + dashing_style=gstate.dash, + ncs=ncs, + scs=scs, + ) + yield line + + elif shape in {"mlllh", "mllll"}: + (x0, y0), (x1, y1), (x2, y2), (x3, y3), _ = pts + + is_closed_loop = pts[0] == pts[4] + has_square_coordinates = ( + x0 == x1 and y1 == y2 and x2 == x3 and y3 == y0 + ) or (y0 == y1 and x1 == x2 and y2 == y3 and x3 == x0) + if is_closed_loop and has_square_coordinates: + rect = LTRect( + gstate.linewidth, + (*pts[0], *pts[2]), + stroke, + fill, + evenodd, + gstate.scolor, + gstate.ncolor, + transformed_path, + gstate.dash, + ncs, + scs, + ) + yield rect + else: + curve = LTCurve( + gstate.linewidth, + pts, + stroke, + fill, + evenodd, + gstate.scolor, + gstate.ncolor, + transformed_path, + gstate.dash, + ncs, + scs, + ) + yield curve + else: + curve = LTCurve( + gstate.linewidth, + pts, + stroke, + fill, + evenodd, + gstate.scolor, + gstate.ncolor, + transformed_path, + gstate.dash, + ncs, + scs, + ) + yield curve + + def render_char( + self, + matrix: Matrix, + font: PDFFont, + fontsize: float, + scaling: float, + rise: float, + cid: int, + ncs: PDFColorSpace, + graphicstate: PDFGraphicState, + scs: Optional[PDFColorSpace] = None, + ) -> LTChar: + try: + text = font.to_unichr(cid) + assert isinstance(text, str), str(type(text)) + except PDFUnicodeNotDefined: + text = self.handle_undefined_char(font, cid) + textwidth = font.char_width(cid) + textdisp = font.char_disp(cid) + item = LTChar( + matrix, + font, + fontsize, + scaling, + rise, + text, + textwidth, + textdisp, + ncs, + graphicstate, + scs, + graphicstate.scolor, + graphicstate.ncolor, + ) + return item + + def render_string( + self, + textstate: "PDFTextState", + seq: PDFTextSeq, + ncs: PDFColorSpace, + graphicstate: "PDFGraphicState", + scs: Optional[PDFColorSpace] = None, + ) -> Iterator[LTComponent]: + assert self.ctm is not None + matrix = 
mult_matrix(textstate.matrix, self.ctm) + font = textstate.font + fontsize = textstate.fontsize + scaling = textstate.scaling * 0.01 + charspace = textstate.charspace * scaling + wordspace = textstate.wordspace * scaling + rise = textstate.rise + assert font is not None + if font.is_multibyte(): + wordspace = 0 + dxscale = 0.001 * fontsize * scaling + if font.is_vertical(): + textstate.linematrix, chars = self.render_string_vertical( + seq, + matrix, + textstate.linematrix, + font, + fontsize, + scaling, + charspace, + wordspace, + rise, + dxscale, + ncs, + graphicstate, + scs, + ) + else: + textstate.linematrix, chars = self.render_string_horizontal( + seq, + matrix, + textstate.linematrix, + font, + fontsize, + scaling, + charspace, + wordspace, + rise, + dxscale, + ncs, + graphicstate, + scs, + ) + yield from chars + + def render_string_horizontal( + self, + seq: PDFTextSeq, + matrix: Matrix, + pos: Point, + font: PDFFont, + fontsize: float, + scaling: float, + charspace: float, + wordspace: float, + rise: float, + dxscale: float, + ncs: PDFColorSpace, + graphicstate: "PDFGraphicState", + scs: Optional[PDFColorSpace] = None, + ) -> Tuple[Point, List[LTChar]]: + (x, y) = pos + needcharspace = False + chars = [] + for obj in seq: + if isinstance(obj, (int, float)): + x -= obj * dxscale + needcharspace = True + else: + if isinstance(obj, str): + obj = make_compat_bytes(obj) + if not isinstance(obj, bytes): + continue + for cid in font.decode(obj): + if needcharspace: + x += charspace + item = self.render_char( + translate_matrix(matrix, (x, y)), + font, + fontsize, + scaling, + rise, + cid, + ncs, + graphicstate, + scs, + ) + x += item.adv + chars.append(item) + if cid == 32 and wordspace: + x += wordspace + needcharspace = True + return ((x, y), chars) + + def render_string_vertical( + self, + seq: PDFTextSeq, + matrix: Matrix, + pos: Point, + font: PDFFont, + fontsize: float, + scaling: float, + charspace: float, + wordspace: float, + rise: float, + dxscale: 
float, + ncs: PDFColorSpace, + graphicstate: "PDFGraphicState", + scs: Optional[PDFColorSpace] = None, + ) -> Tuple[Point, List[LTChar]]: + (x, y) = pos + needcharspace = False + chars = [] + for obj in seq: + if isinstance(obj, (int, float)): + y -= obj * dxscale + needcharspace = True + else: + if isinstance(obj, str): + obj = make_compat_bytes(obj) + if not isinstance(obj, bytes): + continue + for cid in font.decode(obj): + if needcharspace: + y += charspace + item = self.render_char( + translate_matrix(matrix, (x, y)), + font, + fontsize, + scaling, + rise, + cid, + ncs, + graphicstate, + scs, + ) + chars.append(item) + y += item.adv + if cid == 32 and wordspace: + y += wordspace + needcharspace = True + return ((x, y), chars) + + def handle_undefined_char(self, font: PDFFont, cid: int) -> str: + log.debug("undefined: %r, %r", font, cid) + return "(cid:%d)" % cid diff --git a/playa/parser.py b/playa/parser.py new file mode 100644 index 00000000..f13e5da4 --- /dev/null +++ b/playa/parser.py @@ -0,0 +1,621 @@ +import logging +import mmap +import re +import weakref +from binascii import unhexlify +from collections import deque +from typing import ( + TYPE_CHECKING, + Deque, + Dict, + Generic, + Iterator, + List, + Optional, + Tuple, + TypeVar, + Union, +) + +from playa import settings +from playa.casting import safe_int +from playa.exceptions import PDFSyntaxError, PSException, PSSyntaxError, PSTypeError +from playa.pdftypes import ( + KWD, + LIT, + ContentStream, + ObjRef, + PSKeyword, + PSLiteral, + dict_value, + int_value, + literal_name, + name_str, +) +from playa.utils import choplist + +log = logging.getLogger(__name__) +if TYPE_CHECKING: + from playa.document import PDFDocument + +# Intern a bunch of important keywords +KEYWORD_PROC_BEGIN = KWD(b"{") +KEYWORD_PROC_END = KWD(b"}") +KEYWORD_ARRAY_BEGIN = KWD(b"[") +KEYWORD_ARRAY_END = KWD(b"]") +KEYWORD_DICT_BEGIN = KWD(b"<<") +KEYWORD_DICT_END = KWD(b">>") +KEYWORD_GT = KWD(b">") +KEYWORD_R = KWD(b"R") 
+KEYWORD_NULL = KWD(b"null")
+KEYWORD_ENDOBJ = KWD(b"endobj")
+KEYWORD_STREAM = KWD(b"stream")
+KEYWORD_XREF = KWD(b"xref")
+KEYWORD_STARTXREF = KWD(b"startxref")
+KEYWORD_OBJ = KWD(b"obj")
+KEYWORD_TRAILER = KWD(b"trailer")
+
+
+EOL = b"\r\n"
+WHITESPACE = b" \t\n\r\f\v"
+NUMBER = b"0123456789"
+HEX = NUMBER + b"abcdef" + b"ABCDEF"
+NOTLITERAL = b"#/%[]()<>{}" + WHITESPACE
+NOTKEYWORD = b"#/%[]()<>{}" + WHITESPACE
+NOTSTRING = b"()\\"
+OCTAL = b"01234567"
+ESC_STRING = {
+    b"b": 8,
+    b"t": 9,
+    b"n": 10,
+    b"f": 12,
+    b"r": 13,
+    b"(": 40,
+    b")": 41,
+    b"\\": 92,
+}
+
+
+PSBaseParserToken = Union[float, bool, PSLiteral, PSKeyword, bytes]
+LEXER = re.compile(
+    rb"""(?:
+      (?P<whitespace> \s+)
+    | (?P<comment> %[^\r\n]*[\r\n])
+    | (?P<name> /(?: \#[A-Fa-f\d][A-Fa-f\d] | [^#/%\[\]()<>{}\s])+ )
+    | (?P<number> [-+]? (?: \d*\.\d+ | \d+ ) )
+    | (?P<keyword> [A-Za-z] [^#/%\[\]()<>{}\s]*)
+    | (?P<startstr> \([^()\\]*)
+    | (?P<hexstr> <[A-Fa-f\d\s]*>)
+    | (?P<startdict> <<)
+    | (?P<enddict> >>)
+    | (?P<other> .)
+)
+""",
+    re.VERBOSE,
+)
+STRLEXER = re.compile(
+    rb"""(?:
+      (?P<octal> \\[0-7]{1,3})
+    | (?P<linebreak> \\(?:\r\n?|\n))
+    | (?P<escape> \\.)
+    | (?P<parenleft> \()
+    | (?P<parenright> \))
+    | (?P<newline> \r\n?|\n)
+    | (?P<other> .)
+)""", + re.VERBOSE, +) +HEXDIGIT = re.compile(rb"#([A-Fa-f\d][A-Fa-f\d])") +EOLR = re.compile(rb"\r\n?|\n") +SPC = re.compile(rb"\s") + + +class Lexer: + """Lexer for PDF data.""" + + def __init__(self, data: Union[bytes, mmap.mmap]) -> None: + self.data = data + self.pos = 0 + self.end = len(data) + self._tokens: Deque[Tuple[int, PSBaseParserToken]] = deque() + + def seek(self, pos: int) -> None: + """Seek to a position and reinitialize parser state.""" + self.pos = pos + self._curtoken = b"" + self._curtokenpos = 0 + self._tokens.clear() + + def tell(self) -> int: + """Get the current position in the buffer.""" + return self.pos + + def read(self, objlen: int) -> bytes: + """Read data from current position, advancing to the end of + this data.""" + pos = self.pos + self.pos = min(pos + objlen, len(self.data)) + return self.data[pos : self.pos] + + def iter_lines(self) -> Iterator[Tuple[int, bytes]]: + r"""Iterate over lines that end either with \r, \n, or \r\n, + starting at the current position.""" + while self.pos < self.end: + linepos = self.pos + m = EOLR.search(self.data, self.pos) + if m is None: + self.pos = self.end + else: + self.pos = m.end() + yield (linepos, self.data[linepos : self.pos]) + + def reverse_iter_lines(self) -> Iterator[bytes]: + """Iterate backwards over lines starting at the current position. + + This is used to locate the trailers at the end of a file. + """ + endline = self.pos + while True: + nidx = self.data.rfind(b"\n", 0, self.pos) + ridx = self.data.rfind(b"\r", 0, self.pos) + best = max(nidx, ridx) + if best == -1: + yield self.data[:endline] + break + yield self.data[best + 1 : endline] + endline = best + 1 + self.pos = best + if self.pos > 0 and self.data[self.pos - 1 : self.pos + 1] == b"\r\n": + self.pos -= 1 + + def get_inline_data( + self, target: bytes = b"EI", blocksize: int = -1 + ) -> Tuple[int, bytes]: + """Get the data for an inline image up to the target + end-of-stream marker. 
+ + Returns a tuple of the position of the target in the data and the + data *including* the end of stream marker. Advances the file + pointer to a position after the end of the stream. + + The caller is responsible for removing the end-of-stream if + necessary (this depends on the filter being used) and parsing + the end-of-stream token (likewise) if necessary. + """ + tpos = self.data.find(target, self.pos) + if tpos != -1: + nextpos = tpos + len(target) + result = (tpos, self.data[self.pos : nextpos]) + self.pos = nextpos + return result + return (-1, b"") + + def __iter__(self) -> Iterator[Tuple[int, PSBaseParserToken]]: + """Iterate over tokens.""" + return self + + def __next__(self) -> Tuple[int, PSBaseParserToken]: + """Get the next token in iteration, raising StopIteration when + done.""" + while True: + m = LEXER.match(self.data, self.pos) + if m is None: # can only happen at EOS + raise StopIteration + self._curtokenpos = m.start() + self.pos = m.end() + if m.lastgroup not in ("whitespace", "comment"): # type: ignore + # Okay, we got a token or something + break + self._curtoken = m[0] + if m.lastgroup == "name": # type: ignore + self._curtoken = m[0][1:] + self._curtoken = HEXDIGIT.sub( + lambda x: bytes((int(x[1], 16),)), self._curtoken + ) + tok = LIT(name_str(self._curtoken)) + return (self._curtokenpos, tok) + if m.lastgroup == "number": # type: ignore + if b"." 
in self._curtoken: + return (self._curtokenpos, float(self._curtoken)) + else: + return (self._curtokenpos, int(self._curtoken)) + if m.lastgroup == "startdict": # type: ignore + return (self._curtokenpos, KEYWORD_DICT_BEGIN) + if m.lastgroup == "enddict": # type: ignore + return (self._curtokenpos, KEYWORD_DICT_END) + if m.lastgroup == "startstr": # type: ignore + return self._parse_endstr(self.data[m.start() + 1 : m.end()], m.end()) + if m.lastgroup == "hexstr": # type: ignore + self._curtoken = SPC.sub(b"", self._curtoken[1:-1]) + if len(self._curtoken) % 2 == 1: + self._curtoken += b"0" + return (self._curtokenpos, unhexlify(self._curtoken)) + # Anything else is treated as a keyword (whether explicitly matched or not) + if self._curtoken == b"true": + return (self._curtokenpos, True) + elif self._curtoken == b"false": + return (self._curtokenpos, False) + else: + return (self._curtokenpos, KWD(self._curtoken)) + + def _parse_endstr(self, start: bytes, pos: int) -> Tuple[int, PSBaseParserToken]: + """Parse the remainder of a string.""" + # Handle nonsense CRLF conversion in strings (PDF 1.7, p.15) + parts = [EOLR.sub(b"\n", start)] + paren = 1 + for m in STRLEXER.finditer(self.data, pos): + self.pos = m.end() + if m.lastgroup == "parenright": # type: ignore + paren -= 1 + if paren == 0: + # By far the most common situation! + break + parts.append(m[0]) + elif m.lastgroup == "parenleft": # type: ignore + parts.append(m[0]) + paren += 1 + elif m.lastgroup == "escape": # type: ignore + chr = m[0][1:2] + if chr not in ESC_STRING: + log.warning("Unrecognized escape %r", m[0]) + parts.append(chr) + else: + parts.append(bytes((ESC_STRING[chr],))) + elif m.lastgroup == "octal": # type: ignore + chrcode = int(m[0][1:], 8) + if chrcode >= 256: + # PDF1.7 p.16: "high-order overflow shall be + # ignored." 
+ log.warning("Invalid octal %r (%d)", m[0][1:], chrcode) + else: + parts.append(bytes((chrcode,))) + elif m.lastgroup == "newline": # type: ignore + # Handle nonsense CRLF conversion in strings (PDF 1.7, p.15) + parts.append(b"\n") + elif m.lastgroup == "linebreak": # type: ignore + pass + else: + parts.append(m[0]) + if paren != 0: + log.warning("Unterminated string at %d", pos) + raise StopIteration + return (self._curtokenpos, b"".join(parts)) + + +# Stack slots may by occupied by any of: +# * the name of a literal +# * the PSBaseParserToken types +# * list (via KEYWORD_ARRAY) +# * dict (via KEYWORD_DICT) +# * subclass-specific extensions (e.g. PDFStream, PDFObjRef) via ExtraT +ExtraT = TypeVar("ExtraT") +PSStackType = Union[str, float, bool, PSLiteral, bytes, List, Dict, ExtraT] +PSStackEntry = Tuple[int, PSStackType[ExtraT]] +PDFStackT = PSStackType[ContentStream] # FIXME: Not entirely correct here + + +class Parser(Generic[ExtraT]): + """Basic parser for PDF objects in a bytes-like object.""" + + def __init__(self, data: Union[bytes, mmap.mmap]) -> None: + self.reinit(data) + + def reinit(self, data: Union[bytes, mmap.mmap]) -> None: + """Reinitialize with new data (FIXME: Should go away, use a + new parser for each stream as it's clearer and safer)""" + self._lexer = Lexer(data) + self.reset() + + def reset(self) -> None: + """Reset parser state.""" + self.context: List[Tuple[int, Optional[str], List[PSStackEntry[ExtraT]]]] = [] + self.curtype: Optional[str] = None + self.curstack: List[PSStackEntry[ExtraT]] = [] + self.results: List[PSStackEntry[ExtraT]] = [] + + def push(self, *objs: PSStackEntry[ExtraT]) -> None: + """Push some objects onto the stack.""" + self.curstack.extend(objs) + + def pop(self, n: int) -> List[PSStackEntry[ExtraT]]: + """Pop some objects off the stack.""" + objs = self.curstack[-n:] + self.curstack[-n:] = [] + return objs + + def popall(self) -> List[PSStackEntry[ExtraT]]: + """Pop all the things off the stack.""" + objs = 
self.curstack + self.curstack = [] + return objs + + def add_results(self, *objs: PSStackEntry[ExtraT]) -> None: + """Move some objects to the output.""" + try: + log.debug("add_results: %r", objs) + except Exception: + log.debug("add_results: (unprintable object)") + self.results.extend(objs) + + def start_type(self, pos: int, type: str) -> None: + """Start a composite object (array, dict, etc).""" + self.context.append((pos, self.curtype, self.curstack)) + (self.curtype, self.curstack) = (type, []) + log.debug("start_type: pos=%r, type=%r", pos, type) + + def end_type(self, type: str) -> Tuple[int, List[PSStackType[ExtraT]]]: + """End a composite object (array, dict, etc).""" + if self.curtype != type: + raise PSTypeError(f"Type mismatch: {self.curtype!r} != {type!r}") + objs = [obj for (_, obj) in self.curstack] + (pos, self.curtype, self.curstack) = self.context.pop() + log.debug("end_type: pos=%r, type=%r, objs=%r", pos, type, objs) + return (pos, objs) + + def do_keyword(self, pos: int, token: PSKeyword) -> None: + """Handle a PDF keyword.""" + pass + + def flush(self) -> None: + """Add objects from stack to output (or, actually, not).""" + return + + def __next__(self) -> PSStackEntry[ExtraT]: + """Return the next object, raising StopIteration at EOF. + + Arrays and dictionaries are represented as Python lists and + dictionaries. 
+ """ + while not self.results: + (pos, token) = self.nexttoken() + if isinstance(token, (int, float, bool, str, bytes, PSLiteral)): + # normal token + self.push((pos, token)) + elif token == KEYWORD_ARRAY_BEGIN: + # begin array + self.start_type(pos, "a") + elif token == KEYWORD_ARRAY_END: + # end array + try: + self.push(self.end_type("a")) + except PSTypeError: + if settings.STRICT: + raise + elif token == KEYWORD_DICT_BEGIN: + # begin dictionary + self.start_type(pos, "d") + elif token == KEYWORD_DICT_END: + # end dictionary + try: + (pos, objs) = self.end_type("d") + if len(objs) % 2 != 0: + error_msg = "Invalid dictionary construct: %r" % objs + raise PSSyntaxError(error_msg) + d = { + literal_name(k): v + for (k, v) in choplist(2, objs) + if v is not None + } + self.push((pos, d)) + except PSTypeError: + if settings.STRICT: + raise + elif token == KEYWORD_PROC_BEGIN: + # begin proc + self.start_type(pos, "p") + elif token == KEYWORD_PROC_END: + # end proc + try: + self.push(self.end_type("p")) + except PSTypeError: + if settings.STRICT: + raise + elif isinstance(token, PSKeyword): + log.debug( + "do_keyword: pos=%r, token=%r, stack=%r", + pos, + token, + self.curstack, + ) + self.do_keyword(pos, token) + else: + log.error( + "unknown token: pos=%r, token=%r, stack=%r", + pos, + token, + self.curstack, + ) + self.do_keyword(pos, token) + raise PSException + if self.context: + continue + else: + self.flush() + pos, obj = self.results.pop(0) + try: + log.debug("__next__: object at %d: %r", pos, obj) + except Exception: + log.debug("__next__: (unprintable object) at %d", pos) + return pos, obj + + def __iter__(self) -> Iterator[PSStackEntry[ExtraT]]: + """Iterate over (position, object) tuples, raising StopIteration at EOF.""" + return self + + @property + def tokens(self) -> Iterator[Tuple[int, PSBaseParserToken]]: + """Iterate over (position, token) tuples, raising StopIteration at EOF.""" + return self._lexer + + # Delegation follows + def seek(self, pos: 
int) -> None: + """Seek to a position and reset parser state.""" + self._lexer.seek(pos) + self.reset() + + def tell(self) -> int: + """Get the current position in the file.""" + return self._lexer.tell() + + @property + def end(self) -> int: + """End (or size) of file, for use with seek().""" + return self._lexer.end + + def iter_lines(self) -> Iterator[Tuple[int, bytes]]: + r"""Iterate over lines that end either with \r, \n, or \r\n.""" + return self._lexer.iter_lines() + + def reverse_iter_lines(self) -> Iterator[bytes]: + """Iterate over lines starting at the end of the file + + This is used to locate the trailers at the end of a file. + """ + return self._lexer.reverse_iter_lines() + + def read(self, objlen: int) -> bytes: + """Read data from a specified position, moving the current + position to the end of this data.""" + return self._lexer.read(objlen) + + def get_inline_data(self, target: bytes = b"EI") -> Tuple[int, bytes]: + """Get the data for an inline image up to the target + end-of-stream marker.""" + return self._lexer.get_inline_data(target) + + def nexttoken(self) -> Tuple[int, PSBaseParserToken]: + """Get the next token in iteration, raising StopIteration when + done.""" + return next(self._lexer) + + +class PDFParser(Parser[Union[PSKeyword, ContentStream, ObjRef, None]]): + """PDFParser fetches PDF objects from a file stream. + It holds a weak reference to the document in order to + resolve indirect references. If the document is deleted + then this will obviously no longer work. + + Typical usage: + parser = PDFParser(fp, doc) + parser.seek(offset) + for object in parser: + ... 
+ + """ + + def __init__(self, data: Union[bytes, mmap.mmap], doc: "PDFDocument") -> None: + super().__init__(data) + self.doc = weakref.ref(doc) + self.fallback = False + + def do_keyword(self, pos: int, token: PSKeyword) -> None: + """Handles PDF-related keywords.""" + if token in (KEYWORD_XREF, KEYWORD_STARTXREF): + self.add_results(*self.pop(1)) + + elif token is KEYWORD_ENDOBJ: + # objid genno "obj" ... and the object itself + self.add_results(*self.pop(4)) + + elif token is KEYWORD_NULL: + # null object + self.push((pos, None)) + + elif token is KEYWORD_R: + # reference to indirect object + if len(self.curstack) >= 2: + (_, _object_id), _ = self.pop(2) + object_id = safe_int(_object_id) + if object_id is not None: + obj = ObjRef(self.doc, object_id) + self.push((pos, obj)) + + elif token is KEYWORD_STREAM: + # stream dictionary, which precedes "stream" + ((_, dic),) = self.pop(1) + dic = dict_value(dic) + objlen = 0 + if not self.fallback: + try: + objlen = int_value(dic["Length"]) + except KeyError: + if settings.STRICT: + raise PDFSyntaxError("/Length is undefined: %r" % dic) + # back up and read the entire line including 'stream' as + # the data starts after the trailing newline + self.seek(pos) + try: + _, line = next(self.iter_lines()) # 'stream\n' + except StopIteration: + if settings.STRICT: + raise PDFSyntaxError("Unexpected EOF") + return + pos = self.tell() + data = self.read(objlen) + # FIXME: This is ... not really the right way to do this. 
+ for linepos, line in self.iter_lines(): + if b"endstream" in line: + i = line.index(b"endstream") + objlen += i + if self.fallback: + data += line[:i] + break + objlen += len(line) + if self.fallback: + data += line + self.seek(pos + objlen) + # XXX limit objlen not to exceed object boundary + log.debug( + "ContentStream: pos=%d, objlen=%d, dic=%r, data=%r...", + pos, + objlen, + dic, + data[:10], + ) + doc = self.doc() + if doc is None: + raise RuntimeError("Document no longer exists!") + stream = ContentStream(dic, bytes(data), doc.decipher) + self.push((pos, stream)) + + else: + # others + self.push((pos, token)) + + +class ContentStreamParser(PDFParser): + """StreamParser is used to parse PDF content streams and object + streams. These have slightly different rules for how objects are + described than the top-level PDF file contents. + """ + + def __init__(self, data: bytes, doc: "PDFDocument") -> None: + super().__init__(data, doc) + + def flush(self) -> None: + self.add_results(*self.popall()) + + def do_keyword(self, pos: int, token: PSKeyword) -> None: + if token is KEYWORD_R: + # reference to indirect object + try: + (_, _object_id), _ = self.pop(2) + except ValueError: + raise PDFSyntaxError( + "Expected generation and object id in indirect object reference" + ) + object_id = safe_int(_object_id) + if object_id is not None: + obj = ObjRef(self.doc, object_id) + self.push((pos, obj)) + return + + elif token in (KEYWORD_OBJ, KEYWORD_ENDOBJ): + if settings.STRICT: + # See PDF Spec 3.4.6: Only the object values are stored in the + # stream; the obj and endobj keywords are not used. 
+ raise PDFSyntaxError("Keyword endobj found in stream") + return + + # others + self.push((pos, token)) diff --git a/playa/pdfparser.py b/playa/pdfparser.py deleted file mode 100644 index e02b486b..00000000 --- a/playa/pdfparser.py +++ /dev/null @@ -1,164 +0,0 @@ -import logging -import weakref -from typing import TYPE_CHECKING, BinaryIO, Union - -from playa import settings -from playa.casting import safe_int -from playa.exceptions import PSEOF, PDFSyntaxError -from playa.pdftypes import PDFObjRef, PDFStream, dict_value, int_value -from playa.psparser import KWD, PSKeyword, PSStackParser - -if TYPE_CHECKING: - from playa.pdfdocument import PDFDocument - -log = logging.getLogger(__name__) - -# Important keywords -KEYWORD_R = KWD(b"R") -KEYWORD_NULL = KWD(b"null") -KEYWORD_ENDOBJ = KWD(b"endobj") -KEYWORD_STREAM = KWD(b"stream") -KEYWORD_XREF = KWD(b"xref") -KEYWORD_STARTXREF = KWD(b"startxref") -KEYWORD_OBJ = KWD(b"obj") - - -# PDFParser stack holds all the base types plus PDFStream, PDFObjRef, and None -class PDFParser(PSStackParser[Union[PSKeyword, PDFStream, PDFObjRef, None]]): - """PDFParser fetch PDF objects from a file stream. - It can handle indirect references by referring to - a PDF document set by set_document method. - It also reads XRefs at the end of every PDF file. 
- - Typical usage: - parser = PDFParser(fp) - parser.read_xref() - parser.read_xref(fallback=True) # optional - parser.set_document(doc) - parser.seek(offset) - parser.nextobject() - - """ - - def __init__(self, data: Union[BinaryIO, bytes], doc: "PDFDocument") -> None: - super().__init__(data) - self.doc = weakref.ref(doc) - self.fallback = False - - def do_keyword(self, pos: int, token: PSKeyword) -> None: - """Handles PDF-related keywords.""" - if token in (KEYWORD_XREF, KEYWORD_STARTXREF): - self.add_results(*self.pop(1)) - - elif token is KEYWORD_ENDOBJ: - self.add_results(*self.pop(4)) - - elif token is KEYWORD_NULL: - # null object - self.push((pos, None)) - - elif token is KEYWORD_R: - # reference to indirect object - if len(self.curstack) >= 2: - (_, _object_id), _ = self.pop(2) - object_id = safe_int(_object_id) - if object_id is not None: - obj = PDFObjRef(self.doc, object_id) - self.push((pos, obj)) - - elif token is KEYWORD_STREAM: - # stream object - ((_, dic),) = self.pop(1) - dic = dict_value(dic) - objlen = 0 - if not self.fallback: - try: - objlen = int_value(dic["Length"]) - except KeyError: - if settings.STRICT: - raise PDFSyntaxError("/Length is undefined: %r" % dic) - # back up and read the entire line including 'stream' as - # the data starts after the trailing newline - self.seek(pos) - try: - (_, line) = self.nextline() # 'stream\n' - except PSEOF: - if settings.STRICT: - raise PDFSyntaxError("Unexpected EOF") - return - pos = self.tell() - data = self.read(objlen) - while True: - try: - (linepos, line) = self.nextline() - except PSEOF: - if settings.STRICT: - raise PDFSyntaxError("Unexpected EOF") - break - if b"endstream" in line: - i = line.index(b"endstream") - objlen += i - if self.fallback: - data += line[:i] - break - objlen += len(line) - if self.fallback: - data += line - self.seek(pos + objlen) - # XXX limit objlen not to exceed object boundary - log.debug( - "Stream: pos=%d, objlen=%d, dic=%r, data=%r...", - pos, - objlen, - dic, 
- data[:10], - ) - doc = self.doc() - if doc is None: - raise RuntimeError("Document no longer exists!") - stream = PDFStream(dic, bytes(data), doc.decipher) - self.push((pos, stream)) - - else: - # others - self.push((pos, token)) - - -class PDFStreamParser(PDFParser): - """PDFStreamParser is used to parse PDF content streams - that is contained in each page and has instructions - for rendering the page. A reference to a PDF document is - needed because a PDF content stream can also have - indirect references to other objects in the same document. - """ - - def __init__(self, data: bytes, doc: "PDFDocument") -> None: - super().__init__(data, doc) - - def flush(self) -> None: - self.add_results(*self.popall()) - - def do_keyword(self, pos: int, token: PSKeyword) -> None: - if token is KEYWORD_R: - # reference to indirect object - try: - (_, _object_id), _ = self.pop(2) - except ValueError: - raise PDFSyntaxError( - "Expected generation and object id in indirect object reference" - ) - object_id = safe_int(_object_id) - if object_id is not None: - obj = PDFObjRef(self.doc, object_id) - self.push((pos, obj)) - return - - elif token in (KEYWORD_OBJ, KEYWORD_ENDOBJ): - if settings.STRICT: - # See PDF Spec 3.4.6: Only the object values are stored in the - # stream; the obj and endobj keywords are not used. 
- raise PDFSyntaxError("Keyword endobj found in stream") - return - - # others - self.push((pos, token)) diff --git a/playa/pdftypes.py b/playa/pdftypes.py index 8c4b71f7..f52d9b42 100644 --- a/playa/pdftypes.py +++ b/playa/pdftypes.py @@ -6,11 +6,14 @@ TYPE_CHECKING, Any, Dict, + Generic, Iterable, List, Optional, Protocol, Tuple, + Type, + TypeVar, Union, cast, ) @@ -23,19 +26,85 @@ PDFNotImplementedError, PDFTypeError, PDFValueError, + PSTypeError, ) from playa.lzw import lzwdecode -from playa.psparser import LIT from playa.runlength import rldecode from playa.utils import apply_png_predictor if TYPE_CHECKING: - from playa.pdfdocument import PDFDocument + from playa.document import PDFDocument logger = logging.getLogger(__name__) -LITERAL_CRYPT = LIT("Crypt") +class PSLiteral: + """A class that represents a PostScript literal. + + Postscript literals are used as identifiers, such as + variable names, property names and dictionary keys. + Literals are case sensitive and denoted by a preceding + slash sign (e.g. "/Name") + + Note: Do not create an instance of PSLiteral directly. + Always use PSLiteralTable.intern(). + """ + + def __init__(self, name: str) -> None: + self.name = name + + def __repr__(self) -> str: + return "/%r" % self.name + + +class PSKeyword: + """A class that represents a PostScript keyword. + + PostScript keywords are a dozen of predefined words. + Commands and directives in PostScript are expressed by keywords. + They are also used to denote the content boundaries. + + Note: Do not create an instance of PSKeyword directly. + Always use PSKeywordTable.intern(). 
+ """ + + def __init__(self, name: bytes) -> None: + self.name = name + + def __repr__(self) -> str: + return "/%r" % self.name + + +_SymbolT = TypeVar("_SymbolT", PSLiteral, PSKeyword) +_NameT = TypeVar("_NameT", str, bytes) + + +class PSSymbolTable(Generic[_SymbolT, _NameT]): + """Store globally unique name objects or language keywords.""" + + def __init__(self, table_type: Type[_SymbolT], name_type: Type[_NameT]) -> None: + self.dict: Dict[_NameT, _SymbolT] = {} + self.table_type: Type[_SymbolT] = table_type + self.name_type: Type[_NameT] = name_type + + def intern(self, name: _NameT) -> _SymbolT: + if not isinstance(name, self.name_type): + raise ValueError(f"{self.table_type} can only store {self.name_type}") + if name in self.dict: + lit = self.dict[name] + else: + lit = self.table_type(name) # type: ignore + self.dict[name] = lit + return lit + + +PSLiteralTable = PSSymbolTable(PSLiteral, str) +PSKeywordTable = PSSymbolTable(PSKeyword, bytes) +LIT = PSLiteralTable.intern +KWD = PSKeywordTable.intern + +# Intern a bunch of important literals +LITERAL_CRYPT = LIT("Crypt") # Abbreviation of Filter names in PDF 4.8.6. "Inline Images" LITERALS_FLATE_DECODE = (LIT("FlateDecode"), LIT("Fl")) LITERALS_LZW_DECODE = (LIT("LZWDecode"), LIT("LZW")) @@ -48,6 +117,51 @@ LITERALS_JPX_DECODE = (LIT("JPXDecode"),) +def name_str(x: bytes) -> str: + """Get the string representation for a name object. + + According to the PDF 1.7 spec (p.18): + + > Ordinarily, the bytes making up the name are never treated as + > text to be presented to a human user or to an application + > external to a conforming reader. However, occasionally the need + > arises to treat a name object as text... In such situations, the + > sequence of bytes (after expansion of NUMBER SIGN sequences, if + > any) should be interpreted according to UTF-8. 
+ + Accordingly, if they *can* be decoded to UTF-8, then they *will* + be, and if not, we will just decode them as ISO-8859-1 since that + gives a unique (if possibly nonsensical) value for an 8-bit string. + """ + try: + return x.decode("utf-8") + except UnicodeDecodeError: + return x.decode("iso-8859-1") + + +def literal_name(x: Any) -> str: + if not isinstance(x, PSLiteral): + if settings.STRICT: + raise PSTypeError(f"Literal required: {x!r}") + return str(x) + else: + return x.name + + +def keyword_name(x: Any) -> str: + if not isinstance(x, PSKeyword): + if settings.STRICT: + raise PSTypeError("Keyword required: %r" % x) + else: + return str(x) + else: + # PDF keywords are *not* UTF-8 (they aren't ISO-8859-1 either, + # but this isn't very important, we just want some + # unique representation of 8-bit characters, as above) + name = x.name.decode("iso-8859-1") + return name + + class DecipherCallable(Protocol): """Fully typed a decipher callback, with optional parameter.""" @@ -64,7 +178,7 @@ def __call__( _DEFAULT = object() -class PDFObjRef: +class ObjRef: def __init__( self, doc: weakref.ReferenceType["PDFDocument"], @@ -83,7 +197,7 @@ def __init__( self.objid = objid def __repr__(self) -> str: - return "" % (self.objid) + return "" % (self.objid) def resolve(self, default: object = None) -> Any: doc = self.doc() @@ -101,7 +215,7 @@ def resolve1(x: object, default: object = None) -> Any: If this is an array or dictionary, it may still contains some indirect objects inside. """ - while isinstance(x, PDFObjRef): + while isinstance(x, ObjRef): x = x.resolve(default=default) return x @@ -112,7 +226,7 @@ def resolve_all(x: object, default: object = None) -> Any: Make sure there is no indirect reference within the nested object. This procedure might be slow. 
""" - while isinstance(x, PDFObjRef): + while isinstance(x, ObjRef): x = x.resolve(default=default) if isinstance(x, list): x = [resolve_all(v, default=default) for v in x] @@ -140,7 +254,7 @@ def int_value(x: object) -> int: x = resolve1(x) if not isinstance(x, int): if settings.STRICT: - raise PDFTypeError("Integer required: %r" % x) + raise PDFTypeError("Integer required: %r" % (x,)) return 0 return x @@ -149,7 +263,7 @@ def float_value(x: object) -> float: x = resolve1(x) if not isinstance(x, float): if settings.STRICT: - raise PDFTypeError("Float required: %r" % x) + raise PDFTypeError("Float required: %r" % (x,)) return 0.0 return x @@ -200,12 +314,12 @@ def dict_value(x: object) -> Dict[Any, Any]: return x -def stream_value(x: object) -> "PDFStream": +def stream_value(x: object) -> "ContentStream": x = resolve1(x) - if not isinstance(x, PDFStream): + if not isinstance(x, ContentStream): if settings.STRICT: - raise PDFTypeError("PDFStream required: %r" % x) - return PDFStream({}, b"") + raise PDFTypeError("ContentStream required: %r" % x) + return ContentStream({}, b"") return x @@ -230,7 +344,7 @@ def decompress_corrupted(data: bytes) -> bytes: return result_str -class PDFStream: +class ContentStream: def __init__( self, attrs: Dict[str, Any], @@ -252,14 +366,14 @@ def set_objid(self, objid: int, genno: int) -> None: def __repr__(self) -> str: if self.data is None: assert self.rawdata is not None - return "" % ( + return "" % ( self.objid, len(self.rawdata), self.attrs, ) else: assert self.data is not None - return "" % ( + return "" % ( self.objid, len(self.data), self.attrs, diff --git a/playa/psparser.py b/playa/psparser.py deleted file mode 100755 index c2700af7..00000000 --- a/playa/psparser.py +++ /dev/null @@ -1,1051 +0,0 @@ -#!/usr/bin/env python3 -import io -import logging -import mmap -import re -from binascii import unhexlify -from collections import deque -from typing import ( - Any, - BinaryIO, - Deque, - Dict, - Generic, - Iterator, - List, - 
Optional, - Tuple, - Type, - TypeVar, - Union, -) - -from playa import settings -from playa.exceptions import ( - PSEOF, - PSException, - PSSyntaxError, - PSTypeError, -) -from playa.utils import choplist - -log = logging.getLogger(__name__) - - -class PSLiteral: - """A class that represents a PostScript literal. - - Postscript literals are used as identifiers, such as - variable names, property names and dictionary keys. - Literals are case sensitive and denoted by a preceding - slash sign (e.g. "/Name") - - Note: Do not create an instance of PSLiteral directly. - Always use PSLiteralTable.intern(). - """ - - def __init__(self, name: str) -> None: - self.name = name - - def __repr__(self) -> str: - return "/%r" % self.name - - -class PSKeyword: - """A class that represents a PostScript keyword. - - PostScript keywords are a dozen of predefined words. - Commands and directives in PostScript are expressed by keywords. - They are also used to denote the content boundaries. - - Note: Do not create an instance of PSKeyword directly. - Always use PSKeywordTable.intern(). 
- """ - - def __init__(self, name: bytes) -> None: - self.name = name - - def __repr__(self) -> str: - return "/%r" % self.name - - -_SymbolT = TypeVar("_SymbolT", PSLiteral, PSKeyword) -_NameT = TypeVar("_NameT", str, bytes) - - -class PSSymbolTable(Generic[_SymbolT, _NameT]): - """Store globally unique name objects or language keywords.""" - - def __init__(self, table_type: Type[_SymbolT], name_type: Type[_NameT]) -> None: - self.dict: Dict[_NameT, _SymbolT] = {} - self.table_type: Type[_SymbolT] = table_type - self.name_type: Type[_NameT] = name_type - - def intern(self, name: _NameT) -> _SymbolT: - if not isinstance(name, self.name_type): - raise ValueError(f"{self.table_type} can only store {self.name_type}") - if name in self.dict: - lit = self.dict[name] - else: - lit = self.table_type(name) # type: ignore - self.dict[name] = lit - return lit - - -PSLiteralTable = PSSymbolTable(PSLiteral, str) -PSKeywordTable = PSSymbolTable(PSKeyword, bytes) -LIT = PSLiteralTable.intern -KWD = PSKeywordTable.intern -KEYWORD_PROC_BEGIN = KWD(b"{") -KEYWORD_PROC_END = KWD(b"}") -KEYWORD_ARRAY_BEGIN = KWD(b"[") -KEYWORD_ARRAY_END = KWD(b"]") -KEYWORD_DICT_BEGIN = KWD(b"<<") -KEYWORD_DICT_END = KWD(b">>") -KEYWORD_GT = KWD(b">") - - -def name_str(x: bytes) -> str: - """Get the string representation for a name object. - - According to the PDF 1.7 spec (p.18): - - > Ordinarily, the bytes making up the name are never treated as - > text to be presented to a human user or to an application - > external to a conforming reader. However, occasionally the need - > arises to treat a name object as text... In such situations, the - > sequence of bytes (after expansion of NUMBER SIGN sequences, if - > any) should be interpreted according to UTF-8. - - Accordingly, if they *can* be decoded to UTF-8, then they *will* - be, and if not, we will just decode them as ISO-8859-1 since that - gives a unique (if possibly nonsensical) value for an 8-bit string. 
- """ - try: - return x.decode("utf-8") - except UnicodeDecodeError: - return x.decode("iso-8859-1") - - -def literal_name(x: Any) -> str: - if not isinstance(x, PSLiteral): - if settings.STRICT: - raise PSTypeError(f"Literal required: {x!r}") - return str(x) - else: - return x.name - - -def keyword_name(x: Any) -> str: - if not isinstance(x, PSKeyword): - if settings.STRICT: - raise PSTypeError("Keyword required: %r" % x) - else: - return str(x) - else: - # PDF keywords are *not* UTF-8 (they aren't ISO-8859-1 either, - # but this isn't very important, we just want some - # unique representation of 8-bit characters, as above) - name = x.name.decode("iso-8859-1") - return name - - -EOL = b"\r\n" -WHITESPACE = b" \t\n\r\f\v" -NUMBER = b"0123456789" -HEX = NUMBER + b"abcdef" + b"ABCDEF" -NOTLITERAL = b"#/%[]()<>{}" + WHITESPACE -NOTKEYWORD = b"#/%[]()<>{}" + WHITESPACE -NOTSTRING = b"()\\" -OCTAL = b"01234567" -ESC_STRING = { - b"b": 8, - b"t": 9, - b"n": 10, - b"f": 12, - b"r": 13, - b"(": 40, - b")": 41, - b"\\": 92, -} - - -PSBaseParserToken = Union[float, bool, PSLiteral, PSKeyword, bytes] - - -class PSFileParser: - """ - Parser (actually a lexer) for PDF data from a buffered file object. 
- """ - - def __init__(self, fp: BinaryIO) -> None: - self.fp = fp - self._tokens: Deque[Tuple[int, PSBaseParserToken]] = deque() - self.seek(0) - - def reinit(self, fp: BinaryIO) -> None: - """Reinitialize parser with a new file.""" - self.fp = fp - self.seek(0) - - def seek(self, pos: int) -> None: - """Seek to a position and reinitialize parser state.""" - self.fp.seek(pos) - self._parse1 = self._parse_main - self._curtoken = b"" - self._curtokenpos = 0 - self._tokens.clear() - - def tell(self) -> int: - """Get the current position in the file.""" - return self.fp.tell() - - def read(self, objlen: int) -> bytes: - """Read data from a specified position, moving the current - position to the end of this data.""" - return self.fp.read(objlen) - - def nextline(self) -> Tuple[int, bytes]: - r"""Fetches a next line that ends either with \r, \n, or - \r\n.""" - linepos = self.fp.tell() - # readline() is implemented on BinarIO so just use that - # (except that it only accepts \n as a separator) - line_or_lines = self.fp.readline() - if line_or_lines == b"": - raise PSEOF - first, sep, rest = line_or_lines.partition(b"\r") - if len(rest) == 0: - return (linepos, line_or_lines) - elif rest != b"\n": - self.fp.seek(linepos + len(first) + 1) - return (linepos, first + sep) - else: - self.fp.seek(linepos + len(first) + 2) - return (linepos, first + b"\r\n") - - def revreadlines(self) -> Iterator[bytes]: - """Fetches a next line backwards. - - This is used to locate the trailers at the end of a file. - """ - self.fp.seek(0, io.SEEK_END) - pos = self.fp.tell() - buf = b"" - while pos > 0: - # NOTE: This can obviously be optimized to use regular - # expressions on the (known to exist) buffer in - # self.fp... 
- pos -= 1 - self.fp.seek(pos) - c = self.fp.read(1) - if c in b"\r\n": - yield buf - buf = c - if c == b"\n" and pos > 0: - self.fp.seek(pos - 1) - cc = self.fp.read(1) - if cc == b"\r": - pos -= 1 - buf = cc + buf - else: - buf = c + buf - yield buf - - def get_inline_data( - self, target: bytes = b"EI", blocksize: int = 4096 - ) -> Tuple[int, bytes]: - """Get the data for an inline image up to the target - end-of-stream marker. - - Returns a tuple of the position of the target in the data and the - data *including* the end of stream marker. Advances the file - pointer to a position after the end of the stream. - - The caller is responsible for removing the end-of-stream if - necessary (this depends on the filter being used) and parsing - the end-of-stream token (likewise) if necessary. - """ - # PDF 1.7, p. 216: The bytes between the ID and EI operators - # shall be treated the same as a stream object’s data (see - # 7.3.8, "Stream Objects"), even though they do not follow the - # standard stream syntax. - data = [] # list of blocks - partial = b"" # partially seen target - pos = 0 - while True: - # Did we see part of the target at the end of the last - # block? Then scan ahead and try to find the rest (we - # assume the stream is buffered) - if partial: - extra_len = len(target) - len(partial) - extra = self.fp.read(extra_len) - if partial + extra == target: - pos -= len(partial) - data.append(extra) - break - # Put it back (assume buffering!) - self.fp.seek(-extra_len, io.SEEK_CUR) - partial = b"" - # Fall through (the target could be at the beginning) - buf = self.fp.read(blocksize) - if not buf: - return (-1, b"") - tpos = buf.find(target) - if tpos != -1: - data.append(buf[: tpos + len(target)]) - # Put the extra back (assume buffering!) 
- self.fp.seek(tpos - len(buf) + len(target), io.SEEK_CUR) - pos += tpos - break - else: - pos += len(buf) - # look for the longest partial match at the end - plen = len(target) - 1 - while plen > 0: - ppos = len(buf) - plen - if buf[ppos:] == target[:plen]: - partial = buf[ppos:] - break - plen -= 1 - data.append(buf) - return (pos, b"".join(data)) - - def __iter__(self) -> Iterator[Tuple[int, PSBaseParserToken]]: - """Iterate over tokens.""" - return self - - def __next__(self) -> Tuple[int, PSBaseParserToken]: - """Get the next token in iteration, raising StopIteration when - done.""" - while True: - c = self._parse1() - # print(c, self._curtoken, self._parse1) - if self._tokens or c == b"": - break - if not self._tokens: - raise StopIteration - return self._tokens.popleft() - - def nexttoken(self) -> Tuple[int, PSBaseParserToken]: - """Get the next token in iteration, raising PSEOF when done.""" - try: - return self.__next__() - except StopIteration: - raise PSEOF - - def _parse_main(self) -> bytes: - """Initial/default state for the lexer.""" - c = self.fp.read(1) - # note that b"" (EOF) is in everything, which is fine - if c in WHITESPACE: - return c - self._curtokenpos = self.fp.tell() - 1 - if c == b"%": - self._curtoken = b"%" - self._parse1 = self._parse_comment - elif c == b"/": - self._curtoken = b"" - self._parse1 = self._parse_literal - elif c in b"-+" or c in NUMBER: - self._curtoken = c - self._parse1 = self._parse_number - elif c == b".": - self._curtoken = c - self._parse1 = self._parse_float - elif c.isalpha(): - self._curtoken = c - self._parse1 = self._parse_keyword - elif c == b"(": - self._curtoken = b"" - self.paren = 1 - self._parse1 = self._parse_string - elif c == b"<": - self._curtoken = b"" - self._parse1 = self._parse_wopen - elif c == b">": - self._curtoken = b"" - self._parse1 = self._parse_wclose - elif c == b"\x00": - pass - else: - self._add_token(KWD(c)) - return c - - def _add_token(self, obj: PSBaseParserToken) -> None: - 
"""Add a succesfully parsed token.""" - self._tokens.append((self._curtokenpos, obj)) - - def _parse_comment(self) -> bytes: - """Comment state for the lexer""" - c = self.fp.read(1) - if c in EOL: # this includes b"", i.e. EOF - self._parse1 = self._parse_main - # We ignore comments. - # self._tokens.append(self._curtoken) - else: - self._curtoken += c - return c - - def _parse_literal(self) -> bytes: - """Literal (keyword) state for the lexer.""" - c = self.fp.read(1) - if c == b"#": - self.hex = b"" - self._parse1 = self._parse_literal_hex - elif c in NOTLITERAL: - if c: - self.fp.seek(-1, io.SEEK_CUR) - self._add_token(LIT(name_str(self._curtoken))) - self._parse1 = self._parse_main - else: - self._curtoken += c - return c - - def _parse_literal_hex(self) -> bytes: - """State for escaped hex characters in literal names""" - # Consume a hex digit only if we can ... consume a hex digit - if len(self.hex) >= 2: # it actually can't exceed 2 - self._curtoken += bytes((int(self.hex, 16),)) - self._parse1 = self._parse_literal - return b"/" - c = self.fp.read(1) - if c and c in HEX: - self.hex += c - else: - if c: # not EOF, but not hex either - log.warning("Invalid hex digit %r in literal", c) - self.fp.seek(-1, io.SEEK_CUR) - # Add the intervening junk, just in case - tok = LIT(name_str(self._curtoken)) - self._add_token(tok) - self._curtokenpos = self.tell() - 1 - len(self.hex) - self._add_token(KWD(b"#" + self.hex)) - self._parse1 = self._parse_main - return c - - def _parse_number(self) -> bytes: - """State for numeric objects.""" - c = self.fp.read(1) - if c and c in NUMBER: - self._curtoken += c - elif c == b".": - self._curtoken += c - self._parse1 = self._parse_float - else: - if c: - self.fp.seek(-1, io.SEEK_CUR) - try: - self._add_token(int(self._curtoken)) - except ValueError: - log.warning("Invalid int literal: %r", self._curtoken) - self._parse1 = self._parse_main - return c - - def _parse_float(self) -> bytes: - """State for fractional part of numeric 
objects.""" - c = self.fp.read(1) - # b"" is in everything so we have to add an extra check - if not c or c not in NUMBER: - if c: - self.fp.seek(-1, io.SEEK_CUR) - try: - self._add_token(float(self._curtoken)) - except ValueError: - log.warning("Invalid float literal: %r", self._curtoken) - self._parse1 = self._parse_main - else: - self._curtoken += c - return c - - def _parse_keyword(self) -> bytes: - """State for keywords.""" - c = self.fp.read(1) - if c in NOTKEYWORD: # includes EOF - if c: - self.fp.seek(-1, io.SEEK_CUR) - if self._curtoken == b"true": - self._add_token(True) - elif self._curtoken == b"false": - self._add_token(False) - else: - self._add_token(KWD(self._curtoken)) - self._parse1 = self._parse_main - else: - self._curtoken += c - return c - - def _parse_string(self) -> bytes: - """State for string objects.""" - c = self.fp.read(1) - if c and c in NOTSTRING: # does not include EOF - if c == b"\\": - self._parse1 = self._parse_string_esc - return c - elif c == b"(": - self.paren += 1 - self._curtoken += c - return c - elif c == b")": - self.paren -= 1 - if self.paren: - self._curtoken += c - return c - # We saw the last parenthesis and fell through (it will be - # consumed, but not added to self._curtoken) - self._add_token(self._curtoken) - self._parse1 = self._parse_main - elif c == b"\r": - # PDF 1.7 page 15: An end-of-line marker appearing within - # a literal string without a preceding REVERSE SOLIDUS - # shall be treated as a byte value of (0Ah), irrespective - # of whether the end-of-line marker was a CARRIAGE RETURN - # (0Dh), a LINE FEED (0Ah), or both. - cc = self.fp.read(1) - # Put it back if it isn't \n - if cc and cc != b"\n": - self.fp.seek(-1, io.SEEK_CUR) - self._curtoken += b"\n" - else: - self._curtoken += c - return c - - def _parse_string_esc(self) -> bytes: - """State for escapes in literal strings. 
We have seen a - backslash and nothing else.""" - c = self.fp.read(1) - if c and c in OCTAL: # exclude EOF - self.oct = c - self._parse1 = self._parse_string_octal - return c - elif c and c in ESC_STRING: - self._curtoken += bytes((ESC_STRING[c],)) - elif c == b"\n": # Skip newline after backslash - pass - elif c == b"\r": # Also skip CRLF after - cc = self.fp.read(1) - # Put it back if it isn't \n - if cc and cc != b"\n": - self.fp.seek(-1, io.SEEK_CUR) - elif c == b"": - log.warning("EOF inside escape %r", self._curtoken) - else: - log.warning("Unrecognized escape %r", c) - self._curtoken += c - self._parse1 = self._parse_string - return c - - def _parse_string_octal(self) -> bytes: - """State for an octal escape.""" - c = self.fp.read(1) - if c and c in OCTAL: # exclude EOF - self.oct += c - done = len(self.oct) >= 3 # it can't be > though - else: - if c: - self.fp.seek(-1, io.SEEK_CUR) - else: - log.warning("EOF in octal escape %r", self._curtoken) - done = True - if done: - chrcode = int(self.oct, 8) - if chrcode >= 256: - # PDF1.7 p.16: "high-order overflow shall be ignored." 
- log.warning("Invalid octal %r (%d)", self.oct, chrcode) - else: - self._curtoken += bytes((chrcode,)) - # Back to normal string parsing - self._parse1 = self._parse_string - return c - - def _parse_wopen(self) -> bytes: - """State for start of dictionary or hex string.""" - c = self.fp.read(1) - if c == b"<": - self._add_token(KEYWORD_DICT_BEGIN) - self._parse1 = self._parse_main - else: - if c: - self.fp.seek(-1, io.SEEK_CUR) - self._parse1 = self._parse_hexstring - return c - - def _parse_wclose(self) -> bytes: - """State for end of dictionary (accessed from initial state only)""" - c = self.fp.read(1) - if c == b">": - self._add_token(KEYWORD_DICT_END) - else: - # Assuming this is a keyword (which means nothing) - self._add_token(KEYWORD_GT) - if c: - self.fp.seek(-1, io.SEEK_CUR) - self._parse1 = self._parse_main - return c - - def _parse_hexstring(self) -> bytes: - """State for parsing hexadecimal literal strings.""" - c = self.fp.read(1) - if not c: - log.warning("EOF in hex string %r", self._curtoken) - elif c in WHITESPACE: - pass - elif c in HEX: - self._curtoken += c - elif c == b">": - if len(self._curtoken) % 2 == 1: - self._curtoken += b"0" - token = unhexlify(self._curtoken) - self._add_token(token) - self._parse1 = self._parse_main - else: - log.warning("unexpected character %r in hex string %r", c, self._curtoken) - return c - - -LEXER = re.compile( - rb"""(?: - (?P \s+) - | (?P %[^\r\n]*[\r\n]) - | (?P /(?: \#[A-Fa-f\d][A-Fa-f\d] | [^#/%\[\]()<>{}\s])+ ) - | (?P [-+]? (?: \d*\.\d+ | \d+ ) ) - | (?P [A-Za-z] [^#/%\[\]()<>{}\s]*) - | (?P \([^()\\]*) - | (?P <[A-Fa-f\d\s]*>) - | (?P <<) - | (?P >>) - | (?P .) -) -""", - re.VERBOSE, -) -STRLEXER = re.compile( - rb"""(?: - (?P \\[0-7]{1,3}) - | (?P \\(?:\r\n?|\n)) - | (?P \\.) - | (?P \() - | (?P \)) - | (?P \r\n?|\n) - | (?P .) 
-)""", - re.VERBOSE, -) -HEXDIGIT = re.compile(rb"#([A-Fa-f\d][A-Fa-f\d])") -EOLR = re.compile(rb"\r\n?|\n") -SPC = re.compile(rb"\s") - - -class PSInMemoryParser: - """ - Parser for in-memory data streams. - """ - - def __init__(self, data: Union[bytes, mmap.mmap]) -> None: - self.data = data - self.pos = 0 - self.end = len(data) - self._tokens: Deque[Tuple[int, PSBaseParserToken]] = deque() - - def reinit(self, data: bytes) -> None: - """Reinitialize parser with a new buffer.""" - self.data = data - self.seek(0) - - def seek(self, pos: int) -> None: - """Seek to a position and reinitialize parser state.""" - self.pos = pos - self._curtoken = b"" - self._curtokenpos = 0 - self._tokens.clear() - - def tell(self) -> int: - """Get the current position in the buffer.""" - return self.pos - - def read(self, objlen: int) -> bytes: - """Read data from current position, advancing to the end of - this data.""" - pos = self.pos - self.pos = min(pos + objlen, len(self.data)) - return self.data[pos : self.pos] - - def nextline(self) -> Tuple[int, bytes]: - r"""Fetches a next line that ends either with \r, \n, or \r\n.""" - if self.pos == self.end: - raise PSEOF - linepos = self.pos - m = EOLR.search(self.data, self.pos) - if m is None: - self.pos = self.end - else: - self.pos = m.end() - return (linepos, self.data[linepos : self.pos]) - - def revreadlines(self) -> Iterator[bytes]: - """Fetches a next line backwards. - - This is used to locate the trailers at the end of a file. So, - it isn't actually used in PSInMemoryParser, but is here for - completeness. 
- """ - endline = pos = self.end - while True: - nidx = self.data.rfind(b"\n", 0, pos) - ridx = self.data.rfind(b"\r", 0, pos) - best = max(nidx, ridx) - if best == -1: - yield self.data[:endline] - break - yield self.data[best + 1 : endline] - endline = best + 1 - pos = best - if pos > 0 and self.data[pos - 1 : pos + 1] == b"\r\n": - pos -= 1 - - def get_inline_data( - self, target: bytes = b"EI", blocksize: int = -1 - ) -> Tuple[int, bytes]: - """Get the data for an inline image up to the target - end-of-stream marker. - - Returns a tuple of the position of the target in the data and the - data *including* the end of stream marker. Advances the file - pointer to a position after the end of the stream. - - The caller is responsible for removing the end-of-stream if - necessary (this depends on the filter being used) and parsing - the end-of-stream token (likewise) if necessary. - """ - tpos = self.data.find(target, self.pos) - if tpos != -1: - nextpos = tpos + len(target) - result = (tpos, self.data[self.pos : nextpos]) - self.pos = nextpos - return result - return (-1, b"") - - def __iter__(self) -> Iterator[Tuple[int, PSBaseParserToken]]: - """Iterate over tokens.""" - return self - - def nexttoken(self) -> Tuple[int, PSBaseParserToken]: - """Get the next token in iteration, raising PSEOF when done.""" - try: - return self.__next__() - except StopIteration: - raise PSEOF - - def __next__(self) -> Tuple[int, PSBaseParserToken]: - """Get the next token in iteration, raising StopIteration when - done.""" - while True: - m = LEXER.match(self.data, self.pos) - if m is None: # can only happen at EOS - raise StopIteration - self._curtokenpos = m.start() - self.pos = m.end() - if m.lastgroup not in ("whitespace", "comment"): # type: ignore - # Okay, we got a token or something - break - self._curtoken = m[0] - if m.lastgroup == "name": # type: ignore - self._curtoken = m[0][1:] - self._curtoken = HEXDIGIT.sub( - lambda x: bytes((int(x[1], 16),)), self._curtoken - ) - 
tok = LIT(name_str(self._curtoken)) - return (self._curtokenpos, tok) - if m.lastgroup == "number": # type: ignore - if b"." in self._curtoken: - return (self._curtokenpos, float(self._curtoken)) - else: - return (self._curtokenpos, int(self._curtoken)) - if m.lastgroup == "startdict": # type: ignore - return (self._curtokenpos, KEYWORD_DICT_BEGIN) - if m.lastgroup == "enddict": # type: ignore - return (self._curtokenpos, KEYWORD_DICT_END) - if m.lastgroup == "startstr": # type: ignore - return self._parse_endstr(self.data[m.start() + 1 : m.end()], m.end()) - if m.lastgroup == "hexstr": # type: ignore - self._curtoken = SPC.sub(b"", self._curtoken[1:-1]) - if len(self._curtoken) % 2 == 1: - self._curtoken += b"0" - return (self._curtokenpos, unhexlify(self._curtoken)) - # Anything else is treated as a keyword (whether explicitly matched or not) - if self._curtoken == b"true": - return (self._curtokenpos, True) - elif self._curtoken == b"false": - return (self._curtokenpos, False) - else: - return (self._curtokenpos, KWD(self._curtoken)) - - def _parse_endstr(self, start: bytes, pos: int) -> Tuple[int, PSBaseParserToken]: - """Parse the remainder of a string.""" - # Handle nonsense CRLF conversion in strings (PDF 1.7, p.15) - parts = [EOLR.sub(b"\n", start)] - paren = 1 - for m in STRLEXER.finditer(self.data, pos): - self.pos = m.end() - if m.lastgroup == "parenright": # type: ignore - paren -= 1 - if paren == 0: - # By far the most common situation! - break - parts.append(m[0]) - elif m.lastgroup == "parenleft": # type: ignore - parts.append(m[0]) - paren += 1 - elif m.lastgroup == "escape": # type: ignore - chr = m[0][1:2] - if chr not in ESC_STRING: - log.warning("Unrecognized escape %r", m[0]) - parts.append(chr) - else: - parts.append(bytes((ESC_STRING[chr],))) - elif m.lastgroup == "octal": # type: ignore - chrcode = int(m[0][1:], 8) - if chrcode >= 256: - # PDF1.7 p.16: "high-order overflow shall be - # ignored." 
- log.warning("Invalid octal %r (%d)", m[0][1:], chrcode) - else: - parts.append(bytes((chrcode,))) - elif m.lastgroup == "newline": # type: ignore - # Handle nonsense CRLF conversion in strings (PDF 1.7, p.15) - parts.append(b"\n") - elif m.lastgroup == "linebreak": # type: ignore - pass - else: - parts.append(m[0]) - if paren != 0: - log.warning("Unterminated string at %d", pos) - raise StopIteration - return (self._curtokenpos, b"".join(parts)) - - -# Stack slots may by occupied by any of: -# * the name of a literal -# * the PSBaseParserToken types -# * list (via KEYWORD_ARRAY) -# * dict (via KEYWORD_DICT) -# * subclass-specific extensions (e.g. PDFStream, PDFObjRef) via ExtraT -ExtraT = TypeVar("ExtraT") -PSStackType = Union[str, float, bool, PSLiteral, bytes, List, Dict, ExtraT] -PSStackEntry = Tuple[int, PSStackType[ExtraT]] - - -class PSStackParser(Generic[ExtraT]): - """Basic parser for PDF objects, can take a file or a `bytes` as - input.""" - - _mmap: Optional[mmap.mmap] = None - - def __init__(self, reader: Union[BinaryIO, bytes]) -> None: - self.reinit(reader) - - def __del__(self): - if self._mmap is not None: - self._mmap.close() - - def reinit(self, reader: Union[BinaryIO, bytes]) -> None: - """Reinitialize parser with a new file or buffer.""" - if isinstance(reader, bytes): - self._parser: Union[PSInMemoryParser, PSFileParser] = PSInMemoryParser( - reader - ) - else: - try: - if self._mmap is not None: - self._mmap.close() - self._mmap = None - self._mmap = mmap.mmap(reader.fileno(), 0, access=mmap.ACCESS_READ) - self._parser = PSInMemoryParser(self._mmap) - except io.UnsupportedOperation: - log.warning( - "mmap not supported on %r, falling back to file parser", reader - ) - self._parser = PSFileParser(reader) - self.reset() - - def reset(self) -> None: - """Reset parser state.""" - self.context: List[Tuple[int, Optional[str], List[PSStackEntry[ExtraT]]]] = [] - self.curtype: Optional[str] = None - self.curstack: List[PSStackEntry[ExtraT]] = [] - 
self.results: List[PSStackEntry[ExtraT]] = [] - - def seek(self, pos: int) -> None: - """Seek to a position and reset parser state.""" - self._parser.seek(pos) - self.reset() - - def tell(self) -> int: - """Get the current position in the file.""" - return self._parser.tell() - - def push(self, *objs: PSStackEntry[ExtraT]) -> None: - """Push some objects onto the stack.""" - self.curstack.extend(objs) - - def pop(self, n: int) -> List[PSStackEntry[ExtraT]]: - """Pop some objects off the stack.""" - objs = self.curstack[-n:] - self.curstack[-n:] = [] - return objs - - def popall(self) -> List[PSStackEntry[ExtraT]]: - """Pop all the things off the stack.""" - objs = self.curstack - self.curstack = [] - return objs - - def add_results(self, *objs: PSStackEntry[ExtraT]) -> None: - """Move some objects to the output.""" - try: - log.debug("add_results: %r", objs) - except Exception: - log.debug("add_results: (unprintable object)") - self.results.extend(objs) - - def start_type(self, pos: int, type: str) -> None: - """Start a composite object (array, dict, etc).""" - self.context.append((pos, self.curtype, self.curstack)) - (self.curtype, self.curstack) = (type, []) - log.debug("start_type: pos=%r, type=%r", pos, type) - - def end_type(self, type: str) -> Tuple[int, List[PSStackType[ExtraT]]]: - """End a composite object (array, dict, etc).""" - if self.curtype != type: - raise PSTypeError(f"Type mismatch: {self.curtype!r} != {type!r}") - objs = [obj for (_, obj) in self.curstack] - (pos, self.curtype, self.curstack) = self.context.pop() - log.debug("end_type: pos=%r, type=%r, objs=%r", pos, type, objs) - return (pos, objs) - - def do_keyword(self, pos: int, token: PSKeyword) -> None: - """Handle a PDF keyword.""" - pass - - def flush(self) -> None: - """Get everything off the stack and into the output?""" - pass - - def nextobject(self) -> PSStackEntry[ExtraT]: - """Yields a list of objects. - - Arrays and dictionaries are represented as Python lists and - dictionaries. 
- - :return: keywords, literals, strings, numbers, arrays and dictionaries. - """ - while not self.results: - (pos, token) = self.nexttoken() - if isinstance(token, (int, float, bool, str, bytes, PSLiteral)): - # normal token - self.push((pos, token)) - elif token == KEYWORD_ARRAY_BEGIN: - # begin array - self.start_type(pos, "a") - elif token == KEYWORD_ARRAY_END: - # end array - try: - self.push(self.end_type("a")) - except PSTypeError: - if settings.STRICT: - raise - elif token == KEYWORD_DICT_BEGIN: - # begin dictionary - self.start_type(pos, "d") - elif token == KEYWORD_DICT_END: - # end dictionary - try: - (pos, objs) = self.end_type("d") - if len(objs) % 2 != 0: - error_msg = "Invalid dictionary construct: %r" % objs - raise PSSyntaxError(error_msg) - d = { - literal_name(k): v - for (k, v) in choplist(2, objs) - if v is not None - } - self.push((pos, d)) - except PSTypeError: - if settings.STRICT: - raise - elif token == KEYWORD_PROC_BEGIN: - # begin proc - self.start_type(pos, "p") - elif token == KEYWORD_PROC_END: - # end proc - try: - self.push(self.end_type("p")) - except PSTypeError: - if settings.STRICT: - raise - elif isinstance(token, PSKeyword): - log.debug( - "do_keyword: pos=%r, token=%r, stack=%r", - pos, - token, - self.curstack, - ) - self.do_keyword(pos, token) - else: - log.error( - "unknown token: pos=%r, token=%r, stack=%r", - pos, - token, - self.curstack, - ) - self.do_keyword(pos, token) - raise PSException - if self.context: - continue - else: - self.flush() # Does nothing here, but in subclasses... (ugh) - obj = self.results.pop(0) - try: - log.debug("nextobject: %r", obj) - except Exception: - log.debug("nextobject: (unprintable object)") - return obj - - # Delegation follows - def nextline(self) -> Tuple[int, bytes]: - r"""Fetches a next line that ends either with \r, \n, or - \r\n.""" - return self._parser.nextline() - - def revreadlines(self) -> Iterator[bytes]: - """Fetches a next line backwards. 
- - This is used to locate the trailers at the end of a file. - """ - return self._parser.revreadlines() - - def read(self, objlen: int) -> bytes: - """Read data from a specified position, moving the current - position to the end of this data.""" - return self._parser.read(objlen) - - def nexttoken(self) -> Tuple[int, PSBaseParserToken]: - """Get the next token in iteration, raising PSEOF when done.""" - try: - return self.__next__() - except StopIteration: - raise PSEOF - - def get_inline_data(self, target: bytes = b"EI") -> Tuple[int, bytes]: - """Get the data for an inline image up to the target - end-of-stream marker.""" - return self._parser.get_inline_data(target) - - def __iter__(self) -> Iterator[Tuple[int, PSBaseParserToken]]: - """Iterate over tokens.""" - return self - - def __next__(self) -> Tuple[int, PSBaseParserToken]: - """Get the next token in iteration, raising StopIteration when - done.""" - return self._parser.__next__() diff --git a/playa/pdfstructtree.py b/playa/structtree.py similarity index 94% rename from playa/pdfstructtree.py rename to playa/structtree.py index 5f1c69e8..39f07501 100644 --- a/playa/pdfstructtree.py +++ b/playa/structtree.py @@ -17,16 +17,15 @@ from playa.data_structures import NumberTree from playa.exceptions import PDFNoStructTree -from playa.pdfpage import PDFPage -from playa.pdfparser import KEYWORD_NULL -from playa.pdftypes import PDFObjRef, resolve1 -from playa.psparser import PSLiteral +from playa.page import Page +from playa.parser import KEYWORD_NULL, PSLiteral +from playa.pdftypes import ObjRef, resolve1 from playa.utils import decode_text logger = logging.getLogger(__name__) if TYPE_CHECKING: - from playa.pdfdocument import PDFDocument + from playa.document import PDFDocument MatchFunc = Callable[["PDFStructElement"], bool] @@ -103,7 +102,7 @@ class PDFStructElement(Findable): alt_text: Union[str, None] actual_text: Union[str, None] title: Union[str, None] - page_number: Union[int, None] + page_idx: Union[int, 
None] attributes: Dict[str, Any] = field(default_factory=dict) mcids: List[int] = field(default_factory=list) children: List["PDFStructElement"] = field(default_factory=list) @@ -117,12 +116,12 @@ def all_mcids(self) -> Iterator[Tuple[Union[int, None], int]]: """ # Collect them depth-first to preserve ordering for mcid in self.mcids: - yield self.page_number, mcid + yield self.page_idx, mcid d = deque(self.children) while d: el = d.popleft() for mcid in el.mcids: - yield el.page_number, mcid + yield el.page_idx, mcid d.extendleft(reversed(el.children)) def to_dict(self) -> Dict[str, Any]: @@ -154,17 +153,17 @@ class PDFStructTree(Findable): Args: doc: Document from which to extract structure tree - pages: List of (number, page) pairs - numbers will be used to - identify pages in the tree through the `page_number` + pages: List of (index, page) pairs - indices will be used to + identify pages in the tree through the `page_idx` attribute of `PDFStructElement`. """ - page: Union[PDFPage, None] + page: Union[Page, None] def __init__( self, doc: "PDFDocument", - pages: Union[Iterable[PDFPage], None] = None, + pages: Union[Iterable[Page], None] = None, ): if "StructTreeRoot" not in doc.catalog: raise PDFNoStructTree("Catalog has no 'StructTreeRoot' entry") @@ -175,11 +174,11 @@ def __init__( self.page_dict: Dict[Any, Union[int, None]] if pages is None: - self.page_dict = {page.pageid: page.page_number for page in doc.pages} + self.page_dict = {page.pageid: page.page_idx for page in doc.pages} self._parse_struct_tree() else: pagelist = list(pages) - self.page_dict = {page.pageid: page.page_number for page in pagelist} + self.page_dict = {page.pageid: page.page_idx for page in pagelist} parent_tree_obj = self.root.get("ParentTree") # If we have a single page then we will work backwards from # its ParentTree - this is because structure elements could @@ -258,12 +257,12 @@ def _make_element( # We hopefully caught these earlier assert "MCID" not in obj, "Uncaught MCR: %s" % 
obj assert "Obj" not in obj, "Uncaught OBJR: %s" % obj - # Get page number if necessary - page_number = None + # Get page index if necessary + page_idx = None if self.page_dict is not None and "Pg" in obj: page_objid = obj["Pg"].objid assert page_objid in self.page_dict, "Object on unparsed page: %s" % obj - page_number = self.page_dict[page_objid] + page_idx = self.page_dict[page_objid] obj_tag = "" if "S" in obj: obj_tag = decode_text(obj["S"].name) @@ -286,7 +285,7 @@ def _make_element( element = PDFStructElement( type=obj_tag, id=element_id, - page_number=page_number, + page_idx=page_idx, revision=revision, lang=lang, title=title, @@ -368,7 +367,7 @@ def _parse_struct_tree(self) -> None: child = obj["Obj"] elif "MCID" in obj: continue - if isinstance(child, PDFObjRef): + if isinstance(child, ObjRef): d.append(child) # Traverse depth-first, removing empty elements (unsure how to @@ -438,7 +437,7 @@ def _resolve_children(self, seen: Dict[str, Any]) -> None: elif "Obj" in obj: child = obj["Obj"] # NOTE: if, not elif, in case of OBJR above - if isinstance(child, PDFObjRef): + if isinstance(child, ObjRef): child_element, _ = seen.get(repr(child), (None, None)) if child_element is not None: element.children.append(child_element) diff --git a/pyproject.toml b/pyproject.toml index 5a14530d..ac290cc2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -67,3 +67,14 @@ ban-relative-imports = "all" [tool.pytest.ini_options] testpaths = [ "tests" ] +[tool.hatch.envs.hatch-test] +extra-dependencies = [ "pdfminer.six" ] + +[tool.hatch.envs.bench] +dependencies = [ "pdfminer.six" ] + +[tool.hatch.envs.bench.scripts] +all = [ + "python tests/benchmark_parser.py", + "python tests/benchmark_convert.py", +] diff --git a/tests/benchmark_convert.py b/tests/benchmark_convert.py index e1ced060..01519e7e 100644 --- a/tests/benchmark_convert.py +++ b/tests/benchmark_convert.py @@ -30,10 +30,10 @@ def benchmark_one_pdf(path: Path): passwords = PASSWORDS.get(path.name, [""]) for password 
in passwords: - LOG.debug("Reading %s", path) + LOG.info("Reading %s", path) with playa.open(path, password=password) as pdf: for page in pdf.pages: - _ = page.layout + _ = list(page.layout) def benchmark_one_pdfminer(path: Path): @@ -57,17 +57,21 @@ def benchmark_one_pdfminer(path: Path): if __name__ == "__main__": - logging.basicConfig(level=logging.INFO) + # Silence warnings about broken PDFs + logging.basicConfig(level=logging.ERROR) niter = 10 - if len(sys.argv) == 1 or "pdfminer" in sys.argv[1:]: - start = time.time() - for _ in range(niter): - for path in ALLPDFS: - benchmark_one_pdfminer(path) - LOG.info("pdfminer.six took %f", time.time() - start) - if len(sys.argv) == 1 or "playa" in sys.argv[1:]: - start = time.time() - for _ in range(niter): - for path in ALLPDFS: + miner_time = beach_time = 0.0 + for iter in range(niter + 1): + for path in ALLPDFS: + if len(sys.argv) == 1 or "playa" in sys.argv[1:]: + start = time.time() benchmark_one_pdf(path) - LOG.info("PLAYA took %f", time.time() - start) + if iter != 0: + beach_time += time.time() - start + if len(sys.argv) == 1 or "pdfminer" in sys.argv[1:]: + start = time.time() + benchmark_one_pdfminer(path) + if iter != 0: + miner_time += time.time() - start + print("pdfminer.six took %.2fs / iter" % (miner_time / niter,)) + print("PLAYA took %.2fs / iter" % (beach_time / niter,)) diff --git a/tests/benchmark_parser.py b/tests/benchmark_parser.py index df999449..f67385d3 100644 --- a/tests/benchmark_parser.py +++ b/tests/benchmark_parser.py @@ -272,21 +272,21 @@ def bench_bytes(): - from playa.psparser import PSInMemoryParser + from playa.parser import Lexer runs = 100 start = time.time() - parser = PSInMemoryParser(DATA * runs) + parser = Lexer(DATA * runs) _ = list(parser) print( - "PLAYA Parser (bytes): %fms / run" % ((time.time() - start) / runs * 1000), + "PLAYA Lexer (bytes): %fms / run" % ((time.time() - start) / runs * 1000), ) def bench_mmap(): import mmap - from playa.psparser import PSInMemoryParser 
+ from playa.parser import Lexer with tempfile.NamedTemporaryFile() as tf: runs = 100 @@ -295,55 +295,17 @@ def bench_mmap(): with open(tf.name, "rb") as infh: start = time.time() mapping = mmap.mmap(infh.fileno(), 0, access=mmap.ACCESS_READ) - parser = PSInMemoryParser(mapping) + parser = Lexer(mapping) _ = list(parser) print( - "PLAYA Parser (mmap): %fms / run" + "PLAYA Lexer (mmap): %fms / run" % ((time.time() - start) / runs * 1000), ) -def bench_bytesio(): - from pdfminer.psparser import PSEOF, PSBaseParser - - runs = 100 - start = time.time() - parser = PSBaseParser(BytesIO(DATA * runs)) - while True: - try: - _ = parser.nexttoken() - except PSEOF: - break - print( - "pdfminer.six Parser (BytesIO): %fms / run" - % ((time.time() - start) / runs * 1000), - ) - - def bench_playa(): - from playa.pdfdocument import PDFDocument - from playa.pdfpage import PDFPage - from playa.psparser import PSFileParser + from playa.document import PDFDocument - runs = 100 - start = time.time() - parser = PSFileParser(BytesIO(DATA * runs)) - _ = list(parser) - print( - "PLAYA Parser (BytesIO): %fms / run" % ((time.time() - start) / runs * 1000), - ) - with tempfile.NamedTemporaryFile() as tf: - runs = 100 - with open(tf.name, "wb") as outfh: - outfh.write(DATA * runs) - with open(tf.name, "rb") as infh: - start = time.time() - parser = PSFileParser(infh) - _ = list(parser) - print( - "PLAYA Parser (BinaryIO): %fms / run" - % ((time.time() - start) / runs * 1000), - ) bench_bytes() bench_mmap() @@ -352,7 +314,7 @@ def bench_playa(): for _ in range(runs): with open(TESTDIR / "contrib" / "pagelabels.pdf", "rb") as infh: doc = PDFDocument(infh) - page = next(PDFPage.create_pages(doc)) + page = doc.pages[0] _ = page.layout print( "PLAYA Interpreter: %dms / run" % ((time.time() - start) / runs * 1000), @@ -376,7 +338,7 @@ def bench_pdfminer(): except PSEOF: break print( - "pdfminer.six Parser (BytesIO): %fms / run" + "pdfminer.six Lexer (BytesIO): %fms / run" % ((time.time() - start) / 
runs * 1000), ) with tempfile.NamedTemporaryFile() as tf: @@ -391,7 +353,7 @@ def bench_pdfminer(): except PSEOF: break print( - "pdfminer.six Parser (BinaryIO): %fms / run" + "pdfminer.six Lexer (BinaryIO): %fms / run" % ((time.time() - start) / runs * 1000), ) runs = 20 @@ -418,7 +380,5 @@ def bench_pdfminer(): bench_playa() if len(sys.argv) > 1 and sys.argv[1] == "bytes": bench_bytes() - if len(sys.argv) > 1 and sys.argv[1] == "bytesio": - bench_bytesio() if len(sys.argv) > 1 and sys.argv[1] == "mmap": bench_mmap() diff --git a/tests/test_open.py b/tests/test_open.py index fe67b616..79699f8a 100644 --- a/tests/test_open.py +++ b/tests/test_open.py @@ -24,36 +24,58 @@ @pytest.mark.parametrize("path", ALLPDFS, ids=str) def test_open(path: Path) -> None: - """Open all the documents""" + """Open all the documents and compare with pdfplumber""" + from pdfminer.converter import PDFPageAggregator + from pdfminer.pdfdocument import PDFDocument + from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager + from pdfminer.pdfpage import PDFPage + from pdfminer.pdfparser import PDFParser + passwords = PASSWORDS.get(path.name, [""]) for password in passwords: - with playa.open(TESTDIR / path, password=password) as _pdf: - pass - - -def test_analyze() -> None: - """Test the layout analyzer (FIXME: PLAYA Ain't a Layout Analyzer)""" - with playa.open( - TESTDIR / "2023-04-06-ODJ et Résolutions-séance xtra 6 avril 2023.pdf" - ) as pdf: - for page in pdf.pages: - page_objs = list(page.layout) - print(len(page_objs)) + miner = [] + with open(path, "rb") as infh: + try: + rsrc = PDFResourceManager() + agg = PDFPageAggregator(rsrc, pageno=1) + interp = PDFPageInterpreter(rsrc, agg) + pdf = PDFDocument(PDFParser(infh), password=password) + for page in PDFPage.create_pages(pdf): + interp.process_page(page) + layout = agg.result + for item in layout: + miner.append((type(item).__name__, item.bbox)) + except Exception: + continue + + itor = iter(miner) + with playa.open(path, 
password=password) as pdf: + for page in pdf.pages: + for item in page.layout: + thingy = (type(item).__name__, item.bbox) + assert thingy == next(itor) def test_inline_data() -> None: - # No, there's no easy way to unit test PDFContentParser directly. - # The necessary mocking would be useless considering that I will - # shortly demolish these redundant and confusing APIs. with playa.open(TESTDIR / "contrib" / "issue-1008-inline-ascii85.pdf") as doc: - _ = doc.pages[0].layout + page = doc.pages[0] + items = list(page.layout) + assert len(items) == 456 def test_multiple_contents() -> None: with playa.open(TESTDIR / "jo.pdf") as doc: page = doc.pages[0] assert len(page.contents) > 1 - _ = page.layout + items = list(page.layout) + assert len(items) == 898 + + +def test_xobjects() -> None: + with playa.open(TESTDIR / "encryption/aes-256.pdf", password="foo") as doc: + for page in doc.pages: + for item in page.layout: + print(item) def test_weakrefs() -> None: @@ -67,4 +89,6 @@ def test_weakrefs() -> None: if __name__ == "__main__": - test_open(TESTDIR / "simple5.pdf") + import logging + logging.basicConfig(level=logging.DEBUG) + test_xobjects() diff --git a/tests/test_pdfdocument.py b/tests/test_pdfdocument.py index 969643dc..af020f94 100644 --- a/tests/test_pdfdocument.py +++ b/tests/test_pdfdocument.py @@ -10,12 +10,10 @@ import playa import playa.settings from playa.data_structures import NameTree +from playa.document import read_header from playa.exceptions import PDFSyntaxError -from playa.pdfdocument import read_header from playa.utils import decode_text -playa.settings.STRICT = True - TESTDIR = Path(__file__).parent.parent / "samples" @@ -34,6 +32,12 @@ def test_read_header(): assert read_header(BytesIO(b"%PDF-1.7")) == "1.7" +def test_objects(): + with playa.open(TESTDIR / "simple1.pdf") as doc: + for obj in doc: + print(obj) + + def test_page_labels(): with playa.open(TESTDIR / "contrib" / "pagelabels.pdf") as doc: labels = [label for _, label in 
zip(range(10), doc.page_labels)] diff --git a/tests/test_pdfparser.py b/tests/test_pdfparser.py new file mode 100644 index 00000000..e603fa05 --- /dev/null +++ b/tests/test_pdfparser.py @@ -0,0 +1,22 @@ +from pathlib import Path + +from playa.parser import PDFParser + +TESTDIR = Path(__file__).parent.parent / "samples" + + +class MockDoc: + def __call__(self): + return self + + decipher = None + + +def test_indirect_objects(): + """Verify that indirect objects are parsed properly.""" + with open(TESTDIR / "simple2.pdf", "rb") as infh: + data = infh.read() + doc = MockDoc() + parser = PDFParser(data, doc) + for obj in parser: + print(obj) diff --git a/tests/test_pdfstructtree.py b/tests/test_pdfstructtree.py index ae5fe15b..90eb37c0 100644 --- a/tests/test_pdfstructtree.py +++ b/tests/test_pdfstructtree.py @@ -3,7 +3,7 @@ from pathlib import Path import playa -from playa.pdfstructtree import PDFStructTree +from playa.structtree import PDFStructTree TESTDIR = Path(__file__).parent.parent / "samples" @@ -64,14 +64,14 @@ def test_all_mcids(self) -> None: stree = PDFStructTree(pdf) sect = next(stree.find_all("Sect")) mcids = list(sect.all_mcids()) - page_numbers = set(page for page, mcid in mcids) - assert 1 in page_numbers - assert 2 in page_numbers + page_indices = set(page for page, mcid in mcids) + assert 0 in page_indices + assert 1 in page_indices stree = PDFStructTree(pdf, [pdf.pages[1]]) sect = next(stree.find_all("Sect")) mcids = list(sect.all_mcids()) - page_numbers = set(page for page, mcid in mcids) - assert page_numbers == {2} + page_indices = set(page for page, mcid in mcids) + assert page_indices == {1} for p in sect.find_all("P"): assert set(mcid for page, mcid in p.all_mcids()) == set(p.mcids) diff --git a/tests/test_psparser.py b/tests/test_psparser.py index 948a042b..ce05d9f1 100644 --- a/tests/test_psparser.py +++ b/tests/test_psparser.py @@ -1,30 +1,24 @@ import logging -import tempfile -from io import BytesIO from typing import Any, List, Tuple 
import pytest -from playa.exceptions import PSEOF -from playa.psparser import ( +from playa.parser import ( KEYWORD_DICT_BEGIN, KEYWORD_DICT_END, + Lexer, + Parser, +) +from playa.pdftypes import ( KWD, LIT, - PSFileParser, - PSInMemoryParser, - PSStackParser, keyword_name, literal_name, ) logger = logging.getLogger(__name__) - -class TestPSFileParser: - """Simplistic Test cases""" - - TESTDATA = rb"""%!PS +TESTDATA1 = rb"""%!PS begin end " @ # /a/BCD /Some_Name /foo#5f#xbaa @@ -43,132 +37,113 @@ class TestPSFileParser: [ 1 (z) ! ] << /foo (bar) >> """ +TOKENS1 = [ + (5, KWD(b"begin")), + (11, KWD(b"end")), + (16, KWD(b'"')), + (19, KWD(b"@")), + (21, KWD(b"#")), + (23, LIT("a")), + (25, LIT("BCD")), + (30, LIT("Some_Name")), + (41, LIT("foo_")), + (48, KWD(b"#")), + (49, KWD(b"xbaa")), + (54, 0), + (56, 1), + (59, -2), + (62, 0.5), + (65, 1.234), + (71, b"abc"), + (77, b""), + (80, b"abc ( def ) ghi"), + (98, b"def \x00 4ghi"), + (118, b"bach\\slask"), + (132, b"foo\nbaa"), + (143, b"this % is not a comment."), + (170, b"foo\nbaa"), + (180, b"foobaa"), + (191, b""), + (194, b" "), + (199, b"@@ "), + (211, b"\xab\xcd\x00\x124\x50"), + (226, KWD(b"func")), + (230, LIT("a")), + (232, LIT("b")), + (234, KWD(b"{")), + (235, b"c"), + (238, KWD(b"do*")), + (241, KWD(b"}")), + (242, KWD(b"def")), + (246, KWD(b"[")), + (248, 1), + (250, b"z"), + (254, KWD(b"!")), + (256, KWD(b"]")), + (258, KWD(b"<<")), + (261, LIT("foo")), + (266, b"bar"), + (272, KWD(b">>")), +] +OBJS1 = [ + (23, LIT("a")), + (25, LIT("BCD")), + (30, LIT("Some_Name")), + (41, LIT("foo_")), + (54, 0), + (56, 1), + (59, -2), + (62, 0.5), + (65, 1.234), + (71, b"abc"), + (77, b""), + (80, b"abc ( def ) ghi"), + (98, b"def \x00 4ghi"), + (118, b"bach\\slask"), + (132, b"foo\nbaa"), + (143, b"this % is not a comment."), + (170, b"foo\nbaa"), + (180, b"foobaa"), + (191, b""), + (194, b" "), + (199, b"@@ "), + (211, b"\xab\xcd\x00\x124\x50"), + (230, LIT("a")), + (232, LIT("b")), + (234, [b"c"]), + (246, [1, 
b"z"]), + (258, {"foo": b"bar"}), +] + + +def test_lexer_miner(): + """Lexer test case from pdfminer""" + tokens = list(Lexer(TESTDATA1)) + logger.info(tokens) + assert tokens == TOKENS1 + + +def test_parser_miner(): + """Parser test case from pdfminer""" - TOKENS = [ - (5, KWD(b"begin")), - (11, KWD(b"end")), - (16, KWD(b'"')), - (19, KWD(b"@")), - (21, KWD(b"#")), - (23, LIT("a")), - (25, LIT("BCD")), - (30, LIT("Some_Name")), - (41, LIT("foo_")), - (48, KWD(b"#")), - (49, KWD(b"xbaa")), - (54, 0), - (56, 1), - (59, -2), - (62, 0.5), - (65, 1.234), - (71, b"abc"), - (77, b""), - (80, b"abc ( def ) ghi"), - (98, b"def \x00 4ghi"), - (118, b"bach\\slask"), - (132, b"foo\nbaa"), - (143, b"this % is not a comment."), - (170, b"foo\nbaa"), - (180, b"foobaa"), - (191, b""), - (194, b" "), - (199, b"@@ "), - (211, b"\xab\xcd\x00\x124\x50"), - (226, KWD(b"func")), - (230, LIT("a")), - (232, LIT("b")), - (234, KWD(b"{")), - (235, b"c"), - (238, KWD(b"do*")), - (241, KWD(b"}")), - (242, KWD(b"def")), - (246, KWD(b"[")), - (248, 1), - (250, b"z"), - (254, KWD(b"!")), - (256, KWD(b"]")), - (258, KWD(b"<<")), - (261, LIT("foo")), - (266, b"bar"), - (272, KWD(b">>")), - ] - - OBJS = [ - (23, LIT("a")), - (25, LIT("BCD")), - (30, LIT("Some_Name")), - (41, LIT("foo_")), - (54, 0), - (56, 1), - (59, -2), - (62, 0.5), - (65, 1.234), - (71, b"abc"), - (77, b""), - (80, b"abc ( def ) ghi"), - (98, b"def \x00 4ghi"), - (118, b"bach\\slask"), - (132, b"foo\nbaa"), - (143, b"this % is not a comment."), - (170, b"foo\nbaa"), - (180, b"foobaa"), - (191, b""), - (194, b" "), - (199, b"@@ "), - (211, b"\xab\xcd\x00\x124\x50"), - (230, LIT("a")), - (232, LIT("b")), - (234, [b"c"]), - (246, [1, b"z"]), - (258, {"foo": b"bar"}), - ] - - def get_tokens(self, s): - class MyParser(PSFileParser): - def flush(self): - self.add_results(*self.popall()) - - parser = MyParser(BytesIO(s)) - r = [] - try: - while True: - r.append(parser.nexttoken()) - except PSEOF: - pass - return r - - def 
get_objects(self, s): - class MyParser(PSStackParser): - def flush(self): - self.add_results(*self.popall()) - - parser = MyParser(s) - r = [] - try: - while True: - r.append(parser.nextobject()) - except PSEOF: - pass - return r - - def test_1(self): - tokens = self.get_tokens(self.TESTDATA) - logger.info(tokens) - assert tokens == self.TOKENS - - def test_2(self): - objs = self.get_objects(self.TESTDATA) - logger.info(objs) - assert objs == self.OBJS - - -TESTDATA = b""" + # FIXME: Still relying on subclassing + class MyParser(Parser): + def flush(self) -> None: + objs = self.popall() + self.add_results(*objs) + + objs = list(MyParser(TESTDATA1)) + logger.info(objs) + assert objs == OBJS1 + + +TESTDATA2 = b""" ugh foo\r bar\rbaz quxx bog""" -EXPECTED = [ +EXPECTED2 = [ (0, b"\n"), (1, b"ugh\n"), (5, b"foo\r\n"), @@ -179,54 +154,20 @@ def test_2(self): ] -def run_parsers(data: bytes, expected: List[Any], makefunc: Any) -> None: - """Test stuff on both BytesIO and BinaryIO.""" - bp = PSInMemoryParser(data) - output = [] - func = makefunc(bp) - while True: - try: - output.append(func()) - except PSEOF: - break - assert output == expected - with tempfile.NamedTemporaryFile() as tf: - with open(tf.name, "wb") as outfh: - outfh.write(data) - with open(tf.name, "rb") as infh: - fp = PSFileParser(infh) - func = makefunc(fp) - output = [] - while True: - try: - output.append(func()) - except PSEOF: - break - assert output == expected - - -def test_nextline() -> None: +def test_lines() -> None: """Verify that we replicate the old nextline method.""" - run_parsers(TESTDATA, EXPECTED, lambda foo: foo.nextline) + parser = Lexer(TESTDATA2) + output = list(parser.iter_lines()) + assert output == EXPECTED2 -def test_revreadlines() -> None: +def test_revlines() -> None: """Verify that we replicate the old revreadlines method.""" - expected = list(reversed([line for pos, line in EXPECTED])) - - def make_next(parser: Any) -> Any: - itor = parser.revreadlines() - - def nextor() -> 
Any: - try: - line = next(itor) - except StopIteration: - raise PSEOF - return line - - return nextor - - run_parsers(TESTDATA, expected, make_next) + expected = list(reversed([line for pos, line in EXPECTED2])) + parser = Lexer(TESTDATA2) + parser.seek(parser.end) + output = list(parser.reverse_iter_lines()) + assert output == expected SIMPLE1 = b"""1 0 obj @@ -258,22 +199,12 @@ def nextor() -> Any: def list_parsers(data: bytes, expected: List[Any], discard_pos: bool = False) -> None: - bp = PSInMemoryParser(data) + bp = Lexer(data) if discard_pos: tokens: List[Any] = [tok for pos, tok in list(bp)] else: tokens = list(bp) assert tokens == expected - with tempfile.NamedTemporaryFile() as tf: - with open(tf.name, "wb") as outfh: - outfh.write(data) - with open(tf.name, "rb") as infh: - fp = PSFileParser(infh) - if discard_pos: - tokens = [tok for pos, tok in list(fp)] - else: - tokens = list(fp) - assert tokens == expected def test_new_parser() -> None: @@ -364,18 +295,10 @@ def inline_parsers( nexttoken: Any = None, blocksize: int = 16, ) -> None: - bp = PSInMemoryParser(data) + bp = Lexer(data) assert bp.get_inline_data(target=target, blocksize=blocksize) == expected if nexttoken is not None: - assert bp.nexttoken() == nexttoken - with tempfile.NamedTemporaryFile() as tf: - with open(tf.name, "wb") as outfh: - outfh.write(data) - with open(tf.name, "rb") as infh: - fp = PSFileParser(infh) - assert fp.get_inline_data(target=target, blocksize=blocksize) == expected - if nexttoken is not None: - assert fp.nexttoken() == nexttoken + assert next(bp) == nexttoken def test_get_inline_data() -> None: