diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml new file mode 100644 index 00000000..caa625e6 --- /dev/null +++ b/.github/workflows/tests.yml @@ -0,0 +1,20 @@ +name: Run all tests +on: + push: + branches: [ "main" ] + pull_request: + branches: [ "main" ] + +jobs: + test: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: "3.10" + - name: Install Hatch + uses: pypa/hatch@install + - name: Run tests + run: hatch test diff --git a/.gitignore b/.gitignore index 7f27b7ae..f136d472 100644 --- a/.gitignore +++ b/.gitignore @@ -26,3 +26,4 @@ Pipfile.lock .vscode/ poetry.lock .eggs +*~ diff --git a/playa/__init__.py b/playa/__init__.py index 86e53d4c..280e4016 100644 --- a/playa/__init__.py +++ b/playa/__init__.py @@ -12,7 +12,6 @@ from typing import Iterator from playa.pdfdocument import PDFDocument -from playa.pdfparser import PDFParser __version__ = "0.0.1" diff --git a/playa/data_structures.py b/playa/data_structures.py index 6dc4275c..fab26c84 100644 --- a/playa/data_structures.py +++ b/playa/data_structures.py @@ -37,9 +37,7 @@ def _parse(self) -> List[Tuple[int, Any]]: return items - values: List[Tuple[int, Any]] # workaround decorators unsupported by mypy - - @property # type: ignore[no-redef,misc] + @property def values(self) -> List[Tuple[int, Any]]: values = self._parse() diff --git a/playa/pdfdocument.py b/playa/pdfdocument.py index 22e487be..64c92384 100644 --- a/playa/pdfdocument.py +++ b/playa/pdfdocument.py @@ -39,7 +39,7 @@ PDFSyntaxError, PDFTypeError, ) -from playa.pdfparser import KEYWORD_XREF, PDFParser, PDFStreamParser, read_header +from playa.pdfparser import KEYWORD_XREF, PDFParser, PDFStreamParser from playa.pdftypes import ( DecipherCallable, PDFStream, @@ -629,6 +629,27 @@ def decrypt_aes256(self, objid: int, genno: int, data: bytes) -> bytes: } +def read_header(fp: BinaryIO) -> str: + """Read the PDF header and return the (initial) version string. + + Note that this version can be overridden in the document catalog.""" + try: + hdr = fp.read(8) + except IOError as err: + raise PDFSyntaxError("Failed to read PDF header") from err + if not hdr.startswith(b"%PDF-"): + raise PDFSyntaxError("Expected b'%%PDF-', got %r, is this a PDF?" % hdr) + try: + version = hdr[5:].decode("ascii") + except UnicodeDecodeError as err: + raise PDFSyntaxError( + "Version number in %r contains non-ASCII characters" % hdr + ) from err + if not re.match(r"\d\.\d", version): + raise PDFSyntaxError("Version number in %r is invalid" % hdr) + return version + + class PDFDocument: """Representation of a PDF document on disk. 
@@ -670,6 +691,7 @@ def __init__(
         self.decipher: Optional[DecipherCallable] = None
         self._cached_objs: Dict[int, Tuple[object, int]] = {}
         self._parsed_objs: Dict[int, Tuple[List[object], int]] = {}
+        self.pdf_version = read_header(fp)
         self.parser = PDFParser(fp)
         self.parser.set_document(self)  # FIXME: annoying circular reference
         self.is_printable = self.is_modifiable = self.is_extractable = True
@@ -818,6 +840,7 @@ def getobj(self, objid: int) -> object:
         if objid in self._cached_objs:
             (obj, genno) = self._cached_objs[objid]
         else:
+            obj = None
             for xref in self.xrefs:
                 try:
                     (strmid, index, genno) = xref.get_pos(objid)
@@ -837,7 +860,7 @@ def getobj(self, objid: int) -> object:
                     break
                 except (PSEOF, PDFSyntaxError):
                     continue
-            else:
+            if obj is None:
                 raise PDFObjectNotFound(objid)
             log.debug("register: objid=%r: %r", objid, obj)
             self._cached_objs[objid] = (obj, genno)
@@ -871,7 +894,9 @@ def get_page_labels(self) -> Iterator[str]:
         If the document includes page labels, generates strings,
         one per page.  If not, raises PDFNoPageLabels.

-        The resulting iteration is unbounded.
+        The resulting iterator is unbounded, so it is recommended to
+        zip it with the iterator over actual pages returned by `get_pages`.
+
         """
         assert self.catalog is not None
diff --git a/playa/pdfinterp.py b/playa/pdfinterp.py
index 4b09589b..471092b7 100644
--- a/playa/pdfinterp.py
+++ b/playa/pdfinterp.py
@@ -1,7 +1,7 @@
+import io
 import logging
-import re
 from io import BytesIO
-from typing import Dict, List, Mapping, Optional, Sequence, Tuple, Union, cast
+from typing import BinaryIO, Dict, List, Mapping, Optional, Sequence, Tuple, Union, cast

 from playa import settings
 from playa.casting import safe_float
@@ -247,6 +247,69 @@ def get_font(self, objid: object, spec: Mapping[str, object]) -> PDFFont:
         return font


+KEYWORD_BI = KWD(b"BI")
+KEYWORD_ID = KWD(b"ID")
+KEYWORD_EI = KWD(b"EI")
+
+
+def get_inline_data(
+    fp: BinaryIO, target: bytes = b"EI", blocksize: int = 4096
+) -> Tuple[int, bytes]:
+    """Get the data for an inline image up to the target
+    end-of-stream marker.
+
+    Returns a tuple of the position of the target in the data and the
+    data *including* the end of stream marker.  Advances the file
+    pointer to a position after the end of the stream.
+
+    Depending on the filter in use, the caller may also need to
+    remove the end-of-stream marker from the data and to parse it
+    as a token afterwards.
+    """
+    # PDF 1.7, p. 216: The bytes between the ID and EI operators
+    # shall be treated the same as a stream object’s data (see
+    # 7.3.8, "Stream Objects"), even though they do not follow the
+    # standard stream syntax.
+    data = []  # list of blocks
+    partial = b""  # partially seen target
+    pos = 0
+    while True:
+        # Did we see part of the target at the end of the last
+        # block?  Then scan ahead and try to find the rest (we
+        # assume the stream is buffered)
+        if partial:
+            extra_len = len(target) - len(partial)
+            extra = fp.read(extra_len)
+            if partial + extra == target:
+                pos -= len(partial)
+                data.append(extra)
+                break
+            # Put it back (assume buffering!)
+            fp.seek(-extra_len, io.SEEK_CUR)
+            partial = b""
+            # Fall through (the target could be at the beginning)
+        buf = fp.read(blocksize)
+        tpos = buf.find(target)
+        if tpos != -1:
+            data.append(buf[: tpos + len(target)])
+            # Put the extra back (assume buffering!)
+ fp.seek(tpos - len(buf) + len(target), io.SEEK_CUR) + pos += tpos + break + else: + pos += len(buf) + # look for the longest partial match at the end + plen = len(target) - 1 + while plen > 0: + ppos = len(buf) - plen + if buf[ppos:] == target[:plen]: + partial = buf[ppos:] + break + plen -= 1 + data.append(buf) + return (pos, b"".join(data)) + + class PDFContentParser(PSStackParser[Union[PSKeyword, PDFStream]]): def __init__(self, streams: Sequence[object]) -> None: self.streams = streams @@ -267,65 +330,16 @@ def fillfp(self) -> None: def seek(self, pos: int) -> None: self.fillfp() - PSStackParser.seek(self, pos) - - def fillbuf(self) -> None: - if self.charpos < len(self.buf): - return - while 1: - self.fillfp() - self.bufpos = self.fp.tell() - self.buf = self.fp.read(self.BUFSIZ) - if self.buf: - break - self.fp = None # type: ignore[assignment] - self.charpos = 0 - - def get_inline_data(self, pos: int, target: bytes = b"EI") -> Tuple[int, bytes]: - self.seek(pos) - i = 0 - data = b"" - while i <= len(target): - self.fillbuf() - if i: - ci = self.buf[self.charpos] - c = bytes((ci,)) - data += c - self.charpos += 1 - if ( - len(target) <= i - and c.isspace() - or i < len(target) - and c == (bytes((target[i],))) - ): - i += 1 - else: - i = 0 - else: - try: - j = self.buf.index(target[0], self.charpos) - data += self.buf[self.charpos : j + 1] - self.charpos = j + 1 - i = 1 - except ValueError: - data += self.buf[self.charpos :] - self.charpos = len(self.buf) - data = data[: -(len(target) + 1)] # strip the last part - data = re.sub(rb"(\x0d\x0a|[\x0d\x0a])$", b"", data) - return (pos, data) + super().seek(pos) def flush(self) -> None: self.add_results(*self.popall()) - KEYWORD_BI = KWD(b"BI") - KEYWORD_ID = KWD(b"ID") - KEYWORD_EI = KWD(b"EI") - def do_keyword(self, pos: int, token: PSKeyword) -> None: - if token is self.KEYWORD_BI: + if token is KEYWORD_BI: # inline image within a content stream self.start_type(pos, "inline") - elif token is self.KEYWORD_ID: + elif token is KEYWORD_ID: try: (_, objs) = self.end_type("inline") if len(objs) % 2 != 0: @@ -339,13 +353,30 @@ def do_keyword(self, pos: int, token: PSKeyword) -> None: filter = [filter] if filter[0] in LITERALS_ASCII85_DECODE: eos = b"~>" - (pos, data) = self.get_inline_data(pos + len(b"ID "), target=eos) - if eos != b"EI": # it may be necessary for decoding - data += eos + # PDF 1.7 p. 215: Unless the image uses ASCIIHexDecode + # or ASCII85Decode as one of its filters, the ID + # operator shall be followed by a single white-space + # character, and the next character shall be + # interpreted as the first byte of image data. + if eos == b"EI": + self.seek(pos + len(token.name) + 1) + (pos, data) = get_inline_data(self.fp, target=eos) + # FIXME: it is totally unspecified what to do with + # a newline between the end of the data and "EI", + # since there is no explicit stream length. (PDF + # 1.7 p. 756: There should be an end-of-line + # marker after the data and before endstream; this + # marker shall not be included in the stream + # length.) 
+ data = data[: -len(eos)] + else: + self.seek(pos + len(token.name)) + (pos, data) = get_inline_data(self.fp, target=eos) obj = PDFStream(d, data) self.push((pos, obj)) - if eos == b"EI": # otherwise it is still in the stream - self.push((pos, self.KEYWORD_EI)) + # This was included in the data but we need to "parse" it + if eos == b"EI": + self.push((pos, KEYWORD_EI)) except PSTypeError: if settings.STRICT: raise diff --git a/playa/pdfpage.py b/playa/pdfpage.py index 47fe6c78..f2064aea 100644 --- a/playa/pdfpage.py +++ b/playa/pdfpage.py @@ -3,13 +3,15 @@ from typing import Any, BinaryIO, Container, Dict, Iterator, List, Optional, Set, Tuple from playa import settings -from playa.exceptions import PDFObjectNotFound, PDFValueError -from playa.pdfdocument import ( - PDFDocument, +from playa.exceptions import ( PDFNoPageLabels, + PDFObjectNotFound, PDFTextExtractionNotAllowed, + PDFValueError, +) +from playa.pdfdocument import ( + PDFDocument, ) -from playa.pdfparser import PDFParser from playa.pdftypes import dict_value, int_value, list_value, resolve1 from playa.psparser import LIT from playa.utils import parse_rect @@ -173,10 +175,8 @@ def get_pages( caching: bool = True, check_extractable: bool = False, ) -> Iterator["PDFPage"]: - # Create a PDF parser object associated with the file object. - parser = PDFParser(fp) # Create a PDF document object that stores the document structure. - doc = PDFDocument(parser, password=password, caching=caching) + doc = PDFDocument(fp, password=password) # Check if the document allows text extraction. # If not, warn the user and proceed. if not doc.is_extractable: diff --git a/playa/pdfparser.py b/playa/pdfparser.py index 7c42c020..3603c79f 100644 --- a/playa/pdfparser.py +++ b/playa/pdfparser.py @@ -1,5 +1,4 @@ import logging -import re from io import BytesIO from typing import TYPE_CHECKING, BinaryIO, Optional, Union @@ -24,27 +23,6 @@ KEYWORD_OBJ = KWD(b"obj") -def read_header(fp: BinaryIO) -> str: - """Read the PDF header and return the (initial) version string. - - Note that this version can be overridden in the document catalog.""" - try: - hdr = fp.read(8) - except IOError as err: - raise PDFSyntaxError("Failed to read PDF header") from err - if not hdr.startswith(b"%PDF-"): - raise PDFSyntaxError("Expected b'%%PDF-', got %r, is this a PDF?" % hdr) - try: - version = hdr[5:].decode("ascii") - except UnicodeDecodeError as err: - raise PDFSyntaxError( - "Version number in %r contains non-ASCII characters" % hdr - ) from err - if not re.match(r"\d\.\d", version): - raise PDFSyntaxError("Version number in %r is invalid" % hdr) - return version - - # PDFParser stack holds all the base types plus PDFStream, PDFObjRef, and None class PDFParser(PSStackParser[Union[PSKeyword, PDFStream, PDFObjRef, None]]): """PDFParser fetch PDF objects from a file stream. 
@@ -65,7 +43,6 @@ class PDFParser(PSStackParser[Union[PSKeyword, PDFStream, PDFObjRef, None]]): def __init__(self, fp: BinaryIO) -> None: PSStackParser.__init__(self, fp) self.doc: Optional[PDFDocument] = None - self.pdf_version = read_header(fp) self.fallback = False def set_document(self, doc: Union["PDFDocument", None]) -> None: @@ -115,7 +92,7 @@ def do_keyword(self, pos: int, token: PSKeyword) -> None: self.fp.seek(pos) data = bytearray(self.fp.read(objlen)) self.seek(pos + objlen) - while 1: + while True: try: (linepos, line) = self.nextline() except PSEOF: diff --git a/playa/pdftypes.py b/playa/pdftypes.py index c27f5941..a3691a3f 100644 --- a/playa/pdftypes.py +++ b/playa/pdftypes.py @@ -13,7 +13,6 @@ Union, cast, ) -from warnings import warn from playa import settings from playa.ascii85 import ascii85decode, asciihexdecode @@ -74,21 +73,12 @@ def __init__( self, doc: Optional["PDFDocument"], objid: int, - _: Any = _DEFAULT, ) -> None: """Reference to a PDF object. :param doc: The PDF document. :param objid: The object number. - :param _: Unused argument for backwards compatibility. """ - if _ is not _DEFAULT: - warn( - "The third argument of PDFObjRef is unused and will be removed after " - "2024", - DeprecationWarning, - ) - if objid == 0: if settings.STRICT: raise PDFValueError("PDF object id cannot be 0.") diff --git a/playa/psparser.py b/playa/psparser.py index b86ac126..1eb6d990 100755 --- a/playa/psparser.py +++ b/playa/psparser.py @@ -2,9 +2,12 @@ import io import logging import re +from binascii import unhexlify +from collections import deque from typing import ( Any, BinaryIO, + Deque, Dict, Generic, Iterator, @@ -107,6 +110,7 @@ def intern(self, name: PSLiteral.NameType) -> _SymbolT: KEYWORD_ARRAY_END = KWD(b"]") KEYWORD_DICT_BEGIN = KWD(b"<<") KEYWORD_DICT_END = KWD(b">>") +KEYWORD_GT = KWD(b">") def literal_name(x: Any) -> str: @@ -134,17 +138,15 @@ def keyword_name(x: Any) -> Any: return name -EOL = re.compile(rb"[\r\n]") +EOL = b"\r\n" SPC = re.compile(rb"\s") -NONSPC = re.compile(rb"\S") -HEX = re.compile(rb"[0-9a-fA-F]") -END_LITERAL = re.compile(rb"[#/%\[\]()<>{}\s]") -END_HEX_STRING = re.compile(rb"[^\s0-9a-fA-F]") -HEX_PAIR = re.compile(rb"[0-9a-fA-F]{2}|.") -END_NUMBER = re.compile(rb"[^0-9]") -END_KEYWORD = re.compile(rb"[#/%\[\]()<>{}\s]") -END_STRING = re.compile(rb"[()\134]") -OCT_STRING = re.compile(rb"[0-7]") +WHITESPACE = b" \t\n\r\f\v" +NUMBER = b"0123456789" +HEX = NUMBER + b"abcdef" + b"ABCDEF" +NOTLITERAL = b"#/%[]()<>{}" + WHITESPACE +NOTKEYWORD = b"#/%[]()<>{}" + WHITESPACE +NOTSTRING = b"()\\" +OCTAL = b"01234567" ESC_STRING = { b"b": 8, b"t": 9, @@ -161,91 +163,44 @@ def keyword_name(x: Any) -> Any: class PSBaseParser: - """Most basic PostScript parser that performs only tokenization.""" - - BUFSIZ = 4096 - - def __init__(self, fp: BinaryIO) -> None: + def __init__(self, fp: BinaryIO): self.fp = fp - self.eof = False + self._tokens: Deque[Tuple[int, PSBaseParserToken]] = deque() self.seek(0) - def __repr__(self) -> str: - return "<%s: %r, bufpos=%d>" % (self.__class__.__name__, self.fp, self.bufpos) - def flush(self) -> None: pass - def close(self) -> None: - self.flush() - - def tell(self) -> int: - return self.bufpos + self.charpos - - def poll(self, pos: Optional[int] = None, n: int = 80) -> None: - pos0 = self.fp.tell() - if not pos: - pos = self.bufpos + self.charpos - self.fp.seek(pos) - log.debug("poll(%d): %r", pos, self.fp.read(n)) - self.fp.seek(pos0) - def seek(self, pos: int) -> None: - """Seeks the parser to the given position.""" - 
log.debug("seek: %r", pos) self.fp.seek(pos) - # reset the status for nextline() - self.bufpos = pos - self.buf = b"" - self.charpos = 0 - # reset the status for nexttoken() self._parse1 = self._parse_main self._curtoken = b"" self._curtokenpos = 0 - self._tokens: List[Tuple[int, PSBaseParserToken]] = [] - self.eof = False - - def fillbuf(self) -> None: - if self.charpos < len(self.buf): - return - # fetch next chunk. - self.bufpos = self.fp.tell() - self.buf = self.fp.read(self.BUFSIZ) - if not self.buf: - raise PSEOF("Unexpected EOF") - self.charpos = 0 + self._tokens.clear() - def nextline(self) -> Tuple[int, bytes]: - """Fetches a next line that ends either with \\r or \\n.""" - linebuf = b"" - linepos = self.bufpos + self.charpos - eol = False - while 1: - self.fillbuf() - if eol: - c = self.buf[self.charpos : self.charpos + 1] - # handle b'\r\n' - if c == b"\n": - linebuf += c - self.charpos += 1 - break - m = EOL.search(self.buf, self.charpos) - if m: - linebuf += self.buf[self.charpos : m.end(0)] - self.charpos = m.end(0) - if linebuf[-1:] == b"\r": - eol = True - else: - break - else: - linebuf += self.buf[self.charpos :] - self.charpos = len(self.buf) - log.debug("nextline: %r, %r", linepos, linebuf) + def tell(self) -> int: + return self.fp.tell() - return (linepos, linebuf) + def nextline(self) -> Tuple[int, bytes]: + r"""Fetches a next line that ends either with \r, \n, or \r\n.""" + linepos = self.fp.tell() + # readline() is implemented on BinarIO so just use that + # (except that it only accepts \n as a separator) + line_or_lines = self.fp.readline() + if line_or_lines == b"": + raise PSEOF + first, sep, rest = line_or_lines.partition(b"\r") + if len(rest) == 0: + return (linepos, line_or_lines) + elif rest != b"\n": + self.fp.seek(linepos + len(first) + 1) + return (linepos, first + sep) + else: + self.fp.seek(linepos + len(first) + 2) + return (linepos, first + b"\r\n") def revreadlines(self) -> Iterator[bytes]: - """Fetches a next line backword. + """Fetches a next line backwards. This is used to locate the trailers at the end of a file. """ @@ -253,271 +208,305 @@ def revreadlines(self) -> Iterator[bytes]: pos = self.fp.tell() buf = b"" while pos > 0: - prevpos = pos - pos = max(0, pos - self.BUFSIZ) + # NOTE: This can obviously be optimized to use regular + # expressions on the (known to exist) buffer in + # self.fp... 
+            pos -= 1
             self.fp.seek(pos)
-            s = self.fp.read(prevpos - pos)
-            if not s:
+            c = self.fp.read(1)
+            if c in b"\r\n":
+                yield buf
+                buf = c
+                if c == b"\n" and pos > 0:
+                    self.fp.seek(pos - 1)
+                    cc = self.fp.read(1)
+                    if cc == b"\r":
+                        pos -= 1
+                        buf = cc + buf
+            else:
+                buf = c + buf
+        yield buf
+
+    def __iter__(self):
+        return self
+
+    def __next__(self) -> Tuple[int, PSBaseParserToken]:
+        while True:
+            c = self._parse1()
+            # print(c, self._curtoken, self._parse1)
+            if self._tokens or c == b"":
                 break
-            while 1:
-                n = max(s.rfind(b"\r"), s.rfind(b"\n"))
-                if n == -1:
-                    buf = s + buf
-                    break
-                yield s[n:] + buf
-                s = s[:n]
-                buf = b""
-
-    def _parse_main(self, s: bytes, i: int) -> int:
-        m = NONSPC.search(s, i)
-        if not m:
-            return len(s)
-        j = m.start(0)
-        c = s[j : j + 1]
-        self._curtokenpos = self.bufpos + j
+        if not self._tokens:
+            raise StopIteration
+        return self._tokens.popleft()
+
+    def nexttoken(self) -> Tuple[int, PSBaseParserToken]:
+        try:
+            return self.__next__()
+        except StopIteration:
+            raise PSEOF
+
+    def _parse_main(self):
+        """Initial/default state for the lexer."""
+        c = self.fp.read(1)
+        # note that b"" (EOF) is in everything, which is fine
+        if c in WHITESPACE:
+            return c
+        self._curtokenpos = self.fp.tell() - 1
         if c == b"%":
             self._curtoken = b"%"
             self._parse1 = self._parse_comment
-            return j + 1
         elif c == b"/":
             self._curtoken = b""
             self._parse1 = self._parse_literal
-            return j + 1
-        elif c in b"-+" or c.isdigit():
+        elif c in b"-+" or c in NUMBER:
             self._curtoken = c
             self._parse1 = self._parse_number
-            return j + 1
         elif c == b".":
             self._curtoken = c
             self._parse1 = self._parse_float
-            return j + 1
         elif c.isalpha():
             self._curtoken = c
             self._parse1 = self._parse_keyword
-            return j + 1
         elif c == b"(":
             self._curtoken = b""
             self.paren = 1
             self._parse1 = self._parse_string
-            return j + 1
         elif c == b"<":
             self._curtoken = b""
             self._parse1 = self._parse_wopen
-            return j + 1
         elif c == b">":
             self._curtoken = b""
             self._parse1 = self._parse_wclose
-            return j + 1
         elif c == b"\x00":
-            return j + 1
+            pass
         else:
             self._add_token(KWD(c))
-            return j + 1
+        return c

     def _add_token(self, obj: PSBaseParserToken) -> None:
+        """Add a successfully parsed token."""
         self._tokens.append((self._curtokenpos, obj))

-    def _parse_comment(self, s: bytes, i: int) -> int:
-        m = EOL.search(s, i)
-        if not m:
-            self._curtoken += s[i:]
-            return len(s)
-        j = m.start(0)
-        self._curtoken += s[i:j]
-        self._parse1 = self._parse_main
-        # We ignore comments.
-        # self._tokens.append(self._curtoken)
-        return j
-
-    def _parse_literal(self, s: bytes, i: int) -> int:
-        m = END_LITERAL.search(s, i)
-        if not m:
-            self._curtoken += s[i:]
-            return len(s)
-        j = m.start(0)
-        self._curtoken += s[i:j]
-        c = s[j : j + 1]
+    def _parse_comment(self):
+        """Comment state for the lexer."""
+        c = self.fp.read(1)
+        if c in EOL:  # this includes b"", i.e. EOF
+            self._parse1 = self._parse_main
+            # We ignore comments.
+ # self._tokens.append(self._curtoken) + else: + self._curtoken += c + return c + + def _parse_literal(self): + """Literal (keyword) state for the lexer.""" + c = self.fp.read(1) if c == b"#": self.hex = b"" self._parse1 = self._parse_literal_hex - return j + 1 - try: - name: Union[str, bytes] = str(self._curtoken, "utf-8") - except Exception: - name = self._curtoken - self._add_token(LIT(name)) - self._parse1 = self._parse_main - return j + elif c in NOTLITERAL: + if c: + self.fp.seek(-1, io.SEEK_CUR) + try: + self._add_token(LIT(self._curtoken.decode("utf-8"))) + except UnicodeDecodeError: + self._add_token(LIT(self._curtoken)) + self._parse1 = self._parse_main + else: + self._curtoken += c + return c - def _parse_literal_hex(self, s: bytes, i: int) -> int: - c = s[i : i + 1] - if HEX.match(c) and len(self.hex) < 2: + def _parse_literal_hex(self): + """State for escaped hex characters in literal names""" + # Consume a hex digit only if we can ... consume a hex digit + c = self.fp.read(1) + if c and c in HEX and len(self.hex) < 2: self.hex += c - return i + 1 - if self.hex: - self._curtoken += bytes((int(self.hex, 16),)) - self._parse1 = self._parse_literal - return i - - def _parse_number(self, s: bytes, i: int) -> int: - m = END_NUMBER.search(s, i) - if not m: - self._curtoken += s[i:] - return len(s) - j = m.start(0) - self._curtoken += s[i:j] - c = s[j : j + 1] - if c == b".": + else: + if c: + self.fp.seek(-1, io.SEEK_CUR) + if self.hex: + self._curtoken += bytes((int(self.hex, 16),)) + self._parse1 = self._parse_literal + return c + + def _parse_number(self): + """State for numeric objects.""" + c = self.fp.read(1) + if c and c in NUMBER: + self._curtoken += c + elif c == b".": self._curtoken += c self._parse1 = self._parse_float - return j + 1 - try: - self._add_token(int(self._curtoken)) - except ValueError: - pass - self._parse1 = self._parse_main - return j - - def _parse_float(self, s: bytes, i: int) -> int: - m = END_NUMBER.search(s, i) - if not m: - self._curtoken += s[i:] - return len(s) - j = m.start(0) - self._curtoken += s[i:j] - try: - self._add_token(float(self._curtoken)) - except ValueError: - pass - self._parse1 = self._parse_main - return j - - def _parse_keyword(self, s: bytes, i: int) -> int: - m = END_KEYWORD.search(s, i) - if m: - j = m.start(0) - self._curtoken += s[i:j] else: - self._curtoken += s[i:] - return len(s) - if self._curtoken == b"true": - token: Union[bool, PSKeyword] = True - elif self._curtoken == b"false": - token = False + if c: + self.fp.seek(-1, io.SEEK_CUR) + try: + self._add_token(int(self._curtoken)) + except ValueError: + log.warning("Invalid int literal: %r", self._curtoken) + self._parse1 = self._parse_main + return c + + def _parse_float(self): + """State for fractional part of numeric objects.""" + c = self.fp.read(1) + # b"" is in everything so we have to add an extra check + if not c or c not in NUMBER: + if c: + self.fp.seek(-1, io.SEEK_CUR) + try: + self._add_token(float(self._curtoken)) + except ValueError: + log.warning("Invalid float literal: %r", self._curtoken) + self._parse1 = self._parse_main else: - token = KWD(self._curtoken) - self._add_token(token) - self._parse1 = self._parse_main - return j - - def _parse_string(self, s: bytes, i: int) -> int: - m = END_STRING.search(s, i) - if not m: - self._curtoken += s[i:] - return len(s) - j = m.start(0) - self._curtoken += s[i:j] - c = s[j : j + 1] - if c == b"\\": - self.oct = b"" - self._parse1 = self._parse_string_1 - return j + 1 - if c == b"(": - self.paren += 1 
self._curtoken += c - return j + 1 - if c == b")": - self.paren -= 1 - if self.paren: - # WTF, they said balanced parens need no special treatment. + return c + + def _parse_keyword(self): + """State for keywords.""" + c = self.fp.read(1) + if c in NOTKEYWORD: # includes EOF + if c: + self.fp.seek(-1, io.SEEK_CUR) + if self._curtoken == b"true": + self._add_token(True) + elif self._curtoken == b"false": + self._add_token(False) + else: + self._add_token(KWD(self._curtoken)) + self._parse1 = self._parse_main + else: + self._curtoken += c + return c + + def _parse_string(self): + """State for string objects.""" + c = self.fp.read(1) + if c and c in NOTSTRING: # does not include EOF + if c == b"\\": + self._parse1 = self._parse_string_esc + return c + elif c == b"(": + self.paren += 1 self._curtoken += c - return j + 1 - self._add_token(self._curtoken) - self._parse1 = self._parse_main - return j + 1 - - def _parse_string_1(self, s: bytes, i: int) -> int: - """Parse literal strings + return c + elif c == b")": + self.paren -= 1 + if self.paren: + self._curtoken += c + return c + # We saw the last parenthesis and fell through (it will be + # consumed, but not added to self._curtoken) + self._add_token(self._curtoken) + self._parse1 = self._parse_main + elif c == b"\r": + # PDF 1.7 page 15: An end-of-line marker appearing within + # a literal string without a preceding REVERSE SOLIDUS + # shall be treated as a byte value of (0Ah), irrespective + # of whether the end-of-line marker was a CARRIAGE RETURN + # (0Dh), a LINE FEED (0Ah), or both. + cc = self.fp.read(1) + # Put it back if it isn't \n + if cc and cc != b"\n": + self.fp.seek(-1, io.SEEK_CUR) + self._curtoken += b"\n" + else: + self._curtoken += c + return c + + def _parse_string_esc(self): + """State for escapes in literal strings. We have seen a + backslash and nothing else.""" + c = self.fp.read(1) + if c and c in OCTAL: # exclude EOF + self.oct = c + self._parse1 = self._parse_string_octal + return c + elif c and c in ESC_STRING: + self._curtoken += bytes((ESC_STRING[c],)) + elif c == b"\n": # Skip newline after backslash + pass + elif c == b"\r": # Also skip CRLF after + cc = self.fp.read(1) + # Put it back if it isn't \n + if cc and cc != b"\n": + self.fp.seek(-1, io.SEEK_CUR) + elif c == b"": + log.warning("EOF inside escape %r", self._curtoken) + else: + log.warning("Unrecognized escape %r", c) + self._curtoken += c + self._parse1 = self._parse_string + return c - PDF Reference 3.2.3 - """ - c = s[i : i + 1] - if OCT_STRING.match(c) and len(self.oct) < 3: + def _parse_string_octal(self): + """State for an octal escape.""" + c = self.fp.read(1) + if c and c in OCTAL: # exclude EOF self.oct += c - return i + 1 - - elif self.oct: + done = len(self.oct) >= 3 # it can't be > though + else: + if c: + self.fp.seek(-1, io.SEEK_CUR) + else: + log.warning("EOF in octal escape %r", self._curtoken) + done = True + if done: chrcode = int(self.oct, 8) - assert chrcode < 256, "Invalid octal %s (%d)" % (repr(self.oct), chrcode) - self._curtoken += bytes((chrcode,)) + if chrcode >= 256: + # PDF1.7 p.16: "high-order overflow shall be ignored." 
+ log.warning("Invalid octal %s (%d)", repr(self.oct), chrcode) + else: + self._curtoken += bytes((chrcode,)) + # Back to normal string parsing self._parse1 = self._parse_string - return i - - elif c in ESC_STRING: - self._curtoken += bytes((ESC_STRING[c],)) + return c - elif c == b"\r" and len(s) > i + 1 and s[i + 1 : i + 2] == b"\n": - # If current and next character is \r\n skip both because enters - # after a \ are ignored - i += 1 - - # default action - self._parse1 = self._parse_string - return i + 1 - - def _parse_wopen(self, s: bytes, i: int) -> int: - c = s[i : i + 1] + def _parse_wopen(self): + """State for start of dictionary or hex string.""" + c = self.fp.read(1) if c == b"<": self._add_token(KEYWORD_DICT_BEGIN) self._parse1 = self._parse_main - i += 1 else: + if c: + self.fp.seek(-1, io.SEEK_CUR) self._parse1 = self._parse_hexstring - return i + return c - def _parse_wclose(self, s: bytes, i: int) -> int: - c = s[i : i + 1] + def _parse_wclose(self): + """State for end of dictionary (accessed from initial state only)""" + c = self.fp.read(1) if c == b">": self._add_token(KEYWORD_DICT_END) - i += 1 - self._parse1 = self._parse_main - return i - - def _parse_hexstring(self, s: bytes, i: int) -> int: - m = END_HEX_STRING.search(s, i) - if not m: - self._curtoken += s[i:] - return len(s) - j = m.start(0) - self._curtoken += s[i:j] - token = HEX_PAIR.sub( - lambda m: bytes((int(m.group(0), 16),)), - SPC.sub(b"", self._curtoken), - ) - self._add_token(token) + else: + # Assuming this is a keyword (which means nothing) + self._add_token(KEYWORD_GT) + if c: + self.fp.seek(-1, io.SEEK_CUR) self._parse1 = self._parse_main - return j - def nexttoken(self) -> Tuple[int, PSBaseParserToken]: - if self.eof: - # It's not really unexpected, come on now... - raise PSEOF("Unexpected EOF") - while not self._tokens: - try: - self.fillbuf() - self.charpos = self._parse1(self.buf, self.charpos) - except PSEOF: - # If we hit EOF in the middle of a token, try to parse - # it by tacking on whitespace, and delay raising PSEOF - # until next time around - self.charpos = self._parse1(b"\n", 0) - self.eof = True - # Oh, so there wasn't actually a token there? OK. - if not self._tokens: - raise - token = self._tokens.pop(0) - log.debug("nexttoken: %r", token) - return token + def _parse_hexstring(self): + """State for parsing hexadecimal literal strings.""" + c = self.fp.read(1) + if not c: + log.warning("EOF in hex string %r", self._curtoken) + elif c in WHITESPACE: + pass + elif c in HEX: + self._curtoken += c + elif c == b">": + if len(self._curtoken) % 2 == 1: + self._curtoken += b"0" + token = unhexlify(self._curtoken) + self._add_token(token) + self._parse1 = self._parse_main + else: + log.warning("unexpected character %r in hex string %r", c, self._curtoken) + return c # Stack slots may by occupied by any of: @@ -532,8 +521,8 @@ def nexttoken(self) -> Tuple[int, PSBaseParserToken]: class PSStackParser(PSBaseParser, Generic[ExtraT]): - def __init__(self, fp: BinaryIO) -> None: - PSBaseParser.__init__(self, fp) + def __init__(self, reader: BinaryIO) -> None: + PSBaseParser.__init__(self, reader) self.reset() def reset(self) -> None: @@ -654,7 +643,7 @@ def nextobject(self) -> PSStackEntry[ExtraT]: if self.context: continue else: - self.flush() + self.flush() # FIXME: what does it do? 
        obj = self.results.pop(0)
         try:
             log.debug("nextobject: %r", obj)
diff --git a/playa/utils.py b/playa/utils.py
index fa98312e..418887eb 100644
--- a/playa/utils.py
+++ b/playa/utils.py
@@ -74,11 +74,10 @@ def make_compat_bytes(in_str: str) -> bytes:
 def make_compat_str(o: object) -> str:
     """Converts everything to string, if bytes guessing the encoding."""
     if isinstance(o, bytes):
-        enc = charset_normalizer.detect(o)
-        try:
-            return o.decode(enc["encoding"])
-        except UnicodeDecodeError:
+        result = charset_normalizer.from_bytes(o).best()
+        if result is None:
             return str(o)
+        return str(result)
     else:
         return str(o)

diff --git a/samples/contrib/PSC_Station.pdf b/samples/contrib/PSC_Station.pdf
new file mode 100644
index 00000000..326888dc
Binary files /dev/null and b/samples/contrib/PSC_Station.pdf differ
diff --git a/tests/benchmark_parser.py b/tests/benchmark_parser.py
new file mode 100644
index 00000000..a4ba03c0
--- /dev/null
+++ b/tests/benchmark_parser.py
@@ -0,0 +1,372 @@
+import logging
+import tempfile
+import time
+from io import BytesIO
+from pathlib import Path
+
+log = logging.getLogger(Path(__file__).stem)
+TESTDIR = Path(__file__).parent.parent / "samples"
+DATA = rb"""
+1 0 5 30 6 46 9 76 10 93 13 123 14 139 17 169 18 202 19 234
+20 366 21 501 22 636 23 771 26 906 27 949 28 993 3 1037 24 1080 34 1157
+36 1249 7 1292 33 1336 38 1401 40 1493 11 1536 37 1580 42 1645 44 1737 41 1780
+46 1833 15 1925 45 1969 48 2034 49 2439 50 2877 52 3323 54 3602 56 3883 30 4119
+29 4244 31 4369 32 4493 57 4564 16 4621 12 4679 8 4749 4 4817 58 4872 59 5028
+60 5151 61 5213 62 5233
+<< /S /GoTo /D (section.1) >>
+(First section)
+<< /S /GoTo /D (section.2) >>
+(Second section)
+<< /S /GoTo /D (section.3) >>
+(Third section)
+<< /S /GoTo /D (section.4) >>
+(Heading on Level 1 \(section\))
+<< /S /GoTo /D [19 0 R /Fit] >>
+<<
+/Type /Page
+/Contents 25 0 R
+/Resources 24 0 R
+/MediaBox [0 0 612 792]
+/Parent 32 0 R
+/Annots [ 20 0 R 21 0 R 22 0 R 23 0 R ]
+>>
+<<
+/Type /Annot
+/Subtype /Link
+/Border[0 0 1]/H/I/C[1 0 0]
+/Rect [132.772 634.321 212.206 643.232]
+/A << /S /GoTo /D (section.1) >>
+>>
+<<
+/Type /Annot
+/Subtype /Link
+/Border[0 0 1]/H/I/C[1 0 0]
+/Rect [132.772 612.403 223.288 621.314]
+/A << /S /GoTo /D (section.2) >>
+>>
+<<
+/Type /Annot
+/Subtype /Link
+/Border[0 0 1]/H/I/C[1 0 0]
+/Rect [132.772 590.486 216.722 599.397]
+/A << /S /GoTo /D (section.3) >>
+>>
+<<
+/Type /Annot
+/Subtype /Link
+/Border[0 0 1]/H/I/C[1 0 0]
+/Rect [132.772 566.077 294.043 578.032]
+/A << /S /GoTo /D (section.4) >>
+>>
+<<
+/D [19 0 R /XYZ 132.768 705.06 null]
+>>
+<<
+/D [19 0 R /XYZ 133.768 667.198 null]
+>>
+<<
+/D [19 0 R /XYZ 133.768 675.168 null]
+>>
+<<
+/D [19 0 R /XYZ 133.768 552.06 null]
+>>
+<<
+/Font << /F26 29 0 R /F27 30 0 R /F8 31 0 R >>
+/ProcSet [ /PDF /Text ]
+>>
+<<
+/Type /Page
+/Contents 35 0 R
+/Resources 33 0 R
+/MediaBox [0 0 612 792]
+/Parent 32 0 R
+>>
+<<
+/D [34 0 R /XYZ 132.768 705.06 null]
+>>
+<<
+/D [34 0 R /XYZ 133.768 667.198 null]
+>>
+<<
+/Font << /F26 29 0 R /F8 31 0 R >>
+/ProcSet [ /PDF /Text ]
+>>
+<<
+/Type /Page
+/Contents 39 0 R
+/Resources 37 0 R
+/MediaBox [0 0 612 792]
+/Parent 32 0 R
+>>
+<<
+/D [38 0 R /XYZ 132.768 705.06 null]
+>>
+<<
+/D [38 0 R /XYZ 133.768 667.198 null]
+>>
+<<
+/Font << /F26 29 0 R /F8 31 0 R >>
+/ProcSet [ /PDF /Text ]
+>>
+<<
+/Type /Page
+/Contents 43 0 R
+/Resources 41 0 R
+/MediaBox [0 0 612 792]
+/Parent 32 0 R
+>>
+<<
+/D [42 0 R /XYZ 132.768 705.06 null]
+>>
+<<
+/Font << /F8 31 0 R >>
+/ProcSet [ /PDF /Text ]
+>>
+<<
+/Type /Page
+/Contents 47 0 R +/Resources 45 0 R +/MediaBox [0 0 612 792] +/Parent 32 0 R +>> +<< +/D [46 0 R /XYZ 133.768 667.198 null] +>> +<< +/Font << /F26 29 0 R /F8 31 0 R >> +/ProcSet [ /PDF /Text ] +>> +[277.8 500 500 500 500 500 500 500 500 500 500 500 277.8 277.8 277.8 777.8 472.2 472.2 777.8 750 708.3 722.2 763.9 680.6 652.8 784.7 750 361.1 513.9 777.8 625 916.7 750 777.8 680.6 777.8 736.1 555.6 722.2 750 750 1027.8 750 750 611.1 277.8 500 277.8 500 277.8 277.8 500 555.6 444.4 555.6 444.4 305.6 500 555.6 277.8 305.6 527.8 277.8 833.3 555.6 500 555.6 527.8 391.7 394.4 388.9 555.6 527.8 722.2 527.8] +[447.2 447.2 575 894.4 319.4 383.3 319.4 575 575 575 575 575 575 575 575 575 575 575 319.4 319.4 350 894.4 543.1 543.1 894.4 869.4 818.1 830.6 881.9 755.6 723.6 904.2 900 436.1 594.4 901.4 691.7 1091.7 900 863.9 786.1 863.9 862.5 638.9 800 884.7 869.4 1188.9 869.4 869.4 702.8 319.4 602.8 319.4 575 319.4 319.4 559 638.9 511.1 638.9 527.1 351.4 575 638.9 319.4 351.4 606.9 319.4 958.3 638.9 575 638.9 606.9 473.6 453.6 447.2 638.9 606.9] +[437.5 437.5 562.5 875 312.5 375 312.5 562.5 562.5 562.5 562.5 562.5 562.5 562.5 562.5 562.5 562.5 562.5 312.5 312.5 342.6 875 531.2 531.2 875 849.5 799.8 812.5 862.3 738.4 707.2 884.3 879.6 419 581 880.8 675.9 1067.1 879.6 844.9 768.5 844.9 839.1 625 782.4 864.6 849.5 1162 849.5 849.5 687.5 312.5 581 312.5 562.5 312.5 312.5 546.9 625 500 625 513.3 343.7 562.5 625 312.5 343.7 593.7 312.5 937.5 625 562.5 625 593.7 459.5 443.8 437.5 625 593.7] +<< +/Type /FontDescriptor +/FontName /ZSHFTL+CMBX10 +/Flags 4 +/FontBBox [-56 -250 1164 750] +/Ascent 694 +/CapHeight 686 +/Descent -194 +/ItalicAngle 0 +/StemV 114 +/XHeight 444 +/CharSet (/F/H/L/S/T/a/c/d/e/four/g/h/i/l/n/o/one/parenleft/parenright/r/s/t/three/two/v) +/FontFile 51 0 R +>> +<< +/Type /FontDescriptor +/FontName /NJBTSJ+CMBX12 +/Flags 4 +/FontBBox [-53 -251 1139 750] +/Ascent 694 +/CapHeight 686 +/Descent -194 +/ItalicAngle 0 +/StemV 109 +/XHeight 444 +/CharSet (/C/F/H/L/S/T/a/c/d/e/four/g/h/i/l/n/o/one/parenleft/parenright/r/s/t/three/two/v) +/FontFile 53 0 R +>> +<< +/Type /FontDescriptor +/FontName /PQDURT+CMR10 +/Flags 4 +/FontBBox [-40 -250 1009 750] +/Ascent 694 +/CapHeight 683 +/Descent -194 +/ItalicAngle 0 +/StemV 69 +/XHeight 431 +/CharSet (/M/S/e/h/i/m/o/one/period/r/t/two/v/x) +/FontFile 55 0 R +>> +<< +/Type /Font +/Subtype /Type1 +/BaseFont /ZSHFTL+CMBX10 +/FontDescriptor 52 0 R +/FirstChar 40 +/LastChar 118 +/Widths 49 0 R +>> +<< +/Type /Font +/Subtype /Type1 +/BaseFont /NJBTSJ+CMBX12 +/FontDescriptor 54 0 R +/FirstChar 40 +/LastChar 118 +/Widths 50 0 R +>> +<< +/Type /Font +/Subtype /Type1 +/BaseFont /PQDURT+CMR10 +/FontDescriptor 56 0 R +/FirstChar 46 +/LastChar 120 +/Widths 48 0 R +>> +<< +/Type /Pages +/Count 5 +/Kids [19 0 R 34 0 R 38 0 R 42 0 R 46 0 R] +>> +<< +/Type /Outlines +/First 4 0 R +/Last 16 0 R +/Count 4 +>> +<< +/Title 17 0 R +/A 14 0 R +/Parent 57 0 R +/Prev 12 0 R +>> +<< +/Title 13 0 R +/A 10 0 R +/Parent 57 0 R +/Prev 8 0 R +/Next 16 0 R +>> +<< +/Title 9 0 R +/A 6 0 R +/Parent 57 0 R +/Prev 4 0 R +/Next 12 0 R +>> +<< +/Title 5 0 R +/A 1 0 R +/Parent 57 0 R +/Next 8 0 R +>> +<< +/Names [(Doc-Start) 27 0 R (page.1) 40 0 R (page.2) 44 0 R (page.iii) 26 0 R (page.iv) 36 0 R (section*.1) 28 0 R] +/Limits [(Doc-Start) (section*.1)] +>> +<< +/Names [(section.1) 3 0 R (section.2) 7 0 R (section.3) 11 0 R (section.4) 15 0 R] +/Limits [(section.1) (section.4)] +>> +<< +/Kids [58 0 R 59 0 R] +/Limits [(Doc-Start) (section.4)] +>> +<< +/Dests 60 0 R +>> +<< +/Type /Catalog +/Pages 32 0 
R +/Outlines 57 0 R +/Names 61 0 R +/PageMode/UseOutlines/PageLabels<>2<>4<>]>> +/OpenAction 18 0 R +>> +""" + + +def bench_playa(): + from playa.converter import PDFPageAggregator + from playa.pdfdocument import PDFDocument + from playa.pdfinterp import PDFPageInterpreter, PDFResourceManager + from playa.pdfpage import PDFPage + from playa.psparser import PSBaseParser + + runs = 100 + start = time.time() + parser = PSBaseParser(BytesIO(DATA * runs)) + _ = list(parser) + print( + "PLAYA Parser (BytesIO): %fms / run" % ((time.time() - start) / runs * 1000), + ) + with tempfile.NamedTemporaryFile() as tf: + runs = 100 + with open(tf.name, "wb") as outfh: + outfh.write(DATA * runs) + with open(tf.name, "rb") as infh: + start = time.time() + parser = PSBaseParser(infh) + _ = list(parser) + print( + "PLAYA Parser (BinaryIO): %fms / run" + % ((time.time() - start) / runs * 1000), + ) + + runs = 20 + start = time.time() + for _ in range(runs): + with open(TESTDIR / "contrib" / "pagelabels.pdf", "rb") as infh: + rsrc = PDFResourceManager() + agg = PDFPageAggregator(rsrc, pageno=1) + interp = PDFPageInterpreter(rsrc, agg) + doc = PDFDocument(infh) + page = next(PDFPage.create_pages(doc)) + interp.process_page(page) + print( + "PLAYA Interpreter: %dms / run" % ((time.time() - start) / runs * 1000), + ) + + +def bench_pdfminer(): + from pdfminer.converter import PDFPageAggregator + from pdfminer.pdfdocument import PDFDocument + from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager + from pdfminer.pdfpage import PDFPage + from pdfminer.pdfparser import PDFParser + from pdfminer.psparser import PSEOF, PSBaseParser + + runs = 100 + start = time.time() + parser = PSBaseParser(BytesIO(DATA * runs)) + while True: + try: + _ = parser.nexttoken() + except PSEOF: + break + print( + "pdfminer.six Parser (BytesIO): %fms / run" + % ((time.time() - start) / runs * 1000), + ) + with tempfile.NamedTemporaryFile() as tf: + runs = 100 + with open(tf.name, "wb") as outfh: + outfh.write(DATA * runs) + with open(tf.name, "rb") as infh: + parser = PSBaseParser(infh) + while True: + try: + _ = parser.nexttoken() + except PSEOF: + break + print( + "pdfminer.six Parser (BinaryIO): %fms / run" + % ((time.time() - start) / runs * 1000), + ) + runs = 20 + start = time.time() + for _ in range(runs): + with open(TESTDIR / "contrib" / "pagelabels.pdf", "rb") as infh: + rsrc = PDFResourceManager() + agg = PDFPageAggregator(rsrc, pageno=1) + interp = PDFPageInterpreter(rsrc, agg) + doc = PDFDocument(PDFParser(infh)) + page = next(PDFPage.create_pages(doc)) + interp.process_page(page) + print( + "pdfminer.six Interpreter: %dms / run" % ((time.time() - start) / runs * 1000), + ) + + +if __name__ == "__main__": + import sys + + if len(sys.argv) < 2 or sys.argv[1] == "pdfminer": + bench_pdfminer() + if len(sys.argv) < 2 or sys.argv[1] == "playa": + bench_playa() diff --git a/tests/test_open.py b/tests/test_open.py index 8a6642ae..df57513f 100644 --- a/tests/test_open.py +++ b/tests/test_open.py @@ -7,6 +7,11 @@ import pytest import playa +from playa.converter import PDFPageAggregator + +# These APIs will go away soon +from playa.pdfinterp import PDFPageInterpreter, PDFResourceManager +from playa.pdfpage import PDFPage TESTDIR = Path(__file__).parent.parent / "samples" ALLPDFS = TESTDIR.glob("**/*.pdf") @@ -31,3 +36,20 @@ def test_open(path: Path): pass assert pdf.parser.fp.closed assert pdf.parser.doc is None + + +def test_inline_data(): + # No, there's no easy way to unit test PDFContentParser directly. 
+ # The necessary mocking would be useless considering that I will + # shortly demolish these redundant and confusing APIs. + with playa.open(TESTDIR / "contrib" / "issue-1008-inline-ascii85.pdf") as doc: + # Seriously WTF is all this... just to get a page... OMG + rsrc = PDFResourceManager() + agg = PDFPageAggregator(rsrc, pageno=1) + interp = PDFPageInterpreter(rsrc, agg) + page = next(PDFPage.create_pages(doc)) + interp.process_page(page) + + +if __name__ == "__main__": + test_open(TESTDIR / "simple5.pdf") diff --git a/tests/test_pdfdocument.py b/tests/test_pdfdocument.py index 3ec2ea8e..e82f576f 100644 --- a/tests/test_pdfdocument.py +++ b/tests/test_pdfdocument.py @@ -1,3 +1,38 @@ """ Test the classes in pdfdocument.py """ + +from io import BytesIO +from pathlib import Path + +import pytest + +import playa +import playa.settings +from playa.exceptions import PDFSyntaxError +from playa.pdfdocument import read_header + +playa.settings.STRICT = True + +TESTDIR = Path(__file__).parent.parent / "samples" + + +def test_read_header(): + """Verify reading header.""" + with pytest.raises(PDFSyntaxError): + read_header(BytesIO(b"NOT-A-PDF!!!")) + with pytest.raises(PDFSyntaxError): + read_header(BytesIO(b"%PDF")) + with pytest.raises(PDFSyntaxError) as e: + read_header(BytesIO("%PDF-ÅÖÜ".encode("latin1"))) + assert "ASCII" in str(e) + with pytest.raises(PDFSyntaxError) as e: + read_header(BytesIO(b"%PDF-OMG")) + assert "invalid" in str(e) + assert read_header(BytesIO(b"%PDF-1.7")) == "1.7" + + +def test_page_labels(): + with playa.open(TESTDIR / "contrib" / "pagelabels.pdf") as doc: + labels = [label for _, label in zip(range(10), doc.get_page_labels())] + assert labels == ["iii", "iv", "1", "2", "1", "2", "3", "4", "5", "6"] diff --git a/tests/test_pdfparser.py b/tests/test_pdfparser.py index 4b31bd99..b4d9df97 100644 --- a/tests/test_pdfparser.py +++ b/tests/test_pdfparser.py @@ -4,22 +4,202 @@ from io import BytesIO -import pytest - -from playa.pdfparser import read_header, PDFParser -from playa.exceptions import PDFSyntaxError - - -def test_read_header(): - """Verify reading header.""" - with pytest.raises(PDFSyntaxError): - read_header(BytesIO(b"NOT-A-PDF!!!")) - with pytest.raises(PDFSyntaxError): - read_header(BytesIO(b"%PDF")) - with pytest.raises(PDFSyntaxError) as e: - read_header(BytesIO("%PDF-ÅÖÜ".encode("latin1"))) - assert "ASCII" in str(e) - with pytest.raises(PDFSyntaxError) as e: - read_header(BytesIO(b"%PDF-OMG")) - assert "invalid" in str(e) - assert read_header(BytesIO(b"%PDF-1.7")) == "1.7" +from playa.exceptions import PSEOF +from playa.pdfinterp import get_inline_data +from playa.psparser import ( + KEYWORD_DICT_BEGIN, + KEYWORD_DICT_END, + KWD, + LIT, + PSBaseParser, +) + +TESTDATA = b""" +ugh +foo\r +bar\rbaz +quxx +bog""" +EXPECTED = [ + (0, b"\n"), + (1, b"ugh\n"), + (5, b"foo\r\n"), + (10, b"bar\r"), + (14, b"baz\n"), + (18, b"quxx\n"), + (23, b"bog"), +] + + +def test_nextline(): + """Verify that we replicate the old nextline method.""" + parser = PSBaseParser(BytesIO(TESTDATA)) + lines = [] + while True: + try: + linepos, line = parser.nextline() + except PSEOF: + break + lines.append((linepos, line)) + assert lines == EXPECTED + + +def test_revreadlines(): + """Verify that we replicate the old revreadlines method.""" + parser = PSBaseParser(BytesIO(TESTDATA)) + lines = list(parser.revreadlines()) + assert lines == list(reversed([line for pos, line in EXPECTED])) + + +SIMPLE1 = b"""1 0 obj +<< + /Type /Catalog + /Outlines 2 0 R + /Pages 3 0 R +>> +endobj +""" 
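+# The token stream (positions stripped) that the lexer is expected to
+# produce for SIMPLE1 above; see test_new_parser below.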
+SIMPLETOK = [ + 1, + 0, + KWD(b"obj"), + KEYWORD_DICT_BEGIN, + LIT("Type"), + LIT("Catalog"), + LIT("Outlines"), + 2, + 0, + KWD(b"R"), + LIT("Pages"), + 3, + 0, + KWD(b"R"), + KEYWORD_DICT_END, + KWD(b"endobj"), +] + + +def test_new_parser(): + # Do a lot of them to make sure buffering works correctly + parser = PSBaseParser(BytesIO(SIMPLE1 * 100)) + tokens = [tok for pos, tok in list(parser)] + assert tokens == SIMPLETOK * 100 + + +def test_new_parser_eof(): + # Make sure we get a keyword at eof + parser = PSBaseParser(BytesIO(SIMPLE1[:-1])) + tokens = [tok for pos, tok in list(parser)] + assert tokens == SIMPLETOK + + +PAGE17 = b""" + /A;Name_With-Various***Characters? + /lime#20Green + /paired#28#29parentheses +""" + + +def test_new_parser1(): + parser = PSBaseParser(BytesIO(b"123.456")) + assert list(parser) == [(0, 123.456)] + parser = PSBaseParser(BytesIO(b"+.013")) + assert list(parser) == [(0, 0.013)] + parser = PSBaseParser(BytesIO(b"123")) + assert list(parser) == [(0, 123)] + parser = PSBaseParser(BytesIO(b"true false")) + assert list(parser) == [(0, True), (5, False)] + parser = PSBaseParser(BytesIO(b"(foobie bletch)")) + assert list(parser) == [(0, b"foobie bletch")] + parser = PSBaseParser(BytesIO(b"(foo")) # Invalid string + assert list(parser) == [] + + +def test_new_parser_names(): + # Examples from PDF 1.7 page 17 + parser = PSBaseParser(BytesIO(PAGE17)) + tokens = list(parser) + assert tokens == [ + (5, LIT("A;Name_With-Various***Characters?")), + (44, LIT("lime Green")), + (62, LIT("paired()parentheses")), + ] + + +def test_new_parser_strings(): + parser = PSBaseParser( + BytesIO( + rb"( Strings may contain balanced parentheses ( ) and " + rb"special characters ( * ! & } ^ % and so on ) . )" + ) + ) + assert list(parser) == [ + ( + 0, + rb" Strings may contain balanced parentheses ( ) and " + rb"special characters ( * ! & } ^ % and so on ) . ", + ) + ] + parser = PSBaseParser(BytesIO(b"()")) + assert list(parser) == [(0, b"")] + parser = PSBaseParser( + BytesIO( + rb"""( These \ +two strings \ +are the same . ) + """ + ) + ) + assert list(parser) == [(0, b" These two strings are the same . ")] + parser = PSBaseParser(BytesIO(b"(foo\rbar)")) + assert list(parser) == [(0, b"foo\nbar")] + parser = PSBaseParser(BytesIO(b"(foo\r)")) + assert list(parser) == [(0, b"foo\n")] + parser = PSBaseParser(BytesIO(b"(foo\r\nbaz)")) + assert list(parser) == [(0, b"foo\nbaz")] + parser = PSBaseParser(BytesIO(b"(foo\n)")) + assert list(parser) == [(0, b"foo\n")] + parser = PSBaseParser( + BytesIO(rb"( This string contains \245two octal characters\307 . )") + ) + assert list(parser) == [ + (0, b" This string contains \245two octal characters\307 . 
") + ] + parser = PSBaseParser(BytesIO(rb"(\0053 \053 \53)")) + assert list(parser) == [(0, b"\0053 \053 +")] + parser = PSBaseParser(BytesIO(rb"< 4E6F762073686D6F7A206B6120706F702E >")) + assert list(parser) == [(0, b"Nov shmoz ka pop.")] + parser = PSBaseParser(BytesIO(rb"<73 686 D6F7A2>")) + assert list(parser) == [(0, b"shmoz ")] + parser = PSBaseParser(BytesIO(rb"(\400)")) + assert list(parser) == [(0, b"")] + + +def test_invalid_strings_eof(): + parser = PSBaseParser(BytesIO(rb"(\00")) + assert list(parser) == [] + parser = PSBaseParser(BytesIO(rb"(abracadab")) + assert list(parser) == [] + parser = PSBaseParser(BytesIO(rb"<73686")) + assert list(parser) == [] + + +def test_get_inline_data(): + fp = BytesIO(b"""0123456789EI""") + assert get_inline_data(fp) == (10, b"0123456789EI") + fp = BytesIO(b"""0123456789EIEIO""") + assert get_inline_data(fp) == (10, b"0123456789EI") + assert fp.read(3) == b"EIO" + fp = BytesIO(b"""012EIEIO""") + assert get_inline_data(fp, blocksize=4) == (3, b"012EI") + assert fp.read(3) == b"EIO" + fp = BytesIO(b"""0123012EIEIO""") + assert get_inline_data(fp, blocksize=4) == (7, b"0123012EI") + assert fp.read(3) == b"EIO" + for blocksize in range(1, 8): + fp = BytesIO(b"""012EIEIOOMG""") + assert get_inline_data(fp, blocksize=blocksize, target=b"EIEIO") == ( + 3, + b"012EIEIO", + ) + assert fp.read(3) == b"OMG"