dhdaines · dhdaines · Sep 18, 2024 · Sep 18, 2024 · Sep 18, 2024 · Sep 18, 2024
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
@@ -0,0 +1,20 @@
+name: Run all tests
+on:
+  push:
+    branches: [ "main" ]
+  pull_request:
+    branches: [ "main" ]
+
+jobs:
+  test:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.10"
+      - name: Install Hatch
+        uses: pypa/hatch@install
+      - name: Run tests
+        run: hatch test
diff --git a/.gitignore b/.gitignore
@@ -26,3 +26,4 @@ Pipfile.lock
 .vscode/
 poetry.lock
 .eggs
+*~
diff --git a/playa/__init__.py b/playa/__init__.py
@@ -12,7 +12,6 @@
 from typing import Iterator
 
 from playa.pdfdocument import PDFDocument
-from playa.pdfparser import PDFParser
 
 __version__ = "0.0.1"
 

diff --git a/playa/data_structures.py b/playa/data_structures.py
@@ -37,9 +37,7 @@ def _parse(self) -> List[Tuple[int, Any]]:
 
         return items
 
-    values: List[Tuple[int, Any]]  # workaround decorators unsupported by mypy
-
-    @property  # type: ignore[no-redef,misc]
+    @property
     def values(self) -> List[Tuple[int, Any]]:
         values = self._parse()
 

diff --git a/playa/pdfdocument.py b/playa/pdfdocument.py
@@ -39,7 +39,7 @@
     PDFSyntaxError,
     PDFTypeError,
 )
-from playa.pdfparser import KEYWORD_XREF, PDFParser, PDFStreamParser, read_header
+from playa.pdfparser import KEYWORD_XREF, PDFParser, PDFStreamParser
 from playa.pdftypes import (
     DecipherCallable,
     PDFStream,
@@ -629,6 +629,27 @@ def decrypt_aes256(self, objid: int, genno: int, data: bytes) -> bytes:
 }
 
 
+def read_header(fp: BinaryIO) -> str:
+    """Parse the `%PDF-x.y` marker from `fp` and return `x.y` as text.
+
+    Exactly 8 bytes are read from the current position of `fp`.  The
+    value returned is only the initial version: the document catalog
+    may override it later.
+
+    Raises PDFSyntaxError if the bytes cannot be read, if they do not
+    begin with the PDF magic, or if what follows the magic is not an
+    ASCII `digit.digit` version number.
+    """
+    magic = b"%PDF-"
+    try:
+        header = fp.read(8)
+    except IOError as exc:
+        raise PDFSyntaxError("Failed to read PDF header") from exc
+    if not header.startswith(magic):
+        raise PDFSyntaxError("Expected b'%%PDF-', got %r, is this a PDF?" % header)
+    # Whatever follows the magic is the candidate version number
+    try:
+        number = header[len(magic) :].decode("ascii")
+    except UnicodeDecodeError as exc:
+        raise PDFSyntaxError(
+            "Version number in %r contains non-ASCII characters" % header
+        ) from exc
+    if re.match(r"\d\.\d", number) is None:
+        raise PDFSyntaxError("Version number in  %r is invalid" % header)
+    return number
+
+
 class PDFDocument:
     """Representation of a PDF document on disk.
 
@@ -670,6 +691,7 @@ def __init__(
         self.decipher: Optional[DecipherCallable] = None
         self._cached_objs: Dict[int, Tuple[object, int]] = {}
         self._parsed_objs: Dict[int, Tuple[List[object], int]] = {}
+        self.pdf_version = read_header(fp)
         self.parser = PDFParser(fp)
         self.parser.set_document(self)  # FIXME: annoying circular reference
         self.is_printable = self.is_modifiable = self.is_extractable = True
@@ -818,6 +840,7 @@ def getobj(self, objid: int) -> object:
         if objid in self._cached_objs:
             (obj, genno) = self._cached_objs[objid]
         else:
+            obj = None
             for xref in self.xrefs:
                 try:
                     (strmid, index, genno) = xref.get_pos(objid)
@@ -837,7 +860,7 @@ def getobj(self, objid: int) -> object:
                     break
                 except (PSEOF, PDFSyntaxError):
                     continue
-            else:
+            if obj is None:
                 raise PDFObjectNotFound(objid)
             log.debug("register: objid=%r: %r", objid, obj)
             self._cached_objs[objid] = (obj, genno)
@@ -871,7 +894,9 @@ def get_page_labels(self) -> Iterator[str]:
         If the document includes page labels, generates strings, one per page.
         If not, raises PDFNoPageLabels.
 
-        The resulting iteration is unbounded.
+        The resulting iterator is unbounded, so it is recommended to
+        zip it with the iterator over actual pages returned by `get_pages`.
+
         """
         assert self.catalog is not None
 

diff --git a/playa/pdfinterp.py b/playa/pdfinterp.py
@@ -1,7 +1,7 @@
+import io
 import logging
-import re
 from io import BytesIO
-from typing import Dict, List, Mapping, Optional, Sequence, Tuple, Union, cast
+from typing import BinaryIO, Dict, List, Mapping, Optional, Sequence, Tuple, Union, cast
 
 from playa import settings
 from playa.casting import safe_float
@@ -247,6 +247,69 @@ def get_font(self, objid: object, spec: Mapping[str, object]) -> PDFFont:
         return font
 
 
+KEYWORD_BI = KWD(b"BI")
+KEYWORD_ID = KWD(b"ID")
+KEYWORD_EI = KWD(b"EI")
+
+
+def get_inline_data(
+    fp: BinaryIO, target: bytes = b"EI", blocksize: int = 4096
+) -> Tuple[int, bytes]:
+    """Get the data for an inline image up to the target
+    end-of-stream marker.
+
+    Returns a tuple of the position of the target in the data and the
+    data *including* the end of stream marker.  Advances the file
+    pointer to a position after the end of the stream.
+
+    If the end of the file is reached without finding the target,
+    returns `(-1, b"")` (the file pointer is then left at the end of
+    the file).
+
+    The caller is responsible for removing the end-of-stream if
+    necessary (this depends on the filter being used) and parsing
+    the end-of-stream token (likewise) if necessary.
+
+    `fp` must be seekable, since bytes read past the target are put
+    back by seeking backwards.
+    """
+    # PDF 1.7, p. 216: The bytes between the ID and EI operators
+    # shall be treated the same as a stream object’s data (see
+    # 7.3.8, "Stream Objects"), even though they do not follow the
+    # standard stream syntax.
+    data = []  # list of blocks
+    partial = b""  # partially seen target
+    pos = 0
+    while True:
+        # Did we see part of the target at the end of the last
+        # block?  Then scan ahead and try to find the rest (we
+        # assume the stream is buffered)
+        if partial:
+            extra = fp.read(len(target) - len(partial))
+            if partial + extra == target:
+                pos -= len(partial)
+                data.append(extra)
+                break
+            # Put it back (assume buffering!).  Rewind only by what
+            # was actually read: the read is short at end of file, and
+            # rewinding further would re-read data already consumed.
+            fp.seek(-len(extra), io.SEEK_CUR)
+            partial = b""
+            # Fall through (the target could be at the beginning)
+        buf = fp.read(blocksize)
+        if not buf:
+            # End of file without the target: give up instead of
+            # reading empty blocks forever.
+            return (-1, b"")
+        tpos = buf.find(target)
+        if tpos != -1:
+            end = tpos + len(target)
+            data.append(buf[:end])
+            # Put the extra back (assume buffering!)
+            fp.seek(end - len(buf), io.SEEK_CUR)
+            pos += tpos
+            break
+        pos += len(buf)
+        # look for the longest partial match at the end
+        for plen in range(len(target) - 1, 0, -1):
+            if buf.endswith(target[:plen]):
+                partial = target[:plen]
+                break
+        data.append(buf)
+    return (pos, b"".join(data))
+
+
 class PDFContentParser(PSStackParser[Union[PSKeyword, PDFStream]]):
     def __init__(self, streams: Sequence[object]) -> None:
         self.streams = streams
@@ -267,65 +330,16 @@ def fillfp(self) -> None:
 
     def seek(self, pos: int) -> None:
         self.fillfp()
-        PSStackParser.seek(self, pos)
-
-    def fillbuf(self) -> None:
-        if self.charpos < len(self.buf):
-            return
-        while 1:
-            self.fillfp()
-            self.bufpos = self.fp.tell()
-            self.buf = self.fp.read(self.BUFSIZ)
-            if self.buf:
-                break
-            self.fp = None  # type: ignore[assignment]
-        self.charpos = 0
-
-    def get_inline_data(self, pos: int, target: bytes = b"EI") -> Tuple[int, bytes]:
-        self.seek(pos)
-        i = 0
-        data = b""
-        while i <= len(target):
-            self.fillbuf()
-            if i:
-                ci = self.buf[self.charpos]
-                c = bytes((ci,))
-                data += c
-                self.charpos += 1
-                if (
-                    len(target) <= i
-                    and c.isspace()
-                    or i < len(target)
-                    and c == (bytes((target[i],)))
-                ):
-                    i += 1
-                else:
-                    i = 0
-            else:
-                try:
-                    j = self.buf.index(target[0], self.charpos)
-                    data += self.buf[self.charpos : j + 1]
-                    self.charpos = j + 1
-                    i = 1
-                except ValueError:
-                    data += self.buf[self.charpos :]
-                    self.charpos = len(self.buf)
-        data = data[: -(len(target) + 1)]  # strip the last part
-        data = re.sub(rb"(\x0d\x0a|[\x0d\x0a])$", b"", data)
-        return (pos, data)
+        super().seek(pos)
 
     def flush(self) -> None:
         self.add_results(*self.popall())
 
-    KEYWORD_BI = KWD(b"BI")
-    KEYWORD_ID = KWD(b"ID")
-    KEYWORD_EI = KWD(b"EI")
-
     def do_keyword(self, pos: int, token: PSKeyword) -> None:
-        if token is self.KEYWORD_BI:
+        if token is KEYWORD_BI:
             # inline image within a content stream
             self.start_type(pos, "inline")
-        elif token is self.KEYWORD_ID:
+        elif token is KEYWORD_ID:
             try:
                 (_, objs) = self.end_type("inline")
                 if len(objs) % 2 != 0:
@@ -339,13 +353,30 @@ def do_keyword(self, pos: int, token: PSKeyword) -> None:
                         filter = [filter]
                     if filter[0] in LITERALS_ASCII85_DECODE:
                         eos = b"~>"
-                (pos, data) = self.get_inline_data(pos + len(b"ID "), target=eos)
-                if eos != b"EI":  # it may be necessary for decoding
-                    data += eos
+                # PDF 1.7 p. 215: Unless the image uses ASCIIHexDecode
+                # or ASCII85Decode as one of its filters, the ID
+                # operator shall be followed by a single white-space
+                # character, and the next character shall be
+                # interpreted as the first byte of image data.
+                if eos == b"EI":
+                    self.seek(pos + len(token.name) + 1)
+                    (pos, data) = get_inline_data(self.fp, target=eos)
+                    # FIXME: it is totally unspecified what to do with
+                    # a newline between the end of the data and "EI",
+                    # since there is no explicit stream length.  (PDF
+                    # 1.7 p. 756: There should be an end-of-line
+                    # marker after the data and before endstream; this
+                    # marker shall not be included in the stream
+                    # length.)
+                    data = data[: -len(eos)]
+                else:
+                    self.seek(pos + len(token.name))
+                    (pos, data) = get_inline_data(self.fp, target=eos)
                 obj = PDFStream(d, data)
                 self.push((pos, obj))
-                if eos == b"EI":  # otherwise it is still in the stream
-                    self.push((pos, self.KEYWORD_EI))
+                # This was included in the data but we need to "parse" it
+                if eos == b"EI":
+                    self.push((pos, KEYWORD_EI))
             except PSTypeError:
                 if settings.STRICT:
                     raise

diff --git a/playa/pdfpage.py b/playa/pdfpage.py
@@ -3,13 +3,15 @@
 from typing import Any, BinaryIO, Container, Dict, Iterator, List, Optional, Set, Tuple
 
 from playa import settings
-from playa.exceptions import PDFObjectNotFound, PDFValueError
-from playa.pdfdocument import (
-    PDFDocument,
+from playa.exceptions import (
     PDFNoPageLabels,
+    PDFObjectNotFound,
     PDFTextExtractionNotAllowed,
+    PDFValueError,
+)
+from playa.pdfdocument import (
+    PDFDocument,
 )
-from playa.pdfparser import PDFParser
 from playa.pdftypes import dict_value, int_value, list_value, resolve1
 from playa.psparser import LIT
 from playa.utils import parse_rect
@@ -173,10 +175,8 @@ def get_pages(
         caching: bool = True,
         check_extractable: bool = False,
     ) -> Iterator["PDFPage"]:
-        # Create a PDF parser object associated with the file object.
-        parser = PDFParser(fp)
         # Create a PDF document object that stores the document structure.
-        doc = PDFDocument(parser, password=password, caching=caching)
+        doc = PDFDocument(fp, password=password)
         # Check if the document allows text extraction.
         # If not, warn the user and proceed.
         if not doc.is_extractable:

diff --git a/playa/pdfparser.py b/playa/pdfparser.py
@@ -1,5 +1,4 @@
 import logging
-import re
 from io import BytesIO
 from typing import TYPE_CHECKING, BinaryIO, Optional, Union
 
@@ -24,27 +23,6 @@
 KEYWORD_OBJ = KWD(b"obj")
 
 
-def read_header(fp: BinaryIO) -> str:
-    """Read the PDF header and return the (initial) version string.
-
-    Note that this version can be overridden in the document catalog."""
-    try:
-        hdr = fp.read(8)
-    except IOError as err:
-        raise PDFSyntaxError("Failed to read PDF header") from err
-    if not hdr.startswith(b"%PDF-"):
-        raise PDFSyntaxError("Expected b'%%PDF-', got %r, is this a PDF?" % hdr)
-    try:
-        version = hdr[5:].decode("ascii")
-    except UnicodeDecodeError as err:
-        raise PDFSyntaxError(
-            "Version number in %r contains non-ASCII characters" % hdr
-        ) from err
-    if not re.match(r"\d\.\d", version):
-        raise PDFSyntaxError("Version number in  %r is invalid" % hdr)
-    return version
-
-
 # PDFParser stack holds all the base types plus PDFStream, PDFObjRef, and None
 class PDFParser(PSStackParser[Union[PSKeyword, PDFStream, PDFObjRef, None]]):
     """PDFParser fetch PDF objects from a file stream.
@@ -65,7 +43,6 @@ class PDFParser(PSStackParser[Union[PSKeyword, PDFStream, PDFObjRef, None]]):
     def __init__(self, fp: BinaryIO) -> None:
         PSStackParser.__init__(self, fp)
         self.doc: Optional[PDFDocument] = None
-        self.pdf_version = read_header(fp)
         self.fallback = False
 
     def set_document(self, doc: Union["PDFDocument", None]) -> None:
@@ -115,7 +92,7 @@ def do_keyword(self, pos: int, token: PSKeyword) -> None:
             self.fp.seek(pos)
             data = bytearray(self.fp.read(objlen))
             self.seek(pos + objlen)
-            while 1:
+            while True:
                 try:
                     (linepos, line) = self.nextline()
                 except PSEOF:
-Original file line number
+Diff line change
@@ Expand Up / @@ -26,3 +26,4 @@ Pipfile.lock @@
     .vscode/
     poetry.lock
     .eggs
+    *~