dhdaines · dhdaines · Sep 30, 2024 · Sep 18, 2024 · Sep 29, 2024 · Sep 30, 2024
diff --git a/playa/exceptions.py b/playa/exceptions.py
@@ -71,6 +71,10 @@ class PDFNoPageLabels(PDFException):
     pass
 
 
+class PDFNoPageTree(PDFException):
+    """Raised when the document catalog has no 'Pages' entry."""
+
+
 class PDFDestinationNotFound(PDFException):
     pass
 

diff --git a/playa/pdfdocument.py b/playa/pdfdocument.py
@@ -33,6 +33,7 @@
     PDFKeyError,
     PDFNoOutlines,
     PDFNoPageLabels,
+    PDFNoPageTree,
     PDFNoValidXRef,
     PDFObjectNotFound,
     PDFPasswordIncorrect,
@@ -42,6 +43,7 @@
 from playa.pdfparser import KEYWORD_XREF, PDFParser, PDFStreamParser
 from playa.pdftypes import (
     DecipherCallable,
+    PDFObjRef,
     PDFStream,
     decipher_all,
     dict_value,
@@ -52,6 +54,7 @@
     uint_value,
 )
 from playa.psparser import KWD, LIT, literal_name
+from playa.pdfpage import PDFPage
 from playa.utils import (
     choplist,
     decode_text,
@@ -68,7 +71,10 @@
 LITERAL_OBJSTM = LIT("ObjStm")
 LITERAL_XREF = LIT("XRef")
 LITERAL_CATALOG = LIT("Catalog")
+LITERAL_PAGE = LIT("Page")
+LITERAL_PAGES = LIT("Pages")
 KEYWORD_OBJ = KWD(b"obj")
+INHERITABLE_PAGE_ATTRS = {"Resources", "MediaBox", "CropBox", "Rotate"}
 
 
 class PDFBaseXRef:
@@ -907,6 +913,90 @@ def get_page_labels(self) -> Iterator[str]:
 
         return page_labels.labels
 
+    PageType = Dict[Any, Dict[Any, Any]]
+
+    def pages_from_xrefs(self) -> Iterator[Tuple[int, PageType]]:
+        """Scan every object listed in the cross-reference tables and
+        yield an (objid, dict) pair for each one that is a page object.
+
+        Meant for invalid PDFs whose page tree is missing.  Objects
+        that cannot be found are skipped.
+        """
+        for table in self.xrefs:
+            for objid in table.get_objids():
+                try:
+                    found = self.getobj(objid)
+                except PDFObjectNotFound:
+                    continue
+                if isinstance(found, dict) and found.get("Type") is LITERAL_PAGE:
+                    yield objid, found
+
+    def page_tree(self) -> Iterator[Tuple[int, PageType]]:
+        """Iterate over the flattened page tree in reading order, propagating
+        inheritable attributes.  Returns an iterator over (objid, dict) pairs.
+
+        Raises PDFNoPageTree, on the first iteration, if there is no page tree.
+        """
+        if "Pages" not in self.catalog:
+            raise PDFNoPageTree("No 'Pages' entry in catalog")
+        stack = [(self.catalog["Pages"], self.catalog)]
+        visited = set()
+        while stack:
+            (obj, parent) = stack.pop()
+            if isinstance(obj, PDFObjRef):
+                # The PDF specification *requires* the Pages entry of the
+                # catalog and every entry in Kids to be indirect references.
+                object_id = obj.objid
+            elif isinstance(obj, int):
+                # Should not happen in a valid PDF, but probably does?
+                log.warning("Page tree contains bare integer: %r in %r", obj, parent)
+                object_id = obj
+            else:
+                # Nothing to resolve: skip it rather than reuse a stale object_id
+                log.warning("Page tree contains unknown object: %r", obj)
+                continue
+
+            # Avoid infinite loops by skipping nodes that were already visited
+            if object_id in visited:
+                log.warning("Circular reference %r in page tree", obj)
+                continue
+            visited.add(object_id)
+            page_object = dict_value(self.getobj(object_id))
+
+            # Propagate inheritable attributes
+            object_properties = page_object.copy()
+            for k, v in parent.items():
+                if k in INHERITABLE_PAGE_ATTRS and k not in object_properties:
+                    object_properties[k] = v
+
+            # Recurse, depth-first
+            object_type = object_properties.get("Type")
+            if object_type is None and not settings.STRICT:  # See #64
+                object_type = object_properties.get("type")
+            if object_type is LITERAL_PAGES and "Kids" in object_properties:
+                log.debug("Pages: Kids=%r", object_properties["Kids"])
+                for child in reversed(list_value(object_properties["Kids"])):
+                    stack.append((child, object_properties))
+            elif object_type is LITERAL_PAGE:
+                log.debug("Page: %r", object_properties)
+                yield object_id, object_properties
+
+    def get_pages(self) -> Iterator[PDFPage]:
+        """Get an iterator over PDFPage objects for the pages in the
+        document, using the xref tables if there is no page tree.
+        """
+        try:
+            page_labels: Iterator[Optional[str]] = self.get_page_labels()
+        except PDFNoPageLabels:
+            page_labels = itertools.repeat(None)
+        try:
+            # page_tree() is a generator, so it raises only once iterated
+            for (objid, attrs), label in zip(self.page_tree(), page_labels):
+                yield PDFPage(objid, attrs, label)
+        except PDFNoPageTree:
+            for (objid, attrs), label in zip(self.pages_from_xrefs(), page_labels):
+                yield PDFPage(objid, attrs, label)
+
     def lookup_name(self, cat: str, key: Union[str, bytes]) -> Any:
         try:
             names = dict_value(self.catalog["Names"])

diff --git a/playa/pdfpage.py b/playa/pdfpage.py
@@ -1,18 +1,8 @@
-import itertools
 import logging
-from typing import Any, BinaryIO, Container, Dict, Iterator, List, Optional, Set, Tuple
+from typing import Dict, List, Optional
 
-from playa import settings
-from playa.exceptions import (
-    PDFNoPageLabels,
-    PDFObjectNotFound,
-    PDFTextExtractionNotAllowed,
-    PDFValueError,
-)
-from playa.pdfdocument import (
-    PDFDocument,
-)
-from playa.pdftypes import dict_value, int_value, list_value, resolve1
+from playa.exceptions import PDFValueError
+from playa.pdftypes import dict_value, int_value, resolve1
 from playa.psparser import LIT
 from playa.utils import parse_rect
 
@@ -32,7 +22,6 @@ class PDFPage:
 
     Attributes
     ----------
-      doc: a PDFDocument object.
       pageid: any Python object that can uniquely identify the page.
       attrs: a dictionary of page attributes.
       contents: a list of PDFStream objects that represents the page content.
@@ -49,7 +38,6 @@ class PDFPage:
 
     def __init__(
         self,
-        doc: PDFDocument,
         pageid: object,
         attrs: object,
         label: Optional[str],
@@ -61,7 +49,6 @@ def __init__(
         attrs: a dictionary of page attributes.
         label: page label string.
         """
-        self.doc = doc
         self.pageid = pageid
         self.attrs = dict_value(attrs)
         self.label = label
@@ -100,102 +87,3 @@ def __init__(
 
     def __repr__(self) -> str:
         return f"<PDFPage: Resources={self.resources!r}, MediaBox={self.mediabox!r}>"
-
-    INHERITABLE_ATTRS = {"Resources", "MediaBox", "CropBox", "Rotate"}
-
-    @classmethod
-    def create_pages(cls, document: PDFDocument) -> Iterator["PDFPage"]:
-        def depth_first_search(
-            obj: Any,
-            parent: Dict[str, Any],
-            visited: Optional[Set[Any]] = None,
-        ) -> Iterator[Tuple[int, Dict[Any, Dict[Any, Any]]]]:
-            if isinstance(obj, int):
-                object_id = obj
-                object_properties = dict_value(document.getobj(object_id)).copy()
-            else:
-                # This looks broken. obj.objid means obj could be either
-                # PDFObjRef or PDFStream, but neither is valid for dict_value.
-                object_id = obj.objid  # type: ignore[attr-defined]
-                object_properties = dict_value(obj).copy()
-
-            # Avoid recursion errors by keeping track of visited nodes
-            if visited is None:
-                visited = set()
-            if object_id in visited:
-                return
-            visited.add(object_id)
-
-            for k, v in parent.items():
-                if k in cls.INHERITABLE_ATTRS and k not in object_properties:
-                    object_properties[k] = v
-
-            object_type = object_properties.get("Type")
-            if object_type is None and not settings.STRICT:  # See #64
-                object_type = object_properties.get("type")
-
-            if object_type is LITERAL_PAGES and "Kids" in object_properties:
-                log.debug("Pages: Kids=%r", object_properties["Kids"])
-                for child in list_value(object_properties["Kids"]):
-                    yield from depth_first_search(child, object_properties, visited)
-
-            elif object_type is LITERAL_PAGE:
-                log.debug("Page: %r", object_properties)
-                yield (object_id, object_properties)
-
-        try:
-            page_labels: Iterator[Optional[str]] = document.get_page_labels()
-        except PDFNoPageLabels:
-            page_labels = itertools.repeat(None)
-
-        pages = False
-        if "Pages" in document.catalog:
-            objects = depth_first_search(document.catalog["Pages"], document.catalog)
-            for objid, tree in objects:
-                yield cls(document, objid, tree, next(page_labels))
-                pages = True
-        if not pages:
-            # fallback when /Pages is missing.
-            for xref in document.xrefs:
-                for objid in xref.get_objids():
-                    try:
-                        obj = document.getobj(objid)
-                        if isinstance(obj, dict) and obj.get("Type") is LITERAL_PAGE:
-                            yield cls(document, objid, obj, next(page_labels))
-                    except PDFObjectNotFound:
-                        pass
-
-    @classmethod
-    def get_pages(
-        cls,
-        fp: BinaryIO,
-        pagenos: Optional[Container[int]] = None,
-        maxpages: int = 0,
-        password: str = "",
-        caching: bool = True,
-        check_extractable: bool = False,
-    ) -> Iterator["PDFPage"]:
-        # Create a PDF document object that stores the document structure.
-        doc = PDFDocument(fp, password=password)
-        # Check if the document allows text extraction.
-        # If not, warn the user and proceed.
-        if not doc.is_extractable:
-            if check_extractable:
-                error_msg = "Text extraction is not allowed: %r" % fp
-                raise PDFTextExtractionNotAllowed(error_msg)
-            else:
-                warning_msg = (
-                    "The PDF %r contains a metadata field "
-                    "indicating that it should not allow "
-                    "text extraction. Ignoring this field "
-                    "and proceeding. Use the check_extractable "
-                    "if you want to raise an error in this case" % fp
-                )
-                log.warning(warning_msg)
-        # Process each page contained in the document.
-        for pageno, page in enumerate(cls.create_pages(doc)):
-            if pagenos and (pageno not in pagenos):
-                continue
-            yield page
-            if maxpages and maxpages <= pageno + 1:
-                break
diff --git a/tests/test_open.py b/tests/test_open.py
@@ -11,7 +11,6 @@
 
 # These APIs will go away soon
 from playa.pdfinterp import PDFPageInterpreter, PDFResourceManager
-from playa.pdfpage import PDFPage
 
 TESTDIR = Path(__file__).parent.parent / "samples"
 ALLPDFS = TESTDIR.glob("**/*.pdf")
@@ -46,7 +45,7 @@ def test_inline_data():
         rsrc = PDFResourceManager()
         agg = PDFPageAggregator(rsrc, pageno=1)
         interp = PDFPageInterpreter(rsrc, agg)
-        page = next(PDFPage.create_pages(doc))
+        page = next(doc.get_pages())
         interp.process_page(page)
 
 
@@ -56,7 +55,7 @@ def test_multiple_contents():
         rsrc = PDFResourceManager()
         agg = PDFPageAggregator(rsrc, pageno=1)
         interp = PDFPageInterpreter(rsrc, agg)
-        page = next(PDFPage.create_pages(doc))
+        page = next(doc.get_pages())
         assert len(page.contents) > 1
         interp.process_page(page)
 

diff --git a/tests/test_pdfdocument.py b/tests/test_pdfdocument.py
@@ -36,3 +36,9 @@ def test_page_labels():
     with playa.open(TESTDIR / "contrib" / "pagelabels.pdf") as doc:
         labels = [label for _, label in zip(range(10), doc.get_page_labels())]
         assert labels == ["iii", "iv", "1", "2", "1", "2", "3", "4", "5", "6"]
+
+
+def test_pages():
+    """get_pages() should yield 15 pages for PSC_Station.pdf."""
+    with playa.open(TESTDIR / "contrib" / "PSC_Station.pdf") as doc:
+        assert sum(1 for _ in doc.get_pages()) == 15
-Original file line number
+Diff line change
@@ Expand Up / @@ -71,6 +71,10 @@ class PDFNoPageLabels(PDFException): @@
         pass
+    class PDFNoPageTree(PDFException):
+        pass
     class PDFDestinationNotFound(PDFException):
         pass
@@ Expand Down @@