From 13243dd1d40cb393d4f0d2587ca62471d7e9aad0 Mon Sep 17 00:00:00 2001
From: David Huggins-Daines <dhd@ecolingui.ca>
Date: Wed, 18 Sep 2024 08:27:01 -0400
Subject: [PATCH 1/3] feat: iterate over page tree

---
 playa/pdfdocument.py      | 70 +++++++++++++++++++++++++++++++++++++++
 tests/test_pdfdocument.py | 10 ++++++
 2 files changed, 80 insertions(+)

diff --git a/playa/pdfdocument.py b/playa/pdfdocument.py
index 64c9238..e67f335 100644
--- a/playa/pdfdocument.py
+++ b/playa/pdfdocument.py
@@ -42,6 +42,7 @@
 from playa.pdfparser import KEYWORD_XREF, PDFParser, PDFStreamParser
 from playa.pdftypes import (
     DecipherCallable,
+    PDFObjRef,
     PDFStream,
     decipher_all,
     dict_value,
@@ -68,7 +69,10 @@
 LITERAL_OBJSTM = LIT("ObjStm")
 LITERAL_XREF = LIT("XRef")
 LITERAL_CATALOG = LIT("Catalog")
+LITERAL_PAGE = LIT("Page")
+LITERAL_PAGES = LIT("Pages")
 KEYWORD_OBJ = KWD(b"obj")
+INHERITABLE_PAGE_ATTRS = {"Resources", "MediaBox", "CropBox", "Rotate"}
 
 
 class PDFBaseXRef:
@@ -907,6 +911,72 @@ def get_page_labels(self) -> Iterator[str]:
 
         return page_labels.labels
 
+    PageType = Dict[Any, Dict[Any, Any]]
+
+    def get_pages_from_xrefs(self) -> Iterator[Tuple[int, PageType]]:
+        """Find pages from the cross-reference tables if the page tree
+        is missing (note that this only happens in invalid PDFs, but
+        it happens.)
+
+        Returns an iterator over (objid, dict) pairs.
+        """
+        for xref in self.xrefs:
+            for object_id in xref.get_objids():
+                try:
+                    obj = self.getobj(object_id)
+                    if isinstance(obj, dict) and obj.get("Type") is LITERAL_PAGE:
+                        yield object_id, obj
+                except PDFObjectNotFound:
+                    pass
+
+    def walk_page_tree(self) -> Iterator[Tuple[int, PageType]]:
+        """Iterate over the flattened page tree in reading order, propagating
+        inheritable attributes.  Returns an iterator over (objid, dict) pairs.
+
+        Will raise an IndexError if there is no page tree.
+        """
+        stack = [(self.catalog["Pages"], self.catalog)]
+        visited = set()
+        while stack:
+            (obj, parent) = stack.pop()
+            if isinstance(obj, PDFObjRef):
+                # The PDF specification *requires* both the Pages
+                # element of the catalog and the entries in Kids in
+                # the page tree to be indirect references.
+                object_id = obj.objid
+            elif isinstance(obj, int):
+                # Should not happen in a valid PDF, but probably does?
+                log.warning("Page tree contains bare integer: %r in %r", obj, parent)
+                object_id = obj
+            else:
+                log.warning("Page tree contains unknown object: %r", obj)
+                continue  # nothing to resolve; object_id would be unbound or stale
+            page_object = dict_value(self.getobj(object_id))
+            # Avoid recursion errors by keeping track of visited nodes
+            # (again, this should never actually happen in a valid PDF)
+            if object_id in visited:
+                log.warning("Circular reference %r in page tree", obj)
+                continue
+            visited.add(object_id)
+
+            # Propagate inheritable attributes
+            object_properties = page_object.copy()
+            for k, v in parent.items():
+                if k in INHERITABLE_PAGE_ATTRS and k not in object_properties:
+                    object_properties[k] = v
+
+            # Recurse, depth-first
+            object_type = object_properties.get("Type")
+            if object_type is None and not settings.STRICT:  # See #64
+                object_type = object_properties.get("type")
+            if object_type is LITERAL_PAGES and "Kids" in object_properties:
+                log.debug("Pages: Kids=%r", object_properties["Kids"])
+                for child in reversed(list_value(object_properties["Kids"])):
+                    stack.append((child, object_properties))
+            elif object_type is LITERAL_PAGE:
+                log.debug("Page: %r", object_properties)
+                yield object_id, object_properties
+
     def lookup_name(self, cat: str, key: Union[str, bytes]) -> Any:
         try:
             names = dict_value(self.catalog["Names"])
diff --git a/tests/test_pdfdocument.py b/tests/test_pdfdocument.py
index e82f576..7169a8b 100644
--- a/tests/test_pdfdocument.py
+++ b/tests/test_pdfdocument.py
@@ -36,3 +36,13 @@ def test_page_labels():
     with playa.open(TESTDIR / "contrib" / "pagelabels.pdf") as doc:
         labels = [label for _, label in zip(range(10), doc.get_page_labels())]
         assert labels == ["iii", "iv", "1", "2", "1", "2", "3", "4", "5", "6"]
+
+
+def test_page_tree():
+    with playa.open(TESTDIR / "contrib" / "PSC_Station.pdf") as doc:
+        page_objects = list(doc.walk_page_tree())
+        assert len(page_objects) == 15
+
+
+def test_pages():
+    pass

From c60ae5720c04e86a49c4fce921fc9a81d9ba6086 Mon Sep 17 00:00:00 2001
From: David Huggins-Daines <dhd@ecolingui.ca>
Date: Sun, 29 Sep 2024 16:27:14 -0400
Subject: [PATCH 2/3] fix: put page creation where it belongs

circular imports are not a fatality
---
 playa/exceptions.py       |   4 ++
 playa/pdfdocument.py      |  23 +++++++-
 playa/pdfpage.py          | 116 ++------------------------------------
 tests/test_open.py        |   4 +-
 tests/test_pdfdocument.py |   2 +-
 5 files changed, 31 insertions(+), 118 deletions(-)

diff --git a/playa/exceptions.py b/playa/exceptions.py
index 6814ca3..0698a4c 100644
--- a/playa/exceptions.py
+++ b/playa/exceptions.py
@@ -71,6 +71,10 @@ class PDFNoPageLabels(PDFException):
     pass
 
 
+class PDFNoPageTree(PDFException):
+    pass
+
+
 class PDFDestinationNotFound(PDFException):
     pass
 
diff --git a/playa/pdfdocument.py b/playa/pdfdocument.py
index e67f335..eee0b2d 100644
--- a/playa/pdfdocument.py
+++ b/playa/pdfdocument.py
@@ -33,6 +33,7 @@
     PDFKeyError,
     PDFNoOutlines,
     PDFNoPageLabels,
+    PDFNoPageTree,
     PDFNoValidXRef,
     PDFObjectNotFound,
     PDFPasswordIncorrect,
@@ -913,7 +914,7 @@ def get_page_labels(self) -> Iterator[str]:
 
     PageType = Dict[Any, Dict[Any, Any]]
 
-    def get_pages_from_xrefs(self) -> Iterator[Tuple[int, PageType]]:
+    def pages_from_xrefs(self) -> Iterator[Tuple[int, PageType]]:
         """Find pages from the cross-reference tables if the page tree
         is missing (note that this only happens in invalid PDFs, but
         it happens.)
@@ -929,12 +930,14 @@ def get_pages_from_xrefs(self) -> Iterator[Tuple[int, PageType]]:
                 except PDFObjectNotFound:
                     pass
 
-    def walk_page_tree(self) -> Iterator[Tuple[int, PageType]]:
+    def page_tree(self) -> Iterator[Tuple[int, PageType]]:
         """Iterate over the flattened page tree in reading order, propagating
         inheritable attributes.  Returns an iterator over (objid, dict) pairs.
 
-        Will raise an IndexError if there is no page tree.
+        Will raise PDFNoPageTree if there is no page tree.
         """
+        if "Pages" not in self.catalog:
+            raise PDFNoPageTree("No 'Pages' entry in catalog")
         stack = [(self.catalog["Pages"], self.catalog)]
         visited = set()
         while stack:
@@ -977,6 +980,20 @@ def walk_page_tree(self) -> Iterator[Tuple[int, PageType]]:
                 log.debug("Page: %r", object_properties)
                 yield object_id, object_properties
 
+    def get_pages(self) -> Iterator["PDFPage"]:
+        from playa.pdfpage import PDFPage
+        try:
+            page_labels: Iterator[Optional[str]] = self.get_page_labels()
+        except PDFNoPageLabels:
+            page_labels = itertools.repeat(None)
+        # page_tree() is lazy: PDFNoPageTree would only surface inside the loop
+        page_tree = self.page_tree()
+        if "Pages" not in self.catalog:
+            page_tree = self.pages_from_xrefs()
+
+        for (objid, properties), label in zip(page_tree, page_labels):
+            yield PDFPage(self, objid, properties, label)
+
     def lookup_name(self, cat: str, key: Union[str, bytes]) -> Any:
         try:
             names = dict_value(self.catalog["Names"])
diff --git a/playa/pdfpage.py b/playa/pdfpage.py
index f2064ae..611e79c 100644
--- a/playa/pdfpage.py
+++ b/playa/pdfpage.py
@@ -1,18 +1,9 @@
-import itertools
 import logging
-from typing import Any, BinaryIO, Container, Dict, Iterator, List, Optional, Set, Tuple
+from typing import Dict, List, Optional
 
-from playa import settings
-from playa.exceptions import (
-    PDFNoPageLabels,
-    PDFObjectNotFound,
-    PDFTextExtractionNotAllowed,
-    PDFValueError,
-)
-from playa.pdfdocument import (
-    PDFDocument,
-)
-from playa.pdftypes import dict_value, int_value, list_value, resolve1
+from playa.exceptions import PDFValueError
+from playa.pdfdocument import PDFDocument
+from playa.pdftypes import dict_value, int_value, resolve1
 from playa.psparser import LIT
 from playa.utils import parse_rect
 
@@ -100,102 +91,3 @@ def __init__(
 
     def __repr__(self) -> str:
         return f"<PDFPage: Resources={self.resources!r}, MediaBox={self.mediabox!r}>"
-
-    INHERITABLE_ATTRS = {"Resources", "MediaBox", "CropBox", "Rotate"}
-
-    @classmethod
-    def create_pages(cls, document: PDFDocument) -> Iterator["PDFPage"]:
-        def depth_first_search(
-            obj: Any,
-            parent: Dict[str, Any],
-            visited: Optional[Set[Any]] = None,
-        ) -> Iterator[Tuple[int, Dict[Any, Dict[Any, Any]]]]:
-            if isinstance(obj, int):
-                object_id = obj
-                object_properties = dict_value(document.getobj(object_id)).copy()
-            else:
-                # This looks broken. obj.objid means obj could be either
-                # PDFObjRef or PDFStream, but neither is valid for dict_value.
-                object_id = obj.objid  # type: ignore[attr-defined]
-                object_properties = dict_value(obj).copy()
-
-            # Avoid recursion errors by keeping track of visited nodes
-            if visited is None:
-                visited = set()
-            if object_id in visited:
-                return
-            visited.add(object_id)
-
-            for k, v in parent.items():
-                if k in cls.INHERITABLE_ATTRS and k not in object_properties:
-                    object_properties[k] = v
-
-            object_type = object_properties.get("Type")
-            if object_type is None and not settings.STRICT:  # See #64
-                object_type = object_properties.get("type")
-
-            if object_type is LITERAL_PAGES and "Kids" in object_properties:
-                log.debug("Pages: Kids=%r", object_properties["Kids"])
-                for child in list_value(object_properties["Kids"]):
-                    yield from depth_first_search(child, object_properties, visited)
-
-            elif object_type is LITERAL_PAGE:
-                log.debug("Page: %r", object_properties)
-                yield (object_id, object_properties)
-
-        try:
-            page_labels: Iterator[Optional[str]] = document.get_page_labels()
-        except PDFNoPageLabels:
-            page_labels = itertools.repeat(None)
-
-        pages = False
-        if "Pages" in document.catalog:
-            objects = depth_first_search(document.catalog["Pages"], document.catalog)
-            for objid, tree in objects:
-                yield cls(document, objid, tree, next(page_labels))
-                pages = True
-        if not pages:
-            # fallback when /Pages is missing.
-            for xref in document.xrefs:
-                for objid in xref.get_objids():
-                    try:
-                        obj = document.getobj(objid)
-                        if isinstance(obj, dict) and obj.get("Type") is LITERAL_PAGE:
-                            yield cls(document, objid, obj, next(page_labels))
-                    except PDFObjectNotFound:
-                        pass
-
-    @classmethod
-    def get_pages(
-        cls,
-        fp: BinaryIO,
-        pagenos: Optional[Container[int]] = None,
-        maxpages: int = 0,
-        password: str = "",
-        caching: bool = True,
-        check_extractable: bool = False,
-    ) -> Iterator["PDFPage"]:
-        # Create a PDF document object that stores the document structure.
-        doc = PDFDocument(fp, password=password)
-        # Check if the document allows text extraction.
-        # If not, warn the user and proceed.
-        if not doc.is_extractable:
-            if check_extractable:
-                error_msg = "Text extraction is not allowed: %r" % fp
-                raise PDFTextExtractionNotAllowed(error_msg)
-            else:
-                warning_msg = (
-                    "The PDF %r contains a metadata field "
-                    "indicating that it should not allow "
-                    "text extraction. Ignoring this field "
-                    "and proceeding. Use the check_extractable "
-                    "if you want to raise an error in this case" % fp
-                )
-                log.warning(warning_msg)
-        # Process each page contained in the document.
-        for pageno, page in enumerate(cls.create_pages(doc)):
-            if pagenos and (pageno not in pagenos):
-                continue
-            yield page
-            if maxpages and maxpages <= pageno + 1:
-                break
diff --git a/tests/test_open.py b/tests/test_open.py
index 163e0c9..c00944a 100644
--- a/tests/test_open.py
+++ b/tests/test_open.py
@@ -46,7 +46,7 @@ def test_inline_data():
         rsrc = PDFResourceManager()
         agg = PDFPageAggregator(rsrc, pageno=1)
         interp = PDFPageInterpreter(rsrc, agg)
-        page = next(PDFPage.create_pages(doc))
+        page = next(doc.get_pages())
         interp.process_page(page)
 
 
@@ -56,7 +56,7 @@ def test_multiple_contents():
         rsrc = PDFResourceManager()
         agg = PDFPageAggregator(rsrc, pageno=1)
         interp = PDFPageInterpreter(rsrc, agg)
-        page = next(PDFPage.create_pages(doc))
+        page = next(doc.get_pages())
         assert len(page.contents) > 1
         interp.process_page(page)
 
diff --git a/tests/test_pdfdocument.py b/tests/test_pdfdocument.py
index 7169a8b..69dcb0e 100644
--- a/tests/test_pdfdocument.py
+++ b/tests/test_pdfdocument.py
@@ -40,7 +40,7 @@ def test_page_labels():
 
 def test_page_tree():
     with playa.open(TESTDIR / "contrib" / "PSC_Station.pdf") as doc:
-        page_objects = list(doc.walk_page_tree())
+        page_objects = list(doc.get_pages())
         assert len(page_objects) == 15
 
 

From 6c678860694ea412009ced63e6fccc625cbeacfa Mon Sep 17 00:00:00 2001
From: David Huggins-Daines <dhd@ecolingui.ca>
Date: Mon, 30 Sep 2024 08:13:55 -0400
Subject: [PATCH 3/3] fix: circular references also unnecessary

---
 playa/pdfdocument.py      | 9 ++++++---
 playa/pdfpage.py          | 4 ----
 tests/test_open.py        | 1 -
 tests/test_pdfdocument.py | 6 +-----
 4 files changed, 7 insertions(+), 13 deletions(-)

diff --git a/playa/pdfdocument.py b/playa/pdfdocument.py
index eee0b2d..f204096 100644
--- a/playa/pdfdocument.py
+++ b/playa/pdfdocument.py
@@ -54,6 +54,7 @@
     uint_value,
 )
 from playa.psparser import KWD, LIT, literal_name
+from playa.pdfpage import PDFPage
 from playa.utils import (
     choplist,
     decode_text,
@@ -980,8 +981,10 @@ def page_tree(self) -> Iterator[Tuple[int, PageType]]:
                 log.debug("Page: %r", object_properties)
                 yield object_id, object_properties
 
-    def get_pages(self) -> Iterator["PDFPage"]:
-        from playa.pdfpage import PDFPage
+    def get_pages(self) -> Iterator[PDFPage]:
+        """Get an iterator over PDFPage objects, which contain
+        information about the pages in the document.
+        """
         try:
             page_labels: Iterator[Optional[str]] = self.get_page_labels()
         except PDFNoPageLabels:
@@ -992,7 +995,7 @@ def get_pages(self) -> Iterator["PDFPage"]:
             page_tree = self.pages_from_xrefs()
 
         for (objid, properties), label in zip(page_tree, page_labels):
-            yield PDFPage(self, objid, properties, label)
+            yield PDFPage(objid, properties, label)
 
     def lookup_name(self, cat: str, key: Union[str, bytes]) -> Any:
         try:
diff --git a/playa/pdfpage.py b/playa/pdfpage.py
index 611e79c..02b2335 100644
--- a/playa/pdfpage.py
+++ b/playa/pdfpage.py
@@ -2,7 +2,6 @@
 from typing import Dict, List, Optional
 
 from playa.exceptions import PDFValueError
-from playa.pdfdocument import PDFDocument
 from playa.pdftypes import dict_value, int_value, resolve1
 from playa.psparser import LIT
 from playa.utils import parse_rect
@@ -23,7 +22,6 @@ class PDFPage:
 
     Attributes
     ----------
-      doc: a PDFDocument object.
       pageid: any Python object that can uniquely identify the page.
       attrs: a dictionary of page attributes.
       contents: a list of PDFStream objects that represents the page content.
@@ -40,7 +38,6 @@ class PDFPage:
 
     def __init__(
         self,
-        doc: PDFDocument,
         pageid: object,
         attrs: object,
         label: Optional[str],
@@ -52,7 +49,6 @@ def __init__(
         attrs: a dictionary of page attributes.
         label: page label string.
         """
-        self.doc = doc
         self.pageid = pageid
         self.attrs = dict_value(attrs)
         self.label = label
diff --git a/tests/test_open.py b/tests/test_open.py
index c00944a..195fbf8 100644
--- a/tests/test_open.py
+++ b/tests/test_open.py
@@ -11,7 +11,6 @@
 
 # These APIs will go away soon
 from playa.pdfinterp import PDFPageInterpreter, PDFResourceManager
-from playa.pdfpage import PDFPage
 
 TESTDIR = Path(__file__).parent.parent / "samples"
 ALLPDFS = TESTDIR.glob("**/*.pdf")
diff --git a/tests/test_pdfdocument.py b/tests/test_pdfdocument.py
index 69dcb0e..d790e09 100644
--- a/tests/test_pdfdocument.py
+++ b/tests/test_pdfdocument.py
@@ -38,11 +38,7 @@ def test_page_labels():
         assert labels == ["iii", "iv", "1", "2", "1", "2", "3", "4", "5", "6"]
 
 
-def test_page_tree():
+def test_pages():
     with playa.open(TESTDIR / "contrib" / "PSC_Station.pdf") as doc:
         page_objects = list(doc.get_pages())
         assert len(page_objects) == 15
-
-
-def test_pages():
-    pass