Skip to content

Commit

Permalink
feat: iterate over page tree
Browse files Browse the repository at this point in the history
  • Loading branch information
dhdaines committed Sep 29, 2024
1 parent 91a8c37 commit 13243dd
Show file tree
Hide file tree
Showing 2 changed files with 80 additions and 0 deletions.
70 changes: 70 additions & 0 deletions playa/pdfdocument.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@
from playa.pdfparser import KEYWORD_XREF, PDFParser, PDFStreamParser
from playa.pdftypes import (
DecipherCallable,
PDFObjRef,
PDFStream,
decipher_all,
dict_value,
Expand All @@ -68,7 +69,10 @@
# Name literals and keywords used when recognising PDF objects.
# LITERAL_PAGE and LITERAL_PAGES are compared by identity (`is`) against
# an object's "Type" entry by the page-finding methods in this file.
# NOTE(review): this presumably relies on LIT() returning one shared
# object per name -- confirm against its definition.
LITERAL_OBJSTM = LIT("ObjStm")
LITERAL_XREF = LIT("XRef")
LITERAL_CATALOG = LIT("Catalog")
LITERAL_PAGE = LIT("Page")
LITERAL_PAGES = LIT("Pages")
KEYWORD_OBJ = KWD(b"obj")
# Keys that walk_page_tree copies from a parent node into a child node
# when the child does not define them itself.
INHERITABLE_PAGE_ATTRS = {"Resources", "MediaBox", "CropBox", "Rotate"}


class PDFBaseXRef:
Expand Down Expand Up @@ -907,6 +911,72 @@ def get_page_labels(self) -> Iterator[str]:

return page_labels.labels

# Alias for the page dictionaries yielded (paired with an object id) by
# get_pages_from_xrefs and walk_page_tree.
# NOTE(review): the value type is declared as a nested dict, but page
# attributes may well hold other kinds of value -- confirm whether
# Dict[str, Any] was intended.
PageType = Dict[Any, Dict[Any, Any]]

def get_pages_from_xrefs(self) -> Iterator[Tuple[int, PageType]]:
    """Find pages from the cross-reference tables if the page tree
    is missing (note that this only happens in invalid PDFs, but
    it happens.)

    Every object id listed by every entry of ``self.xrefs`` is
    resolved with ``self.getobj``; objects that are dictionaries whose
    "Type" is LITERAL_PAGE are yielded.  Ids that cannot be resolved
    (PDFObjectNotFound) are skipped silently.

    Each object id is considered at most once, even if it is listed
    by more than one cross-reference section (presumably the case for
    objects rewritten by an incremental update), so that the same page
    is never yielded twice.

    Returns an iterator over (objid, dict) pairs.
    """
    # Object ids already examined.  getobj() is given only the id, so
    # a repeated id would resolve to the same object again.
    seen = set()
    for xref in self.xrefs:
        for object_id in xref.get_objids():
            if object_id in seen:
                continue
            seen.add(object_id)
            # Keep the try body down to the one call expected to raise.
            try:
                obj = self.getobj(object_id)
            except PDFObjectNotFound:
                continue
            if isinstance(obj, dict) and obj.get("Type") is LITERAL_PAGE:
                yield object_id, obj

def walk_page_tree(self) -> Iterator[Tuple[int, PageType]]:
    """Iterate over the flattened page tree in reading order, propagating
    inheritable attributes.

    Yields (objid, dict) pairs, one for each node whose type is
    LITERAL_PAGE.  The dict is a copy of the page object in which any
    key named in INHERITABLE_PAGE_ATTRS that the page lacks has been
    filled in from its parent node (which has itself inherited from
    its own parent in the same way).

    Malformed nodes are logged and skipped rather than aborting the
    walk: nodes that are neither an indirect reference nor a bare
    integer, and nodes that have already been visited (cycles).

    This is a generator, so errors surface on iteration, not on the
    call.  If there is no page tree, the ``self.catalog["Pages"]``
    lookup fails (a KeyError, assuming the catalog is a plain dict --
    TODO confirm).
    """
    # Explicit stack instead of recursion; each entry pairs a node with
    # the (already inheritance-resolved) properties of its parent.
    stack = [(self.catalog["Pages"], self.catalog)]
    visited = set()
    while stack:
        (obj, parent) = stack.pop()
        if isinstance(obj, PDFObjRef):
            # The PDF specification *requires* both the Pages
            # element of the catalog and the entries in Kids in
            # the page tree to be indirect references.
            object_id = obj.objid
        elif isinstance(obj, int):
            # Should not happen in a valid PDF, but probably does?
            log.warning("Page tree contains bare integer: %r in %r", obj, parent)
            object_id = obj
        else:
            # No object id can be derived from this node, so skip it;
            # falling through would use an unbound or stale object_id.
            log.warning("Page tree contains unknown object: %r", obj)
            continue

        # Avoid infinite loops by keeping track of visited nodes
        # (again, this should never actually happen in a valid PDF).
        # Checked before resolving so a repeated node is not fetched.
        if object_id in visited:
            log.warning("Circular reference %r in page tree", obj)
            continue
        visited.add(object_id)

        page_object = dict_value(self.getobj(object_id))

        # Propagate inheritable attributes
        object_properties = page_object.copy()
        for k, v in parent.items():
            if k in INHERITABLE_PAGE_ATTRS and k not in object_properties:
                object_properties[k] = v

        # Descend depth-first
        object_type = object_properties.get("Type")
        if object_type is None and not settings.STRICT:  # See #64
            object_type = object_properties.get("type")
        if object_type is LITERAL_PAGES and "Kids" in object_properties:
            log.debug("Pages: Kids=%r", object_properties["Kids"])
            # Push in reverse so the first kid is popped (visited) first.
            for child in reversed(list_value(object_properties["Kids"])):
                stack.append((child, object_properties))
        elif object_type is LITERAL_PAGE:
            log.debug("Page: %r", object_properties)
            yield object_id, object_properties

def lookup_name(self, cat: str, key: Union[str, bytes]) -> Any:
try:
names = dict_value(self.catalog["Names"])
Expand Down
10 changes: 10 additions & 0 deletions tests/test_pdfdocument.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,3 +36,13 @@ def test_page_labels():
with playa.open(TESTDIR / "contrib" / "pagelabels.pdf") as doc:
labels = [label for _, label in zip(range(10), doc.get_page_labels())]
assert labels == ["iii", "iv", "1", "2", "1", "2", "3", "4", "5", "6"]


def test_page_tree():
    """Walking the page tree of PSC_Station.pdf should find 15 pages."""
    pdf_path = TESTDIR / "contrib" / "PSC_Station.pdf"
    with playa.open(pdf_path) as doc:
        # Count lazily; only the number of yielded pages matters here.
        page_count = sum(1 for _ in doc.walk_page_tree())
        assert page_count == 15


def test_pages():
    """Placeholder test: currently checks nothing."""

0 comments on commit 13243dd

Please sign in to comment.