diff --git a/playa/pdfdocument.py b/playa/pdfdocument.py
index 64c9238..e67f335 100644
--- a/playa/pdfdocument.py
+++ b/playa/pdfdocument.py
@@ -42,6 +42,7 @@
 from playa.pdfparser import KEYWORD_XREF, PDFParser, PDFStreamParser
 from playa.pdftypes import (
     DecipherCallable,
+    PDFObjRef,
     PDFStream,
     decipher_all,
     dict_value,
@@ -68,7 +69,10 @@
 LITERAL_OBJSTM = LIT("ObjStm")
 LITERAL_XREF = LIT("XRef")
 LITERAL_CATALOG = LIT("Catalog")
+LITERAL_PAGE = LIT("Page")
+LITERAL_PAGES = LIT("Pages")
 KEYWORD_OBJ = KWD(b"obj")
+INHERITABLE_PAGE_ATTRS = {"Resources", "MediaBox", "CropBox", "Rotate"}
 
 
 class PDFBaseXRef:
@@ -907,6 +911,80 @@ def get_page_labels(self) -> Iterator[str]:
 
         return page_labels.labels
 
+    PageType = Dict[Any, Dict[Any, Any]]
+
+    def get_pages_from_xrefs(self) -> Iterator[Tuple[int, PageType]]:
+        """Find pages from the cross-reference tables if the page tree
+        is missing (note that this only happens in invalid PDFs, but
+        it happens.)
+
+        Returns an iterator over (objid, dict) pairs.
+        """
+        for xref in self.xrefs:
+            for object_id in xref.get_objids():
+                # Only the lookup itself is expected to fail with
+                # PDFObjectNotFound, so keep the try body that narrow.
+                try:
+                    obj = self.getobj(object_id)
+                except PDFObjectNotFound:
+                    continue
+                if isinstance(obj, dict) and obj.get("Type") is LITERAL_PAGE:
+                    yield object_id, obj
+
+    def walk_page_tree(self) -> Iterator[Tuple[int, PageType]]:
+        """Iterate over the flattened page tree in reading order, propagating
+        inheritable attributes. Returns an iterator over (objid, dict) pairs.
+
+        Nodes that are neither indirect references nor integers are
+        skipped with a warning, as are nodes that were already visited.
+
+        Will raise a KeyError if the catalog has no "Pages" entry.
+        """
+        stack = [(self.catalog["Pages"], self.catalog)]
+        visited = set()
+        while stack:
+            (obj, parent) = stack.pop()
+            if isinstance(obj, PDFObjRef):
+                # The PDF specification *requires* both the Pages
+                # element of the catalog and the entries in Kids in
+                # the page tree to be indirect references.
+                object_id = obj.objid
+            elif isinstance(obj, int):
+                # Should not happen in a valid PDF, but probably does?
+                log.warning("Page tree contains bare integer: %r in %r", obj, parent)
+                object_id = obj
+            else:
+                log.warning("Page tree contains unknown object: %r", obj)
+                # There is no object id to look up here, so skip the node
+                # instead of reusing a stale (or still unbound) object_id.
+                continue
+            page_object = dict_value(self.getobj(object_id))
+
+            # Avoid recursion errors by keeping track of visited nodes
+            # (again, this should never actually happen in a valid PDF)
+            if object_id in visited:
+                log.warning("Circular reference %r in page tree", obj)
+                continue
+            visited.add(object_id)
+
+            # Propagate inheritable attributes
+            object_properties = page_object.copy()
+            for k, v in parent.items():
+                if k in INHERITABLE_PAGE_ATTRS and k not in object_properties:
+                    object_properties[k] = v
+
+            # Recurse, depth-first
+            object_type = object_properties.get("Type")
+            if object_type is None and not settings.STRICT:  # See #64
+                object_type = object_properties.get("type")
+            if object_type is LITERAL_PAGES and "Kids" in object_properties:
+                log.debug("Pages: Kids=%r", object_properties["Kids"])
+                for child in reversed(list_value(object_properties["Kids"])):
+                    stack.append((child, object_properties))
+            elif object_type is LITERAL_PAGE:
+                log.debug("Page: %r", object_properties)
+                yield object_id, object_properties
+
     def lookup_name(self, cat: str, key: Union[str, bytes]) -> Any:
         try:
             names = dict_value(self.catalog["Names"])
diff --git a/tests/test_pdfdocument.py b/tests/test_pdfdocument.py
index 940847c..7169a8b 100644
--- a/tests/test_pdfdocument.py
+++ b/tests/test_pdfdocument.py
@@ -35,4 +35,14 @@ def test_read_header():
 def test_page_labels():
     with playa.open(TESTDIR / "contrib" / "pagelabels.pdf") as doc:
         labels = [label for _, label in zip(range(10), doc.get_page_labels())]
-        assert labels == ['iii', 'iv', '1', '2', '1', '2', '3', '4', '5', '6']
+        assert labels == ["iii", "iv", "1", "2", "1", "2", "3", "4", "5", "6"]
+
+
+def test_page_tree():
+    with playa.open(TESTDIR / "contrib" / "PSC_Station.pdf") as doc:
+        page_objects = list(doc.walk_page_tree())
+        assert len(page_objects) == 15
+
+
+def test_pages():
+    pass