Skip to content

Commit

Permalink
fix: actually expose structtree on page/document oups
Browse files Browse the repository at this point in the history
  • Loading branch information
dhdaines committed Nov 28, 2024
1 parent b0c2d20 commit 22c67ca
Show file tree
Hide file tree
Showing 4 changed files with 23 additions and 9 deletions.
6 changes: 6 additions & 0 deletions playa/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,7 @@
format_int_roman,
nunpack,
)
from playa.structtree import StructTree

log = logging.getLogger(__name__)

Expand Down Expand Up @@ -942,6 +943,11 @@ def layout(self) -> Iterator[LayoutDict]:
dic["page_label"] = page.label
yield dic

@property
def structtree(self) -> StructTree:
"""Return the PDF structure tree.

A new `StructTree` is constructed from this document on each
access; nothing is cached here.

The `StructTree` constructor raises `KeyError` when the document
catalog has no 'StructTreeRoot' entry, so accessing this property
on a PDF without a structure tree raises `KeyError`.
"""
return StructTree(self)

def _getobj_objstm(
self, stream: ContentStream, index: int, objid: int
) -> PDFObject:
Expand Down
9 changes: 9 additions & 0 deletions playa/page.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,7 @@
normalize_rect,
translate_matrix,
)
from playa.structtree import StructTree

if TYPE_CHECKING:
from playa.document import Document
Expand Down Expand Up @@ -243,6 +244,14 @@ def tokens(self) -> Iterator[Token]:
return
yield tok

@property
def structtree(self) -> StructTree:
"""Return the PDF structure tree.

The tree is built from the owning document with this page passed
as the only entry of the `pages` argument of `StructTree`.

Raises `RuntimeError` if the owning document is no longer
available. The `StructTree` constructor raises `KeyError` when the
document catalog has no 'StructTreeRoot' entry.
"""
# NOTE(review): `self.doc` looks like a weak reference to the owning
# Document (calling it may return None) -- confirm against the class.
doc = self.doc()
if doc is None:
raise RuntimeError("Document no longer exists!")
# A one-element tuple: only this page is handed to StructTree.
return StructTree(doc, (self,))

def __repr__(self) -> str:
"""Return a debug string showing the page's resources and media box."""
return f"<Page: Resources={self.resources!r}, MediaBox={self.mediabox!r}>"

Expand Down
6 changes: 3 additions & 3 deletions playa/structtree.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@
)

from playa.data_structures import NumberTree
from playa.page import Page
from playa.parser import KEYWORD_NULL, PSLiteral
from playa.pdftypes import ObjRef, resolve1
from playa.utils import decode_text
Expand All @@ -29,6 +28,7 @@

if TYPE_CHECKING:
from playa.document import Document
from playa.page import Page


MatchFunc = Callable[["StructElement"], bool]
Expand Down Expand Up @@ -152,12 +152,12 @@ class StructTree(Findable):
attribute of `StructElement`.
"""

page: Union[Page, None]
page: Union["Page", None]

def __init__(
self,
doc: "Document",
pages: Union[Iterable[Page], None] = None,
pages: Union[Iterable["Page"], None] = None,
):
if "StructTreeRoot" not in doc.catalog:
raise KeyError("Catalog has no 'StructTreeRoot' entry")
Expand Down
11 changes: 5 additions & 6 deletions tests/test_structtree.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
from pathlib import Path

import playa
from playa.structtree import StructTree

TESTDIR = Path(__file__).parent.parent / "samples"

Expand All @@ -13,7 +12,7 @@ class TestClass(unittest.TestCase):

def test_structure_tree_class(self) -> None:
with playa.open(TESTDIR / "image_structure.pdf") as pdf:
stree = StructTree(pdf, [pdf.pages[0]])
stree = pdf.pages[0].structtree
doc_elem = next(iter(stree))
assert [k.type for k in doc_elem] == ["P", "P", "Figure"]

Expand All @@ -22,7 +21,7 @@ def test_find_all_tree(self) -> None:
Test find_all() and find() on trees
"""
with playa.open(TESTDIR / "image_structure.pdf") as pdf:
stree = StructTree(pdf, [pdf.pages[0]])
stree = pdf.pages[0].structtree
figs = list(stree.find_all("Figure"))
assert len(figs) == 1
fig = stree.find("Figure")
Expand All @@ -44,7 +43,7 @@ def test_find_all_element(self) -> None:
Test find_all() and find() on elements
"""
with playa.open(TESTDIR / "pdf_structure.pdf") as pdf:
stree = StructTree(pdf)
stree = pdf.structtree
for list_elem in stree.find_all("L"):
items = list(list_elem.find_all("LI"))
assert items
Expand All @@ -61,14 +60,14 @@ def test_all_mcids(self) -> None:
"""
with playa.open(TESTDIR / "2023-06-20-PV.pdf") as pdf:
# Make sure we can get them with page numbers
stree = StructTree(pdf)
stree = pdf.structtree
sect = next(stree.find_all("Sect"))
mcids = list(sect.all_mcids())
page_indices = set(page for page, mcid in mcids)
assert 0 in page_indices
assert 1 in page_indices

stree = StructTree(pdf, [pdf.pages[1]])
stree = pdf.pages[1].structtree
sect = next(stree.find_all("Sect"))
mcids = list(sect.all_mcids())
page_indices = set(page for page, mcid in mcids)
Expand Down

0 comments on commit 22c67ca

Please sign in to comment.