Skip to content

Commit

Permalink
fix: simplify pages api a bit
Browse files Browse the repository at this point in the history
  • Loading branch information
dhdaines committed Oct 26, 2024
1 parent 5585e9e commit 6f80c3d
Show file tree
Hide file tree
Showing 4 changed files with 27 additions and 33 deletions.
36 changes: 18 additions & 18 deletions playa/pdfdocument.py
Original file line number Diff line number Diff line change
Expand Up @@ -800,6 +800,7 @@ class PDFDocument:
"""

_fp: Union[BinaryIO, None] = None
_pages: Union[List[PDFPage], None] = None

def __enter__(self) -> "PDFDocument":
return self
Expand Down Expand Up @@ -1118,24 +1119,23 @@ def get_page_objects(self) -> Iterator[Tuple[int, PageType]]:
yield object_id, object_properties

@property
def pages(self) -> Iterator[PDFPage]:
"""Iterate over PDFPage objects, which contain
information about the pages in the document.

Each page object is paired with a label taken from `page_labels`
(or with None when `page_labels` raises PDFNoPageLabels) and with
a position counted from 1.  Page objects come from
`get_page_objects()`; if PDFNoPageTree is raised while iterating
them, `get_pages_from_xrefs()` is used instead.
"""
# Without labels, substitute an endless stream of None so that the
# zip() calls below are bounded only by the page objects.
try:
page_labels: Iterator[Optional[str]] = self.page_labels
except PDFNoPageLabels:
page_labels = itertools.repeat(None)
try:
# enumerate() counts from 0, hence the `+ 1` passed to PDFPage.
for page_number, ((objid, properties), label) in enumerate(
zip(self.get_page_objects(), page_labels)
):
yield PDFPage(self, objid, properties, label, page_number + 1)
except PDFNoPageTree:
# Same construction as above, with get_pages_from_xrefs() as the
# source of page objects; the same `page_labels` iterator is reused.
for page_number, ((objid, properties), label) in enumerate(
zip(self.get_pages_from_xrefs(), page_labels)
):
yield PDFPage(self, objid, properties, label, page_number + 1)
def pages(self) -> List[PDFPage]:
    """Return the pages of the document as a list of PDFPage objects.

    The list is built on the first access, stored in `_pages`, and
    returned as-is on every later access.

    Each page object is paired with a label taken from `page_labels`
    (None when `page_labels` raises PDFNoPageLabels) and with a
    position counted from 1.  Page objects come from
    `get_page_objects()`; if PDFNoPageTree is raised while building
    the list, `get_pages_from_xrefs()` is used instead.
    """
    # Already built on an earlier access: hand back the stored list.
    if self._pages is not None:
        return self._pages

    # Without labels, substitute an endless stream of None so that
    # zip() below is bounded only by the page objects.
    try:
        labels: Iterator[Optional[str]] = self.page_labels
    except PDFNoPageLabels:
        labels = itertools.repeat(None)

    def build(page_objects) -> List[PDFPage]:
        # Pair each (objid, properties) with the next label and a
        # position starting at 1.
        numbered = enumerate(zip(page_objects, labels), start=1)
        return [
            PDFPage(self, objid, properties, label, position)
            for position, ((objid, properties), label) in numbered
        ]

    try:
        self._pages = build(self.get_page_objects())
    except PDFNoPageTree:
        # Same `labels` iterator is reused for the fallback source.
        self._pages = build(self.get_pages_from_xrefs())
    return self._pages

@property
def names(self) -> Dict[str, Any]:
Expand Down
12 changes: 4 additions & 8 deletions playa/pdfstructtree.py
Original file line number Diff line number Diff line change
Expand Up @@ -164,7 +164,7 @@ class PDFStructTree(Findable):
def __init__(
self,
doc: "PDFDocument",
pages: Union[Iterable[Tuple[Union[int, None], PDFPage]], None] = None,
pages: Union[Iterable[PDFPage], None] = None,
):
if "StructTreeRoot" not in doc.catalog:
raise PDFNoStructTree("Catalog has no 'StructTreeRoot' entry")
Expand All @@ -175,22 +175,18 @@ def __init__(
self.page_dict: Dict[Any, Union[int, None]]

if pages is None:
self.page_dict = {
page.pageid: idx + 1 for idx, page in enumerate(doc.pages)
}
self.page_dict = {page.pageid: page.page_number for page in doc.pages}
self._parse_struct_tree()
else:
pagelist = list(pages)
self.page_dict = {
page.pageid: page_number for page_number, page in pagelist
}
self.page_dict = {page.pageid: page.page_number for page in pagelist}
parent_tree_obj = self.root.get("ParentTree")
# If we have a single page then we will work backwards from
# its ParentTree - this is because structure elements could
# span multiple pages, and the "Pg" attribute is *optional*,
# so this is the approved way to get a page's structure...
if len(pagelist) == 1 and parent_tree_obj is not None:
_, page = pagelist[0]
page = pagelist[0]
parent_tree = NumberTree(parent_tree_obj)
# If there is no marked content in the structure tree for
# this page (which can happen even when there is a
Expand Down
5 changes: 2 additions & 3 deletions tests/test_open.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,13 +46,12 @@ def test_inline_data() -> None:
# The necessary mocking would be useless considering that I will
# shortly demolish these redundant and confusing APIs.
with playa.open(TESTDIR / "contrib" / "issue-1008-inline-ascii85.pdf") as doc:
page = next(doc.pages)
_ = page.layout
_ = doc.pages[0].layout


def test_multiple_contents() -> None:
"""Open jo.pdf and check that its first page has more than one
entry in `contents`, then access the page's `layout`."""
with playa.open(TESTDIR / "jo.pdf") as doc:
# NOTE(review): `page` is bound twice in a row here; the second
# binding (indexing into `doc.pages`) is the one that stays in effect.
page = next(doc.pages)
page = doc.pages[0]
assert len(page.contents) > 1
# The value is discarded; presumably this only checks that the
# access does not raise.
_ = page.layout

Expand Down
7 changes: 3 additions & 4 deletions tests/test_pdfstructtree.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ class TestClass(unittest.TestCase):

def test_structure_tree_class(self) -> None:
"""Build a PDFStructTree for the first page of image_structure.pdf
and check the types found when iterating its first element."""
with playa.open(TESTDIR / "image_structure.pdf") as pdf:
# NOTE(review): `stree` is bound twice in a row here; the second
# binding (a plain list of pages) is the one that stays in effect.
stree = PDFStructTree(pdf, [(1, next(pdf.pages))])
stree = PDFStructTree(pdf, [pdf.pages[0]])
# First item yielded by the tree; its members are expected to be
# two "P" followed by one "Figure".
doc_elem = next(iter(stree))
assert [k.type for k in doc_elem] == ["P", "P", "Figure"]

Expand All @@ -22,7 +22,7 @@ def test_find_all_tree(self) -> None:
Test find_all() and find() on trees
"""
with playa.open(TESTDIR / "image_structure.pdf") as pdf:
stree = PDFStructTree(pdf, [(1, next(pdf.pages))])
stree = PDFStructTree(pdf, [pdf.pages[0]])
figs = list(stree.find_all("Figure"))
assert len(figs) == 1
fig = stree.find("Figure")
Expand Down Expand Up @@ -68,8 +68,7 @@ def test_all_mcids(self) -> None:
assert 1 in page_numbers
assert 2 in page_numbers

pages = list(pdf.pages)
stree = PDFStructTree(pdf, [(2, pages[1])])
stree = PDFStructTree(pdf, [pdf.pages[1]])
sect = next(stree.find_all("Sect"))
mcids = list(sect.all_mcids())
page_numbers = set(page for page, mcid in mcids)
Expand Down

0 comments on commit 6f80c3d

Please sign in to comment.