Skip to content

Commit

Permalink
feat: allow indexing pages by label like the README says
Browse files Browse the repository at this point in the history
  • Loading branch information
dhdaines committed Nov 12, 2024
1 parent 2aab556 commit c122e8f
Show file tree
Hide file tree
Showing 2 changed files with 41 additions and 22 deletions.
60 changes: 38 additions & 22 deletions playa/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -732,7 +732,7 @@ class PDFDocument:
"""

_fp: Union[BinaryIO, None] = None
_pages: Union[List[Page], None] = None
_pages: Union["PageList", None] = None

def __enter__(self) -> "PDFDocument":
return self
Expand Down Expand Up @@ -1062,7 +1062,7 @@ def get_page_objects(self) -> Iterator[Tuple[int, PageType]]:
"""Iterate over the flattened page tree in reading order, propagating
inheritable attributes. Returns an iterator over (objid, dict) pairs.
Will raise PDFNoPageTree if there is no page tree.
Will raise KeyError if there is no page tree.
"""
if "Pages" not in self.catalog:
raise KeyError("No 'Pages' entry in catalog")
Expand Down Expand Up @@ -1108,28 +1108,10 @@ def get_page_objects(self) -> Iterator[Tuple[int, PageType]]:
log.debug("Page: %r", object_properties)
yield object_id, object_properties

# FIXME: Make an object that can be indexed by int or str
@property
def pages(self) -> List[Page]:
def pages(self) -> "PageList":
if self._pages is None:
try:
page_labels: Iterator[Optional[str]] = self.page_labels
except (KeyError, ValueError):
page_labels = itertools.repeat(None)
try:
self._pages = [
Page(self, objid, properties, label, page_idx)
for page_idx, ((objid, properties), label) in enumerate(
zip(self.get_page_objects(), page_labels)
)
]
except KeyError:
self._pages = [
Page(self, objid, properties, label, page_idx)
for page_idx, ((objid, properties), label) in enumerate(
zip(self.get_pages_from_xrefs(), page_labels)
)
]
self._pages = PageList(self)
return self._pages

@property
Expand Down Expand Up @@ -1231,6 +1213,40 @@ def read_xref_from(
self.read_xref_from(pos, xrefs)


class PageList:
    """List of pages indexable by 0-based index or string label.

    Pages are built eagerly when the list is constructed.  Page objects
    come from ``doc.get_page_objects()``; if that raises ``KeyError``
    they are taken from ``doc.get_pages_from_xrefs()`` instead.  When
    several pages share the same label, the label refers to the first
    such page and a warning is logged for each later duplicate.
    """

    def __init__(self, doc: PDFDocument):
        """Build the page list and the label-to-page index for ``doc``."""
        try:
            page_labels: Iterable[Optional[str]] = doc.page_labels
        except (KeyError, ValueError):
            # No usable page labels: every page gets a label of None.
            page_labels = itertools.repeat(None)
        try:
            # get_page_objects() is a generator function, so its KeyError
            # is only raised once iteration starts.  Consume it inside
            # the try block so the fallback below can actually trigger.
            page_objects = list(doc.get_page_objects())
        except KeyError:
            page_objects = list(doc.get_pages_from_xrefs())
        self._pages = []
        self._labels = {}
        for page_idx, ((objid, properties), label) in enumerate(
            zip(page_objects, page_labels)
        ):
            page = Page(doc, objid, properties, label, page_idx)
            self._pages.append(page)
            if label is None:
                continue
            label_str = str(label)
            if label_str in self._labels:
                # Keep the first page seen for a given label.
                log.warning("Duplicate page label %s", label_str)
            else:
                self._labels[label_str] = page

    def __len__(self) -> int:
        """Return the number of pages."""
        return len(self._pages)

    def __iter__(self) -> Iterator[Page]:
        """Iterate over pages in document order."""
        return iter(self._pages)

    def __getitem__(self, key: Union[int, str]) -> Page:
        """Return a page by 0-based index (int) or by label (str).

        Raises IndexError for an out-of-range index and KeyError for an
        unknown label.
        """
        if isinstance(key, int):
            return self._pages[key]
        return self._labels[key]


class PageLabels(NumberTree):
"""PageLabels from the document catalog.
Expand Down
3 changes: 3 additions & 0 deletions tests/test_document.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,9 @@ def test_page_labels():
with playa.open(TESTDIR / "contrib" / "pagelabels.pdf") as doc:
labels = [label for _, label in zip(range(10), doc.page_labels)]
assert labels == ["iii", "iv", "1", "2", "1", "2", "3", "4", "5", "6"]
assert doc.pages["iii"] == doc.pages[0]
assert doc.pages["iv"] == doc.pages[1]
assert doc.pages["2"] == doc.pages[3]


def test_pages():
Expand Down

0 comments on commit c122e8f

Please sign in to comment.