diff --git a/playa/pdfdocument.py b/playa/pdfdocument.py index a8e3cde..ed609a5 100644 --- a/playa/pdfdocument.py +++ b/playa/pdfdocument.py @@ -800,6 +800,7 @@ class PDFDocument: """ _fp: Union[BinaryIO, None] = None + _pages: Union[List[PDFPage], None] = None def __enter__(self) -> "PDFDocument": return self @@ -1118,24 +1119,23 @@ def get_page_objects(self) -> Iterator[Tuple[int, PageType]]: yield object_id, object_properties @property - def pages(self) -> Iterator[PDFPage]: - """Iterator over PDFPage objects, which contain - information about the pages in the document. - """ - try: - page_labels: Iterator[Optional[str]] = self.page_labels - except PDFNoPageLabels: - page_labels = itertools.repeat(None) - try: - for page_number, ((objid, properties), label) in enumerate( - zip(self.get_page_objects(), page_labels) - ): - yield PDFPage(self, objid, properties, label, page_number + 1) - except PDFNoPageTree: - for page_number, ((objid, properties), label) in enumerate( - zip(self.get_pages_from_xrefs(), page_labels) - ): - yield PDFPage(self, objid, properties, label, page_number + 1) + def pages(self) -> List[PDFPage]: + if self._pages is None: + try: + page_labels: Iterator[Optional[str]] = self.page_labels + except PDFNoPageLabels: + page_labels = itertools.repeat(None) + try: + self._pages = [PDFPage(self, objid, properties, label, page_number + 1) + for page_number, ((objid, properties), label) in enumerate( + zip(self.get_page_objects(), page_labels) + )] + except PDFNoPageTree: + self._pages = [PDFPage(self, objid, properties, label, page_number + 1) + for page_number, ((objid, properties), label) in enumerate( + zip(self.get_pages_from_xrefs(), page_labels) + )] + return self._pages @property def names(self) -> Dict[str, Any]: diff --git a/playa/pdfstructtree.py b/playa/pdfstructtree.py index 99db93b..5f1c69e 100644 --- a/playa/pdfstructtree.py +++ b/playa/pdfstructtree.py @@ -164,7 +164,7 @@ class PDFStructTree(Findable): def __init__( self, doc: "PDFDocument", - pages: Union[Iterable[Tuple[Union[int, None], PDFPage]], None] = None, + pages: Union[Iterable[PDFPage], None] = None, ): if "StructTreeRoot" not in doc.catalog: raise PDFNoStructTree("Catalog has no 'StructTreeRoot' entry") @@ -175,22 +175,18 @@ def __init__( self.page_dict: Dict[Any, Union[int, None]] if pages is None: - self.page_dict = { - page.pageid: idx + 1 for idx, page in enumerate(doc.pages) - } + self.page_dict = {page.pageid: page.page_number for page in doc.pages} self._parse_struct_tree() else: pagelist = list(pages) - self.page_dict = { - page.pageid: page_number for page_number, page in pagelist - } + self.page_dict = {page.pageid: page.page_number for page in pagelist} parent_tree_obj = self.root.get("ParentTree") # If we have a single page then we will work backwards from # its ParentTree - this is because structure elements could # span multiple pages, and the "Pg" attribute is *optional*, # so this is the approved way to get a page's structure... if len(pagelist) == 1 and parent_tree_obj is not None: - _, page = pagelist[0] + page = pagelist[0] parent_tree = NumberTree(parent_tree_obj) # If there is no marked content in the structure tree for # this page (which can happen even when there is a diff --git a/tests/test_open.py b/tests/test_open.py index ddf82fe..fe67b61 100644 --- a/tests/test_open.py +++ b/tests/test_open.py @@ -46,13 +46,12 @@ def test_inline_data() -> None: # The necessary mocking would be useless considering that I will # shortly demolish these redundant and confusing APIs. with playa.open(TESTDIR / "contrib" / "issue-1008-inline-ascii85.pdf") as doc: - page = next(doc.pages) - _ = page.layout + _ = doc.pages[0].layout def test_multiple_contents() -> None: with playa.open(TESTDIR / "jo.pdf") as doc: - page = next(doc.pages) + page = doc.pages[0] assert len(page.contents) > 1 _ = page.layout diff --git a/tests/test_pdfstructtree.py b/tests/test_pdfstructtree.py index c781f65..ae5fe15 100644 --- a/tests/test_pdfstructtree.py +++ b/tests/test_pdfstructtree.py @@ -13,7 +13,7 @@ class TestClass(unittest.TestCase): def test_structure_tree_class(self) -> None: with playa.open(TESTDIR / "image_structure.pdf") as pdf: - stree = PDFStructTree(pdf, [(1, next(pdf.pages))]) + stree = PDFStructTree(pdf, [pdf.pages[0]]) doc_elem = next(iter(stree)) assert [k.type for k in doc_elem] == ["P", "P", "Figure"] @@ -22,7 +22,7 @@ def test_find_all_tree(self) -> None: Test find_all() and find() on trees """ with playa.open(TESTDIR / "image_structure.pdf") as pdf: - stree = PDFStructTree(pdf, [(1, next(pdf.pages))]) + stree = PDFStructTree(pdf, [pdf.pages[0]]) figs = list(stree.find_all("Figure")) assert len(figs) == 1 fig = stree.find("Figure") @@ -68,8 +68,7 @@ def test_all_mcids(self) -> None: assert 1 in page_numbers assert 2 in page_numbers - pages = list(pdf.pages) - stree = PDFStructTree(pdf, [(2, pages[1])]) + stree = PDFStructTree(pdf, [pdf.pages[1]]) sect = next(stree.find_all("Sect")) mcids = list(sect.all_mcids()) page_numbers = set(page for page, mcid in mcids)