diff --git a/playa/document.py b/playa/document.py index acf4bc3..3f1da79 100644 --- a/playa/document.py +++ b/playa/document.py @@ -45,7 +45,7 @@ PSException, ) from playa.font import PDFCIDFont, PDFFont, PDFTrueTypeFont, PDFType1Font, PDFType3Font -from playa.page import PDFPage +from playa.page import Page from playa.parser import ( KEYWORD_OBJ, KEYWORD_TRAILER, @@ -53,6 +53,7 @@ LIT, ContentStreamParser, PDFParser, + PSBaseParserToken, PSLiteral, literal_name, ) @@ -723,7 +724,7 @@ class PDFDocument: """ _fp: Union[BinaryIO, None] = None - _pages: Union[List[PDFPage], None] = None + _pages: Union[List[Page], None] = None def __enter__(self) -> "PDFDocument": return self @@ -838,9 +839,14 @@ def _initialize_password(self, password: str = "") -> None: self.parser.fallback = False # need to read streams with exact length def __iter__(self) -> Iterator[Tuple[int, object]]: - """Iterate over positions and top-level PDF objects in the file.""" + """Iterate over (position, object) tuples, raising StopIteration at EOF.""" return self.parser + @property + def tokens(self) -> Iterator[Tuple[int, PSBaseParserToken]]: + """Iterate over (position, token) tuples, raising StopIteration at EOF.""" + return self.parser.tokens + def _getobj_objstm(self, stream: ContentStream, index: int, objid: int) -> object: if stream.objid in self._parsed_objs: (objs, n) = self._parsed_objs[stream.objid] @@ -1064,7 +1070,7 @@ def get_page_objects(self) -> Iterator[Tuple[int, PageType]]: # The PDF specification *requires* both the Pages # element of the catalog and the entries in Kids in # the page tree to be indirect references. - object_id = obj.objid + object_id = int(obj.objid) elif isinstance(obj, int): # Should not happen in a valid PDF, but probably does? 
log.warning("Page tree contains bare integer: %r in %r", obj, parent) @@ -1099,7 +1105,7 @@ def get_page_objects(self) -> Iterator[Tuple[int, PageType]]: yield object_id, object_properties @property - def pages(self) -> List[PDFPage]: + def pages(self) -> List[Page]: if self._pages is None: try: page_labels: Iterator[Optional[str]] = self.page_labels @@ -1107,15 +1113,15 @@ def pages(self) -> List[PDFPage]: page_labels = itertools.repeat(None) try: self._pages = [ - PDFPage(self, objid, properties, label, page_number + 1) - for page_number, ((objid, properties), label) in enumerate( + Page(self, objid, properties, label, page_idx) + for page_idx, ((objid, properties), label) in enumerate( zip(self.get_page_objects(), page_labels) ) ] except PDFNoPageTree: self._pages = [ - PDFPage(self, objid, properties, label, page_number + 1) - for page_number, ((objid, properties), label) in enumerate( + Page(self, objid, properties, label, page_idx) + for page_idx, ((objid, properties), label) in enumerate( zip(self.get_pages_from_xrefs(), page_labels) ) ] diff --git a/playa/page.py b/playa/page.py index 42f8a80..c3910b8 100644 --- a/playa/page.py +++ b/playa/page.py @@ -82,16 +82,16 @@ PDFTextSeq = Iterable[Union[int, float, bytes]] -class PDFPage: +class Page: """An object that holds the information about a page. - A PDFPage object is merely a convenience class that has a set + A Page object is merely a convenience class that has a set of keys and values, which describe the properties of a page and point to its contents. Attributes ---------- - pageid: any Python object that can uniquely identify the page. + pageid: the integer object ID associated with the page in the page tree attrs: a dictionary of page attributes. contents: a list of ContentStream objects that represents the page content. resources: a dictionary of resources used by the page. @@ -99,30 +99,31 @@ class PDFPage: cropbox: the crop rectangle of the page. rotate: the page rotation (in degree). 
label: the page's label (typically, the logical page number). + page_idx: 0-based index of the page in the document. """ def __init__( self, doc: "PDFDocument", - pageid: object, - attrs: object, + pageid: int, + attrs: Dict, label: Optional[str], - page_number: int = 1, + page_idx: int = 0, ) -> None: """Initialize a page object. doc: a PDFDocument object. - pageid: any Python object that can uniquely identify the page. + pageid: the integer PDF object ID associated with the page in the page tree. attrs: a dictionary of page attributes. label: page label string. - page_number: page number (starting from 1) + page_idx: 0-based index of the page in the document. """ self.doc = weakref.ref(doc) self.pageid = pageid - self.attrs = dict_value(attrs) + self.attrs = attrs self.label = label - self.page_number = page_number + self.page_idx = page_idx self.lastmod = resolve1(self.attrs.get("LastModified")) self.resources: Dict[object, object] = resolve1( self.attrs.get("Resources", dict()), @@ -162,16 +163,16 @@ def layout(self) -> "LTPage": if self._layout is not None: return self._layout device = PDFLayoutAnalyzer( - pageno=self.page_number, + page_idx=self.page_idx, ) - interpreter = PDFPageInterpreter(self.doc, device) + interpreter = PageInterpreter(self.doc, device) interpreter.process_page(self) assert device.result is not None self._layout = device.result return self._layout def __repr__(self) -> str: - return f"" + return f"" class PDFTextState: @@ -330,26 +331,26 @@ class PDFLayoutAnalyzer: def __init__( self, - pageno: int = 1, + page_idx: int = 0, ) -> None: - self.pageno = pageno + self.page_idx = page_idx self._stack: List[LTLayoutContainer] = [] self.result: Optional[LTPage] = None def set_ctm(self, ctm: Matrix) -> None: self.ctm = ctm - def begin_page(self, page: PDFPage, ctm: Matrix) -> None: + def begin_page(self, page: Page, ctm: Matrix) -> None: (x0, y0, x1, y1) = page.mediabox (x0, y0) = apply_matrix_pt(ctm, (x0, y0)) (x1, y1) = 
apply_matrix_pt(ctm, (x1, y1)) mediabox = (0, 0, abs(x0 - x1), abs(y0 - y1)) - self.cur_item = LTPage(self.pageno, mediabox) + self.cur_item = LTPage(self.page_idx, mediabox) - def end_page(self, page: PDFPage) -> None: + def end_page(self, page: Page) -> None: assert not self._stack, str(len(self._stack)) assert isinstance(self.cur_item, LTPage), str(type(self.cur_item)) - self.pageno += 1 + self.page_idx += 1 self.receive_layout(self.cur_item) def begin_figure(self, name: str, bbox: Rect, matrix: Matrix) -> None: @@ -710,7 +711,7 @@ def receive_layout(self, ltpage: LTPage) -> None: self.result = ltpage -class PDFPageInterpreter: +class PageInterpreter: """Processor for the content of a PDF page Reference: PDF Reference, Appendix A, Operator Summary @@ -722,7 +723,7 @@ def __init__( self.doc = doc self.device = device - def dup(self) -> "PDFPageInterpreter": + def dup(self) -> "PageInterpreter": return self.__class__(self.doc, self.device) def init_resources(self, resources: Dict[object, object]) -> None: @@ -1330,7 +1331,7 @@ def do_Do(self, xobjid_arg: PDFStackT) -> None: # unsupported xobject type. 
pass - def process_page(self, page: PDFPage) -> None: + def process_page(self, page: Page) -> None: log.debug("Processing page: %r", page) (x0, y0, x1, y1) = page.mediabox # FIXME: NO, this is bad, pdfplumber has a bug related to it diff --git a/playa/parser.py b/playa/parser.py index 96999fb..8bed850 100644 --- a/playa/parser.py +++ b/playa/parser.py @@ -438,9 +438,14 @@ def __next__(self) -> PSStackEntry[ExtraT]: return pos, obj def __iter__(self) -> Iterator[PSStackEntry[ExtraT]]: - """Iterate over objects, raising StopIteration at EOF.""" + """Iterate over (position, object) tuples, raising StopIteration at EOF.""" return self + @property + def tokens(self) -> Iterator[Tuple[int, PSBaseParserToken]]: + """Iterate over (position, token) tuples, raising StopIteration at EOF.""" + return self._lexer + # Delegation follows def seek(self, pos: int) -> None: """Seek to a position and reset parser state.""" @@ -483,7 +488,6 @@ def nexttoken(self) -> Tuple[int, PSBaseParserToken]: return next(self._lexer) -# PDFParser stack holds all the base types plus ContentStream, ObjRef, and None class PDFParser(Parser[Union[PSKeyword, ContentStream, ObjRef, None]]): """PDFParser fetches PDF objects from a file stream. It holds a weak reference to the document in order to @@ -579,11 +583,9 @@ def do_keyword(self, pos: int, token: PSKeyword) -> None: class ContentStreamParser(PDFParser): - """StreamParser is used to parse PDF content streams - that is contained in each page and has instructions - for rendering the page. A reference to a PDF document is - needed because a PDF content stream can also have - indirect references to other objects in the same document. + """ContentStreamParser is used to parse PDF content streams and object + streams. These have slightly different rules for how objects are + described than the top-level PDF file contents. 
""" def __init__(self, data: bytes, doc: "PDFDocument") -> None: diff --git a/playa/structtree.py b/playa/structtree.py index 016f0ef..39f0750 100644 --- a/playa/structtree.py +++ b/playa/structtree.py @@ -17,7 +17,7 @@ from playa.data_structures import NumberTree from playa.exceptions import PDFNoStructTree -from playa.page import PDFPage +from playa.page import Page from playa.parser import KEYWORD_NULL, PSLiteral from playa.pdftypes import ObjRef, resolve1 from playa.utils import decode_text @@ -102,7 +102,7 @@ class PDFStructElement(Findable): alt_text: Union[str, None] actual_text: Union[str, None] title: Union[str, None] - page_number: Union[int, None] + page_idx: Union[int, None] attributes: Dict[str, Any] = field(default_factory=dict) mcids: List[int] = field(default_factory=list) children: List["PDFStructElement"] = field(default_factory=list) @@ -116,12 +116,12 @@ def all_mcids(self) -> Iterator[Tuple[Union[int, None], int]]: """ # Collect them depth-first to preserve ordering for mcid in self.mcids: - yield self.page_number, mcid + yield self.page_idx, mcid d = deque(self.children) while d: el = d.popleft() for mcid in el.mcids: - yield el.page_number, mcid + yield el.page_idx, mcid d.extendleft(reversed(el.children)) def to_dict(self) -> Dict[str, Any]: @@ -153,17 +153,17 @@ class PDFStructTree(Findable): Args: doc: Document from which to extract structure tree - pages: List of (number, page) pairs - numbers will be used to - identify pages in the tree through the `page_number` + pages: Iterable of `Page` objects - their indices are used to + identify pages in the tree through the `page_idx` attribute of `PDFStructElement`. 
""" - page: Union[PDFPage, None] + page: Union[Page, None] def __init__( self, doc: "PDFDocument", - pages: Union[Iterable[PDFPage], None] = None, + pages: Union[Iterable[Page], None] = None, ): if "StructTreeRoot" not in doc.catalog: raise PDFNoStructTree("Catalog has no 'StructTreeRoot' entry") @@ -174,11 +174,11 @@ def __init__( self.page_dict: Dict[Any, Union[int, None]] if pages is None: - self.page_dict = {page.pageid: page.page_number for page in doc.pages} + self.page_dict = {page.pageid: page.page_idx for page in doc.pages} self._parse_struct_tree() else: pagelist = list(pages) - self.page_dict = {page.pageid: page.page_number for page in pagelist} + self.page_dict = {page.pageid: page.page_idx for page in pagelist} parent_tree_obj = self.root.get("ParentTree") # If we have a single page then we will work backwards from # its ParentTree - this is because structure elements could @@ -257,12 +257,12 @@ def _make_element( # We hopefully caught these earlier assert "MCID" not in obj, "Uncaught MCR: %s" % obj assert "Obj" not in obj, "Uncaught OBJR: %s" % obj - # Get page number if necessary - page_number = None + # Get page index if necessary + page_idx = None if self.page_dict is not None and "Pg" in obj: page_objid = obj["Pg"].objid assert page_objid in self.page_dict, "Object on unparsed page: %s" % obj - page_number = self.page_dict[page_objid] + page_idx = self.page_dict[page_objid] obj_tag = "" if "S" in obj: obj_tag = decode_text(obj["S"].name) @@ -285,7 +285,7 @@ def _make_element( element = PDFStructElement( type=obj_tag, id=element_id, - page_number=page_number, + page_idx=page_idx, revision=revision, lang=lang, title=title, diff --git a/tests/test_pdfstructtree.py b/tests/test_pdfstructtree.py index 8a45d51..90eb37c 100644 --- a/tests/test_pdfstructtree.py +++ b/tests/test_pdfstructtree.py @@ -64,14 +64,14 @@ def test_all_mcids(self) -> None: stree = PDFStructTree(pdf) sect = next(stree.find_all("Sect")) mcids = list(sect.all_mcids()) - page_numbers 
= set(page for page, mcid in mcids) - assert 1 in page_numbers - assert 2 in page_numbers + page_indices = set(page for page, mcid in mcids) + assert 0 in page_indices + assert 1 in page_indices stree = PDFStructTree(pdf, [pdf.pages[1]]) sect = next(stree.find_all("Sect")) mcids = list(sect.all_mcids()) - page_numbers = set(page for page, mcid in mcids) - assert page_numbers == {2} + page_indices = set(page for page, mcid in mcids) + assert page_indices == {1} for p in sect.find_all("P"): assert set(mcid for page, mcid in p.all_mcids()) == set(p.mcids)