Skip to content

Commit

Permalink
refactor!: page indices (0-based), PDFRemove PDFMore PDFPrefixes
Browse files Browse the repository at this point in the history
  • Loading branch information
dhdaines committed Oct 29, 2024
1 parent 8aaf9ab commit 3267b88
Show file tree
Hide file tree
Showing 5 changed files with 66 additions and 57 deletions.
24 changes: 15 additions & 9 deletions playa/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,14 +45,15 @@
PSException,
)
from playa.font import PDFCIDFont, PDFFont, PDFTrueTypeFont, PDFType1Font, PDFType3Font
from playa.page import PDFPage
from playa.page import Page
from playa.parser import (
KEYWORD_OBJ,
KEYWORD_TRAILER,
KEYWORD_XREF,
LIT,
ContentStreamParser,
PDFParser,
PSBaseParserToken,
PSLiteral,
literal_name,
)
Expand Down Expand Up @@ -723,7 +724,7 @@ class PDFDocument:
"""

_fp: Union[BinaryIO, None] = None
_pages: Union[List[PDFPage], None] = None
_pages: Union[List[Page], None] = None

def __enter__(self) -> "PDFDocument":
return self
Expand Down Expand Up @@ -838,9 +839,14 @@ def _initialize_password(self, password: str = "") -> None:
self.parser.fallback = False # need to read streams with exact length

def __iter__(self) -> Iterator[Tuple[int, object]]:
"""Iterate over positions and top-level PDF objects in the file."""
"""Iterate over (position, object) tuples, raising StopIteration at EOF."""
return self.parser

@property
def tokens(self) -> Iterator[Tuple[int, PSBaseParserToken]]:
"""Iterate over (position, token) tuples, raising StopIteration at EOF."""
return self.parser.tokens

def _getobj_objstm(self, stream: ContentStream, index: int, objid: int) -> object:
if stream.objid in self._parsed_objs:
(objs, n) = self._parsed_objs[stream.objid]
Expand Down Expand Up @@ -1064,7 +1070,7 @@ def get_page_objects(self) -> Iterator[Tuple[int, PageType]]:
# The PDF specification *requires* both the Pages
# element of the catalog and the entries in Kids in
# the page tree to be indirect references.
object_id = obj.objid
object_id = int(obj.objid)
elif isinstance(obj, int):
# Should not happen in a valid PDF, but probably does?
log.warning("Page tree contains bare integer: %r in %r", obj, parent)
Expand Down Expand Up @@ -1099,23 +1105,23 @@ def get_page_objects(self) -> Iterator[Tuple[int, PageType]]:
yield object_id, object_properties

@property
def pages(self) -> List[PDFPage]:
def pages(self) -> List[Page]:
if self._pages is None:
try:
page_labels: Iterator[Optional[str]] = self.page_labels
except PDFNoPageLabels:
page_labels = itertools.repeat(None)
try:
self._pages = [
PDFPage(self, objid, properties, label, page_number + 1)
for page_number, ((objid, properties), label) in enumerate(
Page(self, objid, properties, label, page_idx)
for page_idx, ((objid, properties), label) in enumerate(
zip(self.get_page_objects(), page_labels)
)
]
except PDFNoPageTree:
self._pages = [
PDFPage(self, objid, properties, label, page_number + 1)
for page_number, ((objid, properties), label) in enumerate(
Page(self, objid, properties, label, page_idx)
for page_idx, ((objid, properties), label) in enumerate(
zip(self.get_pages_from_xrefs(), page_labels)
)
]
Expand Down
45 changes: 23 additions & 22 deletions playa/page.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,47 +82,48 @@
PDFTextSeq = Iterable[Union[int, float, bytes]]


class PDFPage:
class Page:
"""An object that holds the information about a page.
A PDFPage object is merely a convenience class that has a set
A Page object is merely a convenience class that has a set
of keys and values, which describe the properties of a page
and point to its contents.
Attributes
----------
pageid: any Python object that can uniquely identify the page.
pageid: the integer object ID associated with the page in the page tree
attrs: a dictionary of page attributes.
contents: a list of ContentStream objects that represents the page content.
resources: a dictionary of resources used by the page.
mediabox: the physical size of the page.
cropbox: the crop rectangle of the page.
rotate: the page rotation (in degrees).
label: the page's label (typically, the logical page number).
page_number: the "physical" page number, indexed from 1.
"""

def __init__(
self,
doc: "PDFDocument",
pageid: object,
attrs: object,
pageid: int,
attrs: Dict,
label: Optional[str],
page_number: int = 1,
page_idx: int = 0,
) -> None:
"""Initialize a page object.
doc: a PDFDocument object.
pageid: any Python object that can uniquely identify the page.
pageid: the integer PDF object ID associated with the page in the page tree.
attrs: a dictionary of page attributes.
label: page label string.
page_number: page number (starting from 1)
page_idx: 0-based index of the page in the document.
"""
self.doc = weakref.ref(doc)
self.pageid = pageid
self.attrs = dict_value(attrs)
self.attrs = attrs
self.label = label
self.page_number = page_number
self.page_idx = page_idx
self.lastmod = resolve1(self.attrs.get("LastModified"))
self.resources: Dict[object, object] = resolve1(
self.attrs.get("Resources", dict()),
Expand Down Expand Up @@ -162,16 +163,16 @@ def layout(self) -> "LTPage":
if self._layout is not None:
return self._layout
device = PDFLayoutAnalyzer(
pageno=self.page_number,
page_idx=self.page_idx,
)
interpreter = PDFPageInterpreter(self.doc, device)
interpreter = PageInterpreter(self.doc, device)
interpreter.process_page(self)
assert device.result is not None
self._layout = device.result
return self._layout

def __repr__(self) -> str:
return f"<PDFPage: Resources={self.resources!r}, MediaBox={self.mediabox!r}>"
return f"<Page: Resources={self.resources!r}, MediaBox={self.mediabox!r}>"


class PDFTextState:
Expand Down Expand Up @@ -330,26 +331,26 @@ class PDFLayoutAnalyzer:

def __init__(
self,
pageno: int = 1,
page_idx: int = 0,
) -> None:
self.pageno = pageno
self.page_idx = page_idx
self._stack: List[LTLayoutContainer] = []
self.result: Optional[LTPage] = None

def set_ctm(self, ctm: Matrix) -> None:
self.ctm = ctm

def begin_page(self, page: PDFPage, ctm: Matrix) -> None:
def begin_page(self, page: Page, ctm: Matrix) -> None:
(x0, y0, x1, y1) = page.mediabox
(x0, y0) = apply_matrix_pt(ctm, (x0, y0))
(x1, y1) = apply_matrix_pt(ctm, (x1, y1))
mediabox = (0, 0, abs(x0 - x1), abs(y0 - y1))
self.cur_item = LTPage(self.pageno, mediabox)
self.cur_item = LTPage(self.page_idx, mediabox)

def end_page(self, page: PDFPage) -> None:
def end_page(self, page: Page) -> None:
assert not self._stack, str(len(self._stack))
assert isinstance(self.cur_item, LTPage), str(type(self.cur_item))
self.pageno += 1
self.page_idx += 1
self.receive_layout(self.cur_item)

def begin_figure(self, name: str, bbox: Rect, matrix: Matrix) -> None:
Expand Down Expand Up @@ -710,7 +711,7 @@ def receive_layout(self, ltpage: LTPage) -> None:
self.result = ltpage


class PDFPageInterpreter:
class PageInterpreter:
"""Processor for the content of a PDF page
Reference: PDF Reference, Appendix A, Operator Summary
Expand All @@ -722,7 +723,7 @@ def __init__(
self.doc = doc
self.device = device

def dup(self) -> "PDFPageInterpreter":
def dup(self) -> "PageInterpreter":
return self.__class__(self.doc, self.device)

def init_resources(self, resources: Dict[object, object]) -> None:
Expand Down Expand Up @@ -1330,7 +1331,7 @@ def do_Do(self, xobjid_arg: PDFStackT) -> None:
# unsupported xobject type.
pass

def process_page(self, page: PDFPage) -> None:
def process_page(self, page: Page) -> None:
log.debug("Processing page: %r", page)
(x0, y0, x1, y1) = page.mediabox
# FIXME: NO, this is bad, pdfplumber has a bug related to it
Expand Down
16 changes: 9 additions & 7 deletions playa/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -438,9 +438,14 @@ def __next__(self) -> PSStackEntry[ExtraT]:
return pos, obj

def __iter__(self) -> Iterator[PSStackEntry[ExtraT]]:
"""Iterate over objects, raising StopIteration at EOF."""
"""Iterate over (position, object) tuples, raising StopIteration at EOF."""
return self

@property
def tokens(self) -> Iterator[Tuple[int, PSBaseParserToken]]:
"""Iterate over (position, token) tuples, raising StopIteration at EOF."""
return self._lexer

# Delegation follows
def seek(self, pos: int) -> None:
"""Seek to a position and reset parser state."""
Expand Down Expand Up @@ -483,7 +488,6 @@ def nexttoken(self) -> Tuple[int, PSBaseParserToken]:
return next(self._lexer)


# PDFParser stack holds all the base types plus ContentStream, ObjRef, and None
class PDFParser(Parser[Union[PSKeyword, ContentStream, ObjRef, None]]):
"""PDFParser fetches PDF objects from a file stream.
It holds a weak reference to the document in order to
Expand Down Expand Up @@ -579,11 +583,9 @@ def do_keyword(self, pos: int, token: PSKeyword) -> None:


class ContentStreamParser(PDFParser):
"""StreamParser is used to parse PDF content streams
that is contained in each page and has instructions
for rendering the page. A reference to a PDF document is
needed because a PDF content stream can also have
indirect references to other objects in the same document.
"""StreamParser is used to parse PDF content streams and object
streams. These have slightly different rules for how objects are
described than the top-level PDF file contents.
"""

def __init__(self, data: bytes, doc: "PDFDocument") -> None:
Expand Down
28 changes: 14 additions & 14 deletions playa/structtree.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@

from playa.data_structures import NumberTree
from playa.exceptions import PDFNoStructTree
from playa.page import PDFPage
from playa.page import Page
from playa.parser import KEYWORD_NULL, PSLiteral
from playa.pdftypes import ObjRef, resolve1
from playa.utils import decode_text
Expand Down Expand Up @@ -102,7 +102,7 @@ class PDFStructElement(Findable):
alt_text: Union[str, None]
actual_text: Union[str, None]
title: Union[str, None]
page_number: Union[int, None]
page_idx: Union[int, None]
attributes: Dict[str, Any] = field(default_factory=dict)
mcids: List[int] = field(default_factory=list)
children: List["PDFStructElement"] = field(default_factory=list)
Expand All @@ -116,12 +116,12 @@ def all_mcids(self) -> Iterator[Tuple[Union[int, None], int]]:
"""
# Collect them depth-first to preserve ordering
for mcid in self.mcids:
yield self.page_number, mcid
yield self.page_idx, mcid
d = deque(self.children)
while d:
el = d.popleft()
for mcid in el.mcids:
yield el.page_number, mcid
yield el.page_idx, mcid
d.extendleft(reversed(el.children))

def to_dict(self) -> Dict[str, Any]:
Expand Down Expand Up @@ -153,17 +153,17 @@ class PDFStructTree(Findable):
Args:
doc: Document from which to extract structure tree
pages: List of (number, page) pairs - numbers will be used to
identify pages in the tree through the `page_number`
pages: Iterable of `Page` objects - their indices will be used to
identify pages in the tree through the `page_idx`
attribute of `PDFStructElement`.
"""

page: Union[PDFPage, None]
page: Union[Page, None]

def __init__(
self,
doc: "PDFDocument",
pages: Union[Iterable[PDFPage], None] = None,
pages: Union[Iterable[Page], None] = None,
):
if "StructTreeRoot" not in doc.catalog:
raise PDFNoStructTree("Catalog has no 'StructTreeRoot' entry")
Expand All @@ -174,11 +174,11 @@ def __init__(
self.page_dict: Dict[Any, Union[int, None]]

if pages is None:
self.page_dict = {page.pageid: page.page_number for page in doc.pages}
self.page_dict = {page.pageid: page.page_idx for page in doc.pages}
self._parse_struct_tree()
else:
pagelist = list(pages)
self.page_dict = {page.pageid: page.page_number for page in pagelist}
self.page_dict = {page.pageid: page.page_idx for page in pagelist}
parent_tree_obj = self.root.get("ParentTree")
# If we have a single page then we will work backwards from
# its ParentTree - this is because structure elements could
Expand Down Expand Up @@ -257,12 +257,12 @@ def _make_element(
# We hopefully caught these earlier
assert "MCID" not in obj, "Uncaught MCR: %s" % obj
assert "Obj" not in obj, "Uncaught OBJR: %s" % obj
# Get page number if necessary
page_number = None
# Get page index if necessary
page_idx = None
if self.page_dict is not None and "Pg" in obj:
page_objid = obj["Pg"].objid
assert page_objid in self.page_dict, "Object on unparsed page: %s" % obj
page_number = self.page_dict[page_objid]
page_idx = self.page_dict[page_objid]
obj_tag = ""
if "S" in obj:
obj_tag = decode_text(obj["S"].name)
Expand All @@ -285,7 +285,7 @@ def _make_element(
element = PDFStructElement(
type=obj_tag,
id=element_id,
page_number=page_number,
page_idx=page_idx,
revision=revision,
lang=lang,
title=title,
Expand Down
10 changes: 5 additions & 5 deletions tests/test_pdfstructtree.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,14 +64,14 @@ def test_all_mcids(self) -> None:
stree = PDFStructTree(pdf)
sect = next(stree.find_all("Sect"))
mcids = list(sect.all_mcids())
page_numbers = set(page for page, mcid in mcids)
assert 1 in page_numbers
assert 2 in page_numbers
page_indices = set(page for page, mcid in mcids)
assert 0 in page_indices
assert 1 in page_indices

stree = PDFStructTree(pdf, [pdf.pages[1]])
sect = next(stree.find_all("Sect"))
mcids = list(sect.all_mcids())
page_numbers = set(page for page, mcid in mcids)
assert page_numbers == {2}
page_indices = set(page for page, mcid in mcids)
assert page_indices == {1}
for p in sect.find_all("P"):
assert set(mcid for page, mcid in p.all_mcids()) == set(p.mcids)

0 comments on commit 3267b88

Please sign in to comment.