Skip to content

Commit

Permalink
fix: make things work like the README says
Browse files Browse the repository at this point in the history
  • Loading branch information
dhdaines committed Nov 13, 2024
1 parent eac41fa commit ebcee56
Show file tree
Hide file tree
Showing 3 changed files with 40 additions and 13 deletions.
24 changes: 12 additions & 12 deletions playa/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@
KEYWORD_TRAILER,
KEYWORD_XREF,
LIT,
IndirectObject,
IndirectObjectParser,
Lexer,
ObjectParser,
Expand Down Expand Up @@ -852,14 +853,14 @@ def _initialize_password(self, password: str = "") -> None:
# Ensure that no extra data leaks into encrypted streams
self.parser.strict = True

def __iter__(self) -> Iterator[IndirectObject]:
    """Iterate over the document's `IndirectObject`s.

    Positions reported by the underlying parser are discarded;
    only the objects themselves are yielded.
    """
    # Construct the parser eagerly so errors surface at iteration start.
    parser = IndirectObjectParser(self.buffer, self)
    return (indobj for _pos, indobj in parser)

@property
def tokens(self) -> Iterator[Token]:
    """Iterate over the raw tokens of the document buffer.

    Lexer positions are dropped; only `Token` values are yielded.
    """
    lexer = Lexer(self.buffer)
    return (token for _pos, token in lexer)

def _getobj_objstm(
self, stream: ContentStream, index: int, objid: int
Expand Down Expand Up @@ -1220,9 +1221,9 @@ def __init__(self, doc: PDFDocument):
try:
page_labels: Iterable[Optional[str]] = doc.page_labels
except (KeyError, ValueError):
page_labels = itertools.count(1)
page_labels = (str(idx) for idx in itertools.count(1))
self._pages = []
self._labels = {}
self._labels: Dict[str, Page] = {}
try:
itor = doc.get_page_objects()
except KeyError:
Expand All @@ -1231,11 +1232,10 @@ def __init__(self, doc: PDFDocument):
page = Page(doc, objid, properties, label, page_idx)
self._pages.append(page)
if label is not None:
label_str = str(label)
if label_str in self._labels:
log.info("Duplicate page label %s at index %d", label_str, page_idx)
if label in self._labels:
log.info("Duplicate page label %s at index %d", label, page_idx)
else:
self._labels[str(label)] = page
self._labels[label] = page

def __len__(self) -> int:
    """Return the number of pages in the document."""
    page_count = len(self._pages)
    return page_count
Expand Down
14 changes: 14 additions & 0 deletions playa/page.py
Original file line number Diff line number Diff line change
Expand Up @@ -148,6 +148,20 @@ def __init__(
def layout(self) -> Iterator["Item"]:
    """Iterate over the layout `Item`s produced by interpreting this page."""
    interpreter = PageInterpreter(self)
    return iter(interpreter)

def __iter__(self) -> Iterator[PDFObject]:
    """Iterate over the `PDFObject`s parsed from this page's content streams.

    Parser positions are discarded; only the objects are yielded.
    """
    # Generator function: the ContentParser is not created until
    # iteration actually begins, matching the original's laziness.
    parser = ContentParser(self.contents)
    for _pos, pdfobj in parser:
        yield pdfobj

@property
def tokens(self) -> Iterator[Token]:
    """Iterate over the raw `Token`s of this page's content streams.

    `ContentParser.nexttoken` signals exhaustion by raising
    `StopIteration`; we catch it explicitly so it never escapes the
    generator (which PEP 479 would turn into a `RuntimeError`).
    """
    parser = ContentParser(self.contents)
    while True:
        try:
            _pos, token = parser.nexttoken()
        except StopIteration:
            break
        yield token

def __repr__(self) -> str:
    """Debug representation showing the page's resources and media box."""
    # %-formatting with !r-equivalent %r yields the identical string.
    return "<Page: Resources=%r, MediaBox=%r>" % (self.resources, self.mediabox)

Expand Down
15 changes: 14 additions & 1 deletion tests/test_document.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,9 @@ def test_read_header():

def test_tokens():
    """`doc.tokens` yields Token values directly (as the README promises)."""
    with playa.open(TESTDIR / "simple1.pdf") as doc:
        token_list = [tok for tok in doc.tokens]
    assert len(token_list) == 190
    assert LIT("Helvetica") in token_list


def test_objects():
Expand All @@ -44,6 +46,13 @@ def test_objects():
assert doc7["Type"] == LIT("Font")
doc1 = doc[1]
assert doc1["Type"] == LIT("Catalog")
objects = list(doc)
assert len(objects) == 7
# Note that they don't have to be in order
assert objects[0].obj == doc[1]
assert objects[2].obj == doc[3]
# FIXME: this should also be the case but is not as it gets reparsed:
# assert objects[0].obj is doc[1]


def test_page_labels():
Expand All @@ -65,6 +74,10 @@ def test_pages():
with playa.open(TESTDIR / "contrib" / "PSC_Station.pdf") as doc:
page_objects = list(doc.pages)
assert len(page_objects) == 15
objects = list(page_objects[2])
assert LIT("Artifact") in objects
tokens = list(page_objects[2].tokens)
assert b"diversit\xe9 " in tokens


def test_names():
Expand Down

0 comments on commit ebcee56

Please sign in to comment.