Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add various things desired by pdfplumber #9

Merged
merged 8 commits into from
Oct 1, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
51 changes: 41 additions & 10 deletions playa/converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@

from playa.layout import (
LAParams,
LTComponent,
LTChar,
LTCurve,
LTFigure,
Expand All @@ -21,7 +22,8 @@
from playa.pdfcolor import PDFColorSpace
from playa.pdfdevice import PDFTextDevice
from playa.pdffont import PDFFont, PDFUnicodeNotDefined
from playa.pdfinterp import PDFGraphicState, PDFResourceManager
from playa.pdfinterp import PDFGraphicState, PDFResourceManager, PDFStackT
from playa.psparser import PSLiteral
from playa.pdfpage import PDFPage
from playa.pdftypes import PDFStream
from playa.utils import (
Expand All @@ -30,6 +32,7 @@
Point,
Rect,
apply_matrix_pt,
decode_text,
mult_matrix,
)

Expand All @@ -39,6 +42,8 @@
class PDFLayoutAnalyzer(PDFTextDevice):
cur_item: LTLayoutContainer
ctm: Matrix
cur_mcid: Optional[int] = None
cur_tag: Optional[str] = None

def __init__(
self,
Expand Down Expand Up @@ -76,14 +81,32 @@ def end_figure(self, _: str) -> None:
self.cur_item = self._stack.pop()
self.cur_item.add(fig)

def begin_tag(self, tag: PSLiteral, props: Optional[PDFStackT] = None) -> None:
    """Record the tag being opened and the MCID found in its properties.

    Args:
        tag: Literal naming the tag; its ``name`` is run through
            ``decode_text`` and stored as ``cur_tag``.
        props: Optional properties for the tag.  An MCID is taken from
            it only when it is a ``dict`` with an ``"MCID"`` key.
    """
    self.cur_tag = decode_text(tag.name)
    mcid = None
    if isinstance(props, dict) and "MCID" in props:
        mcid = props["MCID"]
    # Anything else (no props, non-dict props, no "MCID" key) resets the MCID.
    self.cur_mcid = mcid

def end_tag(self) -> None:
    """Handle end of tag, clearing current tag and MCID."""
    # NOTE(review): the state is cleared outright rather than restored, so
    # once a nested tag closes, the enclosing tag/MCID is no longer tracked.
    # Confirm whether nested tags need to be supported here.
    self.cur_tag = None
    self.cur_mcid = None

def add_item(self, item: LTComponent) -> None:
    """Stamp *item* with the current MCID and tag, then add it to ``cur_item``."""
    item.mcid, item.tag = self.cur_mcid, self.cur_tag
    self.cur_item.add(item)

def render_image(self, name: str, stream: PDFStream) -> None:
    """Wrap an image stream in an ``LTImage`` and add it to the current figure.

    The image is given the bounding box (x0, y0, x1, y1) of the current
    item, which must be an ``LTFigure``.

    Args:
        name: Name passed through to ``LTImage``.
        stream: Stream passed through to ``LTImage``.

    Raises:
        AssertionError: If the current item is not an ``LTFigure``.
    """
    assert isinstance(self.cur_item, LTFigure), str(type(self.cur_item))
    figure = self.cur_item
    item = LTImage(
        name,
        stream,
        (figure.x0, figure.y0, figure.x1, figure.y1),
    )
    # Add the image exactly once, via add_item, so that it is also stamped
    # with the current tag and MCID.
    self.add_item(item)

def paint_path(
self,
Expand All @@ -92,6 +115,8 @@ def paint_path(
fill: bool,
evenodd: bool,
path: Sequence[PathSegment],
ncs: Optional[PDFColorSpace] = None,
scs: Optional[PDFColorSpace] = None,
) -> None:
"""Paint paths described in section 4.4 of the PDF reference manual"""
shape = "".join(x[0] for x in path)
Expand All @@ -109,7 +134,7 @@ def paint_path(
# recurse if there are multiple m's in this shape
for m in re.finditer(r"m[^m]+", shape):
subpath = path[m.start(0) : m.end(0)]
self.paint_path(gstate, stroke, fill, evenodd, subpath)
self.paint_path(gstate, stroke, fill, evenodd, subpath, ncs, scs)

else:
# Although the 'h' command does not literally provide a
Expand Down Expand Up @@ -153,8 +178,9 @@ def paint_path(
gstate.ncolor,
original_path=transformed_path,
dashing_style=gstate.dash,
ncs=ncs, scs=scs
)
self.cur_item.add(line)
self.add_item(line)

elif shape in {"mlllh", "mllll"}:
(x0, y0), (x1, y1), (x2, y2), (x3, y3), _ = pts
Expand All @@ -174,8 +200,9 @@ def paint_path(
gstate.ncolor,
transformed_path,
gstate.dash,
ncs, scs
)
self.cur_item.add(rect)
self.add_item(rect)
else:
curve = LTCurve(
gstate.linewidth,
Expand All @@ -187,8 +214,9 @@ def paint_path(
gstate.ncolor,
transformed_path,
gstate.dash,
ncs, scs
)
self.cur_item.add(curve)
self.add_item(curve)
else:
curve = LTCurve(
gstate.linewidth,
Expand All @@ -200,8 +228,9 @@ def paint_path(
gstate.ncolor,
transformed_path,
gstate.dash,
ncs, scs
)
self.cur_item.add(curve)
self.add_item(curve)

def render_char(
self,
Expand All @@ -211,8 +240,9 @@ def render_char(
scaling: float,
rise: float,
cid: int,
ncs: PDFColorSpace,
graphicstate: PDFGraphicState,
ncs: Optional[PDFColorSpace] = None,
scs: Optional[PDFColorSpace] = None,
) -> float:
try:
text = font.to_unichr(cid)
Expand All @@ -230,10 +260,11 @@ def render_char(
text,
textwidth,
textdisp,
ncs,
graphicstate,
ncs,
scs,
)
self.cur_item.add(item)
self.add_item(item)
return item.adv

def handle_undefined_char(self, font: PDFFont, cid: int) -> str:
Expand Down
50 changes: 27 additions & 23 deletions playa/data_structures.py
Original file line number Diff line number Diff line change
@@ -1,46 +1,50 @@
from typing import Any, Iterable, List, Optional, Tuple
from typing import Any, Dict, Iterator, List, Tuple

from playa import settings
from playa.pdfparser import PDFSyntaxError
from playa.pdftypes import dict_value, int_value, list_value
from playa.utils import choplist


def walk_number_tree(tree: Dict[str, Any]) -> Iterator[Tuple[int, Any]]:
    """Yield ``(number, value)`` pairs from a number tree, depth-first.

    Each node is resolved with ``dict_value``.  The entries of a node's
    ``Nums`` array are paired up and yielded first; the node's ``Kids``
    are then queued for visiting.

    Args:
        tree: Root node of the number tree.

    Yields:
        ``(int_value(key), value)`` for every key/value pair found.
    """
    pending = [tree]
    while pending:
        node = dict_value(pending.pop())
        if "Nums" in node:
            pairs = choplist(2, list_value(node["Nums"]))
            yield from ((int_value(key), value) for key, value in pairs)
        if "Kids" in node:
            # Nodes are popped from the end of the list, so queue the
            # kids in reverse to visit them in their listed order.
            children = list_value(node["Kids"])
            pending += reversed(children)


class NumberTree:
"""A PDF number tree.

See Section 3.8.6 of the PDF Reference.
See Section 7.9.7 of the PDF 1.7 Reference.
"""

def __init__(self, obj: Any):
self._obj = dict_value(obj)
self.nums: Optional[Iterable[Any]] = None
self.kids: Optional[Iterable[Any]] = None
self.limits: Optional[Iterable[Any]] = None

if "Nums" in self._obj:
self.nums = list_value(self._obj["Nums"])
if "Kids" in self._obj:
self.kids = list_value(self._obj["Kids"])
if "Limits" in self._obj:
self.limits = list_value(self._obj["Limits"])
def __iter__(self) -> Iterator[Tuple[int, Any]]:
    """Iterate over the tree's entries by delegating to ``walk_number_tree``."""
    return walk_number_tree(self._obj)

def _parse(self) -> List[Tuple[int, Any]]:
items = []
if self.nums: # Leaf node
for k, v in choplist(2, self.nums):
items.append((int_value(k), v))
def __contains__(self, num: object) -> bool:
    """Return True if *num* equals the number of any entry in the tree.

    Entries are scanned in iteration order and the scan stops at the
    first match.
    """
    return any(idx == num for idx, _ in self)

if self.kids: # Root or intermediate node
for child_ref in self.kids:
items += NumberTree(child_ref)._parse()

return items
def __getitem__(self, num) -> Any:
    """Return the value of the first entry whose number equals *num*.

    Raises:
        IndexError: If no entry in the tree has that number.
    """
    # A private sentinel distinguishes "no match" from a stored None.
    not_found = object()
    match = next((val for idx, val in self if idx == num), not_found)
    if match is not_found:
        raise IndexError(f"Number {num} not in tree")
    return match

@property
def values(self) -> List[Tuple[int, Any]]:
values = self._parse()

values = list(self)
# NOTE: They are supposed to be sorted! (but, I suppose, often aren't)
if settings.STRICT:
if not all(a[0] <= b[0] for a, b in zip(values, values[1:])):
raise PDFSyntaxError("Number tree elements are out of order")
Expand Down
19 changes: 18 additions & 1 deletion playa/layout.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,11 @@ def __repr__(self) -> str:
class LTItem:
"""Interface for things that can be analyzed"""

# Any item could be in a marked content section
mcid: Optional[int] = None
# Which could have a tag
tag: Optional[str] = None

def analyze(self, laparams: LAParams) -> None:
"""Perform the layout analysis."""

Expand Down Expand Up @@ -234,9 +239,13 @@ def __init__(
non_stroking_color: Optional[Color] = None,
original_path: Optional[List[PathSegment]] = None,
dashing_style: Optional[Tuple[object, object]] = None,
ncs: Optional[PDFColorSpace] = None,
scs: Optional[PDFColorSpace] = None,
) -> None:
LTComponent.__init__(self, get_bound(pts))
self.pts = pts
self.ncs = ncs
self.scs = scs
self.linewidth = linewidth
self.stroke = stroke
self.fill = fill
Expand Down Expand Up @@ -268,6 +277,8 @@ def __init__(
non_stroking_color: Optional[Color] = None,
original_path: Optional[List[PathSegment]] = None,
dashing_style: Optional[Tuple[object, object]] = None,
ncs: Optional[PDFColorSpace] = None,
scs: Optional[PDFColorSpace] = None,
) -> None:
LTCurve.__init__(
self,
Expand All @@ -280,6 +291,7 @@ def __init__(
non_stroking_color,
original_path,
dashing_style,
ncs, scs,
)


Expand All @@ -300,6 +312,8 @@ def __init__(
non_stroking_color: Optional[Color] = None,
original_path: Optional[List[PathSegment]] = None,
dashing_style: Optional[Tuple[object, object]] = None,
ncs: Optional[PDFColorSpace] = None,
scs: Optional[PDFColorSpace] = None,
) -> None:
(x0, y0, x1, y1) = bbox
LTCurve.__init__(
Expand All @@ -313,6 +327,7 @@ def __init__(
non_stroking_color,
original_path,
dashing_style,
ncs, scs,
)


Expand Down Expand Up @@ -365,14 +380,16 @@ def __init__(
text: str,
textwidth: float,
textdisp: Union[float, Tuple[Optional[float], float]],
ncs: PDFColorSpace,
graphicstate: PDFGraphicState,
ncs: Optional[PDFColorSpace] = None,
scs: Optional[PDFColorSpace] = None,
) -> None:
LTText.__init__(self)
self._text = text
self.matrix = matrix
self.fontname = font.fontname
self.ncs = ncs
self.scs = scs
self.graphicstate = graphicstate
self.adv = textwidth * fontsize * scaling
# compute the boundary rectangle.
Expand Down
Loading