Skip to content

Commit

Permalink
fix: robustness to more pdf.js test cases
Browse files Browse the repository at this point in the history
  • Loading branch information
dhdaines committed Dec 2, 2024
1 parent ea774c1 commit eecbd51
Show file tree
Hide file tree
Showing 3 changed files with 22 additions and 29 deletions.
2 changes: 1 addition & 1 deletion playa/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -853,7 +853,7 @@ def __init__(
try:
pos = self._find_xref()
self._read_xref_from(pos, self.xrefs)
except (ValueError, IndexError) as e:
except (ValueError, IndexError, PDFSyntaxError) as e:
log.debug("Using fallback XRef parsing: %s", e)
newxref = XRefFallback(self.parser)
self.xrefs.append(newxref)
Expand Down
36 changes: 21 additions & 15 deletions playa/page.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
from playa.exceptions import (
PDFInterpreterError,
PDFUnicodeNotDefined,
PDFSyntaxError,
)
from playa.font import Font

Expand Down Expand Up @@ -67,7 +68,6 @@
get_transformed_bound,
make_compat_bytes,
mult_matrix,
parse_rect,
normalize_rect,
translate_matrix,
)
Expand Down Expand Up @@ -98,6 +98,17 @@ def Object(*args, **kwargs): ...
DeviceSpace = Literal["page", "screen", "user"]


# FIXME: This should go in utils/pdftypes but there are circular imports
def parse_rect(o: PDFObject) -> Rect:
    """Parse a PDF rectangle object into a 4-tuple of numbers.

    The object is passed through ``list_value`` and each of its elements
    through ``num_value``; the result must unpack into exactly four values.

    Args:
        o: The PDF object expected to hold the rectangle.

    Returns:
        The tuple ``(x0, y0, x1, y1)``, in the order the values appear in
        the object.

    Raises:
        ValueError: If a ``ValueError`` occurs while converting or
            unpacking (for example, the object does not yield exactly
            four values).
        PDFSyntaxError: If a ``TypeError`` occurs while converting the
            object or one of its elements.
    """
    try:
        (x0, y0, x1, y1) = (num_value(x) for x in list_value(o))
        return x0, y0, x1, y1
    except ValueError as err:
        # Chain explicitly so the underlying cause is reported as the
        # direct cause rather than as an error during error handling.
        raise ValueError("Could not parse rectangle %r" % (o,)) from err
    except TypeError as err:
        raise PDFSyntaxError("Rectangle contains non-numeric values") from err


class Page:
"""An object that holds the information about a page.
Expand Down Expand Up @@ -141,9 +152,7 @@ def __init__(
self.attrs.get("Resources", {})
)
if "MediaBox" in self.attrs:
self.mediabox = normalize_rect(
parse_rect(resolve1(val) for val in resolve1(self.attrs["MediaBox"]))
)
self.mediabox = normalize_rect(parse_rect(self.attrs["MediaBox"]))
else:
log.warning(
"MediaBox missing from /Page (and not inherited),"
Expand All @@ -153,9 +162,7 @@ def __init__(
self.cropbox = self.mediabox
if "CropBox" in self.attrs:
try:
self.cropbox = normalize_rect(
parse_rect(resolve1(val) for val in resolve1(self.attrs["CropBox"]))
)
self.cropbox = normalize_rect(parse_rect(self.attrs["CropBox"]))
except ValueError:
log.warning("Invalid CropBox in /Page, defaulting to MediaBox")

Expand Down Expand Up @@ -1213,11 +1220,10 @@ def __iter__(self) -> Iterator[LayoutDict]:
if len(args) == nargs:
gen = method(*args)
else:
error_msg = (
"Insufficient arguments (%d) for operator: %r"
% (len(args), obj)
log.warning(
"Insufficient arguments (%d) for operator: %r",
len(args), obj
)
raise PDFInterpreterError(error_msg)
else:
log.debug("exec: %r", obj)
gen = method()
Expand Down Expand Up @@ -2126,17 +2132,17 @@ def __iter__(self) -> Iterator[ContentObject]:
if len(args) == nargs:
gen = method(*args)
else:
error_msg = (
"Insufficient arguments (%d) for operator: %r"
% (len(args), obj)
log.warning(
"Insufficient arguments (%d) for operator: %r",
len(args), obj
)
raise PDFInterpreterError(error_msg)
else:
log.debug("exec: %r", obj)
gen = method()
if gen is not None:
yield from gen
else:
# TODO: This can get very verbose
log.warning("Unknown operator: %r", obj)
else:
self.push(obj)
Expand Down
13 changes: 0 additions & 13 deletions playa/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@

import string
from typing import (
Any,
Iterable,
Iterator,
List,
Expand All @@ -11,8 +10,6 @@
Union,
)

from playa.exceptions import PDFSyntaxError


def make_compat_bytes(in_str: str) -> bytes:
"""Converts to bytes, encoding to unicode."""
Expand Down Expand Up @@ -180,16 +177,6 @@ def apply_png_predictor(
MATRIX_IDENTITY: Matrix = (1, 0, 0, 1, 0, 0)


def parse_rect(o: Any) -> Rect:
    """Convert a four-element iterable into a rectangle of floats.

    Args:
        o: An object that unpacks into exactly four values, each of which
            ``float()`` accepts.

    Returns:
        The tuple ``(x0, y0, x1, y1)`` with every coordinate converted to
        ``float``.

    Raises:
        ValueError: If unpacking or ``float()`` raises ``ValueError`` (for
            example, the wrong number of elements).
        PDFSyntaxError: If unpacking or ``float()`` raises ``TypeError``
            (for example, an element that cannot be converted to a number).
    """
    try:
        (x0, y0, x1, y1) = o
        return float(x0), float(y0), float(x1), float(y1)
    except ValueError as err:
        # Chain explicitly so the underlying cause is reported as the
        # direct cause rather than as an error during error handling.
        raise ValueError("Could not parse rectangle") from err
    except TypeError as err:
        raise PDFSyntaxError("Rectangle contains non-numeric values") from err


def normalize_rect(r: Rect) -> Rect:
(x0, y0, x1, y1) = r
if x1 < x0:
Expand Down

0 comments on commit eecbd51

Please sign in to comment.