From eecbd5106852746ec44f8b78f7b64fb944559f6e Mon Sep 17 00:00:00 2001 From: David Huggins-Daines Date: Mon, 2 Dec 2024 10:04:42 -0500 Subject: [PATCH] fix: robustness to more pdf.js test cases --- playa/document.py | 2 +- playa/page.py | 36 +++++++++++++++++++++--------------- playa/utils.py | 13 ------------- 3 files changed, 22 insertions(+), 29 deletions(-) diff --git a/playa/document.py b/playa/document.py index 307e038..171a892 100644 --- a/playa/document.py +++ b/playa/document.py @@ -853,7 +853,7 @@ def __init__( try: pos = self._find_xref() self._read_xref_from(pos, self.xrefs) - except (ValueError, IndexError) as e: + except (ValueError, IndexError, PDFSyntaxError) as e: log.debug("Using fallback XRef parsing: %s", e) newxref = XRefFallback(self.parser) self.xrefs.append(newxref) diff --git a/playa/page.py b/playa/page.py index 75a86a4..c6cf6d5 100644 --- a/playa/page.py +++ b/playa/page.py @@ -36,6 +36,7 @@ from playa.exceptions import ( PDFInterpreterError, PDFUnicodeNotDefined, + PDFSyntaxError, ) from playa.font import Font @@ -67,7 +68,6 @@ get_transformed_bound, make_compat_bytes, mult_matrix, - parse_rect, normalize_rect, translate_matrix, ) @@ -98,6 +98,17 @@ def Object(*args, **kwargs): ... DeviceSpace = Literal["page", "screen", "user"] +# FIXME: This should go in utils/pdftypes but there are circular imports +def parse_rect(o: PDFObject) -> Rect: + try: + (x0, y0, x1, y1) = (num_value(x) for x in list_value(o)) + return x0, y0, x1, y1 + except ValueError: + raise ValueError("Could not parse rectangle %r" % (o,)) + except TypeError: + raise PDFSyntaxError("Rectangle contains non-numeric values") + + class Page: """An object that holds the information about a page. @@ -141,9 +152,7 @@ def __init__( self.attrs.get("Resources", {}) ) if "MediaBox" in self.attrs: - self.mediabox = normalize_rect( - parse_rect(resolve1(val) for val in resolve1(self.attrs["MediaBox"])) - ) + self.mediabox = normalize_rect(parse_rect(self.attrs["MediaBox"])) else: log.warning( "MediaBox missing from /Page (and not inherited)," @@ -153,9 +162,7 @@ def __init__( self.cropbox = self.mediabox if "CropBox" in self.attrs: try: - self.cropbox = normalize_rect( - parse_rect(resolve1(val) for val in resolve1(self.attrs["CropBox"])) - ) + self.cropbox = normalize_rect(parse_rect(self.attrs["CropBox"])) except ValueError: log.warning("Invalid CropBox in /Page, defaulting to MediaBox") @@ -1213,11 +1220,10 @@ def __iter__(self) -> Iterator[LayoutDict]: if len(args) == nargs: gen = method(*args) else: - error_msg = ( - "Insufficient arguments (%d) for operator: %r" - % (len(args), obj) + log.warning( + "Insufficient arguments (%d) for operator: %r", + len(args), obj ) - raise PDFInterpreterError(error_msg) else: log.debug("exec: %r", obj) gen = method() @@ -2126,17 +2132,17 @@ def __iter__(self) -> Iterator[ContentObject]: if len(args) == nargs: gen = method(*args) else: - error_msg = ( - "Insufficient arguments (%d) for operator: %r" - % (len(args), obj) + log.warning( + "Insufficient arguments (%d) for operator: %r", + len(args), obj ) - raise PDFInterpreterError(error_msg) else: log.debug("exec: %r", obj) gen = method() if gen is not None: yield from gen else: + # TODO: This can get very verbose log.warning("Unknown operator: %r", obj) else: self.push(obj) diff --git a/playa/utils.py b/playa/utils.py index cb3b7f4..accc7a6 100644 --- a/playa/utils.py +++ b/playa/utils.py @@ -2,7 +2,6 @@ import string from typing import ( - Any, Iterable, Iterator, List, @@ -11,8 +10,6 @@ Union, ) -from playa.exceptions import PDFSyntaxError - def make_compat_bytes(in_str: str) -> bytes: """Converts to bytes, encoding to unicode.""" @@ -180,16 +177,6 @@ def apply_png_predictor( MATRIX_IDENTITY: Matrix = (1, 0, 0, 1, 0, 0) -def parse_rect(o: Any) -> Rect: - try: - (x0, y0, x1, y1) = o - return float(x0), float(y0), float(x1), float(y1) - except ValueError: - raise ValueError("Could not parse rectangle") - except TypeError: - raise PDFSyntaxError("Rectangle contains non-numeric values") - - def normalize_rect(r: Rect) -> Rect: (x0, y0, x1, y1) = r if x1 < x0: