From 796a68ca6f88405c29a7293c5376bb505d02e368 Mon Sep 17 00:00:00 2001
From: David Huggins-Daines <dhd@ecolingui.ca>
Date: Mon, 2 Dec 2024 11:50:45 -0500
Subject: [PATCH] fix: simpler and more robust recovery from borked xref tables

---
 playa/document.py | 45 +++++++++++++++++++++++----------------------
 tests/data.py     |  1 +
 2 files changed, 24 insertions(+), 22 deletions(-)

diff --git a/playa/document.py b/playa/document.py
index 171a892..e936b57 100644
--- a/playa/document.py
+++ b/playa/document.py
@@ -853,7 +853,7 @@ def __init__(
         try:
             pos = self._find_xref()
             self._read_xref_from(pos, self.xrefs)
-        except (ValueError, IndexError, PDFSyntaxError) as e:
+        except (ValueError, IndexError, StopIteration, PDFSyntaxError) as e:
             log.debug("Using fallback XRef parsing: %s", e)
             newxref = XRefFallback(self.parser)
             self.xrefs.append(newxref)
@@ -874,7 +874,10 @@ def __init__(
                 self.encryption = (id_value, dict_value(trailer["Encrypt"]))
                 self._initialize_password(password)
             if "Info" in trailer:
-                self.info.append(dict_value(trailer["Info"]))
+                try:
+                    self.info.append(dict_value(trailer["Info"]))
+                except TypeError:
+                    log.warning("Info is a broken reference (incorrect xref table?)")
             if "Root" in trailer:
                 # Every PDF file must have exactly one /Root dictionary.
                 try:
@@ -982,30 +985,28 @@ def _getobj_parse(self, pos: int, objid: int) -> PDFObject:
         self.parser.seek(pos)
         try:
             _, obj = next(self.parser)
-        except (ValueError, IndexError) as e:
+        except (ValueError, IndexError, PDFSyntaxError) as e:
             log.warning(
                 "Indirect object %d not found at position %d: %r", objid, pos, e
             )
-            # Hack around malformed pdf files where the offset in the
+            # In case of malformed pdf files where the offset in the
             # xref table doesn't point exactly at the object
-            # definition (probably more frequent than you think).
-            # Back up a bit, then parse forward until we find the right
-            # object. Fixes
-            # https://github.com/pdfminer/pdfminer.six/issues/56
-            tokenizer = Lexer(self.buffer, max(0, pos - 16))
-            q: Deque[int] = deque([], 3)
-            while True:
-                try:
-                    (pos, token) = next(tokenizer)
-                except StopIteration:
-                    raise PDFSyntaxError(
-                        f"Indirect object {objid!r} not found at or after position {pos}"
-                    )
-                q.append(pos)
-                if len(q) == 3 and token is KEYWORD_OBJ:
-                    break
-            log.debug("seeking to %r", q[0])
-            self.parser.seek(q[0])
+            # definition (probably more frequent than you think), just
+            # use a regular expression to find the object because we
+            # can do that.
+            realpos = -1
+            lastgen = -1
+            for m in re.finditer(rb"%d\s+(\d+)\s+obj" % objid, self.buffer):
+                genno = int(m.group(1))
+                if genno > lastgen:
+                    lastgen = genno
+                    realpos = m.start(0)
+            if realpos == -1:
+                raise PDFSyntaxError(
+                    f"Indirect object {objid!r} not found in document"
+                )
+            log.debug("found object (%r) seeking to %r", m.group(0), realpos)
+            self.parser.seek(realpos)
             (_, obj) = next(self.parser)
         if obj.objid != objid:
             raise PDFSyntaxError(f"objid mismatch: {obj.objid!r}={objid!r}")
diff --git a/tests/data.py b/tests/data.py
index 0a5bb62..7bc825f 100644
--- a/tests/data.py
+++ b/tests/data.py
@@ -42,4 +42,5 @@
 XFAILS = {
     "bogus-stream-length.pdf",
     "empty.pdf",
+    "issue9418.pdf",
 }