Skip to content

Commit

Permalink
fix: simpler and more robust recovery from borked xref tables
Browse files Browse the repository at this point in the history
  • Loading branch information
dhdaines committed Dec 2, 2024
1 parent eecbd51 commit 796a68c
Show file tree
Hide file tree
Showing 2 changed files with 24 additions and 22 deletions.
45 changes: 23 additions & 22 deletions playa/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -853,7 +853,7 @@ def __init__(
try:
pos = self._find_xref()
self._read_xref_from(pos, self.xrefs)
except (ValueError, IndexError, PDFSyntaxError) as e:
except (ValueError, IndexError, StopIteration, PDFSyntaxError) as e:
log.debug("Using fallback XRef parsing: %s", e)
newxref = XRefFallback(self.parser)
self.xrefs.append(newxref)
Expand All @@ -874,7 +874,10 @@ def __init__(
self.encryption = (id_value, dict_value(trailer["Encrypt"]))
self._initialize_password(password)
if "Info" in trailer:
self.info.append(dict_value(trailer["Info"]))
try:
self.info.append(dict_value(trailer["Info"]))
except TypeError:
log.warning("Info is a broken reference (incorrect xref table?)")
if "Root" in trailer:
# Every PDF file must have exactly one /Root dictionary.
try:
Expand Down Expand Up @@ -982,30 +985,28 @@ def _getobj_parse(self, pos: int, objid: int) -> PDFObject:
self.parser.seek(pos)
try:
_, obj = next(self.parser)
except (ValueError, IndexError) as e:
except (ValueError, IndexError, PDFSyntaxError) as e:
log.warning(
"Indirect object %d not found at position %d: %r", objid, pos, e
)
# Hack around malformed pdf files where the offset in the
# In case of malformed pdf files where the offset in the
# xref table doesn't point exactly at the object
# definition (probably more frequent than you think).
# Back up a bit, then parse forward until we find the right
# object. Fixes
# https://github.com/pdfminer/pdfminer.six/issues/56
tokenizer = Lexer(self.buffer, max(0, pos - 16))
q: Deque[int] = deque([], 3)
while True:
try:
(pos, token) = next(tokenizer)
except StopIteration:
raise PDFSyntaxError(
f"Indirect object {objid!r} not found at or after position {pos}"
)
q.append(pos)
if len(q) == 3 and token is KEYWORD_OBJ:
break
log.debug("seeking to %r", q[0])
self.parser.seek(q[0])
# definition (probably more frequent than you think), fall
# back to a regular expression search over the whole buffer
# for the object's "N G obj" header.
realpos = -1
lastgen = -1
for m in re.finditer(rb"%d\s+(\d+)\s+obj" % objid, self.buffer):
genno = int(m.group(1))
if genno > lastgen:
lastgen = genno
realpos = m.start(0)
if realpos == -1:
raise PDFSyntaxError(
f"Indirect object {objid!r} not found in document"
)
log.debug("found object (%r) seeking to %r", m.group(0), realpos)
self.parser.seek(realpos)
(_, obj) = next(self.parser)
if obj.objid != objid:
raise PDFSyntaxError(f"objid mismatch: {obj.objid!r}={objid!r}")
Expand Down
1 change: 1 addition & 0 deletions tests/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,4 +42,5 @@
XFAILS = {
"bogus-stream-length.pdf",
"empty.pdf",
"issue9418.pdf",
}

0 comments on commit 796a68c

Please sign in to comment.