Skip to content

Commit

Permalink
Merge pull request #343 from MatthiasValvekens/feature/reader-load-cl…
Browse files Browse the repository at this point in the history
…eanup

PdfFileReader load cleanup
  • Loading branch information
MatthiasValvekens authored Nov 23, 2023
2 parents ddf26f8 + 9578947 commit 9b19f5f
Show file tree
Hide file tree
Showing 5 changed files with 96 additions and 49 deletions.
107 changes: 63 additions & 44 deletions pyhanko/pdf_utils/reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
import re
from collections import defaultdict
from io import BytesIO
from typing import Dict, Generator, Optional, Set, Tuple, Union
from typing import BinaryIO, Dict, Generator, Optional, Set, Tuple, Union

from . import generic, misc
from .crypt import (
Expand All @@ -38,7 +38,6 @@

logger = logging.getLogger(__name__)


__all__ = [
'PdfFileReader',
'HistoricalResolver',
Expand Down Expand Up @@ -132,6 +131,41 @@ def process_data_at_eof(stream) -> int:
return startxref


def _read_header_version(stream: BinaryIO) -> Tuple[int, int]:
    """
    Parse the PDF version number out of the file header.

    :param stream:
        Seekable binary stream; it is rewound to offset 0 before reading.
    :return:
        The ``(major, minor)`` pair taken from the first two groups of
        ``header_regex``.
    :raises PdfReadError:
        If the leading token of the stream does not match ``header_regex``.
    """
    stream.seek(0)
    # Only the leading token (read with ``maxchars=20``) is inspected;
    # ``match`` anchors at the start and ignores trailing characters.
    first_token = misc.read_until_whitespace(stream, maxchars=20)
    header_match = header_regex.match(first_token)
    if header_match is None:
        raise PdfReadError('Illegal PDF header')
    return int(header_match.group(1)), int(header_match.group(2))


def _read_xrefs_and_trailer(
    stream: BinaryIO, handler_ref: PdfHandler, strict: bool
) -> Tuple[XRefCache, XRefBuilder]:
    """
    Read the cross-reference data of a PDF file, starting from its end.

    :param stream:
        Seekable binary stream containing the PDF file.
    :param handler_ref:
        PDF handler passed on to both the xref builder and the xref cache.
    :param strict:
        Strictness flag forwarded to the xref builder.
    :return:
        A tuple of the xref cache built from the parsed sections and the
        builder that produced them. The builder is returned as well so the
        caller can read further results off it (e.g. its ``trailer``,
        ``last_startxref`` and ``has_xref_stream`` attributes).
    """
    # The trailer & xref table are located by scanning from the end of file.
    stream.seek(-1, os.SEEK_END)
    builder = XRefBuilder(
        handler=handler_ref,
        stream=stream,
        strict=strict,
        # This offset needs to be recorded for incremental update purposes.
        last_startxref=process_data_at_eof(stream),
    )
    cache = XRefCache(handler_ref, builder.read_xrefs())
    return cache, builder


class PdfFileReader(PdfHandler):
"""Class implementing functionality to read a PDF file and cache
certain data about it."""
Expand All @@ -151,20 +185,29 @@ def __init__(self, stream, strict: bool = True):
problems and also causes some correctable problems to be fatal.
Defaults to ``True``.
"""
self.security_handler: Optional[SecurityHandler] = None
self._security_handler: Optional[SecurityHandler] = None
self.strict = strict
self.resolved_objects: Dict[Tuple[int, int], generic.PdfObject] = {}
self._header_version = None
self._input_version = None
self._historical_resolver_cache: Dict[int, HistoricalResolver] = {}
self.stream = stream
self.xrefs, self.trailer = self.read()
encrypt_dict = self._get_encryption_params()
if encrypt_dict is not None:
self.security_handler = SecurityHandler.build(encrypt_dict)
# first, read the header & PDF version number
# (version number can be overridden in the document catalog later)
self._header_version = _read_header_version(stream)
self.xrefs, xref_builder = _read_xrefs_and_trailer(stream, self, strict)
self.last_startxref = xref_builder.last_startxref
self.trailer = xref_builder.trailer
self.has_xref_stream = xref_builder.has_xref_stream

self._embedded_signatures = None

@property
def security_handler(self):
    """
    Security handler for this document, built lazily on first use.

    :return:
        The cached handler if one was built already; otherwise a handler
        built from :attr:`encrypt_dict` when that is non-empty; otherwise
        whatever ``self._security_handler`` currently holds (``None`` for
        an unencrypted document).
    """
    handler = self._security_handler
    if handler:
        # Already built: skip re-resolving the encryption dictionary,
        # which is a property that goes back to the trailer on each access.
        return handler
    # Read the property once and reuse the value for both the check
    # and the build call.
    encrypt_dict = self.encrypt_dict
    if encrypt_dict:
        handler = self._security_handler = SecurityHandler.build(encrypt_dict)
    return handler

def _xmp_meta_view(self) -> Optional[DocumentMetadata]:
try:
from pyhanko.pdf_utils.metadata import xmp_xml
Expand Down Expand Up @@ -280,15 +323,25 @@ def _get_object_from_stream(self, idnum, stmnum, idx):
else:
return generic.NullObject()

def _get_encryption_params(self) -> Optional[generic.DictionaryObject]:
@property
def encrypt_dict(self) -> Optional[generic.DictionaryObject]:
try:
encrypt_ref = self.trailer.raw_get('/Encrypt')
except KeyError:
return None
if isinstance(encrypt_ref, generic.IndirectObject):
return self.get_object(encrypt_ref.reference, never_decrypt=True)
encrypt_dict = self.get_object(
encrypt_ref.reference, never_decrypt=True
)
elif not self.strict:
encrypt_dict = encrypt_ref
else:
return encrypt_ref
raise misc.PdfReadError(
"Encryption settings must be an indirect reference"
)
if not isinstance(encrypt_dict, generic.DictionaryObject):
raise misc.PdfReadError("Encryption settings must be a dictionary")
return encrypt_dict

@property
def trailer_view(self) -> generic.DictionaryObject:
Expand Down Expand Up @@ -475,40 +528,6 @@ def cache_indirect_object(self, generation, idnum, obj):
self.resolved_objects[(generation, idnum)] = obj
return obj

def read(self):
    """
    Parse the header version and cross-reference data from ``self.stream``.

    As side effects, this stores the header version in
    ``self._header_version``, the offset returned by
    ``process_data_at_eof`` in ``self.last_startxref`` and the builder's
    xref stream flag in ``self.has_xref_stream``.

    :return:
        A tuple of the xref cache and the trailer produced by the
        xref builder.
    :raises PdfReadError:
        If the leading token of the stream does not match ``header_regex``.
    """
    pdf_stream = self.stream

    # First, read the header & PDF version number
    # (version number can be overridden in the document catalog later).
    pdf_stream.seek(0)
    first_token = misc.read_until_whitespace(pdf_stream, maxchars=20)
    # match ignores trailing chars
    header_match = header_regex.match(first_token)
    if header_match is None:
        raise PdfReadError('Illegal PDF header')
    self._header_version = (
        int(header_match.group(1)),
        int(header_match.group(2)),
    )

    # The remainder is read starting at the end of the file.
    pdf_stream.seek(-1, os.SEEK_END)
    # This needs to be recorded for incremental update purposes.
    startxref = process_data_at_eof(pdf_stream)
    self.last_startxref = startxref

    # Read the xref table
    builder = XRefBuilder(
        handler=self,
        stream=pdf_stream,
        strict=self.strict,
        last_startxref=startxref,
    )
    cache = XRefCache(self, builder.read_xrefs())
    self.has_xref_stream = builder.has_xref_stream
    return cache, builder.trailer

def decrypt(self, password: Union[str, bytes]) -> AuthResult:
"""
When using an encrypted PDF file with the standard PDF encryption
Expand Down
2 changes: 1 addition & 1 deletion pyhanko/pdf_utils/xref.py
Original file line number Diff line number Diff line change
Expand Up @@ -642,7 +642,7 @@ def __init__(
self.sections: List[XRefSection] = []

self.trailer = TrailerDictionary()
self.trailer.container_ref = generic.TrailerReference(self)
self.trailer.container_ref = generic.TrailerReference(handler)
self.has_xref_stream = False

def _read_xref_stream_object(self):
Expand Down
Binary file not shown.
Binary file added pyhanko_tests/data/pdf/malformed-encrypt-dict2.pdf
Binary file not shown.
36 changes: 32 additions & 4 deletions pyhanko_tests/test_crypt.py
Original file line number Diff line number Diff line change
Expand Up @@ -493,7 +493,8 @@ def test_pubkey_unsupported_filter(delete_subfilter):
out = BytesIO()
w.write(out)
with pytest.raises(misc.PdfReadError):
PdfFileReader(out)
# noinspection PyStatementEffect
PdfFileReader(out).root['/Pages']['/Kids'][0]['/Content'].data


def test_pubkey_encryption_block_cfs_s4():
Expand All @@ -505,7 +506,8 @@ def test_pubkey_encryption_block_cfs_s4():
out = BytesIO()
w.write(out)
with pytest.raises(misc.PdfReadError):
PdfFileReader(out)
# noinspection PyStatementEffect
PdfFileReader(out).root['/Pages']['/Kids'][0]['/Content'].data


def test_pubkey_encryption_s5_requires_cfs():
Expand All @@ -518,7 +520,8 @@ def test_pubkey_encryption_s5_requires_cfs():
out = BytesIO()
w.write(out)
with pytest.raises(misc.PdfReadError):
PdfFileReader(out)
# noinspection PyStatementEffect
PdfFileReader(out).root['/Pages']['/Kids'][0]['/Content'].data


def test_pubkey_encryption_dict_errors():
Expand Down Expand Up @@ -1433,7 +1436,8 @@ def test_legacy_o_u_values(entry):
w.write(out)

with pytest.raises(misc.PdfError, match="be 32 bytes long"):
PdfFileReader(out)
# noinspection PyStatementEffect
PdfFileReader(out).root['/Pages']['/Kids'][0]['/Content'].data


def test_key_length_constraint():
Expand Down Expand Up @@ -1529,3 +1533,27 @@ def test_add_crypt_filter_to_stream_without_security_handler():
dummy_stream = generic.StreamObject(stream_data=b"1001")
with pytest.raises(misc.PdfStreamError, match="no security handler"):
dummy_stream.add_crypt_filter()


@pytest.mark.parametrize(
    "fname,strict",
    [
        ("malformed-encrypt-dict1.pdf", True),
        ("malformed-encrypt-dict2.pdf", True),
        ("malformed-encrypt-dict2.pdf", False),
    ],
)
def test_malformed_crypt(fname, strict):
    """
    Accessing ``encrypt_dict`` on a file with malformed encryption settings
    must raise a ``PdfReadError`` mentioning 'Encryption settings'.

    Constructing the reader itself is expected to succeed; the error only
    surfaces when the property is read.
    """
    pdf_path = os.path.join(PDF_DATA_DIR, fname)
    with open(pdf_path, 'rb') as pdf_file:
        reader = PdfFileReader(pdf_file, strict=strict)
        # The property is evaluated while the file is still open.
        with pytest.raises(misc.PdfReadError, match='Encryption settings'):
            # noinspection PyStatementEffect
            reader.encrypt_dict


def test_tolerate_direct_encryption_dict_in_nonstrict():
    """
    In non-strict mode, 'malformed-encrypt-dict1.pdf' can still be
    decrypted and its first page's content read back.
    """
    pdf_path = os.path.join(PDF_DATA_DIR, 'malformed-encrypt-dict1.pdf')
    with open(pdf_path, 'rb') as pdf_file:
        reader = PdfFileReader(pdf_file, strict=False)
        reader.decrypt('ownersecret')
        first_page = reader.root['/Pages']['/Kids'][0]
        assert b'Hello' in first_page['/Contents'].data

0 comments on commit 9b19f5f

Please sign in to comment.