Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

PdfFileReader load cleanup #343

Merged
merged 5 commits into from
Nov 23, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
107 changes: 63 additions & 44 deletions pyhanko/pdf_utils/reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
import re
from collections import defaultdict
from io import BytesIO
from typing import Dict, Generator, Optional, Set, Tuple, Union
from typing import BinaryIO, Dict, Generator, Optional, Set, Tuple, Union

from . import generic, misc
from .crypt import (
Expand All @@ -38,7 +38,6 @@

logger = logging.getLogger(__name__)


__all__ = [
'PdfFileReader',
'HistoricalResolver',
Expand Down Expand Up @@ -132,6 +131,41 @@ def process_data_at_eof(stream) -> int:
return startxref


def _read_header_version(stream: BinaryIO) -> Tuple[int, int]:
    """
    Parse the PDF version number from the header at the start of a stream.

    :param stream:
        Binary stream to read from; it is rewound to offset 0 first.
    :return:
        A ``(major, minor)`` tuple of integers taken from the two groups
        captured by ``header_regex``.
    :raises PdfReadError:
        If the first token in the stream does not match ``header_regex``.
    """
    stream.seek(0)
    # Only the first token (read with maxchars=20) is inspected;
    # ``match`` anchors at the start and ignores any trailing characters.
    first_token = misc.read_until_whitespace(stream, maxchars=20)
    version_match = header_regex.match(first_token)
    if version_match is None:
        raise PdfReadError('Illegal PDF header')
    return int(version_match.group(1)), int(version_match.group(2))


def _read_xrefs_and_trailer(
    stream: BinaryIO, handler_ref: PdfHandler, strict: bool
) -> Tuple[XRefCache, XRefBuilder]:
    """
    Read the cross-reference data of a PDF, working from the end of the file.

    :param stream:
        Binary stream containing the PDF data.
    :param handler_ref:
        Handler passed on to both the :class:`XRefBuilder` and the
        :class:`XRefCache`.
    :param strict:
        Strictness flag forwarded to the :class:`XRefBuilder`.
    :return:
        The xref cache built from the sections that were read, together
        with the builder itself (callers read further attributes such as
        the trailer from it).
    """
    # The trailer & xref table are located starting from the end of the file.
    stream.seek(-1, os.SEEK_END)
    builder = XRefBuilder(
        handler=handler_ref,
        stream=stream,
        strict=strict,
        # The startxref value is recorded for incremental update purposes.
        last_startxref=process_data_at_eof(stream),
    )
    return XRefCache(handler_ref, builder.read_xrefs()), builder


class PdfFileReader(PdfHandler):
"""Class implementing functionality to read a PDF file and cache
certain data about it."""
Expand All @@ -151,20 +185,29 @@ def __init__(self, stream, strict: bool = True):
problems and also causes some correctable problems to be fatal.
Defaults to ``True``.
"""
self.security_handler: Optional[SecurityHandler] = None
self._security_handler: Optional[SecurityHandler] = None
self.strict = strict
self.resolved_objects: Dict[Tuple[int, int], generic.PdfObject] = {}
self._header_version = None
self._input_version = None
self._historical_resolver_cache: Dict[int, HistoricalResolver] = {}
self.stream = stream
self.xrefs, self.trailer = self.read()
encrypt_dict = self._get_encryption_params()
if encrypt_dict is not None:
self.security_handler = SecurityHandler.build(encrypt_dict)
# first, read the header & PDF version number
# (version number can be overridden in the document catalog later)
self._header_version = _read_header_version(stream)
self.xrefs, xref_builder = _read_xrefs_and_trailer(stream, self, strict)
self.last_startxref = xref_builder.last_startxref
self.trailer = xref_builder.trailer
self.has_xref_stream = xref_builder.has_xref_stream

self._embedded_signatures = None

@property
def security_handler(self):
    """
    Security handler for this document, built on first use.

    The handler is constructed from :attr:`encrypt_dict` through
    ``SecurityHandler.build`` and stored in ``_security_handler``.

    :return:
        The stored handler; this is whatever ``_security_handler`` holds
        when the encryption dictionary is absent or empty.
    """
    # Nothing to build: no encryption settings, or a handler already exists.
    if not self.encrypt_dict or self._security_handler:
        return self._security_handler
    self._security_handler = SecurityHandler.build(self.encrypt_dict)
    return self._security_handler

def _xmp_meta_view(self) -> Optional[DocumentMetadata]:
try:
from pyhanko.pdf_utils.metadata import xmp_xml
Expand Down Expand Up @@ -280,15 +323,25 @@ def _get_object_from_stream(self, idnum, stmnum, idx):
else:
return generic.NullObject()

def _get_encryption_params(self) -> Optional[generic.DictionaryObject]:
@property
def encrypt_dict(self) -> Optional[generic.DictionaryObject]:
try:
encrypt_ref = self.trailer.raw_get('/Encrypt')
except KeyError:
return None
if isinstance(encrypt_ref, generic.IndirectObject):
return self.get_object(encrypt_ref.reference, never_decrypt=True)
encrypt_dict = self.get_object(
encrypt_ref.reference, never_decrypt=True
)
elif not self.strict:
encrypt_dict = encrypt_ref
else:
return encrypt_ref
raise misc.PdfReadError(
"Encryption settings must be an indirect reference"
)
if not isinstance(encrypt_dict, generic.DictionaryObject):
raise misc.PdfReadError("Encryption settings must be a dictionary")
return encrypt_dict

@property
def trailer_view(self) -> generic.DictionaryObject:
Expand Down Expand Up @@ -475,40 +528,6 @@ def cache_indirect_object(self, generation, idnum, obj):
self.resolved_objects[(generation, idnum)] = obj
return obj

def read(self):
    """
    Read the header version and the cross-reference data from the stream.

    As a side effect, this sets ``_header_version``, ``last_startxref``
    and ``has_xref_stream`` on the reader.

    :return:
        A tuple consisting of the xref cache and the trailer produced by
        the xref builder.
    :raises PdfReadError:
        If the start of the stream does not match ``header_regex``.
    """
    stream = self.stream

    # Header & PDF version number come first
    # (the document catalog can override the version number later).
    stream.seek(0)
    header = misc.read_until_whitespace(stream, maxchars=20)
    # match ignores trailing chars
    header_match = header_regex.match(header)
    if header_match is None:
        raise PdfReadError('Illegal PDF header')
    self._header_version = (
        int(header_match.group(1)),
        int(header_match.group(2)),
    )

    # The rest is read starting from the end of the file.
    stream.seek(-1, os.SEEK_END)

    # Kept on the reader for incremental update purposes.
    startxref = process_data_at_eof(stream)
    self.last_startxref = startxref

    # Read the xref table
    builder = XRefBuilder(
        handler=self,
        stream=stream,
        strict=self.strict,
        last_startxref=startxref,
    )
    sections = builder.read_xrefs()
    cache = XRefCache(self, sections)
    self.has_xref_stream = builder.has_xref_stream
    return cache, builder.trailer

def decrypt(self, password: Union[str, bytes]) -> AuthResult:
"""
When using an encrypted PDF file with the standard PDF encryption
Expand Down
2 changes: 1 addition & 1 deletion pyhanko/pdf_utils/xref.py
Original file line number Diff line number Diff line change
Expand Up @@ -642,7 +642,7 @@ def __init__(
self.sections: List[XRefSection] = []

self.trailer = TrailerDictionary()
self.trailer.container_ref = generic.TrailerReference(self)
self.trailer.container_ref = generic.TrailerReference(handler)
self.has_xref_stream = False

def _read_xref_stream_object(self):
Expand Down
Binary file not shown.
Binary file added pyhanko_tests/data/pdf/malformed-encrypt-dict2.pdf
Binary file not shown.
36 changes: 32 additions & 4 deletions pyhanko_tests/test_crypt.py
Original file line number Diff line number Diff line change
Expand Up @@ -493,7 +493,8 @@ def test_pubkey_unsupported_filter(delete_subfilter):
out = BytesIO()
w.write(out)
with pytest.raises(misc.PdfReadError):
PdfFileReader(out)
# noinspection PyStatementEffect
PdfFileReader(out).root['/Pages']['/Kids'][0]['/Content'].data


def test_pubkey_encryption_block_cfs_s4():
Expand All @@ -505,7 +506,8 @@ def test_pubkey_encryption_block_cfs_s4():
out = BytesIO()
w.write(out)
with pytest.raises(misc.PdfReadError):
PdfFileReader(out)
# noinspection PyStatementEffect
PdfFileReader(out).root['/Pages']['/Kids'][0]['/Content'].data


def test_pubkey_encryption_s5_requires_cfs():
Expand All @@ -518,7 +520,8 @@ def test_pubkey_encryption_s5_requires_cfs():
out = BytesIO()
w.write(out)
with pytest.raises(misc.PdfReadError):
PdfFileReader(out)
# noinspection PyStatementEffect
PdfFileReader(out).root['/Pages']['/Kids'][0]['/Content'].data


def test_pubkey_encryption_dict_errors():
Expand Down Expand Up @@ -1433,7 +1436,8 @@ def test_legacy_o_u_values(entry):
w.write(out)

with pytest.raises(misc.PdfError, match="be 32 bytes long"):
PdfFileReader(out)
# noinspection PyStatementEffect
PdfFileReader(out).root['/Pages']['/Kids'][0]['/Content'].data


def test_key_length_constraint():
Expand Down Expand Up @@ -1529,3 +1533,27 @@ def test_add_crypt_filter_to_stream_without_security_handler():
dummy_stream = generic.StreamObject(stream_data=b"1001")
with pytest.raises(misc.PdfStreamError, match="no security handler"):
dummy_stream.add_crypt_filter()


@pytest.mark.parametrize(
    "fname,strict",
    [
        ("malformed-encrypt-dict1.pdf", True),
        ("malformed-encrypt-dict2.pdf", True),
        ("malformed-encrypt-dict2.pdf", False),
    ],
)
def test_malformed_crypt(fname, strict):
    """
    Accessing ``encrypt_dict`` on these inputs must raise a
    ``PdfReadError`` whose message matches 'Encryption settings'.

    Constructing the reader itself is expected to succeed; only the
    property access is wrapped in ``pytest.raises``.
    """
    pdf_path = os.path.join(PDF_DATA_DIR, fname)
    with open(pdf_path, 'rb') as inf:
        reader = PdfFileReader(inf, strict=strict)
        with pytest.raises(misc.PdfReadError, match='Encryption settings'):
            # The bare property access is the operation under test.
            # noinspection PyStatementEffect
            reader.encrypt_dict


def test_tolerate_direct_encryption_dict_in_nonstrict():
    """
    With ``strict=False``, 'malformed-encrypt-dict1.pdf' can be opened,
    decrypted with the password 'ownersecret', and the content stream of
    its first page read back.
    """
    # NOTE(review): going by the test name, this file presumably stores its
    # encryption dictionary directly in the trailer -- confirm against the
    # fixture.
    pdf_path = os.path.join(PDF_DATA_DIR, 'malformed-encrypt-dict1.pdf')
    with open(pdf_path, 'rb') as inf:
        reader = PdfFileReader(inf, strict=False)
        reader.decrypt('ownersecret')
        first_page = reader.root['/Pages']['/Kids'][0]
        assert b'Hello' in first_page['/Contents'].data
Loading