From a60e366efd5a79fea4b96977a99dfcfc822cc370 Mon Sep 17 00:00:00 2001 From: Matthias Valvekens Date: Tue, 21 Nov 2023 23:02:02 +0100 Subject: [PATCH 1/4] Clean up loading code in PdfFileReader - Remove special read() method that was really never intended to be "public" and would simply mess things up if called more than once. - Load security handler lazily. This means that accessing an encrypted document with wrong credentials will now only fail upon trying to access the first encrypted object, as opposed to failing at init time. --- pyhanko/pdf_utils/reader.py | 93 +++++++++++++++++++++---------------- pyhanko/pdf_utils/xref.py | 2 +- pyhanko_tests/test_crypt.py | 12 +++-- 3 files changed, 61 insertions(+), 46 deletions(-) diff --git a/pyhanko/pdf_utils/reader.py b/pyhanko/pdf_utils/reader.py index 87ae6fa5..dcbffccb 100644 --- a/pyhanko/pdf_utils/reader.py +++ b/pyhanko/pdf_utils/reader.py @@ -14,7 +14,7 @@ import re from collections import defaultdict from io import BytesIO -from typing import Dict, Generator, Optional, Set, Tuple, Union +from typing import BinaryIO, Dict, Generator, Optional, Set, Tuple, Union from . import generic, misc from .crypt import ( @@ -38,7 +38,6 @@ logger = logging.getLogger(__name__) - __all__ = [ 'PdfFileReader', 'HistoricalResolver', @@ -132,6 +131,41 @@ def process_data_at_eof(stream) -> int: return startxref +def _read_header_version(stream: BinaryIO) -> Tuple[int, int]: + stream.seek(0) + input_version = None + header = misc.read_until_whitespace(stream, maxchars=20) + # match ignores trailing chars + m = header_regex.match(header) + if m is not None: + major = int(m.group(1)) + minor = int(m.group(2)) + input_version = (major, minor) + if input_version is None: + raise PdfReadError('Illegal PDF header') + return input_version + + +def _read_xrefs_and_trailer( + stream: BinaryIO, handler_ref: PdfHandler, strict: bool +) -> Tuple[XRefCache, XRefBuilder]: + # start at the end to read the trailer & xref table + stream.seek(-1, os.SEEK_END) + # This needs to be recorded for incremental update purposes + last_startxref = process_data_at_eof(stream) + + # Read the xref table + xref_builder = XRefBuilder( + handler=handler_ref, + stream=stream, + strict=strict, + last_startxref=last_startxref, + ) + xref_sections = xref_builder.read_xrefs() + xref_cache = XRefCache(handler_ref, xref_sections) + return xref_cache, xref_builder + + class PdfFileReader(PdfHandler): """Class implementing functionality to read a PDF file and cache certain data about it.""" @@ -151,20 +185,31 @@ def __init__(self, stream, strict: bool = True): problems and also causes some correctable problems to be fatal. Defaults to ``True``. """ - self.security_handler: Optional[SecurityHandler] = None + self._security_handler: Optional[SecurityHandler] = None self.strict = strict self.resolved_objects: Dict[Tuple[int, int], generic.PdfObject] = {} self._header_version = None self._input_version = None self._historical_resolver_cache: Dict[int, HistoricalResolver] = {} self.stream = stream - self.xrefs, self.trailer = self.read() - encrypt_dict = self._get_encryption_params() - if encrypt_dict is not None: - self.security_handler = SecurityHandler.build(encrypt_dict) + # first, read the header & PDF version number + # (version number can be overridden in the document catalog later) + self._header_version = _read_header_version(stream) + self.xrefs, xref_builder = _read_xrefs_and_trailer(stream, self, strict) + self.last_startxref = xref_builder.last_startxref + self.trailer = xref_builder.trailer + self.has_xref_stream = xref_builder.has_xref_stream self._embedded_signatures = None + @property + def security_handler(self): + if self._security_handler is None: + encrypt_dict = self._get_encryption_params() + if encrypt_dict is not None: + self._security_handler = SecurityHandler.build(encrypt_dict) + return self._security_handler + def _xmp_meta_view(self) -> Optional[DocumentMetadata]: try: from pyhanko.pdf_utils.metadata import xmp_xml @@ -475,40 +520,6 @@ def cache_indirect_object(self, generation, idnum, obj): self.resolved_objects[(generation, idnum)] = obj return obj - def read(self): - # first, read the header & PDF version number - # (version number can be overridden in the document catalog later) - stream = self.stream - stream.seek(0) - input_version = None - header = misc.read_until_whitespace(stream, maxchars=20) - # match ignores trailing chars - m = header_regex.match(header) - if m is not None: - major = int(m.group(1)) - minor = int(m.group(2)) - input_version = (major, minor) - if input_version is None: - raise PdfReadError('Illegal PDF header') - self._header_version = input_version - - # start at the end: - stream.seek(-1, os.SEEK_END) - - # This needs to be recorded for incremental update purposes - self.last_startxref = last_startxref = process_data_at_eof(stream) - # Read the xref table - xref_builder = XRefBuilder( - handler=self, - stream=stream, - strict=self.strict, - last_startxref=last_startxref, - ) - xref_sections = xref_builder.read_xrefs() - xref_cache = XRefCache(self, xref_sections) - self.has_xref_stream = xref_builder.has_xref_stream - return xref_cache, xref_builder.trailer - def decrypt(self, password: Union[str, bytes]) -> AuthResult: """ When using an encrypted PDF file with the standard PDF encryption diff --git a/pyhanko/pdf_utils/xref.py b/pyhanko/pdf_utils/xref.py index 5b8d0c65..b43eda7f 100644 --- a/pyhanko/pdf_utils/xref.py +++ b/pyhanko/pdf_utils/xref.py @@ -642,7 +642,7 @@ def __init__( self.sections: List[XRefSection] = [] self.trailer = TrailerDictionary() - self.trailer.container_ref = generic.TrailerReference(self) + self.trailer.container_ref = generic.TrailerReference(handler) self.has_xref_stream = False def _read_xref_stream_object(self): diff --git a/pyhanko_tests/test_crypt.py b/pyhanko_tests/test_crypt.py index 3c74c3a9..f18c0bf5 100644 --- a/pyhanko_tests/test_crypt.py +++ b/pyhanko_tests/test_crypt.py @@ -493,7 +493,8 @@ def test_pubkey_unsupported_filter(delete_subfilter): out = BytesIO() w.write(out) with pytest.raises(misc.PdfReadError): - PdfFileReader(out) + # noinspection PyStatementEffect + PdfFileReader(out).root['/Pages']['/Kids'][0]['/Content'].data def test_pubkey_encryption_block_cfs_s4(): @@ -505,7 +506,8 @@ def test_pubkey_encryption_block_cfs_s4(): out = BytesIO() w.write(out) with pytest.raises(misc.PdfReadError): - PdfFileReader(out) + # noinspection PyStatementEffect + PdfFileReader(out).root['/Pages']['/Kids'][0]['/Content'].data def test_pubkey_encryption_s5_requires_cfs(): @@ -518,7 +520,8 @@ def test_pubkey_encryption_s5_requires_cfs(): out = BytesIO() w.write(out) with pytest.raises(misc.PdfReadError): - PdfFileReader(out) + # noinspection PyStatementEffect + PdfFileReader(out).root['/Pages']['/Kids'][0]['/Content'].data def test_pubkey_encryption_dict_errors(): @@ -1433,7 +1436,8 @@ def test_legacy_o_u_values(entry): w.write(out) with pytest.raises(misc.PdfError, match="be 32 bytes long"): - PdfFileReader(out) + # noinspection PyStatementEffect + PdfFileReader(out).root['/Pages']['/Kids'][0]['/Content'].data def test_key_length_constraint(): From f0903787b6850b57fb4ea540d6236995d8767f74 Mon Sep 17 00:00:00 2001 From: Matthias Valvekens Date: Wed, 22 Nov 2023 22:17:49 +0100 Subject: [PATCH 2/4] Better tests for _get_encryption_params --- pyhanko/pdf_utils/reader.py | 13 ++++++++-- .../data/pdf/malformed-encrypt-dict1.pdf | Bin 0 -> 1090 bytes .../data/pdf/malformed-encrypt-dict2.pdf | Bin 0 -> 904 bytes pyhanko_tests/test_crypt.py | 24 ++++++++++++++++++ 4 files changed, 35 insertions(+), 2 deletions(-) create mode 100644 pyhanko_tests/data/pdf/malformed-encrypt-dict1.pdf create mode 100644 pyhanko_tests/data/pdf/malformed-encrypt-dict2.pdf diff --git a/pyhanko/pdf_utils/reader.py b/pyhanko/pdf_utils/reader.py index dcbffccb..5bda7efc 100644 --- a/pyhanko/pdf_utils/reader.py +++ b/pyhanko/pdf_utils/reader.py @@ -331,9 +331,18 @@ def _get_encryption_params(self) -> Optional[generic.DictionaryObject]: except KeyError: return None if isinstance(encrypt_ref, generic.IndirectObject): - return self.get_object(encrypt_ref.reference, never_decrypt=True) + encrypt_dict = self.get_object( + encrypt_ref.reference, never_decrypt=True + ) + elif not self.strict: + encrypt_dict = encrypt_ref else: - return encrypt_ref + raise misc.PdfReadError( + "Encryption settings must be an indirect reference" + ) + if not isinstance(encrypt_dict, generic.DictionaryObject): + raise misc.PdfReadError("Encryption settings must be a dictionary") + return encrypt_dict @property def trailer_view(self) -> generic.DictionaryObject: diff --git a/pyhanko_tests/data/pdf/malformed-encrypt-dict1.pdf b/pyhanko_tests/data/pdf/malformed-encrypt-dict1.pdf new file mode 100644 index 0000000000000000000000000000000000000000..e246c1920a818e7991c68cde3bfc7f6dc0fb2fd3 GIT binary patch literal 1090 zcmdT?J!=$E6vakiet`A0*hbvU`<@wCw@5aDpfO>yiAA({^X5)m+02BQSx5v6Ev&6< z#KywLQc_!pAcCD>DT1|VC0GRS%43jw_pt;)NFV&c@gai0Vk_rIQZ9wNeO!(o7CIj zZ(uUC%lQ#uj2_7taHW85CS@m`=0)qodC{!GtSUh@)reI=pEFv0Ec0>Uh-b}sEvK92 zz?*H^me^qaV0eIKyPqG}4E*hJTs7m=-HGmmL=)KU(rop+z-(dKn!}9Pb&i z9gZq2Xdl@Md+726>!h+O&>nbC9-O;*{@K!(8yDI47q_mw?Y;kWI=K4k(o^^5(aHS{ z`}kM+F)A+ay!&zC!{_ql*KfNIPw4MXv#0-AFLa)@+pcVu*dhUc%7u#%{7vX+rb9H5 zbjYJn1}1VWHpVf-Vv(R@QOumsBGp0}AuMx7t2mH65iv&{#WemFv=#V$($L?5PHa=F z!jLh>Ek#Mxk_V|XVWbkKEix{I(#A$Swy3#OB9PW3F`Zl(I7J0_NwT@ZM+K%H1LIN2 z=>ns*)R4j-jMLEdf^3c%lHd%+RrD7o85#3ToDvJURbhuC(yVq%KhJC1Ha_1T-a|0p zulHcDV>vS}3^6s43T47Mjp(EbP=}hOvFrXjf;_&Ox#IY6YRsuq{zK^|j%}kiNfkd#-!rID4 zY%FXnCAEbJBG?I*B3O%7f<^GmWY=9{;kg6%&Uaqld34tM%X54_3Oa9oy*LdxFeuY| zLAMLxy4}MX1R()>tVX-_cJa7na46_<`K(?;C5Uyf2 zvTP?yj+r|eem{L zur;<=xH;U%dTvne+XBLk!_>FsG}(#n8Hv`g*Q45+Z-&`SwGD>}v71=z`MaP6B9@RFFv3Ddh0U#@$$~Kcl{5aPdBf>zVghyeSGp@)js)M ze@d#WTkn5f`uL@O_3iuiqZ8`x*n8Y(J(GEoZo}ImvsDJ+m5wO} zj7;KKYK&usr6NPel9V~2MXrT1LRjXER%s-8CQ^<%NvZsGm{SlAXoleybW@vK6~~M* zZplibmORRxi4&DEZIN*ylr}cusYT7D5|OkfOX=m}$SDfAO(UC5_^84>U|<3YITkQl zOAR^v#W)q+2ozJy&$IhE+7icrUz<*Doa--330sc_Ztfrp)4AZho)R=(|kkv!;hme!Vo EKcc+;KL7v# literal 0 HcmV?d00001 diff --git a/pyhanko_tests/test_crypt.py b/pyhanko_tests/test_crypt.py index f18c0bf5..ee72497e 100644 --- a/pyhanko_tests/test_crypt.py +++ b/pyhanko_tests/test_crypt.py @@ -1533,3 +1533,27 @@ def test_add_crypt_filter_to_stream_without_security_handler(): dummy_stream = generic.StreamObject(stream_data=b"1001") with pytest.raises(misc.PdfStreamError, match="no security handler"): dummy_stream.add_crypt_filter() + + +@pytest.mark.parametrize( + "fname,strict", + [ + ("malformed-encrypt-dict1.pdf", True), + ("malformed-encrypt-dict2.pdf", True), + ("malformed-encrypt-dict2.pdf", False), + ], +) +def test_malformed_crypt(fname, strict): + with open(os.path.join(PDF_DATA_DIR, fname), 'rb') as inf: + r = PdfFileReader(inf, strict=strict) + with pytest.raises(misc.PdfReadError, match='Encryption settings'): + r._get_encryption_params() + + +def test_tolerate_direct_encryption_dict_in_nonstrict(): + fname = 'malformed-encrypt-dict1.pdf' + with open(os.path.join(PDF_DATA_DIR, fname), 'rb') as inf: + r = PdfFileReader(inf, strict=False) + r.decrypt('ownersecret') + data = r.root['/Pages']['/Kids'][0]['/Contents'].data + assert b'Hello' in data From ee839b05d39782c833af8443d9ad80873b485c12 Mon Sep 17 00:00:00 2001 From: Matthias Valvekens Date: Wed, 22 Nov 2023 22:25:05 +0100 Subject: [PATCH 3/4] Add encrypt_dict property to reader --- pyhanko/pdf_utils/reader.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pyhanko/pdf_utils/reader.py b/pyhanko/pdf_utils/reader.py index 5bda7efc..4948d708 100644 --- a/pyhanko/pdf_utils/reader.py +++ b/pyhanko/pdf_utils/reader.py @@ -344,6 +344,8 @@ def _get_encryption_params(self) -> Optional[generic.DictionaryObject]: raise misc.PdfReadError("Encryption settings must be a dictionary") return encrypt_dict + encrypt_dict = property(_get_encryption_params) + @property def trailer_view(self) -> generic.DictionaryObject: return self.trailer.flatten() From 04b524b4d602f68de01dce90c398b045990a440d Mon Sep 17 00:00:00 2001 From: Benjamin Dornel <{{ .email }}> Date: Thu, 23 Nov 2023 16:08:26 +0000 Subject: [PATCH 4/4] Add encrypt dict using property decorator --- pyhanko/pdf_utils/reader.py | 11 ++++------- pyhanko_tests/test_crypt.py | 2 +- 2 files changed, 5 insertions(+), 8 deletions(-) diff --git a/pyhanko/pdf_utils/reader.py b/pyhanko/pdf_utils/reader.py index 4948d708..d77ac415 100644 --- a/pyhanko/pdf_utils/reader.py +++ b/pyhanko/pdf_utils/reader.py @@ -204,10 +204,8 @@ def __init__(self, stream, strict: bool = True): @property def security_handler(self): - if self._security_handler is None: - encrypt_dict = self._get_encryption_params() - if encrypt_dict is not None: - self._security_handler = SecurityHandler.build(encrypt_dict) + if self.encrypt_dict and not self._security_handler: + self._security_handler = SecurityHandler.build(self.encrypt_dict) return self._security_handler def _xmp_meta_view(self) -> Optional[DocumentMetadata]: @@ -325,7 +323,8 @@ def _get_object_from_stream(self, idnum, stmnum, idx): else: return generic.NullObject() - def _get_encryption_params(self) -> Optional[generic.DictionaryObject]: + @property + def encrypt_dict(self) -> Optional[generic.DictionaryObject]: try: encrypt_ref = self.trailer.raw_get('/Encrypt') except KeyError: @@ -344,8 +343,6 @@ def _get_encryption_params(self) -> Optional[generic.DictionaryObject]: raise misc.PdfReadError("Encryption settings must be a dictionary") return encrypt_dict - encrypt_dict = property(_get_encryption_params) - @property def trailer_view(self) -> generic.DictionaryObject: return self.trailer.flatten() diff --git a/pyhanko_tests/test_crypt.py b/pyhanko_tests/test_crypt.py index ee72497e..b0de97a5 100644 --- a/pyhanko_tests/test_crypt.py +++ b/pyhanko_tests/test_crypt.py @@ -1547,7 +1547,7 @@ def test_malformed_crypt(fname, strict): with open(os.path.join(PDF_DATA_DIR, fname), 'rb') as inf: r = PdfFileReader(inf, strict=strict) with pytest.raises(misc.PdfReadError, match='Encryption settings'): - r._get_encryption_params() + r.encrypt_dict def test_tolerate_direct_encryption_dict_in_nonstrict():