From 8ab2e287be67185cc120597e6785c1b81df6b842 Mon Sep 17 00:00:00 2001 From: John Kesegich Date: Mon, 25 Feb 2019 11:33:18 -0600 Subject: [PATCH 1/7] Handle PDFStream as character map name in PDFCIDFont --- pdfminer/pdffont.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/pdfminer/pdffont.py b/pdfminer/pdffont.py index feb85575..1487bab2 100644 --- a/pdfminer/pdffont.py +++ b/pdfminer/pdffont.py @@ -16,6 +16,7 @@ from .psparser import PSLiteral from .psparser import literal_name from .pdftypes import PDFException +from .pdftypes import PDFStream from .pdftypes import resolve1 from .pdftypes import int_value from .pdftypes import num_value @@ -654,6 +655,17 @@ def __init__(self, rsrcmgr, spec, strict=settings.STRICT): if strict: raise PDFFontError('Encoding is unspecified') name = 'unknown' + if type(name) is PDFStream: + if 'CMapName' in name: + name = name.get('CMapName').name + if name == 'DLIdent-H': + name = 'Identity-H' + elif name == 'DLIdent-V': + name = 'Identity-V' + else: + if strict: + raise PDFFontError('Encoding is unspecified') + name = 'unknown' try: self.cmap = CMapDB.get_cmap(name) except CMapDB.CMapNotFound as e: From c022358c8df6f2b6b9affc8c4bfc716aa222dcee Mon Sep 17 00:00:00 2001 From: Fakabbir Amin Date: Sat, 13 Jul 2019 04:52:24 +0530 Subject: [PATCH 2/7] Encapsulates character map name --- pdfminer/pdffont.py | 60 ++++++++++++++++++++++++++++----------------- 1 file changed, 37 insertions(+), 23 deletions(-) diff --git a/pdfminer/pdffont.py b/pdfminer/pdffont.py index 1487bab2..fdebe33c 100644 --- a/pdfminer/pdffont.py +++ b/pdfminer/pdffont.py @@ -649,29 +649,8 @@ def __init__(self, rsrcmgr, spec, strict=settings.STRICT): self.cidsysteminfo = dict_value(spec.get('CIDSystemInfo', {})) self.cidcoding = '%s-%s' % (resolve1(self.cidsysteminfo.get('Registry', b'unknown')).decode("latin1"), resolve1(self.cidsysteminfo.get('Ordering', b'unknown')).decode("latin1")) - try: - name = literal_name(spec['Encoding']) - except KeyError: 
- if strict: - raise PDFFontError('Encoding is unspecified') - name = 'unknown' - if type(name) is PDFStream: - if 'CMapName' in name: - name = name.get('CMapName').name - if name == 'DLIdent-H': - name = 'Identity-H' - elif name == 'DLIdent-V': - name = 'Identity-V' - else: - if strict: - raise PDFFontError('Encoding is unspecified') - name = 'unknown' - try: - self.cmap = CMapDB.get_cmap(name) - except CMapDB.CMapNotFound as e: - if strict: - raise PDFFontError(e) - self.cmap = CMap() + self.cmap = (spec, strict) + try: descriptor = dict_value(spec['FontDescriptor']) except KeyError: @@ -718,6 +697,41 @@ def __init__(self, rsrcmgr, spec, strict=settings.STRICT): PDFFont.__init__(self, descriptor, widths, default_width=default_width) return + @property + def cmap(self): + return self._cmap + + @cmap.setter + def cmap(self,values): + spec, strict = values + try: + spec_encoding = spec['Encoding'] + if hasattr(spec_encoding, 'name'): + name = literal_name(spec['Encoding']) + else: + name = literal_name(spec_encoding['CMapName']) + except KeyError: + if strict: + raise PDFFontError('Encoding is unspecified') + name = 'unknown' + if type(name) is PDFStream: + if 'CMapName' in name: + name = name.get('CMapName').name + if name in('DLIdent-H','OneByteIdentityH','Identity-H') : + name = 'Identity-H' + elif name in ('DLIdent-V','OneByteIdentityV','Identity-V'): + name = 'Identity-V' + else: + if strict: + raise PDFFontError('Encoding is unspecified') + name = 'unknown' + try: + self._cmap = CMapDB.get_cmap(name) + except CMapDB.CMapNotFound as e: + if strict: + raise PDFFontError(e) + self._cmap = CMap() + def __repr__(self): return '' % (self.basefont, self.cidcoding) From 8e4a82ad8b249ddd1bc5cd42f6df992e15c0a01d Mon Sep 17 00:00:00 2001 From: Fakabbir Amin Date: Sat, 13 Jul 2019 05:00:25 +0530 Subject: [PATCH 3/7] Corrects Indentation --- pdfminer/pdffont.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pdfminer/pdffont.py b/pdfminer/pdffont.py index 
fdebe33c..85c23b59 100644 --- a/pdfminer/pdffont.py +++ b/pdfminer/pdffont.py @@ -697,7 +697,7 @@ def __init__(self, rsrcmgr, spec, strict=settings.STRICT): PDFFont.__init__(self, descriptor, widths, default_width=default_width) return - @property + @property def cmap(self): return self._cmap From cc40af3d2b84a8532988dfe70c3d6c3e15e29ffa Mon Sep 17 00:00:00 2001 From: Fakabbir Amin Date: Mon, 15 Jul 2019 14:21:21 +0530 Subject: [PATCH 4/7] Removes @property, Adds docstring --- pdfminer/pdffont.py | 53 ++++++++++++++++++++++++++------------------- 1 file changed, 31 insertions(+), 22 deletions(-) diff --git a/pdfminer/pdffont.py b/pdfminer/pdffont.py index 85c23b59..f5b89421 100644 --- a/pdfminer/pdffont.py +++ b/pdfminer/pdffont.py @@ -128,7 +128,14 @@ def do_keyword(self, pos, token): NIBBLES = ('0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '.', 'e', 'e-', None, '-') - +CMAP_ENCODER = { + 'DLIdent-H': 'Identity-H', + 'OneByteIdentityH': 'Identity-H', + 'Identity-H': 'Identity-H', + 'DLIdent-V': 'Identity-V', + 'OneByteIdentityV': 'Identity-V', + 'Identity-V': 'Identity-V' +} ## CFFFont ## (Format specified in Adobe Technical Note: #5176 @@ -649,7 +656,7 @@ def __init__(self, rsrcmgr, spec, strict=settings.STRICT): self.cidsysteminfo = dict_value(spec.get('CIDSystemInfo', {})) self.cidcoding = '%s-%s' % (resolve1(self.cidsysteminfo.get('Registry', b'unknown')).decode("latin1"), resolve1(self.cidsysteminfo.get('Ordering', b'unknown')).decode("latin1")) - self.cmap = (spec, strict) + self.cmap_setter(spec, strict) try: descriptor = dict_value(spec['FontDescriptor']) @@ -697,40 +704,42 @@ def __init__(self, rsrcmgr, spec, strict=settings.STRICT): PDFFont.__init__(self, descriptor, widths, default_width=default_width) return - @property - def cmap(self): - return self._cmap - - @cmap.setter - def cmap(self,values): - spec, strict = values + def cmap_setter(self, spec, strict): + """ + For certain PDFs, Encoding Type isn't mentioned as an attribute of + Encoding but 
as an attribute of CMapName, where CMapName is an + attribute of spec['Encoding']. + The horizontal/vertical modes are mentioned with different names + such as 'DLIdent-H/V','OneByteIdentityH/V','Identity-H/V' + """ try: spec_encoding = spec['Encoding'] if hasattr(spec_encoding, 'name'): - name = literal_name(spec['Encoding']) + cmap_name = literal_name(spec['Encoding']) else: - name = literal_name(spec_encoding['CMapName']) + cmap_name = literal_name(spec_encoding['CMapName']) except KeyError: if strict: raise PDFFontError('Encoding is unspecified') - name = 'unknown' - if type(name) is PDFStream: - if 'CMapName' in name: - name = name.get('CMapName').name - if name in('DLIdent-H','OneByteIdentityH','Identity-H') : - name = 'Identity-H' - elif name in ('DLIdent-V','OneByteIdentityV','Identity-V'): - name = 'Identity-V' + cmap_name = 'unknown' + if type(cmap_name) is PDFStream: + if 'CMapName' in cmap_name: + cmap_key = cmap_name.get('CMapName').cmap_name + try: + cmap_name = CMAP_ENCODER[cmap_key] + except: + cmap_name = cmap_key + raise PDFFontError('Unidentified encoding mentioned. 
%s is not supported' % cmap_name) else: if strict: raise PDFFontError('Encoding is unspecified') - name = 'unknown' + cmap_name = 'unknown' try: - self._cmap = CMapDB.get_cmap(name) + self.cmap = CMapDB.get_cmap(cmap_name) except CMapDB.CMapNotFound as e: if strict: raise PDFFontError(e) - self._cmap = CMap() + self.cmap = CMap() def __repr__(self): return '' % (self.basefont, self.cidcoding) From fa400431f571de9197ab72ffc5dd9f7d76826474 Mon Sep 17 00:00:00 2001 From: Fakabbir Amin Date: Wed, 17 Jul 2019 11:38:00 +0530 Subject: [PATCH 5/7] Adds Test, Removes Unnecessary Assumptions --- pdfminer/pdffont.py | 24 ++++------------- tests/test_pdfencoding.py | 56 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 61 insertions(+), 19 deletions(-) create mode 100644 tests/test_pdfencoding.py diff --git a/pdfminer/pdffont.py b/pdfminer/pdffont.py index f5b89421..a09c5c43 100644 --- a/pdfminer/pdffont.py +++ b/pdfminer/pdffont.py @@ -128,14 +128,7 @@ def do_keyword(self, pos, token): NIBBLES = ('0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '.', 'e', 'e-', None, '-') -CMAP_ENCODER = { - 'DLIdent-H': 'Identity-H', - 'OneByteIdentityH': 'Identity-H', - 'Identity-H': 'Identity-H', - 'DLIdent-V': 'Identity-V', - 'OneByteIdentityV': 'Identity-V', - 'Identity-V': 'Identity-V' -} +IDENTITY_ENCODER = ('Identity-H', 'Identity-V') ## CFFFont ## (Format specified in Adobe Technical Note: #5176 @@ -724,21 +717,14 @@ def cmap_setter(self, spec, strict): cmap_name = 'unknown' if type(cmap_name) is PDFStream: if 'CMapName' in cmap_name: - cmap_key = cmap_name.get('CMapName').cmap_name - try: - cmap_name = CMAP_ENCODER[cmap_key] - except: - cmap_name = cmap_key - raise PDFFontError('Unidentified encoding mentioned. 
%s is not supported' % cmap_name) + cmap_name = cmap_name.get('CMapName').name else: if strict: - raise PDFFontError('Encoding is unspecified') + raise PDFFontError('CMapName unspecified for encoding') cmap_name = 'unknown' - try: + if cmap_name in IDENTITY_ENCODER: self.cmap = CMapDB.get_cmap(cmap_name) - except CMapDB.CMapNotFound as e: - if strict: - raise PDFFontError(e) + else: self.cmap = CMap() def __repr__(self): diff --git a/tests/test_pdfencoding.py b/tests/test_pdfencoding.py new file mode 100644 index 00000000..47256151 --- /dev/null +++ b/tests/test_pdfencoding.py @@ -0,0 +1,56 @@ +#!/usr/bin/env python + +# -*- coding: utf-8 -*- + +import nose, logging, os +from pdfminer.cmapdb import IdentityCMap, CMap +from pdfminer.pdffont import PDFCIDFont +from pdfminer.pdftypes import PDFStream +from pdfminer.psparser import PSLiteral + +# 'DLIdent-H': 'Identity-H', +# 'OneByteIdentityH': 'Identity-H', +# 'Identity-H': 'Identity-H', +# 'DLIdent-V': 'Identity-V', +# 'OneByteIdentityV': 'Identity-V', +# 'Identity-V': 'Identity-V' + +class TestPDFEncoding(): + + def test_cmapname_onebyteidentityV(self): + stream = PDFStream({'CMapName': PSLiteral('OneByteIdentityV')}, '') + spec = {'Encoding': stream} + font = PDFCIDFont(None, spec) + assert isinstance(font.cmap, CMap) + + def test_cmapname_onebyteidentityH(self): + stream = PDFStream({'CMapName': PSLiteral('OneByteIdentityH')}, '') + spec = {'Encoding': stream} + font = PDFCIDFont(None, spec) + assert isinstance(font.cmap, CMap) + + def test_cmapname_V(self): + stream = PDFStream({'CMapName': PSLiteral('V')}, '') + spec = {'Encoding': stream} + font = PDFCIDFont(None, spec) + assert isinstance(font.cmap, CMap) + + def test_cmapname_H(self): + stream = PDFStream({'CMapName': PSLiteral('H')}, '') + spec = {'Encoding': stream} + font = PDFCIDFont(None, spec) + assert isinstance(font.cmap, CMap) + + def test_encoding_identityH(self): + spec = {'Encoding': PSLiteral('Identity-H')} + font = PDFCIDFont(None, spec) + 
assert isinstance(font.cmap, IdentityCMap) + + def test_encoding_identityV(self): + spec = {'Encoding': PSLiteral('Identity-V')} + font = PDFCIDFont(None, spec) + assert isinstance(font.cmap, IdentityCMap) + + +if __name__ == '__main__': + nose.runmodule() From b4c261b647f42844981e083f9920240ccea2a0dc Mon Sep 17 00:00:00 2001 From: Fakabbir Amin Date: Wed, 17 Jul 2019 11:43:45 +0530 Subject: [PATCH 6/7] Removes Code Comments --- tests/test_pdfencoding.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/tests/test_pdfencoding.py b/tests/test_pdfencoding.py index 47256151..9ed4e9e5 100644 --- a/tests/test_pdfencoding.py +++ b/tests/test_pdfencoding.py @@ -8,13 +8,6 @@ from pdfminer.pdftypes import PDFStream from pdfminer.psparser import PSLiteral -# 'DLIdent-H': 'Identity-H', -# 'OneByteIdentityH': 'Identity-H', -# 'Identity-H': 'Identity-H', -# 'DLIdent-V': 'Identity-V', -# 'OneByteIdentityV': 'Identity-V', -# 'Identity-V': 'Identity-V' - class TestPDFEncoding(): def test_cmapname_onebyteidentityV(self): From f1a4dcea88e2ada84f737c23b12bf0d3f9a57c49 Mon Sep 17 00:00:00 2001 From: Fakabbir Amin Date: Wed, 24 Jul 2019 11:56:06 +0530 Subject: [PATCH 7/7] Adds Test Cases, Neater Code For CMap Assignment --- pdfminer/pdffont.py | 8 ++++---- tests/test_pdfencoding.py | 28 ++++++++++++++++++++++++++++ 2 files changed, 32 insertions(+), 4 deletions(-) diff --git a/pdfminer/pdffont.py b/pdfminer/pdffont.py index a09c5c43..9f24afb0 100644 --- a/pdfminer/pdffont.py +++ b/pdfminer/pdffont.py @@ -649,7 +649,7 @@ def __init__(self, rsrcmgr, spec, strict=settings.STRICT): self.cidsysteminfo = dict_value(spec.get('CIDSystemInfo', {})) self.cidcoding = '%s-%s' % (resolve1(self.cidsysteminfo.get('Registry', b'unknown')).decode("latin1"), resolve1(self.cidsysteminfo.get('Ordering', b'unknown')).decode("latin1")) - self.cmap_setter(spec, strict) + self.cmap = self.get_cmap_from_spec(spec, strict) try: descriptor = dict_value(spec['FontDescriptor']) @@ -697,7 +697,7 @@ def 
__init__(self, rsrcmgr, spec, strict=settings.STRICT): PDFFont.__init__(self, descriptor, widths, default_width=default_width) return - def cmap_setter(self, spec, strict): + def get_cmap_from_spec(self, spec, strict): """ For certain PDFs, Encoding Type isn't mentioned as an attribute of Encoding but as an attribute of CMapName, where CMapName is an @@ -723,9 +723,9 @@ def cmap_setter(self, spec, strict): raise PDFFontError('CMapName unspecified for encoding') cmap_name = 'unknown' if cmap_name in IDENTITY_ENCODER: - self.cmap = CMapDB.get_cmap(cmap_name) + return CMapDB.get_cmap(cmap_name) else: - self.cmap = CMap() + return CMap() def __repr__(self): return '' % (self.basefont, self.cidcoding) diff --git a/tests/test_pdfencoding.py b/tests/test_pdfencoding.py index 9ed4e9e5..396d12d8 100644 --- a/tests/test_pdfencoding.py +++ b/tests/test_pdfencoding.py @@ -44,6 +44,34 @@ def test_encoding_identityV(self): font = PDFCIDFont(None, spec) assert isinstance(font.cmap, IdentityCMap) + def test_encoding_identityH_as_PSLiteral_stream(self): + stream = PDFStream({'CMapName':PSLiteral('Identity-H')}, '') + spec = {'Encoding': stream} + font = PDFCIDFont(None, spec) + assert isinstance(font.cmap, IdentityCMap) + + def test_encoding_identityV_as_PSLiteral_stream(self): + stream = PDFStream({'CMapName':PSLiteral('Identity-V')}, '') + spec = {'Encoding': stream} + font = PDFCIDFont(None, spec) + assert isinstance(font.cmap, IdentityCMap) + + def test_encoding_identityH_as_stream(self): + stream = PDFStream({'CMapName':'Identity-H'}, '') + spec = {'Encoding': stream} + font = PDFCIDFont(None, spec) + assert isinstance(font.cmap, IdentityCMap) + + def test_encoding_identityV_as_stream(self): + stream = PDFStream({'CMapName':'Identity-V'}, '') + spec = {'Encoding': stream} + font = PDFCIDFont(None, spec) + assert isinstance(font.cmap, IdentityCMap) + + def test_font_without_spec(self): + font = PDFCIDFont(None, {}) + assert isinstance(font.cmap, CMap) + if __name__ == 
'__main__': nose.runmodule()