Skip to content

Commit

Permalink
Merge pull request pdfminer#264 from fakabbir/pdfstream-as-cmap
Browse files Browse the repository at this point in the history
Pdfstream as cmap
  • Loading branch information
tataganesh authored Jul 31, 2019
2 parents 6b312ed + f1a4dce commit 48b2593
Show file tree
Hide file tree
Showing 2 changed files with 111 additions and 13 deletions.
47 changes: 34 additions & 13 deletions pdfminer/pdffont.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
from .psparser import PSLiteral
from .psparser import literal_name
from .pdftypes import PDFException
from .pdftypes import PDFStream
from .pdftypes import resolve1
from .pdftypes import int_value
from .pdftypes import num_value
Expand Down Expand Up @@ -127,7 +128,7 @@ def do_keyword(self, pos, token):


# Lookup table addressed by nibble value (0-14); entry 13 has no character.
# NOTE(review): presumably the nibble-to-text table used when decoding CFF
# real-number operands (see the CFFFont section below) -- confirm against
# the code that indexes it.
NIBBLES = (
    '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
    '.', 'e', 'e-', None, '-',
)

# Names of the identity CMaps, horizontal (-H) and vertical (-V) variants.
IDENTITY_ENCODER = (
    'Identity-H',
    'Identity-V',
)

## CFFFont
## (Format specified in Adobe Technical Note: #5176
Expand Down Expand Up @@ -648,18 +649,8 @@ def __init__(self, rsrcmgr, spec, strict=settings.STRICT):
self.cidsysteminfo = dict_value(spec.get('CIDSystemInfo', {}))
self.cidcoding = '%s-%s' % (resolve1(self.cidsysteminfo.get('Registry', b'unknown')).decode("latin1"),
resolve1(self.cidsysteminfo.get('Ordering', b'unknown')).decode("latin1"))
try:
name = literal_name(spec['Encoding'])
except KeyError:
if strict:
raise PDFFontError('Encoding is unspecified')
name = 'unknown'
try:
self.cmap = CMapDB.get_cmap(name)
except CMapDB.CMapNotFound as e:
if strict:
raise PDFFontError(e)
self.cmap = CMap()
self.cmap = self.get_cmap_from_spec(spec, strict)

try:
descriptor = dict_value(spec['FontDescriptor'])
except KeyError:
Expand Down Expand Up @@ -706,6 +697,36 @@ def __init__(self, rsrcmgr, spec, strict=settings.STRICT):
PDFFont.__init__(self, descriptor, widths, default_width=default_width)
return

def get_cmap_from_spec(self, spec, strict):
    """
    Return the CMap selected by the font specification's Encoding entry.

    For certain PDFs, the encoding type isn't given directly as the value
    of spec['Encoding'] but as the CMapName attribute of a stream stored
    in spec['Encoding'].
    The horizontal/vertical modes are mentioned with different names
    such as 'DLIdent-H/V', 'OneByteIdentityH/V', 'Identity-H/V'.

    :param spec: font specification dictionary.
    :param strict: when true, raise PDFFontError instead of falling back
        if the encoding is missing or its CMap cannot be found.
    :return: whatever CMapDB.get_cmap returns for the resolved name, or
        an empty CMap() when the name cannot be resolved and strict is
        false.
    :raises PDFFontError: only when strict is true.
    """
    try:
        spec_encoding = spec['Encoding']
        if hasattr(spec_encoding, 'name'):
            # Encoding is given directly as a name literal.
            cmap_name = literal_name(spec_encoding)
        else:
            # Encoding is a stream; its CMapName attribute holds the name.
            cmap_name = literal_name(spec_encoding['CMapName'])
    except KeyError:
        if strict:
            raise PDFFontError('Encoding is unspecified')
        cmap_name = 'unknown'
    # isinstance (rather than an exact type comparison) so that PDFStream
    # subclasses are unwrapped as well.
    if isinstance(cmap_name, PDFStream):
        if 'CMapName' in cmap_name:
            cmap_name = cmap_name.get('CMapName').name
        else:
            if strict:
                raise PDFFontError('CMapName unspecified for encoding')
            cmap_name = 'unknown'
    # Ask the CMap database for every name, not only the identity ones, so
    # that other predefined CMaps keep working; unknown names fall back to
    # an empty CMap unless strict mode asks for an error.
    try:
        return CMapDB.get_cmap(cmap_name)
    except CMapDB.CMapNotFound as e:
        if strict:
            raise PDFFontError(e)
        return CMap()

def __repr__(self):
    """Return a debug string showing the font's basefont and cidcoding."""
    shown = (self.basefont, self.cidcoding)
    return '<PDFCIDFont: basefont=%r, cidcoding=%r>' % shown

Expand Down
77 changes: 77 additions & 0 deletions tests/test_pdfencoding.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
#!/usr/bin/env python

# -*- coding: utf-8 -*-

import nose, logging, os
from pdfminer.cmapdb import IdentityCMap, CMap
from pdfminer.pdffont import PDFCIDFont
from pdfminer.pdftypes import PDFStream
from pdfminer.psparser import PSLiteral

class TestPDFEncoding():
    """Check which CMap class PDFCIDFont picks for various Encoding entries."""

    @staticmethod
    def _cmap_for_encoding(encoding):
        # Build a font whose spec holds only the given Encoding value and
        # hand back the CMap it selected.
        return PDFCIDFont(None, {'Encoding': encoding}).cmap

    @classmethod
    def _cmap_for_cmapname(cls, cmap_name):
        # Wrap the name in a stream's CMapName attribute (empty raw data).
        return cls._cmap_for_encoding(PDFStream({'CMapName': cmap_name}, ''))

    # --- CMapName literals in a stream: expected to yield a plain CMap ---

    def test_cmapname_onebyteidentityV(self):
        selected = self._cmap_for_cmapname(PSLiteral('OneByteIdentityV'))
        assert isinstance(selected, CMap)

    def test_cmapname_onebyteidentityH(self):
        selected = self._cmap_for_cmapname(PSLiteral('OneByteIdentityH'))
        assert isinstance(selected, CMap)

    def test_cmapname_V(self):
        selected = self._cmap_for_cmapname(PSLiteral('V'))
        assert isinstance(selected, CMap)

    def test_cmapname_H(self):
        selected = self._cmap_for_cmapname(PSLiteral('H'))
        assert isinstance(selected, CMap)

    # --- Identity names given directly as the Encoding literal ---

    def test_encoding_identityH(self):
        selected = self._cmap_for_encoding(PSLiteral('Identity-H'))
        assert isinstance(selected, IdentityCMap)

    def test_encoding_identityV(self):
        selected = self._cmap_for_encoding(PSLiteral('Identity-V'))
        assert isinstance(selected, IdentityCMap)

    # --- Identity names as a PSLiteral CMapName inside a stream ---

    def test_encoding_identityH_as_PSLiteral_stream(self):
        selected = self._cmap_for_cmapname(PSLiteral('Identity-H'))
        assert isinstance(selected, IdentityCMap)

    def test_encoding_identityV_as_PSLiteral_stream(self):
        selected = self._cmap_for_cmapname(PSLiteral('Identity-V'))
        assert isinstance(selected, IdentityCMap)

    # --- Identity names as a plain-string CMapName inside a stream ---

    def test_encoding_identityH_as_stream(self):
        selected = self._cmap_for_cmapname('Identity-H')
        assert isinstance(selected, IdentityCMap)

    def test_encoding_identityV_as_stream(self):
        selected = self._cmap_for_cmapname('Identity-V')
        assert isinstance(selected, IdentityCMap)

    # --- No Encoding entry at all ---

    def test_font_without_spec(self):
        selected = PDFCIDFont(None, {}).cmap
        assert isinstance(selected, CMap)


# Run this module's tests through nose when it is executed as a script.
if __name__ == "__main__":
    nose.runmodule()

0 comments on commit 48b2593

Please sign in to comment.