Merge pull request pdfminer#230 from 0xabu/unicode_glyph_bug

name2unicode(): handle hexadecimal literals for Unicode glyphs in text extraction. Unicode literals in glyph names are hex, not decimal (see https://github.com/adobe-type-tools/agl-specification). We are still far from conformance with the full spec, but this handles more PDFs seen in the wild.
docsumo · Jul 9, 2019 · 6b312ed · 6b312ed
2 parents b6a5848 + c4c0a36
commit 6b312ed
Showing 1 changed file with 2 additions and 2 deletions.
diff --git a/pdfminer/encodingdb.py b/pdfminer/encodingdb.py
@@ -6,7 +6,7 @@
 
 import six # Python 2+3 compatibility
 
-STRIP_NAME = re.compile(r'[0-9]+')
+STRIP_NAME = re.compile(r'[0-9A-Fa-f]+')
 
 
 ##  name2unicode
@@ -18,7 +18,7 @@ def name2unicode(name):
     m = STRIP_NAME.search(name)
     if not m:
         raise KeyError(name)
-    return six.unichr(int(m.group(0)))
+    return six.unichr(int(m.group(0), base=16))
 
 
 ##  EncodingDB