-
Notifications
You must be signed in to change notification settings - Fork 11
/
Copy pathttf.py
executable file
·179 lines (157 loc) · 5.98 KB
/
ttf.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
#!/usr/bin/env python3
"""
Copied text from a PDF and pasted random symbols?
PDF files are sometimes purposely protected from copying,
but this often unintentionally prevents distribution and findability of knowledge.
Another reason to embed fonts is to protect the font designs themselves from copying.
This program is trying to recover that text and lost knowledge.
It recovers text in HTML produced by pdf2htmlEX from PDFs
where characters are broken due to embedded fonts with bad character maps (CMAPs).
This makes it possible to search and copy/paste the text.
The program works by comparing glyph shapes of the embedded fonts with known fonts,
so it is very helpful if the fonts used in the PDF document are known and their full version
is available. This allows fully automatic repair of information.
If the fonts are unknown or unavailable, or glyphs can't be recognized,
the program will ask the user to recognize the letter shape and key in the right symbol.
It will only ask once for each shape and remember the letter choice
in a human readable dictionary (dictionary.js).
The technical reason for random symbols:
Seemingly random characters are produced when you copy/paste text from PDF
because the PDF embedded fonts don't use standard unicode character code maps.
They use Private Use Area unicode range for mapping the glyph indices to codes.
"""
from lxml.html import fromstring, tostring
import operator
import string
import glob
import json
try:
from freetype import Face, FT_LOAD_RENDER, FT_LOAD_TARGET_MONO
except ImportError:
print('Requires: pip install freetype-py')
try:
from config import FULL_FONTS_PATH
except ImportError:
FULL_FONTS_PATH = './fonts'
# Truthy value makes load_fonts() print the directory and the fonts it loaded.
DEBUG = 1
def pua_content(txt):
    """Return the ratio of characters in *txt* encoded in the Private Use Area.

    The Private Use Area (PUA) range checked is U+E000 to U+F8FF.
    PUA is used by PDF embedded fonts if the original CMAP was thrown away.

    Args:
        txt: The string to inspect.

    Returns:
        A float in the range 0.0 to 1.0. An empty (or None) *txt* yields 0.0
        instead of raising ZeroDivisionError.
    """
    if not txt:
        # Nothing to measure; avoid dividing by a zero length.
        return 0.0
    pua_count = sum(1 for ch in txt if 0xE000 <= ord(ch) <= 0xF8FF)
    return pua_count / float(len(txt))
def bits(x):
    """Expand the low eight bits of *x* into a list of 0/1 ints.

    The most significant of the eight bits comes first; any bits above
    the eighth are ignored.
    """
    return [(x >> shift) & 1 for shift in range(7, -1, -1)]
def show_glyph(data, bitmap, draw=True):
    """Render a glyph as rows of text art for display on the CLI.

    Every pixel becomes a two-character cell (a block plus a space for ink,
    two spaces for background) so that all rows line up. The final character
    of each row, which is always a space, is trimmed.

    Args:
        data: Flat, row-major sequence of pixels; truthy means ink.
        bitmap: Object whose ``width`` attribute is the number of pixels
            per row.
        draw: Unused; kept so existing callers keep working.

    Returns:
        A list with one string per complete pixel row. Pixels left over
        after the last complete row are dropped. An empty list is returned
        when the bitmap width is zero.
    """
    width = bitmap.width
    if not width:
        # Blank glyphs (e.g. whitespace) have no columns to draw.
        return []
    pixels = list(data)
    on_cell = '\u2588 '
    # Background cells must be as wide as ink cells, otherwise rows shift.
    off_cell = ' ' * 2
    rows = []
    for start in range(0, len(pixels) - width + 1, width):
        cells = [on_cell if px else off_cell
                 for px in pixels[start:start + width]]
        rows.append(''.join(cells)[:-1])
    return rows
def glyph_data(face, char):
    """Render *char* with *face* as a monochrome bitmap and unpack its pixels.

    Args:
        face: Font face to render with. Its character size is set to
            ``32*48`` as a side effect.
        char: The character to load and render.

    Returns:
        A ``(data, bitmap)`` tuple. ``data`` is a flat, row-major list of
        0/1 ints holding at most ``bitmap.width`` pixels per row;
        ``bitmap`` is ``face.glyph.bitmap``.
    """
    face.set_char_size(32*48)  # alternatives noted by the author: 24*32, 48*64
    face.load_char(char, FT_LOAD_RENDER | FT_LOAD_TARGET_MONO)
    bitmap = face.glyph.bitmap
    width = bitmap.width
    pitch = bitmap.pitch
    # Fetch the buffer once instead of once per byte.
    # NOTE(review): in freetype-py ``Bitmap.buffer`` appears to be a property
    # that rebuilds a list on every access - confirm against the library.
    buffer = bitmap.buffer
    data = []
    for row_index in range(bitmap.rows):
        offset = row_index * pitch
        row = []
        for byte_index in range(pitch):
            # Each byte packs eight pixels, most significant bit first.
            row.extend(bits(buffer[offset + byte_index]))
        # Keep only the first `width` bits of each row.
        data.extend(row[:width])
    return data, bitmap
def load_fonts(path):
    """Load every ``*.ttf`` file found directly inside *path*.

    Args:
        path: Directory to search for TrueType fonts.

    Returns:
        A dict mapping each font's file name without its extension to the
        ``Face`` object loaded from it. When ``DEBUG`` is truthy the
        directory and a summary line per font are printed.
    """
    # Local import so this block does not depend on a module-level `import os`.
    import os.path

    # TODO: WOFF handling (would also glob path+'/*.woff')
    font_files = glob.glob(path + '/*.ttf')
    # basename/splitext copes with platform path separators and with
    # extensions whose case differs from '.ttf'; it also only strips the
    # final extension rather than every '.ttf' occurrence in the name.
    fonts = {
        os.path.splitext(os.path.basename(font_file))[0]: Face(font_file)
        for font_file in font_files
    }
    if DEBUG:
        print('Loading fonts from: ' + path)
        for face in fonts.values():
            print(face.family_name.decode(), face.style_name.decode(),
                  face.num_glyphs, 'glyphs')
    return fonts
def char_lookup(fonts):
    """Build a table from rendered glyph shape to the character it shows.

    Every character of ``string.printable`` plus a few extra symbols is
    rendered with each font in *fonts* (a name -> face mapping). The key is
    ``str()`` of the glyph's flat pixel list; when several characters render
    to the same pixels, the one rendered last wins.
    """
    chars = string.printable + "£©¹’'‘’“”"
    table = {}
    for face in fonts.values():
        for symbol in chars:
            pixels, _bitmap = glyph_data(face, symbol)
            table[str(pixels)] = symbol
    return table
def lookup_user(data, bitmap):
    """Resolve a glyph shape via the user-maintained dictionary file.

    The glyph is rendered to text art with ``show_glyph`` and looked up in
    ``dictionary.js`` (a JSON list of ``[character, shape]`` pairs). If the
    shape is unknown, it is printed, the user is asked to type the character
    it shows, and the answer is saved back to the file sorted by character.

    Args:
        data: Flat pixel list of the glyph.
        bitmap: Bitmap object describing the glyph; passed to ``show_glyph``.

    Returns:
        The character associated with the shape.
    """
    dictionary = "dictionary.js"
    try:
        with open(dictionary, "r", encoding="utf-8") as fh:
            lookup = json.load(fh)
    except (FileNotFoundError, ValueError):
        # No dictionary yet, or the dictionary file was empty.
        lookup = []
    shape = show_glyph(data, bitmap)
    # Look the shape up in our dictionary; first match wins.
    for known_char, known_shape in lookup:
        if known_shape == shape:
            return known_char
    # No known character - ask for input.
    for line in shape:
        print(line)
    print('\a')  # terminal bell to get the user's attention
    char = input("Please enter character shown: ")
    print("you entered: ", char)
    lookup.append((char, shape))
    lookup = sorted(lookup, key=lambda x: x[0])
    # ensure_ascii=False writes the block characters verbatim, so the file
    # encoding is fixed to UTF-8 rather than left to the platform default.
    with open(dictionary, "w", encoding="utf-8") as fh:
        json.dump(lookup, fh, indent=1, ensure_ascii=False)
    return char
# Glyph-shape -> character table, built once at import time from the fonts
# found in FULL_FONTS_PATH; keys are str() of a glyph's flat pixel list.
LOOKUP_FONTS = char_lookup(load_fonts(FULL_FONTS_PATH))
def decode_font(code, font, embed_fonts):
    """Translate *code* into readable text by matching glyph shapes.

    Each code point of *code* is rendered with the embedded font
    ``embed_fonts[font]``. The rendered pixels are looked up in
    ``LOOKUP_FONTS``; shapes not found there are resolved by
    ``lookup_user``. The recovered characters are returned as one string.
    """
    recovered = []
    for codepoint in code:
        pixels, bitmap = glyph_data(embed_fonts[font], codepoint)
        shape_key = str(pixels)
        if shape_key in LOOKUP_FONTS:
            recovered.append(LOOKUP_FONTS[shape_key])
        else:
            recovered.append(lookup_user(pixels, bitmap))
    return ''.join(recovered)
def font_family(e):
    """Return the font id (e.g. ``'f1'``) that applies to element *e*.

    The font is taken from a CSS class of the form ``ff<id>`` on the element
    itself or, failing that, on its nearest ancestor carrying such a class.
    The class ``ff<id>`` maps to the font id ``f<id>``.

    Args:
        e: An element offering ``get()`` and ``iterancestors()``, or None.

    Returns:
        The font id string; ``'f1'`` when *e* is None or no ``ff`` class is
        found on the element or any ancestor.
    """
    def class_font(element):
        # `or ''` guards against nodes whose get() yields None regardless of
        # the default (NOTE(review): lxml comment nodes appear to do this).
        css = element.get('class', '') or ''
        for token in css.split():
            # Keep the whole id: ids may be longer than one character
            # (e.g. 'ff1a'), and truncating them would select the wrong font.
            if token.startswith('ff') and len(token) > 2:
                return token[1:]
        return None

    if e is not None:
        for node in [e, *e.iterancestors()]:
            found = class_font(node)
            if found:
                return found
    return 'f1'
def recover_text(dom, embed_fonts_path):
    """Decode, in place, the text of every node in *dom* set in an embedded font.

    Fonts are loaded from *embed_fonts_path* with ``load_fonts``. A node's
    own text uses the node's font family, while its tail (the text that
    follows the node) uses the font family of the node's parent.
    """
    embed_fonts = load_fonts(embed_fonts_path)

    def decode(txt, font):
        return decode_font(txt, font, embed_fonts)

    for node in dom.iter():
        own_font = font_family(node)
        parent_font = font_family(node.getparent())
        # Element text and tail can be in different font families; only text
        # whose font family is an embedded font gets decoded.
        if node.text and node.text != ' ' and own_font in embed_fonts:
            node.text = decode(node.text, own_font)
        if node.tail and parent_font in embed_fonts:
            node.tail = decode(node.tail, parent_font)
if __name__ == '__main__':
    import transcript
    import config
    import os.path

    # Repair one hard-coded document; the embedded fonts are looked up in
    # the same directory as the HTML file.
    doc_path = config.HTML_DIR + '/100026_945655/100026_945655.html'
    dom, css = transcript.prepare(doc_path)
    recover_text(dom, os.path.dirname(doc_path))
    # Write the result next to the input with a .htm extension; the context
    # manager closes the file even if serialising or writing fails.
    with open(doc_path.replace('.html', '.htm'), "wb") as f:
        f.write(tostring(dom))