# ttf.py

#!/usr/bin/env python3
"""
Copied text from a PDF and pasted random symbols?

PDF files are sometimes purposely protected from copying,
but this often unintentionally prevents distribution and findability of knowledge.
Another reason to embed fonts is to protect the font designs themselves from copying.

This program tries to recover that text and the lost knowledge.

It recovers text in HTML produced by pdf2htmlEX from PDFs
where characters are broken due to embedded fonts with bad character maps (CMAPs).

This makes it possible to search and copy/paste the text.

The program works by comparing glyph shapes of the embedded fonts with known fonts,
so it is very helpful if the fonts used in the PDF document are known and their full version
is available. This allows fully automatic repair of information.

If the fonts are unknown, unavailable, or glyphs can't be recognized,
the program will ask the user to recognize the letter shape and key in the right symbol.
It will only ask once for each shape and remember the letter choice
in a human readable dictionary (dictionary.js).

The technical reason for random symbols:
Seemingly random characters are produced when you copy/paste text from a PDF
because the PDF embedded fonts don't use standard Unicode character code maps.
They use the Private Use Area Unicode range for mapping the glyph indices to codes.
"""
from lxml.html import fromstring, tostring
import operator
import string
import glob
import json
try:
    from freetype import Face, FT_LOAD_RENDER, FT_LOAD_TARGET_MONO
except ImportError:
    print('Requires: pip install freetype-py')
try:
    from config import FULL_FONTS_PATH
except ImportError:
    FULL_FONTS_PATH = './fonts'

# Debug switch: when truthy, load_fonts() prints the directory it scans and a
# one-line summary (family, style, glyph count) of every face it loads.
DEBUG = 1

def pua_content(txt):
    """Return the ratio of characters in *txt* that lie in the Private Use Area.

    The Private Use Area (PUA) spans U+E000–U+F8FF and is used by PDF embedded
    fonts if the original CMAP was thrown away.

    Args:
        txt: The string to inspect.

    Returns:
        A float between 0.0 and 1.0. An empty (or falsy) *txt* yields 0.0
        instead of raising ZeroDivisionError.
    """
    if not txt:
        # Nothing to measure; avoid dividing by a zero length.
        return 0.0
    pua_count = sum(1 for ch in txt if 0xE000 <= ord(ch) <= 0xF8FF)
    return pua_count / float(len(txt))

def bits(x):
    """Unpack the lowest eight bits of *x* into a list of 0/1 ints, MSB first."""
    return [int((x >> shift) & 1) for shift in range(7, -1, -1)]

def show_glyph(data, bitmap, draw=True):
    """Turn flat pixel *data* into text-art lines, one string per bitmap row.

    Every pixel becomes two characters ('█ ' when set, two spaces otherwise);
    the last character of each complete row is dropped, and a trailing
    incomplete row is discarded. *draw* is accepted but not used.
    """
    canvas = ''.join('█ ' if px else '  ' for px in data)
    if not canvas:
        return []
    chars_per_row = bitmap.width * 2
    complete_rows = len(canvas) // chars_per_row
    return [
        canvas[row * chars_per_row:(row + 1) * chars_per_row - 1]
        for row in range(complete_rows)
    ]

def glyph_data(face, char):
    """Render *char* with *face* as a monochrome bitmap and unpack its pixels.

    Args:
        face: A freetype ``Face``. Its character size is set as a side effect.
        char: The character to load and render.

    Returns:
        A ``(data, bitmap)`` tuple. ``data`` is a flat, row-by-row list of 0/1
        pixel values with ``bitmap.width`` entries per row; ``bitmap`` is the
        rendered glyph's bitmap object.
    """
    face.set_char_size(32*48) # 24*32, 32*48, 48*64
    face.load_char(char, FT_LOAD_RENDER | FT_LOAD_TARGET_MONO)
    bitmap = face.glyph.bitmap
    width = bitmap.width
    rows = bitmap.rows
    pitch = bitmap.pitch
    # Fetch the buffer once: ``buffer`` is a property (freetype-py appears to
    # rebuild the whole list on every access), so reading it per byte inside
    # the loops would repeat that work rows*pitch times.
    buffer = bitmap.buffer
    data = []
    for i in range(rows):
        row = []
        for j in range(pitch):
            # Each byte packs eight pixels, most significant bit first.
            row.extend(bits(buffer[i*pitch + j]))
        # A row occupies ``pitch`` whole bytes; keep only the real pixels.
        data.extend(row[:width])
    return data, bitmap

def load_fonts(path):
    """Load every ``*.ttf`` font found directly inside *path*.

    Args:
        path: Directory to scan for TrueType font files.

    Returns:
        A dict mapping each font's file name, without directory and without
        extension (``f1`` for ``f1.ttf``), to its freetype ``Face``.
    """
    # Local import: the module itself only imports os.path under __main__.
    import os.path

    # TODO: WOFF handling
    font_files = glob.glob(os.path.join(path, '*.ttf'))# + glob.glob(path+'/*.woff')
    # basename/splitext instead of split('/') and replace('.ttf', ''): works
    # with either path separator and only strips the trailing extension.
    fonts = {
        os.path.splitext(os.path.basename(font_file))[0]: Face(font_file)
        for font_file in font_files
    }
    if DEBUG:
        print('Loading fonts from: '+ path)
        for face in fonts.values():
            print(face.family_name.decode(), face.style_name.decode(), face.num_glyphs, 'glyphs')
    return fonts

def char_lookup(fonts):
    """Build a glyph-shape -> character table from the given fonts.

    For every face in *fonts* and every character of the reference alphabet,
    the rendered pixel list (as a string) becomes the key and the character the
    value. When two glyphs render to the same pixels, the one processed last
    wins.
    """
    alphabet = string.printable + "£©¹’'‘’“”"
    table = {}
    for face in fonts.values():
        for symbol in alphabet:
            pixels, _ = glyph_data(face, symbol)
            table[str(pixels)] = symbol
    return table

def lookup_user(data, bitmap):
    """Resolve a glyph shape via the user-maintained dictionary, asking if needed.

    The text-art rendering of the glyph is looked up in ``dictionary.js``
    (relative to the current working directory). If the shape is unknown, it is
    printed, the user is prompted for the character, and the answer is stored
    back in the dictionary (kept sorted by character).

    Args:
        data: Flat pixel list of the glyph, as produced by ``glyph_data``.
        bitmap: The glyph's bitmap object (its ``width`` drives the rendering).

    Returns:
        The character associated with the shape.
    """
    dictionary = "dictionary.js"
    try:
        # Explicit UTF-8: the shapes contain non-ASCII block characters and
        # are written with ensure_ascii=False.
        with open(dictionary, "r", encoding="utf-8") as fh:
            lookup = json.load(fh)
    except (FileNotFoundError, ValueError): # dictionary missing, empty or invalid
        lookup = []
    shape = show_glyph(data, bitmap)
    # Lookup shape in our dictionary; the first matching entry wins.
    for known_char, known_shape in lookup:
        if known_shape == shape:
            return known_char
    # No known character - ask for input
    for line in shape:
        print(line)
    print('\a')
    char = input("Please enter character shown: ")
    print("you entered: ", char)
    lookup.append((char, shape))
    lookup = sorted(lookup, key=lambda x: x[0])
    with open(dictionary, "w", encoding="utf-8") as fh:
        json.dump(lookup, fh, indent=1, ensure_ascii=False)
    return char

# Glyph-shape -> character table, built once at import time from the fonts in
# FULL_FONTS_PATH; decode_font() consults it before falling back to the user.
LOOKUP_FONTS = char_lookup(load_fonts(FULL_FONTS_PATH))

def decode_font(code, font, embed_fonts):
    """Translate *code* into readable text by recognising each glyph's shape.

    Every code point of *code* is rendered with the embedded font
    ``embed_fonts[font]``. A shape found in LOOKUP_FONTS is mapped to its
    character directly; any other shape is resolved through lookup_user().
    Returns the decoded string.
    """
    decoded = []
    for codepoint in code:
        pixels, bitmap = glyph_data(embed_fonts[font], codepoint)
        shape_key = str(pixels)
        if shape_key in LOOKUP_FONTS:
            decoded.append(LOOKUP_FONTS[shape_key])
        else:
            decoded.append(lookup_user(pixels, bitmap))
    return ''.join(decoded)

def font_family(e):
    """Return the font id (e.g. ``'f1'``) that applies to element *e*.

    The id is derived from the first ``ff<id>`` token in the ``class``
    attribute of *e* or, failing that, of its nearest ancestor carrying one:
    ``ff1`` gives ``'f1'`` and ``ff10`` gives ``'f10'``. The full id is kept
    so that it can match a multi-character font name.

    Args:
        e: An element offering ``get()`` and ``iterancestors()``, or None.

    Returns:
        The font id string; ``'f1'`` when *e* is None or no element in the
        chain declares a font class.
    """
    def fn(el):
        if el is None:
            return None
        # get() may hand back None despite the default (presumably for
        # comment-like nodes - verify against lxml), so normalise to ''.
        css = el.get('class', '') or ''
        for token in css.split():
            # A bare 'ff' carries no id and is ignored.
            if token.startswith('ff') and len(token) > 2:
                return token[1:]
        return None

    if e is None:
        return 'f1'
    family = fn(e)
    if family:
        return family
    for ancestor in e.iterancestors():
        family = fn(ancestor)
        if family:
            return family
    return 'f1'


def recover_text(dom, embed_fonts_path):
    """Decode, in place, all text of *dom* that is set in an embedded font.

    Fonts are loaded from *embed_fonts_path*. An element's text uses the
    element's own font family, while its tail (the text following the element)
    uses the parent's; each is decoded only when that family is one of the
    loaded embedded fonts.
    """
    embed_fonts = load_fonts(embed_fonts_path)
    for node in dom.iter():
        own_family = font_family(node)
        parent_family = font_family(node.getparent())
        # NOTE(review): a text of exactly one space is skipped, but a tail of
        # one space is still decoded - confirm whether that is intended.
        text = node.text
        if text and text != ' ' and own_family in embed_fonts:
            node.text = decode_font(text, own_family, embed_fonts)
        tail = node.tail
        if tail and parent_family in embed_fonts:
            node.tail = decode_font(tail, parent_family, embed_fonts)

if __name__ == '__main__':
    # Script entry point: repair one pdf2htmlEX document and save the result
    # next to it with an ``.htm`` extension.
    import transcript
    import config
    import os.path
    import sys

    # An optional first command-line argument selects the document; without it
    # the built-in sample document is processed, as before.
    if len(sys.argv) > 1:
        doc_path = sys.argv[1]
    else:
        doc_path = config.HTML_DIR + '/100026_945655/100026_945655.html'
    dom, css = transcript.prepare(doc_path)
    # The embedded fonts are expected in the same directory as the document.
    recover_text(dom, os.path.dirname(doc_path))
    # Swap only the trailing extension; ``with`` closes the file even if
    # serialisation or writing fails.
    with open(os.path.splitext(doc_path)[0] + '.htm', "wb") as f:
        f.write(tostring(dom))