-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcharfinder2.py
executable file
·101 lines (79 loc) · 2.56 KB
/
charfinder2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
#!/usr/bin/env python3
"""
charfinder2.py:
Searches for Unicode characters named with the words given.
Builds inverted index of UCD (Unicode Character Database).
"""
import pathlib
from urllib import request
import collections
import functools
import operator
import pickle
# Where the Unicode Character Database flat file is fetched from by
# download_ucd() when no local copy exists.
UCD_URL = 'http://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt'
# Last path component of the URL ('UnicodeData.txt'); used as the relative
# file name of the local copy that download_ucd() writes and read_ucd() reads.
UCD_NAME = pathlib.Path(UCD_URL).name
# Relative file name of the pickled (word_idx, char_idx) pair written by
# build_index() and loaded by read_index().
INDEX_NAME = 'charfinder_index.pickle'
def download_ucd():
    """Fetch the UCD data file, keep a local copy, and return its text.

    The body of ``UCD_URL`` is decoded as ASCII, written to ``UCD_NAME``
    (a relative path, so it lands in the current working directory) and
    returned as a single string.  A progress message is printed first.

    Raises:
        UnicodeDecodeError: if the downloaded bytes are not pure ASCII.
        urllib.error.URLError: propagated from ``request.urlopen`` when
            the download fails.
    """
    print('downloading {}...'.format(UCD_NAME))
    with request.urlopen(UCD_URL) as response:
        payload = response.read()
    text = payload.decode('ascii')
    # Cache the file locally so read_ucd() can skip the download next time.
    pathlib.Path(UCD_NAME).write_text(text, encoding='ascii')
    return text
def read_ucd():
    """Return a generator over the meaningful lines of the UCD data file.

    The text comes from the local file ``UCD_NAME`` when it exists and is
    downloaded with ``download_ucd()`` otherwise.  Lines that are blank
    (whitespace only) or that start with ``#`` are skipped; every other
    line is yielded unchanged, without its trailing newline.
    """
    local_copy = pathlib.Path(UCD_NAME)
    if local_copy.exists():
        text = local_copy.read_text(encoding='ascii')
    else:
        text = download_ucd()

    def records():
        for row in text.split('\n'):
            if not row.strip():
                continue  # blank line
            if row.startswith('#'):
                continue  # comment line
            yield row

    return records()
def parse(ucd_line):
    """Split one ``;``-separated UCD record into character, name and words.

    Field 0 is the code point in hexadecimal, field 1 the character name
    and field 10 an optional older name.  Names are broken into words on
    whitespace, with hyphens treated as spaces.

    When the older name contributes at least one word that the current
    name lacks, it is appended to the name after ``' | '`` and its words
    are merged into the word set.

    Returns:
        A ``(char, name, name_set)`` tuple: the character itself, its
        (possibly extended) name, and the set of words found in it.

    Raises:
        ValueError: if field 0 is not a valid hexadecimal code point.
        IndexError: if the record has fewer than 11 fields.
    """
    def words_of(text):
        return set(text.replace('-', ' ').split())

    fields = ucd_line.split(';')
    char = chr(int(fields[0], 16))
    name = fields[1]
    name_set = words_of(name)
    old_name = fields[10]
    if old_name:
        old_words = words_of(old_name)
        # Only worth recording when the old name adds new search words.
        if not old_words <= name_set:
            name = name + ' | ' + old_name
            name_set = name_set | old_words
    return char, name, name_set
def build_index():
    """Build both indexes from the UCD data, pickle them, and return them.

    Every record from ``read_ucd()`` is run through ``parse()``.  Records
    whose name starts with ``<`` are left out.  For the rest, the
    character is added to the entry of each word in its name, and its
    name is recorded under the character itself.

    The resulting pair is written to ``INDEX_NAME`` with ``pickle`` so
    that ``read_index()`` can load it instead of rebuilding.

    Returns:
        ``(word_idx, char_idx)``: a ``defaultdict`` mapping each word to
        the set of characters whose name contains it, and a dict mapping
        each character to its name.
    """
    word_idx = collections.defaultdict(set)
    char_idx = {}
    for char, name, name_set in map(parse, read_ucd()):
        if not name.startswith('<'):
            char_idx[char] = name
            for word in name_set:
                word_idx[word].add(char)
    index = (word_idx, char_idx)
    with open(INDEX_NAME, 'wb') as index_file:
        pickle.dump(index, index_file)
    return word_idx, char_idx
def read_index():
    """Return ``(word_idx, char_idx)``, loading the pickled copy if present.

    When the file ``INDEX_NAME`` exists it is unpickled and its two
    elements are returned; otherwise the indexes are produced by
    ``build_index()``, which also writes that file.
    """
    if not pathlib.Path(INDEX_NAME).exists():
        return build_index()
    # NOTE(review): pickle.load can execute code embedded in the file; this
    # presumes INDEX_NAME was written by build_index() -- confirm that the
    # working directory is trusted.
    with open(INDEX_NAME, 'rb') as index_file:
        word_idx, char_idx = pickle.load(index_file)
    return word_idx, char_idx
def search(word_idx, char_idx, words):
    """Yield ``(char, name)`` for every character matching all of ``words``.

    Args:
        word_idx: mapping from an upper-case word to the collection of
            characters whose name contains that word.
        char_idx: mapping from a character to its name.
        words: iterable of query words; each is upper-cased before lookup.
            May be a list or a one-shot iterator.

    Yields:
        ``(char, char_idx[char])`` pairs in sorted character order, for the
        characters present in the index entry of every query word.
        Nothing is yielded when ``words`` is empty or when any word is
        absent from ``word_idx``.

    The index is never modified: unknown words are looked up with
    ``.get()``, so a ``defaultdict`` index does not grow an empty entry per
    miss and a plain ``dict`` index does not raise ``KeyError``.
    """
    if not words:
        return
    # Materialize the query: a generator is always truthy, so emptiness can
    # only be checked reliably once the words are in a list.
    query = [word.upper() for word in words]
    if not query:
        return
    postings = [word_idx.get(word, ()) for word in query]
    # Copy the first entry so the intersection never touches the index's sets.
    found = set(postings[0]).intersection(*postings[1:])
    for char in sorted(found):
        yield char, char_idx[char]
def main():
    """Command-line entry point: print the characters matching all words.

    The query words are the command-line arguments after the script name.
    Each match is printed on its own line as the character followed by its
    name.

    When no word is given, a usage message is written to standard error and
    the process exits with status 2, so that shells and calling scripts can
    tell the invocation failed.
    """
    import sys  # local import: nothing else in this module needs sys

    query = sys.argv[1:]
    if not query:
        print('usage: {} <word1> <word2> ...'.format(sys.argv[0]),
              file=sys.stderr)
        sys.exit(2)
    word_idx, char_idx = read_index()
    for char, name in search(word_idx, char_idx, query):
        print(char, name)


if __name__ == '__main__':
    main()