-
Notifications
You must be signed in to change notification settings - Fork 1
/
__init__.py
136 lines (128 loc) · 8.67 KB
/
__init__.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
from .pipeline import Pipeline
from .tpipeline import TPipeline
from .pipeline import supported_langs, langwithner, remove_with_path
from .utils.base_utils import download, trankit2conllu
from .utils.tbinfo import supported_embeddings, supported_langs, saved_model_version
import os
from shutil import copyfile
__version__ = "1.1.0"
def download_missing_files(category, save_dir, embedding_name, language):
    """Ensure a customized pipeline directory contains all required model files.

    Scans ``save_dir/embedding_name/category`` for the model files the given
    pipeline category needs. Any missing file is borrowed from the pretrained
    model of ``language``: the pretrained files are downloaded, the missing
    ones are copied into the customized directory (renamed after ``category``),
    and the downloaded pretrained directory is removed afterwards.

    Args:
        category: one of 'customized', 'customized-ner', 'customized-mwt',
            'customized-mwt-ner'.
        save_dir: root cache directory holding the models.
        embedding_name: name of a supported embedding (see ``supported_embeddings``).
        language: a supported pretrained language to borrow files from.

    Raises:
        AssertionError: if ``language``, ``embedding_name`` or ``category``
            is not supported.
    """
    assert language in supported_langs, '{} is not a pretrained language. Current pretrained languages: {}'.format(language, supported_langs)
    assert embedding_name in supported_embeddings, '{} has not been supported. Current supported embeddings: {}'.format(embedding_name, supported_embeddings)
    assert category in {'customized', 'customized-ner', 'customized-mwt',
                        'customized-mwt-ner'}, "Pipeline category must be one of the following: 'customized', 'customized-ner', 'customized-mwt', 'customized-mwt-ner'"
    # Derive the required file-name templates from the category instead of
    # duplicating four near-identical hard-coded lists: every pipeline needs
    # tokenizer/tagger/vocabs/lemmatizer, '-mwt' variants add the MWT
    # expander, '-ner' variants add the NER model and vocab.
    templates = ['{}.tokenizer.mdl']
    if 'mwt' in category:
        templates.append('{}_mwt_expander.pt')
    templates += ['{}.tagger.mdl', '{}.vocabs.json', '{}_lemmatizer.pt']
    if 'ner' in category:
        templates += ['{}.ner.mdl', '{}.ner-vocab.json']
    file_list = [
        (tmpl, os.path.join(save_dir, embedding_name, category, tmpl.format(category)))
        for tmpl in templates
    ]
    missing_filenames = []
    for filename, filepath in file_list:
        if not os.path.exists(filepath):
            print('Missing {}'.format(filepath))
            missing_filenames.append(filename)
    download(
        cache_dir=save_dir,
        language=language,
        saved_model_version=saved_model_version,  # manually set this to avoid duplicated storage
        embedding_name=embedding_name
    )
    # borrow pretrained files
    src_dir = os.path.join(save_dir, embedding_name, language)
    tgt_dir = os.path.join(save_dir, embedding_name, category)
    for fname in missing_filenames:
        copyfile(os.path.join(src_dir, fname.format(language)), os.path.join(tgt_dir, fname.format(category)))
        print('Copying {} to {}'.format(
            os.path.join(src_dir, fname.format(language)),
            os.path.join(tgt_dir, fname.format(category))
        ))
    # the pretrained copy is no longer needed once files are borrowed
    remove_with_path(src_dir)
def verify_customized_pipeline(category, save_dir, embedding_name):
    """Verify that a customized pipeline has all required model files.

    Prints each missing file. When every file is present, writes the
    ``<category>.downloaded`` marker file, removes training leftovers
    (``train.txt.character``, ``logs``, ``preds``) and prints usage
    instructions; otherwise prints a warning listing is incomplete.

    Args:
        category: one of 'customized', 'customized-ner', 'customized-mwt',
            'customized-mwt-ner'.
        save_dir: root cache directory holding the models.
        embedding_name: name of a supported embedding (see ``supported_embeddings``).

    Raises:
        AssertionError: if ``embedding_name`` or ``category`` is not supported.
    """
    assert embedding_name in supported_embeddings, '{} has not been supported. Current supported embeddings: {}'.format(
        embedding_name, supported_embeddings)
    assert category in {'customized', 'customized-ner', 'customized-mwt',
                        'customized-mwt-ner'}, "Pipeline category must be one of the following: 'customized', 'customized-ner', 'customized-mwt', 'customized-mwt-ner'"
    # Derive the required files from the category instead of duplicating four
    # hard-coded lists (same scheme as download_missing_files): base files for
    # every pipeline, plus MWT expander for '-mwt' and NER files for '-ner'.
    templates = ['{}.tokenizer.mdl']
    if 'mwt' in category:
        templates.append('{}_mwt_expander.pt')
    templates += ['{}.tagger.mdl', '{}.vocabs.json', '{}_lemmatizer.pt']
    if 'ner' in category:
        templates += ['{}.ner.mdl', '{}.ner-vocab.json']
    file_list = [
        os.path.join(save_dir, embedding_name, category, tmpl.format(category))
        for tmpl in templates
    ]
    verified = True
    for filepath in file_list:
        if not os.path.exists(filepath):
            verified = False
            print('Missing {}'.format(filepath))
    if verified:
        # marker file signalling the pipeline is complete and ready to load
        with open(os.path.join(save_dir, embedding_name, category, '{}.downloaded'.format(category)), 'w') as f:
            f.write('')
        # clean up training artifacts that are not needed at inference time
        remove_with_path(os.path.join(save_dir, embedding_name, category, 'train.txt.character'))
        remove_with_path(os.path.join(save_dir, embedding_name, category, 'logs'))
        remove_with_path(os.path.join(save_dir, embedding_name, category, 'preds'))
        print(
            "Customized pipeline is ready to use!\nIt can be initialized as follows:\n-----------------------------------\nfrom trankit import Pipeline\np = Pipeline(lang='{}', cache_dir='{}')".format(
                category, save_dir))
    else:
        print('Customized pipeline is not ready to use!\nPlease consider the missing files above.')