# NER_Plugin_for_ttw.py
import time
import subprocess
import os

from bs4 import BeautifulSoup

# Make sure that the basic functions are not being affected by this plugin in case of missing libraries.
# A quick check of the environment before running is done by the basic_NER_lib_check() function in the main file.
try:
    import json
    import requests
    from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
except ImportError:
    pass
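# For orientation, a minimal sketch of what such an environment check could look
# like (the real basic_NER_lib_check() lives in the main file; this version is
# only an assumption for illustration):
#
#   def basic_NER_lib_check():
#       try:
#           import requests, transformers  # noqa: F401
#           return True
#       except ImportError:
#           return False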
class log_NER_Class:
    def __init__(self, files):
        self.files = files
        self.actualTime = time.localtime()
        self.year, self.month, self.day = self.actualTime[0:3]
        self.hour, self.minute, self.second = self.actualTime[3:6]
        self.NERresultPath = self.files.projectPath + "NER_results/"
        if not os.path.exists(self.NERresultPath):
            os.makedirs(self.NERresultPath)
        self.logCollector = []
        self.logCollector.append(f"Log file: {self.year:4d}-{self.month:02d}-{self.day:02d}_{self.hour:02d}:{self.minute:02d}:{self.second:02d}\n")

    def add_to_log(self, logInput):
        self.logCollector.append(logInput)

    def save_log(self):
        with open(self.NERresultPath + "01_log.txt", 'w', encoding="utf8") as fp:
            for logEntry in self.logCollector:
                fp.write(logEntry)

    def save_results(self, resultJSONList, resultForCSVList):
        self.save_log()
        # Now the .csv list
        intro = ("Place name|Suggested ID\nNote|\"(*NOT LIKELY*)\" means "
                 "that the entity is not of type \"archaeological-site\", "
                 "\"archaeological-area\" or \"populated-place\".\n")
        resultForCSVList_sorted = sorted(resultForCSVList)
        with open(self.NERresultPath + "02_Gazetteer_IDs_DRAFT.csv", 'w', encoding="utf8") as fp:
            fp.write(intro + "\n")
            for item in resultForCSVList_sorted:
                fp.write(item + "\n")
        # Now the complete .json file
        with open(self.NERresultPath + "03_Gazetteer_result_detailed.json", 'w', encoding="utf8") as fp:
            resultJSON = json.dumps(resultJSONList,
                                    indent=4, sort_keys=False,
                                    separators=(',', ': '), ensure_ascii=False)
            fp.write(resultJSON)
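# Usage note (derived from the code above): given a `files` object with a valid
# projectPath (see the test classes at the bottom of this file), the logger
# writes three files into <projectPath>/NER_results/:
#   01_log.txt                        - full processing log
#   02_Gazetteer_IDs_DRAFT.csv        - pipe-separated draft mapping "Place name|Suggested ID"
#   03_Gazetteer_result_detailed.json - complete iDAI.gazetteer responses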
def call_gazetteer(results, logGenerator):
    listGazetteer = []
    listForCSV = []
    logGenerator.add_to_log("\n3. iDAI.gazetteer query result")
    for result in results:
        toBeRun = filter_NER_results(result)  # Decides whether the entry will be queried or not
        if toBeRun:
            # Most simple way to call the gazetteer, only for testing purposes; more elaborate filters to follow.
            # See also the README.md file at https://github.com/pBxr/NER_Plugin_for_ttw on this point.
            toSearch = "https://gazetteer.dainst.org/search.json?q=" + result
            response = requests.get(toSearch)
            resultListComplete = response.json()
            logGenerator.add_to_log(f"\n--------------------------------------------------------------\nSearching in iDAI.gazetteer for \"{result}\"\n")
            logGenerator.add_to_log(f"Number of results: {resultListComplete['total']}\n")
            resultList = resultListComplete['result']
            i = 1
            for item in resultList:
                if item['prefName']['title']:
                    logGenerator.add_to_log(f"Nr. {i}: Preferred Name: {item['prefName']['title']}\n")
                if "types" in item:
                    logGenerator.add_to_log("Type: ")
                    for entry in item['types']:
                        logGenerator.add_to_log(entry + ", ")
                    logGenerator.add_to_log("\n")
                if "@id" in item:
                    logGenerator.add_to_log(item['@id'])
                if item['prefName']['title'] and "@id" in item:
                    if ("types" in item) and ('archaeological-area' in item['types']
                                              or 'populated-place' in item['types']
                                              or 'archaeological-site' in item['types']):
                        csvRow = result + "|" + item['@id']
                        logGenerator.add_to_log("\n")
                        listForCSV.append(csvRow)
                    else:
                        result2 = result + "(*NOT LIKELY*)"
                        csvRow = result2 + "|" + item['@id']
                        logGenerator.add_to_log("\n")
                        listForCSV.append(csvRow)  # To save only the needed entries
                i += 1
            logGenerator.add_to_log("\n--------------------------------------------------------------\n")
            listGazetteer.append(resultListComplete)  # To save the complete result
    return listGazetteer, listForCSV
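# For orientation, the fields this function relies on in the search.json
# response (shape inferred from the code above, not a full API specification):
#
#   {
#       "total": 42,
#       "result": [
#           {
#               "@id": "https://gazetteer.dainst.org/place/...",
#               "prefName": {"title": "..."},
#               "types": ["populated-place"]
#           },
#           ...
#       ]
#   }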
def filter_NER_results(result):
    """
    This is only a simple placeholder for a more elaborate function.
    """
    return len(result) > 3
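# A minimal sketch of what a more elaborate filter could look like; the rules
# and the stop word list below are illustrative assumptions, not part of the plugin:
def filter_NER_results_sketch(result, stopWords=("The", "And")):
    if len(result) <= 3:  # Very short strings are mostly tokenizer noise
        return False
    if not result[0].isupper():  # Place names are expected to be capitalized
        return False
    if result in stopWords:  # Skip tokens known to be false positives
        return False
    return True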
def prepare_folder_and_input_text(files, settings):
    # Prepare folder
    pathNERresults = files.projectPath + "NER_results"
    if not os.path.exists(pathNERresults):
        os.makedirs(pathNERresults)
    # Convert text to the selected input format
    if settings.NER_SettingsSet['Source'] == 'Convert .docx to .txt and get text':
        pandocParameter = "00_Plain_article_text.txt"
    else:
        pandocParameter = "00_Plain_article_text.html"
    # Put together the pandoc call to convert the .docx file into the selected format and save it.
    # Passing the arguments as a list (instead of one manually quoted string) works with
    # shell=False on every platform and handles paths containing spaces.
    plainTextPath = os.path.join(pathNERresults, pandocParameter)
    pandocCall = ["pandoc", "-o", plainTextPath, files.projectPath + files.fileName]
    subprocess.run(pandocCall, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, shell=False)
    # Return the plain text for the pipeline. If a structured format like .html is selected, text gets extracted with bs4.
    if settings.NER_SettingsSet['Source'] == 'Convert .docx to .txt and get text':
        with open(plainTextPath, 'r', encoding="utf8") as fp:
            inputText = fp.read()
        return inputText
    else:
        with open(plainTextPath, 'r', encoding="utf8") as fp:
            soup = BeautifulSoup(fp, "html.parser")
            text = soup.get_text()
        # Remove blank lines
        inputText = str(text).replace('\n\n', '')
        return inputText
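# The subprocess call above is equivalent to running pandoc by hand, e.g.:
#   pandoc -o "C:\project\NER_results\00_Plain_article_text.txt" "C:\project\article.docx"
# (paths are placeholders; pandoc infers both formats from the file extensions)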
def return_location_names(nerResults, settings, logGenerator):
    logGenerator.add_to_log("\n1. NER result:\n")
    for result in nerResults:
        logGenerator.add_to_log(str(result) + "\n")
    # Extract names using the B/I span: walk backwards, collect I-LOC pieces and
    # close the name when its B-LOC beginning is reached
    listNames = []
    toInsert = ""
    for nerResult in reversed(nerResults):
        # The threshold applies only to the beginning of a location name
        if nerResult['entity'] == "B-LOC" and nerResult['score'] > settings.NER_Threshold:
            toInsert = nerResult['word'] + toInsert
            listNames.append(toInsert)
            toInsert = ""
        if nerResult['entity'] == "I-LOC":
            if "##" in nerResult['word']:
                cleaned = nerResult['word'].replace("##", "")
                toInsert = cleaned + toInsert
            else:
                toInsert = " " + nerResult['word'] + toInsert
    # Repair wrong breaks
    toInsert = ""
    substring = ""
    listNamesFixed = []
    for nerResult in listNames:
        if "##" in nerResult:
            substring = nerResult.replace("##", "")
        else:
            toInsert = nerResult + substring
            listNamesFixed.append(toInsert)
            toInsert = ""
            substring = ""
    listNamesFixed.sort()
    # Delete duplicates
    locationNames = list(set(listNamesFixed))
    locationNames.sort()
    logGenerator.add_to_log("\n2. Extracted entities de-tokenized\n")
    for entry in locationNames:
        logGenerator.add_to_log(entry + ", ")
    logGenerator.add_to_log("\n")
    return locationNames
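# Worked example (scores omitted, the wordpiece split is illustrative): the
# pipeline may return a name such as "Pergamon" as three tokens,
#   {'entity': 'B-LOC', 'word': 'Per'}, {'entity': 'I-LOC', 'word': '##gam'},
#   {'entity': 'I-LOC', 'word': '##on'}.
# Walking backwards, "##" pieces are glued on without a space and plain I-LOC
# tokens with a leading space, so the tokens above collapse into "Pergamon",
# while e.g. B-LOC 'New' + I-LOC 'York' becomes "New York".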
def run_NER_process(files, settings):
    logGenerator = log_NER_Class(files)
    inputText = prepare_folder_and_input_text(files, settings)
    selectedModel = settings.NER_SettingsSet['Model']
    try:
        # Now run NER
        tokenizer = AutoTokenizer.from_pretrained(selectedModel)
        model = AutoModelForTokenClassification.from_pretrained(selectedModel)
        nlp = pipeline("ner", model=model, tokenizer=tokenizer)
        nerResults = nlp(inputText)
        # Now extract names, get iDAI.gazetteer entries and save log and results
        extractedLocationNames = return_location_names(nerResults, settings, logGenerator)
        resultJSONList, resultForCSVList = call_gazetteer(extractedLocationNames, logGenerator)
        logGenerator.save_log()
        logGenerator.save_results(resultJSONList, resultForCSVList)
    except Exception:
        textInfo = ("Some unexpected problem occurred while starting the NER pipeline.\n\n"
                    "Check your environment.\n\n"
                    "See \"About\" -> \"Help\" for instructions.\n\n\n")
        logGenerator.add_to_log(textInfo)
        logGenerator.save_log()
        return False, textInfo
    else:
        textInfo = "Process finished. Check results."
        return True, textInfo
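# Usage sketch: in the real setup the caller (`ttw`) can use the returned tuple
# to show a status message, e.g.:
#
#   success, textInfo = run_NER_process(files, settings)
#   if not success:
#       print(textInfo)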
#-------------------------------------------------------------------------------------------
"""
The following two classes and their instances simulate the input from `ttw` into this plugin.
The idea is that the plugin gets called with
- the project path,
- the file name of the article,
- and the settings (selected model, entity type, method of text extraction and threshold).
For testing purposes the settings are hard coded here.
An instance of each class containing the settings will be passed as an argument to run_NER_process(files, settings).
"""
class files_class:
    def __init__(self):
        self.projectPath = "C:\\#enter_your_project_path_here#\\"
        self.fileName = "#enter_your_file_name_here#"

class settings_class:
    def __init__(self):
        self.NER_SettingsSet = {'Model': 'dslim/bert-base-NER',
                                'Entity Type': 'Place Name',
                                'Source': 'Convert .docx to .txt and get text'
                                }
        self.NER_Threshold = 0.5  # Only scores > 0.50 will be taken into account (for B-LOC)

files = files_class()
settings = settings_class()

#-------------------------------------------------------------------------------------------
if __name__ == '__main__':
    # This is the way the plugin will be called by `ttw`, including the args
    run_NER_process(files, settings)