-
Notifications
You must be signed in to change notification settings - Fork 0
/
textrank.py
39 lines (29 loc) · 1.09 KB
/
textrank.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
import pytextrank
import sys
import json
import re
from importlib import reload
# Re-import the already-loaded ``sys`` module.
# NOTE(review): presumably a leftover of the Python 2 idiom that paired
# ``reload(sys)`` with the ``sys.setdefaultencoding`` call disabled on the
# next line -- confirm whether this reload is still needed.
reload(sys)
#sys.setdefaultencoding('utf-8')
# Intermediate files written and then re-read by text_rank(). The paths are
# relative, so the files are created in the current working directory.
path_stage1 = 'o1.json'  # one line per item produced by pytextrank.parse_doc
path_stage2 = 'o2.json'  # one line per item from pytextrank.normalize_key_phrases
# Captures the remainder of a "TI - " or "AB - " field up to the next carriage
# return. Compiled once at import time instead of on every call.
# NOTE(review): only fields terminated by '\r' are matched, so input that uses
# bare '\n' line endings yields no sections -- confirm callers always send CR
# or CRLF line endings.
_RIS_FIELD_RE = re.compile(r"TI - (.*?)\r|AB - (.*?)\r")


def _extract_ris_text(ris):
    """Return the TI/AB field values found in *ris* joined into one string."""
    sections = []
    for title, abstract in _RIS_FIELD_RE.findall(ris):
        # The pattern is an alternation of two groups, so for every match
        # exactly one of the two captures can be non-empty.
        section = (title or abstract).strip()
        if section:
            # Skip empty fields so they do not leave stray separators behind.
            sections.append(section)
    # Separate sections with '. ' (not a bare '.') so the last word of one
    # section and the first word of the next are not glued together.
    return '. '.join(sections)


def text_rank(json_request):
    """Extract key phrases from the TI/AB fields of a request payload.

    The field text is written to ``ris_extracted.json`` and then pushed
    through the pytextrank stages, which exchange data via the files named
    by the module-level ``path_stage1`` and ``path_stage2``.

    Args:
        json_request: mapping whose ``'ris'`` entry holds the text to scan
            (presumably an RIS export -- verify against the caller).

    Returns:
        A list of the items yielded by ``pytextrank.limit_keyphrases`` for
        the second-stage file, called with ``phrase_limit=20``.

    Raises:
        KeyError: if ``json_request`` has no ``'ris'`` entry.

    NOTE(review): the three intermediate files use fixed names, so
    overlapping calls would overwrite each other's data.
    """
    document = {'id': "0", 'text': _extract_ris_text(json_request['ris'])}
    with open('ris_extracted.json', 'w') as output:
        json.dump(document, output)

    # Stage 1: parse the extracted document and store one record per line.
    with open(path_stage1, 'w') as f:
        for graf in pytextrank.parse_doc(pytextrank.json_iter('ris_extracted.json')):
            f.write("%s\n" % pytextrank.pretty_print(graf._asdict()))

    # Only the ranks are used below; the graph itself is not needed.
    _, ranks = pytextrank.text_rank(path_stage1)

    # Stage 2: store the normalized key phrases, one record per line.
    with open(path_stage2, 'w') as f:
        for rl in pytextrank.normalize_key_phrases(path_stage1, ranks):
            f.write("%s\n" % pytextrank.pretty_print(rl._asdict()))

    return list(pytextrank.limit_keyphrases(path_stage2, phrase_limit=20))