# parses sentence-per-line text files with Stanford CoreNLP and extracts entity grids
# for the original sentence order and for each precomputed permutation
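# Usage (as inferred from the code below): python extract_entity_grid_perm.py <corpus>
# Assumes a CoreNLP server is listening on localhost:9000 and that data/<corpus>/text_permute/
# contains <id>_sent.txt (original order) and <id>.perm-<k>.txt (permuted orders, k = 1..20).
# Parses are written to parsed_permute/ and entity grids to grid_permute/.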
from pycorenlp import StanfordCoreNLP
import os, json, sys
nlp = StanfordCoreNLP('http://localhost:9000')
corpus = sys.argv[1]
in_dir = 'data/' + corpus + '/'
if not os.path.exists(in_dir + 'parsed_permute/'):
    os.makedirs(in_dir + 'parsed_permute/')
if not os.path.exists(in_dir + 'grid_permute/'):
    os.makedirs(in_dir + 'grid_permute/')
def update_noun_types(dep_type, np_words, curr_nouns_type):
    # grid labels: "s" = subject, "o" = object, "x" = other mention;
    # an "s"/"o" from a dependency overrides an earlier "x" or "o" but never an existing "s"
    for word in np_words:
        if word not in curr_nouns_type:
            curr_nouns_type[word] = dep_type
        if curr_nouns_type[word] == "x" or curr_nouns_type[word] == "o":
            curr_nouns_type[word] = dep_type
    return curr_nouns_type
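# Illustrative example: if "report" was first recorded as a plain mention ("x") and later heads an
# nsubj dependency in the same sentence, update_noun_types upgrades it to "s"; a subsequent dobj
# ("o") would not downgrade that "s".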
def get_np(dependency, const_parse):
    target_id = dependency['dependent']
    index = 0
    nouns = []
    for line in const_parse.splitlines():
        if ")" not in line:
            continue
        tokens = line.strip().split(") (")
        num_tokens = len(tokens)  # rough count of leaf tokens on this line of the parse
        index += num_tokens
        if target_id <= index and tokens[0].startswith("(NP"):
            for token in tokens:
                if token.startswith("(NP"):
                    token = token[3:].strip()  # remove phrase label
                while token.startswith("("):
                    token = token[1:]
                while token.endswith(")"):
                    token = token[:-1].strip()
                word = token.split(None, 1)[1]  # remove POS tag
                if token.startswith("NN"):
                    nouns.append(word.lower())
                elif token.startswith("PRP "):
                    nouns.append(word.lower())
                elif token.startswith("DT") and len(tokens) == 1:
                    # noun phrase consisting of a single DT word (e.g. "this", "all")
                    nouns.append(word.lower())
            break
    return nouns
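# Illustrative example: for a parse line "(NP (DT The) (NN report))" whose tokens cover the
# dependent's index, get_np returns ["report"]; "The" is skipped because the NP contains more
# than one token, and lines that are not NPs contribute nothing.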
# read all text files, parse, and extract entity grids
for filename in os.listdir(in_dir + "text_permute/"):
    if not filename.endswith("_sent.txt"):
        continue  # original (unpermuted) files only
    with open(in_dir + "text_permute/" + filename, 'r') as in_file:
        # process the original sentence-order file
        nouns_list = []
        nouns_dict = {}
        sent_annotations = []
        text_id = filename.rsplit("_", 1)[0]
        const_out_filename = in_dir + "parsed_permute/" + text_id + ".0.const_parse"
        dep_out_filename = in_dir + "parsed_permute/" + text_id + ".0.dep_parse"
        grid_out_filename = in_dir + "grid_permute/" + text_id + ".0.grid"
        if os.path.exists(const_out_filename) and os.path.exists(dep_out_filename) and os.path.exists(
                grid_out_filename):
            continue  # already processed
        const_out = open(const_out_filename, "w")
        const_lines = {}
        dep_out = open(dep_out_filename, "w")
        dep_lines = {}
        grid_out = open(grid_out_filename, "w")
        grid_lines = {}
        for line in in_file:  # sentences in original order
            line = line.strip()
            const_lines[line] = []
            dep_lines[line] = []
            grid_lines[line] = []
            if line == "":  # not sure if this ever fires (I might have removed line breaks in these files -- for entity grid only)
                const_out.write("\n\n")
                dep_out.write("\n\n")
                continue
            output = nlp.annotate(line, properties={
                'annotators': 'tokenize,ssplit,pos,depparse,parse',
                'outputFormat': 'json'
            })
            for sent in output['sentences']:
                const_out.write(sent['parse'] + "\n")
                const_lines[line].append(sent['parse'])
                json.dump(sent['basicDependencies'], dep_out)
                dep_out.write("\n")
                dep_lines[line].append(sent['basicDependencies'])
                curr_nouns_type = {}
                # every noun or pronoun starts as "x" (mentioned, role unknown)
                for token in sent['tokens']:
                    if token['pos'].startswith("NN") or token['pos'] == 'PRP':
                        token_str = token['word'].lower()
                        curr_nouns_type[token_str] = "x"
                        if token_str not in nouns_dict:
                            nouns_list.append(token_str)
                            nouns_dict[token_str] = 0
                        nouns_dict[token_str] += 1
                # upgrade to "s" (subject) or "o" (object) based on the dependency parse
                for dep in sent['basicDependencies']:
                    dep_type = ""
                    if dep['dep'] == 'nsubj' or dep['dep'] == 'nsubjpass':
                        dep_type = "s"
                    elif dep['dep'] == 'dobj':
                        dep_type = "o"
                    if dep_type != "":
                        np = get_np(dep, sent['parse'])
                        curr_nouns_type = update_noun_types(dep_type, np, curr_nouns_type)
                sent_annotations.append(curr_nouns_type)
                grid_lines[line].append(curr_nouns_type)
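        # grid row format (one row per noun): the noun, one label per sentence in order
        # ("s", "o", "x", or "-" if the noun is absent from that sentence), then its total frequency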
        for noun in nouns_list:
            grid_out.write(noun + " ")
            for sent_ann in sent_annotations:
                if noun in sent_ann:
                    grid_out.write(sent_ann[noun] + " ")
                else:
                    grid_out.write("- ")
            grid_out.write(str(nouns_dict[noun]) + "\n")  # frequency for salience feature
        grid_out.close()
        const_out.close()
        dep_out.close()
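    # for each precomputed permutation of this text, reuse the cached per-sentence parses and
    # grid labels from the pass above instead of re-annotating with CoreNLP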
    for i in range(1, 21):
        filename_perm = text_id + ".perm-" + str(i)
        if not os.path.exists(in_dir + "text_permute/" + filename_perm + ".txt"):
            continue
        const_out = open(in_dir + "parsed_permute/" + filename_perm + ".const_parse", "w")
        dep_out = open(in_dir + "parsed_permute/" + filename_perm + ".dep_parse", "w")
        grid_out = open(in_dir + "grid_permute/" + filename_perm + ".grid", "w")
        sent_annotations = []
        with open(in_dir + "text_permute/" + filename_perm + ".txt", "r") as in_file:
            for line in in_file:
                line = line.strip()
                for parse in const_lines[line]:
                    const_out.write(parse + "\n")
                for parse in dep_lines[line]:
                    json.dump(parse, dep_out)
                    dep_out.write("\n")
                for grid_line in grid_lines[line]:
                    sent_annotations.append(grid_line)
        for noun in nouns_list:
            grid_out.write(noun + " ")
            for sent_ann in sent_annotations:
                if noun in sent_ann:
                    grid_out.write(sent_ann[noun] + " ")
                else:
                    grid_out.write("- ")
            grid_out.write(str(nouns_dict[noun]) + "\n")  # frequency for salience feature
        grid_out.close()
        const_out.close()
        dep_out.close()