#!/usr/bin/env python3
"""
Incorporate edits to annotations in the human-readable inline rendered format
(output by streusvis.py) and produce the modified corpus as JSON.
Usage:

  ./streusvis.py --sent-ids --lexcats --colorless streusle.conllulex > updates.tsv

  [manually edit annotations in updates.tsv]

  ./supdate.py streusle.conllulex updates.tsv > streusle.new.json
updates.tsv must contain 2 tab-separated columns: sentence IDs and rendered sentences.
The rendered sentence may be split across multiple consecutive lines,
with the sentence ID specified only in the first of these.
The sentences in updates.tsv will be compared against the ones in streusle.conllulex,
and only the ones with modified annotations will be processed (so including
unmodified sentences in updates.tsv is optional).
This script will not add or delete sentences from the corpus, or alter
their tokenization or syntactic parse.
@author: Nathan Schneider (nschneid)
@since: 2019-09-16
"""
import sys, re
from conllulex2json import load_sents, print_json
from conllulex2UDlextag import simplify_to_UDlextag
from UDlextag2json import load_sents as load_UDlextag_sents
from mwerender import render, render_sent, unrender
conllulexFP, updatesFP = sys.argv[1:]
"""
ALGORITHM OVERVIEW
For any sentence where the lextags have changed:
a) Convert the JSON for the sentence to conllulex.
b) Strip out the lexical semantic analyses.
c) Parse (unrender) the human-readable string with MWEs, lextags, and supersenses
into a tagging.
c) Substitute the modified lextags in the last column.
d) Convert the modified UDlextag to JSON.
e) Re-render the sentence to make sure it matches what the user specified.
"""
"""
1. Load the streusvis.py-created file with potential updates to be made.
It must contain 2 tab-separated columns: sentence IDs and rendered sentences.
The rendered sentence may be split across multiple consecutive lines,
with the sentence ID specified only in the first of these.
"""
updates = {}
with open(updatesFP, encoding='utf-8') as updatesF:
    sentid = None
    for ln in updatesF:
        if not ln.strip():
            sentid = None
            continue
        ln = ln.rstrip()
        s, r = ln.split('\t')
        if s:
            sentid = s
            assert sentid not in updates
            updates[sentid] = r
        else:   # continuation of second column from previous line
            assert sentid
            updates[sentid] += ' ' + r
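# At this point `updates` maps each sentence ID to its full rendered string, with any
# continuation rows rejoined by a single space. Hypothetical shape (not real data):
#
#   updates == {'<sent_id_1>': '<rendered sentence 1>',
#               '<sent_id_2>': '<first part of rendered sentence 2> <continuation>'}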
"""
2. Scan the full corpus .conllulex for sentences with their original annotations.
If there was a change, parse the rendered lexical semantic analysis into tags,
substitute the tags in the UDlextag format, and parse the sentence to JSON in
order to update the fields: 'mwe', 'toks', 'swes', 'smwes', 'wmwes'
('etoks' etc. will be unaffected).
"""
sents = []
with open(conllulexFP, encoding='utf-8') as conllulexF:
    nUpdatedSents = 0
    for sent in load_sents(conllulexF, store_conllulex='toks'):
        sentid = sent['sent_id']
        if sentid in updates:
            # compare rendered strings to see whether there has been a change
            rendered_old = render_sent(sent, lexcats=True, supersenses=True)
            rendered_new = updates[sentid]
            if rendered_old != rendered_new:    # there has been a change
                # parse the new rendered string
                toks = [tok['word'] for tok in sent['toks']]
                tagging = unrender(rendered_new, toks)  # this should fail if tokens have changed
                toks2, bios, lbls = zip(*tagging)
                assert toks == list(toks2), (toks, toks2)   # be super-duper sure tokens haven't changed
                labeled_bio = [bio + ('-' + lbl.replace(':', '|') if lbl else '') for bio, lbl in zip(bios, lbls)]

                # substitute new tagging in UDlextag format
                conllulex = sent['conllulex'].strip().split('\n')
                udlextag = simplify_to_UDlextag(conllulex)
                assert udlextag.count('\n') == len(toks), (udlextag.count('\n'), len(toks), udlextag)
                lines = udlextag.split('\n')
                for i in range(len(labeled_bio)):
                    ln = lines[i]
                    newtag = labeled_bio[i]
                    lines[i] = ln[:ln.rindex('\t')] + '\t' + newtag

                # add sentence ID
                lines.insert(0, f'# sent_id = {sentid}')

                # parse the new CoNLL-U-Lex
                try:
                    newsent = next(load_UDlextag_sents(lines))
                except AssertionError:
                    print('\n'.join(lines), file=sys.stderr)
                    raise

                # update fields
                for fld in ('mwe', 'toks', 'swes', 'smwes', 'wmwes'):
                    sent[fld] = newsent[fld]

                # re-render the sentence as a sanity check
                rendered2 = render_sent(sent, lexcats=True, supersenses=True)
                assert rendered2 == rendered_new

                nUpdatedSents += 1

        del sent['conllulex']
        sents.append(sent)
# output the modified corpus
print_json(sents)
print(f'Reviewed inputs for {len(updates)} sentences and implemented updates to {nUpdatedSents} of them', file=sys.stderr)
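# One possible end-to-end workflow for reviewing an edit pass (file names are
# hypothetical; the first and third commands mirror the usage in the module docstring,
# and the final diff is just one way to inspect what changed):
#
#   ./streusvis.py --sent-ids --lexcats --colorless streusle.conllulex > updates.tsv
#   [edit annotations in updates.tsv]
#   ./supdate.py streusle.conllulex updates.tsv > streusle.new.json
#   diff <(python3 -m json.tool --sort-keys streusle.json) \
#        <(python3 -m json.tool --sort-keys streusle.new.json)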