-
Notifications
You must be signed in to change notification settings - Fork 17
/
json2conllulex.py
executable file
·120 lines (102 loc) · 4.08 KB
/
json2conllulex.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
#!/usr/bin/env python3
"""
Given a file in the STREUSLE JSON format, convert it to the .conllulex format.
Relies on sentence IDs being in the format DOCID-SENTNUM, where SENTNUM contains no hyphens.
Args: inputfile
@since: 2019-06-22
@author: Nathan Schneider (@nschneid)
"""
import os, sys, fileinput, re, json, csv
from collections import defaultdict
from itertools import chain
# The ten CoNLL-U columns, in order (column numbers 1-10).
CONLLU = (
    'ID',      # 1
    'FORM',    # 2
    'LEMMA',   # 3
    'UPOS',    # 4
    'XPOS',    # 5
    'FEATS',   # 6
    'HEAD',    # 7
    'DEPREL',  # 8
    'DEPS',    # 9
    'MISC',    # 10
)
# The nine STREUSLE columns that follow them (column numbers 11-19).
STREUSLE = (
    'SMWE',      # 11
    'LEXCAT',    # 12
    'LEXLEMMA',  # 13
    'SS',        # 14
    'SS2',       # 15
    'WMWE',      # 16
    'WCAT',      # 17
    'WLEMMA',    # 18
    'LEXTAG',    # 19
)
# Every column of a .conllulex row, in order.
FIELDS = (*CONLLU, *STREUSLE)
# A token's JSON key is normally the lower-cased column name;
# these columns are the ones stored under a different key.
CONLLU_TO_JSON_FIELDS = dict(ID='#', FORM='word', DEPS='edeps')
def _merged_tokens(sent):
    """Return a new list holding the sentence's `toks` with its `etoks` spliced in.

    Each entry of ``sent["etoks"]`` carries a three-item ``"#"`` value
    ``(part1, part2, id_string)``.  A copy of the entry, with ``"#"`` reduced
    to ``id_string``, is inserted at list index ``part1 - 1`` when the ID
    contains a hyphen and at index ``part1`` otherwise.
    Neither ``sent["toks"]`` nor the etok dicts themselves are modified.
    """
    toks = list(sent["toks"])
    # Walk backwards so that an insertion never shifts the index needed by
    # an entry that is still to be placed.
    for etok in reversed(sent["etoks"]):
        part1, _part2, id_str = etok["#"]
        spliced = dict(etok)  # shallow copy: only the "#" key is replaced
        spliced["#"] = id_str
        # presumably token number k sits at index k-1 of `toks`, so this puts
        # a hyphenated ID just before token part1 and any other ID just
        # after it -- verify against the producer of the JSON
        toks.insert(part1 - 1 if '-' in id_str else part1, spliced)
    return toks


def _token_row(tok, sent):
    """Build the list of 19 column strings for one token of `sent`."""
    tok_id = tok["#"]
    # A string ID marks a spliced-in token: '.' => ellipsis, '-' => multiword token.
    isEllipsis = isMWT = False
    if isinstance(tok_id, str):
        if '.' in tok_id:
            isEllipsis = True
        elif '-' in tok_id:
            isMWT = True

    # Columns 1-10: the CoNLL-U fields.
    row = []
    for fld in CONLLU:
        v = tok[CONLLU_TO_JSON_FIELDS.get(fld, fld.lower())]
        if not v and v != 0:  # empty value; 0 is kept
            assert isEllipsis or isMWT or fld in ('FEATS', 'MISC'), (fld, v)
            v = '_'
        row.append(str(v))

    if isEllipsis or isMWT:
        # Such tokens have no lexical semantic info: blank out columns 11-19.
        row.extend(['_'] * 9)
        return row

    # SMWE
    if tok["smwe"]:
        mweNum, position = tok["smwe"]
        row.append(f'{mweNum}:{position}')  # e.g. 2:1
        # Only the first token of the expression carries its properties;
        # later tokens leave them blank.
        lexe = sent["smwes"][str(mweNum)] if position == 1 else None
    else:
        row.append('_')
        lexe = sent["swes"][str(tok_id)]

    # Properties of the (strong) lexical expression: LEXCAT, LEXLEMMA, SS, SS2
    if lexe:
        assert lexe["lexcat"] and lexe["lexlemma"]
        row.extend([lexe["lexcat"], lexe["lexlemma"]])
        row.append(lexe["ss"] or '_')
        row.append(lexe["ss2"] or '_')
    else:
        row.extend(['_'] * 4)

    # WMWE, WCAT, WLEMMA
    if tok["wmwe"]:
        mweNum, position = tok["wmwe"]
        row.append(f'{mweNum}:{position}')
        if position == 1:
            wmwe = sent["wmwes"][str(mweNum)]
            assert wmwe["lexlemma"]
            row.extend([wmwe["lexcat"] or '_', wmwe["lexlemma"]])
        else:
            row.extend(['_', '_'])
    else:
        row.extend(['_', '_', '_'])

    # LEXTAG
    assert tok["lextag"]
    row.append(tok["lextag"])
    return row


def build_conllulex(sents):
    """Serialize sentences in the STREUSLE JSON structure as .conllulex text.

    For every sentence this emits the comment lines ``# sent_id``, ``# text``,
    ``# streusle_sent_id`` and ``# mwe`` (preceded by ``# newdoc id`` whenever
    the document part of the sentence ID changes), then one tab-separated row
    per token, then a blank line.

    The input is left untouched, so the function may be called repeatedly on
    the same data.

    Args:
        sents: iterable of sentence dicts with the keys ``sent_id``
            (``DOCID-SENTNUM``), ``text``, ``streusle_sent_id``, ``mwe``,
            ``toks``, ``etoks``, ``swes``, ``smwes`` and ``wmwes``.

    Returns:
        The complete .conllulex text as one string ('' for no sentences).

    Raises:
        AssertionError: a regular token has an empty required column, or a
            lexical expression lacks its lexcat/lexlemma, or a token lacks
            its lextag.
        KeyError: an expected key is missing.
        ValueError: a ``sent_id`` contains no hyphen.
    """
    pieces = []  # joined once at the end rather than growing a string
    curDocId = None
    for sent in sents:
        # headers
        sent_id = sent["sent_id"]
        doc_id, _sent_num = sent_id.rsplit('-', 1)
        if doc_id != curDocId:
            pieces.append(f'# newdoc id = {doc_id}\n')
            curDocId = doc_id
        pieces.append(f'# sent_id = {sent_id}\n')
        pieces.append(f'# text = {sent["text"]}\n')
        pieces.append(f'# streusle_sent_id = {sent["streusle_sent_id"]}\n')
        pieces.append(f'# mwe = {sent["mwe"]}\n')

        # body
        for tok in _merged_tokens(sent):
            pieces.append('\t'.join(_token_row(tok, sent)) + '\n')
        pieces.append('\n')
    return ''.join(pieces)
if __name__ == '__main__':
    # Exactly one command-line argument is accepted: the input JSON file.
    # Any other argument count makes this unpacking raise ValueError.
    (in_path,) = sys.argv[1:]
    with open(in_path, encoding='utf-8') as in_file:
        sentences = json.load(in_file)
    # The converted text supplies its own line endings, so print adds none.
    print(build_conllulex(sentences), end='')