class_conllx.py
"""A simplified version of class_conllx used in conllu_analysis"""
from pathlib import Path
import pandas as pd
# class sentence contains text, sentenceText, and a DataFrame containing token information
class Sentence:
def __init__(self, text='', sentence_tokens='', tree_tokens='',
sentence_comments=None, dependency_tree=None):
self.id = -1
self.text = text
self.sentence_tokens = sentence_tokens
self.tree_tokens = tree_tokens
if sentence_comments is None:
sentence_comments = []
self.sentence_comments = sentence_comments
if dependency_tree is None:
dependency_tree = []
self.dependency_tree = dependency_tree
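
# Illustrative note (the sentence below is made up, not from a real corpus):
# when Conllx.conllx_to_sentence_list (defined further down) reads a block such as
#     # text = The cat sleeps
#     # treeTokens = The cat sleeps
#     1	The	the	DET	...
# the '# text' value fills Sentence.text, '# treeTokens' (or '# sentenceText')
# fills Sentence.tree_tokens, any other '#' comment is appended to
# sentence_comments, and the token rows become the dependency_tree DataFrame.
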

class Conllx:
    """
    Contains methods responsible for reading/writing conllx files,
    as well as converting between conllx and DataFrame formats.
    Conllx can also add catibex to the conllx file and
    run a parse with maltparser.
    """
    def __init__(self, file_name='', file_path=None, file_data=None):
        if file_data is None:
            file_data = []
        if file_path is None:
            file_path = 'data'
        self.file_data = file_data
        self.file_path = file_path
        self.file_name = file_name

    @staticmethod
    def get_conllx_header():
        return ['ID', 'FORM', 'LEMMA', 'UPOS', 'XPOS',
                'FEATS', 'HEAD', 'DEPREL', 'DEPS', 'MISC']

    def __len__(self):
        return len(self.file_data)

    def read_file(self):
        """
        Reads a file. Removes extra whitespace, newlines, and carriage returns
        from the ends of lines.
        """
        data = []
        # sometimes a character that is not valid in utf-8 is used; in that case use this line instead:
        # with open(Path(self.file_path) / self.file_name, encoding='latin-1') as file:
        with open(Path(self.file_path) / self.file_name, encoding='utf-8-sig') as file:
            data = file.readlines()
        data = [x.rstrip() for x in data]
        # blank separator lines become '' after rstrip, so they are kept and restored as '\n'
        data = [x + '\n' for x in data if x != '\n']
        self.file_data = data

    def conllx_to_sentence_list(self, columns=None):
        """
        Converts the conllx file to a list of Sentence objects.
        A Sentence object contains text and sentenceText variables,
        as well as a DataFrame of the dependency tree.
        """
        assert self.file_data != [], "conllx file is empty!"
        if not columns:  # if columns are not provided (i.e. Conll)
            columns = Conllx.get_conllx_header()
        data = self.file_data
        sentence_list = []
        i = 0
        while i < len(data):
            temp_sentence = Sentence()
            temp_sentence.id = len(sentence_list)  # assigns an id to the sentence
            temp_list = []
            while i < len(data) and data[i].strip('\n') + '\n' != '\n':
                if data[i].startswith('# text'):
                    temp_sentence.text = data[i][9:].strip()
                # not using sentence_tokens for now
                # elif data[i].startswith('# sentenceTokens'):
                #     temp_sentence.sentence_tokens = data[i][19:].strip()
                # treeTokens and sentenceText are stored in tree_tokens
                elif data[i].startswith('# treeTokens'):
                    temp_sentence.tree_tokens = data[i][15:].strip()
                elif data[i].startswith('# sentenceText'):
                    temp_sentence.tree_tokens = data[i][17:].strip()
                elif data[i].startswith('#'):  # keep a list of non-text comments
                    temp_sentence.sentence_comments.append(data[i].strip())
                else:
                    temp_list.append(data[i].rstrip('\n'))
                i += 1
            i += 1
            temp_df = pd.DataFrame([x.split('\t') for x in temp_list],
                                   columns=columns)
            temp_df = temp_df.astype({'ID': 'int64', 'HEAD': 'int64'})
            # if Conll columns are used, add missing columns
            if list(temp_df.columns) != Conllx.get_conllx_header():
                temp_df = Conllx.add_missing_columns(temp_df)
            temp_sentence.dependency_tree = temp_df
            if not temp_sentence.dependency_tree.empty:
                sentence_list.append(temp_sentence)
            elif temp_sentence.text or temp_sentence.sentence_comments:
                raise ValueError('no dependency tree, yet comments are present!')
        return sentence_list
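

# Minimal usage sketch (not part of the original module): writes a tiny,
# made-up one-sentence conllx sample to a temporary file, reads it back with
# read_file, and converts it with conllx_to_sentence_list. The file name
# 'example.conllx' and the sample sentence are hypothetical, chosen only to
# illustrate the expected input format.
if __name__ == '__main__':
    import tempfile

    sample = (
        '# text = The cat sleeps\n'
        '1\tThe\tthe\tDET\tDT\t_\t2\tdet\t_\t_\n'
        '2\tcat\tcat\tNOUN\tNN\t_\t3\tnsubj\t_\t_\n'
        '3\tsleeps\tsleep\tVERB\tVBZ\t_\t0\troot\t_\t_\n'
        '\n'
    )
    with tempfile.TemporaryDirectory() as tmp_dir:
        sample_path = Path(tmp_dir) / 'example.conllx'
        sample_path.write_text(sample, encoding='utf-8')

        conllx = Conllx(file_name='example.conllx', file_path=tmp_dir)
        conllx.read_file()
        sentences = conllx.conllx_to_sentence_list()

        print(len(sentences))     # 1
        print(sentences[0].text)  # The cat sleeps
        print(sentences[0].dependency_tree[['ID', 'FORM', 'HEAD', 'DEPREL']])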