-
Notifications
You must be signed in to change notification settings - Fork 2
/
read.py
248 lines (195 loc) · 8.67 KB
/
read.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
# coding=utf-8
# Copyright 2020 The Google Research Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Functions to read Turkish Web Treebank sentence annotations."""
import os
import pathlib
from typing import Generator, Iterable, List, Optional
from turkish_treebanks import twt_pb2
# Short module-private aliases for the protobuf message classes of `twt_pb2`
# that are used in the type annotations and constructor calls below.
_Feature = twt_pb2.Feature
_Sentence = twt_pb2.Sentence
_Tag = twt_pb2.Tag
_Token = twt_pb2.Token
# The treebank files are read from the "data" directory that sits next to the
# directory containing this module.
_ROOT_DIR = pathlib.Path(__file__).parent.parent
_DATA_DIR = os.path.join(_ROOT_DIR, "data")

# Maps every treebank section name to the path of its CoNLL-U format file.
_PATHS_BY_SECTION = {
    name: os.path.join(_DATA_DIR, f"{name}.conllu")
    for name in ("web", "wiki")
}

# Split specifiers that callers are allowed to request.
_VALID_SPLIT_NAMES = ["train", "dev", "test"]
def _whitespace_trimmed(string: str) -> str:
  """Returns the string without its leading and trailing whitespace."""
  return string.strip()
def _split_into_sentences(conll: str) -> List[str]:
  """Splits CoNLL-U format file content into its sentence annotations.

  Sentence annotations are separated by an empty line (two consecutive
  newlines). Every resulting chunk is stripped of surrounding whitespace; chunks
  that end up empty are kept in the result.
  """
  chunks = conll.split("\n\n")
  return [chunk.strip() for chunk in chunks]
def _split_into_lines(sentence: str) -> List[str]:
  """Splits a CoNLL-U format sentence annotation into its lines.

  Every line is stripped of surrounding whitespace; lines that end up empty
  are kept in the result.
  """
  raw_lines = sentence.split("\n")
  return [raw_line.strip() for raw_line in raw_lines]
def _reconstruct_conll_from(sentences: Iterable[str]) -> str:
  """Joins sentence annotations back into CoNLL-U format file content.

  Consecutive sentence annotations are separated by a single empty line.
  """
  sentence_separator = "\n\n"
  return sentence_separator.join(sentences)
def _read_sentences_from(path: str) -> Generator[str, None, None]:
  """Reads and yields sentences of a CoNLL-U format treebank file from the path.

  The file is decoded as UTF-8 and read in one go.

  Args:
    path: path to a CoNLL-U format treebank file from which sentences will be
      read.

  Yields:
    Individual sentence annotations of a CoNLL-U format treebank file.
  """
  # Read everything and close the file before the first sentence is yielded, so
  # that the handle does not stay open for as long as the consumer keeps this
  # generator suspended (or abandons it half way).
  with open(path, "r", encoding="utf-8") as reader:
    content = reader.read()
  yield from _split_into_sentences(content)
def _sentence_is_in_split(sentence_index: int, split: str) -> bool:
  """Checks if sentence with given positional index is in the split.

  Train/dev/test sentences are sampled deterministically from the position of
  a sentence within each group of ten consecutive sentences: the first eight
  belong to the training set, the ninth to the development set and the tenth
  to the test set.

  Args:
    sentence_index: sequential index of the sentence in the source CoNLL-U
      format treebank file (assuming first sentence has index 0).
    split: treebank split (could be 'train', 'test', 'dev'). Any other value
      matches every sentence.

  Returns:
    True if sentence with given positional index belongs to the specified split.
    Otherwise, returns False.
  """
  # No recognized split requested, hence nothing is filtered out.
  if split not in ("train", "dev", "test"):
    return True

  position_in_group = sentence_index % 10
  if split == "train":
    return position_in_group < 8
  if split == "dev":
    return position_in_group == 8
  return position_in_group == 9
def _validate_sentence(sentence: str) -> None:
  """Checks if a CoNLL-U format sentence annotation is structurally wellformed.

  A wellformed annotation consists of a '# sent_id = ' comment line, followed
  by a '# text = ' comment line, followed by at least one token line. Every
  token line must have exactly 10 tab-separated columns.

  Args:
    sentence: a CoNLL-U format sentence annotation.

  Raises:
    ValueError: CoNLL-U format sentence annotation is illformed. It is either
      missing sentence id or text annotation, or one of the lines that ought
      to contain token annotations is structurally illformed.
  """

  def _validate_token_line(line: str) -> None:
    """Raises ValueError unless line has exactly 10 tab-separated columns."""
    if len(line.split("\t")) != 10:
      raise ValueError(f"Illformed CoNNL-U format token annotation:\n{line}")

  lines = _split_into_lines(sentence)

  if len(lines) <= 2:
    # Report the number of lines, not the number of characters of the sentence.
    raise ValueError(
        "Expecting a sentence to be at least 3 lines in CoNLL-U format,"
        f" but found a {len(lines)} line sentence annotation:\n{sentence}")

  if not lines[0].startswith("# sent_id = "):
    raise ValueError(
        "First line of the CoNNL-U format sentence annotation does not have a"
        f" valid sentence id annotation:\n{sentence}")

  if not lines[1].startswith("# text = "):
    raise ValueError(
        "Second line of the CoNNL-U format sentence annotation does not have a"
        f" valid sentence text annotation:\n{sentence}")

  # Token annotations start right after the two comment lines, i.e. at index 2,
  # so the first token line has to be validated as well.
  for line in lines[2:]:
    _validate_token_line(line)
def _decompose_sentence(sentence: str) -> _Sentence:
  """Parses CoNLL-U format sentence annotation in a sentence protobuf.

  The first line is read as the '# sent_id = ' comment, the second line as the
  '# text = ' comment and every following line as a token annotation. The
  annotation is not validated here.

  Args:
    sentence: a CoNLL-U format sentence annotation.

  Returns:
    Sentence protobuf that contains annotations for a sentence that is parsed
    from CoNLL-U format sentence annotation.
  """

  def _decompose_comment(line: str, prefix: str) -> str:
    """Returns the content of a comment line, i.e. what follows the prefix."""
    return line[len(prefix):]

  def _decompose_features(raw_features: str) -> Generator[_Feature, None, None]:
    """Parses CoNLL-U format features annotations into Feature objects."""
    for raw_feature in raw_features.split("|"):
      # A lone underscore marks an unspecified value and carries no feature.
      if raw_feature == "_":
        continue
      # Only split on the first '=' so that a value which itself contains '='
      # is kept intact instead of failing to unpack.
      category, value = raw_feature.split("=", 1)
      yield _Feature(category=category, value=value)

  def _decompose_token(line: str) -> _Token:
    """Parses CoNLL-U format token annotations into Token objects."""
    column = line.split("\t")
    # Columns 0 (token id) and 8 are not carried over to the protobuf.
    return _Token(
        form=column[1],
        lemma=column[2],
        tag=_Tag(coarse=column[3], fine=column[4]),
        feature=_decompose_features(column[5]),
        head=int(column[6]),
        dependency_relation=column[7],
        misc_feature=_decompose_features(column[9]),
    )

  lines = _split_into_lines(sentence)
  return _Sentence(
      sentence_id=_decompose_comment(lines[0], "# sent_id = "),
      text=_decompose_comment(lines[1], "# text = "),
      token=tuple(_decompose_token(line) for line in lines[2:]),
  )
def as_conllu(section: Optional[str] = None,
              split: Optional[str] = None) -> str:
  """Reads sentence annotations of Turkish Web Treebank in CoNLL-U format.

  Args:
    section: optional, section of Turkish Web Treebank whose sentence
      annotations will be read (could be either 'web' or 'wiki'). If
      unspecified sentence annotations from web and Wikipedia sections will
      be read.
    split: optional, treebank split whose sentence annotations will be read
      (could be 'train', 'test', 'dev'). If unspecified sentence annotations
      from all three splits will be read.

  Raises:
    ValueError: invalid section name or split specifier, or source treebank
      files from which the sentence annotations are read is not valid with
      respect to the CoNLL-U format.

  Returns:
    Sentence annotations for the specified treebank and split in CoNLL-U
    treebank file format.
  """
  if section and section not in _PATHS_BY_SECTION:
    raise ValueError(f"Invalid section name '{section}'."
                     f" It can only be one of: 'web', 'wiki'.")

  if split and split not in _VALID_SPLIT_NAMES:
    raise ValueError(f"Invalid split specifier '{split}'."
                     f" It can only be one of: 'train', 'dev', 'test'")

  # Either the single file of the requested section, or all section files in
  # sorted path order.
  if section:
    selected_paths = [_PATHS_BY_SECTION[section]]
  else:
    selected_paths = sorted(_PATHS_BY_SECTION.values())

  selected_sentences = []
  for path in selected_paths:
    # Sentence positions restart from 0 for every file.
    for position, annotation in enumerate(_read_sentences_from(path)):
      if not _sentence_is_in_split(position, split):
        continue
      # Only sentences that end up in the output are validated.
      _validate_sentence(annotation)
      selected_sentences.append(annotation)

  return _reconstruct_conll_from(selected_sentences)
def sentences(section: Optional[str] = None,
              split: Optional[str] = None) -> Generator[_Sentence, None, None]:
  """Reads and yields sentences of Turkish Web Treebank as sentence protobufs.

  Args:
    section: optional, section of Turkish Web Treebank whose sentence
      annotations will be read (could be either 'web' or 'wiki'). If
      unspecified sentence annotations from both web and Wikipedia sections
      will be read.
    split: optional, treebank split whose sentence annotations will be read
      (could be 'train', 'test', 'dev'). If unspecified sentence annotations
      from all three splits will be read.

  Raises:
    ValueError: invalid section name or split specifier, or source treebank
      files from which the sentence annotations are read is not valid with
      respect to the CoNLL-U format.

  Yields:
    Sentence protobufs which contain annotations for the specified treebank
    section and split. Nothing is yielded if no sentence falls into the
    requested section and split.
  """
  conll = as_conllu(section, split)

  # An empty result means that no sentence was selected. Splitting it would
  # produce a single empty "sentence" that cannot be decomposed, so stop here.
  if not conll:
    return

  for sentence in _split_into_sentences(conll):
    yield _decompose_sentence(sentence)