parse_europarl_data.py
import numpy as np


def read_tokens_file(token_path, num_sentences):
    """
    Reads a token file and returns its contents, limited to the given number of sentences.
    :param token_path: path to the token file.
    :param num_sentences: number of sentences to read.
    :return:
        (pseudo-tokenized sentences in language 1, pseudo-tokenized sentences in language 2)
    """
    tokens_lang_1 = []
    tokens_lang_2 = []
    with open(token_path, "r", encoding="utf-8") as file:
        # Each line should be of the form:
        #   sentence_in_language_1 ||| sentence_in_language_2
        for idx, line in enumerate(file):
            # stop once we have read enough sentences
            if idx >= num_sentences:
                break
            # split into the two languages, then whitespace-split into "tokens"
            # (not really tokens, but the naming works for now)
            s1, s2 = line.split("|||")
            tokens_lang_1.append(s1.split())
            tokens_lang_2.append(s2.split())
    return tokens_lang_1, tokens_lang_2
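
# Illustration (hypothetical line, not taken from the real data): the input line
#   "the house ||| das Haus"
# yields ["the", "house"] appended to tokens_lang_1 and ["das", "Haus"] to tokens_lang_2.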


def read_alignments_file(alignment_path, num_sentences):
    """
    Reads an alignment file and returns its contents, limited to the given number of sentences.
    :param alignment_path: path to the file containing the alignments
    :param num_sentences: number of sentences we are interested in
    :return: list of alignments per sentence
    """
    alignments = []
    with open(alignment_path, "r", encoding="utf-8") as align_file:
        # Each line should be of the form:
        #   0-0 1-1 2-3 etc.
        for idx, line in enumerate(align_file):
            # stop once we have read enough sentences
            if idx >= num_sentences:
                break
            pairs = [p.split("-") for p in line.split()]
            pairs = np.array(pairs).astype(int)
            alignments.append(pairs)
    return alignments
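
# Illustration: the line "0-0 1-1 2-3" parses to np.array([[0, 0], [1, 1], [2, 3]]),
# i.e. token 2 of the language-1 sentence is aligned to token 3 of the language-2 sentence.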


def create_parallel_sentences(token_files, alignment_files, num_sentences=10):
    """
    Takes an array of token files and an array of alignment files and constructs the data
    used to further fine-tune BERT.
    :param token_files: Array
        Array of file paths to files containing cleaned tokenized data.
    :param alignment_files: Array
        Array of file paths to alignment files.
    :param num_sentences: number of sentences to read from each file pair.
    :return: list with one 3-tuple per file pair:
        ( array of tokenized sentences in language 1,
          array of tokenized sentences in language 2,
          array of sentence token alignments )
    """
    # "is not" compares identity, not value, so use "!=" for the length check
    if len(token_files) != len(alignment_files):
        raise ValueError("Unequal number of token and alignment files.")
    data = []
    for token_path, align_path in zip(token_files, alignment_files):
        tokens_1, tokens_2 = read_tokens_file(token_path, num_sentences)
        alignments = read_alignments_file(align_path, num_sentences)
        data.append((tokens_1, tokens_2, alignments))
    return data


def displace_alignments(data, has_bert_sep=True, include_seps=True, return_words=False):
    """
    Displaces the alignments so that they match a squeezed (concatenated) version of the words.
    :param data: one (tokens_lang_1, tokens_lang_2, alignments) tuple, as produced by
        create_parallel_sentences
    :param has_bert_sep: boolean flag, True if the displacement should account for the BERT
        [CLS] and [SEP] tokens that are not present in the alignments
    :param include_seps: boolean flag, True if alignments should also be added for the BERT
        [CLS] and [SEP] tokens
    :param return_words: boolean flag, True if the squeezed word lists should be returned too
    :return: list of indices of the aligned words in lang 1 and list of indices of the aligned
        words in lang 2 (plus the word lists of both languages when return_words is True)
    """
    aligned_features = []
    w1 = []
    w2 = []
    displacement_1 = 0
    displacement_2 = 0
    for idx, alignment in enumerate(data[2]):
        # include the [CLS] tokens if required
        if has_bert_sep:
            if include_seps:
                aligned_features.append([displacement_1, displacement_2])
                if return_words:
                    w1.append("[CLS]")
                    w2.append("[CLS]")
            displacement_1 += 1
            displacement_2 += 1
        # displace each alignment pair of this sentence
        for entry in alignment:
            aligned_features.append([entry[0] + displacement_1, entry[1] + displacement_2])
        if return_words:
            w1.extend(data[0][idx])
            w2.extend(data[1][idx])
        # include the [SEP] tokens if required; note that this reuses the last alignment
        # pair, which assumes it aligns the final tokens of both sentences
        if has_bert_sep and len(alignment) > 0:
            displacement_1 += 1
            displacement_2 += 1
            if include_seps:
                aligned_features.append([entry[0] + displacement_1, entry[1] + displacement_2])
                if return_words:
                    w1.append("[SEP]")
                    w2.append("[SEP]")
        displacement_1 += len(data[0][idx])
        displacement_2 += len(data[1][idx])
    aligned_features = np.array(aligned_features)
    if return_words:
        return aligned_features[:, 0], aligned_features[:, 1], w1, w2
    return aligned_features[:, 0], aligned_features[:, 1]
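

if __name__ == "__main__":
    # Minimal usage sketch. The file paths below are hypothetical placeholders for
    # Europarl token and alignment files, not files shipped with this script.
    data = create_parallel_sentences(
        token_files=["europarl.en-de.tokens"],
        alignment_files=["europarl.en-de.align"],
        num_sentences=10,
    )
    # displace_alignments works on a single (tokens_1, tokens_2, alignments) tuple
    idx_1, idx_2, words_1, words_2 = displace_alignments(
        data[0], has_bert_sep=True, include_seps=True, return_words=True
    )
    # each pair (idx_1[i], idx_2[i]) indexes aligned positions in the squeezed word lists
    for i1, i2 in zip(idx_1[:5], idx_2[:5]):
        print(words_1[i1], "<->", words_2[i2])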