test_laser_tokenizer.py
#!/usr/bin/python3
# Copyright (c) Facebook, Inc. and its affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
#
# LASER Language-Agnostic SEntence Representations
# is a toolkit to calculate multilingual sentence embeddings
# and to use them for document classification, bitext filtering
# and mining
#
# --------------------------------------------------------
# Tests for LaserTokenizer

import os
import warnings
from pathlib import Path
from tempfile import TemporaryDirectory
from typing import List

import numpy as np
import pytest

from laser_encoders import (
    LaserEncoderPipeline,
    initialize_encoder,
    initialize_tokenizer,
)
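
# NOTE: these tests exercise the public laser_encoders API (tokenizer, encoder,
# and LaserEncoderPipeline). Model files are fetched into the pytest tmp_path if
# not already present, so running the suite assumes network access.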


@pytest.fixture
def tokenizer(tmp_path: Path):
    tokenizer_instance = initialize_tokenizer(model_dir=tmp_path, laser="laser2")
    return tokenizer_instance


@pytest.fixture
def input_text() -> str:
    return "This is a test sentence."


@pytest.fixture
def test_readme_params() -> dict:
    return {
        "lang": "igbo",
        "input_sentences": ["nnọọ, kedu ka ị mere"],
        "expected_embedding_shape": (1, 1024),
        "expected_array": [
            0.3807628,
            -0.27941525,
            -0.17819545,
            0.44144684,
            -0.38985375,
            0.04719935,
            0.20238206,
            -0.03934783,
            0.0118901,
            0.28986093,
        ],
    }


def test_tokenize(tokenizer, input_text: str):
    expected_output = "▁this ▁is ▁a ▁test ▁sent ence ."
    assert tokenizer.tokenize(input_text) == expected_output


def test_tokenizer_call_method(tokenizer, input_text: str):
    single_string = "This is a test sentence."
    expected_output = "▁this ▁is ▁a ▁test ▁sent ence ."
    assert tokenizer(single_string) == expected_output

    list_of_strings = ["This is a test sentence.", "This is another test sentence."]
    expected_output = [
        "▁this ▁is ▁a ▁test ▁sent ence .",
        "▁this ▁is ▁another ▁test ▁sent ence .",
    ]
    assert tokenizer(list_of_strings) == expected_output


def test_normalization(tokenizer):
    test_data = "Hello!!! How are you??? I'm doing great."
    expected_output = "▁hel lo !!! ▁how ▁are ▁you ??? ▁i ' m ▁do ing ▁great ."
    assert tokenizer.tokenize(test_data) == expected_output


def test_descape(tokenizer):
    test_data = "I <3 Apple & Carrots!"
    expected_output = "▁i ▁<3 ▁app le ▁& ▁car ro ts !"
    tokenizer.descape = True
    assert tokenizer.tokenize(test_data) == expected_output


def test_lowercase(tokenizer):
    test_data = "THIS OUTPUT MUST BE UPPERCASE"
    expected_output = "▁TH IS ▁ OU TP UT ▁ MU ST ▁BE ▁ UP PER CA SE"
    tokenizer.lower_case = False
    assert tokenizer.tokenize(test_data) == expected_output


def test_is_printable(tokenizer):
    test_data = "Hello, \tWorld! ABC\x1f123"
    expected_output = "▁hel lo , ▁world ! ▁ab c ▁12 3"
    assert tokenizer.tokenize(test_data) == expected_output


def test_tokenize_file(tokenizer, input_text: str):
    with TemporaryDirectory() as temp_dir:
        input_file = os.path.join(temp_dir, "input.txt")
        output_file = os.path.join(temp_dir, "output.txt")
        with open(input_file, "w") as file:
            file.write(input_text)
        tokenizer.tokenize_file(
            inp_fname=Path(input_file),
            out_fname=Path(output_file),
        )
        with open(output_file, "r") as file:
            output = file.read().strip()
        expected_output = "▁this ▁is ▁a ▁test ▁sent ence ."
        assert output == expected_output


def test_tokenize_file_overwrite(tokenizer, input_text: str):
    with TemporaryDirectory() as temp_dir:
        input_file = os.path.join(temp_dir, "input.txt")
        output_file = os.path.join(temp_dir, "output.txt")
        with open(input_file, "w") as file:
            file.write(input_text)
        with open(output_file, "w") as file:
            file.write("Existing output")

        # Test when over_write is False
        tokenizer.over_write = False
        tokenizer.tokenize_file(
            inp_fname=Path(input_file),
            out_fname=Path(output_file),
        )
        with open(output_file, "r") as file:
            output = file.read().strip()
        assert output == "Existing output"

        # Test when over_write is True
        tokenizer.over_write = True
        tokenizer.tokenize_file(
            inp_fname=Path(input_file),
            out_fname=Path(output_file),
        )
        with open(output_file, "r") as file:
            output = file.read().strip()
        expected_output = "▁this ▁is ▁a ▁test ▁sent ence ."
        assert output == expected_output
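

# The laser2 encoder is language-agnostic (lang=None here), while laser3 models
# are language-specific and need a language code (zul_Latn in this case). The
# expected values are the first 10 embedding dimensions for the test sentence.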
@pytest.mark.parametrize(
    "laser, expected_array, lang",
    [
        (
            "laser2",
            [
                1.042462512850761414e-02,
                6.325428839772939682e-03,
                -3.032622225873637944e-05,
                9.033476933836936951e-03,
                2.937933895736932755e-04,
                4.489220678806304932e-03,
                2.334521152079105377e-03,
                -9.427300537936389446e-04,
                -1.571535394759848714e-04,
                2.095808042213320732e-03,
            ],
            None,
        ),
        (
            "laser3",
            [
                3.038274645805358887e-01,
                4.151830971240997314e-01,
                -2.458990514278411865e-01,
                3.153458833694458008e-01,
                -5.153598189353942871e-01,
                -6.035178527235984802e-02,
                2.210616767406463623e-01,
                -2.701394855976104736e-01,
                -4.902199506759643555e-01,
                -3.126966953277587891e-02,
            ],
            "zul_Latn",
        ),
    ],
)
def test_sentence_encoder(
    tmp_path: Path,
    tokenizer,
    laser: str,
    expected_array: List,
    lang: str,
    input_text: str,
):
    sentence_encoder = initialize_encoder(model_dir=tmp_path, laser=laser, lang=lang)
    tokenized_text = tokenizer.tokenize(input_text)
    sentence_embedding = sentence_encoder.encode_sentences([tokenized_text])
    assert isinstance(sentence_embedding, np.ndarray)
    assert sentence_embedding.shape == (1, 1024)
    assert np.allclose(expected_array, sentence_embedding[:, :10], atol=1e-3)
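

# LaserEncoderPipeline bundles tokenization and encoding into a single call; the
# expected values below mirror the Igbo example from the laser_encoders README
# (see the test_readme_params fixture above).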
def test_laser_encoder_pipeline(tmp_path: Path, test_readme_params: dict):
    lang = test_readme_params["lang"]
    input_sentences = test_readme_params["input_sentences"]
    expected_embedding_shape = test_readme_params["expected_embedding_shape"]
    expected_array = test_readme_params["expected_array"]
    encoder = LaserEncoderPipeline(model_dir=tmp_path, lang=lang)
    embeddings = encoder.encode_sentences(input_sentences)
    assert isinstance(embeddings, np.ndarray)
    assert embeddings.shape == expected_embedding_shape
    assert np.allclose(expected_array, embeddings[:, :10], atol=1e-3)


def test_separate_initialization_and_encoding(
    tmp_path, tokenizer, test_readme_params: dict
):
    lang = test_readme_params["lang"]
    input_sentences = test_readme_params["input_sentences"]
    expected_embedding_shape = test_readme_params["expected_embedding_shape"]
    expected_array = test_readme_params["expected_array"]
    tokenized_sentence = tokenizer.tokenize(input_sentences[0])
    sentence_encoder = initialize_encoder(model_dir=tmp_path, lang=lang)

    # Encode tokenized sentences into embeddings
    embeddings = sentence_encoder.encode_sentences([tokenized_sentence])
    assert isinstance(embeddings, np.ndarray)
    assert embeddings.shape == expected_embedding_shape
    assert np.allclose(expected_array, embeddings[:, :10], atol=1e-3)
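

# encode_sentences exposes a normalize_embeddings flag: when True, each embedding
# should have (approximately) unit L2 norm; the default behaviour is expected to
# match normalize_embeddings=False.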
def test_encoder_normalization(tmp_path: Path, test_readme_params: dict):
    lang = test_readme_params["lang"]
    input_sentences = test_readme_params["input_sentences"]
    encoder = LaserEncoderPipeline(model_dir=tmp_path, lang=lang)
    normalized_embeddings = encoder.encode_sentences(
        input_sentences, normalize_embeddings=True
    )
    norm = np.linalg.norm(normalized_embeddings[0])
    assert np.allclose(norm, 1.0, atol=1e-3)


def test_encoder_default_behaviour(tmp_path: Path, test_readme_params: dict):
    lang = test_readme_params["lang"]
    input_sentences = test_readme_params["input_sentences"]
    encoder = LaserEncoderPipeline(model_dir=tmp_path, lang=lang)
    default_embeddings = encoder.encode_sentences(input_sentences)
    non_normalized_embeddings = encoder.encode_sentences(
        input_sentences, normalize_embeddings=False
    )
    assert np.allclose(default_embeddings, non_normalized_embeddings)


def test_encoder_non_normalization(tmp_path: Path, test_readme_params: dict):
    lang = test_readme_params["lang"]
    input_sentences = test_readme_params["input_sentences"]
    encoder = LaserEncoderPipeline(model_dir=tmp_path, lang=lang)
    non_normalized_embeddings = encoder.encode_sentences(
        input_sentences, normalize_embeddings=False
    )
    norm = np.linalg.norm(non_normalized_embeddings[0])
    assert not np.isclose(norm, 1)
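

# Argument validation: 'lang' is ignored (with a warning) for laser2, required
# for laser3, and at least one of 'laser' or 'lang' must be provided.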
def test_optional_lang_with_laser2(tmp_path: Path):
    with pytest.warns(
        UserWarning,
        match="The 'lang' parameter is optional when using 'laser2'. It will be ignored.",
    ):
        encoder = LaserEncoderPipeline(lang="en", laser="laser2", model_dir=tmp_path)


def test_required_lang_with_laser3(tmp_path: Path):
    with pytest.raises(
        ValueError, match="For 'laser3', the 'lang' parameter is required."
    ):
        encoder = LaserEncoderPipeline(laser="laser3", model_dir=tmp_path)


def test_missing_lang_and_laser(tmp_path: Path):
    with pytest.raises(
        ValueError, match="Either 'laser' or 'lang' should be provided."
    ):
        encoder = LaserEncoderPipeline(model_dir=tmp_path)