-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathdataset_maker.py
70 lines (57 loc) · 3.17 KB
/
dataset_maker.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
# from evaluate import load
import random
from dictionary_tables import greek_to_greeklish_intonated
def convert_to_greeklish(texts_greek, conversions=None, seed=12300):
    """
    Convert a list of Greek sentences to Greeklish, one output per input.

    Each sentence is scanned left to right. At every position the two-character
    sequence starting there is looked up in the conversion table first; if it is
    not a key, the single character is looked up; if that is not a key either,
    the character is copied to the output unchanged.

    When a table entry is a list (for example, η -> [h, i]), ONE of the
    alternatives is picked at random for that occurrence. The sentence is not
    expanded into every possible spelling: the result always has exactly one
    converted sentence per input sentence.

    The module-level random generator is re-seeded at the start of every call,
    so repeated calls with the same arguments produce the same output. Note that
    this also resets the global `random` state for the rest of the program.
    --------------------
    @:param texts_greek: an iterable of strings, each one a distinct sentence
    @:param conversions: mapping from a one- or two-character source token to
        either a replacement string or a list of alternative replacement
        strings. Defaults to greek_to_greeklish_intonated.
    @:param seed: value passed to random.seed() before converting
    @:return texts_greeklish: a list with the converted sentences, in the same
        order as texts_greek
    """
    if conversions is None:
        conversions = greek_to_greeklish_intonated

    # Re-seed so that the random picks among alternatives are reproducible.
    random.seed(seed)

    texts_greeklish = []
    for sentence in texts_greek:
        # Collect the pieces and join once instead of growing a string with +=.
        pieces = []
        length = len(sentence)
        i = 0
        while i < length:
            # Two-character tokens take precedence over single characters.
            token = None
            if i + 1 < length:
                pair = sentence[i] + sentence[i + 1]
                if pair in conversions:
                    token = pair
            if token is None and sentence[i] in conversions:
                token = sentence[i]

            if token is None:
                # Not in the table: transfer the character as it is.
                pieces.append(sentence[i])
                i += 1
                continue

            replacement = conversions[token]
            if isinstance(replacement, list):
                # More than one possible conversion: pick one of them randomly.
                replacement = replacement[random.randrange(len(replacement))]
            pieces.append(replacement)
            i += len(token)

        texts_greeklish.append("".join(pieces))
    return texts_greeklish
# Script section: read the Greek test sentences (one per line), convert them to
# Greeklish, and write the results one per line to the output file.
# NOTE(review): `data_path` is not defined in the visible part of this file; it
# must be bound before this point or the next line raises NameError - confirm.
with open(data_path+"greek_europarl_test_5k.txt", "r", encoding="utf-8") as file:
    # Strip only the line terminator. Slicing with [:-1] would also remove the
    # last real character of a final line that has no trailing newline.
    tests = [line.rstrip("\n") for line in file]
processed = convert_to_greeklish(tests)
with open(data_path+"greeklish_europarl_test_5k.txt", "w", encoding="utf-8") as file:
    # One converted sentence per output line, in input order.
    file.writelines(sentence + "\n" for sentence in processed)