Skip to content

Commit

Permalink
Add files via upload
Browse files Browse the repository at this point in the history
  • Loading branch information
ipavlopoulos authored Mar 25, 2024
1 parent 83ca558 commit cfe9090
Show file tree
Hide file tree
Showing 3 changed files with 259 additions and 0 deletions.
70 changes: 70 additions & 0 deletions dataset_maker.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
# from evaluate import load
import random
from dictionary_tables import greek_to_greeklish_intonated


def convert_to_greeklish(texts_greek, table=None, seed=12300):
    """
    Convert a list of Greek sentences to Greeklish by token substitution.

    Every sentence is scanned left to right. At each position the two-character
    token (a digraph such as 'ου') is tried first; if it is not in the
    conversion table the single character is tried; a character that is not in
    the table at all (punctuation, digits, Latin text, ...) is copied through
    unchanged. When a token has more than 1 possible conversions (for example,
    η -> {h, i}), exactly ONE of them is picked at random. The sentence is NOT
    branched into every possible variant: the result always holds one converted
    sentence per input sentence, in the same order.

    The random picks come from a private generator seeded with `seed`, so the
    output is reproducible and the process-wide `random` state is left alone.
    --------------------
    @:param texts_greek: a string array containing the corpus in greek, with each row representing a distinct sentence
    @:param table: mapping from a Greek token (1 or 2 characters) to either a list of candidate conversions or a
                   single replacement string; defaults to `greek_to_greeklish_intonated`
    @:param seed: seed for the private random generator (None seeds it from system entropy)
    @:return texts_greeklish: a string array containing the converted sentences from texts_greek
    """
    if table is None:
        table = greek_to_greeklish_intonated

    # A dedicated generator gives the same pick sequence as reseeding the
    # global one, without clobbering random state other code may rely on.
    rng = random.Random(seed)

    texts_greeklish = []
    for sentence in texts_greek:
        pieces = []
        length = len(sentence)
        i = 0

        while i < length:
            # Digraphs take priority over single letters, so look one character ahead first.
            token = None
            if i + 1 < length:
                pair = sentence[i] + sentence[i + 1]
                if pair in table:
                    token = pair
            if token is None and sentence[i] in table:
                token = sentence[i]

            # If the current character is not a known token, transfer it directly
            if token is None:
                pieces.append(sentence[i])
                i += 1
                continue

            conversion = table[token]
            # If there are more than 1 possible conversions, pick one of them randomly
            if isinstance(conversion, list):
                conversion = conversion[rng.randrange(len(conversion))]
            pieces.append(conversion)
            i += len(token)

        # Add the converted sentence to the list
        texts_greeklish.append("".join(pieces))
    return texts_greeklish




# Directory holding the corpus files. This script referenced `data_path`
# without ever defining it (NameError); "data/" is the value main.py uses.
data_path = "data/"

# Read the Greek test sentences, one per line.
with open(data_path + "greek_europarl_test_5k.txt", "r", encoding="utf-8") as file:
    # rstrip("\n") removes only the line terminator, so a final line without a
    # trailing newline keeps its last character (line[:-1] would drop it).
    tests = [line.rstrip("\n") for line in file]

processed = convert_to_greeklish(tests)

# Write the Greeklish sentences in the same order, one per line, so that line k
# of the output corresponds to line k of the Greek input.
with open(data_path + "greeklish_europarl_test_5k.txt", "w", encoding="utf-8") as file:
    file.writelines(sentence + "\n" for sentence in processed)

106 changes: 106 additions & 0 deletions dictionary_tables.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
# Without intonation
# Forward table (no accent marks): Greek token -> list of accepted Greeklish
# spellings. Two-character keys are digraphs.
greek_to_greeklish = {
    # Lowercase single letters
    'α': ['a'], 'ε': ['e'], 'η': ['h', 'i'], 'ι': ['i'], 'ο': ['o'], 'υ': ['u', 'y', 'i'],
    'ω': ['w', 'o'], 'β': ['b', 'v'], 'γ': ['g'], 'δ': ['d'], 'ζ': ['z'], 'θ': ['th', 'u'],
    'κ': ['k'], 'λ': ['l'], 'μ': ['m'], 'ν': ['n'], 'ξ': ['ks', 'x'], 'π': ['p'],
    'ρ': ['r'], 'σ': ['s'], 'ς': ['s'], 'τ': ['t'], 'χ': ['x', 'h'], 'φ': ['f'], 'ψ': ['ps'],

    # Uppercase single letters.
    # 'Υ' now also accepts 'U' (as lowercase 'υ' -> 'u' and the intonated table do),
    # and 'Ψ' maps to 'Ps' instead of the truncated 'P'.
    'Α': ['A'], 'Ε': ['E'], 'Η': ['H', 'I'], 'Ι': ['I'], 'Ο': ['O'], 'Υ': ['Y', 'U', 'I'],
    'Ω': ['W', 'O'], 'Β': ['B', 'V'], 'Γ': ['G'], 'Δ': ['D'], 'Ζ': ['Z'],
    'Θ': ['Th', 'U', 'Q'], 'Κ': ['K'], 'Λ': ['L'], 'Μ': ['M'], 'Ν': ['N'], 'Ξ': ['Ks', 'X'],
    'Π': ['P'], 'Ρ': ['R'], 'Σ': ['S'], 'Τ': ['T'], 'Χ': ['X', 'H'], 'Φ': ['F'], 'Ψ': ['Ps'],

    # Lowercase digraphs
    'ει': ['ei', 'i'], 'οι': ['oi', 'i'], 'ου': ['ou', 'u'], 'αι': ['ai', 'e'],
    'υι': ['ui', 'i'], 'μπ': ['mp', 'b'], 'ντ': ['nt', 'd'],

    # Capitalised digraphs
    'Ει': ['Ei', 'I'], 'Οι': ['Oi', 'I'], 'Ου': ['Ou', 'U'], 'Αι': ['Ai', 'E'],
    'Υι': ['Yi', 'I'], 'Μπ': ['Mp', 'B'], 'Ντ': ['Nt', 'D'],
}

# Reverse table (no accent marks): Greeklish token -> list of Greek tokens it
# may stand for. Kept as the exact inverse of `greek_to_greeklish`: a Greek
# token appears under a Greeklish key if and only if the forward table lists
# that spelling for it.
greeklish_to_greek = {
    # Uppercase / capitalised keys
    'A': ['Α'], 'Ai': ['Αι'], 'B': ['Β', 'Μπ'], 'D': ['Δ', 'Ντ'], 'E': ['Ε', 'Αι'], 'Ei': ['Ει'],
    'F': ['Φ'], 'G': ['Γ'], 'H': ['Η', 'Χ'], 'I': ['Η', 'Ι', 'Υ', 'Ει', 'Οι', 'Υι'], 'K': ['Κ'],
    'Ks': ['Ξ'], 'L': ['Λ'], 'M': ['Μ'], 'Mp': ['Μπ'], 'N': ['Ν'], 'Nt': ['Ντ'], 'O': ['Ο', 'Ω'],
    # 'Ps' carries Ψ (it used to be listed under 'P', matching the truncated forward entry).
    'Oi': ['Οι'], 'Ou': ['Ου'], 'P': ['Π'], 'Ps': ['Ψ'], 'Q': ['Θ'], 'R': ['Ρ'], 'S': ['Σ'], 'T': ['Τ'],
    # The last candidate for 'U' is the Greek capital upsilon, not a Latin 'Y'.
    'Th': ['Θ'], 'U': ['Θ', 'Ου', 'Υ'], 'V': ['Β'], 'W': ['Ω'], 'X': ['Ξ', 'Χ'], 'Y': ['Υ'], 'Yi': ['Υι'],
    'Z': ['Ζ'],

    # Lowercase keys
    'a': ['α'], 'ai': ['αι'], 'b': ['β', 'μπ'], 'd': ['δ', 'ντ'],
    'e': ['ε', 'αι'], 'ei': ['ει'], 'f': ['φ'], 'g': ['γ'], 'h': ['η', 'χ'],
    'i': ['η', 'ι', 'υ', 'ει', 'οι', 'υι'], 'k': ['κ'], 'ks': ['ξ'], 'l': ['λ'], 'm': ['μ'],
    'mp': ['μπ'], 'n': ['ν'], 'nt': ['ντ'], 'o': ['ο', 'ω'], 'oi': ['οι'], 'ou': ['ου'], 'p': ['π'],
    'ps': ['ψ'], 'r': ['ρ'], 's': ['σ', 'ς'], 't': ['τ'], 'th': ['θ'], 'u': ['υ', 'θ', 'ου'],
    'ui': ['υι'], 'v': ['β'], 'w': ['ω'], 'x': ['ξ', 'χ'], 'y': ['υ'], 'z': ['ζ'],
}

# With intonation
# Forward table including accented vowels: Greek token -> list of accepted
# Greeklish spellings. Accented tokens map to the same spellings as their
# unaccented counterparts. Insertion order is kept exactly as before.
greek_to_greeklish_intonated = {
    # --- lowercase letters ---
    "α": ["a"], "ε": ["e"], "η": ["h", "i"], "ι": ["i"], "ο": ["o"],
    "υ": ["u", "y", "i"], "ω": ["w", "o"], "β": ["b", "v"], "γ": ["g"], "δ": ["d"],
    "ζ": ["z"], "θ": ["th", "u"], "κ": ["k"], "λ": ["l"], "μ": ["m"],
    "ν": ["n"], "ξ": ["ks", "x"], "π": ["p"], "ρ": ["r"], "σ": ["s"],
    "ς": ["s"], "τ": ["t"], "φ": ["f"], "χ": ["x", "h"], "ψ": ["ps"],

    # --- uppercase letters ---
    "Α": ["A"], "Ε": ["E"], "Η": ["H", "I"], "Ι": ["I"], "Ο": ["O"],
    "Υ": ["Y", "U", "I"], "Ω": ["W", "O"], "Β": ["B", "V"], "Γ": ["G"], "Δ": ["D"],
    "Ζ": ["Z"], "Θ": ["Th", "U", "Q"], "Κ": ["K"], "Λ": ["L"], "Μ": ["M"],
    "Ν": ["N"], "Ξ": ["Ks", "X"], "Π": ["P"], "Ρ": ["R"], "Σ": ["S"],
    "Τ": ["T"], "Χ": ["X", "H"], "Φ": ["F"], "Ψ": ["Ps"],

    # --- lowercase digraphs ---
    "ει": ["ei", "i"], "οι": ["oi", "i"], "ου": ["ou", "u"], "αι": ["ai", "e"],
    "υι": ["ui", "i"], "μπ": ["mp", "b"], "ντ": ["nt", "d"],

    # --- capitalised digraphs ---
    "Ει": ["Ei", "I"], "Οι": ["Oi", "I"], "Ου": ["Ou", "U"], "Αι": ["Ai", "E"],
    "Υι": ["Yi", "I"], "Μπ": ["Mp", "B"], "Ντ": ["Nt", "D"],

    # --- accented lowercase vowels ---
    "ά": ["a"], "έ": ["e"], "ή": ["h", "i"], "ί": ["i"], "ό": ["o"],
    "ύ": ["u", "y", "i"], "ώ": ["w", "o"],

    # --- accented uppercase vowels ---
    "Ά": ["A"], "Έ": ["E"], "Ή": ["H", "I"], "Ί": ["I"], "Ό": ["O"],
    "Ύ": ["Y", "U", "I"], "Ώ": ["W", "O"],

    # --- accented lowercase digraphs ---
    "εί": ["ei", "i"], "οί": ["oi", "i"], "ού": ["ou", "u"], "αί": ["ai", "e"],
    "υί": ["ui", "i"],

    # --- accented capitalised digraphs ---
    "Εί": ["Ei", "I"], "Οί": ["Oi", "I"], "Ού": ["Ou", "U"], "Αί": ["Ai", "E"],
    "Υί": ["Yi", "I"],
}



"""
{'α': ['a'], 'ε': ['e'], 'η': ['h', 'i'], 'ι': ['i'], 'ο': ['o'], 'υ': ['u', 'y', 'i'],
'ω': ['w', 'o'], 'β': ['b', 'v'], 'γ': ['g'], 'δ': ['d'], 'ζ': ['z'], 'θ': ['th', 'u'],
'κ': ['k'], 'λ': ['l'], 'μ': ['m'], 'ν': ['n'], 'ξ': ['ks', 'x'], 'π': ['p'],
'ρ': ['r'], 'σ': ['s'], 'ς': ['s'], 'τ': ['t'], 'φ': ['f'], 'χ': ['x', 'h'], 'ψ': ['ps'],
'Α': ['A'], 'Ε': ['E'], 'Η': ['H', 'I'], 'Ι': ['I'], 'Ο': ['O'], 'Υ': ['Y', 'U', 'I'],
'Ω': ['W', 'O'], 'Β': ['B', 'V'], 'Γ': ['G'], 'Δ': ['D'], 'Ζ': ['Z'],
'Θ': ['Th', 'U', 'Q'], 'Κ': ['K'], 'Λ': ['L'], 'Μ': ['M'], 'Ν': ['N'], 'Ξ': ['Ks', 'X'],
'Π': ['P'], 'Ρ': ['R'], 'Σ': ['S'], 'Τ': ['T'], 'Χ': ['X', 'H'], 'Φ': ['F'], 'Ψ': ['P'],
'ει': ['ei', 'i'], 'οι': ['oi', 'i'], 'ου': ['ou', 'u'], 'αι': ['ai', 'e'],
'υι': ['ui', 'i'], 'μπ': ['mp', 'b'], 'ντ': ['nt', 'd'],
'Ει': ['Ei', 'I'], 'Οι': ['Oi', 'I'], 'Ου': ['Ou', 'U'], 'Αι': ['Ai', 'E'],
'Υι': ['Yi', 'I'], 'Μπ': ['Mp', 'B'], 'Ντ': ['Nt', 'D'],
'ά': ['a'], 'έ': ['e'], 'ή': ['h', 'i'], 'ί': ['i'], 'ό': ['o'], 'ύ': ['u', 'y', 'i'],
'ώ': ['w', 'o'],
'Ά': ['A'], 'Έ': ['E'], 'Ή': ['H', 'I'], 'Ί': ['I'], 'Ό': ['O'], 'Ύ': ['Y', 'U', 'I'],
'Ώ': ['W', 'O'],
'εί': ['ei', 'i'], 'οί': ['oi', 'i'], 'ού': ['ou', 'u'], 'αί': ['ai', 'e'],
'υί': ['ui', 'i'],
'Εί': ['Ei', 'I'], 'Οί': ['Oi', 'I'], 'Ού': ['Ou', 'U'], 'Αί': ['Ai', 'E'],
'Υί': ['Yi', 'I'],
}
"""

# Reverse table including accented vowels: Greeklish token -> list of Greek
# tokens it may stand for. Insertion order is kept exactly as before.
greeklish_to_greek_intonated = {
    # --- uppercase / capitalised keys ---
    "A": ["Α", "Ά"],
    "Ai": ["Αι", "Αί"],
    "B": ["Β", "Μπ"],
    "D": ["Δ", "Ντ"],
    "E": ["Ε", "Αι", "Έ", "Αί"],
    "Ei": ["Ει", "Εί"],
    "F": ["Φ"], "G": ["Γ"],
    "H": ["Η", "Χ", "Ή"],
    "I": ["Η", "Ι", "Υ", "Ει", "Οι", "Υι", "Ή", "Ί", "Ύ", "Εί", "Οί", "Υί"],
    "K": ["Κ"], "Ks": ["Ξ"], "L": ["Λ"], "M": ["Μ"], "Mp": ["Μπ"],
    "N": ["Ν"], "Nt": ["Ντ"],
    "O": ["Ο", "Ω", "Ό", "Ώ"],
    "Oi": ["Οι", "Οί"],
    "Ou": ["Ου", "Ού"],
    "P": ["Π"], "Ps": ["Ψ"], "Q": ["Θ"], "R": ["Ρ"], "S": ["Σ"],
    "T": ["Τ"], "Th": ["Θ"],
    "U": ["Θ", "Ου", "Ού", "Υ", "Ύ"],
    "V": ["Β"],
    "W": ["Ω", "Ώ"],
    "X": ["Ξ", "Χ"],
    "Y": ["Υ", "Ύ"],
    "Yi": ["Υι", "Υί"],
    "Z": ["Ζ"],

    # --- lowercase keys ---
    "a": ["α", "ά"],
    "ai": ["αι", "αί"],
    "b": ["β", "μπ"],
    "d": ["δ", "ντ"],
    "e": ["ε", "αι", "έ", "αί"],
    "ei": ["ει", "εί"],
    "f": ["φ"], "g": ["γ"],
    "h": ["η", "χ", "ή"],
    "i": ["η", "ι", "υ", "ει", "οι", "υι", "ή", "ί", "ύ", "εί", "οί", "υί"],
    "k": ["κ"], "ks": ["ξ"], "l": ["λ"], "m": ["μ"], "mp": ["μπ"],
    "n": ["ν"], "nt": ["ντ"],
    "o": ["ο", "ω", "ό", "ώ"],
    "oi": ["οι", "οί"],
    "ou": ["ου", "ού"],
    "p": ["π"], "ps": ["ψ"], "r": ["ρ"],
    "s": ["σ", "ς"],
    "t": ["τ"], "th": ["θ"],
    "u": ["υ", "θ", "ου", "ύ", "ού"],
    "ui": ["υι", "υί"],
    "v": ["β"],
    "w": ["ω", "ώ"],
    "x": ["ξ", "χ"],
    "y": ["υ", "ύ"],
    "z": ["ζ"],
}

83 changes: 83 additions & 0 deletions main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
from datetime import datetime
import pickle
from dictionary_tables import greek_to_greeklish_intonated
from models.RNN.LSTM_LM import LSTM_LangModel
from models.RNN.main import LanguageModel
from models.RNN.util import *
import torch
from evaluate import load

# Directory prefixes used when building file paths (each ends with "/").
models_path, data_path, results_path = "models/", "data/", "results/"

# Metric objects obtained through `load`, requested in the order cer, wer, bleu.
cer, wer, bleu = (load(metric_name) for metric_name in ("cer", "wer", "bleu"))

def _read_reversed_lines(path):
    """
    Read the UTF-8 text file at `path` and return its lines, each with the
    line terminator removed and its characters in reverse order.

    Only the trailing "\\n" is stripped (not the last character
    unconditionally), so a final line without a newline is kept intact.
    """
    with open(path, "r", encoding="utf-8") as handle:
        return [line.rstrip("\n")[::-1] for line in handle]


# Load the test data. Every sentence is stored character-reversed —
# presumably because the "rev_" models loaded below consume reversed text; TODO confirm.
test_sources = _read_reversed_lines(data_path + "greeklish_europarl_test_5k.txt")
test_targets = _read_reversed_lines(data_path + "greek_europarl_test_5k.txt")


# Reverse models: each entry is (embedding size, hidden size, dropout tag).
# The dropout tag is only used to build directory/file names ("0{tag}_dropout");
# presumably 2 means dropout 0.2 — TODO confirm against the training script.
model_params = [(32, 512, 2), (32, 512, 5), (64, 512, 2), (64, 512, 5), (64, 1024, 5), (128, 300, 5), (128, 512, 5)]

# First and last constructor arguments of LSTM_LangModel; presumably the
# character-vocabulary size (the checkpoint names contain "char_120") — TODO confirm.
input_size = 120
output_size = 120

# Only the first `eval_limit` test sentences are translated and scored.
eval_limit = 1000
# Second argument of LM.translate. Number of beams does matter.
num_beams = 9

eval_sources = test_sources[:eval_limit]
eval_targets = test_targets[:eval_limit]

for counter, (embed_size, hidden_size, dropout_tag) in enumerate(model_params, start=1):

    # Load model
    model = LSTM_LangModel(input_size, embed_size, hidden_size, output_size)
    run_name = "50000_char_120_{}_{}".format(embed_size, hidden_size)
    path = models_path + "LSTM_models_trained/rev_1layer_LSTM_LM_{}/0{}_dropout/".format(run_name, dropout_tag)

    # NOTE(security): torch.load and pickle.load execute code embedded in the
    # file they read; only point them at checkpoints from a trusted source.
    model.load_state_dict(
        torch.load(path + "rev_1layer_LSTM_LM_{}.pt".format(run_name), map_location=torch.device('cpu')))

    with open(path + "rev_vectorizer_{}.pickle".format(run_name), "rb") as f:
        text_vec = pickle.load(f)

    LM = LanguageModel(text_vec, model)

    print("Successful load. Translation began. {}".format(datetime.now()))

    results = LM.translate(eval_sources, num_beams)

    cer_score = cer.compute(predictions=results, references=eval_targets)
    wer_score = wer.compute(predictions=results, references=eval_targets)
    bleu_score = bleu.compute(predictions=results, references=eval_targets).get("bleu")

    # Write the scores for this model. The translations themselves are not
    # written to disk; only the three metric values are.
    report_lines = [
        "Test Score at [:{}] of greeklish_europarl_test_5k.txt".format(eval_limit),
        "CER: {}".format(cer_score),
        "WER: {}".format(wer_score),
        "BLEU: {}".format(bleu_score),
    ]
    report_name = "results_rev_1layer_{}_0{}drop.txt".format(run_name, dropout_tag)
    with open(results_path + report_name, "w", encoding="utf-8") as file:
        file.write("\n".join(report_lines) + "\n")

    # Progress is reported against the real number of configurations rather
    # than a hard-coded total.
    print("{}/{} done. {}".format(counter, len(model_params), datetime.now()))

0 comments on commit cfe9090

Please sign in to comment.