-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
83ca558
commit cfe9090
Showing
3 changed files
with
259 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,70 @@ | ||
# from evaluate import load | ||
import random | ||
from dictionary_tables import greek_to_greeklish_intonated | ||
|
||
|
||
def convert_to_greeklish(texts_greek):
    """
    Convert a list of Greek sentences to Greeklish using the conversion table
    ``greek_to_greeklish_intonated``.

    Whenever a token has more than one possible conversion (for example,
    η -> {h, i}), one candidate is picked uniformly at random, so every
    possible way to transliterate a sentence gets an equal representation in
    the resulting dataset.

    Two-character tokens (digraphs such as ου, μπ) are matched before single
    characters; anything not present in the table (non-Greek characters,
    punctuation, whitespace) is copied through unchanged.
    --------------------
    :param texts_greek: list of strings, each a distinct Greek sentence
    :return: list of strings containing the converted (Greeklish) sentences
    """
    texts_greeklish = []
    # Fixed seed so the generated dataset is reproducible across runs.
    random.seed(12300)
    for sentence in texts_greek:
        converted_sent = ""
        i = 0
        while i < len(sentence):
            # Prefer the two-character (digraph) conversion when it applies.
            pair = sentence[i:i + 2]
            if len(pair) == 2 and pair in greek_to_greeklish_intonated:
                conversion = greek_to_greeklish_intonated[pair]
                step = 2
            # Single-character conversion.
            elif sentence[i] in greek_to_greeklish_intonated:
                conversion = greek_to_greeklish_intonated[sentence[i]]
                step = 1
            # Token outside the Greek alphabet: transfer it directly.
            else:
                conversion = sentence[i]
                step = 1
            # If there is more than 1 possible conversion, pick one randomly.
            # random.choice consumes the same amount of randomness as the
            # randrange+index idiom, so seeded output is unchanged.
            if isinstance(conversion, list):
                conversion = random.choice(conversion)
            converted_sent += conversion
            i += step
        # Add the converted sentence to the list
        texts_greeklish.append(converted_sent)
    return texts_greeklish
|
||
|
||
|
||
|
||
# NOTE(review): `data_path` was not defined anywhere in this file in the
# original (NameError at runtime); "data/" matches the convention used by the
# sibling evaluation script — confirm against the project layout.
data_path = "data/"

# Read the original Greek corpus, one sentence per line.
with open(data_path + "greek_europarl_test_5k.txt", "r", encoding="utf-8") as file:
    # rstrip("\n") instead of line[:-1]: does not eat the final character of a
    # last line that has no trailing newline.
    tests = [line.rstrip("\n") for line in file]

processed = convert_to_greeklish(tests)

# Write the Greeklish version, one converted sentence per line.
with open(data_path + "greeklish_europarl_test_5k.txt", "w", encoding="utf-8") as file:
    for sentence in processed:
        file.write(sentence)
        file.write("\n")
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,106 @@ | ||
# Without intonation
# Maps each Greek letter (and common digraph) to its accepted Greeklish
# (Latin) transliterations.  A multi-entry list means several romanizations
# are in common use (e.g. η -> h or i) and a converter may pick any of them.
# NOTE(review): 'Ψ' maps to ['P'] here but to ['Ps'] in the intonated table
# below, and 'Υ' lacks the 'U' option present there — confirm whether these
# differences are intentional.
greek_to_greeklish = {'α': ['a'], 'ε': ['e'], 'η': ['h', 'i'], 'ι': ['i'], 'ο': ['o'], 'υ': ['u', 'y', 'i'],
                      'ω': ['w', 'o'], 'β': ['b', 'v'], 'γ': ['g'], 'δ': ['d'], 'ζ': ['z'], 'θ': ['th', 'u'],
                      'κ': ['k'], 'λ': ['l'], 'μ': ['m'], 'ν': ['n'], 'ξ': ['ks', 'x'], 'π': ['p'],
                      'ρ': ['r'], 'σ': ['s'], 'ς': ['s'], 'τ': ['t'], 'χ': ['x', 'h'], 'φ': ['f'], 'ψ': ['ps'],

                      # Uppercase single letters.
                      'Α': ['A'], 'Ε': ['E'], 'Η': ['H', 'I'], 'Ι': ['I'], 'Ο': ['O'], 'Υ': ['Y', 'I'],
                      'Ω': ['W', 'O'], 'Β': ['B', 'V'], 'Γ': ['G'], 'Δ': ['D'], 'Ζ': ['Z'],
                      'Θ': ['Th', 'U', 'Q'], 'Κ': ['K'], 'Λ': ['L'], 'Μ': ['M'], 'Ν': ['N'], 'Ξ': ['Ks', 'X'],
                      'Π': ['P'], 'Ρ': ['R'], 'Σ': ['S'], 'Τ': ['T'], 'Χ': ['X', 'H'], 'Φ': ['F'], 'Ψ': ['P'],

                      # Lowercase digraphs (vowel pairs and μπ/ντ consonant clusters).
                      'ει': ['ei', 'i'], 'οι': ['oi', 'i'], 'ου': ['ou', 'u'], 'αι': ['ai', 'e'],
                      'υι': ['ui', 'i'], 'μπ': ['mp', 'b'], 'ντ': ['nt', 'd'],

                      # Capitalized digraphs (first letter uppercase).
                      'Ει': ['Ei', 'I'], 'Οι': ['Oi', 'I'], 'Ου': ['Ou', 'U'], 'Αι': ['Ai', 'E'],
                      'Υι': ['Yi', 'I'], 'Μπ': ['Mp', 'B'], 'Ντ': ['Nt', 'D']}
|
||
# Reverse mapping (no intonation): each Greeklish token to the Greek letters
# or digraphs it may stand for; ambiguous Latin letters (e.g. 'i') list every
# Greek spelling that romanizes to them.
# NOTE(review): 'U' maps to ['Θ', 'Ου', 'Y'] — the 'Y' here is a LATIN letter
# inside a list of Greek values; presumably Greek 'Υ' was intended — verify.
greeklish_to_greek = {'A': ['Α'], 'Ai': ['Αι'], 'B': ['Β', 'Μπ'], 'D': ['Δ', 'Ντ'], 'E': ['Ε', 'Αι'], 'Ei': ['Ει'],
                      'F': ['Φ'], 'G': ['Γ'], 'H': ['Η', 'Χ'], 'I': ['Η', 'Ι', 'Υ', 'Ει', 'Οι', 'Υι'], 'K': ['Κ'],
                      'Ks': ['Ξ'], 'L': ['Λ'], 'M': ['Μ'], 'Mp': ['Μπ'], 'N': ['Ν'], 'Nt': ['Ντ'], 'O': ['Ο', 'Ω'],
                      'Oi': ['Οι'], 'Ou': ['Ου'], 'P': ['Π', 'Ψ'], 'Q': ['Θ'], 'R': ['Ρ'], 'S': ['Σ'], 'T': ['Τ'],
                      'Th': ['Θ'], 'U': ['Θ', 'Ου', 'Y'], 'V': ['Β'], 'W': ['Ω'], 'X': ['Ξ', 'Χ'], 'Y': ['Υ'], 'Yi': ['Υι'],
                      'Z': ['Ζ'], 'a': ['α'], 'ai': ['αι'], 'b': ['β', 'μπ'], 'd': ['δ', 'ντ'],
                      'e': ['ε', 'αι'], 'ei': ['ει'], 'f': ['φ'], 'g': ['γ'], 'h': ['η', 'χ'],
                      'i': ['η', 'ι', 'υ', 'ει', 'οι', 'υι'], 'k': ['κ'], 'ks': ['ξ'], 'l': ['λ'], 'm': ['μ'],
                      'mp': ['μπ'], 'n': ['ν'], 'nt': ['ντ'], 'o': ['ο', 'ω'], 'oi': ['οι'], 'ou': ['ου'], 'p': ['π'],
                      'ps': ['ψ'], 'r': ['ρ'], 's': ['σ', 'ς'], 't': ['τ'], 'th': ['θ'], 'u': ['υ', 'θ', 'ου'],
                      'ui': ['υι'], 'v': ['β'], 'w': ['ω'], 'x': ['ξ', 'χ'], 'y': ['υ'], 'z': ['ζ']}
|
||
# With intonation
# Same as greek_to_greeklish but additionally covers the accented (tonos)
# vowels and accented digraphs; accents are dropped in the Greeklish output.
greek_to_greeklish_intonated = {'α': ['a'], 'ε': ['e'], 'η': ['h', 'i'], 'ι': ['i'], 'ο': ['o'], 'υ': ['u', 'y', 'i'],
                                'ω': ['w', 'o'], 'β': ['b', 'v'], 'γ': ['g'], 'δ': ['d'], 'ζ': ['z'], 'θ': ['th', 'u'],
                                'κ': ['k'], 'λ': ['l'], 'μ': ['m'], 'ν': ['n'], 'ξ': ['ks', 'x'], 'π': ['p'],
                                'ρ': ['r'], 'σ': ['s'], 'ς': ['s'], 'τ': ['t'], 'φ': ['f'], 'χ': ['x', 'h'], 'ψ': ['ps'],

                                # Uppercase single letters.
                                'Α': ['A'], 'Ε': ['E'], 'Η': ['H', 'I'], 'Ι': ['I'], 'Ο': ['O'], 'Υ': ['Y', 'U', 'I'],
                                'Ω': ['W', 'O'], 'Β': ['B', 'V'], 'Γ': ['G'], 'Δ': ['D'], 'Ζ': ['Z'],
                                'Θ': ['Th', 'U', 'Q'], 'Κ': ['K'], 'Λ': ['L'], 'Μ': ['M'], 'Ν': ['N'], 'Ξ': ['Ks', 'X'],
                                'Π': ['P'], 'Ρ': ['R'], 'Σ': ['S'], 'Τ': ['T'], 'Χ': ['X', 'H'], 'Φ': ['F'], 'Ψ': ['Ps'],

                                # Lowercase digraphs.
                                'ει': ['ei', 'i'], 'οι': ['oi', 'i'], 'ου': ['ou', 'u'], 'αι': ['ai', 'e'],
                                'υι': ['ui', 'i'], 'μπ': ['mp', 'b'], 'ντ': ['nt', 'd'],

                                # Capitalized digraphs.
                                'Ει': ['Ei', 'I'], 'Οι': ['Oi', 'I'], 'Ου': ['Ou', 'U'], 'Αι': ['Ai', 'E'],
                                'Υι': ['Yi', 'I'], 'Μπ': ['Mp', 'B'], 'Ντ': ['Nt', 'D'],

                                # Accented (tonos) vowels; accents disappear in Greeklish.
                                'ά': ['a'], 'έ': ['e'], 'ή': ['h', 'i'], 'ί': ['i'], 'ό': ['o'], 'ύ': ['u', 'y', 'i'],
                                'ώ': ['w', 'o'],
                                'Ά': ['A'], 'Έ': ['E'], 'Ή': ['H', 'I'], 'Ί': ['I'], 'Ό': ['O'], 'Ύ': ['Y', 'U', 'I'],
                                'Ώ': ['W', 'O'],

                                # Digraphs with an accent on the second vowel.
                                'εί': ['ei', 'i'], 'οί': ['oi', 'i'], 'ού': ['ou', 'u'], 'αί': ['ai', 'e'],
                                'υί': ['ui', 'i'],
                                'Εί': ['Ei', 'I'], 'Οί': ['Oi', 'I'], 'Ού': ['Ou', 'U'], 'Αί': ['Ai', 'E'],
                                'Υί': ['Yi', 'I'],
                                }
|
||
|
||
|
||
# (Removed a dead module-level string literal that duplicated the
# greek_to_greeklish_intonated table above; as a bare string expression it
# had no runtime effect and only risked drifting out of sync with the table.)
|
||
# Reverse mapping (with intonation): each Greeklish token to every Greek
# spelling — accented or not — that romanizes to it.  Because Greeklish drops
# accents, most entries list both the plain and the tonos variant.
greeklish_to_greek_intonated = {'A': ['Α', 'Ά'], 'Ai': ['Αι', 'Αί'], 'B': ['Β', 'Μπ'], 'D': ['Δ', 'Ντ'],
                                'E': ['Ε', 'Αι', 'Έ', 'Αί'], 'Ei': ['Ει', 'Εί'], 'F': ['Φ'], 'G': ['Γ'],
                                'H': ['Η', 'Χ', 'Ή'],
                                'I': ['Η', 'Ι', 'Υ', 'Ει', 'Οι', 'Υι', 'Ή', 'Ί', 'Ύ', 'Εί', 'Οί', 'Υί'],
                                'K': ['Κ'], 'Ks': ['Ξ'], 'L': ['Λ'], 'M': ['Μ'], 'Mp': ['Μπ'], 'N': ['Ν'],
                                'Nt': ['Ντ'], 'O': ['Ο', 'Ω', 'Ό', 'Ώ'], 'Oi': ['Οι', 'Οί'], 'Ou': ['Ου', 'Ού'],
                                'P': ['Π'], 'Ps': ['Ψ'], 'Q': ['Θ'], 'R': ['Ρ'], 'S': ['Σ'], 'T': ['Τ'], 'Th': ['Θ'],
                                'U': ['Θ', 'Ου', 'Ού', 'Υ', 'Ύ'], 'V': ['Β'], 'W': ['Ω', 'Ώ'], 'X': ['Ξ', 'Χ'], 'Y': ['Υ', 'Ύ'],
                                'Yi': ['Υι', 'Υί'], 'Z': ['Ζ'], 'a': ['α', 'ά'], 'ai': ['αι', 'αί'], 'b': ['β', 'μπ'],
                                'd': ['δ', 'ντ'], 'e': ['ε', 'αι', 'έ', 'αί'], 'ei': ['ει', 'εί'], 'f': ['φ'],
                                'g': ['γ'], 'h': ['η', 'χ', 'ή'],
                                'i': ['η', 'ι', 'υ', 'ει', 'οι', 'υι', 'ή', 'ί', 'ύ', 'εί', 'οί', 'υί'],
                                'k': ['κ'], 'ks': ['ξ'], 'l': ['λ'], 'm': ['μ'], 'mp': ['μπ'], 'n': ['ν'],
                                'nt': ['ντ'], 'o': ['ο', 'ω', 'ό', 'ώ'], 'oi': ['οι', 'οί'], 'ou': ['ου', 'ού'],
                                'p': ['π'], 'ps': ['ψ'], 'r': ['ρ'], 's': ['σ', 'ς'], 't': ['τ'], 'th': ['θ'],
                                'u': ['υ', 'θ', 'ου', 'ύ', 'ού'], 'ui': ['υι', 'υί'], 'v': ['β'], 'w': ['ω', 'ώ'],
                                'x': ['ξ', 'χ'], 'y': ['υ', 'ύ'], 'z': ['ζ']}
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,83 @@ | ||
from datetime import datetime | ||
import pickle | ||
from dictionary_tables import greek_to_greeklish_intonated | ||
from models.RNN.LSTM_LM import LSTM_LangModel | ||
from models.RNN.main import LanguageModel | ||
from models.RNN.util import * | ||
import torch | ||
from evaluate import load | ||
|
||
# Directory layout: trained models, input corpora, and evaluation outputs.
models_path = "models/"
data_path = "data/"
results_path = "results/"

# Evaluation metrics loaded via `evaluate.load`: character error rate,
# word error rate, and BLEU.
cer = load("cer")
wer = load("wer")
bleu = load("bleu")
|
||
# Load the test data.
# Sources (Greeklish) and targets (Greek) are parallel, one sentence per line.
# Each line is reversed because the models were trained on reversed strings.
with open(data_path + "greeklish_europarl_test_5k.txt", "r", encoding="utf-8") as file:
    # rstrip("\n") instead of line[:-1]: does not drop the final character of
    # a last line that has no trailing newline.
    test_sources = [line.rstrip("\n")[::-1] for line in file]

with open(data_path + "greek_europarl_test_5k.txt", "r", encoding="utf-8") as file:
    test_targets = [line.rstrip("\n")[::-1] for line in file]
|
||
|
||
# Reverse models: evaluate every trained reverse-direction LSTM configuration.
# Each tuple is (embed_size, hidden_size, dropout_code); dropout_code is the
# digit after "0" in the checkpoint directory name (e.g. 2 -> "02_dropout").
model_params = [(32, 512, 2), (32, 512, 5), (64, 512, 2), (64, 512, 5),
                (64, 1024, 5), (128, 300, 5), (128, 512, 5)]
for counter, (embed_size, hidden_size, dropout_code) in enumerate(model_params, start=1):

    # The character vocabulary size is fixed at 120 for every model.
    input_size = 120
    output_size = 120

    model = LSTM_LangModel(input_size, embed_size, hidden_size, output_size)
    path = "models/LSTM_models_trained/rev_1layer_LSTM_LM_50000_char_120_{}_{}/0{}_dropout/".format(
        embed_size, hidden_size, dropout_code)

    # Load the trained weights onto the CPU (no GPU assumed at eval time).
    model.load_state_dict(
        torch.load(path + "rev_1layer_LSTM_LM_50000_char_120_{}_{}.pt".format(embed_size, hidden_size),
                   map_location=torch.device('cpu')))

    # NOTE: pickle.load is only safe because these vectorizer files are
    # produced by our own training runs — never load untrusted pickles.
    with open(path + "rev_vectorizer_50000_char_120_{}_{}.pickle".format(embed_size, hidden_size), "rb") as f:
        text_vec = pickle.load(f)

    LM = LanguageModel(text_vec, model)

    print("Successful load. Translation began. {}".format(datetime.now()))

    # Number of beams does matter.
    results = LM.translate(test_sources[:1000], 9)

    # Named variables instead of a positional scores list.
    cer_score = cer.compute(predictions=results, references=test_targets[:1000])
    wer_score = wer.compute(predictions=results, references=test_targets[:1000])
    bleu_score = bleu.compute(predictions=results, references=test_targets[:1000]).get("bleu")

    # Write the resulting scores for this configuration.
    with open(results_path + "results_rev_1layer_50000_char_120_{}_{}_0{}drop.txt".format(
            embed_size, hidden_size, dropout_code), "w", encoding="utf-8") as file:

        file.write("Test Score at [:1000] of greeklish_europarl_test_5k.txt")
        file.write('\n')
        file.write("CER: {}".format(cer_score))
        file.write('\n')
        file.write("WER: {}".format(wer_score))
        file.write('\n')
        file.write("BLEU: {}".format(bleu_score))
        file.write('\n')

    print("{}/{} done. {}".format(counter, len(model_params), datetime.now()))