Add files via upload

nlpaueb · Mar 25, 2024 · cfe9090 · cfe9090
1 parent 83ca558
commit cfe9090
Show file tree

Hide file tree

Showing 3 changed files with 259 additions and 0 deletions.
diff --git a/dataset_maker.py b/dataset_maker.py
@@ -0,0 +1,70 @@
+# from evaluate import load
+import random
+from dictionary_tables import greek_to_greeklish_intonated
+
+
+def convert_to_greeklish(texts_greek, seed=12300, conversions=None):
+    """
+    This function takes a list of original Greek sentences and converts each of them to one Greeklish sentence.
+    A sentence is scanned left to right. At every position the two-character token (a digraph such as 'ου' or 'μπ')
+    is tried first and the single character second; a character that has no entry in the conversion table is copied
+    to the output unchanged. When a token has more than 1 possible conversions (for example, η -> {h, i} ), ONE of
+    them is picked at random. The sentence is not branched: every input sentence yields exactly one output sentence.
+    The random picks come from a private generator seeded with `seed`, so the same input always gives the same
+    output and the state of the global `random` module is not modified.
+    --------------------
+    @:param texts_greek: a string array containing the corpus in greek, with each row representing a distinct sentence
+    @:param seed: seed of the private random generator used to pick among alternative conversions
+    @:param conversions: optional table mapping a Greek token to a list of Greeklish spellings (or to a single
+                         string); when None, greek_to_greeklish_intonated is used
+    @:return texts_greeklish: a string array containing the converted sentences from texts_greek
+    """
+
+    table = greek_to_greeklish_intonated if conversions is None else conversions
+    rng = random.Random(seed)
+
+    texts_greeklish = []
+    for sentence in texts_greek:
+        pieces = []
+        length = len(sentence)
+        i = 0
+
+        while i < length:
+            # Tokens are matched in this order: 1) two-character tokens, 2) single characters.
+            pair = sentence[i:i + 2]
+            if len(pair) == 2 and pair in table:
+                token = pair
+            elif sentence[i] in table:
+                token = sentence[i]
+            else:
+                # Not a convertible token (digit, punctuation, Latin letter, ...): transfer it directly
+                pieces.append(sentence[i])
+                i += 1
+                continue
+
+            options = table[token]
+            if isinstance(options, list):
+                # A draw is made even for single-option lists; this keeps the stream of random draws, and
+                # therefore the output produced for a given seed, unchanged.
+                pieces.append(options[rng.randrange(len(options))])
+            else:
+                pieces.append(options)
+            i += len(token)
+
+        # Add the converted sentence to the list
+        texts_greeklish.append("".join(pieces))
+    return texts_greeklish
+
+
+
+
+# Build the Greeklish side of the test set: read the Greek sentences (one per line), convert them and
+# write the result, one sentence per line, into the same directory.
+# `data_path` was never defined in this file; it is set here to the data directory that main.py uses.
+data_path = "data/"
+
+with open(data_path+"greek_europarl_test_5k.txt", "r", encoding="utf-8") as file:
+    # Strip only the line terminator. Slicing with [:-1] would remove a real character whenever the
+    # last line of the file does not end with a newline.
+    tests = [line.rstrip("\n") for line in file]
+
+processed = convert_to_greeklish(tests)
+
+with open(data_path+"greeklish_europarl_test_5k.txt", "w", encoding="utf-8") as file:
+    file.writelines(sentence + "\n" for sentence in processed)
+
diff --git a/dictionary_tables.py b/dictionary_tables.py
@@ -0,0 +1,106 @@
+# Without intonation
+#
+# greek_to_greeklish maps a Greek token (a single letter or a two-letter digraph) to the list of Latin
+# spellings it may be written as. greeklish_to_greek is the inverse lookup: a Latin spelling mapped to
+# every Greek token that may produce it. The two tables are inverses of each other: every
+# (Greek, Latin) pair that appears in one of them also appears in the other.
+greek_to_greeklish = {'α': ['a'], 'ε': ['e'], 'η': ['h', 'i'], 'ι': ['i'], 'ο': ['o'], 'υ': ['u', 'y', 'i'],
+                    'ω': ['w', 'o'], 'β': ['b', 'v'], 'γ': ['g'], 'δ': ['d'], 'ζ': ['z'], 'θ': ['th', 'u'],
+                    'κ': ['k'], 'λ': ['l'], 'μ': ['m'], 'ν': ['n'], 'ξ': ['ks', 'x'], 'π': ['p'],
+                    'ρ': ['r'], 'σ': ['s'], 'ς': ['s'], 'τ': ['t'], 'χ': ['x', 'h'], 'φ': ['f'], 'ψ': ['ps'],
+
+                    # 'Υ' also accepts 'U' (as lowercase 'υ' accepts 'u'); 'Ψ' is 'Ps', not 'P', which is 'Π'.
+                    'Α': ['A'], 'Ε': ['E'], 'Η': ['H', 'I'], 'Ι': ['I'], 'Ο': ['O'], 'Υ': ['Y', 'U', 'I'],
+                    'Ω': ['W', 'O'], 'Β': ['B', 'V'], 'Γ': ['G'], 'Δ': ['D'], 'Ζ': ['Z'],
+                    'Θ': ['Th', 'U', 'Q'], 'Κ': ['K'], 'Λ': ['L'], 'Μ': ['M'], 'Ν': ['N'], 'Ξ': ['Ks', 'X'],
+                    'Π': ['P'], 'Ρ': ['R'], 'Σ': ['S'], 'Τ': ['T'], 'Χ': ['X', 'H'], 'Φ': ['F'], 'Ψ': ['Ps'],
+
+                    # Digraphs, lowercase and capitalised
+                    'ει': ['ei', 'i'], 'οι': ['oi', 'i'], 'ου': ['ou', 'u'], 'αι': ['ai', 'e'],
+                    'υι': ['ui', 'i'], 'μπ': ['mp', 'b'], 'ντ': ['nt', 'd'],
+
+                    'Ει': ['Ei', 'I'], 'Οι': ['Oi', 'I'], 'Ου': ['Ou', 'U'], 'Αι': ['Ai', 'E'],
+                    'Υι': ['Yi', 'I'], 'Μπ': ['Mp', 'B'], 'Ντ': ['Nt', 'D']}
+
+# The values are Greek characters even where they look like Latin ones (e.g. the 'Υ' listed under 'U').
+greeklish_to_greek = {'A': ['Α'], 'Ai': ['Αι'], 'B': ['Β', 'Μπ'], 'D': ['Δ', 'Ντ'], 'E': ['Ε', 'Αι'], 'Ei': ['Ει'],
+                      'F': ['Φ'], 'G': ['Γ'], 'H': ['Η', 'Χ'], 'I': ['Η', 'Ι', 'Υ', 'Ει', 'Οι', 'Υι'], 'K': ['Κ'],
+                      'Ks': ['Ξ'], 'L': ['Λ'], 'M': ['Μ'], 'Mp': ['Μπ'], 'N': ['Ν'], 'Nt': ['Ντ'], 'O': ['Ο', 'Ω'],
+                      'Oi': ['Οι'], 'Ou': ['Ου'], 'P': ['Π'], 'Ps': ['Ψ'], 'Q': ['Θ'], 'R': ['Ρ'], 'S': ['Σ'],
+                      'T': ['Τ'], 'Th': ['Θ'], 'U': ['Θ', 'Ου', 'Υ'], 'V': ['Β'], 'W': ['Ω'], 'X': ['Ξ', 'Χ'],
+                      'Y': ['Υ'], 'Yi': ['Υι'],
+                      'Z': ['Ζ'], 'a': ['α'], 'ai': ['αι'], 'b': ['β', 'μπ'], 'd': ['δ', 'ντ'],
+                      'e': ['ε', 'αι'], 'ei': ['ει'], 'f': ['φ'], 'g': ['γ'], 'h': ['η', 'χ'],
+                      'i': ['η', 'ι', 'υ', 'ει', 'οι', 'υι'], 'k': ['κ'], 'ks': ['ξ'], 'l': ['λ'], 'm': ['μ'],
+                      'mp': ['μπ'], 'n': ['ν'], 'nt': ['ντ'], 'o': ['ο', 'ω'], 'oi': ['οι'], 'ou': ['ου'], 'p': ['π'],
+                      'ps': ['ψ'], 'r': ['ρ'], 's': ['σ', 'ς'], 't': ['τ'], 'th': ['θ'], 'u': ['υ', 'θ', 'ου'],
+                      'ui': ['υι'], 'v': ['β'], 'w': ['ω'], 'x': ['ξ', 'χ'], 'y': ['υ'], 'z': ['ζ']}
+
+# With intonation
+#
+# Maps a Greek token (a single letter or a two-letter digraph) to the list of Latin spellings it may be
+# written as. Besides the plain letters and digraphs, the table contains the accented vowels and the
+# digraphs whose second vowel is accented; each of them lists the same Latin spellings as its
+# unaccented form in this table, so the accent is dropped in the Greeklish output.
+greek_to_greeklish_intonated = {
+    # Lowercase letters
+    'α': ['a'], 'ε': ['e'], 'η': ['h', 'i'], 'ι': ['i'], 'ο': ['o'], 'υ': ['u', 'y', 'i'],
+    'ω': ['w', 'o'], 'β': ['b', 'v'], 'γ': ['g'], 'δ': ['d'], 'ζ': ['z'], 'θ': ['th', 'u'],
+    'κ': ['k'], 'λ': ['l'], 'μ': ['m'], 'ν': ['n'], 'ξ': ['ks', 'x'], 'π': ['p'],
+    'ρ': ['r'], 'σ': ['s'], 'ς': ['s'], 'τ': ['t'], 'φ': ['f'], 'χ': ['x', 'h'], 'ψ': ['ps'],
+
+    # Uppercase letters
+    'Α': ['A'], 'Ε': ['E'], 'Η': ['H', 'I'], 'Ι': ['I'], 'Ο': ['O'], 'Υ': ['Y', 'U', 'I'],
+    'Ω': ['W', 'O'], 'Β': ['B', 'V'], 'Γ': ['G'], 'Δ': ['D'], 'Ζ': ['Z'],
+    'Θ': ['Th', 'U', 'Q'], 'Κ': ['K'], 'Λ': ['L'], 'Μ': ['M'], 'Ν': ['N'], 'Ξ': ['Ks', 'X'],
+    'Π': ['P'], 'Ρ': ['R'], 'Σ': ['S'], 'Τ': ['T'], 'Χ': ['X', 'H'], 'Φ': ['F'], 'Ψ': ['Ps'],
+
+    # Digraphs, lowercase and capitalised
+    'ει': ['ei', 'i'], 'οι': ['oi', 'i'], 'ου': ['ou', 'u'], 'αι': ['ai', 'e'],
+    'υι': ['ui', 'i'], 'μπ': ['mp', 'b'], 'ντ': ['nt', 'd'],
+
+    'Ει': ['Ei', 'I'], 'Οι': ['Oi', 'I'], 'Ου': ['Ou', 'U'], 'Αι': ['Ai', 'E'],
+    'Υι': ['Yi', 'I'], 'Μπ': ['Mp', 'B'], 'Ντ': ['Nt', 'D'],
+
+    # Accented vowels, lowercase and uppercase
+    'ά': ['a'], 'έ': ['e'], 'ή': ['h', 'i'], 'ί': ['i'], 'ό': ['o'], 'ύ': ['u', 'y', 'i'],
+    'ώ': ['w', 'o'],
+    'Ά': ['A'], 'Έ': ['E'], 'Ή': ['H', 'I'], 'Ί': ['I'], 'Ό': ['O'], 'Ύ': ['Y', 'U', 'I'],
+    'Ώ': ['W', 'O'],
+
+    # Digraphs with an accented second vowel, lowercase and capitalised
+    'εί': ['ei', 'i'], 'οί': ['oi', 'i'], 'ού': ['ou', 'u'], 'αί': ['ai', 'e'],
+    'υί': ['ui', 'i'],
+    'Εί': ['Ei', 'I'], 'Οί': ['Oi', 'I'], 'Ού': ['Ou', 'U'], 'Αί': ['Ai', 'E'],
+    'Υί': ['Yi', 'I'],
+}
+
+# Reverse lookup with intonation: a Latin spelling mapped to every Greek token (plain or accented,
+# single letter or digraph) listed for it. Built from an uppercase part and a lowercase part.
+_latin_upper_to_greek_intonated = {
+    'A': ['Α', 'Ά'], 'Ai': ['Αι', 'Αί'],
+    'B': ['Β', 'Μπ'],
+    'D': ['Δ', 'Ντ'],
+    'E': ['Ε', 'Αι', 'Έ', 'Αί'], 'Ei': ['Ει', 'Εί'],
+    'F': ['Φ'],
+    'G': ['Γ'],
+    'H': ['Η', 'Χ', 'Ή'],
+    'I': ['Η', 'Ι', 'Υ', 'Ει', 'Οι', 'Υι', 'Ή', 'Ί', 'Ύ', 'Εί', 'Οί', 'Υί'],
+    'K': ['Κ'], 'Ks': ['Ξ'],
+    'L': ['Λ'],
+    'M': ['Μ'], 'Mp': ['Μπ'],
+    'N': ['Ν'], 'Nt': ['Ντ'],
+    'O': ['Ο', 'Ω', 'Ό', 'Ώ'], 'Oi': ['Οι', 'Οί'], 'Ou': ['Ου', 'Ού'],
+    'P': ['Π'], 'Ps': ['Ψ'],
+    'Q': ['Θ'],
+    'R': ['Ρ'],
+    'S': ['Σ'],
+    'T': ['Τ'], 'Th': ['Θ'],
+    'U': ['Θ', 'Ου', 'Ού', 'Υ', 'Ύ'],
+    'V': ['Β'],
+    'W': ['Ω', 'Ώ'],
+    'X': ['Ξ', 'Χ'],
+    'Y': ['Υ', 'Ύ'], 'Yi': ['Υι', 'Υί'],
+    'Z': ['Ζ'],
+}
+
+_latin_lower_to_greek_intonated = {
+    'a': ['α', 'ά'], 'ai': ['αι', 'αί'],
+    'b': ['β', 'μπ'],
+    'd': ['δ', 'ντ'],
+    'e': ['ε', 'αι', 'έ', 'αί'], 'ei': ['ει', 'εί'],
+    'f': ['φ'],
+    'g': ['γ'],
+    'h': ['η', 'χ', 'ή'],
+    'i': ['η', 'ι', 'υ', 'ει', 'οι', 'υι', 'ή', 'ί', 'ύ', 'εί', 'οί', 'υί'],
+    'k': ['κ'], 'ks': ['ξ'],
+    'l': ['λ'],
+    'm': ['μ'], 'mp': ['μπ'],
+    'n': ['ν'], 'nt': ['ντ'],
+    'o': ['ο', 'ω', 'ό', 'ώ'], 'oi': ['οι', 'οί'], 'ou': ['ου', 'ού'],
+    'p': ['π'], 'ps': ['ψ'],
+    'r': ['ρ'],
+    's': ['σ', 'ς'],
+    't': ['τ'], 'th': ['θ'],
+    'u': ['υ', 'θ', 'ου', 'ύ', 'ού'], 'ui': ['υι', 'υί'],
+    'v': ['β'],
+    'w': ['ω', 'ώ'],
+    'x': ['ξ', 'χ'],
+    'y': ['υ', 'ύ'],
+    'z': ['ζ'],
+}
+
+# Uppercase keys first, then lowercase keys (the two parts share no key).
+greeklish_to_greek_intonated = {**_latin_upper_to_greek_intonated, **_latin_lower_to_greek_intonated}
+
diff --git a/main.py b/main.py
@@ -0,0 +1,83 @@
+from datetime import datetime
+import pickle
+from dictionary_tables import greek_to_greeklish_intonated
+from models.RNN.LSTM_LM import LSTM_LangModel
+from models.RNN.main import LanguageModel
+from models.RNN.util import *
+import torch
+from evaluate import load
+
+# Directories used by this script, relative to the working directory.
+models_path, data_path, results_path = "models/", "data/", "results/"
+
+# Evaluation metrics, fetched by name through evaluate.load (in the order cer, wer, bleu).
+cer, wer, bleu = (load(metric_name) for metric_name in ("cer", "wer", "bleu"))
+
+def _read_reversed_lines(path):
+    """Return the lines of the UTF-8 text file at `path`, each without its newline and character-reversed."""
+    with open(path, "r", encoding="utf-8") as file:
+        # Strip only the line terminator. Slicing with [:-1] would remove a real character whenever the
+        # last line of the file does not end with a newline.
+        return [line.rstrip("\n")[::-1] for line in file]
+
+
+# Load the test data. Sources are the Greeklish sentences, targets the original Greek ones. Every
+# sentence is stored in REVERSE character order (the models evaluated in this script are the
+# "rev_" models).
+test_sources = _read_reversed_lines(data_path + "greeklish_europarl_test_5k.txt")
+test_targets = _read_reversed_lines(data_path + "greek_europarl_test_5k.txt")
+
+
+# Reverse models. Each tuple is (embedding size, hidden size, digit that fills the "0{}_dropout"
+# directory name and the "_0{}drop" part of the results file name).
+model_params = [(32,512,2),(32,512,5),(64,512,2),(64,512,5),(64,1024,5),(128,300,5),(128,512,5)]
+
+# Sizes handed to LSTM_LangModel. They are the same for every model, so they are set once here.
+# NOTE(review): presumably the character-vocabulary size (the checkpoints are named "char_120") - confirm.
+input_size = 120
+output_size = 120
+
+# Only the first `num_test_sentences` sentence pairs are translated and scored.
+num_test_sentences = 1000
+eval_sources = test_sources[:num_test_sentences]
+eval_targets = test_targets[:num_test_sentences]
+
+counter = 0
+for counter, params in enumerate(model_params, start=1):
+    embed_size, hidden_size, dropout_tag = params
+
+    # Load model
+    model = LSTM_LangModel(input_size, embed_size, hidden_size, output_size)
+    path = models_path + "LSTM_models_trained/rev_1layer_LSTM_LM_50000_char_120_{}_{}/0{}_dropout/".format(
+        embed_size, hidden_size, dropout_tag)
+
+    # NOTE: torch.load and pickle.load can execute arbitrary code while unpickling. Only point this
+    # script at checkpoint and vectorizer files that come from a trusted source.
+    model.load_state_dict(
+        torch.load(path+"rev_1layer_LSTM_LM_50000_char_120_{}_{}.pt".format(embed_size, hidden_size), map_location=torch.device('cpu')))
+
+    # This script only runs inference: put the model in evaluation mode so that dropout cannot
+    # randomly perturb the translations and the scores computed from them.
+    # NOTE(review): LanguageModel.translate may already do this; calling eval() again is harmless.
+    model.eval()
+
+    with open(path+"rev_vectorizer_50000_char_120_{}_{}.pickle".format(embed_size, hidden_size), "rb") as f:
+        text_vec = pickle.load(f)
+
+    LM = LanguageModel(text_vec, model)
+
+    print("Successful load. Translation began. {}".format(datetime.now()))
+
+    # Number of beams does matter.
+    # NOTE(review): the second argument (9) is presumably that number of beams - confirm against LanguageModel.translate.
+    results = LM.translate(eval_sources, 9)
+
+    # NOTE(review): the references are character-reversed sentences; this assumes translate() returns its
+    # predictions in the same (reversed) orientation - confirm.
+    cer_score = cer.compute(predictions=results, references=eval_targets)
+    wer_score = wer.compute(predictions=results, references=eval_targets)
+    bleu_score = bleu.compute(predictions=results, references=eval_targets).get("bleu")
+
+    # Write the scores of this model (the translations themselves are not saved)
+    report_lines = [
+        "Test Score at [:{}] of greeklish_europarl_test_5k.txt".format(num_test_sentences),
+        "CER: {}".format(cer_score),
+        "WER: {}".format(wer_score),
+        "BLEU: {}".format(bleu_score),
+    ]
+    with open(results_path+"results_rev_1layer_50000_char_120_{}_{}_0{}drop.txt".format(embed_size, hidden_size, dropout_tag), "w", encoding="utf-8") as file:
+        file.write("\n".join(report_lines) + "\n")
+
+    print("{}/{} done. {}".format(counter, len(model_params), datetime.now()))