-
Notifications
You must be signed in to change notification settings - Fork 0
/
aprendizaje.py
119 lines (108 loc) · 4.13 KB
/
aprendizaje.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
import math
import string
import nltk
import emoji
import re
#import enchant
# nltk.download('stopwords')  # Uncomment on the first run to fetch the stop-word list
# nltk.download('punkt')  # Uncomment on the first run to fetch the tokenizer models
# nltk.download('wordnet')  # Uncomment on the first run to fetch the WordNet data
from nltk import word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
def process_text(text, stem=True):
    """Split *text* into tokens and optionally stem each one.

    Args:
        text: The string handed to ``word_tokenize``.
        stem: When true (the default), every token is passed through a
            ``PorterStemmer`` before being returned.

    Returns:
        The list of tokens, stemmed when ``stem`` is true and untouched
        otherwise.
    """
    words = word_tokenize(text)
    if not stem:
        return words
    porter = PorterStemmer()
    return [porter.stem(word) for word in words]
def preprocessCorpus(corpus):
    """Normalize a newline-separated corpus, one document per line.

    Each line is split on whitespace and every token goes through these steps:

    1. Tokens that start with a control character, a URL or an HTML tag are
       dropped.
    2. Emojis are pulled out of the token and re-added at the end in their
       textual ``emoji.demojize`` form.
    3. ASCII punctuation in what remains is replaced by spaces, which may
       split the token into several pieces.
    4. Each piece is stemmed and kept unless its stem is a (stemmed) English
       stop word.

    Args:
        corpus: The whole corpus as a single string with ``"\\n"`` between
            documents.

    Returns:
        A list with one string per input line; every kept token is followed
        by a single space (so non-empty results end with a trailing space).
    """
    url_regex = "https?:\\/\\/(?:www\\.)?[-a-zA-Z0-9@:%._\\+~#=]{1,256}\\.[a-zA-Z0-9()]{1,6}\\b(?:[-a-zA-Z0-9()@:%_\\+.~#?&\\/=]*)"
    html_tag_regex = "<(?:\"[^\"]*\"['\"]*|'[^']*'['\"]*|[^'\">])+>"
    control_characters_regex = "[\x00-\x1F\x7F]"
    # Compiled once; each pattern is anchored at the start of the token by
    # ``match`` exactly as in the individual ``re.match`` calls it replaces.
    skip_patterns = [
        re.compile(pattern)
        for pattern in (control_characters_regex, url_regex, html_tag_regex)
    ]
    punctuation_to_space = {ord(char): " " for char in string.punctuation}
    stemmer = PorterStemmer()
    # Stop words are stored stemmed, so tokens must be compared after
    # stemming too; a set keeps each membership test O(1).
    stop_words = {process_text(w)[0] for w in stopwords.words('english')}

    print("Preprocessing corpus...")
    newCorpus = []
    for email in corpus.split("\n"):
        kept = []
        for token in email.split():
            # URL / HTML detection must see the raw token: once punctuation
            # is stripped those patterns can no longer match.
            if any(pattern.match(token) for pattern in skip_patterns):
                continue
            emojis = [found["emoji"] for found in emoji.emoji_list(token)]
            for symbol in emojis:
                token = token.replace(symbol, "")
            # Punctuation becomes whitespace, so "don't" yields "don" and "t".
            for piece in token.translate(punctuation_to_space).split():
                stemmed = stemmer.stem(piece)
                if stemmed not in stop_words:
                    kept.append(stemmed)
            # Emoji names are appended after the text they were attached to.
            kept.extend(emoji.demojize(symbol) for symbol in emojis)
        newCorpus.append("".join(word + " " for word in kept))
    print("Corpus preprocessed, starting classification...")
    return newCorpus
def main():
    """Interactively build a Laplace-smoothed unigram model and save it.

    The user is prompted, in order, for:

    1. the corpus file (one document per line),
    2. the vocabulary file: a header line whose fourth space-separated field
       is the number of vocabulary words, followed by one word per line,
    3. whether to run ``preprocessCorpus`` on the corpus (``y`` to do so),
    4. the minimum count ``k`` below which a word is folded into ``<UNK>``,
    5. the output file for the model.

    Words that are not in the vocabulary are counted as ``<UNK>``, so the
    vocabulary must contain an ``<UNK>`` entry whenever such words (or words
    rarer than ``k``) occur; otherwise a ``KeyError`` is raised.

    The output file holds the document count, the corpus word count and, per
    vocabulary word, its count and the natural log of
    ``(count + 1) / (corpus words + vocabulary size)``.
    """
    filename = input("Enter the name of the corpus file: ")
    with open(filename, "r") as file:
        texto = file.read()
    filename = input("Enter the name of the vocabulary file: ")
    with open(filename, "r") as file:
        vocabulario = file.read().split("\n")

    numeroCorreos = len(texto.split("\n"))
    # The header's fourth space-separated field is the vocabulary size.
    numeroPalabrasVocabulario = int(vocabulario[0].split(" ")[3])
    entradas = vocabulario[1:]
    # Indexing (rather than slicing) keeps the IndexError when the header
    # announces more words than the file actually lists.
    palabras = {entradas[i]: 0 for i in range(numeroPalabrasVocabulario)}

    preprocess = input("Do you want to preprocess the corpus? (y/n): ")
    if preprocess == "y":
        documentos = preprocessCorpus(texto)
    else:
        documentos = texto.split("\n")

    numeroPalabrasCorpus = 0
    for documento in documentos:
        # split() never yields empty or blank tokens, so no extra filtering.
        for word in documento.split():
            numeroPalabrasCorpus += 1
            if word in palabras:
                palabras[word] += 1
            else:
                palabras["<UNK>"] += 1

    # Fold words seen fewer than k times into <UNK>. <UNK> itself is skipped:
    # folding it into itself would double and then zero the counts it has
    # already collected.
    k = int(input("Enter the minimum number of times a word must appear: "))
    for word in palabras:
        if word != "<UNK>" and palabras[word] < k:
            palabras["<UNK>"] += palabras[word]
            palabras[word] = 0

    # Laplace smoothing: every word gets one extra occurrence, and the
    # denominator grows by the vocabulary size.
    denominador = numeroPalabrasCorpus + numeroPalabrasVocabulario
    frecuencias = {word: (cuenta + 1) / denominador for word, cuenta in palabras.items()}

    # Save the model to a file.
    filename = input("Enter the name of the file to save the model: ")
    with open(filename, "w") as file:
        file.write("Numero de documentos del corpus: " + str(numeroCorreos) + "\n")
        file.write("Numero de palabras del corpus: " + str(numeroPalabrasCorpus) + "\n")
        for word, frecuencia in frecuencias.items():
            # TODO: confirm whether "Frec" should be the raw count (as written
            # here) or the relative frequency.
            file.write("Palabra: " + word + " Frec: " + str(palabras[word]) + " LogProb: " + str(math.log(frecuencia)) + "\n")
    print("Created " + filename + " file")


# Run the interactive trainer only when executed as a script.
if __name__ == "__main__":
    main()