-
Notifications
You must be signed in to change notification settings - Fork 0
/
clasificador.py
152 lines (143 loc) · 5.05 KB
/
clasificador.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
import string
import nltk
import emoji
import re
#import enchant
#nltk.download('stopwords') # Uncomment if this is the first time the script is run
#nltk.download('punkt') # Uncomment if this is the first time the script is run
#nltk.download('wordnet') # Uncomment if this is the first time the script is run
from nltk import word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
def process_text(text, stem=True):
    """Tokenize *text* and, unless *stem* is false, Porter-stem each token.

    Args:
        text: String handed to ``word_tokenize``.
        stem: When true (the default) every token is passed through a
            ``PorterStemmer``; when false the tokens are returned untouched.

    Returns:
        The list of (optionally stemmed) tokens.
    """
    words = word_tokenize(text)
    if not stem:
        return words
    porter = PorterStemmer()
    return [porter.stem(word) for word in words]
def preprocessCorpus(corpus):
    """Clean a newline-separated corpus of emails before classification.

    Each line of *corpus* is treated as one email. Every whitespace-separated
    word of an email is handled as follows:

    * the word is dropped if it starts with a control character, a URL or an
      HTML tag (the patterns are anchored at the start of the word);
    * emojis are removed from the word and re-emitted afterwards as the text
      produced by ``emoji.demojize``;
    * what is left has every ``string.punctuation`` character replaced by a
      space, each resulting piece is Porter-stemmed, and pieces whose stem is
      a (stemmed) English stop word are discarded.

    Args:
        corpus: The whole corpus as one string, one email per line.

    Returns:
        A list with one cleaned string per input line, tokens joined by a
        single space. Blank lines yield empty strings, so positions line up
        with the input lines.
    """
    # Compiled once per call instead of being looked up for every word.
    url_regex = re.compile("https?:\\/\\/(?:www\\.)?[-a-zA-Z0-9@:%._\\+~#=]{1,256}\\.[a-zA-Z0-9()]{1,6}\\b(?:[-a-zA-Z0-9()@:%_\\+.~#?&\\/=]*)")
    html_tag_regex = re.compile("<(?:\"[^\"]*\"['\"]*|'[^']*'['\"]*|[^'\">])+>")
    control_characters_regex = re.compile("[\x00-\x1F\x7F]")
    translate_table = {ord(char): " " for char in string.punctuation}

    print("Preprocessing corpus...")
    stemmer = PorterStemmer()
    # A set gives O(1) membership tests; the entries are stems, so lookups
    # below must be made with stems as well.
    stop_words = {process_text(w)[0] for w in stopwords.words('english')}

    newCorpus = []
    for email in corpus.split("\n"):
        tokens = []
        for word in email.split():
            if (control_characters_regex.match(word)
                    or url_regex.match(word)
                    or html_tag_regex.match(word)):
                continue

            # Pull emojis out first so their demojized names (which contain
            # punctuation such as ':' and '_') are not touched by the
            # punctuation stripping below.
            emojis = [found["emoji"] for found in emoji.emoji_list(word)]
            for symbol in emojis:
                word = word.replace(symbol, "")

            # Punctuation is stripped here, after the URL/HTML filters,
            # because those filters rely on characters like ':' '/' '<' '>'.
            # Previously the translation result was discarded, so punctuation
            # was never removed at all.
            for piece in word.translate(translate_table).split():
                stemmed = stemmer.stem(piece)
                # Compare the stem against the stemmed stop words; testing the
                # raw word missed every stop word whose stem differs from it.
                if stemmed not in stop_words:
                    tokens.append(stemmed)

            tokens.extend(emoji.demojize(symbol) for symbol in emojis)
        # Joining avoids the trailing/double spaces that turned into empty
        # tokens when the result was later split on single spaces.
        newCorpus.append(" ".join(tokens))

    print("Corpus preprocessed, starting classification...")
    return newCorpus
def _load_language_model(filename):
    """Read a language-model file and return ``(num_emails, word_scores)``.

    The first line must contain ``:<integer>`` as its second colon-separated
    field (the number of emails behind the model); the second line is
    skipped. Every later non-empty line is split on ``:``: the word is the
    second space-separated piece of the second field and its score is the
    last field parsed as a float.
    """
    with open(filename, "r") as file:
        lines = file.read().split("\n")
    num_emails = int(lines[0].split(":")[1])
    word_scores = {}
    for line in lines[2:]:
        if line == "":
            continue
        fields = line.split(":")
        word_scores[fields[1].split(" ")[1]] = float(fields[-1])
    return num_emails, word_scores


def _log_prior(class_count, total_count):
    """Return ``log(class_count / total_count)``, or ``-inf`` for an empty class."""
    import math

    if class_count <= 0:
        return float("-inf")
    return math.log(class_count / total_count)


def _score_email(tokens, word_scores, log_prior):
    """Sum the per-word scores of *tokens* and add the class log-prior.

    Words missing from *word_scores* use the score stored under ``"<UNK>"``
    (a ``KeyError`` is raised if that entry is needed but absent).
    """
    total = 0
    for token in tokens:
        score = word_scores.get(token)
        if score is None:
            score = word_scores["<UNK>"]
        total += score
    return total + log_prior


def main():
    """Interactively classify a corpus of emails as safe (S) or phishing (P).

    Prompts for the phishing and safe language-model files, the corpus file
    (one email per line), whether to run ``preprocessCorpus`` on it, and the
    output file name. For every email it sums the per-word model scores for
    each class, adds the class log-prior, and labels the email ``S`` when the
    safe score is strictly greater, otherwise ``P``.

    Two files are written: the output file with lines
    ``<first 10 chars>,<safe score>,<phishing score>,<label>`` (scores
    rounded to 2 decimals) and a ``resumen_``-prefixed file, placed next to
    it, holding just one label per line.

    Raises:
        ValueError: If both models report zero training emails.
    """
    import os

    phishingFilename = input("Enter the model of lenguage of the phishing emails: ")
    numOfPhishingEmails, wordProbPhishing = _load_language_model(phishingFilename)
    safeFilename = input("Enter the model of lenguage of the safe emails: ")
    numOfSafeEmails, wordProbSafe = _load_language_model(safeFilename)

    totalEmails = numOfPhishingEmails + numOfSafeEmails
    if totalEmails == 0:
        raise ValueError("Both language models report zero emails; cannot compute class priors")
    # The per-word scores are accumulated by addition, i.e. they are treated
    # as log-probabilities, so the prior has to be added in log space too.
    # Adding the raw probability (a value in [0, 1]) skewed the comparison.
    logPriorPhishing = _log_prior(numOfPhishingEmails, totalEmails)
    logPriorSafe = _log_prior(numOfSafeEmails, totalEmails)

    corpusName = input("Enter the corpus to classify: ")
    with open(corpusName, "r") as file:
        corpus = file.read()
    preprocess = input("Do you want to preprocess the corpus? (y/n): ")
    if preprocess == "y":
        corpus = preprocessCorpus(corpus)
    else:
        corpus = corpus.split("\n")
    print("Number of emails to classify: " + str(len(corpus)))

    classificationOfEmails = []
    for contador, email in enumerate(corpus, start=1):
        # split() without arguments never yields empty tokens; splitting on a
        # single space turned trailing/double spaces into "" words that were
        # each scored as <UNK>.
        tokens = email.split()
        probEmailSafe = _score_email(tokens, wordProbSafe, logPriorSafe)
        probEmailPhishing = _score_email(tokens, wordProbPhishing, logPriorPhishing)
        label = "S" if probEmailSafe > probEmailPhishing else "P"
        classificationOfEmails.append((email[:10], probEmailSafe, probEmailPhishing, label))
        if contador % 100 == 0:
            print("Email " + str(contador) + " of " + str(len(corpus)) + " classified")

    nameOfOutput = input("Enter the name of the output file: ")
    emailsP = 0
    emailsS = 0
    with open(nameOfOutput, "w") as file:
        for prefix, safeScore, phishingScore, label in classificationOfEmails:
            file.write(prefix + "," + str(round(safeScore, 2)) + "," + str(round(phishingScore, 2)))
            file.write("," + label)
            if label == "S":
                emailsS += 1
            else:
                emailsP += 1
            file.write("\n")
    print("The classification of the emails is in the file " + nameOfOutput)

    # Prefix only the file name, not the whole path, so an output such as
    # "out/result.csv" produces "out/resumen_result.csv".
    summaryName = os.path.join(
        os.path.dirname(nameOfOutput),
        "resumen_" + os.path.basename(nameOfOutput),
    )
    with open(summaryName, "w") as file:
        for _, _, _, label in classificationOfEmails:
            file.write(label + "\n")
    print("Number of emails classified as phishing: " + str(emailsP))
    print("Number of emails classified as safe: " + str(emailsS))


if __name__ == "__main__":
    main()