utils.py
import pickle
import re
from string import punctuation

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.tokenize import ToktokTokenizer

import en_core_web_sm

# spaCy English model, used for POS tagging and lemmatization
nlp = en_core_web_sm.load()

# NLTK's "stopwords" corpus must have been downloaded beforehand
stop_words = set(stopwords.words("english"))
token = ToktokTokenizer()
punct = punctuation

def avg_jaccard(y_true, y_pred):
    '''Compute the Jaccard similarity coefficient of each instance and
    return their average as a percentage.

    Parameters:
        y_true: true labels (binary indicator matrix)
        y_pred: predicted labels (binary indicator matrix)
    '''
    jaccard = np.minimum(y_true, y_pred).sum(axis=1) / \
        np.maximum(y_true, y_pred).sum(axis=1)
    return jaccard.mean() * 100
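
# Illustrative sketch, not part of the original module: a tiny sanity check of
# avg_jaccard on hand-built binary label matrices.
def _demo_avg_jaccard():
    y_true = np.array([[1, 0, 1],
                       [1, 1, 0]])
    y_pred = np.array([[1, 0, 1],
                       [1, 0, 0]])
    # The first row matches exactly (Jaccard 1.0); the second overlaps on one
    # of its two true labels (Jaccard 0.5), so the averaged score is 75.0.
    assert avg_jaccard(y_true, y_pred) == 75.0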

def clean_text(text):
    '''Lower-case the text and remove undesirable marks.

    Parameter:
        text: document to be cleaned
    '''
    text = text.lower()
    text = re.sub(r"\'\n", " ", text)
    text = re.sub(r"\'\xa0", " ", text)
    text = re.sub(r'\s+', ' ', text)  # collapse all whitespace runs into a single space
    text = text.strip(' ')
    return text
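
# Illustrative sketch, not part of the original module: clean_text lower-cases
# the input and collapses whitespace runs into single spaces.
def _demo_clean_text():
    raw = "  How do I  parse\nJSON in Python?  "
    assert clean_text(raw) == "how do i parse json in python?"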

def strip_list_noempty(mylist):
    '''Strip whitespace from each string item and drop empty items.'''
    newlist = (item.strip() if hasattr(item, 'strip')
               else item for item in mylist)
    return [item for item in newlist if item != '']

def clean_punct(text, top_tags):
    '''Remove all punctuation from the text, unless it is part of an important
    tag (e.g. c++, c#).

    Parameters:
        text: document to remove punctuation from
        top_tags: tags to keep verbatim
    '''
    words = token.tokenize(text)
    punctuation_filtered = []
    regex = re.compile('[%s]' % re.escape(punct))
    for w in words:
        if w in top_tags:
            punctuation_filtered.append(w)
        else:
            w = re.sub('^[0-9]*', " ", w)  # drop leading digits
            punctuation_filtered.append(regex.sub('', w))
    filtered_list = strip_list_noempty(punctuation_filtered)
    return ' '.join(map(str, filtered_list))
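
# Illustrative sketch, not part of the original module: punctuation is removed
# except inside whitelisted tags such as "c++" or "c#" (the exact result
# depends on how ToktokTokenizer splits the input). The tag list is made up.
def _demo_clean_punct():
    top_tags = ['c++', 'c#', 'python']
    return clean_punct("Is c++ faster than python?", top_tags)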

def stopWordsRemove(text):
    '''Remove all English stop words from a document.

    Parameter:
        text: document to remove stop words from
    '''
    words = token.tokenize(text)
    filtered = [w for w in words if w not in stop_words]
    return ' '.join(map(str, filtered))
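
# Illustrative sketch, not part of the original module: with NLTK's standard
# English stop word list, words such as "what", "is", "the" and "to" are
# dropped, so the call below should return roughly 'best way learn pandas'.
def _demo_stop_words_remove():
    return stopWordsRemove("what is the best way to learn pandas")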

def lemmatization(texts, allowed_postags, top_tags, stop_words=stop_words):
    '''Keep the lemma of each word (the uninflected form, as given by spaCy)
    and drop tokens whose POS tag is not allowed.

    Parameters:
        texts (str): text to lemmatize
        allowed_postags (list): allowed POS tags, e.g. NOUN, ADJ, VERB, ADV
        top_tags (list): tags kept verbatim, whatever their POS tag
    '''
    doc = nlp(texts)
    texts_out = []
    for tok in doc:
        if str(tok) in top_tags:
            texts_out.append(str(tok))
        elif tok.pos_ in allowed_postags:
            if tok.lemma_ not in ['-PRON-']:
                texts_out.append(tok.lemma_)
            else:
                texts_out.append('')
    return ' '.join(texts_out)
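
# Illustrative sketch, not part of the original module: only tokens whose spaCy
# POS tag is allowed (or that appear in top_tags) are kept, reduced to their
# lemma. The sentence and tag list below are made up for the example.
def _demo_lemmatization():
    return lemmatization("pandas dataframes are merging quickly",
                         allowed_postags=['NOUN', 'VERB', 'ADV'],
                         top_tags=['pandas'])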

def pred_nwords_unsupervised(text, tfidf, lda, n_words):
    '''Recommend n_words tags by detecting latent topics in a corpus.

    Parameters:
        text: cleaned text (pd.Series) on which recommendations are based
        tfidf: fitted tf-idf transformer
        lda: fitted LDA model
        n_words: number of words retrieved
    '''
    document_tfidf = tfidf.transform(text)
    proba_topic_sachant_document = lda.transform(document_tfidf)
    words_label = list(tfidf.get_feature_names())
    proba_word_sachant_topic = lda.components_ / \
        lda.components_.sum(axis=1)[:, np.newaxis]  # normalization
    # proba_topic_sachant_document has shape d x t (documents x topics) and
    # proba_word_sachant_topic has shape t x w (topics x words), so their
    # matrix product gives proba(word) for each document: a
    # proba_word_sachant_document matrix of shape d x w, from which the
    # "n_words" most probable words are then picked.
    proba_word_sachant_document = proba_topic_sachant_document.dot(
        proba_word_sachant_topic)
    # Turn the matrix into a dataframe:
    #   data    = the word probabilities for each document
    #   index   = the index of the input data
    #   columns = the labels of the words selected as LDA output
    df_wd = pd.DataFrame(data=proba_word_sachant_document,
                         index=text.index,
                         columns=words_label)
    values = df_wd.columns.values[np.argsort(
        -df_wd.values, axis=1)[:, :n_words]]
    values = [", ".join(item) for item in values.astype(str)]
    pred_unsupervised = pd.DataFrame(values,
                                     index=df_wd.index,
                                     columns=['Unsupervised'])
    return pred_unsupervised
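
# Illustrative sketch, not part of the original module: fit a toy TF-IDF / LDA
# pipeline with scikit-learn and ask for the 3 most probable words per
# document. The corpus is made up, and the helper above relies on
# tfidf.get_feature_names(), so this assumes a scikit-learn release that still
# provides that method (it was later replaced by get_feature_names_out).
def _demo_pred_nwords_unsupervised():
    from sklearn.decomposition import LatentDirichletAllocation
    from sklearn.feature_extraction.text import TfidfVectorizer

    corpus = pd.Series(["python pandas dataframe merge",
                        "java spring boot rest api",
                        "python list comprehension loop"])
    tfidf = TfidfVectorizer().fit(corpus)
    lda = LatentDirichletAllocation(n_components=2, random_state=0).fit(
        tfidf.transform(corpus))
    return pred_nwords_unsupervised(corpus, tfidf, lda, n_words=3)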

def recommend_tags(text_ori, n_words, seuil=0.5, clean=False):
    '''Recommendation system for StackOverflow posts, based on an unsupervised
    AND a supervised model, which returns up to 5 tags.

    Parameters:
        text_ori: the user's StackOverflow post (str or pd.Series)
        n_words: number of tags to recommend
        seuil: decision threshold for the supervised model
        clean: True if data preparation is needed
    '''
    # Load the pickled artifacts (vectorizer, LDA, classifier, binarizer, tags)
    with open('vectorizer_dfText.pkl', 'rb') as f:
        vectorizer_dfText = pickle.load(f)
    with open('lda_model.pkl', 'rb') as f:
        best_lda = pickle.load(f)
    with open('lr_ovr.pkl', 'rb') as f:
        lr_ovr = pickle.load(f)
    with open('multilabel_binarizer.pkl', 'rb') as f:
        multilabel_binarizer = pickle.load(f)
    with open('top_tags.pkl', 'rb') as f:
        top_tags = pickle.load(f)
    if isinstance(text_ori, (str, pd.Series)):
        if isinstance(text_ori, str):
            text_ori = pd.Series(text_ori)
        text = text_ori
        text_ori = text_ori.rename("Original text")
        text = text.rename("Modified text")
    else:
        return 'Type should be str or pd.Series'
    if clean:
        text = text.apply(lambda s: clean_text(s))
        text = text.apply(lambda s: BeautifulSoup(s, features="lxml").get_text())
        text = text.apply(lambda s: clean_punct(s, top_tags))
        text = text.apply(lambda s: stopWordsRemove(s))
        text = text.apply(lambda s: lemmatization(s, ['NOUN', 'ADV'], top_tags))
    pred_unsupervised = pred_nwords_unsupervised(
        text, vectorizer_dfText, best_lda, n_words)
    pred_supervised = pd.DataFrame(lr_ovr.predict_proba(vectorizer_dfText.transform(
        text))).applymap(lambda x: 1 if x > seuil else 0).to_numpy()
    pred_supervised = pd.Series(multilabel_binarizer.inverse_transform(
        pred_supervised), name='Supervised', index=text.index)
    pred_supervised = pred_supervised.apply(lambda row: ', '.join(row))
    result = pd.concat(
        [pred_supervised, pred_unsupervised, text_ori, text], axis=1)
    return result
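
# Illustrative sketch, not part of the original module: recommend_tags expects
# the five pickled artifacts loaded above (vectorizer_dfText.pkl, lda_model.pkl,
# lr_ovr.pkl, multilabel_binarizer.pkl, top_tags.pkl) to sit in the working
# directory. The post text below is made up for the example.
if __name__ == '__main__':
    post = "How do I merge two dataframes in pandas using python?"
    print(recommend_tags(post, n_words=5, seuil=0.5, clean=True))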