app.py
from flask import Flask, render_template, request, redirect, send_file
from werkzeug.utils import secure_filename
from sklearn.preprocessing import MultiLabelBinarizer
import pickle
import sys
import os

# Data processing
import pandas as pd
from collections import Counter

# Language processing
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk

sys.path.append('/Users/wolfsinem/product-tagging')
from product_tagging.tags_generator import tokenized_list

N = 5000
MODEL = tokenized_list()

# Preprocessing: binarize the multi-label target variable
model_df = MODEL[:N]
target_variable = model_df['tags']
mlb = MultiLabelBinarizer()
target_variable = mlb.fit_transform(target_variable)
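
# Illustrative sketch (tag values here are made up, not from the dataset):
# MultiLabelBinarizer turns each product's tag list into a row of a binary
# indicator matrix, one column per distinct tag, e.g.
#
#   MultiLabelBinarizer().fit_transform([['shirt', 'cotton'], ['shirt']])
#   # -> array([[1, 1],
#   #           [0, 1]])  with classes_ == ['cotton', 'shirt']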

# Open our classifier and vectorizer pickle files
with open('/Users/wolfsinem/product-tagging/data/classifier2', 'rb') as training_model:
    model = pickle.load(training_model)

with open('/Users/wolfsinem/product-tagging/data/vect2', 'rb') as tfvectorizer:
    vectorizer = pickle.load(tfvectorizer)
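
# How these pickles were produced is not shown in this file; a plausible
# sketch (TF-IDF features plus a one-vs-rest classifier -- an assumption,
# not the confirmed training script) would look like:
#
#   from sklearn.feature_extraction.text import TfidfVectorizer
#   from sklearn.linear_model import LogisticRegression
#   from sklearn.multiclass import OneVsRestClassifier
#
#   vectorizer = TfidfVectorizer()
#   X = vectorizer.fit_transform(model_df['description'])
#   model = OneVsRestClassifier(LogisticRegression()).fit(X, target_variable)
#   with open('/Users/wolfsinem/product-tagging/data/vect2', 'wb') as f:
#       pickle.dump(vectorizer, f)
#   with open('/Users/wolfsinem/product-tagging/data/classifier2', 'wb') as f:
#       pickle.dump(model, f)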

# Initializing the Flask app
app = Flask(__name__)
app.config.from_object("config.DevelopmentConfig")


def allowed_file(filename):
    """Limit uploads to the file formats listed in the app config, so the
    user can't upload arbitrary file types.

    :param filename: name of the uploaded file.
    :type filename: string.
    """
    if "." not in filename:
        return False
    ext = filename.rsplit(".", 1)[1]
    return ext.upper() in app.config["ALLOWED_FILE_EXTENSIONS"]
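
# Example, assuming ALLOWED_FILE_EXTENSIONS in config contains "CSV":
#   allowed_file("products.csv")   # -> True
#   allowed_file("products.exe")   # -> False
#   allowed_file("no_extension")   # -> False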


def tokenize_user_text_input(sentence, size_tags=10):
    """Split a string into substrings using a regular expression via NLTK's
    RegexpTokenizer. Additionally, count the occurrence of each word and
    return the top ``size_tags`` words, which can be used as tags.

    :param sentence: text description of a product.
    :type sentence: string.
    :param size_tags: number of tags to return (default 10).
    :type size_tags: int.
    """
    tokenizer = nltk.RegexpTokenizer(r"\w+")
    new_words = tokenizer.tokenize(str(sentence))
    new_words = [token.lower() for token in new_words]
    stop_words = set(stopwords.words('english'))
    filter_tokens = [w for w in new_words if w not in stop_words]
    count_terms = Counter(filter_tokens).most_common(size_tags)
    token_lists = [term for term, _ in count_terms if not term.isdigit()]
    return token_lists
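
# Illustrative example (ties in the counts keep their order of appearance):
#   tokenize_user_text_input("The red shirt. A red cotton shirt!", 3)
#   # -> ['red', 'shirt', 'cotton']  (stop words and digits are removed)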


def lemma_tag(sentence, tags_size):
    """Lemmatize the tokenized words. Lemmatization, unlike stemming, reduces
    inflected words properly, ensuring that the root word belongs to the
    language. See:
    https://www.datacamp.com/community/tutorials/stemming-lemmatization-python

    Lemmatization thus reduces the number of duplicates in a set of tags:
    words like 'weight' and 'weights' are considered the same and saved as
    'weight'. In addition, a few other conditions clean the set of tags.

    :param sentence: a single sentence, e.g. a product description.
    :type sentence: string.
    :param tags_size: maximum number of tags to return.
    :type tags_size: int.
    """
    lemmatizer = WordNetLemmatizer()
    lemm_set = [lemmatizer.lemmatize(word)
                for word in tokenize_user_text_input(sentence)]
    lemm_set = list(set(lemm_set))
    # Drop '-ing' words and single characters
    lemm_set = [x for x in lemm_set if not x.endswith("ing")]
    return [i for i in lemm_set if len(i) > 1][:tags_size]
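
# Illustrative example; note that set() makes the tag order non-deterministic
# and the '-ing' filter removes gerunds:
#   lemma_tag("Adjustable weights and heavy training weights", 5)
#   # -> e.g. ['weight', 'adjustable', 'heavy']  ('training' is filtered out)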


def extend_df(df, tags_size):
    """Extend the original dataframe with an extra column 'tags', using both
    lemma_tag() and tokenize_user_text_input() to tokenize and clean the set
    of tags.

    :param df: the original dataframe imported by the user.
    :type df: pandas.DataFrame.
    :param tags_size: maximum number of tags per row.
    :type tags_size: int.
    """
    for i in df.index:
        df.at[i, 'tags'] = lemma_tag(df.loc[i]['description'], tags_size)
    return df
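
# Illustrative example with a hypothetical one-row dataframe:
#   df = pd.DataFrame({'product_name': ['Dumbbell'],
#                      'description': ['Adjustable weights and heavy training weights'],
#                      'tags': ['']})
#   extend_df(df, 5)  # fills 'tags', e.g. with ['weight', 'adjustable', 'heavy']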


@app.route('/', methods=['POST', 'GET'])
def index():
    """Set up the main route."""
    return render_template('index.html')


@app.route('/predict', methods=['POST', 'GET'])
def predict():
    """The persisted model is loaded into memory when the application starts;
    this endpoint takes the input variables, transforms them into the
    appropriate format and returns predictions: a set of tags.
    """
    if request.method == 'POST':
        user_input_string = request.form.values()
        user_input_string = vectorizer.transform(user_input_string).toarray()
        label = model.predict(user_input_string)
        generated_tags = mlb.inverse_transform(label)
        return render_template("index.html", generated_tags=generated_tags)
    return render_template("index.html")
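
# A hedged command-line example (the form field name is an assumption based
# on the template, which is not shown here; the endpoint reads all form values):
#   curl -X POST -d "product_description=Adjustable heavy weights" \
#        http://127.0.0.1:5000/predict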


@app.route('/warning', methods=['POST', 'GET'])
def send_warning():
    """Send the warning template to the user."""
    return render_template('warning.html')


@app.route('/read_csv', methods=['POST', 'GET'])
def read_csv():
    """API endpoint that takes an uploaded CSV file, transforms it into the
    appropriate format and returns a new CSV file with a new column: tags.
    """
    if request.method == "POST":
        user_input_size = request.form.get('tags_size')
        if user_input_size:
            tags_size = int(user_input_size)  # + 1
        else:
            tags_size = 20

        if request.files:
            file = request.files["file"]

            if file.filename == "":
                print("There is no file")
                return redirect(request.url)

            if allowed_file(file.filename):
                filename = secure_filename(file.filename)
                file_path = os.path.join(app.config['FILE_UPLOADS'], filename)
                file.save(file_path)
                print('The uploaded file {} has been saved into the directory'.format(filename))

                df = pd.read_csv(file_path)
                df.dropna(inplace=True)
                df.drop_duplicates(subset=['description'], inplace=True)

                if 'product_name' in df.columns and 'description' in df.columns:
                    model_df = df[['product_name', 'description']]
                    pd.options.mode.chained_assignment = None
                    model_df['tags'] = ""

                    file_path = os.path.join(app.config['FILE_UPLOADS'], "extended-" + filename)
                    extended_dataset = extend_df(model_df, tags_size)
                    extended_dataset.to_csv(file_path)
                    print('The transformed file {} has been saved into the directory'.format("extended-" + filename))

                    # return redirect('/uploads/' + filename)
                    return render_template("download.html", value="extended-" + filename)
            else:
                print("File {} has no valid file format".format(file.filename))
                return send_warning()

    return render_template("upload_csv.html")
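
# The uploaded CSV is expected to contain at least these columns, e.g.:
#   product_name,description
#   Dumbbell,"Adjustable weights and heavy training weights"
# The extended file gains a 'tags' column and is saved as 'extended-<filename>'.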


@app.route('/uploads/<filename>', methods=['GET'])
def download_csv(filename):
    """read_csv() has already read and transformed the user's CSV file;
    this route renders the download page for the new, extended CSV file.
    """
    return render_template('download.html', value="extended-" + filename)


@app.route('/return-files/<filename>')
def return_files(filename):
    """Send the requested file from the upload directory as a download."""
    file_path = os.path.join(app.config["FILE_UPLOADS"], filename)
    return send_file(file_path, as_attachment=True)


@app.route('/generated', methods=['POST', 'GET'])
def generate_tags():
    """This tag generator doesn't use any machine learning to create tags
    from the text input. It only uses the NLTK library.
    """
    if request.method == 'POST':
        user_input_size = request.form.get('tags_size')
        if user_input_size:
            tags_size = int(user_input_size)  # + 1
        else:
            tags_size = 20

        user_input_string = request.form.get('product_description')
        tags_set = lemma_tag(user_input_string, tags_size)
        return render_template("algorithm.html", tags_set=tags_set)
    return render_template("algorithm.html")


@app.route('/extended_guide', methods=['POST', 'GET'])
def guide():
    """Serve the extended guide with more information on uploading a CSV."""
    return render_template('extended_guide.html')


if __name__ == "__main__":
    app.run(debug=True)