model.py
# Machine Learning - model training
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from product_tagging.tags_generator import tokenized_list

# Use the first n rows of the tokenized product data for training.
n = 2000
model_df = tokenized_list()
model_df = model_df[:n]

def preprocessing_df():
    """Preprocess the tokenized dataframe's 'tags' column.

    The tags are the target variable. MultiLabelBinarizer lets us encode
    multiple labels per instance: it fits on the label sets and transforms
    them into a binary indicator matrix.
    """
    target_variable = model_df['tags']
    mlb = MultiLabelBinarizer()
    target_variable = mlb.fit_transform(target_variable)
    return target_variable
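
# Illustrative sketch (not part of the training flow): MultiLabelBinarizer
# turns lists of tags into a binary indicator matrix with one column per tag.
#
#     mlb = MultiLabelBinarizer()
#     mlb.fit_transform([['red', 'shoes'], ['shoes']])
#     # -> array([[1, 1],
#     #           [0, 1]])   with mlb.classes_ == ['red', 'shoes']
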
def tfidfvec():
    """Vectorize the product descriptions with TfidfVectorizer.

    The vectorizer tokenizes the texts, learns the vocabulary and the
    inverse document frequency weights, and can then encode new texts.
    """
    vectorizer = TfidfVectorizer(
        strip_accents='unicode',
        analyzer='word',
        ngram_range=(1, 3),
        stop_words='english',
        token_pattern=r'\w{3,}'
    )
    product_description = model_df['description']
    independent_variable = vectorizer.fit_transform(product_description)
    return independent_variable
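
# Notes on the vectorizer settings (standard scikit-learn behaviour):
# ngram_range=(1, 3) builds features from unigrams, bigrams and trigrams;
# token_pattern=r'\w{3,}' keeps only tokens of three or more word characters;
# stop_words='english' drops common English words. A minimal sketch of the
# same idea on toy data (illustrative only):
#
#     vec = TfidfVectorizer(ngram_range=(1, 3), stop_words='english',
#                           token_pattern=r'\w{3,}')
#     X = vec.fit_transform(["red leather shoes", "blue cotton shirt"])
#     # X is a sparse matrix with one row per description.
#
# Because the fitted vectorizer is not returned here, unseen descriptions
# cannot be transformed later without refitting; returning `vectorizer`
# alongside the matrix would be a simple extension.
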
def traintestSplit(test_size=0.3, random_state=42):
    """Split the independent and target variables into train and test sets."""
    independent_variable = tfidfvec()
    target_variable = preprocessing_df()
    X_train, X_test, y_train, y_test = train_test_split(
        independent_variable,
        target_variable,
        test_size=test_size,
        random_state=random_state
    )
    return X_train, X_test, y_train, y_test
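
# With the defaults above, 70% of the rows are used for training and 30% are
# held out for evaluation; random_state fixes the shuffle so the split is
# reproducible.
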
def linearSVC_pipeline(random_state=42, tol=1e-1, C=8.385, n_jobs=-1):
    """Train a one-vs-rest LinearSVC on the TF-IDF features.

    The pipeline wraps LinearSVC in OneVsRestClassifier so that one binary
    classifier is fitted per tag. It is trained on the train split and
    evaluated on the held-out test split.
    """
    X_train, X_test, y_train, y_test = traintestSplit()
    Linear_pipeline = Pipeline([
        ('clf', OneVsRestClassifier(
            LinearSVC(
                class_weight='balanced',
                random_state=random_state,
                tol=tol,
                C=C
            ),
            n_jobs=n_jobs
        )),
    ])
    SVCpipeline = Linear_pipeline.fit(X_train, y_train)
    prediction = Linear_pipeline.predict(X_test)
    # For multilabel targets, accuracy_score is subset accuracy: a row only
    # counts as correct if every one of its tags is predicted exactly.
    accScore = accuracy_score(y_test, prediction)
    return SVCpipeline, prediction, accScore


SVCpipeline, prediction, accScore = linearSVC_pipeline()
print("Accuracy score with {} trained rows: {}".format(n, accScore))