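"""Train a binary sentiment classifier on the aclImdb movie-review dataset.

Pipeline: NLTK preprocessing (tokenization, lemmatization, stop-word removal)
-> TF-IDF features -> dense Keras network, evaluated with stratified K-fold
cross-validation. The best fold's model and fitted vectorizer are saved to disk.
"""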
import os

import joblib
import matplotlib.pyplot as plt
import nltk
import numpy as np
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import StratifiedKFold
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l2

# Download the NLTK resources used below (no-op if already present)
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

class SentimentAnalysisModel:
    def __init__(self, train_directory, model_file_path, vectorizer_file_path):
        # Initialize class variables
        self.train_directory = train_directory
        self.reviews_train = []
        self.labels_train = []
        self.stop_words = set(stopwords.words('english'))
        self.lemmatizer = WordNetLemmatizer()
        self.max_features_selected = 0
        self.best_model_max_features_selected = 0
        self.vocab_size = 0
        self.best_model_vocab_size = 0
        self.vectorizer = None
        self.best_model_vectorizer = None
        self.model = None
        self.best_model = None
        self.model_history = None
        self.best_model_history = None
        self.model_file_path = model_file_path
        self.vectorizer_file_path = vectorizer_file_path
    def load_dataset(self, directory):
        # Load the reviews and binary labels (1 = positive, 0 = negative)
        # from the pos/ and neg/ subdirectories of the given directory
        reviews = []
        labels = []
        for sentiment in ['pos', 'neg']:
            sentiment_directory = os.path.join(directory, sentiment)
            for file_name in os.listdir(sentiment_directory):
                with open(os.path.join(sentiment_directory, file_name), 'r', encoding='utf-8') as file:
                    review = file.read()
                reviews.append(review)
                labels.append(1 if sentiment == 'pos' else 0)
        return reviews, labels
    def preprocess_reviews(self, reviews):
        # Preprocess the reviews: tokenize, lowercase, lemmatize, and remove stop words
        processed_reviews = []
        for review in reviews:
            tokens = word_tokenize(review)
            filtered_tokens = [self.lemmatizer.lemmatize(token.lower())
                               for token in tokens if token.lower() not in self.stop_words]
            processed_reviews.append(' '.join(filtered_tokens))
        print("Sample review before processing:")
        print(reviews[0])
        print("Sample review after processing:")
        print(processed_reviews[0])
        return processed_reviews
    def set_vocab_size(self, reviews):
        # Count the unique tokens across the given reviews
        unique_words = set()
        for review in reviews:
            unique_words.update(word_tokenize(review))
        self.vocab_size = len(unique_words)
    def fit_vectorizer(self, X_train):
        # Fit the TfidfVectorizer on the training data, keeping the top 20% of the
        # vocabulary as features (an 80/20 "Pareto principle" heuristic)
        self.set_vocab_size(X_train)
        self.max_features_selected = (20 * self.vocab_size) // 100
        self.vectorizer = TfidfVectorizer(max_features=self.max_features_selected)
        self.vectorizer.fit(X_train)
        print("Vocabulary size of processed data:", self.vocab_size)
        print("Maximum features selected for TF-IDF vectorizer:", self.max_features_selected)
    def train_classifier(self, X_train_tfidf, y_train, X_val_tfidf, y_val):
        # Build and train the network: three L2-regularized ReLU layers with
        # dropout, then a sigmoid output for binary sentiment classification
        self.model = Sequential()
        l2_penalty = 0.01
        self.model.add(Dense(512, input_dim=X_train_tfidf.shape[1], activation='relu',
                             kernel_regularizer=l2(l2_penalty)))
        self.model.add(Dropout(0.2))
        self.model.add(Dense(512, activation='relu', kernel_regularizer=l2(l2_penalty)))
        self.model.add(Dropout(0.2))
        self.model.add(Dense(64, activation='relu', kernel_regularizer=l2(l2_penalty)))
        self.model.add(Dense(1, activation='sigmoid'))
        self.model.summary()
        self.model.compile(loss='binary_crossentropy', optimizer=Adam(), metrics=['accuracy'])
        self.model_history = self.model.fit(X_train_tfidf.toarray(), np.array(y_train),
                                            epochs=20, batch_size=2048,
                                            validation_data=(X_val_tfidf.toarray(), np.array(y_val)))
    def train_and_save_model(self, num_folds=5):
        # Load and preprocess the dataset, then train one model per fold of a
        # stratified K-fold split, keeping the fold with the best validation accuracy
        self.reviews_train, self.labels_train = self.load_dataset(self.train_directory)
        self.reviews_train = self.preprocess_reviews(self.reviews_train)
        skf = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=42)
        val_accuracies = []
        best_val_accuracy = 0
        for train_idx, val_idx in skf.split(self.reviews_train, self.labels_train):
            X_train_fold, X_val_fold = np.array(self.reviews_train)[train_idx], np.array(self.reviews_train)[val_idx]
            y_train_fold, y_val_fold = np.array(self.labels_train)[train_idx], np.array(self.labels_train)[val_idx]
            self.fit_vectorizer(X_train_fold)
            X_train_tfidf = self.vectorizer.transform(X_train_fold)
            X_val_tfidf = self.vectorizer.transform(X_val_fold)
            self.train_classifier(X_train_tfidf, y_train_fold, X_val_tfidf, y_val_fold)
            val_accuracy = round(self.model_history.history['val_accuracy'][-1] * 100, 2)
            val_accuracies.append(val_accuracy)
            # Keep the model with the best validation accuracy so far
            if val_accuracy > best_val_accuracy:
                best_val_accuracy = val_accuracy
                self.best_model = self.model
                self.best_model_vectorizer = self.vectorizer
                self.best_model_history = self.model_history.history
                self.best_model_vocab_size = self.vocab_size
                self.best_model_max_features_selected = self.max_features_selected
        print("Vocabulary size of best model:", self.best_model_vocab_size)
        print("Maximum features selected for TF-IDF vectorizer in best model:", self.best_model_max_features_selected)
        for index, val_accuracy in enumerate(val_accuracies):
            print(f"Fold {index + 1} accuracy = {val_accuracy}")
        print("Best validation accuracy:", best_val_accuracy)
        print("Average validation accuracy:", round(sum(val_accuracies) / len(val_accuracies), 2))
        print("Saving the best model and vectorizer")
        self.save_model_and_vectorizer()
    def save_model_and_vectorizer(self):
        # Persist the best model and its fitted vectorizer to disk
        self.best_model.save(self.model_file_path)
        joblib.dump(self.best_model_vectorizer, self.vectorizer_file_path)
    def plot_training_and_validation_metrics(self):
        # Plot the best model's training and validation accuracy and loss per epoch
        plt.figure(figsize=(12, 6))
        epochs = range(1, len(self.best_model_history['accuracy']) + 1)
        plt.subplot(1, 2, 1)
        plt.plot(epochs, self.best_model_history['accuracy'], label='Training Accuracy')
        plt.plot(epochs, self.best_model_history['val_accuracy'], label='Validation Accuracy')
        plt.xlabel('Epochs')
        plt.ylabel('Accuracy')
        plt.title('Best Model Training and Validation Accuracy')
        plt.legend()
        plt.subplot(1, 2, 2)
        plt.plot(epochs, self.best_model_history['loss'], label='Training Loss')
        plt.plot(epochs, self.best_model_history['val_loss'], label='Validation Loss')
        plt.xlabel('Epochs')
        plt.ylabel('Loss')
        plt.title('Best Model Training and Validation Loss')
        plt.legend()
        plt.tight_layout()
        plt.show()

def main():
    # Train and evaluate the model, then plot the best fold's learning curves
    train_directory = "./data/aclImdb/train/"             # training data directory
    model_file_path = "./models/NLP_model.h5"             # model file path to save
    vectorizer_file_path = "./models/vectorizer.joblib"   # vectorizer file path to save
    model = SentimentAnalysisModel(train_directory, model_file_path, vectorizer_file_path)
    model.train_and_save_model(num_folds=5)
    model.plot_training_and_validation_metrics()


if __name__ == '__main__':
    main()
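
# --- Inference sketch (not part of training; shown for reference) ---
# A minimal, hypothetical way to reload the artifacts this script saves and
# score a new review; it mirrors the `best_model.save` and `joblib.dump`
# calls above. Paths and the example review text are assumptions.
#
#   from tensorflow.keras.models import load_model
#   model = load_model("./models/NLP_model.h5")
#   vectorizer = joblib.load("./models/vectorizer.joblib")
#   X = vectorizer.transform(["what a great movie"]).toarray()
#   print(model.predict(X))  # sigmoid output; > 0.5 suggests positive sentiment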