# NLP_testing.py (forked from StAandrew/SNS_client)
import random
import json
import string
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from keras_preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences
from keras.layers import Embedding, LSTM, Dense, SpatialDropout1D
from keras.models import Sequential
import matplotlib.pyplot as plt
# Load intent definitions from the .json file
with open("intents.json") as f:
    intents = json.load(f)
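# Expected structure of intents.json, inferred from the loop below (the tag,
# patterns, and responses shown here are illustrative, not the actual file):
# {
#     "intents": [
#         {
#             "tag": "get_price",
#             "patterns": ["What price will NVDA be tomorrow", "..."],
#             "responses": ["..."]
#         }
#     ]
# }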
inputs = []
classes = []
responses = {}
# Collect patterns, tags, and responses from the intent definitions
for intent in intents["intents"]:
    responses[intent["tag"]] = intent["responses"]
    for pattern in intent["patterns"]:
        inputs.append(pattern)
        classes.append(intent["tag"])
# Convert the lists to a pandas DataFrame
data = pd.DataFrame({"inputs": inputs, "classes": classes})
# Lowercase the inputs and strip punctuation
data["inputs"] = data["inputs"].apply(
    lambda text: "".join(
        ch.lower() for ch in text if ch not in string.punctuation
    )
)
# Fit the tokenizer on the inputs and convert each input to an integer sequence
tokenizer = Tokenizer()
tokenizer.fit_on_texts(data["inputs"])
train = tokenizer.texts_to_sequences(data["inputs"])
# Padding converts the list of sequences to a 2D array as wide as the longest
# sequence, so all inputs have a uniform length
xtrain = pad_sequences(train)
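# e.g. (illustrative indices) texts_to_sequences(["find the price", "help"])
# might give [[4, 2, 7], [15]]; pad_sequences zero-pads on the left by default:
# pad_sequences([[4, 2, 7], [15]]) -> array([[4, 2, 7], [0, 0, 15]])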
# Encode classes to numerical values
encoder = LabelEncoder()
ytrain = encoder.fit_transform(data["classes"])
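# LabelEncoder assigns each class name an integer index (the classes are kept
# sorted in encoder.classes_), e.g. fit_transform(["help", "get_price", "help"])
# -> array([1, 0, 1])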
# Training data dimensions
input_size = xtrain.shape[1]  # length of the longest (padded) input
vocab = len(tokenizer.word_index)  # number of distinct words seen in training
output_size = encoder.classes_.shape[0]  # number of intent classes
# Build the model: Embedding -> SpatialDropout1D -> LSTM -> softmax classifier
model = Sequential()
model.add(Embedding(input_dim=vocab + 1, output_dim=10, input_length=input_size))
model.add(SpatialDropout1D(0.3))
model.add(LSTM(10, dropout=0.3, recurrent_dropout=0.3))
model.add(Dense(output_size, activation="softmax"))
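# Shapes through the network for a batch of size B (following the layer
# arguments above): (B, input_size) token ids -> Embedding -> (B, input_size, 10)
# -> LSTM -> (B, 10) -> Dense softmax -> (B, output_size) class probabilities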
# Compile for multi-class classification over integer labels
model.compile(
    loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["accuracy"]
)
# Train the model
print("\nChatbot is loading...\n")
hist = model.fit(xtrain, ytrain, epochs=400, verbose=0)
# Plot training accuracy and loss on twin y-axes
fig, ax1 = plt.subplots()
ax1.plot(hist.history["accuracy"], color="tab:red")
ax1.set_xlabel("Epoch")
ax1.set_ylabel("Accuracy", color="tab:red")
ax1.tick_params(axis="y", labelcolor="tab:red")
ax2 = ax1.twinx()
ax2.set_ylabel("Loss", color="tab:blue")
ax2.plot(hist.history["loss"], color="tab:blue")
ax2.tick_params(axis="y", labelcolor="tab:blue")
plt.title("NLP training metrics")
fig.tight_layout()
plt.savefig("NLP_metrics.png")
plt.show()
# Hold-out test phrases paired with the intent class each should map to
test_df_data = {
    "inputs": [
        "Predict the price of a stock",  # get_price
        "What price will NVDA be tomorrow",
        "Find the price of a stock in 5 days",
        "What price will my stock be in the future",
        "what will the price of GOOG be",
        "Find the returns of my portfolio",  # get_daily_return
        "What will the daily profit of my stock be",
        "How much money will I make each day",
        "Predict the daily returns of PLTR",
        "How much profit will I make each day",
        "what will the average return of NVDA be",  # get_avg_return
        "Predict the average profit of a stock",
        "How much money will I make on average",
        "what's the expected returns on this stock",
        "Find the average returns of QQQ",
        "What will the volatility of GOOG be",  # get_std
        "Find the volatility of a stock",
        "Will PLTR be volatile",
        "Predict the standard deviation of a stock",
        "Predict how volatile META will be",
        "What will the volatility of a stock be",
        "Predict the sharpe ratio of GOOG",  # get_sharpe
        "Assess the performance of TSLA",
        "What sharpe ratio will this stock have",
        "Find the sharpe ratio",
        "Predict if a stock will perform well",
        "Optimise my portfolio for minimum variance",  # optimise
        "How should I invest in these stocks?",
        "Find the optimal way to invest in these stocks",
        "Minimise the variance of these stocks",
        "How can I maximise the sharpe ratio for my portfolio",
        "What can you do",  # help
        "Help",
        "Help me",
        "What are you",
        "What things can you do",
    ],
    "classes": [
        "get_price",
        "get_price",
        "get_price",
        "get_price",
        "get_price",
        "get_daily_return",
        "get_daily_return",
        "get_daily_return",
        "get_daily_return",
        "get_daily_return",
        "get_avg_return",
        "get_avg_return",
        "get_avg_return",
        "get_avg_return",
        "get_avg_return",
        "get_std",
        "get_std",
        "get_std",
        "get_std",
        "get_std",
        "get_std",
        "get_sharpe",
        "get_sharpe",
        "get_sharpe",
        "get_sharpe",
        "get_sharpe",
        "optimise",
        "optimise",
        "optimise",
        "optimise",
        "optimise",
        "help",
        "help",
        "help",
        "help",
        "help",
    ],
}
test_df = pd.DataFrame(test_df_data)
# Apply the same preprocessing as the training inputs
test_df["inputs"] = test_df["inputs"].apply(
    lambda text: "".join(
        ch.lower() for ch in text if ch not in string.punctuation
    )
)
# Tokenize with the vocabulary learned from training and pad to the same length
test = tokenizer.texts_to_sequences(test_df["inputs"])
xtest = pad_sequences(test, maxlen=input_size)
# Encode the expected classes with the already-fitted encoder
ytest = encoder.transform(test_df["classes"])
print("Predicting...\n")
loss, accuracy = model.evaluate(xtest, ytest)
print("Accuracy:", accuracy)