-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathrnn_gru.py
124 lines (88 loc) · 3.27 KB
/
rnn_gru.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
import random
from collections import defaultdict
import requests
import spacy
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, GRU, Dense, Dropout
from tensorflow.keras.optimizers import Adam
# Load the spaCy pipeline that is used to tokenize the corpus.
nlp = spacy.load("en_core_web_sm")

# Path to the text file holding the corpus.
file_path = 'input.txt'

# Read the whole file into memory ('utf-8-sig' also accepts a leading BOM).
with open(file_path, 'r', encoding='utf-8-sig') as file:
    text = file.read()

# Keep only the span between the first chapter heading and the closing
# Project Gutenberg line.  str.find returns -1 when a marker is missing; used
# directly as a slice bound that would silently yield an (almost) empty text
# or drop the final character, so fall back to the start / end of the file.
start_index = text.find("CHAPTER I")
if start_index == -1:
    start_index = 0
end_index = text.find(
    "End of Project Gutenberg's Alice's Adventures in Wonderland",
    start_index,
)
if end_index == -1:
    end_index = len(text)
text = text[start_index:end_index]

# Tokenize with spaCy and keep every token that is not pure whitespace.
doc = nlp(text)
tokens = [token.text for token in doc if not token.is_space]
#####
# Number of context tokens fed to the model for each prediction.
seq_length = 50

# Build the vocabulary.  The unique tokens are sorted so that the
# token -> id assignment is the same on every run; iterating a plain set of
# strings can yield a different order in each interpreter process.
unique_tokens = sorted(set(tokens))
token_to_id = {token: idx for idx, token in enumerate(unique_tokens)}
id_to_token = {idx: token for token, idx in token_to_id.items()}

# Encode the corpus once, then slice integer windows out of it instead of
# building string windows and looking every token up again per window.
token_ids = [token_to_id[token] for token in tokens]

# sequences[i] holds seq_length consecutive token ids and next_words[i] is the
# id of the token that immediately follows that window.
sequences = np.array(
    [token_ids[i:i + seq_length] for i in range(len(token_ids) - seq_length)]
)
next_words = np.array(token_ids[seq_length:])
# model
# Hyperparameters: output/embedding vocabulary size, embedding width and
# number of GRU units.
vocab_size = len(unique_tokens)
embedding_dim = 100
rnn_units = 128

# Assemble the network layer by layer: embedding -> GRU -> softmax over the
# vocabulary.
model = Sequential()
model.add(Embedding(vocab_size, embedding_dim, input_length=seq_length))
# Only the GRU output for the final time step is passed on.
model.add(GRU(rnn_units, return_sequences=False))
# A Dropout(0.5) layer at this position is left disabled.
model.add(Dense(vocab_size, activation='softmax'))

# Targets are integer token ids, hence the sparse categorical loss.
learning_rate = 0.01
optimizer = Adam(learning_rate=learning_rate)
model.compile(optimizer=optimizer, loss='sparse_categorical_crossentropy')
model.summary()
# train the model
# Training hyperparameters.
batch_size = 128
epochs = 20

# Fit the model on (context window, next token id) pairs.
model.fit(x=sequences, y=next_words, batch_size=batch_size, epochs=epochs)
# Function to generate text using the RNN model
def generate_text_rnn(model, start_string, num_generate=100, temperature=1.0):
    """Generate text by repeatedly sampling the next token from ``model``.

    Args:
        model: Model that is called on a ``(1, seq_length)`` batch of token
            ids and returns a ``(1, vocab_size)`` tensor of next-token
            probabilities (the network in this file ends in a softmax layer).
        start_string: Seed text. It is tokenized with the same spaCy pipeline
            and whitespace filter as the training corpus, so punctuation is
            split off exactly as it was when the vocabulary was built.
        num_generate: Number of tokens to sample.
        temperature: Sampling temperature; values below 1 sharpen the
            distribution, values above 1 flatten it.

    Returns:
        ``start_string`` followed by the sampled tokens, joined with single
        spaces.

    Raises:
        KeyError: If the seed contains a token that is not in the vocabulary.
        ValueError: If ``temperature`` is not positive.
    """
    if temperature <= 0:
        raise ValueError("temperature must be positive")

    # Tokenize the seed the same way the corpus was tokenized; a plain
    # str.split() would leave punctuation glued to words and miss the
    # vocabulary.
    seed_tokens = [
        token.text for token in nlp(start_string) if not token.is_space
    ]
    unknown = [tok for tok in seed_tokens if tok not in token_to_id]
    if unknown:
        raise KeyError(
            f"start_string contains tokens not in the vocabulary: {unknown}"
        )

    # Keep at most the last seq_length ids so the window always has the
    # length the model was trained on.
    window = [token_to_id[tok] for tok in seed_tokens][-seq_length:]
    if len(window) < seq_length:
        # NOTE: id 0 is an ordinary vocabulary entry, not a reserved padding
        # id, so short seeds are left-filled with whatever token has id 0.
        window = [0] * (seq_length - len(window)) + window

    # Not every Keras version exposes reset_states on the model; call it only
    # when it exists.
    if hasattr(model, "reset_states"):
        model.reset_states()

    text_generated = []
    for _ in range(num_generate):
        probabilities = model(tf.constant([window]))
        # tf.random.categorical expects unnormalized log-probabilities, while
        # the model outputs softmax probabilities: take the log first (with a
        # small epsilon to avoid log(0)), then apply the temperature.
        logits = tf.math.log(probabilities + 1e-9) / temperature
        predicted_id = int(
            tf.random.categorical(logits, num_samples=1)[0, 0].numpy()
        )
        # Slide the window: drop the oldest id and append the new one.
        window = window[1:] + [predicted_id]
        text_generated.append(id_to_token[predicted_id])

    return start_string + ' ' + ' '.join(text_generated)
# Generate text
# Seed the generator with a short prompt and print the sampled continuation.
start_string = "Alice was beginning"
generated_text = generate_text_rnn(model, start_string)
print(generated_text)