# preprocess.py
import numpy as np
import matplotlib.pyplot as plt
from keras.utils import get_file
from collections import Counter
import nltk
from nltk import word_tokenize

nltk.download('punkt')  # tokenizer models required by word_tokenize


# Load and preprocess the text data
def preprocess_data():
    url = 'https://www.gutenberg.org/cache/epub/100/pg100.txt'
    path = get_file('pg100.txt', origin=url)
    with open(path, 'r', encoding='utf8') as f:
        lines = f.readlines()

    # Keep only the body of the ebook: everything between the Project
    # Gutenberg start and end markers, lowercased, with blank lines dropped
    text = []
    start = False
    for line in lines:
        line = line.strip().lower()
        if "*** START OF THE PROJECT GUTENBERG EBOOK THE COMPLETE WORKS OF WILLIAM SHAKESPEARE ***".lower() in line and not start:
            start = True
            continue  # do not keep the marker line itself
        if "*** END OF THE PROJECT GUTENBERG EBOOK THE COMPLETE WORKS OF WILLIAM SHAKESPEARE ***".lower() in line:
            break
        if not start or len(line) == 0:
            continue
        text.append(line)
    text = " ".join(text)

    # Character-level vocabulary and lookup tables in both directions
    voc_chars = sorted(set(text))
    char_indices = {c: i for i, c in enumerate(voc_chars)}
    indices_char = {i: c for i, c in enumerate(voc_chars)}

    # Word-frequency statistics: rank_counts[r - 1] = [r, count of the
    # rank-r most frequent word]
    tokens = word_tokenize(text)
    freq = Counter(tokens)
    ordered_word_list = freq.most_common()
    rank_counts = np.array([[rank + 1, count] for rank, (_, count) in enumerate(ordered_word_list)])

    # Plot the counts against rank (log-scaled y-axis)
    plt.figure(figsize=(20, 5))
    plt.title('Word counts versus rank')
    plt.scatter(rank_counts[:, 0], rank_counts[:, 1])
    plt.yscale('log')
    plt.show()
    print(f'Vocabulary size: {len(freq)}')
    # Cumulative coverage: the fraction of all word occurrences accounted
    # for by the i most frequent words
    for i in range(1000, len(freq), 1000):
        print(f'{i} : {np.sum(rank_counts[:i, 1]) / np.sum(rank_counts[:, 1]):.2f}')
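
    # Note (added for context): word frequencies in natural-language corpora
    # typically follow Zipf's law, with the rank-r word occurring roughly in
    # proportion to 1/r, so a small head of the vocabulary covers most of the
    # tokens; the coverage printout above makes that concrete for this corpus.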

    # Cut the corpus into overlapping windows: each input is a sequence of
    # maximum_seq_length characters, its target is the character that follows,
    # and windows start every time_step characters
    maximum_seq_length = 30
    time_step = 4
    sentences = []
    next_char = []
    n = len(text)
    for i in range(0, n - maximum_seq_length, time_step):
        sentences.append(text[i:i + maximum_seq_length])
        next_char.append(text[i + maximum_seq_length])
    print(f'Number of Sequences: {len(sentences)}')

    # One-hot encoding: X[i, j, k] = 1 iff character k appears at position j
    # of sequence i; y[i, k] = 1 iff character k follows sequence i
    X = np.zeros((len(sentences), maximum_seq_length, len(voc_chars)), dtype=bool)
    y = np.zeros((len(sentences), len(voc_chars)), dtype=bool)
    for i, sentence in enumerate(sentences):
        for j, char in enumerate(sentence):
            X[i, j, char_indices[char]] = 1
        y[i, char_indices[next_char[i]]] = 1
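
    # Two encodings are produced: the one-hot tensors above suit a model whose
    # first layer consumes vectors directly, while the integer-coded X_emb
    # built below is the form an embedding layer expects.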
    # Integer encoding: X_emb[i, j] is the vocabulary index of the j-th
    # character of sequence i
    X_emb = np.zeros((len(sentences), maximum_seq_length))
    y_emb = np.zeros((len(sentences), 1))
    for i, sentence in enumerate(sentences):
        for j, c in enumerate(sentence):
            X_emb[i, j] = char_indices[c]
        y_emb[i] = char_indices[next_char[i]]

    return X, y, maximum_seq_length, voc_chars, char_indices, indices_char, X_emb, y_emb
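

# A minimal helper sketch: rows of X_emb are sequences of character indices,
# so indices_char can turn one row back into readable text. The name
# `decode_sequence` is hypothetical, added here only for illustration.
def decode_sequence(index_row, indices_char):
    """Map a row of character indices back to the string it encodes."""
    return ''.join(indices_char[int(i)] for i in index_row)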


X, y, maximum_seq_length, voc_chars, char_indices, indices_char, X_emb, y_emb = preprocess_data()
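
# Quick sanity check (illustrative): both encodings should describe the same
# number of sequences and the same window length.
assert X.shape[:2] == X_emb.shape
print(f'X: {X.shape}, y: {y.shape}, X_emb: {X_emb.shape}, y_emb: {y_emb.shape}')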

# Save everything needed downstream; np.savez stores the two dicts as 0-d
# object arrays, which means they must be re-loaded with allow_pickle=True
np.savez('preprocessed_data.npz', X=X, y=y, maximum_seq_length=maximum_seq_length,
         voc_chars=voc_chars, char_indices=char_indices, indices_char=indices_char,
         X_emb=X_emb, y_emb=y_emb)
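
# A minimal loading sketch, assuming the file written just above. The pickled
# dict entries come back as 0-d object arrays, so .item() recovers the dicts.
data = np.load('preprocessed_data.npz', allow_pickle=True)
X_loaded = data['X']                                # (num_sequences, 30, vocab_size) bool
seq_len = int(data['maximum_seq_length'])           # scalar back from a 0-d array
char_indices_loaded = data['char_indices'].item()   # dict: char -> index
indices_char_loaded = data['indices_char'].item()   # dict: index -> char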