#%%
# In the name of God the most compassionate the most merciful
# introduction to RNNs in Pytorch
import numpy as np
import string
import torch
import torch.nn as nn
import torch.nn.functional as F
import matplotlib.pyplot as plt
import torch.optim as optim
%matplotlib inline
#%% RNN
# In this section we are going to learn about RNNs in Pytorch. We will start with
# the vanilla RNN, continue to see how GRU/LSTM work, and implement different projects
# along the way to get a good grasp on the ideas.
# We will first start with a simple example about time-series. As you know, we use
# recurrent neural networks on series. We could use CNNs as well, but here we will be
# modeling time-series and other sequential data using RNNs.
# There are always a few distinct steps to build a model:
# 1. create the dataset or data to be fed to our model
# 2. create our model in Pytorch
# 3. run training/evaluation on the data
# 4. visualize as needed
# For our time-series example, we need a dataset. To keep it simple, we create one ourselves.
# Here is one way to do this:
sequence_length = 30
# Using linspace, we create a series of points (evenly spaced numbers over a specified
# interval). The series length is specified by sequence_length, which in our case is 30.
# Since linspace generates points with a fixed step size, plotting them gives a line!
# To give the data some structure/shape, we transform these points with sin()
# (we could use anything for this).
# By doing this we are trying to create some kind of underlying structure in our dataset
# and see whether our network can discover that underlying relationship and model it,
# rather than just modeling a simple line!
# Note that we could achieve this with other methods as well, for example by
# generating some random numbers and multiplying them by a theta; the network would
# then need to learn what theta actually is to model the data. But for now let's use this.
sample = torch.linspace(start = 0, end=np.pi, steps=sequence_length)
sample_sin = sample.sin()
# now let's plot our data points and see how they look:
plt.plot(sample,color='r')
plt.plot(sample_sin,color='w')
# we could do
# dt = torch.randn(size=(30,))
# dt.mul_(5).add_(1.5)
# plt.plot(dt)
# So that was the sample; what's the label?
# In time-series we have different types of problems.
# When our problem is sequence-to-sequence and, say, we want to predict
# the next value, we usually create the label from the sample itself: we just
# shift it forward one timestep!
# So the sample and its label would be
x = sample[:-1]
y = sample[1:]
plt.plot(x, color='r' ,label='x')
plt.plot(y, color='y', label='y')
# now let's make a plotting function to make our life easier!
def plot_sample(x, y, fmt='g.', fmt2='y+', label1='x', label2='y'):
    # clears the figure; for this we have cla and clf.
    # cla is used for clearing the current 'a'xes while clf
    # is used for clearing the whole 'f'igure
    plt.clf()
    # plot([x], y, [fmt], *, data=None, **kwargs)
    # plot([x], y, [fmt], [x2], y2, [fmt2], ..., **kwargs)
    # The optional parameter fmt is a convenient way of
    # defining basic formatting like color, marker and linestyle.
    # It's a shortcut string notation, e.g.
    # plot(y, 'r+')  # plots y with red plusses
    # The following two lines are equivalent; the first one uses [fmt]:
    # >>> plot(x, y, 'go--', linewidth=2, markersize=12)
    # >>> plot(x, y, color='green', marker='o', linestyle='dashed',
    #          linewidth=2, markersize=12)
    plt.plot(x, fmt, label=label1)
    plt.plot(y, fmt2, label=label2)
# Anyway, what we just created and plotted is a single sample: a sample of 30 timesteps.
# We need more of these samples to train our model. So we either need to create a dataset
# beforehand and read from it during training, or we can simulate that by creating
# a generator which yields a sample each time it is called!
# In a dataset we need both a sample and its label!
def dataset_sample(i, seq_len=10, device=torch.device('cuda')):
    sample_raw = torch.linspace(i*np.pi, (i+1)*np.pi, seq_len+1)
    sample_data = sample_raw.sin().to(device)
    data = sample_data[:-1].view(1, seq_len, 1)
    label = sample_data[1:].view(1, seq_len, 1)
    yield sample_raw.numpy(), data, label
#lets test this
raw, data, label = next(iter(dataset_sample(0)))
data = data.view(-1)
label = label.view(-1)
plot_sample(data.cpu(), label.cpu())
# plt.clf()
plt.plot(data.cpu(),'r+-', label='input, x')
# now lets create our RNN.
class RNN_Net(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim, num_layers=1,
                 drpout=0.5, bidirect=False):
        super().__init__()
        self.input_size = input_dim
        self.hidden_size = hidden_dim
        self.output_size = output_dim
        # some arguments are self-explanatory, so I'll explain the others.
        # the first thing to pay attention to when working with rnns in pytorch is that,
        # unlike a hand-rolled vanilla rnn implementation, pytorch's rnns (grus and lstms
        # behave like this as well) return a tuple which includes the outputs
        # and the hidden states for each timestep. obviously the output is not the actual
        # output that we want; for that we need to add another layer after the rnn,
        # for example an fc layer with sigmoid, to get the actual output. (don't worry
        # if it's vague to you, we will get to this in a moment)
        # the second thing is num_layers; basically this allows you to create a stacked rnn.
        # usually you may choose between 1-3 layers.
        # As the documentation says, num_layers: Number of recurrent layers. E.g.,
        # setting num_layers=2 would mean stacking two RNNs together to form a stacked RNN,
        # with the second RNN taking in 'outputs' of the first RNN and computing the final
        # results. Default is 1.
        # batch_first means: if you have your data in batches (batch is the first dim),
        # set this to True.
        # bidirectional means: whether you want your rnn to be bidirectional!
        # by the way, input_dim actually refers to the input size, e.g. if you
        # one_hot encoded your input, you feed the one_hot encoded dim. (we will see this
        # in a moment!)
        # note dropout will actually be relevant only if we have more than 1 layer!
        self.rnn = nn.RNN(input_dim, hidden_dim, num_layers=num_layers,
                          batch_first=True,
                          dropout=drpout,
                          bidirectional=bidirect)
        # as I just pointed out, for the actual outputs we need a new layer.
        # we use an fc layer, since we are going to predict values;
        # we need one value as output per timestep for each sample!
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x, hidden_state):
        # our input should have the shape:
        # (batch_size, seq_length, input_size)
        # our hidden state can be None, or if not, should have the shape:
        # (n_layers*direction, batch_size, hidden_dim)
        # and the output returned by rnn has the shape:
        # (batch_size, time_step, hidden_size)
        rnn_outputs, hidden_states = self.rnn(x, hidden_state)
        rnn_outputs = rnn_outputs.view(-1, self.hidden_size)
        output = self.fc(rnn_outputs)
        return output, hidden_states
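# As a quick sanity check of the shape conventions described above, here is a
# minimal sketch (the sizes below are arbitrary, chosen only for illustration):
rnn_demo = nn.RNN(input_size=1, hidden_size=8, num_layers=1, batch_first=True)
x_demo = torch.randn(4, 20, 1)       # (batch_size, seq_length, input_size)
out_demo, h_demo = rnn_demo(x_demo)
print(out_demo.shape)                # torch.Size([4, 20, 8]) = (batch, time_step, hidden)
print(h_demo.shape)                  # torch.Size([1, 4, 8])  = (n_layers*direction, batch, hidden)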
batch_size=1
iteration = 80
interval = 15
sequence_length = 20
hidden_dim = 50
# it's initially zero; we could use None as well, but I wanted you to see what
# a hidden state's dims look like
# we can have a stacked rnn; if we want one, simply increase this number
num_layers = 1
# uni or bidirection. ( 1 or 2)
direction = 1
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# hidden_state = None
hidden_state = torch.zeros(size=(num_layers*direction, batch_size, hidden_dim)).to(device)
model = RNN_Net(input_dim=1,
                hidden_dim=hidden_dim,
                output_dim=1,
                num_layers=num_layers).to(device)
# training
criterion = nn.MSELoss()
# experiment with a large lr such as 0.1 and also a small lr like 0.0001 and see the changes
optimizer = optim.Adam(model.parameters(), lr=0.01)
# torch.autograd.set_detect_anomaly(True)
print(model)
for i in range(iteration):
    for (raw, data, label) in dataset_sample(i,
                                             seq_len=sequence_length,
                                             device=device):
        (output, hidden_state) = model(data, hidden_state)
        ## Representing Memory ##
        # make a new variable for hidden and detach the hidden state from its history;
        # this way, we don't backpropagate through the entire history
        hidden_state = hidden_state.data
        # make sure the dims for output and label are the same;
        # otherwise, you may not get an error, but a skewed result
        # that may boggle your mind for quite some time!
        loss = criterion(output, label.view(-1, 1))
        print(f'iter: {i} loss: {loss.item():.6f} ')
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    if i%interval == 0:
        x = data.view(-1).data.cpu().numpy()
        y = label.reshape(-1).data.cpu().numpy()
        output = output.view(-1)
        plt.plot(x, 'r.', label='x-data')
        plt.plot(y, 'w.', label='label')
        plt.plot(output.data.cpu().numpy(), 'y+', label='output')
        plt.legend()
        plt.show()
#%%
# Now, using GRU and LSTM is the same, but we are going to use them
# in a new architecture. let's delve into the realm of nlp and write
# a simple text generator!
# for our dataset, we use the gutenberg opensource library, which contains over 60K free ebooks.
# you can access this library using this link: http://www.gutenberg.org
# we use http://www.gutenberg.org/files/1399/1399-0.txt but feel free to use anything
# you like!
# let's roll everybody! :)
# let's download the book and create a dataset out of it
# ref: https://stackoverflow.com/questions/7243750/download-file-from-web-in-python-3
def download_ebook(url='http://www.gutenberg.org/files/1399/1399-0.txt',
                   file_name='corpus.txt'):
    import urllib.request as request
    import os, sys
    dir_name = 'data'
    file_name_path = os.path.join(dir_name, file_name)
    if os.path.exists(file_name_path):
        return file_name_path
    if not os.path.exists(dir_name):
        os.mkdir(dir_name)
    try:
        request.urlretrieve(url, file_name_path)
        return file_name_path
    except:
        print(f'exception occurred!: {sys.exc_info()[0]} ')
        return None
with open(download_ebook(), 'r') as file:
    corpus_raw = file.read()
# we use repr to see the special characters;
# without it, the print statement will render them.
# but before that let's have a minimal preprocessing step:
# let's lowercase all characters! this will help with
# the end result. there are more preprocessing steps, but
# for now this is enough; we will see more in the next sections,
# such as the sentiment analysis and word embedding sections
corpus_raw = corpus_raw.translate(str.maketrans('', '', string.punctuation))
corpus_raw = ''.join(filter(lambda x: x in set(string.printable), corpus_raw))
corpus_raw = corpus_raw.lower()
print(repr(corpus_raw[:10]))
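# (a tiny aside on the translate/maketrans trick above: maketrans('', '', chars)
# builds a table that deletes every character in chars, e.g.:)
print('hi, there!'.translate(str.maketrans('', '', string.punctuation)))  # -> 'hi there'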
# our first step would be to tokenize our corpus.
# basically, we will tokenize our corpus of text,
# digitize each letter/symbol, and then
# use the encoded representation of these digitized
# characters. in order to encode them, we first need to
# create a dictionary of each letter.
# first we find all unique symbols;
# using a set, we can avoid duplicates
unique_chars = set(corpus_raw)
# now let's create char2int and int2char dictionaries!
int2char = dict(enumerate(unique_chars))
char2int = {c:d for d,c in int2char.items()}
# print(chars)
# print(int2char)
# print(char2int)
print(f'unique characters: {len(unique_chars)} : \n {unique_chars}')
# lets convert our corpus to int
corpus_digitized = np.array([char2int[char] for char in corpus_raw])
print(f'corpus digitized shape: {corpus_digitized.shape}')
print(corpus_digitized[:10])
print('after conversion: ')
print(repr(''.join([int2char[ii] for ii in corpus_digitized[:10]])))
# ok, so far so good. we now need to create a one-hot representation of our input.
# our inputs are digits; each digit, as you saw, represents a single letter/symbol,
# so in order to train our net, we must one_hot encode them.
def one_hot(input_array, length=10):
    # our array contains several digits; we will create a one-hot vector
    # for each digit
    one_hot_array = np.zeros(shape=(input_array.size, length), dtype=np.float32)
    one_hot_array[np.arange(input_array.size), input_array.flatten()] = 1
    return one_hot_array.reshape(*input_array.shape, length)
# lets test this
print(one_hot(np.array([0,1,9]),10))
# now we need to have batches!
def get_next_batch(corpus_digitized, batch_size=1, seq_len=10):
    # let's create batches from our corpus. first let's see
    # how many batches we can get from our corpus
    char_count = corpus_digitized.shape[0]
    each_batch_size = batch_size*seq_len
    batch_count = char_count // each_batch_size
    # now we should reshape our corpus data for easier access
    corpus_p = corpus_digitized[:batch_count * each_batch_size]
    corpus_p = corpus_p.reshape(batch_size, -1)
    # read one batch for data and label
    x = np.zeros(shape=(batch_size, seq_len), dtype=np.int64)
    y = np.zeros_like(x)
    for i in range(0, corpus_p.shape[1], seq_len):
        x[:, :] = corpus_p[:, i:i+seq_len]
        # the label is x shifted one step forward
        y[:, :-1] = x[:, 1:]
        try:
            y[:, -1] = corpus_p[:, i+seq_len]
        except IndexError:
            # at the very end of the corpus, wrap around to the start
            y[:, -1] = corpus_p[:, 0]
        yield x, y
x,y = next(iter(get_next_batch(corpus_digitized,batch_size=3,seq_len=8)))
print(x.shape)
print(y.shape)
print(x)
print(y)
print(one_hot(x,length=len(unique_chars)).shape)
#%%
# Ok, everything seems fine now. let's create our network.
# in order to be able to compare between different network types,
# let's support all of them. lstm works the best, as you will see,
# followed by GRU and then RNN. as you can guess, RNN has the worst
# performance in terms of accuracy and text generation!
class lstm_char(nn.Module):
    def __init__(self, rnn_type='rnn', unique_chars=None, hidden_size=30,
                 num_layers=1, dropout=0.3, bidirection=False, act='tanh'):
        super().__init__()
        # unique_chars is the set of characters in the corpus
        self.unique_char = unique_chars
        self.int2char = dict(enumerate(self.unique_char))
        self.char2int = {ch: ii for ii, ch in self.int2char.items()}
        self.input_size = len(unique_chars)
        self.output_size = self.input_size
        self.hidden_size = hidden_size
        self.drp = nn.Dropout(dropout)
        self.rnn_type = rnn_type.lower()
        self.direction = 2 if bidirection else 1
        if rnn_type.lower() == 'rnn':
            self.rnn = nn.RNN(self.input_size,
                              hidden_size,
                              num_layers,
                              batch_first=True,
                              dropout=dropout,
                              bidirectional=bidirection)
        elif rnn_type.lower() == 'gru':
            self.rnn = nn.GRU(self.input_size,
                              hidden_size,
                              num_layers,
                              batch_first=True,
                              dropout=dropout,
                              bidirectional=bidirection)
        else:
            self.rnn = nn.LSTM(self.input_size,
                               hidden_size,
                               num_layers,
                               batch_first=True,
                               dropout=dropout,
                               bidirectional=bidirection)
        self.fc = nn.Linear(hidden_size*self.direction, self.output_size)

    def forward(self, input, hidden_states):
        rnn_outputs, hidden_states = self.rnn(input, hidden_states)
        outputs = rnn_outputs.reshape(-1, self.hidden_size*self.direction)
        outputs = self.drp(outputs)
        outputs = self.fc(outputs)
        return outputs, hidden_states
# let's test our network
seq_len = 30
data, labels = next(iter(get_next_batch(corpus_digitized, batch_size=2, seq_len=seq_len)))
print('initial data shape before one_hot encoding: ', data.shape)
data = one_hot(data, length=len(unique_chars))
labels = one_hot(labels, length=len(unique_chars))
data = torch.from_numpy(data)
labels = torch.from_numpy(labels)
bidirection = True
num_layers = 1
model = lstm_char(rnn_type='lstm',
                  unique_chars=unique_chars,
                  hidden_size=30,
                  num_layers=num_layers,
                  bidirection=bidirection)
print(f'rnn type : {model.rnn_type}')
print(f'our input(data).shape: {data.shape}')
outputs, hiddenstates = model(data,None)
print(f'model input size: {model.input_size}')
print(f'model output size: {model.output_size}')
# now our output shape may look a bit weird; remember that
# in order to get a meaningful output we need to reshape it
print(f'rnn output shape: {outputs.shape}')
# therefore the actual shape is
print(f'output actual shape :{outputs.view(-1, seq_len, model.output_size).shape}')
#%%
# now lets train our model
hidden_size = 512
layers_cnt = 2
bidirection = False
rnn_type = 'lstm'
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = lstm_char(rnn_type=rnn_type,
unique_chars=unique_chars,
hidden_size=hidden_size,
num_layers=layers_cnt,
bidirection=bidirection,dropout=0.3)
model = model.to(device)
optimizer = optim.Adam(model.parameters(), lr = 0.01)
criterion = nn.CrossEntropyLoss()
scheduler = optim.lr_scheduler.StepLR(optimizer,step_size=20)
epochs = 60
# in order not to face the exploding gradient problem,
# we clip the gradients
clip = 5.
interval = 1000
batch_size = 128
label_length = len(unique_chars)
hidden_states = None
val_ratio = 0.2
val_idx = int(corpus_digitized.size * (1-val_ratio))
train = corpus_digitized[:val_idx]
val = corpus_digitized[val_idx:]
print(f'rnn_type: {model.rnn_type}')
print(f'corpus size: {corpus_digitized.size}')
print(f'val size: {val.size}')
print(f'train size: {train.size}')
print(f'val + train: {val.size + train.size}')
assert train.size + val.size == corpus_digitized.size, 'they must be equal!'
for e in range(epochs):
    for i, (data, label) in enumerate(get_next_batch(train, batch_size, seq_len=seq_len)):
        # one-hot encode
        model.train()
        data = torch.from_numpy(one_hot(data, length=label_length)).to(device)
        label = torch.from_numpy(label).to(device)
        output, hidden_states = model(data, hidden_states)
        # detach the hidden states so we don't backprop through the entire history
        if model.rnn_type == 'lstm':
            hidden_states = tuple(h.data for h in hidden_states)
        else:  # RNN, GRU
            hidden_states = hidden_states.data
        # we don't one_hot encode our labels; CrossEntropyLoss works directly
        # with class indices. since we reshaped the output so that batch_size
        # and sequence length are fused, we do the same thing for the labels,
        # so basically we are just making sure the shapes of output and label
        # line up for crossentropy!
        label = label.view(batch_size*seq_len).long()
        loss = criterion(output, label)
        optimizer.zero_grad()
        loss.backward()
        # note the trailing _, which indicates an inplace operation!
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=clip)
        optimizer.step()
        if i%interval == 0:
            print(f'epoch: {e}/{epochs} loss: {loss.item():.4f} lr: {scheduler.get_last_lr()[-1]:.6f}')
    scheduler.step()
# test
print('test')
hidden_states = None
for i, (data, label) in enumerate(get_next_batch(val, batch_size, seq_len)):
    with torch.no_grad():
        model.eval()
        data = torch.from_numpy(one_hot(data, label_length)).to(device)
        label = torch.from_numpy(label).to(device)
        output, hidden_states = model(data, hidden_states)
        if model.rnn_type == 'lstm':
            hidden_states = tuple(h.data for h in hidden_states)
        else:  # RNN, GRU
            hidden_states = hidden_states.data
        loss = criterion(output, label.view(batch_size*seq_len).long())
        if i % interval == 0:
            print(f'loss: {loss.item():.4f}')
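# Aside: a minimal sketch of why the reshape above works. CrossEntropyLoss
# expects raw logits of shape (N, C) and integer class targets of shape (N,),
# so fusing batch and time into N lets one loss call cover every timestep
# (the sizes here are made up, for illustration only):
logits_demo = torch.randn(6, 4)            # N=6 (= batch*seq_len), C=4 classes
targets_demo = torch.randint(0, 4, (6,))   # one class index per position
print(nn.CrossEntropyLoss()(logits_demo, targets_demo))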
#%%
# now it's time for sampling and generating new text.
# basically this boils down to feeding one character, then
# feeding the generated character back in as the input for the next timestep,
# and this goes on until we generate the whole text.
# since our network produces scores, we use softmax to get a probability
# for each timestep; we can choose the most probable outcome, or just randomly
# choose one. you'll see how this is done if this is vague to you.
# first we need to create a function that does one thing only:
# feed one character and retrieve one character from our network.
# we then use this function to create more characters in a loop. that's basically it!
def predict(model, input, unique_chars, hidden_states=None, topk=None):
    model.eval()
    int2char = model.int2char
    char2int = model.char2int
    # convert the input character into its corresponding id
    input = np.array([char2int[input]]).reshape(1, -1)
    one_hot_vec = torch.from_numpy(one_hot(input, len(unique_chars))).to(device)
    output, hidden_states = model(one_hot_vec, hidden_states)
    output = torch.nn.functional.softmax(output, dim=1)
    # now our output has a probability for each character;
    # we sample from the topk most probable ones (or from all
    # characters if topk is None)
    if topk is None:
        topk = len(unique_chars)
    probs, indexes = output.topk(k=topk, dim=1)
    indexes = indexes.cpu().data.numpy().squeeze()
    probs = probs.cpu().data.numpy().squeeze()
    char = np.random.choice(indexes, p=probs/probs.sum())
    return int2char[char], hidden_states
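# (a quick reminder of what Tensor.topk returns, since predict() relies on it:
# a (values, indices) pair, both sorted by descending value)
p_demo = torch.tensor([[0.1, 0.5, 0.3, 0.2]])
vals_demo, idx_demo = p_demo.topk(k=2, dim=1)
print(vals_demo, idx_demo)  # tensor([[0.5000, 0.3000]]) tensor([[1, 2]])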
def sample(model, size=10, prime='hello there'):
    prime = prime.lower()
    chars = [ch for ch in prime]
    h = None
    # first, prime the network on the seed text; we only keep
    # the prediction made after the last prime character
    for ch in prime:
        o, h = predict(model, ch, unique_chars, h, topk=5)
    chars.append(o)
    # now feed each generated character back in
    for c in range(size):
        o, h = predict(model, chars[-1], unique_chars, h, topk=5)
        chars.append(o)
    return ''.join(chars)
print(unique_chars)
print(sample(model, size=200,prime='The '))
#%%
# Note:
# how to use a bidirectional LSTM/RNN
# In general, if we want to create our own bidirectional rnn network (be it rnn, lstm, or gru),
# we need to create two normal networks (lstms, for example), and then feed one with
# the normal input sequence and the other with the inverted input sequence.
# After that, we just take the last states from both networks and concat them together
# (or sum them; concatenation seems to be the norm!) and that's all.
# but using Pytorch we don't have to deal with this hassle;
# to use the bidirectional version of the mentioned rnn networks, just look at the implementation
# that I provided in our example.
# note that in our simple case of text generation, the bilstm wouldn't magically make everything better.
# in fact you may see the loss decrease much more, but the generated text is awful! guess what
# is causing this?
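# a tiny shape check for the bidirectional case (the sizes are arbitrary, just
# for illustration): the outputs of the two directions come back concatenated,
# which is exactly why our fc layer above uses hidden_size*self.direction
bi_demo = nn.GRU(input_size=4, hidden_size=8, batch_first=True, bidirectional=True)
o_demo, hb_demo = bi_demo(torch.randn(2, 5, 4))
print(o_demo.shape)   # (2, 5, 16): forward and backward outputs concatenated
print(hb_demo.shape)  # (2, 2, 8):  (num_layers*num_directions, batch, hidden)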
#%% Attention Mechanism
# good resources :
# papers:
# https://www.aclweb.org/anthology/D15-1166.pdf
# https://arxiv.org/pdf/1502.03044.pdf
# https://arxiv.org/pdf/1409.0473.pdf
# others:
# https://www.youtube.com/watch?v=yInilk6x-OY
# https://www.youtube.com/watch?v=W2rWgXJBZhU&t=607s
# https://blog.floydhub.com/attention-mechanism/
# https://towardsdatascience.com/intuitive-understanding-of-attention-mechanism-in-deep-learning-6c9482aecf4f
# https://github.com/spro/practical-pytorch/blob/master/seq2seq-translation/seq2seq-translation.ipynb
# https://colab.research.google.com/github/bastings/annotated_encoder_decoder/blob/master/annotated_encoder_decoder.ipynb#scrollTo=NL240dtrgDw6
# https://github.com/thomlake/pytorch-attention
# https://medium.com/intel-student-ambassadors/implementing-attention-models-in-pytorch-f947034b3e66
# https://towardsdatascience.com/attention-seq2seq-with-pytorch-learning-to-invert-a-sequence-34faf4133e53
# https://machinelearningmastery.com/how-does-attention-work-in-encoder-decoder-recurrent-neural-networks/
# https://medium.com/syncedreview/a-brief-overview-of-attention-mechanism-13c578ba9129
# https://stackoverflow.com/questions/50571991/implementing-luong-attention-in-pytorch
# https://medium.com/@shashank7.iitd/understanding-attention-mechanism-35ff53fc328e
# https://medium.com/@bgg/seq2seq-pay-attention-to-self-attention-part-1-d332e85e9aad
# Now let's learn about the Attention mechanism!
# before we continue: for sequence-to-sequence models and translation, one of the good
# resources is https://www.manythings.org/anki/ where we can download a text corpus and use it
# for translation. OK, now let's continue with the attention mechanism.
# as you can see, I have posted a lot of resources that you can use. I myself read some of them
# and will quote from them. so what is an attention mechanism, and why do we even care?
# The attention mechanism, as its name implies, is a mechanism which helps the network pay more
# attention to specific parts of the input in order to produce a more plausible outcome.
# it was initially proposed for NMT, or neural machine translation, where for example you want
# to translate a sentence from one language to another. in the traditional case, which we saw earlier,
# a seq2seq model is used, that is, a network comprising two networks, an encoder
# and a decoder, where the input sequence is fed to the encoder, and the decoder ultimately receives a
# compressed representation of the input sequence from the encoder and then
# tries to produce a sequence as the answer. the problem with this procedure was/is that for a long
# sequence, we can't transfer all the information from earlier timesteps; it's simply not possible.
# (yes, I know what we said about the lstm and gru gates retaining earlier timestep features; the
# idea here is that even if the lstm gates simplify and ease the transfer of a specific feature from
# earlier timesteps to later timesteps, lots of such information will be lost, because the last output
# is simply a finite, fixed-size vector which can only accommodate so many features, and mostly they
# will be features from recent timesteps as opposed to earlier ones. please note that there
# are lots of relationships between the words, and underlying concepts, in a given sequence.
# suppose, in an optimal case, your sequence had 40 underlying features that needed to be identified
# and used for a perfect output to be created. however, since there is no mechanism to retain 'all' of
# these features, and we face a fixed final vector, plus our training procedure is not perfect and
# also has noise, only a handful of such features get the chance to be transferred. features do get
# identified, but they can't be utilized, as we don't have a mechanism to use them effectively; in the
# current procedure, they just get lost.) so what should we do then? we can use all of the states
# from all previous timesteps instead of using only the last one. this way, we can provide much more
# information, and this wealth of information at each timestep can help the network produce better
# results. but how do we do that? surely not all hidden states are equally important when it comes to
# producing a translation or a new word, e.g. here we can rank them based on how much they affect the
# outcome. this way the network will gradually understand the relationships and focus on the correct
# states when needed. this is the gist of attention: we simply use all hidden states from all timesteps
# in the encoder and feed them to the decoder as input.
# how do we pay more or less attention to a specific hidden state at a timestep?
# we calculate a score between that hidden state and the current hidden state of our decoder
# (the current timestep in the decoder). that is, for every single timestep in our encoder,
# we calculate a score between that timestep and the previous timestep in our decoder
# (which is used for creating the current output).
# how do we do that? there are several ways, which we will get to shortly.
# once we have calculated the scores for all timesteps, we take the softmax. why do we do that?
# we do that so the cumulative score is 1, and we can treat these scores as conditioning factors:
# weights that specify/show the importance of each specific timestep.
# as we said, there are several ways of implementing the attention mechanism.
# two important methods exist among others. they are known as Bahdanau and Luong,
# after the main authors of the two papers that proposed these methods.
# The first type of Attention, commonly referred to as Additive Attention, came from a paper by
# Dzmitry Bahdanau, which explains the less-descriptive original name. The paper aimed to improve the
# sequence-to-sequence model in machine translation by aligning the decoder with the relevant input
# sentences and implementing Attention. The entire step-by-step process of applying Attention in
# Bahdanau's paper is as follows:
# 1.Producing the Encoder Hidden States - Encoder produces hidden states of each element in the input
# sequence
# 2.Calculating the Alignment Scores - scores between the previous decoder hidden state and each of
# the encoder's hidden states are calculated (Note: The last encoder hidden state can be used as the
# first hidden state in the decoder)
# 3.Softmaxing the Alignment Scores - the alignment scores for each encoder hidden state are combined
# and represented in a single vector and subsequently softmaxed
# 4.Calculating the Context Vector - the encoder hidden states and their respective alignment scores
# are multiplied to form the context vector
# 5.Decoding the Output - the context vector is concatenated with the previous decoder output and fed
# into the Decoder RNN for that time step along with the previous decoder hidden state to produce a
# new output
# The process (steps 2-5) repeats itself for each time step of the decoder until an <EOS> token is
# produced or the output is past the specified maximum length
# For our first step, we’ll be using an RNN or any of its variants (e.g. LSTM, GRU)
# to encode the input sequence. After passing the input sequence through the encoder RNN,
# a hidden state/output will be produced for each input passed in. Instead of using only the hidden
# state at the final time step, we’ll be carrying forward all the hidden states produced by the
# encoder to the next step
# After obtaining all of our encoder outputs, we can start using the decoder to produce outputs.
# At each time step of the decoder, we have to calculate the alignment score of each encoder output
# with respect to the decoder input and hidden state at that time step. The alignment score is the
# essence of the Attention mechanism, as it quantifies the amount of “Attention” the decoder will
# place on each of the encoder outputs when producing the next output.
# The alignment scores for Bahdanau Attention are calculated using the hidden state produced by the
# decoder in the previous time step and the encoder outputs, with the following equation:
# score_alignment = W_combined * tanh(W_decoder * H_decoder + W_encoder * H_encoder)
# as you can see, it's basically the decoder's hidden state plus the encoder's hidden states,
# passed through a tanh transformation function. the weights basically specify how much
# importance each hidden state has.
# The decoder hidden state and encoder outputs are each passed through their own individual
# Linear layer, with their own individual trainable weights. (that is, we use nn.Linear without a
# bias, since it simply does a W*input and makes life easier for us; without it, we would have to
# define a new parameter W and multiply it by the decoder hidden state. it's the same thing, but
# uglier! that's why we simply use a linear layer as the learnable parameter for W_decoder*H_decoder.)
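# (a quick check that a bias-free Linear really is just a matrix multiply;
# the names here are throwaway, for illustration only:)
lin_demo = nn.Linear(4, 3, bias=False)
h_demo2 = torch.randn(2, 4)
print(torch.allclose(lin_demo(h_demo2), h_demo2 @ lin_demo.weight.t()))  # True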
# Lastly, the resultant vector from the previous few steps will undergo matrix multiplication with
# a trainable vector, obtaining a final alignment score vector which holds a score for each encoder
# output.
# Note: As there is no previous hidden state or output for the first decoder step, the last encoder
# hidden state and a Start Of String (<SOS>) token can be used to replace these two respectively.
# 3. Softmaxing the Alignment Scores
# After generating the alignment scores vector in the previous step, we can then apply a softmax on this
# vector to obtain the attention weights. The softmax function will cause the values in the vector to
# sum up to 1 and each individual value will lie between 0 and 1, therefore representing the weightage
# each input holds at that time step.
# 4. Calculating the Context Vector
# After computing the attention weights in the previous step, we can now generate the context vector by
# doing an element-wise multiplication of the attention weights with the encoder outputs.
# Due to the softmax function in the previous step, if the score of a specific input element is closer
# to 1 its effect and influence on the decoder output is amplified, whereas if the score is close to 0,
# its influence is drowned out and nullified.
# 5. Decoding the Output
# The context vector we produced will then be concatenated with the previous decoder output.
# It is then fed into the decoder RNN cell to produce a new hidden state and the process repeats itself
# from step 2. The final output for the time step is obtained by passing the new hidden state through a
# Linear layer, which acts as a classifier to give the probability scores of the next predicted word.
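# before the full module below, here is a minimal numeric sketch of steps 2-4
# (score -> softmax -> context) with made-up sizes, independent of any trained
# network (all the names here are hypothetical, for illustration only):
B_demo, T_demo, H_demo = 1, 5, 8                    # batch, encoder timesteps, hidden size
enc_out_demo = torch.randn(B_demo, T_demo, H_demo)  # step 1: encoder hidden states
dec_h_demo = torch.randn(B_demo, H_demo)            # previous decoder hidden state
Wd_demo = nn.Linear(H_demo, H_demo, bias=False)
We_demo = nn.Linear(H_demo, H_demo, bias=False)
v_demo = nn.Linear(H_demo, 1, bias=False)
scores_demo = v_demo(torch.tanh(Wd_demo(dec_h_demo).unsqueeze(1) + We_demo(enc_out_demo)))  # (B, T, 1), step 2
weights_demo = F.softmax(scores_demo, dim=1)                                                # step 3
context_demo = (weights_demo * enc_out_demo).sum(dim=1)                                     # (B, H), step 4
print(context_demo.shape)  # torch.Size([1, 8])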
# it seems a lot of people implement attention based on the version explained in this paper:
# https://www.cs.cmu.edu/~hovy/papers/16HLT-hierarchical-attention-networks.pdf
# basically this one only looks at the encoder's hidden states and doesn't include the decoder's
# hidden state in calculating the score and creating the context vector!
# https://mlwhiz.com/blog/2019/03/09/deeplearning_architectures_text_classification/?utm_campaign=shareaholic&utm_medium=reddit&utm_source=news
# this is an excellent blog post. highly recommend it
# https://srome.github.io/Understanding-Attention-in-Neural-Networks-Mathematically/
class Encoder(nn.Module):
    def __init__(self, input_size, hidden_size=30, num_layers=1, bidirectional=False, drp=0.3):
        super().__init__()
        # **h_0** of shape (num_layers * num_directions, batch, hidden_size)
        self.embedding = nn.Embedding(num_embeddings=input_size, embedding_dim=hidden_size)
        # note the lstm's input_size is hidden_size, because it consumes the embedding output
        self.encoder = nn.LSTM(input_size=hidden_size, hidden_size=hidden_size, num_layers=num_layers,
                               bidirectional=bidirectional, batch_first=True, dropout=drp)

    def forward(self, x, h):
        x = self.embedding(x)
        return self.encoder(x, h)
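# (a quick sketch of what nn.Embedding does, since this is the first time we
# use it: it maps integer ids to dense, learnable vectors)
emb_demo = nn.Embedding(num_embeddings=10, embedding_dim=4)
print(emb_demo(torch.tensor([[1, 2, 3]])).shape)  # (1, 3, 4): (batch, seq_len, embedding_dim)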
# this is basically the decoder part
class BahdanauAttention(nn.Module):
    def __init__(self, output_size, hidden_size=30, num_layers=1, bidirectional=False, drp=0.3):
        super().__init__()
        self.input_size = hidden_size*2
        self.output_size = output_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.bidirectional = bidirectional
        self.drp = drp
        # we need to calculate (score = W_v . tanh( W_d*H_d + W_e*H_e))
        # W_v's shape is n x 1, W_d is n x n and W_e is n x 2n; these shapes can be
        # found in Bahdanau's paper, page 14.
        # the score must be a single number for each of the encoder's hidden states
        # (which means seq_len scores, since we have one hidden state per timestep).
        # always remember, we only need the W and not the bias! so we turn the bias off
        self.W_d = nn.Linear(hidden_size, hidden_size, bias=False)
        self.W_e = nn.Linear(hidden_size, hidden_size, bias=False)
        # attention combine
        self.W_v = nn.Parameter(torch.randn(hidden_size, 1))
        self.embedding = nn.Embedding(num_embeddings=output_size, embedding_dim=hidden_size)
        # input_size is hidden_size * 2, since unlike before, in addition to the
        # embedded input we now have the context vector, and the two get
        # concatenated together and fed to the decoder.
        self.lstm = nn.LSTM(input_size=hidden_size * 2, hidden_size=hidden_size, num_layers=num_layers,
                            bidirectional=bidirectional, batch_first=True, dropout=drp)
        # LSTMCell(input_size: int, hidden_size: int, bias: bool) -> None
        self.lstm_cell = nn.LSTMCell(hidden_size * 2, hidden_size)
        self.fc = nn.Linear(hidden_size, output_size)

    # in attention_cell we assume the input is a single timestep
    # (that is, inputs are fed one timestep at a time; that's why using the
    # lstm layer with a length-1 sequence works!). for this to work, we must
    # have a loop in our training procedure where we feed one timestep at a time.
    def attention_cell(self, x_t, hidden_states_t, encoder_outputs):
        # since the lstm has two states, h and c, we only care about h!
        (h_prev, c) = hidden_states_t
        # let's first calculate the score. we use the previous hidden state of the
        # decoder, which for the first step can be the last state of the encoder.
        # As we have already read, there is no previous hidden state or output
        # for the first decoder step, so the last encoder hidden state and a
        # Start Of String (<SOS>) token can be used to replace these two, respectively.
        # h_prev: (num_layers*dirs, batch, hidden) -> take the last layer: (batch, hidden)
        D = self.W_d(h_prev[-1]).unsqueeze(1)                      # (batch, 1, hidden)
        E = self.W_e(encoder_outputs)                              # (batch, seq_len, hidden)
        score_raw_part1 = torch.tanh(D + E)                        # (batch, seq_len, hidden)
        # scale it down to one score per encoder timestep
        score_raw = score_raw_part1.matmul(self.W_v).squeeze(2)    # (batch, seq_len)
        # normalize using softmax and get our attention weights;
        # these will be multiplied by our encoder states
        weights = F.softmax(score_raw, dim=1)                      # (batch, seq_len)
        # now create the context vector, which is attention_weights * encoder outputs.
        # since we are dealing with a whole batch at once, we use bmm,
        # which is batch matrix multiplication.
        context_vector = torch.bmm(weights.unsqueeze(1), encoder_outputs)  # (batch, 1, hidden)
        # Concatenating the context vector with the embedded input word
        decoder_input = torch.cat((x_t, context_vector.squeeze(1)), dim=1).unsqueeze(1)
        # Passing the concatenated vector as a length-1 sequence to the LSTM
        output, hidden_states_t = self.lstm(decoder_input, hidden_states_t)
        # Passing the LSTM output through a Linear layer acting as a classifier
        output = F.log_softmax(self.fc(output[:, 0, :]), dim=1)    # (batch, output_size)
        return output, hidden_states_t, weights

    def forward(self, x, hidden_states, encoder_outputs):
        # x.shape = (batch, timestep); after embedding: (batch, timestep, hidden)
        x = self.embedding(x)
        O = torch.zeros(x.size(0), x.size(1), self.output_size)
        H = []
        A = []
        for t in range(x.size(1)):
            O[:, t, :], hidden_states, weights = self.attention_cell(x[:, t, :], hidden_states, encoder_outputs)
            H.append(hidden_states)
            A.append(weights)
        # H is a list of (h, c) tuples and A a list of weight tensors, so we
        # return them as lists rather than trying to pack them into one tensor
        return O, H, A
# ok lets test our implementation so far and see if it works well
# first lets create a dummy input
input_np = np.array([[1,2,3],[0,4,1]])
input = one_hot(input_np,length=5)
# print(f'input_one hot encoded: \n{input}')
encoder = Encoder(5)
model = BahdanauAttention(5)
# batch, n_layer*bid
input = torch.from_numpy(input)
print(f'input: {input.shape}')
d_input = torch.from_numpy(input_np).long()
encoder_outputs, encoder_last_hidden_states = encoder(d_input, None)
print(f'encoder output: {encoder_outputs.shape}')
print(f'h from encoder(shape (n_layers * num_dir, batch, hidden_size)) \n{encoder_last_hidden_states[0].shape}')
yz = model(d_input, encoder_last_hidden_states, encoder_outputs)
print('results: ')
print(f'input {input}')
print(f'decoder output: {yz}')
#%%
# let's implement this one first and then we'll implement Luong's version:
#%% sentiment analysis
# let's do sentiment analysis. we read a bunch of reviews with their corresponding labels.
# here are the steps we need to take:
# we know our labels contain words, positive and negative, so we convert them into numbers, 1 and 0.
# our reviews must be digitized so we can feed them into our network, but before that we need
# to do some preprocessing. the preprocessing includes:
# 1. make everything lower case
# 2. remove punctuation
# 3. remove special characters such as \n
# 4. split into words only! we don't want to generate text, so creating dictionaries of characters
# as opposed to creating a dictionary of words is not going to suit us, because we intend on
# using word embeddings, and relationships between words need to be found out! this is not possible
# with characters!
# 5. that's nearly basically it (there are still some steps left):
# we need to create two dictionaries for converting word2int and int2word.
# one thing to note is that we start our integers from 1 and not 0. we will be using 0
# later on for padding the input so we can have batches of the same size.
# 6. so we order our words based on their frequency! the most frequent ones get to the top and
# the least frequent ones go to the bottom of the list.
# 7. an important step is to normalize/standardize the length. since we want to use batches,
# we need to pick a length. we can't use the biggest one, because we may waste a lot of space,
# so we search for and remove the largest and smallest ones.
# 8. good, let's go
import numpy as np
import string
import torch
import torch.nn as nn
import torch.nn.functional as F
import matplotlib.pyplot as plt
import torch.optim as optim
%matplotlib inline
# first lets read our files
with open(r'data\reviews.txt','r') as file:
reviews_raw = file.read().lower()
with open(r'data\labels.txt', 'r') as file:
labels_raw = file.read().lower()
# lets see what we have here
print(repr(reviews_raw[:2000]))
print(repr(labels_raw[:20]))
# ok, good, now lets remove punctuations and \ns
reviews_raw = ''.join([c for c in reviews_raw if c not in string.punctuation])
print(repr(reviews_raw[:2]))
# ok now lets remove \ns
reviews_raw_split = reviews_raw.split('\n')
# print(repr(reviews_raw[:200]))
# ok good, now lets create our wordlist from reviews
reviews_words_list = ''.join(reviews_raw_split).split()
print(reviews_words_list[:20])
# ok good, now let's get each word's frequency and also sort the words based on that.
# first, get each word's frequency in our list; we use Counter from collections for that
from collections import Counter
words_dict = Counter(reviews_words_list)
print(words_dict['the'])
# now lets sort these from highest to lowest
words_sorted_list = sorted(words_dict,key=words_dict.get,reverse=True)
# print(words_dict_sorted)
int2word = dict(enumerate(words_sorted_list,1))
word2int = {word:idx for idx,word in int2word.items()}
print(f'int2word[1] {int2word[1]}')
print(f'word2int["the"] {word2int["the"]}')
# now lets digitize our label
labels = [1 if word=='positive' else 0 for word in labels_raw.split('\n')]
print(f'labels_raw[:54]: \n{labels_raw[:54]}')
print(f'labels[:6]: {labels[:6]}')
# now lets create the digitized version of the reviews
reviews_digitized=[]
for each_review in reviews_raw_split:
    # create a temp list of words separated by space for each review by doing .split()
    reviews_digitized.append([word2int[w] for w in each_review.split()])
print(len(reviews_digitized))
print(reviews_raw_split[:2])
print(reviews_digitized[:2])
#%%
# ok the frequency is clear, they are sorted, lets find the biggest and smallest reviews
max_len = max([len(rev) for rev in reviews_digitized])
min_len = min([len(rev) for rev in reviews_digitized])
dic_rev_length = Counter(len(rev) for rev in reviews_digitized)
print(f'max length: {max_len} and min_len : {min_len}')
print(dic_rev_length[2])
print(f'max: {max(dic_rev_length)}')
print(f'min: {min(dic_rev_length)}')
print(f'max count {dic_rev_length[max_len]}')
print(f'min count {dic_rev_length[min_len]}')
# let's remove the reviews with the outlier lengths.
# we don't drop them directly, since we also need to remove the corresponding labels;
# so instead we get the indexes and remove the reviews based on their index
# new_reviews = [review for review in reviews_digitized if len(review)!=0]
# new_reviews2 = [review for review in new_reviews if len(review)!=2514]
idxs = [idx for idx, review in enumerate(reviews_digitized) if len(review) == min_len]
print(f'idxs: {idxs[:10]}')
idxs += [idx for idx, review in enumerate(reviews_digitized) if len(review) == max_len]
# use a set for fast membership tests
idxs_set = set(idxs)
new_reviews = [review for idx, review in enumerate(reviews_digitized) if idx not in idxs_set]
print(f'idxs: {idxs[:10]}')
print(len(reviews_digitized))
print(len(new_reviews))
# let's remove the corresponding labels as well
new_labels = [label for idx, label in enumerate(labels) if idx not in idxs_set]
print(new_labels[:10])
#%%
# Ok, now let's create a function for padding our input.
# we do this so we can create batches and increase performance.
# Pytorch provides its own functions for this, but anyway,
# let's create a function that gets the digitized reviews and returns a
# padded numpy array. we define a maximum length and fill each row from the end
def pad_input(new_reviews, max_length=200):
    padded_array = np.zeros(shape=(len(new_reviews), max_length), dtype=np.int64)
    for i, review in enumerate(new_reviews):
        # truncate long reviews first, then right-align the rest (left-pad with 0)
        review = review[:max_length]
        padded_array[i, -len(review):] = review
    return padded_array
reviews_digitized = pad_input(new_reviews, 150)
print(reviews_digitized[:1])
training_frac = 0.80
tr_idx = int(reviews_digitized.shape[0] * training_frac)
training_set, remaining_set = reviews_digitized[:tr_idx,:], reviews_digitized[tr_idx:,:]
val_frac = 0.5
val_idx = int(remaining_set.shape[0]*val_frac)
val_set = remaining_set[:val_idx,:]
test_set = remaining_set[val_idx:,:]
print(training_set.shape)
print(val_set.shape)
print(test_set.shape)
# now labels!
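# a sketch of the matching label split, mirroring the review split above
# (tr_idx and val_idx are the same indexes we computed for the reviews):
labels_np = np.array(new_labels)
train_labels = labels_np[:tr_idx]
remaining_labels = labels_np[tr_idx:]
val_labels = remaining_labels[:val_idx]
test_labels = remaining_labels[val_idx:]
print(train_labels.shape, val_labels.shape, test_labels.shape)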