#%% [markdown]
# In the name of God the most compassionate the most merciful
import torch
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
#%%
# Here we are going to see how we can create a neural network and train/test it in Pytorch.
# We will see how we can augment our data, create our datasets, and a lot more.
# In Pytorch we can use the torchvision module for reading the existing datasets that Pytorch offers,
# or for creating a dataset out of an existing folder of images.
# It also provides a fake dataset of images, which we can use for benchmarking or debugging.
# We also do augmentation using this module. It also provides several well-known architectures
# such as AlexNet, VGGNet, ResNet, MobileNet, DenseNet, etc.
# enough talking, lets see how to use it!
# here we import datasets for using the dataset capabilities,
# transforms for data augmentation, and
# models for using existing models
from torchvision import datasets, transforms, models
# Our first step is to create a dataset. lets create an MNIST dataset for a start.
# before we create our dataset, we should know that upon creating our dataset
# we need to specify at least 1 transformation, and that is ToTensor().
# ToTensor() actually converts the input images into torch tensors, and this is a must-have.
# transformations are not limited to ToTensor only; they include a whole range,
# from resizing to flipping, etc.
# so we do it this way
transformations = transforms.ToTensor()
# but what if we want to do more transformations, such as padding the image, or flipping it, or resizing it?
# we can simply compose as many transformations as we wish!
# the Compose method takes a list of transformations, so we can instead do:
transformations = transforms.Compose([transforms.ToTensor(),
                                       # for normalizing, note we used (0.5,):
                                       # mean and std each require a tuple,
                                       # and since MNIST is grayscale
                                       # (its images have only 1 channel!)
                                       # we use a single number for mean and std.
transforms.Normalize(mean=(0.5,),std=(0.5,))])
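# as a quick illustration (just a sketch, not used below): a richer pipeline
# could chain several augmentations before ToTensor/Normalize, e.g.
augmented = transforms.Compose([transforms.RandomRotation(10),
                                transforms.RandomHorizontalFlip(),
                                transforms.ToTensor(),
                                transforms.Normalize(mean=(0.5,), std=(0.5,))])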
#
dataset_train = datasets.MNIST(root='MNIST', train=True, transform=transformations, download=True)
dataset_test = datasets.MNIST(root='MNIST', train=False, transform=transformations, download=True)
# now we have our datasets. but as you know, we usually dont load the whole dataset all at once!
# instead we read in batches! in Pytorch we do this using a DataLoader. using a DataLoader
# we can easily specify a batch size, and other performance-related options such as the number of workers,
# which is the number of threads used to read from the dataset, thus providing more efficient data
# loading!!! enough talking, lets see how to create a DataLoader!
# the Dataloader resides in torch.utils.data !
import torch.utils.data as data
dataloader_train = data.DataLoader(dataset_train, batch_size=32, shuffle=True, num_workers=2)
# we do the same thing for test
dataloader_test = data.DataLoader(dataset_test, batch_size=32, shuffle=False,num_workers=2)
print(f'test dataloader size: {len(dataloader_test)}')
# there is a note here. whenever you get weird errors concerning the dataloader/dataset, the first thing
# you should do is to set num_workers = 0. usually when you have a problem in your dataset (especially
# when you create your own dataset) this will help you see the issue immediately. because setting num_workers=0
# uses the same thread to run everything, and thus will surface your error, whereas when you set it to any number
# greater than 0, you will get a completely different error about worker threads failing! so keep this in mind!
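# a minimal sketch of that debugging tip: the same loader, but single-threaded,
# so any exception inside the dataset is raised directly in the main process
debug_loader = data.DataLoader(dataset_train, batch_size=32, shuffle=True, num_workers=0)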
# OK, we defined our datasets, and dataloaders. lets inspect our data and see how they look !
imgs, labels = next(iter(dataloader_train))
# remember our images are now tensors, and in order to display them using matplotlib
# we must convert them back to normal numpy arrays. and since we normalized them,
# we must unnormalize them for visualization as well. lets write a function that accepts a batch of images
# with their labels and displays them!
def visualize_imgs(imgs, labels, row=3, cols=11,):
    # images in pytorch have their axes swapped!
    # so we first fix their order!
    # the dims are batch, c, h, w, but for matplotlib we need
    # batch, h, w, c
    # we used detach to detach the images from the graph (we dont have to here, but its good practice for later)
    imgs = imgs.detach().numpy().transpose(0, 2, 3, 1)
    # now we unnormalize our images (undoing Normalize(mean=0.5, std=0.5))
    imgs = imgs * 0.5 + 0.5
    # figsize takes two arguments, which specify the width (columns) and height (rows)
    fig = plt.figure(figsize=(20, 5))
for i in range(imgs.shape[0]):
ax = fig.add_subplot(row, cols, i+1, xticks=[], yticks=[])
        # since our mnist images have only 1 channel, we remove the channel dimension
        # so that matplotlib can work with them (we changed 28x28x1 to 28x28!)
ax.imshow(imgs[i].squeeze(), cmap='gray')
ax.set_title(labels[i].item())
plt.show()
visualize_imgs(imgs, labels)
# OK, lets see an image in more details!
def visualize_img(img):
img = img.numpy().transpose(1,2,0).squeeze()
fig = plt.figure(figsize=(28,28))
ax = fig.add_subplot(1,1,1)
# or
#ax = plt.subplot(111)
ax.imshow(img, cmap='gray')
threshold = 0.5
    h, w = img.shape  # shape is (rows, cols), i.e. (height, width)
for i in range(h):
for j in range(w):
ax.annotate('{:.2f}'.format(img[i,j]),xy=(j,i),
horizontalalignment='center',
verticalalignment='center',
color='white' if img[i,j]<threshold else 'black')
visualize_img(imgs[0])
#%%
# Before we continue with training a model, I'd like to point out a useful feature here.
# note 1: concerning new transformations.
# previously we saw how to augment our data using the torchvision.transforms module.
# we saw there are many transformations that we can use. but what about something new?
# how can we add our own transformations?
# the transformations that we saw and used, such as transforms.ToTensor(), transforms.Resize(),
# etc., are called functors. you can create new functors and add them to the Compose list!
# a functor is simply a class with a __call__(self, *args) method. in Python terminology
# it is called a 'Callable'! for example, the ToTensor() functor can be implemented
# like this :
import torchvision.transforms.functional as F
class ToTensor(object):
def __call__(self, pic):
return F.to_tensor(pic)
# or, for a functor that accepts parameters, Resize can be as simple as :
class Resize(object):
def __init__(self, size, interpolation):
self.size = size
self.interpolation = interpolation
def __call__(self, input):
return F.resize(input, self.size, self.interpolation)
# as you can see, you can write as many custom transformations as you like!
to_tensor_f = ToTensor()
import PIL.Image as Image
resize_f = Resize(3, Image.BILINEAR)
numpy_tensor = np.random.rand(3, 3, 3)
result_tensor = to_tensor_f(numpy_tensor)
print(result_tensor)
# resize this tensor! F.resize expects a PIL image, so we first convert our
# tensor back into one (to_pil_image lives in the same functional module)
result_img = F.to_pil_image(result_tensor.float())
x = resize_f(result_img)
print(x)
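# and here is a minimal sketch of a truly custom functor (not part of
# torchvision): a hypothetical AddGaussianNoise transform we could drop
# into a Compose pipeline right after ToTensor()
class AddGaussianNoise(object):
    def __init__(self, mean=0.0, std=0.1):
        self.mean = mean
        self.std = std
    def __call__(self, tensor):
        # assumes its input is already a tensor
        return tensor + torch.randn_like(tensor) * self.std + self.mean
noisy_transformations = transforms.Compose([transforms.ToTensor(),
                                            AddGaussianNoise(0.0, 0.1)])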
# I'll be covering the dataset-related chores in detail in the upcoming parts, but for
# now lets cover this.
# Suppose we dont have separate sets for our training (such as a training set, validation set, test set)
# and we want to create these ourselves. what should we do?
# lets take MNIST as our example, as it only has a training set and a test set but no validation set.
# as you will see, what we describe here is applicable to just about anything.
# in Pytorch we have something called a sampler that, as the name implies, samples!
# basically a sampler defines the strategy to draw samples from a dataset.
# we have different kinds of samplers. what we are after is a sampler called 'SubsetRandomSampler'.
# we can access it from the 'torch.utils.data' module.
# This class samples elements randomly from a given list of indices, without replacement.
# without replacement simply means the sampled values are unique.
# what this class needs is a list of indexes. lets create ourselves a list of indexes :
dataset_train = datasets.MNIST(root='MNIST', train=True, transform=transformations, download=True)
dataset_test = datasets.MNIST(root='MNIST', train=False, transform=transformations, download=True)
train_num_samples = len(dataset_train)
# this simply gives us a list of indexes starting from 0 - 59999
train_indexes = list(range(train_num_samples))
print(f'some training indexes[:5] : {train_indexes[:5]}')
# now let us shuffle our indexes. shuffle is an in-place operation,
# so it reorders the list items directly.
np.random.shuffle(train_indexes)
print(f'some training indexes[:5] : {train_indexes[:5]}')
# now we have a list of indexes. lets specify our validation ratio from this list.
# here we are specifying that 20% of our data is reserved for validation,
val_ratio = 0.2
val_end = int(train_num_samples * val_ratio)
validation_split_indexes = train_indexes[0:val_end]
training_split_indexes = train_indexes[val_end:]
# lets view the changes
print(f'training size before split: {train_num_samples}')
print(f'validation size: {len(validation_split_indexes)}')
print(f'training size: {len(training_split_indexes)}')
# make sure the splits are actually correctly done!
assert len(validation_split_indexes) + len(training_split_indexes) == train_num_samples ,'they must match!'
# Now we have our list of indexes, what remains is to create a sampler that samples from
# these lists
sampler_train = torch.utils.data.SubsetRandomSampler(training_split_indexes)
sampler_val = torch.utils.data.SubsetRandomSampler(validation_split_indexes)
# and the last step is just to create a dataloader.
# note that we dont use shuffle here, as shuffle and samplers are mutually
# exclusive (i.e. they can not be used together!).
dataloader_train = torch.utils.data.DataLoader(dataset_train,
batch_size=32,
sampler=sampler_train,
num_workers=2)
dataloader_val = torch.utils.data.DataLoader(dataset_train,
batch_size=32,
sampler=sampler_val,
num_workers=2)
# check some samples
imgs, labels = next(iter(dataloader_val))
visualize_imgs(imgs, labels)
# so if you have a folder of images, you can use datasets.ImageFolder()
# and then use SubsetRandomSampler() to split your data into
# different sets. here we only created a validation set out of our
# training set, but you can do as many splits as you like.
# we will cover ImageFolder in later sections.
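# to recap the whole recipe, here is a small sketch of a reusable helper
# (a hypothetical function, just condensing the steps above)
def make_train_val_loaders(dataset, val_ratio=0.2, batch_size=32, num_workers=2):
    indexes = list(range(len(dataset)))
    np.random.shuffle(indexes)
    split = int(len(dataset) * val_ratio)
    val_sampler = torch.utils.data.SubsetRandomSampler(indexes[:split])
    train_sampler = torch.utils.data.SubsetRandomSampler(indexes[split:])
    train_loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size,
                                               sampler=train_sampler, num_workers=num_workers)
    val_loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size,
                                             sampler=val_sampler, num_workers=num_workers)
    return train_loader, val_loader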
#%%
# OK now that we have done this lets see how we train a model !
# before that we need a model
# we can create one or use an existing one,
# first lets see how to use an existing model !
model = models.AlexNet(10)
# thats it, now we have a model that we can use for training! however there is a catch, like always!
# this is AlexNet, and AlexNet was trained for the ImageNet challenge. in the ImageNet challenge, the images
# that were used to train this model were 224x224x3. so this model accepts 224x224x3 images as input,
# while our inputs are 28x28x1 images!! this simply wont work!!!
# we would have to resize our images from 28x28 to 224x224 (using Resize() in the transforms up there!)
# and also make the images 3-channeled instead of 1! this is too much work for now!
# so lets create our own model first and then later on see how we can use these models on new data!
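# (just as a sketch of what that adaptation would look like, without running it:
# resize to 224x224 and replicate the single gray channel three times)
alexnet_transformations = transforms.Compose([transforms.Resize((224, 224)),
                                              transforms.Grayscale(num_output_channels=3),
                                              transforms.ToTensor()])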
# for creating a model, we simply define a new class that inherits from torch.nn.Module()
# and then define our layers and ultimately specify the sequence of how these layers are used in
# forward() method. lets see how all of this can be implemented! (this is very easy!)
import torch.nn as nn
# torch.nn.functional module(usually imported into the F namespace by convention)
# is a module which contains activation functions, loss functions, etc, as well
# as non-stateful versions of layers such as convolutional and linear layers.
import torch.nn.functional as F
class ourNetwork(torch.nn.Module):
def __init__(self, num_classes=10):
super().__init__()
        # lets create a simple multilayer perceptron (a 4-fully-connected-layer network!)
# we have several ways for creating layers. the simplest form is like this
# we define many layers we need here and then in the forward() we specify their order
# nn.Linear() gives us a fully connected layer. since our image is 28x28,
# and we are using fully connected layers, this means our input dimension is 28x28
self.fc1 = nn.Linear(28*28, 256)
self.fc2 = nn.Linear(256, 128)
self.fc3 = nn.Linear(128, 64)
self.fc4 = nn.Linear(64, num_classes)
self.relu = nn.ReLU()
# this method is called in our forward phase
    # we can receive any number of parameters here,
# but the default is one parameter which is the input batch!
# we can also have none! and we will see an example concerning this
# later on! for now, our forward method takes one argument and that
# is a batch of images
def forward(self, input_images):
# since we have images, but our first layer
# is fully connected, we have to flatten our input images
# i.e. instead of being a matrix, we convert them into vector!
# we will not change the underlying data, we are only going to change
# how it looks (thus change its shape!) instead of seeing them in rows columns
# (28 rows, 28 columns ), suppose we have one row and 28x28 columns!
# as you remember, we use view or reshape for this (also resize_ but we chose
        # view because we explained why previously!)
        # for flattening we can do it in two ways:
# 1. we directly enter the size since we know the dims.
# input_images = input_images.view(-1, 28*28)
# the bad thing about this is that if later we change our image dims
        # we have to come back here and change the dims to match the new dims (e.g. 32x32)
        # a better way is to specify the batch size and then let the dims be automatically
        # inferred, like this :
batch_size = input_images.size(0)
input_images = input_images.view(batch_size, -1)
# to see how it changed! uncomment this
# print(input_images.shape)
output = self.fc1(input_images)
# now we need to use a nonlinearity! we can use
output = self.relu(output)
# or we could also use the functional form which is
        output = F.relu(self.fc2(output))
        output = F.relu(self.fc3(output))
        # note that we do NOT apply relu to the last layer: CrossEntropyLoss
        # (which we use below) expects the raw scores (logits)
        output = self.fc4(output)
        return output
# thats it! thats all we needed to create a model.
# now before we continue, lets test our model and see if
# we implemented everything correctly and have no errors whatsoever!
# lets create a dummy batch of fake images (which are just tensors with random values!)
# the shape represents batch, c, h, w (Pytorch is channels-first)
fake_images = torch.rand(size=(3, 1, 28, 28))
our_model = ourNetwork(num_classes=10)
output = our_model(fake_images)
print(f'output: {output}')
# so far so good!
#%%
# now lets go for the last part which is training /testing.
# we will need
# 1. an optimizer such as sgd or adam, etc
# 2. a criterion (a loss function )
# 3. thats all.
# all optimizers reside in torch.optim
# from torch import optim
# our optimizers take at least two parameters: 1. the model parameters
# and 2. the learning rate (they can take weight decay, momentum, etc. as well),
# but the model parameters and lr are the very minimum requirements.
# remember, if you set too high a learning rate, you will see your loss
# will not decrease (and may also not increase!!); it may very well get stuck
# at or around a value, and therefore your val_acc would also behave similarly.
# when you use batchnormalization you may not see this, as you can use a considerably
# higher learning rate. but knowing this is important. set this learning rate to 0.1
# for example and see the outcome. then reset it to 0.001, rerun the training loop,
# and witness the change.
optimizer = torch.optim.Adam(our_model.parameters(), lr = 0.001)
# for our loss function, its customary to use the name criterion.
# you can use any name you like, but criterion is overwhelmingly popular!
# different loss functions can be found under torch.nn module
# we use crossentropy for our classification task!
criterion = nn.CrossEntropyLoss()
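# a quick shape sanity check (just a sketch): CrossEntropyLoss expects raw
# logits of shape (batch, num_classes) plus integer class labels of shape (batch,)
dummy_logits = torch.randn(4, 10)
dummy_labels = torch.randint(0, 10, (4,))
print(f'dummy loss: {criterion(dummy_logits, dummy_labels).item()}')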
# now we are ready to start our training loop.
# before that lets write code that allows us to
# utilize GPU if its present
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
our_model = our_model.to(device)
epochs = 20
# at this interval we display our training loss or other information we like;
# for example, run our model on test data to see the validation acc/loss, or pretty much
# anything else we would like. you name it!
interval = 500
# a counter!
i = 0
training_losses = []
validation_losses = []
for e in range(epochs) :
for imgs, labels in dataloader_train:
        # we set our model to train mode.
        # this is especially needed if we use dropout or batchnormalization,
        # or pretty much any layer that behaves differently during training and testing!
our_model.train()
i+=1
# in case we have access to gpu, we must move
# all images, labels to gpu. so we should do
imgs = imgs.to(device)
labels = labels.to(device)
# now feed our images to the model and get the predictions
preds = our_model(imgs)
# we can also write
# preds = our_model.forward(imgs)
        # however, the first style is more popular; you can choose either of the two!
# now lets see the loss
loss = criterion(preds, labels)
# always do this before backpropagating the loss
        # this zeros the gradients from previous steps
optimizer.zero_grad()
# backpropagate the loss
loss.backward()
        # now update the weights (take one step of the optimizer)
optimizer.step()
if i % interval == 0 :
            # since loss is a tensor, in order to see a python scalar we use item();
            # otherwise print will show the verbose tensor information, because its a tensor!
print(f'epoch/iter: {e}/{i} training-loss: {loss.item()}')
# lets run evaluations on test set after each epoch :
total_corrects = 0
total_loss_val = 0.0
acc_val = 0.0
class_correct_counter = torch.zeros(size=(10,), dtype=torch.int32)
class_total_samples = torch.zeros(size=(10,), dtype=torch.int32)
total_corrects2 = 0
uncorrect_imgs_predicted_list = []
for imgs, labels in dataloader_test:
our_model.eval()
imgs = imgs.to(device)
labels = labels.to(device)
preds = our_model(imgs)
loss_val = criterion(preds, labels)
        # now lets see how the predictions look and what accuracy we get.
        # we can calculate the accuracy in a multitude of ways!
        # we'll see 3 ways of doing it. we start with
        # the easiest/most common way first.
        # first method :
        # in the first method, we get the index of the highest probabilities in our predictions
        # and compare them with our labels. our comparison will give us a series of
        # true/false values which indicate where our predictions were in line with our labels
        # and where they were different. we can then simply convert true/false to 1 and 0, and
        # then by summing all of them we get the total number of correct predictions. then, dividing
        # this number by the total number of samples in our test set, we get our accuracy.
        # we use torch.max() to get the highest values in a tensor.
values, class_indexes = torch.max(preds,dim=1)
        # max returns not only the highest (maximum) values,
        # it also gives us the index of those values. We use the index
        # to see which class (index) was predicted by the network
        # and compare it against the labels (our true classes)
result = (class_indexes == labels)
        # or we could also use torch.eq(a, b) (and NOT torch.equal()) for this.
        # Remember that torch.equal(a,b) says whether the two tensors are 'equal' or 'not',
        # while torch.eq(a,b) says whether 'each' element is the same or not.
        # thus torch.equal() returns a 'single' True or False value as its result,
        # while torch.eq() returns a tensor of boolean values, each indicating if the
        # respective elements were the same or not! therefore we use torch.eq, because we
        # want to check all elements of the tensors against each other:
# result = torch.eq(class_indexes, labels)
# to see its shape or content you can uncomment this
# print(result.shape)
        # result is a tensor of boolean values. lets sum them.
        # before that, lets convert True/False to 1s and 0s;
        # this way we treat True as 1 and False as 0. the sum therefore
        # gives us the number of classes that were
        # predicted correctly.
        # since we read in batches, we add up all the correct predictions
        # from each batch and then divide by the total number of samples
        # to get our accuracy! https://pytorch.org/docs/stable/torch.html#torch.eq
total_corrects += result.float().sum()
# 2. method
# There is a second way we can calculate the accuracy. and that is using topk!
# for example suppose, we want to get the accuracy for top5 (like in imagenet)
# how could we do it ? there are many ways but one of the easiest ways is using topk
# lets see how we can do that :
# each tensor has a method named topk, topk accepts an argument k, which we specify
# if we want top5 we set k=5, if we want top1 (which is normal accuracy) we set k=1.
# topk, like max, returns two results. the first is the top values and the second is their
# respective index. so lets see it in action .
top_values, indexes = preds.topk(k=1, dim=1)
# print(indexes.shape, labels.shape)
# we use squeeze on indexes so its shape becomes [32] instead of [32x1]
result2 = torch.eq(indexes.squeeze(), labels).float()
        # remember, since we are using topk, our 'indexes' has a shape of [32, 1] while our labels are
        # [32]. this is important, because if we compare these two, in newer versions of Pytorch (0.4+)
        # this will be broadcast and the result will be a tensor of [32,32] while it should be [32].
        # If we do (indexes == labels) or torch.eq(indexes, labels),
        # the result will have shape (32, 32): what it does is compare the one element in each row of
        # indexes with each element in labels, which returns 32 True/False boolean values per row.
        # therefore, in order to avoid this issue (which will not give you any error, but will mess up your result!)
        # always make sure the things you are comparing have the exact same shape.
        # one way is to use squeeze, which we just saw!
result2 = (indexes.squeeze() == labels)
        # the other way is to do something like this :
result2 = (indexes.view(*labels.shape) == labels)
# uncomment to see the shapes
# print(f'indexes: {indexes.shape} result(eq) {result.shape} result2(==) {result2.shape}')
total_corrects2 += result2.float().sum()
        # 3rd Method :
        # we have now seen two ways of calculating the accuracy. so far we counted
        # the number of correct samples and then divided it by the total number of samples
        # at the end to get our accuracy. we could also calculate the accuracy per batch, then
        # add all these accuracies and divide them by the total number of batches.
        # this is easily done as well. lets see how we can do this :
        # we have our predictions and labels; using max or topk we can get the indexes
        # of the highest predictions.
        # so if we add and average all of them, it should give us the accuracy per batch!
        # but there is a catch here: you must make sure the dimensions of the labels and prediction
        # indexes are the same. if they are not the same, and e.g. one is [N,1] while the other is [N],
        # it will be broadcast and the result will be a [N,N] tensor, which will mess up your result completely!
        # so here, just like before, we make sure the two tensors have the same shape.
        # (if we used max, the indexes and labels would both have the same shape and this wouldn't be necessary,
        # but its good practice to keep this in mind and always make sure the shapes match exactly,
        # so we dont spend a lot of time debugging later)
result3 = (indexes.view(*labels.shape) == labels)
acc_val += torch.mean(result3.float())
# we can get number of batches, simply by doing len(dataloader_test) !
total_loss_val += loss_val.item()
# lets see a more detailed analysis concerning our model performance
# that is, lets see, which classes are being predicted better than others
# for this, we will count the number of samples for each class and also its
# correct predictions by model.
# this can be easily done.
        # we need two arrays/lists with as many elements as we have classes
# lets do this
        for j in range(labels.size(0)):
            # note: we use j here, NOT i, so we dont clobber the global iteration counter!
            class_total_samples[labels[j].item()] += 1
            class_correct_counter[labels[j].item()] += (indexes[j] == labels[j]).item()
            # now lets get a bit more fancy and also save all the images that were incorrectly
            # classified. it only makes sense to do this at evaluation time, when the model is
            # trained and we want to see which images were hard for the network, to get an idea
            # and hopefully come up with some solutions to fix the issue. during training this is not
            # recommended because it imposes a lot of overhead!
            lbl = (indexes[j] == labels[j]).item()
            if lbl == 0:
                # we save a tuple containing the image, the predicted class, and the actual class.
                # this way, later on, we can see not only the wrongly predicted image but also
                # the wrong class it was mispredicted as
                uncorrect_imgs_predicted_list.append((imgs[j], indexes[j], labels[j]))
    for c in range(10):
        print(f'class {c} : Total Samples : {class_total_samples[c].item()} / {class_correct_counter[c].item()}'\
              f' acc: {class_correct_counter[c].item()/class_total_samples[c].item()}')
    print(f'test set batches: {len(dataloader_test)}')
    # note: total_loss_val accumulates one (mean) loss per batch, so we
    # average it over the number of batches
    print(f'accuracy_val(total corrects/dataset_size): {total_corrects/len(dataloader_test.dataset):.4f}'\
          f'\naccuracy_val(topk(k=1)): {total_corrects2/len(dataloader_test.dataset):.4f}'\
          f'\naccuracy_val(topk(k=1)-per batch acc): {acc_val/len(dataloader_test):.4f}'\
          f'\nloss_val: {total_loss_val/len(dataloader_test):.6f}')
#%%
# lets visualize our wrongly predicted images and their wrong/true classes
def visualize_wrongly_predicted(imgs_list, cols = 10):
count = len(imgs_list)
    rows = int(np.ceil(count / cols))  # add_subplot needs an integer row count
print(count,rows)
fig = plt.figure(figsize=(cols, rows))
plt.subplots_adjust(wspace=1, hspace=1)
for i in range(count):
ax = fig.add_subplot(rows, cols, i+1, xticks=[], yticks=[])
(img, pred_class, true_class) = imgs_list[i]
        # convert from tensor to numpy image. squeeze because its 1x28x28 and
        # matplotlib expects 28x28 for grayscale images!
img = img.detach().cpu().numpy().squeeze()
ax.imshow(img,cmap='Greys_r')
ax.set_title(f'({pred_class.item()} | {true_class.item()})')
# plot the wrongly predicted images (predicted | true)
visualize_wrongly_predicted(uncorrect_imgs_predicted_list)
#%%
# OK great, we created a network from scratch and trained/tested it successfully.
# you probably have a couple of questions (ok, many questions!). one of them is probably
# about the way we created our network. if you look back again, you'll see we didnt use
# any softmax layer at the end. why?
# as you know, in classification problems we usually use a softmax layer at the very end
# of the network to get probabilities for our predictions.
# we also know that we usually use softmax with a crossentropy loss.
# Yet here we used crossentropy without softmax? what is going on?
# we can use softmax with crossentropy, but if you read the documentation
# you'll notice that CrossEntropyLoss actually requires
# 'logits' or 'raw scores' as inputs.
# it then applies a 'logsoftmax' on its input (the logits), and then applies
# a negative log likelihood afterward.
# So we can have softmax as our last layer, but for calculating the loss we
# must send the logits.
# But why do we use logits and not softmax itself?
# we use logits and not softmax because softmax gives us probabilities, which are floating
# point numbers ranging from 0. to 1., and the critical issue with floating point numbers is
# that they can not accurately represent numbers close to 0 or 1, and thus we
# face numerical instabilities. Therefore we use the raw scores, or logits.
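# a quick numerical check of that claim (just a sketch): CrossEntropyLoss on
# raw logits gives the same value as NLLLoss applied to log_softmax of them
check_logits = torch.randn(4, 10)
check_labels = torch.randint(0, 10, (4,))
ce = nn.CrossEntropyLoss()(check_logits, check_labels)
nll = nn.NLLLoss()(F.log_softmax(check_logits, dim=1), check_labels)
print(torch.allclose(ce, nll))  # True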
#
# OK, but what about log_softmax? why do we use log_softmax instead of softmax then?
# there are several reasons for this. one of them is that log_softmax is numerically stable and doesnt
# have the problems associated with softmax. it also plays nicer with crossentropy
# (https://datascience.stackexchange.com/questions/40714/what-is-the-advantage-of-using-log-softmax-instead-of-softmax)
# so in short, having log probabilities helps both numerical stability and optimization performance.
# From wikipedia :
# A log probability is simply a logarithm of a probability. The use of log probabilities means
# representing probabilities on a 'logarithmic scale', instead of the standard [0,1] unit interval.
# Since the probability of independent events multiply, and logarithms convert multiplication to addition,
# log probabilities of independent events add. Log probabilities are thus practical for computations,
# and have an intuitive interpretation in terms of information theory: the negative of the log probability
# is the information content of an event. Similarly, likelihoods are often transformed to the log scale,
# and the corresponding log-likelihood can be interpreted as the degree to which an event supports a
# statistical model. The log probability is widely used in implementations of computations with
# probability, and is studied as a concept in its own right in some applications of information theory,
# such as natural language processing.
# Representing probabilities in this way has several practical advantages:
# Speed: Since multiplication is more expensive than addition, taking the product of a
# high number of probabilities is often faster if they are represented in log form.
# (The conversion to log form is expensive, but is only incurred once.)
# Multiplication arises from calculating the probability that multiple independent
# events occur: the probability that all independent events of interest occur is
# the product of all these events' probabilities.
# Accuracy: The use of log probabilities improves numerical stability, when the probabilities
# are very small, because of the way in which computers approximate real numbers.
# Simplicity: Many probability distributions have an exponential form. Taking the log of
# these distributions eliminates the exponential function, unwrapping the exponent.
# For example, the log probability of the normal distribution's PDF
# is $-((x-m_x)/\sigma_m)^2 + C$ instead of $C_2 \exp(-((x-m_x)/\sigma_m)^2)$.
# Log probabilities make some mathematical manipulations easier to perform.
# Ok, a quick, not so related question :
# whats the difference between sigmoid and softmax, by the way?
# softmax is an extension of sigmoid. sigmoid
# is used for binary classification or multi-label classification, where the classes arent mutually
# exclusive. softmax is used when classes are mutually exclusive and there is only "ONE" correct class.
# see this for more on softmax vs sigmoid:
# https://stats.stackexchange.com/questions/233658/softmax-vs-sigmoid-function-in-logistic-classifier
# recap from before : softmax :
# Exercise: Implement a function softmax that performs the softmax calculation
# and returns probability distributions for each example in the batch. Note that
# you'll need to pay attention to the shapes when doing this. If you have a
# tensor a with shape (64, 10) and a tensor b with shape (64,), doing a/b will
# give you an error because PyTorch will try to do the division across the
# columns (called broadcasting) but you'll get a size mismatch. The way to think
# about this is for each of the 64 examples, you only want to divide by one value,
# the sum in the denominator. So you need b to have a shape of (64, 1).
# This way PyTorch will divide the 10 values in each row of a by the one value in
# each row of b. Pay attention to how you take the sum as well. You'll need to
# define the dim keyword in torch.sum. Setting dim=0 takes the sum across the rows
# while dim=1 takes the sum across the columns.
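# here is one possible solution to that exercise (a sketch):
def softmax(x):
    # x has shape (batch, classes); keepdim=True keeps the summed dimension
    # as (batch, 1) so broadcasting divides each row by its own denominator
    return torch.exp(x) / torch.sum(torch.exp(x), dim=1, keepdim=True)
probs = softmax(torch.randn(64, 10))
print(probs.shape, probs.sum(dim=1)[:3])  # each row sums to 1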
# so the moral of the story : use log_softmax instead of softmax.
# if you used log_softmax in your network, use the negative log likelihood loss (NLLLoss).
# to get probabilities at test time when you used log_softmax, just use torch.exp() on
# your result and youre done.
# that was it!
#%%
import torch
# now in this section we are going to learn how to do some initialization.
# there are several ways you can do this :
# 1. using the torch.nn.init module, which provides lots of initialization algorithms
# such as xavier, kaiming (msra), etc.
# 2. using model.apply()
# 3. directly initializing the weights and biases.
# lets start with the direct method first.
# as you know, each module may contain a weight and a bias.
# we can access them directly. each weight has a data and a grad attribute:
# data, as the name sounds, contains the data, while grad contains the weight's
# gradients.
# this is the case for the bias as well; it has a data and a grad attribute.
# lets create a simple fc layer and initialize its weights and biases in different ways
fc = torch.nn.Linear(1,1)
# initialize using normal distribution
# since data is a normal tensor, it has access to all methods available to any tensors
# including the inplace normal_ method which samples from a normal distribution
print(f'default: {fc.weight.data}')
fc.weight.data.normal_(mean=0.5, std=1)
print(f'normal_: {fc.weight.data}')
# initialize using uniform distribution [0,1]
print(f'default:_ {fc.weight.data}')
fc.weight.data.uniform_(0.01,0.1)
print(f'uniform:_ {fc.weight.data}')
# for initializing bias we can simply do :
print(f'bias(default): {fc.bias.data}')
fc.bias.data.normal_(0.5,1)
print(f'bias(normal): {fc.bias.data}')
# or more famously just initialize all by zero or 1 or etc
fc.bias.data.fill_(1)
print(f'bias(1): {fc.bias.data}')
# since no operation has been done yet, there are no gradients yet!
print(f'grad: {fc.weight.grad}')
# the second way we can initialize a module's weights and biases
# is using the torch.nn.init module. basically this is what we use nearly 99%
# of the time.
# using it is straightforward
torch.nn.init.normal_(fc.weight, mean = 0.0, std=1.0)
# or
torch.nn.init.uniform_(fc.weight, a= -1.0, b = 1.0)
# initializing using a constant
torch.nn.init.constant_(fc.weight, 0.1234)
torch.nn.init.constant_(fc.bias, 0.9999)
print(f'fc.weight.data: {fc.weight.data}, \nfc.bias.data: {fc.bias.data}')
# using xavier initialization
torch.nn.init.xavier_normal_(fc.weight, gain=torch.nn.init.calculate_gain('relu'))
# now suppose we want to initialize all layers using a specific initialization scheme. how do we do it?
# its easy; we do it like this :
#first lets create a simple dummy model
import torch.nn as nn
model = torch.nn.Sequential(*[nn.Linear(1,1), nn.ReLU(),
nn.Conv1d(1,1,1), nn.ReLU()])
# we iterate over all 'modules' and initialize
# their parameters (weights and biases)
for m in model.modules():
if isinstance(m, nn.Linear):
torch.nn.init.xavier_normal_(m.weight, torch.nn.init.calculate_gain('relu'))
torch.nn.init.constant_(m.bias, 1)
print('linear initialized!')
elif isinstance(m, nn.Conv1d):
torch.nn.init.kaiming_normal_(m.weight, a = 0)
torch.nn.init.constant_(m.bias, 1)
print('conv initialized!')
# we can define this in our architecture and initialize all modules.
# but what if we have a model, and after its creation we want to change the initialization?
# here we use model.apply().
# we first write a function that does the initialization and then apply this function to the model.
# this is how it is done:
# first write a function and do your thing!
def initialize_some_layers(m):
if isinstance(m, nn.Linear):
nn.init.constant_(m.weight, 1.0)
elif isinstance(m, nn.Conv2d):
nn.init.kaiming_normal_(m.weight, 0)
# showing the first module's weight before the new initialization
print('before: ', model._modules['0'].weight.data)
model.apply(initialize_some_layers)
# after the initialization :
# quick note: our model has a .modules() method which is a generator;
# each time we call modules(), it yields one module from _modules.
# here, however, we used the underlying _modules, which is a dictionary.
# since we didnt specify names when creating our modules, numbers
# identify each module; thats why we used '0'. we will learn about this
# more in the next section.
print('after: ',model._modules['0'].weight.data)
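# as an aside, the public way to inspect modules by name (instead of poking
# at the private _modules dict) is named_children():
for name, module in model.named_children():
    print(name, module)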
#thats it
#%%
# so that was it!! you now know how to initialize a tensor / a module's parameters in different ways.
# in this section we will learn several other ways of creating networks.
# previously we learned just one way of creating a network, which was simply creating
# some layers and then using them in the order we wanted. here we will learn more ways of doing this.
#
#
#
# we will create networks in different ways, as follows:
# 1. using the simple layer definition (what we just saw)
# 2. using nn.Sequential() to create a sequential model
# 3. using an OrderedDict to create named modules.
# 4. using a list (and nn.ModuleList) to create a list of layers.
# 5. using new modules
# This list probably doesnt make any sense to you yet, but when we implement them you'll understand
# why we covered each one here.
# lets start with the first method:
# 1. In the first method, we first define all of the layers we need and then in forward()
# we simply call them and use them in any order we like. here is a simple example
class simple_net(nn.Module):
def __init__(self):
super().__init__()
self.conv1 = nn.Conv2d(3, 6, 3)
self.relu1 = nn.ReLU()
self.conv2 = nn.Conv2d(6, 12, 3)
self.pool1 = nn.MaxPool2d(2,2)
self.relu2 = nn.ReLU()
def forward(self, x):
output = self.conv1(x)
output = self.relu1(output)
output = self.conv2(output)
output = self.relu2(output)
output = self.pool1(output)
return output
# as you can see, we defined an attribute for each layer we wanted to use.
# we can do better than this. for example, we can avoid defining two relu layers
# and instead use the functional form, like this. we can access the functional forms
# from the torch.nn.functional module.
import torch.nn.functional as F
class simple_net2(nn.Module):
def __init__(self):
super().__init__()
self.conv1 = nn.Conv2d(3, 6, 3)
self.conv2 = nn.Conv2d(6, 12, 2)
self.pool1 = nn.MaxPool2d(2,2)
def forward(self, x):
output = F.relu(self.conv1(x))
        output = F.relu(self.conv2(output))
return self.pool1(output)
# as you may have guessed, we can use the functional form for any of these; its up to you
# when and where to use either of them. we usually use activation functions this way, as
# its more concise, easier to use, and more readable.
# Also, as you know, you have control over which layer gets executed, so you can have
# different if clauses in your forward() as well.
# 2. using nn.Sequential() class
# If we want to create a plain network which is just a series of layers in succession,
# we can make life easier using the nn.Sequential() class. this class, as the name implies,
# applies a series of layers in succession.
# the nn.Sequential class is in fact a sequential container: 'Modules' will be added to it in
# the order they are passed in the constructor.
# Alternatively, an OrderedDict of modules can also be passed in.
# As you see, there are several ways we can create a network using the nn.Sequential() class.
# below I'll show some ways that come to my mind:
# first lets see what
# "'Modules' will be added to it in the order they are passed in the constructor."
# means. it means the layers' order is fixed when we pass them to the constructor.
# see the example below to understand this :
class sequential_net1(nn.Module):
def __init__(self):
super().__init__()
self.net = nn.Sequential(nn.Conv2d(3, 6, 3),
nn.ReLU(),
nn.Conv2d(6, 12, 3),
nn.ReLU(),
nn.MaxPool2d(2,2)
)
def forward(self, x):
output = self.net(x)
return output
# We can also use an OrderedDict for this.
# this is how we do it (note that OrderedDict takes a list of (name, module) tuples):
from collections import OrderedDict
class sequential_net2(nn.Module):
    def __init__(self):
        super().__init__()
        self.model = nn.Sequential(OrderedDict([
            ("conv1", nn.Conv2d(3, 6, 3)),
            ("relu1", nn.ReLU()),
            ("conv2", nn.Conv2d(6, 12, 3)),
            ("relu2", nn.ReLU()),
            ("maxpool1", nn.MaxPool2d(2, 2))
        ]))
def forward(self, inputs):
return self.model(inputs)
# we can also use a 'list' of layers with our nn.Sequential class.
# here we use a list of layers with nn.Sequential
class sequential_net3(nn.Module):
def __init__(self):
super().__init__()
# nn.Sequential() can take a list of layers as well
# so first lets create a list
layers = []
# add each layer you like to the list
layers.append(nn.Conv2d(3, 6, 3))
layers.append(nn.ReLU())
layers.append(nn.Conv2d(6, 12, 3))
layers.append(nn.ReLU())
layers.append(nn.MaxPool2d(2,2))
# and now we use nn.Sequential()
        # since Sequential does not accept a list directly,
        # we unpack it using the * operator.
self.net = nn.Sequential(*layers)
# and now in forward, we just need to write one line of code!
def forward(self, x):
return self.net(x)
# You may see this a lot in many architectures in Pytorch. however,
# you are better off not using a plain Python list, and instead using Pytorch's ModuleList,
# because :
# 1. the parameters (weights, biases) of modules inside a Python list will be missing,
# and if you use them in training, they will not be updated unless you manually pass those parameters
# to the optimizer as well.
# This means, when you do 'model.parameters()', the parameters of layers inside a Python list wont be
# returned, because Pytorch doesnt look for modules (i.e. layers) inside a Python list.
# However, when you use a ModuleList, there is no problem and everything is fine.
# 2. Even if we pass those modules (that are in a Python list) manually, when saving models using
# 'model.state_dict()', the parameters of modules inside a Python list will not be saved.
# So always stick to ModuleList.
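# a tiny demonstration of that difference (just a sketch):
class with_pylist(nn.Module):
    def __init__(self):
        super().__init__()
        self.layers = [nn.Linear(2, 2)]                 # invisible to Pytorch!
class with_modulelist(nn.Module):
    def __init__(self):
        super().__init__()
        self.layers = nn.ModuleList([nn.Linear(2, 2)])  # properly registered
print(len(list(with_pylist().parameters())))      # 0
print(len(list(with_modulelist().parameters())))  # 2 (weight and bias)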
# using a ModuleList is no different from using a normal list; simply swap them and thats it!:
#
# torch.nn.ModuleList :
# torch.nn.ModuleList can be indexed like a regular Python list, but modules it contains are properly registered,
# and will be visible to all torch.nn.Module methods.
# here is an example showing how to use nn.ModuleList
class sequential_net4(nn.Module):
def __init__(self):
super().__init__()
        # this time we use nn.ModuleList instead of a plain Python list,
        # so the layers get registered properly
layers = nn.ModuleList()
# add each layer you like to the list
layers.append(nn.Conv2d(3, 6, 3))
layers.append(nn.ReLU())
layers.append(nn.Conv2d(6, 12, 3))
layers.append(nn.ReLU())
layers.append(nn.MaxPool2d(2,2))
        # and now we use nn.Sequential();
        # since Sequential does not accept a list directly,
        # we unpack it using the * operator.
self.net = nn.Sequential(*layers)
# and now in forward, we just need to write one line of code!
def forward(self, x):
return self.net(x)
# Since Sequential is a module container and respects the order of insertion
# (internally it maintains an ordered dict), we can add different modules
# using the add_module() method. this way we can provide a unique name
# for each module and later on access it using this very name!
# also, when printing the model, the names that we gave our modules
# make the architecture more readable!
# This is how we do it
class sequential_net5(nn.Module):
def __init__(self):
super().__init__()
self.model = nn.Sequential()
self.model.add_module('conv1', nn.Conv2d(3, 6, 3))
self.model.add_module('relu1', nn.ReLU())
self.model.add_module('conv2', nn.Conv2d(6, 12, 3))
self.model.add_module('relu2', nn.ReLU())
self.model.add_module('maxpool2', nn.MaxPool2d(2,2))
def forward(self, inputs):
return self.model(inputs)
# Similarly to what we saw with ModuleList, we have ModuleDict,
# which is an 'ordered' dictionary (unlike the plain dict of older Python
# versions, which didnt preserve insertion order).
# So we can also use a ModuleDict instead of the Sequential() class.
# here is an example.
# nn.ModuleDict holds submodules in a dictionary.
# torch.nn.ModuleDict can be indexed like a regular Python dictionary,
# but modules it contains are properly registered, and will be visible
# by all torch.nn.Module methods.
# torch.nn.ModuleDict is an **ordered** dictionary that respects the order of insertion,
# and in torch.nn.ModuleDict.update, the order of the merged OrderedDict or
# another torch.nn.ModuleDict (the argument to torch.nn.ModuleDict.update).
# Note that torch.nn.ModuleDict.update with other unordered mapping types
# (e.g., Python's plain dict) does not preserve the order of the merged mapping.
# important note: the forward pass for a ModuleDict wont happen by itself!
# there is no forward pass implemented for ModuleDict;
# its just a container. Sequential, however, does implement forward.
#
class sequential_net6(nn.Module):
def __init__(self):
super().__init__()
# the order of insertion is preserved
self.model = nn.ModuleDict()
# we can use the add_module
self.model.add_module('conv1', nn.Conv2d(3, 6, 3))
# or simply use the ordinary way!
self.model['relu1'] = nn.ReLU()
self.model['conv2'] = nn.Conv2d(6, 12, 3)
self.model['relu2'] = nn.ReLU()
self.model['maxpool1'] = nn.MaxPool2d(2, 2)
def forward(self, inputs):
        # simply doing :
        # return self.model(inputs)
        # wont work, as ModuleDict doesnt implement forward(),
        # so we must manually forward through all the layers here, i.e. do
        out = inputs
        # named_children() iterates over the immediate (top-level) submodules only;
        # we dont use .modules() here, because that generator also yields the
        # container itself (and any nested modules), which we cant call like a layer.
for n,m in self.model.named_children():
out = m(out)
return out
# whats the benefit of doing all this?
# there are several benefits to using nn.Sequential:
# 1. we can simply define our whole network in one line, as you saw,
# and have everything in a concise manner (our forward() becomes a one-liner!).
# 2. we can create the different parts of our network this way;
# for example, we can define a feature-extractor part and
# a classifier or a regressor, etc. for our network,
# and call them separately, which gives us a lot of flexibility.
# lets see some of these use cases:
# example 1 :
class sequential_net7(nn.Module):
def __init__(self):
super().__init__()
self.features = nn.Sequential(nn.Conv2d(3, 6, 3),
nn.ReLU(),
nn.Conv2d(6, 12, 3),
nn.ReLU(),
nn.MaxPool2d(2,2)
)
self.classifer = nn.Linear(12, 2)
    def forward(self, x):
        output = self.features(x)
        # the conv features are 4D (batch, c, h, w), while Linear expects
        # (batch, features); global average pooling collapses the spatial dims
        output = F.adaptive_avg_pool2d(output, 1).view(x.size(0), -1)
        output = self.classifer(output)
        return output
# now, using this scheme, we can easily swap out any part of the network later, like the classifier,
# or easily reuse the feature extractor of our network. for example, when finetuning, we can create a new
# classifier and assign it to our model.classifer and retrain our model without
# even changing one line of our architecture. we will see this in action when we do finetuning.
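# a tiny sketch of that idea: swap in a new head for a hypothetical 5-class task,
# leaving the feature extractor untouched
net7 = sequential_net7()
net7.classifer = nn.Linear(12, 5)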
# another benefit of using Sequential is that we can create different building blocks
# for our networks. we just saw an example; lets expand on that.
# here we will create a function that builds a convolution layer with batchnorm and
# relu. we create a list of layers and then send this list to nn.Sequential
def convlayer(input_dim, output_dim, kernel_size=3, stride=1, padding=1, batchnorm=False):
layers = nn.ModuleList()
conv = nn.Conv2d(input_dim, output_dim, kernel_size, stride, padding)
layers.append(conv)
if batchnorm:
layers.append(nn.BatchNorm2d(output_dim))
layers.append(nn.ReLU())
return nn.Sequential(*layers)
class sequential_net8(nn.Module):
def __init__(self):
super().__init__()
self.conv1 = convlayer(3, 6, 3)