Many optimizations to NN, especially ConvLayer and PoolLayer
- NN layers are no longer mutable and are fully parametric
- Forward/backward passes can now run on eltypes other than Float64; for example, the whole computation chain can be kept in Float32 (see the sketch below)
- On many layers the in-place _zComp!(y,layer,x) replaces _zComp(layer,x); for convolutional layers this also applies to the de_dx and de_dw computations
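
To make the last two bullets concrete, here is a minimal, purely illustrative sketch of the allocating versus in-place pattern and of keeping a whole pass in Float32. The ToyDense type and its _zcomp/_zcomp! functions are hypothetical stand-ins, not BetaML internals:

using LinearAlgebra

struct ToyDense{ET}                # hypothetical stand-in for a parametric layer
    w::Matrix{ET}
    b::Vector{ET}
end

# Allocating version: returns a fresh output vector on every call
_zcomp(l::ToyDense, x) = l.w * x .+ l.b

# In-place version, in the spirit of _zComp!(y,layer,x): writes into a
# preallocated buffer y, so repeated forward passes do not allocate
function _zcomp!(y, l::ToyDense, x)
    mul!(y, l.w, x)                # y = w*x without a temporary
    y .+= l.b
    return y
end

layer32 = ToyDense(rand(Float32, 4, 3), zeros(Float32, 4))
x32     = rand(Float32, 3)
y32     = similar(layer32.b)       # preallocated Float32 output buffer
_zcomp!(y32, layer32, x32)
eltype(y32)                        # Float32: the chain never promotes to Float64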
sylvaticus committed Jan 17, 2024
1 parent 4e7c9ce commit 98eb947
Showing 16 changed files with 1,444 additions and 734 deletions.
1,633 changes: 1,082 additions & 551 deletions docs/Manifest.toml

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions docs/Project.toml
@@ -1,5 +1,6 @@
[deps]
BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
BetaML = "024491cd-cc6b-443e-8034-08ea7eb7db2b"
CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b"
Clustering = "aaaa29a8-35af-508c-8bc3-b662a17a0fe5"
DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
167 changes: 104 additions & 63 deletions docs/src/tutorials/Image recognition/Image_recognition.jl
@@ -1,6 +1,6 @@
# ## Library and data loading
using Dates #src
println(now(), " ", "*** Start iris clustering tutorial..." ) #src
println(now(), " ", "*** Start image recognition tutorial..." ) #src

# Activating the local environment specific to BetaML documentation
using Pkg
@@ -11,88 +11,63 @@ using Random
Random.seed!(123);
using DelimitedFiles
using Statistics
using BenchmarkTools
using Plots
using Flux
using Flux: Data.DataLoader
using Flux: onehotbatch, onecold, crossentropy
using Flux: @epochs
using MLDatasets # For loading the training data
#using Images, FileIO, ImageTransformations # For loading the actual images

TESTRNG = FIXEDRNG # This could change...

x_train, y_train = MLDatasets.MNIST.traindata()
x_train, y_train = MLDatasets.MNIST(split=:train)[:]
x_train = permutedims(x_train,(3,2,1))
x_train = convert(Array{Float32,3},x_train)
x_train = convert(Array{Float64,3},x_train)
x_train = reshape(x_train,size(x_train,1),size(x_train,2)*size(x_train,3))
ohm = OneHotEncoder()
y_train_oh = fit!(ohm,y_train)

x_test, y_test = MLDatasets.MNIST.testdata()
x_test, y_test = MLDatasets.MNIST(split=:test)[:]
x_test = permutedims(x_test,(3,2,1))
x_test = convert(Array{Float32,3},x_test)
x_test = convert(Array{Float64,3},x_test)
x_test = reshape(x_test,size(x_test,1),size(x_test,2)*size(x_test,3))
y_test_oh = predict(ohm,y_test)


#=
(N,D) = size(x_train)
l1 = ReshaperLayer((D,1),(28,28,1))
l2 = ConvLayer((28,28),(5,5),1,8,padding=2,stride=2,rng=copy(TESTRNG))
size(l2)
l3 = ConvLayer(size(l2)[2],(3,3),16,padding=2,stride=2,rng=copy(TESTRNG))
size(l3)
l4 = ConvLayer(size(l3)[2],(3,3),32,padding=1,stride=2,rng=copy(TESTRNG))
size(l4)
l5 = ConvLayer(size(l4)[2],(3,3),32,padding=1,stride=2,rng=copy(TESTRNG))
size(l5)
l6 = PoolingLayer(size(l5)[2],(2,2),f=mean)
size(l6)
l7 = ReshaperLayer(size(l6)[2])
size(l7)
l8 = DenseLayer(size(l7)[2][1],10,f=BetaML.relu, rng=copy(TESTRNG))
size(l8)
l9 = VectorFunctionLayer(size(l8)[2][1],f=BetaML.softmax)
size(l9)
layers = [l1,l2,l3,l4,l5,l6,l7,l8,l9]
m = NeuralNetworkEstimator(layers=layers,loss=squared_cost,verbosity=NONE,batch_size=64,epochs=1)
(x_debug,x_other),(y_debug_oh,y_other_oh) = partition([x_train,y_train_oh],[0.005,0.995])
ŷ = fit!(m,x_debug,y_debug_oh)
mode(ŷ)
accuracy(predict(y_debug_oh),mode(ŷ))
hcat(y_train[1:100],mode(ŷ))
=#


(N,D) = size(x_train)
l1 = ReshaperLayer((D,1),(28,28,1))
size(l1)
l2 = ConvLayer(size(l1)[2],(5,5),8,stride=2,rng=copy(TESTRNG))
size(l2)
l3 = PoolingLayer(size(l2)[2],(2,2))
size(l3)
l4 = ConvLayer(size(l3)[2],(3,3),16,stride=2,rng=copy(TESTRNG))
size(l4)
l5 = PoolingLayer(size(l4)[2],(2,2))
size(l5)
l6 = ReshaperLayer(size(l5)[2])
size(l6)
l7 = DenseLayer(size(l6)[2][1],10,f=BetaML.relu, rng=copy(TESTRNG))
size(l7)
l8 = VectorFunctionLayer(size(l7)[2][1],f=BetaML.softmax)
size(l8)
layers = [l1,l2,l3,l4,l5,l6,l7,l8]
m = NeuralNetworkEstimator(layers=layers,loss=squared_cost,verbosity=HIGH,batch_size=64,epochs=5)

(N,D) = size(x_train)

# Building the model:

## 784x1 => 28x28x1
l1 = ReshaperLayer((D,1),(28,28,1))
## 28x28x1 => 14x14x8
l2 = ConvLayer(size(l1)[2],(5,5),8,stride=2,f=relu,rng=copy(TESTRNG))
## 14x14x8 => 7x7x16
l3 = ConvLayer(size(l2)[2],(3,3),16,stride=2,f=relu,rng=copy(TESTRNG))
## 7x7x16 => 4x4x32
l4 = ConvLayer(size(l3)[2],(3,3),32,stride=2,f=relu,rng=copy(TESTRNG))
## 4x4x32 => 2x2x32
l5 = ConvLayer(size(l4)[2],(3,3),32,stride=2,f=relu,rng=copy(TESTRNG))
## 2x2x32 => 1x1x32 (global per layer mean)
l6 = PoolingLayer(size(l5)[2],(2,2),stride=(2,2),f=mean)
## 1x1x32 => 32x1
l7 = ReshaperLayer(size(l6)[2])
## 32x1 => 10x1
l8 = DenseLayer(size(l7)[2][1],10,f=identity, rng=copy(TESTRNG))
## 10x1 => 10x1
l9 = VectorFunctionLayer(size(l8)[2][1],f=BetaML.softmax)
layers = [l1,l2,l3,l4,l5,l6,l7,l8,l9]
m = NeuralNetworkEstimator(layers=layers,loss=squared_cost,verbosity=HIGH,batch_size=128,epochs=4)
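
# As a quick sanity check of the shape annotations above: with symmetric padding,
# the usual convolution output-size formula reproduces the sizes used here. The
# pad values below match the equivalent Flux model later in this tutorial; that
# BetaML's ConvLayer defaults behave the same way is an assumption.
conv_out(n_in, kernel, pad, stride) = (n_in + 2*pad - kernel) ÷ stride + 1
conv_out(28, 5, 2, 2)    ## 14: 28x28 => 14x14 for the 5x5, stride-2 convolution
conv_out(14, 3, 1, 2)    ## 7:  14x14 => 7x7  for a 3x3, stride-2 convolution
conv_out(7, 3, 1, 2)     ## 4:  7x7   => 4x4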

# We train the model only on a subset of the training data; otherwise training would take too long for the automated building of this page.
# Training on the whole MNIST set takes approximately 16 minutes on a mid-level laptop (on CPU), leading to a test accuracy of 0.969.
(x_debug,x_other),(y_debug_oh,y_other_oh) = partition([x_train,y_train_oh],[0.01,0.99],rng=copy(TESTRNG))

ŷ = fit!(m,x_debug,y_debug_oh)

y_true = inverse_predict(ohm,convert(Matrix{Bool},y_debug_oh))
## ŷ = fit!(m,x_train,y_train_oh)

##y_true = inverse_predict(ohm,convert(Matrix{Bool},y_train_oh))
# y_true = inverse_predict(ohm,convert(Matrix{Bool},y_debug_oh))
ŷ_nonoh = inverse_predict(ohm,ŷ)
accuracy(y_true,ŷ_nonoh)
hcat(y_true,ŷ_nonoh)
@@ -106,3 +81,69 @@ hcat(ytest_true,ŷtest_nonoh)
cm = ConfusionMatrix()
fit!(cm,ytest_true,ŷtest_nonoh)
print(cm)

res = info(cm)

heatmap(string.(res["categories"]),string.(res["categories"]),res["normalised_scores"],seriescolor=cgrad([:white,:blue]),xlabel="Predicted",ylabel="Actual", title="Confusion Matrix (normalised scores)")

# -----------------------------------------------------------
# ## Flux implementation
# This is the equivalent workflow in Flux.
# Fitting on the whole training dataset leads to a test accuracy of 0.9658, so likely not statistically different from BetaML, but with a much faster computation time, as it takes only 2 minutes instead of 16.


x_train, y_train = MLDatasets.MNIST(split=:train)[:]
x_train = permutedims(x_train,(2,1,3)); # For correct img axis
#x_train = convert(Array{Float32,3},x_train);
x_train = reshape(x_train,(28,28,1,60000));
y_train = Flux.onehotbatch(y_train, 0:9)
train_data = Flux.Data.DataLoader((x_train, y_train), batchsize=128)
#x_test, y_test = MLDatasets.MNIST.testdata(dir = "data/MNIST")
x_test, y_test = MLDatasets.MNIST(split=:test)[:]
x_test = permutedims(x_test,(2,1,3)); # For correct img axis
#x_test = convert(Array{Float32,3},x_test);
x_test = reshape(x_test,(28,28,1,10000));
y_test = Flux.onehotbatch(y_test, 0:9)

model = Chain(
## 28x28 => 14x14
Conv((5, 5), 1=>8, pad=2, stride=2, Flux.relu),
## 14x14 => 7x7
Conv((3, 3), 8=>16, pad=1, stride=2, Flux.relu),
## 7x7 => 4x4
Conv((3, 3), 16=>32, pad=1, stride=2, Flux.relu),
## 4x4 => 2x2
Conv((3, 3), 32=>32, pad=1, stride=2, Flux.relu),
## Average pooling on each width x height feature map
GlobalMeanPool(),
Flux.flatten,
Dense(32, 10),
Flux.softmax
)



myaccuracy(y,ŷ) = (mean(Flux.onecold(ŷ) .== Flux.onecold(y)))
myloss(x, y) = Flux.crossentropy(model(x), y)

opt = Flux.ADAM()
ps = Flux.params(model)
number_epochs = 4

[(println(e); Flux.train!(myloss, ps, train_data, opt)) for e in 1:number_epochs]

ŷtrain = model(x_train)
ŷtest = model(x_test)
myaccuracy(y_train,ŷtrain)
myaccuracy(y_test,ŷtest)

plot(Gray.(x_train[:,:,1,2]))

cm = ConfusionMatrix()
fit!(cm,Flux.onecold(y_test) .-1, Flux.onecold(ŷtest) .-1 )
println(cm)

res = info(cm)
heatmap(string.(res["categories"]),string.(res["categories"]),res["normalised_scores"],seriescolor=cgrad([:white,:blue]),xlabel="Predicted",ylabel="Actual", title="Confusion Matrix (normalised scores)")


12 changes: 8 additions & 4 deletions src/Nn/Nn.jl
@@ -85,11 +85,15 @@ Structure representing the learnable parameters of a layer or its gradient.
The learnable parameters of a layer are given in the form of an N-tuple of Array{Float64,N2} where N2 can change (e.g. we can have a layer whose first parameter is a matrix and whose second one is a scalar).
We wrap the tuple in its own structure, partly for some efficiency gain, but above all to define standard mathematical operations on the gradients without doing "type piracy" with respect to Base tuples.
"""
mutable struct Learnable
mutable struct Learnable{ET}
#data::Union{Tuple{Vararg{Array{Float64,N} where N}},Vector{Tuple{Vararg{Array{Float64,N} where N}}}}
data::Tuple{Vararg{Array{Float64,N} where N}}
data::Tuple{Vararg{Array{ET,N} where N}}
function Learnable(data)
return new(data)
if data == ()
return new{Float64}(data)
else
return new{eltype(eltype(data))}(data)
end
end
end
function +(items::Learnable...)
@@ -354,7 +358,7 @@ function predict(nn::NN,x)
lastlayer_size = size(nn.layers[end])[2]
length(lastlayer_size) == 1 || error("The last NN layer should always be a single dimension vector. Eventually use `ReshaperLayer` to reshape its output as a vector.")
d = lastlayer_size[1]
out = zeros(n,d)
out = zeros(eltype(x),n,d)
for i in 1:size(x)[1]
values = selectdim(x,1,i) # x[i,:]
for l in nn.layers
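
For reference, a brief sketch of what the parametric Learnable above implies, illustrative and untested; it assumes the constructor shown in the hunk and that the type is reachable as BetaML.Nn.Learnable, which this diff does not confirm:

import BetaML

g32 = BetaML.Nn.Learnable((rand(Float32, 3, 2), zeros(Float32, 3)))  # e.g. a weight and a bias gradient
typeof(g32)    # expected Learnable{Float32}: ET is inferred from the wrapped arrays
g64 = BetaML.Nn.Learnable((rand(3, 2), zeros(3)))
typeof(g64)    # expected Learnable{Float64}
# Likewise, predict now allocates its output as zeros(eltype(x), n, d), so a
# Float32 input matrix keeps the whole forward pass in Float32 instead of
# silently promoting to Float64.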
