diff --git a/README.md b/README.md
index 02522166..36a7b2e1 100644
--- a/README.md
+++ b/README.md
@@ -167,7 +167,7 @@ Neural Networks | [Flux.jl](https://fluxml.ai/), [Knet](https://github.com/deni
 Decision Trees | [DecisionTree.jl](https://github.com/bensadeghi/DecisionTree.jl)
 Clustering | [Clustering.jl](https://github.com/JuliaStats/Clustering.jl), [GaussianMixtures.jl](https://github.com/davidavdav/GaussianMixtures.jl)
 Missing imputation | [Impute.jl](https://github.com/invenia/Impute.jl), [Mice.jl](https://github.com/tom-metherell/Mice.jl)
-Variable importance | [ShapML.jl](https://github.com/nredell/ShapML.jl)
+

 ## TODO
@@ -175,7 +175,7 @@ Variable importance | [ShapML.jl](https://github.com/nredell/ShapML.jl)
 ### Short term

 - Implement autotuning of `GaussianMixtureClusterer` using `BIC` or `AIC`
-- Add Silhouette method to check cluster validity
+- Add Silhouette method to check cluster validity
 - Implement PAM and/or variants for kmedoids

 ### Mid/Long term
@@ -183,7 +183,7 @@ Variable importance | [ShapML.jl](https://github.com/nredell/ShapML.jl)
 - Add RNN support and improve convolutional layers speed
 - Reinforcement learning (Markov decision processes)
 - Standardize data sampling in training
-- Add GPU
+- Convert to GPU

 ## Contribute
diff --git a/announce_autoencoder.txt b/announce_autoencoder.txt
deleted file mode 100644
index b96a94e7..00000000
--- a/announce_autoencoder.txt
+++ /dev/null
@@ -1,21 +0,0 @@
-
-[Ann] Easiest AutoEncoder on Earth: `m=AutoEncoder(); fit!(m,x); x_latent = predict(m,x); xest=inverse_predict(m,x_latent)`
-
-[ANN] An easy-to-use AutoEncoder to reduce the dimensionality of the data
-
-(I hope I'm not breaking the self-promotion rule ;-) )
-Hello, I am pleased to announce one of the easiest-to-use [AutoEncoder](https://sylvaticus.github.io/BetaML.jl/dev/Utils.html#BetaML.Utils.AutoEncoder) models in the world.
-
-No need to implement a neural network yourself, just use:
-- `mod = AutoEncoder([optional stuff])` to create the model
-- `fit!(mod,x)` to fit the model to some tabular data (dims in cols)
-- `x_latent = predict(mod)` or `x_latent = predict(mod,otherx)` to get the data encoded in latent space (usually with far fewer dimensions than the original)
-- `x_decoded = inverse_predict(mod,x_latent)` to get the decoded values.
-
-The user can still specify the number of dimensions in the latent space, the number of neurons in the inner layers, or the full specification of the encoding/decoding layers and the NN training options, but all of this remains optional, as sensible heuristics are applied. The `autotune` method allows these choices to be simplified even further.
-
-`AutoEncoder` is part of v0.10.4 of the [BetaML Machine Learning Toolkit](https://github.com/sylvaticus/BetaML.jl), an open-source set of machine learning models and utilities, and a wrapper should soon be available as part of the MLJ library.
-Although developed in the Julia language, it can be easily accessed from R, Python or any other language with a Julia binding, as specified [here](https://sylvaticus.github.io/BetaML.jl/stable/tutorials/Betaml_tutorial_getting_started.html#using_betaml_from_other_languages).
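For reference, a minimal end-to-end sketch of the four calls listed in the announcement above (the random data and the `encoded_size=2` choice are illustrative assumptions, with `encoded_size` being the hyperparameter used elsewhere in this patch to set the latent dimensions):

```julia
using BetaML

x = rand(100, 5)                    # 100 records with 5 columns (dims in cols)
mod = AutoEncoder(encoded_size=2)   # ask for a 2-dimensional latent space
x_latent  = fit!(mod, x)            # fit the model and return the encoded training data
x_other   = predict(mod, x)         # encode (possibly new) data with the fitted model
x_decoded = inverse_predict(mod, x_latent)  # decode back to the original 5 dimensions
```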
-
-
-
diff --git a/cancellable/# Scratchpad.jl b/cancellable/# Scratchpad.jl
deleted file mode 100644
index 36712612..00000000
--- a/cancellable/# Scratchpad.jl
+++ /dev/null
@@ -1,108 +0,0 @@
-# Scratchpad
-
-x = [0.12 0.31 0.29 3.21 0.21;
-     0.44 1.21 1.18 13.54 0.85
-     0.22 0.61 0.58 6.43 0.42;
-     0.35 0.93 0.91 10.04 0.71;
-     0.51 1.47 1.46 16.12 0.99;
-     0.35 0.93 0.91 10.04 0.71;
-     0.51 1.47 1.46 16.12 0.99;
-     0.22 0.61 0.58 6.43 0.42;
-     0.12 0.31 0.29 3.21 0.21;
-     0.44 1.21 1.18 13.54 0.85];
-m = AutoEncoder(encoded_size=2,layers_size=15,epochs=400,autotune=false,rng=copy(TESTRNG))
-x_reduced = fit!(m,x)
-x̂ = inverse_predict(m,x_reduced)
-x̂sum = sum(x̂)
-
-x = vcat(rand(copy(TESTRNG),0:0.001:0.6,30,5), rand(copy(TESTRNG),0.4:0.001:1,30,5))
-m = AutoEncoder(rng=copy(TESTRNG), verbosity=NONE)
-x_reduced = fit!(m,x)
-x̂ = inverse_predict(m,x_reduced)
-x̂sum = sum(x̂)
-
-l2loss_by_cv2(AutoEncoder(rng=copy(TESTRNG), verbosity=NONE),(x,),rng=copy(TESTRNG))
-
-m = AutoEncoder(rng=copy(TESTRNG), verbosity=NONE)
-sampler = KFold(nsplits=5,nrepeats=1,rng=copy(TESTRNG))
-(μ,σ) = cross_validation([x],sampler) do trainData,valData,rng
-    (xtrain,) = trainData; (xval,) = valData
-    fit!(m,xtrain)
-    x̂val_red = predict(m,xval)
-    x̂val = inverse_predict(m,x̂val_red)
-    ϵ = norm(xval .- x̂val)/size(xval,1)
-    println(ϵ) # different
-    reset!(m)
-    return ismissing(ϵ) ? Inf : ϵ
-end
-
-function l2loss_by_cv2(m,data;nsplits=5,nrepeats=1,rng=Random.GLOBAL_RNG)
-    x = data[1]
-    sampler = KFold(nsplits=nsplits,nrepeats=nrepeats,rng=rng)
-    (μ,σ) = cross_validation([x],sampler) do trainData,valData,rng
-        (xtrain,) = trainData; (xval,) = valData
-        fit!(m,xtrain)
-        x̂val_red = predict(m,xval)          # encode the validation data first...
-        x̂val = inverse_predict(m,x̂val_red)  # ...then decode it back
-        ϵ = norm(xval .- x̂val)/size(xval,1)
-        reset!(m)
-        return ismissing(ϵ) ? Inf : ϵ
-    end
-    return μ
-end
-
-
-
-using Random, Pipe, HTTP, CSV, DataFrames, Plots, BetaML
-import Distributions: Normal, quantile
-Random.seed!(123)
-
-# We download the Boston house prices dataset from the internet and split it into x and y
-dataURL = "https://archive.ics.uci.edu/ml/machine-learning-databases/housing/housing.data"
-data = @pipe HTTP.get(dataURL).body |> CSV.File(_, delim=' ', header=false, ignorerepeated=true) |> DataFrame
-
-data = CSV.File(joinpath("docs","src","tutorials","Feature importance", "data","housing.data"), delim=' ', header=false, ignorerepeated=true) |> DataFrame
-
-var_names = [
-  "CRIM",    # per capita crime rate by town
-  "ZN",      # proportion of residential land zoned for lots over 25,000 sq.ft.
- "INDUS", # proportion of non-retail business acres per town - "CHAS", # Charles River dummy variable (= 1 if tract bounds river; 0 otherwise) - "NOX", # nitric oxides concentration (parts per 10 million) - "RM", # average number of rooms per dwelling - "AGE", # proportion of owner-occupied units built prior to 1940 - "DIS", # weighted distances to five Boston employment centres - "RAD", # index of accessibility to radial highways - "TAX", # full-value property-tax rate per $10,000 - "PTRATIO", # pupil-teacher ratio by town - "B", # 1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town - "LSTAT", # % lower status of the population -] -y_name = "MEDV" # Median value of owner-occupied homes in $1000's - -# Our features are a set of 13 explanatory variables, while the label that we want to estimate is the average housing prices: -x = Matrix(data[:,1:13]) -y = data[:,14] - -# We use a Random Forest model as regressor and we compute the variable importance for this model : -fr = FeatureRanker(model=RandomForestEstimator(),nsplits=3,nrepeats=2,recursive=false, ignore_dims_keyword="ignore_dims") -rank = fit!(fr,x,y) - -loss_by_col = info(fr)["loss_by_col"] -sobol_by_col = info(fr)["sobol_by_col"] -loss_by_col_sd = info(fr)["loss_by_col_sd"] -sobol_by_col_sd = info(fr)["sobol_by_col_sd"] -loss_fullmodel = info(fr)["loss_all_cols"] -loss_fullmodel_sd = info(fr)["loss_all_cols_sd"] -ntrials_per_metric = info(fr)["ntrials_per_metric"] - -# Finally we can plot the variable importance, first using the loss metric ("mda") and then the sobol one: -bar(var_names[sortperm(loss_by_col)], loss_by_col[sortperm(loss_by_col)],label="Loss by var", permute=(:x,:y), yerror=quantile(Normal(1,0),0.975) .* (loss_by_col_sd[sortperm(loss_by_col)]./sqrt(ntrials_per_metric)), yrange=[0,0.5]) -vline!([loss_fullmodel], label="Loss with all vars",linewidth=2) -vline!([loss_fullmodel-quantile(Normal(1,0),0.975) * loss_fullmodel_sd/sqrt(ntrials_per_metric), - loss_fullmodel+quantile(Normal(1,0),0.975) * loss_fullmodel_sd/sqrt(ntrials_per_metric), -], label=nothing,linecolor=:black,linestyle=:dot,linewidth=1) -savefig("loss_by_var.png") -#- -bar(var_names[sortperm(sobol_by_col)],sobol_by_col[sortperm(sobol_by_col)],label="Sobol index by col", permute=(:x,:y), yerror=quantile(Normal(1,0),0.975) .* (sobol_by_col_sd[sortperm(sobol_by_col)]./sqrt(ntrials_per_metric)), yrange=[0,0.4]) -savefig("sobol_ny_var.png") -# As we can see, the two analyses agree on the most important variables, showing that the size of the house (number of rooms), the percentage of low-income population in the neighbourhood and, to a lesser extent, the distance to employment centres are the most important explanatory variables of house price in the Boston area. 
\ No newline at end of file
diff --git a/cancellable/Untitled-2.jl b/cancellable/Untitled-2.jl
deleted file mode 100644
index 06884223..00000000
--- a/cancellable/Untitled-2.jl
+++ /dev/null
@@ -1,97 +0,0 @@
-
-using Random, LinearAlgebra, Plots
-Random.seed!(123)
-# Synthetic data generation
-# x1: high importance, x2: little importance, x3: mixed effects with x1, x4: highly correlated with x1 but no effects on Y, x5 and x6: no effects on Y
-TEMPRNG = copy(Random.GLOBAL_RNG)
-N = 2000
-D = 6
-nAttempts = 30
-xa = rand(TEMPRNG,0:0.0001:10,N,3)
-xb = (xa[:,1] .* 0.5 .* rand(TEMPRNG,0.7:0.001:1.3)) .+ 10
-xc = rand(TEMPRNG,0:0.0001:10,N,D-4)
-x = hcat(xa,xb,xc)
-y = [10*r[1]-r[2]+0.1*r[3]*r[1] for r in eachrow(x) ]
-((xtrain,xtest),(ytrain,ytest)) = BetaML.partition([x,y],[0.8,0.2],rng=TEMPRNG)
-
-
-# full cols model:
-m = RandomForestEstimator(n_trees=100,rng=TEMPRNG)
-m = DecisionTreeEstimator(rng=TEMPRNG)
-m = NeuralNetworkEstimator(verbosity=NONE,rng=TEMPRNG)
-fit!(m,xtrain,ytrain)
-ŷtrain = predict(m,xtrain)
-loss = norm(ytrain-ŷtrain)/length(ytrain) # this is good
-
-ŷtest = predict(m,xtest)
-loss = norm(ytest-ŷtest)/length(ytest) # this is good
-
-loss_by_cols = zeros(D)
-sobol_by_cols = zeros(D)
-loss_by_cols2 = zeros(D)
-sobol_by_cols2 = zeros(D)
-diffest_bycols = zeros(D)
-loss_by_cols_test = zeros(D)
-sobol_by_cols_test = zeros(D)
-loss_by_cols2_test = zeros(D)
-sobol_by_cols2_test = zeros(D)
-diffest_bycols_test = zeros(D)
-for a in 1:nAttempts
-    println("Running attempt $a...")
-    for d in 1:D
-        println("- doing modelling without dimension $d ....")
-        xd_train = hcat(xtrain[:,1:d-1],shuffle(TEMPRNG,xtrain[:,d]),xtrain[:,d+1:end])
-        xd_test = hcat(xtest[:,1:d-1],shuffle(TEMPRNG,xtest[:,d]),xtest[:,d+1:end])
-        #md = RandomForestEstimator(n_trees=100,rng=TEMPRNG)
-        #md = DecisionTreeEstimator(rng=TEMPRNG)
-        md = NeuralNetworkEstimator(verbosity=NONE,rng=TEMPRNG)
-        fit!(md,xd_train,ytrain)
-        ŷdtrain = predict(md,xd_train)
-        #ŷdtrain2 = predict(m,xtrain,ignore_dims=d)
-        ŷdtest = predict(md,xd_test)
-        #ŷdtest2 = predict(m,xtest,ignore_dims=d)
-        if a == 1
-            loss_by_cols[d] = norm(ytrain-ŷdtrain)/length(ytrain)
-            sobol_by_cols[d] = sobol_index(ŷtrain,ŷdtrain)
-            #loss_by_cols2[d] = norm(ytrain-ŷdtrain2)/length(ytrain)
-            #sobol_by_cols2[d] = sobol_index(ŷtrain,ŷdtrain2)
-            #diffest_bycols[d] = norm(ŷdtrain-ŷdtrain2)/length(ytrain)
-            loss_by_cols_test[d] = norm(ytest-ŷdtest)/length(ytest)
-            sobol_by_cols_test[d] = sobol_index(ŷtest,ŷdtest)
-            #loss_by_cols2_test[d] = norm(ytest-ŷdtest2)/length(ytest)
-            #sobol_by_cols2_test[d] = sobol_index(ŷtest,ŷdtest2)
-            #diffest_bycols_test[d] = norm(ŷdtest-ŷdtest2)/length(ytest)
-        else
-            loss_by_cols[d] = online_mean(norm(ytrain-ŷdtrain)/length(ytrain); mean=loss_by_cols[d],n=a-1)
-            sobol_by_cols[d] = online_mean(sobol_index(ŷtrain,ŷdtrain) ; mean=sobol_by_cols[d],n=a-1)
-            #loss_by_cols2[d] = online_mean(norm(ytrain-ŷdtrain2)/length(ytrain); mean=loss_by_cols2[d],n=a-1)
-            #sobol_by_cols2[d] = online_mean(sobol_index(ŷtrain,ŷdtrain2) ; mean=sobol_by_cols2[d],n=a-1)
-            #diffest_bycols[d] = online_mean(norm(ŷdtrain-ŷdtrain2)/length(ytrain); mean=diffest_bycols[d],n=a-1)
-            loss_by_cols_test[d] = online_mean(norm(ytest-ŷdtest)/length(ytest); mean=loss_by_cols_test[d],n=a-1)
-            sobol_by_cols_test[d] = online_mean(sobol_index(ŷtest,ŷdtest) ; mean=sobol_by_cols_test[d],n=a-1)
-            #loss_by_cols2_test[d] = online_mean(norm(ytest-ŷdtest2)/length(ytest); mean=loss_by_cols2_test[d],n=a-1)
-            #sobol_by_cols2_test[d] = online_mean(sobol_index(ŷtest,ŷdtest2) ;
mean=sobol_by_cols2_test[d],n=a-1)
-            #diffest_bycols_test[d] = online_mean(norm(ŷdtest-ŷdtest2)/length(ytest); mean=diffest_bycols_test[d],n=a-1)
-        end
-    end
-end
-# Expected order: ~ [{5,6,4},{3,2},1] good
-#                 ~ [{5,6},{4,3,2,1}] still good, but we don't see the correlation
-bar(string.(sortperm(loss_by_cols)),loss_by_cols[sortperm(loss_by_cols)],label="loss_by_cols train")
-bar(string.(sortperm(sobol_by_cols)),sobol_by_cols[sortperm(sobol_by_cols)],label="sobol_by_cols train")
-bar(string.(sortperm(loss_by_cols2)),loss_by_cols2[sortperm(loss_by_cols2)],label="loss_by_cols2 train")
-bar(string.(sortperm(sobol_by_cols2)),sobol_by_cols2[sortperm(sobol_by_cols2)],label="sobol_by_cols2 train")
-bar(string.(sortperm(loss_by_cols_test)),loss_by_cols_test[sortperm(loss_by_cols_test)],label="loss_by_cols test")
-bar(string.(sortperm(sobol_by_cols_test)),sobol_by_cols_test[sortperm(sobol_by_cols_test)],label="sobol_by_cols test")
-bar(string.(sortperm(loss_by_cols2_test)),loss_by_cols2_test[sortperm(loss_by_cols2_test)],label="loss_by_cols2 test")
-bar(string.(sortperm(sobol_by_cols2_test)),sobol_by_cols2_test[sortperm(sobol_by_cols2_test)],label="sobol_by_cols2 test")
-
-
-
-
-d = 5
-xd_train = hcat(xtrain[:,1:d-1],shuffle(xtrain[:,d]),xtrain[:,d+1:end])
-md = RandomForestEstimator(n_trees=50)
-fit!(md,xd_train,ytrain)
-ŷdtrain = predict(md,xd_train)
-loss_d = norm(ytrain-ŷdtrain)/length(ytrain)
\ No newline at end of file
diff --git a/cancellable/Untitled-3.jl b/cancellable/Untitled-3.jl
deleted file mode 100644
index be4e4a68..00000000
--- a/cancellable/Untitled-3.jl
+++ /dev/null
@@ -1,79 +0,0 @@
-
-# Synthetic data generation
-# x1: high importance, x2: little importance, x3: mixed effects with x1, x4: highly correlated with x1 but no effects on Y, x5 and x6: no effects on Y
-using Random
-TESTRNG = Random.GLOBAL_RNG
-N = 2000
-D = 6
-nAttempts = 10
-xa = rand(copy(TESTRNG),0:0.0001:10,N,3)
-xb = (xa[:,1] .* 0.5 .* rand(0.8:0.001:1.2)) .+ 10
-xc = rand(copy(TESTRNG),0:0.0001:10,N,D-4)
-x = hcat(xa,xb,xc)
-y = [10*r[1]-r[2]+0.05*r[3]*r[1] for r in eachrow(x) ]
-((xtrain,xtest),(ytrain,ytest)) = BetaML.partition([x,y],[0.8,0.2],rng=copy(TESTRNG))
-
-
-# full cols model:
-m = RandomForestEstimator(n_trees=50)
-fit!(m,xtrain,ytrain)
-ŷtrain = predict(m,xtrain)
-loss = norm(ytrain-ŷtrain)/length(ytrain) # this is good
-
-ŷtest = predict(m,xtest)
-loss = norm(ytest-ŷtest)/length(ytest) # this is good
-
-loss_by_cols = zeros(D)
-sobol_by_cols = zeros(D)
-loss_by_cols2 = zeros(D)
-sobol_by_cols2 = zeros(D)
-diffest_bycols = zeros(D)
-for a in 1:nAttempts
-    println("Running attempt $a...")
-    for d in 1:D
-        println("- doing modelling without dimension $d ....")
-        xd_train = hcat(xtrain[:,1:d-1],shuffle(xtrain[:,d]),xtrain[:,d+1:end])
-        xd_test = hcat(xtest[:,1:d-1],shuffle(xtest[:,d]),xtest[:,d+1:end])
-        md = RandomForestEstimator(n_trees=50)
-        fit!(md,xd_train,ytrain)
-        ŷdtrain = predict(md,xd_train)
-        ŷdtrain2 = predict(m,xtrain,ignore_dims=d)
-        ŷdtest = predict(md,xd_test)
-        ŷdtest2 = predict(m,xtest,ignore_dims=d)
-        if a == 1
-            #=
-            loss_by_cols[d] = norm(ytest-ŷdtest)/length(ytest)
-            sobol_by_cols[d] = sobol_index(ŷtest,ŷdtest)
-            loss_by_cols2[d] = norm(ytest-ŷdtest2)/length(ytest)
-            sobol_by_cols2[d] = sobol_index(ŷtest,ŷdtest2)
-            diffest_bycols[d] = norm(ŷdtest-ŷdtest2)/length(ytest)
-            =#
-            loss_by_cols[d] = norm(ytrain-ŷdtrain)/length(ytrain)
-            sobol_by_cols[d] = sobol_index(ŷtrain,ŷdtrain)
-            loss_by_cols2[d] = norm(ytrain-ŷdtrain2)/length(ytrain)
-            sobol_by_cols2[d] = sobol_index(ŷtrain,ŷdtrain2)
-            diffest_bycols[d] = norm(ŷdtrain-ŷdtrain2)/length(ytrain)
-        else
-            #=
-            loss_by_cols[d] = online_mean(norm(ytest-ŷdtest)/length(ytest); mean=loss_by_cols[d],n=a-1)
-            sobol_by_cols[d] = online_mean(sobol_index(ŷtest,ŷdtest) ; mean=sobol_by_cols[d],n=a-1)
-            loss_by_cols2[d] = online_mean(norm(ytest-ŷdtest2)/length(ytest); mean=loss_by_cols2[d],n=a-1)
-            sobol_by_cols2[d] = online_mean(sobol_index(ŷtest,ŷdtest2) ; mean=sobol_by_cols2[d],n=a-1)
-            diffest_bycols[d] = online_mean(norm(ŷdtest-ŷdtest2)/length(ytest); mean=diffest_bycols[d],n=a-1)
-            =#
-            loss_by_cols[d] = online_mean(norm(ytrain-ŷdtrain)/length(ytrain); mean=loss_by_cols[d],n=a-1)
-            sobol_by_cols[d] = online_mean(sobol_index(ŷtrain,ŷdtrain) ; mean=sobol_by_cols[d],n=a-1)
-            loss_by_cols2[d] = online_mean(norm(ytrain-ŷdtrain2)/length(ytrain); mean=loss_by_cols2[d],n=a-1)
-            sobol_by_cols2[d] = online_mean(sobol_index(ŷtrain,ŷdtrain2) ; mean=sobol_by_cols2[d],n=a-1)
-            diffest_bycols[d] = online_mean(norm(ŷdtrain-ŷdtrain2)/length(ytrain); mean=diffest_bycols[d],n=a-1)
-        end
-    end
-end
-# Expected order: ~ [5,6,4,3,2,1]
-
-d = 5
-xd_train = hcat(xtrain[:,1:d-1],shuffle(xtrain[:,d]),xtrain[:,d+1:end])
-md = RandomForestEstimator(n_trees=50)
-fit!(md,xd_train,ytrain)
-ŷdtrain = predict(md,xd_train)
-loss_d = norm(ytrain-ŷdtrain)/length(ytrain)
\ No newline at end of file
diff --git a/cancellable/announce.txt b/cancellable/announce.txt
deleted file mode 100644
index ddff3bef..00000000
--- a/cancellable/announce.txt
+++ /dev/null
@@ -1,85 +0,0 @@
-@JuliaRegistrator register
-
-Release notes:
-
-- Added `FeatureRanker`, a flexible feature ranking estimator using multiple feature importance metrics
-- Added the new functions `kl_divergence` and `sobol_index`
-- Added an option to RF/DT models to ignore specific variables in prediction by following _both_ branches of the splits occurring on those dimensions, exposed as the keyword `ignore_dims` of the `predict` function
-- Added the option `sampling_share` to the `RandomForestEstimator` model
-- DOC: added Benchmarks (but then temporarily removed, because SystemBenchmark is currently not installable, see [this issue](https://github.com/IanButterworth/SystemBenchmark.jl/issues/64) )
-- DOC: added a `FeatureRanker` tutorial
-- Bugfix on `l2loss_by_cv` for unsupervised models
-
-
-[Announce]: FeatureRanking: learn which variables contribute the most to the estimation of black-box models
-
-I am pleased to announce the availability of [`FeatureRanker`](https://sylvaticus.github.io/BetaML.jl/dev/Utils.html#BetaML.Utils.FeatureRanker), a simple yet flexible feature ranking estimator where different metrics can be used to estimate the importance of individual variables.
-
-Key features:
-- Choose between loss-based or variance-based (Sobol indices) metrics.
-- Choose between _permute and relearn_ or _permute only_ strategies, or exploit the ability of some models (typically tree-based) to "ignore" variables at prediction time.
-- Choose whether to generate the rank in a single stage (a single loop) or recursively, where at each stage the least important variable is "removed".
-- Choose the number of splits, and possibly of repetitions, of the cross-validation used internally to produce the rank.
-- Works with any estimator model (not just from the BetaML suite) that can be wrapped in a BetaML-like API (`m=ModelName(hyperparameters...); fit_function(m,x,y); predict_function(m,x)`, where `fit_function` and `predict_function` can be specified in the `FeatureRanker` options), as sketched below.
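A minimal sketch of that last point, assuming `fit_function` and `predict_function` are the keyword names implied by the description above (here they simply point back to BetaML's own functions, standing in for those of any third-party estimator):

```julia
using BetaML

# Any estimator following the `m = Model(...); fit_function(m,x,y); predict_function(m,x)`
# pattern can be ranked; we pass the fit/predict functions explicitly, exactly
# where a third-party model's functions would go.
fr = FeatureRanker(model            = RandomForestEstimator(),
                   fit_function     = BetaML.fit!,     # assumed keyword, per the text above
                   predict_function = BetaML.predict,  # assumed keyword, per the text above
                   nsplits          = 3)
rank = fit!(fr, x, y)  # x, y as in the Boston example below
```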
-
-In the following example, we estimate the importance of different variables in predicting house prices using the Boston dataset:
-
-```julia
-# Loading packages...
-using Random, Pipe, HTTP, CSV, DataFrames, Plots, BetaML
-import Distributions: Normal, quantile
-Random.seed!(123)
-
-# We download the Boston house prices dataset from the internet and split it into x and y
-dataURL = "https://archive.ics.uci.edu/ml/machine-learning-databases/housing/housing.data"
-data = @pipe HTTP.get(dataURL).body |> CSV.File(_, delim=' ', header=false, ignorerepeated=true) |> DataFrame
-
-var_names = [
-  "CRIM",    # per capita crime rate by town
-  "ZN",      # proportion of residential land zoned for lots over 25,000 sq.ft.
-  "INDUS",   # proportion of non-retail business acres per town
-  "CHAS",    # Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
-  "NOX",     # nitric oxides concentration (parts per 10 million)
-  "RM",      # average number of rooms per dwelling
-  "AGE",     # proportion of owner-occupied units built prior to 1940
-  "DIS",     # weighted distances to five Boston employment centres
-  "RAD",     # index of accessibility to radial highways
-  "TAX",     # full-value property-tax rate per $10,000
-  "PTRATIO", # pupil-teacher ratio by town
-  "B",       # 1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town
-  "LSTAT",   # % lower status of the population
-]
-y_name = "MEDV" # Median value of owner-occupied homes in $1000's
-
-# Our features are a set of 13 explanatory variables, while the label that we want to estimate is the median house value:
-x = Matrix(data[:,1:13])
-y = data[:,14]
-
-# We use a Random Forest model as regressor and we compute the variable importance for this model:
-fr = FeatureRanker(model=RandomForestEstimator(),nsplits=3,nrepeats=2,recursive=false, ignore_dims_keyword="ignore_dims")
-rank = fit!(fr,x,y)
-
-loss_by_col = info(fr)["loss_by_col"]
-sobol_by_col = info(fr)["sobol_by_col"]
-loss_by_col_sd = info(fr)["loss_by_col_sd"]
-sobol_by_col_sd = info(fr)["sobol_by_col_sd"]
-loss_fullmodel = info(fr)["loss_all_cols"]
-loss_fullmodel_sd = info(fr)["loss_all_cols_sd"]
-ntrials_per_metric = info(fr)["ntrials_per_metric"]
-
-# Finally we can plot the variable importance, first using the loss metric ("mda") and then the Sobol one
-# (the error bars use quantile(Normal(0,1),0.975) ≈ 1.96, the 97.5% quantile of the standard normal):
-bar(var_names[sortperm(loss_by_col)], loss_by_col[sortperm(loss_by_col)],label="Loss by var", permute=(:x,:y), yerror=quantile(Normal(0,1),0.975) .* (loss_by_col_sd[sortperm(loss_by_col)]./sqrt(ntrials_per_metric)), yrange=[0,0.5])
-vline!([loss_fullmodel], label="Loss with all vars",linewidth=2)
-vline!([loss_fullmodel-quantile(Normal(0,1),0.975) * loss_fullmodel_sd/sqrt(ntrials_per_metric),
-        loss_fullmodel+quantile(Normal(0,1),0.975) * loss_fullmodel_sd/sqrt(ntrials_per_metric),
-], label=nothing,linecolor=:black,linestyle=:dot,linewidth=1)
-
-bar(var_names[sortperm(sobol_by_col)],sobol_by_col[sortperm(sobol_by_col)],label="Sobol index by col", permute=(:x,:y), yerror=quantile(Normal(0,1),0.975) .* (sobol_by_col_sd[sortperm(sobol_by_col)]./sqrt(ntrials_per_metric)), yrange=[0,0.4])
-```
-
-As we can see, the two analyses agree on the most important variables, showing that the size of the house (number of rooms), the percentage of low-income population in the neighbourhood and, to a lesser extent, the distance to employment centres are the most important variables for the estimation of house prices in the Boston area.
-
-`FeatureRanker` is shipped with the [Beta Machine Learning Toolkit (BetaML.jl)](https://github.com/sylvaticus/BetaML.jl) v0.12.
A tutorial is available [here](https://sylvaticus.github.io/BetaML.jl/dev/tutorials/Feature%20importance/Feature_importance.html). - - - diff --git a/cancellable/using Statistics.jl b/cancellable/using Statistics.jl deleted file mode 100644 index 711cfa77..00000000 --- a/cancellable/using Statistics.jl +++ /dev/null @@ -1,29 +0,0 @@ -using Statistics - -scalars = [1.0,1.1,1.2] -vectors = [[1.0,10.0],[1.1,11],[1.2,12]] -vofv = [[[1.0,10.0],[100,1000]], [[1.1,11],[111,1111]],[[1.2,12],[122,1222]]] - -mean(scalars) -std(scalars) -mean(vectors) -std(vectors) -mean(vofv) -std.(vofv) - - -mean([[1.1,3.1],[1.3,3.3]]) - -using DataFrames, Plots - -df = DataFrame(group=["A", "B", "C"], total=[7.7, 4.6, 5.1], std_error = [0.04, 0.05, 0.06]) - -bar(df.group, df.total, c=:blues, lw=0, widen=false) -plot!(1/2:(ncol(df)-1/2), df.total, lw=0, yerror=20*df.std_error, ms=10) - -group=["A", "B", "C"] -total=[7.7, 4.6, 5.1] -std_error = [0.04, 0.05, 0.06] - -bar(group, total, c=:blues, lw=0, widen=false) -plot!(1/2:(3-1/2), total, lw=0, yerror=20*std_error, ms=10) diff --git a/docs/Manifest_old.toml b/docs/Manifest_old.toml deleted file mode 100644 index 7f45716d..00000000 --- a/docs/Manifest_old.toml +++ /dev/null @@ -1,2327 +0,0 @@ -# This file is machine-generated - editing it directly is not advised - -julia_version = "1.10.0" -manifest_format = "2.0" -project_hash = "4babdefef2fbb529ff83e45595ee20526dac2b29" - -[[deps.ANSIColoredPrinters]] -git-tree-sha1 = "574baf8110975760d391c710b6341da1afa48d8c" -uuid = "a4c015fc-c6ff-483c-b24f-f7ea428134e9" -version = "0.0.1" - -[[deps.AbstractFFTs]] -deps = ["LinearAlgebra"] -git-tree-sha1 = "d92ad398961a3ed262d8bf04a1a2b8340f915fef" -uuid = "621f4979-c628-5d54-868e-fcf4e3e8185c" -version = "1.5.0" -weakdeps = ["ChainRulesCore", "Test"] - - [deps.AbstractFFTs.extensions] - AbstractFFTsChainRulesCoreExt = "ChainRulesCore" - AbstractFFTsTestExt = "Test" - -[[deps.AbstractTrees]] -git-tree-sha1 = "2d9c9a55f9c93e8887ad391fbae72f8ef55e1177" -uuid = "1520ce14-60c1-5f80-bbc7-55ef81b5835c" -version = "0.4.5" - -[[deps.Adapt]] -deps = ["LinearAlgebra", "Requires"] -git-tree-sha1 = "cde29ddf7e5726c9fb511f340244ea3481267608" -uuid = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" -version = "3.7.2" -weakdeps = ["StaticArrays"] - - [deps.Adapt.extensions] - AdaptStaticArraysExt = "StaticArrays" - -[[deps.ArgCheck]] -git-tree-sha1 = "a3a402a35a2f7e0b87828ccabbd5ebfbebe356b4" -uuid = "dce04be8-c92d-5529-be00-80e4d2c0e197" -version = "2.3.0" - -[[deps.ArgTools]] -uuid = "0dad84c5-d112-42e6-8d28-ef12dabb789f" -version = "1.1.1" - -[[deps.Arpack]] -deps = ["Arpack_jll", "Libdl", "LinearAlgebra", "Logging"] -git-tree-sha1 = "9b9b347613394885fd1c8c7729bfc60528faa436" -uuid = "7d9fca2a-8960-54d3-9f78-7d1dccf2cb97" -version = "0.5.4" - -[[deps.Arpack_jll]] -deps = ["Artifacts", "CompilerSupportLibraries_jll", "JLLWrappers", "Libdl", "OpenBLAS_jll", "Pkg"] -git-tree-sha1 = "5ba6c757e8feccf03a1554dfaf3e26b3cfc7fd5e" -uuid = "68821587-b530-5797-8361-c406ea357684" -version = "3.5.1+1" - -[[deps.ArrayInterface]] -deps = ["Adapt", "LinearAlgebra", "Requires", "SparseArrays", "SuiteSparse"] -git-tree-sha1 = "c5aeb516a84459e0318a02507d2261edad97eb75" -uuid = "4fba245c-0d91-5ea0-9b3e-6abc04ee57a9" -version = "7.7.1" - - [deps.ArrayInterface.extensions] - ArrayInterfaceBandedMatricesExt = "BandedMatrices" - ArrayInterfaceBlockBandedMatricesExt = "BlockBandedMatrices" - ArrayInterfaceCUDAExt = "CUDA" - ArrayInterfaceGPUArraysCoreExt = "GPUArraysCore" - ArrayInterfaceStaticArraysCoreExt = 
"StaticArraysCore" - ArrayInterfaceTrackerExt = "Tracker" - - [deps.ArrayInterface.weakdeps] - BandedMatrices = "aae01518-5342-5314-be14-df237901396f" - BlockBandedMatrices = "ffab5731-97b5-5995-9138-79e8c1846df0" - CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" - GPUArraysCore = "46192b85-c4d5-4398-a991-12ede77f4527" - StaticArraysCore = "1e83bf80-4336-4d27-bf5d-d5a4f845583c" - Tracker = "9f7883ad-71c0-57eb-9f7f-b5c9e6d3789c" - -[[deps.Artifacts]] -uuid = "56f22d72-fd6d-98f1-02f0-08ddc0907c33" - -[[deps.Atomix]] -deps = ["UnsafeAtomics"] -git-tree-sha1 = "c06a868224ecba914baa6942988e2f2aade419be" -uuid = "a9b6321e-bd34-4604-b9c9-b65b8de01458" -version = "0.1.0" - -[[deps.BFloat16s]] -deps = ["LinearAlgebra", "Printf", "Random", "Test"] -git-tree-sha1 = "a598ecb0d717092b5539dbbe890c98bac842b072" -uuid = "ab4f0b2a-ad5b-11e8-123f-65d77653426b" -version = "0.2.0" - -[[deps.BSON]] -git-tree-sha1 = "4c3e506685c527ac6a54ccc0c8c76fd6f91b42fb" -uuid = "fbb218c0-5317-5bc6-957e-2ee96dd4b1f0" -version = "0.3.9" - -[[deps.BangBang]] -deps = ["Compat", "ConstructionBase", "InitialValues", "LinearAlgebra", "Requires", "Setfield", "Tables"] -git-tree-sha1 = "7aa7ad1682f3d5754e3491bb59b8103cae28e3a3" -uuid = "198e06fe-97b7-11e9-32a5-e1d131e6ad66" -version = "0.3.40" - - [deps.BangBang.extensions] - BangBangChainRulesCoreExt = "ChainRulesCore" - BangBangDataFramesExt = "DataFrames" - BangBangStaticArraysExt = "StaticArrays" - BangBangStructArraysExt = "StructArrays" - BangBangTypedTablesExt = "TypedTables" - - [deps.BangBang.weakdeps] - ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4" - DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" - StaticArrays = "90137ffa-7385-5640-81b9-e52037218182" - StructArrays = "09ab397b-f2b6-538f-b94a-2f83cf4a842a" - TypedTables = "9d95f2ec-7b3d-5a63-8d20-e2491e220bb9" - -[[deps.Base64]] -uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f" - -[[deps.Baselet]] -git-tree-sha1 = "aebf55e6d7795e02ca500a689d326ac979aaf89e" -uuid = "9718e550-a3fa-408a-8086-8db961cd8217" -version = "0.1.1" - -[[deps.BenchmarkTools]] -deps = ["JSON", "Logging", "Printf", "Profile", "Statistics", "UUIDs"] -git-tree-sha1 = "f1dff6729bc61f4d49e140da1af55dcd1ac97b2f" -uuid = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf" -version = "1.5.0" - -[[deps.BetaML]] -deps = ["AbstractTrees", "CategoricalArrays", "Combinatorics", "DelimitedFiles", "Distributions", "DocStringExtensions", "ForceImport", "JLD2", "LinearAlgebra", "LoopVectorization", "MLJModelInterface", "PDMats", "PrecompileTools", "Printf", "ProgressMeter", "Random", "Reexport", "StableRNGs", "StaticArrays", "Statistics", "StatsBase", "Test", "Zygote"] -path = "/home/lobianco/.julia/dev/BetaML" -uuid = "024491cd-cc6b-443e-8034-08ea7eb7db2b" -version = "0.11.4" - -[[deps.BinDeps]] -deps = ["Libdl", "Pkg", "SHA", "URIParser", "Unicode"] -git-tree-sha1 = "1289b57e8cf019aede076edab0587eb9644175bd" -uuid = "9e28174c-4ba2-5203-b857-d8d62c4213ee" -version = "1.0.2" - -[[deps.BinaryProvider]] -deps = ["Libdl", "Logging", "SHA"] -git-tree-sha1 = "ecdec412a9abc8db54c0efc5548c64dfce072058" -uuid = "b99e7846-7c00-51b0-8f62-c81ae34c0232" -version = "0.5.10" - -[[deps.BitTwiddlingConvenienceFunctions]] -deps = ["Static"] -git-tree-sha1 = "0c5f81f47bbbcf4aea7b2959135713459170798b" -uuid = "62783981-4cbd-42fc-bca8-16325de8dc4b" -version = "0.1.5" - -[[deps.BufferedStreams]] -git-tree-sha1 = "4ae47f9a4b1dc19897d3743ff13685925c5202ec" -uuid = "e1450e63-4bb3-523b-b2a4-4ffa8c0fd77d" -version = "1.2.1" - -[[deps.Bzip2_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", 
"Pkg"] -git-tree-sha1 = "9e2a6b69137e6969bab0152632dcb3bc108c8bdd" -uuid = "6e34b625-4abd-537c-b88f-471c36dfa7a0" -version = "1.0.8+1" - -[[deps.CEnum]] -git-tree-sha1 = "eb4cb44a499229b3b8426dcfb5dd85333951ff90" -uuid = "fa961155-64e5-5f13-b03f-caf6b980ea82" -version = "0.4.2" - -[[deps.CPUSummary]] -deps = ["CpuId", "IfElse", "PrecompileTools", "Static"] -git-tree-sha1 = "601f7e7b3d36f18790e2caf83a882d88e9b71ff1" -uuid = "2a0fbf3d-bb9c-48f3-b0a9-814d99fd7ab9" -version = "0.2.4" - -[[deps.CSV]] -deps = ["Dates", "Mmap", "Parsers", "PooledArrays", "SentinelArrays", "Tables", "Unicode"] -git-tree-sha1 = "b83aa3f513be680454437a0eee21001607e5d983" -uuid = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b" -version = "0.8.5" - -[[deps.CUDA]] -deps = ["AbstractFFTs", "Adapt", "BFloat16s", "CEnum", "CompilerSupportLibraries_jll", "ExprTools", "GPUArrays", "GPUCompiler", "LLVM", "LazyArtifacts", "Libdl", "LinearAlgebra", "Logging", "Printf", "Random", "Random123", "RandomNumbers", "Reexport", "Requires", "SparseArrays", "SpecialFunctions", "TimerOutputs"] -git-tree-sha1 = "6717cb9a3425ebb7b31ca4f832823615d175f64a" -uuid = "052768ef-5323-5732-b1bb-66c8b64840ba" -version = "3.13.1" - -[[deps.Cairo_jll]] -deps = ["Artifacts", "Bzip2_jll", "CompilerSupportLibraries_jll", "Fontconfig_jll", "FreeType2_jll", "Glib_jll", "JLLWrappers", "LZO_jll", "Libdl", "Pixman_jll", "Xorg_libXext_jll", "Xorg_libXrender_jll", "Zlib_jll", "libpng_jll"] -git-tree-sha1 = "a4c43f59baa34011e303e76f5c8c91bf58415aaf" -uuid = "83423d85-b0ee-5818-9007-b63ccbeb887a" -version = "1.18.0+1" - -[[deps.Calculus]] -deps = ["LinearAlgebra"] -git-tree-sha1 = "f641eb0a4f00c343bbc32346e1217b86f3ce9dad" -uuid = "49dc2e85-a5d0-5ad3-a950-438e2897f1b9" -version = "0.5.1" - -[[deps.CategoricalArrays]] -deps = ["DataAPI", "Future", "Missings", "Printf", "Requires", "Statistics", "Unicode"] -git-tree-sha1 = "1568b28f91293458345dabba6a5ea3f183250a61" -uuid = "324d7699-5711-5eae-9e2f-1d82baa6b597" -version = "0.10.8" -weakdeps = ["JSON", "RecipesBase", "SentinelArrays", "StructTypes"] - - [deps.CategoricalArrays.extensions] - CategoricalArraysJSONExt = "JSON" - CategoricalArraysRecipesBaseExt = "RecipesBase" - CategoricalArraysSentinelArraysExt = "SentinelArrays" - CategoricalArraysStructTypesExt = "StructTypes" - -[[deps.ChainRules]] -deps = ["Adapt", "ChainRulesCore", "Compat", "Distributed", "GPUArraysCore", "IrrationalConstants", "LinearAlgebra", "Random", "RealDot", "SparseArrays", "SparseInverseSubset", "Statistics", "StructArrays", "SuiteSparse"] -git-tree-sha1 = "4e42872be98fa3343c4f8458cbda8c5c6a6fa97c" -uuid = "082447d4-558c-5d27-93f4-14fc19e9eca2" -version = "1.63.0" - -[[deps.ChainRulesCore]] -deps = ["Compat", "LinearAlgebra"] -git-tree-sha1 = "575cd02e080939a33b6df6c5853d14924c08e35b" -uuid = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4" -version = "1.23.0" -weakdeps = ["SparseArrays"] - - [deps.ChainRulesCore.extensions] - ChainRulesCoreSparseArraysExt = "SparseArrays" - -[[deps.CloseOpenIntervals]] -deps = ["Static", "StaticArrayInterface"] -git-tree-sha1 = "70232f82ffaab9dc52585e0dd043b5e0c6b714f1" -uuid = "fb6a15b2-703c-40df-9091-08a04967cfa9" -version = "0.1.12" - -[[deps.Clustering]] -deps = ["Distances", "LinearAlgebra", "NearestNeighbors", "Printf", "Random", "SparseArrays", "Statistics", "StatsBase"] -git-tree-sha1 = "9ebb045901e9bbf58767a9f34ff89831ed711aae" -uuid = "aaaa29a8-35af-508c-8bc3-b662a17a0fe5" -version = "0.15.7" - -[[deps.CodecZlib]] -deps = ["TranscodingStreams", "Zlib_jll"] -git-tree-sha1 = 
"59939d8a997469ee05c4b4944560a820f9ba0d73" -uuid = "944b1d66-785c-5afd-91f1-9de20f533193" -version = "0.7.4" - -[[deps.ColorSchemes]] -deps = ["ColorTypes", "ColorVectorSpace", "Colors", "FixedPointNumbers", "PrecompileTools", "Random"] -git-tree-sha1 = "67c1f244b991cad9b0aa4b7540fb758c2488b129" -uuid = "35d6a980-a343-548e-a6ea-1d62b119f2f4" -version = "3.24.0" - -[[deps.ColorTypes]] -deps = ["FixedPointNumbers", "Random"] -git-tree-sha1 = "eb7f0f8307f71fac7c606984ea5fb2817275d6e4" -uuid = "3da002f7-5984-5a60-b8a6-cbb66c0b333f" -version = "0.11.4" - -[[deps.ColorVectorSpace]] -deps = ["ColorTypes", "FixedPointNumbers", "LinearAlgebra", "SpecialFunctions", "Statistics", "TensorCore"] -git-tree-sha1 = "600cc5508d66b78aae350f7accdb58763ac18589" -uuid = "c3611d14-8923-5661-9e6a-0046d554d3a4" -version = "0.9.10" - -[[deps.Colors]] -deps = ["ColorTypes", "FixedPointNumbers", "Reexport"] -git-tree-sha1 = "fc08e5930ee9a4e03f84bfb5211cb54e7769758a" -uuid = "5ae59095-9a9b-59fe-a467-6f913c188581" -version = "0.12.10" - -[[deps.Combinatorics]] -git-tree-sha1 = "08c8b6831dc00bfea825826be0bc8336fc369860" -uuid = "861a8166-3701-5b0c-9a16-15d98fcdc6aa" -version = "1.0.2" - -[[deps.CommonSubexpressions]] -deps = ["MacroTools", "Test"] -git-tree-sha1 = "7b8a93dba8af7e3b42fecabf646260105ac373f7" -uuid = "bbf7d656-a473-5ed7-a52c-81e309532950" -version = "0.3.0" - -[[deps.Compat]] -deps = ["TOML", "UUIDs"] -git-tree-sha1 = "c955881e3c981181362ae4088b35995446298b80" -uuid = "34da2185-b29b-5c13-b0c7-acf172513d20" -version = "4.14.0" -weakdeps = ["Dates", "LinearAlgebra"] - - [deps.Compat.extensions] - CompatLinearAlgebraExt = "LinearAlgebra" - -[[deps.CompilerSupportLibraries_jll]] -deps = ["Artifacts", "Libdl"] -uuid = "e66e0078-7015-5450-92f7-15fbd957f2ae" -version = "1.0.5+1" - -[[deps.CompositionsBase]] -git-tree-sha1 = "802bb88cd69dfd1509f6670416bd4434015693ad" -uuid = "a33af91c-f02d-484b-be07-31d278c5ca2b" -version = "0.1.2" - - [deps.CompositionsBase.extensions] - CompositionsBaseInverseFunctionsExt = "InverseFunctions" - - [deps.CompositionsBase.weakdeps] - InverseFunctions = "3587e190-3f89-42d0-90ee-14403ec27112" - -[[deps.ComputationalResources]] -git-tree-sha1 = "52cb3ec90e8a8bea0e62e275ba577ad0f74821f7" -uuid = "ed09eef8-17a6-5b46-8889-db040fac31e3" -version = "0.3.2" - -[[deps.Conda]] -deps = ["Downloads", "JSON", "VersionParsing"] -git-tree-sha1 = "51cab8e982c5b598eea9c8ceaced4b58d9dd37c9" -uuid = "8f4d0f93-b110-5947-807f-2305c1781a2d" -version = "1.10.0" - -[[deps.ConstructionBase]] -deps = ["LinearAlgebra"] -git-tree-sha1 = "c53fc348ca4d40d7b371e71fd52251839080cbc9" -uuid = "187b0558-2788-49d3-abe0-74a17ed4e7c9" -version = "1.5.4" - - [deps.ConstructionBase.extensions] - ConstructionBaseIntervalSetsExt = "IntervalSets" - ConstructionBaseStaticArraysExt = "StaticArrays" - - [deps.ConstructionBase.weakdeps] - IntervalSets = "8197267c-284f-5f27-9208-e0e47529a953" - StaticArrays = "90137ffa-7385-5640-81b9-e52037218182" - -[[deps.ContextVariablesX]] -deps = ["Compat", "Logging", "UUIDs"] -git-tree-sha1 = "25cc3803f1030ab855e383129dcd3dc294e322cc" -uuid = "6add18c4-b38d-439d-96f6-d6bc489c04c5" -version = "0.1.3" - -[[deps.Contour]] -git-tree-sha1 = "d05d9e7b7aedff4e5b51a029dced05cfb6125781" -uuid = "d38c429a-6771-53c6-b99e-75d170b6e991" -version = "0.6.2" - -[[deps.CpuId]] -deps = ["Markdown"] -git-tree-sha1 = "fcbb72b032692610bfbdb15018ac16a36cf2e406" -uuid = "adafc99b-e345-5852-983c-f28acb93d879" -version = "0.3.1" - -[[deps.Crayons]] -git-tree-sha1 = "249fe38abf76d48563e2f4556bebd215aa317e15" -uuid 
= "a8cc5b0e-0ffa-5ad4-8c14-923d3ee1735f" -version = "4.1.1" - -[[deps.DataAPI]] -git-tree-sha1 = "abe83f3a2f1b857aac70ef8b269080af17764bbe" -uuid = "9a962f9c-6df0-11e9-0e5d-c546b8b5ee8a" -version = "1.16.0" - -[[deps.DataDeps]] -deps = ["BinaryProvider", "HTTP", "Libdl", "Reexport", "SHA", "p7zip_jll"] -git-tree-sha1 = "e299d8267135ef2f9c941a764006697082c1e7e8" -uuid = "124859b0-ceae-595e-8997-d05f6a7a8dfe" -version = "0.7.8" - -[[deps.DataFrames]] -deps = ["Compat", "DataAPI", "Future", "InvertedIndices", "IteratorInterfaceExtensions", "LinearAlgebra", "Markdown", "Missings", "PooledArrays", "PrettyTables", "Printf", "REPL", "Reexport", "SortingAlgorithms", "Statistics", "TableTraits", "Tables", "Unicode"] -git-tree-sha1 = "db2a9cb664fcea7836da4b414c3278d71dd602d2" -uuid = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" -version = "1.3.6" - -[[deps.DataStructures]] -deps = ["Compat", "InteractiveUtils", "OrderedCollections"] -git-tree-sha1 = "0f4b5d62a88d8f59003e43c25a8a90de9eb76317" -uuid = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8" -version = "0.18.18" - -[[deps.DataValueInterfaces]] -git-tree-sha1 = "bfc1187b79289637fa0ef6d4436ebdfe6905cbd6" -uuid = "e2d170a0-9d28-54be-80f0-106bbe20a464" -version = "1.0.0" - -[[deps.Dates]] -deps = ["Printf"] -uuid = "ade2ca70-3891-5945-98fb-dc099432e06a" - -[[deps.DecisionTree]] -deps = ["AbstractTrees", "DelimitedFiles", "LinearAlgebra", "Random", "ScikitLearnBase", "Statistics"] -git-tree-sha1 = "526ca14aaaf2d5a0e242f3a8a7966eb9065d7d78" -uuid = "7806a523-6efd-50cb-b5f6-3fa6f1930dbb" -version = "0.12.4" - -[[deps.DefineSingletons]] -git-tree-sha1 = "0fba8b706d0178b4dc7fd44a96a92382c9065c2c" -uuid = "244e2a9f-e319-4986-a169-4d1fe445cd52" -version = "0.1.2" - -[[deps.DelimitedFiles]] -deps = ["Mmap"] -git-tree-sha1 = "9e2f36d3c96a820c678f2f1f1782582fcf685bae" -uuid = "8bb1440f-4735-579b-a4ab-409b98df4dab" -version = "1.9.1" - -[[deps.DiffResults]] -deps = ["StaticArraysCore"] -git-tree-sha1 = "782dd5f4561f5d267313f23853baaaa4c52ea621" -uuid = "163ba53b-c6d8-5494-b064-1a9d43ac40c5" -version = "1.1.0" - -[[deps.DiffRules]] -deps = ["IrrationalConstants", "LogExpFunctions", "NaNMath", "Random", "SpecialFunctions"] -git-tree-sha1 = "23163d55f885173722d1e4cf0f6110cdbaf7e272" -uuid = "b552c78f-8df3-52c6-915a-8e097449b14b" -version = "1.15.1" - -[[deps.Distances]] -deps = ["LinearAlgebra", "Statistics", "StatsAPI"] -git-tree-sha1 = "66c4c81f259586e8f002eacebc177e1fb06363b0" -uuid = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7" -version = "0.10.11" -weakdeps = ["ChainRulesCore", "SparseArrays"] - - [deps.Distances.extensions] - DistancesChainRulesCoreExt = "ChainRulesCore" - DistancesSparseArraysExt = "SparseArrays" - -[[deps.Distributed]] -deps = ["Random", "Serialization", "Sockets"] -uuid = "8ba89e20-285c-5b6f-9357-94700520ee1b" - -[[deps.Distributions]] -deps = ["FillArrays", "LinearAlgebra", "PDMats", "Printf", "QuadGK", "Random", "SpecialFunctions", "Statistics", "StatsAPI", "StatsBase", "StatsFuns"] -git-tree-sha1 = "7c302d7a5fec5214eb8a5a4c466dcf7a51fcf169" -uuid = "31c24e10-a181-5473-b8eb-7969acd0382f" -version = "0.25.107" - - [deps.Distributions.extensions] - DistributionsChainRulesCoreExt = "ChainRulesCore" - DistributionsDensityInterfaceExt = "DensityInterface" - DistributionsTestExt = "Test" - - [deps.Distributions.weakdeps] - ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4" - DensityInterface = "b429d917-457f-4dbc-8f4c-0cc954292b1d" - Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" - -[[deps.DocStringExtensions]] -deps = ["LibGit2"] -git-tree-sha1 = 
"2fb1e02f2b635d0845df5d7c167fec4dd739b00d" -uuid = "ffbed154-4ef7-542d-bbb7-c09d3a79fcae" -version = "0.9.3" - -[[deps.Documenter]] -deps = ["ANSIColoredPrinters", "AbstractTrees", "Base64", "CodecZlib", "Dates", "DocStringExtensions", "Downloads", "Git", "IOCapture", "InteractiveUtils", "JSON", "LibGit2", "Logging", "Markdown", "MarkdownAST", "Pkg", "PrecompileTools", "REPL", "RegistryInstances", "SHA", "TOML", "Test", "Unicode"] -git-tree-sha1 = "4a40af50e8b24333b9ec6892546d9ca5724228eb" -uuid = "e30172f5-a6a5-5a46-863b-614d45cd2de4" -version = "1.3.0" - -[[deps.Downloads]] -deps = ["ArgTools", "FileWatching", "LibCURL", "NetworkOptions"] -uuid = "f43a241f-c20a-4ad4-852c-f6b1247861c6" -version = "1.6.0" - -[[deps.DualNumbers]] -deps = ["Calculus", "NaNMath", "SpecialFunctions"] -git-tree-sha1 = "5837a837389fccf076445fce071c8ddaea35a566" -uuid = "fa6b7ba4-c1ee-5f82-b5fc-ecf0adba8f74" -version = "0.6.8" - -[[deps.EarlyStopping]] -deps = ["Dates", "Statistics"] -git-tree-sha1 = "ea0b56527cefce87419d4b7559811bd96974a6c8" -uuid = "792122b4-ca99-40de-a6bc-6742525f08b6" -version = "0.1.9" - -[[deps.EpollShim_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl"] -git-tree-sha1 = "8e9441ee83492030ace98f9789a654a6d0b1f643" -uuid = "2702e6a9-849d-5ed8-8c21-79e8b8f9ee43" -version = "0.0.20230411+0" - -[[deps.Expat_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl"] -git-tree-sha1 = "4558ab818dcceaab612d1bb8c19cee87eda2b83c" -uuid = "2e619515-83b5-522b-bb60-26c02a35a201" -version = "2.5.0+0" - -[[deps.ExprTools]] -git-tree-sha1 = "27415f162e6028e81c72b82ef756bf321213b6ec" -uuid = "e2ba6199-217a-4e67-a87a-7c52f15ade04" -version = "0.1.10" - -[[deps.FFMPEG]] -deps = ["FFMPEG_jll"] -git-tree-sha1 = "b57e3acbe22f8484b4b5ff66a7499717fe1a9cc8" -uuid = "c87230d0-a227-11e9-1b43-d7ebe4e7570a" -version = "0.4.1" - -[[deps.FFMPEG_jll]] -deps = ["Artifacts", "Bzip2_jll", "FreeType2_jll", "FriBidi_jll", "JLLWrappers", "LAME_jll", "Libdl", "Ogg_jll", "OpenSSL_jll", "Opus_jll", "PCRE2_jll", "Zlib_jll", "libaom_jll", "libass_jll", "libfdk_aac_jll", "libvorbis_jll", "x264_jll", "x265_jll"] -git-tree-sha1 = "466d45dc38e15794ec7d5d63ec03d776a9aff36e" -uuid = "b22a6f82-2f65-5046-a5b2-351ab43fb4e5" -version = "4.4.4+1" - -[[deps.FLoops]] -deps = ["BangBang", "Compat", "FLoopsBase", "InitialValues", "JuliaVariables", "MLStyle", "Serialization", "Setfield", "Transducers"] -git-tree-sha1 = "ffb97765602e3cbe59a0589d237bf07f245a8576" -uuid = "cc61a311-1640-44b5-9fba-1b764f453329" -version = "0.2.1" - -[[deps.FLoopsBase]] -deps = ["ContextVariablesX"] -git-tree-sha1 = "656f7a6859be8673bf1f35da5670246b923964f7" -uuid = "b9860ae5-e623-471e-878b-f6a53c775ea6" -version = "0.1.1" - -[[deps.FileIO]] -deps = ["Pkg", "Requires", "UUIDs"] -git-tree-sha1 = "82d8afa92ecf4b52d78d869f038ebfb881267322" -uuid = "5789e2e9-d7fb-5bc7-8068-2c6fae9b9549" -version = "1.16.3" - -[[deps.FilePathsBase]] -deps = ["Compat", "Dates", "Mmap", "Printf", "Test", "UUIDs"] -git-tree-sha1 = "9f00e42f8d99fdde64d40c8ea5d14269a2e2c1aa" -uuid = "48062228-2e41-5def-b9a4-89aafe57970f" -version = "0.9.21" - -[[deps.FileWatching]] -uuid = "7b1f6079-737a-58dc-b8bc-7a2ca5c1b5ee" - -[[deps.FillArrays]] -deps = ["LinearAlgebra", "Random"] -git-tree-sha1 = "5b93957f6dcd33fc343044af3d48c215be2562f1" -uuid = "1a297f60-69ca-5386-bcde-b61e274b549b" -version = "1.9.3" -weakdeps = ["PDMats", "SparseArrays", "Statistics"] - - [deps.FillArrays.extensions] - FillArraysPDMatsExt = "PDMats" - FillArraysSparseArraysExt = "SparseArrays" - FillArraysStatisticsExt = "Statistics" - 
-[[deps.FixedPointNumbers]] -deps = ["Statistics"] -git-tree-sha1 = "335bfdceacc84c5cdf16aadc768aa5ddfc5383cc" -uuid = "53c48c17-4a7d-5ca2-90c5-79b7896eea93" -version = "0.8.4" - -[[deps.Flux]] -deps = ["Adapt", "CUDA", "ChainRulesCore", "Functors", "LinearAlgebra", "MLUtils", "MacroTools", "NNlib", "NNlibCUDA", "OneHotArrays", "Optimisers", "ProgressLogging", "Random", "Reexport", "SparseArrays", "SpecialFunctions", "Statistics", "StatsBase", "Zygote"] -git-tree-sha1 = "4ff3a1d7b0dd38f2fc38e813bc801f817639c1f2" -uuid = "587475ba-b771-5e3f-ad9e-33799f191a9c" -version = "0.13.13" - -[[deps.Fontconfig_jll]] -deps = ["Artifacts", "Bzip2_jll", "Expat_jll", "FreeType2_jll", "JLLWrappers", "Libdl", "Libuuid_jll", "Pkg", "Zlib_jll"] -git-tree-sha1 = "21efd19106a55620a188615da6d3d06cd7f6ee03" -uuid = "a3f928ae-7b40-5064-980b-68af3947d34b" -version = "2.13.93+0" - -[[deps.ForceImport]] -deps = ["Test"] -git-tree-sha1 = "7ac07d5194360af910146abd33af89bb69541194" -uuid = "9dda63f9-cce7-5873-89fa-eccbb2fffcde" -version = "0.0.3" - -[[deps.Format]] -git-tree-sha1 = "f3cf88025f6d03c194d73f5d13fee9004a108329" -uuid = "1fa38f19-a742-5d3f-a2b9-30dd87b9d5f8" -version = "1.3.6" - -[[deps.Formatting]] -deps = ["Logging", "Printf"] -git-tree-sha1 = "fb409abab2caf118986fc597ba84b50cbaf00b87" -uuid = "59287772-0a20-5a39-b81b-1366585eb4c0" -version = "0.4.3" - -[[deps.ForwardDiff]] -deps = ["CommonSubexpressions", "DiffResults", "DiffRules", "LinearAlgebra", "LogExpFunctions", "NaNMath", "Preferences", "Printf", "Random", "SpecialFunctions"] -git-tree-sha1 = "cf0fe81336da9fb90944683b8c41984b08793dad" -uuid = "f6369f11-7733-5829-9624-2563aa707210" -version = "0.10.36" -weakdeps = ["StaticArrays"] - - [deps.ForwardDiff.extensions] - ForwardDiffStaticArraysExt = "StaticArrays" - -[[deps.FreeType2_jll]] -deps = ["Artifacts", "Bzip2_jll", "JLLWrappers", "Libdl", "Zlib_jll"] -git-tree-sha1 = "d8db6a5a2fe1381c1ea4ef2cab7c69c2de7f9ea0" -uuid = "d7e528f0-a631-5988-bf34-fe36492bcfd7" -version = "2.13.1+0" - -[[deps.FriBidi_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] -git-tree-sha1 = "aa31987c2ba8704e23c6c8ba8a4f769d5d7e4f91" -uuid = "559328eb-81f9-559d-9380-de523a88c83c" -version = "1.0.10+0" - -[[deps.Functors]] -deps = ["LinearAlgebra"] -git-tree-sha1 = "8ae30e786837ce0a24f5e2186938bf3251ab94b2" -uuid = "d9f16b24-f501-4c13-a1f2-28368ffc5196" -version = "0.4.8" - -[[deps.Future]] -deps = ["Random"] -uuid = "9fa8497b-333b-5362-9e8d-4d0656e87820" - -[[deps.GLFW_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Libglvnd_jll", "Xorg_libXcursor_jll", "Xorg_libXi_jll", "Xorg_libXinerama_jll", "Xorg_libXrandr_jll"] -git-tree-sha1 = "ff38ba61beff76b8f4acad8ab0c97ef73bb670cb" -uuid = "0656b61e-2033-5cc2-a64a-77c0f6c09b89" -version = "3.3.9+0" - -[[deps.GPUArrays]] -deps = ["Adapt", "GPUArraysCore", "LLVM", "LinearAlgebra", "Printf", "Random", "Reexport", "Serialization", "Statistics"] -git-tree-sha1 = "2e57b4a4f9cc15e85a24d603256fe08e527f48d1" -uuid = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7" -version = "8.8.1" - -[[deps.GPUArraysCore]] -deps = ["Adapt"] -git-tree-sha1 = "2d6ca471a6c7b536127afccfa7564b5b39227fe0" -uuid = "46192b85-c4d5-4398-a991-12ede77f4527" -version = "0.1.5" - -[[deps.GPUCompiler]] -deps = ["ExprTools", "InteractiveUtils", "LLVM", "Libdl", "Logging", "TimerOutputs", "UUIDs"] -git-tree-sha1 = "19d693666a304e8c371798f4900f7435558c7cde" -uuid = "61eb1bfa-7361-4325-ad38-22787b887f55" -version = "0.17.3" - -[[deps.GR]] -deps = ["Artifacts", "Base64", "DelimitedFiles", "Downloads", "GR_jll", "HTTP", 
"JSON", "Libdl", "LinearAlgebra", "Pkg", "Preferences", "Printf", "Random", "Serialization", "Sockets", "TOML", "Tar", "Test", "UUIDs", "p7zip_jll"] -git-tree-sha1 = "3437ade7073682993e092ca570ad68a2aba26983" -uuid = "28b8d3ca-fb5f-59d9-8090-bfdbd6d07a71" -version = "0.73.3" - -[[deps.GR_jll]] -deps = ["Artifacts", "Bzip2_jll", "Cairo_jll", "FFMPEG_jll", "Fontconfig_jll", "FreeType2_jll", "GLFW_jll", "JLLWrappers", "JpegTurbo_jll", "Libdl", "Libtiff_jll", "Pixman_jll", "Qt6Base_jll", "Zlib_jll", "libpng_jll"] -git-tree-sha1 = "a96d5c713e6aa28c242b0d25c1347e258d6541ab" -uuid = "d2c73de3-f751-5644-a686-071e5b155ba9" -version = "0.73.3+0" - -[[deps.GZip]] -deps = ["Libdl"] -git-tree-sha1 = "8d838ee3772e00c75d6cc06bb08891379868c18d" -uuid = "92fee26a-97fe-5a0c-ad85-20a5f3185b63" -version = "0.5.2" - -[[deps.GaussianMixtures]] -deps = ["Arpack", "Clustering", "Compat", "DelimitedFiles", "Distributed", "Distributions", "FileIO", "JLD2", "LinearAlgebra", "Logging", "PDMats", "Printf", "Random", "ScikitLearnBase", "SpecialFunctions", "Statistics", "StatsBase"] -git-tree-sha1 = "289e5126240812b9fb0ebada26c1e5ad305f904f" -uuid = "cc18c42c-b769-54ff-9e2a-b28141a64aae" -version = "0.3.9" - -[[deps.Gettext_jll]] -deps = ["Artifacts", "CompilerSupportLibraries_jll", "JLLWrappers", "Libdl", "Libiconv_jll", "Pkg", "XML2_jll"] -git-tree-sha1 = "9b02998aba7bf074d14de89f9d37ca24a1a0b046" -uuid = "78b55507-aeef-58d4-861c-77aaff3498b1" -version = "0.21.0+0" - -[[deps.Git]] -deps = ["Git_jll"] -git-tree-sha1 = "04eff47b1354d702c3a85e8ab23d539bb7d5957e" -uuid = "d7ba0133-e1db-5d97-8f8c-041e4b3a1eb2" -version = "1.3.1" - -[[deps.Git_jll]] -deps = ["Artifacts", "Expat_jll", "JLLWrappers", "LibCURL_jll", "Libdl", "Libiconv_jll", "OpenSSL_jll", "PCRE2_jll", "Zlib_jll"] -git-tree-sha1 = "12945451c5d0e2d0dca0724c3a8d6448b46bbdf9" -uuid = "f8c6e375-362e-5223-8a59-34ff63f689eb" -version = "2.44.0+1" - -[[deps.Glib_jll]] -deps = ["Artifacts", "Gettext_jll", "JLLWrappers", "Libdl", "Libffi_jll", "Libiconv_jll", "Libmount_jll", "PCRE2_jll", "Zlib_jll"] -git-tree-sha1 = "359a1ba2e320790ddbe4ee8b4d54a305c0ea2aff" -uuid = "7746bdde-850d-59dc-9ae8-88ece973131d" -version = "2.80.0+0" - -[[deps.Graphite2_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] -git-tree-sha1 = "344bf40dcab1073aca04aa0df4fb092f920e4011" -uuid = "3b182d85-2403-5c21-9c21-1e1f0cc25472" -version = "1.3.14+0" - -[[deps.Grisu]] -git-tree-sha1 = "53bb909d1151e57e2484c3d1b53e19552b887fb2" -uuid = "42e2da0e-8278-4e71-bc24-59509adca0fe" -version = "1.0.2" - -[[deps.HDF5]] -deps = ["Compat", "HDF5_jll", "Libdl", "MPIPreferences", "Mmap", "Preferences", "Printf", "Random", "Requires", "UUIDs"] -git-tree-sha1 = "26407bd1c60129062cec9da63dc7d08251544d53" -uuid = "f67ccb44-e63f-5c2f-98bd-6dc0ccc4ba2f" -version = "0.17.1" - - [deps.HDF5.extensions] - MPIExt = "MPI" - - [deps.HDF5.weakdeps] - MPI = "da04e1cc-30fd-572f-bb4f-1f8673147195" - -[[deps.HDF5_jll]] -deps = ["Artifacts", "CompilerSupportLibraries_jll", "JLLWrappers", "LLVMOpenMP_jll", "LazyArtifacts", "LibCURL_jll", "Libdl", "MPICH_jll", "MPIPreferences", "MPItrampoline_jll", "MicrosoftMPI_jll", "OpenMPI_jll", "OpenSSL_jll", "TOML", "Zlib_jll", "libaec_jll"] -git-tree-sha1 = "38c8874692d48d5440d5752d6c74b0c6b0b60739" -uuid = "0234f1f7-429e-5d53-9886-15a909be8d59" -version = "1.14.2+1" - -[[deps.HTTP]] -deps = ["Base64", "Dates", "IniFile", "Logging", "MbedTLS", "NetworkOptions", "Sockets", "URIs"] -git-tree-sha1 = "0fa77022fe4b511826b39c894c90daf5fce3334a" -uuid = "cd3eb016-35fb-5094-929b-558a96fad6f3" 
-version = "0.9.17" - -[[deps.HarfBuzz_jll]] -deps = ["Artifacts", "Cairo_jll", "Fontconfig_jll", "FreeType2_jll", "Glib_jll", "Graphite2_jll", "JLLWrappers", "Libdl", "Libffi_jll", "Pkg"] -git-tree-sha1 = "129acf094d168394e80ee1dc4bc06ec835e510a3" -uuid = "2e76f6c2-a576-52d4-95c1-20adfe4de566" -version = "2.8.1+1" - -[[deps.HostCPUFeatures]] -deps = ["BitTwiddlingConvenienceFunctions", "IfElse", "Libdl", "Static"] -git-tree-sha1 = "eb8fed28f4994600e29beef49744639d985a04b2" -uuid = "3e5b6fbb-0976-4d2c-9146-d79de83f2fb0" -version = "0.1.16" - -[[deps.Hwloc_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl"] -git-tree-sha1 = "ca0f6bf568b4bfc807e7537f081c81e35ceca114" -uuid = "e33a78d0-f292-5ffc-b300-72abe9b543c8" -version = "2.10.0+0" - -[[deps.HypergeometricFunctions]] -deps = ["DualNumbers", "LinearAlgebra", "OpenLibm_jll", "SpecialFunctions"] -git-tree-sha1 = "f218fe3736ddf977e0e772bc9a586b2383da2685" -uuid = "34004b35-14d8-5ef3-9330-4cdb6864b03a" -version = "0.3.23" - -[[deps.IOCapture]] -deps = ["Logging", "Random"] -git-tree-sha1 = "8b72179abc660bfab5e28472e019392b97d0985c" -uuid = "b5f81e59-6552-4d32-b1f0-c071b021bf89" -version = "0.2.4" - -[[deps.IRTools]] -deps = ["InteractiveUtils", "MacroTools", "Test"] -git-tree-sha1 = "5d8c5713f38f7bc029e26627b687710ba406d0dd" -uuid = "7869d1d1-7146-5819-86e3-90919afe41df" -version = "0.4.12" - -[[deps.IfElse]] -git-tree-sha1 = "debdd00ffef04665ccbb3e150747a77560e8fad1" -uuid = "615f187c-cbe4-4ef1-ba3b-2fcf58d6d173" -version = "0.1.1" - -[[deps.IniFile]] -git-tree-sha1 = "f550e6e32074c939295eb5ea6de31849ac2c9625" -uuid = "83e8ac13-25f8-5344-8a64-a9f2b223428f" -version = "0.5.1" - -[[deps.InitialValues]] -git-tree-sha1 = "4da0f88e9a39111c2fa3add390ab15f3a44f3ca3" -uuid = "22cec73e-a1b8-11e9-2c92-598750a2cf9c" -version = "0.3.1" - -[[deps.InteractiveUtils]] -deps = ["Markdown"] -uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240" - -[[deps.InternedStrings]] -deps = ["Random", "Test"] -git-tree-sha1 = "eb05b5625bc5d821b8075a77e4c421933e20c76b" -uuid = "7d512f48-7fb1-5a58-b986-67e6dc259f01" -version = "0.7.0" - -[[deps.InvertedIndices]] -git-tree-sha1 = "0dc7b50b8d436461be01300fd8cd45aa0274b038" -uuid = "41ab1584-1d38-5bbf-9106-f11c6c58b48f" -version = "1.3.0" - -[[deps.IrrationalConstants]] -git-tree-sha1 = "630b497eafcc20001bba38a4651b327dcfc491d2" -uuid = "92d709cd-6900-40b7-9082-c6be49f344b6" -version = "0.2.2" - -[[deps.IterationControl]] -deps = ["EarlyStopping", "InteractiveUtils"] -git-tree-sha1 = "f61d5d4d0e433b3fab03ca5a1bfa2d7dcbb8094c" -uuid = "b3c1a2ee-3fec-4384-bf48-272ea71de57c" -version = "0.4.0" - -[[deps.IteratorInterfaceExtensions]] -git-tree-sha1 = "a3f24677c21f5bbe9d2a714f95dcd58337fb2856" -uuid = "82899510-4779-5014-852e-03e436cf321d" -version = "1.0.0" - -[[deps.JLD2]] -deps = ["FileIO", "MacroTools", "Mmap", "OrderedCollections", "Pkg", "PrecompileTools", "Printf", "Reexport", "Requires", "TranscodingStreams", "UUIDs"] -git-tree-sha1 = "5ea6acdd53a51d897672edb694e3cc2912f3f8a7" -uuid = "033835bb-8acc-5ee8-8aae-3f567f8a3819" -version = "0.4.46" - -[[deps.JLFzf]] -deps = ["Pipe", "REPL", "Random", "fzf_jll"] -git-tree-sha1 = "a53ebe394b71470c7f97c2e7e170d51df21b17af" -uuid = "1019f520-868f-41f5-a6de-eb00f4b6a39c" -version = "0.1.7" - -[[deps.JLLWrappers]] -deps = ["Artifacts", "Preferences"] -git-tree-sha1 = "7e5d6779a1e09a36db2a7b6cff50942a0a7d0fca" -uuid = "692b3bcd-3c85-4b1f-b108-f13ce0eb3210" -version = "1.5.0" - -[[deps.JLSO]] -deps = ["BSON", "CodecZlib", "FilePathsBase", "Memento", "Pkg", "Serialization"] -git-tree-sha1 = 
"7e3821e362ede76f83a39635d177c63595296776" -uuid = "9da8a3cd-07a3-59c0-a743-3fdc52c30d11" -version = "2.7.0" - -[[deps.JSON]] -deps = ["Dates", "Mmap", "Parsers", "Unicode"] -git-tree-sha1 = "31e996f0a15c7b280ba9f76636b3ff9e2ae58c9a" -uuid = "682c06a0-de6a-54ab-a142-c8b1cf79cde6" -version = "0.21.4" - -[[deps.JSON3]] -deps = ["Dates", "Mmap", "Parsers", "PrecompileTools", "StructTypes", "UUIDs"] -git-tree-sha1 = "eb3edce0ed4fa32f75a0a11217433c31d56bd48b" -uuid = "0f8b85d8-7281-11e9-16c2-39a750bddbf1" -version = "1.14.0" - - [deps.JSON3.extensions] - JSON3ArrowExt = ["ArrowTypes"] - - [deps.JSON3.weakdeps] - ArrowTypes = "31f734f8-188a-4ce0-8406-c8a06bd891cd" - -[[deps.JpegTurbo_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl"] -git-tree-sha1 = "3336abae9a713d2210bb57ab484b1e065edd7d23" -uuid = "aacddb02-875f-59d6-b918-886e6ef4fbf8" -version = "3.0.2+0" - -[[deps.JuliaVariables]] -deps = ["MLStyle", "NameResolution"] -git-tree-sha1 = "49fb3cb53362ddadb4415e9b73926d6b40709e70" -uuid = "b14d175d-62b4-44ba-8fb7-3064adc8c3ec" -version = "0.2.4" - -[[deps.KernelAbstractions]] -deps = ["Adapt", "Atomix", "InteractiveUtils", "LinearAlgebra", "MacroTools", "PrecompileTools", "Requires", "SparseArrays", "StaticArrays", "UUIDs", "UnsafeAtomics", "UnsafeAtomicsLLVM"] -git-tree-sha1 = "ed7167240f40e62d97c1f5f7735dea6de3cc5c49" -uuid = "63c18a36-062a-441e-b654-da1e3ab1ce7c" -version = "0.9.18" - - [deps.KernelAbstractions.extensions] - EnzymeExt = "EnzymeCore" - - [deps.KernelAbstractions.weakdeps] - EnzymeCore = "f151be2c-9106-41f4-ab19-57ee4f262869" - -[[deps.LAME_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] -git-tree-sha1 = "f6250b16881adf048549549fba48b1161acdac8c" -uuid = "c1c5ebd0-6772-5130-a774-d5fcae4a789d" -version = "3.100.1+0" - -[[deps.LERC_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] -git-tree-sha1 = "bf36f528eec6634efc60d7ec062008f171071434" -uuid = "88015f11-f218-50d7-93a8-a6af411a945d" -version = "3.0.0+1" - -[[deps.LLVM]] -deps = ["CEnum", "LLVMExtra_jll", "Libdl", "Printf", "Unicode"] -git-tree-sha1 = "f044a2796a9e18e0531b9b3072b0019a61f264bc" -uuid = "929cbde3-209d-540e-8aea-75f648917ca0" -version = "4.17.1" - -[[deps.LLVMExtra_jll]] -deps = ["Artifacts", "JLLWrappers", "LazyArtifacts", "Libdl", "TOML"] -git-tree-sha1 = "070e4b5b65827f82c16ae0916376cb47377aa1b5" -uuid = "dad2f222-ce93-54a1-a47d-0025e8a3acab" -version = "0.0.18+0" - -[[deps.LLVMOpenMP_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl"] -git-tree-sha1 = "d986ce2d884d49126836ea94ed5bfb0f12679713" -uuid = "1d63c593-3942-5779-bab2-d838dc0a180e" -version = "15.0.7+0" - -[[deps.LZO_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] -git-tree-sha1 = "e5b909bcf985c5e2605737d2ce278ed791b89be6" -uuid = "dd4b983a-f0e5-5f8d-a1b7-129d4a5fb1ac" -version = "2.10.1+0" - -[[deps.LaTeXStrings]] -git-tree-sha1 = "50901ebc375ed41dbf8058da26f9de442febbbec" -uuid = "b964fa9f-0449-5b57-a5c2-d3ea65f4040f" -version = "1.3.1" - -[[deps.Latexify]] -deps = ["Format", "InteractiveUtils", "LaTeXStrings", "MacroTools", "Markdown", "OrderedCollections", "Requires"] -git-tree-sha1 = "cad560042a7cc108f5a4c24ea1431a9221f22c1b" -uuid = "23fbe1c1-3f47-55db-b15f-69d7ec21a316" -version = "0.16.2" - - [deps.Latexify.extensions] - DataFramesExt = "DataFrames" - SymEngineExt = "SymEngine" - - [deps.Latexify.weakdeps] - DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" - SymEngine = "123dc426-2d89-5057-bbad-38513e3affd8" - -[[deps.LatinHypercubeSampling]] -deps = ["Random", "StableRNGs", "StatsBase", "Test"] 
-git-tree-sha1 = "825289d43c753c7f1bf9bed334c253e9913997f8" -uuid = "a5e1c1ea-c99a-51d3-a14d-a9a37257b02d" -version = "1.9.0" - -[[deps.LayoutPointers]] -deps = ["ArrayInterface", "LinearAlgebra", "ManualMemory", "SIMDTypes", "Static", "StaticArrayInterface"] -git-tree-sha1 = "62edfee3211981241b57ff1cedf4d74d79519277" -uuid = "10f19ff3-798f-405d-979b-55457f8fc047" -version = "0.1.15" - -[[deps.LazilyInitializedFields]] -git-tree-sha1 = "8f7f3cabab0fd1800699663533b6d5cb3fc0e612" -uuid = "0e77f7df-68c5-4e49-93ce-4cd80f5598bf" -version = "1.2.2" - -[[deps.LazyArtifacts]] -deps = ["Artifacts", "Pkg"] -uuid = "4af54fe1-eca0-43a8-85a7-787d91b784e3" - -[[deps.LearnBase]] -git-tree-sha1 = "a0d90569edd490b82fdc4dc078ea54a5a800d30a" -uuid = "7f8f8fb0-2700-5f03-b4bd-41f8cfc144b6" -version = "0.4.1" - -[[deps.LibCURL]] -deps = ["LibCURL_jll", "MozillaCACerts_jll"] -uuid = "b27032c2-a3e7-50c8-80cd-2d36dbcbfd21" -version = "0.6.4" - -[[deps.LibCURL_jll]] -deps = ["Artifacts", "LibSSH2_jll", "Libdl", "MbedTLS_jll", "Zlib_jll", "nghttp2_jll"] -uuid = "deac9b47-8bc7-5906-a0fe-35ac56dc84c0" -version = "8.4.0+0" - -[[deps.LibGit2]] -deps = ["Base64", "LibGit2_jll", "NetworkOptions", "Printf", "SHA"] -uuid = "76f85450-5226-5b5a-8eaa-529ad045b433" - -[[deps.LibGit2_jll]] -deps = ["Artifacts", "LibSSH2_jll", "Libdl", "MbedTLS_jll"] -uuid = "e37daf67-58a4-590a-8e99-b0245dd2ffc5" -version = "1.6.4+0" - -[[deps.LibSSH2_jll]] -deps = ["Artifacts", "Libdl", "MbedTLS_jll"] -uuid = "29816b5a-b9ab-546f-933c-edad1886dfa8" -version = "1.11.0+1" - -[[deps.Libdl]] -uuid = "8f399da3-3557-5675-b5ff-fb832c97cbdb" - -[[deps.Libffi_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] -git-tree-sha1 = "0b4a5d71f3e5200a7dff793393e09dfc2d874290" -uuid = "e9f186c6-92d2-5b65-8a66-fee21dc1b490" -version = "3.2.2+1" - -[[deps.Libgcrypt_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Libgpg_error_jll", "Pkg"] -git-tree-sha1 = "64613c82a59c120435c067c2b809fc61cf5166ae" -uuid = "d4300ac3-e22c-5743-9152-c294e39db1e4" -version = "1.8.7+0" - -[[deps.Libglvnd_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "Xorg_libX11_jll", "Xorg_libXext_jll"] -git-tree-sha1 = "6f73d1dd803986947b2c750138528a999a6c7733" -uuid = "7e76a0d4-f3c7-5321-8279-8d96eeed0f29" -version = "1.6.0+0" - -[[deps.Libgpg_error_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] -git-tree-sha1 = "c333716e46366857753e273ce6a69ee0945a6db9" -uuid = "7add5ba3-2f88-524e-9cd5-f83b8a55f7b8" -version = "1.42.0+0" - -[[deps.Libiconv_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl"] -git-tree-sha1 = "f9557a255370125b405568f9767d6d195822a175" -uuid = "94ce4f54-9a6c-5748-9c1c-f9c7231a4531" -version = "1.17.0+0" - -[[deps.Libmount_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl"] -git-tree-sha1 = "dae976433497a2f841baadea93d27e68f1a12a97" -uuid = "4b2f31a3-9ecc-558c-b454-b3730dcb73e9" -version = "2.39.3+0" - -[[deps.Libtiff_jll]] -deps = ["Artifacts", "JLLWrappers", "JpegTurbo_jll", "LERC_jll", "Libdl", "XZ_jll", "Zlib_jll", "Zstd_jll"] -git-tree-sha1 = "2da088d113af58221c52828a80378e16be7d037a" -uuid = "89763e89-9b03-5906-acba-b20f662cd828" -version = "4.5.1+1" - -[[deps.Libuuid_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl"] -git-tree-sha1 = "0a04a1318df1bf510beb2562cf90fb0c386f58c4" -uuid = "38a345b3-de98-5d2b-a5d3-14cd9215e700" -version = "2.39.3+1" - -[[deps.LinearAlgebra]] -deps = ["Libdl", "OpenBLAS_jll", "libblastrampoline_jll"] -uuid = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" - -[[deps.Literate]] -deps = ["Base64", "IOCapture", "JSON", "REPL"] 
-git-tree-sha1 = "bad26f1ccd99c553886ec0725e99a509589dcd11" -uuid = "98b081ad-f1c9-55d3-8b20-4c87d4299306" -version = "2.16.1" - -[[deps.LogExpFunctions]] -deps = ["DocStringExtensions", "IrrationalConstants", "LinearAlgebra"] -git-tree-sha1 = "18144f3e9cbe9b15b070288eef858f71b291ce37" -uuid = "2ab3a3ac-af41-5b50-aa03-7779005ae688" -version = "0.3.27" - - [deps.LogExpFunctions.extensions] - LogExpFunctionsChainRulesCoreExt = "ChainRulesCore" - LogExpFunctionsChangesOfVariablesExt = "ChangesOfVariables" - LogExpFunctionsInverseFunctionsExt = "InverseFunctions" - - [deps.LogExpFunctions.weakdeps] - ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4" - ChangesOfVariables = "9e997f8a-9a97-42d5-a9f1-ce6bfc15e2c0" - InverseFunctions = "3587e190-3f89-42d0-90ee-14403ec27112" - -[[deps.Logging]] -uuid = "56ddb016-857b-54e1-b83d-db4d58db5568" - -[[deps.LoopVectorization]] -deps = ["ArrayInterface", "CPUSummary", "CloseOpenIntervals", "DocStringExtensions", "HostCPUFeatures", "IfElse", "LayoutPointers", "LinearAlgebra", "OffsetArrays", "PolyesterWeave", "PrecompileTools", "SIMDTypes", "SLEEFPirates", "Static", "StaticArrayInterface", "ThreadingUtilities", "UnPack", "VectorizationBase"] -git-tree-sha1 = "0f5648fbae0d015e3abe5867bca2b362f67a5894" -uuid = "bdcacae8-1622-11e9-2a5c-532679323890" -version = "0.12.166" -weakdeps = ["ChainRulesCore", "ForwardDiff", "SpecialFunctions"] - - [deps.LoopVectorization.extensions] - ForwardDiffExt = ["ChainRulesCore", "ForwardDiff"] - SpecialFunctionsExt = "SpecialFunctions" - -[[deps.LossFunctions]] -deps = ["InteractiveUtils", "LearnBase", "Markdown", "RecipesBase", "StatsBase"] -git-tree-sha1 = "0f057f6ea90a84e73a8ef6eebb4dc7b5c330020f" -uuid = "30fc2ffe-d236-52d8-8643-a9d8f7c094a7" -version = "0.7.2" - -[[deps.MAT]] -deps = ["BufferedStreams", "CodecZlib", "HDF5", "SparseArrays"] -git-tree-sha1 = "ed1cf0a322d78cee07718bed5fd945e2218c35a1" -uuid = "23992714-dd62-5051-b70f-ba57cb901cac" -version = "0.10.6" - -[[deps.MLDatasets]] -deps = ["BinDeps", "ColorTypes", "DataDeps", "DelimitedFiles", "FixedPointNumbers", "GZip", "JSON3", "MAT", "Pickle", "Requires", "SparseArrays"] -git-tree-sha1 = "f1ff456828cfceb8fd64f2f212dea67b1414be96" -uuid = "eb30cadb-4394-5ae3-aed4-317e484a6458" -version = "0.5.15" - -[[deps.MLJ]] -deps = ["CategoricalArrays", "ComputationalResources", "Distributed", "Distributions", "LinearAlgebra", "MLJBase", "MLJEnsembles", "MLJIteration", "MLJModels", "MLJOpenML", "MLJSerialization", "MLJTuning", "Pkg", "ProgressMeter", "Random", "ScientificTypes", "Statistics", "StatsBase", "Tables"] -git-tree-sha1 = "7cbd651e39fd3f3aa37e8a4d8beaccfa8d13b1cd" -uuid = "add582a8-e3ab-11e8-2d5e-e98b27df1bc7" -version = "0.16.7" - -[[deps.MLJBase]] -deps = ["CategoricalArrays", "ComputationalResources", "Dates", "DelimitedFiles", "Distributed", "Distributions", "InteractiveUtils", "InvertedIndices", "LinearAlgebra", "LossFunctions", "MLJModelInterface", "Missings", "OrderedCollections", "Parameters", "PrettyTables", "ProgressMeter", "Random", "ScientificTypes", "StatisticalTraits", "Statistics", "StatsBase", "Tables"] -git-tree-sha1 = "4a0b5d1212786f5de364f481ef2a84458d3dfe4b" -uuid = "a7f614a8-145f-11e9-1d2a-a57a1082229d" -version = "0.18.26" - -[[deps.MLJEnsembles]] -deps = ["CategoricalArrays", "ComputationalResources", "Distributed", "Distributions", "MLJBase", "MLJModelInterface", "ProgressMeter", "Random", "ScientificTypes", "StatsBase"] -git-tree-sha1 = "f8ca949d52432b81f621d9da641cf59829ad2c8c" -uuid = "50ed68f4-41fd-4504-931a-ed422449fee0" -version = 
"0.1.2" - -[[deps.MLJIteration]] -deps = ["IterationControl", "MLJBase", "Random"] -git-tree-sha1 = "1c94830f8927b10a5653d6e1868c20faccf57be5" -uuid = "614be32b-d00c-4edb-bd02-1eb411ab5e55" -version = "0.3.3" - -[[deps.MLJModelInterface]] -deps = ["Random", "ScientificTypesBase", "StatisticalTraits"] -git-tree-sha1 = "0174e9d180b0cae1f8fe7976350ad52f0e70e0d8" -uuid = "e80e1ace-859a-464e-9ed9-23947d8ae3ea" -version = "1.3.3" - -[[deps.MLJModels]] -deps = ["CategoricalArrays", "Dates", "Distances", "Distributions", "InteractiveUtils", "LinearAlgebra", "MLJBase", "MLJModelInterface", "OrderedCollections", "Parameters", "Pkg", "REPL", "Random", "Requires", "ScientificTypes", "Statistics", "StatsBase", "Tables"] -git-tree-sha1 = "271c431ef783079db3371ffe770140bb83cf2f16" -uuid = "d491faf4-2d78-11e9-2867-c94bc002c0b7" -version = "0.14.14" - -[[deps.MLJOpenML]] -deps = ["CSV", "HTTP", "JSON", "Markdown", "ScientificTypes"] -git-tree-sha1 = "a0d6e25ec042ab84505733a62a2b2894fbcf260c" -uuid = "cbea4545-8c96-4583-ad3a-44078d60d369" -version = "1.1.0" - -[[deps.MLJSerialization]] -deps = ["IterationControl", "JLSO", "MLJBase", "MLJModelInterface"] -git-tree-sha1 = "cc5877ad02ef02e273d2622f0d259d628fa61cd0" -uuid = "17bed46d-0ab5-4cd4-b792-a5c4b8547c6d" -version = "1.1.3" - -[[deps.MLJTuning]] -deps = ["ComputationalResources", "Distributed", "Distributions", "LatinHypercubeSampling", "MLJBase", "ProgressMeter", "Random", "RecipesBase"] -git-tree-sha1 = "a443cc088158b949876d7038a1aa37cfc8c5509b" -uuid = "03970b2e-30c4-11ea-3135-d1576263f10f" -version = "0.6.16" - -[[deps.MLStyle]] -git-tree-sha1 = "bc38dff0548128765760c79eb7388a4b37fae2c8" -uuid = "d8e11817-5142-5d16-987a-aa16d5891078" -version = "0.4.17" - -[[deps.MLUtils]] -deps = ["ChainRulesCore", "Compat", "DataAPI", "DelimitedFiles", "FLoops", "NNlib", "Random", "ShowCases", "SimpleTraits", "Statistics", "StatsBase", "Tables", "Transducers"] -git-tree-sha1 = "b45738c2e3d0d402dffa32b2c1654759a2ac35a4" -uuid = "f1d291b0-491e-4a28-83b9-f70985020b54" -version = "0.4.4" - -[[deps.MPICH_jll]] -deps = ["Artifacts", "CompilerSupportLibraries_jll", "Hwloc_jll", "JLLWrappers", "LazyArtifacts", "Libdl", "MPIPreferences", "TOML"] -git-tree-sha1 = "656036b9ed6f942d35e536e249600bc31d0f9df8" -uuid = "7cb0a576-ebde-5e09-9194-50597f1243b4" -version = "4.2.0+0" - -[[deps.MPIPreferences]] -deps = ["Libdl", "Preferences"] -git-tree-sha1 = "8f6af051b9e8ec597fa09d8885ed79fd582f33c9" -uuid = "3da0fdf6-3ccc-4f1b-acd9-58baa6c99267" -version = "0.1.10" - -[[deps.MPItrampoline_jll]] -deps = ["Artifacts", "CompilerSupportLibraries_jll", "Hwloc_jll", "JLLWrappers", "LazyArtifacts", "Libdl", "MPIPreferences", "TOML"] -git-tree-sha1 = "77c3bd69fdb024d75af38713e883d0f249ce19c2" -uuid = "f1f71cc9-e9ae-5b93-9b94-4fe0e1ad3748" -version = "5.3.2+0" - -[[deps.MacroTools]] -deps = ["Markdown", "Random"] -git-tree-sha1 = "2fa9ee3e63fd3a4f7a9a4f4744a52f4856de82df" -uuid = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09" -version = "0.5.13" - -[[deps.ManualMemory]] -git-tree-sha1 = "bcaef4fc7a0cfe2cba636d84cda54b5e4e4ca3cd" -uuid = "d125e4d3-2237-4719-b19c-fa641b8a4667" -version = "0.1.8" - -[[deps.Markdown]] -deps = ["Base64"] -uuid = "d6f4376e-aef5-505a-96c1-9c027394607a" - -[[deps.MarkdownAST]] -deps = ["AbstractTrees", "Markdown"] -git-tree-sha1 = "465a70f0fc7d443a00dcdc3267a497397b8a3899" -uuid = "d0879d2d-cac2-40c8-9cee-1863dc0c7391" -version = "0.1.2" - -[[deps.MbedTLS]] -deps = ["Dates", "MbedTLS_jll", "MozillaCACerts_jll", "NetworkOptions", "Random", "Sockets"] -git-tree-sha1 = 
"c067a280ddc25f196b5e7df3877c6b226d390aaf" -uuid = "739be429-bea8-5141-9913-cc70e7f3736d" -version = "1.1.9" - -[[deps.MbedTLS_jll]] -deps = ["Artifacts", "Libdl"] -uuid = "c8ffd9c3-330d-5841-b78e-0817d7145fa1" -version = "2.28.2+1" - -[[deps.Measures]] -git-tree-sha1 = "c13304c81eec1ed3af7fc20e75fb6b26092a1102" -uuid = "442fdcdd-2543-5da2-b0f3-8c86c306513e" -version = "0.3.2" - -[[deps.Memento]] -deps = ["Dates", "Distributed", "Requires", "Serialization", "Sockets", "Test", "UUIDs"] -git-tree-sha1 = "bb2e8f4d9f400f6e90d57b34860f6abdc51398e5" -uuid = "f28f55f0-a522-5efc-85c2-fe41dfb9b2d9" -version = "1.4.1" - -[[deps.MicroCollections]] -deps = ["BangBang", "InitialValues", "Setfield"] -git-tree-sha1 = "629afd7d10dbc6935ec59b32daeb33bc4460a42e" -uuid = "128add7d-3638-4c79-886c-908ea0c25c34" -version = "0.1.4" - -[[deps.MicrosoftMPI_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] -git-tree-sha1 = "f12a29c4400ba812841c6ace3f4efbb6dbb3ba01" -uuid = "9237b28f-5490-5468-be7b-bb81f5f5e6cf" -version = "10.1.4+2" - -[[deps.Missings]] -deps = ["DataAPI"] -git-tree-sha1 = "f66bdc5de519e8f8ae43bdc598782d35a25b1272" -uuid = "e1d29d7a-bbdc-5cf2-9ac0-f12de2c33e28" -version = "1.1.0" - -[[deps.Mmap]] -uuid = "a63ad114-7e13-5084-954f-fe012c677804" - -[[deps.Mocking]] -deps = ["Compat", "ExprTools"] -git-tree-sha1 = "4cc0c5a83933648b615c36c2b956d94fda70641e" -uuid = "78c3b35d-d492-501b-9361-3d52fe80e533" -version = "0.7.7" - -[[deps.MozillaCACerts_jll]] -uuid = "14a3606d-f60d-562e-9121-12d972cd8159" -version = "2023.1.10" - -[[deps.NNlib]] -deps = ["Adapt", "Atomix", "ChainRulesCore", "GPUArraysCore", "KernelAbstractions", "LinearAlgebra", "Pkg", "Random", "Requires", "Statistics"] -git-tree-sha1 = "72240e3f5ca031937bd536182cb2c031da5f46dd" -uuid = "872c559c-99b0-510c-b3b7-b6c96a88d5cd" -version = "0.8.21" - - [deps.NNlib.extensions] - NNlibAMDGPUExt = "AMDGPU" - - [deps.NNlib.weakdeps] - AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e" - -[[deps.NNlibCUDA]] -deps = ["Adapt", "CUDA", "LinearAlgebra", "NNlib", "Random", "Statistics"] -git-tree-sha1 = "b05a082b08a3af0e5c576883bc6dfb6513e7e478" -uuid = "a00861dc-f156-4864-bf3c-e6376f28a68d" -version = "0.2.6" - -[[deps.NaNMath]] -deps = ["OpenLibm_jll"] -git-tree-sha1 = "0877504529a3e5c3343c6f8b4c0381e57e4387e4" -uuid = "77ba4419-2d1f-58cd-9bb1-8ffee604a2e3" -version = "1.0.2" - -[[deps.NameResolution]] -deps = ["PrettyPrint"] -git-tree-sha1 = "1a0fa0e9613f46c9b8c11eee38ebb4f590013c5e" -uuid = "71a1bf82-56d0-4bbc-8a3c-48b961074391" -version = "0.1.5" - -[[deps.NearestNeighbors]] -deps = ["Distances", "StaticArrays"] -git-tree-sha1 = "ded64ff6d4fdd1cb68dfcbb818c69e144a5b2e4c" -uuid = "b8a86587-4115-5ab1-83bc-aa920d37bbce" -version = "0.4.16" - -[[deps.NetworkOptions]] -uuid = "ca575930-c2e3-43a9-ace4-1e988b2c1908" -version = "1.2.0" - -[[deps.OffsetArrays]] -git-tree-sha1 = "6a731f2b5c03157418a20c12195eb4b74c8f8621" -uuid = "6fe1bfb0-de20-5000-8ca7-80f57d26f881" -version = "1.13.0" -weakdeps = ["Adapt"] - - [deps.OffsetArrays.extensions] - OffsetArraysAdaptExt = "Adapt" - -[[deps.Ogg_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] -git-tree-sha1 = "887579a3eb005446d514ab7aeac5d1d027658b8f" -uuid = "e7412a2a-1a6e-54c0-be00-318e2571c051" -version = "1.3.5+1" - -[[deps.OneHotArrays]] -deps = ["Adapt", "ChainRulesCore", "Compat", "GPUArraysCore", "LinearAlgebra", "NNlib"] -git-tree-sha1 = "963a3f28a2e65bb87a68033ea4a616002406037d" -uuid = "0b1bfda6-eb8a-41d2-88d8-f5af5cad476f" -version = "0.2.5" - -[[deps.OpenBLAS_jll]] -deps = ["Artifacts", 
"CompilerSupportLibraries_jll", "Libdl"] -uuid = "4536629a-c528-5b80-bd46-f80d51c5b363" -version = "0.3.23+2" - -[[deps.OpenLibm_jll]] -deps = ["Artifacts", "Libdl"] -uuid = "05823500-19ac-5b8b-9628-191a04bc5112" -version = "0.8.1+2" - -[[deps.OpenMPI_jll]] -deps = ["Artifacts", "CompilerSupportLibraries_jll", "Hwloc_jll", "JLLWrappers", "LazyArtifacts", "Libdl", "MPIPreferences", "PMIx_jll", "TOML", "Zlib_jll", "libevent_jll", "prrte_jll"] -git-tree-sha1 = "f46caf663e069027a06942d00dced37f1eb3d8ad" -uuid = "fe0851c0-eecd-5654-98d4-656369965a5c" -version = "5.0.2+0" - -[[deps.OpenSSL_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl"] -git-tree-sha1 = "60e3045590bd104a16fefb12836c00c0ef8c7f8c" -uuid = "458c3c95-2e84-50aa-8efc-19380b2a3a95" -version = "3.0.13+0" - -[[deps.OpenSpecFun_jll]] -deps = ["Artifacts", "CompilerSupportLibraries_jll", "JLLWrappers", "Libdl", "Pkg"] -git-tree-sha1 = "13652491f6856acfd2db29360e1bbcd4565d04f1" -uuid = "efe28fd5-8261-553b-a9e1-b2916fc3738e" -version = "0.5.5+0" - -[[deps.Optimisers]] -deps = ["ChainRulesCore", "Functors", "LinearAlgebra", "Random", "Statistics"] -git-tree-sha1 = "c1fc26bab5df929a5172f296f25d7d08688fd25b" -uuid = "3bd65402-5787-11e9-1adc-39752487f4e2" -version = "0.2.20" - -[[deps.Opus_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] -git-tree-sha1 = "51a08fb14ec28da2ec7a927c4337e4332c2a4720" -uuid = "91d4177d-7536-5919-b921-800302f37372" -version = "1.3.2+0" - -[[deps.OrderedCollections]] -git-tree-sha1 = "dfdf5519f235516220579f949664f1bf44e741c5" -uuid = "bac558e1-5e72-5ebc-8fee-abe8a469f55d" -version = "1.6.3" - -[[deps.PCRE2_jll]] -deps = ["Artifacts", "Libdl"] -uuid = "efcefdf7-47ab-520b-bdef-62a2eaa19f15" -version = "10.42.0+1" - -[[deps.PDMats]] -deps = ["LinearAlgebra", "SparseArrays", "SuiteSparse"] -git-tree-sha1 = "949347156c25054de2db3b166c52ac4728cbad65" -uuid = "90014a1f-27ba-587c-ab20-58faa44d9150" -version = "0.11.31" - -[[deps.PMIx_jll]] -deps = ["Artifacts", "Hwloc_jll", "JLLWrappers", "Libdl", "Zlib_jll", "libevent_jll"] -git-tree-sha1 = "8b3b19351fa24791f94d7ae85faf845ca1362541" -uuid = "32165bc3-0280-59bc-8c0b-c33b6203efab" -version = "4.2.7+0" - -[[deps.Parameters]] -deps = ["OrderedCollections", "UnPack"] -git-tree-sha1 = "34c0e9ad262e5f7fc75b10a9952ca7692cfc5fbe" -uuid = "d96e819e-fc66-5662-9728-84c9c7592b0a" -version = "0.12.3" - -[[deps.Parsers]] -deps = ["Dates"] -git-tree-sha1 = "bfd7d8c7fd87f04543810d9cbd3995972236ba1b" -uuid = "69de0a69-1ddd-5017-9359-2bf0b02dc9f0" -version = "1.1.2" - -[[deps.Pickle]] -deps = ["DataStructures", "InternedStrings", "Serialization", "SparseArrays", "Strided", "StringEncodings", "ZipFile"] -git-tree-sha1 = "e6a34eb1dc0c498f0774bbfbbbeff2de101f4235" -uuid = "fbb45041-c46e-462f-888f-7c521cafbc2c" -version = "0.3.2" - -[[deps.Pipe]] -git-tree-sha1 = "6842804e7867b115ca9de748a0cf6b364523c16d" -uuid = "b98c9c47-44ae-5843-9183-064241ee97a0" -version = "1.3.0" - -[[deps.Pixman_jll]] -deps = ["Artifacts", "CompilerSupportLibraries_jll", "JLLWrappers", "LLVMOpenMP_jll", "Libdl"] -git-tree-sha1 = "64779bc4c9784fee475689a1752ef4d5747c5e87" -uuid = "30392449-352a-5448-841d-b1acce4e97dc" -version = "0.42.2+0" - -[[deps.Pkg]] -deps = ["Artifacts", "Dates", "Downloads", "FileWatching", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "Serialization", "TOML", "Tar", "UUIDs", "p7zip_jll"] -uuid = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" -version = "1.10.0" - -[[deps.PlotThemes]] -deps = ["PlotUtils", "Statistics"] -git-tree-sha1 = 
"1f03a2d339f42dca4a4da149c7e15e9b896ad899" -uuid = "ccf2f8ad-2431-5c83-bf29-c5338b663b6a" -version = "3.1.0" - -[[deps.PlotUtils]] -deps = ["ColorSchemes", "Colors", "Dates", "PrecompileTools", "Printf", "Random", "Reexport", "Statistics"] -git-tree-sha1 = "7b1a9df27f072ac4c9c7cbe5efb198489258d1f5" -uuid = "995b91a9-d308-5afd-9ec6-746e21dbc043" -version = "1.4.1" - -[[deps.Plots]] -deps = ["Base64", "Contour", "Dates", "Downloads", "FFMPEG", "FixedPointNumbers", "GR", "JLFzf", "JSON", "LaTeXStrings", "Latexify", "LinearAlgebra", "Measures", "NaNMath", "Pkg", "PlotThemes", "PlotUtils", "PrecompileTools", "Printf", "REPL", "Random", "RecipesBase", "RecipesPipeline", "Reexport", "RelocatableFolders", "Requires", "Scratch", "Showoff", "SparseArrays", "Statistics", "StatsBase", "UUIDs", "UnicodeFun", "UnitfulLatexify", "Unzip"] -git-tree-sha1 = "3c403c6590dd93b36752634115e20137e79ab4df" -uuid = "91a5bcdd-55d7-5caf-9e0b-520d859cae80" -version = "1.40.2" - - [deps.Plots.extensions] - FileIOExt = "FileIO" - GeometryBasicsExt = "GeometryBasics" - IJuliaExt = "IJulia" - ImageInTerminalExt = "ImageInTerminal" - UnitfulExt = "Unitful" - - [deps.Plots.weakdeps] - FileIO = "5789e2e9-d7fb-5bc7-8068-2c6fae9b9549" - GeometryBasics = "5c1252a2-5f33-56bf-86c9-59e7332b4326" - IJulia = "7073ff75-c697-5162-941a-fcdaad2a7d2a" - ImageInTerminal = "d8c32880-2388-543b-8c61-d9f865259254" - Unitful = "1986cc42-f94f-5a68-af5c-568840ba703d" - -[[deps.PolyesterWeave]] -deps = ["BitTwiddlingConvenienceFunctions", "CPUSummary", "IfElse", "Static", "ThreadingUtilities"] -git-tree-sha1 = "240d7170f5ffdb285f9427b92333c3463bf65bf6" -uuid = "1d0040c9-8b98-4ee7-8388-3f51789ca0ad" -version = "0.2.1" - -[[deps.PooledArrays]] -deps = ["DataAPI", "Future"] -git-tree-sha1 = "36d8b4b899628fb92c2749eb488d884a926614d3" -uuid = "2dfb63ee-cc39-5dd5-95bd-886bf059d720" -version = "1.4.3" - -[[deps.PrecompileTools]] -deps = ["Preferences"] -git-tree-sha1 = "5aa36f7049a63a1528fe8f7c3f2113413ffd4e1f" -uuid = "aea7be01-6a6a-4083-8856-8a6e6704d82a" -version = "1.2.1" - -[[deps.Preferences]] -deps = ["TOML"] -git-tree-sha1 = "9306f6085165d270f7e3db02af26a400d580f5c6" -uuid = "21216c6a-2e73-6563-6e65-726566657250" -version = "1.4.3" - -[[deps.PrettyPrint]] -git-tree-sha1 = "632eb4abab3449ab30c5e1afaa874f0b98b586e4" -uuid = "8162dcfd-2161-5ef2-ae6c-7681170c5f98" -version = "0.2.0" - -[[deps.PrettyTables]] -deps = ["Crayons", "Formatting", "Markdown", "Reexport", "Tables"] -git-tree-sha1 = "dfb54c4e414caa595a1f2ed759b160f5a3ddcba5" -uuid = "08abe8d2-0d0c-5749-adfa-8a2ac140af0d" -version = "1.3.1" - -[[deps.Printf]] -deps = ["Unicode"] -uuid = "de0858da-6303-5e67-8744-51eddeeeb8d7" - -[[deps.Profile]] -deps = ["Printf"] -uuid = "9abbd945-dff8-562f-b5e8-e1ebf5ef1b79" - -[[deps.ProgressLogging]] -deps = ["Logging", "SHA", "UUIDs"] -git-tree-sha1 = "80d919dee55b9c50e8d9e2da5eeafff3fe58b539" -uuid = "33c8b6b6-d38a-422a-b730-caa89a2f386c" -version = "0.1.4" - -[[deps.ProgressMeter]] -deps = ["Distributed", "Printf"] -git-tree-sha1 = "763a8ceb07833dd51bb9e3bbca372de32c0605ad" -uuid = "92933f4c-e287-5a05-a399-4b506db050ca" -version = "1.10.0" - -[[deps.PyCall]] -deps = ["Conda", "Dates", "Libdl", "LinearAlgebra", "MacroTools", "Serialization", "VersionParsing"] -git-tree-sha1 = "9816a3826b0ebf49ab4926e2b18842ad8b5c8f04" -uuid = "438e738f-606a-5dbb-bf0a-cddfbfd45ab0" -version = "1.96.4" - -[[deps.Qt6Base_jll]] -deps = ["Artifacts", "CompilerSupportLibraries_jll", "Fontconfig_jll", "Glib_jll", "JLLWrappers", "Libdl", "Libglvnd_jll", "OpenSSL_jll", 
"Vulkan_Loader_jll", "Xorg_libSM_jll", "Xorg_libXext_jll", "Xorg_libXrender_jll", "Xorg_libxcb_jll", "Xorg_xcb_util_cursor_jll", "Xorg_xcb_util_image_jll", "Xorg_xcb_util_keysyms_jll", "Xorg_xcb_util_renderutil_jll", "Xorg_xcb_util_wm_jll", "Zlib_jll", "libinput_jll", "xkbcommon_jll"] -git-tree-sha1 = "37b7bb7aabf9a085e0044307e1717436117f2b3b" -uuid = "c0090381-4147-56d7-9ebc-da0b1113ec56" -version = "6.5.3+1" - -[[deps.QuadGK]] -deps = ["DataStructures", "LinearAlgebra"] -git-tree-sha1 = "9b23c31e76e333e6fb4c1595ae6afa74966a729e" -uuid = "1fd47b50-473d-5c70-9696-f719f8f3bcdc" -version = "2.9.4" - -[[deps.RCall]] -deps = ["CategoricalArrays", "Conda", "DataFrames", "DataStructures", "Dates", "Libdl", "Missings", "Preferences", "REPL", "Random", "Requires", "StatsModels", "WinReg"] -git-tree-sha1 = "846b2aab2d312fda5e7b099fc217c661e8fae27e" -uuid = "6f49c342-dc21-5d91-9882-a32aef131414" -version = "0.14.1" - -[[deps.RData]] -deps = ["CategoricalArrays", "CodecZlib", "DataFrames", "Dates", "FileIO", "Requires", "TimeZones", "Unicode"] -git-tree-sha1 = "19e47a495dfb7240eb44dc6971d660f7e4244a72" -uuid = "df47a6cb-8c03-5eed-afd8-b6050d6c41da" -version = "0.8.3" - -[[deps.RDatasets]] -deps = ["CSV", "CodecZlib", "DataFrames", "FileIO", "Printf", "RData", "Reexport"] -git-tree-sha1 = "2720e6f6afb3e562ccb70a6b62f8f308ff810333" -uuid = "ce6b1742-4840-55fa-b093-852dadbb1d8b" -version = "0.7.7" - -[[deps.REPL]] -deps = ["InteractiveUtils", "Markdown", "Sockets", "Unicode"] -uuid = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb" - -[[deps.Random]] -deps = ["SHA"] -uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" - -[[deps.Random123]] -deps = ["Random", "RandomNumbers"] -git-tree-sha1 = "4743b43e5a9c4a2ede372de7061eed81795b12e7" -uuid = "74087812-796a-5b5d-8853-05524746bad3" -version = "1.7.0" - -[[deps.RandomNumbers]] -deps = ["Random", "Requires"] -git-tree-sha1 = "043da614cc7e95c703498a491e2c21f58a2b8111" -uuid = "e6cf234a-135c-5ec9-84dd-332b85af5143" -version = "1.5.3" - -[[deps.RealDot]] -deps = ["LinearAlgebra"] -git-tree-sha1 = "9f0a1b71baaf7650f4fa8a1d168c7fb6ee41f0c9" -uuid = "c1ae055f-0cd5-4b69-90a6-9a35b1a98df9" -version = "0.1.0" - -[[deps.RecipesBase]] -deps = ["PrecompileTools"] -git-tree-sha1 = "5c3d09cc4f31f5fc6af001c250bf1278733100ff" -uuid = "3cdcf5f2-1ef4-517c-9805-6587b60abb01" -version = "1.3.4" - -[[deps.RecipesPipeline]] -deps = ["Dates", "NaNMath", "PlotUtils", "PrecompileTools", "RecipesBase"] -git-tree-sha1 = "45cf9fd0ca5839d06ef333c8201714e888486342" -uuid = "01d81517-befc-4cb6-b9ec-a95719d0359c" -version = "0.6.12" - -[[deps.Reexport]] -git-tree-sha1 = "45e428421666073eab6f2da5c9d310d99bb12f9b" -uuid = "189a3867-3050-52da-a836-e630ba90ab69" -version = "1.2.2" - -[[deps.RegistryInstances]] -deps = ["LazilyInitializedFields", "Pkg", "TOML", "Tar"] -git-tree-sha1 = "ffd19052caf598b8653b99404058fce14828be51" -uuid = "2792f1a3-b283-48e8-9a74-f99dce5104f3" -version = "0.1.0" - -[[deps.RelocatableFolders]] -deps = ["SHA", "Scratch"] -git-tree-sha1 = "ffdaf70d81cf6ff22c2b6e733c900c3321cab864" -uuid = "05181044-ff0b-4ac5-8273-598c1e38db00" -version = "1.0.1" - -[[deps.Requires]] -deps = ["UUIDs"] -git-tree-sha1 = "838a3a4188e2ded87a4f9f184b4b0d78a1e91cb7" -uuid = "ae029012-a4dd-5104-9daa-d747884805df" -version = "1.3.0" - -[[deps.Rmath]] -deps = ["Random", "Rmath_jll"] -git-tree-sha1 = "f65dcb5fa46aee0cf9ed6274ccbd597adc49aa7b" -uuid = "79098fc4-a85e-5d69-aa6a-4863f24498fa" -version = "0.7.1" - -[[deps.Rmath_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] -git-tree-sha1 = 
"6ed52fdd3382cf21947b15e8870ac0ddbff736da" -uuid = "f50d1b31-88e8-58de-be2c-1cc44531875f" -version = "0.4.0+0" - -[[deps.SHA]] -uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce" -version = "0.7.0" - -[[deps.SIMDTypes]] -git-tree-sha1 = "330289636fb8107c5f32088d2741e9fd7a061a5c" -uuid = "94e857df-77ce-4151-89e5-788b33177be4" -version = "0.1.0" - -[[deps.SLEEFPirates]] -deps = ["IfElse", "Static", "VectorizationBase"] -git-tree-sha1 = "3aac6d68c5e57449f5b9b865c9ba50ac2970c4cf" -uuid = "476501e8-09a2-5ece-8869-fb82de89a1fa" -version = "0.6.42" - -[[deps.ScientificTypes]] -deps = ["CategoricalArrays", "ColorTypes", "Dates", "Distributions", "PrettyTables", "Reexport", "ScientificTypesBase", "StatisticalTraits", "Tables"] -git-tree-sha1 = "7a3efcacd212801a8cf2f961e8238ffb2109b30d" -uuid = "321657f4-b219-11e9-178b-2701a2544e81" -version = "2.3.3" - -[[deps.ScientificTypesBase]] -git-tree-sha1 = "185e373beaf6b381c1e7151ce2c2a722351d6637" -uuid = "30f210dd-8aff-4c5f-94ba-8e64358c1161" -version = "2.3.0" - -[[deps.ScikitLearnBase]] -deps = ["LinearAlgebra", "Random", "Statistics"] -git-tree-sha1 = "7877e55c1523a4b336b433da39c8e8c08d2f221f" -uuid = "6e75b9c4-186b-50bd-896f-2d2496a4843e" -version = "0.5.0" - -[[deps.Scratch]] -deps = ["Dates"] -git-tree-sha1 = "3bac05bc7e74a75fd9cba4295cde4045d9fe2386" -uuid = "6c6a2e73-6563-6170-7368-637461726353" -version = "1.2.1" - -[[deps.SentinelArrays]] -deps = ["Dates", "Random"] -git-tree-sha1 = "0e7508ff27ba32f26cd459474ca2ede1bc10991f" -uuid = "91c51154-3ec4-41a3-a24f-3f23e20d615c" -version = "1.4.1" - -[[deps.Serialization]] -uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b" - -[[deps.Setfield]] -deps = ["ConstructionBase", "Future", "MacroTools", "StaticArraysCore"] -git-tree-sha1 = "e2cc6d8c88613c05e1defb55170bf5ff211fbeac" -uuid = "efcf1570-3423-57d1-acb7-fd33fddbac46" -version = "1.1.1" - -[[deps.ShiftedArrays]] -git-tree-sha1 = "503688b59397b3307443af35cd953a13e8005c16" -uuid = "1277b4bf-5013-50f5-be3d-901d8477a67a" -version = "2.0.0" - -[[deps.ShowCases]] -git-tree-sha1 = "7f534ad62ab2bd48591bdeac81994ea8c445e4a5" -uuid = "605ecd9f-84a6-4c9e-81e2-4798472b76a3" -version = "0.1.0" - -[[deps.Showoff]] -deps = ["Dates", "Grisu"] -git-tree-sha1 = "91eddf657aca81df9ae6ceb20b959ae5653ad1de" -uuid = "992d4aef-0814-514b-bc4d-f2e9a6c4116f" -version = "1.0.3" - -[[deps.SimpleTraits]] -deps = ["InteractiveUtils", "MacroTools"] -git-tree-sha1 = "5d7e3f4e11935503d3ecaf7186eac40602e7d231" -uuid = "699a6c99-e7fa-54fc-8d76-47d257e15c1d" -version = "0.9.4" - -[[deps.Sockets]] -uuid = "6462fe0b-24de-5631-8697-dd941f90decc" - -[[deps.SortingAlgorithms]] -deps = ["DataStructures"] -git-tree-sha1 = "66e0a8e672a0bdfca2c3f5937efb8538b9ddc085" -uuid = "a2af1166-a08f-5f64-846c-94a0d3cef48c" -version = "1.2.1" - -[[deps.SparseArrays]] -deps = ["Libdl", "LinearAlgebra", "Random", "Serialization", "SuiteSparse_jll"] -uuid = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" -version = "1.10.0" - -[[deps.SparseInverseSubset]] -deps = ["LinearAlgebra", "SparseArrays", "SuiteSparse"] -git-tree-sha1 = "52962839426b75b3021296f7df242e40ecfc0852" -uuid = "dc90abb0-5640-4711-901d-7e5b23a2fada" -version = "0.1.2" - -[[deps.SpecialFunctions]] -deps = ["IrrationalConstants", "LogExpFunctions", "OpenLibm_jll", "OpenSpecFun_jll"] -git-tree-sha1 = "e2cfc4012a19088254b3950b85c3c1d8882d864d" -uuid = "276daf66-3868-5448-9aa4-cd146d93841b" -version = "2.3.1" -weakdeps = ["ChainRulesCore"] - - [deps.SpecialFunctions.extensions] - SpecialFunctionsChainRulesCoreExt = "ChainRulesCore" - -[[deps.SplittablesBase]] -deps 
= ["Setfield", "Test"] -git-tree-sha1 = "e08a62abc517eb79667d0a29dc08a3b589516bb5" -uuid = "171d559e-b47b-412a-8079-5efa626c420e" -version = "0.1.15" - -[[deps.StableRNGs]] -deps = ["Random", "Test"] -git-tree-sha1 = "ddc1a7b85e760b5285b50b882fa91e40c603be47" -uuid = "860ef19b-820b-49d6-a774-d7a799459cd3" -version = "1.0.1" - -[[deps.Static]] -deps = ["IfElse"] -git-tree-sha1 = "d2fdac9ff3906e27f7a618d47b676941baa6c80c" -uuid = "aedffcd0-7271-4cad-89d0-dc628f76c6d3" -version = "0.8.10" - -[[deps.StaticArrayInterface]] -deps = ["ArrayInterface", "Compat", "IfElse", "LinearAlgebra", "PrecompileTools", "Requires", "SparseArrays", "Static", "SuiteSparse"] -git-tree-sha1 = "5d66818a39bb04bf328e92bc933ec5b4ee88e436" -uuid = "0d7ed370-da01-4f52-bd93-41d350b8b718" -version = "1.5.0" -weakdeps = ["OffsetArrays", "StaticArrays"] - - [deps.StaticArrayInterface.extensions] - StaticArrayInterfaceOffsetArraysExt = "OffsetArrays" - StaticArrayInterfaceStaticArraysExt = "StaticArrays" - -[[deps.StaticArrays]] -deps = ["LinearAlgebra", "PrecompileTools", "Random", "StaticArraysCore"] -git-tree-sha1 = "bf074c045d3d5ffd956fa0a461da38a44685d6b2" -uuid = "90137ffa-7385-5640-81b9-e52037218182" -version = "1.9.3" -weakdeps = ["ChainRulesCore", "Statistics"] - - [deps.StaticArrays.extensions] - StaticArraysChainRulesCoreExt = "ChainRulesCore" - StaticArraysStatisticsExt = "Statistics" - -[[deps.StaticArraysCore]] -git-tree-sha1 = "36b3d696ce6366023a0ea192b4cd442268995a0d" -uuid = "1e83bf80-4336-4d27-bf5d-d5a4f845583c" -version = "1.4.2" - -[[deps.StatisticalTraits]] -deps = ["ScientificTypesBase"] -git-tree-sha1 = "730732cae4d3135e2f2182bd47f8d8b795ea4439" -uuid = "64bff920-2084-43da-a3e6-9bb72801c0c9" -version = "2.1.0" - -[[deps.Statistics]] -deps = ["LinearAlgebra", "SparseArrays"] -uuid = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" -version = "1.10.0" - -[[deps.StatsAPI]] -deps = ["LinearAlgebra"] -git-tree-sha1 = "1ff449ad350c9c4cbc756624d6f8a8c3ef56d3ed" -uuid = "82ae8749-77ed-4fe6-ae5f-f523153014b0" -version = "1.7.0" - -[[deps.StatsBase]] -deps = ["DataAPI", "DataStructures", "LinearAlgebra", "LogExpFunctions", "Missings", "Printf", "Random", "SortingAlgorithms", "SparseArrays", "Statistics", "StatsAPI"] -git-tree-sha1 = "d1bf48bfcc554a3761a133fe3a9bb01488e06916" -uuid = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" -version = "0.33.21" - -[[deps.StatsFuns]] -deps = ["HypergeometricFunctions", "IrrationalConstants", "LogExpFunctions", "Reexport", "Rmath", "SpecialFunctions"] -git-tree-sha1 = "cef0472124fab0695b58ca35a77c6fb942fdab8a" -uuid = "4c63d2b9-4356-54db-8cca-17b64c39e42c" -version = "1.3.1" - - [deps.StatsFuns.extensions] - StatsFunsChainRulesCoreExt = "ChainRulesCore" - StatsFunsInverseFunctionsExt = "InverseFunctions" - - [deps.StatsFuns.weakdeps] - ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4" - InverseFunctions = "3587e190-3f89-42d0-90ee-14403ec27112" - -[[deps.StatsModels]] -deps = ["DataAPI", "DataStructures", "LinearAlgebra", "Printf", "REPL", "ShiftedArrays", "SparseArrays", "StatsAPI", "StatsBase", "StatsFuns", "Tables"] -git-tree-sha1 = "5cf6c4583533ee38639f73b880f35fc85f2941e0" -uuid = "3eaba693-59b7-5ba5-a881-562e759f1c8d" -version = "0.7.3" - -[[deps.Strided]] -deps = ["LinearAlgebra", "TupleTools"] -git-tree-sha1 = "a7a664c91104329c88222aa20264e1a05b6ad138" -uuid = "5e0ebb24-38b0-5f93-81fe-25c709ecae67" -version = "1.2.3" - -[[deps.StringEncodings]] -deps = ["Libiconv_jll"] -git-tree-sha1 = "b765e46ba27ecf6b44faf70df40c57aa3a547dcb" -uuid = "69024149-9ee7-55f6-a4c4-859efe599b68" 
-version = "0.3.7" - -[[deps.StructArrays]] -deps = ["ConstructionBase", "DataAPI", "Tables"] -git-tree-sha1 = "f4dc295e983502292c4c3f951dbb4e985e35b3be" -uuid = "09ab397b-f2b6-538f-b94a-2f83cf4a842a" -version = "0.6.18" -weakdeps = ["Adapt", "GPUArraysCore", "SparseArrays", "StaticArrays"] - - [deps.StructArrays.extensions] - StructArraysAdaptExt = "Adapt" - StructArraysGPUArraysCoreExt = "GPUArraysCore" - StructArraysSparseArraysExt = "SparseArrays" - StructArraysStaticArraysExt = "StaticArrays" - -[[deps.StructTypes]] -deps = ["Dates", "UUIDs"] -git-tree-sha1 = "ca4bccb03acf9faaf4137a9abc1881ed1841aa70" -uuid = "856f2bd8-1eba-4b0a-8007-ebc267875bd4" -version = "1.10.0" - -[[deps.SuiteSparse]] -deps = ["Libdl", "LinearAlgebra", "Serialization", "SparseArrays"] -uuid = "4607b0f0-06f3-5cda-b6b1-a6196a1729e9" - -[[deps.SuiteSparse_jll]] -deps = ["Artifacts", "Libdl", "libblastrampoline_jll"] -uuid = "bea87d4a-7f5b-5778-9afe-8cc45184846c" -version = "7.2.1+1" - -[[deps.TOML]] -deps = ["Dates"] -uuid = "fa267f1f-6049-4f14-aa54-33bafae1ed76" -version = "1.0.3" - -[[deps.TableTraits]] -deps = ["IteratorInterfaceExtensions"] -git-tree-sha1 = "c06b2f539df1c6efa794486abfb6ed2022561a39" -uuid = "3783bdb8-4a98-5b6b-af9a-565f29a5fe9c" -version = "1.0.1" - -[[deps.Tables]] -deps = ["DataAPI", "DataValueInterfaces", "IteratorInterfaceExtensions", "LinearAlgebra", "OrderedCollections", "TableTraits"] -git-tree-sha1 = "cb76cf677714c095e535e3501ac7954732aeea2d" -uuid = "bd369af6-aec1-5ad0-b16a-f7cc5008161c" -version = "1.11.1" - -[[deps.Tar]] -deps = ["ArgTools", "SHA"] -uuid = "a4e569a6-e804-4fa4-b0f3-eef7a1d5b13e" -version = "1.10.0" - -[[deps.TensorCore]] -deps = ["LinearAlgebra"] -git-tree-sha1 = "1feb45f88d133a655e001435632f019a9a1bcdb6" -uuid = "62fd8b95-f654-4bbd-a8a5-9c27f68ccd50" -version = "0.1.1" - -[[deps.Test]] -deps = ["InteractiveUtils", "Logging", "Random", "Serialization"] -uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40" - -[[deps.ThreadingUtilities]] -deps = ["ManualMemory"] -git-tree-sha1 = "eda08f7e9818eb53661b3deb74e3159460dfbc27" -uuid = "8290d209-cae3-49c0-8002-c8c24d57dab5" -version = "0.5.2" - -[[deps.TimeZones]] -deps = ["Dates", "Future", "LazyArtifacts", "Mocking", "Pkg", "Printf", "RecipesBase", "Serialization", "Unicode"] -git-tree-sha1 = "a5688ffdbd849a98503c6650effe79fe89a41252" -uuid = "f269a46b-ccf7-5d73-abea-4c690281aa53" -version = "1.5.9" - -[[deps.TimerOutputs]] -deps = ["ExprTools", "Printf"] -git-tree-sha1 = "f548a9e9c490030e545f72074a41edfd0e5bcdd7" -uuid = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f" -version = "0.5.23" - -[[deps.TranscodingStreams]] -git-tree-sha1 = "3caa21522e7efac1ba21834a03734c57b4611c7e" -uuid = "3bb67fe8-82b1-5028-8e26-92a6c54297fa" -version = "0.10.4" -weakdeps = ["Random", "Test"] - - [deps.TranscodingStreams.extensions] - TestExt = ["Test", "Random"] - -[[deps.Transducers]] -deps = ["Adapt", "ArgCheck", "BangBang", "Baselet", "CompositionsBase", "ConstructionBase", "DefineSingletons", "Distributed", "InitialValues", "Logging", "Markdown", "MicroCollections", "Requires", "Setfield", "SplittablesBase", "Tables"] -git-tree-sha1 = "3064e780dbb8a9296ebb3af8f440f787bb5332af" -uuid = "28d57a85-8fef-5791-bfe6-a80928e7c999" -version = "0.4.80" - - [deps.Transducers.extensions] - TransducersBlockArraysExt = "BlockArrays" - TransducersDataFramesExt = "DataFrames" - TransducersLazyArraysExt = "LazyArrays" - TransducersOnlineStatsBaseExt = "OnlineStatsBase" - TransducersReferenceablesExt = "Referenceables" - - [deps.Transducers.weakdeps] - BlockArrays = 
"8e7c35d0-a365-5155-bbbb-fb81a777f24e" - DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" - LazyArrays = "5078a376-72f3-5289-bfd5-ec5146d43c02" - OnlineStatsBase = "925886fa-5bf2-5e8e-b522-a9147a512338" - Referenceables = "42d2dcc6-99eb-4e98-b66c-637b7d73030e" - -[[deps.TupleTools]] -git-tree-sha1 = "41d61b1c545b06279871ef1a4b5fcb2cac2191cd" -uuid = "9d95972d-f1c8-5527-a6e0-b4b365fa01f6" -version = "1.5.0" - -[[deps.URIParser]] -deps = ["Unicode"] -git-tree-sha1 = "53a9f49546b8d2dd2e688d216421d050c9a31d0d" -uuid = "30578b45-9adc-5946-b283-645ec420af67" -version = "0.4.1" - -[[deps.URIs]] -git-tree-sha1 = "67db6cc7b3821e19ebe75791a9dd19c9b1188f2b" -uuid = "5c2747f8-b7ea-4ff2-ba2e-563bfd36b1d4" -version = "1.5.1" - -[[deps.UUIDs]] -deps = ["Random", "SHA"] -uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4" - -[[deps.UnPack]] -git-tree-sha1 = "387c1f73762231e86e0c9c5443ce3b4a0a9a0c2b" -uuid = "3a884ed6-31ef-47d7-9d2a-63182c4928ed" -version = "1.0.2" - -[[deps.Unicode]] -uuid = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5" - -[[deps.UnicodeFun]] -deps = ["REPL"] -git-tree-sha1 = "53915e50200959667e78a92a418594b428dffddf" -uuid = "1cfade01-22cf-5700-b092-accc4b62d6e1" -version = "0.4.1" - -[[deps.Unitful]] -deps = ["Dates", "LinearAlgebra", "Random"] -git-tree-sha1 = "3c793be6df9dd77a0cf49d80984ef9ff996948fa" -uuid = "1986cc42-f94f-5a68-af5c-568840ba703d" -version = "1.19.0" - - [deps.Unitful.extensions] - ConstructionBaseUnitfulExt = "ConstructionBase" - InverseFunctionsUnitfulExt = "InverseFunctions" - - [deps.Unitful.weakdeps] - ConstructionBase = "187b0558-2788-49d3-abe0-74a17ed4e7c9" - InverseFunctions = "3587e190-3f89-42d0-90ee-14403ec27112" - -[[deps.UnitfulLatexify]] -deps = ["LaTeXStrings", "Latexify", "Unitful"] -git-tree-sha1 = "e2d817cc500e960fdbafcf988ac8436ba3208bfd" -uuid = "45397f5d-5981-4c77-b2b3-fc36d6e9b728" -version = "1.6.3" - -[[deps.UnsafeAtomics]] -git-tree-sha1 = "6331ac3440856ea1988316b46045303bef658278" -uuid = "013be700-e6cd-48c3-b4a1-df204f14c38f" -version = "0.2.1" - -[[deps.UnsafeAtomicsLLVM]] -deps = ["LLVM", "UnsafeAtomics"] -git-tree-sha1 = "ead6292c02aab389cb29fe64cc9375765ab1e219" -uuid = "d80eeb9a-aca5-4d75-85e5-170c8b632249" -version = "0.1.1" - -[[deps.Unzip]] -git-tree-sha1 = "ca0969166a028236229f63514992fc073799bb78" -uuid = "41fe7b60-77ed-43a1-b4f0-825fd5a5650d" -version = "0.2.0" - -[[deps.VectorizationBase]] -deps = ["ArrayInterface", "CPUSummary", "HostCPUFeatures", "IfElse", "LayoutPointers", "Libdl", "LinearAlgebra", "SIMDTypes", "Static", "StaticArrayInterface"] -git-tree-sha1 = "7209df901e6ed7489fe9b7aa3e46fb788e15db85" -uuid = "3d5dd08c-fd9d-11e8-17fa-ed2836048c2f" -version = "0.21.65" - -[[deps.VersionParsing]] -git-tree-sha1 = "58d6e80b4ee071f5efd07fda82cb9fbe17200868" -uuid = "81def892-9a0e-5fdd-b105-ffc91e053289" -version = "1.3.0" - -[[deps.Vulkan_Loader_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Wayland_jll", "Xorg_libX11_jll", "Xorg_libXrandr_jll", "xkbcommon_jll"] -git-tree-sha1 = "2f0486047a07670caad3a81a075d2e518acc5c59" -uuid = "a44049a8-05dd-5a78-86c9-5fde0876e88c" -version = "1.3.243+0" - -[[deps.Wayland_jll]] -deps = ["Artifacts", "EpollShim_jll", "Expat_jll", "JLLWrappers", "Libdl", "Libffi_jll", "Pkg", "XML2_jll"] -git-tree-sha1 = "7558e29847e99bc3f04d6569e82d0f5c54460703" -uuid = "a2964d1f-97da-50d4-b82a-358c7fce9d89" -version = "1.21.0+1" - -[[deps.Wayland_protocols_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] -git-tree-sha1 = "93f43ab61b16ddfb2fd3bb13b3ce241cafb0e6c9" -uuid = 
"2381bf8a-dfd0-557d-9999-79630e7b1b91" -version = "1.31.0+0" - -[[deps.WinReg]] -git-tree-sha1 = "cd910906b099402bcc50b3eafa9634244e5ec83b" -uuid = "1b915085-20d7-51cf-bf83-8f477d6f5128" -version = "1.0.0" - -[[deps.XML2_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Libiconv_jll", "Zlib_jll"] -git-tree-sha1 = "07e470dabc5a6a4254ffebc29a1b3fc01464e105" -uuid = "02c8fc9c-b97f-50b9-bbe4-9be30ff0a78a" -version = "2.12.5+0" - -[[deps.XSLT_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Libgcrypt_jll", "Libgpg_error_jll", "Libiconv_jll", "Pkg", "XML2_jll", "Zlib_jll"] -git-tree-sha1 = "91844873c4085240b95e795f692c4cec4d805f8a" -uuid = "aed1982a-8fda-507f-9586-7b0439959a61" -version = "1.1.34+0" - -[[deps.XZ_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl"] -git-tree-sha1 = "31c421e5516a6248dfb22c194519e37effbf1f30" -uuid = "ffd25f8a-64ca-5728-b0f7-c24cf3aae800" -version = "5.6.1+0" - -[[deps.Xorg_libICE_jll]] -deps = ["Libdl", "Pkg"] -git-tree-sha1 = "e5becd4411063bdcac16be8b66fc2f9f6f1e8fe5" -uuid = "f67eecfb-183a-506d-b269-f58e52b52d7c" -version = "1.0.10+1" - -[[deps.Xorg_libSM_jll]] -deps = ["Libdl", "Pkg", "Xorg_libICE_jll"] -git-tree-sha1 = "4a9d9e4c180e1e8119b5ffc224a7b59d3a7f7e18" -uuid = "c834827a-8449-5923-a945-d239c165b7dd" -version = "1.2.3+0" - -[[deps.Xorg_libX11_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Xorg_libxcb_jll", "Xorg_xtrans_jll"] -git-tree-sha1 = "afead5aba5aa507ad5a3bf01f58f82c8d1403495" -uuid = "4f6342f7-b3d2-589e-9d20-edeb45f2b2bc" -version = "1.8.6+0" - -[[deps.Xorg_libXau_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl"] -git-tree-sha1 = "6035850dcc70518ca32f012e46015b9beeda49d8" -uuid = "0c0b7dd1-d40b-584c-a123-a41640f87eec" -version = "1.0.11+0" - -[[deps.Xorg_libXcursor_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "Xorg_libXfixes_jll", "Xorg_libXrender_jll"] -git-tree-sha1 = "12e0eb3bc634fa2080c1c37fccf56f7c22989afd" -uuid = "935fb764-8cf2-53bf-bb30-45bb1f8bf724" -version = "1.2.0+4" - -[[deps.Xorg_libXdmcp_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl"] -git-tree-sha1 = "34d526d318358a859d7de23da945578e8e8727b7" -uuid = "a3789734-cfe1-5b06-b2d0-1dd0d9d62d05" -version = "1.1.4+0" - -[[deps.Xorg_libXext_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "Xorg_libX11_jll"] -git-tree-sha1 = "b7c0aa8c376b31e4852b360222848637f481f8c3" -uuid = "1082639a-0dae-5f34-9b06-72781eeb8cb3" -version = "1.3.4+4" - -[[deps.Xorg_libXfixes_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "Xorg_libX11_jll"] -git-tree-sha1 = "0e0dc7431e7a0587559f9294aeec269471c991a4" -uuid = "d091e8ba-531a-589c-9de9-94069b037ed8" -version = "5.0.3+4" - -[[deps.Xorg_libXi_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "Xorg_libXext_jll", "Xorg_libXfixes_jll"] -git-tree-sha1 = "89b52bc2160aadc84d707093930ef0bffa641246" -uuid = "a51aa0fd-4e3c-5386-b890-e753decda492" -version = "1.7.10+4" - -[[deps.Xorg_libXinerama_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "Xorg_libXext_jll"] -git-tree-sha1 = "26be8b1c342929259317d8b9f7b53bf2bb73b123" -uuid = "d1454406-59df-5ea1-beac-c340f2130bc3" -version = "1.1.4+4" - -[[deps.Xorg_libXrandr_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "Xorg_libXext_jll", "Xorg_libXrender_jll"] -git-tree-sha1 = "34cea83cb726fb58f325887bf0612c6b3fb17631" -uuid = "ec84b674-ba8e-5d96-8ba1-2a689ba10484" -version = "1.5.2+4" - -[[deps.Xorg_libXrender_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "Xorg_libX11_jll"] -git-tree-sha1 = "19560f30fd49f4d4efbe7002a1037f8c43d43b96" 
-uuid = "ea2f1a96-1ddc-540d-b46f-429655e07cfa" -version = "0.9.10+4" - -[[deps.Xorg_libpthread_stubs_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl"] -git-tree-sha1 = "8fdda4c692503d44d04a0603d9ac0982054635f9" -uuid = "14d82f49-176c-5ed1-bb49-ad3f5cbd8c74" -version = "0.1.1+0" - -[[deps.Xorg_libxcb_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "XSLT_jll", "Xorg_libXau_jll", "Xorg_libXdmcp_jll", "Xorg_libpthread_stubs_jll"] -git-tree-sha1 = "b4bfde5d5b652e22b9c790ad00af08b6d042b97d" -uuid = "c7cfdc94-dc32-55de-ac96-5a1b8d977c5b" -version = "1.15.0+0" - -[[deps.Xorg_libxkbfile_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Xorg_libX11_jll"] -git-tree-sha1 = "730eeca102434283c50ccf7d1ecdadf521a765a4" -uuid = "cc61e674-0454-545c-8b26-ed2c68acab7a" -version = "1.1.2+0" - -[[deps.Xorg_xcb_util_cursor_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Xorg_xcb_util_image_jll", "Xorg_xcb_util_jll", "Xorg_xcb_util_renderutil_jll"] -git-tree-sha1 = "04341cb870f29dcd5e39055f895c39d016e18ccd" -uuid = "e920d4aa-a673-5f3a-b3d7-f755a4d47c43" -version = "0.1.4+0" - -[[deps.Xorg_xcb_util_image_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "Xorg_xcb_util_jll"] -git-tree-sha1 = "0fab0a40349ba1cba2c1da699243396ff8e94b97" -uuid = "12413925-8142-5f55-bb0e-6d7ca50bb09b" -version = "0.4.0+1" - -[[deps.Xorg_xcb_util_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "Xorg_libxcb_jll"] -git-tree-sha1 = "e7fd7b2881fa2eaa72717420894d3938177862d1" -uuid = "2def613f-5ad1-5310-b15b-b15d46f528f5" -version = "0.4.0+1" - -[[deps.Xorg_xcb_util_keysyms_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "Xorg_xcb_util_jll"] -git-tree-sha1 = "d1151e2c45a544f32441a567d1690e701ec89b00" -uuid = "975044d2-76e6-5fbe-bf08-97ce7c6574c7" -version = "0.4.0+1" - -[[deps.Xorg_xcb_util_renderutil_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "Xorg_xcb_util_jll"] -git-tree-sha1 = "dfd7a8f38d4613b6a575253b3174dd991ca6183e" -uuid = "0d47668e-0667-5a69-a72c-f761630bfb7e" -version = "0.3.9+1" - -[[deps.Xorg_xcb_util_wm_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "Xorg_xcb_util_jll"] -git-tree-sha1 = "e78d10aab01a4a154142c5006ed44fd9e8e31b67" -uuid = "c22f9ab0-d5fe-5066-847c-f4bb1cd4e361" -version = "0.4.1+1" - -[[deps.Xorg_xkbcomp_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Xorg_libxkbfile_jll"] -git-tree-sha1 = "330f955bc41bb8f5270a369c473fc4a5a4e4d3cb" -uuid = "35661453-b289-5fab-8a00-3d9160c6a3a4" -version = "1.4.6+0" - -[[deps.Xorg_xkeyboard_config_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Xorg_xkbcomp_jll"] -git-tree-sha1 = "691634e5453ad362044e2ad653e79f3ee3bb98c3" -uuid = "33bec58e-1273-512f-9401-5d533626f822" -version = "2.39.0+0" - -[[deps.Xorg_xtrans_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl"] -git-tree-sha1 = "e92a1a012a10506618f10b7047e478403a046c77" -uuid = "c5fb5394-a638-5e4d-96e5-b29de1b5cf10" -version = "1.5.0+0" - -[[deps.ZipFile]] -deps = ["Libdl", "Printf", "Zlib_jll"] -git-tree-sha1 = "f492b7fe1698e623024e873244f10d89c95c340a" -uuid = "a5390f91-8eb1-5f08-bee0-b1d1ffed6cea" -version = "0.10.1" - -[[deps.Zlib_jll]] -deps = ["Libdl"] -uuid = "83775a58-1f1d-513f-b197-d71354ab007a" -version = "1.2.13+1" - -[[deps.Zstd_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl"] -git-tree-sha1 = "49ce682769cd5de6c72dcf1b94ed7790cd08974c" -uuid = "3161d3a3-bdf6-5164-811a-617609db77b4" -version = "1.5.5+0" - -[[deps.Zygote]] -deps = ["AbstractFFTs", "ChainRules", "ChainRulesCore", "DiffRules", "Distributed", "FillArrays", "ForwardDiff", 
"GPUArrays", "GPUArraysCore", "IRTools", "InteractiveUtils", "LinearAlgebra", "LogExpFunctions", "MacroTools", "NaNMath", "PrecompileTools", "Random", "Requires", "SparseArrays", "SpecialFunctions", "Statistics", "ZygoteRules"] -git-tree-sha1 = "4ddb4470e47b0094c93055a3bcae799165cc68f1" -uuid = "e88e6eb3-aa80-5325-afca-941959d7151f" -version = "0.6.69" - - [deps.Zygote.extensions] - ZygoteColorsExt = "Colors" - ZygoteDistancesExt = "Distances" - ZygoteTrackerExt = "Tracker" - - [deps.Zygote.weakdeps] - Colors = "5ae59095-9a9b-59fe-a467-6f913c188581" - Distances = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7" - Tracker = "9f7883ad-71c0-57eb-9f7f-b5c9e6d3789c" - -[[deps.ZygoteRules]] -deps = ["ChainRulesCore", "MacroTools"] -git-tree-sha1 = "27798139afc0a2afa7b1824c206d5e87ea587a00" -uuid = "700de1a5-db45-46bc-99cf-38207098b444" -version = "0.2.5" - -[[deps.eudev_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "gperf_jll"] -git-tree-sha1 = "431b678a28ebb559d224c0b6b6d01afce87c51ba" -uuid = "35ca27e7-8b34-5b7f-bca9-bdc33f59eb06" -version = "3.2.9+0" - -[[deps.fzf_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl"] -git-tree-sha1 = "a68c9655fbe6dfcab3d972808f1aafec151ce3f8" -uuid = "214eeab7-80f7-51ab-84ad-2988db7cef09" -version = "0.43.0+0" - -[[deps.gperf_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] -git-tree-sha1 = "3516a5630f741c9eecb3720b1ec9d8edc3ecc033" -uuid = "1a1c6b14-54f6-533d-8383-74cd7377aa70" -version = "3.1.1+0" - -[[deps.libaec_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl"] -git-tree-sha1 = "46bf7be2917b59b761247be3f317ddf75e50e997" -uuid = "477f73a3-ac25-53e9-8cc3-50b2fa2566f0" -version = "1.1.2+0" - -[[deps.libaom_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] -git-tree-sha1 = "3a2ea60308f0996d26f1e5354e10c24e9ef905d4" -uuid = "a4ae2306-e953-59d6-aa16-d00cac43593b" -version = "3.4.0+0" - -[[deps.libass_jll]] -deps = ["Artifacts", "Bzip2_jll", "FreeType2_jll", "FriBidi_jll", "HarfBuzz_jll", "JLLWrappers", "Libdl", "Pkg", "Zlib_jll"] -git-tree-sha1 = "5982a94fcba20f02f42ace44b9894ee2b140fe47" -uuid = "0ac62f75-1d6f-5e53-bd7c-93b484bb37c0" -version = "0.15.1+0" - -[[deps.libblastrampoline_jll]] -deps = ["Artifacts", "Libdl"] -uuid = "8e850b90-86db-534c-a0d3-1478176c7d93" -version = "5.8.0+1" - -[[deps.libevdev_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] -git-tree-sha1 = "141fe65dc3efabb0b1d5ba74e91f6ad26f84cc22" -uuid = "2db6ffa8-e38f-5e21-84af-90c45d0032cc" -version = "1.11.0+0" - -[[deps.libevent_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "OpenSSL_jll"] -git-tree-sha1 = "f04ec6d9a186115fb38f858f05c0c4e1b7fc9dcb" -uuid = "1080aeaf-3a6a-583e-a51c-c537b09f60ec" -version = "2.1.13+1" - -[[deps.libfdk_aac_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] -git-tree-sha1 = "daacc84a041563f965be61859a36e17c4e4fcd55" -uuid = "f638f0a6-7fb0-5443-88ba-1cc74229b280" -version = "2.0.2+0" - -[[deps.libinput_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "eudev_jll", "libevdev_jll", "mtdev_jll"] -git-tree-sha1 = "ad50e5b90f222cfe78aa3d5183a20a12de1322ce" -uuid = "36db933b-70db-51c0-b978-0f229ee0e533" -version = "1.18.0+0" - -[[deps.libpng_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Zlib_jll"] -git-tree-sha1 = "d7015d2e18a5fd9a4f47de711837e980519781a4" -uuid = "b53b4c65-9356-5827-b1ea-8c7a1a84506f" -version = "1.6.43+1" - -[[deps.libvorbis_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Ogg_jll", "Pkg"] -git-tree-sha1 = "b910cb81ef3fe6e78bf6acee440bda86fd6ae00c" -uuid = 
"f27f6e37-5d2b-51aa-960f-b287f2bc3b7a" -version = "1.3.7+1" - -[[deps.mtdev_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] -git-tree-sha1 = "814e154bdb7be91d78b6802843f76b6ece642f11" -uuid = "009596ad-96f7-51b1-9f1b-5ce2d5e8a71e" -version = "1.1.6+0" - -[[deps.nghttp2_jll]] -deps = ["Artifacts", "Libdl"] -uuid = "8e850ede-7688-5339-a07c-302acd2aaf8d" -version = "1.52.0+1" - -[[deps.p7zip_jll]] -deps = ["Artifacts", "Libdl"] -uuid = "3f19e933-33d8-53b3-aaab-bd5110c3b7a0" -version = "17.4.0+2" - -[[deps.prrte_jll]] -deps = ["Artifacts", "Hwloc_jll", "JLLWrappers", "Libdl", "PMIx_jll", "libevent_jll"] -git-tree-sha1 = "5adb2d7a18a30280feb66cad6f1a1dfdca2dc7b0" -uuid = "eb928a42-fffd-568d-ab9c-3f5d54fc65b9" -version = "3.0.2+0" - -[[deps.x264_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] -git-tree-sha1 = "4fea590b89e6ec504593146bf8b988b2c00922b2" -uuid = "1270edf5-f2f9-52d2-97e9-ab00b5d0237a" -version = "2021.5.5+0" - -[[deps.x265_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] -git-tree-sha1 = "ee567a171cce03570d77ad3a43e90218e38937a9" -uuid = "dfaa095f-4041-5dcd-9319-2fabd8486b76" -version = "3.5.0+0" - -[[deps.xkbcommon_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "Wayland_jll", "Wayland_protocols_jll", "Xorg_libxcb_jll", "Xorg_xkeyboard_config_jll"] -git-tree-sha1 = "9c304562909ab2bab0262639bd4f444d7bc2be37" -uuid = "d8fb68d0-12a3-5cfd-a85a-d49703b185fd" -version = "1.4.1+1" diff --git a/docs/src/Benchmarks_temp.jl b/docs/src/Benchmarks_temp.jl deleted file mode 100644 index ff9cd48a..00000000 --- a/docs/src/Benchmarks_temp.jl +++ /dev/null @@ -1,82 +0,0 @@ -# temp file -using Pkg -Pkg.activate(joinpath(@__DIR__,"..")) -ENV["PYTHON"] = "" -using Test, Statistics, Random, DelimitedFiles, Logging -using DataStructures, DataFrames, BenchmarkTools, StableRNGs, SystemBenchmark -import DecisionTree, Flux -import Clustering, GaussianMixtures -using BetaML -using Conda -using PyCall -pyimport_conda("sklearn", "sklearn", "conda-forge") - -TESTRNG = StableRNG(123) - - - - -println("*** Benchmarking regression task..") - -bm_regression = DataFrame(name= String[],time=Float64[],memory=Int64[],allocs=Int64[],mre_train=Float64[],std_train=Float64[],mre_test=Float64[],std_test=Float64[]) -n = 500 -seeds = rand(copy(TESTRNG),n) -x = vcat([[s*2 (s-3)^2 s/2 0.2-s] for s in seeds]...) 
-y = [r[1]*2-r[2]+r[3]^2 for r in eachrow(x)]
-
-
-Random.seed!(123)
-dt_models = OrderedDict("DT (DecisionTree.jl)"=>DecisionTree.DecisionTreeRegressor(rng=copy(TESTRNG)),
-                        "RF (DecisionTree.jl)"=>DecisionTree.RandomForestRegressor(n_trees=30, rng=copy(TESTRNG)),
-);
-
-# DT:
-# set of regression parameters and respective default values
-# pruning_purity: purity threshold used for post-pruning (default: 1.0, no pruning)
-# max_depth: maximum depth of the decision tree (default: -1, no maximum)
-# min_samples_leaf: the minimum number of samples each leaf needs to have (default: 5)
-# min_samples_split: the minimum number of samples needed for a split (default: 2)
-# min_purity_increase: minimum purity needed for a split (default: 0.0)
-# n_subfeatures: number of features to select at random (default: 0, keep all)
-# keyword rng: the random number generator or seed to use (default Random.GLOBAL_RNG)
-
-
-# RF:
-# set of regression build_forest() parameters and respective default values
-# n_subfeatures: number of features to consider at random per split (default: -1, sqrt(# features))
-# n_trees: number of trees to train (default: 10)
-# partial_sampling: fraction of samples to train each tree on (default: 0.7)
-# max_depth: maximum depth of the decision trees (default: no maximum)
-# min_samples_leaf: the minimum number of samples each leaf needs to have (default: 5)
-# min_samples_split: the minimum number of samples needed for a split (default: 2)
-# min_purity_increase: minimum purity needed for a split (default: 0.0)
-# keyword rng: the random number generator or seed to use (default Random.GLOBAL_RNG)
-# multi-threaded forests must be seeded with an `Int`
-
-for (mname,m) in dt_models
-    #mname = "DT"
-    #m = NeuralNetworkEstimator(rng=copy(TESTRNG),verbosity=NONE)
-    # speed measure
-    bres = @benchmark DecisionTree.fit!(m2,$x,$y) setup=(m2 = deepcopy($m))
-    m_time = median(bres.times)
-    m_memory = bres.memory
-    m_allocs = bres.allocs
-    sampler = KFold(nsplits=10,rng=copy(TESTRNG));
-    cv_out = cross_validation([x,y],sampler,return_statistics=false) do trainData,valData,rng
-        (xtrain,ytrain) = trainData; (xval,yval) = valData
-        m2 = deepcopy(m)
-        DecisionTree.fit!(m2,xtrain,ytrain)
-        ŷtrain = DecisionTree.predict(m2,xtrain)
-        ŷval = DecisionTree.predict(m2,xval)
-        rme_train = relative_mean_error(ytrain,ŷtrain)
-        rme_val = relative_mean_error(yval,ŷval)
-        return (rme_train, rme_val)
-    end
-
-    mre_train = mean([r[1] for r in cv_out])
-    std_train = std([r[1] for r in cv_out])
-    mre_test = mean([r[2] for r in cv_out])
-    std_test = std([r[2] for r in cv_out])
-    push!(bm_regression,[mname, m_time, m_memory, m_allocs, mre_train, std_train, mre_test, std_test])
-    @test mre_test <= 0.05
-end
\ No newline at end of file
diff --git a/docs/src/assets/data/breast+cancer+wisconsin+diagnostic.zip b/docs/src/assets/data/breast+cancer+wisconsin+diagnostic.zip
deleted file mode 100644
index 17d98def..00000000
Binary files a/docs/src/assets/data/breast+cancer+wisconsin+diagnostic.zip and /dev/null differ
diff --git a/docs/src/tutorials/Classification - cars/betaml_tutorial_classification_cars.md b/docs/src/tutorials/Classification - cars/betaml_tutorial_classification_cars.md
deleted file mode 100644
index e14627c5..00000000
--- a/docs/src/tutorials/Classification - cars/betaml_tutorial_classification_cars.md
+++ /dev/null
@@ -1,389 +0,0 @@
-```@meta
-EditURL = "betaml_tutorial_classification_cars.jl"
-```
-
-# [A classification task when labels are known - determining the country of origin of cars given the cars' characteristics](@id classification_tutorial)
-
-In this exercise we are provided with several technical characteristics (mpg, horsepower, weight, model year...) for several car models, together with the country of origin of such models, and we would like to create a machine learning model such that the country of origin can be accurately predicted given the technical characteristics.
-As the information to predict is a multi-class one, this is a _[classification](https://en.wikipedia.org/wiki/Statistical_classification)_ task.
-It is a challenging exercise due to the simultaneous presence of three factors: (1) presence of missing data; (2) unbalanced data - 254 out of 406 cars are US made; (3) small dataset.
-
-Data origin:
-- dataset description: [https://archive.ics.uci.edu/ml/datasets/auto+mpg](https://archive.ics.uci.edu/ml/datasets/auto+mpg)
-- data source we use here: [https://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data](https://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data-original)
-
-Field description:
-
-1. mpg: _continuous_
-2. cylinders: _multi-valued discrete_
-3. displacement: _continuous_
-4. horsepower: _continuous_
-5. weight: _continuous_
-6. acceleration: _continuous_
-7. model year: _multi-valued discrete_
-8. origin: _multi-valued discrete_
-9. car name: _string (unique for each instance)_
-
-The car name is not used in this tutorial, so that the country is inferred only from technical data. As this field also includes the car maker, and there are several car models from the same car maker, a more sophisticated machine learning model could exploit this information, e.g. using a bag-of-words encoding.
-
-## Library loading and initialisation
-
-Activating the local environment specific to the BetaML documentation:
-
-```text
-using Pkg
-Pkg.activate(joinpath(@__DIR__,"..","..",".."))
-```
-
-We load a bunch of packages that we'll use during this tutorial:
-
-```text
-using Random, HTTP, Plots, CSV, DataFrames, BenchmarkTools, StableRNGs, BetaML
-import DecisionTree, Flux
-import Pipe: @pipe
-```
-
-Machine Learning workflows include stochastic components in several steps: in the data sampling, in the model initialisation and often in the models' own algorithms (and sometimes also in the prediction step).
-BetaML provides a random number generator (RNG) in order to simplify reproducibility ([`FIXEDRNG`](@ref BetaML.Utils.FIXEDRNG); this is nothing other than an instance of `StableRNG(123)` defined in the [`BetaML.Utils`](@ref utils_module) sub-module, but you can of course choose your own "fixed" RNG). See the [Dealing with stochasticity](@ref stochasticity_reproducibility) section in the [Getting started](@ref getting_started) tutorial for details.
-
-Here we are explicit and we use our own fixed RNG:
-
-```text
-seed = 123 # The table at the end of this tutorial has been obtained with seeds 123, 1000 and 10000
-AFIXEDRNG = StableRNG(seed)
-```
-
-## Data loading and preparation
-
-To load the data from the internet our workflow is
-(1) Retrieve the data --> (2) Clean it --> (3) Load it --> (4) Output it as a DataFrame.
-
-For step (1) we use `HTTP.get()`, for step (2) we use `replace!`, for steps (3) and (4) we use the `CSV` package, and we use the "pipe" `|>` operator to chain these operations, so that no file is ever saved on disk:
-
-```text
-urlDataOriginal = "https://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data-original"
-data = @pipe HTTP.get(urlDataOriginal).body |>
-             replace!(_, UInt8('\t') => UInt8(' ')) |> # the original dataset has mixed field delimiters !
-             CSV.File(_, delim=' ', missingstring="NA", ignorerepeated=true, header=false) |>
-             DataFrame;
-nothing #hide
-```
-
-This results in a table where the rows are the observations (the various car models) and the columns the fields. All BetaML models expect this layout.
-
-As the dataset is ordered, we randomly shuffle the data.
-
-```text
-idx = randperm(copy(AFIXEDRNG),size(data,1))
-data = data[idx, :] # note the reassignment: without it the shuffled copy would be discarded
-describe(data)
-```
-
-Columns 1 to 7 contain characteristics of the car, while column 8 encodes the country of origin ("1" -> US, "2" -> EU, "3" -> Japan). That's the variable we want to be able to predict.
-
-Column 9 contains the car name, but we are not going to use this information in this tutorial.
-Note also that some fields have missing data.
-
-Our first step is hence to divide the dataset into the features (the x) and the labels (the y) we want to predict. The `x` is then a standard Julia `Matrix` of 406 rows by 7 columns and the `y` is a vector of the 406 observations:
-
-```text
-x = Matrix{Union{Missing,Float64}}(data[:,1:7]);
-y = Vector{Int64}(data[:,8]);
-x = fit!(Scaler(),x)
-```
-
-Some of the algorithms that we will use today don't accept missing data, so we need to _impute_ them. BetaML provides several imputation models in the [`Imputation`](@ref) module. Note that many of these imputation models can also be used for Collaborative Filtering / Recommendation Systems. Models such as [`GaussianMixtureImputer`](@ref) have an advantage over traditional algorithms such as k-nearest neighbors (KNN) in that GMM can "detect" the hidden structure of the observed data, where an observation can be similar to a certain pool of other observations for a certain characteristic, but similar to another pool of observations for other characteristics.
-Here we use [`RandomForestImputer`](@ref). While the model allows for reproducible multiple imputations (with the parameter `multiple_imputation=an_integer`) and multiple passages through the various columns (fields) containing missing data (with the option `recursive_passages=an_integer`), we use here just a single imputation and a single passage.
-As for all `BetaML` models, `RandomForestImputer` follows the pattern `m=ModelConstruction(pars); fit!(m,x,[y]); est = predict(m,x)` where `est` can be an estimation of some labels or some characteristics of x itself (the imputed version, as in this case, or a reprojected version as in [`PCAEncoder`](@ref)), depending on whether the model is supervised or not. See the [`API user documentation`](@ref api_usage) for more details.
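-
-For example, this is the whole pattern applied to the `Scaler` model we already used above, on a small toy matrix (a minimal illustrative sketch; `xtoy` and `mtoy` are just names we introduce here):
-
-```text
-xtoy = [1.0 100.0; 2.0 200.0; 3.0 300.0]
-mtoy = Scaler()                # 1. construct the model, eventually with hyper-parameters
-fit!(mtoy, xtoy)               # 2. fit the model to the data
-xtoy_s = predict(mtoy, xtoy)   # 3. retrieve the estimation (here: the scaled columns)
-```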
-For imputers, the output of `predict` is the matrix with the imputed values replacing the missing ones. We write here the model in a single line, using the convenience feature that, when the default `cache` parameter is used in the model constructor, the `fit!` function itself returns the prediction over the trained data:
-
-```text
-x = fit!(RandomForestImputer(rng=copy(AFIXEDRNG)),x) # Same as `m = RandomForestImputer(rng=copy(AFIXEDRNG)); fit!(m,x); x = predict(m,x)`
-```
-
-Further, some models don't work well with categorical data either, so we need to represent our `y` as a matrix with a separate column for each possible categorical value (the so-called "one-hot" representation).
-For example, within a three-class field, the individual value `2` (or `"Europe"` for that matter) would be represented as the vector `[0 1 0]`, while `3` (or `"Japan"`) would become the vector `[0 0 1]`.
-To encode as one-hot we use the [`OneHotEncoder`](@ref) in [`BetaML.Utils`](@ref utils_module), using the same shortcut as for the imputer we used earlier:
-
-```text
-y_oh = fit!(OneHotEncoder(),y)
-```
-
-In supervised machine learning it is good practice to partition the available data into a _training_, _validation_, and _test_ subset, where the first one is used to train the ML algorithm, the second one to tune any "hyper-parameters" of the algorithm and the _test_ subset is finally used to evaluate the quality of the algorithm.
-Here, for brevity, we use only the _train_ and the _test_ subsets, implicitly assuming we already know the best hyper-parameters. Please refer to the [regression tutorial](@ref regression_tutorial) for examples of the auto-tune feature of BetaML models to "automatically" tune the hyper-parameters (hint: in most cases just add the parameter `autotune=true` in the model constructor), or to the [clustering tutorial](@ref clustering_tutorial) for an example of using the [`cross_validation`](@ref) function to do it manually.
-
-We then use the [`partition`](@ref) function in [BetaML.Utils](@ref utils_module), where we can specify the different data to partition (each matrix or vector to partition must have the same number of observations) and the shares of observations that we want in each subset. Here we keep 80% of the observations for training (`xtrain`, and `ytrain`) and we use 20% of them for testing (`xtest`, and `ytest`):
-
-```text
-((xtrain,xtest),(ytrain,ytest),(ytrain_oh,ytest_oh)) = partition([x,y,y_oh],[0.8,1-0.8],rng=copy(AFIXEDRNG));
-nothing #hide
-```
-
-We finally set up a dataframe to store the accuracies of the various models we'll use.
-
-```text
-results = DataFrame(model=String[],train_acc=Float64[],test_acc=Float64[])
-```
-
-## Random Forests
-
-We are now ready to use our first model, the [`RandomForestEstimator`](@ref). Random Forests build a "forest" of decision tree models and then average their predictions in order to make an overall prediction, whether a regression or a classification.
-
-While here the missing data has been imputed and the dataset consists only of numerical values, one attractive feature of the BetaML `RandomForestEstimator` is that it can work directly with missing and categorical data without any prior processing required.
-
-However, as the labels are encoded using integers, we also need to specify the parameter `force_classification=true`, otherwise the model would perform a _regression_ task instead.
-
-```text
-rfm = RandomForestEstimator(force_classification=true, rng=copy(AFIXEDRNG))
-```
-
-In contrast to the `RandomForestImputer` and `OneHotEncoder` models used earlier, to train a `RandomForestEstimator` model we need to provide it with both the training feature matrix and the associated "true" training labels. We use the same shortcut to get the training predictions directly from the `fit!` function. In this case the predictions correspond to the labels:
-
-```text
-ŷtrain = fit!(rfm,xtrain,ytrain)
-```
-
-You can notice that for each record the result is reported in terms of a dictionary with the possible categories and their associated probabilities.
-
-!!! warning
-    Only categories with non-zero probabilities are reported for each record and, being a dictionary, the order of the categories is not defined
-
-For example `ŷtrain[1]` is a `Dict(2 => 0.0333333, 3 => 0.933333, 1 => 0.0333333)`, indicating an overwhelming probability that that car model originates from Japan.
-To retrieve the predictions with the highest probabilities use `mode(ŷ)`:
-
-```text
-ŷtrain_top = mode(ŷtrain,rng=copy(AFIXEDRNG))
-```
-
-Why does `mode` (optionally) take an RNG? We leave the answer to you :-) (hint: think about how ties between equally probable categories should be broken)
-
-To obtain the predicted labels for the test set we simply run the `predict` function over the features of the test set:
-
-```text
-ŷtest = predict(rfm,xtest)
-```
-
-Finally we can measure the _accuracy_ of our predictions with the [`accuracy`](@ref) function. We don't need to explicitly use `mode`, as `accuracy` does it itself when it is passed predictions expressed as a dictionary:
-
-```text
-trainAccuracy,testAccuracy = accuracy.([ytrain,ytest],[ŷtrain,ŷtest],rng=copy(AFIXEDRNG))
-```
-
-We are now ready to store our first model accuracies in the `results` dataframe:
-
-```text
-push!(results,["RF",trainAccuracy,testAccuracy]);
-nothing #hide
-```
-
-The predictions are quite good: for the training set the algorithm predicted almost all cars' origins correctly, while for the testing set (i.e. those records that have **not** been used to train the algorithm), the correct prediction level is still quite high, at around 80% (depending on the random seed).
-
-While accuracy can sometimes suffice, we may often want to better understand which categories our model has trouble predicting correctly.
-We can investigate the output of a multi-class classifier in more depth with a [`ConfusionMatrix`](@ref), where the true values (`y`) are given in rows and the predicted ones (`ŷ`) in columns, together with some per-class metrics like the _precision_ (the share of the records predicted as class _i_ that truly belong to class _i_), the _recall_ (the share of the true class _i_ records that are predicted as class _i_) and others.
-
-We first build the [`ConfusionMatrix`](@ref) model, we train it with `ŷ` and `y` and then we print it (we do it here for the test subset):
-
-```text
-cfm = ConfusionMatrix(categories_names=Dict(1=>"US",2=>"EU",3=>"Japan"),rng=copy(AFIXEDRNG))
-fit!(cfm,ytest,ŷtest) # the output is by default the confusion matrix in relative terms
-print(cfm)
-```
-
-From the report we can see that Japanese cars have more trouble being correctly classified, and in particular many Japanese cars are classified as US ones. This is likely a result of the class imbalance of the dataset, and could be solved by balancing the dataset with various sampling techniques before training the model.
-
-If you prefer a more graphical approach, we can also plot the confusion matrix. In order to do so, we pick up information from the `info(cfm)` function.
Indeed most BetaML models can be queried with `info(model)` to retrieve additional information, in terms of a dictionary, that is not necessary for the prediction but could still be relevant. Other functions that you can use with BetaML models are `parameters(m)` and `hyperparameters(m)`.
-
-```text
-res = info(cfm)
-heatmap(string.(res["categories"]),string.(res["categories"]),res["normalised_scores"],seriescolor=cgrad([:white,:blue]),xlabel="Predicted",ylabel="Actual", title="Confusion Matrix (normalised scores)")
-```
-
-### Comparison with DecisionTree.jl
-
-We now compare the BetaML [`RandomForestEstimator`](@ref) with the random forest estimator of the package [`DecisionTree.jl`](https://github.com/JuliaAI/DecisionTree.jl). The two random forests are similar in usage: we first "build" (train) the forest and we then make predictions out of the trained model.
-
-```text
-# We train the model...
-model = DecisionTree.build_forest(ytrain, xtrain,rng=seed)
-# ..and we generate predictions and measure their error
-(ŷtrain,ŷtest) = DecisionTree.apply_forest.([model],[xtrain,xtest]);
-(trainAccuracy,testAccuracy) = accuracy.([ytrain,ytest],[ŷtrain,ŷtest])
-push!(results,["RF (DecisionTrees.jl)",trainAccuracy,testAccuracy]);
-nothing #hide
-```
-
-While the accuracy on the training set is exactly the same as for the `BetaML` random forests, the `DecisionTree.jl` random forests are slightly less accurate in the testing sample.
-Where `DecisionTree.jl` excels, however, is efficiency: it is extremely fast and memory-thrifty, even if we should also consider the resources needed to impute the missing values, as it doesn't work with missing data.
-
-Also, one of the reasons `DecisionTree.jl` is so efficient is that internally the data is sorted to avoid repeated comparisons, but in this way it works only with features that are sortable, while BetaML random forests accept virtually any kind of input without the need to process it.
-
-### Neural network
-
-Neural networks (NN) can be very powerful, but have two "inconveniences" compared with random forests: first, they are a bit "picky", and we need to do a bit of work to provide the data in a specific format. Note that this is _not_ feature engineering: one of the advantages of neural networks is that for the most part feature engineering is not needed. However, we still need to "clean" the data. One issue is that NN don't like missing data, so we need to provide them with a feature matrix "clean" of missing values. Secondly, they work only with numerical data, so we need to use the one-hot encoding we saw earlier.
-Further, they work best if the features are scaled such that each feature has mean zero and standard deviation 1. This is why we scaled the data back at the beginning of this tutorial.
-
-We first measure the dimensions of our data in input (i.e. the columns of the feature matrix) and the dimensions of our output, i.e. the number of categories or columns in our one-hot encoded y.
-
-```text
-D = size(xtrain,2)
-classes = unique(y)
-nCl = length(classes)
-```
-
-The second "inconvenience" of NN is that, while not requiring feature engineering, they still need a bit of practice on the way the structure of the network is built. It's not as simple as `fit!(Model(),x,y)` (although BetaML provides a "default" neural network structure that can be used, it often isn't adapted to the specific task). We need instead to specify how we want our layers, _chain_ the layers together and then choose an overall _loss_ function.
Only once we have done these steps do we have the model ready for training.
-Here we define two [`DenseLayer`](@ref)s where, for each of them, we specify the number of neurons in input (for the first layer this equals the dimensions of the data), the output size (for a classification task, the last layer's output size equals the number of classes) and an _activation function_ for each layer (the default is the `identity` function).
-
-```text
-ls = 50 # number of neurons in the inner layer
-l1 = DenseLayer(D,ls,f=relu,rng=copy(AFIXEDRNG))
-l2 = DenseLayer(ls,nCl,f=relu,rng=copy(AFIXEDRNG))
-```
-
-For a classification task, the last layer is a [`VectorFunctionLayer`](@ref) that has no learnable parameters but whose activation function is applied to the ensemble of the neurons, rather than individually on each neuron. In particular, for classification we pass the [`softmax`](@ref) function, whose output has the same size as the input (i.e. the number of classes to predict), but we can use the `VectorFunctionLayer` with any function, including the [`pool1d`](@ref) function to create a "pooling" layer (using maximum, mean or whatever other sub-function we pass to `pool1d`)
-
-```text
-l3 = VectorFunctionLayer(nCl,f=softmax) ## Add a (parameterless) layer whose activation function (softmax in this case) is defined over all its nodes at once
-```
-
-Finally we _chain_ the layers and assign a loss function and the number of epochs we want to train the model for to the constructor of [`NeuralNetworkEstimator`](@ref):
-
-```text
-nn = NeuralNetworkEstimator(layers=[l1,l2,l3],loss=crossentropy,rng=copy(AFIXEDRNG),epochs=500)
-```
-
-Aside from the layer structure and size and the number of epochs, other hyper-parameters you may want to try are the `batch_size` and the optimisation algorithm to employ (`opt_alg`).
-
-Now we can train our network:
-
-```text
-ŷtrain = fit!(nn, xtrain, ytrain_oh)
-```
-
-Predictions are in the form of a _n_records_ by _n_classes_ matrix of the probabilities of each record being in that class. To retrieve the classes with the highest probabilities we can use again the `mode` function:
-
-```text
-ŷtrain_top = mode(ŷtrain)
-```
-
-Once trained, we can predict the test labels. As the training was based on the scaled feature matrix, so must be the predictions:
-
-```text
-ŷtest = predict(nn,xtest)
-```
-
-And finally we can measure the accuracies and store them in the `results` dataframe:
-
-```text
-trainAccuracy, testAccuracy = accuracy.([ytrain,ytest],[ŷtrain,ŷtest],rng=copy(AFIXEDRNG))
-push!(results,["NN",trainAccuracy,testAccuracy]);
-nothing #hide
-```
-
-```text
-cfm = ConfusionMatrix(categories_names=Dict(1=>"US",2=>"EU",3=>"Japan"),rng=copy(AFIXEDRNG))
-fit!(cfm,ytest,ŷtest)
-print(cfm)
-res = info(cfm)
-heatmap(string.(res["categories"]),string.(res["categories"]),res["normalised_scores"],seriescolor=cgrad([:white,:blue]),xlabel="Predicted",ylabel="Actual", title="Confusion Matrix (normalised scores)")
-```
-
-While the accuracies are a bit lower, the distribution of the misclassifications is similar, with many Japanese cars misclassified as US ones (here we also have some EU cars misclassified as Japanese ones).
-
-### Comparisons with Flux
-
-As we did for Random Forests, we compare BetaML neural networks with the leading package for deep learning in Julia, [`Flux.jl`](https://github.com/FluxML/Flux.jl).
-
-In Flux the input must be in the form (fields, observations), so we transpose our original matrices
-
-```text
-xtrainT, ytrain_ohT = transpose.([xtrain, ytrain_oh])
-xtestT, ytest_ohT = transpose.([xtest, ytest_oh])
-```
-
-We define the Flux neural network model similarly to BetaML, load it with the data, train it, predict and measure the accuracies on the training and the test sets:
-
-We fix the random seed for Flux, although you may still get different results depending on the number of threads used... this is a problem we solve in BetaML with [`generate_parallel_rngs`](@ref).
-
-```text
-Random.seed!(seed)
-
-l1 = Flux.Dense(D,ls,Flux.relu)
-l2 = Flux.Dense(ls,nCl,Flux.relu)
-Flux_nn = Flux.Chain(l1,l2)
-fluxloss(x, y) = Flux.logitcrossentropy(Flux_nn(x), y)
-ps = Flux.params(Flux_nn)
-nndata = Flux.Data.DataLoader((xtrainT, ytrain_ohT),shuffle=true)
-for i in 1:500 Flux.train!(fluxloss, ps, nndata, Flux.ADAM()) end
-ŷtrain = Flux.onecold(Flux_nn(xtrainT),1:3)
-ŷtest = Flux.onecold(Flux_nn(xtestT),1:3)
-trainAccuracy, testAccuracy = accuracy.([ytrain,ytest],[ŷtrain,ŷtest])
-```
-
-```text
-push!(results,["NN (Flux.jl)",trainAccuracy,testAccuracy]);
-nothing #hide
-```
-
-While the train accuracy is a little higher than BetaML's, the test accuracy remains comparable.
-
-## Perceptron-like classifiers
-
-We finally test three "perceptron-like" classifiers: the "classical" perceptron ([`PerceptronClassifier`](@ref)), one of the first ML algorithms (a linear classifier), a "kernelised" version of it ([`KernelPerceptronClassifier`](@ref), defaulting to the radial kernel) and the [`PegasosClassifier`](@ref), another linear algorithm that adopts a gradient-based optimisation, although without the regularisation term of Support Vector Machines (SVM).
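-
-As a reminder of how simple the underlying idea of the classical perceptron is, here is a minimal toy sketch of its update rule (our own illustration, *not* the BetaML implementation; `ypm`, a ±1 recoding of one class against the others, is introduced just for this example):
-
-```text
-using LinearAlgebra
-ypm = ifelse.(ytrain .== 1, 1, -1)                # toy binary recoding: "US" vs "not US"
-w   = zeros(size(xtrain,2))
-for _ in 1:10, i in axes(xtrain,1)                # a few passes over the data
-    ŷi = dot(w, xtrain[i,:]) >= 0 ? 1 : -1
-    ŷi != ypm[i] && (w .+= ypm[i] .* xtrain[i,:]) # update the weights only on mistakes
-end
-```
-
-The actual `PerceptronClassifier` handles the multi-class case directly, but the core mechanism is the same.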
-
-As for the previous classifiers we construct the model object, we train and predict, and we compute the train and test accuracies:
-
-```text
-pm = PerceptronClassifier(rng=copy(AFIXEDRNG))
-ŷtrain = fit!(pm, xtrain, ytrain)
-ŷtest = predict(pm, xtest)
-(trainAccuracy,testAccuracy) = accuracy.([ytrain,ytest],[ŷtrain,ŷtest])
-push!(results,["Perceptron",trainAccuracy,testAccuracy]);
-
-kpm = KernelPerceptronClassifier(rng=copy(AFIXEDRNG))
-ŷtrain = fit!(kpm, xtrain, ytrain)
-ŷtest = predict(kpm, xtest)
-(trainAccuracy,testAccuracy) = accuracy.([ytrain,ytest],[ŷtrain,ŷtest])
-push!(results,["KernelPerceptronClassifier",trainAccuracy,testAccuracy]);
-
-
-pegm = PegasosClassifier(rng=copy(AFIXEDRNG))
-ŷtrain = fit!(pegm, xtrain, ytrain)
-ŷtest = predict(pegm, xtest) # note: predictions from the Pegasos model, not from the perceptron one
-(trainAccuracy,testAccuracy) = accuracy.([ytrain,ytest],[ŷtrain,ŷtest])
-push!(results,["Pegasos",trainAccuracy,testAccuracy]);
-nothing #hide
-```
-
-## Summary
-
-This is the summary of the results we obtained trying to predict the country of origin of the cars, based on their technical characteristics:
-
-```text
-println(results)
-```
-
-These are the model accuracies on my machine with seeds 123, 1000 and 10000 respectively (if you clone the BetaML repository you can re-run this tutorial yourself with different seeds):
-
-| model                      | train 1  | test 1   | train 2  | test 2   | train 3  | test 3   |
-| -------------------------- | -------- | -------- | -------- | -------- | -------- | -------- |
-| RF                         | 0.996923 | 0.765432 | 1.000000 | 0.802469 | 1.000000 | 0.888889 |
-| RF (DecisionTrees.jl)      | 0.975385 | 0.765432 | 0.984615 | 0.777778 | 0.975385 | 0.864198 |
-| NN                         | 0.886154 | 0.728395 | 0.916923 | 0.827160 | 0.895385 | 0.876543 |
-| NN (Flux.jl)               | 0.793846 | 0.654321 | 0.938462 | 0.790123 | 0.935385 | 0.851852 |
-| Perceptron                 | 0.778462 | 0.703704 | 0.720000 | 0.753086 | 0.670769 | 0.654321 |
-| KernelPerceptronClassifier | 0.987692 | 0.703704 | 0.978462 | 0.777778 | 0.944615 | 0.827160 |
-| Pegasos                    | 0.732308 | 0.703704 | 0.633846 | 0.753086 | 0.575385 | 0.654321 |
-
-We warn that this table just provides a rough idea of the various algorithms' performances. Indeed there is a large amount of stochasticity both in the sampling of the data used for training/testing and in the initial settings of the parameters of the algorithms. For a statistically significant comparison we would have to repeat the analysis with multiple samplings (e.g. by cross-validation, see the [clustering tutorial](@ref clustering_tutorial) for an example) and multiple initial random parameters.
-
-Nevertheless the table above shows that, when we compare BetaML with the algorithm-specific leading packages, we find similar results in terms of accuracy, but often the leading packages are better optimised and run more efficiently (though sometimes at the cost of being less versatile).
-Also, for this dataset, Random Forests seem to remain marginally more accurate than Neural Networks, although of course this depends on the hyper-parameters and, with a single run of the models, we don't know if this difference is significant.
-
-[View this file on Github](betaml_tutorial_classification_cars.jl).
-
----
-
-*This page was generated using [Literate.jl](https://github.com/fredrikekre/Literate.jl).*
-
diff --git a/docs/src/tutorials/Clustering - Iris/betaml_tutorial_cluster_iris.md b/docs/src/tutorials/Clustering - Iris/betaml_tutorial_cluster_iris.md
deleted file mode 100644
index 99f0a930..00000000
--- a/docs/src/tutorials/Clustering - Iris/betaml_tutorial_cluster_iris.md
+++ /dev/null
@@ -1,301 +0,0 @@
-```@meta
-EditURL = "betaml_tutorial_cluster_iris.jl"
-```
-
-# [A clustering task: the prediction of plant species from floral measures (the iris dataset)](@id clustering_tutorial)
-The task is to estimate the species of a plant given some floral measurements. It uses the classical "Iris" dataset.
-Note that in this example we are using clustering approaches, so we try to understand the "structure" of our data without relying on actually knowing the true labels ("classes" or "factors"). However, we have chosen a dataset for which the true labels are actually known, so that we can compare the accuracy of the algorithms we use; these labels will not be used during the algorithms' training.
-
-Data origin:
-- dataset description: [https://en.wikipedia.org/wiki/Iris_flower_data_set](https://en.wikipedia.org/wiki/Iris_flower_data_set)
-- data source we use here: [https://github.com/JuliaStats/RDatasets.jl](https://github.com/JuliaStats/RDatasets.jl)
-
-## Library and data loading
-
-Activating the local environment specific to BetaML documentation
-
-```text
-using Pkg
-Pkg.activate(joinpath(@__DIR__,"..","..",".."))
-```
-
-We load the Beta Machine Learning Toolkit as well as some other packages that we use in this tutorial
-
-```text
-using BetaML
-using Random, Statistics, Logging, BenchmarkTools, StableRNGs, RDatasets, Plots, DataFrames
-```
-
-We are also going to compare our results with two other leading packages in Julia for clustering analysis: [`Clustering.jl`](https://github.com/JuliaStats/Clustering.jl), that provides (inter alia) kmeans and kmedoids algorithms, and [`GaussianMixtures.jl`](https://github.com/davidavdav/GaussianMixtures.jl), that provides, as the name says, Gaussian Mixture Models. So we import them (we "import" them, rather than "use" them, so as not to bring their exported names into the namespace, as some would collide with BetaML).
-
-```text
-import Clustering, GaussianMixtures
-```
-
-Here we are explicit and we use our own fixed RNG:
-
-```text
-seed = 123 # The table at the end of this tutorial has been obtained with seeds 123, 1000 and 10000
-AFIXEDRNG = StableRNG(seed)
-```
-
-We do a few tweaks for the Clustering and GaussianMixtures packages. Note that in BetaML we can also control both the random seed and the verbosity in the algorithm call, not only globally
-
-```text
-Random.seed!(seed)
-#logger = Logging.SimpleLogger(stdout, Logging.Error); global_logger(logger); ## For suppressing GaussianMixtures output
-```
-
-Differently from the [regression tutorial](@ref regression_tutorial), we load the data here from [`RDatasets`](https://github.com/JuliaStats/RDatasets.jl), a package providing standard datasets.
-
-```text
-iris = dataset("datasets", "iris")
-describe(iris)
-```
-
-The iris dataset provides floral measures in columns 1 to 4 and the assigned species name in column 5. There are no missing values.
-
-## Data preparation
-The first step is to prepare the data for the analysis. We collect the first 4 columns as our _feature_ `x` matrix and the last one as our `y` label vector.
-As we are using clustering algorithms we are not actually using the labels to train the algorithms: we'll behave as if we do not know them and just let the algorithms "learn" from the structure of the data itself. We'll however use the labels to judge the accuracy that the various algorithms reach.
-
-```text
-x = Matrix{Float64}(iris[:,1:4]);
-yLabels = unique(iris[:,5])
-```
-
-As the labels are expressed as strings, the first thing we do is encode them as integers for our analysis using the [`OrdinalEncoder`](@ref) model (the data doesn't actually need to be ordered):
-
-```text
-y = fit!(OrdinalEncoder(categories=yLabels),iris[:,5])
-```
-
-The dataset from RDatasets is ordered by species, so we need to shuffle it to avoid biases.
-Shuffling happens by default in cross_validation, but we are keeping here a copy of the shuffled version for later.
-Note that the version of [`consistent_shuffle`](@ref) that is included in BetaML accepts several n-dimensional arrays and shuffles them (by default on rows, but we can specify the dimension) keeping the association between the various arrays in the shuffled output.
-
-```text
-(xs,ys) = consistent_shuffle([x,y], rng=copy(AFIXEDRNG));
-nothing #hide
-```
-
-## Main analysis
-
-We will try three BetaML models ([`KMeansClusterer`](@ref), [`KMedoidsClusterer`](@ref) and [`GaussianMixtureClusterer`](@ref)) and we compare them with `kmeans` from Clustering.jl and `GMM` from GaussianMixtures.jl
-
-`KMeansClusterer` and `KMedoidsClusterer` work by first initialising the centers of the k clusters (step a). These centers, also known as the "representatives", must be selected within the data for kmedoids, while for kmeans they are the geometrical centers.
-
-Then (step b) the algorithms iterate over each point to assign it to the cluster of the closest representative (according to a user-defined distance metric, defaulting to the Euclidean one), and (step c) move each representative to the center of its newly acquired cluster (where "center" depends again on the metric).
-
-Steps _b_ and _c_ are reiterated until the algorithms converge, i.e. the tentative k representative points (and their relative clusters) don't move any more. The result (output of the algorithm) is that each point is assigned to one of the clusters (classes).
-
-The algorithm in `GaussianMixtureClusterer` is similar in that it employs an iterative approach (the Expectation-Maximisation algorithm, "EM") but here we make the hypothesis that the data points are the observed outcomes of some _mixture_ of probabilistic models: we have first a k-categorical variable whose outcomes select the (unobservable) parameters of a probabilistic distribution from which the data is finally drawn. Because the parameters of each of the k possible distributions are unobservable, this is also called a model with latent variables.
-
-Most `gmm` models use the Gaussian distribution as the family of the mixture components, so we can take the `gmm` acronym to indicate _Gaussian Mixture Model_. In BetaML we have currently implemented only Gaussian components, but any distribution could be used by just subclassing `AbstractMixture` and implementing a couple of methods (you are invited to contribute or just ask for a distribution family you are interested in), so I prefer to think of "gmm" as an acronym for _Generative Mixture Model_.
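-
-In our own compact notation (not taken from the BetaML documentation): with ``K`` Gaussian components of weights ``\pi_k`` and parameters ``(\mu_k, \Sigma_k)``, the quantity that the EM algorithm described below tries to maximise is the log-likelihood of the ``N`` observed points under the mixture:
-
-```math
-\log L(\pi,\mu,\Sigma) = \sum_{n=1}^{N} \log \sum_{k=1}^{K} \pi_k \, \mathcal{N}(x_n; \mu_k, \Sigma_k)
-```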
-
-The algorithm tries to find the mixture that maximises the likelihood that the data has indeed been generated from such a mixture, where the "E" step refers to computing the probability that each point belongs to each of the k components (somewhat similar to step _b_ in the kmeans/kmedoids algorithms), and the "M" step estimates, given the association probabilities of step "E", the parameters of the mixture and of the individual components (similar to step _c_).
-
-The result here is that each point has a categorical distribution (PMF) representing the probabilities that it belongs to any of the k components (our classes or clusters). This is interesting, as `gmm` can be used for many other things than clustering. It forms the backbone of the [`GaussianMixtureImputer`](@ref) model to impute missing values (on some or all dimensions) based on how close the record seems to its peers. For the same reasons, `GaussianMixtureImputer` can also be used to predict users' behaviour (or users' appreciation) according to the behaviour/ranking made by their peers ("collaborative filtering").
-
-While the result of `GaussianMixtureClusterer` is a vector of PMFs (one for each record), error measures and reports with the true values (if known) can be directly applied, as in BetaML they internally call `mode()` to retrieve the class with the highest probability for each record.
-
-
-While we are at it, we also try different versions of the BetaML models, even if the default "versions" should be fine. For `KMeansClusterer` and `KMedoidsClusterer` we will try different initialisation strategies ("grid", the default one, "random" and "shuffle"), while for the `GaussianMixtureClusterer` model we'll choose different distributions of the Gaussian family (`SphericalGaussian` - where the variance is a scalar, `DiagonalGaussian` - with a vector variance, and `FullGaussian`, where the covariance is a matrix).
-
-As the results would depend on the stochasticity both in the data selected and in the random initialisation, we use a cross-validation approach to run our models several times (with different data) and then we average their results.
-Cross-validation in BetaML is very flexible and is done using the [`cross_validation`](@ref) function. It is used by default for the hyperparameter autotuning of the BetaML supervised models.
-`cross_validation` works by calling the function `f`, defined by the user, passing to it the tuples `trainData`, `valData` and `rng`, and collecting the results of the function `f`. The specific way in which `trainData` and `valData` are selected at each iteration depends on the specific `sampler`.
-
-We start by selecting a k-fold sampler that splits our data into 5 different parts, using 4 of them for training and 1 part (not used here) for validation. We repeat the whole procedure three times (`nrepeats=3`) and, to be sure to have replicable results, we fix the random seed (at the whole cross-validation level, not on each iteration).
-
-```text
-sampler = KFold(nsplits=5,nrepeats=3,shuffle=true, rng=copy(AFIXEDRNG))
-```
-
-We can now run the cross-validation with our models. Note that instead of defining the function `f` and then calling `cross_validation(f,[x,y],...)` we use the Julia `do` block syntax and we write directly the content of the `f` function in the `do` block.
-Also, by default cross_validation already returns the mean and the standard deviation of the output of the user-provided `f` function (or the `do` block). However this requires that the `f` function returns a single scalar.
Here we are returning a vector of the accuracies of the different models (so that we can run the cross-validation only once), and hence we indicate with `return_statistics=false` to cross_validation not to attempt to generate statistics but rather to report the whole output.
-We'll compute the statistics ex-post.
-
-Inside the `do` block we do 4 things:
-- we recover from `trainData` (a tuple, as we passed a tuple to `cross_validation` too) the `xtrain` features and `ytrain` labels;
-- we run the various clustering algorithms;
-- we use the real labels to compute the model accuracy. Note that the clustering algorithms know nothing about the specific label names or even their order. This is why [`accuracy`](@ref) has the parameter `ignorelabels`, to compute the accuracy over any possible permutation of the classes found;
-- we return the various models' accuracies.
-
-```text
-cOut = cross_validation([x,y],sampler,return_statistics=false) do trainData,testData,rng
-    # For unsupervised learning we use only the train data.
-    # Also, we use the associated labels only to measure the performances
-    (xtrain,ytrain) = trainData;
-    # We run the clustering algorithms and then we compute the accuracy using the real labels:
-    estcl = fit!(KMeansClusterer(n_classes=3,initialisation_strategy="grid",rng=rng),xtrain)
-    kMeansGAccuracy = accuracy(ytrain,estcl,ignorelabels=true)
-    estcl = fit!(KMeansClusterer(n_classes=3,initialisation_strategy="random",rng=rng),xtrain)
-    kMeansRAccuracy = accuracy(ytrain,estcl,ignorelabels=true)
-    estcl = fit!(KMeansClusterer(n_classes=3,initialisation_strategy="shuffle",rng=rng),xtrain)
-    kMeansSAccuracy = accuracy(ytrain,estcl,ignorelabels=true)
-    estcl = fit!(KMedoidsClusterer(n_classes=3,initialisation_strategy="grid",rng=rng),xtrain)
-    kMedoidsGAccuracy = accuracy(ytrain,estcl,ignorelabels=true)
-    estcl = fit!(KMedoidsClusterer(n_classes=3,initialisation_strategy="random",rng=rng),xtrain)
-    kMedoidsRAccuracy = accuracy(ytrain,estcl,ignorelabels=true)
-    estcl = fit!(KMedoidsClusterer(n_classes=3,initialisation_strategy="shuffle",rng=rng),xtrain)
-    kMedoidsSAccuracy = accuracy(ytrain,estcl,ignorelabels=true)
-    estcl = fit!(GaussianMixtureClusterer(n_classes=3,mixtures=SphericalGaussian,rng=rng,verbosity=NONE),xtrain)
-    gmmSpherAccuracy = accuracy(ytrain,estcl,ignorelabels=true, rng=rng)
-    estcl = fit!(GaussianMixtureClusterer(n_classes=3,mixtures=DiagonalGaussian,rng=rng,verbosity=NONE),xtrain)
-    gmmDiagAccuracy = accuracy(ytrain,estcl,ignorelabels=true, rng=rng)
-    estcl = fit!(GaussianMixtureClusterer(n_classes=3,mixtures=FullGaussian,rng=rng,verbosity=NONE),xtrain)
-    gmmFullAccuracy = accuracy(ytrain,estcl,ignorelabels=true, rng=rng)
-    # For comparison with Clustering.jl
-    clusteringOut = Clustering.kmeans(xtrain', 3)
-    kMeans2Accuracy = accuracy(ytrain,clusteringOut.assignments,ignorelabels=true)
-    # For comparison with GaussianMixtures.jl - sometimes GaussianMixtures.jl em!
fails with a PosDefException
-    dGMM = GaussianMixtures.GMM(3, xtrain; method=:kmeans, kind=:diag)
-    GaussianMixtures.em!(dGMM, xtrain)
-    gmmDiag2Accuracy = accuracy(ytrain,GaussianMixtures.gmmposterior(dGMM, xtrain)[1],ignorelabels=true)
-    fGMM = GaussianMixtures.GMM(3, xtrain; method=:kmeans, kind=:full)
-    GaussianMixtures.em!(fGMM, xtrain)
-    gmmFull2Accuracy = accuracy(ytrain,GaussianMixtures.gmmposterior(fGMM, xtrain)[1],ignorelabels=true)
-    # Returning the accuracies
-    return kMeansGAccuracy,kMeansRAccuracy,kMeansSAccuracy,kMedoidsGAccuracy,kMedoidsRAccuracy,kMedoidsSAccuracy,gmmSpherAccuracy,gmmDiagAccuracy,gmmFullAccuracy,kMeans2Accuracy,gmmDiag2Accuracy,gmmFull2Accuracy
-end
-
-# We transform the output in a matrix for easier analysis
-accuracies = fill(0.0,(length(cOut),length(cOut[1])))
-[accuracies[r,c] = cOut[r][c] for r in 1:length(cOut),c in 1:length(cOut[1])]
-μs = mean(accuracies,dims=1)
-σs = std(accuracies,dims=1)
-
-
-modelLabels=["kMeansG","kMeansR","kMeansS","kMedoidsG","kMedoidsR","kMedoidsS","gmmSpher","gmmDiag","gmmFull","kMeans (Clustering.jl)","gmmDiag (GaussianMixtures.jl)","gmmFull (GaussianMixtures.jl)"]
-
-report = DataFrame(mName = modelLabels, avgAccuracy = dropdims(round.(μs',digits=3),dims=2), stdAccuracy = dropdims(round.(σs',digits=3),dims=2))
-```
-
-Accuracies (mean and standard deviation) running this script with different random seeds (`123`, `1000` and `10000`):
-
-| model                          | μ 1   | σ 1   | μ 2   | σ 2   | μ 3   | σ 3   |
-| ------------------------------ | ----- | ----- | ----- | ----- | ----- | ----- |
-| kMeansG                        | 0.891 | 0.017 | 0.892 | 0.012 | 0.893 | 0.017 |
-| kMeansR                        | 0.866 | 0.083 | 0.831 | 0.127 | 0.836 | 0.114 |
-| kMeansS                        | 0.764 | 0.174 | 0.822 | 0.145 | 0.779 | 0.170 |
-| kMedoidsG                      | 0.894 | 0.015 | 0.896 | 0.012 | 0.894 | 0.017 |
-| kMedoidsR                      | 0.804 | 0.144 | 0.841 | 0.123 | 0.825 | 0.134 |
-| kMedoidsS                      | 0.893 | 0.018 | 0.834 | 0.130 | 0.877 | 0.085 |
-| gmmSpher                       | 0.893 | 0.016 | 0.891 | 0.016 | 0.895 | 0.017 |
-| gmmDiag                        | 0.917 | 0.022 | 0.912 | 0.016 | 0.916 | 0.014 |
-| gmmFull                        | 0.970 | 0.035 | 0.982 | 0.013 | 0.981 | 0.009 |
-| kMeans (Clustering.jl)         | 0.856 | 0.112 | 0.873 | 0.083 | 0.873 | 0.089 |
-| gmmDiag (GaussianMixtures.jl)  | 0.865 | 0.127 | 0.872 | 0.090 | 0.833 | 0.152 |
-| gmmFull (GaussianMixtures.jl)  | 0.907 | 0.133 | 0.914 | 0.160 | 0.917 | 0.141 |
-
-We can see that running the script multiple times with different random seeds confirms the estimated standard deviations collected with the cross_validation, with the BetaML GMM-based models and the grid-based ones being the most stable.
-
-### BetaML model accuracies
-
-From the output we see that, for this dataset, the gmm models generally perform better than the kmeans or kmedoids algorithms, and they further have very low variances.
-In detail, it is the (default) `grid` initialisation that leads to the better results for `kmeans` and `kmedoids`, while for the `gmm` models it is `FullGaussian` that performs best.
-
-### Comparisons with `Clustering.jl` and `GaussianMixtures.jl`
-For this specific case, both `Clustering.jl` and `GaussianMixtures.jl` report substantially worse accuracies, and with very high variances, but the ranking in accuracy (Full Gaussian gmm > Diagonal Gaussian > Kmeans) is maintained.
-I suspect the reason that the BetaML gmm works so well is related to the use of the kmeans algorithm for the initialisation of the mixtures, itself initialised with a "grid" approach.
-The grid initialisation indeed "guarantees" that the initial means of the mixture components are well spread across the multidimensional space defined by the data, and it helps prevent the EM algorithm from converging to a bad local optimum.
-
-## Working without the labels
-
-Up to now we have used the real labels to compare the model accuracies. But in real clustering examples we don't have the true classes, or we wouldn't need to do clustering in the first place, so we don't know the number of classes to use.
-There are several methods to judge the goodness of clustering algorithms. For likelihood-based algorithms such as `GaussianMixtureClusterer` we can use an information criterion that trades off the goodness of the likelihood against the number of parameters used in the fit.
-By default BetaML provides in the gmm clustering outputs both the _Bayesian information criterion_ ([`BIC`](@ref bic)) and the _Akaike information criterion_ ([`AIC`](@ref aic)); for both, a lower value is better.
-
-We can then run the model with different numbers of classes and see which one leads to the lowest BIC or AIC.
-Hence we run `cross_validation` again with the `FullGaussian` gmm model.
-Note that we use the BIC/AIC criteria here for establishing the "best" number of classes, but we could have used them also to select the kind of Gaussian distribution to use. This is one example of hyper-parameter tuning that we develop in more detail using autotuning in the [regression tutorial](@ref regression_tutorial).
-
-Let's try up to 4 possible classes:
-
-```text
-K = 4
-sampler = KFold(nsplits=5,nrepeats=2,shuffle=true, rng=copy(AFIXEDRNG))
-cOut = cross_validation([x,y],sampler,return_statistics=false) do trainData,testData,rng
-    (xtrain,ytrain) = trainData;
-    BICS = []
-    AICS = []
-    for k in 1:K
-        m = GaussianMixtureClusterer(n_classes=k,mixtures=FullGaussian,rng=rng,verbosity=NONE)
-        fit!(m,xtrain)
-        push!(BICS,info(m)["BIC"])
-        push!(AICS,info(m)["AIC"])
-    end
-    return (BICS,AICS)
-end
-
-# Transforming the output in matrices for easier analysis
-Nit = length(cOut)
-
-BICS = fill(0.0,(Nit,K))
-AICS = fill(0.0,(Nit,K))
-[BICS[r,c] = cOut[r][1][c] for r in 1:Nit,c in 1:K]
-[AICS[r,c] = cOut[r][2][c] for r in 1:Nit,c in 1:K]
-
-μsBICS = mean(BICS,dims=1)
-```
-
-```text
-σsBICS = std(BICS,dims=1)
-```
-
-```text
-μsAICS = mean(AICS,dims=1)
-```
-
-```text
-σsAICS = std(AICS,dims=1)
-```
-
-```text
-plot(1:K,[μsBICS' μsAICS'], labels=["BIC" "AIC"], title="Information criteria by number of classes", xlabel="number of classes", ylabel="lower is better")
-```
-
-We see that following the "lowest AIC" rule we would indeed choose three classes, while following the "lowest BIC" criterion we would have chosen only two classes. This means that there are two classes that, concerning the floral measures used in the database, are very similar, and our models are unsure about them. Perhaps the biologists will end up one day with the conclusion that it is indeed only one species :-).
-
-We could study this issue in more detail by analysing the [`ConfusionMatrix`](@ref), but the one used in BetaML does not account for the `ignorelabels` option (yet).
-
-### Analysing the silhouette of the cluster
-
-A further metric to analyse the cluster output is the so-called [Silhouette method](https://en.wikipedia.org/wiki/Silhouette_(clustering))
-
-Silhouette is a distance-based metric and requires as its first argument a matrix of pairwise distances. This can be computed with the [`pairwise`](@ref) function, which defaults to using `l2_distance` (i.e. Euclidean).
Many other distance functions are available in the [`Clustering`](@ref) sub-module, or one can use the efficiently implemented distances from the [`Distances`](https://github.com/JuliaStats/Distances.jl) package, as in this example.
-
-We'll use here the [`silhouette`](@ref) function over a simple loop:
-
-```text
-x,y = consistent_shuffle([x,y],dims=1)
-import Distances
-pd = pairwise(x,distance=Distances.euclidean) # we compute the pairwise distances
-nclasses = 2:6
-models = [KMeansClusterer, KMedoidsClusterer, GaussianMixtureClusterer]
-println("Silhouette score by model type and class number:")
-for ncl in nclasses, mtype in models
-    m = mtype(n_classes=ncl, verbosity=NONE)
-    ŷ = fit!(m,x)
-    if mtype == GaussianMixtureClusterer
-        ŷ = mode(ŷ)
-    end
-    s = mean(silhouette(pd,ŷ))
-    println("$mtype \t ($ncl classes): $s")
-end
-```
-
-Higher scores are better. We see again that 2 classes have better scores!
-
-## Conclusions
-
-We have shown in this tutorial how we can easily run clustering algorithms in BetaML with just one line of code, `fit!(ChosenClusterer(),x)`, but also how we can use cross-validation in order to help with model or parameter selection, with or without knowing the real classes.
-We find here what we observed with the supervised models. Globally the accuracies of the BetaML models are comparable to those of the leading specialised packages (in this case they are even better), but there is a significant gap in computational efficiency that restricts the practical usage of BetaML to datasets that fit in the PC's memory. However, we trade this relative inefficiency for very flexible model definitions and utility functions (for example `GaussianMixtureClusterer` works with missing data, allowing it to be used as the backbone of the [`GaussianMixtureImputer`](@ref) missing-imputation function, or for collaborative recommendation systems).
-
-[View this file on Github](betaml_tutorial_cluster_iris.jl).
-
----
-
-*This page was generated using [Literate.jl](https://github.com/fredrikekre/Literate.jl).*
-
diff --git a/docs/src/tutorials/Dimensionality reduction/betaml_tutorial_dimensionality_reduction.jl b/docs/src/tutorials/Dimensionality reduction/betaml_tutorial_dimensionality_reduction.jl
deleted file mode 100644
index bfea7d61..00000000
--- a/docs/src/tutorials/Dimensionality reduction/betaml_tutorial_dimensionality_reduction.jl
+++ /dev/null
@@ -1,275 +0,0 @@
-# [A dimensionality reduction task: image encoding (the MNIST dataset)](@id dimensionality_reduction_tutorial)
-# For several reasons, including reducing the risk of incurring the [curse of dimensionality](https://en.wikipedia.org/wiki/Curse_of_dimensionality), we may need to reduce the dimensionality of our data.
-
-# Currently BetaML provides two methods for dimensionality reduction: Principal Component Analysis ([`PCAEncoder`](@ref)), which linearly reprojects the data toward the axes of greatest variance, and an AutoEncoder ([`AutoEncoder`](@ref)), which tries to learn, unsupervised, the characteristics of the data using a neural network.
-
-# We will apply them to
-
-#
-# Data origin:
-# - dataset description: [https://en.wikipedia.org/wiki/MNIST_database](https://en.wikipedia.org/wiki/MNIST_database)
-# - data source we use here: [https://github.com/JuliaML/MLDatasets.jl](https://github.com/JuliaML/MLDatasets.jl)
-
-
-# ## Library and data loading
-using Dates #src
-println(now(), " ", "*** Start image recognition tutorial..."
) #src - -# Activating the local environment specific to BetaML documentation -using Pkg -Pkg.activate(joinpath(@__DIR__,"..","..","..")) -using Random -using BetaML -using MLDatasets # For loading the training data - -Random.seed!(123); -TESTRNG = FIXEDRNG # This could change... - -x,y = MLDatasets.MNIST()[:] -x = permutedims(x,(3,2,1)) -x = convert(Array{Float64,3},x) -x = reshape(x,size(x,1),size(x,2)*size(x,3)) -ohm = OneHotEncoder() -y_oh = fit!(ohm,y) -(N,D) = size(x) - -x2 = collect(x[1:10,:]) - -e_layers = [ - ReshaperLayer((D,1),(28,28,1)), # 784x1 => 28x28x1 - ConvLayer((28,28,1),(5,5),4,stride=2,f=relu,rng=copy(TESTRNG)), # 28x28x1 => 14x14x4 - ConvLayer((14,14,4),(3,3),8,stride=2,f=relu,rng=copy(TESTRNG)), # 14x14x4 => 7x7x8 - ConvLayer((7,7,8),(3,3),8,stride=2,f=relu,rng=copy(TESTRNG)), # 7x7x8 => 4x4x8 - ReshaperLayer((4,4,8),(128,1)), # 4x4x8 => 128x1 - DenseLayer(128,2,f=relu,rng=copy(TESTRNG)) # 128x1 => 2x1 -] - -d_layers = [ - DenseLayer(2,16,f=relu,rng=copy(TESTRNG)) - DenseLayer(16,784,f=relu,rng=copy(TESTRNG)) -] - - -ae_mod = AutoEncoder(encoded_size=2, e_layers=e_layers, d_layers=d_layers, epochs=4, cache=false) -x_ae = fit!(ae_mod,x2) - -predict(ae_mod,x2) - -xtemp = copy(x2) -xi = x[1,:] -for el in ae_mod.par.fullnn.par.nnstruct.layers[1:ae_mod.par.n_el] - xi = forward(el,xi) - println(typeof(xi)) - println(size(xi)) - - # xtemp = vcat([forward(el,r) for r in eachrow(xtemp)]'...) -end -return xtemp|> makematrix - - - - -DenseLayer(2,4*4*8,f=relu,rng=copy(TESTRNG)) # 2x1 => 128x1 - -ReshaperLayer((4*4*8,1),(4,4,8)) # 128x1 => 4x4x8 -a = ConvLayer((4,4,8),(3,3),8,stride=1,padding=3,f=relu,rng=copy(TESTRNG)) # 4x4x8 => 8x8x8 -a = ConvLayer((8,8,8),(3,3),8,stride=1,padding=3,f=relu,rng=copy(TESTRNG)) # 4x4x8 => 8x8x8 - -ReshaperLayer((D,1),(28,28,1)) - - -l1 = ReshaperLayer((D,1),(28,28,1)) -## 28x28x1 => 14x14x8 -l2 = ConvLayer(size(l1)[2],(5,5),8,stride=2,f=relu,rng=copy(TESTRNG)) -## 14x14x8 => 7x7x16 -l3 = ConvLayer(size(l2)[2],(3,3),16,stride=2,f=relu,rng=copy(TESTRNG)) -## 7x7x16 => 4x4x32 -l4 = ConvLayer(size(l3)[2],(3,3),32,stride=2,f=relu,rng=copy(TESTRNG)) -## 4x4x32 => 2x2x32 -l5 = ConvLayer(size(l4)[2],(3,3),32,stride=2,f=relu,rng=copy(TESTRNG)) -## 2x2x32 => 1x1x32 (global per layer mean) -l6 = PoolingLayer(size(l5)[2],(2,2),stride=(2,2),f=mean) -## 1x1x32 => 32x1 -l7 = ReshaperLayer(size(l6)[2]) -## 32x1 => 10x1 -l8 = DenseLayer(size(l7)[2][1],10,f=identity, rng=copy(TESTRNG)) - - - -pca_mod = PCAEncoder() -#x_pca = fit!(pca_mod,x[1:20000,200:end]) -e_layers = [DenseLayer(784,30)] -d_layers = [DenseLayer(30,784)] -ae_mod = AutoEncoder(encoded_size=2) -x_ae = fit!(ae_mod,x[1:200,:]) - -e_layers = - - - -x_train, y_train = MLDatasets.MNIST(split=:train)[:] -x_train = permutedims(x_train,(3,2,1)) -x_train = convert(Array{Float64,3},x_train) -x_train = reshape(x_train,size(x_train,1),size(x_train,2)*size(x_train,3)) -ohm = OneHotEncoder() -y_train_oh = fit!(ohm,y_train) - -x_test, y_test = MLDatasets.MNIST(split=:test)[:] -x_test = permutedims(x_test,(3,2,1)) -x_test = convert(Array{Float64,3},x_test) -x_test = reshape(x_test,size(x_test,1),size(x_test,2)*size(x_test,3)) -y_test_oh = predict(ohm,y_test) -(N,D) = size(x_train) - - -using DelimitedFiles -using Statistics -using BenchmarkTools -using Plots -using Flux -using Flux: Data.DataLoader -using Flux: onehotbatch, onecold, crossentropy -using MLDatasets # For loading the training data -#using Images, FileIO, ImageTransformations # For loading the actual images - -TESTRNG = FIXEDRNG # This could 
change...

-x_train, y_train = MLDatasets.MNIST(split=:train)[:]
-x_train = permutedims(x_train,(3,2,1))
-x_train = convert(Array{Float64,3},x_train)
-x_train = reshape(x_train,size(x_train,1),size(x_train,2)*size(x_train,3))
-ohm = OneHotEncoder()
-y_train_oh = fit!(ohm,y_train)
-
-x_test, y_test = MLDatasets.MNIST(split=:test)[:]
-x_test = permutedims(x_test,(3,2,1))
-x_test = convert(Array{Float64,3},x_test)
-x_test = reshape(x_test,size(x_test,1),size(x_test,2)*size(x_test,3))
-y_test_oh = predict(ohm,y_test)
-(N,D) = size(x_train)
-
-# Building the model:
-
-## 784x1 => 28x28x1
-l1 = ReshaperLayer((D,1),(28,28,1))
-## 28x28x1 => 14x14x8
-l2 = ConvLayer(size(l1)[2],(5,5),8,stride=2,f=relu,rng=copy(TESTRNG))
-## 14x14x8 => 7x7x16
-l3 = ConvLayer(size(l2)[2],(3,3),16,stride=2,f=relu,rng=copy(TESTRNG))
-## 7x7x16 => 4x4x32
-l4 = ConvLayer(size(l3)[2],(3,3),32,stride=2,f=relu,rng=copy(TESTRNG))
-## 4x4x32 => 2x2x32
-l5 = ConvLayer(size(l4)[2],(3,3),32,stride=2,f=relu,rng=copy(TESTRNG))
-## 2x2x32 => 1x1x32 (global per layer mean)
-l6 = PoolingLayer(size(l5)[2],(2,2),stride=(2,2),f=mean)
-## 1x1x32 => 32x1
-l7 = ReshaperLayer(size(l6)[2])
-## 32x1 => 10x1
-l8 = DenseLayer(size(l7)[2][1],10,f=identity, rng=copy(TESTRNG))
-## 10x1 => 10x1
-l9 = VectorFunctionLayer(size(l8)[2][1],f=BetaML.softmax)
-layers = [l1,l2,l3,l4,l5,l6,l7,l8,l9]
-m = NeuralNetworkEstimator(layers=layers,loss=squared_cost,verbosity=HIGH,batch_size=128,epochs=4)
-
-# We train the model only on a subset of the training data, otherwise it is too long for the automated building of this page.
-# Training the whole MNIST set takes approximately 16 minutes on a mid-level laptop (on CPU), leading to a test accuracy of 0.969
-(x_debug,x_other),(y_debug_oh,y_other_oh) = partition([x_train,y_train_oh],[0.01,0.99],rng=copy(TESTRNG))
-
-#preprocess!.(layers)
-# 0.131836 seconds (477.02 k allocations: 53.470 MiB, 72.73% compilation time)
-#@code_warntype preprocess!(l5)
-
-ŷ = fit!(m,x_debug,y_debug_oh)
-#@btime fit!(m,x_debug,y_debug_oh)
-# 1%: 15.909 s (1940246 allocations: 1.39 GiB)
-# 17.509 s (1039126 allocations: 1.37 GiB)
-# 15.766 s (1039111 allocations: 1.37 GiB)
-# 14.669 s (3129139 allocations: 1.64 GiB) (w threads)
-# 18.119 s (1039121 allocations: 1.37 GiB)
-# 14.966 s (1039123 allocations: 1.37 GiB) (without threads)
-# 19.357 s (1039123 allocations: 1.37 GiB)
-
-#println(now(), " ", "*** prefit..." ) #src
-#ŷ = fit!(m,x_train,y_train_oh)
-#println(now(), " ", "*** postfit..." ) #src
-
-#y_true = inverse_predict(ohm,convert(Matrix{Bool},y_train_oh))
-y_true = inverse_predict(ohm,convert(Matrix{Bool},y_debug_oh))
-ŷ_nonoh = inverse_predict(ohm,ŷ)
-accuracy(y_true,ŷ_nonoh)
-hcat(y_true,ŷ_nonoh)
-
-ŷtest = predict(m,x_test)
-ytest_true = inverse_predict(ohm,convert(Matrix{Bool},y_test_oh))
-ŷtest_nonoh = inverse_predict(ohm,ŷtest)
-accuracy(ytest_true,ŷtest_nonoh)
-hcat(ytest_true,ŷtest_nonoh)
-
-cm = ConfusionMatrix()
-fit!(cm,ytest_true,ŷtest_nonoh)
-print(cm)
-
-res = info(cm)
-
-heatmap(string.(res["categories"]),string.(res["categories"]),res["normalised_scores"],seriescolor=cgrad([:white,:blue]),xlabel="Predicted",ylabel="Actual", title="Confusion Matrix (normalised scores)")
-
-# -----------------------------------------------------------
-# ## Flux implementation
-# This is the equivalent workflow in Flux.
-# Fitting on the whole training dataset leads to a test accuracy of 0.9658, so likely not statistically different from BetaML, but still with a much faster computation time, as it takes only 2 minutes instead of 16...
-
-
-x_train, y_train = MLDatasets.MNIST(split=:train)[:]
-x_train = permutedims(x_train,(2,1,3)); # For correct img axis
-#x_train = convert(Array{Float32,3},x_train);
-x_train = reshape(x_train,(28,28,1,60000));
-y_train = Flux.onehotbatch(y_train, 0:9)
-train_data = Flux.Data.DataLoader((x_train, y_train), batchsize=128)
-#x_test, y_test = MLDatasets.MNIST.testdata(dir = "data/MNIST")
-x_test, y_test = MLDatasets.MNIST(split=:test)[:]
-x_test = permutedims(x_test,(2,1,3)); # For correct img axis
-#x_test = convert(Array{Float32,3},x_test);
-x_test = reshape(x_test,(28,28,1,10000));
-y_test = Flux.onehotbatch(y_test, 0:9)
-
-model = Chain(
-    ## 28x28 => 14x14
-    Conv((5, 5), 1=>8, pad=2, stride=2, Flux.relu),
-    ## 14x14 => 7x7
-    Conv((3, 3), 8=>16, pad=1, stride=2, Flux.relu),
-    ## 7x7 => 4x4
-    Conv((3, 3), 16=>32, pad=1, stride=2, Flux.relu),
-    ## 4x4 => 2x2
-    Conv((3, 3), 32=>32, pad=1, stride=2, Flux.relu),
-    ## Average pooling on each width x height feature map
-    GlobalMeanPool(),
-    Flux.flatten,
-    Dense(32, 10),
-    Flux.softmax
-)
-
-
-
-myaccuracy(y,ŷ) = (mean(Flux.onecold(ŷ) .== Flux.onecold(y)))
-myloss(x, y) = Flux.crossentropy(model(x), y)
-
-opt = Flux.ADAM()
-ps = Flux.params(model)
-number_epochs = 4
-
-[(println(e); Flux.train!(myloss, ps, train_data, opt)) for e in 1:number_epochs]
-
-ŷtrain = model(x_train)
-ŷtest = model(x_test)
-myaccuracy(y_train,ŷtrain)
-myaccuracy(y_test,ŷtest)
-
-plot(Gray.(x_train[:,:,1,2]))
-
-cm = ConfusionMatrix()
-fit!(cm,Flux.onecold(y_test) .-1, Flux.onecold(ŷtest) .-1 )
-println(cm)
-
-res = info(cm)
-heatmap(string.(res["categories"]),string.(res["categories"]),res["normalised_scores"],seriescolor=cgrad([:white,:blue]),xlabel="Predicted",ylabel="Actual", title="Confusion Matrix (normalised scores)")
-
diff --git a/docs/src/tutorials/Feature importance/Feature_importance.jl b/docs/src/tutorials/Feature importance/Feature_importance.jl
index 64d96e9c..9036369c 100644
--- a/docs/src/tutorials/Feature importance/Feature_importance.jl
+++ b/docs/src/tutorials/Feature importance/Feature_importance.jl
@@ -162,4 +162,4 @@ vline!([loss_fullmodel-quantile(Normal(1,0),0.975) * loss_fullmodel_sd/sqrt(ntri
 #-
 bar(var_names[sortperm(sobol_by_col)],sobol_by_col[sortperm(sobol_by_col)],label="Sobol index by col", permute=(:x,:y), yerror=quantile(Normal(1,0),0.975) .* (sobol_by_col_sd[sortperm(sobol_by_col)]./sqrt(ntrials_per_metric)), yrange=[0,0.5])
-# As we can see, the two analyses agree on the most important variables, showing that the size of the house (number of rooms), the percentage of low-income population in the neighbourhood and, to a lesser extent, the distance to employment centres are the most important variables for the estimation of house price in the Boston area.
\ No newline at end of file
+# As we can see, the two analyses agree on the most important variables, showing that the size of the house (number of rooms), the percentage of low-income population in the neighbourhood and, to a lesser extent, the distance to employment centres are the most important explanatory variables of house price in the Boston area.
\ No newline at end of file
diff --git a/docs/src/tutorials/Feature importance/Feature_importance.md b/docs/src/tutorials/Feature importance/Feature_importance.md
deleted file mode 100644
index 8d5ef564..00000000
--- a/docs/src/tutorials/Feature importance/Feature_importance.md
+++ /dev/null
@@ -1,230 +0,0 @@
-```@meta
-EditURL = "Feature_importance.jl"
-```
-
-# [Understanding variable importance in black-box machine learning models](@id variable_importance_tutorial)
-
-Often we want to understand the contribution of different variables (x columns) to the prediction accuracy of a black-box machine learning model.
-To this end, BetaML 0.12 introduces [`FeatureRanker`](@ref), a flexible variable ranking estimator that employs multiple variable importance metrics.
-`FeatureRanker` helps to determine the importance of features in predictions from any black-box machine learning model (not necessarily of the BetaML suite), internally using cross-validation to assess the quality of the predictions (`metric="mda"`), or the contribution of the variable to the variance of the predictions (`metric="sobol"`), with or without a given variable.
-
-By default, it ranks variables (columns) in a single pass without retraining on each one. However, it is possible to specify the model to use multiple passes (where on each pass the least important variable is permuted). This helps to assess importance in the presence of highly correlated variables.
-While the default strategy is to simply (temporarily) permute the "test" variable and predict the modified data set, it is possible to refit the model to be evaluated on each variable ("permute and relearn"), of course at a much higher computational cost.
-However, if the ML model to be evaluated supports ignoring variables during prediction (as BetaML tree models do), it is possible to specify the keyword argument for such an option in the target model prediction function and avoid refitting.
-
-In this tutorial we will use `FeatureRanker` first with some synthetic data, and then with the Boston dataset to determine the most important variables in determining house prices.
-We will compare the results with Shapley values using the [`ShapML`](https://github.com/nredell/ShapML.jl) package.
-
-Let's start by activating the local environment specific to the BetaML documentation and loading the necessary packages:
-
-```text
-using Pkg
-Pkg.activate(joinpath(@__DIR__,"..","..",".."))
-using Statistics, Random, Pipe, StableRNGs, HTTP, CSV, DataFrames, Plots, BetaML
-import Distributions: Normal, Uniform, quantile
-import ShapML
-Random.seed!(123)
-```
-
-## Example with synthetic data
-
-In this example, we generate a dataset of 5 random variables, where `x1` is the most important in determining `y`, `x2` is somewhat less important, `x3` has interaction effects with `x1`, while `x4` and `x5` do not contribute at all to the calculation of `y`.
-We also add `x6` as a variable highly correlated with `x1`, but note that, like `x4` and `x5`, it does not contribute to `y`:
-
-```text
-N = 2000
-xa = rand(Uniform(0.0,10.0),N,5)
-xb = xa[:,1] .* rand.(Normal(1,0.5))
-x = hcat(xa,xb)
-y = [10*r[1]-r[2]+0.1*r[3]*r[1] for r in eachrow(x) ];
-nothing #hide
-```
-
-Aside from `y`, which is numerical, we also create a categorical version to test classification, and a further one-hot version to test neural network models that, for classification tasks, work with one-hot encoded variables:
-
-```text
-ysort = sort(y)
-ycat = [(i < ysort[Int(round(N/3))]) ? "c" : ( (i < ysort[Int(round(2*N/3))]) ?
"a" : "b") for i in y] -yoh = fit!(OneHotEncoder(),ycat); -nothing #hide -``` - -We first try a Random Forest regressor. The BetaML `RandomForestEstimator` model supports a `predict` function with the option to ignore specific dimensions. This allow us to "test" the various variables without retraining the model: - -```text -fr = FeatureRanker(model=RandomForestEstimator(),nsplits=5,nrepeats=1,recursive=false,metric="mda",ignore_dims_keyword="ignore_dims") -rank = fit!(fr,x,y) # As for the other BetaML models, `fit!` by default returns the predictions, in this case the ranking, avoiding a `predict` call -``` - -As expected, the ranking shows `x1` as the most important variable. Let's look in detail at the metrics that we can obtain by querying the model with `info(fr)`: - -```text -loss_by_col = info(fr)["loss_by_col"] -sobol_by_col = info(fr)["sobol_by_col"] -loss_by_col_sd = info(fr)["loss_by_col_sd"] -sobol_by_col_sd = info(fr)["sobol_by_col_sd"] -loss_fullmodel = info(fr)["loss_all_cols"] -loss_fullmodel_sd = info(fr)["loss_all_cols_sd"] -ntrials_per_metric = info(fr)["ntrials_per_metric"] -``` - -Since we choosed `mda` as the reported metric, we must have that the reported rank is equal to the sortperm of `loss_by_col`: - -```text -sortperm(loss_by_col) == rank -``` - -We can plot the loss per (omitted) column... - -```text -bar(string.(rank),loss_by_col[rank],label="Loss by col", yerror=quantile(Normal(1,0),0.975) .* (loss_by_col_sd[rank]./sqrt(ntrials_per_metric))) -``` - -..and the sobol values: - -```text -bar(string.(sortperm(sobol_by_col)),sobol_by_col[sortperm(sobol_by_col)],label="Sobol index by col", yerror=quantile(Normal(1,0),0.975) .* (sobol_by_col_sd[sortperm(sobol_by_col)]./sqrt(ntrials_per_metric))) -``` - -As we can see from the graphs, the model did a good job of identifying the first variable as the most important one, ignoring the others and even giving a very low importance to the correlated one. - -### Comparision with the Shapley values - -For Shapley values we need first to have a trained model - -```text -m = RandomForestEstimator() -fit!(m,x,y); -nothing #hide -``` - -We need then to wrap the predict function, accounting with the fact that BetaML models works with standard arrays, while `ShapML` assume data in DataFrame format: - -```text -function predict_function(model, data) - data_pred = DataFrame(y_pred = BetaML.predict(model, Matrix(data))) - return data_pred -end -``` - -We set up other data related to the simulation.. - -```text -explain = DataFrame(x[1:300, :],:auto) -reference = DataFrame(x,:auto) - -sample_size = 60 ; # Number of Monte Carlo samples. -nothing #hide -``` - -...and finally compute the stochastic Shapley values per individual record: - -```text -data_shap = ShapML.shap(explain = explain, - reference = reference, - model = m, - predict_function = predict_function, - sample_size = sample_size, - seed = 1 - ); -nothing #hide -``` - -We aggregate the Shape values by feature - -```text -shap_aggregated =combine(groupby(data_shap,[:feature_name])) do subdf - (mean_effect = mean(abs.(subdf.shap_effect)), std = std(abs.(subdf.shap_effect)), n = size(subdf,1) ) -end -shap_values = shap_aggregated.mean_effect - -bar(string.(sortperm(shap_values)),shap_values[sortperm(shap_values)],label="Shapley values by col", yerror=quantile(Normal(1,0),0.975) .* (shap_aggregated.std[sortperm(shap_values)]./ sqrt.(shap_aggregated.n))) -``` - -Note that the output using the Sobol index and the Shapley values are very similar. 
This shoudn't come as a surprice, as the two metrics are related. - -### Classifications - -For classification tasks, the usage of `FeatureRanker` doesn't change: - -```text -fr = FeatureRanker(model=RandomForestEstimator(),nsplits=3,nrepeats=2,recursive=true,metric="mda",ignore_dims_keyword="ignore_dims") -rank = fit!(fr,x,ycat) -``` - -```text -fr = FeatureRanker(model=NeuralNetworkEstimator(verbosity=NONE),nsplits=3,nrepeats=1,recursive=false,metric="sobol",refit=false) -rank = fit!(fr,x,yoh) -``` - -## Determinant of house prices in the Boston alrea - -We start this example by first loading the data from a CSV file and splitting the data in features and labels: - -```text -data = CSV.File(joinpath(@__DIR__,"data","housing.data"), delim=' ', header=false, ignorerepeated=true) |> DataFrame - -var_names = [ - "CRIM", # per capita crime rate by town - "ZN", # proportion of residential land zoned for lots over 25,000 sq.ft. - "INDUS", # proportion of non-retail business acres per town - "CHAS", # Charles River dummy variable (= 1 if tract bounds river; 0 otherwise) - "NOX", # nitric oxides concentration (parts per 10 million) - "RM", # average number of rooms per dwelling - "AGE", # proportion of owner-occupied units built prior to 1940 - "DIS", # weighted distances to five Boston employment centres - "RAD", # index of accessibility to radial highways - "TAX", # full-value property-tax rate per $10,000 - "PTRATIO", # pupil-teacher ratio by town - "B", # 1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town - "LSTAT", # % lower status of the population -] -y_name = "MEDV" ;# Median value of owner-occupied homes in $1000's -nothing #hide -``` - -Our features are a set of 13 explanatory variables, while the label that we want to estimate is the average housing prices: - -```text -x = Matrix(data[:,1:13]) -y = data[:,14]; -nothing #hide -``` - -We use a Random Forest model as regressor and we compute the variable importance for this model as we did for the synthetic data: - -```text -fr = FeatureRanker(model=RandomForestEstimator(),nsplits=3,nrepeats=2,recursive=false) -rank = fit!(fr,x,y) - -loss_by_col = info(fr)["loss_by_col"] -sobol_by_col = info(fr)["sobol_by_col"] -loss_by_col_sd = info(fr)["loss_by_col_sd"] -sobol_by_col_sd = info(fr)["sobol_by_col_sd"] -loss_fullmodel = info(fr)["loss_all_cols"] -loss_fullmodel_sd = info(fr)["loss_all_cols_sd"] -ntrials_per_metric = info(fr)["ntrials_per_metric"] -``` - -Finally we can plot the variable importance: - -```text -bar(var_names[sortperm(loss_by_col)], loss_by_col[sortperm(loss_by_col)],label="Loss by var", permute=(:x,:y), yerror=quantile(Normal(1,0),0.975) .* (loss_by_col_sd[sortperm(loss_by_col)]./sqrt(ntrials_per_metric)), yrange=[0,0.5]) -vline!([loss_fullmodel], label="Loss with all vars",linewidth=2) -vline!([loss_fullmodel-quantile(Normal(1,0),0.975) * loss_fullmodel_sd/sqrt(ntrials_per_metric), - loss_fullmodel+quantile(Normal(1,0),0.975) * loss_fullmodel_sd/sqrt(ntrials_per_metric), -], label=nothing,linecolor=:black,linestyle=:dot,linewidth=1) -``` - -```text -bar(var_names[sortperm(sobol_by_col)],sobol_by_col[sortperm(sobol_by_col)],label="Sobol index by col", permute=(:x,:y), yerror=quantile(Normal(1,0),0.975) .* (sobol_by_col_sd[sortperm(sobol_by_col)]./sqrt(ntrials_per_metric)), yrange=[0,0.4]) -``` - -As we can see, the two analyses agree on the most important variables, showing that the size of the house (number of rooms), the percentage of low-income population in the neighbourhood and, to a lesser extent, the 
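-For reference, a standard textbook definition of the first-order Sobol index of a variable $X_i$ (not specific to the BetaML implementation) is the share of the output variance explained by that variable alone, $S_i = \frac{\mathrm{Var}\left(E[Y \mid X_i]\right)}{\mathrm{Var}(Y)}$, while Shapley values split the prediction among the variables game-theoretically, so the two rankings tend to agree when interaction effects are mild. The `mda` rankings above rest instead on the permute-and-predict idea; a minimal sketch of a single pass could look like the following (our own simplification for illustration only: it assumes a generic fitted model `m` exposing a `predict(m,x)` method and uses a plain mean squared error, whereas `FeatureRanker` relies on cross-validation and its own metrics): - -```text -using Random, Statistics -function permutation_importance(m, x, y; rng=Random.GLOBAL_RNG) -    mse(y,ŷ) = mean((y .- ŷ).^2) -    base = mse(y, predict(m,x))               # loss on the unmodified data -    scores = zeros(size(x,2)) -    for d in axes(x,2) -        xp = copy(x) -        xp[:,d] = shuffle(rng, xp[:,d])       # break the link between column d and y -        scores[d] = mse(y, predict(m,xp)) - base # loss increase ≈ importance of column d -    end -    return scores -end -``` -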
-### Classifications - -For classification tasks, the usage of `FeatureRanker` doesn't change: - -```text -fr = FeatureRanker(model=RandomForestEstimator(),nsplits=3,nrepeats=2,recursive=true,metric="mda",ignore_dims_keyword="ignore_dims") -rank = fit!(fr,x,ycat) -``` - -The same holds when using a neural network estimator on the one-hot encoded target: - -```text -fr = FeatureRanker(model=NeuralNetworkEstimator(verbosity=NONE),nsplits=3,nrepeats=1,recursive=false,metric="sobol",refit=false) -rank = fit!(fr,x,yoh) -``` - -## Determinants of house prices in the Boston area - -We start this example by first loading the data from a CSV file and splitting it into features and labels: - -```text -data = CSV.File(joinpath(@__DIR__,"data","housing.data"), delim=' ', header=false, ignorerepeated=true) |> DataFrame - -var_names = [ -  "CRIM",    # per capita crime rate by town -  "ZN",      # proportion of residential land zoned for lots over 25,000 sq.ft. -  "INDUS",   # proportion of non-retail business acres per town -  "CHAS",    # Charles River dummy variable (= 1 if tract bounds river; 0 otherwise) -  "NOX",     # nitric oxides concentration (parts per 10 million) -  "RM",      # average number of rooms per dwelling -  "AGE",     # proportion of owner-occupied units built prior to 1940 -  "DIS",     # weighted distances to five Boston employment centres -  "RAD",     # index of accessibility to radial highways -  "TAX",     # full-value property-tax rate per $10,000 -  "PTRATIO", # pupil-teacher ratio by town -  "B",       # 1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town -  "LSTAT",   # % lower status of the population -] -y_name = "MEDV" ;# Median value of owner-occupied homes in $1000's -nothing #hide -``` - -Our features are a set of 13 explanatory variables, while the label we want to estimate is the median housing value: - -```text -x = Matrix(data[:,1:13]) -y = data[:,14]; -nothing #hide -``` - -We use a Random Forest model as regressor and we compute the variable importance for this model as we did for the synthetic data: - -```text -fr = FeatureRanker(model=RandomForestEstimator(),nsplits=3,nrepeats=2,recursive=false) -rank = fit!(fr,x,y) - -loss_by_col = info(fr)["loss_by_col"] -sobol_by_col = info(fr)["sobol_by_col"] -loss_by_col_sd = info(fr)["loss_by_col_sd"] -sobol_by_col_sd = info(fr)["sobol_by_col_sd"] -loss_fullmodel = info(fr)["loss_all_cols"] -loss_fullmodel_sd = info(fr)["loss_all_cols_sd"] -ntrials_per_metric = info(fr)["ntrials_per_metric"] -``` - -Finally we can plot the variable importance: - -```text -bar(var_names[sortperm(loss_by_col)], loss_by_col[sortperm(loss_by_col)],label="Loss by var", permute=(:x,:y), yerror=quantile(Normal(1,0),0.975) .* (loss_by_col_sd[sortperm(loss_by_col)]./sqrt(ntrials_per_metric)), yrange=[0,0.5]) -vline!([loss_fullmodel], label="Loss with all vars",linewidth=2) -vline!([loss_fullmodel-quantile(Normal(1,0),0.975) * loss_fullmodel_sd/sqrt(ntrials_per_metric), -        loss_fullmodel+quantile(Normal(1,0),0.975) * loss_fullmodel_sd/sqrt(ntrials_per_metric), -], label=nothing,linecolor=:black,linestyle=:dot,linewidth=1) -``` - -```text -bar(var_names[sortperm(sobol_by_col)],sobol_by_col[sortperm(sobol_by_col)],label="Sobol index by col", permute=(:x,:y), yerror=quantile(Normal(1,0),0.975) .* (sobol_by_col_sd[sortperm(sobol_by_col)]./sqrt(ntrials_per_metric)), yrange=[0,0.4]) -``` - -As we can see, the two analyses agree on the most important variables, showing that the size of the house (number of rooms), the percentage of low-income population in the neighbourhood and, to a lesser extent, the distance to employment centres are the most important explanatory variables of house price in the Boston area. - -[View this file on Github](Feature_importance.jl). - ---- - -*This page was generated using [Literate.jl](https://github.com/fredrikekre/Literate.jl).* - diff --git a/docs/src/tutorials/Multi-branch neural network/betaml_tutorial_multibranch_nn.md b/docs/src/tutorials/Multi-branch neural network/betaml_tutorial_multibranch_nn.md deleted file mode 100644 index 6488587e..00000000 --- a/docs/src/tutorials/Multi-branch neural network/betaml_tutorial_multibranch_nn.md +++ /dev/null @@ -1,169 +0,0 @@ -```@meta -EditURL = "betaml_tutorial_multibranch_nn.jl" -``` - -# [A deep neural network with multi-branch architecture](@id multibranch_nn_tutorial) - -Often we can "divide" our feature sets into different groups, where for each group we have many, many variables whose importance in prediction we don't know, but for which using a fully dense layer would be too computationally expensive. -For example, we want to predict the growth of forest trees based on soil characteristics, climate characteristics and a bunch of other data (species, age, density...). - -A soil (or climate) database may have hundreds of variables: how can we reduce them to a few that encode all the "soil" information? -Sure, we could do a PCA or a clustering analysis, but a better way is to let the model itself find a way to _encode_ the soil information into a small vector that is optimal for our prediction goal, i.e. we target the encoding task at the prediction itself. - -So we build a multi-branch neural network: one branch takes the soil variables, starting from all the hundreds of variables and ending in a few neuron outputs; a second branch does the same for the climate variables; we then merge these two branches to take into account the soil-weather interrelation (for example, it is well known that the water retention capacity of a sandy soil is quite different from that of a clay soil); and finally we merge the resulting branch with the branch of the other variables to arrive at a single predicted output. -In this example we focus on building, training and predicting a multi-branch neural network. See the other examples for cross-validation, hyperparameter tuning, scaling, overfitting, encoding, etc. - -Data origin: -- while we hope to apply this example soon to actual real-world data, for now we work on synthetic random data just to assess the validity of the network configuration. - -## Library and data generation - -Activating the local environment specific to the tutorials - -```text -using Pkg -Pkg.activate(joinpath(@__DIR__,"..","..","..")) -``` - -We first load all the packages we are going to use - -```text -using StableRNGs, BetaML, Plots -``` - -Here we are explicit and we use our own fixed RNG: - -```text -seed = 123 -AFIXEDRNG = StableRNG(seed) -``` - -Here we generate the random data... - -```text -N = 100 # records -soilD = 20 # dimensions of the soil database -climateD = 30 # dimensions of the climate database -othervarD = 10 # dimensions of the other variables database - -soilX = rand(StableRNG(seed),N,soilD) -climateX = rand(StableRNG(seed+10),N,climateD) -othervarX = rand(StableRNG(seed+20),N,othervarD) -X = hcat(soilX,climateX,othervarX) -Y = rand(StableRNG(seed+30),N) -``` - -## Model definition - -![Neural Network model](imgs/multibranch_nn.png) - -In the figure above, each circle represents a multi-neuron layer, with the number of neurons (output dimensions) written inside. Dotted circles are `ReplicatorLayer`s, which simply "pass through" the information to the next layer. -Red layers represent the layers responsible for the final step in encoding the information for a given branch. Subsequent layers will use this encoded information (i.e. decode it) to finally provide the prediction for the branch.
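-Conceptually, a grouped layer applies each of its sub-layers to its own contiguous slice of the input vector and concatenates the results. The following is a schematic sketch of that idea only, ours and not the actual `GroupedLayer` implementation; the generic `forward` function and the per-branch input sizes are assumptions for illustration: - -```text -# Schematic: route slices of the input to the corresponding branch sub-layer -function grouped_forward(sublayers, input_sizes, x) -    outs = Vector{Float64}[] -    i = 1 -    for (l, n) in zip(sublayers, input_sizes) -        push!(outs, forward(l, x[i:i+n-1])) # each branch sees only its own block of inputs -        i += n -    end -    return vcat(outs...)                    # concatenate the branch outputs -end -``` -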
-We create a first branch for the soil variables, a second for the climate variables and finally a third for the other variables. We merge the soil and climate branches in layer 4, and the resulting branch with the other-variables branch in layer 6. Finally, the single-neuron layer 8 provides the prediction. - -The weights along the whole chain can be learned using the traditional backpropagation algorithm. - -The whole model can be implemented with the following code: - -- layer 1: - -```text -l1_soil = DenseLayer(20,30,f=relu,rng=copy(AFIXEDRNG)) -l1_climate = ReplicatorLayer(30) -l1_oth = ReplicatorLayer(10) -l1 = GroupedLayer([l1_soil,l1_climate,l1_oth]) -``` - -- layer 2: - -```text -l2_soil = DenseLayer(30,30,f=relu,rng=copy(AFIXEDRNG)) -l2_climate = DenseLayer(30,40,f=relu,rng=copy(AFIXEDRNG)) -l2_oth = ReplicatorLayer(10) -l2 = GroupedLayer([l2_soil,l2_climate,l2_oth]) -``` - -- layer 3: - -```text -l3_soil = DenseLayer(30,4,f=relu,rng=copy(AFIXEDRNG)) # encoding of soil properties -l3_climate = DenseLayer(40,4,f=relu,rng=copy(AFIXEDRNG)) # encoding of climate properties -l3_oth = DenseLayer(10,15,f=relu,rng=copy(AFIXEDRNG)) -l3 = GroupedLayer([l3_soil,l3_climate,l3_oth]) -``` - -- layer 4: - -```text -l4_soilclim = DenseLayer(8,15,f=relu,rng=copy(AFIXEDRNG)) -l4_oth = DenseLayer(15,15,f=relu,rng=copy(AFIXEDRNG)) -l4 = GroupedLayer([l4_soilclim,l4_oth]) -``` - -- layer 5: - -```text -l5_soilclim = DenseLayer(15,6,f=relu,rng=copy(AFIXEDRNG)) # encoding of soil and climate properties together -l5_oth = DenseLayer(15,6,f=relu,rng=copy(AFIXEDRNG)) # encoding of other vars -l5 = GroupedLayer([l5_soilclim,l5_oth]) -``` - -- layer 6: - -```text -l6 = DenseLayer(12,15,f=relu,rng=copy(AFIXEDRNG)) -``` - -- layer 7: - -```text -l7 = DenseLayer(15,15,f=relu,rng=copy(AFIXEDRNG)) -``` - -- layer 8: - -```text -l8 = DenseLayer(15,1,f=relu,rng=copy(AFIXEDRNG)) -``` - -Finally we put the layers together and we create our `NeuralNetworkEstimator` model: - -```text -layers = [l1,l2,l3,l4,l5,l6,l7,l8] -m = NeuralNetworkEstimator(layers=layers,opt_alg=ADAM(),epochs=100,rng=copy(AFIXEDRNG)) -``` - -## Fitting the model -We are now ready to fit the model to the data. By default BetaML models return the predictions on the training data directly as the output of the fitting call, so there is no need for a separate call to `predict(m,X)`. - -```text -Ŷ = fit!(m,X,Y) -``` - -## Model quality assessment -We can compute the relative mean error between the "true" Y and the Y estimated by the model. - -```text -rme = relative_mean_error(Y,Ŷ) -``` - -Of course, we know there is no actual relation here between the X and the Y, as both are randomly generated. The result above just tells us that the network has been able to find a mapping between the X and Y used for training; in a real application we would hope that this learned mapping represents a true, general relation between the inputs and the outputs. - -Finally we can also plot Y against Ŷ and visualise how the average loss decreased during training: - -```text -scatter(Y,Ŷ,xlabel="vol observed",ylabel="vol estimated",label=nothing,title="Est vs. obs volumes") -```
- -```text -loss_per_epoch = info(m)["loss_per_epoch"] - -plot(loss_per_epoch, xlabel="epoch", ylabel="loss per epoch", label=nothing, title="Loss per epoch") -``` - -[View this file on Github](betaml_tutorial_multibranch_nn.jl). - ---- - -*This page was generated using [Literate.jl](https://github.com/fredrikekre/Literate.jl).* - diff --git a/docs/src/tutorials/Regression - bike sharing/betaml_tutorial_regression_sharingBikes.md b/docs/src/tutorials/Regression - bike sharing/betaml_tutorial_regression_sharingBikes.md deleted file mode 100644 index 7747f9b0..00000000 --- a/docs/src/tutorials/Regression - bike sharing/betaml_tutorial_regression_sharingBikes.md +++ /dev/null @@ -1,565 +0,0 @@ -```@meta -EditURL = "betaml_tutorial_regression_sharingBikes.jl" -``` - -# [A regression task: the prediction of bike sharing demand](@id regression_tutorial) -The task is to estimate the influence of several variables (like the weather, the season, the day of the week..) on the demand for shared bicycles, so that the authority in charge of the service can organise it in the best way. - -Data origin: -- original full dataset (by hour, not used here): [https://archive.ics.uci.edu/ml/datasets/Bike+Sharing+Dataset](https://archive.ics.uci.edu/ml/datasets/Bike+Sharing+Dataset) -- simplified dataset (by day, with some simple scaling): [https://www.hds.utc.fr/~tdenoeux/dokuwiki/en/aec](https://www.hds.utc.fr/~tdenoeux/dokuwiki/en/aec) -- description: [https://www.hds.utc.fr/~tdenoeux/dokuwiki/_media/en/exam_2019_ace_.pdf](https://www.hds.utc.fr/~tdenoeux/dokuwiki/_media/en/exam_2019_ace_.pdf) -- data: [https://www.hds.utc.fr/~tdenoeux/dokuwiki/_media/en/bike_sharing_day.csv.zip](https://www.hds.utc.fr/~tdenoeux/dokuwiki/_media/en/bike_sharing_day.csv.zip) - -Note that even if we are estimating a time series, we are not using a recurrent neural network here, as we assume the temporal dependence to be negligible (i.e. $Y_t = f(X_t)$ alone). - -## Library and data loading - -Activating the local environment specific to the tutorials - -```text -using Pkg -Pkg.activate(joinpath(@__DIR__,"..","..","..")) -``` - -We first load all the packages we are going to use - -```text -using LinearAlgebra, Random, Statistics, StableRNGs, DataFrames, CSV, Plots, Pipe, BenchmarkTools, BetaML -import Distributions: Uniform, DiscreteUniform -import DecisionTree, Flux ## For comparisons -``` - -Here we are explicit and we use our own fixed RNG: - -```text -seed = 123 # The table at the end of this tutorial has been obtained with seeds 123, 1000 and 10000 -AFIXEDRNG = StableRNG(seed) -``` - -Here we load the data from a CSV file provided by the BetaML package - -```text -basedir = joinpath(dirname(pathof(BetaML)),"..","docs","src","tutorials","Regression - bike sharing") -data = CSV.File(joinpath(basedir,"data","bike_sharing_day.csv"),delim=',') |> DataFrame -describe(data) -``` - -The variable we want to learn to predict is `cnt`, the total demand for bikes for a given day. Even if it is indeed an integer, we treat it as a continuous variable, so each single prediction will be a scalar $Y \in \mathbb{R}$. - -```text -plot(data.cnt, title="Daily bike sharing rents (2Y)", label=nothing) -``` - -## Decision Trees - -We start our regression task with Decision Trees. - -Training a decision tree consists in choosing the set of questions (in a hierarchical way, so as to form indeed a "decision tree") that "best" split the training dataset, in the sense that each split generates sub-samples (always 2 sub-samples in the BetaML implementation) that are, for the characteristic we want to predict, as homogeneous as possible. Decision trees are among the few ML algorithms that have an intuitive interpretation and can be used for both regression and classification tasks.
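-To make the "best split" idea concrete, here is a toy sketch of a split search for a single numerical feature (our own illustration, not the BetaML algorithm): it picks the threshold that minimises the size-weighted variance of the target in the two resulting sub-samples. - -```text -using Statistics -function best_threshold(xcol, y) -    best_t, best_score = first(xcol), Inf -    for t in sort(unique(xcol)) -        left  = y[xcol .<= t] -        right = y[xcol .>  t] -        (length(left) < 2 || length(right) < 2) && continue -        score = length(left)*var(left) + length(right)*var(right) # weighted impurity of the two sub-samples -        if score < best_score -            best_t, best_score = t, score -        end -    end -    return best_t -end -``` -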
- -### Data preparation - -The first step is to prepare the data for the analysis. This already depends on the model we want to employ, as some models "accept" almost everything as input, no matter if the data is numerical or categorical, or whether it has missing values or not... while other models are much more demanding, and require more work to "clean up" our dataset. - -The tutorial starts using Decision Tree and Random Forest models, which definitely belong to the first group, so the only thing we have to do is to select the input variables (the "feature matrix", which we will indicate with "X") and the variable representing our output (the information we want to learn to predict; we call it "y"): - -```text -x = Matrix{Float64}(data[:,[:instant,:season,:yr,:mnth,:holiday,:weekday,:workingday,:weathersit,:temp,:atemp,:hum,:windspeed]]) -y = data[:,16]; -nothing #hide -``` - -We finally set up a dataframe to store the relative mean errors of the various models we'll use. - -```text -results = DataFrame(model=String[],train_rme=Float64[],test_rme=Float64[]) -``` - -### Model selection - -We can now split the dataset between the data that we will use for training the algorithm and selecting the hyperparameters (`xtrain`/`ytrain`) and those for testing the quality of the algorithm with the optimal hyperparameters (`xtest`/`ytest`). We use the `partition` function specifying the share we want to use for these two different subsets, here 75% and 25% respectively. As our data indeed represents a time series, we want our model to be able to predict the _future_ demand for bike sharing from the _past_, observed rentals, so we do not shuffle the dataset, as would otherwise be the default. - -```text -((xtrain,xtest),(ytrain,ytest)) = partition([x,y],[0.75,1-0.75],shuffle=false) -(ntrain, ntest) = size.([ytrain,ytest],1) -``` - -Then we define the model we want to use, [`DecisionTreeEstimator`](@ref) in this case, and we create an instance of the model: - -```text -m_dt = DecisionTreeEstimator(autotune=true, rng=copy(AFIXEDRNG)) -``` - -Passing a fixed Random Number Generator (RNG) to the `rng` parameter guarantees that every time we use the model with the same data (from the model creation down to value prediction) we obtain the same results. In particular, BetaML provides `FIXEDRNG`, an instance of `StableRNG` that guarantees reproducibility even across different Julia versions. See the section ["Dealing with stochasticity"](@ref stochasticity_reproducibility) for details. -Note the `autotune` parameter. BetaML has perhaps the easiest method for automatically tuning the model hyperparameters (which thus become, in this way, _learned_ parameters). Indeed, in most cases it is enough to pass the attribute `autotune=true` to the model constructor and the hyperparameter search will be automatically performed on the first `fit!` call. -If needed we can customise the hyperparameter tuning, choosing the tuning method with the parameter `tunemethod`.
The single-line above is equivalent to: - -```text -tuning_method = SuccessiveHalvingSearch( -   hpranges = Dict("max_depth" =>[5,10,nothing], "min_gain"=>[0.0, 0.1, 0.5], "min_records"=>[2,3,5],"max_features"=>[nothing,5,10,30]), -   loss = l2loss_by_cv, -   res_shares = [0.05, 0.2, 0.3], -   multithreads = true -) -m_dt = DecisionTreeEstimator(autotune=true, rng=copy(AFIXEDRNG), tunemethod=tuning_method) -``` - -Note that the defaults change according to the specific model; for example, [`RandomForestEstimator`](@ref) autotuning defaults to not being multithreaded, as the individual model is already multithreaded. - -!!! tip -    Refer to the versions of this tutorial for BetaML <= 0.6 for a good exercise on how to perform model selection using the [`cross_validation`](@ref) function, or even by custom grid search. - -We can now fit the model, that is, learn the model parameters that lead to the best predictions from the data. By default (unless we use `cache=false` in the model constructor) the model also stores the training predictions, so we can just use `fit!()` instead of `fit!()` followed by `predict(model,xtrain)` - -```text -ŷtrain = fit!(m_dt,xtrain,ytrain) -``` - -The above code produces a fitted `DecisionTreeEstimator` object that can be used to make predictions given some new features, i.e. given a new X matrix of (number of observations x dimensions), predict the corresponding Y vector of scalars in $\mathbb{R}$. - -```text -ŷtest = predict(m_dt, xtest) -``` - -We now compute the mean relative error for the training and the test set. The [`relative_mean_error`](@ref) is a very flexible error function. Without additional parameters, it computes, as the name says, the _relative mean error_ between an estimated and a true vector. -However it can also compute the _mean relative error_, also known as the "mean absolute percentage error" ([MAPE](https://en.wikipedia.org/wiki/Mean_absolute_percentage_error)), or use a p-norm higher than 1. -The _mean relative error_ emphasises the relativeness of the error, i.e. all observations and dimensions weigh the same, whether large or small. Conversely, in the _relative mean error_ the same relative error on larger observations (or dimensions) weighs more. -In this tutorial we use the latter, as our data clearly has some outlier days with very few rentals, and we care more about avoiding our customers finding empty bike racks than about having unrented bikes on the rack. Targeting a low mean relative error would push all our predictions down to try to accommodate the low-rental days (to avoid a large relative error there), and that's not what we want.
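-For intuition, one common way to write the two notions just discussed is the following (a sketch of ours for illustration; the exact BetaML implementation may differ in details such as the norm used): - -```text -using Statistics -rme(y, ŷ)  = sum(abs.(ŷ .- y)) / sum(abs.(y)) # relative mean error: larger records weigh more -mape(y, ŷ) = mean(abs.(ŷ .- y) ./ abs.(y))    # mean relative error (MAPE): every record weighs the same -``` -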
-We can then compute the relative mean error for the decision tree - -```text -rme_train = relative_mean_error(ytrain,ŷtrain) # 0.1367 -rme_test  = relative_mean_error(ytest,ŷtest) # 0.1547 -``` - -And we save the relative mean errors in the `results` dataframe: - -```text -push!(results,["DT",rme_train,rme_test]); -nothing #hide -``` - -We can plot the true labels vs the estimated ones for the training and testing subsets... - -```text -scatter(ytrain,ŷtrain,xlabel="daily rides",ylabel="est. daily rides",label=nothing,title="Est vs. obs in training period (DT)") -``` - -```text -scatter(ytest,ŷtest,xlabel="daily rides",ylabel="est. daily rides",label=nothing,title="Est vs. obs in testing period (DT)") -``` - -Or we can visualise the true vs estimated bike sharing demand on a temporal basis. -First on the full period (2 years)... - -```text -ŷtrainfull = vcat(ŷtrain,fill(missing,ntest)) -ŷtestfull = vcat(fill(missing,ntrain), ŷtest) -plot(data[:,:dteday],[data[:,:cnt] ŷtrainfull ŷtestfull], label=["obs" "train" "test"], legend=:topleft, ylabel="daily rides", title="Daily bike sharing demand observed/estimated across the\n whole 2-years period (DT)") -``` - -...and then focusing on the testing period - -```text -stc = ntrain -endc = size(x,1) -plot(data[stc:endc,:dteday],[data[stc:endc,:cnt] ŷtestfull[stc:endc]], label=["obs" "test"], legend=:bottomleft, ylabel="Daily rides", title="Focus on the testing period (DT)") -``` - -The predictions aren't so bad in this case; however, decision trees are highly unstable, and the output could have depended on the specific initial random seed. - -## Random Forests -Rather than trying to solve this problem using a single Decision Tree model, let's now try to use a _Random Forest_ model. Random forests average the results of many different decision trees and provide a more "stable" result. -Being made of many decision trees, random forests are however more computationally expensive to train. - -```text -m_rf      = RandomForestEstimator(autotune=true, oob=true, rng=copy(AFIXEDRNG)) -ŷtrain    = fit!(m_rf,xtrain,ytrain); -ŷtest     = predict(m_rf,xtest); -rme_train = relative_mean_error(ytrain,ŷtrain) # 0.056 -rme_test  = relative_mean_error(ytest,ŷtest) # 0.161 -push!(results,["RF",rme_train,rme_test]); -nothing #hide -``` - -While slower than individual decision trees, random forests remain relatively fast. We should also consider that they are by default efficiently parallelised, so their speed increases with the number of available cores (in building this documentation page, GitHub CI servers allow for a single core, so all the benchmarks you see in this tutorial are run with a single core available). - -Random forests support the so-called "out-of-bag" error, an estimation of the error that we would have when the model is applied to a testing sample. -However, in this case the reported oob error is much smaller than the testing error we will actually find. This is due to the fact that the division between training/validation and testing in this exercise is not random, but has a temporal basis. It seems that in this example the data in validation/testing follows a different pattern/variance than that in training (in probabilistic terms, the daily observations are not i.i.d.).
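-As a side note (ours, for intuition): the "out of bag" error exists because each tree is trained on a bootstrap sample of the records, so any given record is left out of a given tree with probability $(1-1/n)^n \approx e^{-1} \approx 0.37$ and can be used to validate that tree without a separate test set: - -```text -n = 731            # number of daily records in this dataset (2 years) -p_oob = (1 - 1/n)^n # ≈ 0.368: expected share of trees for which a record is "out of bag" -``` -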
-```text -info(m_rf) -oob_error, rme_test = info(m_rf)["oob_errors"],relative_mean_error(ytest,ŷtest) -``` - -In this case we found an error very similar to the one obtained employing a single decision tree. Let's plot the observed data vs the estimated ones using the random forest, and then along the temporal axis: - -```text -scatter(ytrain,ŷtrain,xlabel="daily rides",ylabel="est. daily rides",label=nothing,title="Est vs. obs in training period (RF)") -``` - -```text -scatter(ytest,ŷtest,xlabel="daily rides",ylabel="est. daily rides",label=nothing,title="Est vs. obs in testing period (RF)") -``` - -Full period plot (2 years): - -```text -ŷtrainfull = vcat(ŷtrain,fill(missing,ntest)) -ŷtestfull = vcat(fill(missing,ntrain), ŷtest) -plot(data[:,:dteday],[data[:,:cnt] ŷtrainfull ŷtestfull], label=["obs" "train" "test"], legend=:topleft, ylabel="daily rides", title="Daily bike sharing demand observed/estimated across the\n whole 2-years period (RF)") -``` - -Focus on the testing period: - -```text -stc = 620 -endc = size(x,1) -plot(data[stc:endc,:dteday],[data[stc:endc,:cnt] ŷtrainfull[stc:endc] ŷtestfull[stc:endc]], label=["obs" "train" "test"], legend=:bottomleft, ylabel="Daily rides", title="Focus on the testing period (RF)") -``` - -### Comparison with DecisionTree.jl random forest - -We now compare our results with those obtained employing the same model in the [DecisionTree package](https://github.com/bensadeghi/DecisionTree.jl), using the hyperparameters of the optimal BetaML Random Forest model: - -```text -best_rf_hp = hyperparameters(m_rf) -``` - -Hyperparameters for the DecisionTree.jl random forest model: - -```text -n_subfeatures=isnothing(best_rf_hp.max_features) ? -1 : best_rf_hp.max_features; n_trees=best_rf_hp.n_trees; partial_sampling=0.7; max_depth=isnothing(best_rf_hp.max_depth) ? typemax(Int64) : best_rf_hp.max_depth; -min_samples_leaf=best_rf_hp.min_records; min_samples_split=best_rf_hp.min_records; min_purity_increase=best_rf_hp.min_gain; -nothing #hide -``` - -We train the model... - -```text -model = DecisionTree.build_forest(ytrain, convert(Matrix,xtrain), -                     n_subfeatures, -                     n_trees, -                     partial_sampling, -                     max_depth, -                     min_samples_leaf, -                     min_samples_split, -                     min_purity_increase; -                     rng = seed) -``` - -And we generate predictions and measure their error: - -```text -(ŷtrain,ŷtest) = DecisionTree.apply_forest.([model],[xtrain,xtest]); - - -(rme_train, rme_test) = relative_mean_error.([ytrain,ytest],[ŷtrain,ŷtest]) # 0.022 and 0.304 -push!(results,["RF (DecisionTree.jl)",rme_train,rme_test]); -nothing #hide -``` - -While the train error is very small, the error on the test set remains relatively high. The very low error level on the training set is a sign that it over-specialised on the training set; we would have done better to run a dedicated hyper-parameter tuning function for the DecisionTree.jl model (we did try using the default `DecisionTree.jl` parameters, but we obtained roughly the same results). - -Finally we plot the DecisionTree.jl predictions alongside the observed values: - -```text -ŷtrainfull = vcat(ŷtrain,fill(missing,ntest)) -ŷtestfull = vcat(fill(missing,ntrain), ŷtest) -plot(data[:,:dteday],[data[:,:cnt] ŷtrainfull ŷtestfull], label=["obs" "train" "test"], legend=:topleft, ylabel="daily rides", title="Daily bike sharing demand observed/estimated across the\n whole 2-years period (DT.jl RF)") -``` - -Again, focusing on the testing data: - -```text -stc = ntrain -endc = size(x,1) -plot(data[stc:endc,:dteday],[data[stc:endc,:cnt] ŷtestfull[stc:endc]], label=["obs" "test"], legend=:bottomleft, ylabel="Daily rides", title="Focus on the testing period (DT.jl RF)") -``` - -### Conclusions of Decision Trees / Random Forests methods -The error obtained employing DecisionTree.jl is significantly larger than that obtained using a BetaML random forest model, although to be fair we didn't tune DecisionTree.jl's hyper-parameters. Also, the DecisionTree.jl random forest model is much faster. -This is partially due to the fact that, internally, DecisionTree.jl models optimise the algorithm by sorting the observations. BetaML trees/forests don't employ this optimisation, and hence they can work with true categorical data for which an ordering is not defined. Another explanation of this difference in speed is that BetaML Random Forest models accept `missing` values within the feature matrix. -To sum up, BetaML random forests are ideal algorithms when we want to obtain good predictions in the simplest way, even without manually tuning the hyper-parameters, and without spending time cleaning ("munging") the feature matrix, as they accept almost "any kind" of data as it is.
- -## Neural Networks - -BetaML provides only _deep forward neural networks_, artificial neural network units where the individual "nodes" are arranged in _layers_, from the _input layer_, where each unit holds the input coordinate, through various _hidden layer_ transformations, until the actual _output_ of the model: - -![Neural Networks](imgs/nn_scheme.png) - -In this layerwise computation, each unit in a particular layer takes input from _all_ the preceding layer units and has its own parameters that are adjusted to perform the overall computation. The _training_ of the network consists in retrieving the coefficients that minimise a _loss_ function between the output of the model and the known data. -In particular, a _deep_ (feedforward) neural network refers to a neural network that contains not only the input and output layers, but also (a variable number of) hidden layers in between. - -Neural networks accept only numerical inputs. We hence need to convert all categorical data into numerical units. A common approach is to use the so-called "one-hot-encoding", where the categorical values are converted into indicator variables (0/1), one for each possible value. This can be done in BetaML using the [`OneHotEncoder`](@ref) function: - -```text -seasonDummies = fit!(OneHotEncoder(),data.season) -weatherDummies = fit!(OneHotEncoder(),data.weathersit) -wdayDummies = fit!(OneHotEncoder(),data.weekday .+ 1) - - -# We compose the feature matrix with the new dimensions obtained from the onehotencoder functions -x = hcat(Matrix{Float64}(data[:,[:instant,:yr,:mnth,:holiday,:workingday,:temp,:atemp,:hum,:windspeed]]), -         seasonDummies, -         weatherDummies, -         wdayDummies) -y = data[:,16]; -nothing #hide -``` - -As we did for decision trees/random forests, we split the data into training and testing sets - -```text -((xtrain,xtest),(ytrain,ytest)) = partition([x,y],[0.75,1-0.75],shuffle=false) -(ntrain, ntest) = size.([ytrain,ytest],1) -``` - -Another common operation with neural networks is to scale the feature vectors (X) and the labels (Y). The BetaML [`Scaler`](@ref) model, by default, scales the data such that each dimension has mean 0 and variance 1. - -Note that we can provide the `Scaler` model with different scale factors, or specify the columns that shouldn't be scaled (e.g. those resulting from the one-hot encoding). Finally, we can reverse the scaling (this is useful to retrieve the unscaled features from a model trained with scaled ones). - -```text -cols_nottoscale = [2;4;5;10:23] -xsm             = Scaler(skip=cols_nottoscale) -xtrain_scaled   = fit!(xsm,xtrain) -xtest_scaled    = predict(xsm,xtest) -ytrain_scaled   = ytrain ./ 1000 # We just divide Y by 1000, as using full scaling of Y we may get negative demand. -ytest_scaled    = ytest ./ 1000 -D               = size(xtrain,2) -```
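-For intuition, here is what such a standard scaling does under the hood (a minimal sketch of ours, not the `Scaler` internals, and ignoring the skipped columns): - -```text -using Statistics -μ, σ = mean(xtrain, dims=1), std(xtrain, dims=1) -xtrain_std = (xtrain .- μ) ./ σ   # fit the parameters on the training data and transform it -xtest_std  = (xtest  .- μ) ./ σ   # reuse the *training* parameters on the test data -x_back     = xtrain_std .* σ .+ μ # the inverse transform recovers the original values -``` -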
We can now build our feed-forward neural network. We create three layers: the first layer will always have an input size equal to the dimensions of our data (the number of columns), and the output layer, for a simple regression where the predictions are scalars, will always have size one. We will tune the size of the middle layer. - -There are already several kinds of layers available (and you can build your own kind by defining a new `struct` and implementing a few functions; see the [`Nn`](@ref nn_module) module documentation for details). Here we use only _dense_ layers, those found in typical feed-forward neural networks. - -For each layer, on top of its size (in "neurons") we can specify an _activation function_. Here we use the [`relu`](@ref) for the terminal layer (this will guarantee that our predictions are always positive) and `identity` for the hidden layer. Again, consult the `Nn` module documentation for other activation functions already defined, or use any function of your choice. - -Initial weight parameters can also be specified if needed. By default [`DenseLayer`](@ref) uses the so-called _Xavier initialisation_. - -Let's hence build our candidate neural network structures, choosing between 5 and 10 nodes in the hidden layers: - -```text -candidate_structures = [ -        [DenseLayer(D,k,f=relu,df=drelu,rng=copy(AFIXEDRNG)),        # Activation function is ReLU, its derivative is drelu -         DenseLayer(k,k,f=identity,df=didentity,rng=copy(AFIXEDRNG)), # This is the hidden layer whose size we want to tune -         DenseLayer(k,1,f=relu,df=drelu,rng=copy(AFIXEDRNG))] for k in 5:2:10] -``` - -Note that specifying the derivatives of the activation functions (and of the loss function that we'll see in a moment) is totally optional, as without them BetaML will use [`Zygote.jl`](https://github.com/FluxML/Zygote.jl) for automatic differentiation. - -We also set a few other parameters as "tunable": the number of "epochs" to train the model (the number of iterations through the whole dataset), the batch sample size, and the optimisation algorithm to use. -Several optimisation algorithms are indeed available, and each accepts different parameters, like the _learning rate_ for the Stochastic Gradient Descent algorithm ([`SGD`](@ref), used by default) or the exponential decay rates for the moment estimates for the [`ADAM`](@ref) algorithm (which we use here, with the default parameters). - -The hyperparameter ranges will then look as follows: - -```text -hpranges = Dict("layers" => candidate_structures, -                "epochs" => rand(copy(AFIXEDRNG),DiscreteUniform(50,100),3), # 3 values sampled at random between 50 and 100 -                "batch_size" => [4,8,16], -                "opt_alg" => [SGD(λ=2),SGD(λ=1),SGD(λ=3),ADAM(λ=0.5),ADAM(λ=1),ADAM(λ=0.25)]) -``` - -Finally we can build the [`NeuralNetworkEstimator`](@ref) model, where we "chain" the layers together and assign a final loss function (again, you can provide your own loss function if those available in BetaML don't suit your needs): - -```text -nnm = NeuralNetworkEstimator(loss=squared_cost, descr="Bike sharing regression model", tunemethod=SuccessiveHalvingSearch(hpranges = hpranges), autotune=true,rng=copy(AFIXEDRNG)) # Build the NN model and use the squared cost (aka MSE) as error function by default -``` - -We can now fit and autotune the model: - -```text -ŷtrain_scaled = fit!(nnm,xtrain_scaled,ytrain_scaled) -``` - -The model training is one order of magnitude slower than random forests, although the memory requirement is approximately the same.
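-For intuition on what `SuccessiveHalvingSearch` is doing during that `fit!` call, here is a rough sketch of the general successive-halving idea (our own illustration, not the BetaML code): candidates are evaluated on growing shares of the data, and only the best-performing half survives each round. - -```text -function successive_halving(candidates, evaluate, res_shares) -    for share in res_shares                                   # growing data shares, e.g. [0.05, 0.2, 0.3] -        scores = [evaluate(c, share) for c in candidates]     # cheap evaluation on a subset of the data -        keep = max(1, length(candidates) ÷ 2) -        candidates = candidates[sortperm(scores)[1:keep]]     # keep the best half (lowest loss) -    end -    return first(candidates) -end -``` -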
- -To obtain the neural network predictions we apply the function `predict` to the feature matrix X for which we want to generate predictions, and then we rescale ŷ. -Normally we would apply the `inverse_predict` function here, but as we simply divided by 1000, we multiply ŷ by the same amount: - -```text -ŷtrain = ŷtrain_scaled .* 1000 -ŷtest  = predict(nnm,xtest_scaled) .* 1000 -``` - -```text -(rme_train, rme_test) = relative_mean_error.([ytrain,ytest],[ŷtrain,ŷtest]) -push!(results,["NN",rme_train,rme_test]); -nothing #hide -``` - -The error is much lower. Let's plot our predictions: - -Again, we can start by plotting the estimated vs the observed values: - -```text -scatter(ytrain,ŷtrain,xlabel="daily rides",ylabel="est. daily rides",label=nothing,title="Est vs. obs in training period (NN)") -``` - -```text -scatter(ytest,ŷtest,xlabel="daily rides",ylabel="est. daily rides",label=nothing,title="Est vs. obs in testing period (NN)") -``` - -We now plot across the time dimension, first plotting the whole period (2 years): - -```text -ŷtrainfull = vcat(ŷtrain,fill(missing,ntest)) -ŷtestfull = vcat(fill(missing,ntrain), ŷtest) -plot(data[:,:dteday],[data[:,:cnt] ŷtrainfull ŷtestfull], label=["obs" "train" "test"], legend=:topleft, ylabel="daily rides", title="Daily bike sharing demand observed/estimated across the\n whole 2-years period (NN)") -``` - -...and then focusing on the testing data - -```text -stc = 620 -endc = size(x,1) -plot(data[stc:endc,:dteday],[data[stc:endc,:cnt] ŷtestfull[stc:endc]], label=["obs" "val" "test"], legend=:bottomleft, ylabel="Daily rides", title="Focus on the testing period (NN)") -``` - -### Comparison with Flux.jl - -We now apply the same Neural Network model using the [Flux](https://fluxml.ai/) framework, a dedicated neural network library, reusing the optimal parameters that we learned from tuning `NeuralNetworkEstimator`: - -```text -hp_opt         = hyperparameters(nnm) -opt_size       = size(hp_opt.layers[1])[2][1] -opt_batch_size = hp_opt.batch_size -opt_epochs     = hp_opt.epochs -``` - -We fix the default random number generator so that the Flux example gives a reproducible output - -```text -Random.seed!(seed) -``` - -We define the Flux neural network model and load it with data... - -```text -l1      = Flux.Dense(D,opt_size,Flux.relu) -l2      = Flux.Dense(opt_size,opt_size,identity) -l3      = Flux.Dense(opt_size,1,Flux.relu) -Flux_nn = Flux.Chain(l1,l2,l3) -fluxloss(x, y) = Flux.mse(Flux_nn(x), y) -ps      = Flux.params(Flux_nn) -nndata  = Flux.Data.DataLoader((xtrain_scaled', ytrain_scaled'), batchsize=opt_batch_size,shuffle=true) -``` - -We do the training of the Flux model... - -```text -opt = Flux.ADAM(0.001, (0.9, 0.8)) # create the optimiser once, so its moment estimates persist across epochs -for i in 1:opt_epochs -    Flux.train!(fluxloss, ps, nndata, opt) -end -``` - -We obtain the predictions... - -```text -ŷtrainf = Flux_nn(xtrain_scaled')' .* 1000; -ŷtestf  = Flux_nn(xtest_scaled')' .* 1000; -nothing #hide -``` - -...and we compute the mean relative errors... - -```text -(rme_train, rme_test) = relative_mean_error.([ytrain,ytest],[ŷtrainf,ŷtestf]) -push!(results,["NN (Flux.jl)",rme_train,rme_test]); -nothing #hide -``` - -...finding an error not significantly different from the one obtained from BetaML.Nn. - -Plots: - -```text -scatter(ytrain,ŷtrainf,xlabel="daily rides",ylabel="est. daily rides",label=nothing,title="Est vs. obs in training period (Flux.NN)") -``` - -```text -scatter(ytest,ŷtestf,xlabel="daily rides",ylabel="est. daily rides",label=nothing,title="Est vs. obs in testing period (Flux.NN)") -```
- -```text -ŷtrainfullf = vcat(ŷtrainf,fill(missing,ntest)) -ŷtestfullf = vcat(fill(missing,ntrain), ŷtestf) -plot(data[:,:dteday],[data[:,:cnt] ŷtrainfullf ŷtestfullf], label=["obs" "train" "test"], legend=:topleft, ylabel="daily rides", title="Daily bike sharing demand observed/estimated across the\n whole 2-years period (Flux.NN)") -``` - -```text -stc = 620 -endc = size(x,1) -plot(data[stc:endc,:dteday],[data[stc:endc,:cnt] ŷtestfullf[stc:endc]], label=["obs" "val" "test"], legend=:bottomleft, ylabel="Daily rides", title="Focus on the testing period (Flux.NN)") -``` - -### Conclusions of Neural Network models - -If we strive for the most accurate predictions, deep neural networks are usually the best choice. However they are computationally expensive, so with limited resources we may get better results by fine-tuning and running many repetitions of "simpler" decision trees, or even random forest models, than from a large neural network with insufficient hyper-parameter tuning. -Also, we should consider that decision trees/random forests are much simpler to work with. - -That said, specialised neural network libraries, like Flux, allow the use of GPUs and specialised hardware, letting neural networks scale to very large datasets. - -Still, for small and medium datasets, BetaML provides simpler yet customisable solutions that are accurate and fast. - -## GMM-based regressors - -BetaML 0.8 introduces new regression algorithms based on Gaussian Mixture Models. -Specifically, there are two variants available, `GaussianMixtureRegressor2` and `GaussianMixtureRegressor`; this example uses `GaussianMixtureRegressor`. -As for neural networks, they work on numerical data only, so we reuse the datasets we prepared for the neural networks. - -As usual we first define the model. - -```text -m = GaussianMixtureRegressor(rng=copy(AFIXEDRNG),verbosity=NONE) -``` - -!!! info -    We disabled autotune here, as this code is run by GitHub continuous integration servers on each code update, and GitHub servers seem to have some strange problem with it, taking almost 4 hours instead of a few seconds on my machine. - -We then fit the model to the training data... - -```text -ŷtrainGMM_unscaled = fit!(m,xtrain_scaled,ytrain_scaled) -``` - -And we predict... - -```text -ŷtrainGMM = ŷtrainGMM_unscaled .* 1000; -ŷtestGMM  = predict(m,xtest_scaled) .* 1000; - -(rme_train, rme_test) = relative_mean_error.([ytrain,ytest],[ŷtrainGMM,ŷtestGMM]) -push!(results,["GMM",rme_train,rme_test]); -nothing #hide -```
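-For intuition, the prediction of such a mixture-based regressor can be thought of, schematically, as a weighted average of per-cluster estimates of the label, with weights given by the record's estimated cluster-membership probabilities (this is the general idea, not necessarily the exact BetaML formulation): $\hat{y}(x) = \sum_{k=1}^{K} p(k \mid x) \, \mu_{y,k}$, where $p(k \mid x)$ is the posterior probability that record $x$ belongs to component $k$ and $\mu_{y,k}$ is that component's mean label.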
- -## Summary - -This is the summary of the results (train and test relative mean error) obtained trying to predict the daily bike sharing demand, given weather and calendar information: - -```text -println(results) -``` - -You may ask how stable these results are: how much do they depend on the specific RNG seed? We re-evaluated the whole script a couple of times, changing the random seed (to `1000` and `10000`): - -| Model                | Train rme1 | Test rme1 | Train rme2 | Test rme2 | Train rme3 | Test rme3 | -|:-------------------- |:----------:|:---------:|:----------:|:---------:|:----------:|:---------:| -| DT                   | 0.1366960  | 0.154720  | 0.0233044  | 0.249329  | 0.0621571  | 0.161657  | -| RF                   | 0.0421267  | 0.180186  | 0.0535776  | 0.136920  | 0.0386144  | 0.141606  | -| RF (DecisionTree.jl) | 0.0230439  | 0.235823  | 0.0801040  | 0.243822  | 0.0168764  | 0.219011  | -| NN                   | 0.1604000  | 0.169952  | 0.1091330  | 0.121496  | 0.1481440  | 0.150458  | -| NN (Flux.jl)         | 0.0931161  | 0.166228  | 0.0920796  | 0.167047  | 0.0907810  | 0.122469  | -| GaussianMixtureRegressor* | 0.1432800 | 0.293891 | 0.1380340 | 0.295470 | 0.1477570 | 0.284567 | - -* GMM is a deterministic model; the variations are due to the different random sampling when choosing the best hyperparameters. - -Neural networks can be more precise than random forest models, but they are more computationally expensive (and trickier to set up). When we compared BetaML with the algorithm-specific leading packages, we found similar results in terms of accuracy, but the leading packages are often better optimised and run more efficiently (sometimes at the cost of being less versatile). -GMM-based regressors are computationally very cheap and a good compromise if accuracy can be traded off for performance. - -[View this file on Github](betaml_tutorial_regression_sharingBikes.jl). - ---- - -*This page was generated using [Literate.jl](https://github.com/fredrikekre/Literate.jl).* - diff --git a/loss_by_var.png b/loss_by_var.png deleted file mode 100644 index de891ea0..00000000 Binary files a/loss_by_var.png and /dev/null differ diff --git a/plot1.png b/plot1.png deleted file mode 100644 index c8a94bdf..00000000 Binary files a/plot1.png and /dev/null differ diff --git a/plot2.png b/plot2.png deleted file mode 100644 index d3bc4ed4..00000000 Binary files a/plot2.png and /dev/null differ diff --git a/plot3.png b/plot3.png deleted file mode 100644 index ae99526d..00000000 Binary files a/plot3.png and /dev/null differ diff --git a/plot4.png b/plot4.png deleted file mode 100644 index c840da1e..00000000 Binary files a/plot4.png and /dev/null differ diff --git a/plot_2.png b/plot_2.png deleted file mode 100644 index 3e8bf0a2..00000000 --- a/plot_2.png +++ /dev/null @@ -1,50 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - \ No newline at end of file diff --git a/sobol_by_var.png b/sobol_by_var.png deleted file mode 100644 index 4b2c0f57..00000000 Binary files a/sobol_by_var.png and /dev/null differ diff --git a/sobol_ny_var.png b/sobol_ny_var.png deleted file mode 100644 index a5db1ddf..00000000 Binary files a/sobol_ny_var.png and /dev/null differ diff --git a/test.jld2 b/test.jld2 deleted file mode 100644 index 47deb69e..00000000 Binary files a/test.jld2 and /dev/null differ diff --git a/test/temp_test.jl b/test/temp_test.jl deleted file mode 100644 index e21531e1..00000000 --- a/test/temp_test.jl +++ /dev/null @@ -1,154 +0,0 @@ -using Test -using DelimitedFiles, LinearAlgebra, Statistics #, MLDatasets - -#using StableRNGs -#rng = StableRNG(123) -using BetaML - -import BetaML.Nn: buildNetwork, forward, loss, backward, train!, get_nparams, _get_n_layers_weights -import BetaML.Nn: ConvLayer, ReshaperLayer, _zComp! -TESTRNG = FIXEDRNG # This could change...
-#TESTRNG = StableRNG(123) - - - -x = reshape(1:100*3*3*2,100,3*3*2) ./ 100 -#x = rand(100,18) -y = [norm(r[1:9])+2*norm(r[10:18],2) for r in eachrow(x) ] -(N,D) = size(x) -l1 = ReshaperLayer((D,1),(3,3,2)) -l2 = ConvLayer((3,3),(2,2),2,3,rng=copy(TESTRNG),f=identity) -l3 = ConvLayer(size(l2)[2],(2,2),8,rng=copy(TESTRNG), f=identity) -l4 = ReshaperLayer(size(l3)[2]) -l5 = DenseLayer(size(l4)[2][1],1,f=relu, rng=copy(TESTRNG)) -layers = [l1,l2,l3,l4,l5] -mynn = buildNetwork(layers,squared_cost,name="Regression with a convolutional layer") -preprocess!(mynn) -predict(mynn,x[1,:]') - -using BenchmarkTools # needed for the @btime macros used below -l1out = forward(l1,x[1,:]) -l2out = forward(l2,l1out) -@btime forward(l2,l1out)   # benchmark with global (non-interpolated) variables -@btime forward($l2,$l1out) # benchmark with interpolated variables - -_, output_size = size(l2) -z = zeros(output_size) -@btime _zComp!($z,$l1out,$l2.weight,$l2.bias,$l2.y_ids,$l2.x_ids,$l2.w_ids,$l2.usebias) -@btime _zComp!($z,$l2,$l1out) - -@btime zeros(size($l2)[2]) - - -train!(mynn,x,y,epochs=60,verbosity=NONE,rng=copy(TESTRNG)) -ŷ = predict(mynn,x) -rmeTrain = relative_mean_error(y,ŷ,normrec=false) -@test rmeTrain < 0.01 - -@btime train!($mynn,$x,$y,epochs=60,verbosity=NONE,rng=copy(TESTRNG)) - -# original (already int64 and no return value): 2.988 s (117156427 allocations: 3.13 GiB) -# with vector instead of array: 1.111 s (44415127 allocations: 772.20 MiB) -# with _dedxComp!: 777.724 ms (22815127 allocations: 442.61 MiB) -# with _dedwComp!: 410.060 ms (1215127 allocations: 113.02 MiB) -# with all inbounds: 256.673 ms (1215127 allocations: 113.02 MiB) -y_id = [3,2,1,2] -x_id = [1,2,2,1] -w_id = [2,3,2,1] - -x = [1.5,2.5] -w = [2.0,3.0,4.0] - - -function foo!(y,x,w,y_id,x_id,w_id) -    for i in 1:length(y_id) -        y[y_id[i]] += x[x_id[i]] * w[w_id[i]] -    end -    return y -end - -foo!(zeros(3),x,w,y_id,x_id,w_id) # pass a fresh output vector: the function mutates its first argument - - -# --------------------------------------------------- -y_id = [(3,1),(2,2),(2,2),(2,1)] -x_id = [(1,2),(2,1),(1,1),(2,2)] -w_id = [(2,2),(3,2),(2,1),(1,1)] - -x = [1.5 2.5; 2.0 1.0] -w = [2.0 3.0 4.0; 1.0 1.5 2.5; 0.5 1.0 0.5] - -y = zeros(3,2) - -function foo!(y,x,w,y_id,x_id,w_id) -    for i in 1:length(y_id) -        y[y_id[i][1],y_id[i][2]] += x[x_id[i][1],x_id[i][2]] * w[w_id[i][1],w_id[i][2]] -    end -    return y -end - - -foo!(y,x,w,y_id,x_id,w_id) -@btime foo!($(zeros(3,2)),$x,$w,$y_id,$x_id,$w_id) # $(zeros(3,2)) interpolates the freshly created array into the benchmark - - - -# ------------------------------------------------------------------------------ - -using StaticArrays, BenchmarkTools - -mutable struct ConvLayerTest4{TI<:Integer} # proper type parameter: naming it `Int32` would merely shadow the real Int32 type -    x_ids::Vector{SVector{3,TI}} -    y_ids::Vector{SVector{3,TI}} -    w_ids::Vector{SVector{4,TI}} -    w::Array{Float64,4} -    somethingelse -end - -# Data generation for the MWE... -x = rand(64,64,3) -y = zeros(32,32,5) -w = rand(4,4,3,5) -N = 3000 -x_ids = [SVector{3,Int64}([rand(1:id) for id in size(x)]...) for n in 1:N] -y_ids = [SVector{3,Int64}([rand(1:id) for id in size(y)]...) for n in 1:N] -w_ids = [SVector{4,Int64}([rand(1:id) for id in size(w)]...) for n in 1:N] -layer = ConvLayerTest4(x_ids, y_ids, w_ids, w, "foo") - -function compute!(y,l,x) -    for i in 1:length(l.y_ids) -        y[l.y_ids[i][1],l.y_ids[i][2],l.y_ids[i][3]] += -            x[l.x_ids[i][1],l.x_ids[i][2],l.x_ids[i][3]] * -            l.w[l.w_ids[i][1],l.w_ids[i][2],l.w_ids[i][3],l.w_ids[i][4]] -    end -    return nothing -end - -function compute!(y,x,w,y_ids,x_ids,w_ids) -    for i in 1:length(y_ids) -        y[y_ids[i][1],y_ids[i][2],y_ids[i][3]] += -            x[x_ids[i][1],x_ids[i][2],x_ids[i][3]] * -            w[w_ids[i][1],w_ids[i][2],w_ids[i][3],w_ids[i][4]] -    end -    return nothing -end - -# The computation that I care about...
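-# (Explanatory note, ours: storing the index triplets as small StaticArrays `SVector`s -# keeps them stack-allocated and type-stable, so the inner loops of `compute!` can run -# without heap allocations; that allocation behaviour is what the two @btime calls below compare.)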
-@btime compute!($y,$layer,$x) -@btime compute!($y,$x,$w,$y_ids,$x_ids,$w_ids) - -# compute! returns nothing, so to check that the two variants agree we compare the accumulated outputs, not the return values -ya = zeros(32,32,5) -compute!(ya,layer,x) -yb = zeros(32,32,5) -compute!(yb,x,w,y_ids,x_ids,w_ids) -ya == yb - - - -# ------------------------------------------------------------------------------ - -# Dispatch on Val types: the true/false branch is selected at compile time -foo(x,::Val{true}) = println("t $x") - -foo(x,::Val{false}) = println("f $x") - -foo(10,Val(true)) \ No newline at end of file