diff --git a/docs/Manifest.toml b/docs/Manifest.toml
index 0a7103e..662841b 100644
--- a/docs/Manifest.toml
+++ b/docs/Manifest.toml
@@ -2,7 +2,7 @@
 
 julia_version = "1.10.0"
 manifest_format = "2.0"
-project_hash = "18e8039b3de791d9b34e11bdeca5dbd5a0cd9fca"
+project_hash = "a6a64beac05fb4bb6ed762665c2730d213492a2a"
 
 [[deps.ANSIColoredPrinters]]
 git-tree-sha1 = "574baf8110975760d391c710b6341da1afa48d8c"
diff --git a/docs/Project.toml b/docs/Project.toml
index 2e042fc..5350106 100644
--- a/docs/Project.toml
+++ b/docs/Project.toml
@@ -3,6 +3,7 @@ BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
 BetaML = "024491cd-cc6b-443e-8034-08ea7eb7db2b"
 CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b"
 Clustering = "aaaa29a8-35af-508c-8bc3-b662a17a0fe5"
+Conda = "8f4d0f93-b110-5947-807f-2305c1781a2d"
 DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
 DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8"
 DecisionTree = "7806a523-6efd-50cb-b5f6-3fa6f1930dbb"
diff --git a/docs/src/Benchmarks.md b/docs/src/Benchmarks.md
index e4d6d4b..ce1a938 100644
--- a/docs/src/Benchmarks.md
+++ b/docs/src/Benchmarks.md
@@ -3,7 +3,7 @@
 This benchmark page allows us to quickly check for regressions across versions.
 As it is run and compiled using GitHub actions, and these may be powered by different computational resources, timing results are normalized using SystemBenchmark.
 
-This page also provides a basic, far-from-exhaustive, comparison with other leading Julia libraries for the same model, USING DEFAULT VALUES. Note that this could imply very different important hyperparameters.
+This page also provides a basic, far-from-exhaustive comparison with other leading Julia libraries for the same model. Note that while we did try to match the hyperparameters to the BetaML defaults, differences still occur and may in part explain the models' different performances and scores.
 
 This file is intended just for benchmarking, not so much as a tutorial, and it doesn't employ a full ML workflow, just the minimum preprocessing such that the algorithms work.
@@ -87,9 +87,9 @@ end
 ### DecisionTree
 
 Random.seed!(123)
-dt_models = OrderedDict("DT (DecisionTrees.jl)"=>DecisionTree.DecisionTreeRegressor(),
-                        "RF (DecisionTrees.jl)"=>DecisionTree.RandomForestRegressor(),
-);
+dt_models = OrderedDict("DT (DecisionTrees.jl)"=>DecisionTree.DecisionTreeRegressor(rng=copy(TESTRNG)),
+                        "RF (DecisionTrees.jl)"=>DecisionTree.RandomForestRegressor(n_trees=30, partial_sampling=1.0, rng=copy(TESTRNG)),
+)
 
 for (mname,m) in dt_models #mname = "DT"
@@ -230,8 +230,8 @@ end
 
 Random.seed!(123)
 
-dt_models = OrderedDict("DT (DT.jl)"=>DecisionTree.DecisionTreeClassifier(),
-                        "RF (DT.jl)"=>DecisionTree.RandomForestClassifier(),
+dt_models = OrderedDict("DT (DT.jl)"=>DecisionTree.DecisionTreeClassifier(rng=copy(TESTRNG)),
+                        "RF (DT.jl)"=>DecisionTree.RandomForestClassifier(n_trees=30, partial_sampling=1.0, rng=copy(TESTRNG)),
 );
@@ -388,34 +388,36 @@ for (mname,f) in othcl_functions #mname = "GMM"
     println("Processing model $mname ... ")
     Random.seed!(123)
-    old_logger = Logging.global_logger()
-    oldstdout = stdout
-    redirect_stdout(devnull)
-    global_logger(NullLogger())
-
-    bres = @benchmark $f($x)
-    m_time = median(bres.times)
-    m_memory = bres.memory
-    m_allocs = bres.allocs
-
-    sampler = KFold(nsplits=10,rng=copy(TESTRNG));
-    cv_out = cross_validation([x,y],sampler,return_statistics=false) do trainData,testData,rng
-        # For unsupervised learning we use only the train data.
-        # Also, we use the associated labels only to measure the performances
-        (xtrain,ytrain) = trainData;
-        ŷtrain = f(xtrain)
-        acc_train = accuracy(ytrain,ŷtrain,ignorelabels=true)
-        pd = pairwise(xtrain)
-        sil_score = mean(silhouette(pd,ŷtrain))
-        return (acc_train, sil_score)
+    acc_mean = with_logger(NullLogger()) do
+        #old_logger = Logging.global_logger()
+        #oldstdout = stdout
+        #redirect_stdout(devnull)
+        #global_logger(NullLogger())
+
+        bres = @benchmark $f($x)
+        m_time = median(bres.times)
+        m_memory = bres.memory
+        m_allocs = bres.allocs
+
+        sampler = KFold(nsplits=10,rng=copy(TESTRNG));
+        cv_out = cross_validation([x,y],sampler,return_statistics=false) do trainData,testData,rng
+            # For unsupervised learning we use only the train data.
+            # Also, we use the associated labels only to measure the performances
+            (xtrain,ytrain) = trainData;
+            ŷtrain = f(xtrain)
+            acc_train = accuracy(ytrain,ŷtrain,ignorelabels=true)
+            pd = pairwise(xtrain)
+            sil_score = mean(silhouette(pd,ŷtrain))
+            return (acc_train, sil_score)
+        end
+        acc_mean = mean([r[1] for r in cv_out])
+        acc_std = std([r[1] for r in cv_out])
+        sil_mean = mean([r[2] for r in cv_out])
+        sil_std = std([r[2] for r in cv_out])
+        push!(bm_clustering,[mname, m_time, m_memory, m_allocs, acc_mean, acc_std, sil_mean, sil_std])
+        #redirect_stdout(oldstdout)
+        #global_logger(old_logger)
+        acc_mean # return acc_mean from the logger block so that the @test below can see it
     end
-    acc_mean = mean([r[1] for r in cv_out])
-    acc_std = std([r[1] for r in cv_out])
-    sil_mean = mean([r[2] for r in cv_out])
-    sil_std = std([r[2] for r in cv_out])
-    push!(bm_clustering,[mname, m_time, m_memory, m_allocs, acc_mean, acc_std, sil_mean, sil_std])
-    redirect_stdout(oldstdout)
-    global_logger(old_logger)
     @test acc_mean >= 0.6
 end
diff --git a/src/Trees/RandomForests.jl b/src/Trees/RandomForests.jl
index 97cf6b2..5e08678 100644
--- a/src/Trees/RandomForests.jl
+++ b/src/Trees/RandomForests.jl
@@ -47,6 +47,8 @@ Base.@kwdef mutable struct RandomForestE_hp <: BetaMLHyperParametersSet
     min_records::Int64 = 2
     "The maximum number of (random) features to consider when choosing the optimal partition of the dataset [def: `nothing`, i.e. square root of the dimensions of the training data]"
     max_features::Union{Nothing,Int64} = nothing
+    "Share of samples to bootstrap for each individual tree [def: `1.0`]"
+    sampling_share::Float64 = 1.0
     "Whether to force a classification task even if the labels are numerical (typically when labels are integers encoding some feature rather than representing a real cardinal measure) [def: `false`]"
     force_classification::Bool = false
     "Either `gini`, `entropy` or `variance`. This is the name of the function to be used to compute the information gain of a specific partition. This is done by measuring the difference between the \"impurity\" of the labels of the parent node and those of the two child nodes, weighted by the respective number of items. [def: `nothing`, i.e. `gini` for categorical labels (classification task) and `variance` for numerical labels (regression task)]. It can be an anonymous function."
@@ -199,7 +201,7 @@ See [`buildTree`](@ref). The function has all the parameters of `buildTree` (with
 - This function optionally reports a weight distribution of the performance of each individual tree, as measured using the records it has not been trained with. These weights can then be (optionally) used in the `predict` function. The parameter `β ≥ 0` regulates the distribution of these weights: the larger `β` is, the greater the importance (hence the weight) attached to the best-performing trees compared to the low-performing ones.
  Using these weights can significantly improve the forest's performance (especially with small forests); however, the correct value of β depends on the problem under examination (and the chosen characteristics of the random forest estimator) and should be cross-validated to avoid over-fitting.
 - Note that this function uses multiple threads if these are available. You can check the number of threads available with `Threads.nthreads()`. To set the number of threads in Julia either set the environment variable `JULIA_NUM_THREADS` (before starting Julia) or start Julia with the command line option `--threads` (most integrated development editors for Julia already set the number of threads to 4).
 """
-function buildForest(x, y::AbstractArray{Ty,1}, n_trees=30; max_depth = size(x,1), min_gain=0.0, min_records=2, max_features=Int(round(sqrt(size(x,2)))), force_classification=false, splitting_criterion = (Ty <: Number && !force_classification) ? variance : gini, integer_encoded_cols=nothing, fast_algorithm=false, β=0, oob=false,rng = Random.GLOBAL_RNG, verbosity=NONE) where {Ty}
+function buildForest(x, y::AbstractArray{Ty,1}, n_trees=30; max_depth = size(x,1), min_gain=0.0, min_records=2, max_features=Int(round(sqrt(size(x,2)))), sampling_share=1.0, force_classification=false, splitting_criterion = (Ty <: Number && !force_classification) ? variance : gini, integer_encoded_cols=nothing, fast_algorithm=false, β=0, oob=false,rng = Random.GLOBAL_RNG, verbosity=NONE) where {Ty}
     # Force what would be a regression task into a classification task
     if force_classification && Ty <: Number
         y = string.(y)
@@ -228,7 +230,7 @@ function buildForest(x, y::AbstractArray{Ty,1}, n_trees=30; max_depth = size(x,1
     Threads.@threads for i in 1:n_trees
         tsrng = rngs[Threads.threadid()] # Thread safe random number generator
         Random.seed!(tsrng,masterSeed+i*10)
-        toSample = rand(tsrng, 1:N,N)
+        toSample = rand(tsrng, 1:N, Int(round(N*sampling_share)))
         notToSample = setdiff(1:N,toSample)
         bootstrappedx = x[toSample,:] # "boosted" is different than "bootstrapped": https://towardsdatascience.com/random-forest-and-its-implementation-71824ced454f
         bootstrappedy = y[toSample]
@@ -280,6 +282,7 @@ function fit!(m::RandomForestEstimator,x,y::AbstractArray{Ty,1}) where {Ty}
     # Setting shortcuts to other hyperparameters/options....
     min_gain = m.hpar.min_gain
     min_records = m.hpar.min_records
+    sampling_share = m.hpar.sampling_share
     force_classification = m.hpar.force_classification
     n_trees = m.hpar.n_trees
     fast_algorithm = m.hpar.fast_algorithm
@@ -290,7 +293,7 @@ function fit!(m::RandomForestEstimator,x,y::AbstractArray{Ty,1}) where {Ty}
     rng = m.opt.rng
     verbosity = m.opt.verbosity
 
-    forest = buildForest(x, y, n_trees; max_depth = max_depth, min_gain=min_gain, min_records=min_records, max_features=max_features, force_classification=force_classification, splitting_criterion = splitting_criterion, fast_algorithm=fast_algorithm, integer_encoded_cols=integer_encoded_cols, β=β, oob=false, rng = rng)
+    forest = buildForest(x, y, n_trees; max_depth = max_depth, min_gain=min_gain, min_records=min_records, sampling_share=sampling_share, max_features=max_features, force_classification=force_classification, splitting_criterion = splitting_criterion, fast_algorithm=fast_algorithm, integer_encoded_cols=integer_encoded_cols, β=β, oob=false, rng = rng)
 
     m.par = RF_lp(forest,Tynm)
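The net effect of the `RandomForests.jl` changes above is that each tree is now bootstrapped on `Int(round(N*sampling_share))` records instead of always `N`, mirroring the `partial_sampling` argument that the benchmark passes to DecisionTree.jl. Below is a minimal sketch of how the new hyperparameter can be set through the `RandomForestEstimator` model API (`fit!`/`predict`), as seen in this diff; the data, the `0.75` value and the `Xoshiro` seeds are illustrative only and are not taken from the benchmark:

```julia
# Minimal usage sketch of the new `sampling_share` hyperparameter (illustrative values).
using BetaML, Random

x = rand(Xoshiro(123), 200, 4)                              # 200 records, 4 features
y = 2 .* x[:,1] .- x[:,3] .+ 0.1 .* rand(Xoshiro(124), 200) # toy regression target

# Each of the 30 trees is grown on round(200 * 0.75) = 150 records drawn with
# replacement; sampling_share=1.0 (the default) keeps the previous behaviour.
m = RandomForestEstimator(n_trees=30, sampling_share=0.75, rng=Xoshiro(123))
fit!(m, x, y)
ŷ = predict(m, x)
```

With `sampling_share=1.0` and `n_trees=30`, the BetaML defaults and the DecisionTree.jl settings used in the benchmark (`n_trees=30, partial_sampling=1.0`) should now draw bootstrap samples of the same size.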
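For the `Benchmarks.md` refactor, the silencing now relies on the `do`-block form of `Logging.with_logger`, which restores the previous logger automatically when the block exits (hence the `do` on the `with_logger(NullLogger())` line above). A small self-contained sketch, with illustrative messages:

```julia
using Logging

result = with_logger(NullLogger()) do
    @info "this message is swallowed"  # logging macros are silenced inside the block
    42                                 # the block's last value is returned by with_logger
end
@info "the previous logger is active again; result = $result"
```

Note that, unlike the previous `redirect_stdout(devnull)`, `with_logger` only affects the logging macros (`@info`, `@warn`, ...), not direct `println` output, so any remaining `println` calls in the benchmarked code will still be printed.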