
Commit 8a79646

Added sampling_share option to RF and aligned RF hyperparameters in benchmark

sylvaticus committed Mar 26, 2024
1 parent a448a42 commit 8a79646
Showing 4 changed files with 43 additions and 37 deletions.
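Before the file-by-file diff, a minimal sketch of how the new option is meant to be used (assumptions: the post-commit BetaML API with its usual fit!/predict workflow; the data and the 0.8 value are made up for illustration):

    using BetaML

    X = rand(300, 4)                                   # toy feature matrix
    y = X * [1.0, 2.0, 0.5, -1.0] .+ 0.1 .* rand(300)  # toy regression target

    # sampling_share scales the bootstrap sample drawn for each tree:
    # the default 1.0 keeps the classic N-draws-with-replacement bagging.
    m = RandomForestEstimator(n_trees=30, sampling_share=0.8)
    fit!(m, X, y)
    ŷ = predict(m, X)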
2 changes: 1 addition & 1 deletion docs/Manifest.toml
@@ -2,7 +2,7 @@

julia_version = "1.10.0"
manifest_format = "2.0"
project_hash = "18e8039b3de791d9b34e11bdeca5dbd5a0cd9fca"
project_hash = "a6a64beac05fb4bb6ed762665c2730d213492a2a"

[[deps.ANSIColoredPrinters]]
git-tree-sha1 = "574baf8110975760d391c710b6341da1afa48d8c"
1 change: 1 addition & 0 deletions docs/Project.toml
@@ -3,6 +3,7 @@
BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
BetaML = "024491cd-cc6b-443e-8034-08ea7eb7db2b"
CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b"
Clustering = "aaaa29a8-35af-508c-8bc3-b662a17a0fe5"
+Conda = "8f4d0f93-b110-5947-807f-2305c1781a2d"
DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8"
DecisionTree = "7806a523-6efd-50cb-b5f6-3fa6f1930dbb"
68 changes: 35 additions & 33 deletions docs/src/Benchmarks.md
@@ -3,7 +3,7 @@
This benchmark page allows us to quickly check for regressions across versions.
As it is run and compiled using GitHub Actions, which may be powered by different computational resources, timing results are normalized using SystemBenchmark.

-This page also provides a basic, far-from-exhaustive, comparison with other leading Julia libraries for the same model, USING DEFAULT VALUES. Note that this could imply very different important hyperparameters.
+This page also provides a basic, far-from-exhaustive, comparison with other leading Julia libraries for the same model. Note that while we tried to match the hyperparameters to the BetaML defaults, differences still occur and may in part explain the models' different performances and scores.

This file is intended just for benchmarking, not as a tutorial, and it doesn't employ a full ML workflow, just the minimum preprocessing needed for the algorithms to work.

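As a side note, the SystemBenchmark-based normalization mentioned above can be obtained along these lines (a sketch, not necessarily the exact scheme used by this page's build; the `factor` column name and the median aggregation are assumptions):

    using SystemBenchmark, BenchmarkTools, Statistics

    res  = runbenchmark()               # benchmark the machine running the CI job
    comp = comparetoref(res)            # compare against SystemBenchmark's reference machine
    speed_factor = median(comp.factor)  # assumed: aggregate the per-test speed factors
    raw_time = @belapsed sum(rand(1000))
    normalized_time = raw_time / speed_factor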
@@ -87,9 +87,9 @@ end
### DecisionTree
Random.seed!(123)
dt_models = OrderedDict("DT (DecisionTrees.jl)"=>DecisionTree.DecisionTreeRegressor(),
"RF (DecisionTrees.jl)"=>DecisionTree.RandomForestRegressor(),
);
dt_models = OrderedDict("DT (DecisionTrees.jl)"=>DecisionTree.DecisionTreeRegressor(rng=copy(TESTRNG)),
"RF (DecisionTrees.jl)"=>DecisionTree.RandomForestRegressor(n_trees=30, partial_sampling=1.0, rng=copy(TESTRNG)),
)
for (mname,m) in dt_models
#mname = "DT"
@@ -230,8 +230,8 @@ end
Random.seed!(123)
dt_models = OrderedDict("DT (DT.jl)"=>DecisionTree.DecisionTreeClassifier(),
"RF (DT.jl)"=>DecisionTree.RandomForestClassifier(),
dt_models = OrderedDict("DT (DT.jl)"=>DecisionTree.DecisionTreeClassifier(rng=copy(TESTRNG)),
"RF (DT.jl)"=>DecisionTree.RandomForestClassifier(n_trees=30, partial_sampling=1.0, rng=copy(TESTRNG)),
);
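For clarity, the alignment assumed by the two hunks above, side by side (hypothetical snippet; TESTRNG stands in for the fixed RNG the page defines elsewhere):

    using BetaML, DecisionTree, StableRNGs
    TESTRNG = StableRNG(123)  # assumption: the page seeds its own fixed RNG

    # BetaML's RF defaults and the DecisionTree.jl keywords that mirror them
    betaml_rf = RandomForestEstimator(n_trees=30, sampling_share=1.0, rng=copy(TESTRNG))
    dt_rf     = DecisionTree.RandomForestRegressor(n_trees=30, partial_sampling=1.0, rng=copy(TESTRNG))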
@@ -388,34 +388,36 @@ for (mname,f) in othcl_functions
#mname = "GMM"
println("Processing model $mname ... ")
Random.seed!(123)
-    old_logger = Logging.global_logger()
-    oldstdout = stdout
-    redirect_stdout(devnull)
-    global_logger(NullLogger())
-    bres = @benchmark $f($x)
-    m_time = median(bres.times)
-    m_memory = bres.memory
-    m_allocs = bres.allocs
-    sampler = KFold(nsplits=10,rng=copy(TESTRNG));
-    cv_out = cross_validation([x,y],sampler,return_statistics=false) do trainData,testData,rng
-        # For unsupervised learning we use only the train data.
-        # Also, we use the associated labels only to measure the performances
-        (xtrain,ytrain) = trainData;
-        ŷtrain = f(xtrain)
-        acc_train = accuracy(ytrain,ŷtrain,ignorelabels=true)
-        pd = pairwise(xtrain)
-        sil_score = mean(silhouette(pd,ŷtrain))
-        return (acc_train, sil_score)
-    end
-    acc_mean = mean([r[1] for r in cv_out])
-    acc_std = std([r[1] for r in cv_out])
-    sil_mean = mean([r[2] for r in cv_out])
-    sil_std = std([r[2] for r in cv_out])
-    push!(bm_clustering,[mname, m_time, m_memory, m_allocs, acc_mean, acc_std, sil_mean, sil_std])
-    redirect_stdout(oldstdout)
-    global_logger(old_logger)
+    with_logger(NullLogger()) do
+        #old_logger = Logging.global_logger()
+        #oldstdout = stdout
+        #redirect_stdout(devnull)
+        #global_logger(NullLogger())
+        bres = @benchmark $f($x)
+        m_time = median(bres.times)
+        m_memory = bres.memory
+        m_allocs = bres.allocs
+        sampler = KFold(nsplits=10,rng=copy(TESTRNG));
+        cv_out = cross_validation([x,y],sampler,return_statistics=false) do trainData,testData,rng
+            # For unsupervised learning we use only the train data.
+            # Also, we use the associated labels only to measure the performances
+            (xtrain,ytrain) = trainData;
+            ŷtrain = f(xtrain)
+            acc_train = accuracy(ytrain,ŷtrain,ignorelabels=true)
+            pd = pairwise(xtrain)
+            sil_score = mean(silhouette(pd,ŷtrain))
+            return (acc_train, sil_score)
+        end
+        acc_mean = mean([r[1] for r in cv_out])
+        acc_std = std([r[1] for r in cv_out])
+        sil_mean = mean([r[2] for r in cv_out])
+        sil_std = std([r[2] for r in cv_out])
+        push!(bm_clustering,[mname, m_time, m_memory, m_allocs, acc_mean, acc_std, sil_mean, sil_std])
+        #redirect_stdout(oldstdout)
+        #global_logger(old_logger)
+    end
@test acc_mean >= 0.6
end
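The rewrite above trades the manual save/redirect/restore logger bookkeeping for `with_logger`, which installs the logger only for the duration of the `do` block and restores the previous one automatically (it silences the logging macros, not raw prints to stdout). A minimal sketch of the pattern; note that variables assigned inside the block are local to it, so anything needed afterwards, such as `acc_mean` for the `@test`, has to be returned from the block:

    using Logging

    acc_mean = with_logger(NullLogger()) do
        @info "this message is suppressed"
        0.75          # the do-block's last value is what with_logger returns
    end
    @assert acc_mean == 0.75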
9 changes: 6 additions & 3 deletions src/Trees/RandomForests.jl
@@ -47,6 +47,8 @@ Base.@kwdef mutable struct RandomForestE_hp <: BetaMLHyperParametersSet
min_records::Int64 = 2
"The maximum number of (random) features to consider when choosing the optimal partition of the dataset [def: `nothing`, i.e. square root of the dimensions of the training data`]"
max_features::Union{Nothing,Int64} = nothing
"Share of samples to bootstrap for each individual tree [def: `1.0`]"
sampling_share::Float64 = 1.0
"Whether to force a classification task even if the labels are numerical (typically when labels are integers encoding some feature rather than representing a real cardinal measure) [def: `false`]"
force_classification::Bool = false
"Either `gini`, `entropy` or `variance`. This is the name of the function to be used to compute the information gain of a specific partition. This is done by measuring the difference betwwen the \"impurity\" of the labels of the parent node with those of the two child nodes, weighted by the respective number of items. [def: `nothing`, i.e. `gini` for categorical labels (classification task) and `variance` for numerical labels(regression task)]. It can be an anonymous function."
@@ -199,7 +201,7 @@ See [`buildTree`](@ref). The function has all the parameters of `buildTree` (with
- This function optionally reports a weight distribution of the performances of each individual tree, as measured using the records it has not been trained with. These weights can then be (optionally) used in the `predict` function. The parameter `β ≥ 0` regulates the distribution of these weights: the larger `β`, the greater the importance (hence the weight) attached to the best-performing trees compared to the low-performing ones. Using these weights can significantly improve the forest's performance (especially with small forests); however, the correct value of `β` depends on the problem under examination (and the chosen characteristics of the random forest estimator) and should be cross-validated to avoid over-fitting.
- Note that this function uses multiple threads if these are available. You can check the number of threads available with `Threads.nthreads()`. To set the number of threads in Julia either set the environment variable `JULIA_NUM_THREADS` (before starting Julia) or start Julia with the command line option `--threads` (most integrated development environments for Julia already set the number of threads to 4).
"""
-function buildForest(x, y::AbstractArray{Ty,1}, n_trees=30; max_depth = size(x,1), min_gain=0.0, min_records=2, max_features=Int(round(sqrt(size(x,2)))), force_classification=false, splitting_criterion = (Ty <: Number && !force_classification) ? variance : gini, integer_encoded_cols=nothing, fast_algorithm=false, β=0, oob=false,rng = Random.GLOBAL_RNG, verbosity=NONE) where {Ty}
+function buildForest(x, y::AbstractArray{Ty,1}, n_trees=30; max_depth = size(x,1), min_gain=0.0, min_records=2, max_features=Int(round(sqrt(size(x,2)))), sampling_share=1.0, force_classification=false, splitting_criterion = (Ty <: Number && !force_classification) ? variance : gini, integer_encoded_cols=nothing, fast_algorithm=false, β=0, oob=false,rng = Random.GLOBAL_RNG, verbosity=NONE) where {Ty}
# Force what would be a regression task into a classification task
if force_classification && Ty <: Number
y = string.(y)
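To make the β mechanics concrete, here is a hypothetical weighting scheme with the described behavior (for illustration only, not necessarily BetaML's exact formula):

    oob_scores = [0.70, 0.80, 0.90]  # hypothetical per-tree out-of-bag accuracies
    β = 5.0
    w = exp.(β .* oob_scores)
    w ./= sum(w)  # β = 0 gives uniform weights; larger β concentrates weight on the best trees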
@@ -228,7 +230,7 @@ function buildForest(x, y::AbstractArray{Ty,1}, n_trees=30; max_depth = size(x,1
Threads.@threads for i in 1:n_trees
tsrng = rngs[Threads.threadid()] # Thread safe random number generator
Random.seed!(tsrng,masterSeed+i*10)
-toSample = rand(tsrng, 1:N,N)
+toSample = rand(tsrng, 1:N, Int(round(N*sampling_share)))
notToSample = setdiff(1:N,toSample)
bootstrappedx = x[toSample,:] # "boosted" is different than "bootstrapped": https://towardsdatascience.com/random-forest-and-its-implementation-71824ced454f
bootstrappedy = y[toSample]
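A quick worked example of what the changed line implies (values assumed): with N = 1000 records and sampling_share = 0.75, each tree is trained on 750 draws with replacement, so the expected out-of-bag share rises from (1 - 1/N)^N ≈ e^{-1} ≈ 37% to (1 - 1/N)^750 ≈ e^{-0.75} ≈ 47%.

    N = 1000; sampling_share = 0.75
    n_draws  = Int(round(N * sampling_share))  # 750 bootstrap draws per tree instead of 1000
    oob_frac = (1 - 1/N)^n_draws               # ≈ 0.47 of the records are left out-of-bag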
@@ -280,6 +282,7 @@ function fit!(m::RandomForestEstimator,x,y::AbstractArray{Ty,1}) where {Ty}
# Setting shortcuts to other hyperparameters/options....
min_gain = m.hpar.min_gain
min_records = m.hpar.min_records
+sampling_share = m.hpar.sampling_share
force_classification = m.hpar.force_classification
n_trees = m.hpar.n_trees
fast_algorithm = m.hpar.fast_algorithm
@@ -290,7 +293,7 @@ function fit!(m::RandomForestEstimator,x,y::AbstractArray{Ty,1}) where {Ty}
rng = m.opt.rng
verbosity = m.opt.verbosity

-forest = buildForest(x, y, n_trees; max_depth = max_depth, min_gain=min_gain, min_records=min_records, max_features=max_features, force_classification=force_classification, splitting_criterion = splitting_criterion, fast_algorithm=fast_algorithm, integer_encoded_cols=integer_encoded_cols, β=β, oob=false, rng = rng)
+forest = buildForest(x, y, n_trees; max_depth = max_depth, min_gain=min_gain, min_records=min_records, sampling_share=sampling_share, max_features=max_features, force_classification=force_classification, splitting_criterion = splitting_criterion, fast_algorithm=fast_algorithm, integer_encoded_cols=integer_encoded_cols, β=β, oob=false, rng = rng)

m.par = RF_lp(forest,Tynm)

