
Commit 8a79646

Added sampling_share option to RF and aligned RF hyperparameters in benchmark

sylvaticus committed Mar 26, 2024
1 parent a448a42 commit 8a79646
Showing 4 changed files with 43 additions and 37 deletions.
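Before the file-by-file diff, a minimal sketch of how the new option is meant to be used (assumptions: the post-commit BetaML API with its usual fit!/predict workflow; the data and the 0.8 value are made up for illustration):

    using BetaML

    X = rand(300, 4)                                   # toy feature matrix
    y = X * [1.0, 2.0, 0.5, -1.0] .+ 0.1 .* rand(300)  # toy regression target

    # sampling_share scales the bootstrap sample drawn for each tree:
    # the default 1.0 keeps the classic N-draws-with-replacement bagging.
    m = RandomForestEstimator(n_trees=30, sampling_share=0.8)
    fit!(m, X, y)
    ŷ = predict(m, X)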
2 changes: 1 addition & 1 deletion docs/Manifest.toml
@@ -2,7 +2,7 @@

julia_version = "1.10.0"
manifest_format = "2.0"
project_hash = "18e8039b3de791d9b34e11bdeca5dbd5a0cd9fca"
project_hash = "a6a64beac05fb4bb6ed762665c2730d213492a2a"

[[deps.ANSIColoredPrinters]]
git-tree-sha1 = "574baf8110975760d391c710b6341da1afa48d8c"
1 change: 1 addition & 0 deletions docs/Project.toml
@@ -3,6 +3,7 @@
BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
BetaML = "024491cd-cc6b-443e-8034-08ea7eb7db2b"
CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b"
Clustering = "aaaa29a8-35af-508c-8bc3-b662a17a0fe5"
+Conda = "8f4d0f93-b110-5947-807f-2305c1781a2d"
DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8"
DecisionTree = "7806a523-6efd-50cb-b5f6-3fa6f1930dbb"
68 changes: 35 additions & 33 deletions docs/src/Benchmarks.md
@@ -3,7 +3,7 @@
This benchmark page allows us to quickly check for regressions across versions.
As it is run and compiled using GitHub Actions, which may be powered by different computational resources, timing results are normalized using SystemBenchmark.

-This page also provides a basic, far-from-exhaustive, comparison with other leading Julia libraries for the same model, USING DEFAULT VALUES. Note that this could imply very different important hyperparameters.
+This page also provides a basic, far-from-exhaustive, comparison with other leading Julia libraries for the same model. Note that while we tried to match the hyperparameters to the BetaML defaults, differences still occur and may in part explain the models' different performances and scores.

This file is intended just for benchmarking, not as a tutorial, and it doesn't employ a full ML workflow, just the minimum preprocessing needed for the algorithms to work.

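As a side note, the SystemBenchmark-based normalization mentioned above can be obtained along these lines (a sketch, not necessarily the exact scheme used by this page's build; the `factor` column name and the median aggregation are assumptions):

    using SystemBenchmark, BenchmarkTools, Statistics

    res  = runbenchmark()               # benchmark the machine running the CI job
    comp = comparetoref(res)            # compare against SystemBenchmark's reference machine
    speed_factor = median(comp.factor)  # assumed: aggregate the per-test speed factors
    raw_time = @belapsed sum(rand(1000))
    normalized_time = raw_time / speed_factor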
@@ -87,9 +87,9 @@ end
### DecisionTree
Random.seed!(123)
dt_models = OrderedDict("DT (DecisionTrees.jl)"=>DecisionTree.DecisionTreeRegressor(),
"RF (DecisionTrees.jl)"=>DecisionTree.RandomForestRegressor(),
);
dt_models = OrderedDict("DT (DecisionTrees.jl)"=>DecisionTree.DecisionTreeRegressor(rng=copy(TESTRNG)),
"RF (DecisionTrees.jl)"=>DecisionTree.RandomForestRegressor(n_trees=30, partial_sampling=1.0, rng=copy(TESTRNG)),
)
for (mname,m) in dt_models
#mname = "DT"
@@ -230,8 +230,8 @@ end
Random.seed!(123)
dt_models = OrderedDict("DT (DT.jl)"=>DecisionTree.DecisionTreeClassifier(),
"RF (DT.jl)"=>DecisionTree.RandomForestClassifier(),
dt_models = OrderedDict("DT (DT.jl)"=>DecisionTree.DecisionTreeClassifier(rng=copy(TESTRNG)),
"RF (DT.jl)"=>DecisionTree.RandomForestClassifier(n_trees=30, partial_sampling=1.0, rng=copy(TESTRNG)),
);
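For clarity, the alignment assumed by the two hunks above, side by side (hypothetical snippet; TESTRNG stands in for the fixed RNG the page defines elsewhere):

    using BetaML, DecisionTree, StableRNGs
    TESTRNG = StableRNG(123)  # assumption: the page seeds its own fixed RNG

    # BetaML's RF defaults and the DecisionTree.jl keywords that mirror them
    betaml_rf = RandomForestEstimator(n_trees=30, sampling_share=1.0, rng=copy(TESTRNG))
    dt_rf     = DecisionTree.RandomForestRegressor(n_trees=30, partial_sampling=1.0, rng=copy(TESTRNG))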
@@ -388,34 +388,36 @@ for (mname,f) in othcl_functions
#mname = "GMM"
println("Processing model $mname ... ")
Random.seed!(123)
-    old_logger = Logging.global_logger()
-    oldstdout = stdout
-    redirect_stdout(devnull)
-    global_logger(NullLogger())
-    bres = @benchmark $f($x)
-    m_time = median(bres.times)
-    m_memory = bres.memory
-    m_allocs = bres.allocs
-    sampler = KFold(nsplits=10,rng=copy(TESTRNG));
-    cv_out = cross_validation([x,y],sampler,return_statistics=false) do trainData,testData,rng
-        # For unsupervised learning we use only the train data.
-        # Also, we use the associated labels only to measure the performances
-        (xtrain,ytrain) = trainData;
-        ŷtrain = f(xtrain)
-        acc_train = accuracy(ytrain,ŷtrain,ignorelabels=true)
-        pd = pairwise(xtrain)
-        sil_score = mean(silhouette(pd,ŷtrain))
-        return (acc_train, sil_score)
-    end
-    acc_mean = mean([r[1] for r in cv_out])
-    acc_std = std([r[1] for r in cv_out])
-    sil_mean = mean([r[2] for r in cv_out])
-    sil_std = std([r[2] for r in cv_out])
-    push!(bm_clustering,[mname, m_time, m_memory, m_allocs, acc_mean, acc_std, sil_mean, sil_std])
-    redirect_stdout(oldstdout)
-    global_logger(old_logger)
+    with_logger(NullLogger()) do
+        #old_logger = Logging.global_logger()
+        #oldstdout = stdout
+        #redirect_stdout(devnull)
+        #global_logger(NullLogger())
+        bres = @benchmark $f($x)
+        m_time = median(bres.times)
+        m_memory = bres.memory
+        m_allocs = bres.allocs
+        sampler = KFold(nsplits=10,rng=copy(TESTRNG));
+        cv_out = cross_validation([x,y],sampler,return_statistics=false) do trainData,testData,rng
+            # For unsupervised learning we use only the train data.
+            # Also, we use the associated labels only to measure the performances
+            (xtrain,ytrain) = trainData;
+            ŷtrain = f(xtrain)
+            acc_train = accuracy(ytrain,ŷtrain,ignorelabels=true)
+            pd = pairwise(xtrain)
+            sil_score = mean(silhouette(pd,ŷtrain))
+            return (acc_train, sil_score)
+        end
+        acc_mean = mean([r[1] for r in cv_out])
+        acc_std = std([r[1] for r in cv_out])
+        sil_mean = mean([r[2] for r in cv_out])
+        sil_std = std([r[2] for r in cv_out])
+        push!(bm_clustering,[mname, m_time, m_memory, m_allocs, acc_mean, acc_std, sil_mean, sil_std])
+        #redirect_stdout(oldstdout)
+        #global_logger(old_logger)
+    end
@test acc_mean >= 0.6
end
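The rewrite above trades the manual save/redirect/restore logger bookkeeping for `with_logger`, which installs the logger only for the duration of the `do` block and restores the previous one automatically (it silences the logging macros, not raw prints to stdout). A minimal sketch of the pattern; note that variables assigned inside the block are local to it, so anything needed afterwards, such as `acc_mean` for the `@test`, has to be returned from the block:

    using Logging

    acc_mean = with_logger(NullLogger()) do
        @info "this message is suppressed"
        0.75          # the do-block's last value is what with_logger returns
    end
    @assert acc_mean == 0.75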
9 changes: 6 additions & 3 deletions src/Trees/RandomForests.jl
@@ -47,6 +47,8 @@ Base.@kwdef mutable struct RandomForestE_hp <: BetaMLHyperParametersSet
min_records::Int64 = 2
"The maximum number of (random) features to consider when choosing the optimal partition of the dataset [def: `nothing`, i.e. square root of the dimensions of the training data`]"
max_features::Union{Nothing,Int64} = nothing
"Share of samples to bootstrap for each individual tree [def: `1.0`]"
sampling_share::Float64 = 1.0
"Whether to force a classification task even if the labels are numerical (typically when labels are integers encoding some feature rather than representing a real cardinal measure) [def: `false`]"
force_classification::Bool = false
"Either `gini`, `entropy` or `variance`. This is the name of the function to be used to compute the information gain of a specific partition. This is done by measuring the difference betwwen the \"impurity\" of the labels of the parent node with those of the two child nodes, weighted by the respective number of items. [def: `nothing`, i.e. `gini` for categorical labels (classification task) and `variance` for numerical labels(regression task)]. It can be an anonymous function."
@@ -199,7 +201,7 @@ See [`buildTree`](@ref). The function has all the parameters of `buildTree` (with
- This function optionally reports a weight distribution of the performances of each individual tree, as measured using the records it has not been trained with. These weights can then be (optionally) used in the `predict` function. The parameter `β ≥ 0` regulates the distribution of these weights: the larger `β`, the greater the importance (hence the weight) attached to the best-performing trees compared to the low-performing ones. Using these weights can significantly improve the forest's performance (especially with small forests); however, the correct value of `β` depends on the problem under examination (and the chosen characteristics of the random forest estimator) and should be cross-validated to avoid over-fitting.
- Note that this function uses multiple threads if these are available. You can check the number of threads available with `Threads.nthreads()`. To set the number of threads in Julia either set the environment variable `JULIA_NUM_THREADS` (before starting Julia) or start Julia with the command line option `--threads` (most integrated development environments for Julia already set the number of threads to 4).
"""
-function buildForest(x, y::AbstractArray{Ty,1}, n_trees=30; max_depth = size(x,1), min_gain=0.0, min_records=2, max_features=Int(round(sqrt(size(x,2)))), force_classification=false, splitting_criterion = (Ty <: Number && !force_classification) ? variance : gini, integer_encoded_cols=nothing, fast_algorithm=false, β=0, oob=false,rng = Random.GLOBAL_RNG, verbosity=NONE) where {Ty}
+function buildForest(x, y::AbstractArray{Ty,1}, n_trees=30; max_depth = size(x,1), min_gain=0.0, min_records=2, max_features=Int(round(sqrt(size(x,2)))), sampling_share=1.0, force_classification=false, splitting_criterion = (Ty <: Number && !force_classification) ? variance : gini, integer_encoded_cols=nothing, fast_algorithm=false, β=0, oob=false,rng = Random.GLOBAL_RNG, verbosity=NONE) where {Ty}
# Force what would be a regression task into a classification task
if force_classification && Ty <: Number
y = string.(y)
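To make the β mechanics concrete, here is a hypothetical weighting scheme with the described behavior (for illustration only, not necessarily BetaML's exact formula):

    oob_scores = [0.70, 0.80, 0.90]  # hypothetical per-tree out-of-bag accuracies
    β = 5.0
    w = exp.(β .* oob_scores)
    w ./= sum(w)  # β = 0 gives uniform weights; larger β concentrates weight on the best trees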
@@ -228,7 +230,7 @@ function buildForest(x, y::AbstractArray{Ty,1}, n_trees=30; max_depth = size(x,1
Threads.@threads for i in 1:n_trees
tsrng = rngs[Threads.threadid()] # Thread safe random number generator
Random.seed!(tsrng,masterSeed+i*10)
-toSample = rand(tsrng, 1:N,N)
+toSample = rand(tsrng, 1:N, Int(round(N*sampling_share)))
notToSample = setdiff(1:N,toSample)
bootstrappedx = x[toSample,:] # "boosted" is different than "bootstrapped": https://towardsdatascience.com/random-forest-and-its-implementation-71824ced454f
bootstrappedy = y[toSample]
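A quick worked example of what the changed line implies (values assumed): with N = 1000 records and sampling_share = 0.75, each tree is trained on 750 draws with replacement, so the expected out-of-bag share rises from (1 - 1/N)^N ≈ e^{-1} ≈ 37% to (1 - 1/N)^750 ≈ e^{-0.75} ≈ 47%.

    N = 1000; sampling_share = 0.75
    n_draws  = Int(round(N * sampling_share))  # 750 bootstrap draws per tree instead of 1000
    oob_frac = (1 - 1/N)^n_draws               # ≈ 0.47 of the records are left out-of-bag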
@@ -280,6 +282,7 @@ function fit!(m::RandomForestEstimator,x,y::AbstractArray{Ty,1}) where {Ty}
# Setting shortcuts to other hyperparameters/options....
min_gain = m.hpar.min_gain
min_records = m.hpar.min_records
+sampling_share = m.hpar.sampling_share
force_classification = m.hpar.force_classification
n_trees = m.hpar.n_trees
fast_algorithm = m.hpar.fast_algorithm
@@ -290,7 +293,7 @@ function fit!(m::RandomForestEstimator,x,y::AbstractArray{Ty,1}) where {Ty}
rng = m.opt.rng
verbosity = m.opt.verbosity

-forest = buildForest(x, y, n_trees; max_depth = max_depth, min_gain=min_gain, min_records=min_records, max_features=max_features, force_classification=force_classification, splitting_criterion = splitting_criterion, fast_algorithm=fast_algorithm, integer_encoded_cols=integer_encoded_cols, β=β, oob=false, rng = rng)
+forest = buildForest(x, y, n_trees; max_depth = max_depth, min_gain=min_gain, min_records=min_records, sampling_share=sampling_share, max_features=max_features, force_classification=force_classification, splitting_criterion = splitting_criterion, fast_algorithm=fast_algorithm, integer_encoded_cols=integer_encoded_cols, β=β, oob=false, rng = rng)

m.par = RF_lp(forest,Tynm)

