dmlc · ExpandingMan · Nov 17, 2023 · Oct 8, 2023 · Oct 8, 2023 · Oct 8, 2023
diff --git a/src/booster.jl b/src/booster.jl
@@ -366,9 +366,10 @@ function updateone!(b::Booster, Xy::DMatrix;
                     update_feature_names::Bool=false,
                    )
     xgbcall(XGBoosterUpdateOneIter, b.handle, round_number, Xy.handle)
-    isempty(watchlist) || logeval(b, watchlist, round_number)
+    msg = isempty(watchlist) ? nothing : evaliter(b, watchlist, round_number)  # nothing: no evaluation ran
+    isnothing(msg) || @info(msg)  # only log when there is an evaluation message
     _maybe_update_feature_names!(b, Xy, update_feature_names)
-    b
+    b, msg
 end
 
 function updateone!(b::Booster, Xy::DMatrix, g::AbstractVector{<:Real}, h::AbstractVector{<:Real};
@@ -382,9 +383,10 @@ function updateone!(b::Booster, Xy::DMatrix, g::AbstractVector{<:Real}, h::Abstr
     g = convert(Vector{Cfloat}, g)
     h = convert(Vector{Cfloat}, h)
     xgbcall(XGBoosterBoostOneIter, b.handle, Xy.handle, g, h, length(g))
-    isempty(watchlist) || logeval(b, watchlist, round_number)
+    msg = isempty(watchlist) ? nothing : evaliter(b, watchlist, round_number)  # nothing: no evaluation ran
+    isnothing(msg) || @info(msg)  # only log when there is an evaluation message
     _maybe_update_feature_names!(b, Xy, update_feature_names)
-    b
+    b, msg
 end
 
 """
@@ -422,14 +424,97 @@ Run `num_round` rounds of gradient boosting on [`Booster`](@ref) `b`.
 The first and second derivatives of the loss function (`ℓ′` and `ℓ″` respectively) can be provided
 for custom loss.
 """
-function update!(b::Booster, data, a...; num_round::Integer=1, kw...)
+function update!(b::Booster, data, a...;
+                 num_round::Integer=1,
+                 watchlist=Dict("train"=>data),
+                 early_stopping_rounds::Integer=0,
+                 maximize=false,
+                 kw...,
+                 )
+    # Early stopping monitors evaluation output, so an empty watchlist disables it.
+    early_stop = !isempty(watchlist) && early_stopping_rounds > 0
+    if early_stop
+        @info("Will train until there has been no improvement in $early_stopping_rounds rounds.\n")
+    end
+    best_round = 0
+    best_score = maximize ? -Inf : Inf
     for j ∈ 1:num_round
         round_number = getnrounds(b) + 1
-        updateone!(b, data, a...; round_number, kw...)
+        # `watchlist` is a named keyword here, so `kw...` does not carry it: forward it explicitly.
+        b, msg = updateone!(b, data, a...; round_number, watchlist, kw...)
+        early_stop || continue
+        parsed = extract_metric_value(msg)
+        isnothing(parsed) && continue  # no score could be read this round; nothing to compare
+        score, dataset, metric = parsed
+        if (maximize ? score > best_score : score < best_score)
+            best_score = score
+            best_round = j
+        elseif j - best_round >= early_stopping_rounds
+            @info("Xgboost: Stopping. \n\tBest iteration: $best_round. \n\tNo improvement in $dataset-$metric result in $early_stopping_rounds rounds.")
+            return b
+        end
     end
     b
 end
 
+
+
+"""
+    extract_metric_value(msg, dataset=nothing, metric=nothing)
+
+Extract one evaluation score from `msg`, a whitespace-separated sequence of
+`dataset-metric:value` entries such as `"train-rmse:0.12 eval-rmse:0.13"`. Tokens that do
+not have this shape, or whose value is not a number, are ignored.
+
+A `dataset` or `metric` left as `nothing` matches any name. When several entries match, the
+last one in the message is used, so with no selection at all the last entry is returned.
+
+Entry names are split at the first `-`, so metric names may contain hyphens
+(`gamma-nloglik`) but dataset names may not.
+
+# Arguments
+- `msg::AbstractString`: the evaluation message to search.
+- `dataset::Union{AbstractString, Nothing}`: dataset name to select (default: `nothing`).
+- `metric::Union{AbstractString, Nothing}`: metric name to select (default: `nothing`).
+
+# Returns
+- `(value, dataset, metric)` with `value::Float64` for the selected entry, or `nothing`,
+  after logging a warning, if no entry matches.
+
+# Examples
+```julia
+msg = "train-rmsle:0.095 train-rmse:0.125 eval-rmsle:0.093 eval-rmse:0.121"
+
+extract_metric_value(msg)                    # (0.121, "eval", "rmse")
+extract_metric_value(msg, "train", "rmsle")  # (0.095, "train", "rmsle")
+```
+"""
+function extract_metric_value(msg, dataset=nothing, metric=nothing)
+    found = nothing
+    for entry in split(msg)
+        # Separate the name from the value at the last colon; tokens without a colon or
+        # without a numeric value are not metric entries.
+        parts = rsplit(entry, ':'; limit=2)
+        length(parts) == 2 || continue
+        value = tryparse(Float64, parts[2])
+        isnothing(value) && continue
+
+        # Split at the first hyphen only, so hyphenated metric names stay intact.
+        names = split(parts[1], '-'; limit=2)
+        length(names) == 2 || continue
+        (isnothing(dataset) || names[1] == dataset) || continue
+        (isnothing(metric) || names[2] == metric) || continue
+
+        found = (value, names[1], names[2])  # no break: the last matching entry wins
+    end
+    if isnothing(found)
+        @warn "No match found for pattern: $dataset-$metric in message: $msg"
+    end
+    return found
+end
+
+
+
 """
     xgboost(data; num_round=10, watchlist=Dict(), kw...)
     xgboost(data, ℓ′, ℓ″; kw...)
@@ -441,6 +526,13 @@ followed by [`update!`](@ref) for `nrounds`.
 `watchlist` is a dict the keys of which are strings giving the name of the data to watch
 and the values of which are [`DMatrix`](@ref) objects containing the data.
 
+`early_stopping_rounds`: if `0` (the default), early stopping is disabled. If set to a positive
+integer `k`, training stops once the monitored evaluation score has not improved for `k` rounds.
+
+`maximize`: only used when `early_stopping_rounds` is positive. When `false` (the default), a
+smaller evaluation score is considered better; when `true`, a larger evaluation score is
+considered better. Choose the value that matches the evaluation metric being monitored.
+
 All other keyword arguments are passed to [`Booster`](@ref).  With few exceptions these are model
 training hyper-parameters, see [here](https://xgboost.readthedocs.io/en/stable/parameter.html) for
 a comprehensive list.
@@ -460,12 +552,14 @@ ŷ = predict(b, X)
 function xgboost(dm::DMatrix, a...;
                  num_round::Integer=10,
                  watchlist=Dict("train"=>dm),
+                 early_stopping_rounds::Integer=0,  # 0 leaves early stopping off (update! only acts on values > 0)
+                 maximize=false,  # true: a larger evaluation score is better; false: a smaller one is
                  kw...
                 )
     Xy = DMatrix(dm)
     b = Booster(Xy; kw...)
     isempty(watchlist) || @info("XGBoost: starting training.")
-    update!(b, Xy, a...; num_round, watchlist)
+    update!(b, Xy, a...; num_round, watchlist, early_stopping_rounds, maximize)  # early-stopping options are handled by update!
     isempty(watchlist) || @info("Training rounds complete.")
     b
 end

diff --git a/test/runtests.jl b/test/runtests.jl
@@ -130,6 +130,41 @@ end
     end
 end
 
+
+@testset "Early Stopping rounds" begin
+    # Train the same model twice: once for every round, once with early stopping enabled.
+    train_dm = XGBoost.load(DMatrix, testfilepath("agaricus.txt.train"), format=:libsvm)
+    eval_dm = XGBoost.load(DMatrix, testfilepath("agaricus.txt.test"), format=:libsvm)
+    watched = Dict("eval"=>eval_dm, "train"=>train_dm)
+    patience = 2
+
+    full_run = xgboost(train_dm;
+        num_round=30,
+        watchlist=watched,
+        η=1,
+        objective="binary:logistic",
+        eval_metric=["rmsle","rmse"],
+    )
+
+    stopped_run = xgboost(train_dm;
+        num_round=30,
+        watchlist=watched,
+        η=1,
+        objective="binary:logistic",
+        eval_metric=["rmsle","rmse"],
+        early_stopping_rounds=patience,
+    )
+
+    full_rounds = XGBoost.getnrounds(full_run)
+    stopped_rounds = XGBoost.getnrounds(stopped_run)
+    # Early stopping must end training before the full round budget is used ...
+    @test full_rounds > stopped_rounds
+    # ... but only after more rounds than the patience window.
+    @test stopped_rounds > patience
+end
+
+
+
 @testset "Blobs training" begin
     (X, y) = load_classification()