From d799a798f937fb30620e33a9a6583d88491f6b3b Mon Sep 17 00:00:00 2001 From: ExpandingMan Date: Wed, 27 Sep 2023 17:38:26 -0400 Subject: [PATCH] update to libxgboost 2.0 (#191) --- Project.toml | 6 +++--- src/Lib.jl | 22 +++++++++++++++++----- src/booster.jl | 10 +++++++--- src/dmatrix.jl | 32 +++++++++++++++++++++++++++----- test/runtests.jl | 22 ++++++++++++---------- 5 files changed, 66 insertions(+), 26 deletions(-) diff --git a/Project.toml b/Project.toml index ff75ad0..91c0cf2 100644 --- a/Project.toml +++ b/Project.toml @@ -1,6 +1,6 @@ name = "XGBoost" uuid = "009559a3-9522-5dbb-924b-0b6ed2b22bb9" -version = "2.3.2" +version = "2.4.0" [deps] AbstractTrees = "1520ce14-60c1-5f80-bbc7-55ef81b5835c" @@ -27,13 +27,13 @@ XGBoostTermExt = "Term" [compat] AbstractTrees = "0.4" CEnum = "0.4" -CUDA = "3, 4" +CUDA = "3, 4, 5" JSON3 = "1" OrderedCollections = "1" SparseMatricesCSR = "0.6" Tables = "1" Term = "1, 2" -XGBoost_jll = "1.7.2" +XGBoost_jll = "2" julia = "1.6" [extras] diff --git a/src/Lib.jl b/src/Lib.jl index cbf3b56..bc14451 100644 --- a/src/Lib.jl +++ b/src/Lib.jl @@ -68,6 +68,10 @@ function XGDMatrixCreateFromFile(fname, silent, out) @ccall libxgboost.XGDMatrixCreateFromFile(fname::Ptr{Cchar}, silent::Cint, out::Ptr{DMatrixHandle})::Cint end +function XGDMatrixCreateFromURI(config, out) + @ccall libxgboost.XGDMatrixCreateFromURI(config::Ptr{Cchar}, out::Ptr{DMatrixHandle})::Cint +end + function XGDMatrixCreateFromCSREx(indptr, indices, data, nindptr, nelem, num_col, out) @ccall libxgboost.XGDMatrixCreateFromCSREx(indptr::Ptr{Csize_t}, indices::Ptr{Cuint}, data::Ptr{Cfloat}, nindptr::Csize_t, nelem::Csize_t, num_col::Csize_t, out::Ptr{DMatrixHandle})::Cint end @@ -80,6 +84,10 @@ function XGDMatrixCreateFromDense(data, config, out) @ccall libxgboost.XGDMatrixCreateFromDense(data::Ptr{Cchar}, config::Ptr{Cchar}, out::Ptr{DMatrixHandle})::Cint end +function XGDMatrixCreateFromCSC(indptr, indices, data, nrow, config, out) + @ccall 
libxgboost.XGDMatrixCreateFromCSC(indptr::Ptr{Cchar}, indices::Ptr{Cchar}, data::Ptr{Cchar}, nrow::bst_ulong, config::Ptr{Cchar}, out::Ptr{DMatrixHandle})::Cint +end + function XGDMatrixCreateFromCSCEx(col_ptr, indices, data, nindptr, nelem, num_row, out) @ccall libxgboost.XGDMatrixCreateFromCSCEx(col_ptr::Ptr{Csize_t}, indices::Ptr{Cuint}, data::Ptr{Cfloat}, nindptr::Csize_t, nelem::Csize_t, num_row::Csize_t, out::Ptr{DMatrixHandle})::Cint end @@ -125,7 +133,7 @@ const XGBCallbackSetData = Cvoid const XGBCallbackDataIterNext = Cvoid function XGDMatrixCreateFromDataIter(data_handle, callback, cache_info, out) - @ccall libxgboost.XGDMatrixCreateFromDataIter(data_handle::DataIterHandle, callback::Ptr{Cvoid}, cache_info::Ptr{Cchar}, out::Ptr{DMatrixHandle})::Cint + @ccall libxgboost.XGDMatrixCreateFromDataIter(data_handle::DataIterHandle, callback::Ptr{XGBCallbackDataIterNext}, cache_info::Ptr{Cchar}, out::Ptr{DMatrixHandle})::Cint end function XGProxyDMatrixCreate(out) @@ -139,15 +147,15 @@ const XGDMatrixCallbackNext = Cvoid const DataIterResetCallback = Cvoid function XGDMatrixCreateFromCallback(iter, proxy, reset, next, config, out) - @ccall libxgboost.XGDMatrixCreateFromCallback(iter::DataIterHandle, proxy::DMatrixHandle, reset::Ptr{Cvoid}, next::Ptr{Cvoid}, config::Ptr{Cchar}, out::Ptr{DMatrixHandle})::Cint + @ccall libxgboost.XGDMatrixCreateFromCallback(iter::DataIterHandle, proxy::DMatrixHandle, reset::Ptr{DataIterResetCallback}, next::Ptr{XGDMatrixCallbackNext}, config::Ptr{Cchar}, out::Ptr{DMatrixHandle})::Cint end function XGQuantileDMatrixCreateFromCallback(iter, proxy, ref, reset, next, config, out) - @ccall libxgboost.XGQuantileDMatrixCreateFromCallback(iter::DataIterHandle, proxy::DMatrixHandle, ref::DataIterHandle, reset::Ptr{Cvoid}, next::Ptr{Cvoid}, config::Ptr{Cchar}, out::Ptr{DMatrixHandle})::Cint + @ccall libxgboost.XGQuantileDMatrixCreateFromCallback(iter::DataIterHandle, proxy::DMatrixHandle, ref::DataIterHandle, 
reset::Ptr{DataIterResetCallback}, next::Ptr{XGDMatrixCallbackNext}, config::Ptr{Cchar}, out::Ptr{DMatrixHandle})::Cint end function XGDeviceQuantileDMatrixCreateFromCallback(iter, proxy, reset, next, missing, nthread, max_bin, out) - @ccall libxgboost.XGDeviceQuantileDMatrixCreateFromCallback(iter::DataIterHandle, proxy::DMatrixHandle, reset::Ptr{Cvoid}, next::Ptr{Cvoid}, missing::Cfloat, nthread::Cint, max_bin::Cint, out::Ptr{DMatrixHandle})::Cint + @ccall libxgboost.XGDeviceQuantileDMatrixCreateFromCallback(iter::DataIterHandle, proxy::DMatrixHandle, reset::Ptr{DataIterResetCallback}, next::Ptr{XGDMatrixCallbackNext}, missing::Cfloat, nthread::Cint, max_bin::Cint, out::Ptr{DMatrixHandle})::Cint end function XGProxyDMatrixSetDataCudaArrayInterface(handle, c_interface_str) @@ -171,7 +179,7 @@ function XGImportArrowRecordBatch(data_handle, ptr_array, ptr_schema) end function XGDMatrixCreateFromArrowCallback(next, config, out) - @ccall libxgboost.XGDMatrixCreateFromArrowCallback(next::Ptr{Cvoid}, config::Ptr{Cchar}, out::Ptr{DMatrixHandle})::Cint + @ccall libxgboost.XGDMatrixCreateFromArrowCallback(next::Ptr{XGDMatrixCallbackNext}, config::Ptr{Cchar}, out::Ptr{DMatrixHandle})::Cint end function XGDMatrixSliceDMatrix(handle, idxset, len, out) @@ -242,6 +250,10 @@ function XGDMatrixGetDataAsCSR(handle, config, out_indptr, out_indices, out_data @ccall libxgboost.XGDMatrixGetDataAsCSR(handle::DMatrixHandle, config::Ptr{Cchar}, out_indptr::Ptr{bst_ulong}, out_indices::Ptr{Cuint}, out_data::Ptr{Cfloat})::Cint end +function XGDMatrixGetQuantileCut(handle, config, out_indptr, out_data) + @ccall libxgboost.XGDMatrixGetQuantileCut(handle::DMatrixHandle, config::Ptr{Cchar}, out_indptr::Ptr{Ptr{Cchar}}, out_data::Ptr{Ptr{Cchar}})::Cint +end + function XGBoosterCreate(dmats, len, out) @ccall libxgboost.XGBoosterCreate(dmats::Ptr{DMatrixHandle}, len::bst_ulong, out::Ptr{BoosterHandle})::Cint end diff --git a/src/booster.jl b/src/booster.jl index 44d8bc1..bebb054 100644 --- 
a/src/booster.jl +++ b/src/booster.jl @@ -173,16 +173,20 @@ load(::Type{Booster}, fname::AbstractString) = Booster(DMatrix[], model_file=fna load(::Type{Booster}, io) = Booster(DMatrix[], model_buffer=io) """ - save(b::Booster, fname) + save(b::Booster, fname; format="json") save(b::Booster, Vector{UInt8}; format="json") save(b::Booster, io::IO; format="json") Save the [`Booster`](@ref) object. This saves to formats which are intended to be stored on disk but the formats used are a lot zanier than those used by `deserialize`. A model saved with this function can be retrieved with [`load`](@ref) or [`load!`](@ref). +Valid formats are `"json"` and `"ubj"` (universal binary JSON). """ -function save(b::Booster, fname::AbstractString) - xgbcall(XGBoosterSaveModel, b.handle, fname) +function save(b::Booster, fname::AbstractString; kw...) + # note that XGBoosterSaveModel seems to be deprecated + open(fname, write=true, create=true) do io + save(b, io; kw...) + end fname end function save(b::Booster, ::Type{Vector{UInt8}}; format::AbstractString="json") diff --git a/src/dmatrix.jl b/src/dmatrix.jl index 4ea9c95..f3962e8 100644 --- a/src/dmatrix.jl +++ b/src/dmatrix.jl @@ -163,16 +163,37 @@ function getinfo(dm::DMatrix, ::Type{T}, name::AbstractString) where {T<:Real} end getinfo(dm::DMatrix, t::Type, name::Symbol) = getinfo(dm, t, string(name)) +# see https://xgboost.readthedocs.io/en/stable/tutorials/input_format.html +function _fileuri(fname::AbstractString, format::Symbol) + if '?' ∈ fname + throw(ArgumentError("file name strings passed to libxgboost cannot contain '?'")) + end + format == :binary && return fname + string(fname, "?format=", format) +end + """ - load(DMatrix, fname; silent=true, kw...) + load(DMatrix, fname; silent=true, format=:libsvm, kw...) Load a `DMatrix` from file with name `fname`. The matrix must have been serialized with a call to `save(::DMatrix, fname)`. If `silent` the xgboost library will print logs to `stdout`. 
Additional keyword arguments are passed to the `DMatrix` on construction. -""" -function load(::Type{DMatrix}, fname::AbstractString; silent::Bool=true, kw...) +`format` describes the file format; valid options are `:binary`, `:csv` and `:libsvm`. +""" +function load(::Type{DMatrix}, fname::AbstractString; + #TODO: would be better to have :binary as default, but would be breaking + format::Symbol=:libsvm, + silent::Bool=true, + kw... + ) o = Ref{DMatrixHandle}() - xgbcall(XGDMatrixCreateFromFile, fname, silent, o) + cfg = Dict("uri"=>_fileuri(fname, format), + # gives runtime error if not int even though docs say bool + "silent"=>Int(silent), + # docs are inconsistent and don't explain this, so it's disabled + #"data_split_mode"=>string(data_split_mode), + ) + xgbcall(XGDMatrixCreateFromURI, JSON3.write(cfg), o) DMatrix(o[], kw...) end @@ -385,7 +406,8 @@ getweights(dm::DMatrix) = getinfo(dm, Float32, "weight") save(dm::DMatrix, fname; silent=true) Save the `DMatrix` to file `fname` in an opaque (xgboost-specific) serialization format. -Will print logs to `stdout` unless `silent`. +Will print logs to `stdout` unless `silent`. Files created with this function can be loaded +using `XGBoost.load(DMatrix, fname, format=:binary)`.
""" function save(dm::DMatrix, fname::AbstractString; silent::Bool=true) xgbcall(XGDMatrixSaveBinary, dm.handle, fname, convert(Cint, silent)) diff --git a/test/runtests.jl b/test/runtests.jl index 1be205c..cade6d5 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -75,7 +75,7 @@ end @testset "DMatrix IO" begin for (fname, sz) ∈ [("agaricus.txt.train", (6513, 126)), ("agaricus.txt.test", (1611, 126))] - dm = XGBoost.load(DMatrix, testfilepath(fname)) + dm = XGBoost.load(DMatrix, testfilepath(fname), format=:libsvm) @test size(dm) == sz (X, y) = readlibsvm(testfilepath(fname), sz) @@ -86,15 +86,15 @@ end dm = DMatrix((X, y)) fname = tempname() XGBoost.save(dm, fname) - dm′ = XGBoost.load(DMatrix, fname) + dm′ = XGBoost.load(DMatrix, fname, format=:binary) @test size(dm) == size(dm′) @test XGBoost.getlabel(dm) == XGBoost.getlabel(dm′) isfile(fname) && rm(fname) end @testset "Agaricus training" begin - dtrain = XGBoost.load(DMatrix, testfilepath("agaricus.txt.train")) - dtest = XGBoost.load(DMatrix, testfilepath("agaricus.txt.test")) + dtrain = XGBoost.load(DMatrix, testfilepath("agaricus.txt.train"), format=:libsvm) + dtest = XGBoost.load(DMatrix, testfilepath("agaricus.txt.test"), format=:libsvm) watchlist = Dict("eval"=>dtest, "train"=>dtrain) bst = @test_logs (:info, r"XGBoost") (:info, r"") (:info, r"") (:info, r"Training") begin @@ -142,8 +142,8 @@ end end @testset "Feature importance" begin - dtrain = XGBoost.load(DMatrix, testfilepath("agaricus.txt.train")) - dtest = XGBoost.load(DMatrix, testfilepath("agaricus.txt.test")) + dtrain = XGBoost.load(DMatrix, testfilepath("agaricus.txt.train"), format=:libsvm) + dtest = XGBoost.load(DMatrix, testfilepath("agaricus.txt.test"), format=:libsvm) bst = xgboost(dtrain, num_round=5, η=1.0, max_depth=2, @@ -166,8 +166,8 @@ end # these just ensure we don't have any exceptions @testset "Term extension" begin - dtrain = XGBoost.load(DMatrix, testfilepath("agaricus.txt.train")) - dtest = XGBoost.load(DMatrix, 
testfilepath("agaricus.txt.test")) + dtrain = XGBoost.load(DMatrix, testfilepath("agaricus.txt.train"), format=:libsvm) + dtest = XGBoost.load(DMatrix, testfilepath("agaricus.txt.test"), format=:libsvm) bst = xgboost(dtrain, num_round=5, η=1.0, max_depth=2, @@ -180,8 +180,8 @@ end end @testset "Booster" begin - dtrain = XGBoost.load(DMatrix, testfilepath("agaricus.txt.train")) - dtest = XGBoost.load(DMatrix, testfilepath("agaricus.txt.test")) + dtrain = XGBoost.load(DMatrix, testfilepath("agaricus.txt.train"), format=:libsvm) + dtest = XGBoost.load(DMatrix, testfilepath("agaricus.txt.test"), format=:libsvm) (model_file, _) = mktemp() @@ -228,6 +228,8 @@ end end has_cuda() && @testset "cuda" begin + @info("running CUDA tests") + X = randn(Float32, 4, 5) dm = DMatrix(cu(X)) @test size(dm) == size(X)