first commit

Evovest · Feb 24, 2024 · 7bd6993 · 7bd6993
commit 7bd6993
Show file tree

Hide file tree

Showing 49 changed files with 6,660 additions and 0 deletions.
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -0,0 +1,36 @@
+name: CI
+on:
+  pull_request:
+    branches:
+      - main
+  push:
+    branches:
+      - main
+      - ci-docs
+    tags: '*'
+env:
+ JULIA_PKG_USE_CLI_GIT: true
+jobs:
+  test:
+    name: Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }}
+    runs-on: ${{ matrix.os }}
+    strategy:
+      fail-fast: false
+      matrix:
+        version:
+          - '1.10'
+        os:
+          - ubuntu-latest
+        arch:
+          - x64
+    steps:
+      - uses: actions/checkout@v4
+      - uses: julia-actions/setup-julia@v1
+        with:
+          version: ${{ matrix.version }}
+          arch: ${{ matrix.arch }}
+      - uses: julia-actions/cache@v1
+      - uses: julia-actions/julia-buildpkg@v1
+        with:
+          git_cli: true # = JULIA_PKG_USE_CLI_GIT. Options: true | false (default)
+      - uses: julia-actions/julia-runtest@v1
diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml
@@ -0,0 +1,46 @@
+name: Docs
+
+on:
+  push:
+    branches:
+      - main
+    tags: '*'
+  pull_request:
+
+env:
+ JULIA_PKG_USE_CLI_GIT: true
+
+# Sets permissions of the GITHUB_TOKEN to allow deployment to GitHub Pages
+permissions:
+  contents: write
+  pages: write
+  id-token: write
+  statuses: write
+
+jobs:
+  build:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - name: Setup Node
+        uses: actions/setup-node@v3
+        with:
+            node-version: 20
+            cache: npm # or pnpm / yarn
+            cache-dependency-path: 'docs/package-lock.json' # this should be a package-lock.json file
+      - uses: julia-actions/setup-julia@v1
+        with:
+          version: '1.10'
+      - uses: julia-actions/cache@v1
+      - uses: webfactory/[email protected]
+        with:
+          ssh-private-key: |
+            ${{ secrets.NEUROTREEMODELS_KEY }}
+      - uses: julia-actions/julia-buildpkg@v1
+        with:
+          git_cli: true # = JULIA_PKG_USE_CLI_GIT. Options: true | false (default)
+      - name: Instantiate NPM
+        run: cd docs/; npm i; cd ..
+      - uses: julia-actions/julia-docdeploy@v1
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,34 @@
+# Files generated by invoking Julia with --code-coverage
+*.jl.cov
+*.jl.*.cov
+
+# Files generated by invoking Julia with --track-allocation
+*.jl.mem
+
+# System-specific files and directories generated by the BinaryProvider and BinDeps packages
+# They contain absolute paths specific to the host computer, and so should not be committed
+deps/deps.jl
+deps/build.log
+deps/downloads/
+deps/usr/
+deps/src/
+
+# Build artifacts for creating documentation generated by the Documenter package
+docs/build/
+docs/site/
+docs/src/.vitepress/cache
+docs/src/.vitepress/dist
+docs/Manifest.toml
+docs/.vscode
+docs/node_modules/
+docs/.vitepress/cache
+docs/.vitepress/dist
+
+# File generated by Pkg, the package manager, based on a corresponding Project.toml
+# It records a fixed state of all packages used by the project. As such, it should not be
+# committed for packages, but should be committed for applications that require a static
+# environment.
+Manifest.toml
+
+data/
+.vscode/
diff --git a/Project.toml b/Project.toml
@@ -0,0 +1,30 @@
+name = "NeuroTreeModels"
+uuid = "1db4e0a5-a364-4b0c-897c-2bd5a4a3a1f2"
+authors = ["jeremie <[email protected]>"]
+version = "0.1.0"
+
+
+[deps]
+CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
+ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
+DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
+Flux = "587475ba-b771-5e3f-ad9e-33799f191a9c"
+MLJModelInterface = "e80e1ace-859a-464e-9ed9-23947d8ae3ea"
+MLUtils = "f1d291b0-491e-4a28-83b9-f70985020b54"
+Optimisers = "3bd65402-5787-11e9-1adc-39752487f4e2"
+Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
+Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
+StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
+Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c"
+cuDNN = "02a925ec-e4fe-4b08-9a7e-0d78e3d38ccd"
+
+[compat]
+CUDA = "3, 4, 5"
+Flux = "0.13, 0.14"
+MLUtils = "0.4"
+
+[extras]
+Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
+
+[targets]
+test = ["Test"]
diff --git a/README.md b/README.md
@@ -0,0 +1,56 @@
+# NeuroTreeModels.jl
+
+Differentiable tree-based models for tabular data. 
+
+## Installation
+
+```julia
+] add NeuroTreeModels
+```
+
+## Configuring a model
+
+A model configuration is defined with the [NeuroTreeRegressor](@ref) constructor:
+
+```julia
+using NeuroTreeModels, DataFrames
+
+# Hyper-parameters are passed as keyword arguments.
+config = NeuroTreeRegressor(
+    loss = :mse,
+    nrounds = 10,
+    ntrees = 16,
+    depth = 5,
+)
+```
+
+## Training
+
+Building and training a model according to the above `config` is done with [NeuroTreeModels.fit](@ref).
+See the docs for additional features, notably early stopping support through the tracking of an evaluation metric.
+
+```julia
+nobs, nfeats = 1_000, 5
+dtrain = DataFrame(randn(nobs, nfeats), :auto)
+dtrain.y = rand(nobs)
+feature_names, target_name = names(dtrain, r"x"), "y"
+
+m = NeuroTreeModels.fit(config, dtrain; feature_names, target_name)
+```
+
+## Inference
+
+```julia
+p = m(dtrain)
+```
+
+## MLJ
+
+NeuroTreeModels.jl supports the [MLJ](https://github.com/alan-turing-institute/MLJ.jl) Interface. 
+
+```julia
+using MLJBase, NeuroTreeModels
+m = NeuroTreeRegressor(depth=5, nrounds=10)
+X, y = @load_boston
+mach = machine(m, X, y) |> fit!
+p = predict(mach, X)
+```
diff --git a/benchmarks/Higgs-logloss.jl b/benchmarks/Higgs-logloss.jl
@@ -0,0 +1,89 @@
+using Revise
+using Random
+using CSV
+using DataFrames
+using StatsBase
+using Statistics: mean, std
+using NeuroTreeModels
+using Solage: Connectors
+using AWS: AWSCredentials, AWSConfig, @service
+
+# Build the AWS configuration from credentials stored in the environment.
+# Indexing `ENV` throws a `KeyError` when either variable is unset.
+@service S3
+aws_creds = AWSCredentials(ENV["AWS_ACCESS_KEY_ID_JDB"], ENV["AWS_SECRET_ACCESS_KEY_JDB"])
+aws_config = AWSConfig(; creds=aws_creds, region="ca-central-1")
+bucket = "jeremiedb"
+
+# Read the Arrow file from `bucket`; the bucket name is defined once above
+# instead of being repeated as a literal in the call.
+path = "share/data/higgs/HIGGS.arrow"
+df_tot = Connectors.read_arrow_aws(path; bucket, aws_config)
+
+# `Column1` is renamed to `y` and used as the target; all other columns are features.
+rename!(df_tot, "Column1" => "y")
+feature_names = setdiff(names(df_tot), ["y"])
+target_name = "y"
+
+# function percent_rank(x::AbstractVector{T}) where {T}
+#     return tiedrank(x) / (length(x) + 1)
+# end
+
+# transform!(df_tot, feature_names .=> percent_rank .=> feature_names)
+
+dtrain = df_tot[1:end-1_000_000, :]; # train: every row except the last 1,000,000
+deval = df_tot[end-1_000_000+1:end-500_000, :]; # eval: first 500,000 of the held-out rows
+dtest = df_tot[end-500_000+1:end, :]; # test: last 500,000 rows
+
+config = NeuroTreeRegressor( # model and training hyper-parameters for this benchmark run
+    device=:gpu,
+    loss=:logloss,
+    nrounds=200,
+    scaler=true,
+    outsize=1,
+    depth=4,
+    lr=2e-3,
+    ntrees=128,
+    stack_size=2,
+    hidden_size=16,
+    batchsize=8092, # NOTE(review): possibly meant 8192 — confirm
+)
+
+@time m, logger = NeuroTreeModels.fit( # timed training run; destructures the result into model and logger
+    config,
+    dtrain;
+    deval, # presumably the data on which `metric` is tracked for early stopping — confirm
+    target_name,
+    feature_names,
+    print_every_n=1,
+    early_stopping_rounds=2,
+    metric=:logloss,
+    return_logger=true # requests the logger alongside the model
+);
+
+dinfer_eval = NeuroTreeModels.get_df_loader_infer(deval; feature_names, batchsize=config.batchsize, device=config.device); # inference loader over the eval split, same batch size and device as training
+p_eval = m(dinfer_eval); # the fitted model is called directly on the loader
+error_eval = 1 - mean(round.(Int, p_eval) .== deval.y) # error rate: share of predictions whose rounded value differs from the label
+@info "ERROR - deval" error_eval
+
+dinfer_test = NeuroTreeModels.get_df_loader_infer(dtest; feature_names, batchsize=config.batchsize, device=config.device); # same procedure on the test split
+p_test = m(dinfer_test);
+error_test = 1 - mean(round.(Int, p_test) .== dtest.y)
+@info "ERROR - dtest" error_test
+
+# depth:4, num_trees=256, stack_size=2, hidden_size=16, boosting_size=1, batchsize=2048, lr=1e-3
+# ┌ Info: iter 30
+# └   metric = 0.4679296910762787
+# 10128.021110 seconds (806.60 M allocations: 206.595 GiB, 0.40% gc time, 0.00% compilation time)
+# ┌ Info: ERROR - dtest
+# └   error_test = 0.22794599999999998
+
+# depth:5, num_trees=256, stack_size=1, hidden_size=1, boosting_size=1, batchsize=2048,
+# ┌ Info: iter 40
+# └   metric = 0.4786278009414673
+# 10985.068111 seconds (959.42 M allocations: 259.180 GiB, 0.38% gc time)
+# ┌ Info: ERROR - dtest
+# └   error_test = 0.23524
+
+# depth:5, num_trees=256, stack_size=3, hidden_size=16, boosting_size=3, batchsize=2048,
+# ┌ Info: iter 33
+# └   metric = 0.4564650058746338
+# 34568.885039 seconds (7.51 G allocations: 1.109 TiB, 1.01% gc time)
+# ┌ Info: ERROR - dtest
+# └   error_test = 0.22153599999999996
+
diff --git a/benchmarks/MSRank.jl b/benchmarks/MSRank.jl
@@ -0,0 +1,89 @@
+using Revise
+using Random
+using CSV
+using DataFrames
+using StatsBase
+using Statistics: mean, std
+using NeuroTreeModels
+using Solage: Connectors
+using ReadLIBSVM
+using AWS: AWSCredentials, AWSConfig, @service
+
+# https://www.microsoft.com/en-us/research/project/mslr/
+
+# Build the AWS configuration from credentials stored in the environment.
+# Indexing `ENV` throws a `KeyError` when either variable is unset.
+@service S3
+aws_creds = AWSCredentials(ENV["AWS_ACCESS_KEY_ID_JDB"], ENV["AWS_SECRET_ACCESS_KEY_JDB"])
+aws_config = AWSConfig(; creds=aws_creds, region="ca-central-1")
+bucket = "jeremiedb"
+
+# initial prep
+"""
+    read_libsvm_aws(file; has_query=false, aws_config=AWSConfig(), bucket="jeremiedb")
+
+Fetch the object `file` from the S3 bucket `bucket` and parse it with `read_libsvm`.
+
+# Keywords
+- `has_query`: forwarded unchanged to `read_libsvm`.
+- `aws_config`: AWS configuration used for the S3 request.
+- `bucket`: name of the bucket to read from. Defaults to `"jeremiedb"`, the value
+  that was previously hard-coded, so existing calls behave the same.
+
+# Returns
+Whatever `read_libsvm` returns for the downloaded payload.
+"""
+function read_libsvm_aws(
+    file::String;
+    has_query=false,
+    aws_config=AWSConfig(),
+    bucket="jeremiedb",
+)
+    raw = S3.get_object(bucket, file, Dict("response-content-type" => "application/octet-stream"); aws_config)
+    return read_libsvm(raw; has_query)
+end
+
+@time train_raw = read_libsvm_aws("share/data/msrank/train.txt"; has_query=true, aws_config, bucket);
+@time eval_raw = read_libsvm_aws("share/data/msrank/vali.txt"; has_query=true, aws_config, bucket);
+@time test_raw = read_libsvm_aws("share/data/msrank/test.txt"; has_query=true, aws_config, bucket);
+
+# Assemble a DataFrame from one parsed payload: the `:x` entry becomes auto-named
+# columns, `:y` is stored as `y_raw`, `y` is `y_raw` divided by 4, and `:q` is stored as `q`.
+function build_frame(raw)
+    df = DataFrame(raw[:x], :auto)
+    df.y_raw = raw[:y]
+    df.y = df.y_raw ./ 4
+    df.q = raw[:q]
+    return df
+end
+
+dtrain = build_frame(train_raw)
+deval = build_frame(eval_raw)
+dtest = build_frame(test_raw)
+
+# Every column other than the label and query columns is a feature;
+# the model is trained on the unscaled label.
+feature_names = setdiff(names(dtrain), ["y", "y_raw", "q"])
+target_name = "y_raw"
+
+"""
+    percent_rank(x::AbstractVector)
+
+Return the tied ranks of `x` (computed with `tiedrank`) divided by `length(x) + 1`.
+"""
+percent_rank(x::AbstractVector) = tiedrank(x) / (length(x) + 1)
+
+# Replace each feature column by its percent rank, computed separately within each split.
+for df in (dtrain, deval, dtest)
+    transform!(df, feature_names .=> percent_rank .=> feature_names)
+end
+
+config = NeuroTreeRegressor( # model and training hyper-parameters for this benchmark run
+    device=:gpu,
+    loss=:mse,
+    nrounds=2, # NOTE(review): fewer rounds than the early_stopping_rounds=3 passed to fit — confirm intended
+    actA=:tanh,
+    outsize=1,
+    depth=4,
+    ntrees=64,
+    stack_size=2,
+    hidden_size=16,
+    batchsize=4096,
+    lr=3e-4,
+)
+
+@time m, logger = NeuroTreeModels.fit( # timed training run; destructures the result into model and logger
+    config,
+    dtrain;
+    deval, # presumably the data on which `metric` is tracked for early stopping — confirm
+    target_name,
+    feature_names,
+    print_every_n=1,
+    early_stopping_rounds=3,
+    metric=:mse,
+    return_logger=true # requests the logger alongside the model
+);
+
+dinfer_eval = NeuroTreeModels.get_df_loader_infer(deval; feature_names, batchsize=config.batchsize, device=config.device); # inference loader over the eval split, same batch size and device as training
+p_eval = m(dinfer_eval); # the fitted model is called directly on the loader
+mse_eval = mean((p_eval .- deval.y_raw) .^ 2) # mean squared error against the unscaled labels
+@info "MSE - deval" mse_eval
+
+dinfer_test = NeuroTreeModels.get_df_loader_infer(dtest; feature_names, batchsize=config.batchsize, device=config.device); # same procedure on the test split
+p_test = m(dinfer_test);
+mse_test = mean((p_test .- dtest.y_raw) .^ 2)
+@info "MSE - dtest" mse_test