-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
0 parents
commit 7bd6993
Showing
49 changed files
with
6,660 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,36 @@ | ||
name: CI | ||
on: | ||
pull_request: | ||
branches: | ||
- main | ||
push: | ||
branches: | ||
- main | ||
- ci-docs | ||
tags: '*' | ||
env: | ||
JULIA_PKG_USE_CLI_GIT: true | ||
jobs: | ||
test: | ||
name: Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} | ||
runs-on: ${{ matrix.os }} | ||
strategy: | ||
fail-fast: false | ||
matrix: | ||
version: | ||
- '1.10' | ||
os: | ||
- ubuntu-latest | ||
arch: | ||
- x64 | ||
steps: | ||
- uses: actions/checkout@v4 | ||
- uses: julia-actions/setup-julia@v1 | ||
with: | ||
version: ${{ matrix.version }} | ||
arch: ${{ matrix.arch }} | ||
- uses: julia-actions/cache@v1 | ||
- uses: julia-actions/julia-buildpkg@v1 | ||
with: | ||
git_cli: true # = JULIA_PKG_USE_CLI_GIT. Options: true | false (default) | ||
- uses: julia-actions/julia-runtest@v1 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,46 @@ | ||
name: Docs | ||
|
||
on: | ||
push: | ||
branches: | ||
- main | ||
tags: '*' | ||
pull_request: | ||
|
||
env: | ||
JULIA_PKG_USE_CLI_GIT: true | ||
|
||
# Sets permissions of the GITHUB_TOKEN to allow deployment to GitHub Pages | ||
permissions: | ||
contents: write | ||
pages: write | ||
id-token: write | ||
statuses: write | ||
|
||
jobs: | ||
build: | ||
runs-on: ubuntu-latest | ||
steps: | ||
- uses: actions/checkout@v4 | ||
- name: Setup Node | ||
uses: actions/setup-node@v3 | ||
with: | ||
node-version: 20 | ||
cache: npm # or pnpm / yarn | ||
cache-dependency-path: 'docs/package-lock.json' # this should be a package-lock.json file | ||
- uses: julia-actions/setup-julia@v1 | ||
with: | ||
version: '1.10' | ||
- uses: julia-actions/cache@v1 | ||
- uses: webfactory/ssh-agent@v0.9.0 | ||
with: | ||
ssh-private-key: | | ||
${{ secrets.NEUROTREEMODELS_KEY }} | ||
- uses: julia-actions/julia-buildpkg@v1 | ||
with: | ||
git_cli: true # = JULIA_PKG_USE_CLI_GIT. Options: true | false (default) | ||
- name: Instantiate NPM | ||
run: cd docs/; npm i; cd .. | ||
- uses: julia-actions/julia-docdeploy@v1 | ||
env: | ||
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,34 @@ | ||
# Files generated by invoking Julia with --code-coverage | ||
*.jl.cov | ||
*.jl.*.cov | ||
|
||
# Files generated by invoking Julia with --track-allocation | ||
*.jl.mem | ||
|
||
# System-specific files and directories generated by the BinaryProvider and BinDeps packages | ||
# They contain absolute paths specific to the host computer, and so should not be committed | ||
deps/deps.jl | ||
deps/build.log | ||
deps/downloads/ | ||
deps/usr/ | ||
deps/src/ | ||
|
||
# Build artifacts for creating documentation generated by the Documenter package | ||
docs/build/ | ||
docs/site/ | ||
docs/src/.vitepress/cache | ||
docs/src/.vitepress/dist | ||
docs/Manifest.toml | ||
docs/.vscode | ||
docs/node_modules/ | ||
docs/.vitepress/cache | ||
docs/.vitepress/dist | ||
|
||
# File generated by Pkg, the package manager, based on a corresponding Project.toml | ||
# It records a fixed state of all packages used by the project. As such, it should not be | ||
# committed for packages, but should be committed for applications that require a static | ||
# environment. | ||
Manifest.toml | ||
|
||
data/ | ||
.vscode/ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,30 @@ | ||
name = "NeuroTreeModels" | ||
uuid = "1db4e0a5-a364-4b0c-897c-2bd5a4a3a1f2" | ||
authors = ["jeremie <[email protected]>"] | ||
version = "0.1.0" | ||
|
||
|
||
[deps] | ||
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" | ||
ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4" | ||
DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" | ||
Flux = "587475ba-b771-5e3f-ad9e-33799f191a9c" | ||
MLJModelInterface = "e80e1ace-859a-464e-9ed9-23947d8ae3ea" | ||
MLUtils = "f1d291b0-491e-4a28-83b9-f70985020b54" | ||
Optimisers = "3bd65402-5787-11e9-1adc-39752487f4e2" | ||
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" | ||
Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" | ||
StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" | ||
Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c" | ||
cuDNN = "02a925ec-e4fe-4b08-9a7e-0d78e3d38ccd" | ||
|
||
[compat] | ||
CUDA = "3, 4, 5" | ||
Flux = "0.13, 0.14" | ||
MLUtils = "0.4" | ||
|
||
[extras] | ||
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" | ||
|
||
[targets] | ||
test = ["Test"] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,56 @@ | ||
# NeuroTreeModels.jl | ||
|
||
Differentiable tree-based models for tabular data. | ||
|
||
## Installation | ||
|
||
```julia | ||
] add NeuroTreeModels | ||
``` | ||
|
||
## Configuring a model | ||
|
||
A model configuration is defined with the [NeuroTreeRegressor](@ref) constructor: | ||
|
||
```julia | ||
using NeuroTreeModels, DataFrames | ||
|
||
config = NeuroTreeRegressor( | ||
loss = :mse, | ||
nrounds = 10, | ||
num_trees = 16, | ||
depth = 5, | ||
) | ||
``` | ||
|
||
## Training | ||
|
||
Building and training a model according to the above `config` is done with [NeuroTreeModels.fit](@ref). | ||
See the docs for additional features, notably early stopping support through the tracking of an evaluation metric. | ||
|
||
```julia | ||
nobs, nfeats = 1_000, 5 | ||
dtrain = DataFrame(randn(nobs, nfeats), :auto) | ||
dtrain.y = rand(nobs) | ||
feature_names, target_name = names(dtrain, r"x"), "y" | ||
|
||
m = NeuroTreeModels.fit(config, dtrain; feature_names, target_name) | ||
``` | ||
|
||
## Inference | ||
|
||
```julia | ||
p = m(dtrain) | ||
``` | ||
|
||
## MLJ | ||
|
||
NeuroTreeModels.jl supports the [MLJ](https://github.com/alan-turing-institute/MLJ.jl) Interface. | ||
|
||
```julia | ||
using MLJBase, NeuroTreeModels | ||
m = NeuroTreeRegressor(depth=5, nrounds=10) | ||
X, y = @load_boston | ||
mach = machine(m, X, y) |> fit! | ||
p = predict(mach, X) | ||
``` |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,89 @@ | ||
using Revise | ||
using Random | ||
using CSV | ||
using DataFrames | ||
using StatsBase | ||
using Statistics: mean, std | ||
using NeuroTreeModels | ||
using Solage: Connectors | ||
using AWS: AWSCredentials, AWSConfig, @service | ||
|
||
@service S3 | ||
aws_creds = AWSCredentials(ENV["AWS_ACCESS_KEY_ID_JDB"], ENV["AWS_SECRET_ACCESS_KEY_JDB"]) | ||
aws_config = AWSConfig(; creds=aws_creds, region="ca-central-1") | ||
bucket = "jeremiedb" | ||
|
||
path = "share/data/higgs/HIGGS.arrow" | ||
df_tot = Connectors.read_arrow_aws(path; bucket="jeremiedb", aws_config) | ||
|
||
rename!(df_tot, "Column1" => "y") | ||
feature_names = setdiff(names(df_tot), ["y"]) | ||
target_name = "y" | ||
|
||
# function percent_rank(x::AbstractVector{T}) where {T} | ||
# return tiedrank(x) / (length(x) + 1) | ||
# end | ||
|
||
# transform!(df_tot, feature_names .=> percent_rank .=> feature_names) | ||
|
||
dtrain = df_tot[1:end-1_000_000, :]; | ||
deval = df_tot[end-1_000_000+1:end-500_000, :]; | ||
dtest = df_tot[end-500_000+1:end, :]; | ||
|
||
config = NeuroTreeRegressor( | ||
device=:gpu, | ||
loss=:logloss, | ||
nrounds=200, | ||
scaler=true, | ||
outsize=1, | ||
depth=4, | ||
lr=2e-3, | ||
ntrees=128, | ||
stack_size=2, | ||
hidden_size=16, | ||
batchsize=8092, | ||
) | ||
|
||
@time m, logger = NeuroTreeModels.fit( | ||
config, | ||
dtrain; | ||
deval, | ||
target_name, | ||
feature_names, | ||
print_every_n=1, | ||
early_stopping_rounds=2, | ||
metric=:logloss, | ||
return_logger=true | ||
); | ||
|
||
dinfer_eval = NeuroTreeModels.get_df_loader_infer(deval; feature_names, batchsize=config.batchsize, device=config.device); | ||
p_eval = m(dinfer_eval); | ||
error_eval = 1 - mean(round.(Int, p_eval) .== deval.y) | ||
@info "ERROR - deval" error_eval | ||
|
||
dinfer_test = NeuroTreeModels.get_df_loader_infer(dtest; feature_names, batchsize=config.batchsize, device=config.device); | ||
p_test = m(dinfer_test); | ||
error_test = 1 - mean(round.(Int, p_test) .== dtest.y) | ||
@info "ERROR - dtest" error_test | ||
|
||
# depth:4, num_trees=256, stack_size=2, hidden_size=16, boosting_size=1, batchsize=2048, lr=1e-3 | ||
# ┌ Info: iter 30 | ||
# └ metric = 0.4679296910762787 | ||
# 10128.021110 seconds (806.60 M allocations: 206.595 GiB, 0.40% gc time, 0.00% compilation time) | ||
# ┌ Info: ERROR - dtest | ||
# └ error_test = 0.22794599999999998 | ||
|
||
# depth:5, num_trees=256, stack_size=1, hidden_size=1, boosting_size=1, batchsize=2048, | ||
# ┌ Info: iter 40 | ||
# └ metric = 0.4786278009414673 | ||
# 10985.068111 seconds (959.42 M allocations: 259.180 GiB, 0.38% gc time) | ||
# ┌ Info: ERROR - dtest | ||
# └ error_test = 0.23524 | ||
|
||
# depth:5, num_trees=256, stack_size=3, hidden_size=16, boosting_size=3, batchsize=2048, | ||
# ┌ Info: iter 33 | ||
# └ metric = 0.4564650058746338 | ||
# 34568.885039 seconds (7.51 G allocations: 1.109 TiB, 1.01% gc time) | ||
# ┌ Info: ERROR - dtest | ||
# └ error_test = 0.22153599999999996 | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,89 @@ | ||
using Revise | ||
using Random | ||
using CSV | ||
using DataFrames | ||
using StatsBase | ||
using Statistics: mean, std | ||
using NeuroTreeModels | ||
using Solage: Connectors | ||
using ReadLIBSVM | ||
using AWS: AWSCredentials, AWSConfig, @service | ||
|
||
# https://www.microsoft.com/en-us/research/project/mslr/ | ||
|
||
@service S3 | ||
aws_creds = AWSCredentials(ENV["AWS_ACCESS_KEY_ID_JDB"], ENV["AWS_SECRET_ACCESS_KEY_JDB"]) | ||
aws_config = AWSConfig(; creds=aws_creds, region="ca-central-1") | ||
bucket = "jeremiedb" | ||
|
||
# initial prep | ||
""" | ||
    read_libsvm_aws(file; has_query=false, aws_config=AWSConfig(), bucket="jeremiedb") | ||
 | ||
Fetch the S3 object stored under key `file` in `bucket` and parse it with `read_libsvm`. | ||
 | ||
# Arguments | ||
- `file::String`: key of the object inside the bucket. | ||
- `has_query`: forwarded unchanged to `read_libsvm`. | ||
- `aws_config`: forwarded unchanged to `S3.get_object`. | ||
- `bucket`: name of the S3 bucket to read from. Defaults to `"jeremiedb"`, the value that was | ||
  previously hard-coded, so existing calls behave the same. | ||
 | ||
Returns whatever `read_libsvm` returns for the downloaded payload. | ||
""" | ||
function read_libsvm_aws(file::String; has_query=false, aws_config=AWSConfig(), bucket="jeremiedb") | ||
    # Request the object as a raw byte stream so it can be handed directly to the parser. | ||
    raw = S3.get_object(bucket, file, Dict("response-content-type" => "application/octet-stream"); aws_config) | ||
    return read_libsvm(raw; has_query) | ||
end | ||
|
||
@time train_raw = read_libsvm_aws("share/data/msrank/train.txt"; has_query=true, aws_config); | ||
@time eval_raw = read_libsvm_aws("share/data/msrank/vali.txt"; has_query=true, aws_config); | ||
@time test_raw = read_libsvm_aws("share/data/msrank/test.txt"; has_query=true, aws_config); | ||
|
||
dtrain = DataFrame(train_raw[:x], :auto) | ||
dtrain.y_raw = train_raw[:y] | ||
dtrain.y = dtrain.y_raw ./ 4 | ||
dtrain.q = train_raw[:q] | ||
|
||
deval = DataFrame(eval_raw[:x], :auto) | ||
deval.y_raw = eval_raw[:y] | ||
deval.y = deval.y_raw ./ 4 | ||
deval.q = eval_raw[:q] | ||
|
||
dtest = DataFrame(test_raw[:x], :auto) | ||
dtest.y_raw = test_raw[:y] | ||
dtest.y = dtest.y_raw ./ 4 | ||
dtest.q = test_raw[:q] | ||
|
||
feature_names = setdiff(names(dtrain), ["y", "y_raw", "q"]) | ||
target_name = "y_raw" | ||
|
||
# Divide the tied ranks of `x` (as computed by `tiedrank`) by `length(x) + 1`. | ||
function percent_rank(x::AbstractVector{T}) where {T} | ||
    denom = length(x) + 1 | ||
    ranks = tiedrank(x) | ||
    return ranks / denom | ||
end | ||
|
||
transform!(dtrain, feature_names .=> percent_rank .=> feature_names) | ||
transform!(deval, feature_names .=> percent_rank .=> feature_names) | ||
transform!(dtest, feature_names .=> percent_rank .=> feature_names) | ||
|
||
config = NeuroTreeRegressor( | ||
device=:gpu, | ||
loss=:mse, | ||
nrounds=2, | ||
actA=:tanh, | ||
outsize=1, | ||
depth=4, | ||
ntrees=64, | ||
stack_size=2, | ||
hidden_size=16, | ||
batchsize=4096, | ||
lr=3e-4, | ||
) | ||
|
||
@time m, logger = NeuroTreeModels.fit( | ||
config, | ||
dtrain; | ||
deval, | ||
target_name, | ||
feature_names, | ||
print_every_n=1, | ||
early_stopping_rounds=3, | ||
metric=:mse, | ||
return_logger=true | ||
); | ||
|
||
dinfer_eval = NeuroTreeModels.get_df_loader_infer(deval; feature_names, batchsize=config.batchsize, device=config.device); | ||
p_eval = m(dinfer_eval); | ||
mse_eval = mean((p_eval .- deval.y_raw) .^ 2) | ||
@info "MSE - deval" mse_eval | ||
|
||
dinfer_test = NeuroTreeModels.get_df_loader_infer(dtest; feature_names, batchsize=config.batchsize, device=config.device); | ||
p_test = m(dinfer_test); | ||
mse_test = mean((p_test .- dtest.y_raw) .^ 2) | ||
@info "MSE - dtest" mse_test |
Oops, something went wrong.