Skip to content

Commit

Permalink
first commit
Browse files Browse the repository at this point in the history
  • Loading branch information
jeremiedb committed Feb 24, 2024
0 parents commit 7bd6993
Show file tree
Hide file tree
Showing 49 changed files with 6,660 additions and 0 deletions.
36 changes: 36 additions & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
# CI workflow: run the package test suite on Linux for the supported Julia version.
# NOTE(review): indentation was lost in the scraped source; structure below is the
# conventional GitHub Actions layout reconstructed from the flattened keys.
name: CI
on:
  pull_request:
    branches:
      - main
  push:
    branches:
      - main
      - ci-docs
    tags: '*'
env:
  # Use the git CLI instead of libgit2 for Pkg operations (needed for SSH-auth registries).
  JULIA_PKG_USE_CLI_GIT: true
jobs:
  test:
    name: Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }}
    runs-on: ${{ matrix.os }}
    strategy:
      # Keep running remaining matrix entries even if one fails.
      fail-fast: false
      matrix:
        version:
          - '1.10'
        os:
          - ubuntu-latest
        arch:
          - x64
    steps:
      - uses: actions/checkout@v4
      - uses: julia-actions/setup-julia@v1
        with:
          version: ${{ matrix.version }}
          arch: ${{ matrix.arch }}
      - uses: julia-actions/cache@v1
      - uses: julia-actions/julia-buildpkg@v1
        with:
          git_cli: true # = JULIA_PKG_USE_CLI_GIT. Options: true | false (default)
      - uses: julia-actions/julia-runtest@v1
46 changes: 46 additions & 0 deletions .github/workflows/docs.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
# Docs workflow: build the documentation (VitePress front-end + Documenter deploy).
# NOTE(review): indentation was lost in the scraped source; structure below is the
# conventional GitHub Actions layout reconstructed from the flattened keys.
name: Docs

on:
  push:
    branches:
      - main
    tags: '*'
  pull_request:

env:
  JULIA_PKG_USE_CLI_GIT: true

# Sets permissions of the GITHUB_TOKEN to allow deployment to GitHub Pages
permissions:
  contents: write
  pages: write
  id-token: write
  statuses: write

jobs:
  build:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - name: Setup Node
        uses: actions/setup-node@v3
        with:
          node-version: 20
          cache: npm # or pnpm / yarn
          cache-dependency-path: 'docs/package-lock.json' # this should be a package-lock.json file
      - uses: julia-actions/setup-julia@v1
        with:
          version: '1.10'
      - uses: julia-actions/cache@v1
      # NOTE(review): the action version was garbled to "[email protected]" by the
      # scraper's email obfuscation; v0.8.0 was current at commit time — confirm.
      - uses: webfactory/ssh-agent@v0.8.0
        with:
          ssh-private-key: |
            ${{ secrets.NEUROTREEMODELS_KEY }}
      - uses: julia-actions/julia-buildpkg@v1
        with:
          git_cli: true # = JULIA_PKG_USE_CLI_GIT. Options: true | false (default)
      - name: Instantiate NPM
        run: cd docs/; npm i; cd ..
      - uses: julia-actions/julia-docdeploy@v1
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
34 changes: 34 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
# Files generated by invoking Julia with --code-coverage
*.jl.cov
*.jl.*.cov

# Files generated by invoking Julia with --track-allocation
*.jl.mem

# System-specific files and directories generated by the BinaryProvider and BinDeps packages
# They contain absolute paths specific to the host computer, and so should not be committed
deps/deps.jl
deps/build.log
deps/downloads/
deps/usr/
deps/src/

# Build artifacts for creating documentation generated by the Documenter package
docs/build/
docs/site/
docs/src/.vitepress/cache
docs/src/.vitepress/dist
docs/Manifest.toml
docs/.vscode
docs/node_modules/
docs/.vitepress/cache
docs/.vitepress/dist

# File generated by Pkg, the package manager, based on a corresponding Project.toml
# It records a fixed state of all packages used by the project. As such, it should not be
# committed for packages, but should be committed for applications that require a static
# environment.
Manifest.toml

data/
.vscode/
30 changes: 30 additions & 0 deletions Project.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# Package manifest for NeuroTreeModels.jl.
# NOTE(review): the author email was redacted by the page scraper ("[email protected]").
name = "NeuroTreeModels"
uuid = "1db4e0a5-a364-4b0c-897c-2bd5a4a3a1f2"
authors = ["jeremie <[email protected]>"]
version = "0.1.0"


# Direct dependencies (UUIDs as registered in the General registry).
[deps]
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
Flux = "587475ba-b771-5e3f-ad9e-33799f191a9c"
MLJModelInterface = "e80e1ace-859a-464e-9ed9-23947d8ae3ea"
MLUtils = "f1d291b0-491e-4a28-83b9-f70985020b54"
Optimisers = "3bd65402-5787-11e9-1adc-39752487f4e2"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c"
cuDNN = "02a925ec-e4fe-4b08-9a7e-0d78e3d38ccd"

# NOTE(review): only three deps have compat bounds; General-registry auto-merge
# requires bounds for every non-stdlib dependency — confirm before registering.
[compat]
CUDA = "3, 4, 5"
Flux = "0.13, 0.14"
MLUtils = "0.4"

# Test-only dependency.
[extras]
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"

[targets]
test = ["Test"]
56 changes: 56 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
# NeuroTreeModels.jl

Differentiable tree-based models for tabular data.

## Installation

```julia
] add NeuroTreeModels
```

## Configuring a model

A model configuration is defined with the [NeuroTreeRegressor](@ref) constructor:

```julia
using NeuroTreeModels, DataFrames

config = NeuroTreeRegressor(
loss = :mse,
nrounds = 10,
num_trees = 16,
depth = 5,
)
```

## Training

Building and training a model according to the above `config` is done with [NeuroTreeModels.fit](@ref).
See the docs for additional features, notably early stopping support through the tracking of an evaluation metric.

```julia
nobs, nfeats = 1_000, 5
dtrain = DataFrame(randn(nobs, nfeats), :auto)
dtrain.y = rand(nobs)
feature_names, target_name = names(dtrain, r"x"), "y"

m = NeuroTreeModels.fit(config, dtrain; feature_names, target_name)
```

## Inference

```julia
p = m(dtrain)
```

## MLJ

NeuroTreeModels.jl supports the [MLJ](https://github.com/alan-turing-institute/MLJ.jl) Interface.

```julia
using MLJBase, NeuroTreeModels
m = NeuroTreeRegressor(depth=5, nrounds=10)
X, y = @load_boston
mach = machine(m, X, y) |> fit!
p = predict(mach, X)
```
89 changes: 89 additions & 0 deletions benchmarks/Higgs-logloss.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
using Revise
using Random
using CSV
using DataFrames
using StatsBase
using Statistics: mean, std
using NeuroTreeModels
using Solage: Connectors
using AWS: AWSCredentials, AWSConfig, @service

# AWS setup: credentials come from environment variables; data is pulled from S3.
@service S3
aws_creds = AWSCredentials(ENV["AWS_ACCESS_KEY_ID_JDB"], ENV["AWS_SECRET_ACCESS_KEY_JDB"])
aws_config = AWSConfig(; creds=aws_creds, region="ca-central-1")
bucket = "jeremiedb"

# Load the HIGGS dataset (Arrow format); first column is the binary target.
path = "share/data/higgs/HIGGS.arrow"
df_tot = Connectors.read_arrow_aws(path; bucket="jeremiedb", aws_config)

rename!(df_tot, "Column1" => "y")
feature_names = setdiff(names(df_tot), ["y"])
target_name = "y"

# Optional percent-rank feature scaling, kept here disabled for reference.
# function percent_rank(x::AbstractVector{T}) where {T}
# return tiedrank(x) / (length(x) + 1)
# end

# transform!(df_tot, feature_names .=> percent_rank .=> feature_names)

# Positional split: all but the last 1M rows for training, then 500k eval / 500k test.
dtrain = df_tot[1:end-1_000_000, :];
deval = df_tot[end-1_000_000+1:end-500_000, :];
dtest = df_tot[end-500_000+1:end, :];

# Binary-classification config trained on GPU with log-loss.
# NOTE(review): batchsize=8092 — presumably 8192 was intended; confirm.
config = NeuroTreeRegressor(
    device=:gpu,
    loss=:logloss,
    nrounds=200,
    scaler=true,
    outsize=1,
    depth=4,
    lr=2e-3,
    ntrees=128,
    stack_size=2,
    hidden_size=16,
    batchsize=8092,
)

# Fit with early stopping on the eval set (stop after 2 rounds without logloss improvement).
@time m, logger = NeuroTreeModels.fit(
    config,
    dtrain;
    deval,
    target_name,
    feature_names,
    print_every_n=1,
    early_stopping_rounds=2,
    metric=:logloss,
    return_logger=true
);

# Classification error on eval: predictions rounded at the 0.5 threshold.
dinfer_eval = NeuroTreeModels.get_df_loader_infer(deval; feature_names, batchsize=config.batchsize, device=config.device);
p_eval = m(dinfer_eval);
error_eval = 1 - mean(round.(Int, p_eval) .== deval.y)
@info "ERROR - deval" error_eval

# Classification error on the held-out test split.
dinfer_test = NeuroTreeModels.get_df_loader_infer(dtest; feature_names, batchsize=config.batchsize, device=config.device);
p_test = m(dinfer_test);
error_test = 1 - mean(round.(Int, p_test) .== dtest.y)
@info "ERROR - dtest" error_test

# depth:4, num_trees=256, stack_size=2, hidden_size=16, boosting_size=1, batchsize=2048, lr=1e-3
# ┌ Info: iter 30
# └ metric = 0.4679296910762787
# 10128.021110 seconds (806.60 M allocations: 206.595 GiB, 0.40% gc time, 0.00% compilation time)
# ┌ Info: ERROR - dtest
# └ error_test = 0.22794599999999998

# depth:5, num_trees=256, stack_size=1, hidden_size=1, boosting_size=1, batchsize=2048,
# ┌ Info: iter 40
# └ metric = 0.4786278009414673
# 10985.068111 seconds (959.42 M allocations: 259.180 GiB, 0.38% gc time)
# ┌ Info: ERROR - dtest
# └ error_test = 0.23524

# depth:5, num_trees=256, stack_size=3, hidden_size=16, boosting_size=3, batchsize=2048,
# ┌ Info: iter 33
# └ metric = 0.4564650058746338
# 34568.885039 seconds (7.51 G allocations: 1.109 TiB, 1.01% gc time)
# ┌ Info: ERROR - dtest
# └ error_test = 0.22153599999999996

89 changes: 89 additions & 0 deletions benchmarks/MSRank.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
using Revise
using Random
using CSV
using DataFrames
using StatsBase
using Statistics: mean, std
using NeuroTreeModels
using Solage: Connectors
using ReadLIBSVM
using AWS: AWSCredentials, AWSConfig, @service

# https://www.microsoft.com/en-us/research/project/mslr/

@service S3
aws_creds = AWSCredentials(ENV["AWS_ACCESS_KEY_ID_JDB"], ENV["AWS_SECRET_ACCESS_KEY_JDB"])
aws_config = AWSConfig(; creds=aws_creds, region="ca-central-1")
bucket = "jeremiedb"

# initial prep
"""
    read_libsvm_aws(file::String; has_query=false, bucket="jeremiedb", aws_config=AWSConfig())

Download a LIBSVM-format file from S3 and parse it with `read_libsvm`.

# Arguments
- `file`: object key within the bucket (e.g. `"share/data/msrank/train.txt"`).

# Keywords
- `has_query`: forwarded to `read_libsvm`; set `true` when rows carry a query id.
- `bucket`: S3 bucket to read from. Defaults to `"jeremiedb"`, preserving the
  previously hard-coded value (the script's `bucket` variable was unused here).
- `aws_config`: AWS configuration/credentials.

Returns whatever `read_libsvm` returns for the downloaded payload.
"""
function read_libsvm_aws(file::String; has_query=false, bucket="jeremiedb", aws_config=AWSConfig())
    # Request raw bytes so the body is not content-type transformed.
    raw = S3.get_object(bucket, file, Dict("response-content-type" => "application/octet-stream"); aws_config)
    return read_libsvm(raw; has_query)
end

# Download and parse the three MSLR splits (train / validation / test), timing each.
@time train_raw = read_libsvm_aws("share/data/msrank/train.txt"; has_query=true, aws_config);
@time eval_raw = read_libsvm_aws("share/data/msrank/vali.txt"; has_query=true, aws_config);
@time test_raw = read_libsvm_aws("share/data/msrank/test.txt"; has_query=true, aws_config);

# Build DataFrames: features auto-named x1..xN, raw relevance label in :y_raw,
# label scaled by 1/4 into :y, and the query id in :q.
dtrain = DataFrame(train_raw[:x], :auto)
dtrain.y_raw = train_raw[:y]
dtrain.y = dtrain.y_raw ./ 4
dtrain.q = train_raw[:q]

deval = DataFrame(eval_raw[:x], :auto)
deval.y_raw = eval_raw[:y]
deval.y = deval.y_raw ./ 4
deval.q = eval_raw[:q]

dtest = DataFrame(test_raw[:x], :auto)
dtest.y_raw = test_raw[:y]
dtest.y = dtest.y_raw ./ 4
dtest.q = test_raw[:q]

# Train against the raw (unscaled) label; :y and :q are excluded from features.
feature_names = setdiff(names(dtrain), ["y", "y_raw", "q"])
target_name = "y_raw"

"""
    percent_rank(x::AbstractVector)

Map the values of `x` into the open interval (0, 1) using their tied ranks:
`tiedrank(x) / (length(x) + 1)`. Ties receive the mean of the ranks they span.
"""
function percent_rank(x::AbstractVector{T}) where {T}
    denom = length(x) + 1
    ranks = tiedrank(x)
    return ranks / denom
end

# Rescale every feature to (0, 1) via its percent rank.
# NOTE(review): ranks are computed independently within each split — confirm this
# per-split normalization (rather than fitting on train only) is intended.
transform!(dtrain, feature_names .=> percent_rank .=> feature_names)
transform!(deval, feature_names .=> percent_rank .=> feature_names)
transform!(dtest, feature_names .=> percent_rank .=> feature_names)

# Regression config trained on GPU with MSE on the raw 0-4 relevance label.
config = NeuroTreeRegressor(
    device=:gpu,
    loss=:mse,
    nrounds=2,
    actA=:tanh,
    outsize=1,
    depth=4,
    ntrees=64,
    stack_size=2,
    hidden_size=16,
    batchsize=4096,
    lr=3e-4,
)

# Fit with early stopping on the validation split (3 rounds without MSE improvement).
@time m, logger = NeuroTreeModels.fit(
    config,
    dtrain;
    deval,
    target_name,
    feature_names,
    print_every_n=1,
    early_stopping_rounds=3,
    metric=:mse,
    return_logger=true
);

# Report MSE against the raw label on the validation split.
dinfer_eval = NeuroTreeModels.get_df_loader_infer(deval; feature_names, batchsize=config.batchsize, device=config.device);
p_eval = m(dinfer_eval);
mse_eval = mean((p_eval .- deval.y_raw) .^ 2)
@info "MSE - deval" mse_eval

# Report MSE on the held-out test split.
dinfer_test = NeuroTreeModels.get_df_loader_infer(dtest; feature_names, batchsize=config.batchsize, device=config.device);
p_test = m(dinfer_test);
mse_test = mean((p_test .- dtest.y_raw) .^ 2)
@info "MSE - dtest" mse_test
Loading

0 comments on commit 7bd6993

Please sign in to comment.