feat: compile training loop automatically using reactant (#969)

* feat: compile training loop automatically using reactant
* refactor: add a level of indirection for the train_step
* feat: directly compile step + grad function
* fix: make note of current issue with inplace update
* chore: bump minimum reactant version
* test: setup specific reactant test group
* ci: temporarily disable other tests (drop me)
* test: fix installation of Reactant
* test: start adding loss function tests
* fix: xlogx and xlogy now work with Reactant scalars
* feat: support regression losses + tests
* test: classification losses
* fix: more specialization
* fix: support all loss functions
* chore: comments
* fix: bump reactant version
* test: don't run reactant tests on windows
* test: temporarily disable more tests
* fix: reactant GPU support
* fix: remove old LossFunctions.jl dispatches
* test: try using MSELoss directly
* ci: reactivate all tests
* ci(windows): don't test Reactant on windows
LuxDL · Oct 9, 2024 · 1b0d6f8 · 1b0d6f8 · github-actions · Oct 10, 2024
1 parent 77eb5fb
commit 1b0d6f8
Show file tree

Hide file tree

Showing 15 changed files with 487 additions and 33 deletions.
diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml
@@ -42,6 +42,7 @@ jobs:
           - "recurrent_layers"
           - "eltype_match"
           - "fluxcompat"
+          - "reactant"
         include:
           - version: "1.10"
             os: macos-latest

diff --git a/.github/workflows/CIPreRelease.yml b/.github/workflows/CIPreRelease.yml
@@ -32,16 +32,17 @@ jobs:
         os:
           - ubuntu-latest
         test_group:
-          - "core_layers"
-          - "contrib"
-          - "helpers"
-          - "distributed"
-          - "normalize_layers"
-          - "others"
-          - "autodiff"
-          - "recurrent_layers"
-          - "eltype_match"
-          - "fluxcompat"
+          # - "core_layers"
+          # - "contrib"
+          # - "helpers"
+          # - "distributed"
+          # - "normalize_layers"
+          # - "others"
+          # - "autodiff"
+          # - "recurrent_layers"
+          # - "eltype_match"
+          # - "fluxcompat"
+          - "reactant"
     steps:
       - uses: actions/checkout@v4
       - uses: julia-actions/setup-julia@v2

diff --git a/Project.toml b/Project.toml
@@ -46,6 +46,7 @@ LossFunctions = "30fc2ffe-d236-52d8-8643-a9d8f7c094a7"
 MLUtils = "f1d291b0-491e-4a28-83b9-f70985020b54"
 MPI = "da04e1cc-30fd-572f-bb4f-1f8673147195"
 NCCL = "3fe64909-d7a1-4096-9b7d-7a0f12cf0f6b"
+Reactant = "3c362404-f566-11ee-1572-e11a4b42c853"
 ReverseDiff = "37e2e3b7-166d-5795-8a7a-e32c996b4267"
 SimpleChains = "de6bee2f-e2f4-4ec7-b6ed-219cc6f6e9e5"
 Tracker = "9f7883ad-71c0-57eb-9f7f-b5c9e6d3789c"
@@ -59,6 +60,7 @@ LuxLossFunctionsExt = "LossFunctions"
 LuxMLUtilsExt = "MLUtils"
 LuxMPIExt = "MPI"
 LuxMPINCCLExt = ["CUDA", "MPI", "NCCL"]
+LuxReactantExt = ["Enzyme", "Reactant"]
 LuxReverseDiffExt = ["FunctionWrappers", "ReverseDiff"]
 LuxSimpleChainsExt = "SimpleChains"
 LuxTrackerExt = "Tracker"
@@ -68,7 +70,7 @@ LuxZygoteExt = "Zygote"
 ADTypes = "1.8.1"
 Adapt = "4"
 ArgCheck = "2.3"
-ArrayInterface = "7.9"
+ArrayInterface = "7.10"
 CUDA = "5.3.2"
 ChainRulesCore = "1.24"
 Compat = "4.15"
@@ -87,7 +89,7 @@ LinearAlgebra = "1.10"
 LossFunctions = "0.11.1"
 LuxCore = "1"
 LuxLib = "1.3"
-MLDataDevices = "1.1"
+MLDataDevices = "1.2"
 MLUtils = "0.4.4"
 MPI = "0.20.19"
 MacroTools = "0.5.13"
@@ -97,6 +99,7 @@ NNlib = "0.9.24"
 Optimisers = "0.3.3"
 Preferences = "1.4.3"
 Random = "1.10"
+Reactant = "0.2.3"
 Reexport = "1.2.2"
 ReverseDiff = "1.15"
 SIMDTypes = "0.1"

diff --git a/ext/LuxEnzymeExt/training.jl b/ext/LuxEnzymeExt/training.jl
@@ -1,4 +1,4 @@
-function Lux.Training.compute_gradients(
+function Lux.Training.compute_gradients_impl(
         ad::AutoEnzyme, obj_fn::F, data, ts::TrainState) where {F}
     dps = Lux.recursive_make_zero(ts.parameters)
 
@@ -20,9 +20,8 @@ end
 const AUTODIFF_CACHE_TYPE = TrainingBackendCache{
     <:AutoEnzyme, False, PS, <:NamedTuple{(:obj_fn, :st_wrap, :stats_wrap)}} where {PS}
 
-function Lux.Training.compute_gradients(
+function Lux.Training.compute_gradients_impl(
         ::AutoEnzyme, obj_fn::F, data, ts::TrainState{<:AUTODIFF_CACHE_TYPE, F}) where {F}
-    # dps = Lux.recursive_make_zero!!(ts.cache.dparameters)
     Enzyme.make_zero!(ts.cache.dparameters)
     dps = ts.cache.dparameters
 
@@ -36,7 +35,7 @@ function Lux.Training.compute_gradients(
     return dps, loss, ts.cache.extras.stats_wrap[], ts
 end
 
-function Lux.Training.compute_gradients(ad::AutoEnzyme, obj_fn::F, data,
+function Lux.Training.compute_gradients_impl(ad::AutoEnzyme, obj_fn::F, data,
         ts::TrainState{<:TrainingBackendCache{<:AutoEnzyme, False}}) where {F}
     @warn "Detected calls to `compute_gradients(::AutoEnzyme, ...)` with objective \
            function that is changing across function calls. This can lead to the \
@@ -56,7 +55,7 @@ end
 const AUTODIFF_THUNK_CACHE_TYPE = TrainingBackendCache{
     <:AutoEnzyme, False, PS, <:NamedTuple{(:forward, :reverse)}} where {PS}
 
-function Lux.Training.compute_gradients(::AutoEnzyme, obj_fn::F, data,
+function Lux.Training.compute_gradients_impl(::AutoEnzyme, obj_fn::F, data,
         ts::TrainState{<:AUTODIFF_THUNK_CACHE_TYPE, F}) where {F}
     dps = Lux.recursive_make_zero!!(ts.cache.dparameters)
     params = Duplicated(ts.parameters, dps)

diff --git a/ext/LuxReactantExt/LuxReactantExt.jl b/ext/LuxReactantExt/LuxReactantExt.jl
@@ -0,0 +1,14 @@
+module LuxReactantExt
+
+using Enzyme: Enzyme, Const, Duplicated, Active
+using Optimisers: Optimisers
+using Reactant: Reactant, @compile, TracedRArray
+using Setfield: @set!
+using Static: False
+
+using Lux: Lux, LuxOps, Training
+using Lux.Training: TrainingBackendCache, ReactantBackend
+
+include("training.jl")
+
+end
diff --git a/ext/LuxReactantExt/training.jl b/ext/LuxReactantExt/training.jl
@@ -0,0 +1,92 @@
+function Lux.Training.compute_gradients_impl(
+        backend::ReactantBackend, objective_function::F,
+        data, ts::Training.TrainState) where {F}
+    compiled_gradient_function = @compile compute_gradients_internal(
+        objective_function, ts.model, data, ts.parameters, ts.states)
+
+    grads, loss, stats, st = compiled_gradient_function(
+        objective_function, ts.model, data, ts.parameters, ts.states)
+
+    cache = TrainingBackendCache(backend, False(), nothing, (; compiled_gradient_function))
+    @set! ts.cache = cache
+    @set! ts.objective_function = objective_function
+    @set! ts.states = st
+    return grads, loss, stats, ts
+end
+
+function Lux.Training.compute_gradients_impl(::ReactantBackend, obj_fn::F, data,
+        ts::Training.TrainState{<:TrainingBackendCache{ReactantBackend}, F}) where {F}
+    grads, loss, stats, st = ts.cache.extras.compiled_gradient_function(
+        obj_fn, ts.model, data, ts.parameters, ts.states)
+    @set! ts.states = st
+    return grads, loss, stats, ts
+end
+
+function compute_gradients_internal(objective_function::F, model, data, ps, st) where {F}
+    dps = Enzyme.make_zero(ps)
+    _, (loss, stₙ, stats) = Enzyme.autodiff(
+        Enzyme.ReverseWithPrimal, Const(objective_function), Active, Const(model),
+        Duplicated(ps, dps), Const(st), Const(data))
+    return dps, loss, stats, stₙ
+end
+
+for inplace in ("!", "")
+    fname = Symbol(:single_train_step_impl, inplace)
+    internal_fn = Symbol(:compute_gradients_internal_and_step, inplace)
+
+    @eval function Lux.Training.$(fname)(backend::ReactantBackend, objective_function::F,
+            data, ts::Training.TrainState) where {F}
+        compiled_grad_and_step_function = @compile $(internal_fn)(
+            objective_function, ts.model, data, ts.parameters, ts.states,
+            ts.optimizer_state)
+
+        grads, ps, loss, stats, st, opt_state = compiled_grad_and_step_function(
+            objective_function, ts.model, data, ts.parameters, ts.states,
+            ts.optimizer_state)
+
+        cache = TrainingBackendCache(
+            backend, False(), nothing, (; compiled_grad_and_step_function))
+        @set! ts.cache = cache
+        @set! ts.objective_function = objective_function
+        @set! ts.states = st
+        @set! ts.parameters = ps
+        @set! ts.optimizer_state = opt_state
+        @set! ts.step = ts.step + 1
+
+        return grads, loss, stats, ts
+    end
+
+    @eval function Lux.Training.$(fname)(::ReactantBackend, obj_fn::F, data,
+            ts::Training.TrainState{<:TrainingBackendCache{ReactantBackend}, F}) where {F}
+        grads, ps, loss, stats, st, opt_state = ts.cache.extras.compiled_grad_and_step_function(
+            obj_fn, ts.model, data, ts.parameters, ts.states, ts.optimizer_state)
+
+        @set! ts.states = st
+        @set! ts.parameters = ps
+        @set! ts.optimizer_state = opt_state
+        @set! ts.step = ts.step + 1
+
+        return grads, loss, stats, ts
+    end
+end
+
+function compute_gradients_internal_and_step(objective_function::F, model, data, ps,
+        st, opt_state) where {F}
+    dps = Enzyme.make_zero(ps)
+    _, (loss, stₙ, stats) = Enzyme.autodiff(
+        Enzyme.ReverseWithPrimal, Const(objective_function), Active, Const(model),
+        Duplicated(ps, dps), Const(st), Const(data))
+    opt_state, ps = Optimisers.update(opt_state, ps, dps)
+    return dps, ps, loss, stats, stₙ, opt_state
+end
+
+function compute_gradients_internal_and_step!(objective_function::F, model, data, ps,
+        st, opt_state) where {F}
+    dps = Enzyme.make_zero(ps)
+    _, (loss, stₙ, stats) = Enzyme.autodiff(
+        Enzyme.ReverseWithPrimal, Const(objective_function), Active, Const(model),
+        Duplicated(ps, dps), Const(st), Const(data))
+    # XXX: Inplace updates not actually inplace
+    opt_state, ps = Optimisers.update!(opt_state, ps, dps)
+    return dps, ps, loss, stats, stₙ, opt_state
+end
diff --git a/ext/LuxReverseDiffExt/training.jl b/ext/LuxReverseDiffExt/training.jl
@@ -1,13 +1,13 @@
 # Uncompiled ReverseDiff
-function Lux.Training.compute_gradients(
+function Lux.Training.compute_gradients_impl(
         ad::AutoReverseDiff{false}, obj_fn::F, data, ts::TrainState) where {F}
     @set! ts.cache = TrainingBackendCache(
         ad, True(), Lux.recursive_make_zero(ts.parameters), nothing)
     @set! ts.objective_function = obj_fn
     return Lux.Training.compute_gradients(ad, obj_fn, data, ts)
 end
 
-function Lux.Training.compute_gradients(::AutoReverseDiff{false}, obj_fn::F, data,
+function Lux.Training.compute_gradients_impl(::AutoReverseDiff{false}, obj_fn::F, data,
         ts::TrainState{<:TrainingBackendCache{AutoReverseDiff{false}}}) where {F}
     dparams = Training.dparameters(ts.cache)
     tape = ReverseDiff.InstructionTape()
@@ -24,7 +24,7 @@ function Lux.Training.compute_gradients(::AutoReverseDiff{false}, obj_fn::F, dat
 end
 
 # Compiled ReverseDiff
-function Lux.Training.compute_gradients(
+function Lux.Training.compute_gradients_impl(
         ad::AutoReverseDiff{true}, obj_fn::F, data, ts::TrainState) where {F}
     @set! ts.cache = TrainingBackendCache(
         ad, True(), Lux.recursive_make_zero(ts.parameters),
@@ -35,7 +35,7 @@ function Lux.Training.compute_gradients(
 end
 
 ## Tape hasn't been compiled yet / Function mismatch so recompile
-function Lux.Training.compute_gradients(ad::AutoReverseDiff{true}, obj_fn::F, data,
+function Lux.Training.compute_gradients_impl(ad::AutoReverseDiff{true}, obj_fn::F, data,
         ts::TrainState{<:TrainingBackendCache{AutoReverseDiff{true}}}) where {F}
     if LuxCore.statelength(ts.states) != 0
         throw(ArgumentError("AutoReverseDiff(; compile=true) is not supported for Lux \
@@ -82,7 +82,7 @@ function Lux.Training.compute_gradients(ad::AutoReverseDiff{true}, obj_fn::F, da
     return dparams, ReverseDiff.value(loss), NamedTuple(), ts
 end
 
-function Lux.Training.compute_gradients(::AutoReverseDiff{true}, obj_fn::F, data,
+function Lux.Training.compute_gradients_impl(::AutoReverseDiff{true}, obj_fn::F, data,
         ts::TrainState{<:TrainingBackendCache{AutoReverseDiff{true}}, F}) where {F}
     (; ps_cache, data_cache, output) = ts.cache.extras
 

diff --git a/ext/LuxTrackerExt/training.jl b/ext/LuxTrackerExt/training.jl
@@ -1,4 +1,4 @@
-function Lux.Training.compute_gradients(::AutoTracker, obj_fn::F, data,
+function Lux.Training.compute_gradients_impl(::AutoTracker, obj_fn::F, data,
         ts::TrainState{<:TrainingBackendCache{AutoTracker}}) where {F}
     dps = Training.dparameters(ts.cache)
     ps_tracked = construct_tracked_params(ts.parameters, dps)
@@ -13,7 +13,7 @@ function Lux.Training.compute_gradients(::AutoTracker, obj_fn::F, data,
     return dps, loss.data, stats, ts
 end
 
-function Lux.Training.compute_gradients(
+function Lux.Training.compute_gradients_impl(
         ad::AutoTracker, obj_fn::F, data, ts::TrainState) where {F}
     grads = Lux.recursive_make_zero(ts.parameters)
     cache = TrainingBackendCache(ad, True(), grads, nothing)

diff --git a/ext/LuxZygoteExt/training.jl b/ext/LuxZygoteExt/training.jl
@@ -1,4 +1,4 @@
-function Lux.Training.compute_gradients(
+function Lux.Training.compute_gradients_impl(
         ::AutoZygote, objective_function::F, data, ts::Lux.Training.TrainState) where {F}
     (loss, st, stats), back = Zygote.pullback(
         objective_function, ts.model, ts.parameters, ts.states, data)

diff --git a/src/helpers/losses.jl b/src/helpers/losses.jl
@@ -120,7 +120,8 @@ function huber_loss(x::T1, y::T2, δ::T3) where {T1, T2, T3}
     T = promote_type(T1, T2, T3)
     diff = x - y
     abs_diff = abs(diff)
-    return ifelse(abs_diff ≤ δ, T(0.5) * abs2(diff), δ * (abs_diff - T(0.5) * δ))
+    return ifelse(
+        abs_diff ≤ δ, convert(T, 0.5) * abs2(diff), δ * (abs_diff - convert(T, 0.5) * δ))
 end
 has_custom_derivative(::typeof(huber_loss)) = true
 function derivative(::typeof(huber_loss), x::T, y::T2, δ::T3) where {T, T2, T3}
@@ -148,7 +149,7 @@ function derivative(::typeof(l2_hinge_loss), x::T1, y::T2) where {T1, T2}
 end
 
 function siamese_contrastive_loss(x::T1, y::T2, margin=true) where {T1, T2}
-    return (true - y) * x^2 + y * max(promote_type(T1, T2)(false), margin - x)^2
+    return (true - y) * x^2 + y * max(convert(promote_type(T1, T2), false), margin - x)^2
 end
 
 poisson_loss(x::T1, y::T2, ϵ) where {T1, T2} = x - xlogy(y, x + get_ϵ(T1, ϵ))

diff --git a/src/helpers/training.jl b/src/helpers/training.jl
@@ -10,6 +10,7 @@ using Static: StaticBool, Static, False, True
 
 using ..Lux: Lux
 using LuxCore: LuxCore, AbstractLuxLayer
+using MLDataDevices: XLADevice, get_device_type, get_device, cpu_device
 
 """
     TrainState
@@ -61,7 +62,13 @@ Constructor for [`TrainState`](@ref).
 [`TrainState`](@ref) object.
 """
 function TrainState(model::AbstractLuxLayer, ps, st, optimizer::Optimisers.AbstractRule)
-    st_opt = Optimisers.setup(optimizer, ps)
+    dev = get_device(ps)
+    st_opt = if dev isa XLADevice
+        ps_cpu = ps |> cpu_device()
+        Optimisers.setup(optimizer, ps_cpu) |> dev
+    else
+        Optimisers.setup(optimizer, ps)
+    end
     return TrainState(nothing, nothing, model, ps, st, optimizer, st_opt, 0)
 end
 
@@ -96,6 +103,8 @@ function Base.show(io::IO, ::MIME"text/plain", ts::TrainState)
         print(io, "\n    objective_function: ", nameof(typeof(ts.objective_function)))
 end
 
+struct ReactantBackend end
+
 const APPLY_GRAD_DOCSTRING = """
 ## Arguments
 
@@ -183,7 +192,20 @@ A 4-Tuple containing:
     returned in step `i + 1` might be aliased by the old gradients. If you want to prevent
     this, simply use `copy(grads)` or `deepcopy(grads)` to make a copy of the gradients.
 """
-function compute_gradients(ad::AbstractADType, ::F, _, ::TrainState) where {F}
+function compute_gradients(ad, obj_fn::F, data, ts::TrainState) where {F}
+    dev_type = get_device_type((ts.parameters, ts.states))
+    return compute_gradients_impl(maybe_wrap_adtype(ad, dev_type), obj_fn, data, ts)
+end
+
+maybe_wrap_adtype(backend::ReactantBackend, _) = backend
+maybe_wrap_adtype(ad::AbstractADType, _) = ad
+function maybe_wrap_adtype(ad::AbstractADType, ::Type{XLADevice})
+    ad isa AutoEnzyme && return ReactantBackend()
+    throw(ArgumentError("Computing gradients for models on XLA is supported only with \
+                         Enzyme.jl (`AutoEnzyme`)."))
+end
+
+function compute_gradients_impl(ad, ::F, _, ts::TrainState) where {F}
     return check_if_compute_gradients_implemented(ad)
 end
 
@@ -192,6 +214,10 @@ function check_if_compute_gradients_implemented(::T) where {T <: AbstractADType}
                          yet!"))
 end
 
+function check_if_compute_gradients_implemented(::ReactantBackend)
+    throw(ArgumentError("Load `Reactant` with `using Reactant` before using this function!"))
+end
+
 for package in (:Zygote, :Tracker, :ReverseDiff, :Enzyme)
     adtype = Symbol(:Auto, package)
     msg = "Load `$(package)` with `using $(package)`/`import $(package)` before using this \
@@ -244,7 +270,10 @@ only the parameters in `ts` are updated inplace. Users should be using the retur
 object for further training steps, else there is no caching and performance will be
 suboptimal (and absolutely terrible for backends like `AutoReactant`).
 """
-function single_train_step! end
+function single_train_step!(backend, obj_fn::F, data, ts::TrainState) where {F}
+    backend = maybe_wrap_adtype(backend, get_device_type((ts.parameters, ts.states)))
+    return single_train_step_impl!(backend, obj_fn, data, ts)
+end
 
 """
     single_train_step(backend, obj_fn::F, data, ts::TrainState)
@@ -259,10 +288,14 @@ In most cases you should use [`single_train_step!`](@ref) instead of this functi
 
 Returned values are the same as [`compute_gradients`](@ref).
 """
-function single_train_step end
+function single_train_step(backend, obj_fn::F, data, ts::TrainState) where {F}
+    backend = maybe_wrap_adtype(backend, get_device_type((ts.parameters, ts.states)))
+    return single_train_step_impl(backend, obj_fn, data, ts)
+end
 
 for inplace in ("!", "")
-    step, apply_fn = Symbol(:single_train_step, inplace), Symbol(:apply_gradients, inplace)
+    step = Symbol(:single_train_step_impl, inplace)
+    apply_fn = Symbol(:apply_gradients, inplace)
     @eval function $(step)(backend, obj_fn::F, data, ts::TrainState) where {F}
         grads, loss, stats, ts = compute_gradients(backend, obj_fn, data, ts)
         ts = $(apply_fn)(ts, grads)