diff --git a/Project.toml b/Project.toml index f390512ca..53be72a2a 100644 --- a/Project.toml +++ b/Project.toml @@ -1,6 +1,6 @@ name = "Turing" uuid = "fce5fe82-541a-59a6-adf8-730c64b5f9a0" -version = "0.21.10" +version = "0.21.11" [deps] AbstractMCMC = "80f14c24-f653-4e6a-9b94-39d6b0f70001" @@ -11,7 +11,6 @@ AdvancedVI = "b5ca4192-6429-45e5-a2d9-87aec30a685c" BangBang = "198e06fe-97b7-11e9-32a5-e1d131e6ad66" Bijectors = "76274a88-744f-5084-9051-94815aaf08c4" DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8" -DiffResults = "163ba53b-c6d8-5494-b064-1a9d43ac40c5" Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f" DistributionsAD = "ced4e74d-a319-5a8a-b0ac-84af2272839c" DocStringExtensions = "ffbed154-4ef7-542d-bbb7-c09d3a79fcae" @@ -20,6 +19,7 @@ EllipticalSliceSampling = "cad2338a-1db2-11e9-3401-43bc07c9ede2" ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210" Libtask = "6f1fad26-d15e-5dc8-ae53-837a1d7b8c9f" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" +LogDensityProblems = "6fdf6af0-433a-55f7-b3ed-c6c6e0b8df7c" MCMCChains = "c7f686f2-ff18-58e9-bc7b-31028e88f75d" NamedArrays = "86f7a689-2022-50b4-a561-43c23ac3c673" Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7" @@ -32,7 +32,6 @@ Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" StatsFuns = "4c63d2b9-4356-54db-8cca-17b64c39e42c" Tracker = "9f7883ad-71c0-57eb-9f7f-b5c9e6d3789c" -ZygoteRules = "700de1a5-db45-46bc-99cf-38207098b444" [compat] AbstractMCMC = "4" @@ -43,7 +42,6 @@ AdvancedVI = "0.1" BangBang = "0.3" Bijectors = "0.8, 0.9, 0.10" DataStructures = "0.18" -DiffResults = "1" Distributions = "0.23.3, 0.24, 0.25" DistributionsAD = "0.6" DocStringExtensions = "0.8, 0.9" @@ -51,6 +49,7 @@ DynamicPPL = "0.20" EllipticalSliceSampling = "0.5, 1" ForwardDiff = "0.10.3" Libtask = "0.6.7, 0.7" +LogDensityProblems = "0.12" MCMCChains = "5" NamedArrays = "0.9" Reexport = "0.2, 1" @@ -60,5 +59,4 @@ SpecialFunctions = "0.7.2, 0.8, 0.9, 0.10, 1, 2" StatsBase = "0.32, 0.33" StatsFuns = "0.8, 0.9, 1" Tracker = "0.2.3" -ZygoteRules = "0.2" julia = "1.6" diff --git a/docs/src/using-turing/autodiff.md b/docs/src/using-turing/autodiff.md index dd9e2976d..44eff6133 100644 --- a/docs/src/using-turing/autodiff.md +++ b/docs/src/using-turing/autodiff.md @@ -10,7 +10,7 @@ title: Automatic Differentiation Turing supports four packages of automatic differentiation (AD) in the back end during sampling. The default AD backend is [ForwardDiff](https://github.com/JuliaDiff/ForwardDiff.jl) for forward-mode AD. Three reverse-mode AD backends are also supported, namely [Tracker](https://github.com/FluxML/Tracker.jl), [Zygote](https://github.com/FluxML/Zygote.jl) and [ReverseDiff](https://github.com/JuliaDiff/ReverseDiff.jl). `Zygote` and `ReverseDiff` are supported optionally if explicitly loaded by the user with `using Zygote` or `using ReverseDiff` next to `using Turing`. -To switch between the different AD backends, one can call function `Turing.setadbackend(backend_sym)`, where `backend_sym` can be `:forwarddiff` (`ForwardDiff`), `:tracker` (`Tracker`), `:zygote` (`Zygote`) or `:reversediff` (`ReverseDiff.jl`). When using `ReverseDiff`, to compile the tape only once and cache it for later use, the user needs to load [Memoization.jl](https://github.com/marius311/Memoization.jl) first with `using Memoization` then call `Turing.setrdcache(true)`. However, note that the use of caching in certain types of models can lead to incorrect results and/or errors. 
Models for which the compiled tape can be safely cached are models with fixed size loops and no run-time if statements. Compile-time if statements are fine. To empty the cache, you can call `Turing.emptyrdcache()`. +To switch between the different AD backends, one can call function `Turing.setadbackend(backend_sym)`, where `backend_sym` can be `:forwarddiff` (`ForwardDiff`), `:tracker` (`Tracker`), `:zygote` (`Zygote`) or `:reversediff` (`ReverseDiff.jl`). When using `ReverseDiff`, to compile the tape only once and cache it for later use, the user has to call `Turing.setrdcache(true)`. However, note that the use of caching in certain types of models can lead to incorrect results and/or errors. Models for which the compiled tape can be safely cached are models with fixed size loops and no run-time if statements. Compile-time if statements are fine. ## Compositional Sampling with Differing AD Modes diff --git a/src/Turing.jl b/src/Turing.jl index 86eeb9be3..fd9f1b861 100644 --- a/src/Turing.jl +++ b/src/Turing.jl @@ -9,6 +9,7 @@ using Tracker: Tracker import AdvancedVI import DynamicPPL: getspace, NoDist, NamedDist +import LogDensityProblems import Random const PROGRESS = Ref(true) @@ -37,12 +38,20 @@ function (f::LogDensityFunction)(θ::AbstractVector) return getlogp(last(DynamicPPL.evaluate!!(f.model, VarInfo(f.varinfo, f.sampler, θ), f.sampler, f.context))) end +# LogDensityProblems interface +LogDensityProblems.logdensity(f::LogDensityFunction, θ::AbstractVector) = f(θ) +LogDensityProblems.dimension(f::LogDensityFunction) = length(f.varinfo[f.sampler]) +function LogDensityProblems.capabilities(::Type{<:LogDensityFunction}) + return LogDensityProblems.LogDensityOrder{0}() +end + # Standard tag: Improves stacktraces # Ref: https://www.stochasticlifestyle.com/improved-forwarddiff-jl-stacktraces-with-package-tags/ struct TuringTag end # Allow Turing tag in gradient etc. calls of the log density function ForwardDiff.checktag(::Type{ForwardDiff.Tag{TuringTag, V}}, ::LogDensityFunction, ::AbstractArray{V}) where {V} = true +ForwardDiff.checktag(::Type{ForwardDiff.Tag{TuringTag, V}}, ::Base.Fix1{typeof(LogDensityProblems.logdensity),<:LogDensityFunction}, ::AbstractArray{V}) where {V} = true # Random probability measures. include("stdlib/distributions.jl") diff --git a/src/contrib/inference/dynamichmc.jl b/src/contrib/inference/dynamichmc.jl index 6a9207217..fd7c5a6e3 100644 --- a/src/contrib/inference/dynamichmc.jl +++ b/src/contrib/inference/dynamichmc.jl @@ -19,25 +19,6 @@ DynamicNUTS{AD}(space::Symbol...) where AD = DynamicNUTS{AD, space}() DynamicPPL.getspace(::DynamicNUTS{<:Any, space}) where {space} = space -# Only define traits for `DynamicNUTS` sampler to avoid type piracy and surprises -# TODO: Implement generally with `LogDensityProblems` -const DynamicHMCLogDensity{M<:Model,S<:Sampler{<:DynamicNUTS},V<:AbstractVarInfo} = Turing.LogDensityFunction{V,M,S,DynamicPPL.DefaultContext} - -function DynamicHMC.dimension(ℓ::DynamicHMCLogDensity) - return length(ℓ.varinfo[ℓ.sampler]) -end - -function DynamicHMC.capabilities(::Type{<:DynamicHMCLogDensity}) - return DynamicHMC.LogDensityOrder{1}() -end - -function DynamicHMC.logdensity_and_gradient( - ℓ::DynamicHMCLogDensity, - x::AbstractVector, -) - return gradient_logp(x, ℓ.varinfo, ℓ.model, ℓ.sampler, ℓ.context) -end - """ DynamicNUTSState @@ -46,9 +27,10 @@ State of the [`DynamicNUTS`](@ref) sampler. 
# Fields $(TYPEDFIELDS) """ -struct DynamicNUTSState{V<:AbstractVarInfo,C,M,S} +struct DynamicNUTSState{L,V<:AbstractVarInfo,C,M,S} + logdensity::L vi::V - "Cache of sample, log density, and gradient of log density." + "Cache of sample, log density, and gradient of log density evaluation." cache::C metric::M stepsize::S @@ -61,10 +43,10 @@ function gibbs_state( state::DynamicNUTSState, varinfo::AbstractVarInfo, ) - # Update the previous evaluation. - ℓ = Turing.LogDensityFunction(varinfo, model, spl, DynamicPPL.DefaultContext()) + # Update the log density function and its cached evaluation. + ℓ = LogDensityProblems.ADgradient(Turing.LogDensityFunction(varinfo, model, spl, DynamicPPL.DefaultContext())) Q = DynamicHMC.evaluate_ℓ(ℓ, varinfo[spl]) - return DynamicNUTSState(varinfo, Q, state.metric, state.stepsize) + return DynamicNUTSState(ℓ, varinfo, Q, state.metric, state.stepsize) end DynamicPPL.initialsampler(::Sampler{<:DynamicNUTS}) = SampleFromUniform() @@ -82,10 +64,13 @@ function DynamicPPL.initialstep( model(rng, vi, spl) end + # Define log-density function. + ℓ = LogDensityProblems.ADgradient(Turing.LogDensityFunction(vi, model, spl, DynamicPPL.DefaultContext())) + # Perform initial step. results = DynamicHMC.mcmc_keep_warmup( rng, - Turing.LogDensityFunction(vi, model, spl, DynamicPPL.DefaultContext()), + ℓ, 0; initialization = (q = vi[spl],), reporter = DynamicHMC.NoProgressReport(), @@ -99,7 +84,7 @@ function DynamicPPL.initialstep( # Create first sample and state. sample = Transition(vi) - state = DynamicNUTSState(vi, Q, steps.H.κ, steps.ϵ) + state = DynamicNUTSState(ℓ, vi, Q, steps.H.κ, steps.ϵ) return sample, state end @@ -113,7 +98,7 @@ function AbstractMCMC.step( ) # Compute next sample. vi = state.vi - ℓ = Turing.LogDensityFunction(vi, model, spl, DynamicPPL.DefaultContext()) + ℓ = state.logdensity steps = DynamicHMC.mcmc_steps( rng, DynamicHMC.NUTS(), @@ -129,7 +114,7 @@ function AbstractMCMC.step( # Create next sample and state. sample = Transition(vi) - newstate = DynamicNUTSState(vi, Q, state.metric, state.stepsize) + newstate = DynamicNUTSState(ℓ, vi, Q, state.metric, state.stepsize) return sample, newstate end diff --git a/src/contrib/inference/sghmc.jl b/src/contrib/inference/sghmc.jl index d026b3656..0cb42c8a4 100644 --- a/src/contrib/inference/sghmc.jl +++ b/src/contrib/inference/sghmc.jl @@ -41,7 +41,8 @@ function SGHMC{AD}( return SGHMC{AD,space,typeof(_learning_rate)}(_learning_rate, _momentum_decay) end -struct SGHMCState{V<:AbstractVarInfo, T<:AbstractVector{<:Real}} +struct SGHMCState{L,V<:AbstractVarInfo, T<:AbstractVector{<:Real}} + logdensity::L vi::V velocity::T end @@ -61,7 +62,8 @@ function DynamicPPL.initialstep( # Compute initial sample and state. sample = Transition(vi) - state = SGHMCState(vi, zero(vi[spl])) + ℓ = LogDensityProblems.ADgradient(Turing.LogDensityFunction(vi, model, spl, DynamicPPL.DefaultContext())) + state = SGHMCState(ℓ, vi, zero(vi[spl])) return sample, state end @@ -74,9 +76,10 @@ function AbstractMCMC.step( kwargs... ) # Compute gradient of log density. + ℓ = state.logdensity vi = state.vi θ = vi[spl] - _, grad = gradient_logp(θ, vi, model, spl) + grad = last(LogDensityProblems.logdensity_and_gradient(ℓ, θ)) # Update latent variables and velocity according to # equation (15) of Chen et al. (2014) @@ -92,7 +95,7 @@ function AbstractMCMC.step( # Compute next sample and state. 
sample = Transition(vi) - newstate = SGHMCState(vi, newv) + newstate = SGHMCState(ℓ, vi, newv) return sample, newstate end @@ -191,7 +194,8 @@ metadata(t::SGLDTransition) = (lp = t.lp, SGLD_stepsize = t.stepsize) DynamicPPL.getlogp(t::SGLDTransition) = t.lp -struct SGLDState{V<:AbstractVarInfo} +struct SGLDState{L,V<:AbstractVarInfo} + logdensity::L vi::V step::Int end @@ -211,7 +215,8 @@ function DynamicPPL.initialstep( # Create first sample and state. sample = SGLDTransition(vi, zero(spl.alg.stepsize(0))) - state = SGLDState(vi, 1) + ℓ = LogDensityProblems.ADgradient(Turing.LogDensityFunction(vi, model, spl, DynamicPPL.DefaultContext())) + state = SGLDState(ℓ, vi, 1) return sample, state end @@ -224,9 +229,10 @@ function AbstractMCMC.step( kwargs... ) # Perform gradient step. + ℓ = state.logdensity vi = state.vi θ = vi[spl] - _, grad = gradient_logp(θ, vi, model, spl) + grad = last(LogDensityProblems.logdensity_and_gradient(ℓ, θ)) step = state.step stepsize = spl.alg.stepsize(step) θ .+= (stepsize / 2) .* grad .+ sqrt(stepsize) .* randn(rng, eltype(θ), length(θ)) @@ -237,7 +243,7 @@ function AbstractMCMC.step( # Compute next sample and state. sample = SGLDTransition(vi, stepsize) - newstate = SGLDState(vi, state.step + 1) + newstate = SGLDState(ℓ, vi, state.step + 1) return sample, newstate end diff --git a/src/essential/Essential.jl b/src/essential/Essential.jl index d18a6e7ab..df0a9b5ac 100644 --- a/src/essential/Essential.jl +++ b/src/essential/Essential.jl @@ -13,22 +13,13 @@ import Bijectors: link, invlink using AdvancedVI using StatsFuns: logsumexp, softmax @reexport using DynamicPPL -using Requires import AdvancedPS -import DiffResults -import ZygoteRules +import LogDensityProblems include("container.jl") include("ad.jl") -function __init__() - @require ReverseDiff = "37e2e3b7-166d-5795-8a7a-e32c996b4267" begin - include("compat/reversediff.jl") - export ReverseDiffAD, getrdcache, setrdcache, emptyrdcache - end -end - export @model, @varname, generate_observe, @@ -53,11 +44,13 @@ export @model, ForwardDiffAD, TrackerAD, ZygoteAD, + ReverseDiffAD, value, - gradient_logp, CHUNKSIZE, ADBACKEND, setchunksize, + setrdcache, + getrdcache, verifygrad, @logprob_str, @prob_str diff --git a/src/essential/ad.jl b/src/essential/ad.jl index 0e4005395..b56ce0140 100644 --- a/src/essential/ad.jl +++ b/src/essential/ad.jl @@ -18,6 +18,9 @@ end function _setadbackend(::Val{:zygote}) ADBACKEND[] = :zygote end +function _setadbackend(::Val{:reversediff}) + ADBACKEND[] = :reversediff +end const ADSAFE = Ref(false) function setadsafe(switch::Bool) @@ -39,9 +42,7 @@ struct ForwardDiffAD{chunk,standardtag} <: ADBackend end # Use standard tag if not specified otherwise ForwardDiffAD{N}() where {N} = ForwardDiffAD{N,true}() -getchunksize(::Type{<:ForwardDiffAD{chunk}}) where chunk = chunk -getchunksize(::Type{<:Sampler{Talg}}) where Talg = getchunksize(Talg) -getchunksize(::Type{SampleFromPrior}) = CHUNKSIZE[] +getchunksize(::ForwardDiffAD{chunk}) where chunk = chunk standardtag(::ForwardDiffAD{<:Any,true}) = true standardtag(::ForwardDiffAD) = false @@ -49,12 +50,24 @@ standardtag(::ForwardDiffAD) = false struct TrackerAD <: ADBackend end struct ZygoteAD <: ADBackend end +struct ReverseDiffAD{cache} <: ADBackend end + +const RDCache = Ref(false) + +setrdcache(b::Bool) = setrdcache(Val(b)) +setrdcache(::Val{false}) = RDCache[] = false +setrdcache(::Val{true}) = RDCache[] = true + +getrdcache() = RDCache[] + ADBackend() = ADBackend(ADBACKEND[]) ADBackend(T::Symbol) = ADBackend(Val(T)) 
ADBackend(::Val{:forwarddiff}) = ForwardDiffAD{CHUNKSIZE[]} ADBackend(::Val{:tracker}) = TrackerAD ADBackend(::Val{:zygote}) = ZygoteAD +ADBackend(::Val{:reversediff}) = ReverseDiffAD{getrdcache()} + ADBackend(::Val) = error("The requested AD backend is not available. Make sure to load all required packages.") """ @@ -63,54 +76,15 @@ ADBackend(::Val) = error("The requested AD backend is not available. Make sure t Find the autodifferentiation backend of the algorithm `alg`. """ getADbackend(spl::Sampler) = getADbackend(spl.alg) -getADbackend(spl::SampleFromPrior) = ADBackend()() +getADbackend(::SampleFromPrior) = ADBackend()() -""" - gradient_logp( - θ::AbstractVector{<:Real}, - vi::VarInfo, - model::Model, - sampler::AbstractSampler, - ctx::DynamicPPL.AbstractContext = DynamicPPL.DefaultContext() - ) - -Computes the value of the log joint of `θ` and its gradient for the model -specified by `(vi, sampler, model)` using whichever automatic differentation -tool is currently active. -""" -function gradient_logp( - θ::AbstractVector{<:Real}, - vi::VarInfo, - model::Model, - sampler::AbstractSampler, - ctx::DynamicPPL.AbstractContext = DynamicPPL.DefaultContext() -) - return gradient_logp(getADbackend(sampler), θ, vi, model, sampler, ctx) +function LogDensityProblems.ADgradient(ℓ::Turing.LogDensityFunction) + return LogDensityProblems.ADgradient(getADbackend(ℓ.sampler), ℓ) end -""" -gradient_logp( - backend::ADBackend, - θ::AbstractVector{<:Real}, - vi::VarInfo, - model::Model, - sampler::AbstractSampler = SampleFromPrior(), - ctx::DynamicPPL.AbstractContext = DynamicPPL.DefaultContext() -) - -Compute the value of the log joint of `θ` and its gradient for the model -specified by `(vi, sampler, model)` using `backend` for AD, e.g. `ForwardDiffAD{N}()` uses `ForwardDiff.jl` with chunk size `N`, `TrackerAD()` uses `Tracker.jl` and `ZygoteAD()` uses `Zygote.jl`. -""" -function gradient_logp( - ad::ForwardDiffAD, - θ::AbstractVector{<:Real}, - vi::VarInfo, - model::Model, - sampler::AbstractSampler=SampleFromPrior(), - context::DynamicPPL.AbstractContext = DynamicPPL.DefaultContext() -) - # Define log density function. - f = Turing.LogDensityFunction(vi, model, sampler, context) +function LogDensityProblems.ADgradient(ad::ForwardDiffAD, ℓ::Turing.LogDensityFunction) + θ = ℓ.varinfo[ℓ.sampler] + f = Base.Fix1(LogDensityProblems.logdensity, ℓ) # Define configuration for ForwardDiff. tag = if standardtag(ad) @@ -118,58 +92,30 @@ function gradient_logp( else ForwardDiff.Tag(f, eltype(θ)) end - chunk_size = getchunksize(typeof(ad)) + chunk_size = getchunksize(ad) config = if chunk_size == 0 ForwardDiff.GradientConfig(f, θ, ForwardDiff.Chunk(θ), tag) else ForwardDiff.GradientConfig(f, θ, ForwardDiff.Chunk(length(θ), chunk_size), tag) end - # Obtain both value and gradient of the log density function. - out = DiffResults.GradientResult(θ) - ForwardDiff.gradient!(out, f, θ, config) - logp = DiffResults.value(out) - ∂logp∂θ = DiffResults.gradient(out) + return LogDensityProblems.ADgradient(Val(:ForwardDiff), ℓ; gradientconfig=config) +end - return logp, ∂logp∂θ +function LogDensityProblems.ADgradient(::TrackerAD, ℓ::Turing.LogDensityFunction) + return LogDensityProblems.ADgradient(Val(:Tracker), ℓ) end -function gradient_logp( - ::TrackerAD, - θ::AbstractVector{<:Real}, - vi::VarInfo, - model::Model, - sampler::AbstractSampler = SampleFromPrior(), - context::DynamicPPL.AbstractContext = DynamicPPL.DefaultContext() -) - # Define log density function. 
- f = Turing.LogDensityFunction(vi, model, sampler, context) - - # Compute forward pass and pullback. - l_tracked, ȳ = Tracker.forward(f, θ) - - # Remove tracking info. - l::typeof(getlogp(vi)) = Tracker.data(l_tracked) - ∂l∂θ::typeof(θ) = Tracker.data(only(ȳ(1))) - - return l, ∂l∂θ + +function LogDensityProblems.ADgradient(::ZygoteAD, ℓ::Turing.LogDensityFunction) + return LogDensityProblems.ADgradient(Val(:Zygote), ℓ) end -function gradient_logp( - backend::ZygoteAD, - θ::AbstractVector{<:Real}, - vi::VarInfo, - model::Model, - sampler::AbstractSampler = SampleFromPrior(), - context::DynamicPPL.AbstractContext = DynamicPPL.DefaultContext() -) - # Define log density function. - f = Turing.LogDensityFunction(vi, model, sampler, context) - - # Compute forward pass and pullback. - l::typeof(getlogp(vi)), ȳ = ZygoteRules.pullback(f, θ) - ∂l∂θ::typeof(θ) = only(ȳ(1)) - - return l, ∂l∂θ +for cache in (:true, :false) + @eval begin + function LogDensityProblems.ADgradient(::ReverseDiffAD{$cache}, ℓ::Turing.LogDensityFunction) + return LogDensityProblems.ADgradient(Val(:ReverseDiff), ℓ; compile=Val($cache)) + end + end end function verifygrad(grad::AbstractVector{<:Real}) diff --git a/src/essential/compat/reversediff.jl b/src/essential/compat/reversediff.jl deleted file mode 100644 index cc077c5e0..000000000 --- a/src/essential/compat/reversediff.jl +++ /dev/null @@ -1,80 +0,0 @@ -using .ReverseDiff: compile, GradientTape - -struct ReverseDiffAD{cache} <: ADBackend end -const RDCache = Ref(false) -setrdcache(b::Bool) = setrdcache(Val(b)) -setrdcache(::Val{false}) = RDCache[] = false -setrdcache(::Val) = throw("Memoization.jl is not loaded. Please load it before setting the cache to true.") -function emptyrdcache end - -getrdcache() = RDCache[] -ADBackend(::Val{:reversediff}) = ReverseDiffAD{getrdcache()} -function _setadbackend(::Val{:reversediff}) - ADBACKEND[] = :reversediff -end - -function gradient_logp( - backend::ReverseDiffAD{false}, - θ::AbstractVector{<:Real}, - vi::VarInfo, - model::Model, - sampler::AbstractSampler = SampleFromPrior(), - context::DynamicPPL.AbstractContext = DynamicPPL.DefaultContext() -) - # Define log density function. - f = Turing.LogDensityFunction(vi, model, sampler, context) - - # Obtain both value and gradient of the log density function. - tp, result = taperesult(f, θ) - ReverseDiff.gradient!(result, tp, θ) - logp = DiffResults.value(result) - ∂logp∂θ = DiffResults.gradient(result) - - return logp, ∂logp∂θ -end - -tape(f, x) = GradientTape(f, x) -taperesult(f, x) = (tape(f, x), DiffResults.GradientResult(x)) - -@require Memoization = "6fafb56a-5788-4b4e-91ca-c0cea6611c73" @eval begin - setrdcache(::Val{true}) = RDCache[] = true - function emptyrdcache() - Memoization.empty_cache!(memoized_taperesult) - return - end - - function gradient_logp( - backend::ReverseDiffAD{true}, - θ::AbstractVector{<:Real}, - vi::VarInfo, - model::Model, - sampler::AbstractSampler = SampleFromPrior(), - context::DynamicPPL.AbstractContext = DynamicPPL.DefaultContext() - ) - # Define log density function. - f = Turing.LogDensityFunction(vi, model, sampler, context) - - # Obtain both value and gradient of the log density function. 
- ctp, result = memoized_taperesult(f, θ) - ReverseDiff.gradient!(result, ctp, θ) - logp = DiffResults.value(result) - ∂logp∂θ = DiffResults.gradient(result) - - return logp, ∂logp∂θ - end - - # This makes sure we generate a single tape per Turing model and sampler - struct RDTapeKey{F, Tx} - f::F - x::Tx - end - function Memoization._get!(f, d::Dict, keys::Tuple{Tuple{RDTapeKey}, Any}) - key = keys[1][1] - return Memoization._get!(f, d, (key.f, typeof(key.x), size(key.x), Threads.threadid())) - end - memoized_taperesult(f, x) = memoized_taperesult(RDTapeKey(f, x)) - Memoization.@memoize Dict function memoized_taperesult(k::RDTapeKey) - return compiledtape(k.f, k.x), DiffResults.GradientResult(k.x) - end - compiledtape(f, x) = compile(GradientTape(f, x)) -end diff --git a/src/inference/Inference.jl b/src/inference/Inference.jl index 8dfe8306a..2024cff2c 100644 --- a/src/inference/Inference.jl +++ b/src/inference/Inference.jl @@ -28,8 +28,9 @@ import AdvancedHMC; const AHMC = AdvancedHMC import AdvancedMH; const AMH = AdvancedMH import AdvancedPS import BangBang -import ..Essential: getchunksize, getADbackend +import ..Essential: getADbackend import EllipticalSliceSampling +import LogDensityProblems import Random import MCMCChains import StatsBase: predict @@ -76,7 +77,6 @@ abstract type Hamiltonian{AD} <: InferenceAlgorithm end abstract type StaticHamiltonian{AD} <: Hamiltonian{AD} end abstract type AdaptiveHamiltonian{AD} <: Hamiltonian{AD} end -getchunksize(::Type{<:Hamiltonian{AD}}) where AD = getchunksize(AD) getADbackend(::Hamiltonian{AD}) where AD = AD() # Algorithm for sampling from the prior diff --git a/src/inference/hmc.jl b/src/inference/hmc.jl index 34274a32e..c9583754d 100644 --- a/src/inference/hmc.jl +++ b/src/inference/hmc.jl @@ -159,8 +159,11 @@ function DynamicPPL.initialstep( # Create a Hamiltonian. metricT = getmetricT(spl.alg) metric = metricT(length(theta)) - ∂logπ∂θ = gen_∂logπ∂θ(vi, spl, model) - logπ = Turing.LogDensityFunction(vi, model, spl, DynamicPPL.DefaultContext()) + ℓ = LogDensityProblems.ADgradient( + Turing.LogDensityFunction(vi, model, spl, DynamicPPL.DefaultContext()) + ) + logπ = Base.Fix1(LogDensityProblems.logdensity, ℓ) + ∂logπ∂θ(x) = LogDensityProblems.logdensity_and_gradient(ℓ, x) hamiltonian = AHMC.Hamiltonian(metric, logπ, ∂logπ∂θ) # Compute phase point z. @@ -262,8 +265,11 @@ end function get_hamiltonian(model, spl, vi, state, n) metric = gen_metric(n, spl, state) - ℓπ = Turing.LogDensityFunction(vi, model, spl, DynamicPPL.DefaultContext()) - ∂ℓπ∂θ = gen_∂logπ∂θ(vi, spl, model) + ℓ = LogDensityProblems.ADgradient( + Turing.LogDensityFunction(vi, model, spl, DynamicPPL.DefaultContext()) + ) + ℓπ = Base.Fix1(LogDensityProblems.logdensity, ℓ) + ∂ℓπ∂θ = Base.Fix1(LogDensityProblems.logdensity_and_gradient, ℓ) return AHMC.Hamiltonian(metric, ℓπ, ∂ℓπ∂θ) end @@ -422,19 +428,6 @@ end getstepsize(sampler::Sampler{<:Hamiltonian}, state) = sampler.alg.ϵ getstepsize(sampler::Sampler{<:AdaptiveHamiltonian}, state) = AHMC.getϵ(state.adaptor) -""" - gen_∂logπ∂θ(vi, spl::Sampler, model) - -Generate a function that takes a vector of reals `θ` and compute the logpdf and -gradient at `θ` for the model specified by `(vi, spl, model)`. 
-""" -function gen_∂logπ∂θ(vi, spl::Sampler, model) - function ∂logπ∂θ(x) - return gradient_logp(x, vi, model, spl) - end - return ∂logπ∂θ -end - gen_metric(dim::Int, spl::Sampler{<:Hamiltonian}, state) = AHMC.UnitEuclideanMetric(dim) function gen_metric(dim::Int, spl::Sampler{<:AdaptiveHamiltonian}, state) return AHMC.renew(state.hamiltonian.metric, AHMC.getM⁻¹(state.adaptor.pc)) diff --git a/src/modes/ModeEstimation.jl b/src/modes/ModeEstimation.jl index 7efb3be7e..184da87f1 100644 --- a/src/modes/ModeEstimation.jl +++ b/src/modes/ModeEstimation.jl @@ -10,6 +10,8 @@ using DynamicPPL: Model, AbstractContext, VarInfo, VarName, _getindex, getsym, getfield, setorder!, get_and_set_val!, istrans +import LogDensityProblems + export constrained_space, MAP, MLE, @@ -107,14 +109,9 @@ end function (f::OptimLogDensity)(F, G, z) if G !== nothing # Calculate negative log joint and its gradient. - sampler = f.sampler - neglogp, ∇neglogp = Turing.gradient_logp( - z, - DynamicPPL.VarInfo(f.varinfo, sampler, z), - f.model, - sampler, - f.context, - ) + # TODO: Make OptimLogDensity already an LogDensityProblems.ADgradient? Allow to specify AD? + ℓ = LogDensityProblems.ADgradient(f) + neglogp, ∇neglogp = LogDensityProblems.logdensity_and_gradient(ℓ, z) # Save the gradient to the pre-allocated array. copyto!(G, ∇neglogp) diff --git a/test/Project.toml b/test/Project.toml index 450ad8b04..dfaf97a50 100644 --- a/test/Project.toml +++ b/test/Project.toml @@ -12,8 +12,8 @@ DynamicPPL = "366bfd00-2699-11ea-058f-f148b4cae6d8" FiniteDifferences = "26cc04aa-876d-5657-8c51-4c34ba976000" ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" +LogDensityProblems = "6fdf6af0-433a-55f7-b3ed-c6c6e0b8df7c" MCMCChains = "c7f686f2-ff18-58e9-bc7b-31028e88f75d" -Memoization = "6fafb56a-5788-4b4e-91ca-c0cea6611c73" NamedArrays = "86f7a689-2022-50b4-a561-43c23ac3c673" Optim = "429524aa-4258-5aef-a3af-852621145aeb" Optimization = "7f7a1694-90dd-40f0-9382-eb1efda571ba" @@ -44,8 +44,8 @@ DynamicHMC = "2.1.6, 3.0" DynamicPPL = "0.20" FiniteDifferences = "0.10.8, 0.11, 0.12" ForwardDiff = "0.10.12" +LogDensityProblems = "0.12" MCMCChains = "5" -Memoization = "0.1.4" NamedArrays = "0.9.4" Optim = "0.22, 1.0" Optimization = "3.5" diff --git a/test/essential/ad.jl b/test/essential/ad.jl index c36006c94..1620b50e4 100644 --- a/test/essential/ad.jl +++ b/test/essential/ad.jl @@ -27,11 +27,19 @@ _x = [_m, _s] grad_FWAD = sort(g(_x)) + ℓ = Turing.LogDensityFunction(vi, ad_test_f, SampleFromPrior(), DynamicPPL.DefaultContext()) x = map(x->Float64(x), vi[SampleFromPrior()]) - ∇E1 = gradient_logp(TrackerAD(), x, vi, ad_test_f)[2] + + trackerℓ = LogDensityProblems.ADgradient(TrackerAD(), ℓ) + @test trackerℓ isa LogDensityProblems.TrackerGradientLogDensity + @test trackerℓ.ℓ === ℓ + ∇E1 = LogDensityProblems.logdensity_and_gradient(trackerℓ, x)[2] @test sort(∇E1) ≈ grad_FWAD atol=1e-9 - ∇E2 = gradient_logp(ZygoteAD(), x, vi, ad_test_f)[2] + zygoteℓ = LogDensityProblems.ADgradient(ZygoteAD(), ℓ) + @test zygoteℓ isa LogDensityProblems.ZygoteGradientLogDensity + @test zygoteℓ.ℓ === ℓ + ∇E2 = LogDensityProblems.logdensity_and_gradient(zygoteℓ, x)[2] @test sort(∇E2) ≈ grad_FWAD atol=1e-9 end @turing_testset "general AD tests" begin @@ -71,19 +79,13 @@ Turing.setadbackend(:tracker) sample(dir(), HMC(0.01, 1), 1000); Turing.setadbackend(:zygote) - sample(dir(), HMC(0.01, 1), 1000); + sample(dir(), HMC(0.01, 1), 1000) Turing.setadbackend(:reversediff) Turing.setrdcache(false) - sample(dir(), HMC(0.01, 
1), 1000); + sample(dir(), HMC(0.01, 1), 1000) Turing.setrdcache(true) - sample(dir(), HMC(0.01, 1), 1000); - caches = Memoization.find_caches(Turing.Essential.memoized_taperesult) - @test length(caches) == 1 - @test !isempty(first(values(caches))) - Turing.emptyrdcache() - caches = Memoization.find_caches(Turing.Essential.memoized_taperesult) - @test length(caches) == 1 - @test isempty(first(values(caches))) + sample(dir(), HMC(0.01, 1), 1000) + Turing.setrdcache(false) end # FIXME: For some reasons PDMatDistribution AD tests fail with ReverseDiff @testset "PDMatDistribution AD" begin @@ -163,7 +165,7 @@ @test mean(Array(chn[:sigma])) ≈ std(data) atol=0.5 end - Turing.emptyrdcache() + Turing.setrdcache(false) end @testset "chunksize" begin diff --git a/test/runtests.jl b/test/runtests.jl index 5e0c1f0e2..46e8f2647 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -7,7 +7,6 @@ using DistributionsAD using FiniteDifferences using ForwardDiff using MCMCChains -using Memoization using NamedArrays using Optim using Optimization @@ -38,10 +37,12 @@ using MCMCChains: Chains using StatsFuns: binomlogpdf, logistic, logsumexp using TimerOutputs: TimerOutputs, @timeit using Turing: BinomialLogit, ForwardDiffAD, Sampler, SampleFromPrior, NUTS, TrackerAD, - Variational, ZygoteAD, getspace, gradient_logp + Variational, ZygoteAD, getspace using Turing.Essential: TuringDenseMvNormal, TuringDiagMvNormal using Turing.Variational: TruncatedADAGrad, DecayedADAGrad, AdvancedVI +import LogDensityProblems + setprogress!(false) include(pkgdir(Turing)*"/test/test_utils/AllUtils.jl") diff --git a/test/skipped/unit_test_helper.jl b/test/skipped/unit_test_helper.jl index 0a5c52789..b66002ff6 100644 --- a/test/skipped/unit_test_helper.jl +++ b/test/skipped/unit_test_helper.jl @@ -8,9 +8,13 @@ function test_grad(turing_model, grad_f; trans=Dict()) end d = length(vi.vals) @testset "Gradient using random inputs" begin + ℓ = LogDensityProblems.ADgradient( + TrackerAD(), + Turing.LogDensityFunction(vi, model_f, SampleFromPrior(), DynamicPPL.DefaultContext()), + ) for _ = 1:10000 theta = rand(d) - @test Turing.gradient_logp(TrackerAD(), theta, vi, model_f) == grad_f(theta)[2] + @test LogDensityProblems.logdensity_and_gradient(ℓ, theta) == grad_f(theta)[2] end end end diff --git a/test/test_utils/ad_utils.jl b/test/test_utils/ad_utils.jl index 55fe375c4..7b323c749 100644 --- a/test/test_utils/ad_utils.jl +++ b/test/test_utils/ad_utils.jl @@ -90,10 +90,14 @@ function test_model_ad(model, f, syms::Vector{Symbol}) # Call ForwardDiff's AD directly. grad_FWAD = sort(ForwardDiff.gradient(f, x)) - # Compare with `gradient_logp`. + # Compare with `logdensity_and_gradient`. z = vi[SampleFromPrior()] for chunksize in (0, 1, 10), standardtag in (true, false, 0, 3) - l, ∇E = gradient_logp(ForwardDiffAD{chunksize, standardtag}(), z, vi, model) + ℓ = LogDensityProblems.ADgradient( + ForwardDiffAD{chunksize, standardtag}(), + Turing.LogDensityFunction(vi, model, SampleFromPrior(), DynamicPPL.DefaultContext()), + ) + l, ∇E = LogDensityProblems.logdensity_and_gradient(ℓ, z) # Compare result @test l ≈ logp
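
For reference, below is a minimal sketch of how the `LogDensityProblems` interface wired up in this patch can be exercised. It is illustrative only: the `demo` model and all variable names are made up for the example and are not part of the diff, and it assumes the package versions pinned above (LogDensityProblems 0.12, DynamicPPL 0.20).

```julia
# Minimal sketch (illustrative, not part of the patch) of the LogDensityProblems
# interface that this patch adds for `Turing.LogDensityFunction`.
using Turing
import DynamicPPL
import LogDensityProblems

# A toy model; `demo` and the data value are placeholders for the example.
@model function demo(x)
    m ~ Normal(0, 1)
    x ~ Normal(m, 1)
end

model = demo(1.5)
vi = DynamicPPL.VarInfo(model)
spl = DynamicPPL.SampleFromPrior()
ctx = DynamicPPL.DefaultContext()

# `Turing.LogDensityFunction` now implements the zeroth-order interface ...
ℓ = Turing.LogDensityFunction(vi, model, spl, ctx)
θ = vi[spl]
LogDensityProblems.logdensity(ℓ, θ)   # log joint evaluated at θ
LogDensityProblems.dimension(ℓ)       # number of parameters

# ... and `LogDensityProblems.ADgradient` wraps it with the sampler's AD backend
# (ForwardDiff by default), replacing the removed `gradient_logp`.
∇ℓ = LogDensityProblems.ADgradient(ℓ)
logp, grad = LogDensityProblems.logdensity_and_gradient(∇ℓ, θ)
```

The same kind of wrapped `∇ℓ` object is what the HMC, SGHMC/SGLD, and DynamicNUTS sampler states carry after this patch, in place of the per-step `gradient_logp` calls that are deleted here.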