
Add "low-rank" variational families #76

Merged: 46 commits, Sep 13, 2024
Changes from 15 commits

Commits (46)
03563ea
rename location scale source file
Red-Portal Aug 3, 2024
5ab7286
revert renaming of location_scale file
Red-Portal Aug 3, 2024
3e0bf3d
add location-low-rank-scale family (except `entropy` and `logpdf`)
Red-Portal Aug 3, 2024
0bd6e5c
add feature complete `MvLocationScaleLowRank` with tests
Red-Portal Aug 5, 2024
34546e1
fix remove misleading comment
Red-Portal Aug 5, 2024
e030f2d
fix add missing test files
Red-Portal Aug 5, 2024
c7f36d6
fix broadcasting error on Julia 1.6
Red-Portal Aug 5, 2024
1bb3e3e
fix bug in sampling from `LocationScaleLowRank`
Red-Portal Aug 7, 2024
ddd2122
fix missing squared bug in `LocationScaleLowRank`
Red-Portal Aug 7, 2024
b24737f
add documentation for low-rank families
Red-Portal Aug 9, 2024
1d56953
add convenience constructors for `LocationScaleLowRank`
Red-Portal Aug 9, 2024
6752c6b
Merge branch 'master' of github.com:TuringLang/AdvancedVI.jl into low…
Red-Portal Aug 10, 2024
52568b5
fix mhauru's suggestions and run formatter
Red-Portal Aug 10, 2024
96eae86
run formatter
Red-Portal Aug 10, 2024
15556da
run formatter
Red-Portal Aug 10, 2024
f796154
fix bugs and improve comments in `MvLocationScale` and lowrank
Red-Portal Aug 11, 2024
6b1699c
promote families.md into a higher category
Red-Portal Aug 11, 2024
5187d76
add test for `MVLocationScale` with non-Gaussian
Red-Portal Aug 14, 2024
8821908
Merge branch 'master' of github.com:TuringLang/AdvancedVI.jl into low…
Red-Portal Aug 27, 2024
6dfc919
tighten compat bound for `Distributions`
Red-Portal Aug 27, 2024
c3ce393
Merge branch 'master' of github.com:TuringLang/AdvancedVI.jl into low…
Red-Portal Sep 4, 2024
5c04d50
Merge branch 'master' of github.com:TuringLang/AdvancedVI.jl into low…
Red-Portal Sep 5, 2024
ba293e5
fix base distribution standardization bug in `LocationScale`
Red-Portal Sep 5, 2024
426d943
fix base distribution standardization bug in `LocationScaleLowRank`
Red-Portal Sep 5, 2024
3cc9e80
format weird indentation in test `for` loops
Red-Portal Sep 5, 2024
0481dda
update docs add example for `LocationScaleLowRank`
Red-Portal Sep 5, 2024
8449402
fix docs warn about divergence when using `MvLocationScaleLowRank`
Red-Portal Sep 6, 2024
ff14c4c
Merge branch 'master' of github.com:TuringLang/AdvancedVI.jl into low…
Red-Portal Sep 9, 2024
e48f231
Merge branch 'master' into lowrank
yebai Sep 10, 2024
aa8feee
Merge branch 'master' into lowrank
yebai Sep 10, 2024
5149869
Merge branch 'master' into lowrank
yebai Sep 10, 2024
e196da6
Update Benchmark.yml
yebai Sep 10, 2024
e4bff67
disable more features for PRs from forks
yebai Sep 10, 2024
894a849
fix `LocationScale` interfaces to only allow univariate base dist
Red-Portal Sep 11, 2024
f1cabba
Merge branch 'lowrank' of github.com:Red-Portal/AdvancedVI.jl into lo…
Red-Portal Sep 11, 2024
ce6793c
fix test comparison operator for families
Red-Portal Sep 11, 2024
71aeb5a
fix test comparison operator for families
Red-Portal Sep 11, 2024
77ace2b
fix test comparison operator for families
Red-Portal Sep 11, 2024
641de39
fix test comparison operator for families
Red-Portal Sep 11, 2024
a58f209
fix test comparison operator for families
Red-Portal Sep 11, 2024
846b259
fix test comparison operator for families
Red-Portal Sep 11, 2024
1116f68
fix test comparison operator for families
Red-Portal Sep 11, 2024
42d730d
fix formatting
Red-Portal Sep 11, 2024
99d08c5
fix formatting
Red-Portal Sep 11, 2024
4a90c5d
fix scale lower bound to `1e-4`
Red-Portal Sep 12, 2024
c41709b
fix docstring for `LowRankGaussian`
Red-Portal Sep 12, 2024
1 change: 1 addition & 0 deletions .github/workflows/Benchmark.yml
Original file line number Diff line number Diff line change
@@ -13,6 +13,7 @@ concurrency:
permissions:
contents: write
pull-requests: write
issues: write

jobs:
benchmark:
2 changes: 1 addition & 1 deletion Project.toml
@@ -42,7 +42,7 @@ Accessors = "0.1"
Bijectors = "0.13"
ChainRulesCore = "1.16"
DiffResults = "1"
Distributions = "0.25.87"
Distributions = "0.25.111"
DocStringExtensions = "0.8, 0.9"
Enzyme = "0.12.32"
FillArrays = "1.3"
1 change: 1 addition & 0 deletions docs/make.jl
@@ -18,6 +18,7 @@ makedocs(;
"Reparameterization Gradient Estimator" => "elbo/repgradelbo.md",
],
"Variational Families" => "families.md",
"Optimization" => "optimization.md",
],
)

128 changes: 125 additions & 3 deletions docs/src/families.md
@@ -3,7 +3,7 @@
The [RepGradELBO](@ref repgradelbo) objective assumes that the members of the variational family have a differentiable sampling path.
We provide multiple pre-packaged variational families that can be readily used.

## The `LocationScale` Family
## [The `LocationScale` Family](@id locscale)

The [location-scale](https://en.wikipedia.org/wiki/Location%E2%80%93scale_family) variational family is a family of probability distributions, where their sampling process can be represented as

@@ -38,6 +38,8 @@ where ``\mathbb{H}(\varphi)`` is the entropy of the base distribution.
Notice that ``\mathbb{H}(\varphi)`` does not depend on ``\log |C|``.
The derivative of the entropy with respect to ``\lambda`` is thus independent of the base distribution.
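As a quick numerical sanity check of this identity (an editorial NumPy sketch; the package itself is Julia), the entropy of a Gaussian location-scale member with standard normal base factors as ``d\,\mathbb{H}(\varphi) + \log |C|``:

```python
import numpy as np

rng = np.random.default_rng(0)
d = 5
# Lower-triangular scale with strictly positive diagonal
C = np.tril(rng.normal(size=(d, d)))
np.fill_diagonal(C, np.abs(np.diag(C)) + 0.1)

# Entropy of N(mu, C C^T) computed from the dense covariance ...
Sigma = C @ C.T
H_direct = 0.5 * d * np.log(2 * np.pi * np.e) + 0.5 * np.linalg.slogdet(Sigma)[1]

# ... and via H(q) = d * H(phi) + log|C|, with phi = N(0, 1)
H_phi = 0.5 * np.log(2 * np.pi * np.e)  # entropy of a standard normal
H_family = d * H_phi + np.sum(np.log(np.diag(C)))

assert np.isclose(H_direct, H_family)
```

Since ``\log|C|`` carries all of the ``\lambda``-dependence of the entropy here, the base-distribution term drops out of the gradient, as the text states.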

### API

!!! note

For stable convergence, the initial `scale` needs to be sufficiently large and well-conditioned.
@@ -128,14 +130,134 @@ and the entropy is given by the matrix determinant lemma as

where ``\mathbb{H}(\varphi)`` is the entropy of the base distribution.
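The matrix determinant lemma can be verified numerically. The following NumPy sketch (illustration only) compares the dense ``O\left(d^3\right)`` log-determinant of ``D^2 + U U^{\top}`` against the ``O\left(d r^2 + r^3\right)`` form:

```python
import numpy as np

rng = np.random.default_rng(1)
d, r = 30, 3
D = np.abs(rng.normal(size=d)) + 0.1   # diagonal scale entries
U = rng.normal(size=(d, r))            # low-rank scale factors

# Dense evaluation: O(d^3)
Sigma = np.diag(D**2) + U @ U.T
logdet_dense = np.linalg.slogdet(Sigma)[1]

# Matrix determinant lemma: logdet(D^2) + logdet(I_r + U^T D^{-2} U)
inner = np.eye(r) + (U.T / D**2) @ U
logdet_lemma = 2 * np.sum(np.log(D)) + np.linalg.slogdet(inner)[1]

assert np.isclose(logdet_dense, logdet_lemma)
```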

!!! note
```@setup lowrank
using ADTypes
using AdvancedVI
using Distributions
using LinearAlgebra
using LogDensityProblems
using Optimisers
using Plots
using ReverseDiff

struct Target{D}
dist::D
end

function LogDensityProblems.logdensity(model::Target, θ)
logpdf(model.dist, θ)
end

function LogDensityProblems.dimension(model::Target)
return length(model.dist)
end

function LogDensityProblems.capabilities(::Type{<:Target})
return LogDensityProblems.LogDensityOrder{0}()
end

n_dims = 30
U_true = randn(n_dims, 3)
D_true = Diagonal(log.(1 .+ exp.(randn(n_dims))))
Σ_true = D_true + U_true*U_true'
Σsqrt_true = sqrt(Σ_true)
μ_true = randn(n_dims)
model = Target(MvNormal(μ_true, Σ_true));

d = LogDensityProblems.dimension(model);
μ = zeros(d);

L = Diagonal(ones(d));
q0_mf = MeanFieldGaussian(μ, L)

L = LowerTriangular(diagm(ones(d)));
q0_fr = FullRankGaussian(μ, L)

D = ones(n_dims)
U = zeros(n_dims, 3)
q0_lr = LowRankGaussian(μ, D, U)

obj = RepGradELBO(1);

max_iter = 10^4

function callback(; params, averaged_params, restructure, stat, kwargs...)
q = restructure(averaged_params)
μ, Σ = mean(q), cov(q)
(dist2 = sum(abs2, μ - μ_true) + tr(Σ + Σ_true - 2*sqrt(Σsqrt_true*Σ*Σsqrt_true)),)
end

_, _, stats_fr, _ = AdvancedVI.optimize(
model,
obj,
q0_fr,
max_iter;
show_progress = false,
adtype = AutoReverseDiff(),
optimizer = Adam(0.01),
averager = PolynomialAveraging(),
callback = callback,
);

_, _, stats_mf, _ = AdvancedVI.optimize(
model,
obj,
q0_mf,
max_iter;
show_progress = false,
adtype = AutoReverseDiff(),
optimizer = Adam(0.01),
averager = PolynomialAveraging(),
callback = callback,
);

_, _, stats_lr, _ = AdvancedVI.optimize(
model,
obj,
q0_lr,
max_iter;
show_progress = false,
adtype = AutoReverseDiff(),
optimizer = Adam(0.01),
averager = PolynomialAveraging(),
callback = callback,
);

t = [stat.iteration for stat in stats_fr]
dist_fr = [sqrt(stat.dist2) for stat in stats_fr]
dist_mf = [sqrt(stat.dist2) for stat in stats_mf]
dist_lr = [sqrt(stat.dist2) for stat in stats_lr]
plot( t, dist_mf , label="Mean-Field Gaussian", xlabel="Iteration", ylabel="Wasserstein-2 Distance")
plot!(t, dist_fr, label="Full-Rank Gaussian", xlabel="Iteration", ylabel="Wasserstein-2 Distance")
plot!(t, dist_lr, label="Low-Rank Gaussian", xlabel="Iteration", ylabel="Wasserstein-2 Distance")
savefig("lowrank_family_wasserstein.svg")
nothing
```

Consider a 30-dimensional Gaussian with a diagonal plus low-rank covariance structure, where the true rank is 3.
Then, we can compare the convergence speed of `LowRankGaussian` versus `FullRankGaussian`:

![](lowrank_family_wasserstein.svg)

As the figure shows, `LowRankGaussian` converges faster than `FullRankGaussian`.
`FullRankGaussian` is more expressive and can also converge to the true posterior, but `LowRankGaussian`, whose parameterization matches the low-rank structure of the target, gets there faster.

!!! info
    `MvLocationScaleLowRank` tends to work better with the `Optimisers.Adam` optimizer due to non-smoothness.
    Other optimizers may experience divergence.

The `logpdf` of `LocationScaleLowRank` is unfortunately not computationally efficient by default: it has the same time complexity as `LocationScale` with a full-rank scale.

### API

```@docs
MvLocationScaleLowRank
```

The `logpdf` of `MvLocationScaleLowRank` has an optional keyword argument `non_differentiable::Bool` (default: `false`).
If set to `true`, a more efficient ``O\left(r d^2\right)`` implementation is used to evaluate the density.
This path, however, is not differentiable under most AD frameworks due to its use of the in-place Cholesky `lowrankupdate!`.
The default, `false`, uses an ``O\left(d^3\right)`` implementation that is differentiable and therefore compatible with the `StickingTheLandingEntropy` estimator.
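The two code paths are mathematically equivalent. The following NumPy sketch (an editorial illustration; the Julia implementation instead builds the Cholesky factor incrementally with `lowrankupdate!`) checks a Woodbury-plus-determinant-lemma evaluation of a Gaussian density with covariance ``D^2 + U U^{\top}`` against the dense one:

```python
import numpy as np

rng = np.random.default_rng(2)
d, r = 30, 3
mu = rng.normal(size=d)
D = np.abs(rng.normal(size=d)) + 0.1
U = rng.normal(size=(d, r))
z = rng.normal(size=d)
diff = z - mu

# Dense O(d^3) evaluation
Sigma = np.diag(D**2) + U @ U.T
logdet = np.linalg.slogdet(Sigma)[1]
quad = diff @ np.linalg.solve(Sigma, diff)
logpdf_dense = -0.5 * (d * np.log(2 * np.pi) + logdet + quad)

# O(r d^2)-style evaluation: Woodbury for the solve,
# matrix determinant lemma for the log-determinant
Dinv2 = 1.0 / D**2
inner = np.eye(r) + (U.T * Dinv2) @ U          # I_r + U^T D^{-2} U
logdet_fast = 2 * np.sum(np.log(D)) + np.linalg.slogdet(inner)[1]
w = Dinv2 * diff                               # D^{-2} (z - mu)
quad_fast = diff @ w - (w @ U) @ np.linalg.solve(inner, U.T @ w)
logpdf_fast = -0.5 * (d * np.log(2 * np.pi) + logdet_fast + quad_fast)

assert np.isclose(logpdf_dense, logpdf_fast)
```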

The following is a specialized constructor for convenience:

```@docs
48 changes: 47 additions & 1 deletion src/AdvancedVI.jl
@@ -184,7 +184,53 @@ export MvLocationScaleLowRank, LowRankGaussian

include("families/location_scale_low_rank.jl")

# Optimization Routine
# Optimization Rules

include("optimization/rules.jl")

export DoWG, DoG, COCOB

# Output averaging strategy

abstract type AbstractAverager end

"""
init(avg, params)

Initialize the state of the averaging strategy `avg` with the initial parameters `params`.

# Arguments
- `avg::AbstractAverager`: Averaging strategy.
- `params`: Initial variational parameters.
"""
init(::AbstractAverager, ::Any) = nothing

"""
apply(avg, avg_st, params)

Apply averaging strategy `avg` on `params` given the state `avg_st`.

# Arguments
- `avg::AbstractAverager`: Averaging strategy.
- `avg_st`: Previous state of the averaging strategy.
- `params`: Variational parameters to be averaged.
"""
function apply(::AbstractAverager, ::Any, ::Any) end

"""
value(avg, avg_st)

Compute the output of the averaging strategy `avg` from the state `avg_st`.

# Arguments
- `avg::AbstractAverager`: Averaging strategy.
- `avg_st`: Current state of the averaging strategy.
"""
function value(::AbstractAverager, ::Any) end
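To make the three-function contract concrete, here is a minimal Python sketch (illustrative only; the actual strategies live in `optimization/averaging.jl`), with a plain unweighted running mean standing in for a concrete averager:

```python
# A hypothetical stand-in averaging strategy implementing the same
# init/apply/value triple as the AbstractAverager interface sketch.

def init(avg, params):
    """Initialize the averaging state with the initial parameters."""
    return (list(params), 1)  # (current average, number of iterates seen)

def apply(avg, avg_st, params):
    """Fold the latest parameters into the average."""
    mean, t = avg_st
    new_mean = [m + (p - m) / (t + 1) for m, p in zip(mean, params)]
    return (new_mean, t + 1)

def value(avg, avg_st):
    """Read the averaged parameters out of the state."""
    mean, _ = avg_st
    return mean

avg = "running-mean"  # strategy tag; unused by this simple sketch
st = init(avg, [0.0, 2.0])
st = apply(avg, st, [2.0, 4.0])
st = apply(avg, st, [4.0, 6.0])
assert value(avg, st) == [2.0, 4.0]  # mean of the three iterates
```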

include("optimization/averaging.jl")

export NoAveraging, PolynomialAveraging

function optimize end

13 changes: 9 additions & 4 deletions src/families/location_scale.jl
@@ -45,7 +45,7 @@ Functors.@functor MvLocationScale (location, scale)
# is very inefficient.
# begin
struct RestructureMeanField{S<:Diagonal,D,L,E}
q::MvLocationScale{S,D,L,E}
model::MvLocationScale{S,D,L,E}
end

function (re::RestructureMeanField)(flat::AbstractVector)
@@ -113,16 +113,21 @@ function Distributions._rand!(
return x .+= location
end

Distributions.mean(q::MvLocationScale) = q.location
function Distributions.mean(q::MvLocationScale)
@unpack location, scale = q
return location + scale * Fill(mean(q.dist), length(location))
end

function Distributions.var(q::MvLocationScale)
C = q.scale
return Diagonal(C * C')
σ2 = var(q.dist)
return σ2 * diag(C * C')
end

function Distributions.cov(q::MvLocationScale)
C = q.scale
return Hermitian(C * C')
σ2 = var(q.dist)
return σ2 * Hermitian(C * C')
Comment on lines +129 to +130

**Reviewer (Member):** I don't know the theory here well, but is there a reason why this involves `var(q.dist)` rather than `cov(q.dist)`? I could have imagined it being something like `C * cov(q.dist) * C'`, though that's just a not-very-educated guess.

**Red-Portal (Member, Author):** Good point. I was thinking that `q.dist` was constrained to be a univariate distribution, which would make all of this valid, but it seems I have to use `ContinuousUnivariateDistribution` for that. Let me fix this later.

**Reviewer (Member):** Oh I see, yeah, this makes sense for univariate. Is there a reason you want to restrict `q.dist` to being univariate? Just less of a headache to implement?

**Red-Portal (Member, Author):** Yeah, it seemed to be the easiest way to force people to provide a standardized isotropic distribution. We're not quite forcing it to be standardized, but at least this guarantees it is isotropic.
end
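These moment identities hold whenever the base distribution is univariate, so that the base samples are i.i.d. across coordinates (the point raised in the review thread above). A Monte Carlo NumPy sketch (illustration only) with a non-standardized Uniform(-1, 2) base:

```python
import numpy as np

rng = np.random.default_rng(3)
d, n = 4, 200_000
mu = rng.normal(size=d)
C = np.tril(rng.normal(size=(d, d)))
np.fill_diagonal(C, np.abs(np.diag(C)) + 0.5)

# Univariate base distribution: Uniform(-1, 2); mean 1/2, variance 3/4
eps_mean, eps_var = 0.5, 0.75
eps = rng.uniform(-1.0, 2.0, size=(d, n))

z = mu[:, None] + C @ eps                      # sampling path z = mu + C*eps

# Closed-form moments matching the implementation above
mean_q = mu + C @ np.full(d, eps_mean)         # location + scale * E[eps] * 1
cov_q = eps_var * (C @ C.T)                    # Var[eps] * C C'

assert np.allclose(z.mean(axis=1), mean_q, atol=0.05)
assert np.allclose(np.cov(z), cov_q, atol=0.15)
```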

"""
59 changes: 37 additions & 22 deletions src/families/location_scale_low_rank.jl
@@ -20,9 +20,9 @@ represented as follows:
```julia
d = length(location)
r = size(scale_factors, 2)
u_d = rand(dist, d)
u_f = rand(dist, r)
z = scale_diag.*u_d + scale_factors*u_f + location
u_diag = rand(dist, d)
u_factors = rand(dist, r)
z = scale_diag.*u_diag + scale_factors*u_factors + location
```

`scale_eps` sets a constraint on the smallest value of `scale_diag` to be enforced during optimization.
@@ -60,21 +60,29 @@ function StatsBase.entropy(q::MvLocationScaleLowRank)
return n_dims * convert(eltype(location), entropy(dist)) + logdetΣ / 2
end

function Distributions.logpdf(q::MvLocationScaleLowRank, z::AbstractVector{<:Real})
function Distributions.logpdf(
q::MvLocationScaleLowRank, z::AbstractVector{<:Real}; non_differentiable::Bool=false
)
@unpack location, scale_diag, scale_factors, dist = q
#
## More efficient O(kd^2) but non-differentiable version:
#
# Σchol = Cholesky(LowerTriangular(diagm(sqrt.(scale_diag))))
# n_factors = size(scale_factors, 2)
# for k in 1:n_factors
# factor = scale_factors[:,k]
# lowrankupdate!(Σchol, factor)
# end

Σ = Diagonal(scale_diag .* scale_diag) + scale_factors * scale_factors'
Σchol = cholesky(Σ)
return sum(Base.Fix1(logpdf, dist), Σchol.L \ (z - location)) - logdet(Σchol.L)
μ_base = mean(dist)
n_dims = length(location)

scale2chol = if non_differentiable
# Fast O(kd^2) path (not supported by most current AD frameworks):
scale2chol = Cholesky(LowerTriangular(diagm(sqrt.(scale_diag))))
n_factors = size(scale_factors, 2)
for k in 1:n_factors
factor = scale_factors[:, k] # copy necessary due to in-place mutation
lowrankupdate!(scale2chol, factor)
end
scale2chol
else
# Slow but differentiable O(d^3) path
scale2 = Diagonal(scale_diag .* scale_diag) + scale_factors * scale_factors'
cholesky(scale2)
end
z_std = z - mean(q) + scale2chol.L * Fill(μ_base, n_dims)
return sum(Base.Fix1(logpdf, dist), scale2chol.L \ z_std) - logdet(scale2chol.L)
end

function Distributions.rand(q::MvLocationScaleLowRank)
@@ -111,18 +111,25 @@ function Distributions._rand!(
return x .+= location
end

Distributions.mean(q::MvLocationScaleLowRank) = q.location
function Distributions.mean(q::MvLocationScaleLowRank)
@unpack location, scale_diag, scale_factors = q
μ = mean(q.dist)
return location +
scale_diag .* Fill(μ, length(scale_diag)) +
scale_factors * Fill(μ, size(scale_factors, 2))
end

function Distributions.var(q::MvLocationScaleLowRank)
@unpack scale_diag, scale_factors = q
return Diagonal(
scale_diag .* scale_diag + sum(scale_factors .* scale_factors; dims=2)[:, 1]
)
σ2 = var(q.dist)
return σ2 *
(scale_diag .* scale_diag + sum(scale_factors .* scale_factors; dims=2)[:, 1])
end

function Distributions.cov(q::MvLocationScaleLowRank)
@unpack scale_diag, scale_factors = q
return Diagonal(scale_diag .* scale_diag) + scale_factors * scale_factors'
σ2 = var(q.dist)
return σ2 * (Diagonal(scale_diag .* scale_diag) + scale_factors * scale_factors')
end
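The low-rank moments can be checked directly against the sampling path `z = location + scale_diag .* u_diag + scale_factors * u_factors`. A Monte Carlo NumPy sketch (illustration only) with a standard normal base, for which `mean(q)` reduces to `location`:

```python
import numpy as np

rng = np.random.default_rng(4)
d, r, n = 5, 2, 200_000
mu = rng.normal(size=d)
D = np.abs(rng.normal(size=d)) + 0.5   # scale_diag
U = rng.normal(size=(d, r))            # scale_factors

# Sampling path: z = mu + D .* u_diag + U @ u_factors, u ~ N(0, 1)
u_diag = rng.normal(size=(d, n))
u_factors = rng.normal(size=(r, n))
z = mu[:, None] + D[:, None] * u_diag + U @ u_factors

# Closed form: Var[eps] * (D^2 + U U'), with Var[eps] = 1 here
cov_q = np.diag(D**2) + U @ U.T

assert np.allclose(z.mean(axis=1), mu, atol=0.05)
assert np.allclose(np.cov(z), cov_q, atol=0.2)
```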

function update_variational_params!(
2 changes: 1 addition & 1 deletion test/Project.toml
@@ -27,7 +27,7 @@ Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f"
ADTypes = "0.2.1, 1"
Bijectors = "0.13"
DiffResults = "1.0"
Distributions = "0.25.100"
Distributions = "0.25.111"
DistributionsAD = "0.6.45"
Enzyme = "0.12.32"
FillArrays = "1.6.1"