From 304e62b3c8ec515697485f6e22d646415c41e4f2 Mon Sep 17 00:00:00 2001
From: Oscar Dowson <odow@users.noreply.github.com>
Date: Tue, 3 Sep 2024 14:45:27 +1200
Subject: [PATCH] Add Hessian support to GrayBox (#100)

---
 ext/MathOptAIFluxExt.jl       | 28 ++++++++---
 ext/MathOptAIPythonCallExt.jl | 33 +++++++++----
 src/predictors/GrayBox.jl     | 49 +++++++++++++------
 test/test_Flux.jl             | 43 ++++++++++++++++
 test/test_PythonCall.jl       | 92 +++++++++++++++++++++++++++++++++++
 5 files changed, 213 insertions(+), 32 deletions(-)

diff --git a/ext/MathOptAIFluxExt.jl b/ext/MathOptAIFluxExt.jl
index 4c26831..da81836 100644
--- a/ext/MathOptAIFluxExt.jl
+++ b/ext/MathOptAIFluxExt.jl
@@ -71,9 +71,9 @@ function MathOptAI.add_predictor(
     x::Vector;
     config::Dict = Dict{Any,Any}(),
     reduced_space::Bool = false,
-    gray_box::Bool = false,
+    kwargs...,
 )
-    inner_predictor = MathOptAI.build_predictor(predictor; config, gray_box)
+    inner_predictor = MathOptAI.build_predictor(predictor; config, kwargs...)
     if reduced_space
         inner_predictor = MathOptAI.ReducedSpace(inner_predictor)
     end
@@ -85,6 +85,7 @@ end
         predictor::Flux.Chain;
         config::Dict = Dict{Any,Any}(),
         gray_box::Bool = false,
+        gray_box_hessian::Bool = false,
     )
 
 Convert a trained neural network from Flux.jl to a [`Pipeline`](@ref).
@@ -110,6 +111,8 @@ Convert a trained neural network from Flux.jl to a [`Pipeline`](@ref).
    `Flux.relu => MathOptAI.QuadraticReLU()`.
  * `gray_box`: if `true`, the neural network is added as a user-defined
    nonlinear operator, with gradients provided by `Flux.withjacobian`.
+ * `gray_box_hessian`: if `true`, the gray box additionally computes the Hessian
+   of the output using `Flux.hessian`.
 
 ## Example
 
@@ -141,12 +144,13 @@ function MathOptAI.build_predictor(
     predictor::Flux.Chain;
     config::Dict = Dict{Any,Any}(),
     gray_box::Bool = false,
+    gray_box_hessian::Bool = false,
 )
     if gray_box
         if !isempty(config)
             error("cannot specify the `config` kwarg if `gray_box = true`")
         end
-        return MathOptAI.GrayBox(predictor)
+        return MathOptAI.GrayBox(predictor; hessian = gray_box_hessian)
     end
     inner_predictor = MathOptAI.Pipeline(MathOptAI.AbstractPredictor[])
     for layer in predictor.layers
@@ -155,15 +159,23 @@ function MathOptAI.build_predictor(
     return inner_predictor
 end
 
-function MathOptAI.GrayBox(predictor::Flux.Chain)
+function MathOptAI.GrayBox(predictor::Flux.Chain; hessian::Bool = false)
     function output_size(x)
         return only(Flux.outputsize(predictor, (length(x),)))
     end
-    function with_jacobian(x)
-        ret = Flux.withjacobian(x -> predictor(Float32.(x)), collect(x))
-        return (value = ret.val, jacobian = only(ret.grad))
+    function callback(x)
+        x32 = collect(Float32.(x))
+        ret = Flux.withjacobian(predictor, x32)
+        if !hessian
+            return (value = ret.val, jacobian = only(ret.grad))
+        end
+        Hs = map(1:length(ret.val)) do i
+            return Flux.hessian(x -> predictor(x)[i], x32)
+        end
+        H = cat(Hs...; dims = 3)
+        return (value = ret.val, jacobian = only(ret.grad), hessian = H)
     end
-    return MathOptAI.GrayBox(output_size, with_jacobian)
+    return MathOptAI.GrayBox(output_size, callback; has_hessian = hessian)
 end
 
 function _add_predictor(::MathOptAI.Pipeline, layer::Any, ::Dict)
diff --git a/ext/MathOptAIPythonCallExt.jl b/ext/MathOptAIPythonCallExt.jl
index 5977e87..1f2b8e9 100644
--- a/ext/MathOptAIPythonCallExt.jl
+++ b/ext/MathOptAIPythonCallExt.jl
@@ -38,6 +38,8 @@ Add a trained neural network from Pytorch via PythonCall.jl to `model`.
    The supported Symbols are `:ReLU`, `:Sigmoid`, and `:Tanh`.
  * `gray_box`: if `true`, the neural network is added as a user-defined
    nonlinear operator, with gradients provided by `torch.func.jacrev`.
+ * `gray_box_hessian`: if `true`, the gray box additionally computes the Hessian
+   of the output using `torch.func.hessian`.
 """
 function MathOptAI.add_predictor(
     model::JuMP.AbstractModel,
@@ -45,9 +47,9 @@ function MathOptAI.add_predictor(
     x::Vector;
     config::Dict = Dict{Any,Any}(),
     reduced_space::Bool = false,
-    gray_box::Bool = false,
+    kwargs...,
 )
-    inner_predictor = MathOptAI.build_predictor(predictor; config, gray_box)
+    inner_predictor = MathOptAI.build_predictor(predictor; config, kwargs...)
     if reduced_space
         inner_predictor = MathOptAI.ReducedSpace(inner_predictor)
     end
@@ -59,6 +61,7 @@ end
         predictor::MathOptAI.PytorchModel;
         config::Dict = Dict{Any,Any}(),
         gray_box::Bool = false,
+        gray_box_hessian::Bool = false,
     )
 
 Convert a trained neural network from Pytorch via PythonCall.jl to a
@@ -80,17 +83,20 @@ Convert a trained neural network from Pytorch via PythonCall.jl to a
    The supported Symbols are `:ReLU`, `:Sigmoid`, and `:Tanh`.
  * `gray_box`: if `true`, the neural network is added as a user-defined
    nonlinear operator, with gradients provided by `torch.func.jacrev`.
+ * `gray_box_hessian`: if `true`, the gray box additionally computes the Hessian
+   of the output using `torch.func.hessian`.
 """
 function MathOptAI.build_predictor(
     predictor::MathOptAI.PytorchModel;
     config::Dict = Dict{Any,Any}(),
     gray_box::Bool = false,
+    gray_box_hessian::Bool = false,
 )
     if gray_box
         if !isempty(config)
             error("cannot specify the `config` kwarg if `gray_box = true`")
         end
-        return MathOptAI.GrayBox(predictor)
+        return MathOptAI.GrayBox(predictor; hessian = gray_box_hessian)
     end
     torch = PythonCall.pyimport("torch")
     nn = PythonCall.pyimport("torch.nn")
@@ -118,23 +124,30 @@ function _predictor(nn, layer, config)
     return error("unsupported layer: $layer")
 end
 
-function MathOptAI.GrayBox(predictor::MathOptAI.PytorchModel)
+function MathOptAI.GrayBox(
+    predictor::MathOptAI.PytorchModel;
+    hessian::Bool = false,
+)
     torch = PythonCall.pyimport("torch")
     torch_model = torch.load(predictor.filename)
     J = torch.func.jacrev(torch_model)
+    H = torch.func.hessian(torch_model)
     # TODO(odow): I'm not sure if there is a better way to get the output
     # dimension of a torch model object?
     output_size(::Any) = PythonCall.pyconvert(Int, torch_model[-1].out_features)
-    function with_jacobian(x)
+    function callback(x)
         py_x = torch.tensor(collect(x))
         py_value = torch_model(py_x).detach().numpy()
+        value = PythonCall.pyconvert(Vector, py_value)
         py_jacobian = J(py_x).detach().numpy()
-        return (;
-            value = PythonCall.pyconvert(Vector, py_value),
-            jacobian = PythonCall.pyconvert(Matrix, py_jacobian),
-        )
+        jacobian = PythonCall.pyconvert(Matrix, py_jacobian)
+        if !hessian
+            return (; value, jacobian)
+        end
+        hessians = PythonCall.pyconvert(Array, H(py_x).detach().numpy())
+        return (; value, jacobian, hessian = permutedims(hessians, (2, 3, 1)))
     end
-    return MathOptAI.GrayBox(output_size, with_jacobian)
+    return MathOptAI.GrayBox(output_size, callback; has_hessian = hessian)
 end
 
 end  # module
diff --git a/src/predictors/GrayBox.jl b/src/predictors/GrayBox.jl
index 8b631fb..f22b0b3 100644
--- a/src/predictors/GrayBox.jl
+++ b/src/predictors/GrayBox.jl
@@ -7,7 +7,8 @@
 """
     GrayBox(
         output_size::Function,
-        with_jacobian::Function,
+        callback::Function;
+        has_hessian::Bool = false,
     ) <: AbstractPredictor
 
 An [`AbstractPredictor`](@ref) that represents the function ``f(x)`` as a
@@ -17,10 +18,13 @@ user-defined nonlinear operator.
 
  * `output_size(x::Vector):Int`: given an input vector `x`, return the dimension
    of the output vector
- * `with_jacobian(x::Vector)::NamedTuple -> (;value, jacobian)`: given an input
-   vector `x`, return a `NamedTuple` that computes the primal value and Jacobian
-   of the output value with respect to the input. `jacobian[j, i]` is the
-   partial derivative of `value[j]` with respect to `x[i]`.
+ * `callback(x::Vector)::NamedTuple -> (;value, jacobian[, hessian])`: given an
+   input vector `x`, return a `NamedTuple` that computes the primal value and
+   Jacobian of the output value with respect to the input. `jacobian[j, i]` is
+   the partial derivative of `value[j]` with respect to `x[i]`.
+ * `has_hessian`: if `true`, the `callback` additionally contains a field
+   `hessian`, which is an `N × N × M` matrix, where `hessian[i, j, k]` is the
+   partial derivative of `value[k]` with respect to `x[i]` and `x[j]`.
 
 ## Example
 
@@ -55,7 +59,16 @@ julia> y = MathOptAI.add_predictor(model, MathOptAI.ReducedSpace(f), x)
 """
 struct GrayBox{F<:Function,G<:Function} <: AbstractPredictor
     output_size::F
-    with_jacobian::G
+    callback::G
+    has_hessian::Bool
+
+    function GrayBox(
+        output_size::F,
+        callback::G;
+        has_hessian::Bool = false,
+    ) where {F<:Function,G<:Function}
+        return new{F,G}(output_size, callback, has_hessian)
+    end
 end
 
 function add_predictor(model::JuMP.AbstractModel, predictor::GrayBox, x::Vector)
@@ -73,7 +86,7 @@ function add_predictor(
     last_x, cache = nothing, nothing
     function update(x)
         if x != last_x
-            cache = predictor.predictor.with_jacobian(x)
+            cache = predictor.predictor.callback(x)
             last_x = x
         end
         return
@@ -87,14 +100,22 @@ function add_predictor(
         g .= cache.jacobian[i, :]
         return
     end
+    function ∇²f(H::AbstractMatrix{Float64}, k::Int, x...)
+        update(x)
+        for j in 1:length(x), i in j:length(x)
+            H[i, j] = cache.hessian[i, j, k]
+        end
+        return
+    end
     return map(1:predictor.predictor.output_size(x)) do i
-        op_i = JuMP.add_nonlinear_operator(
-            model,
-            length(x),
-            (x...) -> f(i, x...),
-            (g, x...) -> ∇f(g, i, x...);
-            name = Symbol("op_$(gensym())"),
-        )
+        callbacks = if predictor.predictor.has_hessian
+            ∇²fi = (H, x...) -> ∇²f(H, i, x...)
+            ((x...) -> f(i, x...), (g, x...) -> ∇f(g, i, x...), ∇²fi)
+        else
+            ((x...) -> f(i, x...), (g, x...) -> ∇f(g, i, x...))
+        end
+        name = Symbol("op_$(gensym())")
+        op_i = JuMP.add_nonlinear_operator(model, length(x), callbacks...; name)
         return op_i(x...)
     end
 end
diff --git a/test/test_Flux.jl b/test/test_Flux.jl
index fb0605d..103fef1 100644
--- a/test/test_Flux.jl
+++ b/test/test_Flux.jl
@@ -230,6 +230,27 @@ function test_gray_box_scalar_output()
     return
 end
 
+function test_gray_box_scalar_output_hessian()
+    chain = Flux.Chain(Flux.Dense(2 => 16, Flux.relu), Flux.Dense(16 => 1))
+    model = Model(Ipopt.Optimizer)
+    set_silent(model)
+    set_attribute(model, "max_iter", 5)
+    @variable(model, 0 <= x[1:2] <= 1)
+    y = MathOptAI.add_predictor(
+        model,
+        chain,
+        x;
+        gray_box = true,
+        gray_box_hessian = true,
+        reduced_space = true,
+    )
+    @objective(model, Max, only(y))
+    optimize!(model)
+    @test termination_status(model) == ITERATION_LIMIT
+    @test isapprox(value.(y), chain(Float32.(value.(x))); atol = 1e-2)
+    return
+end
+
 function test_gray_box_vector_output()
     chain = Flux.Chain(Flux.Dense(3 => 16, Flux.relu), Flux.Dense(16 => 2))
     model = Model(Ipopt.Optimizer)
@@ -251,6 +272,28 @@ function test_gray_box_vector_output()
     return
 end
 
+function test_gray_box_vector_output_hessian()
+    chain = Flux.Chain(Flux.Dense(3 => 16, Flux.relu), Flux.Dense(16 => 2))
+    model = Model(Ipopt.Optimizer)
+    set_silent(model)
+    set_attribute(model, "max_iter", 5)
+    @variable(model, 0 <= x[1:3] <= 1)
+    y = MathOptAI.add_predictor(
+        model,
+        chain,
+        x;
+        gray_box = true,
+        gray_box_hessian = true,
+        reduced_space = true,
+    )
+    @test length(y) == 2
+    @objective(model, Max, sum(y))
+    optimize!(model)
+    @test termination_status(model) == ITERATION_LIMIT
+    @test isapprox(value.(y), chain(Float32.(value.(x))); atol = 1e-2)
+    return
+end
+
 end  # module
 
 TestFluxExt.runtests()
diff --git a/test/test_PythonCall.jl b/test/test_PythonCall.jl
index ecad1e8..f94abe6 100644
--- a/test/test_PythonCall.jl
+++ b/test/test_PythonCall.jl
@@ -190,6 +190,43 @@ function test_model_Tanh_scalar_GrayBox()
     return
 end
 
+function test_model_Tanh_scalar_GrayBox_hessian()
+    dir = mktempdir()
+    filename = joinpath(dir, "model_Tanh_GrayBox.pt")
+    PythonCall.pyexec(
+        """
+        import torch
+
+        model = torch.nn.Sequential(
+            torch.nn.Linear(2, 16),
+            torch.nn.Tanh(),
+            torch.nn.Linear(16, 1),
+        )
+
+        torch.save(model, filename)
+        """,
+        @__MODULE__,
+        (; filename = filename),
+    )
+    model = Model(Ipopt.Optimizer)
+    set_silent(model)
+    @variable(model, x[1:2])
+    ml_model = MathOptAI.PytorchModel(filename)
+    y = MathOptAI.add_predictor(
+        model,
+        ml_model,
+        x;
+        gray_box = true,
+        gray_box_hessian = true,
+    )
+    @test num_variables(model) == 3
+    @test num_constraints(model; count_variable_in_set_constraints = true) == 1
+    optimize!(model)
+    @test is_solved_and_feasible(model)
+    @test ≈(_evaluate_model(filename, value.(x)), value.(y); atol = 1e-5)
+    return
+end
+
 function test_model_Tanh_vector_GrayBox()
     dir = mktempdir()
     filename = joinpath(dir, "model_Tanh_vector_GrayBox.pt")
@@ -239,6 +276,61 @@ function test_model_Tanh_vector_GrayBox()
     return
 end
 
+function test_model_Tanh_vector_GrayBox_hessian()
+    dir = mktempdir()
+    filename = joinpath(dir, "model_Tanh_vector_GrayBox.pt")
+    PythonCall.pyexec(
+        """
+        import torch
+
+        model = torch.nn.Sequential(
+            torch.nn.Linear(3, 16),
+            torch.nn.Tanh(),
+            torch.nn.Linear(16, 2),
+        )
+
+        torch.save(model, filename)
+        """,
+        @__MODULE__,
+        (; filename = filename),
+    )
+    # Full-space
+    model = Model(Ipopt.Optimizer)
+    set_silent(model)
+    @variable(model, x[1:3])
+    ml_model = MathOptAI.PytorchModel(filename)
+    y = MathOptAI.add_predictor(
+        model,
+        ml_model,
+        x;
+        gray_box = true,
+        gray_box_hessian = true,
+    )
+    @test num_variables(model) == 5
+    @test num_constraints(model; count_variable_in_set_constraints = true) == 2
+    optimize!(model)
+    @test is_solved_and_feasible(model)
+    @test ≈(_evaluate_model(filename, value.(x)), value.(y); atol = 1e-5)
+    # Reduced-space
+    model = Model(Ipopt.Optimizer)
+    set_silent(model)
+    @variable(model, x[1:3])
+    ml_model = MathOptAI.PytorchModel(filename)
+    y = MathOptAI.add_predictor(
+        model,
+        ml_model,
+        x;
+        gray_box = true,
+        reduced_space = true,
+    )
+    @test num_variables(model) == 3
+    @test num_constraints(model; count_variable_in_set_constraints = true) == 0
+    optimize!(model)
+    @test is_solved_and_feasible(model)
+    @test ≈(_evaluate_model(filename, value.(x)), value.(y); atol = 1e-5)
+    return
+end
+
 end  # module
 
 TestPythonCallExt.runtests()