From af1a9f9e71a80b1050fa1f04f3e5c614ca8055e6 Mon Sep 17 00:00:00 2001
From: Michel Schanen
Date: Fri, 11 Nov 2022 11:15:17 -0600
Subject: [PATCH] Integrating all ExaAdmm GPU backends

---
 Project.toml                         |  9 ++--
 src/Evaluators/ProxALEvalutor.jl     | 27 ++++++----
 src/ExaAdmmBackend/ExaAdmmBackend.jl | 16 +++---
 src/ProxAL.jl                        |  1 -
 src/backends.jl                      |  3 +-
 src/blocks.jl                        |  4 +-
 src/params.jl                        |  2 +-
 test/blockmodel.jl                   | 80 +++++++++++++++++++++-------
 test/convergence.jl                  | 35 ++++++++++--
 test/runtests.jl                     | 28 +++++-----
 10 files changed, 139 insertions(+), 66 deletions(-)

diff --git a/Project.toml b/Project.toml
index 1738b21..e959abc 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,7 +1,7 @@
 name = "ProxAL"
 uuid = "12c3852d-bf95-4e7b-be60-68937c3c927b"
 authors = ["Anirudh Subramanyam ", "Youngdae Kim ", "Francois Pacaud ", "Michel Schanen "]
-version = "0.6.0"
+version = "0.7.0"
 
 [deps]
 AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e"
@@ -18,12 +18,11 @@ KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 MPI = "da04e1cc-30fd-572f-bb4f-1f8673147195"
 Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7"
-ROCKernels = "7eb9e9f0-4bd3-4c4c-8bef-26bd9629d9b9"
 SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
 TOML = "fa267f1f-6049-4f14-aa54-33bafae1ed76"
 
 [compat]
-AMDGPU = "=0.4.2"
+AMDGPU = "0.4"
 CUDA = "3.4"
 CatViews = "1"
 ExaAdmm = "0.3"
@@ -36,8 +35,10 @@ MPI = "0.19"
 julia = "1.8"
 
 [extras]
+CUDAKernels = "72cfdca4-0801-4ab0-bf6a-d52aa10adc57"
 LazyArtifacts = "4af54fe1-eca0-43a8-85a7-787d91b784e3"
+ROCKernels = "7eb9e9f0-4bd3-4c4c-8bef-26bd9629d9b9"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 
 [targets]
-test = ["Test", "LazyArtifacts"]
+test = ["CUDAKernels", "LazyArtifacts", "ROCKernels", "Test"]
diff --git a/src/Evaluators/ProxALEvalutor.jl b/src/Evaluators/ProxALEvalutor.jl
index 798b6d9..859f8a9 100644
--- a/src/Evaluators/ProxALEvalutor.jl
+++ b/src/Evaluators/ProxALEvalutor.jl
@@ -400,21 +400,28 @@ function optimize!(
         iteration()
 
         # Check convergence
-        minviol = max(
+        if max(
             runinfo.maxviol_t[end],
             runinfo.maxviol_c[end],
             runinfo.maxviol_t_actual[end],
             runinfo.maxviol_c_actual[end],
             runinfo.maxviol_d[end]
-        )
-        if minviol < runinfo.minviol
-            runinfo.minviol = minviol
-            algparams.tron_outer_eps = minviol
-            if runinfo.output
-                ProxAL.write(runinfo, nlp, "solution_$(modelinfo.case_name)_$(comm_ranks(comm)).h5")
-            end
-        end
-        if minviol <= algparams.tol
+        ) <= algparams.tol
+            # minviol = max(
+            #     runinfo.maxviol_t[end],
+            #     runinfo.maxviol_c[end],
+            #     runinfo.maxviol_t_actual[end],
+            #     runinfo.maxviol_c_actual[end],
+            #     runinfo.maxviol_d[end]
+            # )
+            # if minviol < runinfo.minviol
+            #     runinfo.minviol = minviol
+            #     algparams.tron_outer_eps = minviol
+            #     if runinfo.output
+            #         ProxAL.write(runinfo, nlp, "solution_$(modelinfo.case_name)_$(comm_ranks(comm)).h5")
+            #     end
+            # end
+            # if minviol <= algparams.tol
             break
         end
     end
diff --git a/src/ExaAdmmBackend/ExaAdmmBackend.jl b/src/ExaAdmmBackend/ExaAdmmBackend.jl
index 5f079a5..7b0d0cd 100644
--- a/src/ExaAdmmBackend/ExaAdmmBackend.jl
+++ b/src/ExaAdmmBackend/ExaAdmmBackend.jl
@@ -126,22 +126,18 @@ function ModelProxAL(
 end
 
 function ExaAdmm.AdmmEnv(opfdata, rho_va::Float64, rho_pq::Float64; use_gpu=false, ka_device=nothing, options...)
+    T = Float64
     if use_gpu
-        if isa(ka_device, ROCDevice)
-            T = Float64
-            VT = ROCVector{Float64}
-            VI = ROCVector{Int}
-            MT = ROCMatrix{Float64}
-        elseif isa(ka_device, CUDADevice)
+        if !isa(ka_device, Nothing)
+            VT = typeof(ExaAdmm.KAArray{Float64}(0, ka_device))
+            VI = typeof(ExaAdmm.KAArray{Int}(0, ka_device))
+            MT = typeof(ExaAdmm.KAArray{Float64}(0, 0, ka_device))
+        else
             VT = CuVector{Float64}
             VI = CuVector{Int}
             MT = CuMatrix{Float64}
-        else
-            error("Unknown device type $ka_device")
         end
     else
-        T = Float64
         VT = Vector{Float64}
         VI = Vector{Int}
         MT = Matrix{Float64}
diff --git a/src/ProxAL.jl b/src/ProxAL.jl
index d1f8d0f..38ef220 100644
--- a/src/ProxAL.jl
+++ b/src/ProxAL.jl
@@ -8,7 +8,6 @@ using Printf, CatViews
 using ExaPF
 using ExaAdmm
 using KernelAbstractions
-using ROCKernels
 using LinearAlgebra
 using SparseArrays
 using MPI
diff --git a/src/backends.jl b/src/backends.jl
index 100155c..4817e45 100644
--- a/src/backends.jl
+++ b/src/backends.jl
@@ -640,7 +640,7 @@ function init!(block::AdmmBlockBackend, algparams::AlgParams)
         if algparams.decompCtgs && k > 1
             if modelinfo.ctgs_link_constr_type == :corrective_penalty
                 copyto!(opfmodel.smin, zeros(length(gens)))
-                copyto!(opfmodel.smax, 1.0.*[g.scen_agc for g in gens])
+                copyto!(opfmodel.smax, 2.0.*[g.scen_agc for g in gens])
             else
                 @assert modelinfo.ctgs_link_constr_type == :preventive_penalty
                 copyto!(opfmodel.smin, zeros(length(gens)))
@@ -810,7 +810,6 @@ function optimize!(block::AdmmBlockBackend, x0::Union{Nothing, AbstractArray}, a
         set_start_values!(block, x0)
     end
     # Optimize with optimizer, using ExaPF model
-    block.env.params.outer_eps = algparams.tron_outer_eps*2e2
     ExaAdmm.admm_two_level(block.env, block.model, block.env.ka_device)
     # Recover solution in ProxAL format
     solution = get_solution(block)
diff --git a/src/blocks.jl b/src/blocks.jl
index 7b6c167..af25f53 100644
--- a/src/blocks.jl
+++ b/src/blocks.jl
@@ -31,9 +31,9 @@ total number of contingencies).
 
 ### Deporting the resolution on the GPU
 
-When the backend is set to `ExaBlockBackend` (and a CUDA GPU is available), the user
+When the backend is set to `ExaAdmmBackend` (and a CUDA GPU is available), the user
 could chose to deport the resolution of each subproblem directly on
-the GPU simply by setting `algparams.device=CUDADevice`. However, note that
+the GPU simply by setting `algparams.device=GPU`. However, note that
 we could not instantiate more subproblems on the GPU than the number of GPU
 available.
 
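For context, the docstring above now refers to the renamed `TargetDevice` values (see the `src/params.jl` hunk that follows): `CPU`, `GPU` for the native CUDA path (formerly `CUDADevice`), and `KADevice` for the KernelAbstractions path, which additionally requires `algparams.ka_device`. Below is a minimal sketch of driving that selection from user code; it is an illustration, not part of the patch, and it assumes a CUDA GPU and that `ProxAL.AlgParams()` and `ProxAL.AdmmBackend()` are constructed with their defaults, as in the tests later in this patch.

    # Illustration only, not part of the patch.
    using ProxAL, CUDA, CUDAKernels

    algparams = ProxAL.AlgParams()
    backend   = ProxAL.AdmmBackend()

    if CUDA.has_cuda_gpu()
        # Native CUDA backend (was ProxAL.CUDADevice before this patch)
        algparams.device    = ProxAL.GPU
        algparams.ka_device = nothing
        # KernelAbstractions backend instead; needs the KAArray constructors
        # defined in the test files below:
        # algparams.device    = ProxAL.KADevice
        # algparams.ka_device = CUDADevice()
    else
        algparams.device = ProxAL.CPU
    end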
diff --git a/src/params.jl b/src/params.jl
index 3707623..c15bbd9 100644
--- a/src/params.jl
+++ b/src/params.jl
@@ -9,7 +9,7 @@ const MOI_OPTIMAL_STATUSES = [
 @enum(TargetDevice,
     CPU,
-    CUDADevice,
+    GPU,
     KADevice,
 )
diff --git a/test/blockmodel.jl b/test/blockmodel.jl
index e934a3f..af86ad0 100644
--- a/test/blockmodel.jl
+++ b/test/blockmodel.jl
@@ -1,4 +1,6 @@
 using Test
+using CUDA
+using AMDGPU
 using MPI
 using Ipopt
 using ExaPF
@@ -20,6 +22,31 @@ rtol = 1e-4
 case_file = joinpath(DATA_DIR, "$(case).m")
 load_file = joinpath(DATA_DIR, "mp_demand", "$(case)_oneweek_168")
 
+solver_list = ["ExaAdmmCPU"]
+if CUDA.has_cuda_gpu()
+    using CUDAKernels
+    function ProxAL.ExaAdmm.KAArray{T}(n::Int, device::CUDADevice) where {T}
+        return CuArray{T}(undef, n)
+    end
+    function ProxAL.ExaAdmm.KAArray{T}(n1::Int, n2::Int, device::CUDADevice) where {T}
+        return CuArray{T}(undef, n1, n2)
+    end
+    gpu_device = CUDADevice()
+    push!(solver_list, "ExaAdmmGPUKA")
+elseif AMDGPU.has_rocm_gpu()
+    using ROCKernels
+    # Set for crusher login node to avoid other users
+    AMDGPU.default_device!(AMDGPU.devices()[2])
+    function ProxAL.ExaAdmm.KAArray{T}(n::Int, device::ROCDevice) where {T}
+        return ROCArray{T}(undef, n)
+    end
+    function ProxAL.ExaAdmm.KAArray{T}(n1::Int, n2::Int, device::ROCDevice) where {T}
+        return ROCArray{T}(undef, n1, n2)
+    end
+    gpu_device = ROCDevice()
+    push!(solver_list, "ExaAdmmGPUKA")
+end
+
 @testset "Block Model Backends" begin
 
 # ctgs_arr = deepcopy(rawdata.ctgs_arr)
@@ -82,24 +109,41 @@ load_file = joinpath(DATA_DIR, "mp_demand", "$(case)_oneweek_168")
     slack_jump = solution.st
 
     @testset "ExaAdmm BlockModel" begin
-        blockmodel = ProxAL.AdmmBlockBackend(
-            blkid, opfdata_c, nlp.rawdata, algparams, modelinfo_local, t, 1, T;
-        )
-        ProxAL.init!(blockmodel, nlp.algparams)
-        ProxAL.set_objective!(blockmodel, nlp.algparams, primal, dual)
-
-        # Test optimization
-        x0 = nothing
-        solution = ProxAL.optimize!(blockmodel, x0, nlp.algparams)
-        @test solution.status ∈ ProxAL.MOI_OPTIMAL_STATUSES
-        obj_tron = solution.minimum
-        pg_tron = solution.pg
-        slack_tron = solution.st
-        # TODO: implement ProxAL objective in ExaTron
-        @test obj_jump ≈ obj_tron rtol=1e-4
-        @test pg_jump ≈ pg_tron rtol=1e-3
-        if t > 1 # slack could be of any value for t == 1
-            @test slack_jump ≈ slack_tron rtol=1e-3
+        for solver in solver_list
+            @testset "$(solver)" begin
+                if solver == "ExaAdmmCPU"
+                    backend = AdmmBackend()
+                end
+                if solver == "ExaAdmmGPU"
+                    backend = AdmmBackend()
+                    algparams.device = ProxAL.GPU # Assuming CUDA
+                    algparams.ka_device = nothing
+                end
+                if solver == "ExaAdmmGPUKA"
+                    backend = AdmmBackend()
+                    algparams.device = ProxAL.KADevice
+                    algparams.ka_device = gpu_device
+                end
+                blockmodel = ProxAL.AdmmBlockBackend(
+                    blkid, opfdata_c, nlp.rawdata, algparams, modelinfo_local, t, 1, T;
+                )
+                ProxAL.init!(blockmodel, nlp.algparams)
+                ProxAL.set_objective!(blockmodel, nlp.algparams, primal, dual)
+
+                # Test optimization
+                x0 = nothing
+                solution = ProxAL.optimize!(blockmodel, x0, nlp.algparams)
+                @test solution.status ∈ ProxAL.MOI_OPTIMAL_STATUSES
+                obj_tron = solution.minimum
+                pg_tron = solution.pg
+                slack_tron = solution.st
+                # TODO: implement ProxAL objective in ExaTron
+                @test obj_jump ≈ obj_tron rtol=1e-4
+                @test pg_jump ≈ pg_tron rtol=1e-3
+                if t > 1 # slack could be of any value for t == 1
+                    @test slack_jump ≈ slack_tron rtol=1e-3
+                end
+            end
         end
     end
 end
diff --git a/test/convergence.jl b/test/convergence.jl
index 09e2038..c83fed0 100644
--- a/test/convergence.jl
+++ b/test/convergence.jl
@@ -4,6 +4,7 @@
 using DelimitedFiles, Printf
 using LinearAlgebra, JuMP
 using CatViews
 using CUDA
+using AMDGPU
 using MPI
 using LazyArtifacts
@@ -37,9 +38,28 @@ algparams.verbose = 0
 
 solver_list = ["Ipopt", "ExaAdmmCPU"]
 if CUDA.has_cuda_gpu()
-    # TODO: MadNLP broken currently
-    # push!(solver_list, "MadNLPGPU")
     push!(solver_list, "ExaAdmmGPU")
+    using CUDAKernels
+    function ProxAL.ExaAdmm.KAArray{T}(n::Int, device::CUDADevice) where {T}
+        return CuArray{T}(undef, n)
+    end
+    function ProxAL.ExaAdmm.KAArray{T}(n1::Int, n2::Int, device::CUDADevice) where {T}
+        return CuArray{T}(undef, n1, n2)
+    end
+    gpu_device = CUDADevice()
+    push!(solver_list, "ExaAdmmGPUKA")
+elseif AMDGPU.has_rocm_gpu()
+    using ROCKernels
+    # Set for crusher login node to avoid other users
+    AMDGPU.default_device!(AMDGPU.devices()[2])
+    function ProxAL.ExaAdmm.KAArray{T}(n::Int, device::ROCDevice) where {T}
+        return ROCArray{T}(undef, n)
+    end
+    function ProxAL.ExaAdmm.KAArray{T}(n1::Int, n2::Int, device::ROCDevice) where {T}
+        return ROCArray{T}(undef, n1, n2)
+    end
+    gpu_device = ROCDevice()
+    push!(solver_list, "ExaAdmmGPUKA")
 end
 if isfile(joinpath(dirname(@__FILE__), "..", "build/libhiop.so"))
     push!(solver_list, "Hiop")
@@ -52,7 +72,6 @@ end
 
 for solver in solver_list
     @testset "$(solver)" begin
-        println("Testing using $(solver)")
         if solver == "Ipopt"
             using Ipopt
             backend = JuMPBackend()
@@ -69,7 +88,15 @@ end
             backend = AdmmBackend()
             algparams.tron_outer_iterlim=2000
             algparams.tron_outer_eps=1e-6
-            algparams.device = ProxAL.CUDADevice
+            algparams.device = ProxAL.GPU # Assuming CUDA
+            algparams.ka_device = nothing
+        end
+        if solver == "ExaAdmmGPUKA"
+            backend = AdmmBackend()
+            algparams.tron_outer_iterlim=2000
+            algparams.tron_outer_eps=1e-6
+            algparams.device = ProxAL.KADevice
+            algparams.ka_device = gpu_device
         end
 
diff --git a/test/runtests.jl b/test/runtests.jl
index 0945e94..43b870e 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -7,21 +7,21 @@ testdir = @__DIR__
     @testset "Integration tests" begin
         include("blockmodel.jl")
     end
-    # @testset "ExaAdmm backend" begin
-    #     include("exaadmm.jl")
-    # end
+    @testset "ExaAdmm backend" begin
+        include("exaadmm.jl")
+    end
 
-    # # Testing using 1 process
-    # @testset "Sequential tests" begin
-    #     include("convergence.jl")
-    # end
+    # Testing using 1 process
+    @testset "Sequential tests" begin
+        include("convergence.jl")
+    end
 
-    # # Testing using 2 processes
+    # Testing using 2 processes
 
-    # @testset "Parallel tests" begin
-    #     mpiexec() do cmd
-    #         run(`$cmd -n 2 $(Base.julia_cmd()) --project=$testdir/.. $testdir/convergence.jl 1`)
-    #     end
-    #     @test true
-    # end
+    @testset "Parallel tests" begin
+        mpiexec() do cmd
+            run(`$cmd -n 2 $(Base.julia_cmd()) --project=$testdir/.. $testdir/convergence.jl 1`)
+        end
+        @test true
+    end
 end
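As a companion to the test changes above, the sketch below spells out how the `KAArray` constructor extensions feed the array-type selection now performed in `ExaAdmm.AdmmEnv` via `typeof(ExaAdmm.KAArray{...}(0, ka_device))`. The ROCm flavor is shown; the CUDA flavor is analogous with `CUDAKernels.CUDADevice` and `CuArray`. The `VT`/`MT` names only mirror the locals in `src/ExaAdmmBackend/ExaAdmmBackend.jl`; this is an illustration, not part of the patch.

    using AMDGPU, ROCKernels
    import ProxAL

    # Constructor extensions, as added in test/blockmodel.jl and test/convergence.jl.
    function ProxAL.ExaAdmm.KAArray{T}(n::Int, device::ROCDevice) where {T}
        return ROCArray{T}(undef, n)
    end
    function ProxAL.ExaAdmm.KAArray{T}(n1::Int, n2::Int, device::ROCDevice) where {T}
        return ROCArray{T}(undef, n1, n2)
    end

    ka_device = ROCDevice()
    # AdmmEnv derives its device array types from zero-length prototypes:
    VT = typeof(ProxAL.ExaAdmm.KAArray{Float64}(0, ka_device))     # device vector type
    MT = typeof(ProxAL.ExaAdmm.KAArray{Float64}(0, 0, ka_device))  # device matrix type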