From af1a9f9e71a80b1050fa1f04f3e5c614ca8055e6 Mon Sep 17 00:00:00 2001
From: Michel Schanen
Date: Fri, 11 Nov 2022 11:15:17 -0600
Subject: [PATCH] Integrating all ExaAdmm GPU backends

---
 Project.toml                         |  9 ++--
 src/Evaluators/ProxALEvalutor.jl     | 27 ++++++----
 src/ExaAdmmBackend/ExaAdmmBackend.jl | 16 +++---
 src/ProxAL.jl                        |  1 -
 src/backends.jl                      |  3 +-
 src/blocks.jl                        |  4 +-
 src/params.jl                        |  2 +-
 test/blockmodel.jl                   | 80 +++++++++++++++++++++-------
 test/convergence.jl                  | 35 ++++++++++--
 test/runtests.jl                     | 28 +++++-----
 10 files changed, 139 insertions(+), 66 deletions(-)

diff --git a/Project.toml b/Project.toml
index 1738b21..e959abc 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,7 +1,7 @@
 name = "ProxAL"
 uuid = "12c3852d-bf95-4e7b-be60-68937c3c927b"
 authors = ["Anirudh Subramanyam ", "Youngdae Kim ", "Francois Pacaud ", "Michel Schanen "]
-version = "0.6.0"
+version = "0.7.0"
 
 [deps]
 AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e"
@@ -18,12 +18,11 @@ KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 MPI = "da04e1cc-30fd-572f-bb4f-1f8673147195"
 Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7"
-ROCKernels = "7eb9e9f0-4bd3-4c4c-8bef-26bd9629d9b9"
 SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
 TOML = "fa267f1f-6049-4f14-aa54-33bafae1ed76"
 
 [compat]
-AMDGPU = "=0.4.2"
+AMDGPU = "0.4"
 CUDA = "3.4"
 CatViews = "1"
 ExaAdmm = "0.3"
@@ -36,8 +35,10 @@ MPI = "0.19"
 julia = "1.8"
 
 [extras]
+CUDAKernels = "72cfdca4-0801-4ab0-bf6a-d52aa10adc57"
 LazyArtifacts = "4af54fe1-eca0-43a8-85a7-787d91b784e3"
+ROCKernels = "7eb9e9f0-4bd3-4c4c-8bef-26bd9629d9b9"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 
 [targets]
-test = ["Test", "LazyArtifacts"]
+test = ["CUDAKernels", "LazyArtifacts", "ROCKernels", "Test"]
diff --git a/src/Evaluators/ProxALEvalutor.jl b/src/Evaluators/ProxALEvalutor.jl
index 798b6d9..859f8a9 100644
--- a/src/Evaluators/ProxALEvalutor.jl
+++ b/src/Evaluators/ProxALEvalutor.jl
@@ -400,21 +400,28 @@ function optimize!(
         iteration()
 
         # Check convergence
-        minviol = max(
+        if max(
             runinfo.maxviol_t[end],
             runinfo.maxviol_c[end],
             runinfo.maxviol_t_actual[end],
             runinfo.maxviol_c_actual[end],
             runinfo.maxviol_d[end]
-        )
-        if minviol < runinfo.minviol
-            runinfo.minviol = minviol
-            algparams.tron_outer_eps = minviol
-            if runinfo.output
-                ProxAL.write(runinfo, nlp, "solution_$(modelinfo.case_name)_$(comm_ranks(comm)).h5")
-            end
-        end
-        if minviol <= algparams.tol
+        ) <= algparams.tol
+            # minviol = max(
+            #     runinfo.maxviol_t[end],
+            #     runinfo.maxviol_c[end],
+            #     runinfo.maxviol_t_actual[end],
+            #     runinfo.maxviol_c_actual[end],
+            #     runinfo.maxviol_d[end]
+            # )
+            # if minviol < runinfo.minviol
+            #     runinfo.minviol = minviol
+            #     algparams.tron_outer_eps = minviol
+            #     if runinfo.output
+            #         ProxAL.write(runinfo, nlp, "solution_$(modelinfo.case_name)_$(comm_ranks(comm)).h5")
+            #     end
+            # end
+            # if minviol <= algparams.tol
             break
         end
     end
diff --git a/src/ExaAdmmBackend/ExaAdmmBackend.jl b/src/ExaAdmmBackend/ExaAdmmBackend.jl
index 5f079a5..7b0d0cd 100644
--- a/src/ExaAdmmBackend/ExaAdmmBackend.jl
+++ b/src/ExaAdmmBackend/ExaAdmmBackend.jl
@@ -126,22 +126,18 @@ function ModelProxAL(
 end
 
 function ExaAdmm.AdmmEnv(opfdata, rho_va::Float64, rho_pq::Float64; use_gpu=false, ka_device=nothing, options...)
+    T = Float64
     if use_gpu
-        if isa(ka_device, ROCDevice)
-            T = Float64
-            VT = ROCVector{Float64}
-            VI = ROCVector{Int}
-            MT = ROCMatrix{Float64}
-        elseif isa(ka_device, CUDADevice)
+        if !isa(ka_device, Nothing)
+            VT = typeof(ExaAdmm.KAArray{Float64}(0, ka_device))
+            VI = typeof(ExaAdmm.KAArray{Int}(0, ka_device))
+            MT = typeof(ExaAdmm.KAArray{Float64}(0, 0, ka_device))
+        else
             VT = CuVector{Float64}
             VI = CuVector{Int}
             MT = CuMatrix{Float64}
-        else
-            error("Unknown device type $ka_device")
         end
     else
-        T = Float64
         VT = Vector{Float64}
         VI = Vector{Int}
         MT = Matrix{Float64}
diff --git a/src/ProxAL.jl b/src/ProxAL.jl
index d1f8d0f..38ef220 100644
--- a/src/ProxAL.jl
+++ b/src/ProxAL.jl
@@ -8,7 +8,6 @@ using Printf, CatViews
 using ExaPF
 using ExaAdmm
 using KernelAbstractions
-using ROCKernels
 using LinearAlgebra
 using SparseArrays
 using MPI
diff --git a/src/backends.jl b/src/backends.jl
index 100155c..4817e45 100644
--- a/src/backends.jl
+++ b/src/backends.jl
@@ -640,7 +640,7 @@ function init!(block::AdmmBlockBackend, algparams::AlgParams)
         if algparams.decompCtgs && k > 1
             if modelinfo.ctgs_link_constr_type == :corrective_penalty
                 copyto!(opfmodel.smin, zeros(length(gens)))
-                copyto!(opfmodel.smax, 1.0.*[g.scen_agc for g in gens])
+                copyto!(opfmodel.smax, 2.0.*[g.scen_agc for g in gens])
             else
                 @assert modelinfo.ctgs_link_constr_type == :preventive_penalty
                 copyto!(opfmodel.smin, zeros(length(gens)))
@@ -810,7 +810,6 @@ function optimize!(block::AdmmBlockBackend, x0::Union{Nothing, AbstractArray}, a
         set_start_values!(block, x0)
     end
     # Optimize with optimizer, using ExaPF model
-    block.env.params.outer_eps = algparams.tron_outer_eps*2e2
     ExaAdmm.admm_two_level(block.env, block.model, block.env.ka_device)
     # Recover solution in ProxAL format
     solution = get_solution(block)
diff --git a/src/blocks.jl b/src/blocks.jl
index 7b6c167..af25f53 100644
--- a/src/blocks.jl
+++ b/src/blocks.jl
@@ -31,9 +31,9 @@ total number of contingencies).
 
 ### Deporting the resolution on the GPU
 
-When the backend is set to `ExaBlockBackend` (and a CUDA GPU is available), the user
+When the backend is set to `ExaAdmmBackend` (and a CUDA GPU is available), the user
 could chose to deport the resolution of each subproblem directly on
-the GPU simply by setting `algparams.device=CUDADevice`. However, note that
+the GPU simply by setting `algparams.device=GPU`. However, note that
 we could not instantiate more subproblems on the GPU than the number of GPU
 available.
 
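For context, the docstring above now refers to the renamed `TargetDevice` values (see the `src/params.jl` hunk that follows): `CPU`, `GPU` for the native CUDA path (formerly `CUDADevice`), and `KADevice` for the KernelAbstractions path, which additionally requires `algparams.ka_device`. Below is a minimal sketch of driving that selection from user code; it is an illustration, not part of the patch, and it assumes a CUDA GPU and that `ProxAL.AlgParams()` and `ProxAL.AdmmBackend()` are constructed with their defaults, as in the tests later in this patch.

    # Illustration only, not part of the patch.
    using ProxAL, CUDA, CUDAKernels

    algparams = ProxAL.AlgParams()
    backend   = ProxAL.AdmmBackend()

    if CUDA.has_cuda_gpu()
        # Native CUDA backend (was ProxAL.CUDADevice before this patch)
        algparams.device    = ProxAL.GPU
        algparams.ka_device = nothing
        # KernelAbstractions backend instead; needs the KAArray constructors
        # defined in the test files below:
        # algparams.device    = ProxAL.KADevice
        # algparams.ka_device = CUDADevice()
    else
        algparams.device = ProxAL.CPU
    end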
diff --git a/src/params.jl b/src/params.jl
index 3707623..c15bbd9 100644
--- a/src/params.jl
+++ b/src/params.jl
@@ -9,7 +9,7 @@ const MOI_OPTIMAL_STATUSES = [
 @enum(TargetDevice,
     CPU,
-    CUDADevice,
+    GPU,
     KADevice,
 )
diff --git a/test/blockmodel.jl b/test/blockmodel.jl
index e934a3f..af86ad0 100644
--- a/test/blockmodel.jl
+++ b/test/blockmodel.jl
@@ -1,4 +1,6 @@
 using Test
+using CUDA
+using AMDGPU
 using MPI
 using Ipopt
 using ExaPF
@@ -20,6 +22,31 @@ rtol = 1e-4
 case_file = joinpath(DATA_DIR, "$(case).m")
 load_file = joinpath(DATA_DIR, "mp_demand", "$(case)_oneweek_168")
 
+solver_list = ["ExaAdmmCPU"]
+if CUDA.has_cuda_gpu()
+    using CUDAKernels
+    function ProxAL.ExaAdmm.KAArray{T}(n::Int, device::CUDADevice) where {T}
+        return CuArray{T}(undef, n)
+    end
+    function ProxAL.ExaAdmm.KAArray{T}(n1::Int, n2::Int, device::CUDADevice) where {T}
+        return CuArray{T}(undef, n1, n2)
+    end
+    gpu_device = CUDADevice()
+    push!(solver_list, "ExaAdmmGPUKA")
+elseif AMDGPU.has_rocm_gpu()
+    using ROCKernels
+    # Set for crusher login node to avoid other users
+    AMDGPU.default_device!(AMDGPU.devices()[2])
+    function ProxAL.ExaAdmm.KAArray{T}(n::Int, device::ROCDevice) where {T}
+        return ROCArray{T}(undef, n)
+    end
+    function ProxAL.ExaAdmm.KAArray{T}(n1::Int, n2::Int, device::ROCDevice) where {T}
+        return ROCArray{T}(undef, n1, n2)
+    end
+    gpu_device = ROCDevice()
+    push!(solver_list, "ExaAdmmGPUKA")
+end
+
 @testset "Block Model Backends" begin
 
 # ctgs_arr = deepcopy(rawdata.ctgs_arr)
@@ -82,24 +109,41 @@ load_file = joinpath(DATA_DIR, "mp_demand", "$(case)_oneweek_168")
     slack_jump = solution.st
 
     @testset "ExaAdmm BlockModel" begin
-        blockmodel = ProxAL.AdmmBlockBackend(
-            blkid, opfdata_c, nlp.rawdata, algparams, modelinfo_local, t, 1, T;
-        )
-        ProxAL.init!(blockmodel, nlp.algparams)
-        ProxAL.set_objective!(blockmodel, nlp.algparams, primal, dual)
-
-        # Test optimization
-        x0 = nothing
-        solution = ProxAL.optimize!(blockmodel, x0, nlp.algparams)
-        @test solution.status ∈ ProxAL.MOI_OPTIMAL_STATUSES
-        obj_tron = solution.minimum
-        pg_tron = solution.pg
-        slack_tron = solution.st
-        # TODO: implement ProxAL objective in ExaTron
-        @test obj_jump ≈ obj_tron rtol=1e-4
-        @test pg_jump ≈ pg_tron rtol=1e-3
-        if t > 1 # slack could be of any value for t == 1
-            @test slack_jump ≈ slack_tron rtol=1e-3
+        for solver in solver_list
+            @testset "$(solver)" begin
+                if solver == "ExaAdmmCPU"
+                    backend = AdmmBackend()
+                end
+                if solver == "ExaAdmmGPU"
+                    backend = AdmmBackend()
+                    algparams.device = ProxAL.GPU # Assuming CUDA
+                    algparams.ka_device = nothing
+                end
+                if solver == "ExaAdmmGPUKA"
+                    backend = AdmmBackend()
+                    algparams.device = ProxAL.KADevice
+                    algparams.ka_device = gpu_device
+                end
+                blockmodel = ProxAL.AdmmBlockBackend(
+                    blkid, opfdata_c, nlp.rawdata, algparams, modelinfo_local, t, 1, T;
+                )
+                ProxAL.init!(blockmodel, nlp.algparams)
+                ProxAL.set_objective!(blockmodel, nlp.algparams, primal, dual)
+
+                # Test optimization
+                x0 = nothing
+                solution = ProxAL.optimize!(blockmodel, x0, nlp.algparams)
+                @test solution.status ∈ ProxAL.MOI_OPTIMAL_STATUSES
+                obj_tron = solution.minimum
+                pg_tron = solution.pg
+                slack_tron = solution.st
+                # TODO: implement ProxAL objective in ExaTron
+                @test obj_jump ≈ obj_tron rtol=1e-4
+                @test pg_jump ≈ pg_tron rtol=1e-3
+                if t > 1 # slack could be of any value for t == 1
+                    @test slack_jump ≈ slack_tron rtol=1e-3
+                end
+            end
         end
     end
 end
diff --git a/test/convergence.jl b/test/convergence.jl
index 09e2038..c83fed0 100644
--- a/test/convergence.jl
+++ b/test/convergence.jl
@@ -4,6 +4,7 @@
 using DelimitedFiles, Printf
 using LinearAlgebra, JuMP
 using CatViews
 using CUDA
+using AMDGPU
 using MPI
 using LazyArtifacts
@@ -37,9 +38,28 @@ algparams.verbose = 0
 
 solver_list = ["Ipopt", "ExaAdmmCPU"]
 if CUDA.has_cuda_gpu()
-    # TODO: MadNLP broken currently
-    # push!(solver_list, "MadNLPGPU")
     push!(solver_list, "ExaAdmmGPU")
+    using CUDAKernels
+    function ProxAL.ExaAdmm.KAArray{T}(n::Int, device::CUDADevice) where {T}
+        return CuArray{T}(undef, n)
+    end
+    function ProxAL.ExaAdmm.KAArray{T}(n1::Int, n2::Int, device::CUDADevice) where {T}
+        return CuArray{T}(undef, n1, n2)
+    end
+    gpu_device = CUDADevice()
+    push!(solver_list, "ExaAdmmGPUKA")
+elseif AMDGPU.has_rocm_gpu()
+    using ROCKernels
+    # Set for crusher login node to avoid other users
+    AMDGPU.default_device!(AMDGPU.devices()[2])
+    function ProxAL.ExaAdmm.KAArray{T}(n::Int, device::ROCDevice) where {T}
+        return ROCArray{T}(undef, n)
+    end
+    function ProxAL.ExaAdmm.KAArray{T}(n1::Int, n2::Int, device::ROCDevice) where {T}
+        return ROCArray{T}(undef, n1, n2)
+    end
+    gpu_device = ROCDevice()
+    push!(solver_list, "ExaAdmmGPUKA")
 end
 if isfile(joinpath(dirname(@__FILE__), "..", "build/libhiop.so"))
     push!(solver_list, "Hiop")
@@ -52,7 +72,6 @@ end
 
 for solver in solver_list
     @testset "$(solver)" begin
-        println("Testing using $(solver)")
         if solver == "Ipopt"
             using Ipopt
             backend = JuMPBackend()
@@ -69,7 +88,15 @@ end
             backend = AdmmBackend()
             algparams.tron_outer_iterlim=2000
             algparams.tron_outer_eps=1e-6
-            algparams.device = ProxAL.CUDADevice
+            algparams.device = ProxAL.GPU # Assuming CUDA
+            algparams.ka_device = nothing
+        end
+        if solver == "ExaAdmmGPUKA"
+            backend = AdmmBackend()
+            algparams.tron_outer_iterlim=2000
+            algparams.tron_outer_eps=1e-6
+            algparams.device = ProxAL.KADevice
+            algparams.ka_device = gpu_device
         end
 
diff --git a/test/runtests.jl b/test/runtests.jl
index 0945e94..43b870e 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -7,21 +7,21 @@ testdir = @__DIR__
     @testset "Integration tests" begin
         include("blockmodel.jl")
     end
-    # @testset "ExaAdmm backend" begin
-    #     include("exaadmm.jl")
-    # end
+    @testset "ExaAdmm backend" begin
+        include("exaadmm.jl")
+    end
 
-    # # Testing using 1 process
-    # @testset "Sequential tests" begin
-    #     include("convergence.jl")
-    # end
+    # Testing using 1 process
+    @testset "Sequential tests" begin
+        include("convergence.jl")
+    end
 
-    # # Testing using 2 processes
+    # Testing using 2 processes
 
-    # @testset "Parallel tests" begin
-    #     mpiexec() do cmd
-    #         run(`$cmd -n 2 $(Base.julia_cmd()) --project=$testdir/.. $testdir/convergence.jl 1`)
-    #     end
-    #     @test true
-    # end
+    @testset "Parallel tests" begin
+        mpiexec() do cmd
+            run(`$cmd -n 2 $(Base.julia_cmd()) --project=$testdir/.. $testdir/convergence.jl 1`)
+        end
+        @test true
+    end
 end
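As a companion to the test changes above, the sketch below spells out how the `KAArray` constructor extensions feed the array-type selection now performed in `ExaAdmm.AdmmEnv` via `typeof(ExaAdmm.KAArray{...}(0, ka_device))`. The ROCm flavor is shown; the CUDA flavor is analogous with `CUDAKernels.CUDADevice` and `CuArray`. The `VT`/`MT` names only mirror the locals in `src/ExaAdmmBackend/ExaAdmmBackend.jl`; this is an illustration, not part of the patch.

    using AMDGPU, ROCKernels
    import ProxAL

    # Constructor extensions, as added in test/blockmodel.jl and test/convergence.jl.
    function ProxAL.ExaAdmm.KAArray{T}(n::Int, device::ROCDevice) where {T}
        return ROCArray{T}(undef, n)
    end
    function ProxAL.ExaAdmm.KAArray{T}(n1::Int, n2::Int, device::ROCDevice) where {T}
        return ROCArray{T}(undef, n1, n2)
    end

    ka_device = ROCDevice()
    # AdmmEnv derives its device array types from zero-length prototypes:
    VT = typeof(ProxAL.ExaAdmm.KAArray{Float64}(0, ka_device))     # device vector type
    MT = typeof(ProxAL.ExaAdmm.KAArray{Float64}(0, 0, ka_device))  # device matrix type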