From 6fedbcaf4d0afcc3a233cdfcdb172bdcb16156a6 Mon Sep 17 00:00:00 2001
From: Yicheng Wu <yicheng.wu@ucalgary.ca>
Date: Sat, 25 Nov 2023 02:13:48 +0800
Subject: [PATCH 01/16] update to Lux's new interface

---
 Project.toml                    |  1 -
 docs/src/qa.md                  |  9 +++---
 docs/src/tutorials/helmholtz.md |  2 +-
 src/Sophon.jl                   |  2 +-
 src/pde/componentarrays.jl      |  2 +-
 src/pde/discretize.jl           |  2 +-
 src/pde/pinn_types.jl           | 54 ++++++++++++++-------------------
 src/pde/utils.jl                |  8 ++++-
 8 files changed, 38 insertions(+), 42 deletions(-)

diff --git a/Project.toml b/Project.toml
index 853f05ea..14e800bd 100644
--- a/Project.toml
+++ b/Project.toml
@@ -46,7 +46,6 @@ SophonTaylorDiffExt = "TaylorDiff"
 
 [compat]
 Adapt = "3"
-CUDA = "5"
 ChainRulesCore = "1"
 ComponentArrays = "0.15"
 Distributions = "0.25"
diff --git a/docs/src/qa.md b/docs/src/qa.md
index ca3bf579..c0207650 100644
--- a/docs/src/qa.md
+++ b/docs/src/qa.md
@@ -1,10 +1,11 @@
-## Q: How can I train the model using GPUs?
+## Q: How can I train the model using my GPU?
 
-A: To train the model on GPUs, invoke the gpu function on instances of PINN:
+A: To train the model on a single GPU, do the following:
 
 ```julia
-using Lux
-pinn = gpu(PINN(...))
+using Lux, LuxCUDA
+device = gpu_device()
+pinn = PINN(...) |> device
 ```
 ## Q: How can I monitor the loss for each loss function?
 
diff --git a/docs/src/tutorials/helmholtz.md b/docs/src/tutorials/helmholtz.md
index 130ad2b9..c4663d7a 100644
--- a/docs/src/tutorials/helmholtz.md
+++ b/docs/src/tutorials/helmholtz.md
@@ -38,7 +38,7 @@ bcs = [u(-1,y) ~ 0, u(1,y) ~ 0, u(x, -1) ~ 0, u(x, 1) ~ 0]
 Note that the boundary conditions are compatible with periocity, which allows us to apply [`BACON`](@ref).
 ```@example helmholtz
 chain = BACON(2, 1, 5, 2; hidden_dims = 32, num_layers=5)
-pinn = PINN(chain) # call `gpu` on it if you want to use gpu
+pinn = PINN(chain)
 sampler = QuasiRandomSampler(300, 100)  
 strategy = NonAdaptiveTraining()
 
diff --git a/src/Sophon.jl b/src/Sophon.jl
index 7f7d852a..8198eb5f 100644
--- a/src/Sophon.jl
+++ b/src/Sophon.jl
@@ -13,7 +13,7 @@ using ComponentArrays
 import SciMLBase
 import SciMLBase: parameterless_type, __solve, build_solution, NullParameters
 using StatsBase, QuasiMonteCarlo
-using Adapt, ChainRulesCore, CUDA, GPUArrays, GPUArraysCore
+using Adapt, ChainRulesCore, GPUArrays, GPUArraysCore
 import GPUArraysCore: AbstractGPUArray
 import QuasiMonteCarlo
 import Sobol
diff --git a/src/pde/componentarrays.jl b/src/pde/componentarrays.jl
index 7a95ba02..23c11001 100644
--- a/src/pde/componentarrays.jl
+++ b/src/pde/componentarrays.jl
@@ -11,5 +11,5 @@ const AbstractGPUComponentVecorMat{T, Ax} = Union{AbstractGPUComponentVector{T,
                                                   AbstractGPUComponentMatrix{T, Ax}}
 
 function _ComponentArray(nt::NamedTuple)
-    return isongpu(nt) ? adapt(CuArray, ComponentArray(cpu(nt))) : ComponentArray(nt)
+    return isongpu(nt) ? adapt(get_gpu_adaptor(), ComponentArray(cpu(nt))) : ComponentArray(nt)
 end
diff --git a/src/pde/discretize.jl b/src/pde/discretize.jl
index 585bb0f8..86a5ca2f 100644
--- a/src/pde/discretize.jl
+++ b/src/pde/discretize.jl
@@ -80,7 +80,7 @@ function discretize(pde_system, pinn::PINN, sampler::PINNSampler,
 
     datasets = map(Base.Fix1(broadcast, fdtype), datasets)
     datasets = init_params isa AbstractGPUComponentVector ?
-               map(Base.Fix1(adapt, CuArray), datasets) : datasets
+               map(Base.Fix1(adapt, get_gpu_adaptor()), datasets) : datasets
     pde_and_bcs_loss_function = build_loss_function(pde_system, pinn, strategy,
                                                     derivative, derivative_bc,
                                                     fdtype)
diff --git a/src/pde/pinn_types.jl b/src/pde/pinn_types.jl
index faed9b62..60fc3144 100644
--- a/src/pde/pinn_types.jl
+++ b/src/pde/pinn_types.jl
@@ -2,7 +2,8 @@
     PINN(chain, rng::AbstractRNG=Random.default_rng())
     PINN(rng::AbstractRNG=Random.default_rng(); kwargs...)
 
-A container for a neural network, its states and its initial parameters. Call `gpu` and `cpu` to move the neural network to the GPU and CPU respectively.
+A container for a neural network, its states and its initial parameters. Call `Lux.gpu_device()`
+and `Lux.cpu_device()` to move the neural network to the GPU and CPU respectively.
 The default element type of the parameters is `Float64`.
 
 ## Fields
@@ -120,40 +121,29 @@ end
 
 const NTofChainState{names} = NamedTuple{names, <:Tuple{Vararg{ChainState}}}
 
-function Lux.cpu(cs::ChainState)
-    Lux.@set! cs.state = cpu(cs.state)
-    return cs
-end
-
-function Lux.gpu(cs::ChainState)
-    Lux.@set! cs.state = adapt(CuArray, cs.state)
-    return cs
-end
-
-function Lux.cpu(cs::NamedTuple{names, <:Tuple{Vararg{ChainState}}}) where {names}
-    return map(cs) do c
-        return cpu(c)
+for (dev) in (:CPU, :CUDA, :AMDGPU, :Metal)
+    ldev = Symbol("Lux$(dev)Device")
+    ladaptor = Symbol("Lux$(dev)Adaptor")
+    @eval begin
+        function (device::$ldev)(cs::ChainState)
+            Lux.@set! cs.state = device(cs.state)
+            return c
+        end
+
+        function (device::$ldev)(cs::NTofChainState{names}) where {names}
+            return map(cs) do c
+                return device(c)
+            end
+        end
+
+        function (device::$ldev)(pinn::PINN)
+            Lux.@set! pinn.phi = device(pinn.phi)
+            Lux.@set! pinn.init_params = adapt($(ladaptor)(), pinn.init_params)
+            return pinn
+        end
     end
 end
 
-function Lux.gpu(cs::NamedTuple{names, <:Tuple{Vararg{ChainState}}}) where {names}
-    return map(cs) do c
-        return gpu(c)
-    end
-end
-
-function Lux.gpu(pinn::PINN)
-    Lux.@set! pinn.phi = gpu(pinn.phi)
-    Lux.@set! pinn.init_params = adapt(CuArray, pinn.init_params)
-    return pinn
-end
-
-function Lux.cpu(pinn::PINN)
-    Lux.@set! pinn.phi = cpu(pinn.phi)
-    Lux.@set! pinn.init_params = cpu(pinn.init_params)
-    return pinn
-end
-
 """
 using Sophon, ModelingToolkit, DomainSets
 using DomainSets: ×
diff --git a/src/pde/utils.jl b/src/pde/utils.jl
index b0a292c1..89658155 100644
--- a/src/pde/utils.jl
+++ b/src/pde/utils.jl
@@ -14,7 +14,13 @@ This function is only used for the first order derivative.
 """
 forwarddiff(phi, t, εs, order, θ) = ForwardDiff.gradient(sum ∘ Base.Fix2(phi, θ), t)
 
-@inline maybe_adapt(x::AbstractGPUArray, ε_) = ChainRulesCore.@ignore_derivatives convert(CuArray, ε_)
+for (dev) in (:CPU, :CUDA, :AMDGPU, :Metal)
+    ldev = Symbol("Lux$(dev)Device")
+    ladaptor = Symbol("Lux$(dev)Adaptor")
+    @inline get_adaptor(::$(ldev)) = $(ladaptor)()
+end
+@inline get_gpu_adaptor() = get_adaptor(gpu_device())
+@inline maybe_adapt(x::AbstractGPUArray, ε_) = ChainRulesCore.@ignore_derivatives adapt(get_gpu_adaptor(), ε_)
 @inline maybe_adapt(x, ε_) = ChainRulesCore.@ignore_derivatives ε_
 
 @inline function finitediff(phi, x, θ, ε_::AbstractVector{T}, h::T, ::Val{1}) where {T<:AbstractFloat}

From ab3c7425de0d703f16e975e0b474b5f5ef12fce3 Mon Sep 17 00:00:00 2001
From: Yicheng Wu <yicheng.wu@ucalgary.ca>
Date: Sat, 25 Nov 2023 02:16:20 +0800
Subject: [PATCH 02/16] Update Project.toml

---
 Project.toml | 1 -
 1 file changed, 1 deletion(-)

diff --git a/Project.toml b/Project.toml
index 14e800bd..d3095fc2 100644
--- a/Project.toml
+++ b/Project.toml
@@ -5,7 +5,6 @@ version = "0.4.4"
 
 [deps]
 Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
-CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
 ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
 ComponentArrays = "b0b7db55-cfe3-40fc-9ded-d10e2dbeff66"
 Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f"

From 13a8a3d497607003242e0c5405e8e36127a31b84 Mon Sep 17 00:00:00 2001
From: Yicheng Wu <yicheng.wu@ucalgary.ca>
Date: Sat, 25 Nov 2023 02:20:48 +0800
Subject: [PATCH 03/16] rm GPUArrays

---
 Project.toml     | 2 --
 src/Sophon.jl    | 2 +-
 src/pde/utils.jl | 2 +-
 3 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/Project.toml b/Project.toml
index d3095fc2..972b2e37 100644
--- a/Project.toml
+++ b/Project.toml
@@ -10,7 +10,6 @@ ComponentArrays = "b0b7db55-cfe3-40fc-9ded-d10e2dbeff66"
 Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f"
 DomainSets = "5b8099bc-c8ec-5219-889f-1d9e522a28bf"
 ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210"
-GPUArrays = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7"
 GPUArraysCore = "46192b85-c4d5-4398-a991-12ede77f4527"
 LRUCache = "8ac3fa9e-de4c-5943-b1dc-09c6b5f20637"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
@@ -50,7 +49,6 @@ ComponentArrays = "0.15"
 Distributions = "0.25"
 DomainSets = "0.5, 0.6"
 ForwardDiff = "0.10"
-GPUArrays = "9"
 GPUArraysCore = "0.1"
 LRUCache = "1"
 Lux = "0.5.6"
diff --git a/src/Sophon.jl b/src/Sophon.jl
index 8198eb5f..00442659 100644
--- a/src/Sophon.jl
+++ b/src/Sophon.jl
@@ -13,7 +13,7 @@ using ComponentArrays
 import SciMLBase
 import SciMLBase: parameterless_type, __solve, build_solution, NullParameters
 using StatsBase, QuasiMonteCarlo
-using Adapt, ChainRulesCore, GPUArrays, GPUArraysCore
+using Adapt, ChainRulesCore, GPUArraysCore
 import GPUArraysCore: AbstractGPUArray
 import QuasiMonteCarlo
 import Sobol
diff --git a/src/pde/utils.jl b/src/pde/utils.jl
index 89658155..441f6fbd 100644
--- a/src/pde/utils.jl
+++ b/src/pde/utils.jl
@@ -17,7 +17,7 @@ forwarddiff(phi, t, εs, order, θ) = ForwardDiff.gradient(sum ∘ Base.Fix2(phi
 for (dev) in (:CPU, :CUDA, :AMDGPU, :Metal)
     ldev = Symbol("Lux$(dev)Device")
     ladaptor = Symbol("Lux$(dev)Adaptor")
-    @inline get_adaptor(::$(ldev)) = $(ladaptor)()
+    @eval @inline get_adaptor(::$(ldev)) = $(ladaptor)()
 end
 @inline get_gpu_adaptor() = get_adaptor(gpu_device())
 @inline maybe_adapt(x::AbstractGPUArray, ε_) = ChainRulesCore.@ignore_derivatives adapt(get_gpu_adaptor(), ε_)

From 0f9d613e0bd4ef9a495a4df3133b91b87c8f7f1a Mon Sep 17 00:00:00 2001
From: Yicheng Wu <yicheng.wu@ucalgary.ca>
Date: Sat, 25 Nov 2023 02:56:00 +0800
Subject: [PATCH 04/16] Update pinn_types.jl

---
 src/pde/pinn_types.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/pde/pinn_types.jl b/src/pde/pinn_types.jl
index 60fc3144..9ff7578e 100644
--- a/src/pde/pinn_types.jl
+++ b/src/pde/pinn_types.jl
@@ -127,7 +127,7 @@ for (dev) in (:CPU, :CUDA, :AMDGPU, :Metal)
     @eval begin
         function (device::$ldev)(cs::ChainState)
             Lux.@set! cs.state = device(cs.state)
-            return c
+            return cs
         end
 
         function (device::$ldev)(cs::NTofChainState{names}) where {names}

From 62636e896370a38f80ada26f1f83f660530116c2 Mon Sep 17 00:00:00 2001
From: Yicheng Wu <yicheng.wu@ucalgary.ca>
Date: Sat, 25 Nov 2023 12:41:36 +0800
Subject: [PATCH 05/16] using LuxCUDA

---
 Project.toml                      |  8 +++++---
 docs/Project.toml                 |  1 +
 ext/SophonTaylorDiffExt.jl        |  7 -------
 ext/SophonTaylorDiffLuxCUDAExt.jl | 14 ++++++++++++++
 src/Sophon.jl                     |  8 +++-----
 5 files changed, 23 insertions(+), 15 deletions(-)
 create mode 100644 ext/SophonTaylorDiffLuxCUDAExt.jl

diff --git a/Project.toml b/Project.toml
index 972b2e37..68e4144e 100644
--- a/Project.toml
+++ b/Project.toml
@@ -21,6 +21,7 @@ NNlib = "872c559c-99b0-510c-b3b7-b6c96a88d5cd"
 Optimisers = "3bd65402-5787-11e9-1adc-39752487f4e2"
 Optimization = "7f7a1694-90dd-40f0-9382-eb1efda571ba"
 OptimizationOptimisers = "42dfb2eb-d2b4-4451-abcd-913932933ac1"
+PackageExtensionCompat = "65ce6f38-6b18-4e1d-a461-8949797d7930"
 Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7"
 ProgressBars = "49802e3a-d2f1-5c88-81d8-b72133a6f568"
 QuasiMonteCarlo = "8a4e6c94-4038-4cdc-81c3-7e6ffdb2a71b"
@@ -29,18 +30,19 @@ Requires = "ae029012-a4dd-5104-9daa-d747884805df"
 RuntimeGeneratedFunctions = "7e49a35a-f44a-4d26-94aa-eba1b4ca6b47"
 SciMLBase = "0bca4576-84f4-4d90-8ffe-ffa030f20462"
 Sobol = "ed01d8cd-4d21-5b2a-85b4-cc3bdc58bad4"
-StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
 StaticArraysCore = "1e83bf80-4336-4d27-bf5d-d5a4f845583c"
 StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
 Symbolics = "0c5d862f-8b57-4792-8d23-62f2024744c7"
 
 [weakdeps]
+LuxCUDA = "d0bbae9a-e099-4d5b-a835-1c6931763bda"
 Optimisers = "3bd65402-5787-11e9-1adc-39752487f4e2"
 TaylorDiff = "b36ab563-344f-407b-a36a-4f200bebf99c"
 
 [extensions]
 SophonOptimisersExt = "Optimisers"
 SophonTaylorDiffExt = "TaylorDiff"
+SophonTaylorDiffLuxExt = ["TaylorDiff", "LuxCUDA"]
 
 [compat]
 Adapt = "3"
@@ -52,6 +54,7 @@ ForwardDiff = "0.10"
 GPUArraysCore = "0.1"
 LRUCache = "1"
 Lux = "0.5.6"
+LuxCUDA = "0.3"
 MacroTools = "0.5"
 Memoize = "0.4"
 ModelingToolkit = "8"
@@ -59,14 +62,13 @@ NNlib = "0.9"
 Optimisers = "0.2"
 Optimization = "3"
 OptimizationOptimisers = "0.1"
+PackageExtensionCompat = "1"
 ProgressBars = "1.5"
 QuasiMonteCarlo = "0.2, 0.3"
 Requires = "1"
 RuntimeGeneratedFunctions = "0.5"
 SciMLBase = "2"
 Sobol = "1, 2"
-StaticArrays = "1.5"
-StaticArraysCore = "1"
 StatsBase = "0.33, 0.34"
 Symbolics = "4, 5"
 julia = "1.8"
diff --git a/docs/Project.toml b/docs/Project.toml
index 2d69b6e6..adb4ece9 100644
--- a/docs/Project.toml
+++ b/docs/Project.toml
@@ -6,6 +6,7 @@ DocThemeIndigo = "8bac0ac5-51bf-41f9-885e-2bf1ac2bec5f"
 Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4"
 DocumenterCitations = "daee34ce-89f3-4625-b898-19384cb65244"
 DomainSets = "5b8099bc-c8ec-5219-889f-1d9e522a28bf"
+GPUArraysCore = "46192b85-c4d5-4398-a991-12ede77f4527"
 Integrals = "de52edbc-65ea-441a-8357-d3a637375a31"
 IntervalSets = "8197267c-284f-5f27-9208-e0e47529a953"
 Lux = "b2108857-7c20-44ae-9111-449ecde12c47"
diff --git a/ext/SophonTaylorDiffExt.jl b/ext/SophonTaylorDiffExt.jl
index b15cbb00..916b591c 100644
--- a/ext/SophonTaylorDiffExt.jl
+++ b/ext/SophonTaylorDiffExt.jl
@@ -147,13 +147,6 @@ for l in 1:4
     end
 end
 
-# avoid NaN
-function Base.:*(A::Union{Sophon.CuMatrix{T}, LinearAlgebra.Transpose{T, Sophon.CuArray}},
-                 B::Sophon.CuMatrix{TaylorScalar{T, N}}) where {T, N}
-    C = similar(B, (size(A, 1), size(B, 2)))
-    fill!(C, zero(eltype(C)))
-    return LinearAlgebra.mul!(C, A, B)
-end
 
 function __init__()
     @static if VERSION >= v"1.9.0"
diff --git a/ext/SophonTaylorDiffLuxCUDAExt.jl b/ext/SophonTaylorDiffLuxCUDAExt.jl
new file mode 100644
index 00000000..50ee4b37
--- /dev/null
+++ b/ext/SophonTaylorDiffLuxCUDAExt.jl
@@ -0,0 +1,14 @@
+module SophonTaylorDiffLuxCUDAExt
+
+using TaylorDiff, LuxCUDA, Sophon
+
+
+# avoid NaN
+function Base.:*(A::Union{Sophon.CuMatrix{T}, LinearAlgebra.Transpose{T, Sophon.CuArray}},
+    B::Sophon.CuMatrix{TaylorScalar{T, N}}) where {T, N}
+    C = similar(B, (size(A, 1), size(B, 2)))
+    fill!(C, zero(eltype(C)))
+    return LinearAlgebra.mul!(C, A, B)
+end
+
+emd
diff --git a/src/Sophon.jl b/src/Sophon.jl
index 00442659..31a87233 100644
--- a/src/Sophon.jl
+++ b/src/Sophon.jl
@@ -26,7 +26,7 @@ using ForwardDiff
 using MacroTools
 using MacroTools: prewalk, postwalk
 using Requires
-using StaticArrays: SVector
+using StaticArraysCore: SVector
 
 RuntimeGeneratedFunctions.init(@__MODULE__)
 
@@ -46,11 +46,9 @@ include("pde/training_strategies.jl")
 include("pde/pinnsampler.jl")
 include("pde/discretize.jl")
 
+using PackageExtensionCompat
 function __init__()
-    @static if !isdefined(Base, :get_extension)
-        @require Optimisers="3bd65402-5787-11e9-1adc-39752487f4e2" begin include("../ext/SophonOptimisersExt.jl") end
-        @require TaylorDiff="b36ab563-344f-407b-a36a-4f200bebf99c" begin include("../ext/SophonTaylorDiffExt.jl") end
-    end
+    @require_extensions
 end
 
 export @showprogress

From cca3b11f51454e405fbaf9aa0b3878910cb18eb7 Mon Sep 17 00:00:00 2001
From: Yicheng Wu <yicheng.wu@ucalgary.ca>
Date: Sat, 25 Nov 2023 12:58:00 +0800
Subject: [PATCH 06/16] Update Project.toml

---
 Project.toml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/Project.toml b/Project.toml
index 68e4144e..93d63202 100644
--- a/Project.toml
+++ b/Project.toml
@@ -81,6 +81,7 @@ OptimizationOptimJL = "36348300-93cb-4f02-beb5-3c3902f8871e"
 TaylorDiff = "b36ab563-344f-407b-a36a-4f200bebf99c"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f"
+LuxCUDA = "d0bbae9a-e099-4d5b-a835-1c6931763bda"
 
 [targets]
 test = ["Test", "Zygote", "ModelingToolkit", "DomainSets", "OptimizationOptimJL", "TaylorDiff"]

From 61742a6eeefcaf4dca6788aebe5d9ae13b1a49a8 Mon Sep 17 00:00:00 2001
From: Yicheng Wu <yicheng.wu@ucalgary.ca>
Date: Sat, 25 Nov 2023 13:03:44 +0800
Subject: [PATCH 07/16] Update Project.toml

---
 Project.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Project.toml b/Project.toml
index 93d63202..8fc1a328 100644
--- a/Project.toml
+++ b/Project.toml
@@ -58,7 +58,7 @@ LuxCUDA = "0.3"
 MacroTools = "0.5"
 Memoize = "0.4"
 ModelingToolkit = "8"
-NNlib = "0.9"
+NNlib = "0.8, 0.9"
 Optimisers = "0.2"
 Optimization = "3"
 OptimizationOptimisers = "0.1"

From c3dd66260abc638ce3810201dcb7cff59b980557 Mon Sep 17 00:00:00 2001
From: Yicheng Wu <yicheng.wu@ucalgary.ca>
Date: Sat, 25 Nov 2023 13:53:20 +0800
Subject: [PATCH 08/16] fix finitediff on gpu

---
 docs/Project.toml                 | 1 +
 ext/SophonTaylorDiffLuxCUDAExt.jl | 6 ++----
 src/pde/utils.jl                  | 6 ++++--
 3 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/docs/Project.toml b/docs/Project.toml
index adb4ece9..f516c55d 100644
--- a/docs/Project.toml
+++ b/docs/Project.toml
@@ -19,5 +19,6 @@ OrdinaryDiffEq = "1dea7af3-3e70-54e6-95c3-0bf5283fa5ed"
 Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
 Sophon = "077df616-1c15-4d29-b519-7542a62df138"
+StaticArraysCore = "1e83bf80-4336-4d27-bf5d-d5a4f845583c"
 StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
 Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f"
diff --git a/ext/SophonTaylorDiffLuxCUDAExt.jl b/ext/SophonTaylorDiffLuxCUDAExt.jl
index 50ee4b37..d4dc2f85 100644
--- a/ext/SophonTaylorDiffLuxCUDAExt.jl
+++ b/ext/SophonTaylorDiffLuxCUDAExt.jl
@@ -2,13 +2,11 @@ module SophonTaylorDiffLuxCUDAExt
 
 using TaylorDiff, LuxCUDA, Sophon
 
-
-# avoid NaN
 function Base.:*(A::Union{Sophon.CuMatrix{T}, LinearAlgebra.Transpose{T, Sophon.CuArray}},
-    B::Sophon.CuMatrix{TaylorScalar{T, N}}) where {T, N}
+                 B::Sophon.CuMatrix{TaylorScalar{T, N}}) where {T, N}
     C = similar(B, (size(A, 1), size(B, 2)))
     fill!(C, zero(eltype(C)))
     return LinearAlgebra.mul!(C, A, B)
 end
 
-emd
+end
diff --git a/src/pde/utils.jl b/src/pde/utils.jl
index 441f6fbd..13695811 100644
--- a/src/pde/utils.jl
+++ b/src/pde/utils.jl
@@ -20,8 +20,10 @@ for (dev) in (:CPU, :CUDA, :AMDGPU, :Metal)
     @eval @inline get_adaptor(::$(ldev)) = $(ladaptor)()
 end
 @inline get_gpu_adaptor() = get_adaptor(gpu_device())
-@inline maybe_adapt(x::AbstractGPUArray, ε_) = ChainRulesCore.@ignore_derivatives adapt(get_gpu_adaptor(), ε_)
-@inline maybe_adapt(x, ε_) = ChainRulesCore.@ignore_derivatives ε_
+
+@memoize maybe_adapt(x::AbstractGPUArray, ε) = convert(typeof(x), ε)
+@memoize maybe_adapt(x, ε) = ε
+ChainRulesCore.@non_differentiable maybe_adapt(x, ε)
 
 @inline function finitediff(phi, x, θ, ε_::AbstractVector{T}, h::T, ::Val{1}) where {T<:AbstractFloat}
     ε = maybe_adapt(x, ε_)

From ff9b95ca9aa949a664434bf1622954e68d07758c Mon Sep 17 00:00:00 2001
From: Yicheng Wu <yicheng.wu@ucalgary.ca>
Date: Sat, 25 Nov 2023 14:00:44 +0800
Subject: [PATCH 09/16] Update utils.jl

---
 src/pde/utils.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/pde/utils.jl b/src/pde/utils.jl
index 13695811..b6463c2f 100644
--- a/src/pde/utils.jl
+++ b/src/pde/utils.jl
@@ -21,7 +21,7 @@ for (dev) in (:CPU, :CUDA, :AMDGPU, :Metal)
 end
 @inline get_gpu_adaptor() = get_adaptor(gpu_device())
 
-@memoize maybe_adapt(x::AbstractGPUArray, ε) = convert(typeof(x), ε)
+@memoize maybe_adapt(x::AbstractGPUArray, ε) = convert(parameterless_type(x), ε)
 @memoize maybe_adapt(x, ε) = ε
 ChainRulesCore.@non_differentiable maybe_adapt(x, ε)
 

From 6b8a28cbd292eb1d3290b1bd2050308e866cc094 Mon Sep 17 00:00:00 2001
From: Yicheng Wu <yicheng.wu@ucalgary.ca>
Date: Sat, 25 Nov 2023 14:55:33 +0800
Subject: [PATCH 10/16] gpu_device() on prob

---
 Project.toml               |  3 ++-
 docs/src/qa.md             |  3 +--
 ext/SophonLuxCUDAExt.jl    | 12 ++++++++++++
 src/Sophon.jl              |  1 -
 src/pde/componentarrays.jl | 15 ---------------
 src/pde/discretize.jl      |  4 +---
 src/pde/pinn_types.jl      |  3 +--
 src/pde/utils.jl           | 25 +++++++------------------
 test/runtests.jl           | 21 ---------------------
 9 files changed, 24 insertions(+), 63 deletions(-)
 create mode 100644 ext/SophonLuxCUDAExt.jl
 delete mode 100644 src/pde/componentarrays.jl

diff --git a/Project.toml b/Project.toml
index 8fc1a328..f16e9ad0 100644
--- a/Project.toml
+++ b/Project.toml
@@ -42,7 +42,8 @@ TaylorDiff = "b36ab563-344f-407b-a36a-4f200bebf99c"
 [extensions]
 SophonOptimisersExt = "Optimisers"
 SophonTaylorDiffExt = "TaylorDiff"
-SophonTaylorDiffLuxExt = ["TaylorDiff", "LuxCUDA"]
+SophonTaylorDiffLuxCUDAExt = ["TaylorDiff", "LuxCUDA"]
+SophonLuxCUDAExt = "LuxCUDA"
 
 [compat]
 Adapt = "3"
diff --git a/docs/src/qa.md b/docs/src/qa.md
index c0207650..10fc5660 100644
--- a/docs/src/qa.md
+++ b/docs/src/qa.md
@@ -4,8 +4,7 @@ A: To train the model on a single GPU, do the following:
 
 ```julia
 using Lux, LuxCUDA
-device = gpu_device()
-pinn = PINN(...) |> device
+prob = Sophon.discretize(...) |> gpu_device()
 ```
 ## Q: How can I monitor the loss for each loss function?
 
diff --git a/ext/SophonLuxCUDAExt.jl b/ext/SophonLuxCUDAExt.jl
new file mode 100644
index 00000000..b309ef28
--- /dev/null
+++ b/ext/SophonLuxCUDAExt.jl
@@ -0,0 +1,12 @@
+module SophonLuxCUDAExt
+
+using Lux, LuxCUDA, Sophon, ModelingToolkit
+
+function (::LuxCUDADevice)(prob::Union{ModelingToolkit.PDESystem, Sophon.PDESystem})
+    u0 = adapt(CuArray, prob.u0)
+    p = [adapt(CuArray, prob.p[i]) for i in 1:length(prob.p)]
+    prob = remake(prob, u0=u0, p=p)
+    return prob
+end
+
+end
diff --git a/src/Sophon.jl b/src/Sophon.jl
index 31a87233..cde41cae 100644
--- a/src/Sophon.jl
+++ b/src/Sophon.jl
@@ -38,7 +38,6 @@ include("layers/nets.jl")
 include("layers/utils.jl")
 include("layers/operators.jl")
 
-include("pde/componentarrays.jl")
 include("pde/pinn_types.jl")
 include("pde/utils.jl")
 include("pde/sym_utils.jl")
diff --git a/src/pde/componentarrays.jl b/src/pde/componentarrays.jl
deleted file mode 100644
index 23c11001..00000000
--- a/src/pde/componentarrays.jl
+++ /dev/null
@@ -1,15 +0,0 @@
-const AbstractGPUComponentArray{T, N, Ax} = ComponentArray{T, N,
-                                                           <:GPUArraysCore.AbstractGPUVector,
-                                                           Ax}
-const AbstractGPUComponentVector{T, Ax} = ComponentArray{T, 1,
-                                                         <:GPUArraysCore.AbstractGPUVector,
-                                                         Ax}
-const AbstractGPUComponentMatrix{T, Ax} = ComponentArray{T, 2,
-                                                         <:GPUArraysCore.AbstractGPUMatrix,
-                                                         Ax}
-const AbstractGPUComponentVecorMat{T, Ax} = Union{AbstractGPUComponentVector{T, Ax},
-                                                  AbstractGPUComponentMatrix{T, Ax}}
-
-function _ComponentArray(nt::NamedTuple)
-    return isongpu(nt) ? adapt(get_gpu_adaptor(), ComponentArray(cpu(nt))) : ComponentArray(nt)
-end
diff --git a/src/pde/discretize.jl b/src/pde/discretize.jl
index 86a5ca2f..51dc4a3d 100644
--- a/src/pde/discretize.jl
+++ b/src/pde/discretize.jl
@@ -76,11 +76,9 @@ function discretize(pde_system, pinn::PINN, sampler::PINNSampler,
                     adtype=Optimization.AutoZygote())
     datasets = sample(pde_system, sampler)
     init_params = Lux.fmap(Base.Fix1(broadcast, fdtype), pinn.init_params)
-    init_params = _ComponentArray(init_params)
+    init_params = ComponentArray(init_params)
 
     datasets = map(Base.Fix1(broadcast, fdtype), datasets)
-    datasets = init_params isa AbstractGPUComponentVector ?
-               map(Base.Fix1(adapt, get_gpu_adaptor()), datasets) : datasets
     pde_and_bcs_loss_function = build_loss_function(pde_system, pinn, strategy,
                                                     derivative, derivative_bc,
                                                     fdtype)
diff --git a/src/pde/pinn_types.jl b/src/pde/pinn_types.jl
index 9ff7578e..09d707ee 100644
--- a/src/pde/pinn_types.jl
+++ b/src/pde/pinn_types.jl
@@ -2,8 +2,7 @@
     PINN(chain, rng::AbstractRNG=Random.default_rng())
     PINN(rng::AbstractRNG=Random.default_rng(); kwargs...)
 
-A container for a neural network, its states and its initial parameters. Call `Lux.gpu_device()`
-and `Lux.cpu_device()` to move the neural network to the GPU and CPU respectively.
+A container for a neural network, its states and its initial parameters.
 The default element type of the parameters is `Float64`.
 
 ## Fields
diff --git a/src/pde/utils.jl b/src/pde/utils.jl
index b6463c2f..24550f2f 100644
--- a/src/pde/utils.jl
+++ b/src/pde/utils.jl
@@ -1,7 +1,3 @@
-function isongpu(nt::NamedTuple)
-    return any(x -> x isa AbstractGPUArray, Lux.fcollect(nt))
-end
-
 function get_l2_loss_function(loss_function, dataset)
     loss(θ) = mean(abs2, loss_function(dataset, θ))
     return loss
@@ -14,35 +10,28 @@ This function is only used for the first order derivative.
 """
 forwarddiff(phi, t, εs, order, θ) = ForwardDiff.gradient(sum ∘ Base.Fix2(phi, θ), t)
 
-for (dev) in (:CPU, :CUDA, :AMDGPU, :Metal)
-    ldev = Symbol("Lux$(dev)Device")
-    ladaptor = Symbol("Lux$(dev)Adaptor")
-    @eval @inline get_adaptor(::$(ldev)) = $(ladaptor)()
-end
-@inline get_gpu_adaptor() = get_adaptor(gpu_device())
-
-@memoize maybe_adapt(x::AbstractGPUArray, ε) = convert(parameterless_type(x), ε)
-@memoize maybe_adapt(x, ε) = ε
-ChainRulesCore.@non_differentiable maybe_adapt(x, ε)
+@memoize maybe_convert(x::AbstractGPUArray, ε) = convert(parameterless_type(x), ε)
+@memoize maybe_convert(x, ε) = ε
+ChainRulesCore.@non_differentiable maybe_convert(x, ε)
 
 @inline function finitediff(phi, x, θ, ε_::AbstractVector{T}, h::T, ::Val{1}) where {T<:AbstractFloat}
-    ε = maybe_adapt(x, ε_)
+    ε = maybe_convert(x, ε_)
     return (phi(x .+ ε, θ) .- phi(x .- ε, θ)) .* (h / 2)
 end
 
 @inline function finitediff(phi, x, θ, ε_::AbstractVector{T}, h::T, ::Val{2}) where {T<:AbstractFloat}
-    ε = maybe_adapt(x, ε_)
+    ε = maybe_convert(x, ε_)
     return (phi(x .+ ε, θ) .+ phi(x .- ε, θ) .- 2 .* phi(x, θ)) .* h^2
 end
 
 @inline function finitediff(phi, x, θ, ε_::AbstractVector{T}, h::T, ::Val{3}) where {T<:AbstractFloat}
-    ε = maybe_adapt(x, ε_)
+    ε = maybe_convert(x, ε_)
     return (phi(x .+ 2 .* ε, θ) .- 2 .* phi(x .+ ε, θ) .+ 2 .* phi(x .- ε, θ) -
             phi(x .- 2 .* ε, θ)) .* h^3 ./ 2
 end
 
 @inline function finitediff(phi, x, θ, ε_::AbstractVector{T}, h::T, ::Val{4}) where {T<:AbstractFloat}
-    ε = maybe_adapt(x, ε_)
+    ε = maybe_convert(x, ε_)
     return (phi(x .+ 2 .* ε, θ) .- 4 .* phi(x .+ ε, θ) .+ 6 .* phi(x, θ) .-
             4 .* phi(x .- ε, θ) .+ phi(x .- 2 .* ε, θ)) .* h^4
 end
diff --git a/test/runtests.jl b/test/runtests.jl
index 335976aa..fd1f7e23 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -288,27 +288,6 @@ rng = Random.default_rng()
         @test_nowarn AdaptiveTraining((θ, p) -> p, 5)
         @test_nowarn AdaptiveTraining(((θ, p) -> p, (θ, p) -> θ), (3, 4, 5))
     end
-
-    #=
-    @testset "GPU" begin
-        @testset "single model" begin
-            pinn = PINN(DiscreteFourierFeature(2,1,2,2))
-            pinn = pinn |> gpu
-            @test getdata(pinn.init_params) isa CuArray
-            phi = pinn.phi
-            @test phi.state.weight isa CuArray
-        end
-
-        @testset "multiple models" begin
-            pinn = PINN(u = DiscreteFourierFeature(2,1,2,2),
-                        v = DiscreteFourierFeature(2,1,2,2))
-            pinn = pinn |> gpu
-            @test getdata(pinn.init_params) isa CuArray
-            phi = pinn.phi
-            @test phi.u.state.weight isa CuArray
-        end
-    end
-    =#
 end
 
 @testset "BetaSampler" begin include("betasampler.jl") end

From 855804504de9254a929b913dc755103855969978 Mon Sep 17 00:00:00 2001
From: Yicheng Wu <yicheng.wu@ucalgary.ca>
Date: Sat, 25 Nov 2023 14:56:27 +0800
Subject: [PATCH 11/16] Update SophonLuxCUDAExt.jl

---
 ext/SophonLuxCUDAExt.jl | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/ext/SophonLuxCUDAExt.jl b/ext/SophonLuxCUDAExt.jl
index b309ef28..6705bea6 100644
--- a/ext/SophonLuxCUDAExt.jl
+++ b/ext/SophonLuxCUDAExt.jl
@@ -1,8 +1,8 @@
 module SophonLuxCUDAExt
 
-using Lux, LuxCUDA, Sophon, ModelingToolkit
+using Lux, LuxCUDA, Sophon, Optimization
 
-function (::LuxCUDADevice)(prob::Union{ModelingToolkit.PDESystem, Sophon.PDESystem})
+function (::LuxCUDADevice)(prob::OptimizationProblem)
     u0 = adapt(CuArray, prob.u0)
     p = [adapt(CuArray, prob.p[i]) for i in 1:length(prob.p)]
     prob = remake(prob, u0=u0, p=p)

From 07d78c655abf0985e2f6f395fecd329189c00ee3 Mon Sep 17 00:00:00 2001
From: Yicheng Wu <yicheng.wu@ucalgary.ca>
Date: Sat, 25 Nov 2023 15:11:28 +0800
Subject: [PATCH 12/16] Update SophonTaylorDiffExt.jl

---
 ext/SophonTaylorDiffExt.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ext/SophonTaylorDiffExt.jl b/ext/SophonTaylorDiffExt.jl
index 916b591c..47a0136b 100644
--- a/ext/SophonTaylorDiffExt.jl
+++ b/ext/SophonTaylorDiffExt.jl
@@ -115,7 +115,7 @@ for N in 1:5
 end
 
 @inline function taylordiff(phi, x, θ, ε_::AbstractVector{T}, h::T, ::Val{N}) where {T <: Number, N}
-    ε = Sophon.maybe_adapt(x, ε_)
+    ε = Sophon.maybe_convert(x, ε_)
     return TaylorDiff.derivative(Base.Fix2(phi, θ), x, ε, Val{N+1}())
 end
 

From 77f7d24eaac0d570cdca6508d5c4b3d9d08cb815 Mon Sep 17 00:00:00 2001
From: Yicheng Wu <yicheng.wu@ucalgary.ca>
Date: Sat, 25 Nov 2023 15:11:30 +0800
Subject: [PATCH 13/16] Update helmholtz.md

---
 docs/src/tutorials/helmholtz.md | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/docs/src/tutorials/helmholtz.md b/docs/src/tutorials/helmholtz.md
index c4663d7a..58713e44 100644
--- a/docs/src/tutorials/helmholtz.md
+++ b/docs/src/tutorials/helmholtz.md
@@ -55,9 +55,7 @@ xs, ys= [infimum(d.domain):0.01:supremum(d.domain) for d in domains]
 u_analytic(x,y) = sinpi(a1*x)*sinpi(a2*y)
 u_real = [u_analytic(x,y) for x in xs, y in ys]
 
-phi_cpu = cpu(phi) # in case you are using GPU
-ps_cpu = cpu(res.u)
-u_pred = [sum(phi_cpu(([x,y]), ps_cpu)) for x in xs, y in ys]
+u_pred = [sum(phi(([x,y]), ps)) for x in xs, y in ys]
 
 using CairoMakie
 axis = (xlabel="x", ylabel="y", title="Analytical Solution")

From fcc879d22f1cfb0e4e512a981dcf007b646fa3f0 Mon Sep 17 00:00:00 2001
From: Yicheng Wu <yicheng.wu@ucalgary.ca>
Date: Sat, 25 Nov 2023 17:02:04 +0800
Subject: [PATCH 14/16] Update helmholtz.md

---
 docs/src/tutorials/helmholtz.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/src/tutorials/helmholtz.md b/docs/src/tutorials/helmholtz.md
index 58713e44..a09b3cde 100644
--- a/docs/src/tutorials/helmholtz.md
+++ b/docs/src/tutorials/helmholtz.md
@@ -50,7 +50,7 @@ prob = Sophon.discretize(helmholtz, pinn, sampler, strategy)
 Let's plot the result.
 ```@example helmholtz
 phi = pinn.phi
-
+ps = res.u
 xs, ys= [infimum(d.domain):0.01:supremum(d.domain) for d in domains]
 u_analytic(x,y) = sinpi(a1*x)*sinpi(a2*y)
 u_real = [u_analytic(x,y) for x in xs, y in ys]

From 6f750b6aa2d6afa9f8f837974578dfa5de5a5fd0 Mon Sep 17 00:00:00 2001
From: Yicheng Wu <yicheng.wu@ucalgary.ca>
Date: Sat, 25 Nov 2023 17:10:13 +0800
Subject: [PATCH 15/16] Update SophonLuxCUDAExt.jl

---
 ext/SophonLuxCUDAExt.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ext/SophonLuxCUDAExt.jl b/ext/SophonLuxCUDAExt.jl
index 6705bea6..23cbd78d 100644
--- a/ext/SophonLuxCUDAExt.jl
+++ b/ext/SophonLuxCUDAExt.jl
@@ -1,6 +1,6 @@
 module SophonLuxCUDAExt
 
-using Lux, LuxCUDA, Sophon, Optimization
+using Lux, LuxCUDA, Sophon, Optimization, Adapt
 
 function (::LuxCUDADevice)(prob::OptimizationProblem)
     u0 = adapt(CuArray, prob.u0)

From 798e747755552dfdb487ac992f72f5070dcf7321 Mon Sep 17 00:00:00 2001
From: Yicheng Wu <yicheng.wu@ucalgary.ca>
Date: Sat, 25 Nov 2023 18:22:01 +0800
Subject: [PATCH 16/16] fix zygote error by using tuple

---
 ext/SophonLuxCUDAExt.jl | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/ext/SophonLuxCUDAExt.jl b/ext/SophonLuxCUDAExt.jl
index 23cbd78d..47a5c92b 100644
--- a/ext/SophonLuxCUDAExt.jl
+++ b/ext/SophonLuxCUDAExt.jl
@@ -4,9 +4,8 @@ using Lux, LuxCUDA, Sophon, Optimization, Adapt
 
 function (::LuxCUDADevice)(prob::OptimizationProblem)
     u0 = adapt(CuArray, prob.u0)
-    p = [adapt(CuArray, prob.p[i]) for i in 1:length(prob.p)]
-    prob = remake(prob, u0=u0, p=p)
-    return prob
+    p = Tuple(adapt(CuArray, prob.p[i]) for i in 1:length(prob.p))  # must be a Tuple: using a Vector here triggered a Zygote error
+    return Optimization.OptimizationProblem(prob.f, u0, p)
 end
 
 end