diff --git a/Project.toml b/Project.toml
index 8fc1a328..f16e9ad0 100644
--- a/Project.toml
+++ b/Project.toml
@@ -42,7 +42,8 @@ TaylorDiff = "b36ab563-344f-407b-a36a-4f200bebf99c"
 [extensions]
 SophonOptimisersExt = "Optimisers"
 SophonTaylorDiffExt = "TaylorDiff"
-SophonTaylorDiffLuxExt = ["TaylorDiff", "LuxCUDA"]
+SophonTaylorDiffLuxCUDAExt = ["TaylorDiff", "LuxCUDA"]
+SophonLuxCUDAExt = "LuxCUDA"
 
 [compat]
 Adapt = "3"
diff --git a/docs/src/qa.md b/docs/src/qa.md
index c0207650..10fc5660 100644
--- a/docs/src/qa.md
+++ b/docs/src/qa.md
@@ -4,8 +4,7 @@
 A: To train the model on a single GPU, do the following:
 ```julia
 using Lux, LuxCUDA
-device = gpu_device()
-pinn = PINN(...) |> device
+prob = Sophon.discretize(...) |> gpu_device()
 ```
 
 ## Q: How can I monitor the loss for each loss function?
diff --git a/ext/SophonLuxCUDAExt.jl b/ext/SophonLuxCUDAExt.jl
new file mode 100644
index 00000000..b309ef28
--- /dev/null
+++ b/ext/SophonLuxCUDAExt.jl
@@ -0,0 +1,12 @@
+module SophonLuxCUDAExt
+
+using Adapt, Lux, LuxCUDA, Sophon, ModelingToolkit
+
+function (::LuxCUDADevice)(prob::Union{ModelingToolkit.PDESystem, Sophon.PDESystem})
+    u0 = adapt(CuArray, prob.u0)
+    p = [adapt(CuArray, prob.p[i]) for i in 1:length(prob.p)]
+    prob = remake(prob, u0=u0, p=p)
+    return prob
+end
+
+end
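For context, here is how the new extension is meant to be exercised, mirroring the updated `docs/src/qa.md` above. This is a minimal sketch only: `pde_system`, `pinn`, `sampler`, and `strategy` are placeholders for a problem set up as in the Sophon tutorials and are not defined by this patch.

```julia
using Lux, LuxCUDA, Sophon

# Placeholders: a PDESystem, a PINN, a PINNSampler, and a training
# strategy are assumed to be set up beforehand (not part of this PR).
prob = Sophon.discretize(pde_system, pinn, sampler, strategy)

# Piping through `gpu_device()` is intended to hit the overload added in
# ext/SophonLuxCUDAExt.jl, which adapts `u0` and each entry of `p` to
# `CuArray`s and rebuilds the problem via `remake`.
prob = prob |> gpu_device()
```

Note the shift in the documented workflow: the discretized problem, rather than the `PINN` container itself, is what gets moved to the device.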
diff --git a/src/Sophon.jl b/src/Sophon.jl
index 31a87233..cde41cae 100644
--- a/src/Sophon.jl
+++ b/src/Sophon.jl
@@ -38,7 +38,6 @@
 include("layers/nets.jl")
 include("layers/utils.jl")
 include("layers/operators.jl")
-include("pde/componentarrays.jl")
 include("pde/pinn_types.jl")
 include("pde/utils.jl")
 include("pde/sym_utils.jl")
diff --git a/src/pde/componentarrays.jl b/src/pde/componentarrays.jl
deleted file mode 100644
index 23c11001..00000000
--- a/src/pde/componentarrays.jl
+++ /dev/null
@@ -1,15 +0,0 @@
-const AbstractGPUComponentArray{T, N, Ax} = ComponentArray{T, N,
-                                                           <:GPUArraysCore.AbstractGPUVector,
-                                                           Ax}
-const AbstractGPUComponentVector{T, Ax} = ComponentArray{T, 1,
-                                                         <:GPUArraysCore.AbstractGPUVector,
-                                                         Ax}
-const AbstractGPUComponentMatrix{T, Ax} = ComponentArray{T, 2,
-                                                         <:GPUArraysCore.AbstractGPUMatrix,
-                                                         Ax}
-const AbstractGPUComponentVecorMat{T, Ax} = Union{AbstractGPUComponentVector{T, Ax},
-                                                  AbstractGPUComponentMatrix{T, Ax}}
-
-function _ComponentArray(nt::NamedTuple)
-    return isongpu(nt) ? adapt(get_gpu_adaptor(), ComponentArray(cpu(nt))) : ComponentArray(nt)
-end
diff --git a/src/pde/discretize.jl b/src/pde/discretize.jl
index 86a5ca2f..51dc4a3d 100644
--- a/src/pde/discretize.jl
+++ b/src/pde/discretize.jl
@@ -76,11 +76,9 @@ function discretize(pde_system, pinn::PINN, sampler::PINNSampler,
                     adtype=Optimization.AutoZygote())
     datasets = sample(pde_system, sampler)
     init_params = Lux.fmap(Base.Fix1(broadcast, fdtype), pinn.init_params)
-    init_params = _ComponentArray(init_params)
+    init_params = ComponentArray(init_params)
 
     datasets = map(Base.Fix1(broadcast, fdtype), datasets)
-    datasets = init_params isa AbstractGPUComponentVector ?
-               map(Base.Fix1(adapt, get_gpu_adaptor()), datasets) : datasets
 
     pde_and_bcs_loss_function = build_loss_function(pde_system, pinn, strategy,
                                                     derivative, derivative_bc, fdtype)
diff --git a/src/pde/pinn_types.jl b/src/pde/pinn_types.jl
index 9ff7578e..09d707ee 100644
--- a/src/pde/pinn_types.jl
+++ b/src/pde/pinn_types.jl
@@ -2,8 +2,7 @@
     PINN(chain, rng::AbstractRNG=Random.default_rng())
     PINN(rng::AbstractRNG=Random.default_rng(); kwargs...)
 
-A container for a neural network, its states and its initial parameters. Call `Lux.gpu_device()`
-and `Lux.cpu_device()` to move the neural network to the GPU and CPU respectively.
+A container for a neural network, its states and its initial parameters. The default element type of the parameters is `Float64`.
 
 ## Fields
diff --git a/src/pde/utils.jl b/src/pde/utils.jl
index b6463c2f..24550f2f 100644
--- a/src/pde/utils.jl
+++ b/src/pde/utils.jl
@@ -1,7 +1,3 @@
-function isongpu(nt::NamedTuple)
-    return any(x -> x isa AbstractGPUArray, Lux.fcollect(nt))
-end
-
 function get_l2_loss_function(loss_function, dataset)
     loss(θ) = mean(abs2, loss_function(dataset, θ))
     return loss
@@ -14,35 +10,28 @@
 This function is only used for the first order derivative.
 """
 forwarddiff(phi, t, εs, order, θ) = ForwardDiff.gradient(sum ∘ Base.Fix2(phi, θ), t)
 
-for (dev) in (:CPU, :CUDA, :AMDGPU, :Metal)
-    ldev = Symbol("Lux$(dev)Device")
-    ladaptor = Symbol("Lux$(dev)Adaptor")
-    @eval @inline get_adaptor(::$(ldev)) = $(ladaptor)()
-end
-@inline get_gpu_adaptor() = get_adaptor(gpu_device())
-
-@memoize maybe_adapt(x::AbstractGPUArray, ε) = convert(parameterless_type(x), ε)
-@memoize maybe_adapt(x, ε) = ε
-ChainRulesCore.@non_differentiable maybe_adapt(x, ε)
+@memoize maybe_convert(x::AbstractGPUArray, ε) = convert(parameterless_type(x), ε)
+@memoize maybe_convert(x, ε) = ε
+ChainRulesCore.@non_differentiable maybe_convert(x, ε)
 
 @inline function finitediff(phi, x, θ, ε_::AbstractVector{T}, h::T,
                             ::Val{1}) where {T<:AbstractFloat}
-    ε = maybe_adapt(x, ε_)
+    ε = maybe_convert(x, ε_)
     return (phi(x .+ ε, θ) .- phi(x .- ε, θ)) .* (h / 2)
 end
 
 @inline function finitediff(phi, x, θ, ε_::AbstractVector{T}, h::T,
                             ::Val{2}) where {T<:AbstractFloat}
-    ε = maybe_adapt(x, ε_)
+    ε = maybe_convert(x, ε_)
     return (phi(x .+ ε, θ) .+ phi(x .- ε, θ) .- 2 .* phi(x, θ)) .* h^2
 end
 
 @inline function finitediff(phi, x, θ, ε_::AbstractVector{T}, h::T,
                             ::Val{3}) where {T<:AbstractFloat}
-    ε = maybe_adapt(x, ε_)
+    ε = maybe_convert(x, ε_)
     return (phi(x .+ 2 .* ε, θ) .- 2 .* phi(x .+ ε, θ) .+ 2 .* phi(x .- ε, θ) -
             phi(x .- 2 .* ε, θ)) .* h^3 ./ 2
 end
 
 @inline function finitediff(phi, x, θ, ε_::AbstractVector{T}, h::T,
                             ::Val{4}) where {T<:AbstractFloat}
-    ε = maybe_adapt(x, ε_)
+    ε = maybe_convert(x, ε_)
     return (phi(x .+ 2 .* ε, θ) .- 4 .* phi(x .+ ε, θ) .+ 6 .* phi(x, θ) .-
             4 .* phi(x .- ε, θ) .+ phi(x .- 2 .* ε, θ)) .* h^4
 end
diff --git a/test/runtests.jl b/test/runtests.jl
index 335976aa..fd1f7e23 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -288,27 +288,6 @@ rng = Random.default_rng()
         @test_nowarn AdaptiveTraining((θ, p) -> p, 5)
         @test_nowarn AdaptiveTraining(((θ, p) -> p, (θ, p) -> θ), (3, 4, 5))
     end
-
-    #=
-    @testset "GPU" begin
-        @testset "single model" begin
-            pinn = PINN(DiscreteFourierFeature(2,1,2,2))
-            pinn = pinn |> gpu
-            @test getdata(pinn.init_params) isa CuArray
-            phi = pinn.phi
-            @test phi.state.weight isa CuArray
-        end
-
-        @testset "multiple models" begin
-            pinn = PINN(u = DiscreteFourierFeature(2,1,2,2),
-                        v = DiscreteFourierFeature(2,1,2,2))
-            pinn = pinn |> gpu
-            @test getdata(pinn.init_params) isa CuArray
-            phi = pinn.phi
-            @test phi.u.state.weight isa CuArray
-        end
-    end
-    =#
 end
 
 @testset "BetaSampler" begin include("betasampler.jl") end
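One detail worth noting in the `finitediff` kernels touched above: `h` is the reciprocal of the step size, which is why the first-order stencil multiplies by `h / 2` rather than dividing by `2h` (and the higher orders multiply by `h^2`, `h^3 / 2`, and `h^4`). A self-contained sanity check of that convention, with a hypothetical `phi` standing in for the network:

```julia
# Stand-in for the network: phi(x, θ) ≈ u(x); the parameters θ are ignored here.
phi(x, θ) = sin.(x)

Δ = cbrt(eps(Float64))   # step size
h = inv(Δ)               # `h` as passed to `finitediff`: the reciprocal step
x = [0.5]
ε = [Δ]                  # perturbation along the differentiated coordinate

# First-order central difference, matching the `Val{1}` method above.
du = (phi(x .+ ε, nothing) .- phi(x .- ε, nothing)) .* (h / 2)
du[1] ≈ cos(0.5)         # true: agrees with the analytic derivative of sin
```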