From ea6bd626e280d468132f8ecba1dab78c5859e527 Mon Sep 17 00:00:00 2001 From: nHackel Date: Tue, 9 Apr 2024 17:28:16 +0200 Subject: [PATCH 01/47] Restructure FFTOP to work on GPU --- ext/LinearOperatorFFTWExt/FFTOp.jl | 58 ++++++++----------- .../LinearOperatorFFTWExt.jl | 2 +- 2 files changed, 24 insertions(+), 36 deletions(-) diff --git a/ext/LinearOperatorFFTWExt/FFTOp.jl b/ext/LinearOperatorFFTWExt/FFTOp.jl index 5ea0d8c..ef56bf1 100644 --- a/ext/LinearOperatorFFTWExt/FFTOp.jl +++ b/ext/LinearOperatorFFTWExt/FFTOp.jl @@ -1,6 +1,6 @@ export FFTOpImpl -mutable struct FFTOpImpl{T} <: FFTOp{T} +mutable struct FFTOpImpl{T, vecT, P <: AbstractFFTs.Plan{T}, IP <: AbstractFFTs.Plan{T}} <: FFTOp{T} nrow :: Int ncol :: Int symmetric :: Bool @@ -14,10 +14,10 @@ mutable struct FFTOpImpl{T} <: FFTOp{T} args5 :: Bool use_prod5! :: Bool allocated5 :: Bool - Mv5 :: Vector{T} - Mtu5 :: Vector{T} - plan - iplan + Mv5 :: vecT + Mtu5 :: vecT + plan :: P + iplan :: IP shift::Bool unitary::Bool end @@ -34,13 +34,14 @@ returns an operator which performs an FFT on Arrays of type T * `shape::Tuple` - size of the array to transform * (`shift=true`) - if true, fftshifts are performed * (`unitary=true`) - if true, FFT is normalized such that it is unitary +* (`S = Vector{T}`) - type of temporary vector, change to use on GPU +* (`kwargs...`) - keyword arguments given to fft plan """ -function LinearOperatorCollection.FFTOp(T::Type; shape::NTuple{D,Int64}, shift::Bool=true, unitary::Bool=true, cuda::Bool=false) where D +function LinearOperatorCollection.FFTOp(T::Type; shape::NTuple{D,Int64}, shift::Bool=true, unitary::Bool=true, S = Array{Complex{real(T)}}, kwargs...) where D - #tmpVec = cuda ? CuArray{T}(undef,shape) : Array{Complex{real(T)}}(undef, shape) - tmpVec = Array{Complex{real(T)}}(undef, shape) - plan = plan_fft!(tmpVec; flags=FFTW.MEASURE) - iplan = plan_bfft!(tmpVec; flags=FFTW.MEASURE) + tmpVec = S(undef, shape...) + plan = plan_fft!(tmpVec; kwargs...) + iplan = plan_bfft!(tmpVec; kwargs...) if unitary facF = T(1.0/sqrt(prod(shape))) @@ -50,39 +51,26 @@ function LinearOperatorCollection.FFTOp(T::Type; shape::NTuple{D,Int64}, shift:: facB = T(1.0) end - let shape_=shape, plan_=plan, iplan_=iplan, tmpVec_=tmpVec, facF_=facF, facB_=facB + let shape_ = shape, plan_ = plan, iplan_ = iplan, tmpVec_ = tmpVec, facF_ = facF, facB_ = facB - if shift - return FFTOpImpl{T}(prod(shape), prod(shape), false, false - , (res, x) -> fft_multiply_shift!(res, plan_, x, shape_, facF_, tmpVec_) - , nothing - , (res, x) -> fft_multiply_shift!(res, iplan_, x, shape_, facB_, tmpVec_) - , 0, 0, 0, true, false, true, T[], T[] - , plan - , iplan - , shift - , unitary) - else - return FFTOpImpl{T}(prod(shape), prod(shape), false, false - , (res, x) -> fft_multiply!(res, plan_, x, facF_, tmpVec_) - , nothing - , (res, x) -> fft_multiply!(res, iplan_, x, facB_, tmpVec_) - , 0, 0, 0, true, false, true, T[], T[] - , plan - , iplan - , shift - , unitary) - end + fun! = fft_multiply! + if shift + fun! = fft_multiply_shift! 
+ end + + return FFTOpImpl(prod(shape), prod(shape), false, false, (res, x) -> fun!(res, plan_, x, shape_, facF_, tmpVec_), + nothing, (res, x) -> fun!(res, iplan_, x, shape_, facB_, tmpVec_), + 0, 0, 0, true, false, true, similar(tmpVec, 0), similar(tmpVec, 0), plan, iplan, shift, unitary) end end -function fft_multiply!(res::AbstractVector{T}, plan::P, x::AbstractVector{Tr}, factor::T, tmpVec::Array{T,D}) where {T, Tr, P<:AbstractFFTs.Plan, D} +function fft_multiply!(res::AbstractVector{T}, plan::P, x::AbstractVector{Tr}, factor::T, tmpVec::AbstractArray{T,D}) where {T, Tr, P<:AbstractFFTs.Plan, D} tmpVec[:] .= x plan * tmpVec res .= factor .* vec(tmpVec) end -function fft_multiply_shift!(res::AbstractVector{T}, plan::P, x::AbstractVector{Tr}, shape::NTuple{D}, factor::T, tmpVec::Array{T,D}) where {T, Tr, P<:AbstractFFTs.Plan, D} +function fft_multiply_shift!(res::AbstractVector{T}, plan::P, x::AbstractVector{Tr}, shape::NTuple{D}, factor::T, tmpVec::AbstractArray{T,D}) where {T, Tr, P<:AbstractFFTs.Plan, D} ifftshift!(tmpVec, reshape(x,shape)) plan * tmpVec fftshift!(reshape(res,shape), tmpVec) @@ -91,5 +79,5 @@ end function Base.copy(S::FFTOpImpl) - return FFTOp(eltype(S); shape=size(S.plan), shift=S.shift, unitary=S.unitary) + return FFTOp(eltype(S); shape=size(S.plan), shift=S.shift, unitary=S.unitary, S = LinearOperators.storage_type(S)) # TODO loses kwargs... end diff --git a/ext/LinearOperatorFFTWExt/LinearOperatorFFTWExt.jl b/ext/LinearOperatorFFTWExt/LinearOperatorFFTWExt.jl index 5768120..4708dde 100644 --- a/ext/LinearOperatorFFTWExt/LinearOperatorFFTWExt.jl +++ b/ext/LinearOperatorFFTWExt/LinearOperatorFFTWExt.jl @@ -1,6 +1,6 @@ module LinearOperatorFFTWExt -using LinearOperatorCollection, FFTW +using LinearOperatorCollection, FFTW, FFTW.AbstractFFTs include("FFTOp.jl") include("DCTOp.jl") From 9a2069c55cb6b3e424397dcd24c75651737fa09d Mon Sep 17 00:00:00 2001 From: nHackel Date: Thu, 11 Apr 2024 13:37:51 +0200 Subject: [PATCH 02/47] Add GPU support to GradientOp --- Project.toml | 3 ++ ext/LinearOperatorGPUArraysExt/GradientOp.jl | 38 +++++++++++++++++++ .../LinearOperatorGPUArraysExt.jl | 8 ++++ src/GradientOp.jl | 14 +++---- 4 files changed, 56 insertions(+), 7 deletions(-) create mode 100644 ext/LinearOperatorGPUArraysExt/GradientOp.jl create mode 100644 ext/LinearOperatorGPUArraysExt/LinearOperatorGPUArraysExt.jl diff --git a/Project.toml b/Project.toml index 17c7656..b125b5b 100644 --- a/Project.toml +++ b/Project.toml @@ -16,6 +16,7 @@ Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" [compat] julia = "1.9" +GPUArrays = "8, 9, 10" NFFT = "0.13" LinearOperators = "2.3.3" Wavelets = "0.9, 0.10" @@ -23,6 +24,7 @@ Reexport = "1.0" FFTW = "1.0" [weakdeps] +GPUArrays = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7" NFFT = "efe261a4-0d2b-5849-be55-fc731d526b0d" Wavelets = "29a6e085-ba6d-5f35-a997-948ac2efa89a" FFTW = "7a1cc6ca-52ef-59f5-83cd-3a7055c09341" @@ -34,3 +36,4 @@ test = ["Test", "FFTW", "Wavelets", "NFFT"] LinearOperatorNFFTExt = ["NFFT", "FFTW"] LinearOperatorFFTWExt = "FFTW" LinearOperatorWaveletExt = "Wavelets" +LinearOperatorGPUArraysExt = "GPUArrays" \ No newline at end of file diff --git a/ext/LinearOperatorGPUArraysExt/GradientOp.jl b/ext/LinearOperatorGPUArraysExt/GradientOp.jl new file mode 100644 index 0000000..dfd2145 --- /dev/null +++ b/ext/LinearOperatorGPUArraysExt/GradientOp.jl @@ -0,0 +1,38 @@ +function LinearOperatorCollection.grad!(res::vecT, img::vecT, shape, dim) where {vecT <: AbstractGPUVector} + δ = zeros(Int, length(shape)) + δ[dim] = 1 + δ = Tuple(δ) + 
di = CartesianIndex(δ) + + gpu_call(reshape(res, shape .- δ), reshape(img,shape), di) do ctx, res_, img_, di_ + idx = @cartesianidx(res_) + @inbounds res_[idx] = img_[idx] - img_[idx + di_] + return nothing + end + + return res +end + +# adjoint of directional gradients +function LinearOperatorCollection.grad_t!(res::vecT, g::vecT, shape::NTuple{N,Int64}, dim::Int64) where {vecT <: AbstractGPUVector, N} + δ = zeros(Int, length(shape)) + δ[dim] = 1 + δ = Tuple(δ) + di = CartesianIndex(δ) + + res_ = reshape(res,shape) + g_ = reshape(g, shape .- δ) + + res_ .= 0 + gpu_call(res_, g_, di, elements = length(g)) do ctx, res_k, g_k, di_k + idx = @cartesianidx(g_k) + @inbounds res_k[idx] = g_k[idx] + return nothing + end + + gpu_call(res_, g_, di, elements = length(g)) do ctx, res_k, g_k, di_k + idx = @cartesianidx(g_k) + @inbounds res_k[idx + di_k] -= g_k[idx] + return nothing + end +end diff --git a/ext/LinearOperatorGPUArraysExt/LinearOperatorGPUArraysExt.jl b/ext/LinearOperatorGPUArraysExt/LinearOperatorGPUArraysExt.jl new file mode 100644 index 0000000..613228b --- /dev/null +++ b/ext/LinearOperatorGPUArraysExt/LinearOperatorGPUArraysExt.jl @@ -0,0 +1,8 @@ +module LinearOperatorGPUArraysExt + +using LinearOperatorCollection, GPUArrays + +include("GradientOp.jl") + + +end # module diff --git a/src/GradientOp.jl b/src/GradientOp.jl index 8baedff..d041c3b 100644 --- a/src/GradientOp.jl +++ b/src/GradientOp.jl @@ -12,22 +12,22 @@ directional gradient operator along the dimensions `dims` for an array of size ` # Optional Keyword argument * `dims` - dimension(s) along which the gradient is applied; default is `1:length(shape)` """ -function GradientOp(::Type{T}; shape::NTuple{N,Int}, dims=1:length(shape)) where {T <: Number, N} - return GradientOpImpl(T, shape, dims) +function GradientOp(::Type{T}; shape::NTuple{N,Int}, dims=1:length(shape), kwargs...) where {T <: Number, N} + return GradientOpImpl(T, shape, dims; kwargs...) end -function GradientOpImpl(T::Type, shape::NTuple{N,Int}, dims) where N - return vcat([GradientOpImpl(T, shape, dim) for dim ∈ dims]...) +function GradientOpImpl(T::Type, shape::NTuple{N,Int}, dims; kwargs...) where N + return vcat([GradientOpImpl(T, shape, dim; kwargs...) for dim ∈ dims]...) end -function GradientOpImpl(T::Type, shape::NTuple{N,Int}, dim::Int) where N +function GradientOpImpl(T::Type, shape::NTuple{N,Int}, dim::Int; S = Vector{T}) where N nrow = div( (shape[dim]-1)*prod(shape), shape[dim] ) ncol = prod(shape) return LinearOperator{T}(nrow, ncol, false, false, (res,x) -> (grad!(res,x,shape,dim)), (res,x) -> (grad_t!(res,x,shape,dim)), - (res,x) -> (grad_t!(res,x,shape,dim)) - ) + (res,x) -> (grad_t!(res,x,shape,dim)), + S = S) end # directional gradients From ae5fd8c77ece312b339b3d3b8c7d4636788fff15 Mon Sep 17 00:00:00 2001 From: nHackel Date: Thu, 11 Apr 2024 13:43:33 +0200 Subject: [PATCH 03/47] Added S kwarg to WaveletOp --- ext/LinearOperatorWaveletExt/WaveletOp.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ext/LinearOperatorWaveletExt/WaveletOp.jl b/ext/LinearOperatorWaveletExt/WaveletOp.jl index 07ddaa6..d5e8afe 100644 --- a/ext/LinearOperatorWaveletExt/WaveletOp.jl +++ b/ext/LinearOperatorWaveletExt/WaveletOp.jl @@ -10,10 +10,10 @@ a given input array. 
* `shape` - size of the Array to transform * (`wt=wavelet(WT.db2)`) - Wavelet to apply """ -function LinearOperatorCollection.WaveletOp(::Type{T}; shape::Tuple, wt=wavelet(WT.db2)) where T <: Number +function LinearOperatorCollection.WaveletOp(::Type{T}; shape::Tuple, wt=wavelet(WT.db2), S = Vector{T}) where T <: Number shape = filter(x-> x != 1, shape) # Drop dimension with 1 return LinearOperator(T, prod(shape), prod(shape), false, false , (res,x)->dwt!(reshape(res,shape), reshape(x,shape), wt) , nothing - , (res,x)->idwt!(reshape(res,shape), reshape(x,shape), wt) ) + , (res,x)->idwt!(reshape(res,shape), reshape(x,shape), wt), S = S) end From 51a4e0fccde0c0d83319508b4cd969c58f0ab83c Mon Sep 17 00:00:00 2001 From: nHackel Date: Thu, 11 Apr 2024 14:05:25 +0200 Subject: [PATCH 04/47] Add GPU support to ProdOp --- src/ProdOp.jl | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/src/ProdOp.jl b/src/ProdOp.jl index b8eca6a..210814e 100644 --- a/src/ProdOp.jl +++ b/src/ProdOp.jl @@ -7,7 +7,7 @@ export ProdOp that the latter can be made copyable. This is particularly relevant for multi-threaded code """ -mutable struct ProdOp{T,U,V} <: AbstractLinearOperatorFromCollection{T} +mutable struct ProdOp{T,U,V, vecT <: AbstractVector{T}} <: AbstractLinearOperatorFromCollection{T} nrow :: Int ncol :: Int symmetric :: Bool @@ -21,11 +21,11 @@ mutable struct ProdOp{T,U,V} <: AbstractLinearOperatorFromCollection{T} args5 :: Bool use_prod5! :: Bool allocated5 :: Bool - Mv5 :: Vector{T} - Mtu5 :: Vector{T} + Mv5 :: vecT + Mtu5 :: vecT A::U B::V - tmp::Vector{T} + tmp::vecT end """ @@ -33,11 +33,11 @@ end composition/product of two Operators. Differs with * since it can handle normal operator """ -function ProdOp(A,B) +function ProdOp(A::AbstractLinearOperator, B::AbstractLinearOperator) nrow = size(A, 1) ncol = size(B, 2) - S = promote_type(eltype(A), eltype(B)) - tmp_ = Vector{S}(undef, size(B, 1)) + S = promote_type(storage_type(A), storage_type(B)) + tmp_ = S(undef, size(B, 1)) function produ!(res, x::AbstractVector{T}, tmp) where T<:Union{Real,Complex} mul!(tmp, B, x) @@ -58,11 +58,13 @@ function ProdOp(A,B) (res,x) -> produ!(res,x,tmp_), (res,y) -> tprodu!(res,y,tmp_), (res,y) -> ctprodu!(res,y,tmp_), - 0, 0, 0, false, false, false, S[], S[], + 0, 0, 0, false, false, false, similar(tmp_, 0), similar(tmp_, 0), A, B, tmp_) return Op end +ProdOp(A::AbstractMatrix, B::AbstractLinearOperator) = ProdOp(LinearOperator(A), B) +ProdOp(A::AbstractLinearOperator, B::AbstractMatrix) = ProdOp(A, LinearOperator(B)) function Base.copy(S::ProdOp{T}) where T A = copy(S.A) From 52361c2ba797a84b0bfc3fde58e552dd23398768 Mon Sep 17 00:00:00 2001 From: nHackel Date: Thu, 11 Apr 2024 15:18:52 +0200 Subject: [PATCH 05/47] Add GPU support for SamplingOp --- src/SamplingOp.jl | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/SamplingOp.jl b/src/SamplingOp.jl index fdbdbd4..243f667 100644 --- a/src/SamplingOp.jl +++ b/src/SamplingOp.jl @@ -1,11 +1,11 @@ export vectorizePattern function LinearOperatorCollection.SamplingOp(::Type{T}; - pattern::P, shape::Tuple=()) where {P} where T <: Number + pattern::P, shape::Tuple=(), S = Vector{T}) where {P} where T <: Number if length(shape) == 0 - return SamplingOpImpl(pattern) + return SamplingOpImpl(T, pattern; S = S) else - return SamplingOpImpl(pattern, shape, T) + return SamplingOpImpl(T, pattern, shape, S = S;) end end @@ -28,16 +28,16 @@ indicated by pattern. 
* `pattern::Array{Int}` - indices to sample * `shape::Tuple` - size of the array to sample """ -function SamplingOpImpl(pattern::T, shape::Tuple, type::Type=ComplexF64) where T<:AbstractArray{Int} +function SamplingOpImpl(T::Type{<:Number}, pattern::AbstractArray{Int}, shape::Tuple; S = Vector{T}) ndims(pattern)>1 ? idx = vectorizePattern(pattern, shape) : idx = pattern - return opEye(type,length(idx))*opRestriction(idx, prod(shape)) + return opRestriction(idx, prod(shape), S = S) end -function SamplingOpImpl(pattern::T) where T<:AbstractArray{Bool} +function SamplingOpImpl(T::Type{<:Number}, pattern::AbstractArray{Bool}; S = Vector{T}) function prod!(res::Vector{U}, x::Vector{V}) where {U,V} res .= pattern.*x end - return LinearOperator(T, length(pattern), length(pattern), true, false, prod!) + return LinearOperator(T, length(pattern), length(pattern), true, false, prod!; S = S) end \ No newline at end of file From 44dc932ea870dd13fad77fb2ddfd6a305685071c Mon Sep 17 00:00:00 2001 From: nHackel Date: Thu, 11 Apr 2024 18:00:26 +0200 Subject: [PATCH 06/47] Add GPU support for NormalOp --- src/NormalOp.jl | 50 +++++++++++++++++++++++++------------------------ 1 file changed, 26 insertions(+), 24 deletions(-) diff --git a/src/NormalOp.jl b/src/NormalOp.jl index 67c222a..b704aec 100644 --- a/src/NormalOp.jl +++ b/src/NormalOp.jl @@ -1,14 +1,21 @@ export normalOperator -function LinearOperatorCollection.NormalOp(::Type{T}; - parent, weights, tmp::Union{Nothing,Vector{T}}) where T <: Number - if tmp == nothing - return NormalOpImpl(parent, weights) - else - return NormalOpImpl(parent, weights, tmp) - end +function LinearOperatorCollection.NormalOp(::Type{T}; parent, weights = nothing) where T <: Number + return NormalOp(T, parent, weights) end +NormalOp(::Type{T}, parent::AbstractMatrix{T}, weights) where T = NormalOp(T, LinearOperator(parent), weights) + +# TODO Are weights always restricted to T or can they also be real(T)? +function NormalOp(::Type{T}, parent::AbstractLinearOperator{T}, ::Nothing) where T + weights = similar(storage_type(parent), size(parent, 1)) + weights .= one(eltype(parent)) + return NormalOp(T, parent, weights) +end +NormalOp(::Type{T}, parent::AbstractLinearOperator{T}, weights::AbstractVector{T}) where T = NormalOp(T, parent, WeightingOp(weights)) + +NormalOp(::Type{T}, parent::AbstractLinearOperator{T}, weights::AbstractLinearOperator{T}; kwargs...) where T = NormalOpImpl(parent, weights) + mutable struct NormalOpImpl{T,S,D,V} <: NormalOp{T} nrow :: Int ncol :: Int @@ -23,8 +30,8 @@ mutable struct NormalOpImpl{T,S,D,V} <: NormalOp{T} args5 :: Bool use_prod5! :: Bool allocated5 :: Bool - Mv5 :: Vector{T} - Mtu5 :: Vector{T} + Mv5 :: V + Mtu5 :: V parent::S weights::D tmp::V @@ -32,25 +39,21 @@ end LinearOperators.storage_type(op::NormalOpImpl) = typeof(op.Mv5) -function NormalOpImpl(parent, weights) - T = promote_type(eltype(parent), eltype(weights)) - tmp = Vector{T}(undef, size(parent, 1)) - return NormalOpImpl(parent, weights, tmp) -end - -function NormalOpImpl(parent, weights, tmp::Vector{T}) where T +function NormalOpImpl(parent::AbstractLinearOperator{T}, weights::AbstractLinearOperator{T}) where T + S = promote_type(storage_type(parent), storage_type(weights)) + tmp = S(undef, size(parent, 1)) - function produ!(y, parent, tmp, x) + function produ!(y, parent, weights, tmp, x) mul!(tmp, parent, x) mul!(tmp, weights, tmp) # This can be dangerous. 
We might need to create two tmp vectors return mul!(y, adjoint(parent), tmp) end - return NormalOpImpl(size(parent,2), size(parent,2), false, false - , (res,x) -> produ!(res, parent, tmp, x) + return NormalOpImpl{T, typeof(parent), typeof(weights), typeof(tmp)}(size(parent,2), size(parent,2), false, false + , (res,x) -> produ!(res, parent, weights, tmp, x) , nothing , nothing - , 0, 0, 0, false, false, false, T[], T[] + , 0, 0, 0, true, false, true, similar(tmp, 0), similar(tmp, 0) , parent, weights, tmp) end @@ -58,7 +61,6 @@ function Base.copy(S::NormalOpImpl) return NormalOpImpl(copy(S.parent), S.weights, copy(S.tmp)) end -function normalOperator(parent, weights=opEye(eltype(parent), size(parent,1))) - return NormalOpImpl(parent, weights) -end - +function normalOperator(parent, weights=nothing) + return NormalOp(eltype(parent); parent = parent, weights = weights) +end \ No newline at end of file From 438f7ff8af4623e6298230d88c2b74d7456a3891 Mon Sep 17 00:00:00 2001 From: nHackel Date: Mon, 15 Apr 2024 11:25:15 +0200 Subject: [PATCH 07/47] Lessen restriction on ProdOp, NormalOp --- src/NormalOp.jl | 12 +++++------- src/ProdOp.jl | 4 +--- 2 files changed, 6 insertions(+), 10 deletions(-) diff --git a/src/NormalOp.jl b/src/NormalOp.jl index b704aec..ce1be07 100644 --- a/src/NormalOp.jl +++ b/src/NormalOp.jl @@ -4,17 +4,15 @@ function LinearOperatorCollection.NormalOp(::Type{T}; parent, weights = nothing) return NormalOp(T, parent, weights) end -NormalOp(::Type{T}, parent::AbstractMatrix{T}, weights) where T = NormalOp(T, LinearOperator(parent), weights) - # TODO Are weights always restricted to T or can they also be real(T)? -function NormalOp(::Type{T}, parent::AbstractLinearOperator{T}, ::Nothing) where T +function NormalOp(::Type{T}, parent, ::Nothing) where T weights = similar(storage_type(parent), size(parent, 1)) weights .= one(eltype(parent)) return NormalOp(T, parent, weights) end -NormalOp(::Type{T}, parent::AbstractLinearOperator{T}, weights::AbstractVector{T}) where T = NormalOp(T, parent, WeightingOp(weights)) +NormalOp(::Type{T}, parent, weights::AbstractVector{T}) where T = NormalOp(T, parent, WeightingOp(weights)) -NormalOp(::Type{T}, parent::AbstractLinearOperator{T}, weights::AbstractLinearOperator{T}; kwargs...) where T = NormalOpImpl(parent, weights) +NormalOp(::Type{T}, parent, weights::AbstractLinearOperator{T}; kwargs...) where T = NormalOpImpl(parent, weights) mutable struct NormalOpImpl{T,S,D,V} <: NormalOp{T} nrow :: Int @@ -39,7 +37,7 @@ end LinearOperators.storage_type(op::NormalOpImpl) = typeof(op.Mv5) -function NormalOpImpl(parent::AbstractLinearOperator{T}, weights::AbstractLinearOperator{T}) where T +function NormalOpImpl(parent, weights) S = promote_type(storage_type(parent), storage_type(weights)) tmp = S(undef, size(parent, 1)) @@ -49,7 +47,7 @@ function NormalOpImpl(parent::AbstractLinearOperator{T}, weights::AbstractLinear return mul!(y, adjoint(parent), tmp) end - return NormalOpImpl{T, typeof(parent), typeof(weights), typeof(tmp)}(size(parent,2), size(parent,2), false, false + return NormalOpImpl{eltype(parent), typeof(parent), typeof(weights), typeof(tmp)}(size(parent,2), size(parent,2), false, false , (res,x) -> produ!(res, parent, weights, tmp, x) , nothing , nothing diff --git a/src/ProdOp.jl b/src/ProdOp.jl index 210814e..75dd79e 100644 --- a/src/ProdOp.jl +++ b/src/ProdOp.jl @@ -33,7 +33,7 @@ end composition/product of two Operators. 
Differs with * since it can handle normal operator """ -function ProdOp(A::AbstractLinearOperator, B::AbstractLinearOperator) +function ProdOp(A, B) nrow = size(A, 1) ncol = size(B, 2) S = promote_type(storage_type(A), storage_type(B)) @@ -63,8 +63,6 @@ function ProdOp(A::AbstractLinearOperator, B::AbstractLinearOperator) return Op end -ProdOp(A::AbstractMatrix, B::AbstractLinearOperator) = ProdOp(LinearOperator(A), B) -ProdOp(A::AbstractLinearOperator, B::AbstractMatrix) = ProdOp(A, LinearOperator(B)) function Base.copy(S::ProdOp{T}) where T A = copy(S.A) From 161d5277d113c855f4be11e6581321718e210481 Mon Sep 17 00:00:00 2001 From: nHackel Date: Tue, 16 Apr 2024 10:55:49 +0200 Subject: [PATCH 08/47] Add GPU support to NFFTOp --- ext/LinearOperatorNFFTExt/NFFTOp.jl | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/ext/LinearOperatorNFFTExt/NFFTOp.jl b/ext/LinearOperatorNFFTExt/NFFTOp.jl index 495ea99..9dcb69a 100644 --- a/ext/LinearOperatorNFFTExt/NFFTOp.jl +++ b/ext/LinearOperatorNFFTExt/NFFTOp.jl @@ -5,7 +5,7 @@ function LinearOperatorCollection.NFFTOp(::Type{T}; return NFFTOpImpl(shape, nodes; toeplitz, oversamplingFactor, kernelSize, kargs... ) end -mutable struct NFFTOpImpl{T} <: NFFTOp{T} +mutable struct NFFTOpImpl{T, vecT, P <: AbstractNFFTPlan} <: NFFTOp{T} nrow :: Int ncol :: Int symmetric :: Bool @@ -19,9 +19,9 @@ mutable struct NFFTOpImpl{T} <: NFFTOp{T} args5 :: Bool use_prod5! :: Bool allocated5 :: Bool - Mv5 :: Vector{T} - Mtu5 :: Vector{T} - plan + Mv5 :: vecT + Mtu5 :: vecT + plan :: P toeplitz :: Bool end @@ -39,24 +39,24 @@ generates a `NFFTOpImpl` which evaluates the MRI Fourier signal encoding operato * (`nodes=nothing`) - Array containg the trajectory nodes (redundant) * (`kargs`) - additional keyword arguments """ -function NFFTOpImpl(shape::Tuple, tr::AbstractMatrix{T}; toeplitz=false, oversamplingFactor=1.25, kernelSize=3, kargs...) where {T} +function NFFTOpImpl(shape::Tuple, tr::AbstractMatrix{T}; toeplitz=false, oversamplingFactor=1.25, kernelSize=3, S = Vector{Complex{T}}, kargs...) 
where {T} - plan = plan_nfft(tr, shape, m=kernelSize, σ=oversamplingFactor, precompute=NFFT.TENSOR, + plan = plan_nfft(S, tr, shape, m=kernelSize, σ=oversamplingFactor, precompute=NFFT.TENSOR, fftflags=FFTW.ESTIMATE, blocking=true) - return NFFTOpImpl{Complex{T}}(size(tr,2), prod(shape), false, false + return NFFTOpImpl{eltype(S), S, typeof(plan)}(size(tr,2), prod(shape), false, false , (res,x) -> produ!(res,plan,x) , nothing , (res,y) -> ctprodu!(res,plan,y) - , 0, 0, 0, false, false, false, Complex{T}[], Complex{T}[] + , 0, 0, 0, false, false, false, S(), S() , plan, toeplitz) end -function produ!(y::AbstractVector, plan::NFFT.NFFTPlan, x::AbstractVector) +function produ!(y::AbstractVector, plan::AbstractNFFTPlan, x::AbstractVector) mul!(y, plan, reshape(x,plan.N)) end -function ctprodu!(x::AbstractVector, plan::NFFT.NFFTPlan, y::AbstractVector) +function ctprodu!(x::AbstractVector, plan::AbstractNFFTPlan, y::AbstractVector) mul!(reshape(x, plan.N), adjoint(plan), y) end From 7ea715c4ee21d3a19e15270ec3aaf53c5e7145c4 Mon Sep 17 00:00:00 2001 From: nHackel Date: Tue, 16 Apr 2024 16:17:15 +0200 Subject: [PATCH 09/47] NormalOp use storage_type of parent --- src/NormalOp.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/NormalOp.jl b/src/NormalOp.jl index ce1be07..8e9e09d 100644 --- a/src/NormalOp.jl +++ b/src/NormalOp.jl @@ -1,6 +1,6 @@ export normalOperator -function LinearOperatorCollection.NormalOp(::Type{T}; parent, weights = nothing) where T <: Number +function LinearOperatorCollection.NormalOp(::Type{T}; parent, weights = opEye(eltype(parent), size(parent, 1), S = storage_type(parent))) where T <: Number return NormalOp(T, parent, weights) end @@ -59,6 +59,6 @@ function Base.copy(S::NormalOpImpl) return NormalOpImpl(copy(S.parent), S.weights, copy(S.tmp)) end -function normalOperator(parent, weights=nothing) +function normalOperator(parent, weights=opEye(eltype(parent), size(parent, 1), S= storage_type(parent))) return NormalOp(eltype(parent); parent = parent, weights = weights) end \ No newline at end of file From 60f81a1f05fba5303f07391de3eede188c953c1a Mon Sep 17 00:00:00 2001 From: nHackel Date: Tue, 16 Apr 2024 16:17:40 +0200 Subject: [PATCH 10/47] Add GPU support to NormalOpNFFTToeplitz --- .../LinearOperatorNFFTExt.jl | 2 +- ext/LinearOperatorNFFTExt/NFFTOp.jl | 43 +++++++++++-------- 2 files changed, 25 insertions(+), 20 deletions(-) diff --git a/ext/LinearOperatorNFFTExt/LinearOperatorNFFTExt.jl b/ext/LinearOperatorNFFTExt/LinearOperatorNFFTExt.jl index e85560c..8140e24 100644 --- a/ext/LinearOperatorNFFTExt/LinearOperatorNFFTExt.jl +++ b/ext/LinearOperatorNFFTExt/LinearOperatorNFFTExt.jl @@ -1,6 +1,6 @@ module LinearOperatorNFFTExt -using LinearOperatorCollection, NFFT, FFTW +using LinearOperatorCollection, NFFT, NFFT.AbstractNFFTs, FFTW, FFTW.AbstractFFTs include("NFFTOp.jl") diff --git a/ext/LinearOperatorNFFTExt/NFFTOp.jl b/ext/LinearOperatorNFFTExt/NFFTOp.jl index 9dcb69a..65b6a25 100644 --- a/ext/LinearOperatorNFFTExt/NFFTOp.jl +++ b/ext/LinearOperatorNFFTExt/NFFTOp.jl @@ -77,7 +77,7 @@ end ### Toeplitz Operator ### ######################################################################### -mutable struct NFFTToeplitzNormalOp{T,D,W} <: AbstractLinearOperator{T} +mutable struct NFFTToeplitzNormalOp{T,D,W, vecT <: AbstractVector{T}, matT <: AbstractArray{T, D}, P <: AbstractFFTs.Plan, IP <: AbstractFFTs.Plan} <: AbstractLinearOperator{T} nrow :: Int ncol :: Int symmetric :: Bool @@ -91,20 +91,20 @@ mutable struct NFFTToeplitzNormalOp{T,D,W} <: 
AbstractLinearOperator{T} args5 :: Bool use_prod5! :: Bool allocated5 :: Bool - Mv5 :: Vector{T} - Mtu5 :: Vector{T} + Mv5 :: vecT + Mtu5 :: vecT shape::NTuple{D,Int} weights::W - fftplan - ifftplan - λ::Array{T} - xL1::Array{T,D} - xL2::Array{T,D} + fftplan :: P + ifftplan :: IP + λ::matT + xL1::matT + xL2::matT end LinearOperators.storage_type(op::NFFTToeplitzNormalOp) = typeof(op.Mv5) -function NFFTToeplitzNormalOp(shape, W, fftplan, ifftplan, λ, xL1::Array{T,D}, xL2::Array{T,D}) where {T,D} +function NFFTToeplitzNormalOp(shape, W, fftplan, ifftplan, λ, xL1::matT, xL2::matT) where {T, D, matT <: AbstractArray{T, D}} function produ!(y, shape, fftplan, ifftplan, λ, xL1, xL2, x) xL1 .= 0 @@ -127,33 +127,38 @@ function NFFTToeplitzNormalOp(shape, W, fftplan, ifftplan, λ, xL1::Array{T,D}, , shape, W, fftplan, ifftplan, λ, xL1, xL2) end -function NFFTToeplitzNormalOp(S::NFFTOp{T}, W=opEye(T,size(S,1))) where {T} - shape = S.plan.N +function NFFTToeplitzNormalOp(nfft::NFFTOp{T}, W=opEye(eltype(nfft), size(nfft, 1), S= LinearOperators.storage_type(nfft))) where {T} + shape = nfft.plan.N + + tmpVec = similar(nfft.Mv5, (2 .* shape)...) + tmpVec .= zero(T) # plan the FFTs - fftplan = plan_fft( zeros(T, 2 .* shape);flags=FFTW.MEASURE) - ifftplan = plan_ifft(zeros(T, 2 .* shape);flags=FFTW.MEASURE) + fftplan = plan_fft(tmpVec) + ifftplan = plan_ifft(tmpVec) # TODO extend the following function by weights - # λ = calculateToeplitzKernel(shape, S.plan.k; m = S.plan.params.m, σ = S.plan.params.σ, window = S.plan.params.window, LUTSize = S.plan.params.LUTSize, fftplan = fftplan) + # λ = calculateToeplitzKernel(shape, nfft.plan.k; m = nfft.plan.params.m, σ = nfft.plan.params.σ, window = nfft.plan.params.window, LUTSize = nfft.plan.params.LUTSize, fftplan = fftplan) shape_os = 2 .* shape - p = plan_nfft(typeof(S.plan.k), S.plan.k, shape_os; m = S.plan.params.m, σ = S.plan.params.σ, + p = plan_nfft(typeof(tmpVec), nfft.plan.k, shape_os; m = nfft.plan.params.m, σ = nfft.plan.params.σ, precompute=NFFT.POLYNOMIAL, fftflags=FFTW.ESTIMATE, blocking=true) - eigMat = adjoint(p) * ( W * ones(T, size(S.plan.k,2))) + tmpOnes = similar(tmpVec, size(nfft.plan.k, 2)) + tmpOnes .= one(T) + eigMat = adjoint(p) * ( W * tmpOnes) λ = fftplan * fftshift(eigMat) - xL1 = Array{T}(undef, 2 .* shape) + xL1 = tmpVec xL2 = similar(xL1) return NFFTToeplitzNormalOp(shape, W, fftplan, ifftplan, λ, xL1, xL2) end -function LinearOperatorCollection.normalOperator(S::NFFTOpImpl{T}, W=opEye(T,size(S,1))) where T +function LinearOperatorCollection.normalOperator(S::NFFTOpImpl{T}, W = opEye(eltype(S), size(S, 1), S= LinearOperators.storage_type(S))) where T if S.toeplitz return NFFTToeplitzNormalOp(S,W) else - return LinearOperatorCollection.NormalOpImpl(S,W) + return NormalOp(eltype(S); parent = S, weights = W) end end From 4ee1f31b9795af6389994b21d463d2c9daccecf1 Mon Sep 17 00:00:00 2001 From: nHackel Date: Tue, 16 Apr 2024 16:35:03 +0200 Subject: [PATCH 11/47] Allow kwargs for normalOperator(...) 
--- ext/LinearOperatorNFFTExt/NFFTOp.jl | 8 ++++---- src/NormalOp.jl | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/ext/LinearOperatorNFFTExt/NFFTOp.jl b/ext/LinearOperatorNFFTExt/NFFTOp.jl index 65b6a25..231b518 100644 --- a/ext/LinearOperatorNFFTExt/NFFTOp.jl +++ b/ext/LinearOperatorNFFTExt/NFFTOp.jl @@ -134,8 +134,8 @@ function NFFTToeplitzNormalOp(nfft::NFFTOp{T}, W=opEye(eltype(nfft), size(nfft, tmpVec .= zero(T) # plan the FFTs - fftplan = plan_fft(tmpVec) - ifftplan = plan_ifft(tmpVec) + fftplan = plan_fft(tmpVec; kwargs...) + ifftplan = plan_ifft(tmpVec; kwargs...) # TODO extend the following function by weights # λ = calculateToeplitzKernel(shape, nfft.plan.k; m = nfft.plan.params.m, σ = nfft.plan.params.σ, window = nfft.plan.params.window, LUTSize = nfft.plan.params.LUTSize, fftplan = fftplan) @@ -154,9 +154,9 @@ function NFFTToeplitzNormalOp(nfft::NFFTOp{T}, W=opEye(eltype(nfft), size(nfft, return NFFTToeplitzNormalOp(shape, W, fftplan, ifftplan, λ, xL1, xL2) end -function LinearOperatorCollection.normalOperator(S::NFFTOpImpl{T}, W = opEye(eltype(S), size(S, 1), S= LinearOperators.storage_type(S))) where T +function LinearOperatorCollection.normalOperator(S::NFFTOpImpl{T}, W = opEye(eltype(S), size(S, 1), S= LinearOperators.storage_type(S)), kwargs...) where T if S.toeplitz - return NFFTToeplitzNormalOp(S,W) + return NFFTToeplitzNormalOp(S,W, kwargs...) else return NormalOp(eltype(S); parent = S, weights = W) end diff --git a/src/NormalOp.jl b/src/NormalOp.jl index 8e9e09d..fee17ec 100644 --- a/src/NormalOp.jl +++ b/src/NormalOp.jl @@ -59,6 +59,6 @@ function Base.copy(S::NormalOpImpl) return NormalOpImpl(copy(S.parent), S.weights, copy(S.tmp)) end -function normalOperator(parent, weights=opEye(eltype(parent), size(parent, 1), S= storage_type(parent))) +function normalOperator(parent, weights=opEye(eltype(parent), size(parent, 1), S= storage_type(parent)), kwargs...) return NormalOp(eltype(parent); parent = parent, weights = weights) end \ No newline at end of file From 31095ce291957a0b07718410978ce63dc4cf65bc Mon Sep 17 00:00:00 2001 From: nHackel Date: Thu, 18 Apr 2024 07:26:56 +0200 Subject: [PATCH 12/47] Fix copy for NormalOp --- src/NormalOp.jl | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/NormalOp.jl b/src/NormalOp.jl index fee17ec..889eacf 100644 --- a/src/NormalOp.jl +++ b/src/NormalOp.jl @@ -40,7 +40,10 @@ LinearOperators.storage_type(op::NormalOpImpl) = typeof(op.Mv5) function NormalOpImpl(parent, weights) S = promote_type(storage_type(parent), storage_type(weights)) tmp = S(undef, size(parent, 1)) + return NormalOpImpl(parent, weights, tmp) +end +function NormalOpImpl(parent, weights, tmp) function produ!(y, parent, weights, tmp, x) mul!(tmp, parent, x) mul!(tmp, weights, tmp) # This can be dangerous. We might need to create two tmp vectors From 62a8c8e892779bc16695051883f1713abb4e164b Mon Sep 17 00:00:00 2001 From: nHackel Date: Thu, 18 Apr 2024 07:56:50 +0200 Subject: [PATCH 13/47] Add conversion to dense array to WaveletOp --- ext/LinearOperatorWaveletExt/WaveletOp.jl | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/ext/LinearOperatorWaveletExt/WaveletOp.jl b/ext/LinearOperatorWaveletExt/WaveletOp.jl index d5e8afe..107542f 100644 --- a/ext/LinearOperatorWaveletExt/WaveletOp.jl +++ b/ext/LinearOperatorWaveletExt/WaveletOp.jl @@ -12,8 +12,24 @@ a given input array. 
""" function LinearOperatorCollection.WaveletOp(::Type{T}; shape::Tuple, wt=wavelet(WT.db2), S = Vector{T}) where T <: Number shape = filter(x-> x != 1, shape) # Drop dimension with 1 + tmp = Array{T}(undef, shape...) + tmpRes = Array{T}(undef, shape...) return LinearOperator(T, prod(shape), prod(shape), false, false - , (res,x)->dwt!(reshape(res,shape), reshape(x,shape), wt) + , (res,x)->prodwt!(res, x, wt, tmp, tmpRes) , nothing - , (res,x)->idwt!(reshape(res,shape), reshape(x,shape), wt), S = S) + , (res,x)->ctprodwt!(res, x, wt, tmp, tmpRes), S = S) end + +prodwt!(res::Vector{T}, x::Vector{T}, wt, tmp::Array{T, D}, tmpRes::Array{T, D}) where {T, D} = dwt!(reshape(res,size(tmpRes)), reshape(x,size(tmpRes)), wt) +function prodwt!(res::vecT, x::vecT, wt, tmp::Array{T, D}, tmpRes::Array{T, D}) where {T, D, vecT <: AbstractArray{T}} + tmp[:] = x + dwt!(tmpRes, tmp, wt) + res[:] = tmpRes +end + +ctprodwt!(res::Vector{T}, x::Vector{T}, wt, tmp::Array{T, D}, tmpRes::Array{T, D}) where {T, D} = idwt!(reshape(res,size(tmpRes)), reshape(x,size(tmpRes)), wt) +function ctprodwt!(res::vecT, x::vecT, wt, tmp::Array{T, D}, tmpRes::Array{T, D}) where {T, D, vecT <: AbstractArray{T}} + tmp[:] = x + idwt!(tmpsRes, tmp, wt) + res[:] = tmpRes +end \ No newline at end of file From 7495aa6d49aa139825c76e1204835375da7d94e5 Mon Sep 17 00:00:00 2001 From: nHackel Date: Thu, 18 Apr 2024 09:34:40 +0200 Subject: [PATCH 14/47] Add ProdNormalOp (migration MRIReco --- src/LinearOperatorCollection.jl | 2 +- src/ProdOp.jl | 61 ++++++++++++++++++++++++++++++++- 2 files changed, 61 insertions(+), 2 deletions(-) diff --git a/src/LinearOperatorCollection.jl b/src/LinearOperatorCollection.jl index 574036a..ebbb406 100644 --- a/src/LinearOperatorCollection.jl +++ b/src/LinearOperatorCollection.jl @@ -7,7 +7,7 @@ using SparseArrays using Random using InteractiveUtils -import Base: *, copy, getproperty, setproperty! +import Base: *, ∘, copy, getproperty, setproperty! import LinearOperators: storage_type using Reexport diff --git a/src/ProdOp.jl b/src/ProdOp.jl index 75dd79e..1109ef9 100644 --- a/src/ProdOp.jl +++ b/src/ProdOp.jl @@ -72,5 +72,64 @@ end Base.:*(::Type{<:ProdOp}, A, B) = ProdOp(A, B) Base.:*(::Type{<:ProdOp}, A, args...) = ProdOp(A, *(ProdOp, args...)) +Base.:∘(A::AbstractLinearOperator, B::AbstractLinearOperator) = ProdOp(A, B) -storage_type(op::ProdOp) = typeof(op.Mv5) \ No newline at end of file +storage_type(op::ProdOp) = typeof(op.Mv5) + +mutable struct ProdNormalOp{T,S,U,V <: AbstractVector{T}} <: AbstractLinearOperator{T} + nrow :: Int + ncol :: Int + symmetric :: Bool + hermitian :: Bool + prod! :: Function + tprod! :: Nothing + ctprod! :: Nothing + nprod :: Int + ntprod :: Int + nctprod :: Int + args5 :: Bool + use_prod5! :: Bool + allocated5 :: Bool + Mv5 :: V + Mtu5 :: V + opOuter::S + normalOpInner::U + tmp::V +end + +storage_type(op::ProdNormalOp) = typeof(op.Mv5) + + +function ProdNormalOp(opOuter, normalOpInner, tmp) + + function produ!(y, opOuter, normalOpInner, tmp, x) + mul!(tmp, opOuter, x) + mul!(tmp, normalOpInner, tmp) # This can be dangerous. 
We might need to create two tmp vectors + return mul!(y, adjoint(opOuter), tmp) + end + + return ProdNormalOp(size(opOuter,2), size(opOuter,2), false, false + , (res,x) -> produ!(res, opOuter, normalOpInner, tmp, x) + , nothing + , nothing + , 0, 0, 0, false, false, false, similar(tmp, 0), similar(tmp, 0) + , opOuter, normalOpInner, tmp) +end + + +# In this case we are converting the left argument into a +# weighting matrix, that is passed to normalOperator +# TODO Port vom MRIOperators drops given weighting matrix, I just left it out for now +normalOperator(S::ProdOp{T, <:WeightingOp, matT}) where matT = normalOperator(S.B, S.A) +function normalOperator(S::ProdOp, W=opEye(eltype(S),size(S,1), S = storage_type(S))) + arrayType = storage_type(S) + tmp = arrayType(undef, size(S.A, 2)) + return ProdNormalOp(S.B, normalOperator(S.A, W), tmp) +end + +function Base.copy(S::ProdNormalOp) + opOuter = copy(S.opOuter) + opInner = copy(S.normalOpInner) + tmp = copy(S.tmp) + return ProdNormalOp(opOuter, opInner, tmp) +end From 135b5c4d1c11066341ee7055c1ecfa915f06ec3f Mon Sep 17 00:00:00 2001 From: nHackel Date: Thu, 18 Apr 2024 16:51:01 +0200 Subject: [PATCH 15/47] Fix include order for ProdOp --- src/LinearOperatorCollection.jl | 2 +- src/ProdOp.jl | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/LinearOperatorCollection.jl b/src/LinearOperatorCollection.jl index ebbb406..e66bd73 100644 --- a/src/LinearOperatorCollection.jl +++ b/src/LinearOperatorCollection.jl @@ -83,10 +83,10 @@ function createLinearOperator(op::String, ::Type{T}; kargs...) where T <: Number end +include("WeightingOp.jl") include("ProdOp.jl") include("GradientOp.jl") include("SamplingOp.jl") -include("WeightingOp.jl") include("NormalOp.jl") end diff --git a/src/ProdOp.jl b/src/ProdOp.jl index 1109ef9..d7f9d08 100644 --- a/src/ProdOp.jl +++ b/src/ProdOp.jl @@ -120,7 +120,7 @@ end # In this case we are converting the left argument into a # weighting matrix, that is passed to normalOperator # TODO Port vom MRIOperators drops given weighting matrix, I just left it out for now -normalOperator(S::ProdOp{T, <:WeightingOp, matT}) where matT = normalOperator(S.B, S.A) +normalOperator(S::ProdOp{T, <:WeightingOp, matT}) where {T, matT} = normalOperator(S.B, S.A) function normalOperator(S::ProdOp, W=opEye(eltype(S),size(S,1), S = storage_type(S))) arrayType = storage_type(S) tmp = arrayType(undef, size(S.A, 2)) From 54d5fe5b2eb83769c51155102900ea3c6f1af228 Mon Sep 17 00:00:00 2001 From: nHackel Date: Mon, 22 Apr 2024 14:05:00 +0200 Subject: [PATCH 16/47] Migrate DiagOp from MRIReco with GPU support --- src/DiagOp.jl | 168 ++++++++++++++++++++++++++++++++ src/LinearOperatorCollection.jl | 1 + 2 files changed, 169 insertions(+) create mode 100644 src/DiagOp.jl diff --git a/src/DiagOp.jl b/src/DiagOp.jl new file mode 100644 index 0000000..1eca414 --- /dev/null +++ b/src/DiagOp.jl @@ -0,0 +1,168 @@ +export DiagOp + +mutable struct DiagOp{T, vecT, vecO} <: AbstractLinearOperator{T} + nrow :: Int + ncol :: Int + symmetric :: Bool + hermitian :: Bool + prod! :: Function + tprod! :: Function + ctprod! :: Function + nprod :: Int + ntprod :: Int + nctprod :: Int + args5 :: Bool + use_prod5! :: Bool + allocated5 :: Bool + Mv5 :: vecT + Mtu5 :: vecT + ops :: vecO + equalOps :: Bool + xIdx :: Vector{Int64} + yIdx :: Vector{Int64} +end + + +LinearOperators.storage_type(op::DiagOp) = typeof(op.Mv5) + + + +""" + DiagOp(ops::AbstractLinearOperator...) 
+ DiagOp(ops::Vector{AbstractLinearOperator}) + DiagOp(ops::NTuple{N,AbstractLinearOperator}) + +create a bloc-diagonal operator out of the `LinearOperator`s contained in ops +""" +DiagOp(ops::AbstractLinearOperator...) = DiagOp(ops) + +function DiagOp(ops) + nrow = 0 + ncol = 0 + S = LinearOperators.storage_type(first(ops)) + for i = 1:length(ops) + nrow += ops[i].nrow + ncol += ops[i].ncol + S = promote_type(S, LinearOperators.storage_type(ops[i])) + end + + xIdx = cumsum(vcat(1,[ops[i].ncol for i=1:length(ops)])) + yIdx = cumsum(vcat(1,[ops[i].nrow for i=1:length(ops)])) + + Op = DiagOp( nrow, ncol, false, false, + (res,x) -> (diagOpProd(res,x,nrow,xIdx,yIdx,ops...)), + (res,y) -> (diagOpTProd(res,y,ncol,yIdx,xIdx,ops...)), + (res,y) -> (diagOpCTProd(res,y,ncol,yIdx,xIdx,ops...)), + 0, 0, 0, false, false, false, S(undef, 0), S(undef, 0), + [ops...], false, xIdx, yIdx) + + return Op +end + +function DiagOp(op::AbstractLinearOperator, N=1) + nrow = N*op.nrow + ncol = N*op.ncol + S = LinearOperators.storage_type(first(ops)) + ops = [copy(op) for n=1:N] + + xIdx = cumsum(vcat(1,[ops[i].ncol for i=1:length(ops)])) + yIdx = cumsum(vcat(1,[ops[i].nrow for i=1:length(ops)])) + + Op = DiagOp{S}( nrow, ncol, false, false, + (res,x) -> (diagOpProd(res,x,nrow,xIdx,yIdx,ops...)), + (res,y) -> (diagOpTProd(res,y,ncol,yIdx,xIdx,ops...)), + (res,y) -> (diagOpCTProd(res,y,ncol,yIdx,xIdx,ops...)), + 0, 0, 0, false, false, false, S(undef, 0), S(undef, 0), + ops, true, xIdx, yIdx ) + + return Op +end + +function diagOpProd(y::AbstractVector{T}, x::AbstractVector{T}, nrow::Int, xIdx, yIdx, ops :: AbstractLinearOperator...) where T + for i=1:length(ops) + mul!(view(y,yIdx[i]:yIdx[i+1]-1), ops[i], view(x,xIdx[i]:xIdx[i+1]-1)) + end + return y +end + +function diagOpTProd(y::AbstractVector{T}, x::AbstractVector{T}, ncol::Int, xIdx, yIdx, ops :: AbstractLinearOperator...) where T + for i=1:length(ops) + mul!(view(y,yIdx[i]:yIdx[i+1]-1), transpose(ops[i]), view(x,xIdx[i]:xIdx[i+1]-1)) + end + return y +end + +function diagOpCTProd(y::AbstractVector{T}, x::AbstractVector{T}, ncol::Int, xIdx, yIdx, ops :: AbstractLinearOperator...) where T + for i=1:length(ops) + mul!(view(y,yIdx[i]:yIdx[i+1]-1), adjoint(ops[i]), view(x,xIdx[i]:xIdx[i+1]-1)) + end + return y +end + +### Normal Matrix Code ### + +mutable struct DiagNormalOp{T,vecT,V} <: AbstractLinearOperator{T} + nrow :: Int + ncol :: Int + symmetric :: Bool + hermitian :: Bool + prod! :: Function + tprod! :: Nothing + ctprod! :: Nothing + nprod :: Int + ntprod :: Int + nctprod :: Int + args5 :: Bool + use_prod5! 
:: Bool + allocated5 :: Bool + Mv5 :: vecT + Mtu5 :: vecT + normalOps::V + idx::Vector{Int64} + y::Vector{T} +end + +LinearOperators.storage_type(op::DiagNormalOp) = typeof(op.Mv5) + +function DiagNormalOp(normalOps, N, idx, y::AbstractVector{T}) where {T} + + S = LinearOperators.storage_type(first(normalOps)) + for nop in normalOps + S = promote_type(S, LinearOperators.storage_type(nop)) + end + + return DiagNormalOp(N, N, false, false + , (res,x) -> produ!(res, normalOps, idx, x) + , nothing + , nothing + , 0, 0, 0, false, false, false, S(undef, 0), S(undef, 0) + , normalOps, idx, y) +end + +function diagNormOpProd(y, normalOps, idx, x) + for i=1:length(normalOps) + mul!(view(y,idx[i]:idx[i+1]-1), normalOps[i], view(x,idx[i]:idx[i+1]-1)) + end + return y +end + +function LinearOperatorCollection.normalOperator(diag::DiagOp, W=opEye(eltype(diag), size(diag,1), S = LinearOperators.storage_type(diag))) + T = promote_type(eltype(diag), eltype(W)) + S = promote_type(LinearOperators.storage_type(diag), LinearOperators.storage_type(W)) + weights = W*S(ones(diag.nrow)) + + + if diag.equalOps + # this optimization is only allowed if all ops are the same + + # we promote the weights to be of the same type as T, which will be required + # when creating the temporary vector in normalOperator in a later stage + opInner = normalOperator(diag.ops[1], WeightingOp(T; weights=T.(weights[diag.yIdx[1]:diag.yIdx[2]-1].^2))) + op = DiagNormalOp([copy(opInner) for i=1:length(diag.ops)], size(diag,2), diag.xIdx, S(zeros(T, diag.ncol)) ) + else + op = DiagNormalOp([normalOperator(diag.ops[i], WeightingOp(T; weights=T.(weights[diag.yIdx[i]:diag.yIdx[i+1]-1].^2))) + for i in 1:length(diag.ops)], size(diag,2), diag.xIdx, S(zeros(T, diag.ncol)) ) + end + + return op +end \ No newline at end of file diff --git a/src/LinearOperatorCollection.jl b/src/LinearOperatorCollection.jl index e66bd73..8487977 100644 --- a/src/LinearOperatorCollection.jl +++ b/src/LinearOperatorCollection.jl @@ -88,5 +88,6 @@ include("ProdOp.jl") include("GradientOp.jl") include("SamplingOp.jl") include("NormalOp.jl") +include("DiagOp.jl") end From 3e2b95d4bf7c058bf073b49aa6e3ac41a50fdd03 Mon Sep 17 00:00:00 2001 From: nHackel Date: Tue, 23 Apr 2024 17:48:07 +0200 Subject: [PATCH 17/47] Fix copy NFFTOp --- ext/LinearOperatorNFFTExt/NFFTOp.jl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ext/LinearOperatorNFFTExt/NFFTOp.jl b/ext/LinearOperatorNFFTExt/NFFTOp.jl index 231b518..5b2fcd5 100644 --- a/ext/LinearOperatorNFFTExt/NFFTOp.jl +++ b/ext/LinearOperatorNFFTExt/NFFTOp.jl @@ -61,13 +61,13 @@ function ctprodu!(x::AbstractVector, plan::AbstractNFFTPlan, y::AbstractVector) end -function Base.copy(S::NFFTOpImpl{T}) where {T} +function Base.copy(S::NFFTOpImpl{T, vecT, P}) where {T, vecT, P} plan = copy(S.plan) - return NFFTOpImpl{T}(size(plan.k,2), prod(plan.N), false, false + return NFFTOpImpl{T, vecT, P}(size(plan.k,2), prod(plan.N), false, false , (res,x) -> produ!(res,plan,x) , nothing , (res,y) -> ctprodu!(res,plan,y) - , 0, 0, 0, false, false, false, T[], T[] + , 0, 0, 0, false, false, false, vecT(undef, 0), vecT(undef, 0) , plan, S.toeplitz) end From e828d88c6f7a49f314616d534f1ceb60cddee9ce Mon Sep 17 00:00:00 2001 From: nHackel Date: Tue, 23 Apr 2024 17:49:45 +0200 Subject: [PATCH 18/47] Fix DiagOp constructor and normalOp --- src/DiagOp.jl | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/DiagOp.jl b/src/DiagOp.jl index 1eca414..0bc40ef 100644 --- a/src/DiagOp.jl +++ 
b/src/DiagOp.jl @@ -49,7 +49,7 @@ function DiagOp(ops) xIdx = cumsum(vcat(1,[ops[i].ncol for i=1:length(ops)])) yIdx = cumsum(vcat(1,[ops[i].nrow for i=1:length(ops)])) - Op = DiagOp( nrow, ncol, false, false, + Op = DiagOp{eltype(first(ops)), S, typeof(ops)}( nrow, ncol, false, false, (res,x) -> (diagOpProd(res,x,nrow,xIdx,yIdx,ops...)), (res,y) -> (diagOpTProd(res,y,ncol,yIdx,xIdx,ops...)), (res,y) -> (diagOpCTProd(res,y,ncol,yIdx,xIdx,ops...)), @@ -62,13 +62,13 @@ end function DiagOp(op::AbstractLinearOperator, N=1) nrow = N*op.nrow ncol = N*op.ncol - S = LinearOperators.storage_type(first(ops)) ops = [copy(op) for n=1:N] + S = LinearOperators.storage_type(first(ops)) xIdx = cumsum(vcat(1,[ops[i].ncol for i=1:length(ops)])) yIdx = cumsum(vcat(1,[ops[i].nrow for i=1:length(ops)])) - Op = DiagOp{S}( nrow, ncol, false, false, + Op = DiagOp{eltype(op), S, typeof(ops)}( nrow, ncol, false, false, (res,x) -> (diagOpProd(res,x,nrow,xIdx,yIdx,ops...)), (res,y) -> (diagOpTProd(res,y,ncol,yIdx,xIdx,ops...)), (res,y) -> (diagOpCTProd(res,y,ncol,yIdx,xIdx,ops...)), @@ -119,7 +119,7 @@ mutable struct DiagNormalOp{T,vecT,V} <: AbstractLinearOperator{T} Mtu5 :: vecT normalOps::V idx::Vector{Int64} - y::Vector{T} + y::vecT end LinearOperators.storage_type(op::DiagNormalOp) = typeof(op.Mv5) @@ -131,15 +131,15 @@ function DiagNormalOp(normalOps, N, idx, y::AbstractVector{T}) where {T} S = promote_type(S, LinearOperators.storage_type(nop)) end - return DiagNormalOp(N, N, false, false - , (res,x) -> produ!(res, normalOps, idx, x) + return DiagNormalOp{eltype(first(normalOps)), S, typeof(normalOps)}(N, N, false, false + , (res,x) -> diagNormOpProd!(res, normalOps, idx, x) , nothing , nothing , 0, 0, 0, false, false, false, S(undef, 0), S(undef, 0) , normalOps, idx, y) end -function diagNormOpProd(y, normalOps, idx, x) +function diagNormOpProd!(y, normalOps, idx, x) for i=1:length(normalOps) mul!(view(y,idx[i]:idx[i+1]-1), normalOps[i], view(x,idx[i]:idx[i+1]-1)) end From 8cb2bb414d14d446e22c7acfc88a55a5f1ca2036 Mon Sep 17 00:00:00 2001 From: nHackel Date: Tue, 23 Apr 2024 17:50:15 +0200 Subject: [PATCH 19/47] Allow WeightingOp weights vec to be GPU arrays --- src/WeightingOp.jl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/WeightingOp.jl b/src/WeightingOp.jl index cb80fd2..80f0061 100644 --- a/src/WeightingOp.jl +++ b/src/WeightingOp.jl @@ -7,12 +7,12 @@ generates a `LinearOperator` which multiplies an input vector index-wise with `w * `weights::Vector{T}` - weights vector * `rep::Int=1` - number of sub-arrays that need to be multiplied with `weights` """ -mutable struct WeightingOp{T} <: AbstractLinearOperatorFromCollection{T} +mutable struct WeightingOp{T, vecT <: AbstractVector{T}} <: AbstractLinearOperatorFromCollection{T} op::LinearOperator{T} - weights::Vector{T} + weights::vecT function WeightingOp(weights::vecT, rep::Int=1) where {T <: Number, vecT<:AbstractVector{T}} weights_cat = repeat(weights,rep) - return new{T}(opDiagonal(weights_cat), weights_cat) + return new{T, vecT}(opDiagonal(weights_cat), weights_cat) end end WeightingOp(::Type{T}; weights::vecT, rep::Int=1) where {T <: Number, vecT<:AbstractVector{T}} = WeightingOp(weights, rep) From f562a4adeca4bbe906d6c159fc93336c7bd31744 Mon Sep 17 00:00:00 2001 From: nHackel Date: Wed, 24 Apr 2024 12:06:17 +0200 Subject: [PATCH 20/47] Add operatore copy function as kwarg --- ext/LinearOperatorNFFTExt/NFFTOp.jl | 6 +++--- src/DiagOp.jl | 16 +++++++++------- src/ProdOp.jl | 4 ++-- 3 files changed, 14 insertions(+), 12 
deletions(-) diff --git a/ext/LinearOperatorNFFTExt/NFFTOp.jl b/ext/LinearOperatorNFFTExt/NFFTOp.jl index 5b2fcd5..623df4f 100644 --- a/ext/LinearOperatorNFFTExt/NFFTOp.jl +++ b/ext/LinearOperatorNFFTExt/NFFTOp.jl @@ -127,7 +127,7 @@ function NFFTToeplitzNormalOp(shape, W, fftplan, ifftplan, λ, xL1::matT, xL2::m , shape, W, fftplan, ifftplan, λ, xL1, xL2) end -function NFFTToeplitzNormalOp(nfft::NFFTOp{T}, W=opEye(eltype(nfft), size(nfft, 1), S= LinearOperators.storage_type(nfft))) where {T} +function NFFTToeplitzNormalOp(nfft::NFFTOp{T}, W=opEye(eltype(nfft), size(nfft, 1), S= LinearOperators.storage_type(nfft)); kwargs...) where {T} shape = nfft.plan.N tmpVec = similar(nfft.Mv5, (2 .* shape)...) @@ -154,9 +154,9 @@ function NFFTToeplitzNormalOp(nfft::NFFTOp{T}, W=opEye(eltype(nfft), size(nfft, return NFFTToeplitzNormalOp(shape, W, fftplan, ifftplan, λ, xL1, xL2) end -function LinearOperatorCollection.normalOperator(S::NFFTOpImpl{T}, W = opEye(eltype(S), size(S, 1), S= LinearOperators.storage_type(S)), kwargs...) where T +function LinearOperatorCollection.normalOperator(S::NFFTOpImpl{T}, W = opEye(eltype(S), size(S, 1), S= LinearOperators.storage_type(S)); copyOpsFn = copy, kwargs...) where T if S.toeplitz - return NFFTToeplitzNormalOp(S,W, kwargs...) + return NFFTToeplitzNormalOp(S,W; kwargs...) else return NormalOp(eltype(S); parent = S, weights = W) end diff --git a/src/DiagOp.jl b/src/DiagOp.jl index 0bc40ef..b2f7bda 100644 --- a/src/DiagOp.jl +++ b/src/DiagOp.jl @@ -59,10 +59,10 @@ function DiagOp(ops) return Op end -function DiagOp(op::AbstractLinearOperator, N=1) +function DiagOp(op::AbstractLinearOperator, N=1; copyOpsFn = copy) nrow = N*op.nrow ncol = N*op.ncol - ops = [copy(op) for n=1:N] + ops = [copyOpsFn(op) for n=1:N] S = LinearOperators.storage_type(first(ops)) xIdx = cumsum(vcat(1,[ops[i].ncol for i=1:length(ops)])) @@ -146,10 +146,12 @@ function diagNormOpProd!(y, normalOps, idx, x) return y end -function LinearOperatorCollection.normalOperator(diag::DiagOp, W=opEye(eltype(diag), size(diag,1), S = LinearOperators.storage_type(diag))) +function LinearOperatorCollection.normalOperator(diag::DiagOp, W=opEye(eltype(diag), size(diag,1), S = LinearOperators.storage_type(diag)); copyOpsFn = copy, kwargs...) 
T = promote_type(eltype(diag), eltype(W)) S = promote_type(LinearOperators.storage_type(diag), LinearOperators.storage_type(W)) - weights = W*S(ones(diag.nrow)) + tmp = S(undef, diag.nrow) + tmp .= one(eltype(diag)) + weights = W*tmp if diag.equalOps @@ -157,10 +159,10 @@ function LinearOperatorCollection.normalOperator(diag::DiagOp, W=opEye(eltype(di # we promote the weights to be of the same type as T, which will be required # when creating the temporary vector in normalOperator in a later stage - opInner = normalOperator(diag.ops[1], WeightingOp(T; weights=T.(weights[diag.yIdx[1]:diag.yIdx[2]-1].^2))) - op = DiagNormalOp([copy(opInner) for i=1:length(diag.ops)], size(diag,2), diag.xIdx, S(zeros(T, diag.ncol)) ) + opInner = normalOperator(diag.ops[1], WeightingOp(T; weights=T.(weights[diag.yIdx[1]:diag.yIdx[2]-1].^2)), copyOpsFn = copyOpsFn) + op = DiagNormalOp([copyOpsFn(opInner) for i=1:length(diag.ops)], size(diag,2), diag.xIdx, S(zeros(T, diag.ncol)) ) else - op = DiagNormalOp([normalOperator(diag.ops[i], WeightingOp(T; weights=T.(weights[diag.yIdx[i]:diag.yIdx[i+1]-1].^2))) + op = DiagNormalOp([normalOperator(diag.ops[i], WeightingOp(T; weights=T.(weights[diag.yIdx[i]:diag.yIdx[i+1]-1].^2)), copyOpsFn = copyOpsFn) for i in 1:length(diag.ops)], size(diag,2), diag.xIdx, S(zeros(T, diag.ncol)) ) end diff --git a/src/ProdOp.jl b/src/ProdOp.jl index d7f9d08..3ba9b9c 100644 --- a/src/ProdOp.jl +++ b/src/ProdOp.jl @@ -120,8 +120,8 @@ end # In this case we are converting the left argument into a # weighting matrix, that is passed to normalOperator # TODO Port vom MRIOperators drops given weighting matrix, I just left it out for now -normalOperator(S::ProdOp{T, <:WeightingOp, matT}) where {T, matT} = normalOperator(S.B, S.A) -function normalOperator(S::ProdOp, W=opEye(eltype(S),size(S,1), S = storage_type(S))) +normalOperator(S::ProdOp{T, <:WeightingOp, matT}; kwargs...) where {T, matT} = normalOperator(S.B, S.A) +function normalOperator(S::ProdOp, W=opEye(eltype(S),size(S,1), S = storage_type(S)); kwargs...) arrayType = storage_type(S) tmp = arrayType(undef, size(S.A, 2)) return ProdNormalOp(S.B, normalOperator(S.A, W), tmp) From c261108bd30a4650ab5653b18f7c7ee71cd09c4f Mon Sep 17 00:00:00 2001 From: nHackel Date: Wed, 24 Apr 2024 13:46:56 +0200 Subject: [PATCH 21/47] Pass along normalOperator kwargs (for FFT flags) --- src/DiagOp.jl | 4 ++-- src/ProdOp.jl | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/DiagOp.jl b/src/DiagOp.jl index b2f7bda..ff19d26 100644 --- a/src/DiagOp.jl +++ b/src/DiagOp.jl @@ -159,10 +159,10 @@ function LinearOperatorCollection.normalOperator(diag::DiagOp, W=opEye(eltype(di # we promote the weights to be of the same type as T, which will be required # when creating the temporary vector in normalOperator in a later stage - opInner = normalOperator(diag.ops[1], WeightingOp(T; weights=T.(weights[diag.yIdx[1]:diag.yIdx[2]-1].^2)), copyOpsFn = copyOpsFn) + opInner = normalOperator(diag.ops[1], WeightingOp(T; weights=T.(weights[diag.yIdx[1]:diag.yIdx[2]-1].^2)); copyOpsFn = copyOpsFn, kwargs...) op = DiagNormalOp([copyOpsFn(opInner) for i=1:length(diag.ops)], size(diag,2), diag.xIdx, S(zeros(T, diag.ncol)) ) else - op = DiagNormalOp([normalOperator(diag.ops[i], WeightingOp(T; weights=T.(weights[diag.yIdx[i]:diag.yIdx[i+1]-1].^2)), copyOpsFn = copyOpsFn) + op = DiagNormalOp([normalOperator(diag.ops[i], WeightingOp(T; weights=T.(weights[diag.yIdx[i]:diag.yIdx[i+1]-1].^2)); copyOpsFn = copyOpsFn, kwargs...) 
for i in 1:length(diag.ops)], size(diag,2), diag.xIdx, S(zeros(T, diag.ncol)) ) end diff --git a/src/ProdOp.jl b/src/ProdOp.jl index 3ba9b9c..efacd69 100644 --- a/src/ProdOp.jl +++ b/src/ProdOp.jl @@ -120,11 +120,11 @@ end # In this case we are converting the left argument into a # weighting matrix, that is passed to normalOperator # TODO Port vom MRIOperators drops given weighting matrix, I just left it out for now -normalOperator(S::ProdOp{T, <:WeightingOp, matT}; kwargs...) where {T, matT} = normalOperator(S.B, S.A) +normalOperator(S::ProdOp{T, <:WeightingOp, matT}; kwargs...) where {T, matT} = normalOperator(S.B, S.A; kwargs...) function normalOperator(S::ProdOp, W=opEye(eltype(S),size(S,1), S = storage_type(S)); kwargs...) arrayType = storage_type(S) tmp = arrayType(undef, size(S.A, 2)) - return ProdNormalOp(S.B, normalOperator(S.A, W), tmp) + return ProdNormalOp(S.B, normalOperator(S.A, W; kwargs...), tmp) end function Base.copy(S::ProdNormalOp) From b2489a3acd67643417f55d9272148dbebd31a8af Mon Sep 17 00:00:00 2001 From: nHackel Date: Wed, 24 Apr 2024 14:58:51 +0200 Subject: [PATCH 22/47] Reduce allocation in DiagOp normalOperator --- src/DiagOp.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/DiagOp.jl b/src/DiagOp.jl index ff19d26..c61f61b 100644 --- a/src/DiagOp.jl +++ b/src/DiagOp.jl @@ -160,10 +160,10 @@ function LinearOperatorCollection.normalOperator(diag::DiagOp, W=opEye(eltype(di # we promote the weights to be of the same type as T, which will be required # when creating the temporary vector in normalOperator in a later stage opInner = normalOperator(diag.ops[1], WeightingOp(T; weights=T.(weights[diag.yIdx[1]:diag.yIdx[2]-1].^2)); copyOpsFn = copyOpsFn, kwargs...) - op = DiagNormalOp([copyOpsFn(opInner) for i=1:length(diag.ops)], size(diag,2), diag.xIdx, S(zeros(T, diag.ncol)) ) + op = DiagNormalOp([copyOpsFn(opInner) for i=1:length(diag.ops)], size(diag,2), diag.xIdx, S(undef, diag.ncol) ) else op = DiagNormalOp([normalOperator(diag.ops[i], WeightingOp(T; weights=T.(weights[diag.yIdx[i]:diag.yIdx[i+1]-1].^2)); copyOpsFn = copyOpsFn, kwargs...) - for i in 1:length(diag.ops)], size(diag,2), diag.xIdx, S(zeros(T, diag.ncol)) ) + for i in 1:length(diag.ops)], size(diag,2), diag.xIdx, S(undef, diag.ncol) ) end return op From af02185d1ce94c8a7b4db15a62c4dfd290a9fff5 Mon Sep 17 00:00:00 2001 From: nHackel Date: Thu, 25 Apr 2024 11:27:05 +0200 Subject: [PATCH 23/47] Fix tmp array construction for FFTOp on UnionAll storage vectors --- ext/LinearOperatorFFTWExt/FFTOp.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ext/LinearOperatorFFTWExt/FFTOp.jl b/ext/LinearOperatorFFTWExt/FFTOp.jl index ef56bf1..ae05b6b 100644 --- a/ext/LinearOperatorFFTWExt/FFTOp.jl +++ b/ext/LinearOperatorFFTWExt/FFTOp.jl @@ -39,7 +39,7 @@ returns an operator which performs an FFT on Arrays of type T """ function LinearOperatorCollection.FFTOp(T::Type; shape::NTuple{D,Int64}, shift::Bool=true, unitary::Bool=true, S = Array{Complex{real(T)}}, kwargs...) where D - tmpVec = S(undef, shape...) + tmpVec = similar(S(undef, 0), shape...) plan = plan_fft!(tmpVec; kwargs...) iplan = plan_bfft!(tmpVec; kwargs...) 
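The `S` keyword threaded through the patches above selects the storage type of the operator's internal buffers, which is what moves an operator onto the GPU; the fix in the preceding patch lets `S` be a UnionAll such as `Array{ComplexF32}`. A minimal usage sketch follows — the GPU line assumes CUDA.jl (or any GPUArrays backend) and is illustrative, not part of these patches:

    using LinearOperatorCollection, FFTW   # loading FFTW activates the LinearOperatorFFTWExt extension
    # using CUDA                           # hypothetical: any GPUArrays-backed array type should work

    shape = (32, 32)
    F = FFTOp(ComplexF32; shape = shape, S = Array{ComplexF32})       # CPU default storage
    # F = FFTOp(ComplexF32; shape = shape, S = CuArray{ComplexF32})   # GPU variant (illustrative)

    x  = rand(ComplexF32, prod(shape))
    y  = F * x               # shifted, unitary FFT by default
    xr = adjoint(F) * y      # adjoint transform recovers x up to rounding
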
From af27ffc82f336743f063d0a1b2cd7c471671356c Mon Sep 17 00:00:00 2001 From: nHackel Date: Thu, 25 Apr 2024 11:27:35 +0200 Subject: [PATCH 24/47] Fix normalOp constructor to use eltype of storage_type --- src/NormalOp.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/NormalOp.jl b/src/NormalOp.jl index 889eacf..e8c0ab1 100644 --- a/src/NormalOp.jl +++ b/src/NormalOp.jl @@ -62,6 +62,6 @@ function Base.copy(S::NormalOpImpl) return NormalOpImpl(copy(S.parent), S.weights, copy(S.tmp)) end -function normalOperator(parent, weights=opEye(eltype(parent), size(parent, 1), S= storage_type(parent)), kwargs...) - return NormalOp(eltype(parent); parent = parent, weights = weights) +function normalOperator(parent, weights=opEye(eltype(parent), size(parent, 1), S= storage_type(parent)); kwargs...) + return NormalOp(eltype(storage_type((parent))); parent = parent, weights = weights) end \ No newline at end of file From 671bbcdb152ff87d51d332f3a9eca98baa71ce4f Mon Sep 17 00:00:00 2001 From: nHackel Date: Thu, 25 Apr 2024 11:28:04 +0200 Subject: [PATCH 25/47] Improve WaveletOp between GPU and CPU --- ext/LinearOperatorWaveletExt/WaveletOp.jl | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/ext/LinearOperatorWaveletExt/WaveletOp.jl b/ext/LinearOperatorWaveletExt/WaveletOp.jl index 107542f..6e238db 100644 --- a/ext/LinearOperatorWaveletExt/WaveletOp.jl +++ b/ext/LinearOperatorWaveletExt/WaveletOp.jl @@ -22,14 +22,14 @@ end prodwt!(res::Vector{T}, x::Vector{T}, wt, tmp::Array{T, D}, tmpRes::Array{T, D}) where {T, D} = dwt!(reshape(res,size(tmpRes)), reshape(x,size(tmpRes)), wt) function prodwt!(res::vecT, x::vecT, wt, tmp::Array{T, D}, tmpRes::Array{T, D}) where {T, D, vecT <: AbstractArray{T}} - tmp[:] = x + copyto!(tmp, x) dwt!(tmpRes, tmp, wt) - res[:] = tmpRes + copyto!(res, tmpRes) end ctprodwt!(res::Vector{T}, x::Vector{T}, wt, tmp::Array{T, D}, tmpRes::Array{T, D}) where {T, D} = idwt!(reshape(res,size(tmpRes)), reshape(x,size(tmpRes)), wt) function ctprodwt!(res::vecT, x::vecT, wt, tmp::Array{T, D}, tmpRes::Array{T, D}) where {T, D, vecT <: AbstractArray{T}} - tmp[:] = x - idwt!(tmpsRes, tmp, wt) - res[:] = tmpRes + copyto!(tmp, x) + idwt!(tmpRes, tmp, wt) + copyto!(res, tmpRes) end \ No newline at end of file From 2b1331dbf3b98a70536cb71beda882da09186bd1 Mon Sep 17 00:00:00 2001 From: nHackel Date: Thu, 25 Apr 2024 11:28:21 +0200 Subject: [PATCH 26/47] Fix missing ; in SamplingOp --- src/SamplingOp.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/SamplingOp.jl b/src/SamplingOp.jl index 243f667..165ce94 100644 --- a/src/SamplingOp.jl +++ b/src/SamplingOp.jl @@ -30,7 +30,7 @@ indicated by pattern. """ function SamplingOpImpl(T::Type{<:Number}, pattern::AbstractArray{Int}, shape::Tuple; S = Vector{T}) ndims(pattern)>1 ? 
idx = vectorizePattern(pattern, shape) : idx = pattern - return opRestriction(idx, prod(shape), S = S) + return opRestriction(idx, prod(shape); S = S) end function SamplingOpImpl(T::Type{<:Number}, pattern::AbstractArray{Bool}; S = Vector{T}) From 6dfdd5be62c426179f3fc5f76889998be86f622b Mon Sep 17 00:00:00 2001 From: nHackel Date: Tue, 30 Apr 2024 10:42:14 +0200 Subject: [PATCH 27/47] Add RadonOp based on RadonKA --- Project.toml | 5 +- .../LinearOperatorRadonKAExt.jl | 7 +++ ext/LinearOperatorRadonKAExt/RadonOp.jl | 46 +++++++++++++++++++ src/LinearOperatorCollection.jl | 3 +- 4 files changed, 59 insertions(+), 2 deletions(-) create mode 100644 ext/LinearOperatorRadonKAExt/LinearOperatorRadonKAExt.jl create mode 100644 ext/LinearOperatorRadonKAExt/RadonOp.jl diff --git a/Project.toml b/Project.toml index b125b5b..1036449 100644 --- a/Project.toml +++ b/Project.toml @@ -19,6 +19,7 @@ julia = "1.9" GPUArrays = "8, 9, 10" NFFT = "0.13" LinearOperators = "2.3.3" +RadonKA = "0.6" Wavelets = "0.9, 0.10" Reexport = "1.0" FFTW = "1.0" @@ -28,6 +29,7 @@ GPUArrays = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7" NFFT = "efe261a4-0d2b-5849-be55-fc731d526b0d" Wavelets = "29a6e085-ba6d-5f35-a997-948ac2efa89a" FFTW = "7a1cc6ca-52ef-59f5-83cd-3a7055c09341" +RadonKA = "86de8297-835b-47df-b249-c04e8db91db5" [targets] test = ["Test", "FFTW", "Wavelets", "NFFT"] @@ -36,4 +38,5 @@ test = ["Test", "FFTW", "Wavelets", "NFFT"] LinearOperatorNFFTExt = ["NFFT", "FFTW"] LinearOperatorFFTWExt = "FFTW" LinearOperatorWaveletExt = "Wavelets" -LinearOperatorGPUArraysExt = "GPUArrays" \ No newline at end of file +LinearOperatorGPUArraysExt = "GPUArrays" +LinearOperatorRadonKAExt = "RadonKA" \ No newline at end of file diff --git a/ext/LinearOperatorRadonKAExt/LinearOperatorRadonKAExt.jl b/ext/LinearOperatorRadonKAExt/LinearOperatorRadonKAExt.jl new file mode 100644 index 0000000..29caf35 --- /dev/null +++ b/ext/LinearOperatorRadonKAExt/LinearOperatorRadonKAExt.jl @@ -0,0 +1,7 @@ +module LinearOperatorRadonKAExt + +using LinearOperatorCollection, RadonKA + +include("RadonOp.jl") + +end \ No newline at end of file diff --git a/ext/LinearOperatorRadonKAExt/RadonOp.jl b/ext/LinearOperatorRadonKAExt/RadonOp.jl new file mode 100644 index 0000000..e4d6132 --- /dev/null +++ b/ext/LinearOperatorRadonKAExt/RadonOp.jl @@ -0,0 +1,46 @@ +function LinearOperatorCollection.RadonOp(::Type{T}; shape::NTuple{N, Int}, angles, + geometry = RadonParallelCircle(shape[1], -(shape[1]-1)÷2:(shape[1]-1)÷2), μ = nothing, S = Vector{T}) where {T, N} + return RadonOpImpl(T; shape, angles, geometry, μ, S) +end + +mutable struct RadonOpImpl{T, vecT <: AbstractVector{T}, vecT2, G, A} <: RadonOp{T} + nrow :: Int + ncol :: Int + symmetric :: Bool + hermitian :: Bool + prod! :: Function + tprod! :: Nothing + ctprod! :: Function + nprod :: Int + ntprod :: Int + nctprod :: Int + args5 :: Bool + use_prod5! :: Bool + allocated5 :: Bool + Mv5 :: vecT + Mtu5 :: vecT + angles :: vecT2 + geometry :: G + μ :: A +end + +LinearOperators.storage_type(op::RadonOpImpl) = typeof(op.Mv5) + +function RadonOpImpl(T::Type; shape::NTuple{N, Int64}, angles, geometry, μ, S) where N + N_sinogram = length(geometry.in_height) + N_angles = length(angles) + d = length(shape) == 3 ? 
shape[3] : 1 + nrow = N_sinogram * N_angles * d + ncol = prod(shape) + return RadonOpImpl(nrow, ncol, false, false, + (res, x) -> prod_radon!(res, x, shape, angles, geometry, μ), + nothing, + (res, x) -> ctprod_radon!(res, x, (N_sinogram, N_angles, d), angles, geometry, μ), + 0, 0, 0, true, false, true, S(undef, 0), S(undef, 0), angles, geometry, μ) +end + +prod_radon!(res::vecT, x::vecT, shape, angles::vecT2, geometry::G, μ::A) where {vecT, vecT2, G, A} = copyto!(res, radon(reshape(x, shape), angles; geometry, μ)) +prod_radon!(res::vecT, x::vecT, shape, angles::vecT2, ::Nothing, μ::A) where {vecT, vecT2, A} = copyto!(res, radon(reshape(x, shape), angles; μ)) + +ctprod_radon!(res::vecT, x::vecT, shape, angles::vecT2, geometry::G, μ::A) where {vecT, vecT2, G, A} = copyto!(res, RadonKA.backproject(reshape(x, shape), angles; geometry, μ)) +ctprod_radon!(res::vecT, x::vecT, shape, angles::vecT2, ::Nothing, μ::A) where {vecT, vecT2, A} = copyto!(res, RadonKA.backproject(reshape(x, shape), angles; μ)) diff --git a/src/LinearOperatorCollection.jl b/src/LinearOperatorCollection.jl index 8487977..9846da5 100644 --- a/src/LinearOperatorCollection.jl +++ b/src/LinearOperatorCollection.jl @@ -30,7 +30,7 @@ end export linearOperatorList, createLinearOperator export AbstractLinearOperatorFromCollection, WaveletOp, FFTOp, DCTOp, DSTOp, NFFTOp, - SamplingOp, NormalOp, WeightingOp, GradientOp + SamplingOp, NormalOp, WeightingOp, GradientOp, RadonOp abstract type AbstractLinearOperatorFromCollection{T} <: AbstractLinearOperator{T} end abstract type WaveletOp{T} <: AbstractLinearOperatorFromCollection{T} end @@ -41,6 +41,7 @@ abstract type NFFTOp{T} <: AbstractLinearOperatorFromCollection{T} end abstract type SamplingOp{T} <: AbstractLinearOperatorFromCollection{T} end abstract type NormalOp{T} <: AbstractLinearOperatorFromCollection{T} end abstract type GradientOp{T} <: AbstractLinearOperatorFromCollection{T} end +abstract type RadonOp{T} <: AbstractLinearOperatorFromCollection{T} end """ From 19d74d5d1e9160d3a28cfab4d93116812b4ab8c4 Mon Sep 17 00:00:00 2001 From: nHackel Date: Tue, 28 May 2024 14:31:51 +0200 Subject: [PATCH 28/47] Fix eltype of SamplingOp --- src/SamplingOp.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/SamplingOp.jl b/src/SamplingOp.jl index 165ce94..f8a74f6 100644 --- a/src/SamplingOp.jl +++ b/src/SamplingOp.jl @@ -30,7 +30,7 @@ indicated by pattern. """ function SamplingOpImpl(T::Type{<:Number}, pattern::AbstractArray{Int}, shape::Tuple; S = Vector{T}) ndims(pattern)>1 ? 
idx = vectorizePattern(pattern, shape) : idx = pattern - return opRestriction(idx, prod(shape); S = S) + return opEye(T,length(idx), S = S)*opRestriction(idx, prod(shape); S = S) end function SamplingOpImpl(T::Type{<:Number}, pattern::AbstractArray{Bool}; S = Vector{T}) From 8811386fd276ed2ff0a342ae189d080710061b02 Mon Sep 17 00:00:00 2001 From: nHackel Date: Mon, 3 Jun 2024 18:29:26 +0200 Subject: [PATCH 29/47] Init updating tests --- Project.toml | 4 +- test/runtests.jl | 9 +- test/testOperators.jl | 195 ++++++++++++++++++++++++------------------ 3 files changed, 121 insertions(+), 87 deletions(-) diff --git a/Project.toml b/Project.toml index 1036449..a21b8df 100644 --- a/Project.toml +++ b/Project.toml @@ -13,10 +13,12 @@ Reexport = "189a3867-3050-52da-a836-e630ba90ab69" [extras] Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" +JLArrays = "27aeb0d3-9eb9-45fb-866b-73c2ecf80fcb" [compat] julia = "1.9" GPUArrays = "8, 9, 10" +JLArrays = "0.1" NFFT = "0.13" LinearOperators = "2.3.3" RadonKA = "0.6" @@ -32,7 +34,7 @@ FFTW = "7a1cc6ca-52ef-59f5-83cd-3a7055c09341" RadonKA = "86de8297-835b-47df-b249-c04e8db91db5" [targets] -test = ["Test", "FFTW", "Wavelets", "NFFT"] +test = ["Test", "FFTW", "Wavelets", "NFFT", "JLArrays"] [extensions] LinearOperatorNFFTExt = ["NFFT", "FFTW"] diff --git a/test/runtests.jl b/test/runtests.jl index fc5842f..9e15300 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -5,6 +5,11 @@ using LinearAlgebra using FFTW using Wavelets using NFFT +using JLArrays -include("testNormalOp.jl") -include("testOperators.jl") +arrayTypes = [Array] + +@testset "LinearOperatorCollection" begin + include("testNormalOp.jl") + include("testOperators.jl") +end diff --git a/test/testOperators.jl b/test/testOperators.jl index ccb1ecc..ee80d6f 100644 --- a/test/testOperators.jl +++ b/test/testOperators.jl @@ -1,4 +1,4 @@ -function testDCT1d(N=32) +function testDCT1d(N=32;arrayType = Array) Random.seed!(1235) x = zeros(ComplexF64, N^2) for i=1:5 @@ -24,18 +24,20 @@ function testDCT1d(N=32) x2 = adjoint(D3)*y3 @test norm(x2 - x) / norm(x) ≈ 0 atol=0.01 + true end -function testFFT1d(N=32,shift=true) +function testFFT1d(N=32,shift=true;arrayType = Array) Random.seed!(1234) x = zeros(N^2) for i=1:5 x .+= rand()*cos.(rand(1:N^2)*collect(1:N^2)) end - D1 = FFTOp(ComplexF64, shape=(N^2,), shift=shift) + xop = arrayType(x) + D1 = FFTOp(ComplexF64, shape=(N^2,), shift=shift, S = typeof(ComplexF64.(xop))) D2 = 1.0/N*[exp(-2*pi*im*j*k/N^2) for j=0:N^2-1,k=0:N^2-1] - y1 = D1*x + y1 = Array(D1*xop) if shift y2 = fftshift(D2*fftshift(x)) else @@ -43,27 +45,29 @@ function testFFT1d(N=32,shift=true) end @test norm(y1 - y2) / norm(y1) ≈ 0 atol=0.01 - x1 = adjoint(D1) * y1 + x1 = Array(adjoint(D1) * arrayType(y1)) if shift x2 = ifftshift(adjoint(D2)*ifftshift(y2)) else x2 = adjoint(D2)*y2 end @test norm(x1 - x2) / norm(x1) ≈ 0 atol=0.01 + true end -function testFFT2d(N=32,shift=true) +function testFFT2d(N=32,shift=true;arrayType = Array) Random.seed!(1234) x = zeros(N^2) for i=1:5 x .+= rand()*cos.(rand(1:N^2)*collect(1:N^2)) end - D1 = FFTOp(ComplexF64, shape=(N,N), shift=shift) + xop = arrayType(x) + D1 = FFTOp(ComplexF64, shape=(N,N), shift=shift, S = typeof(ComplexF64.(xop))) idx = CartesianIndices((N,N))[collect(1:N^2)] D2 = 1.0/N*[ exp(-2*pi*im*((idx[j][1]-1)*(idx[k][1]-1)+(idx[j][2]-1)*(idx[k][2]-1))/N) for j=1:N^2, k=1:N^2 ] - y1 = D1*x + y1 = Array(D1*xop) if shift y2 = D2*vec(fftshift(reshape(x,N,N))) y2 = vec(fftshift(reshape(y2,N,N))) @@ -72,7 +76,7 @@ function testFFT2d(N=32,shift=true) end 
@test norm(y1 - y2) / norm(y1) ≈ 0 atol=0.01 - x1 = adjoint(D1) * y1 + x1 = Array(adjoint(D1) * arrayType(y1)) if shift x2 = adjoint(D2)*vec(ifftshift(reshape(y2,N,N))) x2 = vec(ifftshift(reshape(x2,N,N))) @@ -80,67 +84,74 @@ function testFFT2d(N=32,shift=true) x2 = adjoint(D2)*y2 end @test norm(x1 - x2) / norm(x1) ≈ 0 atol=0.01 + true end -function testWeighting(N=512) +function testWeighting(N=512;arrayType = Array) Random.seed!(1234) x1 = rand(N) weights = rand(N) - W = WeightingOp(weights) - y1 = W*x1 + W = WeightingOp(arrayType(weights)) + y1 = W*arrayType(x1) y = weights .* x1 - @test norm(y1 - y) / norm(y) ≈ 0 atol=0.01 + @test norm(Array(y1) - y) / norm(y) ≈ 0 atol=0.01 x2 = rand(2*N) - W2 = WeightingOp(weights, 2) - y2 = W2*x2 + W2 = WeightingOp(arrayType(weights), 2) + y2 = W2*arrayType(x2) y = repeat(weights,2) .* x2 - @test norm(y2 - y) / norm(y) ≈ 0 atol=0.01 + @test norm(Array(y2) - y) / norm(y) ≈ 0 atol=0.01 + true end -function testGradOp1d(N=512) +function testGradOp1d(N=512;arrayType = Array) x = rand(N) - G = GradientOp(eltype(x); shape=size(x)) + xop = arrayType(x) + G = GradientOp(eltype(x); shape=size(x), S = typeof(xop)) G0 = Bidiagonal(ones(N),-ones(N-1), :U)[1:N-1,:] - y = G*x + y = Array(G*xop) y0 = G0*x @test norm(y - y0) / norm(y0) ≈ 0 atol=0.001 - xr = transpose(G)*y + xr = Array(transpose(G)*arrayType(y)) xr0 = transpose(G0)*y0 @test norm(xr - xr0) / norm(xr0) ≈ 0 atol=0.001 + true end -function testGradOp2d(N=64) +function testGradOp2d(N=64;arrayType = Array) x = repeat(1:N,1,N) - G = GradientOp(eltype(x); shape=size(x)) + xop = arrayType(vec(x)) + G = GradientOp(eltype(x); shape=size(x), S = typeof(xop)) G_1d = Bidiagonal(ones(N),-ones(N-1), :U)[1:N-1,:] - y = G*vec(x) + y = Array(G*xop) y0 = vcat( vec(G_1d*x), vec(x*transpose(G_1d)) ) @test norm(y - y0) / norm(y0) ≈ 0 atol=0.001 - xr = transpose(G)*y + xr = Array(transpose(G)*arrayType(y)) y0_x = reshape(y0[1:N*(N-1)],N-1,N) y0_y = reshape(y0[N*(N-1)+1:end],N,N-1) xr0 = transpose(G_1d)*y0_x + y0_y*G_1d xr0 = vec(xr0) @test norm(xr - xr0) / norm(xr0) ≈ 0 atol=0.001 + true end -function testDirectionalGradOp(N=64) +function testDirectionalGradOp(N=64;arrayType = Array) x = rand(ComplexF64,N,N) - G1 = GradientOp(eltype(x); shape=size(x), dims=1) - G2 = GradientOp(eltype(x); shape=size(x), dims=2) + xop = arrayType(vec(x)) + G1 = GradientOp(eltype(x); shape=size(x), dims=1, S = typeof(xop)) + G2 = GradientOp(eltype(x); shape=size(x), dims=2, S = typeof(xop)) G_1d = Bidiagonal(ones(N),-ones(N-1), :U)[1:N-1,:] - y1 = G1*vec(x) - y2 = G2*vec(x) + y1 = Array(G1*x) + y2 = Array(G2*x) y1_ref = zeros(ComplexF64, N-1,N) y2_ref = zeros(ComplexF64, N, N-1) for i=1:N @@ -151,8 +162,8 @@ function testDirectionalGradOp(N=64) @test norm(y1-vec(y1_ref)) / norm(y1_ref) ≈ 0 atol=0.001 @test norm(y2-vec(y2_ref)) / norm(y2_ref) ≈ 0 atol=0.001 - x1r = transpose(G1)*y1 - x2r = transpose(G2)*y2 + x1r = Array(transpose(G1)*arrayType(y1)) + x2r = Array(transpose(G2)*arrayType(y2)) x1r_ref = zeros(ComplexF64, N,N) x2r_ref = zeros(ComplexF64, N,N) @@ -162,19 +173,21 @@ function testDirectionalGradOp(N=64) end @test norm(x1r-vec(x1r_ref)) / norm(x1r_ref) ≈ 0 atol=0.001 @test norm(x2r-vec(x2r_ref)) / norm(x2r_ref) ≈ 0 atol=0.001 + true end -function testSampling(N=64) +function testSampling(N=64;arrayType = Array) x = rand(ComplexF64,N,N) + xop = arrayType(vec(x)) # index-based sampling idx = shuffle(collect(1:N^2)[1:N*div(N,2)]) - SOp = SamplingOp(ComplexF64, pattern=idx, shape=(N,N)) - y = SOp*vec(x) - x2 = adjoint(SOp)*y + SOp = 
SamplingOp(ComplexF64, pattern=idx, shape=(N,N), S = typeof(xop)) + y = Array(SOp*xop) + x2 = Array(adjoint(SOp)*y) # mask-based sampling msk = zeros(Bool,N*N);msk[idx].=true - SOp2 = SamplingOp(ComplexF64, pattern=msk) - y2 = ComplexF64.(SOp2*vec(x)) + SOp2 = SamplingOp(ComplexF64, pattern=msk, S = typeof(xop)) + y2 = ComplexF64.(Array(SOp2*xop)) # references y_ref = vec(x[idx]) x2_ref = zeros(ComplexF64,N^2) @@ -183,19 +196,23 @@ function testSampling(N=64) @test norm(y - y_ref) / norm(y_ref) ≈ 0 atol=0.000001 @test norm(x2 - x2_ref) / norm(x2_ref) ≈ 0 atol=0.000001 @test norm(y2 - x2_ref) / norm(x2_ref) ≈ 0 atol=0.000001 + true end -function testWavelet(M=64,N=60) +function testWavelet(M=64,N=60;arrayType = Array) x = rand(M,N) - WOp = WaveletOp(Float64, shape=(M,N)) - x_wavelet = WOp*vec(x) + xop = arrayType(vec(x)) + WOp = WaveletOp(Float64, shape=(M,N), S = typeof(xop)) + # TODO comparison against wavelet? + x_wavelet = Array(WOp*xop) x_reco = reshape( adjoint(WOp)*x_wavelet, M, N) @test norm(x_reco - x) / norm(x) ≈ 0 atol=0.001 + true end # test FourierOperators -function testNFFT2d(N=16) +function testNFFT2d(N=16;arrayType = Array) # random image x = zeros(ComplexF64,N,N) for i=1:N,j=1:N @@ -208,15 +225,16 @@ function testNFFT2d(N=16) F_adj = adjoint(F) # Operator + xop = arrayType(vec(x)) nodes = [(idx[d] - N÷2 - 1)./N for d=1:2, idx in vec(CartesianIndices((N,N)))] - F_nfft = NFFTOp(ComplexF64; shape=(N,N), nodes, symmetrize=false) + F_nfft = NFFTOp(ComplexF64; shape=(N,N), nodes, symmetrize=false, S = typeof(xop)) # test against FourierOperators y = vec( ifftshift(reshape(F*vec(fftshift(x)),N,N)) ) y_adj = vec( ifftshift(reshape(F_adj*vec(fftshift(x)),N,N)) ) - y_nfft = F_nfft * vec(x) - y_adj_nfft = adjoint(F_nfft) * vec(x) + y_nfft = Array(F_nfft * xop) + y_adj_nfft = Array(adjoint(F_nfft) * xop) @test y ≈ y_nfft rtol = 1e-2 @test y_adj ≈ y_adj_nfft rtol = 1e-2 @@ -224,31 +242,33 @@ function testNFFT2d(N=16) # test AHA w/o Toeplitz F_nfft.toeplitz = false AHA = normalOperator(F_nfft) - y_AHA_nfft = AHA * vec(x) - y_AHA = F' * F * vec(x) + y_AHA_nfft = Array(AHA * xop) + y_AHA = F' * F * xop @test y_AHA ≈ y_AHA_nfft rtol = 1e-2 # test AHA with Toeplitz F_nfft.toeplitz = true AHA = normalOperator(F_nfft) - y_AHA_nfft = AHA * vec(x) - y_AHA_nfft = adjoint(F_nfft) * F_nfft * vec(x) + y_AHA_nfft_1 = Array(AHA * xop) + y_AHA_nfft_2 = Array(adjoint(F_nfft) * F_nfft * xop) y_AHA = F' * F * vec(x) - @test y_AHA ≈ y_AHA_nfft rtol = 1e-2 + @test y_AHA_nfft_1 ≈ y_AHA_nfft_2 rtol = 1e-2 + @test y_AHA ≈ y_AHA_nfft_1 rtol = 1e-2 # test type stability; # TODO: Ensure type stability for Trajectory objects and test here nodes = Float32.(nodes) - F_nfft = NFFTOp(ComplexF32; shape=(N,N), nodes, symmetrize=false) + F_nfft = NFFTOp(ComplexF32; shape=(N,N), nodes, symmetrize=false, S = typeof(xop)) - y_nfft = F_nfft * vec(ComplexF32.(x)) - y_adj_nfft = adjoint(F_nfft) * vec(ComplexF32.(x)) + y_nfft = F_nfft * ComplexF32.(xop) + y_adj_nfft = adjoint(F_nfft) * ComplexF32.(xop) @test Complex{eltype(nodes)} === eltype(y_nfft) @test Complex{eltype(nodes)} === eltype(y_adj_nfft) + true end -function testNFFT3d(N=12) +function testNFFT3d(N=12;arrayType = Array) # random image x = zeros(ComplexF64,N,N,N) for i=1:N,j=1:N,k=1:N @@ -261,15 +281,16 @@ function testNFFT3d(N=12) F_adj = F' # Operator + xop = arrayType(vec(x)) nodes = [(idx[d] - N÷2 - 1)./N for d=1:3, idx in vec(CartesianIndices((N,N,N)))] - F_nfft = NFFTOp(ComplexF64; shape=(N,N,N), nodes=nodes, symmetrize=false) + F_nfft = NFFTOp(ComplexF64; 
shape=(N,N,N), nodes=nodes, symmetrize=false, S = typeof(xop)) # test agains FourierOperators y = vec( ifftshift(reshape(F*vec(fftshift(x)),N,N,N)) ) y_adj = vec( ifftshift(reshape(F_adj*vec(fftshift(x)),N,N,N)) ) - y_nfft = F_nfft*vec(x) - y_adj_nfft = adjoint(F_nfft) * vec(x) + y_nfft = Array(F_nfft*xop) + y_adj_nfft = Array(adjoint(F_nfft) * xop) @test y ≈ y_nfft rtol = 1e-2 @test y_adj ≈ y_adj_nfft rtol = 1e-2 @@ -277,43 +298,49 @@ function testNFFT3d(N=12) # test AHA w/o Toeplitz F_nfft.toeplitz = false AHA = normalOperator(F_nfft) - y_AHA_nfft = AHA * vec(x) + y_AHA_nfft = Array(AHA * xop) y_AHA = F' * F * vec(x) @test y_AHA ≈ y_AHA_nfft rtol = 1e-2 # test AHA with Toeplitz F_nfft.toeplitz = true AHA = normalOperator(F_nfft) - y_AHA_nfft = AHA * vec(x) - y_AHA_nfft = adjoint(F_nfft) * F_nfft * vec(x) + y_AHA_nfft_1 = Array(AHA * xop) + y_AHA_nfft_2 = Array(adjoint(F_nfft) * F_nfft * xop) y_AHA = F' * F * vec(x) - @test y_AHA ≈ y_AHA_nfft rtol = 1e-2 + @test y_AHA_nfft_1 ≈ y_AHA_nfft_2 rtol = 1e-2 + @test y_AHA ≈ y_AHA_nfft_1 rtol = 1e-2 + true end +# TODO RadonOp + @testset "Linear Operators" begin - @info "test DCT-II and DCT-IV Ops" - for N in [2,8,16,32] - testDCT1d(N) - end - @info "test FFTOp" - for N in [8,16,32] - testFFT1d(N,false) - testFFT1d(N,true) - testFFT2d(N,false) - testFFT2d(N,true) + for arrayType in arrayTypes + @info "test DCT-II and DCT-IV Ops" + for N in [2,8,16,32] + @test testDCT1d(N;arrayType) skip = !isa(arrayType, Array) + end + @info "test FFTOp" + for N in [8,16,32] + @test testFFT1d(N,false;arrayType) + @test testFFT1d(N,true;arrayType) + @test testFFT2d(N,false;arrayType) + @test testFFT2d(N,true;arrayType) + end + @info "test WeightingOp" + @test testWeighting(512;arrayType) + @info "test GradientOp" + @test testGradOp1d(512;arrayType) + @test testGradOp2d(64;arrayType) + @test testDirectionalGradOp(64;arrayType) + @info "test SamplingOp" + @test testSampling(64;arrayType) + @info "test WaveletOp" + @test testWavelet(64,64;arrayType) + @test testWavelet(64,60;arrayType) + @info "test NFFTOp" + @test testNFFT2d(;arrayType) skip = arrayType == JLArray + @test testNFFT3d(;arrayType) skip = arrayType == JLArray end - @info "test WeightingOp" - testWeighting(512) - @info "test GradientOp" - testGradOp1d(512) - testGradOp2d(64) - testDirectionalGradOp(64) - @info "test SamplingOp" - testSampling(64) - @info "test WaveletOp" - testWavelet(64,64) - testWavelet(64,60) - @info "test NFFTOp" - testNFFT2d() - testNFFT3d() end From 7ae7ddb5b64349fa8b2f9a9bdbb936e5030412d7 Mon Sep 17 00:00:00 2001 From: nHackel Date: Tue, 4 Jun 2024 12:48:37 +0200 Subject: [PATCH 30/47] Fix bugs in FFTOp and SamplingOp --- ext/LinearOperatorFFTWExt/FFTOp.jl | 5 ++--- src/SamplingOp.jl | 5 +++-- test/runtests.jl | 2 +- test/testOperators.jl | 30 +++++++++++++++--------------- 4 files changed, 21 insertions(+), 21 deletions(-) diff --git a/ext/LinearOperatorFFTWExt/FFTOp.jl b/ext/LinearOperatorFFTWExt/FFTOp.jl index ae05b6b..c858351 100644 --- a/ext/LinearOperatorFFTWExt/FFTOp.jl +++ b/ext/LinearOperatorFFTWExt/FFTOp.jl @@ -64,9 +64,8 @@ function LinearOperatorCollection.FFTOp(T::Type; shape::NTuple{D,Int64}, shift:: end end -function fft_multiply!(res::AbstractVector{T}, plan::P, x::AbstractVector{Tr}, factor::T, tmpVec::AbstractArray{T,D}) where {T, Tr, P<:AbstractFFTs.Plan, D} - tmpVec[:] .= x - plan * tmpVec +function fft_multiply!(res::AbstractVector{T}, plan::P, x::AbstractVector{Tr}, ::NTuple{D}, factor::T, tmpVec::AbstractArray{T,D}) where {T, Tr, P<:AbstractFFTs.Plan, D} + 
plan * copyto!(tmpVec, x) res .= factor .* vec(tmpVec) end diff --git a/src/SamplingOp.jl b/src/SamplingOp.jl index f8a74f6..e67997f 100644 --- a/src/SamplingOp.jl +++ b/src/SamplingOp.jl @@ -34,8 +34,9 @@ function SamplingOpImpl(T::Type{<:Number}, pattern::AbstractArray{Int}, shape::T end function SamplingOpImpl(T::Type{<:Number}, pattern::AbstractArray{Bool}; S = Vector{T}) - - function prod!(res::Vector{U}, x::Vector{V}) where {U,V} + pattern = copyto!(similar(S(undef,0), Bool, size(pattern)...), pattern) + + function prod!(res::AbstractArray{U}, x::AbstractArray{V}) where {U,V} res .= pattern.*x end diff --git a/test/runtests.jl b/test/runtests.jl index 9e15300..c1b2287 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -7,7 +7,7 @@ using Wavelets using NFFT using JLArrays -arrayTypes = [Array] +arrayTypes = [Array, JLArray] @testset "LinearOperatorCollection" begin include("testNormalOp.jl") diff --git a/test/testOperators.jl b/test/testOperators.jl index ee80d6f..ee50007 100644 --- a/test/testOperators.jl +++ b/test/testOperators.jl @@ -150,8 +150,8 @@ function testDirectionalGradOp(N=64;arrayType = Array) G2 = GradientOp(eltype(x); shape=size(x), dims=2, S = typeof(xop)) G_1d = Bidiagonal(ones(N),-ones(N-1), :U)[1:N-1,:] - y1 = Array(G1*x) - y2 = Array(G2*x) + y1 = Array(G1*xop) + y2 = Array(G2*xop) y1_ref = zeros(ComplexF64, N-1,N) y2_ref = zeros(ComplexF64, N, N-1) for i=1:N @@ -183,7 +183,7 @@ function testSampling(N=64;arrayType = Array) idx = shuffle(collect(1:N^2)[1:N*div(N,2)]) SOp = SamplingOp(ComplexF64, pattern=idx, shape=(N,N), S = typeof(xop)) y = Array(SOp*xop) - x2 = Array(adjoint(SOp)*y) + x2 = Array(adjoint(SOp)*arrayType(y)) # mask-based sampling msk = zeros(Bool,N*N);msk[idx].=true SOp2 = SamplingOp(ComplexF64, pattern=msk, S = typeof(xop)) @@ -258,7 +258,7 @@ function testNFFT2d(N=16;arrayType = Array) # test type stability; # TODO: Ensure type stability for Trajectory objects and test here nodes = Float32.(nodes) - F_nfft = NFFTOp(ComplexF32; shape=(N,N), nodes, symmetrize=false, S = typeof(xop)) + F_nfft = NFFTOp(ComplexF32; shape=(N,N), nodes, symmetrize=false, S = typeof(ComplexF32.(xop))) y_nfft = F_nfft * ComplexF32.(xop) y_adj_nfft = adjoint(F_nfft) * ComplexF32.(xop) @@ -316,31 +316,31 @@ end # TODO RadonOp @testset "Linear Operators" begin - for arrayType in arrayTypes - @info "test DCT-II and DCT-IV Ops" + @testset for arrayType in arrayTypes + @info "test DCT-II and DCT-IV Ops: $arrayType" for N in [2,8,16,32] - @test testDCT1d(N;arrayType) skip = !isa(arrayType, Array) + @test testDCT1d(N;arrayType) skip = arrayType != Array # Not implemented for GPUs end - @info "test FFTOp" + @info "test FFTOp: $arrayType" for N in [8,16,32] @test testFFT1d(N,false;arrayType) @test testFFT1d(N,true;arrayType) @test testFFT2d(N,false;arrayType) @test testFFT2d(N,true;arrayType) end - @info "test WeightingOp" + @info "test WeightingOp: $arrayType" @test testWeighting(512;arrayType) - @info "test GradientOp" + @info "test GradientOp: $arrayType" @test testGradOp1d(512;arrayType) @test testGradOp2d(64;arrayType) @test testDirectionalGradOp(64;arrayType) - @info "test SamplingOp" + @info "test SamplingOp: $arrayType" @test testSampling(64;arrayType) - @info "test WaveletOp" + @info "test WaveletOp: $arrayType" @test testWavelet(64,64;arrayType) @test testWavelet(64,60;arrayType) - @info "test NFFTOp" - @test testNFFT2d(;arrayType) skip = arrayType == JLArray - @test testNFFT3d(;arrayType) skip = arrayType == JLArray + @info "test NFFTOp: $arrayType" + @test 
testNFFT2d(;arrayType) skip = arrayType == JLArray # JLArray does not have a NFFTPlan + @test testNFFT3d(;arrayType) skip = arrayType == JLArray # JLArray does not have a NFFTPlan end end From 82d41a37b522a1983db5b356d089076155dc2075 Mon Sep 17 00:00:00 2001 From: nHackel Date: Tue, 4 Jun 2024 18:44:47 +0200 Subject: [PATCH 31/47] Add tests and bugfixes for DiagOp --- src/DiagOp.jl | 51 +++++++------------ test/testOperators.jl | 116 +++++++++++++++++++++++++++++++++--------- 2 files changed, 110 insertions(+), 57 deletions(-) diff --git a/src/DiagOp.jl b/src/DiagOp.jl index c61f61b..5287d5a 100644 --- a/src/DiagOp.jl +++ b/src/DiagOp.jl @@ -28,71 +28,58 @@ LinearOperators.storage_type(op::DiagOp) = typeof(op.Mv5) """ - DiagOp(ops::AbstractLinearOperator...) - DiagOp(ops::Vector{AbstractLinearOperator}) - DiagOp(ops::NTuple{N,AbstractLinearOperator}) + DiagOp(ops...) + DiagOp(ops::Vector{...}) + DiagOp(ops::NTuple{N,...}) -create a bloc-diagonal operator out of the `LinearOperator`s contained in ops +create a bloc-diagonal operator out of the `LinearOperator`s or `Array`s contained in ops """ -DiagOp(ops::AbstractLinearOperator...) = DiagOp(ops) - function DiagOp(ops) nrow = 0 ncol = 0 S = LinearOperators.storage_type(first(ops)) for i = 1:length(ops) - nrow += ops[i].nrow - ncol += ops[i].ncol + nrow += size(ops[i], 1) + ncol += size(ops[i], 2) S = promote_type(S, LinearOperators.storage_type(ops[i])) end - xIdx = cumsum(vcat(1,[ops[i].ncol for i=1:length(ops)])) - yIdx = cumsum(vcat(1,[ops[i].nrow for i=1:length(ops)])) + xIdx = cumsum(vcat(1,[size(ops[i], 2) for i=1:length(ops)])) + yIdx = cumsum(vcat(1,[size(ops[i], 1) for i=1:length(ops)])) Op = DiagOp{eltype(first(ops)), S, typeof(ops)}( nrow, ncol, false, false, - (res,x) -> (diagOpProd(res,x,nrow,xIdx,yIdx,ops...)), - (res,y) -> (diagOpTProd(res,y,ncol,yIdx,xIdx,ops...)), - (res,y) -> (diagOpCTProd(res,y,ncol,yIdx,xIdx,ops...)), + (res,x) -> (diagOpProd(res,x,nrow,xIdx,yIdx,ops)), + (res,y) -> (diagOpTProd(res,y,ncol,yIdx,xIdx,ops)), + (res,y) -> (diagOpCTProd(res,y,ncol,yIdx,xIdx,ops)), 0, 0, 0, false, false, false, S(undef, 0), S(undef, 0), [ops...], false, xIdx, yIdx) return Op end +DiagOp(ops...) = DiagOp(collect(ops)) -function DiagOp(op::AbstractLinearOperator, N=1; copyOpsFn = copy) - nrow = N*op.nrow - ncol = N*op.ncol +function DiagOp(op::Union{AbstractLinearOperator{T}, AbstractArray{T}}, N::Int64=1; copyOpsFn = copy) where T <: Number ops = [copyOpsFn(op) for n=1:N] - S = LinearOperators.storage_type(first(ops)) - - xIdx = cumsum(vcat(1,[ops[i].ncol for i=1:length(ops)])) - yIdx = cumsum(vcat(1,[ops[i].nrow for i=1:length(ops)])) - - Op = DiagOp{eltype(op), S, typeof(ops)}( nrow, ncol, false, false, - (res,x) -> (diagOpProd(res,x,nrow,xIdx,yIdx,ops...)), - (res,y) -> (diagOpTProd(res,y,ncol,yIdx,xIdx,ops...)), - (res,y) -> (diagOpCTProd(res,y,ncol,yIdx,xIdx,ops...)), - 0, 0, 0, false, false, false, S(undef, 0), S(undef, 0), - ops, true, xIdx, yIdx ) - - return Op + op = DiagOp(ops) + op.equalOps = true + return op end -function diagOpProd(y::AbstractVector{T}, x::AbstractVector{T}, nrow::Int, xIdx, yIdx, ops :: AbstractLinearOperator...) where T +function diagOpProd(y::AbstractVector{T}, x::AbstractVector{T}, nrow::Int, xIdx, yIdx, ops) where T for i=1:length(ops) mul!(view(y,yIdx[i]:yIdx[i+1]-1), ops[i], view(x,xIdx[i]:xIdx[i+1]-1)) end return y end -function diagOpTProd(y::AbstractVector{T}, x::AbstractVector{T}, ncol::Int, xIdx, yIdx, ops :: AbstractLinearOperator...) 
where T +function diagOpTProd(y::AbstractVector{T}, x::AbstractVector{T}, ncol::Int, xIdx, yIdx, ops) where T for i=1:length(ops) mul!(view(y,yIdx[i]:yIdx[i+1]-1), transpose(ops[i]), view(x,xIdx[i]:xIdx[i+1]-1)) end return y end -function diagOpCTProd(y::AbstractVector{T}, x::AbstractVector{T}, ncol::Int, xIdx, yIdx, ops :: AbstractLinearOperator...) where T +function diagOpCTProd(y::AbstractVector{T}, x::AbstractVector{T}, ncol::Int, xIdx, yIdx, ops) where T for i=1:length(ops) mul!(view(y,yIdx[i]:yIdx[i+1]-1), adjoint(ops[i]), view(x,xIdx[i]:xIdx[i+1]-1)) end diff --git a/test/testOperators.jl b/test/testOperators.jl index ee50007..5f63c26 100644 --- a/test/testOperators.jl +++ b/test/testOperators.jl @@ -313,34 +313,100 @@ function testNFFT3d(N=12;arrayType = Array) true end +function testDiagOp(N=32,K=2;arrayType = Array) + x = arrayType(rand(ComplexF64, K*N)) + block = arrayType(rand(ComplexF64, N, N)) + + F = arrayType(zeros(ComplexF64, K*N, K*N)) + for k = 1:K + start = (k-1)*N + 1 + stop = k*N + F[start:stop,start:stop] = block + end + + blocks = [block for k = 1:K] + op1 = DiagOp(blocks) + op2 = DiagOp(blocks...) + op3 = DiagOp(block, K) + + # Operations + @testset "Diag Prod" begin + y = Array(F * x) + y1 = Array(op1 * x) + y2 = Array(op2 * x) + y3 = Array(op3 * x) + + @test y ≈ y1 rtol = 1e-2 + @test y1 ≈ y2 rtol = 1e-2 + @test y2 ≈ y3 rtol = 1e-2 + end + + @testset "Diag Transpose" begin + y = Array(transpose(F) * x) + y1 = Array(transpose(op1) * x) + y2 = Array(transpose(op2) * x) + y3 = Array(transpose(op3) * x) + + @test y ≈ y1 rtol = 1e-2 + @test y1 ≈ y2 rtol = 1e-2 + @test y2 ≈ y3 rtol = 1e-2 + end + + @testset "Diag Adjoint" begin + y = Array(adjoint(F) * x) + y1 = Array(adjoint(op1) * x) + y2 = Array(adjoint(op2) * x) + y3 = Array(adjoint(op3) * x) + + @test y ≈ y1 rtol = 1e-2 + @test y1 ≈ y2 rtol = 1e-2 + @test y2 ≈ y3 rtol = 1e-2 + end + + @testset "Diag Normal" begin + y = Array(adjoint(F) * F* x) + y1 = Array(normalOperator(op1) * x) + y2 = Array(normalOperator(op2) * x) + y3 = Array(normalOperator(op3) * x) + + @test y ≈ y1 rtol = 1e-2 + @test y1 ≈ y2 rtol = 1e-2 + @test y2 ≈ y3 rtol = 1e-2 + end + + true +end + # TODO RadonOp @testset "Linear Operators" begin @testset for arrayType in arrayTypes - @info "test DCT-II and DCT-IV Ops: $arrayType" - for N in [2,8,16,32] - @test testDCT1d(N;arrayType) skip = arrayType != Array # Not implemented for GPUs - end - @info "test FFTOp: $arrayType" - for N in [8,16,32] - @test testFFT1d(N,false;arrayType) - @test testFFT1d(N,true;arrayType) - @test testFFT2d(N,false;arrayType) - @test testFFT2d(N,true;arrayType) - end - @info "test WeightingOp: $arrayType" - @test testWeighting(512;arrayType) - @info "test GradientOp: $arrayType" - @test testGradOp1d(512;arrayType) - @test testGradOp2d(64;arrayType) - @test testDirectionalGradOp(64;arrayType) - @info "test SamplingOp: $arrayType" - @test testSampling(64;arrayType) - @info "test WaveletOp: $arrayType" - @test testWavelet(64,64;arrayType) - @test testWavelet(64,60;arrayType) - @info "test NFFTOp: $arrayType" - @test testNFFT2d(;arrayType) skip = arrayType == JLArray # JLArray does not have a NFFTPlan - @test testNFFT3d(;arrayType) skip = arrayType == JLArray # JLArray does not have a NFFTPlan + #@info "test DCT-II and DCT-IV Ops: $arrayType" + #for N in [2,8,16,32] + # @test testDCT1d(N;arrayType) skip = arrayType != Array # Not implemented for GPUs + #end + #@info "test FFTOp: $arrayType" + #for N in [8,16,32] + # @test testFFT1d(N,false;arrayType) + # @test 
testFFT1d(N,true;arrayType) + # @test testFFT2d(N,false;arrayType) + # @test testFFT2d(N,true;arrayType) + #end + #@info "test WeightingOp: $arrayType" + #@test testWeighting(512;arrayType) + #@info "test GradientOp: $arrayType" + #@test testGradOp1d(512;arrayType) + #@test testGradOp2d(64;arrayType) + #@test testDirectionalGradOp(64;arrayType) + #@info "test SamplingOp: $arrayType" + #@test testSampling(64;arrayType) + #@info "test WaveletOp: $arrayType" + #@test testWavelet(64,64;arrayType) + #@test testWavelet(64,60;arrayType) + #@info "test NFFTOp: $arrayType" + #@test testNFFT2d(;arrayType) skip = arrayType == JLArray # JLArray does not have a NFFTPlan + #@test testNFFT3d(;arrayType) skip = arrayType == JLArray # JLArray does not have a NFFTPlan + @info "test DiagOp: $arrayType" + @test testDiagOp(;arrayType) end end From 309a61aaa6813e59cdaddae24d86c2e7e6d49b06 Mon Sep 17 00:00:00 2001 From: nHackel Date: Tue, 4 Jun 2024 19:10:47 +0200 Subject: [PATCH 32/47] Add RadonOp test --- Project.toml | 2 +- test/runtests.jl | 1 + test/testOperators.jl | 72 +++++++++++++++++++++++++++---------------- 3 files changed, 47 insertions(+), 28 deletions(-) diff --git a/Project.toml b/Project.toml index a21b8df..64bc9e6 100644 --- a/Project.toml +++ b/Project.toml @@ -34,7 +34,7 @@ FFTW = "7a1cc6ca-52ef-59f5-83cd-3a7055c09341" RadonKA = "86de8297-835b-47df-b249-c04e8db91db5" [targets] -test = ["Test", "FFTW", "Wavelets", "NFFT", "JLArrays"] +test = ["Test", "FFTW", "Wavelets", "NFFT", "JLArrays", "RadonKA"] [extensions] LinearOperatorNFFTExt = ["NFFT", "FFTW"] diff --git a/test/runtests.jl b/test/runtests.jl index c1b2287..362e471 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -5,6 +5,7 @@ using LinearAlgebra using FFTW using Wavelets using NFFT +using RadonKA using JLArrays arrayTypes = [Array, JLArray] diff --git a/test/testOperators.jl b/test/testOperators.jl index 5f63c26..4228497 100644 --- a/test/testOperators.jl +++ b/test/testOperators.jl @@ -316,7 +316,7 @@ end function testDiagOp(N=32,K=2;arrayType = Array) x = arrayType(rand(ComplexF64, K*N)) block = arrayType(rand(ComplexF64, N, N)) - + F = arrayType(zeros(ComplexF64, K*N, K*N)) for k = 1:K start = (k-1)*N + 1 @@ -377,36 +377,54 @@ function testDiagOp(N=32,K=2;arrayType = Array) true end -# TODO RadonOp +function testRadonOp(N=32;arrayType = Array) + x = arrayType(rand(N, N)) + xop = vec(x) + geom = RadonParallelCircle(N, -(N-1)÷2:(N-1)÷2) + angles = collect(range(0, pi, 100)) + + op = RadonOp(eltype(x); shape = (N, N), angles, geometry = geom, S = typeof(xop)) + + y = Array(radon(x, angles; geometry = geom)) + y1 = reshape(Array(op * xop), size(y)...) + @test y ≈ y1 rtol = 1e-2 + + xtmp = Array(backproject(arrayType(y), angles; geometry = geom)) + x2 = reshape(Array(adjoint(op) * arrayType(vec(y1))), size(xtmp)...) 
+ @test xtmp ≈ x2 rtol = 1e-2 + true +end @testset "Linear Operators" begin @testset for arrayType in arrayTypes - #@info "test DCT-II and DCT-IV Ops: $arrayType" - #for N in [2,8,16,32] - # @test testDCT1d(N;arrayType) skip = arrayType != Array # Not implemented for GPUs - #end - #@info "test FFTOp: $arrayType" - #for N in [8,16,32] - # @test testFFT1d(N,false;arrayType) - # @test testFFT1d(N,true;arrayType) - # @test testFFT2d(N,false;arrayType) - # @test testFFT2d(N,true;arrayType) - #end - #@info "test WeightingOp: $arrayType" - #@test testWeighting(512;arrayType) - #@info "test GradientOp: $arrayType" - #@test testGradOp1d(512;arrayType) - #@test testGradOp2d(64;arrayType) - #@test testDirectionalGradOp(64;arrayType) - #@info "test SamplingOp: $arrayType" - #@test testSampling(64;arrayType) - #@info "test WaveletOp: $arrayType" - #@test testWavelet(64,64;arrayType) - #@test testWavelet(64,60;arrayType) - #@info "test NFFTOp: $arrayType" - #@test testNFFT2d(;arrayType) skip = arrayType == JLArray # JLArray does not have a NFFTPlan - #@test testNFFT3d(;arrayType) skip = arrayType == JLArray # JLArray does not have a NFFTPlan + @info "test DCT-II and DCT-IV Ops: $arrayType" + for N in [2,8,16,32] + @test testDCT1d(N;arrayType) skip = arrayType != Array # Not implemented for GPUs + end + @info "test FFTOp: $arrayType" + for N in [8,16,32] + @test testFFT1d(N,false;arrayType) + @test testFFT1d(N,true;arrayType) + @test testFFT2d(N,false;arrayType) + @test testFFT2d(N,true;arrayType) + end + @info "test WeightingOp: $arrayType" + @test testWeighting(512;arrayType) + @info "test GradientOp: $arrayType" + @test testGradOp1d(512;arrayType) + @test testGradOp2d(64;arrayType) + @test testDirectionalGradOp(64;arrayType) + @info "test SamplingOp: $arrayType" + @test testSampling(64;arrayType) + @info "test WaveletOp: $arrayType" + @test testWavelet(64,64;arrayType) + @test testWavelet(64,60;arrayType) + @info "test NFFTOp: $arrayType" + @test testNFFT2d(;arrayType) skip = arrayType == JLArray # JLArray does not have a NFFTPlan + @test testNFFT3d(;arrayType) skip = arrayType == JLArray # JLArray does not have a NFFTPlan @info "test DiagOp: $arrayType" @test testDiagOp(;arrayType) + @info "test RadonOp: $arrayType" + @test testRadonOp(;arrayType) skip = arrayType == JLArray # Stackoverflow for kernelabstraction end end From ef5bcd28c5f3a925310616ab1b3a49f6ffc795d6 Mon Sep 17 00:00:00 2001 From: nHackel Date: Wed, 5 Jun 2024 11:52:33 +0200 Subject: [PATCH 33/47] Add breakage workflow --- .github/workflows/Breakage.yml | 88 ++++++++++++++++++++++++++++++++++ 1 file changed, 88 insertions(+) create mode 100644 .github/workflows/Breakage.yml diff --git a/.github/workflows/Breakage.yml b/.github/workflows/Breakage.yml new file mode 100644 index 0000000..2a8b622 --- /dev/null +++ b/.github/workflows/Breakage.yml @@ -0,0 +1,88 @@ +name: Breakage +# Based on: https://github.com/JuliaSmoothOptimizers/LinearOperators.jl/blob/main/.github/workflows/Breakage.yml +on: + pull_request: + branches: + - master + +jobs: + break: + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + pkg: [ + "JuliaImageRecon/RegularizedLeastSquares.jl", + "MagneticResonanceImaging/MRIReco.jl" + ] + pkgversion: [latest, stable] + + steps: + - uses: actions/checkout@v2 + + # Install Julia + - uses: julia-actions/setup-julia@v2 + with: + version: 1 + arch: x64 + - uses: actions/cache@v1 + env: + cache-name: cache-artifacts + with: + path: ~/.julia/artifacts + key: ${{ runner.os }}-test-${{ env.cache-name }}-${{ 
hashFiles('**/Project.toml') }} + restore-keys: | + ${{ runner.os }}-test-${{ env.cache-name }}- + ${{ runner.os }}-test- + ${{ runner.os }}- + - uses: julia-actions/julia-buildpkg@v1 + + # Breakage test + - name: 'Breakage of ${{ matrix.pkg }}, ${{ matrix.pkgversion }} version' + env: + URL: ${{ matrix.pkg }} + VERSION: ${{ matrix.pkgversion }} + run: | + set -v + mkdir -p ./pr + echo "${{ github.event.number }}" > ./pr/NR + git clone https://github.com/$URL + export PKG=$(echo $URL | cut -f2 -d/) + cd $PKG + if [ $VERSION == "stable" ]; then + TAG=$(git tag -l "v*" --sort=-creatordate | head -n1) + if [ -z "$TAG" ]; then + TAG="no_tag" + else + git checkout $TAG + fi + else + TAG=$VERSION + fi + export TAG + julia -e 'using Pkg; + PKG, TAG, VERSION = ENV["PKG"], ENV["TAG"], ENV["VERSION"] + joburl = joinpath(ENV["GITHUB_SERVER_URL"], ENV["GITHUB_REPOSITORY"], "actions/runs", ENV["GITHUB_RUN_ID"]) + open("../pr/$PKG-$VERSION", "w") do io + try + TAG == "no_tag" && error("Not tag for $VERSION") + pkg"activate ."; + pkg"instantiate"; + pkg"dev ../"; + if TAG == "latest" + global TAG = chomp(read(`git rev-parse --short HEAD`, String)) + end + pkg"build"; + pkg"test"; + + print(io, "[![](https://img.shields.io/badge/$TAG-Pass-green)]($joburl)"); + catch e + @error e; + print(io, "[![](https://img.shields.io/badge/$TAG-Fail-red)]($joburl)"); + end; + end' + + - uses: actions/upload-artifact@v2 + with: + name: pr + path: pr/ \ No newline at end of file From 46130c1b9bdf75905cf5b71fd326cc1cc4ab6228 Mon Sep 17 00:00:00 2001 From: nHackel Date: Wed, 5 Jun 2024 11:53:40 +0200 Subject: [PATCH 34/47] Fix branch name --- .github/workflows/Breakage.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/Breakage.yml b/.github/workflows/Breakage.yml index 2a8b622..e87470b 100644 --- a/.github/workflows/Breakage.yml +++ b/.github/workflows/Breakage.yml @@ -3,7 +3,7 @@ name: Breakage on: pull_request: branches: - - master + - main jobs: break: From ed3a934364028963cb3e60b8e14d0bfe31be0d1a Mon Sep 17 00:00:00 2001 From: nHackel Date: Wed, 5 Jun 2024 12:02:37 +0200 Subject: [PATCH 35/47] Attempt to let breakage fail if tests fail --- .github/workflows/Breakage.yml | 32 +++++++++----------------------- 1 file changed, 9 insertions(+), 23 deletions(-) diff --git a/.github/workflows/Breakage.yml b/.github/workflows/Breakage.yml index e87470b..e9bc3b9 100644 --- a/.github/workflows/Breakage.yml +++ b/.github/workflows/Breakage.yml @@ -63,26 +63,12 @@ jobs: julia -e 'using Pkg; PKG, TAG, VERSION = ENV["PKG"], ENV["TAG"], ENV["VERSION"] joburl = joinpath(ENV["GITHUB_SERVER_URL"], ENV["GITHUB_REPOSITORY"], "actions/runs", ENV["GITHUB_RUN_ID"]) - open("../pr/$PKG-$VERSION", "w") do io - try - TAG == "no_tag" && error("Not tag for $VERSION") - pkg"activate ."; - pkg"instantiate"; - pkg"dev ../"; - if TAG == "latest" - global TAG = chomp(read(`git rev-parse --short HEAD`, String)) - end - pkg"build"; - pkg"test"; - - print(io, "[![](https://img.shields.io/badge/$TAG-Pass-green)]($joburl)"); - catch e - @error e; - print(io, "[![](https://img.shields.io/badge/$TAG-Fail-red)]($joburl)"); - end; - end' - - - uses: actions/upload-artifact@v2 - with: - name: pr - path: pr/ \ No newline at end of file + TAG == "no_tag" && error("Not tag for $VERSION") + pkg"activate ."; + pkg"instantiate"; + pkg"dev ../"; + if TAG == "latest" + global TAG = chomp(read(`git rev-parse --short HEAD`, String)) + end + pkg"build"; + pkg"test";' \ No newline at end of file From 
c278627ccf5c906b6296f20f3cf29a6fc7fa59bf Mon Sep 17 00:00:00 2001 From: nHackel Date: Wed, 5 Jun 2024 13:51:55 +0200 Subject: [PATCH 36/47] Improve @testset handling --- test/testOperators.jl | 45 +++++++++++++++---------------------------- 1 file changed, 16 insertions(+), 29 deletions(-) diff --git a/test/testOperators.jl b/test/testOperators.jl index 4228497..fb25c12 100644 --- a/test/testOperators.jl +++ b/test/testOperators.jl @@ -24,7 +24,6 @@ function testDCT1d(N=32;arrayType = Array) x2 = adjoint(D3)*y3 @test norm(x2 - x) / norm(x) ≈ 0 atol=0.01 - true end function testFFT1d(N=32,shift=true;arrayType = Array) @@ -52,7 +51,6 @@ function testFFT1d(N=32,shift=true;arrayType = Array) x2 = adjoint(D2)*y2 end @test norm(x1 - x2) / norm(x1) ≈ 0 atol=0.01 - true end function testFFT2d(N=32,shift=true;arrayType = Array) @@ -84,7 +82,6 @@ function testFFT2d(N=32,shift=true;arrayType = Array) x2 = adjoint(D2)*y2 end @test norm(x1 - x2) / norm(x1) ≈ 0 atol=0.01 - true end function testWeighting(N=512;arrayType = Array) @@ -103,7 +100,6 @@ function testWeighting(N=512;arrayType = Array) y = repeat(weights,2) .* x2 @test norm(Array(y2) - y) / norm(y) ≈ 0 atol=0.01 - true end function testGradOp1d(N=512;arrayType = Array) @@ -120,7 +116,6 @@ function testGradOp1d(N=512;arrayType = Array) xr0 = transpose(G0)*y0 @test norm(xr - xr0) / norm(xr0) ≈ 0 atol=0.001 - true end function testGradOp2d(N=64;arrayType = Array) @@ -140,7 +135,6 @@ function testGradOp2d(N=64;arrayType = Array) xr0 = vec(xr0) @test norm(xr - xr0) / norm(xr0) ≈ 0 atol=0.001 - true end function testDirectionalGradOp(N=64;arrayType = Array) @@ -173,7 +167,6 @@ function testDirectionalGradOp(N=64;arrayType = Array) end @test norm(x1r-vec(x1r_ref)) / norm(x1r_ref) ≈ 0 atol=0.001 @test norm(x2r-vec(x2r_ref)) / norm(x2r_ref) ≈ 0 atol=0.001 - true end function testSampling(N=64;arrayType = Array) @@ -196,7 +189,6 @@ function testSampling(N=64;arrayType = Array) @test norm(y - y_ref) / norm(y_ref) ≈ 0 atol=0.000001 @test norm(x2 - x2_ref) / norm(x2_ref) ≈ 0 atol=0.000001 @test norm(y2 - x2_ref) / norm(x2_ref) ≈ 0 atol=0.000001 - true end function testWavelet(M=64,N=60;arrayType = Array) @@ -208,7 +200,6 @@ function testWavelet(M=64,N=60;arrayType = Array) x_reco = reshape( adjoint(WOp)*x_wavelet, M, N) @test norm(x_reco - x) / norm(x) ≈ 0 atol=0.001 - true end # test FourierOperators @@ -265,7 +256,6 @@ function testNFFT2d(N=16;arrayType = Array) @test Complex{eltype(nodes)} === eltype(y_nfft) @test Complex{eltype(nodes)} === eltype(y_adj_nfft) - true end function testNFFT3d(N=12;arrayType = Array) @@ -310,7 +300,6 @@ function testNFFT3d(N=12;arrayType = Array) y_AHA = F' * F * vec(x) @test y_AHA_nfft_1 ≈ y_AHA_nfft_2 rtol = 1e-2 @test y_AHA ≈ y_AHA_nfft_1 rtol = 1e-2 - true end function testDiagOp(N=32,K=2;arrayType = Array) @@ -374,7 +363,6 @@ function testDiagOp(N=32,K=2;arrayType = Array) @test y2 ≈ y3 rtol = 1e-2 end - true end function testRadonOp(N=32;arrayType = Array) @@ -392,39 +380,38 @@ function testRadonOp(N=32;arrayType = Array) xtmp = Array(backproject(arrayType(y), angles; geometry = geom)) x2 = reshape(Array(adjoint(op) * arrayType(vec(y1))), size(xtmp)...) 
@test xtmp ≈ x2 rtol = 1e-2 - true end @testset "Linear Operators" begin @testset for arrayType in arrayTypes @info "test DCT-II and DCT-IV Ops: $arrayType" for N in [2,8,16,32] - @test testDCT1d(N;arrayType) skip = arrayType != Array # Not implemented for GPUs + arrayType != Array || @testset testDCT1d(N;arrayType) # Not implemented for GPUs end @info "test FFTOp: $arrayType" for N in [8,16,32] - @test testFFT1d(N,false;arrayType) - @test testFFT1d(N,true;arrayType) - @test testFFT2d(N,false;arrayType) - @test testFFT2d(N,true;arrayType) + @testset testFFT1d(N,false;arrayType) + @testset testFFT1d(N,true;arrayType) + @testset testFFT2d(N,false;arrayType) + @testset testFFT2d(N,true;arrayType) end @info "test WeightingOp: $arrayType" - @test testWeighting(512;arrayType) + @testset testWeighting(512;arrayType) @info "test GradientOp: $arrayType" - @test testGradOp1d(512;arrayType) - @test testGradOp2d(64;arrayType) - @test testDirectionalGradOp(64;arrayType) + @testset testGradOp1d(512;arrayType) + @testset testGradOp2d(64;arrayType) + @testset testDirectionalGradOp(64;arrayType) @info "test SamplingOp: $arrayType" - @test testSampling(64;arrayType) + @testset testSampling(64;arrayType) @info "test WaveletOp: $arrayType" - @test testWavelet(64,64;arrayType) - @test testWavelet(64,60;arrayType) + @testset testWavelet(64,64;arrayType) + @testset testWavelet(64,60;arrayType) @info "test NFFTOp: $arrayType" - @test testNFFT2d(;arrayType) skip = arrayType == JLArray # JLArray does not have a NFFTPlan - @test testNFFT3d(;arrayType) skip = arrayType == JLArray # JLArray does not have a NFFTPlan + arrayType == JLArray || @testset testNFFT2d(;arrayType) # JLArray does not have a NFFTPlan + arrayType == JLArray || @testset testNFFT3d(;arrayType) # JLArray does not have a NFFTPlan @info "test DiagOp: $arrayType" - @test testDiagOp(;arrayType) + @testset testDiagOp(;arrayType) @info "test RadonOp: $arrayType" - @test testRadonOp(;arrayType) skip = arrayType == JLArray # Stackoverflow for kernelabstraction + arrayType == JLArray || @testset testRadonOp(;arrayType) # Stackoverflow for kernelabstraction end end From 2475c8953d20a074dfcb93ab4ea3a06e455528b7 Mon Sep 17 00:00:00 2001 From: nHackel Date: Thu, 6 Jun 2024 15:46:44 +0200 Subject: [PATCH 37/47] Test setup for CUDA --- test/gpu/cuda.jl | 5 +++++ test/runtests.jl | 3 ++- test/testOperators.jl | 2 +- 3 files changed, 8 insertions(+), 2 deletions(-) create mode 100644 test/gpu/cuda.jl diff --git a/test/gpu/cuda.jl b/test/gpu/cuda.jl new file mode 100644 index 0000000..5f6349c --- /dev/null +++ b/test/gpu/cuda.jl @@ -0,0 +1,5 @@ +using CUDA, CuNFFT + +arrayTypes = [CuArray] + +include(joinpath(@__DIR__(), "..", "runtests.jl")) \ No newline at end of file diff --git a/test/runtests.jl b/test/runtests.jl index 362e471..c5a8e88 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -8,7 +8,8 @@ using NFFT using RadonKA using JLArrays -arrayTypes = [Array, JLArray] +areTypesDefined = @isdefined arrayTypes +arrayTypes = areTypesDefined ? 
arrayTypes : [Array, JLArray] @testset "LinearOperatorCollection" begin include("testNormalOp.jl") diff --git a/test/testOperators.jl b/test/testOperators.jl index fb25c12..b7195b1 100644 --- a/test/testOperators.jl +++ b/test/testOperators.jl @@ -234,7 +234,7 @@ function testNFFT2d(N=16;arrayType = Array) F_nfft.toeplitz = false AHA = normalOperator(F_nfft) y_AHA_nfft = Array(AHA * xop) - y_AHA = F' * F * xop + y_AHA = F' * F * vec(x) @test y_AHA ≈ y_AHA_nfft rtol = 1e-2 # test AHA with Toeplitz From ee3b3fb2a9e7288dd366327ea664f2749ec4c47d Mon Sep 17 00:00:00 2001 From: nHackel Date: Mon, 10 Jun 2024 15:48:23 +0200 Subject: [PATCH 38/47] Use fill for res in GradOp --- ext/LinearOperatorGPUArraysExt/GradientOp.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ext/LinearOperatorGPUArraysExt/GradientOp.jl b/ext/LinearOperatorGPUArraysExt/GradientOp.jl index dfd2145..1187837 100644 --- a/ext/LinearOperatorGPUArraysExt/GradientOp.jl +++ b/ext/LinearOperatorGPUArraysExt/GradientOp.jl @@ -14,7 +14,7 @@ function LinearOperatorCollection.grad!(res::vecT, img::vecT, shape, dim) where end # adjoint of directional gradients -function LinearOperatorCollection.grad_t!(res::vecT, g::vecT, shape::NTuple{N,Int64}, dim::Int64) where {vecT <: AbstractGPUVector, N} +function LinearOperatorCollection.grad_t!(res::vecT, g::vecT, shape::NTuple{N,Int64}, dim::Int64) where {T, vecT <: AbstractGPUVector{T}, N} δ = zeros(Int, length(shape)) δ[dim] = 1 δ = Tuple(δ) @@ -23,7 +23,7 @@ function LinearOperatorCollection.grad_t!(res::vecT, g::vecT, shape::NTuple{N,In res_ = reshape(res,shape) g_ = reshape(g, shape .- δ) - res_ .= 0 + fill!(res, zero(T)) gpu_call(res_, g_, di, elements = length(g)) do ctx, res_k, g_k, di_k idx = @cartesianidx(g_k) @inbounds res_k[idx] = g_k[idx] From 27188c69e6e000c2a95c426ed0f60b7ec166a04e Mon Sep 17 00:00:00 2001 From: nHackel Date: Wed, 19 Jun 2024 11:51:02 +0200 Subject: [PATCH 39/47] Add LinearOperatorException for non-concrete S in Diag, Normal and ProdOps --- src/DiagOp.jl | 2 ++ src/NormalOp.jl | 1 + src/ProdOp.jl | 3 ++- 3 files changed, 5 insertions(+), 1 deletion(-) diff --git a/src/DiagOp.jl b/src/DiagOp.jl index 5287d5a..05016f7 100644 --- a/src/DiagOp.jl +++ b/src/DiagOp.jl @@ -43,6 +43,7 @@ function DiagOp(ops) ncol += size(ops[i], 2) S = promote_type(S, LinearOperators.storage_type(ops[i])) end + isconcretetype(S) || throw(LinearOperatorException("Storage types cannot be promoted to a concrete type")) xIdx = cumsum(vcat(1,[size(ops[i], 2) for i=1:length(ops)])) yIdx = cumsum(vcat(1,[size(ops[i], 1) for i=1:length(ops)])) @@ -136,6 +137,7 @@ end function LinearOperatorCollection.normalOperator(diag::DiagOp, W=opEye(eltype(diag), size(diag,1), S = LinearOperators.storage_type(diag)); copyOpsFn = copy, kwargs...) 
T = promote_type(eltype(diag), eltype(W)) S = promote_type(LinearOperators.storage_type(diag), LinearOperators.storage_type(W)) + isconcretetype(S) || throw(LinearOperatorException("Storage types cannot be promoted to a concrete type")) tmp = S(undef, diag.nrow) tmp .= one(eltype(diag)) weights = W*tmp diff --git a/src/NormalOp.jl b/src/NormalOp.jl index e8c0ab1..0749b2a 100644 --- a/src/NormalOp.jl +++ b/src/NormalOp.jl @@ -39,6 +39,7 @@ LinearOperators.storage_type(op::NormalOpImpl) = typeof(op.Mv5) function NormalOpImpl(parent, weights) S = promote_type(storage_type(parent), storage_type(weights)) + isconcretetype(S) || throw(LinearOperatorException("Storage types cannot be promoted to a concrete type")) tmp = S(undef, size(parent, 1)) return NormalOpImpl(parent, weights, tmp) end diff --git a/src/ProdOp.jl b/src/ProdOp.jl index efacd69..27af4ac 100644 --- a/src/ProdOp.jl +++ b/src/ProdOp.jl @@ -36,7 +36,8 @@ composition/product of two Operators. Differs with * since it can handle normal function ProdOp(A, B) nrow = size(A, 1) ncol = size(B, 2) - S = promote_type(storage_type(A), storage_type(B)) + S = promote_type(LinearOperators.storage_type(A), LinearOperators.storage_type(B)) + isconcretetype(S) || throw(LinearOperatorException("Storage types cannot be promoted to a concrete type")) tmp_ = S(undef, size(B, 1)) function produ!(res, x::AbstractVector{T}, tmp) where T<:Union{Real,Complex} From 885b7c5fae24f8a49888020d50c0833aac67dac5 Mon Sep 17 00:00:00 2001 From: nHackel Date: Fri, 21 Jun 2024 17:06:32 +0200 Subject: [PATCH 40/47] Improve GradOp performance on GPU for multiple dims --- ext/LinearOperatorGPUArraysExt/GradientOp.jl | 70 +++++++++++-------- src/GradientOp.jl | 72 ++++++++++++++------ 2 files changed, 96 insertions(+), 46 deletions(-) diff --git a/ext/LinearOperatorGPUArraysExt/GradientOp.jl b/ext/LinearOperatorGPUArraysExt/GradientOp.jl index 1187837..7cddd0a 100644 --- a/ext/LinearOperatorGPUArraysExt/GradientOp.jl +++ b/ext/LinearOperatorGPUArraysExt/GradientOp.jl @@ -1,38 +1,54 @@ -function LinearOperatorCollection.grad!(res::vecT, img::vecT, shape, dim) where {vecT <: AbstractGPUVector} - δ = zeros(Int, length(shape)) - δ[dim] = 1 - δ = Tuple(δ) - di = CartesianIndex(δ) - - gpu_call(reshape(res, shape .- δ), reshape(img,shape), di) do ctx, res_, img_, di_ - idx = @cartesianidx(res_) - @inbounds res_[idx] = img_[idx] - img_[idx + di_] - return nothing +function LinearOperatorCollection.grad!(res::vecT, img::vecT, shape::NTuple{N,Int64}, di::CartesianIndex{N}) where {vecT <: AbstractGPUVector, N} + res = reshape(res, shape .- Tuple(di)) + + if length(res) > 0 + gpu_call(grad_kernel!, res, reshape(img,shape), di) end - + return res end -# adjoint of directional gradients -function LinearOperatorCollection.grad_t!(res::vecT, g::vecT, shape::NTuple{N,Int64}, dim::Int64) where {T, vecT <: AbstractGPUVector{T}, N} - δ = zeros(Int, length(shape)) - δ[dim] = 1 - δ = Tuple(δ) - di = CartesianIndex(δ) +function grad_kernel!(ctx, res, img, di) + idx = @cartesianidx(res) + @inbounds res[idx] = img[idx] - img[idx + di] + return nothing +end +# adjoint of directional gradients +function LinearOperatorCollection.grad_t!(res::vecT, g::vecT, shape::NTuple{N,Int64}, di::CartesianIndex{N}) where {T, vecT <: AbstractGPUVector{T}, N} res_ = reshape(res,shape) - g_ = reshape(g, shape .- δ) + g_ = reshape(g, shape .- Tuple(di)) fill!(res, zero(T)) - gpu_call(res_, g_, di, elements = length(g)) do ctx, res_k, g_k, di_k - idx = @cartesianidx(g_k) - @inbounds res_k[idx] = g_k[idx] - return 
nothing + if length(g_) > 0 + gpu_call(grad_t_kernel_1!, res_, g_, di, elements = length(g)) + gpu_call(grad_t_kernel_2!, res_, g_, di, elements = length(g)) end +end - gpu_call(res_, g_, di, elements = length(g)) do ctx, res_k, g_k, di_k - idx = @cartesianidx(g_k) - @inbounds res_k[idx + di_k] -= g_k[idx] - return nothing - end +function grad_t_kernel_1!(ctx, res, g, di) + idx = @cartesianidx(g) + @inbounds res[idx] += g[idx] + return nothing +end + +function grad_t_kernel_2!(ctx, res, g, di) + idx = @cartesianidx(g) + @inbounds res[idx + di] -= g[idx] + return nothing end + +function LinearOperatorCollection.grad_t!(res::vecT, g::vecT, shape::NTuple{N,Int64}, dirs, dims, dim_ends, tmp) where {T, vecT <: AbstractGPUVector{T}, N} + dim_start = 1 + res = reshape(res, shape) + + fill!(res, zero(eltype(res))) + for (i, di) in enumerate(dirs) + g_ = reshape(view(g, dim_start:dim_ends[i]), shape .- Tuple(di)) + if length(g_) > 0 + gpu_call(grad_t_kernel_1!, res, g_, di, elements = length(g)) + gpu_call(grad_t_kernel_2!, res, g_, di, elements = length(g)) + end + dim_start = dim_ends[i] + 1 + end +end \ No newline at end of file diff --git a/src/GradientOp.jl b/src/GradientOp.jl index d041c3b..b8f77bd 100644 --- a/src/GradientOp.jl +++ b/src/GradientOp.jl @@ -16,46 +16,80 @@ function GradientOp(::Type{T}; shape::NTuple{N,Int}, dims=1:length(shape), kwarg return GradientOpImpl(T, shape, dims; kwargs...) end -function GradientOpImpl(T::Type, shape::NTuple{N,Int}, dims; kwargs...) where N - return vcat([GradientOpImpl(T, shape, dim; kwargs...) for dim ∈ dims]...) +function GradientOpImpl(T::Type, shape::NTuple{N,Int}, dims; S = Vector{T}) where N + dirs = CartesianIndex{N}[] + cols = Int64[] + for dim in dims + δ = zeros(Int32, N) + δ[dim] = 1 + δ = NTuple{N}(δ) + di = CartesianIndex(δ) + push!(dirs, di) + push!(cols, div((shape[dim]-1)*prod(shape), shape[dim])) + end + dim_ends = accumulate(+, cols) + + nrow = sum(cols) + ncol = prod(shape) + + tmp = S(undef, ncol) + + return LinearOperator{T}(nrow, ncol, false, false, + (res,x) -> (grad!(res,x,shape,dirs, dims, dim_ends)), + (res,x) -> (grad_t!(res,x,shape,dirs, dims, dim_ends, tmp)), + (res,x) -> (grad_t!(res,x,shape,dirs, dims, dim_ends, tmp)), + S = S) end function GradientOpImpl(T::Type, shape::NTuple{N,Int}, dim::Int; S = Vector{T}) where N nrow = div( (shape[dim]-1)*prod(shape), shape[dim] ) ncol = prod(shape) + δ = zeros(Int, length(shape)) + δ[dim] = 1 + δ = Tuple(δ) + dir = CartesianIndex(δ) return LinearOperator{T}(nrow, ncol, false, false, - (res,x) -> (grad!(res,x,shape,dim)), - (res,x) -> (grad_t!(res,x,shape,dim)), - (res,x) -> (grad_t!(res,x,shape,dim)), + (res,x) -> (grad!(res,x,shape,dir)), + (res,x) -> (grad_t!(res,x,shape,dir)), + (res,x) -> (grad_t!(res,x,shape,dir)), S = S) end +function grad!(res::T, img::U, shape, dirs, dims, dim_ends) where {T<:AbstractVector, U<:AbstractVector} + dim_start = 1 + + for (i, dir) in enumerate(dirs) + grad!(view(res, dim_start:dim_ends[i]), img, shape, dir) + dim_start = dim_ends[i] + 1 + end +end + # directional gradients -function grad!(res::T, img::U, shape, dim) where {T<:AbstractVector, U<:AbstractVector} +function grad!(res::T, img::U, shape::NTuple{N,Int64}, di::CartesianIndex{N}) where {N, T<:AbstractVector, U<:AbstractVector} img_ = reshape(img,shape) - δ = zeros(Int, length(shape)) - δ[dim] = 1 - δ = Tuple(δ) - di = CartesianIndex(δ) - - res_ = reshape(res, shape .- δ) + res_ = reshape(res, shape .- Tuple(di)) Threads.@threads for i ∈ CartesianIndices(res_) @inbounds res_[i] = 
img_[i] - img_[i + di] end end +function grad_t!(res::T, g::U, shape, dirs, dims, dims_end, tmp) where {T<:AbstractVector, U<:AbstractVector} + dim_start = 1 -# adjoint of directional gradients -function grad_t!(res::T, g::U, shape::NTuple{N,Int64}, dim::Int64) where {T<:AbstractVector, U<:AbstractVector, N} - δ = zeros(Int, length(shape)) - δ[dim] = 1 - δ = Tuple(δ) - di = CartesianIndex(δ) + fill!(res, zero(eltype(res))) + for (i, dir) in enumerate(dirs) + grad_t!(tmp, view(g, dim_start:dims_end[i]), shape, dir) + dim_start = dims_end[i] + 1 + res .= res .+ tmp + end +end +# adjoint of directional gradients +function grad_t!(res::T, g::U, shape::NTuple{N,Int64}, di::CartesianIndex{N}) where {N, T<:AbstractVector, U<:AbstractVector} res_ = reshape(res,shape) - g_ = reshape(g, shape .- δ) + g_ = reshape(g, shape .- Tuple(di)) res_ .= 0 Threads.@threads for i ∈ CartesianIndices(g_) From 44994a58367733d726bffaaa5cd3a7e9c2886190 Mon Sep 17 00:00:00 2001 From: nHackel Date: Thu, 27 Jun 2024 13:26:00 +0200 Subject: [PATCH 41/47] Add CUDA buildkite --- .buildkite/pipeline.yml | 15 +++++++++++++++ 1 file changed, 15 insertions(+) create mode 100644 .buildkite/pipeline.yml diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml new file mode 100644 index 0000000..5b1b8a5 --- /dev/null +++ b/.buildkite/pipeline.yml @@ -0,0 +1,15 @@ +steps: + - label: "Nvidia GPUs -- LinearOperators.jl" + plugins: + - JuliaCI/julia#v1: + version: 1.8 + agents: + queue: "juliagpu" + cuda: "*" + command: | + julia --color=yes --project -e ' + using Pkg + Pkg.add("CUDA") + Pkg.instantiate() + include("test/gpu/cuda.jl")' + timeout_in_minutes: 30 \ No newline at end of file From a97ed904d89b384694cf88b6576c2c9027154147 Mon Sep 17 00:00:00 2001 From: nHackel Date: Thu, 27 Jun 2024 13:31:29 +0200 Subject: [PATCH 42/47] Try fixing buildkite --- .buildkite/pipeline.yml | 2 +- Project.toml | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index 5b1b8a5..ec152ac 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -2,7 +2,7 @@ steps: - label: "Nvidia GPUs -- LinearOperators.jl" plugins: - JuliaCI/julia#v1: - version: 1.8 + version: 1.10 agents: queue: "juliagpu" cuda: "*" diff --git a/Project.toml b/Project.toml index 64bc9e6..3f97137 100644 --- a/Project.toml +++ b/Project.toml @@ -14,6 +14,10 @@ Reexport = "189a3867-3050-52da-a836-e630ba90ab69" [extras] Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" JLArrays = "27aeb0d3-9eb9-45fb-866b-73c2ecf80fcb" +NFFT = "efe261a4-0d2b-5849-be55-fc731d526b0d" +Wavelets = "29a6e085-ba6d-5f35-a997-948ac2efa89a" +FFTW = "7a1cc6ca-52ef-59f5-83cd-3a7055c09341" +RadonKA = "86de8297-835b-47df-b249-c04e8db91db5" [compat] julia = "1.9" From ef2473184abd58b9238140b129794579f4620a39 Mon Sep 17 00:00:00 2001 From: nHackel Date: Thu, 27 Jun 2024 13:33:15 +0200 Subject: [PATCH 43/47] Fix julia version in buildkite --- .buildkite/pipeline.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index ec152ac..6409371 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -2,7 +2,7 @@ steps: - label: "Nvidia GPUs -- LinearOperators.jl" plugins: - JuliaCI/julia#v1: - version: 1.10 + version: "1.10" agents: queue: "juliagpu" cuda: "*" From e06506c89207ad2ba5fe4f91913d9ac0b7c8ed85 Mon Sep 17 00:00:00 2001 From: nHackel Date: Thu, 27 Jun 2024 13:39:37 +0200 Subject: [PATCH 44/47] Add CuNFFT to CUDA buildkite --- .buildkite/pipeline.yml | 1 
+ 1 file changed, 1 insertion(+) diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index 6409371..e471b6f 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -10,6 +10,7 @@ steps: julia --color=yes --project -e ' using Pkg Pkg.add("CUDA") + Pkg.add("CuNFFT") Pkg.instantiate() include("test/gpu/cuda.jl")' timeout_in_minutes: 30 \ No newline at end of file From 5e66c0020349e282ea7868ac81da9ebf4b8a27ce Mon Sep 17 00:00:00 2001 From: nHackel Date: Thu, 27 Jun 2024 13:51:02 +0200 Subject: [PATCH 45/47] Add all extras to buildkite --- .buildkite/pipeline.yml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index e471b6f..45bb68f 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -11,6 +11,11 @@ steps: using Pkg Pkg.add("CUDA") Pkg.add("CuNFFT") + Pkg.add("JLArrays") + Pkg.add("NFFT") + Pkg.add("Wavelets") + Pkg.add("FFTW") + Pkg.add("RadonKA") Pkg.instantiate() include("test/gpu/cuda.jl")' timeout_in_minutes: 30 \ No newline at end of file From 1423c54475a54c41cfcab6f5fe56fba86f316a4e Mon Sep 17 00:00:00 2001 From: nHackel Date: Thu, 27 Jun 2024 15:33:18 +0200 Subject: [PATCH 46/47] Use TestEnv to fix compat issues for buildkite --- .buildkite/pipeline.yml | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index 45bb68f..119311e 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -9,13 +9,10 @@ steps: command: | julia --color=yes --project -e ' using Pkg + Pkg.add("TestEnv") + using TestEnv + TestEnv.activate(); Pkg.add("CUDA") - Pkg.add("CuNFFT") - Pkg.add("JLArrays") - Pkg.add("NFFT") - Pkg.add("Wavelets") - Pkg.add("FFTW") - Pkg.add("RadonKA") Pkg.instantiate() include("test/gpu/cuda.jl")' timeout_in_minutes: 30 \ No newline at end of file From 8e14f5df01236355506e0c9609a5e6f8be2561f5 Mon Sep 17 00:00:00 2001 From: nHackel Date: Thu, 27 Jun 2024 15:40:24 +0200 Subject: [PATCH 47/47] Readd CuNFFT to buildkite --- .buildkite/pipeline.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index 119311e..691a002 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -13,6 +13,7 @@ steps: using TestEnv TestEnv.activate(); Pkg.add("CUDA") + Pkg.add("CuNFFT") Pkg.instantiate() include("test/gpu/cuda.jl")' timeout_in_minutes: 30 \ No newline at end of file
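
Editor's usage sketch (illustrative only, not part of the patch series): the operator constructors touched above now accept an `S` keyword that selects the storage type of their internal buffers, so GPU support is opted into by passing a device array type instead of the default `Array`/`Vector`. The snippet below shows the CPU default and, as a hypothetical, the GPU variant. It assumes the constructors `FFTOp` and `GradientOp` are exported by LinearOperatorCollection as in the released package; the `CuArray` lines are hypothetical, require CUDA.jl and a CUDA-capable device, and their exact behaviour depends on the parts of the series not shown here.

using LinearOperatorCollection, FFTW   # FFTW loads the FFT operator extension

T     = ComplexF32
shape = (16, 16)

# Default CPU storage, equivalent to S = Array{ComplexF32}
F = FFTOp(T; shape = shape, shift = true, unitary = true)

x  = rand(T, prod(shape))
y  = F * x            # forward (shifted, unitary) FFT
x2 = adjoint(F) * y   # adjoint transform; x2 ≈ x because the operator is unitary

# Gradient operator along both dimensions, built on the same storage mechanism
G = GradientOp(T; shape = shape, dims = 1:2)
g = G * x

# Hypothetical GPU usage: pass a device array type via the S keyword, e.g.
#   using CUDA
#   F_gpu = FFTOp(T; shape = shape, S = CuArray{T})
#   G_gpu = GradientOp(T; shape = shape, dims = 1:2, S = CuArray{T})

The CI patches follow the same idea on the testing side: instead of adding every GPU and test dependency by hand (as the intermediate commits tried), the pipeline activates the package's test environment via TestEnv and only then adds CUDA and CuNFFT, which appears to be what resolved the compat conflicts mentioned in the later commit messages.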