From e08df27f5281466a2d04fadc6072afd3381c9283 Mon Sep 17 00:00:00 2001
From: Ludvig af Klinteberg <ludvig.af.klinteberg@mdu.se>
Date: Wed, 16 Oct 2024 17:03:48 +0200
Subject: [PATCH 1/2] Add support for GPU type 3 transform  * Tests are
 tentative pending fix of gpu_method=0 bug  * Fix some typos along the way

---
 README.md               |   4 +-
 src/cufinufft.jl        |  27 ++++++++++-
 src/cufinufft_simple.jl | 101 ++++++++++++++++++++++++++++++++++++++++
 src/types.jl            |   5 +-
 test/test_cuda.jl       |  45 ++++++++++++++++++
 test/test_nufft.jl      |   2 +-
 6 files changed, 178 insertions(+), 6 deletions(-)

diff --git a/README.md b/README.md
index be59824..38a99cb 100644
--- a/README.md
+++ b/README.md
@@ -8,7 +8,7 @@ This is a full-featured Julia interface to [FINUFFT](https://github.com/flatiron
 
 ## Installation
 
-FINUFFT.jl requires Julia v1.6 or later, and has been tested up to v1.10. From the Pkg REPL mode (hit `]` in REPL to enter), run
+FINUFFT.jl requires Julia v1.6 or later, and has been tested up to v1.11. From the Pkg REPL mode (hit `]` in REPL to enter), run
 
 ```julia
 add FINUFFT
@@ -105,7 +105,7 @@ see [examples/time2d1.jl](examples/time2d1.jl)
 
 Finally, the more involved codes [test/test_nufft.jl](test/test_nufft.jl) 
 and [test/test_cuda.jl](test/test_cuda.jl)
-tests `dtype=Float64` and `dtype=Float32` precisions for all supported transform types, and can be used as references.
+test `dtype=Float64` and `dtype=Float32` precisions for all supported transform types, and can be used as references.
 The outputs are tested there for mathematical correctness.
 In the 1D type 1 it also tests a vectorized simple, a guru call and
 a vectorized guru call.
diff --git a/src/cufinufft.jl b/src/cufinufft.jl
index dc9d92c..e46e766 100644
--- a/src/cufinufft.jl
+++ b/src/cufinufft.jl
@@ -122,7 +122,6 @@ function _cufinufft_makeplan(::Type{dtype},
 
     n_modes = ones(Int64,3)
     if type==3
-        throw("Type 3 not implemented yet")
         @assert ndims(n_modes_or_dim) == 0
         dim = n_modes_or_dim
     else
@@ -130,7 +129,7 @@ function _cufinufft_makeplan(::Type{dtype},
         dim = length(n_modes_or_dim)
         n_modes[1:dim] .= n_modes_or_dim
     end
-    
+
     if dtype==Float64
         tol = Float64(eps)
         ret = ccall( (:cufinufft_makeplan, libcufinufft),
@@ -414,6 +413,30 @@ function cufinufft_exec!(plan::cufinufft_plan{T},
                          plan.plan_ptr,output,input
                          )
         end
+    elseif type==3
+        nk = plan.nk
+        if ntrans==1
+            @assert size(output)==(nk,ntrans) || size(output)==(nk,)
+        else
+            @assert size(output)==(nk,ntrans)
+        end
+        if T==Float64
+            ret = ccall( (:cufinufft_execute, libcufinufft),
+                         Cint,
+                         (cufinufft_plan_c,
+                          CuRef{ComplexF64},
+                          CuRef{ComplexF64}),
+                         plan.plan_ptr,input,output
+                         )
+        else
+            ret = ccall( (:cufinufftf_execute, libcufinufft),
+                         Cint,
+                         (cufinufft_plan_c,
+                          CuRef{ComplexF32},
+                          CuRef{ComplexF32}),
+                         plan.plan_ptr,input,output
+                         )
+        end
     else
         ret = ERR_TYPE_NOTVALID
     end
diff --git a/src/cufinufft_simple.jl b/src/cufinufft_simple.jl
index b80fc9b..fa32dd4 100644
--- a/src/cufinufft_simple.jl
+++ b/src/cufinufft_simple.jl
@@ -60,6 +60,35 @@ function nufft1d2!(xj      :: CuArray{T},
     cufinufft_destroy!(plan)
 end
 
+"""
+    nufft1d3!(xj      :: CuArray{Float64} or CuArray{Float32}, 
+              cj      :: CuArray{ComplexF64} or CuArray{ComplexF32}, 
+              iflag   :: Integer, 
+              eps     :: Real,
+              sk      :: CuArray{Float64} or CuArray{Float32},
+              fk      :: CuArray{ComplexF64} or CuArray{ComplexF32};
+              kwargs...
+             )
+
+CUDA version.
+"""
+function nufft1d3!(xj      :: CuArray{T},
+                   cj      :: CuArray{Complex{T}},
+                   iflag   :: Integer, 
+                   eps     :: Real,
+                   sk      :: CuArray{T},
+                   fk      :: CuArray{Complex{T}};
+                   kwargs...) where T <: finufftReal
+    (nj, nk) = valid_setpts(3,1,xj,T[],T[],sk)
+    ntrans = valid_ntr(xj,cj)
+
+    checkkwdtype(T; kwargs...)
+    plan = _cufinufft_makeplan(T,3,1,iflag,ntrans,eps;kwargs...)
+    cufinufft_setpts!(plan,xj,CuVector{T}(),CuVector{T}(),sk)
+    cufinufft_exec!(plan,cj,fk)
+    cufinufft_destroy!(plan)
+end
+
 ## 2D
 
 """
@@ -123,6 +152,40 @@ function nufft2d2!(xj      :: CuArray{T},
     cufinufft_destroy!(plan)
 end
 
+"""
+    nufft2d3!(xj      :: CuArray{Float64} or CuArray{Float32}, 
+              yj      :: CuArray{Float64} or CuArray{Float32},
+              cj      :: CuArray{ComplexF64} or CuArray{ComplexF32}, 
+              iflag   :: Integer, 
+              eps     :: Real,
+              sk      :: CuArray{Float64} or CuArray{Float32},
+              tk      :: CuArray{Float64} or CuArray{Float32},
+              fk      :: CuArray{ComplexF64} or CuArray{ComplexF32};
+              kwargs...
+             )
+
+CUDA version.
+"""
+function nufft2d3!(xj      :: CuArray{T}, 
+                   yj      :: CuArray{T},
+                   cj      :: CuArray{Complex{T}}, 
+                   iflag   :: Integer, 
+                   eps     :: Real,
+                   sk      :: CuArray{T},
+                   tk      :: CuArray{T},
+                   fk      :: CuArray{Complex{T}};
+                   kwargs...) where T <: finufftReal
+    (nj, nk) = valid_setpts(3,2,xj,yj,T[],sk,tk)
+    ntrans = valid_ntr(xj,cj)
+
+    checkkwdtype(T; kwargs...)
+    plan = _cufinufft_makeplan(T,3,2,iflag,ntrans,eps;kwargs...)
+    cufinufft_setpts!(plan,xj,yj,CuVector{T}(),sk,tk)
+    cufinufft_exec!(plan,cj,fk)
+    cufinufft_destroy!(plan)
+end
+
+
 ## 3D
 
 """
@@ -188,3 +251,41 @@ function nufft3d2!(xj      :: CuArray{T},
     cufinufft_exec!(plan,fk,cj)
     cufinufft_destroy!(plan)
 end
+
+"""
+    nufft3d3!(xj      :: CuArray{Float64} or CuArray{Float32}, 
+              yj      :: CuArray{Float64} or CuArray{Float32},
+              zj      :: CuArray{Float64} or CuArray{Float32},
+              cj      :: CuArray{ComplexF64} or CuArray{ComplexF32}, 
+              iflag   :: Integer, 
+              eps     :: Real,
+              sk      :: CuArray{Float64} or CuArray{Float32},
+              tk      :: CuArray{Float64} or CuArray{Float32},
+              uk      :: CuArray{Float64} or CuArray{Float32},
+              fk      :: CuArray{ComplexF64} or CuArray{ComplexF32};
+              kwargs...
+             )
+
+CUDA version.
+"""
+function nufft3d3!(xj      :: CuArray{T},
+                   yj      :: CuArray{T},
+                   zj      :: CuArray{T},
+                   cj      :: CuArray{Complex{T}},
+                   iflag   :: Integer,
+                   eps     :: Real,
+                   sk      :: CuArray{T},
+                   tk      :: CuArray{T},
+                   uk      :: CuArray{T},
+                   fk      :: CuArray{Complex{T}};
+                   kwargs...) where T <: finufftReal
+    (nj, nk) = valid_setpts(3,3,xj,yj,zj,sk,tk,uk)
+    ntrans = valid_ntr(xj,cj)
+
+    checkkwdtype(T; kwargs...)
+    plan = _cufinufft_makeplan(T,3,3,iflag,ntrans,eps;kwargs...)
+    cufinufft_setpts!(plan,xj,yj,zj,sk,tk,uk)
+    cufinufft_exec!(plan,cj,fk)
+    cufinufft_destroy!(plan)
+end
+
diff --git a/src/types.jl b/src/types.jl
index 46506fc..af493b7 100644
--- a/src/types.jl
+++ b/src/types.jl
@@ -130,6 +130,7 @@ const nufft_c_opts = nufft_opts        # for backward compatibility - remove?
         gpu_stream           :: Ptr{Cvoid}
         modeord              :: Cint # (type 1,2 only): 0 CMCL-style increasing mode order
                                      #                  1 FFT-style mode order
+        debug                :: Cint # 0: no debug, 1: debug
     end
 
 Options struct passed to cuFINUFFT, see C documentation.
@@ -162,6 +163,8 @@ mutable struct cufinufft_opts
     gpu_stream           :: Ptr{Cvoid}
 
     modeord              :: Cint # (type 1,2 only): 0 CMCL-style increasing mode order
-                                 #                  1 FFT-style mode order
+    #                  1 FFT-style mode order
+
+    debug                :: Cint # 0: no debug, 1: debug
     cufinufft_opts() = new()
 end
diff --git a/test/test_cuda.jl b/test/test_cuda.jl
index 9a9cebc..369ad49 100644
--- a/test/test_cuda.jl
+++ b/test/test_cuda.jl
@@ -50,6 +50,9 @@ function test_cuda(tol::Real, dtype::DataType)
     y_d = CuArray(y)
     z_d = CuArray(z)
     c_d = CuArray(c)
+    s_d = CuArray(s)
+    t_d = CuArray(t)
+    u_d = CuArray(u)
     F1D_d = CuArray(F1D)
     F2D_d = CuArray(F2D)
     F3D_d = CuArray(F3D)
@@ -128,6 +131,21 @@ function test_cuda(tol::Real, dtype::DataType)
                 relerr_1d2_guru = norm(vec(out2)-vec(ref), Inf) / norm(vec(ref), Inf)
                 @test relerr_1d2_guru < errfac*tol
             end
+
+            # 1D3
+            @testset "1D3" begin
+                out_d = CUDA.zeros(Complex{T},nk)
+                ref = zeros(Complex{T},nk)
+                for k=1:nk
+                    for j=1:nj
+                        ref[k] += c[j] * exp(1im*s[k]*x[j])
+                    end
+                end
+                nufft1d3!(x_d,c_d,1,tol,s_d,out_d, debug=1, gpu_method=1) # FIXME
+                relerr_1d3 = norm(vec(Array(out_d))-vec(ref), Inf) / norm(vec(ref), Inf)
+                @test relerr_1d3 < errfac*tol
+            end
+
         end
 
         ## 2D
@@ -177,6 +195,20 @@ function test_cuda(tol::Real, dtype::DataType)
                 relerr_2d2_guru = norm(vec(out2)-vec(ref), Inf) / norm(vec(ref), Inf)
                 @test relerr_2d2_guru < errfac*tol
             end
+
+            @testset "2D3" begin
+                # 2D3
+                out_d = CUDA.zeros(Complex{T},nk)
+                ref = zeros(Complex{T},nk)
+                for k=1:nk
+                    for j=1:nj
+                        ref[k] += c[j] * exp(1im*(s[k]*x[j]+t[k]*y[j]))
+                    end
+                end
+                nufft2d3!(x_d,y_d,c_d,1,tol,s_d,t_d,out_d, debug=1, gpu_method=1) # FIXME
+                relerr_2d3 = norm(vec(Array(out_d))-vec(ref), Inf) / norm(vec(ref), Inf)
+                @test relerr_2d3 < errfac*tol
+            end
         end
 
         ## 3D
@@ -231,6 +263,19 @@ function test_cuda(tol::Real, dtype::DataType)
                 @test relerr_3d2_guru < errfac*tol
             end
 
+            @testset "3D3" begin
+                # 3D3
+                out_d = CUDA.zeros(Complex{T},nk)
+                ref = zeros(Complex{T},nk)
+                for k=1:nk
+                    for j=1:nj
+                        ref[k] += c[j] * exp(1im*(s[k]*x[j]+t[k]*y[j]+u[k]*z[j]))
+                    end
+                end
+                nufft3d3!(x_d,y_d,z_d,c_d,1,tol,s_d,t_d,u_d,out_d, debug=1, gpu_method=1) # FIXME
+                relerr_3d3 = norm(vec(Array(out_d))-vec(ref), Inf) / norm(vec(ref), Inf)
+                @test relerr_3d3 < errfac*tol
+            end
         end
     end
 end
diff --git a/test/test_nufft.jl b/test/test_nufft.jl
index 5aca25f..dfc1163 100644
--- a/test/test_nufft.jl
+++ b/test/test_nufft.jl
@@ -165,7 +165,7 @@ function test_nufft(tol::Real, dtype::DataType)
                 @test reldiff < errdifffac*tol
             end
 
-            @testset "3D3" begin
+            @testset "2D3" begin
                 # 2D3
                 out = zeros(Complex{T},nk)
                 ref = zeros(Complex{T},nk)

From af3b3eb65e24b8f7a9cfb1cdab36156f0713d188 Mon Sep 17 00:00:00 2001
From: Ludvig af Klinteberg <ludvig.af.klinteberg@mdu.se>
Date: Fri, 18 Oct 2024 09:06:44 +0200
Subject: [PATCH 2/2] Remove FIXMEs

---
 test/test_cuda.jl | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/test/test_cuda.jl b/test/test_cuda.jl
index 369ad49..4c4dc70 100644
--- a/test/test_cuda.jl
+++ b/test/test_cuda.jl
@@ -141,7 +141,7 @@ function test_cuda(tol::Real, dtype::DataType)
                         ref[k] += c[j] * exp(1im*s[k]*x[j])
                     end
                 end
-                nufft1d3!(x_d,c_d,1,tol,s_d,out_d, debug=1, gpu_method=1) # FIXME
+                nufft1d3!(x_d,c_d,1,tol,s_d,out_d)
                 relerr_1d3 = norm(vec(Array(out_d))-vec(ref), Inf) / norm(vec(ref), Inf)
                 @test relerr_1d3 < errfac*tol
             end
@@ -205,7 +205,7 @@ function test_cuda(tol::Real, dtype::DataType)
                         ref[k] += c[j] * exp(1im*(s[k]*x[j]+t[k]*y[j]))
                     end
                 end
-                nufft2d3!(x_d,y_d,c_d,1,tol,s_d,t_d,out_d, debug=1, gpu_method=1) # FIXME
+                nufft2d3!(x_d,y_d,c_d,1,tol,s_d,t_d,out_d)
                 relerr_2d3 = norm(vec(Array(out_d))-vec(ref), Inf) / norm(vec(ref), Inf)
                 @test relerr_2d3 < errfac*tol
             end
@@ -272,7 +272,7 @@ function test_cuda(tol::Real, dtype::DataType)
                         ref[k] += c[j] * exp(1im*(s[k]*x[j]+t[k]*y[j]+u[k]*z[j]))
                     end
                 end
-                nufft3d3!(x_d,y_d,z_d,c_d,1,tol,s_d,t_d,u_d,out_d, debug=1, gpu_method=1) # FIXME
+                nufft3d3!(x_d,y_d,z_d,c_d,1,tol,s_d,t_d,u_d,out_d)
                 relerr_3d3 = norm(vec(Array(out_d))-vec(ref), Inf) / norm(vec(ref), Inf)
                 @test relerr_3d3 < errfac*tol
             end