diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 4fcbc91..50b4cb9 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -34,4 +34,4 @@ jobs:
       - name: Pytest
         run: |
-          pytest tests/ --cov
+          pytest tests/ --cov -k "not cuda"
diff --git a/Jenkinsfile b/Jenkinsfile
new file mode 100644
index 0000000..c030e39
--- /dev/null
+++ b/Jenkinsfile
@@ -0,0 +1,91 @@
+pipeline {
+    agent none
+    options {
+        disableConcurrentBuilds()
+        buildDiscarder(logRotator(numToKeepStr: '8', daysToKeepStr: '20'))
+        timeout(time: 1, unit: 'HOURS')
+    }
+    stages {
+        stage('CUDA Tests') {
+            agent {
+                dockerfile {
+                    filename 'ci/docker/Dockerfile-cuda11.8'
+                    args '--gpus 2'
+                    label 'docker && v100'
+                }
+            }
+            environment {
+                HOME = "$WORKSPACE"
+                PYBIN = "/opt/python/cp39-cp39/bin"
+                LIBRARY_PATH = "$WORKSPACE/finufft/build"
+                LD_LIBRARY_PATH = "$WORKSPACE/finufft/build"
+            }
+            steps {
+
+                // TODO - reconsider install strategy once finufft/cufinufft 2.2 is released
+                checkout scmGit(branches: [[name: '*/master']],
+                                extensions: [cloneOption(noTags: true, reference: '', shallow: true),
+                                             [$class: 'RelativeTargetDirectory', relativeTargetDir: 'finufft'],
+                                             cleanAfterCheckout()],
+                                userRemoteConfigs: [[url: 'https://github.com/flatironinstitute/finufft']])
+
+                sh '''#!/bin/bash -ex
+                    nvidia-smi
+                '''
+                sh '''#!/bin/bash -ex
+                    echo $HOME
+                    ls
+                '''
+                sh '''#!/bin/bash -ex
+                    cd finufft
+                    # v100 cuda arch
+                    cuda_arch="70"
+
+                    cmake -B build . -DFINUFFT_USE_CUDA=ON \
+                                     -DFINUFFT_USE_CPU=OFF \
+                                     -DFINUFFT_BUILD_TESTS=OFF \
+                                     -DCMAKE_CUDA_ARCHITECTURES="$cuda_arch" \
+                                     -DBUILD_TESTING=ON
+                    cd build
+                    make -j4
+                '''
+
+                sh '${PYBIN}/python3 -m venv $HOME'
+                sh '''#!/bin/bash -ex
+                    source $HOME/bin/activate
+                    python3 -m pip install --upgrade pip
+                    # we could also move pytorch install inside docker
+                    python3 -m pip install "torch~=2.1.0" --index-url https://download.pytorch.org/whl/cu118
+                    python3 -m pip install finufft/python/cufinufft
+
+                    python3 -m pip install -e .[dev]
+
+                    python3 -m pytest -k "cuda" tests/ --cov -v
+                '''
+            }
+        }
+    }
+    post {
+        failure {
+            emailext subject: '$PROJECT_NAME - Build #$BUILD_NUMBER - $BUILD_STATUS',
+                body: '''$PROJECT_NAME - Build #$BUILD_NUMBER - $BUILD_STATUS
+
+Check console output at $BUILD_URL to view full results.
+
+Building $BRANCH_NAME for $CAUSE
+$JOB_DESCRIPTION
+
+Changes:
+$CHANGES
+
+End of build log:
+${BUILD_LOG,maxLines=200}
+''',
+                recipientProviders: [
+                    [$class: 'DevelopersRecipientProvider'],
+                ],
+                replyTo: '$DEFAULT_REPLYTO',
+                to: 'bward@flatironinstitute.org'
+        }
+    }
+}
diff --git a/ci/docker/Dockerfile-cuda11.8 b/ci/docker/Dockerfile-cuda11.8
new file mode 100644
index 0000000..ce622f7
--- /dev/null
+++ b/ci/docker/Dockerfile-cuda11.8
@@ -0,0 +1,58 @@
+# Based on https://github.com/flatironinstitute/finufft/blob/master/tools/cufinufft/docker/cuda11.2/Dockerfile-x86_64
+
+FROM quay.io/pypa/manylinux2014_x86_64
+LABEL maintainer "Brian Ward"
+
+ENV CUDA_MAJOR 11
+ENV CUDA_MINOR 8
+ENV CUDA_DASH_VERSION ${CUDA_MAJOR}-${CUDA_MINOR}
+ENV CUDA_DOT_VERSION ${CUDA_MAJOR}.${CUDA_MINOR}
+
+# ---- The following block adds layers for CUDA --- #
+# base
+RUN NVIDIA_GPGKEY_SUM=d0664fbbdb8c32356d45de36c5984617217b2d0bef41b93ccecd326ba3b80c87 && \
+    curl -fsSL https://developer.download.nvidia.com/compute/cuda/repos/rhel8/x86_64/D42D0685.pub | sed '/^Version/d' > /etc/pki/rpm-gpg/RPM-GPG-KEY-NVIDIA && \
+    echo "$NVIDIA_GPGKEY_SUM /etc/pki/rpm-gpg/RPM-GPG-KEY-NVIDIA" | sha256sum -c --strict -
+
+COPY ci/docker/cuda.repo /etc/yum.repos.d/cuda.repo
+
+# For libraries in the cuda-compat-* package: https://docs.nvidia.com/cuda/eula/index.html#attachment-a
+RUN yum install -y \
+    cuda-cudart-${CUDA_DASH_VERSION} \
+    cuda-compat-${CUDA_DASH_VERSION} && \
+    ln -s cuda-${CUDA_DOT_VERSION} /usr/local/cuda
+
+# nvidia-docker 1.0
+RUN echo "/usr/local/nvidia/lib" >> /etc/ld.so.conf.d/nvidia.conf && \
+    echo "/usr/local/nvidia/lib64" >> /etc/ld.so.conf.d/nvidia.conf
+
+ENV PATH /usr/local/nvidia/bin:/usr/local/cuda/bin:${PATH}
+ENV LD_LIBRARY_PATH ${LD_LIBRARY_PATH}:/usr/local/nvidia/lib:/usr/local/nvidia/lib64
+
+# nvidia-container-runtime
+ENV NVIDIA_VISIBLE_DEVICES all
+ENV NVIDIA_DRIVER_CAPABILITIES compute,utility
+ENV NVIDIA_REQUIRE_CUDA "cuda>=${CUDA_DOT_VERSION} brand=tesla,driver>=418,driver<419 brand=tesla,driver>=440,driver<441"
+
+# runtime
+RUN yum install -y \
+    cuda-libraries-${CUDA_DASH_VERSION} \
+    cuda-nvtx-${CUDA_DASH_VERSION} \
+    cuda-cudart-devel-${CUDA_DASH_VERSION} \
+    cuda-libraries-devel-${CUDA_DASH_VERSION} \
+    cuda-nvprof-${CUDA_DASH_VERSION} \
+    cuda-nvcc-${CUDA_DASH_VERSION}
+
+ENV LIBRARY_PATH /usr/local/cuda/lib64/stubs
+
+# /CUDA #
+
+# CUDA 11 doesn't work on gcc/g++ newer than v9
+RUN yum install -y \
+    devtoolset-9-gcc \
+    devtoolset-9-gcc-c++ \
+    cmake && \
+    rm -rf /var/cache/yum/*
+
+ENV PATH /opt/rh/devtoolset-9/root/usr/bin:${PATH}
+
diff --git a/ci/docker/cuda.repo b/ci/docker/cuda.repo
new file mode 100644
index 0000000..ba2cba6
--- /dev/null
+++ b/ci/docker/cuda.repo
@@ -0,0 +1,6 @@
+[cuda]
+name=cuda
+baseurl=https://developer.download.nvidia.com/compute/cuda/repos/rhel8/x86_64
+enabled=1
+gpgcheck=1
+gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-NVIDIA
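A minimal sanity check for the resulting image (a sketch, not part of this patch, assuming the cu118 torch wheel installed in the Jenkins stage) is to confirm that PyTorch can actually see the GPUs passed through with `--gpus 2`:

```python
import torch

# Fails fast if the driver shim from cuda-compat / nvidia-container-runtime
# is not visible inside the container.
assert torch.cuda.is_available(), "CUDA driver/runtime not visible"

print(torch.version.cuda)             # expect "11.8" for the cu118 wheel
print(torch.cuda.device_count())      # expect 2 given `args '--gpus 2'`
print(torch.cuda.get_device_name(0))  # a V100 on the labeled Jenkins node
```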
" + "Install either finufft or cufinufft and ensure they are importable." + ) + import pytorch_finufft._err as err ############################################################################### @@ -1595,27 +1614,40 @@ def backward( ) - - - ############################################################################### # Consolidated forward function for all 1D, 2D, and 3D problems for nufft type 1 ############################################################################### -def get_nufft_func(dim, nufft_type): - return getattr(finufft, f"nufft{dim}d{nufft_type}") + +def get_nufft_func(dim, nufft_type, device_type): + if device_type == "cuda": + return getattr(cufinufft, f"nufft{dim}d{nufft_type}") + + # CPU needs extra work to go to/from torch and numpy + finufft_func = getattr(finufft, f"nufft{dim}d{nufft_type}") + + def f(*args, **kwargs): + new_args = [arg for arg in args] + for i in range(len(new_args)): + if isinstance(new_args[i], torch.Tensor): + new_args[i] = new_args[i].data.numpy() + + return torch.from_numpy(finufft_func(*new_args, **kwargs)) + + return f class finufft_type1(torch.autograd.Function): @staticmethod def forward( - ctx: Any, - points: torch.Tensor, - values: torch.Tensor, - output_shape: Union[int, tuple[int, int], tuple[int, int, int]], - out: Optional[torch.Tensor]=None, - fftshift: bool=False, - finufftkwargs: dict[str, Union[int, float]]=None): + ctx: Any, + points: torch.Tensor, + values: torch.Tensor, + output_shape: Union[int, Tuple[int, int], Tuple[int, int, int]], + out: Optional[torch.Tensor] = None, + fftshift: bool = False, + finufftkwargs: dict[str, Union[int, float]] = None, + ): """ Evaluates the Type 1 NUFFT on the inputs. @@ -1626,8 +1658,13 @@ def forward( # All this requires is a check on the out array to make sure it is the # correct shape. - err._type1_checks(points, values, output_shape) # revisit these error checks to take into account the shape of points instead of passing them separately - # ^ make sure these checks check for consistency between output shape and len(points) + # TODO: + # revisit these error checks to take into account the shape of points + # instead of passing them separately + # make sure these checks check for consistency between output shape and + # len(points) + # Also need device checks + err._type1_checks(points, values, output_shape) if finufftkwargs is None: finufftkwargs = dict() @@ -1640,7 +1677,8 @@ def forward( # to note instead that there is a conflict in fftshift if _mode_ordering != 1: raise ValueError( - "Double specification of ordering; only one of fftshift and modeord should be provided" + "Double specification of ordering; only one of fftshift and " + "modeord should be provided" ) _mode_ordering = 0 @@ -1654,38 +1692,34 @@ def forward( ndim = points.shape[0] assert len(output_shape) == ndim - nufft_func = get_nufft_func(ndim, 1) - finufft_out = torch.from_numpy( - nufft_func( - *points.data.numpy(), - values.data.numpy(), - output_shape, - modeord=_mode_ordering, - isign=_i_sign, - **finufftkwargs, - ) + nufft_func = get_nufft_func(ndim, 1, points.device.type) + finufft_out = nufft_func( + *points, values, output_shape, isign=_i_sign, **finufftkwargs ) + # because modeord is missing from cufinufft + if _mode_ordering: + finufft_out = torch.fft.ifftshift(finufft_out) return finufft_out @staticmethod def backward( ctx: Any, grad_output: torch.Tensor - ) -> tuple[Union[torch.Tensor, None], ...]: + ) -> Tuple[Union[torch.Tensor, None], ...]: """ - Implements derivatives wrt. 
diff --git a/tests/test_1d/test_forward_1d.py b/tests/test_1d/test_forward_1d.py
index 5379927..eec8a9b 100644
--- a/tests/test_1d/test_forward_1d.py
+++ b/tests/test_1d/test_forward_1d.py
@@ -66,15 +66,14 @@ def test_1d_t1_forward_CPU(values: torch.Tensor) -> None:
         torch.linalg.norm(finufft1D1_out - against_scipy) / N**2
     ) == pytest.approx(0, abs=1e-06)
 
-
     abs_errors = torch.abs(finufft1D1_out - against_torch)
     l_inf_error = abs_errors.max()
     l_2_error = torch.sqrt(torch.sum(abs_errors**2))
     l_1_error = torch.sum(abs_errors)
 
-    assert l_inf_error < 3.5e-3 * N ** .6
-    assert l_2_error < 7.5e-4 * N ** 1.1
-    assert l_1_error < 5e-4 * N ** 1.6
+    assert l_inf_error < 3.5e-3 * N**0.6
+    assert l_2_error < 7.5e-4 * N**1.1
+    assert l_1_error < 5e-4 * N**1.6
 
 
 @pytest.mark.parametrize("targets", cases)
@@ -106,17 +105,16 @@ def test_1d_t2_forward_CPU(targets: torch.Tensor):
     )
 
 
-@pytest.mark.parametrize("N", Ns)
-def test_t1_forward_CPU(N: int) -> None:
+def check_t1_forward(N: int, device: str) -> None:
     """
     Tests against implementations of the FFT by setting up a uniform grid
     over which to call FINUFFT through the API.
     """
     g = np.mgrid[:N] * 2 * np.pi / N
     g.shape = 1, -1
-    points = torch.from_numpy(g.reshape(1, -1))
+    points = torch.from_numpy(g.reshape(1, -1)).to(device)
 
-    values = torch.randn(*points[0].shape, dtype=torch.complex128)
+    values = torch.randn(*points[0].shape, dtype=torch.complex128).to(device)
 
     print("N is " + str(N))
     print("shape of points is " + str(points.shape))
@@ -136,10 +134,19 @@ def test_t1_forward_CPU(N: int) -> None:
     l_1_error = torch.sum(abs_errors)
 
     assert l_inf_error < 4.5e-5 * N
-    assert l_2_error < 1e-5 * N ** 2
-    assert l_1_error < 1e-5 * N ** 3
+    assert l_2_error < 1e-5 * N**2
+    assert l_1_error < 1e-5 * N**3
+
+
+@pytest.mark.parametrize("N", Ns)
+def test_t1_forward_CPU(N: int) -> None:
+    check_t1_forward(N, "cpu")
+
+
+@pytest.mark.parametrize("N", Ns)
+def test_t1_forward_cuda(N: int) -> None:
+    check_t1_forward(N, "cuda")
 
 
 # @pytest.mark.parametrize("values", cases)
 # def test_1d_t3_forward_CPU(values: torch.Tensor) -> None:
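The CUDA variants here are selected purely by test name (`-k "cuda"` on Jenkins, `-k "not cuda"` in GitHub Actions). A hypothetical alternative, sketched below and not part of this patch, would reuse the file's `Ns` and `check_t1_forward` but skip based on the runtime environment instead:

```python
import pytest
import torch

# Hypothetical marker: skip CUDA tests on hosts without a visible GPU
# rather than maintaining -k name filters in each CI configuration.
requires_cuda = pytest.mark.skipif(
    not torch.cuda.is_available(), reason="no CUDA device available"
)


@requires_cuda
@pytest.mark.parametrize("N", Ns)
def test_t1_forward_cuda_guarded(N: int) -> None:
    check_t1_forward(N, "cuda")
```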
diff --git a/tests/test_2d/test_backward_2d.py b/tests/test_2d/test_backward_2d.py
index 6a9b707..ddde4cf 100644
--- a/tests/test_2d/test_backward_2d.py
+++ b/tests/test_2d/test_backward_2d.py
@@ -5,8 +5,6 @@
 
 import pytorch_finufft
 
-from functools import partial
-
 torch.set_default_tensor_type(torch.DoubleTensor)
 torch.set_default_dtype(torch.float64)
 torch.manual_seed(0)
@@ -100,48 +98,68 @@ def test_t1_backward_CPU_values(
     assert gradcheck(apply_finufft2d1(modifier, fftshift, isign), inputs)
 
 
-@pytest.mark.parametrize("N", Ns)
-@pytest.mark.parametrize("modifier", length_modifiers)
-@pytest.mark.parametrize("fftshift", [False, True])
-@pytest.mark.parametrize("isign", [-1, 1])
-def test_t1_consolidated_backward_CPU_values(N: int, modifier: int, fftshift: bool, isign: int) -> None:
-
-    points = torch.rand((2, N), dtype=torch.float64) * 2 * np.pi
-    values = torch.randn(N, dtype=torch.complex128)
+def check_t1_backward(
+    N: int,
+    modifier: int,
+    fftshift: bool,
+    isign: int,
+    device: str,
+    points_or_values: bool,
+) -> None:
+    points = torch.rand((2, N), dtype=torch.float64).to(device) * 2 * np.pi
+    values = torch.randn(N, dtype=torch.complex128).to(device)
 
-    points.requires_grad = False
-    values.requires_grad = True
+    points.requires_grad = points_or_values
+    values.requires_grad = not points_or_values
 
     inputs = (points, values)
 
     def func(points, values):
         return pytorch_finufft.functional.finufft_type1.apply(
-            points, values, (N,N + modifier), None, fftshift, dict(isign=isign)
+            points, values, (N, N + modifier), None, fftshift, dict(isign=isign)
         )
 
-    assert gradcheck(func, inputs)
+    assert gradcheck(func, inputs, atol=1e-5 * N)
 
 
 @pytest.mark.parametrize("N", Ns)
 @pytest.mark.parametrize("modifier", length_modifiers)
 @pytest.mark.parametrize("fftshift", [False, True])
 @pytest.mark.parametrize("isign", [-1, 1])
-def test_t1_consolidated_backward_CPU_points(N: int, modifier: int, fftshift: bool, isign: int) -> None:
+def test_t1_consolidated_backward_CPU_points(
+    N: int, modifier: int, fftshift: bool, isign: int
+) -> None:
+    check_t1_backward(N, modifier, fftshift, isign, "cpu", True)
 
-    points = torch.rand((2, N), dtype=torch.float64) * 2 * np.pi
-    values = torch.randn(N, dtype=torch.complex128)
 
-    points.requires_grad = True
-    values.requires_grad = False
+@pytest.mark.parametrize("N", Ns)
+@pytest.mark.parametrize("modifier", length_modifiers)
+@pytest.mark.parametrize("fftshift", [False, True])
+@pytest.mark.parametrize("isign", [-1, 1])
+def test_t1_consolidated_backward_CPU_values(
+    N: int, modifier: int, fftshift: bool, isign: int
+) -> None:
+    check_t1_backward(N, modifier, fftshift, isign, "cpu", False)
 
-    inputs = (points, values)
 
-    def func(points, values):
-        return pytorch_finufft.functional.finufft_type1.apply(
-            points, values, (N,N + modifier), None, fftshift, dict(isign=isign)
-        )
+@pytest.mark.parametrize("N", Ns)
+@pytest.mark.parametrize("modifier", length_modifiers)
+@pytest.mark.parametrize("fftshift", [False, True])
+@pytest.mark.parametrize("isign", [-1, 1])
+def test_t1_consolidated_backward_cuda_values(
+    N: int, modifier: int, fftshift: bool, isign: int
+) -> None:
+    check_t1_backward(N, modifier, fftshift, isign, "cuda", False)
 
-    assert gradcheck(func, inputs, atol=1e-5 * N)
+
+@pytest.mark.parametrize("N", Ns)
+@pytest.mark.parametrize("modifier", length_modifiers)
+@pytest.mark.parametrize("fftshift", [False, True])
+@pytest.mark.parametrize("isign", [-1, 1])
+def test_t1_consolidated_backward_cuda_points(
+    N: int, modifier: int, fftshift: bool, isign: int
+) -> None:
+    check_t1_backward(N, modifier, fftshift, isign, "cuda", True)
 
 
 @pytest.mark.parametrize("N", Ns)
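`check_t1_backward` funnels every case through `torch.autograd.gradcheck`, which compares the analytic gradients from `finufft_type1.backward` against finite differences in float64. A standalone sketch of the values case, using the same `atol` scaling as the helper above:

```python
import numpy as np
import torch
from torch.autograd import gradcheck

import pytorch_finufft

N = 10
points = torch.rand((2, N), dtype=torch.float64) * 2 * np.pi
values = torch.randn(N, dtype=torch.complex128, requires_grad=True)


def func(values: torch.Tensor) -> torch.Tensor:
    return pytorch_finufft.functional.finufft_type1.apply(
        points, values, (N, N), None, False, None
    )


# gradcheck perturbs `values` and checks backward() against finite differences.
assert gradcheck(func, (values,), atol=1e-5 * N)
```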
diff --git a/tests/test_2d/test_forward_2d.py b/tests/test_2d/test_forward_2d.py
index 1dda568..0ae6d85 100644
--- a/tests/test_2d/test_forward_2d.py
+++ b/tests/test_2d/test_forward_2d.py
@@ -1,10 +1,12 @@
 import numpy as np
 import pytest
 import torch
-torch.manual_seed(0)
 
 import pytorch_finufft
 
+torch.manual_seed(0)
+
+
 # Case generation
 Ns = [
     10,
@@ -52,8 +54,8 @@ def test_2d_t1_forward_CPU(N: int) -> None:
     l_1_error = torch.sum(abs_errors)
 
     assert l_inf_error < 5e-5 * N
-    assert l_2_error < 1e-5 * N ** 2
-    assert l_1_error < 1e-5 * N ** 3
+    assert l_2_error < 1e-5 * N**2
+    assert l_1_error < 1e-5 * N**3
 
 
 @pytest.mark.parametrize("N", Ns)
@@ -102,8 +104,8 @@ def test_2d_t2_forward_CPU(N: int) -> None:
     l_1_error = torch.sum(abs_errors)
 
     assert l_inf_error < 1e-5 * N
-    assert l_2_error < 1e-5 * N ** 2
-    assert l_1_error < 1e-5 * N ** 3
+    assert l_2_error < 1e-5 * N**2
+    assert l_1_error < 1e-5 * N**3
 
 
 # @pytest.mark.parametrize("N", Ns)
@@ -122,16 +124,15 @@ def test_2d_t2_forward_CPU(N: int) -> None:
 #     pass
 
 
-@pytest.mark.parametrize("N", Ns)
-def test_t1_forward_CPU(N: int) -> None:
+def check_t1_forward(N: int, device: str) -> None:
     """
     Tests against implementations of the FFT by setting up a uniform grid
     over which to call FINUFFT through the API.
     """
     g = np.mgrid[:N, :N] * 2 * np.pi / N
-    points = torch.from_numpy(g.reshape(2, -1))
+    points = torch.from_numpy(g.reshape(2, -1)).to(device)
 
-    values = torch.randn(*points[0].shape, dtype=torch.complex128)
+    values = torch.randn(*points[0].shape, dtype=torch.complex128).to(device)
 
     print("N is " + str(N))
     print("shape of points is " + str(points.shape))
@@ -151,6 +152,15 @@ def test_t1_forward_CPU(N: int) -> None:
     l_1_error = torch.sum(abs_errors)
 
     assert l_inf_error < 4.5e-5 * N
-    assert l_2_error < 1e-5 * N ** 2
-    assert l_1_error < 1e-5 * N ** 3
+    assert l_2_error < 1e-5 * N**2
+    assert l_1_error < 1e-5 * N**3
+
+
+@pytest.mark.parametrize("N", Ns)
+def test_t1_forward_CPU(N: int) -> None:
+    check_t1_forward(N, "cpu")
+
+
+@pytest.mark.parametrize("N", Ns)
+def test_t1_forward_cuda(N: int) -> None:
+    check_t1_forward(N, "cuda")
""" g = np.mgrid[:N, :N] * 2 * np.pi / N - points = torch.from_numpy(g.reshape(2, -1)) + points = torch.from_numpy(g.reshape(2, -1)).to(device) - values = torch.randn(*points[0].shape, dtype=torch.complex128) + values = torch.randn(*points[0].shape, dtype=torch.complex128).to(device) print("N is " + str(N)) print("shape of points is " + str(points.shape)) @@ -151,6 +152,15 @@ def test_t1_forward_CPU(N: int) -> None: l_1_error = torch.sum(abs_errors) assert l_inf_error < 4.5e-5 * N - assert l_2_error < 1e-5 * N ** 2 - assert l_1_error < 1e-5 * N ** 3 + assert l_2_error < 1e-5 * N**2 + assert l_1_error < 1e-5 * N**3 + +@pytest.mark.parametrize("N", Ns) +def test_t1_forward_CPU(N: int) -> None: + check_t1_forward(N, "cpu") + + +@pytest.mark.parametrize("N", Ns) +def test_t1_forward_cuda(N: int) -> None: + check_t1_forward(N, "cuda") diff --git a/tests/test_3d/test_forward_3d.py b/tests/test_3d/test_forward_3d.py index 45484aa..524e9a6 100644 --- a/tests/test_3d/test_forward_3d.py +++ b/tests/test_3d/test_forward_3d.py @@ -1,10 +1,12 @@ import numpy as np import pytest import torch -torch.manual_seed(0) import pytorch_finufft +torch.manual_seed(0) + + # Case generation Ns = [ 5, @@ -45,10 +47,9 @@ def test_3d_t1_forward_CPU(N: int) -> None: l_2_error = torch.sqrt(torch.sum(abs_errors**2)) l_1_error = torch.sum(abs_errors) - assert l_inf_error < 2e-5 * N ** 1.5 - assert l_2_error < 1e-5 * N ** 3 - assert l_1_error < 1e-5 * N ** 4.5 - + assert l_inf_error < 2e-5 * N**1.5 + assert l_2_error < 1e-5 * N**3 + assert l_1_error < 1e-5 * N**4.5 @pytest.mark.parametrize("N", Ns) @@ -79,21 +80,20 @@ def test_3d_t2_forward_CPU(N: int) -> None: l_2_error = torch.sqrt(torch.sum(abs_errors**2)) l_1_error = torch.sum(abs_errors) - assert l_inf_error < 1e-5 * N ** 1.5 - assert l_2_error < 1e-5 * N ** 3 - assert l_1_error < 1e-5 * N ** 4.5 + assert l_inf_error < 1e-5 * N**1.5 + assert l_2_error < 1e-5 * N**3 + assert l_1_error < 1e-5 * N**4.5 -@pytest.mark.parametrize("N", Ns) -def test_t1_forward_CPU(N: int) -> None: +def check_t1_forward(N: int, device: str) -> None: """ Tests against implementations of the FFT by setting up a uniform grid over which to call FINUFFT through the API. """ g = np.mgrid[:N, :N, :N] * 2 * np.pi / N - points = torch.from_numpy(g.reshape(3, -1)) + points = torch.from_numpy(g.reshape(3, -1)).to(device) - values = torch.randn(*points[0].shape, dtype=torch.complex128) + values = torch.randn(*points[0].shape, dtype=torch.complex128).to(device) print("N is " + str(N)) print("shape of points is " + str(points.shape)) @@ -112,6 +112,16 @@ def test_t1_forward_CPU(N: int) -> None: l_2_error = torch.sqrt(torch.sum(abs_errors**2)) l_1_error = torch.sum(abs_errors) - assert l_inf_error < 1.5e-5 * N ** 1.5 - assert l_2_error < 1e-5 * N ** 3 - assert l_1_error < 1e-5 * N ** 4.5 \ No newline at end of file + assert l_inf_error < 1.5e-5 * N**1.5 + assert l_2_error < 1e-5 * N**3 + assert l_1_error < 1e-5 * N**4.5 + + +@pytest.mark.parametrize("N", Ns) +def test_t1_forward_CPU(N: int) -> None: + check_t1_forward(N, "cpu") + + +@pytest.mark.parametrize("N", Ns) +def test_t1_forward_cuda(N: int) -> None: + check_t1_forward(N, "cuda")