diff --git a/dpnp/tests/third_party/cupy/fft_tests/test_cache.py b/dpnp/tests/third_party/cupy/fft_tests/test_cache.py
new file mode 100644
index 00000000000..dc2a82a2385
--- /dev/null
+++ b/dpnp/tests/third_party/cupy/fft_tests/test_cache.py
@@ -0,0 +1,510 @@
+import contextlib
+import io
+import queue
+import threading
+import unittest
+
+import pytest
+
+import dpnp as cupy
+from dpnp.tests.third_party.cupy import testing
+
+# from cupy.cuda import cufft
+# from cupy.cuda import device
+# from cupy.cuda import runtime
+# from cupy.fft import config
+
+# from .test_fft import (multi_gpu_config, _skip_multi_gpu_bug)
+
+pytest.skip("FFT cache functions are not supported", allow_module_level=True)
+
+
+def intercept_stdout(func):
+    """Call ``func()`` and return everything it wrote to ``sys.stdout``."""
+    with contextlib.redirect_stdout(io.StringIO()) as captured:
+        func()
+    return captured.getvalue()
+
+
+n_devices = runtime.getDeviceCount()
+
+
+class TestPlanCache(unittest.TestCase):
+    def setUp(self):
+        self.caches = []
+        self.old_sizes = []
+        for i in range(n_devices):
+            with device.Device(i):
+                cache = config.get_plan_cache()
+                self.old_sizes.append(cache.get_size())
+                cache.clear()
+                cache.set_memsize(-1)
+                cache.set_size(2)
+            self.caches.append(cache)
+
+    def tearDown(self):
+        for dev_id, saved_size in enumerate(self.old_sizes):
+            with device.Device(dev_id):
+                plan_cache = config.get_plan_cache()
+                plan_cache.clear()
+                plan_cache.set_size(saved_size)  # recorded in setUp()
+                plan_cache.set_memsize(-1)
+
+    def test_LRU_cache1(self):
+        # test if insertion and clean-up works
+        cache = config.get_plan_cache()
+        assert cache.get_curr_size() == 0 <= cache.get_size()
+
+        a = testing.shaped_random((10,), cupy, cupy.float32)
+        cupy.fft.fft(a)
+        assert cache.get_curr_size() == 1 <= cache.get_size()
+
+        cache.clear()
+        assert cache.get_curr_size() == 0 <= cache.get_size()
+
+    def test_LRU_cache2(self):
+        # test if a cached plan is reused by an identical transform
+        cache = config.get_plan_cache()
+        assert cache.get_curr_size() == 0 <= cache.get_size()
+
+        # the first call leaves exactly one entry in the cache
+        x = testing.shaped_random((10,), cupy, cupy.float32)
+        cupy.fft.fft(x)
+        assert cache.get_curr_size() == 1 <= cache.get_size()
+        # iteration yields (key, node) pairs; remember the node's plan
+        first_plan = next(iter(cache))[1].plan
+
+        # running the same FFT again must not add another entry
+        cupy.fft.fft(x)
+        assert cache.get_curr_size() == 1 <= cache.get_size()
+        # look at the entry at the front of the cache once more
+        second_plan = next(iter(cache))[1].plan
+
+        # we should get the same plan
+        assert first_plan is second_plan
+
+    def test_LRU_cache3(self):
+        # test if cache size is limited (setUp() caps it at 2 plans)
+        cache = config.get_plan_cache()
+        assert cache.get_curr_size() == 0 <= cache.get_size()
+
+        # run once and remember the plan that ends up in the cache
+        x = testing.shaped_random((10,), cupy, cupy.float32)
+        cupy.fft.fft(x)
+        assert cache.get_curr_size() == 1 <= cache.get_size()
+        first_plan = next(iter(cache))[1].plan
+
+        # run another two FFTs with different sizes so that the first
+        # plan is discarded from the cache
+        x = testing.shaped_random((20,), cupy, cupy.float32)
+        cupy.fft.fft(x)
+        assert cache.get_curr_size() == 2 <= cache.get_size()
+        # the cache is full now, so caching this plan evicts the oldest one
+        x = testing.shaped_random((30,), cupy, cupy.float32)
+        cupy.fft.fft(x)
+        assert cache.get_curr_size() == 2 <= cache.get_size()
+
+        # check if the first plan is indeed not cached
+        for _, node in cache:
+            assert first_plan is not node.plan
+
+    def test_LRU_cache4(self):
+        # test if fetching the plan will reorder it to the top
+        cache = config.get_plan_cache()
+        assert cache.get_curr_size() == 0 <= cache.get_size()
+
+        # this creates a Plan1d
+        a = testing.shaped_random((10,), cupy, cupy.float32)
+        cupy.fft.fft(a)
+        assert cache.get_curr_size() == 1 <= cache.get_size()
+
+        # this creates a PlanNd
+        a = testing.shaped_random((10, 20), cupy, cupy.float32)
+        cupy.fft.fftn(a)
+        assert cache.get_curr_size() == 2 <= cache.get_size()
+
+        # The first in the cache is the most recently used one;
+        # using an iterator to access the linked list guarantees that
+        # we don't alter the cache order
+        iterator = iter(cache)
+        assert isinstance(next(iterator)[1].plan, cufft.PlanNd)
+        assert isinstance(next(iterator)[1].plan, cufft.Plan1d)
+        with pytest.raises(StopIteration):
+            next(iterator)
+
+        # this brings Plan1d to the top
+        a = testing.shaped_random((10,), cupy, cupy.float32)
+        cupy.fft.fft(a)
+        assert cache.get_curr_size() == 2 <= cache.get_size()
+        iterator = iter(cache)
+        assert isinstance(next(iterator)[1].plan, cufft.Plan1d)
+        assert isinstance(next(iterator)[1].plan, cufft.PlanNd)
+        with pytest.raises(StopIteration):
+            next(iterator)
+
+        # An LRU cache guarantees that such a silly operation never
+        # raises StopIteration
+        iterator = iter(cache)
+        for i in range(100):
+            cache[next(iterator)[0]]
+
+    @testing.multi_gpu(2)
+    def test_LRU_cache5(self):
+        # test if the LRU cache is thread-local
+
+        def init_caches(gpus):
+            for gpu_id in gpus:
+                with device.Device(gpu_id):
+                    config.get_plan_cache()
+
+        # Testing in the current thread: in setUp() we ensure all caches
+        # are initialized
+        stdout = intercept_stdout(config.show_plan_cache_info)
+        assert "uninitialized" not in stdout
+
+        def thread_show_plan_cache_info(out_q):
+            # allow output from another thread to be accessed by the
+            # main thread
+            cupy.cuda.Device().use()
+            out_q.put(intercept_stdout(config.show_plan_cache_info))
+
+        # When starting a new thread, the cache is uninitialized there
+        # (for both devices)
+        q = queue.Queue()
+        thread = threading.Thread(target=thread_show_plan_cache_info, args=(q,))
+        thread.start()
+        thread.join()
+        stdout = q.get()
+        assert stdout.count("uninitialized") == n_devices
+
+        def thread_init_caches(gpus, out_q):
+            # set up the requested devices' caches first, then report back
+            cupy.cuda.Device().use()
+            init_caches(gpus)
+            thread_show_plan_cache_info(out_q)
+
+        # Now let's try initializing device 0 on another thread
+        thread = threading.Thread(
+            target=thread_init_caches,
+            args=(
+                [0],
+                q,
+            ),
+        )
+        thread.start()
+        thread.join()
+        stdout = q.get()
+        assert stdout.count("uninitialized") == n_devices - 1
+
+        # ...and this time both devices
+        thread = threading.Thread(
+            target=thread_init_caches,
+            args=(
+                [0, 1],
+                q,
+            ),
+        )
+        thread.start()
+        thread.join()
+        stdout = q.get()
+        assert stdout.count("uninitialized") == n_devices - 2
+
+    @testing.multi_gpu(2)
+    def test_LRU_cache6(self):
+        # test if each device has a separate cache
+        cache0 = self.caches[0]
+        cache1 = self.caches[1]
+
+        # ensure a fresh state
+        assert cache0.get_curr_size() == 0 <= cache0.get_size()
+        assert cache1.get_curr_size() == 0 <= cache1.get_size()
+
+        # do some computation on GPU 0
+        with device.Device(0):
+            a = testing.shaped_random((10,), cupy, cupy.float32)
+            cupy.fft.fft(a)
+        assert cache0.get_curr_size() == 1 <= cache0.get_size()
+        assert cache1.get_curr_size() == 0 <= cache1.get_size()
+
+        # do some computation on GPU 1
+        with device.Device(1):
+            c = testing.shaped_random((16,), cupy, cupy.float64)
+            cupy.fft.fft(c)
+        assert cache0.get_curr_size() == 1 <= cache0.get_size()
+        assert cache1.get_curr_size() == 1 <= cache1.get_size()
+
+        # reset device 0
+        cache0.clear()
+        assert cache0.get_curr_size() == 0 <= cache0.get_size()
+        assert cache1.get_curr_size() == 1 <= cache1.get_size()
+
+        # reset device 1
+        cache1.clear()
+        assert cache0.get_curr_size() == 0 <= cache0.get_size()
+        assert cache1.get_curr_size() == 0 <= cache1.get_size()
+
+    @testing.multi_gpu(2)
+    @pytest.mark.skipif(
+        runtime.is_hip, reason="hipFFT doesn't support multi-GPU"
+    )
+    def test_LRU_cache7(self):
+        # test accessing a multi-GPU plan
+        cache0 = self.caches[0]
+        cache1 = self.caches[1]
+
+        # ensure a fresh state
+        assert cache0.get_curr_size() == 0 <= cache0.get_size()
+        assert cache1.get_curr_size() == 0 <= cache1.get_size()
+
+        # do some computation on GPU 0
+        with device.Device(0):
+            a = testing.shaped_random((10,), cupy, cupy.float32)
+            cupy.fft.fft(a)
+        assert cache0.get_curr_size() == 1 <= cache0.get_size()
+        assert cache1.get_curr_size() == 0 <= cache1.get_size()
+
+        # do a multi-GPU FFT
+        config.use_multi_gpus = True
+        config.set_cufft_gpus([0, 1])
+        c = testing.shaped_random((128,), cupy, cupy.complex64)
+        cupy.fft.fft(c)
+        assert cache0.get_curr_size() == 2 <= cache0.get_size()
+        assert cache1.get_curr_size() == 1 <= cache1.get_size()
+
+        # check both devices' caches see the same multi-GPU plan
+        plan0 = next(iter(cache0))[1].plan
+        plan1 = next(iter(cache1))[1].plan
+        assert plan0 is plan1
+
+        # reset
+        config.use_multi_gpus = False
+        config._device = None
+
+        # do some computation on GPU 1
+        with device.Device(1):
+            e = testing.shaped_random((20,), cupy, cupy.complex128)
+            cupy.fft.fft(e)
+        assert cache0.get_curr_size() == 2 <= cache0.get_size()
+        assert cache1.get_curr_size() == 2 <= cache1.get_size()
+
+        # by this time, the multi-GPU plan remains the most recently
+        # used one on GPU 0, but not on GPU 1
+        assert plan0 is next(iter(cache0))[1].plan
+        assert plan1 is not next(iter(cache1))[1].plan
+
+        # now use it again to make it the most recent
+        config.use_multi_gpus = True
+        config.set_cufft_gpus([0, 1])
+        c = testing.shaped_random((128,), cupy, cupy.complex64)
+        cupy.fft.fft(c)
+        assert cache0.get_curr_size() == 2 <= cache0.get_size()
+        assert cache1.get_curr_size() == 2 <= cache1.get_size()
+        assert plan0 is next(iter(cache0))[1].plan
+        assert plan1 is next(iter(cache1))[1].plan
+        # reset
+        config.use_multi_gpus = False
+        config._device = None
+
+        # Do 2 more different FFTs on one of the devices, and the
+        # multi-GPU plan would be discarded from both caches
+        with device.Device(1):
+            x = testing.shaped_random((30,), cupy, cupy.complex128)
+            cupy.fft.fft(x)
+            y = testing.shaped_random((40, 40), cupy, cupy.complex64)
+            cupy.fft.fftn(y)
+        for _, node in cache0:
+            assert plan0 is not node.plan
+        for _, node in cache1:
+            assert plan1 is not node.plan
+        assert cache0.get_curr_size() == 1 <= cache0.get_size()
+        assert cache1.get_curr_size() == 2 <= cache1.get_size()
+
+    def test_LRU_cache8(self):
+        # test if Plan1d and PlanNd can coexist in the same cache
+        cache = config.get_plan_cache()
+        assert cache.get_curr_size() == 0 <= cache.get_size()
+
+        # do a 1D FFT
+        a = testing.shaped_random((10,), cupy, cupy.float32)
+        cupy.fft.fft(a)
+        assert cache.get_curr_size() == 1 <= cache.get_size()
+        assert isinstance(next(iter(cache))[1].plan, cufft.Plan1d)
+
+        # then a 3D FFT
+        a = testing.shaped_random((8, 8, 8), cupy, cupy.complex128)
+        cupy.fft.fftn(a)
+        assert cache.get_curr_size() == 2 <= cache.get_size()
+        iterator = iter(cache)
+
+        # the cached order is 1. PlanNd, 2. Plan1d
+        assert isinstance(next(iterator)[1].plan, cufft.PlanNd)
+        assert isinstance(next(iterator)[1].plan, cufft.Plan1d)
+
+    def test_LRU_cache9(self):
+        # test if the memsizes of the cached plans add up
+        cache = config.get_plan_cache()
+        assert cache.get_curr_size() == 0 <= cache.get_size()
+
+        memsize = 0
+        a = testing.shaped_random((10,), cupy, cupy.float32)
+        cupy.fft.fft(a)
+        assert cache.get_curr_size() == 1 <= cache.get_size()
+        memsize += next(iter(cache))[1].plan.work_area.mem.size
+
+        a = testing.shaped_random((48,), cupy, cupy.complex64)
+        cupy.fft.fft(a)
+        assert cache.get_curr_size() == 2 <= cache.get_size()
+        memsize += next(iter(cache))[1].plan.work_area.mem.size
+
+        assert memsize == cache.get_curr_memsize()
+
+    def test_LRU_cache10(self):
+        # test if deletion works and if show_info() is consistent with data
+        cache = config.get_plan_cache()
+        assert cache.get_curr_size() == 0 <= cache.get_size()
+
+        curr_size = 0
+        size = 2  # capacity chosen in setUp()
+        curr_memsize = 0
+        memsize = "(unlimited)"  # default
+
+        x = testing.shaped_random((16, 16), cupy, cupy.float32)
+        cupy.fft.fft2(x)
+        assert cache.get_curr_size() == 1 <= cache.get_size()
+        node1 = next(iter(cache))[1]
+        curr_size += 1
+        curr_memsize += node1.plan.work_area.mem.size
+        stdout = intercept_stdout(cache.show_info)
+        assert f"{curr_size} / {size} (counts)" in stdout
+        assert f"{curr_memsize} / {memsize} (bytes)" in stdout
+        assert str(node1) in stdout
+
+        x = testing.shaped_random((1024,), cupy, cupy.complex64)
+        cupy.fft.ifft(x)
+        assert cache.get_curr_size() == 2 <= cache.get_size()
+        node2 = next(iter(cache))[1]
+        curr_size += 1
+        curr_memsize += node2.plan.work_area.mem.size
+        stdout = intercept_stdout(cache.show_info)
+        assert f"{curr_size} / {size} (counts)" in stdout
+        assert f"{curr_memsize} / {memsize} (bytes)" in stdout
+        assert f"{node2!s}\n{node1!s}" in stdout
+
+        # test deletion
+        # dropping an entry by its key must shrink both reported totals
+        del cache[node2.key]
+        assert cache.get_curr_size() == 1 <= cache.get_size()
+        curr_size -= 1
+        curr_memsize -= node2.plan.work_area.mem.size
+        stdout = intercept_stdout(cache.show_info)
+        assert f"{curr_size} / {size} (counts)" in stdout
+        assert f"{curr_memsize} / {memsize} (bytes)" in stdout
+        assert str(node2) not in stdout
+
+    @multi_gpu_config(gpu_configs=[[0, 1], [1, 0]])
+    @testing.multi_gpu(2)
+    @pytest.mark.skipif(
+        runtime.is_hip, reason="hipFFT doesn't support multi-GPU"
+    )
+    def test_LRU_cache11(self):
+        # test if collectively deleting a multi-GPU plan works
+        _skip_multi_gpu_bug((128,), self.gpus)
+        cache0 = self.caches[0]
+        cache1 = self.caches[1]
+
+        # ensure a fresh state
+        assert cache0.get_curr_size() == 0 <= cache0.get_size()
+        assert cache1.get_curr_size() == 0 <= cache1.get_size()
+
+        # do a multi-GPU FFT
+        c = testing.shaped_random((128,), cupy, cupy.complex64)
+        cupy.fft.fft(c)
+        assert cache0.get_curr_size() == 1 <= cache0.get_size()
+        assert cache1.get_curr_size() == 1 <= cache1.get_size()
+
+        node0 = next(iter(cache0))[1]
+        node1 = next(iter(cache1))[1]
+        assert node0.key == node1.key
+        assert node0.plan is node1.plan
+        assert cache0.get_curr_memsize() == node0.memsize > 0
+        assert cache1.get_curr_memsize() == node1.memsize > 0
+
+        # delete
+        del cache0[node0.key]
+        assert cache0.get_curr_size() == 0 <= cache0.get_size()
+        assert cache1.get_curr_size() == 0 <= cache1.get_size()
+        assert cache0.get_curr_memsize() == 0
+        assert cache1.get_curr_memsize() == 0
+
+    @multi_gpu_config(gpu_configs=[[0, 1], [1, 0]])
+    @testing.multi_gpu(2)
+    @pytest.mark.skipif(
+        runtime.is_hip, reason="hipFFT doesn't support multi-GPU"
+    )
+    def test_LRU_cache12(self):
+        # test if an error is raised when one of the caches is unable
+        # to fit a multi-GPU plan
+        cache0 = self.caches[0]
+        cache1 = self.caches[1]
+
+        # ensure a fresh state
+        assert cache0.get_curr_size() == 0 <= cache0.get_size()
+        assert cache1.get_curr_size() == 0 <= cache1.get_size()
+
+        # make it impossible to cache
+        cache1.set_memsize(1)
+
+        # do a multi-GPU FFT
+        with pytest.raises(RuntimeError) as e:
+            c = testing.shaped_random((128,), cupy, cupy.complex64)
+            cupy.fft.fft(c)
+        assert "plan memsize is too large for device 1" in str(e.value)
+        assert cache0.get_curr_size() == 0 <= cache0.get_size()
+        assert cache1.get_curr_size() == 0 <= cache1.get_size()
+
+    @unittest.skipIf(runtime.is_hip, "rocFFT has different plan sizes")
+    @unittest.skipIf(
+        runtime.runtimeGetVersion() >= 11080,
+        "CUDA 11.8 has different plan size",
+    )
+    def test_LRU_cache13(self):
+        # test if plan insertion respects the memory size limit
+        cache = config.get_plan_cache()
+        cache.set_memsize(1024)
+
+        # ensure a fresh state
+        assert cache.get_curr_size() == 0 <= cache.get_size()
+
+        # On CUDA 10.0 + sm75, this generates a plan of size 1024 bytes
+        a = testing.shaped_random((128,), cupy, cupy.complex64)
+        cupy.fft.ifft(a)
+        assert cache.get_curr_size() == 1 <= cache.get_size()
+        assert cache.get_curr_memsize() == 1024 == cache.get_memsize()
+
+        # a second plan (of same size) is generated, but the cache is full,
+        # so the first plan is evicted
+        a = testing.shaped_random((64,), cupy, cupy.complex128)
+        cupy.fft.ifft(a)
+        assert cache.get_curr_size() == 1 <= cache.get_size()
+        assert cache.get_curr_memsize() == 1024 == cache.get_memsize()
+        plan = next(iter(cache))[1].plan
+
+        # this plan is twice as large, so won't fit in
+        a = testing.shaped_random((128,), cupy, cupy.complex128)
+        with pytest.raises(RuntimeError) as e:
+            cupy.fft.ifft(a)
+        assert "memsize is too large" in str(e.value)
+        # the cache remains intact
+        assert cache.get_curr_size() == 1 <= cache.get_size()
+        assert cache.get_curr_memsize() == 1024 == cache.get_memsize()
+        plan1 = next(iter(cache))[1].plan
+        assert plan1 is plan
+
+        # doubling the cache size makes the plan just fit (and evicts
+        # the existing one)
+        cache.set_memsize(2048)
+        cupy.fft.ifft(a)
+        assert cache.get_curr_size() == 1 <= cache.get_size()
+        assert cache.get_curr_memsize() == 2048 == cache.get_memsize()
+        plan2 = next(iter(cache))[1].plan
+        assert plan2 is not plan
diff --git a/dpnp/tests/third_party/cupy/fft_tests/test_callback.py b/dpnp/tests/third_party/cupy/fft_tests/test_callback.py
new file mode 100644
index 00000000000..56c06c16f74
--- /dev/null
+++ b/dpnp/tests/third_party/cupy/fft_tests/test_callback.py
@@ -0,0 +1,831 @@
+import contextlib
+import string
+import sys
+import tempfile
+from unittest import mock
+
+import numpy as np
+import pytest
+
+import dpnp as cupy
+from dpnp.tests.third_party.cupy import testing  # vendored, no cupy dep
+
+pytest.skip("FFT callbacks are not supported", allow_module_level=True)
+
+
+@contextlib.contextmanager
+def use_temporary_cache_dir():
+    # point the callback module's cache-dir lookup at a throwaway folder
+    hook = "cupy.fft._callback.get_cache_dir"
+    with tempfile.TemporaryDirectory() as path, mock.patch(hook, lambda: path):
+        yield path
+
+
+_load_callback = r"""
+__device__ ${data_type} CB_ConvertInput(
+    void* dataIn, size_t offset, void* callerInfo, void* sharedPtr)
+{
+    ${data_type} x = ((${data_type}*)dataIn)[offset];
+    ${element} *= 2.5;
+    return x;
+}
+
+__device__ ${load_type} d_loadCallbackPtr = CB_ConvertInput;
+"""
+
+_load_callback_with_aux = r"""
+__device__ ${data_type} CB_ConvertInput(
+    void* dataIn, size_t offset, void* callerInfo, void* sharedPtr)
+{
+    ${data_type} x = ((${data_type}*)dataIn)[offset];
+    ${element} *= *((${aux_type}*)callerInfo);
+    return x;
+}
+
+__device__ ${load_type} d_loadCallbackPtr = CB_ConvertInput;
+"""
+
+_load_callback_with_aux2 = r"""
+__device__ ${data_type} CB_ConvertInput(
+    void* dataIn, size_t offset, void* callerInfo, void* sharedPtr)
+{
+    ${data_type} x = ((${data_type}*)dataIn)[offset];
+    ${element} *= ((${aux_type}*)callerInfo)[offset];
+    return x;
+}
+
+__device__ ${load_type} d_loadCallbackPtr = CB_ConvertInput;
+"""
+
+_store_callback = r"""
+__device__ void CB_ConvertOutput(
+    void *dataOut, size_t offset, ${data_type} element,
+    void *callerInfo, void *sharedPointer)
+{
+    ${data_type} x = element;
+    ${element} /= 3.8;
+    ((${data_type}*)dataOut)[offset] = x;
+}
+
+__device__ ${store_type} d_storeCallbackPtr = CB_ConvertOutput;
+"""
+
+_store_callback_with_aux = r"""
+__device__ void CB_ConvertOutput(
+    void *dataOut, size_t offset, ${data_type} element,
+    void *callerInfo, void *sharedPointer)
+{
+    ${data_type} x = element;
+    ${element} /= *((${aux_type}*)callerInfo);
+    ((${data_type}*)dataOut)[offset] = x;
+}
+
+__device__ ${store_type} d_storeCallbackPtr = CB_ConvertOutput;
+"""
+
+
+def _set_load_cb(code, element, data_type, callback_type, aux_type=None):
+    """Fill the ``${...}`` slots of a load-callback source template."""
+    # ``callback_type`` is what the template refers to as ``${load_type}``
+    fields = dict(data_type=data_type, aux_type=aux_type, element=element)
+    fields["load_type"] = callback_type
+    template = string.Template(code)
+    return template.substitute(fields)
+
+
+def _set_store_cb(code, element, data_type, callback_type, aux_type=None):
+    """Fill the ``${...}`` slots of a store-callback source template."""
+    # ``callback_type`` is what the template refers to as ``${store_type}``
+    fields = dict(data_type=data_type, aux_type=aux_type, element=element)
+    fields["store_type"] = callback_type
+    template = string.Template(code)
+    return template.substitute(fields)
+
+
+@testing.parameterize(
+    *testing.product(
+        {
+            "n": [None, 5, 10, 15],
+            "shape": [(10, 7), (10,), (10, 10)],
+            "norm": [None, "ortho"],
+        }
+    )
+)
+@testing.with_requires("cython>=0.29.0")
+@pytest.mark.skipif(
+    not sys.platform.startswith("linux"),
+    reason="callbacks are only supported on Linux",
+)
+@pytest.mark.skipif(
+    cupy.cuda.runtime.is_hip, reason="hipFFT does not support callbacks"
+)
+class Test1dCallbacks:
+
+    def _test_load_helper(self, xp, dtype, fft_func):
+        fft = getattr(xp.fft, fft_func)
+        code = _load_callback
+        if dtype == np.complex64:
+            types = ("x.x", "cufftComplex", "cufftCallbackLoadC")
+        elif dtype == np.complex128:
+            types = ("x.x", "cufftDoubleComplex", "cufftCallbackLoadZ")
+        elif dtype == np.float32:
+            types = ("x", "cufftReal", "cufftCallbackLoadR")
+        else:
+            types = ("x", "cufftDoubleReal", "cufftCallbackLoadD")
+        cb_load = _set_load_cb(code, *types)
+
+        a = testing.shaped_random(self.shape, xp, dtype)
+        if xp is np:
+            a.real *= 2.5
+            out = fft(a, n=self.n, norm=self.norm)
+            if dtype in (np.float32, np.complex64):
+                if fft_func != "irfft":
+                    out = out.astype(np.complex64)
+                else:
+                    out = out.astype(np.float32)
+        else:
+            with use_temporary_cache_dir():
+                with xp.fft.config.set_cufft_callbacks(cb_load=cb_load):
+                    out = fft(a, n=self.n, norm=self.norm)
+
+        return out
+
+    @testing.for_complex_dtypes()
+    @testing.numpy_cupy_allclose(rtol=1e-4, atol=1e-7, contiguous_check=False)
+    def test_fft_load(self, xp, dtype):
+        return self._test_load_helper(xp, dtype, "fft")
+
+    @testing.for_complex_dtypes()
+    @testing.numpy_cupy_allclose(rtol=1e-4, atol=1e-7, contiguous_check=False)
+    def test_ifft_load(self, xp, dtype):
+        return self._test_load_helper(xp, dtype, "ifft")
+
+    @testing.for_float_dtypes(no_float16=True)
+    @testing.numpy_cupy_allclose(rtol=1e-4, atol=1e-7, contiguous_check=False)
+    def test_rfft_load(self, xp, dtype):
+        return self._test_load_helper(xp, dtype, "rfft")
+
+    @testing.for_complex_dtypes()
+    @testing.numpy_cupy_allclose(rtol=1e-4, atol=1e-7, contiguous_check=False)
+    def test_irfft_load(self, xp, dtype):
+        return self._test_load_helper(xp, dtype, "irfft")
+
+    def _test_store_helper(self, xp, dtype, fft_func):
+        fft = getattr(xp.fft, fft_func)
+        code = _store_callback
+        if dtype == np.complex64:
+            if fft_func != "irfft":
+                types = ("x.y", "cufftComplex", "cufftCallbackStoreC")
+            else:
+                types = ("x", "cufftReal", "cufftCallbackStoreR")
+        elif dtype == np.complex128:
+            if fft_func != "irfft":
+                types = ("x.y", "cufftDoubleComplex", "cufftCallbackStoreZ")
+            else:
+                types = ("x", "cufftDoubleReal", "cufftCallbackStoreD")
+        elif dtype == np.float32:
+            types = ("x.y", "cufftComplex", "cufftCallbackStoreC")
+        elif dtype == np.float64:
+            types = ("x.y", "cufftDoubleComplex", "cufftCallbackStoreZ")
+        cb_store = _set_store_cb(code, *types)
+
+        a = testing.shaped_random(self.shape, xp, dtype)
+        if xp is np:
+            out = fft(a, n=self.n, norm=self.norm)
+            if fft_func != "irfft":
+                out.imag /= 3.8
+                if dtype in (np.float32, np.complex64):
+                    out = out.astype(np.complex64)
+            else:
+                out /= 3.8
+                if dtype in (np.float32, np.complex64):
+                    out = out.astype(np.float32)
+        else:
+            with use_temporary_cache_dir():
+                with xp.fft.config.set_cufft_callbacks(cb_store=cb_store):
+                    out = fft(a, n=self.n, norm=self.norm)
+
+        return out
+
+    @testing.for_complex_dtypes()
+    @testing.numpy_cupy_allclose(rtol=1e-4, atol=1e-7, contiguous_check=False)
+    def test_fft_store(self, xp, dtype):
+        return self._test_store_helper(xp, dtype, "fft")
+
+    @testing.for_complex_dtypes()
+    @testing.numpy_cupy_allclose(rtol=1e-4, atol=1e-7, contiguous_check=False)
+    def test_ifft_store(self, xp, dtype):
+        return self._test_store_helper(xp, dtype, "ifft")
+
+    @testing.for_float_dtypes(no_float16=True)
+    @testing.numpy_cupy_allclose(rtol=1e-4, atol=1e-7, contiguous_check=False)
+    def test_rfft_store(self, xp, dtype):
+        return self._test_store_helper(xp, dtype, "rfft")
+
+    @testing.for_complex_dtypes()
+    @testing.numpy_cupy_allclose(rtol=1e-4, atol=1e-7, contiguous_check=False)
+    def test_irfft_store(self, xp, dtype):
+        return self._test_store_helper(xp, dtype, "irfft")
+
+    def _test_load_store_helper(self, xp, dtype, fft_func):
+        fft = getattr(xp.fft, fft_func)
+        load_code = _load_callback
+        store_code = _store_callback
+        if fft_func in ("fft", "ifft"):
+            if dtype == np.complex64:
+                load_types = ("x.x", "cufftComplex", "cufftCallbackLoadC")
+                store_types = ("x.y", "cufftComplex", "cufftCallbackStoreC")
+            else:
+                load_types = ("x.x", "cufftDoubleComplex", "cufftCallbackLoadZ")
+                store_types = (
+                    "x.y",
+                    "cufftDoubleComplex",
+                    "cufftCallbackStoreZ",
+                )
+        elif fft_func == "rfft":
+            if dtype == np.float32:
+                load_types = ("x", "cufftReal", "cufftCallbackLoadR")
+                store_types = ("x.y", "cufftComplex", "cufftCallbackStoreC")
+            else:
+                load_types = ("x", "cufftDoubleReal", "cufftCallbackLoadD")
+                store_types = (
+                    "x.y",
+                    "cufftDoubleComplex",
+                    "cufftCallbackStoreZ",
+                )
+        else:  # irfft
+            if dtype == np.complex64:
+                load_types = ("x.x", "cufftComplex", "cufftCallbackLoadC")
+                store_types = ("x", "cufftReal", "cufftCallbackStoreR")
+            else:
+                load_types = ("x.x", "cufftDoubleComplex", "cufftCallbackLoadZ")
+                store_types = ("x", "cufftDoubleReal", "cufftCallbackStoreD")
+        cb_load = _set_load_cb(load_code, *load_types)
+        cb_store = _set_store_cb(store_code, *store_types)
+
+        a = testing.shaped_random(self.shape, xp, dtype)
+        if xp is np:
+            a.real *= 2.5
+            out = fft(a, n=self.n, norm=self.norm)
+            if fft_func != "irfft":
+                out.imag /= 3.8
+                if dtype in (np.float32, np.complex64):
+                    out = out.astype(np.complex64)
+            else:
+                out /= 3.8
+                if dtype in (np.float32, np.complex64):
+                    out = out.astype(np.float32)
+        else:
+            with use_temporary_cache_dir():
+                with xp.fft.config.set_cufft_callbacks(
+                    cb_load=cb_load, cb_store=cb_store
+                ):
+                    out = fft(a, n=self.n, norm=self.norm)
+
+        return out
+
+    @testing.for_complex_dtypes()
+    @testing.numpy_cupy_allclose(rtol=1e-4, atol=1e-7, contiguous_check=False)
+    def test_fft_load_store(self, xp, dtype):
+        return self._test_load_store_helper(xp, dtype, "fft")
+
+    @testing.for_complex_dtypes()
+    @testing.numpy_cupy_allclose(rtol=1e-4, atol=1e-7, contiguous_check=False)
+    def test_ifft_load_store(self, xp, dtype):
+        return self._test_load_store_helper(xp, dtype, "ifft")
+
+    @testing.for_float_dtypes(no_float16=True)
+    @testing.numpy_cupy_allclose(rtol=1e-4, atol=1e-7, contiguous_check=False)
+    def test_rfft_load_store(self, xp, dtype):
+        return self._test_load_store_helper(xp, dtype, "rfft")
+
+    @testing.for_complex_dtypes()
+    @testing.numpy_cupy_allclose(rtol=1e-4, atol=1e-7, contiguous_check=False)
+    def test_irfft_load_store(self, xp, dtype):
+        return self._test_load_store_helper(xp, dtype, "irfft")
+
+    @testing.for_complex_dtypes()
+    @testing.numpy_cupy_allclose(rtol=1e-4, atol=1e-7, contiguous_check=False)
+    def test_fft_load_aux(self, xp, dtype):  # load callback fed by an aux array
+        fft = xp.fft.fft
+        c = _load_callback_with_aux2
+        if dtype == np.complex64:
+            cb_load = _set_load_cb(
+                c, "x.x", "cufftComplex", "cufftCallbackLoadC", "float"
+            )
+        else:
+            cb_load = _set_load_cb(
+                c, "x.x", "cufftDoubleComplex", "cufftCallbackLoadZ", "double"
+            )
+
+        a = testing.shaped_random(self.shape, xp, dtype)
+        out_last = self.n if self.n is not None else self.shape[-1]  # n= wins
+        out_shape = list(self.shape)
+        out_shape[-1] = out_last
+        last_min = min(self.shape[-1], out_last)  # part kept after crop/pad
+        b = xp.arange(np.prod(out_shape), dtype=xp.dtype(dtype).char.lower())
+        b = b.reshape(out_shape)  # one real aux value per resized element
+        if xp is np:  # reference path; presumably mirrors the callback
+            x = np.zeros(out_shape, dtype=dtype)
+            x[..., 0:last_min] = a[..., 0:last_min]
+            x.real *= b
+            out = fft(x, n=self.n, norm=self.norm)
+            if dtype in (np.float32, np.complex64):
+                out = out.astype(np.complex64)  # compare at single precision
+        else:
+            with use_temporary_cache_dir():
+                with xp.fft.config.set_cufft_callbacks(
+                    cb_load=cb_load, cb_load_aux_arr=b
+                ):
+                    out = fft(a, n=self.n, norm=self.norm)
+
+        return out
+
+    def _test_load_store_aux_helper(self, xp, dtype, fft_func):  # shared body
+        fft = getattr(xp.fft, fft_func)
+        load_code = _load_callback_with_aux
+        store_code = _store_callback_with_aux
+        if xp is cupy:  # aux arrays are only consumed in the non-NumPy branch
+            load_aux = xp.asarray(2.5, dtype=xp.dtype(dtype).char.lower())
+            store_aux = xp.asarray(3.8, dtype=xp.dtype(dtype).char.lower())
+
+        if fft_func in ("fft", "ifft"):  # complex input and complex output
+            if dtype == np.complex64:
+                load_types = (
+                    "x.x",
+                    "cufftComplex",
+                    "cufftCallbackLoadC",
+                    "float",
+                )
+                store_types = (
+                    "x.y",
+                    "cufftComplex",
+                    "cufftCallbackStoreC",
+                    "float",
+                )
+            else:
+                load_types = (
+                    "x.x",
+                    "cufftDoubleComplex",
+                    "cufftCallbackLoadZ",
+                    "double",
+                )
+                store_types = (
+                    "x.y",
+                    "cufftDoubleComplex",
+                    "cufftCallbackStoreZ",
+                    "double",
+                )
+        elif fft_func == "rfft":  # real types on load, complex on store
+            if dtype == np.float32:
+                load_types = ("x", "cufftReal", "cufftCallbackLoadR", "float")
+                store_types = (
+                    "x.y",
+                    "cufftComplex",
+                    "cufftCallbackStoreC",
+                    "float",
+                )
+            else:
+                load_types = (
+                    "x",
+                    "cufftDoubleReal",
+                    "cufftCallbackLoadD",
+                    "double",
+                )
+                store_types = (
+                    "x.y",
+                    "cufftDoubleComplex",
+                    "cufftCallbackStoreZ",
+                    "double",
+                )
+        else:  # irfft: complex types on load, real on store
+            if dtype == np.complex64:
+                load_types = (
+                    "x.x",
+                    "cufftComplex",
+                    "cufftCallbackLoadC",
+                    "float",
+                )
+                store_types = ("x", "cufftReal", "cufftCallbackStoreR", "float")
+            else:
+                load_types = (
+                    "x.x",
+                    "cufftDoubleComplex",
+                    "cufftCallbackLoadZ",
+                    "double",
+                )
+                store_types = (
+                    "x",
+                    "cufftDoubleReal",
+                    "cufftCallbackStoreD",
+                    "double",
+                )
+        cb_load = _set_load_cb(load_code, *load_types)  # tuple layout: see helper
+        cb_store = _set_store_cb(store_code, *store_types)
+
+        a = testing.shaped_random(self.shape, xp, dtype)
+        if xp is np:  # reference path: 2.5 / 3.8 equal load_aux / store_aux
+            a.real *= 2.5
+            out = fft(a, n=self.n, norm=self.norm)
+            if fft_func != "irfft":
+                out.imag /= 3.8  # only the imaginary part is rescaled
+                if dtype in (np.float32, np.complex64):
+                    out = out.astype(np.complex64)
+            else:
+                out /= 3.8
+                if dtype in (np.float32, np.complex64):
+                    out = out.astype(np.float32)
+        else:
+            with use_temporary_cache_dir():
+                with xp.fft.config.set_cufft_callbacks(
+                    cb_load=cb_load,
+                    cb_store=cb_store,
+                    cb_load_aux_arr=load_aux,
+                    cb_store_aux_arr=store_aux,
+                ):
+                    out = fft(a, n=self.n, norm=self.norm)
+
+        return out
+
+    @testing.for_complex_dtypes()
+    @testing.numpy_cupy_allclose(rtol=1e-4, atol=1e-7, contiguous_check=False)
+    def test_fft_load_store_aux(self, xp, dtype):  # forward complex transform
+        return self._test_load_store_aux_helper(xp, dtype, "fft")
+
+    @testing.for_complex_dtypes()
+    @testing.numpy_cupy_allclose(rtol=1e-4, atol=1e-7, contiguous_check=False)
+    def test_ifft_load_store_aux(self, xp, dtype):  # inverse complex transform
+        return self._test_load_store_aux_helper(xp, dtype, "ifft")
+
+    @testing.for_float_dtypes(no_float16=True)
+    @testing.numpy_cupy_allclose(rtol=1e-4, atol=1e-7, contiguous_check=False)
+    def test_rfft_load_store_aux(self, xp, dtype):  # real input dtypes only
+        return self._test_load_store_aux_helper(xp, dtype, "rfft")
+
+    @testing.for_complex_dtypes()
+    @testing.numpy_cupy_allclose(rtol=1e-4, atol=1e-7, contiguous_check=False)
+    def test_irfft_load_store_aux(self, xp, dtype):  # complex in, real out
+        return self._test_load_store_aux_helper(xp, dtype, "irfft")
+
+
+@testing.parameterize(
+    {"shape": (3, 4), "s": None, "axes": None, "norm": None},
+    {"shape": (3, 4), "s": (1, 5), "axes": (-2, -1), "norm": None},
+    {"shape": (3, 4), "s": None, "axes": (-2, -1), "norm": None},
+    {"shape": (3, 4), "s": None, "axes": None, "norm": "ortho"},
+    {"shape": (2, 3, 4), "s": None, "axes": None, "norm": None},
+    {"shape": (2, 3, 4), "s": (1, 4, 10), "axes": (-3, -2, -1), "norm": None},
+    {"shape": (2, 3, 4), "s": None, "axes": (-3, -2, -1), "norm": None},
+    {"shape": (2, 3, 4), "s": None, "axes": None, "norm": "ortho"},
+    {"shape": (2, 3, 4), "s": (2, 3), "axes": (0, 1, 2), "norm": "ortho"},
+)
+@testing.with_requires("cython>=0.29.0")
+@pytest.mark.skipif(
+    not sys.platform.startswith("linux"),
+    reason="callbacks are only supported on Linux",
+)
+@pytest.mark.skipif(
+    cupy.cuda.runtime.is_hip, reason="hipFFT does not support callbacks"
+)
+class TestNdCallbacks:  # n-d FFTs run under cuFFT load/store callbacks
+
+    def _test_load_helper(self, xp, dtype, fft_func):  # load callback only
+        fft = getattr(xp.fft, fft_func)
+        load_code = _load_callback
+        if dtype == np.complex64:
+            types = ("x.x", "cufftComplex", "cufftCallbackLoadC")
+        elif dtype == np.complex128:
+            types = ("x.x", "cufftDoubleComplex", "cufftCallbackLoadZ")
+        elif dtype == np.float32:
+            types = ("x", "cufftReal", "cufftCallbackLoadR")
+        else:
+            types = ("x", "cufftDoubleReal", "cufftCallbackLoadD")
+        cb_load = _set_load_cb(load_code, *types)
+
+        a = testing.shaped_random(self.shape, xp, dtype)
+        if xp is np:  # NumPy reference; 2.5 presumably matches _load_callback
+            a.real *= 2.5
+            out = fft(a, s=self.s, axes=self.axes, norm=self.norm)
+            if dtype in (np.float32, np.complex64):
+                if fft_func != "irfftn":
+                    out = out.astype(np.complex64)
+                else:
+                    out = out.astype(np.float32)
+        else:
+            with use_temporary_cache_dir():
+                with xp.fft.config.set_cufft_callbacks(cb_load=cb_load):
+                    out = fft(a, s=self.s, axes=self.axes, norm=self.norm)
+
+        return out
+
+    @testing.for_complex_dtypes()
+    @testing.numpy_cupy_allclose(
+        rtol=1e-4, atol=1e-7, accept_error=ValueError, contiguous_check=False
+    )
+    def test_fftn_load(self, xp, dtype):
+        return self._test_load_helper(xp, dtype, "fftn")
+
+    @testing.for_complex_dtypes()
+    @testing.numpy_cupy_allclose(
+        rtol=1e-4, atol=1e-7, accept_error=ValueError, contiguous_check=False
+    )
+    def test_ifftn_load(self, xp, dtype):
+        return self._test_load_helper(xp, dtype, "ifftn")
+
+    @testing.for_float_dtypes(no_float16=True)
+    @testing.numpy_cupy_allclose(
+        rtol=1e-4, atol=1e-7, accept_error=ValueError, contiguous_check=False
+    )
+    def test_rfftn_load(self, xp, dtype):
+        return self._test_load_helper(xp, dtype, "rfftn")
+
+    @testing.for_complex_dtypes()
+    @testing.numpy_cupy_allclose(
+        rtol=1e-4, atol=1e-7, accept_error=ValueError, contiguous_check=False
+    )
+    def test_irfftn_load(self, xp, dtype):
+        return self._test_load_helper(xp, dtype, "irfftn")
+
+    def _test_store_helper(self, xp, dtype, fft_func):  # store callback only
+        fft = getattr(xp.fft, fft_func)
+        store_code = _store_callback
+        if dtype == np.complex64:
+            if fft_func != "irfftn":
+                types = ("x.y", "cufftComplex", "cufftCallbackStoreC")
+            else:
+                types = ("x", "cufftReal", "cufftCallbackStoreR")
+        elif dtype == np.complex128:
+            if fft_func != "irfftn":
+                types = ("x.y", "cufftDoubleComplex", "cufftCallbackStoreZ")
+            else:
+                types = ("x", "cufftDoubleReal", "cufftCallbackStoreD")
+        elif dtype == np.float32:
+            types = ("x.y", "cufftComplex", "cufftCallbackStoreC")
+        elif dtype == np.float64:  # no else: other dtypes unhandled
+            types = ("x.y", "cufftDoubleComplex", "cufftCallbackStoreZ")
+        cb_store = _set_store_cb(store_code, *types)
+
+        a = testing.shaped_random(self.shape, xp, dtype)
+        if xp is np:  # NumPy reference; 3.8 presumably matches _store_callback
+            out = fft(a, s=self.s, axes=self.axes, norm=self.norm)
+            if fft_func != "irfftn":
+                out.imag /= 3.8
+                if dtype in (np.float32, np.complex64):
+                    out = out.astype(np.complex64)
+            else:
+                out /= 3.8
+                if dtype in (np.float32, np.complex64):
+                    out = out.astype(np.float32)
+        else:
+            with use_temporary_cache_dir():
+                with xp.fft.config.set_cufft_callbacks(cb_store=cb_store):
+                    out = fft(a, s=self.s, axes=self.axes, norm=self.norm)
+
+        return out
+
+    @testing.for_complex_dtypes()
+    @testing.numpy_cupy_allclose(
+        rtol=1e-4, atol=1e-7, accept_error=ValueError, contiguous_check=False
+    )
+    def test_fftn_store(self, xp, dtype):
+        return self._test_store_helper(xp, dtype, "fftn")
+
+    @testing.for_complex_dtypes()
+    @testing.numpy_cupy_allclose(
+        rtol=1e-4, atol=1e-7, accept_error=ValueError, contiguous_check=False
+    )
+    def test_ifftn_store(self, xp, dtype):
+        return self._test_store_helper(xp, dtype, "ifftn")
+
+    @testing.for_float_dtypes(no_float16=True)
+    @testing.numpy_cupy_allclose(
+        rtol=1e-4, atol=1e-7, accept_error=ValueError, contiguous_check=False
+    )
+    def test_rfftn_store(self, xp, dtype):
+        return self._test_store_helper(xp, dtype, "rfftn")
+
+    @testing.for_complex_dtypes()
+    @testing.numpy_cupy_allclose(
+        rtol=1e-4, atol=1e-7, accept_error=ValueError, contiguous_check=False
+    )
+    def test_irfftn_store(self, xp, dtype):
+        return self._test_store_helper(xp, dtype, "irfftn")
+
+    def _test_load_store_helper(self, xp, dtype, fft_func):  # both callbacks
+        fft = getattr(xp.fft, fft_func)
+        load_code = _load_callback
+        store_code = _store_callback
+        if fft_func in ("fftn", "ifftn"):
+            if dtype == np.complex64:
+                load_types = ("x.x", "cufftComplex", "cufftCallbackLoadC")
+                store_types = ("x.y", "cufftComplex", "cufftCallbackStoreC")
+            else:
+                load_types = ("x.x", "cufftDoubleComplex", "cufftCallbackLoadZ")
+                store_types = (
+                    "x.y",
+                    "cufftDoubleComplex",
+                    "cufftCallbackStoreZ",
+                )
+        elif fft_func == "rfftn":
+            if dtype == np.float32:
+                load_types = ("x", "cufftReal", "cufftCallbackLoadR")
+                store_types = ("x.y", "cufftComplex", "cufftCallbackStoreC")
+            else:
+                load_types = ("x", "cufftDoubleReal", "cufftCallbackLoadD")
+                store_types = (
+                    "x.y",
+                    "cufftDoubleComplex",
+                    "cufftCallbackStoreZ",
+                )
+        else:  # irfftn
+            if dtype == np.complex64:
+                load_types = ("x.x", "cufftComplex", "cufftCallbackLoadC")
+                store_types = ("x", "cufftReal", "cufftCallbackStoreR")
+            else:
+                load_types = ("x.x", "cufftDoubleComplex", "cufftCallbackLoadZ")
+                store_types = ("x", "cufftDoubleReal", "cufftCallbackStoreD")
+        cb_load = _set_load_cb(load_code, *load_types)
+        cb_store = _set_store_cb(store_code, *store_types)
+
+        a = testing.shaped_random(self.shape, xp, dtype)
+        if xp is np:  # NumPy reference: scale input by 2.5, output by 1/3.8
+            a.real *= 2.5
+            out = fft(a, s=self.s, axes=self.axes, norm=self.norm)
+            if fft_func != "irfftn":
+                out.imag /= 3.8
+                if dtype in (np.float32, np.complex64):
+                    out = out.astype(np.complex64)
+            else:
+                out /= 3.8
+                if dtype in (np.float32, np.complex64):
+                    out = out.astype(np.float32)
+        else:
+            with use_temporary_cache_dir():
+                with xp.fft.config.set_cufft_callbacks(
+                    cb_load=cb_load, cb_store=cb_store
+                ):
+                    out = fft(a, s=self.s, axes=self.axes, norm=self.norm)
+
+        return out
+
+    @testing.for_complex_dtypes()
+    @testing.numpy_cupy_allclose(
+        rtol=1e-4, atol=1e-7, accept_error=ValueError, contiguous_check=False
+    )
+    def test_fftn_load_store(self, xp, dtype):
+        return self._test_load_store_helper(xp, dtype, "fftn")
+
+    @testing.for_complex_dtypes()
+    @testing.numpy_cupy_allclose(
+        rtol=1e-4, atol=1e-7, accept_error=ValueError, contiguous_check=False
+    )
+    def test_ifftn_load_store(self, xp, dtype):
+        return self._test_load_store_helper(xp, dtype, "ifftn")
+
+    @testing.for_float_dtypes(no_float16=True)
+    @testing.numpy_cupy_allclose(
+        rtol=1e-4, atol=1e-7, accept_error=ValueError, contiguous_check=False
+    )
+    def test_rfftn_load_store(self, xp, dtype):
+        return self._test_load_store_helper(xp, dtype, "rfftn")
+
+    @testing.for_complex_dtypes()
+    @testing.numpy_cupy_allclose(
+        rtol=1e-4, atol=1e-7, accept_error=ValueError, contiguous_check=False
+    )
+    def test_irfftn_load_store(self, xp, dtype):
+        return self._test_load_store_helper(xp, dtype, "irfftn")
+
+    def _test_load_store_aux_helper(self, xp, dtype, fft_func):  # aux-driven
+        fft = getattr(xp.fft, fft_func)
+        load_code = _load_callback_with_aux
+        store_code = _store_callback_with_aux
+        if xp is cupy:  # aux arrays are only consumed in the non-NumPy branch
+            load_aux = xp.asarray(2.5, dtype=xp.dtype(dtype).char.lower())
+            store_aux = xp.asarray(3.8, dtype=xp.dtype(dtype).char.lower())
+
+        if fft_func in ("fftn", "ifftn"):
+            if dtype == np.complex64:
+                load_types = (
+                    "x.x",
+                    "cufftComplex",
+                    "cufftCallbackLoadC",
+                    "float",
+                )
+                store_types = (
+                    "x.y",
+                    "cufftComplex",
+                    "cufftCallbackStoreC",
+                    "float",
+                )
+            else:
+                load_types = (
+                    "x.x",
+                    "cufftDoubleComplex",
+                    "cufftCallbackLoadZ",
+                    "double",
+                )
+                store_types = (
+                    "x.y",
+                    "cufftDoubleComplex",
+                    "cufftCallbackStoreZ",
+                    "double",
+                )
+        elif fft_func == "rfftn":
+            if dtype == np.float32:
+                load_types = ("x", "cufftReal", "cufftCallbackLoadR", "float")
+                store_types = (
+                    "x.y",
+                    "cufftComplex",
+                    "cufftCallbackStoreC",
+                    "float",
+                )
+            else:
+                load_types = (
+                    "x",
+                    "cufftDoubleReal",
+                    "cufftCallbackLoadD",
+                    "double",
+                )
+                store_types = (
+                    "x.y",
+                    "cufftDoubleComplex",
+                    "cufftCallbackStoreZ",
+                    "double",
+                )
+        else:  # irfftn
+            if dtype == np.complex64:
+                load_types = (
+                    "x.x",
+                    "cufftComplex",
+                    "cufftCallbackLoadC",
+                    "float",
+                )
+                store_types = ("x", "cufftReal", "cufftCallbackStoreR", "float")
+            else:
+                load_types = (
+                    "x.x",
+                    "cufftDoubleComplex",
+                    "cufftCallbackLoadZ",
+                    "double",
+                )
+                store_types = (
+                    "x",
+                    "cufftDoubleReal",
+                    "cufftCallbackStoreD",
+                    "double",
+                )
+        cb_load = _set_load_cb(load_code, *load_types)
+        cb_store = _set_store_cb(store_code, *store_types)
+
+        a = testing.shaped_random(self.shape, xp, dtype)
+        if xp is np:  # reference path: 2.5 / 3.8 equal load_aux / store_aux
+            a.real *= 2.5
+            out = fft(a, s=self.s, axes=self.axes, norm=self.norm)
+            if fft_func != "irfftn":
+                out.imag /= 3.8
+                if dtype in (np.float32, np.complex64):
+                    out = out.astype(np.complex64)
+            else:
+                out /= 3.8
+                if dtype in (np.float32, np.complex64):
+                    out = out.astype(np.float32)
+        else:
+            with use_temporary_cache_dir():
+                with xp.fft.config.set_cufft_callbacks(
+                    cb_load=cb_load,
+                    cb_store=cb_store,
+                    cb_load_aux_arr=load_aux,
+                    cb_store_aux_arr=store_aux,
+                ):
+                    out = fft(a, s=self.s, axes=self.axes, norm=self.norm)
+
+        return out
+
+    @testing.for_complex_dtypes()
+    @testing.numpy_cupy_allclose(
+        rtol=1e-4, atol=1e-7, accept_error=ValueError, contiguous_check=False
+    )
+    def test_fftn_load_store_aux(self, xp, dtype):
+        return self._test_load_store_aux_helper(xp, dtype, "fftn")
+
+    @testing.for_complex_dtypes()
+    @testing.numpy_cupy_allclose(
+        rtol=1e-4, atol=1e-7, accept_error=ValueError, contiguous_check=False
+    )
+    def test_ifftn_load_store_aux(self, xp, dtype):
+        return self._test_load_store_aux_helper(xp, dtype, "ifftn")
+
+    @testing.for_float_dtypes(no_float16=True)
+    @testing.numpy_cupy_allclose(
+        rtol=1e-4, atol=1e-7, accept_error=ValueError, contiguous_check=False
+    )
+    def test_rfftn_load_store_aux(self, xp, dtype):
+        return self._test_load_store_aux_helper(xp, dtype, "rfftn")
+
+    @testing.for_complex_dtypes()
+    @testing.numpy_cupy_allclose(
+        rtol=1e-4, atol=1e-7, accept_error=ValueError, contiguous_check=False
+    )
+    def test_irfftn_load_store_aux(self, xp, dtype):
+        return self._test_load_store_aux_helper(xp, dtype, "irfftn")
diff --git a/dpnp/tests/third_party/cupy/fft_tests/test_fft.py b/dpnp/tests/third_party/cupy/fft_tests/test_fft.py
index 918b6e2a23f..d458c7fba30 100644
--- a/dpnp/tests/third_party/cupy/fft_tests/test_fft.py
+++ b/dpnp/tests/third_party/cupy/fft_tests/test_fft.py
@@ -1,4 +1,5 @@
 import functools
+import warnings
 
 import numpy as np
 import pytest
@@ -6,6 +7,7 @@
 import dpnp as cupy
 from dpnp.tests.helper import has_support_aspect64
 from dpnp.tests.third_party.cupy import testing
+from dpnp.tests.third_party.cupy.testing._loops import _wraps_partial
 
 
 @pytest.fixture
@@ -15,20 +17,102 @@ def skip_forward_backward(request):
             pytest.skip("forward/backward is supported by NumPy 1.20+")
 
 
+def nd_planning_states(states=(True, False), name="enable_nd"):
+    """Decorator that repeats a test once per nd-planning state.
+
+    Each state is only passed as a keyword; the config toggle is commented out.
+
+    Args:
+         states(iterable of bool): The boolean cases to test.
+         name(str): Argument name to which each state is passed.
+
+    This decorator adds a keyword argument specified by ``name``
+    to the test fixture. Then, it runs the fixture sequentially,
+    once per element of ``states``, passing that element to the
+    named argument.
+    """
+
+    def decorator(impl):
+        @_wraps_partial(impl, name)
+        def test_func(self, *args, **kw):
+            # get original global planning state
+            # planning_state = config.enable_nd_planning
+            try:
+                for nd_planning in states:
+                    try:
+                        # enable or disable nd planning
+                        # config.enable_nd_planning = nd_planning
+
+                        kw[name] = nd_planning
+                        impl(self, *args, **kw)
+                    except Exception:
+                        # report which state failed, then re-raise unchanged
+                        print(name, "is", nd_planning)
+                        raise
+            finally:
+                # restore original global planning state
+                # config.enable_nd_planning = planning_state
+                pass
+
+        return test_func
+
+    return decorator
+
+
+def multi_gpu_config(gpu_configs=None):
+    """Decorator for parameterized tests with different GPU configurations.
+
+    Args:
+        gpu_configs (list of list): The GPUs to test. Required; not ``None``.
+
+    .. note::
+        The decorated tests are skipped if no or only one GPU is available.
+    """
+
+    def decorator(impl):
+        @functools.wraps(impl)
+        def test_func(self, *args, **kw):
+            use_multi_gpus = config.use_multi_gpus
+            _devices = config._devices  # NOTE(review): is config imported?
+
+            try:
+                for gpus in gpu_configs:
+                    try:
+                        nGPUs = len(gpus)
+                        assert nGPUs >= 2, "Must use at least two gpus"
+                        config.use_multi_gpus = True
+                        config.set_cufft_gpus(gpus)
+                        self.gpus = gpus
+
+                        impl(self, *args, **kw)
+                    except Exception:
+                        print("GPU config is:", gpus)
+                        raise
+            finally:
+                config.use_multi_gpus = use_multi_gpus
+                config._devices = _devices
+                # gpus may never have been set; keep the original failure
+                vars(self).pop("gpus", None)
+
+        return test_func
+
+    return decorator
+
+
+# @testing.with_requires("numpy>=2.0")
 @pytest.mark.usefixtures("skip_forward_backward")
 @testing.parameterize(
     *testing.product(
         {
             "n": [None, 0, 5, 10, 15],
             "shape": [(0,), (10, 0), (10,), (10, 10)],
-            "norm": [None, "backward", "ortho", "forward", ""],
+            "norm": [None, "backward", "ortho", "forward"],
         }
     )
 )
 class TestFft:
+
     @testing.for_all_dtypes()
     @testing.numpy_cupy_allclose(
-        rtol=1e-4,
+        rtol=1e-3,
         atol=1e-7,
         accept_error=ValueError,
         contiguous_check=False,
@@ -36,17 +120,11 @@ class TestFft:
     )
     def test_fft(self, xp, dtype):
         a = testing.shaped_random(self.shape, xp, dtype)
-        out = xp.fft.fft(a, n=self.n, norm=self.norm)
-
-        # np.fft.fft always returns np.complex128
-        if xp is np and dtype in [np.float16, np.float32, np.complex64]:
-            out = out.astype(np.complex64)
-
-        return out
+        return xp.fft.fft(a, n=self.n, norm=self.norm)
 
     @testing.for_all_dtypes()
     @testing.numpy_cupy_allclose(
-        rtol=1e-4,
+        rtol=1e-3,
         atol=1e-7,
         accept_error=ValueError,
         contiguous_check=False,
@@ -57,14 +135,10 @@ def test_fft(self, xp, dtype):
     @testing.with_requires("numpy!=1.17.1")
     def test_ifft(self, xp, dtype):
         a = testing.shaped_random(self.shape, xp, dtype)
-        out = xp.fft.ifft(a, n=self.n, norm=self.norm)
-
-        if xp is np and dtype in [np.float16, np.float32, np.complex64]:
-            out = out.astype(np.complex64)
-
-        return out
+        return xp.fft.ifft(a, n=self.n, norm=self.norm)
 
 
+# @testing.with_requires("numpy>=2.0")
 @testing.parameterize(
     *testing.product(
         {
@@ -75,10 +149,11 @@ def test_ifft(self, xp, dtype):
     )
 )
 class TestFftOrder:
+
     @testing.for_all_dtypes()
     @testing.numpy_cupy_allclose(
-        rtol=1e-4,
-        atol=1e-6,
+        rtol=1e-3,
+        atol=1e-7,
         accept_error=ValueError,
         contiguous_check=False,
         type_check=has_support_aspect64(),
@@ -87,17 +162,11 @@ def test_fft(self, xp, dtype):
         a = testing.shaped_random(self.shape, xp, dtype)
         if self.data_order == "F":
             a = xp.asfortranarray(a)
-        out = xp.fft.fft(a, axis=self.axis)
-
-        # np.fft.fft always returns np.complex128
-        if xp is np and dtype in [np.float16, np.float32, np.complex64]:
-            out = out.astype(np.complex64)
-
-        return out
+        return xp.fft.fft(a, axis=self.axis)
 
     @testing.for_all_dtypes()
     @testing.numpy_cupy_allclose(
-        rtol=1e-4,
+        rtol=1e-3,
         atol=1e-7,
         accept_error=ValueError,
         contiguous_check=False,
@@ -107,123 +176,308 @@ def test_ifft(self, xp, dtype):
         a = testing.shaped_random(self.shape, xp, dtype)
         if self.data_order == "F":
             a = xp.asfortranarray(a)
-        out = xp.fft.ifft(a, axis=self.axis)
+        return xp.fft.ifft(a, axis=self.axis)
+
+
+# See #3757 and NVIDIA internal ticket 3093094
+def _skip_multi_gpu_bug(shape, gpus):
+    # Skip the calling test on CUDA runtimes [11000, 11020), where a bug
+    # (expected to be fixed by CUDA 11.2) is triggered by
+    # - batch = 1 (checked here as a 1-D shape)
+    # - gpus = [1, 0]
+    cuda_version = cupy.cuda.runtime.runtimeGetVersion()
+    if not 11000 <= cuda_version < 11020:
+        return
+    if len(shape) == 1 and gpus == [1, 0]:
+        pytest.skip("avoid CUDA 11 bug")
+
+
+# Almost identical to the TestFft class, except that
+# 1. multi-GPU cuFFT is used
+# 2. the tested parameter combinations are adjusted to meet the requirements
+@testing.with_requires("numpy>=2.0")
+@pytest.mark.usefixtures("skip_forward_backward")
+@testing.parameterize(
+    *testing.product(
+        {
+            "n": [None, 0, 64],
+            "shape": [(0,), (0, 10), (64,), (4, 64)],
+            "norm": [None, "backward", "ortho", "forward"],
+        }
+    )
+)
+@pytest.mark.skip("multi GPU is not supported")
+@testing.multi_gpu(2)
+class TestMultiGpuFft:
 
-        if xp is np and dtype in [np.float16, np.float32, np.complex64]:
-            out = out.astype(np.complex64)
+    @multi_gpu_config(gpu_configs=[[0, 1], [1, 0]])
+    @testing.for_complex_dtypes()
+    @testing.numpy_cupy_allclose(
+        rtol=1e-3, atol=1e-7, accept_error=ValueError, contiguous_check=False
+    )
+    def test_fft(self, xp, dtype):
+        _skip_multi_gpu_bug(self.shape, self.gpus)
 
-        return out
+        a = testing.shaped_random(self.shape, xp, dtype)
+        return xp.fft.fft(a, n=self.n, norm=self.norm)
+
+    @multi_gpu_config(gpu_configs=[[0, 1], [1, 0]])
+    @testing.for_complex_dtypes()
+    @testing.numpy_cupy_allclose(
+        rtol=1e-3, atol=1e-7, accept_error=ValueError, contiguous_check=False
+    )
+    # NumPy 1.17.0 and 1.17.1 raise ZeroDivisionError due to a bug
+    @testing.with_requires("numpy!=1.17.0")
+    @testing.with_requires("numpy!=1.17.1")
+    def test_ifft(self, xp, dtype):
+        _skip_multi_gpu_bug(self.shape, self.gpus)
+
+        a = testing.shaped_random(self.shape, xp, dtype)
+        return xp.fft.ifft(a, n=self.n, norm=self.norm)
+
+
+# Almost identical to the TestFftOrder class, except that
+# 1. multi-GPU cuFFT is used
+# 2. the tested parameter combinations are adjusted to meet the requirements
+@testing.with_requires("numpy>=2.0")
+@testing.parameterize(
+    *testing.product(
+        {
+            "shape": [(10, 10), (10, 5, 10)],
+            "data_order": ["F", "C"],
+            "axis": [0, 1, -1],
+        }
+    )
+)
+@pytest.mark.skip("multi GPU is not supported")
+@testing.multi_gpu(2)
+class TestMultiGpuFftOrder:
+    @multi_gpu_config(gpu_configs=[[0, 1], [1, 0]])
+    @testing.for_complex_dtypes()
+    @testing.numpy_cupy_allclose(
+        rtol=1e-3, atol=1e-7, accept_error=ValueError, contiguous_check=False
+    )
+    def test_fft(self, xp, dtype):
+        _skip_multi_gpu_bug(self.shape, self.gpus)
+
+        a = testing.shaped_random(self.shape, xp, dtype)
+        if self.data_order == "F":
+            a = xp.asfortranarray(a)
+        return xp.fft.fft(a, axis=self.axis)
 
+    @multi_gpu_config(gpu_configs=[[0, 1], [1, 0]])
+    @testing.for_complex_dtypes()
+    @testing.numpy_cupy_allclose(
+        rtol=1e-3, atol=1e-7, accept_error=ValueError, contiguous_check=False
+    )
+    def test_ifft(self, xp, dtype):
+        _skip_multi_gpu_bug(self.shape, self.gpus)
 
+        a = testing.shaped_random(self.shape, xp, dtype)
+        if self.data_order == "F":
+            a = xp.asfortranarray(a)
+        return xp.fft.ifft(a, axis=self.axis)
+
+
+@pytest.mark.skip("default FFT function is not supported")
+@testing.with_requires("numpy>=2.0")
+class TestDefaultPlanType:  # which of _fft / _fftn _default_fft_func picks
+
+    @nd_planning_states()
+    def test_default_fft_func(self, enable_nd):  # enable_nd: injected state
+        # test cases where nd cuFFT plan is possible
+        ca = cupy.ones((16, 16, 16))
+        for axes in [(0, 1), (1, 2), None, (0, 1, 2)]:
+            fft_func = _default_fft_func(ca, axes=axes)
+            if enable_nd:
+                # TODO(leofang): test newer ROCm versions
+                if axes == (0, 1) and cupy.cuda.runtime.is_hip:
+                    assert fft_func is _fft
+                else:
+                    assert fft_func is _fftn
+            else:
+                assert fft_func is _fft
+
+        # only a single axis is transformed -> 1d plan preferred
+        for axes in [(0,), (1,), (2,)]:
+            assert _default_fft_func(ca, axes=axes) is _fft
+
+        # non-contiguous axes -> nd plan not possible
+        assert _default_fft_func(ca, axes=(0, 2)) is _fft
+
+        # >3 axes transformed -> nd plan not possible
+        ca = cupy.ones((2, 4, 6, 8))
+        assert _default_fft_func(ca) is _fft
+
+        # first or last axis not included -> nd plan not possible
+        assert _default_fft_func(ca, axes=(1,)) is _fft
+
+        # for rfftn
+        ca = cupy.random.random((4, 2, 6))
+        for s, axes in zip([(3, 4), None, (8, 7, 5)], [(-2, -1), (0, 1), None]):
+            fft_func = _default_fft_func(ca, s=s, axes=axes, value_type="R2C")
+            if enable_nd:
+                # TODO(leofang): test newer ROCm versions
+                if axes == (0, 1) and cupy.cuda.runtime.is_hip:
+                    assert fft_func is _fft
+                else:
+                    assert fft_func is _fftn
+            else:
+                assert fft_func is _fft
+
+        # nd plan not possible if last axis is not 0 or ndim-1
+        assert _default_fft_func(ca, axes=(2, 1), value_type="R2C") is _fft
+
+        # for irfftn
+        ca = cupy.random.random((4, 2, 6)).astype(cupy.complex128)
+        for s, axes in zip([(3, 4), None, (8, 7, 5)], [(-2, -1), (0, 1), None]):
+            fft_func = _default_fft_func(ca, s=s, axes=axes, value_type="C2R")
+            if enable_nd:
+                # To get around hipFFT's bug, we don't use PlanNd for C2R
+                # TODO(leofang): test newer ROCm versions
+                if cupy.cuda.runtime.is_hip:
+                    assert fft_func is _fft
+                else:
+                    assert fft_func is _fftn
+            else:
+                assert fft_func is _fft
+
+        # nd plan not possible if last axis is not 0 or ndim-1
+        assert _default_fft_func(ca, axes=(2, 1), value_type="C2R") is _fft
+
+
+@pytest.mark.skip("memory management is not supported")
+@testing.with_requires("numpy>=2.0")
+@testing.slow
+class TestFftAllocate:
+
+    def test_fft_allocate(self):
+        # Check CuFFTError is not raised when the GPU memory is enough.
+        # See https://github.com/cupy/cupy/issues/1063
+        # TODO(mizuno): Simplify "a" after memory compaction is implemented.
+        a = []
+        for i in range(10):
+            a.append(cupy.empty(100000000))
+        del a
+        b = cupy.empty(100000007, dtype=cupy.float32)
+        cupy.fft.fft(b)
+        # Free huge memory for slow test
+        del b
+        cupy.get_default_memory_pool().free_all_blocks()
+        # Clean up FFT plan cache
+        cupy.fft.config.clear_plan_cache()
+
+
+# @testing.with_requires("numpy>=2.0")
 @pytest.mark.usefixtures("skip_forward_backward")
 @testing.parameterize(
     *(
         testing.product_dict(
             [
-                # some of the following cases are modified, since in NumPy 2.0.0
-                # `s` must contain only integer `s`, not None values, and
-                # If `s` is not None, `axes` must not be None either.
                 {"shape": (3, 4), "s": None, "axes": None},
-                {"shape": (3, 4), "s": (1, 4), "axes": (0, 1)},
-                {"shape": (3, 4), "s": (1, 5), "axes": (0, 1)},
+                {"shape": (3, 4), "s": (1, 5), "axes": (-2, -1)},
                 {"shape": (3, 4), "s": None, "axes": (-2, -1)},
                 {"shape": (3, 4), "s": None, "axes": (-1, -2)},
                 # {"shape": (3, 4), "s": None, "axes": (0,)}, # mkl_fft gh-109
+                {"shape": (3, 4), "s": None, "axes": None},
                 # {"shape": (3, 4), "s": None, "axes": ()}, # mkl_fft gh-108
                 {"shape": (2, 3, 4), "s": None, "axes": None},
-                {"shape": (2, 3, 4), "s": (1, 4, 4), "axes": (0, 1, 2)},
-                {"shape": (2, 3, 4), "s": (1, 4, 10), "axes": (0, 1, 2)},
+                {"shape": (2, 3, 4), "s": (1, 4, 10), "axes": (-2, -1)},
                 {"shape": (2, 3, 4), "s": None, "axes": (-3, -2, -1)},
                 {"shape": (2, 3, 4), "s": None, "axes": (-1, -2, -3)},
                 # {"shape": (2, 3, 4), "s": None, "axes": (0, 1)}, # mkl_fft gh-109
+                {"shape": (2, 3, 4), "s": None, "axes": None},
                 # {"shape": (2, 3, 4), "s": None, "axes": ()}, # mkl_fft gh-108
                 # {"shape": (2, 3, 4), "s": (2, 3), "axes": (0, 1, 2)}, # mkl_fft gh-109
                 {"shape": (2, 3, 4, 5), "s": None, "axes": None},
                 # {"shape": (0, 5), "s": None, "axes": None}, # mkl_fft gh-110
                 # {"shape": (2, 0, 5), "s": None, "axes": None}, # mkl_fft gh-110
                 # {"shape": (0, 0, 5), "s": None, "axes": None}, # mkl_fft gh-110
-                {"shape": (3, 4), "s": (0, 5), "axes": (0, 1)},
-                {"shape": (3, 4), "s": (1, 0), "axes": (0, 1)},
+                {"shape": (3, 4), "s": (0, 5), "axes": (-2, -1)},
+                {"shape": (3, 4), "s": (1, 0), "axes": (-2, -1)},
             ],
-            testing.product(
-                {"norm": [None, "backward", "ortho", "forward", ""]}
-            ),
+            testing.product({"norm": [None, "backward", "ortho", "forward"]}),
         )
     )
 )
 class TestFft2:
+
+    @nd_planning_states()
     @testing.for_orders("CF")
     @testing.for_all_dtypes()
     @testing.numpy_cupy_allclose(
-        rtol=1e-4,
+        rtol=1e-3,
         atol=1e-7,
         accept_error=ValueError,
         contiguous_check=False,
         type_check=has_support_aspect64(),
     )
-    def test_fft2(self, xp, dtype, order):
+    def test_fft2(self, xp, dtype, order, enable_nd):
+        # assert config.enable_nd_planning == enable_nd
         a = testing.shaped_random(self.shape, xp, dtype)
         if order == "F":
             a = xp.asfortranarray(a)
-        out = xp.fft.fft2(a, s=self.s, axes=self.axes, norm=self.norm)
+
+        with warnings.catch_warnings():
+            # NumPy 2.0 deprecates passing `s` while `axes` is None
+            warnings.simplefilter("ignore", DeprecationWarning)
+            out = xp.fft.fft2(a, s=self.s, axes=self.axes, norm=self.norm)
 
         if self.axes is not None and not self.axes:
             assert out is a
             return out
 
-        if xp is np and dtype in [np.float16, np.float32, np.complex64]:
-            out = out.astype(np.complex64)
-
         return out
 
+    @nd_planning_states()
     @testing.for_orders("CF")
     @testing.for_all_dtypes()
     @testing.numpy_cupy_allclose(
-        rtol=1e-4,
+        rtol=1e-3,
         atol=1e-7,
         accept_error=ValueError,
         contiguous_check=False,
         type_check=has_support_aspect64(),
     )
-    def test_ifft2(self, xp, dtype, order):
+    def test_ifft2(self, xp, dtype, order, enable_nd):
+        # assert config.enable_nd_planning == enable_nd
         a = testing.shaped_random(self.shape, xp, dtype)
         if order == "F":
             a = xp.asfortranarray(a)
-        out = xp.fft.ifft2(a, s=self.s, axes=self.axes, norm=self.norm)
+
+        with warnings.catch_warnings():
+            # NumPy 2.0 deprecates passing `s` while `axes` is None
+            warnings.simplefilter("ignore", DeprecationWarning)
+            out = xp.fft.ifft2(a, s=self.s, axes=self.axes, norm=self.norm)
 
         if self.axes is not None and not self.axes:
             assert out is a
             return out
 
-        if xp is np and dtype in [np.float16, np.float32, np.complex64]:
-            out = out.astype(np.complex64)
-
         return out
 
 
+# @testing.with_requires("numpy>=2.0")
 @pytest.mark.usefixtures("skip_forward_backward")
 @testing.parameterize(
     *(
         testing.product_dict(
             [
-                # some of the following cases are modified, since in NumPy 2.0.0
-                # `s` must contain only integer `s`, not None values, and
-                # If `s` is not None, `axes` must not be None either.
                 {"shape": (3, 4), "s": None, "axes": None},
-                {"shape": (3, 4), "s": (1, 4), "axes": (0, 1)},
-                {"shape": (3, 4), "s": (1, 5), "axes": (0, 1)},
+                {"shape": (3, 4), "s": (1, 5), "axes": (-2, -1)},
                 {"shape": (3, 4), "s": None, "axes": (-2, -1)},
                 {"shape": (3, 4), "s": None, "axes": (-1, -2)},
                 {"shape": (3, 4), "s": None, "axes": [-1, -2]},
                 # {"shape": (3, 4), "s": None, "axes": (0,)}, # mkl_fft gh-109
                 # {"shape": (3, 4), "s": None, "axes": ()}, # mkl_fft gh-108
+                {"shape": (3, 4), "s": None, "axes": None},
                 {"shape": (2, 3, 4), "s": None, "axes": None},
-                {"shape": (2, 3, 4), "s": (1, 4, 4), "axes": (0, 1, 2)},
-                {"shape": (2, 3, 4), "s": (1, 4, 10), "axes": (0, 1, 2)},
+                {"shape": (2, 3, 4), "s": (1, 4, 10), "axes": (-3, -2, -1)},
                 {"shape": (2, 3, 4), "s": None, "axes": (-3, -2, -1)},
                 {"shape": (2, 3, 4), "s": None, "axes": (-1, -2, -3)},
                 # {"shape": (2, 3, 4), "s": None, "axes": (-1, -3)}, # mkl_fft gh-109
                 # {"shape": (2, 3, 4), "s": None, "axes": (0, 1)}, # mkl_fft gh-109
+                {"shape": (2, 3, 4), "s": None, "axes": None},
                 # {"shape": (2, 3, 4), "s": None, "axes": ()}, # mkl_fft gh-108
                 # {"shape": (2, 3, 4), "s": (2, 3), "axes": (0, 1, 2)}, # mkl_fft gh-109
                 {"shape": (2, 3, 4), "s": (4, 3, 2), "axes": (2, 0, 1)},
@@ -232,62 +486,402 @@ def test_ifft2(self, xp, dtype, order):
                 # {"shape": (2, 0, 5), "s": None, "axes": None}, # mkl_fft gh-110
                 # {"shape": (0, 0, 5), "s": None, "axes": None}, # mkl_fft gh-110
             ],
-            testing.product(
-                {"norm": [None, "backward", "ortho", "forward", ""]}
-            ),
+            testing.product({"norm": [None, "backward", "ortho", "forward"]}),
         )
     )
 )
 class TestFftn:
+
+    @nd_planning_states()
     @testing.for_orders("CF")
     @testing.for_all_dtypes()
     @testing.numpy_cupy_allclose(
-        rtol=1e-4,
+        rtol=1e-3,
         atol=1e-7,
         accept_error=ValueError,
         contiguous_check=False,
         type_check=has_support_aspect64(),
     )
-    def test_fftn(self, xp, dtype, order):
+    def test_fftn(self, xp, dtype, order, enable_nd):
+        # assert config.enable_nd_planning == enable_nd
         a = testing.shaped_random(self.shape, xp, dtype)
         if order == "F":
             a = xp.asfortranarray(a)
-        out = xp.fft.fftn(a, s=self.s, axes=self.axes, norm=self.norm)
+
+        with warnings.catch_warnings():
+            # NumPy 2.0 deprecates passing `s` while `axes` is None
+            warnings.simplefilter("ignore", DeprecationWarning)
+            out = xp.fft.fftn(a, s=self.s, axes=self.axes, norm=self.norm)
 
         if self.axes is not None and not self.axes:
             assert out is a
             return out
 
-        if xp is np and dtype in [np.float16, np.float32, np.complex64]:
-            out = out.astype(np.complex64)
-
         return out
 
+    @nd_planning_states()
     @testing.for_orders("CF")
     @testing.for_all_dtypes()
     @testing.numpy_cupy_allclose(
-        rtol=1e-4,
+        rtol=1e-3,
         atol=1e-7,
         accept_error=ValueError,
         contiguous_check=False,
         type_check=has_support_aspect64(),
     )
-    def test_ifftn(self, xp, dtype, order):
+    def test_ifftn(self, xp, dtype, order, enable_nd):
+        # assert config.enable_nd_planning == enable_nd
         a = testing.shaped_random(self.shape, xp, dtype)
         if order == "F":
             a = xp.asfortranarray(a)
-        out = xp.fft.ifftn(a, s=self.s, axes=self.axes, norm=self.norm)
+
+        with warnings.catch_warnings():
+            # NumPy 2.0 deprecates passing `s` while `axes` is None
+            warnings.simplefilter("ignore", DeprecationWarning)
+            out = xp.fft.ifftn(a, s=self.s, axes=self.axes, norm=self.norm)
 
         if self.axes is not None and not self.axes:
             assert out is a
             return out
 
-        if xp is np and dtype in [np.float16, np.float32, np.complex64]:
-            out = out.astype(np.complex64)
-
         return out
 
 
+@pytest.mark.skip("get_fft_plan() is not supported")
+@testing.with_requires("numpy>=2.0")
+@pytest.mark.usefixtures("skip_forward_backward")
+@testing.parameterize(
+    *(
+        testing.product_dict(
+            [
+                {"shape": (3, 4), "s": None, "axes": None},
+                {"shape": (3, 4), "s": (1, 5), "axes": (-2, -1)},
+                {"shape": (3, 4), "s": None, "axes": (-2, -1)},
+                {"shape": (3, 4), "s": None, "axes": (-1, -2)},
+                {"shape": (3, 4), "s": None, "axes": (0,)},
+                {"shape": (3, 4), "s": None, "axes": None},
+                {"shape": (2, 3, 4), "s": None, "axes": None},
+                {"shape": (2, 3, 4), "s": (1, 4, 10), "axes": (-3, -2, -1)},
+                {"shape": (2, 3, 4), "s": None, "axes": (-3, -2, -1)},
+                {"shape": (2, 3, 4), "s": None, "axes": (-1, -2, -3)},
+                {"shape": (2, 3, 4), "s": None, "axes": (0, 1)},
+                {"shape": (2, 3, 4), "s": None, "axes": None},
+                {"shape": (2, 3, 4), "s": (2, 3), "axes": (-2, -1)},
+                {"shape": (2, 3, 4), "s": (2, 3), "axes": (0, 1, 2)},
+                {"shape": (0, 5), "s": None, "axes": None},
+                {"shape": (2, 0, 5), "s": None, "axes": None},
+                {"shape": (0, 0, 5), "s": None, "axes": None},
+            ],
+            testing.product({"norm": [None, "backward", "ortho", "forward"]}),
+        )
+    )
+)
+class TestPlanCtxManagerFftn:
+
+    @pytest.fixture(autouse=True)
+    def skip_buggy(self):
+        if cupy.cuda.runtime.is_hip:
+            # TODO(leofang): test newer ROCm versions
+            if self.axes == (0, 1) and self.shape == (2, 3, 4):
+                pytest.skip(
+                    "hipFFT's PlanNd for this case "
+                    "is buggy, so Plan1d is generated "
+                    "instead"
+                )
+
+    @nd_planning_states()
+    @testing.for_complex_dtypes()
+    @testing.numpy_cupy_allclose(
+        rtol=1e-3, atol=1e-7, accept_error=ValueError, contiguous_check=False
+    )
+    def test_fftn(self, xp, dtype, enable_nd):
+        assert config.enable_nd_planning == enable_nd
+        a = testing.shaped_random(self.shape, xp, dtype)
+
+        if xp is np:
+            return xp.fft.fftn(a, s=self.s, axes=self.axes, norm=self.norm)
+
+        from cupyx.scipy.fftpack import get_fft_plan
+
+        plan = get_fft_plan(a, self.s, self.axes)
+        with plan:
+            return xp.fft.fftn(a, s=self.s, axes=self.axes, norm=self.norm)
+
+    @nd_planning_states()
+    @testing.for_complex_dtypes()
+    @testing.numpy_cupy_allclose(
+        rtol=1e-3, atol=1e-7, accept_error=ValueError, contiguous_check=False
+    )
+    def test_ifftn(self, xp, dtype, enable_nd):
+        assert config.enable_nd_planning == enable_nd
+        a = testing.shaped_random(self.shape, xp, dtype)
+
+        if xp is np:
+            return xp.fft.ifftn(a, s=self.s, axes=self.axes, norm=self.norm)
+
+        from cupyx.scipy.fftpack import get_fft_plan
+
+        plan = get_fft_plan(a, self.s, self.axes)
+        with plan:
+            return xp.fft.ifftn(a, s=self.s, axes=self.axes, norm=self.norm)
+
+    @nd_planning_states()
+    @testing.for_complex_dtypes()
+    def test_fftn_error_on_wrong_plan(self, dtype, enable_nd):
+        if 0 in self.shape:
+            pytest.skip("0 in shape")
+        # This test ensures the context manager plan is picked up
+
+        from cupy.fft import fftn
+        from cupyx.scipy.fftpack import get_fft_plan
+
+        assert config.enable_nd_planning == enable_nd
+
+        # can't get a plan, so skip
+        if self.axes is not None:
+            if self.s is not None:
+                if len(self.s) != len(self.axes):
+                    return
+            elif len(self.shape) != len(self.axes):
+                return
+
+        a = testing.shaped_random(self.shape, cupy, dtype)
+        bad_in_shape = tuple(2 * i for i in self.shape)
+        if self.s is None:
+            bad_out_shape = bad_in_shape
+        else:
+            bad_out_shape = tuple(2 * i for i in self.s)
+        b = testing.shaped_random(bad_in_shape, cupy, dtype)
+        plan_wrong = get_fft_plan(b, bad_out_shape, self.axes)
+
+        with pytest.raises(ValueError) as ex, plan_wrong:
+            fftn(a, s=self.s, axes=self.axes, norm=self.norm)
+        # targeting a particular error
+        assert "The cuFFT plan and a.shape do not match" in str(ex.value)
+
+
+@pytest.mark.skip("get_fft_plan() is not supported")
+@testing.with_requires("numpy>=2.0")
+@pytest.mark.usefixtures("skip_forward_backward")
+@testing.parameterize(
+    *testing.product(
+        {
+            "n": [None, 5, 10, 15],
+            "shape": [
+                (10,),
+            ],
+            "norm": [None, "backward", "ortho", "forward"],
+        }
+    )
+)
+class TestPlanCtxManagerFft:
+
+    @testing.for_complex_dtypes()
+    @testing.numpy_cupy_allclose(
+        rtol=1e-3, atol=1e-7, accept_error=ValueError, contiguous_check=False
+    )
+    def test_fft(self, xp, dtype):
+        a = testing.shaped_random(self.shape, xp, dtype)
+
+        if xp is np:
+            return xp.fft.fft(a, n=self.n, norm=self.norm)
+
+        from cupyx.scipy.fftpack import get_fft_plan
+
+        shape = (self.n,) if self.n is not None else None
+        plan = get_fft_plan(a, shape=shape)
+        assert isinstance(plan, cupy.cuda.cufft.Plan1d)
+        with plan:
+            return xp.fft.fft(a, n=self.n, norm=self.norm)
+
+    @testing.for_complex_dtypes()
+    @testing.numpy_cupy_allclose(
+        rtol=1e-3, atol=1e-7, accept_error=ValueError, contiguous_check=False
+    )
+    def test_ifft(self, xp, dtype):
+        a = testing.shaped_random(self.shape, xp, dtype)
+
+        if xp is np:
+            return xp.fft.ifft(a, n=self.n, norm=self.norm)
+
+        from cupyx.scipy.fftpack import get_fft_plan
+
+        shape = (self.n,) if self.n is not None else None
+        plan = get_fft_plan(a, shape=shape)
+        assert isinstance(plan, cupy.cuda.cufft.Plan1d)
+        with plan:
+            return xp.fft.ifft(a, n=self.n, norm=self.norm)
+
+    @testing.for_complex_dtypes()
+    def test_fft_error_on_wrong_plan(self, dtype):
+        # This test ensures the context manager plan is picked up
+
+        from cupy.fft import fft
+        from cupyx.scipy.fftpack import get_fft_plan
+
+        a = testing.shaped_random(self.shape, cupy, dtype)
+        bad_shape = tuple(5 * i for i in self.shape)
+        b = testing.shaped_random(bad_shape, cupy, dtype)
+        plan_wrong = get_fft_plan(b)
+        assert isinstance(plan_wrong, cupy.cuda.cufft.Plan1d)
+
+        with pytest.raises(ValueError) as ex, plan_wrong:
+            fft(a, n=self.n, norm=self.norm)
+        # targeting a particular error
+        assert "Target array size does not match the plan." in str(ex.value)
+
+
+# Almost identical to the TestPlanCtxManagerFft class, except that
+# 1. multi-GPU cuFFT is used
+# 2. the tested parameter combinations are adjusted to meet the requirements
+@testing.with_requires("numpy>=2.0")
+@pytest.mark.usefixtures("skip_forward_backward")
+@testing.parameterize(
+    *testing.product(
+        {
+            "n": [None, 64],
+            "shape": [(64,), (128,)],
+            "norm": [None, "backward", "ortho", "forward", ""],
+        }
+    )
+)
+@pytest.mark.skip("get_fft_plan() is not supported")
+@testing.multi_gpu(2)
+class TestMultiGpuPlanCtxManagerFft:
+
+    @multi_gpu_config(gpu_configs=[[0, 1], [1, 0]])
+    @testing.for_complex_dtypes()
+    @testing.numpy_cupy_allclose(
+        rtol=1e-3, atol=1e-7, accept_error=ValueError, contiguous_check=False
+    )
+    def test_fft(self, xp, dtype):
+        _skip_multi_gpu_bug(self.shape, self.gpus)
+
+        a = testing.shaped_random(self.shape, xp, dtype)
+
+        if xp is np:
+            return xp.fft.fft(a, n=self.n, norm=self.norm)
+
+        from cupyx.scipy.fftpack import get_fft_plan
+
+        shape = (self.n,) if self.n is not None else None
+        plan = get_fft_plan(a, shape=shape)
+        assert isinstance(plan, cupy.cuda.cufft.Plan1d)
+        with plan:
+            return xp.fft.fft(a, n=self.n, norm=self.norm)
+
+    @multi_gpu_config(gpu_configs=[[0, 1], [1, 0]])
+    @testing.for_complex_dtypes()
+    @testing.numpy_cupy_allclose(
+        rtol=1e-3, atol=1e-7, accept_error=ValueError, contiguous_check=False
+    )
+    def test_ifft(self, xp, dtype):
+        _skip_multi_gpu_bug(self.shape, self.gpus)
+
+        a = testing.shaped_random(self.shape, xp, dtype)
+
+        if xp is np:
+            return xp.fft.ifft(a, n=self.n, norm=self.norm)
+
+        from cupyx.scipy.fftpack import get_fft_plan
+
+        shape = (self.n,) if self.n is not None else None
+        plan = get_fft_plan(a, shape=shape)
+        assert isinstance(plan, cupy.cuda.cufft.Plan1d)
+        with plan:
+            return xp.fft.ifft(a, n=self.n, norm=self.norm)
+
+    @multi_gpu_config(gpu_configs=[[0, 1], [1, 0]])
+    @testing.for_complex_dtypes()
+    def test_fft_error_on_wrong_plan(self, dtype):
+        # This test ensures the context manager plan is picked up
+
+        from cupy.fft import fft
+        from cupyx.scipy.fftpack import get_fft_plan
+
+        a = testing.shaped_random(self.shape, cupy, dtype)
+        bad_shape = tuple(4 * i for i in self.shape)
+        b = testing.shaped_random(bad_shape, cupy, dtype)
+        plan_wrong = get_fft_plan(b)
+        assert isinstance(plan_wrong, cupy.cuda.cufft.Plan1d)
+
+        with pytest.raises(ValueError) as ex, plan_wrong:
+            fft(a, n=self.n, norm=self.norm)
+        # targeting a particular error
+        if self.norm == "":
+            # if norm is invalid, we still get ValueError, but it's raised
+            # when checking norm, earlier than the plan check
+            return  # skip
+        assert "Target array size does not match the plan." in str(ex.value)
+
+
+@testing.with_requires("numpy>=2.0")
+@pytest.mark.usefixtures("skip_forward_backward")
+@testing.parameterize(
+    *(
+        testing.product_dict(
+            [
+                {"shape": (3, 4), "s": None, "axes": None},
+                {"shape": (3, 4), "s": None, "axes": (-2, -1)},
+                {"shape": (3, 4), "s": None, "axes": (-1, -2)},
+                {"shape": (3, 4), "s": None, "axes": (0,)},
+                {"shape": (3, 4), "s": None, "axes": None},
+                {"shape": (2, 3, 4), "s": (1, 4, None), "axes": None},
+                {"shape": (2, 3, 4), "s": (1, 4, 10), "axes": None},
+                {"shape": (2, 3, 4), "s": None, "axes": (-3, -2, -1)},
+                {"shape": (2, 3, 4), "s": None, "axes": (-1, -2, -3)},
+                {"shape": (2, 3, 4), "s": None, "axes": (0, 1)},
+                {"shape": (2, 3, 4), "s": None, "axes": None},
+                {"shape": (2, 3, 4, 5), "s": None, "axes": (-3, -2, -1)},
+            ],
+            testing.product(
+                {"norm": [None, "backward", "ortho", "forward", ""]}
+            ),
+        )
+    )
+)
+@pytest.mark.skip("default FFT function is not supported")
+class TestFftnContiguity:
+
+    @nd_planning_states([True])
+    @testing.for_all_dtypes()
+    def test_fftn_orders(self, dtype, enable_nd):
+        for order in ["C", "F"]:
+            a = testing.shaped_random(self.shape, cupy, dtype)
+            if order == "F":
+                a = cupy.asfortranarray(a)
+            out = cupy.fft.fftn(a, s=self.s, axes=self.axes)
+
+            fft_func = _default_fft_func(a, s=self.s, axes=self.axes)
+            if fft_func is _fftn:
+                # nd plans have output with contiguity matching the input
+                assert out.flags.c_contiguous == a.flags.c_contiguous
+                assert out.flags.f_contiguous == a.flags.f_contiguous
+            else:
+                # 1d planning case doesn't guarantee preserved contiguity
+                pass
+
+    @nd_planning_states([True])
+    @testing.for_all_dtypes()
+    def test_ifftn_orders(self, dtype, enable_nd):
+        for order in ["C", "F"]:
+
+            a = testing.shaped_random(self.shape, cupy, dtype)
+            if order == "F":
+                a = cupy.asfortranarray(a)
+            out = cupy.fft.ifftn(a, s=self.s, axes=self.axes)
+
+            fft_func = _default_fft_func(a, s=self.s, axes=self.axes)
+            if fft_func is _fftn:
+                # nd plans have output with contiguity matching the input
+                assert out.flags.c_contiguous == a.flags.c_contiguous
+                assert out.flags.f_contiguous == a.flags.f_contiguous
+            else:
+                # 1d planning case doesn't guarantee preserved contiguity
+                pass
+
+
+# @testing.with_requires("numpy>=2.0")
 @pytest.mark.usefixtures("skip_forward_backward")
 @testing.parameterize(
     *testing.product(
@@ -299,9 +893,10 @@ def test_ifftn(self, xp, dtype, order):
     )
 )
 class TestRfft:
+
     @testing.for_all_dtypes(no_complex=True)
     @testing.numpy_cupy_allclose(
-        rtol=1e-4,
+        rtol=1e-3,
         atol=1e-7,
         accept_error=ValueError,
         contiguous_check=False,
@@ -309,17 +904,12 @@ class TestRfft:
     )
     def test_rfft(self, xp, dtype):
         a = testing.shaped_random(self.shape, xp, dtype)
-        out = xp.fft.rfft(a, n=self.n, norm=self.norm)
-
-        if xp is np and dtype in [np.float16, np.float32, np.complex64]:
-            out = out.astype(np.complex64)
-
-        return out
+        return xp.fft.rfft(a, n=self.n, norm=self.norm)
 
     @testing.for_all_dtypes()
     @testing.numpy_cupy_allclose(
-        rtol=1e-4,
-        atol=2e-6,
+        rtol=1e-3,
+        atol=1e-7,
         accept_error=ValueError,
         contiguous_check=False,
         type_check=has_support_aspect64(),
@@ -328,32 +918,106 @@ def test_irfft(self, xp, dtype):
         a = testing.shaped_random(self.shape, xp, dtype)
         out = xp.fft.irfft(a, n=self.n, norm=self.norm)
 
-        if xp is np and dtype in [np.float16, np.float32, np.complex64]:
+        if dtype == xp.float16 and xp is cupy:
+            # XXX: np2.0: f16 dtypes differ
+            out = out.astype(np.float16)
+        elif (
+            xp is np
+            and np.lib.NumpyVersion(np.__version__) < "2.0.0"
+            and dtype == np.float32
+        ):
             out = out.astype(np.float32)
 
         return out
 
 
+@testing.with_requires("numpy>=2.0")
+@pytest.mark.usefixtures("skip_forward_backward")
+@testing.parameterize(
+    *testing.product(
+        {
+            "n": [None, 5, 10, 15],
+            "shape": [(10,)],
+            "norm": [None, "backward", "ortho", "forward"],
+        }
+    )
+)
+@pytest.mark.skip("get_fft_plan() is not supported")
+class TestPlanCtxManagerRfft:
+
+    @testing.for_all_dtypes(no_complex=True)
+    @testing.numpy_cupy_allclose(
+        rtol=1e-3, atol=1e-7, accept_error=ValueError, contiguous_check=False
+    )
+    def test_rfft(self, xp, dtype):
+        a = testing.shaped_random(self.shape, xp, dtype)
+
+        if xp is np:
+            return xp.fft.rfft(a, n=self.n, norm=self.norm)
+
+        from cupyx.scipy.fftpack import get_fft_plan
+
+        shape = (self.n,) if self.n is not None else None
+        plan = get_fft_plan(a, shape=shape, value_type="R2C")
+        assert isinstance(plan, cupy.cuda.cufft.Plan1d)
+        with plan:
+            return xp.fft.rfft(a, n=self.n, norm=self.norm)
+
+    @testing.for_complex_dtypes()
+    @testing.numpy_cupy_allclose(
+        rtol=1e-3, atol=1e-7, accept_error=ValueError, contiguous_check=False
+    )
+    def test_irfft(self, xp, dtype):
+        a = testing.shaped_random(self.shape, xp, dtype)
+
+        if xp is np:
+            return xp.fft.irfft(a, n=self.n, norm=self.norm)
+
+        from cupyx.scipy.fftpack import get_fft_plan
+
+        shape = (self.n,) if self.n is not None else None
+        plan = get_fft_plan(a, shape=shape, value_type="C2R")
+        assert isinstance(plan, cupy.cuda.cufft.Plan1d)
+        with plan:
+            return xp.fft.irfft(a, n=self.n, norm=self.norm)
+
+    @testing.for_all_dtypes(no_complex=True)
+    def test_rfft_error_on_wrong_plan(self, dtype):
+        # This test ensures the context manager plan is picked up
+
+        from cupy.fft import rfft
+        from cupyx.scipy.fftpack import get_fft_plan
+
+        a = testing.shaped_random(self.shape, cupy, dtype)
+        bad_shape = tuple(5 * i for i in self.shape)
+        b = testing.shaped_random(bad_shape, cupy, dtype)
+        plan_wrong = get_fft_plan(b, value_type="R2C")
+        assert isinstance(plan_wrong, cupy.cuda.cufft.Plan1d)
+
+        with pytest.raises(ValueError) as ex, plan_wrong:
+            rfft(a, n=self.n, norm=self.norm)
+        # targeting a particular error
+        assert "Target array size does not match the plan." in str(ex.value)
+
+
+# @testing.with_requires("numpy>=2.0")
 @pytest.mark.usefixtures("skip_forward_backward")
 @testing.parameterize(
     *(
         testing.product_dict(
             [
-                # some of the following cases are modified, since in NumPy 2.0.0
-                # `s` must contain only integer `s`, not None values, and
-                # If `s` is not None, `axes` must not be None either.
                 {"shape": (3, 4), "s": None, "axes": None},
-                {"shape": (3, 4), "s": (1, 4), "axes": (0, 1)},
-                {"shape": (3, 4), "s": (1, 5), "axes": (0, 1)},
+                {"shape": (3, 4), "s": (1, 5), "axes": (-2, -1)},
                 {"shape": (3, 4), "s": None, "axes": (-2, -1)},
                 {"shape": (3, 4), "s": None, "axes": (-1, -2)},
                 {"shape": (3, 4), "s": None, "axes": (0,)},
+                {"shape": (3, 4), "s": None, "axes": None},
                 # {"shape": (2, 3, 4), "s": None, "axes": None}, # mkl_fft gh-116
-                # {"shape": (2, 3, 4), "s": (1, 4, 4), "axes": (0, 1, 2)}, # mkl_fft gh-115
-                # {"shape": (2, 3, 4), "s": (1, 4, 10), "axes": (0, 1, 2)}, # mkl_fft gh-115
+                # {"shape": (2, 3, 4), "s": (1, 4, 10), "axes": (-3, -2, -1)}, # mkl_fft gh-115
                 # {"shape": (2, 3, 4), "s": None, "axes": (-3, -2, -1)}, # mkl_fft gh-116
                 # {"shape": (2, 3, 4), "s": None, "axes": (-1, -2, -3)}, # mkl_fft gh-116
                 {"shape": (2, 3, 4), "s": None, "axes": (0, 1)},
+                {"shape": (2, 3, 4), "s": None, "axes": None},
                 {"shape": (2, 3, 4), "s": (2, 3), "axes": (0, 1, 2)},
                 # {"shape": (2, 3, 4, 5), "s": None, "axes": None}, # mkl_fft gh-109 and gh-116
             ],
@@ -364,54 +1028,60 @@ def test_irfft(self, xp, dtype):
     )
 )
 class TestRfft2:
+
+    @nd_planning_states()
     @testing.for_orders("CF")
     @testing.for_all_dtypes(no_complex=True)
     @testing.numpy_cupy_allclose(
-        rtol=1e-4,
+        rtol=1e-3,
         atol=1e-7,
         accept_error=ValueError,
         contiguous_check=False,
         type_check=has_support_aspect64(),
     )
-    def test_rfft2(self, xp, dtype, order):
+    def test_rfft2(self, xp, dtype, order, enable_nd):
+        # assert config.enable_nd_planning == enable_nd
         a = testing.shaped_random(self.shape, xp, dtype)
         if order == "F":
             a = xp.asfortranarray(a)
-        out = xp.fft.rfft2(a, s=self.s, axes=self.axes, norm=self.norm)
-
-        if xp is np and dtype in [np.float16, np.float32, np.complex64]:
-            out = out.astype(np.complex64)
-
-        return out
+        return xp.fft.rfft2(a, s=self.s, axes=self.axes, norm=self.norm)
 
+    @nd_planning_states()
     @testing.for_orders("CF")
     @testing.for_all_dtypes()
     @testing.numpy_cupy_allclose(
-        rtol=1e-4,
+        rtol=1e-3,
         atol=1e-7,
         accept_error=ValueError,
         contiguous_check=False,
         type_check=has_support_aspect64(),
     )
-    def test_irfft2(self, xp, dtype, order):
-        if self.s is None and self.axes in [None, (-2, -1)]:
+    def test_irfft2(self, xp, dtype, order, enable_nd):
+        # assert config.enable_nd_planning == enable_nd
+
+        if self.s is None and self.axes is None:
             pytest.skip("Input is not Hermitian Symmetric")
+        elif dtype == xp.float16 and xp is cupy:
+            pytest.xfail("XXX: np2.0: f16 dtypes differ")
+        elif (
+            np.lib.NumpyVersion(np.__version__) < "2.0.0"
+            and dtype == np.float32
+        ):
+            pytest.skip("dtypes differ")
+
         a = testing.shaped_random(self.shape, xp, dtype)
         if order == "F":
             a = xp.asfortranarray(a)
-        out = xp.fft.irfft2(a, s=self.s, axes=self.axes, norm=self.norm)
-
-        if xp is np and dtype in [np.float16, np.float32, np.complex64]:
-            out = out.astype(np.float32)
-
-        return out
+        return xp.fft.irfft2(a, s=self.s, axes=self.axes, norm=self.norm)
 
 
+# @testing.with_requires("numpy>=2.0")
 @testing.parameterize(
     {"shape": (3, 4), "s": None, "axes": (), "norm": None},
     {"shape": (2, 3, 4), "s": None, "axes": (), "norm": None},
 )
 class TestRfft2EmptyAxes:
+
     @testing.for_all_dtypes(no_complex=True)
     def test_rfft2(self, dtype):
         for xp in (np, cupy):
@@ -427,26 +1097,24 @@ def test_irfft2(self, dtype):
                 xp.fft.irfft2(a, s=self.s, axes=self.axes, norm=self.norm)
 
 
+# @testing.with_requires("numpy>=2.0")
 @pytest.mark.usefixtures("skip_forward_backward")
 @testing.parameterize(
     *(
         testing.product_dict(
             [
-                # some of the following cases are modified, since in NumPy 2.0.0
-                # `s` must contain only integer `s`, not None values, and
-                # If `s` is not None, `axes` must not be None either.
                 {"shape": (3, 4), "s": None, "axes": None},
-                {"shape": (3, 4), "s": (1, 4), "axes": (0, 1)},
-                {"shape": (3, 4), "s": (1, 5), "axes": (0, 1)},
+                {"shape": (3, 4), "s": (1, 5), "axes": (-2, -1)},
                 {"shape": (3, 4), "s": None, "axes": (-2, -1)},
                 {"shape": (3, 4), "s": None, "axes": (-1, -2)},
                 {"shape": (3, 4), "s": None, "axes": (0,)},
-                # {"shape": (2, 3, 4), "s": None, "axes": None}, # mkl_fft gh-116
-                # {"shape": (2, 3, 4), "s": (1, 4, 4), "axes": (0, 1, 2)}, # mkl_fft gh-115
-                # {"shape": (2, 3, 4), "s": (1, 4, 10), "axes": (0, 1, 2)}, # mkl_fft gh-115
+                {"shape": (3, 4), "s": None, "axes": None},
+                {"shape": (2, 3, 4), "s": None, "axes": None},
+                # {"shape": (2, 3, 4), "s": (1, 4, 10), "axes": (-3, -2, -1)}, # mkl_fft gh-115
                 # {"shape": (2, 3, 4), "s": None, "axes": (-3, -2, -1)}, # mkl_fft gh-116
                 # {"shape": (2, 3, 4), "s": None, "axes": (-1, -2, -3)}, # mkl_fft gh-116
                 {"shape": (2, 3, 4), "s": None, "axes": (0, 1)},
+                {"shape": (2, 3, 4), "s": None, "axes": None},
                 {"shape": (2, 3, 4), "s": (2, 3), "axes": (0, 1, 2)},
                 # {"shape": (2, 3, 4, 5), "s": None, "axes": None}, # mkl_fft gh-109 and gh-116
             ],
@@ -457,54 +1125,207 @@ def test_irfft2(self, dtype):
     )
 )
 class TestRfftn:
+
+    @nd_planning_states()
     @testing.for_orders("CF")
     @testing.for_all_dtypes(no_complex=True)
     @testing.numpy_cupy_allclose(
-        rtol=1e-4,
+        rtol=1e-3,
         atol=1e-7,
         accept_error=ValueError,
         contiguous_check=False,
         type_check=has_support_aspect64(),
     )
-    def test_rfftn(self, xp, dtype, order):
+    def test_rfftn(self, xp, dtype, order, enable_nd):
+        # assert config.enable_nd_planning == enable_nd
         a = testing.shaped_random(self.shape, xp, dtype)
         if order == "F":
             a = xp.asfortranarray(a)
-        out = xp.fft.rfftn(a, s=self.s, axes=self.axes, norm=self.norm)
-
-        if xp is np and dtype in [np.float16, np.float32, np.complex64]:
-            out = out.astype(np.complex64)
-
-        return out
+        return xp.fft.rfftn(a, s=self.s, axes=self.axes, norm=self.norm)
 
+    @nd_planning_states()
     @testing.for_orders("CF")
     @testing.for_all_dtypes()
     @testing.numpy_cupy_allclose(
-        rtol=1e-4,
+        rtol=1e-3,
         atol=1e-7,
         accept_error=ValueError,
         contiguous_check=False,
         type_check=has_support_aspect64(),
     )
-    def test_irfftn(self, xp, dtype, order):
-        if self.s is None and self.axes in [None, (-2, -1)]:
+    def test_irfftn(self, xp, dtype, order, enable_nd):
+        # assert config.enable_nd_planning == enable_nd
+
+        if self.s is None and self.axes is None:
             pytest.skip("Input is not Hermitian Symmetric")
+        elif dtype == xp.float16 and xp is cupy:
+            pytest.xfail("XXX: np2.0: f16 dtypes differ")
+        elif (
+            np.lib.NumpyVersion(np.__version__) < "2.0.0"
+            and dtype == np.float32
+        ):
+            pytest.skip("dtypes differ")
+
         a = testing.shaped_random(self.shape, xp, dtype)
         if order == "F":
             a = xp.asfortranarray(a)
-        out = xp.fft.irfftn(a, s=self.s, axes=self.axes, norm=self.norm)
+        return xp.fft.irfftn(a, s=self.s, axes=self.axes, norm=self.norm)
 
-        if xp is np and dtype in [np.float16, np.float32, np.complex64]:
-            out = out.astype(np.float32)
 
-        return out
+# Only those tests in which a legit plan can be obtained are kept
+@testing.with_requires("numpy>=2.0")
+@pytest.mark.usefixtures("skip_forward_backward")
+@testing.parameterize(
+    *(
+        testing.product_dict(
+            [
+                {"shape": (3, 4), "s": None, "axes": None},
+                {"shape": (3, 4), "s": (1, 5), "axes": (-2, -1)},
+                {"shape": (3, 4), "s": None, "axes": (-2, -1)},
+                {"shape": (3, 4), "s": None, "axes": (0,)},
+                {"shape": (3, 4), "s": None, "axes": None},
+                {"shape": (2, 3, 4), "s": None, "axes": None},
+                {"shape": (2, 3, 4), "s": (1, 4, 10), "axes": (-3, -2, -1)},
+                {"shape": (2, 3, 4), "s": None, "axes": (-3, -2, -1)},
+                {"shape": (2, 3, 4), "s": None, "axes": (0, 1)},
+                {"shape": (2, 3, 4), "s": None, "axes": None},
+                {"shape": (2, 3, 4), "s": (2, 3), "axes": (0, 1, 2)},
+            ],
+            testing.product({"norm": [None, "backward", "ortho", "forward"]}),
+        )
+    )
+)
+@pytest.mark.skip("get_fft_plan() is not supported")
+class TestPlanCtxManagerRfftn:
+
+    @pytest.fixture(autouse=True)
+    def skip_buggy(self):
+        if cupy.cuda.runtime.is_hip:
+            # TODO(leofang): test newer ROCm versions
+            if self.axes == (0, 1) and self.shape == (2, 3, 4):
+                pytest.skip(
+                    "hipFFT's PlanNd for this case "
+                    "is buggy, so Plan1d is generated "
+                    "instead"
+                )
+
+    @nd_planning_states()
+    @testing.for_all_dtypes(no_complex=True)
+    @testing.numpy_cupy_allclose(
+        rtol=1e-3, atol=1e-7, accept_error=ValueError, contiguous_check=False
+    )
+    def test_rfftn(self, xp, dtype, enable_nd):
+        assert config.enable_nd_planning == enable_nd
+        a = testing.shaped_random(self.shape, xp, dtype)
+
+        if xp is np:
+            return xp.fft.rfftn(a, s=self.s, axes=self.axes, norm=self.norm)
+
+        from cupyx.scipy.fftpack import get_fft_plan
 
+        plan = get_fft_plan(a, self.s, self.axes, value_type="R2C")
+        with plan:
+            return xp.fft.rfftn(a, s=self.s, axes=self.axes, norm=self.norm)
 
+    @nd_planning_states()
+    @testing.for_all_dtypes()
+    @testing.numpy_cupy_allclose(
+        rtol=1e-3, atol=1e-7, accept_error=ValueError, contiguous_check=False
+    )
+    def test_irfftn(self, xp, dtype, enable_nd):
+        assert config.enable_nd_planning == enable_nd
+        a = testing.shaped_random(self.shape, xp, dtype)
+
+        if dtype == xp.float16 and xp is cupy:
+            pytest.xfail("XXX: np2.0: f16 dtypes differ")
+
+        if xp is np:
+            return xp.fft.irfftn(a, s=self.s, axes=self.axes, norm=self.norm)
+
+        from cupyx.scipy.fftpack import get_fft_plan
+
+        plan = get_fft_plan(a, self.s, self.axes, value_type="C2R")
+        with plan:
+            return xp.fft.irfftn(a, s=self.s, axes=self.axes, norm=self.norm)
+
+    # TODO(leofang): write test_rfftn_error_on_wrong_plan()?
+
+
+@testing.with_requires("numpy>=2.0")
+@pytest.mark.usefixtures("skip_forward_backward")
+@testing.parameterize(
+    *(
+        testing.product_dict(
+            [
+                {"shape": (3, 4), "s": None, "axes": None},
+                {"shape": (3, 4), "s": None, "axes": (-2, -1)},
+                {"shape": (3, 4), "s": None, "axes": (-1, -2)},
+                {"shape": (3, 4), "s": None, "axes": (0,)},
+                {"shape": (3, 4), "s": None, "axes": None},
+                {"shape": (2, 3, 4), "s": None, "axes": None},
+                {"shape": (2, 3, 4), "s": (1, 4, 10), "axes": (-3, -2, -1)},
+                {"shape": (2, 3, 4), "s": None, "axes": (-3, -2, -1)},
+                {"shape": (2, 3, 4), "s": None, "axes": (-1, -2, -3)},
+                {"shape": (2, 3, 4), "s": None, "axes": (0, 1)},
+                {"shape": (2, 3, 4), "s": None, "axes": None},
+                {"shape": (2, 3, 4, 5), "s": None, "axes": None},
+            ],
+            testing.product({"norm": [None, "backward", "ortho", "forward"]}),
+        )
+    )
+)
+@pytest.mark.skip("default FFT function is not supported")
+class TestRfftnContiguity:
+
+    @nd_planning_states([True])
+    @testing.for_float_dtypes()
+    def test_rfftn_orders(self, dtype, enable_nd):
+        for order in ["C", "F"]:
+            a = testing.shaped_random(self.shape, cupy, dtype)
+            if order == "F":
+                a = cupy.asfortranarray(a)
+            out = cupy.fft.rfftn(a, s=self.s, axes=self.axes)
+
+            fft_func = _default_fft_func(
+                a, s=self.s, axes=self.axes, value_type="R2C"
+            )
+            if fft_func is _fftn:
+                # nd plans have output with contiguity matching the input
+                assert out.flags.c_contiguous == a.flags.c_contiguous
+                assert out.flags.f_contiguous == a.flags.f_contiguous
+            else:
+                # 1d planning case doesn't guarantee preserved contiguity
+                pass
+
+    @nd_planning_states([True])
+    @testing.for_all_dtypes()
+    def test_ifftn_orders(self, dtype, enable_nd):
+        for order in ["C", "F"]:
+
+            a = testing.shaped_random(self.shape, cupy, dtype)
+            if order == "F":
+                a = cupy.asfortranarray(a)
+            out = cupy.fft.irfftn(a, s=self.s, axes=self.axes)
+
+            fft_func = _default_fft_func(
+                a, s=self.s, axes=self.axes, value_type="C2R"
+            )
+            if fft_func is _fftn:
+                # nd plans have output with contiguity matching the input
+                assert out.flags.c_contiguous == a.flags.c_contiguous
+                assert out.flags.f_contiguous == a.flags.f_contiguous
+            else:
+                # 1d planning case doesn't guarantee preserved contiguity
+                pass
+
+
+# @testing.with_requires("numpy>=2.0")
 @testing.parameterize(
     {"shape": (3, 4), "s": None, "axes": (), "norm": None},
     {"shape": (2, 3, 4), "s": None, "axes": (), "norm": None},
 )
 class TestRfftnEmptyAxes:
+
     @testing.for_all_dtypes(no_complex=True)
     def test_rfftn(self, dtype):
         for xp in (np, cupy):
@@ -520,21 +1341,23 @@ def test_irfftn(self, dtype):
                 xp.fft.irfftn(a, s=self.s, axes=self.axes, norm=self.norm)
 
 
+# @testing.with_requires("numpy>=2.0")
 @pytest.mark.usefixtures("skip_forward_backward")
 @testing.parameterize(
     *testing.product(
         {
             "n": [None, 5, 10, 15],
             "shape": [(10,), (10, 10)],
-            "norm": [None, "backward", "ortho", "forward", ""],
+            "norm": [None, "backward", "ortho", "forward"],
         }
     )
 )
 class TestHfft:
+
     @testing.for_all_dtypes()
     @testing.numpy_cupy_allclose(
-        rtol=1e-4,
-        atol=2e-6,
+        rtol=1e-3,
+        atol=1e-7,
         accept_error=ValueError,
         contiguous_check=False,
         type_check=has_support_aspect64(),
@@ -543,14 +1366,21 @@ def test_hfft(self, xp, dtype):
         a = testing.shaped_random(self.shape, xp, dtype)
         out = xp.fft.hfft(a, n=self.n, norm=self.norm)
 
-        if xp is np and dtype in [np.float16, np.float32, np.complex64]:
+        if dtype == xp.float16 and xp is cupy:
+            # XXX: np2.0: f16 dtypes differ
+            out = out.astype(np.float16)
+        elif (
+            xp is np
+            and np.lib.NumpyVersion(np.__version__) < "2.0.0"
+            and dtype == np.float32
+        ):
             out = out.astype(np.float32)
 
         return out
 
     @testing.for_all_dtypes(no_complex=True)
     @testing.numpy_cupy_allclose(
-        rtol=1e-4,
+        rtol=1e-3,
         atol=1e-7,
         accept_error=ValueError,
         contiguous_check=False,
@@ -560,39 +1390,46 @@ def test_ihfft(self, xp, dtype):
         a = testing.shaped_random(self.shape, xp, dtype)
         out = xp.fft.ihfft(a, n=self.n, norm=self.norm)
 
-        if xp is np and dtype in [np.float16, np.float32, np.complex64]:
+        if (
+            xp is np
+            and np.lib.NumpyVersion(np.__version__) < "2.0.0"
+            and dtype == np.float32
+        ):
             out = out.astype(np.complex64)
 
         return out
 
 
+# @testing.with_requires("numpy>=2.0")
 @testing.parameterize(
     {"n": 1, "d": 1},
     {"n": 10, "d": 0.5},
     {"n": 100, "d": 2},
 )
 class TestFftfreq:
+
+    @testing.for_all_dtypes()
     @testing.numpy_cupy_allclose(
-        rtol=1e-4,
+        rtol=1e-3,
         atol=1e-7,
+        contiguous_check=False,
         type_check=has_support_aspect64(),
     )
-    def test_fftfreq(self, xp):
-        out = xp.fft.fftfreq(self.n, self.d)
-
-        return out
+    def test_fftfreq(self, xp, dtype):
+        return xp.fft.fftfreq(self.n, self.d)
 
+    @testing.for_all_dtypes()
     @testing.numpy_cupy_allclose(
-        rtol=1e-4,
+        rtol=1e-3,
         atol=1e-7,
+        contiguous_check=False,
         type_check=has_support_aspect64(),
     )
-    def test_rfftfreq(self, xp):
-        out = xp.fft.rfftfreq(self.n, self.d)
-
-        return out
+    def test_rfftfreq(self, xp, dtype):
+        return xp.fft.rfftfreq(self.n, self.d)
 
 
+# @testing.with_requires("numpy>=2.0")
 @testing.parameterize(
     {"shape": (5,), "axes": None},
     {"shape": (5,), "axes": 0},
@@ -603,26 +1440,54 @@ def test_rfftfreq(self, xp):
     {"shape": (10, 10), "axes": (0, 1)},
 )
 class TestFftshift:
+
     @testing.for_all_dtypes()
     @testing.numpy_cupy_allclose(
-        rtol=1e-4,
+        rtol=1e-3,
         atol=1e-7,
+        contiguous_check=False,
         type_check=has_support_aspect64(),
     )
     def test_fftshift(self, xp, dtype):
         x = testing.shaped_random(self.shape, xp, dtype)
-        out = xp.fft.fftshift(x, self.axes)
-
-        return out
+        return xp.fft.fftshift(x, self.axes)
 
     @testing.for_all_dtypes()
     @testing.numpy_cupy_allclose(
-        rtol=1e-4,
+        rtol=1e-3,
         atol=1e-7,
+        contiguous_check=False,
         type_check=has_support_aspect64(),
     )
     def test_ifftshift(self, xp, dtype):
         x = testing.shaped_random(self.shape, xp, dtype)
-        out = xp.fft.ifftshift(x, self.axes)
+        return xp.fft.ifftshift(x, self.axes)
 
-        return out
+
+@pytest.mark.skip("no threading support")
+class TestThreading:
+
+    def test_threading1(self):
+        import threading
+
+        from cupy.cuda.cufft import get_current_plan
+
+        def thread_get_curr_plan():
+            cupy.cuda.Device().use()
+            return get_current_plan()
+
+        new_thread = threading.Thread(target=thread_get_curr_plan)
+        new_thread.start()
+
+    def test_threading2(self):
+        import threading
+
+        a = cupy.arange(100, dtype=cupy.complex64).reshape(10, 10)
+
+        def thread_do_fft():
+            cupy.cuda.Device().use()
+            b = cupy.fft.fftn(a)
+            return b
+
+        new_thread = threading.Thread(target=thread_do_fft)
+        new_thread.start()