ptycho · daurer · Mar 8, 2024 · Aug 24, 2023 · Aug 24, 2023 · Aug 25, 2023
diff --git a/ptypy/accelerate/base/array_utils.py b/ptypy/accelerate/base/array_utils.py
@@ -56,6 +56,16 @@ def norm2(input):
     return np.sum(abs2(input))
 
 
+def gaussian_kernel_2d(shape, sigmau, sigmav):
+    """
+    2D Gaussian kernel using the last 2 dimension of given shape
+    Requires sigma for both dimensions (sigmau and sigmav)
+    """
+    u, v = np.fft.fftfreq(shape[-2]), np.fft.fftfreq(shape[-1])
+    uu, vv = np.meshgrid(u, v, sparse=True, indexing='ij')
+    kernel = np.exp(-2* ( (np.pi*sigmau)**2 * uu**2 + (np.pi*sigmav)**2 * vv**2 ) )
+    return kernel
+
 def complex_gaussian_filter(input, mfs):
     '''
     takes 2D and 3D arrays. Complex input, complex output. mfs has len 0<x<=2
@@ -70,6 +80,44 @@ def complex_gaussian_filter(input, mfs):
         input.dtype)
 
 
+def complex_gaussian_filter_fft(input, mfs):
+    '''
+    takes 2D and 3D arrays. Complex input, complex output. mfs has len 0<x<=2
+    '''
+    if len(mfs) > 2:
+        raise NotImplementedError("Only batches of 2D arrays allowed!")
+    elif len(mfs) == 1:
+        mfs = np.array([mfs,mfs])
+    else:
+        mfs = np.array(mfs)
+
+    k = gaussian_kernel_2d(input.shape, mfs[0], mfs[1]).astype(input.dtype)
+    return fft_filter(input, k)
+
+
+def fft_filter(input, kernel, prefactor=None, postfactor=None, forward=True):
+    """
+    Compute
+    output = ifft(fft( prefactor * input ) * kernel) * postfactor
+    """
+    # Make a copy (and cast if necessary)
+    x = np.array(input)
+
+
+    if prefactor is not None:
+        x *= prefactor
+
+    if forward:
+        x = np.fft.ifftn(np.fft.fftn(x, norm="ortho") * kernel, norm="ortho")
+    else:
+        x = np.fft.fftn(np.fft.ifftn(x, norm="ortho") * kernel, norm="ortho")
+
+    if postfactor is not None:
+        x *= postfactor
+
+    return x
+
+
 def mass_center(A):
     '''
     Input will always be real, and 2d or 3d, single precision here
@@ -81,7 +129,7 @@ def interpolated_shift(c, shift, do_linear=False):
     '''
     complex bicubic interpolated shift.
     complex output. This shift should be applied to 2D arrays. shift should have len=c.ndims 
-    
+
     '''
     if not do_linear:
         return ndi.shift(np.real(c), shift, order=3, prefilter=True) + 1j * ndi.shift(

diff --git a/ptypy/accelerate/base/engines/ML_serial.py b/ptypy/accelerate/base/engines/ML_serial.py
@@ -23,13 +23,23 @@
 from ptypy.engines import register
 from ptypy.accelerate.base.kernels import GradientDescentKernel, AuxiliaryWaveKernel, PoUpdateKernel, PositionCorrectionKernel
 from ptypy.accelerate.base import address_manglers
+from ptypy.accelerate.base.array_utils import complex_gaussian_filter, complex_gaussian_filter_fft
 
 
 __all__ = ['ML_serial']
 
 @register()
 class ML_serial(ML):
 
+    """
+    Defaults:
+
+    [smooth_gradient_method]
+    default = convolution
+    type = str
+    help = Method to be used for smoothing the gradient, choose between ```convolution``` or ```fft```.
+    """
+
     def __init__(self, ptycho_parent, pars=None):
         """
         Maximum likelihood reconstruction engine.
@@ -143,7 +153,12 @@ def engine_prepare(self):
         self.ML_model.prepare()
 
     def _get_smooth_gradient(self, data, sigma):
-        return self.smooth_gradient(data)
+        if self.p.smooth_gradient_method == "convolution":
+            return complex_gaussian_filter(data, sigma)
+        elif self.p.smooth_gradient_method == "fft":
+            return complex_gaussian_filter_fft(data, sigma)
+        else:
+            raise NotImplementedError("smooth_gradient_method should be ```convolution``` or ```fft```.")
 
     def _replace_ob_grad(self):
         new_ob_grad = self.ob_grad_new
@@ -272,7 +287,7 @@ def engine_iterate(self, num=1):
         return error_dct  # np.array([[self.ML_model.LL[0]] * 3])
 
     def position_update(self):
-        """ 
+        """
         Position refinement
         """
         if not self.do_position_refinement:
@@ -283,7 +298,7 @@ def position_update(self):
         # Update positions
         if do_update_pos:
             """
-            Iterates through all positions and refines them by a given algorithm. 
+            Iterates through all positions and refines them by a given algorithm.
             """
             log(4, "----------- START POS REF -------------")
             for dID in self.di.S.keys():
@@ -308,7 +323,7 @@ def position_update(self):
                 max_oby = ob.shape[-2] - aux.shape[-2] - 1
                 max_obx = ob.shape[-1] - aux.shape[-1] - 1
 
-                # We need to re-calculate the current error 
+                # We need to re-calculate the current error
                 PCK.build_aux(aux, addr, ob, pr)
                 aux[:] = FW(aux)
                 PCK.log_likelihood_ml(aux, addr, I, w, err_phot)
@@ -338,7 +353,7 @@ def engine_finalize(self):
                 for i,view in enumerate(d.views):
                     for j,(pname, pod) in enumerate(view.pods.items()):
                         delta = (prep.addr[i][j][1][1:] - prep.original_addr[i][j][1][1:]) * res
-                        pod.ob_view.coord += delta 
+                        pod.ob_view.coord += delta
                         pod.ob_view.storage.update_views(pod.ob_view)
             self.ptycho.record_positions = True
 

diff --git a/ptypy/accelerate/cuda_common/batched_multiply.cu b/ptypy/accelerate/cuda_common/batched_multiply.cu
@@ -22,10 +22,11 @@ extern "C" __global__ void batched_multiply(const complex<IN_TYPE>* input,
   int gy = threadIdx.y + blockIdx.y * blockDim.y;
   int gz = threadIdx.z + blockIdx.z * blockDim.z;
 
-  if (gx > rows || gy > columns || gz > nBatches)
+  if (gx > rows - 1 || gy > columns - 1 || gz > nBatches)
     return;
 
   auto val = input[gz * rows * columns + gy * rows + gx];
+
   if (MPY_DO_FILT)  // set at compile-time
   {
     val *= filter[gy * rows + gx];

diff --git a/ptypy/accelerate/cuda_cupy/array_utils.py b/ptypy/accelerate/cuda_cupy/array_utils.py
@@ -2,6 +2,7 @@
 import numpy as np
 
 from ptypy.accelerate.cuda_common.utils import map2ctype
+from ptypy.accelerate.base.array_utils import gaussian_kernel_2d
 from ptypy.utils.math_utils import gaussian
 from . import load_kernel
 
@@ -62,6 +63,37 @@ def dot(self, A: cp.ndarray, B: cp.ndarray, out: cp.ndarray = None) -> cp.ndarra
     def norm2(self, A, out=None):
         return self.dot(A, A, out)
 
+class BatchedMultiplyKernel:
+    def __init__(self, array, queue=None, math_type=np.complex64):
+        self.queue = queue
+        self.array_shape = array.shape[-2:]
+        self.batches = int(np.prod(array.shape[0:array.ndim-2]) if array.ndim > 2 else 1)
+        self.batched_multiply_cuda = load_kernel("batched_multiply", {
+            'MPY_DO_SCALE': 'true',
+            'MPY_DO_FILT': 'true',
+            'IN_TYPE': 'float' if array.dtype==np.complex64 else 'double',
+            'OUT_TYPE': 'float' if array.dtype==np.complex64 else 'double',
+            'MATH_TYPE': 'float' if math_type==np.complex64 else 'double'
+        })
+        self.block = (32,32,1)
+        self.grid = (
+            int((self.array_shape[0] + 31) // 32),
+            int((self.array_shape[1] + 31) // 32),
+            int(self.batches)
+        )
+
+    def multiply(self, x,y, scale=1.):
+        assert x.dtype == y.dtype, "Input arrays must be of same data type"
+        assert x.shape[-2:] == y.shape[-2:], "Input arrays must be of the same size in last 2 dims"
+        if self.queue is not None:
+            self.queue.use()
+        self.batched_multiply_cuda(self.grid,
+                                   self.block,
+                                   args=(x,x,y,
+                                   np.float32(scale),
+                                   np.int32(self.batches),
+                                   np.int32(self.array_shape[0]),
+                                   np.int32(self.array_shape[1])))
 
 class TransposeKernel:
 
@@ -322,6 +354,62 @@ def delxb(self, input, out, axis=-1):
                            out, lower_dim, higher_dim, np.int32(input.shape[axis])))
 
 
+class FFTGaussianSmoothingKernel:
+    def __init__(self, queue=None, kernel_type='float'):
+        if kernel_type not in ['float', 'double']:
+            raise ValueError('Invalid data type for kernel')
+        self.kernel_type = kernel_type
+        self.dtype = np.complex64
+        self.stype = "complex<float>"
+        self.queue = queue
+        self.sigma = None
+
+        from .kernels import FFTFilterKernel
+
+        # Create general FFT filter object 
+        self.fft_filter = FFTFilterKernel(queue_thread=queue)
+
+    def allocate(self, shape, sigma=1.):
+
+        # Create kernel
+        self.sigma = sigma
+        kernel = self._compute_kernel(shape, sigma)
+
+        # Allocate filter
+        kernel_dev = cp.asarray(kernel)
+        self.fft_filter.allocate(kernel=kernel_dev)
+
+    def _compute_kernel(self, shape, sigma):
+        # Create kernel
+        sigma = np.array(sigma)
+        if sigma.size == 1:
+            sigma = np.array([sigma, sigma])
+        if sigma.size > 2:
+            raise NotImplementedError("Only batches of 2D arrays allowed!")
+        self.sigma = sigma
+
+        kernel = gaussian_kernel_2d(shape, sigma[0], sigma[1]).astype(self.dtype)
+
+        # Extend kernel if needed
+        if len(shape) == 3:
+            kernel = np.tile(kernel, (shape[0],1,1))
+
+        return kernel
+
+    def filter(self, data, sigma=None):
+        """
+        Apply filter in-place
+
+        If sigma is not None: reallocate a new fft filter first.
+        """
+        if self.sigma is None:
+            self.allocate(shape=data.shape, sigma=sigma)
+        else:
+            self.fft_filter.set_kernel(self._compute_kernel(data.shape, sigma))
+
+        self.fft_filter.apply_filter(data)
+
+
 class GaussianSmoothingKernel:
     def __init__(self, queue=None, num_stdevs=4, kernel_type='float'):
         if kernel_type not in ['float', 'double']:

diff --git a/ptypy/accelerate/cuda_cupy/engines/ML_cupy.py b/ptypy/accelerate/cuda_cupy/engines/ML_cupy.py
@@ -23,7 +23,7 @@
 from .. import get_context, log_device_memory_stats
 from ..kernels import PropagationKernel, RealSupportKernel, FourierSupportKernel
 from ..kernels import GradientDescentKernel, AuxiliaryWaveKernel, PoUpdateKernel, PositionCorrectionKernel
-from ..array_utils import ArrayUtilsKernel, DerivativesKernel, GaussianSmoothingKernel, TransposeKernel
+from ..array_utils import ArrayUtilsKernel, DerivativesKernel, GaussianSmoothingKernel, FFTGaussianSmoothingKernel, TransposeKernel
 from ..mem_utils import GpuDataManager
 
 #from ..mem_utils import GpuDataManager
@@ -79,8 +79,12 @@ def engine_initialize(self):
         self.qu_htod = cp.cuda.Stream()
         self.qu_dtoh = cp.cuda.Stream()
 
-        self.GSK = GaussianSmoothingKernel(queue=self.queue)
-        self.GSK.tmp = None
+        if self.p.smooth_gradient_method == "convolution":
+            self.GSK = GaussianSmoothingKernel(queue=self.queue)
+            self.GSK.tmp = None
+
+        if self.p.smooth_gradient_method == "fft":
+            self.FGSK = FFTGaussianSmoothingKernel(queue=self.queue)
 
         # Real/Fourier Support Kernel
         self.RSK = {}
@@ -260,9 +264,18 @@ def _set_pr_ob_ref_for_data(self, dev='gpu', container=None, sync_copy=False):
                     dev=dev, container=container, sync_copy=sync_copy)
 
     def _get_smooth_gradient(self, data, sigma):
-        if self.GSK.tmp is None:
-            self.GSK.tmp = cp.empty(data.shape, dtype=np.complex64)
-        self.GSK.convolution(data, [sigma, sigma], tmp=self.GSK.tmp)
+        if self.p.smooth_gradient_method == "convolution":
+            if self.GSK.tmp is None:
+                self.GSK.tmp = cp.empty(data.shape, dtype=np.complex64)
+            try:
+                self.GSK.convolution(data, [sigma, sigma], tmp=self.GSK.tmp)
+            except MemoryError:
+                raise RuntimeError("Convolution kernel too large for direct convolution on GPU",
+                                   "Please reduce parameter smooth_gradient or set smooth_gradient_method='fft'.")
+        elif self.p.smooth_gradient_method == "fft":
+            self.FGSK.filter(data, sigma)
+        else:
+            raise NotImplementedError("smooth_gradient_method should be ```convolution``` or ```fft```.")
         return data
 
     def _replace_ob_grad(self):

diff --git a/ptypy/accelerate/cuda_cupy/kernels.py b/ptypy/accelerate/cuda_cupy/kernels.py
@@ -170,6 +170,34 @@ def apply_real_support(self, x):
         x *= self.support
 
 
+class FFTFilterKernel:
+    def __init__(self, queue_thread=None, fft='cupy'):
+        # Current implementation recompiles every time there is a change in input shape.
+        self.queue = queue_thread
+        self._fft_type = fft
+        self._fft1 = None
+        self._fft2 = None
+    def allocate(self, kernel, prefactor=None, postfactor=None, forward=True):
+        FFT = choose_fft(kernel.shape[-2:], fft_type=self._fft_type)
+
+        self._fft1 = FFT(kernel, self.queue,
+                        pre_fft=prefactor,
+                        post_fft=kernel,
+                        symmetric=True,
+                        forward=forward)
+        self._fft2 = FFT(kernel, self.queue,
+                         post_fft=postfactor,
+                        symmetric=True,
+                        forward=not forward)
+
+    def set_kernel(self, kernel):
+        self._fft1.post_fft.set(kernel)
+
+    def apply_filter(self, x):
+        self._fft1.ft(x, x)
+        self._fft2.ift(x, x)
+
+
 class FourierUpdateKernel(ab.FourierUpdateKernel):
 
     def __init__(self, aux, nmodes=1, queue_thread=None, accumulate_type='float', math_type='float'):