diff --git a/ptypy/accelerate/base/array_utils.py b/ptypy/accelerate/base/array_utils.py
index 839b08e70..410d9102f 100644
--- a/ptypy/accelerate/base/array_utils.py
+++ b/ptypy/accelerate/base/array_utils.py
@@ -3,6 +3,7 @@
 '''
 import numpy as np
 from scipy import ndimage as ndi
+from ptypy import utils as u
 
 
 def dot(A, B, acc_dtype=np.float64):
@@ -148,3 +149,26 @@ def crop_pad_2d_simple(A, B):
     b1, b2 = B.shape[-2:]
     offset = [0, a1 // 2 - b1 // 2, a2 // 2 - b2 // 2]
     fill3D(A, B, offset)
+
+def resample(A, B):
+    """
+    Resamples the last two dimensions of B onto shape of A and places it in A.
+    The ratio between shapes needs to be a power of 2 along the last two dimension.
+    upsampling (A larger than B): nearest neighbour interpolation
+    downsampling (B larger than A): integrate over neighbouring regions
+    """
+    assert A.ndim > 2, "Arrays must have at least 2 dimensions"
+    assert B.ndim > 2, "Arrays must have at least 2 dimensions"
+    assert A.shape[:-2] == B.shape[:-2], "Arrays must have same shape expect along the last 2 dimensions"
+    assert A.shape[-2] == A.shape[-1], "Last two dimensions must be of equal length"
+    assert B.shape[-2] == B.shape[-2], "Last two dimensions must be of equal length"
+    # same sampling, no need to call this function
+    assert A.shape != B.shape, "A and B have the same shape, no need to do any resampling"
+    # upsampling
+    if A.shape[-1] > B.shape[-1]:
+        resample = A.shape[-1] // B.shape[-1]
+        A[:] = u.repeat_2d(B, resample) / (resample**2)
+    # downsampling
+    elif A.shape[-1] < B.shape[-1]:
+        resample = B.shape[-1] // A.shape[-1]
+        A[:] = u.rebin_2d(B, resample) * (resample**2)
diff --git a/ptypy/accelerate/base/engines/DM_serial.py b/ptypy/accelerate/base/engines/DM_serial.py
index 44573bf56..1a77eeb65 100644
--- a/ptypy/accelerate/base/engines/DM_serial.py
+++ b/ptypy/accelerate/base/engines/DM_serial.py
@@ -178,8 +178,18 @@ def _setup_kernels(self):
             aux = np.zeros(ash, dtype=np.complex64)
             kern.aux = aux
 
+            # create extra array for resampling (if needed)
+            kern.resample = scan.resample > 1
+            if kern.resample:
+                ish = (ash[0],) + tuple(geo.shape//scan.resample)
+                kern.aux_tmp1 = np.zeros(ash, dtype=np.float32)
+                kern.aux_tmp2 = np.zeros(ish, dtype=np.float32)
+                aux_f = kern.aux_tmp2 # making sure to pass the correct shapes to FUK 
+            else:
+                aux_f = aux
+                
             # setup kernels, one for each SCAN.
-            kern.FUK = FourierUpdateKernel(aux, nmodes)
+            kern.FUK = FourierUpdateKernel(aux_f, nmodes)
             kern.FUK.allocate()
 
             kern.POK = PoUpdateKernel()
@@ -279,6 +289,12 @@ def engine_iterate(self, num=1):
                 pbound = self.pbound_scan[prep.label]
                 aux = kern.aux
 
+                # resampling
+                resample = kern.resample
+                if resample:
+                    aux_tmp1 = kern.aux_tmp1
+                    aux_tmp2 = kern.aux_tmp2
+
                 # local references
                 ma = prep.ma
                 ob = self.ob.S[oID].data
@@ -290,7 +306,12 @@ def engine_iterate(self, num=1):
                     t1 = time.time()
                     AWK.build_aux_no_ex(aux, addr, ob, pr)
                     aux[:] = FW(aux)
-                    FUK.log_likelihood(aux, addr, mag, ma, err_phot)
+                    if resample:
+                        aux_tmp1 = np.abs(aux)**2
+                        au.resample(aux_tmp2, aux_tmp1)
+                        FUK.log_likelihood(aux_tmp2, addr, mag, ma, err_phot, aux_is_intensity=True)
+                    else:
+                        FUK.log_likelihood(aux, addr, mag, ma, err_phot)
                     self.benchmark.F_LLerror += time.time() - t1
 
                 ## build auxilliary wave
@@ -305,9 +326,18 @@ def engine_iterate(self, num=1):
 
                 ## Deviation from measured data
                 t1 = time.time()
-                FUK.fourier_error(aux, addr, mag, ma, ma_sum)
-                FUK.error_reduce(addr, err_fourier)
-                FUK.fmag_all_update(aux, addr, mag, ma, err_fourier, pbound)
+                if resample:
+                    aux_tmp1 = np.abs(aux)**2
+                    au.resample(aux_tmp2, aux_tmp1)
+                    FUK.fourier_error(aux_tmp2, addr, mag, ma, ma_sum, aux_is_intensity=True)
+                    FUK.error_reduce(addr, err_fourier)
+                    FUK.fmag_all_update(aux_tmp2, addr, mag, ma, err_fourier, pbound, mult=False)
+                    au.resample(aux_tmp1, aux_tmp2)
+                    aux *= aux_tmp1
+                else:
+                    FUK.fourier_error(aux, addr, mag, ma, ma_sum)
+                    FUK.error_reduce(addr, err_fourier)
+                    FUK.fmag_all_update(aux, addr, mag, ma, err_fourier, pbound)
                 self.benchmark.C_Fourier_update += time.time() - t1
 
                 ## backward FFT
@@ -318,7 +348,12 @@ def engine_iterate(self, num=1):
                 ## build exit wave
                 t1 = time.time()
                 AWK.build_exit(aux, addr, ob, pr, ex, alpha=self.p.alpha)
-                FUK.exit_error(aux,addr)
+                if resample:
+                    aux_tmp1 = np.abs(aux)**2
+                    au.resample(aux_tmp2, aux_tmp1)
+                    FUK.exit_error(aux_tmp2, addr, aux_is_intensity=True)
+                else:
+                    FUK.exit_error(aux,addr)
                 FUK.error_reduce(addr, err_exit)
                 self.benchmark.E_Build_exit += time.time() - t1
 
diff --git a/ptypy/accelerate/base/engines/ML_serial.py b/ptypy/accelerate/base/engines/ML_serial.py
index 214aa0536..21a473fbe 100644
--- a/ptypy/accelerate/base/engines/ML_serial.py
+++ b/ptypy/accelerate/base/engines/ML_serial.py
@@ -21,8 +21,9 @@
 from ptypy.utils import parallel
 from ptypy.engines.utils import Cnorm2, Cdot
 from ptypy.engines import register
-from ptypy.accelerate.base.kernels import GradientDescentKernel, AuxiliaryWaveKernel, PoUpdateKernel
+from ptypy.accelerate.base.kernels import GradientDescentKernel, AuxiliaryWaveKernel, PoUpdateKernel, PositionCorrectionKernel
 from ptypy.accelerate.base import address_manglers
+from ptypy.accelerate.base import array_utils as au
 
 
 __all__ = ['ML_serial']
@@ -90,6 +91,13 @@ def _setup_kernels(self):
             kern.a = np.zeros(ash, dtype=np.complex64)
             kern.b = np.zeros(ash, dtype=np.complex64)
 
+            # create extra array for resampling (if needed)
+            kern.resample = scan.resample > 1
+            if kern.resample:
+                ish = (ash[0],) + tuple(geo.shape//scan.resample)
+                kern.aux_tmp1 = np.zeros(ash, dtype=np.float32)
+                kern.aux_tmp2 = np.zeros(ish, dtype=np.float32)
+
             # setup kernels, one for each SCAN.
             kern.GDK = GradientDescentKernel(aux, nmodes)
             kern.GDK.allocate()
@@ -284,6 +292,13 @@ def prepare(self):
             prep = self.engine.diff_info[d.ID]
             prep.weights = (self.Irenorm * self.engine.ma.S[d.ID].data
                             / (1. / self.Irenorm + d.data)).astype(d.data.dtype)
+            prep.intensity = self.engine.di.S[d.ID].data
+            kern = self.engine.kernels[prep.label]
+            if kern.resample:
+                prep.weights2 = np.zeros((prep.weights.shape[0],) + kern.aux_tmp1.shape[-2:], dtype=prep.weights.dtype)
+                au.resample(prep.weights2, prep.weights)
+                prep.intensity2 = np.zeros((prep.intensity.shape[0],) + kern.aux_tmp1.shape[-2:], dtype=prep.weights.dtype)
+                au.resample(prep.intensity2, prep.intensity)
 
     def __del__(self):
         """
@@ -325,7 +340,8 @@ def new_grad(self):
 
             # get addresses and auxilliary array
             addr = prep.addr
-            w = prep.weights
+            w = prep.weights2
+            I = prep.intensity2
             err_phot = prep.err_phot
             fic = prep.float_intens_coeff
 
@@ -334,7 +350,11 @@ def new_grad(self):
             obg = ob_grad.S[oID].data
             pr = self.engine.pr.S[pID].data
             prg = pr_grad.S[pID].data
-            I = self.engine.di.S[dID].data
+
+            # resampling
+            if kern.resample:
+                aux_tmp1 = kern.aux_tmp1
+                aux_tmp2 = kern.aux_tmp2
 
             # make propagated exit (to buffer)
             AWK.build_aux_no_ex(aux, addr, ob, pr, add=False)
@@ -342,7 +362,14 @@ def new_grad(self):
             # forward prop
             aux[:] = FW(aux)
 
-            GDK.make_model(aux, addr)
+            if kern.resample:
+                aux_tmp1 = np.abs(aux)**2
+                au.resample(aux_tmp2, aux_tmp1)
+                au.resample(aux_tmp1, aux_tmp2)
+                GDK.make_model(aux_tmp1, addr, aux_is_intensity=True)
+            else:
+                GDK.make_model(aux, addr)
+
             if self.p.floating_intensities:
                 GDK.floating_intensity(addr, w, I, fic)
             GDK.main(aux, addr, w, I)
@@ -406,6 +433,7 @@ def poly_line_coeffs(self, c_ob_h, c_pr_h):
             # get addresses and auxilliary array
             addr = prep.addr
             w = prep.weights
+            I = prep.intensity
             fic = prep.float_intens_coeff
 
             # local references
@@ -413,7 +441,11 @@ def poly_line_coeffs(self, c_ob_h, c_pr_h):
             ob_h = c_ob_h.S[oID].data
             pr = self.pr.S[pID].data
             pr_h = c_pr_h.S[pID].data
-            I = self.di.S[dID].data
+
+            # resampling
+            if kern.resample:
+                w = prep.weights2
+                I = prep.intensity2
 
             # make propagated exit (to buffer)
             AWK.build_aux_no_ex(f, addr, ob, pr, add=False)
diff --git a/ptypy/accelerate/base/kernels.py b/ptypy/accelerate/base/kernels.py
index b1f109444..d11d2e77e 100644
--- a/ptypy/accelerate/base/kernels.py
+++ b/ptypy/accelerate/base/kernels.py
@@ -49,9 +49,9 @@ def allocate(self):
         self.npy.fdev = np.zeros(self.fshape, dtype=np.float32)
         self.npy.ferr = np.zeros(self.fshape, dtype=np.float32)
 
-    def fourier_error(self, b_aux, addr, mag, mask, mask_sum):
+    def fourier_error(self, b_aux, addr, mag, mask, mask_sum, aux_is_intensity=False):
         # reference shape (write-to shape)
-        sh = self.fshape
+        sh = b_aux.shape
         # stopper
         maxz = mag.shape[0]
 
@@ -65,7 +65,10 @@ def fourier_error(self, b_aux, addr, mag, mask, mask_sum):
         # build model from complex fourier magnitudes, summing up 
         # all modes incoherently
         tf = aux.reshape(maxz, self.nmodes, sh[1], sh[2])
-        af = np.sqrt((np.abs(tf) ** 2).sum(1))
+        if aux_is_intensity:
+            af = np.sqrt(tf.sum(1))
+        else:
+            af = np.sqrt((np.abs(tf) ** 2).sum(1))
 
         # calculate difference to real data (g_mag)
         fdev[:] = af - mag
@@ -74,9 +77,9 @@ def fourier_error(self, b_aux, addr, mag, mask, mask_sum):
         ferr[:] = mask * np.abs(fdev) ** 2 / mask_sum.reshape((maxz, 1, 1))
         return
 
-    def fourier_deviation(self, b_aux, addr, mag):
+    def fourier_deviation(self, b_aux, addr, mag, aux_is_intensity=False):
         # reference shape (write-to shape)
-        sh = self.fshape
+        sh = b_aux.shape
         # stopper
         maxz = mag.shape[0]
 
@@ -89,7 +92,10 @@ def fourier_deviation(self, b_aux, addr, mag):
         # build model from complex fourier magnitudes, summing up 
         # all modes incoherently
         tf = aux.reshape(maxz, self.nmodes, sh[1], sh[2])
-        af = np.sqrt((np.abs(tf) ** 2).sum(1))
+        if aux_is_intensity:
+            af = np.sqrt(tf.sum(1))
+        else:
+            af = np.sqrt((np.abs(tf) ** 2).sum(1))
 
         # calculate difference to real data (g_mag)
         fdev[:] = af - mag
@@ -97,8 +103,6 @@ def fourier_deviation(self, b_aux, addr, mag):
         return
 
     def error_reduce(self, addr, err_sum):
-        # reference shape (write-to shape)
-        sh = self.fshape
 
         # stopper
         maxz = err_sum.shape[0]
@@ -113,9 +117,9 @@ def error_reduce(self, addr, err_sum):
         err_sum[:] = ferr.sum(-1).sum(-1)
         return
 
-    def fmag_all_update(self, b_aux, addr, mag, mask, err_sum, pbound=0.0):
+    def fmag_all_update(self, b_aux, addr, mag, mask, err_sum, pbound=0.0, mult=True):
 
-        sh = self.fshape
+        sh = b_aux.shape
         nmodes = self.nmodes
 
         # stopper
@@ -153,12 +157,15 @@ def fmag_all_update(self, b_aux, addr, mag, mask, err_sum, pbound=0.0):
 
         #fm[:] = mag / (af + 1e-6)
         # upcasting
-        aux[:] = (aux.reshape(ish[0] // nmodes, nmodes, ish[1], ish[2]) * fm[:, np.newaxis, :, :]).reshape(ish)
+        if mult:
+            aux[:] = (aux.reshape(ish[0] // nmodes, nmodes, ish[1], ish[2]) * fm[:, np.newaxis, :, :]).reshape(ish)
+        else:
+            aux[:] = (np.ones((ish[0] // nmodes, nmodes, ish[1], ish[2])) * fm[:, np.newaxis, :, :]).reshape(ish)
         return
 
-    def fmag_update_nopbound(self, b_aux, addr, mag, mask):
+    def fmag_update_nopbound(self, b_aux, addr, mag, mask, mult=True):
 
-        sh = self.fshape
+        sh = b_aux.shape
         nmodes = self.nmodes
 
         # stopper
@@ -180,12 +187,15 @@ def fmag_update_nopbound(self, b_aux, addr, mag, mask):
         fm[:] = (1 - mask) + mask * mag / (af + self.denom)
 
         # upcasting
-        aux[:] = (aux.reshape(ish[0] // nmodes, nmodes, ish[1], ish[2]) * fm[:, np.newaxis, :, :]).reshape(ish)
+        if mult:
+            aux[:] = (aux.reshape(ish[0] // nmodes, nmodes, ish[1], ish[2]) * fm[:, np.newaxis, :, :]).reshape(ish)
+        else:
+            aux[:] = (np.ones((ish[0] // nmodes, nmodes, ish[1], ish[2])) * fm[:, np.newaxis, :, :]).reshape(ish)
         return
 
-    def log_likelihood(self, b_aux, addr, mag, mask, err_phot):
+    def log_likelihood(self, b_aux, addr, mag, mask, err_phot, aux_is_intensity=False):
         # reference shape (write-to shape)
-        sh = self.fshape
+        sh = b_aux.shape
         # stopper
         maxz = mag.shape[0]
 
@@ -195,7 +205,10 @@ def log_likelihood(self, b_aux, addr, mag, mask, err_phot):
         # build model from complex fourier magnitudes, summing up 
         # all modes incoherently
         tf = aux.reshape(maxz, self.nmodes, sh[1], sh[2])
-        LL = (np.abs(tf) ** 2).sum(1)
+        if aux_is_intensity:
+            LL = tf.sum(1)
+        else:
+            LL = (np.abs(tf) ** 2).sum(1)
 
         # Intensity data
         I = mag**2
@@ -204,7 +217,7 @@ def log_likelihood(self, b_aux, addr, mag, mask, err_phot):
         err_phot[:] = ((mask * (LL - I)**2 / (I + 1.)).sum(-1).sum(-1) /  np.prod(LL.shape[-2:]))
         return
 
-    def exit_error(self, aux, addr):
+    def exit_error(self, aux, addr, aux_is_intensity=False):
         sh = addr.shape
         maxz = sh[0]
 
@@ -212,7 +225,11 @@ def exit_error(self, aux, addr):
         ferr = self.npy.ferr[:maxz]
         dex = aux[:maxz * self.nmodes]
         fsh = dex.shape[-2:]
-        ferr[:] = (np.abs(dex.reshape((maxz,self.nmodes,fsh[0], fsh[1])))**2).sum(axis=1) / np.prod(fsh)
+        tf = dex.reshape((maxz,self.nmodes,fsh[0], fsh[1]))
+        if aux_is_intensity:
+            ferr[:] = (tf).sum(axis=1) / np.prod(fsh)
+        else:
+            ferr[:] = (np.abs(tf)**2).sum(axis=1) / np.prod(fsh)
 
 
 class GradientDescentKernel(BaseKernel):
@@ -256,10 +273,10 @@ def allocate(self):
 
         self.npy.fic_tmp = np.ones((self.fshape[0],), dtype=self.ftype)
 
-    def make_model(self, b_aux, addr):
+    def make_model(self, b_aux, addr, aux_is_intensity=False):
 
         # reference shape (= GPU global dims)
-        sh = self.fshape
+        sh = b_aux.shape
 
         # batch buffers
         Imodel = self.npy.Imodel
@@ -267,7 +284,10 @@ def make_model(self, b_aux, addr):
 
         ## Actual math ## (subset of FUK.fourier_error)
         tf = aux.reshape(sh[0], self.nmodes, sh[1], sh[2])
-        Imodel[:] = (np.abs(tf) ** 2).sum(1)
+        if aux_is_intensity:
+            Imodel[:] = tf.sum(1)
+        else:
+            Imodel[:] = (np.abs(tf) ** 2).sum(1)
 
     def make_a012(self, b_f, b_a, b_b, addr, I, fic):
 
@@ -377,8 +397,8 @@ def main(self, b_aux, addr, w, I):
         ## math ##
         DI = Imodel - I
         tmp = w * DI
-        err[:] = tmp * DI
 
+        err[:] = tmp * DI
         aux[:] = (aux.reshape(ish[0] // nmodes, nmodes, ish[1], ish[2]) * tmp[:, np.newaxis, :, :]).reshape(ish)
         return
 
diff --git a/ptypy/accelerate/cuda_pycuda/array_utils.py b/ptypy/accelerate/cuda_pycuda/array_utils.py
index 85f816223..ed40475db 100644
--- a/ptypy/accelerate/cuda_pycuda/array_utils.py
+++ b/ptypy/accelerate/cuda_pycuda/array_utils.py
@@ -239,6 +239,85 @@ def crop_pad_2d_simple(self, A, B):
         self.fill3D(A, B, offset)
 
 
+class ResampleKernel:
+    def __init__(self, queue=None):
+        self.queue = queue
+        # we lazy-load this depending on the data types we get
+        self.upsample_cuda = {}
+        self.downsample_cuda = {}
+
+    def resample(self, A, B):
+        assert A.ndim > 2, "Arrays must have at least 2 dimensions"
+        assert B.ndim > 2, "Arrays must have at least 2 dimensions"
+        assert A.shape[:-2] == B.shape[:-2], "Arrays must have same shape expect along the last 2 dimensions"
+        assert A.shape[-2] == A.shape[-1], "Last two dimensions must be of equal length"
+        assert B.shape[-2] == B.shape[-2], "Last two dimensions must be of equal length"
+        # same sampling, no need to call this function
+        assert A.shape != B.shape, "A and B have the same shape, no need to do any resampling"
+        assert A.dtype == B.dtype, "A and B must have the same data type"
+
+        def map_type(dt):
+            if dt == np.float32:
+                return 'float'
+            elif dt == np.float64: 
+                return 'double'
+            elif dt == np.complex64: 
+                return 'complex<float>'
+            elif dt == np.complex128: 
+                return 'complex<double>'
+            elif dt == np.int32:
+                return 'int'
+            elif dt == np.int64:
+                return 'long long'
+            else:
+                raise ValueError('No mapping for {}'.format(dt))
+        
+        bx = 16
+        by = 16
+        # upsampling
+        if A.shape[-1] > B.shape[-1]:
+            resample = A.shape[-1] // B.shape[-1]
+            assert resample <= bx, "Upsampling up to factor {} is supported".format(bx)
+            
+            version = '{}'.format(map_type(A.dtype))
+            if version not in self.upsample_cuda:
+                self.upsample_cuda[version] = load_kernel('upsample', {
+                    'TYPE': map_type(A.dtype)
+                })
+            self.upsample_cuda[version](A, B, 
+                np.int32(A.shape[1]),
+                np.int32(A.shape[2]),
+                np.int32(resample),
+                block=(bx, by, 1),
+                grid=(
+                    int((A.shape[2] + bx - 1) // bx), 
+                    int((A.shape[1] + by - 1) // by), 
+                    int(A.shape[0])),
+                shared=int(bx*by*A.dtype.itemsize/resample/resample),
+                stream=self.queue)
+    
+        # downsampling
+        elif A.shape[-1] < B.shape[-1]:
+            resample = B.shape[-1] // A.shape[-1]
+            assert resample <= bx, "Downsampling up to factor {} is supported".format(bx)
+            
+            version = '{}'.format(map_type(A.dtype))
+            if version not in self.downsample_cuda:
+                self.downsample_cuda[version] = load_kernel('downsample', {
+                    'TYPE': map_type(A.dtype)
+                })
+            self.downsample_cuda[version](A, B, 
+                np.int32(B.shape[1]),
+                np.int32(B.shape[2]),
+                np.int32(resample),
+                block=(bx, by, 1),
+                grid=(
+                    int((B.shape[2] + bx - 1) // bx), 
+                    int((B.shape[1] + by - 1) // by), 
+                    int(A.shape[0])),
+                shared=int(bx*by*A.dtype.itemsize),
+                stream=self.queue)
+
 class DerivativesKernel:
     def __init__(self, dtype, queue=None):
         if dtype == np.float32:
diff --git a/ptypy/accelerate/cuda_pycuda/cuda/downsample.cu b/ptypy/accelerate/cuda_pycuda/cuda/downsample.cu
new file mode 100644
index 000000000..7c9544ad3
--- /dev/null
+++ b/ptypy/accelerate/cuda_pycuda/cuda/downsample.cu
@@ -0,0 +1,60 @@
+#include <thrust/complex.h>
+using thrust::complex;
+
+extern "C" __global__ void downsample(TYPE* A,
+                                    const TYPE* B,
+                                    int Brows,
+                                    int Bcols,
+                                    int factor)
+{
+    // shared mem is BDIMx x BDIMy, declared on the call-side
+    extern __shared__ char sh_raw[];
+    auto block = reinterpret_cast<TYPE*>(sh_raw);
+
+    const int tx = threadIdx.x;
+    const int ty = threadIdx.y;
+    const int bx = threadIdx.x + blockIdx.x * blockDim.x;
+    const int by = threadIdx.y + blockIdx.y * blockDim.y;
+
+    // offset A and B to the right point in first dimension
+    A += blockIdx.z * Brows / factor * Bcols / factor;
+    B += blockIdx.z * Brows * Bcols; 
+
+    // read input data into shared memory
+    if (bx < Bcols && by < Brows) {
+        block[ty * blockDim.x + tx] = B[by * Bcols + bx];
+    } else {
+        block[ty * blockDim.x + tx] = TYPE(0);
+    }
+    __syncthreads();
+
+    // reduce across the downsampling area
+    // first in x direction
+    int c = 1;
+    while (c != factor) 
+    {
+        c *= 2;
+        if (tx % c == 0) {
+            block[ty * blockDim.x + tx] += block[ty * blockDim.x + tx + c/2];
+        }
+        __syncthreads();
+    }
+
+    // now in y direction
+    c = 1;
+    while (c != factor) 
+    {
+        c *= 2;
+        if (ty % c == 0) {
+            block[ty * blockDim.x + tx] += block[(ty +c/2) * blockDim.x + tx];
+        }
+        __syncthreads();
+    }
+
+    // now write out
+    if (tx % factor == 0 && ty % factor == 0 && bx < Bcols && by < Brows) {
+        const int ax = bx / factor;
+        const int ay = by / factor;
+        A[ay * Bcols / factor + ax] = block[ty * blockDim.x + tx];
+    }
+}
\ No newline at end of file
diff --git a/ptypy/accelerate/cuda_pycuda/cuda/exit_error.cu b/ptypy/accelerate/cuda_pycuda/cuda/exit_error.cu
index fdac52e46..93671d158 100644
--- a/ptypy/accelerate/cuda_pycuda/cuda/exit_error.cu
+++ b/ptypy/accelerate/cuda_pycuda/cuda/exit_error.cu
@@ -5,22 +5,33 @@ using std::sqrt;
 using thrust::abs;
 using thrust::complex;
 
-// specify max number of threads/block and min number of blocks per SM,
-// to assist the compiler in register optimisations.
-// We achieve a higher occupancy in this case, as less registers are used
-// (guided by profiler)
-extern "C" __global__ void __launch_bounds__(1024, 2)
-    exit_error(int nmodes,
-               const complex<IN_TYPE> * __restrict aux,
-               OUT_TYPE *ferr,
-               const int * __restrict addr,
-               int A,
-               int B)
+// version if input is complex, i.e. we need to calc abs(.)**2
+inline __device__ MATH_TYPE aux_intensity(complex<MATH_TYPE> aux_t) {
+  MATH_TYPE abst = abs(aux_t);
+  return abst * abst; // if we do this manually (real*real +imag*imag)
+                      // we get differences to numpy due to rounding
+}
+
+// version if input is real, so we can just return it
+inline __device__ MATH_TYPE aux_intensity(MATH_TYPE aux_t) {
+  return aux_t;
+}
+
+
+// generic template that works for complex aux, or intensity aux
+template <class AUX_T>
+inline __device__ void exit_error_impl(int nmodes,
+  const AUX_T* __restrict aux,
+  OUT_TYPE *ferr,
+  const int* __restrict addr,
+  int A,
+  int B) 
 {
   int tx = threadIdx.x;
   int ty = threadIdx.y;
   int addr_stride = 15;
   MATH_TYPE denom = A * B;
+  
 
   const int *ea = addr + 6 + (blockIdx.x * nmodes) * addr_stride;
   const int *da = addr + 9 + (blockIdx.x * nmodes) * addr_stride;
@@ -35,13 +46,40 @@ extern "C" __global__ void __launch_bounds__(1024, 2)
       MATH_TYPE acc = 0.0;
       for (int idx = 0; idx < nmodes; ++idx)
       {
-        complex<MATH_TYPE> t_aux = aux[a * B + b + idx * A * B];
-        MATH_TYPE abs_exit_wave = abs(t_aux);
-        acc += abs_exit_wave *
-               abs_exit_wave;  // if we do this manually (real*real +imag*imag)
-                               // we get differences to numpy due to rounding
+        acc += aux_intensity(aux[a * B + b + idx * A * B]);
       }
       ferr[a * B + b] = OUT_TYPE(acc / denom);
     }
   }
 }
+
+// 2 kernels using the same template above - only difference is the input
+// type for aux (and the name)
+
+
+// specify max number of threads/block and min number of blocks per SM,
+// to assist the compiler in register optimisations.
+// We achieve a higher occupancy in this case, as less registers are used
+// (guided by profiler)
+extern "C" __global__ void __launch_bounds__(1024, 2)
+    exit_error(int nmodes,
+               const complex<IN_TYPE>* __restrict aux,
+               OUT_TYPE *ferr,
+               const int * __restrict addr,
+               int A,
+               int B)
+{
+  exit_error_impl(nmodes, aux, ferr, addr, A, B);
+}
+
+
+extern "C" __global__ void __launch_bounds__(1024, 2)
+    exit_error_auxintensity(int nmodes,
+               const IN_TYPE* __restrict aux,
+               OUT_TYPE *ferr,
+               const int * __restrict addr,
+               int A,
+               int B)
+{
+  exit_error_impl(nmodes, aux, ferr, addr, A, B);
+}
\ No newline at end of file
diff --git a/ptypy/accelerate/cuda_pycuda/cuda/fmag_all_update.cu b/ptypy/accelerate/cuda_pycuda/cuda/fmag_all_update.cu
index f8f695ca5..5babcd291 100644
--- a/ptypy/accelerate/cuda_pycuda/cuda/fmag_all_update.cu
+++ b/ptypy/accelerate/cuda_pycuda/cuda/fmag_all_update.cu
@@ -11,15 +11,16 @@
 using std::sqrt;
 using thrust::complex;
 
-extern "C" __global__ void fmag_all_update(complex<OUT_TYPE>* f,
-                                           const IN_TYPE* fmask,
-                                           const IN_TYPE* fmag,
-                                           const IN_TYPE* fdev,
-                                           const IN_TYPE* err_fmag,
-                                           const int* addr_info,
-                                           IN_TYPE pbound_,
-                                           int A,
-                                           int B)
+template <bool doMult, class FType>
+inline __device__ void fmag_all_update_impl(FType* f,
+                                            const IN_TYPE* fmask,
+                                            const IN_TYPE* fmag,
+                                            const IN_TYPE* fdev,
+                                            const IN_TYPE* err_fmag,
+                                            const int* addr_info,
+                                            IN_TYPE pbound_,
+                                            int A,
+                                            int B)
 {
   int batch = blockIdx.x;
   int tx = threadIdx.x;
@@ -55,8 +56,38 @@ extern "C" __global__ void fmag_all_update(complex<OUT_TYPE>* f,
         MATH_TYPE fdevv = fdev[a * A + b];
         MATH_TYPE fm = (MATH_TYPE(1) - m) +
                    m * ((fmagv + fdevv * renorm) / (fmagv + fdevv + MATH_TYPE(1e-7)));
-        f[a * A + b] *= fm;
+        if (doMult)  // compile-time constant, so only one branch ends up in kernel
+          f[a * A + b] *= fm;
+        else
+          f[a * A + b] = fm;
       }
     }
   }
 }
+
+extern "C" __global__ void fmag_all_update(complex<OUT_TYPE>* f,
+                                           const IN_TYPE* fmask,
+                                           const IN_TYPE* fmag,
+                                           const IN_TYPE* fdev,
+                                           const IN_TYPE* err_fmag,
+                                           const int* addr_info,
+                                           IN_TYPE pbound_,
+                                           int A,
+                                           int B)
+{
+  fmag_all_update_impl<true>(f, fmask, fmag, fdev, err_fmag, addr_info, pbound_, A, B);
+}
+
+
+extern "C" __global__ void fmag_all_update_nomult(OUT_TYPE* f,
+                                           const IN_TYPE* fmask,
+                                           const IN_TYPE* fmag,
+                                           const IN_TYPE* fdev,
+                                           const IN_TYPE* err_fmag,
+                                           const int* addr_info,
+                                           IN_TYPE pbound_,
+                                           int A,
+                                           int B)
+{
+  fmag_all_update_impl<false>(f, fmask, fmag, fdev, err_fmag, addr_info, pbound_, A, B);
+}
\ No newline at end of file
diff --git a/ptypy/accelerate/cuda_pycuda/cuda/fmag_update_nopbound.cu b/ptypy/accelerate/cuda_pycuda/cuda/fmag_update_nopbound.cu
index 40a65c172..166e22f4c 100644
--- a/ptypy/accelerate/cuda_pycuda/cuda/fmag_update_nopbound.cu
+++ b/ptypy/accelerate/cuda_pycuda/cuda/fmag_update_nopbound.cu
@@ -11,13 +11,14 @@
 using std::sqrt;
 using thrust::complex;
 
-extern "C" __global__ void fmag_update_nopbound(complex<OUT_TYPE>* f,
-                                                const IN_TYPE* fmask,
-                                                const IN_TYPE* fmag,
-                                                const IN_TYPE* fdev,
-                                                const int* addr_info,
-                                                int A,
-                                                int B)
+template <bool doMult, class FType>
+inline __device__ void fmag_update_nopbound_impl(FType* f,
+                                                 const IN_TYPE* fmask,
+                                                 const IN_TYPE* fmag,
+                                                 const IN_TYPE* fdev,
+                                                 const int* addr_info,
+                                                 int A,
+                                                 int B)
 {
   const int bid = blockIdx.z;
   const int tx = threadIdx.x;
@@ -48,6 +49,32 @@ extern "C" __global__ void fmag_update_nopbound(complex<OUT_TYPE>* f,
     MATH_TYPE fdevv = fdev[a * A + b];
     MATH_TYPE fm =
         (MATH_TYPE(1) - m) + m * (fmagv / (fmagv + fdevv + MATH_TYPE(1e-7)));
-    f[a * A + b] *= fm;
+    if (doMult) // compile-time constant, so only one branch ends up in kernel
+      f[a * A + b] *= fm;
+    else
+      f[a * A + b] = fm;
   }
 }
+
+extern "C" __global__ void fmag_update_nopbound(complex<OUT_TYPE>* f,
+                                                const IN_TYPE* fmask,
+                                                const IN_TYPE* fmag,
+                                                const IN_TYPE* fdev,
+                                                const int* addr_info,
+                                                int A,
+                                                int B)
+{
+  fmag_update_nopbound_impl<true>(f, fmask, fmag, fdev, addr_info, A, B);
+}
+
+
+extern "C" __global__ void fmag_update_nopbound_nomult(OUT_TYPE* f,
+                                                const IN_TYPE* fmask,
+                                                const IN_TYPE* fmag,
+                                                const IN_TYPE* fdev,
+                                                const int* addr_info,
+                                                int A,
+                                                int B)
+{
+  fmag_update_nopbound_impl<false>(f, fmask, fmag, fdev, addr_info, A, B);
+}
\ No newline at end of file
diff --git a/ptypy/accelerate/cuda_pycuda/cuda/fourier_deviation.cu b/ptypy/accelerate/cuda_pycuda/cuda/fourier_deviation.cu
index 3427222c3..5c6526b37 100644
--- a/ptypy/accelerate/cuda_pycuda/cuda/fourier_deviation.cu
+++ b/ptypy/accelerate/cuda_pycuda/cuda/fourier_deviation.cu
@@ -13,13 +13,23 @@ using std::sqrt;
 using thrust::abs;
 using thrust::complex;
 
-// specify max number of threads/block and min number of blocks per SM,
-// to assist the compiler in register optimisations.
-// We achieve a higher occupancy in this case, as less registers are used
-// (guided by profiler)
-extern "C" __global__ void __launch_bounds__(1024, 2)
-    fourier_deviation(int nmodes,
-                      const complex<IN_TYPE> *f,
+// version where input is complex and we need to calculate intensity
+inline __device__ MATH_TYPE intensity(complex<MATH_TYPE> f)
+{
+  MATH_TYPE abst = abs(f);
+  return abst * abst;  // if we do this manually (real*real +imag*imag)
+                       // we get differences to numpy due to rounding
+}
+
+// version where input is already an intensity - just return
+inline __device__ MATH_TYPE intensity(MATH_TYPE f)
+{
+  return f;
+}
+
+template <class AUX_T>
+inline __device__ void fourier_deviation_impl(int nmodes,
+                      const AUX_T *f,
                       const IN_TYPE *fmag,
                       OUT_TYPE *fdev,
                       const int *addr,
@@ -46,13 +56,40 @@ extern "C" __global__ void __launch_bounds__(1024, 2)
     MATH_TYPE acc = MATH_TYPE(0);
     for (int idx = 0; idx < nmodes; ++idx)
     {
-      complex<MATH_TYPE> t_f = f[a * B + b + idx * A * B];
-      MATH_TYPE abs_exit_wave = abs(t_f);
-      acc += abs_exit_wave *
-             abs_exit_wave;  // if we do this manually (real*real +imag*imag)
-                             // we get differences to numpy due to rounding
+      acc += intensity(f[a * B + b + idx * A * B]);
     }
     auto fdevv = sqrt(acc) - MATH_TYPE(fmag[a * B + b]);
     fdev[a * B + b] = fdevv;
   }
 }
+
+// two kernels - one if input is already an intensity and one where it's not
+
+// specify max number of threads/block and min number of blocks per SM,
+// to assist the compiler in register optimisations.
+// We achieve a higher occupancy in this case, as less registers are used
+// (guided by profiler)
+extern "C" __global__ void __launch_bounds__(1024, 2)
+    fourier_deviation(int nmodes,
+                      const complex<IN_TYPE> *f,
+                      const IN_TYPE *fmag,
+                      OUT_TYPE *fdev,
+                      const int *addr,
+                      int A,
+                      int B)
+{
+  fourier_deviation_impl(nmodes, f, fmag, fdev, addr, A, B);
+}
+
+
+extern "C" __global__ void __launch_bounds__(1024, 2)
+    fourier_deviation_auxintensity(int nmodes,
+                      const IN_TYPE *f,
+                      const IN_TYPE *fmag,
+                      OUT_TYPE *fdev,
+                      const int *addr,
+                      int A,
+                      int B)
+{
+  fourier_deviation_impl(nmodes, f, fmag, fdev, addr, A, B);
+}
\ No newline at end of file
diff --git a/ptypy/accelerate/cuda_pycuda/cuda/fourier_error.cu b/ptypy/accelerate/cuda_pycuda/cuda/fourier_error.cu
index ad483c870..93e86c0f7 100644
--- a/ptypy/accelerate/cuda_pycuda/cuda/fourier_error.cu
+++ b/ptypy/accelerate/cuda_pycuda/cuda/fourier_error.cu
@@ -14,13 +14,22 @@ using std::sqrt;
 using thrust::abs;
 using thrust::complex;
 
-// specify max number of threads/block and min number of blocks per SM,
-// to assist the compiler in register optimisations.
-// We achieve a higher occupancy in this case, as less registers are used
-// (guided by profiler)
-extern "C" __global__ void __launch_bounds__(1024, 2)
-    fourier_error(int nmodes,
-                  const complex<IN_TYPE> *f,
+// version if input is complex, i.e. we need to calc abs(.)**2
+inline __device__ MATH_TYPE aux_intensity(complex<MATH_TYPE> aux_t) {
+  MATH_TYPE abst = abs(aux_t);
+  return abst * abst; // if we do this manually (real*real +imag*imag)
+                      // we get differences to numpy due to rounding
+}
+
+// version if input is real, so we can just return it
+inline __device__ MATH_TYPE aux_intensity(MATH_TYPE aux_t) {
+  return aux_t;
+}
+
+template <class AUX_T>
+inline __device__ void fourier_error_impl(
+                  int nmodes,
+                  const AUX_T *f,
                   const IN_TYPE *fmask,
                   const IN_TYPE *fmag,
                   OUT_TYPE *fdev,
@@ -51,11 +60,7 @@ extern "C" __global__ void __launch_bounds__(1024, 2)
       MATH_TYPE acc = MATH_TYPE(0);
       for (int idx = 0; idx < nmodes; ++idx)
       {
-        complex<MATH_TYPE> t_f = f[a * B + b + idx * A * B];
-        MATH_TYPE abs_exit_wave = abs(t_f);
-        acc += abs_exit_wave *
-               abs_exit_wave;  // if we do this manually (real*real +imag*imag)
-                               // we get differences to numpy due to rounding
+        acc += aux_intensity(f[a * B + b + idx * A * B]);
       }
       auto fdevv = sqrt(acc) - MATH_TYPE(fmag[a * B + b]);
       ferr[a * B + b] = (fmask[a * B + b] * fdevv * fdevv) / mask_sum[ma[0]];
@@ -63,3 +68,38 @@ extern "C" __global__ void __launch_bounds__(1024, 2)
     }
   }
 }
+
+// specify max number of threads/block and min number of blocks per SM,
+// to assist the compiler in register optimisations.
+// We achieve a higher occupancy in this case, as less registers are used
+// (guided by profiler)
+extern "C" __global__ void __launch_bounds__(1024, 2)
+    fourier_error(int nmodes,
+                  const complex<IN_TYPE> *f,
+                  const IN_TYPE *fmask,
+                  const IN_TYPE *fmag,
+                  OUT_TYPE *fdev,
+                  OUT_TYPE *ferr,
+                  const IN_TYPE *mask_sum,
+                  const int *addr,
+                  int A,
+                  int B)
+{
+  fourier_error_impl(nmodes, f, fmask, fmag, fdev, ferr, mask_sum, addr, A, B);
+}
+
+
+extern "C" __global__ void __launch_bounds__(1024, 2)
+    fourier_error_auxintensity(int nmodes,
+                  const IN_TYPE *f,
+                  const IN_TYPE *fmask,
+                  const IN_TYPE *fmag,
+                  OUT_TYPE *fdev,
+                  OUT_TYPE *ferr,
+                  const IN_TYPE *mask_sum,
+                  const int *addr,
+                  int A,
+                  int B)
+{
+  fourier_error_impl(nmodes, f, fmask, fmag, fdev, ferr, mask_sum, addr, A, B);
+}
diff --git a/ptypy/accelerate/cuda_pycuda/cuda/log_likelihood.cu b/ptypy/accelerate/cuda_pycuda/cuda/log_likelihood.cu
index 90455b1e2..04fa202a7 100644
--- a/ptypy/accelerate/cuda_pycuda/cuda/log_likelihood.cu
+++ b/ptypy/accelerate/cuda_pycuda/cuda/log_likelihood.cu
@@ -13,13 +13,24 @@ using std::sqrt;
 using thrust::abs;
 using thrust::complex;
 
-// specify max number of threads/block and min number of blocks per SM,
-// to assist the compiler in register optimisations.
-// We achieve a higher occupancy in this case, as less registers are used
-// (guided by profiler)
-extern "C" __global__ void __launch_bounds__(1024, 2)
-    log_likelihood(int nmodes,
-                   complex<OUT_TYPE> *aux,
+// version if input is complex, i.e. we need to calc abs(.)**2
+inline __device__ MATH_TYPE aux_intensity(complex<MATH_TYPE> aux_t) {
+  MATH_TYPE abst = abs(aux_t);
+  return abst * abst; // if we do this manually (real*real +imag*imag)
+                      // we get differences to numpy due to rounding
+}
+
+// version if input is real, so we can just return it
+inline __device__ MATH_TYPE aux_intensity(MATH_TYPE aux_t) {
+  return aux_t;
+}
+
+////////// log_likelihood with 1 thread block per image
+
+template <class AUX_T>
+inline __device__ void log_likelihood_impl(
+                   int nmodes,
+                   AUX_T *aux,
                    const IN_TYPE *fmask,
                    const IN_TYPE *fmag,
                    const int *addr,
@@ -48,11 +59,7 @@ extern "C" __global__ void __launch_bounds__(1024, 2)
       MATH_TYPE acc = 0.0;
       for (int idx = 0; idx < nmodes; ++idx)
       {
-        complex<MATH_TYPE> t_aux = aux[a * B + b + idx * A * B];
-        MATH_TYPE abs_exit_wave = abs(t_aux);
-        acc += abs_exit_wave *
-               abs_exit_wave;  // if we do this manually (real*real +imag*imag)
-                               // we get differences to numpy due to rounding
+        acc += aux_intensity(aux[a * B + b + idx * A * B]);
       }
       auto I = MATH_TYPE(fmag[a * B + b]) * MATH_TYPE(fmag[a * B + b]);
       llerr[a * B + b] =
@@ -61,9 +68,12 @@ extern "C" __global__ void __launch_bounds__(1024, 2)
   }
 }
 
-
-extern "C" __global__ void 
-    log_likelihood2(int nmodes,
+// specify max number of threads/block and min number of blocks per SM,
+// to assist the compiler in register optimisations.
+// We achieve a higher occupancy in this case, as less registers are used
+// (guided by profiler)
+extern "C" __global__ void __launch_bounds__(1024, 2)
+    log_likelihood(int nmodes,
                    complex<OUT_TYPE> *aux,
                    const IN_TYPE *fmask,
                    const IN_TYPE *fmag,
@@ -71,6 +81,34 @@ extern "C" __global__ void
                    IN_TYPE *llerr,
                    int A,
                    int B)
+{
+  log_likelihood_impl(nmodes, aux, fmask, fmag, addr, llerr, A, B);
+}
+
+extern "C" __global__ void __launch_bounds__(1024, 2)
+    log_likelihood_auxintensity(int nmodes,
+                   OUT_TYPE *aux,
+                   const IN_TYPE *fmask,
+                   const IN_TYPE *fmag,
+                   const int *addr,
+                   IN_TYPE *llerr,
+                   int A,
+                   int B)
+{
+  log_likelihood_impl(nmodes, aux, fmask, fmag, addr, llerr, A, B);
+}
+
+/////////////////// version with 1 thread block per x dimension only
+template <class AUX_T>
+__device__ inline void log_likelihood2_impl(
+                   int nmodes,
+                   AUX_T *aux,
+                   const IN_TYPE *fmask,
+                   const IN_TYPE *fmag,
+                   const int *addr,
+                   IN_TYPE *llerr,
+                   int A,
+                   int B)
 {
   int bid = blockIdx.z;
   int tx = threadIdx.x;
@@ -94,14 +132,37 @@ extern "C" __global__ void
     MATH_TYPE acc = 0.0;
     for (int idx = 0; idx < nmodes; ++idx)
     {
-      complex<MATH_TYPE> t_aux = aux[a * B + b + idx * A * B];
-      MATH_TYPE abs_exit_wave = abs(t_aux);
-      acc += abs_exit_wave *
-              abs_exit_wave;  // if we do this manually (real*real +imag*imag)
-                              // we get differences to numpy due to rounding
+      acc += aux_intensity(aux[a * B + b + idx * A * B]);
     }
     auto I = MATH_TYPE(fmag[a * B + b]) * MATH_TYPE(fmag[a * B + b]);
     llerr[a * B + b] =
         MATH_TYPE(fmask[a * B + b]) * (acc - I) * (acc - I) / (I + 1) / norm;
   }
+}
+
+
+extern "C" __global__ void 
+    log_likelihood2(int nmodes,
+                   complex<OUT_TYPE> *aux,
+                   const IN_TYPE *fmask,
+                   const IN_TYPE *fmag,
+                   const int *addr,
+                   IN_TYPE *llerr,
+                   int A,
+                   int B)
+{
+  log_likelihood2_impl(nmodes, aux, fmask, fmag, addr, llerr, A, B);
+}
+
+extern "C" __global__ void 
+    log_likelihood2_auxintensity(int nmodes,
+                   OUT_TYPE *aux,
+                   const IN_TYPE *fmask,
+                   const IN_TYPE *fmag,
+                   const int *addr,
+                   IN_TYPE *llerr,
+                   int A,
+                   int B)
+{
+  log_likelihood2_impl(nmodes, aux, fmask, fmag, addr, llerr, A, B);
 }
\ No newline at end of file
diff --git a/ptypy/accelerate/cuda_pycuda/cuda/upsample.cu b/ptypy/accelerate/cuda_pycuda/cuda/upsample.cu
new file mode 100644
index 000000000..ed4024114
--- /dev/null
+++ b/ptypy/accelerate/cuda_pycuda/cuda/upsample.cu
@@ -0,0 +1,41 @@
+#include <thrust/complex.h>
+using thrust::complex;
+
+extern "C" __global__ void upsample(TYPE* A, 
+                                    const TYPE* B, 
+                                    int Arows, 
+                                    int Acols, 
+                                    int factor)
+{
+  // shared mem is BDIM/factor x BDIM/factor of TYPE, declared on the call-side
+  extern __shared__ char sh_raw[];
+  auto block = reinterpret_cast<TYPE*>(sh_raw);
+
+  const int tx = threadIdx.x;
+  const int ty = threadIdx.y;
+  const int ax = threadIdx.x + blockIdx.x * blockDim.x;
+  const int ay = threadIdx.y + blockIdx.y * blockDim.y;
+
+  // offset A and B to the right point in first dimension
+  A += blockIdx.z * Arows * Acols;
+  B += blockIdx.z * Arows / factor * Acols / factor;
+
+  // read the input data into the shared memory
+  // we use a thread block per output size, so we only need a
+  // subset of the threads to read the data in
+  if (tx < blockDim.x / factor && ty < blockDim.y / factor && ax < Acols &&
+      ay < Arows)
+  {
+    const int tbx = blockIdx.x * blockDim.x / factor + tx;
+    const int tby = blockIdx.y * blockDim.y / factor + ty;
+    block[ty * blockDim.x / factor + tx] =
+        B[tby * Acols / factor + tbx] / (factor * factor);
+  }
+  __syncthreads();
+
+  // now all the threads read from shared mem to output
+  if (ax < Acols && ay < Arows)
+  {
+    A[ay * Acols + ax] = block[(ty / factor * blockDim.x + tx) / factor];
+  }
+}
\ No newline at end of file
diff --git a/ptypy/accelerate/cuda_pycuda/engines/DM_pycuda.py b/ptypy/accelerate/cuda_pycuda/engines/DM_pycuda.py
index 961851072..8e15a1516 100644
--- a/ptypy/accelerate/cuda_pycuda/engines/DM_pycuda.py
+++ b/ptypy/accelerate/cuda_pycuda/engines/DM_pycuda.py
@@ -12,16 +12,17 @@
 import time
 from pycuda import gpuarray
 import pycuda.driver as cuda
+from pycuda.elementwise import ElementwiseKernel
 
 from ptypy import utils as u
 from ptypy.utils.verbose import logger, log
 from ptypy.utils import parallel
 from ptypy.engines import register
 from ptypy.accelerate.base.engines import DM_serial
-from .. import get_context
+from .. import get_context, debug_options
 from ..kernels import FourierUpdateKernel, AuxiliaryWaveKernel, PoUpdateKernel, PositionCorrectionKernel
 from ..kernels import PropagationKernel, RealSupportKernel, FourierSupportKernel
-from ..array_utils import ArrayUtilsKernel, GaussianSmoothingKernel, TransposeKernel, ClipMagnitudesKernel
+from ..array_utils import ArrayUtilsKernel, GaussianSmoothingKernel, ResampleKernel, TransposeKernel, ClipMagnitudesKernel
 from ..mem_utils import make_pagelocked_paired_arrays as mppa
 from ..multi_gpu import get_multi_gpu_communicator
 
@@ -111,9 +112,19 @@ def _setup_kernels(self):
             aux = np.zeros(ash, dtype=np.complex64)
             kern.aux = gpuarray.to_gpu(aux)
 
+            # create extra array for resampling (if needed)
+            kern.resample = scan.resample > 1
+            if kern.resample:
+                ish = (ash[0],) + tuple(geo.shape//scan.resample)
+                kern.aux_tmp1 = np.zeros(ash, dtype=np.float32)
+                kern.aux_tmp2 = np.zeros(ish, dtype=np.float32)
+                aux_f = kern.aux_tmp2 # making sure to pass the correct shapes to FUK 
+            else:
+                aux_f = aux
+
             # setup kernels, one for each SCAN.
             log(4, "Setting up FourierUpdateKernel")
-            kern.FUK = FourierUpdateKernel(aux, nmodes, queue_thread=self.queue)
+            kern.FUK = FourierUpdateKernel(aux_f, nmodes, queue_thread=self.queue)
             kern.FUK.allocate()
 
             log(4, "Setting up PoUpdateKernel")
@@ -135,6 +146,18 @@ def _setup_kernels(self):
             kern.PROP.allocate()
             kern.resolution = geo.resolution[0]
 
+            if kern.resample:
+                logger.info("Setting up Resample kernel")
+                kern.RSMP = ResampleKernel(queue=self.queue)
+
+                logger.info("Setting up abs**2 kernel")
+                kern.ABS_SQR = ElementwiseKernel(
+                    "pycuda::complex<float>* in, float* out",
+                    "float abst = abs(in[i]); out[i] = abst*abst;",
+                    "abs_sqr",
+                    options=debug_options
+                )
+
             if self.do_position_refinement:
                 log(4, "Setting up PositionCorrectionKernel")
                 kern.PCK = PositionCorrectionKernel(aux, nmodes, self.p.position_refinement, geo.resolution, queue_thread=self.queue)
@@ -186,6 +209,11 @@ def engine_prepare(self):
             if self.do_position_refinement:
                 prep.error_state_gpu = gpuarray.empty_like(prep.err_fourier_gpu)
 
+            kern = self.kernels[prep.label]
+            if kern.resample:
+                kern.aux_tmp1_gpu = gpuarray.to_gpu(kern.aux_tmp1)
+                kern.aux_tmp2_gpu = gpuarray.to_gpu(kern.aux_tmp2)
+
     def engine_iterate(self, num=1):
         """
         Compute one iteration.
@@ -206,6 +234,9 @@ def engine_iterate(self, num=1):
                 FUK = kern.FUK
                 AWK = kern.AWK
                 PROP = kern.PROP
+                if kern.resample:
+                    ABS_SQR = kern.ABS_SQR
+                    RSMP = kern.RSMP
 
                 # get addresses and buffers
                 addr = prep.addr_gpu
@@ -217,6 +248,12 @@ def engine_iterate(self, num=1):
                 pbound = self.pbound_scan[prep.label]
                 aux = kern.aux
 
+                # resampling
+                resample = kern.resample
+                if resample:
+                    aux_tmp1 = kern.aux_tmp1_gpu
+                    aux_tmp2 = kern.aux_tmp2_gpu
+
                 # local references
                 ma = self.ma.S[dID].gpu
                 ob = self.ob.S[oID].gpu
@@ -227,7 +264,12 @@ def engine_iterate(self, num=1):
                 if self.p.compute_log_likelihood:
                     AWK.build_aux_no_ex(aux, addr, ob, pr)
                     PROP.fw(aux, aux)
-                    FUK.log_likelihood(aux, addr, mag, ma, err_phot)
+                    if resample:
+                        ABS_SQR(aux, aux_tmp1, stream=self.queue)
+                        RSMP.resample(aux_tmp2, aux_tmp1)
+                        FUK.log_likelihood(aux_tmp2, addr, mag, ma, err_phot, aux_is_intensity=True)
+                    else:
+                        FUK.log_likelihood(aux, addr, mag, ma, err_phot)
 
                 ## build auxilliary wave
                 AWK.build_aux(aux, addr, ob, pr, ex, alpha=self.p.alpha)
@@ -236,16 +278,31 @@ def engine_iterate(self, num=1):
                 PROP.fw(aux, aux)
 
                 ## Deviation from measured data
-                FUK.fourier_error(aux, addr, mag, ma, ma_sum)
-                FUK.error_reduce(addr, err_fourier)
-                FUK.fmag_all_update(aux, addr, mag, ma, err_fourier, pbound)
+                if resample:
+                    ABS_SQR(aux, aux_tmp1, stream=self.queue)
+                    RSMP.resample(aux_tmp2, aux_tmp1)
+                    FUK.fourier_error(aux_tmp2, addr, mag, ma, ma_sum, aux_is_intensity=True)
+                    FUK.error_reduce(addr, err_fourier)
+                    FUK.fmag_all_update(aux_tmp2, addr, mag, ma, err_fourier, pbound, mult=False)
+                    RSMP.resample(aux_tmp1, aux_tmp2)
+                    # aux *= aux_tmp1 , but with stream
+                    aux._elwise_multiply(aux_tmp1, aux, stream=self.queue)
+                else:
+                    FUK.fourier_error(aux, addr, mag, ma, ma_sum)
+                    FUK.error_reduce(addr, err_fourier)
+                    FUK.fmag_all_update(aux, addr, mag, ma, err_fourier, pbound)
 
                 ## backward FFT
                 PROP.bw(aux, aux)
 
                 ## build exit wave
                 AWK.build_exit(aux, addr, ob, pr, ex, alpha=self.p.alpha)
-                FUK.exit_error(aux, addr)
+                if resample:
+                    ABS_SQR(aux, aux_tmp1, stream=self.queue)
+                    RSMP.resample(aux_tmp2, aux_tmp1)
+                    FUK.exit_error(aux_tmp2, addr, aux_is_intensity=True)
+                else:
+                    FUK.exit_error(aux, addr)
                 FUK.error_reduce(addr, err_exit)
 
             parallel.barrier()
diff --git a/ptypy/accelerate/cuda_pycuda/engines/DM_pycuda_stream.py b/ptypy/accelerate/cuda_pycuda/engines/DM_pycuda_stream.py
index 9306475b1..3f28d3c78 100644
--- a/ptypy/accelerate/cuda_pycuda/engines/DM_pycuda_stream.py
+++ b/ptypy/accelerate/cuda_pycuda/engines/DM_pycuda_stream.py
@@ -114,6 +114,10 @@ def engine_prepare(self):
             prep.err_exit_gpu = gpuarray.to_gpu(prep.err_exit)
             if self.do_position_refinement:
                 prep.error_state_gpu = gpuarray.empty_like(prep.err_fourier_gpu)
+            kern = self.kernels[prep.label]
+            if kern.resample:
+                kern.aux_tmp1_gpu = gpuarray.to_gpu(kern.aux_tmp1)
+                kern.aux_tmp2_gpu = gpuarray.to_gpu(kern.aux_tmp2)
             ma = self.ma.S[dID].data.astype(np.float32)
             prep.ma = cuda.pagelocked_empty(ma.shape, ma.dtype, order="C", mem_flags=4)
             prep.ma[:] = ma
@@ -179,10 +183,19 @@ def engine_iterate(self, num=1):
                     FUK = kern.FUK
                     AWK = kern.AWK
                     POK = kern.POK
+                    PROP = kern.PROP
+                    if kern.resample:
+                        ABS_SQR = kern.ABS_SQR
+                        RSMP = kern.RSMP
 
                     pbound = self.pbound_scan[prep.label]
                     aux = kern.aux
-                    PROP = kern.PROP
+
+                    # resampling
+                    resample = kern.resample
+                    if resample:
+                        aux_tmp1 = kern.aux_tmp1_gpu
+                        aux_tmp2 = kern.aux_tmp2_gpu                    
 
                     # get addresses and auxilliary array
                     addr = prep.addr_gpu
@@ -214,6 +227,17 @@ def engine_iterate(self, num=1):
                         if self.p.compute_log_likelihood:
                             AWK.build_aux_no_ex(aux, addr, ob, pr)
                             PROP.fw(aux, aux)
+                            if resample:
+                                ABS_SQR(aux, aux_tmp1, stream=self.queue)
+                                RSMP.resample(aux_tmp2, aux_tmp1)
+                                # synchronize h2d stream with compute stream
+                                self.queue.wait_for_event(ev_mag)
+                                FUK.log_likelihood(aux_tmp2, addr, mag, ma, err_phot, aux_is_intensity=True)
+                            else:
+                                # synchronize h2d stream with compute stream
+                                self.queue.wait_for_event(ev_mag)
+                                FUK.log_likelihood(aux, addr, mag, ma, err_phot)
+
                             # synchronize h2d stream with compute stream
                             self.queue.wait_for_event(ev_mag)
                             FUK.log_likelihood(aux, addr, mag, ma, err_phot)
@@ -226,11 +250,23 @@ def engine_iterate(self, num=1):
                         PROP.fw(aux, aux)
 
                         ## Deviation from measured data
-                        # synchronize h2d stream with compute stream
-                        self.queue.wait_for_event(ev_mag)
-                        FUK.fourier_error(aux, addr, mag, ma, ma_sum)
-                        FUK.error_reduce(addr, err_fourier)
-                        FUK.fmag_all_update(aux, addr, mag, ma, err_fourier, pbound)
+                        if resample:
+                            ABS_SQR(aux, aux_tmp1, stream=self.queue)
+                            RSMP.resample(aux_tmp2, aux_tmp1)
+                            # synchronize h2d stream with compute stream
+                            self.queue.wait_for_event(ev_mag)
+                            FUK.fourier_error(aux_tmp2, addr, mag, ma, ma_sum, aux_is_intensity=True)
+                            FUK.error_reduce(addr, err_fourier)
+                            FUK.fmag_all_update(aux_tmp2, addr, mag, ma, err_fourier, pbound, mult=False)
+                            RSMP.resample(aux_tmp1, aux_tmp2)
+                            # aux *= aux_tmp1 , but with stream
+                            aux._elwise_multiply(aux_tmp1, aux, stream=self.queue)
+                        else:
+                            # synchronize h2d stream with compute stream
+                            self.queue.wait_for_event(ev_mag)
+                            FUK.fourier_error(aux, addr, mag, ma, ma_sum)
+                            FUK.error_reduce(addr, err_fourier)
+                            FUK.fmag_all_update(aux, addr, mag, ma, err_fourier, pbound)
 
                         data_mag.record_done(self.queue, 'compute')
                         data_ma.record_done(self.queue, 'compute')
@@ -238,7 +274,12 @@ def engine_iterate(self, num=1):
                         PROP.bw(aux, aux)
                         ## apply changes
                         AWK.build_exit(aux, addr, ob, pr, ex, alpha=self.p.alpha)
-                        FUK.exit_error(aux, addr)
+                        if resample:
+                            ABS_SQR(aux, aux_tmp1, stream=self.queue)
+                            RSMP.resample(aux_tmp2, aux_tmp1)
+                            FUK.exit_error(aux_tmp2, addr, aux_is_intensity=True)
+                        else:
+                            FUK.exit_error(aux, addr)
                         FUK.error_reduce(addr, err_exit)
 
                     prestr = '%d Iteration (Overlap) #%02d:  ' % (parallel.rank, inner)
diff --git a/ptypy/accelerate/cuda_pycuda/engines/DM_pycuda_streams.py b/ptypy/accelerate/cuda_pycuda/engines/DM_pycuda_streams.py
index d2db342f5..e6b045786 100644
--- a/ptypy/accelerate/cuda_pycuda/engines/DM_pycuda_streams.py
+++ b/ptypy/accelerate/cuda_pycuda/engines/DM_pycuda_streams.py
@@ -182,6 +182,11 @@ def engine_prepare(self):
             prep.err_exit_gpu = gpuarray.to_gpu(prep.err_exit)
             if self.do_position_refinement:
                 prep.error_state_gpu = gpuarray.empty_like(prep.err_fourier_gpu)
+            kern = self.kernels[prep.label]
+            if kern.resample:
+                kern.aux_tmp1_gpu = gpuarray.to_gpu(kern.aux_tmp1)
+                kern.aux_tmp2_gpu = gpuarray.to_gpu(kern.aux_tmp2)
+                
             ma = self.ma.S[dID].data.astype(np.float32)
             prep.ma = cuda.pagelocked_empty(ma.shape, ma.dtype, order="C", mem_flags=4)
             prep.ma[:] = ma   
@@ -290,13 +295,20 @@ def engine_iterate(self, num=1):
                     pbound = self.pbound_scan[prep.label]
                     aux = kern.aux
                     PROP = kern.PROP
-
+                    
                     # set streams
                     queue = streamdata.queue
                     FUK.queue = queue
                     AWK.queue = queue
                     POK.queue = queue
                     PROP.queue = queue
+                    resample = kern.resample
+                    if resample:
+                        ABS_SQR = kern.ABS_SQR
+                        RSMP = kern.RSMP
+                        RSMP.queue = queue
+                        aux_tmp1 = kern.aux_tmp1_gpu
+                        aux_tmp2 = kern.aux_tmp2_gpu
 
                     # get addresses and auxilliary array
                     addr = prep.addr_gpu
@@ -331,7 +343,12 @@ def engine_iterate(self, num=1):
                             t1 = time.time()
                             AWK.build_aux_no_ex(aux, addr, ob, pr)
                             PROP.fw(aux, aux)
-                            FUK.log_likelihood(aux, addr, mag, ma, err_phot)                    
+                            if resample:
+                                ABS_SQR(aux, aux_tmp1, stream=queue)
+                                RSMP.resample(aux_tmp2, aux_tmp1)
+                                FUK.log_likelihood(aux_tmp2, addr, mag, ma, err_phot, aux_is_intensity=True)
+                            else:
+                                FUK.log_likelihood(aux, addr, mag, ma, err_phot)                    
                             self.benchmark.F_LLerror += time.time() - t1
 
                         ## prep + forward FFT
@@ -345,9 +362,19 @@ def engine_iterate(self, num=1):
 
                         ## Deviation from measured data
                         t1 = time.time()
-                        FUK.fourier_error(aux, addr, mag, ma, ma_sum)
-                        FUK.error_reduce(addr, err_fourier)
-                        FUK.fmag_all_update(aux, addr, mag, ma, err_fourier, pbound)
+                        if resample:
+                            ABS_SQR(aux, aux_tmp1, stream=queue)
+                            RSMP.resample(aux_tmp2, aux_tmp1)
+                            FUK.fourier_error(aux_tmp2, addr, mag, ma, ma_sum, aux_is_intensity=True)
+                            FUK.error_reduce(addr, err_fourier)
+                            FUK.fmag_all_update(aux_tmp2, addr, mag, ma, err_fourier, pbound, mult=False)
+                            RSMP.resample(aux_tmp1, aux_tmp2)
+                            # aux *= aux_tmp1 , but with stream
+                            aux._elwise_multiply(aux_tmp1, aux, stream=queue)
+                        else:
+                            FUK.fourier_error(aux, addr, mag, ma, ma_sum)
+                            FUK.error_reduce(addr, err_fourier)
+                            FUK.fmag_all_update(aux, addr, mag, ma, err_fourier, pbound)
                         self.benchmark.C_Fourier_update += time.time() - t1
                         streamdata.record_done_ma(dID)
 
@@ -356,7 +383,12 @@ def engine_iterate(self, num=1):
                         PROP.bw(aux, aux)
                         ## apply changes
                         AWK.build_exit(aux, addr, ob, pr, ex, alpha=self.p.alpha)
-                        FUK.exit_error(aux, addr)
+                        if resample:
+                            ABS_SQR(aux, aux_tmp1, stream=queue)
+                            RSMP.resample(aux_tmp2, aux_tmp1)
+                            FUK.exit_error(aux_tmp2, addr, aux_is_intensity=True)
+                        else:
+                            FUK.exit_error(aux, addr)
                         FUK.error_reduce(addr, err_exit)
                         self.benchmark.E_Build_exit += time.time() - t1
                         
diff --git a/ptypy/accelerate/cuda_pycuda/kernels.py b/ptypy/accelerate/cuda_pycuda/kernels.py
index 47dd4cb79..da0c7848d 100644
--- a/ptypy/accelerate/cuda_pycuda/kernels.py
+++ b/ptypy/accelerate/cuda_pycuda/kernels.py
@@ -162,18 +162,26 @@ def __init__(self, aux, nmodes=1, queue_thread=None, accumulate_type='float', ma
         self.accumulate_type = accumulate_type
         self.math_type = math_type
         self.queue = queue_thread
-        self.fmag_all_update_cuda = load_kernel("fmag_all_update", {
-            'IN_TYPE': 'float',
-            'OUT_TYPE': 'float',
-            'MATH_TYPE': self.math_type
-        })
+        self.fmag_all_update_cuda, self.fmag_all_update_nomult_cuda = load_kernel(
+            ["fmag_all_update", "fmag_all_update_nomult"], 
+            {
+                'IN_TYPE': 'float',
+                'OUT_TYPE': 'float',
+                'MATH_TYPE': self.math_type
+            }, 
+            "fmag_all_update.cu")
         self.fmag_update_nopbound_cuda = None
+        self.fmag_update_nopbound_nomult_cuda = None
         self.fourier_deviation_cuda = None
-        self.fourier_error_cuda = load_kernel("fourier_error", {
-            'IN_TYPE': 'float',
-            'OUT_TYPE': 'float',
-            'MATH_TYPE': self.math_type
-        })
+        self.fourier_deviation_auxintensity_cuda = None
+        self.fourier_error_cuda, self.fourier_error_auxintensity_cuda = load_kernel(
+            ["fourier_error", "fourier_error_auxintensity"], 
+            {
+                'IN_TYPE': 'float',
+                'OUT_TYPE': 'float',
+                'MATH_TYPE': self.math_type
+            },
+            "fourier_error.cu")
         self.fourier_error2_cuda = None
         self.error_reduce_cuda = load_kernel("error_reduce", {
             'IN_TYPE': 'float',
@@ -183,18 +191,23 @@ def __init__(self, aux, nmodes=1, queue_thread=None, accumulate_type='float', ma
             'BDIM_Y': 32,
         })
         self.fourier_update_cuda = None
-        self.log_likelihood_cuda, self.log_likelihood2_cuda = load_kernel(
-            ("log_likelihood", "log_likelihood2"), {
+        self.log_likelihood_cuda, self.log_likelihood_auxintensity_cuda, \
+            self.log_likelihood2_cuda, self.log_likelihood2_auxintensity_cuda = \
+            load_kernel(
+                ("log_likelihood", "log_likelihood_auxintensity", "log_likelihood2", "log_likelihood2_auxintensity"), {
                 'IN_TYPE': 'float',
                 'OUT_TYPE': 'float',
                 'MATH_TYPE': self.math_type
             },
             "log_likelihood.cu")
-        self.exit_error_cuda = load_kernel("exit_error", {
-            'IN_TYPE': 'float',
-            'OUT_TYPE': 'float',
-            'MATH_TYPE': self.math_type
-        })
+        self.exit_error_cuda, self.exit_error_auxintensity_cuda = load_kernel(
+            ["exit_error", "exit_error_auxintensity"], 
+            {
+                'IN_TYPE': 'float',
+                'OUT_TYPE': 'float',
+                'MATH_TYPE': self.math_type
+            },
+            "exit_error.cu")
 
         self.gpu = Adict()
         self.gpu.fdev = None
@@ -204,24 +217,26 @@ def allocate(self):
         self.gpu.fdev = gpuarray.zeros(self.fshape, dtype=np.float32)
         self.gpu.ferr = gpuarray.zeros(self.fshape, dtype=np.float32)
 
-    def fourier_error(self, f, addr, fmag, fmask, mask_sum):
+    def fourier_error(self, f, addr, fmag, fmask, mask_sum, aux_is_intensity=False):
+        assert (aux_is_intensity and f.dtype == np.float32) or (not aux_is_intensity and f.dtype == np.complex64)
         fdev = self.gpu.fdev
         ferr = self.gpu.ferr
         if True:
+            kernel = self.fourier_error_cuda if not aux_is_intensity else self.fourier_error_auxintensity_cuda
             # version going over all modes in a single thread (faster)
-            self.fourier_error_cuda(np.int32(self.nmodes),
-                                    f,
-                                    fmask,
-                                    fmag,
-                                    fdev,
-                                    ferr,
-                                    mask_sum,
-                                    addr,
-                                    np.int32(self.fshape[1]),
-                                    np.int32(self.fshape[2]),
-                                    block=(32, 32, 1),
-                                    grid=(int(fmag.shape[0]), 1, 1),
-                                    stream=self.queue)
+            kernel(np.int32(self.nmodes),
+                   f,
+                   fmask,
+                   fmag,
+                   fdev,
+                   ferr,
+                   mask_sum,
+                   addr,
+                   np.int32(self.fshape[1]),
+                   np.int32(self.fshape[2]),
+                   block=(32, 32, 1),
+                   grid=(int(fmag.shape[0]), 1, 1),
+                   stream=self.queue)
         else:
             # version using one thread per mode + shared mem reduction (slower)
             if self.fourier_error2_cuda is None:
@@ -249,26 +264,31 @@ def fourier_error(self, f, addr, fmag, fmask, mask_sum):
                                      shared=int(bx*by*bz*4),
                                      stream=self.queue)
 
-    def fourier_deviation(self, f, addr, fmag):
+    def fourier_deviation(self, f, addr, fmag, aux_is_intensity=False):
+        assert (aux_is_intensity and f.dtype == np.float32) or (not aux_is_intensity and f.dtype == np.complex64)
         fdev = self.gpu.fdev
         if self.fourier_deviation_cuda is None:
-            self.fourier_deviation_cuda = load_kernel("fourier_deviation",{
-                'IN_TYPE': 'float',
-                'OUT_TYPE': 'float',
-                'MATH_TYPE': self.math_type
-            })
+            self.fourier_deviation_cuda, self.fourier_deviation_auxintensity_cuda = load_kernel(
+                ["fourier_deviation", "fourier_deviation_auxintensity"],
+                {
+                    'IN_TYPE': 'float',
+                    'OUT_TYPE': 'float',
+                    'MATH_TYPE': self.math_type
+                },
+                "fourier_deviation.cu")
+        kernel = self.fourier_deviation_auxintensity_cuda if aux_is_intensity else self.fourier_deviation_cuda
         bx = 64
         by = 1
-        self.fourier_deviation_cuda(np.int32(self.nmodes),
-                                f,
-                                fmag,
-                                fdev,
-                                addr,
-                                np.int32(self.fshape[1]),
-                                np.int32(self.fshape[2]),
-                                block=(bx, by, 1),
-                                grid=(1, int((self.fshape[2] + by - 1)//by), int(fmag.shape[0])),
-                                stream=self.queue)
+        kernel(np.int32(self.nmodes),
+               f,
+               fmag,
+               fdev,
+               addr,
+               np.int32(self.fshape[1]),
+               np.int32(self.fshape[2]),
+               block=(bx, by, 1),
+               grid=(1, int((self.fshape[2] + by - 1)//by), int(fmag.shape[0])),
+               stream=self.queue)
 
 
     def error_reduce(self, addr, err_sum):
@@ -280,43 +300,50 @@ def error_reduce(self, addr, err_sum):
                                grid=(int(err_sum.shape[0]), 1, 1),
                                stream=self.queue)
 
-    def fmag_all_update(self, f, addr, fmag, fmask, err_fmag, pbound=0.0):
+    def fmag_all_update(self, f, addr, fmag, fmask, err_fmag, pbound=0.0, mult=True):
+        assert (mult and f.dtype == np.complex64) or (not mult and f.dtype == np.float32)
+
         fdev = self.gpu.fdev
-        self.fmag_all_update_cuda(f,
-                                  fmask,
-                                  fmag,
-                                  fdev,
-                                  err_fmag,
-                                  addr,
-                                  np.float32(pbound),
-                                  np.int32(self.fshape[1]),
-                                  np.int32(self.fshape[2]),
-                                  block=(32, 32, 1),
-                                  grid=(int(fmag.shape[0]*self.nmodes), 1, 1),
-                                  stream=self.queue)
+        kernel = self.fmag_all_update_cuda if mult else self.fmag_all_update_nomult_cuda
+        kernel(f,
+               fmask,
+               fmag,
+               fdev,
+               err_fmag,
+               addr,
+               np.float32(pbound),
+               np.int32(self.fshape[1]),
+               np.int32(self.fshape[2]),
+               block=(32, 32, 1),
+               grid=(int(fmag.shape[0]*self.nmodes), 1, 1),
+               stream=self.queue)
     
-    def fmag_update_nopbound(self, f, addr, fmag, fmask):
+    def fmag_update_nopbound(self, f, addr, fmag, fmask, mult=True):
+        assert (mult and f.dtype == np.complex64) or (not mult and f.dtype == np.float32)
         fdev = self.gpu.fdev
         bx = 64
         by = 1
         if self.fmag_update_nopbound_cuda is None:
-            self.fmag_update_nopbound_cuda = load_kernel("fmag_update_nopbound", {
-                'IN_TYPE': 'float',
-                'OUT_TYPE': 'float',
-                'MATH_TYPE': self.math_type
-            })
-        self.fmag_update_nopbound_cuda(f,
-                                  fmask,
-                                  fmag,
-                                  fdev,
-                                  addr,
-                                  np.int32(self.fshape[1]),
-                                  np.int32(self.fshape[2]),
-                                  block=(bx, by, 1),
-                                  grid=(1, 
-                                    int((self.fshape[2] + by - 1) // by), 
-                                    int(fmag.shape[0]*self.nmodes)),
-                                  stream=self.queue)
+            self.fmag_update_nopbound_cuda, self.fmag_update_nopbound_nomult_cuda = load_kernel(
+                ["fmag_update_nopbound","fmag_update_nopbound_nomult"], 
+                {
+                    'IN_TYPE': 'float',
+                    'OUT_TYPE': 'float',
+                    'MATH_TYPE': self.math_type
+                }, "fmag_update_nopbound.cu")
+        kernel = self.fmag_update_nopbound_cuda if mult else self.fmag_update_nopbound_nomult_cuda
+        kernel(f,
+               fmask,
+               fmag,
+               fdev,
+               addr,
+               np.int32(self.fshape[1]),
+               np.int32(self.fshape[2]),
+               block=(bx, by, 1),
+               grid=(1, 
+                  int((self.fshape[2] + by - 1) // by), 
+                  int(fmag.shape[0]*self.nmodes)),
+               stream=self.queue)
 
     # Note: this was a test to join the kernels, but it's > 2x slower!
     def fourier_update(self, f, addr, fmag, fmask, mask_sum, err_fmag, pbound=0):
@@ -350,53 +377,61 @@ def fourier_update(self, f, addr, fmag, fmask, mask_sum, err_fmag, pbound=0):
                                  shared=smem,
                                  stream=self.queue)
 
-    def log_likelihood(self, b_aux, addr, mag, mask, err_phot):
+    def log_likelihood(self, b_aux, addr, mag, mask, err_phot, aux_is_intensity=False):
+        assert (aux_is_intensity and b_aux.dtype == np.float32) or (not aux_is_intensity and b_aux.dtype == np.complex64)
         ferr = self.gpu.ferr
-        self.log_likelihood_cuda(np.int32(self.nmodes),
-                                 b_aux,
-                                 mask,
-                                 mag,
-                                 addr,
-                                 ferr,
-                                 np.int32(self.fshape[1]),
-                                 np.int32(self.fshape[2]),
-                                 block=(32, 32, 1),
-                                 grid=(int(mag.shape[0]), 1, 1),
-                                 stream=self.queue)
+        kernel = self.log_likelihood_cuda if not aux_is_intensity else self.log_likelihood_auxintensity_cuda
+        kernel(np.int32(self.nmodes),
+               b_aux,
+               mask,
+               mag,
+               addr,
+               ferr,
+               np.int32(self.fshape[1]),
+               np.int32(self.fshape[2]),
+               block=(32, 32, 1),
+               grid=(int(mag.shape[0]), 1, 1),
+               stream=self.queue)
         # TODO: we might want to move this call outside of here
         self.error_reduce(addr, err_phot)
 
-    def log_likelihood2(self, b_aux, addr, mag, mask, err_phot):
+    def log_likelihood2(self, b_aux, addr, mag, mask, err_phot, aux_is_intensity=False):
+        assert (aux_is_intensity and b_aux.dtype == np.float32) or (not aux_is_intensity and b_aux.dtype == np.complex64)
         ferr = self.gpu.ferr
         bx = 64
         by = 1
-        self.log_likelihood2_cuda(np.int32(self.nmodes),
-                                 b_aux,
-                                 mask,
-                                 mag,
-                                 addr,
-                                 ferr,
-                                 np.int32(self.fshape[1]),
-                                 np.int32(self.fshape[2]),
-                                 block=(bx, by, 1),
-                                 grid=(1, int((self.fshape[1] + by - 1) // by), int(mag.shape[0])),
-                                 stream=self.queue)
+        kernel = self.log_likelihood2_cuda if not aux_is_intensity else self.log_likelihood2_auxintensity_cuda
+        kernel(np.int32(self.nmodes),
+               b_aux,
+               mask,
+               mag,
+               addr,
+               ferr,
+               np.int32(self.fshape[1]),
+               np.int32(self.fshape[2]),
+               block=(bx, by, 1),
+               grid=(1, int((self.fshape[1] + by - 1) // by), int(mag.shape[0])),
+               stream=self.queue)
         # TODO: we might want to move this call outside of here
         self.error_reduce(addr, err_phot)
 
-    def exit_error(self, aux, addr):
+    def exit_error(self, aux, addr, aux_is_intensity=False):
+        assert (aux_is_intensity and aux.dtype == np.float32) or \
+            (not aux_is_intensity and aux.dtype == np.complex64)
+
         sh = addr.shape
         maxz = sh[0]
         ferr = self.gpu.ferr
-        self.exit_error_cuda(np.int32(self.nmodes),
-                             aux,
-                             ferr,
-                             addr,
-                             np.int32(self.fshape[1]),
-                             np.int32(self.fshape[2]),
-                             block=(32, 32, 1),
-                             grid=(int(maxz), 1, 1),
-                             stream=self.queue)
+        kernel = self.exit_error_auxintensity_cuda if aux_is_intensity else self.exit_error_cuda
+        kernel(np.int32(self.nmodes),
+               aux,
+               ferr,
+               addr,
+               np.int32(self.fshape[1]),
+               np.int32(self.fshape[2]),
+               block=(32, 32, 1),
+               grid=(int(maxz), 1, 1),
+               stream=self.queue)
 
     # DEPRECATED?
     def execute(self, kernel_name=None, compare=False, sync=False):
diff --git a/ptypy/core/geometry.py b/ptypy/core/geometry.py
index 05a2b630e..f2f67a731 100644
--- a/ptypy/core/geometry.py
+++ b/ptypy/core/geometry.py
@@ -386,7 +386,7 @@ def upsample(self, c):
         """
         if self.resample == 1:
             return c
-        return u.zoom(c, self.resample, order=0) / (self.resample**2)
+        return u.repeat_2d(c, self.resample)[0] / (self.resample**2)
 
     def downsample(self, a):
         """
@@ -394,7 +394,7 @@ def downsample(self, a):
         """
         if self.resample == 1:
             return a
-        return u.rebin_2d(a, self.resample)[0]
+        return u.rebin_2d(a, self.resample)[0] * (self.resample**2)
 
     def __str__(self):
         keys = list(self.p.keys())
diff --git a/ptypy/utils/array_utils.py b/ptypy/utils/array_utils.py
index a6dc3ede9..0942f3f83 100644
--- a/ptypy/utils/array_utils.py
+++ b/ptypy/utils/array_utils.py
@@ -15,7 +15,7 @@
 __all__ = ['grids', 'switch_orientation', 'mirror',
            'crop_pad_symmetric_2d', 'crop_pad_axis', 'crop_pad',
            'pad_lr', 'zoom', 'shift_zoom', 'c_zoom',
-           'rebin', 'rebin_2d', 'rectangle', 'ellipsis']
+           'rebin', 'rebin_2d', 'repeat_2d', 'rectangle', 'ellipsis']
 
 
 def switch_orientation(A, orientation, center=None):
@@ -105,6 +105,33 @@ def rebin_2d(A, rebin=1):
     else:
         return A.reshape(-1, newdim[0], rebin, newdim[1], rebin).mean(-1).mean(-2)
 
+def repeat_2d(A, repeat=1):
+    """
+    Repeats array `A` symmetrically along the last 2 axes
+    with factor `repeat`.
+
+    Parameters
+    ----------
+    A : array-like
+        input array, must be at least two-dimensional.
+
+    repeat : int
+        repeat factor, ``repeat=2`` means that one pixel will be repeated 
+        into a square of 4 pixels.
+
+    Returns
+    -------
+    out : ndarray
+        repeated array. Note that data type casting follows numpy rules, so that boolean arrays are converted to int.
+
+    See also
+    --------
+    rebin_2d
+    """
+    sh = np.asarray(A.shape[-2:])
+    newdim = sh * repeat
+    assert (sh % repeat == 0).all(), "Last two axes %s of input array `A` cannot be repeated by %s" % (str(tuple(sh)), str(repeat))
+    return np.repeat(np.repeat(A.reshape((-1,sh[0],1,sh[1],1)),repeat, axis=-4),repeat,axis=-2).reshape(-1,newdim[0],newdim[1])
 
 def crop_pad_symmetric_2d(A, newshape, center=None):
     """
diff --git a/templates/minimal_prep_and_run_resample_DM.py b/templates/minimal_prep_and_run_resample_DM.py
index b06281223..570d166e2 100644
--- a/templates/minimal_prep_and_run_resample_DM.py
+++ b/templates/minimal_prep_and_run_resample_DM.py
@@ -48,3 +48,4 @@
 # prepare and run
 P = Ptycho(p,level=4)
 P.run()
+P.finalize()
diff --git a/templates/minimal_prep_and_run_resample_DM_serial.py b/templates/minimal_prep_and_run_resample_DM_serial.py
new file mode 100644
index 000000000..91b037441
--- /dev/null
+++ b/templates/minimal_prep_and_run_resample_DM_serial.py
@@ -0,0 +1,53 @@
+"""
+This script is a test for ptychographic reconstruction in the absence
+of actual data. It uses the test Scan class
+`ptypy.core.data.MoonFlowerScan` to provide "data".
+"""
+
+from ptypy.core import Ptycho
+from ptypy import utils as u
+from ptypy.accelerate.base.engines import DM_serial
+
+p = u.Param()
+# for verbose output
+p.verbose_level = 3
+p.frames_per_block = 200
+
+# set home path
+p.io = u.Param()
+p.io.home = "/tmp/ptypy/"
+p.io.autosave = u.Param(active=False)
+
+# max 200 frames (128x128px) of diffraction data
+p.scans = u.Param()
+p.scans.MF = u.Param()
+# now you have to specify which ScanModel to use with scans.XX.name,
+# just as you have to give 'name' for engines and PtyScan subclasses.
+p.scans.MF.name = 'Vanilla' # or 'Full'
+p.scans.MF.data= u.Param()
+p.scans.MF.data.name = 'MoonFlowerScan'
+p.scans.MF.data.shape = 128
+p.scans.MF.data.num_frames = 200
+p.scans.MF.data.save = None
+
+# position distance in fraction of illumination frame
+p.scans.MF.data.density = 0.2
+# total number of photon in empty beam
+p.scans.MF.data.photons = 1e8
+# Gaussian FWHM of possible detector blurring
+p.scans.MF.data.psf = 0.
+
+# Resample by a factor of 2
+p.scans.MF.resample = 2
+
+# attach a reconstrucion engine
+p.engines = u.Param()
+p.engines.engine00 = u.Param()
+p.engines.engine00.name = 'DM_serial'
+p.engines.engine00.numiter = 100
+p.engines.engine00.probe_support = 0.05
+
+# prepare and run
+P = Ptycho(p,level=4)
+P.run()
+P.finalize()
diff --git a/templates/minimal_prep_and_run_resample_ML.py b/templates/minimal_prep_and_run_resample_ML.py
index 2edbb8bcc..0ebfc05ad 100644
--- a/templates/minimal_prep_and_run_resample_ML.py
+++ b/templates/minimal_prep_and_run_resample_ML.py
@@ -56,4 +56,5 @@
 # prepare and run
 P = Ptycho(p,level=4)
 P.run()
+P.finalize()
 
diff --git a/templates/minimal_prep_and_run_resample_ML_serial.py b/templates/minimal_prep_and_run_resample_ML_serial.py
new file mode 100644
index 000000000..926917089
--- /dev/null
+++ b/templates/minimal_prep_and_run_resample_ML_serial.py
@@ -0,0 +1,61 @@
+"""
+This script is a test for ptychographic reconstruction in the absence
+of actual data. It uses the test Scan class
+`ptypy.core.data.MoonFlowerScan` to provide "data".
+"""
+#import ptypy
+from ptypy.core import Ptycho
+from ptypy import utils as u
+from ptypy.accelerate.base.engines import ML_serial
+
+p = u.Param()
+
+# for verbose output
+p.verbose_level = 4
+p.frames_per_block = 100
+
+# set home path
+p.io = u.Param()
+p.io.home = "/tmp/ptypy/"
+p.io.autosave = u.Param(active=False)
+#p.io.autoplot = u.Param()
+#p.io.autoplot.dump = True
+#p.io.autoplot = False
+
+# max 100 frames (128x128px) of diffraction data
+p.scans = u.Param()
+p.scans.MF = u.Param()
+p.scans.MF.name = 'Full'
+p.scans.MF.data= u.Param()
+p.scans.MF.data.name = 'MoonFlowerScan'
+p.scans.MF.data.shape = 128
+p.scans.MF.data.num_frames = 100
+p.scans.MF.data.save = None
+
+# position distance in fraction of illumination frame
+p.scans.MF.data.density = 0.2
+# total number of photon in empty beam
+p.scans.MF.data.photons = 1e8
+# Gaussian FWHM of possible detector blurring
+p.scans.MF.data.psf = 0.
+
+# Resample by a factor of 2
+p.scans.MF.resample = 2
+
+p.engines = u.Param()
+p.engines.engine01 = u.Param()
+p.engines.engine01.name = 'ML_serial'
+p.engines.engine01.reg_del2 = True                  # Whether to use a Gaussian prior (smoothing) regularizer
+p.engines.engine01.reg_del2_amplitude = 1.             # Amplitude of the Gaussian prior if used
+p.engines.engine01.scale_precond = True
+p.engines.engine01.scale_probe_object = 1.
+p.engines.engine01.smooth_gradient = 20.
+p.engines.engine01.smooth_gradient_decay = 1/50.
+p.engines.engine01.floating_intensities = True
+p.engines.engine01.numiter = 300
+
+# prepare and run
+P = Ptycho(p,level=4)
+P.run()
+P.finalize()
+
diff --git a/test/accelerate_tests/base_tests/array_utils_test.py b/test/accelerate_tests/base_tests/array_utils_test.py
index b1cac58fe..158656456 100644
--- a/test/accelerate_tests/base_tests/array_utils_test.py
+++ b/test/accelerate_tests/base_tests/array_utils_test.py
@@ -291,6 +291,69 @@ def test_crop_pad_3(self):
                          dtype=np.complex64)
         np.testing.assert_array_almost_equal(A, exp_A)
 
+    def test_downsample_factor_2(self):
+        # downsample, complex, 3D
+        B = np.indices((4,4), dtype=np.complex64)
+        B = np.indices((4,4), dtype=np.complex64) + 1j * B[::-1, :, :]
+        A = np.zeros((2,2,2), dtype=B.dtype)
+        au.resample(A,B)
+        exp_A = np.array([[[ 2. +2.j,  2.+10.j],
+                           [10. +2.j, 10.+10.j]],
+                          [[ 2. +2.j, 10.+ 2.j],
+                           [ 2.+10.j, 10.+10.j]]], dtype=np.complex64)
+        np.testing.assert_almost_equal(A.sum(), B.sum())            
+        np.testing.assert_array_almost_equal(A, exp_A)
+
+    def test_upsample_factor_2(self):
+        # upsample, complex, 3D
+        B = np.indices((2,2), dtype=np.complex64)
+        B = np.indices((2,2), dtype=np.complex64) + 1j * B[::-1, :, :]
+        A = np.zeros((2,4,4), dtype=B.dtype)
+        au.resample(A,B)
+        exp_A = np.array([[[0.  +0.j  , 0.  +0.j  , 0.  +0.25j, 0.  +0.25j],
+                           [0.  +0.j  , 0.  +0.j  , 0.  +0.25j, 0.  +0.25j],
+                           [0.25+0.j  , 0.25+0.j  , 0.25+0.25j, 0.25+0.25j],
+                           [0.25+0.j  , 0.25+0.j  , 0.25+0.25j, 0.25+0.25j]],
+                          [[0.  +0.j  , 0.  +0.j  , 0.25+0.j  , 0.25+0.j  ],
+                           [0.  +0.j  , 0.  +0.j  , 0.25+0.j  , 0.25+0.j  ],
+                           [0.  +0.25j, 0.  +0.25j, 0.25+0.25j, 0.25+0.25j],
+                           [0.  +0.25j, 0.  +0.25j, 0.25+0.25j, 0.25+0.25j]]], dtype=np.complex64)
+        np.testing.assert_almost_equal(A.sum(), B.sum())
+        np.testing.assert_array_almost_equal(A, exp_A)
+
+    def test_downsample_factor_4(self):
+        # downsample, complex, 3D
+        B = np.indices((8,8), dtype=np.complex64)
+        B = np.indices((8,8), dtype=np.complex64) + 1j * B[::-1, :, :]
+        A = np.zeros((2,2,2), dtype=B.dtype)
+        au.resample(A,B)
+        exp_A = np.array([[[24.+24.j, 24.+88.j],
+                           [88.+24.j, 88.+88.j]],
+                          [[24.+24.j, 88.+24.j],
+                           [24.+88.j, 88.+88.j]]], dtype=np.complex64)
+        np.testing.assert_almost_equal(A.sum(), B.sum())
+        np.testing.assert_array_almost_equal(A, exp_A)
+
+    def test_upsample_factor_4(self):
+        # upsample, complex, 3D
+        Bshape = (2,4,4)
+        B = np.reshape(np.arange(0, np.prod(Bshape)), Bshape).astype(np.complex64) * 16
+        B = B + 1j * B
+        A = np.zeros((Bshape[0], Bshape[1]*4, Bshape[2]*4), dtype=B.dtype)
+        au.resample(A, B)
+        exp_A = np.zeros_like(A)
+        
+        # building the expected value element-wise, to ensure correctness
+        for z in range(A.shape[0]):
+            for y in range(A.shape[1]):
+                for x in range(A.shape[2]):
+                    exp_A[z, y, x] = B[z, y//4, x//4] / 16
+                    if np.abs(A[z, y, x]-exp_A[z, y, x]) > 1e-6:
+                        print("mismatch! i=({},{},{}): act={}, exp={}".format(z, y, x, A[z, y, x], exp_A[z, y, x]))
+
+        np.testing.assert_almost_equal(A.sum(), B.sum())
+        np.testing.assert_array_almost_equal(A, exp_A)
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/test/accelerate_tests/base_tests/fourier_update_kernel_test.py b/test/accelerate_tests/base_tests/fourier_update_kernel_test.py
index 00ad5e3d3..dd2d3a7a6 100644
--- a/test/accelerate_tests/base_tests/fourier_update_kernel_test.py
+++ b/test/accelerate_tests/base_tests/fourier_update_kernel_test.py
@@ -196,6 +196,128 @@ def test_fourier_error(self):
                                       err_msg="ferr does not give the expected error "
                                               "for the fourier_update_kernel.fourier_error emthods")
 
+
+    def test_fourier_error_aux_is_intensity(self):
+        '''
+        setup
+        '''
+        B = 5  # frame size y
+        C = 5  # frame size x
+
+        D = 2  # number of probe modes
+        G = 2 # number og object modes
+
+        E = B  # probe size y
+        F = C  # probe size x
+
+        scan_pts = 2  # one dimensional scan point number
+
+        N = scan_pts ** 2
+        total_number_modes = G * D
+        A = N * total_number_modes  # this is a 16 point scan pattern (4x4 grid) over all the modes
+
+        f = np.empty(shape=(A, B, C), dtype=COMPLEX_TYPE)
+        for idx in range(A):
+            f[idx] = np.ones((B, C)) * (idx + 1) + 1j * np.ones((B, C)) * (idx + 1)
+
+        fmag = np.empty(shape=(N, B, C), dtype=FLOAT_TYPE)  # the measured magnitudes NxAxB
+        fmag_fill = np.arange(np.prod(fmag.shape)).reshape(fmag.shape).astype(fmag.dtype)
+        fmag[:] = fmag_fill
+
+        mask = np.empty(shape=(N, B, C), dtype=FLOAT_TYPE)# the masks for the measured magnitudes either 1xAxB or NxAxB
+        mask_fill = np.ones_like(mask)
+        mask_fill[::2, ::2] = 0 # checkerboard for testing
+        mask[:] = mask_fill
+
+        X, Y = np.meshgrid(range(scan_pts), range(scan_pts))
+        X = X.reshape((N,))
+        Y = Y.reshape((N,))
+
+        addr = np.zeros((N, total_number_modes, 5, 3))
+
+        exit_idx = 0
+        position_idx = 0
+        for xpos, ypos in zip(X, Y):
+            mode_idx = 0
+            for pr_mode in range(D):
+                for ob_mode in range(G):
+                    addr[position_idx, mode_idx] = np.array([[pr_mode, 0, 0],
+                                                             [ob_mode, ypos, xpos],
+                                                             [exit_idx, 0, 0],
+                                                             [position_idx, 0, 0],
+                                                             [position_idx, 0, 0]])
+                    mode_idx += 1
+                    exit_idx += 1
+            position_idx += 1
+
+
+        mask_sum = mask.sum(-1).sum(-1)
+
+        err_fmag = np.zeros(N, dtype=FLOAT_TYPE)
+        pbound_set = 0.9
+        FUK = FourierUpdateKernel(f, nmodes=total_number_modes)
+        FUK.allocate()
+        i = np.abs(f)**2
+        FUK.fourier_error(i, addr, fmag, mask, mask_sum, aux_is_intensity=True)
+
+
+        expected_fdev = np.array([[[7.7459664,   6.7459664,   5.7459664,   4.7459664,   3.7459664],
+                                   [2.7459664,   1.7459664,   0.74596643,  -0.25403357,  -1.2540336],
+                                   [-2.2540336,  -3.2540336,  -4.2540336,  -5.2540336,  -6.2540336],
+                                   [-7.2540336,  -8.254034,  -9.254034, -10.254034, -11.254034],
+                                   [-12.254034, -13.254034, -14.254034, -15.254034, -16.254034]],
+
+                                  [[-6.3452415,  -7.3452415,  -8.345242,  -9.345242, -10.345242],
+                                   [-11.345242, -12.345242, -13.345242, -14.345242, -15.345242],
+                                   [-16.345242, -17.345242, -18.345242, -19.345242, -20.345242],
+                                   [-21.345242, -22.345242, -23.345242, -24.345242, -25.345242],
+                                   [-26.345242, -27.345242, -28.345242, -29.345242, -30.345242]],
+
+                                  [[-20.13363, -21.13363, -22.13363, -23.13363, -24.13363],
+                                   [-25.13363, -26.13363, -27.13363, -28.13363, -29.13363],
+                                   [-30.13363, -31.13363, -32.13363, -33.13363, -34.13363],
+                                   [-35.13363, -36.13363, -37.13363, -38.13363, -39.13363],
+                                   [-40.13363, -41.13363, -42.13363, -43.13363, -44.13363]],
+
+                                  [[-33.866074, -34.866074, -35.866074, -36.866074, -37.866074],
+                                   [-38.866074, -39.866074, -40.866074, -41.866074, -42.866074],
+                                   [-43.866074, -44.866074, -45.866074, -46.866074, -47.866074],
+                                   [-48.866074, -49.866074, -50.866074, -51.866074, -52.866074],
+                                   [-53.866074, -54.866074, -55.866074, -56.866074, -57.866074]]],
+                                 dtype=FLOAT_TYPE)
+        np.testing.assert_array_equal(FUK.npy.fdev, expected_fdev,
+                                      err_msg="fdev does not give the expected error "
+                                              "for the fourier_update_kernel.fourier_error emthods")
+
+        expected_ferr = np.array([[[0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
+                                   [7.54033208e-01, 3.04839879e-01, 5.56465909e-02, 6.45330548e-03, 1.57260016e-01],
+                                   [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
+                                   [5.26210022e+00, 6.81290817e+00, 8.56371498e+00, 1.05145216e+01, 1.26653280e+01],
+                                   [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00]],
+
+                                  [[1.61048353e+00, 2.15810299e+00, 2.78572226e+00, 3.49334168e+00, 4.28096104e+00],
+                                   [5.14858055e+00, 6.09619951e+00, 7.12381887e+00, 8.23143768e+00, 9.41905785e+00],
+                                   [1.06866770e+01, 1.20342960e+01, 1.34619150e+01, 1.49695349e+01, 1.65571537e+01],
+                                   [1.82247734e+01, 1.99723930e+01, 2.18000126e+01, 2.37076321e+01, 2.56952515e+01],
+                                   [2.77628708e+01, 2.99104881e+01, 3.21381073e+01, 3.44457283e+01, 3.68333473e+01]],
+
+                                  [[0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
+                                   [6.31699409e+01, 6.82966690e+01, 7.36233978e+01, 7.91501160e+01, 8.48768463e+01],
+                                   [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
+                                   [1.23437180e+02, 1.30563919e+02, 1.37890640e+02, 1.45417374e+02, 1.53144089e+02],
+                                   [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00]],
+
+                                  [[4.58764343e+01, 4.86257210e+01, 5.14550095e+01, 5.43642960e+01, 5.73535805e+01],
+                                   [6.04228668e+01, 6.35721550e+01, 6.68014374e+01, 7.01107254e+01, 7.35000076e+01],
+                                   [7.69692993e+01, 8.05185852e+01, 8.41478729e+01, 8.78571548e+01, 9.16464386e+01],
+                                   [9.55157242e+01, 9.94650116e+01, 1.03494293e+02, 1.07603584e+02, 1.11792870e+02],
+                                   [1.16062157e+02, 1.20411446e+02, 1.24840721e+02, 1.29350006e+02, 1.33939301e+02]]],
+                                 dtype=FLOAT_TYPE)
+        np.testing.assert_array_equal(FUK.npy.ferr, expected_ferr,
+                                      err_msg="ferr does not give the expected error "
+                                              "for the fourier_update_kernel.fourier_error emthods")
+
+
     def test_error_reduce(self):
         # array from the previous test
         ferr = np.array([[[0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
diff --git a/test/accelerate_tests/cuda_pycuda_tests/array_utils_test.py b/test/accelerate_tests/cuda_pycuda_tests/array_utils_test.py
index 23950af26..411fd59d4 100644
--- a/test/accelerate_tests/cuda_pycuda_tests/array_utils_test.py
+++ b/test/accelerate_tests/cuda_pycuda_tests/array_utils_test.py
@@ -355,6 +355,119 @@ def test_crop_pad_simple_oblike_UNITY(self):
         # Assert
         np.testing.assert_allclose(A, A_dev.get(), rtol=1e-6, atol=1e-6)
 
+    def test_downsample_factor_2(self):
+        # downsample, complex, 3D
+        B = np.indices((4,4), dtype=np.complex64)
+        B = np.indices((4,4), dtype=np.complex64) + 1j * B[::-1, :, :]
+        A = np.zeros((2,2,2), dtype=B.dtype)
+        
+        A_dev = gpuarray.to_gpu(A)
+        B_dev = gpuarray.to_gpu(B)
+        
+        k = gau.ResampleKernel(queue=self.stream)
+        k.resample(A_dev, B_dev)
+        exp_A = np.array([[[ 2. +2.j,  2.+10.j],
+                           [10. +2.j, 10.+10.j]],
+                          [[ 2. +2.j, 10.+ 2.j],
+                           [ 2.+10.j, 10.+10.j]]], dtype=np.complex64)
+
+        np.testing.assert_almost_equal(A_dev.get().sum(), B_dev.get().sum())
+        np.testing.assert_array_almost_equal(A_dev.get(), exp_A)
+
+    def test_upsample_factor_2(self):
+        # upsample, complex, 3D
+        B = np.indices((2,2), dtype=np.complex64)
+        B = np.indices((2,2), dtype=np.complex64) + 1j * B[::-1, :, :]
+        A = np.zeros((2,4,4), dtype=B.dtype)
+        A_dev = gpuarray.to_gpu(A)
+        B_dev = gpuarray.to_gpu(B)
+
+        k = gau.ResampleKernel(queue=self.stream)
+        k.resample(A_dev, B_dev)
+        exp_A = np.array([[[0.  +0.j  , 0.  +0.j  , 0.  +0.25j, 0.  +0.25j],
+                           [0.  +0.j  , 0.  +0.j  , 0.  +0.25j, 0.  +0.25j],
+                           [0.25+0.j  , 0.25+0.j  , 0.25+0.25j, 0.25+0.25j],
+                           [0.25+0.j  , 0.25+0.j  , 0.25+0.25j, 0.25+0.25j]],
+                          [[0.  +0.j  , 0.  +0.j  , 0.25+0.j  , 0.25+0.j  ],
+                           [0.  +0.j  , 0.  +0.j  , 0.25+0.j  , 0.25+0.j  ],
+                           [0.  +0.25j, 0.  +0.25j, 0.25+0.25j, 0.25+0.25j],
+                           [0.  +0.25j, 0.  +0.25j, 0.25+0.25j, 0.25+0.25j]]], dtype=np.complex64)
+
+        np.testing.assert_almost_equal(A_dev.get().sum(), B_dev.get().sum())
+        np.testing.assert_array_almost_equal(A_dev.get(), exp_A)
+
+    def test_downsample_factor_4(self):
+        # downsample, complex, 3D
+        B = np.indices((8,8), dtype=np.complex64)
+        B = np.indices((8,8), dtype=np.complex64) + 1j * B[::-1, :, :]
+        A = np.zeros((2,2,2), dtype=B.dtype)
+        
+        A_dev = gpuarray.to_gpu(A)
+        B_dev = gpuarray.to_gpu(B)
+        
+        k = gau.ResampleKernel(queue=self.stream)
+        k.resample(A_dev, B_dev)
+        exp_A = np.array([[[24.+24.j, 24.+88.j],
+                           [88.+24.j, 88.+88.j]],
+                          [[24.+24.j, 88.+24.j],
+                           [24.+88.j, 88.+88.j]]], dtype=np.complex64)
+                           
+        np.testing.assert_almost_equal(A_dev.get().sum(), B_dev.get().sum())
+        np.testing.assert_array_almost_equal(A_dev.get(), exp_A)
+
+    def test_upsample_factor_4(self):
+        # upsample, complex, 3D
+        Bshape = (2,4,4)
+        B = np.reshape(np.arange(0, np.prod(Bshape)), Bshape).astype(np.complex64) * 16
+        B = B + 1j * B
+        A = np.zeros((Bshape[0], Bshape[1]*4, Bshape[2]*4), dtype=B.dtype)
+        A_dev = gpuarray.to_gpu(A)
+        B_dev = gpuarray.to_gpu(B)
+
+        k = gau.ResampleKernel(queue=self.stream)
+        k.resample(A_dev, B_dev)
+        exp_A = np.zeros_like(A)
+
+        # building the expected value element-wise, to ensure correctness
+        for z in range(A.shape[0]):
+            for y in range(A.shape[1]):
+                for x in range(A.shape[2]):
+                    exp_A[z, y, x] = B[z, y//4, x//4] / 16
+
+        np.testing.assert_almost_equal(A_dev.get().sum(), B_dev.get().sum())
+        np.testing.assert_array_almost_equal(A_dev.get(), exp_A)
+
+    def test_upsample_factor_4_UNITY(self):
+        np.random.seed(1983)
+        
+        Bshape = (10,64,64)
+        B = np.reshape(np.arange(0, np.prod(Bshape)), Bshape).astype(np.complex64) * 16
+        B = B + 1j * B
+        A = np.zeros((Bshape[0], Bshape[1]*4, Bshape[2]*4), dtype=B.dtype)
+        B_dev = gpuarray.to_gpu(B)
+        A_dev = gpuarray.to_gpu(A)
+
+        k = gau.ResampleKernel(queue=self.stream)
+        k.resample(A_dev, B_dev)
+        au.resample(A, B)
+
+        np.testing.assert_allclose(A_dev.get(), A, rtol=1e-6, atol=1e-6, verbose=False)
+
+
+    def test_downsample_factor_4_UNITY(self):
+        np.random.seed(1983)
+        
+        A = np.zeros((10, 64, 64), dtype=np.float32)
+        B = np.random.rand(10, 256, 256).astype(np.float32)
+        B_dev = gpuarray.to_gpu(B)
+        A_dev = gpuarray.to_gpu(A)
+
+        k = gau.ResampleKernel(queue=self.stream)
+        k.resample(A_dev, B_dev)
+        au.resample(A, B)
+
+        np.testing.assert_allclose(A_dev.get(), A, rtol=1e-6, atol=1e-6, verbose=False)
+
     def test_max_abs2_complex_UNITY(self):
         np.random.seed(1983)
         X = (np.random.randint(-1000, 1000, (3,100,200)).astype(np.float32) + \
diff --git a/test/accelerate_tests/cuda_pycuda_tests/fourier_update_kernel_test.py b/test/accelerate_tests/cuda_pycuda_tests/fourier_update_kernel_test.py
index 3d7cb5fa6..e00075f5b 100644
--- a/test/accelerate_tests/cuda_pycuda_tests/fourier_update_kernel_test.py
+++ b/test/accelerate_tests/cuda_pycuda_tests/fourier_update_kernel_test.py
@@ -19,7 +19,7 @@
 class FourierUpdateKernelTest(PyCudaTest):
 
 
-    def test_fmag_all_update_UNITY(self):
+    def fmag_update_all_UNITY_tester(self, mult):
         '''
         setup
         '''
@@ -92,6 +92,9 @@ def test_fmag_all_update_UNITY(self):
         nFUK.fourier_error(f, addr, fmag, mask, mask_sum)
         nFUK.error_reduce(addr, err_fmag)
         # print(np.sqrt(pbound_set/err_fmag))
+        if not mult:
+            f = np.real(f)
+
         f_d = gpuarray.to_gpu(f)
         fmag_d = gpuarray.to_gpu(fmag)
         mask_d = gpuarray.to_gpu(mask)
@@ -103,10 +106,11 @@ def test_fmag_all_update_UNITY(self):
         FUK.gpu.fdev = gpuarray.to_gpu(nFUK.npy.fdev)
         FUK.gpu.ferr = gpuarray.to_gpu(nFUK.npy.ferr)
 
-        FUK.fmag_all_update(f_d, addr_d, fmag_d, mask_d, err_fmag_d, pbound=pbound_set)
+
+        FUK.fmag_all_update(f_d, addr_d, fmag_d, mask_d, err_fmag_d, pbound=pbound_set, mult=mult)
 
 
-        nFUK.fmag_all_update(f, addr, fmag, mask, err_fmag, pbound=pbound_set)
+        nFUK.fmag_all_update(f, addr, fmag, mask, err_fmag, pbound=pbound_set, mult=mult)
         expected_f = f
         measured_f = f_d.get()
         np.testing.assert_allclose(expected_f, measured_f, rtol=1e-6, err_msg="Numpy f "
@@ -114,7 +118,13 @@ def test_fmag_all_update_UNITY(self):
                                                                                                                                repr(measured_f),
                                                                                                                                repr(mask)))
 
-    def test_fmag_update_nopbound_UNITY(self):
+    def test_fmag_update_all_UNITY(self):
+        self.fmag_update_all_UNITY_tester(True)
+
+    def test_fmag_update_all_nomult_UNITY(self):
+        self.fmag_update_all_UNITY_tester(False)
+
+    def fmag_update_nopbound_UNITY_tester(self, mult):
         '''
         setup
         '''
@@ -186,6 +196,8 @@ def test_fmag_update_nopbound_UNITY(self):
         nFUK.fourier_error(f, addr, fmag, mask, mask_sum)
         nFUK.error_reduce(addr, err_fmag)
         # print(np.sqrt(pbound_set/err_fmag))
+        if not mult:
+            f = np.abs(f)**2
         f_d = gpuarray.to_gpu(f)
         fmag_d = gpuarray.to_gpu(fmag)
         mask_d = gpuarray.to_gpu(mask)
@@ -196,8 +208,8 @@ def test_fmag_update_nopbound_UNITY(self):
         FUK.gpu.fdev = gpuarray.to_gpu(nFUK.npy.fdev)
         FUK.gpu.ferr = gpuarray.to_gpu(nFUK.npy.ferr)
 
-        FUK.fmag_update_nopbound(f_d, addr_d, fmag_d, mask_d)
-        nFUK.fmag_update_nopbound(f, addr, fmag, mask)
+        FUK.fmag_update_nopbound(f_d, addr_d, fmag_d, mask_d, mult=mult)
+        nFUK.fmag_update_nopbound(f, addr, fmag, mask, mult=mult)
 
         expected_f = f
         measured_f = f_d.get()
@@ -205,9 +217,13 @@ def test_fmag_update_nopbound_UNITY(self):
                                                                       "is \n%s, \nbut gpu f is \n %s, \n mask is:\n %s \n" %  (repr(expected_f),
                                                                                                                                repr(measured_f),
                                                                                                                                repr(mask)))
+    def test_fmag_update_nopbound_UNITY(self):
+        self.fmag_update_nopbound_UNITY_tester(True)
 
+    def test_fmag_update_nopbound_nomult_UNITY(self):
+        self.fmag_update_nopbound_UNITY_tester(False)
 
-    def test_fourier_error_UNITY(self):
+    def fourier_error_UNITY_tester(self, aux_is_int):
         '''
         setup
         '''
@@ -266,6 +282,9 @@ def test_fourier_error_UNITY(self):
         '''
         mask_sum = mask.sum(-1).sum(-1)
 
+        if aux_is_int:
+            f = np.abs(f)**2
+
         from ptypy.accelerate.base.kernels import FourierUpdateKernel as npFourierUpdateKernel
         f_d = gpuarray.to_gpu(f)
         fmag_d = gpuarray.to_gpu(fmag)
@@ -279,8 +298,8 @@ def test_fourier_error_UNITY(self):
         nFUK.allocate()
         FUK.allocate()
 
-        nFUK.fourier_error(f, addr, fmag, mask, mask_sum)
-        FUK.fourier_error(f_d, addr_d, fmag_d, mask_d, mask_sum_d)
+        nFUK.fourier_error(f, addr, fmag, mask, mask_sum, aux_is_intensity=aux_is_int)
+        FUK.fourier_error(f_d, addr_d, fmag_d, mask_d, mask_sum_d, aux_is_intensity=aux_is_int)
 
         expected_fdev = nFUK.npy.fdev
         measured_fdev = FUK.gpu.fdev.get()
@@ -296,7 +315,14 @@ def test_fourier_error_UNITY(self):
                                                                             "is \n%s, \nbut gpu ferr is \n %s, \n " % (
                                                                             repr(expected_ferr),
                                                                             repr(measured_ferr)))
-    def test_fourier_deviation_UNITY(self):
+
+    def test_fourier_error_UNITY(self):
+        self.fourier_error_UNITY_tester(False)
+
+    def test_fourier_error_aux_is_intensity_UNITY(self):
+        self.fourier_error_UNITY_tester(True)
+
+    def fourier_deviation_UNITY_tester(self, aux_is_int):
         '''
         setup - using the fourier_error as reference, so we need mask, etc.
         '''
@@ -355,6 +381,9 @@ def test_fourier_deviation_UNITY(self):
         '''
         mask_sum = mask.sum(-1).sum(-1)
 
+        if aux_is_int:
+            f = np.abs(f)**2
+
         from ptypy.accelerate.base.kernels import FourierUpdateKernel as npFourierUpdateKernel
         f_d = gpuarray.to_gpu(f)
         fmag_d = gpuarray.to_gpu(fmag)
@@ -366,8 +395,8 @@ def test_fourier_deviation_UNITY(self):
         nFUK.allocate()
         FUK.allocate()
 
-        nFUK.fourier_deviation(f, addr, fmag)
-        FUK.fourier_deviation(f_d, addr_d, fmag_d)
+        nFUK.fourier_deviation(f, addr, fmag, aux_is_intensity=aux_is_int)
+        FUK.fourier_deviation(f_d, addr_d, fmag_d, aux_is_intensity=aux_is_int)
 
         expected_fdev = nFUK.npy.fdev
         measured_fdev = FUK.gpu.fdev.get()
@@ -376,7 +405,11 @@ def test_fourier_deviation_UNITY(self):
                                                                             repr(expected_fdev),
                                                                             repr(measured_fdev)))
 
+    def test_fourier_deviation_UNITY(self):
+        self.fourier_deviation_UNITY_tester(False)
 
+    def test_fourier_deviation_aux_is_intensity_UNITY(self):
+        self.fourier_deviation_UNITY_tester(True)
 
     def test_error_reduce_UNITY(self):
         '''
@@ -522,7 +555,7 @@ def test_error_reduce(self):
                                                                    "is not behaving as expected.")
 
 
-    def log_likelihood_UNITY_tester(self, use_version2=False):
+    def log_likelihood_UNITY_tester(self, use_version2=False, aux_is_int=False):
         '''
         setup
         '''
@@ -579,6 +612,10 @@ def log_likelihood_UNITY_tester(self, use_version2=False):
         '''
         test
         '''
+
+        if aux_is_int:
+            f = np.abs(f)**2
+
         mask_sum = mask.sum(-1).sum(-1)
         LLerr = np.zeros_like(mask_sum, dtype=np.float32)
         f_d = gpuarray.to_gpu(f)
@@ -590,14 +627,14 @@ def log_likelihood_UNITY_tester(self, use_version2=False):
         from ptypy.accelerate.base.kernels import FourierUpdateKernel as npFourierUpdateKernel
         nFUK = npFourierUpdateKernel(f, nmodes=total_number_modes)
         nFUK.allocate()
-        nFUK.log_likelihood(f, addr, fmag, mask, LLerr)
+        nFUK.log_likelihood(f, addr, fmag, mask, LLerr, aux_is_intensity=aux_is_int)
 
         FUK = FourierUpdateKernel(f, nmodes=total_number_modes)
         FUK.allocate()
         if use_version2:
-            FUK.log_likelihood2(f_d, addr_d, fmag_d, mask_d, LLerr_d)
+            FUK.log_likelihood2(f_d, addr_d, fmag_d, mask_d, LLerr_d, aux_is_intensity=aux_is_int)
         else:
-            FUK.log_likelihood(f_d, addr_d, fmag_d, mask_d, LLerr_d)
+            FUK.log_likelihood(f_d, addr_d, fmag_d, mask_d, LLerr_d, aux_is_intensity=aux_is_int)
 
         expected_err_phot = LLerr
         measured_err_phot = LLerr_d.get()
@@ -607,12 +644,18 @@ def log_likelihood_UNITY_tester(self, use_version2=False):
                                                                                  repr(expected_err_phot),
                                                                                  repr(measured_err_phot)), rtol=1e-5)
     def test_log_likelihood_UNITY(self):
-        self.log_likelihood_UNITY_tester(False)
+        self.log_likelihood_UNITY_tester(False, False)
 
     def test_log_likelihood2_UNITY(self):
-        self.log_likelihood_UNITY_tester(True)
+        self.log_likelihood_UNITY_tester(True, False)
 
-    def test_exit_error_UNITY(self):
+    def test_log_likelihood_aux_is_intensity_UNITY(self):
+        self.log_likelihood_UNITY_tester(False, True)
+
+    def test_log_likelihood2_aux_is_intensity_UNITY(self):
+        self.log_likelihood_UNITY_tester(True, True)
+
+    def exit_error_UNITY_tester(self, aux_is_int):
         '''
         setup
         '''
@@ -659,6 +702,10 @@ def test_exit_error_UNITY(self):
         '''
         test
         '''
+        
+        if aux_is_int:
+            aux = np.abs(aux)**2
+
         from ptypy.accelerate.base.kernels import FourierUpdateKernel as npFourierUpdateKernel
         aux_d = gpuarray.to_gpu(aux)
         addr_d = gpuarray.to_gpu(addr)
@@ -669,8 +716,8 @@ def test_exit_error_UNITY(self):
         nFUK.allocate()
         FUK.allocate()
 
-        nFUK.exit_error(aux, addr, )
-        FUK.exit_error(aux_d, addr_d)
+        nFUK.exit_error(aux, addr, aux_is_intensity=aux_is_int)
+        FUK.exit_error(aux_d, addr_d, aux_is_intensity=aux_is_int)
 
         expected_ferr = nFUK.npy.ferr
         measured_ferr = FUK.gpu.ferr.get()
@@ -681,5 +728,11 @@ def test_exit_error_UNITY(self):
                                                                             repr(measured_ferr)), rtol=1e-7)
 
 
+    def test_exit_error_UNITY(self):
+        self.exit_error_UNITY_tester(False)
+
+    def test_exit_error_aux_is_intensity_UNITY(self):
+        self.exit_error_UNITY_tester(True)
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/test/core_tests/geometry_test.py b/test/core_tests/geometry_test.py
index 84e4c868d..67d26638c 100644
--- a/test/core_tests/geometry_test.py
+++ b/test/core_tests/geometry_test.py
@@ -58,6 +58,38 @@ def test_geometry_nearfield_resolution(self):
         G = self.set_up_nearfield()
         assert (np.round(G.resolution*1e7) == [1.00, 1.00]).all(), "geometry resolution incorrect for the nearfield"
 
+    def test_downsample(self):
+        G = self.set_up_farfield()
+        G.resample = 2
+
+        B = np.indices((4,4), dtype=np.complex64)[0] + 1j * np.indices((4,4), dtype=np.complex64)[1]
+        A = G.downsample(B)
+        exp_A = np.array([[ 2. +2.j,  2.+10.j],
+                          [10. +2.j, 10.+10.j]], dtype=np.complex64)
+        np.testing.assert_almost_equal(A.sum(), B.sum())            
+        np.testing.assert_array_almost_equal(A, exp_A)
+
+    def test_upsample(self):
+        G = self.set_up_farfield()
+        G.resample = 2
+
+        B = np.indices((2,2), dtype=np.complex64)[0] + 1j * np.indices((2,2), dtype=np.complex64)[1]
+        A = G.upsample(B)
+        exp_A = np.array([[0.  +0.j  , 0.  +0.j  , 0.  +0.25j, 0.  +0.25j],
+                          [0.  +0.j  , 0.  +0.j  , 0.  +0.25j, 0.  +0.25j],
+                          [0.25+0.j  , 0.25+0.j  , 0.25+0.25j, 0.25+0.25j],
+                          [0.25+0.j  , 0.25+0.j  , 0.25+0.25j, 0.25+0.25j]], dtype=np.complex64)
+        np.testing.assert_almost_equal(A.sum(), B.sum())            
+        np.testing.assert_array_almost_equal(A, exp_A)
+
+    def test_downsample_upsample(self):
+        G = self.set_up_farfield()
+        G.resample = 2
+
+        A = np.random.random((4,4))
+        B = G.downsample(G.upsample(A))
+        np.testing.assert_almost_equal(A.sum(), B.sum())            
+        np.testing.assert_array_almost_equal(A, B)
 
 if __name__ == '__main__':
     unittest.main()