diff --git a/archive/cuda_extension/extensions.py b/archive/cuda_extension/extensions.py new file mode 100644 index 000000000..f0dfc0010 --- /dev/null +++ b/archive/cuda_extension/extensions.py @@ -0,0 +1,102 @@ +''' +These are the optional extensions for ptypy +''' + + +from distutils.version import LooseVersion +from distutils.extension import Extension +import os +import multiprocessing +import subprocess +import re +import numpy as np + + +# this is a hacky version, but is the desired behaviour +class AccelerationExtension(object): + def __init__(self, debug=False): + self.debug = debug + self._options = None + + def get_full_options(self): + return self._options + + def get_reflection_options(self): + user_options = [] + boolean_options = [] + for name, description in self._options.items(): + if isinstance(description['default'], str): + user_options.append((name+'=', None, description['doc'])) + elif isinstance(description['default'], bool): + user_options.append((name, None, description['doc'])) + boolean_options.append(name) + else: + raise NotImplementedError("Don't know what to do with parameter: %s of type: %s" % (name, type(description['default']))) + return user_options, boolean_options + + def build(self, options): + raise NotImplementedError('You need to implement the build method!') + + def getExtension(self): + raise NotImplementedError('You need to return a cython extension object.') + + +class CudaExtension(AccelerationExtension): # probably going to inherit from something. + def __init__(self, *args, **kwargs): + super(CudaExtension, self).__init__(*args, **kwargs) + self._options = {'cudadir': {'default': '', + 'doc': 'CUDA directory'}, + 'cudaflags': {'default': '-gencode arch=compute_35,\\"code=sm_35\\" ' + + '-gencode arch=compute_37,\\"code=sm_37\\" ' + + '-gencode arch=compute_52,\\"code=sm_52\\" ' + + '-gencode arch=compute_60,\\"code=sm_60\\" ' + + '-gencode arch=compute_70,\\"code=sm_70\\" ', + 'doc': 'Flags to the CUDA compiler'}, + 'gputiming': {'default': False, + 'doc': 'Do GPU timing'}} + + def build(self, options): + cudadir = options['cudadir'] + cudaflags = options['cudaflags'] + gputiming = options['gputiming'] + try: + out = subprocess.check_output(['cmake', '--version']) + except OSError: + raise RuntimeError( + "CMake must be installed to build the CUDA extensions.") + + cmake_version = LooseVersion(re.search(r'version\s*([\d.]+)', + out.decode()).group(1)) + if cmake_version < '3.8.0': + raise RuntimeError("CMake >= 3.8.0 is required") + + srcdir = os.path.abspath('cuda') + buildtmp = os.path.abspath(os.path.join('build', 'cuda')) + cmake_args = [ + "-DCMAKE_BUILD_TYPE=" + ("Debug" if self.debug else "Release"), + '-DCMAKE_CUDA_FLAGS={}'.format(cudaflags), + '-DGPU_TIMING={}'.format("ON" if gputiming else "OFF") + ] + if cudadir: + cmake_args.append('-DCMAKE_CUDA_COMPILER="{}/bin/nvcc"'.format(cudadir)) + build_args = ["--config", "Debug" if self.debug else "Release", "--", "-j{}".format(multiprocessing.cpu_count() + 1)] + if not os.path.exists(buildtmp): + os.makedirs(buildtmp) + env = os.environ.copy() + subprocess.check_call(['cmake', srcdir] + cmake_args, + cwd=buildtmp, env=env) + subprocess.check_call(['cmake', '--build', '.'] + build_args, + cwd=buildtmp) + print("Complete.") + + def getExtension(self): + libdirs = ['build/cuda'] + if 'LD_LIBRARY_PATH' in os.environ: + libdirs += os.environ['LD_LIBRARY_PATH'].split(':') + return Extension('*', + sources=['ptypy/accelerate/cuda/gpu_extension.pyx'], + include_dirs=[np.get_include()], +
libraries=['gpu_extension', 'cudart', 'cufft'], + library_dirs=libdirs, + depends=['build/cuda/libgpu_extension.a', ], + language="c++") diff --git a/archive/misc/mpitest.cpp b/archive/misc/mpitest.cpp new file mode 100644 index 000000000..e4ff84577 --- /dev/null +++ b/archive/misc/mpitest.cpp @@ -0,0 +1,47 @@ +/** This is a simple C++ test to check if cuda-aware MPI works as + * expected. + * It allocates a GPU array and puts 1s into it, then sends it + * across MPI to the receiving rank, which transfers back to + * host and outputs the values. + * The expected output is: + * + * Received 1, 1 + * + * Compile with: + * mpic++ -o test mpitest.cpp -L/path/to/cuda/libs -lcudart + * + * Run with: + * mpirun -np 2 test + */ + +#include <mpi.h> +#include <cuda_runtime.h> +#include <iostream> + +int main(int argc, char** argv) +{ + MPI_Init(&argc, &argv); + + int rank; + MPI_Status status; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + + if (rank == 0) { + int* d_send; + cudaMalloc((void**)&d_send, 2*sizeof(int)); + int h_send[] = {1, 1}; + cudaMemcpy(d_send, h_send, 2*sizeof(int), cudaMemcpyHostToDevice); + MPI_Send(d_send, 2, MPI_INT, 1, 99, MPI_COMM_WORLD); + std::cout << "Data has been sent...\n"; + } else if (rank == 1) { + int* d_recv; + cudaMalloc((void**)&d_recv, 2*sizeof(int)); + MPI_Recv(d_recv, 2, MPI_INT, 0, 99, MPI_COMM_WORLD, &status); + int h_recv[2]; + cudaMemcpy(h_recv, d_recv, 2*sizeof(int), cudaMemcpyDeviceToHost); + std::cout << "Received " << h_recv[0] << ", " << h_recv[1] << "\n"; + } + + MPI_Finalize(); +} \ No newline at end of file diff --git a/benchmark/diamond_benchmarks/ML_accurracy_test.py b/benchmark/diamond_benchmarks/ML_accurracy_test.py new file mode 100644 index 000000000..a8da654ac --- /dev/null +++ b/benchmark/diamond_benchmarks/ML_accurracy_test.py @@ -0,0 +1,404 @@ +''' +Load real data and prepare an accuracy report of GPU vs numpy +''' + +import h5py +import numpy as np +import csv + +import pycuda.driver as cuda +from pycuda import gpuarray + +from ptypy.accelerate.cuda_pycuda.kernels import GradientDescentKernel +from ptypy.accelerate.base.kernels import GradientDescentKernel as BaseGradientDescentKernel + + +class GradientDescentAccuracyTester: + + datadir = "/dls/science/users/iat69393/gpu-hackathon/test-data-%s/" + rtol = 1e-6 + atol = 1e-6 + headings = ['Kernel', 'Version', 'Iter', 'MATH_TYPE', 'IN/OUT_TYPE', + 'ACC_TYPE', 'Array', 'num_elements', 'num_errors', 'max_relerr', 'max_abserr'] + + def __init__(self): + import sys + np.set_printoptions(threshold=sys.maxsize, linewidth=np.inf) + cuda.init() + self.device = cuda.Device(0) + self.ctx = self.device.make_context() + self.stream = cuda.Stream() + self.results = [] + + def __del__(self): + np.set_printoptions() + self.ctx.pop() + self.ctx.detach() + + def test_make_model(self, name, iter, + math_type={'float', 'double'}, + data_type={'float', 'double'}): + + res = [] + + # Load data + with h5py.File(self.datadir % name + "make_model_%04d.h5" % iter, "r") as f: + aux = f["aux"][:] + addr = f["addr"][:] + + # CPU Kernel + BGDK = BaseGradientDescentKernel(aux, addr.shape[1]) + BGDK.allocate() + BGDK.make_model(aux, addr) + ref = BGDK.npy.Imodel + + # GPU variants + addr_dev = gpuarray.to_gpu(addr) + for d in data_type: + if d == 'float': + aux_dev = gpuarray.to_gpu(aux.astype(np.complex64)) + else: + aux_dev = gpuarray.to_gpu(aux.astype(np.complex128)) + for m in math_type: + # data type will be determined based on aux_dev data type automatically + GDK = GradientDescentKernel( + aux_dev, addr.shape[1], queue=self.stream, math_type=m) +
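# allocate() creates the GPU-side buffers (Imodel, LLerr, LLden) that the kernel calls below fill in +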
GDK.allocate() + GDK.make_model(aux_dev, addr_dev) + act = GDK.gpu.Imodel.get() + + num, num_mis, max_abs, max_rel = self._calc_diffs(act, ref) + + line = ['make_model', name, iter, d, m, 'N/A', + 'Imodel', num, num_mis, max_rel, max_abs] + print(line) + res.append(line) + + return res + + def test_floating_intensity(self, name, iter, + math_type={'float', 'double'}, + data_type={'float', 'double'}, + acc_type={'float', 'double'}): + + # note that this is actually calling 4 kernels: + # - floating_intensity_cuda_step1 + # - error_reduce_cuda (2x) + # - floating_intensity_cuda_step2 + + res = [] + + # Load data + with h5py.File(self.datadir % name + "floating_intensities_%04d.h5" % iter, "r") as f: + w = f["w"][:] + addr = f["addr"][:] + I = f["I"][:] + fic = f["fic"][:] + Imodel = f["Imodel"][:] + with h5py.File(self.datadir % name + "make_model_%04d.h5" % iter, "r") as f: + aux = f["aux"][:] + + # CPU Kernel + ficref = np.copy(fic) + Iref = np.copy(Imodel) + BGDK = BaseGradientDescentKernel(aux, addr.shape[1]) + BGDK.allocate() + BGDK.npy.Imodel = Iref + BGDK.floating_intensity(addr, w, I, ficref) # modifies fic, Imodel + Iref = BGDK.npy.Imodel + + addr_dev = gpuarray.to_gpu(addr) + for d in data_type: + for m in math_type: + for a in acc_type: + if d == 'float': + aux_dev = gpuarray.to_gpu(aux.astype(np.complex64)) + I_dev = gpuarray.to_gpu(I.astype(np.float32)) + fic_dev = gpuarray.to_gpu(fic.astype(np.float32)) + w_dev = gpuarray.to_gpu(w.astype(np.float32)) + Imodel_dev = gpuarray.to_gpu(Imodel.astype(np.float32)) + else: + aux_dev = gpuarray.to_gpu(aux.astype(np.complex128)) + I_dev = gpuarray.to_gpu(I.astype(np.float64)) + fic_dev = gpuarray.to_gpu(fic.astype(np.float64)) + w_dev = gpuarray.to_gpu(w.astype(np.float64)) + Imodel_dev = gpuarray.to_gpu(Imodel.astype(np.float64)) + + # GPU kernel + GDK = GradientDescentKernel( + aux_dev, addr.shape[1], accumulate_type=a, math_type=m, queue=self.stream) + GDK.allocate() + GDK.gpu.Imodel = Imodel_dev + GDK.floating_intensity(addr_dev, w_dev, I_dev, fic_dev) + + Iact = GDK.gpu.Imodel.get() + fact = fic_dev.get() + + num, num_mis, max_abs, max_rel = self._calc_diffs( + Iact, Iref) + line = ['floating_intensity', name, iter, d, m, + a, 'Imodel', num, num_mis, max_rel, max_abs] + print(line) + res.append(line) + + num, num_mis, max_abs, max_rel = self._calc_diffs( + fact, ficref) + line = ['floating_intensity', name, iter, d, m, + a, 'fic', num, num_mis, max_rel, max_abs] + print(line) + res.append(line) + + return res + + def test_main_and_error_reduce(self, name, iter, + math_type={'float', 'double'}, + data_type={'float', 'double'}, + acc_type={'float', 'double'}): + + res = [] + + # Load data + with h5py.File(self.datadir % name + "main_%04d.h5" % iter, "r") as f: + aux = f["aux"][:] + addr = f["addr"][:] + w = f["w"][:] + I = f["I"][:] + # Load data + with h5py.File(self.datadir % name + "error_reduce_%04d.h5" % iter, "r") as f: + err_phot = f["err_phot"][:] + + # CPU Kernel + auxref = np.copy(aux) + errref = np.copy(err_phot) + BGDK = BaseGradientDescentKernel(aux, addr.shape[1]) + BGDK.allocate() + BGDK.main(auxref, addr, w, I) + BGDK.error_reduce(addr, errref) + LLerrref = BGDK.npy.LLerr + + addr_dev = gpuarray.to_gpu(addr) + for d in data_type: + for m in math_type: + for a in acc_type: + if d == 'float': + aux_dev = gpuarray.to_gpu(aux.astype(np.complex64)) + I_dev = gpuarray.to_gpu(I.astype(np.float32)) + w_dev = gpuarray.to_gpu(w.astype(np.float32)) + err_phot_dev = gpuarray.to_gpu( + err_phot.astype(np.float32)) + else: + aux_dev 
= gpuarray.to_gpu(aux.astype(np.complex128)) + I_dev = gpuarray.to_gpu(I.astype(np.float64)) + w_dev = gpuarray.to_gpu(w.astype(np.float64)) + err_phot_dev = gpuarray.to_gpu( + err_phot.astype(np.float64)) + + # GPU kernel + GDK = GradientDescentKernel( + aux_dev, addr.shape[1], accumulate_type=a, math_type=m) + GDK.allocate() + GDK.main(aux_dev, addr_dev, w_dev, I_dev) + GDK.error_reduce(addr_dev, err_phot_dev) + + num, num_mis, max_abs, max_rel = self._calc_diffs( + auxref, aux_dev.get()) + line = ['main_and_error_reduce', name, iter, d, + m, a, 'aux', num, num_mis, max_rel, max_abs] + print(line) + res.append(line) + + num, num_mis, max_abs, max_rel = self._calc_diffs( + LLerrref, GDK.gpu.LLerr.get()) + line = ['main_and_error_reduce', name, iter, d, + m, a, 'LLerr', num, num_mis, max_rel, max_abs] + print(line) + res.append(line) + + num, num_mis, max_abs, max_rel = self._calc_diffs( + errref, err_phot_dev.get()) + line = ['main_and_error_reduce', name, iter, d, m, + a, 'err_phot', num, num_mis, max_rel, max_abs] + print(line) + res.append(line) + + return res + + def test_make_a012(self, name, iter, + math_type={'float', 'double'}, + data_type={'float', 'double'}, + acc_type={'float', 'double'}): + + # Reduce the array size to make the tests run faster + Nmax = 10 + Ymax = 128 + Xmax = 128 + + res = [] + + # Load data + with h5py.File(self.datadir % name + "make_a012_%04d.h5" % iter, "r") as g: + addr = g["addr"][:Nmax] + I = g["I"][:Nmax, :Ymax, :Xmax] + b_f = g["f"][:Nmax, :Ymax, :Xmax] + b_a = g["a"][:Nmax, :Ymax, :Xmax] + b_b = g["b"][:Nmax, :Ymax, :Xmax] + fic = g["fic"][:Nmax] + with h5py.File(self.datadir % name + "make_model_%04d.h5" % iter, "r") as h: + aux = h["aux"][:Nmax, :Ymax, :Xmax] + + # CPU Kernel + BGDK = BaseGradientDescentKernel(aux, addr.shape[1]) + BGDK.allocate() + BGDK.make_a012(b_f, b_a, b_b, addr, I, fic) + Imodelref = BGDK.npy.Imodel + LLerrref = BGDK.npy.LLerr + LLdenref = BGDK.npy.LLden + + addr_dev = gpuarray.to_gpu(addr) + for d in data_type: + for m in math_type: + for a in acc_type: + if d == 'float': + aux_dev = gpuarray.to_gpu(aux.astype(np.complex64)) + I_dev = gpuarray.to_gpu(I.astype(np.float32)) + b_f_dev = gpuarray.to_gpu(b_f.astype(np.complex64)) + b_a_dev = gpuarray.to_gpu(b_a.astype(np.complex64)) + b_b_dev = gpuarray.to_gpu(b_b.astype(np.complex64)) + fic_dev = gpuarray.to_gpu(fic.astype(np.float32)) + else: + aux_dev = gpuarray.to_gpu(aux.astype(np.complex128)) + I_dev = gpuarray.to_gpu(I.astype(np.float64)) + b_f_dev = gpuarray.to_gpu(b_f.astype(np.complex128)) + b_a_dev = gpuarray.to_gpu(b_a.astype(np.complex128)) + b_b_dev = gpuarray.to_gpu(b_b.astype(np.complex128)) + fic_dev = gpuarray.to_gpu(fic.astype(np.float64)) + + GDK = GradientDescentKernel(aux_dev, addr.shape[1], queue=self.stream, + math_type=m, accumulate_type=a) + GDK.allocate() + GDK.gpu.Imodel.fill(np.nan) + GDK.gpu.LLerr.fill(np.nan) + GDK.gpu.LLden.fill(np.nan) + GDK.make_a012(b_f_dev, b_a_dev, b_b_dev, + addr_dev, I_dev, fic_dev) + + num, num_mis, max_abs, max_rel = self._calc_diffs( + LLerrref, GDK.gpu.LLerr.get()) + line = ['make_a012', name, iter, d, m, a, + 'LLerr', num, num_mis, max_rel, max_abs] + print(line) + res.append(line) + + num, num_mis, max_abs, max_rel = self._calc_diffs( + LLdenref, GDK.gpu.LLden.get()) + line = ['make_a012', name, iter, d, m, a, + 'LLden', num, num_mis, max_rel, max_abs] + print(line) + res.append(line) + + num, num_mis, max_abs, max_rel = self._calc_diffs( + Imodelref, GDK.gpu.Imodel.get()) + line = ['make_a012', name, iter, d, m, a, 
+ 'Imodel', num, num_mis, max_rel, max_abs] + print(line) + res.append(line) + + return res + + def test_fill_b(self, name, iter, + math_type={'float', 'double'}, + data_type={'float', 'double'}, + acc_type={'float', 'double'}): + + res = [] + + # Load data + + Nmax = 10 + Ymax = 128 + Xmax = 128 + + with h5py.File(self.datadir % name + "fill_b_%04d.h5" % iter, "r") as f: + w = f["w"][:Nmax, :Ymax, :Xmax] + addr = f["addr"][:] + B = f["B"][:] + Brenorm = f["Brenorm"][...] + A0 = f["A0"][:Nmax, :Ymax, :Xmax] + A1 = f["A1"][:Nmax, :Ymax, :Xmax] + A2 = f["A2"][:Nmax, :Ymax, :Xmax] + with h5py.File(self.datadir % name + "make_model_%04d.h5" % iter, "r") as f: + aux = f["aux"][:Nmax, :Ymax, :Xmax] + + # CPU Kernel + Bref = np.copy(B) + BGDK = BaseGradientDescentKernel(aux, addr.shape[1]) + BGDK.allocate() + BGDK.npy.Imodel = A0 + BGDK.npy.LLerr = A1 + BGDK.npy.LLden = A2 + BGDK.fill_b(addr, Brenorm, w, Bref) + + addr_dev = gpuarray.to_gpu(addr) + for d in data_type: + for m in math_type: + for a in acc_type: + if d == 'float': + aux_dev = gpuarray.to_gpu(aux.astype(np.complex64)) + w_dev = gpuarray.to_gpu(w.astype(np.float32)) + B_dev = gpuarray.to_gpu(B.astype(np.float32)) + A0_dev = gpuarray.to_gpu(A0.astype(np.float32)) + A1_dev = gpuarray.to_gpu(A1.astype(np.float32)) + A2_dev = gpuarray.to_gpu(A2.astype(np.float32)) + else: + aux_dev = gpuarray.to_gpu(aux.astype(np.complex128)) + w_dev = gpuarray.to_gpu(w.astype(np.float64)) + B_dev = gpuarray.to_gpu(B.astype(np.float64)) + A0_dev = gpuarray.to_gpu(A0.astype(np.float64)) + A1_dev = gpuarray.to_gpu(A1.astype(np.float64)) + A2_dev = gpuarray.to_gpu(A2.astype(np.float64)) + + GDK = GradientDescentKernel( + aux_dev, addr.shape[1], queue=self.stream, math_type=m, accumulate_type=a) + GDK.allocate() + GDK.gpu.Imodel = A0_dev + GDK.gpu.LLerr = A1_dev + GDK.gpu.LLden = A2_dev + GDK.fill_b(addr_dev, Brenorm, w_dev, B_dev) + + num, num_mis, max_abs, max_rel = self._calc_diffs( + Bref, B_dev.get()) + line = ['fill_b', name, iter, d, m, a, + 'B', num, num_mis, max_rel, max_abs] + print(line) + res.append(line) + + return res + + def _calc_diffs(self, act, ref): + diffs = np.abs(ref - act) + max_abs = np.max(diffs[:]) + aref = np.abs(ref[:]) + max_rel = np.max( + np.divide(diffs[:], aref, out=np.zeros_like(diffs[:]), where=aref > 0)) + num_mis = np.count_nonzero(diffs[:] > self.atol + self.rtol * aref) + num = np.prod(ref.shape) + + return num, num_mis, max_abs, max_rel + + +tester = GradientDescentAccuracyTester() +print(tester.headings) + +res = [tester.headings] +for ver in [("base", 10), ("regul", 50), ("floating", 0)]: + res += tester.test_make_model(*ver) + res += tester.test_floating_intensity(*ver) + res += tester.test_main_and_error_reduce(*ver) + res += tester.test_make_a012(*ver) + res += tester.test_fill_b(*ver) + +with open('ML_accuracy_test_results.csv', 'w', newline='') as f: + writer = csv.writer(f) + writer.writerows(res) + +print('Done.') diff --git a/benchmark/diamond_benchmarks/moonflower_scripts/i14_2.py b/benchmark/diamond_benchmarks/moonflower_scripts/i14_2.py index 0c9927ea9..414b785b3 100644 --- a/benchmark/diamond_benchmarks/moonflower_scripts/i14_2.py +++ b/benchmark/diamond_benchmarks/moonflower_scripts/i14_2.py @@ -7,6 +7,8 @@ from ptypy.core import Ptycho from ptypy import utils as u import time +from ptypy.accelerate.cuda_pycuda.engines.DM_pycuda_stream import DM_pycuda_stream +from ptypy.accelerate.cuda_pycuda.engines.DM_pycuda_streams import DM_pycuda_streams import os import getpass diff --git 
a/benchmark/diamond_benchmarks/moonflower_scripts/i14_3.py b/benchmark/diamond_benchmarks/moonflower_scripts/i14_3.py new file mode 100644 index 000000000..15e1c7513 --- /dev/null +++ b/benchmark/diamond_benchmarks/moonflower_scripts/i14_3.py @@ -0,0 +1,75 @@ +""" +This script is a test for ptychographic reconstruction in the absence +of actual data. It uses the test Scan class +`ptypy.core.data.MoonFlowerScan` to provide "data". +""" + +from ptypy.core import Ptycho +from ptypy import utils as u +import time +from ptypy.accelerate.cuda_pycuda.engines.DM_pycuda_stream import DM_pycuda_stream +from ptypy.accelerate.cuda_pycuda.engines.DM_pycuda_streams import DM_pycuda_streams + +import os +import getpass +from pathlib import Path +username = getpass.getuser() +tmpdir = os.path.join('/dls/tmp', username, 'dumps', 'ptypy') +Path(tmpdir).mkdir(parents=True, exist_ok=True) + +p = u.Param() + +# for verbose output +p.verbose_level = 3 +p.frames_per_block = 100 +# set home path +p.io = u.Param() +p.io.home = tmpdir +p.io.autosave = u.Param(active=False) +p.io.autoplot = u.Param(active=False) +p.io.interaction = u.Param() +p.io.interaction.server = u.Param(active=False) + +# max 4000 frames (512x512px) of diffraction data +p.scans = u.Param() +p.scans.i14_3 = u.Param() +# now you have to specify which ScanModel to use with scans.XX.name, +# just as you have to give 'name' for engines and PtyScan subclasses. +p.scans.i14_3.name = 'BlockFull' # or 'Full' +p.scans.i14_3.data = u.Param() +p.scans.i14_3.data.name = 'MoonFlowerScan' +p.scans.i14_3.data.shape = 512 +p.scans.i14_3.data.num_frames = 4000 #50000 is the real value +p.scans.i14_3.data.save = None + +p.scans.i14_3.illumination = u.Param() +p.scans.i14_3.coherence = u.Param(num_probe_modes=10) +p.scans.i14_3.illumination.diversity = u.Param() +p.scans.i14_3.illumination.diversity.noise = (0.5, 1.0) +p.scans.i14_3.illumination.diversity.power = 0.1 + +# position distance in fraction of illumination frame +p.scans.i14_3.data.density = 0.2 +# total number of photons in empty beam +p.scans.i14_3.data.photons = 1e8 +# Gaussian FWHM of possible detector blurring +p.scans.i14_3.data.psf = 0.4 + +# attach a reconstruction engine +p.engines = u.Param() +p.engines.engine00 = u.Param() +p.engines.engine00.name = 'DM_pycuda_stream' +p.engines.engine00.numiter = 100 +p.engines.engine00.numiter_contiguous = 20 +p.engines.engine00.probe_update_start = 1 +p.engines.engine00.probe_update_cuda_atomics = False +p.engines.engine00.object_update_cuda_atomics = True + + +# prepare and run +P = Ptycho(p, level=4) +t1 = time.perf_counter() +P.run() +t2 = time.perf_counter() +P.print_stats() +print('Elapsed Compute Time: {} seconds'.format(t2-t1)) diff --git a/extensions.py b/extensions.py index f0dfc0010..c36483e09 100644 --- a/extensions.py +++ b/extensions.py @@ -1,102 +1,115 @@ ''' -These are the optional extensions for ptypy +Compilation tools for Nvidia builds of extension modules.
''' - - -from distutils.version import LooseVersion -from distutils.extension import Extension import os -import multiprocessing -import subprocess -import re -import numpy as np - +import sysconfig +import pybind11 +from distutils.unixccompiler import UnixCCompiler +from distutils.command.build_ext import build_ext -# this is a hacky version, but is the desired behaviour -class AccelerationExtension(object): - def __init__(self, debug=False): - self.debug = debug - self._options = None - def get_full_options(self): - return self._options +def find_in_path(name, path): + "Find a file in a search path" + # adapted from http://code.activestate.com/recipes/52224-find-a-file-given-a-search-path/ + for dir in path.split(os.pathsep): + binpath = os.path.join(dir, name) + if os.path.exists(binpath): + return os.path.abspath(binpath) + return None - def get_reflection_options(self): - user_options = [] - boolean_options = [] - for name, description in self._options.items(): - if isinstance(description['default'], str): - user_options.append((name+'=', None, description['doc'])) - elif isinstance(description['default'], bool): - user_options.append((name, None, description['doc'])) - boolean_options.append(name) - else: - raise NotImplementedError("Don't know what to do with parameter:%s of type: %s" % (name, type(description['default']))) - return user_options, boolean_options +def locate_cuda(): + """ + Locate the CUDA environment on the system + Returns a dict with keys 'home', 'nvcc', 'include', and 'lib64' + and values giving the absolute path to each directory. + Starts by looking for the CUDAHOME env variable. If not found, everything + is based on finding 'nvcc' in the PATH. + """ + # first check if the CUDAHOME env variable is in use + if 'CUDAHOME' in os.environ: + home = os.environ['CUDAHOME'] + nvcc = os.path.join(home, 'bin', 'nvcc') + else: + # otherwise, search the PATH for NVCC + nvcc = find_in_path('nvcc', os.environ['PATH']) + if nvcc is None: + raise EnvironmentError('The nvcc binary could not be ' + 'located in your $PATH. Either add it to your path, or set $CUDAHOME') + home = os.path.dirname(os.path.dirname(nvcc)) - def build(self, options): - raise NotImplementedError('You need to implement the build method!') + cudaconfig = {'home': home, 'nvcc': nvcc, + 'include': os.path.join(home, 'include'), + 'lib64': os.path.join(home, 'lib64')} + for k, v in cudaconfig.items(): + if not os.path.exists(v): + raise EnvironmentError('The CUDA %s path could not be located in %s' % (k, v)) + return cudaconfig - def getExtension(self): - raise NotImplementedError('You need to return cython extension object.') - - -class CudaExtension(AccelerationExtension): # probably going to inherit from something.
+class NvccCompiler(UnixCCompiler): def __init__(self, *args, **kwargs): - super(CudaExtension, self).__init__(*args, **kwargs) - self._options = {'cudadir': {'default': '', - 'doc': 'CUDA directory'}, - 'cudaflags': {'default': '-gencode arch=compute_35,\\"code=sm_35\\" ' + - '-gencode arch=compute_37,\\"code=sm_37\\" ' + - '-gencode arch=compute_52,\\"code=sm_52\\" ' + - '-gencode arch=compute_60,\\"code=sm_60\\" ' + - '-gencode arch=compute_70,\\"code=sm_70\\" ', - 'doc': 'Flags to the CUDA compiler'}, - 'gputiming': {'default': False, - 'doc': 'Do GPU timing'}} + super(NvccCompiler, self).__init__(*args, **kwargs) + self.CUDA = locate_cuda() + module_dir = os.path.join(__file__.strip('import_fft.py'), 'cuda', 'filtered_fft') + # by default, compile for all of these + archflag = '-gencode=arch=compute_50,code=sm_50' + \ + ' -gencode=arch=compute_52,code=sm_52' + \ + ' -gencode=arch=compute_60,code=sm_60' + \ + ' -gencode=arch=compute_61,code=sm_61' + \ + ' -gencode=arch=compute_70,code=sm_70' + \ + ' -gencode=arch=compute_75,code=sm_75' + \ + ' -gencode=arch=compute_75,code=compute_75' + self.src_extensions.append('.cu') + self.LD_FLAGS = [archflag, "-lcufft_static", "-lculibos", "-ldl", "-lrt", "-lpthread", "-cudart shared"] + self.NVCC_FLAGS = ["-dc", archflag] + self.CXXFLAGS = ['"-fPIC"'] + pybind_includes = [pybind11.get_include(), sysconfig.get_path('include')] + INCLUDES = pybind_includes + [self.CUDA['lib64'], module_dir] + self.INCLUDES = ["-I%s" % ix for ix in INCLUDES] + self.OPTFLAGS = ["-O3", "-std=c++14"] + + def _compile(self, obj, src, ext, cc_args, extra_postargs, pp_opts): + default_compiler_so = self.compiler_so + CPPFLAGS = self.INCLUDES + extra_postargs # little hack here, since postargs usually goes at the end, which we won't do. 
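+ # the include directories and caller-supplied postargs together form the CPPFLAGS handed to nvcc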
+ # makefile line is + # $(NVCC) $(NVCC_FLAGS) $(OPTFLAGS) -Xcompiler "$(CXXFLAGS)" $(CPPFLAGS) + compiler_command = [self.CUDA["nvcc"]] + self.NVCC_FLAGS + self.OPTFLAGS + ["-Xcompiler"] + self.CXXFLAGS + CPPFLAGS + compiler_exec = " ".join(compiler_command) + self.set_executable('compiler_so', compiler_exec) + postargs = [] # we don't actually have any postargs + super(NvccCompiler, self)._compile(obj, src, ext, cc_args, postargs, pp_opts) # the _compile method + # reset the default compiler_so, which we might have changed for cuda + self.compiler_so = default_compiler_so + + def link(self, target_desc, objects, + output_filename, output_dir=None, libraries=None, + library_dirs=None, runtime_library_dirs=None, + export_symbols=None, debug=0, extra_preargs=None, + extra_postargs=None, build_temp=None, target_lang=None): + default_linker_so = self.linker_so + # make file line is + # $(NVCC) $(OPTFLAGS) -shared $(LD_FLAGS) $(OBJ) $(OBJ_MOD) -o $@ + linker_command = [self.CUDA["nvcc"]] + self.OPTFLAGS + ["-shared"] + self.LD_FLAGS + linker_exec = " ".join(linker_command) + self.set_executable('linker_so', linker_exec) + super(NvccCompiler, self).link(target_desc, objects, + output_filename, output_dir=None, libraries=None, + library_dirs=None, runtime_library_dirs=None, + export_symbols=None, debug=0, extra_preargs=None, + extra_postargs=None, build_temp=None, target_lang=None) + self.linker_so = default_linker_so - def build(self, options): - cudadir = options['cudadir'] - cudaflags = options['cudaflags'] - gputiming = options['gputiming'] - try: - out = subprocess.check_output(['cmake', '--version']) - except OSError: - raise RuntimeError( - "CMake must be installed to build the CUDA extensions.") +class CustomBuildExt(build_ext): - cmake_version = LooseVersion(re.search(r'version\s*([\d.]+)', - out.decode()).group(1)) - if cmake_version < '3.8.0': - raise RuntimeError("CMake >= 3.8.0 is required") + def build_extension(self, ext): + has_cu = any([src.endswith('.cu') for src in ext.sources]) + if has_cu: + old_compiler = self.compiler + self.compiler = NvccCompiler(verbose=old_compiler.verbose, + dry_run=old_compiler.dry_run, + force=old_compiler.force) # this is our bespoke compiler + super(CustomBuildExt, self).build_extension(ext) + self.compiler=old_compiler + else: + super(CustomBuildExt, self).build_extension(ext) - srcdir = os.path.abspath('cuda') - buildtmp = os.path.abspath(os.path.join('build', 'cuda')) - cmake_args = [ - "-DCMAKE_BUILD_TYPE=" + ("Debug" if self.debug else "Release"), - '-DCMAKE_CUDA_FLAGS={}'.format(cudaflags), - '-DGPU_TIMING={}'.format("ON" if gputiming else "OFF") - ] - if cudadir: - cmake_args += '-DCMAKE_CUDA_COMPILER="{}/bin/nvcc"'.format(cudadir) - build_args = ["--config", "Debug" if self.debug else "Release", "--", "-j{}".format(multiprocessing.cpu_count() + 1)] - if not os.path.exists(buildtmp): - os.makedirs(buildtmp) - env = os.environ.copy() - subprocess.check_call(['cmake', srcdir] + cmake_args, - cwd=buildtmp, env=env) - subprocess.check_call(['cmake', '--build', '.'] + build_args, - cwd=buildtmp) - print("Complete.") - def getExtension(self): - libdirs = ['build/cuda'] - if 'LD_LIBRARY_PATH' in os.environ: - libdirs += os.environ['LD_LIBRARY_PATH'].split(':') - return Extension('*', - sources=['ptypy/accelerate/cuda/gpu_extension.pyx'], - include_dirs=[np.get_include()], - libraries=['gpu_extension', 'cudart', 'cufft'], - library_dirs=libdirs, - depends=['build/cuda/libgpu_extension.a', ], - language="c++") diff --git 
a/ptypy/accelerate/base/address_manglers.py b/ptypy/accelerate/base/address_manglers.py index c60543cb4..100c4d382 100644 --- a/ptypy/accelerate/base/address_manglers.py +++ b/ptypy/accelerate/base/address_manglers.py @@ -4,52 +4,83 @@ import numpy as np np.random.seed(0) -class RandomIntMangle(object): + +class BaseMangler(object): ''' - assumes integer pixel shift. + Assumes integer pixel shift. ''' - def __init__(self, max_step_per_shift, start, stop, max_bound=None, randomseed=None): + def __init__(self, max_step_per_shift, start, stop, nshifts, max_bound=None, randomseed=None): # can be initialised in the engine.init self.max_bound = max_bound # maximum distance from the starting positions - self.max_step = lambda it: (max_step_per_shift * (stop - it) / (stop - start)) # maximum step per iteration, decreases with progression - self.call_no = 0 + self.max_step = lambda it: np.ceil(max_step_per_shift * (stop - it) / (stop - start)) # maximum step per iteration, decreases with progression + self.nshifts = nshifts + self.delta = 0 - def mangle_address(self, addr_current, addr_original, iteration): + def get_address(self, index, addr_current, mangled_addr, max_oby, max_obx): ''' - Takes the current address book and adds an offset to it according to the parameters + Mangles the address given a delta shift ''' - mangled_addr = np.zeros_like(addr_current) - mangled_addr[:] = addr_current # make a copy - max_step = self.max_step(iteration) - deltas = np.random.randint(0, max_step + 1, (addr_current.shape[0], 2)) - # the following improves things a lot! - deltas[:, 0] = (-1)**self.call_no - deltas[:, 1] = (-1)**(self.call_no//2) - self.call_no += 1 - - # deltas = np.zeros((addr_current.shape[0], 2)) # for testing old_positions = np.zeros((addr_current.shape[0], 2)) old_positions[:] = addr_current[:, 0, 1, 1:] new_positions = np.zeros((addr_current.shape[0],2)) - # new_positions[1:] = old_positions[1:] + deltas[1:] # first mode is same as all of them. - new_positions[:] = old_positions + deltas # first mode is same as all of them. - self.apply_bounding_box(new_positions, old_positions, addr_original) + new_positions[:] = old_positions + self.delta[index] # first mode is same as all of them.
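+ # self.delta has shape (nshifts, nframes, 2): one (dy, dx) trial shift per frame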
# now update the main matrix (Same for all modes) - for idx in range(addr_original.shape[1]): + for idx in range(addr_current.shape[1]): mangled_addr[:, idx, 1, 1:] = new_positions - return mangled_addr + self.apply_bounding_box(mangled_addr[:,:,1,1], 0, max_oby) + self.apply_bounding_box(mangled_addr[:,:,1,2], 0, max_obx) + + def apply_bounding_box(self, addr, min, max): + ''' + Clip the mangled addresses into the valid bounds + ''' + addr[addr<min] = min + addr[addr>max] = max + + def setup_shifts(self, current_iteration, nframes=1): + ''' + Arrange an array of shifts + ''' + raise NotImplementedError("This method needs to be overridden in order to correct positions") + + +class RandomIntMangler(BaseMangler): - def apply_bounding_box(self, new_positions, old_positions, addr_original): + def __init__(self, *args, **kwargs): + super(RandomIntMangler, self).__init__(*args, **kwargs) + + def setup_shifts(self, current_iteration, nframes=1): + ''' + Calculates random integer shifts + ''' + max_step = self.max_step(current_iteration) + self.delta = np.random.randint(0, max_step + 1, (self.nshifts, nframes, 2)) + for index in range(self.nshifts): + self.delta[index, :, 0] *= (-1)**index + self.delta[index, :, 1] *= (-1)**(index//2) + # check if the shifts are within the maximum bound + norms = np.linalg.norm(self.delta, axis=-1) + self.delta[norms > self.max_bound] = 0 + +class GridSearchMangler(BaseMangler): + def __init__(self, *args, **kwargs): + super(GridSearchMangler, self).__init__(*args, **kwargs) + + def setup_shifts(self, current_iteration, nframes=1): ''' - Checks if the new co-ordinates lie within the bounding box. If not, we undo this move. + Calculates integer shifts on a grid ''' + max_step = self.max_step(current_iteration) + delta = np.mgrid[-max_step:max_step+1:1, + -max_step:max_step+1:1] + within_bound = (delta[0]**2 + delta[1]**2) < (self.max_bound**2) + self.delta = np.tile(delta[:,within_bound].T.reshape(within_bound.sum(),1,2), (1,nframes,1)) + self.nshifts = self.delta.shape[0] + + + + + - distances_from_original = new_positions - addr_original[:, 0, 1, 1:] - # logger.warning("distance from original is %s" % repr(distances_from_original)) - norms = np.linalg.norm(distances_from_original, axis=-1) - for i in range(len(new_positions)): - if norms[i]> self.max_bound: - new_positions[i] = old_positions[i] - # new_positions[norms>self.max_bound] = old_positions[norms>self.max_bound] # make sure we aren't outside the bounding box -# \ No newline at end of file diff --git a/ptypy/accelerate/base/array_utils.py b/ptypy/accelerate/base/array_utils.py index c2d341711..839b08e70 100644 --- a/ptypy/accelerate/base/array_utils.py +++ b/ptypy/accelerate/base/array_utils.py @@ -17,6 +17,12 @@ def dot(A, B, acc_dtype=np.float64): return r def norm2(A): return dot(A, A) +def max_abs2(A): + ''' + A has ndim = 3. + compute abs2, sum along first dimension and take maximum along last two dims + ''' + return np.max(np.sum(np.abs(A)**2,axis=0),axis=(-2,-1)) def abs2(input): ''' @@ -26,6 +32,7 @@ def abs2(input): ''' return np.multiply(input, input.conj()).real + def sum_to_buffer(in1, outshape, in1_addr, out1_addr, dtype): ''' :param in1. An array . Can be inplace. Can be complex or real. @@ -40,6 +47,7 @@ def sum_to_buffer(in1, outshape, in1_addr, out1_addr, dtype): out1[o1[0], o1[1]:(o1[1] + inshape[1]), o1[2]:(o1[2] + inshape[2])] += in1[i1[0]] return out1 + def norm2(input): ''' Input here could be a variety of 1D, 2D, 3D complex or real.
all will be single precision at least. @@ -47,17 +55,20 @@ ''' return np.sum(abs2(input)) + def complex_gaussian_filter(input, mfs): ''' takes 2D and 3D arrays. Complex input, complex output. mfs has len 0<len(mfs)<=2 ''' - if len(mfs)>2: + if len(mfs) > 2: raise NotImplementedError("Only batches of 2D arrays allowed!") if input.ndim == 3: mfs = np.insert(mfs, 0, 0) - return (ndi.gaussian_filter(np.real(input), mfs) +1j *ndi.gaussian_filter(np.imag(input), mfs)).astype(input.dtype) + return (ndi.gaussian_filter(np.real(input), mfs) + 1j * ndi.gaussian_filter(np.imag(input), mfs)).astype( + input.dtype) + def mass_center(A): ''' @@ -65,6 +76,7 @@ def mass_center(A): ''' return np.array(ndi.measurements.center_of_mass(A), dtype=A.dtype) + def interpolated_shift(c, shift, do_linear=False): ''' complex bicubic interpolated shift. @@ -72,9 +84,13 @@ def interpolated_shift(c, shift, do_linear=False): ''' if not do_linear: - return ndi.interpolation.shift(np.real(c), shift, order=3, prefilter=True) + 1j*ndi.interpolation.shift(np.imag(c), shift, order=3, prefilter=True) + return ndi.interpolation.shift(np.real(c), shift, order=3, prefilter=True) + 1j * ndi.interpolation.shift( + np.imag(c), shift, order=3, prefilter=True) else: - return ndi.interpolation.shift(np.real(c), shift, order=1, mode='constant', cval=0, prefilter=False) + 1j * ndi.interpolation.shift(np.imag(c), shift, order=1, mode='constant', cval=0, prefilter=False) + return ndi.interpolation.shift(np.real(c), shift, order=1, mode='constant', cval=0, + prefilter=False) + 1j * ndi.interpolation.shift(np.imag(c), shift, order=1, + mode='constant', cval=0, + prefilter=False) def clip_complex_magnitudes_to_range(complex_input, clip_min, clip_max): @@ -84,4 +100,51 @@ def clip_complex_magnitudes_to_range(complex_input, clip_min, clip_max): ampl = np.abs(complex_input) phase = np.exp(1j * np.angle(complex_input)) ampl = np.clip(ampl, clip_min, clip_max) - complex_input[:] = ampl * phase \ No newline at end of file + complex_input[:] = ampl * phase + + +def fill3D(A, B, offset=[0, 0, 0]): + """ + Fill 3-dimensional array A with B. + """ + if A.ndim < 3 or B.ndim < 3: + raise ValueError('Input arrays must each be at least 3D') + assert A.ndim == B.ndim, "Input and Output must have the same number of dimensions." + ash = A.shape + bsh = B.shape + misfit = np.array(bsh) - np.array(ash) + assert not misfit[:-3].any(), "Input and Output must have the same shape everywhere but the last three axes." + + Alim = np.array(A.shape[-3:]) + Blim = np.array(B.shape[-3:]) + off = np.array(offset) + Ao = off.copy() + Ao[Ao < 0] = 0 + Bo = -off.copy() + Bo[Bo < 0] = 0 + assert (Bo < Blim).all() and (Ao < Alim).all(), "At least one dimension lacks overlap" + A[..., Ao[0]:min(off[0] + Blim[0], Alim[0]), + Ao[1]:min(off[1] + Blim[1], Alim[1]), + Ao[2]:min(off[2] + Blim[2], Alim[2])] \ + = B[..., Bo[0]:min(Alim[0] - off[0], Blim[0]), + Bo[1]:min(Alim[1] - off[1], Blim[1]), + Bo[2]:min(Alim[2] - off[2], Blim[2])] + + +def crop_pad_2d_simple(A, B): + """ + Places B in A centered around the last two axes. A and B must be of the same shape + everywhere but the last two dims. + """ + assert A.ndim >= 2, "Arrays must have at least 2 dimensions." + assert A.ndim == B.ndim, "Input and Output must have the same number of dimensions." + misfit = np.array(A.shape) - np.array(B.shape) + assert not misfit[:-2].any(), "Input and Output must have the same shape everywhere but the last two axes."
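+ # promote plain 2D inputs to 3D so fill3D can treat the last two axes uniformly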
+ if A.ndim == 2: + A = A.reshape((1,) + A.shape) + if B.ndim == 2: + B = B.reshape((1,) + B.shape) + a1, a2 = A.shape[-2:] + b1, b2 = B.shape[-2:] + offset = [0, a1 // 2 - b1 // 2, a2 // 2 - b2 // 2] + fill3D(A, B, offset) diff --git a/ptypy/accelerate/base/engines/DM_serial.py b/ptypy/accelerate/base/engines/DM_serial.py index 7352a0fce..44573bf56 100644 --- a/ptypy/accelerate/base/engines/DM_serial.py +++ b/ptypy/accelerate/base/engines/DM_serial.py @@ -7,10 +7,6 @@ :copyright: Copyright 2014 by the PTYPY team, see AUTHORS. :license: GPLv2, see LICENSE for details. """ - -# from .. import core -from __future__ import division - import numpy as np import time @@ -19,7 +15,6 @@ from ptypy.utils import parallel from ptypy.engines import BaseEngine, register, DM from ptypy.accelerate.base.kernels import FourierUpdateKernel, AuxiliaryWaveKernel, PoUpdateKernel, PositionCorrectionKernel -from ptypy.accelerate.base import address_manglers from ptypy.accelerate.base import array_utils as au @@ -32,8 +27,6 @@ # - Propagator needs to be reconfigurable for a certain batch size, gpyfft hates that. # - Fourier_update_kernel needs to allow batched execution -## for debugging -#from matplotlib import pyplot as plt __all__ = ['DM_serial'] @@ -200,17 +193,8 @@ def _setup_kernels(self): kern.resolution = geo.resolution[0] if self.do_position_refinement: - addr_mangler = address_manglers.RandomIntMangle(int(self.p.position_refinement.amplitude // geo.resolution[0]), - self.p.position_refinement.start, - self.p.position_refinement.stop, - max_bound=int(self.p.position_refinement.max_shift // geo.resolution[0]), - randomseed=0) - logger.warning("amplitude is %s " % (self.p.position_refinement.amplitude // geo.resolution[0])) - logger.warning("max bound is %s " % (self.p.position_refinement.max_shift // geo.resolution[0])) - - kern.PCK = PositionCorrectionKernel(aux, nmodes) + kern.PCK = PositionCorrectionKernel(aux, nmodes, self.p.position_refinement, geo.resolution) kern.PCK.allocate() - kern.PCK.address_mangler = addr_mangler def engine_prepare(self): @@ -333,7 +317,7 @@ def engine_iterate(self, num=1): ## build exit wave t1 = time.time() - AWK.build_exit(aux, addr, ob, pr, ex) + AWK.build_exit(aux, addr, ob, pr, ex, alpha=self.p.alpha) FUK.exit_error(aux,addr) FUK.error_reduce(addr, err_exit) self.benchmark.E_Build_exit += time.time() - t1 @@ -350,7 +334,7 @@ def engine_iterate(self, num=1): self.overlap_update(MPI=True) parallel.barrier() - if self.do_position_refinement and (self.curiter): + if self.do_position_refinement: do_update_pos = (self.p.position_refinement.stop > self.curiter >= self.p.position_refinement.start) do_update_pos &= (self.curiter % self.p.position_refinement.interval) == 0 @@ -370,7 +354,8 @@ def engine_iterate(self, num=1): kern = self.kernels[prep.label] aux = kern.aux addr = prep.addr - original_addr = prep.original_addr # use this instead of the one in the address mangler. 
+ original_addr = prep.original_addr + mangled_addr = addr.copy() mag = prep.mag ma_sum = prep.ma_sum err_fourier = prep.err_fourier @@ -378,16 +363,34 @@ def engine_iterate(self, num=1): PCK = kern.PCK FW = kern.FW + # Keep track of object boundaries + max_oby = ob.shape[-2] - aux.shape[-2] - 1 + max_obx = ob.shape[-1] - aux.shape[-1] - 1 + + # We need to re-calculate the current error + PCK.build_aux(aux, addr, ob, pr) + aux[:] = FW(aux) + if self.p.position_refinement.metric == "fourier": + PCK.fourier_error(aux, addr, mag, ma, ma_sum) + PCK.error_reduce(addr, err_fourier) + if self.p.position_refinement.metric == "photon": + PCK.log_likelihood(aux, addr, mag, ma, err_fourier) error_state = np.zeros_like(err_fourier) error_state[:] = err_fourier + PCK.mangler.setup_shifts(self.curiter, nframes=addr.shape[0]) + log(4, 'Position refinement trial: iteration %s' % (self.curiter)) - for i in range(self.p.position_refinement.nshifts): - mangled_addr = PCK.address_mangler.mangle_address(addr, original_addr, self.curiter) + for i in range(PCK.mangler.nshifts): + PCK.mangler.get_address(i, addr, mangled_addr, max_oby, max_obx) PCK.build_aux(aux, mangled_addr, ob, pr) aux[:] = FW(aux) - PCK.fourier_error(aux, mangled_addr, mag, ma, ma_sum) - PCK.error_reduce(mangled_addr, err_fourier) + if self.p.position_refinement.metric == "fourier": + PCK.fourier_error(aux, mangled_addr, mag, ma, ma_sum) + PCK.error_reduce(mangled_addr, err_fourier) + if self.p.position_refinement.metric == "photon": + PCK.log_likelihood(aux, mangled_addr, mag, ma, err_fourier) PCK.update_addr_and_error_state(addr, error_state, mangled_addr, err_fourier) + prep.err_fourier = error_state prep.addr = addr @@ -418,8 +421,6 @@ def overlap_update(self, MPI=True): # Update probe log(4, prestr + '----- probe update -----', True) change = self.probe_update(MPI=(parallel.size > 1 and MPI)) - # change = self.probe_update(MPI=(parallel.size>1 and MPI)) - log(4, prestr + 'change in probe is %.3f' % change, True) # stop iteration if probe change is small @@ -434,7 +435,7 @@ def object_update(self, MPI=False): cfact = self.p.object_inertia * self.mean_power if self.p.obj_smooth_std is not None: - logger.info('Smoothing object, cfact is %.2f' % cfact) + log(4, 'Smoothing object, cfact is %.2f' % cfact) smooth_mfs = [self.p.obj_smooth_std, self.p.obj_smooth_std] ob.data = cfact * au.complex_gaussian_filter(ob.data, smooth_mfs) else: @@ -464,19 +465,19 @@ def object_update(self, MPI=False): parallel.allreduce(ob.data) parallel.allreduce(obn.data) ob.data /= obn.data - - # Clip object (This call takes like one ms. Not time critical) - if self.p.clip_object is not None: - clip_min, clip_max = self.p.clip_object - ampl_obj = np.abs(ob.data) - phase_obj = np.exp(1j * np.angle(ob.data)) - too_high = (ampl_obj > clip_max) - too_low = (ampl_obj < clip_min) - ob.data[too_high] = clip_max * phase_obj[too_high] - ob.data[too_low] = clip_min * phase_obj[too_low] else: ob.data /= obn.data + # Clip object (This call takes like one ms. 
Not time critical) + if self.p.clip_object is not None: + clip_min, clip_max = self.p.clip_object + ampl_obj = np.abs(ob.data) + phase_obj = np.exp(1j * np.angle(ob.data)) + too_high = (ampl_obj > clip_max) + too_low = (ampl_obj < clip_min) + ob.data[too_high] = clip_max * phase_obj[too_high] + ob.data[too_low] = clip_min * phase_obj[too_low] + self.benchmark.object_update += time.time() - t1 self.benchmark.calls_object += 1 @@ -533,11 +534,11 @@ def probe_update(self, MPI=False): return np.sqrt(change) - def engine_finalize(self): + def engine_finalize(self, benchmark=True): """ try deleting ever helper contianer """ - if parallel.master: + if parallel.master and benchmark: print("----- BENCHMARKS ----") acc = 0. for name in sorted(self.benchmark.keys()): @@ -563,7 +564,7 @@ def engine_finalize(self): res = self.kernels[prep.label].resolution for i,view in enumerate(d.views): for j,(pname, pod) in enumerate(view.pods.items()): - delta = (prep.original_addr[i][j][1][1:] - prep.addr[i][j][1][1:]) * res + delta = (prep.addr[i][j][1][1:] - prep.original_addr[i][j][1][1:]) * res pod.ob_view.coord += delta pod.ob_view.storage.update_views(pod.ob_view) diff --git a/ptypy/accelerate/base/engines/DM_serial_stream.py b/ptypy/accelerate/base/engines/DM_serial_stream.py index e3eadc085..2c65511dc 100644 --- a/ptypy/accelerate/base/engines/DM_serial_stream.py +++ b/ptypy/accelerate/base/engines/DM_serial_stream.py @@ -29,8 +29,6 @@ # - Propagator needs to be reconfigurable for a certain batch size, gpyfft hates that. # - Fourier_update_kernel needs to allow batched execution -## for debugging -#from matplotlib import pyplot as plt __all__ = ['DM_serial_stream'] @@ -139,7 +137,7 @@ def engine_iterate(self, num=1): ## apply changes #2 t1 = time.time() - AWK.build_exit(aux, addr, ob, pr, ex) + AWK.build_exit(aux, addr, ob, pr, ex, alpha=self.p.alpha) self.benchmark.E_Build_exit += time.time() - t1 err_phot = np.zeros_like(err_fourier) diff --git a/ptypy/accelerate/base/engines/DR_serial.py b/ptypy/accelerate/base/engines/DR_serial.py new file mode 100644 index 000000000..b13828919 --- /dev/null +++ b/ptypy/accelerate/base/engines/DR_serial.py @@ -0,0 +1,424 @@ +# -*- coding: utf-8 -*- +""" +Local Difference Map/Alternate Projections reconstruction engine. + +This file is part of the PTYPY package. + + :copyright: Copyright 2014 by the PTYPY team, see AUTHORS. + :license: GPLv2, see LICENSE for details. +""" +import numpy as np +import time + +from ptypy import utils as u +from ptypy.utils.verbose import logger, log +from ptypy.utils import parallel +from ptypy import defaults_tree +from ptypy.engines import register +from ptypy.engines.base import PositionCorrectionEngine +from ptypy.core.manager import Full, Vanilla, Bragg3dModel, BlockVanilla, BlockFull +from ptypy.accelerate.base.engines import DM_serial +from ptypy.accelerate.base.kernels import FourierUpdateKernel, AuxiliaryWaveKernel, PoUpdateKernel, PositionCorrectionKernel +from ptypy.accelerate.base import address_manglers +from ptypy.accelerate.base import array_utils as au + + +__all__ = ['DR_serial'] + +@register() +class DR_serial(PositionCorrectionEngine): + """ + An implementation of the Douglas-Rachford algorithm + that can be operated like the ePIE algorithm. + + Defaults: + + [name] + default = DR_serial + type = str + help = + doc = + + [alpha] + default = 1 + type = float + lowlim = 0.0 + help = Tuning parameter, a value of 0 makes it equal to ePIE. 
+ + [tau] + default = 1 + type = float + lowlim = 0.0 + help = fourier update parameter, a value of 0 means no fourier update. + + [probe_inertia] + default = 1e-9 + type = float + lowlim = 0.0 + help = Weight of the current probe estimate in the update + + [object_inertia] + default = 1e-4 + type = float + lowlim = 0.0 + help = Weight of the current object in the update + + [clip_object] + default = None + type = tuple + help = Clip object amplitude into this interval + + [rescale_probe] + default = True + type = bool + lowlim = 0 + help = Normalise probe power according to data + + [compute_log_likelihood] + default = True + type = bool + help = A switch for computing the log-likelihood error (this can impact the performance of the engine) + + [compute_exit_error] + default = False + type = bool + help = A switch for computing the exitwave error (this can impact the performance of the engine) + + [compute_fourier_error] + default = False + type = bool + help = A switch for computing the fourier error (this can impact the performance of the engine) + + """ + + SUPPORTED_MODELS = [Full, Vanilla, Bragg3dModel, BlockVanilla, BlockFull] + + def __init__(self, ptycho_parent, pars=None): + """ + Local difference map reconstruction engine. + """ + super(DR_serial, self).__init__(ptycho_parent, pars) + + # Instance attributes + self.error = None + self.mean_power = None + + # keep track of timings + self.benchmark = u.Param() + + # Stores all information needed with respect to the diffraction storages. + self.diff_info = {} + self.ob_cfact = {} + self.pr_cfact = {} + self.kernels = {} + + self.ptycho.citations.add_article( + title='Semi-implicit relaxed Douglas-Rachford algorithm (sDR) for ptychography', + author='Pham et al.', + journal='Opt. Express', + volume=27, + year=2019, + page=31246, + doi='10.1364/OE.27.031246', + comment='The local douglas-rachford reconstruction algorithm', + ) + + def engine_initialize(self): + """ + Prepare for reconstruction. + """ + super(DR_serial, self).engine_initialize() + + self.error = [] + self._reset_benchmarks() + self._setup_kernels() + + def _reset_benchmarks(self): + self.benchmark.A_Build_aux = 0. + self.benchmark.B_Prop = 0. + self.benchmark.C_Fourier_update = 0. + self.benchmark.D_iProp = 0. + self.benchmark.E_Build_exit = 0. + self.benchmark.F_LLerror = 0. + self.benchmark.probe_update = 0. + self.benchmark.object_update = 0. + self.benchmark.calls_fourier = 0 + self.benchmark.calls_object = 0 + self.benchmark.calls_probe = 0 + + def _setup_kernels(self): + """ + Setup kernels, one for each scan. Derive scans from ptycho class + """ + # get the scans + for label, scan in self.ptycho.model.scans.items(): + + kern = u.Param() + self.kernels[label] = kern + + # TODO: needs to be adapted for broad bandwidth + geo = scan.geometries[0] + + # Get info to shape buffer arrays + # TODO: make this part of the engine rather than scan + fpc = self.ptycho.frames_per_block + + # TODO : make this more foolproof + try: + nmodes = scan.p.coherence.num_probe_modes * \ + scan.p.coherence.num_object_modes + except: + nmodes = 1 + + # create buffer arrays + ash = (1 * nmodes,) + tuple(geo.shape) + aux = np.zeros(ash, dtype=np.complex64) + kern.aux = aux + + # setup kernels, one for each SCAN. 
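+ # FUK applies the Fourier-magnitude update, POK the probe/object updates, AWK builds the auxiliary wave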
+ kern.FUK = FourierUpdateKernel(aux, nmodes) + kern.FUK.allocate() + + kern.POK = PoUpdateKernel() + kern.POK.allocate() + + kern.AWK = AuxiliaryWaveKernel() + kern.AWK.allocate() + + kern.FW = geo.propagator.fw + kern.BW = geo.propagator.bw + kern.resolution = geo.resolution[0] + + if self.do_position_refinement: + addr_mangler = address_manglers.RandomIntMangle(int(self.p.position_refinement.amplitude // geo.resolution[0]), + self.p.position_refinement.start, + self.p.position_refinement.stop, + max_bound=int(self.p.position_refinement.max_shift // geo.resolution[0]), + randomseed=0) + logger.warning("amplitude is %s " % (self.p.position_refinement.amplitude // geo.resolution[0])) + logger.warning("max bound is %s " % (self.p.position_refinement.max_shift // geo.resolution[0])) + + kern.PCK = PositionCorrectionKernel(aux, nmodes) + kern.PCK.allocate() + kern.PCK.address_mangler = addr_mangler + + def engine_prepare(self): + + """ + Last minute initialization. + + Everything that needs to be recalculated when new data arrives. + """ + if self.ptycho.new_data: + + # recalculate everything + mean_power = 0. + for s in self.di.storages.values(): + mean_power += s.mean_power + self.mean_power = mean_power / len(self.di.storages) + + ## Serialize new data ## + for label, d in self.ptycho.new_data: + prep = u.Param() + prep.label = label + self.diff_info[d.ID] = prep + prep.mag = np.sqrt(np.abs(d.data)) + prep.ma = self.ma.S[d.ID].data.astype(np.float32) + prep.ma_sum = prep.ma.sum(-1).sum(-1) + prep.err_phot = np.zeros_like(prep.ma_sum) + prep.err_fourier = np.zeros_like(prep.ma_sum) + prep.err_exit = np.zeros_like(prep.ma_sum) + + # Unfortunately this needs to be done for all pods, since + # the shape of the probe / object was modified. + # TODO: possible scaling issue, remove the need for padding + for label, d in self.di.storages.items(): + prep = self.diff_info[d.ID] + prep.view_IDs, prep.poe_IDs, prep.addr = DM_serial.serialize_array_access(d) + if self.do_position_refinement: + prep.original_addr = np.zeros_like(prep.addr) + prep.original_addr[:] = prep.addr + pID, oID, eID = prep.poe_IDs + + ob = self.ob.S[oID] + misfit = np.asarray(ob.shape[-2:]) % 32 + if (misfit != 0).any(): + pad = 32 - np.asarray(ob.shape[-2:]) % 32 + ob.data = u.crop_pad(ob.data, [[0, pad[0]], [0, pad[1]]], axes=[-2, -1], filltype='project') + ob.shape = ob.data.shape + + # Keep a list of view indices + prep.rng = np.random.default_rng() + prep.vieworder = np.arange(prep.addr.shape[0]) + + # Modify addresses, copy pa into ea and remove da/ma + prep.addr_ex = np.vstack([prep.addr[:,0,2,0], prep.addr[:,-1,2,0]+1]).T + prep.addr[:,:,2] = prep.addr[:,:,0] + prep.addr[:,:,3:,0] = 0 + + # Reference to ex + prep.ex = self.ex.S[eID].data + + # calculate c_facts + #cfact = self.p.object_inertia * self.mean_power + #self.ob_cfact[oID] = cfact / u.parallel.size + + #pr = self.pr.S[pID] + #cfact = self.p.probe_inertia * len(pr.views) / pr.data.shape[0] + #self.pr_cfact[pID] = cfact / u.parallel.size + + + def engine_iterate(self, num=1): + """ + Compute one iteration. 
+ """ + for it in range(num): + + error_dct = {} + + for dID in self.di.S.keys(): + + # find probe, object and exit ID in dependence of dID + prep = self.diff_info[dID] + pID, oID, eID = prep.poe_IDs + + # references for kernels + kern = self.kernels[prep.label] + FUK = kern.FUK + AWK = kern.AWK + POK = kern.POK + FW = kern.FW + BW = kern.BW + + # global aux buffer + aux = kern.aux + + # references for ob, pr + ob = self.ob.S[oID].data + pr = self.pr.S[pID].data + + # shuffle view order + vieworder = prep.vieworder + prep.rng.shuffle(vieworder) + + # Iterate through views + for i in vieworder: + + # Get local adress and arrays + addr = prep.addr[i,None] + ex_from, ex_to = prep.addr_ex[i] + ex = prep.ex[ex_from:ex_to] + mag = prep.mag[i,None] + ma = prep.ma[i,None] + ma_sum = prep.ma_sum[i,None] + err_phot = prep.err_phot[i,None] + err_fourier = prep.err_fourier[i,None] + err_exit = prep.err_exit[i,None] + + ## build auxilliary wave + t1 = time.time() + AWK.build_aux(aux, addr, ob, pr, ex, alpha=self.p.alpha) + self.benchmark.A_Build_aux += time.time() - t1 + + ## forward FFT + t1 = time.time() + aux[:] = FW(aux) + self.benchmark.B_Prop += time.time() - t1 + + ## Deviation from measured data + t1 = time.time() + if self.p.compute_fourier_error: + FUK.fourier_error(aux, addr, mag, ma, ma_sum) + FUK.error_reduce(addr, err_fourier) + else: + FUK.fourier_deviation(aux, addr, mag) + FUK.fmag_update_nopbound(aux, addr, mag, ma) + self.benchmark.C_Fourier_update += time.time() - t1 + + ## backward FFT + t1 = time.time() + aux[:] = BW(aux) + self.benchmark.D_iProp += time.time() - t1 + + ## build exit wave + t1 = time.time() + AWK.build_exit_alpha_tau(aux, addr, ob, pr, ex, alpha=self.p.alpha, tau=self.p.tau) + if self.p.compute_exit_error: + FUK.exit_error(aux,addr) + FUK.error_reduce(addr, err_exit) + self.benchmark.E_Build_exit += time.time() - t1 + self.benchmark.calls_fourier += 1 + + ## probe/object rescale + #if self.p.rescale_probe: + # pr *= np.sqrt(self.mean_power / (np.abs(pr)**2).mean()) + + ## build auxilliary wave (ob * pr product) + t1 = time.time() + AWK.build_aux_no_ex(aux, addr, ob, pr) + self.benchmark.A_Build_aux += time.time() - t1 + + # object update + t1 = time.time() + POK.ob_update_local(addr, ob, pr, ex, aux) + self.benchmark.object_update += time.time() - t1 + self.benchmark.calls_object += 1 + + # probe update + t1 = time.time() + POK.pr_update_local(addr, pr, ob, ex, aux) + self.benchmark.probe_update += time.time() - t1 + self.benchmark.calls_probe += 1 + + ## compute log-likelihood + if self.p.compute_log_likelihood: + t1 = time.time() + aux[:] = FW(aux) + FUK.log_likelihood(aux, addr, mag, ma, err_phot) + self.benchmark.F_LLerror += time.time() - t1 + + # update errors + errs = np.ascontiguousarray(np.vstack([np.hstack(prep.err_fourier), + np.hstack(prep.err_phot), + np.hstack(prep.err_exit)]).T) + error_dct.update(zip(prep.view_IDs, errs)) + + self.curiter += 1 + + error = parallel.gather_dict(error_dct) + return error + + + def engine_finalize(self): + """ + try deleting ever helper contianer + """ + if parallel.master and self.benchmark.calls_fourier: + print("----- BENCHMARKS ----") + acc = 0. + for name in sorted(self.benchmark.keys()): + t = self.benchmark[name] + if name[0] in 'ABCDEFGHI': + print('%20s : %1.3f ms per iteration' % (name, t / self.benchmark.calls_fourier * 1000)) + acc += t + elif str(name) == 'probe_update': + print('%20s : %1.3f ms per call. 
%d calls' % ( + name, t / self.benchmark.calls_probe * 1000, self.benchmark.calls_probe)) + elif str(name) == 'object_update': + print('%20s : %1.3f ms per call. %d calls' % ( + name, t / self.benchmark.calls_object * 1000, self.benchmark.calls_object)) + + print('%20s : %1.3f ms per iteration. %d calls' % ( + 'Fourier_total', acc / self.benchmark.calls_fourier * 1000, self.benchmark.calls_fourier)) + + self._reset_benchmarks() + + if self.do_position_refinement: + for label, d in self.di.storages.items(): + prep = self.diff_info[d.ID] + res = self.kernels[prep.label].resolution + for i,view in enumerate(d.views): + for j,(pname, pod) in enumerate(view.pods.items()): + delta = (prep.original_addr[i][j][1][1:] - prep.addr[i][j][1][1:]) * res + pod.ob_view.coord += delta + pod.ob_view.storage.update_views(pod.ob_view) diff --git a/ptypy/accelerate/base/engines/ML_serial.py b/ptypy/accelerate/base/engines/ML_serial.py index 04bfd58ba..214aa0536 100644 --- a/ptypy/accelerate/base/engines/ML_serial.py +++ b/ptypy/accelerate/base/engines/ML_serial.py @@ -17,16 +17,15 @@ from ptypy.engines.ML import ML, BaseModel from .DM_serial import serialize_array_access from ptypy import utils as u -from ptypy.utils.verbose import logger +from ptypy.utils.verbose import logger, log from ptypy.utils import parallel from ptypy.engines.utils import Cnorm2, Cdot from ptypy.engines import register -from ptypy.accelerate.base.kernels import GradientDescentKernel, AuxiliaryWaveKernel, PoUpdateKernel, \ - PositionCorrectionKernel +from ptypy.accelerate.base.kernels import GradientDescentKernel, AuxiliaryWaveKernel, PoUpdateKernel from ptypy.accelerate.base import address_manglers -__all__ = ['ML_serial'] +__all__ = ['ML_serial'] @register() class ML_serial(ML): @@ -104,20 +103,6 @@ def _setup_kernels(self): kern.FW = geo.propagator.fw kern.BW = geo.propagator.bw - if self.do_position_refinement: - addr_mangler = address_manglers.RandomIntMangle( - int(self.p.position_refinement.amplitude // geo.resolution[0]), - self.p.position_refinement.start, - self.p.position_refinement.stop, - max_bound=int(self.p.position_refinement.max_shift // geo.resolution[0]), - randomseed=0) - logger.warning("amplitude is %s " % (self.p.position_refinement.amplitude // geo.resolution[0])) - logger.warning("max bound is %s " % (self.p.position_refinement.max_shift // geo.resolution[0])) - - kern.PCK = PositionCorrectionKernel(aux, nmodes) - kern.PCK.allocate() - kern.PCK.address_mangler = addr_mangler - def engine_prepare(self): ## Serialize new data ## @@ -137,9 +122,6 @@ def engine_prepare(self): for label, d in self.di.storages.items(): prep = self.diff_info[d.ID] prep.view_IDs, prep.poe_IDs, prep.addr = serialize_array_access(d) - if self.do_position_refinement: - prep.original_addr = np.zeros_like(prep.addr) - prep.original_addr[:] = prep.addr self.ML_model.prepare() @@ -194,7 +176,6 @@ def engine_iterate(self, num=1): # probe/object rescaling if self.p.scale_precond: - cn2_new_pr_grad = cn2_new_pr_grad if cn2_new_pr_grad > 1e-5: scale_p_o = (self.p.scale_probe_object * cn2_new_ob_grad / cn2_new_pr_grad) @@ -362,10 +343,8 @@ def new_grad(self): aux[:] = FW(aux) GDK.make_model(aux, addr) - if self.p.floating_intensities: GDK.floating_intensity(addr, w, I, fic) - GDK.main(aux, addr, w, I) GDK.error_reduce(addr, err_phot) @@ -448,7 +427,6 @@ def poly_line_coeffs(self, c_ob_h, c_pr_h): b[:] = FW(b) GDK.make_a012(f, a, b, addr, I, fic) - GDK.fill_b(addr, Brenorm, w, B) parallel.allreduce(B) diff --git a/ptypy/accelerate/base/kernels.py 
b/ptypy/accelerate/base/kernels.py index 6e62fccdf..b1f109444 100644 --- a/ptypy/accelerate/base/kernels.py +++ b/ptypy/accelerate/base/kernels.py @@ -1,5 +1,6 @@ import numpy as np from ptypy.utils.verbose import logger, log +from .array_utils import max_abs2 class Adict(object): @@ -73,6 +74,28 @@ def fourier_error(self, b_aux, addr, mag, mask, mask_sum): ferr[:] = mask * np.abs(fdev) ** 2 / mask_sum.reshape((maxz, 1, 1)) return + def fourier_deviation(self, b_aux, addr, mag): + # reference shape (write-to shape) + sh = self.fshape + # stopper + maxz = mag.shape[0] + + # batch buffers + fdev = self.npy.fdev[:maxz] + aux = b_aux[:maxz * self.nmodes] + + ## Actual math ## + + # build model from complex fourier magnitudes, summing up + # all modes incoherently + tf = aux.reshape(maxz, self.nmodes, sh[1], sh[2]) + af = np.sqrt((np.abs(tf) ** 2).sum(1)) + + # calculate difference to real data (g_mag) + fdev[:] = af - mag + + return + def error_reduce(self, addr, err_sum): # reference shape (write-to shape) sh = self.fshape @@ -133,6 +156,33 @@ def fmag_all_update(self, b_aux, addr, mag, mask, err_sum, pbound=0.0): aux[:] = (aux.reshape(ish[0] // nmodes, nmodes, ish[1], ish[2]) * fm[:, np.newaxis, :, :]).reshape(ish) return + def fmag_update_nopbound(self, b_aux, addr, mag, mask): + + sh = self.fshape + nmodes = self.nmodes + + # stopper + maxz = mag.shape[0] + + # batch buffers + fdev = self.npy.fdev[:maxz] + aux = b_aux[:maxz * nmodes] + + # write-to shape + ish = aux.shape + + ## Actual math ## + + # local values + fm = np.ones((maxz, sh[1], sh[2]), np.float32) + + af = fdev + mag + fm[:] = (1 - mask) + mask * mag / (af + self.denom) + + # upcasting + aux[:] = (aux.reshape(ish[0] // nmodes, nmodes, ish[1], ish[2]) * fm[:, np.newaxis, :, :]).reshape(ish) + return + def log_likelihood(self, b_aux, addr, mag, mask, err_phot): # reference shape (write-to shape) sh = self.fshape @@ -358,7 +408,6 @@ def build_aux(self, b_aux, addr, ob, pr, ex, alpha=1.0): aux = b_aux[:maxz * nmodes] flat_addr = addr.reshape(maxz * nmodes, sh[2], sh[3]) rows, cols = ex.shape[-2:] - for ind, (prc, obc, exc, mac, dic) in enumerate(flat_addr): tmp = ob[obc[0], obc[1]:obc[1] + rows, obc[2]:obc[2] + cols] * \ pr[prc[0], :, :] * \ @@ -368,7 +417,7 @@ def build_aux(self, b_aux, addr, ob, pr, ex, alpha=1.0): aux[ind, :, :] = tmp return - def build_exit(self, b_aux, addr, ob, pr, ex): + def build_exit(self, b_aux, addr, ob, pr, ex, alpha=1): sh = addr.shape @@ -384,9 +433,35 @@ def build_exit(self, b_aux, addr, ob, pr, ex): rows, cols = ex.shape[-2:] for ind, (prc, obc, exc, mac, dic) in enumerate(flat_addr): - dex = aux[ind, :, :] - \ + dex = aux[ind, :, :] - alpha * \ ob[obc[0], obc[1]:obc[1] + rows, obc[2]:obc[2] + cols] * \ - pr[prc[0], prc[1]:prc[1] + rows, prc[2]:prc[2] + cols] + pr[prc[0], prc[1]:prc[1] + rows, prc[2]:prc[2] + cols] + (alpha - 1) * \ + ex[exc[0], exc[1]:exc[1] + rows, exc[2]:exc[2] + cols] + + ex[exc[0], exc[1]:exc[1] + rows, exc[2]:exc[2] + cols] += dex + aux[ind, :, :] = dex + return + + def build_exit_alpha_tau(self, b_aux, addr, ob, pr, ex, alpha=1, tau=1): + sh = addr.shape + + nmodes = sh[1] + + # stopper + maxz = sh[0] + + # batch buffers + aux = b_aux[:maxz * nmodes] + + flat_addr = addr.reshape(maxz * nmodes, sh[2], sh[3]) + rows, cols = ex.shape[-2:] + + for ind, (prc, obc, exc, mac, dic) in enumerate(flat_addr): + dex = tau * aux[ind, :, :] + (tau * alpha - 1) * \ + ex[exc[0], exc[1]:exc[1] + rows, exc[2]:exc[2] + cols] + \ + (1 - tau * (1 + alpha)) * \ + ob[obc[0], obc[1]:obc[1] + rows, 
obc[2]:obc[2] + cols] * \ + pr[prc[0], prc[1]:prc[1] + rows, prc[2]:prc[2] + cols] ex[exc[0], exc[1]:exc[1] + rows, exc[2]:exc[2] + cols] += dex aux[ind, :, :] = dex @@ -478,8 +553,39 @@ def pr_update_ML(self, addr, pr, ob, ex, fac=2.0): ex[exc[0], exc[1]:exc[1] + rows, exc[2]:exc[2] + cols] * fac return + def ob_update_local(self, addr, ob, pr, ex, aux): + sh = addr.shape + flat_addr = addr.reshape(sh[0] * sh[1], sh[2], sh[3]) + rows, cols = ex.shape[-2:] + pr_norm = max_abs2(pr) + for ind, (prc, obc, exc, mac, dic) in enumerate(flat_addr): + ob[obc[0], obc[1]:obc[1] + rows, obc[2]:obc[2] + cols] += \ + pr[prc[0], prc[1]:prc[1] + rows, prc[2]:prc[2] + cols].conj() * \ + (ex[exc[0], exc[1]:exc[1] + rows, exc[2]:exc[2] + cols] - aux[ind,:,:]) / \ + pr_norm + return + + def pr_update_local(self, addr, pr, ob, ex, aux): + sh = addr.shape + flat_addr = addr.reshape(sh[0] * sh[1], sh[2], sh[3]) + rows, cols = ex.shape[-2:] + ob_norm = max_abs2(ob) + for ind, (prc, obc, exc, mac, dic) in enumerate(flat_addr): + pr[prc[0], prc[1]:prc[1] + rows, prc[2]:prc[2] + cols] += \ + ob[obc[0], obc[1]:obc[1] + rows, obc[2]:obc[2] + cols].conj() * \ + (ex[exc[0], exc[1]:exc[1] + rows, exc[2]:exc[2] + cols] - aux[ind,:,:]) / \ + ob_norm + return + class PositionCorrectionKernel(BaseKernel): - def __init__(self, aux, nmodes): + from ptypy.accelerate.base import address_manglers + + MANGLERS = { + 'Annealing': address_manglers.RandomIntMangler, + 'GridSearch': address_manglers.GridSearchMangler + } + + def __init__(self, aux, nmodes, parameters, resolution): super(PositionCorrectionKernel, self).__init__() ash = aux.shape self.fshape = (ash[0] // nmodes, ash[1], ash[2]) @@ -487,11 +593,20 @@ def __init__(self, aux, nmodes): self.npy.fdev = None self.addr = None self.nmodes = nmodes - self.address_mangler = None + self.param = parameters + self.nshifts = parameters.nshifts + self.resolution = resolution self.kernels = ['build_aux', 'fourier_error', 'error_reduce', 'update_addr'] + self.setup() + + def setup(self): + Mangler = self.MANGLERS[self.param.method] + self.mangler = Mangler(int(self.param.amplitude // self.resolution[0]), self.param.start, self.param.stop, + self.param.nshifts, + max_bound=int(self.param.max_shift // self.resolution[0]), randomseed=0) def allocate(self): self.npy.fdev = np.zeros(self.fshape, dtype=np.float32) # we won't use this again but preallocate for speed @@ -565,11 +680,32 @@ def error_reduce(self, addr, err_sum): err_sum[:] = ferr.sum(-1).sum(-1) return + def log_likelihood(self, b_aux, addr, mag, mask, err_sum): + # reference shape (write-to shape) + sh = self.fshape + # stopper + maxz = mag.shape[0] + + # batch buffers + aux = b_aux[:maxz * self.nmodes] + + # build model from complex fourier magnitudes, summing up + # all modes incoherently + tf = aux.reshape(maxz, self.nmodes, sh[1], sh[2]) + LL = (np.abs(tf) ** 2).sum(1) + + # Intensity data + I = mag**2 + + # Calculate log likelihood error + err_sum[:] = ((mask * (LL - I)**2 / (I + 1.)).sum(-1).sum(-1) / np.prod(LL.shape[-2:])) + return + def update_addr_and_error_state(self, addr, error_state, mangled_addr, err_sum): ''' updates the addresses and err state vector corresponding to the smallest error. 
        I think this can be done on the cpu
        '''
        update_indices = err_sum < error_state
-        log(4, "updating %s indices" % np.sum(update_indices))
+        log(4, "Position correction: updating %s indices" % np.sum(update_indices))
        addr[update_indices] = mangled_addr[update_indices]
        error_state[update_indices] = err_sum[update_indices]
diff --git a/ptypy/accelerate/cuda_pycuda/__init__.py b/ptypy/accelerate/cuda_pycuda/__init__.py
index 04074625b..e6c51d49f 100644
--- a/ptypy/accelerate/cuda_pycuda/__init__.py
+++ b/ptypy/accelerate/cuda_pycuda/__init__.py
@@ -3,8 +3,14 @@
 import numpy as np
 import os
 # debug_options = []
-#debug_options = ['-O0', '-G', '-g', '-std=c++11', '--keep']
-debug_options = ['-O3', '-DNDEBUG', '-std=c++11', '-lineinfo'] # release mode flags
+# debug_options = ['-O0', '-G', '-g']
+debug_options = ['-O3', '-DNDEBUG', '-lineinfo'] # release mode flags
+
+# C++14 support was added with CUDA 9, so we only enable the flag there
+if cuda.get_version()[0] >= 9:
+    debug_options += ['-std=c++14']
+else:
+    debug_options += ['-std=c++11']
 
 context = None
 queue = None
@@ -18,9 +24,13 @@ def get_context(new_context=False, new_queue=False):
     if context is None or new_context:
         cuda.init()
-        if parallel.rank_local < cuda.Device.count():
-            context = cuda.Device(parallel.rank_local).make_context()
-            context.push()
+        if parallel.rank_local >= cuda.Device.count():
+            raise Exception('Local rank must be smaller than total device count, \
+                rank={}, rank_local={}, device_count={}'.format(
+                parallel.rank, parallel.rank_local, cuda.Device.count()
+            ))
+        context = cuda.Device(parallel.rank_local).make_context()
+        context.push()
         # print("made context %s on rank %s" % (str(context), str(parallel.rank)))
         # print("The cuda device count on %s is:%s" % (str(parallel.rank),
         #    str(cuda.Device.count())))
@@ -28,20 +38,31 @@
         #    str(parallel.rank_local)))
     if queue is None or new_queue:
         queue = cuda.Stream()
+    return context, queue
 
 def load_kernel(name, subs={}, file=None):
 
     if file is None:
-        fn = "%s/cuda/%s.cu" % (os.path.dirname(__file__), name)
+        if isinstance(name, str):
+            fn = "%s/cuda/%s.cu" % (os.path.dirname(__file__), name)
+        else:
+            raise ValueError("name parameter must be a string if no filename is given")
    else:
        fn = "%s/cuda/%s" % (os.path.dirname(__file__), file)
-
+
    with open(fn, 'r') as f:
        kernel = f.read()
    for k,v in list(subs.items()):
        kernel = kernel.replace(k, str(v))
+    # insert a preprocessor line directive so that compiler errors reference the original source file
+    escaped = fn.replace("\\", "\\\\")
+    kernel = '#line 1 "{}"\n'.format(escaped) + kernel
    mod = SourceModule(kernel, include_dirs=[np.get_include()], no_extern_c=True, options=debug_options)
-    return mod.get_function(name)
+
+    if isinstance(name, str):
+        return mod.get_function(name)
+    else:  # tuple
+        return tuple(mod.get_function(n) for n in name)
diff --git a/ptypy/accelerate/cuda_pycuda/address_manglers.py b/ptypy/accelerate/cuda_pycuda/address_manglers.py
new file mode 100644
index 000000000..d19a77fa4
--- /dev/null
+++ b/ptypy/accelerate/cuda_pycuda/address_manglers.py
@@ -0,0 +1,74 @@
+from ptypy.accelerate.cuda_pycuda import load_kernel
+import numpy as np
+from ptypy.accelerate.base import address_manglers as npam
+from pycuda import gpuarray
+import pycuda.driver as cuda
+
+class BaseMangler(npam.BaseMangler):
+
+    def __init__(self, *args, queue_thread=None, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.queue = queue_thread
+        self.get_address_cuda = load_kernel("get_address")
+        self.delta = None
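+        # host-side copy of the candidate shifts (filled by setup_shifts and
+        # mirrored onto the device by _setup_delta_gpu)
+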
self.delta_gpu = None + + def _setup_delta_gpu(self): + assert self.delta is not None, "Setup delta using the setup_shifts method first" + self.delta = np.ascontiguousarray(self.delta, dtype=np.int32) + + if self.delta_gpu is None or self.delta_gpu.shape[0] < self.delta.shape[0]: + self.delta_gpu = gpuarray.empty(self.delta.shape, dtype=np.int32) + # in case self.delta is smaller than delta_gpu, this will only copy the + # relevant part + cuda.memcpy_htod(dest=self.delta_gpu.ptr, + src=self.delta) + + def get_address(self, index, addr_current, mangled_addr, max_oby, max_obx): + assert addr_current.dtype == np.int32, "addresses must be int32" + assert mangled_addr.dtype == np.int32, "addresses must be int32" + assert len(addr_current.shape) == 4, "addresses must be 4 dimensions" + assert addr_current.shape == mangled_addr.shape, "output addresses must be pre-allocated" + assert self.delta_gpu is not None, "Deltas are not set yet - call setup_shifts first" + assert index < self.delta_gpu.shape[0], "Index out of range for deltas" + assert isinstance(self.delta_gpu, gpuarray.GPUArray), "Only GPU arrays are supported for delta" + + # only using a single thread block here as it's not enough work + # otherwise + self.get_address_cuda( + addr_current, + mangled_addr, + np.int32(addr_current.shape[0] * addr_current.shape[1]), + self.delta_gpu[index,None], + np.int32(max_oby), + np.int32(max_obx), + block=(64,1,1), + grid=(1, 1, 1), + stream=self.queue) + +# with multiple inheritance, we have to be explicit which super class +# we are calling in the methods +class RandomIntMangler(BaseMangler, npam.RandomIntMangler): + + def __init__(self, *args, **kwargs): + BaseMangler.__init__(self, *args, **kwargs) + + def setup_shifts(self, *args, **kwargs): + npam.RandomIntMangler.setup_shifts(self, *args, **kwargs) + self._setup_delta_gpu() + + def get_address(self, *args, **kwargs): + BaseMangler.get_address(self, *args, **kwargs) + + +class GridSearchMangler(BaseMangler, npam.GridSearchMangler): + + def __init__(self, *args, **kwargs): + BaseMangler.__init__(self, *args, **kwargs) + + def setup_shifts(self, *args, **kwargs): + npam.GridSearchMangler.setup_shifts(self, *args, **kwargs) + self._setup_delta_gpu() + + def get_address(self, *args, **kwargs): + BaseMangler.get_address(self, *args, **kwargs) \ No newline at end of file diff --git a/ptypy/accelerate/cuda_pycuda/array_utils.py b/ptypy/accelerate/cuda_pycuda/array_utils.py index 7ec819b95..85f816223 100644 --- a/ptypy/accelerate/cuda_pycuda/array_utils.py +++ b/ptypy/accelerate/cuda_pycuda/array_utils.py @@ -1,28 +1,45 @@ from . 
import load_kernel
 from pycuda import gpuarray
+import pycuda.driver as cuda
 from ptypy.utils import gaussian
 import numpy as np
 
+
+# maps a numpy dtype to the corresponding C type
+def map2ctype(dt):
+    if dt == np.float32:
+        return 'float'
+    elif dt == np.float64:
+        return 'double'
+    elif dt == np.complex64:
+        return 'complex<float>'
+    elif dt == np.complex128:
+        return 'complex<double>'
+    elif dt == np.int32:
+        return 'int'
+    elif dt == np.int64:
+        return 'long long'
+    else:
+        raise ValueError('No mapping for {}'.format(dt))
+
+
 class ArrayUtilsKernel:
     def __init__(self, acc_dtype=np.float64, queue=None):
         self.queue = queue
         self.acc_dtype = acc_dtype
         self.cdot_cuda = load_kernel("dot", {
-            'INTYPE': 'complex<float>',
-            'ACCTYPE': 'double' if acc_dtype==np.float64 else 'float'
+            'IN_TYPE': 'complex<float>',
+            'ACC_TYPE': 'double' if acc_dtype==np.float64 else 'float'
         })
         self.dot_cuda = load_kernel("dot", {
-            'INTYPE': 'float',
-            'ACCTYPE': 'double' if acc_dtype==np.float64 else 'float'
+            'IN_TYPE': 'float',
+            'ACC_TYPE': 'double' if acc_dtype==np.float64 else 'float'
         })
         self.full_reduce_cuda = load_kernel("full_reduce", {
-            'DTYPE': 'double' if acc_dtype==np.float64 else 'float',
+            'IN_TYPE': 'double' if acc_dtype==np.float64 else 'float',
+            'OUT_TYPE': 'double' if acc_dtype==np.float64 else 'float',
+            'ACC_TYPE': 'double' if acc_dtype==np.float64 else 'float',
             'BDIM_X': 1024
         })
-        self.transpose_cuda = load_kernel("transpose", {
-            'DTYPE': 'int',
-            'BDIM': 16
-        })
         self.Ctmp = None
 
     def dot(self, A, B, out=None):
@@ -60,6 +77,18 @@
         return out
 
+    def norm2(self, A, out=None):
+        return self.dot(A, A, out)
+
+class TransposeKernel:
+
+    def __init__(self, queue=None):
+        self.queue = queue
+        self.transpose_cuda = load_kernel("transpose", {
+            'DTYPE': 'int',
+            'BDIM': 16
+        })
+
     def transpose(self, input, output):
         # only for int at the moment (addr array), and 2D (reshape pls)
         if len(input.shape) != 2:
@@ -80,8 +109,134 @@
         self.transpose_cuda(input, output, np.int32(width), np.int32(height),
                             block=blk, grid=grd, stream=self.queue)
 
-    def norm2(self, A, out=None):
-        return self.dot(A, A, out)
+class MaxAbs2Kernel:
+
+    def __init__(self, queue=None):
+        self.queue = queue
+        # we lazy-load this depending on the data types we get
+        self.max_abs2_cuda = {}
+
+    def max_abs2(self, X, out):
+        """ Calculate max(abs(x)**2) across the final 2 dimensions"""
+        rows = np.int32(X.shape[-2])
+        cols = np.int32(X.shape[-1])
+        firstdims = np.int32(np.prod(X.shape[:-2]))
+        gy = int(rows)
+        # lazy-loading, keeping scratch memory and both kernels in the same dictionary
+        bx = int(64)
+        version = '{},{},{}'.format(map2ctype(X.dtype), map2ctype(out.dtype), gy)
+        if version not in self.max_abs2_cuda:
+            step1, step2 = load_kernel(
+                ("max_abs2_step1", "max_abs2_step2"),
+                {
+                    'IN_TYPE': map2ctype(X.dtype),
+                    'OUT_TYPE': map2ctype(out.dtype),
+                    'BDIM_X': bx,
+                }, "max_abs2.cu")
+            self.max_abs2_cuda[version] = {
+                'step1': step1,
+                'step2': step2,
+                'scratchmem': gpuarray.empty((gy,), dtype=out.dtype)
+            }
+
+        scratch = self.max_abs2_cuda[version]['scratchmem']
+
+        self.max_abs2_cuda[version]['step1'](X, firstdims, rows, cols, scratch,
+                                             block=(bx, 1, 1), grid=(1, gy, 1),
+                                             stream=self.queue)
+        self.max_abs2_cuda[version]['step2'](scratch, np.int32(gy), out,
+                                             block=(bx, 1, 1), grid=(1, 1, 1),
+                                             stream=self.queue
+                                             )
+
+
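+# A minimal usage sketch for MaxAbs2Kernel (hypothetical names; assumes an
+# active pycuda context and a 3D complex64 gpuarray `probe`):
+#
+#   mak = MaxAbs2Kernel(queue=stream)
+#   out = gpuarray.zeros((1,), dtype=np.float32)
+#   mak.max_abs2(probe, out)   # out receives max(|probe|**2) over the last two axes
+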
+class CropPadKernel:
+
+    def __init__(self, queue=None):
+        self.queue = queue
+        # we lazy-load this depending on the data types we get
+        self.fill3D_cuda = {}
+
+    def fill3D(self, A, B, offset=[0, 0, 0]):
+        """
+        Fill 3-dimensional array A with B.
+        """
+        if A.ndim < 3 or B.ndim < 3:
+            raise ValueError('Input arrays must each be at least 3D')
+        assert A.ndim == B.ndim, "Input and Output must have the same number of dimensions."
+        ash = A.shape
+        bsh = B.shape
+        misfit = np.array(bsh) - np.array(ash)
+        assert not misfit[:-3].any(), "Input and Output must have the same shape everywhere but the last three axes."
+
+        Alim = np.array(A.shape[-3:])
+        Blim = np.array(B.shape[-3:])
+        off = np.array(offset)
+        Ao = off.copy()
+        Ao[Ao < 0] = 0
+        Bo = -off.copy()
+        Bo[Bo < 0] = 0
+        assert (Bo < Blim).all() and (Ao < Alim).all(), "At least one dimension lacks overlap"
+        Ao = Ao.astype(np.int32)
+        Bo = Bo.astype(np.int32)
+        lengths = np.array([
+            min(off[0] + Blim[0], Alim[0]) - Ao[0],
+            min(off[1] + Blim[1], Alim[1]) - Ao[1],
+            min(off[2] + Blim[2], Alim[2]) - Ao[2],
+        ], dtype=np.int32)
+        lengths2 = np.array([
+            min(Alim[0] - off[0], Blim[0]) - Bo[0],
+            min(Alim[1] - off[1], Blim[1]) - Bo[1],
+            min(Alim[2] - off[2], Blim[2]) - Bo[2],
+        ], dtype=np.int32)
+        assert (lengths == lengths2).all(), "left and right lengths do not match"
+        batch = int(np.prod(A.shape[:-3]))
+
+        # lazy loading depending on data type
+        version = '{},{}'.format(map2ctype(B.dtype), map2ctype(A.dtype))
+        if version not in self.fill3D_cuda:
+            self.fill3D_cuda[version] = load_kernel("fill3D", {
+                'IN_TYPE': map2ctype(B.dtype),
+                'OUT_TYPE': map2ctype(A.dtype)
+            })
+        bx = by = 32
+        self.fill3D_cuda[version](
+            A, B,
+            np.int32(A.shape[-3]), np.int32(A.shape[-2]), np.int32(A.shape[-1]),
+            np.int32(B.shape[-3]), np.int32(B.shape[-2]), np.int32(B.shape[-1]),
+            Ao[0], Ao[1], Ao[2],
+            Bo[0], Bo[1], Bo[2],
+            lengths[0], lengths[1], lengths[2],
+            block=(int(bx), int(by), int(1)),
+            grid=(
+                int((lengths[2] + bx - 1)//bx),
+                int((lengths[1] + by - 1)//by),
+                int(batch)),
+            stream=self.queue
+        )
+
+    def crop_pad_2d_simple(self, A, B):
+        """
+        Places B in A centered around the last two axes. A and B must be of the same shape
+        everywhere but the last two dims.
+        """
+        assert A.ndim >= 2, "Arrays must have at least 2 dimensions."
+        assert A.ndim == B.ndim, "Input and Output must have the same number of dimensions."
+        misfit = np.array(A.shape) - np.array(B.shape)
+        assert not misfit[:-2].any(), "Input and Output must have the same shape everywhere but the last two axes."
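+        # promote plain 2D inputs to 3D so the generic fill3D kernel can be
+        # used unchanged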
+        if A.ndim == 2:
+            A = A.reshape((1,) + A.shape)
+        if B.ndim == 2:
+            B = B.reshape((1,) + B.shape)
+        a1, a2 = A.shape[-2:]
+        b1, b2 = B.shape[-2:]
+        offset = [0, a1 // 2 - b1 // 2, a2 // 2 - b2 // 2]
+        self.fill3D(A, B, offset)
 
 
 class DerivativesKernel:
@@ -99,30 +254,27 @@ def __init__(self, dtype, queue=None):
         self.last_axis_block = (256, 4, 1)
         self.mid_axis_block = (256, 4, 1)
 
-        self.delxf_last = load_kernel("delx_last", file="delx_last.cu", subs={
-            'IS_FORWARD': 'true',
-            'BDIM_X': str(self.last_axis_block[0]),
-            'BDIM_Y': str(self.last_axis_block[1]),
-            'DTYPE': stype
-        })
-        self.delxb_last = load_kernel("delx_last", file="delx_last.cu", subs={
-            'IS_FORWARD': 'false',
-            'BDIM_X': str(self.last_axis_block[0]),
-            'BDIM_Y': str(self.last_axis_block[1]),
-            'DTYPE': stype
-        })
-        self.delxf_mid = load_kernel("delx_mid", file="delx_mid.cu", subs={
-            'IS_FORWARD': 'true',
-            'BDIM_X': str(self.mid_axis_block[0]),
-            'BDIM_Y': str(self.mid_axis_block[1]),
-            'DTYPE': stype
-        })
-        self.delxb_mid = load_kernel("delx_mid", file="delx_mid.cu", subs={
-            'IS_FORWARD': 'false',
-            'BDIM_X': str(self.mid_axis_block[0]),
-            'BDIM_Y': str(self.mid_axis_block[1]),
-            'DTYPE': stype
-        })
+        self.delxf_last, self.delxf_mid = load_kernel(
+            ("delx_last", "delx_mid"),
+            file="delx.cu",
+            subs={
+                'IS_FORWARD': 'true',
+                'BDIM_X': str(self.last_axis_block[0]),
+                'BDIM_Y': str(self.last_axis_block[1]),
+                'IN_TYPE': stype,
+                'OUT_TYPE': stype
+            })
+        self.delxb_last, self.delxb_mid = load_kernel(
+            ("delx_last", "delx_mid"),
+            file="delx.cu",
+            subs={
+                'IS_FORWARD': 'false',
+                'BDIM_X': str(self.last_axis_block[0]),
+                'BDIM_Y': str(self.last_axis_block[1]),
+                'IN_TYPE': stype,
+                'OUT_TYPE': stype
+            })
+
     def delxf(self, input, out, axis=-1):
         if input.dtype != self.dtype:
@@ -188,34 +340,59 @@
 class GaussianSmoothingKernel:
-    def __init__(self, queue=None, num_stdevs=4):
+    def __init__(self, queue=None, num_stdevs=4, kernel_type='float'):
+        if kernel_type not in ['float', 'double']:
+            raise ValueError('Invalid data type for kernel')
+        self.kernel_type = kernel_type
         self.dtype = np.complex64
         self.stype = "complex<float>"
         self.queue = queue
         self.num_stdevs = num_stdevs
         self.blockdim_x = 4
         self.blockdim_y = 16
 
+        # At least 2 blocks per SM
         self.max_shared_per_block = 48 * 1024 // 2
         self.max_shared_per_block_complex = self.max_shared_per_block / 2 * np.dtype(np.float32).itemsize
-        self.max_kernel_radius = self.max_shared_per_block_complex / self.blockdim_y
+        self.max_kernel_radius = int(self.max_shared_per_block_complex / self.blockdim_y)
 
-        self.convolution_row = load_kernel("convolution_row", file="convolution.cu", subs={
-            'BDIM_X': self.blockdim_x,
-            'BDIM_Y': self.blockdim_y,
-            'DTYPE': self.stype
+        self.convolution_row = load_kernel(
+            "convolution_row", file="convolution.cu", subs={
+                'BDIM_X': self.blockdim_x,
+                'BDIM_Y': self.blockdim_y,
+                'DTYPE': self.stype,
+                'MATH_TYPE': self.kernel_type
             })
 
-        self.convolution_col = load_kernel("convolution_col", file="convolution.cu", subs={
-            'BDIM_X': self.blockdim_y,
-            'BDIM_Y': self.blockdim_x,
-            'DTYPE': self.stype
+        self.convolution_col = load_kernel(
+            "convolution_col", file="convolution.cu", subs={
+                'BDIM_X': self.blockdim_y,  # NOTE: x and y are deliberately swapped for the column kernel
+                'BDIM_Y': self.blockdim_x,
+                'DTYPE': self.stype,
+                'MATH_TYPE': self.kernel_type
             })
 
+        # pre-allocate kernel memory on gpu, sized to accommodate the maximum permissible radius
+        dtype = np.float32 if self.kernel_type == 'float' else np.float64
+        self.kernel_gpu = gpuarray.empty((self.max_kernel_radius,), dtype=dtype)
+        # keep track of the previous radius and std to determine if we need to transfer again
+        self.r = 0
+        self.std = 0
 
-    def convolution(self, input, output, mfs):
-        ndims = input.ndim
-        shape = input.shape
+    def convolution(self, data, mfs, tmp=None):
+        """
+        Calculates a stacked 2D convolution for smoothing, with the standard deviations
+        given in mfs (stdx, stdy). It works in-place in the data array,
+        and tmp is a gpu-allocated array of the same size and type as data,
+        used internally for temporary storage.
+        """
+        ndims = data.ndim
+        shape = data.shape
+
+        # Create temporary array (if not given)
+        if tmp is None:
+            tmp = gpuarray.empty(shape, dtype=data.dtype)
+        assert shape == tmp.shape and data.dtype == tmp.dtype
 
         # Check input dimensions
         if ndims == 3:
@@ -232,15 +409,23 @@
         else:
             raise NotImplementedError("input needs to be of dimensions 0 < ndims <= 3")
 
+        input = data
+        output = tmp
+
         # Row convolution kernel
         # TODO: is this threshold acceptable in all cases?
         if stdx > 0.1:
             r = int(self.num_stdevs * stdx + 0.5)
-            g = gaussian(np.arange(-r,r+1), stdx)
-            g /= g.sum()
-            kernel = gpuarray.to_gpu(g[r:].astype(np.float32))
             if r > self.max_kernel_radius:
                 raise ValueError("Size of Gaussian kernel too large")
+            if r != self.r or stdx != self.std:
+                # recalculate + transfer
+                g = gaussian(np.arange(-r,r+1), stdx)
+                g /= g.sum()
+                k = np.ascontiguousarray(g[r:].astype(np.float32 if self.kernel_type == 'float' else np.float64))
+                self.kernel_gpu[:r+1] = k[:]
+                self.r = r
+                self.std = stdx
 
             bx = self.blockdim_x
             by = self.blockdim_y
@@ -251,21 +436,27 @@
             blk = (bx, by, 1)
             grd = (int((y + bx -1)// bx), int((x + by-1)// by), batches)
 
-            self.convolution_row(input, output, np.int32(y), np.int32(x), kernel, np.int32(r),
+            self.convolution_row(input, output, np.int32(y), np.int32(x), self.kernel_gpu, np.int32(r),
                                  block=blk, grid=grd, shared=shared, stream=self.queue)
 
-            # Overwrite input
             input = output
-
+            output = data
+
         # Column convolution kernel
         # TODO: is this threshold acceptable in all cases?
         if stdy > 0.1:
             r = int(self.num_stdevs * stdy + 0.5)
-            g = gaussian(np.arange(-r,r+1), stdy)
-            g /= g.sum()
-            kernel = gpuarray.to_gpu(g[r:].astype(np.float32))
             if r > self.max_kernel_radius:
                 raise ValueError("Size of Gaussian kernel too large")
+            if r != self.r or stdy != self.std:
+                # recalculate + transfer
+                g = gaussian(np.arange(-r,r+1), stdy)
+                g /= g.sum()
+                k = np.ascontiguousarray(g[r:].astype(np.float32 if self.kernel_type == 'float' else np.float64))
+                self.kernel_gpu[:r+1] = k[:]
+                self.r = r
+                self.std = stdy
+
             bx = self.blockdim_y
             by = self.blockdim_x
@@ -276,9 +467,35 @@
             blk = (bx, by, 1)
             grd = (int((y + bx -1)// bx), int((x + by-1)// by), batches)
 
-            self.convolution_col(input, output, np.int32(y), np.int32(x), kernel, np.int32(r),
+            self.convolution_col(input, output, np.int32(y), np.int32(x), self.kernel_gpu, np.int32(r),
                                  block=blk, grid=grd, shared=shared, stream=self.queue)
 
         # TODO: is this threshold acceptable in all cases?
if (stdx <= 0.1 and stdy <= 0.1): - output[:] = input[:] + return # nothing to do + elif (stdx > 0.1 and stdy > 0.1): + return # both parts have run, output is back in data + else: + data[:] = tmp[:] # only one of them has run, output is in tmp + +class ClipMagnitudesKernel: + + def __init__(self, queue=None): + self.queue = queue + self.clip_magnitudes_cuda = load_kernel("clip_magnitudes", { + 'IN_TYPE': 'complex', + }) + + def clip_magnitudes_to_range(self, array, clip_min, clip_max): + + cmin = np.float32(clip_min) + cmax = np.float32(clip_max) + + npixel = np.int32(np.prod(array.shape)) + bx = 256 + gx = int((npixel + bx - 1) // bx) + self.clip_magnitudes_cuda(array, cmin, cmax, + npixel, + block=(bx, 1, 1), + grid=(gx, 1, 1), + stream=self.queue) diff --git a/ptypy/accelerate/cuda_pycuda/cuda/batched_multiply.cu b/ptypy/accelerate/cuda_pycuda/cuda/batched_multiply.cu index 15ca555fa..1263841b6 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/batched_multiply.cu +++ b/ptypy/accelerate/cuda_pycuda/cuda/batched_multiply.cu @@ -1,13 +1,19 @@ /** This kernel was used for FFT pre- and post-scaling, to test if cuFFT via python is worthwhile. It turned out it wasn't. -*/ + * + * Data types: + * - IN_TYPE: the data type for the inputs + * - OUT_TYPE: the data type for the outputs + * - MATH_TYPE: the data type used for computation (filter) + */ + #include using thrust::complex; -extern "C" __global__ void batched_multiply(const complex* input, - complex* output, - const complex* filter, +extern "C" __global__ void batched_multiply(const complex* input, + complex* output, + const complex* filter, float scale, int nBatches, int rows, diff --git a/ptypy/accelerate/cuda_pycuda/cuda/build_aux.cu b/ptypy/accelerate/cuda_pycuda/cuda/build_aux.cu index 88b22c256..e9ceeb80c 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/build_aux.cu +++ b/ptypy/accelerate/cuda_pycuda/cuda/build_aux.cu @@ -1,24 +1,43 @@ +/** build_aux kernel. 
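+ * Per mode: aux = (1 + alpha) * obj * probe - alpha * ex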
+ *
+ * Data types:
+ * - IN_TYPE: the data type for the inputs (float or double)
+ * - OUT_TYPE: the data type for the outputs (float or double - for aux wave)
+ * - MATH_TYPE: the data type used for computation
+ */
+
 #include <thrust/complex.h>
 using thrust::complex;
 
+// core calculation function - used by both kernels and inlined
+inline __device__ complex<MATH_TYPE> calculate(
+    const complex<MATH_TYPE>& t_obj,
+    const complex<MATH_TYPE>& t_probe,
+    const complex<MATH_TYPE>& t_ex,
+    MATH_TYPE alpha)
+{
+  return t_obj * t_probe * (MATH_TYPE(1) + alpha) - t_ex * alpha;
+}
+
 extern "C" __global__ void build_aux(
-    complex<float>* auxiliary_wave,
-    const complex<float>* __restrict__ exit_wave,
+    complex<OUT_TYPE>* auxiliary_wave,
+    const complex<IN_TYPE>* __restrict__ exit_wave,
     int B,
     int C,
-    const complex<float>* __restrict__ probe,
+    const complex<IN_TYPE>* __restrict__ probe,
     int E,
     int F,
-    const complex<float>* __restrict__ obj,
+    const complex<IN_TYPE>* __restrict__ obj,
     int H,
     int I,
     const int* __restrict__ addr,
-    float alpha)
+    IN_TYPE alpha_)
 {
   int bid = blockIdx.x;
   int tx = threadIdx.x;
   int ty = threadIdx.y;
   int addr_stride = 15;
+  const MATH_TYPE alpha = alpha_;  // type conversion
 
   const int* oa = addr + 3 + bid * addr_stride;
   const int* pa = addr + bid * addr_stride;
@@ -35,9 +54,46 @@ extern "C" __global__ void build_aux(
   // (it will work for less as well)
   for (int c = tx; c < C; c += blockDim.x)
   {
-    auxiliary_wave[b * C + c] =
-        obj[b * I + c] * probe[b * F + c] * (1.0f + alpha) -
-        exit_wave[b * C + c] * alpha;
+    auxiliary_wave[b * C + c] = calculate(
+        obj[b * I + c], probe[b * F + c], exit_wave[b * C + c], alpha);
   }
 }
+
+extern "C" __global__ void build_aux2(
+    complex<OUT_TYPE>* auxiliary_wave,
+    const complex<IN_TYPE>* __restrict__ exit_wave,
+    int B,
+    int C,
+    const complex<IN_TYPE>* __restrict__ probe,
+    int E,
+    int F,
+    const complex<IN_TYPE>* __restrict__ obj,
+    int H,
+    int I,
+    const int* __restrict__ addr,
+    IN_TYPE alpha_)
+{
+  int bid = blockIdx.z;
+  int tx = threadIdx.x;
+  int b = threadIdx.y + blockIdx.y * blockDim.y;
+  if (b >= B)
+    return;
+  int addr_stride = 15;
+  const MATH_TYPE alpha = alpha_;  // type conversion
+
+  const int* oa = addr + 3 + bid * addr_stride;
+  const int* pa = addr + bid * addr_stride;
+  const int* ea = addr + 6 + bid * addr_stride;
+
+  probe += pa[0] * E * F + pa[1] * F + pa[2];
+  obj += oa[0] * H * I + oa[1] * I + oa[2];
+  exit_wave += ea[0] * B * C;
+  auxiliary_wave += ea[0] * B * C;
+
+  for (int c = tx; c < C; c += blockDim.x)
+  {
+    auxiliary_wave[b * C + c] = calculate(
+        obj[b * I + c], probe[b * F + c], exit_wave[b * C + c], alpha);
+  }
+}
diff --git a/ptypy/accelerate/cuda_pycuda/cuda/build_aux_no_ex.cu b/ptypy/accelerate/cuda_pycuda/cuda/build_aux_no_ex.cu
index 384efc070..ee091c58e 100644
--- a/ptypy/accelerate/cuda_pycuda/cuda/build_aux_no_ex.cu
+++ b/ptypy/accelerate/cuda_pycuda/cuda/build_aux_no_ex.cu
@@ -1,23 +1,32 @@
+/** build_aux without exit wave kernel.
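+ * Per mode: aux = fac * obj * probe (added to aux instead if doAdd is set)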
+ *
+ * Data types:
+ * - IN_TYPE: the data type for the inputs (float or double)
+ * - OUT_TYPE: the data type for the outputs (float or double - for aux wave)
+ * - MATH_TYPE: the data type used for computation
+ */
+
 #include <thrust/complex.h>
 using thrust::complex;
 
-extern "C" __global__ void build_aux_no_ex(CTYPE* auxilliary_wave,
+extern "C" __global__ void build_aux_no_ex(complex<OUT_TYPE>* auxilliary_wave,
                                            int aRows,
                                            int aCols,
-                                           const CTYPE* __restrict__ probe,
+                                           const complex<IN_TYPE>* __restrict__ probe,
                                            int pRows,
                                            int pCols,
-                                           const CTYPE* __restrict__ obj,
+                                           const complex<IN_TYPE>* __restrict__ obj,
                                            int oRows,
                                            int oCols,
                                            const int* __restrict__ addr,
-                                           FTYPE fac,
+                                           IN_TYPE fac_,
                                            int doAdd)
 {
   int bid = blockIdx.x;
   int tx = threadIdx.x;
   int ty = threadIdx.y;
   const int addr_stride = 15;
+  const MATH_TYPE fac = fac_;  // type conversion
 
   const int* oa = addr + 3 + bid * addr_stride;
   const int* pa = addr + bid * addr_stride;
@@ -32,7 +41,9 @@ extern "C" __global__ void build_aux_no_ex(
 #pragma unroll(4)
   for (int c = tx; c < aCols; c += blockDim.x)
   {
-    auto tmp = obj[b * oCols + c] * probe[b * pCols + c] * fac;
+    complex<MATH_TYPE> t_obj = obj[b * oCols + c];
+    complex<MATH_TYPE> t_probe = probe[b * pCols + c];
+    auto tmp = t_obj * t_probe * fac;
     if (doAdd)
     {
       auxilliary_wave[b * aCols + c] += tmp;
@@ -43,4 +54,50 @@
     }
   }
 }
+
+extern "C" __global__ void build_aux2_no_ex(complex<OUT_TYPE>* auxilliary_wave,
+                                            int aRows,
+                                            int aCols,
+                                            const complex<IN_TYPE>* __restrict__ probe,
+                                            int pRows,
+                                            int pCols,
+                                            const complex<IN_TYPE>* __restrict__ obj,
+                                            int oRows,
+                                            int oCols,
+                                            const int* __restrict__ addr,
+                                            IN_TYPE fac_,
+                                            int doAdd)
+{
+  int bid = blockIdx.z;
+  int tx = threadIdx.x;
+  int b = threadIdx.y + blockIdx.y * blockDim.y;
+  if (b >= aRows)
+    return;
+  const int addr_stride = 15;
+  const MATH_TYPE fac = fac_;  // type conversion
+
+  const int* oa = addr + 3 + bid * addr_stride;
+  const int* pa = addr + bid * addr_stride;
+  const int* ea = addr + 6 + bid * addr_stride;
+
+  obj += oa[0] * oRows * oCols + oa[1] * oCols + oa[2];
+  probe += pa[0] * pRows * pCols + pa[1] * pCols + pa[2];
+  auxilliary_wave += ea[0] * aRows * aCols;
+
+  for (int c = tx; c < aCols; c += blockDim.x)
+  {
+    complex<MATH_TYPE> t_obj = obj[b * oCols + c];
+    complex<MATH_TYPE> t_probe = probe[b * pCols + c];
+    auto tmp = t_obj * t_probe * fac;
+    if (doAdd)
+    {
+      auxilliary_wave[b * aCols + c] += tmp;
+    }
+    else
+    {
+      auxilliary_wave[b * aCols + c] = tmp;
+    }
+  }
+}
\ No newline at end of file
diff --git a/ptypy/accelerate/cuda_pycuda/cuda/build_aux_position_correction.cu b/ptypy/accelerate/cuda_pycuda/cuda/build_aux_position_correction.cu
index 004e7f0ed..327040371 100644
--- a/ptypy/accelerate/cuda_pycuda/cuda/build_aux_position_correction.cu
+++ b/ptypy/accelerate/cuda_pycuda/cuda/build_aux_position_correction.cu
@@ -1,12 +1,20 @@
+/** build_aux for position correction.
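+ * Per mode: aux = obj * probe, evaluated at the candidate (mangled) addresses.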
+ * + * Data types: + * - IN_TYPE: the data type for the inputs (float or double) + * - OUT_TYPE: the data type for the outputs (float or double - for aux wave) + * - MATH_TYPE: the data type used for computation + */ + #include using thrust::complex; extern "C" __global__ void build_aux_position_correction( - complex* auxiliary_wave, - const complex* __restrict__ probe, + complex* auxiliary_wave, + const complex* __restrict__ probe, int B, int C, - const complex* __restrict__ obj, + const complex* __restrict__ obj, int H, int I, const int* __restrict__ addr) @@ -30,7 +38,9 @@ extern "C" __global__ void build_aux_position_correction( // (it will work for less as well) for (int c = tx; c < C; c += blockDim.x) { - auxiliary_wave[b * C + c] = obj[b * I + c] * probe[b * C + c]; + complex t_obj = obj[b * I + c]; + complex t_probe = probe[b * C + c]; + auxiliary_wave[b * C + c] = t_obj * t_probe; } } } diff --git a/ptypy/accelerate/cuda_pycuda/cuda/build_exit.cu b/ptypy/accelerate/cuda_pycuda/cuda/build_exit.cu index 87031184e..2b98634dc 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/build_exit.cu +++ b/ptypy/accelerate/cuda_pycuda/cuda/build_exit.cu @@ -1,3 +1,12 @@ +/** build_exit kernel. + * + * Data types: + * - IN_TYPE: the data type for the inputs (float or double) + * - OUT_TYPE: the data type for the outputs (float or double - for aux wave) + * - MATH_TYPE: the data type used for computation + */ + + #include using thrust::complex; @@ -9,22 +18,24 @@ __device__ inline void atomicAdd(complex* x, complex y) atomicAdd(xf + 1, y.imag()); } -extern "C" __global__ void build_exit(complex* auxiliary_wave, - complex* exit_wave, +extern "C" __global__ void build_exit(complex* auxiliary_wave, + complex* exit_wave, int B, int C, - const complex* __restrict__ probe, + const complex* __restrict__ probe, int E, int F, - const complex* __restrict__ obj, + const complex* __restrict__ obj, int H, int I, - const int* __restrict__ addr) + const int* __restrict__ addr, + IN_TYPE alpha_) { int bid = blockIdx.x; int tx = threadIdx.x; int ty = threadIdx.y; const int addr_stride = 15; + const MATH_TYPE alpha = alpha_; // type conversion const int* oa = addr + 3 + bid * addr_stride; const int* pa = addr + bid * addr_stride; @@ -41,8 +52,12 @@ extern "C" __global__ void build_exit(complex* auxiliary_wave, // (it will work for less as well) for (int c = tx; c < C; c += blockDim.x) { - auto auxv = auxiliary_wave[b * C + c]; - auxv -= probe[b * F + c] * obj[b * I + c]; + complex auxv = auxiliary_wave[b * C + c]; + complex t_probe = probe[b * F + c]; + complex t_obj = obj[b * I + c]; + complex t_exit = exit_wave[b * C + c]; + auxv -= alpha * t_probe * t_obj; + auxv += (alpha - 1) * t_exit; exit_wave[b * C + c] += auxv; auxiliary_wave[b * C + c] = auxv; } diff --git a/ptypy/accelerate/cuda_pycuda/cuda/build_exit_alpha_tau.cu b/ptypy/accelerate/cuda_pycuda/cuda/build_exit_alpha_tau.cu new file mode 100644 index 000000000..8528f2e9c --- /dev/null +++ b/ptypy/accelerate/cuda_pycuda/cuda/build_exit_alpha_tau.cu @@ -0,0 +1,60 @@ +/** build_exit_alpha_tau kernel. 
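+ * Per mode:
+ *   dex = tau*aux + (tau*alpha - 1)*ex + (1 - tau*(1 + alpha))*obj*probe
+ * followed by ex += dex and aux = dex.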
+ * + * Data types: + * - IN_TYPE: the data type for the inputs (float or double) + * - OUT_TYPE: the data type for the outputs (float or double - for aux wave) + * - MATH_TYPE: the data type used for computation + */ + + +#include +using thrust::complex; + + +extern "C" __global__ void build_exit_alpha_tau( + complex* auxiliary_wave, + complex* exit_wave, + int B, + int C, + const complex* __restrict__ probe, + int E, + int F, + const complex* __restrict__ obj, + int H, + int I, + const int* __restrict__ addr, + IN_TYPE alpha_, + IN_TYPE tau_) +{ + int bid = blockIdx.z; + int tx = threadIdx.x; + const int b = threadIdx.y + blockIdx.y * blockDim.y; + if (b >= B) + return; + const int addr_stride = 15; + MATH_TYPE alpha = alpha_; + MATH_TYPE tau = tau_; + + const int* oa = addr + 3 + bid * addr_stride; + const int* pa = addr + bid * addr_stride; + const int* ea = addr + 6 + bid * addr_stride; + + probe += pa[0] * E * F + pa[1] * F + pa[2]; + obj += oa[0] * H * I + oa[1] * I + oa[2]; + exit_wave += ea[0] * B * C; + auxiliary_wave += ea[0] * B * C; + + for (int c = tx; c < C; c += blockDim.x) + { + complex t_aux = auxiliary_wave[b * C + c]; + complex t_probe = probe[b * F + c]; + complex t_obj = obj[b * I + c]; + complex t_ex = exit_wave[b * C + c]; + + auto dex = tau * t_aux + (tau * alpha - MATH_TYPE(1)) * t_ex + + (MATH_TYPE(1) - tau * (MATH_TYPE(1) + alpha)) * t_obj * t_probe; + + exit_wave[b * C + c] += dex; + auxiliary_wave[b * C + c] = dex; + } +} diff --git a/ptypy/accelerate/cuda_pycuda/cuda/clip_magnitudes.cu b/ptypy/accelerate/cuda_pycuda/cuda/clip_magnitudes.cu new file mode 100644 index 000000000..8128091f9 --- /dev/null +++ b/ptypy/accelerate/cuda_pycuda/cuda/clip_magnitudes.cu @@ -0,0 +1,30 @@ +/** clip_magnitudes. + * + */ + #include + #include + #include + using thrust::complex; + + extern "C" __global__ void clip_magnitudes(IN_TYPE *arr, + float clip_min, + float clip_max, + int N) +{ + int id = threadIdx.x + blockIdx.x * blockDim.x; + + if (id >= N) + return; + + auto v = arr[id]; + auto mag = abs(v); + auto theta = arg(v); + + if (mag > clip_max) + mag = clip_max; + if (mag < clip_min) + mag = clip_min; + + v = thrust::polar(mag, theta); + arr[id] = v; +} \ No newline at end of file diff --git a/ptypy/accelerate/cuda_pycuda/cuda/convolution.cu b/ptypy/accelerate/cuda_pycuda/cuda/convolution.cu index 1b008c815..ae42ecba5 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/convolution.cu +++ b/ptypy/accelerate/cuda_pycuda/cuda/convolution.cu @@ -1,3 +1,11 @@ +/** + * Data types: + * - DTYPE (float/double/complex/complex) + * - MATH_TYPE (float/double) - used for the convolution kernel itself + * + * A symmetric convolution kernel is assumed here + */ + #include using thrust::complex; @@ -42,7 +50,7 @@ extern "C" __global__ void convolution_row(const DTYPE *__restrict__ input, DTYPE *output, int height, int width, - const float* kernel, + const MATH_TYPE* kernel, int kernel_radius) { int tx = threadIdx.x; @@ -97,7 +105,7 @@ extern "C" __global__ void convolution_row(const DTYPE *__restrict__ input, if (gby + ty >= width || gbx + tx >= height) return; - // compute + // compute - will be complex if kernel is double auto sum = shm[tx * shwidth + (ty + kernel_radius)] * kernel[0]; for (int i = 1; i <= kernel_radius; ++i) { @@ -117,7 +125,7 @@ extern "C" __global__ void convolution_col(const DTYPE *__restrict__ input, DTYPE *output, int height, int width, - const float* kernel, + const MATH_TYPE* kernel, int kernel_radius) { int tx = threadIdx.x; @@ -169,7 +177,7 @@ extern "C" 
__global__ void convolution_col(const DTYPE *__restrict__ input, if (gby + ty >= width || gbx + tx >= height) return; - // compute + // compute - will be complex if kernel is double auto sum = shm[(tx + kernel_radius) * BDIM_Y + ty] * kernel[0]; for (int i = 1; i <= kernel_radius; ++i) { diff --git a/ptypy/accelerate/cuda_pycuda/cuda/delx_mid.cu b/ptypy/accelerate/cuda_pycuda/cuda/delx.cu similarity index 58% rename from ptypy/accelerate/cuda_pycuda/cuda/delx_mid.cu rename to ptypy/accelerate/cuda_pycuda/cuda/delx.cu index ffc6600ca..f2e8a934e 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/delx_mid.cu +++ b/ptypy/accelerate/cuda_pycuda/cuda/delx.cu @@ -1,6 +1,14 @@ +/** difference along axes (last and mid axis kernels) + * + * Data types: + * - IN_TYPE: the data type for the inputs + * - OUT_TYPE: the data type for the outputs + */ + #include using thrust::complex; + /** Finite difference for forward/backward for any axis that is not the * last one, assuring that the reads and writes are coalesced. * @@ -40,8 +48,8 @@ using thrust::complex; * zero if it's the end of the input. * */ -extern "C" __global__ void delx_mid(const DTYPE *__restrict__ input, - DTYPE *output, +extern "C" __global__ void delx_mid(const IN_TYPE *__restrict__ input, + OUT_TYPE *output, int lower_dim, // x for 3D int higher_dim, // z for 3D int axis_dim) @@ -49,8 +57,8 @@ extern "C" __global__ void delx_mid(const DTYPE *__restrict__ input, // reinterpret to avoid compiler warning that // constructor of complex() cannot be called if it's // shared memory - polluting the outputs - __shared__ char shr[BDIM_X * BDIM_Y * sizeof(DTYPE)]; - auto shared_data = reinterpret_cast(shr); + __shared__ char shr[BDIM_X * BDIM_Y * sizeof(IN_TYPE)]; + auto shared_data = reinterpret_cast(shr); unsigned int tx = threadIdx.x; unsigned int ty = threadIdx.y; @@ -82,7 +90,7 @@ extern "C" __global__ void delx_mid(const DTYPE *__restrict__ input, { if (IS_FORWARD) { - DTYPE plus1; + IN_TYPE plus1; if (ty < BDIM_Y - 1 && iy < axis_dim - 1) // we have a next element in shared data { @@ -100,7 +108,7 @@ extern "C" __global__ void delx_mid(const DTYPE *__restrict__ input, } else { - DTYPE minus1; + IN_TYPE minus1; if (ty > 0) // we have a previous element in shared { minus1 = shared_data[(ty - 1) * BDIM_X + tx]; @@ -118,3 +126,85 @@ extern "C" __global__ void delx_mid(const DTYPE *__restrict__ input, } } } + + + +/** This is the special case for when we diff along the last axis. + * + * Here, flat_dim is all other dims multiplied together, and axis_dim + * is the dimension along which we diff. + * To ensure that we stay coalesced (compared to delx_mid), + * we use the x index to iterate within each thread block (the loop). + * Otherwise it follows the same ideas as delx_mid - please read the + * description there. 
+ */
+extern "C" __global__ void delx_last(const IN_TYPE *__restrict__ input,
+                                     OUT_TYPE *output,
+                                     int flat_dim,
+                                     int axis_dim)
+{
+  // reinterpret to avoid constructor of complex() + compiler warning
+  __shared__ char shr[BDIM_X * BDIM_Y * sizeof(IN_TYPE)];
+  auto shared_data = reinterpret_cast<IN_TYPE*>(shr);
+
+  unsigned int tx = threadIdx.x;
+  unsigned int ty = threadIdx.y;
+
+  unsigned int ix = tx;
+  unsigned int iy = ty + blockIdx.x * BDIM_Y;  // we always use x in grid
+
+  int stride_y = axis_dim;
+
+  auto maxblocks = (axis_dim + BDIM_X - 1) / BDIM_X;
+  for (int bidx = 0; bidx < maxblocks; ++bidx)
+  {
+    ix = tx + bidx * BDIM_X;
+
+    if (iy < flat_dim && ix < axis_dim)
+    {
+      shared_data[ty * BDIM_X + tx] = input[iy * stride_y + ix];
+    }
+
+    __syncthreads();
+
+    if (iy < flat_dim && ix < axis_dim)
+    {
+      if (IS_FORWARD)
+      {
+        IN_TYPE plus1;
+        if (tx < BDIM_X - 1 &&
+            ix < axis_dim - 1)  // we have a next element in shared data
+        {
+          plus1 = shared_data[ty * BDIM_X + tx + 1];
+        }
+        else if (ix == axis_dim - 1)  // end of axis - same as current to get 0
+        {
+          plus1 = shared_data[ty * BDIM_X + tx];
+        }
+        else  // end of block, but more input is there
+        {
+          plus1 = input[iy * stride_y + ix + 1];
+        }
+
+        output[iy * stride_y + ix] = plus1 - shared_data[ty * BDIM_X + tx];
+      }
+      else
+      {
+        IN_TYPE minus1;
+        if (tx > 0)  // we have a previous element in shared
+        {
+          minus1 = shared_data[ty * BDIM_X + tx - 1];
+        }
+        else if (ix == 0)  // use same as next to get zero
+        {
+          minus1 = shared_data[ty * BDIM_X + tx];
+        }
+        else  // read previous input (ty == 0 but iy > 0)
+        {
+          minus1 = input[iy * stride_y + ix - 1];
+        }
+        output[iy * stride_y + ix] = shared_data[ty * BDIM_X + tx] - minus1;
+      }
+    }
+  }
+}
\ No newline at end of file
diff --git a/ptypy/accelerate/cuda_pycuda/cuda/delx_last.cu b/ptypy/accelerate/cuda_pycuda/cuda/delx_last.cu
deleted file mode 100644
index c4449f19a..000000000
--- a/ptypy/accelerate/cuda_pycuda/cuda/delx_last.cu
+++ /dev/null
@@ -1,82 +0,0 @@
-#include <thrust/complex.h>
-using thrust::complex;
-
-/** This is the special case for when we diff along the last axis.
- *
- * Here, flat_dim is all other dims multiplied together, and axis_dim
- * is the dimension along which we diff.
- * To ensure that we stay coalesced (compared to delx_mid),
- * we use the x index to iterate within each thread block (the loop).
- * Otherwise it follows the same ideas as delx_mid - please read the
- * description there.
- */ -extern "C" __global__ void delx_last(const DTYPE *__restrict__ input, - DTYPE *output, - int flat_dim, - int axis_dim) -{ - // reinterpret to avoid constructor of complex() + compiler warning - __shared__ char shr[BDIM_X * BDIM_Y * sizeof(DTYPE)]; - auto shared_data = reinterpret_cast(shr); - - unsigned int tx = threadIdx.x; - unsigned int ty = threadIdx.y; - - unsigned int ix = tx; - unsigned int iy = ty + blockIdx.x * BDIM_Y; // we always use x in grid - - int stride_y = axis_dim; - - auto maxblocks = (axis_dim + BDIM_X - 1) / BDIM_X; - for (int bidx = 0; bidx < maxblocks; ++bidx) - { - ix = tx + bidx * BDIM_X; - - if (iy < flat_dim && ix < axis_dim) - { - shared_data[ty * BDIM_X + tx] = input[iy * stride_y + ix]; - } - - __syncthreads(); - - if (iy < flat_dim && ix < axis_dim) - { - if (IS_FORWARD) - { - DTYPE plus1; - if (tx < BDIM_X - 1 && - ix < axis_dim - 1) // we have a next element in shared data - { - plus1 = shared_data[ty * BDIM_X + tx + 1]; - } - else if (ix == axis_dim - 1) // end of axis - same as current to get 0 - { - plus1 = shared_data[ty * BDIM_X + tx]; - } - else // end of block, but nore input is there - { - plus1 = input[iy * stride_y + ix + 1]; - } - - output[iy * stride_y + ix] = plus1 - shared_data[ty * BDIM_X + tx]; - } - else - { - DTYPE minus1; - if (tx > 0) // we have a previous element in shared - { - minus1 = shared_data[ty * BDIM_X + tx - 1]; - } - else if (ix == 0) // use same as next to get zero - { - minus1 = shared_data[ty * BDIM_X + tx]; - } - else // read previous input (ty == 0 but iy > 0) - { - minus1 = input[iy * stride_y + ix - 1]; - } - output[iy * stride_y + ix] = shared_data[ty * BDIM_X + tx] - minus1; - } - } - } -} \ No newline at end of file diff --git a/ptypy/accelerate/cuda_pycuda/cuda/dot.cu b/ptypy/accelerate/cuda_pycuda/cuda/dot.cu index 1f53b0d0c..21087abe3 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/dot.cu +++ b/ptypy/accelerate/cuda_pycuda/cuda/dot.cu @@ -15,15 +15,15 @@ __device__ inline T dotmul(const complex& a, const complex& b) return a.real() * b.real() + a.imag() * b.imag(); } -extern "C" __global__ void dot(const INTYPE* a, - const INTYPE* b, +extern "C" __global__ void dot(const IN_TYPE* a, + const IN_TYPE* b, int size, - ACCTYPE* out) + ACC_TYPE* out) { int tx = threadIdx.x; int ix = tx + blockIdx.x * blockDim.x; - __shared__ ACCTYPE sh[1024]; + __shared__ ACC_TYPE sh[1024]; if (ix < size) { @@ -31,7 +31,7 @@ extern "C" __global__ void dot(const INTYPE* a, } else { - sh[tx] = ACCTYPE(0); + sh[tx] = ACC_TYPE(0); } __syncthreads(); diff --git a/ptypy/accelerate/cuda_pycuda/cuda/error_reduce.cu b/ptypy/accelerate/cuda_pycuda/cuda/error_reduce.cu index 177732e9b..91b5357b4 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/error_reduce.cu +++ b/ptypy/accelerate/cuda_pycuda/cuda/error_reduce.cu @@ -1,17 +1,24 @@ - -extern "C" __global__ void error_reduce(const float* ferr, - float* err_fmag, +/** error_reduce kernel. 
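+ * Reduces the per-pixel error map over the last two axes, yielding one error
+ * value per diffraction frame (one thread block per frame).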
+ * + * Data types: + * - IN_TYPE: the data type for the inputs (float or double) + * - OUT_TYPE: the data type for the outputs (float or double) + * - ACC_TYPE: the data type used for computation + */ + +extern "C" __global__ void error_reduce(const IN_TYPE* ferr, + OUT_TYPE* err_fmag, int M, int N) { int tx = threadIdx.x; int ty = threadIdx.y; int batch = blockIdx.x; - extern __shared__ float sum_v[1024]; + __shared__ ACC_TYPE sum_v[BDIM_X*BDIM_Y]; int shidx = ty * blockDim.x + tx; // shidx: index in shared memory for this block - float sum = 0.0f; + ACC_TYPE sum = ACC_TYPE(0.0); for (int m = ty; m < M; m += blockDim.y) { @@ -20,7 +27,7 @@ extern "C" __global__ void error_reduce(const float* ferr, { int idx = batch * M * N + m * N + n; // idx is index qwith respect to the full stack - sum += ferr[idx]; + sum += ACC_TYPE(ferr[idx]); } } @@ -28,7 +35,7 @@ extern "C" __global__ void error_reduce(const float* ferr, __syncthreads(); - int nt = blockDim.x * blockDim.y; + int nt = BDIM_X * BDIM_Y; int c = nt; while (c > 1) @@ -44,6 +51,6 @@ extern "C" __global__ void error_reduce(const float* ferr, if (shidx == 0) { - err_fmag[batch] = float(sum_v[0]); + err_fmag[batch] = OUT_TYPE(sum_v[0]); } } diff --git a/ptypy/accelerate/cuda_pycuda/cuda/exit_error.cu b/ptypy/accelerate/cuda_pycuda/cuda/exit_error.cu index d4f774319..fdac52e46 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/exit_error.cu +++ b/ptypy/accelerate/cuda_pycuda/cuda/exit_error.cu @@ -11,16 +11,16 @@ using thrust::complex; // (guided by profiler) extern "C" __global__ void __launch_bounds__(1024, 2) exit_error(int nmodes, - complex *aux, - float *ferr, - const int *addr, + const complex * __restrict aux, + OUT_TYPE *ferr, + const int * __restrict addr, int A, int B) { int tx = threadIdx.x; int ty = threadIdx.y; int addr_stride = 15; - float denom = A * B; + MATH_TYPE denom = A * B; const int *ea = addr + 6 + (blockIdx.x * nmodes) * addr_stride; const int *da = addr + 9 + (blockIdx.x * nmodes) * addr_stride; @@ -32,15 +32,16 @@ extern "C" __global__ void __launch_bounds__(1024, 2) { for (int b = tx; b < B; b += blockDim.x) { - float acc = 0.0; + MATH_TYPE acc = 0.0; for (int idx = 0; idx < nmodes; ++idx) { - float abs_exit_wave = abs(aux[a * B + b + idx * A * B]); + complex t_aux = aux[a * B + b + idx * A * B]; + MATH_TYPE abs_exit_wave = abs(t_aux); acc += abs_exit_wave * abs_exit_wave; // if we do this manually (real*real +imag*imag) // we get differences to numpy due to rounding } - ferr[a * B + b] = acc / denom; + ferr[a * B + b] = OUT_TYPE(acc / denom); } } } diff --git a/ptypy/accelerate/cuda_pycuda/cuda/fill3D.cu b/ptypy/accelerate/cuda_pycuda/cuda/fill3D.cu new file mode 100644 index 000000000..c3f03d8ca --- /dev/null +++ b/ptypy/accelerate/cuda_pycuda/cuda/fill3D.cu @@ -0,0 +1,60 @@ +/** fill3D kernel. 
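+ * Copies a block of size (len_z, len_y, len_x) from B (at offset Bo) into A
+ * (at offset Ao); blockIdx.z covers any leading batch dimensions.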
+ * + * Data types: + * - IN_TYPE: the data type for the inputs + * - OUT_TYPE: data type for outputs + */ + +#include +#include +using thrust::complex; + +extern "C" __global__ void fill3D( + OUT_TYPE* A, + const IN_TYPE* B, + // final dimensions of A/B in [z, y, x] + int A_Z, + int A_Y, + int A_X, + int B_Z, + int B_Y, + int B_X, + // offsets to start reading/writing + int Ao_z, + int Ao_y, + int Ao_x, + int Bo_z, + int Bo_y, + int Bo_x, + // lengths to copy + int len_z, + int len_y, + int len_x + ) +{ + // We use the following strategy: + // - BlockIdx.z for the batch (first dims combined if 4D+) + // - blockDim.z = 1 + // - multiple blocks are used across y and x dimensions + // - we loop over z dimension within the thread block + int batch = blockIdx.z; + int ix = threadIdx.x + blockIdx.x * blockDim.x; + int iy = threadIdx.y + blockIdx.y * blockDim.y; + + if (ix >= len_x || iy >= len_y) + return; + + // offset for current batch (4D+ dimension) + A += batch * A_X * A_Y * A_Z; + B += batch * B_X * B_Y * B_Z; + + // offset for start position in each dimension of the last 3 + A += Ao_z * A_Y * A_X + Ao_y * A_X + Ao_x; + B += Bo_z * B_Y * B_X + Bo_y * B_X + Bo_x; + + // copy data + for (int iz = 0; iz < len_z; ++iz) { + A[iz * A_Y * A_X + iy * A_X + ix] = + B[iz * B_Y * B_X + iy * B_X + ix]; + } +} \ No newline at end of file diff --git a/ptypy/accelerate/cuda_pycuda/cuda/fill_b.cu b/ptypy/accelerate/cuda_pycuda/cuda/fill_b.cu index cfdffb911..46d0d09f1 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/fill_b.cu +++ b/ptypy/accelerate/cuda_pycuda/cuda/fill_b.cu @@ -1,29 +1,84 @@ -extern "C" __global__ void fill_b(const FTYPE* A0, - const FTYPE* A1, - const FTYPE* A2, - const FTYPE* w, - FTYPE Brenorm, +/** fill_b kernels. + * Data types: + * - IN_TYPE: the data type for the inputs + * - OUT_TYPE: the data type for the outputs + * - MATH_TYPE: the data type used for computation + * - ACC_TYPE: the accumulator type for summing + */ + +extern "C" __global__ void fill_b(const IN_TYPE* A0, + const IN_TYPE* A1, + const IN_TYPE* A2, + const IN_TYPE* w, + IN_TYPE Brenorm, int size, - double* out) + ACC_TYPE* out) { int tx = threadIdx.x; int ix = tx + blockIdx.x * blockDim.x; - __shared__ double smem[3][BDIM_X]; + __shared__ ACC_TYPE smem[3][BDIM_X]; if (ix < size) { - // FTYPE(2) to make sure it's float in single precision and doesn't + // MATHTYPE(2) to make sure it's float in single precision and doesn't // accidentally promote the equation to double - smem[0][tx] = w[ix] * A0[ix] * A0[ix]; - smem[1][tx] = w[ix] * FTYPE(2) * A0[ix] * A1[ix]; - smem[2][tx] = w[ix] * (A1[ix] * A1[ix] + FTYPE(2) * A0[ix] * A2[ix]); + MATH_TYPE t_a0 = A0[ix]; + MATH_TYPE t_a1 = A1[ix]; + MATH_TYPE t_a2 = A2[ix]; + MATH_TYPE t_w = w[ix]; + smem[0][tx] = t_w * t_a0 * t_a0; + smem[1][tx] = t_w * MATH_TYPE(2) * t_a0 * t_a1; + smem[2][tx] = t_w * (t_a1 * t_a1 + MATH_TYPE(2) * t_a0 * t_a2); } else { - smem[0][tx] = FTYPE(0); - smem[1][tx] = FTYPE(0); - smem[2][tx] = FTYPE(0); + smem[0][tx] = ACC_TYPE(0); + smem[1][tx] = ACC_TYPE(0); + smem[2][tx] = ACC_TYPE(0); + } + __syncthreads(); + + int nt = blockDim.x; + int c = nt; + while (c > 1) + { + int half = c / 2; + if (tx < half) + { + smem[0][tx] += smem[0][c - tx - 1]; + smem[1][tx] += smem[1][c - tx - 1]; + smem[2][tx] += smem[2][c - tx - 1]; + } + __syncthreads(); + c = c - half; + } + + if (tx == 0) + { + out[blockIdx.x * 3 + 0] = MATH_TYPE(smem[0][0]) * MATH_TYPE(Brenorm); + out[blockIdx.x * 3 + 1] = MATH_TYPE(smem[1][0]) * MATH_TYPE(Brenorm); + out[blockIdx.x * 3 + 2] = 
MATH_TYPE(smem[2][0]) * MATH_TYPE(Brenorm);
+  }
+}
+
+extern "C" __global__ void fill_b_reduce(const ACC_TYPE* in, OUT_TYPE* B, int blocks)
+{
+  // always a single thread block for 2nd stage
+  assert(gridDim.x == 1);
+  int tx = threadIdx.x;
+
+  __shared__ ACC_TYPE smem[3][BDIM_X];
+
+  double sum0 = 0.0, sum1 = 0.0, sum2 = 0.0;
+  for (int ix = tx; ix < blocks; ix += blockDim.x)
+  {
+    sum0 += in[ix * 3 + 0];
+    sum1 += in[ix * 3 + 1];
+    sum2 += in[ix * 3 + 2];
   }
+  smem[0][tx] = sum0;
+  smem[1][tx] = sum1;
+  smem[2][tx] = sum2;
   __syncthreads();

   int nt = blockDim.x;
@@ -43,8 +98,8 @@ extern "C" __global__ void fill_b(const FTYPE* A0,

   if (tx == 0)
   {
-    out[blockIdx.x * 3 + 0] = smem[0][0] * double(Brenorm);
-    out[blockIdx.x * 3 + 1] = smem[1][0] * double(Brenorm);
-    out[blockIdx.x * 3 + 2] = smem[2][0] * double(Brenorm);
+    B[0] += OUT_TYPE(smem[0][0]);
+    B[1] += OUT_TYPE(smem[1][0]);
+    B[2] += OUT_TYPE(smem[2][0]);
   }
-}
\ No newline at end of file
+}
diff --git a/ptypy/accelerate/cuda_pycuda/cuda/fill_b_reduce.cu b/ptypy/accelerate/cuda_pycuda/cuda/fill_b_reduce.cu
deleted file mode 100644
index c37d494d8..000000000
--- a/ptypy/accelerate/cuda_pycuda/cuda/fill_b_reduce.cu
+++ /dev/null
@@ -1,44 +0,0 @@
-#include <cassert>
-
-extern "C" __global__ void fill_b_reduce(const double* in, FTYPE* B, int blocks)
-{
-  // always a single thread block for 2nd stage
-  assert(gridDim.x == 1);
-  int tx = threadIdx.x;
-
-  __shared__ double smem[3][BDIM_X];
-
-  double sum0 = 0.0, sum1 = 0.0, sum2 = 0.0;
-  for (int ix = tx; ix < blocks; ix += blockDim.x)
-  {
-    sum0 += in[ix * 3 + 0];
-    sum1 += in[ix * 3 + 1];
-    sum2 += in[ix * 3 + 2];
-  }
-  smem[0][tx] = sum0;
-  smem[1][tx] = sum1;
-  smem[2][tx] = sum2;
-  __syncthreads();
-
-  int nt = blockDim.x;
-  int c = nt;
-  while (c > 1)
-  {
-    int half = c / 2;
-    if (tx < half)
-    {
-      smem[0][tx] += smem[0][c - tx - 1];
-      smem[1][tx] += smem[1][c - tx - 1];
-      smem[2][tx] += smem[2][c - tx - 1];
-    }
-    __syncthreads();
-    c = c - half;
-  }
-
-  if (tx == 0)
-  {
-    B[0] += FTYPE(smem[0][0]);
-    B[1] += FTYPE(smem[1][0]);
-    B[2] += FTYPE(smem[2][0]);
-  }
-}
diff --git a/ptypy/accelerate/cuda_pycuda/cuda/filtered_fft/filtered_fft.cu b/ptypy/accelerate/cuda_pycuda/cuda/filtered_fft/filtered_fft.cu
index bb152466a..586d7f356 100644
--- a/ptypy/accelerate/cuda_pycuda/cuda/filtered_fft/filtered_fft.cu
+++ b/ptypy/accelerate/cuda_pycuda/cuda/filtered_fft/filtered_fft.cu
@@ -29,18 +29,6 @@
 #include <cufft.h>
 #include <cufftXt.h>

-#ifndef MY_FFT_ROWS
-# define MY_FFT_ROWS 128
-# pragma GCC warning "MY_FFT_ROWS not set in preprocessor - defaulting to 128"
-#endif
-
-#ifndef MY_FFT_COLS
-# define MY_FFT_COLS 128
-# pragma GCC warning "MY_FFT_COLS not set in preprocessor - defaulting to 128"
-#endif
-
-
 template <int ROWS, int COLS, bool SYMMETRIC, bool FORWARD>
 class FilteredFFTImpl : public FilteredFFT {
 public:
@@ -274,9 +262,37 @@ void FilteredFFTImpl<ROWS, COLS, SYMMETRIC, FORWARD>::setupPlan() {
   }
 }

+template <bool SYMMETRIC, bool FORWARD>
+static FilteredFFT* make(int batches, int rows, int cols, complex<float>* prefilt, complex<float>* postfilt,
+  cudaStream_t stream)
+{
+  // we only support square shapes with equal numbers of rows and columns,
+  // as powers of 2 from 16x16 to 2048x2048
+  if (rows != cols)
+    throw std::runtime_error("Only equal numbers of rows and columns are supported");
+  switch (rows)
+  {
+    case 16: return new FilteredFFTImpl<16, 16, SYMMETRIC, FORWARD>(batches, prefilt, postfilt, stream);
+    case 32: return new FilteredFFTImpl<32, 32, SYMMETRIC, FORWARD>(batches, prefilt, postfilt, stream);
+    case 64: return new FilteredFFTImpl<64, 64, SYMMETRIC, FORWARD>(batches, prefilt, postfilt, stream);
+    case 128: return new FilteredFFTImpl<128, 128, SYMMETRIC,
FORWARD>(batches, prefilt, postfilt, stream); + case 256: return new FilteredFFTImpl<256, 256, SYMMETRIC, FORWARD>(batches, prefilt, postfilt, stream); + case 512: return new FilteredFFTImpl<512, 512, SYMMETRIC, FORWARD>(batches, prefilt, postfilt, stream); + case 1024: return new FilteredFFTImpl<1024, 1024, SYMMETRIC, FORWARD>(batches, prefilt, postfilt, stream); + case 2048: return new FilteredFFTImpl<2048, 2048, SYMMETRIC, FORWARD>(batches, prefilt, postfilt, stream); + default: throw std::runtime_error("Only powers of 2 from 16 to 2048 are supported"); + } +} + //////////// Factory Functions for Python -FilteredFFT* make_filtered(int batches, bool symmetricScaling, +// Note: This will instantiate templates for 8 powers of 2, with 4 combinations of forward/reverse, symmetric/not, +// i.e. 32 different FFTs into the binary. Compile time might be quite long, but we intend to do this once +// during installation + +FilteredFFT* make_filtered( + int batches, + int rows, int cols, + bool symmetricScaling, bool isForward, complex* prefilt, complex* postfilt, cudaStream_t stream) @@ -284,21 +300,17 @@ FilteredFFT* make_filtered(int batches, bool symmetricScaling, if (symmetricScaling) { if (isForward) { - return new FilteredFFTImpl(batches, - prefilt, postfilt, stream); + return make(batches, rows, cols, prefilt, postfilt, stream); } else { - return new FilteredFFTImpl(batches, - prefilt, postfilt, stream); + return make(batches, rows, cols, prefilt, postfilt, stream); } } else { if (isForward) { - return new FilteredFFTImpl(batches, - prefilt, postfilt, stream); + return make(batches, rows, cols, prefilt, postfilt, stream); } else { - return new FilteredFFTImpl(batches, - prefilt, postfilt, stream); + return make(batches, rows, cols, prefilt, postfilt, stream); } } diff --git a/ptypy/accelerate/cuda_pycuda/cuda/filtered_fft/filtered_fft.h b/ptypy/accelerate/cuda_pycuda/cuda/filtered_fft/filtered_fft.h index fd153f768..9afa4e119 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/filtered_fft/filtered_fft.h +++ b/ptypy/accelerate/cuda_pycuda/cuda/filtered_fft/filtered_fft.h @@ -23,6 +23,7 @@ class FilteredFFT { // Note that cudaStream_t (runtime API) and CUStream (driver API) are // the same type FilteredFFT* make_filtered(int batches, + int rows, int columns, bool symmetricScaling, bool isForward, complex* prefilt, complex* postfilt, diff --git a/ptypy/accelerate/cuda_pycuda/cuda/filtered_fft/module.cpp b/ptypy/accelerate/cuda_pycuda/cuda/filtered_fft/module.cpp index 186d40cb2..3eb0eb37e 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/filtered_fft/module.cpp +++ b/ptypy/accelerate/cuda_pycuda/cuda/filtered_fft/module.cpp @@ -13,7 +13,7 @@ class FilteredFFTPython { public: - FilteredFFTPython(int batches, bool symmetric, + FilteredFFTPython(int batches, int rows, int columns, bool symmetric, bool is_forward, std::size_t prefilt_ptr, std::size_t postfilt_ptr, @@ -21,6 +21,7 @@ class FilteredFFTPython { fft_ = make_filtered( batches, + rows, columns, symmetric, is_forward, reinterpret_cast*>(prefilt_ptr), @@ -70,12 +71,14 @@ class FilteredFFTPython namespace py = pybind11; -PYBIND11_MODULE(module, m) { +PYBIND11_MODULE(filtered_cufft, m) { m.doc() = "Filtered FFT for PtyPy"; py::class_(m, "FilteredFFT", py::module_local()) - .def(py::init(), + .def(py::init(), py::arg("batches"), + py::arg("rows"), + py::arg("columns"), py::arg("symmetricScaling"), py::arg("is_forward"), py::arg("prefilt"), diff --git a/ptypy/accelerate/cuda_pycuda/cuda/fmag_all_update.cu 
b/ptypy/accelerate/cuda_pycuda/cuda/fmag_all_update.cu index 7d7a512a7..f8f695ca5 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/fmag_all_update.cu +++ b/ptypy/accelerate/cuda_pycuda/cuda/fmag_all_update.cu @@ -1,15 +1,23 @@ +/** fmag_all_update. + * + * Data types: + * - IN_TYPE: the data type for the inputs (float or double) + * - OUT_TYPE: the data type for the outputs (float or double) + * - MATH_TYPE: the data type used for computation + */ + #include #include using std::sqrt; using thrust::complex; -extern "C" __global__ void fmag_all_update(complex* f, - const float* fmask, - const float* fmag, - const float* fdev, - const float* err_fmag, +extern "C" __global__ void fmag_all_update(complex* f, + const IN_TYPE* fmask, + const IN_TYPE* fmag, + const IN_TYPE* fdev, + const IN_TYPE* err_fmag, const int* addr_info, - float pbound, + IN_TYPE pbound_, int A, int B) { @@ -17,23 +25,24 @@ extern "C" __global__ void fmag_all_update(complex* f, int tx = threadIdx.x; int ty = threadIdx.y; int addr_stride = 15; + MATH_TYPE pbound = pbound_; const int* ea = addr_info + batch * addr_stride + 6; const int* da = addr_info + batch * addr_stride + 9; const int* ma = addr_info + batch * addr_stride + 12; fmask += ma[0] * A * B; - float err = err_fmag[da[0]]; + MATH_TYPE err = err_fmag[da[0]]; fdev += da[0] * A * B; fmag += da[0] * A * B; f += ea[0] * A * B; - float renorm = sqrt(pbound / err); + MATH_TYPE renorm = sqrt(pbound / err); for (int a = ty; a < A; a += blockDim.y) { for (int b = tx; b < B; b += blockDim.x) { - float m = fmask[a * A + b]; + MATH_TYPE m = fmask[a * A + b]; if (renorm < 1.0f) { /* @@ -42,10 +51,10 @@ extern "C" __global__ void fmag_all_update(complex* f, ((fmag[a * A + b] + fdev[a * A + b] * renorm) / (fdev[a * A + b] + fmag[a * A + b] + 1e-7f)) ; */ - auto fmagv = fmag[a * A + b]; - auto fdevv = fdev[a * A + b]; - float fm = (1.0f - m) + - m * ((fmagv + fdevv * renorm) / (fmagv + fdevv + 1e-7f)); + MATH_TYPE fmagv = fmag[a * A + b]; + MATH_TYPE fdevv = fdev[a * A + b]; + MATH_TYPE fm = (MATH_TYPE(1) - m) + + m * ((fmagv + fdevv * renorm) / (fmagv + fdevv + MATH_TYPE(1e-7))); f[a * A + b] *= fm; } } diff --git a/ptypy/accelerate/cuda_pycuda/cuda/fmag_update_nopbound.cu b/ptypy/accelerate/cuda_pycuda/cuda/fmag_update_nopbound.cu new file mode 100644 index 000000000..40a65c172 --- /dev/null +++ b/ptypy/accelerate/cuda_pycuda/cuda/fmag_update_nopbound.cu @@ -0,0 +1,53 @@ +/** fmag_all_update_nopbound. + * + * Data types: + * - IN_TYPE: the data type for the inputs (float or double) + * - OUT_TYPE: the data type for the outputs (float or double) + * - MATH_TYPE: the data type used for computation + */ + +#include +#include +using std::sqrt; +using thrust::complex; + +extern "C" __global__ void fmag_update_nopbound(complex* f, + const IN_TYPE* fmask, + const IN_TYPE* fmag, + const IN_TYPE* fdev, + const int* addr_info, + int A, + int B) +{ + const int bid = blockIdx.z; + const int tx = threadIdx.x; + const int a = threadIdx.y + blockIdx.y * blockDim.y; + if (a >= A) + return; + int addr_stride = 15; + + const int* ea = addr_info + bid * addr_stride + 6; + const int* da = addr_info + bid * addr_stride + 9; + const int* ma = addr_info + bid * addr_stride + 12; + + fmask += ma[0] * A * B; + fdev += da[0] * A * B; + fmag += da[0] * A * B; + f += ea[0] * A * B; + + for (int b = tx; b < B; b += blockDim.x) + { + MATH_TYPE m = fmask[a * A + b]; + /* + // assuming this is actually a mask, i.e. 0 or 1 --> this is slower + float fm = m < 0.5f ? 
1.0f : + ((fmag[a * A + b] + fdev[a * A + b] * renorm) / (fdev[a * A + b] + + fmag[a * A + b] + 1e-7f)) ; + */ + MATH_TYPE fmagv = fmag[a * A + b]; + MATH_TYPE fdevv = fdev[a * A + b]; + MATH_TYPE fm = + (MATH_TYPE(1) - m) + m * (fmagv / (fmagv + fdevv + MATH_TYPE(1e-7))); + f[a * A + b] *= fm; + } +} diff --git a/ptypy/accelerate/cuda_pycuda/cuda/fourier_deviation.cu b/ptypy/accelerate/cuda_pycuda/cuda/fourier_deviation.cu new file mode 100644 index 000000000..3427222c3 --- /dev/null +++ b/ptypy/accelerate/cuda_pycuda/cuda/fourier_deviation.cu @@ -0,0 +1,58 @@ +/** fourier_deviation. + * + * Data types: + * - IN_TYPE: the data type for the inputs (float or double) + * - OUT_TYPE: the data type for the outputs (float or double) + * - MATH_TYPE: the data type used for computation + */ + +#include +#include +#include +using std::sqrt; +using thrust::abs; +using thrust::complex; + +// specify max number of threads/block and min number of blocks per SM, +// to assist the compiler in register optimisations. +// We achieve a higher occupancy in this case, as less registers are used +// (guided by profiler) +extern "C" __global__ void __launch_bounds__(1024, 2) + fourier_deviation(int nmodes, + const complex *f, + const IN_TYPE *fmag, + OUT_TYPE *fdev, + const int *addr, + int A, + int B) +{ + const int bid = blockIdx.z; + const int tx = threadIdx.x; + const int a = threadIdx.y + blockIdx.y * blockDim.y; + const int addr_stride = 15; + + const int *ea = addr + 6 + (bid * nmodes) * addr_stride; + const int *da = addr + 9 + (bid * nmodes) * addr_stride; + + f += ea[0] * A * B; + fdev += da[0] * A * B; + fmag += da[0] * A * B; + + if (a >= A) + return; + + for (int b = tx; b < B; b += blockDim.x) + { + MATH_TYPE acc = MATH_TYPE(0); + for (int idx = 0; idx < nmodes; ++idx) + { + complex t_f = f[a * B + b + idx * A * B]; + MATH_TYPE abs_exit_wave = abs(t_f); + acc += abs_exit_wave * + abs_exit_wave; // if we do this manually (real*real +imag*imag) + // we get differences to numpy due to rounding + } + auto fdevv = sqrt(acc) - MATH_TYPE(fmag[a * B + b]); + fdev[a * B + b] = fdevv; + } +} diff --git a/ptypy/accelerate/cuda_pycuda/cuda/fourier_error.cu b/ptypy/accelerate/cuda_pycuda/cuda/fourier_error.cu index 7998e094c..ad483c870 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/fourier_error.cu +++ b/ptypy/accelerate/cuda_pycuda/cuda/fourier_error.cu @@ -1,3 +1,12 @@ +/** fourier_error. 
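For reference while reviewing the precision changes: the fourier_deviation kernel above and the fourier_error kernel whose header starts here compute, per frame, the deviation of the summed mode intensities from the measured magnitudes and the masked, normalised error. A NumPy sketch of that math (the shapes and the one-mask-per-frame mask_sum indexing are simplifying assumptions, not taken from the patch):

import numpy as np

def fourier_error_ref(f, fmag, fmask, mask_sum, nmodes):
    # f: (N * nmodes, A, B) exit waves; fmag/fmask: (N, A, B)
    af = np.sqrt((np.abs(f) ** 2)
                 .reshape(-1, nmodes, f.shape[-2], f.shape[-1]).sum(1))
    fdev = af - fmag                                    # fourier_deviation
    ferr = fmask * fdev ** 2 / mask_sum[:, None, None]  # fourier_error
    return fdev, ferr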
+ * + * Data types: + * - IN_TYPE: the data type for the inputs (float or double) + * - OUT_TYPE: the data type for the outputs (float or double) + * - MATH_TYPE: the data type used for computation + */ + + #include #include #include @@ -11,12 +20,12 @@ using thrust::complex; // (guided by profiler) extern "C" __global__ void __launch_bounds__(1024, 2) fourier_error(int nmodes, - complex *f, - const float *fmask, - const float *fmag, - float *fdev, - float *ferr, - const float *mask_sum, + const complex *f, + const IN_TYPE *fmask, + const IN_TYPE *fmag, + OUT_TYPE *fdev, + OUT_TYPE *ferr, + const IN_TYPE *mask_sum, const int *addr, int A, int B) @@ -39,15 +48,16 @@ extern "C" __global__ void __launch_bounds__(1024, 2) { for (int b = tx; b < B; b += blockDim.x) { - float acc = 0.0; + MATH_TYPE acc = MATH_TYPE(0); for (int idx = 0; idx < nmodes; ++idx) { - float abs_exit_wave = abs(f[a * B + b + idx * A * B]); + complex t_f = f[a * B + b + idx * A * B]; + MATH_TYPE abs_exit_wave = abs(t_f); acc += abs_exit_wave * abs_exit_wave; // if we do this manually (real*real +imag*imag) // we get differences to numpy due to rounding } - auto fdevv = sqrt(acc) - fmag[a * B + b]; + auto fdevv = sqrt(acc) - MATH_TYPE(fmag[a * B + b]); ferr[a * B + b] = (fmask[a * B + b] * fdevv * fdevv) / mask_sum[ma[0]]; fdev[a * B + b] = fdevv; } diff --git a/ptypy/accelerate/cuda_pycuda/cuda/full_reduce.cu b/ptypy/accelerate/cuda_pycuda/cuda/full_reduce.cu index 3fe6ac8a5..801204aaa 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/full_reduce.cu +++ b/ptypy/accelerate/cuda_pycuda/cuda/full_reduce.cu @@ -1,16 +1,25 @@ +/** full_reduce kernel. + * + * Data types: + * - IN_TYPE: the data type for the inputs (float or double) + * - OUT_TYPE: the data type for the outputs (float or double - for aux wave) + * - ACC_TYPE: the data type used for internal accumulation + */ + + #include -extern "C" __global__ void full_reduce(const DTYPE* in, DTYPE* out, int size) +extern "C" __global__ void full_reduce(const IN_TYPE* in, OUT_TYPE* out, int size) { assert(gridDim.x == 1); int tx = threadIdx.x; - __shared__ DTYPE smem[BDIM_X]; + __shared__ ACC_TYPE smem[BDIM_X]; - auto sum = DTYPE(); + auto sum = ACC_TYPE(); for (int ix = tx; ix < size; ix += blockDim.x) { - sum = sum + in[ix]; + sum = sum + ACC_TYPE(in[ix]); } smem[tx] = sum; __syncthreads(); @@ -30,6 +39,6 @@ extern "C" __global__ void full_reduce(const DTYPE* in, DTYPE* out, int size) if (tx == 0) { - out[0] = smem[0]; + out[0] = OUT_TYPE(smem[0]); } } \ No newline at end of file diff --git a/ptypy/accelerate/cuda_pycuda/cuda/gd_main.cu b/ptypy/accelerate/cuda_pycuda/cuda/gd_main.cu index 06d73ae88..1ab643c4c 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/gd_main.cu +++ b/ptypy/accelerate/cuda_pycuda/cuda/gd_main.cu @@ -1,11 +1,19 @@ +/** gd_main kernel. 
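full_reduce above is the generic single-block reduction that finishes several of these two-stage sums. A minimal PyCuda sketch of how one of these macro-parameterised kernels gets compiled and launched; the file path, macro values and BDIM_X choice here are illustrative assumptions, not taken from this patch:

import numpy as np
import pycuda.autoinit  # noqa: F401 (creates a context)
from pycuda import gpuarray
from pycuda.compiler import SourceModule

BDIM_X = 1024  # must match the shared-memory array size in the kernel
src = open('ptypy/accelerate/cuda_pycuda/cuda/full_reduce.cu').read()
mod = SourceModule(src,
                   options=['-DIN_TYPE=float', '-DOUT_TYPE=float',
                            '-DACC_TYPE=double', '-DBDIM_X=%d' % BDIM_X],
                   no_extern_c=True)
full_reduce = mod.get_function('full_reduce')

data = gpuarray.to_gpu(np.random.rand(1 << 20).astype(np.float32))
out = gpuarray.zeros(1, np.float32)
# single thread block (the kernel asserts gridDim.x == 1): each thread
# strides over the input, then the block tree-reduces the partial sums
full_reduce(data, out, np.int32(data.size),
            block=(BDIM_X, 1, 1), grid=(1, 1, 1))
print(out.get()[0], float(data.get().sum()))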
+ * + * Data types: + * - IN_TYPE: the data type for the inputs (float or double) + * - OUT_TYPE: the data type for the outputs (float or double - for aux wave) + * - MATH_TYPE: the data type used for computation + */ + #include using thrust::complex; -extern "C" __global__ void gd_main(const FTYPE* Imodel, - const FTYPE* I, - const FTYPE* w, - FTYPE* err, - CTYPE* aux, +extern "C" __global__ void gd_main(const IN_TYPE* Imodel, + const IN_TYPE* I, + const IN_TYPE* w, + OUT_TYPE* err, + complex* aux, int z, int modes, int x) @@ -16,8 +24,8 @@ extern "C" __global__ void gd_main(const FTYPE* Imodel, if (iz >= z || ix >= x) return; - auto DI = Imodel[iz * x + ix] - I[iz * x + ix]; - auto tmp = w[iz * x + ix] * DI; + auto DI = MATH_TYPE(Imodel[iz * x + ix]) - MATH_TYPE(I[iz * x + ix]); + auto tmp = MATH_TYPE(w[iz * x + ix]) * MATH_TYPE(DI); err[iz * x + ix] = tmp * DI; // now set this for all modes (promote) diff --git a/ptypy/accelerate/cuda_pycuda/cuda/get_address.cu b/ptypy/accelerate/cuda_pycuda/cuda/get_address.cu new file mode 100644 index 000000000..dda9b45f1 --- /dev/null +++ b/ptypy/accelerate/cuda_pycuda/cuda/get_address.cu @@ -0,0 +1,35 @@ +#include +#include +using thrust::complex; + +inline __device__ int minimum(int a, int b) { return a < b ? a : b; } + +inline __device__ int maximum(int a, int b) { return a < b ? b : a; } + +extern "C" __global__ void get_address(const int* addr_current, + int* mangled_addr, + int num_pods, + const int* __restrict delta, + int max_oby, + int max_obx) +{ + // we use only one thread block + const int tx = threadIdx.x; + const int idx = tx % 2; // even threads access y dim, odd threads x dim + const int maxval = (idx == 0) ? max_oby : max_obx; + + const int addr_stride = 15; + const int d = delta[idx]; + addr_current += 3 + idx + 1; + mangled_addr += 3 + idx + 1; + + for (int ix = tx; ix < num_pods * 2; ix += blockDim.x) + { + const int bid = ix / 2; + int cur = addr_current[bid * addr_stride] + d; + int bound = maximum(0, minimum(maxval, cur)); + assert(bound >= 0); + assert(bound <= maxval); + mangled_addr[bid * addr_stride] = bound; + } +} \ No newline at end of file diff --git a/ptypy/accelerate/cuda_pycuda/cuda/intens_renorm.cu b/ptypy/accelerate/cuda_pycuda/cuda/intens_renorm.cu index 13f8551b7..d0033f7f4 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/intens_renorm.cu +++ b/ptypy/accelerate/cuda_pycuda/cuda/intens_renorm.cu @@ -1,41 +1,66 @@ +/** intens_renorm - with 2 steps as separate kernels. 
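The new get_address kernel above produces the mangled addresses for position refinement by shifting the object ROI coordinates by a trial delta and clamping them to the valid range. A NumPy reference of that logic (a sketch; the flat 15-integer-per-pod layout with the object row/column at offsets 4 and 5 follows the addr_stride arithmetic in the kernel):

import numpy as np

def get_address_ref(addr_current, delta, max_oby, max_obx):
    # addr_current: (num_pods, 15) int32 view of the address book;
    # columns 4 and 5 are the object ROI row/column the kernel shifts
    mangled = addr_current.copy()
    mangled[:, 4] = np.clip(addr_current[:, 4] + delta[0], 0, max_oby)
    mangled[:, 5] = np.clip(addr_current[:, 5] + delta[1], 0, max_obx)
    return mangled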
+ * + * Data types: + * - IN_TYPE: the data type for the inputs (float or double) + * - OUT_TYPE: the data type for the outputs (float or double) + * - MATH_TYPE: the data type used for computation + */ + #include using thrust::complex; -extern "C" __global__ void step1(const FTYPE* Imodel, - const FTYPE* I, - const FTYPE* w, - FTYPE* num, - FTYPE* den, - int z, - int x) +extern "C" __global__ void step1(const IN_TYPE* Imodel, + const IN_TYPE* I, + const IN_TYPE* w, + OUT_TYPE* num, + OUT_TYPE* den, + int n) { - int iz = blockIdx.z; - int ix = threadIdx.x + blockIdx.x * blockDim.x; + int i = threadIdx.x + blockIdx.x * blockDim.x; - if (iz >= z || ix >= x) + if (i >= n) return; - auto tmp = w[iz * x + ix] * Imodel[iz * x + ix]; - num[iz * x + ix] = tmp * I[iz * x + ix]; - den[iz * x + ix] = tmp * Imodel[iz * x + ix]; + auto tmp = MATH_TYPE(w[i]) * MATH_TYPE(Imodel[i]); + num[i] = tmp * MATH_TYPE(I[i]); + den[i] = tmp * MATH_TYPE(Imodel[i]); } -extern "C" __global__ void step2(const FTYPE* fic_tmp, - FTYPE* fic, - FTYPE* Imodel, - int z, - int x) +extern "C" __global__ void step2(const IN_TYPE* fic_tmp, + OUT_TYPE* fic, + OUT_TYPE* Imodel, + int X, + int Y) { int iz = blockIdx.z; - int ix = threadIdx.x + blockIdx.x * blockDim.x; + int tx = threadIdx.x; + int ty = threadIdx.y; + + // one thread block per fic data point - we want the first thread to read this + // into shared memory and then sync the block, so we don't get into data races + // with writing it back to global memory in the end (and we read the value only + // once) + // + __shared__ MATH_TYPE shfic[1]; + if (tx == 0 && ty == 0) { + shfic[0] = MATH_TYPE(fic[iz]) / MATH_TYPE(fic_tmp[iz]); + } + __syncthreads(); - if (iz >= z || ix >= x) - return; - //probably not so clever having all threads read from the same locations - auto tmp = fic[iz] / fic_tmp[iz]; - Imodel[iz * x + ix] *= tmp; + // now all threads can access that value + auto tmp = shfic[0]; + + // offset Imodel for current z + Imodel += iz * X * Y; + + for (int iy = ty; iy < Y; iy += blockDim.y) { + #pragma unroll(4) + for (int ix = tx; ix < X; ix += blockDim.x) { + Imodel[iy * X + ix] *= tmp; + } + } + // race condition if write is not restricted to one thread - // learned this the hard way - if (ix==0) + if (tx==0 && ty == 0) fic[iz] = tmp; -} \ No newline at end of file +} diff --git a/ptypy/accelerate/cuda_pycuda/cuda/log_likelihood.cu b/ptypy/accelerate/cuda_pycuda/cuda/log_likelihood.cu index e538dd725..90455b1e2 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/log_likelihood.cu +++ b/ptypy/accelerate/cuda_pycuda/cuda/log_likelihood.cu @@ -1,3 +1,11 @@ +/** log_likelihood kernel. 
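The intens_renorm pair above splits the per-frame intensity rescaling into an elementwise step, a reduction, and a broadcast step. In NumPy terms the intended computation is roughly the following (a sketch; it assumes the step1 outputs are summed per frame between the two launches, as the ML engine does):

import numpy as np

def intens_renorm_ref(Imodel, I, w):
    # step1: elementwise terms
    num = w * Imodel * I
    den = w * Imodel * Imodel
    # reduction between the kernels (fic <- sum(num), fic_tmp <- sum(den))
    fic = num.sum(axis=(-2, -1))
    fic_tmp = den.sum(axis=(-2, -1))
    # step2: one scaling factor per frame, applied to the whole frame
    tmp = fic / fic_tmp
    return Imodel * tmp[:, None, None], tmp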
+ *
+ * Data types:
+ * - IN_TYPE: the data type for the inputs (float or double)
+ * - OUT_TYPE: the data type for the outputs (float or double)
+ * - MATH_TYPE: the data type used for computation
+ */
+
 #include <cassert>
 #include <cmath>
 #include <thrust/complex.h>
@@ -11,13 +19,13 @@ using thrust::complex;
 // (guided by profiler)
 extern "C" __global__ void __launch_bounds__(1024, 2)
     log_likelihood(int nmodes,
-                   complex<float> *aux,
-                   const float *fmask,
-                   const float *fmag,
-                   const int *addr,
-                   float *llerr,
-                   int A,
-                   int B)
+                   complex<IN_TYPE> *aux,
+                   const IN_TYPE *fmask,
+                   const IN_TYPE *fmag,
+                   const int *addr,
+                   IN_TYPE *llerr,
+                   int A,
+                   int B)
 {
   int tx = threadIdx.x;
   int ty = threadIdx.y;
@@ -31,22 +39,69 @@
   fmag += da[0] * A * B;
   fmask += ma[0] * A * B;
   llerr += da[0] * A * B;
-  float norm = A * B;
+  MATH_TYPE norm = A * B;

   for (int a = ty; a < A; a += blockDim.y)
   {
     for (int b = tx; b < B; b += blockDim.x)
     {
-      float acc = 0.0;
+      MATH_TYPE acc = 0.0;
       for (int idx = 0; idx < nmodes; ++idx)
       {
-        float abs_exit_wave = abs(aux[a * B + b + idx * A * B]);
+        complex<MATH_TYPE> t_aux = aux[a * B + b + idx * A * B];
+        MATH_TYPE abs_exit_wave = abs(t_aux);
         acc += abs_exit_wave *
                abs_exit_wave;  // if we do this manually (real*real +imag*imag)
                                // we get differences to numpy due to rounding
       }
-      auto I = fmag[a * B + b] * fmag[a * B + b];
-      llerr[a * B + b] = fmask[a * B + b] * (acc - I) * (acc - I) / (I + 1) / norm;
+      auto I = MATH_TYPE(fmag[a * B + b]) * MATH_TYPE(fmag[a * B + b]);
+      llerr[a * B + b] =
+          MATH_TYPE(fmask[a * B + b]) * (acc - I) * (acc - I) / (I + 1) / norm;
     }
   }
 }
+
+
+extern "C" __global__ void
+    log_likelihood2(int nmodes,
+                    complex<IN_TYPE> *aux,
+                    const IN_TYPE *fmask,
+                    const IN_TYPE *fmag,
+                    const int *addr,
+                    IN_TYPE *llerr,
+                    int A,
+                    int B)
+{
+  int bid = blockIdx.z;
+  int tx = threadIdx.x;
+  int a = threadIdx.y + blockIdx.y * blockDim.y;
+  if (a >= A)
+    return;
+  int addr_stride = 15;
+
+  const int *ea = addr + 6 + (bid * nmodes) * addr_stride;
+  const int *da = addr + 9 + (bid * nmodes) * addr_stride;
+  const int *ma = addr + 12 + (bid * nmodes) * addr_stride;
+
+  aux += ea[0] * A * B;
+  fmag += da[0] * A * B;
+  fmask += ma[0] * A * B;
+  llerr += da[0] * A * B;
+  MATH_TYPE norm = A * B;
+
+  for (int b = tx; b < B; b += blockDim.x)
+  {
+    MATH_TYPE acc = 0.0;
+    for (int idx = 0; idx < nmodes; ++idx)
+    {
+      complex<MATH_TYPE> t_aux = aux[a * B + b + idx * A * B];
+      MATH_TYPE abs_exit_wave = abs(t_aux);
+      acc += abs_exit_wave *
+             abs_exit_wave;  // if we do this manually (real*real +imag*imag)
+                             // we get differences to numpy due to rounding
+    }
+    auto I = MATH_TYPE(fmag[a * B + b]) * MATH_TYPE(fmag[a * B + b]);
+    llerr[a * B + b] =
+        MATH_TYPE(fmask[a * B + b]) * (acc - I) * (acc - I) / (I + 1) / norm;
+  }
+}
\ No newline at end of file
diff --git a/ptypy/accelerate/cuda_pycuda/cuda/make_a012.cu b/ptypy/accelerate/cuda_pycuda/cuda/make_a012.cu
index e86d900f5..11ba29f62 100644
--- a/ptypy/accelerate/cuda_pycuda/cuda/make_a012.cu
+++ b/ptypy/accelerate/cuda_pycuda/cuda/make_a012.cu
@@ -1,14 +1,23 @@
+/** make_a012.
+ *
+ * Data types:
+ * - IN_TYPE: the data type for the inputs (float or double)
+ * - OUT_TYPE: the data type for the outputs (float or double)
+ * - MATH_TYPE: the data type used for computation
+ * - ACC_TYPE: data type used for accumulation
+ */
+
 #include <thrust/complex.h>
 using thrust::complex;

-extern "C" __global__ void make_a012(const CTYPE* f,
-                                     const CTYPE* a,
-                                     const CTYPE* b,
-                                     const FTYPE* I,
-                                     const FTYPE* fic,
-                                     FTYPE* A0,
-                                     FTYPE* A1,
-                                     FTYPE* A2,
+extern "C" __global__ void make_a012(const complex<IN_TYPE>* f,
+                                     const complex<IN_TYPE>* a,
+                                     const complex<IN_TYPE>* b,
+                                     const IN_TYPE* I,
+                                     const IN_TYPE* fic,
+                                     OUT_TYPE* A0,
+                                     OUT_TYPE* A1,
+                                     OUT_TYPE* A2,
                                      int z,
                                      int y,
                                      int x,
@@ -22,37 +31,37 @@

   if (iz >= maxz)
   {
-    A0[iz * x + ix] = FTYPE(0); // make sure it's the right type (double/float)
-    A1[iz * x + ix] = FTYPE(0);
-    A2[iz * x + ix] = FTYPE(0);
+    A0[iz * x + ix] = OUT_TYPE(0); // make sure it's the right type (double/float)
+    A1[iz * x + ix] = OUT_TYPE(0);
+    A2[iz * x + ix] = OUT_TYPE(0);
     return;
   }

   // we sum across y directly, as this is the number of modes,
   // which is typically small
-  auto sumtf0 = FTYPE(0);
-  auto sumtf1 = FTYPE(0);
-  auto sumtf2 = FTYPE(0);
+  auto sumtf0 = ACC_TYPE(0);
+  auto sumtf1 = ACC_TYPE(0);
+  auto sumtf2 = ACC_TYPE(0);
   for (auto iy = 0; iy < y; ++iy)
   {
-    auto fv = f[iz * y * x + iy * x + ix];
+    complex<MATH_TYPE> fv = f[iz * y * x + iy * x + ix];
     sumtf0 += fv.real() * fv.real() + fv.imag() * fv.imag();

-    auto av = a[iz * y * x + iy * x + ix];
+    complex<MATH_TYPE> av = a[iz * y * x + iy * x + ix];
     // 2 * real(f * conj(a))
-    sumtf1 += FTYPE(2) * (fv.real() * av.real() + fv.imag() * av.imag());
+    sumtf1 += MATH_TYPE(2) * (fv.real() * av.real() + fv.imag() * av.imag());
     // use MATH_TYPE(2) to make sure no double creeps into a float calculation,
     // as 2.0 * would make everything double.

-    auto bv = b[iz * y * x + iy * x + ix];
+    complex<MATH_TYPE> bv = b[iz * y * x + iy * x + ix];
     // 2 * real(f * conj(b)) + abs(a)^2
-    sumtf2 += FTYPE(2) * (fv.real() * bv.real() + fv.imag() * bv.imag()) +
+    sumtf2 += MATH_TYPE(2) * (fv.real() * bv.real() + fv.imag() * bv.imag()) +
               (av.real() * av.real() + av.imag() * av.imag());
   }

-  auto Iv = I[iz * x + ix];
-  auto ficv = fic[iz];
-  A0[iz * x + ix] = sumtf0 * ficv - Iv;
-  A1[iz * x + ix] = sumtf1 * ficv;
-  A2[iz * x + ix] = sumtf2 * ficv;
+  MATH_TYPE Iv = I[iz * x + ix];
+  MATH_TYPE ficv = fic[iz];
+  A0[iz * x + ix] = OUT_TYPE(MATH_TYPE(sumtf0) * ficv - Iv);
+  A1[iz * x + ix] = OUT_TYPE(MATH_TYPE(sumtf1) * ficv);
+  A2[iz * x + ix] = OUT_TYPE(MATH_TYPE(sumtf2) * ficv);
 }
\ No newline at end of file
diff --git a/ptypy/accelerate/cuda_pycuda/cuda/make_model.cu b/ptypy/accelerate/cuda_pycuda/cuda/make_model.cu
index 0f8380d71..22bf7d4ab 100644
--- a/ptypy/accelerate/cuda_pycuda/cuda/make_model.cu
+++ b/ptypy/accelerate/cuda_pycuda/cuda/make_model.cu
@@ -1,8 +1,16 @@
+/** make_model.
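The make_a012 kernel above builds the three coefficient arrays that fill_b then reduces for the ML line search. A NumPy reference of the same arithmetic (a sketch; shapes follow the kernel's (z, y=modes, x) indexing):

import numpy as np

def make_a012_ref(f, a, b, I, fic):
    # f, a, b: (z, modes, x) complex; I: (z, x); fic: (z,)
    A0 = (np.abs(f) ** 2).sum(1) * fic[:, None] - I
    A1 = (2 * (f.real * a.real + f.imag * a.imag)).sum(1) * fic[:, None]
    A2 = (2 * (f.real * b.real + f.imag * b.imag)
          + a.real ** 2 + a.imag ** 2).sum(1) * fic[:, None]
    return A0, A1, A2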
+ * + * Data types: + * - IN_TYPE: the data type for the inputs (float or double) + * - OUT_TYPE: the data type for the outputs (float or double) + * - MATH_TYPE: the data type used for computation + */ + #include using thrust::complex; extern "C" __global__ void make_model( - const CTYPE* in, FTYPE* out, int z, int y, int x) + const complex* in, OUT_TYPE* out, int z, int y, int x) { int ix = threadIdx.x + blockIdx.x * blockDim.x; int iz = blockIdx.z; @@ -12,11 +20,11 @@ extern "C" __global__ void make_model( // we sum accross y directly, as this is the number of modes, // which is typically small - auto sum = FTYPE(); + auto sum = MATH_TYPE(); for (auto iy = 0; iy < y; ++iy) { - auto v = in[iz * y * x + iy * x + ix]; + complex v = in[iz * y * x + iy * x + ix]; sum += v.real() * v.real() + v.imag() * v.imag(); } - out[iz * x + ix] = sum; + out[iz * x + ix] = OUT_TYPE(sum); } \ No newline at end of file diff --git a/ptypy/accelerate/cuda_pycuda/cuda/max_abs2.cu b/ptypy/accelerate/cuda_pycuda/cuda/max_abs2.cu new file mode 100644 index 000000000..4da8efb3e --- /dev/null +++ b/ptypy/accelerate/cuda_pycuda/cuda/max_abs2.cu @@ -0,0 +1,115 @@ +/** max_abs2 kernel, calculating the sum of abs(x)**2 value in the first dimension + * and then the maximum across the last 2 dimensions + * + * Data types: + * - IN_TYPE: can be float/double or complex/complex + */ + +#include +#include +using thrust::complex; +using thrust::norm; + +inline __device__ OUT_TYPE norm(const float& in) { + return in*in; +} + +inline __device__ OUT_TYPE norm(const double& in) { + return in*in; +} + +extern "C" __global__ void max_abs2_step1(const IN_TYPE* a, + int n, + int rows, + int cols, + OUT_TYPE* out) +{ + int tx = threadIdx.x; + const int iy = blockIdx.y; + + __shared__ OUT_TYPE sh[BDIM_X]; + + OUT_TYPE maxv = OUT_TYPE(0); + + for (int ix = tx; ix < cols; ix += BDIM_X) { + OUT_TYPE v = OUT_TYPE(0); + for (int in = 0; in < n; ++in) { + v += norm(a[in * rows * cols + iy * cols + ix]); + } + if (v > maxv) + maxv = v; + } + + + sh[tx] = maxv; + + __syncthreads(); + + // reduce: + const int nt = BDIM_X; + int c = nt; + + while (c > 1) + { + int half = c / 2; + if (tx < half) + { + auto v = sh[c - tx - 1]; + if (maxv < v) { + sh[tx] = v; + maxv = v; + } + } + __syncthreads(); + c = c - half; + } + + if (tx == 0) + { + out[iy] = sh[0]; + } +} + +extern "C" __global__ void max_abs2_step2(const OUT_TYPE* in, + int n, + OUT_TYPE* out) +{ + int tx = threadIdx.x; + + in += blockIdx.x * n; + + __shared__ OUT_TYPE sh[BDIM_X]; + + OUT_TYPE maxv = OUT_TYPE(0); + for (int i = tx; i < n; ++i) { + auto v = in[i]; + if (v > maxv) + maxv = v; + } + sh[tx] = maxv; + __syncthreads(); + + // reduce: + const int nt = BDIM_X; + int c = nt; + + while (c > 1) + { + int half = c / 2; + if (tx < half) + { + auto v = sh[c - tx - 1]; + if (maxv < v) { + sh[tx] = v; + maxv = v; + } + } + __syncthreads(); + c = c - half; + } + + if (tx == 0) + { + out[0] = sh[0]; + } +} diff --git a/ptypy/accelerate/cuda_pycuda/cuda/ob_update.cu b/ptypy/accelerate/cuda_pycuda/cuda/ob_update.cu index c2cf2fd22..29b993fb0 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/ob_update.cu +++ b/ptypy/accelerate/cuda_pycuda/cuda/ob_update.cu @@ -1,8 +1,16 @@ +/** ob_update. 
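max_abs2 above is another two-stage reduction: step1 sums the intensity over the first (mode) dimension and keeps a per-row maximum, step2 collapses the per-row maxima into a single value. A PyCuda launch sketch; BDIM_X, the compile options and the file path are illustrative assumptions:

import numpy as np
import pycuda.autoinit  # noqa: F401
from pycuda import gpuarray
from pycuda.compiler import SourceModule

BDIM_X = 256
src = open('ptypy/accelerate/cuda_pycuda/cuda/max_abs2.cu').read()
mod = SourceModule(src, options=['-DIN_TYPE=complex<float>',
                                 '-DOUT_TYPE=float',
                                 '-DBDIM_X=%d' % BDIM_X],
                   no_extern_c=True)
step1 = mod.get_function('max_abs2_step1')
step2 = mod.get_function('max_abs2_step2')

n, rows, cols = 4, 128, 128
a = gpuarray.to_gpu(np.random.rand(n, rows, cols).astype(np.complex64))
rowmax = gpuarray.zeros(rows, np.float32)
out = gpuarray.zeros(1, np.float32)
step1(a, np.int32(n), np.int32(rows), np.int32(cols), rowmax,
      block=(BDIM_X, 1, 1), grid=(1, rows, 1))   # one block per row
step2(rowmax, np.int32(rows), out,
      block=(BDIM_X, 1, 1), grid=(1, 1, 1))      # single-block final max
# reference: max over rows/cols of the mode-summed intensity
ref = (np.abs(a.get()) ** 2).sum(0).max()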
+ * + * Data types: + * - IN_TYPE: the data type for the inputs (float or double) + * - OUT_TYPE: the data type for the outputs (float or double) + * - MATH_TYPE: the data type used for computation + */ + #include using thrust::complex; template -__device__ inline void atomicAdd(complex* x, complex y) +__device__ inline void atomicAdd(complex* x, const complex& y) { auto xf = reinterpret_cast(x); atomicAdd(xf, y.real()); @@ -10,20 +18,20 @@ __device__ inline void atomicAdd(complex* x, complex y) } extern "C" __global__ void ob_update( - const complex* __restrict__ exit_wave, + const complex* __restrict__ exit_wave, int A, int B, int C, - const complex* __restrict__ probe, + const complex* __restrict__ probe, int D, int E, int F, - complex* obj, + complex* obj, int G, int H, int I, const int* __restrict__ addr, - DENOM_TYPE* denominator) + OUT_TYPE* denominator) { const int bid = blockIdx.x; const int tx = threadIdx.x; @@ -46,12 +54,15 @@ extern "C" __global__ void ob_update( { for (int c = tx; c < C; c += blockDim.x) { - auto probe_val = probe[b * F + c]; - atomicAdd(&obj[b * I + c], conj(probe_val) * exit_wave[b * C + c]); - auto denomreal = reinterpret_cast(&denominator[b * I + c]); + complex probe_val = probe[b * F + c]; + complex exit_val = exit_wave[b * C + c]; + auto add_val_m = conj(probe_val) * exit_val; + complex add_val = add_val_m; + atomicAdd(&obj[b * I + c], add_val); + auto upd_probe = probe_val.real() * probe_val.real() + probe_val.imag() * probe_val.imag(); - atomicAdd(denomreal, upd_probe); + atomicAdd(&denominator[b * I + c], upd_probe); } } } diff --git a/ptypy/accelerate/cuda_pycuda/cuda/ob_update2.cu b/ptypy/accelerate/cuda_pycuda/cuda/ob_update2.cu index 1f9c5b573..821c04a6d 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/ob_update2.cu +++ b/ptypy/accelerate/cuda_pycuda/cuda/ob_update2.cu @@ -1,3 +1,20 @@ +/** ob_update. + * + * Data types: + * - IN_TYPE: the data type for the inputs (float or double) + * - OUT_TYPE: the data type for the outputs (float or double) + * - MATH_TYPE: the data type used for computation + * - ACC_TYPE: accumulator type for the local ob accumulation + * + * NOTE: This version of ob_update goes over all tiles that need to be accumulated + * in a single thread block to avoid global atomic additions (as in ob_update.cu). + * This requires a local array of NUM_MODES size to store the local updates. + * GPU registers per thread are limited (255 32bit registers on V100), + * and at some point the registers will spill into shared or global memory + * and the kernel will get considerably slower. 
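The atomic-add based ob_update above and the tiled ob_update2 variant implement the same update of object and denominator. A NumPy reference of the semantics (a sketch; addr is viewed as (num_pods, 5, 3) triples in the pa/oa/ea/da/ma order these kernels assume):

import numpy as np

def ob_update_ref(addr, exit_wave, probe, obj, denom):
    B, C = exit_wave.shape[-2:]
    for pa, oa, ea, da, ma in addr:
        pr = probe[pa[0], pa[1]:pa[1] + B, pa[2]:pa[2] + C]
        ex = exit_wave[ea[0]]
        obj[oa[0], oa[1]:oa[1] + B, oa[2]:oa[2] + C] += pr.conj() * ex
        denom[oa[0], oa[1]:oa[1] + B, oa[2]:oa[2] + C] += np.abs(pr) ** 2
    return obj, denom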
+ */ + + #include #include using thrust::complex; @@ -8,59 +25,40 @@ using thrust::complex; #define obj_roi_row(k) addr[4 * num_pods + (k)] #define obj_roi_column(k) addr[5 * num_pods + (k)] -template -__device__ inline void set_real(complex& v, T r) -{ - v.real(r); -} -template -__device__ inline void set_real(T& v, T r) -{ - v = r; -} -template -__device__ inline T get_real(const complex& v) -{ - return v.real(); -} -template -__device__ inline T get_real(const T& v) -{ - return v; -} extern "C" __global__ void ob_update2( int pr_sh, int ob_modes, int num_pods, - int ob_sh, + int ob_sh_rows, + int ob_sh_cols, int pr_modes, int ex_0, int ex_1, int ex_2, - complex* ob_g, - DENOM_TYPE* obn_g, - const complex* __restrict__ pr_g, // 2, 5, 5 - const complex* __restrict__ ex_g, // 16, 5, 5 + complex* ob_g, + OUT_TYPE* obn_g, + const complex* __restrict__ pr_g, // 2, 5, 5 + const complex* __restrict__ ex_g, // 16, 5, 5 const int* addr) { int y = blockIdx.y * BDIM_Y + threadIdx.y; - int dy = ob_sh; + int dy = ob_sh_rows; int z = blockIdx.x * BDIM_X + threadIdx.x; - int dz = ob_sh; - complex ob[NUM_MODES]; - DENOM_TYPE obn[NUM_MODES]; + int dz = ob_sh_cols; + complex ob[NUM_MODES]; + ACC_TYPE obn[NUM_MODES]; int txy = threadIdx.y * BDIM_X + threadIdx.x; assert(ob_modes <= NUM_MODES); - if (y < ob_sh && z < ob_sh) + if (y < dy && z < dz) { #pragma unroll for (int i = 0; i < NUM_MODES; ++i) { auto idx = i * dy * dz + y * dz + z; - assert(idx < ob_modes * ob_sh * ob_sh); + assert(idx < ob_modes * ob_sh_rows * ob_sh_cols); ob[i] = ob_g[idx]; obn[i] = obn_g[idx]; } @@ -92,7 +90,7 @@ extern "C" __global__ void ob_update2( __syncthreads(); - if (y >= ob_sh || z >= ob_sh) + if (y >= dy || z >= dz) continue; #pragma unroll 4 @@ -105,21 +103,21 @@ extern "C" __global__ void ob_update2( { auto pridx = ad[0] * pr_sh * pr_sh + v1 * pr_sh + v2; assert(pridx < pr_modes * pr_sh * pr_sh); - auto pr = pr_g[pridx]; + complex pr = pr_g[pridx]; int idx = ad[2]; assert(idx < NUM_MODES); auto cpr = conj(pr); auto exidx = ad[1] * pr_sh * pr_sh + v1 * pr_sh + v2; assert(exidx < ex_0 * ex_1 * ex_2); - ob[idx] += cpr * ex_g[exidx]; - auto rr = get_real(obn[idx]); - rr += pr.real() * pr.real() + pr.imag() * pr.imag(); - set_real(obn[idx], rr); + complex t_ex_g = ex_g[exidx]; + complex add_val = cpr * t_ex_g; + ob[idx] += add_val; + obn[idx] += pr.real() * pr.real() + pr.imag() * pr.imag(); } } } - if (y < ob_sh && z < ob_sh) + if (y < dy && z < dz) { for (int i = 0; i < NUM_MODES; ++i) { diff --git a/ptypy/accelerate/cuda_pycuda/cuda/ob_update2_ML.cu b/ptypy/accelerate/cuda_pycuda/cuda/ob_update2_ML.cu index 56d088788..b62e66006 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/ob_update2_ML.cu +++ b/ptypy/accelerate/cuda_pycuda/cuda/ob_update2_ML.cu @@ -1,3 +1,20 @@ +/** ob_update. + * + * Data types: + * - IN_TYPE: the data type for the inputs (float or double) + * - OUT_TYPE: the data type for the outputs (float or double) + * - MATH_TYPE: the data type used for computation + * - ACC_TYPE: accumulator for the ob field + * + * NOTE: This version of ob_update goes over all tiles that need to be accumulated + * in a single thread block to avoid global atomic additions (as in ob_update_ML.cu). + * This requires a local array of NUM_MODES size to store the local updates. + * GPU registers per thread are limited (255 32bit registers on V100), + * and at some point the registers will spill into shared or global memory + * and the kernel will get considerably slower. 
+ */ + + #include #include using thrust::complex; @@ -11,33 +28,36 @@ using thrust::complex; extern "C" __global__ void ob_update2_ML(int pr_sh, int ob_modes, int num_pods, - int ob_sh, + int ob_sh_rows, + int ob_sh_cols, int pr_modes, int ex_0, int ex_1, int ex_2, - CTYPE* ob_g, - const CTYPE* __restrict__ pr_g, - const CTYPE* __restrict__ ex_g, + complex* ob_g, + const complex* __restrict__ pr_g, + const complex* __restrict__ ex_g, const int* addr, - FTYPE fac) + IN_TYPE fac_) { int y = blockIdx.y * BDIM_Y + threadIdx.y; - int dy = ob_sh; + int dy = ob_sh_rows; int z = blockIdx.x * BDIM_X + threadIdx.x; - int dz = ob_sh; - CTYPE ob[NUM_MODES]; + int dz = ob_sh_cols; + MATH_TYPE fac = fac_; + complex ob[NUM_MODES]; + int txy = threadIdx.y * BDIM_X + threadIdx.x; assert(ob_modes <= NUM_MODES); - if (y < ob_sh && z < ob_sh) + if (y < dy && z < dz) { #pragma unroll for (int i = 0; i < NUM_MODES; ++i) { auto idx = i * dy * dz + y * dz + z; - assert(idx < ob_modes * ob_sh * ob_sh); + assert(idx < ob_modes * ob_sh_rows * ob_sh_cols); ob[i] = ob_g[idx]; } } @@ -68,7 +88,7 @@ extern "C" __global__ void ob_update2_ML(int pr_sh, __syncthreads(); - if (y >= ob_sh || z >= ob_sh) + if (y >= dy || z >= dz) continue; #pragma unroll 4 @@ -81,18 +101,20 @@ extern "C" __global__ void ob_update2_ML(int pr_sh, { auto pridx = ad[0] * pr_sh * pr_sh + v1 * pr_sh + v2; assert(pridx < pr_modes * pr_sh * pr_sh); - auto pr = pr_g[pridx]; + complex pr = pr_g[pridx]; int idx = ad[2]; assert(idx < NUM_MODES); auto cpr = conj(pr); auto exidx = ad[1] * pr_sh * pr_sh + v1 * pr_sh + v2; assert(exidx < ex_0 * ex_1 * ex_2); - ob[idx] += cpr * ex_g[exidx] * fac; + complex t_ex_g = ex_g[exidx]; + complex add_val = cpr * t_ex_g * fac; + ob[idx] += add_val; } } } - if (y < ob_sh && z < ob_sh) + if (y < dy && z < dz) { for (int i = 0; i < NUM_MODES; ++i) { diff --git a/ptypy/accelerate/cuda_pycuda/cuda/ob_update_ML.cu b/ptypy/accelerate/cuda_pycuda/cuda/ob_update_ML.cu index c6aa9ca11..84e678ebb 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/ob_update_ML.cu +++ b/ptypy/accelerate/cuda_pycuda/cuda/ob_update_ML.cu @@ -1,8 +1,16 @@ +/** ob_update_ML. 
+ * + * Data types: + * - IN_TYPE: the data type for the inputs (float or double) + * - OUT_TYPE: the data type for the outputs (float or double) + * - MATH_TYPE: the data type used for computation + */ + #include using thrust::complex; template -__device__ inline void atomicAdd(complex* x, complex y) +__device__ inline void atomicAdd(complex* x, const complex& y) { auto xf = reinterpret_cast(x); atomicAdd(xf, y.real()); @@ -11,25 +19,26 @@ __device__ inline void atomicAdd(complex* x, complex y) extern "C" { - __global__ void ob_update_ML(const CTYPE* __restrict__ exit_wave, + __global__ void ob_update_ML(const complex* __restrict__ exit_wave, int A, int B, int C, - const CTYPE* __restrict__ probe, + const complex* __restrict__ probe, int D, int E, int F, - CTYPE* obj, + complex* obj, int G, int H, int I, const int* __restrict__ addr, - FTYPE fac) + IN_TYPE fac_) { const int bid = blockIdx.x; const int tx = threadIdx.x; const int ty = threadIdx.y; const int addr_stride = 15; + MATH_TYPE fac = fac_; const int* oa = addr + 3 + bid * addr_stride; const int* pa = addr + bid * addr_stride; @@ -46,9 +55,12 @@ extern "C" { for (int c = tx; c < C; c += blockDim.x) { - auto probe_val = probe[b * F + c]; - atomicAdd(&obj[b * I + c], - conj(probe_val) * exit_wave[b * C + c] * fac); + complex probe_val = probe[b * F + c]; + complex exit_val = exit_wave[b * C + c]; + complex add_val_m = conj(probe_val) * exit_val * fac; + complex add_val(add_val_m); + + atomicAdd(&obj[b * I + c], add_val); } } } diff --git a/ptypy/accelerate/cuda_pycuda/cuda/ob_update_local.cu b/ptypy/accelerate/cuda_pycuda/cuda/ob_update_local.cu new file mode 100644 index 000000000..c49119be2 --- /dev/null +++ b/ptypy/accelerate/cuda_pycuda/cuda/ob_update_local.cu @@ -0,0 +1,67 @@ +/** ob_update_local - in DR algorithm. 
+ * + * Data types: + * - IN_TYPE: the data type for the inputs (float or double) + * - OUT_TYPE: the data type for the outputs (float or double) + * - MATH_TYPE: the data type used for computation + */ + +#include +using thrust::complex; + +template +__device__ inline void atomicAdd(complex* x, const complex& y) +{ + auto xf = reinterpret_cast(x); + atomicAdd(xf, y.real()); + atomicAdd(xf + 1, y.imag()); +} + +extern "C" __global__ void ob_update_local( + const complex* __restrict__ exit_wave, + const complex* __restrict__ aux, + int A, + int B, + int C, + const complex* __restrict__ probe, + int D, + int E, + int F, + const IN_TYPE* __restrict__ pr_norm, + complex* obj, + int G, + int H, + int I, + const int* __restrict__ addr) +{ + const int bid = blockIdx.z; + const int tx = threadIdx.x; + const int b = threadIdx.y + blockIdx.y * blockDim.y; + if (b >= B) + return; + const int addr_stride = 15; + + const int* oa = addr + 3 + bid * addr_stride; + const int* pa = addr + bid * addr_stride; + const int* ea = addr + 6 + bid * addr_stride; + + probe += pa[0] * E * F + pa[1] * F + pa[2]; + obj += oa[0] * H * I + oa[1] * I + oa[2]; + aux += bid * B * C; + MATH_TYPE norm_val = pr_norm[0]; + + assert(oa[0] * H * I + oa[1] * I + oa[2] + (B - 1) * I + C - 1 < G * H * I); + + exit_wave += ea[0] * B * C; + + for (int c = tx; c < C; c += blockDim.x) + { + complex probe_val = probe[b * F + c]; + complex exit_val = exit_wave[b * C + c]; + complex aux_val = aux[b * C + c]; + + auto add_val_m = conj(probe_val) * (exit_val - aux_val) / norm_val; + complex add_val = add_val_m; + atomicAdd(&obj[b * I + c], add_val); + } +} diff --git a/ptypy/accelerate/cuda_pycuda/cuda/pr_update.cu b/ptypy/accelerate/cuda_pycuda/cuda/pr_update.cu index 13a6c72b1..180cf8f14 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/pr_update.cu +++ b/ptypy/accelerate/cuda_pycuda/cuda/pr_update.cu @@ -1,29 +1,37 @@ +/** pr_update. 
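ob_update_local above (and pr_update_local later in this patch) are the DR-engine updates: the object accumulates conj(probe) times the difference between exit wave and aux, normalised by the single precomputed probe-intensity value passed in as pr_norm (e.g. the max_abs2 output). A NumPy sketch, assuming aux holds one (B, C) tile per pod:

import numpy as np

def ob_update_local_ref(addr, exit_wave, aux, probe, obj, pr_norm):
    B, C = exit_wave.shape[-2:]
    for i, (pa, oa, ea, da, ma) in enumerate(addr):
        pr = probe[pa[0], pa[1]:pa[1] + B, pa[2]:pa[2] + C]
        diff = exit_wave[ea[0]] - aux[i]
        obj[oa[0], oa[1]:oa[1] + B, oa[2]:oa[2] + C] += \
            pr.conj() * diff / pr_norm
    return obj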
+ *
+ * Data types:
+ * - IN_TYPE: the data type for the inputs (float or double)
+ * - OUT_TYPE: the data type for the outputs (float or double)
+ * - MATH_TYPE: the data type used for computation
+ */
+
 #include <thrust/complex.h>
 using thrust::complex;

-template <class T>
-__device__ inline void atomicAdd(complex<T>* x, complex<T> y)
+template <class T, class U>
+__device__ inline void atomicAdd(complex<T>* x, const complex<U>& y)
 {
   auto xf = reinterpret_cast<T*>(x);
-  atomicAdd(xf, y.real());
-  atomicAdd(xf + 1, y.imag());
+  atomicAdd(xf, T(y.real()));
+  atomicAdd(xf + 1, T(y.imag()));
 }

 extern "C" __global__ void pr_update(
-    const complex<float>* __restrict__ exit_wave,
+    const complex<IN_TYPE>* __restrict__ exit_wave,
     int A,
     int B,
     int C,
-    complex<float>* probe,
+    complex<OUT_TYPE>* probe,
     int D,
     int E,
     int F,
-    const complex<float>* __restrict__ obj,
+    const complex<IN_TYPE>* __restrict__ obj,
     int G,
     int H,
     int I,
     const int* __restrict__ addr,
-    DENOM_TYPE* denominator)
+    OUT_TYPE* denominator)
 {
   assert(B == E);  // prsh[1]
   assert(C == F);  // prsh[2]
@@ -48,12 +56,14 @@
   {
     for (int c = tx; c < C; c += blockDim.x)
     {
-      auto obj_val = obj[b * I + c];
-      atomicAdd(&probe[b * F + c], conj(obj_val) * exit_wave[b * C + c]);
-      auto denomreal = reinterpret_cast<float*>(&denominator[b * F + c]);
-      auto upd_obj =
+      complex<MATH_TYPE> obj_val = obj[b * I + c];
+      complex<MATH_TYPE> exit_val = exit_wave[b * C + c];
+      complex<MATH_TYPE> add_val_m = conj(obj_val) * exit_val;
+      complex<OUT_TYPE> add_val = add_val_m;
+      atomicAdd(&probe[b * F + c], add_val);
+      MATH_TYPE upd_obj =
           obj_val.real() * obj_val.real() + obj_val.imag() * obj_val.imag();
-      atomicAdd(denomreal, upd_obj);
+      atomicAdd(&denominator[b * F + c], upd_obj);
     }
   }
 }
diff --git a/ptypy/accelerate/cuda_pycuda/cuda/pr_update2.cu b/ptypy/accelerate/cuda_pycuda/cuda/pr_update2.cu
index 1361cb18d..e5417cc01 100644
--- a/ptypy/accelerate/cuda_pycuda/cuda/pr_update2.cu
+++ b/ptypy/accelerate/cuda_pycuda/cuda/pr_update2.cu
@@ -1,3 +1,19 @@
+/** pr_update.
+ *
+ * Data types:
+ * - IN_TYPE: the data type for the inputs (float or double)
+ * - OUT_TYPE: the data type for the outputs (float or double)
+ * - MATH_TYPE: the data type used for computation
+ * - ACC_TYPE: accumulator type for local pr array
+ *
+ * NOTE: This version of pr_update goes over all tiles that need to be accumulated
+ * in a single thread block to avoid global atomic additions (as in pr_update.cu).
+ * This requires a local array of NUM_MODES size to store the local updates.
+ * GPU registers per thread are limited (255 32bit registers on V100),
+ * and at some point the registers will spill into shared or global memory
+ * and the kernel will get considerably slower.
+ */
+
 #include <thrust/complex.h>
 #include <cassert>
 using thrust::complex;
@@ -10,29 +26,6 @@ using thrust::complex;
 #define obj_roi_row(k) addr[4 * num_pods + (k)]
 #define obj_roi_column(k) addr[5 * num_pods + (k)]

-template <class T>
-__device__ inline void set_real(complex<T>& v, T r)
-{
-  v.real(r);
-}
-
-template <class T>
-__device__ inline void set_real(T& v, T r)
-{
-  v = r;
-}
-
-template <class T>
-__device__ inline T get_real(const complex<T>& v)
-{
-  return v.real();
-}
-
-template <class T>
-__device__ inline T get_real(const T& v)
-{
-  return v;
-}

 extern "C" __global__ void pr_update2(int pr_sh,
                                       int ob_sh_row,
                                       int ob_sh_col,
                                       int pr_modes,
                                       int ob_modes,
                                       int num_pods,
-                                      complex<float>* pr_g,
-                                      DENOM_TYPE* prn_g,
-                                      const complex<float>* __restrict__ ob_g,
-                                      const complex<float>* __restrict__ ex_g,
+                                      complex<OUT_TYPE>* pr_g,
+                                      OUT_TYPE* prn_g,
+                                      const complex<IN_TYPE>* __restrict__ ob_g,
+                                      const complex<IN_TYPE>* __restrict__ ex_g,
                                       const int* addr)
 {
   int y = blockIdx.y * BDIM_Y + threadIdx.y;
   int dy = pr_sh;
   int z = blockIdx.x * BDIM_X + threadIdx.x;
   int dz = pr_sh;
-  complex<float> pr[NUM_MODES];
-  DENOM_TYPE prn[NUM_MODES];
+  complex<ACC_TYPE> pr[NUM_MODES];
+  ACC_TYPE prn[NUM_MODES];
   int txy = threadIdx.y * BDIM_X + threadIdx.x;

   assert(pr_modes <= NUM_MODES);
@@ -107,15 +100,15 @@ extern "C" __global__ void pr_update2(int pr_sh,
     {
       auto obidx = ad[2] * ob_sh_row * ob_sh_col + v1 * ob_sh_col + v2;
       assert(obidx < ob_modes * ob_sh_row * ob_sh_col);
-      auto ob = ob_g[obidx];
+      complex<MATH_TYPE> ob = ob_g[obidx];
       int idx = ad[0];
       assert(idx < NUM_MODES);

       auto cob = conj(ob);
-      pr[idx] += cob * ex_g[ad[1] * pr_sh * pr_sh + y * pr_sh + z];
-      auto rr = get_real(prn[idx]);
-      rr += ob.real() * ob.real() + ob.imag() * ob.imag();
-      set_real(prn[idx], rr);
+      complex<MATH_TYPE> ex_val = ex_g[ad[1] * pr_sh * pr_sh + y * pr_sh + z];
+      complex<MATH_TYPE> add_val = cob * ex_val;
+      pr[idx] += add_val;
+      prn[idx] += ob.real() * ob.real() + ob.imag() * ob.imag();
     }
   }
 }
diff --git a/ptypy/accelerate/cuda_pycuda/cuda/pr_update2_ML.cu b/ptypy/accelerate/cuda_pycuda/cuda/pr_update2_ML.cu
index 696682e97..8a45891c5 100644
--- a/ptypy/accelerate/cuda_pycuda/cuda/pr_update2_ML.cu
+++ b/ptypy/accelerate/cuda_pycuda/cuda/pr_update2_ML.cu
@@ -1,3 +1,19 @@
+/** pr_update.
+ *
+ * Data types:
+ * - IN_TYPE: the data type for the inputs (float or double)
+ * - OUT_TYPE: the data type for the outputs (float or double)
+ * - MATH_TYPE: the data type used for computation
+ * - ACC_TYPE: accumulator type for local pr array
+ *
+ * NOTE: This version of pr_update goes over all tiles that need to be accumulated
+ * in a single thread block to avoid global atomic additions (as in pr_update_ML.cu).
+ * This requires a local array of NUM_MODES size to store the local updates.
+ * GPU registers per thread are limited (255 32bit registers on V100),
+ * and at some point the registers will spill into shared or global memory
+ * and the kernel will get considerably slower.
+ */ + #include #include using thrust::complex; @@ -16,17 +32,18 @@ extern "C" __global__ void pr_update2_ML(int pr_sh, int pr_modes, int ob_modes, int num_pods, - CTYPE* pr_g, - const CTYPE* __restrict__ ob_g, - const CTYPE* __restrict__ ex_g, + complex* pr_g, + const complex* __restrict__ ob_g, + const complex* __restrict__ ex_g, const int* addr, - FTYPE fac) + IN_TYPE fac_) { int y = blockIdx.y * BDIM_Y + threadIdx.y; int dy = pr_sh; int z = blockIdx.x * BDIM_X + threadIdx.x; int dz = pr_sh; - CTYPE pr[NUM_MODES]; + MATH_TYPE fac = fac_; + complex pr[NUM_MODES]; int txy = threadIdx.y * BDIM_X + threadIdx.x; assert(pr_modes <= NUM_MODES); @@ -81,12 +98,15 @@ extern "C" __global__ void pr_update2_ML(int pr_sh, { auto obidx = ad[2] * ob_sh_row * ob_sh_col + v1 * ob_sh_col + v2; assert(obidx < ob_modes * ob_sh_row * ob_sh_col); - auto ob = ob_g[obidx]; + complex ob = ob_g[obidx]; int idx = ad[0]; assert(idx < NUM_MODES); auto cob = conj(ob); - pr[idx] += cob * ex_g[ad[1] * pr_sh * pr_sh + y * pr_sh + z] * fac; + complex ex_val = ex_g[ad[1] * pr_sh * pr_sh + y * pr_sh + z]; + complex add_val_m = cob * ex_val * fac; + complex add_val = add_val_m; + pr[idx] += add_val; } } } diff --git a/ptypy/accelerate/cuda_pycuda/cuda/pr_update_ML.cu b/ptypy/accelerate/cuda_pycuda/cuda/pr_update_ML.cu index 156e6d198..3fa24137d 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/pr_update_ML.cu +++ b/ptypy/accelerate/cuda_pycuda/cuda/pr_update_ML.cu @@ -1,28 +1,37 @@ +/** pr_update_ML. + * + * Data types: + * - IN_TYPE: the data type for the inputs (float or double) + * - OUT_TYPE: the data type for the outputs (float or double) + * - MATH_TYPE: the data type used for computation + */ + + #include using thrust::complex; template -__device__ inline void atomicAdd(complex* x, complex y) +__device__ inline void atomicAdd(complex* x, const complex& y) { auto xf = reinterpret_cast(x); atomicAdd(xf, y.real()); atomicAdd(xf + 1, y.imag()); } -extern "C" __global__ void pr_update_ML(const CTYPE* __restrict__ exit_wave, +extern "C" __global__ void pr_update_ML(const complex* __restrict__ exit_wave, int A, int B, int C, - CTYPE* probe, + complex* probe, int D, int E, int F, - const CTYPE* __restrict__ obj, + const complex* __restrict__ obj, int G, int H, int I, const int* __restrict__ addr, - FTYPE fac) + IN_TYPE fac_) { assert(B == E); // prsh[1] assert(C == F); // prsh[2] @@ -30,6 +39,7 @@ extern "C" __global__ void pr_update_ML(const CTYPE* __restrict__ exit_wave, const int tx = threadIdx.x; const int ty = threadIdx.y; const int addr_stride = 15; + MATH_TYPE fac = fac_; const int* oa = addr + 3 + bid * addr_stride; const int* pa = addr + bid * addr_stride; @@ -46,8 +56,11 @@ extern "C" __global__ void pr_update_ML(const CTYPE* __restrict__ exit_wave, { for (int c = tx; c < C; c += blockDim.x) { - auto obj_val = obj[b * I + c]; - atomicAdd(&probe[b * F + c], conj(obj_val) * exit_wave[b * C + c] * fac); + complex obj_val = obj[b * I + c]; + complex exit_val = exit_wave[b * C + c]; + complex add_val_m = conj(obj_val) * exit_val * fac; + complex add_val = add_val_m; + atomicAdd(&probe[b * F + c], add_val); } } } diff --git a/ptypy/accelerate/cuda_pycuda/cuda/pr_update_local.cu b/ptypy/accelerate/cuda_pycuda/cuda/pr_update_local.cu new file mode 100644 index 000000000..ee81e1620 --- /dev/null +++ b/ptypy/accelerate/cuda_pycuda/cuda/pr_update_local.cu @@ -0,0 +1,71 @@ +/** pr_update_local - for DR algorithm. 
+ * + * Data types: + * - IN_TYPE: the data type for the inputs (float or double) + * - OUT_TYPE: the data type for the outputs (float or double) + * - MATH_TYPE: the data type used for computation + * - ACC_TYPE: data type used in norm calculation (input here) + */ + +#include +using thrust::complex; + +template +__device__ inline void atomicAdd(complex* x, const complex& y) +{ + auto xf = reinterpret_cast(x); + atomicAdd(xf, T(y.real())); + atomicAdd(xf + 1, T(y.imag())); +} + +extern "C" __global__ void pr_update_local( + const complex* __restrict__ exit_wave, + const complex* __restrict__ aux, + int A, + int B, + int C, + complex* probe, + int D, + int E, + int F, + const IN_TYPE* __restrict__ ob_norm, + const complex* __restrict__ obj, + int G, + int H, + int I, + const int* __restrict__ addr) +{ + assert(B == E); // prsh[1] + assert(C == F); // prsh[2] + const int bid = blockIdx.z; + const int tx = threadIdx.x; + const int b = threadIdx.y + blockIdx.y * blockDim.y; + if (b >= B) + return; + const int addr_stride = 15; + + const int* oa = addr + 3 + bid * addr_stride; + const int* pa = addr + bid * addr_stride; + const int* ea = addr + 6 + bid * addr_stride; + + probe += pa[0] * E * F + pa[1] * F + pa[2]; + obj += oa[0] * H * I + oa[1] * I + oa[2]; + aux += bid * B * C; + MATH_TYPE norm_val = ob_norm[0]; + + assert(oa[0] * H * I + oa[1] * I + oa[2] + (B - 1) * I + C - 1 < G * H * I); + + exit_wave += ea[0] * B * C; + + for (int c = tx; c < C; c += blockDim.x) + { + complex obj_val = obj[b * I + c]; + complex exit_val = exit_wave[b * C + c]; + complex aux_val = aux[b * C + c]; + + complex add_val_m = conj(obj_val) * (exit_val - aux_val) / norm_val; + complex add_val = add_val_m; + atomicAdd(&probe[b * F + c], add_val); + } + +} diff --git a/ptypy/accelerate/cuda_pycuda/cuda/transpose.cu b/ptypy/accelerate/cuda_pycuda/cuda/transpose.cu index a460727a4..8de4e7ad7 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/transpose.cu +++ b/ptypy/accelerate/cuda_pycuda/cuda/transpose.cu @@ -5,6 +5,11 @@ * and shared memory access has no bank conflicts. */ +/** + * Data types: + * - DTYPE - any pod type + */ + #include using thrust::complex; diff --git a/ptypy/accelerate/cuda_pycuda/cuda/update_addr_error_state.cu b/ptypy/accelerate/cuda_pycuda/cuda/update_addr_error_state.cu index 2e6d21059..1220a0986 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/update_addr_error_state.cu +++ b/ptypy/accelerate/cuda_pycuda/cuda/update_addr_error_state.cu @@ -1,11 +1,18 @@ +/** update_addr_error_state kernel. 
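update_addr_error_state, whose diff starts here, is the accept/reject step of position refinement: a mangled address block is kept only where it lowered the error. A NumPy reference (sketch):

import numpy as np

def update_addr_error_state_ref(addr, mangled_addr, error_state, error_sum,
                                nmodes):
    # one position covers nmodes consecutive rows of the address book
    accept = error_sum < error_state
    a = addr.reshape(-1, nmodes, 5, 3)
    m = mangled_addr.reshape(-1, nmodes, 5, 3)
    a[accept] = m[accept]
    error_state[accept] = error_sum[accept]
    return a.reshape(addr.shape), error_state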
+ * + * Data types: + * - IN_TYPE: the data type for the inputs (float or double) + * - OUT_TYPE: the data type for the outputs (float or double) + */ + #include #include using thrust::complex; -extern "C" __global__ void update_addr_error_state(int* addr, - const int* mangled_addr, - float* error_state, - const float* error_sum, +extern "C" __global__ void update_addr_error_state(int* __restrict addr, + const int* __restrict mangled_addr, + OUT_TYPE* error_state, + const IN_TYPE* __restrict error_sum, int nmodes) { int tx = threadIdx.x; @@ -23,7 +30,7 @@ extern "C" __global__ void update_addr_error_state(int* addr, if (err_sum < err_st) { - for (int i = tx; i < nmodes * 15; i += blockDim.x) + for (int i = tx, e = nmodes * 15; i < e; i += blockDim.x) { addr[i] = mangled_addr[i]; } diff --git a/ptypy/accelerate/cuda_pycuda/cufft.py b/ptypy/accelerate/cuda_pycuda/cufft.py index 89c2c650b..686171342 100644 --- a/ptypy/accelerate/cuda_pycuda/cufft.py +++ b/ptypy/accelerate/cuda_pycuda/cufft.py @@ -17,6 +17,10 @@ def __init__(self, array, queue=None, if dims < 2: raise AssertionError('Input array must be at least 2-dimensional') self.arr_shape = (array.shape[-2], array.shape[-1]) + rows = self.arr_shape[0] + columns = self.arr_shape[1] + if rows != columns or rows not in [16, 32, 64, 128, 256, 512, 1024, 2048]: + raise ValueError("CUDA FFT only supports powers of 2 for rows/columns, from 16 to 2048") self.batches = int(np.product(array.shape[0:dims-2]) if dims > 2 else 1) self.forward = forward @@ -34,10 +38,11 @@ def _load(self, array, pre_fft, post_fft, symmetric, forward): else: self.post_fft_ptr = 0 - from . import import_fft - mod = import_fft.ImportFFT(self.arr_shape[0], self.arr_shape[1]).get_mod() - self.fftobj = mod.FilteredFFT( + from ptypy import filtered_cufft + self.fftobj = filtered_cufft.FilteredFFT( self.batches, + self.arr_shape[0], + self.arr_shape[1], symmetric, forward, self.pre_fft_ptr, @@ -75,14 +80,30 @@ def queue(self, queue): cufftlib.cufftSetStream(self.plan.handle, queue.handle) def _load(self, array, pre_fft, post_fft, symmetric, forward): + assert(array.dtype in [np.complex64, np.complex128]) + assert(pre_fft.dtype in [np.complex64, np.complex128] if pre_fft is not None else True) + assert(post_fft.dtype in [np.complex64, np.complex128] if post_fft is not None else True) + + math_type = 'float' if array.dtype == np.complex64 else 'double' + if pre_fft is not None: + math_type = 'float' if pre_fft.dtype == np.complex64 else 'double' self.pre_fft_knl = load_kernel("batched_multiply", { 'MPY_DO_SCALE': 'false', - 'MPY_DO_FILT': 'true' + 'MPY_DO_FILT': 'true', + 'IN_TYPE': 'float' if array.dtype == np.complex64 else 'double', + 'OUT_TYPE': 'float' if array.dtype == np.complex64 else 'double', + 'MATH_TYPE': math_type }) if pre_fft is not None else None + math_type = 'float' if array.dtype == np.complex64 else 'double' + if post_fft is not None: + math_type = 'float' if post_fft.dtype == np.complex64 else 'double' self.post_fft_knl = load_kernel("batched_multiply", { 'MPY_DO_SCALE': 'true' if (not forward and not symmetric) or symmetric else 'false', - 'MPY_DO_FILT': 'true' if post_fft is not None else 'false' + 'MPY_DO_FILT': 'true' if post_fft is not None else 'false', + 'IN_TYPE': 'float' if array.dtype == np.complex64 else 'double', + 'OUT_TYPE': 'float' if array.dtype == np.complex64 else 'double', + 'MATH_TYPE': math_type }) if (not (forward and not symmetric) or post_fft is not None) else None self.block = (32, 32, 1) diff --git 
a/ptypy/accelerate/cuda_pycuda/engines/DM_pycuda.py b/ptypy/accelerate/cuda_pycuda/engines/DM_pycuda.py index 154f073ee..961851072 100644 --- a/ptypy/accelerate/cuda_pycuda/engines/DM_pycuda.py +++ b/ptypy/accelerate/cuda_pycuda/engines/DM_pycuda.py @@ -18,21 +18,15 @@ from ptypy.utils import parallel from ptypy.engines import register from ptypy.accelerate.base.engines import DM_serial -from ptypy.accelerate.base import address_manglers from .. import get_context -from ..kernels import FourierUpdateKernel, AuxiliaryWaveKernel, PoUpdateKernel, PositionCorrectionKernel, PropagationKernel -from ..array_utils import ArrayUtilsKernel, GaussianSmoothingKernel +from ..kernels import FourierUpdateKernel, AuxiliaryWaveKernel, PoUpdateKernel, PositionCorrectionKernel +from ..kernels import PropagationKernel, RealSupportKernel, FourierSupportKernel +from ..array_utils import ArrayUtilsKernel, GaussianSmoothingKernel, TransposeKernel, ClipMagnitudesKernel from ..mem_utils import make_pagelocked_paired_arrays as mppa - -MPI = parallel.size > 1 -MPI = True +from ..multi_gpu import get_multi_gpu_communicator __all__ = ['DM_pycuda'] -serialize_array_access = DM_serial.serialize_array_access -gaussian_kernel = DM_serial.gaussian_kernel - - @register() class DM_pycuda(DM_serial.DM_serial): @@ -66,27 +60,28 @@ def __init__(self, ptycho_parent, pars=None): Difference map reconstruction engine. """ super(DM_pycuda, self).__init__(ptycho_parent, pars) + self.multigpu = None def engine_initialize(self): """ Prepare for reconstruction. """ - self.context, self.queue = get_context(new_context=True, new_queue=True) - # allocator for READ only buffers - # self.const_allocator = cl.tools.ImmediateAllocator(queue, cl.mem_flags.READ_ONLY) - ## gaussian filter - # dummy kernel - # if not self.p.obj_smooth_std: - # gauss_kernel = gaussian_kernel(1, 1).astype(np.float32) - # else: - # gauss_kernel = gaussian_kernel(self.p.obj_smooth_std, self.p.obj_smooth_std).astype(np.float32) - # self.gauss_kernel_gpu = gpuarray.to_gpu(gauss_kernel) + # Context, Multi GPU communicator and Stream (needs to be in this order) + self.context, self.queue = get_context(new_context=True, new_queue=False) + self.multigpu = get_multi_gpu_communicator() + self.context, self.queue = get_context(new_context=False, new_queue=True) # Gaussian Smoothing Kernel self.GSK = GaussianSmoothingKernel(queue=self.queue) + # Real/Fourier Support Kernel + self.RSK = {} + self.FSK = {} + + # Clip Magnitudes Kernel + self.CMK = ClipMagnitudesKernel(queue=self.queue) + super(DM_pycuda, self).engine_initialize() - self.error = [] def _setup_kernels(self): """ @@ -117,41 +112,34 @@ def _setup_kernels(self): kern.aux = gpuarray.to_gpu(aux) # setup kernels, one for each SCAN. 
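+ # The CUDA sources above (e.g. pr_update_local) are specialised at compile time through the IN_TYPE / OUT_TYPE / MATH_TYPE macros. A minimal sketch of such a specialisation, with illustrative macro values, mirroring the load_kernel calls in cufft.py: + # + # knl = load_kernel("batched_multiply", { + # 'MPY_DO_SCALE': 'false', + # 'MPY_DO_FILT': 'true', + # 'IN_TYPE': 'float', # inputs are complex<float> + # 'OUT_TYPE': 'float', # outputs are complex<float> + # 'MATH_TYPE': 'double' # accumulate in double precision + # }) + # + # Mixing float storage with double maths trades memory bandwidth against numerical accuracy.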
- logger.info("Setting up FourierUpdateKernel") + log(4, "Setting up FourierUpdateKernel") kern.FUK = FourierUpdateKernel(aux, nmodes, queue_thread=self.queue) kern.FUK.allocate() - logger.info("Setting up PoUpdateKernel") - kern.POK = PoUpdateKernel(queue_thread=self.queue, denom_type=np.float32) + log(4, "Setting up PoUpdateKernel") + kern.POK = PoUpdateKernel(queue_thread=self.queue) kern.POK.allocate() - logger.info("Setting up AuxiliaryWaveKernel") + log(4, "Setting up AuxiliaryWaveKernel") kern.AWK = AuxiliaryWaveKernel(queue_thread=self.queue) kern.AWK.allocate() - logger.info("Setting up ArrayUtilsKernel") + log(4, "Setting up ArrayUtilsKernel") kern.AUK = ArrayUtilsKernel(queue=self.queue) - logger.info("Setting up PropagationKernel") + log(4, "Setting up TransposeKernel") + kern.TK = TransposeKernel(queue=self.queue) + + log(4, "Setting up PropagationKernel") kern.PROP = PropagationKernel(aux, geo.propagator, self.queue, self.p.fft_lib) kern.PROP.allocate() kern.resolution = geo.resolution[0] if self.do_position_refinement: - logger.info("Setting up position correction") - addr_mangler = address_manglers.RandomIntMangle(int(self.p.position_refinement.amplitude // geo.resolution[0]), - self.p.position_refinement.start, - self.p.position_refinement.stop, - max_bound=int(self.p.position_refinement.max_shift // geo.resolution[0]), - randomseed=0) - logger.warning("amplitude is %s " % (self.p.position_refinement.amplitude // geo.resolution[0])) - logger.warning("max bound is %s " % (self.p.position_refinement.max_shift // geo.resolution[0])) - - kern.PCK = PositionCorrectionKernel(aux, nmodes, queue_thread=self.queue) + log(4, "Setting up PositionCorrectionKernel") + kern.PCK = PositionCorrectionKernel(aux, nmodes, self.p.position_refinement, geo.resolution, queue_thread=self.queue) kern.PCK.allocate() - kern.PCK.address_mangler = addr_mangler - #self.queue.synchronize() - logger.info("Kernel setup completed") + log(4, "Kernel setup completed") def engine_prepare(self): @@ -165,6 +153,8 @@ def engine_prepare(self): s.gpu, s.data = mppa(s.data) for name, s in self.pr.S.items(): s.gpu, s.data = mppa(s.data) + for name, s in self.pr_buf.S.items(): + s.gpu, s.data = mppa(s.data) for name, s in self.pr_nrm.S.items(): s.gpu, s.data = mppa(s.data) @@ -177,6 +167,8 @@ def engine_prepare(self): if use_tiles: prep.addr2 = np.ascontiguousarray(np.transpose(prep.addr, (2, 3, 0, 1))) prep.addr2_gpu = gpuarray.to_gpu(prep.addr2) + if self.do_position_refinement: + prep.mangled_addr_gpu = prep.addr_gpu.copy() for label, d in self.ptycho.new_data: prep = self.diff_info[d.ID] @@ -233,47 +225,33 @@ def engine_iterate(self, num=1): ## compute log-likelihood if self.p.compute_log_likelihood: - t1 = time.time() AWK.build_aux_no_ex(aux, addr, ob, pr) PROP.fw(aux, aux) FUK.log_likelihood(aux, addr, mag, ma, err_phot) - self.benchmark.F_LLerror += time.time() - t1 ## build auxilliary wave - t1 = time.time() AWK.build_aux(aux, addr, ob, pr, ex, alpha=self.p.alpha) - self.benchmark.A_Build_aux += time.time() - t1 ## forward FFT - t1 = time.time() PROP.fw(aux, aux) - self.benchmark.B_Prop += time.time() - t1 ## Deviation from measured data - t1 = time.time() FUK.fourier_error(aux, addr, mag, ma, ma_sum) FUK.error_reduce(addr, err_fourier) FUK.fmag_all_update(aux, addr, mag, ma, err_fourier, pbound) - self.benchmark.C_Fourier_update += time.time() - t1 ## backward FFT - t1 = time.time() PROP.bw(aux, aux) - self.benchmark.D_iProp += time.time() - t1 ## build exit wave - t1 = time.time() - AWK.build_exit(aux, 
addr, ob, pr, ex) + AWK.build_exit(aux, addr, ob, pr, ex, alpha=self.p.alpha) FUK.exit_error(aux, addr) FUK.error_reduce(addr, err_exit) - self.benchmark.E_Build_exit += time.time() - t1 - - self.benchmark.calls_fourier += 1 parallel.barrier() sync = (self.curiter % 1 == 0) - self.overlap_update(MPI=MPI) + self.overlap_update() parallel.barrier() if self.do_position_refinement and (self.curiter): @@ -285,7 +263,7 @@ def engine_iterate(self, num=1): """ Iterates through all positions and refines them by a given algorithm. """ - log(3, "----------- START POS REF -------------") + log(4, "----------- START POS REF -------------") for dID in self.di.S.keys(): prep = self.diff_info[dID] @@ -297,39 +275,53 @@ def engine_iterate(self, num=1): aux = kern.aux addr = prep.addr_gpu original_addr = prep.original_addr + mangled_addr = prep.mangled_addr_gpu mag = prep.mag ma_sum = prep.ma_sum err_fourier = prep.err_fourier_gpu + error_state = prep.error_state_gpu PCK = kern.PCK - AUK = kern.AUK - - #error_state = np.zeros(err_fourier.shape, dtype=np.float32) - #error_state[:] = err_fourier.get() - cuda.memcpy_dtod(dest=prep.error_state_gpu.ptr, + TK = kern.TK + PROP = kern.PROP + + # Keep track of object boundaries + max_oby = ob.shape[-2] - aux.shape[-2] - 1 + max_obx = ob.shape[-1] - aux.shape[-1] - 1 + + # We need to re-calculate the current error + PCK.build_aux(aux, addr, ob, pr) + PROP.fw(aux, aux) + if self.p.position_refinement.metric == "fourier": + PCK.fourier_error(aux, addr, mag, ma, ma_sum) + PCK.error_reduce(addr, err_fourier) + if self.p.position_refinement.metric == "photon": + PCK.log_likelihood(aux, addr, mag, ma, err_fourier) + cuda.memcpy_dtod(dest=error_state.ptr, src=err_fourier.ptr, size=err_fourier.nbytes) + + PCK.mangler.setup_shifts(self.curiter, nframes=addr.shape[0]) + log(4, 'Position refinement trial: iteration %s' % (self.curiter)) - for i in range(self.p.position_refinement.nshifts): - mangled_addr = PCK.address_mangler.mangle_address(addr.get(), original_addr, self.curiter) - mangled_addr_gpu = gpuarray.to_gpu(mangled_addr) - PCK.build_aux(aux, mangled_addr_gpu, ob, pr) + for i in range(PCK.mangler.nshifts): + PCK.mangler.get_address(i, addr, mangled_addr, max_oby, max_obx) + PCK.build_aux(aux, mangled_addr, ob, pr) PROP.fw(aux, aux) - PCK.fourier_error(aux, mangled_addr_gpu, mag, ma, ma_sum) - PCK.error_reduce(mangled_addr_gpu, err_fourier) - PCK.update_addr_and_error_state(addr, - prep.error_state_gpu, - mangled_addr_gpu, - err_fourier) + if self.p.position_refinement.metric == "fourier": + PCK.fourier_error(aux, mangled_addr, mag, ma, ma_sum) + PCK.error_reduce(mangled_addr, err_fourier) + if self.p.position_refinement.metric == "photon": + PCK.log_likelihood(aux, mangled_addr, mag, ma, err_fourier) + PCK.update_addr_and_error_state(addr, error_state, mangled_addr, err_fourier) - # prep.err_fourier_gpu.set(error_state) - cuda.memcpy_dtod(dest=prep.err_fourier_gpu.ptr, - src=prep.error_state_gpu.ptr, - size=prep.err_fourier_gpu.nbytes) + cuda.memcpy_dtod(dest=err_fourier.ptr, + src=error_state.ptr, + size=err_fourier.nbytes) if use_tiles: s1 = addr.shape[0] * addr.shape[1] s2 = addr.shape[2] * addr.shape[3] - AUK.transpose(addr.reshape(s1, s2), prep.addr2_gpu.reshape(s2, s1)) + TK.transpose(addr.reshape(s1, s2), prep.addr2_gpu.reshape(s2, s1)) self.curiter += 1 queue.synchronize() @@ -354,7 +346,6 @@ def engine_iterate(self, num=1): ## object update def object_update(self, MPI=False): - t1 = time.time() use_atomics = self.p.object_update_cuda_atomics queue = self.queue 
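+ # A layout sketch, inferred from the kernels above: each view's address record holds five (index, y, x) triples packed into 15 ints (hence addr_stride = 15 in pr_update_local), with offsets 0, 3 and 6 locating the probe, object and exit wave respectively: + # + # pa = addr_flat[n * 15 + 0 : n * 15 + 3] # probe (mode, y, x) + # oa = addr_flat[n * 15 + 3 : n * 15 + 6] # object (layer, y, x) + # ea = addr_flat[n * 15 + 6 : n * 15 + 9] # exit wave (index, y, x) + # + # The atomic update scatters into shared object/probe pixels via atomicAdd; the tiled (atomics=False) variant instead consumes the (2, 3, 0, 1)-transposed copy prep.addr2 prepared in engine_prepare.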
queue.synchronize() @@ -363,11 +354,10 @@ cfact = self.ob_cfact[oID] if self.p.obj_smooth_std is not None: - logger.info('Smoothing object, cfact is %.2f' % cfact) + log(4, 'Smoothing object, cfact is %.2f' % cfact) + obb = self.ob_buf.S[oID] smooth_mfs = [self.p.obj_smooth_std, self.p.obj_smooth_std] - ob_gpu_tmp = gpuarray.empty(ob.shape, dtype=np.complex64) - self.GSK.convolution(ob.gpu, ob_gpu_tmp, smooth_mfs) - ob.gpu = ob_gpu_tmp + self.GSK.convolution(ob.gpu, smooth_mfs, tmp=obb.gpu) ob.gpu *= cfact obn.gpu.fill(cfact) @@ -393,33 +383,19 @@ for oID, ob in self.ob.storages.items(): obn = self.ob_nrm.S[oID] - # MPI test - if MPI: - ob.data[:] = ob.gpu.get() - obn.data[:] = obn.gpu.get() - queue.synchronize() - parallel.allreduce(ob.data) - parallel.allreduce(obn.data) - ob.data /= obn.data - - self.clip_object(ob) - ob.gpu.set(ob.data) - else: - ob.gpu /= obn.gpu + self.multigpu.allReduceSum(ob.gpu) + self.multigpu.allReduceSum(obn.gpu) + ob.gpu /= obn.gpu + self.clip_object(ob.gpu) queue.synchronize() - # print 'object update: ' + str(time.time()-t1) - self.benchmark.object_update += time.time() - t1 - self.benchmark.calls_object += 1 - ## probe update def probe_update(self, MPI=False): - t1 = time.time() queue = self.queue # storage for-loop - change = 0 + change_gpu = gpuarray.zeros((1,), dtype=np.float32) cfact = self.p.probe_inertia use_atomics = self.p.probe_update_cuda_atomics for pID, pr in self.pr.storages.items(): @@ -450,35 +426,56 @@ buf = self.pr_buf.S[pID] prn = self.pr_nrm.S[pID] - if MPI: - pr.data[:] = pr.gpu.get() - prn.data[:] = prn.gpu.get() - queue.synchronize() - parallel.allreduce(pr.data) - parallel.allreduce(prn.data) - pr.data /= prn.data - self.support_constraint(pr) - pr.gpu.set(pr.data) - else: - pr.gpu /= prn.gpu - pr.data[:] = pr.gpu.get() - self.support_constraint(pr) - pr.gpu.set(pr.data) - - ## this should be done on GPU - queue.synchronize() - change += u.norm2(pr.data - buf.data) / u.norm2(pr.data) - buf.data[:] = pr.data - if MPI: - change = parallel.allreduce(change) / parallel.size + self.multigpu.allReduceSum(pr.gpu) + self.multigpu.allReduceSum(prn.gpu) + pr.gpu /= prn.gpu + self.support_constraint(pr) - # print 'probe update: ' + str(time.time()-t1) - self.benchmark.probe_update += time.time() - t1 - self.benchmark.calls_probe += 1 + ## calculate change on GPU + queue.synchronize() + AUK = self.kernels[list(self.kernels)[0]].AUK + buf.gpu -= pr.gpu + change_gpu += (AUK.norm2(buf.gpu) / AUK.norm2(pr.gpu)) + buf.gpu[:] = pr.gpu + self.multigpu.allReduceSum(change_gpu) + change = change_gpu.get().item() / parallel.size return np.sqrt(change) - def engine_finalize(self): + def support_constraint(self, storage=None): + """ + Enforces 2D support constraint on probe.
+ """ + if storage is None: + for s in self.pr.storages.values(): + self.support_constraint(s) + + # Fourier space + support = self._probe_fourier_support.get(storage.ID) + if support is not None: + if storage.ID not in self.FSK: + supp = support.astype(np.complex64) + self.FSK[storage.ID] = FourierSupportKernel(supp, self.queue, self.p.fft_lib) + self.FSK[storage.ID].allocate() + self.FSK[storage.ID].apply_fourier_support(storage.gpu) + + # Real space + support = self._probe_support.get(storage.ID) + if support is not None: + if storage.ID not in self.RSK: + self.RSK[storage.ID] = RealSupportKernel(support.astype(np.complex64)) + self.RSK[storage.ID].allocate() + self.RSK[storage.ID].apply_real_support(storage.gpu) + + def clip_object(self, ob): + """ + Clips magnitudes of object into given range. + """ + if self.p.clip_object is not None: + cmin, cmax = self.p.clip_object + self.CMK.clip_magnitudes_to_range(ob, cmin, cmax) + + def engine_finalize(self, benchmark=False): """ clear GPU data and destroy context. """ @@ -495,9 +492,11 @@ def engine_finalize(self): for dID, prep in self.diff_info.items(): prep.addr = prep.addr_gpu.get() - # copy data to cpu + # copy data to cpu + # this kills the pagelock memory (otherwise we get segfaults in h5py) for name, s in self.pr.S.items(): - s.data = np.copy(s.data) # is this the same as s.data.get()? + s.data = np.copy(s.data) + self.context.pop() self.context.detach() - super(DM_pycuda, self).engine_finalize() \ No newline at end of file + super(DM_pycuda, self).engine_finalize(benchmark) diff --git a/ptypy/accelerate/cuda_pycuda/engines/DM_pycuda_stream.py b/ptypy/accelerate/cuda_pycuda/engines/DM_pycuda_stream.py index 820124b5f..9306475b1 100644 --- a/ptypy/accelerate/cuda_pycuda/engines/DM_pycuda_stream.py +++ b/ptypy/accelerate/cuda_pycuda/engines/DM_pycuda_stream.py @@ -28,9 +28,6 @@ from ..mem_utils import make_pagelocked_paired_arrays as mppa from ..mem_utils import GpuDataManager2 -MPI = parallel.size > 1 -MPI = True - EX_MA_BLOCKS_RATIO = 2 MAX_BLOCKS = 99999 # can be used to limit the number of blocks, simulating that they don't fit #MAX_BLOCKS = 3 # can be used to limit the number of blocks, simulating that they don't fit @@ -69,8 +66,8 @@ def _setup_kernels(self): # TODO grow blocks dynamically nex = min(fit * EX_MA_BLOCKS_RATIO, MAX_BLOCKS) nma = min(fit, MAX_BLOCKS) - - log(3, 'PyCUDA max blocks fitting on GPU: exit arrays={}, ma_arrays={}'.format(nex, nma)) + log(4, 'Free memory on device: %.2f GB' % (float(mem)/1e9)) + log(4, 'PyCUDA max blocks fitting on GPU: exit arrays={}, ma_arrays={}'.format(nex, nma)) # reset memory or create new self.ex_data = GpuDataManager2(ex_mem, 0, nex, True) self.ma_data = GpuDataManager2(ma_mem, 0, nma, False) @@ -88,6 +85,8 @@ def engine_prepare(self): s.gpu, s.data = mppa(s.data) for name, s in self.pr.S.items(): s.gpu, s.data = mppa(s.data) + for name, s in self.pr_buf.S.items(): + s.gpu, s.data = mppa(s.data) for name, s in self.pr_nrm.S.items(): s.gpu, s.data = mppa(s.data) @@ -100,6 +99,8 @@ def engine_prepare(self): if use_tiles: prep.addr2 = np.ascontiguousarray(np.transpose(prep.addr, (2, 3, 0, 1))) prep.addr2_gpu = gpuarray.to_gpu(prep.addr2) + if self.do_position_refinement: + prep.mangled_addr_gpu = prep.addr_gpu.copy() for label, d in self.ptycho.new_data: dID = d.ID @@ -123,10 +124,11 @@ def engine_prepare(self): prep.mag = cuda.pagelocked_empty(mag.shape, mag.dtype, order="C", mem_flags=4) prep.mag[:] = mag + log(4, 'Free memory on device: %.2f GB' % (float(cuda.mem_get_info()[0])/1e9)) 
self.ex_data.add_data_block() self.ma_data.add_data_block() self.mag_data.add_data_block() - + def engine_iterate(self, num=1): """ Compute one iteration. @@ -136,7 +138,7 @@ def engine_iterate(self, num=1): atomics_probe = self.p.probe_update_cuda_atomics atomics_object = self.p.object_update_cuda_atomics use_tiles = (not atomics_object) or (not atomics_probe) - + for it in range(num): error = {} @@ -157,9 +159,9 @@ def engine_iterate(self, num=1): obb = self.ob_buf.S[oID] if self.p.obj_smooth_std is not None: - logger.info('Smoothing object, cfact is %.2f' % cfact) + log(4, 'Smoothing object, cfact is %.2f' % cfact) smooth_mfs = [self.p.obj_smooth_std, self.p.obj_smooth_std] - self.GSK.convolution(ob.gpu, obb.gpu, smooth_mfs) + self.GSK.convolution(ob.gpu, smooth_mfs, tmp=obb.gpu) # obb.gpu[:] = ob.gpu * cfactf32 ob.gpu._axpbz(np.complex64(cfact), 0, obb.gpu, stream=self.queue) @@ -167,7 +169,6 @@ def engine_iterate(self, num=1): # First cycle: Fourier + object update for iblock, dID in enumerate(self.dID_list): - t1 = time.time() prep = self.diff_info[dID] # find probe, object in exit ID in dependence of dID @@ -211,24 +212,18 @@ def engine_iterate(self, num=1): ## compute log-likelihood if self.p.compute_log_likelihood: - t1 = time.time() AWK.build_aux_no_ex(aux, addr, ob, pr) PROP.fw(aux, aux) # synchronize h2d stream with compute stream self.queue.wait_for_event(ev_mag) FUK.log_likelihood(aux, addr, mag, ma, err_phot) - self.benchmark.F_LLerror += time.time() - t1 # synchronize h2d stream with compute stream self.queue.wait_for_event(ev_ex) - t1 = time.time() AWK.build_aux(aux, addr, ob, pr, ex, alpha=self.p.alpha) - self.benchmark.A_Build_aux += time.time() - t1 ## FFT - t1 = time.time() PROP.fw(aux, aux) - self.benchmark.B_Prop += time.time() - t1 ## Deviation from measured data # synchronize h2d stream with compute stream @@ -237,32 +232,23 @@ def engine_iterate(self, num=1): FUK.error_reduce(addr, err_fourier) FUK.fmag_all_update(aux, addr, mag, ma, err_fourier, pbound) - self.benchmark.C_Fourier_update += time.time() - t1 data_mag.record_done(self.queue, 'compute') data_ma.record_done(self.queue, 'compute') - t1 = time.time() PROP.bw(aux, aux) ## apply changes - AWK.build_exit(aux, addr, ob, pr, ex) + AWK.build_exit(aux, addr, ob, pr, ex, alpha=self.p.alpha) FUK.exit_error(aux, addr) FUK.error_reduce(addr, err_exit) - self.benchmark.E_Build_exit += time.time() - t1 - self.benchmark.calls_fourier += 1 - prestr = '%d Iteration (Overlap) #%02d: ' % (parallel.rank, inner) # Update object if do_update_object: log(4, prestr + '----- object update -----', True) - t1 = time.time() - addrt = addr if atomics_object else addr2 self.queue.wait_for_event(ev_ex) POK.ob_update(addrt, obb, obn, pr, ex, atomics=atomics_object) - self.benchmark.object_update += time.time() - t1 - self.benchmark.calls_object += 1 data_ex.record_done(self.queue, 'compute') if iblock + len(self.ex_data) < len(self.dID_list): @@ -280,29 +266,21 @@ def engine_iterate(self, num=1): for oID, ob in self.ob.storages.items(): obn = self.ob_nrm.S[oID] obb = self.ob_buf.S[oID] - # MPI test - if MPI: - obb.data[:] = obb.gpu.get() - obn.data[:] = obn.gpu.get() - parallel.allreduce(obb.data) - parallel.allreduce(obn.data) - obb.data /= obn.data - self.clip_object(obb) - ob.gpu.set(obb.data) - else: - obb.gpu /= obn.gpu - ob.gpu[:] = obb.gpu + self.multigpu.allReduceSum(obb.gpu) + self.multigpu.allReduceSum(obn.gpu) + obb.gpu /= obn.gpu + + self.clip_object(obb.gpu) + ob.gpu[:] = obb.gpu # Exit if probe should not yet be updated 
if not do_update_probe: break - self.ex_data.syncback = False + # Update probe log(4, prestr + '----- probe update -----', True) - change = self.probe_update(MPI=MPI) - # change = self.probe_update(MPI=(parallel.size>1 and MPI)) - + change = self.probe_update() log(4, prestr + 'change in probe is %.3f' % change, True) # stop iteration if probe change is small @@ -320,7 +298,7 @@ def engine_iterate(self, num=1): """ Iterates through all positions and refines them by a given algorithm. """ - log(3, "----------- START POS REF -------------") + log(4, "----------- START POS REF -------------") for dID in self.di.S.keys(): prep = self.diff_info[dID] @@ -331,43 +309,61 @@ def engine_iterate(self, num=1): aux = kern.aux addr = prep.addr_gpu original_addr = prep.original_addr + mangled_addr = prep.mangled_addr_gpu ma_sum = prep.ma_sum_gpu + err_fourier = prep.err_fourier_gpu + error_state = prep.error_state_gpu + PCK = kern.PCK - AUK = kern.AUK + TK = kern.TK PROP = kern.PROP + # Make sure our data arrays are on device ev_ma, ma, data_ma = self.ma_data.to_gpu(prep.ma, dID, self.qu_htod) ev_mag, mag, data_mag = self.mag_data.to_gpu(prep.mag, dID, self.qu_htod) - # error_state = np.zeros(err_fourier.shape, dtype=np.float32) - # err_fourier.get_async(streamdata.queue, error_state) - cuda.memcpy_dtod(dest=prep.error_state_gpu.ptr, - src=prep.err_fourier_gpu.ptr, - size=prep.err_fourier_gpu.nbytes)#, stream=self.queue) + + # Keep track of object boundaries + max_oby = ob.shape[-2] - aux.shape[-2] - 1 + max_obx = ob.shape[-1] - aux.shape[-1] - 1 + + # We need to re-calculate the current error + PCK.build_aux(aux, addr, ob, pr) + PROP.fw(aux, aux) + # wait for data to arrive + self.queue.wait_for_event(ev_mag) + + # We need to re-calculate the current error + if self.p.position_refinement.metric == "fourier": + PCK.fourier_error(aux, addr, mag, ma, ma_sum) + PCK.error_reduce(addr, err_fourier) + if self.p.position_refinement.metric == "photon": + PCK.log_likelihood(aux, addr, mag, ma, err_fourier) + cuda.memcpy_dtod_async(dest=error_state.ptr, + src=err_fourier.ptr, + size=err_fourier.nbytes, stream=self.queue) + log(4, 'Position refinement trial: iteration %s' % (self.curiter)) - for i in range(self.p.position_refinement.nshifts): - mangled_addr = PCK.address_mangler.mangle_address(addr.get(), original_addr, self.curiter) - mangled_addr_gpu = gpuarray.to_gpu(mangled_addr) - PCK.build_aux(aux, mangled_addr_gpu, ob, pr) + PCK.mangler.setup_shifts(self.curiter, nframes=addr.shape[0]) + for i in range(PCK.mangler.nshifts): + PCK.mangler.get_address(i, addr, mangled_addr, max_oby, max_obx) + PCK.build_aux(aux, mangled_addr, ob, pr) PROP.fw(aux, aux) - # wait for data to arrive - self.queue.wait_for_event(ev_mag) - PCK.fourier_error(aux, mangled_addr_gpu, mag, ma, ma_sum) - PCK.error_reduce(mangled_addr_gpu, prep.err_fourier_gpu) - # err_fourier_cpu = err_fourier.get_async(streamdata.queue) - PCK.update_addr_and_error_state(addr, - prep.error_state_gpu, - mangled_addr_gpu, - prep.err_fourier_gpu) + if self.p.position_refinement.metric == "fourier": + PCK.fourier_error(aux, mangled_addr, mag, ma, ma_sum) + PCK.error_reduce(mangled_addr, err_fourier) + if self.p.position_refinement.metric == "photon": + PCK.log_likelihood(aux, mangled_addr, mag, ma, err_fourier) + PCK.update_addr_and_error_state(addr, error_state, mangled_addr, err_fourier) data_mag.record_done(self.queue, 'compute') data_ma.record_done(self.queue, 'compute') - cuda.memcpy_dtod(dest=prep.err_fourier_gpu.ptr, - src=prep.error_state_gpu.ptr, - 
size=prep.err_fourier_gpu.nbytes) #stream=self.queue) + cuda.memcpy_dtod_async(dest=err_fourier.ptr, + src=error_state.ptr, + size=err_fourier.nbytes, stream=self.queue) if use_tiles: s1 = prep.addr_gpu.shape[0] * prep.addr_gpu.shape[1] s2 = prep.addr_gpu.shape[2] * prep.addr_gpu.shape[3] - AUK.transpose(prep.addr_gpu.reshape(s1, s2), prep.addr2_gpu.reshape(s2, s1)) + TK.transpose(prep.addr_gpu.reshape(s1, s2), prep.addr2_gpu.reshape(s2, s1)) self.curiter += 1 self.queue.synchronize() @@ -392,11 +388,10 @@ def engine_iterate(self, num=1): ## probe update def probe_update(self, MPI=False): - t1 = time.time() queue = self.queue use_atomics = self.p.probe_update_cuda_atomics # storage for-loop - change = 0 + change_gpu = gpuarray.zeros((1,), dtype=np.float32) for pID, pr in self.pr.storages.items(): prn = self.pr_nrm.S[pID] cfact = self.pr_cfact[pID] @@ -434,40 +429,22 @@ def probe_update(self, MPI=False): buf = self.pr_buf.S[pID] prn = self.pr_nrm.S[pID] - # MPI test - if MPI: - # if False: - pr.data[:] = pr.gpu.get() - prn.data[:] = prn.gpu.get() - # queue.synchronize() - parallel.allreduce(pr.data) - parallel.allreduce(prn.data) - pr.data /= prn.data - - self.support_constraint(pr) - - pr.gpu.set(pr.data) - else: - pr.gpu /= prn.gpu - # ca. 0.3 ms - # self.pr.S[pID].gpu = probe_gpu - pr.data[:] = pr.gpu.get() - - ## this should be done on GPU - - # queue.synchronize() - change += u.norm2(pr.data - buf.data) / u.norm2(pr.data) - buf.data[:] = pr.data - if MPI: - change = parallel.allreduce(change) / parallel.size - - # print 'probe update: ' + str(time.time()-t1) - self.benchmark.probe_update += time.time() - t1 - self.benchmark.calls_probe += 1 + self.multigpu.allReduceSum(pr.gpu) + self.multigpu.allReduceSum(prn.gpu) + pr.gpu /= prn.gpu + self.support_constraint(pr) + + ## calculate change on GPU + AUK = self.kernels[list(self.kernels)[0]].AUK + buf.gpu -= pr.gpu + change_gpu += (AUK.norm2(buf.gpu) / AUK.norm2(pr.gpu)) + buf.gpu[:] = pr.gpu + self.multigpu.allReduceSum(change_gpu) + change = change_gpu.get().item() / parallel.size return np.sqrt(change) - def engine_finalize(self): + def engine_finalize(self, benchmark=False): """ Clear all GPU data, pinned memory, etc """ @@ -479,4 +456,4 @@ def engine_finalize(self): for name, s in self.pr.S.items(): s.data = np.copy(s.data) # is this the same as s.data.get()? - super().engine_finalize() + super().engine_finalize(benchmark) diff --git a/ptypy/accelerate/cuda_pycuda/engines/DM_pycuda_streams.py b/ptypy/accelerate/cuda_pycuda/engines/DM_pycuda_streams.py index 36aadfe1b..d2db342f5 100644 --- a/ptypy/accelerate/cuda_pycuda/engines/DM_pycuda_streams.py +++ b/ptypy/accelerate/cuda_pycuda/engines/DM_pycuda_streams.py @@ -20,9 +20,6 @@ from . 
import DM_pycuda from ..mem_utils import GpuDataManager -MPI = parallel.size > 1 -MPI = True - # factor how many more exit waves we wanna keep on GPU compared to # ma / mag data EX_MA_BLOCKS_RATIO = 2 @@ -149,6 +146,12 @@ def engine_prepare(self): s.data = cuda.pagelocked_empty(d.shape, d.dtype, order="C", mem_flags=0) s.data[:] = d s.gpu = gpuarray.to_gpu(s.data) + for name, s in self.pr_buf.S.items(): + # pr + d = s.data + s.data = cuda.pagelocked_empty(d.shape, d.dtype, order="C", mem_flags=0) + s.data[:] = d + s.gpu = gpuarray.to_gpu(s.data) for name, s in self.pr_nrm.S.items(): # prn d = s.data @@ -169,6 +172,8 @@ def engine_prepare(self): if use_tiles: prep.addr2 = np.ascontiguousarray(np.transpose(prep.addr, (2, 3, 0, 1))) prep.addr2_gpu = gpuarray.to_gpu(prep.addr2) + if self.do_position_refinement: + prep.mangled_addr_gpu = prep.addr_gpu.copy() prep.ma_sum_gpu = gpuarray.to_gpu(prep.ma_sum) # prepare page-locked mems: @@ -206,7 +211,7 @@ def engine_prepare(self): nma = min(fit, blocks) nstreams = min(MAX_STREAMS, blocks) - log(3, 'PyCUDA blocks fitting on GPU: exit arrays={}, ma_arrays={}, streams={}, totalblocks={}'.format(nex, nma, nstreams, blocks)) + log(4, 'PyCUDA blocks fitting on GPU: exit arrays={}, ma_arrays={}, streams={}, totalblocks={}'.format(nex, nma, nstreams, blocks)) # reset memory or create new if self.ex_data is not None: self.ex_data.reset(ex_mem, nex) @@ -256,13 +261,13 @@ def engine_iterate(self, num=1): cfact = self.ob_cfact[oID] obn = self.ob_nrm.S[oID] obb = self.ob_buf.S[oID] - + if self.p.obj_smooth_std is not None: - logger.info('Smoothing object, cfact is %.2f' % cfact) + log(4,'Smoothing object, cfact is %.2f' % cfact) smooth_mfs = [self.p.obj_smooth_std, self.p.obj_smooth_std] - self.GSK.convolution(ob.gpu, obb.gpu, smooth_mfs) + self.GSK.convolution(ob.gpu, smooth_mfs, tmp=obb.gpu) - obb.gpu._axpbz(np.complex64(cfact), 0, obb.gpu, stream=streamdata.queue) + ob.gpu._axpbz(np.complex64(cfact), 0, obb.gpu, stream=streamdata.queue) obn.gpu.fill(np.float32(cfact), stream=streamdata.queue) self.ex_data.syncback = True @@ -350,7 +355,7 @@ def engine_iterate(self, num=1): t1 = time.time() PROP.bw(aux, aux) ## apply changes - AWK.build_exit(aux, addr, ob, pr, ex) + AWK.build_exit(aux, addr, ob, pr, ex, alpha=self.p.alpha) FUK.exit_error(aux, addr) FUK.error_reduce(addr, err_exit) self.benchmark.E_Build_exit += time.time() - t1 @@ -391,8 +396,7 @@ def engine_iterate(self, num=1): # Update probe log(4, prestr + '----- probe update -----', True) self.ex_data.syncback = False - change = self.probe_update(MPI=MPI) - # change = self.probe_update(MPI=(parallel.size>1 and MPI)) + change = self.probe_update() # swap direction for next time self.dID_list.reverse() @@ -416,7 +420,7 @@ def engine_iterate(self, num=1): """ Iterates through all positions and refines them by a given algorithm. 
""" - log(3, "----------- START POS REF -------------") + log(4, "----------- START POS REF -------------") prev_event = None for dID in self.di.S.keys(): streamdata = self.streams[self.cur_stream] @@ -429,46 +433,59 @@ def engine_iterate(self, num=1): aux = kern.aux addr = prep.addr_gpu original_addr = prep.original_addr + mangled_addr = prep.mangled_addr_gpu ma_sum = prep.ma_sum_gpu ma, mag = streamdata.ma_to_gpu(dID, prep.ma, prep.mag) + err_fourier = prep.err_fourier_gpu + error_state = prep.error_state_gpu PCK = kern.PCK - AUK = kern.AUK + TK = kern.TK + PROP = kern.PROP PCK.queue = streamdata.queue + TK.queue = streamdata.queue PROP.queue = streamdata.queue - AUK.queue = streamdata.queue - #error_state = np.zeros(err_fourier.shape, dtype=np.float32) - #err_fourier.get_async(streamdata.queue, error_state) - cuda.memcpy_dtod_async(dest=prep.error_state_gpu.ptr, - src=prep.err_fourier_gpu.ptr, - size=prep.err_fourier_gpu.nbytes, + # Keep track of object boundaries + max_oby = ob.shape[-2] - aux.shape[-2] - 1 + max_obx = ob.shape[-1] - aux.shape[-1] - 1 + + # We need to re-calculate the current error + PCK.build_aux(aux, addr, ob, pr) + PROP.fw(aux, aux) + if self.p.position_refinement.metric == "fourier": + PCK.fourier_error(aux, addr, mag, ma, ma_sum) + PCK.error_reduce(addr, err_fourier) + if self.p.position_refinement.metric == "photon": + PCK.log_likelihood(aux, addr, mag, ma, err_fourier) + cuda.memcpy_dtod_async(dest=error_state.ptr, + src=err_fourier.ptr, + size=err_fourier.nbytes, stream=streamdata.queue) streamdata.start_compute(prev_event) log(4, 'Position refinement trial: iteration %s' % (self.curiter)) - for i in range(self.p.position_refinement.nshifts): - addr_cpu = addr.get_async(streamdata.queue) + PCK.mangler.setup_shifts(self.curiter, nframes=addr.shape[0]) + for i in range(PCK.mangler.nshifts): streamdata.queue.synchronize() - mangled_addr = PCK.address_mangler.mangle_address(addr_cpu, original_addr, self.curiter) - mangled_addr_gpu = gpuarray.to_gpu_async(mangled_addr, stream=streamdata.queue) - PCK.build_aux(aux, mangled_addr_gpu, ob, pr) + PCK.mangler.get_address(i, addr, mangled_addr, max_oby, max_obx) + PCK.build_aux(aux, mangled_addr, ob, pr) PROP.fw(aux, aux) - PCK.fourier_error(aux, mangled_addr_gpu, mag, ma, ma_sum) - PCK.error_reduce(mangled_addr_gpu, prep.err_fourier_gpu) - # err_fourier_cpu = err_fourier.get_async(streamdata.queue) - PCK.update_addr_and_error_state(addr, - prep.error_state_gpu, - mangled_addr_gpu, - prep.err_fourier_gpu) - cuda.memcpy_dtod_async(dest=prep.err_fourier_gpu.ptr, - src=prep.error_state_gpu.ptr, - size=prep.err_fourier_gpu.nbytes, + if self.p.position_refinement.metric == "fourier": + PCK.fourier_error(aux, mangled_addr, mag, ma, ma_sum) + PCK.error_reduce(mangled_addr, err_fourier) + if self.p.position_refinement.metric == "photon": + PCK.log_likelihood(aux, mangled_addr, mag, ma, err_fourier) + PCK.update_addr_and_error_state(addr, error_state, mangled_addr, err_fourier) + + cuda.memcpy_dtod_async(dest=err_fourier.ptr, + src=error_state.ptr, + size=err_fourier.nbytes, stream=streamdata.queue) if use_tiles: s1 = prep.addr_gpu.shape[0] * prep.addr_gpu.shape[1] s2 = prep.addr_gpu.shape[2] * prep.addr_gpu.shape[3] - AUK.transpose(prep.addr_gpu.reshape(s1, s2), prep.addr2_gpu.reshape(s2, s1)) + TK.transpose(prep.addr_gpu.reshape(s1, s2), prep.addr2_gpu.reshape(s2, s1)) prev_event = streamdata.end_compute() @@ -486,7 +503,6 @@ def engine_iterate(self, num=1): for name, s in self.pr.S.items(): s.gpu.get(s.data) - # FIXXME: copy to 
pinned memory for dID, prep in self.diff_info.items(): err_fourier = prep.err_fourier_gpu.get() @@ -498,7 +514,6 @@ def engine_iterate(self, num=1): self.error = error return error - def _object_allreduce(self): # make sure that all transfers etc are finished for sd in self.streams: @@ -507,20 +522,12 @@ def _object_allreduce(self): for oID, ob in self.ob.storages.items(): obn = self.ob_nrm.S[oID] obb = self.ob_buf.S[oID] - if MPI: - obb.gpu.get(obb.data) - obn.gpu.get(obn.data) - parallel.allreduce(obb.data) - parallel.allreduce(obn.data) - obb.data /= obn.data - self.clip_object(obb) - tt1 = time.time() - ob.gpu.set(obb.data) # async tx on same stream? - - else: - obb.gpu /= obn.gpu - ob.gpu[:] = obb.gpu - + self.multigpu.allReduceSum(obb.gpu) + self.multigpu.allReduceSum(obn.gpu) + obb.gpu /= obn.gpu + + self.clip_object(obb.gpu) + ob.gpu[:] = obb.gpu ## probe update def probe_update(self, MPI=False): @@ -528,7 +535,7 @@ def probe_update(self, MPI=False): streamdata = self.streams[self.cur_stream] use_atomics = self.p.probe_update_cuda_atomics # storage for-loop - change = 0 + change_gpu = gpuarray.zeros((1,), dtype=np.float32) prev_event = None for pID, pr in self.pr.storages.items(): prn = self.pr_nrm.S[pID] @@ -560,7 +567,6 @@ def probe_update(self, MPI=False): prev_event = streamdata.end_compute() self.cur_stream = (self.cur_stream + self.stream_direction) % len(self.streams) - # sync all streams first for sd in self.streams: sd.synchronize() @@ -569,31 +575,19 @@ def probe_update(self, MPI=False): buf = self.pr_buf.S[pID] prn = self.pr_nrm.S[pID] - - # MPI test - if MPI: - # if False: - pr.gpu.get(pr.data) - prn.gpu.get(prn.data) - parallel.allreduce(pr.data) - parallel.allreduce(prn.data) - pr.data /= prn.data - self.support_constraint(pr) - pr.gpu.set(pr.data) - else: - pr.gpu /= prn.gpu - # ca. 0.3 ms - # self.pr.S[pID].gpu = probe_gpu - pr.gpu.get(pr.data) - - ## this should be done on GPU - tt1 = time.time() - change += u.norm2(pr.data - buf.data) / u.norm2(pr.data) - buf.data[:] = pr.data - if MPI: - change = parallel.allreduce(change) / parallel.size - tt2 = time.time() - #print('time for pr change: {}s'.format(tt2-tt1)) + + self.multigpu.allReduceSum(pr.gpu) + self.multigpu.allReduceSum(prn.gpu) + pr.gpu /= prn.gpu + self.support_constraint(pr) + + ## calculate change on GPU + AUK = self.kernels[list(self.kernels)[0]].AUK + buf.gpu -= pr.gpu + change_gpu += (AUK.norm2(buf.gpu) / AUK.norm2(pr.gpu)) + buf.gpu[:] = pr.gpu + self.multigpu.allReduceSum(change_gpu) + change = change_gpu.get().item() / parallel.size # print 'probe update: ' + str(time.time()-t1) self.benchmark.probe_update += time.time() - t1 @@ -601,7 +595,7 @@ def probe_update(self, MPI=False): return np.sqrt(change) - def engine_finalize(self): + def engine_finalize(self, benchmark=False): """ Clear all GPU data, pinned memory, etc """ @@ -610,4 +604,4 @@ def engine_finalize(self): self.ma_data = None self.mag_data = None - super().engine_finalize() + super().engine_finalize(benchmark) diff --git a/ptypy/accelerate/cuda_pycuda/engines/DR_pycuda.py b/ptypy/accelerate/cuda_pycuda/engines/DR_pycuda.py new file mode 100644 index 000000000..0454e753c --- /dev/null +++ b/ptypy/accelerate/cuda_pycuda/engines/DR_pycuda.py @@ -0,0 +1,288 @@ +# -*- coding: utf-8 -*- +""" +Local Douglas-Rachford reconstruction engine. + +This file is part of the PTYPY package. + + :copyright: Copyright 2014 by the PTYPY team, see AUTHORS. + :license: GPLv2, see LICENSE for details. 
+""" + +import numpy as np +import time +from pycuda import gpuarray +import pycuda.driver as cuda + +from ptypy import utils as u +from ptypy.utils.verbose import logger, log +from ptypy.utils import parallel +from ptypy.engines import register +from ptypy.accelerate.base.engines import DR_serial +from ptypy.accelerate.base import address_manglers +from .. import get_context +from ..kernels import FourierUpdateKernel, AuxiliaryWaveKernel, PoUpdateKernel, PositionCorrectionKernel, PropagationKernel +from ..array_utils import ArrayUtilsKernel, GaussianSmoothingKernel, TransposeKernel +from ..mem_utils import make_pagelocked_paired_arrays as mppa + +MPI = False + + +__all__ = ['DR_pycuda'] + +@register() +class DR_pycuda(DR_serial.DR_serial): + + """ + Defaults: + + [fft_lib] + default = reikna + type = str + help = Choose the pycuda-compatible FFT module. + doc = One of: + - ``'reikna'`` : the reikna packaga (fast load, competitive compute for streaming) + - ``'cuda'`` : ptypy's cuda wrapper (delayed load, but fastest compute if all data is on GPU) + - ``'skcuda'`` : scikit-cuda (fast load, slowest compute due to additional store/load stages) + choices = 'reikna','cuda','skcuda' + userlevel = 2 + + """ + + def __init__(self, ptycho_parent, pars=None): + """ + Difference map reconstruction engine. + """ + super(DR_pycuda, self).__init__(ptycho_parent, pars) + + + def engine_initialize(self): + """ + Prepare for reconstruction. + """ + self.context, self.queue = get_context(new_context=True, new_queue=True) + + super(DR_pycuda, self).engine_initialize() + + def _setup_kernels(self): + """ + Setup kernels, one for each scan. Derive scans from ptycho class + """ + # get the scans + for label, scan in self.ptycho.model.scans.items(): + + kern = u.Param() + self.kernels[label] = kern + # TODO: needs to be adapted for broad bandwidth + geo = scan.geometries[0] + + # Get info to shape buffer arrays + # TODO: make this part of the engine rather than scan + fpc = self.ptycho.frames_per_block + + # Currently modes not implemented for DR algorithm + #assert scan.p.coherence.num_probe_modes == 1 + #assert scan.p.coherence.num_object_modes == 1 + try: + nmodes = scan.p.coherence.num_probe_modes * \ + scan.p.coherence.num_object_modes + except: + nmodes = 1 + + # create buffer arrays + fpc = 1 + ash = (fpc * nmodes,) + tuple(geo.shape) + aux = np.zeros(ash, dtype=np.complex64) + kern.aux = gpuarray.to_gpu(aux) + + # setup kernels, one for each SCAN. 
+ logger.info("Setting up FourierUpdateKernel") + kern.FUK = FourierUpdateKernel(aux, nmodes, queue_thread=self.queue) + kern.FUK.fshape = (1,) + kern.FUK.fshape[1:] + kern.FUK.allocate() + + logger.info("Setting up PoUpdateKernel") + kern.POK = PoUpdateKernel(queue_thread=self.queue) + kern.POK.allocate() + + logger.info("Setting up AuxiliaryWaveKernel") + kern.AWK = AuxiliaryWaveKernel(queue_thread=self.queue) + kern.AWK.allocate() + + logger.info("Setting up ArrayUtilsKernel") + kern.AUK = ArrayUtilsKernel(queue=self.queue) + + #logger.info("Setting up TransposeKernel") + #kern.TK = TransposeKernel(queue=self.queue) + + logger.info("Setting up PropagationKernel") + kern.PROP = PropagationKernel(aux, geo.propagator, self.queue, self.p.fft_lib) + kern.PROP.allocate() + kern.resolution = geo.resolution[0] + + # if self.do_position_refinement: + # logger.info("Setting up position correction") + # addr_mangler = address_manglers.RandomIntMangle(int(self.p.position_refinement.amplitude // geo.resolution[0]), + # self.p.position_refinement.start, + # self.p.position_refinement.stop, + # max_bound=int(self.p.position_refinement.max_shift // geo.resolution[0]), + # randomseed=0) + # logger.warning("amplitude is %s " % (self.p.position_refinement.amplitude // geo.resolution[0])) + # logger.warning("max bound is %s " % (self.p.position_refinement.max_shift // geo.resolution[0])) + + # kern.PCK = PositionCorrectionKernel(aux, nmodes, queue_thread=self.queue) + # kern.PCK.allocate() + # kern.PCK.address_mangler = addr_mangler + + logger.info("Kernel setup completed") + + + def engine_prepare(self): + + super(DR_pycuda, self).engine_prepare() + + for name, s in self.ob.S.items(): + s.gpu = gpuarray.to_gpu(s.data) + for name, s in self.pr.S.items(): + s.gpu, s.data = mppa(s.data) + + # TODO : like the serialization this one is needed due to object reformatting + for label, d in self.di.storages.items(): + prep = self.diff_info[d.ID] + prep.addr_gpu = gpuarray.to_gpu(prep.addr) + + for label, d in self.ptycho.new_data: + prep = self.diff_info[d.ID] + prep.ex = gpuarray.to_gpu(prep.ex) + prep.mag = gpuarray.to_gpu(prep.mag) + prep.ma = gpuarray.to_gpu(prep.ma) + prep.ma_sum = gpuarray.to_gpu(prep.ma_sum) + prep.err_fourier_gpu = gpuarray.to_gpu(prep.err_fourier) + prep.err_phot_gpu = gpuarray.to_gpu(prep.err_phot) + prep.err_exit_gpu = gpuarray.to_gpu(prep.err_exit) + # if self.do_position_refinement: + # prep.error_state_gpu = gpuarray.empty_like(prep.err_fourier_gpu) + + + def engine_iterate(self, num=1): + """ + Compute one iteration. 
+ """ + queue = self.queue + error = {} + for it in range(num): + + for dID in self.di.S.keys(): + + # find probe, object and exit ID in dependence of dID + prep = self.diff_info[dID] + pID, oID, eID = prep.poe_IDs + + # references for kernels + kern = self.kernels[prep.label] + FUK = kern.FUK + AWK = kern.AWK + POK = kern.POK + PROP = kern.PROP + + # get aux buffer + aux = kern.aux + + # local references + ob = self.ob.S[oID].gpu + pr = self.pr.S[pID].gpu + + # shuffle view order + vieworder = prep.vieworder + prep.rng.shuffle(vieworder) + + # Iterate through views + for i in vieworder: + + # Get local adress and arrays + addr = prep.addr_gpu[i,None] + ex_from, ex_to = prep.addr_ex[i] + ex = prep.ex[ex_from:ex_to] + mag = prep.mag[i,None] + ma = prep.ma[i,None] + ma_sum = prep.ma_sum[i,None] + err_phot = prep.err_phot_gpu[i,None] + err_fourier = prep.err_fourier_gpu[i,None] + err_exit = prep.err_exit_gpu[i,None] + + ## build auxilliary wave + AWK.build_aux2(aux, addr, ob, pr, ex, alpha=self.p.alpha) + + ## forward FFT + PROP.fw(aux, aux) + + ## Deviation from measured data + if self.p.compute_fourier_error: + FUK.fourier_error(aux, addr, mag, ma, ma_sum) + FUK.error_reduce(addr, err_fourier) + else: + FUK.fourier_deviation(aux, addr, mag) + FUK.fmag_update_nopbound(aux, addr, mag, ma) + + ## backward FFT + PROP.bw(aux, aux) + + ## build exit wave + AWK.build_exit_alpha_tau(aux, addr, ob, pr, ex, alpha=self.p.alpha, tau=self.p.tau) + if self.p.compute_exit_error: + FUK.exit_error(aux,addr) + FUK.error_reduce(addr, err_exit) + + ## probe/object rescale + #if self.p.rescale_probe: + # pr *= np.sqrt(self.mean_power / (np.abs(pr)**2).mean()) + + ## build auxilliary wave (ob * pr product) + AWK.build_aux2_no_ex(aux, addr, ob, pr) + + # object update + POK.ob_update_local(addr, ob, pr, ex, aux) + + # probe update + POK.pr_update_local(addr, pr, ob, ex, aux) + + ## compute log-likelihood + if self.p.compute_log_likelihood: + PROP.fw(aux, aux) + FUK.log_likelihood2(aux, addr, mag, ma, err_phot) + + self.curiter += 1 + + queue.synchronize() + for name, s in self.ob.S.items(): + s.gpu.get(s.data) + for name, s in self.pr.S.items(): + s.gpu.get(s.data) + + for dID, prep in self.diff_info.items(): + err_fourier = prep.err_fourier_gpu.get() + err_phot = prep.err_phot_gpu.get() + err_exit = prep.err_exit_gpu.get() + errs = np.ascontiguousarray(np.vstack([err_fourier, err_phot, err_exit]).T) + error.update(zip(prep.view_IDs, errs)) + + self.error = error + return error + + def engine_finalize(self): + """ + clear GPU data and destroy context. + """ + for name, s in self.ob.S.items(): + del s.gpu + for name, s in self.pr.S.items(): + del s.gpu + for dID, prep in self.diff_info.items(): + prep.addr = prep.addr_gpu.get() + + # copy data to cpu + # this kills the pagelock memory (otherwise we get segfaults in h5py) + for name, s in self.pr.S.items(): + s.data = np.copy(s.data) + + self.context.detach() + super(DR_pycuda, self).engine_finalize() \ No newline at end of file diff --git a/ptypy/accelerate/cuda_pycuda/engines/DR_pycuda_stream.py b/ptypy/accelerate/cuda_pycuda/engines/DR_pycuda_stream.py new file mode 100644 index 000000000..fd8dd4b5e --- /dev/null +++ b/ptypy/accelerate/cuda_pycuda/engines/DR_pycuda_stream.py @@ -0,0 +1,260 @@ +# -*- coding: utf-8 -*- +""" +Local Douglas-Rachford reconstruction engine for NVIDIA GPUs. + +This engine uses three streams, one for the compute queue and one for each I/O queue. +Events are used to synchronize download / compute/ upload. 
we cannot manipulate memory +for each loop over the state vector, a certain number of memory sections is preallocated +and reused. + +This file is part of the PTYPY package. + + :copyright: Copyright 2014 by the PTYPY team, see AUTHORS. + :license: GPLv2, see LICENSE for details. +""" + +from ptypy.accelerate.cuda_pycuda.engines.DM_pycuda_stream import DM_pycuda_stream +import numpy as np +from pycuda import gpuarray +import pycuda.driver as cuda + +from ptypy import utils as u +from ptypy.utils.verbose import logger, log +from ptypy.utils import parallel +from ptypy.engines import register +from . import DR_pycuda + +from ..mem_utils import make_pagelocked_paired_arrays as mppa +from ..mem_utils import GpuDataManager2 + +MPI = False + +EX_MA_BLOCKS_RATIO = 2 +MAX_BLOCKS = 99999 # can be used to limit the number of blocks, simulating that they don't fit +#MAX_BLOCKS = 4 # can be used to limit the number of blocks, simulating that they don't fit + +__all__ = ['DR_pycuda_stream'] + +@register() +class DR_pycuda_stream(DR_pycuda.DR_pycuda): + + def __init__(self, ptycho_parent, pars=None): + + super(DR_pycuda_stream, self).__init__(ptycho_parent, pars) + self.ma_data = None + self.mag_data = None + self.ex_data = None + + def engine_initialize(self): + super().engine_initialize() + self.qu_htod = cuda.Stream() + self.qu_dtoh = cuda.Stream() + + def _setup_kernels(self): + super()._setup_kernels() + ex_mem = 0 + mag_mem = 0 + fpc = self.ptycho.frames_per_block + for scan, kern in self.kernels.items(): + ex_mem = max(kern.aux.nbytes * fpc, ex_mem) + mag_mem = max(kern.FUK.gpu.fdev.nbytes * fpc, mag_mem) + ma_mem = mag_mem + mem = cuda.mem_get_info()[0] + blk = ex_mem * EX_MA_BLOCKS_RATIO + ma_mem + mag_mem + fit = int(mem - 200 * 1024 * 1024) // blk # leave 200MB room for safety + + # TODO grow blocks dynamically + nex = min(fit * EX_MA_BLOCKS_RATIO, MAX_BLOCKS) + nma = min(fit, MAX_BLOCKS) + + log(3, 'PyCUDA max blocks fitting on GPU: exit arrays={}, ma_arrays={}'.format(nex, nma)) + # reset memory or create new + self.ex_data = GpuDataManager2(ex_mem, 0, nex, True) + self.ma_data = GpuDataManager2(ma_mem, 0, nma, False) + self.mag_data = GpuDataManager2(mag_mem, 0, nma, False) + + def engine_prepare(self): + + super(DR_pycuda.DR_pycuda, self).engine_prepare() + + for name, s in self.ob.S.items(): + s.gpu, s.data = mppa(s.data) + for name, s in self.pr.S.items(): + s.gpu, s.data = mppa(s.data) + + for label, d in self.di.storages.items(): + prep = self.diff_info[d.ID] + prep.addr_gpu = gpuarray.to_gpu(prep.addr) + + for label, d in self.ptycho.new_data: + dID = d.ID + prep = self.diff_info[dID] + pID, oID, eID = prep.poe_IDs + + prep.ma_sum_gpu = gpuarray.to_gpu(prep.ma_sum) + # prepare page-locked mems: + prep.err_fourier_gpu = gpuarray.to_gpu(prep.err_fourier) + prep.err_phot_gpu = gpuarray.to_gpu(prep.err_phot) + prep.err_exit_gpu = gpuarray.to_gpu(prep.err_exit) + ma = self.ma.S[dID].data.astype(np.float32) + prep.ma = cuda.pagelocked_empty(ma.shape, ma.dtype, order="C", mem_flags=4) + prep.ma[:] = ma + ex = self.ex.S[eID].data + prep.ex = cuda.pagelocked_empty(ex.shape, ex.dtype, order="C", mem_flags=4) + prep.ex[:] = ex + mag = prep.mag + prep.mag = cuda.pagelocked_empty(mag.shape, mag.dtype, order="C", mem_flags=4) + prep.mag[:] = mag + + self.ex_data.add_data_block() + self.ma_data.add_data_block() + self.mag_data.add_data_block() + + def engine_iterate(self, num=1): + """ + Compute one iteration. 
+ """ + self.dID_list = list(self.di.S.keys()) + error = {} + + for it in range(num): + + for iblock, dID in enumerate(self.dID_list): + + # find probe, object and exit ID in dependence of dID + prep = self.diff_info[dID] + pID, oID, eID = prep.poe_IDs + + # references for kernels + kern = self.kernels[prep.label] + FUK = kern.FUK + AWK = kern.AWK + POK = kern.POK + PROP = kern.PROP + + # get aux buffer + aux = kern.aux + + # local references + ob = self.ob.S[oID].gpu + pr = self.pr.S[pID].gpu + + # shuffle view order + vieworder = prep.vieworder + prep.rng.shuffle(vieworder) + + # Schedule ex, ma, mag to device + ev_ex, ex_full, data_ex = self.ex_data.to_gpu(prep.ex, dID, self.qu_htod) + ev_mag, mag_full, data_mag = self.mag_data.to_gpu(prep.mag, dID, self.qu_htod) + ev_ma, ma_full, data_ma = self.ma_data.to_gpu(prep.ma, dID, self.qu_htod) + + ## synchronize h2d stream with compute stream + self.queue.wait_for_event(ev_ex) + + # Iterate through views + for i in vieworder: + + # Get local adress and arrays + addr = prep.addr_gpu[i,None] + ex = ex_full[i,None] + mag = mag_full[i,None] + ma = ma_full[i,None] + ma_sum = prep.ma_sum[i,None] + err_phot = prep.err_phot_gpu[i,None] + err_fourier = prep.err_fourier_gpu[i,None] + err_exit = prep.err_exit_gpu[i,None] + + ## build auxilliary wave + AWK.build_aux2(aux, addr, ob, pr, ex, alpha=self.p.alpha) + + ## forward FFT + PROP.fw(aux, aux) + + ## Deviation from measured data + self.queue.wait_for_event(ev_mag) + if self.p.compute_fourier_error: + self.queue.wait_for_event(ev_ma) + FUK.fourier_error(aux, addr, mag, ma, ma_sum) + FUK.error_reduce(addr, err_fourier) + else: + FUK.fourier_deviation(aux, addr, mag) + self.queue.wait_for_event(ev_ma) + FUK.fmag_update_nopbound(aux, addr, mag, ma) + + ## backward FFT + PROP.bw(aux, aux) + + ## build exit wave + AWK.build_exit_alpha_tau(aux, addr, ob, pr, ex, alpha=self.p.alpha, tau=self.p.tau) + if self.p.compute_exit_error: + FUK.exit_error(aux,addr) + FUK.error_reduce(addr, err_exit) + + ## probe/object rescale + #if self.p.rescale_probe: + # pr *= np.sqrt(self.mean_power / (np.abs(pr)**2).mean()) + + ## build auxilliary wave (ob * pr product) + AWK.build_aux2_no_ex(aux, addr, ob, pr) + + # object update + POK.ob_update_local(addr, ob, pr, ex, aux) + + # probe update + POK.pr_update_local(addr, pr, ob, ex, aux) + + ## compute log-likelihood + if self.p.compute_log_likelihood: + PROP.fw(aux, aux) + FUK.log_likelihood2(aux, addr, mag, ma, err_phot) + + data_ex.record_done(self.queue, 'compute') + if iblock + len(self.ex_data) < len(self.dID_list): + data_ex.from_gpu(self.qu_dtoh) + + # swap direction + self.dID_list.reverse() + + self.curiter += 1 + self.ex_data.syncback = False + + # finish all the compute + self.queue.synchronize() + + for name, s in self.ob.S.items(): + s.gpu.get_async(stream=self.qu_dtoh, ary=s.data) + for name, s in self.pr.S.items(): + s.gpu.get_async(stream=self.qu_dtoh, ary=s.data) + + for dID, prep in self.diff_info.items(): + prep.err_fourier_gpu.get(prep.err_fourier) + prep.err_phot_gpu.get(prep.err_phot) + prep.err_exit_gpu.get(prep.err_exit) + errs = np.ascontiguousarray(np.vstack([ + prep.err_fourier, prep.err_phot, prep.err_exit + ]).T) + error.update(zip(prep.view_IDs, errs)) + + # wait for the async transfers + self.qu_dtoh.synchronize() + + self.error = error + return error + + def engine_finalize(self): + """ + Clear all GPU data, pinned memory, etc + """ + self.ex_data = None + self.ma_data = None + self.mag_data = None + + # replacing page-locked data with normal 
npy to avoid + # crash on context destroy + for name, s in self.pr.S.items(): + s.data = np.copy(s.data) + for name, s in self.ob.S.items(): + s.data = np.copy(s.data) + + super().engine_finalize() + \ No newline at end of file diff --git a/ptypy/accelerate/cuda_pycuda/engines/ML_pycuda.py b/ptypy/accelerate/cuda_pycuda/engines/ML_pycuda.py index 4112df968..5f36b9121 100644 --- a/ptypy/accelerate/cuda_pycuda/engines/ML_pycuda.py +++ b/ptypy/accelerate/cuda_pycuda/engines/ML_pycuda.py @@ -23,8 +23,7 @@ from ptypy.utils.verbose import logger from ptypy.utils import parallel from .. import get_context -from ..kernels import GradientDescentKernel, AuxiliaryWaveKernel, PoUpdateKernel, \ - PositionCorrectionKernel, PropagationKernel +from ..kernels import GradientDescentKernel, AuxiliaryWaveKernel, PoUpdateKernel, PropagationKernel from ..array_utils import ArrayUtilsKernel, DerivativesKernel, GaussianSmoothingKernel from ptypy.accelerate.base import address_manglers @@ -160,6 +159,7 @@ def engine_initialize(self): self.queue_transfer = cuda.Stream() self.GSK = GaussianSmoothingKernel(queue=self.queue) + self.GSK.tmp = None super().engine_initialize() #self._setup_kernels() @@ -168,13 +168,6 @@ def _setup_kernels(self): """ Setup kernels, one for each scan. Derive scans from ptycho class """ - - try: - from ptypy.accelerate.cuda_pycuda.cufft import FFT - except: - logger.warning('Unable to import cuFFT version - using Reikna instead') - from ptypy.accelerate.cuda_pycuda.fft import FFT - AUK = ArrayUtilsKernel(queue=self.queue) self._dot_kernel = AUK.dot # get the scans @@ -208,7 +201,7 @@ def _setup_kernels(self): kern.GDK = GradientDescentKernel(aux, nmodes, queue=self.queue) kern.GDK.allocate() - kern.POK = PoUpdateKernel(queue_thread=self.queue, denom_type=np.float32) + kern.POK = PoUpdateKernel(queue_thread=self.queue) kern.POK.allocate() kern.AWK = AuxiliaryWaveKernel(queue_thread=self.queue) @@ -217,20 +210,6 @@ def _setup_kernels(self): kern.PROP = PropagationKernel(aux, geo.propagator, queue_thread=self.queue) kern.PROP.allocate() - - if self.do_position_refinement: - addr_mangler = address_manglers.RandomIntMangle(int(self.p.position_refinement.amplitude // geo.resolution[0]), - self.p.position_refinement.start, - self.p.position_refinement.stop, - max_bound=int(self.p.position_refinement.max_shift // geo.resolution[0]), - randomseed=0) - logger.warning("amplitude is %s " % (self.p.position_refinement.amplitude // geo.resolution[0])) - logger.warning("max bound is %s " % (self.p.position_refinement.max_shift // geo.resolution[0])) - - kern.PCK = PositionCorrectionKernel(aux, nmodes, queue_thread=self.queue) - kern.PCK.allocate() - kern.PCK.address_mangler = addr_mangler - def _initialize_model(self): # Create noise model @@ -264,9 +243,10 @@ def _set_pr_ob_ref_for_data(self, dev='gpu', container=None, sync_copy=False): self._set_pr_ob_ref_for_data(dev=dev, container=container, sync_copy=sync_copy) def _get_smooth_gradient(self, data, sigma): - tmp = gpuarray.empty(data.shape, dtype=np.complex64) - self.GSK.convolution(data, tmp, [sigma, sigma]) - return tmp + if self.GSK.tmp is None: + self.GSK.tmp = gpuarray.empty(data.shape, dtype=np.complex64) + self.GSK.convolution(data, [sigma, sigma], tmp=self.GSK.tmp) + return data def _replace_ob_grad(self): new_ob_grad = self.ob_grad_new diff --git a/ptypy/accelerate/cuda_pycuda/import_fft.py b/ptypy/accelerate/cuda_pycuda/import_fft.py deleted file mode 100644 index 6a3d3312e..000000000 --- a/ptypy/accelerate/cuda_pycuda/import_fft.py +++ 
/dev/null @@ -1,176 +0,0 @@ -''' -"Just-in-time" compilation for callbacks in cufft. -''' -import os -import sys -import importlib -import tempfile -import setuptools -import sysconfig -from pycuda import driver as cuda_driver -import pybind11 -import contextlib -from io import StringIO -from ptypy.utils.verbose import log -import distutils -from distutils.unixccompiler import UnixCCompiler -from distutils.command.build_ext import build_ext - - -def find_in_path(name, path): - "Find a file in a search path" - # adapted fom http://code.activestate.com/recipes/52224-find-a-file-given-a-search-path/ - for dir in path.split(os.pathsep): - binpath = os.path.join(dir, name) - if os.path.exists(binpath): - return os.path.abspath(binpath) - return None - -def locate_cuda(): - """ - Locate the CUDA environment on the system - Returns a dict with keys 'home', 'nvcc', 'include', and 'lib64' - and values giving the absolute path to each directory. - Starts by looking for the CUDAHOME env variable. If not found, everything - is based on finding 'nvcc' in the PATH. - """ - # first check if the CUDAHOME env variable is in use - if 'CUDAHOME' in os.environ: - home = os.environ['CUDAHOME'] - nvcc = os.path.join(home, 'bin', 'nvcc') - else: - # otherwise, search the PATH for NVCC - nvcc = find_in_path('nvcc', os.environ['PATH']) - if nvcc is None: - raise EnvironmentError('The nvcc binary could not be ' - 'located in your $PATH. Either add it to your path, or set $CUDAHOME') - home = os.path.dirname(os.path.dirname(nvcc)) - - cudaconfig = {'home': home, 'nvcc': nvcc, - 'include': os.path.join(home, 'include'), - 'lib64': os.path.join(home, 'lib64')} - for k, v in cudaconfig.items(): - if not os.path.exists(v): - raise EnvironmentError('The CUDA %s path could not be located in %s' % (k, v)) - return cudaconfig - -class NvccCompiler(UnixCCompiler): - def __init__(self, *args, **kwargs): - super(NvccCompiler, self).__init__(*args, **kwargs) - self.CUDA = locate_cuda() - module_dir = os.path.join(__file__.strip('import_fft.py'), 'cuda', 'filtered_fft') - cmp = cuda_driver.Context.get_device().compute_capability() - archflag = '-arch=sm_{}{}'.format(cmp[0], cmp[1]) - self.src_extensions.append('.cu') - self.LD_FLAGS = [archflag, "-lcufft_static", "-lculibos", "-ldl", "-lrt", "-lpthread", "-cudart shared"] - self.NVCC_FLAGS = ["-dc", archflag] - self.CXXFLAGS = ['"-fPIC"'] - pybind_includes = [pybind11.get_include(), sysconfig.get_path('include')] - INCLUDES = pybind_includes + [self.CUDA['lib64'], module_dir] - self.INCLUDES = ["-I%s" % ix for ix in INCLUDES] - self.OPTFLAGS = ["-O3", "-std=c++14"] - - def _compile(self, obj, src, ext, cc_args, extra_postargs, pp_opts): - default_compiler_so = self.compiler_so - CPPFLAGS = self.INCLUDES + extra_postargs # little hack here, since postargs usually goes at the end, which we won't do. 
- # makefile line is - # $(NVCC) $(NVCC_FLAGS) $(OPTFLAGS) -Xcompiler "$(CXXFLAGS)" $(CPPFLAGS) - compiler_command = [self.CUDA["nvcc"]] + self.NVCC_FLAGS + self.OPTFLAGS + ["-Xcompiler"] + self.CXXFLAGS + CPPFLAGS - compiler_exec = " ".join(compiler_command) - self.set_executable('compiler_so', compiler_exec) - postargs = [] # we don't actually have any postargs - super(NvccCompiler, self)._compile(obj, src, ext, cc_args, postargs, pp_opts) # the _compile method - # reset the default compiler_so, which we might have changed for cuda - self.compiler_so = default_compiler_so - - def link(self, target_desc, objects, - output_filename, output_dir=None, libraries=None, - library_dirs=None, runtime_library_dirs=None, - export_symbols=None, debug=0, extra_preargs=None, - extra_postargs=None, build_temp=None, target_lang=None): - default_linker_so = self.linker_so - # make file line is - # $(NVCC) $(OPTFLAGS) -shared $(LD_FLAGS) $(OBJ) $(OBJ_MOD) -o $@ - linker_command = [self.CUDA["nvcc"]] + self.OPTFLAGS + ["-shared"] + self.LD_FLAGS - linker_exec = " ".join(linker_command) - self.set_executable('linker_so', linker_exec) - super(NvccCompiler, self).link(target_desc, objects, - output_filename, output_dir=None, libraries=None, - library_dirs=None, runtime_library_dirs=None, - export_symbols=None, debug=0, extra_preargs=None, - extra_postargs=None, build_temp=None, target_lang=None) - self.linker_so = default_linker_so - -class CustomBuildExt(build_ext): - def build_extensions(self): - old_compiler = self.compiler - self.compiler = NvccCompiler(verbose=old_compiler.verbose, - dry_run=old_compiler.dry_run, - force=old_compiler.force) # this is our bespoke compiler - super(CustomBuildExt, self).build_extensions() - self.compiler=old_compiler - -@contextlib.contextmanager -def stdchannel_redirected(stdchannel): - """ - Redirects stdout or stderr to a StringIO object. As of python 3.4, there is a - standard library contextmanager for this, but backwards compatibility! - """ - old = getattr(sys, stdchannel) - try: - s = StringIO() - setattr(sys, stdchannel, s) - yield s - finally: - setattr(sys, stdchannel, old) - - -class ImportFFT: - def __init__(self, rows, columns, build_path=None, quiet=True): - self.build_path = build_path - self.cleanup_build_path = None - if self.build_path is None: - self.build_path = tempfile.mkdtemp(prefix="ptypy_fft") - self.cleanup_build_path = True - - full_module_name = "module" - module_dir = os.path.join(__file__.strip('import_fft.py'), 'cuda', 'filtered_fft') - # If we specify the libraries through the extension we soon run into trouble since distutils adds a -l infront of all of these (add_library_option:https://github.com/python/cpython/blob/1c1e68cf3e3a2a19a0edca9a105273e11ddddc6e/Lib/distutils/ccompiler.py#L1115) - ext = distutils.extension.Extension(full_module_name, - sources=[os.path.join(module_dir, "module.cpp"), - os.path.join(module_dir, "filtered_fft.cu")], - extra_compile_args=["-DMY_FFT_COLS=%s" % str(columns) , "-DMY_FFT_ROWS=%s" % str(rows)]) - - script_args = ['build_ext', - '--build-temp=%s' % self.build_path, - '--build-lib=%s' % self.build_path] - # do I need full_module_name here? 
- setuptools_args = {"name": full_module_name, - "ext_modules": [ext], - "script_args": script_args, - "cmdclass":{"build_ext": CustomBuildExt - }} - - if quiet: - # we really don't care about the make print for almost all cases so we redirect - with stdchannel_redirected("stdout"): - with stdchannel_redirected("stderr"): - setuptools.setup(**setuptools_args) - else: - setuptools.setup(**setuptools_args) - - spec = importlib.util.spec_from_file_location(full_module_name, - os.path.join(self.build_path, - "module" + distutils.sysconfig.get_config_var('EXT_SUFFIX') - ) - ) - self.mod = importlib.util.module_from_spec(spec) - - def get_mod(self): - return self.mod - - def __del__(self): - import shutil - if self.cleanup_build_path: - log(5, "cleaning up the build directory") - shutil.rmtree(self.build_path) diff --git a/ptypy/accelerate/cuda_pycuda/kernels.py b/ptypy/accelerate/cuda_pycuda/kernels.py index 9064ab593..47dd4cb79 100644 --- a/ptypy/accelerate/cuda_pycuda/kernels.py +++ b/ptypy/accelerate/cuda_pycuda/kernels.py @@ -3,9 +3,28 @@ from pycuda import gpuarray from ptypy.utils.verbose import log, logger from . import load_kernel +from .array_utils import CropPadKernel +from .array_utils import MaxAbs2Kernel from ..base import kernels as ab from ..base.kernels import Adict +def choose_fft(fft_type): + if fft_type=='cuda': + try: + from ptypy.accelerate.cuda_pycuda.cufft import FFT_cuda as FFT + except: + logger.warning('Unable to import cufft version - using Reikna instead') + from ptypy.accelerate.cuda_pycuda.fft import FFT + elif fft_type=='skcuda': + try: + from ptypy.accelerate.cuda_pycuda.cufft import FFT_skcuda as FFT + except: + logger.warning('Unable to import skcuda.fft version - using Reikna instead') + from ptypy.accelerate.cuda_pycuda.fft import FFT + else: + from ptypy.accelerate.cuda_pycuda.fft import FFT + return FFT + class PropagationKernel: def __init__(self, aux, propagator, queue_thread=None, fft='reikna'): @@ -22,35 +41,49 @@ def __init__(self, aux, propagator, queue_thread=None, fft='reikna'): def allocate(self): aux = self.aux - - if self._fft_type=='cuda': - try: - from ptypy.accelerate.cuda_pycuda.cufft import FFT_cuda as FFT - except: - logger.warning('Unable to import cufft version - using Reikna instead') - from ptypy.accelerate.cuda_pycuda.fft import FFT - elif self._fft_type=='skcuda': - try: - from ptypy.accelerate.cuda_pycuda.cufft import FFT_skcuda as FFT - except: - logger.warning('Unable to import skcuda.fft version - using Reikna instead') - from ptypy.accelerate.cuda_pycuda.fft import FFT - else: - from ptypy.accelerate.cuda_pycuda.fft import FFT + FFT = choose_fft(self._fft_type) if self.prop_type == 'farfield': - self._fft1 = FFT(aux, self.queue, + + self._do_crop_pad = (self._p.crop_pad != 0).any() + if self._do_crop_pad: + self._tmp = np.zeros(aux.shape + self._p.crop_pad, dtype=aux.dtype) + self._CPK = CropPadKernel(queue=self._queue) + else: + self._tmp = aux + + self._fft1 = FFT(self._tmp, self.queue, pre_fft=self._p.pre_fft, post_fft=self._p.post_fft, symmetric=True, forward=True) - self._fft2 = FFT(aux, self.queue, + self._fft2 = FFT(self._tmp, self.queue, pre_fft=self._p.pre_ifft, post_fft=self._p.post_ifft, symmetric=True, forward=False) - self.fw = self._fft1.ft - self.bw = self._fft2.ift + if self._do_crop_pad: + self._tmp = gpuarray.to_gpu(self._tmp) + + def _fw(x,y): + if self._do_crop_pad: + self._CPK.crop_pad_2d_simple(self._tmp, x) + self._fft1.ft(self._tmp, self._tmp) + self._CPK.crop_pad_2d_simple(y, self._tmp) + else: + 
self._fft1.ft(x,y) + + def _bw(x,y): + if self._do_crop_pad: + self._CPK.crop_pad_2d_simple(self._tmp, x) + self._fft2.ift(self._tmp, self._tmp) + self._CPK.crop_pad_2d_simple(y, self._tmp) + else: + self._fft2.ift(x,y) + + self.fw = _fw + self.bw = _bw + elif self.prop_type == "nearfield": self._fft1 = FFT(aux, self.queue, post_fft=self._p.kernel, @@ -90,18 +123,78 @@ def queue(self, queue): if self.prop_type == "nearfield": self._fft3.queue = queue +class FourierSupportKernel: + def __init__(self, support, queue_thread=None, fft='reikna'): + self.support = support + self.queue = queue_thread + self._fft_type = fft + def allocate(self): + FFT = choose_fft(self._fft_type) + + self._fft1 = FFT(self.support, self.queue, + post_fft=self.support, + symmetric=True, + forward=True) + self._fft2 = FFT(self.support, self.queue, + symmetric=True, + forward=False) + def apply_fourier_support(self,x): + self._fft1.ft(x,x) + self._fft2.ift(x,x) + +class RealSupportKernel: + def __init__(self, support): + self.support = support + def allocate(self): + self.support = gpuarray.to_gpu(self.support) + def apply_real_support(self, x): + x *= self.support + class FourierUpdateKernel(ab.FourierUpdateKernel): - def __init__(self, aux, nmodes=1, queue_thread=None): + def __init__(self, aux, nmodes=1, queue_thread=None, accumulate_type='float', math_type='float'): super(FourierUpdateKernel, self).__init__(aux, nmodes=nmodes) + + if accumulate_type not in ['float', 'double']: + raise ValueError('Only float or double types are supported') + if math_type not in ['float', 'double']: + raise ValueError('Only float or double types are supported') + self.accumulate_type = accumulate_type + self.math_type = math_type self.queue = queue_thread - self.fmag_all_update_cuda = load_kernel("fmag_all_update") - self.fourier_error_cuda = load_kernel("fourier_error") + self.fmag_all_update_cuda = load_kernel("fmag_all_update", { + 'IN_TYPE': 'float', + 'OUT_TYPE': 'float', + 'MATH_TYPE': self.math_type + }) + self.fmag_update_nopbound_cuda = None + self.fourier_deviation_cuda = None + self.fourier_error_cuda = load_kernel("fourier_error", { + 'IN_TYPE': 'float', + 'OUT_TYPE': 'float', + 'MATH_TYPE': self.math_type + }) self.fourier_error2_cuda = None - self.error_reduce_cuda = load_kernel("error_reduce") + self.error_reduce_cuda = load_kernel("error_reduce", { + 'IN_TYPE': 'float', + 'OUT_TYPE': 'float', + 'ACC_TYPE': self.accumulate_type, + 'BDIM_X': 32, + 'BDIM_Y': 32, + }) self.fourier_update_cuda = None - self.log_likelihood_cuda = load_kernel("log_likelihood") - self.exit_error_cuda = load_kernel("exit_error") + self.log_likelihood_cuda, self.log_likelihood2_cuda = load_kernel( + ("log_likelihood", "log_likelihood2"), { + 'IN_TYPE': 'float', + 'OUT_TYPE': 'float', + 'MATH_TYPE': self.math_type + }, + "log_likelihood.cu") + self.exit_error_cuda = load_kernel("exit_error", { + 'IN_TYPE': 'float', + 'OUT_TYPE': 'float', + 'MATH_TYPE': self.math_type + }) self.gpu = Adict() self.gpu.fdev = None @@ -156,6 +249,28 @@ def fourier_error(self, f, addr, fmag, fmask, mask_sum): shared=int(bx*by*bz*4), stream=self.queue) + def fourier_deviation(self, f, addr, fmag): + fdev = self.gpu.fdev + if self.fourier_deviation_cuda is None: + self.fourier_deviation_cuda = load_kernel("fourier_deviation",{ + 'IN_TYPE': 'float', + 'OUT_TYPE': 'float', + 'MATH_TYPE': self.math_type + }) + bx = 64 + by = 1 + self.fourier_deviation_cuda(np.int32(self.nmodes), + f, + fmag, + fdev, + addr, + np.int32(self.fshape[1]), + np.int32(self.fshape[2]), + 
block=(bx, by, 1), + grid=(1, int((self.fshape[2] + by - 1)//by), int(fmag.shape[0])), + stream=self.queue) + + def error_reduce(self, addr, err_sum): self.error_reduce_cuda(self.gpu.ferr, err_sum, @@ -163,7 +278,6 @@ def error_reduce(self, addr, err_sum): np.int32(self.fshape[2]), block=(32, 32, 1), grid=(int(err_sum.shape[0]), 1, 1), - shared=32*32*4, stream=self.queue) def fmag_all_update(self, f, addr, fmag, fmask, err_fmag, pbound=0.0): @@ -180,6 +294,29 @@ def fmag_all_update(self, f, addr, fmag, fmask, err_fmag, pbound=0.0): block=(32, 32, 1), grid=(int(fmag.shape[0]*self.nmodes), 1, 1), stream=self.queue) + + def fmag_update_nopbound(self, f, addr, fmag, fmask): + fdev = self.gpu.fdev + bx = 64 + by = 1 + if self.fmag_update_nopbound_cuda is None: + self.fmag_update_nopbound_cuda = load_kernel("fmag_update_nopbound", { + 'IN_TYPE': 'float', + 'OUT_TYPE': 'float', + 'MATH_TYPE': self.math_type + }) + self.fmag_update_nopbound_cuda(f, + fmask, + fmag, + fdev, + addr, + np.int32(self.fshape[1]), + np.int32(self.fshape[2]), + block=(bx, by, 1), + grid=(1, + int((self.fshape[2] + by - 1) // by), + int(fmag.shape[0]*self.nmodes)), + stream=self.queue) # Note: this was a test to join the kernels, but it's > 2x slower! def fourier_update(self, f, addr, fmag, fmask, mask_sum, err_fmag, pbound=0): @@ -229,6 +366,24 @@ def log_likelihood(self, b_aux, addr, mag, mask, err_phot): # TODO: we might want to move this call outside of here self.error_reduce(addr, err_phot) + def log_likelihood2(self, b_aux, addr, mag, mask, err_phot): + ferr = self.gpu.ferr + bx = 64 + by = 1 + self.log_likelihood2_cuda(np.int32(self.nmodes), + b_aux, + mask, + mag, + addr, + ferr, + np.int32(self.fshape[1]), + np.int32(self.fshape[2]), + block=(bx, by, 1), + grid=(1, int((self.fshape[1] + by - 1) // by), int(mag.shape[0])), + stream=self.queue) + # TODO: we might want to move this call outside of here + self.error_reduce(addr, err_phot) + def exit_error(self, aux, addr): sh = addr.shape maxz = sh[0] @@ -261,17 +416,36 @@ def execute(self, kernel_name=None, compare=False, sync=False): class AuxiliaryWaveKernel(ab.AuxiliaryWaveKernel): - def __init__(self, queue_thread=None): + def __init__(self, queue_thread=None, math_type = 'float'): super(AuxiliaryWaveKernel, self).__init__() # and now initialise the cuda self.queue = queue_thread self._ob_shape = None self._ob_id = None - self.build_aux_cuda = load_kernel("build_aux") - self.build_exit_cuda = load_kernel("build_exit") - self.build_aux_no_ex_cuda = load_kernel("build_aux_no_ex", { - 'CTYPE': 'complex', - 'FTYPE': 'float' + self.math_type = math_type + if math_type not in ['float', 'double']: + raise ValueError('Only double or float math is supported') + self.build_aux_cuda, self.build_aux2_cuda = load_kernel( + ("build_aux", "build_aux2"), { + 'IN_TYPE': 'float', + 'OUT_TYPE': 'float', + 'MATH_TYPE': self.math_type + }, "build_aux.cu") + self.build_exit_cuda = load_kernel("build_exit", { + 'IN_TYPE': 'float', + 'OUT_TYPE': 'float', + 'MATH_TYPE': self.math_type + }) + self.build_aux_no_ex_cuda, self.build_aux2_no_ex_cuda = load_kernel( + ("build_aux_no_ex", "build_aux2_no_ex"), { + 'IN_TYPE': 'float', + 'OUT_TYPE': 'float', + 'MATH_TYPE': self.math_type + }, "build_aux_no_ex.cu") + self.build_exit_alpha_tau_cuda = load_kernel("build_exit_alpha_tau", { + 'IN_TYPE': 'float', + 'OUT_TYPE': 'float', + 'MATH_TYPE': self.math_type }) # DEPRECATED? 
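All of the kernel constructors above route their type choices through load_kernel, which substitutes template parameters (IN_TYPE, OUT_TYPE, MATH_TYPE, ACC_TYPE, and block dimensions such as BDIM_X/BDIM_Y) into the .cu sources before compilation, and returns several compiled kernels at once when given a tuple of names plus an explicit file. A minimal sketch of such a substitution-based loader follows; the directory layout and the use of #define lines are illustrative assumptions, not ptypy's actual loader (the real one is imported at the top of this module via "from . import load_kernel"):

import os
from pycuda.compiler import SourceModule

def load_kernel_sketch(names, subs, file=None):
    # `names` is one kernel name, or a tuple of names defined in one .cu file
    single = isinstance(names, str)
    if file is None:
        file = names + '.cu'  # tuple callers must pass file= explicitly
    with open(os.path.join(os.path.dirname(__file__), 'cuda', file)) as f:
        source = f.read()
    # inject the substitution dict, e.g. {'MATH_TYPE': 'float'}, as defines
    header = '\n'.join('#define %s %s' % kv for kv in subs.items())
    mod = SourceModule(header + '\n' + source, no_extern_c=True)
    if single:
        return mod.get_function(names)
    return tuple(mod.get_function(n) for n in names)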
@@ -282,14 +456,9 @@ def load(self, aux, ob, pr, ex, addr): def build_aux(self, b_aux, addr, ob, pr, ex, alpha=1.0): obr, obc = self._cache_object_shape(ob) - # print('grid={}, 1, 1'.format(int(ex.shape[0]))) - # print('b_aux={}, sh={}'.format(type(b_aux), b_aux.shape)) - # print('ex={}, sh={}'.format(type(ex), ex.shape)) - # print('pr={}, sh={}'.format(type(pr), pr.shape)) - # print('ob={}, sh={}'.format(type(ob), ob.shape)) - # print('obr={}, obc={}'.format(obr, obc)) - # print('addr={}, sh={}'.format(type(addr), addr.shape)) - # print('stream={}'.format(self.queue)) + sh = addr.shape + nmodes = sh[1] + maxz = sh[0] self.build_aux_cuda(b_aux, ex, np.int32(ex.shape[1]), np.int32(ex.shape[2]), @@ -298,11 +467,37 @@ def build_aux(self, b_aux, addr, ob, pr, ex, alpha=1.0): ob, obr, obc, addr, - np.float32(alpha), - block=(32, 32, 1), grid=(int(ex.shape[0]), 1, 1), stream=self.queue) + np.float32(alpha) if ex.dtype == np.complex64 else np.float64(alpha), + block=(32, 32, 1), grid=(int(maxz * nmodes), 1, 1), stream=self.queue) + + def build_aux2(self, b_aux, addr, ob, pr, ex, alpha=1.0): + obr, obc = self._cache_object_shape(ob) + sh = addr.shape + nmodes = sh[1] + maxz = sh[0] + bx = 64 + by = 1 + self.build_aux2_cuda(b_aux, + ex, + np.int32(ex.shape[1]), np.int32(ex.shape[2]), + pr, + np.int32(ex.shape[1]), np.int32(ex.shape[2]), + ob, + obr, obc, + addr, + np.float32(alpha) if ex.dtype == np.complex64 else np.float64(alpha), + block=(bx, by, 1), + grid=( + 1, + int((ex.shape[1] + by - 1)//by), + int(maxz * nmodes)), + stream=self.queue) - def build_exit(self, b_aux, addr, ob, pr, ex): + def build_exit(self, b_aux, addr, ob, pr, ex, alpha=1): obr, obc = self._cache_object_shape(ob) + sh = addr.shape + nmodes = sh[1] + maxz = sh[0] self.build_exit_cuda(b_aux, ex, np.int32(ex.shape[1]), np.int32(ex.shape[2]), @@ -311,7 +506,28 @@ def build_exit(self, b_aux, addr, ob, pr, ex): ob, obr, obc, addr, - block=(32, 32, 1), grid=(int(ex.shape[0]), 1, 1), stream=self.queue) + np.float32(alpha) if ex.dtype == np.complex64 else np.float64(alpha), + block=(32, 32, 1), grid=(int(maxz * nmodes), 1, 1), stream=self.queue) + + def build_exit_alpha_tau(self, b_aux, addr, ob, pr, ex, alpha=1, tau=1): + obr, obc = self._cache_object_shape(ob) + sh = addr.shape + nmodes = sh[1] + maxz = sh[0] + bx = 64 + by = 1 + self.build_exit_alpha_tau_cuda(b_aux, + ex, + np.int32(ex.shape[1]), np.int32(ex.shape[2]), + pr, + np.int32(ex.shape[1]), np.int32(ex.shape[2]), + ob, + obr, obc, + addr, + np.float32(alpha), np.float32(tau), + block=(bx, by, 1), + grid=(1, int((ex.shape[1] + by - 1) // by), int(maxz * nmodes)), + stream=self.queue) def build_aux_no_ex(self, b_aux, addr, ob, pr, fac=1.0, add=False): obr, obc = self._cache_object_shape(ob) @@ -327,12 +543,36 @@ def build_aux_no_ex(self, b_aux, addr, ob, pr, fac=1.0, add=False): ob, obr, obc, addr, - np.float32(fac), + np.float32(fac) if pr.dtype == np.complex64 else np.float64(fac), np.int32(add), block=(32, 32, 1), grid=(int(maxz * nmodes), 1, 1), stream=self.queue) + + def build_aux2_no_ex(self, b_aux, addr, ob, pr, fac=1.0, add=False): + obr, obc = self._cache_object_shape(ob) + sh = addr.shape + nmodes = sh[1] + maxz = sh[0] + bx = 64 + by = 1 + self.build_aux2_no_ex_cuda(b_aux, + np.int32(b_aux.shape[-2]), + np.int32(b_aux.shape[-1]), + pr, + np.int32(pr.shape[-2]), + np.int32(pr.shape[-1]), + ob, + obr, obc, + addr, + np.float32(fac) if pr.dtype == np.complex64 else np.float64(fac), + np.int32(add), + block=(bx, by, 1), + grid=(1, int((b_aux.shape[-2] + by - 
1)//by), int(maxz * nmodes)), + stream=self.queue) + + + def _cache_object_shape(self, ob): oid = id(ob) @@ -345,28 +585,44 @@ def _cache_object_shape(self, ob): class GradientDescentKernel(ab.GradientDescentKernel): - def __init__(self, aux, nmodes=1, queue=None): + def __init__(self, aux, nmodes=1, queue=None, accumulate_type = 'double', math_type='float'): super().__init__(aux, nmodes) self.queue = queue - + self.accumulate_type = accumulate_type + self.math_type = math_type + if (accumulate_type not in ['double', 'float']) or (math_type not in ['double', 'float']): + raise ValueError("accumulate and math types must be 'double' or 'float'") + self.gpu = Adict() self.gpu.LLden = None self.gpu.LLerr = None self.gpu.Imodel = None subs = { - 'CTYPE': 'complex<float>' if self.ctype == np.complex64 else 'complex<double>', - 'FTYPE': 'float' if self.ftype == np.float32 else 'double' + 'IN_TYPE': 'float' if self.ftype == np.float32 else 'double', + 'OUT_TYPE': 'float' if self.ftype == np.float32 else 'double', + 'ACC_TYPE': self.accumulate_type, + 'MATH_TYPE': self.math_type } self.make_model_cuda = load_kernel('make_model', subs) self.make_a012_cuda = load_kernel('make_a012', subs) - self.error_reduce_cuda = load_kernel('error_reduce', subs) - self.fill_b_cuda = load_kernel('fill_b', {**subs, 'BDIM_X': 1024}) - self.fill_b_reduce_cuda = load_kernel( - 'fill_b_reduce', {**subs, 'BDIM_X': 1024}) + self.error_reduce_cuda = load_kernel('error_reduce', { + **subs, + 'OUT_TYPE': 'float' if self.ftype == np.float32 else 'double', + 'BDIM_X': 32, + 'BDIM_Y': 32 + }) + self.fill_b_cuda, self.fill_b_reduce_cuda = load_kernel( + ('fill_b', 'fill_b_reduce'), + { + **subs, + 'BDIM_X': 1024, + 'OUT_TYPE': 'float' if self.ftype == np.float32 else 'double' + }, + file="fill_b.cu") self.main_cuda = load_kernel('gd_main', subs) - self.floating_intensity_cuda_step1 = load_kernel('step1', subs,'intens_renorm.cu') - self.floating_intensity_cuda_step2 = load_kernel('step2', subs,'intens_renorm.cu') + self.floating_intensity_cuda_step1, self.floating_intensity_cuda_step2 = \ + load_kernel(('step1', 'step2'), subs,'intens_renorm.cu') def allocate(self): self.gpu.LLden = gpuarray.zeros(self.fshape, dtype=self.ftype) @@ -377,7 +633,7 @@ def allocate(self): # temporary array for the reduction in fill_b sh = (3, int((np.prod(self.fshape)*self.nmodes + 1023) // 1024)) - self.gpu.Btmp = gpuarray.zeros(sh, dtype=np.float64) + self.gpu.Btmp = gpuarray.zeros(sh, dtype=np.float64 if self.accumulate_type == 'double' else np.float32) def make_model(self, b_aux, addr): # reference shape @@ -463,7 +719,6 @@ def error_reduce(self, addr, err_sum): np.int32(ferr.shape[-1]), block=(32, 32, 1), grid=(int(maxz), 1, 1), - shared=32*32*4, stream=self.queue) def floating_intensity(self, addr, w, I, fic): @@ -481,14 +736,13 @@ def floating_intensity(self, addr, w, I, fic): fic_tmp = self.gpu.fic_tmp ## math ## - x = np.int32(sh[1] * sh[2]) - z = np.int32(maxz) + xall = np.int32(maxz * sh[1] * sh[2]) bx = 1024 self.floating_intensity_cuda_step1(Imodel, I, w, num, den, - z, x, + xall, block=(bx, 1, 1), - grid=(int((x + bx - 1) // bx), 1, int(z)), + grid=(int((xall + bx - 1) // bx), 1, 1), stream=self.queue) self.error_reduce_cuda(num, fic, @@ -496,7 +750,6 @@ np.int32(num.shape[-2]), np.int32(num.shape[-1]), block=(32, 32, 1), grid=(int(maxz), 1, 1), - shared=32*32*4, stream=self.queue) self.error_reduce_cuda(den, fic_tmp, @@ -504,13 +757,13 @@ np.int32(den.shape[-2]), np.int32(den.shape[-1]), block=(32, 32, 1), 
grid=(int(maxz), 1, 1), - shared=32*32*4, stream=self.queue) self.floating_intensity_cuda_step2(fic_tmp, fic, Imodel, - z, x, - block=(bx, 1, 1), - grid=(int((x + bx - 1) // bx), 1, int(z)), + np.int32(Imodel.shape[-2]), + np.int32(Imodel.shape[-1]), + block=(32, 32, 1), + grid=(1, 1, int(maxz)), stream=self.queue) @@ -542,39 +795,61 @@ def main(self, b_aux, addr, w, I): class PoUpdateKernel(ab.PoUpdateKernel): - def __init__(self, queue_thread=None, denom_type=np.complex64): + def __init__(self, queue_thread=None, + math_type='float', accumulator_type='float'): super(PoUpdateKernel, self).__init__() # and now initialise the cuda - if denom_type == np.complex64: - dtype = 'complex' - elif denom_type == np.float32: - dtype = 'float' - else: - raise ValueError('only complex64 and float32 types supported') - self.dtype = dtype + if math_type not in ['double', 'float']: + raise ValueError('only float and double are supported for math_type') + if accumulator_type not in ['double', 'float']: + raise ValueError('only float and double are supported for accumulator_type') + self.math_type = math_type + self.accumulator_type = accumulator_type self.queue = queue_thread + self.norm = None + self.MAK = MaxAbs2Kernel(self.queue) self.ob_update_cuda = load_kernel("ob_update", { - 'DENOM_TYPE': dtype + 'IN_TYPE': 'float', + 'OUT_TYPE': 'float', + 'MATH_TYPE': self.math_type }) self.ob_update2_cuda = None # load_kernel("ob_update2") self.pr_update_cuda = load_kernel("pr_update", { - 'DENOM_TYPE': dtype + 'IN_TYPE': 'float', + 'OUT_TYPE': 'float', + 'MATH_TYPE': self.math_type }) self.pr_update2_cuda = None self.ob_update_ML_cuda = load_kernel("ob_update_ML", { - 'CTYPE': 'complex', - 'FTYPE': 'float' + 'IN_TYPE': 'float', + 'OUT_TYPE': 'float', + 'MATH_TYPE': self.math_type }) self.ob_update2_ML_cuda = None self.pr_update_ML_cuda = load_kernel("pr_update_ML", { - 'CTYPE': 'complex', - 'FTYPE': 'float' + 'IN_TYPE': 'float', + 'OUT_TYPE': 'float', + 'MATH_TYPE': self.math_type }) self.pr_update2_ML_cuda = None + self.ob_update_local_cuda = load_kernel("ob_update_local", { + 'IN_TYPE': 'float', + 'OUT_TYPE': 'float', + 'MATH_TYPE': self.math_type, + 'ACC_TYPE': self.accumulator_type + }) + self.pr_update_local_cuda = load_kernel("pr_update_local", { + 'IN_TYPE': 'float', + 'OUT_TYPE': 'float', + 'MATH_TYPE': self.math_type, + 'ACC_TYPE': self.accumulator_type + }) def ob_update(self, addr, ob, obn, pr, ex, atomics=True): obsh = [np.int32(ax) for ax in ob.shape] prsh = [np.int32(ax) for ax in pr.shape] + if obn.dtype != np.float32: + raise ValueError("Denominator must be float32 in current implementation") if atomics: if addr.shape[3] != 3 or addr.shape[2] != 5: @@ -595,12 +870,15 @@ def ob_update(self, addr, ob, obn, pr, ex, atomics=True): "NUM_MODES": obsh[0], "BDIM_X": 16, "BDIM_Y": 16, - 'DENOM_TYPE': self.dtype + 'IN_TYPE': 'float', + 'OUT_TYPE': 'float', + 'MATH_TYPE': self.math_type, + 'ACC_TYPE': self.accumulator_type }) grid = [int((x+15)//16) for x in ob.shape[-2:]] - grid = (grid[0], grid[1], int(1)) - self.ob_update2_cuda(prsh[-1], obsh[0], num_pods, obsh[-2], + grid = (grid[1], grid[0], int(1)) + self.ob_update2_cuda(prsh[-1], obsh[0], num_pods, obsh[-2], obsh[-1], prsh[0], np.int32(ex.shape[0]), np.int32(ex.shape[1]), @@ -611,6 +889,8 @@ def ob_update(self, addr, ob, obn, pr, ex, atomics=True): def pr_update(self, addr, pr, prn, ob, ex, atomics=True): obsh = [np.int32(ax) for ax in ob.shape] prsh = [np.int32(ax) for ax in pr.shape] + if prn.dtype != np.float32: + raise ValueError("Denominator 
must be float32 in current implementation") if atomics: if addr.shape[3] != 3 or addr.shape[2] != 5: raise ValueError('Address not in required shape for atomics pr_update') @@ -632,7 +912,10 @@ def pr_update(self, addr, pr, prn, ob, ex, atomics=True): "NUM_MODES": prsh[0], "BDIM_X": 16, "BDIM_Y": 16, - 'DENOM_TYPE': self.dtype + 'IN_TYPE': 'float', + 'OUT_TYPE': 'float', + 'MATH_TYPE': self.math_type, + 'ACC_TYPE': self.accumulator_type }) grid = [int((x+15)//16) for x in pr.shape[-2:]] @@ -645,17 +928,18 @@ def pr_update(self, addr, pr, prn, ob, ex, atomics=True): def ob_update_ML(self, addr, ob, pr, ex, fac=2.0, atomics=True): obsh = [np.int32(ax) for ax in ob.shape] prsh = [np.int32(ax) for ax in pr.shape] + exsh = [np.int32(ax) for ax in ex.shape] if atomics: if addr.shape[3] != 3 or addr.shape[2] != 5: raise ValueError('Address not in required shape for tiled ob_update') num_pods = np.int32(addr.shape[0] * addr.shape[1]) - self.ob_update_ML_cuda(ex, num_pods, prsh[1], prsh[2], + self.ob_update_ML_cuda(ex, num_pods, exsh[1], exsh[2], pr, prsh[0], prsh[1], prsh[2], ob, obsh[0], obsh[1], obsh[2], addr, - np.float32(fac), + np.float32(fac) if ex.dtype == np.complex64 else np.float64(fac), block=(32, 32, 1), grid=(int(num_pods), 1, 1), stream=self.queue) else: if addr.shape[0] != 5 or addr.shape[1] != 3: @@ -667,17 +951,20 @@ def ob_update_ML(self, addr, ob, pr, ex, fac=2.0, atomics=True): "NUM_MODES": obsh[0], "BDIM_X": 16, "BDIM_Y": 16, - 'CTYPE': 'complex', - 'FTYPE': 'float' + 'IN_TYPE': 'float', + 'OUT_TYPE': 'float', + 'MATH_TYPE': self.math_type, + 'ACC_TYPE': self.accumulator_type }) grid = [int((x+15)//16) for x in ob.shape[-2:]] - grid = (grid[0], grid[1], int(1)) - self.ob_update2_ML_cuda(prsh[-1], obsh[0], num_pods, obsh[-2], + grid = (grid[1], grid[0], int(1)) + self.ob_update2_ML_cuda(prsh[-1], obsh[0], num_pods, obsh[-2], obsh[-1], prsh[0], np.int32(ex.shape[0]), np.int32(ex.shape[1]), np.int32(ex.shape[2]), - ob, pr, ex, addr, np.float32(fac), + ob, pr, ex, addr, + np.float32(fac) if ex.dtype == np.complex64 else np.float64(fac), block=(16, 16, 1), grid=grid, stream=self.queue) def pr_update_ML(self, addr, pr, ob, ex, fac=2.0, atomics=False): @@ -691,7 +978,7 @@ def pr_update_ML(self, addr, pr, ob, ex, fac=2.0, atomics=False): pr, prsh[0], prsh[1], prsh[2], ob, obsh[0], obsh[1], obsh[2], addr, - np.float32(fac), + np.float32(fac) if ex.dtype == np.complex64 else np.float64(fac), block=(32, 32, 1), grid=(int(num_pods), 1, 1), stream=self.queue) else: if addr.shape[0] != 5 or addr.shape[1] != 3: @@ -702,29 +989,128 @@ def pr_update_ML(self, addr, pr, ob, ex, fac=2.0, atomics=False): "NUM_MODES": prsh[0], "BDIM_X": 16, "BDIM_Y": 16, - 'CTYPE': 'complex', - 'FTYPE': 'float' + 'IN_TYPE': 'float', + 'OUT_TYPE': 'float', + 'MATH_TYPE': self.math_type, + 'ACC_TYPE': self.accumulator_type }) grid = [int((x+15)//16) for x in pr.shape[-2:]] grid = (grid[0], grid[1], int(1)) self.pr_update2_ML_cuda(prsh[-1], obsh[-2], obsh[-1], prsh[0], obsh[0], num_pods, - pr, ob, ex, addr, np.float32(fac), + pr, ob, ex, addr, + np.float32(fac) if ex.dtype == np.complex64 else np.float64(fac), block=(16, 16, 1), grid=grid, stream=self.queue) + def ob_update_local(self, addr, ob, pr, ex, aux): + # lazy allocation of temporary 1-element array + if self.norm is None: + self.norm = gpuarray.empty((1,), dtype=np.float32) + self.MAK.max_abs2(pr, self.norm) + + obsh = [np.int32(ax) for ax in ob.shape] + prsh = [np.int32(ax) for ax in pr.shape] + exsh = [np.int32(ax) for ax in ex.shape] + # atomics version 
only + if addr.shape[3] != 3 or addr.shape[2] != 5: + raise ValueError('Address not in required shape for tiled pr_update') + num_pods = np.int32(addr.shape[0] * addr.shape[1]) + bx = 64 + by = 1 + self.ob_update_local_cuda(ex, aux, + exsh[0], exsh[1], exsh[2], + pr, + prsh[0], prsh[1], prsh[2], + self.norm, + ob, + obsh[0], obsh[1], obsh[2], + addr, + block=(bx, by, 1), + grid=(1, int((exsh[1] + by - 1)//by), int(num_pods)), + stream=self.queue) + + def pr_update_local(self, addr, pr, ob, ex, aux): + # lazy allocation of temporary 1-element array + if self.norm is None: + self.norm = gpuarray.empty((1,), dtype=np.float32) + self.MAK.max_abs2(ob, self.norm) + + obsh = [np.int32(ax) for ax in ob.shape] + prsh = [np.int32(ax) for ax in pr.shape] + exsh = [np.int32(ax) for ax in ex.shape] + # atomics version only + if addr.shape[3] != 3 or addr.shape[2] != 5: + raise ValueError('Address not in required shape for tiled pr_update') + num_pods = np.int32(addr.shape[0] * addr.shape[1]) + + bx = 64 + by = 1 + self.pr_update_local_cuda(ex, aux, + exsh[0], exsh[1], exsh[2], + pr, + prsh[0], prsh[1], prsh[2], + self.norm, + ob, + obsh[0], obsh[1], obsh[2], + addr, + block=(bx, by, 1), + grid=(1, int((exsh[1] + by - 1) // by), int(num_pods)), + stream=self.queue) + + + class PositionCorrectionKernel(ab.PositionCorrectionKernel): - def __init__(self, aux, nmodes, queue_thread=None): - super(PositionCorrectionKernel, self).__init__(aux, nmodes) + from ptypy.accelerate.cuda_pycuda import address_manglers + + # these are used by the self.setup method - replacing them with the GPU implementation + MANGLERS = { + 'Annealing': address_manglers.RandomIntMangler, + 'GridSearch': address_manglers.GridSearchMangler + } + + def __init__(self, *args, queue_thread=None, math_type='float', accumulate_type='float', **kwargs): + super(PositionCorrectionKernel, self).__init__(*args, **kwargs) + # make sure we set the right stream in the mangler + self.mangler.queue = queue_thread + if math_type not in ['float', 'double']: + raise ValueError('Only float or double math is supported') + if accumulate_type not in ['float', 'double']: + raise ValueError('Only float or double math is supported') + # add kernels + self.math_type = math_type + self.accumulate_type = accumulate_type self.queue = queue_thread self._ob_shape = None self._ob_id = None - self.fourier_error_cuda = load_kernel("fourier_error") - self.error_reduce_cuda = load_kernel("error_reduce") - self.build_aux_pc_cuda = load_kernel("build_aux_position_correction") - self.update_addr_and_error_state_cuda = load_kernel("update_addr_error_state") + self.fourier_error_cuda = load_kernel("fourier_error",{ + 'IN_TYPE': 'float', + 'OUT_TYPE': 'float', + 'MATH_TYPE': self.math_type + }) + self.error_reduce_cuda = load_kernel("error_reduce", { + 'IN_TYPE': 'float', + 'OUT_TYPE': 'float', + 'BDIM_X': 32, + 'BDIM_Y': 32, + 'ACC_TYPE': self.accumulate_type + }) + self.log_likelihood_cuda = load_kernel("log_likelihood", { + 'IN_TYPE': 'float', + 'OUT_TYPE': 'float', + 'MATH_TYPE': self.math_type + }, "log_likelihood.cu") + self.build_aux_pc_cuda = load_kernel("build_aux_position_correction", { + 'IN_TYPE': 'float', + 'OUT_TYPE': 'float', + 'MATH_TYPE': self.math_type + }) + self.update_addr_and_error_state_cuda = load_kernel("update_addr_error_state", { + 'IN_TYPE': 'float', + 'OUT_TYPE': 'float' + }) self.gpu = Adict() self.gpu.fdev = None @@ -736,13 +1122,16 @@ def allocate(self): def build_aux(self, b_aux, addr, ob, pr): obr, obc = self._cache_object_shape(ob) + sh = 
addr.shape + nmodes = sh[1] + maxz = sh[0] self.build_aux_pc_cuda(b_aux, pr, np.int32(pr.shape[1]), np.int32(pr.shape[2]), ob, obr, obc, addr, - block=(32, 32, 1), grid=(int(np.prod(addr.shape[:1])), 1, 1), stream=self.queue) + block=(32, 32, 1), grid=(int(maxz * nmodes), 1, 1), stream=self.queue) def fourier_error(self, f, addr, fmag, fmask, mask_sum): fdev = self.gpu.fdev @@ -772,22 +1161,23 @@ def error_reduce(self, addr, err_fmag): np.int32(self.fshape[2]), block=(32, 32, 1), grid=(int(err_fmag.shape[0]), 1, 1), - shared=32*32*4, stream=self.queue) - def update_addr_and_error_state_old(self, addr, error_state, mangled_addr, err_sum): - ''' - updates the addresses and err state vector corresponding to the smallest error. I think this can be done on the cpu - ''' - update_indices = err_sum < error_state - log(4, "updating %s indices" % np.sum(update_indices)) - print('update ind {}, addr {}, mangled {}'.format(update_indices.shape, addr.shape, mangled_addr.shape)) - addr_cpu = addr.get_async(self.queue) - self.queue.synchronize() - addr_cpu[update_indices] = mangled_addr[update_indices] - addr.set_async(ary=addr_cpu, stream=self.queue) - - error_state[update_indices] = err_sum[update_indices] + def log_likelihood(self, b_aux, addr, mag, mask, err_phot): + ferr = self.gpu.ferr + self.log_likelihood_cuda(np.int32(self.nmodes), + b_aux, + mask, + mag, + addr, + ferr, + np.int32(self.fshape[1]), + np.int32(self.fshape[2]), + block=(32, 32, 1), + grid=(int(mag.shape[0]), 1, 1), + stream=self.queue) + # TODO: we might want to move this call outside of here + self.error_reduce(addr, err_phot) def update_addr_and_error_state(self, addr, error_state, mangled_addr, err_sum): # assume all data is on GPU! diff --git a/ptypy/accelerate/cuda_pycuda/mem_utils.py b/ptypy/accelerate/cuda_pycuda/mem_utils.py index fdded3dfb..2f5917173 100644 --- a/ptypy/accelerate/cuda_pycuda/mem_utils.py +++ b/ptypy/accelerate/cuda_pycuda/mem_utils.py @@ -308,7 +308,7 @@ def add_data_block(self, nbytes=None): Returns ------- """ - if self.max is None or len(self)<=self.max: + if self.max is None or len(self) < self.max: if cuda support isn't enabled, the application simply crashes with a seg fault + +4) For NCCL peer-to-peer transfers, the EXCLUSIVE compute mode cannot be used. + It should be in DEFAULT mode. 
+ +""" + +import mpi4py +from pkg_resources import parse_version +import numpy as np +from pycuda import gpuarray +import pycuda.driver as cuda +from ptypy.utils import parallel +from ptypy.utils.verbose import logger, log +import os + +try: + from cupy.cuda import nccl + import cupy as cp +except ImportError: + nccl = None + +# properties to check which versions are available + +# use NCCL is it is available, and the user didn't override the +# default selection with environment variables +have_nccl = (nccl is not None) and \ + (not 'PTYPY_USE_CUDAMPI' in os.environ) and \ + (not 'PTYPY_USE_MPI' in os.environ) + +# At the moment, we require: +# the OpenMPI env var OMPI_MCA_opal_cuda_support to be set to true, +# mpi4py >= 3.1.0 +# pycuda with __cuda_array_interface__ +# and not setting the PTYPY_USE_MPI environment variable +# +# -> we ideally want to allow enabling support from a parameter in ptypy +have_cuda_mpi = "OMPI_MCA_opal_cuda_support" in os.environ and \ + os.environ["OMPI_MCA_opal_cuda_support"] == "true" and \ + parse_version(parse_version(mpi4py.__version__).base_version) >= parse_version("3.1.0") and \ + hasattr(gpuarray.GPUArray, '__cuda_array_interface__') and \ + not ('PTYPY_USE_MPI' in os.environ) + + +class MultiGpuCommunicatorBase: + """Base class for multi-GPU communicator options, to aggregate common bits""" + + def __init__(self): + self.rank = parallel.rank + self.ndev = parallel.size + + def allReduceSum(self, arr): + """Call MPI.all_reduce in-place, with array on GPU""" + # base class only checks properties of arrays + assert isinstance(arr, gpuarray.GPUArray), "Input must be a GPUArray" + + +class MultiGpuCommunicatorMpi(MultiGpuCommunicatorBase): + """Communicator for AllReduce that uses MPI on the CPU, i.e. D2H, allreduce, H2D""" + + def allReduceSum(self, arr): + """Call MPI.all_reduce in-place, with array on GPU""" + super().allReduceSum(arr) + + if parallel.MPIenabled: + # note: this creates a temporary CPU array + data = arr.get() + parallel.allreduce(data) + arr.set(data) + +class MultiGpuCommunicatorCudaMpi(MultiGpuCommunicatorBase): + + def allReduceSum(self, arr): + """Call MPI.all_reduce in-place, with array on GPU""" + + # Check if cuda array interface is available + if not hasattr(arr, '__cuda_array_interface__'): + raise RuntimeError("input array should have a cuda array interface") + + if parallel.MPIenabled: + comm = parallel.comm + comm.Allreduce(parallel.MPI.IN_PLACE, arr) + + +class MultiGpuCommunicatorNccl(MultiGpuCommunicatorBase): + + def __init__(self): + super().__init__() + + # Check if GPUs are in default mode + if cuda.Context.get_device().get_attributes()[cuda.device_attribute.COMPUTE_MODE] != cuda.compute_mode.DEFAULT: + raise RuntimeError("Compute mode must be default in order to use NCCL") + + # get a unique identifier for the NCCL communicator and + # broadcast it to all MPI processes (assuming one device per process) + if self.rank == 0: + self.id = nccl.get_unique_id() + else: + self.id = None + + self.id = parallel.bcast(self.id) + + self.com = nccl.NcclCommunicator(self.ndev, self.id, self.rank) + + def allReduceSum(self, arr): + """Call MPI.all_reduce in-place, with array on GPU""" + + buf = int(arr.gpudata) + count, datatype = self.__get_NCCL_count_dtype(arr) + + # no stream support here for now - it fails in NCCL when + # pycuda.Stream.handle is used for some unexplained reason + stream = cp.cuda.Stream.null.ptr + + self.com.allReduce(buf, buf, count, datatype, nccl.NCCL_SUM, stream) + + def __get_NCCL_count_dtype(self, 
arr): + if arr.dtype == np.complex64: + return arr.size*2, nccl.NCCL_FLOAT32 + elif arr.dtype == np.complex128: + return arr.size*2, nccl.NCCL_FLOAT64 + elif arr.dtype == np.float32: + return arr.size, nccl.NCCL_FLOAT32 + elif arr.dtype == np.float64: + return arr.size, nccl.NCCL_FLOAT64 + else: + raise ValueError("This dtype is not supported by NCCL.") + + +# pick the appropriate communicator depending on installed packages +def get_multi_gpu_communicator(use_nccl=True, use_cuda_mpi=True): + if have_nccl and use_nccl: + try: + comm = MultiGpuCommunicatorNccl() + log(4, "Using NCCL communicator") + return comm + except RuntimeError: + pass + if have_cuda_mpi and use_cuda_mpi: + try: + comm = MultiGpuCommunicatorCudaMpi() + log(4, "Using CUDA-aware MPI communicator") + return comm + except RuntimeError: + pass + comm = MultiGpuCommunicatorMpi() + log(4, "Using MPI communicator") + return comm \ No newline at end of file diff --git a/ptypy/engines/DM.py b/ptypy/engines/DM.py index 9b8340a63..46fa0a2bc 100644 --- a/ptypy/engines/DM.py +++ b/ptypy/engines/DM.py @@ -363,8 +363,7 @@ def object_update(self): # array and therefore underestimate the strength of the probe terms. cfact = self.p.object_inertia * self.mean_power if self.p.obj_smooth_std is not None: - logger.info( - 'Smoothing object, average cfact is %.2f' + log(4, 'Smoothing object, average cfact is %.2f' % np.mean(cfact).real) smooth_mfs = [0, self.p.obj_smooth_std, diff --git a/ptypy/engines/ML.py b/ptypy/engines/ML.py index b0bbaf678..f6009e9b8 100644 --- a/ptypy/engines/ML.py +++ b/ptypy/engines/ML.py @@ -19,14 +19,15 @@ from ..utils import parallel from .utils import Cnorm2, Cdot from . import register -from .base import PositionCorrectionEngine +from .base import BaseEngine from ..core.manager import Full, Vanilla, Bragg3dModel, BlockVanilla, BlockFull + __all__ = ['ML'] @register() -class ML(PositionCorrectionEngine): +class ML(BaseEngine): """ Maximum likelihood reconstruction engine. @@ -98,7 +99,7 @@ class ML(PositionCorrectionEngine): type = int lowlim = 0 help = Number of iterations before probe update starts - + """ SUPPORTED_MODELS = [Full, Vanilla, Bragg3dModel, BlockVanilla, BlockFull] @@ -153,7 +154,6 @@ def engine_initialize(self): """ Prepare for ML reconstruction. """ - super(ML, self).engine_initialize() # Object gradient and minimization direction self.ob_grad = self.ob.copy(self.ob.ID + '_grad', fill=0.) @@ -232,9 +232,10 @@ def engine_iterate(self, num=1): # probe/object rescaling if self.p.scale_precond: cn2_new_pr_grad = Cnorm2(new_pr_grad) + cn2_new_ob_grad = Cnorm2(new_ob_grad) if cn2_new_pr_grad > 1e-5: - scale_p_o = (self.p.scale_probe_object * Cnorm2(new_ob_grad) - / Cnorm2(new_pr_grad)) + scale_p_o = (self.p.scale_probe_object * cn2_new_ob_grad + / cn2_new_pr_grad) else: scale_p_o = self.p.scale_probe_object if self.scale_p_o is None: @@ -319,7 +320,6 @@ def engine_finalize(self): """ Delete temporary containers. """ - super(ML, self).engine_finalize() del self.ptycho.containers[self.ob_grad.ID] del self.ob_grad del self.ptycho.containers[self.ob_grad_new.ID] diff --git a/ptypy/engines/base.py b/ptypy/engines/base.py index 18f18b65e..1a6a49cdd 100644 --- a/ptypy/engines/base.py +++ b/ptypy/engines/base.py @@ -13,7 +13,7 @@ from .. 
import utils as u from ..utils import parallel from ..utils.verbose import logger, headerline, log -from .posref import AnnealingRefine +from .posref import AnnealingRefine, GridSearchRefine __all__ = ['BaseEngine', 'Base3dBraggEngine', 'DEFAULT_iter_info', 'PositionCorrectionEngine'] @@ -175,16 +175,16 @@ def support_constraint(self, storage=None): for s in self.pr.storages.values(): self.support_contraint(s) - # Real space - support = self._probe_support.get(storage.ID) - if support is not None: - storage.data *= support - # Fourier space support = self._probe_fourier_support.get(storage.ID) if support is not None: storage.data[:] = np.fft.ifft2(support * np.fft.fft2(storage.data)) + # Real space + support = self._probe_support.get(storage.ID) + if support is not None: + storage.data *= support + def iterate(self, num=None): """ Compute one or several iterations. @@ -314,6 +314,11 @@ class PositionCorrectionEngine(BaseEngine): type = Param, bool help = If True refine scan positions + [position_refinement.method] + default = Annealing + type = str + help = Annealing or GridSearch + [position_refinement.start] default = None type = int @@ -357,6 +362,11 @@ class PositionCorrectionEngine(BaseEngine): help = record movement of positions """ + POSREF_ENGINES = { + "Annealing": AnnealingRefine, + "GridSearch": GridSearchRefine + } + def __init__(self, ptycho_parent, pars): """ Position Correction engine. @@ -386,17 +396,17 @@ def engine_initialize(self): self.do_position_refinement = False else: self.do_position_refinement = True - log(3, "Initialising position refinement") + log(3, "Initialising position refinement (%s)" %self.p.position_refinement.method) # Enlarge object arrays, # This can be skipped though if the boundary is less important for name, s in self.ob.storages.items(): - s.padding = int(self.p.position_refinement.max_shift // np.max(s.psize)) - s.reformat() + s.padding = int(self.p.position_refinement.max_shift // np.max(s.psize)) + s.reformat() - # this could be some kind of dictionary lookup if we want to add more - self.position_refinement = AnnealingRefine(self.p.position_refinement, self.ob, metric=self.p.position_refinement.metric) - log(3, "Position refinement initialised") + # Choose position refinement engine from dictionary + PosrefEngine = self.POSREF_ENGINES[self.p.position_refinement.method] + self.position_refinement = PosrefEngine(self.p.position_refinement, self.ob, metric=self.p.position_refinement.metric) self.ptycho.citations.add_article(**self.position_refinement.citation_dictionary) if self.p.position_refinement.stop is None: self.p.position_refinement.stop = self.p.numiter diff --git a/ptypy/engines/posref.py b/ptypy/engines/posref.py index c0f12a857..af27cdaf1 100644 --- a/ptypy/engines/posref.py +++ b/ptypy/engines/posref.py @@ -41,14 +41,61 @@ def update_constraints(self, iteration): iteration : int The current iteration of the engine. ''' + start, end = self.p.start, self.p.stop + # Compute the maximum shift allowed at this iteration + self.max_shift_dist = self.p.amplitude * (end - iteration) / (end - start) - raise NotImplementedError('This method needs to be overridden in order to position correct') + def estimate_fourier_metric(self, di_view, obj): + ''' + Calculates error based on DM fourier error estimate. + + Parameters + ---------- + di_view : ptypy.core.classes.View + A diffraction view for which we wish to calculate the error. + + obj : numpy.ndarray + The current calculated object for which we wish to evaluate the error against. 
+ Returns + ------- + np.float + The calculated fourier error + ''' + af2 = np.zeros_like(di_view.data) + for name, pod in di_view.pods.items(): + af2 += pod.downsample(u.abs2(pod.fw(pod.probe*obj))) + return np.sum(di_view.pod.mask * (np.sqrt(af2) - np.sqrt(np.abs(di_view.data)))**2) / di_view.pod.mask.sum() + + def estimate_photon_metric(self, di_view, obj): + ''' + Calculates error based on reduced likelihood estimate. + + Parameters + ---------- + di_view : ptypy.core.classes.View + A diffraction view for which we wish to calculate the error. + + obj : numpy.ndarray + The current calculated object for which we wish to evaluate the error against. + Returns + ------- + np.float + The calculated photon error + ''' + af2 = np.zeros_like(di_view.data) + for name, pod in di_view.pods.items(): + af2 += pod.downsample(u.abs2(pod.fw(pod.probe*obj))) + return (np.sum(di_view.pod.mask * (af2 - di_view.data)**2 / (di_view.data + 1.)) / np.prod(af2.shape)) def cleanup(self): ''' Cleans up every iteration ''' + @property + def citation_dictionary(self): + return {} + class AnnealingRefine(PositionRefine): @@ -85,48 +132,6 @@ def __init__(self, position_refinement_parameters, Cobj, metric="fourier"): else: raise NotImplementedError("Metric %s is currently not implemented" %metric) - def estimate_fourier_metric(self, di_view, obj): - ''' - Calculates error based on DM fourier error estimate. - - Parameters - ---------- - di_view : ptypy.core.classes.View - A diffraction view for which we wish to calculate the error. - - obj : numpy.ndarray - The current calculated object for which we wish to evaluate the error against. - Returns - ------- - np.float - The calculated fourier error - ''' - af2 = np.zeros_like(di_view.data) - for name, pod in di_view.pods.items(): - af2 += pod.downsample(u.abs2(pod.fw(pod.probe*obj))) - return np.sum(di_view.pod.mask * (np.sqrt(af2) - np.sqrt(np.abs(di_view.data)))**2) - - def estimate_photon_metric(self, di_view, obj): - ''' - Calculates error based on reduced likelihood estimate. - - Parameters - ---------- - di_view : ptypy.core.classes.View - A diffraction view for which we wish to calculate the error. - - obj : numpy.ndarray - The current calculated object for which we wish to evaluate the error against. - Returns - ------- - np.float - The calculated fourier error - ''' - af2 = np.zeros_like(di_view.data) - for name, pod in di_view.pods.items(): - af2 += pod.downsample(u.abs2(pod.fw(pod.probe*obj))) - return (np.sum(di_view.pod.mask * (af2 - di_view.data)**2 / (di_view.data + 1.)) / np.prod(af2.shape)) - def update_view_position(self, di_view): ''' Refines the positions by the following algorithm: @@ -189,24 +194,118 @@ def update_view_position(self, di_view): error = new_error coord = new_coord log(4, "Position correction: %s, coord: %s, delta: %s" % (di_view.ID, coord, delta)) - + ob_view.coord = coord ob_view.storage.update_views(ob_view) return coord - initial_coord - def update_constraints(self, iteration): + @property + def citation_dictionary(self): + return { + "title" : 'An annealing algorithm to correct positioning errors in ptychography', + "author" : 'Maiden et al.', + "journal" : 'Ultramicroscopy', + "volume" : 120, + "year" : 2012, + "page" : 64, + "doi" : '10.1016/j.ultramic.2012.06.001', + "comment" : 'Position Refinement using annealing algorithm'} + +class GridSearchRefine(PositionRefine): + + def __init__(self, position_refinement_parameters, Cobj, metric="fourier"): + ''' + Grid Search Position Refinement. 
- Parameters ---------- - iteration : int - The current iteration of the engine. + position_refinement_parameters : ptypy.utils.parameters.Param + The parameter tree for the refinement + + Cobj : ptypy.core.classes.Container + The current object container + metric : str + "fourier" or "photon" ''' + super(GridSearchRefine, self).__init__(position_refinement_parameters) - start, end = self.p.start, self.p.stop + self.Cobj = Cobj # take a reference here. It would be cool if we could make this read-only or something - # Compute the maximum shift allowed at this iteration - self.max_shift_dist = self.p.amplitude * (end - iteration) / (end - start) + + # Updated before each iteration by self.update_constraints + self.max_shift_dist = None + + # Choose metric for fourier error + if metric == "fourier": + self.fourier_error = self.estimate_fourier_metric + elif metric == "photon": + self.fourier_error = self.estimate_photon_metric + else: + raise NotImplementedError("Metric %s is currently not implemented" %metric) + + def update_view_position(self, di_view): + ''' + Refines the positions by the following algorithm: + + Calculates all shifts in a given radius around the original position and calculates the fourier error. + If the fourier error decreased, the calculated position will be used as the new position. + + Parameters + ---------- + di_view : ptypy.core.classes.View + A diffraction view that we wish to refine. + + Returns + ------- + numpy.ndarray + A length 2 numpy array with the position increments for x and y co-ordinates respectively + ''' + # there might be more than one object view + ob_view = di_view.pod.ob_view + + initial_coord = ob_view.coord.copy() + coord = initial_coord + psize = ob_view.psize.copy() + + # if you cannot move far, do nothing + if np.max(psize) >= self.max_shift_dist: + return np.zeros((2,)) + + # This can be optimized by saving existing iteration fourier error... 
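+ # Score the unshifted position first: a candidate shift from the grid
+ # built below is only accepted if it strictly lowers this error.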
+ error = self.fourier_error(di_view, ob_view.data) + + max_shift_pix = self.max_shift_dist // np.min(psize) + max_bound_pix = self.p.max_shift // np.min(psize) + + # Create the search grid + deltas = np.mgrid[-max_shift_pix:max_shift_pix+1:1, + -max_shift_pix:max_shift_pix+1:1] + within_bound = (deltas[0]**2 + deltas[1]**2) < (max_bound_pix**2) + deltas = (deltas[:,within_bound] * np.min(psize)).T + + for i in range(deltas.shape[0]): + # Current shift + delta = deltas[i] + + # Move view to new position + new_coord = initial_coord + delta + ob_view.coord = new_coord + ob_view.storage.update_views(ob_view) + data = ob_view.data + + # catch bad slicing + if not np.allclose(data.shape, ob_view.shape): + continue + + new_error = self.fourier_error(di_view, data) + + if new_error < error: + # keep + error = new_error + coord = new_coord + log(4, "Position correction: %s, coord: %s, delta: %s" % (di_view.ID, coord, delta)) + + ob_view.coord = coord + ob_view.storage.update_views(ob_view) + return coord - initial_coord @property def citation_dictionary(self): @@ -218,4 +317,4 @@ def citation_dictionary(self): "year" : 2012, "page" : 64, "doi" : '10.1016/j.ultramic.2012.06.001', - "comment" : 'Position Refinement using annealing algorithm'} + "comment" : 'Position Refinement using annealing algorithm'} \ No newline at end of file diff --git a/ptypy/engines/utils.py b/ptypy/engines/utils.py index fadb012c9..39fcbc93c 100644 --- a/ptypy/engines/utils.py +++ b/ptypy/engines/utils.py @@ -152,7 +152,7 @@ def basic_fourier_update(diff_view, pbound=None, alpha=1., LL_error=True): for name, pod in diff_view.pods.items(): if not pod.active: continue - df = pod.bw(pod.upsample(fm) * f[name]) - pod.probe * pod.object + df = pod.bw(pod.upsample(fm) * f[name]) - alpha * pod.probe * pod.object + (alpha - 1) * pod.exit pod.exit += df err_exit += np.mean(u.abs2(df)) elif err_fmag > pbound: @@ -162,7 +162,7 @@ def basic_fourier_update(diff_view, pbound=None, alpha=1., LL_error=True): for name, pod in diff_view.pods.items(): if not pod.active: continue - df = pod.bw(pod.upsample(fm) * f[name]) - pod.probe * pod.object + df = pod.bw(pod.upsample(fm) * f[name]) - alpha * pod.probe * pod.object + (alpha - 1) * pod.exit pod.exit += df err_exit += np.mean(u.abs2(df)) else: @@ -170,7 +170,7 @@ def basic_fourier_update(diff_view, pbound=None, alpha=1., LL_error=True): for name, pod in diff_view.pods.items(): if not pod.active: continue - df = alpha * (pod.probe * pod.object - pod.exit) + df = (pod.probe * pod.object - pod.exit) pod.exit += df err_exit += np.mean(u.abs2(df)) diff --git a/ptypy/utils/array_utils.py b/ptypy/utils/array_utils.py index dbd7a2366..a6dc3ede9 100644 --- a/ptypy/utils/array_utils.py +++ b/ptypy/utils/array_utils.py @@ -54,9 +54,9 @@ def switch_orientation(A, orientation, center=None): o = 0 if orientation is None else orientation if np.isscalar(o): - o = [i=='1' for i in '%03d' % int(np.base_repr(o))] + o = [i == '1' for i in '%03d' % int(np.base_repr(o))] - assert len(o)==3 + assert len(o) == 3 # switch orientation if o[0]: axes = list(range(A.ndim - 2)) + [-1, -2] @@ -101,10 +101,11 @@ def rebin_2d(A, rebin=1): sh = np.asarray(A.shape[-2:]) newdim = sh // rebin if not (sh % rebin == 0).all(): - raise ValueError('Last two axes %s of input array `A` cannot be binned by %s' % (str(tuple(sh)),str(rebin))) + raise ValueError('Last two axes %s of input array `A` cannot be binned by %s' % (str(tuple(sh)), str(rebin))) else: return A.reshape(-1, newdim[0], rebin, newdim[1], rebin).mean(-1).mean(-2) + 
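As a quick numeric check of the reshape/mean trick used by rebin_2d above (plain numpy, mirroring the expression in the function):

import numpy as np

A = np.arange(16, dtype=float).reshape(4, 4)
# rebin=2 groups the last two axes into 2x2 blocks and averages each block,
# which is what A.reshape(-1, newdim[0], rebin, newdim[1], rebin) achieves
B = A.reshape(-1, 2, 2, 2, 2).mean(-1).mean(-2)[0]
# B -> [[ 2.5,  4.5],
#       [10.5, 12.5]]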
def crop_pad_symmetric_2d(A, newshape, center=None): """ Crops or pads Array `A` symmetrically along the last two axes `(-2,-1)` @@ -148,7 +149,8 @@ def crop_pad_symmetric_2d(A, newshape, center=None): return A, c + low -def rebin(a, *args,**kwargs): + +def rebin(a, *args, **kwargs): """ Rebin ndarray data into a smaller ndarray of the same rank whose dimensions are factors of the original dimensions. @@ -184,46 +186,52 @@ def rebin(a, *args,**kwargs): """ shape = a.shape lenShape = a.ndim - factor = np.asarray(shape)//np.asarray(args) + factor = np.asarray(shape) // np.asarray(args) evList = ['a.reshape('] + \ - ['args[%d],factor[%d],'%(i,i) for i in range(lenShape)] + \ - [')'] + ['.sum(%d)'%(i+1) for i in range(lenShape)] + \ - ['*( 1.'] + ['/factor[%d]'%i for i in range(lenShape)] + [')'] - if kwargs.get('verbose',False): + ['args[%d],factor[%d],' % (i, i) for i in range(lenShape)] + \ + [')'] + ['.sum(%d)' % (i + 1) for i in range(lenShape)] + \ + ['*( 1.'] + ['/factor[%d]' % i for i in range(lenShape)] + [')'] + if kwargs.get('verbose', False): print(''.join(evList)) return eval(''.join(evList)) + def _confine(A): """\ Doc TODO. """ - sh=np.asarray(A.shape)[1:] - A=A.astype(float) - m=np.reshape(sh,(len(sh),) + len(sh)*(1,)) - return (A+m//2.0) % m - m//2.0 + sh = np.asarray(A.shape)[1:] + A = A.astype(float) + m = np.reshape(sh, (len(sh),) + len(sh) * (1,)) + return (A + m // 2.0) % m - m // 2.0 -def _translate_to_pix(sh,center): + +def _translate_to_pix(sh, center): """\ Take arbitrary input and translate it to a pixel position with respect to sh. """ - sh=np.array(sh) + sh = np.array(sh) if not isstr(center): cen = np.asarray(center) % sh - elif center=='fftshift': - cen=sh//2.0 - elif center=='geometric': - cen=sh/2.0-0.5 - elif center=='fft': - cen=sh*0.0 + elif center == 'fftshift': + cen = sh // 2.0 + elif center == 'geometric': + cen = sh / 2.0 - 0.5 + elif center == 'fft': + cen = sh * 0.0 else: raise TypeError('Input %s not understood for center' % str(center)) return cen + + """ def center_2d(sh,center): return translate_to_pix(sh[-2:],expect2(center)) """ -def grids(sh,psize=None,center='geometric',FFTlike=True): + + +def grids(sh, psize=None, center='geometric', FFTlike=True): """\ ``q0,q1,... = grids(sh)`` returns centered coordinates for a N-dimensional array of shape sh (pixel units) @@ -258,14 +266,14 @@ def grids(sh,psize=None,center='geometric',FFTlike=True): ndarray The coordinate grids """ - sh=np.asarray(sh) + sh = np.asarray(sh) - cen = _translate_to_pix(sh,center) + cen = _translate_to_pix(sh, center) - grid=np.indices(sh).astype(float) - np.reshape(cen,(len(sh),) + len(sh)*(1,)) + grid = np.indices(sh).astype(float) - np.reshape(cen, (len(sh),) + len(sh) * (1,)) if FFTlike: - grid=_confine(grid) + grid = _confine(grid) if psize is None: return grid @@ -273,16 +281,17 @@ def grids(sh,psize=None,center='geometric',FFTlike=True): psize = np.asarray(psize) if psize.size == 1: psize = psize * np.ones((len(sh),)) - psize = np.asarray(psize).reshape( (len(sh),) + len(sh)*(1,)) + psize = np.asarray(psize).reshape((len(sh),) + len(sh) * (1,)) return grid * psize + def rectangle(grids, dims=None, ew=2): if dims is None: dims = (grids.shape[-2] / 2., grids.shape[-1] / 2.) 
v, h = dims V, H = grids - return (smooth_step(-np.abs(V) + v/2, ew) - * smooth_step(-np.abs(H) + h/2, ew)) + return (smooth_step(-np.abs(V) + v / 2, ew) + * smooth_step(-np.abs(H) + h / 2, ew)) def ellipsis(grids, dims=None, ew=2): @@ -291,9 +300,10 @@ def ellipsis(grids, dims=None, ew=2): v, h = dims V, H = grids return smooth_step( - 0.5 - np.sqrt(V**2/v**2 + H**2/h**2), ew/np.sqrt(v * h)) + 0.5 - np.sqrt(V ** 2 / v ** 2 + H ** 2 / h ** 2), ew / np.sqrt(v * h)) + -def zoom(c,*arg,**kwargs): +def zoom(c, *arg, **kwargs): """ Wrapper `scipy.ndimage.zoom `_ function and shares @@ -311,25 +321,27 @@ def zoom(c,*arg,**kwargs): numpy.ndarray Zoomed array """ - #if np.all(arg[0] == 1): + # if np.all(arg[0] == 1): # return c # from scipy.ndimage import zoom as _zoom if np.iscomplexobj(c): - return complex_overload(_zoom)(c,*arg,**kwargs) + return complex_overload(_zoom)(c, *arg, **kwargs) else: - return _zoom(c,*arg,**kwargs) + return _zoom(c, *arg, **kwargs) + c_zoom = zoom -c_zoom.__doc__='*Deprecated*, kept for backward compatibility only.\n\n' + zoom.__doc__ +c_zoom.__doc__ = '*Deprecated*, kept for backward compatibility only.\n\n' + zoom.__doc__ """ c_affine_transform=complex_overload(ndi.affine_transform) c_affine_transform.__doc__='*complex input*\n\n'+c_affine_transform.__doc__ """ -def shift_zoom(c,zoom,cen_old,cen_new,**kwargs): + +def shift_zoom(c, zoom, cen_old, cen_new, **kwargs): """ Move array from center `cen_old` to `cen_new` and perform a zoom `zoom`. @@ -359,39 +371,40 @@ def shift_zoom(c,zoom,cen_old,cen_new,**kwargs): numpy.ndarray Shifted and zoomed array """ - + from scipy.ndimage import affine_transform as at zoom = np.diag(zoom) - offset=np.asarray(cen_old)-np.asarray(cen_new).dot(zoom) + offset = np.asarray(cen_old) - np.asarray(cen_new).dot(zoom) if np.iscomplexobj(c): - return complex_overload(at)(c,zoom,offset,**kwargs) + return complex_overload(at)(c, zoom, offset, **kwargs) else: - return at(c,zoom,offset,**kwargs) + return at(c, zoom, offset, **kwargs) -def fill3D(A,B,offset=[0,0,0]): +def fill3D(A, B, offset=[0, 0, 0]): """ Fill 3-dimensional array A with B. """ - if A.ndim != 3 or B.ndim!=3: + if A.ndim != 3 or B.ndim != 3: raise ValueError('3D a numpy arrays expected') - Alim=np.array(A.shape) - Blim=np.array(B.shape) - off=np.array(offset) + Alim = np.array(A.shape) + Blim = np.array(B.shape) + off = np.array(offset) Ao = off.copy() - Ao[Ao<0]=0 + Ao[Ao < 0] = 0 Bo = -off.copy() - Bo[Bo<0]=0 - print(Ao,Bo) + Bo[Bo < 0] = 0 if (Bo > Blim).any() or (Ao > Alim).any(): print("misfit") pass else: - A[Ao[0]:min(off[0]+Blim[0],Alim[0]),Ao[1]:min(off[1]+Blim[1],Alim[1]),Ao[2]:min(off[2]+Blim[2],Alim[2])] \ - =B[Bo[0]:min(Alim[0]-off[0],Blim[0]),Bo[1]:min(Alim[1]-off[1],Blim[1]),Bo[2]:min(Alim[2]-off[2],Blim[2])] + A[Ao[0]:min(off[0] + Blim[0], Alim[0]), Ao[1]:min(off[1] + Blim[1], Alim[1]), + Ao[2]:min(off[2] + Blim[2], Alim[2])] \ + = B[Bo[0]:min(Alim[0] - off[0], Blim[0]), Bo[1]:min(Alim[1] - off[1], Blim[1]), + Bo[2]:min(Alim[2] - off[2], Blim[2])] -def mirror(A,axis=-1): +def mirror(A, axis=-1): """ Mirrors array `A` along one axis `axis` @@ -409,9 +422,10 @@ def mirror(A,axis=-1): A view to the mirrored array. """ - return np.flipud(np.asarray(A).swapaxes(axis,0)).swapaxes(0,axis) + return np.flipud(np.asarray(A).swapaxes(axis, 0)).swapaxes(0, axis) + -def pad_lr(A,axis,l,r,fillpar=0.0, filltype='scalar'): +def pad_lr(A, axis, l, r, fillpar=0.0, filltype='scalar'): """ Pads ndarray `A` orthogonal to `axis` with `l` layers (pixels,lines,planes,...) 
on low side an `r` layers on high side. @@ -445,62 +459,61 @@ def pad_lr(A,axis,l,r,fillpar=0.0, filltype='scalar'): crop_pad crop_pad_symmetric_2d """ - fsh=np.array(A.shape) - if l>fsh[axis]: #rare case - l-=fsh[axis] - A=pad_lr(A,axis,fsh[axis],0,fillpar, filltype) - return pad_lr(A,axis,l,r,fillpar, filltype) - elif r>fsh[axis]: - r-=fsh[axis] - A=pad_lr(A,axis,0,fsh[axis],fillpar, filltype) - return pad_lr(A,axis,l,r,fillpar, filltype) - elif filltype=='mirror': - left=mirror(np.split(A,[l],axis)[0],axis) - right=mirror(np.split(A,[A.shape[axis]-r],axis)[1],axis) - elif filltype=='periodic': - right=np.split(A,[r],axis)[0] - left=np.split(A,[A.shape[axis]-l],axis)[1] - elif filltype=='project': - fsh[axis]=l - left=np.ones(fsh,A.dtype)*np.split(A,[1],axis)[0] - fsh[axis]=r - right=np.ones(fsh,A.dtype)*np.split(A,[A.shape[axis]-1],axis)[1] - if filltype=='scalar' or l==0: - fsh[axis]=l - left=np.ones(fsh,A.dtype)*fillpar - if filltype=='scalar' or r==0: - fsh[axis]=r - right=np.ones(fsh,A.dtype)*fillpar - if filltype=='custom': - left=fillpar[0].astype(A.dtype) - right=fillpar[1].astype(A.dtype) - return np.concatenate((left,A,right),axis=axis) - - -def _roll_from_pixcenter(sh,center): + fsh = np.array(A.shape) + if l > fsh[axis]: # rare case + l -= fsh[axis] + A = pad_lr(A, axis, fsh[axis], 0, fillpar, filltype) + return pad_lr(A, axis, l, r, fillpar, filltype) + elif r > fsh[axis]: + r -= fsh[axis] + A = pad_lr(A, axis, 0, fsh[axis], fillpar, filltype) + return pad_lr(A, axis, l, r, fillpar, filltype) + elif filltype == 'mirror': + left = mirror(np.split(A, [l], axis)[0], axis) + right = mirror(np.split(A, [A.shape[axis] - r], axis)[1], axis) + elif filltype == 'periodic': + right = np.split(A, [r], axis)[0] + left = np.split(A, [A.shape[axis] - l], axis)[1] + elif filltype == 'project': + fsh[axis] = l + left = np.ones(fsh, A.dtype) * np.split(A, [1], axis)[0] + fsh[axis] = r + right = np.ones(fsh, A.dtype) * np.split(A, [A.shape[axis] - 1], axis)[1] + if filltype == 'scalar' or l == 0: + fsh[axis] = l + left = np.ones(fsh, A.dtype) * fillpar + if filltype == 'scalar' or r == 0: + fsh[axis] = r + right = np.ones(fsh, A.dtype) * fillpar + if filltype == 'custom': + left = fillpar[0].astype(A.dtype) + right = fillpar[1].astype(A.dtype) + return np.concatenate((left, A, right), axis=axis) + + +def _roll_from_pixcenter(sh, center): """\ returns array of ints as input for np.roll use np.roll(A,-roll_from_pixcenter(sh,cen)[ax],ax) to put 'cen' in geometric center of array A """ - sh=np.array(sh) + sh = np.array(sh) if center != None: - if center=='fftshift': - cen=sh//2.0 - elif center=='geometric': - cen=sh/2.0-0.5 - elif center=='fft': - cen=sh*0.0 + if center == 'fftshift': + cen = sh // 2.0 + elif center == 'geometric': + cen = sh / 2.0 - 0.5 + elif center == 'fft': + cen = sh * 0.0 elif center is not None: - cen=sh*np.asarray(center) % sh - 0.5 + cen = sh * np.asarray(center) % sh - 0.5 - roll=np.ceil(cen - sh/2.0) % sh + roll = np.ceil(cen - sh / 2.0) % sh else: - roll=np.zeros_like(sh) + roll = np.zeros_like(sh) return roll.astype(int) - -def crop_pad_axis(A,hplanes,axis=-1,roll=0,fillpar=0.0, filltype='scalar'): +def crop_pad_axis(A, hplanes, axis=-1, roll=0, fillpar=0.0, filltype='scalar'): """ Crops or pads a volume array `A` at beginning and end of axis `axis` with a number of hyperplanes specified by `hplanes` @@ -573,37 +586,36 @@ def crop_pad_axis(A,hplanes,axis=-1,roll=0,fillpar=0.0, filltype='scalar'): >>> B=crop_pad_axis(V,(3,-2),1,filltype='mirror') """ if 
np.isscalar(hplanes): - hplanes=int(hplanes) - r=np.abs(hplanes) // 2 * np.sign(hplanes) - l=hplanes - r - elif len(hplanes)==2: - l=int(hplanes[0]) - r=int(hplanes[1]) + hplanes = int(hplanes) + r = np.abs(hplanes) // 2 * np.sign(hplanes) + l = hplanes - r + elif len(hplanes) == 2: + l = int(hplanes[0]) + r = int(hplanes[1]) else: raise RuntimeError('unsupported input for \'hplanes\'') - if roll!=0: - A=np.roll(A,-roll,axis=axis) - - if l<=0 and r<=0: - A=np.split(A,[-l,A.shape[axis]+r],axis)[1] - elif l>0 and r>0: - A=pad_lr(A,axis,l,r,fillpar,filltype) - elif l>0 and r<=0: - A=pad_lr(A,axis,l,0,fillpar,filltype) - A=np.split(A,[0,A.shape[axis]+r],axis)[1] - elif l<=0 and r>0: - A=pad_lr(A,axis,0,r,fillpar,filltype) - A=np.split(A,[-l,A.shape[axis]],axis)[1] - - - if roll!=0: - return np.roll(A,roll+r,axis=axis) + if roll != 0: + A = np.roll(A, -roll, axis=axis) + + if l <= 0 and r <= 0: + A = np.split(A, [-l, A.shape[axis] + r], axis)[1] + elif l > 0 and r > 0: + A = pad_lr(A, axis, l, r, fillpar, filltype) + elif l > 0 and r <= 0: + A = pad_lr(A, axis, l, 0, fillpar, filltype) + A = np.split(A, [0, A.shape[axis] + r], axis)[1] + elif l <= 0 and r > 0: + A = pad_lr(A, axis, 0, r, fillpar, filltype) + A = np.split(A, [-l, A.shape[axis]], axis)[1] + + if roll != 0: + return np.roll(A, roll + r, axis=axis) else: return A -def crop_pad(A,hplane_list,axes=None,cen=None,fillpar=0.0,filltype='scalar'): +def crop_pad(A, hplane_list, axes=None, cen=None, fillpar=0.0, filltype='scalar'): """\ Crops or pads a volume array `A` with a number of hyperplanes according to parameters in `hplanes` Wrapper for crop_pad_axis. @@ -660,14 +672,13 @@ def crop_pad(A, hplane_list, axes=None, cen=None, fillpar=0.0, filltype='scalar'): """ if axes is None: - axes=np.arange(len(hplane_list))-len(hplane_list) - elif not(len(axes)==len(hplane_list)): + axes = np.arange(len(hplane_list)) - len(hplane_list) + elif not (len(axes) == len(hplane_list)): raise RuntimeError('if axes is specified, hplane_list has to be same length as axes') - sh=np.array(A.shape) - roll = _roll_from_pixcenter(sh,cen) + sh = np.array(A.shape) + roll = _roll_from_pixcenter(sh, cen) - for ax,cut in zip(axes,hplane_list): - A=crop_pad_axis(A,cut,ax,roll[ax],fillpar,filltype) + for ax, cut in zip(axes, hplane_list): + A = crop_pad_axis(A, cut, ax, roll[ax], fillpar, filltype) return A - diff --git a/setup.py b/setup.py index 43940038c..83d5b9a89 100644 --- a/setup.py +++ b/setup.py @@ -1,7 +1,11 @@ #!/usr/bin/env python +# we should aim to remove the distutils dependency +import distutils import setuptools #, setuptools.command.build_ext from distutils.core import setup +import os +import sys CLASSIFIERS = """\ Development Status :: 3 - Alpha @@ -62,6 +66,38 @@ def write_version_py(filename='ptypy/version.py'): except: vers = VERSION +ext_modules = [] +cmdclass = {} +# filtered Cuda FFT extension module +""" +Alternative options for this switch: + +1. Put the cufft extension module as a separate python package with its own setup.py and + put an optional dependency into ptypy (extras_require={ "cufft": ["pybind11"] }), so that + when users do pip install ptypy it installs it without that dependency, and if users do + pip install ptypy[cufft] it installs the optional dependency module + +2. Use an environment variable to control the setting, as sqlalchemy does for its C extensions, + or detect if cuda is available on the system and enable it in this case, etc.
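+ +As a rough illustration of option 1 (a sketch only, not part of this change; the package name "ptypy-cufft" is hypothetical), ptypy's own setup() would stay free of compiled dependencies and declare + + extras_require={"cufft": ["ptypy-cufft"]} + +so that pip install ptypy[cufft] pulls in the separately packaged extension while a plain pip install ptypy does not.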
+""" +try: + from extensions import locate_cuda # this raises an error if pybind11 is not available + CUDA = locate_cuda() # this raises an error if CUDA is not available + from extensions import CustomBuildExt + cufft_dir = os.path.join('ptypy', 'accelerate', 'cuda_pycuda', 'cuda', 'filtered_fft') + ext_modules.append( + distutils.core.Extension("ptypy.filtered_cufft", + sources=[os.path.join(cufft_dir, "module.cpp"), + os.path.join(cufft_dir, "filtered_fft.cu")] + ) + ) + cmdclass = {"build_ext": CustomBuildExt} + EXTBUILD_MESSAGE = "ptypy has been successfully installed with the pre-compiled cufft extension.\n" +except: + EXTBUILD_MESSAGE = '*' * 75 + "\n" + EXTBUILD_MESSAGE += "ptypy has been installed without the pre-compiled cufft extension.\n" + EXTBUILD_MESSAGE += "If you require cufft, make sure to have CUDA and pybind11 installed.\n" + EXTBUILD_MESSAGE += '*' * 75 + "\n" exclude_packages = [] package_list = setuptools.find_packages(exclude=exclude_packages) @@ -74,12 +110,15 @@ def write_version_py(filename='ptypy/version.py'): package_dir={'ptypy': 'ptypy'}, packages=package_list, package_data={'ptypy': ['resources/*',], - 'ptypy.accelerate.cuda_pycuda.cuda': ['*.cu'], - 'ptypy.accelerate.cuda_pycuda.cuda.filtered_fft': ['*.hpp', '*.cpp', 'Makefile', '*.cu', '*.h']}, + 'ptypy.accelerate.cuda_pycuda.cuda': ['*.cu']}, scripts=['scripts/ptypy.plot', 'scripts/ptypy.inspect', 'scripts/ptypy.plotclient', 'scripts/ptypy.new', 'scripts/ptypy.csv2cp', 'scripts/ptypy.run'], + ext_modules=ext_modules, + cmdclass=cmdclass ) + +print(EXTBUILD_MESSAGE) \ No newline at end of file diff --git a/templates/minimal_prep_and_run_DM_pycuda.py b/templates/minimal_prep_and_run_DM_pycuda.py index 269e3dd42..976a8b0b8 100644 --- a/templates/minimal_prep_and_run_DM_pycuda.py +++ b/templates/minimal_prep_and_run_DM_pycuda.py @@ -16,7 +16,9 @@ p.io = u.Param() p.io.home = "~/dumps/ptypy/" p.io.autosave = u.Param(active=True) -p.io.autoplot = u.Param(active=False) +p.io.autoplot = u.Param(active=True) +p.io.interaction = u.Param(active=True) +p.io.interaction.client = u.Param(poll_timeout=1) # max 200 frames (128x128px) of diffraction data p.scans = u.Param() p.scans.MF = u.Param() diff --git a/templates/minimal_prep_and_run_DR_pycuda.py b/templates/minimal_prep_and_run_DR_pycuda.py new file mode 100644 index 000000000..618616320 --- /dev/null +++ b/templates/minimal_prep_and_run_DR_pycuda.py @@ -0,0 +1,56 @@ +""" +This script is a test for ptychographic reconstruction in the absence +of actual data. It uses the test Scan class +`ptypy.core.data.MoonFlowerScan` to provide "data". +""" + +from ptypy.core import Ptycho +from ptypy import utils as u +from ptypy.accelerate.cuda_pycuda.engines import DR_pycuda +p = u.Param() + +# for verbose output +p.verbose_level = 3 + +# Frames per block +p.frames_per_block = 200 + +# set home path +p.io = u.Param() +p.io.home = "/tmp/ptypy/" +p.io.autosave = u.Param(active=False) +p.io.interaction = u.Param(active=False) +p.io.interaction.client = u.Param() +p.io.interaction.client.poll_timeout = 1 + +# max 200 frames (128x128px) of diffraction data +p.scans = u.Param() +p.scans.MF = u.Param() +# now you have to specify which ScanModel to use with scans.XX.name, +# just as you have to give 'name' for engines and PtyScan subclasses. 
+p.scans.MF.name = 'BlockFull' +p.scans.MF.data = u.Param() +p.scans.MF.data.name = 'MoonFlowerScan' +p.scans.MF.data.shape = 128 +p.scans.MF.data.num_frames = 200 +p.scans.MF.data.save = None + +# position distance in fraction of illumination frame +p.scans.MF.data.density = 0.2 +# total number of photons in the empty beam +p.scans.MF.data.photons = 1e8 +# Gaussian FWHM of possible detector blurring +p.scans.MF.data.psf = 0.0 +p.scans.MF.coherence = u.Param() +p.scans.MF.coherence.num_probe_modes = 3 + +# attach a reconstruction engine +p.engines = u.Param() +p.engines.engine00 = u.Param() +p.engines.engine00.name = 'DR_pycuda' +p.engines.engine00.numiter = 100 +p.engines.engine00.alpha = 0 # alpha=0, tau=1 behaves like ePIE +p.engines.engine00.tau = 1 + +# prepare and run +P = Ptycho(p, level=5) diff --git a/templates/minimal_prep_and_run_DR_pycuda_stream.py b/templates/minimal_prep_and_run_DR_pycuda_stream.py new file mode 100644 index 000000000..38c5157a0 --- /dev/null +++ b/templates/minimal_prep_and_run_DR_pycuda_stream.py @@ -0,0 +1,59 @@ +""" +This script is a test for ptychographic reconstruction in the absence +of actual data. It uses the test Scan class +`ptypy.core.data.MoonFlowerScan` to provide "data". +""" + +from ptypy.core import Ptycho +from ptypy import utils as u +from ptypy.accelerate.cuda_pycuda.engines import DR_pycuda_stream, DR_pycuda +DR_pycuda_stream.MAX_BLOCKS = 3 +p = u.Param() + +# for verbose output +p.verbose_level = 3 + +# Frames per block +p.frames_per_block = 20 + +# set home path +p.io = u.Param() +p.io.home = "/tmp/ptypy/" +p.io.autosave = u.Param(active=False) +p.io.interaction = u.Param(active=False) +p.io.interaction.client = u.Param() +p.io.interaction.client.poll_timeout = 1 + +# max 200 frames (128x128px) of diffraction data +p.scans = u.Param() +p.scans.MF = u.Param() +# now you have to specify which ScanModel to use with scans.XX.name, +# just as you have to give 'name' for engines and PtyScan subclasses. +p.scans.MF.name = 'BlockFull' +p.scans.MF.data = u.Param() +p.scans.MF.data.name = 'MoonFlowerScan' +p.scans.MF.data.shape = 384 +p.scans.MF.data.num_frames = 120 +p.scans.MF.data.save = None + +p.scans.MF.illumination = u.Param(diversity=None) +# position distance in fraction of illumination frame +p.scans.MF.data.density = 0.2 +# total number of photons in the empty beam +p.scans.MF.data.photons = 1e8 +# Gaussian FWHM of possible detector blurring +p.scans.MF.data.psf = 0.0 +p.scans.MF.coherence = u.Param() +p.scans.MF.coherence.num_probe_modes = 3 + +# attach a reconstruction engine +p.engines = u.Param() +p.engines.engine00 = u.Param() +p.engines.engine00.name = 'DR_pycuda_stream' +p.engines.engine00.numiter = 20 +p.engines.engine00.numiter_contiguous = 10 +p.engines.engine00.alpha = 0 # alpha=0, tau=1 behaves like ePIE +p.engines.engine00.tau = 1 + +# prepare and run +P = Ptycho(p, level=5) diff --git a/templates/minimal_prep_and_run_DR_serial.py b/templates/minimal_prep_and_run_DR_serial.py new file mode 100644 index 000000000..a9c3c04ba --- /dev/null +++ b/templates/minimal_prep_and_run_DR_serial.py @@ -0,0 +1,58 @@ +""" +This script is a test for ptychographic reconstruction in the absence +of actual data. It uses the test Scan class +`ptypy.core.data.MoonFlowerScan` to provide "data".
+""" + +from ptypy.core import Ptycho +from ptypy import utils as u +from ptypy.accelerate.base.engines import DR_serial +p = u.Param() + +# for verbose output +p.verbose_level = 3 + +# Frames per block +p.frames_per_block = 200 + +# set home path +p.io = u.Param() +p.io.home = "/tmp/ptypy/" +p.io.autosave = u.Param(active=False) +p.io.interaction = u.Param(active=False) +p.io.interaction.client = u.Param() +p.io.interaction.client.poll_timeout = 1 + +# max 200 frames (128x128px) of diffraction data +p.scans = u.Param() +p.scans.MF = u.Param() +# now you have to specify which ScanModel to use with scans.XX.name, +# just as you have to give 'name' for engines and PtyScan subclasses. +p.scans.MF.name = 'BlockFull' +p.scans.MF.data = u.Param() +p.scans.MF.data.name = 'MoonFlowerScan' +p.scans.MF.data.shape = 128 +p.scans.MF.data.num_frames = 200 +p.scans.MF.data.save = None + +# position distance in fraction of illumination frame +p.scans.MF.data.density = 0.2 +# total number of photons in the empty beam +p.scans.MF.data.photons = 1e8 +# Gaussian FWHM of possible detector blurring +p.scans.MF.data.psf = 0.0 +p.scans.MF.coherence = u.Param() +p.scans.MF.coherence.num_probe_modes = 3 + +# attach a reconstruction engine +p.engines = u.Param() +p.engines.engine00 = u.Param() +p.engines.engine00.name = 'DR_serial' +p.engines.engine00.numiter = 100 +p.engines.engine00.alpha = 0 # alpha=0, tau=1 behaves like ePIE +p.engines.engine00.tau = 1 +#p.engines.engine00.rescale_probe = False +#p.engines.engine00.fourier_power_bound = 0.0 + +# prepare and run +P = Ptycho(p, level=5) diff --git a/templates/minimal_prep_and_run_ML_pycuda.py b/templates/minimal_prep_and_run_ML_pycuda.py index a66f39825..4b0dd5f51 100644 --- a/templates/minimal_prep_and_run_ML_pycuda.py +++ b/templates/minimal_prep_and_run_ML_pycuda.py @@ -11,7 +11,7 @@ p = u.Param() # for verbose output -p.verbose_level = 2 +p.verbose_level = 3 p.frames_per_block = 400 # set home path p.io = u.Param() @@ -27,7 +27,7 @@ p.scans.MF.data= u.Param() p.scans.MF.data.name = 'MoonFlowerScan' p.scans.MF.data.shape = 128 -p.scans.MF.data.num_frames = 600 +p.scans.MF.data.num_frames = 100 p.scans.MF.data.save = None p.scans.MF.illumination = u.Param(diversity=None) @@ -43,15 +43,14 @@ p.engines = u.Param() p.engines.engine00 = u.Param() p.engines.engine00.name = 'ML_pycuda' -p.engines.engine00.numiter = 10 +p.engines.engine00.numiter = 300 p.engines.engine00.numiter_contiguous = 5 p.engines.engine00.reg_del2 = True # Whether to use a Gaussian prior (smoothing) regularizer p.engines.engine00.reg_del2_amplitude = 1. # Amplitude of the Gaussian prior if used -p.engines.engine00.floating_intensities = True - +p.engines.engine00.scale_precond = True +p.engines.engine00.smooth_gradient = 20. +p.engines.engine00.smooth_gradient_decay = 1/50.
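+# Note: smooth_gradient above is the starting width of the gradient-smoothing +# preconditioner and smooth_gradient_decay shrinks that width every iteration +# (see the ML engine parameter documentation for details).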
+p.engines.engine00.floating_intensities = False # prepare and run P = Ptycho(p,level=5) -#P.run() -P.print_stats() -#u.pause(10) diff --git a/templates/minimal_prep_and_run_probe_modes.py b/templates/minimal_prep_and_run_probe_modes.py index 8dbcb4dc4..17d358b6b 100644 --- a/templates/minimal_prep_and_run_probe_modes.py +++ b/templates/minimal_prep_and_run_probe_modes.py @@ -14,7 +14,10 @@ # set home path p.io = u.Param() p.io.home = "/tmp/ptypy/" -p.io.autosave = None +p.io.autosave = u.Param(active=False) +p.io.interaction = u.Param(active=True) +p.io.interaction.client = u.Param() +p.io.interaction.client.poll_timeout = 1 # max 200 frames (128x128px) of diffraction data p.scans = u.Param() diff --git a/templates/minimal_prep_and_run_resample_DM.py b/templates/minimal_prep_and_run_resample_DM.py index 08c12540c..b06281223 100644 --- a/templates/minimal_prep_and_run_resample_DM.py +++ b/templates/minimal_prep_and_run_resample_DM.py @@ -14,7 +14,7 @@ # set home path p.io = u.Param() p.io.home = "/tmp/ptypy/" -p.io.autosave = None +p.io.autosave = u.Param(active=False) # max 200 frames (128x128px) of diffraction data p.scans = u.Param() diff --git a/templates/minimal_prep_and_run_resample_ML.py b/templates/minimal_prep_and_run_resample_ML.py index f0d5619f9..2edbb8bcc 100644 --- a/templates/minimal_prep_and_run_resample_ML.py +++ b/templates/minimal_prep_and_run_resample_ML.py @@ -15,7 +15,7 @@ # set home path p.io = u.Param() p.io.home = "/tmp/ptypy/" -p.io.autosave = None +p.io.autosave = u.Param(active=False) #p.io.autoplot = u.Param() #p.io.autoplot.dump = True #p.io.autoplot = False diff --git a/templates/position_refinement.py b/templates/position_refinement_DM.py similarity index 80% rename from templates/position_refinement.py rename to templates/position_refinement_DM.py index c3a348c24..052b4b679 100644 --- a/templates/position_refinement.py +++ b/templates/position_refinement_DM.py @@ -15,7 +15,8 @@ # set home path p.io = u.Param() p.io.home = "/tmp/ptypy/" -p.io.autosave = u.Param() +p.io.autosave = u.Param(active=False) +p.io.interaction = u.Param(active=False) # max 200 frames (128x128px) of diffraction data p.scans = u.Param() @@ -41,15 +42,15 @@ p.engines.engine00 = u.Param() p.engines.engine00.name = 'DM' p.engines.engine00.probe_support = 1 -# p.engines.engine00.probe_center_tol = 0.5 p.engines.engine00.numiter = 1000 p.engines.engine00.position_refinement = u.Param() p.engines.engine00.position_refinement.start = 50 -p.engines.engine00.position_refinement.stop = 990 +p.engines.engine00.position_refinement.stop = 950 p.engines.engine00.position_refinement.interval = 10 p.engines.engine00.position_refinement.nshifts = 32 -p.engines.engine00.position_refinement.amplitude = 1e-6 -p.engines.engine00.position_refinement.max_shift = 2e-6 +p.engines.engine00.position_refinement.amplitude = 5e-7 +p.engines.engine00.position_refinement.max_shift = 1e-6 +p.engines.engine00.position_refinement.method = "GridSearch" # prepare and run P = Ptycho(p, level=4) @@ -58,26 +59,24 @@ a = 0. 
coords = [] +coords_start = [] for pname, pod in P.pods.items(): + # Save real position coords.append(np.copy(pod.ob_view.coord)) before = pod.ob_view.coord psize = pod.pr_view.psize - # print(pname) - # print(before) perturbation = psize * ((3e-7 * np.array([np.sin(a), np.cos(a)])) // psize) - new_coord = before + perturbation # make sure integer number of pixels shift - - pod.ob_view.coord = new_coord - - #pod.diff *= np.random.uniform(0.1,1)y + coords_start.append(np.copy(pod.ob_view.coord)) + #pod.diff *= np.random.uniform(0.1,1) a += 4. np.savetxt("positions_theory.txt", coords) +np.savetxt("positions_start.txt", coords_start) P.obj.reformat() - # Run P.run() +P.finalize() diff --git a/templates/position_refinement_DM_pycuda.py b/templates/position_refinement_DM_pycuda.py new file mode 100644 index 000000000..ac51ef337 --- /dev/null +++ b/templates/position_refinement_DM_pycuda.py @@ -0,0 +1,93 @@ +""" +This script is a test for ptychographic reconstruction in the absence +of actual data. It uses the test Scan class +`ptypy.core.data.MoonFlowerScan` to provide "data". +""" + +import numpy as np +from ptypy.core import Ptycho +from ptypy import utils as u + +from ptypy.accelerate.cuda_pycuda.engines import DM_pycuda_stream, DM_pycuda_streams, DM_pycuda + +p = u.Param() + +# for verbose output +p.verbose_level = 3 +p.frames_per_block = 100 +# set home path +p.io = u.Param() +p.io.home = "/tmp/ptypy/" +p.io.autosave = u.Param(active=True, interval=500) +p.io.autoplot = u.Param(active=False)#True, interval=100) + +# max 200 frames (128x128px) of diffraction data +p.scans = u.Param() +p.scans.MF = u.Param() +# now you have to specify which ScanModel to use with scans.XX.name, +# just as you have to give 'name' for engines and PtyScan subclasses. +p.scans.MF.name = 'BlockFull' # or 'Full' +p.scans.MF.data = u.Param() +p.scans.MF.data.name = 'MoonFlowerScan' +p.scans.MF.data.shape = 128 +p.scans.MF.data.num_frames = 200 +p.scans.MF.data.save = None + +p.scans.MF.illumination = u.Param(diversity=None) +p.scans.MF.coherence = u.Param(num_probe_modes=1) +# p.scans.MF.illumination.diversity=u.Param() +# p.scans.MF.illumination.diversity.power = 0.1 +# p.scans.MF.illumination.diversity.noise = (np.pi, 3.0) +# position distance in fraction of illumination frame +p.scans.MF.data.density = 0.2 +# total number of photons in the empty beam +p.scans.MF.data.photons = 1e8 +# Gaussian FWHM of possible detector blurring +p.scans.MF.data.psf = 0. +#p.scans.MF.data.add_poisson_noise = False + +# attach a reconstruction engine +p.engines = u.Param() +p.engines.engine00 = u.Param() +p.engines.engine00.name = 'DM_pycuda' +p.engines.engine00.probe_support = 1 +p.engines.engine00.numiter = 1000 +p.engines.engine00.numiter_contiguous = 10 +p.engines.engine00.position_refinement = u.Param() +p.engines.engine00.position_refinement.start = 50 +p.engines.engine00.position_refinement.stop = 950 +p.engines.engine00.position_refinement.interval = 10 +p.engines.engine00.position_refinement.nshifts = 32 +p.engines.engine00.position_refinement.amplitude = 5e-7 +p.engines.engine00.position_refinement.max_shift = 1e-6 +p.engines.engine00.position_refinement.method = "GridSearch" + +# prepare and run +P = Ptycho(p, level=4) + +# Mess up the positions +a = 0.
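+# The loop below records each true scan position, then shifts the view by a +# rotating ~3e-7 m offset floored to a whole number of object pixels, giving +# the position refinement a known error to recover.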
+ +coords = [] +coords_start = [] +for pname, pod in P.pods.items(): + + # Save real position + coords.append(np.copy(pod.ob_view.coord)) + before = pod.ob_view.coord + psize = pod.pr_view.psize + perturbation = psize * ((3e-7 * np.array([np.sin(a), np.cos(a)])) // psize) + new_coord = before + perturbation # make sure integer number of pixels shift + pod.ob_view.coord = new_coord + coords_start.append(np.copy(pod.ob_view.coord)) + #pod.diff *= np.random.uniform(0.1,1) + a += 4. + +np.savetxt("positions_theory.txt", coords) +np.savetxt("positions_start.txt", coords_start) +P.obj.reformat() # update the object storage + +# Run +P.run() +P.finalize() + diff --git a/templates/position_refinement_DM_serial.py b/templates/position_refinement_DM_serial.py index 523dfd486..6c5584cfd 100644 --- a/templates/position_refinement_DM_serial.py +++ b/templates/position_refinement_DM_serial.py @@ -8,7 +8,6 @@ from ptypy.core import Ptycho from ptypy import utils as u -from ptypy.accelerate.cuda_pycuda.engines import DM_pycuda_stream, DM_pycuda_streams, DM_pycuda from ptypy.accelerate.base.engines import DM_serial @@ -16,12 +15,13 @@ # for verbose output p.verbose_level = 3 -p.frames_per_block = 300 +p.frames_per_block = 100 # set home path p.io = u.Param() p.io.home = "~/dumps/ptypy/" p.io.autosave = u.Param(active=True, interval=500) p.io.autoplot = u.Param(active=False)#True, interval=100) +p.io.interaction = u.Param(active=False) # max 200 frames (128x128px) of diffraction data p.scans = u.Param() @@ -32,7 +32,7 @@ p.scans.MF.data= u.Param() p.scans.MF.data.name = 'MoonFlowerScan' p.scans.MF.data.shape = 128 -p.scans.MF.data.num_frames = 2000 +p.scans.MF.data.num_frames = 200 p.scans.MF.data.save = None p.scans.MF.illumination = u.Param(diversity=None) @@ -43,25 +43,26 @@ # position distance in fraction of illumination frame p.scans.MF.data.density = 0.2 # total number of photon in empty beam -p.scans.MF.data.photons = 1e6 +p.scans.MF.data.photons = 1e8 # Gaussian FWHM of possible detector blurring p.scans.MF.data.psf = 0. -p.scans.MF.data.add_poisson_noise = False - +#p.scans.MF.data.add_poisson_noise = False # attach a reconstrucion engine p.engines = u.Param() p.engines.engine00 = u.Param() -p.engines.engine00.name = 'DM_pycuda' -p.engines.engine00.numiter = 1000 +p.engines.engine00.name = 'DM_serial' +p.engines.engine00.probe_support = 1 +p.engines.engine00.numiter = 100 p.engines.engine00.numiter_contiguous = 10 p.engines.engine00.position_refinement = u.Param() p.engines.engine00.position_refinement.start = 50 p.engines.engine00.position_refinement.stop = 950 p.engines.engine00.position_refinement.interval = 10 -p.engines.engine00.position_refinement.nshifts = 16 -p.engines.engine00.position_refinement.amplitude = 1e-6 -p.engines.engine00.position_refinement.max_shift = 2e-6 +p.engines.engine00.position_refinement.nshifts = 32 +p.engines.engine00.position_refinement.amplitude = 5e-7 +p.engines.engine00.position_refinement.max_shift = 1e-6 +p.engines.engine00.position_refinement.method = "GridSearch" # prepare and run P = Ptycho(p, level=4) @@ -70,23 +71,25 @@ a = 0.
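+# As in the pycuda template above: record the true coordinates, apply the +# integer-pixel perturbation, and keep the perturbed starting coordinates +# for later comparison with the refined positions.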
coords = [] +coords_start = [] for pname, pod in P.pods.items(): + # Save real position coords.append(np.copy(pod.ob_view.coord)) before = pod.ob_view.coord psize = pod.pr_view.psize - perturbation = psize * ((3e-7 * np.array([np.sin(a), np.cos(a)])) // psize) new_coord = before + perturbation # make sure integer number of pixels shift pod.ob_view.coord = new_coord - + coords_start.append(np.copy(pod.ob_view.coord)) #pod.diff *= np.random.uniform(0.1,1)y a += 4. -# np.savetxt("positions_theory.txt", coords) +np.savetxt("positions_theory.txt", coords) +np.savetxt("positions_start.txt", coords_start) P.obj.reformat()# update the object storage - # Run P.run() +P.finalize() diff --git a/test/accelerate_tests/base_tests/address_manglers_test.py b/test/accelerate_tests/base_tests/address_manglers_test.py index 11af45e42..7e27c885a 100644 --- a/test/accelerate_tests/base_tests/address_manglers_test.py +++ b/test/accelerate_tests/base_tests/address_manglers_test.py @@ -1,7 +1,7 @@ import unittest import sys import numpy as np -from ptypy.accelerate.base.address_manglers import RandomIntMangle +from ptypy.accelerate.base.address_manglers import BaseMangler, RandomIntMangler COMPLEX_TYPE = np.complex64 FLOAT_TYPE = np.float32 @@ -16,14 +16,8 @@ def setUp(self): def tearDown(self): np.set_printoptions() - def test_addr_original_set(self): - - max_bound = 10 - step_size = 3 - scan_pts = 2 + def prepare_addresses(self, max_bound=10, scan_pts=2, num_modes=3): total_number_scan_positions = scan_pts ** 2 - num_modes = 3 - X, Y = np.meshgrid(range(scan_pts), range(scan_pts)) X = X.reshape((total_number_scan_positions)) + max_bound # max bound is added in the DM_serial engine. Y = Y.reshape((total_number_scan_positions)) + max_bound @@ -44,26 +38,59 @@ def test_addr_original_set(self): mode_idx += 1 exit_idx += 1 position_idx += 1 + + return addr_original - print(repr(addr_original)) - - old_positions = np.zeros((total_number_scan_positions)) - - differences_from_original = np.zeros((len(addr_original), 2)) - differences_from_original[::2] = 12 # so definitely more than the max_bound - new_positions = addr_original[:, 0, 1, 1:] + differences_from_original - - mangler = RandomIntMangle(step_size, 50, 100, max_bound=max_bound, ) - - - mangler.apply_bounding_box(new_positions, old_positions, addr_original) - print(repr(new_positions)) - expected_new_positions = new_positions[:] - expected_new_positions[::2] = 0 - - print(repr(expected_new_positions)) - - np.testing.assert_array_equal(expected_new_positions, new_positions) - - + def test_apply_bounding_box(self): + scan_pts=2 + max_bound=10 + addr = self.prepare_addresses(scan_pts=scan_pts, max_bound=max_bound) + step_size = 3 + + mangler = BaseMangler(step_size, 50, 100, nshifts=1, max_bound=max_bound, ) + min_oby = 1 + max_oby = 10 + min_obx = 2 + max_obx = 9 + mangler.apply_bounding_box(addr[:,:,1,1], min_oby, max_oby) + mangler.apply_bounding_box(addr[:,:,1,2], min_obx, max_obx) + + np.testing.assert_array_less(addr[:,:,1,1], max_oby+1) + np.testing.assert_array_less(addr[:,:,1,2], max_obx+1) + np.testing.assert_array_less(min_oby-1, addr[:,:,1,1]) + np.testing.assert_array_less(min_obx-1, addr[:,:,1,2]) + + + def test_get_address(self): + # the other manglers are using the BaseMangler's get_address function + # so we set the deltas in a BaseMangler object and test get_address + + scan_pts=2 + addr_original = self.prepare_addresses(scan_pts=scan_pts) + total_number_scan_positions = scan_pts ** 2 + addr1 = np.copy(addr_original) + addr2 = 
np.copy(addr_original) + nshifts=1 + step_size=2 + mglr = BaseMangler(step_size, 50, 100, nshifts, max_bound=2) + # 2 shifts, with positive/negative shifting + mglr.delta = np.array([ + [1, 2], + [-4, -2] + ]) + mglr.get_address(0, addr_original, addr1, 10, 9) + mglr.get_address(1, addr_original, addr2, 10, 9) + + exp1 = np.copy(addr_original) + exp2 = np.copy(addr_original) + # element-wise here to prepare reference + for f in range(addr_original.shape[0]): + for m in range(addr_original.shape[1]): + exp1[f, m, 1, 1] = max(0, min(10, addr_original[f, m, 1, 1] + 1)) + exp1[f, m, 1, 2] = max(0, min(9, addr_original[f, m, 1, 2] + 2)) + exp2[f, m, 1, 1] = max(0, min(10, addr_original[f, m, 1, 1] - 4)) + exp2[f, m, 1, 2] = max(0, min(9, addr_original[f, m, 1, 2] - 2)) + + np.testing.assert_array_equal(addr1, exp1) + np.testing.assert_array_equal(addr2, exp2) diff --git a/test/accelerate_tests/base_tests/array_utils_test.py b/test/accelerate_tests/base_tests/array_utils_test.py index f1a182ab0..b1cac58fe 100644 --- a/test/accelerate_tests/base_tests/array_utils_test.py +++ b/test/accelerate_tests/base_tests/array_utils_test.py @@ -2,7 +2,6 @@ Tests for the array_utils module ''' - import unittest import numpy as np from ptypy.accelerate.base import FLOAT_TYPE, COMPLEX_TYPE @@ -12,7 +11,7 @@ class ArrayUtilsTest(unittest.TestCase): def test_dot_resolution(self): - X,Y,Z = np.indices((3,3,1001), dtype=np.float32) + X, Y, Z = np.indices((3, 3, 1001), dtype=np.float32) A = 10 ** Y + 1j * 10 ** X out = au.dot(A, A) np.testing.assert_array_equal(out, 60666606.0) @@ -21,7 +20,7 @@ def test_abs2_real_input(self): single_dim = 50.0 npts = single_dim ** 3 array_to_be_absed = np.arange(npts) - absed = np.array([ix**2 for ix in array_to_be_absed]) + absed = np.array([ix ** 2 for ix in array_to_be_absed]) array_shape = (int(single_dim), int(single_dim), int(single_dim)) array_to_be_absed.reshape(array_shape) absed.reshape(array_shape) @@ -29,13 +28,12 @@ def test_abs2_real_input(self): np.testing.assert_array_equal(absed, out) self.assertEqual(absed.dtype, np.float) - def test_abs2_complex_input(self): single_dim = 50.0 array_shape = (int(single_dim), int(single_dim), int(single_dim)) npts = single_dim ** 3 array_to_be_absed = np.arange(npts) + 1j * np.arange(npts) - absed = np.array([np.abs(ix**2) for ix in array_to_be_absed]) + absed = np.array([np.abs(ix ** 2) for ix in array_to_be_absed]) absed.reshape(array_shape) array_to_be_absed.reshape(array_shape) out = au.abs2(array_to_be_absed) @@ -53,7 +51,7 @@ def test_sum_to_buffer(self): # fill the input array for idx in range(I): - in1[idx] = np.ones((M, N))* (idx + 1.0) + in1[idx] = np.ones((M, N)) * (idx + 1.0) outshape = (X, M, N) expected_out = np.empty(outshape) @@ -64,9 +62,9 @@ def test_sum_to_buffer(self): in1_addr = np.empty((I, 3)) in1_addr = np.array([(0, 0, 0), - (1, 0, 0), - (2, 0, 0), - (3, 0, 0)]) + (1, 0, 0), + (2, 0, 0), + (3, 0, 0)]) out1_addr = np.empty_like(in1_addr) out1_addr = np.array([(0, 0, 0), @@ -77,7 +75,6 @@ def test_sum_to_buffer(self): out = au.sum_to_buffer(in1, outshape, in1_addr, out1_addr, dtype=FLOAT_TYPE) np.testing.assert_array_equal(out, expected_out) - def test_sum_to_buffer_complex(self): I = 4 @@ -89,20 +86,20 @@ def test_sum_to_buffer_complex(self): # fill the input array for idx in range(I): - in1[idx] = np.ones((M, N))* (idx + 1.0) + 1j * np.ones((M, N))* (idx + 1.0) + in1[idx] = np.ones((M, N)) * (idx + 1.0) + 1j * np.ones((M, N)) * (idx + 1.0) outshape = (X, M, N) expected_out = np.empty(outshape, 
dtype=COMPLEX_TYPE) - expected_out[0] = np.ones((M, N)) * 4.0 + 1j * np.ones((M, N))* 4.0 - expected_out[1] = np.ones((M, N)) * 6.0+ 1j * np.ones((M, N))* 6.0 + expected_out[0] = np.ones((M, N)) * 4.0 + 1j * np.ones((M, N)) * 4.0 + expected_out[1] = np.ones((M, N)) * 6.0 + 1j * np.ones((M, N)) * 6.0 in1_addr = np.empty((I, 3)) in1_addr = np.array([(0, 0, 0), - (1, 0, 0), - (2, 0, 0), - (3, 0, 0)]) + (1, 0, 0), + (2, 0, 0), + (3, 0, 0)]) out1_addr = np.empty_like(in1_addr) out1_addr = np.array([(0, 0, 0), @@ -120,7 +117,7 @@ def test_norm2_1d_real(self): np.testing.assert_array_equal(out, 5.0) def test_norm2_1d_complex(self): - a = np.array([1.0+1.0j, 2.0+2.0j], dtype=COMPLEX_TYPE) + a = np.array([1.0 + 1.0j, 2.0 + 2.0j], dtype=COMPLEX_TYPE) out = au.norm2(a) np.testing.assert_array_equal(out, 10.0) @@ -131,22 +128,22 @@ def test_norm2_2d_real(self): np.testing.assert_array_equal(out, 30.0) def test_norm2_2d_complex(self): - a = np.array([[1.0+1.0j, 2.0+2.0j], - [3.0+3.0j, 4.0+4.0j]], dtype=COMPLEX_TYPE) + a = np.array([[1.0 + 1.0j, 2.0 + 2.0j], + [3.0 + 3.0j, 4.0 + 4.0j]], dtype=COMPLEX_TYPE) out = au.norm2(a) np.testing.assert_array_equal(out, 60.0) def test_norm2_3d_real(self): a = np.array([[[1.0, 2.0], - [3.0, 4.0]], + [3.0, 4.0]], [[5.0, 6.0], [7.0, 8.0]]], dtype=FLOAT_TYPE) out = au.norm2(a) np.testing.assert_array_equal(out, 204.0) def test_norm2_3d_complex(self): - a = np.array([[[1.0+1.0j, 2.0+2.0j], - [3.0+3.0j, 4.0+4.0j]], + a = np.array([[[1.0 + 1.0j, 2.0 + 2.0j], + [3.0 + 3.0j, 4.0 + 4.0j]], [[5.0 + 5.0j, 6.0 + 6.0j], [7.0 + 7.0j, 8.0 + 8.0j]]], dtype=COMPLEX_TYPE) out = au.norm2(a) @@ -154,46 +151,45 @@ def test_norm2_3d_complex(self): def test_complex_gaussian_filter_2d(self): data = np.zeros((8, 8), dtype=COMPLEX_TYPE) - data[3:5, 3:5] = 2.0+2.0j - mfs = 3.0,4.0 + data[3:5, 3:5] = 2.0 + 2.0j + mfs = 3.0, 4.0 out = au.complex_gaussian_filter(data, mfs) expected_out = np.array([0.11033735 + 0.11033735j, 0.11888228 + 0.11888228j, 0.13116673 + 0.13116673j , 0.13999543 + 0.13999543j, 0.13999543 + 0.13999543j, 0.13116673 + 0.13116673j , 0.11888228 + 0.11888228j, 0.11033735 + 0.11033735j], dtype=COMPLEX_TYPE) np.testing.assert_array_almost_equal(np.diagonal(out), expected_out) - def test_complex_gaussian_filter_2d_batched(self): batch_number = 2 A = 5 B = 5 data = np.zeros((batch_number, A, B), dtype=COMPLEX_TYPE) - data[:, 2:3, 2:3] = 2.0+2.0j - mfs = 3.0,4.0 + data[:, 2:3, 2:3] = 2.0 + 2.0j + mfs = 3.0, 4.0 out = au.complex_gaussian_filter(data, mfs) - expected_out = np.array([[[ 0.07988770+0.0798877j, 0.07989411+0.07989411j, 0.07989471+0.07989471j, - 0.07989411+0.07989411j, 0.07988770+0.0798877j], - [ 0.08003781+0.08003781j, 0.08004424+0.08004424j, 0.08004485+0.08004485j, - 0.08004424+0.08004424j, 0.08003781+0.08003781j], - [ 0.08012911+0.08012911j, 0.08013555+0.08013555j, 0.08013615+0.08013615j, - 0.08013555+0.08013555j, 0.08012911+0.08012911j], - [ 0.08003781+0.08003781j, 0.08004424+0.08004424j, 0.08004485+0.08004485j, - 0.08004424+0.08004424j, 0.08003781+0.08003781j], - [ 0.07988770+0.0798877j, 0.07989411+0.07989411j, 0.07989471+0.07989471j, - 0.07989411+0.07989411j, 0.07988770+0.0798877j ]], - - [[ 0.07988770+0.0798877j, 0.07989411+0.07989411j, 0.07989471+0.07989471j, - 0.07989411+0.07989411j, 0.07988770+0.0798877j ], - [ 0.08003781+0.08003781j, 0.08004424+0.08004424j, 0.08004485+0.08004485j, - 0.08004424+0.08004424j, 0.08003781+0.08003781j], - [ 0.08012911+0.08012911j, 0.08013555+0.08013555j, 0.08013615+0.08013615j, - 0.08013555+0.08013555j, 0.08012911+0.08012911j], - [ 
0.08003781+0.08003781j, 0.08004424+0.08004424j, 0.08004485+0.08004485j, - 0.08004424+0.08004424j, 0.08003781+0.08003781j], - [ 0.07988770+0.0798877j, 0.07989411+0.07989411j, 0.07989471+0.07989471j, - 0.07989411+0.07989411j, 0.07988770+0.0798877j ]]], dtype=COMPLEX_TYPE) + expected_out = np.array([[[0.07988770 + 0.0798877j, 0.07989411 + 0.07989411j, 0.07989471 + 0.07989471j, + 0.07989411 + 0.07989411j, 0.07988770 + 0.0798877j], + [0.08003781 + 0.08003781j, 0.08004424 + 0.08004424j, 0.08004485 + 0.08004485j, + 0.08004424 + 0.08004424j, 0.08003781 + 0.08003781j], + [0.08012911 + 0.08012911j, 0.08013555 + 0.08013555j, 0.08013615 + 0.08013615j, + 0.08013555 + 0.08013555j, 0.08012911 + 0.08012911j], + [0.08003781 + 0.08003781j, 0.08004424 + 0.08004424j, 0.08004485 + 0.08004485j, + 0.08004424 + 0.08004424j, 0.08003781 + 0.08003781j], + [0.07988770 + 0.0798877j, 0.07989411 + 0.07989411j, 0.07989471 + 0.07989471j, + 0.07989411 + 0.07989411j, 0.07988770 + 0.0798877j]], + + [[0.07988770 + 0.0798877j, 0.07989411 + 0.07989411j, 0.07989471 + 0.07989471j, + 0.07989411 + 0.07989411j, 0.07988770 + 0.0798877j], + [0.08003781 + 0.08003781j, 0.08004424 + 0.08004424j, 0.08004485 + 0.08004485j, + 0.08004424 + 0.08004424j, 0.08003781 + 0.08003781j], + [0.08012911 + 0.08012911j, 0.08013555 + 0.08013555j, 0.08013615 + 0.08013615j, + 0.08013555 + 0.08013555j, 0.08012911 + 0.08012911j], + [0.08003781 + 0.08003781j, 0.08004424 + 0.08004424j, 0.08004485 + 0.08004485j, + 0.08004424 + 0.08004424j, 0.08003781 + 0.08003781j], + [0.07988770 + 0.0798877j, 0.07989411 + 0.07989411j, 0.07989471 + 0.07989471j, + 0.07989411 + 0.07989411j, 0.07988770 + 0.0798877j]]], dtype=COMPLEX_TYPE) np.testing.assert_array_almost_equal(out, expected_out) @@ -206,13 +202,12 @@ def test_mass_center_2d(self): X, Y = np.meshgrid(x, x) Xoff = 5.0 Yoff = 2.0 - probe[0, (X-Xoff)**2 + (Y-Yoff)**2 < rad**2] = probe_vals + probe[0, (X - Xoff) ** 2 + (Y - Yoff) ** 2 < rad ** 2] = probe_vals com = au.mass_center(np.abs(probe[0])) expected_out = np.array([Yoff, Xoff]) + npts // 2 np.testing.assert_array_almost_equal(com, expected_out, decimal=6) - def test_mass_center_3d(self): npts = 64 probe = np.zeros((npts, npts, npts), dtype=COMPLEX_TYPE) @@ -223,7 +218,7 @@ def test_mass_center_3d(self): Xoff = 5.0 Yoff = 2.0 Zoff = 10.0 - probe[(X-Xoff)**2 + (Y-Yoff)**2 + (Z-Zoff)**2< rad**2] = probe_vals + probe[(X - Xoff) ** 2 + (Y - Yoff) ** 2 + (Z - Zoff) ** 2 < rad ** 2] = probe_vals com = au.mass_center(np.abs(probe)) expected_out = np.array([Yoff, Xoff, Zoff]) + npts // 2 @@ -238,28 +233,64 @@ def test_interpolated_shift(self): X, Y = np.meshgrid(x, x) Xoff = 5.0 Yoff = 2.0 - probe[0, (X-Xoff)**2 + (Y-Yoff)**2 < rad**2] = probe_vals + probe[0, (X - Xoff) ** 2 + (Y - Yoff) ** 2 < rad ** 2] = probe_vals offset = np.array([-Yoff, -Xoff]) not_shifted_probe = np.zeros((1, npts, npts), dtype=COMPLEX_TYPE) - not_shifted_probe[0, (X)**2 + (Y)**2 < rad**2] = probe_vals + not_shifted_probe[0, (X) ** 2 + (Y) ** 2 < rad ** 2] = probe_vals probe[0] = au.interpolated_shift(probe[0], offset) np.testing.assert_array_almost_equal(probe, not_shifted_probe, decimal=8) def test_clip_magnitudes_to_range(self): - data = np.ones((5,5), dtype=COMPLEX_TYPE) - data[2, 4] = 20.0*np.exp(1j*np.pi/2) - data[3, 1] = 0.2*np.exp(1j*np.pi/3) + data = np.ones((5, 5), dtype=COMPLEX_TYPE) + data[2, 4] = 20.0 * np.exp(1j * np.pi / 2) + data[3, 1] = 0.2 * np.exp(1j * np.pi / 3) clip_min = 0.5 clip_max = 2.0 expected_out = np.ones_like(data) - expected_out[2, 4] = 2.0*np.exp(1j*np.pi/2) - 
expected_out[3, 1] = 0.5*np.exp(1j*np.pi/3) + expected_out[2, 4] = 2.0 * np.exp(1j * np.pi / 2) + expected_out[3, 1] = 0.5 * np.exp(1j * np.pi / 3) au.clip_complex_magnitudes_to_range(data, clip_min, clip_max) - np.testing.assert_array_almost_equal(data, expected_out, decimal=7) # floating point precision I guess... - - - -if __name__=='__main__': - unittest.main() \ No newline at end of file + np.testing.assert_array_almost_equal(data, expected_out, decimal=7) # floating point precision I guess... + + def test_crop_pad_1(self): + # pad, integer, 2D + B = np.indices((4, 4), dtype=np.int) + A = np.zeros((6, 6), dtype=B.dtype) + au.crop_pad_2d_simple(A, B.sum(0)) + exp_A = np.array([[0, 0, 0, 0, 0, 0], + [0, 0, 1, 2, 3, 0], + [0, 1, 2, 3, 4, 0], + [0, 2, 3, 4, 5, 0], + [0, 3, 4, 5, 6, 0], + [0, 0, 0, 0, 0, 0]]) + np.testing.assert_equal(A, exp_A) + + def test_crop_pad_2(self): + # crop, float, 3D + B = np.indices((4, 4), dtype=np.float32) + A = np.zeros((2, 2, 2), dtype=B.dtype) + au.crop_pad_2d_simple(A, B) + exp_A = np.array([[[1., 1.], + [2., 2.]], + [[1., 2.], + [1., 2.]]], dtype=np.float32) + np.testing.assert_array_almost_equal(A, exp_A) + + def test_crop_pad_3(self): + # crop/pad, complex, 3D + B = np.indices((4, 3), dtype=np.complex64) + B = np.indices((4, 3), dtype=np.complex64) + 1j * B[::-1, :, :] + A = np.zeros((2, 2, 5), dtype=B.dtype) + au.crop_pad_2d_simple(A, B) + exp_A = np.array([[[0. + 0.j, 1. + 0.j, 1. + 1.j, 1. + 2.j, 0. + 0.j], + [0. + 0.j, 2. + 0.j, 2. + 1.j, 2. + 2.j, 0. + 0.j]], + [[0. + 0.j, 0. + 1.j, 1. + 1.j, 2. + 1.j, 0. + 0.j], + [0. + 0.j, 0. + 2.j, 1. + 2.j, 2. + 2.j, 0. + 0.j]]], + dtype=np.complex64) + np.testing.assert_array_almost_equal(A, exp_A) + + +if __name__ == '__main__': + unittest.main() diff --git a/test/accelerate_tests/base_tests/auxiliary_wave_kernel_test.py b/test/accelerate_tests/base_tests/auxiliary_wave_kernel_test.py index e38909e71..93e753a51 100644 --- a/test/accelerate_tests/base_tests/auxiliary_wave_kernel_test.py +++ b/test/accelerate_tests/base_tests/auxiliary_wave_kernel_test.py @@ -21,7 +21,7 @@ def setUp(self): def tearDown(self): np.set_printoptions() - def prepare_arrays(self): + def prepare_arrays(self, scan_points = None): B = 3 # frame size y C = 3 # frame size x @@ -34,7 +34,10 @@ def prepare_arrays(self): H = B + npts_greater_than # object size y I = C + npts_greater_than # object size x - scan_pts = 2 # one dimensional scan point number + if scan_points is None: + scan_pts = 2 # one dimensional scan point number + else: + scan_pts = scan_points total_number_scan_positions = scan_pts ** 2 total_number_modes = G * D @@ -76,25 +79,17 @@ def prepare_arrays(self): return addr, object_array, probe, exit_wave def test_build_aux_same_as_exit(self): - ''' - setup - ''' - - ''' - test - ''' + # setup addr, object_array, probe, exit_wave = self.prepare_arrays() auxiliary_wave = np.zeros_like(exit_wave) + # test AWK = AuxiliaryWaveKernel() alpha_set = 1.0 AWK.allocate() # doesn't actually do anything at the moment - AWK.build_aux(auxiliary_wave, addr, object_array, probe, exit_wave, alpha=alpha_set) - # print("auxiliary_wave after") - # print(repr(auxiliary_wave)) - + # assert expected_auxiliary_wave = np.array([[[-1. + 3.j, -1. + 3.j, -1. + 3.j], [-1. + 3.j, -1. + 3.j, -1. + 3.j], [-1. + 3.j, -1. + 3.j, -1. + 3.j]], @@ -143,32 +138,20 @@ def test_build_aux_same_as_exit(self): [[-16. + 16.j, -16. + 16.j, -16. + 16.j], [-16. + 16.j, -16. + 16.j, -16. + 16.j], [-16. + 16.j, -16. + 16.j, -16. 
+ 16.j]]], dtype=COMPLEX_TYPE) - np.testing.assert_array_equal(expected_auxiliary_wave, expected_auxiliary_wave, err_msg="The auxiliary_wave has not been updated as expected") def test_build_exit_aux_same_as_exit(self): - ''' - setup - ''' + # setup addr, object_array, probe, exit_wave = self.prepare_arrays() - - ''' - test - ''' auxiliary_wave = np.zeros_like(exit_wave) + # test AWK = AuxiliaryWaveKernel() AWK.allocate() - AWK.build_exit(auxiliary_wave, addr, object_array, probe, exit_wave) - # - # print("auxiliary_wave after") - # print(repr(auxiliary_wave)) - # - # print("exit_wave after") - # print(repr(exit_wave)) + # assert expected_auxiliary_wave = np.array([[[0. - 2.j, 0. - 2.j, 0. - 2.j], [0. - 2.j, 0. - 2.j, 0. - 2.j], [0. - 2.j, 0. - 2.j, 0. - 2.j]], @@ -217,10 +200,10 @@ def test_build_exit_aux_same_as_exit(self): [[0. - 16.j, 0. - 16.j, 0. - 16.j], [0. - 16.j, 0. - 16.j, 0. - 16.j], [0. - 16.j, 0. - 16.j, 0. - 16.j]]], dtype=COMPLEX_TYPE) - np.testing.assert_array_equal(auxiliary_wave, expected_auxiliary_wave, err_msg="The auxiliary_wave has not been updated as expected") + # assert expected_exit_wave = np.array([[[1. - 1.j, 1. - 1.j, 1. - 1.j], [1. - 1.j, 1. - 1.j, 1. - 1.j], [1. - 1.j, 1. - 1.j, 1. - 1.j]], @@ -269,24 +252,20 @@ def test_build_exit_aux_same_as_exit(self): [[16. + 0.j, 16. + 0.j, 16. + 0.j], [16. + 0.j, 16. + 0.j, 16. + 0.j], [16. + 0.j, 16. + 0.j, 16. + 0.j]]], dtype=COMPLEX_TYPE) - np.testing.assert_array_equal(exit_wave, expected_exit_wave, err_msg="The exit_wave has not been updated as expected") def test_build_aux_no_ex(self): - ''' - setup - ''' + # setup addr, object_array, probe, exit_wave = self.prepare_arrays() - - ''' - test - ''' auxiliary_wave = np.zeros_like(exit_wave) + # test AWK = AuxiliaryWaveKernel() AWK.allocate() AWK.build_aux_no_ex(auxiliary_wave, addr, object_array, probe, fac=1.0, add=False) + + # assert expected_auxiliary_wave = np.array([[[0. + 2.j, 0. + 2.j, 0. + 2.j], [0. + 2.j, 0. + 2.j, 0. + 2.j], [0. + 2.j, 0. + 2.j, 0. + 2.j]], @@ -337,9 +316,12 @@ def test_build_aux_no_ex(self): [0. + 16.j, 0. + 16.j, 0. + 16.j]]], dtype=np.complex64) np.testing.assert_array_equal(auxiliary_wave, expected_auxiliary_wave, err_msg="The auxiliary_wave has not been updated as expected") + + # test auxiliary_wave = exit_wave AWK.build_aux_no_ex(auxiliary_wave, addr, object_array, probe, fac=2.0, add=True) + # assert expected_auxiliary_wave = np.array([[[1. + 5.j, 1. + 5.j, 1. + 5.j], [1. + 5.j, 1. + 5.j, 1. + 5.j], [1. + 5.j, 1. + 5.j, 1. + 5.j]], @@ -391,5 +373,57 @@ def test_build_aux_no_ex(self): np.testing.assert_array_equal(auxiliary_wave, expected_auxiliary_wave, err_msg="The auxiliary_wave has not been updated as expected") + + def test_build_exit_alpha_tau(self): + + # setup + addr, object_array, probe, exit_wave = self.prepare_arrays(scan_points=1) + auxiliary_wave = np.zeros_like(exit_wave) + + # test + AWK = AuxiliaryWaveKernel() + AWK.allocate() + AWK.build_exit_alpha_tau(auxiliary_wave, addr, object_array, probe, exit_wave) + + # assert + expected_auxiliary_wave = np.array( + [[[0. -2.j, 0. -2.j, 0. -2.j], + [0. -2.j, 0. -2.j, 0. -2.j], + [0. -2.j, 0. -2.j, 0. -2.j]], + + [[0. -8.j, 0. -8.j, 0. -8.j], + [0. -8.j, 0. -8.j, 0. -8.j], + [0. -8.j, 0. -8.j, 0. -8.j]], + + [[0. -4.j, 0. -4.j, 0. -4.j], + [0. -4.j, 0. -4.j, 0. -4.j], + [0. -4.j, 0. -4.j, 0. 
-4.j]], + + [[0.-16.j, 0.-16.j, 0.-16.j], + [0.-16.j, 0.-16.j, 0.-16.j], + [0.-16.j, 0.-16.j, 0.-16.j]]], dtype=np.complex64) + np.testing.assert_array_equal(auxiliary_wave, expected_auxiliary_wave, + err_msg="The auxiliary_wave has not been updated as expected") + + # assert + expected_exit_wave = np.array( + [[[1. -1.j, 1. -1.j, 1. -1.j], + [1. -1.j, 1. -1.j, 1. -1.j], + [1. -1.j, 1. -1.j, 1. -1.j]], + + [[2. -6.j, 2. -6.j, 2. -6.j], + [2. -6.j, 2. -6.j, 2. -6.j], + [2. -6.j, 2. -6.j, 2. -6.j]], + + [[3. -1.j, 3. -1.j, 3. -1.j], + [3. -1.j, 3. -1.j, 3. -1.j], + [3. -1.j, 3. -1.j, 3. -1.j]], + + [[4.-12.j, 4.-12.j, 4.-12.j], + [4.-12.j, 4.-12.j, 4.-12.j], + [4.-12.j, 4.-12.j, 4.-12.j]]], dtype=np.complex64) + np.testing.assert_array_equal(exit_wave, expected_exit_wave, + err_msg="The exit_wave has not been updated as expected") + if __name__ == '__main__': unittest.main() diff --git a/test/accelerate_tests/base_tests/po_update_kernel_test.py b/test/accelerate_tests/base_tests/po_update_kernel_test.py index 15557e3d2..a8d20ce78 100644 --- a/test/accelerate_tests/base_tests/po_update_kernel_test.py +++ b/test/accelerate_tests/base_tests/po_update_kernel_test.py @@ -91,26 +91,15 @@ def prepare_arrays(self): return addr, object_array, object_array_denominator, probe, exit_wave, probe_denominator def test_ob_update(self): - ''' - setup - ''' + # setup addr, object_array, object_array_denominator, probe, exit_wave, probe_denominator = self.prepare_arrays() - ''' - test - ''' + # test POUK = PoUpdateKernel() - POUK.allocate() # doesn't do anything but is the call signature - - # print("object array denom before:") - # print(object_array_denominator) - POUK.ob_update(addr, object_array, object_array_denominator, probe, exit_wave) - # print("object array denom after:") - # print(repr(object_array_denominator)) - + # assert expected_object_array = np.array([[[15. + 1.j, 53. + 1.j, 53. + 1.j, 53. + 1.j, 53. + 1.j, 39. + 1.j, 1. + 1.j], [77. + 1.j, 201. + 1.j, 201. + 1.j, 201. + 1.j, 201. + 1.j, 125. + 1.j, 1. + 1.j], @@ -136,10 +125,10 @@ def test_ob_update(self): 4. + 4.j], [4. + 4.j, 4. + 4.j, 4. + 4.j, 4. + 4.j, 4. + 4.j, 4. + 4.j, 4. + 4.j]]], dtype=COMPLEX_TYPE) - np.testing.assert_array_equal(object_array, expected_object_array, err_msg="The object array has not been updated as expected") + # assert expected_object_array_denominator = np.array([[[12., 22., 22., 22., 22., 12., 2.], [22., 42., 42., 42., 42., 22., 2.], [22., 42., 42., 42., 42., 22., 2.], @@ -159,29 +148,15 @@ def test_ob_update(self): err_msg="The object array denominatorhas not been updated as expected") def test_pr_update(self): - ''' - setup - ''' + # setup addr, object_array, object_array_denominator, probe, exit_wave, probe_denominator = self.prepare_arrays() - ''' - test - ''' - POUK = PoUpdateKernel() + # test + POUK = PoUpdateKernel() POUK.allocate() # this doesn't do anything, but is the call pattern. - - # print("probe array before:") - # print(repr(probe)) - # print("probe denominator array before:") - # print(repr(probe_denominator)) - POUK.pr_update(addr, probe, probe_denominator, object_array, exit_wave) - # print("probe array after:") - # print(repr(probe)) - # print("probe denominator array after:") - # print(repr(probe_denominator)) - + # assert expected_probe = np.array([[[313. + 1.j, 313. + 1.j, 313. + 1.j, 313. + 1.j, 313. + 1.j], [313. + 1.j, 313. + 1.j, 313. + 1.j, 313. + 1.j, 313. + 1.j], [313. + 1.j, 313. + 1.j, 313. + 1.j, 313. + 1.j, 313. + 1.j], @@ -194,9 +169,10 @@ def test_pr_update(self): [394. 
+ 2.j, 394. + 2.j, 394. + 2.j, 394. + 2.j, 394. + 2.j], [394. + 2.j, 394. + 2.j, 394. + 2.j, 394. + 2.j, 394. + 2.j]]], dtype=COMPLEX_TYPE) - np.testing.assert_array_equal(probe, expected_probe, err_msg="The probe has not been updated as expected") + + # assert expected_probe_denominator = np.array([[[138., 138., 138., 138., 138.], [138., 138., 138., 138., 138.], [138., 138., 138., 138., 138.], @@ -212,19 +188,15 @@ def test_pr_update(self): err_msg="The probe denominatorhas not been updated as expected") def test_pr_update_ML(self): - ''' - setup - ''' + # setup addr, object_array, object_array_denominator, probe, exit_wave, probe_denominator = self.prepare_arrays() - ''' - test - ''' - POUK = PoUpdateKernel() + # test + POUK = PoUpdateKernel() POUK.allocate() # this doesn't do anything, but is the call pattern. - POUK.pr_update_ML(addr, probe, object_array, exit_wave) + # assert expected_probe = np.array([[[625. + 1.j, 625. + 1.j, 625. + 1.j, 625. + 1.j, 625. + 1.j], [625. + 1.j, 625. + 1.j, 625. + 1.j, 625. + 1.j, 625. + 1.j], [625. + 1.j, 625. + 1.j, 625. + 1.j, 625. + 1.j, 625. + 1.j], @@ -237,26 +209,19 @@ def test_pr_update_ML(self): [786. + 2.j, 786. + 2.j, 786. + 2.j, 786. + 2.j, 786. + 2.j], [786. + 2.j, 786. + 2.j, 786. + 2.j, 786. + 2.j, 786. + 2.j]]], dtype=COMPLEX_TYPE) - np.testing.assert_array_equal(probe, expected_probe, err_msg="The probe has not been updated as expected") def test_ob_update_ML(self): - ''' - setup - ''' + # setup addr, object_array, object_array_denominator, probe, exit_wave, probe_denominator = self.prepare_arrays() - ''' - test - ''' - POUK = PoUpdateKernel() + # test + POUK = PoUpdateKernel() POUK.allocate() # this doesn't do anything, but is the call pattern. - POUK.ob_update_ML(addr, object_array, probe, exit_wave) - print(repr(object_array)) - + # assert expected_object_array = np.array( [[[29. + 1.j, 105. + 1.j, 105. + 1.j, 105. + 1.j, 105. + 1.j, 77. + 1.j, 1. + 1.j], [153. + 1.j, 401. + 1.j, 401. + 1.j, 401. + 1.j, 401. + 1.j, 249. + 1.j, 1. + 1.j], @@ -274,7 +239,147 @@ def test_ob_update_ML(self): [140. + 4.j, 324. + 4.j, 324. + 4.j, 324. + 4.j, 324. + 4.j, 188. + 4.j, 4. + 4.j], [4. + 4.j, 4. + 4.j, 4. + 4.j, 4. + 4.j, 4. + 4.j, 4. + 4.j, 4. + 4.j]]], dtype=COMPLEX_TYPE) + np.testing.assert_array_equal(object_array, expected_object_array, + err_msg="The object array has not been updated as expected") + + + def test_pr_update_local(self): + # setup + B = 5 # frame size y + C = 5 # frame size x + + D = 1 # number of probe modes + E = B # probe size y + F = C # probe size x + + npts_greater_than = 2 # how many points bigger than the probe the object is. 
+ G = 1 # number of object modes + H = B + npts_greater_than # object size y + I = C + npts_greater_than # object size x + + scan_pts = 1 # one dimensional scan point number + + total_number_scan_positions = scan_pts ** 2 + total_number_modes = G * D + A = total_number_scan_positions * total_number_modes # this is a 16 point scan pattern (4x4 grid) over all the modes + + probe = np.empty(shape=(D, E, F), dtype=COMPLEX_TYPE) + for idx in range(D): + probe[idx] = np.ones((E, F)) * (idx + 1) + 1j * np.ones((E, F)) * (idx + 1) + + object_array = np.empty(shape=(G, H, I), dtype=COMPLEX_TYPE) + for idx in range(G): + object_array[idx] = np.ones((H, I)) * (3 * idx + 1) + 1j * np.ones((H, I)) * (3 * idx + 1) + + exit_wave = np.empty(shape=(A, B, C), dtype=COMPLEX_TYPE) + for idx in range(A): + exit_wave[idx] = np.ones((B, C)) * (idx + 1) + 1j * np.ones((B, C)) * (idx + 1) + auxiliary_wave = exit_wave.copy() * 1.5 + + X, Y = np.meshgrid(range(scan_pts), range(scan_pts)) + X = X.reshape((total_number_scan_positions)) + Y = Y.reshape((total_number_scan_positions)) + + addr = np.zeros((total_number_scan_positions, total_number_modes, 5, 3), dtype=INT_TYPE) + + exit_idx = 0 + position_idx = 0 + for xpos, ypos in zip(X, Y): # + mode_idx = 0 + for pr_mode in range(D): + for ob_mode in range(G): + addr[position_idx, mode_idx] = np.array([[pr_mode, 0, 0], + [ob_mode, ypos, xpos], + [exit_idx, 0, 0], + [0, 0, 0], + [0, 0, 0]], dtype=INT_TYPE) + mode_idx += 1 + exit_idx += 1 + position_idx += 1 + + # test + POUK = PoUpdateKernel() + POUK.allocate() # this doesn't do anything, but is the call pattern. + POUK.pr_update_local(addr, probe, object_array, exit_wave, auxiliary_wave) + + # assert + expected_probe = np.array( + [[[0.49999994+1.j, 0.49999994+1.j, 0.49999994+1.j, 0.49999994+1.j, 0.49999994+1.j], + [0.49999994+1.j, 0.49999994+1.j, 0.49999994+1.j, 0.49999994+1.j, 0.49999994+1.j], + [0.49999994+1.j, 0.49999994+1.j, 0.49999994+1.j, 0.49999994+1.j, 0.49999994+1.j], + [0.49999994+1.j, 0.49999994+1.j, 0.49999994+1.j, 0.49999994+1.j, 0.49999994+1.j], + [0.49999994+1.j, 0.49999994+1.j, 0.49999994+1.j, 0.49999994+1.j, 0.49999994+1.j]]], dtype=COMPLEX_TYPE) + np.testing.assert_array_equal(probe, expected_probe, + err_msg="The probe has not been updated as expected") + + def test_ob_update_local(self): + # setup + B = 5 # frame size y + C = 5 # frame size x + + D = 1 # number of probe modes + E = B # probe size y + F = C # probe size x + + npts_greater_than = 2 # how many points bigger than the probe the object is. 
+ G = 1 # number of object modes + H = B + npts_greater_than # object size y + I = C + npts_greater_than # object size x + + scan_pts = 1 # one dimensional scan point number + + total_number_scan_positions = scan_pts ** 2 + total_number_modes = G * D + A = total_number_scan_positions * total_number_modes # this is a 16 point scan pattern (4x4 grid) over all the modes + + probe = np.empty(shape=(D, E, F), dtype=COMPLEX_TYPE) + for idx in range(D): + probe[idx] = np.ones((E, F)) * (idx + 1) + 1j * np.ones((E, F)) * (idx + 1) + + object_array = np.empty(shape=(G, H, I), dtype=COMPLEX_TYPE) + for idx in range(G): + object_array[idx] = np.ones((H, I)) * (3 * idx + 1) + 1j * np.ones((H, I)) * (3 * idx + 1) + + exit_wave = np.empty(shape=(A, B, C), dtype=COMPLEX_TYPE) + for idx in range(A): + exit_wave[idx] = np.ones((B, C)) * (idx + 1) + 1j * np.ones((B, C)) * (idx + 1) + auxiliary_wave = exit_wave.copy() * 2 + + X, Y = np.meshgrid(range(scan_pts), range(scan_pts)) + X = X.reshape((total_number_scan_positions)) + Y = Y.reshape((total_number_scan_positions)) + + addr = np.zeros((total_number_scan_positions, total_number_modes, 5, 3), dtype=INT_TYPE) + + exit_idx = 0 + position_idx = 0 + for xpos, ypos in zip(X, Y): # + mode_idx = 0 + for pr_mode in range(D): + for ob_mode in range(G): + addr[position_idx, mode_idx] = np.array([[pr_mode, 0, 0], + [ob_mode, ypos, xpos], + [exit_idx, 0, 0], + [0, 0, 0], + [0, 0, 0]], dtype=INT_TYPE) + mode_idx += 1 + exit_idx += 1 + position_idx += 1 + # test + POUK = PoUpdateKernel() + POUK.allocate() # this doesn't do anything, but is the call pattern. + POUK.ob_update_local(addr, object_array, probe, exit_wave, auxiliary_wave) + + # assert + expected_object_array = np.array( + [[[-1.1920929e-07+1.j, -1.1920929e-07+1.j, -1.1920929e-07+1.j, -1.1920929e-07+1.j, -1.1920929e-07+1.j, 1.0000000e+00+1.j, 1.0000000e+00+1.j], + [-1.1920929e-07+1.j, -1.1920929e-07+1.j, -1.1920929e-07+1.j, -1.1920929e-07+1.j, -1.1920929e-07+1.j, 1.0000000e+00+1.j, 1.0000000e+00+1.j], + [-1.1920929e-07+1.j, -1.1920929e-07+1.j, -1.1920929e-07+1.j, -1.1920929e-07+1.j, -1.1920929e-07+1.j, 1.0000000e+00+1.j, 1.0000000e+00+1.j], + [-1.1920929e-07+1.j, -1.1920929e-07+1.j, -1.1920929e-07+1.j, -1.1920929e-07+1.j, -1.1920929e-07+1.j, 1.0000000e+00+1.j, 1.0000000e+00+1.j], + [-1.1920929e-07+1.j, -1.1920929e-07+1.j, -1.1920929e-07+1.j, -1.1920929e-07+1.j, -1.1920929e-07+1.j, 1.0000000e+00+1.j, 1.0000000e+00+1.j], + [ 1.0000000e+00+1.j, 1.0000000e+00+1.j, 1.0000000e+00+1.j, 1.0000000e+00+1.j, 1.0000000e+00+1.j, 1.0000000e+00+1.j, 1.0000000e+00+1.j], + [ 1.0000000e+00+1.j, 1.0000000e+00+1.j, 1.0000000e+00+1.j, 1.0000000e+00+1.j, 1.0000000e+00+1.j, 1.0000000e+00+1.j, 1.0000000e+00+1.j]]], dtype=COMPLEX_TYPE) np.testing.assert_array_equal(object_array, expected_object_array, err_msg="The object array has not been updated as expected") diff --git a/test/accelerate_tests/base_tests/position_correction_kernel_test.py b/test/accelerate_tests/base_tests/position_correction_kernel_test.py index 20764e39a..117915f6b 100644 --- a/test/accelerate_tests/base_tests/position_correction_kernel_test.py +++ b/test/accelerate_tests/base_tests/position_correction_kernel_test.py @@ -6,6 +6,7 @@ import unittest import numpy as np from ptypy.accelerate.base.kernels import PositionCorrectionKernel +from ptypy import utils as u COMPLEX_TYPE = np.complex64 FLOAT_TYPE = np.float32 INT_TYPE = np.int32 @@ -16,6 +17,14 @@ class PositionCorrectionKernelTest(unittest.TestCase): def setUp(self): import sys 
np.set_printoptions(threshold=sys.maxsize, linewidth=np.inf) + self.params = u.Param() + self.params.nshifts = 4 + self.params.method = "Annealing" + self.params.amplitude = 2e-9 + self.params.start = 0 + self.params.stop = 10 + self.params.max_shift = 2e-9 + self.resolution = [1e-9,1e-9] def tearDown(self): np.set_printoptions() @@ -77,7 +86,7 @@ def test_build_aux(self): ''' auxiliary_wave = np.zeros((A, B, C), dtype=COMPLEX_TYPE) - PCK = PositionCorrectionKernel(auxiliary_wave, total_number_modes) + PCK = PositionCorrectionKernel(auxiliary_wave, total_number_modes, self.params, self.resolution) PCK.allocate() # doesn't actually do anything at the moment PCK.build_aux(auxiliary_wave, addr, object_array, probe) @@ -205,7 +214,7 @@ def test_fourier_error(self): mask_sum = mask.sum(-1).sum(-1) - PCK = PositionCorrectionKernel(auxiliary_wave, nmodes=total_number_modes) + PCK = PositionCorrectionKernel(auxiliary_wave, total_number_modes, self.params, self.resolution) PCK.allocate() PCK.fourier_error(auxiliary_wave, addr, fmag, mask, mask_sum) @@ -276,7 +285,7 @@ def test_error_reduce(self): addr = np.zeros((N, 1, 5, 3)) - PCK = PositionCorrectionKernel(fake_aux, nmodes=1) + PCK = PositionCorrectionKernel(fake_aux, 1, self.params, self.resolution) PCK.allocate() err_fmag = np.zeros(N, dtype=FLOAT_TYPE) PCK.error_reduce(addr, err_fmag) diff --git a/test/accelerate_tests/cuda_pycuda_tests/address_manglers_test.py b/test/accelerate_tests/cuda_pycuda_tests/address_manglers_test.py new file mode 100644 index 000000000..2704dcf97 --- /dev/null +++ b/test/accelerate_tests/cuda_pycuda_tests/address_manglers_test.py @@ -0,0 +1,77 @@ +import unittest +import numpy as np +from . import perfrun, PyCudaTest, have_pycuda + +if have_pycuda(): + from pycuda import gpuarray + from ptypy.accelerate.base import address_manglers as am + from ptypy.accelerate.cuda_pycuda import address_manglers as gam + + +COMPLEX_TYPE = np.complex64 +FLOAT_TYPE = np.float32 +INT_TYPE = np.int32 + +class AddressManglersTest(PyCudaTest): + + def prepare_addresses(self, max_bound=10, scan_pts=2, num_modes=3): + total_number_scan_positions = scan_pts ** 2 + X, Y = np.meshgrid(range(scan_pts), range(scan_pts)) + X = X.reshape((total_number_scan_positions)) + max_bound # max bound is added in the DM_serial engine. 
+ Y = Y.reshape((total_number_scan_positions)) + max_bound + + addr_original = np.zeros((total_number_scan_positions, num_modes, 5, 3), dtype=INT_TYPE) + + exit_idx = 0 + position_idx = 0 + for xpos, ypos in zip(X, Y): # + mode_idx = 0 + for pr_mode in range(num_modes): + for ob_mode in range(1): + addr_original[position_idx, mode_idx] = np.array([[pr_mode, 0, 0], + [ob_mode, ypos, xpos], + [exit_idx, 0, 0], + [0, 0, 0], + [0, 0, 0]], dtype=INT_TYPE) + mode_idx += 1 + exit_idx += 1 + position_idx += 1 + + return addr_original + + def test_get_address_REGRESSION(self): + # the other manglers are using the BaseMangler's get_address function + # so we set the deltas in a BaseMangler object and test get_address + + scan_pts=2 + addr_original = self.prepare_addresses(scan_pts=scan_pts) + addr_original_dev = gpuarray.to_gpu(addr_original) + nshifts=1 + step_size=2 + mglr = gam.BaseMangler(step_size, 50, 100, nshifts, max_bound=2) + # 2 shifts, with positive/negative shifting + mglr.delta = np.array([ + [1, 2], + [-4, -2] + ], dtype=np.int32) + mglr._setup_delta_gpu() + + addr1 = addr_original_dev.copy() + mglr.get_address(0, addr_original_dev, addr1, 10, 9) + + addr2 = addr_original_dev.copy() + mglr.get_address(1, addr_original_dev, addr2, 10, 9) + + exp1 = np.copy(addr_original) + exp2 = np.copy(addr_original) + # element-wise here to prepare reference + for f in range(addr_original.shape[0]): + for m in range(addr_original.shape[1]): + exp1[f, m, 1, 1] = max(0, min(10, addr_original[f, m, 1, 1] + 1)) + exp1[f, m, 1, 2] = max(0, min(9, addr_original[f, m, 1, 2] + 2)) + exp2[f, m, 1, 1] = max(0, min(10, addr_original[f, m, 1, 1] - 4)) + exp2[f, m, 1, 2] = max(0, min(9, addr_original[f, m, 1, 2] - 2)) + + np.testing.assert_array_equal(addr2.get(), exp2) + np.testing.assert_array_equal(addr1.get(), exp1) + diff --git a/test/accelerate_tests/cuda_pycuda_tests/array_utils_test.py b/test/accelerate_tests/cuda_pycuda_tests/array_utils_test.py index dcd133344..23950af26 100644 --- a/test/accelerate_tests/cuda_pycuda_tests/array_utils_test.py +++ b/test/accelerate_tests/cuda_pycuda_tests/array_utils_test.py @@ -88,7 +88,7 @@ def test_transpose_2D(self): out_dev = gpuarray.empty((3,5), dtype=np.int32) ## Act - AU = gau.ArrayUtilsKernel() + AU = gau.TransposeKernel() AU.transpose(inp_dev, out_dev) ## Assert @@ -103,7 +103,7 @@ def test_transpose_2D_large(self): out_dev = gpuarray.empty((61,137), dtype=np.int32) ## Act - AU = gau.ArrayUtilsKernel() + AU = gau.TransposeKernel() AU.transpose(inp_dev, out_dev) ## Assert @@ -118,7 +118,7 @@ def test_transpose_4D(self): out_dev = gpuarray.empty((5, 3, 250, 3), dtype=np.int32) ## Act - AU = gau.ArrayUtilsKernel() + AU = gau.TransposeKernel() AU.transpose(inp_dev.reshape(750, 15), out_dev.reshape(15, 750)) ## Assert @@ -128,124 +128,124 @@ def test_transpose_4D(self): def test_complex_gaussian_filter_1d_no_blurring_UNITY(self): # Arrange - inp = np.zeros((11,), dtype=np.complex64) - inp[5] = 1.0 +1.0j + data = np.zeros((11,), dtype=np.complex64) + data[5] = 1.0 +1.0j mfs = [0] - inp_dev = gpuarray.to_gpu(inp) - out_dev = gpuarray.empty((11,), dtype=np.complex64) + data_dev = gpuarray.to_gpu(data) + tmp_dev = gpuarray.empty((11,), dtype=np.complex64) # Act GS = gau.GaussianSmoothingKernel() - GS.convolution(inp_dev, out_dev, mfs) + GS.convolution(data_dev, mfs, tmp=tmp_dev) # Assert - out_exp = au.complex_gaussian_filter(inp, mfs) - out = out_dev.get() + out_exp = au.complex_gaussian_filter(data, mfs) + out = data_dev.get() 
self.assertTrue(np.testing.assert_allclose(out_exp, out, rtol=1e-5) is None) def test_complex_gaussian_filter_1d_little_blurring_UNITY(self): # Arrange - inp = np.zeros((11,), dtype=np.complex64) - inp[5] = 1.0 +1.0j + data = np.zeros((11,), dtype=np.complex64) + data[5] = 1.0 +1.0j mfs = [0.2] - inp_dev = gpuarray.to_gpu(inp) - out_dev = gpuarray.empty((11,), dtype=np.complex64) + data_dev = gpuarray.to_gpu(data) + tmp_dev = gpuarray.empty((11,), dtype=np.complex64) # Act GS = gau.GaussianSmoothingKernel() - GS.convolution(inp_dev, out_dev, mfs) + GS.convolution(data_dev, mfs, tmp=tmp_dev) # Assert - out_exp = au.complex_gaussian_filter(inp, mfs) - out = out_dev.get() + out_exp = au.complex_gaussian_filter(data, mfs) + out = data_dev.get() np.testing.assert_allclose(out_exp, out, rtol=1e-5) def test_complex_gaussian_filter_1d_more_blurring_UNITY(self): # Arrange - inp = np.zeros((11,), dtype=np.complex64) - inp[5] = 1.0 +1.0j + data = np.zeros((11,), dtype=np.complex64) + data[5] = 1.0 +1.0j mfs = [2.0] - inp_dev = gpuarray.to_gpu(inp) - out_dev = gpuarray.empty((11,), dtype=np.complex64) + data_dev = gpuarray.to_gpu(data) + tmp_dev = gpuarray.empty((11,), dtype=np.complex64) # Act GS = gau.GaussianSmoothingKernel() - GS.convolution(inp_dev, out_dev, mfs) + GS.convolution(data_dev, mfs, tmp=tmp_dev) # Assert - out_exp = au.complex_gaussian_filter(inp, mfs) - out = out_dev.get() + out_exp = au.complex_gaussian_filter(data, mfs) + out = data_dev.get() np.testing.assert_allclose(out_exp, out, rtol=1e-5) def test_complex_gaussian_filter_2d_no_blurring_UNITY(self): # Arrange - inp = np.zeros((11, 11), dtype=np.complex64) - inp[5, 5] = 1.0+1.0j + data = np.zeros((11, 11), dtype=np.complex64) + data[5, 5] = 1.0+1.0j mfs = 0.0,0.0 - inp_dev = gpuarray.to_gpu(inp) - out_dev = gpuarray.empty((11,11), dtype=np.complex64) + data_dev = gpuarray.to_gpu(data) + tmp_dev = gpuarray.empty((11,11), dtype=np.complex64) # Act GS = gau.GaussianSmoothingKernel() - GS.convolution(inp_dev, out_dev, mfs) + GS.convolution(data_dev, mfs, tmp=tmp_dev) # Assert - out_exp = au.complex_gaussian_filter(inp, mfs) - out = out_dev.get() + out_exp = au.complex_gaussian_filter(data, mfs) + out = data_dev.get() np.testing.assert_allclose(out_exp, out, rtol=1e-5) def test_complex_gaussian_filter_2d_little_blurring_UNITY(self): # Arrange - inp = np.zeros((11, 11), dtype=np.complex64) - inp[5, 5] = 1.0+1.0j + data = np.zeros((11, 11), dtype=np.complex64) + data[5, 5] = 1.0+1.0j mfs = 0.2,0.2 - inp_dev = gpuarray.to_gpu(inp) - out_dev = gpuarray.empty((11,11),dtype=np.complex64) + data_dev = gpuarray.to_gpu(data) + tmp_dev = gpuarray.empty((11,11),dtype=np.complex64) # Act GS = gau.GaussianSmoothingKernel() - GS.convolution(inp_dev, out_dev, mfs) + GS.convolution(data_dev, mfs, tmp=tmp_dev) # Assert - out_exp = au.complex_gaussian_filter(inp, mfs) - out = out_dev.get() + out_exp = au.complex_gaussian_filter(data, mfs) + out = data_dev.get() np.testing.assert_allclose(out_exp, out, rtol=1e-5) def test_complex_gaussian_filter_2d_more_blurring_UNITY(self): # Arrange - inp = np.zeros((8, 8), dtype=np.complex64) - inp[3:5, 3:5] = 2.0+2.0j + data = np.zeros((8, 8), dtype=np.complex64) + data[3:5, 3:5] = 2.0+2.0j mfs = 3.0,4.0 - inp_dev = gpuarray.to_gpu(inp) - out_dev = gpuarray.empty((8,8), dtype=np.complex64) + data_dev = gpuarray.to_gpu(data) + #tmp_dev = gpuarray.empty((8,8), dtype=np.complex64) # Act GS = gau.GaussianSmoothingKernel() - GS.convolution(inp_dev, out_dev, mfs) + GS.convolution(data_dev, mfs) # Assert - out_exp = 
au.complex_gaussian_filter(inp, mfs) - out = out_dev.get() + out_exp = au.complex_gaussian_filter(data, mfs) + out = data_dev.get() np.testing.assert_allclose(out_exp, out, rtol=1e-4) def test_complex_gaussian_filter_2d_nonsquare_UNITY(self): # Arrange - inp = np.zeros((32, 16), dtype=np.complex64) - inp[3:4, 11:12] = 2.0+2.0j - inp[3:5, 3:5] = 2.0+2.0j - inp[20:25,3:5] = 2.0+2.0j + data = np.zeros((32, 16), dtype=np.complex64) + data[3:4, 11:12] = 2.0+2.0j + data[3:5, 3:5] = 2.0+2.0j + data[20:25,3:5] = 2.0+2.0j mfs = 1.0,1.0 - inp_dev = gpuarray.to_gpu(inp) - out_dev = gpuarray.empty(inp.shape, dtype=np.complex64) + data_dev = gpuarray.to_gpu(data) + tmp_dev = gpuarray.empty(data_dev.shape, dtype=np.complex64) # Act GS = gau.GaussianSmoothingKernel() - GS.convolution(inp_dev, out_dev, mfs) + GS.convolution(data_dev, mfs, tmp=tmp_dev) # Assert - out_exp = au.complex_gaussian_filter(inp, mfs) - out = out_dev.get() + out_exp = au.complex_gaussian_filter(data, mfs) + out = data_dev.get() np.testing.assert_allclose(out_exp, out, rtol=1e-4) @@ -254,17 +254,152 @@ def test_complex_gaussian_filter_2d_batched(self): batch_number = 2 A = 5 B = 5 - inp = np.zeros((batch_number, A, B), dtype=np.complex64) - inp[:, 2:3, 2:3] = 2.0+2.0j + data = np.zeros((batch_number, A, B), dtype=np.complex64) + data[:, 2:3, 2:3] = 2.0+2.0j mfs = 3.0,4.0 - inp_dev = gpuarray.to_gpu(inp) - out_dev = gpuarray.empty((batch_number,A,B), dtype=np.complex64) + data_dev = gpuarray.to_gpu(data) + tmp_dev = gpuarray.empty((batch_number,A,B), dtype=np.complex64) # Act GS = gau.GaussianSmoothingKernel() - GS.convolution(inp_dev, out_dev, mfs) + GS.convolution(data_dev, mfs, tmp=tmp_dev) # Assert - out_exp = au.complex_gaussian_filter(inp, mfs) - out = out_dev.get() + out_exp = au.complex_gaussian_filter(data, mfs) + out = data_dev.get() np.testing.assert_allclose(out_exp, out, rtol=1e-4) + + + def test_crop_pad_simple_1_UNITY(self): + # pad, integer, 2D + B = np.indices((4, 4), dtype=int).sum(0) + A = np.zeros((6, 6), dtype=B.dtype) + B_dev = gpuarray.to_gpu(B) + A_dev = gpuarray.to_gpu(A) + + # Act + au.crop_pad_2d_simple(A, B) + k = gau.CropPadKernel(queue=self.stream) + k.crop_pad_2d_simple(A_dev, B_dev) + + # Assert + np.testing.assert_allclose(A, A_dev.get(), rtol=1e-6, atol=1e-6) + + def test_crop_pad_simple_2_UNITY(self): + # crop, float, 3D + B = np.indices((4, 4), dtype=np.float32) + A = np.zeros((2, 2, 2), dtype=B.dtype) + B_dev = gpuarray.to_gpu(B) + A_dev = gpuarray.to_gpu(A) + + # Act + au.crop_pad_2d_simple(A, B) + k = gau.CropPadKernel(queue=self.stream) + k.crop_pad_2d_simple(A_dev, B_dev) + + + # Assert + np.testing.assert_allclose(A, A_dev.get(), rtol=1e-6, atol=1e-6) + + def test_crop_pad_simple_3_UNITY(self): + # crop/pad, complex, 3D + B = np.indices((4, 3), dtype=np.complex64) + B = np.indices((4, 3), dtype=np.complex64) + 1j * B[::-1, :, :] + A = np.zeros((2, 2, 5), dtype=B.dtype) + B_dev = gpuarray.to_gpu(B) + A_dev = gpuarray.to_gpu(A) + + # Act + au.crop_pad_2d_simple(A, B) + k = gau.CropPadKernel(queue=self.stream) + k.crop_pad_2d_simple(A_dev, B_dev) + + # Assert + np.testing.assert_allclose(A, A_dev.get(), rtol=1e-6, atol=1e-6) + + def test_crop_pad_simple_difflike_UNITY(self): + np.random.seed(1983) + # crop/pad, 3D stack of 2D frames + D = np.random.randint(0, 3000, (100,256,256)).astype(np.float32) + A = np.zeros((100,260,260), dtype=D.dtype) + B = np.zeros((100,250,250), dtype=D.dtype) + B_dev = gpuarray.to_gpu(B) + A_dev = gpuarray.to_gpu(A) + D_dev = gpuarray.to_gpu(D) + + # Act + au.crop_pad_2d_simple(A, D)
+ au.crop_pad_2d_simple(B, D) + k = gau.CropPadKernel(queue=self.stream) + k.crop_pad_2d_simple(A_dev, D_dev) + k.crop_pad_2d_simple(B_dev, D_dev) + + # Assert + np.testing.assert_allclose(A, A_dev.get(), rtol=1e-6, atol=1e-6) + np.testing.assert_allclose(B, B_dev.get(), rtol=1e-6, atol=1e-6) + + def test_crop_pad_simple_oblike_UNITY(self): + np.random.seed(1983) + # crop/pad, 3D stack of 2D frames + B = np.random.rand(2,1230,1434).astype(np.complex64) \ +2j * np.pi * np.random.randn(2,1230,1434).astype(np.complex64) + A = np.ones((2,1000,1500), dtype=B.dtype) + B_dev = gpuarray.to_gpu(B) + A_dev = gpuarray.to_gpu(A) + + # Act + au.crop_pad_2d_simple(A, B) + k = gau.CropPadKernel(queue=self.stream) + k.crop_pad_2d_simple(A_dev, B_dev) + + # Assert + np.testing.assert_allclose(A, A_dev.get(), rtol=1e-6, atol=1e-6) + + def test_max_abs2_complex_UNITY(self): + np.random.seed(1983) + X = (np.random.randint(-1000, 1000, (3,100,200)).astype(np.float32) + \ + 1j * np.random.randint(-1000, 1000, (3,100,200)).astype(np.float32)).astype(np.complex64) + out = np.zeros((1,), dtype=np.float32) + X_dev = gpuarray.to_gpu(X) + out_dev = gpuarray.to_gpu(out) + + out = au.max_abs2(X) + + MAK = gau.MaxAbs2Kernel(queue=self.stream) + MAK.max_abs2(X_dev, out_dev) + + np.testing.assert_allclose(out_dev.get(), out, rtol=1e-6, atol=1e-6, + err_msg="The object norm array has not been updated as expected") + + def test_max_abs2_float_UNITY(self): + np.random.seed(1983) + X = np.random.randint(-1000, 1000, (3,100,200)).astype(np.float32) + + out = np.zeros((1,), dtype=np.float32) + X_dev = gpuarray.to_gpu(X) + out_dev = gpuarray.to_gpu(out) + + out = au.max_abs2(X) + + MAK = gau.MaxAbs2Kernel(queue=self.stream) + MAK.max_abs2(X_dev, out_dev) + + np.testing.assert_allclose(out_dev.get(), out, rtol=1e-6, atol=1e-6, + err_msg="The object norm array has not been updated as expected") + + + def test_clip_magnitudes_to_range_UNITY(self): + np.random.seed(1987) + A = np.random.random((2,10,10)) + B = A[0] + 1j* A[1] + B = B.astype(np.complex64) + B_gpu = gpuarray.to_gpu(B) + + au.clip_complex_magnitudes_to_range(B, 0.2,0.8) + CMK = gau.ClipMagnitudesKernel() + CMK.clip_magnitudes_to_range(B_gpu, 0.2, 0.8) + + np.testing.assert_allclose(B_gpu.get(), B, rtol=1e-6, atol=1e-6, + err_msg="The magnitudes of the array have not been clipped as expected") + + diff --git a/test/accelerate_tests/cuda_pycuda_tests/auxiliary_wave_kernel_test.py b/test/accelerate_tests/cuda_pycuda_tests/auxiliary_wave_kernel_test.py index bc38a62b1..71e8e1e7e 100644 --- a/test/accelerate_tests/cuda_pycuda_tests/auxiliary_wave_kernel_test.py +++ b/test/accelerate_tests/cuda_pycuda_tests/auxiliary_wave_kernel_test.py @@ -17,7 +17,7 @@ class AuxiliaryWaveKernelTest(PyCudaTest): - def prepare_arrays(self, performance=False): + def prepare_arrays(self, performance=False, scan_points=None): if not performance: B = 3 # frame size y C = 3 # frame size x @@ -27,8 +27,10 @@ def prepare_arrays(self): npts_greater_than = 2 # how many points bigger than the probe the object is.
G = 2 # number of object modes - - scan_pts = 2 # one dimensional scan point number + if scan_points is None: + scan_pts = 2 # one dimensional scan point number + else: + scan_pts = scan_points else: B = 128 C = 128 @@ -37,7 +39,10 @@ def prepare_arrays(self, performance=False): F = C npts_greater_than = 1215 G = 4 - scan_pts = 14 + if scan_points is None: + scan_pts = 14 + else: + scan_pts = scan_points H = B + npts_greater_than # object size y I = C + npts_greater_than # object size x @@ -189,6 +194,25 @@ def test_build_aux_same_as_exit_UNITY(self): np.testing.assert_array_equal(auxiliary_wave, auxiliary_wave_dev.get(), err_msg="The gpu auxiliary_wave does not look the same as the numpy version") + def test_build_aux2_same_as_exit_UNITY(self): + ## Arrange + addr, object_array, probe, exit_wave = self.prepare_arrays() + addr_dev, object_array_dev, probe_dev, exit_wave_dev = self.copy_to_gpu(addr, object_array, probe, exit_wave) + auxiliary_wave = np.zeros_like(exit_wave) + auxiliary_wave_dev = gpuarray.zeros_like(exit_wave_dev) + + ## Act + from ptypy.accelerate.base.kernels import AuxiliaryWaveKernel as npAuxiliaryWaveKernel + nAWK = npAuxiliaryWaveKernel() + AWK = AuxiliaryWaveKernel(self.stream) + alpha_set = FLOAT_TYPE(1.0) + + AWK.build_aux2(auxiliary_wave_dev, addr_dev, object_array_dev, probe_dev, exit_wave_dev, alpha=alpha_set) + nAWK.build_aux(auxiliary_wave, addr, object_array, probe, exit_wave, alpha=alpha_set) + + ## Assert + np.testing.assert_array_equal(auxiliary_wave, auxiliary_wave_dev.get(), + err_msg="The gpu auxiliary_wave does not look the same as the numpy version") def test_build_exit_aux_same_as_exit_REGRESSION(self): ## Arrange @@ -413,6 +437,27 @@ def test_build_aux_no_ex_noadd_UNITY(self): np.testing.assert_array_equal(auxiliary_wave_dev.get(), auxiliary_wave, err_msg="The auxiliary_wave does not match numpy") + def test_build_aux2_no_ex_noadd_UNITY(self): + ## Arrange + addr, object_array, probe, exit_wave = self.prepare_arrays() + addr_dev, object_array_dev, probe_dev, exit_wave_dev = self.copy_to_gpu(addr, object_array, probe, exit_wave) + auxiliary_wave_dev = gpuarray.zeros_like(exit_wave_dev) + auxiliary_wave = np.zeros_like(exit_wave) + + ## Act + AWK = AuxiliaryWaveKernel(self.stream) + AWK.allocate() + AWK.build_aux2_no_ex(auxiliary_wave_dev, addr_dev, object_array_dev, probe_dev, + fac=1.0, add=False) + from ptypy.accelerate.base.kernels import AuxiliaryWaveKernel as npAuxiliaryWaveKernel + nAWK = npAuxiliaryWaveKernel() + nAWK.allocate() + nAWK.build_aux_no_ex(auxiliary_wave, addr, object_array, probe, fac=1.0, add=False) + + ## Assert + np.testing.assert_array_equal(auxiliary_wave_dev.get(), auxiliary_wave, + err_msg="The auxiliary_wave does not match numpy") + def test_build_aux_no_ex_add_REGRESSION(self): ## Arrange @@ -500,6 +545,27 @@ def test_build_aux_no_ex_add_UNITY(self): np.testing.assert_array_equal(auxiliary_wave_dev.get(), auxiliary_wave, err_msg="The auxiliary_wave does not match numpy") + def test_build_aux2_no_ex_add_UNITY(self): + ## Arrange + addr, object_array, probe, exit_wave = self.prepare_arrays() + addr_dev, object_array_dev, probe_dev, exit_wave_dev = self.copy_to_gpu(addr, object_array, probe, exit_wave) + auxiliary_wave_dev = gpuarray.ones_like(exit_wave_dev) + auxiliary_wave = np.ones_like(exit_wave) + + ## Act + AWK = AuxiliaryWaveKernel(self.stream) + AWK.allocate() + AWK.build_aux2_no_ex(auxiliary_wave_dev, addr_dev, object_array_dev, probe_dev, + fac=2.0, add=True) + from ptypy.accelerate.base.kernels import 
AuxiliaryWaveKernel as npAuxiliaryWaveKernel + nAWK = npAuxiliaryWaveKernel() + nAWK.allocate() + nAWK.build_aux_no_ex(auxiliary_wave, addr, object_array, probe, fac=2.0, add=True) + + ## Assert + np.testing.assert_array_equal(auxiliary_wave_dev.get(), auxiliary_wave, + err_msg="The auxiliary_wave does not match numpy") + @unittest.skipIf(not perfrun, "performance test") def test_build_aux_no_ex_performance(self): @@ -512,5 +578,89 @@ def test_build_aux_no_ex_performance(self): AWK.build_aux_no_ex(auxiliary_wave, addr, object_array, probe, fac=1.0, add=False) + + def test_build_exit_alpha_tau_REGRESSION(self): + ## Arrange + addr, object_array, probe, exit_wave = self.prepare_arrays(scan_points=1) + addr, object_array, probe, exit_wave = self.copy_to_gpu(addr, object_array, probe, exit_wave) + auxiliary_wave = gpuarray.zeros_like(exit_wave) + + ## Act + AWK = AuxiliaryWaveKernel(self.stream) + AWK.allocate() + AWK.build_exit_alpha_tau(auxiliary_wave, addr, object_array, probe, exit_wave) + + # Assert + expected_auxiliary_wave = np.array( + [[[0. -2.j, 0. -2.j, 0. -2.j], + [0. -2.j, 0. -2.j, 0. -2.j], + [0. -2.j, 0. -2.j, 0. -2.j]], + + [[0. -8.j, 0. -8.j, 0. -8.j], + [0. -8.j, 0. -8.j, 0. -8.j], + [0. -8.j, 0. -8.j, 0. -8.j]], + + [[0. -4.j, 0. -4.j, 0. -4.j], + [0. -4.j, 0. -4.j, 0. -4.j], + [0. -4.j, 0. -4.j, 0. -4.j]], + + [[0.-16.j, 0.-16.j, 0.-16.j], + [0.-16.j, 0.-16.j, 0.-16.j], + [0.-16.j, 0.-16.j, 0.-16.j]]], dtype=np.complex64) + np.testing.assert_allclose(auxiliary_wave.get(), expected_auxiliary_wave, rtol=1e-6, atol=1e-6, + err_msg="The auxiliary_wave has not been updated as expected") + + expected_exit_wave = np.array( + [[[1. -1.j, 1. -1.j, 1. -1.j], + [1. -1.j, 1. -1.j, 1. -1.j], + [1. -1.j, 1. -1.j, 1. -1.j]], + + [[2. -6.j, 2. -6.j, 2. -6.j], + [2. -6.j, 2. -6.j, 2. -6.j], + [2. -6.j, 2. -6.j, 2. -6.j]], + + [[3. -1.j, 3. -1.j, 3. -1.j], + [3. -1.j, 3. -1.j, 3. -1.j], + [3. -1.j, 3. -1.j, 3. 
-1.j]], + + [[4.-12.j, 4.-12.j, 4.-12.j], + [4.-12.j, 4.-12.j, 4.-12.j], + [4.-12.j, 4.-12.j, 4.-12.j]]], dtype=np.complex64) + np.testing.assert_allclose(exit_wave.get(), expected_exit_wave, rtol=1e-6, atol=1e-6, + err_msg="The exit_wave has not been updated as expected") + + def test_build_exit_alpha_tau_UNITY(self): + ## Arrange + addr, object_array, probe, exit_wave = self.prepare_arrays(scan_points=1) + addr_dev, object_array_dev, probe_dev, exit_wave_dev = self.copy_to_gpu(addr, object_array, probe, exit_wave) + auxiliary_wave_dev = gpuarray.ones_like(exit_wave_dev) + auxiliary_wave = np.ones_like(exit_wave) + + ## Act + AWK = AuxiliaryWaveKernel(self.stream) + AWK.allocate() + AWK.build_exit_alpha_tau(auxiliary_wave_dev, addr_dev, object_array_dev, probe_dev, exit_wave_dev, alpha=0.8, tau=0.6) + from ptypy.accelerate.base.kernels import AuxiliaryWaveKernel as npAuxiliaryWaveKernel + nAWK = npAuxiliaryWaveKernel() + nAWK.allocate() + nAWK.build_exit_alpha_tau(auxiliary_wave, addr, object_array, probe, exit_wave, alpha=0.8, tau=0.6) + + ## Assert + np.testing.assert_allclose(auxiliary_wave_dev.get(), auxiliary_wave, rtol=1e-6, atol=1e-6, + err_msg="The auxiliary_wave does not match numpy") + ## Assert + np.testing.assert_allclose(exit_wave_dev.get(), exit_wave, rtol=1e-6, atol=1e-6, + err_msg="The exit_wave does not match numpy") + + @unittest.skipIf(not perfrun, "performance test") + def test_build_exit_alpha_tau_performance(self): + addr, object_array, probe, exit_wave = self.prepare_arrays(performance=True, scan_points=1) + addr, object_array, probe, exit_wave = self.copy_to_gpu(addr, object_array, probe, exit_wave) + auxiliary_wave = gpuarray.zeros_like(exit_wave) + + AWK = AuxiliaryWaveKernel(self.stream) + AWK.allocate() + AWK.build_exit_alpha_tau(auxiliary_wave, addr, object_array, probe, exit_wave, alpha=0.8, tau=0.6) + if __name__ == '__main__': unittest.main() diff --git a/test/accelerate_tests/cuda_pycuda_tests/dls_tests/__init__.py b/test/accelerate_tests/cuda_pycuda_tests/dls_tests/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/test/accelerate_tests/cuda_pycuda_tests/dls_tests/dls_auxiliary_wave_kernel_test.py b/test/accelerate_tests/cuda_pycuda_tests/dls_tests/dls_auxiliary_wave_kernel_test.py new file mode 100644 index 000000000..0d943c28e --- /dev/null +++ b/test/accelerate_tests/cuda_pycuda_tests/dls_tests/dls_auxiliary_wave_kernel_test.py @@ -0,0 +1,57 @@ +''' +Testing based on real data +''' +import h5py +import unittest +import numpy as np +from parameterized import parameterized +from .. 
import perfrun, PyCudaTest, have_pycuda + +if have_pycuda(): + from pycuda import gpuarray + from ptypy.accelerate.cuda_pycuda.kernels import AuxiliaryWaveKernel +from ptypy.accelerate.base.kernels import AuxiliaryWaveKernel as BaseAuxiliaryWaveKernel + +COMPLEX_TYPE = np.complex64 +FLOAT_TYPE = np.float32 +INT_TYPE = np.int32 + +class DlsAuxiliaryWaveKernelTest(PyCudaTest): + + datadir = "/dls/science/users/iat69393/gpu-hackathon/test-data-%s/" + rtol = 1e-6 + atol = 1e-6 + + @parameterized.expand([ + ["base", 10], + ["regul", 50], + ["floating", 0], + ]) + def test_build_aux_no_ex_noadd_UNITY(self, name, iter): + + # Load data + with h5py.File(self.datadir % name + "build_aux_no_ex_%04d.h5" %iter, "r") as f: + aux = f["aux"][:] + addr = f["addr"][:] + ob = f["ob"][:] + pr = f["pr"][:] + + # Copy data to device + aux_dev = gpuarray.to_gpu(aux) + addr_dev = gpuarray.to_gpu(addr) + ob_dev = gpuarray.to_gpu(ob) + pr_dev = gpuarray.to_gpu(pr) + + # CPU kernel + BAWK = BaseAuxiliaryWaveKernel() + BAWK.allocate() + BAWK.build_aux_no_ex(aux, addr, ob, pr, add=False) + + ## GPU kernel + AWK = AuxiliaryWaveKernel(self.stream) + AWK.allocate() + AWK.build_aux_no_ex(aux_dev, addr_dev, ob_dev, pr_dev, add=False) + + ## Assert + np.testing.assert_allclose(aux_dev.get(), aux, rtol=self.rtol, atol=self.atol, + err_msg="The auxiliary_wave does not match the base kernel output") \ No newline at end of file diff --git a/test/accelerate_tests/cuda_pycuda_tests/dls_tests/dls_drpycuda_test.py b/test/accelerate_tests/cuda_pycuda_tests/dls_tests/dls_drpycuda_test.py new file mode 100644 index 000000000..57f62f9dd --- /dev/null +++ b/test/accelerate_tests/cuda_pycuda_tests/dls_tests/dls_drpycuda_test.py @@ -0,0 +1,83 @@ +''' +Testing on real data +''' + +import h5py +import unittest +import numpy as np +from parameterized import parameterized +from .. 
import PyCudaTest, have_pycuda + +if have_pycuda(): + from pycuda import gpuarray + from ptypy.accelerate.cuda_pycuda.kernels import PoUpdateKernel +from ptypy.accelerate.base.kernels import PoUpdateKernel as BasePoUpdateKernel + +COMPLEX_TYPE = np.complex64 +FLOAT_TYPE = np.float32 +INT_TYPE = np.int32 + +class DlsDRpycudaTest(PyCudaTest): + + datadir = "/dls/science/users/iat69393/gpu-hackathon/test-data-dr/" + iter = 0 + rtol = 1e-6 + atol = 1e-6 + + def test_ob_update_local_UNITY(self): + + # Load data + with h5py.File(self.datadir + "ob_update_local_%04d.h5" %self.iter, "r") as f: + aux = f["aux"][:] + addr = f["addr"][:] + ob = f["ob"][:] + pr = f["pr"][:] + ex = f["ex"][:] + + # Copy data to device + aux_dev = gpuarray.to_gpu(aux) + ob_dev = gpuarray.to_gpu(ob) + pr_dev = gpuarray.to_gpu(pr) + ex_dev = gpuarray.to_gpu(ex) + addr_dev = gpuarray.to_gpu(addr) + + # CPU Kernel + BPOK = BasePoUpdateKernel() + BPOK.ob_update_local(addr, ob, pr, ex, aux) + + # GPU Kernel + POK = PoUpdateKernel() + POK.ob_update_local(addr_dev, ob_dev, pr_dev, ex_dev, aux_dev) + + ## Assert + np.testing.assert_allclose(ob_dev.get(), ob, atol=self.atol, rtol=self.rtol, verbose=False, + err_msg="The object array has not been updated as expected") + + def test_pr_update_local_UNITY(self): + + # Load data + with h5py.File(self.datadir + "pr_update_local_%04d.h5" %self.iter, "r") as f: + aux = f["aux"][:] + addr = f["addr"][:] + ob = f["ob"][:] + pr = f["pr"][:] + ex = f["ex"][:] + + # Copy data to device + aux_dev = gpuarray.to_gpu(aux) + ob_dev = gpuarray.to_gpu(ob) + pr_dev = gpuarray.to_gpu(pr) + ex_dev = gpuarray.to_gpu(ex) + addr_dev = gpuarray.to_gpu(addr) + + # CPU Kernel + BPOK = BasePoUpdateKernel() + BPOK.pr_update_local(addr, pr, ob, ex, aux) + + # GPU Kernel + POK = PoUpdateKernel() + POK.pr_update_local(addr_dev, pr_dev, ob_dev, ex_dev, aux_dev) + + ## Assert + np.testing.assert_allclose(pr_dev.get(), pr, atol=self.atol, rtol=self.rtol, verbose=False, + err_msg="The probe array has not been updated as expected") diff --git a/test/accelerate_tests/cuda_pycuda_tests/dls_tests/dls_gradient_descent_kernel_test.py b/test/accelerate_tests/cuda_pycuda_tests/dls_tests/dls_gradient_descent_kernel_test.py new file mode 100644 index 000000000..f62834e2e --- /dev/null +++ b/test/accelerate_tests/cuda_pycuda_tests/dls_tests/dls_gradient_descent_kernel_test.py @@ -0,0 +1,261 @@ +''' +Testing on real data +''' + +import h5py +import unittest +import numpy as np +from parameterized import parameterized +from ..
import perfrun, PyCudaTest, have_pycuda + +if have_pycuda(): + from pycuda import gpuarray + from ptypy.accelerate.cuda_pycuda.kernels import GradientDescentKernel +from ptypy.accelerate.base.kernels import GradientDescentKernel as BaseGradientDescentKernel + +COMPLEX_TYPE = np.complex64 +FLOAT_TYPE = np.float32 +INT_TYPE = np.int32 + +class DlsGradientDescentKernelTest(PyCudaTest): + + datadir = "/dls/science/users/iat69393/gpu-hackathon/test-data-%s/" + rtol = 1e-6 + atol = 1e-6 + + @parameterized.expand([ + ["base", 10], + ["regul", 50], + ["floating", 0], + ]) + def test_make_model_UNITY(self, name, iter): + + # Load data + with h5py.File(self.datadir %name + "make_model_%04d.h5" %iter, "r") as f: + aux = f["aux"][:] + addr = f["addr"][:] + + # Copy data to device + aux_dev = gpuarray.to_gpu(aux) + addr_dev = gpuarray.to_gpu(addr) + + # CPU Kernel + BGDK = BaseGradientDescentKernel(aux, addr.shape[1]) + BGDK.allocate() + BGDK.make_model(aux, addr) + + # GPU kernel + GDK = GradientDescentKernel(aux_dev, addr.shape[1]) + GDK.allocate() + GDK.make_model(aux_dev, addr_dev) + + ## Assert + np.testing.assert_allclose(BGDK.npy.Imodel, GDK.gpu.Imodel.get(), atol=self.atol, rtol=self.rtol, + err_msg="`Imodel` buffer has not been updated as expected") + + @parameterized.expand([ + ["base", 10], + ["regul", 50], + ["floating", 0], + ]) + def test_floating_intensity_UNITY(self, name, iter): + + # Load data + with h5py.File(self.datadir %name + "floating_intensities_%04d.h5" %iter, "r") as f: + w = f["w"][:] + addr = f["addr"][:] + I = f["I"][:] + fic = f["fic"][:] + Imodel = f["Imodel"][:] + with h5py.File(self.datadir %name + "make_model_%04d.h5" %iter, "r") as f: + aux = f["aux"][:] + + # Copy data to device + aux_dev = gpuarray.to_gpu(aux) + w_dev = gpuarray.to_gpu(w) + addr_dev = gpuarray.to_gpu(addr) + I_dev = gpuarray.to_gpu(I) + fic_dev = gpuarray.to_gpu(fic) + Imodel_dev = gpuarray.to_gpu(np.ascontiguousarray(Imodel)) + + # CPU Kernel + BGDK = BaseGradientDescentKernel(aux, addr.shape[1]) + BGDK.allocate() + BGDK.npy.Imodel = Imodel + BGDK.floating_intensity(addr, w, I, fic) + + # GPU kernel + GDK = GradientDescentKernel(aux_dev, addr.shape[1]) + GDK.allocate() + GDK.gpu.Imodel = Imodel_dev + GDK.floating_intensity(addr_dev, w_dev, I_dev, fic_dev) + + ## Assert + np.testing.assert_allclose(BGDK.npy.LLerr, GDK.gpu.LLerr.get(), atol=self.atol, rtol=self.rtol, + verbose=False, equal_nan=False, + err_msg="`LLerr` buffer has not been updated as expected") + np.testing.assert_allclose(BGDK.npy.LLden, GDK.gpu.LLden.get(), atol=self.atol, rtol=self.rtol, + verbose=False, equal_nan=False, + err_msg="`LLden` buffer has not been updated as expected") + np.testing.assert_allclose(BGDK.npy.fic_tmp, GDK.gpu.fic_tmp.get(), atol=self.atol, rtol=self.rtol, + verbose=False, equal_nan=False, + err_msg="`fic_tmp` buffer has not been updated as expected") + + np.testing.assert_allclose(fic, fic_dev.get(), atol=self.atol, rtol=self.rtol, + verbose=False, equal_nan=False, + err_msg="floating intensity coeff (fic) has not been updated as expected") + + np.testing.assert_allclose(BGDK.npy.Imodel, GDK.gpu.Imodel.get(), atol=self.atol, rtol=self.rtol, + verbose=False, equal_nan=False, + err_msg="`Imodel` buffer has not been updated as expected") + + + @parameterized.expand([ + ["base", 10], + ["regul", 50], + ["floating", 0], + ]) + def test_main_and_error_reduce_UNITY(self, name, iter): + + # Load data + with h5py.File(self.datadir %name + "main_%04d.h5" %iter, "r") as f: + aux = f["aux"][:] + addr = f["addr"][:] + 
w = f["w"][:] + I = f["I"][:] + # Load data + with h5py.File(self.datadir %name + "error_reduce_%04d.h5" %iter, "r") as f: + err_phot = f["err_phot"][:] + + # Copy data to device + aux_dev = gpuarray.to_gpu(aux) + w_dev = gpuarray.to_gpu(w) + addr_dev = gpuarray.to_gpu(addr) + I_dev = gpuarray.to_gpu(I) + err_phot_dev = gpuarray.to_gpu(err_phot) + + # CPU Kernel + BGDK = BaseGradientDescentKernel(aux, addr.shape[1]) + BGDK.allocate() + BGDK.main(aux, addr, w, I) + BGDK.error_reduce(addr, err_phot) + + # GPU kernel + GDK = GradientDescentKernel(aux_dev, addr.shape[1]) + GDK.allocate() + GDK.main(aux_dev, addr_dev, w_dev, I_dev) + GDK.error_reduce(addr_dev, err_phot_dev) + + ## Assert + np.testing.assert_allclose(aux, aux_dev.get(), atol=self.atol, rtol=self.rtol, + err_msg="Auxiliary has not been updated as expected") + np.testing.assert_allclose(BGDK.npy.LLerr, GDK.gpu.LLerr.get(), atol=self.atol, rtol=self.rtol, + err_msg="LogLikelihood error has not been updated as expected") + np.testing.assert_allclose(err_phot, err_phot_dev.get(), atol=self.atol, rtol=self.rtol, + err_msg="`err_phot` has not been updated as expected") + + @parameterized.expand([ + ["base", 10], + ["regul", 50], + ["floating", 0], + ]) + def test_make_a012_UNITY(self, name, iter): + + # Reduce the array size to make the tests run faster + Nmax = 10 + Ymax = 128 + Xmax = 128 + + # Load data + with h5py.File(self.datadir %name + "make_a012_%04d.h5" %iter, "r") as g: + addr = g["addr"][:Nmax] + I = g["I"][:Nmax,:Ymax,:Xmax] + f = g["f"][:Nmax,:Ymax,:Xmax] + a = g["a"][:Nmax,:Ymax,:Xmax] + b = g["b"][:Nmax,:Ymax,:Xmax] + fic = g["fic"][:Nmax] + with h5py.File(self.datadir %name + "make_model_%04d.h5" %iter, "r") as h: + aux = h["aux"][:Nmax,:Ymax,:Xmax] + + # Copy data to device + aux_dev = gpuarray.to_gpu(aux) + addr_dev = gpuarray.to_gpu(addr) + I_dev = gpuarray.to_gpu(I) + f_dev = gpuarray.to_gpu(f) + a_dev = gpuarray.to_gpu(a) + b_dev = gpuarray.to_gpu(b) + fic_dev = gpuarray.to_gpu(fic) + + # CPU Kernel + BGDK = BaseGradientDescentKernel(aux, addr.shape[1]) + BGDK.allocate() + BGDK.make_a012(f, a, b, addr, I, fic) + + # GPU kernel + GDK = GradientDescentKernel(aux_dev, addr.shape[1], queue=self.stream) + GDK.allocate() + GDK.gpu.Imodel.fill(np.nan) + GDK.gpu.LLerr.fill(np.nan) + GDK.gpu.LLden.fill(np.nan) + GDK.make_a012(f_dev, a_dev, b_dev, addr_dev, I_dev, fic_dev) + + ## Assert + np.testing.assert_allclose(GDK.gpu.Imodel.get(), BGDK.npy.Imodel, atol=self.atol, rtol=self.rtol, + err_msg="Imodel error has not been updated as expected") + np.testing.assert_allclose(GDK.gpu.LLerr.get(), BGDK.npy.LLerr, atol=self.atol, rtol=self.rtol, + err_msg="LLerr error has not been updated as expected") + np.testing.assert_allclose(GDK.gpu.LLden.get(), BGDK.npy.LLden, atol=self.atol, rtol=self.rtol, + err_msg="LLden error has not been updated as expected") + + @parameterized.expand([ + ["base", 10], + ["regul", 50], + ["floating", 0], + ]) + def test_fill_b_UNITY(self, name, iter): + + Nmax = 10 + Ymax = 128 + Xmax = 128 + + # Load data + with h5py.File(self.datadir %name + "fill_b_%04d.h5" %iter, "r") as f: + w = f["w"][:Nmax, :Ymax, :Xmax] + addr = f["addr"][:] + B = f["B"][:] + Brenorm = f["Brenorm"][...] 
+ A0 = f["A0"][:Nmax, :Ymax, :Xmax] + A1 = f["A1"][:Nmax, :Ymax, :Xmax] + A2 = f["A2"][:Nmax, :Ymax, :Xmax] + with h5py.File(self.datadir %name + "make_model_%04d.h5" %iter, "r") as f: + aux = f["aux"][:Nmax, :Ymax, :Xmax] + + # Copy data to device + aux_dev = gpuarray.to_gpu(aux) + w_dev = gpuarray.to_gpu(w) + addr_dev = gpuarray.to_gpu(addr) + B_dev = gpuarray.to_gpu(B.astype(np.float32)) + A0_dev = gpuarray.to_gpu(A0) + A1_dev = gpuarray.to_gpu(A1) + A2_dev = gpuarray.to_gpu(A2) + + # CPU Kernel + BGDK = BaseGradientDescentKernel(aux, addr.shape[1]) + BGDK.allocate() + BGDK.npy.Imodel = A0 + BGDK.npy.LLerr = A1 + BGDK.npy.LLden = A2 + BGDK.fill_b(addr, Brenorm, w, B) + + # GPU kernel + GDK = GradientDescentKernel(aux_dev, addr.shape[1]) + GDK.allocate() + GDK.gpu.Imodel = A0_dev + GDK.gpu.LLerr = A1_dev + GDK.gpu.LLden = A2_dev + GDK.fill_b(addr_dev, Brenorm, w_dev, B_dev) + + ## Assert + np.testing.assert_allclose(B, B_dev.get(), rtol=self.rtol, atol=self.atol, + err_msg="`B` has not been updated as expected") + diff --git a/test/accelerate_tests/cuda_pycuda_tests/dls_tests/dls_po_update_kernel_test.py b/test/accelerate_tests/cuda_pycuda_tests/dls_tests/dls_po_update_kernel_test.py new file mode 100644 index 000000000..3b8ee0474 --- /dev/null +++ b/test/accelerate_tests/cuda_pycuda_tests/dls_tests/dls_po_update_kernel_test.py @@ -0,0 +1,106 @@ +''' +Testing on real data +''' + +import h5py +import unittest +import numpy as np +from parameterized import parameterized +from .. import PyCudaTest, have_pycuda + +if have_pycuda(): + from pycuda import gpuarray + from ptypy.accelerate.cuda_pycuda.kernels import PoUpdateKernel +from ptypy.accelerate.base.kernels import PoUpdateKernel as BasePoUpdateKernel + +COMPLEX_TYPE = np.complex64 +FLOAT_TYPE = np.float32 +INT_TYPE = np.int32 + +class DlsPoUpdateKernelTest(PyCudaTest): + + datadir = "/dls/science/users/iat69393/gpu-hackathon/test-data-%s/" + rtol = 1e-6 + atol = 1e-6 + + @parameterized.expand([ + ["base", 10, False], + ["regul", 50, False], + ["floating", 0, False], + ["base", 10, True], + ["regul", 50, True], + ["floating", 0, True], + ]) + def test_op_update_ml_UNITY(self, name, iter, atomics): + + # Load data + with h5py.File(self.datadir %name + "op_update_ml_%04d.h5" %iter, "r") as f: + aux = f["aux"][:] + addr = f["addr"][:] + obg = f["obg"][:] + pr = f["pr"][:] + + # Copy data to device + aux_dev = gpuarray.to_gpu(aux) + obg_dev = gpuarray.to_gpu(obg) + pr_dev = gpuarray.to_gpu(pr) + + # If not using atomics we need to change the addresses + if not atomics: + addr2 = np.ascontiguousarray(np.transpose(addr, (2, 3, 0, 1))) + addr_dev = gpuarray.to_gpu(addr2) + else: + addr_dev = gpuarray.to_gpu(addr) + + # CPU Kernel + BPOK = BasePoUpdateKernel() + BPOK.ob_update_ML(addr, obg, pr, aux) + + # GPU Kernel + POK = PoUpdateKernel() + POK.ob_update_ML(addr_dev, obg_dev, pr_dev, aux_dev, atomics=atomics) + + ## Assert + np.testing.assert_allclose(obg_dev.get(), obg, atol=self.atol, rtol=self.rtol, verbose=False, + err_msg="The object array has not been updated as expected") + + @parameterized.expand([ + ["base", 10, False], + ["regul", 50, False], + ["floating", 0, False], + ["base", 10, True], + ["regul", 50, True], + ["floating", 0, True], + ]) + def test_pr_update_ml_UNITY(self, name, iter, atomics): + + # Load data + with h5py.File(self.datadir %name + "pr_update_ml_%04d.h5" %iter, "r") as f: + aux = f["aux"][:] + addr = f["addr"][:] + ob = f["ob"][:] + prg = f["prg"][:] + + # Copy data to device + aux_dev = gpuarray.to_gpu(aux) + 
ob_dev = gpuarray.to_gpu(ob) + prg_dev = gpuarray.to_gpu(prg) + + # If not using atomics we need to change the addresses + if not atomics: + addr2 = np.ascontiguousarray(np.transpose(addr, (2, 3, 0, 1))) + addr_dev = gpuarray.to_gpu(addr2) + else: + addr_dev = gpuarray.to_gpu(addr) + + # CPU Kernel + BPOK = BasePoUpdateKernel() + BPOK.pr_update_ML(addr, prg, ob, aux) + + # GPU Kernel + POK = PoUpdateKernel() + POK.pr_update_ML(addr_dev, prg_dev, ob_dev, aux_dev, atomics=atomics) + + ## Assert + np.testing.assert_allclose(prg, prg_dev.get(), atol=self.atol, rtol=self.rtol, verbose=False, + err_msg="The probe array has not been updated as expected") \ No newline at end of file diff --git a/test/accelerate_tests/cuda_pycuda_tests/dls_tests/dls_propagation_kernel_test.py b/test/accelerate_tests/cuda_pycuda_tests/dls_tests/dls_propagation_kernel_test.py new file mode 100644 index 000000000..ac9fa0402 --- /dev/null +++ b/test/accelerate_tests/cuda_pycuda_tests/dls_tests/dls_propagation_kernel_test.py @@ -0,0 +1,102 @@ +''' +testing on real data +''' + +import h5py +import unittest +import numpy as np +from parameterized import parameterized +from .. import PyCudaTest, have_pycuda + +if have_pycuda(): + from pycuda import gpuarray + from ptypy.accelerate.cuda_pycuda.kernels import PropagationKernel + +import ptypy.utils as u +from ptypy.core import geometry +from ptypy.core import Base as theBase + +# subclass for dictionary access +Base = type('Base',(theBase,),{}) + +COMPLEX_TYPE = np.complex64 +FLOAT_TYPE = np.float32 +INT_TYPE = np.int32 + +class DLsPropagationKernelTest(PyCudaTest): + + datadir = "/dls/science/users/iat69393/gpu-hackathon/test-data-%s/" + rtol = 1e-6 + atol = 1e-6 + + def set_up_farfield(self,shape): + P = Base() + P.CType = COMPLEX_TYPE + P.Ftype = FLOAT_TYPE + g = u.Param() + g.energy = None # u.keV2m(1.0)/6.32e-7 + g.lam = 5.32e-7 + g.distance = 15e-2 + g.psize = 24e-6 + g.shape = shape + g.propagation = "farfield" + G = geometry.Geo(owner=P, pars=g) + return G + + @parameterized.expand([ + ["base", 10], + ["regul", 50], + ["floating", 0], + ]) + def test_forward_UNITY(self, name, iter): + + # Load data + with h5py.File(self.datadir % name + "forward_%04d.h5" %iter, "r") as f: + aux = f["aux"][0] + + # Copy data to device + aux_dev = gpuarray.to_gpu(aux) + + # Geometry + geo = self.set_up_farfield(aux.shape) + + # CPU kernel + aux = geo.propagator.fw(aux) + + # GPU kernel + PropK = PropagationKernel(aux_dev, geo.propagator, queue_thread=self.stream) + PropK.allocate() + PropK.fw(aux_dev, aux_dev) + + ## Assert + np.testing.assert_allclose(aux, aux_dev.get(), atol=self.atol, rtol=self.rtol, + err_msg="Forward propagation was not as expected") + + @parameterized.expand([ + ["base", 10], + ["regul", 50], + ["floating", 0], + ]) + def test_backward_UNITY(self, name, iter): + + # Load data + with h5py.File(self.datadir % name + "backward_%04d.h5" %iter, "r") as f: + aux = f["aux"][0] + + # Copy data to device + aux_dev = gpuarray.to_gpu(aux) + + # Geometry + geo = self.set_up_farfield(aux.shape) + + # CPU kernel + aux = geo.propagator.bw(aux) + + # GPU kernel + PropK = PropagationKernel(aux_dev, geo.propagator, queue_thread=self.stream) + PropK.allocate() + PropK.bw(aux_dev, aux_dev) + + ## Assert + np.testing.assert_allclose(aux, aux_dev.get(), atol=self.atol, rtol=self.rtol, + err_msg="Backward propagation was not as expected") \ No newline at end of file diff --git a/test/accelerate_tests/cuda_pycuda_tests/dls_tests/dls_regularizer_kernel_test.py 
b/test/accelerate_tests/cuda_pycuda_tests/dls_tests/dls_regularizer_kernel_test.py new file mode 100644 index 000000000..972648552 --- /dev/null +++ b/test/accelerate_tests/cuda_pycuda_tests/dls_tests/dls_regularizer_kernel_test.py @@ -0,0 +1,77 @@ +''' +Testing on real data +''' + +import h5py +import unittest +import numpy as np +from parameterized import parameterized +from .. import PyCudaTest, have_pycuda + +if have_pycuda(): + from pycuda import gpuarray + from ptypy.accelerate.cuda_pycuda.engines.ML_pycuda import Regul_del2_pycuda + import pycuda.driver as cuda +from ptypy.engines.ML import Regul_del2 + +COMPLEX_TYPE = np.complex64 +FLOAT_TYPE = np.float32 +INT_TYPE = np.int32 + +class DlsRegularizerTest(PyCudaTest): + + datadir = "/dls/science/users/iat69393/gpu-hackathon/test-data-%s/" + rtol = 1e-6 + atol = 1e-6 + + @parameterized.expand([ + ["regul", 50] + ]) + def test_regularizer_grad_UNITY(self, name, iter): + + # Load data + with h5py.File(self.datadir %name + "regul_grad_%04d.h5" %iter, "r") as f: + ob = f["ob"][:] + + # Copy data to device + ob_dev = gpuarray.to_gpu(ob) + + # CPU Kernel + regul = Regul_del2(0.1) + obr = regul.grad(ob) + + # GPU Kernel + regul_pycuda = Regul_del2_pycuda(0.1, queue=self.stream, allocator=cuda.mem_alloc) + obr_dev = regul_pycuda.grad(ob_dev) + + ## Assert + np.testing.assert_allclose(obr, obr_dev.get(), atol=self.atol, rtol=self.rtol, + err_msg="The object gradient has not been computed as expected") + np.testing.assert_allclose(regul.LL, regul_pycuda.LL, atol=self.atol, rtol=self.rtol, + err_msg="The LL array has not been updated as expected") + + @parameterized.expand([ + ["regul", 50], + ]) + def test_regularizer_poly_line_coeffs_UNITY(self, name, iter): + + # Load data + with h5py.File(self.datadir % name + "regul_poly_line_coeffs_%04d.h5" %iter, "r") as f: + ob = f["ob"][:] + obh = f["obh"][:] + + # Copy data to device + ob_dev = gpuarray.to_gpu(ob) + obh_dev = gpuarray.to_gpu(obh) + + # CPU Kernel + regul = Regul_del2(0.1) + res = regul.poly_line_coeffs(obh, ob) + + # GPU Kernel + regul_pycuda = Regul_del2_pycuda(0.1, queue=self.stream, allocator=cuda.mem_alloc) + res_pycuda = regul_pycuda.poly_line_coeffs(obh_dev, ob_dev) + + ## Assert + np.testing.assert_allclose(res, res_pycuda, atol=self.atol, rtol=self.rtol, + err_msg="The poly line coefficients have not been computed as expected") diff --git a/test/accelerate_tests/cuda_pycuda_tests/fft_accuracy_test.py b/test/accelerate_tests/cuda_pycuda_tests/fft_accuracy_test.py deleted file mode 100644 index ed6929865..000000000 --- a/test/accelerate_tests/cuda_pycuda_tests/fft_accuracy_test.py +++ /dev/null @@ -1,48 +0,0 @@ -''' -''' - -import unittest -import numpy as np -import scipy.fft as fft -from .
import PyCudaTest, have_pycuda - - -if have_pycuda(): - from pycuda import gpuarray - from ptypy.accelerate.cuda_pycuda.fft import FFT as ReiknaFFT - from ptypy.accelerate.cuda_pycuda.cufft import FFT_cuda as cuFFT - -class FftAccurracyTest(PyCudaTest): - - def gen_input(self): - rows = cols = 32 - batches = 1 - f = np.random.randn(batches, rows, cols) + 1j * np.random.randn(batches,rows, cols) - f = np.ascontiguousarray(f.astype(np.complex64)) - return f - - def test_random_cufft_fwd(self): - f = self.gen_input() - cuft = cuFFT(f, self.stream, inplace=True, pre_fft=None, post_fft=None, symmetric=None, forward=True).ft - reikft = ReiknaFFT(f, self.stream, inplace=True, pre_fft=None, post_fft=None, symmetric=False).ft - for i in range(10): - f = self.gen_input() - y = fft.fft2(f) - - x_d = gpuarray.to_gpu(f) - cuft(x_d, x_d) - y_cufft = x_d.get().reshape(y.shape) - - x_d = gpuarray.to_gpu(f) - reikft(x_d, x_d) - y_reikna = x_d.get().reshape(y.shape) - - # cufft_diff = np.max(np.abs(y_cufft - y)) - # reikna_diff = np.max(np.abs(y_reikna-y)) - # cufft_rdiff = np.max(np.abs(y_cufft - y) / np.abs(y)) - # reikna_rdiff = np.max(np.abs(y_reikna - y) / np.abs(y)) - # print('{}: {}\t{}\t{}\t{}'.format(i, cufft_diff, reikna_diff, cufft_rdiff, reikna_rdiff)) - - # Note: check if this tolerance and test case is ok - np.testing.assert_allclose(y, y_cufft, rtol=5e-5, err_msg='cuFFT error at index {}'.format(i)) - np.testing.assert_allclose(y, y_reikna, rtol=5e-5, err_msg='reikna FFT error at index {}'.format(i)) diff --git a/test/accelerate_tests/cuda_pycuda_tests/fft_tests/cufft_init_test.py b/test/accelerate_tests/cuda_pycuda_tests/fft_tests/cufft_init_test.py new file mode 100644 index 000000000..ac28436b4 --- /dev/null +++ b/test/accelerate_tests/cuda_pycuda_tests/fft_tests/cufft_init_test.py @@ -0,0 +1,28 @@ + +import unittest +from test.accelerate_tests.cuda_pycuda_tests import PyCudaTest, have_pycuda + +if have_pycuda(): + from ptypy.filtered_cufft import FilteredFFT + +class CuFFTInitTest(PyCudaTest): + + def test_import_fft(self): + ft = FilteredFFT(2, 32, 32, False, True, 0, 0, 0) + + + def test_import_fft_different_shape(self): + ft = FilteredFFT(2, 128, 128, False, True, 0, 0, 0) + + + @unittest.expectedFailure + def test_import_fft_not_square(self): + ft = FilteredFFT(2, 32, 64, False, True, 0, 0, 0) + + @unittest.expectedFailure + def test_import_fft_not_pow2(self): + ft = FilteredFFT(2, 40, 40, False, True, 0, 0, 0) + + +if __name__=="__main__": + unittest.main() diff --git a/test/accelerate_tests/cuda_pycuda_tests/fft_tests/fft_accuracy_test.py b/test/accelerate_tests/cuda_pycuda_tests/fft_tests/fft_accuracy_test.py index 9c87e34f2..7c30c3221 100644 --- a/test/accelerate_tests/cuda_pycuda_tests/fft_tests/fft_accuracy_test.py +++ b/test/accelerate_tests/cuda_pycuda_tests/fft_tests/fft_accuracy_test.py @@ -44,5 +44,5 @@ def test_random_cufft_fwd(self): # print('{}: {}\t{}\t{}\t{}'.format(i, cufft_diff, reikna_diff, cufft_rdiff, reikna_rdiff)) # Note: check if this tolerance and test case is ok - np.testing.assert_allclose(y, y_cufft, rtol=5e-5, err_msg='cuFFT error at index {}'.format(i)) - np.testing.assert_allclose(y, y_reikna, rtol=5e-5, err_msg='reikna FFT error at index {}'.format(i)) + np.testing.assert_allclose(y, y_cufft, atol=1e-6, rtol=5e-5, err_msg='cuFFT error at index {}'.format(i)) + np.testing.assert_allclose(y, y_reikna, atol=1e-6, rtol=5e-5, err_msg='reikna FFT error at index {}'.format(i)) diff --git 
a/test/accelerate_tests/cuda_pycuda_tests/fft_tests/fft_import_fft_test.py b/test/accelerate_tests/cuda_pycuda_tests/fft_tests/fft_import_fft_test.py deleted file mode 100644 index 7d60ce46a..000000000 --- a/test/accelerate_tests/cuda_pycuda_tests/fft_tests/fft_import_fft_test.py +++ /dev/null @@ -1,27 +0,0 @@ - -import unittest, pytest -from test.accelerate_tests.cuda_pycuda_tests import PyCudaTest, have_pycuda -import os, shutil -from distutils import sysconfig - -if have_pycuda(): - import pycuda.driver as cuda - from pycuda import gpuarray - from ptypy.accelerate.cuda_pycuda import import_fft - from pycuda.tools import make_default_context - -class ImportFFTTest(PyCudaTest): - - def test_import_fft(self): - import_fft.ImportFFT(32, 32) - - - def test_import_fft_different_shape(self): - import_fft.ImportFFT(128, 128) - - def test_import_fft_same_module_again(self): - import_fft.ImportFFT(32, 32) - - -if __name__=="__main__": - unittest.main() diff --git a/test/accelerate_tests/cuda_pycuda_tests/fourier_update_kernel_test.py b/test/accelerate_tests/cuda_pycuda_tests/fourier_update_kernel_test.py index dfea1e19b..3d7cb5fa6 100644 --- a/test/accelerate_tests/cuda_pycuda_tests/fourier_update_kernel_test.py +++ b/test/accelerate_tests/cuda_pycuda_tests/fourier_update_kernel_test.py @@ -109,11 +109,104 @@ def test_fmag_all_update_UNITY(self): nFUK.fmag_all_update(f, addr, fmag, mask, err_fmag, pbound=pbound_set) expected_f = f measured_f = f_d.get() - np.testing.assert_array_equal(expected_f, measured_f, err_msg="Numpy f " + np.testing.assert_allclose(expected_f, measured_f, rtol=1e-6, err_msg="Numpy f " "is \n%s, \nbut gpu f is \n %s, \n mask is:\n %s \n" % (repr(expected_f), repr(measured_f), repr(mask))) + def test_fmag_update_nopbound_UNITY(self): + ''' + setup + ''' + B = 5 # frame size y + C = 5 # frame size x + + D = 2 # number of probe modes + G = 2 # number of object modes + + E = B # probe size y + F = C # probe size x + + scan_pts = 2 # one dimensional scan point number + + N = scan_pts ** 2 + total_number_modes = G * D + A = N * total_number_modes # 16 exit waves in total: a 2x2 grid of scan positions with 4 modes each + + f = np.empty(shape=(A, B, C), dtype=COMPLEX_TYPE) + for idx in range(A): + f[idx] = np.ones((B, C)) * (idx + 1) + 1j * np.ones((B, C)) * (idx + 1) + + fmag = np.empty(shape=(N, B, C), dtype=FLOAT_TYPE) # the measured magnitudes, (N, B, C) + fmag_fill = np.arange(np.prod(fmag.shape)).reshape(fmag.shape).astype(fmag.dtype) + fmag[:] = fmag_fill + + mask = np.empty(shape=(N, B, C), dtype=FLOAT_TYPE) # the masks for the measured magnitudes, either (1, B, C) or (N, B, C) + mask_fill = np.ones_like(mask) + mask_fill[::2, ::2] = 0 # checkerboard for testing + mask[:] = mask_fill + + X, Y = np.meshgrid(range(scan_pts), range(scan_pts)) + X = X.reshape((N,)) + Y = Y.reshape((N,)) + + addr = np.zeros((N, total_number_modes, 5, 3), dtype=INT_TYPE) + + exit_idx = 0 + position_idx = 0 + for xpos, ypos in zip(X, Y): + mode_idx = 0 + for pr_mode in range(D): + for ob_mode in range(G): + addr[position_idx, mode_idx] = np.array([[pr_mode, 0, 0], + [ob_mode, ypos, xpos], + [exit_idx, 0, 0], + [position_idx, 0, 0], + [position_idx, 0, 0]]) + mode_idx += 1 + exit_idx += 1 + position_idx += 1 + + # print("address book is:") + # print(repr(addr)) + + ''' + test + ''' + mask_sum = mask.sum(-1).sum(-1) + + err_fmag = np.zeros(N, dtype=FLOAT_TYPE) + from ptypy.accelerate.base.kernels import FourierUpdateKernel as npFourierUpdateKernel + nFUK = npFourierUpdateKernel(f, nmodes=total_number_modes) + FUK =
FourierUpdateKernel(f, nmodes=total_number_modes) + + nFUK.allocate() + FUK.allocate() + + nFUK.fourier_error(f, addr, fmag, mask, mask_sum) + nFUK.error_reduce(addr, err_fmag) + # print(np.sqrt(pbound_set/err_fmag)) + f_d = gpuarray.to_gpu(f) + fmag_d = gpuarray.to_gpu(fmag) + mask_d = gpuarray.to_gpu(mask) + addr_d = gpuarray.to_gpu(addr) + + # now set the state for both. + + FUK.gpu.fdev = gpuarray.to_gpu(nFUK.npy.fdev) + FUK.gpu.ferr = gpuarray.to_gpu(nFUK.npy.ferr) + + FUK.fmag_update_nopbound(f_d, addr_d, fmag_d, mask_d) + nFUK.fmag_update_nopbound(f, addr, fmag, mask) + + expected_f = f + measured_f = f_d.get() + np.testing.assert_allclose(measured_f, expected_f, rtol=1e-6, err_msg="Numpy f " + "is \n%s, \nbut gpu f is \n %s, \n mask is:\n %s \n" % (repr(expected_f), + repr(measured_f), + repr(mask))) + + def test_fourier_error_UNITY(self): ''' setup @@ -191,7 +284,7 @@ def test_fourier_error_UNITY(self): expected_fdev = nFUK.npy.fdev measured_fdev = FUK.gpu.fdev.get() - np.testing.assert_array_equal(expected_fdev, measured_fdev, err_msg="Numpy fdev " + np.testing.assert_allclose(expected_fdev, measured_fdev, rtol=1e-6, err_msg="Numpy fdev " "is \n%s, \nbut gpu fdev is \n %s, \n " % ( repr(expected_fdev), repr(measured_fdev))) @@ -203,6 +296,87 @@ def test_fourier_error_UNITY(self): "is \n%s, \nbut gpu ferr is \n %s, \n " % ( repr(expected_ferr), repr(measured_ferr))) + def test_fourier_deviation_UNITY(self): + ''' + setup - using the fourier_error as reference, so we need mask, etc. + ''' + B = 5 # frame size y + C = 5 # frame size x + + D = 2 # number of probe modes + G = 2 # number of object modes + + E = B # probe size y + F = C # probe size x + + scan_pts = 2 # one dimensional scan point number + + N = scan_pts ** 2 + total_number_modes = G * D + A = N * total_number_modes # 16 exit waves in total: a 2x2 grid of scan positions with 4 modes each + + f = np.empty(shape=(A, B, C), dtype=COMPLEX_TYPE) + for idx in range(A): + f[idx] = np.ones((B, C)) * (idx + 1) + 1j * np.ones((B, C)) * (idx + 1) + + fmag = np.empty(shape=(N, B, C), dtype=FLOAT_TYPE) # the measured magnitudes, (N, B, C) + fmag_fill = np.arange(np.prod(fmag.shape)).reshape(fmag.shape).astype(fmag.dtype) + fmag[:] = fmag_fill + + mask = np.empty(shape=(N, B, C), + dtype=FLOAT_TYPE) # the masks for the measured magnitudes, either (1, B, C) or (N, B, C) + mask_fill = np.ones_like(mask) + mask_fill[::2, ::2] = 0 # checkerboard for testing + mask[:] = mask_fill + + X, Y = np.meshgrid(range(scan_pts), range(scan_pts)) + X = X.reshape((N,)) + Y = Y.reshape((N,)) + + addr = np.zeros((N, total_number_modes, 5, 3), dtype=INT_TYPE) + + exit_idx = 0 + position_idx = 0 + for xpos, ypos in zip(X, Y): + mode_idx = 0 + for pr_mode in range(D): + for ob_mode in range(G): + addr[position_idx, mode_idx] = np.array([[pr_mode, 0, 0], + [ob_mode, ypos, xpos], + [exit_idx, 0, 0], + [position_idx, 0, 0], + [position_idx, 0, 0]]) + mode_idx += 1 + exit_idx += 1 + position_idx += 1 + + ''' + test + ''' + mask_sum = mask.sum(-1).sum(-1) + + from ptypy.accelerate.base.kernels import FourierUpdateKernel as npFourierUpdateKernel + f_d = gpuarray.to_gpu(f) + fmag_d = gpuarray.to_gpu(fmag) + addr_d = gpuarray.to_gpu(addr) + + nFUK = npFourierUpdateKernel(f, nmodes=total_number_modes) + FUK = FourierUpdateKernel(f, nmodes=total_number_modes) + + nFUK.allocate() + FUK.allocate() + + nFUK.fourier_deviation(f, addr, fmag) + FUK.fourier_deviation(f_d, addr_d, fmag_d) + + expected_fdev = nFUK.npy.fdev + measured_fdev = FUK.gpu.fdev.get() + 
np.testing.assert_allclose(measured_fdev, expected_fdev, rtol=1e-6, err_msg="Numpy fdev " + "is \n%s, \nbut gpu fdev is \n %s, \n " % ( + repr(expected_fdev), + repr(measured_fdev))) + + def test_error_reduce_UNITY(self): ''' @@ -348,7 +522,7 @@ def test_error_reduce(self): "is not behaving as expected.") - def test_log_likelihood_UNITY(self): + def log_likelihood_UNITY_tester(self, use_version2=False): ''' setup ''' @@ -420,7 +594,10 @@ def test_log_likelihood_UNITY(self): FUK = FourierUpdateKernel(f, nmodes=total_number_modes) FUK.allocate() - FUK.log_likelihood(f_d, addr_d, fmag_d, mask_d, LLerr_d) + if use_version2: + FUK.log_likelihood2(f_d, addr_d, fmag_d, mask_d, LLerr_d) + else: + FUK.log_likelihood(f_d, addr_d, fmag_d, mask_d, LLerr_d) expected_err_phot = LLerr measured_err_phot = LLerr_d.get() @@ -429,6 +606,11 @@ def test_log_likelihood_UNITY(self): "is \n%s, \nbut gpu log-likelihood error is \n%s, \n " % ( repr(expected_err_phot), repr(measured_err_phot)), rtol=1e-5) + def test_log_likelihood_UNITY(self): + self.log_likelihood_UNITY_tester(False) + + def test_log_likelihood2_UNITY(self): + self.log_likelihood_UNITY_tester(True) def test_exit_error_UNITY(self): ''' diff --git a/test/accelerate_tests/cuda_pycuda_tests/multi_gpu_test.py b/test/accelerate_tests/cuda_pycuda_tests/multi_gpu_test.py new file mode 100644 index 000000000..64cc5110d --- /dev/null +++ b/test/accelerate_tests/cuda_pycuda_tests/multi_gpu_test.py @@ -0,0 +1,84 @@ +''' +''' + +import unittest +from mpi4py.MPI import Get_version +import numpy as np +from . import PyCudaTest, have_pycuda + +if have_pycuda(): + from pycuda import gpuarray + import pycuda.driver as cuda + from ptypy.accelerate.cuda_pycuda import multi_gpu as mgpu + from ptypy.utils import parallel + +from pkg_resources import parse_version + +class GpuDataTest(PyCudaTest): + """ + This is a test class for MPI - to really check if it all works, it needs + to be run as: + + mpirun -np 2 pytest multi_gpu_test.py + + For CUDA-aware MPI testing, currently the environment variable + + OMPI_MCA_opal_cuda_support=true + + needs to be set, mpi4py version 3.1.0+ used, a pycuda build from master, + and a cuda-aware MPI version. 
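+ + Putting these together, a typical invocation (illustrative; adjust -np + to the number of locally available GPUs) would be: + + OMPI_MCA_opal_cuda_support=true mpirun -np 2 pytest multi_gpu_test.py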
+ """ + + def setUp(self): + if parallel.rank_local < cuda.Device.count(): + self.device = cuda.Device(parallel.rank_local) + self.ctx = self.device.make_context() + self.ctx.push() + else: + self.ctx = None + + def tearDown(self): + if self.ctx is not None: + self.ctx.pop() + self.ctx.detach() + + @unittest.skipIf(parallel.rank != 0, "Only in MPI rank 0") + def test_version(self): + v1 = parse_version("3.1.0") + v2 = parse_version(parse_version("3.1.0a").base_version) + + self.assertGreaterEqual(v2, v1) + + def test_compute_mode(self): + attr = cuda.Context.get_device().get_attributes() + self.assertIn(cuda.device_attribute.COMPUTE_MODE, attr) + mode = attr[cuda.device_attribute.COMPUTE_MODE] + self.assertIn(mode, + [cuda.compute_mode.DEFAULT, cuda.compute_mode.PROHIBITED, cuda.compute_mode.EXCLUSIVE_PROCESS] + ) + + def multigpu_tester(self, com): + if self.ctx is None: + return + + data = np.ones((2, 1), dtype=np.float32) + data_dev = gpuarray.to_gpu(data) + sz = parallel.size + com.allReduceSum(data_dev) + + out = data_dev.get() + np.testing.assert_allclose(out, sz * data, rtol=1e-6) + + def test_multigpu_auto(self): + self.multigpu_tester(mgpu.get_multi_gpu_communicator()) + + def test_multigpu_mpi(self): + self.multigpu_tester(mgpu.MultiGpuCommunicatorMpi()) + + @unittest.skipIf(not mgpu.have_cuda_mpi, "Cuda-aware MPI not available") + def test_multigpu_cudampi(self): + self.multigpu_tester(mgpu.MultiGpuCommunicatorCudaMpi()) + + @unittest.skipIf(not mgpu.have_nccl, "NCCL not available") + def test_multigpu_nccl(self): + self.multigpu_tester(mgpu.MultiGpuCommunicatorNccl()) \ No newline at end of file diff --git a/test/accelerate_tests/cuda_pycuda_tests/po_update_kernel_test.py b/test/accelerate_tests/cuda_pycuda_tests/po_update_kernel_test.py index 81674d610..d626c0ca2 100644 --- a/test/accelerate_tests/cuda_pycuda_tests/po_update_kernel_test.py +++ b/test/accelerate_tests/cuda_pycuda_tests/po_update_kernel_test.py @@ -6,6 +6,8 @@ import unittest import numpy as np from . 
+
+        out = data_dev.get()
+        np.testing.assert_allclose(out, sz * data, rtol=1e-6)
+
+    def test_multigpu_auto(self):
+        self.multigpu_tester(mgpu.get_multi_gpu_communicator())
+
+    def test_multigpu_mpi(self):
+        self.multigpu_tester(mgpu.MultiGpuCommunicatorMpi())
+
+    @unittest.skipIf(not mgpu.have_cuda_mpi, "Cuda-aware MPI not available")
+    def test_multigpu_cudampi(self):
+        self.multigpu_tester(mgpu.MultiGpuCommunicatorCudaMpi())
+
+    @unittest.skipIf(not mgpu.have_nccl, "NCCL not available")
+    def test_multigpu_nccl(self):
+        self.multigpu_tester(mgpu.MultiGpuCommunicatorNccl())
\ No newline at end of file
diff --git a/test/accelerate_tests/cuda_pycuda_tests/po_update_kernel_test.py b/test/accelerate_tests/cuda_pycuda_tests/po_update_kernel_test.py
index 81674d610..d626c0ca2 100644
--- a/test/accelerate_tests/cuda_pycuda_tests/po_update_kernel_test.py
+++ b/test/accelerate_tests/cuda_pycuda_tests/po_update_kernel_test.py
@@ -6,6 +6,8 @@
 import unittest
 import numpy as np
 from . import PyCudaTest, have_pycuda
+from ptypy.accelerate.base.array_utils import max_abs2
+from parameterized import parameterized
 
 if have_pycuda():
     from pycuda import gpuarray
@@ -18,7 +20,7 @@ class PoUpdateKernelTest(PyCudaTest):
 
-    def prepare_arrays(self):
+    def prepare_arrays(self, scan_points=None):
         B = 5  # frame size y
         C = 5  # frame size x
@@ -31,7 +33,10 @@ def prepare_arrays(self):
         H = B + npts_greater_than  # object size y
         I = C + npts_greater_than  # object size x
 
-        scan_pts = 2  # one dimensional scan point number
+        if scan_points is None:
+            scan_pts = 2  # one dimensional scan point number
+        else:
+            scan_pts = scan_points
 
         total_number_scan_positions = scan_pts ** 2
         total_number_modes = G * D
@@ -72,11 +77,11 @@ def prepare_arrays(self):
         object_array_denominator = np.empty_like(object_array, dtype=FLOAT_TYPE)
         for idx in range(G):
-            object_array_denominator[idx] = np.ones((H, I)) * (5 * idx + 2)  # + 1j * np.ones((H, I)) * (5 * idx + 2)
+            object_array_denominator[idx] = np.ones((H, I)) * (5 * idx + 2)
 
         probe_denominator = np.empty_like(probe, dtype=FLOAT_TYPE)
         for idx in range(D):
-            probe_denominator[idx] = np.ones((E, F)) * (5 * idx + 2)  # + 1j * np.ones((E, F)) * (5 * idx + 2)
+            probe_denominator[idx] = np.ones((E, F)) * (5 * idx + 2)
 
         return (gpuarray.to_gpu(addr),
                 gpuarray.to_gpu(object_array),
@@ -87,17 +92,12 @@
 
     def test_init(self):
         POUK = PoUpdateKernel()
-
-        np.testing.assert_equal(POUK.kernels,
-                                ['pr_update', 'ob_update'],
+        np.testing.assert_equal(POUK.kernels, ['pr_update', 'ob_update'],
                                 err_msg='PoUpdateKernel does not have the correct functions registered.')
 
     def ob_update_REGRESSION_tester(self, atomics=True):
-        '''
-        setup
-        '''
+
         B = 5  # frame size y
         C = 5  # frame size x
@@ -149,14 +149,13 @@ def ob_update_REGRESSION_tester(self, atomics=True):
             mode_idx += 1
             exit_idx += 1
         position_idx += 1
-
         '''
         test
         '''
-        object_array_denominator = np.empty_like(object_array)
+        object_array_denominator = np.empty_like(object_array, dtype=FLOAT_TYPE)
         for idx in range(G):
-            object_array_denominator[idx] = np.ones((H, I)) * (5 * idx + 2) + 1j * np.ones((H, I)) * (5 * idx + 2)
+            object_array_denominator[idx] = np.ones((H, I)) * (5 * idx + 2)
 
         POUK = PoUpdateKernel()
@@ -204,22 +203,22 @@ def ob_update_REGRESSION_tester(self, atomics=True):
         np.testing.assert_array_equal(object_array, expected_object_array,
                                       err_msg="The object array has not been updated as expected")
 
-        expected_object_array_denominator = np.array([[[12.+2.j, 22.+2.j, 22.+2.j, 22.+2.j, 22.+2.j, 12.+2.j, 2.+2.j],
-                                                       [22.+2.j, 42.+2.j, 42.+2.j, 42.+2.j, 42.+2.j, 22.+2.j, 2.+2.j],
-                                                       [22.+2.j, 42.+2.j, 42.+2.j, 42.+2.j, 42.+2.j, 22.+2.j, 2.+2.j],
-                                                       [22.+2.j, 42.+2.j, 42.+2.j, 42.+2.j, 42.+2.j, 22.+2.j, 2.+2.j],
-                                                       [22.+2.j, 42.+2.j, 42.+2.j, 42.+2.j, 42.+2.j, 22.+2.j, 2.+2.j],
-                                                       [12.+2.j, 22.+2.j, 22.+2.j, 22.+2.j, 22.+2.j, 12.+2.j, 2.+2.j],
-                                                       [ 2.+2.j, 2.+2.j, 2.+2.j, 2.+2.j, 2.+2.j, 2.+2.j, 2.+2.j]],
+        expected_object_array_denominator = np.array([[[12., 22., 22., 22., 22., 12., 2.],
+                                                       [22., 42., 42., 42., 42., 22., 2.],
+                                                       [22., 42., 42., 42., 42., 22., 2.],
+                                                       [22., 42., 42., 42., 42., 22., 2.],
+                                                       [22., 42., 42., 42., 42., 22., 2.],
+                                                       [12., 22., 22., 22., 22., 12., 2.],
+                                                       [ 2., 2., 2., 2., 2., 2., 2.]],
-                                                      [[17.+7.j, 27.+7.j, 27.+7.j, 27.+7.j, 27.+7.j, 17.+7.j, 7.+7.j],
-                                                       [27.+7.j, 47.+7.j, 47.+7.j, 47.+7.j, 47.+7.j, 27.+7.j, 7.+7.j],
-                                                       [27.+7.j, 47.+7.j, 47.+7.j, 47.+7.j, 47.+7.j, 27.+7.j, 7.+7.j],
-                                                       [27.+7.j, 47.+7.j, 47.+7.j, 47.+7.j, 47.+7.j, 27.+7.j, 7.+7.j],
-                                                       [27.+7.j, 47.+7.j, 47.+7.j, 47.+7.j, 47.+7.j, 27.+7.j, 7.+7.j],
-                                                       [17.+7.j, 27.+7.j, 27.+7.j, 27.+7.j, 27.+7.j, 17.+7.j, 7.+7.j],
-                                                       [ 7.+7.j, 7.+7.j, 7.+7.j, 7.+7.j, 7.+7.j, 7.+7.j, 7.+7.j]]],
-                                                      dtype=COMPLEX_TYPE)
+                                                      [[17., 27., 27., 27., 27., 17., 7.],
+                                                       [27., 47., 47., 47., 47., 27., 7.],
+                                                       [27., 47., 47., 47., 47., 27., 7.],
+                                                       [27., 47., 47., 47., 47., 27., 7.],
+                                                       [27., 47., 47., 47., 47., 27., 7.],
+                                                       [17., 27., 27., 27., 27., 17., 7.],
+                                                       [ 7., 7., 7., 7., 7., 7., 7.]]],
+                                                      dtype=FLOAT_TYPE)
 
         np.testing.assert_array_equal(object_array_denominator_dev.get(), expected_object_array_denominator,
@@ -291,9 +290,9 @@ def ob_update_UNITY_tester(self, atomics=True):
         '''
         test
         '''
-        object_array_denominator = np.empty_like(object_array)
+        object_array_denominator = np.empty_like(object_array, dtype=FLOAT_TYPE)
         for idx in range(G):
-            object_array_denominator[idx] = np.ones((H, I)) * (5 * idx + 2) + 1j * np.ones((H, I)) * (5 * idx + 2)
+            object_array_denominator[idx] = np.ones((H, I)) * (5 * idx + 2)
 
         POUK = PoUpdateKernel()
@@ -394,9 +393,9 @@ def pr_update_REGRESSION_tester(self, atomics=True):
         '''
         test
         '''
-        probe_denominator = np.empty_like(probe)
+        probe_denominator = np.empty_like(probe, dtype=FLOAT_TYPE)
         for idx in range(D):
-            probe_denominator[idx] = np.ones((E, F)) * (5 * idx + 2) + 1j * np.ones((E, F)) * (5 * idx + 2)
+            probe_denominator[idx] = np.ones((E, F)) * (5 * idx + 2)
 
         POUK = PoUpdateKernel()
@@ -438,18 +437,18 @@ def pr_update_REGRESSION_tester(self, atomics=True):
         np.testing.assert_array_equal(probe_dev.get(), expected_probe,
                                       err_msg="The probe has not been updated as expected")
 
-        expected_probe_denominator = np.array([[[138.+2.j, 138.+2.j, 138.+2.j, 138.+2.j, 138.+2.j],
-                                                [138.+2.j, 138.+2.j, 138.+2.j, 138.+2.j, 138.+2.j],
-                                                [138.+2.j, 138.+2.j, 138.+2.j, 138.+2.j, 138.+2.j],
-                                                [138.+2.j, 138.+2.j, 138.+2.j, 138.+2.j, 138.+2.j],
-                                                [138.+2.j, 138.+2.j, 138.+2.j, 138.+2.j, 138.+2.j]],
+        expected_probe_denominator = np.array([[[138., 138., 138., 138., 138.],
+                                                [138., 138., 138., 138., 138.],
+                                                [138., 138., 138., 138., 138.],
+                                                [138., 138., 138., 138., 138.],
+                                                [138., 138., 138., 138., 138.]],
-                                               [[143.+7.j, 143.+7.j, 143.+7.j, 143.+7.j, 143.+7.j],
-                                                [143.+7.j, 143.+7.j, 143.+7.j, 143.+7.j, 143.+7.j],
-                                                [143.+7.j, 143.+7.j, 143.+7.j, 143.+7.j, 143.+7.j],
-                                                [143.+7.j, 143.+7.j, 143.+7.j, 143.+7.j, 143.+7.j],
-                                                [143.+7.j, 143.+7.j, 143.+7.j, 143.+7.j, 143.+7.j]]],
-                                               dtype=COMPLEX_TYPE)
+                                               [[143., 143., 143., 143., 143.],
+                                                [143., 143., 143., 143., 143.],
+                                                [143., 143., 143., 143., 143.],
+                                                [143., 143., 143., 143., 143.],
+                                                [143., 143., 143., 143., 143.]]],
+                                               dtype=FLOAT_TYPE)
 
         np.testing.assert_array_equal(probe_denominator_dev.get(), expected_probe_denominator,
                                       err_msg="The probe denominator has not been updated as expected")
@@ -519,9 +518,9 @@ def pr_update_UNITY_tester(self, atomics=True):
         '''
         test
         '''
-        probe_denominator = np.empty_like(probe)
+        probe_denominator = np.empty_like(probe, dtype=FLOAT_TYPE)
         for idx in range(D):
-            probe_denominator[idx] = np.ones((E, F)) * (5 * idx + 2) + 1j * np.ones((E, F)) * (5 * idx + 2)
+            probe_denominator[idx] = np.ones((E, F)) * (5 * idx + 2)
 
         POUK = PoUpdateKernel()
         from ptypy.accelerate.base.kernels import PoUpdateKernel as npPoUpdateKernel
@@ -650,6 +649,158 @@ def test_ob_update_ML_atomics_REGRESSION(self):
     def test_ob_update_ML_tiled_REGRESSION(self):
         self.ob_update_ML_tester(False)
 
+    def test_ob_update_local_UNITY(self):
+        '''
+        setup
+        '''
+        B = 5  # frame size y
+        C = 5  # frame size x
+
+        D = 2  # number of probe modes
+        E = B  # probe size y
+        F = C  # probe size x
+
+        npts_greater_than = 2  # how many points bigger than the probe the object is.
+        G = 2  # number of object modes
+        H = B + npts_greater_than  # object size y
+        I = C + npts_greater_than  # object size x
+
+        scan_pts = 1  # one dimensional scan point number
+
+        total_number_scan_positions = scan_pts ** 2
+        total_number_modes = G * D
+        A = total_number_scan_positions * total_number_modes  # one scan position times the 4 modes: 4 address entries
+
+
+        probe = np.empty(shape=(D, E, F), dtype=COMPLEX_TYPE)
+        for idx in range(D):
+            probe[idx] = np.ones((E, F)) * (idx + 1) + 1j * np.ones((E, F)) * (idx + 1)
+
+        object_array = np.empty(shape=(G, H, I), dtype=COMPLEX_TYPE)
+        for idx in range(G):
+            object_array[idx] = np.ones((H, I)) * (3 * idx + 1) + 1j * np.ones((H, I)) * (3 * idx + 1)
+
+        exit_wave = np.empty(shape=(A, B, C), dtype=COMPLEX_TYPE)
+        for idx in range(A):
+            exit_wave[idx] = np.ones((B, C)) * (idx + 1) + 1j * np.ones((B, C)) * (idx + 1)
+        auxiliary_wave = exit_wave.copy() * 2
+
+        X, Y = np.meshgrid(range(scan_pts), range(scan_pts))
+        X = X.reshape((total_number_scan_positions))
+        Y = Y.reshape((total_number_scan_positions))
+
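+        # Each 5x3 address entry built below holds one (index, y, x) row per
+        # array the kernel touches: probe, object and exit wave. The last two
+        # rows appear unused by the local update kernels and are left as zeros.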
+        addr = np.zeros((total_number_scan_positions, total_number_modes, 5, 3), dtype=INT_TYPE)
+
+        exit_idx = 0
+        position_idx = 0
+        for xpos, ypos in zip(X, Y):
+            mode_idx = 0
+            for pr_mode in range(D):
+                for ob_mode in range(G):
+                    addr[position_idx, mode_idx] = np.array([[pr_mode, 0, 0],
+                                                             [ob_mode, ypos, xpos],
+                                                             [exit_idx, 0, 0],
+                                                             [0, 0, 0],
+                                                             [0, 0, 0]], dtype=INT_TYPE)
+                    mode_idx += 1
+                    exit_idx += 1
+            position_idx += 1
+
+        '''
+        test
+        '''
+        from ptypy.accelerate.base.kernels import PoUpdateKernel as npPoUpdateKernel
+        nPOUK = npPoUpdateKernel()
+        POUK = PoUpdateKernel(queue_thread=self.stream)
+
+        object_array_dev = gpuarray.to_gpu(object_array)
+        probe_dev = gpuarray.to_gpu(probe)
+        exit_wave_dev = gpuarray.to_gpu(exit_wave)
+        auxiliary_wave_dev = gpuarray.to_gpu(auxiliary_wave)
+        addr_dev = gpuarray.to_gpu(addr)
+
+        POUK.ob_update_local(addr_dev, object_array_dev, probe_dev, exit_wave_dev, auxiliary_wave_dev)
+        nPOUK.ob_update_local(addr, object_array, probe, exit_wave, auxiliary_wave)
+
+        np.testing.assert_allclose(object_array_dev.get(), object_array, rtol=1e-6, atol=1e-6,
+                                   err_msg="The object array has not been updated as expected")
+
+    def test_pr_update_local_UNITY(self):
+        '''
+        setup
+        '''
+        B = 5  # frame size y
+        C = 5  # frame size x
+
+        D = 2  # number of probe modes
+        E = B  # probe size y
+        F = C  # probe size x
+
+        npts_greater_than = 2  # how many points bigger than the probe the object is.
+        G = 2  # number of object modes
+        H = B + npts_greater_than  # object size y
+        I = C + npts_greater_than  # object size x
+
+        scan_pts = 1  # one dimensional scan point number
+
+        total_number_scan_positions = scan_pts ** 2
+        total_number_modes = G * D
+        A = total_number_scan_positions * total_number_modes  # one scan position times the 4 modes: 4 address entries
+
+
+        probe = np.empty(shape=(D, E, F), dtype=COMPLEX_TYPE)
+        for idx in range(D):
+            probe[idx] = np.ones((E, F)) * (idx + 1) + 1j * np.ones((E, F)) * (idx + 1)
+
+        object_array = np.empty(shape=(G, H, I), dtype=COMPLEX_TYPE)
+        for idx in range(G):
+            object_array[idx] = np.ones((H, I)) * (3 * idx + 1) + 1j * np.ones((H, I)) * (3 * idx + 1)
+
+        exit_wave = np.empty(shape=(A, B, C), dtype=COMPLEX_TYPE)
+        for idx in range(A):
+            exit_wave[idx] = np.ones((B, C)) * (idx + 1) + 1j * np.ones((B, C)) * (idx + 1)
+        auxiliary_wave = exit_wave.copy() * 1.5
+
+        X, Y = np.meshgrid(range(scan_pts), range(scan_pts))
+        X = X.reshape((total_number_scan_positions))
+        Y = Y.reshape((total_number_scan_positions))
+
+        addr = np.zeros((total_number_scan_positions, total_number_modes, 5, 3), dtype=INT_TYPE)
+
+        exit_idx = 0
+        position_idx = 0
+        for xpos, ypos in zip(X, Y):
+            mode_idx = 0
+            for pr_mode in range(D):
+                for ob_mode in range(G):
+                    addr[position_idx, mode_idx] = np.array([[pr_mode, 0, 0],
+                                                             [ob_mode, ypos, xpos],
+                                                             [exit_idx, 0, 0],
+                                                             [0, 0, 0],
+                                                             [0, 0, 0]], dtype=INT_TYPE)
+                    mode_idx += 1
+                    exit_idx += 1
+            position_idx += 1
+
+        '''
+        test
+        '''
+        from ptypy.accelerate.base.kernels import PoUpdateKernel as npPoUpdateKernel
+        nPOUK = npPoUpdateKernel()
+        POUK = PoUpdateKernel()
+
+        object_array_dev = gpuarray.to_gpu(object_array)
+        probe_dev = gpuarray.to_gpu(probe)
+        exit_wave_dev = gpuarray.to_gpu(exit_wave)
+        auxiliary_wave_dev = gpuarray.to_gpu(auxiliary_wave)
+        addr_dev = gpuarray.to_gpu(addr)
+
+        POUK.pr_update_local(addr_dev, probe_dev, object_array_dev, exit_wave_dev, auxiliary_wave_dev)
+        nPOUK.pr_update_local(addr, probe, object_array, exit_wave, auxiliary_wave)
+
+        np.testing.assert_allclose(probe_dev.get(), probe, rtol=1e-6, atol=1e-6,
+                                   err_msg="The probe has not been updated as expected")
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/test/accelerate_tests/cuda_pycuda_tests/position_correction_kernel_test.py b/test/accelerate_tests/cuda_pycuda_tests/position_correction_kernel_test.py
index a8deebdc6..7f36f138c 100644
--- a/test/accelerate_tests/cuda_pycuda_tests/position_correction_kernel_test.py
+++ b/test/accelerate_tests/cuda_pycuda_tests/position_correction_kernel_test.py
@@ -6,6 +6,7 @@
 import unittest
 import numpy as np
 from . import PyCudaTest, have_pycuda
+from ptypy import utils as u
 
 if have_pycuda():
     from pycuda import gpuarray
@@ -19,6 +20,17 @@
 
 class PositionCorrectionKernelTest(PyCudaTest):
 
+    def setUp(self):
+        PyCudaTest.setUp(self)
+        self.params = u.Param()
+        self.params.nshifts = 4
+        self.params.method = "Annealing"
+        self.params.amplitude = 2e-9
+        self.params.start = 0
+        self.params.stop = 10
+        self.params.max_shift = 2e-9
+        self.resolution = [1e-9,1e-9]
+
     def update_addr_and_error_state_UNITY_helper(self, size, modes):
         ## Arrange
         addr = np.ones((size, modes, 5, 3), dtype=np.int32)
@@ -33,9 +45,9 @@ def update_addr_and_error_state_UNITY_helper(self, size, modes):
         aux = np.ones((1,1,1), dtype=np.complex64)
 
         ## Act
-        PCK = PositionCorrectionKernel(aux, modes, queue_thread=self.stream)
+        PCK = PositionCorrectionKernel(aux, modes, self.params, self.resolution, queue_thread=self.stream)
         PCK.update_addr_and_error_state(addr_gpu, err_state_gpu, mangled_addr_gpu, err_sum_gpu)
-        abPCK = abPositionCorrectionKernel(aux, modes)
+        abPCK = abPositionCorrectionKernel(aux, modes, self.params, self.resolution)
         abPCK.update_addr_and_error_state(addr, err_state, mangled_addr, err_sum)
 
         ## Assert
diff --git a/test/accelerate_tests/cuda_pycuda_tests/propagation_kernel_test.py b/test/accelerate_tests/cuda_pycuda_tests/propagation_kernel_test.py
index 28f576b9e..794a547fd 100644
--- a/test/accelerate_tests/cuda_pycuda_tests/propagation_kernel_test.py
+++ b/test/accelerate_tests/cuda_pycuda_tests/propagation_kernel_test.py
@@ -23,7 +23,7 @@
 
 class PropagationKernelTest(PyCudaTest):
 
-    def set_up_farfield(self,shape):
+    def set_up_farfield(self,shape, resolution=None):
         P = Base()
         P.CType = COMPLEX_TYPE
         P.Ftype = FLOAT_TYPE
@@ -34,6 +34,8 @@ def set_up_farfield(self,shape):
         g.psize = 24e-6
         g.shape = shape
         g.propagation = "farfield"
+        if resolution is not None:
+            g.resolution = resolution
         G = geometry.Geo(owner=P, pars=g)
         return G
@@ -65,7 +67,8 @@ def test_farfield_propagator_forward_UNITY(self):
         PropK.allocate()
         PropK.fw(aux_d, aux_d)
 
-        np.testing.assert_allclose(aux, aux_d.get(), atol=1e-06, rtol=5e-5, err_msg="Numpy aux is \n%s, \nbut gpu aux is \n %s, \n " % (repr(aux), repr(aux_d.get())))
+        np.testing.assert_allclose(aux_d.get(), aux, atol=1e-06, rtol=5e-5,
+            err_msg="Numpy aux is \n%s, \nbut gpu aux is \n %s, \n " % (repr(aux), repr(aux_d.get())))
 
     def test_farfield_propagator_backward_UNITY(self):
         # setup
@@ -81,7 +84,44 @@ def test_farfield_propagator_backward_UNITY(self):
         PropK.allocate()
         PropK.bw(aux_d, aux_d)
 
-        np.testing.assert_allclose(aux, aux_d.get(), atol=1e-06, rtol=5e-5, err_msg="Numpy aux is \n%s, \nbut gpu aux is \n %s, \n " % (repr(aux), repr(aux_d.get())))
+        np.testing.assert_allclose(aux_d.get(), aux, atol=1e-06, rtol=5e-5,
+            err_msg="Numpy aux is \n%s, \nbut gpu aux is \n %s, \n " % (repr(aux), repr(aux_d.get())))
+
+    def test_farfield_propagator_forward_crop_pad_UNITY(self):
+        # setup
+        SH = (16,16)
+        aux = np.zeros((SH), dtype=COMPLEX_TYPE)
+        aux[5:11,5:11] = 1. + 2j
+        aux_d = gpuarray.to_gpu(aux)
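+        # build a default geometry first just to obtain its resolution, then
+        # request half of it so the propagator has to crop/pad, as the test
+        # name suggests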
+        geo = self.set_up_farfield(SH)
+        geo = self.set_up_farfield(SH, resolution=0.5*geo.resolution)
+
+        # test
+        aux = geo.propagator.fw(aux)
+        PropK = PropagationKernel(aux_d, geo.propagator, queue_thread=self.stream)
+        PropK.allocate()
+        PropK.fw(aux_d, aux_d)
+
+        np.testing.assert_allclose(aux_d.get(), aux, atol=1e-06, rtol=5e-5,
+            err_msg="Numpy aux is \n%s, \nbut gpu aux is \n %s, \n " % (repr(aux), repr(aux_d.get())))
+
+    def test_farfield_propagator_backward_crop_pad_UNITY(self):
+        # setup
+        SH = (16,16)
+        aux = np.zeros((SH), dtype=COMPLEX_TYPE)
+        aux[5:11,5:11] = 1. + 2j
+        aux_d = gpuarray.to_gpu(aux)
+        geo = self.set_up_farfield(SH)
+        geo = self.set_up_farfield(SH, resolution=0.5*geo.resolution)
+
+        # test
+        aux = geo.propagator.bw(aux)
+        PropK = PropagationKernel(aux_d, geo.propagator, queue_thread=self.stream)
+        PropK.allocate()
+        PropK.bw(aux_d, aux_d)
+
+        np.testing.assert_allclose(aux_d.get(), aux, atol=1e-06, rtol=5e-5,
+            err_msg="Numpy aux is \n%s, \nbut gpu aux is \n %s, \n " % (repr(aux), repr(aux_d.get())))
 
     def test_nearfield_propagator_forward_UNITY(self):
         # setup
@@ -97,7 +137,8 @@ def test_nearfield_propagator_forward_UNITY(self):
         PropK.allocate()
         PropK.fw(aux_d, aux_d)
 
-        np.testing.assert_allclose(aux, aux_d.get(), atol=1e-06, rtol=5e-5, err_msg="Numpy aux is \n%s, \nbut gpu aux is \n %s, \n " % (repr(aux), repr(aux_d.get())))
+        np.testing.assert_allclose(aux_d.get(), aux, atol=1e-06, rtol=5e-5,
+            err_msg="Numpy aux is \n%s, \nbut gpu aux is \n %s, \n " % (repr(aux), repr(aux_d.get())))
 
     def test_nearfield_propagator_backward_UNITY(self):
         # setup
@@ -113,4 +154,5 @@ def test_nearfield_propagator_backward_UNITY(self):
         PropK.allocate()
         PropK.bw(aux_d, aux_d)
 
-        np.testing.assert_allclose(aux, aux_d.get(), atol=1e-06, rtol=5e-5, err_msg="Numpy aux is \n%s, \nbut gpu aux is \n %s, \n " % (repr(aux), repr(aux_d.get())))
\ No newline at end of file
+        np.testing.assert_allclose(aux_d.get(), aux, atol=1e-06, rtol=5e-5,
+            err_msg="Numpy aux is \n%s, \nbut gpu aux is \n %s, \n " % (repr(aux), repr(aux_d.get())))
\ No newline at end of file
diff --git a/test/engine_tests/ML_test.py b/test/engine_tests/ML_test.py
index b7ae3525e..fd95b816e 100644
--- a/test/engine_tests/ML_test.py
+++ b/test/engine_tests/ML_test.py
@@ -12,21 +12,6 @@ class MLTest(unittest.TestCase):
 
-    def test_ML_farfield_position_refinement(self):
-        engine_params = u.Param()
-        engine_params.name = 'ML'
-        engine_params.numiter = 5
-        engine_params.probe_update_start = 2
-        engine_params.floating_intensities = False
-        engine_params.intensity_renormalization = 1.0
-        engine_params.reg_del2 =True
-        engine_params.reg_del2_amplitude = 0.01
-        engine_params.smooth_gradient = 0.0
-        engine_params.scale_precond =False
-        engine_params.probe_update_start = 0
-        engine_params.position_refinement = True
-        tu.EngineTestRunner(engine_params)
-
     def test_ML_farfield_floating_intensities(self):
         engine_params = u.Param()
         engine_params.name = 'ML'