From 880523883719049d70afeca2ddcf5623f5bc2cd9 Mon Sep 17 00:00:00 2001 From: Benedikt Daurer Date: Thu, 18 Feb 2021 18:19:06 +0000 Subject: [PATCH 01/56] Added new DM_local engine, currently does nothing --- ptypy/engines/DM_local.py | 288 ++++++++++++++++++++ ptypy/engines/DM_serial.py | 4 - ptypy/engines/__init__.py | 1 + templates/minimal_prep_and_run_DM_local.py | 48 ++++ templates/minimal_prep_and_run_DM_pycuda.py | 4 +- 5 files changed, 340 insertions(+), 5 deletions(-) create mode 100644 ptypy/engines/DM_local.py create mode 100644 templates/minimal_prep_and_run_DM_local.py diff --git a/ptypy/engines/DM_local.py b/ptypy/engines/DM_local.py new file mode 100644 index 000000000..a471dba14 --- /dev/null +++ b/ptypy/engines/DM_local.py @@ -0,0 +1,288 @@ +# -*- coding: utf-8 -*- +""" +Local Difference Map/Alternate Projections reconstruction engine. + +This file is part of the PTYPY package. + + :copyright: Copyright 2014 by the PTYPY team, see AUTHORS. + :license: GPLv2, see LICENSE for details. +""" +import numpy as np +import time + +from .. import utils as u +from ..utils.verbose import logger, log +from ..utils import parallel +from .. import defaults_tree +from . import register, DM_serial +from .base import PositionCorrectionEngine +from ..core.manager import Full, Vanilla, Bragg3dModel, BlockVanilla, BlockFull +from ..accelerate.array_based.kernels import FourierUpdateKernel, AuxiliaryWaveKernel, PoUpdateKernel, PositionCorrectionKernel +from ..accelerate.array_based import address_manglers +from ..accelerate.array_based import array_utils as au + +__all__ = ['DM_local'] + +@register() +class DM_local(PositionCorrectionEngine): + """ + A local version of the Difference Map engine + that can be operated like the ePIE algorithm. + + + Defaults: + + [name] + default = DM_local + type = str + help = + doc = + + [alpha] + default = 1 + type = float + lowlim = 0.0 + help = Difference map tuning parameter, a value of 0 makes it equal to ePIE. + + [probe_inertia] + default = 1e-9 + type = float + lowlim = 0.0 + help = Weight of the current probe estimate in the update + + [object_inertia] + default = 1e-4 + type = float + lowlim = 0.0 + help = Weight of the current object in the update + + [clip_object] + default = None + type = tuple + help = Clip object amplitude into this interval + + [compute_log_likelihood] + default = True + type = bool + help = A switch for computing the log-likelihood error (this can impact the performance of the engine) + + """ + + SUPPORTED_MODELS = [Full, Vanilla, Bragg3dModel, BlockVanilla, BlockFull] + + def __init__(self, ptycho_parent, pars=None): + """ + Local difference map reconstruction engine. + """ + super(DM_local, self).__init__(ptycho_parent, pars) + + # Instance attributes + self.error = None + self.pbound = None + + # Required to get proper normalization of object inertia + # The actual value is computed in engine_prepare + # Another possibility would be to use the maximum value of all probe storages. + self.mean_power = None + + # keep track of timings + self.benchmark = u.Param() + + # Stores all information needed with respect to the diffraction storages. + self.diff_info = {} + self.ob_cfact = {} + self.pr_cfact = {} + self.kernels = {} + + + def engine_initialize(self): + """ + Prepare for reconstruction. + """ + super(DM_local, self).engine_initialize() + + self.error = [] + self._reset_benchmarks() + self._setup_kernels() + + def _reset_benchmarks(self): + self.benchmark.A_Build_aux = 0. + self.benchmark.B_Prop = 0. 
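        # (The single-letter A..F prefixes above and below mark the per-view steps of an
        #  iteration; engine_finalize later reports those divided by calls_fourier, while
        #  probe_update/object_update are reported per call.)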
+ self.benchmark.C_Fourier_update = 0. + self.benchmark.D_iProp = 0. + self.benchmark.E_Build_exit = 0. + self.benchmark.F_LLerror = 0. + self.benchmark.probe_update = 0. + self.benchmark.object_update = 0. + self.benchmark.calls_fourier = 0 + self.benchmark.calls_object = 0 + self.benchmark.calls_probe = 0 + + def _setup_kernels(self): + """ + Setup kernels, one for each scan. Derive scans from ptycho class + """ + # get the scans + for label, scan in self.ptycho.model.scans.items(): + + kern = u.Param() + self.kernels[label] = kern + + # TODO: needs to be adapted for broad bandwidth + geo = scan.geometries[0] + + # Get info to shape buffer arrays + # TODO: make this part of the engine rather than scan + fpc = self.ptycho.frames_per_block + + # TODO : make this more foolproof + try: + nmodes = scan.p.coherence.num_probe_modes * \ + scan.p.coherence.num_object_modes + except: + nmodes = 1 + + # create buffer arrays + ash = (fpc * nmodes,) + tuple(geo.shape) + aux = np.zeros(ash, dtype=np.complex64) + kern.aux = aux + + # setup kernels, one for each SCAN. + kern.FUK = FourierUpdateKernel(aux, nmodes) + kern.FUK.allocate() + + kern.POK = PoUpdateKernel() + kern.POK.allocate() + + kern.AWK = AuxiliaryWaveKernel() + kern.AWK.allocate() + + kern.FW = geo.propagator.fw + kern.BW = geo.propagator.bw + kern.resolution = geo.resolution[0] + + if self.do_position_refinement: + addr_mangler = address_manglers.RandomIntMangle(int(self.p.position_refinement.amplitude // geo.resolution[0]), + self.p.position_refinement.start, + self.p.position_refinement.stop, + max_bound=int(self.p.position_refinement.max_shift // geo.resolution[0]), + randomseed=0) + logger.warning("amplitude is %s " % (self.p.position_refinement.amplitude // geo.resolution[0])) + logger.warning("max bound is %s " % (self.p.position_refinement.max_shift // geo.resolution[0])) + + kern.PCK = PositionCorrectionKernel(aux, nmodes) + kern.PCK.allocate() + kern.PCK.address_mangler = addr_mangler + + def engine_prepare(self): + + """ + Last minute initialization. + + Everything that needs to be recalculated when new data arrives. + """ + if self.ptycho.new_data: + + # recalculate everything + mean_power = 0. + self.pbound_scan = {} + for s in self.di.storages.values(): + if not self.pbound_scan.get(s.label): + self.pbound_scan[s.label] = 0.25 + else: + self.pbound_scan[s.label] = max(pb, self.pbound_scan[s.label]) + mean_power += s.mean_power + self.mean_power = mean_power / len(self.di.storages) + + ## Serialize new data ## + for label, d in self.ptycho.new_data: + prep = u.Param() + prep.label = label + self.diff_info[d.ID] = prep + prep.mag = np.sqrt(np.abs(d.data)) + prep.ma = self.ma.S[d.ID].data.astype(np.float32) + prep.ma_sum = prep.ma.sum(-1).sum(-1) + prep.err_phot = np.zeros_like(prep.ma_sum) + prep.err_fourier = np.zeros_like(prep.ma_sum) + prep.err_exit = np.zeros_like(prep.ma_sum) + + # Unfortunately this needs to be done for all pods, since + # the shape of the probe / object was modified. 
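            # A minimal numeric sketch of the round-up padding applied just below
            # (assumption: the accelerated kernels address arrays in 32-pixel blocks,
            #  which is why object dimensions are grown to the next multiple of 32):
            #   shape  = np.asarray((517, 530))
            #   misfit = shape % 32        # -> [ 5, 18]
            #   pad    = 32 - misfit       # -> [27, 14]
            #   shape + pad                # -> [544, 544]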
+ # TODO: possible scaling issue, remove the need for padding + for label, d in self.di.storages.items(): + prep = self.diff_info[d.ID] + prep.view_IDs, prep.poe_IDs, prep.addr = DM_serial.serialize_array_access(d) + if self.do_position_refinement: + prep.original_addr = np.zeros_like(prep.addr) + prep.original_addr[:] = prep.addr + pID, oID, eID = prep.poe_IDs + + ob = self.ob.S[oID] + misfit = np.asarray(ob.shape[-2:]) % 32 + if (misfit != 0).any(): + pad = 32 - np.asarray(ob.shape[-2:]) % 32 + ob.data = u.crop_pad(ob.data, [[0, pad[0]], [0, pad[1]]], axes=[-2, -1], filltype='project') + ob.shape = ob.data.shape + + # calculate c_facts + cfact = self.p.object_inertia * self.mean_power + self.ob_cfact[oID] = cfact / u.parallel.size + + pr = self.pr.S[pID] + cfact = self.p.probe_inertia * len(pr.views) / pr.data.shape[0] + self.pr_cfact[pID] = cfact / u.parallel.size + + + def engine_iterate(self, num=1): + """ + Compute one iteration. + """ + for it in range(num): + + error_dct = {} + for name, di_view in self.di.views.items(): + if not di_view.active: + continue + error_dct[name] = np.array([0,0,0]) + + time.sleep(0.1) + + self.curiter += 1 + + error = parallel.gather_dict(error_dct) + return error + + + def engine_finalize(self): + """ + try deleting ever helper contianer + """ + if parallel.master and self.benchmark.calls_fourier: + print("----- BENCHMARKS ----") + acc = 0. + for name in sorted(self.benchmark.keys()): + t = self.benchmark[name] + if name[0] in 'ABCDEFGHI': + print('%20s : %1.3f ms per iteration' % (name, t / self.benchmark.calls_fourier * 1000)) + acc += t + elif str(name) == 'probe_update': + print('%20s : %1.3f ms per call. %d calls' % ( + name, t / self.benchmark.calls_probe * 1000, self.benchmark.calls_probe)) + elif str(name) == 'object_update': + print('%20s : %1.3f ms per call. %d calls' % ( + name, t / self.benchmark.calls_object * 1000, self.benchmark.calls_object)) + + print('%20s : %1.3f ms per iteration. %d calls' % ( + 'Fourier_total', acc / self.benchmark.calls_fourier * 1000, self.benchmark.calls_fourier)) + + self._reset_benchmarks() + + if self.do_position_refinement: + for label, d in self.di.storages.items(): + prep = self.diff_info[d.ID] + res = self.kernels[prep.label].resolution + for i,view in enumerate(d.views): + for j,(pname, pod) in enumerate(view.pods.items()): + delta = (prep.original_addr[i][j][1][1:] - prep.addr[i][j][1][1:]) * res + pod.ob_view.coord += delta + pod.ob_view.storage.update_views(pod.ob_view) \ No newline at end of file diff --git a/ptypy/engines/DM_serial.py b/ptypy/engines/DM_serial.py index a4247d096..f1f679394 100644 --- a/ptypy/engines/DM_serial.py +++ b/ptypy/engines/DM_serial.py @@ -7,10 +7,6 @@ :copyright: Copyright 2014 by the PTYPY team, see AUTHORS. :license: GPLv2, see LICENSE for details. """ - -# from .. import core -from __future__ import division - import numpy as np import time diff --git a/ptypy/engines/__init__.py b/ptypy/engines/__init__.py index a2716afd0..7e286330d 100644 --- a/ptypy/engines/__init__.py +++ b/ptypy/engines/__init__.py @@ -53,6 +53,7 @@ def by_name(name): from . import DM_serial from . import ML_serial from . import DM_serial_stream +from . import DM_local try: from . import DM_pycuda from . 
import DM_pycuda_streams diff --git a/templates/minimal_prep_and_run_DM_local.py b/templates/minimal_prep_and_run_DM_local.py new file mode 100644 index 000000000..c369a67bf --- /dev/null +++ b/templates/minimal_prep_and_run_DM_local.py @@ -0,0 +1,48 @@ +""" +This script is a test for ptychographic reconstruction in the absence +of actual data. It uses the test Scan class +`ptypy.core.data.MoonFlowerScan` to provide "data". +""" + +from ptypy.core import Ptycho +from ptypy import utils as u +p = u.Param() + +# for verbose output +p.verbose_level = 3 + +# set home path +p.io = u.Param() +p.io.home = "/tmp/ptypy/" +p.io.autosave = u.Param(active=False) +p.io.interaction = u.Param(active=True) +p.io.interaction.client = u.Param() +p.io.interaction.client.poll_timeout = 1 + +# max 200 frames (128x128px) of diffraction data +p.scans = u.Param() +p.scans.MF = u.Param() +# now you have to specify which ScanModel to use with scans.XX.name, +# just as you have to give 'name' for engines and PtyScan subclasses. +p.scans.MF.name = 'Vanilla' # or 'Full' +p.scans.MF.data= u.Param() +p.scans.MF.data.name = 'MoonFlowerScan' +p.scans.MF.data.shape = 128 +p.scans.MF.data.num_frames = 200 +p.scans.MF.data.save = None + +# position distance in fraction of illumination frame +p.scans.MF.data.density = 0.2 +# total number of photon in empty beam +p.scans.MF.data.photons = 1e8 +# Gaussian FWHM of possible detector blurring +p.scans.MF.data.psf = 0. + +# attach a reconstrucion engine +p.engines = u.Param() +p.engines.engine00 = u.Param() +p.engines.engine00.name = 'DM_local' +p.engines.engine00.numiter = 80 + +# prepare and run +P = Ptycho(p,level=5) diff --git a/templates/minimal_prep_and_run_DM_pycuda.py b/templates/minimal_prep_and_run_DM_pycuda.py index 6c07c90b3..a16ccc686 100644 --- a/templates/minimal_prep_and_run_DM_pycuda.py +++ b/templates/minimal_prep_and_run_DM_pycuda.py @@ -15,7 +15,9 @@ p.io = u.Param() p.io.home = "~/dumps/ptypy/" p.io.autosave = u.Param(active=True) -p.io.autoplot = u.Param(active=False) +p.io.autoplot = u.Param(active=True) +p.io.interaction = u.Param(active=True) +p.io.interaction.client = u.Param(poll_timeout=1) # max 200 frames (128x128px) of diffraction data p.scans = u.Param() p.scans.MF = u.Param() From 49cf3eb88b6562c00c0a0c556661b1bc2da728e5 Mon Sep 17 00:00:00 2001 From: Benedikt Daurer Date: Fri, 19 Feb 2021 11:06:41 +0000 Subject: [PATCH 02/56] started working on iterator --- ptypy/engines/DM_local.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/ptypy/engines/DM_local.py b/ptypy/engines/DM_local.py index a471dba14..80845f6aa 100644 --- a/ptypy/engines/DM_local.py +++ b/ptypy/engines/DM_local.py @@ -240,6 +240,15 @@ def engine_iterate(self, num=1): for it in range(num): error_dct = {} + + for dID in self.di.S.keys(): + + # find probe, object and exit ID in dependence of dID + prep = self.diff_info[dID] + pID, oID, eID = prep.poe_IDs + + print(prep.addr.shape) + for name, di_view in self.di.views.items(): if not di_view.active: continue From cd992314b2c1b21bdf754af35b3c31b91cb97e1b Mon Sep 17 00:00:00 2001 From: Benedikt Daurer Date: Fri, 19 Feb 2021 15:36:40 +0000 Subject: [PATCH 03/56] DM_local works with alpha=0 --- ptypy/accelerate/array_based/kernels.py | 50 ++++++++- ptypy/engines/DM_local.py | 112 ++++++++++++++++++--- templates/minimal_prep_and_run_DM_local.py | 6 +- 3 files changed, 152 insertions(+), 16 deletions(-) diff --git a/ptypy/accelerate/array_based/kernels.py b/ptypy/accelerate/array_based/kernels.py index fa66ea2f5..9fa671869 100644 --- 
a/ptypy/accelerate/array_based/kernels.py +++ b/ptypy/accelerate/array_based/kernels.py @@ -359,7 +359,6 @@ def build_aux(self, b_aux, addr, ob, pr, ex, alpha=1.0): aux = b_aux[:maxz * nmodes] flat_addr = addr.reshape(maxz * nmodes, sh[2], sh[3]) rows, cols = ex.shape[-2:] - for ind, (prc, obc, exc, mac, dic) in enumerate(flat_addr): tmp = ob[obc[0], obc[1]:obc[1] + rows, obc[2]:obc[2] + cols] * \ pr[prc[0], :, :] * \ @@ -393,6 +392,30 @@ def build_exit(self, b_aux, addr, ob, pr, ex): aux[ind, :, :] = dex return + def build_exit_alpha(self, b_aux, addr, ob, pr, ex, alpha=1): + sh = addr.shape + + nmodes = sh[1] + + # stopper + maxz = sh[0] + + # batch buffers + aux = b_aux[:maxz * nmodes] + + flat_addr = addr.reshape(maxz * nmodes, sh[2], sh[3]) + rows, cols = ex.shape[-2:] + + for ind, (prc, obc, exc, mac, dic) in enumerate(flat_addr): + dex = aux[ind, :, :] - alpha * \ + ob[obc[0], obc[1]:obc[1] + rows, obc[2]:obc[2] + cols] * \ + pr[prc[0], prc[1]:prc[1] + rows, prc[2]:prc[2] + cols] + \ + (alpha - 1) * ex[exc[0], exc[1]:exc[1] + rows, exc[2]:exc[2] + cols] + + ex[exc[0], exc[1]:exc[1] + rows, exc[2]:exc[2] + cols] += dex + aux[ind, :, :] = dex + return + def build_aux_no_ex(self, b_aux, addr, ob, pr, fac=1.0, add=False): sh = addr.shape @@ -479,6 +502,31 @@ def pr_update_ML(self, addr, pr, ob, ex, fac=2.0): ex[exc[0], exc[1]:exc[1] + rows, exc[2]:exc[2] + cols] * fac return + def ob_update_local(self, addr, ob, pr, ex, aux): + + sh = addr.shape + flat_addr = addr.reshape(sh[0] * sh[1], sh[2], sh[3]) + rows, cols = ex.shape[-2:] + for ind, (prc, obc, exc, mac, dic) in enumerate(flat_addr): + aux[ind,:,:] = pr[prc[0], prc[1]:prc[1] + rows, prc[2]:prc[2] + cols] * \ + ob[obc[0], obc[1]:obc[1] + rows, obc[2]:obc[2] + cols] + ob[obc[0], obc[1]:obc[1] + rows, obc[2]:obc[2] + cols] += \ + pr[prc[0], prc[1]:prc[1] + rows, prc[2]:prc[2] + cols].conj() * \ + (ex[exc[0], exc[1]:exc[1] + rows, exc[2]:exc[2] + cols] - aux[ind,:,:]) / \ + np.max(np.abs(pr[prc[0], prc[1]:prc[1] + rows, prc[2]:prc[2] + cols])**2) + return + + def pr_update_local(self, addr, pr, ob, ex, aux): + sh = addr.shape + flat_addr = addr.reshape(sh[0] * sh[1], sh[2], sh[3]) + rows, cols = ex.shape[-2:] + for ind, (prc, obc, exc, mac, dic) in enumerate(flat_addr): + pr[prc[0], prc[1]:prc[1] + rows, prc[2]:prc[2] + cols] += \ + ob[obc[0], obc[1]:obc[1] + rows, obc[2]:obc[2] + cols].conj() * \ + (ex[exc[0], exc[1]:exc[1] + rows, exc[2]:exc[2] + cols] - aux[ind,:,:]) / \ + np.max(np.abs(ob[obc[0]])**2) + return + class PositionCorrectionKernel(BaseKernel): def __init__(self, aux, nmodes): super(PositionCorrectionKernel, self).__init__() diff --git a/ptypy/engines/DM_local.py b/ptypy/engines/DM_local.py index 80845f6aa..834e2129f 100644 --- a/ptypy/engines/DM_local.py +++ b/ptypy/engines/DM_local.py @@ -143,7 +143,7 @@ def _setup_kernels(self): nmodes = 1 # create buffer arrays - ash = (fpc * nmodes,) + tuple(geo.shape) + ash = (1 * nmodes,) + tuple(geo.shape) aux = np.zeros(ash, dtype=np.complex64) kern.aux = aux @@ -224,13 +224,16 @@ def engine_prepare(self): ob.data = u.crop_pad(ob.data, [[0, pad[0]], [0, pad[1]]], axes=[-2, -1], filltype='project') ob.shape = ob.data.shape + # Keep a list of view indices + prep.vieworder = np.arange(prep.addr.shape[0]) + # calculate c_facts - cfact = self.p.object_inertia * self.mean_power - self.ob_cfact[oID] = cfact / u.parallel.size + #cfact = self.p.object_inertia * self.mean_power + #self.ob_cfact[oID] = cfact / u.parallel.size - pr = self.pr.S[pID] - cfact = self.p.probe_inertia * 
len(pr.views) / pr.data.shape[0] - self.pr_cfact[pID] = cfact / u.parallel.size + #pr = self.pr.S[pID] + #cfact = self.p.probe_inertia * len(pr.views) / pr.data.shape[0] + #self.pr_cfact[pID] = cfact / u.parallel.size def engine_iterate(self, num=1): @@ -247,14 +250,95 @@ def engine_iterate(self, num=1): prep = self.diff_info[dID] pID, oID, eID = prep.poe_IDs - print(prep.addr.shape) - - for name, di_view in self.di.views.items(): - if not di_view.active: - continue - error_dct[name] = np.array([0,0,0]) - - time.sleep(0.1) + # references for kernels + kern = self.kernels[prep.label] + FUK = kern.FUK + AWK = kern.AWK + POK = kern.POK + FW = kern.FW + BW = kern.BW + + # global buffers + pbound = 0 #self.pbound_scan[prep.label] + aux = kern.aux + vieworder = prep.vieworder + + # references for ob, pr, ex + ob = self.ob.S[oID].data + pr = self.pr.S[pID].data + ex = self.ex.S[eID].data + + # randomly shuffle view order + np.random.shuffle(vieworder) + + # Iterate through views + for i in prep.vieworder: + + # Get local adress and arrays + addr = prep.addr[i,None] + mag = prep.mag[i,None] + ma = prep.ma[i,None] + ma_sum = prep.ma_sum[i,None] + + err_phot = prep.err_phot[i,None] + err_fourier = prep.err_fourier[i,None] + err_exit = prep.err_exit[i,None] + + ## compute log-likelihood + t1 = time.time() + AWK.build_aux_no_ex(aux, addr, ob, pr) + aux[:] = FW(aux) + FUK.log_likelihood(aux, addr, mag, ma, err_phot) + self.benchmark.F_LLerror += time.time() - t1 + + ## build auxilliary wave + t1 = time.time() + AWK.build_aux(aux, addr, ob, pr, ex, alpha=self.p.alpha) + self.benchmark.A_Build_aux += time.time() - t1 + + ## forward FFT + t1 = time.time() + aux[:] = FW(aux) + self.benchmark.B_Prop += time.time() - t1 + + ## Deviation from measured data + t1 = time.time() + FUK.fourier_error(aux, addr, mag, ma, ma_sum) + FUK.error_reduce(addr, err_fourier) + FUK.fmag_all_update(aux, addr, mag, ma, err_fourier, pbound) + self.benchmark.C_Fourier_update += time.time() - t1 + + ## backward FFT + t1 = time.time() + aux[:] = BW(aux) + self.benchmark.D_iProp += time.time() - t1 + + ## build exit wave + t1 = time.time() + AWK.build_exit_alpha(aux, addr, ob, pr, ex, alpha=self.p.alpha) + FUK.exit_error(aux,addr) + FUK.error_reduce(addr, err_exit) + self.benchmark.E_Build_exit += time.time() - t1 + self.benchmark.calls_fourier += 1 + + ## probe/object rescale + pr *= np.sqrt(self.mean_power / (np.abs(pr)**2).mean()) + + # object update + t1 = time.time() + POK.ob_update_local(addr, ob, pr, ex, aux) + self.benchmark.object_update += time.time() - t1 + self.benchmark.calls_object += 1 + + # probe update + t1 = time.time() + POK.pr_update_local(addr, pr, ob, ex, aux) + self.benchmark.probe_update += time.time() - t1 + self.benchmark.calls_probe += 1 + + # update errors + errs = np.ascontiguousarray(np.vstack([prep.err_fourier, prep.err_phot, prep.err_exit]).T) + error_dct.update(zip(prep.view_IDs, errs)) self.curiter += 1 diff --git a/templates/minimal_prep_and_run_DM_local.py b/templates/minimal_prep_and_run_DM_local.py index c369a67bf..790179012 100644 --- a/templates/minimal_prep_and_run_DM_local.py +++ b/templates/minimal_prep_and_run_DM_local.py @@ -11,6 +11,9 @@ # for verbose output p.verbose_level = 3 +# Frames per block +p.frames_per_block = 200 + # set home path p.io = u.Param() p.io.home = "/tmp/ptypy/" @@ -42,7 +45,8 @@ p.engines = u.Param() p.engines.engine00 = u.Param() p.engines.engine00.name = 'DM_local' -p.engines.engine00.numiter = 80 +p.engines.engine00.numiter = 100 +p.engines.engine00.alpha = 0 # 
behaves like ePIE # prepare and run P = Ptycho(p,level=5) From d84f4ff0e97ad19156f6e8e71281eb238ee8c24e Mon Sep 17 00:00:00 2001 From: Benedikt Daurer Date: Fri, 19 Feb 2021 18:32:49 +0000 Subject: [PATCH 04/56] DM_local work in progress --- ptypy/accelerate/array_based/kernels.py | 9 +++--- ptypy/engines/DM_local.py | 32 ++++++++++++++++------ templates/minimal_prep_and_run_DM_local.py | 11 ++++++-- 3 files changed, 36 insertions(+), 16 deletions(-) diff --git a/ptypy/accelerate/array_based/kernels.py b/ptypy/accelerate/array_based/kernels.py index 9fa671869..5d08aa9cc 100644 --- a/ptypy/accelerate/array_based/kernels.py +++ b/ptypy/accelerate/array_based/kernels.py @@ -392,7 +392,7 @@ def build_exit(self, b_aux, addr, ob, pr, ex): aux[ind, :, :] = dex return - def build_exit_alpha(self, b_aux, addr, ob, pr, ex, alpha=1): + def build_exit_alpha_tau(self, b_aux, addr, ob, pr, ex, alpha=1, tau=1): sh = addr.shape nmodes = sh[1] @@ -407,10 +407,11 @@ def build_exit_alpha(self, b_aux, addr, ob, pr, ex, alpha=1): rows, cols = ex.shape[-2:] for ind, (prc, obc, exc, mac, dic) in enumerate(flat_addr): - dex = aux[ind, :, :] - alpha * \ + dex = tau * aux[ind, :, :] + (tau * alpha - 1) * \ + ex[exc[0], exc[1]:exc[1] + rows, exc[2]:exc[2] + cols] + \ + (1 - tau * (1 + alpha)) * \ ob[obc[0], obc[1]:obc[1] + rows, obc[2]:obc[2] + cols] * \ - pr[prc[0], prc[1]:prc[1] + rows, prc[2]:prc[2] + cols] + \ - (alpha - 1) * ex[exc[0], exc[1]:exc[1] + rows, exc[2]:exc[2] + cols] + pr[prc[0], prc[1]:prc[1] + rows, prc[2]:prc[2] + cols] ex[exc[0], exc[1]:exc[1] + rows, exc[2]:exc[2] + cols] += dex aux[ind, :, :] = dex diff --git a/ptypy/engines/DM_local.py b/ptypy/engines/DM_local.py index 834e2129f..30cca894c 100644 --- a/ptypy/engines/DM_local.py +++ b/ptypy/engines/DM_local.py @@ -44,6 +44,12 @@ class DM_local(PositionCorrectionEngine): lowlim = 0.0 help = Difference map tuning parameter, a value of 0 makes it equal to ePIE. + [tau] + default = 1 + type = float + lowlim = 0.0 + help = fourier update parameter, a value of 0 means no fourier update. + [probe_inertia] default = 1e-9 type = float @@ -61,6 +67,18 @@ class DM_local(PositionCorrectionEngine): type = tuple help = Clip object amplitude into this interval + [rescale_probe] + default = True + type = bool + lowlim = 0 + help = Normalise probe power according to data + + [fourier_power_bound] + default = None + type = float + help = If rms error of model vs diffraction data is smaller than this value, Fourier constraint is met + doc = For Poisson-sampled data, the theoretical value for this parameter is 1/4. Set this value higher for noisy data. + [compute_log_likelihood] default = True type = bool @@ -79,10 +97,6 @@ def __init__(self, ptycho_parent, pars=None): # Instance attributes self.error = None self.pbound = None - - # Required to get proper normalization of object inertia - # The actual value is computed in engine_prepare - # Another possibility would be to use the maximum value of all probe storages. self.mean_power = None # keep track of timings @@ -94,7 +108,6 @@ def __init__(self, ptycho_parent, pars=None): self.pr_cfact = {} self.kernels = {} - def engine_initialize(self): """ Prepare for reconstruction. 
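For reference, a minimal single-view numpy sketch of the exit-wave arithmetic that
build_exit_alpha_tau (added above) performs; with alpha=0 and tau=1 it collapses to
the ePIE-style replacement of the exit wave by the Fourier-projected wave:

    import numpy as np

    def exit_update(aux, ex, ob_times_pr, alpha=1.0, tau=1.0):
        # dex exactly as formed per view in build_exit_alpha_tau
        dex = tau * aux + (tau * alpha - 1) * ex + (1 - tau * (1 + alpha)) * ob_times_pr
        return ex + dex

    # alpha=0, tau=1  ->  dex = aux - ex  ->  the updated exit wave equals aux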
@@ -188,7 +201,7 @@ def engine_prepare(self): self.pbound_scan = {} for s in self.di.storages.values(): if not self.pbound_scan.get(s.label): - self.pbound_scan[s.label] = 0.25 + self.pbound_scan[s.label] = self.p.fourier_power_bound else: self.pbound_scan[s.label] = max(pb, self.pbound_scan[s.label]) mean_power += s.mean_power @@ -259,7 +272,7 @@ def engine_iterate(self, num=1): BW = kern.BW # global buffers - pbound = 0 #self.pbound_scan[prep.label] + pbound = self.pbound_scan[prep.label] aux = kern.aux vieworder = prep.vieworder @@ -315,14 +328,15 @@ def engine_iterate(self, num=1): ## build exit wave t1 = time.time() - AWK.build_exit_alpha(aux, addr, ob, pr, ex, alpha=self.p.alpha) + AWK.build_exit_alpha_tau(aux, addr, ob, pr, ex, alpha=self.p.alpha, tau=self.p.tau) FUK.exit_error(aux,addr) FUK.error_reduce(addr, err_exit) self.benchmark.E_Build_exit += time.time() - t1 self.benchmark.calls_fourier += 1 ## probe/object rescale - pr *= np.sqrt(self.mean_power / (np.abs(pr)**2).mean()) + if self.p.rescale_probe: + pr *= np.sqrt(self.mean_power / (np.abs(pr)**2).mean()) # object update t1 = time.time() diff --git a/templates/minimal_prep_and_run_DM_local.py b/templates/minimal_prep_and_run_DM_local.py index 790179012..39cc699f9 100644 --- a/templates/minimal_prep_and_run_DM_local.py +++ b/templates/minimal_prep_and_run_DM_local.py @@ -27,7 +27,7 @@ p.scans.MF = u.Param() # now you have to specify which ScanModel to use with scans.XX.name, # just as you have to give 'name' for engines and PtyScan subclasses. -p.scans.MF.name = 'Vanilla' # or 'Full' +p.scans.MF.name = 'Full' p.scans.MF.data= u.Param() p.scans.MF.data.name = 'MoonFlowerScan' p.scans.MF.data.shape = 128 @@ -39,14 +39,19 @@ # total number of photon in empty beam p.scans.MF.data.photons = 1e8 # Gaussian FWHM of possible detector blurring -p.scans.MF.data.psf = 0. +p.scans.MF.data.psf = 0.5 +p.scans.MF.coherence = u.Param() +p.scans.MF.coherence.num_probe_modes = 2 # attach a reconstrucion engine p.engines = u.Param() p.engines.engine00 = u.Param() p.engines.engine00.name = 'DM_local' p.engines.engine00.numiter = 100 -p.engines.engine00.alpha = 0 # behaves like ePIE +p.engines.engine00.alpha = 0.0 # 0 behaves like ePIE +p.engines.engine00.tau = 1.0 +p.engines.engine00.rescale_probe = False +p.engines.engine00.fourier_power_bound = 0.0 # prepare and run P = Ptycho(p,level=5) From ebbe0226cc6c38c8a0ffab7b0a5c772b3751773f Mon Sep 17 00:00:00 2001 From: Benedikt Daurer Date: Mon, 22 Feb 2021 15:21:36 +0000 Subject: [PATCH 05/56] same power bound for all scans --- ptypy/engines/DM_local.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/ptypy/engines/DM_local.py b/ptypy/engines/DM_local.py index 30cca894c..f6dbb615a 100644 --- a/ptypy/engines/DM_local.py +++ b/ptypy/engines/DM_local.py @@ -200,10 +200,7 @@ def engine_prepare(self): mean_power = 0. 
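            # For reference, the ePIE-style rule behind POK.ob_update_local and
            # POK.pr_update_local used in the view loop above, as a single-view
            # numpy sketch (object first, then probe with the updated object):
            #   aux = pr * ob                                         # old exit wave
            #   ob += pr.conj() * (ex - aux) / np.max(np.abs(pr)**2)
            #   pr += ob.conj() * (ex - aux) / np.max(np.abs(ob)**2)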
self.pbound_scan = {} for s in self.di.storages.values(): - if not self.pbound_scan.get(s.label): - self.pbound_scan[s.label] = self.p.fourier_power_bound - else: - self.pbound_scan[s.label] = max(pb, self.pbound_scan[s.label]) + self.pbound_scan[s.label] = self.p.fourier_power_bound mean_power += s.mean_power self.mean_power = mean_power / len(self.di.storages) From 90961ef986379ea4ce5b3886e1b2a553986122a3 Mon Sep 17 00:00:00 2001 From: Benedikt Daurer Date: Mon, 22 Feb 2021 18:56:40 +0000 Subject: [PATCH 06/56] Use shuffled vieworder --- ptypy/engines/DM_local.py | 2 +- templates/minimal_prep_and_run_DM_local.py | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/ptypy/engines/DM_local.py b/ptypy/engines/DM_local.py index f6dbb615a..09afba39b 100644 --- a/ptypy/engines/DM_local.py +++ b/ptypy/engines/DM_local.py @@ -282,7 +282,7 @@ def engine_iterate(self, num=1): np.random.shuffle(vieworder) # Iterate through views - for i in prep.vieworder: + for i in vieworder: # Get local adress and arrays addr = prep.addr[i,None] diff --git a/templates/minimal_prep_and_run_DM_local.py b/templates/minimal_prep_and_run_DM_local.py index 39cc699f9..51f43b1c4 100644 --- a/templates/minimal_prep_and_run_DM_local.py +++ b/templates/minimal_prep_and_run_DM_local.py @@ -39,17 +39,17 @@ # total number of photon in empty beam p.scans.MF.data.photons = 1e8 # Gaussian FWHM of possible detector blurring -p.scans.MF.data.psf = 0.5 +p.scans.MF.data.psf = 0.0 p.scans.MF.coherence = u.Param() -p.scans.MF.coherence.num_probe_modes = 2 +p.scans.MF.coherence.num_probe_modes = 1 # attach a reconstrucion engine p.engines = u.Param() p.engines.engine00 = u.Param() p.engines.engine00.name = 'DM_local' p.engines.engine00.numiter = 100 -p.engines.engine00.alpha = 0.0 # 0 behaves like ePIE -p.engines.engine00.tau = 1.0 +p.engines.engine00.alpha = 0 # alpha=0, tau=1 behaves like ePIE +p.engines.engine00.tau = 1 p.engines.engine00.rescale_probe = False p.engines.engine00.fourier_power_bound = 0.0 From 998c96977799d7e4aaba8fe29e0131dfac65b79b Mon Sep 17 00:00:00 2001 From: Benedikt Daurer Date: Sat, 27 Feb 2021 12:49:09 +0000 Subject: [PATCH 07/56] fixed imports --- ptypy/accelerate/base/engines/DM_local.py | 22 +++++++++---------- templates/minimal_prep_and_run_resample_DM.py | 2 +- templates/minimal_prep_and_run_resample_ML.py | 2 +- 3 files changed, 13 insertions(+), 13 deletions(-) diff --git a/ptypy/accelerate/base/engines/DM_local.py b/ptypy/accelerate/base/engines/DM_local.py index 09afba39b..080cdc3df 100644 --- a/ptypy/accelerate/base/engines/DM_local.py +++ b/ptypy/accelerate/base/engines/DM_local.py @@ -10,16 +10,16 @@ import numpy as np import time -from .. import utils as u -from ..utils.verbose import logger, log -from ..utils import parallel -from .. import defaults_tree -from . 
import register, DM_serial -from .base import PositionCorrectionEngine -from ..core.manager import Full, Vanilla, Bragg3dModel, BlockVanilla, BlockFull -from ..accelerate.array_based.kernels import FourierUpdateKernel, AuxiliaryWaveKernel, PoUpdateKernel, PositionCorrectionKernel -from ..accelerate.array_based import address_manglers -from ..accelerate.array_based import array_utils as au +from ptypy import utils as u +from ptypy.utils.verbose import logger, log +from ptypy.utils import parallel +from ptypy import defaults_tree +from ptypy.engines import register, DM_serial +from ptypy.engines.base import PositionCorrectionEngine +from ptypy.core.manager import Full, Vanilla, Bragg3dModel, BlockVanilla, BlockFull +from ptypy.accelerate.base.kernels import FourierUpdateKernel, AuxiliaryWaveKernel, PoUpdateKernel, PositionCorrectionKernel +from ptypy.accelerate.base import address_manglers +from ptypy.accelerate.base import array_utils as au __all__ = ['DM_local'] @@ -389,4 +389,4 @@ def engine_finalize(self): for j,(pname, pod) in enumerate(view.pods.items()): delta = (prep.original_addr[i][j][1][1:] - prep.addr[i][j][1][1:]) * res pod.ob_view.coord += delta - pod.ob_view.storage.update_views(pod.ob_view) \ No newline at end of file + pod.ob_view.storage.update_views(pod.ob_view) diff --git a/templates/minimal_prep_and_run_resample_DM.py b/templates/minimal_prep_and_run_resample_DM.py index 08c12540c..b06281223 100644 --- a/templates/minimal_prep_and_run_resample_DM.py +++ b/templates/minimal_prep_and_run_resample_DM.py @@ -14,7 +14,7 @@ # set home path p.io = u.Param() p.io.home = "/tmp/ptypy/" -p.io.autosave = None +p.io.autosave = u.Param(active=False) # max 200 frames (128x128px) of diffraction data p.scans = u.Param() diff --git a/templates/minimal_prep_and_run_resample_ML.py b/templates/minimal_prep_and_run_resample_ML.py index f0d5619f9..2edbb8bcc 100644 --- a/templates/minimal_prep_and_run_resample_ML.py +++ b/templates/minimal_prep_and_run_resample_ML.py @@ -15,7 +15,7 @@ # set home path p.io = u.Param() p.io.home = "/tmp/ptypy/" -p.io.autosave = None +p.io.autosave = u.Param(active=False) #p.io.autoplot = u.Param() #p.io.autoplot.dump = True #p.io.autoplot = False From 47ebe6bb8038a058449c163516058687e2d8e2ae Mon Sep 17 00:00:00 2001 From: Benedikt Daurer Date: Sun, 28 Feb 2021 14:31:12 +0000 Subject: [PATCH 08/56] More import fixes --- ptypy/accelerate/base/engines/DM_local.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ptypy/accelerate/base/engines/DM_local.py b/ptypy/accelerate/base/engines/DM_local.py index 080cdc3df..f93ff21e2 100644 --- a/ptypy/accelerate/base/engines/DM_local.py +++ b/ptypy/accelerate/base/engines/DM_local.py @@ -14,9 +14,10 @@ from ptypy.utils.verbose import logger, log from ptypy.utils import parallel from ptypy import defaults_tree -from ptypy.engines import register, DM_serial +from ptypy.engines import register from ptypy.engines.base import PositionCorrectionEngine from ptypy.core.manager import Full, Vanilla, Bragg3dModel, BlockVanilla, BlockFull +from ptypy.accelerate.base.engines import DM_serial from ptypy.accelerate.base.kernels import FourierUpdateKernel, AuxiliaryWaveKernel, PoUpdateKernel, PositionCorrectionKernel from ptypy.accelerate.base import address_manglers from ptypy.accelerate.base import array_utils as au From 99f6ad787912809d7ab63a04ab5fb135821c184e Mon Sep 17 00:00:00 2001 From: Benedikt Daurer Date: Sun, 28 Feb 2021 22:29:51 +0000 Subject: [PATCH 09/56] Save out arrays for debugging --- 
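The additions below repeat a single guarded HDF5 dump ahead of each kernel call; schematically
(the file name "some_kernel" is a placeholder, everything else follows the pattern used here):

    # on the master rank only, every 10th iteration, dump the inputs of the next kernel call
    if self.p.debug and parallel.master and (self.engine.curiter % 10 == 0):
        with h5py.File(self.p.debug + "/some_kernel_%04d.h5" % self.engine.curiter, "w") as f:
            f["aux"] = aux
            f["addr"] = addr

The dls_tests added in the following patch read these files back to compare the base and pycuda kernels.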
ptypy/accelerate/base/engines/ML_serial.py | 109 ++++++++++++++++++++- 1 file changed, 108 insertions(+), 1 deletion(-) diff --git a/ptypy/accelerate/base/engines/ML_serial.py b/ptypy/accelerate/base/engines/ML_serial.py index 04bfd58ba..d81873f01 100644 --- a/ptypy/accelerate/base/engines/ML_serial.py +++ b/ptypy/accelerate/base/engines/ML_serial.py @@ -25,12 +25,24 @@ PositionCorrectionKernel from ptypy.accelerate.base import address_manglers -__all__ = ['ML_serial'] +# for debugging +import h5py +__all__ = ['ML_serial'] @register() class ML_serial(ML): + """ + Defaults: + + [debug] + default = None + type = str + help = For debugging purposes, dump arrays into given directory + + """ + def __init__(self, ptycho_parent, pars=None): """ Maximum likelihood reconstruction engine. @@ -355,23 +367,87 @@ def new_grad(self): prg = pr_grad.S[pID].data I = self.engine.di.S[dID].data + # debugging + if self.p.debug and parallel.master and (self.engine.curiter % 10 == 0): + with h5py.File(self.p.debug + "/build_aux_no_ex_%04d.h5" %self.engine.curiter, "w") as f: + f["aux"] = aux + f["addr"] = addr + f["ob"] = ob + f["pr"] = pr + # make propagated exit (to buffer) AWK.build_aux_no_ex(aux, addr, ob, pr, add=False) + # debugging + if self.p.debug and parallel.master and (self.engine.curiter % 10 == 0): + with h5py.File(self.p.debug + "/forward_%04d.h5" %self.engine.curiter, "w") as f: + f["aux"] = aux + # forward prop aux[:] = FW(aux) + # debugging + if self.p.debug and parallel.master and (self.engine.curiter % 10 == 0): + with h5py.File(self.p.debug + "/make_model_%04d.h5" %self.engine.curiter, "w") as f: + f["aux"] = aux + f["addr"] = addr + GDK.make_model(aux, addr) + # debugging + if self.p.debug and parallel.master and (self.engine.curiter % 10 == 0): + with h5py.File(self.p.debug + "/floating_intensities_%04d.h5" %self.engine.curiter, "w") as f: + f["w"] = w + f["addr"] = addr + f["I"] = I + f["fic"] = fic + if self.p.floating_intensities: GDK.floating_intensity(addr, w, I, fic) + # debugging + if self.p.debug and parallel.master and (self.engine.curiter % 10 == 0): + with h5py.File(self.p.debug + "/main_%04d.h5" %self.engine.curiter, "w") as f: + f["aux"] = aux + f["addr"] = addr + f["w"] = w + f["I"] = I + GDK.main(aux, addr, w, I) + + # debugging + if self.p.debug and parallel.master and (self.engine.curiter % 10 == 0): + with h5py.File(self.p.debug + "/error_reduce_%04d.h5" %self.engine.curiter, "w") as f: + f["addr"] = addr + f["err_phot"] = err_phot + GDK.error_reduce(addr, err_phot) + # debugging + if self.p.debug and parallel.master and (self.engine.curiter % 10 == 0): + with h5py.File(self.p.debug + "/backward_%04d.h5" %self.engine.curiter, "w") as f: + f["aux"] = aux + aux[:] = BW(aux) + # debugging + if self.p.debug and parallel.master and (self.engine.curiter % 10 == 0): + with h5py.File(self.p.debug + "/op_update_ml_%04d.h5" %self.engine.curiter, "w") as f: + f["aux"] = aux + f["addr"] = addr + f["obg"] = obg + f["pr"] = pr + POK.ob_update_ML(addr, obg, pr, aux) + + # debugging + if self.p.debug and parallel.master and (self.engine.curiter % 10 == 0): + with h5py.File(self.p.debug + "/pr_update_ml_%04d.h5" %self.engine.curiter, "w") as f: + f["aux"] = aux + f["addr"] = addr + f["ob"] = ob + f["prg"] = prg + POK.pr_update_ML(addr, prg, ob, aux) for dID, prep in self.engine.diff_info.items(): @@ -391,6 +467,12 @@ def new_grad(self): # Object regularizer if self.regularizer: for name, s in self.engine.ob.storages.items(): + + # debugging + if self.p.debug and parallel.master and 
(self.engine.curiter % 10 == 0): + with h5py.File(self.p.debug + "/regul_grad_%04d.h5" %self.engine.curiter, "w") as f: + f["ob"] = s.data + ob_grad.storages[name].data += self.regularizer.grad(s.data) LL += self.regularizer.LL @@ -447,8 +529,26 @@ def poly_line_coeffs(self, c_ob_h, c_pr_h): a[:] = FW(a) b[:] = FW(b) + # debugging + if self.p.debug and parallel.master and (self.engine.curiter % 10 == 0): + with h5py.File(self.p.debug + "/make_a012_%04d.h5" %self.engine.curiter, "w") as g: + g["addr"] = addr + g["a"] = a + g["b"] = b + g["f"] = f + g["I"] = I + g["fic"] = fic + GDK.make_a012(f, a, b, addr, I, fic) + # debugging + if self.p.debug and parallel.master and (self.engine.curiter % 10 == 0): + with h5py.File(self.p.debug + "/fill_b_%04d.h5" %self.engine.curiter, "w") as f: + f["addr"] = addr + f["Brenorm"] = Brenorm + f["w"] = w + f["B"] = B + GDK.fill_b(addr, Brenorm, w, B) parallel.allreduce(B) @@ -456,6 +556,13 @@ def poly_line_coeffs(self, c_ob_h, c_pr_h): # Object regularizer if self.regularizer: for name, s in self.ob.storages.items(): + + # debugging + if self.p.debug and parallel.master and (self.engine.curiter % 10 == 0): + with h5py.File(self.p.debug + "/regul_poly_line_coeffs_%04d.h5" %self.engine.curiter, "w") as f: + f["ob"] = s.data + f["obh"] = c_ob_h.storages[name].data + B += Brenorm * self.regularizer.poly_line_coeffs( c_ob_h.storages[name].data, s.data) From 0150209828467d1d605e7a00d5f71f8cb80d21ad Mon Sep 17 00:00:00 2001 From: Benedikt Daurer Date: Sun, 28 Feb 2021 22:30:23 +0000 Subject: [PATCH 10/56] Added DLS tests based on real data --- .../cuda_pycuda_tests/dls_tests/__init__.py | 0 .../dls_tests/dls_auxiliary_wave_kernel.py | 51 +++++ .../dls_tests/dls_gradient_descent_kernel.py | 201 ++++++++++++++++++ .../dls_tests/dls_po_update_kernel_test.py | 78 +++++++ .../dls_tests/dls_propagation_test.py | 93 ++++++++ .../dls_tests/dls_regularizer_kernel_test.py | 72 +++++++ 6 files changed, 495 insertions(+) create mode 100644 test/accelerate_tests/cuda_pycuda_tests/dls_tests/__init__.py create mode 100644 test/accelerate_tests/cuda_pycuda_tests/dls_tests/dls_auxiliary_wave_kernel.py create mode 100644 test/accelerate_tests/cuda_pycuda_tests/dls_tests/dls_gradient_descent_kernel.py create mode 100644 test/accelerate_tests/cuda_pycuda_tests/dls_tests/dls_po_update_kernel_test.py create mode 100644 test/accelerate_tests/cuda_pycuda_tests/dls_tests/dls_propagation_test.py create mode 100644 test/accelerate_tests/cuda_pycuda_tests/dls_tests/dls_regularizer_kernel_test.py diff --git a/test/accelerate_tests/cuda_pycuda_tests/dls_tests/__init__.py b/test/accelerate_tests/cuda_pycuda_tests/dls_tests/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/test/accelerate_tests/cuda_pycuda_tests/dls_tests/dls_auxiliary_wave_kernel.py b/test/accelerate_tests/cuda_pycuda_tests/dls_tests/dls_auxiliary_wave_kernel.py new file mode 100644 index 000000000..ce52181c8 --- /dev/null +++ b/test/accelerate_tests/cuda_pycuda_tests/dls_tests/dls_auxiliary_wave_kernel.py @@ -0,0 +1,51 @@ +''' +Testing based on real data +''' +import h5py +import unittest +import numpy as np +from .. 
import perfrun, PyCudaTest, have_pycuda + +if have_pycuda(): + from pycuda import gpuarray + from ptypy.accelerate.cuda_pycuda.kernels import AuxiliaryWaveKernel +from ptypy.accelerate.base.kernels import AuxiliaryWaveKernel as BaseAuxiliaryWaveKernel + + +COMPLEX_TYPE = np.complex64 +FLOAT_TYPE = np.float32 +INT_TYPE = np.int32 + +class DlsAuxiliaryWaveKernelTest(PyCudaTest): + + datadir = "/dls/science/users/iat69393/gpu-hackathon/test-data/" + iter = 10 + rtol = 1e-6 + + def test_build_aux_no_ex_noadd_UNITY(self): + + # Load data + with h5py.File(self.datadir + "build_aux_no_ex_%04d.h5" %self.iter, "r") as f: + aux = f["aux"][:] + addr = f["addr"][:] + ob = f["ob"][:] + pr = f["pr"][:] + + # Copy data to device + aux_dev = gpuarray.to_gpu(aux) + addr_dev = gpuarray.to_gpu(addr) + ob_dev = gpuarray.to_gpu(ob) + pr_dev = gpuarray.to_gpu(pr) + + # CPU kernel + BAWK = BaseAuxiliaryWaveKernel() + BAWK.allocate() + BAWK.build_aux_no_ex(aux, addr, ob, pr, add=False) + + ## GPU kernel + AWK = AuxiliaryWaveKernel(self.stream) + AWK.allocate() + AWK.build_aux_no_ex(aux_dev, addr_dev, ob_dev, pr_dev, add=False) + + ## Assert + np.testing.assert_allclose(aux_dev.get(), aux, rtol=self.rtol, err_msg="The auxiliary_wave does not match the base kernel output") \ No newline at end of file diff --git a/test/accelerate_tests/cuda_pycuda_tests/dls_tests/dls_gradient_descent_kernel.py b/test/accelerate_tests/cuda_pycuda_tests/dls_tests/dls_gradient_descent_kernel.py new file mode 100644 index 000000000..ff60cd788 --- /dev/null +++ b/test/accelerate_tests/cuda_pycuda_tests/dls_tests/dls_gradient_descent_kernel.py @@ -0,0 +1,201 @@ +''' +Testing on real data +''' + +import h5py +import unittest +import numpy as np +from .. import perfrun, PyCudaTest, have_pycuda + + +if have_pycuda(): + from pycuda import gpuarray + from ptypy.accelerate.cuda_pycuda.kernels import GradientDescentKernel +from ptypy.accelerate.base.kernels import GradientDescentKernel as BaseGradientDescentKernel + +COMPLEX_TYPE = np.complex64 +FLOAT_TYPE = np.float32 +INT_TYPE = np.int32 + + +class DlsGradientDescentKernelTest(PyCudaTest): + + datadir = "/dls/science/users/iat69393/gpu-hackathon/test-data/" + iter = 0 + rtol = 1e-6 + atol = 1e-6 + + def test_make_model_UNITY(self): + + # Load data + with h5py.File(self.datadir + "make_model_%04d.h5" %self.iter, "r") as f: + aux = f["aux"][:] + addr = f["addr"][:] + + # Copy data to device + aux_dev = gpuarray.to_gpu(aux) + addr_dev = gpuarray.to_gpu(addr) + + # CPU Kernel + BGDK = BaseGradientDescentKernel(aux, addr.shape[1]) + BGDK.allocate() + BGDK.make_model(aux, addr) + + # GPU kernel + GDK = GradientDescentKernel(aux_dev, addr.shape[1]) + GDK.allocate() + GDK.make_model(aux_dev, addr_dev) + + ## Assert + np.testing.assert_allclose(BGDK.cpu.Imodel, GDK.gpu.Imodel.get(), atol=self.atol, rtol=self.rtol, + err_msg="`Imodel` buffer has not been updated as expected") + + + def test_floating_intensity_UNITY(self): + + # Load data + with h5py.File(self.datadir + "floating_intensities_%04d.h5" %self.iter, "r") as f: + w = f["w"][:] + addr = f["addr"][:] + I = f["I"][:] + fic = f["fic"][:] + with h5py.File(self.datadir + "make_model_%04d.h5" %self.iter, "r") as f: + aux = f["aux"][:] + + # Copy data to device + aux_dev = gpuarray.to_gpu(aux) + w_dev = gpuarray.to_gpu(w) + addr_dev = gpuarray.to_gpu(addr) + I_dev = gpuarray.to_gpu(I) + fic_dev = gpuarray.to_gpu(fic) + + # CPU Kernel + BGDK = BaseGradientDescentKernel(aux, addr.shape[1]) + BGDK.allocate() + BGDK.floating_intensity(addr, w, I, fic) + 
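        # (Each *_UNITY test in this file follows the same pattern: load the arrays dumped
        #  by ML_serial's debug hooks, run the base/numpy kernel as above, run the pycuda
        #  kernel on device copies below, and compare the two with assert_allclose.)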
+ # GPU kernel + GDK = GradientDescentKernel(aux_dev, addr.shape[1]) + GDK.allocate() + GDK.floating_intensity(addr_dev, w_dev, I_dev, fic_dev) + + ## Assert + np.testing.assert_allclose(BGDK.cpu.Imodel, GDK.gpu.Imodel.get(), atol=self.atol, rtol=self.rtol, + err_msg="`Imodel` buffer has not been updated as expected") + np.testing.assert_allcolse(fic, fic_dev.get(), atol=self.atol, rtol=self.rtol, + err_msg="floating intensity coeff (fic) has not been updated as expected") + + + def test_main_and_error_reduce_UNITY(self): + + # Load data + with h5py.File(self.datadir + "main_%04d.h5" %self.iter, "r") as f: + aux = f["aux"][:] + addr = f["addr"][:] + w = f["w"][:] + I = f["I"][:] + # Load data + with h5py.File(self.datadir + "error_reduce_%04d.h5" %self.iter, "r") as f: + err_phot = f["err_phot"][:] + + # Copy data to device + aux_dev = gpuarray.to_gpu(aux) + w_dev = gpuarray.to_gpu(w) + addr_dev = gpuarray.to_gpu(addr) + I_dev = gpuarray.to_gpu(I) + err_phot_dev = gpuarray.to_gpu(err_phot) + + # CPU Kernel + BGDK = BaseGradientDescentKernel(aux, addr.shape[1]) + BGDK.allocate() + BGDK.main(aux, addr, w, I) + BGDK.error_reduce(addr, err_phot) + + # GPU kernel + GDK = GradientDescentKernel(aux_dev, addr.shape[1]) + GDK.allocate() + GDK.main(aux_dev, addr_dev, w_dev, I_dev) + GDK.error_reduce(addr_dev, err_phot_dev) + + ## Assert + np.testing.assert_allclose(aux, aux_dev.get(), atol=self.atol, rtol=self.rtol, + err_msg="Auxiliary has not been updated as expected") + np.testing.assert_allclose(BGDK.cpu.LLerr, GDK.gpu.LLerr.get(), atol=self.atol, rtol=self.rtol, + err_msg="LogLikelihood error has not been updated as expected") + np.testing.assert_array_allclose(err_phot, err_phot_dev.get(), atol=self.atol, rtol=self.rtol, + err_msg="`err_phot` has not been updated as expected") + + + def test_make_a012_UNITY(self): + + # Load data + with h5py.File(self.datadir + "make_a012_%04d.h5" %self.iter, "r") as g: + addr = g["addr"][:] + I = g["I"][:] + f = g["f"][:] + a = g["a"][:] + b = g["b"][:] + fic = g["fic"][:] + with h5py.File(self.datadir + "make_model_%04d.h5" %self.iter, "r") as h: + aux = h["aux"][:] + + # Copy data to device + aux_dev = gpuarray.to_gpu(aux) + addr_dev = gpuarray.to_gpu(addr) + I_dev = gpuarray.to_gpu(I) + f_dev = gpuarray.to_gpu(f) + a_dev = gpuarray.to_gpu(a) + b_dev = gpuarray.to_gpu(b) + fic_dev = gpuarray.to_gpu(fic) + + # CPU Kernel + BGDK = BaseGradientDescentKernel(aux, addr.shape[1]) + BGDK.allocate() + BGDK.make_a012(f, a, b, addr, I, fic) + + # GPU kernel + GDK = GradientDescentKernel(aux_dev, addr.shape[1]) + GDK.allocate() + GDK.make_a012(f_dev, a_dev, b_dev, addr_dev, I_dev, fic_dev) + + ## Assert + np.testing.assert_allclose(BGDK.cpu.Imodel, GDK.gpu.Imodel.get(), atol=self.atol, rtol=self.rtol, + err_msg="Imodel error has not been updated as expected") + np.testing.assert_allclose(BGDK.cpu.LLerr, GDK.gpu.LLerr.get(), atol=self.atol, rtol=self.rtol, + err_msg="LLerr error has not been updated as expected") + np.testing.assert_allclose(BGDK.cpu.LLden, GDK.gpu.LLden.get(), atol=self.atol, rtol=self.rtol, + err_msg="LLden error has not been updated as expected") + + + def test_fill_b_UNITY(self): + + # Load data + with h5py.File(self.datadir + "fill_b_%04d.h5" %self.iter, "r") as f: + w = f["w"][:] + addr = f["addr"][:] + B = f["B"][:] + Brenorm = f["Brenorm"][...] 
+ with h5py.File(self.datadir + "make_model_%04d.h5" %self.iter, "r") as f: + aux = f["aux"][:] + print(B) + + # Copy data to device + aux_dev = gpuarray.to_gpu(aux) + w_dev = gpuarray.to_gpu(w) + addr_dev = gpuarray.to_gpu(addr) + B_dev = gpuarray.to_gpu(B.astype(np.float32)) + + # CPU Kernel + BGDK = BaseGradientDescentKernel(aux, addr.shape[1]) + BGDK.allocate() + BGDK.fill_b(addr, Brenorm, w, B) + + # GPU kernel + GDK = GradientDescentKernel(aux_dev, addr.shape[1]) + GDK.allocate() + GDK.fill_b(addr_dev, Brenorm, w_dev, B_dev) + + ## Assert + np.testing.assert_allclose(B, B_dev.get(), rtol=self.rtol, atol=self.atol, + err_msg="`B` has not been updated as expected") + diff --git a/test/accelerate_tests/cuda_pycuda_tests/dls_tests/dls_po_update_kernel_test.py b/test/accelerate_tests/cuda_pycuda_tests/dls_tests/dls_po_update_kernel_test.py new file mode 100644 index 000000000..0b5194c44 --- /dev/null +++ b/test/accelerate_tests/cuda_pycuda_tests/dls_tests/dls_po_update_kernel_test.py @@ -0,0 +1,78 @@ +''' +Testing on real data +''' + +import h5py +import unittest +import numpy as np +from .. import PyCudaTest, have_pycuda + +if have_pycuda(): + from pycuda import gpuarray + from ptypy.accelerate.cuda_pycuda.kernels import PoUpdateKernel +from ptypy.accelerate.base.kernels import PoUpdateKernel as BasePoUpdateKernel + +COMPLEX_TYPE = np.complex64 +FLOAT_TYPE = np.float32 +INT_TYPE = np.int32 + +class DlsPoUpdateKernelTest(PyCudaTest): + + datadir = "/dls/science/users/iat69393/gpu-hackathon/test-data/" + iter = 0 + rtol = 1e-6 + atol = 1e-6 + + def test_op_update_ml_UNITY(self): + + # Load data + with h5py.File(self.datadir + "op_update_ml_%04d.h5" %self.iter, "r") as f: + aux = f["aux"][:] + addr = f["addr"][:] + obg = f["obg"][:] + pr = f["pr"][:] + + # Copy data to device + aux_dev = gpuarray.to_gpu(aux) + addr_dev = gpuarray.to_gpu(addr) + obg_dev = gpuarray.to_gpu(obg) + pr_dev = gpuarray.to_gpu(pr) + + # CPU Kernel + BPOK = BasePoUpdateKernel() + BPOK.ob_update_ML(addr, obg, pr, aux) + + # GPU Kernel + POK = PoUpdateKernel() + POK.ob_update_ML(addr_dev, obg_dev, pr_dev, aux_dev) + + ## Assert + np.testing.assert_allclose(obg, obg_dev.get(), atol=self.atol, rtol=self.rtol, + err_msg="The object array has not been updated as expected") + + def test_pr_update_ml_UNITY(self): + + # Load data + with h5py.File(self.datadir + "pr_update_ml_%04d.h5" %self.iter, "r") as f: + aux = f["aux"][:] + addr = f["addr"][:] + ob = f["ob"][:] + prg = f["prg"][:] + + # Copy data to device + aux_dev = gpuarray.to_gpu(aux) + addr_dev = gpuarray.to_gpu(addr) + ob_dev = gpuarray.to_gpu(ob) + prg_dev = gpuarray.to_gpu(prg) + + # CPU Kernel + BPOK = BasePoUpdateKernel() + BPOK.pr_update_ML(addr, prg, ob, aux) + + # GPU Kernel + POK = PoUpdateKernel() + POK.ob_update_ML(addr_dev, prg_dev, ob_dev, aux_dev) + + ## Assert + np.testing.assert_allclose(prg, prg_dev.get(), atol=self.atol, rtol=self.rtol, + err_msg="The probe array has not been updated as expected") \ No newline at end of file diff --git a/test/accelerate_tests/cuda_pycuda_tests/dls_tests/dls_propagation_test.py b/test/accelerate_tests/cuda_pycuda_tests/dls_tests/dls_propagation_test.py new file mode 100644 index 000000000..2edc9276e --- /dev/null +++ b/test/accelerate_tests/cuda_pycuda_tests/dls_tests/dls_propagation_test.py @@ -0,0 +1,93 @@ +''' +testing on real data +''' + +import h5py +import unittest +import numpy as np +import ptypy.utils as u +from .. 
import PyCudaTest, have_pycuda + +if have_pycuda(): + from pycuda import gpuarray + from ptypy.accelerate.cuda_pycuda.kernels import PropagationKernel + +from ptypy.core import geometry +from ptypy.core import Base as theBase + +# subclass for dictionary access +Base = type('Base',(theBase,),{}) + +COMPLEX_TYPE = np.complex64 +FLOAT_TYPE = np.float32 +INT_TYPE = np.int32 + +class DLsPropagationKernelTest(PyCudaTest): + + datadir = "/dls/science/users/iat69393/gpu-hackathon/test-data/" + iter = 0 + rtol = 1e-6 + atol = 1e-6 + + def set_up_farfield(self,shape): + P = Base() + P.CType = COMPLEX_TYPE + P.Ftype = FLOAT_TYPE + g = u.Param() + g.energy = None # u.keV2m(1.0)/6.32e-7 + g.lam = 5.32e-7 + g.distance = 15e-2 + g.psize = 24e-6 + g.shape = shape + g.propagation = "farfield" + G = geometry.Geo(owner=P, pars=g) + return G + + def test_forward_UNITY(self): + + # Load data + with h5py.File(self.datadir + "forward_%04d.h5" %self.iter, "r") as f: + aux = f["aux"][:] + + # Copy data to device + aux_dev = gpuarray.to_gpu(aux) + + # Geometry + geo = self.set_up_farfield(aux.shape[1:]) + + # CPU kernel + aux = geo.propagator.fw(aux) + + # GPU kernel + PropK = PropagationKernel(aux_dev, geo.propagator, queue_thread=self.stream) + PropK.allocate() + PropK.fw(aux_dev, aux_dev) + + ## Assert + np.testing.assert_allclose(aux, aux_dev.get(), atol=self.atol, rtol=self.rtol, + err_msg="CPU aux is \n%s, \nbut GPU aux is \n %s, \n " % (repr(aux), repr(aux_d.get()))) + + + def test_ackward_UNITY(self): + + # Load data + with h5py.File(self.datadir + "backward_%04d.h5" %self.iter, "r") as f: + aux = f["aux"][:] + + # Copy data to device + aux_dev = gpuarray.to_gpu(aux) + + # Geometry + geo = self.set_up_farfield(aux.shape[1:]) + + # CPU kernel + aux = geo.propagator.bw(aux) + + # GPU kernel + PropK = PropagationKernel(aux_dev, geo.propagator, queue_thread=self.stream) + PropK.allocate() + PropK.bw(aux_dev, aux_dev) + + ## Assert + np.testing.assert_allclose(aux, aux_dev.get(), atol=self.atol, rtol=self.rtol, + err_msg="CPU aux is \n%s, \nbut GPU aux is \n %s, \n " % (repr(aux), repr(aux_d.get()))) diff --git a/test/accelerate_tests/cuda_pycuda_tests/dls_tests/dls_regularizer_kernel_test.py b/test/accelerate_tests/cuda_pycuda_tests/dls_tests/dls_regularizer_kernel_test.py new file mode 100644 index 000000000..d083d94ac --- /dev/null +++ b/test/accelerate_tests/cuda_pycuda_tests/dls_tests/dls_regularizer_kernel_test.py @@ -0,0 +1,72 @@ +''' +Testing on real data +''' + +import h5py +import unittest +import numpy as np +from .. 
import PyCudaTest, have_pycuda + +if have_pycuda(): + from pycuda import gpuarray + from ptypy.accelerate.cuda_pycuda.engines.ML_pycuda import Regul_del2_pycuda + import pycuda.driver as cuda +from ptypy.engines.ML import Regul_del2 + +COMPLEX_TYPE = np.complex64 +FLOAT_TYPE = np.float32 +INT_TYPE = np.int32 + +class DlsRegularizerTest(PyCudaTest): + + datadir = "/dls/science/users/iat69393/gpu-hackathon/test-data/" + iter = 0 + rtol = 1e-6 + atol = 1e-6 + + def test_regularizer_grad_UNITY(self): + + # Load data + with h5py.File(self.datadir + "regul_grad_%04d.h5" %self.iter, "r") as f: + ob = f["ob"][:] + + # Copy data to device + ob_dev = gpuarray.to_gpu(ob) + + # CPU Kernel + regul = Regul_del2(0.1) + obr = regul.grad(ob) + + # GPU Kernel + regul_pycuda = Regul_del2_pycuda(0.1, queue=self.stream, allocator=cuda.mem_alloc) + obr_dev = regul_pycuda.grad(ob_dev) + + ## Assert + np.testing.assert_allclose(obr, obr_dev.get(), atol=self.atol, rtol=self.rtol, + err_msg="The object array has not been updated as expected") + np.testing.assert_allclose(regul.LL, regul_pycuda.LL.get(), atol=self.atol, rtol=self.rtol, + err_msg="The LL array has not been updated as expected") + + + def test_regularizer_poly_line_ceoffs_UNITY(self): + + # Load data + with h5py.File(self.datadir + "regul_poly_line_coeffs_%04d.h5" %self.iter, "r") as f: + ob = f["ob"][:] + obh = f["obh"][:] + + # Copy data to device + ob_dev = gpuarray.to_gpu(ob) + obh_dev = gpuarray.to_gpu(obh) + + # CPU Kernel + regul = Regul_del2(0.1) + res = regul.poly_line_coeffs(obh, ob) + + # GPU Kernel + regul_pycuda = Regul_del2_pycuda(0.1, queue=self.stream, allocator=cuda.mem_alloc) + res_dev = regul_pycuda.poly_line_coeffs(obh_dev, ob_dev) + + ## Assert + np.testing.assert_allclose(res, res_dev.get(), atol=self.atol, rtol=self.rtol, + err_msg="The B array has not been updated as expected") From 709c65d4cc898dc9b9d94e9cb41626a6c7da45dd Mon Sep 17 00:00:00 2001 From: Benedikt Daurer Date: Sun, 28 Feb 2021 23:50:40 +0000 Subject: [PATCH 11/56] most of dls_tests are working now --- ...kernel.py => dls_auxiliary_wave_kernel_test.py} | 0 ...rnel.py => dls_gradient_descent_kernel_test.py} | 13 ++++++------- .../dls_tests/dls_po_update_kernel_test.py | 4 ++-- ...tion_test.py => dls_propagation_kernel_test.py} | 14 +++++++------- .../dls_tests/dls_regularizer_kernel_test.py | 6 +++--- 5 files changed, 18 insertions(+), 19 deletions(-) rename test/accelerate_tests/cuda_pycuda_tests/dls_tests/{dls_auxiliary_wave_kernel.py => dls_auxiliary_wave_kernel_test.py} (100%) rename test/accelerate_tests/cuda_pycuda_tests/dls_tests/{dls_gradient_descent_kernel.py => dls_gradient_descent_kernel_test.py} (94%) rename test/accelerate_tests/cuda_pycuda_tests/dls_tests/{dls_propagation_test.py => dls_propagation_kernel_test.py} (84%) diff --git a/test/accelerate_tests/cuda_pycuda_tests/dls_tests/dls_auxiliary_wave_kernel.py b/test/accelerate_tests/cuda_pycuda_tests/dls_tests/dls_auxiliary_wave_kernel_test.py similarity index 100% rename from test/accelerate_tests/cuda_pycuda_tests/dls_tests/dls_auxiliary_wave_kernel.py rename to test/accelerate_tests/cuda_pycuda_tests/dls_tests/dls_auxiliary_wave_kernel_test.py diff --git a/test/accelerate_tests/cuda_pycuda_tests/dls_tests/dls_gradient_descent_kernel.py b/test/accelerate_tests/cuda_pycuda_tests/dls_tests/dls_gradient_descent_kernel_test.py similarity index 94% rename from test/accelerate_tests/cuda_pycuda_tests/dls_tests/dls_gradient_descent_kernel.py rename to 
test/accelerate_tests/cuda_pycuda_tests/dls_tests/dls_gradient_descent_kernel_test.py index ff60cd788..c91268ac5 100644 --- a/test/accelerate_tests/cuda_pycuda_tests/dls_tests/dls_gradient_descent_kernel.py +++ b/test/accelerate_tests/cuda_pycuda_tests/dls_tests/dls_gradient_descent_kernel_test.py @@ -17,7 +17,6 @@ FLOAT_TYPE = np.float32 INT_TYPE = np.int32 - class DlsGradientDescentKernelTest(PyCudaTest): datadir = "/dls/science/users/iat69393/gpu-hackathon/test-data/" @@ -47,7 +46,7 @@ def test_make_model_UNITY(self): GDK.make_model(aux_dev, addr_dev) ## Assert - np.testing.assert_allclose(BGDK.cpu.Imodel, GDK.gpu.Imodel.get(), atol=self.atol, rtol=self.rtol, + np.testing.assert_allclose(BGDK.npy.Imodel, GDK.gpu.Imodel.get(), atol=self.atol, rtol=self.rtol, err_msg="`Imodel` buffer has not been updated as expected") @@ -80,7 +79,7 @@ def test_floating_intensity_UNITY(self): GDK.floating_intensity(addr_dev, w_dev, I_dev, fic_dev) ## Assert - np.testing.assert_allclose(BGDK.cpu.Imodel, GDK.gpu.Imodel.get(), atol=self.atol, rtol=self.rtol, + np.testing.assert_allclose(BGDK.npy.Imodel, GDK.gpu.Imodel.get(), atol=self.atol, rtol=self.rtol, err_msg="`Imodel` buffer has not been updated as expected") np.testing.assert_allcolse(fic, fic_dev.get(), atol=self.atol, rtol=self.rtol, err_msg="floating intensity coeff (fic) has not been updated as expected") @@ -120,7 +119,7 @@ def test_main_and_error_reduce_UNITY(self): ## Assert np.testing.assert_allclose(aux, aux_dev.get(), atol=self.atol, rtol=self.rtol, err_msg="Auxiliary has not been updated as expected") - np.testing.assert_allclose(BGDK.cpu.LLerr, GDK.gpu.LLerr.get(), atol=self.atol, rtol=self.rtol, + np.testing.assert_allclose(BGDK.npy.LLerr, GDK.gpu.LLerr.get(), atol=self.atol, rtol=self.rtol, err_msg="LogLikelihood error has not been updated as expected") np.testing.assert_array_allclose(err_phot, err_phot_dev.get(), atol=self.atol, rtol=self.rtol, err_msg="`err_phot` has not been updated as expected") @@ -159,11 +158,11 @@ def test_make_a012_UNITY(self): GDK.make_a012(f_dev, a_dev, b_dev, addr_dev, I_dev, fic_dev) ## Assert - np.testing.assert_allclose(BGDK.cpu.Imodel, GDK.gpu.Imodel.get(), atol=self.atol, rtol=self.rtol, + np.testing.assert_allclose(BGDK.npy.Imodel, GDK.gpu.Imodel.get(), atol=self.atol, rtol=self.rtol, err_msg="Imodel error has not been updated as expected") - np.testing.assert_allclose(BGDK.cpu.LLerr, GDK.gpu.LLerr.get(), atol=self.atol, rtol=self.rtol, + np.testing.assert_allclose(BGDK.npy.LLerr, GDK.gpu.LLerr.get(), atol=self.atol, rtol=self.rtol, err_msg="LLerr error has not been updated as expected") - np.testing.assert_allclose(BGDK.cpu.LLden, GDK.gpu.LLden.get(), atol=self.atol, rtol=self.rtol, + np.testing.assert_allclose(BGDK.npy.LLden, GDK.gpu.LLden.get(), atol=self.atol, rtol=self.rtol, err_msg="LLden error has not been updated as expected") diff --git a/test/accelerate_tests/cuda_pycuda_tests/dls_tests/dls_po_update_kernel_test.py b/test/accelerate_tests/cuda_pycuda_tests/dls_tests/dls_po_update_kernel_test.py index 0b5194c44..ff93d73a2 100644 --- a/test/accelerate_tests/cuda_pycuda_tests/dls_tests/dls_po_update_kernel_test.py +++ b/test/accelerate_tests/cuda_pycuda_tests/dls_tests/dls_po_update_kernel_test.py @@ -44,7 +44,7 @@ def test_op_update_ml_UNITY(self): # GPU Kernel POK = PoUpdateKernel() - POK.ob_update_ML(addr_dev, obg_dev, pr_dev, aux_dev) + POK.ob_update_ML(addr_dev, obg_dev, pr_dev, aux_dev, atomics=True) ## Assert np.testing.assert_allclose(obg, obg_dev.get(), atol=self.atol, rtol=self.rtol, @@ 
-71,7 +71,7 @@ def test_pr_update_ml_UNITY(self): # GPU Kernel POK = PoUpdateKernel() - POK.ob_update_ML(addr_dev, prg_dev, ob_dev, aux_dev) + POK.pr_update_ML(addr_dev, prg_dev, ob_dev, aux_dev, atomics=True) ## Assert np.testing.assert_allclose(prg, prg_dev.get(), atol=self.atol, rtol=self.rtol, diff --git a/test/accelerate_tests/cuda_pycuda_tests/dls_tests/dls_propagation_test.py b/test/accelerate_tests/cuda_pycuda_tests/dls_tests/dls_propagation_kernel_test.py similarity index 84% rename from test/accelerate_tests/cuda_pycuda_tests/dls_tests/dls_propagation_test.py rename to test/accelerate_tests/cuda_pycuda_tests/dls_tests/dls_propagation_kernel_test.py index 2edc9276e..3b4f2c873 100644 --- a/test/accelerate_tests/cuda_pycuda_tests/dls_tests/dls_propagation_test.py +++ b/test/accelerate_tests/cuda_pycuda_tests/dls_tests/dls_propagation_kernel_test.py @@ -47,13 +47,13 @@ def test_forward_UNITY(self): # Load data with h5py.File(self.datadir + "forward_%04d.h5" %self.iter, "r") as f: - aux = f["aux"][:] + aux = f["aux"][0] # Copy data to device aux_dev = gpuarray.to_gpu(aux) # Geometry - geo = self.set_up_farfield(aux.shape[1:]) + geo = self.set_up_farfield(aux.shape) # CPU kernel aux = geo.propagator.fw(aux) @@ -65,20 +65,20 @@ def test_forward_UNITY(self): ## Assert np.testing.assert_allclose(aux, aux_dev.get(), atol=self.atol, rtol=self.rtol, - err_msg="CPU aux is \n%s, \nbut GPU aux is \n %s, \n " % (repr(aux), repr(aux_d.get()))) + err_msg="Forward propagation was not as expected") - def test_ackward_UNITY(self): + def test_backward_UNITY(self): # Load data with h5py.File(self.datadir + "backward_%04d.h5" %self.iter, "r") as f: - aux = f["aux"][:] + aux = f["aux"][0] # Copy data to device aux_dev = gpuarray.to_gpu(aux) # Geometry - geo = self.set_up_farfield(aux.shape[1:]) + geo = self.set_up_farfield(aux.shape) # CPU kernel aux = geo.propagator.bw(aux) @@ -90,4 +90,4 @@ def test_ackward_UNITY(self): ## Assert np.testing.assert_allclose(aux, aux_dev.get(), atol=self.atol, rtol=self.rtol, - err_msg="CPU aux is \n%s, \nbut GPU aux is \n %s, \n " % (repr(aux), repr(aux_d.get()))) + err_msg="Backward propagation was not as expected") \ No newline at end of file diff --git a/test/accelerate_tests/cuda_pycuda_tests/dls_tests/dls_regularizer_kernel_test.py b/test/accelerate_tests/cuda_pycuda_tests/dls_tests/dls_regularizer_kernel_test.py index d083d94ac..58c5a0bef 100644 --- a/test/accelerate_tests/cuda_pycuda_tests/dls_tests/dls_regularizer_kernel_test.py +++ b/test/accelerate_tests/cuda_pycuda_tests/dls_tests/dls_regularizer_kernel_test.py @@ -44,7 +44,7 @@ def test_regularizer_grad_UNITY(self): ## Assert np.testing.assert_allclose(obr, obr_dev.get(), atol=self.atol, rtol=self.rtol, err_msg="The object array has not been updated as expected") - np.testing.assert_allclose(regul.LL, regul_pycuda.LL.get(), atol=self.atol, rtol=self.rtol, + np.testing.assert_allclose(regul.LL, regul_pycuda.LL, atol=self.atol, rtol=self.rtol, err_msg="The LL array has not been updated as expected") @@ -65,8 +65,8 @@ def test_regularizer_poly_line_ceoffs_UNITY(self): # GPU Kernel regul_pycuda = Regul_del2_pycuda(0.1, queue=self.stream, allocator=cuda.mem_alloc) - res_dev = regul_pycuda.poly_line_coeffs(obh_dev, ob_dev) + res_pycuda = regul_pycuda.poly_line_coeffs(obh_dev, ob_dev) ## Assert - np.testing.assert_allclose(res, res_dev.get(), atol=self.atol, rtol=self.rtol, + np.testing.assert_allclose(res, res_pycuda, atol=self.atol, rtol=self.rtol, err_msg="The B array has not been updated as expected") From 
6fd3928536d24302d45471f1b968fa3d768242c7 Mon Sep 17 00:00:00 2001 From: Benedikt Daurer Date: Mon, 1 Mar 2021 13:30:13 +0000 Subject: [PATCH 12/56] improve dls_tests --- .../dls_tests/dls_auxiliary_wave_kernel_test.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/test/accelerate_tests/cuda_pycuda_tests/dls_tests/dls_auxiliary_wave_kernel_test.py b/test/accelerate_tests/cuda_pycuda_tests/dls_tests/dls_auxiliary_wave_kernel_test.py index ce52181c8..c85687cd2 100644 --- a/test/accelerate_tests/cuda_pycuda_tests/dls_tests/dls_auxiliary_wave_kernel_test.py +++ b/test/accelerate_tests/cuda_pycuda_tests/dls_tests/dls_auxiliary_wave_kernel_test.py @@ -19,8 +19,9 @@ class DlsAuxiliaryWaveKernelTest(PyCudaTest): datadir = "/dls/science/users/iat69393/gpu-hackathon/test-data/" - iter = 10 + iter = 0 rtol = 1e-6 + atol = 1e-6 def test_build_aux_no_ex_noadd_UNITY(self): @@ -48,4 +49,5 @@ def test_build_aux_no_ex_noadd_UNITY(self): AWK.build_aux_no_ex(aux_dev, addr_dev, ob_dev, pr_dev, add=False) ## Assert - np.testing.assert_allclose(aux_dev.get(), aux, rtol=self.rtol, err_msg="The auxiliary_wave does not match the base kernel output") \ No newline at end of file + np.testing.assert_allclose(aux_dev.get(), aux, rtol=self.rtol, atol=self.atol, + err_msg="The auxiliary_wave does not match the base kernel output") \ No newline at end of file From e54e15f0dc3c59619bd49a6f48c52c215d127965 Mon Sep 17 00:00:00 2001 From: Benedikt Daurer Date: Tue, 2 Mar 2021 12:16:13 +0000 Subject: [PATCH 13/56] DLS real data tests now working, not all are passing --- ptypy/accelerate/base/engines/ML_serial.py | 3 ++ .../dls_auxiliary_wave_kernel_test.py | 2 +- .../dls_gradient_descent_kernel_test.py | 49 +++++++++++++------ .../dls_tests/dls_po_update_kernel_test.py | 6 +-- .../dls_tests/dls_propagation_kernel_test.py | 2 +- .../dls_tests/dls_regularizer_kernel_test.py | 2 +- 6 files changed, 43 insertions(+), 21 deletions(-) diff --git a/ptypy/accelerate/base/engines/ML_serial.py b/ptypy/accelerate/base/engines/ML_serial.py index d81873f01..61df10dfb 100644 --- a/ptypy/accelerate/base/engines/ML_serial.py +++ b/ptypy/accelerate/base/engines/ML_serial.py @@ -548,6 +548,9 @@ def poly_line_coeffs(self, c_ob_h, c_pr_h): f["Brenorm"] = Brenorm f["w"] = w f["B"] = B + f["A0"] = GDK.npy.Imodel + f["A1"] = GDK.npy.LLerr + f["A2"] = GDK.npy.LLden GDK.fill_b(addr, Brenorm, w, B) diff --git a/test/accelerate_tests/cuda_pycuda_tests/dls_tests/dls_auxiliary_wave_kernel_test.py b/test/accelerate_tests/cuda_pycuda_tests/dls_tests/dls_auxiliary_wave_kernel_test.py index c85687cd2..82ce367c9 100644 --- a/test/accelerate_tests/cuda_pycuda_tests/dls_tests/dls_auxiliary_wave_kernel_test.py +++ b/test/accelerate_tests/cuda_pycuda_tests/dls_tests/dls_auxiliary_wave_kernel_test.py @@ -19,7 +19,7 @@ class DlsAuxiliaryWaveKernelTest(PyCudaTest): datadir = "/dls/science/users/iat69393/gpu-hackathon/test-data/" - iter = 0 + iter = 50 rtol = 1e-6 atol = 1e-6 diff --git a/test/accelerate_tests/cuda_pycuda_tests/dls_tests/dls_gradient_descent_kernel_test.py b/test/accelerate_tests/cuda_pycuda_tests/dls_tests/dls_gradient_descent_kernel_test.py index c91268ac5..ab4140d70 100644 --- a/test/accelerate_tests/cuda_pycuda_tests/dls_tests/dls_gradient_descent_kernel_test.py +++ b/test/accelerate_tests/cuda_pycuda_tests/dls_tests/dls_gradient_descent_kernel_test.py @@ -20,7 +20,7 @@ class DlsGradientDescentKernelTest(PyCudaTest): datadir = "/dls/science/users/iat69393/gpu-hackathon/test-data/" - iter = 0 + iter = 50 rtol = 1e-6 atol = 
1e-6 @@ -81,7 +81,7 @@ def test_floating_intensity_UNITY(self): ## Assert np.testing.assert_allclose(BGDK.npy.Imodel, GDK.gpu.Imodel.get(), atol=self.atol, rtol=self.rtol, err_msg="`Imodel` buffer has not been updated as expected") - np.testing.assert_allcolse(fic, fic_dev.get(), atol=self.atol, rtol=self.rtol, + np.testing.assert_allclose(fic, fic_dev.get(), atol=self.atol, rtol=self.rtol, err_msg="floating intensity coeff (fic) has not been updated as expected") @@ -121,27 +121,31 @@ def test_main_and_error_reduce_UNITY(self): err_msg="Auxiliary has not been updated as expected") np.testing.assert_allclose(BGDK.npy.LLerr, GDK.gpu.LLerr.get(), atol=self.atol, rtol=self.rtol, err_msg="LogLikelihood error has not been updated as expected") - np.testing.assert_array_allclose(err_phot, err_phot_dev.get(), atol=self.atol, rtol=self.rtol, + np.testing.assert_allclose(err_phot, err_phot_dev.get(), atol=self.atol, rtol=self.rtol, err_msg="`err_phot` has not been updated as expected") def test_make_a012_UNITY(self): + Nmax = 10 + Ymax = 128 + Xmax = 128 + # Load data with h5py.File(self.datadir + "make_a012_%04d.h5" %self.iter, "r") as g: addr = g["addr"][:] - I = g["I"][:] - f = g["f"][:] - a = g["a"][:] - b = g["b"][:] - fic = g["fic"][:] + I = g["I"][:Nmax,:Ymax,:Xmax] + f = g["f"][:Nmax,:Ymax,:Xmax] + a = g["a"][:Nmax,:Ymax,:Xmax] + b = g["b"][:Nmax,:Ymax,:Xmax] + fic = g["fic"][:Nmax] with h5py.File(self.datadir + "make_model_%04d.h5" %self.iter, "r") as h: - aux = h["aux"][:] + aux = h["aux"][:Nmax,:Ymax,:Xmax] # Copy data to device aux_dev = gpuarray.to_gpu(aux) addr_dev = gpuarray.to_gpu(addr) - I_dev = gpuarray.to_gpu(I) + I_dev = gpuarray.to_gpu(addr) f_dev = gpuarray.to_gpu(f) a_dev = gpuarray.to_gpu(a) b_dev = gpuarray.to_gpu(b) @@ -152,8 +156,8 @@ def test_make_a012_UNITY(self): BGDK.allocate() BGDK.make_a012(f, a, b, addr, I, fic) - # GPU kernel - GDK = GradientDescentKernel(aux_dev, addr.shape[1]) + # GPU kernel + GDK = GradientDescentKernel(aux_dev, addr.shape[1], queue=self.stream) GDK.allocate() GDK.make_a012(f_dev, a_dev, b_dev, addr_dev, I_dev, fic_dev) @@ -168,30 +172,45 @@ def test_make_a012_UNITY(self): def test_fill_b_UNITY(self): + Nmax = 10 + Ymax = 128 + Xmax = 128 + # Load data with h5py.File(self.datadir + "fill_b_%04d.h5" %self.iter, "r") as f: - w = f["w"][:] + w = f["w"][:Nmax, :Ymax, :Xmax] addr = f["addr"][:] B = f["B"][:] Brenorm = f["Brenorm"][...] 
+ A0 = f["A0"][:Nmax, :Ymax, :Xmax] + A1 = f["A1"][:Nmax, :Ymax, :Xmax] + A2 = f["A2"][:Nmax, :Ymax, :Xmax] with h5py.File(self.datadir + "make_model_%04d.h5" %self.iter, "r") as f: - aux = f["aux"][:] - print(B) + aux = f["aux"][:Nmax, :Ymax, :Xmax] # Copy data to device aux_dev = gpuarray.to_gpu(aux) w_dev = gpuarray.to_gpu(w) addr_dev = gpuarray.to_gpu(addr) B_dev = gpuarray.to_gpu(B.astype(np.float32)) + A0_dev = gpuarray.to_gpu(A0) + A1_dev = gpuarray.to_gpu(A1) + A2_dev = gpuarray.to_gpu(A2) # CPU Kernel BGDK = BaseGradientDescentKernel(aux, addr.shape[1]) BGDK.allocate() + BGDK.npy.Imodel = A0 + BGDK.npy.LLerr = A1 + BGDK.npy.LLden = A2 BGDK.fill_b(addr, Brenorm, w, B) # GPU kernel GDK = GradientDescentKernel(aux_dev, addr.shape[1]) GDK.allocate() + GDK.gpu.Imodel = A0_dev + GDK.gpu.LLerr = A1_dev + GDK.gpu.LLden = A2_dev GDK.fill_b(addr_dev, Brenorm, w_dev, B_dev) ## Assert diff --git a/test/accelerate_tests/cuda_pycuda_tests/dls_tests/dls_po_update_kernel_test.py b/test/accelerate_tests/cuda_pycuda_tests/dls_tests/dls_po_update_kernel_test.py index ff93d73a2..3d5c63b26 100644 --- a/test/accelerate_tests/cuda_pycuda_tests/dls_tests/dls_po_update_kernel_test.py +++ b/test/accelerate_tests/cuda_pycuda_tests/dls_tests/dls_po_update_kernel_test.py @@ -19,7 +19,7 @@ class DlsPoUpdateKernelTest(PyCudaTest): datadir = "/dls/science/users/iat69393/gpu-hackathon/test-data/" - iter = 0 + iter = 50 rtol = 1e-6 atol = 1e-6 @@ -44,7 +44,7 @@ def test_op_update_ml_UNITY(self): # GPU Kernel POK = PoUpdateKernel() - POK.ob_update_ML(addr_dev, obg_dev, pr_dev, aux_dev, atomics=True) + POK.ob_update_ML(addr_dev, obg_dev, pr_dev, aux_dev, atomics=False) ## Assert np.testing.assert_allclose(obg, obg_dev.get(), atol=self.atol, rtol=self.rtol, @@ -71,7 +71,7 @@ def test_pr_update_ml_UNITY(self): # GPU Kernel POK = PoUpdateKernel() - POK.pr_update_ML(addr_dev, prg_dev, ob_dev, aux_dev, atomics=True) + POK.pr_update_ML(addr_dev, prg_dev, ob_dev, aux_dev, atomics=False) ## Assert np.testing.assert_allclose(prg, prg_dev.get(), atol=self.atol, rtol=self.rtol, diff --git a/test/accelerate_tests/cuda_pycuda_tests/dls_tests/dls_propagation_kernel_test.py b/test/accelerate_tests/cuda_pycuda_tests/dls_tests/dls_propagation_kernel_test.py index 3b4f2c873..6e658b970 100644 --- a/test/accelerate_tests/cuda_pycuda_tests/dls_tests/dls_propagation_kernel_test.py +++ b/test/accelerate_tests/cuda_pycuda_tests/dls_tests/dls_propagation_kernel_test.py @@ -25,7 +25,7 @@ class DLsPropagationKernelTest(PyCudaTest): datadir = "/dls/science/users/iat69393/gpu-hackathon/test-data/" - iter = 0 + iter = 50 rtol = 1e-6 atol = 1e-6 diff --git a/test/accelerate_tests/cuda_pycuda_tests/dls_tests/dls_regularizer_kernel_test.py b/test/accelerate_tests/cuda_pycuda_tests/dls_tests/dls_regularizer_kernel_test.py index 58c5a0bef..cf7ac5b9d 100644 --- a/test/accelerate_tests/cuda_pycuda_tests/dls_tests/dls_regularizer_kernel_test.py +++ b/test/accelerate_tests/cuda_pycuda_tests/dls_tests/dls_regularizer_kernel_test.py @@ -20,7 +20,7 @@ class DlsRegularizerTest(PyCudaTest): datadir = "/dls/science/users/iat69393/gpu-hackathon/test-data/" - iter = 0 + iter = 50 rtol = 1e-6 atol = 1e-6 From 988031b38d0fb70256079cdf7c0fb53761ff09a4 Mon Sep 17 00:00:00 2001 From: Benedikt Daurer Date: Tue, 2 Mar 2021 12:23:01 +0000 Subject: [PATCH 14/56] Test with atomics for now --- .../cuda_pycuda_tests/dls_tests/dls_po_update_kernel_test.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git 
a/test/accelerate_tests/cuda_pycuda_tests/dls_tests/dls_po_update_kernel_test.py b/test/accelerate_tests/cuda_pycuda_tests/dls_tests/dls_po_update_kernel_test.py index 3d5c63b26..20d4ad68f 100644 --- a/test/accelerate_tests/cuda_pycuda_tests/dls_tests/dls_po_update_kernel_test.py +++ b/test/accelerate_tests/cuda_pycuda_tests/dls_tests/dls_po_update_kernel_test.py @@ -44,7 +44,7 @@ def test_op_update_ml_UNITY(self): # GPU Kernel POK = PoUpdateKernel() - POK.ob_update_ML(addr_dev, obg_dev, pr_dev, aux_dev, atomics=False) + POK.ob_update_ML(addr_dev, obg_dev, pr_dev, aux_dev, atomics=True) ## Assert np.testing.assert_allclose(obg, obg_dev.get(), atol=self.atol, rtol=self.rtol, @@ -71,7 +71,7 @@ def test_pr_update_ml_UNITY(self): # GPU Kernel POK = PoUpdateKernel() - POK.pr_update_ML(addr_dev, prg_dev, ob_dev, aux_dev, atomics=False) + POK.pr_update_ML(addr_dev, prg_dev, ob_dev, aux_dev, atomics=True) ## Assert np.testing.assert_allclose(prg, prg_dev.get(), atol=self.atol, rtol=self.rtol, From 993572d7473daa935e1f9871cb68a0305f9bec5a Mon Sep 17 00:00:00 2001 From: Benedikt Daurer Date: Wed, 3 Mar 2021 10:34:04 +0000 Subject: [PATCH 15/56] improved dls_tests --- ptypy/accelerate/base/engines/ML_serial.py | 42 ++++++------- .../cuda_pycuda/engines/DM_pycuda.py | 5 +- ptypy/engines/ML.py | 23 ++++++- .../dls_auxiliary_wave_kernel_test.py | 14 +++-- .../dls_gradient_descent_kernel_test.py | 62 ++++++++++++------- .../dls_tests/dls_po_update_kernel_test.py | 22 +++++-- .../dls_tests/dls_propagation_kernel_test.py | 25 +++++--- .../dls_tests/dls_regularizer_kernel_test.py | 23 ++++--- 8 files changed, 141 insertions(+), 75 deletions(-) diff --git a/ptypy/accelerate/base/engines/ML_serial.py b/ptypy/accelerate/base/engines/ML_serial.py index 61df10dfb..8a2097952 100644 --- a/ptypy/accelerate/base/engines/ML_serial.py +++ b/ptypy/accelerate/base/engines/ML_serial.py @@ -33,16 +33,6 @@ @register() class ML_serial(ML): - """ - Defaults: - - [debug] - default = None - type = str - help = For debugging purposes, dump arrays into given directory - - """ - def __init__(self, ptycho_parent, pars=None): """ Maximum likelihood reconstruction engine. 
@@ -206,7 +196,11 @@ def engine_iterate(self, num=1): # probe/object rescaling if self.p.scale_precond: - cn2_new_pr_grad = cn2_new_pr_grad + if self.p.debug and parallel.master and (self.curiter == self.p.debug_iter): + with h5py.File(self.p.debug + "/ml_serial_o_p_norm_%04d.h5" %self.curiter, "w") as f: + f["cn2_new_pr_grad"] = cn2_new_pr_grad + f["cn2_new_ob_grad"] = cn2_new_ob_grad + if cn2_new_pr_grad > 1e-5: scale_p_o = (self.p.scale_probe_object * cn2_new_ob_grad / cn2_new_pr_grad) @@ -368,7 +362,7 @@ def new_grad(self): I = self.engine.di.S[dID].data # debugging - if self.p.debug and parallel.master and (self.engine.curiter % 10 == 0): + if self.p.debug and parallel.master and (self.engine.curiter == self.p.debug_iter): with h5py.File(self.p.debug + "/build_aux_no_ex_%04d.h5" %self.engine.curiter, "w") as f: f["aux"] = aux f["addr"] = addr @@ -379,7 +373,7 @@ def new_grad(self): AWK.build_aux_no_ex(aux, addr, ob, pr, add=False) # debugging - if self.p.debug and parallel.master and (self.engine.curiter % 10 == 0): + if self.p.debug and parallel.master and (self.engine.curiter == self.p.debug_iter): with h5py.File(self.p.debug + "/forward_%04d.h5" %self.engine.curiter, "w") as f: f["aux"] = aux @@ -387,7 +381,7 @@ def new_grad(self): aux[:] = FW(aux) # debugging - if self.p.debug and parallel.master and (self.engine.curiter % 10 == 0): + if self.p.debug and parallel.master and (self.engine.curiter == self.p.debug_iter): with h5py.File(self.p.debug + "/make_model_%04d.h5" %self.engine.curiter, "w") as f: f["aux"] = aux f["addr"] = addr @@ -395,7 +389,7 @@ def new_grad(self): GDK.make_model(aux, addr) # debugging - if self.p.debug and parallel.master and (self.engine.curiter % 10 == 0): + if self.p.debug and parallel.master and (self.engine.curiter == self.p.debug_iter): with h5py.File(self.p.debug + "/floating_intensities_%04d.h5" %self.engine.curiter, "w") as f: f["w"] = w f["addr"] = addr @@ -406,7 +400,7 @@ def new_grad(self): GDK.floating_intensity(addr, w, I, fic) # debugging - if self.p.debug and parallel.master and (self.engine.curiter % 10 == 0): + if self.p.debug and parallel.master and (self.engine.curiter == self.p.debug_iter): with h5py.File(self.p.debug + "/main_%04d.h5" %self.engine.curiter, "w") as f: f["aux"] = aux f["addr"] = addr @@ -416,7 +410,7 @@ def new_grad(self): GDK.main(aux, addr, w, I) # debugging - if self.p.debug and parallel.master and (self.engine.curiter % 10 == 0): + if self.p.debug and parallel.master and (self.engine.curiter == self.p.debug_iter): with h5py.File(self.p.debug + "/error_reduce_%04d.h5" %self.engine.curiter, "w") as f: f["addr"] = addr f["err_phot"] = err_phot @@ -424,14 +418,14 @@ def new_grad(self): GDK.error_reduce(addr, err_phot) # debugging - if self.p.debug and parallel.master and (self.engine.curiter % 10 == 0): + if self.p.debug and parallel.master and (self.engine.curiter == self.p.debug_iter): with h5py.File(self.p.debug + "/backward_%04d.h5" %self.engine.curiter, "w") as f: f["aux"] = aux aux[:] = BW(aux) # debugging - if self.p.debug and parallel.master and (self.engine.curiter % 10 == 0): + if self.p.debug and parallel.master and (self.engine.curiter == self.p.debug_iter): with h5py.File(self.p.debug + "/op_update_ml_%04d.h5" %self.engine.curiter, "w") as f: f["aux"] = aux f["addr"] = addr @@ -441,7 +435,7 @@ def new_grad(self): POK.ob_update_ML(addr, obg, pr, aux) # debugging - if self.p.debug and parallel.master and (self.engine.curiter % 10 == 0): + if self.p.debug and parallel.master and (self.engine.curiter == 
self.p.debug_iter): with h5py.File(self.p.debug + "/pr_update_ml_%04d.h5" %self.engine.curiter, "w") as f: f["aux"] = aux f["addr"] = addr @@ -469,7 +463,7 @@ def new_grad(self): for name, s in self.engine.ob.storages.items(): # debugging - if self.p.debug and parallel.master and (self.engine.curiter % 10 == 0): + if self.p.debug and parallel.master and (self.engine.curiter == self.p.debug_iter): with h5py.File(self.p.debug + "/regul_grad_%04d.h5" %self.engine.curiter, "w") as f: f["ob"] = s.data @@ -530,7 +524,7 @@ def poly_line_coeffs(self, c_ob_h, c_pr_h): b[:] = FW(b) # debugging - if self.p.debug and parallel.master and (self.engine.curiter % 10 == 0): + if self.p.debug and parallel.master and (self.engine.curiter == self.p.debug_iter): with h5py.File(self.p.debug + "/make_a012_%04d.h5" %self.engine.curiter, "w") as g: g["addr"] = addr g["a"] = a @@ -542,7 +536,7 @@ def poly_line_coeffs(self, c_ob_h, c_pr_h): GDK.make_a012(f, a, b, addr, I, fic) # debugging - if self.p.debug and parallel.master and (self.engine.curiter % 10 == 0): + if self.p.debug and parallel.master and (self.engine.curiter == self.p.debug_iter): with h5py.File(self.p.debug + "/fill_b_%04d.h5" %self.engine.curiter, "w") as f: f["addr"] = addr f["Brenorm"] = Brenorm @@ -561,7 +555,7 @@ def poly_line_coeffs(self, c_ob_h, c_pr_h): for name, s in self.ob.storages.items(): # debugging - if self.p.debug and parallel.master and (self.engine.curiter % 10 == 0): + if self.p.debug and parallel.master and (self.engine.curiter == self.p.debug_iter): with h5py.File(self.p.debug + "/regul_poly_line_coeffs_%04d.h5" %self.engine.curiter, "w") as f: f["ob"] = s.data f["obh"] = c_ob_h.storages[name].data diff --git a/ptypy/accelerate/cuda_pycuda/engines/DM_pycuda.py b/ptypy/accelerate/cuda_pycuda/engines/DM_pycuda.py index 154f073ee..8b7741e38 100644 --- a/ptypy/accelerate/cuda_pycuda/engines/DM_pycuda.py +++ b/ptypy/accelerate/cuda_pycuda/engines/DM_pycuda.py @@ -495,9 +495,10 @@ def engine_finalize(self): for dID, prep in self.diff_info.items(): prep.addr = prep.addr_gpu.get() - # copy data to cpu + # copy data to cpu + # this kills the pagelock memory (otherwise we get segfaults in h5py) for name, s in self.pr.S.items(): - s.data = np.copy(s.data) # is this the same as s.data.get()? 
+ s.data = np.copy(s.data) self.context.detach() super(DM_pycuda, self).engine_finalize() \ No newline at end of file diff --git a/ptypy/engines/ML.py b/ptypy/engines/ML.py index b0bbaf678..b66ac639c 100644 --- a/ptypy/engines/ML.py +++ b/ptypy/engines/ML.py @@ -22,6 +22,9 @@ from .base import PositionCorrectionEngine from ..core.manager import Full, Vanilla, Bragg3dModel, BlockVanilla, BlockFull +# for debugging +import h5py + __all__ = ['ML'] @@ -99,6 +102,16 @@ class ML(PositionCorrectionEngine): lowlim = 0 help = Number of iterations before probe update starts + [debug] + default = None + type = str + help = For debugging purposes, dump arrays into given directory + + [debug_iter] + default = 0 + type = int + help = For debugging purposes, dump arrays at this iteration + """ SUPPORTED_MODELS = [Full, Vanilla, Bragg3dModel, BlockVanilla, BlockFull] @@ -232,9 +245,15 @@ def engine_iterate(self, num=1): # probe/object rescaling if self.p.scale_precond: cn2_new_pr_grad = Cnorm2(new_pr_grad) + cn2_new_ob_grad = Cnorm2(new_ob_grad) + if self.p.debug and parallel.master and (self.curiter == self.p.debug_iter): + with h5py.File(self.p.debug + "/ml_o_p_norm_%04d.h5" %self.curiter, "w") as f: + f["cn2_new_pr_grad"] = cn2_new_pr_grad + f["cn2_new_ob_grad"] = cn2_new_ob_grad + if cn2_new_pr_grad > 1e-5: - scale_p_o = (self.p.scale_probe_object * Cnorm2(new_ob_grad) - / Cnorm2(new_pr_grad)) + scale_p_o = (self.p.scale_probe_object * cn2_new_ob_grad + / cn2_new_pr_grad) else: scale_p_o = self.p.scale_probe_object if self.scale_p_o is None: diff --git a/test/accelerate_tests/cuda_pycuda_tests/dls_tests/dls_auxiliary_wave_kernel_test.py b/test/accelerate_tests/cuda_pycuda_tests/dls_tests/dls_auxiliary_wave_kernel_test.py index 82ce367c9..38e52ace0 100644 --- a/test/accelerate_tests/cuda_pycuda_tests/dls_tests/dls_auxiliary_wave_kernel_test.py +++ b/test/accelerate_tests/cuda_pycuda_tests/dls_tests/dls_auxiliary_wave_kernel_test.py @@ -4,6 +4,7 @@ import h5py import unittest import numpy as np +from parameterized import parameterized from .. import perfrun, PyCudaTest, have_pycuda if have_pycuda(): @@ -11,22 +12,25 @@ from ptypy.accelerate.cuda_pycuda.kernels import AuxiliaryWaveKernel from ptypy.accelerate.base.kernels import AuxiliaryWaveKernel as BaseAuxiliaryWaveKernel - COMPLEX_TYPE = np.complex64 FLOAT_TYPE = np.float32 INT_TYPE = np.int32 class DlsAuxiliaryWaveKernelTest(PyCudaTest): - datadir = "/dls/science/users/iat69393/gpu-hackathon/test-data/" - iter = 50 + datadir = "/dls/science/users/iat69393/gpu-hackathon/test-data-%s/" rtol = 1e-6 atol = 1e-6 - def test_build_aux_no_ex_noadd_UNITY(self): + @parameterized.expand([ + ["base", 10], + #["regul", 50], + ["floating", 0], + ]) + def test_build_aux_no_ex_noadd_UNITY(self, name, iter): # Load data - with h5py.File(self.datadir + "build_aux_no_ex_%04d.h5" %self.iter, "r") as f: + with h5py.File(self.datadir % name + "build_aux_no_ex_%04d.h5" %iter, "r") as f: aux = f["aux"][:] addr = f["addr"][:] ob = f["ob"][:] diff --git a/test/accelerate_tests/cuda_pycuda_tests/dls_tests/dls_gradient_descent_kernel_test.py b/test/accelerate_tests/cuda_pycuda_tests/dls_tests/dls_gradient_descent_kernel_test.py index ab4140d70..ee4055b7d 100644 --- a/test/accelerate_tests/cuda_pycuda_tests/dls_tests/dls_gradient_descent_kernel_test.py +++ b/test/accelerate_tests/cuda_pycuda_tests/dls_tests/dls_gradient_descent_kernel_test.py @@ -5,9 +5,9 @@ import h5py import unittest import numpy as np +from parameterized import parameterized from .. 
import perfrun, PyCudaTest, have_pycuda - if have_pycuda(): from pycuda import gpuarray from ptypy.accelerate.cuda_pycuda.kernels import GradientDescentKernel @@ -19,15 +19,19 @@ class DlsGradientDescentKernelTest(PyCudaTest): - datadir = "/dls/science/users/iat69393/gpu-hackathon/test-data/" - iter = 50 + datadir = "/dls/science/users/iat69393/gpu-hackathon/test-data-%s/" rtol = 1e-6 atol = 1e-6 - def test_make_model_UNITY(self): + @parameterized.expand([ + ["base", 10], + ["regul", 50], + ["floating", 0], + ]) + def test_make_model_UNITY(self, name, iter): # Load data - with h5py.File(self.datadir + "make_model_%04d.h5" %self.iter, "r") as f: + with h5py.File(self.datadir %name + "make_model_%04d.h5" %iter, "r") as f: aux = f["aux"][:] addr = f["addr"][:] @@ -49,16 +53,20 @@ def test_make_model_UNITY(self): np.testing.assert_allclose(BGDK.npy.Imodel, GDK.gpu.Imodel.get(), atol=self.atol, rtol=self.rtol, err_msg="`Imodel` buffer has not been updated as expected") - - def test_floating_intensity_UNITY(self): + @parameterized.expand([ + ["base", 10], + ["regul", 50], + ["floating", 0], + ]) + def test_floating_intensity_UNITY(self, name, iter): # Load data - with h5py.File(self.datadir + "floating_intensities_%04d.h5" %self.iter, "r") as f: + with h5py.File(self.datadir %name + "floating_intensities_%04d.h5" %iter, "r") as f: w = f["w"][:] addr = f["addr"][:] I = f["I"][:] fic = f["fic"][:] - with h5py.File(self.datadir + "make_model_%04d.h5" %self.iter, "r") as f: + with h5py.File(self.datadir %name + "make_model_%04d.h5" %iter, "r") as f: aux = f["aux"][:] # Copy data to device @@ -84,17 +92,21 @@ def test_floating_intensity_UNITY(self): np.testing.assert_allclose(fic, fic_dev.get(), atol=self.atol, rtol=self.rtol, err_msg="floating intensity coeff (fic) has not been updated as expected") - - def test_main_and_error_reduce_UNITY(self): + @parameterized.expand([ + ["base", 10], + ["regul", 50], + ["floating", 0], + ]) + def test_main_and_error_reduce_UNITY(self, name, iter): # Load data - with h5py.File(self.datadir + "main_%04d.h5" %self.iter, "r") as f: + with h5py.File(self.datadir %name + "main_%04d.h5" %iter, "r") as f: aux = f["aux"][:] addr = f["addr"][:] w = f["w"][:] I = f["I"][:] # Load data - with h5py.File(self.datadir + "error_reduce_%04d.h5" %self.iter, "r") as f: + with h5py.File(self.datadir %name + "error_reduce_%04d.h5" %iter, "r") as f: err_phot = f["err_phot"][:] # Copy data to device @@ -124,22 +136,26 @@ def test_main_and_error_reduce_UNITY(self): np.testing.assert_allclose(err_phot, err_phot_dev.get(), atol=self.atol, rtol=self.rtol, err_msg="`err_phot` has not been updated as expected") - - def test_make_a012_UNITY(self): + @parameterized.expand([ + ["base", 10], + ["regul", 50], + ["floating", 0], + ]) + def test_make_a012_UNITY(self, name, iter): Nmax = 10 Ymax = 128 Xmax = 128 # Load data - with h5py.File(self.datadir + "make_a012_%04d.h5" %self.iter, "r") as g: + with h5py.File(self.datadir %name + "make_a012_%04d.h5" %iter, "r") as g: addr = g["addr"][:] I = g["I"][:Nmax,:Ymax,:Xmax] f = g["f"][:Nmax,:Ymax,:Xmax] a = g["a"][:Nmax,:Ymax,:Xmax] b = g["b"][:Nmax,:Ymax,:Xmax] fic = g["fic"][:Nmax] - with h5py.File(self.datadir + "make_model_%04d.h5" %self.iter, "r") as h: + with h5py.File(self.datadir %name + "make_model_%04d.h5" %iter, "r") as h: aux = h["aux"][:Nmax,:Ymax,:Xmax] # Copy data to device @@ -169,15 +185,19 @@ def test_make_a012_UNITY(self): np.testing.assert_allclose(BGDK.npy.LLden, GDK.gpu.LLden.get(), atol=self.atol, rtol=self.rtol, err_msg="LLden 
error has not been updated as expected") - - def test_fill_b_UNITY(self): + @parameterized.expand([ + ["base", 10], + ["regul", 50], + ["floating", 0], + ]) + def test_fill_b_UNITY(self, name, iter): Nmax = 10 Ymax = 128 Xmax = 128 # Load data - with h5py.File(self.datadir + "fill_b_%04d.h5" %self.iter, "r") as f: + with h5py.File(self.datadir %name + "fill_b_%04d.h5" %iter, "r") as f: w = f["w"][:Nmax, :Ymax, :Xmax] addr = f["addr"][:] B = f["B"][:] @@ -185,7 +205,7 @@ def test_fill_b_UNITY(self): A0 = f["A0"][:Nmax, :Ymax, :Xmax] A1 = f["A1"][:Nmax, :Ymax, :Xmax] A2 = f["A2"][:Nmax, :Ymax, :Xmax] - with h5py.File(self.datadir + "make_model_%04d.h5" %self.iter, "r") as f: + with h5py.File(self.datadir %name + "make_model_%04d.h5" %iter, "r") as f: aux = f["aux"][:Nmax, :Ymax, :Xmax] # Copy data to device diff --git a/test/accelerate_tests/cuda_pycuda_tests/dls_tests/dls_po_update_kernel_test.py b/test/accelerate_tests/cuda_pycuda_tests/dls_tests/dls_po_update_kernel_test.py index 20d4ad68f..da6bd2661 100644 --- a/test/accelerate_tests/cuda_pycuda_tests/dls_tests/dls_po_update_kernel_test.py +++ b/test/accelerate_tests/cuda_pycuda_tests/dls_tests/dls_po_update_kernel_test.py @@ -5,6 +5,7 @@ import h5py import unittest import numpy as np +from parameterized import parameterized from .. import PyCudaTest, have_pycuda if have_pycuda(): @@ -18,15 +19,19 @@ class DlsPoUpdateKernelTest(PyCudaTest): - datadir = "/dls/science/users/iat69393/gpu-hackathon/test-data/" - iter = 50 + datadir = "/dls/science/users/iat69393/gpu-hackathon/test-data-%s/" rtol = 1e-6 atol = 1e-6 - def test_op_update_ml_UNITY(self): + @parameterized.expand([ + ["base", 10], + ["regul", 50], + ["floating", 0], + ]) + def test_op_update_ml_UNITY(self, name, iter): # Load data - with h5py.File(self.datadir + "op_update_ml_%04d.h5" %self.iter, "r") as f: + with h5py.File(self.datadir %name + "op_update_ml_%04d.h5" %iter, "r") as f: aux = f["aux"][:] addr = f["addr"][:] obg = f["obg"][:] @@ -50,10 +55,15 @@ def test_op_update_ml_UNITY(self): np.testing.assert_allclose(obg, obg_dev.get(), atol=self.atol, rtol=self.rtol, err_msg="The object array has not been updated as expected") - def test_pr_update_ml_UNITY(self): + @parameterized.expand([ + ["base", 10], + ["regul", 50], + ["floating", 0], + ]) + def test_pr_update_ml_UNITY(self, name, iter): # Load data - with h5py.File(self.datadir + "pr_update_ml_%04d.h5" %self.iter, "r") as f: + with h5py.File(self.datadir %name + "pr_update_ml_%04d.h5" %iter, "r") as f: aux = f["aux"][:] addr = f["addr"][:] ob = f["ob"][:] diff --git a/test/accelerate_tests/cuda_pycuda_tests/dls_tests/dls_propagation_kernel_test.py b/test/accelerate_tests/cuda_pycuda_tests/dls_tests/dls_propagation_kernel_test.py index 6e658b970..ac9fa0402 100644 --- a/test/accelerate_tests/cuda_pycuda_tests/dls_tests/dls_propagation_kernel_test.py +++ b/test/accelerate_tests/cuda_pycuda_tests/dls_tests/dls_propagation_kernel_test.py @@ -5,13 +5,14 @@ import h5py import unittest import numpy as np -import ptypy.utils as u +from parameterized import parameterized from .. 
import PyCudaTest, have_pycuda if have_pycuda(): from pycuda import gpuarray from ptypy.accelerate.cuda_pycuda.kernels import PropagationKernel +import ptypy.utils as u from ptypy.core import geometry from ptypy.core import Base as theBase @@ -24,8 +25,7 @@ class DLsPropagationKernelTest(PyCudaTest): - datadir = "/dls/science/users/iat69393/gpu-hackathon/test-data/" - iter = 50 + datadir = "/dls/science/users/iat69393/gpu-hackathon/test-data-%s/" rtol = 1e-6 atol = 1e-6 @@ -43,10 +43,15 @@ def set_up_farfield(self,shape): G = geometry.Geo(owner=P, pars=g) return G - def test_forward_UNITY(self): + @parameterized.expand([ + ["base", 10], + ["regul", 50], + ["floating", 0], + ]) + def test_forward_UNITY(self, name, iter): # Load data - with h5py.File(self.datadir + "forward_%04d.h5" %self.iter, "r") as f: + with h5py.File(self.datadir % name + "forward_%04d.h5" %iter, "r") as f: aux = f["aux"][0] # Copy data to device @@ -67,11 +72,15 @@ def test_forward_UNITY(self): np.testing.assert_allclose(aux, aux_dev.get(), atol=self.atol, rtol=self.rtol, err_msg="Forward propagation was not as expected") - - def test_backward_UNITY(self): + @parameterized.expand([ + ["base", 10], + ["regul", 50], + ["floating", 0], + ]) + def test_backward_UNITY(self, name, iter): # Load data - with h5py.File(self.datadir + "backward_%04d.h5" %self.iter, "r") as f: + with h5py.File(self.datadir % name + "backward_%04d.h5" %iter, "r") as f: aux = f["aux"][0] # Copy data to device diff --git a/test/accelerate_tests/cuda_pycuda_tests/dls_tests/dls_regularizer_kernel_test.py b/test/accelerate_tests/cuda_pycuda_tests/dls_tests/dls_regularizer_kernel_test.py index cf7ac5b9d..64fa892f8 100644 --- a/test/accelerate_tests/cuda_pycuda_tests/dls_tests/dls_regularizer_kernel_test.py +++ b/test/accelerate_tests/cuda_pycuda_tests/dls_tests/dls_regularizer_kernel_test.py @@ -5,6 +5,7 @@ import h5py import unittest import numpy as np +from parameterized import parameterized from .. 
import PyCudaTest, have_pycuda if have_pycuda(): @@ -19,15 +20,19 @@ class DlsRegularizerTest(PyCudaTest): - datadir = "/dls/science/users/iat69393/gpu-hackathon/test-data/" - iter = 50 + datadir = "/dls/science/users/iat69393/gpu-hackathon/test-data-%s/" rtol = 1e-6 atol = 1e-6 - def test_regularizer_grad_UNITY(self): + @parameterized.expand([ + ["base", 10], + ["regul", 50], + ["floating", 0], + ]) + def test_regularizer_grad_UNITY(self, name, iter): # Load data - with h5py.File(self.datadir + "regul_grad_%04d.h5" %self.iter, "r") as f: + with h5py.File(self.datadir %name + "regul_grad_%04d.h5" %iter, "r") as f: ob = f["ob"][:] # Copy data to device @@ -47,11 +52,15 @@ def test_regularizer_grad_UNITY(self): np.testing.assert_allclose(regul.LL, regul_pycuda.LL, atol=self.atol, rtol=self.rtol, err_msg="The LL array has not been updated as expected") - - def test_regularizer_poly_line_ceoffs_UNITY(self): + @parameterized.expand([ + ["base", 10], + ["regul", 50], + ["floating", 0], + ]) + def test_regularizer_poly_line_ceoffs_UNITY(self, name, iter): # Load data - with h5py.File(self.datadir + "regul_poly_line_coeffs_%04d.h5" %self.iter, "r") as f: + with h5py.File(self.datadir % name + "regul_poly_line_coeffs_%04d.h5" %iter, "r") as f: ob = f["ob"][:] obh = f["obh"][:] From 51c2e81343c33cb0f171bead53bd9d9ba8377c7c Mon Sep 17 00:00:00 2001 From: Benedikt Daurer Date: Wed, 3 Mar 2021 10:38:30 +0000 Subject: [PATCH 16/56] small change to dls_tests --- .../dls_tests/dls_auxiliary_wave_kernel_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/accelerate_tests/cuda_pycuda_tests/dls_tests/dls_auxiliary_wave_kernel_test.py b/test/accelerate_tests/cuda_pycuda_tests/dls_tests/dls_auxiliary_wave_kernel_test.py index 38e52ace0..0d943c28e 100644 --- a/test/accelerate_tests/cuda_pycuda_tests/dls_tests/dls_auxiliary_wave_kernel_test.py +++ b/test/accelerate_tests/cuda_pycuda_tests/dls_tests/dls_auxiliary_wave_kernel_test.py @@ -24,7 +24,7 @@ class DlsAuxiliaryWaveKernelTest(PyCudaTest): @parameterized.expand([ ["base", 10], - #["regul", 50], + ["regul", 50], ["floating", 0], ]) def test_build_aux_no_ex_noadd_UNITY(self, name, iter): From 0f3c520184ad779bf640d90198af76c06bb03d2d Mon Sep 17 00:00:00 2001 From: Benedikt Daurer Date: Wed, 3 Mar 2021 10:40:52 +0000 Subject: [PATCH 17/56] only read regul data for regularization tests --- .../dls_tests/dls_regularizer_kernel_test.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/test/accelerate_tests/cuda_pycuda_tests/dls_tests/dls_regularizer_kernel_test.py b/test/accelerate_tests/cuda_pycuda_tests/dls_tests/dls_regularizer_kernel_test.py index 64fa892f8..972648552 100644 --- a/test/accelerate_tests/cuda_pycuda_tests/dls_tests/dls_regularizer_kernel_test.py +++ b/test/accelerate_tests/cuda_pycuda_tests/dls_tests/dls_regularizer_kernel_test.py @@ -25,9 +25,7 @@ class DlsRegularizerTest(PyCudaTest): atol = 1e-6 @parameterized.expand([ - ["base", 10], - ["regul", 50], - ["floating", 0], + ["regul", 50] ]) def test_regularizer_grad_UNITY(self, name, iter): @@ -53,9 +51,7 @@ def test_regularizer_grad_UNITY(self, name, iter): err_msg="The LL array has not been updated as expected") @parameterized.expand([ - ["base", 10], ["regul", 50], - ["floating", 0], ]) def test_regularizer_poly_line_ceoffs_UNITY(self, name, iter): From ddb03c90c12b89e38a2a7decf11966ab44b30655 Mon Sep 17 00:00:00 2001 From: Benedikt Daurer Date: Wed, 3 Mar 2021 15:09:41 +0000 Subject: [PATCH 18/56] Testing make_a012: still failing --- 
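Note on the change below: the GPU result buffers (Imodel, LLerr, LLden) are pre-filled with NaN before calling make_a012, so any element the kernel fails to write stays NaN and trips the tolerance check instead of silently comparing against stale device memory. A minimal CPU-only sketch of that pattern, with compute_squares as a hypothetical stand-in for the kernel under test (not a ptypy or pycuda API):

    import numpy as np

    def compute_squares(x, out):
        # hypothetical kernel stand-in: writes x**2 into a preallocated buffer
        out[:] = x * x

    x = np.arange(8, dtype=np.float32)
    out = np.full_like(x, np.nan)  # seed with NaN so unwritten elements are obvious
    compute_squares(x, out)
    # any element left untouched would still be NaN and fail this comparison
    np.testing.assert_allclose(out, x * x, rtol=1e-6, atol=1e-6)

Seeding with NaN rather than zeros makes partial or missing writes fail loudly, since a NaN never satisfies the allclose tolerance against a finite reference value.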
.../dls_tests/dls_gradient_descent_kernel_test.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/test/accelerate_tests/cuda_pycuda_tests/dls_tests/dls_gradient_descent_kernel_test.py b/test/accelerate_tests/cuda_pycuda_tests/dls_tests/dls_gradient_descent_kernel_test.py index ee4055b7d..c37febd0f 100644 --- a/test/accelerate_tests/cuda_pycuda_tests/dls_tests/dls_gradient_descent_kernel_test.py +++ b/test/accelerate_tests/cuda_pycuda_tests/dls_tests/dls_gradient_descent_kernel_test.py @@ -143,13 +143,14 @@ def test_main_and_error_reduce_UNITY(self, name, iter): ]) def test_make_a012_UNITY(self, name, iter): - Nmax = 10 + # Reduce the array size to make the tests run faster + Nmax = 10 Ymax = 128 Xmax = 128 # Load data with h5py.File(self.datadir %name + "make_a012_%04d.h5" %iter, "r") as g: - addr = g["addr"][:] + addr = g["addr"][:Nmax] I = g["I"][:Nmax,:Ymax,:Xmax] f = g["f"][:Nmax,:Ymax,:Xmax] a = g["a"][:Nmax,:Ymax,:Xmax] @@ -175,6 +176,9 @@ def test_make_a012_UNITY(self, name, iter): # GPU kernel GDK = GradientDescentKernel(aux_dev, addr.shape[1], queue=self.stream) GDK.allocate() + GDK.gpu.Imodel.fill(np.nan) + GDK.gpu.LLerr.fill(np.nan) + GDK.gpu.LLden.fill(np.nan) GDK.make_a012(f_dev, a_dev, b_dev, addr_dev, I_dev, fic_dev) ## Assert From 8c4dda42674812183ada09e6f8d0250afe93b021 Mon Sep 17 00:00:00 2001 From: Benedikt Daurer Date: Thu, 4 Mar 2021 10:59:15 +0000 Subject: [PATCH 19/56] Testing probe/object update without atomics --- .../dls_tests/dls_po_update_kernel_test.py | 30 +++++++++++++------ 1 file changed, 21 insertions(+), 9 deletions(-) diff --git a/test/accelerate_tests/cuda_pycuda_tests/dls_tests/dls_po_update_kernel_test.py b/test/accelerate_tests/cuda_pycuda_tests/dls_tests/dls_po_update_kernel_test.py index da6bd2661..b045d01f4 100644 --- a/test/accelerate_tests/cuda_pycuda_tests/dls_tests/dls_po_update_kernel_test.py +++ b/test/accelerate_tests/cuda_pycuda_tests/dls_tests/dls_po_update_kernel_test.py @@ -28,7 +28,7 @@ class DlsPoUpdateKernelTest(PyCudaTest): ["regul", 50], ["floating", 0], ]) - def test_op_update_ml_UNITY(self, name, iter): + def test_op_update_ml_UNITY(self, name, iter, atomics=False): # Load data with h5py.File(self.datadir %name + "op_update_ml_%04d.h5" %iter, "r") as f: @@ -39,20 +39,26 @@ def test_op_update_ml_UNITY(self, name, iter): # Copy data to device aux_dev = gpuarray.to_gpu(aux) - addr_dev = gpuarray.to_gpu(addr) obg_dev = gpuarray.to_gpu(obg) pr_dev = gpuarray.to_gpu(pr) + # If not using atomics we need to change the addresses + if not atomics: + addr2 = np.ascontiguousarray(np.transpose(addr, (2, 3, 0, 1))) + addr_dev = gpuarray.to_gpu(addr2) + else: + addr_dev = gpuarray.to_gpu(addr) + # CPU Kernel BPOK = BasePoUpdateKernel() BPOK.ob_update_ML(addr, obg, pr, aux) # GPU Kernel POK = PoUpdateKernel() - POK.ob_update_ML(addr_dev, obg_dev, pr_dev, aux_dev, atomics=True) + POK.ob_update_ML(addr_dev, obg_dev, pr_dev, aux_dev, atomics=atomics) ## Assert - np.testing.assert_allclose(obg, obg_dev.get(), atol=self.atol, rtol=self.rtol, + np.testing.assert_allclose(obg, obg_dev.get(), atol=self.atol, rtol=self.rtol, verbose=False, err_msg="The object array has not been updated as expected") @parameterized.expand([ @@ -60,7 +66,7 @@ def test_op_update_ml_UNITY(self, name, iter): ["regul", 50], ["floating", 0], ]) - def test_pr_update_ml_UNITY(self, name, iter): + def test_pr_update_ml_UNITY(self, name, iter, atomics=False): # Load data with h5py.File(self.datadir %name + "pr_update_ml_%04d.h5" %iter, "r") as f: @@ -70,19 
+76,25 @@ def test_pr_update_ml_UNITY(self, name, iter): prg = f["prg"][:] # Copy data to device - aux_dev = gpuarray.to_gpu(aux) - addr_dev = gpuarray.to_gpu(addr) + aux_dev = gpuarray.to_gpu(aux) ob_dev = gpuarray.to_gpu(ob) prg_dev = gpuarray.to_gpu(prg) + # If not using atomics we need to change the addresses + if not atomics: + addr2 = np.ascontiguousarray(np.transpose(addr, (2, 3, 0, 1))) + addr_dev = gpuarray.to_gpu(addr2) + else: + addr_dev = gpuarray.to_gpu(addr) + # CPU Kernel BPOK = BasePoUpdateKernel() BPOK.pr_update_ML(addr, prg, ob, aux) # GPU Kernel POK = PoUpdateKernel() - POK.pr_update_ML(addr_dev, prg_dev, ob_dev, aux_dev, atomics=True) + POK.pr_update_ML(addr_dev, prg_dev, ob_dev, aux_dev, atomics=atomics) ## Assert - np.testing.assert_allclose(prg, prg_dev.get(), atol=self.atol, rtol=self.rtol, + np.testing.assert_allclose(prg, prg_dev.get(), atol=self.atol, rtol=self.rtol, verbose=False, err_msg="The probe array has not been updated as expected") \ No newline at end of file From 002658e568dc4333be6fb3ee3fda96138c1c1398 Mon Sep 17 00:00:00 2001 From: Jorg Lotze Date: Thu, 4 Mar 2021 19:10:44 +0000 Subject: [PATCH 20/56] Gpu flexible datatypes (#294) * generalised and flexible data types for fill_b kernels * configurable data types for batched_multiply * build_aux kernels and variants with flexible dtypes * flexible data type for build_exit * flexible data types for error_reduce * finite difference kernel update for consistent and flexible data types * consistent naming of data types in dot.cu * flexible types in exit_error.cu * fmag_all_update kernel with flexible datatypes * adjustable data types in fourier_error.cu * configurable data types for full_reduce * gd_main with flexible data types * flexible data types in log_likelihood * flexible data types in intens_renorm * flexible dtypes in update_addr_error_state * flexible data types for make_a012 * better error output from kernel compilation by inserting a line directive * flexible data types for make_model * flexible data types in ob_update_ML * flexible data types in ob_update * flexible data types on ob_update2_ML * type-generic ob_update2 * flexible data types for pr_update_ML * flexible data types for the pr_update kernel * flexible data type for pr_update2_ML * flexible data types for pr_update2 * flexible data type on transpose * flexible data type for kernel in convolution * removing old type substitutions * fixing explicit type casts * adding an ACC_TYPE to the tiled update kernels * adding note to explain the register-spilling effect on the tiled update kernels --- ptypy/accelerate/cuda_pycuda/__init__.py | 3 + ptypy/accelerate/cuda_pycuda/array_utils.py | 40 ++-- .../cuda_pycuda/cuda/batched_multiply.cu | 14 +- .../accelerate/cuda_pycuda/cuda/build_aux.cu | 28 ++- .../cuda_pycuda/cuda/build_aux_no_ex.cu | 21 +- .../cuda/build_aux_position_correction.cu | 18 +- .../accelerate/cuda_pycuda/cuda/build_exit.cu | 23 ++- .../cuda_pycuda/cuda/convolution.cu | 16 +- .../accelerate/cuda_pycuda/cuda/delx_last.cu | 19 +- ptypy/accelerate/cuda_pycuda/cuda/delx_mid.cu | 19 +- ptypy/accelerate/cuda_pycuda/cuda/dot.cu | 10 +- .../cuda_pycuda/cuda/error_reduce.cu | 21 +- .../accelerate/cuda_pycuda/cuda/exit_error.cu | 15 +- ptypy/accelerate/cuda_pycuda/cuda/fill_b.cu | 46 +++-- .../cuda_pycuda/cuda/fill_b_reduce.cu | 19 +- .../cuda_pycuda/cuda/fmag_all_update.cu | 35 ++-- .../cuda_pycuda/cuda/fourier_error.cu | 28 ++- .../cuda_pycuda/cuda/full_reduce.cu | 19 +- ptypy/accelerate/cuda_pycuda/cuda/gd_main.cu | 22 +- 
.../cuda_pycuda/cuda/intens_renorm.cu | 32 +-- .../cuda_pycuda/cuda/log_likelihood.cu | 34 ++-- .../accelerate/cuda_pycuda/cuda/make_a012.cu | 57 +++--- .../accelerate/cuda_pycuda/cuda/make_model.cu | 16 +- .../accelerate/cuda_pycuda/cuda/ob_update.cu | 36 +++- .../accelerate/cuda_pycuda/cuda/ob_update2.cu | 53 +++-- .../cuda_pycuda/cuda/ob_update2_ML.cu | 35 +++- .../cuda_pycuda/cuda/ob_update_ML.cu | 28 ++- .../accelerate/cuda_pycuda/cuda/pr_update.cu | 35 +++- .../accelerate/cuda_pycuda/cuda/pr_update2.cu | 51 +++-- .../cuda_pycuda/cuda/pr_update2_ML.cu | 34 +++- .../cuda_pycuda/cuda/pr_update_ML.cu | 27 ++- .../accelerate/cuda_pycuda/cuda/transpose.cu | 5 + .../cuda/update_addr_error_state.cu | 17 +- ptypy/accelerate/cuda_pycuda/cufft.py | 20 +- ptypy/accelerate/cuda_pycuda/kernels.py | 191 ++++++++++++++---- .../fourier_update_kernel_test.py | 4 +- 36 files changed, 780 insertions(+), 311 deletions(-) diff --git a/ptypy/accelerate/cuda_pycuda/__init__.py b/ptypy/accelerate/cuda_pycuda/__init__.py index 04074625b..9daee89e3 100644 --- a/ptypy/accelerate/cuda_pycuda/__init__.py +++ b/ptypy/accelerate/cuda_pycuda/__init__.py @@ -42,6 +42,9 @@ def load_kernel(name, subs={}, file=None): kernel = f.read() for k,v in list(subs.items()): kernel = kernel.replace(k, str(v)) + # insert a preprocessor line directive to assist compiler errors + escaped = fn.replace("\\", "\\\\") + kernel = '#line 1 "{}"\n'.format(escaped) + kernel mod = SourceModule(kernel, include_dirs=[np.get_include()], no_extern_c=True, options=debug_options) return mod.get_function(name) diff --git a/ptypy/accelerate/cuda_pycuda/array_utils.py b/ptypy/accelerate/cuda_pycuda/array_utils.py index 7ec819b95..e3b97657a 100644 --- a/ptypy/accelerate/cuda_pycuda/array_utils.py +++ b/ptypy/accelerate/cuda_pycuda/array_utils.py @@ -8,15 +8,17 @@ def __init__(self, acc_dtype=np.float64, queue=None): self.queue = queue self.acc_dtype = acc_dtype self.cdot_cuda = load_kernel("dot", { - 'INTYPE': 'complex', - 'ACCTYPE': 'double' if acc_dtype==np.float64 else 'float' + 'IN_TYPE': 'complex', + 'ACC_TYPE': 'double' if acc_dtype==np.float64 else 'float' }) self.dot_cuda = load_kernel("dot", { - 'INTYPE': 'float', - 'ACCTYPE': 'double' if acc_dtype==np.float64 else 'float' + 'IN_TYPE': 'float', + 'ACC_TYPE': 'double' if acc_dtype==np.float64 else 'float' }) self.full_reduce_cuda = load_kernel("full_reduce", { - 'DTYPE': 'double' if acc_dtype==np.float64 else 'float', + 'IN_TYPE': 'double' if acc_dtype==np.float64 else 'float', + 'OUT_TYPE': 'double' if acc_dtype==np.float64 else 'float', + 'ACC_TYPE': 'double' if acc_dtype==np.float64 else 'float', 'BDIM_X': 1024 }) self.transpose_cuda = load_kernel("transpose", { @@ -103,25 +105,29 @@ def __init__(self, dtype, queue=None): 'IS_FORWARD': 'true', 'BDIM_X': str(self.last_axis_block[0]), 'BDIM_Y': str(self.last_axis_block[1]), - 'DTYPE': stype + 'IN_TYPE': stype, + 'OUT_TYPE': stype }) self.delxb_last = load_kernel("delx_last", file="delx_last.cu", subs={ 'IS_FORWARD': 'false', 'BDIM_X': str(self.last_axis_block[0]), 'BDIM_Y': str(self.last_axis_block[1]), - 'DTYPE': stype + 'IN_TYPE': stype, + 'OUT_TYPE': stype }) self.delxf_mid = load_kernel("delx_mid", file="delx_mid.cu", subs={ 'IS_FORWARD': 'true', 'BDIM_X': str(self.mid_axis_block[0]), 'BDIM_Y': str(self.mid_axis_block[1]), - 'DTYPE': stype + 'IN_TYPE': stype, + 'OUT_TYPE': stype }) self.delxb_mid = load_kernel("delx_mid", file="delx_mid.cu", subs={ 'IS_FORWARD': 'false', 'BDIM_X': str(self.mid_axis_block[0]), 'BDIM_Y': 
str(self.mid_axis_block[1]), - 'DTYPE': stype + 'IN_TYPE': stype, + 'OUT_TYPE': stype }) def delxf(self, input, out, axis=-1): @@ -188,13 +194,17 @@ def delxb(self, input, out, axis=-1): class GaussianSmoothingKernel: - def __init__(self, queue=None, num_stdevs=4): + def __init__(self, queue=None, num_stdevs=4, kernel_type='float'): + if kernel_type not in ['float', 'double']: + raise ValueError('Invalid data type for kernel') + self.kernel_type = kernel_type self.dtype = np.complex64 self.stype = "complex" self.queue = queue self.num_stdevs = num_stdevs self.blockdim_x = 4 self.blockdim_y = 16 + # At least 2 blocks per SM self.max_shared_per_block = 48 * 1024 // 2 @@ -204,12 +214,14 @@ def __init__(self, queue=None, num_stdevs=4): self.convolution_row = load_kernel("convolution_row", file="convolution.cu", subs={ 'BDIM_X': self.blockdim_x, 'BDIM_Y': self.blockdim_y, - 'DTYPE': self.stype + 'DTYPE': self.stype, + 'MATH_TYPE': self.kernel_type }) self.convolution_col = load_kernel("convolution_col", file="convolution.cu", subs={ 'BDIM_X': self.blockdim_y, 'BDIM_Y': self.blockdim_x, - 'DTYPE': self.stype + 'DTYPE': self.stype, + 'MATH_TYPE': self.kernel_type }) @@ -238,7 +250,7 @@ def convolution(self, input, output, mfs): r = int(self.num_stdevs * stdx + 0.5) g = gaussian(np.arange(-r,r+1), stdx) g /= g.sum() - kernel = gpuarray.to_gpu(g[r:].astype(np.float32)) + kernel = gpuarray.to_gpu(g[r:].astype(np.float32 if self.kernel_type == 'float' else np.float64)) if r > self.max_kernel_radius: raise ValueError("Size of Gaussian kernel too large") @@ -263,7 +275,7 @@ def convolution(self, input, output, mfs): r = int(self.num_stdevs * stdy + 0.5) g = gaussian(np.arange(-r,r+1), stdy) g /= g.sum() - kernel = gpuarray.to_gpu(g[r:].astype(np.float32)) + kernel = gpuarray.to_gpu(g[r:].astype(np.float32 if self.kernel_type == 'float' else np.float64)) if r > self.max_kernel_radius: raise ValueError("Size of Gaussian kernel too large") diff --git a/ptypy/accelerate/cuda_pycuda/cuda/batched_multiply.cu b/ptypy/accelerate/cuda_pycuda/cuda/batched_multiply.cu index 15ca555fa..1263841b6 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/batched_multiply.cu +++ b/ptypy/accelerate/cuda_pycuda/cuda/batched_multiply.cu @@ -1,13 +1,19 @@ /** This kernel was used for FFT pre- and post-scaling, to test if cuFFT via python is worthwhile. It turned out it wasn't. -*/ + * + * Data types: + * - IN_TYPE: the data type for the inputs + * - OUT_TYPE: the data type for the outputs + * - MATH_TYPE: the data type used for computation (filter) + */ + #include using thrust::complex; -extern "C" __global__ void batched_multiply(const complex* input, - complex* output, - const complex* filter, +extern "C" __global__ void batched_multiply(const complex* input, + complex* output, + const complex* filter, float scale, int nBatches, int rows, diff --git a/ptypy/accelerate/cuda_pycuda/cuda/build_aux.cu b/ptypy/accelerate/cuda_pycuda/cuda/build_aux.cu index 88b22c256..bb0e68838 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/build_aux.cu +++ b/ptypy/accelerate/cuda_pycuda/cuda/build_aux.cu @@ -1,24 +1,33 @@ +/** build_aux kernel. 
+ * + * Data types: + * - IN_TYPE: the data type for the inputs (float or double) + * - OUT_TYPE: the data type for the outputs (float or double - for aux wave) + * - MATH_TYPE: the data type used for computation + */ + #include using thrust::complex; extern "C" __global__ void build_aux( - complex* auxiliary_wave, - const complex* __restrict__ exit_wave, + complex* auxiliary_wave, + const complex* __restrict__ exit_wave, int B, int C, - const complex* __restrict__ probe, + const complex* __restrict__ probe, int E, int F, - const complex* __restrict__ obj, + const complex* __restrict__ obj, int H, int I, const int* __restrict__ addr, - float alpha) + IN_TYPE alpha_) { int bid = blockIdx.x; int tx = threadIdx.x; int ty = threadIdx.y; int addr_stride = 15; + const MATH_TYPE alpha = alpha_; // type conversion const int* oa = addr + 3 + bid * addr_stride; const int* pa = addr + bid * addr_stride; @@ -35,9 +44,14 @@ extern "C" __global__ void build_aux( // (it will work for less as well) for (int c = tx; c < C; c += blockDim.x) { + // temporaries to convert to MATH_TYPE in case it's different to storage type + complex t_obj = obj[b * I + c]; + complex t_probe = probe[b * F + c]; + complex t_ex = exit_wave[b * C + c]; + auxiliary_wave[b * C + c] = - obj[b * I + c] * probe[b * F + c] * (1.0f + alpha) - - exit_wave[b * C + c] * alpha; + t_obj * t_probe * (MATH_TYPE(1) + alpha) - + t_ex * alpha; } } } diff --git a/ptypy/accelerate/cuda_pycuda/cuda/build_aux_no_ex.cu b/ptypy/accelerate/cuda_pycuda/cuda/build_aux_no_ex.cu index 384efc070..b19ad8d70 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/build_aux_no_ex.cu +++ b/ptypy/accelerate/cuda_pycuda/cuda/build_aux_no_ex.cu @@ -1,23 +1,32 @@ +/** build_aux without exit wave kernel. + * + * Data types: + * - IN_TYPE: the data type for the inputs (float or double) + * - OUT_TYPE: the data type for the outputs (float or double - for aux wave) + * - MATH_TYPE: the data type used for computation + */ + #include using thrust::complex; -extern "C" __global__ void build_aux_no_ex(CTYPE* auxilliary_wave, +extern "C" __global__ void build_aux_no_ex(complex* auxilliary_wave, int aRows, int aCols, - const CTYPE* __restrict__ probe, + const complex* __restrict__ probe, int pRows, int pCols, - const CTYPE* __restrict__ obj, + const complex* __restrict__ obj, int oRows, int oCols, const int* __restrict__ addr, - FTYPE fac, + IN_TYPE fac_, int doAdd) { int bid = blockIdx.x; int tx = threadIdx.x; int ty = threadIdx.y; const int addr_stride = 15; + const MATH_TYPE fac = fac_; // type conversion const int* oa = addr + 3 + bid * addr_stride; const int* pa = addr + bid * addr_stride; @@ -32,7 +41,9 @@ extern "C" __global__ void build_aux_no_ex(CTYPE* auxilliary_wave, # pragma unroll(4) for (int c = tx; c < aCols; c += blockDim.x) { - auto tmp = obj[b * oCols + c] * probe[b * pCols + c] * fac; + complex t_obj = obj[b * oCols + c]; + complex t_probe = probe[b * pCols + c]; + auto tmp = t_obj * t_probe * fac; if (doAdd) { auxilliary_wave[b * aCols + c] += tmp; diff --git a/ptypy/accelerate/cuda_pycuda/cuda/build_aux_position_correction.cu b/ptypy/accelerate/cuda_pycuda/cuda/build_aux_position_correction.cu index 004e7f0ed..327040371 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/build_aux_position_correction.cu +++ b/ptypy/accelerate/cuda_pycuda/cuda/build_aux_position_correction.cu @@ -1,12 +1,20 @@ +/** build_aux for position correction. 
+ * + * Data types: + * - IN_TYPE: the data type for the inputs (float or double) + * - OUT_TYPE: the data type for the outputs (float or double - for aux wave) + * - MATH_TYPE: the data type used for computation + */ + #include using thrust::complex; extern "C" __global__ void build_aux_position_correction( - complex* auxiliary_wave, - const complex* __restrict__ probe, + complex* auxiliary_wave, + const complex* __restrict__ probe, int B, int C, - const complex* __restrict__ obj, + const complex* __restrict__ obj, int H, int I, const int* __restrict__ addr) @@ -30,7 +38,9 @@ extern "C" __global__ void build_aux_position_correction( // (it will work for less as well) for (int c = tx; c < C; c += blockDim.x) { - auxiliary_wave[b * C + c] = obj[b * I + c] * probe[b * C + c]; + complex t_obj = obj[b * I + c]; + complex t_probe = probe[b * C + c]; + auxiliary_wave[b * C + c] = t_obj * t_probe; } } } diff --git a/ptypy/accelerate/cuda_pycuda/cuda/build_exit.cu b/ptypy/accelerate/cuda_pycuda/cuda/build_exit.cu index 87031184e..8c1127758 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/build_exit.cu +++ b/ptypy/accelerate/cuda_pycuda/cuda/build_exit.cu @@ -1,3 +1,12 @@ +/** build_exit kernel. + * + * Data types: + * - IN_TYPE: the data type for the inputs (float or double) + * - OUT_TYPE: the data type for the outputs (float or double - for aux wave) + * - MATH_TYPE: the data type used for computation + */ + + #include using thrust::complex; @@ -9,14 +18,14 @@ __device__ inline void atomicAdd(complex* x, complex y) atomicAdd(xf + 1, y.imag()); } -extern "C" __global__ void build_exit(complex* auxiliary_wave, - complex* exit_wave, +extern "C" __global__ void build_exit(complex* auxiliary_wave, + complex* exit_wave, int B, int C, - const complex* __restrict__ probe, + const complex* __restrict__ probe, int E, int F, - const complex* __restrict__ obj, + const complex* __restrict__ obj, int H, int I, const int* __restrict__ addr) @@ -41,8 +50,10 @@ extern "C" __global__ void build_exit(complex* auxiliary_wave, // (it will work for less as well) for (int c = tx; c < C; c += blockDim.x) { - auto auxv = auxiliary_wave[b * C + c]; - auxv -= probe[b * F + c] * obj[b * I + c]; + complex auxv = auxiliary_wave[b * C + c]; + complex t_probe = probe[b * F + c]; + complex t_obj = obj[b * I + c]; + auxv -= t_probe * t_obj; exit_wave[b * C + c] += auxv; auxiliary_wave[b * C + c] = auxv; } diff --git a/ptypy/accelerate/cuda_pycuda/cuda/convolution.cu b/ptypy/accelerate/cuda_pycuda/cuda/convolution.cu index 1b008c815..ae42ecba5 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/convolution.cu +++ b/ptypy/accelerate/cuda_pycuda/cuda/convolution.cu @@ -1,3 +1,11 @@ +/** + * Data types: + * - DTYPE (float/double/complex/complex) + * - MATH_TYPE (float/double) - used for the convolution kernel itself + * + * A symmetric convolution kernel is assumed here + */ + #include using thrust::complex; @@ -42,7 +50,7 @@ extern "C" __global__ void convolution_row(const DTYPE *__restrict__ input, DTYPE *output, int height, int width, - const float* kernel, + const MATH_TYPE* kernel, int kernel_radius) { int tx = threadIdx.x; @@ -97,7 +105,7 @@ extern "C" __global__ void convolution_row(const DTYPE *__restrict__ input, if (gby + ty >= width || gbx + tx >= height) return; - // compute + // compute - will be complex if kernel is double auto sum = shm[tx * shwidth + (ty + kernel_radius)] * kernel[0]; for (int i = 1; i <= kernel_radius; ++i) { @@ -117,7 +125,7 @@ extern "C" __global__ void convolution_col(const DTYPE *__restrict__ input, 
DTYPE *output, int height, int width, - const float* kernel, + const MATH_TYPE* kernel, int kernel_radius) { int tx = threadIdx.x; @@ -169,7 +177,7 @@ extern "C" __global__ void convolution_col(const DTYPE *__restrict__ input, if (gby + ty >= width || gbx + tx >= height) return; - // compute + // compute - will be complex if kernel is double auto sum = shm[(tx + kernel_radius) * BDIM_Y + ty] * kernel[0]; for (int i = 1; i <= kernel_radius; ++i) { diff --git a/ptypy/accelerate/cuda_pycuda/cuda/delx_last.cu b/ptypy/accelerate/cuda_pycuda/cuda/delx_last.cu index c4449f19a..a302790f7 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/delx_last.cu +++ b/ptypy/accelerate/cuda_pycuda/cuda/delx_last.cu @@ -1,3 +1,10 @@ +/** difference along last axis + * + * Data types: + * - IN_TYPE: the data type for the inputs + * - OUT_TYPE: the data type for the outputs + */ + #include using thrust::complex; @@ -10,14 +17,14 @@ using thrust::complex; * Otherwise it follows the same ideas as delx_mid - please read the * description there. */ -extern "C" __global__ void delx_last(const DTYPE *__restrict__ input, - DTYPE *output, +extern "C" __global__ void delx_last(const IN_TYPE *__restrict__ input, + OUT_TYPE *output, int flat_dim, int axis_dim) { // reinterpret to avoid constructor of complex() + compiler warning - __shared__ char shr[BDIM_X * BDIM_Y * sizeof(DTYPE)]; - auto shared_data = reinterpret_cast(shr); + __shared__ char shr[BDIM_X * BDIM_Y * sizeof(IN_TYPE)]; + auto shared_data = reinterpret_cast(shr); unsigned int tx = threadIdx.x; unsigned int ty = threadIdx.y; @@ -43,7 +50,7 @@ extern "C" __global__ void delx_last(const DTYPE *__restrict__ input, { if (IS_FORWARD) { - DTYPE plus1; + IN_TYPE plus1; if (tx < BDIM_X - 1 && ix < axis_dim - 1) // we have a next element in shared data { @@ -62,7 +69,7 @@ extern "C" __global__ void delx_last(const DTYPE *__restrict__ input, } else { - DTYPE minus1; + IN_TYPE minus1; if (tx > 0) // we have a previous element in shared { minus1 = shared_data[ty * BDIM_X + tx - 1]; diff --git a/ptypy/accelerate/cuda_pycuda/cuda/delx_mid.cu b/ptypy/accelerate/cuda_pycuda/cuda/delx_mid.cu index ffc6600ca..15a17f544 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/delx_mid.cu +++ b/ptypy/accelerate/cuda_pycuda/cuda/delx_mid.cu @@ -1,3 +1,10 @@ +/** difference along any axis + * + * Data types: + * - IN_TYPE: the data type for the inputs + * - OUT_TYPE: the data type for the outputs + */ + #include using thrust::complex; @@ -40,8 +47,8 @@ using thrust::complex; * zero if it's the end of the input. 
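The __shared__ char buffer with a reinterpret_cast seen in delx_last sidesteps declaring a __shared__ array of thrust::complex directly, which would require constructor calls on shared memory and triggers a compiler warning. A stripped-down sketch of the same idiom follows, assuming the usual IN_TYPE macro; reverse_blocks is an illustrative kernel, not part of the patch, and for brevity it assumes n is a multiple of BDIM_X.

    #include <thrust/complex.h>
    using thrust::complex;

    #define BDIM_X 256

    extern "C" __global__ void reverse_blocks(const complex<IN_TYPE>* __restrict__ in,
                                              complex<IN_TYPE>* out,
                                              int n)
    {
      // raw bytes reinterpreted as complex: no constructors run on shared memory
      __shared__ char raw[BDIM_X * sizeof(complex<IN_TYPE>)];
      auto tile = reinterpret_cast<complex<IN_TYPE>*>(raw);

      int tx = threadIdx.x;
      int ix = blockIdx.x * BDIM_X + tx;
      if (ix < n)
        tile[tx] = in[ix];
      __syncthreads();

      // mirror the tile within the block, reading back through shared memory
      int jx = blockIdx.x * BDIM_X + (BDIM_X - 1 - tx);
      if (ix < n && jx < n)
        out[jx] = tile[tx];
    }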
* */ -extern "C" __global__ void delx_mid(const DTYPE *__restrict__ input, - DTYPE *output, +extern "C" __global__ void delx_mid(const IN_TYPE *__restrict__ input, + OUT_TYPE *output, int lower_dim, // x for 3D int higher_dim, // z for 3D int axis_dim) @@ -49,8 +56,8 @@ extern "C" __global__ void delx_mid(const DTYPE *__restrict__ input, // reinterpret to avoid compiler warning that // constructor of complex() cannot be called if it's // shared memory - polluting the outputs - __shared__ char shr[BDIM_X * BDIM_Y * sizeof(DTYPE)]; - auto shared_data = reinterpret_cast(shr); + __shared__ char shr[BDIM_X * BDIM_Y * sizeof(IN_TYPE)]; + auto shared_data = reinterpret_cast(shr); unsigned int tx = threadIdx.x; unsigned int ty = threadIdx.y; @@ -82,7 +89,7 @@ extern "C" __global__ void delx_mid(const DTYPE *__restrict__ input, { if (IS_FORWARD) { - DTYPE plus1; + IN_TYPE plus1; if (ty < BDIM_Y - 1 && iy < axis_dim - 1) // we have a next element in shared data { @@ -100,7 +107,7 @@ extern "C" __global__ void delx_mid(const DTYPE *__restrict__ input, } else { - DTYPE minus1; + IN_TYPE minus1; if (ty > 0) // we have a previous element in shared { minus1 = shared_data[(ty - 1) * BDIM_X + tx]; diff --git a/ptypy/accelerate/cuda_pycuda/cuda/dot.cu b/ptypy/accelerate/cuda_pycuda/cuda/dot.cu index 1f53b0d0c..21087abe3 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/dot.cu +++ b/ptypy/accelerate/cuda_pycuda/cuda/dot.cu @@ -15,15 +15,15 @@ __device__ inline T dotmul(const complex& a, const complex& b) return a.real() * b.real() + a.imag() * b.imag(); } -extern "C" __global__ void dot(const INTYPE* a, - const INTYPE* b, +extern "C" __global__ void dot(const IN_TYPE* a, + const IN_TYPE* b, int size, - ACCTYPE* out) + ACC_TYPE* out) { int tx = threadIdx.x; int ix = tx + blockIdx.x * blockDim.x; - __shared__ ACCTYPE sh[1024]; + __shared__ ACC_TYPE sh[1024]; if (ix < size) { @@ -31,7 +31,7 @@ extern "C" __global__ void dot(const INTYPE* a, } else { - sh[tx] = ACCTYPE(0); + sh[tx] = ACC_TYPE(0); } __syncthreads(); diff --git a/ptypy/accelerate/cuda_pycuda/cuda/error_reduce.cu b/ptypy/accelerate/cuda_pycuda/cuda/error_reduce.cu index 177732e9b..9b3389d5c 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/error_reduce.cu +++ b/ptypy/accelerate/cuda_pycuda/cuda/error_reduce.cu @@ -1,17 +1,24 @@ - -extern "C" __global__ void error_reduce(const float* ferr, - float* err_fmag, +/** error_reduce kernel. 
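dot.cu stages its per-thread products in an ACC_TYPE that may be wider than the input type before reducing them in shared memory. A self-contained sketch of such a block-level reduction, assuming the same IN_TYPE / ACC_TYPE / OUT_TYPE macros; dot_block is illustrative only and writes one partial sum per block.

    #define BDIM_X 1024

    extern "C" __global__ void dot_block(const IN_TYPE* __restrict__ a,
                                         const IN_TYPE* __restrict__ b,
                                         OUT_TYPE* block_sums,
                                         int n)
    {
      __shared__ ACC_TYPE sh[BDIM_X];
      int tx = threadIdx.x;
      int ix = blockIdx.x * BDIM_X + tx;

      // accumulate in ACC_TYPE, which may be wider than the storage type
      sh[tx] = (ix < n) ? ACC_TYPE(a[ix]) * ACC_TYPE(b[ix]) : ACC_TYPE(0);
      __syncthreads();

      // shared-memory tree reduction (BDIM_X must be a power of two)
      for (int s = BDIM_X / 2; s > 0; s >>= 1)
      {
        if (tx < s)
          sh[tx] += sh[tx + s];
        __syncthreads();
      }

      if (tx == 0)
        block_sums[blockIdx.x] = OUT_TYPE(sh[0]);   // cast back on the way out
    }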
+ * + * Data types: + * - IN_TYPE: the data type for the inputs (float or double) + * - OUT_TYPE: the data type for the outputs (float or double) + * - ACC_TYPE: the data type used for computation + */ + +extern "C" __global__ void error_reduce(const IN_TYPE* ferr, + OUT_TYPE* err_fmag, int M, int N) { int tx = threadIdx.x; int ty = threadIdx.y; int batch = blockIdx.x; - extern __shared__ float sum_v[1024]; + extern __shared__ ACC_TYPE sum_v[1024]; int shidx = ty * blockDim.x + tx; // shidx: index in shared memory for this block - float sum = 0.0f; + ACC_TYPE sum = ACC_TYPE(0.0); for (int m = ty; m < M; m += blockDim.y) { @@ -20,7 +27,7 @@ extern "C" __global__ void error_reduce(const float* ferr, { int idx = batch * M * N + m * N + n; // idx is index qwith respect to the full stack - sum += ferr[idx]; + sum += ACC_TYPE(ferr[idx]); } } @@ -44,6 +51,6 @@ extern "C" __global__ void error_reduce(const float* ferr, if (shidx == 0) { - err_fmag[batch] = float(sum_v[0]); + err_fmag[batch] = OUT_TYPE(sum_v[0]); } } diff --git a/ptypy/accelerate/cuda_pycuda/cuda/exit_error.cu b/ptypy/accelerate/cuda_pycuda/cuda/exit_error.cu index d4f774319..fdac52e46 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/exit_error.cu +++ b/ptypy/accelerate/cuda_pycuda/cuda/exit_error.cu @@ -11,16 +11,16 @@ using thrust::complex; // (guided by profiler) extern "C" __global__ void __launch_bounds__(1024, 2) exit_error(int nmodes, - complex *aux, - float *ferr, - const int *addr, + const complex * __restrict aux, + OUT_TYPE *ferr, + const int * __restrict addr, int A, int B) { int tx = threadIdx.x; int ty = threadIdx.y; int addr_stride = 15; - float denom = A * B; + MATH_TYPE denom = A * B; const int *ea = addr + 6 + (blockIdx.x * nmodes) * addr_stride; const int *da = addr + 9 + (blockIdx.x * nmodes) * addr_stride; @@ -32,15 +32,16 @@ extern "C" __global__ void __launch_bounds__(1024, 2) { for (int b = tx; b < B; b += blockDim.x) { - float acc = 0.0; + MATH_TYPE acc = 0.0; for (int idx = 0; idx < nmodes; ++idx) { - float abs_exit_wave = abs(aux[a * B + b + idx * A * B]); + complex t_aux = aux[a * B + b + idx * A * B]; + MATH_TYPE abs_exit_wave = abs(t_aux); acc += abs_exit_wave * abs_exit_wave; // if we do this manually (real*real +imag*imag) // we get differences to numpy due to rounding } - ferr[a * B + b] = acc / denom; + ferr[a * B + b] = OUT_TYPE(acc / denom); } } } diff --git a/ptypy/accelerate/cuda_pycuda/cuda/fill_b.cu b/ptypy/accelerate/cuda_pycuda/cuda/fill_b.cu index cfdffb911..9c6c7e1de 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/fill_b.cu +++ b/ptypy/accelerate/cuda_pycuda/cuda/fill_b.cu @@ -1,28 +1,40 @@ -extern "C" __global__ void fill_b(const FTYPE* A0, - const FTYPE* A1, - const FTYPE* A2, - const FTYPE* w, - FTYPE Brenorm, +/** fill_b kernel. 
+ * Data types: + * - IN_TYPE: the data type for the inputs + * - OUT_TYPE: the data type for the outputs + * - MATH_TYPE: the data type used for computation + * - ACC_TYPE: the accumulator type for summing + */ + +extern "C" __global__ void fill_b(const IN_TYPE* A0, + const IN_TYPE* A1, + const IN_TYPE* A2, + const IN_TYPE* w, + IN_TYPE Brenorm, int size, - double* out) + OUT_TYPE* out) { int tx = threadIdx.x; int ix = tx + blockIdx.x * blockDim.x; - __shared__ double smem[3][BDIM_X]; + __shared__ ACC_TYPE smem[3][BDIM_X]; if (ix < size) { - // FTYPE(2) to make sure it's float in single precision and doesn't + // MATHTYPE(2) to make sure it's float in single precision and doesn't // accidentally promote the equation to double - smem[0][tx] = w[ix] * A0[ix] * A0[ix]; - smem[1][tx] = w[ix] * FTYPE(2) * A0[ix] * A1[ix]; - smem[2][tx] = w[ix] * (A1[ix] * A1[ix] + FTYPE(2) * A0[ix] * A2[ix]); + MATH_TYPE t_a0 = A0[ix]; + MATH_TYPE t_a1 = A1[ix]; + MATH_TYPE t_a2 = A2[ix]; + MATH_TYPE t_w = w[ix]; + smem[0][tx] = t_w * t_a0 * t_a0; + smem[1][tx] = t_w * MATH_TYPE(2) * t_a0 * t_a1; + smem[2][tx] = t_w * (t_a1 * t_a1 + MATH_TYPE(2) * t_a0 * t_a2); } else { - smem[0][tx] = FTYPE(0); - smem[1][tx] = FTYPE(0); - smem[2][tx] = FTYPE(0); + smem[0][tx] = ACC_TYPE(0); + smem[1][tx] = ACC_TYPE(0); + smem[2][tx] = ACC_TYPE(0); } __syncthreads(); @@ -43,8 +55,8 @@ extern "C" __global__ void fill_b(const FTYPE* A0, if (tx == 0) { - out[blockIdx.x * 3 + 0] = smem[0][0] * double(Brenorm); - out[blockIdx.x * 3 + 1] = smem[1][0] * double(Brenorm); - out[blockIdx.x * 3 + 2] = smem[2][0] * double(Brenorm); + out[blockIdx.x * 3 + 0] = MATH_TYPE(smem[0][0]) * MATH_TYPE(Brenorm); + out[blockIdx.x * 3 + 1] = MATH_TYPE(smem[1][0]) * MATH_TYPE(Brenorm); + out[blockIdx.x * 3 + 2] = MATH_TYPE(smem[2][0]) * MATH_TYPE(Brenorm); } } \ No newline at end of file diff --git a/ptypy/accelerate/cuda_pycuda/cuda/fill_b_reduce.cu b/ptypy/accelerate/cuda_pycuda/cuda/fill_b_reduce.cu index c37d494d8..b590e39e4 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/fill_b_reduce.cu +++ b/ptypy/accelerate/cuda_pycuda/cuda/fill_b_reduce.cu @@ -1,12 +1,21 @@ +/** fill_b_reduce - for second-stage reduction used after fill_b. + * + * Note that the IN_TYPE here must match what's produced by the fill_b kernel + * Data types: + * - IN_TYPE: the data type for the inputs + * - OUT_TYPE: the data type for the outputs + * - ACC_TYPE: the accumulator type for summing + */ + #include -extern "C" __global__ void fill_b_reduce(const double* in, FTYPE* B, int blocks) +extern "C" __global__ void fill_b_reduce(const IN_TYPE* in, OUT_TYPE* B, int blocks) { // always a single thread block for 2nd stage assert(gridDim.x == 1); int tx = threadIdx.x; - __shared__ double smem[3][BDIM_X]; + __shared__ ACC_TYPE smem[3][BDIM_X]; double sum0 = 0.0, sum1 = 0.0, sum2 = 0.0; for (int ix = tx; ix < blocks; ix += blockDim.x) @@ -37,8 +46,8 @@ extern "C" __global__ void fill_b_reduce(const double* in, FTYPE* B, int blocks) if (tx == 0) { - B[0] += FTYPE(smem[0][0]); - B[1] += FTYPE(smem[1][0]); - B[2] += FTYPE(smem[2][0]); + B[0] += OUT_TYPE(smem[0][0]); + B[1] += OUT_TYPE(smem[1][0]); + B[2] += OUT_TYPE(smem[2][0]); } } diff --git a/ptypy/accelerate/cuda_pycuda/cuda/fmag_all_update.cu b/ptypy/accelerate/cuda_pycuda/cuda/fmag_all_update.cu index 7d7a512a7..f8f695ca5 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/fmag_all_update.cu +++ b/ptypy/accelerate/cuda_pycuda/cuda/fmag_all_update.cu @@ -1,15 +1,23 @@ +/** fmag_all_update. 
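fill_b reduces three related quantities in a single pass by giving each its own row of shared memory, then hands one partial triple per block to fill_b_reduce, whose IN_TYPE must therefore match fill_b's OUT_TYPE. The sketch below shows the same structure for two quantities, assuming the same macros; sum_and_sumsq is illustrative only.

    #define BDIM_X 1024

    extern "C" __global__ void sum_and_sumsq(const IN_TYPE* __restrict__ in,
                                             OUT_TYPE* partials,   // 2 values per block
                                             int n)
    {
      __shared__ ACC_TYPE smem[2][BDIM_X];
      int tx = threadIdx.x;
      int ix = blockIdx.x * BDIM_X + tx;

      if (ix < n)
      {
        ACC_TYPE v = ACC_TYPE(in[ix]);
        smem[0][tx] = v;        // running sum
        smem[1][tx] = v * v;    // running sum of squares
      }
      else
      {
        smem[0][tx] = ACC_TYPE(0);
        smem[1][tx] = ACC_TYPE(0);
      }
      __syncthreads();

      for (int s = BDIM_X / 2; s > 0; s >>= 1)
      {
        if (tx < s)
        {
          smem[0][tx] += smem[0][tx + s];
          smem[1][tx] += smem[1][tx + s];
        }
        __syncthreads();
      }

      // one pair of partials per block; a second, single-block pass
      // (in the spirit of fill_b_reduce) finishes the reduction
      if (tx == 0)
      {
        partials[blockIdx.x * 2 + 0] = OUT_TYPE(smem[0][0]);
        partials[blockIdx.x * 2 + 1] = OUT_TYPE(smem[1][0]);
      }
    }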
+ * + * Data types: + * - IN_TYPE: the data type for the inputs (float or double) + * - OUT_TYPE: the data type for the outputs (float or double) + * - MATH_TYPE: the data type used for computation + */ + #include #include using std::sqrt; using thrust::complex; -extern "C" __global__ void fmag_all_update(complex* f, - const float* fmask, - const float* fmag, - const float* fdev, - const float* err_fmag, +extern "C" __global__ void fmag_all_update(complex* f, + const IN_TYPE* fmask, + const IN_TYPE* fmag, + const IN_TYPE* fdev, + const IN_TYPE* err_fmag, const int* addr_info, - float pbound, + IN_TYPE pbound_, int A, int B) { @@ -17,23 +25,24 @@ extern "C" __global__ void fmag_all_update(complex* f, int tx = threadIdx.x; int ty = threadIdx.y; int addr_stride = 15; + MATH_TYPE pbound = pbound_; const int* ea = addr_info + batch * addr_stride + 6; const int* da = addr_info + batch * addr_stride + 9; const int* ma = addr_info + batch * addr_stride + 12; fmask += ma[0] * A * B; - float err = err_fmag[da[0]]; + MATH_TYPE err = err_fmag[da[0]]; fdev += da[0] * A * B; fmag += da[0] * A * B; f += ea[0] * A * B; - float renorm = sqrt(pbound / err); + MATH_TYPE renorm = sqrt(pbound / err); for (int a = ty; a < A; a += blockDim.y) { for (int b = tx; b < B; b += blockDim.x) { - float m = fmask[a * A + b]; + MATH_TYPE m = fmask[a * A + b]; if (renorm < 1.0f) { /* @@ -42,10 +51,10 @@ extern "C" __global__ void fmag_all_update(complex* f, ((fmag[a * A + b] + fdev[a * A + b] * renorm) / (fdev[a * A + b] + fmag[a * A + b] + 1e-7f)) ; */ - auto fmagv = fmag[a * A + b]; - auto fdevv = fdev[a * A + b]; - float fm = (1.0f - m) + - m * ((fmagv + fdevv * renorm) / (fmagv + fdevv + 1e-7f)); + MATH_TYPE fmagv = fmag[a * A + b]; + MATH_TYPE fdevv = fdev[a * A + b]; + MATH_TYPE fm = (MATH_TYPE(1) - m) + + m * ((fmagv + fdevv * renorm) / (fmagv + fdevv + MATH_TYPE(1e-7))); f[a * A + b] *= fm; } } diff --git a/ptypy/accelerate/cuda_pycuda/cuda/fourier_error.cu b/ptypy/accelerate/cuda_pycuda/cuda/fourier_error.cu index 7998e094c..ad483c870 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/fourier_error.cu +++ b/ptypy/accelerate/cuda_pycuda/cuda/fourier_error.cu @@ -1,3 +1,12 @@ +/** fourier_error. 
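These kernels all walk the address book with addr_stride = 15: each pod owns five 3-int entries, read through the pa / oa / ea / da / ma pointers at offsets 0, 3, 6, 9 and 12 (probe, object, exit wave, diffraction data, mask), and the first int of each entry is used as the storage layer index, as in err_fmag[da[0]] above. A small decoding sketch follows, assuming that reading; PodAddr, decode_addr and gather_data_layers are illustrative names, not PTYPY code.

    struct PodAddr
    {
      const int* pa;   // probe entry            at offset 0
      const int* oa;   // object entry           at offset 3
      const int* ea;   // exit-wave entry        at offset 6
      const int* da;   // diffraction-data entry at offset 9
      const int* ma;   // mask entry             at offset 12
    };

    __device__ inline PodAddr decode_addr(const int* __restrict__ addr, int pod)
    {
      const int addr_stride = 15;                 // 5 entries of 3 ints per pod
      const int* base = addr + pod * addr_stride;
      PodAddr a = { base, base + 3, base + 6, base + 9, base + 12 };
      return a;
    }

    extern "C" __global__ void gather_data_layers(const int* __restrict__ addr,
                                                  int* __restrict__ layers,
                                                  int num_pods)
    {
      int i = threadIdx.x + blockIdx.x * blockDim.x;
      if (i >= num_pods)
        return;
      PodAddr a = decode_addr(addr, i);
      layers[i] = a.da[0];   // first int of each entry is the storage layer index
    }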
+ * + * Data types: + * - IN_TYPE: the data type for the inputs (float or double) + * - OUT_TYPE: the data type for the outputs (float or double) + * - MATH_TYPE: the data type used for computation + */ + + #include #include #include @@ -11,12 +20,12 @@ using thrust::complex; // (guided by profiler) extern "C" __global__ void __launch_bounds__(1024, 2) fourier_error(int nmodes, - complex *f, - const float *fmask, - const float *fmag, - float *fdev, - float *ferr, - const float *mask_sum, + const complex *f, + const IN_TYPE *fmask, + const IN_TYPE *fmag, + OUT_TYPE *fdev, + OUT_TYPE *ferr, + const IN_TYPE *mask_sum, const int *addr, int A, int B) @@ -39,15 +48,16 @@ extern "C" __global__ void __launch_bounds__(1024, 2) { for (int b = tx; b < B; b += blockDim.x) { - float acc = 0.0; + MATH_TYPE acc = MATH_TYPE(0); for (int idx = 0; idx < nmodes; ++idx) { - float abs_exit_wave = abs(f[a * B + b + idx * A * B]); + complex t_f = f[a * B + b + idx * A * B]; + MATH_TYPE abs_exit_wave = abs(t_f); acc += abs_exit_wave * abs_exit_wave; // if we do this manually (real*real +imag*imag) // we get differences to numpy due to rounding } - auto fdevv = sqrt(acc) - fmag[a * B + b]; + auto fdevv = sqrt(acc) - MATH_TYPE(fmag[a * B + b]); ferr[a * B + b] = (fmask[a * B + b] * fdevv * fdevv) / mask_sum[ma[0]]; fdev[a * B + b] = fdevv; } diff --git a/ptypy/accelerate/cuda_pycuda/cuda/full_reduce.cu b/ptypy/accelerate/cuda_pycuda/cuda/full_reduce.cu index 3fe6ac8a5..801204aaa 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/full_reduce.cu +++ b/ptypy/accelerate/cuda_pycuda/cuda/full_reduce.cu @@ -1,16 +1,25 @@ +/** full_reduce kernel. + * + * Data types: + * - IN_TYPE: the data type for the inputs (float or double) + * - OUT_TYPE: the data type for the outputs (float or double - for aux wave) + * - ACC_TYPE: the data type used for internal accumulation + */ + + #include -extern "C" __global__ void full_reduce(const DTYPE* in, DTYPE* out, int size) +extern "C" __global__ void full_reduce(const IN_TYPE* in, OUT_TYPE* out, int size) { assert(gridDim.x == 1); int tx = threadIdx.x; - __shared__ DTYPE smem[BDIM_X]; + __shared__ ACC_TYPE smem[BDIM_X]; - auto sum = DTYPE(); + auto sum = ACC_TYPE(); for (int ix = tx; ix < size; ix += blockDim.x) { - sum = sum + in[ix]; + sum = sum + ACC_TYPE(in[ix]); } smem[tx] = sum; __syncthreads(); @@ -30,6 +39,6 @@ extern "C" __global__ void full_reduce(const DTYPE* in, DTYPE* out, int size) if (tx == 0) { - out[0] = smem[0]; + out[0] = OUT_TYPE(smem[0]); } } \ No newline at end of file diff --git a/ptypy/accelerate/cuda_pycuda/cuda/gd_main.cu b/ptypy/accelerate/cuda_pycuda/cuda/gd_main.cu index 06d73ae88..1ab643c4c 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/gd_main.cu +++ b/ptypy/accelerate/cuda_pycuda/cuda/gd_main.cu @@ -1,11 +1,19 @@ +/** gd_main kernel. 
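The __launch_bounds__(1024, 2) qualifier on fourier_error (and on exit_error and log_likelihood) tells the compiler the kernel never runs with more than 1024 threads per block and should fit at least two such blocks per SM, which caps register use per thread. Below is a minimal sketch of where the qualifier goes; abs_squared is an illustrative kernel and the usual type macros are assumed.

    #include <thrust/complex.h>
    using thrust::complex;

    // the compiler may trade registers for occupancy to honour these bounds
    extern "C" __global__ void __launch_bounds__(1024, 2)
        abs_squared(const complex<IN_TYPE>* __restrict__ in, OUT_TYPE* out, int n)
    {
      int i = threadIdx.x + blockIdx.x * blockDim.x;
      if (i >= n)
        return;
      complex<MATH_TYPE> v = in[i];
      out[i] = OUT_TYPE(v.real() * v.real() + v.imag() * v.imag());
    }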
+ * + * Data types: + * - IN_TYPE: the data type for the inputs (float or double) + * - OUT_TYPE: the data type for the outputs (float or double - for aux wave) + * - MATH_TYPE: the data type used for computation + */ + #include using thrust::complex; -extern "C" __global__ void gd_main(const FTYPE* Imodel, - const FTYPE* I, - const FTYPE* w, - FTYPE* err, - CTYPE* aux, +extern "C" __global__ void gd_main(const IN_TYPE* Imodel, + const IN_TYPE* I, + const IN_TYPE* w, + OUT_TYPE* err, + complex* aux, int z, int modes, int x) @@ -16,8 +24,8 @@ extern "C" __global__ void gd_main(const FTYPE* Imodel, if (iz >= z || ix >= x) return; - auto DI = Imodel[iz * x + ix] - I[iz * x + ix]; - auto tmp = w[iz * x + ix] * DI; + auto DI = MATH_TYPE(Imodel[iz * x + ix]) - MATH_TYPE(I[iz * x + ix]); + auto tmp = MATH_TYPE(w[iz * x + ix]) * MATH_TYPE(DI); err[iz * x + ix] = tmp * DI; // now set this for all modes (promote) diff --git a/ptypy/accelerate/cuda_pycuda/cuda/intens_renorm.cu b/ptypy/accelerate/cuda_pycuda/cuda/intens_renorm.cu index 13f8551b7..60b0db6e7 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/intens_renorm.cu +++ b/ptypy/accelerate/cuda_pycuda/cuda/intens_renorm.cu @@ -1,11 +1,19 @@ +/** intens_renorm - with 2 steps as separate kernels. + * + * Data types: + * - IN_TYPE: the data type for the inputs (float or double) + * - OUT_TYPE: the data type for the outputs (float or double) + * - MATH_TYPE: the data type used for computation + */ + #include using thrust::complex; -extern "C" __global__ void step1(const FTYPE* Imodel, - const FTYPE* I, - const FTYPE* w, - FTYPE* num, - FTYPE* den, +extern "C" __global__ void step1(const IN_TYPE* Imodel, + const IN_TYPE* I, + const IN_TYPE* w, + OUT_TYPE* num, + OUT_TYPE* den, int z, int x) { @@ -15,14 +23,14 @@ extern "C" __global__ void step1(const FTYPE* Imodel, if (iz >= z || ix >= x) return; - auto tmp = w[iz * x + ix] * Imodel[iz * x + ix]; - num[iz * x + ix] = tmp * I[iz * x + ix]; - den[iz * x + ix] = tmp * Imodel[iz * x + ix]; + auto tmp = MATH_TYPE(w[iz * x + ix]) * MATH_TYPE(Imodel[iz * x + ix]); + num[iz * x + ix] = tmp * MATH_TYPE(I[iz * x + ix]); + den[iz * x + ix] = tmp * MATH_TYPE(Imodel[iz * x + ix]); } -extern "C" __global__ void step2(const FTYPE* fic_tmp, - FTYPE* fic, - FTYPE* Imodel, +extern "C" __global__ void step2(const IN_TYPE* fic_tmp, + OUT_TYPE* fic, + OUT_TYPE* Imodel, int z, int x) { @@ -32,7 +40,7 @@ extern "C" __global__ void step2(const FTYPE* fic_tmp, if (iz >= z || ix >= x) return; //probably not so clever having all threads read from the same locations - auto tmp = fic[iz] / fic_tmp[iz]; + auto tmp = MATH_TYPE(fic[iz]) / MATH_TYPE(fic_tmp[iz]); Imodel[iz * x + ix] *= tmp; // race condition if write is not restricted to one thread // learned this the hard way diff --git a/ptypy/accelerate/cuda_pycuda/cuda/log_likelihood.cu b/ptypy/accelerate/cuda_pycuda/cuda/log_likelihood.cu index e538dd725..684099150 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/log_likelihood.cu +++ b/ptypy/accelerate/cuda_pycuda/cuda/log_likelihood.cu @@ -1,3 +1,11 @@ +/** log_likelihood kernel. 
+ * + * Data types: + * - IN_TYPE: the data type for the inputs (float or double) + * - OUT_TYPE: the data type for the outputs (float or double) + * - MATH_TYPE: the data type used for computation + */ + #include #include #include @@ -11,13 +19,13 @@ using thrust::complex; // (guided by profiler) extern "C" __global__ void __launch_bounds__(1024, 2) log_likelihood(int nmodes, - complex *aux, - const float *fmask, - const float *fmag, - const int *addr, - float *llerr, - int A, - int B) + complex *aux, + const IN_TYPE *fmask, + const IN_TYPE *fmag, + const int *addr, + IN_TYPE *llerr, + int A, + int B) { int tx = threadIdx.x; int ty = threadIdx.y; @@ -31,22 +39,24 @@ extern "C" __global__ void __launch_bounds__(1024, 2) fmag += da[0] * A * B; fmask += ma[0] * A * B; llerr += da[0] * A * B; - float norm = A * B; + MATH_TYPE norm = A * B; for (int a = ty; a < A; a += blockDim.y) { for (int b = tx; b < B; b += blockDim.x) { - float acc = 0.0; + MATH_TYPE acc = 0.0; for (int idx = 0; idx < nmodes; ++idx) { - float abs_exit_wave = abs(aux[a * B + b + idx * A * B]); + complex t_aux = aux[a * B + b + idx * A * B]; + MATH_TYPE abs_exit_wave = abs(t_aux); acc += abs_exit_wave * abs_exit_wave; // if we do this manually (real*real +imag*imag) // we get differences to numpy due to rounding } - auto I = fmag[a * B + b] * fmag[a * B + b]; - llerr[a * B + b] = fmask[a * B + b] * (acc - I) * (acc - I) / (I + 1) / norm; + auto I = MATH_TYPE(fmag[a * B + b]) * MATH_TYPE(fmag[a * B + b]); + llerr[a * B + b] = + MATH_TYPE(fmask[a * B + b]) * (acc - I) * (acc - I) / (I + 1) / norm; } } } diff --git a/ptypy/accelerate/cuda_pycuda/cuda/make_a012.cu b/ptypy/accelerate/cuda_pycuda/cuda/make_a012.cu index e86d900f5..23798c35c 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/make_a012.cu +++ b/ptypy/accelerate/cuda_pycuda/cuda/make_a012.cu @@ -1,14 +1,23 @@ +/** fmag_all_update. 
+ * + * Data types: + * - IN_TYPE: the data type for the inputs (float or double) + * - OUT_TYPE: the data type for the outputs (float or double) + * - MATH_TYPE: the data type used for computation + * - ACC_TYPE: data type used for accumulation + */ + #include using thrust::complex; -extern "C" __global__ void make_a012(const CTYPE* f, - const CTYPE* a, - const CTYPE* b, - const FTYPE* I, - const FTYPE* fic, - FTYPE* A0, - FTYPE* A1, - FTYPE* A2, +extern "C" __global__ void make_a012(const complex* f, + const complex* a, + const complex* b, + const IN_TYPE* I, + const IN_TYPE* fic, + OUT_TYPE* A0, + OUT_TYPE* A1, + OUT_TYPE* A2, int z, int y, int x, @@ -22,37 +31,37 @@ extern "C" __global__ void make_a012(const CTYPE* f, if (iz >= maxz) { - A0[iz * x + ix] = FTYPE(0); // make sure it's the right type (double/float) - A1[iz * x + ix] = FTYPE(0); - A2[iz * x + ix] = FTYPE(0); + A0[iz * x + ix] = OUT_TYPE(0); // make sure it's the right type (double/float) + A1[iz * x + ix] = OUT_TYPE(0); + A2[iz * x + ix] = OUT_TYPE(0); return; } // we sum across y directly, as this is the number of modes, // which is typically small - auto sumtf0 = FTYPE(0); - auto sumtf1 = FTYPE(0); - auto sumtf2 = FTYPE(0); + auto sumtf0 = ACC_TYPE(0); + auto sumtf1 = ACC_TYPE(0); + auto sumtf2 = ACC_TYPE(0); for (auto iy = 0; iy < y; ++iy) { - auto fv = f[iz * y * x + iy * x + ix]; + complex fv = f[iz * y * x + iy * x + ix]; sumtf0 += fv.real() * fv.real() + fv.imag() * fv.imag(); - auto av = a[iz * y * x + iy * x + ix]; + complex av = a[iz * y * x + iy * x + ix]; // 2 * real(f * conj(a)) - sumtf1 += FTYPE(2) * (fv.real() * av.real() + fv.imag() * av.imag()); + sumtf1 += MATH_TYPE(2) * (fv.real() * av.real() + fv.imag() * av.imag()); // use FTYPE(2) to make sure double creeps into a float calculation // as 2.0 * would make everything double. - auto bv = b[iz * y * x + iy * x + ix]; + complex bv = b[iz * y * x + iy * x + ix]; // 2 * real(f * conj(b)) + abs(a)^2 - sumtf2 += FTYPE(2) * (fv.real() * bv.real() + fv.imag() * bv.imag()) + + sumtf2 += MATH_TYPE(2) * (fv.real() * bv.real() + fv.imag() * bv.imag()) + (av.real() * av.real() + av.imag() * av.imag()); } - auto Iv = I[iz * x + ix]; - auto ficv = fic[iz]; - A0[iz * x + ix] = sumtf0 * ficv - Iv; - A1[iz * x + ix] = sumtf1 * ficv; - A2[iz * x + ix] = sumtf2 * ficv; + MATH_TYPE Iv = I[iz * x + ix]; + MATH_TYPE ficv = fic[iz]; + A0[iz * x + ix] = OUT_TYPE(sumtf0 * ficv - Iv); + A1[iz * x + ix] = OUT_TYPE(sumtf1 * ficv); + A2[iz * x + ix] = OUT_TYPE(sumtf2 * ficv); } \ No newline at end of file diff --git a/ptypy/accelerate/cuda_pycuda/cuda/make_model.cu b/ptypy/accelerate/cuda_pycuda/cuda/make_model.cu index 0f8380d71..22bf7d4ab 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/make_model.cu +++ b/ptypy/accelerate/cuda_pycuda/cuda/make_model.cu @@ -1,8 +1,16 @@ +/** make_model - with 2 steps as separate kernels. 
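The MATH_TYPE(2) literals in make_a012 (and the equivalent in fill_b) are deliberate: a bare 2.0 is a double constant, so 2.0 * x silently promotes a single-precision expression to double. A small sketch of the difference, assuming the file is compiled with -DMATH_TYPE=float; both helpers are illustrative only.

    // stays in MATH_TYPE: with MATH_TYPE == float, no double arithmetic is emitted
    __device__ inline MATH_TYPE twice_ab_single(MATH_TYPE a, MATH_TYPE b)
    {
      return MATH_TYPE(2) * a * b;
    }

    // 2.0 is a double literal: a and b are promoted, the product is evaluated
    // in double and only truncated back to MATH_TYPE on return
    __device__ inline MATH_TYPE twice_ab_promoted(MATH_TYPE a, MATH_TYPE b)
    {
      return 2.0 * a * b;
    }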
+ * + * Data types: + * - IN_TYPE: the data type for the inputs (float or double) + * - OUT_TYPE: the data type for the outputs (float or double) + * - MATH_TYPE: the data type used for computation + */ + #include using thrust::complex; extern "C" __global__ void make_model( - const CTYPE* in, FTYPE* out, int z, int y, int x) + const complex* in, OUT_TYPE* out, int z, int y, int x) { int ix = threadIdx.x + blockIdx.x * blockDim.x; int iz = blockIdx.z; @@ -12,11 +20,11 @@ extern "C" __global__ void make_model( // we sum accross y directly, as this is the number of modes, // which is typically small - auto sum = FTYPE(); + auto sum = MATH_TYPE(); for (auto iy = 0; iy < y; ++iy) { - auto v = in[iz * y * x + iy * x + ix]; + complex v = in[iz * y * x + iy * x + ix]; sum += v.real() * v.real() + v.imag() * v.imag(); } - out[iz * x + ix] = sum; + out[iz * x + ix] = OUT_TYPE(sum); } \ No newline at end of file diff --git a/ptypy/accelerate/cuda_pycuda/cuda/ob_update.cu b/ptypy/accelerate/cuda_pycuda/cuda/ob_update.cu index c2cf2fd22..57c69848d 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/ob_update.cu +++ b/ptypy/accelerate/cuda_pycuda/cuda/ob_update.cu @@ -1,24 +1,40 @@ +/** ob_update. + * + * Data types: + * - IN_TYPE: the data type for the inputs (float or double) + * - OUT_TYPE: the data type for the outputs (float or double) + * - MATH_TYPE: the data type used for computation + * - DENOM_TYPE: data type for the denominator (double,float,complex,complex) + */ + #include using thrust::complex; template -__device__ inline void atomicAdd(complex* x, complex y) +__device__ inline void atomicAdd(complex* x, const complex& y) { auto xf = reinterpret_cast(x); atomicAdd(xf, y.real()); atomicAdd(xf + 1, y.imag()); } +// return a pointer to the real part of the argument +template +__device__ inline T* get_denom_real_ptr(complex* den) +{ + return reinterpret_cast(den); +} + extern "C" __global__ void ob_update( - const complex* __restrict__ exit_wave, + const complex* __restrict__ exit_wave, int A, int B, int C, - const complex* __restrict__ probe, + const complex* __restrict__ probe, int D, int E, int F, - complex* obj, + complex* obj, int G, int H, int I, @@ -46,12 +62,16 @@ extern "C" __global__ void ob_update( { for (int c = tx; c < C; c += blockDim.x) { - auto probe_val = probe[b * F + c]; - atomicAdd(&obj[b * I + c], conj(probe_val) * exit_wave[b * C + c]); - auto denomreal = reinterpret_cast(&denominator[b * I + c]); + complex probe_val = probe[b * F + c]; + complex exit_val = exit_wave[b * C + c]; + auto add_val_m = conj(probe_val) * exit_val; + complex add_val = add_val_m; + atomicAdd(&obj[b * I + c], add_val); + + auto denomreal_ptr = get_denom_real_ptr(&denominator[b * I + c]); auto upd_probe = probe_val.real() * probe_val.real() + probe_val.imag() * probe_val.imag(); - atomicAdd(denomreal, upd_probe); + atomicAdd(denomreal_ptr, upd_probe); } } } diff --git a/ptypy/accelerate/cuda_pycuda/cuda/ob_update2.cu b/ptypy/accelerate/cuda_pycuda/cuda/ob_update2.cu index 1f9c5b573..7c41c0231 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/ob_update2.cu +++ b/ptypy/accelerate/cuda_pycuda/cuda/ob_update2.cu @@ -1,3 +1,21 @@ +/** ob_update. 
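The atomicAdd overload used by ob_update (and build_exit) issues two scalar atomics on the real and imaginary halves of a thrust::complex. That is safe here because every contribution is a plain addition, so interleaving with other threads cannot change the final sum, and the denominator only ever accumulates into the real half of its buffer. A condensed sketch of the same pattern, with illustrative names (atomicAddComplex, scatter_accumulate) and the usual type macros assumed; note that atomicAdd on double requires compute capability 6.0 or newer.

    #include <thrust/complex.h>
    using thrust::complex;

    // two scalar atomics: the halves may interleave with other threads' updates,
    // but since every update is an addition the accumulated value is still exact
    __device__ inline void atomicAddComplex(complex<OUT_TYPE>* x,
                                            const complex<OUT_TYPE>& y)
    {
      auto xf = reinterpret_cast<OUT_TYPE*>(x);
      atomicAdd(xf, y.real());
      atomicAdd(xf + 1, y.imag());
    }

    extern "C" __global__ void scatter_accumulate(const complex<IN_TYPE>* __restrict__ src,
                                                  const int* __restrict__ dest_idx,
                                                  complex<OUT_TYPE>* dest,
                                                  int n)
    {
      int i = threadIdx.x + blockIdx.x * blockDim.x;
      if (i >= n)
        return;
      complex<MATH_TYPE> v = src[i];                       // storage -> math precision
      atomicAddComplex(&dest[dest_idx[i]], complex<OUT_TYPE>(v));
    }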
+ * + * Data types: + * - IN_TYPE: the data type for the inputs (float or double) + * - OUT_TYPE: the data type for the outputs (float or double) + * - MATH_TYPE: the data type used for computation + * - ACC_TYPE: accumulator type for the local ob accumulation + * - DENOM_TYPE: type for the denominator (can be real/complex float/double) + * + * NOTE: This version of ob_update goes over all tiles that need to be accumulated + * in a single thread block to avoid global atomic additions (as in ob_update.cu). + * This requires a local array of NUM_MODES size to store the local updates. + * GPU registers per thread are limited (255 32bit registers on V100), + * and at some point the registers will spill into shared or global memory + * and the kernel will get considerably slower. + */ + + #include #include using thrust::complex; @@ -8,13 +26,13 @@ using thrust::complex; #define obj_roi_row(k) addr[4 * num_pods + (k)] #define obj_roi_column(k) addr[5 * num_pods + (k)] -template -__device__ inline void set_real(complex& v, T r) +template +__device__ inline void set_real(complex& v, U r) { - v.real(r); + v.real(T(r)); } -template -__device__ inline void set_real(T& v, T r) +template +__device__ inline void set_real(T& v, U r) { v = r; } @@ -29,6 +47,7 @@ __device__ inline T get_real(const T& v) return v; } + extern "C" __global__ void ob_update2( int pr_sh, int ob_modes, @@ -38,18 +57,18 @@ extern "C" __global__ void ob_update2( int ex_0, int ex_1, int ex_2, - complex* ob_g, + complex* ob_g, DENOM_TYPE* obn_g, - const complex* __restrict__ pr_g, // 2, 5, 5 - const complex* __restrict__ ex_g, // 16, 5, 5 + const complex* __restrict__ pr_g, // 2, 5, 5 + const complex* __restrict__ ex_g, // 16, 5, 5 const int* addr) { int y = blockIdx.y * BDIM_Y + threadIdx.y; int dy = ob_sh; int z = blockIdx.x * BDIM_X + threadIdx.x; int dz = ob_sh; - complex ob[NUM_MODES]; - DENOM_TYPE obn[NUM_MODES]; + complex ob[NUM_MODES]; + ACC_TYPE obn[NUM_MODES]; int txy = threadIdx.y * BDIM_X + threadIdx.x; assert(ob_modes <= NUM_MODES); @@ -62,7 +81,7 @@ extern "C" __global__ void ob_update2( auto idx = i * dy * dz + y * dz + z; assert(idx < ob_modes * ob_sh * ob_sh); ob[i] = ob_g[idx]; - obn[i] = obn_g[idx]; + obn[i] = get_real(obn_g[idx]); } } @@ -105,16 +124,16 @@ extern "C" __global__ void ob_update2( { auto pridx = ad[0] * pr_sh * pr_sh + v1 * pr_sh + v2; assert(pridx < pr_modes * pr_sh * pr_sh); - auto pr = pr_g[pridx]; + complex pr = pr_g[pridx]; int idx = ad[2]; assert(idx < NUM_MODES); auto cpr = conj(pr); auto exidx = ad[1] * pr_sh * pr_sh + v1 * pr_sh + v2; assert(exidx < ex_0 * ex_1 * ex_2); - ob[idx] += cpr * ex_g[exidx]; - auto rr = get_real(obn[idx]); - rr += pr.real() * pr.real() + pr.imag() * pr.imag(); - set_real(obn[idx], rr); + complex t_ex_g = ex_g[exidx]; + complex add_val = cpr * t_ex_g; + ob[idx] += add_val; + obn[idx] += pr.real() * pr.real() + pr.imag() * pr.imag(); } } } @@ -124,7 +143,7 @@ extern "C" __global__ void ob_update2( for (int i = 0; i < NUM_MODES; ++i) { ob_g[i * dy * dz + y * dz + z] = ob[i]; - obn_g[i * dy * dz + y * dz + z] = obn[i]; + set_real(obn_g[i * dy * dz + y * dz + z], obn[i]); } } } diff --git a/ptypy/accelerate/cuda_pycuda/cuda/ob_update2_ML.cu b/ptypy/accelerate/cuda_pycuda/cuda/ob_update2_ML.cu index 56d088788..484912ddc 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/ob_update2_ML.cu +++ b/ptypy/accelerate/cuda_pycuda/cuda/ob_update2_ML.cu @@ -1,3 +1,20 @@ +/** ob_update. 
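ob_update2 removes the global atomics of ob_update by making each thread own one object pixel and accumulate every mode's contribution in a small per-thread array of size NUM_MODES; as its header warns, that array must stay small or it spills out of fast storage and the kernel slows down. Below is a stripped-down sketch of that accumulation structure, assuming the usual macros plus a compile-time NUM_MODES; accumulate_modes and its flat array layouts are illustrative, whereas the real kernel walks the address list and tile geometry instead.

    #include <thrust/complex.h>
    using thrust::complex;

    extern "C" __global__ void accumulate_modes(const complex<IN_TYPE>* __restrict__ contrib,  // [num_items, npixels]
                                                const int* __restrict__ mode_of_item,          // [num_items]
                                                complex<OUT_TYPE>* field,                      // [NUM_MODES, npixels]
                                                int num_items,
                                                int npixels)
    {
      int p = threadIdx.x + blockIdx.x * blockDim.x;
      if (p >= npixels)
        return;

      // per-thread accumulators, one per mode; NUM_MODES must stay small or
      // this array spills into slower local memory
      complex<MATH_TYPE> acc[NUM_MODES];
      for (int m = 0; m < NUM_MODES; ++m)
        acc[m] = field[m * npixels + p];          // start from the current field

      for (int i = 0; i < num_items; ++i)
      {
        complex<MATH_TYPE> v = contrib[i * npixels + p];
        acc[mode_of_item[i]] += v;                // no atomics: this thread owns pixel p
      }

      for (int m = 0; m < NUM_MODES; ++m)
        field[m * npixels + p] = complex<OUT_TYPE>(acc[m]);   // single plain write per mode
    }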
+ * + * Data types: + * - IN_TYPE: the data type for the inputs (float or double) + * - OUT_TYPE: the data type for the outputs (float or double) + * - MATH_TYPE: the data type used for computation + * - ACC_TYPE: accumulator for the ob field + * + * NOTE: This version of ob_update goes over all tiles that need to be accumulated + * in a single thread block to avoid global atomic additions (as in ob_update_ML.cu). + * This requires a local array of NUM_MODES size to store the local updates. + * GPU registers per thread are limited (255 32bit registers on V100), + * and at some point the registers will spill into shared or global memory + * and the kernel will get considerably slower. + */ + + #include #include using thrust::complex; @@ -16,17 +33,19 @@ extern "C" __global__ void ob_update2_ML(int pr_sh, int ex_0, int ex_1, int ex_2, - CTYPE* ob_g, - const CTYPE* __restrict__ pr_g, - const CTYPE* __restrict__ ex_g, + complex* ob_g, + const complex* __restrict__ pr_g, + const complex* __restrict__ ex_g, const int* addr, - FTYPE fac) + IN_TYPE fac_) { int y = blockIdx.y * BDIM_Y + threadIdx.y; int dy = ob_sh; int z = blockIdx.x * BDIM_X + threadIdx.x; int dz = ob_sh; - CTYPE ob[NUM_MODES]; + MATH_TYPE fac = fac_; + complex ob[NUM_MODES]; + int txy = threadIdx.y * BDIM_X + threadIdx.x; assert(ob_modes <= NUM_MODES); @@ -81,13 +100,15 @@ extern "C" __global__ void ob_update2_ML(int pr_sh, { auto pridx = ad[0] * pr_sh * pr_sh + v1 * pr_sh + v2; assert(pridx < pr_modes * pr_sh * pr_sh); - auto pr = pr_g[pridx]; + complex pr = pr_g[pridx]; int idx = ad[2]; assert(idx < NUM_MODES); auto cpr = conj(pr); auto exidx = ad[1] * pr_sh * pr_sh + v1 * pr_sh + v2; assert(exidx < ex_0 * ex_1 * ex_2); - ob[idx] += cpr * ex_g[exidx] * fac; + complex t_ex_g = ex_g[exidx]; + complex add_val = cpr * t_ex_g * fac; + ob[idx] += add_val; } } } diff --git a/ptypy/accelerate/cuda_pycuda/cuda/ob_update_ML.cu b/ptypy/accelerate/cuda_pycuda/cuda/ob_update_ML.cu index c6aa9ca11..84e678ebb 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/ob_update_ML.cu +++ b/ptypy/accelerate/cuda_pycuda/cuda/ob_update_ML.cu @@ -1,8 +1,16 @@ +/** ob_update_ML. 
+ * + * Data types: + * - IN_TYPE: the data type for the inputs (float or double) + * - OUT_TYPE: the data type for the outputs (float or double) + * - MATH_TYPE: the data type used for computation + */ + #include using thrust::complex; template -__device__ inline void atomicAdd(complex* x, complex y) +__device__ inline void atomicAdd(complex* x, const complex& y) { auto xf = reinterpret_cast(x); atomicAdd(xf, y.real()); @@ -11,25 +19,26 @@ __device__ inline void atomicAdd(complex* x, complex y) extern "C" { - __global__ void ob_update_ML(const CTYPE* __restrict__ exit_wave, + __global__ void ob_update_ML(const complex* __restrict__ exit_wave, int A, int B, int C, - const CTYPE* __restrict__ probe, + const complex* __restrict__ probe, int D, int E, int F, - CTYPE* obj, + complex* obj, int G, int H, int I, const int* __restrict__ addr, - FTYPE fac) + IN_TYPE fac_) { const int bid = blockIdx.x; const int tx = threadIdx.x; const int ty = threadIdx.y; const int addr_stride = 15; + MATH_TYPE fac = fac_; const int* oa = addr + 3 + bid * addr_stride; const int* pa = addr + bid * addr_stride; @@ -46,9 +55,12 @@ extern "C" { for (int c = tx; c < C; c += blockDim.x) { - auto probe_val = probe[b * F + c]; - atomicAdd(&obj[b * I + c], - conj(probe_val) * exit_wave[b * C + c] * fac); + complex probe_val = probe[b * F + c]; + complex exit_val = exit_wave[b * C + c]; + complex add_val_m = conj(probe_val) * exit_val * fac; + complex add_val(add_val_m); + + atomicAdd(&obj[b * I + c], add_val); } } } diff --git a/ptypy/accelerate/cuda_pycuda/cuda/pr_update.cu b/ptypy/accelerate/cuda_pycuda/cuda/pr_update.cu index 13a6c72b1..bbabdb2f1 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/pr_update.cu +++ b/ptypy/accelerate/cuda_pycuda/cuda/pr_update.cu @@ -1,24 +1,40 @@ +/** pr_update. 
+ * + * Data types: + * - IN_TYPE: the data type for the inputs (float or double) + * - OUT_TYPE: the data type for the outputs (float or double) + * - MATH_TYPE: the data type used for computation + * - DENOM_TYPE: type of the denominator (real/complex, float/double) + */ + #include using thrust::complex; template -__device__ inline void atomicAdd(complex* x, complex y) +__device__ inline void atomicAdd(complex* x, const complex& y) { auto xf = reinterpret_cast(x); atomicAdd(xf, y.real()); atomicAdd(xf + 1, y.imag()); } +// return a pointer to the real part of the argument +template +__device__ inline T* get_denom_real_ptr(complex* den) +{ + return reinterpret_cast(den); +} + extern "C" __global__ void pr_update( - const complex* __restrict__ exit_wave, + const complex* __restrict__ exit_wave, int A, int B, int C, - complex* probe, + complex* probe, int D, int E, int F, - const complex* __restrict__ obj, + const complex* __restrict__ obj, int G, int H, int I, @@ -48,10 +64,13 @@ extern "C" __global__ void pr_update( { for (int c = tx; c < C; c += blockDim.x) { - auto obj_val = obj[b * I + c]; - atomicAdd(&probe[b * F + c], conj(obj_val) * exit_wave[b * C + c]); - auto denomreal = reinterpret_cast(&denominator[b * F + c]); - auto upd_obj = + complex obj_val = obj[b * I + c]; + complex exit_val = exit_wave[b * C + c]; + complex add_val_m = conj(obj_val) * exit_val; + complex add_val = add_val_m; + atomicAdd(&probe[b * F + c], add_val); + auto denomreal = get_denom_real_ptr(&denominator[b * F + c]); + MATH_TYPE upd_obj = obj_val.real() * obj_val.real() + obj_val.imag() * obj_val.imag(); atomicAdd(denomreal, upd_obj); } diff --git a/ptypy/accelerate/cuda_pycuda/cuda/pr_update2.cu b/ptypy/accelerate/cuda_pycuda/cuda/pr_update2.cu index 1361cb18d..1c2aa8f50 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/pr_update2.cu +++ b/ptypy/accelerate/cuda_pycuda/cuda/pr_update2.cu @@ -1,3 +1,20 @@ +/** pr_update. + * + * Data types: + * - IN_TYPE: the data type for the inputs (float or double) + * - OUT_TYPE: the data type for the outputs (float or double) + * - MATH_TYPE: the data type used for computation + * - DENOM_TYPE: data type for the denominator (double,float,complex,complex) + * - ACC_TYPE: accumulator type for local pr array + * + * NOTE: This version of ob_update goes over all tiles that need to be accumulated + * in a single thread block to avoid global atomic additions (as in pr_update.cu). + * This requires a local array of NUM_MODES size to store the local updates. + * GPU registers per thread are limited (255 32bit registers on V100), + * and at some point the registers will spill into shared or global memory + * and the kernel will get considerably slower. 
+ */ + #include #include using thrust::complex; @@ -10,14 +27,14 @@ using thrust::complex; #define obj_roi_row(k) addr[4 * num_pods + (k)] #define obj_roi_column(k) addr[5 * num_pods + (k)] -template -__device__ inline void set_real(complex& v, T r) +template +__device__ inline void set_real(complex& v, U r) { - v.real(r); + v.real(T(r)); } -template -__device__ inline void set_real(T& v, T r) +template +__device__ inline void set_real(T& v, U r) { v = r; } @@ -40,18 +57,18 @@ extern "C" __global__ void pr_update2(int pr_sh, int pr_modes, int ob_modes, int num_pods, - complex* pr_g, + complex* pr_g, DENOM_TYPE* prn_g, - const complex* __restrict__ ob_g, - const complex* __restrict__ ex_g, + const complex* __restrict__ ob_g, + const complex* __restrict__ ex_g, const int* addr) { int y = blockIdx.y * BDIM_Y + threadIdx.y; int dy = pr_sh; int z = blockIdx.x * BDIM_X + threadIdx.x; int dz = pr_sh; - complex pr[NUM_MODES]; - DENOM_TYPE prn[NUM_MODES]; + complex pr[NUM_MODES]; + ACC_TYPE prn[NUM_MODES]; int txy = threadIdx.y * BDIM_X + threadIdx.x; assert(pr_modes <= NUM_MODES); @@ -64,7 +81,7 @@ extern "C" __global__ void pr_update2(int pr_sh, auto idx = i * dy * dz + y * dz + z; assert(idx < pr_modes * pr_sh * pr_sh); pr[i] = pr_g[idx]; - prn[i] = prn_g[idx]; + prn[i] = get_real(prn_g[idx]); } } @@ -107,15 +124,15 @@ extern "C" __global__ void pr_update2(int pr_sh, { auto obidx = ad[2] * ob_sh_row * ob_sh_col + v1 * ob_sh_col + v2; assert(obidx < ob_modes * ob_sh_row * ob_sh_col); - auto ob = ob_g[obidx]; + complex ob = ob_g[obidx]; int idx = ad[0]; assert(idx < NUM_MODES); auto cob = conj(ob); - pr[idx] += cob * ex_g[ad[1] * pr_sh * pr_sh + y * pr_sh + z]; - auto rr = get_real(prn[idx]); - rr += ob.real() * ob.real() + ob.imag() * ob.imag(); - set_real(prn[idx], rr); + complex ex_val = ex_g[ad[1] * pr_sh * pr_sh + y * pr_sh + z]; + complex add_val = cob * ex_val; + pr[idx] += add_val; + prn[idx] += ob.real() * ob.real() + ob.imag() * ob.imag(); } } } @@ -125,7 +142,7 @@ extern "C" __global__ void pr_update2(int pr_sh, for (int i = 0; i < NUM_MODES; ++i) { pr_g[i * dy * dz + y * dz + z] = pr[i]; - prn_g[i * dy * dz + y * dz + z] = prn[i]; + set_real(prn_g[i * dy * dz + y * dz + z], prn[i]); } } } diff --git a/ptypy/accelerate/cuda_pycuda/cuda/pr_update2_ML.cu b/ptypy/accelerate/cuda_pycuda/cuda/pr_update2_ML.cu index 696682e97..8a45891c5 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/pr_update2_ML.cu +++ b/ptypy/accelerate/cuda_pycuda/cuda/pr_update2_ML.cu @@ -1,3 +1,19 @@ +/** pr_update. + * + * Data types: + * - IN_TYPE: the data type for the inputs (float or double) + * - OUT_TYPE: the data type for the outputs (float or double) + * - MATH_TYPE: the data type used for computation + * - ACC_TYPE: accumulator type for local pr array + * + * NOTE: This version of ob_update goes over all tiles that need to be accumulated + * in a single thread block to avoid global atomic additions (as in pr_update_ML.cu). + * This requires a local array of NUM_MODES size to store the local updates. + * GPU registers per thread are limited (255 32bit registers on V100), + * and at some point the registers will spill into shared or global memory + * and the kernel will get considerably slower. 
+ */ + #include #include using thrust::complex; @@ -16,17 +32,18 @@ extern "C" __global__ void pr_update2_ML(int pr_sh, int pr_modes, int ob_modes, int num_pods, - CTYPE* pr_g, - const CTYPE* __restrict__ ob_g, - const CTYPE* __restrict__ ex_g, + complex* pr_g, + const complex* __restrict__ ob_g, + const complex* __restrict__ ex_g, const int* addr, - FTYPE fac) + IN_TYPE fac_) { int y = blockIdx.y * BDIM_Y + threadIdx.y; int dy = pr_sh; int z = blockIdx.x * BDIM_X + threadIdx.x; int dz = pr_sh; - CTYPE pr[NUM_MODES]; + MATH_TYPE fac = fac_; + complex pr[NUM_MODES]; int txy = threadIdx.y * BDIM_X + threadIdx.x; assert(pr_modes <= NUM_MODES); @@ -81,12 +98,15 @@ extern "C" __global__ void pr_update2_ML(int pr_sh, { auto obidx = ad[2] * ob_sh_row * ob_sh_col + v1 * ob_sh_col + v2; assert(obidx < ob_modes * ob_sh_row * ob_sh_col); - auto ob = ob_g[obidx]; + complex ob = ob_g[obidx]; int idx = ad[0]; assert(idx < NUM_MODES); auto cob = conj(ob); - pr[idx] += cob * ex_g[ad[1] * pr_sh * pr_sh + y * pr_sh + z] * fac; + complex ex_val = ex_g[ad[1] * pr_sh * pr_sh + y * pr_sh + z]; + complex add_val_m = cob * ex_val * fac; + complex add_val = add_val_m; + pr[idx] += add_val; } } } diff --git a/ptypy/accelerate/cuda_pycuda/cuda/pr_update_ML.cu b/ptypy/accelerate/cuda_pycuda/cuda/pr_update_ML.cu index 156e6d198..3fa24137d 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/pr_update_ML.cu +++ b/ptypy/accelerate/cuda_pycuda/cuda/pr_update_ML.cu @@ -1,28 +1,37 @@ +/** pr_update_ML. + * + * Data types: + * - IN_TYPE: the data type for the inputs (float or double) + * - OUT_TYPE: the data type for the outputs (float or double) + * - MATH_TYPE: the data type used for computation + */ + + #include using thrust::complex; template -__device__ inline void atomicAdd(complex* x, complex y) +__device__ inline void atomicAdd(complex* x, const complex& y) { auto xf = reinterpret_cast(x); atomicAdd(xf, y.real()); atomicAdd(xf + 1, y.imag()); } -extern "C" __global__ void pr_update_ML(const CTYPE* __restrict__ exit_wave, +extern "C" __global__ void pr_update_ML(const complex* __restrict__ exit_wave, int A, int B, int C, - CTYPE* probe, + complex* probe, int D, int E, int F, - const CTYPE* __restrict__ obj, + const complex* __restrict__ obj, int G, int H, int I, const int* __restrict__ addr, - FTYPE fac) + IN_TYPE fac_) { assert(B == E); // prsh[1] assert(C == F); // prsh[2] @@ -30,6 +39,7 @@ extern "C" __global__ void pr_update_ML(const CTYPE* __restrict__ exit_wave, const int tx = threadIdx.x; const int ty = threadIdx.y; const int addr_stride = 15; + MATH_TYPE fac = fac_; const int* oa = addr + 3 + bid * addr_stride; const int* pa = addr + bid * addr_stride; @@ -46,8 +56,11 @@ extern "C" __global__ void pr_update_ML(const CTYPE* __restrict__ exit_wave, { for (int c = tx; c < C; c += blockDim.x) { - auto obj_val = obj[b * I + c]; - atomicAdd(&probe[b * F + c], conj(obj_val) * exit_wave[b * C + c] * fac); + complex obj_val = obj[b * I + c]; + complex exit_val = exit_wave[b * C + c]; + complex add_val_m = conj(obj_val) * exit_val * fac; + complex add_val = add_val_m; + atomicAdd(&probe[b * F + c], add_val); } } } diff --git a/ptypy/accelerate/cuda_pycuda/cuda/transpose.cu b/ptypy/accelerate/cuda_pycuda/cuda/transpose.cu index a460727a4..8de4e7ad7 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/transpose.cu +++ b/ptypy/accelerate/cuda_pycuda/cuda/transpose.cu @@ -5,6 +5,11 @@ * and shared memory access has no bank conflicts. 
*/ +/** + * Data types: + * - DTYPE - any pod type + */ + #include using thrust::complex; diff --git a/ptypy/accelerate/cuda_pycuda/cuda/update_addr_error_state.cu b/ptypy/accelerate/cuda_pycuda/cuda/update_addr_error_state.cu index 2e6d21059..1220a0986 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/update_addr_error_state.cu +++ b/ptypy/accelerate/cuda_pycuda/cuda/update_addr_error_state.cu @@ -1,11 +1,18 @@ +/** update_addr_error_state kernel. + * + * Data types: + * - IN_TYPE: the data type for the inputs (float or double) + * - OUT_TYPE: the data type for the outputs (float or double) + */ + #include #include using thrust::complex; -extern "C" __global__ void update_addr_error_state(int* addr, - const int* mangled_addr, - float* error_state, - const float* error_sum, +extern "C" __global__ void update_addr_error_state(int* __restrict addr, + const int* __restrict mangled_addr, + OUT_TYPE* error_state, + const IN_TYPE* __restrict error_sum, int nmodes) { int tx = threadIdx.x; @@ -23,7 +30,7 @@ extern "C" __global__ void update_addr_error_state(int* addr, if (err_sum < err_st) { - for (int i = tx; i < nmodes * 15; i += blockDim.x) + for (int i = tx, e = nmodes * 15; i < e; i += blockDim.x) { addr[i] = mangled_addr[i]; } diff --git a/ptypy/accelerate/cuda_pycuda/cufft.py b/ptypy/accelerate/cuda_pycuda/cufft.py index 89c2c650b..605e90d43 100644 --- a/ptypy/accelerate/cuda_pycuda/cufft.py +++ b/ptypy/accelerate/cuda_pycuda/cufft.py @@ -75,14 +75,30 @@ def queue(self, queue): cufftlib.cufftSetStream(self.plan.handle, queue.handle) def _load(self, array, pre_fft, post_fft, symmetric, forward): + assert(array.dtype in [np.complex64, np.complex128]) + assert(pre_fft.dtype in [np.complex64, np.complex128] if pre_fft is not None else True) + assert(post_fft.dtype in [np.complex64, np.complex128] if post_fft is not None else True) + + math_type = 'float' if array.dtype == np.complex64 else 'double' + if pre_fft is not None: + math_type = 'float' if pre_fft.dtype == np.complex64 else 'double' self.pre_fft_knl = load_kernel("batched_multiply", { 'MPY_DO_SCALE': 'false', - 'MPY_DO_FILT': 'true' + 'MPY_DO_FILT': 'true', + 'IN_TYPE': 'float' if array.dtype == np.complex64 else 'double', + 'OUT_TYPE': 'float' if array.dtype == np.complex64 else 'double', + 'MATH_TYPE': math_type }) if pre_fft is not None else None + math_type = 'float' if array.dtype == np.complex64 else 'double' + if post_fft is not None: + math_type = 'float' if post_fft.dtype == np.complex64 else 'double' self.post_fft_knl = load_kernel("batched_multiply", { 'MPY_DO_SCALE': 'true' if (not forward and not symmetric) or symmetric else 'false', - 'MPY_DO_FILT': 'true' if post_fft is not None else 'false' + 'MPY_DO_FILT': 'true' if post_fft is not None else 'false', + 'IN_TYPE': 'float' if array.dtype == np.complex64 else 'double', + 'OUT_TYPE': 'float' if array.dtype == np.complex64 else 'double', + 'MATH_TYPE': math_type }) if (not (forward and not symmetric) or post_fft is not None) else None self.block = (32, 32, 1) diff --git a/ptypy/accelerate/cuda_pycuda/kernels.py b/ptypy/accelerate/cuda_pycuda/kernels.py index 9064ab593..072768d7b 100644 --- a/ptypy/accelerate/cuda_pycuda/kernels.py +++ b/ptypy/accelerate/cuda_pycuda/kernels.py @@ -92,16 +92,43 @@ def queue(self, queue): class FourierUpdateKernel(ab.FourierUpdateKernel): - def __init__(self, aux, nmodes=1, queue_thread=None): + def __init__(self, aux, nmodes=1, queue_thread=None, accumulate_type='float', math_type='float'): super(FourierUpdateKernel, self).__init__(aux, 
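On the Python side, cufft.py now picks IN_TYPE / OUT_TYPE from the array dtype and MATH_TYPE from the pre/post-FFT filter dtype when specializing batched_multiply. The batched_multiply source itself is not part of this excerpt; the kernel below (filter_scale) is a hypothetical stand-in that only illustrates why the filter's precision drives MATH_TYPE while the array dtype drives the storage types.

    #include <thrust/complex.h>
    using thrust::complex;

    extern "C" __global__ void filter_scale(const complex<IN_TYPE>* __restrict__ in,
                                            complex<OUT_TYPE>* out,
                                            const complex<MATH_TYPE>* __restrict__ filt,
                                            MATH_TYPE scale,
                                            int rows, int cols)
    {
      int x = threadIdx.x + blockIdx.x * blockDim.x;
      int y = threadIdx.y + blockIdx.y * blockDim.y;
      int batch = blockIdx.z;
      if (x >= cols || y >= rows)
        return;
      int idx = batch * rows * cols + y * cols + x;
      complex<MATH_TYPE> v = in[idx];                       // storage -> filter precision
      out[idx] = complex<OUT_TYPE>(v * filt[y * cols + x] * scale);
    }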
nmodes=nmodes) + + if accumulate_type not in ['float', 'double']: + raise ValueError('Only float or double types are supported') + if math_type not in ['float', 'double']: + raise ValueError('Only float or double types are supported') + self.accumulate_type = accumulate_type + self.math_type = math_type self.queue = queue_thread - self.fmag_all_update_cuda = load_kernel("fmag_all_update") - self.fourier_error_cuda = load_kernel("fourier_error") + self.fmag_all_update_cuda = load_kernel("fmag_all_update", { + 'IN_TYPE': 'float', + 'OUT_TYPE': 'float', + 'MATH_TYPE': self.math_type + }) + self.fourier_error_cuda = load_kernel("fourier_error", { + 'IN_TYPE': 'float', + 'OUT_TYPE': 'float', + 'MATH_TYPE': self.math_type + }) self.fourier_error2_cuda = None - self.error_reduce_cuda = load_kernel("error_reduce") + self.error_reduce_cuda = load_kernel("error_reduce", { + 'IN_TYPE': 'float', + 'OUT_TYPE': 'float', + 'ACC_TYPE': self.accumulate_type + }) self.fourier_update_cuda = None - self.log_likelihood_cuda = load_kernel("log_likelihood") - self.exit_error_cuda = load_kernel("exit_error") + self.log_likelihood_cuda = load_kernel("log_likelihood", { + 'IN_TYPE': 'float', + 'OUT_TYPE': 'float', + 'MATH_TYPE': self.math_type + }) + self.exit_error_cuda = load_kernel("exit_error", { + 'IN_TYPE': 'float', + 'OUT_TYPE': 'float', + 'MATH_TYPE': self.math_type + }) self.gpu = Adict() self.gpu.fdev = None @@ -261,17 +288,29 @@ def execute(self, kernel_name=None, compare=False, sync=False): class AuxiliaryWaveKernel(ab.AuxiliaryWaveKernel): - def __init__(self, queue_thread=None): + def __init__(self, queue_thread=None, math_type = 'float'): super(AuxiliaryWaveKernel, self).__init__() # and now initialise the cuda self.queue = queue_thread self._ob_shape = None self._ob_id = None - self.build_aux_cuda = load_kernel("build_aux") - self.build_exit_cuda = load_kernel("build_exit") + self.math_type = math_type + if math_type not in ['float', 'double']: + raise ValueError('Only double or float math is supported') + self.build_aux_cuda = load_kernel("build_aux", { + 'IN_TYPE': 'float', + 'OUT_TYPE': 'float', + 'MATH_TYPE': self.math_type + }) + self.build_exit_cuda = load_kernel("build_exit", { + 'IN_TYPE': 'float', + 'OUT_TYPE': 'float', + 'MATH_TYPE': self.math_type + }) self.build_aux_no_ex_cuda = load_kernel("build_aux_no_ex", { - 'CTYPE': 'complex', - 'FTYPE': 'float' + 'IN_TYPE': 'float', + 'OUT_TYPE': 'float', + 'MATH_TYPE': self.math_type }) # DEPRECATED? 
@@ -298,7 +337,7 @@ def build_aux(self, b_aux, addr, ob, pr, ex, alpha=1.0): ob, obr, obc, addr, - np.float32(alpha), + np.float32(alpha) if ex.dtype == np.complex64 else np.float64(alpha), block=(32, 32, 1), grid=(int(ex.shape[0]), 1, 1), stream=self.queue) def build_exit(self, b_aux, addr, ob, pr, ex): @@ -327,7 +366,7 @@ def build_aux_no_ex(self, b_aux, addr, ob, pr, fac=1.0, add=False): ob, obr, obc, addr, - np.float32(fac), + np.float32(fac) if pr.dtype == np.complex64 else np.float64(fac), np.int32(add), block=(32, 32, 1), grid=(int(maxz * nmodes), 1, 1), @@ -345,25 +384,43 @@ def _cache_object_shape(self, ob): class GradientDescentKernel(ab.GradientDescentKernel): - def __init__(self, aux, nmodes=1, queue=None): + def __init__(self, aux, nmodes=1, queue=None, accumulate_type = 'double', math_type='float'): super().__init__(aux, nmodes) self.queue = queue - + self.accumulate_type = accumulate_type + self.math_type = math_type + if (accumulate_type not in ['double', 'float']) or (math_type not in ['double', 'float']): + raise ValueError("accumulate and math types must be double for float") + self.gpu = Adict() self.gpu.LLden = None self.gpu.LLerr = None self.gpu.Imodel = None subs = { - 'CTYPE': 'complex' if self.ctype == np.complex64 else 'complex', - 'FTYPE': 'float' if self.ftype == np.float32 else 'double' + 'IN_TYPE': 'float' if self.ftype == np.float32 else 'double', + 'OUT_TYPE': 'float' if self.ftype == np.float32 else 'double', + 'ACC_TYPE': self.accumulate_type, + 'MATH_TYPE': self.math_type } self.make_model_cuda = load_kernel('make_model', subs) self.make_a012_cuda = load_kernel('make_a012', subs) - self.error_reduce_cuda = load_kernel('error_reduce', subs) - self.fill_b_cuda = load_kernel('fill_b', {**subs, 'BDIM_X': 1024}) + self.error_reduce_cuda = load_kernel('error_reduce', { + **subs, + 'OUT_TYPE': 'float' if self.ftype == np.float32 else 'double' + }) + self.fill_b_cuda = load_kernel('fill_b', { + **subs, + 'BDIM_X': 1024, + 'OUT_TYPE': self.accumulate_type + }) self.fill_b_reduce_cuda = load_kernel( - 'fill_b_reduce', {**subs, 'BDIM_X': 1024}) + 'fill_b_reduce', { + **subs, + 'BDIM_X': 1024, + 'IN_TYPE': self.accumulate_type, # must match out-type of fill_b + 'OUT_TYPE': 'float' if self.ftype == np.float32 else 'double' + }) self.main_cuda = load_kernel('gd_main', subs) self.floating_intensity_cuda_step1 = load_kernel('step1', subs,'intens_renorm.cu') self.floating_intensity_cuda_step2 = load_kernel('step2', subs,'intens_renorm.cu') @@ -377,7 +434,7 @@ def allocate(self): # temporary array for the reduction in fill_b sh = (3, int((np.prod(self.fshape)*self.nmodes + 1023) // 1024)) - self.gpu.Btmp = gpuarray.zeros(sh, dtype=np.float64) + self.gpu.Btmp = gpuarray.zeros(sh, dtype=np.float64 if self.accumulate_type == 'double' else np.float32) def make_model(self, b_aux, addr): # reference shape @@ -542,33 +599,53 @@ def main(self, b_aux, addr, w, I): class PoUpdateKernel(ab.PoUpdateKernel): - def __init__(self, queue_thread=None, denom_type=np.complex64): + def __init__(self, queue_thread=None, denom_type=np.complex64, + math_type='float', accumulator_type='float'): super(PoUpdateKernel, self).__init__() # and now initialise the cuda if denom_type == np.complex64: dtype = 'complex' elif denom_type == np.float32: dtype = 'float' + elif denom_type == np.complex128: + dtype = 'complex' + elif denom_type == np.float64: + dtype = 'double' else: - raise ValueError('only complex64 and float32 types supported') + raise ValueError('invalid type for denominator') + if math_type 
not in ['double', 'float']: + raise ValueError('only float and double are supported for math_type') + if accumulator_type not in ['double', 'float']: + raise ValueError('only float and double are supported for accumulator_type') + + self.math_type = math_type + self.accumulator_type = accumulator_type self.dtype = dtype self.queue = queue_thread self.ob_update_cuda = load_kernel("ob_update", { - 'DENOM_TYPE': dtype + 'DENOM_TYPE': dtype, + 'IN_TYPE': 'float', + 'OUT_TYPE': 'float', + 'MATH_TYPE': self.math_type }) self.ob_update2_cuda = None # load_kernel("ob_update2") self.pr_update_cuda = load_kernel("pr_update", { - 'DENOM_TYPE': dtype + 'DENOM_TYPE': dtype, + 'IN_TYPE': 'float', + 'OUT_TYPE': 'float', + 'MATH_TYPE': self.math_type }) self.pr_update2_cuda = None self.ob_update_ML_cuda = load_kernel("ob_update_ML", { - 'CTYPE': 'complex', - 'FTYPE': 'float' + 'IN_TYPE': 'float', + 'OUT_TYPE': 'float', + 'MATH_TYPE': self.math_type }) self.ob_update2_ML_cuda = None self.pr_update_ML_cuda = load_kernel("pr_update_ML", { - 'CTYPE': 'complex', - 'FTYPE': 'float' + 'IN_TYPE': 'float', + 'OUT_TYPE': 'float', + 'MATH_TYPE': self.math_type }) self.pr_update2_ML_cuda = None @@ -595,7 +672,11 @@ def ob_update(self, addr, ob, obn, pr, ex, atomics=True): "NUM_MODES": obsh[0], "BDIM_X": 16, "BDIM_Y": 16, - 'DENOM_TYPE': self.dtype + 'DENOM_TYPE': self.dtype, + 'IN_TYPE': 'float', + 'OUT_TYPE': 'float', + 'MATH_TYPE': self.math_type, + 'ACC_TYPE': self.accumulator_type }) grid = [int((x+15)//16) for x in ob.shape[-2:]] @@ -632,7 +713,11 @@ def pr_update(self, addr, pr, prn, ob, ex, atomics=True): "NUM_MODES": prsh[0], "BDIM_X": 16, "BDIM_Y": 16, - 'DENOM_TYPE': self.dtype + 'DENOM_TYPE': self.dtype, + 'IN_TYPE': 'float', + 'OUT_TYPE': 'float', + 'MATH_TYPE': self.math_type, + 'ACC_TYPE': self.accumulator_type }) grid = [int((x+15)//16) for x in pr.shape[-2:]] @@ -667,8 +752,10 @@ def ob_update_ML(self, addr, ob, pr, ex, fac=2.0, atomics=True): "NUM_MODES": obsh[0], "BDIM_X": 16, "BDIM_Y": 16, - 'CTYPE': 'complex', - 'FTYPE': 'float' + 'IN_TYPE': 'float', + 'OUT_TYPE': 'float', + 'MATH_TYPE': self.math_type, + 'ACC_TYPE': self.accumulator_type }) grid = [int((x+15)//16) for x in ob.shape[-2:]] grid = (grid[0], grid[1], int(1)) @@ -702,8 +789,10 @@ def pr_update_ML(self, addr, pr, ob, ex, fac=2.0, atomics=False): "NUM_MODES": prsh[0], "BDIM_X": 16, "BDIM_Y": 16, - 'CTYPE': 'complex', - 'FTYPE': 'float' + 'IN_TYPE': 'float', + 'OUT_TYPE': 'float', + 'MATH_TYPE': self.math_type, + 'ACC_TYPE': self.accumulator_type }) grid = [int((x+15)//16) for x in pr.shape[-2:]] @@ -715,16 +804,38 @@ def pr_update_ML(self, addr, pr, ob, ex, fac=2.0, atomics=False): class PositionCorrectionKernel(ab.PositionCorrectionKernel): - def __init__(self, aux, nmodes, queue_thread=None): + def __init__(self, aux, nmodes, queue_thread=None, math_type='float', accumulate_type='float'): super(PositionCorrectionKernel, self).__init__(aux, nmodes) + if math_type not in ['float', 'double']: + raise ValueError('Only float or double math is supported') + if accumulate_type not in ['float', 'double']: + raise ValueError('Only float or double math is supported') + # add kernels + self.math_type = math_type + self.accumulate_type = accumulate_type self.queue = queue_thread self._ob_shape = None self._ob_id = None - self.fourier_error_cuda = load_kernel("fourier_error") - self.error_reduce_cuda = load_kernel("error_reduce") - self.build_aux_pc_cuda = load_kernel("build_aux_position_correction") - self.update_addr_and_error_state_cuda = 
load_kernel("update_addr_error_state") + self.fourier_error_cuda = load_kernel("fourier_error",{ + 'IN_TYPE': 'float', + 'OUT_TYPE': 'float', + 'MATH_TYPE': self.math_type + }) + self.error_reduce_cuda = load_kernel("error_reduce", { + 'IN_TYPE': 'float', + 'OUT_TYPE': 'float', + 'ACC_TYPE': self.accumulate_type + }) + self.build_aux_pc_cuda = load_kernel("build_aux_position_correction", { + 'IN_TYPE': 'float', + 'OUT_TYPE': 'float', + 'MATH_TYPE': self.math_type + }) + self.update_addr_and_error_state_cuda = load_kernel("update_addr_error_state", { + 'IN_TYPE': 'float', + 'OUT_TYPE': 'float' + }) self.gpu = Adict() self.gpu.fdev = None diff --git a/test/accelerate_tests/cuda_pycuda_tests/fourier_update_kernel_test.py b/test/accelerate_tests/cuda_pycuda_tests/fourier_update_kernel_test.py index dfea1e19b..2650c9ad1 100644 --- a/test/accelerate_tests/cuda_pycuda_tests/fourier_update_kernel_test.py +++ b/test/accelerate_tests/cuda_pycuda_tests/fourier_update_kernel_test.py @@ -109,7 +109,7 @@ def test_fmag_all_update_UNITY(self): nFUK.fmag_all_update(f, addr, fmag, mask, err_fmag, pbound=pbound_set) expected_f = f measured_f = f_d.get() - np.testing.assert_array_equal(expected_f, measured_f, err_msg="Numpy f " + np.testing.assert_allclose(expected_f, measured_f, rtol=1e-6, err_msg="Numpy f " "is \n%s, \nbut gpu f is \n %s, \n mask is:\n %s \n" % (repr(expected_f), repr(measured_f), repr(mask))) @@ -191,7 +191,7 @@ def test_fourier_error_UNITY(self): expected_fdev = nFUK.npy.fdev measured_fdev = FUK.gpu.fdev.get() - np.testing.assert_array_equal(expected_fdev, measured_fdev, err_msg="Numpy fdev " + np.testing.assert_allclose(expected_fdev, measured_fdev, rtol=1e-6, err_msg="Numpy fdev " "is \n%s, \nbut gpu fdev is \n %s, \n " % ( repr(expected_fdev), repr(measured_fdev))) From 7838ce4acb59cfa673b8420ece067c0175a3b369 Mon Sep 17 00:00:00 2001 From: "Benedikt J. 
Daurer" Date: Fri, 5 Mar 2021 12:06:13 +0000 Subject: [PATCH 21/56] Making ob/pr denominator real, tests passing (#295) --- .../accelerate/cuda_pycuda/cuda/ob_update.cu | 6 ++ .../accelerate/cuda_pycuda/cuda/pr_update.cu | 6 ++ .../cuda_pycuda/engines/DM_pycuda.py | 2 +- .../cuda_pycuda/engines/ML_pycuda.py | 2 +- ptypy/accelerate/cuda_pycuda/kernels.py | 25 +++---- .../po_update_kernel_test.py | 72 +++++++++---------- 6 files changed, 58 insertions(+), 55 deletions(-) diff --git a/ptypy/accelerate/cuda_pycuda/cuda/ob_update.cu b/ptypy/accelerate/cuda_pycuda/cuda/ob_update.cu index 57c69848d..20ca11206 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/ob_update.cu +++ b/ptypy/accelerate/cuda_pycuda/cuda/ob_update.cu @@ -25,6 +25,12 @@ __device__ inline T* get_denom_real_ptr(complex* den) return reinterpret_cast(den); } +template +__device__ inline T* get_denom_real_ptr(T* den) +{ + return den; +} + extern "C" __global__ void ob_update( const complex* __restrict__ exit_wave, int A, diff --git a/ptypy/accelerate/cuda_pycuda/cuda/pr_update.cu b/ptypy/accelerate/cuda_pycuda/cuda/pr_update.cu index bbabdb2f1..5b082cd0f 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/pr_update.cu +++ b/ptypy/accelerate/cuda_pycuda/cuda/pr_update.cu @@ -25,6 +25,12 @@ __device__ inline T* get_denom_real_ptr(complex* den) return reinterpret_cast(den); } +template +__device__ inline T* get_denom_real_ptr(T* den) +{ + return den; +} + extern "C" __global__ void pr_update( const complex* __restrict__ exit_wave, int A, diff --git a/ptypy/accelerate/cuda_pycuda/engines/DM_pycuda.py b/ptypy/accelerate/cuda_pycuda/engines/DM_pycuda.py index 8b7741e38..fd9a6ea34 100644 --- a/ptypy/accelerate/cuda_pycuda/engines/DM_pycuda.py +++ b/ptypy/accelerate/cuda_pycuda/engines/DM_pycuda.py @@ -122,7 +122,7 @@ def _setup_kernels(self): kern.FUK.allocate() logger.info("Setting up PoUpdateKernel") - kern.POK = PoUpdateKernel(queue_thread=self.queue, denom_type=np.float32) + kern.POK = PoUpdateKernel(queue_thread=self.queue, denom_type='float') kern.POK.allocate() logger.info("Setting up AuxiliaryWaveKernel") diff --git a/ptypy/accelerate/cuda_pycuda/engines/ML_pycuda.py b/ptypy/accelerate/cuda_pycuda/engines/ML_pycuda.py index 4112df968..b269d3227 100644 --- a/ptypy/accelerate/cuda_pycuda/engines/ML_pycuda.py +++ b/ptypy/accelerate/cuda_pycuda/engines/ML_pycuda.py @@ -208,7 +208,7 @@ def _setup_kernels(self): kern.GDK = GradientDescentKernel(aux, nmodes, queue=self.queue) kern.GDK.allocate() - kern.POK = PoUpdateKernel(queue_thread=self.queue, denom_type=np.float32) + kern.POK = PoUpdateKernel(queue_thread=self.queue, denom_type='float') kern.POK.allocate() kern.AWK = AuxiliaryWaveKernel(queue_thread=self.queue) diff --git a/ptypy/accelerate/cuda_pycuda/kernels.py b/ptypy/accelerate/cuda_pycuda/kernels.py index 072768d7b..8fd79dd35 100644 --- a/ptypy/accelerate/cuda_pycuda/kernels.py +++ b/ptypy/accelerate/cuda_pycuda/kernels.py @@ -599,38 +599,29 @@ def main(self, b_aux, addr, w, I): class PoUpdateKernel(ab.PoUpdateKernel): - def __init__(self, queue_thread=None, denom_type=np.complex64, + def __init__(self, queue_thread=None, denom_type='float', math_type='float', accumulator_type='float'): super(PoUpdateKernel, self).__init__() # and now initialise the cuda - if denom_type == np.complex64: - dtype = 'complex' - elif denom_type == np.float32: - dtype = 'float' - elif denom_type == np.complex128: - dtype = 'complex' - elif denom_type == np.float64: - dtype = 'double' - else: - raise ValueError('invalid type for denominator') + if 
denom_type not in ['double', 'float']: + raise ValueError('only float and double are supported for denom_type') if math_type not in ['double', 'float']: raise ValueError('only float and double are supported for math_type') if accumulator_type not in ['double', 'float']: raise ValueError('only float and double are supported for accumulator_type') - + self.denom_type = denom_type self.math_type = math_type self.accumulator_type = accumulator_type - self.dtype = dtype self.queue = queue_thread self.ob_update_cuda = load_kernel("ob_update", { - 'DENOM_TYPE': dtype, + 'DENOM_TYPE': self.denom_type, 'IN_TYPE': 'float', 'OUT_TYPE': 'float', 'MATH_TYPE': self.math_type }) self.ob_update2_cuda = None # load_kernel("ob_update2") self.pr_update_cuda = load_kernel("pr_update", { - 'DENOM_TYPE': dtype, + 'DENOM_TYPE': self.denom_type, 'IN_TYPE': 'float', 'OUT_TYPE': 'float', 'MATH_TYPE': self.math_type @@ -672,7 +663,7 @@ def ob_update(self, addr, ob, obn, pr, ex, atomics=True): "NUM_MODES": obsh[0], "BDIM_X": 16, "BDIM_Y": 16, - 'DENOM_TYPE': self.dtype, + 'DENOM_TYPE': self.denom_type, 'IN_TYPE': 'float', 'OUT_TYPE': 'float', 'MATH_TYPE': self.math_type, @@ -713,7 +704,7 @@ def pr_update(self, addr, pr, prn, ob, ex, atomics=True): "NUM_MODES": prsh[0], "BDIM_X": 16, "BDIM_Y": 16, - 'DENOM_TYPE': self.dtype, + 'DENOM_TYPE': self.denom_type, 'IN_TYPE': 'float', 'OUT_TYPE': 'float', 'MATH_TYPE': self.math_type, diff --git a/test/accelerate_tests/cuda_pycuda_tests/po_update_kernel_test.py b/test/accelerate_tests/cuda_pycuda_tests/po_update_kernel_test.py index 81674d610..4cd9a8f8c 100644 --- a/test/accelerate_tests/cuda_pycuda_tests/po_update_kernel_test.py +++ b/test/accelerate_tests/cuda_pycuda_tests/po_update_kernel_test.py @@ -72,11 +72,11 @@ def prepare_arrays(self): object_array_denominator = np.empty_like(object_array, dtype=FLOAT_TYPE) for idx in range(G): - object_array_denominator[idx] = np.ones((H, I)) * (5 * idx + 2) # + 1j * np.ones((H, I)) * (5 * idx + 2) + object_array_denominator[idx] = np.ones((H, I)) * (5 * idx + 2) probe_denominator = np.empty_like(probe, dtype=FLOAT_TYPE) for idx in range(D): - probe_denominator[idx] = np.ones((E, F)) * (5 * idx + 2) # + 1j * np.ones((E, F)) * (5 * idx + 2) + probe_denominator[idx] = np.ones((E, F)) * (5 * idx + 2) return (gpuarray.to_gpu(addr), gpuarray.to_gpu(object_array), @@ -154,9 +154,9 @@ def ob_update_REGRESSION_tester(self, atomics=True): ''' test ''' - object_array_denominator = np.empty_like(object_array) + object_array_denominator = np.empty_like(object_array, dtype=FLOAT_TYPE) for idx in range(G): - object_array_denominator[idx] = np.ones((H, I)) * (5 * idx + 2) + 1j * np.ones((H, I)) * (5 * idx + 2) + object_array_denominator[idx] = np.ones((H, I)) * (5 * idx + 2) POUK = PoUpdateKernel() @@ -204,22 +204,22 @@ def ob_update_REGRESSION_tester(self, atomics=True): np.testing.assert_array_equal(object_array, expected_object_array, err_msg="The object array has not been updated as expected") - expected_object_array_denominator = np.array([[[12.+2.j, 22.+2.j, 22.+2.j, 22.+2.j, 22.+2.j, 12.+2.j, 2.+2.j], - [22.+2.j, 42.+2.j, 42.+2.j, 42.+2.j, 42.+2.j, 22.+2.j, 2.+2.j], - [22.+2.j, 42.+2.j, 42.+2.j, 42.+2.j, 42.+2.j, 22.+2.j, 2.+2.j], - [22.+2.j, 42.+2.j, 42.+2.j, 42.+2.j, 42.+2.j, 22.+2.j, 2.+2.j], - [22.+2.j, 42.+2.j, 42.+2.j, 42.+2.j, 42.+2.j, 22.+2.j, 2.+2.j], - [12.+2.j, 22.+2.j, 22.+2.j, 22.+2.j, 22.+2.j, 12.+2.j, 2.+2.j], - [ 2.+2.j, 2.+2.j, 2.+2.j, 2.+2.j, 2.+2.j, 2.+2.j, 2.+2.j]], + expected_object_array_denominator = np.array([[[12., 
22., 22., 22., 22., 12., 2.], + [22., 42., 42., 42., 42., 22., 2.], + [22., 42., 42., 42., 42., 22., 2.], + [22., 42., 42., 42., 42., 22., 2.], + [22., 42., 42., 42., 42., 22., 2.], + [12., 22., 22., 22., 22., 12., 2.], + [ 2., 2., 2., 2., 2., 2., 2.]], - [[17.+7.j, 27.+7.j, 27.+7.j, 27.+7.j, 27.+7.j, 17.+7.j, 7.+7.j], - [27.+7.j, 47.+7.j, 47.+7.j, 47.+7.j, 47.+7.j, 27.+7.j, 7.+7.j], - [27.+7.j, 47.+7.j, 47.+7.j, 47.+7.j, 47.+7.j, 27.+7.j, 7.+7.j], - [27.+7.j, 47.+7.j, 47.+7.j, 47.+7.j, 47.+7.j, 27.+7.j, 7.+7.j], - [27.+7.j, 47.+7.j, 47.+7.j, 47.+7.j, 47.+7.j, 27.+7.j, 7.+7.j], - [17.+7.j, 27.+7.j, 27.+7.j, 27.+7.j, 27.+7.j, 17.+7.j, 7.+7.j], - [ 7.+7.j, 7.+7.j, 7.+7.j, 7.+7.j, 7.+7.j, 7.+7.j, 7.+7.j]]], - dtype=COMPLEX_TYPE) + [[17., 27., 27., 27., 27., 17., 7.], + [27., 47., 47., 47., 47., 27., 7.], + [27., 47., 47., 47., 47., 27., 7.], + [27., 47., 47., 47., 47., 27., 7.], + [27., 47., 47., 47., 47., 27., 7.], + [17., 27., 27., 27., 27., 17., 7.], + [ 7., 7., 7., 7., 7., 7., 7.]]], + dtype=FLOAT_TYPE) np.testing.assert_array_equal(object_array_denominator_dev.get(), expected_object_array_denominator, @@ -291,9 +291,9 @@ def ob_update_UNITY_tester(self, atomics=True): ''' test ''' - object_array_denominator = np.empty_like(object_array) + object_array_denominator = np.empty_like(object_array, dtype=FLOAT_TYPE) for idx in range(G): - object_array_denominator[idx] = np.ones((H, I)) * (5 * idx + 2) + 1j * np.ones((H, I)) * (5 * idx + 2) + object_array_denominator[idx] = np.ones((H, I)) * (5 * idx + 2) POUK = PoUpdateKernel() @@ -394,9 +394,9 @@ def pr_update_REGRESSION_tester(self, atomics=True): ''' test ''' - probe_denominator = np.empty_like(probe) + probe_denominator = np.empty_like(probe, dtype=FLOAT_TYPE) for idx in range(D): - probe_denominator[idx] = np.ones((E, F)) * (5 * idx + 2) + 1j * np.ones((E, F)) * (5 * idx + 2) + probe_denominator[idx] = np.ones((E, F)) * (5 * idx + 2) POUK = PoUpdateKernel() @@ -438,18 +438,18 @@ def pr_update_REGRESSION_tester(self, atomics=True): np.testing.assert_array_equal(probe_dev.get(), expected_probe, err_msg="The probe has not been updated as expected") - expected_probe_denominator = np.array([[[138.+2.j, 138.+2.j, 138.+2.j, 138.+2.j, 138.+2.j], - [138.+2.j, 138.+2.j, 138.+2.j, 138.+2.j, 138.+2.j], - [138.+2.j, 138.+2.j, 138.+2.j, 138.+2.j, 138.+2.j], - [138.+2.j, 138.+2.j, 138.+2.j, 138.+2.j, 138.+2.j], - [138.+2.j, 138.+2.j, 138.+2.j, 138.+2.j, 138.+2.j]], + expected_probe_denominator = np.array([[[138., 138., 138., 138., 138.], + [138., 138., 138., 138., 138.], + [138., 138., 138., 138., 138.], + [138., 138., 138., 138., 138.], + [138., 138., 138., 138., 138.]], - [[143.+7.j, 143.+7.j, 143.+7.j, 143.+7.j, 143.+7.j], - [143.+7.j, 143.+7.j, 143.+7.j, 143.+7.j, 143.+7.j], - [143.+7.j, 143.+7.j, 143.+7.j, 143.+7.j, 143.+7.j], - [143.+7.j, 143.+7.j, 143.+7.j, 143.+7.j, 143.+7.j], - [143.+7.j, 143.+7.j, 143.+7.j, 143.+7.j, 143.+7.j]]], - dtype=COMPLEX_TYPE) + [[143., 143., 143., 143., 143.], + [143., 143., 143., 143., 143.], + [143., 143., 143., 143., 143.], + [143., 143., 143., 143., 143.], + [143., 143., 143., 143., 143.]]], + dtype=FLOAT_TYPE) np.testing.assert_array_equal(probe_denominator_dev.get(), expected_probe_denominator, err_msg="The probe denominatorhas not been updated as expected") @@ -519,9 +519,9 @@ def pr_update_UNITY_tester(self, atomics=True): ''' test ''' - probe_denominator = np.empty_like(probe) + probe_denominator = np.empty_like(probe, dtype=FLOAT_TYPE) for idx in range(D): - probe_denominator[idx] = np.ones((E, F)) * (5 * 
idx + 2) + 1j * np.ones((E, F)) * (5 * idx + 2) + probe_denominator[idx] = np.ones((E, F)) * (5 * idx + 2) POUK = PoUpdateKernel() from ptypy.accelerate.base.kernels import PoUpdateKernel as npPoUpdateKernel From 924851d78b69cee9fc375e50a9690aace3e54d8f Mon Sep 17 00:00:00 2001 From: Benedikt Daurer Date: Fri, 5 Mar 2021 14:09:20 +0000 Subject: [PATCH 22/56] removed unused code throwing a confusing error --- ptypy/accelerate/cuda_pycuda/engines/ML_pycuda.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/ptypy/accelerate/cuda_pycuda/engines/ML_pycuda.py b/ptypy/accelerate/cuda_pycuda/engines/ML_pycuda.py index b269d3227..b4481a8e7 100644 --- a/ptypy/accelerate/cuda_pycuda/engines/ML_pycuda.py +++ b/ptypy/accelerate/cuda_pycuda/engines/ML_pycuda.py @@ -168,13 +168,6 @@ def _setup_kernels(self): """ Setup kernels, one for each scan. Derive scans from ptycho class """ - - try: - from ptypy.accelerate.cuda_pycuda.cufft import FFT - except: - logger.warning('Unable to import cuFFT version - using Reikna instead') - from ptypy.accelerate.cuda_pycuda.fft import FFT - AUK = ArrayUtilsKernel(queue=self.queue) self._dot_kernel = AUK.dot # get the scans From 49247c1bf79a267379c021458b47f64adb066913 Mon Sep 17 00:00:00 2001 From: Jorg Lotze Date: Fri, 5 Mar 2021 15:57:55 +0000 Subject: [PATCH 23/56] Gpu precision and bugfixes (#296) * fixing bug in DLS test, transferring the wrong data to GPU * investigations / improvements re make_a012 precision errors * fixing explicit type casts * adding an ACC_TYPE to the tiled update kernels * fixing non-atomic ob_update versions for ob dimensions * fixing gradient descent data type specification in test --- .../accelerate/cuda_pycuda/cuda/make_a012.cu | 6 ++--- .../accelerate/cuda_pycuda/cuda/ob_update2.cu | 15 ++++++------ .../cuda_pycuda/cuda/ob_update2_ML.cu | 15 ++++++------ ptypy/accelerate/cuda_pycuda/kernels.py | 21 +++++++++------- .../dls_gradient_descent_kernel_test.py | 19 +++++++++++---- .../dls_tests/dls_po_update_kernel_test.py | 24 ++++++++++++------- 6 files changed, 61 insertions(+), 39 deletions(-) diff --git a/ptypy/accelerate/cuda_pycuda/cuda/make_a012.cu b/ptypy/accelerate/cuda_pycuda/cuda/make_a012.cu index 23798c35c..11ba29f62 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/make_a012.cu +++ b/ptypy/accelerate/cuda_pycuda/cuda/make_a012.cu @@ -61,7 +61,7 @@ extern "C" __global__ void make_a012(const complex* f, MATH_TYPE Iv = I[iz * x + ix]; MATH_TYPE ficv = fic[iz]; - A0[iz * x + ix] = OUT_TYPE(sumtf0 * ficv - Iv); - A1[iz * x + ix] = OUT_TYPE(sumtf1 * ficv); - A2[iz * x + ix] = OUT_TYPE(sumtf2 * ficv); + A0[iz * x + ix] = OUT_TYPE(MATH_TYPE(sumtf0) * ficv - Iv); + A1[iz * x + ix] = OUT_TYPE(MATH_TYPE(sumtf1) * ficv); + A2[iz * x + ix] = OUT_TYPE(MATH_TYPE(sumtf2) * ficv); } \ No newline at end of file diff --git a/ptypy/accelerate/cuda_pycuda/cuda/ob_update2.cu b/ptypy/accelerate/cuda_pycuda/cuda/ob_update2.cu index 7c41c0231..fbca654e6 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/ob_update2.cu +++ b/ptypy/accelerate/cuda_pycuda/cuda/ob_update2.cu @@ -52,7 +52,8 @@ extern "C" __global__ void ob_update2( int pr_sh, int ob_modes, int num_pods, - int ob_sh, + int ob_sh_rows, + int ob_sh_cols, int pr_modes, int ex_0, int ex_1, @@ -64,22 +65,22 @@ extern "C" __global__ void ob_update2( const int* addr) { int y = blockIdx.y * BDIM_Y + threadIdx.y; - int dy = ob_sh; + int dy = ob_sh_rows; int z = blockIdx.x * BDIM_X + threadIdx.x; - int dz = ob_sh; + int dz = ob_sh_cols; complex ob[NUM_MODES]; ACC_TYPE obn[NUM_MODES]; int txy = 
threadIdx.y * BDIM_X + threadIdx.x; assert(ob_modes <= NUM_MODES); - if (y < ob_sh && z < ob_sh) + if (y < dy && z < dz) { #pragma unroll for (int i = 0; i < NUM_MODES; ++i) { auto idx = i * dy * dz + y * dz + z; - assert(idx < ob_modes * ob_sh * ob_sh); + assert(idx < ob_modes * ob_sh_rows * ob_sh_cols); ob[i] = ob_g[idx]; obn[i] = get_real(obn_g[idx]); } @@ -111,7 +112,7 @@ extern "C" __global__ void ob_update2( __syncthreads(); - if (y >= ob_sh || z >= ob_sh) + if (y >= dy || z >= dz) continue; #pragma unroll 4 @@ -138,7 +139,7 @@ extern "C" __global__ void ob_update2( } } - if (y < ob_sh && z < ob_sh) + if (y < dy && z < dz) { for (int i = 0; i < NUM_MODES; ++i) { diff --git a/ptypy/accelerate/cuda_pycuda/cuda/ob_update2_ML.cu b/ptypy/accelerate/cuda_pycuda/cuda/ob_update2_ML.cu index 484912ddc..b62e66006 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/ob_update2_ML.cu +++ b/ptypy/accelerate/cuda_pycuda/cuda/ob_update2_ML.cu @@ -28,7 +28,8 @@ using thrust::complex; extern "C" __global__ void ob_update2_ML(int pr_sh, int ob_modes, int num_pods, - int ob_sh, + int ob_sh_rows, + int ob_sh_cols, int pr_modes, int ex_0, int ex_1, @@ -40,9 +41,9 @@ extern "C" __global__ void ob_update2_ML(int pr_sh, IN_TYPE fac_) { int y = blockIdx.y * BDIM_Y + threadIdx.y; - int dy = ob_sh; + int dy = ob_sh_rows; int z = blockIdx.x * BDIM_X + threadIdx.x; - int dz = ob_sh; + int dz = ob_sh_cols; MATH_TYPE fac = fac_; complex ob[NUM_MODES]; @@ -50,13 +51,13 @@ extern "C" __global__ void ob_update2_ML(int pr_sh, int txy = threadIdx.y * BDIM_X + threadIdx.x; assert(ob_modes <= NUM_MODES); - if (y < ob_sh && z < ob_sh) + if (y < dy && z < dz) { #pragma unroll for (int i = 0; i < NUM_MODES; ++i) { auto idx = i * dy * dz + y * dz + z; - assert(idx < ob_modes * ob_sh * ob_sh); + assert(idx < ob_modes * ob_sh_rows * ob_sh_cols); ob[i] = ob_g[idx]; } } @@ -87,7 +88,7 @@ extern "C" __global__ void ob_update2_ML(int pr_sh, __syncthreads(); - if (y >= ob_sh || z >= ob_sh) + if (y >= dy || z >= dz) continue; #pragma unroll 4 @@ -113,7 +114,7 @@ extern "C" __global__ void ob_update2_ML(int pr_sh, } } - if (y < ob_sh && z < ob_sh) + if (y < dy && z < dz) { for (int i = 0; i < NUM_MODES; ++i) { diff --git a/ptypy/accelerate/cuda_pycuda/kernels.py b/ptypy/accelerate/cuda_pycuda/kernels.py index 8fd79dd35..6f4e60ee2 100644 --- a/ptypy/accelerate/cuda_pycuda/kernels.py +++ b/ptypy/accelerate/cuda_pycuda/kernels.py @@ -671,8 +671,8 @@ def ob_update(self, addr, ob, obn, pr, ex, atomics=True): }) grid = [int((x+15)//16) for x in ob.shape[-2:]] - grid = (grid[0], grid[1], int(1)) - self.ob_update2_cuda(prsh[-1], obsh[0], num_pods, obsh[-2], + grid = (grid[1], grid[0], int(1)) + self.ob_update2_cuda(prsh[-1], obsh[0], num_pods, obsh[-2], obsh[-1], prsh[0], np.int32(ex.shape[0]), np.int32(ex.shape[1]), @@ -721,17 +721,18 @@ def pr_update(self, addr, pr, prn, ob, ex, atomics=True): def ob_update_ML(self, addr, ob, pr, ex, fac=2.0, atomics=True): obsh = [np.int32(ax) for ax in ob.shape] prsh = [np.int32(ax) for ax in pr.shape] + exsh = [np.int32(ax) for ax in ex.shape] if atomics: if addr.shape[3] != 3 or addr.shape[2] != 5: raise ValueError('Address not in required shape for tiled ob_update') num_pods = np.int32(addr.shape[0] * addr.shape[1]) - self.ob_update_ML_cuda(ex, num_pods, prsh[1], prsh[2], + self.ob_update_ML_cuda(ex, num_pods, exsh[1], exsh[2], pr, prsh[0], prsh[1], prsh[2], ob, obsh[0], obsh[1], obsh[2], addr, - np.float32(fac), + np.float32(fac) if ex.dtype == np.complex64 else np.float64(fac), block=(32, 32, 1), 
grid=(int(num_pods), 1, 1), stream=self.queue) else: if addr.shape[0] != 5 or addr.shape[1] != 3: @@ -749,13 +750,14 @@ def ob_update_ML(self, addr, ob, pr, ex, fac=2.0, atomics=True): 'ACC_TYPE': self.accumulator_type }) grid = [int((x+15)//16) for x in ob.shape[-2:]] - grid = (grid[0], grid[1], int(1)) - self.ob_update2_ML_cuda(prsh[-1], obsh[0], num_pods, obsh[-2], + grid = (grid[1], grid[0], int(1)) + self.ob_update2_ML_cuda(prsh[-1], obsh[0], num_pods, obsh[-2], obsh[-1], prsh[0], np.int32(ex.shape[0]), np.int32(ex.shape[1]), np.int32(ex.shape[2]), - ob, pr, ex, addr, np.float32(fac), + ob, pr, ex, addr, + np.float32(fac) if ex.dtype == np.complex64 else np.float64(fac), block=(16, 16, 1), grid=grid, stream=self.queue) def pr_update_ML(self, addr, pr, ob, ex, fac=2.0, atomics=False): @@ -769,7 +771,7 @@ def pr_update_ML(self, addr, pr, ob, ex, fac=2.0, atomics=False): pr, prsh[0], prsh[1], prsh[2], ob, obsh[0], obsh[1], obsh[2], addr, - np.float32(fac), + np.float32(fac) if ex.dtype == np.complex64 else np.float64(fac), block=(32, 32, 1), grid=(int(num_pods), 1, 1), stream=self.queue) else: if addr.shape[0] != 5 or addr.shape[1] != 3: @@ -790,7 +792,8 @@ def pr_update_ML(self, addr, pr, ob, ex, fac=2.0, atomics=False): grid = (grid[0], grid[1], int(1)) self.pr_update2_ML_cuda(prsh[-1], obsh[-2], obsh[-1], prsh[0], obsh[0], num_pods, - pr, ob, ex, addr, np.float32(fac), + pr, ob, ex, addr, + np.float32(fac) if ex.dtype == np.complex64 else np.float64(fac), block=(16, 16, 1), grid=grid, stream=self.queue) diff --git a/test/accelerate_tests/cuda_pycuda_tests/dls_tests/dls_gradient_descent_kernel_test.py b/test/accelerate_tests/cuda_pycuda_tests/dls_tests/dls_gradient_descent_kernel_test.py index c37febd0f..f02a1c94a 100644 --- a/test/accelerate_tests/cuda_pycuda_tests/dls_tests/dls_gradient_descent_kernel_test.py +++ b/test/accelerate_tests/cuda_pycuda_tests/dls_tests/dls_gradient_descent_kernel_test.py @@ -162,11 +162,22 @@ def test_make_a012_UNITY(self, name, iter): # Copy data to device aux_dev = gpuarray.to_gpu(aux) addr_dev = gpuarray.to_gpu(addr) - I_dev = gpuarray.to_gpu(addr) + I_dev = gpuarray.to_gpu(I) f_dev = gpuarray.to_gpu(f) a_dev = gpuarray.to_gpu(a) b_dev = gpuarray.to_gpu(b) fic_dev = gpuarray.to_gpu(fic) + + # double versions + # aux_dbl = aux.astype(np.complex128) + # I_dbl = I.astype(np.float64) + # f_dbl = f.astype(np.complex128) + # a_dbl = a.astype(np.complex128) + # b_dbl = b.astype(np.complex128) + # fic_dbl = fic.astype(np.float64) + # BGDK = BaseGradientDescentKernel(aux_dbl, addr.shape[1]) + # BGDK.allocate() + # BGDK.make_a012(f_dbl, a_dbl, b_dbl, addr, I_dbl, fic_dbl) # CPU Kernel BGDK = BaseGradientDescentKernel(aux, addr.shape[1]) @@ -182,11 +193,11 @@ def test_make_a012_UNITY(self, name, iter): GDK.make_a012(f_dev, a_dev, b_dev, addr_dev, I_dev, fic_dev) ## Assert - np.testing.assert_allclose(BGDK.npy.Imodel, GDK.gpu.Imodel.get(), atol=self.atol, rtol=self.rtol, + np.testing.assert_allclose(GDK.gpu.Imodel.get(), BGDK.npy.Imodel, atol=self.atol, rtol=self.rtol, err_msg="Imodel error has not been updated as expected") - np.testing.assert_allclose(BGDK.npy.LLerr, GDK.gpu.LLerr.get(), atol=self.atol, rtol=self.rtol, + np.testing.assert_allclose(GDK.gpu.LLerr.get(), BGDK.npy.LLerr, atol=self.atol, rtol=self.rtol, err_msg="LLerr error has not been updated as expected") - np.testing.assert_allclose(BGDK.npy.LLden, GDK.gpu.LLden.get(), atol=self.atol, rtol=self.rtol, + np.testing.assert_allclose(GDK.gpu.LLden.get(), BGDK.npy.LLden, atol=self.atol, rtol=self.rtol, 
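
The `np.float32(fac) if ex.dtype == np.complex64 else np.float64(fac)` pattern used in the kernel launches above is there because PyCUDA packs scalar arguments according to their NumPy dtype, so the scalar width has to match the precision the kernel was compiled for. A minimal sketch of the idea (the helper name is illustrative only, not part of ptypy):

    import numpy as np

    def scalar_for_kernel(fac, exit_wave_dtype):
        # single-precision exit waves take a 32-bit scalar, double precision a
        # 64-bit one, mirroring the inline conditionals in the launches above
        return np.float32(fac) if exit_wave_dtype == np.complex64 else np.float64(fac)

    ex = np.zeros((4, 32, 32), dtype=np.complex64)
    print(type(scalar_for_kernel(2.0, ex.dtype)))   # <class 'numpy.float32'>
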
err_msg="LLden error has not been updated as expected") @parameterized.expand([ diff --git a/test/accelerate_tests/cuda_pycuda_tests/dls_tests/dls_po_update_kernel_test.py b/test/accelerate_tests/cuda_pycuda_tests/dls_tests/dls_po_update_kernel_test.py index b045d01f4..3b8ee0474 100644 --- a/test/accelerate_tests/cuda_pycuda_tests/dls_tests/dls_po_update_kernel_test.py +++ b/test/accelerate_tests/cuda_pycuda_tests/dls_tests/dls_po_update_kernel_test.py @@ -24,11 +24,14 @@ class DlsPoUpdateKernelTest(PyCudaTest): atol = 1e-6 @parameterized.expand([ - ["base", 10], - ["regul", 50], - ["floating", 0], + ["base", 10, False], + ["regul", 50, False], + ["floating", 0, False], + ["base", 10, True], + ["regul", 50, True], + ["floating", 0, True], ]) - def test_op_update_ml_UNITY(self, name, iter, atomics=False): + def test_op_update_ml_UNITY(self, name, iter, atomics): # Load data with h5py.File(self.datadir %name + "op_update_ml_%04d.h5" %iter, "r") as f: @@ -58,15 +61,18 @@ def test_op_update_ml_UNITY(self, name, iter, atomics=False): POK.ob_update_ML(addr_dev, obg_dev, pr_dev, aux_dev, atomics=atomics) ## Assert - np.testing.assert_allclose(obg, obg_dev.get(), atol=self.atol, rtol=self.rtol, verbose=False, + np.testing.assert_allclose(obg_dev.get(), obg, atol=self.atol, rtol=self.rtol, verbose=False, err_msg="The object array has not been updated as expected") @parameterized.expand([ - ["base", 10], - ["regul", 50], - ["floating", 0], + ["base", 10, False], + ["regul", 50, False], + ["floating", 0, False], + ["base", 10, True], + ["regul", 50, True], + ["floating", 0, True], ]) - def test_pr_update_ml_UNITY(self, name, iter, atomics=False): + def test_pr_update_ml_UNITY(self, name, iter, atomics): # Load data with h5py.File(self.datadir %name + "pr_update_ml_%04d.h5" %iter, "r") as f: From f5c9e50bdad52188f57401a36bfddf9e552f6307 Mon Sep 17 00:00:00 2001 From: Jorg Lotze Date: Fri, 5 Mar 2021 17:08:18 +0000 Subject: [PATCH 24/56] simplifying ob/pr updates by removing denominator type (#297) --- .../accelerate/cuda_pycuda/cuda/ob_update.cu | 19 ++---------- .../accelerate/cuda_pycuda/cuda/ob_update2.cu | 28 ++--------------- .../accelerate/cuda_pycuda/cuda/pr_update.cu | 27 ++++------------- .../accelerate/cuda_pycuda/cuda/pr_update2.cu | 30 ++----------------- .../cuda_pycuda/engines/DM_pycuda.py | 2 +- .../cuda_pycuda/engines/ML_pycuda.py | 2 +- ptypy/accelerate/cuda_pycuda/kernels.py | 13 ++++---- 7 files changed, 21 insertions(+), 100 deletions(-) diff --git a/ptypy/accelerate/cuda_pycuda/cuda/ob_update.cu b/ptypy/accelerate/cuda_pycuda/cuda/ob_update.cu index 20ca11206..29b993fb0 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/ob_update.cu +++ b/ptypy/accelerate/cuda_pycuda/cuda/ob_update.cu @@ -4,7 +4,6 @@ * - IN_TYPE: the data type for the inputs (float or double) * - OUT_TYPE: the data type for the outputs (float or double) * - MATH_TYPE: the data type used for computation - * - DENOM_TYPE: data type for the denominator (double,float,complex,complex) */ #include @@ -18,19 +17,6 @@ __device__ inline void atomicAdd(complex* x, const complex& y) atomicAdd(xf + 1, y.imag()); } -// return a pointer to the real part of the argument -template -__device__ inline T* get_denom_real_ptr(complex* den) -{ - return reinterpret_cast(den); -} - -template -__device__ inline T* get_denom_real_ptr(T* den) -{ - return den; -} - extern "C" __global__ void ob_update( const complex* __restrict__ exit_wave, int A, @@ -45,7 +31,7 @@ extern "C" __global__ void ob_update( int H, int I, const int* __restrict__ addr, 
- DENOM_TYPE* denominator) + OUT_TYPE* denominator) { const int bid = blockIdx.x; const int tx = threadIdx.x; @@ -74,10 +60,9 @@ extern "C" __global__ void ob_update( complex add_val = add_val_m; atomicAdd(&obj[b * I + c], add_val); - auto denomreal_ptr = get_denom_real_ptr(&denominator[b * I + c]); auto upd_probe = probe_val.real() * probe_val.real() + probe_val.imag() * probe_val.imag(); - atomicAdd(denomreal_ptr, upd_probe); + atomicAdd(&denominator[b * I + c], upd_probe); } } } diff --git a/ptypy/accelerate/cuda_pycuda/cuda/ob_update2.cu b/ptypy/accelerate/cuda_pycuda/cuda/ob_update2.cu index fbca654e6..821c04a6d 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/ob_update2.cu +++ b/ptypy/accelerate/cuda_pycuda/cuda/ob_update2.cu @@ -5,7 +5,6 @@ * - OUT_TYPE: the data type for the outputs (float or double) * - MATH_TYPE: the data type used for computation * - ACC_TYPE: accumulator type for the local ob accumulation - * - DENOM_TYPE: type for the denominator (can be real/complex float/double) * * NOTE: This version of ob_update goes over all tiles that need to be accumulated * in a single thread block to avoid global atomic additions (as in ob_update.cu). @@ -26,27 +25,6 @@ using thrust::complex; #define obj_roi_row(k) addr[4 * num_pods + (k)] #define obj_roi_column(k) addr[5 * num_pods + (k)] -template -__device__ inline void set_real(complex& v, U r) -{ - v.real(T(r)); -} -template -__device__ inline void set_real(T& v, U r) -{ - v = r; -} -template -__device__ inline T get_real(const complex& v) -{ - return v.real(); -} -template -__device__ inline T get_real(const T& v) -{ - return v; -} - extern "C" __global__ void ob_update2( int pr_sh, @@ -59,7 +37,7 @@ extern "C" __global__ void ob_update2( int ex_1, int ex_2, complex* ob_g, - DENOM_TYPE* obn_g, + OUT_TYPE* obn_g, const complex* __restrict__ pr_g, // 2, 5, 5 const complex* __restrict__ ex_g, // 16, 5, 5 const int* addr) @@ -82,7 +60,7 @@ extern "C" __global__ void ob_update2( auto idx = i * dy * dz + y * dz + z; assert(idx < ob_modes * ob_sh_rows * ob_sh_cols); ob[i] = ob_g[idx]; - obn[i] = get_real(obn_g[idx]); + obn[i] = obn_g[idx]; } } @@ -144,7 +122,7 @@ extern "C" __global__ void ob_update2( for (int i = 0; i < NUM_MODES; ++i) { ob_g[i * dy * dz + y * dz + z] = ob[i]; - set_real(obn_g[i * dy * dz + y * dz + z], obn[i]); + obn_g[i * dy * dz + y * dz + z] = obn[i]; } } } diff --git a/ptypy/accelerate/cuda_pycuda/cuda/pr_update.cu b/ptypy/accelerate/cuda_pycuda/cuda/pr_update.cu index 5b082cd0f..180cf8f14 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/pr_update.cu +++ b/ptypy/accelerate/cuda_pycuda/cuda/pr_update.cu @@ -4,31 +4,17 @@ * - IN_TYPE: the data type for the inputs (float or double) * - OUT_TYPE: the data type for the outputs (float or double) * - MATH_TYPE: the data type used for computation - * - DENOM_TYPE: type of the denominator (real/complex, float/double) */ #include using thrust::complex; -template -__device__ inline void atomicAdd(complex* x, const complex& y) +template +__device__ inline void atomicAdd(complex* x, const complex& y) { auto xf = reinterpret_cast(x); - atomicAdd(xf, y.real()); - atomicAdd(xf + 1, y.imag()); -} - -// return a pointer to the real part of the argument -template -__device__ inline T* get_denom_real_ptr(complex* den) -{ - return reinterpret_cast(den); -} - -template -__device__ inline T* get_denom_real_ptr(T* den) -{ - return den; + atomicAdd(xf, T(y.real())); + atomicAdd(xf + 1, T(y.imag())); } extern "C" __global__ void pr_update( @@ -45,7 +31,7 @@ extern "C" __global__ void 
pr_update( int H, int I, const int* __restrict__ addr, - DENOM_TYPE* denominator) + OUT_TYPE* denominator) { assert(B == E); // prsh[1] assert(C == F); // prsh[2] @@ -75,10 +61,9 @@ extern "C" __global__ void pr_update( complex add_val_m = conj(obj_val) * exit_val; complex add_val = add_val_m; atomicAdd(&probe[b * F + c], add_val); - auto denomreal = get_denom_real_ptr(&denominator[b * F + c]); MATH_TYPE upd_obj = obj_val.real() * obj_val.real() + obj_val.imag() * obj_val.imag(); - atomicAdd(denomreal, upd_obj); + atomicAdd(&denominator[b * F + c], upd_obj); } } } diff --git a/ptypy/accelerate/cuda_pycuda/cuda/pr_update2.cu b/ptypy/accelerate/cuda_pycuda/cuda/pr_update2.cu index 1c2aa8f50..e5417cc01 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/pr_update2.cu +++ b/ptypy/accelerate/cuda_pycuda/cuda/pr_update2.cu @@ -4,7 +4,6 @@ * - IN_TYPE: the data type for the inputs (float or double) * - OUT_TYPE: the data type for the outputs (float or double) * - MATH_TYPE: the data type used for computation - * - DENOM_TYPE: data type for the denominator (double,float,complex,complex) * - ACC_TYPE: accumulator type for local pr array * * NOTE: This version of ob_update goes over all tiles that need to be accumulated @@ -27,29 +26,6 @@ using thrust::complex; #define obj_roi_row(k) addr[4 * num_pods + (k)] #define obj_roi_column(k) addr[5 * num_pods + (k)] -template -__device__ inline void set_real(complex& v, U r) -{ - v.real(T(r)); -} - -template -__device__ inline void set_real(T& v, U r) -{ - v = r; -} - -template -__device__ inline T get_real(const complex& v) -{ - return v.real(); -} - -template -__device__ inline T get_real(const T& v) -{ - return v; -} extern "C" __global__ void pr_update2(int pr_sh, int ob_sh_row, @@ -58,7 +34,7 @@ extern "C" __global__ void pr_update2(int pr_sh, int ob_modes, int num_pods, complex* pr_g, - DENOM_TYPE* prn_g, + OUT_TYPE* prn_g, const complex* __restrict__ ob_g, const complex* __restrict__ ex_g, const int* addr) @@ -81,7 +57,7 @@ extern "C" __global__ void pr_update2(int pr_sh, auto idx = i * dy * dz + y * dz + z; assert(idx < pr_modes * pr_sh * pr_sh); pr[i] = pr_g[idx]; - prn[i] = get_real(prn_g[idx]); + prn[i] = prn_g[idx]; } } @@ -142,7 +118,7 @@ extern "C" __global__ void pr_update2(int pr_sh, for (int i = 0; i < NUM_MODES; ++i) { pr_g[i * dy * dz + y * dz + z] = pr[i]; - set_real(prn_g[i * dy * dz + y * dz + z], prn[i]); + prn_g[i * dy * dz + y * dz + z] = prn[i]; } } } diff --git a/ptypy/accelerate/cuda_pycuda/engines/DM_pycuda.py b/ptypy/accelerate/cuda_pycuda/engines/DM_pycuda.py index fd9a6ea34..1206b887e 100644 --- a/ptypy/accelerate/cuda_pycuda/engines/DM_pycuda.py +++ b/ptypy/accelerate/cuda_pycuda/engines/DM_pycuda.py @@ -122,7 +122,7 @@ def _setup_kernels(self): kern.FUK.allocate() logger.info("Setting up PoUpdateKernel") - kern.POK = PoUpdateKernel(queue_thread=self.queue, denom_type='float') + kern.POK = PoUpdateKernel(queue_thread=self.queue) kern.POK.allocate() logger.info("Setting up AuxiliaryWaveKernel") diff --git a/ptypy/accelerate/cuda_pycuda/engines/ML_pycuda.py b/ptypy/accelerate/cuda_pycuda/engines/ML_pycuda.py index b4481a8e7..0cb1568b9 100644 --- a/ptypy/accelerate/cuda_pycuda/engines/ML_pycuda.py +++ b/ptypy/accelerate/cuda_pycuda/engines/ML_pycuda.py @@ -201,7 +201,7 @@ def _setup_kernels(self): kern.GDK = GradientDescentKernel(aux, nmodes, queue=self.queue) kern.GDK.allocate() - kern.POK = PoUpdateKernel(queue_thread=self.queue, denom_type='float') + kern.POK = PoUpdateKernel(queue_thread=self.queue) kern.POK.allocate() kern.AWK 
= AuxiliaryWaveKernel(queue_thread=self.queue) diff --git a/ptypy/accelerate/cuda_pycuda/kernels.py b/ptypy/accelerate/cuda_pycuda/kernels.py index 6f4e60ee2..1ff4ac00e 100644 --- a/ptypy/accelerate/cuda_pycuda/kernels.py +++ b/ptypy/accelerate/cuda_pycuda/kernels.py @@ -599,29 +599,24 @@ def main(self, b_aux, addr, w, I): class PoUpdateKernel(ab.PoUpdateKernel): - def __init__(self, queue_thread=None, denom_type='float', + def __init__(self, queue_thread=None, math_type='float', accumulator_type='float'): super(PoUpdateKernel, self).__init__() # and now initialise the cuda - if denom_type not in ['double', 'float']: - raise ValueError('only float and double are supported for denom_type') if math_type not in ['double', 'float']: raise ValueError('only float and double are supported for math_type') if accumulator_type not in ['double', 'float']: raise ValueError('only float and double are supported for accumulator_type') - self.denom_type = denom_type self.math_type = math_type self.accumulator_type = accumulator_type self.queue = queue_thread self.ob_update_cuda = load_kernel("ob_update", { - 'DENOM_TYPE': self.denom_type, 'IN_TYPE': 'float', 'OUT_TYPE': 'float', 'MATH_TYPE': self.math_type }) self.ob_update2_cuda = None # load_kernel("ob_update2") self.pr_update_cuda = load_kernel("pr_update", { - 'DENOM_TYPE': self.denom_type, 'IN_TYPE': 'float', 'OUT_TYPE': 'float', 'MATH_TYPE': self.math_type @@ -643,6 +638,8 @@ def __init__(self, queue_thread=None, denom_type='float', def ob_update(self, addr, ob, obn, pr, ex, atomics=True): obsh = [np.int32(ax) for ax in ob.shape] prsh = [np.int32(ax) for ax in pr.shape] + if obn.dtype != np.float32: + raise ValueError("Denominator must be float32 in current implementation") if atomics: if addr.shape[3] != 3 or addr.shape[2] != 5: @@ -663,7 +660,6 @@ def ob_update(self, addr, ob, obn, pr, ex, atomics=True): "NUM_MODES": obsh[0], "BDIM_X": 16, "BDIM_Y": 16, - 'DENOM_TYPE': self.denom_type, 'IN_TYPE': 'float', 'OUT_TYPE': 'float', 'MATH_TYPE': self.math_type, @@ -683,6 +679,8 @@ def ob_update(self, addr, ob, obn, pr, ex, atomics=True): def pr_update(self, addr, pr, prn, ob, ex, atomics=True): obsh = [np.int32(ax) for ax in ob.shape] prsh = [np.int32(ax) for ax in pr.shape] + if prn.dtype != np.float32: + raise ValueError("Denominator must be float32 in current implementation") if atomics: if addr.shape[3] != 3 or addr.shape[2] != 5: raise ValueError('Address not in required shape for atomics pr_update') @@ -704,7 +702,6 @@ def pr_update(self, addr, pr, prn, ob, ex, atomics=True): "NUM_MODES": prsh[0], "BDIM_X": 16, "BDIM_Y": 16, - 'DENOM_TYPE': self.denom_type, 'IN_TYPE': 'float', 'OUT_TYPE': 'float', 'MATH_TYPE': self.math_type, From a3ee838b96c170113f3685740faf491e6cd69f2f Mon Sep 17 00:00:00 2001 From: Benedikt Daurer Date: Mon, 8 Mar 2021 18:30:16 +0000 Subject: [PATCH 25/56] Save Imodel --- ptypy/accelerate/base/engines/ML_serial.py | 1 + 1 file changed, 1 insertion(+) diff --git a/ptypy/accelerate/base/engines/ML_serial.py b/ptypy/accelerate/base/engines/ML_serial.py index 8a2097952..fb359cf23 100644 --- a/ptypy/accelerate/base/engines/ML_serial.py +++ b/ptypy/accelerate/base/engines/ML_serial.py @@ -395,6 +395,7 @@ def new_grad(self): f["addr"] = addr f["I"] = I f["fic"] = fic + f["Imodel"] = GDK.npy.Imodel if self.p.floating_intensities: GDK.floating_intensity(addr, w, I, fic) From 527ce47bb2440543f0a37211a7e3e5962565fc16 Mon Sep 17 00:00:00 2001 From: Bjoern Enders Date: Tue, 9 Mar 2021 04:07:18 -1000 Subject: [PATCH 26/56] Added crop_pad and 
testing (#300) * Added crop_pad and testing * Added GPU tests for crop_pad_simple Co-authored-by: Benedikt Daurer --- ptypy/accelerate/base/array_utils.py | 67 ++++- ptypy/utils/array_utils.py | 269 +++++++++--------- .../base_tests/array_utils_test.py | 159 ++++++----- .../cuda_pycuda_tests/array_utils_test.py | 44 +++ 4 files changed, 341 insertions(+), 198 deletions(-) diff --git a/ptypy/accelerate/base/array_utils.py b/ptypy/accelerate/base/array_utils.py index c2d341711..6a7472c19 100644 --- a/ptypy/accelerate/base/array_utils.py +++ b/ptypy/accelerate/base/array_utils.py @@ -26,6 +26,7 @@ def abs2(input): ''' return np.multiply(input, input.conj()).real + def sum_to_buffer(in1, outshape, in1_addr, out1_addr, dtype): ''' :param in1. An array . Can be inplace. Can be complex or real. @@ -40,6 +41,7 @@ def sum_to_buffer(in1, outshape, in1_addr, out1_addr, dtype): out1[o1[0], o1[1]:(o1[1] + inshape[1]), o1[2]:(o1[2] + inshape[2])] += in1[i1[0]] return out1 + def norm2(input): ''' Input here could be a variety of 1D, 2D, 3D complex or real. all will be single precision at least. @@ -47,17 +49,20 @@ def norm2(input): ''' return np.sum(abs2(input)) + def complex_gaussian_filter(input, mfs): ''' takes 2D and 3D arrays. Complex input, complex output. mfs has len 02: + if len(mfs) > 2: raise NotImplementedError("Only batches of 2D arrays allowed!") if input.ndim == 3: mfs = np.insert(mfs, 0, 0) - return (ndi.gaussian_filter(np.real(input), mfs) +1j *ndi.gaussian_filter(np.imag(input), mfs)).astype(input.dtype) + return (ndi.gaussian_filter(np.real(input), mfs) + 1j * ndi.gaussian_filter(np.imag(input), mfs)).astype( + input.dtype) + def mass_center(A): ''' @@ -65,6 +70,7 @@ def mass_center(A): ''' return np.array(ndi.measurements.center_of_mass(A), dtype=A.dtype) + def interpolated_shift(c, shift, do_linear=False): ''' complex bicubic interpolated shift. @@ -72,9 +78,13 @@ def interpolated_shift(c, shift, do_linear=False): ''' if not do_linear: - return ndi.interpolation.shift(np.real(c), shift, order=3, prefilter=True) + 1j*ndi.interpolation.shift(np.imag(c), shift, order=3, prefilter=True) + return ndi.interpolation.shift(np.real(c), shift, order=3, prefilter=True) + 1j * ndi.interpolation.shift( + np.imag(c), shift, order=3, prefilter=True) else: - return ndi.interpolation.shift(np.real(c), shift, order=1, mode='constant', cval=0, prefilter=False) + 1j * ndi.interpolation.shift(np.imag(c), shift, order=1, mode='constant', cval=0, prefilter=False) + return ndi.interpolation.shift(np.real(c), shift, order=1, mode='constant', cval=0, + prefilter=False) + 1j * ndi.interpolation.shift(np.imag(c), shift, order=1, + mode='constant', cval=0, + prefilter=False) def clip_complex_magnitudes_to_range(complex_input, clip_min, clip_max): @@ -84,4 +94,51 @@ def clip_complex_magnitudes_to_range(complex_input, clip_min, clip_max): ampl = np.abs(complex_input) phase = np.exp(1j * np.angle(complex_input)) ampl = np.clip(ampl, clip_min, clip_max) - complex_input[:] = ampl * phase \ No newline at end of file + complex_input[:] = ampl * phase + + +def fill3D(A, B, offset=[0, 0, 0]): + """ + Fill 3-dimensional array A with B. + """ + if A.ndim < 3 or B.ndim < 3: + raise ValueError('Input arrays must each be at least 3D') + assert A.ndim == B.ndim, "Input and Output must have the same number of dimensions." + ash = A.shape + bsh = B.shape + misfit = np.array(bsh) - np.array(ash) + assert not misfit[:-3].any(), "Input and Output must have the same shape everywhere but the last three axes." 
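    # The remainder of fill3D (below) clamps the requested offset to the region
    # where A and B overlap: Ao is the first index written in A, Bo the first
    # index read from B (negative offsets are cut to zero on either side), and
    # the final slice assignment copies only that overlapping box along the
    # last three axes, while the leading axes are copied in full.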
+ + Alim = np.array(A.shape[-3:]) + Blim = np.array(B.shape[-3:]) + off = np.array(offset) + Ao = off.copy() + Ao[Ao < 0] = 0 + Bo = -off.copy() + Bo[Bo < 0] = 0 + assert (Bo < Blim).all() and (Ao < Alim).all(), "At least one dimension lacks overlap" + A[..., Ao[0]:min(off[0] + Blim[0], Alim[0]), + Ao[1]:min(off[1] + Blim[1], Alim[1]), + Ao[2]:min(off[2] + Blim[2], Alim[2])] \ + = B[..., Bo[0]:min(Alim[0] - off[0], Blim[0]), + Bo[1]:min(Alim[1] - off[1], Blim[1]), + Bo[2]:min(Alim[2] - off[2], Blim[2])] + + +def crop_pad_2d_simple(A, B): + """ + Places B in A centered around the last two axis. A and B must be of the same shape + anywhere but the last two dims. + """ + assert A.ndim >= 2, "Arrays must have more than 2 dimensions." + assert A.ndim == B.ndim, "Input and Output must have the same number of dimensions." + misfit = np.array(A.shape) - np.array(B.shape) + assert not misfit[:-2].any(), "Input and Output must have the same shape everywhere but the last two axes." + if A.ndim == 2: + A = A.reshape((1,) + A.shape) + if B.ndim == 2: + B = B.reshape((1,) + B.shape) + a1, a2 = A.shape[-2:] + b1, b2 = B.shape[-2:] + offset = [0, a1 // 2 - b1 // 2, a2 // 2 - b2 // 2] + fill3D(A, B, offset) diff --git a/ptypy/utils/array_utils.py b/ptypy/utils/array_utils.py index dbd7a2366..a6dc3ede9 100644 --- a/ptypy/utils/array_utils.py +++ b/ptypy/utils/array_utils.py @@ -54,9 +54,9 @@ def switch_orientation(A, orientation, center=None): o = 0 if orientation is None else orientation if np.isscalar(o): - o = [i=='1' for i in '%03d' % int(np.base_repr(o))] + o = [i == '1' for i in '%03d' % int(np.base_repr(o))] - assert len(o)==3 + assert len(o) == 3 # switch orientation if o[0]: axes = list(range(A.ndim - 2)) + [-1, -2] @@ -101,10 +101,11 @@ def rebin_2d(A, rebin=1): sh = np.asarray(A.shape[-2:]) newdim = sh // rebin if not (sh % rebin == 0).all(): - raise ValueError('Last two axes %s of input array `A` cannot be binned by %s' % (str(tuple(sh)),str(rebin))) + raise ValueError('Last two axes %s of input array `A` cannot be binned by %s' % (str(tuple(sh)), str(rebin))) else: return A.reshape(-1, newdim[0], rebin, newdim[1], rebin).mean(-1).mean(-2) + def crop_pad_symmetric_2d(A, newshape, center=None): """ Crops or pads Array `A` symmetrically along the last two axes `(-2,-1)` @@ -148,7 +149,8 @@ def crop_pad_symmetric_2d(A, newshape, center=None): return A, c + low -def rebin(a, *args,**kwargs): + +def rebin(a, *args, **kwargs): """ Rebin ndarray data into a smaller ndarray of the same rank whose dimensions are factors of the original dimensions. @@ -184,46 +186,52 @@ def rebin(a, *args,**kwargs): """ shape = a.shape lenShape = a.ndim - factor = np.asarray(shape)//np.asarray(args) + factor = np.asarray(shape) // np.asarray(args) evList = ['a.reshape('] + \ - ['args[%d],factor[%d],'%(i,i) for i in range(lenShape)] + \ - [')'] + ['.sum(%d)'%(i+1) for i in range(lenShape)] + \ - ['*( 1.'] + ['/factor[%d]'%i for i in range(lenShape)] + [')'] - if kwargs.get('verbose',False): + ['args[%d],factor[%d],' % (i, i) for i in range(lenShape)] + \ + [')'] + ['.sum(%d)' % (i + 1) for i in range(lenShape)] + \ + ['*( 1.'] + ['/factor[%d]' % i for i in range(lenShape)] + [')'] + if kwargs.get('verbose', False): print(''.join(evList)) return eval(''.join(evList)) + def _confine(A): """\ Doc TODO. 
""" - sh=np.asarray(A.shape)[1:] - A=A.astype(float) - m=np.reshape(sh,(len(sh),) + len(sh)*(1,)) - return (A+m//2.0) % m - m//2.0 + sh = np.asarray(A.shape)[1:] + A = A.astype(float) + m = np.reshape(sh, (len(sh),) + len(sh) * (1,)) + return (A + m // 2.0) % m - m // 2.0 -def _translate_to_pix(sh,center): + +def _translate_to_pix(sh, center): """\ Take arbitrary input and translate it to a pixel position with respect to sh. """ - sh=np.array(sh) + sh = np.array(sh) if not isstr(center): cen = np.asarray(center) % sh - elif center=='fftshift': - cen=sh//2.0 - elif center=='geometric': - cen=sh/2.0-0.5 - elif center=='fft': - cen=sh*0.0 + elif center == 'fftshift': + cen = sh // 2.0 + elif center == 'geometric': + cen = sh / 2.0 - 0.5 + elif center == 'fft': + cen = sh * 0.0 else: raise TypeError('Input %s not understood for center' % str(center)) return cen + + """ def center_2d(sh,center): return translate_to_pix(sh[-2:],expect2(center)) """ -def grids(sh,psize=None,center='geometric',FFTlike=True): + + +def grids(sh, psize=None, center='geometric', FFTlike=True): """\ ``q0,q1,... = grids(sh)`` returns centered coordinates for a N-dimensional array of shape sh (pixel units) @@ -258,14 +266,14 @@ def grids(sh,psize=None,center='geometric',FFTlike=True): ndarray The coordinate grids """ - sh=np.asarray(sh) + sh = np.asarray(sh) - cen = _translate_to_pix(sh,center) + cen = _translate_to_pix(sh, center) - grid=np.indices(sh).astype(float) - np.reshape(cen,(len(sh),) + len(sh)*(1,)) + grid = np.indices(sh).astype(float) - np.reshape(cen, (len(sh),) + len(sh) * (1,)) if FFTlike: - grid=_confine(grid) + grid = _confine(grid) if psize is None: return grid @@ -273,16 +281,17 @@ def grids(sh,psize=None,center='geometric',FFTlike=True): psize = np.asarray(psize) if psize.size == 1: psize = psize * np.ones((len(sh),)) - psize = np.asarray(psize).reshape( (len(sh),) + len(sh)*(1,)) + psize = np.asarray(psize).reshape((len(sh),) + len(sh) * (1,)) return grid * psize + def rectangle(grids, dims=None, ew=2): if dims is None: dims = (grids.shape[-2] / 2., grids.shape[-1] / 2.) 
v, h = dims V, H = grids - return (smooth_step(-np.abs(V) + v/2, ew) - * smooth_step(-np.abs(H) + h/2, ew)) + return (smooth_step(-np.abs(V) + v / 2, ew) + * smooth_step(-np.abs(H) + h / 2, ew)) def ellipsis(grids, dims=None, ew=2): @@ -291,9 +300,10 @@ def ellipsis(grids, dims=None, ew=2): v, h = dims V, H = grids return smooth_step( - 0.5 - np.sqrt(V**2/v**2 + H**2/h**2), ew/np.sqrt(v * h)) + 0.5 - np.sqrt(V ** 2 / v ** 2 + H ** 2 / h ** 2), ew / np.sqrt(v * h)) + -def zoom(c,*arg,**kwargs): +def zoom(c, *arg, **kwargs): """ Wrapper `scipy.ndimage.zoom `_ function and shares @@ -311,25 +321,27 @@ def zoom(c,*arg,**kwargs): numpy.ndarray Zoomed array """ - #if np.all(arg[0] == 1): + # if np.all(arg[0] == 1): # return c # from scipy.ndimage import zoom as _zoom if np.iscomplexobj(c): - return complex_overload(_zoom)(c,*arg,**kwargs) + return complex_overload(_zoom)(c, *arg, **kwargs) else: - return _zoom(c,*arg,**kwargs) + return _zoom(c, *arg, **kwargs) + c_zoom = zoom -c_zoom.__doc__='*Deprecated*, kept for backward compatibility only.\n\n' + zoom.__doc__ +c_zoom.__doc__ = '*Deprecated*, kept for backward compatibility only.\n\n' + zoom.__doc__ """ c_affine_transform=complex_overload(ndi.affine_transform) c_affine_transform.__doc__='*complex input*\n\n'+c_affine_transform.__doc__ """ -def shift_zoom(c,zoom,cen_old,cen_new,**kwargs): + +def shift_zoom(c, zoom, cen_old, cen_new, **kwargs): """ Move array from center `cen_old` to `cen_new` and perform a zoom `zoom`. @@ -359,39 +371,40 @@ def shift_zoom(c,zoom,cen_old,cen_new,**kwargs): numpy.ndarray Shifted and zoomed array """ - + from scipy.ndimage import affine_transform as at zoom = np.diag(zoom) - offset=np.asarray(cen_old)-np.asarray(cen_new).dot(zoom) + offset = np.asarray(cen_old) - np.asarray(cen_new).dot(zoom) if np.iscomplexobj(c): - return complex_overload(at)(c,zoom,offset,**kwargs) + return complex_overload(at)(c, zoom, offset, **kwargs) else: - return at(c,zoom,offset,**kwargs) + return at(c, zoom, offset, **kwargs) -def fill3D(A,B,offset=[0,0,0]): +def fill3D(A, B, offset=[0, 0, 0]): """ Fill 3-dimensional array A with B. """ - if A.ndim != 3 or B.ndim!=3: + if A.ndim != 3 or B.ndim != 3: raise ValueError('3D a numpy arrays expected') - Alim=np.array(A.shape) - Blim=np.array(B.shape) - off=np.array(offset) + Alim = np.array(A.shape) + Blim = np.array(B.shape) + off = np.array(offset) Ao = off.copy() - Ao[Ao<0]=0 + Ao[Ao < 0] = 0 Bo = -off.copy() - Bo[Bo<0]=0 - print(Ao,Bo) + Bo[Bo < 0] = 0 if (Bo > Blim).any() or (Ao > Alim).any(): print("misfit") pass else: - A[Ao[0]:min(off[0]+Blim[0],Alim[0]),Ao[1]:min(off[1]+Blim[1],Alim[1]),Ao[2]:min(off[2]+Blim[2],Alim[2])] \ - =B[Bo[0]:min(Alim[0]-off[0],Blim[0]),Bo[1]:min(Alim[1]-off[1],Blim[1]),Bo[2]:min(Alim[2]-off[2],Blim[2])] + A[Ao[0]:min(off[0] + Blim[0], Alim[0]), Ao[1]:min(off[1] + Blim[1], Alim[1]), + Ao[2]:min(off[2] + Blim[2], Alim[2])] \ + = B[Bo[0]:min(Alim[0] - off[0], Blim[0]), Bo[1]:min(Alim[1] - off[1], Blim[1]), + Bo[2]:min(Alim[2] - off[2], Blim[2])] -def mirror(A,axis=-1): +def mirror(A, axis=-1): """ Mirrors array `A` along one axis `axis` @@ -409,9 +422,10 @@ def mirror(A,axis=-1): A view to the mirrored array. """ - return np.flipud(np.asarray(A).swapaxes(axis,0)).swapaxes(0,axis) + return np.flipud(np.asarray(A).swapaxes(axis, 0)).swapaxes(0, axis) + -def pad_lr(A,axis,l,r,fillpar=0.0, filltype='scalar'): +def pad_lr(A, axis, l, r, fillpar=0.0, filltype='scalar'): """ Pads ndarray `A` orthogonal to `axis` with `l` layers (pixels,lines,planes,...) 
on low side an `r` layers on high side. @@ -445,62 +459,61 @@ def pad_lr(A,axis,l,r,fillpar=0.0, filltype='scalar'): crop_pad crop_pad_symmetric_2d """ - fsh=np.array(A.shape) - if l>fsh[axis]: #rare case - l-=fsh[axis] - A=pad_lr(A,axis,fsh[axis],0,fillpar, filltype) - return pad_lr(A,axis,l,r,fillpar, filltype) - elif r>fsh[axis]: - r-=fsh[axis] - A=pad_lr(A,axis,0,fsh[axis],fillpar, filltype) - return pad_lr(A,axis,l,r,fillpar, filltype) - elif filltype=='mirror': - left=mirror(np.split(A,[l],axis)[0],axis) - right=mirror(np.split(A,[A.shape[axis]-r],axis)[1],axis) - elif filltype=='periodic': - right=np.split(A,[r],axis)[0] - left=np.split(A,[A.shape[axis]-l],axis)[1] - elif filltype=='project': - fsh[axis]=l - left=np.ones(fsh,A.dtype)*np.split(A,[1],axis)[0] - fsh[axis]=r - right=np.ones(fsh,A.dtype)*np.split(A,[A.shape[axis]-1],axis)[1] - if filltype=='scalar' or l==0: - fsh[axis]=l - left=np.ones(fsh,A.dtype)*fillpar - if filltype=='scalar' or r==0: - fsh[axis]=r - right=np.ones(fsh,A.dtype)*fillpar - if filltype=='custom': - left=fillpar[0].astype(A.dtype) - right=fillpar[1].astype(A.dtype) - return np.concatenate((left,A,right),axis=axis) - - -def _roll_from_pixcenter(sh,center): + fsh = np.array(A.shape) + if l > fsh[axis]: # rare case + l -= fsh[axis] + A = pad_lr(A, axis, fsh[axis], 0, fillpar, filltype) + return pad_lr(A, axis, l, r, fillpar, filltype) + elif r > fsh[axis]: + r -= fsh[axis] + A = pad_lr(A, axis, 0, fsh[axis], fillpar, filltype) + return pad_lr(A, axis, l, r, fillpar, filltype) + elif filltype == 'mirror': + left = mirror(np.split(A, [l], axis)[0], axis) + right = mirror(np.split(A, [A.shape[axis] - r], axis)[1], axis) + elif filltype == 'periodic': + right = np.split(A, [r], axis)[0] + left = np.split(A, [A.shape[axis] - l], axis)[1] + elif filltype == 'project': + fsh[axis] = l + left = np.ones(fsh, A.dtype) * np.split(A, [1], axis)[0] + fsh[axis] = r + right = np.ones(fsh, A.dtype) * np.split(A, [A.shape[axis] - 1], axis)[1] + if filltype == 'scalar' or l == 0: + fsh[axis] = l + left = np.ones(fsh, A.dtype) * fillpar + if filltype == 'scalar' or r == 0: + fsh[axis] = r + right = np.ones(fsh, A.dtype) * fillpar + if filltype == 'custom': + left = fillpar[0].astype(A.dtype) + right = fillpar[1].astype(A.dtype) + return np.concatenate((left, A, right), axis=axis) + + +def _roll_from_pixcenter(sh, center): """\ returns array of ints as input for np.roll use np.roll(A,-roll_from_pixcenter(sh,cen)[ax],ax) to put 'cen' in geometric center of array A """ - sh=np.array(sh) + sh = np.array(sh) if center != None: - if center=='fftshift': - cen=sh//2.0 - elif center=='geometric': - cen=sh/2.0-0.5 - elif center=='fft': - cen=sh*0.0 + if center == 'fftshift': + cen = sh // 2.0 + elif center == 'geometric': + cen = sh / 2.0 - 0.5 + elif center == 'fft': + cen = sh * 0.0 elif center is not None: - cen=sh*np.asarray(center) % sh - 0.5 + cen = sh * np.asarray(center) % sh - 0.5 - roll=np.ceil(cen - sh/2.0) % sh + roll = np.ceil(cen - sh / 2.0) % sh else: - roll=np.zeros_like(sh) + roll = np.zeros_like(sh) return roll.astype(int) - -def crop_pad_axis(A,hplanes,axis=-1,roll=0,fillpar=0.0, filltype='scalar'): +def crop_pad_axis(A, hplanes, axis=-1, roll=0, fillpar=0.0, filltype='scalar'): """ Crops or pads a volume array `A` at beginning and end of axis `axis` with a number of hyperplanes specified by `hplanes` @@ -573,37 +586,36 @@ def crop_pad_axis(A,hplanes,axis=-1,roll=0,fillpar=0.0, filltype='scalar'): >>> B=crop_pad_axis(V,(3,-2),1,filltype='mirror') """ if 
np.isscalar(hplanes): - hplanes=int(hplanes) - r=np.abs(hplanes) // 2 * np.sign(hplanes) - l=hplanes - r - elif len(hplanes)==2: - l=int(hplanes[0]) - r=int(hplanes[1]) + hplanes = int(hplanes) + r = np.abs(hplanes) // 2 * np.sign(hplanes) + l = hplanes - r + elif len(hplanes) == 2: + l = int(hplanes[0]) + r = int(hplanes[1]) else: raise RuntimeError('unsupoorted input for \'hplanes\'') - if roll!=0: - A=np.roll(A,-roll,axis=axis) - - if l<=0 and r<=0: - A=np.split(A,[-l,A.shape[axis]+r],axis)[1] - elif l>0 and r>0: - A=pad_lr(A,axis,l,r,fillpar,filltype) - elif l>0 and r<=0: - A=pad_lr(A,axis,l,0,fillpar,filltype) - A=np.split(A,[0,A.shape[axis]+r],axis)[1] - elif l<=0 and r>0: - A=pad_lr(A,axis,0,r,fillpar,filltype) - A=np.split(A,[-l,A.shape[axis]],axis)[1] - - - if roll!=0: - return np.roll(A,roll+r,axis=axis) + if roll != 0: + A = np.roll(A, -roll, axis=axis) + + if l <= 0 and r <= 0: + A = np.split(A, [-l, A.shape[axis] + r], axis)[1] + elif l > 0 and r > 0: + A = pad_lr(A, axis, l, r, fillpar, filltype) + elif l > 0 and r <= 0: + A = pad_lr(A, axis, l, 0, fillpar, filltype) + A = np.split(A, [0, A.shape[axis] + r], axis)[1] + elif l <= 0 and r > 0: + A = pad_lr(A, axis, 0, r, fillpar, filltype) + A = np.split(A, [-l, A.shape[axis]], axis)[1] + + if roll != 0: + return np.roll(A, roll + r, axis=axis) else: return A -def crop_pad(A,hplane_list,axes=None,cen=None,fillpar=0.0,filltype='scalar'): +def crop_pad(A, hplane_list, axes=None, cen=None, fillpar=0.0, filltype='scalar'): """\ Crops or pads a volume array `A` with a number of hyperplanes according to parameters in `hplanes` Wrapper for crop_pad_axis. @@ -660,14 +672,13 @@ def crop_pad(A,hplane_list,axes=None,cen=None,fillpar=0.0,filltype='scalar'): """ if axes is None: - axes=np.arange(len(hplane_list))-len(hplane_list) - elif not(len(axes)==len(hplane_list)): + axes = np.arange(len(hplane_list)) - len(hplane_list) + elif not (len(axes) == len(hplane_list)): raise RuntimeError('if axes is specified, hplane_list has to be same length as axes') - sh=np.array(A.shape) - roll = _roll_from_pixcenter(sh,cen) + sh = np.array(A.shape) + roll = _roll_from_pixcenter(sh, cen) - for ax,cut in zip(axes,hplane_list): - A=crop_pad_axis(A,cut,ax,roll[ax],fillpar,filltype) + for ax, cut in zip(axes, hplane_list): + A = crop_pad_axis(A, cut, ax, roll[ax], fillpar, filltype) return A - diff --git a/test/accelerate_tests/base_tests/array_utils_test.py b/test/accelerate_tests/base_tests/array_utils_test.py index f1a182ab0..b1cac58fe 100644 --- a/test/accelerate_tests/base_tests/array_utils_test.py +++ b/test/accelerate_tests/base_tests/array_utils_test.py @@ -2,7 +2,6 @@ Tests for the array_utils module ''' - import unittest import numpy as np from ptypy.accelerate.base import FLOAT_TYPE, COMPLEX_TYPE @@ -12,7 +11,7 @@ class ArrayUtilsTest(unittest.TestCase): def test_dot_resolution(self): - X,Y,Z = np.indices((3,3,1001), dtype=np.float32) + X, Y, Z = np.indices((3, 3, 1001), dtype=np.float32) A = 10 ** Y + 1j * 10 ** X out = au.dot(A, A) np.testing.assert_array_equal(out, 60666606.0) @@ -21,7 +20,7 @@ def test_abs2_real_input(self): single_dim = 50.0 npts = single_dim ** 3 array_to_be_absed = np.arange(npts) - absed = np.array([ix**2 for ix in array_to_be_absed]) + absed = np.array([ix ** 2 for ix in array_to_be_absed]) array_shape = (int(single_dim), int(single_dim), int(single_dim)) array_to_be_absed.reshape(array_shape) absed.reshape(array_shape) @@ -29,13 +28,12 @@ def test_abs2_real_input(self): np.testing.assert_array_equal(absed, out) 
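
The CPU reference functions exercised by these tests have small, checkable semantics; as a quick sketch with hand-picked values, abs2 returns the real-valued elementwise |x|**2 and norm2 sums it over the whole array:

    import numpy as np
    from ptypy.accelerate.base import array_utils as au

    a = np.array([3.0 + 4.0j, 1.0 + 0.0j], dtype=np.complex64)
    print(au.abs2(a))    # [25.  1.]  (real part of x * conj(x), float32)
    print(au.norm2(a))   # 26.0
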
self.assertEqual(absed.dtype, np.float) - def test_abs2_complex_input(self): single_dim = 50.0 array_shape = (int(single_dim), int(single_dim), int(single_dim)) npts = single_dim ** 3 array_to_be_absed = np.arange(npts) + 1j * np.arange(npts) - absed = np.array([np.abs(ix**2) for ix in array_to_be_absed]) + absed = np.array([np.abs(ix ** 2) for ix in array_to_be_absed]) absed.reshape(array_shape) array_to_be_absed.reshape(array_shape) out = au.abs2(array_to_be_absed) @@ -53,7 +51,7 @@ def test_sum_to_buffer(self): # fill the input array for idx in range(I): - in1[idx] = np.ones((M, N))* (idx + 1.0) + in1[idx] = np.ones((M, N)) * (idx + 1.0) outshape = (X, M, N) expected_out = np.empty(outshape) @@ -64,9 +62,9 @@ def test_sum_to_buffer(self): in1_addr = np.empty((I, 3)) in1_addr = np.array([(0, 0, 0), - (1, 0, 0), - (2, 0, 0), - (3, 0, 0)]) + (1, 0, 0), + (2, 0, 0), + (3, 0, 0)]) out1_addr = np.empty_like(in1_addr) out1_addr = np.array([(0, 0, 0), @@ -77,7 +75,6 @@ def test_sum_to_buffer(self): out = au.sum_to_buffer(in1, outshape, in1_addr, out1_addr, dtype=FLOAT_TYPE) np.testing.assert_array_equal(out, expected_out) - def test_sum_to_buffer_complex(self): I = 4 @@ -89,20 +86,20 @@ def test_sum_to_buffer_complex(self): # fill the input array for idx in range(I): - in1[idx] = np.ones((M, N))* (idx + 1.0) + 1j * np.ones((M, N))* (idx + 1.0) + in1[idx] = np.ones((M, N)) * (idx + 1.0) + 1j * np.ones((M, N)) * (idx + 1.0) outshape = (X, M, N) expected_out = np.empty(outshape, dtype=COMPLEX_TYPE) - expected_out[0] = np.ones((M, N)) * 4.0 + 1j * np.ones((M, N))* 4.0 - expected_out[1] = np.ones((M, N)) * 6.0+ 1j * np.ones((M, N))* 6.0 + expected_out[0] = np.ones((M, N)) * 4.0 + 1j * np.ones((M, N)) * 4.0 + expected_out[1] = np.ones((M, N)) * 6.0 + 1j * np.ones((M, N)) * 6.0 in1_addr = np.empty((I, 3)) in1_addr = np.array([(0, 0, 0), - (1, 0, 0), - (2, 0, 0), - (3, 0, 0)]) + (1, 0, 0), + (2, 0, 0), + (3, 0, 0)]) out1_addr = np.empty_like(in1_addr) out1_addr = np.array([(0, 0, 0), @@ -120,7 +117,7 @@ def test_norm2_1d_real(self): np.testing.assert_array_equal(out, 5.0) def test_norm2_1d_complex(self): - a = np.array([1.0+1.0j, 2.0+2.0j], dtype=COMPLEX_TYPE) + a = np.array([1.0 + 1.0j, 2.0 + 2.0j], dtype=COMPLEX_TYPE) out = au.norm2(a) np.testing.assert_array_equal(out, 10.0) @@ -131,22 +128,22 @@ def test_norm2_2d_real(self): np.testing.assert_array_equal(out, 30.0) def test_norm2_2d_complex(self): - a = np.array([[1.0+1.0j, 2.0+2.0j], - [3.0+3.0j, 4.0+4.0j]], dtype=COMPLEX_TYPE) + a = np.array([[1.0 + 1.0j, 2.0 + 2.0j], + [3.0 + 3.0j, 4.0 + 4.0j]], dtype=COMPLEX_TYPE) out = au.norm2(a) np.testing.assert_array_equal(out, 60.0) def test_norm2_3d_real(self): a = np.array([[[1.0, 2.0], - [3.0, 4.0]], + [3.0, 4.0]], [[5.0, 6.0], [7.0, 8.0]]], dtype=FLOAT_TYPE) out = au.norm2(a) np.testing.assert_array_equal(out, 204.0) def test_norm2_3d_complex(self): - a = np.array([[[1.0+1.0j, 2.0+2.0j], - [3.0+3.0j, 4.0+4.0j]], + a = np.array([[[1.0 + 1.0j, 2.0 + 2.0j], + [3.0 + 3.0j, 4.0 + 4.0j]], [[5.0 + 5.0j, 6.0 + 6.0j], [7.0 + 7.0j, 8.0 + 8.0j]]], dtype=COMPLEX_TYPE) out = au.norm2(a) @@ -154,46 +151,45 @@ def test_norm2_3d_complex(self): def test_complex_gaussian_filter_2d(self): data = np.zeros((8, 8), dtype=COMPLEX_TYPE) - data[3:5, 3:5] = 2.0+2.0j - mfs = 3.0,4.0 + data[3:5, 3:5] = 2.0 + 2.0j + mfs = 3.0, 4.0 out = au.complex_gaussian_filter(data, mfs) expected_out = np.array([0.11033735 + 0.11033735j, 0.11888228 + 0.11888228j, 0.13116673 + 0.13116673j , 0.13999543 + 0.13999543j, 0.13999543 + 0.13999543j, 
0.13116673 + 0.13116673j , 0.11888228 + 0.11888228j, 0.11033735 + 0.11033735j], dtype=COMPLEX_TYPE) np.testing.assert_array_almost_equal(np.diagonal(out), expected_out) - def test_complex_gaussian_filter_2d_batched(self): batch_number = 2 A = 5 B = 5 data = np.zeros((batch_number, A, B), dtype=COMPLEX_TYPE) - data[:, 2:3, 2:3] = 2.0+2.0j - mfs = 3.0,4.0 + data[:, 2:3, 2:3] = 2.0 + 2.0j + mfs = 3.0, 4.0 out = au.complex_gaussian_filter(data, mfs) - expected_out = np.array([[[ 0.07988770+0.0798877j, 0.07989411+0.07989411j, 0.07989471+0.07989471j, - 0.07989411+0.07989411j, 0.07988770+0.0798877j], - [ 0.08003781+0.08003781j, 0.08004424+0.08004424j, 0.08004485+0.08004485j, - 0.08004424+0.08004424j, 0.08003781+0.08003781j], - [ 0.08012911+0.08012911j, 0.08013555+0.08013555j, 0.08013615+0.08013615j, - 0.08013555+0.08013555j, 0.08012911+0.08012911j], - [ 0.08003781+0.08003781j, 0.08004424+0.08004424j, 0.08004485+0.08004485j, - 0.08004424+0.08004424j, 0.08003781+0.08003781j], - [ 0.07988770+0.0798877j, 0.07989411+0.07989411j, 0.07989471+0.07989471j, - 0.07989411+0.07989411j, 0.07988770+0.0798877j ]], - - [[ 0.07988770+0.0798877j, 0.07989411+0.07989411j, 0.07989471+0.07989471j, - 0.07989411+0.07989411j, 0.07988770+0.0798877j ], - [ 0.08003781+0.08003781j, 0.08004424+0.08004424j, 0.08004485+0.08004485j, - 0.08004424+0.08004424j, 0.08003781+0.08003781j], - [ 0.08012911+0.08012911j, 0.08013555+0.08013555j, 0.08013615+0.08013615j, - 0.08013555+0.08013555j, 0.08012911+0.08012911j], - [ 0.08003781+0.08003781j, 0.08004424+0.08004424j, 0.08004485+0.08004485j, - 0.08004424+0.08004424j, 0.08003781+0.08003781j], - [ 0.07988770+0.0798877j, 0.07989411+0.07989411j, 0.07989471+0.07989471j, - 0.07989411+0.07989411j, 0.07988770+0.0798877j ]]], dtype=COMPLEX_TYPE) + expected_out = np.array([[[0.07988770 + 0.0798877j, 0.07989411 + 0.07989411j, 0.07989471 + 0.07989471j, + 0.07989411 + 0.07989411j, 0.07988770 + 0.0798877j], + [0.08003781 + 0.08003781j, 0.08004424 + 0.08004424j, 0.08004485 + 0.08004485j, + 0.08004424 + 0.08004424j, 0.08003781 + 0.08003781j], + [0.08012911 + 0.08012911j, 0.08013555 + 0.08013555j, 0.08013615 + 0.08013615j, + 0.08013555 + 0.08013555j, 0.08012911 + 0.08012911j], + [0.08003781 + 0.08003781j, 0.08004424 + 0.08004424j, 0.08004485 + 0.08004485j, + 0.08004424 + 0.08004424j, 0.08003781 + 0.08003781j], + [0.07988770 + 0.0798877j, 0.07989411 + 0.07989411j, 0.07989471 + 0.07989471j, + 0.07989411 + 0.07989411j, 0.07988770 + 0.0798877j]], + + [[0.07988770 + 0.0798877j, 0.07989411 + 0.07989411j, 0.07989471 + 0.07989471j, + 0.07989411 + 0.07989411j, 0.07988770 + 0.0798877j], + [0.08003781 + 0.08003781j, 0.08004424 + 0.08004424j, 0.08004485 + 0.08004485j, + 0.08004424 + 0.08004424j, 0.08003781 + 0.08003781j], + [0.08012911 + 0.08012911j, 0.08013555 + 0.08013555j, 0.08013615 + 0.08013615j, + 0.08013555 + 0.08013555j, 0.08012911 + 0.08012911j], + [0.08003781 + 0.08003781j, 0.08004424 + 0.08004424j, 0.08004485 + 0.08004485j, + 0.08004424 + 0.08004424j, 0.08003781 + 0.08003781j], + [0.07988770 + 0.0798877j, 0.07989411 + 0.07989411j, 0.07989471 + 0.07989471j, + 0.07989411 + 0.07989411j, 0.07988770 + 0.0798877j]]], dtype=COMPLEX_TYPE) np.testing.assert_array_almost_equal(out, expected_out) @@ -206,13 +202,12 @@ def test_mass_center_2d(self): X, Y = np.meshgrid(x, x) Xoff = 5.0 Yoff = 2.0 - probe[0, (X-Xoff)**2 + (Y-Yoff)**2 < rad**2] = probe_vals + probe[0, (X - Xoff) ** 2 + (Y - Yoff) ** 2 < rad ** 2] = probe_vals com = au.mass_center(np.abs(probe[0])) expected_out = np.array([Yoff, Xoff]) + npts // 2 
np.testing.assert_array_almost_equal(com, expected_out, decimal=6) - def test_mass_center_3d(self): npts = 64 probe = np.zeros((npts, npts, npts), dtype=COMPLEX_TYPE) @@ -223,7 +218,7 @@ def test_mass_center_3d(self): Xoff = 5.0 Yoff = 2.0 Zoff = 10.0 - probe[(X-Xoff)**2 + (Y-Yoff)**2 + (Z-Zoff)**2< rad**2] = probe_vals + probe[(X - Xoff) ** 2 + (Y - Yoff) ** 2 + (Z - Zoff) ** 2 < rad ** 2] = probe_vals com = au.mass_center(np.abs(probe)) expected_out = np.array([Yoff, Xoff, Zoff]) + npts // 2 @@ -238,28 +233,64 @@ def test_interpolated_shift(self): X, Y = np.meshgrid(x, x) Xoff = 5.0 Yoff = 2.0 - probe[0, (X-Xoff)**2 + (Y-Yoff)**2 < rad**2] = probe_vals + probe[0, (X - Xoff) ** 2 + (Y - Yoff) ** 2 < rad ** 2] = probe_vals offset = np.array([-Yoff, -Xoff]) not_shifted_probe = np.zeros((1, npts, npts), dtype=COMPLEX_TYPE) - not_shifted_probe[0, (X)**2 + (Y)**2 < rad**2] = probe_vals + not_shifted_probe[0, (X) ** 2 + (Y) ** 2 < rad ** 2] = probe_vals probe[0] = au.interpolated_shift(probe[0], offset) np.testing.assert_array_almost_equal(probe, not_shifted_probe, decimal=8) def test_clip_magnitudes_to_range(self): - data = np.ones((5,5), dtype=COMPLEX_TYPE) - data[2, 4] = 20.0*np.exp(1j*np.pi/2) - data[3, 1] = 0.2*np.exp(1j*np.pi/3) + data = np.ones((5, 5), dtype=COMPLEX_TYPE) + data[2, 4] = 20.0 * np.exp(1j * np.pi / 2) + data[3, 1] = 0.2 * np.exp(1j * np.pi / 3) clip_min = 0.5 clip_max = 2.0 expected_out = np.ones_like(data) - expected_out[2, 4] = 2.0*np.exp(1j*np.pi/2) - expected_out[3, 1] = 0.5*np.exp(1j*np.pi/3) + expected_out[2, 4] = 2.0 * np.exp(1j * np.pi / 2) + expected_out[3, 1] = 0.5 * np.exp(1j * np.pi / 3) au.clip_complex_magnitudes_to_range(data, clip_min, clip_max) - np.testing.assert_array_almost_equal(data, expected_out, decimal=7) # floating point precision I guess... - - - -if __name__=='__main__': - unittest.main() \ No newline at end of file + np.testing.assert_array_almost_equal(data, expected_out, decimal=7) # floating point precision I guess... + + def test_crop_pad_1(self): + # pad, integer, 2D + B = np.indices((4, 4), dtype=np.int) + A = np.zeros((6, 6), dtype=B.dtype) + au.crop_pad_2d_simple(A, B.sum(0)) + exp_A = np.array([[0, 0, 0, 0, 0, 0], + [0, 0, 1, 2, 3, 0], + [0, 1, 2, 3, 4, 0], + [0, 2, 3, 4, 5, 0], + [0, 3, 4, 5, 6, 0], + [0, 0, 0, 0, 0, 0]]) + np.testing.assert_equal(A, exp_A) + + def test_crop_pad_2(self): + # crop, float, 3D + B = np.indices((4, 4), dtype=np.float32) + A = np.zeros((2, 2, 2), dtype=B.dtype) + au.crop_pad_2d_simple(A, B) + exp_A = np.array([[[1., 1.], + [2., 2.]], + [[1., 2.], + [1., 2.]]], dtype=np.float32) + np.testing.assert_array_almost_equal(A, exp_A) + + def test_crop_pad_3(self): + # crop/pad, complex, 3D + B = np.indices((4, 3), dtype=np.complex64) + B = np.indices((4, 3), dtype=np.complex64) + 1j * B[::-1, :, :] + A = np.zeros((2, 2, 5), dtype=B.dtype) + au.crop_pad_2d_simple(A, B) + exp_A = np.array([[[0. + 0.j, 1. + 0.j, 1. + 1.j, 1. + 2.j, 0. + 0.j], + [0. + 0.j, 2. + 0.j, 2. + 1.j, 2. + 2.j, 0. + 0.j]], + [[0. + 0.j, 0. + 1.j, 1. + 1.j, 2. + 1.j, 0. + 0.j], + [0. + 0.j, 0. + 2.j, 1. + 2.j, 2. + 2.j, 0. 
+ 0.j]]], + dtype=np.complex64) + np.testing.assert_array_almost_equal(A, exp_A) + + +if __name__ == '__main__': + unittest.main() diff --git a/test/accelerate_tests/cuda_pycuda_tests/array_utils_test.py b/test/accelerate_tests/cuda_pycuda_tests/array_utils_test.py index dcd133344..ab3f78d7e 100644 --- a/test/accelerate_tests/cuda_pycuda_tests/array_utils_test.py +++ b/test/accelerate_tests/cuda_pycuda_tests/array_utils_test.py @@ -268,3 +268,47 @@ def test_complex_gaussian_filter_2d_batched(self): out_exp = au.complex_gaussian_filter(inp, mfs) out = out_dev.get() np.testing.assert_allclose(out_exp, out, rtol=1e-4) + + + def test_crop_pad_simple_1(self): + # pad, integer, 2D + B = np.indices((4, 4), dtype=np.int).sum(0) + A = np.zeros((6, 6), dtype=B.dtype) + B_dev = gpuarray.to_gpu(B) + A_dev = gpuarray.to_gpu(A) + + # Act + au.crop_pad_2d_simple(A, B) + gau.crop_pad_2d_simple(A_dev, B_dev) + + # Assert + np.testing.assert_all_close(A_dev.get(), A, rtol=1e-6, atol=1e-6) + + def test_crop_pad_simple_2(self): + # crop, float, 3D + B = np.indices((4, 4), dtype=np.float32) + A = np.zeros((2, 2, 2), dtype=B.dtype) + B_dev = gpuarray.to_gpu(B) + A_dev = gpuarray.to_gpu(A) + + # Act + au.crop_pad_2d_simple(A, B) + gau.crop_pad_2d_simple(A_dev, B_dev) + + # Assert + np.testing.assert_all_close(A_dev.get(), A, rtol=1e-6, atol=1e-6) + + def test_crop_pad_simple_3(self): + # crop/pad, complex, 3D + B = np.indices((4, 3), dtype=np.complex64) + B = np.indices((4, 3), dtype=np.complex64) + 1j * B[::-1, :, :] + A = np.zeros((2, 2, 5), dtype=B.dtype) + B_dev = gpuarray.to_gpu(B) + A_dev = gpuarray.to_gpu(A) + + # Act + au.crop_pad_2d_simple(A, B) + gau.crop_pad_2d_simple(A_dev, B_dev) + + # Assert + np.testing.assert_all_close(A_dev.get(), A, rtol=1e-6, atol=1e-6) From 0219aeb96d4ea890c7ad7f65fe55ecf88c665297 Mon Sep 17 00:00:00 2001 From: Jorg Lotze Date: Tue, 9 Mar 2021 21:31:54 +0000 Subject: [PATCH 27/56] Gpu hackathon: intensity kernel fix (#301) * fixing intensity kernel race condition without extra memory * Save Imodel Co-authored-by: Benedikt Daurer --- .../cuda_pycuda/cuda/error_reduce.cu | 4 +- .../cuda_pycuda/cuda/intens_renorm.cu | 63 ++++++++++++------- ptypy/accelerate/cuda_pycuda/kernels.py | 23 +++---- .../dls_gradient_descent_kernel_test.py | 24 ++++++- 4 files changed, 73 insertions(+), 41 deletions(-) diff --git a/ptypy/accelerate/cuda_pycuda/cuda/error_reduce.cu b/ptypy/accelerate/cuda_pycuda/cuda/error_reduce.cu index 9b3389d5c..91b5357b4 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/error_reduce.cu +++ b/ptypy/accelerate/cuda_pycuda/cuda/error_reduce.cu @@ -14,7 +14,7 @@ extern "C" __global__ void error_reduce(const IN_TYPE* ferr, int tx = threadIdx.x; int ty = threadIdx.y; int batch = blockIdx.x; - extern __shared__ ACC_TYPE sum_v[1024]; + __shared__ ACC_TYPE sum_v[BDIM_X*BDIM_Y]; int shidx = ty * blockDim.x + tx; // shidx: index in shared memory for this block @@ -35,7 +35,7 @@ extern "C" __global__ void error_reduce(const IN_TYPE* ferr, __syncthreads(); - int nt = blockDim.x * blockDim.y; + int nt = BDIM_X * BDIM_Y; int c = nt; while (c > 1) diff --git a/ptypy/accelerate/cuda_pycuda/cuda/intens_renorm.cu b/ptypy/accelerate/cuda_pycuda/cuda/intens_renorm.cu index 60b0db6e7..d0033f7f4 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/intens_renorm.cu +++ b/ptypy/accelerate/cuda_pycuda/cuda/intens_renorm.cu @@ -10,40 +10,57 @@ using thrust::complex; extern "C" __global__ void step1(const IN_TYPE* Imodel, - const IN_TYPE* I, - const IN_TYPE* w, - OUT_TYPE* num, - OUT_TYPE* den, - int 
z, - int x) + const IN_TYPE* I, + const IN_TYPE* w, + OUT_TYPE* num, + OUT_TYPE* den, + int n) { - int iz = blockIdx.z; - int ix = threadIdx.x + blockIdx.x * blockDim.x; + int i = threadIdx.x + blockIdx.x * blockDim.x; - if (iz >= z || ix >= x) + if (i >= n) return; - auto tmp = MATH_TYPE(w[iz * x + ix]) * MATH_TYPE(Imodel[iz * x + ix]); - num[iz * x + ix] = tmp * MATH_TYPE(I[iz * x + ix]); - den[iz * x + ix] = tmp * MATH_TYPE(Imodel[iz * x + ix]); + auto tmp = MATH_TYPE(w[i]) * MATH_TYPE(Imodel[i]); + num[i] = tmp * MATH_TYPE(I[i]); + den[i] = tmp * MATH_TYPE(Imodel[i]); } extern "C" __global__ void step2(const IN_TYPE* fic_tmp, OUT_TYPE* fic, OUT_TYPE* Imodel, - int z, - int x) + int X, + int Y) { int iz = blockIdx.z; - int ix = threadIdx.x + blockIdx.x * blockDim.x; + int tx = threadIdx.x; + int ty = threadIdx.y; + + // one thread block per fic data point - we want the first thread to read this + // into shared memory and then sync the block, so we don't get into data races + // with writing it back to global memory in the end (and we read the value only + // once) + // + __shared__ MATH_TYPE shfic[1]; + if (tx == 0 && ty == 0) { + shfic[0] = MATH_TYPE(fic[iz]) / MATH_TYPE(fic_tmp[iz]); + } + __syncthreads(); - if (iz >= z || ix >= x) - return; - //probably not so clever having all threads read from the same locations - auto tmp = MATH_TYPE(fic[iz]) / MATH_TYPE(fic_tmp[iz]); - Imodel[iz * x + ix] *= tmp; + // now all threads can access that value + auto tmp = shfic[0]; + + // offset Imodel for current z + Imodel += iz * X * Y; + + for (int iy = ty; iy < Y; iy += blockDim.y) { + #pragma unroll(4) + for (int ix = tx; ix < X; ix += blockDim.x) { + Imodel[iy * X + ix] *= tmp; + } + } + // race condition if write is not restricted to one thread - // learned this the hard way - if (ix==0) + if (tx==0 && ty == 0) fic[iz] = tmp; -} \ No newline at end of file +} diff --git a/ptypy/accelerate/cuda_pycuda/kernels.py b/ptypy/accelerate/cuda_pycuda/kernels.py index 1ff4ac00e..025ab7fe9 100644 --- a/ptypy/accelerate/cuda_pycuda/kernels.py +++ b/ptypy/accelerate/cuda_pycuda/kernels.py @@ -190,7 +190,6 @@ def error_reduce(self, addr, err_sum): np.int32(self.fshape[2]), block=(32, 32, 1), grid=(int(err_sum.shape[0]), 1, 1), - shared=32*32*4, stream=self.queue) def fmag_all_update(self, f, addr, fmag, fmask, err_fmag, pbound=0.0): @@ -407,7 +406,9 @@ def __init__(self, aux, nmodes=1, queue=None, accumulate_type = 'double', math_t self.make_a012_cuda = load_kernel('make_a012', subs) self.error_reduce_cuda = load_kernel('error_reduce', { **subs, - 'OUT_TYPE': 'float' if self.ftype == np.float32 else 'double' + 'OUT_TYPE': 'float' if self.ftype == np.float32 else 'double', + 'BDIM_X': 32, + 'BDIM_Y': 32 }) self.fill_b_cuda = load_kernel('fill_b', { **subs, @@ -520,7 +521,6 @@ def error_reduce(self, addr, err_sum): np.int32(ferr.shape[-1]), block=(32, 32, 1), grid=(int(maxz), 1, 1), - shared=32*32*4, stream=self.queue) def floating_intensity(self, addr, w, I, fic): @@ -538,14 +538,13 @@ def floating_intensity(self, addr, w, I, fic): fic_tmp = self.gpu.fic_tmp ## math ## - x = np.int32(sh[1] * sh[2]) - z = np.int32(maxz) + xall = np.int32(maxz * sh[1] * sh[2]) bx = 1024 self.floating_intensity_cuda_step1(Imodel, I, w, num, den, - z, x, + xall, block=(bx, 1, 1), - grid=(int((x + bx - 1) // bx), 1, int(z)), + grid=(int((xall + bx - 1) // bx), 1, 1), stream=self.queue) self.error_reduce_cuda(num, fic, @@ -553,7 +552,6 @@ def floating_intensity(self, addr, w, I, fic): np.int32(num.shape[-1]), block=(32, 32, 1), 
grid=(int(maxz), 1, 1), - shared=32*32*4, stream=self.queue) self.error_reduce_cuda(den, fic_tmp, @@ -561,13 +559,13 @@ def floating_intensity(self, addr, w, I, fic): np.int32(den.shape[-1]), block=(32, 32, 1), grid=(int(maxz), 1, 1), - shared=32*32*4, stream=self.queue) self.floating_intensity_cuda_step2(fic_tmp, fic, Imodel, - z, x, - block=(bx, 1, 1), - grid=(int((x + bx - 1) // bx), 1, int(z)), + np.int32(Imodel.shape[-2]), + np.int32(Imodel.shape[-1]), + block=(32, 32, 1), + grid=(1, 1, int(maxz)), stream=self.queue) @@ -874,7 +872,6 @@ def error_reduce(self, addr, err_fmag): np.int32(self.fshape[2]), block=(32, 32, 1), grid=(int(err_fmag.shape[0]), 1, 1), - shared=32*32*4, stream=self.queue) def update_addr_and_error_state_old(self, addr, error_state, mangled_addr, err_sum): diff --git a/test/accelerate_tests/cuda_pycuda_tests/dls_tests/dls_gradient_descent_kernel_test.py b/test/accelerate_tests/cuda_pycuda_tests/dls_tests/dls_gradient_descent_kernel_test.py index f02a1c94a..afa76eeca 100644 --- a/test/accelerate_tests/cuda_pycuda_tests/dls_tests/dls_gradient_descent_kernel_test.py +++ b/test/accelerate_tests/cuda_pycuda_tests/dls_tests/dls_gradient_descent_kernel_test.py @@ -59,13 +59,14 @@ def test_make_model_UNITY(self, name, iter): ["floating", 0], ]) def test_floating_intensity_UNITY(self, name, iter): - + # Load data with h5py.File(self.datadir %name + "floating_intensities_%04d.h5" %iter, "r") as f: w = f["w"][:] addr = f["addr"][:] I = f["I"][:] fic = f["fic"][:] + Imodel = f["Imodel"][:] with h5py.File(self.datadir %name + "make_model_%04d.h5" %iter, "r") as f: aux = f["aux"][:] @@ -75,23 +76,40 @@ def test_floating_intensity_UNITY(self, name, iter): addr_dev = gpuarray.to_gpu(addr) I_dev = gpuarray.to_gpu(I) fic_dev = gpuarray.to_gpu(fic) + Imodel_dev = gpuarray.to_gpu(np.ascontiguousarray(Imodel)) # CPU Kernel BGDK = BaseGradientDescentKernel(aux, addr.shape[1]) BGDK.allocate() + BGDK.npy.Imodel = Imodel BGDK.floating_intensity(addr, w, I, fic) # GPU kernel GDK = GradientDescentKernel(aux_dev, addr.shape[1]) GDK.allocate() + GDK.gpu.Imodel = Imodel_dev GDK.floating_intensity(addr_dev, w_dev, I_dev, fic_dev) ## Assert - np.testing.assert_allclose(BGDK.npy.Imodel, GDK.gpu.Imodel.get(), atol=self.atol, rtol=self.rtol, - err_msg="`Imodel` buffer has not been updated as expected") + np.testing.assert_allclose(BGDK.npy.LLerr, GDK.gpu.LLerr.get(), atol=self.atol, rtol=self.rtol, + verbose=False, equal_nan=False, + err_msg="`LLerr` buffer has not been updated as expected") + np.testing.assert_allclose(BGDK.npy.LLden, GDK.gpu.LLden.get(), atol=self.atol, rtol=self.rtol, + verbose=False, equal_nan=False, + err_msg="`LLden` buffer has not been updated as expected") + np.testing.assert_allclose(BGDK.npy.fic_tmp, GDK.gpu.fic_tmp.get(), atol=self.atol, rtol=self.rtol, + verbose=False, equal_nan=False, + err_msg="`fic_tmp` buffer has not been updated as expected") + np.testing.assert_allclose(fic, fic_dev.get(), atol=self.atol, rtol=self.rtol, + verbose=False, equal_nan=False, err_msg="floating intensity coeff (fic) has not been updated as expected") + np.testing.assert_allclose(BGDK.npy.Imodel, GDK.gpu.Imodel.get(), atol=self.atol, rtol=self.rtol, + verbose=False, equal_nan=False, + err_msg="`Imodel` buffer has not been updated as expected") + + @parameterized.expand([ ["base", 10], ["regul", 50], From c93ddf35ad5b74dbed5ebdebca122e9f152a829e Mon Sep 17 00:00:00 2001 From: Jorg Lotze Date: Tue, 9 Mar 2021 21:42:33 +0000 Subject: [PATCH 28/56] Gpu hackathon: accuracy scripts (#302) * 
accuracy testing script for gradient descent kernels * forgotten return statements * fix in results building Co-authored-by: Benedikt Daurer --- .../diamond_benchmarks/ML_accurracy_test.py | 404 ++++++++++++++++++ .../dls_gradient_descent_kernel_test.py | 11 - 2 files changed, 404 insertions(+), 11 deletions(-) create mode 100644 benchmark/diamond_benchmarks/ML_accurracy_test.py diff --git a/benchmark/diamond_benchmarks/ML_accurracy_test.py b/benchmark/diamond_benchmarks/ML_accurracy_test.py new file mode 100644 index 000000000..a8da654ac --- /dev/null +++ b/benchmark/diamond_benchmarks/ML_accurracy_test.py @@ -0,0 +1,404 @@ +''' +Load real data and prepare an accuracy report of GPU vs numpy +''' + +import h5py +import numpy as np +import csv + +import pycuda.driver as cuda +from pycuda import gpuarray + +from ptypy.accelerate.cuda_pycuda.kernels import GradientDescentKernel +from ptypy.accelerate.base.kernels import GradientDescentKernel as BaseGradientDescentKernel + + +class GradientDescentAccuracyTester: + + datadir = "/dls/science/users/iat69393/gpu-hackathon/test-data-%s/" + rtol = 1e-6 + atol = 1e-6 + headings = ['Kernel', 'Version', 'Iter', 'MATH_TYPE', 'IN/OUT_TYPE', + 'ACC_TYPE', 'Array', 'num_elements', 'num_errors', 'max_relerr', 'max_abserr'] + + def __init__(self): + import sys + np.set_printoptions(threshold=sys.maxsize, linewidth=np.inf) + cuda.init() + self.device = cuda.Device(0) + self.ctx = self.device.make_context() + self.stream = cuda.Stream() + self.results = [] + + def __del__(self): + np.set_printoptions() + self.ctx.pop() + self.ctx.detach() + + def test_make_model(self, name, iter, + math_type={'float', 'double'}, + data_type={'float', 'double'}): + + res = [] + + # Load data + with h5py.File(self.datadir % name + "make_model_%04d.h5" % iter, "r") as f: + aux = f["aux"][:] + addr = f["addr"][:] + + # CPU Kernel + BGDK = BaseGradientDescentKernel(aux, addr.shape[1]) + BGDK.allocate() + BGDK.make_model(aux, addr) + ref = BGDK.npy.Imodel + + # GPU variants + addr_dev = gpuarray.to_gpu(addr) + for d in data_type: + if d == 'float': + aux_dev = gpuarray.to_gpu(aux.astype(np.complex64)) + else: + aux_dev = gpuarray.to_gpu(aux.astype(np.complex128)) + for m in math_type: + # data type will be determined based on aux_dev data type automatically + GDK = GradientDescentKernel( + aux_dev, addr.shape[1], queue=self.stream, math_type=m) + GDK.allocate() + GDK.make_model(aux_dev, addr_dev) + act = GDK.gpu.Imodel.get() + + num, num_mis, max_abs, max_rel = self._calc_diffs(act, ref) + + line = ['make_model', name, iter, d, m, 'N/A', + 'Imodel', num, num_mis, max_rel, max_abs] + print(line) + res.append(line) + + return res + + def test_floating_intensity(self, name, iter, + math_type={'float', 'double'}, + data_type={'float', 'double'}, + acc_type={'float', 'double'}): + + # note that this is actually calling 4 kernels: + # - floating_intensity_cuda_step1 + # - error_reduce_cuda (2x) + # - floating_intensity_cuda_step2 + + res = [] + + # Load data + with h5py.File(self.datadir % name + "floating_intensities_%04d.h5" % iter, "r") as f: + w = f["w"][:] + addr = f["addr"][:] + I = f["I"][:] + fic = f["fic"][:] + Imodel = f["Imodel"][:] + with h5py.File(self.datadir % name + "make_model_%04d.h5" % iter, "r") as f: + aux = f["aux"][:] + + # CPU Kernel + ficref = np.copy(fic) + Iref = np.copy(Imodel) + BGDK = BaseGradientDescentKernel(aux, addr.shape[1]) + BGDK.allocate() + BGDK.npy.Imodel = Iref + BGDK.floating_intensity(addr, w, I, ficref) # modifies fic, Imodel + Iref = 
BGDK.npy.Imodel + + addr_dev = gpuarray.to_gpu(addr) + for d in data_type: + for m in math_type: + for a in acc_type: + if d == 'float': + aux_dev = gpuarray.to_gpu(aux.astype(np.complex64)) + I_dev = gpuarray.to_gpu(I.astype(np.float32)) + fic_dev = gpuarray.to_gpu(fic.astype(np.float32)) + w_dev = gpuarray.to_gpu(w.astype(np.float32)) + Imodel_dev = gpuarray.to_gpu(Imodel.astype(np.float32)) + else: + aux_dev = gpuarray.to_gpu(aux.astype(np.complex128)) + I_dev = gpuarray.to_gpu(I.astype(np.float64)) + fic_dev = gpuarray.to_gpu(fic.astype(np.float64)) + w_dev = gpuarray.to_gpu(w.astype(np.float64)) + Imodel_dev = gpuarray.to_gpu(Imodel.astype(np.float64)) + + # GPU kernel + GDK = GradientDescentKernel( + aux_dev, addr.shape[1], accumulate_type=a, math_type=m, queue=self.stream) + GDK.allocate() + GDK.gpu.Imodel = Imodel_dev + GDK.floating_intensity(addr_dev, w_dev, I_dev, fic_dev) + + Iact = GDK.gpu.Imodel.get() + fact = fic_dev.get() + + num, num_mis, max_abs, max_rel = self._calc_diffs( + Iact, Iref) + line = ['floating_intensity', name, iter, d, m, + a, 'Imodel', num, num_mis, max_rel, max_abs] + print(line) + res.append(line) + + num, num_mis, max_abs, max_rel = self._calc_diffs( + fact, ficref) + line = ['floating_intensity', name, iter, d, m, + a, 'fic', num, num_mis, max_rel, max_abs] + print(line) + res.append(line) + + return res + + def test_main_and_error_reduce(self, name, iter, + math_type={'float', 'double'}, + data_type={'float', 'double'}, + acc_type={'float', 'double'}): + + res = [] + + # Load data + with h5py.File(self.datadir % name + "main_%04d.h5" % iter, "r") as f: + aux = f["aux"][:] + addr = f["addr"][:] + w = f["w"][:] + I = f["I"][:] + # Load data + with h5py.File(self.datadir % name + "error_reduce_%04d.h5" % iter, "r") as f: + err_phot = f["err_phot"][:] + + # CPU Kernel + auxref = np.copy(aux) + errref = np.copy(err_phot) + BGDK = BaseGradientDescentKernel(aux, addr.shape[1]) + BGDK.allocate() + BGDK.main(auxref, addr, w, I) + BGDK.error_reduce(addr, errref) + LLerrref = BGDK.npy.LLerr + + addr_dev = gpuarray.to_gpu(addr) + for d in data_type: + for m in math_type: + for a in acc_type: + if d == 'float': + aux_dev = gpuarray.to_gpu(aux.astype(np.complex64)) + I_dev = gpuarray.to_gpu(I.astype(np.float32)) + w_dev = gpuarray.to_gpu(w.astype(np.float32)) + err_phot_dev = gpuarray.to_gpu( + err_phot.astype(np.float32)) + else: + aux_dev = gpuarray.to_gpu(aux.astype(np.complex128)) + I_dev = gpuarray.to_gpu(I.astype(np.float64)) + w_dev = gpuarray.to_gpu(w.astype(np.float64)) + err_phot_dev = gpuarray.to_gpu( + err_phot.astype(np.float64)) + + # GPU kernel + GDK = GradientDescentKernel( + aux_dev, addr.shape[1], accumulate_type=a, math_type=m) + GDK.allocate() + GDK.main(aux_dev, addr_dev, w_dev, I_dev) + GDK.error_reduce(addr_dev, err_phot_dev) + + num, num_mis, max_abs, max_rel = self._calc_diffs( + auxref, aux_dev.get()) + line = ['main_and_error_reduce', name, iter, d, + m, a, 'aux', num, num_mis, max_rel, max_abs] + print(line) + res.append(line) + + num, num_mis, max_abs, max_rel = self._calc_diffs( + LLerrref, GDK.gpu.LLerr.get()) + line = ['main_and_error_reduce', name, iter, d, + m, a, 'LLerr', num, num_mis, max_rel, max_abs] + print(line) + res.append(line) + + num, num_mis, max_abs, max_rel = self._calc_diffs( + errref, err_phot_dev.get()) + line = ['main_and_error_reduce', name, iter, d, m, + a, 'err_phot', num, num_mis, max_rel, max_abs] + print(line) + res.append(line) + + return res + + def test_make_a012(self, name, iter, + math_type={'float', 
'double'}, + data_type={'float', 'double'}, + acc_type={'float', 'double'}): + + # Reduce the array size to make the tests run faster + Nmax = 10 + Ymax = 128 + Xmax = 128 + + res = [] + + # Load data + with h5py.File(self.datadir % name + "make_a012_%04d.h5" % iter, "r") as g: + addr = g["addr"][:Nmax] + I = g["I"][:Nmax, :Ymax, :Xmax] + b_f = g["f"][:Nmax, :Ymax, :Xmax] + b_a = g["a"][:Nmax, :Ymax, :Xmax] + b_b = g["b"][:Nmax, :Ymax, :Xmax] + fic = g["fic"][:Nmax] + with h5py.File(self.datadir % name + "make_model_%04d.h5" % iter, "r") as h: + aux = h["aux"][:Nmax, :Ymax, :Xmax] + + # CPU Kernel + BGDK = BaseGradientDescentKernel(aux, addr.shape[1]) + BGDK.allocate() + BGDK.make_a012(b_f, b_a, b_b, addr, I, fic) + Imodelref = BGDK.npy.Imodel + LLerrref = BGDK.npy.LLerr + LLdenref = BGDK.npy.LLden + + addr_dev = gpuarray.to_gpu(addr) + for d in data_type: + for m in math_type: + for a in acc_type: + if d == 'float': + aux_dev = gpuarray.to_gpu(aux.astype(np.complex64)) + I_dev = gpuarray.to_gpu(I.astype(np.float32)) + b_f_dev = gpuarray.to_gpu(b_f.astype(np.complex64)) + b_a_dev = gpuarray.to_gpu(b_a.astype(np.complex64)) + b_b_dev = gpuarray.to_gpu(b_b.astype(np.complex64)) + fic_dev = gpuarray.to_gpu(fic.astype(np.float32)) + else: + aux_dev = gpuarray.to_gpu(aux.astype(np.complex128)) + I_dev = gpuarray.to_gpu(I.astype(np.float64)) + b_f_dev = gpuarray.to_gpu(b_f.astype(np.complex128)) + b_a_dev = gpuarray.to_gpu(b_a.astype(np.complex128)) + b_b_dev = gpuarray.to_gpu(b_b.astype(np.complex128)) + fic_dev = gpuarray.to_gpu(fic.astype(np.float64)) + + GDK = GradientDescentKernel(aux_dev, addr.shape[1], queue=self.stream, + math_type=m, accumulate_type=a) + GDK.allocate() + GDK.gpu.Imodel.fill(np.nan) + GDK.gpu.LLerr.fill(np.nan) + GDK.gpu.LLden.fill(np.nan) + GDK.make_a012(b_f_dev, b_a_dev, b_b_dev, + addr_dev, I_dev, fic_dev) + + num, num_mis, max_abs, max_rel = self._calc_diffs( + LLerrref, GDK.gpu.LLerr.get()) + line = ['make_a012', name, iter, d, m, a, + 'LLerr', num, num_mis, max_rel, max_abs] + print(line) + res.append(line) + + num, num_mis, max_abs, max_rel = self._calc_diffs( + LLdenref, GDK.gpu.LLden.get()) + line = ['make_a012', name, iter, d, m, a, + 'LLden', num, num_mis, max_rel, max_abs] + print(line) + res.append(line) + + num, num_mis, max_abs, max_rel = self._calc_diffs( + Imodelref, GDK.gpu.Imodel.get()) + line = ['make_a012', name, iter, d, m, a, + 'Imodel', num, num_mis, max_rel, max_abs] + print(line) + res.append(line) + + return res + + def test_fill_b(self, name, iter, + math_type={'float', 'double'}, + data_type={'float', 'double'}, + acc_type={'float', 'double'}): + + res = [] + + # Load data + + Nmax = 10 + Ymax = 128 + Xmax = 128 + + with h5py.File(self.datadir % name + "fill_b_%04d.h5" % iter, "r") as f: + w = f["w"][:Nmax, :Ymax, :Xmax] + addr = f["addr"][:] + B = f["B"][:] + Brenorm = f["Brenorm"][...] 
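+            # A0, A1, A2 hold the saved Imodel, LLerr and LLden buffers; they seed the kernel state before fill_b is called below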
+ A0 = f["A0"][:Nmax, :Ymax, :Xmax] + A1 = f["A1"][:Nmax, :Ymax, :Xmax] + A2 = f["A2"][:Nmax, :Ymax, :Xmax] + with h5py.File(self.datadir % name + "make_model_%04d.h5" % iter, "r") as f: + aux = f["aux"][:Nmax, :Ymax, :Xmax] + + # CPU Kernel + Bref = np.copy(B) + BGDK = BaseGradientDescentKernel(aux, addr.shape[1]) + BGDK.allocate() + BGDK.npy.Imodel = A0 + BGDK.npy.LLerr = A1 + BGDK.npy.LLden = A2 + BGDK.fill_b(addr, Brenorm, w, Bref) + + addr_dev = gpuarray.to_gpu(addr) + for d in data_type: + for m in math_type: + for a in acc_type: + if d == 'float': + aux_dev = gpuarray.to_gpu(aux.astype(np.complex64)) + w_dev = gpuarray.to_gpu(w.astype(np.float32)) + B_dev = gpuarray.to_gpu(B.astype(np.float32)) + A0_dev = gpuarray.to_gpu(A0.astype(np.float32)) + A1_dev = gpuarray.to_gpu(A1.astype(np.float32)) + A2_dev = gpuarray.to_gpu(A2.astype(np.float32)) + else: + aux_dev = gpuarray.to_gpu(aux.astype(np.complex128)) + w_dev = gpuarray.to_gpu(w.astype(np.float64)) + B_dev = gpuarray.to_gpu(B.astype(np.float64)) + A0_dev = gpuarray.to_gpu(A0.astype(np.float64)) + A1_dev = gpuarray.to_gpu(A1.astype(np.float64)) + A2_dev = gpuarray.to_gpu(A2.astype(np.float64)) + + GDK = GradientDescentKernel( + aux_dev, addr.shape[1], queue=self.stream, math_type=m, accumulate_type=a) + GDK.allocate() + GDK.gpu.Imodel = A0_dev + GDK.gpu.LLerr = A1_dev + GDK.gpu.LLden = A2_dev + GDK.fill_b(addr_dev, Brenorm, w_dev, B_dev) + + num, num_mis, max_abs, max_rel = self._calc_diffs( + Bref, B_dev.get()) + line = ['fill_b', name, iter, d, m, a, + 'B', num, num_mis, max_rel, max_abs] + print(line) + res.append(line) + + return res + + def _calc_diffs(self, act, ref): + diffs = np.abs(ref - act) + max_abs = np.max(diffs[:]) + aref = np.abs(ref[:]) + max_rel = np.max( + np.divide(diffs[:], aref, out=np.zeros_like(diffs[:]), where=aref > 0)) + num_mis = np.count_nonzero(diffs[:] > self.atol + self.rtol * aref) + num = np.prod(ref.shape) + + return num, num_mis, max_abs, max_rel + + +tester = GradientDescentAccuracyTester() +print(tester.headings) + +res = [tester.headings] +for ver in [("base", 10), ("regul", 50), ("floating", 0)]: + res += tester.test_make_model(*ver) + res += tester.test_floating_intensity(*ver) + res += tester.test_main_and_error_reduce(*ver) + res += tester.test_make_a012(*ver) + res += tester.test_fill_b(*ver) + +with open('ML_accuracy_test_results.csv', 'w', newline='') as f: + writer = csv.writer(f) + writer.writerows(res) + +print('Done.') diff --git a/test/accelerate_tests/cuda_pycuda_tests/dls_tests/dls_gradient_descent_kernel_test.py b/test/accelerate_tests/cuda_pycuda_tests/dls_tests/dls_gradient_descent_kernel_test.py index afa76eeca..f62834e2e 100644 --- a/test/accelerate_tests/cuda_pycuda_tests/dls_tests/dls_gradient_descent_kernel_test.py +++ b/test/accelerate_tests/cuda_pycuda_tests/dls_tests/dls_gradient_descent_kernel_test.py @@ -186,17 +186,6 @@ def test_make_a012_UNITY(self, name, iter): b_dev = gpuarray.to_gpu(b) fic_dev = gpuarray.to_gpu(fic) - # double versions - # aux_dbl = aux.astype(np.complex128) - # I_dbl = I.astype(np.float64) - # f_dbl = f.astype(np.complex128) - # a_dbl = a.astype(np.complex128) - # b_dbl = b.astype(np.complex128) - # fic_dbl = fic.astype(np.float64) - # BGDK = BaseGradientDescentKernel(aux_dbl, addr.shape[1]) - # BGDK.allocate() - # BGDK.make_a012(f_dbl, a_dbl, b_dbl, addr, I_dbl, fic_dbl) - # CPU Kernel BGDK = BaseGradientDescentKernel(aux, addr.shape[1]) BGDK.allocate() From 3705653396e4824751d6110bf7c45e8ce45ac932 Mon Sep 17 00:00:00 2001 From: Jorg Lotze 
Date: Wed, 10 Mar 2021 10:35:29 +0000 Subject: [PATCH 29/56] Gpu hackathon: crop pad (#303) * adding a 4D test case for crop-pad * crop/pad GPU tests are passing * Integrated crop_pad into propagator, simple tests passing * Added tests for crop/pad, refactored ArrayUtilsKernel. * adding BDIM_X/BDIM_Y to other uses of the error_reduce kernel * conditionally enable -std=c++14 flag depending on CUDA version Co-authored-by: Benedikt Daurer Co-authored-by: Bjoern Enders --- ptypy/accelerate/cuda_pycuda/__init__.py | 8 +- ptypy/accelerate/cuda_pycuda/array_utils.py | 120 +++++++++++++++++- ptypy/accelerate/cuda_pycuda/cuda/fill3D.cu | 60 +++++++++ .../cuda_pycuda/engines/DM_pycuda.py | 7 +- ptypy/accelerate/cuda_pycuda/kernels.py | 43 ++++++- .../cuda_pycuda_tests/array_utils_test.py | 66 ++++++++-- .../propagation_kernel_test.py | 52 +++++++- 7 files changed, 325 insertions(+), 31 deletions(-) create mode 100644 ptypy/accelerate/cuda_pycuda/cuda/fill3D.cu diff --git a/ptypy/accelerate/cuda_pycuda/__init__.py b/ptypy/accelerate/cuda_pycuda/__init__.py index 9daee89e3..d78f1cb80 100644 --- a/ptypy/accelerate/cuda_pycuda/__init__.py +++ b/ptypy/accelerate/cuda_pycuda/__init__.py @@ -4,7 +4,13 @@ import os # debug_options = [] #debug_options = ['-O0', '-G', '-g', '-std=c++11', '--keep'] -debug_options = ['-O3', '-DNDEBUG', '-std=c++11', '-lineinfo'] # release mode flags +debug_options = ['-O3', '-DNDEBUG', '-lineinfo'] # release mode flags + +# C++14 support was added with CUDA 9, so we only enable the flag there +if cuda.get_version()[0] >= 9: + debug_options += ['-std=c++14'] +else: + debug_options += ['-std=c++11'] context = None queue = None diff --git a/ptypy/accelerate/cuda_pycuda/array_utils.py b/ptypy/accelerate/cuda_pycuda/array_utils.py index e3b97657a..14dd05532 100644 --- a/ptypy/accelerate/cuda_pycuda/array_utils.py +++ b/ptypy/accelerate/cuda_pycuda/array_utils.py @@ -21,10 +21,6 @@ def __init__(self, acc_dtype=np.float64, queue=None): 'ACC_TYPE': 'double' if acc_dtype==np.float64 else 'float', 'BDIM_X': 1024 }) - self.transpose_cuda = load_kernel("transpose", { - 'DTYPE': 'int', - 'BDIM': 16 - }) self.Ctmp = None def dot(self, A, B, out=None): @@ -62,6 +58,18 @@ def dot(self, A, B, out=None): return out + def norm2(self, A, out=None): + return self.dot(A, A, out) + +class TransposeKernel: + + def __init__(self, queue=None): + self.queue = queue + self.transpose_cuda = load_kernel("transpose", { + 'DTYPE': 'int', + 'BDIM': 16 + }) + def transpose(self, input, output): # only for int at the moment (addr array), and 2D (reshape pls) if len(input.shape) != 2: @@ -82,8 +90,108 @@ def transpose(self, input, output): self.transpose_cuda(input, output, np.int32(width), np.int32(height), block=blk, grid=grd, stream=self.queue) - def norm2(self, A, out=None): - return self.dot(A, A, out) + + +class CropPadKernel: + + def __init__(self, queue=None): + self.queue = queue + # we lazy-load this depending on the data types we get + self.fill3D_cuda = {} + + def fill3D(self, A, B, offset=[0, 0, 0]): + """ + Fill 3-dimensional array A with B. + """ + if A.ndim < 3 or B.ndim < 3: + raise ValueError('Input arrays must each be at least 3D') + assert A.ndim == B.ndim, "Input and Output must have the same number of dimensions." + ash = A.shape + bsh = B.shape + misfit = np.array(bsh) - np.array(ash) + assert not misfit[:-3].any(), "Input and Output must have the same shape everywhere but the last three axes." 
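+        # Clamp the (possibly negative) offset to valid start indices in A and B, then work out the overlapping extent along the last three axes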
+ + Alim = np.array(A.shape[-3:]) + Blim = np.array(B.shape[-3:]) + off = np.array(offset) + Ao = off.copy() + Ao[Ao < 0] = 0 + Bo = -off.copy() + Bo[Bo < 0] = 0 + assert (Bo < Blim).all() and (Ao < Alim).all(), "At least one dimension lacks overlap" + Ao = Ao.astype(np.int32) + Bo = Bo.astype(np.int32) + lengths = np.array([ + min(off[0] + Blim[0], Alim[0]) - Ao[0], + min(off[1] + Blim[1], Alim[1]) - Ao[1], + min(off[2] + Blim[2], Alim[2]) - Ao[2], + ], dtype=np.int32) + lengths2 = np.array([ + min(Alim[0] - off[0], Blim[0]) - Bo[0], + min(Alim[1] - off[1], Blim[1]) - Bo[1], + min(Alim[2] - off[2], Blim[2]) - Bo[2], + ], dtype=np.int32) + assert (lengths == lengths2).all(), "left and right lenghts are not matching" + batch = int(np.prod(A.shape[:-3])) + + # lazy loading depending on data type + + def map_type(dt): + if dt == np.float32: + return 'float' + elif dt == np.float64: + return 'double' + elif dt == np.complex64: + return 'complex' + elif dt == np.complex128: + return 'complex' + elif dt == np.int32: + return 'int' + elif dt == np.int64: + return 'long long' + else: + raise ValueError('No mapping for {}'.format(dt)) + + version = '{},{}'.format(map_type(B.dtype), map_type(A.dtype)) + if version not in self.fill3D_cuda: + self.fill3D_cuda[version] = load_kernel("fill3D", { + 'IN_TYPE': map_type(B.dtype), + 'OUT_TYPE': map_type(A.dtype) + }) + bx = by = 32 + self.fill3D_cuda[version]( + A, B, + np.int32(A.shape[-3]), np.int32(A.shape[-2]), np.int32(A.shape[-1]), + np.int32(B.shape[-3]), np.int32(B.shape[-2]), np.int32(B.shape[-1]), + Ao[0], Ao[1], Ao[2], + Bo[0], Bo[1], Bo[2], + lengths[0], lengths[1], lengths[2], + block=(int(bx), int(by), int(1)), + grid=( + int((lengths[2] + bx - 1)//bx), + int((lengths[1] + by - 1)//by), + int(batch)), + stream=self.queue + ) + + + def crop_pad_2d_simple(self, A, B): + """ + Places B in A centered around the last two axis. A and B must be of the same shape + anywhere but the last two dims. + """ + assert A.ndim >= 2, "Arrays must have more than 2 dimensions." + assert A.ndim == B.ndim, "Input and Output must have the same number of dimensions." + misfit = np.array(A.shape) - np.array(B.shape) + assert not misfit[:-2].any(), "Input and Output must have the same shape everywhere but the last two axes." + if A.ndim == 2: + A = A.reshape((1,) + A.shape) + if B.ndim == 2: + B = B.reshape((1,) + B.shape) + a1, a2 = A.shape[-2:] + b1, b2 = B.shape[-2:] + offset = [0, a1 // 2 - b1 // 2, a2 // 2 - b2 // 2] + self.fill3D(A, B, offset) class DerivativesKernel: diff --git a/ptypy/accelerate/cuda_pycuda/cuda/fill3D.cu b/ptypy/accelerate/cuda_pycuda/cuda/fill3D.cu new file mode 100644 index 000000000..c3f03d8ca --- /dev/null +++ b/ptypy/accelerate/cuda_pycuda/cuda/fill3D.cu @@ -0,0 +1,60 @@ +/** fill3D kernel. 
+ * + * Data types: + * - IN_TYPE: the data type for the inputs + * - OUT_TYPE: data type for outputs + */ + +#include +#include +using thrust::complex; + +extern "C" __global__ void fill3D( + OUT_TYPE* A, + const IN_TYPE* B, + // final dimensions of A/B in [z, y, x] + int A_Z, + int A_Y, + int A_X, + int B_Z, + int B_Y, + int B_X, + // offsets to start reading/writing + int Ao_z, + int Ao_y, + int Ao_x, + int Bo_z, + int Bo_y, + int Bo_x, + // lengths to copy + int len_z, + int len_y, + int len_x + ) +{ + // We use the following strategy: + // - BlockIdx.z for the batch (first dims combined if 4D+) + // - blockDim.z = 1 + // - multiple blocks are used across y and x dimensions + // - we loop over z dimension within the thread block + int batch = blockIdx.z; + int ix = threadIdx.x + blockIdx.x * blockDim.x; + int iy = threadIdx.y + blockIdx.y * blockDim.y; + + if (ix >= len_x || iy >= len_y) + return; + + // offset for current batch (4D+ dimension) + A += batch * A_X * A_Y * A_Z; + B += batch * B_X * B_Y * B_Z; + + // offset for start position in each dimension of the last 3 + A += Ao_z * A_Y * A_X + Ao_y * A_X + Ao_x; + B += Bo_z * B_Y * B_X + Bo_y * B_X + Bo_x; + + // copy data + for (int iz = 0; iz < len_z; ++iz) { + A[iz * A_Y * A_X + iy * A_X + ix] = + B[iz * B_Y * B_X + iy * B_X + ix]; + } +} \ No newline at end of file diff --git a/ptypy/accelerate/cuda_pycuda/engines/DM_pycuda.py b/ptypy/accelerate/cuda_pycuda/engines/DM_pycuda.py index 1206b887e..63503d608 100644 --- a/ptypy/accelerate/cuda_pycuda/engines/DM_pycuda.py +++ b/ptypy/accelerate/cuda_pycuda/engines/DM_pycuda.py @@ -21,7 +21,7 @@ from ptypy.accelerate.base import address_manglers from .. import get_context from ..kernels import FourierUpdateKernel, AuxiliaryWaveKernel, PoUpdateKernel, PositionCorrectionKernel, PropagationKernel -from ..array_utils import ArrayUtilsKernel, GaussianSmoothingKernel +from ..array_utils import ArrayUtilsKernel, GaussianSmoothingKernel, TransposeKernel from ..mem_utils import make_pagelocked_paired_arrays as mppa MPI = parallel.size > 1 @@ -132,6 +132,9 @@ def _setup_kernels(self): logger.info("Setting up ArrayUtilsKernel") kern.AUK = ArrayUtilsKernel(queue=self.queue) + logger.info("Setting up TransposeKernel") + kern.TK = TransposeKernel(queue=self.queue) + logger.info("Setting up PropagationKernel") kern.PROP = PropagationKernel(aux, geo.propagator, self.queue, self.p.fft_lib) kern.PROP.allocate() @@ -329,7 +332,7 @@ def engine_iterate(self, num=1): if use_tiles: s1 = addr.shape[0] * addr.shape[1] s2 = addr.shape[2] * addr.shape[3] - AUK.transpose(addr.reshape(s1, s2), prep.addr2_gpu.reshape(s2, s1)) + kern.TK.transpose(addr.reshape(s1, s2), prep.addr2_gpu.reshape(s2, s1)) self.curiter += 1 queue.synchronize() diff --git a/ptypy/accelerate/cuda_pycuda/kernels.py b/ptypy/accelerate/cuda_pycuda/kernels.py index 025ab7fe9..93500168c 100644 --- a/ptypy/accelerate/cuda_pycuda/kernels.py +++ b/ptypy/accelerate/cuda_pycuda/kernels.py @@ -3,6 +3,7 @@ from pycuda import gpuarray from ptypy.utils.verbose import log, logger from . 
import load_kernel +from .array_utils import CropPadKernel from ..base import kernels as ab from ..base.kernels import Adict @@ -39,18 +40,46 @@ def allocate(self): from ptypy.accelerate.cuda_pycuda.fft import FFT if self.prop_type == 'farfield': - self._fft1 = FFT(aux, self.queue, + + self._do_crop_pad = (self._p.crop_pad != 0).any() + if self._do_crop_pad: + self._tmp = np.zeros(aux.shape + self._p.crop_pad, dtype=aux.dtype) + self._CPK = CropPadKernel(queue=self._queue) + else: + self._tmp = aux + + self._fft1 = FFT(self._tmp, self.queue, pre_fft=self._p.pre_fft, post_fft=self._p.post_fft, symmetric=True, forward=True) - self._fft2 = FFT(aux, self.queue, + self._fft2 = FFT(self._tmp, self.queue, pre_fft=self._p.pre_ifft, post_fft=self._p.post_ifft, symmetric=True, forward=False) - self.fw = self._fft1.ft - self.bw = self._fft2.ift + if self._do_crop_pad: + self._tmp = gpuarray.to_gpu(self._tmp) + + def _fw(x,y): + if self._do_crop_pad: + self._CPK.crop_pad_2d_simple(self._tmp, x) + self._fft1.ft(self._tmp, self._tmp) + self._CPK.crop_pad_2d_simple(y, self._tmp) + else: + self._fft1.ft(x,y) + + def _bw(x,y): + if self._do_crop_pad: + self._CPK.crop_pad_2d_simple(self._tmp, x) + self._fft2.ift(self._tmp, self._tmp) + self._CPK.crop_pad_2d_simple(y, self._tmp) + else: + self._fft2.ift(x,y) + + self.fw = _fw + self.bw = _bw + elif self.prop_type == "nearfield": self._fft1 = FFT(aux, self.queue, post_fft=self._p.kernel, @@ -116,7 +145,9 @@ def __init__(self, aux, nmodes=1, queue_thread=None, accumulate_type='float', ma self.error_reduce_cuda = load_kernel("error_reduce", { 'IN_TYPE': 'float', 'OUT_TYPE': 'float', - 'ACC_TYPE': self.accumulate_type + 'ACC_TYPE': self.accumulate_type, + 'BDIM_X': 32, + 'BDIM_Y': 32, }) self.fourier_update_cuda = None self.log_likelihood_cuda = load_kernel("log_likelihood", { @@ -814,6 +845,8 @@ def __init__(self, aux, nmodes, queue_thread=None, math_type='float', accumulate self.error_reduce_cuda = load_kernel("error_reduce", { 'IN_TYPE': 'float', 'OUT_TYPE': 'float', + 'BDIM_X': 32, + 'BDIM_Y': 32, 'ACC_TYPE': self.accumulate_type }) self.build_aux_pc_cuda = load_kernel("build_aux_position_correction", { diff --git a/test/accelerate_tests/cuda_pycuda_tests/array_utils_test.py b/test/accelerate_tests/cuda_pycuda_tests/array_utils_test.py index ab3f78d7e..611c67759 100644 --- a/test/accelerate_tests/cuda_pycuda_tests/array_utils_test.py +++ b/test/accelerate_tests/cuda_pycuda_tests/array_utils_test.py @@ -88,7 +88,7 @@ def test_transpose_2D(self): out_dev = gpuarray.empty((3,5), dtype=np.int32) ## Act - AU = gau.ArrayUtilsKernel() + AU = gau.TransposeKernel() AU.transpose(inp_dev, out_dev) ## Assert @@ -103,7 +103,7 @@ def test_transpose_2D_large(self): out_dev = gpuarray.empty((61,137), dtype=np.int32) ## Act - AU = gau.ArrayUtilsKernel() + AU = gau.TransposeKernel() AU.transpose(inp_dev, out_dev) ## Assert @@ -118,7 +118,7 @@ def test_transpose_4D(self): out_dev = gpuarray.empty((5, 3, 250, 3), dtype=np.int32) ## Act - AU = gau.ArrayUtilsKernel() + AU = gau.TransposeKernel() AU.transpose(inp_dev.reshape(750, 15), out_dev.reshape(15, 750)) ## Assert @@ -270,7 +270,7 @@ def test_complex_gaussian_filter_2d_batched(self): np.testing.assert_allclose(out_exp, out, rtol=1e-4) - def test_crop_pad_simple_1(self): + def test_crop_pad_simple_1_UNITY(self): # pad, integer, 2D B = np.indices((4, 4), dtype=np.int).sum(0) A = np.zeros((6, 6), dtype=B.dtype) @@ -279,12 +279,13 @@ def test_crop_pad_simple_1(self): # Act au.crop_pad_2d_simple(A, B) - 
gau.crop_pad_2d_simple(A_dev, B_dev) + k = gau.CropPadKernel(queue=self.stream) + k.crop_pad_2d_simple(A_dev, B_dev) # Assert - np.testing.assert_all_close(A_dev.get(), A, rtol=1e-6, atol=1e-6) + np.testing.assert_allclose(A, A_dev.get(), rtol=1e-6, atol=1e-6) - def test_crop_pad_simple_2(self): + def test_crop_pad_simple_2_UNITY(self): # crop, float, 3D B = np.indices((4, 4), dtype=np.float32) A = np.zeros((2, 2, 2), dtype=B.dtype) @@ -293,12 +294,14 @@ def test_crop_pad_simple_2(self): # Act au.crop_pad_2d_simple(A, B) - gau.crop_pad_2d_simple(A_dev, B_dev) + k = gau.CropPadKernel(queue=self.stream) + k.crop_pad_2d_simple(A_dev, B_dev) + # Assert - np.testing.assert_all_close(A_dev.get(), A, rtol=1e-6, atol=1e-6) + np.testing.assert_allclose(A, A_dev.get(), rtol=1e-6, atol=1e-6) - def test_crop_pad_simple_3(self): + def test_crop_pad_simple_3_UNITY(self): # crop/pad, complex, 3D B = np.indices((4, 3), dtype=np.complex64) B = np.indices((4, 3), dtype=np.complex64) + 1j * B[::-1, :, :] @@ -308,7 +311,46 @@ def test_crop_pad_simple_3(self): # Act au.crop_pad_2d_simple(A, B) - gau.crop_pad_2d_simple(A_dev, B_dev) + k = gau.CropPadKernel(queue=self.stream) + k.crop_pad_2d_simple(A_dev, B_dev) + + # Assert + np.testing.assert_allclose(A, A_dev.get(), rtol=1e-6, atol=1e-6) + + def test_crop_pad_simple_difflike_UNITY(self): + np.random.seed(1983) + # crop/pad, 4D + D = np.random.randint(0, 3000, (100,256,256)).astype(np.float32) + A = np.zeros((100,260,260), dtype=D.dtype) + B = np.zeros((100,250,250), dtype=D.dtype) + B_dev = gpuarray.to_gpu(B) + A_dev = gpuarray.to_gpu(A) + D_dev = gpuarray.to_gpu(D) + + # Act + au.crop_pad_2d_simple(A, D) + au.crop_pad_2d_simple(B, D) + k = gau.CropPadKernel(queue=self.stream) + k.crop_pad_2d_simple(A_dev, D_dev) + k.crop_pad_2d_simple(B_dev, D_dev) + + # Assert + np.testing.assert_allclose(A, A_dev.get(), rtol=1e-6, atol=1e-6) + np.testing.assert_allclose(B, B_dev.get(), rtol=1e-6, atol=1e-6) + + def test_crop_pad_simple_oblike_UNITY(self): + np.random.seed(1983) + # crop/pad, 4D + B = np.random.rand(2,1230,1434).astype(np.complex64) \ + +2j * np.pi * np.random.randn(2,1230,1434).astype(np.complex64) + A = np.ones((2,1000,1500), dtype=B.dtype) + B_dev = gpuarray.to_gpu(B) + A_dev = gpuarray.to_gpu(A) + + # Act + au.crop_pad_2d_simple(A, B) + k = gau.CropPadKernel(queue=self.stream) + k.crop_pad_2d_simple(A_dev, B_dev) # Assert - np.testing.assert_all_close(A_dev.get(), A, rtol=1e-6, atol=1e-6) + np.testing.assert_allclose(A, A_dev.get(), rtol=1e-6, atol=1e-6) diff --git a/test/accelerate_tests/cuda_pycuda_tests/propagation_kernel_test.py b/test/accelerate_tests/cuda_pycuda_tests/propagation_kernel_test.py index 28f576b9e..794a547fd 100644 --- a/test/accelerate_tests/cuda_pycuda_tests/propagation_kernel_test.py +++ b/test/accelerate_tests/cuda_pycuda_tests/propagation_kernel_test.py @@ -23,7 +23,7 @@ class PropagationKernelTest(PyCudaTest): - def set_up_farfield(self,shape): + def set_up_farfield(self,shape, resolution=None): P = Base() P.CType = COMPLEX_TYPE P.Ftype = FLOAT_TYPE @@ -34,6 +34,8 @@ def set_up_farfield(self,shape): g.psize = 24e-6 g.shape = shape g.propagation = "farfield" + if resolution is not None: + g.resolution = resolution G = geometry.Geo(owner=P, pars=g) return G @@ -65,7 +67,8 @@ def test_farfield_propagator_forward_UNITY(self): PropK.allocate() PropK.fw(aux_d, aux_d) - np.testing.assert_allclose(aux, aux_d.get(), atol=1e-06, rtol=5e-5, err_msg="Numpy aux is \n%s, \nbut gpu aux is \n %s, \n " % (repr(aux), repr(aux_d.get()))) + 
np.testing.assert_allclose(aux_d.get(), aux, atol=1e-06, rtol=5e-5, + err_msg="Numpy aux is \n%s, \nbut gpu aux is \n %s, \n " % (repr(aux), repr(aux_d.get()))) def test_farfield_propagator_backward_UNITY(self): # setup @@ -81,7 +84,44 @@ def test_farfield_propagator_backward_UNITY(self): PropK.allocate() PropK.bw(aux_d, aux_d) - np.testing.assert_allclose(aux, aux_d.get(), atol=1e-06, rtol=5e-5, err_msg="Numpy aux is \n%s, \nbut gpu aux is \n %s, \n " % (repr(aux), repr(aux_d.get()))) + np.testing.assert_allclose(aux_d.get(), aux, atol=1e-06, rtol=5e-5, + err_msg="Numpy aux is \n%s, \nbut gpu aux is \n %s, \n " % (repr(aux), repr(aux_d.get()))) + + def test_farfield_propagator_forward_crop_pad_UNITY(self): + # setup + SH = (16,16) + aux = np.zeros((SH), dtype=COMPLEX_TYPE) + aux[5:11,5:11] = 1. + 2j + aux_d = gpuarray.to_gpu(aux) + geo = self.set_up_farfield(SH) + geo = self.set_up_farfield(SH, resolution=0.5*geo.resolution) + + # test + aux = geo.propagator.fw(aux) + PropK = PropagationKernel(aux_d, geo.propagator, queue_thread=self.stream) + PropK.allocate() + PropK.fw(aux_d, aux_d) + + np.testing.assert_allclose(aux_d.get(), aux, atol=1e-06, rtol=5e-5, + err_msg="Numpy aux is \n%s, \nbut gpu aux is \n %s, \n " % (repr(aux), repr(aux_d.get()))) + + def test_farfield_propagator_backward_crop_pad_UNITY(self): + # setup + SH = (16,16) + aux = np.zeros((SH), dtype=COMPLEX_TYPE) + aux[5:11,5:11] = 1. + 2j + aux_d = gpuarray.to_gpu(aux) + geo = self.set_up_farfield(SH) + geo = self.set_up_farfield(SH, resolution=0.5*geo.resolution) + + # test + aux = geo.propagator.bw(aux) + PropK = PropagationKernel(aux_d, geo.propagator, queue_thread=self.stream) + PropK.allocate() + PropK.bw(aux_d, aux_d) + + np.testing.assert_allclose(aux_d.get(), aux, atol=1e-06, rtol=5e-5, + err_msg="Numpy aux is \n%s, \nbut gpu aux is \n %s, \n " % (repr(aux), repr(aux_d.get()))) def test_nearfield_propagator_forward_UNITY(self): # setup @@ -97,7 +137,8 @@ def test_nearfield_propagator_forward_UNITY(self): PropK.allocate() PropK.fw(aux_d, aux_d) - np.testing.assert_allclose(aux, aux_d.get(), atol=1e-06, rtol=5e-5, err_msg="Numpy aux is \n%s, \nbut gpu aux is \n %s, \n " % (repr(aux), repr(aux_d.get()))) + np.testing.assert_allclose(aux_d.get(), aux, atol=1e-06, rtol=5e-5, + err_msg="Numpy aux is \n%s, \nbut gpu aux is \n %s, \n " % (repr(aux), repr(aux_d.get()))) def test_nearfield_propagator_backward_UNITY(self): # setup @@ -113,4 +154,5 @@ def test_nearfield_propagator_backward_UNITY(self): PropK.allocate() PropK.bw(aux_d, aux_d) - np.testing.assert_allclose(aux, aux_d.get(), atol=1e-06, rtol=5e-5, err_msg="Numpy aux is \n%s, \nbut gpu aux is \n %s, \n " % (repr(aux), repr(aux_d.get()))) \ No newline at end of file + np.testing.assert_allclose(aux_d.get(), aux, atol=1e-06, rtol=5e-5, + err_msg="Numpy aux is \n%s, \nbut gpu aux is \n %s, \n " % (repr(aux), repr(aux_d.get()))) \ No newline at end of file From 6a4c203894fb8d88db8b897ced60af84a14a7e7d Mon Sep 17 00:00:00 2001 From: "Benedikt J. 
Daurer" Date: Thu, 18 Mar 2021 10:33:37 +0000 Subject: [PATCH 30/56] WIP: Local douglas rachford algorithm (#304) * Renamed engine to Douglas-Rachford (DR) and added citation * working on tests * Use build_aux kernel to compute pr*ob product * Add unit tests for new kernels * formatting * Added tests for build_exit_alpha_tau * adding prototypes to make tests runnable * fixing reference implementation for alpha_tau test * implementation of build_exit_alpha_tau on GPU * Separate the maximum norm from the main update * Updated tests * First draft of DR_pycuda engine * Properly reading back the errors * kernel + tests for max_abs2 * ob_update_local is working on GPU * Added debugging output * simplified base kernel, we should be able to also simplify the CUDA kernel for max_abs2 * pr_update_local working on GPU * DR pycuda engine is running, but having illegal memory access issues * Define grid by using addr instead of ex * simplified / refactored max_abs2 as independent function * Changed addr in the DR engine * norm is now of type IN_TYPE * DRpycuda engine working now * clean up * more clean up, made exit_error optional * added dls_test for update_local * don't need pbound in DR and can make fourier_error optional * optimised GPU kernels for DR engine, using a thread block per Y dimension * max norm for DR engine needs to sum over modes * ob/pr norm is a single value now * No need for lists, pycuda can do slicing :) * no need anymore for lists when copying errors back * typo * allow changing block dimensions easily from python * adjusting in case of BDIM_Y > 1, we need to return early then * adding fourier_deviation kernel to GPU - tested against fourier_error * fourier_deviation integrated in DR engine * fmag_all_update without pbound * cleaned up and renamed fmag_update_nopbound * Trying a different strategy for shuffling the vieworder * adding a build_aux2 with different parallelisation strategy * build_aux_no_ex with different parallelisation scheme * integrate new build_aux kernels into DR engine * load_kernel supports multiple kernels / file + refactor to keep code DRY * fixing max_abs2 and local updates to aggregate over modes * better parallelisation on the log_likelihood error * avoid extra copy on the CPU for D2H transfers * make vieworder shuffling simpler again * first attempt to DR streaming engine * use random shuffle for vieworder in streaming engine * allow for modes in engine, add templates * Bring DR engines back in sync * Test ob/pr update local with modes * updates to DR engine to fix sizing and transfers * updating benchmark script to new API * made DR work with modes * fixing crash on shutdown due to pagelocked memory * increased MAX_BLOCKS and clean up * Fixed typo Co-authored-by: Jorg Lotze --- .../moonflower_scripts/i14_2.py | 2 + ptypy/accelerate/base/array_utils.py | 6 + .../engines/{DM_local.py => DR_serial.py} | 163 +++++++--- ptypy/accelerate/base/kernels.py | 59 +++- ptypy/accelerate/cuda_pycuda/__init__.py | 13 +- ptypy/accelerate/cuda_pycuda/array_utils.py | 134 +++++--- .../accelerate/cuda_pycuda/cuda/build_aux.cu | 62 +++- .../cuda_pycuda/cuda/build_aux_no_ex.cu | 46 +++ .../cuda_pycuda/cuda/build_exit_alpha_tau.cu | 60 ++++ .../cuda_pycuda/cuda/{delx_mid.cu => delx.cu} | 85 ++++- .../accelerate/cuda_pycuda/cuda/delx_last.cu | 89 ------ ptypy/accelerate/cuda_pycuda/cuda/fill_b.cu | 49 ++- .../cuda_pycuda/cuda/fill_b_reduce.cu | 53 ---- .../cuda_pycuda/cuda/fmag_update_nopbound.cu | 53 ++++ .../cuda_pycuda/cuda/fourier_deviation.cu | 58 ++++ 
.../cuda_pycuda/cuda/log_likelihood.cu | 45 +++ ptypy/accelerate/cuda_pycuda/cuda/max_abs2.cu | 115 +++++++ .../cuda_pycuda/cuda/ob_update_local.cu | 67 ++++ .../cuda_pycuda/cuda/pr_update_local.cu | 71 +++++ .../cuda_pycuda/engines/DM_pycuda.py | 13 - .../cuda_pycuda/engines/DR_pycuda.py | 290 ++++++++++++++++++ .../cuda_pycuda/engines/DR_pycuda_stream.py | 260 ++++++++++++++++ ptypy/accelerate/cuda_pycuda/kernels.py | 278 +++++++++++++++-- ...l.py => minimal_prep_and_run_DR_pycuda.py} | 10 +- templates/minimal_prep_and_run_DR_serial.py | 58 ++++ templates/minimal_prep_and_run_probe_modes.py | 5 +- .../base_tests/auxiliary_wave_kernel_test.py | 108 ++++--- .../base_tests/po_update_kernel_test.py | 211 +++++++++---- .../cuda_pycuda_tests/array_utils_test.py | 32 ++ .../auxiliary_wave_kernel_test.py | 158 +++++++++- .../dls_tests/dls_drpycuda_test.py | 83 +++++ .../fourier_update_kernel_test.py | 186 ++++++++++- .../po_update_kernel_test.py | 171 ++++++++++- 33 files changed, 2680 insertions(+), 413 deletions(-) rename ptypy/accelerate/base/engines/{DM_local.py => DR_serial.py} (70%) create mode 100644 ptypy/accelerate/cuda_pycuda/cuda/build_exit_alpha_tau.cu rename ptypy/accelerate/cuda_pycuda/cuda/{delx_mid.cu => delx.cu} (64%) delete mode 100644 ptypy/accelerate/cuda_pycuda/cuda/delx_last.cu delete mode 100644 ptypy/accelerate/cuda_pycuda/cuda/fill_b_reduce.cu create mode 100644 ptypy/accelerate/cuda_pycuda/cuda/fmag_update_nopbound.cu create mode 100644 ptypy/accelerate/cuda_pycuda/cuda/fourier_deviation.cu create mode 100644 ptypy/accelerate/cuda_pycuda/cuda/max_abs2.cu create mode 100644 ptypy/accelerate/cuda_pycuda/cuda/ob_update_local.cu create mode 100644 ptypy/accelerate/cuda_pycuda/cuda/pr_update_local.cu create mode 100644 ptypy/accelerate/cuda_pycuda/engines/DR_pycuda.py create mode 100644 ptypy/accelerate/cuda_pycuda/engines/DR_pycuda_stream.py rename templates/{minimal_prep_and_run_DM_local.py => minimal_prep_and_run_DR_pycuda.py} (84%) create mode 100644 templates/minimal_prep_and_run_DR_serial.py create mode 100644 test/accelerate_tests/cuda_pycuda_tests/dls_tests/dls_drpycuda_test.py diff --git a/benchmark/diamond_benchmarks/moonflower_scripts/i14_2.py b/benchmark/diamond_benchmarks/moonflower_scripts/i14_2.py index 0c9927ea9..414b785b3 100644 --- a/benchmark/diamond_benchmarks/moonflower_scripts/i14_2.py +++ b/benchmark/diamond_benchmarks/moonflower_scripts/i14_2.py @@ -7,6 +7,8 @@ from ptypy.core import Ptycho from ptypy import utils as u import time +from ptypy.accelerate.cuda_pycuda.engines.DM_pycuda_stream import DM_pycuda_stream +from ptypy.accelerate.cuda_pycuda.engines.DM_pycuda_streams import DM_pycuda_streams import os import getpass diff --git a/ptypy/accelerate/base/array_utils.py b/ptypy/accelerate/base/array_utils.py index 6a7472c19..839b08e70 100644 --- a/ptypy/accelerate/base/array_utils.py +++ b/ptypy/accelerate/base/array_utils.py @@ -17,6 +17,12 @@ def dot(A, B, acc_dtype=np.float64): def norm2(A): return dot(A, A) +def max_abs2(A): + ''' + A has ndim = 3. 
+ compute abs2, sum along first dimension and take maximum along last two dims + ''' + return np.max(np.sum(np.abs(A)**2,axis=0),axis=(-2,-1)) def abs2(input): ''' diff --git a/ptypy/accelerate/base/engines/DM_local.py b/ptypy/accelerate/base/engines/DR_serial.py similarity index 70% rename from ptypy/accelerate/base/engines/DM_local.py rename to ptypy/accelerate/base/engines/DR_serial.py index f93ff21e2..31fc43b95 100644 --- a/ptypy/accelerate/base/engines/DM_local.py +++ b/ptypy/accelerate/base/engines/DR_serial.py @@ -22,19 +22,21 @@ from ptypy.accelerate.base import address_manglers from ptypy.accelerate.base import array_utils as au -__all__ = ['DM_local'] +# for debugging +import h5py, sys + +__all__ = ['DR_serial'] @register() -class DM_local(PositionCorrectionEngine): +class DR_serial(PositionCorrectionEngine): """ - A local version of the Difference Map engine + An implementation of the Douglas-Rachford algorithm that can be operated like the ePIE algorithm. - Defaults: [name] - default = DM_local + default = DR_serial type = str help = doc = @@ -43,7 +45,7 @@ class DM_local(PositionCorrectionEngine): default = 1 type = float lowlim = 0.0 - help = Difference map tuning parameter, a value of 0 makes it equal to ePIE. + help = Tuning parameter, a value of 0 makes it equal to ePIE. [tau] default = 1 @@ -74,17 +76,31 @@ class DM_local(PositionCorrectionEngine): lowlim = 0 help = Normalise probe power according to data - [fourier_power_bound] - default = None - type = float - help = If rms error of model vs diffraction data is smaller than this value, Fourier constraint is met - doc = For Poisson-sampled data, the theoretical value for this parameter is 1/4. Set this value higher for noisy data. - [compute_log_likelihood] default = True type = bool help = A switch for computing the log-likelihood error (this can impact the performance of the engine) + [compute_exit_error] + default = False + type = bool + help = A switch for computing the exitwave error (this can impact the performance of the engine) + + [compute_fourier_error] + default = False + type = bool + help = A switch for computing the fourier error (this can impact the performance of the engine) + + [debug] + default = None + type = str + help = For debugging purposes, dump arrays into given directory + + [debug_iter] + default = 0 + type = int + help = For debugging purposes, dump arrays at this iteration + """ SUPPORTED_MODELS = [Full, Vanilla, Bragg3dModel, BlockVanilla, BlockFull] @@ -93,11 +109,10 @@ def __init__(self, ptycho_parent, pars=None): """ Local difference map reconstruction engine. """ - super(DM_local, self).__init__(ptycho_parent, pars) + super(DR_serial, self).__init__(ptycho_parent, pars) # Instance attributes self.error = None - self.pbound = None self.mean_power = None # keep track of timings @@ -109,11 +124,22 @@ def __init__(self, ptycho_parent, pars=None): self.pr_cfact = {} self.kernels = {} + self.ptycho.citations.add_article( + title='Semi-implicit relaxed Douglas-Rachford algorithm (sDR) for ptychography', + author='Pham et al.', + journal='Opt. Express', + volume=27, + year=2019, + page=31246, + doi='10.1364/OE.27.031246', + comment='The local douglas-rachford reconstruction algorithm', + ) + def engine_initialize(self): """ Prepare for reconstruction. """ - super(DM_local, self).engine_initialize() + super(DR_serial, self).engine_initialize() self.error = [] self._reset_benchmarks() @@ -199,9 +225,7 @@ def engine_prepare(self): # recalculate everything mean_power = 0. 
- self.pbound_scan = {} - for s in self.di.storages.values(): - self.pbound_scan[s.label] = self.p.fourier_power_bound + for s in self.di.storages.values(): mean_power += s.mean_power self.mean_power = mean_power / len(self.di.storages) @@ -216,7 +240,7 @@ def engine_prepare(self): prep.err_phot = np.zeros_like(prep.ma_sum) prep.err_fourier = np.zeros_like(prep.ma_sum) prep.err_exit = np.zeros_like(prep.ma_sum) - + # Unfortunately this needs to be done for all pods, since # the shape of the probe / object was modified. # TODO: possible scaling issue, remove the need for padding @@ -236,8 +260,17 @@ def engine_prepare(self): ob.shape = ob.data.shape # Keep a list of view indices + prep.rng = np.random.default_rng() prep.vieworder = np.arange(prep.addr.shape[0]) + # Modify addresses, copy pa into ea and remove da/ma + prep.addr_ex = np.vstack([prep.addr[:,0,2,0], prep.addr[:,-1,2,0]+1]).T + prep.addr[:,:,2] = prep.addr[:,:,0] + prep.addr[:,:,3:,0] = 0 + + # Reference to ex + prep.ex = self.ex.S[eID].data + # calculate c_facts #cfact = self.p.object_inertia * self.mean_power #self.ob_cfact[oID] = cfact / u.parallel.size @@ -269,38 +302,41 @@ def engine_iterate(self, num=1): FW = kern.FW BW = kern.BW - # global buffers - pbound = self.pbound_scan[prep.label] + # global aux buffer aux = kern.aux - vieworder = prep.vieworder - # references for ob, pr, ex + # references for ob, pr ob = self.ob.S[oID].data pr = self.pr.S[pID].data - ex = self.ex.S[eID].data - # randomly shuffle view order - np.random.shuffle(vieworder) + # shuffle view order + vieworder = prep.vieworder + prep.rng.shuffle(vieworder) # Iterate through views for i in vieworder: # Get local adress and arrays addr = prep.addr[i,None] + ex_from, ex_to = prep.addr_ex[i] + ex = prep.ex[ex_from:ex_to] mag = prep.mag[i,None] ma = prep.ma[i,None] ma_sum = prep.ma_sum[i,None] - err_phot = prep.err_phot[i,None] err_fourier = prep.err_fourier[i,None] err_exit = prep.err_exit[i,None] - ## compute log-likelihood - t1 = time.time() - AWK.build_aux_no_ex(aux, addr, ob, pr) - aux[:] = FW(aux) - FUK.log_likelihood(aux, addr, mag, ma, err_phot) - self.benchmark.F_LLerror += time.time() - t1 + # debugging + if self.p.debug and parallel.master and (self.curiter == self.p.debug_iter): + with h5py.File(self.p.debug + "/before_%04d.h5" %self.curiter, "w") as f: + f["aux"] = aux + f["addr"] = addr + f["ob"] = ob + f["pr"] = pr + f["mag"] = mag + f["ma"] = ma + f["ma_sum"] = ma_sum ## build auxilliary wave t1 = time.time() @@ -314,9 +350,12 @@ def engine_iterate(self, num=1): ## Deviation from measured data t1 = time.time() - FUK.fourier_error(aux, addr, mag, ma, ma_sum) - FUK.error_reduce(addr, err_fourier) - FUK.fmag_all_update(aux, addr, mag, ma, err_fourier, pbound) + if self.p.compute_fourier_error: + FUK.fourier_error(aux, addr, mag, ma, ma_sum) + FUK.error_reduce(addr, err_fourier) + else: + FUK.fourier_deviation(aux, addr, mag) + FUK.fmag_update_nopbound(aux, addr, mag, ma) self.benchmark.C_Fourier_update += time.time() - t1 ## backward FFT @@ -327,14 +366,37 @@ def engine_iterate(self, num=1): ## build exit wave t1 = time.time() AWK.build_exit_alpha_tau(aux, addr, ob, pr, ex, alpha=self.p.alpha, tau=self.p.tau) - FUK.exit_error(aux,addr) - FUK.error_reduce(addr, err_exit) + if self.p.compute_exit_error: + FUK.exit_error(aux,addr) + FUK.error_reduce(addr, err_exit) self.benchmark.E_Build_exit += time.time() - t1 self.benchmark.calls_fourier += 1 ## probe/object rescale - if self.p.rescale_probe: - pr *= np.sqrt(self.mean_power / 
(np.abs(pr)**2).mean()) + #if self.p.rescale_probe: + # pr *= np.sqrt(self.mean_power / (np.abs(pr)**2).mean()) + + # debugging + if self.p.debug and parallel.master and (self.curiter == self.p.debug_iter): + with h5py.File(self.p.debug + "/before_aux_no_ex_%04d.h5" %self.curiter, "w") as f: + f["aux"] = aux + f["addr"] = addr + f["ob"] = ob + f["pr"] = pr + + ## build auxilliary wave (ob * pr product) + t1 = time.time() + AWK.build_aux_no_ex(aux, addr, ob, pr) + self.benchmark.A_Build_aux += time.time() - t1 + + # debugging + if self.p.debug and parallel.master and (self.curiter == self.p.debug_iter): + with h5py.File(self.p.debug + "/ob_update_local_%04d.h5" %self.curiter, "w") as f: + f["aux"] = aux + f["addr"] = addr + f["ob"] = ob + f["pr"] = pr + f["ex"] = ex # object update t1 = time.time() @@ -342,14 +404,33 @@ def engine_iterate(self, num=1): self.benchmark.object_update += time.time() - t1 self.benchmark.calls_object += 1 + # debugging + if self.p.debug and parallel.master and (self.curiter == self.p.debug_iter): + with h5py.File(self.p.debug + "/pr_update_local_%04d.h5" %self.curiter, "w") as f: + f["aux"] = aux + f["addr"] = addr + f["ob"] = ob + f["pr"] = pr + f["ex"] = ex + # probe update t1 = time.time() POK.pr_update_local(addr, pr, ob, ex, aux) self.benchmark.probe_update += time.time() - t1 self.benchmark.calls_probe += 1 + ## compute log-likelihood + if self.p.compute_log_likelihood: + t1 = time.time() + #AWK.build_aux_no_ex(aux, addr, ob, pr) + aux[:] = FW(aux) + FUK.log_likelihood(aux, addr, mag, ma, err_phot) + self.benchmark.F_LLerror += time.time() - t1 + # update errors - errs = np.ascontiguousarray(np.vstack([prep.err_fourier, prep.err_phot, prep.err_exit]).T) + errs = np.ascontiguousarray(np.vstack([np.hstack(prep.err_fourier), + np.hstack(prep.err_phot), + np.hstack(prep.err_exit)]).T) error_dct.update(zip(prep.view_IDs, errs)) self.curiter += 1 diff --git a/ptypy/accelerate/base/kernels.py b/ptypy/accelerate/base/kernels.py index db2ebc64b..9569f882c 100644 --- a/ptypy/accelerate/base/kernels.py +++ b/ptypy/accelerate/base/kernels.py @@ -1,5 +1,6 @@ import numpy as np from ptypy.utils.verbose import logger, log +from .array_utils import max_abs2 class Adict(object): @@ -73,6 +74,28 @@ def fourier_error(self, b_aux, addr, mag, mask, mask_sum): ferr[:] = mask * np.abs(fdev) ** 2 / mask_sum.reshape((maxz, 1, 1)) return + def fourier_deviation(self, b_aux, addr, mag): + # reference shape (write-to shape) + sh = self.fshape + # stopper + maxz = mag.shape[0] + + # batch buffers + fdev = self.npy.fdev[:maxz] + aux = b_aux[:maxz * self.nmodes] + + ## Actual math ## + + # build model from complex fourier magnitudes, summing up + # all modes incoherently + tf = aux.reshape(maxz, self.nmodes, sh[1], sh[2]) + af = np.sqrt((np.abs(tf) ** 2).sum(1)) + + # calculate difference to real data (g_mag) + fdev[:] = af - mag + + return + def error_reduce(self, addr, err_sum): # reference shape (write-to shape) sh = self.fshape @@ -133,6 +156,33 @@ def fmag_all_update(self, b_aux, addr, mag, mask, err_sum, pbound=0.0): aux[:] = (aux.reshape(ish[0] // nmodes, nmodes, ish[1], ish[2]) * fm[:, np.newaxis, :, :]).reshape(ish) return + def fmag_update_nopbound(self, b_aux, addr, mag, mask): + + sh = self.fshape + nmodes = self.nmodes + + # stopper + maxz = mag.shape[0] + + # batch buffers + fdev = self.npy.fdev[:maxz] + aux = b_aux[:maxz * nmodes] + + # write-to shape + ish = aux.shape + + ## Actual math ## + + # local values + fm = np.ones((maxz, sh[1], sh[2]), np.float32) + + af = fdev + 
mag + fm[:] = (1 - mask) + mask * mag / (af + self.denom) + + # upcasting + aux[:] = (aux.reshape(ish[0] // nmodes, nmodes, ish[1], ish[2]) * fm[:, np.newaxis, :, :]).reshape(ish) + return + def log_likelihood(self, b_aux, addr, mag, mask, err_phot): # reference shape (write-to shape) sh = self.fshape @@ -503,28 +553,27 @@ def pr_update_ML(self, addr, pr, ob, ex, fac=2.0): return def ob_update_local(self, addr, ob, pr, ex, aux): - sh = addr.shape flat_addr = addr.reshape(sh[0] * sh[1], sh[2], sh[3]) rows, cols = ex.shape[-2:] + pr_norm = max_abs2(pr) for ind, (prc, obc, exc, mac, dic) in enumerate(flat_addr): - aux[ind,:,:] = pr[prc[0], prc[1]:prc[1] + rows, prc[2]:prc[2] + cols] * \ - ob[obc[0], obc[1]:obc[1] + rows, obc[2]:obc[2] + cols] ob[obc[0], obc[1]:obc[1] + rows, obc[2]:obc[2] + cols] += \ pr[prc[0], prc[1]:prc[1] + rows, prc[2]:prc[2] + cols].conj() * \ (ex[exc[0], exc[1]:exc[1] + rows, exc[2]:exc[2] + cols] - aux[ind,:,:]) / \ - np.max(np.abs(pr[prc[0], prc[1]:prc[1] + rows, prc[2]:prc[2] + cols])**2) + pr_norm return def pr_update_local(self, addr, pr, ob, ex, aux): sh = addr.shape flat_addr = addr.reshape(sh[0] * sh[1], sh[2], sh[3]) rows, cols = ex.shape[-2:] + ob_norm = max_abs2(ob) for ind, (prc, obc, exc, mac, dic) in enumerate(flat_addr): pr[prc[0], prc[1]:prc[1] + rows, prc[2]:prc[2] + cols] += \ ob[obc[0], obc[1]:obc[1] + rows, obc[2]:obc[2] + cols].conj() * \ (ex[exc[0], exc[1]:exc[1] + rows, exc[2]:exc[2] + cols] - aux[ind,:,:]) / \ - np.max(np.abs(ob[obc[0]])**2) + ob_norm return class PositionCorrectionKernel(BaseKernel): diff --git a/ptypy/accelerate/cuda_pycuda/__init__.py b/ptypy/accelerate/cuda_pycuda/__init__.py index d78f1cb80..55833de3e 100644 --- a/ptypy/accelerate/cuda_pycuda/__init__.py +++ b/ptypy/accelerate/cuda_pycuda/__init__.py @@ -40,10 +40,13 @@ def get_context(new_context=False, new_queue=False): def load_kernel(name, subs={}, file=None): if file is None: - fn = "%s/cuda/%s.cu" % (os.path.dirname(__file__), name) + if isinstance(name, str): + fn = "%s/cuda/%s.cu" % (os.path.dirname(__file__), name) + else: + raise ValueError("name parameter must be a string if not filename is given") else: fn = "%s/cuda/%s" % (os.path.dirname(__file__), file) - + with open(fn, 'r') as f: kernel = f.read() for k,v in list(subs.items()): @@ -52,5 +55,9 @@ def load_kernel(name, subs={}, file=None): escaped = fn.replace("\\", "\\\\") kernel = '#line 1 "{}"\n'.format(escaped) + kernel mod = SourceModule(kernel, include_dirs=[np.get_include()], no_extern_c=True, options=debug_options) - return mod.get_function(name) + + if isinstance(name, str): + return mod.get_function(name) + else: # tuple + return tuple(mod.get_function(n) for n in name) diff --git a/ptypy/accelerate/cuda_pycuda/array_utils.py b/ptypy/accelerate/cuda_pycuda/array_utils.py index 14dd05532..3378a0262 100644 --- a/ptypy/accelerate/cuda_pycuda/array_utils.py +++ b/ptypy/accelerate/cuda_pycuda/array_utils.py @@ -3,6 +3,24 @@ from ptypy.utils import gaussian import numpy as np +# maps a numpy dtype to the corresponding C type +def map2ctype(dt): + if dt == np.float32: + return 'float' + elif dt == np.float64: + return 'double' + elif dt == np.complex64: + return 'complex' + elif dt == np.complex128: + return 'complex' + elif dt == np.int32: + return 'int' + elif dt == np.int64: + return 'long long' + else: + raise ValueError('No mapping for {}'.format(dt)) + + class ArrayUtilsKernel: def __init__(self, acc_dtype=np.float64, queue=None): self.queue = queue @@ -90,7 +108,51 @@ def transpose(self, input, 
output): self.transpose_cuda(input, output, np.int32(width), np.int32(height), block=blk, grid=grd, stream=self.queue) +class MaxAbs2Kernel: + def __init__(self, queue=None): + self.queue = queue + # we lazy-load this depending on the data types we get + self.max_abs2_cuda = {} + + def max_abs2(self, X, out): + """ Calculate max(abs(x)**2) across the final 2 dimensions""" + # lazy-loading, keeping scratch memory and both kernels in the same dictionary + bx = int(64) + version = '{},{}'.format(map2ctype(X.dtype), map2ctype(out.dtype)) + if version not in self.max_abs2_cuda: + step1, step2 = load_kernel( + ("max_abs2_step1", "max_abs2_step2"), + { + 'IN_TYPE': map2ctype(X.dtype), + 'OUT_TYPE': map2ctype(out.dtype), + 'BDIM_X': bx, + }, "max_abs2.cu") + self.max_abs2_cuda[version] = { + 'step1': step1, + 'step2': step2, + 'scratchmem': None + } + + rows = np.int32(X.shape[-2]) + cols = np.int32(X.shape[-1]) + firstdims = np.int32(np.prod(X.shape[:-2])) + gy = int(rows) + + if self.max_abs2_cuda[version]['scratchmem'] is None \ + or self.max_abs2_cuda[version]['scratchmem'].shape[0] != gy: + self.max_abs2_cuda[version]['scratchmem'] = gpuarray.empty((gy,), dtype=out.dtype) + scratch = self.max_abs2_cuda[version]['scratchmem'] + + + self.max_abs2_cuda[version]['step1'](X, firstdims, rows, cols, scratch, + block=(bx, 1, 1), grid=(1, gy, 1), + stream=self.queue) + self.max_abs2_cuda[version]['step2'](scratch, np.int32(gy), out, + block=(bx, 1, 1), grid=(1, 1, 1), + stream=self.queue + ) + class CropPadKernel: @@ -135,28 +197,11 @@ def fill3D(self, A, B, offset=[0, 0, 0]): batch = int(np.prod(A.shape[:-3])) # lazy loading depending on data type - - def map_type(dt): - if dt == np.float32: - return 'float' - elif dt == np.float64: - return 'double' - elif dt == np.complex64: - return 'complex' - elif dt == np.complex128: - return 'complex' - elif dt == np.int32: - return 'int' - elif dt == np.int64: - return 'long long' - else: - raise ValueError('No mapping for {}'.format(dt)) - - version = '{},{}'.format(map_type(B.dtype), map_type(A.dtype)) + version = '{},{}'.format(map2ctype(B.dtype), map2ctype(A.dtype)) if version not in self.fill3D_cuda: self.fill3D_cuda[version] = load_kernel("fill3D", { - 'IN_TYPE': map_type(B.dtype), - 'OUT_TYPE': map_type(A.dtype) + 'IN_TYPE': map2ctype(B.dtype), + 'OUT_TYPE': map2ctype(A.dtype) }) bx = by = 32 self.fill3D_cuda[version]( @@ -209,34 +254,27 @@ def __init__(self, dtype, queue=None): self.last_axis_block = (256, 4, 1) self.mid_axis_block = (256, 4, 1) - self.delxf_last = load_kernel("delx_last", file="delx_last.cu", subs={ - 'IS_FORWARD': 'true', - 'BDIM_X': str(self.last_axis_block[0]), - 'BDIM_Y': str(self.last_axis_block[1]), - 'IN_TYPE': stype, - 'OUT_TYPE': stype - }) - self.delxb_last = load_kernel("delx_last", file="delx_last.cu", subs={ - 'IS_FORWARD': 'false', - 'BDIM_X': str(self.last_axis_block[0]), - 'BDIM_Y': str(self.last_axis_block[1]), - 'IN_TYPE': stype, - 'OUT_TYPE': stype - }) - self.delxf_mid = load_kernel("delx_mid", file="delx_mid.cu", subs={ - 'IS_FORWARD': 'true', - 'BDIM_X': str(self.mid_axis_block[0]), - 'BDIM_Y': str(self.mid_axis_block[1]), - 'IN_TYPE': stype, - 'OUT_TYPE': stype - }) - self.delxb_mid = load_kernel("delx_mid", file="delx_mid.cu", subs={ - 'IS_FORWARD': 'false', - 'BDIM_X': str(self.mid_axis_block[0]), - 'BDIM_Y': str(self.mid_axis_block[1]), - 'IN_TYPE': stype, - 'OUT_TYPE': stype - }) + self.delxf_last, self.delxf_mid = load_kernel( + ("delx_last", "delx_mid"), + file="delx.cu", + subs={ + 'IS_FORWARD': 'true', 
+ 'BDIM_X': str(self.last_axis_block[0]), + 'BDIM_Y': str(self.last_axis_block[1]), + 'IN_TYPE': stype, + 'OUT_TYPE': stype + }) + self.delxb_last, self.delxb_mid = load_kernel( + ("delx_last", "delx_mid"), + file="delx.cu", + subs={ + 'IS_FORWARD': 'false', + 'BDIM_X': str(self.last_axis_block[0]), + 'BDIM_Y': str(self.last_axis_block[1]), + 'IN_TYPE': stype, + 'OUT_TYPE': stype + }) + def delxf(self, input, out, axis=-1): if input.dtype != self.dtype: diff --git a/ptypy/accelerate/cuda_pycuda/cuda/build_aux.cu b/ptypy/accelerate/cuda_pycuda/cuda/build_aux.cu index bb0e68838..e9ceeb80c 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/build_aux.cu +++ b/ptypy/accelerate/cuda_pycuda/cuda/build_aux.cu @@ -3,12 +3,22 @@ * Data types: * - IN_TYPE: the data type for the inputs (float or double) * - OUT_TYPE: the data type for the outputs (float or double - for aux wave) - * - MATH_TYPE: the data type used for computation + * - MATH_TYPE: the data type used for computation */ #include using thrust::complex; +// core calculation function - used by both kernels and inlined +inline __device__ complex calculate( + const complex& t_obj, + const complex& t_probe, + const complex& t_ex, + MATH_TYPE alpha) +{ + return t_obj * t_probe * (MATH_TYPE(1) + alpha) - t_ex * alpha; +} + extern "C" __global__ void build_aux( complex* auxiliary_wave, const complex* __restrict__ exit_wave, @@ -27,7 +37,7 @@ extern "C" __global__ void build_aux( int tx = threadIdx.x; int ty = threadIdx.y; int addr_stride = 15; - const MATH_TYPE alpha = alpha_; // type conversion + const MATH_TYPE alpha = alpha_; // type conversion const int* oa = addr + 3 + bid * addr_stride; const int* pa = addr + bid * addr_stride; @@ -44,14 +54,46 @@ extern "C" __global__ void build_aux( // (it will work for less as well) for (int c = tx; c < C; c += blockDim.x) { - // temporaries to convert to MATH_TYPE in case it's different to storage type - complex t_obj = obj[b * I + c]; - complex t_probe = probe[b * F + c]; - complex t_ex = exit_wave[b * C + c]; - - auxiliary_wave[b * C + c] = - t_obj * t_probe * (MATH_TYPE(1) + alpha) - - t_ex * alpha; + auxiliary_wave[b * C + c] = calculate( + obj[b * I + c], probe[b * F + c], exit_wave[b * C + c], alpha); } } } + +extern "C" __global__ void build_aux2( + complex* auxiliary_wave, + const complex* __restrict__ exit_wave, + int B, + int C, + const complex* __restrict__ probe, + int E, + int F, + const complex* __restrict__ obj, + int H, + int I, + const int* __restrict__ addr, + IN_TYPE alpha_) +{ + int bid = blockIdx.z; + int tx = threadIdx.x; + int b = threadIdx.y + blockIdx.y * blockDim.y; + if (b >= B) + return; + int addr_stride = 15; + const MATH_TYPE alpha = alpha_; // type conversion + + const int* oa = addr + 3 + bid * addr_stride; + const int* pa = addr + bid * addr_stride; + const int* ea = addr + 6 + bid * addr_stride; + + probe += pa[0] * E * F + pa[1] * F + pa[2]; + obj += oa[0] * H * I + oa[1] * I + oa[2]; + exit_wave += ea[0] * B * C; + auxiliary_wave += ea[0] * B * C; + + for (int c = tx; c < C; c += blockDim.x) + { + auxiliary_wave[b * C + c] = calculate( + obj[b * I + c], probe[b * F + c], exit_wave[b * C + c], alpha); + } +} diff --git a/ptypy/accelerate/cuda_pycuda/cuda/build_aux_no_ex.cu b/ptypy/accelerate/cuda_pycuda/cuda/build_aux_no_ex.cu index b19ad8d70..ee091c58e 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/build_aux_no_ex.cu +++ b/ptypy/accelerate/cuda_pycuda/cuda/build_aux_no_ex.cu @@ -54,4 +54,50 @@ extern "C" __global__ void build_aux_no_ex(complex* auxilliary_wave, } } } +} 
+ +extern "C" __global__ void build_aux2_no_ex(complex* auxilliary_wave, + int aRows, + int aCols, + const complex* __restrict__ probe, + int pRows, + int pCols, + const complex* __restrict__ obj, + int oRows, + int oCols, + const int* __restrict__ addr, + IN_TYPE fac_, + int doAdd) +{ + int bid = blockIdx.z; + int tx = threadIdx.x; + int b = threadIdx.y + blockIdx.y * blockDim.y; + if (b >= aRows) + return; + const int addr_stride = 15; + const MATH_TYPE fac = fac_; // type conversion + + const int* oa = addr + 3 + bid * addr_stride; + const int* pa = addr + bid * addr_stride; + const int* ea = addr + 6 + bid * addr_stride; + + obj += oa[0] * oRows * oCols + oa[1] * oCols + oa[2]; + probe += pa[0] * pRows * pCols + pa[1] * pCols + pa[2]; + auxilliary_wave += ea[0] * aRows * aCols; + + for (int c = tx; c < aCols; c += blockDim.x) + { + complex t_obj = obj[b * oCols + c]; + complex t_probe = probe[b * pCols + c]; + auto tmp = t_obj * t_probe * fac; + if (doAdd) + { + auxilliary_wave[b * aCols + c] += tmp; + } + else + { + auxilliary_wave[b * aCols + c] = tmp; + } + } + } \ No newline at end of file diff --git a/ptypy/accelerate/cuda_pycuda/cuda/build_exit_alpha_tau.cu b/ptypy/accelerate/cuda_pycuda/cuda/build_exit_alpha_tau.cu new file mode 100644 index 000000000..8528f2e9c --- /dev/null +++ b/ptypy/accelerate/cuda_pycuda/cuda/build_exit_alpha_tau.cu @@ -0,0 +1,60 @@ +/** build_exit_alpha_tau kernel. + * + * Data types: + * - IN_TYPE: the data type for the inputs (float or double) + * - OUT_TYPE: the data type for the outputs (float or double - for aux wave) + * - MATH_TYPE: the data type used for computation + */ + + +#include +using thrust::complex; + + +extern "C" __global__ void build_exit_alpha_tau( + complex* auxiliary_wave, + complex* exit_wave, + int B, + int C, + const complex* __restrict__ probe, + int E, + int F, + const complex* __restrict__ obj, + int H, + int I, + const int* __restrict__ addr, + IN_TYPE alpha_, + IN_TYPE tau_) +{ + int bid = blockIdx.z; + int tx = threadIdx.x; + const int b = threadIdx.y + blockIdx.y * blockDim.y; + if (b >= B) + return; + const int addr_stride = 15; + MATH_TYPE alpha = alpha_; + MATH_TYPE tau = tau_; + + const int* oa = addr + 3 + bid * addr_stride; + const int* pa = addr + bid * addr_stride; + const int* ea = addr + 6 + bid * addr_stride; + + probe += pa[0] * E * F + pa[1] * F + pa[2]; + obj += oa[0] * H * I + oa[1] * I + oa[2]; + exit_wave += ea[0] * B * C; + auxiliary_wave += ea[0] * B * C; + + for (int c = tx; c < C; c += blockDim.x) + { + complex t_aux = auxiliary_wave[b * C + c]; + complex t_probe = probe[b * F + c]; + complex t_obj = obj[b * I + c]; + complex t_ex = exit_wave[b * C + c]; + + auto dex = tau * t_aux + (tau * alpha - MATH_TYPE(1)) * t_ex + + (MATH_TYPE(1) - tau * (MATH_TYPE(1) + alpha)) * t_obj * t_probe; + + exit_wave[b * C + c] += dex; + auxiliary_wave[b * C + c] = dex; + } +} diff --git a/ptypy/accelerate/cuda_pycuda/cuda/delx_mid.cu b/ptypy/accelerate/cuda_pycuda/cuda/delx.cu similarity index 64% rename from ptypy/accelerate/cuda_pycuda/cuda/delx_mid.cu rename to ptypy/accelerate/cuda_pycuda/cuda/delx.cu index 15a17f544..f2e8a934e 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/delx_mid.cu +++ b/ptypy/accelerate/cuda_pycuda/cuda/delx.cu @@ -1,4 +1,4 @@ -/** difference along any axis +/** difference along axes (last and mid axis kernels) * * Data types: * - IN_TYPE: the data type for the inputs @@ -8,6 +8,7 @@ #include using thrust::complex; + /** Finite difference for forward/backward for any axis that is not the * 
last one, assuring that the reads and writes are coalesced. * @@ -125,3 +126,85 @@ extern "C" __global__ void delx_mid(const IN_TYPE *__restrict__ input, } } } + + + +/** This is the special case for when we diff along the last axis. + * + * Here, flat_dim is all other dims multiplied together, and axis_dim + * is the dimension along which we diff. + * To ensure that we stay coalesced (compared to delx_mid), + * we use the x index to iterate within each thread block (the loop). + * Otherwise it follows the same ideas as delx_mid - please read the + * description there. + */ +extern "C" __global__ void delx_last(const IN_TYPE *__restrict__ input, + OUT_TYPE *output, + int flat_dim, + int axis_dim) +{ + // reinterpret to avoid constructor of complex() + compiler warning + __shared__ char shr[BDIM_X * BDIM_Y * sizeof(IN_TYPE)]; + auto shared_data = reinterpret_cast(shr); + + unsigned int tx = threadIdx.x; + unsigned int ty = threadIdx.y; + + unsigned int ix = tx; + unsigned int iy = ty + blockIdx.x * BDIM_Y; // we always use x in grid + + int stride_y = axis_dim; + + auto maxblocks = (axis_dim + BDIM_X - 1) / BDIM_X; + for (int bidx = 0; bidx < maxblocks; ++bidx) + { + ix = tx + bidx * BDIM_X; + + if (iy < flat_dim && ix < axis_dim) + { + shared_data[ty * BDIM_X + tx] = input[iy * stride_y + ix]; + } + + __syncthreads(); + + if (iy < flat_dim && ix < axis_dim) + { + if (IS_FORWARD) + { + IN_TYPE plus1; + if (tx < BDIM_X - 1 && + ix < axis_dim - 1) // we have a next element in shared data + { + plus1 = shared_data[ty * BDIM_X + tx + 1]; + } + else if (ix == axis_dim - 1) // end of axis - same as current to get 0 + { + plus1 = shared_data[ty * BDIM_X + tx]; + } + else // end of block, but nore input is there + { + plus1 = input[iy * stride_y + ix + 1]; + } + + output[iy * stride_y + ix] = plus1 - shared_data[ty * BDIM_X + tx]; + } + else + { + IN_TYPE minus1; + if (tx > 0) // we have a previous element in shared + { + minus1 = shared_data[ty * BDIM_X + tx - 1]; + } + else if (ix == 0) // use same as next to get zero + { + minus1 = shared_data[ty * BDIM_X + tx]; + } + else // read previous input (ty == 0 but iy > 0) + { + minus1 = input[iy * stride_y + ix - 1]; + } + output[iy * stride_y + ix] = shared_data[ty * BDIM_X + tx] - minus1; + } + } + } +} \ No newline at end of file diff --git a/ptypy/accelerate/cuda_pycuda/cuda/delx_last.cu b/ptypy/accelerate/cuda_pycuda/cuda/delx_last.cu deleted file mode 100644 index a302790f7..000000000 --- a/ptypy/accelerate/cuda_pycuda/cuda/delx_last.cu +++ /dev/null @@ -1,89 +0,0 @@ -/** difference along last axis - * - * Data types: - * - IN_TYPE: the data type for the inputs - * - OUT_TYPE: the data type for the outputs - */ - -#include -using thrust::complex; - -/** This is the special case for when we diff along the last axis. - * - * Here, flat_dim is all other dims multiplied together, and axis_dim - * is the dimension along which we diff. - * To ensure that we stay coalesced (compared to delx_mid), - * we use the x index to iterate within each thread block (the loop). - * Otherwise it follows the same ideas as delx_mid - please read the - * description there. 
- */ -extern "C" __global__ void delx_last(const IN_TYPE *__restrict__ input, - OUT_TYPE *output, - int flat_dim, - int axis_dim) -{ - // reinterpret to avoid constructor of complex() + compiler warning - __shared__ char shr[BDIM_X * BDIM_Y * sizeof(IN_TYPE)]; - auto shared_data = reinterpret_cast(shr); - - unsigned int tx = threadIdx.x; - unsigned int ty = threadIdx.y; - - unsigned int ix = tx; - unsigned int iy = ty + blockIdx.x * BDIM_Y; // we always use x in grid - - int stride_y = axis_dim; - - auto maxblocks = (axis_dim + BDIM_X - 1) / BDIM_X; - for (int bidx = 0; bidx < maxblocks; ++bidx) - { - ix = tx + bidx * BDIM_X; - - if (iy < flat_dim && ix < axis_dim) - { - shared_data[ty * BDIM_X + tx] = input[iy * stride_y + ix]; - } - - __syncthreads(); - - if (iy < flat_dim && ix < axis_dim) - { - if (IS_FORWARD) - { - IN_TYPE plus1; - if (tx < BDIM_X - 1 && - ix < axis_dim - 1) // we have a next element in shared data - { - plus1 = shared_data[ty * BDIM_X + tx + 1]; - } - else if (ix == axis_dim - 1) // end of axis - same as current to get 0 - { - plus1 = shared_data[ty * BDIM_X + tx]; - } - else // end of block, but nore input is there - { - plus1 = input[iy * stride_y + ix + 1]; - } - - output[iy * stride_y + ix] = plus1 - shared_data[ty * BDIM_X + tx]; - } - else - { - IN_TYPE minus1; - if (tx > 0) // we have a previous element in shared - { - minus1 = shared_data[ty * BDIM_X + tx - 1]; - } - else if (ix == 0) // use same as next to get zero - { - minus1 = shared_data[ty * BDIM_X + tx]; - } - else // read previous input (ty == 0 but iy > 0) - { - minus1 = input[iy * stride_y + ix - 1]; - } - output[iy * stride_y + ix] = shared_data[ty * BDIM_X + tx] - minus1; - } - } - } -} \ No newline at end of file diff --git a/ptypy/accelerate/cuda_pycuda/cuda/fill_b.cu b/ptypy/accelerate/cuda_pycuda/cuda/fill_b.cu index 9c6c7e1de..46d0d09f1 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/fill_b.cu +++ b/ptypy/accelerate/cuda_pycuda/cuda/fill_b.cu @@ -1,4 +1,4 @@ -/** fill_b kernel. +/** fill_b kernels. 
* Data types: * - IN_TYPE: the data type for the inputs * - OUT_TYPE: the data type for the outputs @@ -12,7 +12,7 @@ extern "C" __global__ void fill_b(const IN_TYPE* A0, const IN_TYPE* w, IN_TYPE Brenorm, int size, - OUT_TYPE* out) + ACC_TYPE* out) { int tx = threadIdx.x; int ix = tx + blockIdx.x * blockDim.x; @@ -59,4 +59,47 @@ extern "C" __global__ void fill_b(const IN_TYPE* A0, out[blockIdx.x * 3 + 1] = MATH_TYPE(smem[1][0]) * MATH_TYPE(Brenorm); out[blockIdx.x * 3 + 2] = MATH_TYPE(smem[2][0]) * MATH_TYPE(Brenorm); } -} \ No newline at end of file +} + +extern "C" __global__ void fill_b_reduce(const ACC_TYPE* in, OUT_TYPE* B, int blocks) +{ + // always a single thread block for 2nd stage + assert(gridDim.x == 1); + int tx = threadIdx.x; + + __shared__ ACC_TYPE smem[3][BDIM_X]; + + double sum0 = 0.0, sum1 = 0.0, sum2 = 0.0; + for (int ix = tx; ix < blocks; ix += blockDim.x) + { + sum0 += in[ix * 3 + 0]; + sum1 += in[ix * 3 + 1]; + sum2 += in[ix * 3 + 2]; + } + smem[0][tx] = sum0; + smem[1][tx] = sum1; + smem[2][tx] = sum2; + __syncthreads(); + + int nt = blockDim.x; + int c = nt; + while (c > 1) + { + int half = c / 2; + if (tx < half) + { + smem[0][tx] += smem[0][c - tx - 1]; + smem[1][tx] += smem[1][c - tx - 1]; + smem[2][tx] += smem[2][c - tx - 1]; + } + __syncthreads(); + c = c - half; + } + + if (tx == 0) + { + B[0] += OUT_TYPE(smem[0][0]); + B[1] += OUT_TYPE(smem[1][0]); + B[2] += OUT_TYPE(smem[2][0]); + } +} diff --git a/ptypy/accelerate/cuda_pycuda/cuda/fill_b_reduce.cu b/ptypy/accelerate/cuda_pycuda/cuda/fill_b_reduce.cu deleted file mode 100644 index b590e39e4..000000000 --- a/ptypy/accelerate/cuda_pycuda/cuda/fill_b_reduce.cu +++ /dev/null @@ -1,53 +0,0 @@ -/** fill_b_reduce - for second-stage reduction used after fill_b. - * - * Note that the IN_TYPE here must match what's produced by the fill_b kernel - * Data types: - * - IN_TYPE: the data type for the inputs - * - OUT_TYPE: the data type for the outputs - * - ACC_TYPE: the accumulator type for summing - */ - -#include - -extern "C" __global__ void fill_b_reduce(const IN_TYPE* in, OUT_TYPE* B, int blocks) -{ - // always a single thread block for 2nd stage - assert(gridDim.x == 1); - int tx = threadIdx.x; - - __shared__ ACC_TYPE smem[3][BDIM_X]; - - double sum0 = 0.0, sum1 = 0.0, sum2 = 0.0; - for (int ix = tx; ix < blocks; ix += blockDim.x) - { - sum0 += in[ix * 3 + 0]; - sum1 += in[ix * 3 + 1]; - sum2 += in[ix * 3 + 2]; - } - smem[0][tx] = sum0; - smem[1][tx] = sum1; - smem[2][tx] = sum2; - __syncthreads(); - - int nt = blockDim.x; - int c = nt; - while (c > 1) - { - int half = c / 2; - if (tx < half) - { - smem[0][tx] += smem[0][c - tx - 1]; - smem[1][tx] += smem[1][c - tx - 1]; - smem[2][tx] += smem[2][c - tx - 1]; - } - __syncthreads(); - c = c - half; - } - - if (tx == 0) - { - B[0] += OUT_TYPE(smem[0][0]); - B[1] += OUT_TYPE(smem[1][0]); - B[2] += OUT_TYPE(smem[2][0]); - } -} diff --git a/ptypy/accelerate/cuda_pycuda/cuda/fmag_update_nopbound.cu b/ptypy/accelerate/cuda_pycuda/cuda/fmag_update_nopbound.cu new file mode 100644 index 000000000..40a65c172 --- /dev/null +++ b/ptypy/accelerate/cuda_pycuda/cuda/fmag_update_nopbound.cu @@ -0,0 +1,53 @@ +/** fmag_all_update_nopbound. 
+ * + * Data types: + * - IN_TYPE: the data type for the inputs (float or double) + * - OUT_TYPE: the data type for the outputs (float or double) + * - MATH_TYPE: the data type used for computation + */ + +#include +#include +using std::sqrt; +using thrust::complex; + +extern "C" __global__ void fmag_update_nopbound(complex* f, + const IN_TYPE* fmask, + const IN_TYPE* fmag, + const IN_TYPE* fdev, + const int* addr_info, + int A, + int B) +{ + const int bid = blockIdx.z; + const int tx = threadIdx.x; + const int a = threadIdx.y + blockIdx.y * blockDim.y; + if (a >= A) + return; + int addr_stride = 15; + + const int* ea = addr_info + bid * addr_stride + 6; + const int* da = addr_info + bid * addr_stride + 9; + const int* ma = addr_info + bid * addr_stride + 12; + + fmask += ma[0] * A * B; + fdev += da[0] * A * B; + fmag += da[0] * A * B; + f += ea[0] * A * B; + + for (int b = tx; b < B; b += blockDim.x) + { + MATH_TYPE m = fmask[a * A + b]; + /* + // assuming this is actually a mask, i.e. 0 or 1 --> this is slower + float fm = m < 0.5f ? 1.0f : + ((fmag[a * A + b] + fdev[a * A + b] * renorm) / (fdev[a * A + b] + + fmag[a * A + b] + 1e-7f)) ; + */ + MATH_TYPE fmagv = fmag[a * A + b]; + MATH_TYPE fdevv = fdev[a * A + b]; + MATH_TYPE fm = + (MATH_TYPE(1) - m) + m * (fmagv / (fmagv + fdevv + MATH_TYPE(1e-7))); + f[a * A + b] *= fm; + } +} diff --git a/ptypy/accelerate/cuda_pycuda/cuda/fourier_deviation.cu b/ptypy/accelerate/cuda_pycuda/cuda/fourier_deviation.cu new file mode 100644 index 000000000..3427222c3 --- /dev/null +++ b/ptypy/accelerate/cuda_pycuda/cuda/fourier_deviation.cu @@ -0,0 +1,58 @@ +/** fourier_deviation. + * + * Data types: + * - IN_TYPE: the data type for the inputs (float or double) + * - OUT_TYPE: the data type for the outputs (float or double) + * - MATH_TYPE: the data type used for computation + */ + +#include +#include +#include +using std::sqrt; +using thrust::abs; +using thrust::complex; + +// specify max number of threads/block and min number of blocks per SM, +// to assist the compiler in register optimisations. 
+// We achieve a higher occupancy in this case, as less registers are used +// (guided by profiler) +extern "C" __global__ void __launch_bounds__(1024, 2) + fourier_deviation(int nmodes, + const complex *f, + const IN_TYPE *fmag, + OUT_TYPE *fdev, + const int *addr, + int A, + int B) +{ + const int bid = blockIdx.z; + const int tx = threadIdx.x; + const int a = threadIdx.y + blockIdx.y * blockDim.y; + const int addr_stride = 15; + + const int *ea = addr + 6 + (bid * nmodes) * addr_stride; + const int *da = addr + 9 + (bid * nmodes) * addr_stride; + + f += ea[0] * A * B; + fdev += da[0] * A * B; + fmag += da[0] * A * B; + + if (a >= A) + return; + + for (int b = tx; b < B; b += blockDim.x) + { + MATH_TYPE acc = MATH_TYPE(0); + for (int idx = 0; idx < nmodes; ++idx) + { + complex t_f = f[a * B + b + idx * A * B]; + MATH_TYPE abs_exit_wave = abs(t_f); + acc += abs_exit_wave * + abs_exit_wave; // if we do this manually (real*real +imag*imag) + // we get differences to numpy due to rounding + } + auto fdevv = sqrt(acc) - MATH_TYPE(fmag[a * B + b]); + fdev[a * B + b] = fdevv; + } +} diff --git a/ptypy/accelerate/cuda_pycuda/cuda/log_likelihood.cu b/ptypy/accelerate/cuda_pycuda/cuda/log_likelihood.cu index 684099150..90455b1e2 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/log_likelihood.cu +++ b/ptypy/accelerate/cuda_pycuda/cuda/log_likelihood.cu @@ -60,3 +60,48 @@ extern "C" __global__ void __launch_bounds__(1024, 2) } } } + + +extern "C" __global__ void + log_likelihood2(int nmodes, + complex *aux, + const IN_TYPE *fmask, + const IN_TYPE *fmag, + const int *addr, + IN_TYPE *llerr, + int A, + int B) +{ + int bid = blockIdx.z; + int tx = threadIdx.x; + int a = threadIdx.y + blockIdx.y * blockDim.y; + if (a >= A) + return; + int addr_stride = 15; + + const int *ea = addr + 6 + (bid * nmodes) * addr_stride; + const int *da = addr + 9 + (bid * nmodes) * addr_stride; + const int *ma = addr + 12 + (bid * nmodes) * addr_stride; + + aux += ea[0] * A * B; + fmag += da[0] * A * B; + fmask += ma[0] * A * B; + llerr += da[0] * A * B; + MATH_TYPE norm = A * B; + + for (int b = tx; b < B; b += blockDim.x) + { + MATH_TYPE acc = 0.0; + for (int idx = 0; idx < nmodes; ++idx) + { + complex t_aux = aux[a * B + b + idx * A * B]; + MATH_TYPE abs_exit_wave = abs(t_aux); + acc += abs_exit_wave * + abs_exit_wave; // if we do this manually (real*real +imag*imag) + // we get differences to numpy due to rounding + } + auto I = MATH_TYPE(fmag[a * B + b]) * MATH_TYPE(fmag[a * B + b]); + llerr[a * B + b] = + MATH_TYPE(fmask[a * B + b]) * (acc - I) * (acc - I) / (I + 1) / norm; + } +} \ No newline at end of file diff --git a/ptypy/accelerate/cuda_pycuda/cuda/max_abs2.cu b/ptypy/accelerate/cuda_pycuda/cuda/max_abs2.cu new file mode 100644 index 000000000..4da8efb3e --- /dev/null +++ b/ptypy/accelerate/cuda_pycuda/cuda/max_abs2.cu @@ -0,0 +1,115 @@ +/** max_abs2 kernel, calculating the sum of abs(x)**2 value in the first dimension + * and then the maximum across the last 2 dimensions + * + * Data types: + * - IN_TYPE: can be float/double or complex/complex + */ + +#include +#include +using thrust::complex; +using thrust::norm; + +inline __device__ OUT_TYPE norm(const float& in) { + return in*in; +} + +inline __device__ OUT_TYPE norm(const double& in) { + return in*in; +} + +extern "C" __global__ void max_abs2_step1(const IN_TYPE* a, + int n, + int rows, + int cols, + OUT_TYPE* out) +{ + int tx = threadIdx.x; + const int iy = blockIdx.y; + + __shared__ OUT_TYPE sh[BDIM_X]; + + OUT_TYPE maxv = OUT_TYPE(0); + + for (int ix = tx; 
ix < cols; ix += BDIM_X) { + OUT_TYPE v = OUT_TYPE(0); + for (int in = 0; in < n; ++in) { + v += norm(a[in * rows * cols + iy * cols + ix]); + } + if (v > maxv) + maxv = v; + } + + + sh[tx] = maxv; + + __syncthreads(); + + // reduce: + const int nt = BDIM_X; + int c = nt; + + while (c > 1) + { + int half = c / 2; + if (tx < half) + { + auto v = sh[c - tx - 1]; + if (maxv < v) { + sh[tx] = v; + maxv = v; + } + } + __syncthreads(); + c = c - half; + } + + if (tx == 0) + { + out[iy] = sh[0]; + } +} + +extern "C" __global__ void max_abs2_step2(const OUT_TYPE* in, + int n, + OUT_TYPE* out) +{ + int tx = threadIdx.x; + + in += blockIdx.x * n; + + __shared__ OUT_TYPE sh[BDIM_X]; + + OUT_TYPE maxv = OUT_TYPE(0); + for (int i = tx; i < n; ++i) { + auto v = in[i]; + if (v > maxv) + maxv = v; + } + sh[tx] = maxv; + __syncthreads(); + + // reduce: + const int nt = BDIM_X; + int c = nt; + + while (c > 1) + { + int half = c / 2; + if (tx < half) + { + auto v = sh[c - tx - 1]; + if (maxv < v) { + sh[tx] = v; + maxv = v; + } + } + __syncthreads(); + c = c - half; + } + + if (tx == 0) + { + out[0] = sh[0]; + } +} diff --git a/ptypy/accelerate/cuda_pycuda/cuda/ob_update_local.cu b/ptypy/accelerate/cuda_pycuda/cuda/ob_update_local.cu new file mode 100644 index 000000000..c49119be2 --- /dev/null +++ b/ptypy/accelerate/cuda_pycuda/cuda/ob_update_local.cu @@ -0,0 +1,67 @@ +/** ob_update_local - in DR algorithm. + * + * Data types: + * - IN_TYPE: the data type for the inputs (float or double) + * - OUT_TYPE: the data type for the outputs (float or double) + * - MATH_TYPE: the data type used for computation + */ + +#include +using thrust::complex; + +template +__device__ inline void atomicAdd(complex* x, const complex& y) +{ + auto xf = reinterpret_cast(x); + atomicAdd(xf, y.real()); + atomicAdd(xf + 1, y.imag()); +} + +extern "C" __global__ void ob_update_local( + const complex* __restrict__ exit_wave, + const complex* __restrict__ aux, + int A, + int B, + int C, + const complex* __restrict__ probe, + int D, + int E, + int F, + const IN_TYPE* __restrict__ pr_norm, + complex* obj, + int G, + int H, + int I, + const int* __restrict__ addr) +{ + const int bid = blockIdx.z; + const int tx = threadIdx.x; + const int b = threadIdx.y + blockIdx.y * blockDim.y; + if (b >= B) + return; + const int addr_stride = 15; + + const int* oa = addr + 3 + bid * addr_stride; + const int* pa = addr + bid * addr_stride; + const int* ea = addr + 6 + bid * addr_stride; + + probe += pa[0] * E * F + pa[1] * F + pa[2]; + obj += oa[0] * H * I + oa[1] * I + oa[2]; + aux += bid * B * C; + MATH_TYPE norm_val = pr_norm[0]; + + assert(oa[0] * H * I + oa[1] * I + oa[2] + (B - 1) * I + C - 1 < G * H * I); + + exit_wave += ea[0] * B * C; + + for (int c = tx; c < C; c += blockDim.x) + { + complex probe_val = probe[b * F + c]; + complex exit_val = exit_wave[b * C + c]; + complex aux_val = aux[b * C + c]; + + auto add_val_m = conj(probe_val) * (exit_val - aux_val) / norm_val; + complex add_val = add_val_m; + atomicAdd(&obj[b * I + c], add_val); + } +} diff --git a/ptypy/accelerate/cuda_pycuda/cuda/pr_update_local.cu b/ptypy/accelerate/cuda_pycuda/cuda/pr_update_local.cu new file mode 100644 index 000000000..ee81e1620 --- /dev/null +++ b/ptypy/accelerate/cuda_pycuda/cuda/pr_update_local.cu @@ -0,0 +1,71 @@ +/** pr_update_local - for DR algorithm. 
+ * + * Data types: + * - IN_TYPE: the data type for the inputs (float or double) + * - OUT_TYPE: the data type for the outputs (float or double) + * - MATH_TYPE: the data type used for computation + * - ACC_TYPE: data type used in norm calculation (input here) + */ + +#include +using thrust::complex; + +template +__device__ inline void atomicAdd(complex* x, const complex& y) +{ + auto xf = reinterpret_cast(x); + atomicAdd(xf, T(y.real())); + atomicAdd(xf + 1, T(y.imag())); +} + +extern "C" __global__ void pr_update_local( + const complex* __restrict__ exit_wave, + const complex* __restrict__ aux, + int A, + int B, + int C, + complex* probe, + int D, + int E, + int F, + const IN_TYPE* __restrict__ ob_norm, + const complex* __restrict__ obj, + int G, + int H, + int I, + const int* __restrict__ addr) +{ + assert(B == E); // prsh[1] + assert(C == F); // prsh[2] + const int bid = blockIdx.z; + const int tx = threadIdx.x; + const int b = threadIdx.y + blockIdx.y * blockDim.y; + if (b >= B) + return; + const int addr_stride = 15; + + const int* oa = addr + 3 + bid * addr_stride; + const int* pa = addr + bid * addr_stride; + const int* ea = addr + 6 + bid * addr_stride; + + probe += pa[0] * E * F + pa[1] * F + pa[2]; + obj += oa[0] * H * I + oa[1] * I + oa[2]; + aux += bid * B * C; + MATH_TYPE norm_val = ob_norm[0]; + + assert(oa[0] * H * I + oa[1] * I + oa[2] + (B - 1) * I + C - 1 < G * H * I); + + exit_wave += ea[0] * B * C; + + for (int c = tx; c < C; c += blockDim.x) + { + complex obj_val = obj[b * I + c]; + complex exit_val = exit_wave[b * C + c]; + complex aux_val = aux[b * C + c]; + + complex add_val_m = conj(obj_val) * (exit_val - aux_val) / norm_val; + complex add_val = add_val_m; + atomicAdd(&probe[b * F + c], add_val); + } + +} diff --git a/ptypy/accelerate/cuda_pycuda/engines/DM_pycuda.py b/ptypy/accelerate/cuda_pycuda/engines/DM_pycuda.py index 63503d608..cb489253a 100644 --- a/ptypy/accelerate/cuda_pycuda/engines/DM_pycuda.py +++ b/ptypy/accelerate/cuda_pycuda/engines/DM_pycuda.py @@ -29,10 +29,6 @@ __all__ = ['DM_pycuda'] -serialize_array_access = DM_serial.serialize_array_access -gaussian_kernel = DM_serial.gaussian_kernel - - @register() class DM_pycuda(DM_serial.DM_serial): @@ -74,19 +70,11 @@ def engine_initialize(self): self.context, self.queue = get_context(new_context=True, new_queue=True) # allocator for READ only buffers # self.const_allocator = cl.tools.ImmediateAllocator(queue, cl.mem_flags.READ_ONLY) - ## gaussian filter - # dummy kernel - # if not self.p.obj_smooth_std: - # gauss_kernel = gaussian_kernel(1, 1).astype(np.float32) - # else: - # gauss_kernel = gaussian_kernel(self.p.obj_smooth_std, self.p.obj_smooth_std).astype(np.float32) - # self.gauss_kernel_gpu = gpuarray.to_gpu(gauss_kernel) # Gaussian Smoothing Kernel self.GSK = GaussianSmoothingKernel(queue=self.queue) super(DM_pycuda, self).engine_initialize() - self.error = [] def _setup_kernels(self): """ @@ -153,7 +141,6 @@ def _setup_kernels(self): kern.PCK = PositionCorrectionKernel(aux, nmodes, queue_thread=self.queue) kern.PCK.allocate() kern.PCK.address_mangler = addr_mangler - #self.queue.synchronize() logger.info("Kernel setup completed") def engine_prepare(self): diff --git a/ptypy/accelerate/cuda_pycuda/engines/DR_pycuda.py b/ptypy/accelerate/cuda_pycuda/engines/DR_pycuda.py new file mode 100644 index 000000000..879411178 --- /dev/null +++ b/ptypy/accelerate/cuda_pycuda/engines/DR_pycuda.py @@ -0,0 +1,290 @@ +# -*- coding: utf-8 -*- +""" +Local Douglas-Rachford reconstruction engine. 
+ +This file is part of the PTYPY package. + + :copyright: Copyright 2014 by the PTYPY team, see AUTHORS. + :license: GPLv2, see LICENSE for details. +""" + +import numpy as np +import time +from pycuda import gpuarray +import pycuda.driver as cuda + +from ptypy import utils as u +from ptypy.utils.verbose import logger, log +from ptypy.utils import parallel +from ptypy.engines import register +from ptypy.accelerate.base.engines import DR_serial +from ptypy.accelerate.base import address_manglers +from .. import get_context +from ..kernels import FourierUpdateKernel, AuxiliaryWaveKernel, PoUpdateKernel, PositionCorrectionKernel, PropagationKernel +from ..array_utils import ArrayUtilsKernel, GaussianSmoothingKernel, TransposeKernel +from ..mem_utils import make_pagelocked_paired_arrays as mppa + +MPI = False + +# debugging +import sys + +__all__ = ['DR_pycuda'] + +@register() +class DR_pycuda(DR_serial.DR_serial): + + """ + Defaults: + + [fft_lib] + default = reikna + type = str + help = Choose the pycuda-compatible FFT module. + doc = One of: + - ``'reikna'`` : the reikna packaga (fast load, competitive compute for streaming) + - ``'cuda'`` : ptypy's cuda wrapper (delayed load, but fastest compute if all data is on GPU) + - ``'skcuda'`` : scikit-cuda (fast load, slowest compute due to additional store/load stages) + choices = 'reikna','cuda','skcuda' + userlevel = 2 + + """ + + def __init__(self, ptycho_parent, pars=None): + """ + Difference map reconstruction engine. + """ + super(DR_pycuda, self).__init__(ptycho_parent, pars) + + + def engine_initialize(self): + """ + Prepare for reconstruction. + """ + self.context, self.queue = get_context(new_context=True, new_queue=True) + + super(DR_pycuda, self).engine_initialize() + + def _setup_kernels(self): + """ + Setup kernels, one for each scan. Derive scans from ptycho class + """ + # get the scans + for label, scan in self.ptycho.model.scans.items(): + + kern = u.Param() + self.kernels[label] = kern + # TODO: needs to be adapted for broad bandwidth + geo = scan.geometries[0] + + # Get info to shape buffer arrays + # TODO: make this part of the engine rather than scan + fpc = self.ptycho.frames_per_block + + # Currently modes not implemented for DR algorithm + #assert scan.p.coherence.num_probe_modes == 1 + #assert scan.p.coherence.num_object_modes == 1 + try: + nmodes = scan.p.coherence.num_probe_modes * \ + scan.p.coherence.num_object_modes + except: + nmodes = 1 + + # create buffer arrays + fpc = 1 + ash = (fpc * nmodes,) + tuple(geo.shape) + aux = np.zeros(ash, dtype=np.complex64) + kern.aux = gpuarray.to_gpu(aux) + + # setup kernels, one for each SCAN. 
+ logger.info("Setting up FourierUpdateKernel") + kern.FUK = FourierUpdateKernel(aux, nmodes, queue_thread=self.queue) + kern.FUK.fshape = (1,) + kern.FUK.fshape[1:] + kern.FUK.allocate() + + logger.info("Setting up PoUpdateKernel") + kern.POK = PoUpdateKernel(queue_thread=self.queue) + kern.POK.allocate() + + logger.info("Setting up AuxiliaryWaveKernel") + kern.AWK = AuxiliaryWaveKernel(queue_thread=self.queue) + kern.AWK.allocate() + + logger.info("Setting up ArrayUtilsKernel") + kern.AUK = ArrayUtilsKernel(queue=self.queue) + + #logger.info("Setting up TransposeKernel") + #kern.TK = TransposeKernel(queue=self.queue) + + logger.info("Setting up PropagationKernel") + kern.PROP = PropagationKernel(aux, geo.propagator, self.queue, self.p.fft_lib) + kern.PROP.allocate() + kern.resolution = geo.resolution[0] + + # if self.do_position_refinement: + # logger.info("Setting up position correction") + # addr_mangler = address_manglers.RandomIntMangle(int(self.p.position_refinement.amplitude // geo.resolution[0]), + # self.p.position_refinement.start, + # self.p.position_refinement.stop, + # max_bound=int(self.p.position_refinement.max_shift // geo.resolution[0]), + # randomseed=0) + # logger.warning("amplitude is %s " % (self.p.position_refinement.amplitude // geo.resolution[0])) + # logger.warning("max bound is %s " % (self.p.position_refinement.max_shift // geo.resolution[0])) + + # kern.PCK = PositionCorrectionKernel(aux, nmodes, queue_thread=self.queue) + # kern.PCK.allocate() + # kern.PCK.address_mangler = addr_mangler + + logger.info("Kernel setup completed") + + + def engine_prepare(self): + + super(DR_pycuda, self).engine_prepare() + + for name, s in self.ob.S.items(): + s.gpu = gpuarray.to_gpu(s.data) + for name, s in self.pr.S.items(): + s.gpu, s.data = mppa(s.data) + + # TODO : like the serialization this one is needed due to object reformatting + for label, d in self.di.storages.items(): + prep = self.diff_info[d.ID] + prep.addr_gpu = gpuarray.to_gpu(prep.addr) + + for label, d in self.ptycho.new_data: + prep = self.diff_info[d.ID] + prep.ex = gpuarray.to_gpu(prep.ex) + prep.mag = gpuarray.to_gpu(prep.mag) + prep.ma = gpuarray.to_gpu(prep.ma) + prep.ma_sum = gpuarray.to_gpu(prep.ma_sum) + prep.err_fourier_gpu = gpuarray.to_gpu(prep.err_fourier) + prep.err_phot_gpu = gpuarray.to_gpu(prep.err_phot) + prep.err_exit_gpu = gpuarray.to_gpu(prep.err_exit) + # if self.do_position_refinement: + # prep.error_state_gpu = gpuarray.empty_like(prep.err_fourier_gpu) + + + def engine_iterate(self, num=1): + """ + Compute one iteration. 
+ """ + queue = self.queue + error = {} + for it in range(num): + + for dID in self.di.S.keys(): + + # find probe, object and exit ID in dependence of dID + prep = self.diff_info[dID] + pID, oID, eID = prep.poe_IDs + + # references for kernels + kern = self.kernels[prep.label] + FUK = kern.FUK + AWK = kern.AWK + POK = kern.POK + PROP = kern.PROP + + # get aux buffer + aux = kern.aux + + # local references + ob = self.ob.S[oID].gpu + pr = self.pr.S[pID].gpu + + # shuffle view order + vieworder = prep.vieworder + prep.rng.shuffle(vieworder) + + # Iterate through views + for i in vieworder: + + # Get local adress and arrays + addr = prep.addr_gpu[i,None] + ex_from, ex_to = prep.addr_ex[i] + ex = prep.ex[ex_from:ex_to] + mag = prep.mag[i,None] + ma = prep.ma[i,None] + ma_sum = prep.ma_sum[i,None] + err_phot = prep.err_phot_gpu[i,None] + err_fourier = prep.err_fourier_gpu[i,None] + err_exit = prep.err_exit_gpu[i,None] + + ## build auxilliary wave + AWK.build_aux2(aux, addr, ob, pr, ex, alpha=self.p.alpha) + + ## forward FFT + PROP.fw(aux, aux) + + ## Deviation from measured data + if self.p.compute_fourier_error: + FUK.fourier_error(aux, addr, mag, ma, ma_sum) + FUK.error_reduce(addr, err_fourier) + else: + FUK.fourier_deviation(aux, addr, mag) + FUK.fmag_update_nopbound(aux, addr, mag, ma) + + ## backward FFT + PROP.bw(aux, aux) + + ## build exit wave + AWK.build_exit_alpha_tau(aux, addr, ob, pr, ex, alpha=self.p.alpha, tau=self.p.tau) + if self.p.compute_exit_error: + FUK.exit_error(aux,addr) + FUK.error_reduce(addr, err_exit) + + ## probe/object rescale + #if self.p.rescale_probe: + # pr *= np.sqrt(self.mean_power / (np.abs(pr)**2).mean()) + + ## build auxilliary wave (ob * pr product) + AWK.build_aux2_no_ex(aux, addr, ob, pr) + + # object update + POK.ob_update_local(addr, ob, pr, ex, aux) + + # probe update + POK.pr_update_local(addr, pr, ob, ex, aux) + + ## compute log-likelihood + if self.p.compute_log_likelihood: + PROP.fw(aux, aux) + FUK.log_likelihood2(aux, addr, mag, ma, err_phot) + + self.curiter += 1 + + queue.synchronize() + for name, s in self.ob.S.items(): + s.gpu.get(s.data) + for name, s in self.pr.S.items(): + s.gpu.get(s.data) + + for dID, prep in self.diff_info.items(): + err_fourier = prep.err_fourier_gpu.get() + err_phot = prep.err_phot_gpu.get() + err_exit = prep.err_exit_gpu.get() + errs = np.ascontiguousarray(np.vstack([err_fourier, err_phot, err_exit]).T) + error.update(zip(prep.view_IDs, errs)) + + self.error = error + return error + + def engine_finalize(self): + """ + clear GPU data and destroy context. + """ + for name, s in self.ob.S.items(): + del s.gpu + for name, s in self.pr.S.items(): + del s.gpu + for dID, prep in self.diff_info.items(): + prep.addr = prep.addr_gpu.get() + + # copy data to cpu + # this kills the pagelock memory (otherwise we get segfaults in h5py) + for name, s in self.pr.S.items(): + s.data = np.copy(s.data) + + self.context.detach() + super(DR_pycuda, self).engine_finalize() \ No newline at end of file diff --git a/ptypy/accelerate/cuda_pycuda/engines/DR_pycuda_stream.py b/ptypy/accelerate/cuda_pycuda/engines/DR_pycuda_stream.py new file mode 100644 index 000000000..fd8dd4b5e --- /dev/null +++ b/ptypy/accelerate/cuda_pycuda/engines/DR_pycuda_stream.py @@ -0,0 +1,260 @@ +# -*- coding: utf-8 -*- +""" +Local Douglas-Rachford reconstruction engine for NVIDIA GPUs. + +This engine uses three streams, one for the compute queue and one for each I/O queue. +Events are used to synchronize download / compute/ upload. 
we cannot manipulate memory +for each loop over the state vector, a certain number of memory sections is preallocated +and reused. + +This file is part of the PTYPY package. + + :copyright: Copyright 2014 by the PTYPY team, see AUTHORS. + :license: GPLv2, see LICENSE for details. +""" + +from ptypy.accelerate.cuda_pycuda.engines.DM_pycuda_stream import DM_pycuda_stream +import numpy as np +from pycuda import gpuarray +import pycuda.driver as cuda + +from ptypy import utils as u +from ptypy.utils.verbose import logger, log +from ptypy.utils import parallel +from ptypy.engines import register +from . import DR_pycuda + +from ..mem_utils import make_pagelocked_paired_arrays as mppa +from ..mem_utils import GpuDataManager2 + +MPI = False + +EX_MA_BLOCKS_RATIO = 2 +MAX_BLOCKS = 99999 # can be used to limit the number of blocks, simulating that they don't fit +#MAX_BLOCKS = 4 # can be used to limit the number of blocks, simulating that they don't fit + +__all__ = ['DR_pycuda_stream'] + +@register() +class DR_pycuda_stream(DR_pycuda.DR_pycuda): + + def __init__(self, ptycho_parent, pars=None): + + super(DR_pycuda_stream, self).__init__(ptycho_parent, pars) + self.ma_data = None + self.mag_data = None + self.ex_data = None + + def engine_initialize(self): + super().engine_initialize() + self.qu_htod = cuda.Stream() + self.qu_dtoh = cuda.Stream() + + def _setup_kernels(self): + super()._setup_kernels() + ex_mem = 0 + mag_mem = 0 + fpc = self.ptycho.frames_per_block + for scan, kern in self.kernels.items(): + ex_mem = max(kern.aux.nbytes * fpc, ex_mem) + mag_mem = max(kern.FUK.gpu.fdev.nbytes * fpc, mag_mem) + ma_mem = mag_mem + mem = cuda.mem_get_info()[0] + blk = ex_mem * EX_MA_BLOCKS_RATIO + ma_mem + mag_mem + fit = int(mem - 200 * 1024 * 1024) // blk # leave 200MB room for safety + + # TODO grow blocks dynamically + nex = min(fit * EX_MA_BLOCKS_RATIO, MAX_BLOCKS) + nma = min(fit, MAX_BLOCKS) + + log(3, 'PyCUDA max blocks fitting on GPU: exit arrays={}, ma_arrays={}'.format(nex, nma)) + # reset memory or create new + self.ex_data = GpuDataManager2(ex_mem, 0, nex, True) + self.ma_data = GpuDataManager2(ma_mem, 0, nma, False) + self.mag_data = GpuDataManager2(mag_mem, 0, nma, False) + + def engine_prepare(self): + + super(DR_pycuda.DR_pycuda, self).engine_prepare() + + for name, s in self.ob.S.items(): + s.gpu, s.data = mppa(s.data) + for name, s in self.pr.S.items(): + s.gpu, s.data = mppa(s.data) + + for label, d in self.di.storages.items(): + prep = self.diff_info[d.ID] + prep.addr_gpu = gpuarray.to_gpu(prep.addr) + + for label, d in self.ptycho.new_data: + dID = d.ID + prep = self.diff_info[dID] + pID, oID, eID = prep.poe_IDs + + prep.ma_sum_gpu = gpuarray.to_gpu(prep.ma_sum) + # prepare page-locked mems: + prep.err_fourier_gpu = gpuarray.to_gpu(prep.err_fourier) + prep.err_phot_gpu = gpuarray.to_gpu(prep.err_phot) + prep.err_exit_gpu = gpuarray.to_gpu(prep.err_exit) + ma = self.ma.S[dID].data.astype(np.float32) + prep.ma = cuda.pagelocked_empty(ma.shape, ma.dtype, order="C", mem_flags=4) + prep.ma[:] = ma + ex = self.ex.S[eID].data + prep.ex = cuda.pagelocked_empty(ex.shape, ex.dtype, order="C", mem_flags=4) + prep.ex[:] = ex + mag = prep.mag + prep.mag = cuda.pagelocked_empty(mag.shape, mag.dtype, order="C", mem_flags=4) + prep.mag[:] = mag + + self.ex_data.add_data_block() + self.ma_data.add_data_block() + self.mag_data.add_data_block() + + def engine_iterate(self, num=1): + """ + Compute one iteration. 
+ """ + self.dID_list = list(self.di.S.keys()) + error = {} + + for it in range(num): + + for iblock, dID in enumerate(self.dID_list): + + # find probe, object and exit ID in dependence of dID + prep = self.diff_info[dID] + pID, oID, eID = prep.poe_IDs + + # references for kernels + kern = self.kernels[prep.label] + FUK = kern.FUK + AWK = kern.AWK + POK = kern.POK + PROP = kern.PROP + + # get aux buffer + aux = kern.aux + + # local references + ob = self.ob.S[oID].gpu + pr = self.pr.S[pID].gpu + + # shuffle view order + vieworder = prep.vieworder + prep.rng.shuffle(vieworder) + + # Schedule ex, ma, mag to device + ev_ex, ex_full, data_ex = self.ex_data.to_gpu(prep.ex, dID, self.qu_htod) + ev_mag, mag_full, data_mag = self.mag_data.to_gpu(prep.mag, dID, self.qu_htod) + ev_ma, ma_full, data_ma = self.ma_data.to_gpu(prep.ma, dID, self.qu_htod) + + ## synchronize h2d stream with compute stream + self.queue.wait_for_event(ev_ex) + + # Iterate through views + for i in vieworder: + + # Get local adress and arrays + addr = prep.addr_gpu[i,None] + ex = ex_full[i,None] + mag = mag_full[i,None] + ma = ma_full[i,None] + ma_sum = prep.ma_sum[i,None] + err_phot = prep.err_phot_gpu[i,None] + err_fourier = prep.err_fourier_gpu[i,None] + err_exit = prep.err_exit_gpu[i,None] + + ## build auxilliary wave + AWK.build_aux2(aux, addr, ob, pr, ex, alpha=self.p.alpha) + + ## forward FFT + PROP.fw(aux, aux) + + ## Deviation from measured data + self.queue.wait_for_event(ev_mag) + if self.p.compute_fourier_error: + self.queue.wait_for_event(ev_ma) + FUK.fourier_error(aux, addr, mag, ma, ma_sum) + FUK.error_reduce(addr, err_fourier) + else: + FUK.fourier_deviation(aux, addr, mag) + self.queue.wait_for_event(ev_ma) + FUK.fmag_update_nopbound(aux, addr, mag, ma) + + ## backward FFT + PROP.bw(aux, aux) + + ## build exit wave + AWK.build_exit_alpha_tau(aux, addr, ob, pr, ex, alpha=self.p.alpha, tau=self.p.tau) + if self.p.compute_exit_error: + FUK.exit_error(aux,addr) + FUK.error_reduce(addr, err_exit) + + ## probe/object rescale + #if self.p.rescale_probe: + # pr *= np.sqrt(self.mean_power / (np.abs(pr)**2).mean()) + + ## build auxilliary wave (ob * pr product) + AWK.build_aux2_no_ex(aux, addr, ob, pr) + + # object update + POK.ob_update_local(addr, ob, pr, ex, aux) + + # probe update + POK.pr_update_local(addr, pr, ob, ex, aux) + + ## compute log-likelihood + if self.p.compute_log_likelihood: + PROP.fw(aux, aux) + FUK.log_likelihood2(aux, addr, mag, ma, err_phot) + + data_ex.record_done(self.queue, 'compute') + if iblock + len(self.ex_data) < len(self.dID_list): + data_ex.from_gpu(self.qu_dtoh) + + # swap direction + self.dID_list.reverse() + + self.curiter += 1 + self.ex_data.syncback = False + + # finish all the compute + self.queue.synchronize() + + for name, s in self.ob.S.items(): + s.gpu.get_async(stream=self.qu_dtoh, ary=s.data) + for name, s in self.pr.S.items(): + s.gpu.get_async(stream=self.qu_dtoh, ary=s.data) + + for dID, prep in self.diff_info.items(): + prep.err_fourier_gpu.get(prep.err_fourier) + prep.err_phot_gpu.get(prep.err_phot) + prep.err_exit_gpu.get(prep.err_exit) + errs = np.ascontiguousarray(np.vstack([ + prep.err_fourier, prep.err_phot, prep.err_exit + ]).T) + error.update(zip(prep.view_IDs, errs)) + + # wait for the async transfers + self.qu_dtoh.synchronize() + + self.error = error + return error + + def engine_finalize(self): + """ + Clear all GPU data, pinned memory, etc + """ + self.ex_data = None + self.ma_data = None + self.mag_data = None + + # replacing page-locked data with normal 
npy to avoid + # crash on context destroy + for name, s in self.pr.S.items(): + s.data = np.copy(s.data) + for name, s in self.ob.S.items(): + s.data = np.copy(s.data) + + super().engine_finalize() + \ No newline at end of file diff --git a/ptypy/accelerate/cuda_pycuda/kernels.py b/ptypy/accelerate/cuda_pycuda/kernels.py index 93500168c..bbf53c975 100644 --- a/ptypy/accelerate/cuda_pycuda/kernels.py +++ b/ptypy/accelerate/cuda_pycuda/kernels.py @@ -4,6 +4,7 @@ from ptypy.utils.verbose import log, logger from . import load_kernel from .array_utils import CropPadKernel +from .array_utils import MaxAbs2Kernel from ..base import kernels as ab from ..base.kernels import Adict @@ -136,6 +137,8 @@ def __init__(self, aux, nmodes=1, queue_thread=None, accumulate_type='float', ma 'OUT_TYPE': 'float', 'MATH_TYPE': self.math_type }) + self.fmag_update_nopbound_cuda = None + self.fourier_deviation_cuda = None self.fourier_error_cuda = load_kernel("fourier_error", { 'IN_TYPE': 'float', 'OUT_TYPE': 'float', @@ -150,11 +153,13 @@ def __init__(self, aux, nmodes=1, queue_thread=None, accumulate_type='float', ma 'BDIM_Y': 32, }) self.fourier_update_cuda = None - self.log_likelihood_cuda = load_kernel("log_likelihood", { - 'IN_TYPE': 'float', - 'OUT_TYPE': 'float', - 'MATH_TYPE': self.math_type - }) + self.log_likelihood_cuda, self.log_likelihood2_cuda = load_kernel( + ("log_likelihood", "log_likelihood2"), { + 'IN_TYPE': 'float', + 'OUT_TYPE': 'float', + 'MATH_TYPE': self.math_type + }, + "log_likelihood.cu") self.exit_error_cuda = load_kernel("exit_error", { 'IN_TYPE': 'float', 'OUT_TYPE': 'float', @@ -214,6 +219,28 @@ def fourier_error(self, f, addr, fmag, fmask, mask_sum): shared=int(bx*by*bz*4), stream=self.queue) + def fourier_deviation(self, f, addr, fmag): + fdev = self.gpu.fdev + if self.fourier_deviation_cuda is None: + self.fourier_deviation_cuda = load_kernel("fourier_deviation",{ + 'IN_TYPE': 'float', + 'OUT_TYPE': 'float', + 'MATH_TYPE': self.math_type + }) + bx = 64 + by = 1 + self.fourier_deviation_cuda(np.int32(self.nmodes), + f, + fmag, + fdev, + addr, + np.int32(self.fshape[1]), + np.int32(self.fshape[2]), + block=(bx, by, 1), + grid=(1, int((self.fshape[2] + by - 1)//by), int(fmag.shape[0])), + stream=self.queue) + + def error_reduce(self, addr, err_sum): self.error_reduce_cuda(self.gpu.ferr, err_sum, @@ -237,6 +264,29 @@ def fmag_all_update(self, f, addr, fmag, fmask, err_fmag, pbound=0.0): block=(32, 32, 1), grid=(int(fmag.shape[0]*self.nmodes), 1, 1), stream=self.queue) + + def fmag_update_nopbound(self, f, addr, fmag, fmask): + fdev = self.gpu.fdev + bx = 64 + by = 1 + if self.fmag_update_nopbound_cuda is None: + self.fmag_update_nopbound_cuda = load_kernel("fmag_update_nopbound", { + 'IN_TYPE': 'float', + 'OUT_TYPE': 'float', + 'MATH_TYPE': self.math_type + }) + self.fmag_update_nopbound_cuda(f, + fmask, + fmag, + fdev, + addr, + np.int32(self.fshape[1]), + np.int32(self.fshape[2]), + block=(bx, by, 1), + grid=(1, + int((self.fshape[2] + by - 1) // by), + int(fmag.shape[0]*self.nmodes)), + stream=self.queue) # Note: this was a test to join the kernels, but it's > 2x slower! 
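Taken together, the kernels introduced in this patch (fourier_deviation and fmag_update_nopbound above, plus build_aux2, build_exit_alpha_tau, ob_update_local and pr_update_local further down) perform the per-view arithmetic of the local DR/ePIE-style engine. The NumPy sketch below is purely illustrative: it is reduced to a single mode and a single view, ignores the address book, propagator scaling and GPU streams, and the helper names (magnitude_projection, local_view_update) are invented for the example. The exit-wave and local object/probe update formulas are consistent with the expected values in the unit tests added in this patch; the exact form of the magnitude renormalisation (the eps guard and mask handling) is an assumption rather than a verbatim copy of the CUDA code.

import numpy as np

def magnitude_projection(f, fmag, fmask, eps=1e-7):
    # Rescale the modulus of the propagated wave f to the measured
    # magnitude fmag, with no power bound; pixels with fmask == 0 are
    # left untouched. (Single mode only; the real kernels sum |f|^2
    # over all modes before forming the ratio.)
    af = np.abs(f)
    fm = (1. - fmask) + fmask * fmag / (af + eps)
    return fm * f

def local_view_update(ob, pr, ex, fmag, fmask, alpha=0.0, tau=1.0,
                      fw=np.fft.fft2, bw=np.fft.ifft2):
    # One view of the local DR / ePIE-style iteration on same-shaped 2D arrays.
    aux = (1. + alpha) * pr * ob - alpha * ex             # cf. build_aux2
    aux = bw(magnitude_projection(fw(aux), fmag, fmask))  # Fourier constraint
    df = tau * (aux - alpha * pr * ob - (1. - alpha) * ex)
    ex = ex + df                                          # cf. build_exit_alpha_tau
    aux = pr * ob                                         # cf. build_aux2_no_ex
    ob = ob + pr.conj() * (ex - aux) / np.max(np.abs(pr) ** 2)  # cf. ob_update_local
    pr = pr + ob.conj() * (ex - aux) / np.max(np.abs(ob) ** 2)  # cf. pr_update_local
    return ob, pr, ex

With alpha=0 and tau=1 this reduces to an ePIE-like update, which is how the DR templates in this patch configure the engine.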
def fourier_update(self, f, addr, fmag, fmask, mask_sum, err_fmag, pbound=0): @@ -286,6 +336,24 @@ def log_likelihood(self, b_aux, addr, mag, mask, err_phot): # TODO: we might want to move this call outside of here self.error_reduce(addr, err_phot) + def log_likelihood2(self, b_aux, addr, mag, mask, err_phot): + ferr = self.gpu.ferr + bx = 64 + by = 1 + self.log_likelihood2_cuda(np.int32(self.nmodes), + b_aux, + mask, + mag, + addr, + ferr, + np.int32(self.fshape[1]), + np.int32(self.fshape[2]), + block=(bx, by, 1), + grid=(1, int((self.fshape[1] + by - 1) // by), int(mag.shape[0])), + stream=self.queue) + # TODO: we might want to move this call outside of here + self.error_reduce(addr, err_phot) + def exit_error(self, aux, addr): sh = addr.shape maxz = sh[0] @@ -327,17 +395,24 @@ def __init__(self, queue_thread=None, math_type = 'float'): self.math_type = math_type if math_type not in ['float', 'double']: raise ValueError('Only double or float math is supported') - self.build_aux_cuda = load_kernel("build_aux", { - 'IN_TYPE': 'float', - 'OUT_TYPE': 'float', - 'MATH_TYPE': self.math_type - }) + self.build_aux_cuda, self.build_aux2_cuda = load_kernel( + ("build_aux", "build_aux2"), { + 'IN_TYPE': 'float', + 'OUT_TYPE': 'float', + 'MATH_TYPE': self.math_type + }, "build_aux.cu") self.build_exit_cuda = load_kernel("build_exit", { 'IN_TYPE': 'float', 'OUT_TYPE': 'float', 'MATH_TYPE': self.math_type }) - self.build_aux_no_ex_cuda = load_kernel("build_aux_no_ex", { + self.build_aux_no_ex_cuda, self.build_aux2_no_ex_cuda = load_kernel( + ("build_aux_no_ex", "build_aux2_no_ex"), { + 'IN_TYPE': 'float', + 'OUT_TYPE': 'float', + 'MATH_TYPE': self.math_type + }, "build_aux_no_ex.cu") + self.build_exit_alpha_tau_cuda = load_kernel("build_exit_alpha_tau", { 'IN_TYPE': 'float', 'OUT_TYPE': 'float', 'MATH_TYPE': self.math_type @@ -351,14 +426,9 @@ def load(self, aux, ob, pr, ex, addr): def build_aux(self, b_aux, addr, ob, pr, ex, alpha=1.0): obr, obc = self._cache_object_shape(ob) - # print('grid={}, 1, 1'.format(int(ex.shape[0]))) - # print('b_aux={}, sh={}'.format(type(b_aux), b_aux.shape)) - # print('ex={}, sh={}'.format(type(ex), ex.shape)) - # print('pr={}, sh={}'.format(type(pr), pr.shape)) - # print('ob={}, sh={}'.format(type(ob), ob.shape)) - # print('obr={}, obc={}'.format(obr, obc)) - # print('addr={}, sh={}'.format(type(addr), addr.shape)) - # print('stream={}'.format(self.queue)) + sh = addr.shape + nmodes = sh[1] + maxz = sh[0] self.build_aux_cuda(b_aux, ex, np.int32(ex.shape[1]), np.int32(ex.shape[2]), @@ -368,10 +438,36 @@ def build_aux(self, b_aux, addr, ob, pr, ex, alpha=1.0): obr, obc, addr, np.float32(alpha) if ex.dtype == np.complex64 else np.float64(alpha), - block=(32, 32, 1), grid=(int(ex.shape[0]), 1, 1), stream=self.queue) + block=(32, 32, 1), grid=(int(maxz * nmodes), 1, 1), stream=self.queue) + + def build_aux2(self, b_aux, addr, ob, pr, ex, alpha=1.0): + obr, obc = self._cache_object_shape(ob) + sh = addr.shape + nmodes = sh[1] + maxz = sh[0] + bx = 64 + by = 1 + self.build_aux2_cuda(b_aux, + ex, + np.int32(ex.shape[1]), np.int32(ex.shape[2]), + pr, + np.int32(ex.shape[1]), np.int32(ex.shape[2]), + ob, + obr, obc, + addr, + np.float32(alpha) if ex.dtype == np.complex64 else np.float64(alpha), + block=(bx, by, 1), + grid=( + 1, + int((ex.shape[1] + by - 1)//by), + int(maxz * nmodes)), + stream=self.queue) def build_exit(self, b_aux, addr, ob, pr, ex): obr, obc = self._cache_object_shape(ob) + sh = addr.shape + nmodes = sh[1] + maxz = sh[0] self.build_exit_cuda(b_aux, ex, 
np.int32(ex.shape[1]), np.int32(ex.shape[2]), @@ -380,7 +476,27 @@ def build_exit(self, b_aux, addr, ob, pr, ex): ob, obr, obc, addr, - block=(32, 32, 1), grid=(int(ex.shape[0]), 1, 1), stream=self.queue) + block=(32, 32, 1), grid=(int(maxz * nmodes), 1, 1), stream=self.queue) + + def build_exit_alpha_tau(self, b_aux, addr, ob, pr, ex, alpha=1, tau=1): + obr, obc = self._cache_object_shape(ob) + sh = addr.shape + nmodes = sh[1] + maxz = sh[0] + bx = 64 + by = 1 + self.build_exit_alpha_tau_cuda(b_aux, + ex, + np.int32(ex.shape[1]), np.int32(ex.shape[2]), + pr, + np.int32(ex.shape[1]), np.int32(ex.shape[2]), + ob, + obr, obc, + addr, + np.float32(alpha), np.float32(tau), + block=(bx, by, 1), + grid=(1, int((ex.shape[1] + by - 1) // by), int(maxz * nmodes)), + stream=self.queue) def build_aux_no_ex(self, b_aux, addr, ob, pr, fac=1.0, add=False): obr, obc = self._cache_object_shape(ob) @@ -402,6 +518,30 @@ def build_aux_no_ex(self, b_aux, addr, ob, pr, fac=1.0, add=False): grid=(int(maxz * nmodes), 1, 1), stream=self.queue) + + def build_aux2_no_ex(self, b_aux, addr, ob, pr, fac=1.0, add=False): + obr, obc = self._cache_object_shape(ob) + sh = addr.shape + nmodes = sh[1] + maxz = sh[0] + bx = 64 + by = 1 + self.build_aux2_no_ex_cuda(b_aux, + np.int32(b_aux.shape[-2]), + np.int32(b_aux.shape[-1]), + pr, + np.int32(pr.shape[-2]), + np.int32(pr.shape[-1]), + ob, + obr, obc, + addr, + np.float32(fac) if pr.dtype == np.complex64 else np.float64(fac), + np.int32(add), + block=(bx, by, 1), + grid=(1, int((b_aux.shape[-2] + by - 1)//by), int(maxz * nmodes)), + stream=self.queue) + + def _cache_object_shape(self, ob): oid = id(ob) @@ -441,21 +581,17 @@ def __init__(self, aux, nmodes=1, queue=None, accumulate_type = 'double', math_t 'BDIM_X': 32, 'BDIM_Y': 32 }) - self.fill_b_cuda = load_kernel('fill_b', { - **subs, - 'BDIM_X': 1024, - 'OUT_TYPE': self.accumulate_type - }) - self.fill_b_reduce_cuda = load_kernel( - 'fill_b_reduce', { + self.fill_b_cuda, self.fill_b_reduce_cuda = load_kernel( + ('fill_b', 'fill_b_reduce'), + { **subs, - 'BDIM_X': 1024, - 'IN_TYPE': self.accumulate_type, # must match out-type of fill_b + 'BDIM_X': 1024, 'OUT_TYPE': 'float' if self.ftype == np.float32 else 'double' - }) + }, + file="fill_b.cu") self.main_cuda = load_kernel('gd_main', subs) - self.floating_intensity_cuda_step1 = load_kernel('step1', subs,'intens_renorm.cu') - self.floating_intensity_cuda_step2 = load_kernel('step2', subs,'intens_renorm.cu') + self.floating_intensity_cuda_step1, self.floating_intensity_cuda_step2 = \ + load_kernel(('step1', 'step2'), subs,'intens_renorm.cu') def allocate(self): self.gpu.LLden = gpuarray.zeros(self.fshape, dtype=self.ftype) @@ -639,6 +775,8 @@ def __init__(self, queue_thread=None, self.math_type = math_type self.accumulator_type = accumulator_type self.queue = queue_thread + self.norm = None + self.MAK = MaxAbs2Kernel(self.queue) self.ob_update_cuda = load_kernel("ob_update", { 'IN_TYPE': 'float', 'OUT_TYPE': 'float', @@ -663,6 +801,18 @@ def __init__(self, queue_thread=None, 'MATH_TYPE': self.math_type }) self.pr_update2_ML_cuda = None + self.ob_update_local_cuda = load_kernel("ob_update_local", { + 'IN_TYPE': 'float', + 'OUT_TYPE': 'float', + 'MATH_TYPE': self.math_type, + 'ACC_TYPE': self.accumulator_type + }) + self.pr_update_local_cuda = load_kernel("pr_update_local", { + 'IN_TYPE': 'float', + 'OUT_TYPE': 'float', + 'MATH_TYPE': self.math_type, + 'ACC_TYPE': self.accumulator_type + }) def ob_update(self, addr, ob, obn, pr, ex, atomics=True): obsh = [np.int32(ax) for ax 
in ob.shape] @@ -823,6 +973,63 @@ def pr_update_ML(self, addr, pr, ob, ex, fac=2.0, atomics=False): block=(16, 16, 1), grid=grid, stream=self.queue) + def ob_update_local(self, addr, ob, pr, ex, aux): + # lazy allocation of temporary 1-element array + if self.norm is None: + self.norm = gpuarray.empty((1,), dtype=np.float32) + self.MAK.max_abs2(pr, self.norm) + + obsh = [np.int32(ax) for ax in ob.shape] + prsh = [np.int32(ax) for ax in pr.shape] + exsh = [np.int32(ax) for ax in ex.shape] + # atomics version only + if addr.shape[3] != 3 or addr.shape[2] != 5: + raise ValueError('Address not in required shape for tiled pr_update') + num_pods = np.int32(addr.shape[0] * addr.shape[1]) + bx = 64 + by = 1 + self.ob_update_local_cuda(ex, aux, + exsh[0], exsh[1], exsh[2], + pr, + prsh[0], prsh[1], prsh[2], + self.norm, + ob, + obsh[0], obsh[1], obsh[2], + addr, + block=(bx, by, 1), + grid=(1, int((exsh[1] + by - 1)//by), int(num_pods)), + stream=self.queue) + + def pr_update_local(self, addr, pr, ob, ex, aux): + # lazy allocation of temporary 1-element array + if self.norm is None: + self.norm = gpuarray.empty((1,), dtype=np.float32) + self.MAK.max_abs2(ob, self.norm) + + obsh = [np.int32(ax) for ax in ob.shape] + prsh = [np.int32(ax) for ax in pr.shape] + exsh = [np.int32(ax) for ax in ex.shape] + # atomics version only + if addr.shape[3] != 3 or addr.shape[2] != 5: + raise ValueError('Address not in required shape for tiled pr_update') + num_pods = np.int32(addr.shape[0] * addr.shape[1]) + + bx = 64 + by = 1 + self.pr_update_local_cuda(ex, aux, + exsh[0], exsh[1], exsh[2], + pr, + prsh[0], prsh[1], prsh[2], + self.norm, + ob, + obsh[0], obsh[1], obsh[2], + addr, + block=(bx, by, 1), + grid=(1, int((exsh[1] + by - 1) // by), int(num_pods)), + stream=self.queue) + + + class PositionCorrectionKernel(ab.PositionCorrectionKernel): def __init__(self, aux, nmodes, queue_thread=None, math_type='float', accumulate_type='float'): super(PositionCorrectionKernel, self).__init__(aux, nmodes) @@ -869,13 +1076,16 @@ def allocate(self): def build_aux(self, b_aux, addr, ob, pr): obr, obc = self._cache_object_shape(ob) + sh = addr.shape + nmodes = sh[1] + maxz = sh[0] self.build_aux_pc_cuda(b_aux, pr, np.int32(pr.shape[1]), np.int32(pr.shape[2]), ob, obr, obc, addr, - block=(32, 32, 1), grid=(int(np.prod(addr.shape[:1])), 1, 1), stream=self.queue) + block=(32, 32, 1), grid=(int(maxz * nmodes), 1, 1), stream=self.queue) def fourier_error(self, f, addr, fmag, fmask, mask_sum): fdev = self.gpu.fdev diff --git a/templates/minimal_prep_and_run_DM_local.py b/templates/minimal_prep_and_run_DR_pycuda.py similarity index 84% rename from templates/minimal_prep_and_run_DM_local.py rename to templates/minimal_prep_and_run_DR_pycuda.py index 9f7f5f9f0..654df60fe 100644 --- a/templates/minimal_prep_and_run_DM_local.py +++ b/templates/minimal_prep_and_run_DR_pycuda.py @@ -6,7 +6,7 @@ from ptypy.core import Ptycho from ptypy import utils as u -from ptypy.accelerate.base.engines import DM_local +from ptypy.accelerate.cuda_pycuda.engines import DR_pycuda p = u.Param() # for verbose output @@ -19,7 +19,7 @@ p.io = u.Param() p.io.home = "/tmp/ptypy/" p.io.autosave = u.Param(active=False) -p.io.interaction = u.Param(active=True) +p.io.interaction = u.Param(active=False) p.io.interaction.client = u.Param() p.io.interaction.client.poll_timeout = 1 @@ -42,17 +42,15 @@ # Gaussian FWHM of possible detector blurring p.scans.MF.data.psf = 0.0 p.scans.MF.coherence = u.Param() -p.scans.MF.coherence.num_probe_modes = 1 
+p.scans.MF.coherence.num_probe_modes = 3 # attach a reconstrucion engine p.engines = u.Param() p.engines.engine00 = u.Param() -p.engines.engine00.name = 'DM_local' +p.engines.engine00.name = 'DR_pycuda' p.engines.engine00.numiter = 100 p.engines.engine00.alpha = 0 # alpha=0, tau=1 behaves like ePIE p.engines.engine00.tau = 1 -p.engines.engine00.rescale_probe = False -p.engines.engine00.fourier_power_bound = 0.0 # prepare and run P = Ptycho(p,level=5) diff --git a/templates/minimal_prep_and_run_DR_serial.py b/templates/minimal_prep_and_run_DR_serial.py new file mode 100644 index 000000000..a9d16eb45 --- /dev/null +++ b/templates/minimal_prep_and_run_DR_serial.py @@ -0,0 +1,58 @@ +""" +This script is a test for ptychographic reconstruction in the absence +of actual data. It uses the test Scan class +`ptypy.core.data.MoonFlowerScan` to provide "data". +""" + +from ptypy.core import Ptycho +from ptypy import utils as u +from ptypy.accelerate.base.engines import DR_serial +p = u.Param() + +# for verbose output +p.verbose_level = 3 + +# Frames per block +p.frames_per_block = 200 + +# set home path +p.io = u.Param() +p.io.home = "/tmp/ptypy/" +p.io.autosave = u.Param(active=False) +p.io.interaction = u.Param(active=False) +p.io.interaction.client = u.Param() +p.io.interaction.client.poll_timeout = 1 + +# max 200 frames (128x128px) of diffraction data +p.scans = u.Param() +p.scans.MF = u.Param() +# now you have to specify which ScanModel to use with scans.XX.name, +# just as you have to give 'name' for engines and PtyScan subclasses. +p.scans.MF.name = 'Full' +p.scans.MF.data= u.Param() +p.scans.MF.data.name = 'MoonFlowerScan' +p.scans.MF.data.shape = 128 +p.scans.MF.data.num_frames = 200 +p.scans.MF.data.save = None + +# position distance in fraction of illumination frame +p.scans.MF.data.density = 0.2 +# total number of photon in empty beam +p.scans.MF.data.photons = 1e8 +# Gaussian FWHM of possible detector blurring +p.scans.MF.data.psf = 0.0 +p.scans.MF.coherence = u.Param() +p.scans.MF.coherence.num_probe_modes = 3 + +# attach a reconstrucion engine +p.engines = u.Param() +p.engines.engine00 = u.Param() +p.engines.engine00.name = 'DR_serial' +p.engines.engine00.numiter = 100 +p.engines.engine00.alpha = 0 # alpha=0, tau=1 behaves like ePIE +p.engines.engine00.tau = 1 +#p.engines.engine00.rescale_probe = False +#p.engines.engine00.fourier_power_bound = 0.0 + +# prepare and run +P = Ptycho(p,level=5) diff --git a/templates/minimal_prep_and_run_probe_modes.py b/templates/minimal_prep_and_run_probe_modes.py index 8dbcb4dc4..17d358b6b 100644 --- a/templates/minimal_prep_and_run_probe_modes.py +++ b/templates/minimal_prep_and_run_probe_modes.py @@ -14,7 +14,10 @@ # set home path p.io = u.Param() p.io.home = "/tmp/ptypy/" -p.io.autosave = None +p.io.autosave = u.Param(active=False) +p.io.interaction = u.Param(active=True) +p.io.interaction.client = u.Param() +p.io.interaction.client.poll_timeout = 1 # max 200 frames (128x128px) of diffraction data p.scans = u.Param() diff --git a/test/accelerate_tests/base_tests/auxiliary_wave_kernel_test.py b/test/accelerate_tests/base_tests/auxiliary_wave_kernel_test.py index e38909e71..93e753a51 100644 --- a/test/accelerate_tests/base_tests/auxiliary_wave_kernel_test.py +++ b/test/accelerate_tests/base_tests/auxiliary_wave_kernel_test.py @@ -21,7 +21,7 @@ def setUp(self): def tearDown(self): np.set_printoptions() - def prepare_arrays(self): + def prepare_arrays(self, scan_points = None): B = 3 # frame size y C = 3 # frame size x @@ -34,7 +34,10 @@ def 
prepare_arrays(self): H = B + npts_greater_than # object size y I = C + npts_greater_than # object size x - scan_pts = 2 # one dimensional scan point number + if scan_points is None: + scan_pts = 2 # one dimensional scan point number + else: + scan_pts = scan_points total_number_scan_positions = scan_pts ** 2 total_number_modes = G * D @@ -76,25 +79,17 @@ def prepare_arrays(self): return addr, object_array, probe, exit_wave def test_build_aux_same_as_exit(self): - ''' - setup - ''' - - ''' - test - ''' + # setup addr, object_array, probe, exit_wave = self.prepare_arrays() auxiliary_wave = np.zeros_like(exit_wave) + # test AWK = AuxiliaryWaveKernel() alpha_set = 1.0 AWK.allocate() # doesn't actually do anything at the moment - AWK.build_aux(auxiliary_wave, addr, object_array, probe, exit_wave, alpha=alpha_set) - # print("auxiliary_wave after") - # print(repr(auxiliary_wave)) - + # assert expected_auxiliary_wave = np.array([[[-1. + 3.j, -1. + 3.j, -1. + 3.j], [-1. + 3.j, -1. + 3.j, -1. + 3.j], [-1. + 3.j, -1. + 3.j, -1. + 3.j]], @@ -143,32 +138,20 @@ def test_build_aux_same_as_exit(self): [[-16. + 16.j, -16. + 16.j, -16. + 16.j], [-16. + 16.j, -16. + 16.j, -16. + 16.j], [-16. + 16.j, -16. + 16.j, -16. + 16.j]]], dtype=COMPLEX_TYPE) - np.testing.assert_array_equal(expected_auxiliary_wave, expected_auxiliary_wave, err_msg="The auxiliary_wave has not been updated as expected") def test_build_exit_aux_same_as_exit(self): - ''' - setup - ''' + # setup addr, object_array, probe, exit_wave = self.prepare_arrays() - - ''' - test - ''' auxiliary_wave = np.zeros_like(exit_wave) + # test AWK = AuxiliaryWaveKernel() AWK.allocate() - AWK.build_exit(auxiliary_wave, addr, object_array, probe, exit_wave) - # - # print("auxiliary_wave after") - # print(repr(auxiliary_wave)) - # - # print("exit_wave after") - # print(repr(exit_wave)) + # assert expected_auxiliary_wave = np.array([[[0. - 2.j, 0. - 2.j, 0. - 2.j], [0. - 2.j, 0. - 2.j, 0. - 2.j], [0. - 2.j, 0. - 2.j, 0. - 2.j]], @@ -217,10 +200,10 @@ def test_build_exit_aux_same_as_exit(self): [[0. - 16.j, 0. - 16.j, 0. - 16.j], [0. - 16.j, 0. - 16.j, 0. - 16.j], [0. - 16.j, 0. - 16.j, 0. - 16.j]]], dtype=COMPLEX_TYPE) - np.testing.assert_array_equal(auxiliary_wave, expected_auxiliary_wave, err_msg="The auxiliary_wave has not been updated as expected") + # assert expected_exit_wave = np.array([[[1. - 1.j, 1. - 1.j, 1. - 1.j], [1. - 1.j, 1. - 1.j, 1. - 1.j], [1. - 1.j, 1. - 1.j, 1. - 1.j]], @@ -269,24 +252,20 @@ def test_build_exit_aux_same_as_exit(self): [[16. + 0.j, 16. + 0.j, 16. + 0.j], [16. + 0.j, 16. + 0.j, 16. + 0.j], [16. + 0.j, 16. + 0.j, 16. + 0.j]]], dtype=COMPLEX_TYPE) - np.testing.assert_array_equal(exit_wave, expected_exit_wave, err_msg="The exit_wave has not been updated as expected") def test_build_aux_no_ex(self): - ''' - setup - ''' + # setup addr, object_array, probe, exit_wave = self.prepare_arrays() - - ''' - test - ''' auxiliary_wave = np.zeros_like(exit_wave) + # test AWK = AuxiliaryWaveKernel() AWK.allocate() AWK.build_aux_no_ex(auxiliary_wave, addr, object_array, probe, fac=1.0, add=False) + + # assert expected_auxiliary_wave = np.array([[[0. + 2.j, 0. + 2.j, 0. + 2.j], [0. + 2.j, 0. + 2.j, 0. + 2.j], [0. + 2.j, 0. + 2.j, 0. + 2.j]], @@ -337,9 +316,12 @@ def test_build_aux_no_ex(self): [0. + 16.j, 0. + 16.j, 0. 
+ 16.j]]], dtype=np.complex64) np.testing.assert_array_equal(auxiliary_wave, expected_auxiliary_wave, err_msg="The auxiliary_wave has not been updated as expected") + + # test auxiliary_wave = exit_wave AWK.build_aux_no_ex(auxiliary_wave, addr, object_array, probe, fac=2.0, add=True) + # assert expected_auxiliary_wave = np.array([[[1. + 5.j, 1. + 5.j, 1. + 5.j], [1. + 5.j, 1. + 5.j, 1. + 5.j], [1. + 5.j, 1. + 5.j, 1. + 5.j]], @@ -391,5 +373,57 @@ def test_build_aux_no_ex(self): np.testing.assert_array_equal(auxiliary_wave, expected_auxiliary_wave, err_msg="The auxiliary_wave has not been updated as expected") + + def test_build_exit_alpha_tau(self): + + # setup + addr, object_array, probe, exit_wave = self.prepare_arrays(scan_points=1) + auxiliary_wave = np.zeros_like(exit_wave) + + # test + AWK = AuxiliaryWaveKernel() + AWK.allocate() + AWK.build_exit_alpha_tau(auxiliary_wave, addr, object_array, probe, exit_wave) + + # assert + expected_auxiliary_wave = np.array( + [[[0. -2.j, 0. -2.j, 0. -2.j], + [0. -2.j, 0. -2.j, 0. -2.j], + [0. -2.j, 0. -2.j, 0. -2.j]], + + [[0. -8.j, 0. -8.j, 0. -8.j], + [0. -8.j, 0. -8.j, 0. -8.j], + [0. -8.j, 0. -8.j, 0. -8.j]], + + [[0. -4.j, 0. -4.j, 0. -4.j], + [0. -4.j, 0. -4.j, 0. -4.j], + [0. -4.j, 0. -4.j, 0. -4.j]], + + [[0.-16.j, 0.-16.j, 0.-16.j], + [0.-16.j, 0.-16.j, 0.-16.j], + [0.-16.j, 0.-16.j, 0.-16.j]]], dtype=np.complex64) + np.testing.assert_array_equal(auxiliary_wave, expected_auxiliary_wave, + err_msg="The auxiliary_wave has not been updated as expected") + + # assert + expected_exit_wave = np.array( + [[[1. -1.j, 1. -1.j, 1. -1.j], + [1. -1.j, 1. -1.j, 1. -1.j], + [1. -1.j, 1. -1.j, 1. -1.j]], + + [[2. -6.j, 2. -6.j, 2. -6.j], + [2. -6.j, 2. -6.j, 2. -6.j], + [2. -6.j, 2. -6.j, 2. -6.j]], + + [[3. -1.j, 3. -1.j, 3. -1.j], + [3. -1.j, 3. -1.j, 3. -1.j], + [3. -1.j, 3. -1.j, 3. -1.j]], + + [[4.-12.j, 4.-12.j, 4.-12.j], + [4.-12.j, 4.-12.j, 4.-12.j], + [4.-12.j, 4.-12.j, 4.-12.j]]], dtype=np.complex64) + np.testing.assert_array_equal(exit_wave, expected_exit_wave, + err_msg="The exit_wave has not been updated as expected") + if __name__ == '__main__': unittest.main() diff --git a/test/accelerate_tests/base_tests/po_update_kernel_test.py b/test/accelerate_tests/base_tests/po_update_kernel_test.py index 15557e3d2..a8d20ce78 100644 --- a/test/accelerate_tests/base_tests/po_update_kernel_test.py +++ b/test/accelerate_tests/base_tests/po_update_kernel_test.py @@ -91,26 +91,15 @@ def prepare_arrays(self): return addr, object_array, object_array_denominator, probe, exit_wave, probe_denominator def test_ob_update(self): - ''' - setup - ''' + # setup addr, object_array, object_array_denominator, probe, exit_wave, probe_denominator = self.prepare_arrays() - ''' - test - ''' + # test POUK = PoUpdateKernel() - POUK.allocate() # doesn't do anything but is the call signature - - # print("object array denom before:") - # print(object_array_denominator) - POUK.ob_update(addr, object_array, object_array_denominator, probe, exit_wave) - # print("object array denom after:") - # print(repr(object_array_denominator)) - + # assert expected_object_array = np.array([[[15. + 1.j, 53. + 1.j, 53. + 1.j, 53. + 1.j, 53. + 1.j, 39. + 1.j, 1. + 1.j], [77. + 1.j, 201. + 1.j, 201. + 1.j, 201. + 1.j, 201. + 1.j, 125. + 1.j, 1. + 1.j], @@ -136,10 +125,10 @@ def test_ob_update(self): 4. + 4.j], [4. + 4.j, 4. + 4.j, 4. + 4.j, 4. + 4.j, 4. + 4.j, 4. + 4.j, 4. 
+ 4.j]]], dtype=COMPLEX_TYPE) - np.testing.assert_array_equal(object_array, expected_object_array, err_msg="The object array has not been updated as expected") + # assert expected_object_array_denominator = np.array([[[12., 22., 22., 22., 22., 12., 2.], [22., 42., 42., 42., 42., 22., 2.], [22., 42., 42., 42., 42., 22., 2.], @@ -159,29 +148,15 @@ def test_ob_update(self): err_msg="The object array denominatorhas not been updated as expected") def test_pr_update(self): - ''' - setup - ''' + # setup addr, object_array, object_array_denominator, probe, exit_wave, probe_denominator = self.prepare_arrays() - ''' - test - ''' - POUK = PoUpdateKernel() + # test + POUK = PoUpdateKernel() POUK.allocate() # this doesn't do anything, but is the call pattern. - - # print("probe array before:") - # print(repr(probe)) - # print("probe denominator array before:") - # print(repr(probe_denominator)) - POUK.pr_update(addr, probe, probe_denominator, object_array, exit_wave) - # print("probe array after:") - # print(repr(probe)) - # print("probe denominator array after:") - # print(repr(probe_denominator)) - + # assert expected_probe = np.array([[[313. + 1.j, 313. + 1.j, 313. + 1.j, 313. + 1.j, 313. + 1.j], [313. + 1.j, 313. + 1.j, 313. + 1.j, 313. + 1.j, 313. + 1.j], [313. + 1.j, 313. + 1.j, 313. + 1.j, 313. + 1.j, 313. + 1.j], @@ -194,9 +169,10 @@ def test_pr_update(self): [394. + 2.j, 394. + 2.j, 394. + 2.j, 394. + 2.j, 394. + 2.j], [394. + 2.j, 394. + 2.j, 394. + 2.j, 394. + 2.j, 394. + 2.j]]], dtype=COMPLEX_TYPE) - np.testing.assert_array_equal(probe, expected_probe, err_msg="The probe has not been updated as expected") + + # assert expected_probe_denominator = np.array([[[138., 138., 138., 138., 138.], [138., 138., 138., 138., 138.], [138., 138., 138., 138., 138.], @@ -212,19 +188,15 @@ def test_pr_update(self): err_msg="The probe denominatorhas not been updated as expected") def test_pr_update_ML(self): - ''' - setup - ''' + # setup addr, object_array, object_array_denominator, probe, exit_wave, probe_denominator = self.prepare_arrays() - ''' - test - ''' - POUK = PoUpdateKernel() + # test + POUK = PoUpdateKernel() POUK.allocate() # this doesn't do anything, but is the call pattern. - POUK.pr_update_ML(addr, probe, object_array, exit_wave) + # assert expected_probe = np.array([[[625. + 1.j, 625. + 1.j, 625. + 1.j, 625. + 1.j, 625. + 1.j], [625. + 1.j, 625. + 1.j, 625. + 1.j, 625. + 1.j, 625. + 1.j], [625. + 1.j, 625. + 1.j, 625. + 1.j, 625. + 1.j, 625. + 1.j], @@ -237,26 +209,19 @@ def test_pr_update_ML(self): [786. + 2.j, 786. + 2.j, 786. + 2.j, 786. + 2.j, 786. + 2.j], [786. + 2.j, 786. + 2.j, 786. + 2.j, 786. + 2.j, 786. + 2.j]]], dtype=COMPLEX_TYPE) - np.testing.assert_array_equal(probe, expected_probe, err_msg="The probe has not been updated as expected") def test_ob_update_ML(self): - ''' - setup - ''' + # setup addr, object_array, object_array_denominator, probe, exit_wave, probe_denominator = self.prepare_arrays() - ''' - test - ''' - POUK = PoUpdateKernel() + # test + POUK = PoUpdateKernel() POUK.allocate() # this doesn't do anything, but is the call pattern. - POUK.ob_update_ML(addr, object_array, probe, exit_wave) - print(repr(object_array)) - + # assert expected_object_array = np.array( [[[29. + 1.j, 105. + 1.j, 105. + 1.j, 105. + 1.j, 105. + 1.j, 77. + 1.j, 1. + 1.j], [153. + 1.j, 401. + 1.j, 401. + 1.j, 401. + 1.j, 401. + 1.j, 249. + 1.j, 1. + 1.j], @@ -274,7 +239,147 @@ def test_ob_update_ML(self): [140. + 4.j, 324. + 4.j, 324. + 4.j, 324. + 4.j, 324. + 4.j, 188. + 4.j, 4. + 4.j], [4. 
+ 4.j, 4. + 4.j, 4. + 4.j, 4. + 4.j, 4. + 4.j, 4. + 4.j, 4. + 4.j]]], dtype=COMPLEX_TYPE) + np.testing.assert_array_equal(object_array, expected_object_array, + err_msg="The object array has not been updated as expected") + + + def test_pr_update_local(self): + # setup + B = 5 # frame size y + C = 5 # frame size x + + D = 1 # number of probe modes + E = B # probe size y + F = C # probe size x + + npts_greater_than = 2 # how many points bigger than the probe the object is. + G = 1 # number of object modes + H = B + npts_greater_than # object size y + I = C + npts_greater_than # object size x + + scan_pts = 1 # one dimensional scan point number + + total_number_scan_positions = scan_pts ** 2 + total_number_modes = G * D + A = total_number_scan_positions * total_number_modes # this is a 16 point scan pattern (4x4 grid) over all the modes + + probe = np.empty(shape=(D, E, F), dtype=COMPLEX_TYPE) + for idx in range(D): + probe[idx] = np.ones((E, F)) * (idx + 1) + 1j * np.ones((E, F)) * (idx + 1) + + object_array = np.empty(shape=(G, H, I), dtype=COMPLEX_TYPE) + for idx in range(G): + object_array[idx] = np.ones((H, I)) * (3 * idx + 1) + 1j * np.ones((H, I)) * (3 * idx + 1) + + exit_wave = np.empty(shape=(A, B, C), dtype=COMPLEX_TYPE) + for idx in range(A): + exit_wave[idx] = np.ones((B, C)) * (idx + 1) + 1j * np.ones((B, C)) * (idx + 1) + auxiliary_wave = exit_wave.copy() * 1.5 + + X, Y = np.meshgrid(range(scan_pts), range(scan_pts)) + X = X.reshape((total_number_scan_positions)) + Y = Y.reshape((total_number_scan_positions)) + + addr = np.zeros((total_number_scan_positions, total_number_modes, 5, 3), dtype=INT_TYPE) + + exit_idx = 0 + position_idx = 0 + for xpos, ypos in zip(X, Y): # + mode_idx = 0 + for pr_mode in range(D): + for ob_mode in range(G): + addr[position_idx, mode_idx] = np.array([[pr_mode, 0, 0], + [ob_mode, ypos, xpos], + [exit_idx, 0, 0], + [0, 0, 0], + [0, 0, 0]], dtype=INT_TYPE) + mode_idx += 1 + exit_idx += 1 + position_idx += 1 + + # test + POUK = PoUpdateKernel() + POUK.allocate() # this doesn't do anything, but is the call pattern. + POUK.pr_update_local(addr, probe, object_array, exit_wave, auxiliary_wave) + + # assert + expected_probe = np.array( + [[[0.49999994+1.j, 0.49999994+1.j, 0.49999994+1.j, 0.49999994+1.j, 0.49999994+1.j], + [0.49999994+1.j, 0.49999994+1.j, 0.49999994+1.j, 0.49999994+1.j, 0.49999994+1.j], + [0.49999994+1.j, 0.49999994+1.j, 0.49999994+1.j, 0.49999994+1.j, 0.49999994+1.j], + [0.49999994+1.j, 0.49999994+1.j, 0.49999994+1.j, 0.49999994+1.j, 0.49999994+1.j], + [0.49999994+1.j, 0.49999994+1.j, 0.49999994+1.j, 0.49999994+1.j, 0.49999994+1.j]]], dtype=COMPLEX_TYPE) + np.testing.assert_array_equal(probe, expected_probe, + err_msg="The probe has not been updated as expected") + + def test_ob_update_local(self): + # setup + B = 5 # frame size y + C = 5 # frame size x + + D = 1 # number of probe modes + E = B # probe size y + F = C # probe size x + + npts_greater_than = 2 # how many points bigger than the probe the object is. 
+ G = 1 # number of object modes + H = B + npts_greater_than # object size y + I = C + npts_greater_than # object size x + + scan_pts = 1 # one dimensional scan point number + + total_number_scan_positions = scan_pts ** 2 + total_number_modes = G * D + A = total_number_scan_positions * total_number_modes # this is a 16 point scan pattern (4x4 grid) over all the modes + + probe = np.empty(shape=(D, E, F), dtype=COMPLEX_TYPE) + for idx in range(D): + probe[idx] = np.ones((E, F)) * (idx + 1) + 1j * np.ones((E, F)) * (idx + 1) + + object_array = np.empty(shape=(G, H, I), dtype=COMPLEX_TYPE) + for idx in range(G): + object_array[idx] = np.ones((H, I)) * (3 * idx + 1) + 1j * np.ones((H, I)) * (3 * idx + 1) + + exit_wave = np.empty(shape=(A, B, C), dtype=COMPLEX_TYPE) + for idx in range(A): + exit_wave[idx] = np.ones((B, C)) * (idx + 1) + 1j * np.ones((B, C)) * (idx + 1) + auxiliary_wave = exit_wave.copy() * 2 + + X, Y = np.meshgrid(range(scan_pts), range(scan_pts)) + X = X.reshape((total_number_scan_positions)) + Y = Y.reshape((total_number_scan_positions)) + + addr = np.zeros((total_number_scan_positions, total_number_modes, 5, 3), dtype=INT_TYPE) + + exit_idx = 0 + position_idx = 0 + for xpos, ypos in zip(X, Y): # + mode_idx = 0 + for pr_mode in range(D): + for ob_mode in range(G): + addr[position_idx, mode_idx] = np.array([[pr_mode, 0, 0], + [ob_mode, ypos, xpos], + [exit_idx, 0, 0], + [0, 0, 0], + [0, 0, 0]], dtype=INT_TYPE) + mode_idx += 1 + exit_idx += 1 + position_idx += 1 + # test + POUK = PoUpdateKernel() + POUK.allocate() # this doesn't do anything, but is the call pattern. + POUK.ob_update_local(addr, object_array, probe, exit_wave, auxiliary_wave) + + # assert + expected_object_array = np.array( + [[[-1.1920929e-07+1.j, -1.1920929e-07+1.j, -1.1920929e-07+1.j, -1.1920929e-07+1.j, -1.1920929e-07+1.j, 1.0000000e+00+1.j, 1.0000000e+00+1.j], + [-1.1920929e-07+1.j, -1.1920929e-07+1.j, -1.1920929e-07+1.j, -1.1920929e-07+1.j, -1.1920929e-07+1.j, 1.0000000e+00+1.j, 1.0000000e+00+1.j], + [-1.1920929e-07+1.j, -1.1920929e-07+1.j, -1.1920929e-07+1.j, -1.1920929e-07+1.j, -1.1920929e-07+1.j, 1.0000000e+00+1.j, 1.0000000e+00+1.j], + [-1.1920929e-07+1.j, -1.1920929e-07+1.j, -1.1920929e-07+1.j, -1.1920929e-07+1.j, -1.1920929e-07+1.j, 1.0000000e+00+1.j, 1.0000000e+00+1.j], + [-1.1920929e-07+1.j, -1.1920929e-07+1.j, -1.1920929e-07+1.j, -1.1920929e-07+1.j, -1.1920929e-07+1.j, 1.0000000e+00+1.j, 1.0000000e+00+1.j], + [ 1.0000000e+00+1.j, 1.0000000e+00+1.j, 1.0000000e+00+1.j, 1.0000000e+00+1.j, 1.0000000e+00+1.j, 1.0000000e+00+1.j, 1.0000000e+00+1.j], + [ 1.0000000e+00+1.j, 1.0000000e+00+1.j, 1.0000000e+00+1.j, 1.0000000e+00+1.j, 1.0000000e+00+1.j, 1.0000000e+00+1.j, 1.0000000e+00+1.j]]], dtype=COMPLEX_TYPE) np.testing.assert_array_equal(object_array, expected_object_array, err_msg="The object array has not been updated as expected") diff --git a/test/accelerate_tests/cuda_pycuda_tests/array_utils_test.py b/test/accelerate_tests/cuda_pycuda_tests/array_utils_test.py index 611c67759..9823f2a9b 100644 --- a/test/accelerate_tests/cuda_pycuda_tests/array_utils_test.py +++ b/test/accelerate_tests/cuda_pycuda_tests/array_utils_test.py @@ -354,3 +354,35 @@ def test_crop_pad_simple_oblike_UNITY(self): # Assert np.testing.assert_allclose(A, A_dev.get(), rtol=1e-6, atol=1e-6) + + def test_max_abs2_complex_UNITY(self): + np.random.seed(1983) + X = (np.random.randint(-1000, 1000, (3,100,200)).astype(np.float32) + \ + 1j * np.random.randint(-1000, 1000, (3,100,200)).astype(np.float32)).astype(np.complex64) + out = 
np.zeros((1,), dtype=np.float32) + X_dev = gpuarray.to_gpu(X) + out_dev = gpuarray.to_gpu(out) + + out = au.max_abs2(X) + + MAK = gau.MaxAbs2Kernel(queue=self.stream) + MAK.max_abs2(X_dev, out_dev) + + np.testing.assert_allclose(out_dev.get(), out, rtol=1e-6, atol=1e-6, + err_msg="The object norm array has not been updated as expected") + + def test_max_abs2_float_UNITY(self): + np.random.seed(1983) + X = np.random.randint(-1000, 1000, (3,100,200)).astype(np.float32) + + out = np.zeros((1,), dtype=np.float32) + X_dev = gpuarray.to_gpu(X) + out_dev = gpuarray.to_gpu(out) + + out = au.max_abs2(X) + + MAK = gau.MaxAbs2Kernel(queue=self.stream) + MAK.max_abs2(X_dev, out_dev) + + np.testing.assert_allclose(out_dev.get(), out, rtol=1e-6, atol=1e-6, + err_msg="The object norm array has not been updated as expected") diff --git a/test/accelerate_tests/cuda_pycuda_tests/auxiliary_wave_kernel_test.py b/test/accelerate_tests/cuda_pycuda_tests/auxiliary_wave_kernel_test.py index bc38a62b1..71e8e1e7e 100644 --- a/test/accelerate_tests/cuda_pycuda_tests/auxiliary_wave_kernel_test.py +++ b/test/accelerate_tests/cuda_pycuda_tests/auxiliary_wave_kernel_test.py @@ -17,7 +17,7 @@ class AuxiliaryWaveKernelTest(PyCudaTest): - def prepare_arrays(self, performance=False): + def prepare_arrays(self, performance=False, scan_points=None): if not performance: B = 3 # frame size y C = 3 # frame size x @@ -27,8 +27,10 @@ def prepare_arrays(self, performance=False): npts_greater_than = 2 # how many points bigger than the probe the object is. G = 2 # number of object modes - - scan_pts = 2 # one dimensional scan point number + if scan_points is None: + scan_pts = 2 # one dimensional scan point number + else: + scan_pts = scan_points else: B = 128 C = 128 @@ -37,7 +39,10 @@ def prepare_arrays(self, performance=False): F = C npts_greater_than = 1215 G = 4 - scan_pts = 14 + if scan_points is None: + scan_pts = 14 + else: + scan_pts = scan_points H = B + npts_greater_than # object size y I = C + npts_greater_than # object size x @@ -189,6 +194,25 @@ def test_build_aux_same_as_exit_UNITY(self): np.testing.assert_array_equal(auxiliary_wave, auxiliary_wave_dev.get(), err_msg="The gpu auxiliary_wave does not look the same as the numpy version") + def test_build_aux2_same_as_exit_UNITY(self): + ## Arrange + addr, object_array, probe, exit_wave = self.prepare_arrays() + addr_dev, object_array_dev, probe_dev, exit_wave_dev = self.copy_to_gpu(addr, object_array, probe, exit_wave) + auxiliary_wave = np.zeros_like(exit_wave) + auxiliary_wave_dev = gpuarray.zeros_like(exit_wave_dev) + + ## Act + from ptypy.accelerate.base.kernels import AuxiliaryWaveKernel as npAuxiliaryWaveKernel + nAWK = npAuxiliaryWaveKernel() + AWK = AuxiliaryWaveKernel(self.stream) + alpha_set = FLOAT_TYPE(1.0) + + AWK.build_aux2(auxiliary_wave_dev, addr_dev, object_array_dev, probe_dev, exit_wave_dev, alpha=alpha_set) + nAWK.build_aux(auxiliary_wave, addr, object_array, probe, exit_wave, alpha=alpha_set) + + ## Assert + np.testing.assert_array_equal(auxiliary_wave, auxiliary_wave_dev.get(), + err_msg="The gpu auxiliary_wave does not look the same as the numpy version") def test_build_exit_aux_same_as_exit_REGRESSION(self): ## Arrange @@ -413,6 +437,27 @@ def test_build_aux_no_ex_noadd_UNITY(self): np.testing.assert_array_equal(auxiliary_wave_dev.get(), auxiliary_wave, err_msg="The auxiliary_wave does not match numpy") + def test_build_aux2_no_ex_noadd_UNITY(self): + ## Arrange + addr, object_array, probe, exit_wave = self.prepare_arrays() + addr_dev, 
object_array_dev, probe_dev, exit_wave_dev = self.copy_to_gpu(addr, object_array, probe, exit_wave) + auxiliary_wave_dev = gpuarray.zeros_like(exit_wave_dev) + auxiliary_wave = np.zeros_like(exit_wave) + + ## Act + AWK = AuxiliaryWaveKernel(self.stream) + AWK.allocate() + AWK.build_aux2_no_ex(auxiliary_wave_dev, addr_dev, object_array_dev, probe_dev, + fac=1.0, add=False) + from ptypy.accelerate.base.kernels import AuxiliaryWaveKernel as npAuxiliaryWaveKernel + nAWK = npAuxiliaryWaveKernel() + nAWK.allocate() + nAWK.build_aux_no_ex(auxiliary_wave, addr, object_array, probe, fac=1.0, add=False) + + ## Assert + np.testing.assert_array_equal(auxiliary_wave_dev.get(), auxiliary_wave, + err_msg="The auxiliary_wave does not match numpy") + def test_build_aux_no_ex_add_REGRESSION(self): ## Arrange @@ -500,6 +545,27 @@ def test_build_aux_no_ex_add_UNITY(self): np.testing.assert_array_equal(auxiliary_wave_dev.get(), auxiliary_wave, err_msg="The auxiliary_wave does not match numpy") + def test_build_aux2_no_ex_add_UNITY(self): + ## Arrange + addr, object_array, probe, exit_wave = self.prepare_arrays() + addr_dev, object_array_dev, probe_dev, exit_wave_dev = self.copy_to_gpu(addr, object_array, probe, exit_wave) + auxiliary_wave_dev = gpuarray.ones_like(exit_wave_dev) + auxiliary_wave = np.ones_like(exit_wave) + + ## Act + AWK = AuxiliaryWaveKernel(self.stream) + AWK.allocate() + AWK.build_aux2_no_ex(auxiliary_wave_dev, addr_dev, object_array_dev, probe_dev, + fac=2.0, add=True) + from ptypy.accelerate.base.kernels import AuxiliaryWaveKernel as npAuxiliaryWaveKernel + nAWK = npAuxiliaryWaveKernel() + nAWK.allocate() + nAWK.build_aux_no_ex(auxiliary_wave, addr, object_array, probe, fac=2.0, add=True) + + ## Assert + np.testing.assert_array_equal(auxiliary_wave_dev.get(), auxiliary_wave, + err_msg="The auxiliary_wave does not match numpy") + @unittest.skipIf(not perfrun, "performance test") def test_build_aux_no_ex_performance(self): @@ -512,5 +578,89 @@ def test_build_aux_no_ex_performance(self): AWK.build_aux_no_ex(auxiliary_wave, addr, object_array, probe, fac=1.0, add=False) + + def test_build_exit_alpha_tau_REGRESSION(self): + ## Arrange + addr, object_array, probe, exit_wave = self.prepare_arrays(scan_points=1) + addr, object_array, probe, exit_wave = self.copy_to_gpu(addr, object_array, probe, exit_wave) + auxiliary_wave = gpuarray.zeros_like(exit_wave) + + ## Act + AWK = AuxiliaryWaveKernel(self.stream) + AWK.allocate() + AWK.build_exit_alpha_tau(auxiliary_wave, addr, object_array, probe, exit_wave) + + # Assert + expected_auxiliary_wave = np.array( + [[[0. -2.j, 0. -2.j, 0. -2.j], + [0. -2.j, 0. -2.j, 0. -2.j], + [0. -2.j, 0. -2.j, 0. -2.j]], + + [[0. -8.j, 0. -8.j, 0. -8.j], + [0. -8.j, 0. -8.j, 0. -8.j], + [0. -8.j, 0. -8.j, 0. -8.j]], + + [[0. -4.j, 0. -4.j, 0. -4.j], + [0. -4.j, 0. -4.j, 0. -4.j], + [0. -4.j, 0. -4.j, 0. -4.j]], + + [[0.-16.j, 0.-16.j, 0.-16.j], + [0.-16.j, 0.-16.j, 0.-16.j], + [0.-16.j, 0.-16.j, 0.-16.j]]], dtype=np.complex64) + np.testing.assert_allclose(auxiliary_wave.get(), expected_auxiliary_wave, rtol=1e-6, atol=1e-6, + err_msg="The auxiliary_wave has not been updated as expected") + + expected_exit_wave = np.array( + [[[1. -1.j, 1. -1.j, 1. -1.j], + [1. -1.j, 1. -1.j, 1. -1.j], + [1. -1.j, 1. -1.j, 1. -1.j]], + + [[2. -6.j, 2. -6.j, 2. -6.j], + [2. -6.j, 2. -6.j, 2. -6.j], + [2. -6.j, 2. -6.j, 2. -6.j]], + + [[3. -1.j, 3. -1.j, 3. -1.j], + [3. -1.j, 3. -1.j, 3. -1.j], + [3. -1.j, 3. -1.j, 3. 
-1.j]], + + [[4.-12.j, 4.-12.j, 4.-12.j], + [4.-12.j, 4.-12.j, 4.-12.j], + [4.-12.j, 4.-12.j, 4.-12.j]]], dtype=np.complex64) + np.testing.assert_allclose(exit_wave.get(), expected_exit_wave, rtol=1e-6, atol=1e-6, + err_msg="The exit_wave has not been updated as expected") + + def test_build_exit_alpha_tau_UNITY(self): + ## Arrange + addr, object_array, probe, exit_wave = self.prepare_arrays(scan_points=1) + addr_dev, object_array_dev, probe_dev, exit_wave_dev = self.copy_to_gpu(addr, object_array, probe, exit_wave) + auxiliary_wave_dev = gpuarray.ones_like(exit_wave_dev) + auxiliary_wave = np.ones_like(exit_wave) + + ## Act + AWK = AuxiliaryWaveKernel(self.stream) + AWK.allocate() + AWK.build_exit_alpha_tau(auxiliary_wave_dev, addr_dev, object_array_dev, probe_dev, exit_wave_dev, alpha=0.8, tau=0.6) + from ptypy.accelerate.base.kernels import AuxiliaryWaveKernel as npAuxiliaryWaveKernel + nAWK = npAuxiliaryWaveKernel() + nAWK.allocate() + nAWK.build_exit_alpha_tau(auxiliary_wave, addr, object_array, probe, exit_wave, alpha=0.8, tau=0.6) + + ## Assert + np.testing.assert_allclose(auxiliary_wave_dev.get(), auxiliary_wave, rtol=1e-6, atol=1e-6, + err_msg="The auxiliary_wave does not match numpy") + ## Assert + np.testing.assert_allclose(exit_wave_dev.get(), exit_wave, rtol=1e-6, atol=1e-6, + err_msg="The exit_wave does not match numpy") + + @unittest.skipIf(not perfrun, "performance test") + def test_build_exit_alpha_tau_performance(self): + addr, object_array, probe, exit_wave = self.prepare_arrays(performance=True, scan_points=1) + addr, object_array, probe, exit_wave = self.copy_to_gpu(addr, object_array, probe, exit_wave) + auxiliary_wave = gpuarray.zeros_like(exit_wave) + + AWK = AuxiliaryWaveKernel(self.stream) + AWK.allocate() + AWK.build_exit_alpha_tau(auxiliary_wave, addr, object_array, probe, exit_wave, alpha=0.8, tau=0.6) + if __name__ == '__main__': unittest.main() diff --git a/test/accelerate_tests/cuda_pycuda_tests/dls_tests/dls_drpycuda_test.py b/test/accelerate_tests/cuda_pycuda_tests/dls_tests/dls_drpycuda_test.py new file mode 100644 index 000000000..57f62f9dd --- /dev/null +++ b/test/accelerate_tests/cuda_pycuda_tests/dls_tests/dls_drpycuda_test.py @@ -0,0 +1,83 @@ +''' +Testing on real data +''' + +import h5py +import unittest +import numpy as np +from parameterized import parameterized +from .. 
import PyCudaTest, have_pycuda + +if have_pycuda(): + from pycuda import gpuarray + from ptypy.accelerate.cuda_pycuda.kernels import PoUpdateKernel +from ptypy.accelerate.base.kernels import PoUpdateKernel as BasePoUpdateKernel + +COMPLEX_TYPE = np.complex64 +FLOAT_TYPE = np.float32 +INT_TYPE = np.int32 + +class DlsDRpycudaTest(PyCudaTest): + + datadir = "/dls/science/users/iat69393/gpu-hackathon/test-data-dr/" + iter = 0 + rtol = 1e-6 + atol = 1e-6 + + def test_ob_update_local_UNITY(self): + + # Load data + with h5py.File(self.datadir + "ob_update_local_%04d.h5" %self.iter, "r") as f: + aux = f["aux"][:] + addr = f["addr"][:] + ob = f["ob"][:] + pr = f["pr"][:] + ex = f["ex"][:] + + # Copy data to device + aux_dev = gpuarray.to_gpu(aux) + ob_dev = gpuarray.to_gpu(ob) + pr_dev = gpuarray.to_gpu(pr) + ex_dev = gpuarray.to_gpu(ex) + addr_dev = gpuarray.to_gpu(addr) + + # CPU Kernel + BPOK = BasePoUpdateKernel() + BPOK.ob_update_local(addr, ob, pr, ex, aux) + + # GPU Kernel + POK = PoUpdateKernel() + POK.ob_update_local(addr_dev, ob_dev, pr_dev, ex_dev, aux_dev) + + ## Assert + np.testing.assert_allclose(ob_dev.get(), ob, atol=self.atol, rtol=self.rtol, verbose=False, + err_msg="The object array has not been updated as expected") + + def test_pr_update_local_UNITY(self): + + # Load data + with h5py.File(self.datadir + "pr_update_local_%04d.h5" %self.iter, "r") as f: + aux = f["aux"][:] + addr = f["addr"][:] + ob = f["ob"][:] + pr = f["pr"][:] + ex = f["ex"][:] + + # Copy data to device + aux_dev = gpuarray.to_gpu(aux) + ob_dev = gpuarray.to_gpu(ob) + pr_dev = gpuarray.to_gpu(pr) + ex_dev = gpuarray.to_gpu(ex) + addr_dev = gpuarray.to_gpu(addr) + + # CPU Kernel + BPOK = BasePoUpdateKernel() + BPOK.pr_update_local(addr, pr, ob, ex, aux) + + # GPU Kernel + POK = PoUpdateKernel() + POK.pr_update_local(addr_dev, pr_dev, ob_dev, ex_dev, aux_dev) + + ## Assert + np.testing.assert_allclose(pr_dev.get(), pr, atol=self.atol, rtol=self.rtol, verbose=False, + err_msg="The object array has not been updated as expected") diff --git a/test/accelerate_tests/cuda_pycuda_tests/fourier_update_kernel_test.py b/test/accelerate_tests/cuda_pycuda_tests/fourier_update_kernel_test.py index 2650c9ad1..3d7cb5fa6 100644 --- a/test/accelerate_tests/cuda_pycuda_tests/fourier_update_kernel_test.py +++ b/test/accelerate_tests/cuda_pycuda_tests/fourier_update_kernel_test.py @@ -114,6 +114,99 @@ def test_fmag_all_update_UNITY(self): repr(measured_f), repr(mask))) + def test_fmag_update_nopbound_UNITY(self): + ''' + setup + ''' + B = 5 # frame size y + C = 5 # frame size x + + D = 2 # number of probe modes + G = 2 # number og object modes + + E = B # probe size y + F = C # probe size x + + scan_pts = 2 # one dimensional scan point number + + N = scan_pts ** 2 + total_number_modes = G * D + A = N * total_number_modes # this is a 16 point scan pattern (4x4 grid) over all the modes + + f = np.empty(shape=(A, B, C), dtype=COMPLEX_TYPE) + for idx in range(A): + f[idx] = np.ones((B, C)) * (idx + 1) + 1j * np.ones((B, C)) * (idx + 1) + + fmag = np.empty(shape=(N, B, C), dtype=FLOAT_TYPE) # the measured magnitudes NxAxB + fmag_fill = np.arange(np.prod(fmag.shape)).reshape(fmag.shape).astype(fmag.dtype) + fmag[:] = fmag_fill + + mask = np.empty(shape=(N, B, C), dtype=FLOAT_TYPE)# the masks for the measured magnitudes either 1xAxB or NxAxB + mask_fill = np.ones_like(mask) + mask_fill[::2, ::2] = 0 # checkerboard for testing + mask[:] = mask_fill + + X, Y = np.meshgrid(range(scan_pts), range(scan_pts)) + X = X.reshape((N,)) + Y = 
Y.reshape((N,)) + + addr = np.zeros((N, total_number_modes, 5, 3), dtype=INT_TYPE) + + exit_idx = 0 + position_idx = 0 + for xpos, ypos in zip(X, Y): + mode_idx = 0 + for pr_mode in range(D): + for ob_mode in range(G): + addr[position_idx, mode_idx] = np.array([[pr_mode, 0, 0], + [ob_mode, ypos, xpos], + [exit_idx, 0, 0], + [position_idx, 0, 0], + [position_idx, 0, 0]]) + mode_idx += 1 + exit_idx += 1 + position_idx += 1 + + # print("address book is:") + # print(repr(addr)) + + ''' + test + ''' + mask_sum = mask.sum(-1).sum(-1) + + err_fmag = np.zeros(N, dtype=FLOAT_TYPE) + from ptypy.accelerate.base.kernels import FourierUpdateKernel as npFourierUpdateKernel + nFUK = npFourierUpdateKernel(f, nmodes=total_number_modes) + FUK = FourierUpdateKernel(f, nmodes=total_number_modes) + + nFUK.allocate() + FUK.allocate() + + nFUK.fourier_error(f, addr, fmag, mask, mask_sum) + nFUK.error_reduce(addr, err_fmag) + # print(np.sqrt(pbound_set/err_fmag)) + f_d = gpuarray.to_gpu(f) + fmag_d = gpuarray.to_gpu(fmag) + mask_d = gpuarray.to_gpu(mask) + addr_d = gpuarray.to_gpu(addr) + + # now set the state for both. + + FUK.gpu.fdev = gpuarray.to_gpu(nFUK.npy.fdev) + FUK.gpu.ferr = gpuarray.to_gpu(nFUK.npy.ferr) + + FUK.fmag_update_nopbound(f_d, addr_d, fmag_d, mask_d) + nFUK.fmag_update_nopbound(f, addr, fmag, mask) + + expected_f = f + measured_f = f_d.get() + np.testing.assert_allclose(measured_f, expected_f, rtol=1e-6, err_msg="Numpy f " + "is \n%s, \nbut gpu f is \n %s, \n mask is:\n %s \n" % (repr(expected_f), + repr(measured_f), + repr(mask))) + + def test_fourier_error_UNITY(self): ''' setup @@ -203,6 +296,87 @@ def test_fourier_error_UNITY(self): "is \n%s, \nbut gpu ferr is \n %s, \n " % ( repr(expected_ferr), repr(measured_ferr))) + def test_fourier_deviation_UNITY(self): + ''' + setup - using the fourier_error as reference, so we need mask, etc. 
+ ''' + B = 5 # frame size y + C = 5 # frame size x + + D = 2 # number of probe modes + G = 2 # number of object modes + + E = B # probe size y + F = C # probe size x + + scan_pts = 2 # one dimensional scan point number + + N = scan_pts ** 2 + total_number_modes = G * D + A = N * total_number_modes # this is a 16 point scan pattern (4x4 grid) over all the modes + + f = np.empty(shape=(A, B, C), dtype=COMPLEX_TYPE) + for idx in range(A): + f[idx] = np.ones((B, C)) * (idx + 1) + 1j * np.ones((B, C)) * (idx + 1) + + fmag = np.empty(shape=(N, B, C), dtype=FLOAT_TYPE) # the measured magnitudes NxAxB + fmag_fill = np.arange(np.prod(fmag.shape)).reshape(fmag.shape).astype(fmag.dtype) + fmag[:] = fmag_fill + + mask = np.empty(shape=(N, B, C), + dtype=FLOAT_TYPE) # the masks for the measured magnitudes either 1xAxB or NxAxB + mask_fill = np.ones_like(mask) + mask_fill[::2, ::2] = 0 # checkerboard for testing + mask[:] = mask_fill + + X, Y = np.meshgrid(range(scan_pts), range(scan_pts)) + X = X.reshape((N,)) + Y = Y.reshape((N,)) + + addr = np.zeros((N, total_number_modes, 5, 3), dtype=INT_TYPE) + + exit_idx = 0 + position_idx = 0 + for xpos, ypos in zip(X, Y): + mode_idx = 0 + for pr_mode in range(D): + for ob_mode in range(G): + addr[position_idx, mode_idx] = np.array([[pr_mode, 0, 0], + [ob_mode, ypos, xpos], + [exit_idx, 0, 0], + [position_idx, 0, 0], + [position_idx, 0, 0]]) + mode_idx += 1 + exit_idx += 1 + position_idx += 1 + + ''' + test + ''' + mask_sum = mask.sum(-1).sum(-1) + + from ptypy.accelerate.base.kernels import FourierUpdateKernel as npFourierUpdateKernel + f_d = gpuarray.to_gpu(f) + fmag_d = gpuarray.to_gpu(fmag) + addr_d = gpuarray.to_gpu(addr) + + nFUK = npFourierUpdateKernel(f, nmodes=total_number_modes) + FUK = FourierUpdateKernel(f, nmodes=total_number_modes) + + nFUK.allocate() + FUK.allocate() + + nFUK.fourier_deviation(f, addr, fmag) + FUK.fourier_deviation(f_d, addr_d, fmag_d) + + expected_fdev = nFUK.npy.fdev + measured_fdev = FUK.gpu.fdev.get() + np.testing.assert_allclose(measured_fdev, expected_fdev, rtol=1e-6, err_msg="Numpy fdev " + "is \n%s, \nbut gpu fdev is \n %s, \n " % ( + repr(expected_fdev), + repr(measured_fdev))) + + def test_error_reduce_UNITY(self): ''' @@ -348,7 +522,7 @@ def test_error_reduce(self): "is not behaving as expected.") - def test_log_likelihood_UNITY(self): + def log_likelihood_UNITY_tester(self, use_version2=False): ''' setup ''' @@ -420,7 +594,10 @@ def test_log_likelihood_UNITY(self): FUK = FourierUpdateKernel(f, nmodes=total_number_modes) FUK.allocate() - FUK.log_likelihood(f_d, addr_d, fmag_d, mask_d, LLerr_d) + if use_version2: + FUK.log_likelihood2(f_d, addr_d, fmag_d, mask_d, LLerr_d) + else: + FUK.log_likelihood(f_d, addr_d, fmag_d, mask_d, LLerr_d) expected_err_phot = LLerr measured_err_phot = LLerr_d.get() @@ -429,6 +606,11 @@ def test_log_likelihood_UNITY(self): "is \n%s, \nbut gpu log-likelihood error is \n%s, \n " % ( repr(expected_err_phot), repr(measured_err_phot)), rtol=1e-5) + def test_log_likelihood_UNITY(self): + self.log_likelihood_UNITY_tester(False) + + def test_log_likelihood2_UNITY(self): + self.log_likelihood_UNITY_tester(True) def test_exit_error_UNITY(self): ''' diff --git a/test/accelerate_tests/cuda_pycuda_tests/po_update_kernel_test.py b/test/accelerate_tests/cuda_pycuda_tests/po_update_kernel_test.py index 4cd9a8f8c..d626c0ca2 100644 --- a/test/accelerate_tests/cuda_pycuda_tests/po_update_kernel_test.py +++ b/test/accelerate_tests/cuda_pycuda_tests/po_update_kernel_test.py @@ -6,6 +6,8 @@ import unittest 
import numpy as np from . import PyCudaTest, have_pycuda +from ptypy.accelerate.base.array_utils import max_abs2 +from parameterized import parameterized if have_pycuda(): from pycuda import gpuarray @@ -18,7 +20,7 @@ class PoUpdateKernelTest(PyCudaTest): - def prepare_arrays(self): + def prepare_arrays(self, scan_points=None): B = 5 # frame size y C = 5 # frame size x @@ -31,7 +33,10 @@ def prepare_arrays(self): H = B + npts_greater_than # object size y I = C + npts_greater_than # object size x - scan_pts = 2 # one dimensional scan point number + if scan_points is None: + scan_pts = 2 # one dimensional scan point number + else: + scan_pts = scan_points total_number_scan_positions = scan_pts ** 2 total_number_modes = G * D @@ -87,17 +92,12 @@ def prepare_arrays(self): def test_init(self): - POUK = PoUpdateKernel() - - np.testing.assert_equal(POUK.kernels, - ['pr_update', 'ob_update'], + np.testing.assert_equal(POUK.kernels, ['pr_update', 'ob_update'], err_msg='PoUpdateKernel does not have the correct functions registered.') def ob_update_REGRESSION_tester(self, atomics=True): - ''' - setup - ''' + B = 5 # frame size y C = 5 # frame size x @@ -149,7 +149,6 @@ def ob_update_REGRESSION_tester(self, atomics=True): mode_idx += 1 exit_idx += 1 position_idx += 1 - ''' test @@ -650,6 +649,158 @@ def test_ob_update_ML_atomics_REGRESSION(self): def test_ob_update_ML_tiled_REGRESSION(self): self.ob_update_ML_tester(False) + def test_ob_update_local_UNITY(self): + ''' + setup + ''' + B = 5 # frame size y + C = 5 # frame size x + + D = 2 # number of probe modes + E = B # probe size y + F = C # probe size x + + npts_greater_than = 2 # how many points bigger than the probe the object is. + G = 2 # number of object modes + H = B + npts_greater_than # object size y + I = C + npts_greater_than # object size x + + scan_pts = 1 # one dimensional scan point number + + total_number_scan_positions = scan_pts ** 2 + total_number_modes = G * D + A = total_number_scan_positions * total_number_modes # this is a 16 point scan pattern (4x4 grid) over all the modes + + + probe = np.empty(shape=(D, E, F), dtype=COMPLEX_TYPE) + for idx in range(D): + probe[idx] = np.ones((E, F)) * (idx + 1) + 1j * np.ones((E, F)) * (idx + 1) + + object_array = np.empty(shape=(G, H, I), dtype=COMPLEX_TYPE) + for idx in range(G): + object_array[idx] = np.ones((H, I)) * (3 * idx + 1) + 1j * np.ones((H, I)) * (3 * idx + 1) + + exit_wave = np.empty(shape=(A, B, C), dtype=COMPLEX_TYPE) + for idx in range(A): + exit_wave[idx] = np.ones((B, C)) * (idx + 1) + 1j * np.ones((B, C)) * (idx + 1) + auxiliary_wave = exit_wave.copy() * 2 + + X, Y = np.meshgrid(range(scan_pts), range(scan_pts)) + X = X.reshape((total_number_scan_positions)) + Y = Y.reshape((total_number_scan_positions)) + + addr = np.zeros((total_number_scan_positions, total_number_modes, 5, 3), dtype=INT_TYPE) + + exit_idx = 0 + position_idx = 0 + for xpos, ypos in zip(X, Y):# + mode_idx = 0 + for pr_mode in range(D): + for ob_mode in range(G): + addr[position_idx, mode_idx] = np.array([[pr_mode, 0, 0], + [ob_mode, ypos, xpos], + [exit_idx, 0, 0], + [0, 0, 0], + [0, 0, 0]], dtype=INT_TYPE) + mode_idx += 1 + exit_idx += 1 + position_idx += 1 + + ''' + test + ''' + from ptypy.accelerate.base.kernels import PoUpdateKernel as npPoUpdateKernel + nPOUK = npPoUpdateKernel() + POUK = PoUpdateKernel(queue_thread=self.stream) + + object_array_dev = gpuarray.to_gpu(object_array) + probe_dev = gpuarray.to_gpu(probe) + exit_wave_dev = gpuarray.to_gpu(exit_wave) + auxiliary_wave_dev = 
gpuarray.to_gpu(auxiliary_wave) + addr_dev = gpuarray.to_gpu(addr) + + POUK.ob_update_local(addr_dev, object_array_dev, probe_dev, exit_wave_dev, auxiliary_wave_dev) + nPOUK.ob_update_local(addr, object_array, probe, exit_wave, auxiliary_wave) + + np.testing.assert_allclose(object_array_dev.get(), object_array, rtol=1e-6, atol=1e-6, + err_msg="The object array has not been updated as expected") + + def test_pr_update_local_UNITY(self): + ''' + setup + ''' + B = 5 # frame size y + C = 5 # frame size x + + D = 2 # number of probe modes + E = B # probe size y + F = C # probe size x + + npts_greater_than = 2 # how many points bigger than the probe the object is. + G = 2 # number of object modes + H = B + npts_greater_than # object size y + I = C + npts_greater_than # object size x + + scan_pts = 1 # one dimensional scan point number + + total_number_scan_positions = scan_pts ** 2 + total_number_modes = G * D + A = total_number_scan_positions * total_number_modes # this is a 16 point scan pattern (4x4 grid) over all the modes + + + probe = np.empty(shape=(D, E, F), dtype=COMPLEX_TYPE) + for idx in range(D): + probe[idx] = np.ones((E, F)) * (idx + 1) + 1j * np.ones((E, F)) * (idx + 1) + + object_array = np.empty(shape=(G, H, I), dtype=COMPLEX_TYPE) + for idx in range(G): + object_array[idx] = np.ones((H, I)) * (3 * idx + 1) + 1j * np.ones((H, I)) * (3 * idx + 1) + + exit_wave = np.empty(shape=(A, B, C), dtype=COMPLEX_TYPE) + for idx in range(A): + exit_wave[idx] = np.ones((B, C)) * (idx + 1) + 1j * np.ones((B, C)) * (idx + 1) + auxiliary_wave = exit_wave.copy() * 1.5 + + X, Y = np.meshgrid(range(scan_pts), range(scan_pts)) + X = X.reshape((total_number_scan_positions)) + Y = Y.reshape((total_number_scan_positions)) + + addr = np.zeros((total_number_scan_positions, total_number_modes, 5, 3), dtype=INT_TYPE) + + exit_idx = 0 + position_idx = 0 + for xpos, ypos in zip(X, Y):# + mode_idx = 0 + for pr_mode in range(D): + for ob_mode in range(G): + addr[position_idx, mode_idx] = np.array([[pr_mode, 0, 0], + [ob_mode, ypos, xpos], + [exit_idx, 0, 0], + [0, 0, 0], + [0, 0, 0]], dtype=INT_TYPE) + mode_idx += 1 + exit_idx += 1 + position_idx += 1 + + ''' + test + ''' + from ptypy.accelerate.base.kernels import PoUpdateKernel as npPoUpdateKernel + nPOUK = npPoUpdateKernel() + POUK = PoUpdateKernel() + + object_array_dev = gpuarray.to_gpu(object_array) + probe_dev = gpuarray.to_gpu(probe) + exit_wave_dev = gpuarray.to_gpu(exit_wave) + auxiliary_wave_dev = gpuarray.to_gpu(auxiliary_wave) + addr_dev = gpuarray.to_gpu(addr) + + POUK.pr_update_local(addr_dev, probe_dev, object_array_dev,exit_wave_dev, auxiliary_wave_dev) + nPOUK.pr_update_local(addr, probe, object_array, exit_wave, auxiliary_wave) + + np.testing.assert_allclose(probe_dev.get(), probe, rtol=1e-6, atol=1e-6, + err_msg="The probe has not been updated as expected") + if __name__ == '__main__': unittest.main() From 74fd603f26ee8347ec8997b62ea60fde3e1fccbb Mon Sep 17 00:00:00 2001 From: Benedikt Daurer Date: Thu, 18 Mar 2021 17:55:06 +0000 Subject: [PATCH 31/56] Use blockmodel in DR templates --- templates/minimal_prep_and_run_DR_pycuda.py | 2 +- templates/minimal_prep_and_run_DR_serial.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/templates/minimal_prep_and_run_DR_pycuda.py b/templates/minimal_prep_and_run_DR_pycuda.py index 654df60fe..618616320 100644 --- a/templates/minimal_prep_and_run_DR_pycuda.py +++ b/templates/minimal_prep_and_run_DR_pycuda.py @@ -28,7 +28,7 @@ p.scans.MF = u.Param() # now you have to specify which 
ScanModel to use with scans.XX.name, # just as you have to give 'name' for engines and PtyScan subclasses. -p.scans.MF.name = 'Full' +p.scans.MF.name = 'BlockFull' p.scans.MF.data= u.Param() p.scans.MF.data.name = 'MoonFlowerScan' p.scans.MF.data.shape = 128 diff --git a/templates/minimal_prep_and_run_DR_serial.py b/templates/minimal_prep_and_run_DR_serial.py index a9d16eb45..a9c3c04ba 100644 --- a/templates/minimal_prep_and_run_DR_serial.py +++ b/templates/minimal_prep_and_run_DR_serial.py @@ -28,7 +28,7 @@ p.scans.MF = u.Param() # now you have to specify which ScanModel to use with scans.XX.name, # just as you have to give 'name' for engines and PtyScan subclasses. -p.scans.MF.name = 'Full' +p.scans.MF.name = 'BlockFull' p.scans.MF.data= u.Param() p.scans.MF.data.name = 'MoonFlowerScan' p.scans.MF.data.shape = 128 From 45f094334e9ed3a25c5c48d1ed0557a7cc1d6a5a Mon Sep 17 00:00:00 2001 From: Benedikt Daurer Date: Thu, 18 Mar 2021 20:49:03 +0000 Subject: [PATCH 32/56] added benchmark script that fails with DM_pycuda_stream --- .../moonflower_scripts/i14_3.py | 75 +++++++++++++++++++ 1 file changed, 75 insertions(+) create mode 100644 benchmark/diamond_benchmarks/moonflower_scripts/i14_3.py diff --git a/benchmark/diamond_benchmarks/moonflower_scripts/i14_3.py b/benchmark/diamond_benchmarks/moonflower_scripts/i14_3.py new file mode 100644 index 000000000..413fbe446 --- /dev/null +++ b/benchmark/diamond_benchmarks/moonflower_scripts/i14_3.py @@ -0,0 +1,75 @@ +""" +This script is a test for ptychographic reconstruction in the absence +of actual data. It uses the test Scan class +`ptypy.core.data.MoonFlowerScan` to provide "data". +""" + +from ptypy.core import Ptycho +from ptypy import utils as u +import time +from ptypy.accelerate.cuda_pycuda.engines.DM_pycuda_stream import DM_pycuda_stream +from ptypy.accelerate.cuda_pycuda.engines.DM_pycuda_streams import DM_pycuda_streams + +import os +import getpass +from pathlib import Path +username = getpass.getuser() +tmpdir = os.path.join('/dls/tmp', username, 'dumps', 'ptypy') +Path(tmpdir).mkdir(parents=True, exist_ok=True) + +p = u.Param() + +# for verbose output +p.verbose_level = 3 +p.frames_per_block = 100 +# set home path +p.io = u.Param() +p.io.home = tmpdir +p.io.autosave = u.Param(active=False) +p.io.autoplot = u.Param(active=False) +p.io.interaction = u.Param() +p.io.interaction.server = u.Param(active=False) + +# max 200 frames (128x128px) of diffraction data +p.scans = u.Param() +p.scans.i14_2 = u.Param() +# now you have to specify which ScanModel to use with scans.XX.name, +# just as you have to give 'name' for engines and PtyScan subclasses. 
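A note on the model choice here: 'BlockFull' is the block-wise counterpart of the 'Full' model that the DR templates above were switched to. With it the frames are presumably handled in chunks of p.frames_per_block (100 in this script) rather than as one monolithic block, which is what lets the streaming GPU engines keep only a few blocks of exit waves and diffraction data on the device at a time. A small, self-contained sketch of building such a scan tree (all names taken from the templates above) could be:

    from ptypy import utils as u

    def make_scan(use_blocks=True, shape=128, num_frames=200):
        """Hypothetical helper: MoonFlower scan tree for either scan model."""
        scan = u.Param()
        scan.name = 'BlockFull' if use_blocks else 'Full'
        scan.data = u.Param()
        scan.data.name = 'MoonFlowerScan'
        scan.data.shape = shape
        scan.data.num_frames = num_frames
        scan.data.save = None
        return scan
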
+p.scans.i14_2.name = 'BlockFull' # or 'Full' +p.scans.i14_2.data= u.Param() +p.scans.i14_2.data.name = 'MoonFlowerScan' +p.scans.i14_2.data.shape = 512 +p.scans.i14_2.data.num_frames = 4000 #50000 is the real value +p.scans.i14_2.data.save = None + +p.scans.i14_2.illumination = u.Param() +p.scans.i14_2.coherence = u.Param(num_probe_modes=10) +p.scans.i14_2.illumination.diversity = u.Param() +p.scans.i14_2.illumination.diversity.noise = (0.5, 1.0) +p.scans.i14_2.illumination.diversity.power = 0.1 + +# position distance in fraction of illumination frame +p.scans.i14_2.data.density = 0.2 +# total number of photon in empty beam +p.scans.i14_2.data.photons = 1e8 +# Gaussian FWHM of possible detector blurring +p.scans.i14_2.data.psf = 0.4 + +# attach a reconstrucion engine +p.engines = u.Param() +p.engines.engine00 = u.Param() +p.engines.engine00.name = 'DM_pycuda_stream' +p.engines.engine00.numiter = 100 +p.engines.engine00.numiter_contiguous = 20 +p.engines.engine00.probe_update_start = 1 +p.engines.engine00.probe_update_cuda_atomics = False +p.engines.engine00.object_update_cuda_atomics = True + + +# prepare and run +P = Ptycho(p,level=4) +t1 = time.perf_counter() +P.run() +t2 = time.perf_counter() +P.print_stats() +print('Elapsed Compute Time: {} seconds'.format(t2-t1)) From 7c058ab18d098081650ffacef23010c46ccbf435 Mon Sep 17 00:00:00 2001 From: Benedikt Daurer Date: Thu, 18 Mar 2021 21:09:41 +0000 Subject: [PATCH 33/56] update scan name --- .../moonflower_scripts/i14_3.py | 30 +++++++++---------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/benchmark/diamond_benchmarks/moonflower_scripts/i14_3.py b/benchmark/diamond_benchmarks/moonflower_scripts/i14_3.py index 413fbe446..15e1c7513 100644 --- a/benchmark/diamond_benchmarks/moonflower_scripts/i14_3.py +++ b/benchmark/diamond_benchmarks/moonflower_scripts/i14_3.py @@ -32,28 +32,28 @@ # max 200 frames (128x128px) of diffraction data p.scans = u.Param() -p.scans.i14_2 = u.Param() +p.scans.i14_3 = u.Param() # now you have to specify which ScanModel to use with scans.XX.name, # just as you have to give 'name' for engines and PtyScan subclasses. 
-p.scans.i14_2.name = 'BlockFull' # or 'Full' -p.scans.i14_2.data= u.Param() -p.scans.i14_2.data.name = 'MoonFlowerScan' -p.scans.i14_2.data.shape = 512 -p.scans.i14_2.data.num_frames = 4000 #50000 is the real value -p.scans.i14_2.data.save = None +p.scans.i14_3.name = 'BlockFull' # or 'Full' +p.scans.i14_3.data= u.Param() +p.scans.i14_3.data.name = 'MoonFlowerScan' +p.scans.i14_3.data.shape = 512 +p.scans.i14_3.data.num_frames = 4000 #50000 is the real value +p.scans.i14_3.data.save = None -p.scans.i14_2.illumination = u.Param() -p.scans.i14_2.coherence = u.Param(num_probe_modes=10) -p.scans.i14_2.illumination.diversity = u.Param() -p.scans.i14_2.illumination.diversity.noise = (0.5, 1.0) -p.scans.i14_2.illumination.diversity.power = 0.1 +p.scans.i14_3.illumination = u.Param() +p.scans.i14_3.coherence = u.Param(num_probe_modes=10) +p.scans.i14_3.illumination.diversity = u.Param() +p.scans.i14_3.illumination.diversity.noise = (0.5, 1.0) +p.scans.i14_3.illumination.diversity.power = 0.1 # position distance in fraction of illumination frame -p.scans.i14_2.data.density = 0.2 +p.scans.i14_3.data.density = 0.2 # total number of photon in empty beam -p.scans.i14_2.data.photons = 1e8 +p.scans.i14_3.data.photons = 1e8 # Gaussian FWHM of possible detector blurring -p.scans.i14_2.data.psf = 0.4 +p.scans.i14_3.data.psf = 0.4 # attach a reconstrucion engine p.engines = u.Param() From 90ca1854b600e0a9d1b9ed72bb0b9284b2bf3517 Mon Sep 17 00:00:00 2001 From: Bjoern Enders Date: Thu, 18 Mar 2021 19:40:37 -0700 Subject: [PATCH 34/56] Fixed bug in GpuDataManager2 that would overallocate blocks. --- ptypy/accelerate/cuda_pycuda/engines/DM_pycuda_stream.py | 3 ++- ptypy/accelerate/cuda_pycuda/mem_utils.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/ptypy/accelerate/cuda_pycuda/engines/DM_pycuda_stream.py b/ptypy/accelerate/cuda_pycuda/engines/DM_pycuda_stream.py index 820124b5f..9dbb19bf6 100644 --- a/ptypy/accelerate/cuda_pycuda/engines/DM_pycuda_stream.py +++ b/ptypy/accelerate/cuda_pycuda/engines/DM_pycuda_stream.py @@ -69,7 +69,7 @@ def _setup_kernels(self): # TODO grow blocks dynamically nex = min(fit * EX_MA_BLOCKS_RATIO, MAX_BLOCKS) nma = min(fit, MAX_BLOCKS) - + log(3, 'Free memory on device: %.2f GB' % (float(mem)/1e9)) log(3, 'PyCUDA max blocks fitting on GPU: exit arrays={}, ma_arrays={}'.format(nex, nma)) # reset memory or create new self.ex_data = GpuDataManager2(ex_mem, 0, nex, True) @@ -123,6 +123,7 @@ def engine_prepare(self): prep.mag = cuda.pagelocked_empty(mag.shape, mag.dtype, order="C", mem_flags=4) prep.mag[:] = mag + log(3, 'Free memory on device: %.2f GB' % (float(cuda.mem_get_info()[0])/1e9)) self.ex_data.add_data_block() self.ma_data.add_data_block() self.mag_data.add_data_block() diff --git a/ptypy/accelerate/cuda_pycuda/mem_utils.py b/ptypy/accelerate/cuda_pycuda/mem_utils.py index fdded3dfb..2f5917173 100644 --- a/ptypy/accelerate/cuda_pycuda/mem_utils.py +++ b/ptypy/accelerate/cuda_pycuda/mem_utils.py @@ -308,7 +308,7 @@ def add_data_block(self, nbytes=None): Returns ------- """ - if self.max is None or len(self)<=self.max: + if self.max is None or len(self) Date: Mon, 22 Mar 2021 14:45:59 +0000 Subject: [PATCH 35/56] fixing transpose kernel call, as that moved to its own class --- ptypy/accelerate/cuda_pycuda/engines/DM_pycuda_stream.py | 2 +- ptypy/accelerate/cuda_pycuda/engines/DM_pycuda_streams.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/ptypy/accelerate/cuda_pycuda/engines/DM_pycuda_stream.py 
b/ptypy/accelerate/cuda_pycuda/engines/DM_pycuda_stream.py index 9dbb19bf6..3cf58f672 100644 --- a/ptypy/accelerate/cuda_pycuda/engines/DM_pycuda_stream.py +++ b/ptypy/accelerate/cuda_pycuda/engines/DM_pycuda_stream.py @@ -368,7 +368,7 @@ def engine_iterate(self, num=1): if use_tiles: s1 = prep.addr_gpu.shape[0] * prep.addr_gpu.shape[1] s2 = prep.addr_gpu.shape[2] * prep.addr_gpu.shape[3] - AUK.transpose(prep.addr_gpu.reshape(s1, s2), prep.addr2_gpu.reshape(s2, s1)) + kern.TK.transpose(prep.addr_gpu.reshape(s1, s2), prep.addr2_gpu.reshape(s2, s1)) self.curiter += 1 self.queue.synchronize() diff --git a/ptypy/accelerate/cuda_pycuda/engines/DM_pycuda_streams.py b/ptypy/accelerate/cuda_pycuda/engines/DM_pycuda_streams.py index 36aadfe1b..706c03b26 100644 --- a/ptypy/accelerate/cuda_pycuda/engines/DM_pycuda_streams.py +++ b/ptypy/accelerate/cuda_pycuda/engines/DM_pycuda_streams.py @@ -468,7 +468,7 @@ def engine_iterate(self, num=1): if use_tiles: s1 = prep.addr_gpu.shape[0] * prep.addr_gpu.shape[1] s2 = prep.addr_gpu.shape[2] * prep.addr_gpu.shape[3] - AUK.transpose(prep.addr_gpu.reshape(s1, s2), prep.addr2_gpu.reshape(s2, s1)) + kern.TK.transpose(prep.addr_gpu.reshape(s1, s2), prep.addr2_gpu.reshape(s2, s1)) prev_event = streamdata.end_compute() From 6dd41bf642ea622503e24161d4ce1a4e421af607 Mon Sep 17 00:00:00 2001 From: Bjoern Enders Date: Wed, 24 Mar 2021 00:49:11 -1000 Subject: [PATCH 36/56] includes scratchmem sizi in dict key (#308) --- ptypy/accelerate/cuda_pycuda/array_utils.py | 19 +++--- .../minimal_prep_and_run_DR_pycuda_stream.py | 59 +++++++++++++++++++ 2 files changed, 68 insertions(+), 10 deletions(-) create mode 100644 templates/minimal_prep_and_run_DR_pycuda_stream.py diff --git a/ptypy/accelerate/cuda_pycuda/array_utils.py b/ptypy/accelerate/cuda_pycuda/array_utils.py index 3378a0262..e953ed39d 100644 --- a/ptypy/accelerate/cuda_pycuda/array_utils.py +++ b/ptypy/accelerate/cuda_pycuda/array_utils.py @@ -117,9 +117,13 @@ def __init__(self, queue=None): def max_abs2(self, X, out): """ Calculate max(abs(x)**2) across the final 2 dimensions""" + rows = np.int32(X.shape[-2]) + cols = np.int32(X.shape[-1]) + firstdims = np.int32(np.prod(X.shape[:-2])) + gy = int(rows) # lazy-loading, keeping scratch memory and both kernels in the same dictionary bx = int(64) - version = '{},{}'.format(map2ctype(X.dtype), map2ctype(out.dtype)) + version = '{},{},{}'.format(map2ctype(X.dtype), map2ctype(out.dtype), gy) if version not in self.max_abs2_cuda: step1, step2 = load_kernel( ("max_abs2_step1", "max_abs2_step2"), @@ -131,17 +135,12 @@ def max_abs2(self, X, out): self.max_abs2_cuda[version] = { 'step1': step1, 'step2': step2, - 'scratchmem': None + 'scratchmem': gpuarray.empty((gy,), dtype=out.dtype) } - rows = np.int32(X.shape[-2]) - cols = np.int32(X.shape[-1]) - firstdims = np.int32(np.prod(X.shape[:-2])) - gy = int(rows) - - if self.max_abs2_cuda[version]['scratchmem'] is None \ - or self.max_abs2_cuda[version]['scratchmem'].shape[0] != gy: - self.max_abs2_cuda[version]['scratchmem'] = gpuarray.empty((gy,), dtype=out.dtype) + # if self.max_abs2_cuda[version]['scratchmem'] is None \ + # or self.max_abs2_cuda[version]['scratchmem'].shape[0] != gy: + # self.max_abs2_cuda[version]['scratchmem'] = scratch = self.max_abs2_cuda[version]['scratchmem'] diff --git a/templates/minimal_prep_and_run_DR_pycuda_stream.py b/templates/minimal_prep_and_run_DR_pycuda_stream.py new file mode 100644 index 000000000..38c5157a0 --- /dev/null +++ b/templates/minimal_prep_and_run_DR_pycuda_stream.py @@ -0,0 +1,59 
@@ +""" +This script is a test for ptychographic reconstruction in the absence +of actual data. It uses the test Scan class +`ptypy.core.data.MoonFlowerScan` to provide "data". +""" + +from ptypy.core import Ptycho +from ptypy import utils as u +from ptypy.accelerate.cuda_pycuda.engines import DR_pycuda_stream, DR_pycuda +DR_pycuda_stream.MAX_BLOCKS=3 +p = u.Param() + +# for verbose output +p.verbose_level = 3 + +# Frames per block +p.frames_per_block = 20 + +# set home path +p.io = u.Param() +p.io.home = "/tmp/ptypy/" +p.io.autosave = u.Param(active=False) +p.io.interaction = u.Param(active=False) +p.io.interaction.client = u.Param() +p.io.interaction.client.poll_timeout = 1 + +# max 200 frames (128x128px) of diffraction data +p.scans = u.Param() +p.scans.MF = u.Param() +# now you have to specify which ScanModel to use with scans.XX.name, +# just as you have to give 'name' for engines and PtyScan subclasses. +p.scans.MF.name = 'BlockFull' +p.scans.MF.data= u.Param() +p.scans.MF.data.name = 'MoonFlowerScan' +p.scans.MF.data.shape = 384 +p.scans.MF.data.num_frames = 120 +p.scans.MF.data.save = None + +p.scans.MF.illumination = u.Param(diversity=None) +# position distance in fraction of illumination frame +p.scans.MF.data.density = 0.2 +# total number of photon in empty beam +p.scans.MF.data.photons = 1e8 +# Gaussian FWHM of possible detector blurring +p.scans.MF.data.psf = 0.0 +p.scans.MF.coherence = u.Param() +p.scans.MF.coherence.num_probe_modes = 3 + +# attach a reconstrucion engine +p.engines = u.Param() +p.engines.engine00 = u.Param() +p.engines.engine00.name = 'DR_pycuda_stream' +p.engines.engine00.numiter = 20 +p.engines.engine00.numiter_contiguous = 10 +p.engines.engine00.alpha = 0 # alpha=0, tau=1 behaves like ePIE +p.engines.engine00.tau = 1 + +# prepare and run +P = Ptycho(p,level=5) From c8b3f7b56f22c1f066db1b8ce7a04fb0229acad5 Mon Sep 17 00:00:00 2001 From: Jorg Lotze Date: Tue, 30 Mar 2021 10:17:40 +0100 Subject: [PATCH 37/56] updates to imported FFT to compile all supported sizes into the same module --- .../cuda/filtered_fft/filtered_fft.cu | 54 +++++++++++-------- .../cuda/filtered_fft/filtered_fft.h | 1 + .../cuda_pycuda/cuda/filtered_fft/module.cpp | 7 ++- ptypy/accelerate/cuda_pycuda/cufft.py | 8 ++- ptypy/accelerate/cuda_pycuda/import_fft.py | 5 +- .../cuda_pycuda_tests/fft_accuracy_test.py | 4 +- .../fft_tests/fft_accuracy_test.py | 4 +- .../fft_tests/fft_import_fft_test.py | 26 +++++---- 8 files changed, 69 insertions(+), 40 deletions(-) diff --git a/ptypy/accelerate/cuda_pycuda/cuda/filtered_fft/filtered_fft.cu b/ptypy/accelerate/cuda_pycuda/cuda/filtered_fft/filtered_fft.cu index bb152466a..4450cdf7f 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/filtered_fft/filtered_fft.cu +++ b/ptypy/accelerate/cuda_pycuda/cuda/filtered_fft/filtered_fft.cu @@ -29,18 +29,6 @@ #include #include -#ifndef MY_FFT_ROWS -# define MY_FFT_ROWS 128 -# pragma GCC warning "MY_FFT_ROWS not set in preprocessor - defaulting to 128" -#endif - -#ifndef MY_FFT_COLS -# define MY_FFT_COLS 128 -# pragma GCC warning "MY_FFT_COLS not set in preprocessor - defaulting to 128" -#endif - - - template class FilteredFFTImpl : public FilteredFFT { public: @@ -274,9 +262,37 @@ void FilteredFFTImpl::setupPlan() { } } +template +static FilteredFFT* make(int batches, int rows, int cols, complex* prefilt, complex* postfilt, + cudaStream_t stream) +{ + // we only support rows / colums are equal and powers of 2, from 16x16 to 512x512 + if (rows != cols) + throw std::runtime_error("Only equal numbers of rows and 
columns are supported"); + switch (rows) + { + case 16: return new FilteredFFTImpl<16, 16, SYMMETRIC, FORWARD>(batches, prefilt, postfilt, stream); + case 32: return new FilteredFFTImpl<32, 32, SYMMETRIC, FORWARD>(batches, prefilt, postfilt, stream); + case 64: return new FilteredFFTImpl<64, 64, SYMMETRIC, FORWARD>(batches, prefilt, postfilt, stream); + case 128: return new FilteredFFTImpl<128, 128, SYMMETRIC, FORWARD>(batches, prefilt, postfilt, stream); + case 256: return new FilteredFFTImpl<256, 256, SYMMETRIC, FORWARD>(batches, prefilt, postfilt, stream); + case 512: return new FilteredFFTImpl<512, 512, SYMMETRIC, FORWARD>(batches, prefilt, postfilt, stream); + case 1024: return new FilteredFFTImpl<512, 512, SYMMETRIC, FORWARD>(batches, prefilt, postfilt, stream); + case 2048: return new FilteredFFTImpl<512, 512, SYMMETRIC, FORWARD>(batches, prefilt, postfilt, stream); + default: throw std::runtime_error("Only powers of 2 from 16 to 2048 are supported"); + } +} + //////////// Factory Functions for Python -FilteredFFT* make_filtered(int batches, bool symmetricScaling, +// Note: This will instantiate templates for 8 powers of 2, with 4 combinations of forward/reverse, symmetric/not, +// i.e. 32 different FFTs into the binary. Compile time might be quite long, but we intend to do this once +// during installation + +FilteredFFT* make_filtered( + int batches, + int rows, int cols, + bool symmetricScaling, bool isForward, complex* prefilt, complex* postfilt, cudaStream_t stream) @@ -284,21 +300,17 @@ FilteredFFT* make_filtered(int batches, bool symmetricScaling, if (symmetricScaling) { if (isForward) { - return new FilteredFFTImpl(batches, - prefilt, postfilt, stream); + return make(batches, rows, cols, prefilt, postfilt, stream); } else { - return new FilteredFFTImpl(batches, - prefilt, postfilt, stream); + return make(batches, rows, cols, prefilt, postfilt, stream); } } else { if (isForward) { - return new FilteredFFTImpl(batches, - prefilt, postfilt, stream); + return make(batches, rows, cols, prefilt, postfilt, stream); } else { - return new FilteredFFTImpl(batches, - prefilt, postfilt, stream); + return make(batches, rows, cols, prefilt, postfilt, stream); } } diff --git a/ptypy/accelerate/cuda_pycuda/cuda/filtered_fft/filtered_fft.h b/ptypy/accelerate/cuda_pycuda/cuda/filtered_fft/filtered_fft.h index fd153f768..9afa4e119 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/filtered_fft/filtered_fft.h +++ b/ptypy/accelerate/cuda_pycuda/cuda/filtered_fft/filtered_fft.h @@ -23,6 +23,7 @@ class FilteredFFT { // Note that cudaStream_t (runtime API) and CUStream (driver API) are // the same type FilteredFFT* make_filtered(int batches, + int rows, int columns, bool symmetricScaling, bool isForward, complex* prefilt, complex* postfilt, diff --git a/ptypy/accelerate/cuda_pycuda/cuda/filtered_fft/module.cpp b/ptypy/accelerate/cuda_pycuda/cuda/filtered_fft/module.cpp index 186d40cb2..7a8bb54dd 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/filtered_fft/module.cpp +++ b/ptypy/accelerate/cuda_pycuda/cuda/filtered_fft/module.cpp @@ -13,7 +13,7 @@ class FilteredFFTPython { public: - FilteredFFTPython(int batches, bool symmetric, + FilteredFFTPython(int batches, int rows, int columns, bool symmetric, bool is_forward, std::size_t prefilt_ptr, std::size_t postfilt_ptr, @@ -21,6 +21,7 @@ class FilteredFFTPython { fft_ = make_filtered( batches, + rows, columns, symmetric, is_forward, reinterpret_cast*>(prefilt_ptr), @@ -74,8 +75,10 @@ PYBIND11_MODULE(module, m) { m.doc() = "Filtered FFT for PtyPy"; 
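// Note (illustrative comment, added for orientation): with the rows and columns
// arguments introduced in this patch, the Python side constructs the binding
// below as
//   FilteredFFT(batches, rows, columns, symmetricScaling, is_forward,
//               prefilt, postfilt, stream)
// and cufft.py only passes equal power-of-two sizes, which make_filtered()
// then dispatches to one of the pre-instantiated template specializations.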
py::class_(m, "FilteredFFT", py::module_local()) - .def(py::init(), + .def(py::init(), py::arg("batches"), + py::arg("rows"), + py::arg("columns"), py::arg("symmetricScaling"), py::arg("is_forward"), py::arg("prefilt"), diff --git a/ptypy/accelerate/cuda_pycuda/cufft.py b/ptypy/accelerate/cuda_pycuda/cufft.py index 605e90d43..49462c0f8 100644 --- a/ptypy/accelerate/cuda_pycuda/cufft.py +++ b/ptypy/accelerate/cuda_pycuda/cufft.py @@ -17,6 +17,10 @@ def __init__(self, array, queue=None, if dims < 2: raise AssertionError('Input array must be at least 2-dimensional') self.arr_shape = (array.shape[-2], array.shape[-1]) + rows = self.arr_shape[0] + columns = self.arr_shape[1] + if rows != columns or rows not in [16, 32, 64, 128, 256, 512, 1024, 2048]: + raise ValueError("CUDA FFT only supports powers of 2 for rows/columns, from 16 to 2048") self.batches = int(np.product(array.shape[0:dims-2]) if dims > 2 else 1) self.forward = forward @@ -35,9 +39,11 @@ def _load(self, array, pre_fft, post_fft, symmetric, forward): self.post_fft_ptr = 0 from . import import_fft - mod = import_fft.ImportFFT(self.arr_shape[0], self.arr_shape[1]).get_mod() + mod = import_fft.ImportFFT().get_mod() self.fftobj = mod.FilteredFFT( self.batches, + self.arr_shape[0], + self.arr_shape[1], symmetric, forward, self.pre_fft_ptr, diff --git a/ptypy/accelerate/cuda_pycuda/import_fft.py b/ptypy/accelerate/cuda_pycuda/import_fft.py index 6a3d3312e..a5007b68e 100644 --- a/ptypy/accelerate/cuda_pycuda/import_fft.py +++ b/ptypy/accelerate/cuda_pycuda/import_fft.py @@ -126,7 +126,7 @@ def stdchannel_redirected(stdchannel): class ImportFFT: - def __init__(self, rows, columns, build_path=None, quiet=True): + def __init__(self, build_path=None, quiet=True): self.build_path = build_path self.cleanup_build_path = None if self.build_path is None: @@ -138,8 +138,7 @@ def __init__(self, rows, columns, build_path=None, quiet=True): # If we specify the libraries through the extension we soon run into trouble since distutils adds a -l infront of all of these (add_library_option:https://github.com/python/cpython/blob/1c1e68cf3e3a2a19a0edca9a105273e11ddddc6e/Lib/distutils/ccompiler.py#L1115) ext = distutils.extension.Extension(full_module_name, sources=[os.path.join(module_dir, "module.cpp"), - os.path.join(module_dir, "filtered_fft.cu")], - extra_compile_args=["-DMY_FFT_COLS=%s" % str(columns) , "-DMY_FFT_ROWS=%s" % str(rows)]) + os.path.join(module_dir, "filtered_fft.cu")]) script_args = ['build_ext', '--build-temp=%s' % self.build_path, diff --git a/test/accelerate_tests/cuda_pycuda_tests/fft_accuracy_test.py b/test/accelerate_tests/cuda_pycuda_tests/fft_accuracy_test.py index ed6929865..30d76d2cb 100644 --- a/test/accelerate_tests/cuda_pycuda_tests/fft_accuracy_test.py +++ b/test/accelerate_tests/cuda_pycuda_tests/fft_accuracy_test.py @@ -44,5 +44,5 @@ def test_random_cufft_fwd(self): # print('{}: {}\t{}\t{}\t{}'.format(i, cufft_diff, reikna_diff, cufft_rdiff, reikna_rdiff)) # Note: check if this tolerance and test case is ok - np.testing.assert_allclose(y, y_cufft, rtol=5e-5, err_msg='cuFFT error at index {}'.format(i)) - np.testing.assert_allclose(y, y_reikna, rtol=5e-5, err_msg='reikna FFT error at index {}'.format(i)) + np.testing.assert_allclose(y, y_cufft, atol=1e-6, rtol=5e-5, err_msg='cuFFT error at index {}'.format(i)) + np.testing.assert_allclose(y, y_reikna, atol=1e-6, rtol=5e-5, err_msg='reikna FFT error at index {}'.format(i)) diff --git a/test/accelerate_tests/cuda_pycuda_tests/fft_tests/fft_accuracy_test.py 
b/test/accelerate_tests/cuda_pycuda_tests/fft_tests/fft_accuracy_test.py index 9c87e34f2..7c30c3221 100644 --- a/test/accelerate_tests/cuda_pycuda_tests/fft_tests/fft_accuracy_test.py +++ b/test/accelerate_tests/cuda_pycuda_tests/fft_tests/fft_accuracy_test.py @@ -44,5 +44,5 @@ def test_random_cufft_fwd(self): # print('{}: {}\t{}\t{}\t{}'.format(i, cufft_diff, reikna_diff, cufft_rdiff, reikna_rdiff)) # Note: check if this tolerance and test case is ok - np.testing.assert_allclose(y, y_cufft, rtol=5e-5, err_msg='cuFFT error at index {}'.format(i)) - np.testing.assert_allclose(y, y_reikna, rtol=5e-5, err_msg='reikna FFT error at index {}'.format(i)) + np.testing.assert_allclose(y, y_cufft, atol=1e-6, rtol=5e-5, err_msg='cuFFT error at index {}'.format(i)) + np.testing.assert_allclose(y, y_reikna, atol=1e-6, rtol=5e-5, err_msg='reikna FFT error at index {}'.format(i)) diff --git a/test/accelerate_tests/cuda_pycuda_tests/fft_tests/fft_import_fft_test.py b/test/accelerate_tests/cuda_pycuda_tests/fft_tests/fft_import_fft_test.py index 7d60ce46a..62fa7bbbc 100644 --- a/test/accelerate_tests/cuda_pycuda_tests/fft_tests/fft_import_fft_test.py +++ b/test/accelerate_tests/cuda_pycuda_tests/fft_tests/fft_import_fft_test.py @@ -1,26 +1,34 @@ -import unittest, pytest +import unittest from test.accelerate_tests.cuda_pycuda_tests import PyCudaTest, have_pycuda -import os, shutil -from distutils import sysconfig if have_pycuda(): - import pycuda.driver as cuda - from pycuda import gpuarray from ptypy.accelerate.cuda_pycuda import import_fft - from pycuda.tools import make_default_context class ImportFFTTest(PyCudaTest): def test_import_fft(self): - import_fft.ImportFFT(32, 32) + mod = import_fft.ImportFFT().get_mod() + ft = mod.FilteredFFT(2, 32, 32, False, True, 0, 0, 0) def test_import_fft_different_shape(self): - import_fft.ImportFFT(128, 128) + mod = import_fft.ImportFFT(quiet=False).get_mod() + ft = mod.FilteredFFT(2, 128, 128, False, True, 0, 0, 0) def test_import_fft_same_module_again(self): - import_fft.ImportFFT(32, 32) + mod = import_fft.ImportFFT().get_mod() + ft = mod.FilteredFFT(2, 32, 32, False, True, 0, 0, 0) + + @unittest.expectedFailure + def test_import_fft_not_square(self): + mod = import_fft.ImportFFT().get_mod() + ft = mod.FilteredFFT(2, 32, 64, False, True, 0, 0, 0) + + @unittest.expectedFailure + def test_import_fft_not_pow2(self): + mod = import_fft.ImportFFT().get_mod() + ft = mod.FilteredFFT(2, 40, 40, False, True, 0, 0, 0) if __name__=="__main__": From ce89ee71d63846779025d77d9adb7059ea09c697 Mon Sep 17 00:00:00 2001 From: Jorg Lotze Date: Tue, 30 Mar 2021 11:29:18 +0100 Subject: [PATCH 38/56] integrating filtered_cufft in setup.py --- .../cuda_pycuda/cuda/filtered_fft/module.cpp | 2 +- ptypy/accelerate/cuda_pycuda/cufft.py | 5 ++- ptypy/accelerate/cuda_pycuda/import_fft.py | 35 +++++++++++++------ setup.py | 16 +++++++++ 4 files changed, 44 insertions(+), 14 deletions(-) diff --git a/ptypy/accelerate/cuda_pycuda/cuda/filtered_fft/module.cpp b/ptypy/accelerate/cuda_pycuda/cuda/filtered_fft/module.cpp index 7a8bb54dd..3eb0eb37e 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/filtered_fft/module.cpp +++ b/ptypy/accelerate/cuda_pycuda/cuda/filtered_fft/module.cpp @@ -71,7 +71,7 @@ class FilteredFFTPython namespace py = pybind11; -PYBIND11_MODULE(module, m) { +PYBIND11_MODULE(filtered_cufft, m) { m.doc() = "Filtered FFT for PtyPy"; py::class_(m, "FilteredFFT", py::module_local()) diff --git a/ptypy/accelerate/cuda_pycuda/cufft.py b/ptypy/accelerate/cuda_pycuda/cufft.py index 
49462c0f8..686171342 100644 --- a/ptypy/accelerate/cuda_pycuda/cufft.py +++ b/ptypy/accelerate/cuda_pycuda/cufft.py @@ -38,9 +38,8 @@ def _load(self, array, pre_fft, post_fft, symmetric, forward): else: self.post_fft_ptr = 0 - from . import import_fft - mod = import_fft.ImportFFT().get_mod() - self.fftobj = mod.FilteredFFT( + from ptypy import filtered_cufft + self.fftobj = filtered_cufft.FilteredFFT( self.batches, self.arr_shape[0], self.arr_shape[1], diff --git a/ptypy/accelerate/cuda_pycuda/import_fft.py b/ptypy/accelerate/cuda_pycuda/import_fft.py index a5007b68e..63aa2e224 100644 --- a/ptypy/accelerate/cuda_pycuda/import_fft.py +++ b/ptypy/accelerate/cuda_pycuda/import_fft.py @@ -59,8 +59,18 @@ def __init__(self, *args, **kwargs): super(NvccCompiler, self).__init__(*args, **kwargs) self.CUDA = locate_cuda() module_dir = os.path.join(__file__.strip('import_fft.py'), 'cuda', 'filtered_fft') - cmp = cuda_driver.Context.get_device().compute_capability() - archflag = '-arch=sm_{}{}'.format(cmp[0], cmp[1]) + try: + cmp = cuda_driver.Context.get_device().compute_capability() + archflag = '-arch=sm_{}{}'.format(cmp[0], cmp[1]) + except cuda_driver.LogicError: + # by default, compile for all of these + archflag = '-gencode=arch=compute_50,code=sm_50' + \ + ' -gencode=arch=compute_52,code=sm_52' + \ + ' -gencode=arch=compute_60,code=sm_60' + \ + ' -gencode=arch=compute_61,code=sm_61' + \ + ' -gencode=arch=compute_70,code=sm_70' + \ + ' -gencode=arch=compute_75,code=sm_75' + \ + ' -gencode=arch=compute_75,code=compute_75' self.src_extensions.append('.cu') self.LD_FLAGS = [archflag, "-lcufft_static", "-lculibos", "-ldl", "-lrt", "-lpthread", "-cudart shared"] self.NVCC_FLAGS = ["-dc", archflag] @@ -102,13 +112,18 @@ def link(self, target_desc, objects, self.linker_so = default_linker_so class CustomBuildExt(build_ext): - def build_extensions(self): - old_compiler = self.compiler - self.compiler = NvccCompiler(verbose=old_compiler.verbose, - dry_run=old_compiler.dry_run, - force=old_compiler.force) # this is our bespoke compiler - super(CustomBuildExt, self).build_extensions() - self.compiler=old_compiler + + def build_extension(self, ext): + has_cu = any([src.endswith('.cu') for src in ext.sources]) + if has_cu: + old_compiler = self.compiler + self.compiler = NvccCompiler(verbose=old_compiler.verbose, + dry_run=old_compiler.dry_run, + force=old_compiler.force) # this is our bespoke compiler + super(CustomBuildExt, self).build_extension(ext) + self.compiler=old_compiler + else: + super(CustomBuildExt, self).build_extension(ext) @contextlib.contextmanager def stdchannel_redirected(stdchannel): @@ -133,7 +148,7 @@ def __init__(self, build_path=None, quiet=True): self.build_path = tempfile.mkdtemp(prefix="ptypy_fft") self.cleanup_build_path = True - full_module_name = "module" + full_module_name = "filtered_cufft" module_dir = os.path.join(__file__.strip('import_fft.py'), 'cuda', 'filtered_fft') # If we specify the libraries through the extension we soon run into trouble since distutils adds a -l infront of all of these (add_library_option:https://github.com/python/cpython/blob/1c1e68cf3e3a2a19a0edca9a105273e11ddddc6e/Lib/distutils/ccompiler.py#L1115) ext = distutils.extension.Extension(full_module_name, diff --git a/setup.py b/setup.py index 43940038c..6fa37acdc 100644 --- a/setup.py +++ b/setup.py @@ -1,7 +1,10 @@ #!/usr/bin/env python +import distutils +from ptypy.accelerate.cuda_pycuda.import_fft import CustomBuildExt import setuptools #, setuptools.command.build_ext from distutils.core import 
setup +import os CLASSIFIERS = """\ Development Status :: 3 - Alpha @@ -62,6 +65,17 @@ def write_version_py(filename='ptypy/version.py'): except: vers = VERSION +module_dir = os.path.join(__file__.strip('setup.py'), + 'ptypy', 'accelerate', 'cuda_pycuda', 'cuda', 'filtered_fft') + +ext_modules = [ + distutils.core.Extension("ptypy.filtered_cufft", + sources=[os.path.join(module_dir, "module.cpp"), + os.path.join(module_dir, "filtered_fft.cu")] + ) +] +cmdclass = {"build_ext": CustomBuildExt} + exclude_packages = [] package_list = setuptools.find_packages(exclude=exclude_packages) @@ -82,4 +96,6 @@ def write_version_py(filename='ptypy/version.py'): 'scripts/ptypy.new', 'scripts/ptypy.csv2cp', 'scripts/ptypy.run'], + ext_modules=ext_modules, + cmdclass=cmdclass ) From 6c4904b0a81194fc077befd28c472cf92297882a Mon Sep 17 00:00:00 2001 From: Jorg Lotze Date: Tue, 30 Mar 2021 11:48:10 +0100 Subject: [PATCH 39/56] cleanup and re-organising file locations --- setup.py | 14 ++- .../import_fft.py => setupext_nvidia.py | 93 ++----------------- .../cuda_pycuda_tests/fft_accuracy_test.py | 48 ---------- .../fft_tests/cufft_init_test.py | 28 ++++++ .../fft_tests/fft_import_fft_test.py | 35 ------- 5 files changed, 43 insertions(+), 175 deletions(-) rename ptypy/accelerate/cuda_pycuda/import_fft.py => setupext_nvidia.py (56%) delete mode 100644 test/accelerate_tests/cuda_pycuda_tests/fft_accuracy_test.py create mode 100644 test/accelerate_tests/cuda_pycuda_tests/fft_tests/cufft_init_test.py delete mode 100644 test/accelerate_tests/cuda_pycuda_tests/fft_tests/fft_import_fft_test.py diff --git a/setup.py b/setup.py index 6fa37acdc..d1351932e 100644 --- a/setup.py +++ b/setup.py @@ -1,7 +1,7 @@ #!/usr/bin/env python import distutils -from ptypy.accelerate.cuda_pycuda.import_fft import CustomBuildExt +from setupext_nvidia import CustomBuildExt import setuptools #, setuptools.command.build_ext from distutils.core import setup import os @@ -65,13 +65,12 @@ def write_version_py(filename='ptypy/version.py'): except: vers = VERSION -module_dir = os.path.join(__file__.strip('setup.py'), - 'ptypy', 'accelerate', 'cuda_pycuda', 'cuda', 'filtered_fft') - +# filtered Cuda FFT extension module +cufft_dir = os.path.join('ptypy', 'accelerate', 'cuda_pycuda', 'cuda', 'filtered_fft') ext_modules = [ distutils.core.Extension("ptypy.filtered_cufft", - sources=[os.path.join(module_dir, "module.cpp"), - os.path.join(module_dir, "filtered_fft.cu")] + sources=[os.path.join(cufft_dir, "module.cpp"), + os.path.join(cufft_dir, "filtered_fft.cu")] ) ] cmdclass = {"build_ext": CustomBuildExt} @@ -88,8 +87,7 @@ def write_version_py(filename='ptypy/version.py'): package_dir={'ptypy': 'ptypy'}, packages=package_list, package_data={'ptypy': ['resources/*',], - 'ptypy.accelerate.cuda_pycuda.cuda': ['*.cu'], - 'ptypy.accelerate.cuda_pycuda.cuda.filtered_fft': ['*.hpp', '*.cpp', 'Makefile', '*.cu', '*.h']}, + 'ptypy.accelerate.cuda_pycuda.cuda': ['*.cu']}, scripts=['scripts/ptypy.plot', 'scripts/ptypy.inspect', 'scripts/ptypy.plotclient', diff --git a/ptypy/accelerate/cuda_pycuda/import_fft.py b/setupext_nvidia.py similarity index 56% rename from ptypy/accelerate/cuda_pycuda/import_fft.py rename to setupext_nvidia.py index 63aa2e224..c36483e09 100644 --- a/ptypy/accelerate/cuda_pycuda/import_fft.py +++ b/setupext_nvidia.py @@ -1,18 +1,9 @@ ''' -"Just-in-time" compilation for callbacks in cufft. +Compilation tools for Nvidia builds of extension modules. 
''' import os -import sys -import importlib -import tempfile -import setuptools import sysconfig -from pycuda import driver as cuda_driver import pybind11 -import contextlib -from io import StringIO -from ptypy.utils.verbose import log -import distutils from distutils.unixccompiler import UnixCCompiler from distutils.command.build_ext import build_ext @@ -59,18 +50,14 @@ def __init__(self, *args, **kwargs): super(NvccCompiler, self).__init__(*args, **kwargs) self.CUDA = locate_cuda() module_dir = os.path.join(__file__.strip('import_fft.py'), 'cuda', 'filtered_fft') - try: - cmp = cuda_driver.Context.get_device().compute_capability() - archflag = '-arch=sm_{}{}'.format(cmp[0], cmp[1]) - except cuda_driver.LogicError: - # by default, compile for all of these - archflag = '-gencode=arch=compute_50,code=sm_50' + \ - ' -gencode=arch=compute_52,code=sm_52' + \ - ' -gencode=arch=compute_60,code=sm_60' + \ - ' -gencode=arch=compute_61,code=sm_61' + \ - ' -gencode=arch=compute_70,code=sm_70' + \ - ' -gencode=arch=compute_75,code=sm_75' + \ - ' -gencode=arch=compute_75,code=compute_75' + # by default, compile for all of these + archflag = '-gencode=arch=compute_50,code=sm_50' + \ + ' -gencode=arch=compute_52,code=sm_52' + \ + ' -gencode=arch=compute_60,code=sm_60' + \ + ' -gencode=arch=compute_61,code=sm_61' + \ + ' -gencode=arch=compute_70,code=sm_70' + \ + ' -gencode=arch=compute_75,code=sm_75' + \ + ' -gencode=arch=compute_75,code=compute_75' self.src_extensions.append('.cu') self.LD_FLAGS = [archflag, "-lcufft_static", "-lculibos", "-ldl", "-lrt", "-lpthread", "-cudart shared"] self.NVCC_FLAGS = ["-dc", archflag] @@ -125,66 +112,4 @@ def build_extension(self, ext): else: super(CustomBuildExt, self).build_extension(ext) -@contextlib.contextmanager -def stdchannel_redirected(stdchannel): - """ - Redirects stdout or stderr to a StringIO object. As of python 3.4, there is a - standard library contextmanager for this, but backwards compatibility! - """ - old = getattr(sys, stdchannel) - try: - s = StringIO() - setattr(sys, stdchannel, s) - yield s - finally: - setattr(sys, stdchannel, old) - - -class ImportFFT: - def __init__(self, build_path=None, quiet=True): - self.build_path = build_path - self.cleanup_build_path = None - if self.build_path is None: - self.build_path = tempfile.mkdtemp(prefix="ptypy_fft") - self.cleanup_build_path = True - - full_module_name = "filtered_cufft" - module_dir = os.path.join(__file__.strip('import_fft.py'), 'cuda', 'filtered_fft') - # If we specify the libraries through the extension we soon run into trouble since distutils adds a -l infront of all of these (add_library_option:https://github.com/python/cpython/blob/1c1e68cf3e3a2a19a0edca9a105273e11ddddc6e/Lib/distutils/ccompiler.py#L1115) - ext = distutils.extension.Extension(full_module_name, - sources=[os.path.join(module_dir, "module.cpp"), - os.path.join(module_dir, "filtered_fft.cu")]) - - script_args = ['build_ext', - '--build-temp=%s' % self.build_path, - '--build-lib=%s' % self.build_path] - # do I need full_module_name here? 
- setuptools_args = {"name": full_module_name, - "ext_modules": [ext], - "script_args": script_args, - "cmdclass":{"build_ext": CustomBuildExt - }} - - if quiet: - # we really don't care about the make print for almost all cases so we redirect - with stdchannel_redirected("stdout"): - with stdchannel_redirected("stderr"): - setuptools.setup(**setuptools_args) - else: - setuptools.setup(**setuptools_args) - - spec = importlib.util.spec_from_file_location(full_module_name, - os.path.join(self.build_path, - "module" + distutils.sysconfig.get_config_var('EXT_SUFFIX') - ) - ) - self.mod = importlib.util.module_from_spec(spec) - - def get_mod(self): - return self.mod - def __del__(self): - import shutil - if self.cleanup_build_path: - log(5, "cleaning up the build directory") - shutil.rmtree(self.build_path) diff --git a/test/accelerate_tests/cuda_pycuda_tests/fft_accuracy_test.py b/test/accelerate_tests/cuda_pycuda_tests/fft_accuracy_test.py deleted file mode 100644 index 30d76d2cb..000000000 --- a/test/accelerate_tests/cuda_pycuda_tests/fft_accuracy_test.py +++ /dev/null @@ -1,48 +0,0 @@ -''' -''' - -import unittest -import numpy as np -import scipy.fft as fft -from . import PyCudaTest, have_pycuda - - -if have_pycuda(): - from pycuda import gpuarray - from ptypy.accelerate.cuda_pycuda.fft import FFT as ReiknaFFT - from ptypy.accelerate.cuda_pycuda.cufft import FFT_cuda as cuFFT - -class FftAccurracyTest(PyCudaTest): - - def gen_input(self): - rows = cols = 32 - batches = 1 - f = np.random.randn(batches, rows, cols) + 1j * np.random.randn(batches,rows, cols) - f = np.ascontiguousarray(f.astype(np.complex64)) - return f - - def test_random_cufft_fwd(self): - f = self.gen_input() - cuft = cuFFT(f, self.stream, inplace=True, pre_fft=None, post_fft=None, symmetric=None, forward=True).ft - reikft = ReiknaFFT(f, self.stream, inplace=True, pre_fft=None, post_fft=None, symmetric=False).ft - for i in range(10): - f = self.gen_input() - y = fft.fft2(f) - - x_d = gpuarray.to_gpu(f) - cuft(x_d, x_d) - y_cufft = x_d.get().reshape(y.shape) - - x_d = gpuarray.to_gpu(f) - reikft(x_d, x_d) - y_reikna = x_d.get().reshape(y.shape) - - # cufft_diff = np.max(np.abs(y_cufft - y)) - # reikna_diff = np.max(np.abs(y_reikna-y)) - # cufft_rdiff = np.max(np.abs(y_cufft - y) / np.abs(y)) - # reikna_rdiff = np.max(np.abs(y_reikna - y) / np.abs(y)) - # print('{}: {}\t{}\t{}\t{}'.format(i, cufft_diff, reikna_diff, cufft_rdiff, reikna_rdiff)) - - # Note: check if this tolerance and test case is ok - np.testing.assert_allclose(y, y_cufft, atol=1e-6, rtol=5e-5, err_msg='cuFFT error at index {}'.format(i)) - np.testing.assert_allclose(y, y_reikna, atol=1e-6, rtol=5e-5, err_msg='reikna FFT error at index {}'.format(i)) diff --git a/test/accelerate_tests/cuda_pycuda_tests/fft_tests/cufft_init_test.py b/test/accelerate_tests/cuda_pycuda_tests/fft_tests/cufft_init_test.py new file mode 100644 index 000000000..ac28436b4 --- /dev/null +++ b/test/accelerate_tests/cuda_pycuda_tests/fft_tests/cufft_init_test.py @@ -0,0 +1,28 @@ + +import unittest +from test.accelerate_tests.cuda_pycuda_tests import PyCudaTest, have_pycuda + +if have_pycuda(): + from ptypy.filtered_cufft import FilteredFFT + +class CuFFTInitTest(PyCudaTest): + + def test_import_fft(self): + ft = FilteredFFT(2, 32, 32, False, True, 0, 0, 0) + + + def test_import_fft_different_shape(self): + ft = FilteredFFT(2, 128, 128, False, True, 0, 0, 0) + + + @unittest.expectedFailure + def test_import_fft_not_square(self): + ft = FilteredFFT(2, 32, 64, False, True, 0, 0, 0) + + 
@unittest.expectedFailure + def test_import_fft_not_pow2(self): + ft = FilteredFFT(2, 40, 40, False, True, 0, 0, 0) + + +if __name__=="__main__": + unittest.main() diff --git a/test/accelerate_tests/cuda_pycuda_tests/fft_tests/fft_import_fft_test.py b/test/accelerate_tests/cuda_pycuda_tests/fft_tests/fft_import_fft_test.py deleted file mode 100644 index 62fa7bbbc..000000000 --- a/test/accelerate_tests/cuda_pycuda_tests/fft_tests/fft_import_fft_test.py +++ /dev/null @@ -1,35 +0,0 @@ - -import unittest -from test.accelerate_tests.cuda_pycuda_tests import PyCudaTest, have_pycuda - -if have_pycuda(): - from ptypy.accelerate.cuda_pycuda import import_fft - -class ImportFFTTest(PyCudaTest): - - def test_import_fft(self): - mod = import_fft.ImportFFT().get_mod() - ft = mod.FilteredFFT(2, 32, 32, False, True, 0, 0, 0) - - - def test_import_fft_different_shape(self): - mod = import_fft.ImportFFT(quiet=False).get_mod() - ft = mod.FilteredFFT(2, 128, 128, False, True, 0, 0, 0) - - def test_import_fft_same_module_again(self): - mod = import_fft.ImportFFT().get_mod() - ft = mod.FilteredFFT(2, 32, 32, False, True, 0, 0, 0) - - @unittest.expectedFailure - def test_import_fft_not_square(self): - mod = import_fft.ImportFFT().get_mod() - ft = mod.FilteredFFT(2, 32, 64, False, True, 0, 0, 0) - - @unittest.expectedFailure - def test_import_fft_not_pow2(self): - mod = import_fft.ImportFFT().get_mod() - ft = mod.FilteredFFT(2, 40, 40, False, True, 0, 0, 0) - - -if __name__=="__main__": - unittest.main() From 077c8a2fc1b8dd3eb220e18dbf3598fd16e8db70 Mon Sep 17 00:00:00 2001 From: Jorg Lotze Date: Tue, 30 Mar 2021 11:51:17 +0100 Subject: [PATCH 40/56] Revert accidental commit: "cleanup and re-organising file locations" This reverts commit 6c4904b0a81194fc077befd28c472cf92297882a. --- .../accelerate/cuda_pycuda/import_fft.py | 93 +++++++++++++++++-- setup.py | 14 +-- .../cuda_pycuda_tests/fft_accuracy_test.py | 48 ++++++++++ .../fft_tests/cufft_init_test.py | 28 ------ .../fft_tests/fft_import_fft_test.py | 35 +++++++ 5 files changed, 175 insertions(+), 43 deletions(-) rename setupext_nvidia.py => ptypy/accelerate/cuda_pycuda/import_fft.py (56%) create mode 100644 test/accelerate_tests/cuda_pycuda_tests/fft_accuracy_test.py delete mode 100644 test/accelerate_tests/cuda_pycuda_tests/fft_tests/cufft_init_test.py create mode 100644 test/accelerate_tests/cuda_pycuda_tests/fft_tests/fft_import_fft_test.py diff --git a/setupext_nvidia.py b/ptypy/accelerate/cuda_pycuda/import_fft.py similarity index 56% rename from setupext_nvidia.py rename to ptypy/accelerate/cuda_pycuda/import_fft.py index c36483e09..63aa2e224 100644 --- a/setupext_nvidia.py +++ b/ptypy/accelerate/cuda_pycuda/import_fft.py @@ -1,9 +1,18 @@ ''' -Compilation tools for Nvidia builds of extension modules. +"Just-in-time" compilation for callbacks in cufft. 
''' import os +import sys +import importlib +import tempfile +import setuptools import sysconfig +from pycuda import driver as cuda_driver import pybind11 +import contextlib +from io import StringIO +from ptypy.utils.verbose import log +import distutils from distutils.unixccompiler import UnixCCompiler from distutils.command.build_ext import build_ext @@ -50,14 +59,18 @@ def __init__(self, *args, **kwargs): super(NvccCompiler, self).__init__(*args, **kwargs) self.CUDA = locate_cuda() module_dir = os.path.join(__file__.strip('import_fft.py'), 'cuda', 'filtered_fft') - # by default, compile for all of these - archflag = '-gencode=arch=compute_50,code=sm_50' + \ - ' -gencode=arch=compute_52,code=sm_52' + \ - ' -gencode=arch=compute_60,code=sm_60' + \ - ' -gencode=arch=compute_61,code=sm_61' + \ - ' -gencode=arch=compute_70,code=sm_70' + \ - ' -gencode=arch=compute_75,code=sm_75' + \ - ' -gencode=arch=compute_75,code=compute_75' + try: + cmp = cuda_driver.Context.get_device().compute_capability() + archflag = '-arch=sm_{}{}'.format(cmp[0], cmp[1]) + except cuda_driver.LogicError: + # by default, compile for all of these + archflag = '-gencode=arch=compute_50,code=sm_50' + \ + ' -gencode=arch=compute_52,code=sm_52' + \ + ' -gencode=arch=compute_60,code=sm_60' + \ + ' -gencode=arch=compute_61,code=sm_61' + \ + ' -gencode=arch=compute_70,code=sm_70' + \ + ' -gencode=arch=compute_75,code=sm_75' + \ + ' -gencode=arch=compute_75,code=compute_75' self.src_extensions.append('.cu') self.LD_FLAGS = [archflag, "-lcufft_static", "-lculibos", "-ldl", "-lrt", "-lpthread", "-cudart shared"] self.NVCC_FLAGS = ["-dc", archflag] @@ -112,4 +125,66 @@ def build_extension(self, ext): else: super(CustomBuildExt, self).build_extension(ext) +@contextlib.contextmanager +def stdchannel_redirected(stdchannel): + """ + Redirects stdout or stderr to a StringIO object. As of python 3.4, there is a + standard library contextmanager for this, but backwards compatibility! + """ + old = getattr(sys, stdchannel) + try: + s = StringIO() + setattr(sys, stdchannel, s) + yield s + finally: + setattr(sys, stdchannel, old) + + +class ImportFFT: + def __init__(self, build_path=None, quiet=True): + self.build_path = build_path + self.cleanup_build_path = None + if self.build_path is None: + self.build_path = tempfile.mkdtemp(prefix="ptypy_fft") + self.cleanup_build_path = True + + full_module_name = "filtered_cufft" + module_dir = os.path.join(__file__.strip('import_fft.py'), 'cuda', 'filtered_fft') + # If we specify the libraries through the extension we soon run into trouble since distutils adds a -l infront of all of these (add_library_option:https://github.com/python/cpython/blob/1c1e68cf3e3a2a19a0edca9a105273e11ddddc6e/Lib/distutils/ccompiler.py#L1115) + ext = distutils.extension.Extension(full_module_name, + sources=[os.path.join(module_dir, "module.cpp"), + os.path.join(module_dir, "filtered_fft.cu")]) + + script_args = ['build_ext', + '--build-temp=%s' % self.build_path, + '--build-lib=%s' % self.build_path] + # do I need full_module_name here? 
+ setuptools_args = {"name": full_module_name, + "ext_modules": [ext], + "script_args": script_args, + "cmdclass":{"build_ext": CustomBuildExt + }} + + if quiet: + # we really don't care about the make print for almost all cases so we redirect + with stdchannel_redirected("stdout"): + with stdchannel_redirected("stderr"): + setuptools.setup(**setuptools_args) + else: + setuptools.setup(**setuptools_args) + + spec = importlib.util.spec_from_file_location(full_module_name, + os.path.join(self.build_path, + "module" + distutils.sysconfig.get_config_var('EXT_SUFFIX') + ) + ) + self.mod = importlib.util.module_from_spec(spec) + + def get_mod(self): + return self.mod + def __del__(self): + import shutil + if self.cleanup_build_path: + log(5, "cleaning up the build directory") + shutil.rmtree(self.build_path) diff --git a/setup.py b/setup.py index d1351932e..6fa37acdc 100644 --- a/setup.py +++ b/setup.py @@ -1,7 +1,7 @@ #!/usr/bin/env python import distutils -from setupext_nvidia import CustomBuildExt +from ptypy.accelerate.cuda_pycuda.import_fft import CustomBuildExt import setuptools #, setuptools.command.build_ext from distutils.core import setup import os @@ -65,12 +65,13 @@ def write_version_py(filename='ptypy/version.py'): except: vers = VERSION -# filtered Cuda FFT extension module -cufft_dir = os.path.join('ptypy', 'accelerate', 'cuda_pycuda', 'cuda', 'filtered_fft') +module_dir = os.path.join(__file__.strip('setup.py'), + 'ptypy', 'accelerate', 'cuda_pycuda', 'cuda', 'filtered_fft') + ext_modules = [ distutils.core.Extension("ptypy.filtered_cufft", - sources=[os.path.join(cufft_dir, "module.cpp"), - os.path.join(cufft_dir, "filtered_fft.cu")] + sources=[os.path.join(module_dir, "module.cpp"), + os.path.join(module_dir, "filtered_fft.cu")] ) ] cmdclass = {"build_ext": CustomBuildExt} @@ -87,7 +88,8 @@ def write_version_py(filename='ptypy/version.py'): package_dir={'ptypy': 'ptypy'}, packages=package_list, package_data={'ptypy': ['resources/*',], - 'ptypy.accelerate.cuda_pycuda.cuda': ['*.cu']}, + 'ptypy.accelerate.cuda_pycuda.cuda': ['*.cu'], + 'ptypy.accelerate.cuda_pycuda.cuda.filtered_fft': ['*.hpp', '*.cpp', 'Makefile', '*.cu', '*.h']}, scripts=['scripts/ptypy.plot', 'scripts/ptypy.inspect', 'scripts/ptypy.plotclient', diff --git a/test/accelerate_tests/cuda_pycuda_tests/fft_accuracy_test.py b/test/accelerate_tests/cuda_pycuda_tests/fft_accuracy_test.py new file mode 100644 index 000000000..30d76d2cb --- /dev/null +++ b/test/accelerate_tests/cuda_pycuda_tests/fft_accuracy_test.py @@ -0,0 +1,48 @@ +''' +''' + +import unittest +import numpy as np +import scipy.fft as fft +from . 
import PyCudaTest, have_pycuda + + +if have_pycuda(): + from pycuda import gpuarray + from ptypy.accelerate.cuda_pycuda.fft import FFT as ReiknaFFT + from ptypy.accelerate.cuda_pycuda.cufft import FFT_cuda as cuFFT + +class FftAccurracyTest(PyCudaTest): + + def gen_input(self): + rows = cols = 32 + batches = 1 + f = np.random.randn(batches, rows, cols) + 1j * np.random.randn(batches,rows, cols) + f = np.ascontiguousarray(f.astype(np.complex64)) + return f + + def test_random_cufft_fwd(self): + f = self.gen_input() + cuft = cuFFT(f, self.stream, inplace=True, pre_fft=None, post_fft=None, symmetric=None, forward=True).ft + reikft = ReiknaFFT(f, self.stream, inplace=True, pre_fft=None, post_fft=None, symmetric=False).ft + for i in range(10): + f = self.gen_input() + y = fft.fft2(f) + + x_d = gpuarray.to_gpu(f) + cuft(x_d, x_d) + y_cufft = x_d.get().reshape(y.shape) + + x_d = gpuarray.to_gpu(f) + reikft(x_d, x_d) + y_reikna = x_d.get().reshape(y.shape) + + # cufft_diff = np.max(np.abs(y_cufft - y)) + # reikna_diff = np.max(np.abs(y_reikna-y)) + # cufft_rdiff = np.max(np.abs(y_cufft - y) / np.abs(y)) + # reikna_rdiff = np.max(np.abs(y_reikna - y) / np.abs(y)) + # print('{}: {}\t{}\t{}\t{}'.format(i, cufft_diff, reikna_diff, cufft_rdiff, reikna_rdiff)) + + # Note: check if this tolerance and test case is ok + np.testing.assert_allclose(y, y_cufft, atol=1e-6, rtol=5e-5, err_msg='cuFFT error at index {}'.format(i)) + np.testing.assert_allclose(y, y_reikna, atol=1e-6, rtol=5e-5, err_msg='reikna FFT error at index {}'.format(i)) diff --git a/test/accelerate_tests/cuda_pycuda_tests/fft_tests/cufft_init_test.py b/test/accelerate_tests/cuda_pycuda_tests/fft_tests/cufft_init_test.py deleted file mode 100644 index ac28436b4..000000000 --- a/test/accelerate_tests/cuda_pycuda_tests/fft_tests/cufft_init_test.py +++ /dev/null @@ -1,28 +0,0 @@ - -import unittest -from test.accelerate_tests.cuda_pycuda_tests import PyCudaTest, have_pycuda - -if have_pycuda(): - from ptypy.filtered_cufft import FilteredFFT - -class CuFFTInitTest(PyCudaTest): - - def test_import_fft(self): - ft = FilteredFFT(2, 32, 32, False, True, 0, 0, 0) - - - def test_import_fft_different_shape(self): - ft = FilteredFFT(2, 128, 128, False, True, 0, 0, 0) - - - @unittest.expectedFailure - def test_import_fft_not_square(self): - ft = FilteredFFT(2, 32, 64, False, True, 0, 0, 0) - - @unittest.expectedFailure - def test_import_fft_not_pow2(self): - ft = FilteredFFT(2, 40, 40, False, True, 0, 0, 0) - - -if __name__=="__main__": - unittest.main() diff --git a/test/accelerate_tests/cuda_pycuda_tests/fft_tests/fft_import_fft_test.py b/test/accelerate_tests/cuda_pycuda_tests/fft_tests/fft_import_fft_test.py new file mode 100644 index 000000000..62fa7bbbc --- /dev/null +++ b/test/accelerate_tests/cuda_pycuda_tests/fft_tests/fft_import_fft_test.py @@ -0,0 +1,35 @@ + +import unittest +from test.accelerate_tests.cuda_pycuda_tests import PyCudaTest, have_pycuda + +if have_pycuda(): + from ptypy.accelerate.cuda_pycuda import import_fft + +class ImportFFTTest(PyCudaTest): + + def test_import_fft(self): + mod = import_fft.ImportFFT().get_mod() + ft = mod.FilteredFFT(2, 32, 32, False, True, 0, 0, 0) + + + def test_import_fft_different_shape(self): + mod = import_fft.ImportFFT(quiet=False).get_mod() + ft = mod.FilteredFFT(2, 128, 128, False, True, 0, 0, 0) + + def test_import_fft_same_module_again(self): + mod = import_fft.ImportFFT().get_mod() + ft = mod.FilteredFFT(2, 32, 32, False, True, 0, 0, 0) + + @unittest.expectedFailure + def 
test_import_fft_not_square(self): + mod = import_fft.ImportFFT().get_mod() + ft = mod.FilteredFFT(2, 32, 64, False, True, 0, 0, 0) + + @unittest.expectedFailure + def test_import_fft_not_pow2(self): + mod = import_fft.ImportFFT().get_mod() + ft = mod.FilteredFFT(2, 40, 40, False, True, 0, 0, 0) + + +if __name__=="__main__": + unittest.main() From e10d3b534459c876d2f3d539b622fe0bb7f39700 Mon Sep 17 00:00:00 2001 From: Jorg Lotze Date: Tue, 30 Mar 2021 11:52:02 +0100 Subject: [PATCH 41/56] Revert accidental commit: "integrating filtered_cufft in setup.py" This reverts commit ce89ee71d63846779025d77d9adb7059ea09c697. --- .../cuda_pycuda/cuda/filtered_fft/module.cpp | 2 +- ptypy/accelerate/cuda_pycuda/cufft.py | 5 +-- ptypy/accelerate/cuda_pycuda/import_fft.py | 35 ++++++------------- setup.py | 16 --------- 4 files changed, 14 insertions(+), 44 deletions(-) diff --git a/ptypy/accelerate/cuda_pycuda/cuda/filtered_fft/module.cpp b/ptypy/accelerate/cuda_pycuda/cuda/filtered_fft/module.cpp index 3eb0eb37e..7a8bb54dd 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/filtered_fft/module.cpp +++ b/ptypy/accelerate/cuda_pycuda/cuda/filtered_fft/module.cpp @@ -71,7 +71,7 @@ class FilteredFFTPython namespace py = pybind11; -PYBIND11_MODULE(filtered_cufft, m) { +PYBIND11_MODULE(module, m) { m.doc() = "Filtered FFT for PtyPy"; py::class_(m, "FilteredFFT", py::module_local()) diff --git a/ptypy/accelerate/cuda_pycuda/cufft.py b/ptypy/accelerate/cuda_pycuda/cufft.py index 686171342..49462c0f8 100644 --- a/ptypy/accelerate/cuda_pycuda/cufft.py +++ b/ptypy/accelerate/cuda_pycuda/cufft.py @@ -38,8 +38,9 @@ def _load(self, array, pre_fft, post_fft, symmetric, forward): else: self.post_fft_ptr = 0 - from ptypy import filtered_cufft - self.fftobj = filtered_cufft.FilteredFFT( + from . 
import import_fft + mod = import_fft.ImportFFT().get_mod() + self.fftobj = mod.FilteredFFT( self.batches, self.arr_shape[0], self.arr_shape[1], diff --git a/ptypy/accelerate/cuda_pycuda/import_fft.py b/ptypy/accelerate/cuda_pycuda/import_fft.py index 63aa2e224..a5007b68e 100644 --- a/ptypy/accelerate/cuda_pycuda/import_fft.py +++ b/ptypy/accelerate/cuda_pycuda/import_fft.py @@ -59,18 +59,8 @@ def __init__(self, *args, **kwargs): super(NvccCompiler, self).__init__(*args, **kwargs) self.CUDA = locate_cuda() module_dir = os.path.join(__file__.strip('import_fft.py'), 'cuda', 'filtered_fft') - try: - cmp = cuda_driver.Context.get_device().compute_capability() - archflag = '-arch=sm_{}{}'.format(cmp[0], cmp[1]) - except cuda_driver.LogicError: - # by default, compile for all of these - archflag = '-gencode=arch=compute_50,code=sm_50' + \ - ' -gencode=arch=compute_52,code=sm_52' + \ - ' -gencode=arch=compute_60,code=sm_60' + \ - ' -gencode=arch=compute_61,code=sm_61' + \ - ' -gencode=arch=compute_70,code=sm_70' + \ - ' -gencode=arch=compute_75,code=sm_75' + \ - ' -gencode=arch=compute_75,code=compute_75' + cmp = cuda_driver.Context.get_device().compute_capability() + archflag = '-arch=sm_{}{}'.format(cmp[0], cmp[1]) self.src_extensions.append('.cu') self.LD_FLAGS = [archflag, "-lcufft_static", "-lculibos", "-ldl", "-lrt", "-lpthread", "-cudart shared"] self.NVCC_FLAGS = ["-dc", archflag] @@ -112,18 +102,13 @@ def link(self, target_desc, objects, self.linker_so = default_linker_so class CustomBuildExt(build_ext): - - def build_extension(self, ext): - has_cu = any([src.endswith('.cu') for src in ext.sources]) - if has_cu: - old_compiler = self.compiler - self.compiler = NvccCompiler(verbose=old_compiler.verbose, - dry_run=old_compiler.dry_run, - force=old_compiler.force) # this is our bespoke compiler - super(CustomBuildExt, self).build_extension(ext) - self.compiler=old_compiler - else: - super(CustomBuildExt, self).build_extension(ext) + def build_extensions(self): + old_compiler = self.compiler + self.compiler = NvccCompiler(verbose=old_compiler.verbose, + dry_run=old_compiler.dry_run, + force=old_compiler.force) # this is our bespoke compiler + super(CustomBuildExt, self).build_extensions() + self.compiler=old_compiler @contextlib.contextmanager def stdchannel_redirected(stdchannel): @@ -148,7 +133,7 @@ def __init__(self, build_path=None, quiet=True): self.build_path = tempfile.mkdtemp(prefix="ptypy_fft") self.cleanup_build_path = True - full_module_name = "filtered_cufft" + full_module_name = "module" module_dir = os.path.join(__file__.strip('import_fft.py'), 'cuda', 'filtered_fft') # If we specify the libraries through the extension we soon run into trouble since distutils adds a -l infront of all of these (add_library_option:https://github.com/python/cpython/blob/1c1e68cf3e3a2a19a0edca9a105273e11ddddc6e/Lib/distutils/ccompiler.py#L1115) ext = distutils.extension.Extension(full_module_name, diff --git a/setup.py b/setup.py index 6fa37acdc..43940038c 100644 --- a/setup.py +++ b/setup.py @@ -1,10 +1,7 @@ #!/usr/bin/env python -import distutils -from ptypy.accelerate.cuda_pycuda.import_fft import CustomBuildExt import setuptools #, setuptools.command.build_ext from distutils.core import setup -import os CLASSIFIERS = """\ Development Status :: 3 - Alpha @@ -65,17 +62,6 @@ def write_version_py(filename='ptypy/version.py'): except: vers = VERSION -module_dir = os.path.join(__file__.strip('setup.py'), - 'ptypy', 'accelerate', 'cuda_pycuda', 'cuda', 'filtered_fft') - -ext_modules = [ - 
distutils.core.Extension("ptypy.filtered_cufft", - sources=[os.path.join(module_dir, "module.cpp"), - os.path.join(module_dir, "filtered_fft.cu")] - ) -] -cmdclass = {"build_ext": CustomBuildExt} - exclude_packages = [] package_list = setuptools.find_packages(exclude=exclude_packages) @@ -96,6 +82,4 @@ def write_version_py(filename='ptypy/version.py'): 'scripts/ptypy.new', 'scripts/ptypy.csv2cp', 'scripts/ptypy.run'], - ext_modules=ext_modules, - cmdclass=cmdclass ) From 59a5f9c2b7171fc754cfb557f1029b9747c043e4 Mon Sep 17 00:00:00 2001 From: Jorg Lotze Date: Tue, 30 Mar 2021 11:52:33 +0100 Subject: [PATCH 42/56] Revert acidental commit: "updates to imported FFT to compile all supported sizes into the same module" This reverts commit c8b3f7b56f22c1f066db1b8ce7a04fb0229acad5. --- .../cuda/filtered_fft/filtered_fft.cu | 54 ++++++++----------- .../cuda/filtered_fft/filtered_fft.h | 1 - .../cuda_pycuda/cuda/filtered_fft/module.cpp | 7 +-- ptypy/accelerate/cuda_pycuda/cufft.py | 8 +-- ptypy/accelerate/cuda_pycuda/import_fft.py | 5 +- .../cuda_pycuda_tests/fft_accuracy_test.py | 4 +- .../fft_tests/fft_accuracy_test.py | 4 +- .../fft_tests/fft_import_fft_test.py | 26 ++++----- 8 files changed, 40 insertions(+), 69 deletions(-) diff --git a/ptypy/accelerate/cuda_pycuda/cuda/filtered_fft/filtered_fft.cu b/ptypy/accelerate/cuda_pycuda/cuda/filtered_fft/filtered_fft.cu index 4450cdf7f..bb152466a 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/filtered_fft/filtered_fft.cu +++ b/ptypy/accelerate/cuda_pycuda/cuda/filtered_fft/filtered_fft.cu @@ -29,6 +29,18 @@ #include #include +#ifndef MY_FFT_ROWS +# define MY_FFT_ROWS 128 +# pragma GCC warning "MY_FFT_ROWS not set in preprocessor - defaulting to 128" +#endif + +#ifndef MY_FFT_COLS +# define MY_FFT_COLS 128 +# pragma GCC warning "MY_FFT_COLS not set in preprocessor - defaulting to 128" +#endif + + + template class FilteredFFTImpl : public FilteredFFT { public: @@ -262,37 +274,9 @@ void FilteredFFTImpl::setupPlan() { } } -template -static FilteredFFT* make(int batches, int rows, int cols, complex* prefilt, complex* postfilt, - cudaStream_t stream) -{ - // we only support rows / colums are equal and powers of 2, from 16x16 to 512x512 - if (rows != cols) - throw std::runtime_error("Only equal numbers of rows and columns are supported"); - switch (rows) - { - case 16: return new FilteredFFTImpl<16, 16, SYMMETRIC, FORWARD>(batches, prefilt, postfilt, stream); - case 32: return new FilteredFFTImpl<32, 32, SYMMETRIC, FORWARD>(batches, prefilt, postfilt, stream); - case 64: return new FilteredFFTImpl<64, 64, SYMMETRIC, FORWARD>(batches, prefilt, postfilt, stream); - case 128: return new FilteredFFTImpl<128, 128, SYMMETRIC, FORWARD>(batches, prefilt, postfilt, stream); - case 256: return new FilteredFFTImpl<256, 256, SYMMETRIC, FORWARD>(batches, prefilt, postfilt, stream); - case 512: return new FilteredFFTImpl<512, 512, SYMMETRIC, FORWARD>(batches, prefilt, postfilt, stream); - case 1024: return new FilteredFFTImpl<512, 512, SYMMETRIC, FORWARD>(batches, prefilt, postfilt, stream); - case 2048: return new FilteredFFTImpl<512, 512, SYMMETRIC, FORWARD>(batches, prefilt, postfilt, stream); - default: throw std::runtime_error("Only powers of 2 from 16 to 2048 are supported"); - } -} - //////////// Factory Functions for Python -// Note: This will instantiate templates for 8 powers of 2, with 4 combinations of forward/reverse, symmetric/not, -// i.e. 32 different FFTs into the binary. 
Compile time might be quite long, but we intend to do this once -// during installation - -FilteredFFT* make_filtered( - int batches, - int rows, int cols, - bool symmetricScaling, +FilteredFFT* make_filtered(int batches, bool symmetricScaling, bool isForward, complex* prefilt, complex* postfilt, cudaStream_t stream) @@ -300,17 +284,21 @@ FilteredFFT* make_filtered( if (symmetricScaling) { if (isForward) { - return make(batches, rows, cols, prefilt, postfilt, stream); + return new FilteredFFTImpl(batches, + prefilt, postfilt, stream); } else { - return make(batches, rows, cols, prefilt, postfilt, stream); + return new FilteredFFTImpl(batches, + prefilt, postfilt, stream); } } else { if (isForward) { - return make(batches, rows, cols, prefilt, postfilt, stream); + return new FilteredFFTImpl(batches, + prefilt, postfilt, stream); } else { - return make(batches, rows, cols, prefilt, postfilt, stream); + return new FilteredFFTImpl(batches, + prefilt, postfilt, stream); } } diff --git a/ptypy/accelerate/cuda_pycuda/cuda/filtered_fft/filtered_fft.h b/ptypy/accelerate/cuda_pycuda/cuda/filtered_fft/filtered_fft.h index 9afa4e119..fd153f768 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/filtered_fft/filtered_fft.h +++ b/ptypy/accelerate/cuda_pycuda/cuda/filtered_fft/filtered_fft.h @@ -23,7 +23,6 @@ class FilteredFFT { // Note that cudaStream_t (runtime API) and CUStream (driver API) are // the same type FilteredFFT* make_filtered(int batches, - int rows, int columns, bool symmetricScaling, bool isForward, complex* prefilt, complex* postfilt, diff --git a/ptypy/accelerate/cuda_pycuda/cuda/filtered_fft/module.cpp b/ptypy/accelerate/cuda_pycuda/cuda/filtered_fft/module.cpp index 7a8bb54dd..186d40cb2 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/filtered_fft/module.cpp +++ b/ptypy/accelerate/cuda_pycuda/cuda/filtered_fft/module.cpp @@ -13,7 +13,7 @@ class FilteredFFTPython { public: - FilteredFFTPython(int batches, int rows, int columns, bool symmetric, + FilteredFFTPython(int batches, bool symmetric, bool is_forward, std::size_t prefilt_ptr, std::size_t postfilt_ptr, @@ -21,7 +21,6 @@ class FilteredFFTPython { fft_ = make_filtered( batches, - rows, columns, symmetric, is_forward, reinterpret_cast*>(prefilt_ptr), @@ -75,10 +74,8 @@ PYBIND11_MODULE(module, m) { m.doc() = "Filtered FFT for PtyPy"; py::class_(m, "FilteredFFT", py::module_local()) - .def(py::init(), + .def(py::init(), py::arg("batches"), - py::arg("rows"), - py::arg("columns"), py::arg("symmetricScaling"), py::arg("is_forward"), py::arg("prefilt"), diff --git a/ptypy/accelerate/cuda_pycuda/cufft.py b/ptypy/accelerate/cuda_pycuda/cufft.py index 49462c0f8..605e90d43 100644 --- a/ptypy/accelerate/cuda_pycuda/cufft.py +++ b/ptypy/accelerate/cuda_pycuda/cufft.py @@ -17,10 +17,6 @@ def __init__(self, array, queue=None, if dims < 2: raise AssertionError('Input array must be at least 2-dimensional') self.arr_shape = (array.shape[-2], array.shape[-1]) - rows = self.arr_shape[0] - columns = self.arr_shape[1] - if rows != columns or rows not in [16, 32, 64, 128, 256, 512, 1024, 2048]: - raise ValueError("CUDA FFT only supports powers of 2 for rows/columns, from 16 to 2048") self.batches = int(np.product(array.shape[0:dims-2]) if dims > 2 else 1) self.forward = forward @@ -39,11 +35,9 @@ def _load(self, array, pre_fft, post_fft, symmetric, forward): self.post_fft_ptr = 0 from . 
import import_fft - mod = import_fft.ImportFFT().get_mod() + mod = import_fft.ImportFFT(self.arr_shape[0], self.arr_shape[1]).get_mod() self.fftobj = mod.FilteredFFT( self.batches, - self.arr_shape[0], - self.arr_shape[1], symmetric, forward, self.pre_fft_ptr, diff --git a/ptypy/accelerate/cuda_pycuda/import_fft.py b/ptypy/accelerate/cuda_pycuda/import_fft.py index a5007b68e..6a3d3312e 100644 --- a/ptypy/accelerate/cuda_pycuda/import_fft.py +++ b/ptypy/accelerate/cuda_pycuda/import_fft.py @@ -126,7 +126,7 @@ def stdchannel_redirected(stdchannel): class ImportFFT: - def __init__(self, build_path=None, quiet=True): + def __init__(self, rows, columns, build_path=None, quiet=True): self.build_path = build_path self.cleanup_build_path = None if self.build_path is None: @@ -138,7 +138,8 @@ def __init__(self, build_path=None, quiet=True): # If we specify the libraries through the extension we soon run into trouble since distutils adds a -l infront of all of these (add_library_option:https://github.com/python/cpython/blob/1c1e68cf3e3a2a19a0edca9a105273e11ddddc6e/Lib/distutils/ccompiler.py#L1115) ext = distutils.extension.Extension(full_module_name, sources=[os.path.join(module_dir, "module.cpp"), - os.path.join(module_dir, "filtered_fft.cu")]) + os.path.join(module_dir, "filtered_fft.cu")], + extra_compile_args=["-DMY_FFT_COLS=%s" % str(columns) , "-DMY_FFT_ROWS=%s" % str(rows)]) script_args = ['build_ext', '--build-temp=%s' % self.build_path, diff --git a/test/accelerate_tests/cuda_pycuda_tests/fft_accuracy_test.py b/test/accelerate_tests/cuda_pycuda_tests/fft_accuracy_test.py index 30d76d2cb..ed6929865 100644 --- a/test/accelerate_tests/cuda_pycuda_tests/fft_accuracy_test.py +++ b/test/accelerate_tests/cuda_pycuda_tests/fft_accuracy_test.py @@ -44,5 +44,5 @@ def test_random_cufft_fwd(self): # print('{}: {}\t{}\t{}\t{}'.format(i, cufft_diff, reikna_diff, cufft_rdiff, reikna_rdiff)) # Note: check if this tolerance and test case is ok - np.testing.assert_allclose(y, y_cufft, atol=1e-6, rtol=5e-5, err_msg='cuFFT error at index {}'.format(i)) - np.testing.assert_allclose(y, y_reikna, atol=1e-6, rtol=5e-5, err_msg='reikna FFT error at index {}'.format(i)) + np.testing.assert_allclose(y, y_cufft, rtol=5e-5, err_msg='cuFFT error at index {}'.format(i)) + np.testing.assert_allclose(y, y_reikna, rtol=5e-5, err_msg='reikna FFT error at index {}'.format(i)) diff --git a/test/accelerate_tests/cuda_pycuda_tests/fft_tests/fft_accuracy_test.py b/test/accelerate_tests/cuda_pycuda_tests/fft_tests/fft_accuracy_test.py index 7c30c3221..9c87e34f2 100644 --- a/test/accelerate_tests/cuda_pycuda_tests/fft_tests/fft_accuracy_test.py +++ b/test/accelerate_tests/cuda_pycuda_tests/fft_tests/fft_accuracy_test.py @@ -44,5 +44,5 @@ def test_random_cufft_fwd(self): # print('{}: {}\t{}\t{}\t{}'.format(i, cufft_diff, reikna_diff, cufft_rdiff, reikna_rdiff)) # Note: check if this tolerance and test case is ok - np.testing.assert_allclose(y, y_cufft, atol=1e-6, rtol=5e-5, err_msg='cuFFT error at index {}'.format(i)) - np.testing.assert_allclose(y, y_reikna, atol=1e-6, rtol=5e-5, err_msg='reikna FFT error at index {}'.format(i)) + np.testing.assert_allclose(y, y_cufft, rtol=5e-5, err_msg='cuFFT error at index {}'.format(i)) + np.testing.assert_allclose(y, y_reikna, rtol=5e-5, err_msg='reikna FFT error at index {}'.format(i)) diff --git a/test/accelerate_tests/cuda_pycuda_tests/fft_tests/fft_import_fft_test.py b/test/accelerate_tests/cuda_pycuda_tests/fft_tests/fft_import_fft_test.py index 62fa7bbbc..7d60ce46a 100644 --- 
a/test/accelerate_tests/cuda_pycuda_tests/fft_tests/fft_import_fft_test.py +++ b/test/accelerate_tests/cuda_pycuda_tests/fft_tests/fft_import_fft_test.py @@ -1,34 +1,26 @@ -import unittest +import unittest, pytest from test.accelerate_tests.cuda_pycuda_tests import PyCudaTest, have_pycuda +import os, shutil +from distutils import sysconfig if have_pycuda(): + import pycuda.driver as cuda + from pycuda import gpuarray from ptypy.accelerate.cuda_pycuda import import_fft + from pycuda.tools import make_default_context class ImportFFTTest(PyCudaTest): def test_import_fft(self): - mod = import_fft.ImportFFT().get_mod() - ft = mod.FilteredFFT(2, 32, 32, False, True, 0, 0, 0) + import_fft.ImportFFT(32, 32) def test_import_fft_different_shape(self): - mod = import_fft.ImportFFT(quiet=False).get_mod() - ft = mod.FilteredFFT(2, 128, 128, False, True, 0, 0, 0) + import_fft.ImportFFT(128, 128) def test_import_fft_same_module_again(self): - mod = import_fft.ImportFFT().get_mod() - ft = mod.FilteredFFT(2, 32, 32, False, True, 0, 0, 0) - - @unittest.expectedFailure - def test_import_fft_not_square(self): - mod = import_fft.ImportFFT().get_mod() - ft = mod.FilteredFFT(2, 32, 64, False, True, 0, 0, 0) - - @unittest.expectedFailure - def test_import_fft_not_pow2(self): - mod = import_fft.ImportFFT().get_mod() - ft = mod.FilteredFFT(2, 40, 40, False, True, 0, 0, 0) + import_fft.ImportFFT(32, 32) if __name__=="__main__": From ad4e81e85165d7d3f2094749e53166d30091de5c Mon Sep 17 00:00:00 2001 From: Jorg Lotze Date: Wed, 31 Mar 2021 20:02:16 +0100 Subject: [PATCH 43/56] Fix in context initialisation to raise an exception (#312) * fix in context initialisation to raise an exception in case more processes than GPUs are created * More verbose error and allow to create new stream with existing context * improved error message Co-authored-by: Benedikt Daurer --- ptypy/accelerate/cuda_pycuda/__init__.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/ptypy/accelerate/cuda_pycuda/__init__.py b/ptypy/accelerate/cuda_pycuda/__init__.py index 55833de3e..207027f40 100644 --- a/ptypy/accelerate/cuda_pycuda/__init__.py +++ b/ptypy/accelerate/cuda_pycuda/__init__.py @@ -24,9 +24,13 @@ def get_context(new_context=False, new_queue=False): if context is None or new_context: cuda.init() - if parallel.rank_local < cuda.Device.count(): - context = cuda.Device(parallel.rank_local).make_context() - context.push() + if parallel.rank_local >= cuda.Device.count(): + raise Exception('Local rank must be smaller than total device count, \ + rank={}, rank_local={}, device_count={}'.format( + parallel.rank, parallel.rank_local, cuda.Device.count() + )) + context = cuda.Device(parallel.rank_local).make_context() + context.push() # print("made context %s on rank %s" % (str(context), str(parallel.rank))) # print("The cuda device count on %s is:%s" % (str(parallel.rank), # str(cuda.Device.count()))) @@ -34,6 +38,7 @@ def get_context(new_context=False, new_queue=False): # str(parallel.rank_local))) if queue is None or new_queue: queue = cuda.Stream() + return context, queue From 2849395961199544c3e7674a1579049678f7a3bb Mon Sep 17 00:00:00 2001 From: "Benedikt J. 
Daurer" Date: Thu, 1 Apr 2021 10:24:30 +0100 Subject: [PATCH 44/56] WIP: position correction (#309) * Introduce grid search option for position refinement * fixed bug in address mangler * re-designed position correction base kernel, added grid search * Make sure we stay within valid bounds * base address manglers tests * address mangler's get_address on GPU (tests) * integrating GPU-based address manglers in DM engines * Fix typo in DM_serial, clean up debugging traces * avoid expensive re-allocations for deltas in address manglers * position grid search seems to work again with all DM engines * simplified address mangler * Template scripts for position correction * use a raw memcopy for the deltas to GPU, which will also work for differing sizes * Fixing data type and memcopy for the deltas in address manglers * Remove warning message * fixing typo for transpose kernel + setting position correction stream * Implement "photon" metric in all DM engines * need to synchronize * starting to add position correction in ML * Add templates for position refinement * It does not make sense to implement position correction for ML in this way Co-authored-by: Jorg Lotze --- ptypy/accelerate/base/address_manglers.py | 92 +++++--- ptypy/accelerate/base/engines/DM_serial.py | 65 +++--- ptypy/accelerate/base/engines/ML_serial.py | 22 +- ptypy/accelerate/base/kernels.py | 43 +++- ptypy/accelerate/cuda_pycuda/__init__.py | 2 +- .../cuda_pycuda/address_manglers.py | 74 +++++++ .../cuda_pycuda/cuda/get_address.cu | 35 +++ .../cuda_pycuda/engines/DM_pycuda.py | 70 +++--- .../cuda_pycuda/engines/DM_pycuda_stream.py | 66 ++++-- .../cuda_pycuda/engines/DM_pycuda_streams.py | 61 ++++-- .../cuda_pycuda/engines/ML_pycuda.py | 17 +- ptypy/accelerate/cuda_pycuda/kernels.py | 47 ++-- ptypy/engines/ML.py | 6 +- ptypy/engines/base.py | 24 ++- ptypy/engines/posref.py | 203 +++++++++++++----- ...efinement.py => position_refinement_DM.py} | 25 ++- templates/position_refinement_DM_pycuda.py | 93 ++++++++ templates/position_refinement_DM_serial.py | 33 +-- .../base_tests/address_manglers_test.py | 85 +++++--- .../address_manglers_test.py | 77 +++++++ 20 files changed, 828 insertions(+), 312 deletions(-) create mode 100644 ptypy/accelerate/cuda_pycuda/address_manglers.py create mode 100644 ptypy/accelerate/cuda_pycuda/cuda/get_address.cu rename templates/{position_refinement.py => position_refinement_DM.py} (80%) create mode 100644 templates/position_refinement_DM_pycuda.py create mode 100644 test/accelerate_tests/cuda_pycuda_tests/address_manglers_test.py diff --git a/ptypy/accelerate/base/address_manglers.py b/ptypy/accelerate/base/address_manglers.py index c60543cb4..6c73da5da 100644 --- a/ptypy/accelerate/base/address_manglers.py +++ b/ptypy/accelerate/base/address_manglers.py @@ -4,52 +4,82 @@ import numpy as np np.random.seed(0) -class RandomIntMangle(object): + +class BaseMangler(object): ''' - assumes integer pixel shift. + Assumes integer pixel shift. 
''' - def __init__(self, max_step_per_shift, start, stop, max_bound=None, randomseed=None): + def __init__(self, max_step_per_shift, start, stop, nshifts, max_bound=None, randomseed=None): # can be initialised in the engine.init self.max_bound = max_bound # maximum distance from the starting positions self.max_step = lambda it: (max_step_per_shift * (stop - it) / (stop - start)) # maximum step per iteration, decreases with progression - self.call_no = 0 + self.nshifts = nshifts + self.delta = 0 - def mangle_address(self, addr_current, addr_original, iteration): + def get_address(self, index, addr_current, mangled_addr, max_oby, max_obx): ''' - Takes the current address book and adds an offset to it according to the parameters + Mangles with the address given a delta shift ''' - mangled_addr = np.zeros_like(addr_current) - mangled_addr[:] = addr_current # make a copy - max_step = self.max_step(iteration) - deltas = np.random.randint(0, max_step + 1, (addr_current.shape[0], 2)) - # the following improves things a lot! - deltas[:, 0] = (-1)**self.call_no - deltas[:, 1] = (-1)**(self.call_no//2) - self.call_no += 1 - - # deltas = np.zeros((addr_current.shape[0], 2)) # for testing old_positions = np.zeros((addr_current.shape[0], 2)) old_positions[:] = addr_current[:, 0, 1, 1:] new_positions = np.zeros((addr_current.shape[0],2)) - # new_positions[1:] = old_positions[1:] + deltas[1:] # first mode is same as all of them. - new_positions[:] = old_positions + deltas # first mode is same as all of them. - self.apply_bounding_box(new_positions, old_positions, addr_original) + new_positions[:] = old_positions + self.delta[index] # first mode is same as all of them. # now update the main matrix (Same for all modes) - for idx in range(addr_original.shape[1]): + for idx in range(addr_current.shape[1]): mangled_addr[:, idx, 1, 1:] = new_positions - return mangled_addr + self.apply_bounding_box(mangled_addr[:,:,1,1], 0, max_oby) + self.apply_bounding_box(mangled_addr[:,:,1,2], 0, max_obx) + + def apply_bounding_box(self, addr, min, max): + ''' + Check if the mangled addresses are within valid bounds + ''' + addr[addrmax] = max + + def setup_shifts(self, current_iteration, nframes=1): + ''' + Arrange an array of shifts + ''' + raise NotImplementedError("This method needs to be overwritten in order to position correct") + + +class RandomIntMangler(BaseMangler): - def apply_bounding_box(self, new_positions, old_positions, addr_original): + def __init__(self, *args, **kwargs): + super(RandomIntMangler, self).__init__(*args, **kwargs) + + def setup_shifts(self, current_iteration, nframes=1): + ''' + Calculates random integer shifts + ''' + max_step = self.max_step(current_iteration) + self.delta = np.random.randint(0, max_step + 1, (self.nshifts, nframes, 2)) + for index in range(self.nshifts): + self.delta[index, :, 0] *= (-1)**index + self.delta[index, :, 1] *= (-1)**(index//2) + # check if the shifts are within the maximum bound + norms = np.linalg.norm(self.delta, axis=-1) + self.delta[norms > self.max_bound] = 0 + +class GridSearchMangler(BaseMangler): + def __init__(self, *args, **kwargs): + super(GridSearchMangler, self).__init__(*args, **kwargs) + + def setup_shifts(self, current_iteration, nframes=1): ''' - Checks if the new co-ordinates lie within the bounding box. If not, we undo this move. 
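For orientation, the random-integer (annealing) mangler defined above generates its trial shifts per iteration as in the following minimal NumPy sketch; the function name and the example numbers are illustrative and not part of the patch:

import numpy as np

def annealing_shifts(nshifts, nframes, max_step, max_bound):
    # One random integer (dy, dx) trial per shift index and per frame,
    # drawn from [0, max_step] and given alternating signs so that all
    # four quadrants around the current position get explored.
    delta = np.random.randint(0, max_step + 1, (nshifts, nframes, 2))
    for index in range(nshifts):
        delta[index, :, 0] *= (-1) ** index         # flip sign of dy on every trial
        delta[index, :, 1] *= (-1) ** (index // 2)  # flip sign of dx on every other trial
    # Trials that would jump further than max_bound pixels are disabled (set to zero)
    norms = np.linalg.norm(delta, axis=-1)
    delta[norms > max_bound] = 0
    return delta

# Example: 4 trial shifts for 3 frames, at most 2 pixels per step, 10 pixels overall bound
print(annealing_shifts(4, 3, max_step=2, max_bound=10).shape)   # (4, 3, 2)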
+ Calculates integer shifts on a grid ''' + max_step = self.max_step(current_iteration) + delta = np.mgrid[-max_step:max_step+1:1, + -max_step:max_step+1:1] + within_bound = (delta[0]**2 + delta[1]**2) < (self.max_bound**2) + self.delta = np.tile(delta[:,within_bound].T.reshape(within_bound.sum(),1,2), (1,nframes,1)) + self.nshifts = self.delta.shape[0] + + + + + - distances_from_original = new_positions - addr_original[:, 0, 1, 1:] - # logger.warning("distance from original is %s" % repr(distances_from_original)) - norms = np.linalg.norm(distances_from_original, axis=-1) - for i in range(len(new_positions)): - if norms[i]> self.max_bound: - new_positions[i] = old_positions[i] - # new_positions[norms>self.max_bound] = old_positions[norms>self.max_bound] # make sure we aren't outside the bounding box -# \ No newline at end of file diff --git a/ptypy/accelerate/base/engines/DM_serial.py b/ptypy/accelerate/base/engines/DM_serial.py index 6957d808e..563f61ea1 100644 --- a/ptypy/accelerate/base/engines/DM_serial.py +++ b/ptypy/accelerate/base/engines/DM_serial.py @@ -15,7 +15,6 @@ from ptypy.utils import parallel from ptypy.engines import BaseEngine, register, DM from ptypy.accelerate.base.kernels import FourierUpdateKernel, AuxiliaryWaveKernel, PoUpdateKernel, PositionCorrectionKernel -from ptypy.accelerate.base import address_manglers from ptypy.accelerate.base import array_utils as au @@ -196,17 +195,8 @@ def _setup_kernels(self): kern.resolution = geo.resolution[0] if self.do_position_refinement: - addr_mangler = address_manglers.RandomIntMangle(int(self.p.position_refinement.amplitude // geo.resolution[0]), - self.p.position_refinement.start, - self.p.position_refinement.stop, - max_bound=int(self.p.position_refinement.max_shift // geo.resolution[0]), - randomseed=0) - logger.warning("amplitude is %s " % (self.p.position_refinement.amplitude // geo.resolution[0])) - logger.warning("max bound is %s " % (self.p.position_refinement.max_shift // geo.resolution[0])) - - kern.PCK = PositionCorrectionKernel(aux, nmodes) + kern.PCK = PositionCorrectionKernel(aux, nmodes, self.p.position_refinement, geo.resolution) kern.PCK.allocate() - kern.PCK.address_mangler = addr_mangler def engine_prepare(self): @@ -346,7 +336,7 @@ def engine_iterate(self, num=1): self.overlap_update(MPI=True) parallel.barrier() - if self.do_position_refinement and (self.curiter): + if self.do_position_refinement: do_update_pos = (self.p.position_refinement.stop > self.curiter >= self.p.position_refinement.start) do_update_pos &= (self.curiter % self.p.position_refinement.interval) == 0 @@ -366,7 +356,8 @@ def engine_iterate(self, num=1): kern = self.kernels[prep.label] aux = kern.aux addr = prep.addr - original_addr = prep.original_addr # use this instead of the one in the address mangler. 
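The grid-search mangler added above enumerates every integer shift inside a disc rather than drawing random trials. A self-contained sketch of that candidate generation, with illustrative names and values:

import numpy as np

def grid_search_shifts(max_step, max_bound, nframes=1):
    # Enumerate every integer (dy, dx) on a square grid of half-width max_step
    delta = np.mgrid[-max_step:max_step + 1:1,
                     -max_step:max_step + 1:1]
    # Keep only candidates inside a disc of radius max_bound around the origin
    within_bound = (delta[0]**2 + delta[1]**2) < (max_bound**2)
    # One row per candidate shift, replicated for every frame: (nshifts, nframes, 2)
    shifts = np.tile(delta[:, within_bound].T.reshape(within_bound.sum(), 1, 2),
                     (1, nframes, 1))
    return shifts

shifts = grid_search_shifts(max_step=2, max_bound=2, nframes=5)
print(shifts.shape)   # (9, 5, 2): the 9 integer shifts with |d| < 2 pixels, for 5 frames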
+ original_addr = prep.original_addr + mangled_addr = addr.copy() mag = prep.mag ma_sum = prep.ma_sum err_fourier = prep.err_fourier @@ -374,16 +365,34 @@ def engine_iterate(self, num=1): PCK = kern.PCK FW = kern.FW + # Keep track of object boundaries + max_oby = ob.shape[-2] - aux.shape[-2] - 1 + max_obx = ob.shape[-1] - aux.shape[-1] - 1 + + # We need to re-calculate the current error + PCK.build_aux(aux, addr, ob, pr) + aux[:] = FW(aux) + if self.p.position_refinement.metric == "fourier": + PCK.fourier_error(aux, addr, mag, ma, ma_sum) + PCK.error_reduce(addr, err_fourier) + if self.p.position_refinement.metric == "photon": + PCK.log_likelihood(aux, addr, mag, ma, err_fourier) error_state = np.zeros_like(err_fourier) error_state[:] = err_fourier + PCK.mangler.setup_shifts(self.curiter, nframes=addr.shape[0]) + log(4, 'Position refinement trial: iteration %s' % (self.curiter)) - for i in range(self.p.position_refinement.nshifts): - mangled_addr = PCK.address_mangler.mangle_address(addr, original_addr, self.curiter) + for i in range(PCK.mangler.nshifts): + PCK.mangler.get_address(i, addr, mangled_addr, max_oby, max_obx) PCK.build_aux(aux, mangled_addr, ob, pr) aux[:] = FW(aux) - PCK.fourier_error(aux, mangled_addr, mag, ma, ma_sum) - PCK.error_reduce(mangled_addr, err_fourier) + if self.p.position_refinement.metric == "fourier": + PCK.fourier_error(aux, mangled_addr, mag, ma, ma_sum) + PCK.error_reduce(mangled_addr, err_fourier) + if self.p.position_refinement.metric == "photon": + PCK.log_likelihood(aux, mangled_addr, mag, ma, err_fourier) PCK.update_addr_and_error_state(addr, error_state, mangled_addr, err_fourier) + prep.err_fourier = error_state prep.addr = addr @@ -460,19 +469,19 @@ def object_update(self, MPI=False): parallel.allreduce(ob.data) parallel.allreduce(obn.data) ob.data /= obn.data - - # Clip object (This call takes like one ms. Not time critical) - if self.p.clip_object is not None: - clip_min, clip_max = self.p.clip_object - ampl_obj = np.abs(ob.data) - phase_obj = np.exp(1j * np.angle(ob.data)) - too_high = (ampl_obj > clip_max) - too_low = (ampl_obj < clip_min) - ob.data[too_high] = clip_max * phase_obj[too_high] - ob.data[too_low] = clip_min * phase_obj[too_low] else: ob.data /= obn.data + # Clip object (This call takes like one ms. 
Not time critical) + if self.p.clip_object is not None: + clip_min, clip_max = self.p.clip_object + ampl_obj = np.abs(ob.data) + phase_obj = np.exp(1j * np.angle(ob.data)) + too_high = (ampl_obj > clip_max) + too_low = (ampl_obj < clip_min) + ob.data[too_high] = clip_max * phase_obj[too_high] + ob.data[too_low] = clip_min * phase_obj[too_low] + self.benchmark.object_update += time.time() - t1 self.benchmark.calls_object += 1 @@ -559,7 +568,7 @@ def engine_finalize(self): res = self.kernels[prep.label].resolution for i,view in enumerate(d.views): for j,(pname, pod) in enumerate(view.pods.items()): - delta = (prep.original_addr[i][j][1][1:] - prep.addr[i][j][1][1:]) * res + delta = (prep.addr[i][j][1][1:] - prep.original_addr[i][j][1][1:]) * res pod.ob_view.coord += delta pod.ob_view.storage.update_views(pod.ob_view) diff --git a/ptypy/accelerate/base/engines/ML_serial.py b/ptypy/accelerate/base/engines/ML_serial.py index fb359cf23..7ad06c69d 100644 --- a/ptypy/accelerate/base/engines/ML_serial.py +++ b/ptypy/accelerate/base/engines/ML_serial.py @@ -17,12 +17,11 @@ from ptypy.engines.ML import ML, BaseModel from .DM_serial import serialize_array_access from ptypy import utils as u -from ptypy.utils.verbose import logger +from ptypy.utils.verbose import logger, log from ptypy.utils import parallel from ptypy.engines.utils import Cnorm2, Cdot from ptypy.engines import register -from ptypy.accelerate.base.kernels import GradientDescentKernel, AuxiliaryWaveKernel, PoUpdateKernel, \ - PositionCorrectionKernel +from ptypy.accelerate.base.kernels import GradientDescentKernel, AuxiliaryWaveKernel, PoUpdateKernel from ptypy.accelerate.base import address_manglers # for debugging @@ -106,20 +105,6 @@ def _setup_kernels(self): kern.FW = geo.propagator.fw kern.BW = geo.propagator.bw - if self.do_position_refinement: - addr_mangler = address_manglers.RandomIntMangle( - int(self.p.position_refinement.amplitude // geo.resolution[0]), - self.p.position_refinement.start, - self.p.position_refinement.stop, - max_bound=int(self.p.position_refinement.max_shift // geo.resolution[0]), - randomseed=0) - logger.warning("amplitude is %s " % (self.p.position_refinement.amplitude // geo.resolution[0])) - logger.warning("max bound is %s " % (self.p.position_refinement.max_shift // geo.resolution[0])) - - kern.PCK = PositionCorrectionKernel(aux, nmodes) - kern.PCK.allocate() - kern.PCK.address_mangler = addr_mangler - def engine_prepare(self): ## Serialize new data ## @@ -139,9 +124,6 @@ def engine_prepare(self): for label, d in self.di.storages.items(): prep = self.diff_info[d.ID] prep.view_IDs, prep.poe_IDs, prep.addr = serialize_array_access(d) - if self.do_position_refinement: - prep.original_addr = np.zeros_like(prep.addr) - prep.original_addr[:] = prep.addr self.ML_model.prepare() diff --git a/ptypy/accelerate/base/kernels.py b/ptypy/accelerate/base/kernels.py index 9569f882c..85f81fec2 100644 --- a/ptypy/accelerate/base/kernels.py +++ b/ptypy/accelerate/base/kernels.py @@ -577,7 +577,14 @@ def pr_update_local(self, addr, pr, ob, ex, aux): return class PositionCorrectionKernel(BaseKernel): - def __init__(self, aux, nmodes): + from ptypy.accelerate.base import address_manglers + + MANGLERS = { + 'Annealing': address_manglers.RandomIntMangler, + 'GridSearch': address_manglers.GridSearchMangler + } + + def __init__(self, aux, nmodes, parameters, resolution): super(PositionCorrectionKernel, self).__init__() ash = aux.shape self.fshape = (ash[0] // nmodes, ash[1], ash[2]) @@ -585,11 +592,20 @@ def __init__(self, aux, 
nmodes): self.npy.fdev = None self.addr = None self.nmodes = nmodes - self.address_mangler = None + self.param = parameters + self.nshifts = parameters.nshifts + self.resolution = resolution self.kernels = ['build_aux', 'fourier_error', 'error_reduce', 'update_addr'] + self.setup() + + def setup(self): + Mangler = self.MANGLERS[self.param.method] + self.mangler = Mangler(int(self.param.amplitude // self.resolution[0]), self.param.start, self.param.stop, + self.param.nshifts, + max_bound=int(self.param.max_shift // self.resolution[0]), randomseed=0) def allocate(self): self.npy.fdev = np.zeros(self.fshape, dtype=np.float32) # we won't use this again but preallocate for speed @@ -663,11 +679,32 @@ def error_reduce(self, addr, err_sum): err_sum[:] = ferr.sum(-1).sum(-1) return + def log_likelihood(self, b_aux, addr, mag, mask, err_sum): + # reference shape (write-to shape) + sh = self.fshape + # stopper + maxz = mag.shape[0] + + # batch buffers + aux = b_aux[:maxz * self.nmodes] + + # build model from complex fourier magnitudes, summing up + # all modes incoherently + tf = aux.reshape(maxz, self.nmodes, sh[1], sh[2]) + LL = (np.abs(tf) ** 2).sum(1) + + # Intensity data + I = mag**2 + + # Calculate log likelihood error + err_sum[:] = ((mask * (LL - I)**2 / (I + 1.)).sum(-1).sum(-1) / np.prod(LL.shape[-2:])) + return + def update_addr_and_error_state(self, addr, error_state, mangled_addr, err_sum): ''' updates the addresses and err state vector corresponding to the smallest error. I think this can be done on the cpu ''' update_indices = err_sum < error_state - log(4, "updating %s indices" % np.sum(update_indices)) + log(4, "Position correction: updating %s indices" % np.sum(update_indices)) addr[update_indices] = mangled_addr[update_indices] error_state[update_indices] = err_sum[update_indices] diff --git a/ptypy/accelerate/cuda_pycuda/__init__.py b/ptypy/accelerate/cuda_pycuda/__init__.py index 207027f40..677a641f0 100644 --- a/ptypy/accelerate/cuda_pycuda/__init__.py +++ b/ptypy/accelerate/cuda_pycuda/__init__.py @@ -3,7 +3,7 @@ import numpy as np import os # debug_options = [] -#debug_options = ['-O0', '-G', '-g', '-std=c++11', '--keep'] +# debug_options = ['-O0', '-G', '-g', ] debug_options = ['-O3', '-DNDEBUG', '-lineinfo'] # release mode flags # C++14 support was added with CUDA 9, so we only enable the flag there diff --git a/ptypy/accelerate/cuda_pycuda/address_manglers.py b/ptypy/accelerate/cuda_pycuda/address_manglers.py new file mode 100644 index 000000000..fa168903f --- /dev/null +++ b/ptypy/accelerate/cuda_pycuda/address_manglers.py @@ -0,0 +1,74 @@ +from ptypy.accelerate.cuda_pycuda import load_kernel +import numpy as np +from ptypy.accelerate.base import address_manglers as npam +from pycuda import gpuarray +import pycuda.driver as cuda + +class BaseMangler(npam.BaseMangler): + + def __init__(self, *args, queue_thread=None, **kwargs): + super().__init__(*args, **kwargs) + self.queue = queue_thread + self.get_address_cuda = load_kernel("get_address") + self.delta = None + self.delta_gpu = None + + def _setup_delta_gpu(self): + assert self.delta is not None, "Setup delta using the setup_shifts method first" + self.delta = np.ascontiguousarray(self.delta, dtype=np.int32) + + if self.delta_gpu is None or self.delta_gpu.shape[0] > self.delta.shape[0]: + self.delta_gpu = gpuarray.empty(self.delta.shape, dtype=np.int32) + # in case self.delta is smaller than delta_gpu, this will only copy the + # relevant part + cuda.memcpy_htod(dest=self.delta_gpu.ptr, + src=self.delta) + + def 
get_address(self, index, addr_current, mangled_addr, max_oby, max_obx): + assert addr_current.dtype == np.int32, "addresses must be int32" + assert mangled_addr.dtype == np.int32, "addresses must be int32" + assert len(addr_current.shape) == 4, "addresses must be 4 dimensions" + assert addr_current.shape == mangled_addr.shape, "output addresses must be pre-allocated" + assert self.delta_gpu is not None, "Deltas are not set yet - call setup_shifts first" + assert index < self.delta_gpu.shape[0], "Index out of range for deltas" + assert isinstance(self.delta_gpu, gpuarray.GPUArray), "Only GPU arrays are supported for delta" + + # only using a single thread block here as it's not enough work + # otherwise + self.get_address_cuda( + addr_current, + mangled_addr, + np.int32(addr_current.shape[0] * addr_current.shape[1]), + self.delta_gpu[index,None], + np.int32(max_oby), + np.int32(max_obx), + block=(64,1,1), + grid=(1, 1, 1), + stream=self.queue) + +# with multiple inheritance, we have to be explicit which super class +# we are calling in the methods +class RandomIntMangler(BaseMangler, npam.RandomIntMangler): + + def __init__(self, *args, **kwargs): + BaseMangler.__init__(self, *args, **kwargs) + + def setup_shifts(self, *args, **kwargs): + npam.RandomIntMangler.setup_shifts(self, *args, **kwargs) + self._setup_delta_gpu() + + def get_address(self, *args, **kwargs): + BaseMangler.get_address(self, *args, **kwargs) + + +class GridSearchMangler(BaseMangler, npam.GridSearchMangler): + + def __init__(self, *args, **kwargs): + BaseMangler.__init__(self, *args, **kwargs) + + def setup_shifts(self, *args, **kwargs): + npam.GridSearchMangler.setup_shifts(self, *args, **kwargs) + self._setup_delta_gpu() + + def get_address(self, *args, **kwargs): + BaseMangler.get_address(self, *args, **kwargs) \ No newline at end of file diff --git a/ptypy/accelerate/cuda_pycuda/cuda/get_address.cu b/ptypy/accelerate/cuda_pycuda/cuda/get_address.cu new file mode 100644 index 000000000..dda9b45f1 --- /dev/null +++ b/ptypy/accelerate/cuda_pycuda/cuda/get_address.cu @@ -0,0 +1,35 @@ +#include +#include +using thrust::complex; + +inline __device__ int minimum(int a, int b) { return a < b ? a : b; } + +inline __device__ int maximum(int a, int b) { return a < b ? b : a; } + +extern "C" __global__ void get_address(const int* addr_current, + int* mangled_addr, + int num_pods, + const int* __restrict delta, + int max_oby, + int max_obx) +{ + // we use only one thread block + const int tx = threadIdx.x; + const int idx = tx % 2; // even threads access y dim, odd threads x dim + const int maxval = (idx == 0) ? max_oby : max_obx; + + const int addr_stride = 15; + const int d = delta[idx]; + addr_current += 3 + idx + 1; + mangled_addr += 3 + idx + 1; + + for (int ix = tx; ix < num_pods * 2; ix += blockDim.x) + { + const int bid = ix / 2; + int cur = addr_current[bid * addr_stride] + d; + int bound = maximum(0, minimum(maxval, cur)); + assert(bound >= 0); + assert(bound <= maxval); + mangled_addr[bid * addr_stride] = bound; + } +} \ No newline at end of file diff --git a/ptypy/accelerate/cuda_pycuda/engines/DM_pycuda.py b/ptypy/accelerate/cuda_pycuda/engines/DM_pycuda.py index cb489253a..21afc30fa 100644 --- a/ptypy/accelerate/cuda_pycuda/engines/DM_pycuda.py +++ b/ptypy/accelerate/cuda_pycuda/engines/DM_pycuda.py @@ -18,7 +18,6 @@ from ptypy.utils import parallel from ptypy.engines import register from ptypy.accelerate.base.engines import DM_serial -from ptypy.accelerate.base import address_manglers from .. 
import get_context from ..kernels import FourierUpdateKernel, AuxiliaryWaveKernel, PoUpdateKernel, PositionCorrectionKernel, PropagationKernel from ..array_utils import ArrayUtilsKernel, GaussianSmoothingKernel, TransposeKernel @@ -129,18 +128,9 @@ def _setup_kernels(self): kern.resolution = geo.resolution[0] if self.do_position_refinement: - logger.info("Setting up position correction") - addr_mangler = address_manglers.RandomIntMangle(int(self.p.position_refinement.amplitude // geo.resolution[0]), - self.p.position_refinement.start, - self.p.position_refinement.stop, - max_bound=int(self.p.position_refinement.max_shift // geo.resolution[0]), - randomseed=0) - logger.warning("amplitude is %s " % (self.p.position_refinement.amplitude // geo.resolution[0])) - logger.warning("max bound is %s " % (self.p.position_refinement.max_shift // geo.resolution[0])) - - kern.PCK = PositionCorrectionKernel(aux, nmodes, queue_thread=self.queue) + logger.info("Setting up PositionCorrectionKernel") + kern.PCK = PositionCorrectionKernel(aux, nmodes, self.p.position_refinement, geo.resolution, queue_thread=self.queue) kern.PCK.allocate() - kern.PCK.address_mangler = addr_mangler logger.info("Kernel setup completed") def engine_prepare(self): @@ -167,6 +157,8 @@ def engine_prepare(self): if use_tiles: prep.addr2 = np.ascontiguousarray(np.transpose(prep.addr, (2, 3, 0, 1))) prep.addr2_gpu = gpuarray.to_gpu(prep.addr2) + if self.do_position_refinement: + prep.mangled_addr_gpu = prep.addr_gpu.copy() for label, d in self.ptycho.new_data: prep = self.diff_info[d.ID] @@ -287,39 +279,53 @@ def engine_iterate(self, num=1): aux = kern.aux addr = prep.addr_gpu original_addr = prep.original_addr + mangled_addr = prep.mangled_addr_gpu mag = prep.mag ma_sum = prep.ma_sum err_fourier = prep.err_fourier_gpu + error_state = prep.error_state_gpu PCK = kern.PCK - AUK = kern.AUK - - #error_state = np.zeros(err_fourier.shape, dtype=np.float32) - #error_state[:] = err_fourier.get() - cuda.memcpy_dtod(dest=prep.error_state_gpu.ptr, + TK = kern.TK + PROP = kern.PROP + + # Keep track of object boundaries + max_oby = ob.shape[-2] - aux.shape[-2] - 1 + max_obx = ob.shape[-1] - aux.shape[-1] - 1 + + # We need to re-calculate the current error + PCK.build_aux(aux, addr, ob, pr) + PROP.fw(aux, aux) + if self.p.position_refinement.metric == "fourier": + PCK.fourier_error(aux, addr, mag, ma, ma_sum) + PCK.error_reduce(addr, err_fourier) + if self.p.position_refinement.metric == "photon": + PCK.log_likelihood(aux, addr, mag, ma, err_fourier) + cuda.memcpy_dtod(dest=error_state.ptr, src=err_fourier.ptr, size=err_fourier.nbytes) + + PCK.mangler.setup_shifts(self.curiter, nframes=addr.shape[0]) + log(4, 'Position refinement trial: iteration %s' % (self.curiter)) - for i in range(self.p.position_refinement.nshifts): - mangled_addr = PCK.address_mangler.mangle_address(addr.get(), original_addr, self.curiter) - mangled_addr_gpu = gpuarray.to_gpu(mangled_addr) - PCK.build_aux(aux, mangled_addr_gpu, ob, pr) + for i in range(PCK.mangler.nshifts): + PCK.mangler.get_address(i, addr, mangled_addr, max_oby, max_obx) + PCK.build_aux(aux, mangled_addr, ob, pr) PROP.fw(aux, aux) - PCK.fourier_error(aux, mangled_addr_gpu, mag, ma, ma_sum) - PCK.error_reduce(mangled_addr_gpu, err_fourier) - PCK.update_addr_and_error_state(addr, - prep.error_state_gpu, - mangled_addr_gpu, - err_fourier) + if self.p.position_refinement.metric == "fourier": + PCK.fourier_error(aux, mangled_addr, mag, ma, ma_sum) + PCK.error_reduce(mangled_addr, err_fourier) + if 
self.p.position_refinement.metric == "photon": + PCK.log_likelihood(aux, mangled_addr, mag, ma, err_fourier) + PCK.update_addr_and_error_state(addr, error_state, mangled_addr, err_fourier) - # prep.err_fourier_gpu.set(error_state) - cuda.memcpy_dtod(dest=prep.err_fourier_gpu.ptr, - src=prep.error_state_gpu.ptr, - size=prep.err_fourier_gpu.nbytes) + cuda.memcpy_dtod(dest=err_fourier.ptr, + src=error_state.ptr, + size=err_fourier.nbytes) if use_tiles: s1 = addr.shape[0] * addr.shape[1] s2 = addr.shape[2] * addr.shape[3] - kern.TK.transpose(addr.reshape(s1, s2), prep.addr2_gpu.reshape(s2, s1)) + TK.transpose(addr.reshape(s1, s2), prep.addr2_gpu.reshape(s2, s1)) self.curiter += 1 queue.synchronize() diff --git a/ptypy/accelerate/cuda_pycuda/engines/DM_pycuda_stream.py b/ptypy/accelerate/cuda_pycuda/engines/DM_pycuda_stream.py index 3cf58f672..602715849 100644 --- a/ptypy/accelerate/cuda_pycuda/engines/DM_pycuda_stream.py +++ b/ptypy/accelerate/cuda_pycuda/engines/DM_pycuda_stream.py @@ -100,6 +100,8 @@ def engine_prepare(self): if use_tiles: prep.addr2 = np.ascontiguousarray(np.transpose(prep.addr, (2, 3, 0, 1))) prep.addr2_gpu = gpuarray.to_gpu(prep.addr2) + if self.do_position_refinement: + prep.mangled_addr_gpu = prep.addr_gpu.copy() for label, d in self.ptycho.new_data: dID = d.ID @@ -332,43 +334,61 @@ def engine_iterate(self, num=1): aux = kern.aux addr = prep.addr_gpu original_addr = prep.original_addr + mangled_addr = prep.mangled_addr_gpu ma_sum = prep.ma_sum_gpu + err_fourier = prep.err_fourier_gpu + error_state = prep.error_state_gpu + PCK = kern.PCK - AUK = kern.AUK + TK = kern.TK PROP = kern.PROP + # Make sure our data arrays are on device ev_ma, ma, data_ma = self.ma_data.to_gpu(prep.ma, dID, self.qu_htod) ev_mag, mag, data_mag = self.mag_data.to_gpu(prep.mag, dID, self.qu_htod) - # error_state = np.zeros(err_fourier.shape, dtype=np.float32) - # err_fourier.get_async(streamdata.queue, error_state) - cuda.memcpy_dtod(dest=prep.error_state_gpu.ptr, - src=prep.err_fourier_gpu.ptr, - size=prep.err_fourier_gpu.nbytes)#, stream=self.queue) + + # Keep track of object boundaries + max_oby = ob.shape[-2] - aux.shape[-2] - 1 + max_obx = ob.shape[-1] - aux.shape[-1] - 1 + + # We need to re-calculate the current error + PCK.build_aux(aux, addr, ob, pr) + PROP.fw(aux, aux) + # wait for data to arrive + self.queue.wait_for_event(ev_mag) + + # We need to re-calculate the current error + if self.p.position_refinement.metric == "fourier": + PCK.fourier_error(aux, addr, mag, ma, ma_sum) + PCK.error_reduce(addr, err_fourier) + if self.p.position_refinement.metric == "photon": + PCK.log_likelihood(aux, addr, mag, ma, err_fourier) + cuda.memcpy_dtod_async(dest=error_state.ptr, + src=err_fourier.ptr, + size=err_fourier.nbytes, stream=self.queue) + log(4, 'Position refinement trial: iteration %s' % (self.curiter)) - for i in range(self.p.position_refinement.nshifts): - mangled_addr = PCK.address_mangler.mangle_address(addr.get(), original_addr, self.curiter) - mangled_addr_gpu = gpuarray.to_gpu(mangled_addr) - PCK.build_aux(aux, mangled_addr_gpu, ob, pr) + PCK.mangler.setup_shifts(self.curiter, nframes=addr.shape[0]) + for i in range(PCK.mangler.nshifts): + PCK.mangler.get_address(i, addr, mangled_addr, max_oby, max_obx) + PCK.build_aux(aux, mangled_addr, ob, pr) PROP.fw(aux, aux) - # wait for data to arrive - self.queue.wait_for_event(ev_mag) - PCK.fourier_error(aux, mangled_addr_gpu, mag, ma, ma_sum) - PCK.error_reduce(mangled_addr_gpu, prep.err_fourier_gpu) - # err_fourier_cpu = 
err_fourier.get_async(streamdata.queue) - PCK.update_addr_and_error_state(addr, - prep.error_state_gpu, - mangled_addr_gpu, - prep.err_fourier_gpu) + if self.p.position_refinement.metric == "fourier": + PCK.fourier_error(aux, mangled_addr, mag, ma, ma_sum) + PCK.error_reduce(mangled_addr, err_fourier) + if self.p.position_refinement.metric == "photon": + PCK.log_likelihood(aux, mangled_addr, mag, ma, err_fourier) + PCK.update_addr_and_error_state(addr, error_state, mangled_addr, err_fourier) data_mag.record_done(self.queue, 'compute') data_ma.record_done(self.queue, 'compute') - cuda.memcpy_dtod(dest=prep.err_fourier_gpu.ptr, - src=prep.error_state_gpu.ptr, - size=prep.err_fourier_gpu.nbytes) #stream=self.queue) + cuda.memcpy_dtod_async(dest=err_fourier.ptr, + src=error_state.ptr, + size=err_fourier.nbytes, stream=self.queue) if use_tiles: s1 = prep.addr_gpu.shape[0] * prep.addr_gpu.shape[1] s2 = prep.addr_gpu.shape[2] * prep.addr_gpu.shape[3] - kern.TK.transpose(prep.addr_gpu.reshape(s1, s2), prep.addr2_gpu.reshape(s2, s1)) + TK.transpose(prep.addr_gpu.reshape(s1, s2), prep.addr2_gpu.reshape(s2, s1)) self.curiter += 1 self.queue.synchronize() diff --git a/ptypy/accelerate/cuda_pycuda/engines/DM_pycuda_streams.py b/ptypy/accelerate/cuda_pycuda/engines/DM_pycuda_streams.py index 706c03b26..4f797ed39 100644 --- a/ptypy/accelerate/cuda_pycuda/engines/DM_pycuda_streams.py +++ b/ptypy/accelerate/cuda_pycuda/engines/DM_pycuda_streams.py @@ -169,6 +169,8 @@ def engine_prepare(self): if use_tiles: prep.addr2 = np.ascontiguousarray(np.transpose(prep.addr, (2, 3, 0, 1))) prep.addr2_gpu = gpuarray.to_gpu(prep.addr2) + if self.do_position_refinement: + prep.mangled_addr_gpu = prep.addr_gpu.copy() prep.ma_sum_gpu = gpuarray.to_gpu(prep.ma_sum) # prepare page-locked mems: @@ -429,46 +431,59 @@ def engine_iterate(self, num=1): aux = kern.aux addr = prep.addr_gpu original_addr = prep.original_addr + mangled_addr = prep.mangled_addr_gpu ma_sum = prep.ma_sum_gpu ma, mag = streamdata.ma_to_gpu(dID, prep.ma, prep.mag) + err_fourier = prep.err_fourier_gpu + error_state = prep.error_state_gpu PCK = kern.PCK - AUK = kern.AUK + TK = kern.TK + PROP = kern.PROP PCK.queue = streamdata.queue + TK.queue = streamdata.queue PROP.queue = streamdata.queue - AUK.queue = streamdata.queue - #error_state = np.zeros(err_fourier.shape, dtype=np.float32) - #err_fourier.get_async(streamdata.queue, error_state) - cuda.memcpy_dtod_async(dest=prep.error_state_gpu.ptr, - src=prep.err_fourier_gpu.ptr, - size=prep.err_fourier_gpu.nbytes, + # Keep track of object boundaries + max_oby = ob.shape[-2] - aux.shape[-2] - 1 + max_obx = ob.shape[-1] - aux.shape[-1] - 1 + + # We need to re-calculate the current error + PCK.build_aux(aux, addr, ob, pr) + PROP.fw(aux, aux) + if self.p.position_refinement.metric == "fourier": + PCK.fourier_error(aux, addr, mag, ma, ma_sum) + PCK.error_reduce(addr, err_fourier) + if self.p.position_refinement.metric == "photon": + PCK.log_likelihood(aux, addr, mag, ma, err_fourier) + cuda.memcpy_dtod_async(dest=error_state.ptr, + src=err_fourier.ptr, + size=err_fourier.nbytes, stream=streamdata.queue) streamdata.start_compute(prev_event) log(4, 'Position refinement trial: iteration %s' % (self.curiter)) - for i in range(self.p.position_refinement.nshifts): - addr_cpu = addr.get_async(streamdata.queue) + PCK.mangler.setup_shifts(self.curiter, nframes=addr.shape[0]) + for i in range(PCK.mangler.nshifts): streamdata.queue.synchronize() - mangled_addr = PCK.address_mangler.mangle_address(addr_cpu, original_addr, 
self.curiter) - mangled_addr_gpu = gpuarray.to_gpu_async(mangled_addr, stream=streamdata.queue) - PCK.build_aux(aux, mangled_addr_gpu, ob, pr) + PCK.mangler.get_address(i, addr, mangled_addr, max_oby, max_obx) + PCK.build_aux(aux, mangled_addr, ob, pr) PROP.fw(aux, aux) - PCK.fourier_error(aux, mangled_addr_gpu, mag, ma, ma_sum) - PCK.error_reduce(mangled_addr_gpu, prep.err_fourier_gpu) - # err_fourier_cpu = err_fourier.get_async(streamdata.queue) - PCK.update_addr_and_error_state(addr, - prep.error_state_gpu, - mangled_addr_gpu, - prep.err_fourier_gpu) - cuda.memcpy_dtod_async(dest=prep.err_fourier_gpu.ptr, - src=prep.error_state_gpu.ptr, - size=prep.err_fourier_gpu.nbytes, + if self.p.position_refinement.metric == "fourier": + PCK.fourier_error(aux, mangled_addr, mag, ma, ma_sum) + PCK.error_reduce(mangled_addr, err_fourier) + if self.p.position_refinement.metric == "photon": + PCK.log_likelihood(aux, mangled_addr, mag, ma, err_fourier) + PCK.update_addr_and_error_state(addr, error_state, mangled_addr, err_fourier) + + cuda.memcpy_dtod_async(dest=err_fourier.ptr, + src=error_state.ptr, + size=err_fourier.nbytes, stream=streamdata.queue) if use_tiles: s1 = prep.addr_gpu.shape[0] * prep.addr_gpu.shape[1] s2 = prep.addr_gpu.shape[2] * prep.addr_gpu.shape[3] - kern.TK.transpose(prep.addr_gpu.reshape(s1, s2), prep.addr2_gpu.reshape(s2, s1)) + TK.transpose(prep.addr_gpu.reshape(s1, s2), prep.addr2_gpu.reshape(s2, s1)) prev_event = streamdata.end_compute() diff --git a/ptypy/accelerate/cuda_pycuda/engines/ML_pycuda.py b/ptypy/accelerate/cuda_pycuda/engines/ML_pycuda.py index 0cb1568b9..1b859fb66 100644 --- a/ptypy/accelerate/cuda_pycuda/engines/ML_pycuda.py +++ b/ptypy/accelerate/cuda_pycuda/engines/ML_pycuda.py @@ -23,8 +23,7 @@ from ptypy.utils.verbose import logger from ptypy.utils import parallel from .. 
import get_context -from ..kernels import GradientDescentKernel, AuxiliaryWaveKernel, PoUpdateKernel, \ - PositionCorrectionKernel, PropagationKernel +from ..kernels import GradientDescentKernel, AuxiliaryWaveKernel, PoUpdateKernel, PropagationKernel from ..array_utils import ArrayUtilsKernel, DerivativesKernel, GaussianSmoothingKernel from ptypy.accelerate.base import address_manglers @@ -210,20 +209,6 @@ def _setup_kernels(self): kern.PROP = PropagationKernel(aux, geo.propagator, queue_thread=self.queue) kern.PROP.allocate() - - if self.do_position_refinement: - addr_mangler = address_manglers.RandomIntMangle(int(self.p.position_refinement.amplitude // geo.resolution[0]), - self.p.position_refinement.start, - self.p.position_refinement.stop, - max_bound=int(self.p.position_refinement.max_shift // geo.resolution[0]), - randomseed=0) - logger.warning("amplitude is %s " % (self.p.position_refinement.amplitude // geo.resolution[0])) - logger.warning("max bound is %s " % (self.p.position_refinement.max_shift // geo.resolution[0])) - - kern.PCK = PositionCorrectionKernel(aux, nmodes, queue_thread=self.queue) - kern.PCK.allocate() - kern.PCK.address_mangler = addr_mangler - def _initialize_model(self): # Create noise model diff --git a/ptypy/accelerate/cuda_pycuda/kernels.py b/ptypy/accelerate/cuda_pycuda/kernels.py index bbf53c975..4ac3d3161 100644 --- a/ptypy/accelerate/cuda_pycuda/kernels.py +++ b/ptypy/accelerate/cuda_pycuda/kernels.py @@ -1031,8 +1031,18 @@ def pr_update_local(self, addr, pr, ob, ex, aux): class PositionCorrectionKernel(ab.PositionCorrectionKernel): - def __init__(self, aux, nmodes, queue_thread=None, math_type='float', accumulate_type='float'): - super(PositionCorrectionKernel, self).__init__(aux, nmodes) + from ptypy.accelerate.cuda_pycuda import address_manglers + + # these are used by the self.setup method - replacing them with the GPU implementation + MANGLERS = { + 'Annealing': address_manglers.RandomIntMangler, + 'GridSearch': address_manglers.GridSearchMangler + } + + def __init__(self, *args, queue_thread=None, math_type='float', accumulate_type='float', **kwargs): + super(PositionCorrectionKernel, self).__init__(*args, **kwargs) + # make sure we set the right stream in the mangler + self.mangler.queue = queue_thread if math_type not in ['float', 'double']: raise ValueError('Only float or double math is supported') if accumulate_type not in ['float', 'double']: @@ -1056,6 +1066,11 @@ def __init__(self, aux, nmodes, queue_thread=None, math_type='float', accumulate 'BDIM_Y': 32, 'ACC_TYPE': self.accumulate_type }) + self.log_likelihood_cuda = load_kernel("log_likelihood", { + 'IN_TYPE': 'float', + 'OUT_TYPE': 'float', + 'MATH_TYPE': self.math_type + }, "log_likelihood.cu") self.build_aux_pc_cuda = load_kernel("build_aux_position_correction", { 'IN_TYPE': 'float', 'OUT_TYPE': 'float', @@ -1117,19 +1132,21 @@ def error_reduce(self, addr, err_fmag): grid=(int(err_fmag.shape[0]), 1, 1), stream=self.queue) - def update_addr_and_error_state_old(self, addr, error_state, mangled_addr, err_sum): - ''' - updates the addresses and err state vector corresponding to the smallest error. 
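The log_likelihood kernel wired in above is the GPU counterpart of the NumPy version added to ptypy/accelerate/base/kernels.py earlier in this patch. As a reference for what the "photon" metric computes per diffraction frame, here is a hedged, standalone sketch; the array shapes and names are illustrative:

import numpy as np

def photon_error(aux, mag, mask, nmodes):
    # aux:  (nframes * nmodes, M, N) propagated exit waves (complex)
    # mag:  (nframes, M, N) measured Fourier magnitudes
    # mask: (nframes, M, N) valid-pixel mask
    nframes = mag.shape[0]
    # Model intensity: incoherent sum over probe/object modes
    LL = (np.abs(aux.reshape(nframes, nmodes, *mag.shape[-2:])) ** 2).sum(1)
    I = mag ** 2   # measured intensity
    # Reduced chi-square style error per frame, as in the base kernel
    return (mask * (LL - I) ** 2 / (I + 1.)).sum(axis=(-2, -1)) / np.prod(mag.shape[-2:])

# Example with random data: 4 frames, 2 modes, 32x32 pixels
aux = (np.random.randn(8, 32, 32) + 1j * np.random.randn(8, 32, 32)).astype(np.complex64)
mag = np.abs(np.random.randn(4, 32, 32)).astype(np.float32)
mask = np.ones((4, 32, 32), dtype=np.float32)
print(photon_error(aux, mag, mask, nmodes=2).shape)   # (4,)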
I think this can be done on the cpu - ''' - update_indices = err_sum < error_state - log(4, "updating %s indices" % np.sum(update_indices)) - print('update ind {}, addr {}, mangled {}'.format(update_indices.shape, addr.shape, mangled_addr.shape)) - addr_cpu = addr.get_async(self.queue) - self.queue.synchronize() - addr_cpu[update_indices] = mangled_addr[update_indices] - addr.set_async(ary=addr_cpu, stream=self.queue) - - error_state[update_indices] = err_sum[update_indices] + def log_likelihood(self, b_aux, addr, mag, mask, err_phot): + ferr = self.gpu.ferr + self.log_likelihood_cuda(np.int32(self.nmodes), + b_aux, + mask, + mag, + addr, + ferr, + np.int32(self.fshape[1]), + np.int32(self.fshape[2]), + block=(32, 32, 1), + grid=(int(mag.shape[0]), 1, 1), + stream=self.queue) + # TODO: we might want to move this call outside of here + self.error_reduce(addr, err_phot) def update_addr_and_error_state(self, addr, error_state, mangled_addr, err_sum): # assume all data is on GPU! diff --git a/ptypy/engines/ML.py b/ptypy/engines/ML.py index b66ac639c..e0059ca59 100644 --- a/ptypy/engines/ML.py +++ b/ptypy/engines/ML.py @@ -19,7 +19,7 @@ from ..utils import parallel from .utils import Cnorm2, Cdot from . import register -from .base import PositionCorrectionEngine +from .base import BaseEngine from ..core.manager import Full, Vanilla, Bragg3dModel, BlockVanilla, BlockFull # for debugging @@ -29,7 +29,7 @@ @register() -class ML(PositionCorrectionEngine): +class ML(BaseEngine): """ Maximum likelihood reconstruction engine. @@ -166,7 +166,6 @@ def engine_initialize(self): """ Prepare for ML reconstruction. """ - super(ML, self).engine_initialize() # Object gradient and minimization direction self.ob_grad = self.ob.copy(self.ob.ID + '_grad', fill=0.) @@ -338,7 +337,6 @@ def engine_finalize(self): """ Delete temporary containers. """ - super(ML, self).engine_finalize() del self.ptycho.containers[self.ob_grad.ID] del self.ob_grad del self.ptycho.containers[self.ob_grad_new.ID] diff --git a/ptypy/engines/base.py b/ptypy/engines/base.py index 18f18b65e..174628af4 100644 --- a/ptypy/engines/base.py +++ b/ptypy/engines/base.py @@ -13,7 +13,7 @@ from .. import utils as u from ..utils import parallel from ..utils.verbose import logger, headerline, log -from .posref import AnnealingRefine +from .posref import AnnealingRefine, GridSearchRefine __all__ = ['BaseEngine', 'Base3dBraggEngine', 'DEFAULT_iter_info', 'PositionCorrectionEngine'] @@ -314,6 +314,11 @@ class PositionCorrectionEngine(BaseEngine): type = Param, bool help = If True refine scan positions + [position_refinement.method] + default = Annealing + type = str + help = Annealing or GridSearch + [position_refinement.start] default = None type = int @@ -357,6 +362,11 @@ class PositionCorrectionEngine(BaseEngine): help = record movement of positions """ + POSREF_ENGINES = { + "Annealing": AnnealingRefine, + "GridSearch": GridSearchRefine + } + def __init__(self, ptycho_parent, pars): """ Position Correction engine. 
@@ -386,17 +396,17 @@ def engine_initialize(self): self.do_position_refinement = False else: self.do_position_refinement = True - log(3, "Initialising position refinement") + log(3, "Initialising position refinement (%s)" %self.p.position_refinement.method) # Enlarge object arrays, # This can be skipped though if the boundary is less important for name, s in self.ob.storages.items(): - s.padding = int(self.p.position_refinement.max_shift // np.max(s.psize)) - s.reformat() + s.padding = int(self.p.position_refinement.max_shift // np.max(s.psize)) + s.reformat() - # this could be some kind of dictionary lookup if we want to add more - self.position_refinement = AnnealingRefine(self.p.position_refinement, self.ob, metric=self.p.position_refinement.metric) - log(3, "Position refinement initialised") + # Choose position refinement engine from dictionary + PosrefEngine = self.POSREF_ENGINES[self.p.position_refinement.method] + self.position_refinement = PosrefEngine(self.p.position_refinement, self.ob, metric=self.p.position_refinement.metric) self.ptycho.citations.add_article(**self.position_refinement.citation_dictionary) if self.p.position_refinement.stop is None: self.p.position_refinement.stop = self.p.numiter diff --git a/ptypy/engines/posref.py b/ptypy/engines/posref.py index c0f12a857..af27cdaf1 100644 --- a/ptypy/engines/posref.py +++ b/ptypy/engines/posref.py @@ -41,14 +41,61 @@ def update_constraints(self, iteration): iteration : int The current iteration of the engine. ''' + start, end = self.p.start, self.p.stop + # Compute the maximum shift allowed at this iteration + self.max_shift_dist = self.p.amplitude * (end - iteration) / (end - start) - raise NotImplementedError('This method needs to be overridden in order to position correct') + def estimate_fourier_metric(self, di_view, obj): + ''' + Calculates error based on DM fourier error estimate. + + Parameters + ---------- + di_view : ptypy.core.classes.View + A diffraction view for which we wish to calculate the error. + + obj : numpy.ndarray + The current calculated object for which we wish to evaluate the error against. + Returns + ------- + np.float + The calculated fourier error + ''' + af2 = np.zeros_like(di_view.data) + for name, pod in di_view.pods.items(): + af2 += pod.downsample(u.abs2(pod.fw(pod.probe*obj))) + return np.sum(di_view.pod.mask * (np.sqrt(af2) - np.sqrt(np.abs(di_view.data)))**2) / di_view.pod.mask.sum() + + def estimate_photon_metric(self, di_view, obj): + ''' + Calculates error based on reduced likelihood estimate. + + Parameters + ---------- + di_view : ptypy.core.classes.View + A diffraction view for which we wish to calculate the error. + + obj : numpy.ndarray + The current calculated object for which we wish to evaluate the error against. + Returns + ------- + np.float + The calculated fourier error + ''' + af2 = np.zeros_like(di_view.data) + for name, pod in di_view.pods.items(): + af2 += pod.downsample(u.abs2(pod.fw(pod.probe*obj))) + return (np.sum(di_view.pod.mask * (af2 - di_view.data)**2 / (di_view.data + 1.)) / np.prod(af2.shape)) def cleanup(self): ''' Cleans up every iteration ''' + @property + def citation_dictionary(self): + return {} + class AnnealingRefine(PositionRefine): @@ -85,48 +132,6 @@ def __init__(self, position_refinement_parameters, Cobj, metric="fourier"): else: raise NotImplementedError("Metric %s is currently not implemented" %metric) - def estimate_fourier_metric(self, di_view, obj): - ''' - Calculates error based on DM fourier error estimate. 
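# Worked example (illustrative numbers only) of the linear shift schedule that
# update_constraints now provides in the PositionRefine base class; the values below
# mirror the templates further down in this patch set (amplitude=5e-7, start=50, stop=950).
amplitude, start, stop = 5e-7, 50, 950
for iteration in (50, 500, 949):
    max_shift_dist = amplitude * (stop - iteration) / (stop - start)
    print(iteration, max_shift_dist)
# 50 -> 5.0e-07 (full amplitude), 500 -> 2.5e-07, 949 -> ~5.6e-10 (shrinks towards zero at stop)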
- - Parameters - ---------- - di_view : ptypy.core.classes.View - A diffraction view for which we wish to calculate the error. - - obj : numpy.ndarray - The current calculated object for which we wish to evaluate the error against. - Returns - ------- - np.float - The calculated fourier error - ''' - af2 = np.zeros_like(di_view.data) - for name, pod in di_view.pods.items(): - af2 += pod.downsample(u.abs2(pod.fw(pod.probe*obj))) - return np.sum(di_view.pod.mask * (np.sqrt(af2) - np.sqrt(np.abs(di_view.data)))**2) - - def estimate_photon_metric(self, di_view, obj): - ''' - Calculates error based on reduced likelihood estimate. - - Parameters - ---------- - di_view : ptypy.core.classes.View - A diffraction view for which we wish to calculate the error. - - obj : numpy.ndarray - The current calculated object for which we wish to evaluate the error against. - Returns - ------- - np.float - The calculated fourier error - ''' - af2 = np.zeros_like(di_view.data) - for name, pod in di_view.pods.items(): - af2 += pod.downsample(u.abs2(pod.fw(pod.probe*obj))) - return (np.sum(di_view.pod.mask * (af2 - di_view.data)**2 / (di_view.data + 1.)) / np.prod(af2.shape)) - def update_view_position(self, di_view): ''' Refines the positions by the following algorithm: @@ -189,24 +194,118 @@ def update_view_position(self, di_view): error = new_error coord = new_coord log(4, "Position correction: %s, coord: %s, delta: %s" % (di_view.ID, coord, delta)) - + ob_view.coord = coord ob_view.storage.update_views(ob_view) return coord - initial_coord - def update_constraints(self, iteration): + @property + def citation_dictionary(self): + return { + "title" : 'An annealing algorithm to correct positioning errors in ptychography', + "author" : 'Maiden et al.', + "journal" : 'Ultramicroscopy', + "volume" : 120, + "year" : 2012, + "page" : 64, + "doi" : '10.1016/j.ultramic.2012.06.001', + "comment" : 'Position Refinement using annealing algorithm'} + +class GridSearchRefine(PositionRefine): + + def __init__(self, position_refinement_parameters, Cobj, metric="fourier"): ''' + Grid Search Position Refinement. - Parameters ---------- - iteration : int - The current iteration of the engine. + position_refinement_parameters : ptypy.utils.parameters.Param + The parameter tree for the refinement + + Cobj : ptypy.core.classes.Container + The current pbject container object + metric : str + "fourier" or "photon" ''' + super(GridSearchRefine, self).__init__(position_refinement_parameters) - start, end = self.p.start, self.p.stop + self.Cobj = Cobj # take a reference here. It would be cool if we could make this read-only or something - # Compute the maximum shift allowed at this iteration - self.max_shift_dist = self.p.amplitude * (end - iteration) / (end - start) + # Updated before each iteration by self.update_constraints + self.max_shift_dist = None + + # Choose metric for fourier error + if metric == "fourier": + self.fourier_error = self.estimate_fourier_metric + elif metric == "photon": + self.fourier_error = self.estimate_photon_metric + else: + raise NotImplementedError("Metric %s is currently not implemented" %metric) + + def update_view_position(self, di_view): + ''' + Refines the positions by the following algorithm: + + Calculates all shifts in a given radius around the original position and calculates the fourier error. + If the fourier error decreased the calculated postion will be used as new position. + + Parameters + ---------- + di_view : ptypy.core.classes.View + A diffraction view that we wish to refine. 
+ + Returns + ------- + numpy.ndarray + A length 2 numpy array with the position increments for x and y co-ordinates respectively + ''' + # there might be more than one object view + ob_view = di_view.pod.ob_view + + initial_coord = ob_view.coord.copy() + coord = initial_coord + psize = ob_view.psize.copy() + + # if you cannot move far, do nothing + if np.max(psize) >= self.max_shift_dist: + return np.zeros((2,)) + + # This can be optimized by saving existing iteration fourier error... + error = self.fourier_error(di_view, ob_view.data) + + max_shift_pix = self.max_shift_dist // np.min(psize) + max_bound_pix = self.p.max_shift // np.min(psize) + + # Create the search grid + deltas = np.mgrid[-max_shift_pix:max_shift_pix+1:1, + -max_shift_pix:max_shift_pix+1:1] + within_bound = (deltas[0]**2 + deltas[1]**2) < (max_bound_pix**2) + deltas = (deltas[:,within_bound] * np.min(psize)).T + + for i in range(deltas.shape[0]): + # Current shift + delta = deltas[i] + + # Move view to new position + new_coord = initial_coord + delta + ob_view.coord = new_coord + ob_view.storage.update_views(ob_view) + data = ob_view.data + + # catch bad slicing + if not np.allclose(data.shape, ob_view.shape): + continue + + new_error = self.fourier_error(di_view, data) + + if new_error < error: + # keep + error = new_error + coord = new_coord + log(4, "Position correction: %s, coord: %s, delta: %s" % (di_view.ID, coord, delta)) + + ob_view.coord = coord + ob_view.storage.update_views(ob_view) + return coord - initial_coord @property def citation_dictionary(self): @@ -218,4 +317,4 @@ def citation_dictionary(self): "year" : 2012, "page" : 64, "doi" : '10.1016/j.ultramic.2012.06.001', - "comment" : 'Position Refinement using annealing algorithm'} + "comment" : 'Position Refinement using annealing algorithm'} \ No newline at end of file diff --git a/templates/position_refinement.py b/templates/position_refinement_DM.py similarity index 80% rename from templates/position_refinement.py rename to templates/position_refinement_DM.py index c3a348c24..052b4b679 100644 --- a/templates/position_refinement.py +++ b/templates/position_refinement_DM.py @@ -15,7 +15,8 @@ # set home path p.io = u.Param() p.io.home = "/tmp/ptypy/" -p.io.autosave = u.Param() +p.io.autosave = u.Param(active=False) +p.io.interaction = u.Param(active=False) # max 200 frames (128x128px) of diffraction data p.scans = u.Param() @@ -41,15 +42,15 @@ p.engines.engine00 = u.Param() p.engines.engine00.name = 'DM' p.engines.engine00.probe_support = 1 -# p.engines.engine00.probe_center_tol = 0.5 p.engines.engine00.numiter = 1000 p.engines.engine00.position_refinement = u.Param() p.engines.engine00.position_refinement.start = 50 -p.engines.engine00.position_refinement.stop = 990 +p.engines.engine00.position_refinement.stop = 950 p.engines.engine00.position_refinement.interval = 10 p.engines.engine00.position_refinement.nshifts = 32 -p.engines.engine00.position_refinement.amplitude = 1e-6 -p.engines.engine00.position_refinement.max_shift = 2e-6 +p.engines.engine00.position_refinement.amplitude = 5e-7 +p.engines.engine00.position_refinement.max_shift = 1e-6 +p.engines.engine00.position_refinement.method = "GridSearch" # prepare and run P = Ptycho(p, level=4) @@ -58,26 +59,24 @@ a = 0. 
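# Standalone sketch of the candidate grid that GridSearchRefine.update_view_position builds:
# every integer-pixel shift within the current search radius, kept inside the circular bound
# set by position_refinement.max_shift and scaled back to metres. Pixel size and limits here
# are made-up numbers chosen for illustration.
import numpy as np
psize = np.array([1e-8, 1e-8])       # assumed 10 nm object pixels
max_shift_dist = 5e-8                # per-iteration search radius (from update_constraints)
max_shift = 1e-7                     # hard bound from position_refinement.max_shift
max_shift_pix = max_shift_dist // np.min(psize)
max_bound_pix = max_shift // np.min(psize)
deltas = np.mgrid[-max_shift_pix:max_shift_pix+1:1,
                  -max_shift_pix:max_shift_pix+1:1]
within_bound = (deltas[0]**2 + deltas[1]**2) < (max_bound_pix**2)
deltas = (deltas[:, within_bound] * np.min(psize)).T
print(deltas.shape)                  # (121, 2) candidate shifts in metres for these numbers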
coords = [] +coords_start = [] for pname, pod in P.pods.items(): + # Save real position coords.append(np.copy(pod.ob_view.coord)) before = pod.ob_view.coord psize = pod.pr_view.psize - # print(pname) - # print(before) perturbation = psize * ((3e-7 * np.array([np.sin(a), np.cos(a)])) // psize) - new_coord = before + perturbation # make sure integer number of pixels shift - - pod.ob_view.coord = new_coord - - #pod.diff *= np.random.uniform(0.1,1)y + coords_start.append(np.copy(pod.ob_view.coord)) + #pod.diff *= np.random.uniform(0.1,1) a += 4. np.savetxt("positions_theory.txt", coords) +np.savetxt("positions_start.txt", coords_start) P.obj.reformat() - # Run P.run() +P.finalize() diff --git a/templates/position_refinement_DM_pycuda.py b/templates/position_refinement_DM_pycuda.py new file mode 100644 index 000000000..ac51ef337 --- /dev/null +++ b/templates/position_refinement_DM_pycuda.py @@ -0,0 +1,93 @@ +""" +This script is a test for ptychographic reconstruction in the absence +of actual data. It uses the test Scan class +`ptypy.core.data.MoonFlowerScan` to provide "data". +""" + +import numpy as np +from ptypy.core import Ptycho +from ptypy import utils as u + +from ptypy.accelerate.cuda_pycuda.engines import DM_pycuda_stream, DM_pycuda_streams, DM_pycuda + +p = u.Param() + +# for verbose output +p.verbose_level = 3 +p.frames_per_block = 100 +# set home path +p.io = u.Param() +p.io.home = "/tmp/ptypy/" +p.io.autosave = u.Param(active=True, interval=500) +p.io.autoplot = u.Param(active=False)#True, interval=100) + +# max 200 frames (128x128px) of diffraction data +p.scans = u.Param() +p.scans.MF = u.Param() +# now you have to specify which ScanModel to use with scans.XX.name, +# just as you have to give 'name' for engines and PtyScan subclasses. +p.scans.MF.name = 'BlockFull' # or 'Full' +p.scans.MF.data= u.Param() +p.scans.MF.data.name = 'MoonFlowerScan' +p.scans.MF.data.shape = 128 +p.scans.MF.data.num_frames = 200 +p.scans.MF.data.save = None + +p.scans.MF.illumination = u.Param(diversity=None) +p.scans.MF.coherence = u.Param(num_probe_modes=1) +# p.scans.MF.illumination.diversity=u.Param() +# p.scans.MF.illumination.diversity.power = 0.1 +# p.scans.MF.illumination.diversity.noise = (np.pi, 3.0) +# position distance in fraction of illumination frame +p.scans.MF.data.density = 0.2 +# total number of photon in empty beam +p.scans.MF.data.photons = 1e8 +# Gaussian FWHM of possible detector blurring +p.scans.MF.data.psf = 0. +#p.scans.MF.data.add_poisson_noise = False + +# attach a reconstrucion engine +p.engines = u.Param() +p.engines.engine00 = u.Param() +p.engines.engine00.name = 'DM_pycuda' +p.engines.engine00.probe_support = 1 +p.engines.engine00.numiter = 1000 +p.engines.engine00.numiter_contiguous = 10 +p.engines.engine00.position_refinement = u.Param() +p.engines.engine00.position_refinement.start = 50 +p.engines.engine00.position_refinement.stop = 950 +p.engines.engine00.position_refinement.interval = 10 +p.engines.engine00.position_refinement.nshifts = 32 +p.engines.engine00.position_refinement.amplitude = 5e-7 +p.engines.engine00.position_refinement.max_shift = 1e-6 +p.engines.engine00.position_refinement.method = "GridSearch" + +# prepare and run +P = Ptycho(p, level=4) + +# Mess up the positions +a = 0. 
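# Worked example of the perturbation line used in these templates: the nominal 0.3 um offset
# is floor-divided by the pixel size, so the applied shift is always a whole number of pixels
# (illustrative pixel size; a=0 gives a pure shift along one axis).
import numpy as np
a = 0.
psize = np.array([1e-8, 1e-8])       # assumed 10 nm pixels
perturbation = psize * ((3e-7 * np.array([np.sin(a), np.cos(a)])) // psize)
print(perturbation)                  # 0 pixels in y, ~30 pixels (about 0.3 um) in x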
+ +coords = [] +coords_start = [] +for pname, pod in P.pods.items(): + + # Save real position + coords.append(np.copy(pod.ob_view.coord)) + before = pod.ob_view.coord + psize = pod.pr_view.psize + perturbation = psize * ((3e-7 * np.array([np.sin(a), np.cos(a)])) // psize) + new_coord = before + perturbation # make sure integer number of pixels shift + pod.ob_view.coord = new_coord + coords_start.append(np.copy(pod.ob_view.coord)) + #pod.diff *= np.random.uniform(0.1,1)y + a += 4. + +np.savetxt("positions_theory.txt", coords) +np.savetxt("positions_start", coords_start) +P.obj.reformat()# update the object storage + +# Run +P.run() +P.finalize() + diff --git a/templates/position_refinement_DM_serial.py b/templates/position_refinement_DM_serial.py index 523dfd486..6c5584cfd 100644 --- a/templates/position_refinement_DM_serial.py +++ b/templates/position_refinement_DM_serial.py @@ -8,7 +8,6 @@ from ptypy.core import Ptycho from ptypy import utils as u -from ptypy.accelerate.cuda_pycuda.engines import DM_pycuda_stream, DM_pycuda_streams, DM_pycuda from ptypy.accelerate.base.engines import DM_serial @@ -16,12 +15,13 @@ # for verbose output p.verbose_level = 3 -p.frames_per_block = 300 +p.frames_per_block = 100 # set home path p.io = u.Param() p.io.home = "~/dumps/ptypy/" p.io.autosave = u.Param(active=True, interval=500) p.io.autoplot = u.Param(active=False)#True, interval=100) +p.io.interaction = u.Param(active=False) # max 200 frames (128x128px) of diffraction data p.scans = u.Param() @@ -32,7 +32,7 @@ p.scans.MF.data= u.Param() p.scans.MF.data.name = 'MoonFlowerScan' p.scans.MF.data.shape = 128 -p.scans.MF.data.num_frames = 2000 +p.scans.MF.data.num_frames = 200 p.scans.MF.data.save = None p.scans.MF.illumination = u.Param(diversity=None) @@ -43,25 +43,26 @@ # position distance in fraction of illumination frame p.scans.MF.data.density = 0.2 # total number of photon in empty beam -p.scans.MF.data.photons = 1e6 +p.scans.MF.data.photons = 1e8 # Gaussian FWHM of possible detector blurring p.scans.MF.data.psf = 0. -p.scans.MF.data.add_poisson_noise = False - +#p.scans.MF.data.add_poisson_noise = False # attach a reconstrucion engine p.engines = u.Param() p.engines.engine00 = u.Param() -p.engines.engine00.name = 'DM_pycuda' -p.engines.engine00.numiter = 1000 +p.engines.engine00.name = 'DM_serial' +p.engines.engine00.probe_support = 1 +p.engines.engine00.numiter = 100 p.engines.engine00.numiter_contiguous = 10 p.engines.engine00.position_refinement = u.Param() p.engines.engine00.position_refinement.start = 50 p.engines.engine00.position_refinement.stop = 950 p.engines.engine00.position_refinement.interval = 10 -p.engines.engine00.position_refinement.nshifts = 16 -p.engines.engine00.position_refinement.amplitude = 1e-6 -p.engines.engine00.position_refinement.max_shift = 2e-6 +p.engines.engine00.position_refinement.nshifts = 32 +p.engines.engine00.position_refinement.amplitude = 5e-7 +p.engines.engine00.position_refinement.max_shift = 1e-6 +p.engines.engine00.position_refinement.method = "GridSearch" # prepare and run P = Ptycho(p, level=4) @@ -70,23 +71,25 @@ a = 0. 
coords = [] +coords_start = [] for pname, pod in P.pods.items(): + # Save real position coords.append(np.copy(pod.ob_view.coord)) before = pod.ob_view.coord psize = pod.pr_view.psize - perturbation = psize * ((3e-7 * np.array([np.sin(a), np.cos(a)])) // psize) new_coord = before + perturbation # make sure integer number of pixels shift pod.ob_view.coord = new_coord - + coords_start.append(np.copy(pod.ob_view.coord)) #pod.diff *= np.random.uniform(0.1,1)y a += 4. -# np.savetxt("positions_theory.txt", coords) +np.savetxt("positions_theory.txt", coords) +np.savetxt("positions_start.txt", coords_start) P.obj.reformat()# update the object storage - # Run P.run() +P.finalize() diff --git a/test/accelerate_tests/base_tests/address_manglers_test.py b/test/accelerate_tests/base_tests/address_manglers_test.py index 11af45e42..7e27c885a 100644 --- a/test/accelerate_tests/base_tests/address_manglers_test.py +++ b/test/accelerate_tests/base_tests/address_manglers_test.py @@ -1,7 +1,7 @@ import unittest import sys import numpy as np -from ptypy.accelerate.base.address_manglers import RandomIntMangle +from ptypy.accelerate.base.address_manglers import BaseMangler, RandomIntMangler COMPLEX_TYPE = np.complex64 FLOAT_TYPE = np.float32 @@ -16,14 +16,8 @@ def setUp(self): def tearDown(self): np.set_printoptions() - def test_addr_original_set(self): - - max_bound = 10 - step_size = 3 - scan_pts = 2 + def prepare_addresses(self, max_bound=10, scan_pts=2, num_modes=3): total_number_scan_positions = scan_pts ** 2 - num_modes = 3 - X, Y = np.meshgrid(range(scan_pts), range(scan_pts)) X = X.reshape((total_number_scan_positions)) + max_bound # max bound is added in the DM_serial engine. Y = Y.reshape((total_number_scan_positions)) + max_bound @@ -44,26 +38,59 @@ def test_addr_original_set(self): mode_idx += 1 exit_idx += 1 position_idx += 1 + + return addr_original - print(repr(addr_original)) - - old_positions = np.zeros((total_number_scan_positions)) - - differences_from_original = np.zeros((len(addr_original), 2)) - differences_from_original[::2] = 12 # so definitely more than the max_bound - new_positions = addr_original[:, 0, 1, 1:] + differences_from_original - - mangler = RandomIntMangle(step_size, 50, 100, max_bound=max_bound, ) - - - mangler.apply_bounding_box(new_positions, old_positions, addr_original) - print(repr(new_positions)) - expected_new_positions = new_positions[:] - expected_new_positions[::2] = 0 - - print(repr(expected_new_positions)) - - np.testing.assert_array_equal(expected_new_positions, new_positions) - - + def test_apply_bounding_box(self): + scan_pts=2 + max_bound=10 + addr = self.prepare_addresses(scan_pts=scan_pts, max_bound=max_bound) + step_size = 3 + + mangler = BaseMangler(step_size, 50, 100, nshifts=1, max_bound=max_bound, ) + min_oby = 1 + max_oby = 10 + min_obx = 2 + max_obx = 9 + mangler.apply_bounding_box(addr[:,:,1,1], min_oby, max_oby) + mangler.apply_bounding_box(addr[:,:,1,2], min_obx, max_obx) + + np.testing.assert_array_less(addr[:,:,1,1], max_oby+1) + np.testing.assert_array_less(addr[:,:,1,2], max_obx+1) + np.testing.assert_array_less(min_oby-1, addr[:,:,1,1]) + np.testing.assert_array_less(min_obx-1, addr[:,:,1,2]) + + + def test_get_address(self): + # the other manglers are using the BaseMangler's get_address function + # so we set the deltas in a BaseMangler object and test get_address + + scan_pts=2 + addr_original = self.prepare_addresses(scan_pts=scan_pts) + total_number_scan_positions = scan_pts ** 2 + addr1 = np.copy(addr_original) + addr2 = 
np.copy(addr_original) + nshifts=1 + step_size=2 + mglr = BaseMangler(step_size, 50, 100, nshifts, max_bound=2) + # 2 shifts, with positive/negative shifting + mglr.delta = np.array([ + [1, 2], + [-4, -2] + ]) + mglr.get_address(0, addr_original, addr1, 10, 9) + mglr.get_address(1, addr_original, addr2, 10, 9) + + exp1 = np.copy(addr_original) + exp2 = np.copy(addr_original) + # element-wise here to prepare reference + for f in range(addr_original.shape[0]): + for m in range(addr_original.shape[1]): + exp1[f, m, 1, 1] = max(0, min(10, addr_original[f, m, 1, 1] + 1)) + exp1[f, m, 1, 2] = max(0, min(9, addr_original[f, m, 1, 2] + 2)) + exp2[f, m, 1, 1] = max(0, min(10, addr_original[f, m, 1, 1] - 4)) + exp2[f, m, 1, 2] = max(0, min(9, addr_original[f, m, 1, 2] - 2)) + + np.testing.assert_array_equal(addr1, exp1) + np.testing.assert_array_equal(addr2, exp2) diff --git a/test/accelerate_tests/cuda_pycuda_tests/address_manglers_test.py b/test/accelerate_tests/cuda_pycuda_tests/address_manglers_test.py new file mode 100644 index 000000000..2704dcf97 --- /dev/null +++ b/test/accelerate_tests/cuda_pycuda_tests/address_manglers_test.py @@ -0,0 +1,77 @@ +import unittest +import numpy as np +from . import perfrun, PyCudaTest, have_pycuda + +if have_pycuda(): + from pycuda import gpuarray + from ptypy.accelerate.base import address_manglers as am + from ptypy.accelerate.cuda_pycuda import address_manglers as gam + + +COMPLEX_TYPE = np.complex64 +FLOAT_TYPE = np.float32 +INT_TYPE = np.int32 + +class AddressManglersTest(PyCudaTest): + + def prepare_addresses(self, max_bound=10, scan_pts=2, num_modes=3): + total_number_scan_positions = scan_pts ** 2 + X, Y = np.meshgrid(range(scan_pts), range(scan_pts)) + X = X.reshape((total_number_scan_positions)) + max_bound # max bound is added in the DM_serial engine. 
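# The clamping rule that these address-mangler tests expect from BaseMangler.get_address,
# written out as a tiny standalone helper (a sketch, not library code): each shifted address
# is clipped to the valid object frame, here with the upper bounds 10 (rows) and 9 (columns)
# passed to get_address in the tests.
def clamp_shift(pos, delta, upper):
    return max(0, min(upper, pos + delta))

assert clamp_shift(3, +2, 9) == 5
assert clamp_shift(3, -4, 9) == 0     # clipped at the lower edge
assert clamp_shift(9, +4, 10) == 10   # clipped at the upper edge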
+ Y = Y.reshape((total_number_scan_positions)) + max_bound + + addr_original = np.zeros((total_number_scan_positions, num_modes, 5, 3), dtype=INT_TYPE) + + exit_idx = 0 + position_idx = 0 + for xpos, ypos in zip(X, Y): # + mode_idx = 0 + for pr_mode in range(num_modes): + for ob_mode in range(1): + addr_original[position_idx, mode_idx] = np.array([[pr_mode, 0, 0], + [ob_mode, ypos, xpos], + [exit_idx, 0, 0], + [0, 0, 0], + [0, 0, 0]], dtype=INT_TYPE) + mode_idx += 1 + exit_idx += 1 + position_idx += 1 + + return addr_original + + def test_get_address_REGRESSION(self): + # the other manglers are using the BaseMangler's get_address function + # so we set the deltas in a BaseMangler object and test get_address + + scan_pts=2 + addr_original = self.prepare_addresses(scan_pts=scan_pts) + addr_original_dev = gpuarray.to_gpu(addr_original) + nshifts=1 + step_size=2 + mglr = gam.BaseMangler(step_size, 50, 100, nshifts, max_bound=2) + # 2 shifts, with positive/negative shifting + mglr.delta = np.array([ + [1, 2], + [-4, -2] + ], dtype=np.int32) + mglr._setup_delta_gpu() + + addr1 = addr_original_dev.copy() + mglr.get_address(0, addr_original_dev, addr1, 10, 9) + + addr2 = addr_original_dev.copy() + mglr.get_address(1, addr_original_dev, addr2, 10, 9) + + exp1 = np.copy(addr_original) + exp2 = np.copy(addr_original) + # element-wise here to prepare reference + for f in range(addr_original.shape[0]): + for m in range(addr_original.shape[1]): + exp1[f, m, 1, 1] = max(0, min(10, addr_original[f, m, 1, 1] + 1)) + exp1[f, m, 1, 2] = max(0, min(9, addr_original[f, m, 1, 2] + 2)) + exp2[f, m, 1, 1] = max(0, min(10, addr_original[f, m, 1, 1] - 4)) + exp2[f, m, 1, 2] = max(0, min(9, addr_original[f, m, 1, 2] - 2)) + + np.testing.assert_array_equal(addr2.get(), exp2) + np.testing.assert_array_equal(addr1.get(), exp1) + From 13cffa5efea979e4914b7701f9e8f568bf023ed2 Mon Sep 17 00:00:00 2001 From: Benedikt Daurer Date: Thu, 1 Apr 2021 10:31:21 +0100 Subject: [PATCH 45/56] no need to test for ML with position refinement --- test/engine_tests/ML_test.py | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/test/engine_tests/ML_test.py b/test/engine_tests/ML_test.py index b7ae3525e..fd95b816e 100644 --- a/test/engine_tests/ML_test.py +++ b/test/engine_tests/ML_test.py @@ -12,21 +12,6 @@ class MLTest(unittest.TestCase): - def test_ML_farfield_position_refinement(self): - engine_params = u.Param() - engine_params.name = 'ML' - engine_params.numiter = 5 - engine_params.probe_update_start = 2 - engine_params.floating_intensities = False - engine_params.intensity_renormalization = 1.0 - engine_params.reg_del2 =True - engine_params.reg_del2_amplitude = 0.01 - engine_params.smooth_gradient = 0.0 - engine_params.scale_precond =False - engine_params.probe_update_start = 0 - engine_params.position_refinement = True - tu.EngineTestRunner(engine_params) - def test_ML_farfield_floating_intensities(self): engine_params = u.Param() engine_params.name = 'ML' From b134bc6e9e8033e5ccb3d51bad3a451fb4c6fcd9 Mon Sep 17 00:00:00 2001 From: Benedikt Daurer Date: Thu, 1 Apr 2021 11:02:15 +0100 Subject: [PATCH 46/56] archived extensions.py --- extensions.py => archive/cuda_extension/extensions.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename extensions.py => archive/cuda_extension/extensions.py (100%) diff --git a/extensions.py b/archive/cuda_extension/extensions.py similarity index 100% rename from extensions.py rename to archive/cuda_extension/extensions.py From 84665b5c064e7e6291c15e812d7ceb34fc7b61f0 Mon Sep 17 
00:00:00 2001 From: "Benedikt J. Daurer" Date: Thu, 1 Apr 2021 18:44:30 +0100 Subject: [PATCH 47/56] Gpu smoothing fix (#314) * work in progress refactoring of convolution kernel * tests for gaussian smoothing are passing now * integrating new smoothing kernels into engines * create the tmp array if not given * avoid repeatedly creating tmp array Co-authored-by: Jorg Lotze --- ptypy/accelerate/cuda_pycuda/__init__.py | 2 +- ptypy/accelerate/cuda_pycuda/array_utils.py | 90 ++++++++++---- .../cuda_pycuda/engines/DM_pycuda.py | 5 +- .../cuda_pycuda/engines/DM_pycuda_stream.py | 6 +- .../cuda_pycuda/engines/DM_pycuda_streams.py | 8 +- .../cuda_pycuda/engines/ML_pycuda.py | 8 +- templates/minimal_prep_and_run_ML_pycuda.py | 15 ++- .../cuda_pycuda_tests/array_utils_test.py | 116 +++++++++--------- 8 files changed, 144 insertions(+), 106 deletions(-) diff --git a/ptypy/accelerate/cuda_pycuda/__init__.py b/ptypy/accelerate/cuda_pycuda/__init__.py index 677a641f0..e6c51d49f 100644 --- a/ptypy/accelerate/cuda_pycuda/__init__.py +++ b/ptypy/accelerate/cuda_pycuda/__init__.py @@ -3,7 +3,7 @@ import numpy as np import os # debug_options = [] -# debug_options = ['-O0', '-G', '-g', ] +# debug_options = ['-O0', '-G', '-g'] debug_options = ['-O3', '-DNDEBUG', '-lineinfo'] # release mode flags # C++14 support was added with CUDA 9, so we only enable the flag there diff --git a/ptypy/accelerate/cuda_pycuda/array_utils.py b/ptypy/accelerate/cuda_pycuda/array_utils.py index e953ed39d..00cecac0f 100644 --- a/ptypy/accelerate/cuda_pycuda/array_utils.py +++ b/ptypy/accelerate/cuda_pycuda/array_utils.py @@ -1,5 +1,6 @@ from . import load_kernel from pycuda import gpuarray +import pycuda.driver as cuda from ptypy.utils import gaussian import numpy as np @@ -354,25 +355,44 @@ def __init__(self, queue=None, num_stdevs=4, kernel_type='float'): # At least 2 blocks per SM self.max_shared_per_block = 48 * 1024 // 2 self.max_shared_per_block_complex = self.max_shared_per_block / 2 * np.dtype(np.float32).itemsize - self.max_kernel_radius = self.max_shared_per_block_complex / self.blockdim_y - - self.convolution_row = load_kernel("convolution_row", file="convolution.cu", subs={ - 'BDIM_X': self.blockdim_x, - 'BDIM_Y': self.blockdim_y, - 'DTYPE': self.stype, - 'MATH_TYPE': self.kernel_type + self.max_kernel_radius = int(self.max_shared_per_block_complex / self.blockdim_y) + + self.convolution_row = load_kernel( + "convolution_row", file="convolution.cu", subs={ + 'BDIM_X': self.blockdim_x, + 'BDIM_Y': self.blockdim_y, + 'DTYPE': self.stype, + 'MATH_TYPE': self.kernel_type }) - self.convolution_col = load_kernel("convolution_col", file="convolution.cu", subs={ - 'BDIM_X': self.blockdim_y, - 'BDIM_Y': self.blockdim_x, - 'DTYPE': self.stype, - 'MATH_TYPE': self.kernel_type + self.convolution_col = load_kernel( + "convolution_col", file="convolution.cu", subs={ + 'BDIM_X': self.blockdim_y, # NOTE: we swap x and y in this columns + 'BDIM_Y': self.blockdim_x, + 'DTYPE': self.stype, + 'MATH_TYPE': self.kernel_type }) + # pre-allocate kernel memory on gpu, with max-radius to accomodate + dtype=np.float32 if self.kernel_type == 'float' else np.float64 + self.kernel_gpu = gpuarray.empty((self.max_kernel_radius,), dtype=dtype) + # keep track of previus radius and std to determine if we need to transfer again + self.r = 0 + self.std = 0 - def convolution(self, input, output, mfs): - ndims = input.ndim - shape = input.shape + def convolution(self, data, mfs, tmp=None): + """ + Calculates a stacked 2D convolution for smoothing, with the 
standard deviations + given in mfs (stdx, stdy). It works in-place in the data array, + and tmp is a gpu-allocated array of the same size and type as data, + used internally for temporary storage + """ + ndims = data.ndim + shape = data.shape + + # Create temporary array (if not given) + if tmp is None: + tmp = gpuarray.empty(shape, dtype=data.dtype) + assert shape == tmp.shape and data.dtype == tmp.dtype # Check input dimensions if ndims == 3: @@ -389,15 +409,23 @@ def convolution(self, input, output, mfs): else: raise NotImplementedError("input needs to be of dimensions 0 < ndims <= 3") + input = data + output = tmp + # Row convolution kernel # TODO: is this threshold acceptable in all cases? if stdx > 0.1: r = int(self.num_stdevs * stdx + 0.5) - g = gaussian(np.arange(-r,r+1), stdx) - g /= g.sum() - kernel = gpuarray.to_gpu(g[r:].astype(np.float32 if self.kernel_type == 'float' else np.float64)) if r > self.max_kernel_radius: raise ValueError("Size of Gaussian kernel too large") + if r != self.r or stdx != self.std: + # recalculate + transfer + g = gaussian(np.arange(-r,r+1), stdx) + g /= g.sum() + k = np.ascontiguousarray(g[r:].astype(np.float32 if self.kernel_type == 'float' else np.float64)) + self.kernel_gpu[:r+1] = k[:] + self.r = r + self.std = stdx bx = self.blockdim_x by = self.blockdim_y @@ -408,21 +436,27 @@ def convolution(self, input, output, mfs): blk = (bx, by, 1) grd = (int((y + bx -1)// bx), int((x + by-1)// by), batches) - self.convolution_row(input, output, np.int32(y), np.int32(x), kernel, np.int32(r), + self.convolution_row(input, output, np.int32(y), np.int32(x), self.kernel_gpu, np.int32(r), block=blk, grid=grd, shared=shared, stream=self.queue) - # Overwrite input input = output - + output = data + # Column convolution kernel # TODO: is this threshold acceptable in all cases? if stdy > 0.1: r = int(self.num_stdevs * stdy + 0.5) - g = gaussian(np.arange(-r,r+1), stdy) - g /= g.sum() - kernel = gpuarray.to_gpu(g[r:].astype(np.float32 if self.kernel_type == 'float' else np.float64)) if r > self.max_kernel_radius: raise ValueError("Size of Gaussian kernel too large") + if r != self.r or stdy != self.std: + # recalculate + transfer + g = gaussian(np.arange(-r,r+1), stdy) + g /= g.sum() + k = np.ascontiguousarray(g[r:].astype(np.float32 if self.kernel_type == 'float' else np.float64)) + self.kernel_gpu[:r+1] = k[:] + self.r = r + self.std = stdy + bx = self.blockdim_y by = self.blockdim_x @@ -433,9 +467,13 @@ def convolution(self, input, output, mfs): blk = (bx, by, 1) grd = (int((y + bx -1)// bx), int((x + by-1)// by), batches) - self.convolution_col(input, output, np.int32(y), np.int32(x), kernel, np.int32(r), + self.convolution_col(input, output, np.int32(y), np.int32(x), self.kernel_gpu, np.int32(r), block=blk, grid=grd, shared=shared, stream=self.queue) # TODO: is this threshold acceptable in all cases? 
if (stdx <= 0.1 and stdy <= 0.1): - output[:] = input[:] + return # nothing to do + elif (stdx > 0.1 and stdy > 0.1): + return # both parts have run, output is back in data + else: + data[:] = tmp[:] # only one of them has run, output is in tmp diff --git a/ptypy/accelerate/cuda_pycuda/engines/DM_pycuda.py b/ptypy/accelerate/cuda_pycuda/engines/DM_pycuda.py index 21afc30fa..2a07edf3b 100644 --- a/ptypy/accelerate/cuda_pycuda/engines/DM_pycuda.py +++ b/ptypy/accelerate/cuda_pycuda/engines/DM_pycuda.py @@ -359,11 +359,10 @@ def object_update(self, MPI=False): cfact = self.ob_cfact[oID] if self.p.obj_smooth_std is not None: + obb = self.ob_buf.S[oID] logger.info('Smoothing object, cfact is %.2f' % cfact) smooth_mfs = [self.p.obj_smooth_std, self.p.obj_smooth_std] - ob_gpu_tmp = gpuarray.empty(ob.shape, dtype=np.complex64) - self.GSK.convolution(ob.gpu, ob_gpu_tmp, smooth_mfs) - ob.gpu = ob_gpu_tmp + self.GSK.convolution(ob.gpu, smooth_mfs, tmp=obb.gpu) ob.gpu *= cfact obn.gpu.fill(cfact) diff --git a/ptypy/accelerate/cuda_pycuda/engines/DM_pycuda_stream.py b/ptypy/accelerate/cuda_pycuda/engines/DM_pycuda_stream.py index 602715849..a9ad7fac7 100644 --- a/ptypy/accelerate/cuda_pycuda/engines/DM_pycuda_stream.py +++ b/ptypy/accelerate/cuda_pycuda/engines/DM_pycuda_stream.py @@ -157,14 +157,14 @@ def engine_iterate(self, num=1): for oID, ob in self.ob.storages.items(): cfact = self.ob_cfact[oID] obn = self.ob_nrm.S[oID] - obb = self.ob_buf.S[oID] if self.p.obj_smooth_std is not None: + obb = self.ob_buf.S[oID] logger.info('Smoothing object, cfact is %.2f' % cfact) smooth_mfs = [self.p.obj_smooth_std, self.p.obj_smooth_std] - self.GSK.convolution(ob.gpu, obb.gpu, smooth_mfs) + self.GSK.convolution(ob.gpu, smooth_mfs, tmp=obb.gpu) # obb.gpu[:] = ob.gpu * cfactf32 - ob.gpu._axpbz(np.complex64(cfact), 0, obb.gpu, stream=self.queue) + ob.gpu._axpbz(np.complex64(cfact), 0, ob.gpu, stream=self.queue) obn.gpu.fill(np.float32(cfact), stream=self.queue) diff --git a/ptypy/accelerate/cuda_pycuda/engines/DM_pycuda_streams.py b/ptypy/accelerate/cuda_pycuda/engines/DM_pycuda_streams.py index 4f797ed39..3bc019d67 100644 --- a/ptypy/accelerate/cuda_pycuda/engines/DM_pycuda_streams.py +++ b/ptypy/accelerate/cuda_pycuda/engines/DM_pycuda_streams.py @@ -257,14 +257,14 @@ def engine_iterate(self, num=1): for oID, ob in self.ob.storages.items(): cfact = self.ob_cfact[oID] obn = self.ob_nrm.S[oID] - obb = self.ob_buf.S[oID] - + if self.p.obj_smooth_std is not None: logger.info('Smoothing object, cfact is %.2f' % cfact) + obb = self.ob_buf.S[oID] smooth_mfs = [self.p.obj_smooth_std, self.p.obj_smooth_std] - self.GSK.convolution(ob.gpu, obb.gpu, smooth_mfs) + self.GSK.convolution(ob.gpu, smooth_mfs, tmp=obb.gpu) - obb.gpu._axpbz(np.complex64(cfact), 0, obb.gpu, stream=streamdata.queue) + ob.gpu._axpbz(np.complex64(cfact), 0, ob.gpu, stream=streamdata.queue) obn.gpu.fill(np.float32(cfact), stream=streamdata.queue) self.ex_data.syncback = True diff --git a/ptypy/accelerate/cuda_pycuda/engines/ML_pycuda.py b/ptypy/accelerate/cuda_pycuda/engines/ML_pycuda.py index 1b859fb66..5f36b9121 100644 --- a/ptypy/accelerate/cuda_pycuda/engines/ML_pycuda.py +++ b/ptypy/accelerate/cuda_pycuda/engines/ML_pycuda.py @@ -159,6 +159,7 @@ def engine_initialize(self): self.queue_transfer = cuda.Stream() self.GSK = GaussianSmoothingKernel(queue=self.queue) + self.GSK.tmp = None super().engine_initialize() #self._setup_kernels() @@ -242,9 +243,10 @@ def _set_pr_ob_ref_for_data(self, dev='gpu', container=None, sync_copy=False): 
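# Minimal usage sketch of the reworked smoothing interface (illustrative array sizes; assumes
# a working CUDA device and context, e.g. via pycuda.autoinit). The smoothed result ends up
# back in the input array, matching how the engines above call it with the object buffer as
# temporary storage.
import numpy as np
import pycuda.autoinit               # creates a default CUDA context
from pycuda import gpuarray
from ptypy.accelerate.cuda_pycuda.array_utils import GaussianSmoothingKernel

data = gpuarray.to_gpu((np.random.rand(4, 128, 128)
                        + 1j * np.random.rand(4, 128, 128)).astype(np.complex64))
GSK = GaussianSmoothingKernel()
GSK.convolution(data, [2.0, 2.0])    # in-place; the tmp buffer is allocated internally
smoothed = data.get()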
self._set_pr_ob_ref_for_data(dev=dev, container=container, sync_copy=sync_copy) def _get_smooth_gradient(self, data, sigma): - tmp = gpuarray.empty(data.shape, dtype=np.complex64) - self.GSK.convolution(data, tmp, [sigma, sigma]) - return tmp + if self.GSK.tmp is None: + self.GSK.tmp = gpuarray.empty(data.shape, dtype=np.complex64) + self.GSK.convolution(data, [sigma, sigma], tmp=self.GSK.tmp) + return data def _replace_ob_grad(self): new_ob_grad = self.ob_grad_new diff --git a/templates/minimal_prep_and_run_ML_pycuda.py b/templates/minimal_prep_and_run_ML_pycuda.py index a66f39825..4b0dd5f51 100644 --- a/templates/minimal_prep_and_run_ML_pycuda.py +++ b/templates/minimal_prep_and_run_ML_pycuda.py @@ -11,7 +11,7 @@ p = u.Param() # for verbose output -p.verbose_level = 2 +p.verbose_level = 3 p.frames_per_block = 400 # set home path p.io = u.Param() @@ -27,7 +27,7 @@ p.scans.MF.data= u.Param() p.scans.MF.data.name = 'MoonFlowerScan' p.scans.MF.data.shape = 128 -p.scans.MF.data.num_frames = 600 +p.scans.MF.data.num_frames = 100 p.scans.MF.data.save = None p.scans.MF.illumination = u.Param(diversity=None) @@ -43,15 +43,14 @@ p.engines = u.Param() p.engines.engine00 = u.Param() p.engines.engine00.name = 'ML_pycuda' -p.engines.engine00.numiter = 10 +p.engines.engine00.numiter = 300 p.engines.engine00.numiter_contiguous = 5 p.engines.engine00.reg_del2 = True # Whether to use a Gaussian prior (smoothing) regularizer p.engines.engine00.reg_del2_amplitude = 1. # Amplitude of the Gaussian prior if used -p.engines.engine00.floating_intensities = True - +p.engines.engine00.scale_precond = True +p.engines.engine00.smooth_gradient = 20. +p.engines.engine00.smooth_gradient_decay = 1/50. +p.engines.engine00.floating_intensities = False # prepare and run P = Ptycho(p,level=5) -#P.run() -P.print_stats() -#u.pause(10) diff --git a/test/accelerate_tests/cuda_pycuda_tests/array_utils_test.py b/test/accelerate_tests/cuda_pycuda_tests/array_utils_test.py index 9823f2a9b..d511bec36 100644 --- a/test/accelerate_tests/cuda_pycuda_tests/array_utils_test.py +++ b/test/accelerate_tests/cuda_pycuda_tests/array_utils_test.py @@ -128,124 +128,124 @@ def test_transpose_4D(self): def test_complex_gaussian_filter_1d_no_blurring_UNITY(self): # Arrange - inp = np.zeros((11,), dtype=np.complex64) - inp[5] = 1.0 +1.0j + data = np.zeros((11,), dtype=np.complex64) + data[5] = 1.0 +1.0j mfs = [0] - inp_dev = gpuarray.to_gpu(inp) - out_dev = gpuarray.empty((11,), dtype=np.complex64) + data_dev = gpuarray.to_gpu(data) + tmp_dev = gpuarray.empty((11,), dtype=np.complex64) # Act GS = gau.GaussianSmoothingKernel() - GS.convolution(inp_dev, out_dev, mfs) + GS.convolution(data_dev, mfs, tmp=tmp_dev) # Assert - out_exp = au.complex_gaussian_filter(inp, mfs) - out = out_dev.get() + out_exp = au.complex_gaussian_filter(data, mfs) + out = data_dev.get() self.assertTrue(np.testing.assert_allclose(out_exp, out, rtol=1e-5) is None) def test_complex_gaussian_filter_1d_little_blurring_UNITY(self): # Arrange - inp = np.zeros((11,), dtype=np.complex64) - inp[5] = 1.0 +1.0j + data = np.zeros((11,), dtype=np.complex64) + data[5] = 1.0 +1.0j mfs = [0.2] - inp_dev = gpuarray.to_gpu(inp) - out_dev = gpuarray.empty((11,), dtype=np.complex64) + data_dev = gpuarray.to_gpu(data) + tmp_dev = gpuarray.empty((11,), dtype=np.complex64) # Act GS = gau.GaussianSmoothingKernel() - GS.convolution(inp_dev, out_dev, mfs) + GS.convolution(data_dev, mfs, tmp=tmp_dev) # Assert - out_exp = au.complex_gaussian_filter(inp, mfs) - out = out_dev.get() + out_exp = 
au.complex_gaussian_filter(data, mfs) + out = data_dev.get() np.testing.assert_allclose(out_exp, out, rtol=1e-5) def test_complex_gaussian_filter_1d_more_blurring_UNITY(self): # Arrange - inp = np.zeros((11,), dtype=np.complex64) - inp[5] = 1.0 +1.0j + data = np.zeros((11,), dtype=np.complex64) + data[5] = 1.0 +1.0j mfs = [2.0] - inp_dev = gpuarray.to_gpu(inp) - out_dev = gpuarray.empty((11,), dtype=np.complex64) + data_dev = gpuarray.to_gpu(data) + tmp_dev = gpuarray.empty((11,), dtype=np.complex64) # Act GS = gau.GaussianSmoothingKernel() - GS.convolution(inp_dev, out_dev, mfs) + GS.convolution(data_dev, mfs, tmp=tmp_dev) # Assert - out_exp = au.complex_gaussian_filter(inp, mfs) - out = out_dev.get() + out_exp = au.complex_gaussian_filter(data, mfs) + out = data_dev.get() np.testing.assert_allclose(out_exp, out, rtol=1e-5) def test_complex_gaussian_filter_2d_no_blurring_UNITY(self): # Arrange - inp = np.zeros((11, 11), dtype=np.complex64) - inp[5, 5] = 1.0+1.0j + data = np.zeros((11, 11), dtype=np.complex64) + data[5, 5] = 1.0+1.0j mfs = 0.0,0.0 - inp_dev = gpuarray.to_gpu(inp) - out_dev = gpuarray.empty((11,11), dtype=np.complex64) + data_dev = gpuarray.to_gpu(data) + tmp_dev = gpuarray.empty((11,11), dtype=np.complex64) # Act GS = gau.GaussianSmoothingKernel() - GS.convolution(inp_dev, out_dev, mfs) + GS.convolution(data_dev, mfs, tmp=tmp_dev) # Assert - out_exp = au.complex_gaussian_filter(inp, mfs) - out = out_dev.get() + out_exp = au.complex_gaussian_filter(data, mfs) + out = data_dev.get() np.testing.assert_allclose(out_exp, out, rtol=1e-5) def test_complex_gaussian_filter_2d_little_blurring_UNITY(self): # Arrange - inp = np.zeros((11, 11), dtype=np.complex64) - inp[5, 5] = 1.0+1.0j + data = np.zeros((11, 11), dtype=np.complex64) + data[5, 5] = 1.0+1.0j mfs = 0.2,0.2 - inp_dev = gpuarray.to_gpu(inp) - out_dev = gpuarray.empty((11,11),dtype=np.complex64) + data_dev = gpuarray.to_gpu(data) + tmp_dev = gpuarray.empty((11,11),dtype=np.complex64) # Act GS = gau.GaussianSmoothingKernel() - GS.convolution(inp_dev, out_dev, mfs) + GS.convolution(data_dev, mfs, tmp=tmp_dev) # Assert - out_exp = au.complex_gaussian_filter(inp, mfs) - out = out_dev.get() + out_exp = au.complex_gaussian_filter(data, mfs) + out = data_dev.get() np.testing.assert_allclose(out_exp, out, rtol=1e-5) def test_complex_gaussian_filter_2d_more_blurring_UNITY(self): # Arrange - inp = np.zeros((8, 8), dtype=np.complex64) - inp[3:5, 3:5] = 2.0+2.0j + data = np.zeros((8, 8), dtype=np.complex64) + data[3:5, 3:5] = 2.0+2.0j mfs = 3.0,4.0 - inp_dev = gpuarray.to_gpu(inp) - out_dev = gpuarray.empty((8,8), dtype=np.complex64) + data_dev = gpuarray.to_gpu(data) + #tmp_dev = gpuarray.empty((8,8), dtype=np.complex64) # Act GS = gau.GaussianSmoothingKernel() - GS.convolution(inp_dev, out_dev, mfs) + GS.convolution(data_dev, mfs) # Assert - out_exp = au.complex_gaussian_filter(inp, mfs) - out = out_dev.get() + out_exp = au.complex_gaussian_filter(data, mfs) + out = data_dev.get() np.testing.assert_allclose(out_exp, out, rtol=1e-4) def test_complex_gaussian_filter_2d_nonsquare_UNITY(self): # Arrange - inp = np.zeros((32, 16), dtype=np.complex64) - inp[3:4, 11:12] = 2.0+2.0j - inp[3:5, 3:5] = 2.0+2.0j - inp[20:25,3:5] = 2.0+2.0j + data = np.zeros((32, 16), dtype=np.complex64) + data[3:4, 11:12] = 2.0+2.0j + data[3:5, 3:5] = 2.0+2.0j + data[20:25,3:5] = 2.0+2.0j mfs = 1.0,1.0 - inp_dev = gpuarray.to_gpu(inp) - out_dev = gpuarray.empty(inp.shape, dtype=np.complex64) + data_dev = gpuarray.to_gpu(data) + tmp_dev = 
gpuarray.empty(data_dev.shape, dtype=np.complex64) # Act GS = gau.GaussianSmoothingKernel() - GS.convolution(inp_dev, out_dev, mfs) + GS.convolution(data_dev, mfs, tmp=tmp_dev) # Assert - out_exp = au.complex_gaussian_filter(inp, mfs) - out = out_dev.get() + out_exp = au.complex_gaussian_filter(data, mfs) + out = data_dev.get() np.testing.assert_allclose(out_exp, out, rtol=1e-4) @@ -254,19 +254,19 @@ def test_complex_gaussian_filter_2d_batched(self): batch_number = 2 A = 5 B = 5 - inp = np.zeros((batch_number, A, B), dtype=np.complex64) - inp[:, 2:3, 2:3] = 2.0+2.0j + data = np.zeros((batch_number, A, B), dtype=np.complex64) + data[:, 2:3, 2:3] = 2.0+2.0j mfs = 3.0,4.0 - inp_dev = gpuarray.to_gpu(inp) - out_dev = gpuarray.empty((batch_number,A,B), dtype=np.complex64) + data_dev = gpuarray.to_gpu(data) + tmp_dev = gpuarray.empty((batch_number,A,B), dtype=np.complex64) # Act GS = gau.GaussianSmoothingKernel() - GS.convolution(inp_dev, out_dev, mfs) + GS.convolution(data_dev, mfs, tmp=tmp_dev) # Assert - out_exp = au.complex_gaussian_filter(inp, mfs) - out = out_dev.get() + out_exp = au.complex_gaussian_filter(data, mfs) + out = data_dev.get() np.testing.assert_allclose(out_exp, out, rtol=1e-4) From 9dbf5ffbbc18cc50a97272fb20953ae1b94e10d6 Mon Sep 17 00:00:00 2001 From: Jorg Lotze Date: Thu, 1 Apr 2021 20:37:15 +0100 Subject: [PATCH 48/56] Precompile cufft during setup to avoid MPI failures and speed up execution (#313) * updates to imported FFT to compile all supported sizes into the same module * integrating filtered_cufft in setup.py * cleanup and re-organising file locations * fixing typos * made cufft extension module optional in setup.py (enabled by default for now) * replaced cmdline flag with try/except * moved setupext into accelerate folder * move extension back to root level, improved build message Co-authored-by: Benedikt Daurer --- .../import_fft.py => extensions.py | 99 ++++--------------- .../cuda/filtered_fft/filtered_fft.cu | 54 ++++++---- .../cuda/filtered_fft/filtered_fft.h | 1 + .../cuda_pycuda/cuda/filtered_fft/module.cpp | 9 +- ptypy/accelerate/cuda_pycuda/cufft.py | 11 ++- setup.py | 43 +++++++- .../cuda_pycuda_tests/fft_accuracy_test.py | 48 --------- .../fft_tests/cufft_init_test.py | 28 ++++++ .../fft_tests/fft_accuracy_test.py | 4 +- .../fft_tests/fft_import_fft_test.py | 27 ----- 10 files changed, 138 insertions(+), 186 deletions(-) rename ptypy/accelerate/cuda_pycuda/import_fft.py => extensions.py (54%) delete mode 100644 test/accelerate_tests/cuda_pycuda_tests/fft_accuracy_test.py create mode 100644 test/accelerate_tests/cuda_pycuda_tests/fft_tests/cufft_init_test.py delete mode 100644 test/accelerate_tests/cuda_pycuda_tests/fft_tests/fft_import_fft_test.py diff --git a/ptypy/accelerate/cuda_pycuda/import_fft.py b/extensions.py similarity index 54% rename from ptypy/accelerate/cuda_pycuda/import_fft.py rename to extensions.py index 6a3d3312e..c36483e09 100644 --- a/ptypy/accelerate/cuda_pycuda/import_fft.py +++ b/extensions.py @@ -1,18 +1,9 @@ ''' -"Just-in-time" compilation for callbacks in cufft. +Compilation tools for Nvidia builds of extension modules. 
''' import os -import sys -import importlib -import tempfile -import setuptools import sysconfig -from pycuda import driver as cuda_driver import pybind11 -import contextlib -from io import StringIO -from ptypy.utils.verbose import log -import distutils from distutils.unixccompiler import UnixCCompiler from distutils.command.build_ext import build_ext @@ -59,8 +50,14 @@ def __init__(self, *args, **kwargs): super(NvccCompiler, self).__init__(*args, **kwargs) self.CUDA = locate_cuda() module_dir = os.path.join(__file__.strip('import_fft.py'), 'cuda', 'filtered_fft') - cmp = cuda_driver.Context.get_device().compute_capability() - archflag = '-arch=sm_{}{}'.format(cmp[0], cmp[1]) + # by default, compile for all of these + archflag = '-gencode=arch=compute_50,code=sm_50' + \ + ' -gencode=arch=compute_52,code=sm_52' + \ + ' -gencode=arch=compute_60,code=sm_60' + \ + ' -gencode=arch=compute_61,code=sm_61' + \ + ' -gencode=arch=compute_70,code=sm_70' + \ + ' -gencode=arch=compute_75,code=sm_75' + \ + ' -gencode=arch=compute_75,code=compute_75' self.src_extensions.append('.cu') self.LD_FLAGS = [archflag, "-lcufft_static", "-lculibos", "-ldl", "-lrt", "-lpthread", "-cudart shared"] self.NVCC_FLAGS = ["-dc", archflag] @@ -102,75 +99,17 @@ def link(self, target_desc, objects, self.linker_so = default_linker_so class CustomBuildExt(build_ext): - def build_extensions(self): - old_compiler = self.compiler - self.compiler = NvccCompiler(verbose=old_compiler.verbose, - dry_run=old_compiler.dry_run, - force=old_compiler.force) # this is our bespoke compiler - super(CustomBuildExt, self).build_extensions() - self.compiler=old_compiler -@contextlib.contextmanager -def stdchannel_redirected(stdchannel): - """ - Redirects stdout or stderr to a StringIO object. As of python 3.4, there is a - standard library contextmanager for this, but backwards compatibility! - """ - old = getattr(sys, stdchannel) - try: - s = StringIO() - setattr(sys, stdchannel, s) - yield s - finally: - setattr(sys, stdchannel, old) - - -class ImportFFT: - def __init__(self, rows, columns, build_path=None, quiet=True): - self.build_path = build_path - self.cleanup_build_path = None - if self.build_path is None: - self.build_path = tempfile.mkdtemp(prefix="ptypy_fft") - self.cleanup_build_path = True - - full_module_name = "module" - module_dir = os.path.join(__file__.strip('import_fft.py'), 'cuda', 'filtered_fft') - # If we specify the libraries through the extension we soon run into trouble since distutils adds a -l infront of all of these (add_library_option:https://github.com/python/cpython/blob/1c1e68cf3e3a2a19a0edca9a105273e11ddddc6e/Lib/distutils/ccompiler.py#L1115) - ext = distutils.extension.Extension(full_module_name, - sources=[os.path.join(module_dir, "module.cpp"), - os.path.join(module_dir, "filtered_fft.cu")], - extra_compile_args=["-DMY_FFT_COLS=%s" % str(columns) , "-DMY_FFT_ROWS=%s" % str(rows)]) - - script_args = ['build_ext', - '--build-temp=%s' % self.build_path, - '--build-lib=%s' % self.build_path] - # do I need full_module_name here? 
- setuptools_args = {"name": full_module_name, - "ext_modules": [ext], - "script_args": script_args, - "cmdclass":{"build_ext": CustomBuildExt - }} - - if quiet: - # we really don't care about the make print for almost all cases so we redirect - with stdchannel_redirected("stdout"): - with stdchannel_redirected("stderr"): - setuptools.setup(**setuptools_args) + def build_extension(self, ext): + has_cu = any([src.endswith('.cu') for src in ext.sources]) + if has_cu: + old_compiler = self.compiler + self.compiler = NvccCompiler(verbose=old_compiler.verbose, + dry_run=old_compiler.dry_run, + force=old_compiler.force) # this is our bespoke compiler + super(CustomBuildExt, self).build_extension(ext) + self.compiler=old_compiler else: - setuptools.setup(**setuptools_args) - - spec = importlib.util.spec_from_file_location(full_module_name, - os.path.join(self.build_path, - "module" + distutils.sysconfig.get_config_var('EXT_SUFFIX') - ) - ) - self.mod = importlib.util.module_from_spec(spec) + super(CustomBuildExt, self).build_extension(ext) - def get_mod(self): - return self.mod - def __del__(self): - import shutil - if self.cleanup_build_path: - log(5, "cleaning up the build directory") - shutil.rmtree(self.build_path) diff --git a/ptypy/accelerate/cuda_pycuda/cuda/filtered_fft/filtered_fft.cu b/ptypy/accelerate/cuda_pycuda/cuda/filtered_fft/filtered_fft.cu index bb152466a..586d7f356 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/filtered_fft/filtered_fft.cu +++ b/ptypy/accelerate/cuda_pycuda/cuda/filtered_fft/filtered_fft.cu @@ -29,18 +29,6 @@ #include #include -#ifndef MY_FFT_ROWS -# define MY_FFT_ROWS 128 -# pragma GCC warning "MY_FFT_ROWS not set in preprocessor - defaulting to 128" -#endif - -#ifndef MY_FFT_COLS -# define MY_FFT_COLS 128 -# pragma GCC warning "MY_FFT_COLS not set in preprocessor - defaulting to 128" -#endif - - - template class FilteredFFTImpl : public FilteredFFT { public: @@ -274,9 +262,37 @@ void FilteredFFTImpl::setupPlan() { } } +template +static FilteredFFT* make(int batches, int rows, int cols, complex* prefilt, complex* postfilt, + cudaStream_t stream) +{ + // we only support rows / colums are equal and powers of 2, from 16x16 to 512x512 + if (rows != cols) + throw std::runtime_error("Only equal numbers of rows and columns are supported"); + switch (rows) + { + case 16: return new FilteredFFTImpl<16, 16, SYMMETRIC, FORWARD>(batches, prefilt, postfilt, stream); + case 32: return new FilteredFFTImpl<32, 32, SYMMETRIC, FORWARD>(batches, prefilt, postfilt, stream); + case 64: return new FilteredFFTImpl<64, 64, SYMMETRIC, FORWARD>(batches, prefilt, postfilt, stream); + case 128: return new FilteredFFTImpl<128, 128, SYMMETRIC, FORWARD>(batches, prefilt, postfilt, stream); + case 256: return new FilteredFFTImpl<256, 256, SYMMETRIC, FORWARD>(batches, prefilt, postfilt, stream); + case 512: return new FilteredFFTImpl<512, 512, SYMMETRIC, FORWARD>(batches, prefilt, postfilt, stream); + case 1024: return new FilteredFFTImpl<1024, 1024, SYMMETRIC, FORWARD>(batches, prefilt, postfilt, stream); + case 2048: return new FilteredFFTImpl<2048, 2048, SYMMETRIC, FORWARD>(batches, prefilt, postfilt, stream); + default: throw std::runtime_error("Only powers of 2 from 16 to 2048 are supported"); + } +} + //////////// Factory Functions for Python -FilteredFFT* make_filtered(int batches, bool symmetricScaling, +// Note: This will instantiate templates for 8 powers of 2, with 4 combinations of forward/reverse, symmetric/not, +// i.e. 32 different FFTs into the binary. 
Compile time might be quite long, but we intend to do this once +// during installation + +FilteredFFT* make_filtered( + int batches, + int rows, int cols, + bool symmetricScaling, bool isForward, complex* prefilt, complex* postfilt, cudaStream_t stream) @@ -284,21 +300,17 @@ FilteredFFT* make_filtered(int batches, bool symmetricScaling, if (symmetricScaling) { if (isForward) { - return new FilteredFFTImpl(batches, - prefilt, postfilt, stream); + return make(batches, rows, cols, prefilt, postfilt, stream); } else { - return new FilteredFFTImpl(batches, - prefilt, postfilt, stream); + return make(batches, rows, cols, prefilt, postfilt, stream); } } else { if (isForward) { - return new FilteredFFTImpl(batches, - prefilt, postfilt, stream); + return make(batches, rows, cols, prefilt, postfilt, stream); } else { - return new FilteredFFTImpl(batches, - prefilt, postfilt, stream); + return make(batches, rows, cols, prefilt, postfilt, stream); } } diff --git a/ptypy/accelerate/cuda_pycuda/cuda/filtered_fft/filtered_fft.h b/ptypy/accelerate/cuda_pycuda/cuda/filtered_fft/filtered_fft.h index fd153f768..9afa4e119 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/filtered_fft/filtered_fft.h +++ b/ptypy/accelerate/cuda_pycuda/cuda/filtered_fft/filtered_fft.h @@ -23,6 +23,7 @@ class FilteredFFT { // Note that cudaStream_t (runtime API) and CUStream (driver API) are // the same type FilteredFFT* make_filtered(int batches, + int rows, int columns, bool symmetricScaling, bool isForward, complex* prefilt, complex* postfilt, diff --git a/ptypy/accelerate/cuda_pycuda/cuda/filtered_fft/module.cpp b/ptypy/accelerate/cuda_pycuda/cuda/filtered_fft/module.cpp index 186d40cb2..3eb0eb37e 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/filtered_fft/module.cpp +++ b/ptypy/accelerate/cuda_pycuda/cuda/filtered_fft/module.cpp @@ -13,7 +13,7 @@ class FilteredFFTPython { public: - FilteredFFTPython(int batches, bool symmetric, + FilteredFFTPython(int batches, int rows, int columns, bool symmetric, bool is_forward, std::size_t prefilt_ptr, std::size_t postfilt_ptr, @@ -21,6 +21,7 @@ class FilteredFFTPython { fft_ = make_filtered( batches, + rows, columns, symmetric, is_forward, reinterpret_cast*>(prefilt_ptr), @@ -70,12 +71,14 @@ class FilteredFFTPython namespace py = pybind11; -PYBIND11_MODULE(module, m) { +PYBIND11_MODULE(filtered_cufft, m) { m.doc() = "Filtered FFT for PtyPy"; py::class_(m, "FilteredFFT", py::module_local()) - .def(py::init(), + .def(py::init(), py::arg("batches"), + py::arg("rows"), + py::arg("columns"), py::arg("symmetricScaling"), py::arg("is_forward"), py::arg("prefilt"), diff --git a/ptypy/accelerate/cuda_pycuda/cufft.py b/ptypy/accelerate/cuda_pycuda/cufft.py index 605e90d43..686171342 100644 --- a/ptypy/accelerate/cuda_pycuda/cufft.py +++ b/ptypy/accelerate/cuda_pycuda/cufft.py @@ -17,6 +17,10 @@ def __init__(self, array, queue=None, if dims < 2: raise AssertionError('Input array must be at least 2-dimensional') self.arr_shape = (array.shape[-2], array.shape[-1]) + rows = self.arr_shape[0] + columns = self.arr_shape[1] + if rows != columns or rows not in [16, 32, 64, 128, 256, 512, 1024, 2048]: + raise ValueError("CUDA FFT only supports powers of 2 for rows/columns, from 16 to 2048") self.batches = int(np.product(array.shape[0:dims-2]) if dims > 2 else 1) self.forward = forward @@ -34,10 +38,11 @@ def _load(self, array, pre_fft, post_fft, symmetric, forward): else: self.post_fft_ptr = 0 - from . 
import import_fft - mod = import_fft.ImportFFT(self.arr_shape[0], self.arr_shape[1]).get_mod() - self.fftobj = mod.FilteredFFT( + from ptypy import filtered_cufft + self.fftobj = filtered_cufft.FilteredFFT( self.batches, + self.arr_shape[0], + self.arr_shape[1], symmetric, forward, self.pre_fft_ptr, diff --git a/setup.py b/setup.py index 43940038c..83d5b9a89 100644 --- a/setup.py +++ b/setup.py @@ -1,7 +1,11 @@ #!/usr/bin/env python +# we should aim to remove the distutils dependency +import distutils import setuptools #, setuptools.command.build_ext from distutils.core import setup +import os +import sys CLASSIFIERS = """\ Development Status :: 3 - Alpha @@ -62,6 +66,38 @@ def write_version_py(filename='ptypy/version.py'): except: vers = VERSION +ext_modules = [] +cmdclass = {} +# filtered Cuda FFT extension module +""" +Alternative options for this switch: + +1. Put the cufft extension module as a separate python package with its own setup.py and + put an optional dependency into ptypy (extras_require={ "cufft": ["pybind11"] }), so that + when users do pip install ptypy it installs it without that dependency, and if users do + pip install ptypy[cufft] it installs the optional dependency module + +2. Use an environment variable to control the setting, as sqlalchemy does for its C extensions, + or detect if cuda is available on the system and enable it in this case, etc. +""" +try: + from extensions import locate_cuda # this raises an error if pybind11 is not available + CUDA = locate_cuda() # this raises an error if CUDA is not available + from extensions import CustomBuildExt + cufft_dir = os.path.join('ptypy', 'accelerate', 'cuda_pycuda', 'cuda', 'filtered_fft') + ext_modules.append( + distutils.core.Extension("ptypy.filtered_cufft", + sources=[os.path.join(cufft_dir, "module.cpp"), + os.path.join(cufft_dir, "filtered_fft.cu")] + ) + ) + cmdclass = {"build_ext": CustomBuildExt} + EXTBUILD_MESSAGE = "ptypy has been successfully installed with the pre-compiled cufft extension.\n" +except: + EXTBUILD_MESSAGE = '*' * 75 + "\n" + EXTBUILD_MESSAGE += "ptypy has been installed without the pre-compiled cufft extension.\n" + EXTBUILD_MESSAGE += "If you require cufft, make sure to have CUDA and pybind11 installed.\n" + EXTBUILD_MESSAGE += '*' * 75 + "\n" exclude_packages = [] package_list = setuptools.find_packages(exclude=exclude_packages) @@ -74,12 +110,15 @@ def write_version_py(filename='ptypy/version.py'): package_dir={'ptypy': 'ptypy'}, packages=package_list, package_data={'ptypy': ['resources/*',], - 'ptypy.accelerate.cuda_pycuda.cuda': ['*.cu'], - 'ptypy.accelerate.cuda_pycuda.cuda.filtered_fft': ['*.hpp', '*.cpp', 'Makefile', '*.cu', '*.h']}, + 'ptypy.accelerate.cuda_pycuda.cuda': ['*.cu']}, scripts=['scripts/ptypy.plot', 'scripts/ptypy.inspect', 'scripts/ptypy.plotclient', 'scripts/ptypy.new', 'scripts/ptypy.csv2cp', 'scripts/ptypy.run'], + ext_modules=ext_modules, + cmdclass=cmdclass ) + +print(EXTBUILD_MESSAGE) \ No newline at end of file diff --git a/test/accelerate_tests/cuda_pycuda_tests/fft_accuracy_test.py b/test/accelerate_tests/cuda_pycuda_tests/fft_accuracy_test.py deleted file mode 100644 index ed6929865..000000000 --- a/test/accelerate_tests/cuda_pycuda_tests/fft_accuracy_test.py +++ /dev/null @@ -1,48 +0,0 @@ -''' -''' - -import unittest -import numpy as np -import scipy.fft as fft -from . 
import PyCudaTest, have_pycuda - - -if have_pycuda(): - from pycuda import gpuarray - from ptypy.accelerate.cuda_pycuda.fft import FFT as ReiknaFFT - from ptypy.accelerate.cuda_pycuda.cufft import FFT_cuda as cuFFT - -class FftAccurracyTest(PyCudaTest): - - def gen_input(self): - rows = cols = 32 - batches = 1 - f = np.random.randn(batches, rows, cols) + 1j * np.random.randn(batches,rows, cols) - f = np.ascontiguousarray(f.astype(np.complex64)) - return f - - def test_random_cufft_fwd(self): - f = self.gen_input() - cuft = cuFFT(f, self.stream, inplace=True, pre_fft=None, post_fft=None, symmetric=None, forward=True).ft - reikft = ReiknaFFT(f, self.stream, inplace=True, pre_fft=None, post_fft=None, symmetric=False).ft - for i in range(10): - f = self.gen_input() - y = fft.fft2(f) - - x_d = gpuarray.to_gpu(f) - cuft(x_d, x_d) - y_cufft = x_d.get().reshape(y.shape) - - x_d = gpuarray.to_gpu(f) - reikft(x_d, x_d) - y_reikna = x_d.get().reshape(y.shape) - - # cufft_diff = np.max(np.abs(y_cufft - y)) - # reikna_diff = np.max(np.abs(y_reikna-y)) - # cufft_rdiff = np.max(np.abs(y_cufft - y) / np.abs(y)) - # reikna_rdiff = np.max(np.abs(y_reikna - y) / np.abs(y)) - # print('{}: {}\t{}\t{}\t{}'.format(i, cufft_diff, reikna_diff, cufft_rdiff, reikna_rdiff)) - - # Note: check if this tolerance and test case is ok - np.testing.assert_allclose(y, y_cufft, rtol=5e-5, err_msg='cuFFT error at index {}'.format(i)) - np.testing.assert_allclose(y, y_reikna, rtol=5e-5, err_msg='reikna FFT error at index {}'.format(i)) diff --git a/test/accelerate_tests/cuda_pycuda_tests/fft_tests/cufft_init_test.py b/test/accelerate_tests/cuda_pycuda_tests/fft_tests/cufft_init_test.py new file mode 100644 index 000000000..ac28436b4 --- /dev/null +++ b/test/accelerate_tests/cuda_pycuda_tests/fft_tests/cufft_init_test.py @@ -0,0 +1,28 @@ + +import unittest +from test.accelerate_tests.cuda_pycuda_tests import PyCudaTest, have_pycuda + +if have_pycuda(): + from ptypy.filtered_cufft import FilteredFFT + +class CuFFTInitTest(PyCudaTest): + + def test_import_fft(self): + ft = FilteredFFT(2, 32, 32, False, True, 0, 0, 0) + + + def test_import_fft_different_shape(self): + ft = FilteredFFT(2, 128, 128, False, True, 0, 0, 0) + + + @unittest.expectedFailure + def test_import_fft_not_square(self): + ft = FilteredFFT(2, 32, 64, False, True, 0, 0, 0) + + @unittest.expectedFailure + def test_import_fft_not_pow2(self): + ft = FilteredFFT(2, 40, 40, False, True, 0, 0, 0) + + +if __name__=="__main__": + unittest.main() diff --git a/test/accelerate_tests/cuda_pycuda_tests/fft_tests/fft_accuracy_test.py b/test/accelerate_tests/cuda_pycuda_tests/fft_tests/fft_accuracy_test.py index 9c87e34f2..7c30c3221 100644 --- a/test/accelerate_tests/cuda_pycuda_tests/fft_tests/fft_accuracy_test.py +++ b/test/accelerate_tests/cuda_pycuda_tests/fft_tests/fft_accuracy_test.py @@ -44,5 +44,5 @@ def test_random_cufft_fwd(self): # print('{}: {}\t{}\t{}\t{}'.format(i, cufft_diff, reikna_diff, cufft_rdiff, reikna_rdiff)) # Note: check if this tolerance and test case is ok - np.testing.assert_allclose(y, y_cufft, rtol=5e-5, err_msg='cuFFT error at index {}'.format(i)) - np.testing.assert_allclose(y, y_reikna, rtol=5e-5, err_msg='reikna FFT error at index {}'.format(i)) + np.testing.assert_allclose(y, y_cufft, atol=1e-6, rtol=5e-5, err_msg='cuFFT error at index {}'.format(i)) + np.testing.assert_allclose(y, y_reikna, atol=1e-6, rtol=5e-5, err_msg='reikna FFT error at index {}'.format(i)) diff --git 
a/test/accelerate_tests/cuda_pycuda_tests/fft_tests/fft_import_fft_test.py b/test/accelerate_tests/cuda_pycuda_tests/fft_tests/fft_import_fft_test.py deleted file mode 100644 index 7d60ce46a..000000000 --- a/test/accelerate_tests/cuda_pycuda_tests/fft_tests/fft_import_fft_test.py +++ /dev/null @@ -1,27 +0,0 @@ - -import unittest, pytest -from test.accelerate_tests.cuda_pycuda_tests import PyCudaTest, have_pycuda -import os, shutil -from distutils import sysconfig - -if have_pycuda(): - import pycuda.driver as cuda - from pycuda import gpuarray - from ptypy.accelerate.cuda_pycuda import import_fft - from pycuda.tools import make_default_context - -class ImportFFTTest(PyCudaTest): - - def test_import_fft(self): - import_fft.ImportFFT(32, 32) - - - def test_import_fft_different_shape(self): - import_fft.ImportFFT(128, 128) - - def test_import_fft_same_module_again(self): - import_fft.ImportFFT(32, 32) - - -if __name__=="__main__": - unittest.main() From 0fb4d94e07abe4974fc9a7fa9120e9fe9c99c99e Mon Sep 17 00:00:00 2001 From: Benedikt Daurer Date: Fri, 2 Apr 2021 13:48:32 +0100 Subject: [PATCH 49/56] needed to make changes in position correction tests --- .../position_correction_kernel_test.py | 15 ++++++++++++--- .../position_correction_kernel_test.py | 16 ++++++++++++++-- 2 files changed, 26 insertions(+), 5 deletions(-) diff --git a/test/accelerate_tests/base_tests/position_correction_kernel_test.py b/test/accelerate_tests/base_tests/position_correction_kernel_test.py index 20764e39a..117915f6b 100644 --- a/test/accelerate_tests/base_tests/position_correction_kernel_test.py +++ b/test/accelerate_tests/base_tests/position_correction_kernel_test.py @@ -6,6 +6,7 @@ import unittest import numpy as np from ptypy.accelerate.base.kernels import PositionCorrectionKernel +from ptypy import utils as u COMPLEX_TYPE = np.complex64 FLOAT_TYPE = np.float32 INT_TYPE = np.int32 @@ -16,6 +17,14 @@ class PositionCorrectionKernelTest(unittest.TestCase): def setUp(self): import sys np.set_printoptions(threshold=sys.maxsize, linewidth=np.inf) + self.params = u.Param() + self.params.nshifts = 4 + self.params.method = "Annealing" + self.params.amplitude = 2e-9 + self.params.start = 0 + self.params.stop = 10 + self.params.max_shift = 2e-9 + self.resolution = [1e-9,1e-9] def tearDown(self): np.set_printoptions() @@ -77,7 +86,7 @@ def test_build_aux(self): ''' auxiliary_wave = np.zeros((A, B, C), dtype=COMPLEX_TYPE) - PCK = PositionCorrectionKernel(auxiliary_wave, total_number_modes) + PCK = PositionCorrectionKernel(auxiliary_wave, total_number_modes, self.params, self.resolution) PCK.allocate() # doesn't actually do anything at the moment PCK.build_aux(auxiliary_wave, addr, object_array, probe) @@ -205,7 +214,7 @@ def test_fourier_error(self): mask_sum = mask.sum(-1).sum(-1) - PCK = PositionCorrectionKernel(auxiliary_wave, nmodes=total_number_modes) + PCK = PositionCorrectionKernel(auxiliary_wave, total_number_modes, self.params, self.resolution) PCK.allocate() PCK.fourier_error(auxiliary_wave, addr, fmag, mask, mask_sum) @@ -276,7 +285,7 @@ def test_error_reduce(self): addr = np.zeros((N, 1, 5, 3)) - PCK = PositionCorrectionKernel(fake_aux, nmodes=1) + PCK = PositionCorrectionKernel(fake_aux, 1, self.params, self.resolution) PCK.allocate() err_fmag = np.zeros(N, dtype=FLOAT_TYPE) PCK.error_reduce(addr, err_fmag) diff --git a/test/accelerate_tests/cuda_pycuda_tests/position_correction_kernel_test.py b/test/accelerate_tests/cuda_pycuda_tests/position_correction_kernel_test.py index a8deebdc6..7f36f138c 100644 --- 
a/test/accelerate_tests/cuda_pycuda_tests/position_correction_kernel_test.py +++ b/test/accelerate_tests/cuda_pycuda_tests/position_correction_kernel_test.py @@ -6,6 +6,7 @@ import unittest import numpy as np from . import PyCudaTest, have_pycuda +from ptypy import utils as u if have_pycuda(): from pycuda import gpuarray @@ -19,6 +20,17 @@ class PositionCorrectionKernelTest(PyCudaTest): + def setUp(self): + PyCudaTest.setUp(self) + self.params = u.Param() + self.params.nshifts = 4 + self.params.method = "Annealing" + self.params.amplitude = 2e-9 + self.params.start = 0 + self.params.stop = 10 + self.params.max_shift = 2e-9 + self.resolution = [1e-9,1e-9] + def update_addr_and_error_state_UNITY_helper(self, size, modes): ## Arrange addr = np.ones((size, modes, 5, 3), dtype=np.int32) @@ -33,9 +45,9 @@ def update_addr_and_error_state_UNITY_helper(self, size, modes): aux = np.ones((1,1,1), dtype=np.complex64) ## Act - PCK = PositionCorrectionKernel(aux, modes, queue_thread=self.stream) + PCK = PositionCorrectionKernel(aux, modes, self.params, self.resolution, queue_thread=self.stream) PCK.update_addr_and_error_state(addr_gpu, err_state_gpu, mangled_addr_gpu, err_sum_gpu) - abPCK = abPositionCorrectionKernel(aux, modes) + abPCK = abPositionCorrectionKernel(aux, modes, self.params, self.resolution) abPCK.update_addr_and_error_state(addr, err_state, mangled_addr, err_sum) ## Assert From 3383e7ac121e345794b02cadd454aae39c99f052 Mon Sep 17 00:00:00 2001 From: Jorg Lotze Date: Mon, 5 Apr 2021 19:57:00 +0100 Subject: [PATCH 50/56] Gpu NCCL wrapper (#310) * multi-GPU wrapper using NCCL for allReduce * Implementation and generalisation of the multi-gpu tests, incl. cuda-aware MPI * adding C++ MPI test for cuda-aware MPI * multi-gpu support integration in DM_pycuda_stream - work in progress * clean up and findings for multi-gpu implementation * probe allreduce and change calc on GPU for all DM engines * Change smoothing message to level 4 * use multigpu.allReduceSum in all DM engines * Moved support constraint to GPU for DM engines * Attempt to write clip_magnitudes kernel, unity test fails * Integrate clip magnitues kernel into DM engines, still off for now * working on clip magnitudes kernel * adjusting test to pass complex * use clip_object * need to pass gpu array * adding reproducer script used for reproducing nccl crash in the engines * adding dummy call to build_aux_no_ex to test * Fixing Nccl issue - Streams allocated before NCCL can't be used afterwards * move smoothing message to log level 4 * use more simple syntax for DtoD copies * this avoids clean up error when using NCCL * Use multigpu allreduce for change, clean up * remove benchmarks from pycuda engines and move most logging to level 4 * cosmetic changes Co-authored-by: Benedikt Daurer --- archive/misc/mpitest.cpp | 47 +++++ ptypy/accelerate/base/engines/DM_serial.py | 8 +- ptypy/accelerate/cuda_pycuda/array_utils.py | 22 +++ .../cuda_pycuda/cuda/clip_magnitudes.cu | 30 ++++ .../cuda_pycuda/engines/DM_pycuda.py | 161 +++++++++--------- .../cuda_pycuda/engines/DM_pycuda_stream.py | 109 ++++-------- .../cuda_pycuda/engines/DM_pycuda_streams.py | 85 ++++----- ptypy/accelerate/cuda_pycuda/kernels.py | 60 +++++-- ptypy/accelerate/cuda_pycuda/multi_gpu.py | 159 +++++++++++++++++ ptypy/engines/DM.py | 3 +- .../cuda_pycuda_tests/array_utils_test.py | 17 ++ .../cuda_pycuda_tests/multi_gpu_test.py | 85 +++++++++ 12 files changed, 556 insertions(+), 230 deletions(-) create mode 100644 archive/misc/mpitest.cpp create mode 100644 
ptypy/accelerate/cuda_pycuda/cuda/clip_magnitudes.cu create mode 100644 ptypy/accelerate/cuda_pycuda/multi_gpu.py create mode 100644 test/accelerate_tests/cuda_pycuda_tests/multi_gpu_test.py diff --git a/archive/misc/mpitest.cpp b/archive/misc/mpitest.cpp new file mode 100644 index 000000000..e4ff84577 --- /dev/null +++ b/archive/misc/mpitest.cpp @@ -0,0 +1,47 @@ +/** This is a simple C++ test to check if cuda-aware MPI works as + * expected. + * It allocates a GPU array and puts 1s into it, then sends it + * across MPI to the receiving rank, which transfers back to + * host and outputs the values. + * The expected output is: + * + * Received 1, 1 + * + * Compile with: + * mpic++ -o test mpitest.cpp -L/path/to/cuda/libs -lcudart + * + * Run with: + * mpirun -np 2 test + */ + +#include +#include +#include +#include +#include + +int main(int argc, char** argv) +{ + MPI_Init(&argc, &argv); + + int rank; + MPI_Status status; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + + if (rank == 0) { + int* d_send; + cudaMalloc((void**)&d_send, 2*sizeof(int)); + int h_send[] = {1, 1}; + cudaMemcpy(d_send, h_send, 2*sizeof(int), cudaMemcpyHostToDevice); + MPI_Send(d_send, 2, MPI_INT, 1, 99, MPI_COMM_WORLD); + std::cout << "Data has been sent...\n"; + } else if (rank == 1) { + int* d_recv; + cudaMalloc((void**)&d_recv, 2*sizeof(int)); + MPI_Recv(d_recv, 2, MPI_INT, 0, 99, MPI_COMM_WORLD, &status); + int h_recv[2]; + cudaMemcpy(h_recv, d_recv, 2*sizeof(int), cudaMemcpyDeviceToHost); + std::cout << "Received " << h_recv[0] << ", " << h_recv[1] << "\n"; + } + +} \ No newline at end of file diff --git a/ptypy/accelerate/base/engines/DM_serial.py b/ptypy/accelerate/base/engines/DM_serial.py index 563f61ea1..b5f779efc 100644 --- a/ptypy/accelerate/base/engines/DM_serial.py +++ b/ptypy/accelerate/base/engines/DM_serial.py @@ -423,8 +423,6 @@ def overlap_update(self, MPI=True): # Update probe log(4, prestr + '----- probe update -----', True) change = self.probe_update(MPI=(parallel.size > 1 and MPI)) - # change = self.probe_update(MPI=(parallel.size>1 and MPI)) - log(4, prestr + 'change in probe is %.3f' % change, True) # stop iteration if probe change is small @@ -439,7 +437,7 @@ def object_update(self, MPI=False): cfact = self.p.object_inertia * self.mean_power if self.p.obj_smooth_std is not None: - logger.info('Smoothing object, cfact is %.2f' % cfact) + log(4, 'Smoothing object, cfact is %.2f' % cfact) smooth_mfs = [self.p.obj_smooth_std, self.p.obj_smooth_std] ob.data = cfact * au.complex_gaussian_filter(ob.data, smooth_mfs) else: @@ -538,11 +536,11 @@ def probe_update(self, MPI=False): return np.sqrt(change) - def engine_finalize(self): + def engine_finalize(self, benchmark=True): """ try deleting ever helper contianer """ - if parallel.master: + if parallel.master and benchmark: print("----- BENCHMARKS ----") acc = 0. 
for name in sorted(self.benchmark.keys()): diff --git a/ptypy/accelerate/cuda_pycuda/array_utils.py b/ptypy/accelerate/cuda_pycuda/array_utils.py index 00cecac0f..85f816223 100644 --- a/ptypy/accelerate/cuda_pycuda/array_utils.py +++ b/ptypy/accelerate/cuda_pycuda/array_utils.py @@ -477,3 +477,25 @@ def convolution(self, data, mfs, tmp=None): return # both parts have run, output is back in data else: data[:] = tmp[:] # only one of them has run, output is in tmp + +class ClipMagnitudesKernel: + + def __init__(self, queue=None): + self.queue = queue + self.clip_magnitudes_cuda = load_kernel("clip_magnitudes", { + 'IN_TYPE': 'complex', + }) + + def clip_magnitudes_to_range(self, array, clip_min, clip_max): + + cmin = np.float32(clip_min) + cmax = np.float32(clip_max) + + npixel = np.int32(np.prod(array.shape)) + bx = 256 + gx = int((npixel + bx - 1) // bx) + self.clip_magnitudes_cuda(array, cmin, cmax, + npixel, + block=(bx, 1, 1), + grid=(gx, 1, 1), + stream=self.queue) diff --git a/ptypy/accelerate/cuda_pycuda/cuda/clip_magnitudes.cu b/ptypy/accelerate/cuda_pycuda/cuda/clip_magnitudes.cu new file mode 100644 index 000000000..8128091f9 --- /dev/null +++ b/ptypy/accelerate/cuda_pycuda/cuda/clip_magnitudes.cu @@ -0,0 +1,30 @@ +/** clip_magnitudes. + * + */ + #include + #include + #include + using thrust::complex; + + extern "C" __global__ void clip_magnitudes(IN_TYPE *arr, + float clip_min, + float clip_max, + int N) +{ + int id = threadIdx.x + blockIdx.x * blockDim.x; + + if (id >= N) + return; + + auto v = arr[id]; + auto mag = abs(v); + auto theta = arg(v); + + if (mag > clip_max) + mag = clip_max; + if (mag < clip_min) + mag = clip_min; + + v = thrust::polar(mag, theta); + arr[id] = v; +} \ No newline at end of file diff --git a/ptypy/accelerate/cuda_pycuda/engines/DM_pycuda.py b/ptypy/accelerate/cuda_pycuda/engines/DM_pycuda.py index 2a07edf3b..65b5edd0e 100644 --- a/ptypy/accelerate/cuda_pycuda/engines/DM_pycuda.py +++ b/ptypy/accelerate/cuda_pycuda/engines/DM_pycuda.py @@ -19,12 +19,11 @@ from ptypy.engines import register from ptypy.accelerate.base.engines import DM_serial from .. import get_context -from ..kernels import FourierUpdateKernel, AuxiliaryWaveKernel, PoUpdateKernel, PositionCorrectionKernel, PropagationKernel -from ..array_utils import ArrayUtilsKernel, GaussianSmoothingKernel, TransposeKernel +from ..kernels import FourierUpdateKernel, AuxiliaryWaveKernel, PoUpdateKernel, PositionCorrectionKernel +from ..kernels import PropagationKernel, RealSupportKernel, FourierSupportKernel +from ..array_utils import ArrayUtilsKernel, GaussianSmoothingKernel, TransposeKernel, ClipMagnitudesKernel from ..mem_utils import make_pagelocked_paired_arrays as mppa - -MPI = parallel.size > 1 -MPI = True +from ..multi_gpu import MultiGpuCommunicator __all__ = ['DM_pycuda'] @@ -61,18 +60,27 @@ def __init__(self, ptycho_parent, pars=None): Difference map reconstruction engine. """ super(DM_pycuda, self).__init__(ptycho_parent, pars) + self.multigpu = None def engine_initialize(self): """ Prepare for reconstruction. 
""" - self.context, self.queue = get_context(new_context=True, new_queue=True) - # allocator for READ only buffers - # self.const_allocator = cl.tools.ImmediateAllocator(queue, cl.mem_flags.READ_ONLY) + # Context, Multi GPU communicator and Stream (needs to be in this order) + self.context, self.queue = get_context(new_context=True, new_queue=False) + self.multigpu = MultiGpuCommunicator() + self.context, self.queue = get_context(new_context=False, new_queue=True) # Gaussian Smoothing Kernel self.GSK = GaussianSmoothingKernel(queue=self.queue) + # Real/Fourier Support Kernel + self.RSK = {} + self.FSK = {} + + # Clip Magnitudes Kernel + self.CMK = ClipMagnitudesKernel(queue=self.queue) + super(DM_pycuda, self).engine_initialize() def _setup_kernels(self): @@ -104,34 +112,34 @@ def _setup_kernels(self): kern.aux = gpuarray.to_gpu(aux) # setup kernels, one for each SCAN. - logger.info("Setting up FourierUpdateKernel") + log(4, "Setting up FourierUpdateKernel") kern.FUK = FourierUpdateKernel(aux, nmodes, queue_thread=self.queue) kern.FUK.allocate() - logger.info("Setting up PoUpdateKernel") + log(4, "Setting up PoUpdateKernel") kern.POK = PoUpdateKernel(queue_thread=self.queue) kern.POK.allocate() - logger.info("Setting up AuxiliaryWaveKernel") + log(4, "Setting up AuxiliaryWaveKernel") kern.AWK = AuxiliaryWaveKernel(queue_thread=self.queue) kern.AWK.allocate() - logger.info("Setting up ArrayUtilsKernel") + log(4, "Setting up ArrayUtilsKernel") kern.AUK = ArrayUtilsKernel(queue=self.queue) - logger.info("Setting up TransposeKernel") + log(4, "Setting up TransposeKernel") kern.TK = TransposeKernel(queue=self.queue) - logger.info("Setting up PropagationKernel") + log(4, "Setting up PropagationKernel") kern.PROP = PropagationKernel(aux, geo.propagator, self.queue, self.p.fft_lib) kern.PROP.allocate() kern.resolution = geo.resolution[0] if self.do_position_refinement: - logger.info("Setting up PositionCorrectionKernel") + log(4, "Setting up PositionCorrectionKernel") kern.PCK = PositionCorrectionKernel(aux, nmodes, self.p.position_refinement, geo.resolution, queue_thread=self.queue) kern.PCK.allocate() - logger.info("Kernel setup completed") + log(4, "Kernel setup completed") def engine_prepare(self): @@ -145,6 +153,8 @@ def engine_prepare(self): s.gpu, s.data = mppa(s.data) for name, s in self.pr.S.items(): s.gpu, s.data = mppa(s.data) + for name, s in self.pr_buf.S.items(): + s.gpu, s.data = mppa(s.data) for name, s in self.pr_nrm.S.items(): s.gpu, s.data = mppa(s.data) @@ -215,47 +225,33 @@ def engine_iterate(self, num=1): ## compute log-likelihood if self.p.compute_log_likelihood: - t1 = time.time() AWK.build_aux_no_ex(aux, addr, ob, pr) PROP.fw(aux, aux) FUK.log_likelihood(aux, addr, mag, ma, err_phot) - self.benchmark.F_LLerror += time.time() - t1 ## build auxilliary wave - t1 = time.time() AWK.build_aux(aux, addr, ob, pr, ex, alpha=self.p.alpha) - self.benchmark.A_Build_aux += time.time() - t1 ## forward FFT - t1 = time.time() PROP.fw(aux, aux) - self.benchmark.B_Prop += time.time() - t1 ## Deviation from measured data - t1 = time.time() FUK.fourier_error(aux, addr, mag, ma, ma_sum) FUK.error_reduce(addr, err_fourier) FUK.fmag_all_update(aux, addr, mag, ma, err_fourier, pbound) - self.benchmark.C_Fourier_update += time.time() - t1 ## backward FFT - t1 = time.time() PROP.bw(aux, aux) - self.benchmark.D_iProp += time.time() - t1 ## build exit wave - t1 = time.time() AWK.build_exit(aux, addr, ob, pr, ex) FUK.exit_error(aux, addr) FUK.error_reduce(addr, err_exit) - self.benchmark.E_Build_exit 
+= time.time() - t1 - - self.benchmark.calls_fourier += 1 parallel.barrier() sync = (self.curiter % 1 == 0) - self.overlap_update(MPI=MPI) + self.overlap_update() parallel.barrier() if self.do_position_refinement and (self.curiter): @@ -267,7 +263,7 @@ def engine_iterate(self, num=1): """ Iterates through all positions and refines them by a given algorithm. """ - log(3, "----------- START POS REF -------------") + log(4, "----------- START POS REF -------------") for dID in self.di.S.keys(): prep = self.diff_info[dID] @@ -350,7 +346,6 @@ def engine_iterate(self, num=1): ## object update def object_update(self, MPI=False): - t1 = time.time() use_atomics = self.p.object_update_cuda_atomics queue = self.queue queue.synchronize() @@ -359,8 +354,8 @@ def object_update(self, MPI=False): cfact = self.ob_cfact[oID] if self.p.obj_smooth_std is not None: + log(4, 'Smoothing object, cfact is %.2f' % cfact) obb = self.ob_buf.S[oID] - logger.info('Smoothing object, cfact is %.2f' % cfact) smooth_mfs = [self.p.obj_smooth_std, self.p.obj_smooth_std] self.GSK.convolution(ob.gpu, smooth_mfs, tmp=obb.gpu) @@ -388,33 +383,19 @@ def object_update(self, MPI=False): for oID, ob in self.ob.storages.items(): obn = self.ob_nrm.S[oID] - # MPI test - if MPI: - ob.data[:] = ob.gpu.get() - obn.data[:] = obn.gpu.get() - queue.synchronize() - parallel.allreduce(ob.data) - parallel.allreduce(obn.data) - ob.data /= obn.data - - self.clip_object(ob) - ob.gpu.set(ob.data) - else: - ob.gpu /= obn.gpu + self.multigpu.allReduceSum(ob.gpu) + self.multigpu.allReduceSum(obn.gpu) + ob.gpu /= obn.gpu + self.clip_object(ob.gpu) queue.synchronize() - # print 'object update: ' + str(time.time()-t1) - self.benchmark.object_update += time.time() - t1 - self.benchmark.calls_object += 1 - ## probe update def probe_update(self, MPI=False): - t1 = time.time() queue = self.queue # storage for-loop - change = 0 + change_gpu = gpuarray.zeros((1,), dtype=np.float32) cfact = self.p.probe_inertia use_atomics = self.p.probe_update_cuda_atomics for pID, pr in self.pr.storages.items(): @@ -445,35 +426,56 @@ def probe_update(self, MPI=False): buf = self.pr_buf.S[pID] prn = self.pr_nrm.S[pID] - if MPI: - pr.data[:] = pr.gpu.get() - prn.data[:] = prn.gpu.get() - queue.synchronize() - parallel.allreduce(pr.data) - parallel.allreduce(prn.data) - pr.data /= prn.data - self.support_constraint(pr) - pr.gpu.set(pr.data) - else: - pr.gpu /= prn.gpu - pr.data[:] = pr.gpu.get() - self.support_constraint(pr) - pr.gpu.set(pr.data) - - ## this should be done on GPU - queue.synchronize() - change += u.norm2(pr.data - buf.data) / u.norm2(pr.data) - buf.data[:] = pr.data - if MPI: - change = parallel.allreduce(change) / parallel.size + self.multigpu.allReduceSum(pr.gpu) + self.multigpu.allReduceSum(prn.gpu) + pr.gpu /= prn.gpu + self.support_constraint(pr) - # print 'probe update: ' + str(time.time()-t1) - self.benchmark.probe_update += time.time() - t1 - self.benchmark.calls_probe += 1 + ## calculate change on GPU + queue.synchronize() + AUK = self.kernels[list(self.kernels)[0]].AUK + buf.gpu -= pr.gpu + change_gpu += (AUK.norm2(buf.gpu) / AUK.norm2(pr.gpu)) + buf.gpu[:] = pr.gpu + self.multigpu.allReduceSum(change_gpu) + change = change_gpu.get().item() / parallel.size return np.sqrt(change) - def engine_finalize(self): + def support_constraint(self, storage=None): + """ + Enforces 2D support contraint on probe. 
+ """ + if storage is None: + for s in self.pr.storages.values(): + self.support_constraint(s) + + # Real space + support = self._probe_support.get(storage.ID) + if support is not None: + if storage.ID not in self.RSK: + self.RSK[storage.ID] = RealSupportKernel(support.astype(np.complex64)) + self.RSK[storage.ID].allocate() + self.RSK[storage.ID].apply_real_support(storage.gpu) + + # Fourier space + support = self._probe_fourier_support.get(storage.ID) + if support is not None: + if storage.ID not in self.FSK: + supp = support.astype(np.complex64) + self.FSK[storage.ID] = FourierSupportKernel(supp, self.queue, self.p.fft_lib) + self.FSK[storage.ID].allocate() + self.FSK[storage.ID].apply_fourier_support(storage.gpu) + + def clip_object(self, ob): + """ + Clips magnitudes of object into given range. + """ + if self.p.clip_object is not None: + cmin, cmax = self.p.clip_object + self.CMK.clip_magnitudes_to_range(ob, cmin, cmax) + + def engine_finalize(self, benchmark=False): """ clear GPU data and destroy context. """ @@ -495,5 +497,6 @@ def engine_finalize(self): for name, s in self.pr.S.items(): s.data = np.copy(s.data) + self.context.pop() self.context.detach() - super(DM_pycuda, self).engine_finalize() \ No newline at end of file + super(DM_pycuda, self).engine_finalize(benchmark) diff --git a/ptypy/accelerate/cuda_pycuda/engines/DM_pycuda_stream.py b/ptypy/accelerate/cuda_pycuda/engines/DM_pycuda_stream.py index a9ad7fac7..928c8b654 100644 --- a/ptypy/accelerate/cuda_pycuda/engines/DM_pycuda_stream.py +++ b/ptypy/accelerate/cuda_pycuda/engines/DM_pycuda_stream.py @@ -24,13 +24,11 @@ from ptypy.utils import parallel from ptypy.engines import register from . import DM_pycuda +from ..multi_gpu import MultiGpuCommunicator from ..mem_utils import make_pagelocked_paired_arrays as mppa from ..mem_utils import GpuDataManager2 -MPI = parallel.size > 1 -MPI = True - EX_MA_BLOCKS_RATIO = 2 MAX_BLOCKS = 99999 # can be used to limit the number of blocks, simulating that they don't fit #MAX_BLOCKS = 3 # can be used to limit the number of blocks, simulating that they don't fit @@ -69,8 +67,8 @@ def _setup_kernels(self): # TODO grow blocks dynamically nex = min(fit * EX_MA_BLOCKS_RATIO, MAX_BLOCKS) nma = min(fit, MAX_BLOCKS) - log(3, 'Free memory on device: %.2f GB' % (float(mem)/1e9)) - log(3, 'PyCUDA max blocks fitting on GPU: exit arrays={}, ma_arrays={}'.format(nex, nma)) + log(4, 'Free memory on device: %.2f GB' % (float(mem)/1e9)) + log(4, 'PyCUDA max blocks fitting on GPU: exit arrays={}, ma_arrays={}'.format(nex, nma)) # reset memory or create new self.ex_data = GpuDataManager2(ex_mem, 0, nex, True) self.ma_data = GpuDataManager2(ma_mem, 0, nma, False) @@ -88,6 +86,8 @@ def engine_prepare(self): s.gpu, s.data = mppa(s.data) for name, s in self.pr.S.items(): s.gpu, s.data = mppa(s.data) + for name, s in self.pr_buf.S.items(): + s.gpu, s.data = mppa(s.data) for name, s in self.pr_nrm.S.items(): s.gpu, s.data = mppa(s.data) @@ -125,11 +125,11 @@ def engine_prepare(self): prep.mag = cuda.pagelocked_empty(mag.shape, mag.dtype, order="C", mem_flags=4) prep.mag[:] = mag - log(3, 'Free memory on device: %.2f GB' % (float(cuda.mem_get_info()[0])/1e9)) + log(4, 'Free memory on device: %.2f GB' % (float(cuda.mem_get_info()[0])/1e9)) self.ex_data.add_data_block() self.ma_data.add_data_block() self.mag_data.add_data_block() - + def engine_iterate(self, num=1): """ Compute one iteration. 
@@ -139,7 +139,7 @@ def engine_iterate(self, num=1): atomics_probe = self.p.probe_update_cuda_atomics atomics_object = self.p.object_update_cuda_atomics use_tiles = (not atomics_object) or (not atomics_probe) - + for it in range(num): error = {} @@ -159,8 +159,8 @@ def engine_iterate(self, num=1): obn = self.ob_nrm.S[oID] if self.p.obj_smooth_std is not None: + log(4, 'Smoothing object, cfact is %.2f' % cfact) obb = self.ob_buf.S[oID] - logger.info('Smoothing object, cfact is %.2f' % cfact) smooth_mfs = [self.p.obj_smooth_std, self.p.obj_smooth_std] self.GSK.convolution(ob.gpu, smooth_mfs, tmp=obb.gpu) # obb.gpu[:] = ob.gpu * cfactf32 @@ -170,7 +170,6 @@ def engine_iterate(self, num=1): # First cycle: Fourier + object update for iblock, dID in enumerate(self.dID_list): - t1 = time.time() prep = self.diff_info[dID] # find probe, object in exit ID in dependence of dID @@ -214,24 +213,18 @@ def engine_iterate(self, num=1): ## compute log-likelihood if self.p.compute_log_likelihood: - t1 = time.time() AWK.build_aux_no_ex(aux, addr, ob, pr) PROP.fw(aux, aux) # synchronize h2d stream with compute stream self.queue.wait_for_event(ev_mag) FUK.log_likelihood(aux, addr, mag, ma, err_phot) - self.benchmark.F_LLerror += time.time() - t1 # synchronize h2d stream with compute stream self.queue.wait_for_event(ev_ex) - t1 = time.time() AWK.build_aux(aux, addr, ob, pr, ex, alpha=self.p.alpha) - self.benchmark.A_Build_aux += time.time() - t1 ## FFT - t1 = time.time() PROP.fw(aux, aux) - self.benchmark.B_Prop += time.time() - t1 ## Deviation from measured data # synchronize h2d stream with compute stream @@ -240,32 +233,23 @@ def engine_iterate(self, num=1): FUK.error_reduce(addr, err_fourier) FUK.fmag_all_update(aux, addr, mag, ma, err_fourier, pbound) - self.benchmark.C_Fourier_update += time.time() - t1 data_mag.record_done(self.queue, 'compute') data_ma.record_done(self.queue, 'compute') - t1 = time.time() PROP.bw(aux, aux) ## apply changes AWK.build_exit(aux, addr, ob, pr, ex) FUK.exit_error(aux, addr) FUK.error_reduce(addr, err_exit) - self.benchmark.E_Build_exit += time.time() - t1 - self.benchmark.calls_fourier += 1 - prestr = '%d Iteration (Overlap) #%02d: ' % (parallel.rank, inner) # Update object if do_update_object: log(4, prestr + '----- object update -----', True) - t1 = time.time() - addrt = addr if atomics_object else addr2 self.queue.wait_for_event(ev_ex) POK.ob_update(addrt, obb, obn, pr, ex, atomics=atomics_object) - self.benchmark.object_update += time.time() - t1 - self.benchmark.calls_object += 1 data_ex.record_done(self.queue, 'compute') if iblock + len(self.ex_data) < len(self.dID_list): @@ -283,29 +267,21 @@ def engine_iterate(self, num=1): for oID, ob in self.ob.storages.items(): obn = self.ob_nrm.S[oID] obb = self.ob_buf.S[oID] - # MPI test - if MPI: - obb.data[:] = obb.gpu.get() - obn.data[:] = obn.gpu.get() - parallel.allreduce(obb.data) - parallel.allreduce(obn.data) - obb.data /= obn.data - self.clip_object(obb) - ob.gpu.set(obb.data) - else: - obb.gpu /= obn.gpu - ob.gpu[:] = obb.gpu + self.multigpu.allReduceSum(obb.gpu) + self.multigpu.allReduceSum(obn.gpu) + obb.gpu /= obn.gpu + + self.clip_object(obb.gpu) + ob.gpu[:] = obb.gpu # Exit if probe should not yet be updated if not do_update_probe: break - self.ex_data.syncback = False + # Update probe log(4, prestr + '----- probe update -----', True) - change = self.probe_update(MPI=MPI) - # change = self.probe_update(MPI=(parallel.size>1 and MPI)) - + change = self.probe_update() log(4, prestr + 'change in probe is %.3f' % 
change, True) # stop iteration if probe change is small @@ -323,7 +299,7 @@ def engine_iterate(self, num=1): """ Iterates through all positions and refines them by a given algorithm. """ - log(3, "----------- START POS REF -------------") + log(4, "----------- START POS REF -------------") for dID in self.di.S.keys(): prep = self.diff_info[dID] @@ -413,11 +389,10 @@ def engine_iterate(self, num=1): ## probe update def probe_update(self, MPI=False): - t1 = time.time() queue = self.queue use_atomics = self.p.probe_update_cuda_atomics # storage for-loop - change = 0 + change_gpu = gpuarray.zeros((1,), dtype=np.float32) for pID, pr in self.pr.storages.items(): prn = self.pr_nrm.S[pID] cfact = self.pr_cfact[pID] @@ -455,40 +430,22 @@ def probe_update(self, MPI=False): buf = self.pr_buf.S[pID] prn = self.pr_nrm.S[pID] - # MPI test - if MPI: - # if False: - pr.data[:] = pr.gpu.get() - prn.data[:] = prn.gpu.get() - # queue.synchronize() - parallel.allreduce(pr.data) - parallel.allreduce(prn.data) - pr.data /= prn.data - - self.support_constraint(pr) - - pr.gpu.set(pr.data) - else: - pr.gpu /= prn.gpu - # ca. 0.3 ms - # self.pr.S[pID].gpu = probe_gpu - pr.data[:] = pr.gpu.get() - - ## this should be done on GPU - - # queue.synchronize() - change += u.norm2(pr.data - buf.data) / u.norm2(pr.data) - buf.data[:] = pr.data - if MPI: - change = parallel.allreduce(change) / parallel.size - - # print 'probe update: ' + str(time.time()-t1) - self.benchmark.probe_update += time.time() - t1 - self.benchmark.calls_probe += 1 + self.multigpu.allReduceSum(pr.gpu) + self.multigpu.allReduceSum(prn.gpu) + pr.gpu /= prn.gpu + self.support_constraint(pr) + + ## calculate change on GPU + AUK = self.kernels[list(self.kernels)[0]].AUK + buf.gpu -= pr.gpu + change_gpu += (AUK.norm2(buf.gpu) / AUK.norm2(pr.gpu)) + buf.gpu[:] = pr.gpu + self.multigpu.allReduceSum(change_gpu) + change = change_gpu.get().item() / parallel.size return np.sqrt(change) - def engine_finalize(self): + def engine_finalize(self, benchmark=False): """ Clear all GPU data, pinned memory, etc """ @@ -500,4 +457,4 @@ def engine_finalize(self): for name, s in self.pr.S.items(): s.data = np.copy(s.data) # is this the same as s.data.get()? - super().engine_finalize() + super().engine_finalize(benchmark) diff --git a/ptypy/accelerate/cuda_pycuda/engines/DM_pycuda_streams.py b/ptypy/accelerate/cuda_pycuda/engines/DM_pycuda_streams.py index 3bc019d67..8678de830 100644 --- a/ptypy/accelerate/cuda_pycuda/engines/DM_pycuda_streams.py +++ b/ptypy/accelerate/cuda_pycuda/engines/DM_pycuda_streams.py @@ -20,9 +20,6 @@ from . 
import DM_pycuda from ..mem_utils import GpuDataManager -MPI = parallel.size > 1 -MPI = True - # factor how many more exit waves we wanna keep on GPU compared to # ma / mag data EX_MA_BLOCKS_RATIO = 2 @@ -149,6 +146,12 @@ def engine_prepare(self): s.data = cuda.pagelocked_empty(d.shape, d.dtype, order="C", mem_flags=0) s.data[:] = d s.gpu = gpuarray.to_gpu(s.data) + for name, s in self.pr_buf.S.items(): + # pr + d = s.data + s.data = cuda.pagelocked_empty(d.shape, d.dtype, order="C", mem_flags=0) + s.data[:] = d + s.gpu = gpuarray.to_gpu(s.data) for name, s in self.pr_nrm.S.items(): # prn d = s.data @@ -208,7 +211,7 @@ def engine_prepare(self): nma = min(fit, blocks) nstreams = min(MAX_STREAMS, blocks) - log(3, 'PyCUDA blocks fitting on GPU: exit arrays={}, ma_arrays={}, streams={}, totalblocks={}'.format(nex, nma, nstreams, blocks)) + log(4, 'PyCUDA blocks fitting on GPU: exit arrays={}, ma_arrays={}, streams={}, totalblocks={}'.format(nex, nma, nstreams, blocks)) # reset memory or create new if self.ex_data is not None: self.ex_data.reset(ex_mem, nex) @@ -259,7 +262,7 @@ def engine_iterate(self, num=1): obn = self.ob_nrm.S[oID] if self.p.obj_smooth_std is not None: - logger.info('Smoothing object, cfact is %.2f' % cfact) + log(4,'Smoothing object, cfact is %.2f' % cfact) obb = self.ob_buf.S[oID] smooth_mfs = [self.p.obj_smooth_std, self.p.obj_smooth_std] self.GSK.convolution(ob.gpu, smooth_mfs, tmp=obb.gpu) @@ -393,8 +396,7 @@ def engine_iterate(self, num=1): # Update probe log(4, prestr + '----- probe update -----', True) self.ex_data.syncback = False - change = self.probe_update(MPI=MPI) - # change = self.probe_update(MPI=(parallel.size>1 and MPI)) + change = self.probe_update() # swap direction for next time self.dID_list.reverse() @@ -418,7 +420,7 @@ def engine_iterate(self, num=1): """ Iterates through all positions and refines them by a given algorithm. """ - log(3, "----------- START POS REF -------------") + log(4, "----------- START POS REF -------------") prev_event = None for dID in self.di.S.keys(): streamdata = self.streams[self.cur_stream] @@ -501,7 +503,6 @@ def engine_iterate(self, num=1): for name, s in self.pr.S.items(): s.gpu.get(s.data) - # FIXXME: copy to pinned memory for dID, prep in self.diff_info.items(): err_fourier = prep.err_fourier_gpu.get() @@ -513,7 +514,6 @@ def engine_iterate(self, num=1): self.error = error return error - def _object_allreduce(self): # make sure that all transfers etc are finished for sd in self.streams: @@ -522,20 +522,12 @@ def _object_allreduce(self): for oID, ob in self.ob.storages.items(): obn = self.ob_nrm.S[oID] obb = self.ob_buf.S[oID] - if MPI: - obb.gpu.get(obb.data) - obn.gpu.get(obn.data) - parallel.allreduce(obb.data) - parallel.allreduce(obn.data) - obb.data /= obn.data - self.clip_object(obb) - tt1 = time.time() - ob.gpu.set(obb.data) # async tx on same stream? 
- - else: - obb.gpu /= obn.gpu - ob.gpu[:] = obb.gpu - + self.multigpu.allReduceSum(obb.gpu) + self.multigpu.allReduceSum(obn.gpu) + obb.gpu /= obn.gpu + + self.clip_object(obb.gpu) + ob.gpu[:] = obb.gpu ## probe update def probe_update(self, MPI=False): @@ -543,7 +535,7 @@ def probe_update(self, MPI=False): streamdata = self.streams[self.cur_stream] use_atomics = self.p.probe_update_cuda_atomics # storage for-loop - change = 0 + change_gpu = gpuarray.zeros((1,), dtype=np.float32) prev_event = None for pID, pr in self.pr.storages.items(): prn = self.pr_nrm.S[pID] @@ -575,7 +567,6 @@ def probe_update(self, MPI=False): prev_event = streamdata.end_compute() self.cur_stream = (self.cur_stream + self.stream_direction) % len(self.streams) - # sync all streams first for sd in self.streams: sd.synchronize() @@ -584,31 +575,19 @@ def probe_update(self, MPI=False): buf = self.pr_buf.S[pID] prn = self.pr_nrm.S[pID] - - # MPI test - if MPI: - # if False: - pr.gpu.get(pr.data) - prn.gpu.get(prn.data) - parallel.allreduce(pr.data) - parallel.allreduce(prn.data) - pr.data /= prn.data - self.support_constraint(pr) - pr.gpu.set(pr.data) - else: - pr.gpu /= prn.gpu - # ca. 0.3 ms - # self.pr.S[pID].gpu = probe_gpu - pr.gpu.get(pr.data) - - ## this should be done on GPU - tt1 = time.time() - change += u.norm2(pr.data - buf.data) / u.norm2(pr.data) - buf.data[:] = pr.data - if MPI: - change = parallel.allreduce(change) / parallel.size - tt2 = time.time() - #print('time for pr change: {}s'.format(tt2-tt1)) + + self.multigpu.allReduceSum(pr.gpu) + self.multigpu.allReduceSum(prn.gpu) + pr.gpu /= prn.gpu + self.support_constraint(pr) + + ## calculate change on GPU + AUK = self.kernels[list(self.kernels)[0]].AUK + buf.gpu -= pr.gpu + change_gpu += (AUK.norm2(buf.gpu) / AUK.norm2(pr.gpu)) + buf.gpu[:] = pr.gpu + self.multigpu.allReduceSum(change_gpu) + change = change_gpu.get().item() / parallel.size # print 'probe update: ' + str(time.time()-t1) self.benchmark.probe_update += time.time() - t1 @@ -616,7 +595,7 @@ def probe_update(self, MPI=False): return np.sqrt(change) - def engine_finalize(self): + def engine_finalize(self, benchmark=False): """ Clear all GPU data, pinned memory, etc """ @@ -625,4 +604,4 @@ def engine_finalize(self): self.ma_data = None self.mag_data = None - super().engine_finalize() + super().engine_finalize(benchmark) diff --git a/ptypy/accelerate/cuda_pycuda/kernels.py b/ptypy/accelerate/cuda_pycuda/kernels.py index 4ac3d3161..a932be7b2 100644 --- a/ptypy/accelerate/cuda_pycuda/kernels.py +++ b/ptypy/accelerate/cuda_pycuda/kernels.py @@ -8,6 +8,23 @@ from ..base import kernels as ab from ..base.kernels import Adict +def choose_fft(fft_type): + if fft_type=='cuda': + try: + from ptypy.accelerate.cuda_pycuda.cufft import FFT_cuda as FFT + except: + logger.warning('Unable to import cufft version - using Reikna instead') + from ptypy.accelerate.cuda_pycuda.fft import FFT + elif fft_type=='skcuda': + try: + from ptypy.accelerate.cuda_pycuda.cufft import FFT_skcuda as FFT + except: + logger.warning('Unable to import skcuda.fft version - using Reikna instead') + from ptypy.accelerate.cuda_pycuda.fft import FFT + else: + from ptypy.accelerate.cuda_pycuda.fft import FFT + return FFT + class PropagationKernel: def __init__(self, aux, propagator, queue_thread=None, fft='reikna'): @@ -24,21 +41,7 @@ def __init__(self, aux, propagator, queue_thread=None, fft='reikna'): def allocate(self): aux = self.aux - - if self._fft_type=='cuda': - try: - from ptypy.accelerate.cuda_pycuda.cufft import FFT_cuda as FFT 
-            except:
-                logger.warning('Unable to import cufft version - using Reikna instead')
-                from ptypy.accelerate.cuda_pycuda.fft import FFT
-        elif self._fft_type=='skcuda':
-            try:
-                from ptypy.accelerate.cuda_pycuda.cufft import FFT_skcuda as FFT
-            except:
-                logger.warning('Unable to import skcuda.fft version - using Reikna instead')
-                from ptypy.accelerate.cuda_pycuda.fft import FFT
-        else:
-            from ptypy.accelerate.cuda_pycuda.fft import FFT
+        FFT = choose_fft(self._fft_type)
 
         if self.prop_type == 'farfield':
 
@@ -120,6 +123,33 @@ def queue(self, queue):
         if self.prop_type == "nearfield":
             self._fft3.queue = queue
 
+class FourierSupportKernel:
+    def __init__(self, support, queue_thread=None, fft='reikna'):
+        self.support = support
+        self.queue = queue_thread
+        self._fft_type = fft
+    def allocate(self):
+        FFT = choose_fft(self._fft_type)
+
+        self._fft1 = FFT(self.support, self.queue,
+            post_fft=self.support,
+            symmetric=True,
+            forward=True)
+        self._fft2 = FFT(self.support, self.queue,
+            symmetric=True,
+            forward=False)
+    def apply_fourier_support(self,x):
+        self._fft1.ft(x,x)
+        self._fft2.ift(x,x)
+
+class RealSupportKernel:
+    def __init__(self, support):
+        self.support = support
+    def allocate(self):
+        self.support = gpuarray.to_gpu(self.support)
+    def apply_real_support(self, x):
+        x *= self.support
+
 class FourierUpdateKernel(ab.FourierUpdateKernel):
 
     def __init__(self, aux, nmodes=1, queue_thread=None,
                  accumulate_type='float', math_type='float'):
diff --git a/ptypy/accelerate/cuda_pycuda/multi_gpu.py b/ptypy/accelerate/cuda_pycuda/multi_gpu.py
new file mode 100644
index 000000000..49f654994
--- /dev/null
+++ b/ptypy/accelerate/cuda_pycuda/multi_gpu.py
@@ -0,0 +1,159 @@
+"""
+Multi-GPU AllReduce wrapper that uses NCCL via cupy if it's available,
+and otherwise falls back to CUDA-aware MPI,
+and if that doesn't work, uses host/device copies with regular MPI.
+
+Findings:
+
+1) NCCL works with unit tests, but not in the engines. It seems to
+add something to the existing pycuda Context or create a new one,
+as a later event recording on an exit wave transfer fails with an
+'invalid resource handle' CUDA error. This error typically happens if, for example,
+a CUDA event is created in a different context than the one it is used in,
+or on a different device. PyCuda uses the driver API, NCCL uses the runtime.
+Even though those are interoperable, there seems to be an issue.
+Note that this is before any allreduce call - straight after initialising.
+
+2) NCCL requires cupy - the Python wrapper is in there
+
+3) OpenMPI with CUDA support needs to be available, and:
+   - mpi4py needs to be compiled from master (3.1.0a - latest stable release 3.0.x doesn't have it)
+   - pycuda needs to be compiled from master (for __cuda_array_interface__ - the 2020.1 version doesn't have it)
+   - OpenMPI in a conda install needs to have the environment variable
+     --> if cuda support isn't enabled, the application simply crashes with a seg fault
+
+4) For NCCL peer-to-peer transfers, the EXCLUSIVE compute mode cannot be used.
+   It should be in DEFAULT mode.
+
+"""
+
+import mpi4py
+from pkg_resources import parse_version
+import numpy as np
+from pycuda import gpuarray
+import pycuda.driver as cuda
+from ptypy.utils import parallel
+from ptypy.utils.verbose import logger, log
+import os
+
+try:
+    from cupy.cuda import nccl
+    import cupy as cp
+except ImportError:
+    nccl = None
+
+# properties to check which versions are available
+
+# use NCCL if it is available, and the user didn't override the
+# default selection with environment variables
+have_nccl = (nccl is not None) and \
+    (not 'PTYPY_USE_CUDAMPI' in os.environ) and \
+    (not 'PTYPY_USE_MPI' in os.environ)
+
+# At the moment, we require:
+# the OpenMPI env var OMPI_MCA_opal_cuda_support to be set to true,
+# mpi4py >= 3.1.0
+# pycuda with __cuda_array_interface__
+# and not setting the PTYPY_USE_MPI environment variable
+#
+# -> we ideally want to allow enabling support from a parameter in ptypy
+have_cuda_mpi = "OMPI_MCA_opal_cuda_support" in os.environ and \
+    os.environ["OMPI_MCA_opal_cuda_support"] == "true" and \
+    parse_version(parse_version(mpi4py.__version__).base_version) >= parse_version("3.1.0") and \
+    hasattr(gpuarray.GPUArray, '__cuda_array_interface__') and \
+    not ('PTYPY_USE_MPI' in os.environ)
+
+
+class MultiGpuCommunicatorBase:
+    """Base class for multi-GPU communicator options, to aggregate common bits"""
+
+    def __init__(self):
+        self.rank = parallel.rank
+        self.ndev = parallel.size
+
+    def allReduceSum(self, arr):
+        """Call MPI.all_reduce in-place, with array on GPU"""
+        # base class only checks properties of arrays
+        assert isinstance(arr, gpuarray.GPUArray), "Input must be a GPUArray"
+
+
+class MultiGpuCommunicatorMpi(MultiGpuCommunicatorBase):
+    """Communicator for AllReduce that uses MPI on the CPU, i.e. D2H, allreduce, H2D"""
+
+    def allReduceSum(self, arr):
+        """Call MPI.all_reduce in-place, with array on GPU"""
+        super().allReduceSum(arr)
+
+        if parallel.MPIenabled:
+            # note: this creates a temporary CPU array
+            data = arr.get()
+            parallel.allreduce(data)
+            arr.set(data)
+
+class MultiGpuCommunicatorCudaMpi(MultiGpuCommunicatorBase):
+
+    def allReduceSum(self, arr):
+        """Call MPI.all_reduce in-place, with array on GPU"""
+
+        assert hasattr(arr, '__cuda_array_interface__'), "input array should have a cuda array interface"
+
+        if parallel.MPIenabled:
+            comm = parallel.comm
+            comm.Allreduce(parallel.MPI.IN_PLACE, arr)
+
+
+class MultiGpuCommunicatorNccl(MultiGpuCommunicatorBase):
+
+    def __init__(self):
+        super().__init__()
+
+        #assert cuda.Context.get_device().get_attributes()[cuda.device_attribute.COMPUTE_MODE] == cuda.compute_mode.DEFAULT, "compute mode must be default in order to use NCCL"
+
+        # get a unique identifier for the NCCL communicator and
+        # broadcast it to all MPI processes (assuming one device per process)
+        if self.rank == 0:
+            self.id = nccl.get_unique_id()
+        else:
+            self.id = None
+
+        self.id = parallel.bcast(self.id)
+
+        self.com = nccl.NcclCommunicator(self.ndev, self.id, self.rank)
+
+    def allReduceSum(self, arr):
+        """Call MPI.all_reduce in-place, with array on GPU"""
+
+        buf = int(arr.gpudata)
+        count, datatype = self.__get_NCCL_count_dtype(arr)
+
+        # no stream support here for now - it fails in NCCL when
+        # pycuda.Stream.handle is used for some unexplained reason
+        stream = cp.cuda.Stream.null.ptr
+
+        self.com.allReduce(buf, buf, count, datatype, nccl.NCCL_SUM, stream)
+
+    def __get_NCCL_count_dtype(self, arr):
+        if arr.dtype == np.complex64:
+            return arr.size*2, nccl.NCCL_FLOAT32
+        elif arr.dtype == np.complex128:
+            return
arr.size*2, nccl.NCCL_FLOAT64 + elif arr.dtype == np.float32: + return arr.size, nccl.NCCL_FLOAT32 + elif arr.dtype == np.float64: + return arr.size, nccl.NCCL_FLOAT64 + else: + raise ValueError("This dtype is not supported by NCCL.") + + + +# pick the appropriate communicator depending on installed packages +if have_nccl: + MultiGpuCommunicator = MultiGpuCommunicatorNccl + log(4, "Using NCCL communicator") +elif have_cuda_mpi: + MultiGpuCommunicator = MultiGpuCommunicatorCudaMpi + log(4, "Using CUDA-aware MPI communicator") +else: + MultiGpuCommunicator = MultiGpuCommunicatorMpi + log(4, "Using MPI communicator") + diff --git a/ptypy/engines/DM.py b/ptypy/engines/DM.py index 9b8340a63..46fa0a2bc 100644 --- a/ptypy/engines/DM.py +++ b/ptypy/engines/DM.py @@ -363,8 +363,7 @@ def object_update(self): # array and therefore underestimate the strength of the probe terms. cfact = self.p.object_inertia * self.mean_power if self.p.obj_smooth_std is not None: - logger.info( - 'Smoothing object, average cfact is %.2f' + log(4, 'Smoothing object, average cfact is %.2f' % np.mean(cfact).real) smooth_mfs = [0, self.p.obj_smooth_std, diff --git a/test/accelerate_tests/cuda_pycuda_tests/array_utils_test.py b/test/accelerate_tests/cuda_pycuda_tests/array_utils_test.py index d511bec36..23950af26 100644 --- a/test/accelerate_tests/cuda_pycuda_tests/array_utils_test.py +++ b/test/accelerate_tests/cuda_pycuda_tests/array_utils_test.py @@ -386,3 +386,20 @@ def test_max_abs2_float_UNITY(self): np.testing.assert_allclose(out_dev.get(), out, rtol=1e-6, atol=1e-6, err_msg="The object norm array has not been updated as expected") + + + def test_clip_magnitudes_to_range_UNITY(self): + np.random.seed(1987) + A = np.random.random((2,10,10)) + B = A[0] + 1j* A[1] + B = B.astype(np.complex64) + B_gpu = gpuarray.to_gpu(B) + + au.clip_complex_magnitudes_to_range(B, 0.2,0.8) + CMK = gau.ClipMagnitudesKernel() + CMK.clip_magnitudes_to_range(B_gpu, 0.2, 0.8) + + np.testing.assert_allclose(B_gpu.get(), B, rtol=1e-6, atol=1e-6, + err_msg="The magnitudes of the array have not been clipped as expected") + + diff --git a/test/accelerate_tests/cuda_pycuda_tests/multi_gpu_test.py b/test/accelerate_tests/cuda_pycuda_tests/multi_gpu_test.py new file mode 100644 index 000000000..9313dbd64 --- /dev/null +++ b/test/accelerate_tests/cuda_pycuda_tests/multi_gpu_test.py @@ -0,0 +1,85 @@ +''' +''' + +import unittest +from mpi4py.MPI import Get_version +import numpy as np +from . import PyCudaTest, have_pycuda + +if have_pycuda(): + from pycuda import gpuarray + import pycuda.driver as cuda + from ptypy.accelerate.cuda_pycuda import multi_gpu as mgpu + from ptypy.utils import parallel + +from pkg_resources import parse_version + +class GpuDataTest(PyCudaTest): + """ + This is a test class for MPI - to really check if it all works, it needs + to be run as: + + mpirun -np 2 pytest multi_gpu_test.py + + For CUDA-aware MPI testing, currently the environment variable + + OMPI_MCA_opal_cuda_support=true + + needs to be set, mpi4py version 3.1.0+ used, a pycuda build from master, + and a cuda-aware MPI version. 
+ """ + + def setUp(self): + if parallel.rank_local < cuda.Device.count(): + self.device = cuda.Device(parallel.rank_local) + self.ctx = self.device.make_context() + self.ctx.push() + else: + self.ctx = None + + def tearDown(self): + if self.ctx is not None: + self.ctx.pop() + self.ctx.detach() + + @unittest.skipIf(parallel.rank != 0, "Only in MPI rank 0") + def test_version(self): + v1 = parse_version("3.1.0") + v2 = parse_version(parse_version("3.1.0a").base_version) + + self.assertGreaterEqual(v2, v1) + + def test_compute_mode(self): + attr = cuda.Context.get_device().get_attributes() + self.assertIn(cuda.device_attribute.COMPUTE_MODE, attr) + mode = attr[cuda.device_attribute.COMPUTE_MODE] + self.assertIn(mode, + [cuda.compute_mode.DEFAULT, cuda.compute_mode.PROHIBITED, cuda.compute_mode.EXCLUSIVE_PROCESS] + ) + + def multigpu_tester(self, com): + if self.ctx is None: + return + + data = np.ones((2, 1), dtype=np.float32) + data_dev = gpuarray.to_gpu(data) + sz = parallel.size + com.allReduceSum(data_dev) + + out = data_dev.get() + np.testing.assert_allclose(out, sz * data, rtol=1e-6) + + def test_multigpu_auto(self): + self.multigpu_tester(mgpu.MultiGpuCommunicator()) + + + def test_multigpu_mpi(self): + self.multigpu_tester(mgpu.MultiGpuCommunicatorMpi()) + + @unittest.skipIf(not mgpu.have_cuda_mpi, "Cuda-aware MPI not available") + def test_multigpu_cudampi(self): + self.multigpu_tester(mgpu.MultiGpuCommunicatorCudaMpi()) + + @unittest.skipIf(not mgpu.have_nccl, "NCCL not available") + def test_multigpu_nccl(self): + self.multigpu_tester(mgpu.MultiGpuCommunicatorNccl()) \ No newline at end of file From 1c36ab86ff19b7b3b67759e0b5c492de16f1938c Mon Sep 17 00:00:00 2001 From: Benedikt Daurer Date: Tue, 6 Apr 2021 12:20:21 +0100 Subject: [PATCH 51/56] checking at runtime if nccl/cuda-mpi are available --- .../cuda_pycuda/engines/DM_pycuda.py | 4 +- .../cuda_pycuda/engines/DM_pycuda_stream.py | 1 - ptypy/accelerate/cuda_pycuda/multi_gpu.py | 39 ++++++++++++------- .../cuda_pycuda_tests/multi_gpu_test.py | 3 +- 4 files changed, 28 insertions(+), 19 deletions(-) diff --git a/ptypy/accelerate/cuda_pycuda/engines/DM_pycuda.py b/ptypy/accelerate/cuda_pycuda/engines/DM_pycuda.py index 65b5edd0e..50b605680 100644 --- a/ptypy/accelerate/cuda_pycuda/engines/DM_pycuda.py +++ b/ptypy/accelerate/cuda_pycuda/engines/DM_pycuda.py @@ -23,7 +23,7 @@ from ..kernels import PropagationKernel, RealSupportKernel, FourierSupportKernel from ..array_utils import ArrayUtilsKernel, GaussianSmoothingKernel, TransposeKernel, ClipMagnitudesKernel from ..mem_utils import make_pagelocked_paired_arrays as mppa -from ..multi_gpu import MultiGpuCommunicator +from ..multi_gpu import get_multi_gpu_communicator __all__ = ['DM_pycuda'] @@ -68,7 +68,7 @@ def engine_initialize(self): """ # Context, Multi GPU communicator and Stream (needs to be in this order) self.context, self.queue = get_context(new_context=True, new_queue=False) - self.multigpu = MultiGpuCommunicator() + self.multigpu = get_multi_gpu_communicator() self.context, self.queue = get_context(new_context=False, new_queue=True) # Gaussian Smoothing Kernel diff --git a/ptypy/accelerate/cuda_pycuda/engines/DM_pycuda_stream.py b/ptypy/accelerate/cuda_pycuda/engines/DM_pycuda_stream.py index 928c8b654..b002c3dd8 100644 --- a/ptypy/accelerate/cuda_pycuda/engines/DM_pycuda_stream.py +++ b/ptypy/accelerate/cuda_pycuda/engines/DM_pycuda_stream.py @@ -24,7 +24,6 @@ from ptypy.utils import parallel from ptypy.engines import register from . 
import DM_pycuda -from ..multi_gpu import MultiGpuCommunicator from ..mem_utils import make_pagelocked_paired_arrays as mppa from ..mem_utils import GpuDataManager2 diff --git a/ptypy/accelerate/cuda_pycuda/multi_gpu.py b/ptypy/accelerate/cuda_pycuda/multi_gpu.py index 49f654994..0d4517d4e 100644 --- a/ptypy/accelerate/cuda_pycuda/multi_gpu.py +++ b/ptypy/accelerate/cuda_pycuda/multi_gpu.py @@ -95,7 +95,9 @@ class MultiGpuCommunicatorCudaMpi(MultiGpuCommunicatorBase): def allReduceSum(self, arr): """Call MPI.all_reduce in-place, with array on GPU""" - assert hasattr(arr, '__cuda_array_interface__'), "input array should have a cuda array interface" + # Check if cuda array interface is available + if not hasattr(arr, '__cuda_array_interface__'): + raise RuntimeError("input array should have a cuda array interface") if parallel.MPIenabled: comm = parallel.comm @@ -106,8 +108,10 @@ class MultiGpuCommunicatorNccl(MultiGpuCommunicatorBase): def __init__(self): super().__init__() - - #assert cuda.Context.get_device().get_attributes()[cuda.device_attribute.COMPUTE_MODE] == cuda.compute_mode.DEFAULT, "compute mode must be default in order to use NCCL" + + # Check if GPUs are in default mode + if cuda.Context.get_device().get_attributes()[cuda.device_attribute.COMPUTE_MODE] != cuda.compute_mode.DEFAULT: + raise RuntimeError("Compute mode must be default in order to use NCCL") # get a unique identifier for the NCCL communicator and # broadcast it to all MPI processes (assuming one device per process) @@ -145,15 +149,22 @@ def __get_NCCL_count_dtype(self, arr): raise ValueError("This dtype is not supported by NCCL.") - -# pick the appropriate communicator depending on installed packages -if have_nccl: - MultiGpuCommunicator = MultiGpuCommunicatorNccl - log(4, "Using NCCL communicator") -elif have_cuda_mpi: - MultiGpuCommunicator = MultiGpuCommunicatorCudaMpi - log(4, "Using CUDA-aware MPI communicator") -else: - MultiGpuCommunicator = MultiGpuCommunicatorMpi +# pick the appropriate communicator depending on installed packages +def get_multi_gpu_communicator(use_nccl=True, use_cuda_mpi=True): + if have_nccl and use_nccl: + try: + comm = MultiGpuCommunicatorNccl() + log(4, "Using NCCL communicator") + return comm + except RuntimeError: + pass + if have_cuda_mpi and use_cuda_mpi: + try: + comm = MultiGpuCommunicatorCudaMpi() + log(4, "Using CUDA-aware MPI communicator") + return comm + except RuntimeError: + pass + comm = MultiGpuCommunicatorMpi() log(4, "Using MPI communicator") - + return comm \ No newline at end of file diff --git a/test/accelerate_tests/cuda_pycuda_tests/multi_gpu_test.py b/test/accelerate_tests/cuda_pycuda_tests/multi_gpu_test.py index 9313dbd64..64cc5110d 100644 --- a/test/accelerate_tests/cuda_pycuda_tests/multi_gpu_test.py +++ b/test/accelerate_tests/cuda_pycuda_tests/multi_gpu_test.py @@ -70,9 +70,8 @@ def multigpu_tester(self, com): np.testing.assert_allclose(out, sz * data, rtol=1e-6) def test_multigpu_auto(self): - self.multigpu_tester(mgpu.MultiGpuCommunicator()) + self.multigpu_tester(mgpu.get_multi_gpu_communicator()) - def test_multigpu_mpi(self): self.multigpu_tester(mgpu.MultiGpuCommunicatorMpi()) From 2f10f86fcdf9478afd15082e68b366cac0c05aee Mon Sep 17 00:00:00 2001 From: Benedikt Daurer Date: Wed, 7 Apr 2021 16:55:27 +0100 Subject: [PATCH 52/56] Fixed bugs in address manglers --- ptypy/accelerate/base/address_manglers.py | 3 ++- ptypy/accelerate/cuda_pycuda/address_manglers.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git 
a/ptypy/accelerate/base/address_manglers.py b/ptypy/accelerate/base/address_manglers.py index 6c73da5da..100c4d382 100644 --- a/ptypy/accelerate/base/address_manglers.py +++ b/ptypy/accelerate/base/address_manglers.py @@ -13,7 +13,7 @@ def __init__(self, max_step_per_shift, start, stop, nshifts, max_bound=None, r # can be initialised in the engine.init self.max_bound = max_bound # maximum distance from the starting positions - self.max_step = lambda it: (max_step_per_shift * (stop - it) / (stop - start)) # maximum step per iteration, decreases with progression + self.max_step = lambda it: np.ceil(max_step_per_shift * (stop - it) / (stop - start)) # maximum step per iteration, decreases with progression self.nshifts = nshifts self.delta = 0 @@ -75,6 +75,7 @@ def setup_shifts(self, current_iteration, nframes=1): delta = np.mgrid[-max_step:max_step+1:1, -max_step:max_step+1:1] within_bound = (delta[0]**2 + delta[1]**2) < (self.max_bound**2) + print(max_step, self.max_bound, within_bound.sum()) self.delta = np.tile(delta[:,within_bound].T.reshape(within_bound.sum(),1,2), (1,nframes,1)) self.nshifts = self.delta.shape[0] diff --git a/ptypy/accelerate/cuda_pycuda/address_manglers.py b/ptypy/accelerate/cuda_pycuda/address_manglers.py index fa168903f..d19a77fa4 100644 --- a/ptypy/accelerate/cuda_pycuda/address_manglers.py +++ b/ptypy/accelerate/cuda_pycuda/address_manglers.py @@ -17,7 +17,7 @@ def _setup_delta_gpu(self): assert self.delta is not None, "Setup delta using the setup_shifts method first" self.delta = np.ascontiguousarray(self.delta, dtype=np.int32) - if self.delta_gpu is None or self.delta_gpu.shape[0] > self.delta.shape[0]: + if self.delta_gpu is None or self.delta_gpu.shape[0] < self.delta.shape[0]: self.delta_gpu = gpuarray.empty(self.delta.shape, dtype=np.int32) # in case self.delta is smaller than delta_gpu, this will only copy the # relevant part From 0db75156753a191e7196fa9af5975fbe6dcd4a01 Mon Sep 17 00:00:00 2001 From: Benedikt Daurer Date: Thu, 8 Apr 2021 21:33:17 +0100 Subject: [PATCH 53/56] Fixed bug in DM stream engines related to smoothing/object update --- ptypy/accelerate/cuda_pycuda/engines/DM_pycuda_stream.py | 4 ++-- ptypy/accelerate/cuda_pycuda/engines/DM_pycuda_streams.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/ptypy/accelerate/cuda_pycuda/engines/DM_pycuda_stream.py b/ptypy/accelerate/cuda_pycuda/engines/DM_pycuda_stream.py index b002c3dd8..8f2454ad7 100644 --- a/ptypy/accelerate/cuda_pycuda/engines/DM_pycuda_stream.py +++ b/ptypy/accelerate/cuda_pycuda/engines/DM_pycuda_stream.py @@ -156,14 +156,14 @@ def engine_iterate(self, num=1): for oID, ob in self.ob.storages.items(): cfact = self.ob_cfact[oID] obn = self.ob_nrm.S[oID] + obb = self.ob_buf.S[oID] if self.p.obj_smooth_std is not None: log(4, 'Smoothing object, cfact is %.2f' % cfact) - obb = self.ob_buf.S[oID] smooth_mfs = [self.p.obj_smooth_std, self.p.obj_smooth_std] self.GSK.convolution(ob.gpu, smooth_mfs, tmp=obb.gpu) # obb.gpu[:] = ob.gpu * cfactf32 - ob.gpu._axpbz(np.complex64(cfact), 0, ob.gpu, stream=self.queue) + ob.gpu._axpbz(np.complex64(cfact), 0, obb.gpu, stream=self.queue) obn.gpu.fill(np.float32(cfact), stream=self.queue) diff --git a/ptypy/accelerate/cuda_pycuda/engines/DM_pycuda_streams.py b/ptypy/accelerate/cuda_pycuda/engines/DM_pycuda_streams.py index 8678de830..741ae3045 100644 --- a/ptypy/accelerate/cuda_pycuda/engines/DM_pycuda_streams.py +++ b/ptypy/accelerate/cuda_pycuda/engines/DM_pycuda_streams.py @@ -260,14 +260,14 @@ def engine_iterate(self, num=1): 
for oID, ob in self.ob.storages.items(): cfact = self.ob_cfact[oID] obn = self.ob_nrm.S[oID] + obb = self.ob_buf.S[oID] if self.p.obj_smooth_std is not None: log(4,'Smoothing object, cfact is %.2f' % cfact) - obb = self.ob_buf.S[oID] smooth_mfs = [self.p.obj_smooth_std, self.p.obj_smooth_std] self.GSK.convolution(ob.gpu, smooth_mfs, tmp=obb.gpu) - ob.gpu._axpbz(np.complex64(cfact), 0, ob.gpu, stream=streamdata.queue) + ob.gpu._axpbz(np.complex64(cfact), 0, obb.gpu, stream=streamdata.queue) obn.gpu.fill(np.float32(cfact), stream=streamdata.queue) self.ex_data.syncback = True From c8d4a232128c0e7a176e914b79437373dceaffbd Mon Sep 17 00:00:00 2001 From: "Benedikt J. Daurer" Date: Fri, 9 Apr 2021 10:51:07 +0100 Subject: [PATCH 54/56] reversing the order of the support constraints (#315) * reversing the order of the support constraints * now making the intended change --- .../accelerate/cuda_pycuda/engines/DM_pycuda.py | 16 ++++++++-------- ptypy/engines/base.py | 10 +++++----- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/ptypy/accelerate/cuda_pycuda/engines/DM_pycuda.py b/ptypy/accelerate/cuda_pycuda/engines/DM_pycuda.py index 50b605680..4b514dbcf 100644 --- a/ptypy/accelerate/cuda_pycuda/engines/DM_pycuda.py +++ b/ptypy/accelerate/cuda_pycuda/engines/DM_pycuda.py @@ -450,14 +450,6 @@ def support_constraint(self, storage=None): for s in self.pr.storages.values(): self.support_constraint(s) - # Real space - support = self._probe_support.get(storage.ID) - if support is not None: - if storage.ID not in self.RSK: - self.RSK[storage.ID] = RealSupportKernel(support.astype(np.complex64)) - self.RSK[storage.ID].allocate() - self.RSK[storage.ID].apply_real_support(storage.gpu) - # Fourier space support = self._probe_fourier_support.get(storage.ID) if support is not None: @@ -467,6 +459,14 @@ def support_constraint(self, storage=None): self.FSK[storage.ID].allocate() self.FSK[storage.ID].apply_fourier_support(storage.gpu) + # Real space + support = self._probe_support.get(storage.ID) + if support is not None: + if storage.ID not in self.RSK: + self.RSK[storage.ID] = RealSupportKernel(support.astype(np.complex64)) + self.RSK[storage.ID].allocate() + self.RSK[storage.ID].apply_real_support(storage.gpu) + def clip_object(self, ob): """ Clips magnitudes of object into given range. diff --git a/ptypy/engines/base.py b/ptypy/engines/base.py index 174628af4..1a6a49cdd 100644 --- a/ptypy/engines/base.py +++ b/ptypy/engines/base.py @@ -175,16 +175,16 @@ def support_constraint(self, storage=None): for s in self.pr.storages.values(): self.support_contraint(s) - # Real space - support = self._probe_support.get(storage.ID) - if support is not None: - storage.data *= support - # Fourier space support = self._probe_fourier_support.get(storage.ID) if support is not None: storage.data[:] = np.fft.ifft2(support * np.fft.fft2(storage.data)) + # Real space + support = self._probe_support.get(storage.ID) + if support is not None: + storage.data *= support + def iterate(self, num=None): """ Compute one or several iterations. From aef47bcc41c1181e1d20724b96482fac8049e061 Mon Sep 17 00:00:00 2001 From: "Benedikt J. Daurer" Date: Fri, 9 Apr 2021 10:53:51 +0100 Subject: [PATCH 55/56] Make basic fourier update a true blend between DM and AP (#288) * Make basic update a true blend between DM and AP * made all DM updates a true blend of DM and AP. 
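For reference, the blended exit-wave update introduced by this patch can be sketched in plain NumPy as below (illustrative names only; `aux` is assumed to already hold the back-propagated, Fourier-constrained wave for one view). With alpha = 1 the step is the classic difference-map update, with alpha = 0 it reduces to an alternate-projections / ePIE-style replacement of the exit wave.

    import numpy as np

    def blended_exit_update(aux, probe, obj_patch, exit_wave, alpha=1.0):
        # aux:       back-propagated, Fourier-constrained wave for this view
        # probe:     probe patch seen by this view (same shape as aux)
        # obj_patch: object patch seen by this view (same shape as aux)
        # exit_wave: current exit-wave estimate, updated in place
        dex = aux - alpha * probe * obj_patch + (alpha - 1.0) * exit_wave
        exit_wave += dex   # alpha=1: DM step; alpha=0: exit_wave becomes aux (AP)
        return dex         # the kernels also write dex back into aux

This is a sketch of the arithmetic only; the engine kernels below apply it per mode and per address entry.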
--- ptypy/accelerate/base/engines/DM_serial.py | 2 +- ptypy/accelerate/base/engines/DM_serial_stream.py | 2 +- ptypy/accelerate/base/kernels.py | 7 ++++--- ptypy/accelerate/cuda_pycuda/cuda/build_exit.cu | 8 ++++++-- ptypy/accelerate/cuda_pycuda/engines/DM_pycuda.py | 2 +- ptypy/accelerate/cuda_pycuda/engines/DM_pycuda_stream.py | 2 +- ptypy/accelerate/cuda_pycuda/engines/DM_pycuda_streams.py | 2 +- ptypy/accelerate/cuda_pycuda/kernels.py | 3 ++- ptypy/engines/utils.py | 6 +++--- 9 files changed, 20 insertions(+), 14 deletions(-) diff --git a/ptypy/accelerate/base/engines/DM_serial.py b/ptypy/accelerate/base/engines/DM_serial.py index b5f779efc..7ddf4af57 100644 --- a/ptypy/accelerate/base/engines/DM_serial.py +++ b/ptypy/accelerate/base/engines/DM_serial.py @@ -319,7 +319,7 @@ def engine_iterate(self, num=1): ## build exit wave t1 = time.time() - AWK.build_exit(aux, addr, ob, pr, ex) + AWK.build_exit(aux, addr, ob, pr, ex, alpha=self.p.alpha) FUK.exit_error(aux,addr) FUK.error_reduce(addr, err_exit) self.benchmark.E_Build_exit += time.time() - t1 diff --git a/ptypy/accelerate/base/engines/DM_serial_stream.py b/ptypy/accelerate/base/engines/DM_serial_stream.py index e3eadc085..ace8cf6d1 100644 --- a/ptypy/accelerate/base/engines/DM_serial_stream.py +++ b/ptypy/accelerate/base/engines/DM_serial_stream.py @@ -139,7 +139,7 @@ def engine_iterate(self, num=1): ## apply changes #2 t1 = time.time() - AWK.build_exit(aux, addr, ob, pr, ex) + AWK.build_exit(aux, addr, ob, pr, ex, alpha=self.p.alpha) self.benchmark.E_Build_exit += time.time() - t1 err_phot = np.zeros_like(err_fourier) diff --git a/ptypy/accelerate/base/kernels.py b/ptypy/accelerate/base/kernels.py index 85f81fec2..b1f109444 100644 --- a/ptypy/accelerate/base/kernels.py +++ b/ptypy/accelerate/base/kernels.py @@ -417,7 +417,7 @@ def build_aux(self, b_aux, addr, ob, pr, ex, alpha=1.0): aux[ind, :, :] = tmp return - def build_exit(self, b_aux, addr, ob, pr, ex): + def build_exit(self, b_aux, addr, ob, pr, ex, alpha=1): sh = addr.shape @@ -433,9 +433,10 @@ def build_exit(self, b_aux, addr, ob, pr, ex): rows, cols = ex.shape[-2:] for ind, (prc, obc, exc, mac, dic) in enumerate(flat_addr): - dex = aux[ind, :, :] - \ + dex = aux[ind, :, :] - alpha * \ ob[obc[0], obc[1]:obc[1] + rows, obc[2]:obc[2] + cols] * \ - pr[prc[0], prc[1]:prc[1] + rows, prc[2]:prc[2] + cols] + pr[prc[0], prc[1]:prc[1] + rows, prc[2]:prc[2] + cols] + (alpha - 1) * \ + ex[exc[0], exc[1]:exc[1] + rows, exc[2]:exc[2] + cols] ex[exc[0], exc[1]:exc[1] + rows, exc[2]:exc[2] + cols] += dex aux[ind, :, :] = dex diff --git a/ptypy/accelerate/cuda_pycuda/cuda/build_exit.cu b/ptypy/accelerate/cuda_pycuda/cuda/build_exit.cu index 8c1127758..2b98634dc 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/build_exit.cu +++ b/ptypy/accelerate/cuda_pycuda/cuda/build_exit.cu @@ -28,12 +28,14 @@ extern "C" __global__ void build_exit(complex* auxiliary_wave, const complex* __restrict__ obj, int H, int I, - const int* __restrict__ addr) + const int* __restrict__ addr, + IN_TYPE alpha_) { int bid = blockIdx.x; int tx = threadIdx.x; int ty = threadIdx.y; const int addr_stride = 15; + const MATH_TYPE alpha = alpha_; // type conversion const int* oa = addr + 3 + bid * addr_stride; const int* pa = addr + bid * addr_stride; @@ -53,7 +55,9 @@ extern "C" __global__ void build_exit(complex* auxiliary_wave, complex auxv = auxiliary_wave[b * C + c]; complex t_probe = probe[b * F + c]; complex t_obj = obj[b * I + c]; - auxv -= t_probe * t_obj; + complex t_exit = exit_wave[b * C + c]; + auxv -= alpha * 
t_probe * t_obj; + auxv += (alpha - 1) * t_exit; exit_wave[b * C + c] += auxv; auxiliary_wave[b * C + c] = auxv; } diff --git a/ptypy/accelerate/cuda_pycuda/engines/DM_pycuda.py b/ptypy/accelerate/cuda_pycuda/engines/DM_pycuda.py index 4b514dbcf..961851072 100644 --- a/ptypy/accelerate/cuda_pycuda/engines/DM_pycuda.py +++ b/ptypy/accelerate/cuda_pycuda/engines/DM_pycuda.py @@ -244,7 +244,7 @@ def engine_iterate(self, num=1): PROP.bw(aux, aux) ## build exit wave - AWK.build_exit(aux, addr, ob, pr, ex) + AWK.build_exit(aux, addr, ob, pr, ex, alpha=self.p.alpha) FUK.exit_error(aux, addr) FUK.error_reduce(addr, err_exit) diff --git a/ptypy/accelerate/cuda_pycuda/engines/DM_pycuda_stream.py b/ptypy/accelerate/cuda_pycuda/engines/DM_pycuda_stream.py index 8f2454ad7..9306475b1 100644 --- a/ptypy/accelerate/cuda_pycuda/engines/DM_pycuda_stream.py +++ b/ptypy/accelerate/cuda_pycuda/engines/DM_pycuda_stream.py @@ -237,7 +237,7 @@ def engine_iterate(self, num=1): PROP.bw(aux, aux) ## apply changes - AWK.build_exit(aux, addr, ob, pr, ex) + AWK.build_exit(aux, addr, ob, pr, ex, alpha=self.p.alpha) FUK.exit_error(aux, addr) FUK.error_reduce(addr, err_exit) diff --git a/ptypy/accelerate/cuda_pycuda/engines/DM_pycuda_streams.py b/ptypy/accelerate/cuda_pycuda/engines/DM_pycuda_streams.py index 741ae3045..d2db342f5 100644 --- a/ptypy/accelerate/cuda_pycuda/engines/DM_pycuda_streams.py +++ b/ptypy/accelerate/cuda_pycuda/engines/DM_pycuda_streams.py @@ -355,7 +355,7 @@ def engine_iterate(self, num=1): t1 = time.time() PROP.bw(aux, aux) ## apply changes - AWK.build_exit(aux, addr, ob, pr, ex) + AWK.build_exit(aux, addr, ob, pr, ex, alpha=self.p.alpha) FUK.exit_error(aux, addr) FUK.error_reduce(addr, err_exit) self.benchmark.E_Build_exit += time.time() - t1 diff --git a/ptypy/accelerate/cuda_pycuda/kernels.py b/ptypy/accelerate/cuda_pycuda/kernels.py index a932be7b2..47dd4cb79 100644 --- a/ptypy/accelerate/cuda_pycuda/kernels.py +++ b/ptypy/accelerate/cuda_pycuda/kernels.py @@ -493,7 +493,7 @@ def build_aux2(self, b_aux, addr, ob, pr, ex, alpha=1.0): int(maxz * nmodes)), stream=self.queue) - def build_exit(self, b_aux, addr, ob, pr, ex): + def build_exit(self, b_aux, addr, ob, pr, ex, alpha=1): obr, obc = self._cache_object_shape(ob) sh = addr.shape nmodes = sh[1] @@ -506,6 +506,7 @@ def build_exit(self, b_aux, addr, ob, pr, ex): ob, obr, obc, addr, + np.float32(alpha) if ex.dtype == np.complex64 else np.float64(alpha), block=(32, 32, 1), grid=(int(maxz * nmodes), 1, 1), stream=self.queue) def build_exit_alpha_tau(self, b_aux, addr, ob, pr, ex, alpha=1, tau=1): diff --git a/ptypy/engines/utils.py b/ptypy/engines/utils.py index fadb012c9..39fcbc93c 100644 --- a/ptypy/engines/utils.py +++ b/ptypy/engines/utils.py @@ -152,7 +152,7 @@ def basic_fourier_update(diff_view, pbound=None, alpha=1., LL_error=True): for name, pod in diff_view.pods.items(): if not pod.active: continue - df = pod.bw(pod.upsample(fm) * f[name]) - pod.probe * pod.object + df = pod.bw(pod.upsample(fm) * f[name]) - alpha * pod.probe * pod.object + (alpha - 1) * pod.exit pod.exit += df err_exit += np.mean(u.abs2(df)) elif err_fmag > pbound: @@ -162,7 +162,7 @@ def basic_fourier_update(diff_view, pbound=None, alpha=1., LL_error=True): for name, pod in diff_view.pods.items(): if not pod.active: continue - df = pod.bw(pod.upsample(fm) * f[name]) - pod.probe * pod.object + df = pod.bw(pod.upsample(fm) * f[name]) - alpha * pod.probe * pod.object + (alpha - 1) * pod.exit pod.exit += df err_exit += np.mean(u.abs2(df)) else: @@ -170,7 +170,7 @@ def 
basic_fourier_update(diff_view, pbound=None, alpha=1., LL_error=True): for name, pod in diff_view.pods.items(): if not pod.active: continue - df = alpha * (pod.probe * pod.object - pod.exit) + df = (pod.probe * pod.object - pod.exit) pod.exit += df err_exit += np.mean(u.abs2(df)) From e70bae1f9a601e1dffab63568e16ef6d4644e111 Mon Sep 17 00:00:00 2001 From: Benedikt Daurer Date: Fri, 9 Apr 2021 11:02:57 +0100 Subject: [PATCH 56/56] Cleaned up debugging traces --- ptypy/accelerate/base/engines/DM_serial.py | 2 - .../base/engines/DM_serial_stream.py | 2 - ptypy/accelerate/base/engines/DR_serial.py | 50 -------- ptypy/accelerate/base/engines/ML_serial.py | 109 ------------------ .../cuda_pycuda/engines/DR_pycuda.py | 2 - ptypy/engines/ML.py | 19 +-- 6 files changed, 1 insertion(+), 183 deletions(-) diff --git a/ptypy/accelerate/base/engines/DM_serial.py b/ptypy/accelerate/base/engines/DM_serial.py index 7ddf4af57..44573bf56 100644 --- a/ptypy/accelerate/base/engines/DM_serial.py +++ b/ptypy/accelerate/base/engines/DM_serial.py @@ -27,8 +27,6 @@ # - Propagator needs to be reconfigurable for a certain batch size, gpyfft hates that. # - Fourier_update_kernel needs to allow batched execution -## for debugging -#from matplotlib import pyplot as plt __all__ = ['DM_serial'] diff --git a/ptypy/accelerate/base/engines/DM_serial_stream.py b/ptypy/accelerate/base/engines/DM_serial_stream.py index ace8cf6d1..2c65511dc 100644 --- a/ptypy/accelerate/base/engines/DM_serial_stream.py +++ b/ptypy/accelerate/base/engines/DM_serial_stream.py @@ -29,8 +29,6 @@ # - Propagator needs to be reconfigurable for a certain batch size, gpyfft hates that. # - Fourier_update_kernel needs to allow batched execution -## for debugging -#from matplotlib import pyplot as plt __all__ = ['DM_serial_stream'] diff --git a/ptypy/accelerate/base/engines/DR_serial.py b/ptypy/accelerate/base/engines/DR_serial.py index 31fc43b95..b13828919 100644 --- a/ptypy/accelerate/base/engines/DR_serial.py +++ b/ptypy/accelerate/base/engines/DR_serial.py @@ -22,8 +22,6 @@ from ptypy.accelerate.base import address_manglers from ptypy.accelerate.base import array_utils as au -# for debugging -import h5py, sys __all__ = ['DR_serial'] @@ -91,16 +89,6 @@ class DR_serial(PositionCorrectionEngine): type = bool help = A switch for computing the fourier error (this can impact the performance of the engine) - [debug] - default = None - type = str - help = For debugging purposes, dump arrays into given directory - - [debug_iter] - default = 0 - type = int - help = For debugging purposes, dump arrays at this iteration - """ SUPPORTED_MODELS = [Full, Vanilla, Bragg3dModel, BlockVanilla, BlockFull] @@ -327,17 +315,6 @@ def engine_iterate(self, num=1): err_fourier = prep.err_fourier[i,None] err_exit = prep.err_exit[i,None] - # debugging - if self.p.debug and parallel.master and (self.curiter == self.p.debug_iter): - with h5py.File(self.p.debug + "/before_%04d.h5" %self.curiter, "w") as f: - f["aux"] = aux - f["addr"] = addr - f["ob"] = ob - f["pr"] = pr - f["mag"] = mag - f["ma"] = ma - f["ma_sum"] = ma_sum - ## build auxilliary wave t1 = time.time() AWK.build_aux(aux, addr, ob, pr, ex, alpha=self.p.alpha) @@ -376,43 +353,17 @@ def engine_iterate(self, num=1): #if self.p.rescale_probe: # pr *= np.sqrt(self.mean_power / (np.abs(pr)**2).mean()) - # debugging - if self.p.debug and parallel.master and (self.curiter == self.p.debug_iter): - with h5py.File(self.p.debug + "/before_aux_no_ex_%04d.h5" %self.curiter, "w") as f: - f["aux"] = aux - f["addr"] = addr - f["ob"] = 
ob - f["pr"] = pr - ## build auxilliary wave (ob * pr product) t1 = time.time() AWK.build_aux_no_ex(aux, addr, ob, pr) self.benchmark.A_Build_aux += time.time() - t1 - # debugging - if self.p.debug and parallel.master and (self.curiter == self.p.debug_iter): - with h5py.File(self.p.debug + "/ob_update_local_%04d.h5" %self.curiter, "w") as f: - f["aux"] = aux - f["addr"] = addr - f["ob"] = ob - f["pr"] = pr - f["ex"] = ex - # object update t1 = time.time() POK.ob_update_local(addr, ob, pr, ex, aux) self.benchmark.object_update += time.time() - t1 self.benchmark.calls_object += 1 - # debugging - if self.p.debug and parallel.master and (self.curiter == self.p.debug_iter): - with h5py.File(self.p.debug + "/pr_update_local_%04d.h5" %self.curiter, "w") as f: - f["aux"] = aux - f["addr"] = addr - f["ob"] = ob - f["pr"] = pr - f["ex"] = ex - # probe update t1 = time.time() POK.pr_update_local(addr, pr, ob, ex, aux) @@ -422,7 +373,6 @@ def engine_iterate(self, num=1): ## compute log-likelihood if self.p.compute_log_likelihood: t1 = time.time() - #AWK.build_aux_no_ex(aux, addr, ob, pr) aux[:] = FW(aux) FUK.log_likelihood(aux, addr, mag, ma, err_phot) self.benchmark.F_LLerror += time.time() - t1 diff --git a/ptypy/accelerate/base/engines/ML_serial.py b/ptypy/accelerate/base/engines/ML_serial.py index 7ad06c69d..214aa0536 100644 --- a/ptypy/accelerate/base/engines/ML_serial.py +++ b/ptypy/accelerate/base/engines/ML_serial.py @@ -24,8 +24,6 @@ from ptypy.accelerate.base.kernels import GradientDescentKernel, AuxiliaryWaveKernel, PoUpdateKernel from ptypy.accelerate.base import address_manglers -# for debugging -import h5py __all__ = ['ML_serial'] @@ -178,11 +176,6 @@ def engine_iterate(self, num=1): # probe/object rescaling if self.p.scale_precond: - if self.p.debug and parallel.master and (self.curiter == self.p.debug_iter): - with h5py.File(self.p.debug + "/ml_serial_o_p_norm_%04d.h5" %self.curiter, "w") as f: - f["cn2_new_pr_grad"] = cn2_new_pr_grad - f["cn2_new_ob_grad"] = cn2_new_ob_grad - if cn2_new_pr_grad > 1e-5: scale_p_o = (self.p.scale_probe_object * cn2_new_ob_grad / cn2_new_pr_grad) @@ -343,88 +336,21 @@ def new_grad(self): prg = pr_grad.S[pID].data I = self.engine.di.S[dID].data - # debugging - if self.p.debug and parallel.master and (self.engine.curiter == self.p.debug_iter): - with h5py.File(self.p.debug + "/build_aux_no_ex_%04d.h5" %self.engine.curiter, "w") as f: - f["aux"] = aux - f["addr"] = addr - f["ob"] = ob - f["pr"] = pr - # make propagated exit (to buffer) AWK.build_aux_no_ex(aux, addr, ob, pr, add=False) - # debugging - if self.p.debug and parallel.master and (self.engine.curiter == self.p.debug_iter): - with h5py.File(self.p.debug + "/forward_%04d.h5" %self.engine.curiter, "w") as f: - f["aux"] = aux - # forward prop aux[:] = FW(aux) - # debugging - if self.p.debug and parallel.master and (self.engine.curiter == self.p.debug_iter): - with h5py.File(self.p.debug + "/make_model_%04d.h5" %self.engine.curiter, "w") as f: - f["aux"] = aux - f["addr"] = addr - GDK.make_model(aux, addr) - - # debugging - if self.p.debug and parallel.master and (self.engine.curiter == self.p.debug_iter): - with h5py.File(self.p.debug + "/floating_intensities_%04d.h5" %self.engine.curiter, "w") as f: - f["w"] = w - f["addr"] = addr - f["I"] = I - f["fic"] = fic - f["Imodel"] = GDK.npy.Imodel - if self.p.floating_intensities: GDK.floating_intensity(addr, w, I, fic) - - # debugging - if self.p.debug and parallel.master and (self.engine.curiter == self.p.debug_iter): - with h5py.File(self.p.debug + 
"/main_%04d.h5" %self.engine.curiter, "w") as f: - f["aux"] = aux - f["addr"] = addr - f["w"] = w - f["I"] = I - GDK.main(aux, addr, w, I) - - # debugging - if self.p.debug and parallel.master and (self.engine.curiter == self.p.debug_iter): - with h5py.File(self.p.debug + "/error_reduce_%04d.h5" %self.engine.curiter, "w") as f: - f["addr"] = addr - f["err_phot"] = err_phot - GDK.error_reduce(addr, err_phot) - # debugging - if self.p.debug and parallel.master and (self.engine.curiter == self.p.debug_iter): - with h5py.File(self.p.debug + "/backward_%04d.h5" %self.engine.curiter, "w") as f: - f["aux"] = aux - aux[:] = BW(aux) - # debugging - if self.p.debug and parallel.master and (self.engine.curiter == self.p.debug_iter): - with h5py.File(self.p.debug + "/op_update_ml_%04d.h5" %self.engine.curiter, "w") as f: - f["aux"] = aux - f["addr"] = addr - f["obg"] = obg - f["pr"] = pr - POK.ob_update_ML(addr, obg, pr, aux) - - # debugging - if self.p.debug and parallel.master and (self.engine.curiter == self.p.debug_iter): - with h5py.File(self.p.debug + "/pr_update_ml_%04d.h5" %self.engine.curiter, "w") as f: - f["aux"] = aux - f["addr"] = addr - f["ob"] = ob - f["prg"] = prg - POK.pr_update_ML(addr, prg, ob, aux) for dID, prep in self.engine.diff_info.items(): @@ -444,12 +370,6 @@ def new_grad(self): # Object regularizer if self.regularizer: for name, s in self.engine.ob.storages.items(): - - # debugging - if self.p.debug and parallel.master and (self.engine.curiter == self.p.debug_iter): - with h5py.File(self.p.debug + "/regul_grad_%04d.h5" %self.engine.curiter, "w") as f: - f["ob"] = s.data - ob_grad.storages[name].data += self.regularizer.grad(s.data) LL += self.regularizer.LL @@ -506,29 +426,7 @@ def poly_line_coeffs(self, c_ob_h, c_pr_h): a[:] = FW(a) b[:] = FW(b) - # debugging - if self.p.debug and parallel.master and (self.engine.curiter == self.p.debug_iter): - with h5py.File(self.p.debug + "/make_a012_%04d.h5" %self.engine.curiter, "w") as g: - g["addr"] = addr - g["a"] = a - g["b"] = b - g["f"] = f - g["I"] = I - g["fic"] = fic - GDK.make_a012(f, a, b, addr, I, fic) - - # debugging - if self.p.debug and parallel.master and (self.engine.curiter == self.p.debug_iter): - with h5py.File(self.p.debug + "/fill_b_%04d.h5" %self.engine.curiter, "w") as f: - f["addr"] = addr - f["Brenorm"] = Brenorm - f["w"] = w - f["B"] = B - f["A0"] = GDK.npy.Imodel - f["A1"] = GDK.npy.LLerr - f["A2"] = GDK.npy.LLden - GDK.fill_b(addr, Brenorm, w, B) parallel.allreduce(B) @@ -536,13 +434,6 @@ def poly_line_coeffs(self, c_ob_h, c_pr_h): # Object regularizer if self.regularizer: for name, s in self.ob.storages.items(): - - # debugging - if self.p.debug and parallel.master and (self.engine.curiter == self.p.debug_iter): - with h5py.File(self.p.debug + "/regul_poly_line_coeffs_%04d.h5" %self.engine.curiter, "w") as f: - f["ob"] = s.data - f["obh"] = c_ob_h.storages[name].data - B += Brenorm * self.regularizer.poly_line_coeffs( c_ob_h.storages[name].data, s.data) diff --git a/ptypy/accelerate/cuda_pycuda/engines/DR_pycuda.py b/ptypy/accelerate/cuda_pycuda/engines/DR_pycuda.py index 879411178..0454e753c 100644 --- a/ptypy/accelerate/cuda_pycuda/engines/DR_pycuda.py +++ b/ptypy/accelerate/cuda_pycuda/engines/DR_pycuda.py @@ -26,8 +26,6 @@ MPI = False -# debugging -import sys __all__ = ['DR_pycuda'] diff --git a/ptypy/engines/ML.py b/ptypy/engines/ML.py index e0059ca59..f6009e9b8 100644 --- a/ptypy/engines/ML.py +++ b/ptypy/engines/ML.py @@ -22,8 +22,6 @@ from .base import BaseEngine from ..core.manager import Full, 
Vanilla, Bragg3dModel, BlockVanilla, BlockFull -# for debugging -import h5py __all__ = ['ML'] @@ -101,17 +99,7 @@ class ML(BaseEngine): type = int lowlim = 0 help = Number of iterations before probe update starts - - [debug] - default = None - type = str - help = For debugging purposes, dump arrays into given directory - - [debug_iter] - default = 0 - type = int - help = For debugging purposes, dump arrays at this iteration - + """ SUPPORTED_MODELS = [Full, Vanilla, Bragg3dModel, BlockVanilla, BlockFull] @@ -245,11 +233,6 @@ def engine_iterate(self, num=1): if self.p.scale_precond: cn2_new_pr_grad = Cnorm2(new_pr_grad) cn2_new_ob_grad = Cnorm2(new_ob_grad) - if self.p.debug and parallel.master and (self.curiter == self.p.debug_iter): - with h5py.File(self.p.debug + "/ml_o_p_norm_%04d.h5" %self.curiter, "w") as f: - f["cn2_new_pr_grad"] = cn2_new_pr_grad - f["cn2_new_ob_grad"] = cn2_new_ob_grad - if cn2_new_pr_grad > 1e-5: scale_p_o = (self.p.scale_probe_object * cn2_new_ob_grad / cn2_new_pr_grad)
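The scale-preconditioner ratio visible in the final hunk above (context left unchanged by this patch) can be sketched roughly as follows. Names are illustrative: Cnorm2 is stood in for by a plain squared 2-norm, and the fallback value when the probe gradient vanishes is an assumption, since the engine's actual else-branch is not shown in this hunk.

    import numpy as np

    def probe_object_scale(new_ob_grad, new_pr_grad, scale_probe_object=1.0, eps=1e-5):
        # Stand-in for Cnorm2: squared 2-norm of the (possibly complex) gradients
        cn2_pr = np.vdot(new_pr_grad, new_pr_grad).real
        cn2_ob = np.vdot(new_ob_grad, new_ob_grad).real
        # Balance probe vs. object step sizes by the ratio of gradient energies,
        # guarding against a vanishing probe gradient
        if cn2_pr > eps:
            return scale_probe_object * cn2_ob / cn2_pr
        return 1.0  # assumed fallback; the real engine's behaviour is outside this hunk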