diff --git a/archive/cuda_extension/extensions.py b/archive/cuda_extension/extensions.py new file mode 100644 index 000000000..f0dfc0010 --- /dev/null +++ b/archive/cuda_extension/extensions.py @@ -0,0 +1,102 @@ +''' +These are the optional extensions for ptypy +''' + + +from distutils.version import LooseVersion +from distutils.extension import Extension +import os +import multiprocessing +import subprocess +import re +import numpy as np + + +# this is a hacky version, but is the desired behaviour +class AccelerationExtension(object): + def __init__(self, debug=False): + self.debug = debug + self._options = None + + def get_full_options(self): + return self._options + + def get_reflection_options(self): + user_options = [] + boolean_options = [] + for name, description in self._options.items(): + if isinstance(description['default'], str): + user_options.append((name+'=', None, description['doc'])) + elif isinstance(description['default'], bool): + user_options.append((name, None, description['doc'])) + boolean_options.append(name) + else: + raise NotImplementedError("Don't know what to do with parameter: %s of type: %s" % (name, type(description['default']))) + return user_options, boolean_options + + def build(self, options): + raise NotImplementedError('You need to implement the build method!') + + def getExtension(self): + raise NotImplementedError('You need to return a cython extension object.') + + +class CudaExtension(AccelerationExtension): # probably going to inherit from something. + def __init__(self, *args, **kwargs): + super(CudaExtension, self).__init__(*args, **kwargs) + self._options = {'cudadir': {'default': '', + 'doc': 'CUDA directory'}, + 'cudaflags': {'default': '-gencode arch=compute_35,\\"code=sm_35\\" ' + + '-gencode arch=compute_37,\\"code=sm_37\\" ' + + '-gencode arch=compute_52,\\"code=sm_52\\" ' + + '-gencode arch=compute_60,\\"code=sm_60\\" ' + + '-gencode arch=compute_70,\\"code=sm_70\\" ', + 'doc': 'Flags to the CUDA compiler'}, + 'gputiming': {'default': False, + 'doc': 'Do GPU timing'}} + + def build(self, options): + cudadir = options['cudadir'] + cudaflags = options['cudaflags'] + gputiming = options['gputiming'] + try: + out = subprocess.check_output(['cmake', '--version']) + except OSError: + raise RuntimeError( + "CMake must be installed to build the CUDA extensions.") + + cmake_version = LooseVersion(re.search(r'version\s*([\d.]+)', + out.decode()).group(1)) + if cmake_version < '3.8.0': + raise RuntimeError("CMake >= 3.8.0 is required") + + srcdir = os.path.abspath('cuda') + buildtmp = os.path.abspath(os.path.join('build', 'cuda')) + cmake_args = [ + "-DCMAKE_BUILD_TYPE=" + ("Debug" if self.debug else "Release"), + '-DCMAKE_CUDA_FLAGS={}'.format(cudaflags), + '-DGPU_TIMING={}'.format("ON" if gputiming else "OFF") + ] + if cudadir: + cmake_args.append('-DCMAKE_CUDA_COMPILER="{}/bin/nvcc"'.format(cudadir)) + build_args = ["--config", "Debug" if self.debug else "Release", "--", "-j{}".format(multiprocessing.cpu_count() + 1)] + if not os.path.exists(buildtmp): + os.makedirs(buildtmp) + env = os.environ.copy() + subprocess.check_call(['cmake', srcdir] + cmake_args, + cwd=buildtmp, env=env) + subprocess.check_call(['cmake', '--build', '.'] + build_args, + cwd=buildtmp) + print("Complete.") + + def getExtension(self): + libdirs = ['build/cuda'] + if 'LD_LIBRARY_PATH' in os.environ: + libdirs += os.environ['LD_LIBRARY_PATH'].split(':') + return Extension('*', + sources=['ptypy/accelerate/cuda/gpu_extension.pyx'], + include_dirs=[np.get_include()], +
libraries=['gpu_extension', 'cudart', 'cufft'], + library_dirs=libdirs, + depends=['build/cuda/libgpu_extension.a', ], + language="c++") diff --git a/archive/misc/mpitest.cpp b/archive/misc/mpitest.cpp new file mode 100644 index 000000000..e4ff84577 --- /dev/null +++ b/archive/misc/mpitest.cpp @@ -0,0 +1,47 @@ +/** This is a simple C++ test to check if cuda-aware MPI works as + * expected. + * It allocates a GPU array and puts 1s into it, then sends it + * across MPI to the receiving rank, which transfers back to + * host and outputs the values. + * The expected output is: + * + * Received 1, 1 + * + * Compile with: + * mpic++ -o test mpitest.cpp -L/path/to/cuda/libs -lcudart + * + * Run with: + * mpirun -np 2 test + */ + +#include <mpi.h> +#include <cuda_runtime.h> +#include <iostream> + +int main(int argc, char** argv) +{ + MPI_Init(&argc, &argv); + + int rank; + MPI_Status status; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + + if (rank == 0) { + int* d_send; + cudaMalloc((void**)&d_send, 2*sizeof(int)); + int h_send[] = {1, 1}; + cudaMemcpy(d_send, h_send, 2*sizeof(int), cudaMemcpyHostToDevice); + MPI_Send(d_send, 2, MPI_INT, 1, 99, MPI_COMM_WORLD); + std::cout << "Data has been sent...\n"; + } else if (rank == 1) { + int* d_recv; + cudaMalloc((void**)&d_recv, 2*sizeof(int)); + MPI_Recv(d_recv, 2, MPI_INT, 0, 99, MPI_COMM_WORLD, &status); + int h_recv[2]; + cudaMemcpy(h_recv, d_recv, 2*sizeof(int), cudaMemcpyDeviceToHost); + std::cout << "Received " << h_recv[0] << ", " << h_recv[1] << "\n"; + } + + MPI_Finalize(); +} \ No newline at end of file diff --git a/benchmark/diamond_benchmarks/ML_accurracy_test.py b/benchmark/diamond_benchmarks/ML_accurracy_test.py new file mode 100644 index 000000000..a8da654ac --- /dev/null +++ b/benchmark/diamond_benchmarks/ML_accurracy_test.py @@ -0,0 +1,404 @@ +''' +Load real data and prepare an accuracy report of GPU vs numpy +''' + +import h5py +import numpy as np +import csv + +import pycuda.driver as cuda +from pycuda import gpuarray + +from ptypy.accelerate.cuda_pycuda.kernels import GradientDescentKernel +from ptypy.accelerate.base.kernels import GradientDescentKernel as BaseGradientDescentKernel + + +class GradientDescentAccuracyTester: + + datadir = "/dls/science/users/iat69393/gpu-hackathon/test-data-%s/" + rtol = 1e-6 + atol = 1e-6 + headings = ['Kernel', 'Version', 'Iter', 'MATH_TYPE', 'IN/OUT_TYPE', + 'ACC_TYPE', 'Array', 'num_elements', 'num_errors', 'max_relerr', 'max_abserr'] + + def __init__(self): + import sys + np.set_printoptions(threshold=sys.maxsize, linewidth=np.inf) + cuda.init() + self.device = cuda.Device(0) + self.ctx = self.device.make_context() + self.stream = cuda.Stream() + self.results = [] + + def __del__(self): + np.set_printoptions() + self.ctx.pop() + self.ctx.detach() + + def test_make_model(self, name, iter, + math_type={'float', 'double'}, + data_type={'float', 'double'}): + + res = [] + + # Load data + with h5py.File(self.datadir % name + "make_model_%04d.h5" % iter, "r") as f: + aux = f["aux"][:] + addr = f["addr"][:] + + # CPU Kernel + BGDK = BaseGradientDescentKernel(aux, addr.shape[1]) + BGDK.allocate() + BGDK.make_model(aux, addr) + ref = BGDK.npy.Imodel + + # GPU variants + addr_dev = gpuarray.to_gpu(addr) + for d in data_type: + if d == 'float': + aux_dev = gpuarray.to_gpu(aux.astype(np.complex64)) + else: + aux_dev = gpuarray.to_gpu(aux.astype(np.complex128)) + for m in math_type: + # data type will be determined based on aux_dev data type automatically + GDK = GradientDescentKernel( + aux_dev, addr.shape[1], queue=self.stream, math_type=m) +
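# allocate() creates the GPU-side buffers (Imodel, LLerr, LLden) that the kernel calls below fill in +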
GDK.allocate() + GDK.make_model(aux_dev, addr_dev) + act = GDK.gpu.Imodel.get() + + num, num_mis, max_abs, max_rel = self._calc_diffs(act, ref) + + line = ['make_model', name, iter, d, m, 'N/A', + 'Imodel', num, num_mis, max_rel, max_abs] + print(line) + res.append(line) + + return res + + def test_floating_intensity(self, name, iter, + math_type={'float', 'double'}, + data_type={'float', 'double'}, + acc_type={'float', 'double'}): + + # note that this is actually calling 4 kernels: + # - floating_intensity_cuda_step1 + # - error_reduce_cuda (2x) + # - floating_intensity_cuda_step2 + + res = [] + + # Load data + with h5py.File(self.datadir % name + "floating_intensities_%04d.h5" % iter, "r") as f: + w = f["w"][:] + addr = f["addr"][:] + I = f["I"][:] + fic = f["fic"][:] + Imodel = f["Imodel"][:] + with h5py.File(self.datadir % name + "make_model_%04d.h5" % iter, "r") as f: + aux = f["aux"][:] + + # CPU Kernel + ficref = np.copy(fic) + Iref = np.copy(Imodel) + BGDK = BaseGradientDescentKernel(aux, addr.shape[1]) + BGDK.allocate() + BGDK.npy.Imodel = Iref + BGDK.floating_intensity(addr, w, I, ficref) # modifies fic, Imodel + Iref = BGDK.npy.Imodel + + addr_dev = gpuarray.to_gpu(addr) + for d in data_type: + for m in math_type: + for a in acc_type: + if d == 'float': + aux_dev = gpuarray.to_gpu(aux.astype(np.complex64)) + I_dev = gpuarray.to_gpu(I.astype(np.float32)) + fic_dev = gpuarray.to_gpu(fic.astype(np.float32)) + w_dev = gpuarray.to_gpu(w.astype(np.float32)) + Imodel_dev = gpuarray.to_gpu(Imodel.astype(np.float32)) + else: + aux_dev = gpuarray.to_gpu(aux.astype(np.complex128)) + I_dev = gpuarray.to_gpu(I.astype(np.float64)) + fic_dev = gpuarray.to_gpu(fic.astype(np.float64)) + w_dev = gpuarray.to_gpu(w.astype(np.float64)) + Imodel_dev = gpuarray.to_gpu(Imodel.astype(np.float64)) + + # GPU kernel + GDK = GradientDescentKernel( + aux_dev, addr.shape[1], accumulate_type=a, math_type=m, queue=self.stream) + GDK.allocate() + GDK.gpu.Imodel = Imodel_dev + GDK.floating_intensity(addr_dev, w_dev, I_dev, fic_dev) + + Iact = GDK.gpu.Imodel.get() + fact = fic_dev.get() + + num, num_mis, max_abs, max_rel = self._calc_diffs( + Iact, Iref) + line = ['floating_intensity', name, iter, d, m, + a, 'Imodel', num, num_mis, max_rel, max_abs] + print(line) + res.append(line) + + num, num_mis, max_abs, max_rel = self._calc_diffs( + fact, ficref) + line = ['floating_intensity', name, iter, d, m, + a, 'fic', num, num_mis, max_rel, max_abs] + print(line) + res.append(line) + + return res + + def test_main_and_error_reduce(self, name, iter, + math_type={'float', 'double'}, + data_type={'float', 'double'}, + acc_type={'float', 'double'}): + + res = [] + + # Load data + with h5py.File(self.datadir % name + "main_%04d.h5" % iter, "r") as f: + aux = f["aux"][:] + addr = f["addr"][:] + w = f["w"][:] + I = f["I"][:] + # Load data + with h5py.File(self.datadir % name + "error_reduce_%04d.h5" % iter, "r") as f: + err_phot = f["err_phot"][:] + + # CPU Kernel + auxref = np.copy(aux) + errref = np.copy(err_phot) + BGDK = BaseGradientDescentKernel(aux, addr.shape[1]) + BGDK.allocate() + BGDK.main(auxref, addr, w, I) + BGDK.error_reduce(addr, errref) + LLerrref = BGDK.npy.LLerr + + addr_dev = gpuarray.to_gpu(addr) + for d in data_type: + for m in math_type: + for a in acc_type: + if d == 'float': + aux_dev = gpuarray.to_gpu(aux.astype(np.complex64)) + I_dev = gpuarray.to_gpu(I.astype(np.float32)) + w_dev = gpuarray.to_gpu(w.astype(np.float32)) + err_phot_dev = gpuarray.to_gpu( + err_phot.astype(np.float32)) + else: + aux_dev 
= gpuarray.to_gpu(aux.astype(np.complex128)) + I_dev = gpuarray.to_gpu(I.astype(np.float64)) + w_dev = gpuarray.to_gpu(w.astype(np.float64)) + err_phot_dev = gpuarray.to_gpu( + err_phot.astype(np.float64)) + + # GPU kernel + GDK = GradientDescentKernel( + aux_dev, addr.shape[1], accumulate_type=a, math_type=m) + GDK.allocate() + GDK.main(aux_dev, addr_dev, w_dev, I_dev) + GDK.error_reduce(addr_dev, err_phot_dev) + + num, num_mis, max_abs, max_rel = self._calc_diffs( + auxref, aux_dev.get()) + line = ['main_and_error_reduce', name, iter, d, + m, a, 'aux', num, num_mis, max_rel, max_abs] + print(line) + res.append(line) + + num, num_mis, max_abs, max_rel = self._calc_diffs( + LLerrref, GDK.gpu.LLerr.get()) + line = ['main_and_error_reduce', name, iter, d, + m, a, 'LLerr', num, num_mis, max_rel, max_abs] + print(line) + res.append(line) + + num, num_mis, max_abs, max_rel = self._calc_diffs( + errref, err_phot_dev.get()) + line = ['main_and_error_reduce', name, iter, d, m, + a, 'err_phot', num, num_mis, max_rel, max_abs] + print(line) + res.append(line) + + return res + + def test_make_a012(self, name, iter, + math_type={'float', 'double'}, + data_type={'float', 'double'}, + acc_type={'float', 'double'}): + + # Reduce the array size to make the tests run faster + Nmax = 10 + Ymax = 128 + Xmax = 128 + + res = [] + + # Load data + with h5py.File(self.datadir % name + "make_a012_%04d.h5" % iter, "r") as g: + addr = g["addr"][:Nmax] + I = g["I"][:Nmax, :Ymax, :Xmax] + b_f = g["f"][:Nmax, :Ymax, :Xmax] + b_a = g["a"][:Nmax, :Ymax, :Xmax] + b_b = g["b"][:Nmax, :Ymax, :Xmax] + fic = g["fic"][:Nmax] + with h5py.File(self.datadir % name + "make_model_%04d.h5" % iter, "r") as h: + aux = h["aux"][:Nmax, :Ymax, :Xmax] + + # CPU Kernel + BGDK = BaseGradientDescentKernel(aux, addr.shape[1]) + BGDK.allocate() + BGDK.make_a012(b_f, b_a, b_b, addr, I, fic) + Imodelref = BGDK.npy.Imodel + LLerrref = BGDK.npy.LLerr + LLdenref = BGDK.npy.LLden + + addr_dev = gpuarray.to_gpu(addr) + for d in data_type: + for m in math_type: + for a in acc_type: + if d == 'float': + aux_dev = gpuarray.to_gpu(aux.astype(np.complex64)) + I_dev = gpuarray.to_gpu(I.astype(np.float32)) + b_f_dev = gpuarray.to_gpu(b_f.astype(np.complex64)) + b_a_dev = gpuarray.to_gpu(b_a.astype(np.complex64)) + b_b_dev = gpuarray.to_gpu(b_b.astype(np.complex64)) + fic_dev = gpuarray.to_gpu(fic.astype(np.float32)) + else: + aux_dev = gpuarray.to_gpu(aux.astype(np.complex128)) + I_dev = gpuarray.to_gpu(I.astype(np.float64)) + b_f_dev = gpuarray.to_gpu(b_f.astype(np.complex128)) + b_a_dev = gpuarray.to_gpu(b_a.astype(np.complex128)) + b_b_dev = gpuarray.to_gpu(b_b.astype(np.complex128)) + fic_dev = gpuarray.to_gpu(fic.astype(np.float64)) + + GDK = GradientDescentKernel(aux_dev, addr.shape[1], queue=self.stream, + math_type=m, accumulate_type=a) + GDK.allocate() + GDK.gpu.Imodel.fill(np.nan) + GDK.gpu.LLerr.fill(np.nan) + GDK.gpu.LLden.fill(np.nan) + GDK.make_a012(b_f_dev, b_a_dev, b_b_dev, + addr_dev, I_dev, fic_dev) + + num, num_mis, max_abs, max_rel = self._calc_diffs( + LLerrref, GDK.gpu.LLerr.get()) + line = ['make_a012', name, iter, d, m, a, + 'LLerr', num, num_mis, max_rel, max_abs] + print(line) + res.append(line) + + num, num_mis, max_abs, max_rel = self._calc_diffs( + LLdenref, GDK.gpu.LLden.get()) + line = ['make_a012', name, iter, d, m, a, + 'LLden', num, num_mis, max_rel, max_abs] + print(line) + res.append(line) + + num, num_mis, max_abs, max_rel = self._calc_diffs( + Imodelref, GDK.gpu.Imodel.get()) + line = ['make_a012', name, iter, d, m, a, 
+ 'Imodel', num, num_mis, max_rel, max_abs] + print(line) + res.append(line) + + return res + + def test_fill_b(self, name, iter, + math_type={'float', 'double'}, + data_type={'float', 'double'}, + acc_type={'float', 'double'}): + + res = [] + + # Load data + + Nmax = 10 + Ymax = 128 + Xmax = 128 + + with h5py.File(self.datadir % name + "fill_b_%04d.h5" % iter, "r") as f: + w = f["w"][:Nmax, :Ymax, :Xmax] + addr = f["addr"][:] + B = f["B"][:] + Brenorm = f["Brenorm"][...] + A0 = f["A0"][:Nmax, :Ymax, :Xmax] + A1 = f["A1"][:Nmax, :Ymax, :Xmax] + A2 = f["A2"][:Nmax, :Ymax, :Xmax] + with h5py.File(self.datadir % name + "make_model_%04d.h5" % iter, "r") as f: + aux = f["aux"][:Nmax, :Ymax, :Xmax] + + # CPU Kernel + Bref = np.copy(B) + BGDK = BaseGradientDescentKernel(aux, addr.shape[1]) + BGDK.allocate() + BGDK.npy.Imodel = A0 + BGDK.npy.LLerr = A1 + BGDK.npy.LLden = A2 + BGDK.fill_b(addr, Brenorm, w, Bref) + + addr_dev = gpuarray.to_gpu(addr) + for d in data_type: + for m in math_type: + for a in acc_type: + if d == 'float': + aux_dev = gpuarray.to_gpu(aux.astype(np.complex64)) + w_dev = gpuarray.to_gpu(w.astype(np.float32)) + B_dev = gpuarray.to_gpu(B.astype(np.float32)) + A0_dev = gpuarray.to_gpu(A0.astype(np.float32)) + A1_dev = gpuarray.to_gpu(A1.astype(np.float32)) + A2_dev = gpuarray.to_gpu(A2.astype(np.float32)) + else: + aux_dev = gpuarray.to_gpu(aux.astype(np.complex128)) + w_dev = gpuarray.to_gpu(w.astype(np.float64)) + B_dev = gpuarray.to_gpu(B.astype(np.float64)) + A0_dev = gpuarray.to_gpu(A0.astype(np.float64)) + A1_dev = gpuarray.to_gpu(A1.astype(np.float64)) + A2_dev = gpuarray.to_gpu(A2.astype(np.float64)) + + GDK = GradientDescentKernel( + aux_dev, addr.shape[1], queue=self.stream, math_type=m, accumulate_type=a) + GDK.allocate() + GDK.gpu.Imodel = A0_dev + GDK.gpu.LLerr = A1_dev + GDK.gpu.LLden = A2_dev + GDK.fill_b(addr_dev, Brenorm, w_dev, B_dev) + + num, num_mis, max_abs, max_rel = self._calc_diffs( + Bref, B_dev.get()) + line = ['fill_b', name, iter, d, m, a, + 'B', num, num_mis, max_rel, max_abs] + print(line) + res.append(line) + + return res + + def _calc_diffs(self, act, ref): + diffs = np.abs(ref - act) + max_abs = np.max(diffs[:]) + aref = np.abs(ref[:]) + max_rel = np.max( + np.divide(diffs[:], aref, out=np.zeros_like(diffs[:]), where=aref > 0)) + num_mis = np.count_nonzero(diffs[:] > self.atol + self.rtol * aref) + num = np.prod(ref.shape) + + return num, num_mis, max_abs, max_rel + + +tester = GradientDescentAccuracyTester() +print(tester.headings) + +res = [tester.headings] +for ver in [("base", 10), ("regul", 50), ("floating", 0)]: + res += tester.test_make_model(*ver) + res += tester.test_floating_intensity(*ver) + res += tester.test_main_and_error_reduce(*ver) + res += tester.test_make_a012(*ver) + res += tester.test_fill_b(*ver) + +with open('ML_accuracy_test_results.csv', 'w', newline='') as f: + writer = csv.writer(f) + writer.writerows(res) + +print('Done.') diff --git a/benchmark/diamond_benchmarks/moonflower_scripts/i14_2.py b/benchmark/diamond_benchmarks/moonflower_scripts/i14_2.py index 0c9927ea9..414b785b3 100644 --- a/benchmark/diamond_benchmarks/moonflower_scripts/i14_2.py +++ b/benchmark/diamond_benchmarks/moonflower_scripts/i14_2.py @@ -7,6 +7,8 @@ from ptypy.core import Ptycho from ptypy import utils as u import time +from ptypy.accelerate.cuda_pycuda.engines.DM_pycuda_stream import DM_pycuda_stream +from ptypy.accelerate.cuda_pycuda.engines.DM_pycuda_streams import DM_pycuda_streams import os import getpass diff --git 
a/benchmark/diamond_benchmarks/moonflower_scripts/i14_3.py b/benchmark/diamond_benchmarks/moonflower_scripts/i14_3.py new file mode 100644 index 000000000..15e1c7513 --- /dev/null +++ b/benchmark/diamond_benchmarks/moonflower_scripts/i14_3.py @@ -0,0 +1,75 @@ +""" +This script is a test for ptychographic reconstruction in the absence +of actual data. It uses the test Scan class +`ptypy.core.data.MoonFlowerScan` to provide "data". +""" + +from ptypy.core import Ptycho +from ptypy import utils as u +import time +from ptypy.accelerate.cuda_pycuda.engines.DM_pycuda_stream import DM_pycuda_stream +from ptypy.accelerate.cuda_pycuda.engines.DM_pycuda_streams import DM_pycuda_streams + +import os +import getpass +from pathlib import Path +username = getpass.getuser() +tmpdir = os.path.join('/dls/tmp', username, 'dumps', 'ptypy') +Path(tmpdir).mkdir(parents=True, exist_ok=True) + +p = u.Param() + +# for verbose output +p.verbose_level = 3 +p.frames_per_block = 100 +# set home path +p.io = u.Param() +p.io.home = tmpdir +p.io.autosave = u.Param(active=False) +p.io.autoplot = u.Param(active=False) +p.io.interaction = u.Param() +p.io.interaction.server = u.Param(active=False) + +# max 4000 frames (512x512px) of diffraction data +p.scans = u.Param() +p.scans.i14_3 = u.Param() +# now you have to specify which ScanModel to use with scans.XX.name, +# just as you have to give 'name' for engines and PtyScan subclasses. +p.scans.i14_3.name = 'BlockFull' # or 'Full' +p.scans.i14_3.data = u.Param() +p.scans.i14_3.data.name = 'MoonFlowerScan' +p.scans.i14_3.data.shape = 512 +p.scans.i14_3.data.num_frames = 4000 #50000 is the real value +p.scans.i14_3.data.save = None + +p.scans.i14_3.illumination = u.Param() +p.scans.i14_3.coherence = u.Param(num_probe_modes=10) +p.scans.i14_3.illumination.diversity = u.Param() +p.scans.i14_3.illumination.diversity.noise = (0.5, 1.0) +p.scans.i14_3.illumination.diversity.power = 0.1 + +# position distance in fraction of illumination frame +p.scans.i14_3.data.density = 0.2 +# total number of photons in empty beam +p.scans.i14_3.data.photons = 1e8 +# Gaussian FWHM of possible detector blurring +p.scans.i14_3.data.psf = 0.4 + +# attach a reconstruction engine +p.engines = u.Param() +p.engines.engine00 = u.Param() +p.engines.engine00.name = 'DM_pycuda_stream' +p.engines.engine00.numiter = 100 +p.engines.engine00.numiter_contiguous = 20 +p.engines.engine00.probe_update_start = 1 +p.engines.engine00.probe_update_cuda_atomics = False +p.engines.engine00.object_update_cuda_atomics = True + + +# prepare and run +P = Ptycho(p, level=4) +t1 = time.perf_counter() +P.run() +t2 = time.perf_counter() +P.print_stats() +print('Elapsed Compute Time: {} seconds'.format(t2-t1)) diff --git a/extensions.py b/extensions.py index f0dfc0010..c36483e09 100644 --- a/extensions.py +++ b/extensions.py @@ -1,102 +1,115 @@ ''' -These are the optional extensions for ptypy +Compilation tools for Nvidia builds of extension modules.
''' - - -from distutils.version import LooseVersion -from distutils.extension import Extension import os -import multiprocessing -import subprocess -import re -import numpy as np - +import sysconfig +import pybind11 +from distutils.unixccompiler import UnixCCompiler +from distutils.command.build_ext import build_ext -# this is a hacky version, but is the desired behaviour -class AccelerationExtension(object): - def __init__(self, debug=False): - self.debug = debug - self._options = None - def get_full_options(self): - return self._options +def find_in_path(name, path): + "Find a file in a search path" + # adapted from http://code.activestate.com/recipes/52224-find-a-file-given-a-search-path/ + for dir in path.split(os.pathsep): + binpath = os.path.join(dir, name) + if os.path.exists(binpath): + return os.path.abspath(binpath) + return None - def get_reflection_options(self): - user_options = [] - boolean_options = [] - for name, description in self._options.items(): - if isinstance(description['default'], str): - user_options.append((name+'=', None, description['doc'])) - elif isinstance(description['default'], bool): - user_options.append((name, None, description['doc'])) - boolean_options.append(name) - else: - raise NotImplementedError("Don't know what to do with parameter:%s of type: %s" % (name, type(description['default']))) - return user_options, boolean_options +def locate_cuda(): + """ + Locate the CUDA environment on the system + Returns a dict with keys 'home', 'nvcc', 'include', and 'lib64' + and values giving the absolute path to each directory. + Starts by looking for the CUDAHOME env variable. If not found, everything + is based on finding 'nvcc' in the PATH. + """ + # first check if the CUDAHOME env variable is in use + if 'CUDAHOME' in os.environ: + home = os.environ['CUDAHOME'] + nvcc = os.path.join(home, 'bin', 'nvcc') + else: + # otherwise, search the PATH for NVCC + nvcc = find_in_path('nvcc', os.environ['PATH']) + if nvcc is None: + raise EnvironmentError('The nvcc binary could not be ' + 'located in your $PATH. Either add it to your path, or set $CUDAHOME') + home = os.path.dirname(os.path.dirname(nvcc)) - def build(self, options): - raise NotImplementedError('You need to implement the build method!') + cudaconfig = {'home': home, 'nvcc': nvcc, + 'include': os.path.join(home, 'include'), + 'lib64': os.path.join(home, 'lib64')} + for k, v in cudaconfig.items(): + if not os.path.exists(v): + raise EnvironmentError('The CUDA %s path could not be located in %s' % (k, v)) + return cudaconfig - def getExtension(self): - raise NotImplementedError('You need to return cython extension object.') - - -class CudaExtension(AccelerationExtension): # probably going to inherit from something.
+class NvccCompiler(UnixCCompiler): def __init__(self, *args, **kwargs): - super(CudaExtension, self).__init__(*args, **kwargs) - self._options = {'cudadir': {'default': '', - 'doc': 'CUDA directory'}, - 'cudaflags': {'default': '-gencode arch=compute_35,\\"code=sm_35\\" ' + - '-gencode arch=compute_37,\\"code=sm_37\\" ' + - '-gencode arch=compute_52,\\"code=sm_52\\" ' + - '-gencode arch=compute_60,\\"code=sm_60\\" ' + - '-gencode arch=compute_70,\\"code=sm_70\\" ', - 'doc': 'Flags to the CUDA compiler'}, - 'gputiming': {'default': False, - 'doc': 'Do GPU timing'}} + super(NvccCompiler, self).__init__(*args, **kwargs) + self.CUDA = locate_cuda() + module_dir = os.path.join(__file__.strip('import_fft.py'), 'cuda', 'filtered_fft') + # by default, compile for all of these + archflag = '-gencode=arch=compute_50,code=sm_50' + \ + ' -gencode=arch=compute_52,code=sm_52' + \ + ' -gencode=arch=compute_60,code=sm_60' + \ + ' -gencode=arch=compute_61,code=sm_61' + \ + ' -gencode=arch=compute_70,code=sm_70' + \ + ' -gencode=arch=compute_75,code=sm_75' + \ + ' -gencode=arch=compute_75,code=compute_75' + self.src_extensions.append('.cu') + self.LD_FLAGS = [archflag, "-lcufft_static", "-lculibos", "-ldl", "-lrt", "-lpthread", "-cudart shared"] + self.NVCC_FLAGS = ["-dc", archflag] + self.CXXFLAGS = ['"-fPIC"'] + pybind_includes = [pybind11.get_include(), sysconfig.get_path('include')] + INCLUDES = pybind_includes + [self.CUDA['lib64'], module_dir] + self.INCLUDES = ["-I%s" % ix for ix in INCLUDES] + self.OPTFLAGS = ["-O3", "-std=c++14"] + + def _compile(self, obj, src, ext, cc_args, extra_postargs, pp_opts): + default_compiler_so = self.compiler_so + CPPFLAGS = self.INCLUDES + extra_postargs # little hack here, since postargs usually goes at the end, which we won't do. 
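+ # the include directories and caller-supplied postargs together form the CPPFLAGS handed to nvcc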
+ # makefile line is + # $(NVCC) $(NVCC_FLAGS) $(OPTFLAGS) -Xcompiler "$(CXXFLAGS)" $(CPPFLAGS) + compiler_command = [self.CUDA["nvcc"]] + self.NVCC_FLAGS + self.OPTFLAGS + ["-Xcompiler"] + self.CXXFLAGS + CPPFLAGS + compiler_exec = " ".join(compiler_command) + self.set_executable('compiler_so', compiler_exec) + postargs = [] # we don't actually have any postargs + super(NvccCompiler, self)._compile(obj, src, ext, cc_args, postargs, pp_opts) # the _compile method + # reset the default compiler_so, which we might have changed for cuda + self.compiler_so = default_compiler_so + + def link(self, target_desc, objects, + output_filename, output_dir=None, libraries=None, + library_dirs=None, runtime_library_dirs=None, + export_symbols=None, debug=0, extra_preargs=None, + extra_postargs=None, build_temp=None, target_lang=None): + default_linker_so = self.linker_so + # make file line is + # $(NVCC) $(OPTFLAGS) -shared $(LD_FLAGS) $(OBJ) $(OBJ_MOD) -o $@ + linker_command = [self.CUDA["nvcc"]] + self.OPTFLAGS + ["-shared"] + self.LD_FLAGS + linker_exec = " ".join(linker_command) + self.set_executable('linker_so', linker_exec) + super(NvccCompiler, self).link(target_desc, objects, + output_filename, output_dir=None, libraries=None, + library_dirs=None, runtime_library_dirs=None, + export_symbols=None, debug=0, extra_preargs=None, + extra_postargs=None, build_temp=None, target_lang=None) + self.linker_so = default_linker_so - def build(self, options): - cudadir = options['cudadir'] - cudaflags = options['cudaflags'] - gputiming = options['gputiming'] - try: - out = subprocess.check_output(['cmake', '--version']) - except OSError: - raise RuntimeError( - "CMake must be installed to build the CUDA extensions.") +class CustomBuildExt(build_ext): - cmake_version = LooseVersion(re.search(r'version\s*([\d.]+)', - out.decode()).group(1)) - if cmake_version < '3.8.0': - raise RuntimeError("CMake >= 3.8.0 is required") + def build_extension(self, ext): + has_cu = any([src.endswith('.cu') for src in ext.sources]) + if has_cu: + old_compiler = self.compiler + self.compiler = NvccCompiler(verbose=old_compiler.verbose, + dry_run=old_compiler.dry_run, + force=old_compiler.force) # this is our bespoke compiler + super(CustomBuildExt, self).build_extension(ext) + self.compiler=old_compiler + else: + super(CustomBuildExt, self).build_extension(ext) - srcdir = os.path.abspath('cuda') - buildtmp = os.path.abspath(os.path.join('build', 'cuda')) - cmake_args = [ - "-DCMAKE_BUILD_TYPE=" + ("Debug" if self.debug else "Release"), - '-DCMAKE_CUDA_FLAGS={}'.format(cudaflags), - '-DGPU_TIMING={}'.format("ON" if gputiming else "OFF") - ] - if cudadir: - cmake_args += '-DCMAKE_CUDA_COMPILER="{}/bin/nvcc"'.format(cudadir) - build_args = ["--config", "Debug" if self.debug else "Release", "--", "-j{}".format(multiprocessing.cpu_count() + 1)] - if not os.path.exists(buildtmp): - os.makedirs(buildtmp) - env = os.environ.copy() - subprocess.check_call(['cmake', srcdir] + cmake_args, - cwd=buildtmp, env=env) - subprocess.check_call(['cmake', '--build', '.'] + build_args, - cwd=buildtmp) - print("Complete.") - def getExtension(self): - libdirs = ['build/cuda'] - if 'LD_LIBRARY_PATH' in os.environ: - libdirs += os.environ['LD_LIBRARY_PATH'].split(':') - return Extension('*', - sources=['ptypy/accelerate/cuda/gpu_extension.pyx'], - include_dirs=[np.get_include()], - libraries=['gpu_extension', 'cudart', 'cufft'], - library_dirs=libdirs, - depends=['build/cuda/libgpu_extension.a', ], - language="c++") diff --git 
a/ptypy/accelerate/base/address_manglers.py b/ptypy/accelerate/base/address_manglers.py index c60543cb4..100c4d382 100644 --- a/ptypy/accelerate/base/address_manglers.py +++ b/ptypy/accelerate/base/address_manglers.py @@ -4,52 +4,83 @@ import numpy as np np.random.seed(0) -class RandomIntMangle(object): + +class BaseMangler(object): ''' - assumes integer pixel shift. + Assumes integer pixel shift. ''' - def __init__(self, max_step_per_shift, start, stop, max_bound=None, randomseed=None): + def __init__(self, max_step_per_shift, start, stop, nshifts, max_bound=None, randomseed=None): # can be initialised in the engine.init self.max_bound = max_bound # maximum distance from the starting positions - self.max_step = lambda it: (max_step_per_shift * (stop - it) / (stop - start)) # maximum step per iteration, decreases with progression - self.call_no = 0 + self.max_step = lambda it: np.ceil(max_step_per_shift * (stop - it) / (stop - start)) # maximum step per iteration, decreases with progression + self.nshifts = nshifts + self.delta = 0 - def mangle_address(self, addr_current, addr_original, iteration): + def get_address(self, index, addr_current, mangled_addr, max_oby, max_obx): ''' - Takes the current address book and adds an offset to it according to the parameters + Mangles the address given a delta shift ''' - mangled_addr = np.zeros_like(addr_current) - mangled_addr[:] = addr_current # make a copy - max_step = self.max_step(iteration) - deltas = np.random.randint(0, max_step + 1, (addr_current.shape[0], 2)) - # the following improves things a lot! - deltas[:, 0] = (-1)**self.call_no - deltas[:, 1] = (-1)**(self.call_no//2) - self.call_no += 1 - - # deltas = np.zeros((addr_current.shape[0], 2)) # for testing old_positions = np.zeros((addr_current.shape[0], 2)) old_positions[:] = addr_current[:, 0, 1, 1:] new_positions = np.zeros((addr_current.shape[0],2)) - # new_positions[1:] = old_positions[1:] + deltas[1:] # first mode is same as all of them. - new_positions[:] = old_positions + deltas # first mode is same as all of them. - self.apply_bounding_box(new_positions, old_positions, addr_original) + new_positions[:] = old_positions + self.delta[index] # first mode is same as all of them.
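+ # self.delta has shape (nshifts, nframes, 2): one (dy, dx) trial shift per frame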
# now update the main matrix (Same for all modes) - for idx in range(addr_original.shape[1]): + for idx in range(addr_current.shape[1]): mangled_addr[:, idx, 1, 1:] = new_positions - return mangled_addr + self.apply_bounding_box(mangled_addr[:,:,1,1], 0, max_oby) + self.apply_bounding_box(mangled_addr[:,:,1,2], 0, max_obx) + + def apply_bounding_box(self, addr, min, max): + ''' + Clip the mangled addresses into the valid bounds + ''' + addr[addr<min] = min + addr[addr>max] = max + + def setup_shifts(self, current_iteration, nframes=1): + ''' + Arrange an array of shifts + ''' + raise NotImplementedError("This method needs to be overridden in order to correct positions") + + +class RandomIntMangler(BaseMangler): - def apply_bounding_box(self, new_positions, old_positions, addr_original): + def __init__(self, *args, **kwargs): + super(RandomIntMangler, self).__init__(*args, **kwargs) + + def setup_shifts(self, current_iteration, nframes=1): + ''' + Calculates random integer shifts + ''' + max_step = self.max_step(current_iteration) + self.delta = np.random.randint(0, max_step + 1, (self.nshifts, nframes, 2)) + for index in range(self.nshifts): + self.delta[index, :, 0] *= (-1)**index + self.delta[index, :, 1] *= (-1)**(index//2) + # check if the shifts are within the maximum bound + norms = np.linalg.norm(self.delta, axis=-1) + self.delta[norms > self.max_bound] = 0 + +class GridSearchMangler(BaseMangler): + def __init__(self, *args, **kwargs): + super(GridSearchMangler, self).__init__(*args, **kwargs) + + def setup_shifts(self, current_iteration, nframes=1): ''' - Checks if the new co-ordinates lie within the bounding box. If not, we undo this move. + Calculates integer shifts on a grid ''' + max_step = self.max_step(current_iteration) + delta = np.mgrid[-max_step:max_step+1:1, + -max_step:max_step+1:1] + within_bound = (delta[0]**2 + delta[1]**2) < (self.max_bound**2) + self.delta = np.tile(delta[:,within_bound].T.reshape(within_bound.sum(),1,2), (1,nframes,1)) + self.nshifts = self.delta.shape[0] + + + + + - distances_from_original = new_positions - addr_original[:, 0, 1, 1:] - # logger.warning("distance from original is %s" % repr(distances_from_original)) - norms = np.linalg.norm(distances_from_original, axis=-1) - for i in range(len(new_positions)): - if norms[i]> self.max_bound: - new_positions[i] = old_positions[i] - # new_positions[norms>self.max_bound] = old_positions[norms>self.max_bound] # make sure we aren't outside the bounding box -# \ No newline at end of file diff --git a/ptypy/accelerate/base/array_utils.py b/ptypy/accelerate/base/array_utils.py index c2d341711..839b08e70 100644 --- a/ptypy/accelerate/base/array_utils.py +++ b/ptypy/accelerate/base/array_utils.py @@ -17,6 +17,12 @@ def dot(A, B, acc_dtype=np.float64): return r def norm2(A): return dot(A, A) +def max_abs2(A): + ''' + A has ndim = 3. + compute abs2, sum along first dimension and take maximum along last two dims + ''' + return np.max(np.sum(np.abs(A)**2,axis=0),axis=(-2,-1)) def abs2(input): ''' @@ -26,6 +32,7 @@ def abs2(input): ''' return np.multiply(input, input.conj()).real + def sum_to_buffer(in1, outshape, in1_addr, out1_addr, dtype): ''' :param in1. An array . Can be inplace. Can be complex or real. @@ -40,6 +47,7 @@ def sum_to_buffer(in1, outshape, in1_addr, out1_addr, dtype): out1[o1[0], o1[1]:(o1[1] + inshape[1]), o1[2]:(o1[2] + inshape[2])] += in1[i1[0]] return out1 + def norm2(input): ''' Input here could be a variety of 1D, 2D, 3D complex or real.
all will be single precision at least. @@ -47,17 +55,20 @@ ''' return np.sum(abs2(input)) + def complex_gaussian_filter(input, mfs): ''' takes 2D and 3D arrays. Complex input, complex output. mfs has len 0<len(mfs)<=2 ''' - if len(mfs)>2: + if len(mfs) > 2: raise NotImplementedError("Only batches of 2D arrays allowed!") if input.ndim == 3: mfs = np.insert(mfs, 0, 0) - return (ndi.gaussian_filter(np.real(input), mfs) +1j *ndi.gaussian_filter(np.imag(input), mfs)).astype(input.dtype) + return (ndi.gaussian_filter(np.real(input), mfs) + 1j * ndi.gaussian_filter(np.imag(input), mfs)).astype( + input.dtype) + def mass_center(A): ''' @@ -65,6 +76,7 @@ def mass_center(A): ''' return np.array(ndi.measurements.center_of_mass(A), dtype=A.dtype) + def interpolated_shift(c, shift, do_linear=False): ''' complex bicubic interpolated shift. @@ -72,9 +84,13 @@ def interpolated_shift(c, shift, do_linear=False): ''' if not do_linear: - return ndi.interpolation.shift(np.real(c), shift, order=3, prefilter=True) + 1j*ndi.interpolation.shift(np.imag(c), shift, order=3, prefilter=True) + return ndi.interpolation.shift(np.real(c), shift, order=3, prefilter=True) + 1j * ndi.interpolation.shift( + np.imag(c), shift, order=3, prefilter=True) else: - return ndi.interpolation.shift(np.real(c), shift, order=1, mode='constant', cval=0, prefilter=False) + 1j * ndi.interpolation.shift(np.imag(c), shift, order=1, mode='constant', cval=0, prefilter=False) + return ndi.interpolation.shift(np.real(c), shift, order=1, mode='constant', cval=0, + prefilter=False) + 1j * ndi.interpolation.shift(np.imag(c), shift, order=1, + mode='constant', cval=0, + prefilter=False) def clip_complex_magnitudes_to_range(complex_input, clip_min, clip_max): @@ -84,4 +100,51 @@ def clip_complex_magnitudes_to_range(complex_input, clip_min, clip_max): ampl = np.abs(complex_input) phase = np.exp(1j * np.angle(complex_input)) ampl = np.clip(ampl, clip_min, clip_max) - complex_input[:] = ampl * phase \ No newline at end of file + complex_input[:] = ampl * phase + + +def fill3D(A, B, offset=[0, 0, 0]): + """ + Fill 3-dimensional array A with B. + """ + if A.ndim < 3 or B.ndim < 3: + raise ValueError('Input arrays must each be at least 3D') + assert A.ndim == B.ndim, "Input and Output must have the same number of dimensions." + ash = A.shape + bsh = B.shape + misfit = np.array(bsh) - np.array(ash) + assert not misfit[:-3].any(), "Input and Output must have the same shape everywhere but the last three axes." + + Alim = np.array(A.shape[-3:]) + Blim = np.array(B.shape[-3:]) + off = np.array(offset) + Ao = off.copy() + Ao[Ao < 0] = 0 + Bo = -off.copy() + Bo[Bo < 0] = 0 + assert (Bo < Blim).all() and (Ao < Alim).all(), "At least one dimension lacks overlap" + A[..., Ao[0]:min(off[0] + Blim[0], Alim[0]), + Ao[1]:min(off[1] + Blim[1], Alim[1]), + Ao[2]:min(off[2] + Blim[2], Alim[2])] \ + = B[..., Bo[0]:min(Alim[0] - off[0], Blim[0]), + Bo[1]:min(Alim[1] - off[1], Blim[1]), + Bo[2]:min(Alim[2] - off[2], Blim[2])] + + +def crop_pad_2d_simple(A, B): + """ + Places B in A centered around the last two axes. A and B must be of the same shape + everywhere but the last two dims. + """ + assert A.ndim >= 2, "Arrays must have at least 2 dimensions." + assert A.ndim == B.ndim, "Input and Output must have the same number of dimensions." + misfit = np.array(A.shape) - np.array(B.shape) + assert not misfit[:-2].any(), "Input and Output must have the same shape everywhere but the last two axes."
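+ # promote plain 2D inputs to 3D so fill3D can treat the last two axes uniformly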
+ if A.ndim == 2: + A = A.reshape((1,) + A.shape) + if B.ndim == 2: + B = B.reshape((1,) + B.shape) + a1, a2 = A.shape[-2:] + b1, b2 = B.shape[-2:] + offset = [0, a1 // 2 - b1 // 2, a2 // 2 - b2 // 2] + fill3D(A, B, offset) diff --git a/ptypy/accelerate/base/engines/DM_serial.py b/ptypy/accelerate/base/engines/DM_serial.py index 7352a0fce..44573bf56 100644 --- a/ptypy/accelerate/base/engines/DM_serial.py +++ b/ptypy/accelerate/base/engines/DM_serial.py @@ -7,10 +7,6 @@ :copyright: Copyright 2014 by the PTYPY team, see AUTHORS. :license: GPLv2, see LICENSE for details. """ - -# from .. import core -from __future__ import division - import numpy as np import time @@ -19,7 +15,6 @@ from ptypy.utils import parallel from ptypy.engines import BaseEngine, register, DM from ptypy.accelerate.base.kernels import FourierUpdateKernel, AuxiliaryWaveKernel, PoUpdateKernel, PositionCorrectionKernel -from ptypy.accelerate.base import address_manglers from ptypy.accelerate.base import array_utils as au @@ -32,8 +27,6 @@ # - Propagator needs to be reconfigurable for a certain batch size, gpyfft hates that. # - Fourier_update_kernel needs to allow batched execution -## for debugging -#from matplotlib import pyplot as plt __all__ = ['DM_serial'] @@ -200,17 +193,8 @@ def _setup_kernels(self): kern.resolution = geo.resolution[0] if self.do_position_refinement: - addr_mangler = address_manglers.RandomIntMangle(int(self.p.position_refinement.amplitude // geo.resolution[0]), - self.p.position_refinement.start, - self.p.position_refinement.stop, - max_bound=int(self.p.position_refinement.max_shift // geo.resolution[0]), - randomseed=0) - logger.warning("amplitude is %s " % (self.p.position_refinement.amplitude // geo.resolution[0])) - logger.warning("max bound is %s " % (self.p.position_refinement.max_shift // geo.resolution[0])) - - kern.PCK = PositionCorrectionKernel(aux, nmodes) + kern.PCK = PositionCorrectionKernel(aux, nmodes, self.p.position_refinement, geo.resolution) kern.PCK.allocate() - kern.PCK.address_mangler = addr_mangler def engine_prepare(self): @@ -333,7 +317,7 @@ def engine_iterate(self, num=1): ## build exit wave t1 = time.time() - AWK.build_exit(aux, addr, ob, pr, ex) + AWK.build_exit(aux, addr, ob, pr, ex, alpha=self.p.alpha) FUK.exit_error(aux,addr) FUK.error_reduce(addr, err_exit) self.benchmark.E_Build_exit += time.time() - t1 @@ -350,7 +334,7 @@ def engine_iterate(self, num=1): self.overlap_update(MPI=True) parallel.barrier() - if self.do_position_refinement and (self.curiter): + if self.do_position_refinement: do_update_pos = (self.p.position_refinement.stop > self.curiter >= self.p.position_refinement.start) do_update_pos &= (self.curiter % self.p.position_refinement.interval) == 0 @@ -370,7 +354,8 @@ def engine_iterate(self, num=1): kern = self.kernels[prep.label] aux = kern.aux addr = prep.addr - original_addr = prep.original_addr # use this instead of the one in the address mangler. 
+ original_addr = prep.original_addr + mangled_addr = addr.copy() mag = prep.mag ma_sum = prep.ma_sum err_fourier = prep.err_fourier @@ -378,16 +363,34 @@ def engine_iterate(self, num=1): PCK = kern.PCK FW = kern.FW + # Keep track of object boundaries + max_oby = ob.shape[-2] - aux.shape[-2] - 1 + max_obx = ob.shape[-1] - aux.shape[-1] - 1 + + # We need to re-calculate the current error + PCK.build_aux(aux, addr, ob, pr) + aux[:] = FW(aux) + if self.p.position_refinement.metric == "fourier": + PCK.fourier_error(aux, addr, mag, ma, ma_sum) + PCK.error_reduce(addr, err_fourier) + if self.p.position_refinement.metric == "photon": + PCK.log_likelihood(aux, addr, mag, ma, err_fourier) error_state = np.zeros_like(err_fourier) error_state[:] = err_fourier + PCK.mangler.setup_shifts(self.curiter, nframes=addr.shape[0]) + log(4, 'Position refinement trial: iteration %s' % (self.curiter)) - for i in range(self.p.position_refinement.nshifts): - mangled_addr = PCK.address_mangler.mangle_address(addr, original_addr, self.curiter) + for i in range(PCK.mangler.nshifts): + PCK.mangler.get_address(i, addr, mangled_addr, max_oby, max_obx) PCK.build_aux(aux, mangled_addr, ob, pr) aux[:] = FW(aux) - PCK.fourier_error(aux, mangled_addr, mag, ma, ma_sum) - PCK.error_reduce(mangled_addr, err_fourier) + if self.p.position_refinement.metric == "fourier": + PCK.fourier_error(aux, mangled_addr, mag, ma, ma_sum) + PCK.error_reduce(mangled_addr, err_fourier) + if self.p.position_refinement.metric == "photon": + PCK.log_likelihood(aux, mangled_addr, mag, ma, err_fourier) PCK.update_addr_and_error_state(addr, error_state, mangled_addr, err_fourier) + prep.err_fourier = error_state prep.addr = addr @@ -418,8 +421,6 @@ def overlap_update(self, MPI=True): # Update probe log(4, prestr + '----- probe update -----', True) change = self.probe_update(MPI=(parallel.size > 1 and MPI)) - # change = self.probe_update(MPI=(parallel.size>1 and MPI)) - log(4, prestr + 'change in probe is %.3f' % change, True) # stop iteration if probe change is small @@ -434,7 +435,7 @@ def object_update(self, MPI=False): cfact = self.p.object_inertia * self.mean_power if self.p.obj_smooth_std is not None: - logger.info('Smoothing object, cfact is %.2f' % cfact) + log(4, 'Smoothing object, cfact is %.2f' % cfact) smooth_mfs = [self.p.obj_smooth_std, self.p.obj_smooth_std] ob.data = cfact * au.complex_gaussian_filter(ob.data, smooth_mfs) else: @@ -464,19 +465,19 @@ def object_update(self, MPI=False): parallel.allreduce(ob.data) parallel.allreduce(obn.data) ob.data /= obn.data - - # Clip object (This call takes like one ms. Not time critical) - if self.p.clip_object is not None: - clip_min, clip_max = self.p.clip_object - ampl_obj = np.abs(ob.data) - phase_obj = np.exp(1j * np.angle(ob.data)) - too_high = (ampl_obj > clip_max) - too_low = (ampl_obj < clip_min) - ob.data[too_high] = clip_max * phase_obj[too_high] - ob.data[too_low] = clip_min * phase_obj[too_low] else: ob.data /= obn.data + # Clip object (This call takes like one ms. 
Not time critical) + if self.p.clip_object is not None: + clip_min, clip_max = self.p.clip_object + ampl_obj = np.abs(ob.data) + phase_obj = np.exp(1j * np.angle(ob.data)) + too_high = (ampl_obj > clip_max) + too_low = (ampl_obj < clip_min) + ob.data[too_high] = clip_max * phase_obj[too_high] + ob.data[too_low] = clip_min * phase_obj[too_low] + self.benchmark.object_update += time.time() - t1 self.benchmark.calls_object += 1 @@ -533,11 +534,11 @@ def probe_update(self, MPI=False): return np.sqrt(change) - def engine_finalize(self): + def engine_finalize(self, benchmark=True): """ try deleting ever helper contianer """ - if parallel.master: + if parallel.master and benchmark: print("----- BENCHMARKS ----") acc = 0. for name in sorted(self.benchmark.keys()): @@ -563,7 +564,7 @@ def engine_finalize(self): res = self.kernels[prep.label].resolution for i,view in enumerate(d.views): for j,(pname, pod) in enumerate(view.pods.items()): - delta = (prep.original_addr[i][j][1][1:] - prep.addr[i][j][1][1:]) * res + delta = (prep.addr[i][j][1][1:] - prep.original_addr[i][j][1][1:]) * res pod.ob_view.coord += delta pod.ob_view.storage.update_views(pod.ob_view) diff --git a/ptypy/accelerate/base/engines/DM_serial_stream.py b/ptypy/accelerate/base/engines/DM_serial_stream.py index e3eadc085..2c65511dc 100644 --- a/ptypy/accelerate/base/engines/DM_serial_stream.py +++ b/ptypy/accelerate/base/engines/DM_serial_stream.py @@ -29,8 +29,6 @@ # - Propagator needs to be reconfigurable for a certain batch size, gpyfft hates that. # - Fourier_update_kernel needs to allow batched execution -## for debugging -#from matplotlib import pyplot as plt __all__ = ['DM_serial_stream'] @@ -139,7 +137,7 @@ def engine_iterate(self, num=1): ## apply changes #2 t1 = time.time() - AWK.build_exit(aux, addr, ob, pr, ex) + AWK.build_exit(aux, addr, ob, pr, ex, alpha=self.p.alpha) self.benchmark.E_Build_exit += time.time() - t1 err_phot = np.zeros_like(err_fourier) diff --git a/ptypy/accelerate/base/engines/DR_serial.py b/ptypy/accelerate/base/engines/DR_serial.py new file mode 100644 index 000000000..b13828919 --- /dev/null +++ b/ptypy/accelerate/base/engines/DR_serial.py @@ -0,0 +1,424 @@ +# -*- coding: utf-8 -*- +""" +Local Difference Map/Alternate Projections reconstruction engine. + +This file is part of the PTYPY package. + + :copyright: Copyright 2014 by the PTYPY team, see AUTHORS. + :license: GPLv2, see LICENSE for details. +""" +import numpy as np +import time + +from ptypy import utils as u +from ptypy.utils.verbose import logger, log +from ptypy.utils import parallel +from ptypy import defaults_tree +from ptypy.engines import register +from ptypy.engines.base import PositionCorrectionEngine +from ptypy.core.manager import Full, Vanilla, Bragg3dModel, BlockVanilla, BlockFull +from ptypy.accelerate.base.engines import DM_serial +from ptypy.accelerate.base.kernels import FourierUpdateKernel, AuxiliaryWaveKernel, PoUpdateKernel, PositionCorrectionKernel +from ptypy.accelerate.base import address_manglers +from ptypy.accelerate.base import array_utils as au + + +__all__ = ['DR_serial'] + +@register() +class DR_serial(PositionCorrectionEngine): + """ + An implementation of the Douglas-Rachford algorithm + that can be operated like the ePIE algorithm. + + Defaults: + + [name] + default = DR_serial + type = str + help = + doc = + + [alpha] + default = 1 + type = float + lowlim = 0.0 + help = Tuning parameter, a value of 0 makes it equal to ePIE. 
+ + [tau] + default = 1 + type = float + lowlim = 0.0 + help = fourier update parameter, a value of 0 means no fourier update. + + [probe_inertia] + default = 1e-9 + type = float + lowlim = 0.0 + help = Weight of the current probe estimate in the update + + [object_inertia] + default = 1e-4 + type = float + lowlim = 0.0 + help = Weight of the current object in the update + + [clip_object] + default = None + type = tuple + help = Clip object amplitude into this interval + + [rescale_probe] + default = True + type = bool + lowlim = 0 + help = Normalise probe power according to data + + [compute_log_likelihood] + default = True + type = bool + help = A switch for computing the log-likelihood error (this can impact the performance of the engine) + + [compute_exit_error] + default = False + type = bool + help = A switch for computing the exitwave error (this can impact the performance of the engine) + + [compute_fourier_error] + default = False + type = bool + help = A switch for computing the fourier error (this can impact the performance of the engine) + + """ + + SUPPORTED_MODELS = [Full, Vanilla, Bragg3dModel, BlockVanilla, BlockFull] + + def __init__(self, ptycho_parent, pars=None): + """ + Local difference map reconstruction engine. + """ + super(DR_serial, self).__init__(ptycho_parent, pars) + + # Instance attributes + self.error = None + self.mean_power = None + + # keep track of timings + self.benchmark = u.Param() + + # Stores all information needed with respect to the diffraction storages. + self.diff_info = {} + self.ob_cfact = {} + self.pr_cfact = {} + self.kernels = {} + + self.ptycho.citations.add_article( + title='Semi-implicit relaxed Douglas-Rachford algorithm (sDR) for ptychography', + author='Pham et al.', + journal='Opt. Express', + volume=27, + year=2019, + page=31246, + doi='10.1364/OE.27.031246', + comment='The local douglas-rachford reconstruction algorithm', + ) + + def engine_initialize(self): + """ + Prepare for reconstruction. + """ + super(DR_serial, self).engine_initialize() + + self.error = [] + self._reset_benchmarks() + self._setup_kernels() + + def _reset_benchmarks(self): + self.benchmark.A_Build_aux = 0. + self.benchmark.B_Prop = 0. + self.benchmark.C_Fourier_update = 0. + self.benchmark.D_iProp = 0. + self.benchmark.E_Build_exit = 0. + self.benchmark.F_LLerror = 0. + self.benchmark.probe_update = 0. + self.benchmark.object_update = 0. + self.benchmark.calls_fourier = 0 + self.benchmark.calls_object = 0 + self.benchmark.calls_probe = 0 + + def _setup_kernels(self): + """ + Setup kernels, one for each scan. Derive scans from ptycho class + """ + # get the scans + for label, scan in self.ptycho.model.scans.items(): + + kern = u.Param() + self.kernels[label] = kern + + # TODO: needs to be adapted for broad bandwidth + geo = scan.geometries[0] + + # Get info to shape buffer arrays + # TODO: make this part of the engine rather than scan + fpc = self.ptycho.frames_per_block + + # TODO : make this more foolproof + try: + nmodes = scan.p.coherence.num_probe_modes * \ + scan.p.coherence.num_object_modes + except: + nmodes = 1 + + # create buffer arrays + ash = (1 * nmodes,) + tuple(geo.shape) + aux = np.zeros(ash, dtype=np.complex64) + kern.aux = aux + + # setup kernels, one for each SCAN. 
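+ # FUK applies the Fourier-magnitude update, POK the probe/object updates, AWK builds the auxiliary wave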
+ kern.FUK = FourierUpdateKernel(aux, nmodes) + kern.FUK.allocate() + + kern.POK = PoUpdateKernel() + kern.POK.allocate() + + kern.AWK = AuxiliaryWaveKernel() + kern.AWK.allocate() + + kern.FW = geo.propagator.fw + kern.BW = geo.propagator.bw + kern.resolution = geo.resolution[0] + + if self.do_position_refinement: + addr_mangler = address_manglers.RandomIntMangle(int(self.p.position_refinement.amplitude // geo.resolution[0]), + self.p.position_refinement.start, + self.p.position_refinement.stop, + max_bound=int(self.p.position_refinement.max_shift // geo.resolution[0]), + randomseed=0) + logger.warning("amplitude is %s " % (self.p.position_refinement.amplitude // geo.resolution[0])) + logger.warning("max bound is %s " % (self.p.position_refinement.max_shift // geo.resolution[0])) + + kern.PCK = PositionCorrectionKernel(aux, nmodes) + kern.PCK.allocate() + kern.PCK.address_mangler = addr_mangler + + def engine_prepare(self): + + """ + Last minute initialization. + + Everything that needs to be recalculated when new data arrives. + """ + if self.ptycho.new_data: + + # recalculate everything + mean_power = 0. + for s in self.di.storages.values(): + mean_power += s.mean_power + self.mean_power = mean_power / len(self.di.storages) + + ## Serialize new data ## + for label, d in self.ptycho.new_data: + prep = u.Param() + prep.label = label + self.diff_info[d.ID] = prep + prep.mag = np.sqrt(np.abs(d.data)) + prep.ma = self.ma.S[d.ID].data.astype(np.float32) + prep.ma_sum = prep.ma.sum(-1).sum(-1) + prep.err_phot = np.zeros_like(prep.ma_sum) + prep.err_fourier = np.zeros_like(prep.ma_sum) + prep.err_exit = np.zeros_like(prep.ma_sum) + + # Unfortunately this needs to be done for all pods, since + # the shape of the probe / object was modified. + # TODO: possible scaling issue, remove the need for padding + for label, d in self.di.storages.items(): + prep = self.diff_info[d.ID] + prep.view_IDs, prep.poe_IDs, prep.addr = DM_serial.serialize_array_access(d) + if self.do_position_refinement: + prep.original_addr = np.zeros_like(prep.addr) + prep.original_addr[:] = prep.addr + pID, oID, eID = prep.poe_IDs + + ob = self.ob.S[oID] + misfit = np.asarray(ob.shape[-2:]) % 32 + if (misfit != 0).any(): + pad = 32 - np.asarray(ob.shape[-2:]) % 32 + ob.data = u.crop_pad(ob.data, [[0, pad[0]], [0, pad[1]]], axes=[-2, -1], filltype='project') + ob.shape = ob.data.shape + + # Keep a list of view indices + prep.rng = np.random.default_rng() + prep.vieworder = np.arange(prep.addr.shape[0]) + + # Modify addresses, copy pa into ea and remove da/ma + prep.addr_ex = np.vstack([prep.addr[:,0,2,0], prep.addr[:,-1,2,0]+1]).T + prep.addr[:,:,2] = prep.addr[:,:,0] + prep.addr[:,:,3:,0] = 0 + + # Reference to ex + prep.ex = self.ex.S[eID].data + + # calculate c_facts + #cfact = self.p.object_inertia * self.mean_power + #self.ob_cfact[oID] = cfact / u.parallel.size + + #pr = self.pr.S[pID] + #cfact = self.p.probe_inertia * len(pr.views) / pr.data.shape[0] + #self.pr_cfact[pID] = cfact / u.parallel.size + + + def engine_iterate(self, num=1): + """ + Compute one iteration. 
+ """ + for it in range(num): + + error_dct = {} + + for dID in self.di.S.keys(): + + # find probe, object and exit ID in dependence of dID + prep = self.diff_info[dID] + pID, oID, eID = prep.poe_IDs + + # references for kernels + kern = self.kernels[prep.label] + FUK = kern.FUK + AWK = kern.AWK + POK = kern.POK + FW = kern.FW + BW = kern.BW + + # global aux buffer + aux = kern.aux + + # references for ob, pr + ob = self.ob.S[oID].data + pr = self.pr.S[pID].data + + # shuffle view order + vieworder = prep.vieworder + prep.rng.shuffle(vieworder) + + # Iterate through views + for i in vieworder: + + # Get local adress and arrays + addr = prep.addr[i,None] + ex_from, ex_to = prep.addr_ex[i] + ex = prep.ex[ex_from:ex_to] + mag = prep.mag[i,None] + ma = prep.ma[i,None] + ma_sum = prep.ma_sum[i,None] + err_phot = prep.err_phot[i,None] + err_fourier = prep.err_fourier[i,None] + err_exit = prep.err_exit[i,None] + + ## build auxilliary wave + t1 = time.time() + AWK.build_aux(aux, addr, ob, pr, ex, alpha=self.p.alpha) + self.benchmark.A_Build_aux += time.time() - t1 + + ## forward FFT + t1 = time.time() + aux[:] = FW(aux) + self.benchmark.B_Prop += time.time() - t1 + + ## Deviation from measured data + t1 = time.time() + if self.p.compute_fourier_error: + FUK.fourier_error(aux, addr, mag, ma, ma_sum) + FUK.error_reduce(addr, err_fourier) + else: + FUK.fourier_deviation(aux, addr, mag) + FUK.fmag_update_nopbound(aux, addr, mag, ma) + self.benchmark.C_Fourier_update += time.time() - t1 + + ## backward FFT + t1 = time.time() + aux[:] = BW(aux) + self.benchmark.D_iProp += time.time() - t1 + + ## build exit wave + t1 = time.time() + AWK.build_exit_alpha_tau(aux, addr, ob, pr, ex, alpha=self.p.alpha, tau=self.p.tau) + if self.p.compute_exit_error: + FUK.exit_error(aux,addr) + FUK.error_reduce(addr, err_exit) + self.benchmark.E_Build_exit += time.time() - t1 + self.benchmark.calls_fourier += 1 + + ## probe/object rescale + #if self.p.rescale_probe: + # pr *= np.sqrt(self.mean_power / (np.abs(pr)**2).mean()) + + ## build auxilliary wave (ob * pr product) + t1 = time.time() + AWK.build_aux_no_ex(aux, addr, ob, pr) + self.benchmark.A_Build_aux += time.time() - t1 + + # object update + t1 = time.time() + POK.ob_update_local(addr, ob, pr, ex, aux) + self.benchmark.object_update += time.time() - t1 + self.benchmark.calls_object += 1 + + # probe update + t1 = time.time() + POK.pr_update_local(addr, pr, ob, ex, aux) + self.benchmark.probe_update += time.time() - t1 + self.benchmark.calls_probe += 1 + + ## compute log-likelihood + if self.p.compute_log_likelihood: + t1 = time.time() + aux[:] = FW(aux) + FUK.log_likelihood(aux, addr, mag, ma, err_phot) + self.benchmark.F_LLerror += time.time() - t1 + + # update errors + errs = np.ascontiguousarray(np.vstack([np.hstack(prep.err_fourier), + np.hstack(prep.err_phot), + np.hstack(prep.err_exit)]).T) + error_dct.update(zip(prep.view_IDs, errs)) + + self.curiter += 1 + + error = parallel.gather_dict(error_dct) + return error + + + def engine_finalize(self): + """ + try deleting ever helper contianer + """ + if parallel.master and self.benchmark.calls_fourier: + print("----- BENCHMARKS ----") + acc = 0. + for name in sorted(self.benchmark.keys()): + t = self.benchmark[name] + if name[0] in 'ABCDEFGHI': + print('%20s : %1.3f ms per iteration' % (name, t / self.benchmark.calls_fourier * 1000)) + acc += t + elif str(name) == 'probe_update': + print('%20s : %1.3f ms per call. 
%d calls' % ( + name, t / self.benchmark.calls_probe * 1000, self.benchmark.calls_probe)) + elif str(name) == 'object_update': + print('%20s : %1.3f ms per call. %d calls' % ( + name, t / self.benchmark.calls_object * 1000, self.benchmark.calls_object)) + + print('%20s : %1.3f ms per iteration. %d calls' % ( + 'Fourier_total', acc / self.benchmark.calls_fourier * 1000, self.benchmark.calls_fourier)) + + self._reset_benchmarks() + + if self.do_position_refinement: + for label, d in self.di.storages.items(): + prep = self.diff_info[d.ID] + res = self.kernels[prep.label].resolution + for i,view in enumerate(d.views): + for j,(pname, pod) in enumerate(view.pods.items()): + delta = (prep.original_addr[i][j][1][1:] - prep.addr[i][j][1][1:]) * res + pod.ob_view.coord += delta + pod.ob_view.storage.update_views(pod.ob_view) diff --git a/ptypy/accelerate/base/engines/ML_serial.py b/ptypy/accelerate/base/engines/ML_serial.py index 04bfd58ba..214aa0536 100644 --- a/ptypy/accelerate/base/engines/ML_serial.py +++ b/ptypy/accelerate/base/engines/ML_serial.py @@ -17,16 +17,15 @@ from ptypy.engines.ML import ML, BaseModel from .DM_serial import serialize_array_access from ptypy import utils as u -from ptypy.utils.verbose import logger +from ptypy.utils.verbose import logger, log from ptypy.utils import parallel from ptypy.engines.utils import Cnorm2, Cdot from ptypy.engines import register -from ptypy.accelerate.base.kernels import GradientDescentKernel, AuxiliaryWaveKernel, PoUpdateKernel, \ - PositionCorrectionKernel +from ptypy.accelerate.base.kernels import GradientDescentKernel, AuxiliaryWaveKernel, PoUpdateKernel from ptypy.accelerate.base import address_manglers -__all__ = ['ML_serial'] +__all__ = ['ML_serial'] @register() class ML_serial(ML): @@ -104,20 +103,6 @@ def _setup_kernels(self): kern.FW = geo.propagator.fw kern.BW = geo.propagator.bw - if self.do_position_refinement: - addr_mangler = address_manglers.RandomIntMangle( - int(self.p.position_refinement.amplitude // geo.resolution[0]), - self.p.position_refinement.start, - self.p.position_refinement.stop, - max_bound=int(self.p.position_refinement.max_shift // geo.resolution[0]), - randomseed=0) - logger.warning("amplitude is %s " % (self.p.position_refinement.amplitude // geo.resolution[0])) - logger.warning("max bound is %s " % (self.p.position_refinement.max_shift // geo.resolution[0])) - - kern.PCK = PositionCorrectionKernel(aux, nmodes) - kern.PCK.allocate() - kern.PCK.address_mangler = addr_mangler - def engine_prepare(self): ## Serialize new data ## @@ -137,9 +122,6 @@ def engine_prepare(self): for label, d in self.di.storages.items(): prep = self.diff_info[d.ID] prep.view_IDs, prep.poe_IDs, prep.addr = serialize_array_access(d) - if self.do_position_refinement: - prep.original_addr = np.zeros_like(prep.addr) - prep.original_addr[:] = prep.addr self.ML_model.prepare() @@ -194,7 +176,6 @@ def engine_iterate(self, num=1): # probe/object rescaling if self.p.scale_precond: - cn2_new_pr_grad = cn2_new_pr_grad if cn2_new_pr_grad > 1e-5: scale_p_o = (self.p.scale_probe_object * cn2_new_ob_grad / cn2_new_pr_grad) @@ -362,10 +343,8 @@ def new_grad(self): aux[:] = FW(aux) GDK.make_model(aux, addr) - if self.p.floating_intensities: GDK.floating_intensity(addr, w, I, fic) - GDK.main(aux, addr, w, I) GDK.error_reduce(addr, err_phot) @@ -448,7 +427,6 @@ def poly_line_coeffs(self, c_ob_h, c_pr_h): b[:] = FW(b) GDK.make_a012(f, a, b, addr, I, fic) - GDK.fill_b(addr, Brenorm, w, B) parallel.allreduce(B) diff --git a/ptypy/accelerate/base/kernels.py 
b/ptypy/accelerate/base/kernels.py index 6e62fccdf..b1f109444 100644 --- a/ptypy/accelerate/base/kernels.py +++ b/ptypy/accelerate/base/kernels.py @@ -1,5 +1,6 @@ import numpy as np from ptypy.utils.verbose import logger, log +from .array_utils import max_abs2 class Adict(object): @@ -73,6 +74,28 @@ def fourier_error(self, b_aux, addr, mag, mask, mask_sum): ferr[:] = mask * np.abs(fdev) ** 2 / mask_sum.reshape((maxz, 1, 1)) return + def fourier_deviation(self, b_aux, addr, mag): + # reference shape (write-to shape) + sh = self.fshape + # stopper + maxz = mag.shape[0] + + # batch buffers + fdev = self.npy.fdev[:maxz] + aux = b_aux[:maxz * self.nmodes] + + ## Actual math ## + + # build model from complex fourier magnitudes, summing up + # all modes incoherently + tf = aux.reshape(maxz, self.nmodes, sh[1], sh[2]) + af = np.sqrt((np.abs(tf) ** 2).sum(1)) + + # calculate difference to real data (g_mag) + fdev[:] = af - mag + + return + def error_reduce(self, addr, err_sum): # reference shape (write-to shape) sh = self.fshape @@ -133,6 +156,33 @@ def fmag_all_update(self, b_aux, addr, mag, mask, err_sum, pbound=0.0): aux[:] = (aux.reshape(ish[0] // nmodes, nmodes, ish[1], ish[2]) * fm[:, np.newaxis, :, :]).reshape(ish) return + def fmag_update_nopbound(self, b_aux, addr, mag, mask): + + sh = self.fshape + nmodes = self.nmodes + + # stopper + maxz = mag.shape[0] + + # batch buffers + fdev = self.npy.fdev[:maxz] + aux = b_aux[:maxz * nmodes] + + # write-to shape + ish = aux.shape + + ## Actual math ## + + # local values + fm = np.ones((maxz, sh[1], sh[2]), np.float32) + + af = fdev + mag + fm[:] = (1 - mask) + mask * mag / (af + self.denom) + + # upcasting + aux[:] = (aux.reshape(ish[0] // nmodes, nmodes, ish[1], ish[2]) * fm[:, np.newaxis, :, :]).reshape(ish) + return + def log_likelihood(self, b_aux, addr, mag, mask, err_phot): # reference shape (write-to shape) sh = self.fshape @@ -358,7 +408,6 @@ def build_aux(self, b_aux, addr, ob, pr, ex, alpha=1.0): aux = b_aux[:maxz * nmodes] flat_addr = addr.reshape(maxz * nmodes, sh[2], sh[3]) rows, cols = ex.shape[-2:] - for ind, (prc, obc, exc, mac, dic) in enumerate(flat_addr): tmp = ob[obc[0], obc[1]:obc[1] + rows, obc[2]:obc[2] + cols] * \ pr[prc[0], :, :] * \ @@ -368,7 +417,7 @@ def build_aux(self, b_aux, addr, ob, pr, ex, alpha=1.0): aux[ind, :, :] = tmp return - def build_exit(self, b_aux, addr, ob, pr, ex): + def build_exit(self, b_aux, addr, ob, pr, ex, alpha=1): sh = addr.shape @@ -384,9 +433,35 @@ def build_exit(self, b_aux, addr, ob, pr, ex): rows, cols = ex.shape[-2:] for ind, (prc, obc, exc, mac, dic) in enumerate(flat_addr): - dex = aux[ind, :, :] - \ + dex = aux[ind, :, :] - alpha * \ ob[obc[0], obc[1]:obc[1] + rows, obc[2]:obc[2] + cols] * \ - pr[prc[0], prc[1]:prc[1] + rows, prc[2]:prc[2] + cols] + pr[prc[0], prc[1]:prc[1] + rows, prc[2]:prc[2] + cols] + (alpha - 1) * \ + ex[exc[0], exc[1]:exc[1] + rows, exc[2]:exc[2] + cols] + + ex[exc[0], exc[1]:exc[1] + rows, exc[2]:exc[2] + cols] += dex + aux[ind, :, :] = dex + return + + def build_exit_alpha_tau(self, b_aux, addr, ob, pr, ex, alpha=1, tau=1): + sh = addr.shape + + nmodes = sh[1] + + # stopper + maxz = sh[0] + + # batch buffers + aux = b_aux[:maxz * nmodes] + + flat_addr = addr.reshape(maxz * nmodes, sh[2], sh[3]) + rows, cols = ex.shape[-2:] + + for ind, (prc, obc, exc, mac, dic) in enumerate(flat_addr): + dex = tau * aux[ind, :, :] + (tau * alpha - 1) * \ + ex[exc[0], exc[1]:exc[1] + rows, exc[2]:exc[2] + cols] + \ + (1 - tau * (1 + alpha)) * \ + ob[obc[0], obc[1]:obc[1] + rows, 
obc[2]:obc[2] + cols] * \ + pr[prc[0], prc[1]:prc[1] + rows, prc[2]:prc[2] + cols] ex[exc[0], exc[1]:exc[1] + rows, exc[2]:exc[2] + cols] += dex aux[ind, :, :] = dex @@ -478,8 +553,39 @@ def pr_update_ML(self, addr, pr, ob, ex, fac=2.0): ex[exc[0], exc[1]:exc[1] + rows, exc[2]:exc[2] + cols] * fac return + def ob_update_local(self, addr, ob, pr, ex, aux): + sh = addr.shape + flat_addr = addr.reshape(sh[0] * sh[1], sh[2], sh[3]) + rows, cols = ex.shape[-2:] + pr_norm = max_abs2(pr) + for ind, (prc, obc, exc, mac, dic) in enumerate(flat_addr): + ob[obc[0], obc[1]:obc[1] + rows, obc[2]:obc[2] + cols] += \ + pr[prc[0], prc[1]:prc[1] + rows, prc[2]:prc[2] + cols].conj() * \ + (ex[exc[0], exc[1]:exc[1] + rows, exc[2]:exc[2] + cols] - aux[ind,:,:]) / \ + pr_norm + return + + def pr_update_local(self, addr, pr, ob, ex, aux): + sh = addr.shape + flat_addr = addr.reshape(sh[0] * sh[1], sh[2], sh[3]) + rows, cols = ex.shape[-2:] + ob_norm = max_abs2(ob) + for ind, (prc, obc, exc, mac, dic) in enumerate(flat_addr): + pr[prc[0], prc[1]:prc[1] + rows, prc[2]:prc[2] + cols] += \ + ob[obc[0], obc[1]:obc[1] + rows, obc[2]:obc[2] + cols].conj() * \ + (ex[exc[0], exc[1]:exc[1] + rows, exc[2]:exc[2] + cols] - aux[ind,:,:]) / \ + ob_norm + return + class PositionCorrectionKernel(BaseKernel): - def __init__(self, aux, nmodes): + from ptypy.accelerate.base import address_manglers + + MANGLERS = { + 'Annealing': address_manglers.RandomIntMangler, + 'GridSearch': address_manglers.GridSearchMangler + } + + def __init__(self, aux, nmodes, parameters, resolution): super(PositionCorrectionKernel, self).__init__() ash = aux.shape self.fshape = (ash[0] // nmodes, ash[1], ash[2]) @@ -487,11 +593,20 @@ def __init__(self, aux, nmodes): self.npy.fdev = None self.addr = None self.nmodes = nmodes - self.address_mangler = None + self.param = parameters + self.nshifts = parameters.nshifts + self.resolution = resolution self.kernels = ['build_aux', 'fourier_error', 'error_reduce', 'update_addr'] + self.setup() + + def setup(self): + Mangler = self.MANGLERS[self.param.method] + self.mangler = Mangler(int(self.param.amplitude // self.resolution[0]), self.param.start, self.param.stop, + self.param.nshifts, + max_bound=int(self.param.max_shift // self.resolution[0]), randomseed=0) def allocate(self): self.npy.fdev = np.zeros(self.fshape, dtype=np.float32) # we won't use this again but preallocate for speed @@ -565,11 +680,32 @@ def error_reduce(self, addr, err_sum): err_sum[:] = ferr.sum(-1).sum(-1) return + def log_likelihood(self, b_aux, addr, mag, mask, err_sum): + # reference shape (write-to shape) + sh = self.fshape + # stopper + maxz = mag.shape[0] + + # batch buffers + aux = b_aux[:maxz * self.nmodes] + + # build model from complex fourier magnitudes, summing up + # all modes incoherently + tf = aux.reshape(maxz, self.nmodes, sh[1], sh[2]) + LL = (np.abs(tf) ** 2).sum(1) + + # Intensity data + I = mag**2 + + # Calculate log likelihood error + err_sum[:] = ((mask * (LL - I)**2 / (I + 1.)).sum(-1).sum(-1) / np.prod(LL.shape[-2:])) + return + def update_addr_and_error_state(self, addr, error_state, mangled_addr, err_sum): ''' updates the addresses and err state vector corresponding to the smallest error. 
        I think this can be done on the cpu
        '''
        update_indices = err_sum < error_state
-        log(4, "updating %s indices" % np.sum(update_indices))
+        log(4, "Position correction: updating %s indices" % np.sum(update_indices))
        addr[update_indices] = mangled_addr[update_indices]
        error_state[update_indices] = err_sum[update_indices]
diff --git a/ptypy/accelerate/cuda_pycuda/__init__.py b/ptypy/accelerate/cuda_pycuda/__init__.py
index 04074625b..e6c51d49f 100644
--- a/ptypy/accelerate/cuda_pycuda/__init__.py
+++ b/ptypy/accelerate/cuda_pycuda/__init__.py
@@ -3,8 +3,14 @@
 import numpy as np
 import os
 # debug_options = []
-#debug_options = ['-O0', '-G', '-g', '-std=c++11', '--keep']
-debug_options = ['-O3', '-DNDEBUG', '-std=c++11', '-lineinfo'] # release mode flags
+# debug_options = ['-O0', '-G', '-g']
+debug_options = ['-O3', '-DNDEBUG', '-lineinfo'] # release mode flags
+
+# C++14 support was added with CUDA 9, so we only enable the flag there
+if cuda.get_version()[0] >= 9:
+    debug_options += ['-std=c++14']
+else:
+    debug_options += ['-std=c++11']
 
 context = None
 queue = None
@@ -18,9 +24,13 @@ def get_context(new_context=False, new_queue=False):
     if context is None or new_context:
         cuda.init()
-        if parallel.rank_local < cuda.Device.count():
-            context = cuda.Device(parallel.rank_local).make_context()
-            context.push()
+        if parallel.rank_local >= cuda.Device.count():
+            raise Exception('Local rank must be smaller than total device count, \
+                rank={}, rank_local={}, device_count={}'.format(
+                parallel.rank, parallel.rank_local, cuda.Device.count()
+            ))
+        context = cuda.Device(parallel.rank_local).make_context()
+        context.push()
         # print("made context %s on rank %s" % (str(context), str(parallel.rank)))
         # print("The cuda device count on %s is:%s" % (str(parallel.rank),
         #    str(cuda.Device.count())))
@@ -28,20 +38,31 @@
         #    str(parallel.rank_local)))
     if queue is None or new_queue:
         queue = cuda.Stream()
+    return context, queue
 
 def load_kernel(name, subs={}, file=None):
 
     if file is None:
-        fn = "%s/cuda/%s.cu" % (os.path.dirname(__file__), name)
+        if isinstance(name, str):
+            fn = "%s/cuda/%s.cu" % (os.path.dirname(__file__), name)
+        else:
+            raise ValueError("name parameter must be a string if no filename is given")
    else:
        fn = "%s/cuda/%s" % (os.path.dirname(__file__), file)
-
+
    with open(fn, 'r') as f:
        kernel = f.read()
    for k,v in list(subs.items()):
        kernel = kernel.replace(k, str(v))
+    # insert a preprocessor line directive so that compiler errors reference the original source file
+    escaped = fn.replace("\\", "\\\\")
+    kernel = '#line 1 "{}"\n'.format(escaped) + kernel
    mod = SourceModule(kernel, include_dirs=[np.get_include()], no_extern_c=True, options=debug_options)
-    return mod.get_function(name)
+
+    if isinstance(name, str):
+        return mod.get_function(name)
+    else:  # tuple
+        return tuple(mod.get_function(n) for n in name)
diff --git a/ptypy/accelerate/cuda_pycuda/address_manglers.py b/ptypy/accelerate/cuda_pycuda/address_manglers.py
new file mode 100644
index 000000000..d19a77fa4
--- /dev/null
+++ b/ptypy/accelerate/cuda_pycuda/address_manglers.py
@@ -0,0 +1,74 @@
+from ptypy.accelerate.cuda_pycuda import load_kernel
+import numpy as np
+from ptypy.accelerate.base import address_manglers as npam
+from pycuda import gpuarray
+import pycuda.driver as cuda
+
+class BaseMangler(npam.BaseMangler):
+
+    def __init__(self, *args, queue_thread=None, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.queue = queue_thread
+        self.get_address_cuda = load_kernel("get_address")
+        self.delta = None
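+        # host-side copy of the candidate shifts (filled by setup_shifts and
+        # mirrored onto the device by _setup_delta_gpu)
+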
self.delta_gpu = None + + def _setup_delta_gpu(self): + assert self.delta is not None, "Setup delta using the setup_shifts method first" + self.delta = np.ascontiguousarray(self.delta, dtype=np.int32) + + if self.delta_gpu is None or self.delta_gpu.shape[0] < self.delta.shape[0]: + self.delta_gpu = gpuarray.empty(self.delta.shape, dtype=np.int32) + # in case self.delta is smaller than delta_gpu, this will only copy the + # relevant part + cuda.memcpy_htod(dest=self.delta_gpu.ptr, + src=self.delta) + + def get_address(self, index, addr_current, mangled_addr, max_oby, max_obx): + assert addr_current.dtype == np.int32, "addresses must be int32" + assert mangled_addr.dtype == np.int32, "addresses must be int32" + assert len(addr_current.shape) == 4, "addresses must be 4 dimensions" + assert addr_current.shape == mangled_addr.shape, "output addresses must be pre-allocated" + assert self.delta_gpu is not None, "Deltas are not set yet - call setup_shifts first" + assert index < self.delta_gpu.shape[0], "Index out of range for deltas" + assert isinstance(self.delta_gpu, gpuarray.GPUArray), "Only GPU arrays are supported for delta" + + # only using a single thread block here as it's not enough work + # otherwise + self.get_address_cuda( + addr_current, + mangled_addr, + np.int32(addr_current.shape[0] * addr_current.shape[1]), + self.delta_gpu[index,None], + np.int32(max_oby), + np.int32(max_obx), + block=(64,1,1), + grid=(1, 1, 1), + stream=self.queue) + +# with multiple inheritance, we have to be explicit which super class +# we are calling in the methods +class RandomIntMangler(BaseMangler, npam.RandomIntMangler): + + def __init__(self, *args, **kwargs): + BaseMangler.__init__(self, *args, **kwargs) + + def setup_shifts(self, *args, **kwargs): + npam.RandomIntMangler.setup_shifts(self, *args, **kwargs) + self._setup_delta_gpu() + + def get_address(self, *args, **kwargs): + BaseMangler.get_address(self, *args, **kwargs) + + +class GridSearchMangler(BaseMangler, npam.GridSearchMangler): + + def __init__(self, *args, **kwargs): + BaseMangler.__init__(self, *args, **kwargs) + + def setup_shifts(self, *args, **kwargs): + npam.GridSearchMangler.setup_shifts(self, *args, **kwargs) + self._setup_delta_gpu() + + def get_address(self, *args, **kwargs): + BaseMangler.get_address(self, *args, **kwargs) \ No newline at end of file diff --git a/ptypy/accelerate/cuda_pycuda/array_utils.py b/ptypy/accelerate/cuda_pycuda/array_utils.py index 7ec819b95..85f816223 100644 --- a/ptypy/accelerate/cuda_pycuda/array_utils.py +++ b/ptypy/accelerate/cuda_pycuda/array_utils.py @@ -1,28 +1,45 @@ from . 
import load_kernel
 from pycuda import gpuarray
+import pycuda.driver as cuda
 from ptypy.utils import gaussian
 import numpy as np
 
+
+# maps a numpy dtype to the corresponding C type
+def map2ctype(dt):
+    if dt == np.float32:
+        return 'float'
+    elif dt == np.float64:
+        return 'double'
+    elif dt == np.complex64:
+        return 'complex<float>'
+    elif dt == np.complex128:
+        return 'complex<double>'
+    elif dt == np.int32:
+        return 'int'
+    elif dt == np.int64:
+        return 'long long'
+    else:
+        raise ValueError('No mapping for {}'.format(dt))
+
+
 class ArrayUtilsKernel:
     def __init__(self, acc_dtype=np.float64, queue=None):
         self.queue = queue
         self.acc_dtype = acc_dtype
         self.cdot_cuda = load_kernel("dot", {
-            'INTYPE': 'complex<float>',
-            'ACCTYPE': 'double' if acc_dtype==np.float64 else 'float'
+            'IN_TYPE': 'complex<float>',
+            'ACC_TYPE': 'double' if acc_dtype==np.float64 else 'float'
         })
         self.dot_cuda = load_kernel("dot", {
-            'INTYPE': 'float',
-            'ACCTYPE': 'double' if acc_dtype==np.float64 else 'float'
+            'IN_TYPE': 'float',
+            'ACC_TYPE': 'double' if acc_dtype==np.float64 else 'float'
         })
         self.full_reduce_cuda = load_kernel("full_reduce", {
-            'DTYPE': 'double' if acc_dtype==np.float64 else 'float',
+            'IN_TYPE': 'double' if acc_dtype==np.float64 else 'float',
+            'OUT_TYPE': 'double' if acc_dtype==np.float64 else 'float',
+            'ACC_TYPE': 'double' if acc_dtype==np.float64 else 'float',
             'BDIM_X': 1024
         })
-        self.transpose_cuda = load_kernel("transpose", {
-            'DTYPE': 'int',
-            'BDIM': 16
-        })
         self.Ctmp = None
 
     def dot(self, A, B, out=None):
@@ -60,6 +77,18 @@
         return out
 
+    def norm2(self, A, out=None):
+        return self.dot(A, A, out)
+
+class TransposeKernel:
+
+    def __init__(self, queue=None):
+        self.queue = queue
+        self.transpose_cuda = load_kernel("transpose", {
+            'DTYPE': 'int',
+            'BDIM': 16
+        })
+
     def transpose(self, input, output):
         # only for int at the moment (addr array), and 2D (reshape pls)
         if len(input.shape) != 2:
@@ -80,8 +109,134 @@
         self.transpose_cuda(input, output, np.int32(width), np.int32(height),
                             block=blk, grid=grd, stream=self.queue)
 
-    def norm2(self, A, out=None):
-        return self.dot(A, A, out)
+class MaxAbs2Kernel:
+
+    def __init__(self, queue=None):
+        self.queue = queue
+        # we lazy-load this depending on the data types we get
+        self.max_abs2_cuda = {}
+
+    def max_abs2(self, X, out):
+        """ Calculate max(abs(x)**2) across the final 2 dimensions"""
+        rows = np.int32(X.shape[-2])
+        cols = np.int32(X.shape[-1])
+        firstdims = np.int32(np.prod(X.shape[:-2]))
+        gy = int(rows)
+        # lazy-loading, keeping scratch memory and both kernels in the same dictionary
+        bx = int(64)
+        version = '{},{},{}'.format(map2ctype(X.dtype), map2ctype(out.dtype), gy)
+        if version not in self.max_abs2_cuda:
+            step1, step2 = load_kernel(
+                ("max_abs2_step1", "max_abs2_step2"),
+                {
+                    'IN_TYPE': map2ctype(X.dtype),
+                    'OUT_TYPE': map2ctype(out.dtype),
+                    'BDIM_X': bx,
+                }, "max_abs2.cu")
+            self.max_abs2_cuda[version] = {
+                'step1': step1,
+                'step2': step2,
+                'scratchmem': gpuarray.empty((gy,), dtype=out.dtype)
+            }
+
+        scratch = self.max_abs2_cuda[version]['scratchmem']
+
+        self.max_abs2_cuda[version]['step1'](X, firstdims, rows, cols, scratch,
+                                             block=(bx, 1, 1), grid=(1, gy, 1),
+                                             stream=self.queue)
+        self.max_abs2_cuda[version]['step2'](scratch, np.int32(gy), out,
+                                             block=(bx, 1, 1), grid=(1, 1, 1),
+                                             stream=self.queue
+                                             )
+
+
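+# A minimal usage sketch for MaxAbs2Kernel (hypothetical names; assumes an
+# active pycuda context and a 3D complex64 gpuarray `probe`):
+#
+#   mak = MaxAbs2Kernel(queue=stream)
+#   out = gpuarray.zeros((1,), dtype=np.float32)
+#   mak.max_abs2(probe, out)   # out receives max(|probe|**2) over the last two axes
+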
+class CropPadKernel:
+
+    def __init__(self, queue=None):
+        self.queue = queue
+        # we lazy-load this depending on the data types we get
+        self.fill3D_cuda = {}
+
+    def fill3D(self, A, B, offset=[0, 0, 0]):
+        """
+        Fill 3-dimensional array A with B.
+        """
+        if A.ndim < 3 or B.ndim < 3:
+            raise ValueError('Input arrays must each be at least 3D')
+        assert A.ndim == B.ndim, "Input and Output must have the same number of dimensions."
+        ash = A.shape
+        bsh = B.shape
+        misfit = np.array(bsh) - np.array(ash)
+        assert not misfit[:-3].any(), "Input and Output must have the same shape everywhere but the last three axes."
+
+        Alim = np.array(A.shape[-3:])
+        Blim = np.array(B.shape[-3:])
+        off = np.array(offset)
+        Ao = off.copy()
+        Ao[Ao < 0] = 0
+        Bo = -off.copy()
+        Bo[Bo < 0] = 0
+        assert (Bo < Blim).all() and (Ao < Alim).all(), "At least one dimension lacks overlap"
+        Ao = Ao.astype(np.int32)
+        Bo = Bo.astype(np.int32)
+        lengths = np.array([
+            min(off[0] + Blim[0], Alim[0]) - Ao[0],
+            min(off[1] + Blim[1], Alim[1]) - Ao[1],
+            min(off[2] + Blim[2], Alim[2]) - Ao[2],
+        ], dtype=np.int32)
+        lengths2 = np.array([
+            min(Alim[0] - off[0], Blim[0]) - Bo[0],
+            min(Alim[1] - off[1], Blim[1]) - Bo[1],
+            min(Alim[2] - off[2], Blim[2]) - Bo[2],
+        ], dtype=np.int32)
+        assert (lengths == lengths2).all(), "left and right lengths do not match"
+        batch = int(np.prod(A.shape[:-3]))
+
+        # lazy loading depending on data type
+        version = '{},{}'.format(map2ctype(B.dtype), map2ctype(A.dtype))
+        if version not in self.fill3D_cuda:
+            self.fill3D_cuda[version] = load_kernel("fill3D", {
+                'IN_TYPE': map2ctype(B.dtype),
+                'OUT_TYPE': map2ctype(A.dtype)
+            })
+        bx = by = 32
+        self.fill3D_cuda[version](
+            A, B,
+            np.int32(A.shape[-3]), np.int32(A.shape[-2]), np.int32(A.shape[-1]),
+            np.int32(B.shape[-3]), np.int32(B.shape[-2]), np.int32(B.shape[-1]),
+            Ao[0], Ao[1], Ao[2],
+            Bo[0], Bo[1], Bo[2],
+            lengths[0], lengths[1], lengths[2],
+            block=(int(bx), int(by), int(1)),
+            grid=(
+                int((lengths[2] + bx - 1)//bx),
+                int((lengths[1] + by - 1)//by),
+                int(batch)),
+            stream=self.queue
+        )
+
+    def crop_pad_2d_simple(self, A, B):
+        """
+        Places B in A centered around the last two axes. A and B must be of the same shape
+        everywhere but the last two dims.
+        """
+        assert A.ndim >= 2, "Arrays must have at least 2 dimensions."
+        assert A.ndim == B.ndim, "Input and Output must have the same number of dimensions."
+        misfit = np.array(A.shape) - np.array(B.shape)
+        assert not misfit[:-2].any(), "Input and Output must have the same shape everywhere but the last two axes."
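+        # promote plain 2D inputs to 3D so the generic fill3D kernel can be
+        # used unchanged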
+        if A.ndim == 2:
+            A = A.reshape((1,) + A.shape)
+        if B.ndim == 2:
+            B = B.reshape((1,) + B.shape)
+        a1, a2 = A.shape[-2:]
+        b1, b2 = B.shape[-2:]
+        offset = [0, a1 // 2 - b1 // 2, a2 // 2 - b2 // 2]
+        self.fill3D(A, B, offset)
 
 
 class DerivativesKernel:
@@ -99,30 +254,27 @@ def __init__(self, dtype, queue=None):
         self.last_axis_block = (256, 4, 1)
         self.mid_axis_block = (256, 4, 1)
 
-        self.delxf_last = load_kernel("delx_last", file="delx_last.cu", subs={
-            'IS_FORWARD': 'true',
-            'BDIM_X': str(self.last_axis_block[0]),
-            'BDIM_Y': str(self.last_axis_block[1]),
-            'DTYPE': stype
-        })
-        self.delxb_last = load_kernel("delx_last", file="delx_last.cu", subs={
-            'IS_FORWARD': 'false',
-            'BDIM_X': str(self.last_axis_block[0]),
-            'BDIM_Y': str(self.last_axis_block[1]),
-            'DTYPE': stype
-        })
-        self.delxf_mid = load_kernel("delx_mid", file="delx_mid.cu", subs={
-            'IS_FORWARD': 'true',
-            'BDIM_X': str(self.mid_axis_block[0]),
-            'BDIM_Y': str(self.mid_axis_block[1]),
-            'DTYPE': stype
-        })
-        self.delxb_mid = load_kernel("delx_mid", file="delx_mid.cu", subs={
-            'IS_FORWARD': 'false',
-            'BDIM_X': str(self.mid_axis_block[0]),
-            'BDIM_Y': str(self.mid_axis_block[1]),
-            'DTYPE': stype
-        })
+        self.delxf_last, self.delxf_mid = load_kernel(
+            ("delx_last", "delx_mid"),
+            file="delx.cu",
+            subs={
+                'IS_FORWARD': 'true',
+                'BDIM_X': str(self.last_axis_block[0]),
+                'BDIM_Y': str(self.last_axis_block[1]),
+                'IN_TYPE': stype,
+                'OUT_TYPE': stype
+            })
+        self.delxb_last, self.delxb_mid = load_kernel(
+            ("delx_last", "delx_mid"),
+            file="delx.cu",
+            subs={
+                'IS_FORWARD': 'false',
+                'BDIM_X': str(self.last_axis_block[0]),
+                'BDIM_Y': str(self.last_axis_block[1]),
+                'IN_TYPE': stype,
+                'OUT_TYPE': stype
+            })
+
     def delxf(self, input, out, axis=-1):
         if input.dtype != self.dtype:
@@ -188,34 +340,59 @@
 class GaussianSmoothingKernel:
-    def __init__(self, queue=None, num_stdevs=4):
+    def __init__(self, queue=None, num_stdevs=4, kernel_type='float'):
+        if kernel_type not in ['float', 'double']:
+            raise ValueError('Invalid data type for kernel')
+        self.kernel_type = kernel_type
         self.dtype = np.complex64
         self.stype = "complex<float>"
         self.queue = queue
         self.num_stdevs = num_stdevs
         self.blockdim_x = 4
         self.blockdim_y = 16
 
+        # At least 2 blocks per SM
         self.max_shared_per_block = 48 * 1024 // 2
         self.max_shared_per_block_complex = self.max_shared_per_block / 2 * np.dtype(np.float32).itemsize
-        self.max_kernel_radius = self.max_shared_per_block_complex / self.blockdim_y
+        self.max_kernel_radius = int(self.max_shared_per_block_complex / self.blockdim_y)
 
-        self.convolution_row = load_kernel("convolution_row", file="convolution.cu", subs={
-            'BDIM_X': self.blockdim_x,
-            'BDIM_Y': self.blockdim_y,
-            'DTYPE': self.stype
+        self.convolution_row = load_kernel(
+            "convolution_row", file="convolution.cu", subs={
+                'BDIM_X': self.blockdim_x,
+                'BDIM_Y': self.blockdim_y,
+                'DTYPE': self.stype,
+                'MATH_TYPE': self.kernel_type
             })
 
-        self.convolution_col = load_kernel("convolution_col", file="convolution.cu", subs={
-            'BDIM_X': self.blockdim_y,
-            'BDIM_Y': self.blockdim_x,
-            'DTYPE': self.stype
+        self.convolution_col = load_kernel(
+            "convolution_col", file="convolution.cu", subs={
+                'BDIM_X': self.blockdim_y,  # NOTE: x and y are deliberately swapped for the column kernel
+                'BDIM_Y': self.blockdim_x,
+                'DTYPE': self.stype,
+                'MATH_TYPE': self.kernel_type
             })
 
+        # pre-allocate kernel memory on gpu, sized to accommodate the maximum permissible radius
+        dtype = np.float32 if self.kernel_type == 'float' else np.float64
+        self.kernel_gpu = gpuarray.empty((self.max_kernel_radius,), dtype=dtype)
+        # keep track of the previous radius and std to determine if we need to transfer again
+        self.r = 0
+        self.std = 0
 
-    def convolution(self, input, output, mfs):
-        ndims = input.ndim
-        shape = input.shape
+    def convolution(self, data, mfs, tmp=None):
+        """
+        Calculates a stacked 2D convolution for smoothing, with the standard deviations
+        given in mfs (stdx, stdy). It works in-place in the data array,
+        and tmp is a gpu-allocated array of the same size and type as data,
+        used internally for temporary storage.
+        """
+        ndims = data.ndim
+        shape = data.shape
+
+        # Create temporary array (if not given)
+        if tmp is None:
+            tmp = gpuarray.empty(shape, dtype=data.dtype)
+        assert shape == tmp.shape and data.dtype == tmp.dtype
 
         # Check input dimensions
         if ndims == 3:
@@ -232,15 +409,23 @@
         else:
             raise NotImplementedError("input needs to be of dimensions 0 < ndims <= 3")
 
+        input = data
+        output = tmp
+
         # Row convolution kernel
         # TODO: is this threshold acceptable in all cases?
         if stdx > 0.1:
             r = int(self.num_stdevs * stdx + 0.5)
-            g = gaussian(np.arange(-r,r+1), stdx)
-            g /= g.sum()
-            kernel = gpuarray.to_gpu(g[r:].astype(np.float32))
             if r > self.max_kernel_radius:
                 raise ValueError("Size of Gaussian kernel too large")
+            if r != self.r or stdx != self.std:
+                # recalculate + transfer
+                g = gaussian(np.arange(-r,r+1), stdx)
+                g /= g.sum()
+                k = np.ascontiguousarray(g[r:].astype(np.float32 if self.kernel_type == 'float' else np.float64))
+                self.kernel_gpu[:r+1] = k[:]
+                self.r = r
+                self.std = stdx
 
             bx = self.blockdim_x
             by = self.blockdim_y
@@ -251,21 +436,27 @@
             blk = (bx, by, 1)
             grd = (int((y + bx -1)// bx), int((x + by-1)// by), batches)
 
-            self.convolution_row(input, output, np.int32(y), np.int32(x), kernel, np.int32(r),
+            self.convolution_row(input, output, np.int32(y), np.int32(x), self.kernel_gpu, np.int32(r),
                                  block=blk, grid=grd, shared=shared, stream=self.queue)
 
-            # Overwrite input
             input = output
-
+            output = data
+
         # Column convolution kernel
         # TODO: is this threshold acceptable in all cases?
         if stdy > 0.1:
             r = int(self.num_stdevs * stdy + 0.5)
-            g = gaussian(np.arange(-r,r+1), stdy)
-            g /= g.sum()
-            kernel = gpuarray.to_gpu(g[r:].astype(np.float32))
             if r > self.max_kernel_radius:
                 raise ValueError("Size of Gaussian kernel too large")
+            if r != self.r or stdy != self.std:
+                # recalculate + transfer
+                g = gaussian(np.arange(-r,r+1), stdy)
+                g /= g.sum()
+                k = np.ascontiguousarray(g[r:].astype(np.float32 if self.kernel_type == 'float' else np.float64))
+                self.kernel_gpu[:r+1] = k[:]
+                self.r = r
+                self.std = stdy
+
             bx = self.blockdim_y
             by = self.blockdim_x
@@ -276,9 +467,35 @@
             blk = (bx, by, 1)
             grd = (int((y + bx -1)// bx), int((x + by-1)// by), batches)
 
-            self.convolution_col(input, output, np.int32(y), np.int32(x), kernel, np.int32(r),
+            self.convolution_col(input, output, np.int32(y), np.int32(x), self.kernel_gpu, np.int32(r),
                                  block=blk, grid=grd, shared=shared, stream=self.queue)
 
         # TODO: is this threshold acceptable in all cases?
if (stdx <= 0.1 and stdy <= 0.1): - output[:] = input[:] + return # nothing to do + elif (stdx > 0.1 and stdy > 0.1): + return # both parts have run, output is back in data + else: + data[:] = tmp[:] # only one of them has run, output is in tmp + +class ClipMagnitudesKernel: + + def __init__(self, queue=None): + self.queue = queue + self.clip_magnitudes_cuda = load_kernel("clip_magnitudes", { + 'IN_TYPE': 'complex', + }) + + def clip_magnitudes_to_range(self, array, clip_min, clip_max): + + cmin = np.float32(clip_min) + cmax = np.float32(clip_max) + + npixel = np.int32(np.prod(array.shape)) + bx = 256 + gx = int((npixel + bx - 1) // bx) + self.clip_magnitudes_cuda(array, cmin, cmax, + npixel, + block=(bx, 1, 1), + grid=(gx, 1, 1), + stream=self.queue) diff --git a/ptypy/accelerate/cuda_pycuda/cuda/batched_multiply.cu b/ptypy/accelerate/cuda_pycuda/cuda/batched_multiply.cu index 15ca555fa..1263841b6 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/batched_multiply.cu +++ b/ptypy/accelerate/cuda_pycuda/cuda/batched_multiply.cu @@ -1,13 +1,19 @@ /** This kernel was used for FFT pre- and post-scaling, to test if cuFFT via python is worthwhile. It turned out it wasn't. -*/ + * + * Data types: + * - IN_TYPE: the data type for the inputs + * - OUT_TYPE: the data type for the outputs + * - MATH_TYPE: the data type used for computation (filter) + */ + #include using thrust::complex; -extern "C" __global__ void batched_multiply(const complex* input, - complex* output, - const complex* filter, +extern "C" __global__ void batched_multiply(const complex* input, + complex* output, + const complex* filter, float scale, int nBatches, int rows, diff --git a/ptypy/accelerate/cuda_pycuda/cuda/build_aux.cu b/ptypy/accelerate/cuda_pycuda/cuda/build_aux.cu index 88b22c256..e9ceeb80c 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/build_aux.cu +++ b/ptypy/accelerate/cuda_pycuda/cuda/build_aux.cu @@ -1,24 +1,43 @@ +/** build_aux kernel. 
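+ * Per mode: aux = (1 + alpha) * obj * probe - alpha * ex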
+ *
+ * Data types:
+ * - IN_TYPE: the data type for the inputs (float or double)
+ * - OUT_TYPE: the data type for the outputs (float or double - for aux wave)
+ * - MATH_TYPE: the data type used for computation
+ */
+
 #include <thrust/complex.h>
 using thrust::complex;
 
+// core calculation function - used by both kernels and inlined
+inline __device__ complex<MATH_TYPE> calculate(
+    const complex<MATH_TYPE>& t_obj,
+    const complex<MATH_TYPE>& t_probe,
+    const complex<MATH_TYPE>& t_ex,
+    MATH_TYPE alpha)
+{
+  return t_obj * t_probe * (MATH_TYPE(1) + alpha) - t_ex * alpha;
+}
+
 extern "C" __global__ void build_aux(
-    complex<float>* auxiliary_wave,
-    const complex<float>* __restrict__ exit_wave,
+    complex<OUT_TYPE>* auxiliary_wave,
+    const complex<IN_TYPE>* __restrict__ exit_wave,
     int B,
     int C,
-    const complex<float>* __restrict__ probe,
+    const complex<IN_TYPE>* __restrict__ probe,
     int E,
     int F,
-    const complex<float>* __restrict__ obj,
+    const complex<IN_TYPE>* __restrict__ obj,
     int H,
     int I,
     const int* __restrict__ addr,
-    float alpha)
+    IN_TYPE alpha_)
 {
   int bid = blockIdx.x;
   int tx = threadIdx.x;
   int ty = threadIdx.y;
   int addr_stride = 15;
+  const MATH_TYPE alpha = alpha_;  // type conversion
 
   const int* oa = addr + 3 + bid * addr_stride;
   const int* pa = addr + bid * addr_stride;
@@ -35,9 +54,46 @@ extern "C" __global__ void build_aux(
   // (it will work for less as well)
   for (int c = tx; c < C; c += blockDim.x)
   {
-    auxiliary_wave[b * C + c] =
-        obj[b * I + c] * probe[b * F + c] * (1.0f + alpha) -
-        exit_wave[b * C + c] * alpha;
+    auxiliary_wave[b * C + c] = calculate(
+        obj[b * I + c], probe[b * F + c], exit_wave[b * C + c], alpha);
   }
 }
+
+extern "C" __global__ void build_aux2(
+    complex<OUT_TYPE>* auxiliary_wave,
+    const complex<IN_TYPE>* __restrict__ exit_wave,
+    int B,
+    int C,
+    const complex<IN_TYPE>* __restrict__ probe,
+    int E,
+    int F,
+    const complex<IN_TYPE>* __restrict__ obj,
+    int H,
+    int I,
+    const int* __restrict__ addr,
+    IN_TYPE alpha_)
+{
+  int bid = blockIdx.z;
+  int tx = threadIdx.x;
+  int b = threadIdx.y + blockIdx.y * blockDim.y;
+  if (b >= B)
+    return;
+  int addr_stride = 15;
+  const MATH_TYPE alpha = alpha_;  // type conversion
+
+  const int* oa = addr + 3 + bid * addr_stride;
+  const int* pa = addr + bid * addr_stride;
+  const int* ea = addr + 6 + bid * addr_stride;
+
+  probe += pa[0] * E * F + pa[1] * F + pa[2];
+  obj += oa[0] * H * I + oa[1] * I + oa[2];
+  exit_wave += ea[0] * B * C;
+  auxiliary_wave += ea[0] * B * C;
+
+  for (int c = tx; c < C; c += blockDim.x)
+  {
+    auxiliary_wave[b * C + c] = calculate(
+        obj[b * I + c], probe[b * F + c], exit_wave[b * C + c], alpha);
+  }
+}
diff --git a/ptypy/accelerate/cuda_pycuda/cuda/build_aux_no_ex.cu b/ptypy/accelerate/cuda_pycuda/cuda/build_aux_no_ex.cu
index 384efc070..ee091c58e 100644
--- a/ptypy/accelerate/cuda_pycuda/cuda/build_aux_no_ex.cu
+++ b/ptypy/accelerate/cuda_pycuda/cuda/build_aux_no_ex.cu
@@ -1,23 +1,32 @@
+/** build_aux without exit wave kernel.
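+ * Per mode: aux = fac * obj * probe (added to aux instead if doAdd is set)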
+ *
+ * Data types:
+ * - IN_TYPE: the data type for the inputs (float or double)
+ * - OUT_TYPE: the data type for the outputs (float or double - for aux wave)
+ * - MATH_TYPE: the data type used for computation
+ */
+
 #include <thrust/complex.h>
 using thrust::complex;
 
-extern "C" __global__ void build_aux_no_ex(CTYPE* auxilliary_wave,
+extern "C" __global__ void build_aux_no_ex(complex<OUT_TYPE>* auxilliary_wave,
                                            int aRows,
                                            int aCols,
-                                           const CTYPE* __restrict__ probe,
+                                           const complex<IN_TYPE>* __restrict__ probe,
                                            int pRows,
                                            int pCols,
-                                           const CTYPE* __restrict__ obj,
+                                           const complex<IN_TYPE>* __restrict__ obj,
                                            int oRows,
                                            int oCols,
                                            const int* __restrict__ addr,
-                                           FTYPE fac,
+                                           IN_TYPE fac_,
                                            int doAdd)
 {
   int bid = blockIdx.x;
   int tx = threadIdx.x;
   int ty = threadIdx.y;
   const int addr_stride = 15;
+  const MATH_TYPE fac = fac_;  // type conversion
 
   const int* oa = addr + 3 + bid * addr_stride;
   const int* pa = addr + bid * addr_stride;
@@ -32,7 +41,9 @@ extern "C" __global__ void build_aux_no_ex(
 #pragma unroll(4)
   for (int c = tx; c < aCols; c += blockDim.x)
   {
-    auto tmp = obj[b * oCols + c] * probe[b * pCols + c] * fac;
+    complex<MATH_TYPE> t_obj = obj[b * oCols + c];
+    complex<MATH_TYPE> t_probe = probe[b * pCols + c];
+    auto tmp = t_obj * t_probe * fac;
     if (doAdd)
     {
       auxilliary_wave[b * aCols + c] += tmp;
@@ -43,4 +54,50 @@
     }
   }
 }
+
+extern "C" __global__ void build_aux2_no_ex(complex<OUT_TYPE>* auxilliary_wave,
+                                            int aRows,
+                                            int aCols,
+                                            const complex<IN_TYPE>* __restrict__ probe,
+                                            int pRows,
+                                            int pCols,
+                                            const complex<IN_TYPE>* __restrict__ obj,
+                                            int oRows,
+                                            int oCols,
+                                            const int* __restrict__ addr,
+                                            IN_TYPE fac_,
+                                            int doAdd)
+{
+  int bid = blockIdx.z;
+  int tx = threadIdx.x;
+  int b = threadIdx.y + blockIdx.y * blockDim.y;
+  if (b >= aRows)
+    return;
+  const int addr_stride = 15;
+  const MATH_TYPE fac = fac_;  // type conversion
+
+  const int* oa = addr + 3 + bid * addr_stride;
+  const int* pa = addr + bid * addr_stride;
+  const int* ea = addr + 6 + bid * addr_stride;
+
+  obj += oa[0] * oRows * oCols + oa[1] * oCols + oa[2];
+  probe += pa[0] * pRows * pCols + pa[1] * pCols + pa[2];
+  auxilliary_wave += ea[0] * aRows * aCols;
+
+  for (int c = tx; c < aCols; c += blockDim.x)
+  {
+    complex<MATH_TYPE> t_obj = obj[b * oCols + c];
+    complex<MATH_TYPE> t_probe = probe[b * pCols + c];
+    auto tmp = t_obj * t_probe * fac;
+    if (doAdd)
+    {
+      auxilliary_wave[b * aCols + c] += tmp;
+    }
+    else
+    {
+      auxilliary_wave[b * aCols + c] = tmp;
+    }
+  }
+}
\ No newline at end of file
diff --git a/ptypy/accelerate/cuda_pycuda/cuda/build_aux_position_correction.cu b/ptypy/accelerate/cuda_pycuda/cuda/build_aux_position_correction.cu
index 004e7f0ed..327040371 100644
--- a/ptypy/accelerate/cuda_pycuda/cuda/build_aux_position_correction.cu
+++ b/ptypy/accelerate/cuda_pycuda/cuda/build_aux_position_correction.cu
@@ -1,12 +1,20 @@
+/** build_aux for position correction.
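+ * Per mode: aux = obj * probe, evaluated at the candidate (mangled) addresses.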
+ * + * Data types: + * - IN_TYPE: the data type for the inputs (float or double) + * - OUT_TYPE: the data type for the outputs (float or double - for aux wave) + * - MATH_TYPE: the data type used for computation + */ + #include using thrust::complex; extern "C" __global__ void build_aux_position_correction( - complex* auxiliary_wave, - const complex* __restrict__ probe, + complex* auxiliary_wave, + const complex* __restrict__ probe, int B, int C, - const complex* __restrict__ obj, + const complex* __restrict__ obj, int H, int I, const int* __restrict__ addr) @@ -30,7 +38,9 @@ extern "C" __global__ void build_aux_position_correction( // (it will work for less as well) for (int c = tx; c < C; c += blockDim.x) { - auxiliary_wave[b * C + c] = obj[b * I + c] * probe[b * C + c]; + complex t_obj = obj[b * I + c]; + complex t_probe = probe[b * C + c]; + auxiliary_wave[b * C + c] = t_obj * t_probe; } } } diff --git a/ptypy/accelerate/cuda_pycuda/cuda/build_exit.cu b/ptypy/accelerate/cuda_pycuda/cuda/build_exit.cu index 87031184e..2b98634dc 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/build_exit.cu +++ b/ptypy/accelerate/cuda_pycuda/cuda/build_exit.cu @@ -1,3 +1,12 @@ +/** build_exit kernel. + * + * Data types: + * - IN_TYPE: the data type for the inputs (float or double) + * - OUT_TYPE: the data type for the outputs (float or double - for aux wave) + * - MATH_TYPE: the data type used for computation + */ + + #include using thrust::complex; @@ -9,22 +18,24 @@ __device__ inline void atomicAdd(complex* x, complex y) atomicAdd(xf + 1, y.imag()); } -extern "C" __global__ void build_exit(complex* auxiliary_wave, - complex* exit_wave, +extern "C" __global__ void build_exit(complex* auxiliary_wave, + complex* exit_wave, int B, int C, - const complex* __restrict__ probe, + const complex* __restrict__ probe, int E, int F, - const complex* __restrict__ obj, + const complex* __restrict__ obj, int H, int I, - const int* __restrict__ addr) + const int* __restrict__ addr, + IN_TYPE alpha_) { int bid = blockIdx.x; int tx = threadIdx.x; int ty = threadIdx.y; const int addr_stride = 15; + const MATH_TYPE alpha = alpha_; // type conversion const int* oa = addr + 3 + bid * addr_stride; const int* pa = addr + bid * addr_stride; @@ -41,8 +52,12 @@ extern "C" __global__ void build_exit(complex* auxiliary_wave, // (it will work for less as well) for (int c = tx; c < C; c += blockDim.x) { - auto auxv = auxiliary_wave[b * C + c]; - auxv -= probe[b * F + c] * obj[b * I + c]; + complex auxv = auxiliary_wave[b * C + c]; + complex t_probe = probe[b * F + c]; + complex t_obj = obj[b * I + c]; + complex t_exit = exit_wave[b * C + c]; + auxv -= alpha * t_probe * t_obj; + auxv += (alpha - 1) * t_exit; exit_wave[b * C + c] += auxv; auxiliary_wave[b * C + c] = auxv; } diff --git a/ptypy/accelerate/cuda_pycuda/cuda/build_exit_alpha_tau.cu b/ptypy/accelerate/cuda_pycuda/cuda/build_exit_alpha_tau.cu new file mode 100644 index 000000000..8528f2e9c --- /dev/null +++ b/ptypy/accelerate/cuda_pycuda/cuda/build_exit_alpha_tau.cu @@ -0,0 +1,60 @@ +/** build_exit_alpha_tau kernel. 
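+ * Per mode:
+ *   dex = tau*aux + (tau*alpha - 1)*ex + (1 - tau*(1 + alpha))*obj*probe
+ * followed by ex += dex and aux = dex.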
+ * + * Data types: + * - IN_TYPE: the data type for the inputs (float or double) + * - OUT_TYPE: the data type for the outputs (float or double - for aux wave) + * - MATH_TYPE: the data type used for computation + */ + + +#include +using thrust::complex; + + +extern "C" __global__ void build_exit_alpha_tau( + complex* auxiliary_wave, + complex* exit_wave, + int B, + int C, + const complex* __restrict__ probe, + int E, + int F, + const complex* __restrict__ obj, + int H, + int I, + const int* __restrict__ addr, + IN_TYPE alpha_, + IN_TYPE tau_) +{ + int bid = blockIdx.z; + int tx = threadIdx.x; + const int b = threadIdx.y + blockIdx.y * blockDim.y; + if (b >= B) + return; + const int addr_stride = 15; + MATH_TYPE alpha = alpha_; + MATH_TYPE tau = tau_; + + const int* oa = addr + 3 + bid * addr_stride; + const int* pa = addr + bid * addr_stride; + const int* ea = addr + 6 + bid * addr_stride; + + probe += pa[0] * E * F + pa[1] * F + pa[2]; + obj += oa[0] * H * I + oa[1] * I + oa[2]; + exit_wave += ea[0] * B * C; + auxiliary_wave += ea[0] * B * C; + + for (int c = tx; c < C; c += blockDim.x) + { + complex t_aux = auxiliary_wave[b * C + c]; + complex t_probe = probe[b * F + c]; + complex t_obj = obj[b * I + c]; + complex t_ex = exit_wave[b * C + c]; + + auto dex = tau * t_aux + (tau * alpha - MATH_TYPE(1)) * t_ex + + (MATH_TYPE(1) - tau * (MATH_TYPE(1) + alpha)) * t_obj * t_probe; + + exit_wave[b * C + c] += dex; + auxiliary_wave[b * C + c] = dex; + } +} diff --git a/ptypy/accelerate/cuda_pycuda/cuda/clip_magnitudes.cu b/ptypy/accelerate/cuda_pycuda/cuda/clip_magnitudes.cu new file mode 100644 index 000000000..8128091f9 --- /dev/null +++ b/ptypy/accelerate/cuda_pycuda/cuda/clip_magnitudes.cu @@ -0,0 +1,30 @@ +/** clip_magnitudes. + * + */ + #include + #include + #include + using thrust::complex; + + extern "C" __global__ void clip_magnitudes(IN_TYPE *arr, + float clip_min, + float clip_max, + int N) +{ + int id = threadIdx.x + blockIdx.x * blockDim.x; + + if (id >= N) + return; + + auto v = arr[id]; + auto mag = abs(v); + auto theta = arg(v); + + if (mag > clip_max) + mag = clip_max; + if (mag < clip_min) + mag = clip_min; + + v = thrust::polar(mag, theta); + arr[id] = v; +} \ No newline at end of file diff --git a/ptypy/accelerate/cuda_pycuda/cuda/convolution.cu b/ptypy/accelerate/cuda_pycuda/cuda/convolution.cu index 1b008c815..ae42ecba5 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/convolution.cu +++ b/ptypy/accelerate/cuda_pycuda/cuda/convolution.cu @@ -1,3 +1,11 @@ +/** + * Data types: + * - DTYPE (float/double/complex/complex) + * - MATH_TYPE (float/double) - used for the convolution kernel itself + * + * A symmetric convolution kernel is assumed here + */ + #include using thrust::complex; @@ -42,7 +50,7 @@ extern "C" __global__ void convolution_row(const DTYPE *__restrict__ input, DTYPE *output, int height, int width, - const float* kernel, + const MATH_TYPE* kernel, int kernel_radius) { int tx = threadIdx.x; @@ -97,7 +105,7 @@ extern "C" __global__ void convolution_row(const DTYPE *__restrict__ input, if (gby + ty >= width || gbx + tx >= height) return; - // compute + // compute - will be complex if kernel is double auto sum = shm[tx * shwidth + (ty + kernel_radius)] * kernel[0]; for (int i = 1; i <= kernel_radius; ++i) { @@ -117,7 +125,7 @@ extern "C" __global__ void convolution_col(const DTYPE *__restrict__ input, DTYPE *output, int height, int width, - const float* kernel, + const MATH_TYPE* kernel, int kernel_radius) { int tx = threadIdx.x; @@ -169,7 +177,7 @@ extern "C" 
__global__ void convolution_col(const DTYPE *__restrict__ input, if (gby + ty >= width || gbx + tx >= height) return; - // compute + // compute - will be complex if kernel is double auto sum = shm[(tx + kernel_radius) * BDIM_Y + ty] * kernel[0]; for (int i = 1; i <= kernel_radius; ++i) { diff --git a/ptypy/accelerate/cuda_pycuda/cuda/delx_mid.cu b/ptypy/accelerate/cuda_pycuda/cuda/delx.cu similarity index 58% rename from ptypy/accelerate/cuda_pycuda/cuda/delx_mid.cu rename to ptypy/accelerate/cuda_pycuda/cuda/delx.cu index ffc6600ca..f2e8a934e 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/delx_mid.cu +++ b/ptypy/accelerate/cuda_pycuda/cuda/delx.cu @@ -1,6 +1,14 @@ +/** difference along axes (last and mid axis kernels) + * + * Data types: + * - IN_TYPE: the data type for the inputs + * - OUT_TYPE: the data type for the outputs + */ + #include using thrust::complex; + /** Finite difference for forward/backward for any axis that is not the * last one, assuring that the reads and writes are coalesced. * @@ -40,8 +48,8 @@ using thrust::complex; * zero if it's the end of the input. * */ -extern "C" __global__ void delx_mid(const DTYPE *__restrict__ input, - DTYPE *output, +extern "C" __global__ void delx_mid(const IN_TYPE *__restrict__ input, + OUT_TYPE *output, int lower_dim, // x for 3D int higher_dim, // z for 3D int axis_dim) @@ -49,8 +57,8 @@ extern "C" __global__ void delx_mid(const DTYPE *__restrict__ input, // reinterpret to avoid compiler warning that // constructor of complex() cannot be called if it's // shared memory - polluting the outputs - __shared__ char shr[BDIM_X * BDIM_Y * sizeof(DTYPE)]; - auto shared_data = reinterpret_cast(shr); + __shared__ char shr[BDIM_X * BDIM_Y * sizeof(IN_TYPE)]; + auto shared_data = reinterpret_cast(shr); unsigned int tx = threadIdx.x; unsigned int ty = threadIdx.y; @@ -82,7 +90,7 @@ extern "C" __global__ void delx_mid(const DTYPE *__restrict__ input, { if (IS_FORWARD) { - DTYPE plus1; + IN_TYPE plus1; if (ty < BDIM_Y - 1 && iy < axis_dim - 1) // we have a next element in shared data { @@ -100,7 +108,7 @@ extern "C" __global__ void delx_mid(const DTYPE *__restrict__ input, } else { - DTYPE minus1; + IN_TYPE minus1; if (ty > 0) // we have a previous element in shared { minus1 = shared_data[(ty - 1) * BDIM_X + tx]; @@ -118,3 +126,85 @@ extern "C" __global__ void delx_mid(const DTYPE *__restrict__ input, } } } + + + +/** This is the special case for when we diff along the last axis. + * + * Here, flat_dim is all other dims multiplied together, and axis_dim + * is the dimension along which we diff. + * To ensure that we stay coalesced (compared to delx_mid), + * we use the x index to iterate within each thread block (the loop). + * Otherwise it follows the same ideas as delx_mid - please read the + * description there. 
+ */
+extern "C" __global__ void delx_last(const IN_TYPE *__restrict__ input,
+                                     OUT_TYPE *output,
+                                     int flat_dim,
+                                     int axis_dim)
+{
+  // reinterpret to avoid constructor of complex() + compiler warning
+  __shared__ char shr[BDIM_X * BDIM_Y * sizeof(IN_TYPE)];
+  auto shared_data = reinterpret_cast<IN_TYPE*>(shr);
+
+  unsigned int tx = threadIdx.x;
+  unsigned int ty = threadIdx.y;
+
+  unsigned int ix = tx;
+  unsigned int iy = ty + blockIdx.x * BDIM_Y;  // we always use x in grid
+
+  int stride_y = axis_dim;
+
+  auto maxblocks = (axis_dim + BDIM_X - 1) / BDIM_X;
+  for (int bidx = 0; bidx < maxblocks; ++bidx)
+  {
+    ix = tx + bidx * BDIM_X;
+
+    if (iy < flat_dim && ix < axis_dim)
+    {
+      shared_data[ty * BDIM_X + tx] = input[iy * stride_y + ix];
+    }
+
+    __syncthreads();
+
+    if (iy < flat_dim && ix < axis_dim)
+    {
+      if (IS_FORWARD)
+      {
+        IN_TYPE plus1;
+        if (tx < BDIM_X - 1 &&
+            ix < axis_dim - 1)  // we have a next element in shared data
+        {
+          plus1 = shared_data[ty * BDIM_X + tx + 1];
+        }
+        else if (ix == axis_dim - 1)  // end of axis - same as current to get 0
+        {
+          plus1 = shared_data[ty * BDIM_X + tx];
+        }
+        else  // end of block, but more input is there
+        {
+          plus1 = input[iy * stride_y + ix + 1];
+        }
+
+        output[iy * stride_y + ix] = plus1 - shared_data[ty * BDIM_X + tx];
+      }
+      else
+      {
+        IN_TYPE minus1;
+        if (tx > 0)  // we have a previous element in shared
+        {
+          minus1 = shared_data[ty * BDIM_X + tx - 1];
+        }
+        else if (ix == 0)  // use same as next to get zero
+        {
+          minus1 = shared_data[ty * BDIM_X + tx];
+        }
+        else  // read previous input (ty == 0 but iy > 0)
+        {
+          minus1 = input[iy * stride_y + ix - 1];
+        }
+        output[iy * stride_y + ix] = shared_data[ty * BDIM_X + tx] - minus1;
+      }
+    }
+  }
+}
\ No newline at end of file
diff --git a/ptypy/accelerate/cuda_pycuda/cuda/delx_last.cu b/ptypy/accelerate/cuda_pycuda/cuda/delx_last.cu
deleted file mode 100644
index c4449f19a..000000000
--- a/ptypy/accelerate/cuda_pycuda/cuda/delx_last.cu
+++ /dev/null
@@ -1,82 +0,0 @@
-#include <thrust/complex.h>
-using thrust::complex;
-
-/** This is the special case for when we diff along the last axis.
- *
- * Here, flat_dim is all other dims multiplied together, and axis_dim
- * is the dimension along which we diff.
- * To ensure that we stay coalesced (compared to delx_mid),
- * we use the x index to iterate within each thread block (the loop).
- * Otherwise it follows the same ideas as delx_mid - please read the
- * description there.
- */ -extern "C" __global__ void delx_last(const DTYPE *__restrict__ input, - DTYPE *output, - int flat_dim, - int axis_dim) -{ - // reinterpret to avoid constructor of complex() + compiler warning - __shared__ char shr[BDIM_X * BDIM_Y * sizeof(DTYPE)]; - auto shared_data = reinterpret_cast(shr); - - unsigned int tx = threadIdx.x; - unsigned int ty = threadIdx.y; - - unsigned int ix = tx; - unsigned int iy = ty + blockIdx.x * BDIM_Y; // we always use x in grid - - int stride_y = axis_dim; - - auto maxblocks = (axis_dim + BDIM_X - 1) / BDIM_X; - for (int bidx = 0; bidx < maxblocks; ++bidx) - { - ix = tx + bidx * BDIM_X; - - if (iy < flat_dim && ix < axis_dim) - { - shared_data[ty * BDIM_X + tx] = input[iy * stride_y + ix]; - } - - __syncthreads(); - - if (iy < flat_dim && ix < axis_dim) - { - if (IS_FORWARD) - { - DTYPE plus1; - if (tx < BDIM_X - 1 && - ix < axis_dim - 1) // we have a next element in shared data - { - plus1 = shared_data[ty * BDIM_X + tx + 1]; - } - else if (ix == axis_dim - 1) // end of axis - same as current to get 0 - { - plus1 = shared_data[ty * BDIM_X + tx]; - } - else // end of block, but nore input is there - { - plus1 = input[iy * stride_y + ix + 1]; - } - - output[iy * stride_y + ix] = plus1 - shared_data[ty * BDIM_X + tx]; - } - else - { - DTYPE minus1; - if (tx > 0) // we have a previous element in shared - { - minus1 = shared_data[ty * BDIM_X + tx - 1]; - } - else if (ix == 0) // use same as next to get zero - { - minus1 = shared_data[ty * BDIM_X + tx]; - } - else // read previous input (ty == 0 but iy > 0) - { - minus1 = input[iy * stride_y + ix - 1]; - } - output[iy * stride_y + ix] = shared_data[ty * BDIM_X + tx] - minus1; - } - } - } -} \ No newline at end of file diff --git a/ptypy/accelerate/cuda_pycuda/cuda/dot.cu b/ptypy/accelerate/cuda_pycuda/cuda/dot.cu index 1f53b0d0c..21087abe3 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/dot.cu +++ b/ptypy/accelerate/cuda_pycuda/cuda/dot.cu @@ -15,15 +15,15 @@ __device__ inline T dotmul(const complex& a, const complex& b) return a.real() * b.real() + a.imag() * b.imag(); } -extern "C" __global__ void dot(const INTYPE* a, - const INTYPE* b, +extern "C" __global__ void dot(const IN_TYPE* a, + const IN_TYPE* b, int size, - ACCTYPE* out) + ACC_TYPE* out) { int tx = threadIdx.x; int ix = tx + blockIdx.x * blockDim.x; - __shared__ ACCTYPE sh[1024]; + __shared__ ACC_TYPE sh[1024]; if (ix < size) { @@ -31,7 +31,7 @@ extern "C" __global__ void dot(const INTYPE* a, } else { - sh[tx] = ACCTYPE(0); + sh[tx] = ACC_TYPE(0); } __syncthreads(); diff --git a/ptypy/accelerate/cuda_pycuda/cuda/error_reduce.cu b/ptypy/accelerate/cuda_pycuda/cuda/error_reduce.cu index 177732e9b..91b5357b4 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/error_reduce.cu +++ b/ptypy/accelerate/cuda_pycuda/cuda/error_reduce.cu @@ -1,17 +1,24 @@ - -extern "C" __global__ void error_reduce(const float* ferr, - float* err_fmag, +/** error_reduce kernel. 
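+ * Reduces the per-pixel error map over the last two axes, yielding one error
+ * value per diffraction frame (one thread block per frame).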
+ * + * Data types: + * - IN_TYPE: the data type for the inputs (float or double) + * - OUT_TYPE: the data type for the outputs (float or double) + * - ACC_TYPE: the data type used for computation + */ + +extern "C" __global__ void error_reduce(const IN_TYPE* ferr, + OUT_TYPE* err_fmag, int M, int N) { int tx = threadIdx.x; int ty = threadIdx.y; int batch = blockIdx.x; - extern __shared__ float sum_v[1024]; + __shared__ ACC_TYPE sum_v[BDIM_X*BDIM_Y]; int shidx = ty * blockDim.x + tx; // shidx: index in shared memory for this block - float sum = 0.0f; + ACC_TYPE sum = ACC_TYPE(0.0); for (int m = ty; m < M; m += blockDim.y) { @@ -20,7 +27,7 @@ extern "C" __global__ void error_reduce(const float* ferr, { int idx = batch * M * N + m * N + n; // idx is index qwith respect to the full stack - sum += ferr[idx]; + sum += ACC_TYPE(ferr[idx]); } } @@ -28,7 +35,7 @@ extern "C" __global__ void error_reduce(const float* ferr, __syncthreads(); - int nt = blockDim.x * blockDim.y; + int nt = BDIM_X * BDIM_Y; int c = nt; while (c > 1) @@ -44,6 +51,6 @@ extern "C" __global__ void error_reduce(const float* ferr, if (shidx == 0) { - err_fmag[batch] = float(sum_v[0]); + err_fmag[batch] = OUT_TYPE(sum_v[0]); } } diff --git a/ptypy/accelerate/cuda_pycuda/cuda/exit_error.cu b/ptypy/accelerate/cuda_pycuda/cuda/exit_error.cu index d4f774319..fdac52e46 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/exit_error.cu +++ b/ptypy/accelerate/cuda_pycuda/cuda/exit_error.cu @@ -11,16 +11,16 @@ using thrust::complex; // (guided by profiler) extern "C" __global__ void __launch_bounds__(1024, 2) exit_error(int nmodes, - complex *aux, - float *ferr, - const int *addr, + const complex * __restrict aux, + OUT_TYPE *ferr, + const int * __restrict addr, int A, int B) { int tx = threadIdx.x; int ty = threadIdx.y; int addr_stride = 15; - float denom = A * B; + MATH_TYPE denom = A * B; const int *ea = addr + 6 + (blockIdx.x * nmodes) * addr_stride; const int *da = addr + 9 + (blockIdx.x * nmodes) * addr_stride; @@ -32,15 +32,16 @@ extern "C" __global__ void __launch_bounds__(1024, 2) { for (int b = tx; b < B; b += blockDim.x) { - float acc = 0.0; + MATH_TYPE acc = 0.0; for (int idx = 0; idx < nmodes; ++idx) { - float abs_exit_wave = abs(aux[a * B + b + idx * A * B]); + complex t_aux = aux[a * B + b + idx * A * B]; + MATH_TYPE abs_exit_wave = abs(t_aux); acc += abs_exit_wave * abs_exit_wave; // if we do this manually (real*real +imag*imag) // we get differences to numpy due to rounding } - ferr[a * B + b] = acc / denom; + ferr[a * B + b] = OUT_TYPE(acc / denom); } } } diff --git a/ptypy/accelerate/cuda_pycuda/cuda/fill3D.cu b/ptypy/accelerate/cuda_pycuda/cuda/fill3D.cu new file mode 100644 index 000000000..c3f03d8ca --- /dev/null +++ b/ptypy/accelerate/cuda_pycuda/cuda/fill3D.cu @@ -0,0 +1,60 @@ +/** fill3D kernel. 
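+ * Copies a block of size (len_z, len_y, len_x) from B (at offset Bo) into A
+ * (at offset Ao); blockIdx.z covers any leading batch dimensions.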
+ * + * Data types: + * - IN_TYPE: the data type for the inputs + * - OUT_TYPE: data type for outputs + */ + +#include +#include +using thrust::complex; + +extern "C" __global__ void fill3D( + OUT_TYPE* A, + const IN_TYPE* B, + // final dimensions of A/B in [z, y, x] + int A_Z, + int A_Y, + int A_X, + int B_Z, + int B_Y, + int B_X, + // offsets to start reading/writing + int Ao_z, + int Ao_y, + int Ao_x, + int Bo_z, + int Bo_y, + int Bo_x, + // lengths to copy + int len_z, + int len_y, + int len_x + ) +{ + // We use the following strategy: + // - BlockIdx.z for the batch (first dims combined if 4D+) + // - blockDim.z = 1 + // - multiple blocks are used across y and x dimensions + // - we loop over z dimension within the thread block + int batch = blockIdx.z; + int ix = threadIdx.x + blockIdx.x * blockDim.x; + int iy = threadIdx.y + blockIdx.y * blockDim.y; + + if (ix >= len_x || iy >= len_y) + return; + + // offset for current batch (4D+ dimension) + A += batch * A_X * A_Y * A_Z; + B += batch * B_X * B_Y * B_Z; + + // offset for start position in each dimension of the last 3 + A += Ao_z * A_Y * A_X + Ao_y * A_X + Ao_x; + B += Bo_z * B_Y * B_X + Bo_y * B_X + Bo_x; + + // copy data + for (int iz = 0; iz < len_z; ++iz) { + A[iz * A_Y * A_X + iy * A_X + ix] = + B[iz * B_Y * B_X + iy * B_X + ix]; + } +} \ No newline at end of file diff --git a/ptypy/accelerate/cuda_pycuda/cuda/fill_b.cu b/ptypy/accelerate/cuda_pycuda/cuda/fill_b.cu index cfdffb911..46d0d09f1 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/fill_b.cu +++ b/ptypy/accelerate/cuda_pycuda/cuda/fill_b.cu @@ -1,29 +1,84 @@ -extern "C" __global__ void fill_b(const FTYPE* A0, - const FTYPE* A1, - const FTYPE* A2, - const FTYPE* w, - FTYPE Brenorm, +/** fill_b kernels. + * Data types: + * - IN_TYPE: the data type for the inputs + * - OUT_TYPE: the data type for the outputs + * - MATH_TYPE: the data type used for computation + * - ACC_TYPE: the accumulator type for summing + */ + +extern "C" __global__ void fill_b(const IN_TYPE* A0, + const IN_TYPE* A1, + const IN_TYPE* A2, + const IN_TYPE* w, + IN_TYPE Brenorm, int size, - double* out) + ACC_TYPE* out) { int tx = threadIdx.x; int ix = tx + blockIdx.x * blockDim.x; - __shared__ double smem[3][BDIM_X]; + __shared__ ACC_TYPE smem[3][BDIM_X]; if (ix < size) { - // FTYPE(2) to make sure it's float in single precision and doesn't + // MATHTYPE(2) to make sure it's float in single precision and doesn't // accidentally promote the equation to double - smem[0][tx] = w[ix] * A0[ix] * A0[ix]; - smem[1][tx] = w[ix] * FTYPE(2) * A0[ix] * A1[ix]; - smem[2][tx] = w[ix] * (A1[ix] * A1[ix] + FTYPE(2) * A0[ix] * A2[ix]); + MATH_TYPE t_a0 = A0[ix]; + MATH_TYPE t_a1 = A1[ix]; + MATH_TYPE t_a2 = A2[ix]; + MATH_TYPE t_w = w[ix]; + smem[0][tx] = t_w * t_a0 * t_a0; + smem[1][tx] = t_w * MATH_TYPE(2) * t_a0 * t_a1; + smem[2][tx] = t_w * (t_a1 * t_a1 + MATH_TYPE(2) * t_a0 * t_a2); } else { - smem[0][tx] = FTYPE(0); - smem[1][tx] = FTYPE(0); - smem[2][tx] = FTYPE(0); + smem[0][tx] = ACC_TYPE(0); + smem[1][tx] = ACC_TYPE(0); + smem[2][tx] = ACC_TYPE(0); + } + __syncthreads(); + + int nt = blockDim.x; + int c = nt; + while (c > 1) + { + int half = c / 2; + if (tx < half) + { + smem[0][tx] += smem[0][c - tx - 1]; + smem[1][tx] += smem[1][c - tx - 1]; + smem[2][tx] += smem[2][c - tx - 1]; + } + __syncthreads(); + c = c - half; + } + + if (tx == 0) + { + out[blockIdx.x * 3 + 0] = MATH_TYPE(smem[0][0]) * MATH_TYPE(Brenorm); + out[blockIdx.x * 3 + 1] = MATH_TYPE(smem[1][0]) * MATH_TYPE(Brenorm); + out[blockIdx.x * 3 + 2] = 
MATH_TYPE(smem[2][0]) * MATH_TYPE(Brenorm);
+  }
+}
+
+extern "C" __global__ void fill_b_reduce(const ACC_TYPE* in, OUT_TYPE* B, int blocks)
+{
+  // always a single thread block for 2nd stage
+  assert(gridDim.x == 1);
+  int tx = threadIdx.x;
+
+  __shared__ ACC_TYPE smem[3][BDIM_X];
+
+  double sum0 = 0.0, sum1 = 0.0, sum2 = 0.0;
+  for (int ix = tx; ix < blocks; ix += blockDim.x)
+  {
+    sum0 += in[ix * 3 + 0];
+    sum1 += in[ix * 3 + 1];
+    sum2 += in[ix * 3 + 2];
   }
+  smem[0][tx] = sum0;
+  smem[1][tx] = sum1;
+  smem[2][tx] = sum2;
   __syncthreads();

   int nt = blockDim.x;
@@ -43,8 +98,8 @@ extern "C" __global__ void fill_b(const FTYPE* A0,

   if (tx == 0)
   {
-    out[blockIdx.x * 3 + 0] = smem[0][0] * double(Brenorm);
-    out[blockIdx.x * 3 + 1] = smem[1][0] * double(Brenorm);
-    out[blockIdx.x * 3 + 2] = smem[2][0] * double(Brenorm);
+    B[0] += OUT_TYPE(smem[0][0]);
+    B[1] += OUT_TYPE(smem[1][0]);
+    B[2] += OUT_TYPE(smem[2][0]);
   }
-}
\ No newline at end of file
+}
diff --git a/ptypy/accelerate/cuda_pycuda/cuda/fill_b_reduce.cu b/ptypy/accelerate/cuda_pycuda/cuda/fill_b_reduce.cu
deleted file mode 100644
index c37d494d8..000000000
--- a/ptypy/accelerate/cuda_pycuda/cuda/fill_b_reduce.cu
+++ /dev/null
@@ -1,44 +0,0 @@
-#include <cassert>
-
-extern "C" __global__ void fill_b_reduce(const double* in, FTYPE* B, int blocks)
-{
-  // always a single thread block for 2nd stage
-  assert(gridDim.x == 1);
-  int tx = threadIdx.x;
-
-  __shared__ double smem[3][BDIM_X];
-
-  double sum0 = 0.0, sum1 = 0.0, sum2 = 0.0;
-  for (int ix = tx; ix < blocks; ix += blockDim.x)
-  {
-    sum0 += in[ix * 3 + 0];
-    sum1 += in[ix * 3 + 1];
-    sum2 += in[ix * 3 + 2];
-  }
-  smem[0][tx] = sum0;
-  smem[1][tx] = sum1;
-  smem[2][tx] = sum2;
-  __syncthreads();
-
-  int nt = blockDim.x;
-  int c = nt;
-  while (c > 1)
-  {
-    int half = c / 2;
-    if (tx < half)
-    {
-      smem[0][tx] += smem[0][c - tx - 1];
-      smem[1][tx] += smem[1][c - tx - 1];
-      smem[2][tx] += smem[2][c - tx - 1];
-    }
-    __syncthreads();
-    c = c - half;
-  }
-
-  if (tx == 0)
-  {
-    B[0] += FTYPE(smem[0][0]);
-    B[1] += FTYPE(smem[1][0]);
-    B[2] += FTYPE(smem[2][0]);
-  }
-}
diff --git a/ptypy/accelerate/cuda_pycuda/cuda/filtered_fft/filtered_fft.cu b/ptypy/accelerate/cuda_pycuda/cuda/filtered_fft/filtered_fft.cu
index bb152466a..586d7f356 100644
--- a/ptypy/accelerate/cuda_pycuda/cuda/filtered_fft/filtered_fft.cu
+++ b/ptypy/accelerate/cuda_pycuda/cuda/filtered_fft/filtered_fft.cu
@@ -29,18 +29,6 @@
 #include <cufft.h>
 #include <cufftXt.h>

-#ifndef MY_FFT_ROWS
-# define MY_FFT_ROWS 128
-# pragma GCC warning "MY_FFT_ROWS not set in preprocessor - defaulting to 128"
-#endif
-
-#ifndef MY_FFT_COLS
-# define MY_FFT_COLS 128
-# pragma GCC warning "MY_FFT_COLS not set in preprocessor - defaulting to 128"
-#endif
-
-
 template <int ROWS, int COLS, bool SYMMETRIC, bool FORWARD>
 class FilteredFFTImpl : public FilteredFFT {
 public:
@@ -274,9 +262,37 @@ void FilteredFFTImpl<ROWS, COLS, SYMMETRIC, FORWARD>::setupPlan() {
   }
 }

+template <bool SYMMETRIC, bool FORWARD>
+static FilteredFFT* make(int batches, int rows, int cols, complex<float>* prefilt, complex<float>* postfilt,
+  cudaStream_t stream)
+{
+  // we only support square shapes with equal numbers of rows and columns,
+  // as powers of 2 from 16x16 to 2048x2048
+  if (rows != cols)
+    throw std::runtime_error("Only equal numbers of rows and columns are supported");
+  switch (rows)
+  {
+    case 16: return new FilteredFFTImpl<16, 16, SYMMETRIC, FORWARD>(batches, prefilt, postfilt, stream);
+    case 32: return new FilteredFFTImpl<32, 32, SYMMETRIC, FORWARD>(batches, prefilt, postfilt, stream);
+    case 64: return new FilteredFFTImpl<64, 64, SYMMETRIC, FORWARD>(batches, prefilt, postfilt, stream);
+    case 128: return new FilteredFFTImpl<128, 128, SYMMETRIC,
FORWARD>(batches, prefilt, postfilt, stream); + case 256: return new FilteredFFTImpl<256, 256, SYMMETRIC, FORWARD>(batches, prefilt, postfilt, stream); + case 512: return new FilteredFFTImpl<512, 512, SYMMETRIC, FORWARD>(batches, prefilt, postfilt, stream); + case 1024: return new FilteredFFTImpl<1024, 1024, SYMMETRIC, FORWARD>(batches, prefilt, postfilt, stream); + case 2048: return new FilteredFFTImpl<2048, 2048, SYMMETRIC, FORWARD>(batches, prefilt, postfilt, stream); + default: throw std::runtime_error("Only powers of 2 from 16 to 2048 are supported"); + } +} + //////////// Factory Functions for Python -FilteredFFT* make_filtered(int batches, bool symmetricScaling, +// Note: This will instantiate templates for 8 powers of 2, with 4 combinations of forward/reverse, symmetric/not, +// i.e. 32 different FFTs into the binary. Compile time might be quite long, but we intend to do this once +// during installation + +FilteredFFT* make_filtered( + int batches, + int rows, int cols, + bool symmetricScaling, bool isForward, complex* prefilt, complex* postfilt, cudaStream_t stream) @@ -284,21 +300,17 @@ FilteredFFT* make_filtered(int batches, bool symmetricScaling, if (symmetricScaling) { if (isForward) { - return new FilteredFFTImpl(batches, - prefilt, postfilt, stream); + return make(batches, rows, cols, prefilt, postfilt, stream); } else { - return new FilteredFFTImpl(batches, - prefilt, postfilt, stream); + return make(batches, rows, cols, prefilt, postfilt, stream); } } else { if (isForward) { - return new FilteredFFTImpl(batches, - prefilt, postfilt, stream); + return make(batches, rows, cols, prefilt, postfilt, stream); } else { - return new FilteredFFTImpl(batches, - prefilt, postfilt, stream); + return make(batches, rows, cols, prefilt, postfilt, stream); } } diff --git a/ptypy/accelerate/cuda_pycuda/cuda/filtered_fft/filtered_fft.h b/ptypy/accelerate/cuda_pycuda/cuda/filtered_fft/filtered_fft.h index fd153f768..9afa4e119 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/filtered_fft/filtered_fft.h +++ b/ptypy/accelerate/cuda_pycuda/cuda/filtered_fft/filtered_fft.h @@ -23,6 +23,7 @@ class FilteredFFT { // Note that cudaStream_t (runtime API) and CUStream (driver API) are // the same type FilteredFFT* make_filtered(int batches, + int rows, int columns, bool symmetricScaling, bool isForward, complex* prefilt, complex* postfilt, diff --git a/ptypy/accelerate/cuda_pycuda/cuda/filtered_fft/module.cpp b/ptypy/accelerate/cuda_pycuda/cuda/filtered_fft/module.cpp index 186d40cb2..3eb0eb37e 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/filtered_fft/module.cpp +++ b/ptypy/accelerate/cuda_pycuda/cuda/filtered_fft/module.cpp @@ -13,7 +13,7 @@ class FilteredFFTPython { public: - FilteredFFTPython(int batches, bool symmetric, + FilteredFFTPython(int batches, int rows, int columns, bool symmetric, bool is_forward, std::size_t prefilt_ptr, std::size_t postfilt_ptr, @@ -21,6 +21,7 @@ class FilteredFFTPython { fft_ = make_filtered( batches, + rows, columns, symmetric, is_forward, reinterpret_cast*>(prefilt_ptr), @@ -70,12 +71,14 @@ class FilteredFFTPython namespace py = pybind11; -PYBIND11_MODULE(module, m) { +PYBIND11_MODULE(filtered_cufft, m) { m.doc() = "Filtered FFT for PtyPy"; py::class_(m, "FilteredFFT", py::module_local()) - .def(py::init(), + .def(py::init(), py::arg("batches"), + py::arg("rows"), + py::arg("columns"), py::arg("symmetricScaling"), py::arg("is_forward"), py::arg("prefilt"), diff --git a/ptypy/accelerate/cuda_pycuda/cuda/fmag_all_update.cu 
b/ptypy/accelerate/cuda_pycuda/cuda/fmag_all_update.cu index 7d7a512a7..f8f695ca5 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/fmag_all_update.cu +++ b/ptypy/accelerate/cuda_pycuda/cuda/fmag_all_update.cu @@ -1,15 +1,23 @@ +/** fmag_all_update. + * + * Data types: + * - IN_TYPE: the data type for the inputs (float or double) + * - OUT_TYPE: the data type for the outputs (float or double) + * - MATH_TYPE: the data type used for computation + */ + #include #include using std::sqrt; using thrust::complex; -extern "C" __global__ void fmag_all_update(complex* f, - const float* fmask, - const float* fmag, - const float* fdev, - const float* err_fmag, +extern "C" __global__ void fmag_all_update(complex* f, + const IN_TYPE* fmask, + const IN_TYPE* fmag, + const IN_TYPE* fdev, + const IN_TYPE* err_fmag, const int* addr_info, - float pbound, + IN_TYPE pbound_, int A, int B) { @@ -17,23 +25,24 @@ extern "C" __global__ void fmag_all_update(complex* f, int tx = threadIdx.x; int ty = threadIdx.y; int addr_stride = 15; + MATH_TYPE pbound = pbound_; const int* ea = addr_info + batch * addr_stride + 6; const int* da = addr_info + batch * addr_stride + 9; const int* ma = addr_info + batch * addr_stride + 12; fmask += ma[0] * A * B; - float err = err_fmag[da[0]]; + MATH_TYPE err = err_fmag[da[0]]; fdev += da[0] * A * B; fmag += da[0] * A * B; f += ea[0] * A * B; - float renorm = sqrt(pbound / err); + MATH_TYPE renorm = sqrt(pbound / err); for (int a = ty; a < A; a += blockDim.y) { for (int b = tx; b < B; b += blockDim.x) { - float m = fmask[a * A + b]; + MATH_TYPE m = fmask[a * A + b]; if (renorm < 1.0f) { /* @@ -42,10 +51,10 @@ extern "C" __global__ void fmag_all_update(complex* f, ((fmag[a * A + b] + fdev[a * A + b] * renorm) / (fdev[a * A + b] + fmag[a * A + b] + 1e-7f)) ; */ - auto fmagv = fmag[a * A + b]; - auto fdevv = fdev[a * A + b]; - float fm = (1.0f - m) + - m * ((fmagv + fdevv * renorm) / (fmagv + fdevv + 1e-7f)); + MATH_TYPE fmagv = fmag[a * A + b]; + MATH_TYPE fdevv = fdev[a * A + b]; + MATH_TYPE fm = (MATH_TYPE(1) - m) + + m * ((fmagv + fdevv * renorm) / (fmagv + fdevv + MATH_TYPE(1e-7))); f[a * A + b] *= fm; } } diff --git a/ptypy/accelerate/cuda_pycuda/cuda/fmag_update_nopbound.cu b/ptypy/accelerate/cuda_pycuda/cuda/fmag_update_nopbound.cu new file mode 100644 index 000000000..40a65c172 --- /dev/null +++ b/ptypy/accelerate/cuda_pycuda/cuda/fmag_update_nopbound.cu @@ -0,0 +1,53 @@ +/** fmag_all_update_nopbound. + * + * Data types: + * - IN_TYPE: the data type for the inputs (float or double) + * - OUT_TYPE: the data type for the outputs (float or double) + * - MATH_TYPE: the data type used for computation + */ + +#include +#include +using std::sqrt; +using thrust::complex; + +extern "C" __global__ void fmag_update_nopbound(complex* f, + const IN_TYPE* fmask, + const IN_TYPE* fmag, + const IN_TYPE* fdev, + const int* addr_info, + int A, + int B) +{ + const int bid = blockIdx.z; + const int tx = threadIdx.x; + const int a = threadIdx.y + blockIdx.y * blockDim.y; + if (a >= A) + return; + int addr_stride = 15; + + const int* ea = addr_info + bid * addr_stride + 6; + const int* da = addr_info + bid * addr_stride + 9; + const int* ma = addr_info + bid * addr_stride + 12; + + fmask += ma[0] * A * B; + fdev += da[0] * A * B; + fmag += da[0] * A * B; + f += ea[0] * A * B; + + for (int b = tx; b < B; b += blockDim.x) + { + MATH_TYPE m = fmask[a * A + b]; + /* + // assuming this is actually a mask, i.e. 0 or 1 --> this is slower + float fm = m < 0.5f ? 
1.0f : + ((fmag[a * A + b] + fdev[a * A + b] * renorm) / (fdev[a * A + b] + + fmag[a * A + b] + 1e-7f)) ; + */ + MATH_TYPE fmagv = fmag[a * A + b]; + MATH_TYPE fdevv = fdev[a * A + b]; + MATH_TYPE fm = + (MATH_TYPE(1) - m) + m * (fmagv / (fmagv + fdevv + MATH_TYPE(1e-7))); + f[a * A + b] *= fm; + } +} diff --git a/ptypy/accelerate/cuda_pycuda/cuda/fourier_deviation.cu b/ptypy/accelerate/cuda_pycuda/cuda/fourier_deviation.cu new file mode 100644 index 000000000..3427222c3 --- /dev/null +++ b/ptypy/accelerate/cuda_pycuda/cuda/fourier_deviation.cu @@ -0,0 +1,58 @@ +/** fourier_deviation. + * + * Data types: + * - IN_TYPE: the data type for the inputs (float or double) + * - OUT_TYPE: the data type for the outputs (float or double) + * - MATH_TYPE: the data type used for computation + */ + +#include +#include +#include +using std::sqrt; +using thrust::abs; +using thrust::complex; + +// specify max number of threads/block and min number of blocks per SM, +// to assist the compiler in register optimisations. +// We achieve a higher occupancy in this case, as less registers are used +// (guided by profiler) +extern "C" __global__ void __launch_bounds__(1024, 2) + fourier_deviation(int nmodes, + const complex *f, + const IN_TYPE *fmag, + OUT_TYPE *fdev, + const int *addr, + int A, + int B) +{ + const int bid = blockIdx.z; + const int tx = threadIdx.x; + const int a = threadIdx.y + blockIdx.y * blockDim.y; + const int addr_stride = 15; + + const int *ea = addr + 6 + (bid * nmodes) * addr_stride; + const int *da = addr + 9 + (bid * nmodes) * addr_stride; + + f += ea[0] * A * B; + fdev += da[0] * A * B; + fmag += da[0] * A * B; + + if (a >= A) + return; + + for (int b = tx; b < B; b += blockDim.x) + { + MATH_TYPE acc = MATH_TYPE(0); + for (int idx = 0; idx < nmodes; ++idx) + { + complex t_f = f[a * B + b + idx * A * B]; + MATH_TYPE abs_exit_wave = abs(t_f); + acc += abs_exit_wave * + abs_exit_wave; // if we do this manually (real*real +imag*imag) + // we get differences to numpy due to rounding + } + auto fdevv = sqrt(acc) - MATH_TYPE(fmag[a * B + b]); + fdev[a * B + b] = fdevv; + } +} diff --git a/ptypy/accelerate/cuda_pycuda/cuda/fourier_error.cu b/ptypy/accelerate/cuda_pycuda/cuda/fourier_error.cu index 7998e094c..ad483c870 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/fourier_error.cu +++ b/ptypy/accelerate/cuda_pycuda/cuda/fourier_error.cu @@ -1,3 +1,12 @@ +/** fourier_error. 
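For reference while reviewing the precision changes: the fourier_deviation kernel above and the fourier_error kernel whose header starts here compute, per frame, the deviation of the summed mode intensities from the measured magnitudes and the masked, normalised error. A NumPy sketch of that math (the shapes and the one-mask-per-frame mask_sum indexing are simplifying assumptions, not taken from the patch):

import numpy as np

def fourier_error_ref(f, fmag, fmask, mask_sum, nmodes):
    # f: (N * nmodes, A, B) exit waves; fmag/fmask: (N, A, B)
    af = np.sqrt((np.abs(f) ** 2)
                 .reshape(-1, nmodes, f.shape[-2], f.shape[-1]).sum(1))
    fdev = af - fmag                                    # fourier_deviation
    ferr = fmask * fdev ** 2 / mask_sum[:, None, None]  # fourier_error
    return fdev, ferr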
+ * + * Data types: + * - IN_TYPE: the data type for the inputs (float or double) + * - OUT_TYPE: the data type for the outputs (float or double) + * - MATH_TYPE: the data type used for computation + */ + + #include #include #include @@ -11,12 +20,12 @@ using thrust::complex; // (guided by profiler) extern "C" __global__ void __launch_bounds__(1024, 2) fourier_error(int nmodes, - complex *f, - const float *fmask, - const float *fmag, - float *fdev, - float *ferr, - const float *mask_sum, + const complex *f, + const IN_TYPE *fmask, + const IN_TYPE *fmag, + OUT_TYPE *fdev, + OUT_TYPE *ferr, + const IN_TYPE *mask_sum, const int *addr, int A, int B) @@ -39,15 +48,16 @@ extern "C" __global__ void __launch_bounds__(1024, 2) { for (int b = tx; b < B; b += blockDim.x) { - float acc = 0.0; + MATH_TYPE acc = MATH_TYPE(0); for (int idx = 0; idx < nmodes; ++idx) { - float abs_exit_wave = abs(f[a * B + b + idx * A * B]); + complex t_f = f[a * B + b + idx * A * B]; + MATH_TYPE abs_exit_wave = abs(t_f); acc += abs_exit_wave * abs_exit_wave; // if we do this manually (real*real +imag*imag) // we get differences to numpy due to rounding } - auto fdevv = sqrt(acc) - fmag[a * B + b]; + auto fdevv = sqrt(acc) - MATH_TYPE(fmag[a * B + b]); ferr[a * B + b] = (fmask[a * B + b] * fdevv * fdevv) / mask_sum[ma[0]]; fdev[a * B + b] = fdevv; } diff --git a/ptypy/accelerate/cuda_pycuda/cuda/full_reduce.cu b/ptypy/accelerate/cuda_pycuda/cuda/full_reduce.cu index 3fe6ac8a5..801204aaa 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/full_reduce.cu +++ b/ptypy/accelerate/cuda_pycuda/cuda/full_reduce.cu @@ -1,16 +1,25 @@ +/** full_reduce kernel. + * + * Data types: + * - IN_TYPE: the data type for the inputs (float or double) + * - OUT_TYPE: the data type for the outputs (float or double - for aux wave) + * - ACC_TYPE: the data type used for internal accumulation + */ + + #include -extern "C" __global__ void full_reduce(const DTYPE* in, DTYPE* out, int size) +extern "C" __global__ void full_reduce(const IN_TYPE* in, OUT_TYPE* out, int size) { assert(gridDim.x == 1); int tx = threadIdx.x; - __shared__ DTYPE smem[BDIM_X]; + __shared__ ACC_TYPE smem[BDIM_X]; - auto sum = DTYPE(); + auto sum = ACC_TYPE(); for (int ix = tx; ix < size; ix += blockDim.x) { - sum = sum + in[ix]; + sum = sum + ACC_TYPE(in[ix]); } smem[tx] = sum; __syncthreads(); @@ -30,6 +39,6 @@ extern "C" __global__ void full_reduce(const DTYPE* in, DTYPE* out, int size) if (tx == 0) { - out[0] = smem[0]; + out[0] = OUT_TYPE(smem[0]); } } \ No newline at end of file diff --git a/ptypy/accelerate/cuda_pycuda/cuda/gd_main.cu b/ptypy/accelerate/cuda_pycuda/cuda/gd_main.cu index 06d73ae88..1ab643c4c 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/gd_main.cu +++ b/ptypy/accelerate/cuda_pycuda/cuda/gd_main.cu @@ -1,11 +1,19 @@ +/** gd_main kernel. 
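full_reduce above is the generic single-block reduction that finishes several of these two-stage sums. A minimal PyCuda sketch of how one of these macro-parameterised kernels gets compiled and launched; the file path, macro values and BDIM_X choice here are illustrative assumptions, not taken from this patch:

import numpy as np
import pycuda.autoinit  # noqa: F401 (creates a context)
from pycuda import gpuarray
from pycuda.compiler import SourceModule

BDIM_X = 1024  # must match the shared-memory array size in the kernel
src = open('ptypy/accelerate/cuda_pycuda/cuda/full_reduce.cu').read()
mod = SourceModule(src,
                   options=['-DIN_TYPE=float', '-DOUT_TYPE=float',
                            '-DACC_TYPE=double', '-DBDIM_X=%d' % BDIM_X],
                   no_extern_c=True)
full_reduce = mod.get_function('full_reduce')

data = gpuarray.to_gpu(np.random.rand(1 << 20).astype(np.float32))
out = gpuarray.zeros(1, np.float32)
# single thread block (the kernel asserts gridDim.x == 1): each thread
# strides over the input, then the block tree-reduces the partial sums
full_reduce(data, out, np.int32(data.size),
            block=(BDIM_X, 1, 1), grid=(1, 1, 1))
print(out.get()[0], float(data.get().sum()))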
+ * + * Data types: + * - IN_TYPE: the data type for the inputs (float or double) + * - OUT_TYPE: the data type for the outputs (float or double - for aux wave) + * - MATH_TYPE: the data type used for computation + */ + #include using thrust::complex; -extern "C" __global__ void gd_main(const FTYPE* Imodel, - const FTYPE* I, - const FTYPE* w, - FTYPE* err, - CTYPE* aux, +extern "C" __global__ void gd_main(const IN_TYPE* Imodel, + const IN_TYPE* I, + const IN_TYPE* w, + OUT_TYPE* err, + complex* aux, int z, int modes, int x) @@ -16,8 +24,8 @@ extern "C" __global__ void gd_main(const FTYPE* Imodel, if (iz >= z || ix >= x) return; - auto DI = Imodel[iz * x + ix] - I[iz * x + ix]; - auto tmp = w[iz * x + ix] * DI; + auto DI = MATH_TYPE(Imodel[iz * x + ix]) - MATH_TYPE(I[iz * x + ix]); + auto tmp = MATH_TYPE(w[iz * x + ix]) * MATH_TYPE(DI); err[iz * x + ix] = tmp * DI; // now set this for all modes (promote) diff --git a/ptypy/accelerate/cuda_pycuda/cuda/get_address.cu b/ptypy/accelerate/cuda_pycuda/cuda/get_address.cu new file mode 100644 index 000000000..dda9b45f1 --- /dev/null +++ b/ptypy/accelerate/cuda_pycuda/cuda/get_address.cu @@ -0,0 +1,35 @@ +#include +#include +using thrust::complex; + +inline __device__ int minimum(int a, int b) { return a < b ? a : b; } + +inline __device__ int maximum(int a, int b) { return a < b ? b : a; } + +extern "C" __global__ void get_address(const int* addr_current, + int* mangled_addr, + int num_pods, + const int* __restrict delta, + int max_oby, + int max_obx) +{ + // we use only one thread block + const int tx = threadIdx.x; + const int idx = tx % 2; // even threads access y dim, odd threads x dim + const int maxval = (idx == 0) ? max_oby : max_obx; + + const int addr_stride = 15; + const int d = delta[idx]; + addr_current += 3 + idx + 1; + mangled_addr += 3 + idx + 1; + + for (int ix = tx; ix < num_pods * 2; ix += blockDim.x) + { + const int bid = ix / 2; + int cur = addr_current[bid * addr_stride] + d; + int bound = maximum(0, minimum(maxval, cur)); + assert(bound >= 0); + assert(bound <= maxval); + mangled_addr[bid * addr_stride] = bound; + } +} \ No newline at end of file diff --git a/ptypy/accelerate/cuda_pycuda/cuda/intens_renorm.cu b/ptypy/accelerate/cuda_pycuda/cuda/intens_renorm.cu index 13f8551b7..d0033f7f4 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/intens_renorm.cu +++ b/ptypy/accelerate/cuda_pycuda/cuda/intens_renorm.cu @@ -1,41 +1,66 @@ +/** intens_renorm - with 2 steps as separate kernels. 
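The new get_address kernel above produces the mangled addresses for position refinement by shifting the object ROI coordinates by a trial delta and clamping them to the valid range. A NumPy reference of that logic (a sketch; the flat 15-integer-per-pod layout with the object row/column at offsets 4 and 5 follows the addr_stride arithmetic in the kernel):

import numpy as np

def get_address_ref(addr_current, delta, max_oby, max_obx):
    # addr_current: (num_pods, 15) int32 view of the address book;
    # columns 4 and 5 are the object ROI row/column the kernel shifts
    mangled = addr_current.copy()
    mangled[:, 4] = np.clip(addr_current[:, 4] + delta[0], 0, max_oby)
    mangled[:, 5] = np.clip(addr_current[:, 5] + delta[1], 0, max_obx)
    return mangled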
+ * + * Data types: + * - IN_TYPE: the data type for the inputs (float or double) + * - OUT_TYPE: the data type for the outputs (float or double) + * - MATH_TYPE: the data type used for computation + */ + #include using thrust::complex; -extern "C" __global__ void step1(const FTYPE* Imodel, - const FTYPE* I, - const FTYPE* w, - FTYPE* num, - FTYPE* den, - int z, - int x) +extern "C" __global__ void step1(const IN_TYPE* Imodel, + const IN_TYPE* I, + const IN_TYPE* w, + OUT_TYPE* num, + OUT_TYPE* den, + int n) { - int iz = blockIdx.z; - int ix = threadIdx.x + blockIdx.x * blockDim.x; + int i = threadIdx.x + blockIdx.x * blockDim.x; - if (iz >= z || ix >= x) + if (i >= n) return; - auto tmp = w[iz * x + ix] * Imodel[iz * x + ix]; - num[iz * x + ix] = tmp * I[iz * x + ix]; - den[iz * x + ix] = tmp * Imodel[iz * x + ix]; + auto tmp = MATH_TYPE(w[i]) * MATH_TYPE(Imodel[i]); + num[i] = tmp * MATH_TYPE(I[i]); + den[i] = tmp * MATH_TYPE(Imodel[i]); } -extern "C" __global__ void step2(const FTYPE* fic_tmp, - FTYPE* fic, - FTYPE* Imodel, - int z, - int x) +extern "C" __global__ void step2(const IN_TYPE* fic_tmp, + OUT_TYPE* fic, + OUT_TYPE* Imodel, + int X, + int Y) { int iz = blockIdx.z; - int ix = threadIdx.x + blockIdx.x * blockDim.x; + int tx = threadIdx.x; + int ty = threadIdx.y; + + // one thread block per fic data point - we want the first thread to read this + // into shared memory and then sync the block, so we don't get into data races + // with writing it back to global memory in the end (and we read the value only + // once) + // + __shared__ MATH_TYPE shfic[1]; + if (tx == 0 && ty == 0) { + shfic[0] = MATH_TYPE(fic[iz]) / MATH_TYPE(fic_tmp[iz]); + } + __syncthreads(); - if (iz >= z || ix >= x) - return; - //probably not so clever having all threads read from the same locations - auto tmp = fic[iz] / fic_tmp[iz]; - Imodel[iz * x + ix] *= tmp; + // now all threads can access that value + auto tmp = shfic[0]; + + // offset Imodel for current z + Imodel += iz * X * Y; + + for (int iy = ty; iy < Y; iy += blockDim.y) { + #pragma unroll(4) + for (int ix = tx; ix < X; ix += blockDim.x) { + Imodel[iy * X + ix] *= tmp; + } + } + // race condition if write is not restricted to one thread - // learned this the hard way - if (ix==0) + if (tx==0 && ty == 0) fic[iz] = tmp; -} \ No newline at end of file +} diff --git a/ptypy/accelerate/cuda_pycuda/cuda/log_likelihood.cu b/ptypy/accelerate/cuda_pycuda/cuda/log_likelihood.cu index e538dd725..90455b1e2 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/log_likelihood.cu +++ b/ptypy/accelerate/cuda_pycuda/cuda/log_likelihood.cu @@ -1,3 +1,11 @@ +/** log_likelihood kernel. 
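The intens_renorm pair above splits the per-frame intensity rescaling into an elementwise step, a reduction, and a broadcast step. In NumPy terms the intended computation is roughly the following (a sketch; it assumes the step1 outputs are summed per frame between the two launches, as the ML engine does):

import numpy as np

def intens_renorm_ref(Imodel, I, w):
    # step1: elementwise terms
    num = w * Imodel * I
    den = w * Imodel * Imodel
    # reduction between the kernels (fic <- sum(num), fic_tmp <- sum(den))
    fic = num.sum(axis=(-2, -1))
    fic_tmp = den.sum(axis=(-2, -1))
    # step2: one scaling factor per frame, applied to the whole frame
    tmp = fic / fic_tmp
    return Imodel * tmp[:, None, None], tmp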
+ *
+ * Data types:
+ * - IN_TYPE: the data type for the inputs (float or double)
+ * - OUT_TYPE: the data type for the outputs (float or double)
+ * - MATH_TYPE: the data type used for computation
+ */
+
 #include <cassert>
 #include <cmath>
 #include <thrust/complex.h>
@@ -11,13 +19,13 @@ using thrust::complex;
 // (guided by profiler)
 extern "C" __global__ void __launch_bounds__(1024, 2)
     log_likelihood(int nmodes,
-                   complex<float> *aux,
-                   const float *fmask,
-                   const float *fmag,
-                   const int *addr,
-                   float *llerr,
-                   int A,
-                   int B)
+                   complex<IN_TYPE> *aux,
+                   const IN_TYPE *fmask,
+                   const IN_TYPE *fmag,
+                   const int *addr,
+                   IN_TYPE *llerr,
+                   int A,
+                   int B)
 {
   int tx = threadIdx.x;
   int ty = threadIdx.y;
@@ -31,22 +39,69 @@
   fmag += da[0] * A * B;
   fmask += ma[0] * A * B;
   llerr += da[0] * A * B;
-  float norm = A * B;
+  MATH_TYPE norm = A * B;

   for (int a = ty; a < A; a += blockDim.y)
   {
     for (int b = tx; b < B; b += blockDim.x)
     {
-      float acc = 0.0;
+      MATH_TYPE acc = 0.0;
       for (int idx = 0; idx < nmodes; ++idx)
       {
-        float abs_exit_wave = abs(aux[a * B + b + idx * A * B]);
+        complex<MATH_TYPE> t_aux = aux[a * B + b + idx * A * B];
+        MATH_TYPE abs_exit_wave = abs(t_aux);
         acc += abs_exit_wave *
                abs_exit_wave;  // if we do this manually (real*real +imag*imag)
                                // we get differences to numpy due to rounding
       }
-      auto I = fmag[a * B + b] * fmag[a * B + b];
-      llerr[a * B + b] = fmask[a * B + b] * (acc - I) * (acc - I) / (I + 1) / norm;
+      auto I = MATH_TYPE(fmag[a * B + b]) * MATH_TYPE(fmag[a * B + b]);
+      llerr[a * B + b] =
+          MATH_TYPE(fmask[a * B + b]) * (acc - I) * (acc - I) / (I + 1) / norm;
     }
   }
 }
+
+
+extern "C" __global__ void
+    log_likelihood2(int nmodes,
+                    complex<IN_TYPE> *aux,
+                    const IN_TYPE *fmask,
+                    const IN_TYPE *fmag,
+                    const int *addr,
+                    IN_TYPE *llerr,
+                    int A,
+                    int B)
+{
+  int bid = blockIdx.z;
+  int tx = threadIdx.x;
+  int a = threadIdx.y + blockIdx.y * blockDim.y;
+  if (a >= A)
+    return;
+  int addr_stride = 15;
+
+  const int *ea = addr + 6 + (bid * nmodes) * addr_stride;
+  const int *da = addr + 9 + (bid * nmodes) * addr_stride;
+  const int *ma = addr + 12 + (bid * nmodes) * addr_stride;
+
+  aux += ea[0] * A * B;
+  fmag += da[0] * A * B;
+  fmask += ma[0] * A * B;
+  llerr += da[0] * A * B;
+  MATH_TYPE norm = A * B;
+
+  for (int b = tx; b < B; b += blockDim.x)
+  {
+    MATH_TYPE acc = 0.0;
+    for (int idx = 0; idx < nmodes; ++idx)
+    {
+      complex<MATH_TYPE> t_aux = aux[a * B + b + idx * A * B];
+      MATH_TYPE abs_exit_wave = abs(t_aux);
+      acc += abs_exit_wave *
+             abs_exit_wave;  // if we do this manually (real*real +imag*imag)
+                             // we get differences to numpy due to rounding
+    }
+    auto I = MATH_TYPE(fmag[a * B + b]) * MATH_TYPE(fmag[a * B + b]);
+    llerr[a * B + b] =
+        MATH_TYPE(fmask[a * B + b]) * (acc - I) * (acc - I) / (I + 1) / norm;
+  }
+}
\ No newline at end of file
diff --git a/ptypy/accelerate/cuda_pycuda/cuda/make_a012.cu b/ptypy/accelerate/cuda_pycuda/cuda/make_a012.cu
index e86d900f5..11ba29f62 100644
--- a/ptypy/accelerate/cuda_pycuda/cuda/make_a012.cu
+++ b/ptypy/accelerate/cuda_pycuda/cuda/make_a012.cu
@@ -1,14 +1,23 @@
+/** make_a012.
+ *
+ * Data types:
+ * - IN_TYPE: the data type for the inputs (float or double)
+ * - OUT_TYPE: the data type for the outputs (float or double)
+ * - MATH_TYPE: the data type used for computation
+ * - ACC_TYPE: data type used for accumulation
+ */
+
 #include <thrust/complex.h>
 using thrust::complex;

-extern "C" __global__ void make_a012(const CTYPE* f,
-                                     const CTYPE* a,
-                                     const CTYPE* b,
-                                     const FTYPE* I,
-                                     const FTYPE* fic,
-                                     FTYPE* A0,
-                                     FTYPE* A1,
-                                     FTYPE* A2,
+extern "C" __global__ void make_a012(const complex<IN_TYPE>* f,
+                                     const complex<IN_TYPE>* a,
+                                     const complex<IN_TYPE>* b,
+                                     const IN_TYPE* I,
+                                     const IN_TYPE* fic,
+                                     OUT_TYPE* A0,
+                                     OUT_TYPE* A1,
+                                     OUT_TYPE* A2,
                                      int z,
                                      int y,
                                      int x,
@@ -22,37 +31,37 @@

   if (iz >= maxz)
   {
-    A0[iz * x + ix] = FTYPE(0); // make sure it's the right type (double/float)
-    A1[iz * x + ix] = FTYPE(0);
-    A2[iz * x + ix] = FTYPE(0);
+    A0[iz * x + ix] = OUT_TYPE(0); // make sure it's the right type (double/float)
+    A1[iz * x + ix] = OUT_TYPE(0);
+    A2[iz * x + ix] = OUT_TYPE(0);
     return;
   }

   // we sum across y directly, as this is the number of modes,
   // which is typically small
-  auto sumtf0 = FTYPE(0);
-  auto sumtf1 = FTYPE(0);
-  auto sumtf2 = FTYPE(0);
+  auto sumtf0 = ACC_TYPE(0);
+  auto sumtf1 = ACC_TYPE(0);
+  auto sumtf2 = ACC_TYPE(0);
   for (auto iy = 0; iy < y; ++iy)
   {
-    auto fv = f[iz * y * x + iy * x + ix];
+    complex<MATH_TYPE> fv = f[iz * y * x + iy * x + ix];
     sumtf0 += fv.real() * fv.real() + fv.imag() * fv.imag();

-    auto av = a[iz * y * x + iy * x + ix];
+    complex<MATH_TYPE> av = a[iz * y * x + iy * x + ix];
     // 2 * real(f * conj(a))
-    sumtf1 += FTYPE(2) * (fv.real() * av.real() + fv.imag() * av.imag());
+    sumtf1 += MATH_TYPE(2) * (fv.real() * av.real() + fv.imag() * av.imag());
     // use MATH_TYPE(2) to make sure no double creeps into a float calculation,
     // as 2.0 * would make everything double.

-    auto bv = b[iz * y * x + iy * x + ix];
+    complex<MATH_TYPE> bv = b[iz * y * x + iy * x + ix];
     // 2 * real(f * conj(b)) + abs(a)^2
-    sumtf2 += FTYPE(2) * (fv.real() * bv.real() + fv.imag() * bv.imag()) +
+    sumtf2 += MATH_TYPE(2) * (fv.real() * bv.real() + fv.imag() * bv.imag()) +
               (av.real() * av.real() + av.imag() * av.imag());
   }

-  auto Iv = I[iz * x + ix];
-  auto ficv = fic[iz];
-  A0[iz * x + ix] = sumtf0 * ficv - Iv;
-  A1[iz * x + ix] = sumtf1 * ficv;
-  A2[iz * x + ix] = sumtf2 * ficv;
+  MATH_TYPE Iv = I[iz * x + ix];
+  MATH_TYPE ficv = fic[iz];
+  A0[iz * x + ix] = OUT_TYPE(MATH_TYPE(sumtf0) * ficv - Iv);
+  A1[iz * x + ix] = OUT_TYPE(MATH_TYPE(sumtf1) * ficv);
+  A2[iz * x + ix] = OUT_TYPE(MATH_TYPE(sumtf2) * ficv);
 }
\ No newline at end of file
diff --git a/ptypy/accelerate/cuda_pycuda/cuda/make_model.cu b/ptypy/accelerate/cuda_pycuda/cuda/make_model.cu
index 0f8380d71..22bf7d4ab 100644
--- a/ptypy/accelerate/cuda_pycuda/cuda/make_model.cu
+++ b/ptypy/accelerate/cuda_pycuda/cuda/make_model.cu
@@ -1,8 +1,16 @@
+/** make_model.
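The make_a012 kernel above builds the three coefficient arrays that fill_b then reduces for the ML line search. A NumPy reference of the same arithmetic (a sketch; shapes follow the kernel's (z, y=modes, x) indexing):

import numpy as np

def make_a012_ref(f, a, b, I, fic):
    # f, a, b: (z, modes, x) complex; I: (z, x); fic: (z,)
    A0 = (np.abs(f) ** 2).sum(1) * fic[:, None] - I
    A1 = (2 * (f.real * a.real + f.imag * a.imag)).sum(1) * fic[:, None]
    A2 = (2 * (f.real * b.real + f.imag * b.imag)
          + a.real ** 2 + a.imag ** 2).sum(1) * fic[:, None]
    return A0, A1, A2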
+ * + * Data types: + * - IN_TYPE: the data type for the inputs (float or double) + * - OUT_TYPE: the data type for the outputs (float or double) + * - MATH_TYPE: the data type used for computation + */ + #include using thrust::complex; extern "C" __global__ void make_model( - const CTYPE* in, FTYPE* out, int z, int y, int x) + const complex* in, OUT_TYPE* out, int z, int y, int x) { int ix = threadIdx.x + blockIdx.x * blockDim.x; int iz = blockIdx.z; @@ -12,11 +20,11 @@ extern "C" __global__ void make_model( // we sum accross y directly, as this is the number of modes, // which is typically small - auto sum = FTYPE(); + auto sum = MATH_TYPE(); for (auto iy = 0; iy < y; ++iy) { - auto v = in[iz * y * x + iy * x + ix]; + complex v = in[iz * y * x + iy * x + ix]; sum += v.real() * v.real() + v.imag() * v.imag(); } - out[iz * x + ix] = sum; + out[iz * x + ix] = OUT_TYPE(sum); } \ No newline at end of file diff --git a/ptypy/accelerate/cuda_pycuda/cuda/max_abs2.cu b/ptypy/accelerate/cuda_pycuda/cuda/max_abs2.cu new file mode 100644 index 000000000..4da8efb3e --- /dev/null +++ b/ptypy/accelerate/cuda_pycuda/cuda/max_abs2.cu @@ -0,0 +1,115 @@ +/** max_abs2 kernel, calculating the sum of abs(x)**2 value in the first dimension + * and then the maximum across the last 2 dimensions + * + * Data types: + * - IN_TYPE: can be float/double or complex/complex + */ + +#include +#include +using thrust::complex; +using thrust::norm; + +inline __device__ OUT_TYPE norm(const float& in) { + return in*in; +} + +inline __device__ OUT_TYPE norm(const double& in) { + return in*in; +} + +extern "C" __global__ void max_abs2_step1(const IN_TYPE* a, + int n, + int rows, + int cols, + OUT_TYPE* out) +{ + int tx = threadIdx.x; + const int iy = blockIdx.y; + + __shared__ OUT_TYPE sh[BDIM_X]; + + OUT_TYPE maxv = OUT_TYPE(0); + + for (int ix = tx; ix < cols; ix += BDIM_X) { + OUT_TYPE v = OUT_TYPE(0); + for (int in = 0; in < n; ++in) { + v += norm(a[in * rows * cols + iy * cols + ix]); + } + if (v > maxv) + maxv = v; + } + + + sh[tx] = maxv; + + __syncthreads(); + + // reduce: + const int nt = BDIM_X; + int c = nt; + + while (c > 1) + { + int half = c / 2; + if (tx < half) + { + auto v = sh[c - tx - 1]; + if (maxv < v) { + sh[tx] = v; + maxv = v; + } + } + __syncthreads(); + c = c - half; + } + + if (tx == 0) + { + out[iy] = sh[0]; + } +} + +extern "C" __global__ void max_abs2_step2(const OUT_TYPE* in, + int n, + OUT_TYPE* out) +{ + int tx = threadIdx.x; + + in += blockIdx.x * n; + + __shared__ OUT_TYPE sh[BDIM_X]; + + OUT_TYPE maxv = OUT_TYPE(0); + for (int i = tx; i < n; ++i) { + auto v = in[i]; + if (v > maxv) + maxv = v; + } + sh[tx] = maxv; + __syncthreads(); + + // reduce: + const int nt = BDIM_X; + int c = nt; + + while (c > 1) + { + int half = c / 2; + if (tx < half) + { + auto v = sh[c - tx - 1]; + if (maxv < v) { + sh[tx] = v; + maxv = v; + } + } + __syncthreads(); + c = c - half; + } + + if (tx == 0) + { + out[0] = sh[0]; + } +} diff --git a/ptypy/accelerate/cuda_pycuda/cuda/ob_update.cu b/ptypy/accelerate/cuda_pycuda/cuda/ob_update.cu index c2cf2fd22..29b993fb0 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/ob_update.cu +++ b/ptypy/accelerate/cuda_pycuda/cuda/ob_update.cu @@ -1,8 +1,16 @@ +/** ob_update. 
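max_abs2 above is another two-stage reduction: step1 sums the intensity over the first (mode) dimension and keeps a per-row maximum, step2 collapses the per-row maxima into a single value. A PyCuda launch sketch; BDIM_X, the compile options and the file path are illustrative assumptions:

import numpy as np
import pycuda.autoinit  # noqa: F401
from pycuda import gpuarray
from pycuda.compiler import SourceModule

BDIM_X = 256
src = open('ptypy/accelerate/cuda_pycuda/cuda/max_abs2.cu').read()
mod = SourceModule(src, options=['-DIN_TYPE=complex<float>',
                                 '-DOUT_TYPE=float',
                                 '-DBDIM_X=%d' % BDIM_X],
                   no_extern_c=True)
step1 = mod.get_function('max_abs2_step1')
step2 = mod.get_function('max_abs2_step2')

n, rows, cols = 4, 128, 128
a = gpuarray.to_gpu(np.random.rand(n, rows, cols).astype(np.complex64))
rowmax = gpuarray.zeros(rows, np.float32)
out = gpuarray.zeros(1, np.float32)
step1(a, np.int32(n), np.int32(rows), np.int32(cols), rowmax,
      block=(BDIM_X, 1, 1), grid=(1, rows, 1))   # one block per row
step2(rowmax, np.int32(rows), out,
      block=(BDIM_X, 1, 1), grid=(1, 1, 1))      # single-block final max
# reference: max over rows/cols of the mode-summed intensity
ref = (np.abs(a.get()) ** 2).sum(0).max()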
+ * + * Data types: + * - IN_TYPE: the data type for the inputs (float or double) + * - OUT_TYPE: the data type for the outputs (float or double) + * - MATH_TYPE: the data type used for computation + */ + #include using thrust::complex; template -__device__ inline void atomicAdd(complex* x, complex y) +__device__ inline void atomicAdd(complex* x, const complex& y) { auto xf = reinterpret_cast(x); atomicAdd(xf, y.real()); @@ -10,20 +18,20 @@ __device__ inline void atomicAdd(complex* x, complex y) } extern "C" __global__ void ob_update( - const complex* __restrict__ exit_wave, + const complex* __restrict__ exit_wave, int A, int B, int C, - const complex* __restrict__ probe, + const complex* __restrict__ probe, int D, int E, int F, - complex* obj, + complex* obj, int G, int H, int I, const int* __restrict__ addr, - DENOM_TYPE* denominator) + OUT_TYPE* denominator) { const int bid = blockIdx.x; const int tx = threadIdx.x; @@ -46,12 +54,15 @@ extern "C" __global__ void ob_update( { for (int c = tx; c < C; c += blockDim.x) { - auto probe_val = probe[b * F + c]; - atomicAdd(&obj[b * I + c], conj(probe_val) * exit_wave[b * C + c]); - auto denomreal = reinterpret_cast(&denominator[b * I + c]); + complex probe_val = probe[b * F + c]; + complex exit_val = exit_wave[b * C + c]; + auto add_val_m = conj(probe_val) * exit_val; + complex add_val = add_val_m; + atomicAdd(&obj[b * I + c], add_val); + auto upd_probe = probe_val.real() * probe_val.real() + probe_val.imag() * probe_val.imag(); - atomicAdd(denomreal, upd_probe); + atomicAdd(&denominator[b * I + c], upd_probe); } } } diff --git a/ptypy/accelerate/cuda_pycuda/cuda/ob_update2.cu b/ptypy/accelerate/cuda_pycuda/cuda/ob_update2.cu index 1f9c5b573..821c04a6d 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/ob_update2.cu +++ b/ptypy/accelerate/cuda_pycuda/cuda/ob_update2.cu @@ -1,3 +1,20 @@ +/** ob_update. + * + * Data types: + * - IN_TYPE: the data type for the inputs (float or double) + * - OUT_TYPE: the data type for the outputs (float or double) + * - MATH_TYPE: the data type used for computation + * - ACC_TYPE: accumulator type for the local ob accumulation + * + * NOTE: This version of ob_update goes over all tiles that need to be accumulated + * in a single thread block to avoid global atomic additions (as in ob_update.cu). + * This requires a local array of NUM_MODES size to store the local updates. + * GPU registers per thread are limited (255 32bit registers on V100), + * and at some point the registers will spill into shared or global memory + * and the kernel will get considerably slower. 
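The atomic-add based ob_update above and the tiled ob_update2 variant implement the same update of object and denominator. A NumPy reference of the semantics (a sketch; addr is viewed as (num_pods, 5, 3) triples in the pa/oa/ea/da/ma order these kernels assume):

import numpy as np

def ob_update_ref(addr, exit_wave, probe, obj, denom):
    B, C = exit_wave.shape[-2:]
    for pa, oa, ea, da, ma in addr:
        pr = probe[pa[0], pa[1]:pa[1] + B, pa[2]:pa[2] + C]
        ex = exit_wave[ea[0]]
        obj[oa[0], oa[1]:oa[1] + B, oa[2]:oa[2] + C] += pr.conj() * ex
        denom[oa[0], oa[1]:oa[1] + B, oa[2]:oa[2] + C] += np.abs(pr) ** 2
    return obj, denom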
+ */ + + #include #include using thrust::complex; @@ -8,59 +25,40 @@ using thrust::complex; #define obj_roi_row(k) addr[4 * num_pods + (k)] #define obj_roi_column(k) addr[5 * num_pods + (k)] -template -__device__ inline void set_real(complex& v, T r) -{ - v.real(r); -} -template -__device__ inline void set_real(T& v, T r) -{ - v = r; -} -template -__device__ inline T get_real(const complex& v) -{ - return v.real(); -} -template -__device__ inline T get_real(const T& v) -{ - return v; -} extern "C" __global__ void ob_update2( int pr_sh, int ob_modes, int num_pods, - int ob_sh, + int ob_sh_rows, + int ob_sh_cols, int pr_modes, int ex_0, int ex_1, int ex_2, - complex* ob_g, - DENOM_TYPE* obn_g, - const complex* __restrict__ pr_g, // 2, 5, 5 - const complex* __restrict__ ex_g, // 16, 5, 5 + complex* ob_g, + OUT_TYPE* obn_g, + const complex* __restrict__ pr_g, // 2, 5, 5 + const complex* __restrict__ ex_g, // 16, 5, 5 const int* addr) { int y = blockIdx.y * BDIM_Y + threadIdx.y; - int dy = ob_sh; + int dy = ob_sh_rows; int z = blockIdx.x * BDIM_X + threadIdx.x; - int dz = ob_sh; - complex ob[NUM_MODES]; - DENOM_TYPE obn[NUM_MODES]; + int dz = ob_sh_cols; + complex ob[NUM_MODES]; + ACC_TYPE obn[NUM_MODES]; int txy = threadIdx.y * BDIM_X + threadIdx.x; assert(ob_modes <= NUM_MODES); - if (y < ob_sh && z < ob_sh) + if (y < dy && z < dz) { #pragma unroll for (int i = 0; i < NUM_MODES; ++i) { auto idx = i * dy * dz + y * dz + z; - assert(idx < ob_modes * ob_sh * ob_sh); + assert(idx < ob_modes * ob_sh_rows * ob_sh_cols); ob[i] = ob_g[idx]; obn[i] = obn_g[idx]; } @@ -92,7 +90,7 @@ extern "C" __global__ void ob_update2( __syncthreads(); - if (y >= ob_sh || z >= ob_sh) + if (y >= dy || z >= dz) continue; #pragma unroll 4 @@ -105,21 +103,21 @@ extern "C" __global__ void ob_update2( { auto pridx = ad[0] * pr_sh * pr_sh + v1 * pr_sh + v2; assert(pridx < pr_modes * pr_sh * pr_sh); - auto pr = pr_g[pridx]; + complex pr = pr_g[pridx]; int idx = ad[2]; assert(idx < NUM_MODES); auto cpr = conj(pr); auto exidx = ad[1] * pr_sh * pr_sh + v1 * pr_sh + v2; assert(exidx < ex_0 * ex_1 * ex_2); - ob[idx] += cpr * ex_g[exidx]; - auto rr = get_real(obn[idx]); - rr += pr.real() * pr.real() + pr.imag() * pr.imag(); - set_real(obn[idx], rr); + complex t_ex_g = ex_g[exidx]; + complex add_val = cpr * t_ex_g; + ob[idx] += add_val; + obn[idx] += pr.real() * pr.real() + pr.imag() * pr.imag(); } } } - if (y < ob_sh && z < ob_sh) + if (y < dy && z < dz) { for (int i = 0; i < NUM_MODES; ++i) { diff --git a/ptypy/accelerate/cuda_pycuda/cuda/ob_update2_ML.cu b/ptypy/accelerate/cuda_pycuda/cuda/ob_update2_ML.cu index 56d088788..b62e66006 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/ob_update2_ML.cu +++ b/ptypy/accelerate/cuda_pycuda/cuda/ob_update2_ML.cu @@ -1,3 +1,20 @@ +/** ob_update. + * + * Data types: + * - IN_TYPE: the data type for the inputs (float or double) + * - OUT_TYPE: the data type for the outputs (float or double) + * - MATH_TYPE: the data type used for computation + * - ACC_TYPE: accumulator for the ob field + * + * NOTE: This version of ob_update goes over all tiles that need to be accumulated + * in a single thread block to avoid global atomic additions (as in ob_update_ML.cu). + * This requires a local array of NUM_MODES size to store the local updates. + * GPU registers per thread are limited (255 32bit registers on V100), + * and at some point the registers will spill into shared or global memory + * and the kernel will get considerably slower. 
+ */ + + #include #include using thrust::complex; @@ -11,33 +28,36 @@ using thrust::complex; extern "C" __global__ void ob_update2_ML(int pr_sh, int ob_modes, int num_pods, - int ob_sh, + int ob_sh_rows, + int ob_sh_cols, int pr_modes, int ex_0, int ex_1, int ex_2, - CTYPE* ob_g, - const CTYPE* __restrict__ pr_g, - const CTYPE* __restrict__ ex_g, + complex* ob_g, + const complex* __restrict__ pr_g, + const complex* __restrict__ ex_g, const int* addr, - FTYPE fac) + IN_TYPE fac_) { int y = blockIdx.y * BDIM_Y + threadIdx.y; - int dy = ob_sh; + int dy = ob_sh_rows; int z = blockIdx.x * BDIM_X + threadIdx.x; - int dz = ob_sh; - CTYPE ob[NUM_MODES]; + int dz = ob_sh_cols; + MATH_TYPE fac = fac_; + complex ob[NUM_MODES]; + int txy = threadIdx.y * BDIM_X + threadIdx.x; assert(ob_modes <= NUM_MODES); - if (y < ob_sh && z < ob_sh) + if (y < dy && z < dz) { #pragma unroll for (int i = 0; i < NUM_MODES; ++i) { auto idx = i * dy * dz + y * dz + z; - assert(idx < ob_modes * ob_sh * ob_sh); + assert(idx < ob_modes * ob_sh_rows * ob_sh_cols); ob[i] = ob_g[idx]; } } @@ -68,7 +88,7 @@ extern "C" __global__ void ob_update2_ML(int pr_sh, __syncthreads(); - if (y >= ob_sh || z >= ob_sh) + if (y >= dy || z >= dz) continue; #pragma unroll 4 @@ -81,18 +101,20 @@ extern "C" __global__ void ob_update2_ML(int pr_sh, { auto pridx = ad[0] * pr_sh * pr_sh + v1 * pr_sh + v2; assert(pridx < pr_modes * pr_sh * pr_sh); - auto pr = pr_g[pridx]; + complex pr = pr_g[pridx]; int idx = ad[2]; assert(idx < NUM_MODES); auto cpr = conj(pr); auto exidx = ad[1] * pr_sh * pr_sh + v1 * pr_sh + v2; assert(exidx < ex_0 * ex_1 * ex_2); - ob[idx] += cpr * ex_g[exidx] * fac; + complex t_ex_g = ex_g[exidx]; + complex add_val = cpr * t_ex_g * fac; + ob[idx] += add_val; } } } - if (y < ob_sh && z < ob_sh) + if (y < dy && z < dz) { for (int i = 0; i < NUM_MODES; ++i) { diff --git a/ptypy/accelerate/cuda_pycuda/cuda/ob_update_ML.cu b/ptypy/accelerate/cuda_pycuda/cuda/ob_update_ML.cu index c6aa9ca11..84e678ebb 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/ob_update_ML.cu +++ b/ptypy/accelerate/cuda_pycuda/cuda/ob_update_ML.cu @@ -1,8 +1,16 @@ +/** ob_update_ML. 
+ * + * Data types: + * - IN_TYPE: the data type for the inputs (float or double) + * - OUT_TYPE: the data type for the outputs (float or double) + * - MATH_TYPE: the data type used for computation + */ + #include using thrust::complex; template -__device__ inline void atomicAdd(complex* x, complex y) +__device__ inline void atomicAdd(complex* x, const complex& y) { auto xf = reinterpret_cast(x); atomicAdd(xf, y.real()); @@ -11,25 +19,26 @@ __device__ inline void atomicAdd(complex* x, complex y) extern "C" { - __global__ void ob_update_ML(const CTYPE* __restrict__ exit_wave, + __global__ void ob_update_ML(const complex* __restrict__ exit_wave, int A, int B, int C, - const CTYPE* __restrict__ probe, + const complex* __restrict__ probe, int D, int E, int F, - CTYPE* obj, + complex* obj, int G, int H, int I, const int* __restrict__ addr, - FTYPE fac) + IN_TYPE fac_) { const int bid = blockIdx.x; const int tx = threadIdx.x; const int ty = threadIdx.y; const int addr_stride = 15; + MATH_TYPE fac = fac_; const int* oa = addr + 3 + bid * addr_stride; const int* pa = addr + bid * addr_stride; @@ -46,9 +55,12 @@ extern "C" { for (int c = tx; c < C; c += blockDim.x) { - auto probe_val = probe[b * F + c]; - atomicAdd(&obj[b * I + c], - conj(probe_val) * exit_wave[b * C + c] * fac); + complex probe_val = probe[b * F + c]; + complex exit_val = exit_wave[b * C + c]; + complex add_val_m = conj(probe_val) * exit_val * fac; + complex add_val(add_val_m); + + atomicAdd(&obj[b * I + c], add_val); } } } diff --git a/ptypy/accelerate/cuda_pycuda/cuda/ob_update_local.cu b/ptypy/accelerate/cuda_pycuda/cuda/ob_update_local.cu new file mode 100644 index 000000000..c49119be2 --- /dev/null +++ b/ptypy/accelerate/cuda_pycuda/cuda/ob_update_local.cu @@ -0,0 +1,67 @@ +/** ob_update_local - in DR algorithm. 
+ * + * Data types: + * - IN_TYPE: the data type for the inputs (float or double) + * - OUT_TYPE: the data type for the outputs (float or double) + * - MATH_TYPE: the data type used for computation + */ + +#include +using thrust::complex; + +template +__device__ inline void atomicAdd(complex* x, const complex& y) +{ + auto xf = reinterpret_cast(x); + atomicAdd(xf, y.real()); + atomicAdd(xf + 1, y.imag()); +} + +extern "C" __global__ void ob_update_local( + const complex* __restrict__ exit_wave, + const complex* __restrict__ aux, + int A, + int B, + int C, + const complex* __restrict__ probe, + int D, + int E, + int F, + const IN_TYPE* __restrict__ pr_norm, + complex* obj, + int G, + int H, + int I, + const int* __restrict__ addr) +{ + const int bid = blockIdx.z; + const int tx = threadIdx.x; + const int b = threadIdx.y + blockIdx.y * blockDim.y; + if (b >= B) + return; + const int addr_stride = 15; + + const int* oa = addr + 3 + bid * addr_stride; + const int* pa = addr + bid * addr_stride; + const int* ea = addr + 6 + bid * addr_stride; + + probe += pa[0] * E * F + pa[1] * F + pa[2]; + obj += oa[0] * H * I + oa[1] * I + oa[2]; + aux += bid * B * C; + MATH_TYPE norm_val = pr_norm[0]; + + assert(oa[0] * H * I + oa[1] * I + oa[2] + (B - 1) * I + C - 1 < G * H * I); + + exit_wave += ea[0] * B * C; + + for (int c = tx; c < C; c += blockDim.x) + { + complex probe_val = probe[b * F + c]; + complex exit_val = exit_wave[b * C + c]; + complex aux_val = aux[b * C + c]; + + auto add_val_m = conj(probe_val) * (exit_val - aux_val) / norm_val; + complex add_val = add_val_m; + atomicAdd(&obj[b * I + c], add_val); + } +} diff --git a/ptypy/accelerate/cuda_pycuda/cuda/pr_update.cu b/ptypy/accelerate/cuda_pycuda/cuda/pr_update.cu index 13a6c72b1..180cf8f14 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/pr_update.cu +++ b/ptypy/accelerate/cuda_pycuda/cuda/pr_update.cu @@ -1,29 +1,37 @@ +/** pr_update. 
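ob_update_local above (and pr_update_local later in this patch) are the DR-engine updates: the object accumulates conj(probe) times the difference between exit wave and aux, normalised by the single precomputed probe-intensity value passed in as pr_norm (e.g. the max_abs2 output). A NumPy sketch, assuming aux holds one (B, C) tile per pod:

import numpy as np

def ob_update_local_ref(addr, exit_wave, aux, probe, obj, pr_norm):
    B, C = exit_wave.shape[-2:]
    for i, (pa, oa, ea, da, ma) in enumerate(addr):
        pr = probe[pa[0], pa[1]:pa[1] + B, pa[2]:pa[2] + C]
        diff = exit_wave[ea[0]] - aux[i]
        obj[oa[0], oa[1]:oa[1] + B, oa[2]:oa[2] + C] += \
            pr.conj() * diff / pr_norm
    return obj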
+ *
+ * Data types:
+ * - IN_TYPE: the data type for the inputs (float or double)
+ * - OUT_TYPE: the data type for the outputs (float or double)
+ * - MATH_TYPE: the data type used for computation
+ */
+
 #include <thrust/complex.h>
 using thrust::complex;

-template <class T>
-__device__ inline void atomicAdd(complex<T>* x, complex<T> y)
+template <class T, class U>
+__device__ inline void atomicAdd(complex<T>* x, const complex<U>& y)
 {
   auto xf = reinterpret_cast<T*>(x);
-  atomicAdd(xf, y.real());
-  atomicAdd(xf + 1, y.imag());
+  atomicAdd(xf, T(y.real()));
+  atomicAdd(xf + 1, T(y.imag()));
 }

 extern "C" __global__ void pr_update(
-    const complex<float>* __restrict__ exit_wave,
+    const complex<IN_TYPE>* __restrict__ exit_wave,
     int A,
     int B,
     int C,
-    complex<float>* probe,
+    complex<OUT_TYPE>* probe,
     int D,
     int E,
     int F,
-    const complex<float>* __restrict__ obj,
+    const complex<IN_TYPE>* __restrict__ obj,
     int G,
     int H,
     int I,
     const int* __restrict__ addr,
-    DENOM_TYPE* denominator)
+    OUT_TYPE* denominator)
 {
   assert(B == E);  // prsh[1]
   assert(C == F);  // prsh[2]
@@ -48,12 +56,14 @@
   {
     for (int c = tx; c < C; c += blockDim.x)
     {
-      auto obj_val = obj[b * I + c];
-      atomicAdd(&probe[b * F + c], conj(obj_val) * exit_wave[b * C + c]);
-      auto denomreal = reinterpret_cast<float*>(&denominator[b * F + c]);
-      auto upd_obj =
+      complex<MATH_TYPE> obj_val = obj[b * I + c];
+      complex<MATH_TYPE> exit_val = exit_wave[b * C + c];
+      complex<MATH_TYPE> add_val_m = conj(obj_val) * exit_val;
+      complex<OUT_TYPE> add_val = add_val_m;
+      atomicAdd(&probe[b * F + c], add_val);
+      MATH_TYPE upd_obj =
           obj_val.real() * obj_val.real() + obj_val.imag() * obj_val.imag();
-      atomicAdd(denomreal, upd_obj);
+      atomicAdd(&denominator[b * F + c], upd_obj);
     }
   }
 }
diff --git a/ptypy/accelerate/cuda_pycuda/cuda/pr_update2.cu b/ptypy/accelerate/cuda_pycuda/cuda/pr_update2.cu
index 1361cb18d..e5417cc01 100644
--- a/ptypy/accelerate/cuda_pycuda/cuda/pr_update2.cu
+++ b/ptypy/accelerate/cuda_pycuda/cuda/pr_update2.cu
@@ -1,3 +1,19 @@
+/** pr_update.
+ *
+ * Data types:
+ * - IN_TYPE: the data type for the inputs (float or double)
+ * - OUT_TYPE: the data type for the outputs (float or double)
+ * - MATH_TYPE: the data type used for computation
+ * - ACC_TYPE: accumulator type for local pr array
+ *
+ * NOTE: This version of pr_update goes over all tiles that need to be accumulated
+ * in a single thread block to avoid global atomic additions (as in pr_update.cu).
+ * This requires a local array of NUM_MODES size to store the local updates.
+ * GPU registers per thread are limited (255 32bit registers on V100),
+ * and at some point the registers will spill into shared or global memory
+ * and the kernel will get considerably slower.
+ */
+
 #include <thrust/complex.h>
 #include <cassert>
 using thrust::complex;
@@ -10,29 +26,6 @@ using thrust::complex;
 #define obj_roi_row(k) addr[4 * num_pods + (k)]
 #define obj_roi_column(k) addr[5 * num_pods + (k)]

-template <class T>
-__device__ inline void set_real(complex<T>& v, T r)
-{
-  v.real(r);
-}
-
-template <class T>
-__device__ inline void set_real(T& v, T r)
-{
-  v = r;
-}
-
-template <class T>
-__device__ inline T get_real(const complex<T>& v)
-{
-  return v.real();
-}
-
-template <class T>
-__device__ inline T get_real(const T& v)
-{
-  return v;
-}

 extern "C" __global__ void pr_update2(int pr_sh,
                                       int ob_sh_row,
                                       int ob_sh_col,
                                       int pr_modes,
                                       int ob_modes,
                                       int num_pods,
-                                      complex<float>* pr_g,
-                                      DENOM_TYPE* prn_g,
-                                      const complex<float>* __restrict__ ob_g,
-                                      const complex<float>* __restrict__ ex_g,
+                                      complex<OUT_TYPE>* pr_g,
+                                      OUT_TYPE* prn_g,
+                                      const complex<IN_TYPE>* __restrict__ ob_g,
+                                      const complex<IN_TYPE>* __restrict__ ex_g,
                                       const int* addr)
 {
   int y = blockIdx.y * BDIM_Y + threadIdx.y;
   int dy = pr_sh;
   int z = blockIdx.x * BDIM_X + threadIdx.x;
   int dz = pr_sh;
-  complex<float> pr[NUM_MODES];
-  DENOM_TYPE prn[NUM_MODES];
+  complex<ACC_TYPE> pr[NUM_MODES];
+  ACC_TYPE prn[NUM_MODES];
   int txy = threadIdx.y * BDIM_X + threadIdx.x;

   assert(pr_modes <= NUM_MODES);
@@ -107,15 +100,15 @@ extern "C" __global__ void pr_update2(int pr_sh,
     {
       auto obidx = ad[2] * ob_sh_row * ob_sh_col + v1 * ob_sh_col + v2;
       assert(obidx < ob_modes * ob_sh_row * ob_sh_col);
-      auto ob = ob_g[obidx];
+      complex<MATH_TYPE> ob = ob_g[obidx];
       int idx = ad[0];
       assert(idx < NUM_MODES);

       auto cob = conj(ob);
-      pr[idx] += cob * ex_g[ad[1] * pr_sh * pr_sh + y * pr_sh + z];
-      auto rr = get_real(prn[idx]);
-      rr += ob.real() * ob.real() + ob.imag() * ob.imag();
-      set_real(prn[idx], rr);
+      complex<MATH_TYPE> ex_val = ex_g[ad[1] * pr_sh * pr_sh + y * pr_sh + z];
+      complex<MATH_TYPE> add_val = cob * ex_val;
+      pr[idx] += add_val;
+      prn[idx] += ob.real() * ob.real() + ob.imag() * ob.imag();
     }
   }
 }
diff --git a/ptypy/accelerate/cuda_pycuda/cuda/pr_update2_ML.cu b/ptypy/accelerate/cuda_pycuda/cuda/pr_update2_ML.cu
index 696682e97..8a45891c5 100644
--- a/ptypy/accelerate/cuda_pycuda/cuda/pr_update2_ML.cu
+++ b/ptypy/accelerate/cuda_pycuda/cuda/pr_update2_ML.cu
@@ -1,3 +1,19 @@
+/** pr_update.
+ *
+ * Data types:
+ * - IN_TYPE: the data type for the inputs (float or double)
+ * - OUT_TYPE: the data type for the outputs (float or double)
+ * - MATH_TYPE: the data type used for computation
+ * - ACC_TYPE: accumulator type for local pr array
+ *
+ * NOTE: This version of pr_update goes over all tiles that need to be accumulated
+ * in a single thread block to avoid global atomic additions (as in pr_update_ML.cu).
+ * This requires a local array of NUM_MODES size to store the local updates.
+ * GPU registers per thread are limited (255 32bit registers on V100),
+ * and at some point the registers will spill into shared or global memory
+ * and the kernel will get considerably slower.
+ */ + #include #include using thrust::complex; @@ -16,17 +32,18 @@ extern "C" __global__ void pr_update2_ML(int pr_sh, int pr_modes, int ob_modes, int num_pods, - CTYPE* pr_g, - const CTYPE* __restrict__ ob_g, - const CTYPE* __restrict__ ex_g, + complex* pr_g, + const complex* __restrict__ ob_g, + const complex* __restrict__ ex_g, const int* addr, - FTYPE fac) + IN_TYPE fac_) { int y = blockIdx.y * BDIM_Y + threadIdx.y; int dy = pr_sh; int z = blockIdx.x * BDIM_X + threadIdx.x; int dz = pr_sh; - CTYPE pr[NUM_MODES]; + MATH_TYPE fac = fac_; + complex pr[NUM_MODES]; int txy = threadIdx.y * BDIM_X + threadIdx.x; assert(pr_modes <= NUM_MODES); @@ -81,12 +98,15 @@ extern "C" __global__ void pr_update2_ML(int pr_sh, { auto obidx = ad[2] * ob_sh_row * ob_sh_col + v1 * ob_sh_col + v2; assert(obidx < ob_modes * ob_sh_row * ob_sh_col); - auto ob = ob_g[obidx]; + complex ob = ob_g[obidx]; int idx = ad[0]; assert(idx < NUM_MODES); auto cob = conj(ob); - pr[idx] += cob * ex_g[ad[1] * pr_sh * pr_sh + y * pr_sh + z] * fac; + complex ex_val = ex_g[ad[1] * pr_sh * pr_sh + y * pr_sh + z]; + complex add_val_m = cob * ex_val * fac; + complex add_val = add_val_m; + pr[idx] += add_val; } } } diff --git a/ptypy/accelerate/cuda_pycuda/cuda/pr_update_ML.cu b/ptypy/accelerate/cuda_pycuda/cuda/pr_update_ML.cu index 156e6d198..3fa24137d 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/pr_update_ML.cu +++ b/ptypy/accelerate/cuda_pycuda/cuda/pr_update_ML.cu @@ -1,28 +1,37 @@ +/** pr_update_ML. + * + * Data types: + * - IN_TYPE: the data type for the inputs (float or double) + * - OUT_TYPE: the data type for the outputs (float or double) + * - MATH_TYPE: the data type used for computation + */ + + #include using thrust::complex; template -__device__ inline void atomicAdd(complex* x, complex y) +__device__ inline void atomicAdd(complex* x, const complex& y) { auto xf = reinterpret_cast(x); atomicAdd(xf, y.real()); atomicAdd(xf + 1, y.imag()); } -extern "C" __global__ void pr_update_ML(const CTYPE* __restrict__ exit_wave, +extern "C" __global__ void pr_update_ML(const complex* __restrict__ exit_wave, int A, int B, int C, - CTYPE* probe, + complex* probe, int D, int E, int F, - const CTYPE* __restrict__ obj, + const complex* __restrict__ obj, int G, int H, int I, const int* __restrict__ addr, - FTYPE fac) + IN_TYPE fac_) { assert(B == E); // prsh[1] assert(C == F); // prsh[2] @@ -30,6 +39,7 @@ extern "C" __global__ void pr_update_ML(const CTYPE* __restrict__ exit_wave, const int tx = threadIdx.x; const int ty = threadIdx.y; const int addr_stride = 15; + MATH_TYPE fac = fac_; const int* oa = addr + 3 + bid * addr_stride; const int* pa = addr + bid * addr_stride; @@ -46,8 +56,11 @@ extern "C" __global__ void pr_update_ML(const CTYPE* __restrict__ exit_wave, { for (int c = tx; c < C; c += blockDim.x) { - auto obj_val = obj[b * I + c]; - atomicAdd(&probe[b * F + c], conj(obj_val) * exit_wave[b * C + c] * fac); + complex obj_val = obj[b * I + c]; + complex exit_val = exit_wave[b * C + c]; + complex add_val_m = conj(obj_val) * exit_val * fac; + complex add_val = add_val_m; + atomicAdd(&probe[b * F + c], add_val); } } } diff --git a/ptypy/accelerate/cuda_pycuda/cuda/pr_update_local.cu b/ptypy/accelerate/cuda_pycuda/cuda/pr_update_local.cu new file mode 100644 index 000000000..ee81e1620 --- /dev/null +++ b/ptypy/accelerate/cuda_pycuda/cuda/pr_update_local.cu @@ -0,0 +1,71 @@ +/** pr_update_local - for DR algorithm. 
+ * + * Data types: + * - IN_TYPE: the data type for the inputs (float or double) + * - OUT_TYPE: the data type for the outputs (float or double) + * - MATH_TYPE: the data type used for computation + * - ACC_TYPE: data type used in norm calculation (input here) + */ + +#include +using thrust::complex; + +template +__device__ inline void atomicAdd(complex* x, const complex& y) +{ + auto xf = reinterpret_cast(x); + atomicAdd(xf, T(y.real())); + atomicAdd(xf + 1, T(y.imag())); +} + +extern "C" __global__ void pr_update_local( + const complex* __restrict__ exit_wave, + const complex* __restrict__ aux, + int A, + int B, + int C, + complex* probe, + int D, + int E, + int F, + const IN_TYPE* __restrict__ ob_norm, + const complex* __restrict__ obj, + int G, + int H, + int I, + const int* __restrict__ addr) +{ + assert(B == E); // prsh[1] + assert(C == F); // prsh[2] + const int bid = blockIdx.z; + const int tx = threadIdx.x; + const int b = threadIdx.y + blockIdx.y * blockDim.y; + if (b >= B) + return; + const int addr_stride = 15; + + const int* oa = addr + 3 + bid * addr_stride; + const int* pa = addr + bid * addr_stride; + const int* ea = addr + 6 + bid * addr_stride; + + probe += pa[0] * E * F + pa[1] * F + pa[2]; + obj += oa[0] * H * I + oa[1] * I + oa[2]; + aux += bid * B * C; + MATH_TYPE norm_val = ob_norm[0]; + + assert(oa[0] * H * I + oa[1] * I + oa[2] + (B - 1) * I + C - 1 < G * H * I); + + exit_wave += ea[0] * B * C; + + for (int c = tx; c < C; c += blockDim.x) + { + complex obj_val = obj[b * I + c]; + complex exit_val = exit_wave[b * C + c]; + complex aux_val = aux[b * C + c]; + + complex add_val_m = conj(obj_val) * (exit_val - aux_val) / norm_val; + complex add_val = add_val_m; + atomicAdd(&probe[b * F + c], add_val); + } + +} diff --git a/ptypy/accelerate/cuda_pycuda/cuda/transpose.cu b/ptypy/accelerate/cuda_pycuda/cuda/transpose.cu index a460727a4..8de4e7ad7 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/transpose.cu +++ b/ptypy/accelerate/cuda_pycuda/cuda/transpose.cu @@ -5,6 +5,11 @@ * and shared memory access has no bank conflicts. */ +/** + * Data types: + * - DTYPE - any pod type + */ + #include using thrust::complex; diff --git a/ptypy/accelerate/cuda_pycuda/cuda/update_addr_error_state.cu b/ptypy/accelerate/cuda_pycuda/cuda/update_addr_error_state.cu index 2e6d21059..1220a0986 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/update_addr_error_state.cu +++ b/ptypy/accelerate/cuda_pycuda/cuda/update_addr_error_state.cu @@ -1,11 +1,18 @@ +/** update_addr_error_state kernel. 
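update_addr_error_state, whose diff starts here, is the accept/reject step of position refinement: a mangled address block is kept only where it lowered the error. A NumPy reference (sketch):

import numpy as np

def update_addr_error_state_ref(addr, mangled_addr, error_state, error_sum,
                                nmodes):
    # one position covers nmodes consecutive rows of the address book
    accept = error_sum < error_state
    a = addr.reshape(-1, nmodes, 5, 3)
    m = mangled_addr.reshape(-1, nmodes, 5, 3)
    a[accept] = m[accept]
    error_state[accept] = error_sum[accept]
    return a.reshape(addr.shape), error_state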
+ * + * Data types: + * - IN_TYPE: the data type for the inputs (float or double) + * - OUT_TYPE: the data type for the outputs (float or double) + */ + #include #include using thrust::complex; -extern "C" __global__ void update_addr_error_state(int* addr, - const int* mangled_addr, - float* error_state, - const float* error_sum, +extern "C" __global__ void update_addr_error_state(int* __restrict addr, + const int* __restrict mangled_addr, + OUT_TYPE* error_state, + const IN_TYPE* __restrict error_sum, int nmodes) { int tx = threadIdx.x; @@ -23,7 +30,7 @@ extern "C" __global__ void update_addr_error_state(int* addr, if (err_sum < err_st) { - for (int i = tx; i < nmodes * 15; i += blockDim.x) + for (int i = tx, e = nmodes * 15; i < e; i += blockDim.x) { addr[i] = mangled_addr[i]; } diff --git a/ptypy/accelerate/cuda_pycuda/cufft.py b/ptypy/accelerate/cuda_pycuda/cufft.py index 89c2c650b..686171342 100644 --- a/ptypy/accelerate/cuda_pycuda/cufft.py +++ b/ptypy/accelerate/cuda_pycuda/cufft.py @@ -17,6 +17,10 @@ def __init__(self, array, queue=None, if dims < 2: raise AssertionError('Input array must be at least 2-dimensional') self.arr_shape = (array.shape[-2], array.shape[-1]) + rows = self.arr_shape[0] + columns = self.arr_shape[1] + if rows != columns or rows not in [16, 32, 64, 128, 256, 512, 1024, 2048]: + raise ValueError("CUDA FFT only supports powers of 2 for rows/columns, from 16 to 2048") self.batches = int(np.product(array.shape[0:dims-2]) if dims > 2 else 1) self.forward = forward @@ -34,10 +38,11 @@ def _load(self, array, pre_fft, post_fft, symmetric, forward): else: self.post_fft_ptr = 0 - from . import import_fft - mod = import_fft.ImportFFT(self.arr_shape[0], self.arr_shape[1]).get_mod() - self.fftobj = mod.FilteredFFT( + from ptypy import filtered_cufft + self.fftobj = filtered_cufft.FilteredFFT( self.batches, + self.arr_shape[0], + self.arr_shape[1], symmetric, forward, self.pre_fft_ptr, @@ -75,14 +80,30 @@ def queue(self, queue): cufftlib.cufftSetStream(self.plan.handle, queue.handle) def _load(self, array, pre_fft, post_fft, symmetric, forward): + assert(array.dtype in [np.complex64, np.complex128]) + assert(pre_fft.dtype in [np.complex64, np.complex128] if pre_fft is not None else True) + assert(post_fft.dtype in [np.complex64, np.complex128] if post_fft is not None else True) + + math_type = 'float' if array.dtype == np.complex64 else 'double' + if pre_fft is not None: + math_type = 'float' if pre_fft.dtype == np.complex64 else 'double' self.pre_fft_knl = load_kernel("batched_multiply", { 'MPY_DO_SCALE': 'false', - 'MPY_DO_FILT': 'true' + 'MPY_DO_FILT': 'true', + 'IN_TYPE': 'float' if array.dtype == np.complex64 else 'double', + 'OUT_TYPE': 'float' if array.dtype == np.complex64 else 'double', + 'MATH_TYPE': math_type }) if pre_fft is not None else None + math_type = 'float' if array.dtype == np.complex64 else 'double' + if post_fft is not None: + math_type = 'float' if post_fft.dtype == np.complex64 else 'double' self.post_fft_knl = load_kernel("batched_multiply", { 'MPY_DO_SCALE': 'true' if (not forward and not symmetric) or symmetric else 'false', - 'MPY_DO_FILT': 'true' if post_fft is not None else 'false' + 'MPY_DO_FILT': 'true' if post_fft is not None else 'false', + 'IN_TYPE': 'float' if array.dtype == np.complex64 else 'double', + 'OUT_TYPE': 'float' if array.dtype == np.complex64 else 'double', + 'MATH_TYPE': math_type }) if (not (forward and not symmetric) or post_fft is not None) else None self.block = (32, 32, 1) diff --git 
a/ptypy/accelerate/cuda_pycuda/engines/DM_pycuda.py b/ptypy/accelerate/cuda_pycuda/engines/DM_pycuda.py index 154f073ee..961851072 100644 --- a/ptypy/accelerate/cuda_pycuda/engines/DM_pycuda.py +++ b/ptypy/accelerate/cuda_pycuda/engines/DM_pycuda.py @@ -18,21 +18,15 @@ from ptypy.utils import parallel from ptypy.engines import register from ptypy.accelerate.base.engines import DM_serial -from ptypy.accelerate.base import address_manglers from .. import get_context -from ..kernels import FourierUpdateKernel, AuxiliaryWaveKernel, PoUpdateKernel, PositionCorrectionKernel, PropagationKernel -from ..array_utils import ArrayUtilsKernel, GaussianSmoothingKernel +from ..kernels import FourierUpdateKernel, AuxiliaryWaveKernel, PoUpdateKernel, PositionCorrectionKernel +from ..kernels import PropagationKernel, RealSupportKernel, FourierSupportKernel +from ..array_utils import ArrayUtilsKernel, GaussianSmoothingKernel, TransposeKernel, ClipMagnitudesKernel from ..mem_utils import make_pagelocked_paired_arrays as mppa - -MPI = parallel.size > 1 -MPI = True +from ..multi_gpu import get_multi_gpu_communicator __all__ = ['DM_pycuda'] -serialize_array_access = DM_serial.serialize_array_access -gaussian_kernel = DM_serial.gaussian_kernel - - @register() class DM_pycuda(DM_serial.DM_serial): @@ -66,27 +60,28 @@ def __init__(self, ptycho_parent, pars=None): Difference map reconstruction engine. """ super(DM_pycuda, self).__init__(ptycho_parent, pars) + self.multigpu = None def engine_initialize(self): """ Prepare for reconstruction. """ - self.context, self.queue = get_context(new_context=True, new_queue=True) - # allocator for READ only buffers - # self.const_allocator = cl.tools.ImmediateAllocator(queue, cl.mem_flags.READ_ONLY) - ## gaussian filter - # dummy kernel - # if not self.p.obj_smooth_std: - # gauss_kernel = gaussian_kernel(1, 1).astype(np.float32) - # else: - # gauss_kernel = gaussian_kernel(self.p.obj_smooth_std, self.p.obj_smooth_std).astype(np.float32) - # self.gauss_kernel_gpu = gpuarray.to_gpu(gauss_kernel) + # Context, Multi GPU communicator and Stream (needs to be in this order) + self.context, self.queue = get_context(new_context=True, new_queue=False) + self.multigpu = get_multi_gpu_communicator() + self.context, self.queue = get_context(new_context=False, new_queue=True) # Gaussian Smoothing Kernel self.GSK = GaussianSmoothingKernel(queue=self.queue) + # Real/Fourier Support Kernel + self.RSK = {} + self.FSK = {} + + # Clip Magnitudes Kernel + self.CMK = ClipMagnitudesKernel(queue=self.queue) + super(DM_pycuda, self).engine_initialize() - self.error = [] def _setup_kernels(self): """ @@ -117,41 +112,34 @@ def _setup_kernels(self): kern.aux = gpuarray.to_gpu(aux) # setup kernels, one for each SCAN. 
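+ # The CUDA sources above (e.g. pr_update_local) are specialised at compile time through the IN_TYPE / OUT_TYPE / MATH_TYPE macros. A minimal sketch of such a specialisation, with illustrative macro values, mirroring the load_kernel calls in cufft.py: + # + # knl = load_kernel("batched_multiply", { + # 'MPY_DO_SCALE': 'false', + # 'MPY_DO_FILT': 'true', + # 'IN_TYPE': 'float', # inputs are complex<float> + # 'OUT_TYPE': 'float', # outputs are complex<float> + # 'MATH_TYPE': 'double' # accumulate in double precision + # }) + # + # Mixing float storage with double maths trades memory bandwidth against numerical accuracy.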
- logger.info("Setting up FourierUpdateKernel") + log(4, "Setting up FourierUpdateKernel") kern.FUK = FourierUpdateKernel(aux, nmodes, queue_thread=self.queue) kern.FUK.allocate() - logger.info("Setting up PoUpdateKernel") - kern.POK = PoUpdateKernel(queue_thread=self.queue, denom_type=np.float32) + log(4, "Setting up PoUpdateKernel") + kern.POK = PoUpdateKernel(queue_thread=self.queue) kern.POK.allocate() - logger.info("Setting up AuxiliaryWaveKernel") + log(4, "Setting up AuxiliaryWaveKernel") kern.AWK = AuxiliaryWaveKernel(queue_thread=self.queue) kern.AWK.allocate() - logger.info("Setting up ArrayUtilsKernel") + log(4, "Setting up ArrayUtilsKernel") kern.AUK = ArrayUtilsKernel(queue=self.queue) - logger.info("Setting up PropagationKernel") + log(4, "Setting up TransposeKernel") + kern.TK = TransposeKernel(queue=self.queue) + + log(4, "Setting up PropagationKernel") kern.PROP = PropagationKernel(aux, geo.propagator, self.queue, self.p.fft_lib) kern.PROP.allocate() kern.resolution = geo.resolution[0] if self.do_position_refinement: - logger.info("Setting up position correction") - addr_mangler = address_manglers.RandomIntMangle(int(self.p.position_refinement.amplitude // geo.resolution[0]), - self.p.position_refinement.start, - self.p.position_refinement.stop, - max_bound=int(self.p.position_refinement.max_shift // geo.resolution[0]), - randomseed=0) - logger.warning("amplitude is %s " % (self.p.position_refinement.amplitude // geo.resolution[0])) - logger.warning("max bound is %s " % (self.p.position_refinement.max_shift // geo.resolution[0])) - - kern.PCK = PositionCorrectionKernel(aux, nmodes, queue_thread=self.queue) + log(4, "Setting up PositionCorrectionKernel") + kern.PCK = PositionCorrectionKernel(aux, nmodes, self.p.position_refinement, geo.resolution, queue_thread=self.queue) kern.PCK.allocate() - kern.PCK.address_mangler = addr_mangler - #self.queue.synchronize() - logger.info("Kernel setup completed") + log(4, "Kernel setup completed") def engine_prepare(self): @@ -165,6 +153,8 @@ def engine_prepare(self): s.gpu, s.data = mppa(s.data) for name, s in self.pr.S.items(): s.gpu, s.data = mppa(s.data) + for name, s in self.pr_buf.S.items(): + s.gpu, s.data = mppa(s.data) for name, s in self.pr_nrm.S.items(): s.gpu, s.data = mppa(s.data) @@ -177,6 +167,8 @@ def engine_prepare(self): if use_tiles: prep.addr2 = np.ascontiguousarray(np.transpose(prep.addr, (2, 3, 0, 1))) prep.addr2_gpu = gpuarray.to_gpu(prep.addr2) + if self.do_position_refinement: + prep.mangled_addr_gpu = prep.addr_gpu.copy() for label, d in self.ptycho.new_data: prep = self.diff_info[d.ID] @@ -233,47 +225,33 @@ def engine_iterate(self, num=1): ## compute log-likelihood if self.p.compute_log_likelihood: - t1 = time.time() AWK.build_aux_no_ex(aux, addr, ob, pr) PROP.fw(aux, aux) FUK.log_likelihood(aux, addr, mag, ma, err_phot) - self.benchmark.F_LLerror += time.time() - t1 ## build auxilliary wave - t1 = time.time() AWK.build_aux(aux, addr, ob, pr, ex, alpha=self.p.alpha) - self.benchmark.A_Build_aux += time.time() - t1 ## forward FFT - t1 = time.time() PROP.fw(aux, aux) - self.benchmark.B_Prop += time.time() - t1 ## Deviation from measured data - t1 = time.time() FUK.fourier_error(aux, addr, mag, ma, ma_sum) FUK.error_reduce(addr, err_fourier) FUK.fmag_all_update(aux, addr, mag, ma, err_fourier, pbound) - self.benchmark.C_Fourier_update += time.time() - t1 ## backward FFT - t1 = time.time() PROP.bw(aux, aux) - self.benchmark.D_iProp += time.time() - t1 ## build exit wave - t1 = time.time() - AWK.build_exit(aux, 
addr, ob, pr, ex) + AWK.build_exit(aux, addr, ob, pr, ex, alpha=self.p.alpha) FUK.exit_error(aux, addr) FUK.error_reduce(addr, err_exit) - self.benchmark.E_Build_exit += time.time() - t1 - - self.benchmark.calls_fourier += 1 parallel.barrier() sync = (self.curiter % 1 == 0) - self.overlap_update(MPI=MPI) + self.overlap_update() parallel.barrier() if self.do_position_refinement and (self.curiter): @@ -285,7 +263,7 @@ def engine_iterate(self, num=1): """ Iterates through all positions and refines them by a given algorithm. """ - log(3, "----------- START POS REF -------------") + log(4, "----------- START POS REF -------------") for dID in self.di.S.keys(): prep = self.diff_info[dID] @@ -297,39 +275,53 @@ def engine_iterate(self, num=1): aux = kern.aux addr = prep.addr_gpu original_addr = prep.original_addr + mangled_addr = prep.mangled_addr_gpu mag = prep.mag ma_sum = prep.ma_sum err_fourier = prep.err_fourier_gpu + error_state = prep.error_state_gpu PCK = kern.PCK - AUK = kern.AUK - - #error_state = np.zeros(err_fourier.shape, dtype=np.float32) - #error_state[:] = err_fourier.get() - cuda.memcpy_dtod(dest=prep.error_state_gpu.ptr, + TK = kern.TK + PROP = kern.PROP + + # Keep track of object boundaries + max_oby = ob.shape[-2] - aux.shape[-2] - 1 + max_obx = ob.shape[-1] - aux.shape[-1] - 1 + + # We need to re-calculate the current error + PCK.build_aux(aux, addr, ob, pr) + PROP.fw(aux, aux) + if self.p.position_refinement.metric == "fourier": + PCK.fourier_error(aux, addr, mag, ma, ma_sum) + PCK.error_reduce(addr, err_fourier) + if self.p.position_refinement.metric == "photon": + PCK.log_likelihood(aux, addr, mag, ma, err_fourier) + cuda.memcpy_dtod(dest=error_state.ptr, src=err_fourier.ptr, size=err_fourier.nbytes) + + PCK.mangler.setup_shifts(self.curiter, nframes=addr.shape[0]) + log(4, 'Position refinement trial: iteration %s' % (self.curiter)) - for i in range(self.p.position_refinement.nshifts): - mangled_addr = PCK.address_mangler.mangle_address(addr.get(), original_addr, self.curiter) - mangled_addr_gpu = gpuarray.to_gpu(mangled_addr) - PCK.build_aux(aux, mangled_addr_gpu, ob, pr) + for i in range(PCK.mangler.nshifts): + PCK.mangler.get_address(i, addr, mangled_addr, max_oby, max_obx) + PCK.build_aux(aux, mangled_addr, ob, pr) PROP.fw(aux, aux) - PCK.fourier_error(aux, mangled_addr_gpu, mag, ma, ma_sum) - PCK.error_reduce(mangled_addr_gpu, err_fourier) - PCK.update_addr_and_error_state(addr, - prep.error_state_gpu, - mangled_addr_gpu, - err_fourier) + if self.p.position_refinement.metric == "fourier": + PCK.fourier_error(aux, mangled_addr, mag, ma, ma_sum) + PCK.error_reduce(mangled_addr, err_fourier) + if self.p.position_refinement.metric == "photon": + PCK.log_likelihood(aux, mangled_addr, mag, ma, err_fourier) + PCK.update_addr_and_error_state(addr, error_state, mangled_addr, err_fourier) - # prep.err_fourier_gpu.set(error_state) - cuda.memcpy_dtod(dest=prep.err_fourier_gpu.ptr, - src=prep.error_state_gpu.ptr, - size=prep.err_fourier_gpu.nbytes) + cuda.memcpy_dtod(dest=err_fourier.ptr, + src=error_state.ptr, + size=err_fourier.nbytes) if use_tiles: s1 = addr.shape[0] * addr.shape[1] s2 = addr.shape[2] * addr.shape[3] - AUK.transpose(addr.reshape(s1, s2), prep.addr2_gpu.reshape(s2, s1)) + TK.transpose(addr.reshape(s1, s2), prep.addr2_gpu.reshape(s2, s1)) self.curiter += 1 queue.synchronize() @@ -354,7 +346,6 @@ def engine_iterate(self, num=1): ## object update def object_update(self, MPI=False): - t1 = time.time() use_atomics = self.p.object_update_cuda_atomics queue = self.queue 
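+ # A layout sketch, inferred from the kernels above: each view's address record holds five (index, y, x) triples packed into 15 ints (hence addr_stride = 15 in pr_update_local), with offsets 0, 3 and 6 locating the probe, object and exit wave respectively: + # + # pa = addr_flat[n * 15 + 0 : n * 15 + 3] # probe (mode, y, x) + # oa = addr_flat[n * 15 + 3 : n * 15 + 6] # object (layer, y, x) + # ea = addr_flat[n * 15 + 6 : n * 15 + 9] # exit wave (index, y, x) + # + # The atomic update scatters into shared object/probe pixels via atomicAdd; the tiled (atomics=False) variant instead consumes the (2, 3, 0, 1)-transposed copy prep.addr2 prepared in engine_prepare.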
queue.synchronize() @@ -363,11 +354,10 @@ cfact = self.ob_cfact[oID] if self.p.obj_smooth_std is not None: - logger.info('Smoothing object, cfact is %.2f' % cfact) + log(4, 'Smoothing object, cfact is %.2f' % cfact) + obb = self.ob_buf.S[oID] smooth_mfs = [self.p.obj_smooth_std, self.p.obj_smooth_std] - ob_gpu_tmp = gpuarray.empty(ob.shape, dtype=np.complex64) - self.GSK.convolution(ob.gpu, ob_gpu_tmp, smooth_mfs) - ob.gpu = ob_gpu_tmp + self.GSK.convolution(ob.gpu, smooth_mfs, tmp=obb.gpu) ob.gpu *= cfact obn.gpu.fill(cfact) @@ -393,33 +383,19 @@ for oID, ob in self.ob.storages.items(): obn = self.ob_nrm.S[oID] - # MPI test - if MPI: - ob.data[:] = ob.gpu.get() - obn.data[:] = obn.gpu.get() - queue.synchronize() - parallel.allreduce(ob.data) - parallel.allreduce(obn.data) - ob.data /= obn.data - - self.clip_object(ob) - ob.gpu.set(ob.data) - else: - ob.gpu /= obn.gpu + self.multigpu.allReduceSum(ob.gpu) + self.multigpu.allReduceSum(obn.gpu) + ob.gpu /= obn.gpu + self.clip_object(ob.gpu) queue.synchronize() - # print 'object update: ' + str(time.time()-t1) - self.benchmark.object_update += time.time() - t1 - self.benchmark.calls_object += 1 - ## probe update def probe_update(self, MPI=False): - t1 = time.time() queue = self.queue # storage for-loop - change = 0 + change_gpu = gpuarray.zeros((1,), dtype=np.float32) cfact = self.p.probe_inertia use_atomics = self.p.probe_update_cuda_atomics for pID, pr in self.pr.storages.items(): @@ -450,35 +426,56 @@ buf = self.pr_buf.S[pID] prn = self.pr_nrm.S[pID] - if MPI: - pr.data[:] = pr.gpu.get() - prn.data[:] = prn.gpu.get() - queue.synchronize() - parallel.allreduce(pr.data) - parallel.allreduce(prn.data) - pr.data /= prn.data - self.support_constraint(pr) - pr.gpu.set(pr.data) - else: - pr.gpu /= prn.gpu - pr.data[:] = pr.gpu.get() - self.support_constraint(pr) - pr.gpu.set(pr.data) - - ## this should be done on GPU - queue.synchronize() - change += u.norm2(pr.data - buf.data) / u.norm2(pr.data) - buf.data[:] = pr.data - if MPI: - change = parallel.allreduce(change) / parallel.size + self.multigpu.allReduceSum(pr.gpu) + self.multigpu.allReduceSum(prn.gpu) + pr.gpu /= prn.gpu + self.support_constraint(pr) - # print 'probe update: ' + str(time.time()-t1) - self.benchmark.probe_update += time.time() - t1 - self.benchmark.calls_probe += 1 + ## calculate change on GPU + queue.synchronize() + AUK = self.kernels[list(self.kernels)[0]].AUK + buf.gpu -= pr.gpu + change_gpu += (AUK.norm2(buf.gpu) / AUK.norm2(pr.gpu)) + buf.gpu[:] = pr.gpu + self.multigpu.allReduceSum(change_gpu) + change = change_gpu.get().item() / parallel.size return np.sqrt(change) - def engine_finalize(self): + def support_constraint(self, storage=None): + """ + Enforces 2D support constraint on probe.
+ """ + if storage is None: + for s in self.pr.storages.values(): + self.support_constraint(s) + + # Fourier space + support = self._probe_fourier_support.get(storage.ID) + if support is not None: + if storage.ID not in self.FSK: + supp = support.astype(np.complex64) + self.FSK[storage.ID] = FourierSupportKernel(supp, self.queue, self.p.fft_lib) + self.FSK[storage.ID].allocate() + self.FSK[storage.ID].apply_fourier_support(storage.gpu) + + # Real space + support = self._probe_support.get(storage.ID) + if support is not None: + if storage.ID not in self.RSK: + self.RSK[storage.ID] = RealSupportKernel(support.astype(np.complex64)) + self.RSK[storage.ID].allocate() + self.RSK[storage.ID].apply_real_support(storage.gpu) + + def clip_object(self, ob): + """ + Clips magnitudes of object into given range. + """ + if self.p.clip_object is not None: + cmin, cmax = self.p.clip_object + self.CMK.clip_magnitudes_to_range(ob, cmin, cmax) + + def engine_finalize(self, benchmark=False): """ clear GPU data and destroy context. """ @@ -495,9 +492,11 @@ def engine_finalize(self): for dID, prep in self.diff_info.items(): prep.addr = prep.addr_gpu.get() - # copy data to cpu + # copy data to cpu + # this kills the pagelock memory (otherwise we get segfaults in h5py) for name, s in self.pr.S.items(): - s.data = np.copy(s.data) # is this the same as s.data.get()? + s.data = np.copy(s.data) + self.context.pop() self.context.detach() - super(DM_pycuda, self).engine_finalize() \ No newline at end of file + super(DM_pycuda, self).engine_finalize(benchmark) diff --git a/ptypy/accelerate/cuda_pycuda/engines/DM_pycuda_stream.py b/ptypy/accelerate/cuda_pycuda/engines/DM_pycuda_stream.py index 820124b5f..9306475b1 100644 --- a/ptypy/accelerate/cuda_pycuda/engines/DM_pycuda_stream.py +++ b/ptypy/accelerate/cuda_pycuda/engines/DM_pycuda_stream.py @@ -28,9 +28,6 @@ from ..mem_utils import make_pagelocked_paired_arrays as mppa from ..mem_utils import GpuDataManager2 -MPI = parallel.size > 1 -MPI = True - EX_MA_BLOCKS_RATIO = 2 MAX_BLOCKS = 99999 # can be used to limit the number of blocks, simulating that they don't fit #MAX_BLOCKS = 3 # can be used to limit the number of blocks, simulating that they don't fit @@ -69,8 +66,8 @@ def _setup_kernels(self): # TODO grow blocks dynamically nex = min(fit * EX_MA_BLOCKS_RATIO, MAX_BLOCKS) nma = min(fit, MAX_BLOCKS) - - log(3, 'PyCUDA max blocks fitting on GPU: exit arrays={}, ma_arrays={}'.format(nex, nma)) + log(4, 'Free memory on device: %.2f GB' % (float(mem)/1e9)) + log(4, 'PyCUDA max blocks fitting on GPU: exit arrays={}, ma_arrays={}'.format(nex, nma)) # reset memory or create new self.ex_data = GpuDataManager2(ex_mem, 0, nex, True) self.ma_data = GpuDataManager2(ma_mem, 0, nma, False) @@ -88,6 +85,8 @@ def engine_prepare(self): s.gpu, s.data = mppa(s.data) for name, s in self.pr.S.items(): s.gpu, s.data = mppa(s.data) + for name, s in self.pr_buf.S.items(): + s.gpu, s.data = mppa(s.data) for name, s in self.pr_nrm.S.items(): s.gpu, s.data = mppa(s.data) @@ -100,6 +99,8 @@ def engine_prepare(self): if use_tiles: prep.addr2 = np.ascontiguousarray(np.transpose(prep.addr, (2, 3, 0, 1))) prep.addr2_gpu = gpuarray.to_gpu(prep.addr2) + if self.do_position_refinement: + prep.mangled_addr_gpu = prep.addr_gpu.copy() for label, d in self.ptycho.new_data: dID = d.ID @@ -123,10 +124,11 @@ def engine_prepare(self): prep.mag = cuda.pagelocked_empty(mag.shape, mag.dtype, order="C", mem_flags=4) prep.mag[:] = mag + log(4, 'Free memory on device: %.2f GB' % (float(cuda.mem_get_info()[0])/1e9)) 
self.ex_data.add_data_block() self.ma_data.add_data_block() self.mag_data.add_data_block() - + def engine_iterate(self, num=1): """ Compute one iteration. @@ -136,7 +138,7 @@ def engine_iterate(self, num=1): atomics_probe = self.p.probe_update_cuda_atomics atomics_object = self.p.object_update_cuda_atomics use_tiles = (not atomics_object) or (not atomics_probe) - + for it in range(num): error = {} @@ -157,9 +159,9 @@ def engine_iterate(self, num=1): obb = self.ob_buf.S[oID] if self.p.obj_smooth_std is not None: - logger.info('Smoothing object, cfact is %.2f' % cfact) + log(4, 'Smoothing object, cfact is %.2f' % cfact) smooth_mfs = [self.p.obj_smooth_std, self.p.obj_smooth_std] - self.GSK.convolution(ob.gpu, obb.gpu, smooth_mfs) + self.GSK.convolution(ob.gpu, smooth_mfs, tmp=obb.gpu) # obb.gpu[:] = ob.gpu * cfactf32 ob.gpu._axpbz(np.complex64(cfact), 0, obb.gpu, stream=self.queue) @@ -167,7 +169,6 @@ def engine_iterate(self, num=1): # First cycle: Fourier + object update for iblock, dID in enumerate(self.dID_list): - t1 = time.time() prep = self.diff_info[dID] # find probe, object in exit ID in dependence of dID @@ -211,24 +212,18 @@ def engine_iterate(self, num=1): ## compute log-likelihood if self.p.compute_log_likelihood: - t1 = time.time() AWK.build_aux_no_ex(aux, addr, ob, pr) PROP.fw(aux, aux) # synchronize h2d stream with compute stream self.queue.wait_for_event(ev_mag) FUK.log_likelihood(aux, addr, mag, ma, err_phot) - self.benchmark.F_LLerror += time.time() - t1 # synchronize h2d stream with compute stream self.queue.wait_for_event(ev_ex) - t1 = time.time() AWK.build_aux(aux, addr, ob, pr, ex, alpha=self.p.alpha) - self.benchmark.A_Build_aux += time.time() - t1 ## FFT - t1 = time.time() PROP.fw(aux, aux) - self.benchmark.B_Prop += time.time() - t1 ## Deviation from measured data # synchronize h2d stream with compute stream @@ -237,32 +232,23 @@ def engine_iterate(self, num=1): FUK.error_reduce(addr, err_fourier) FUK.fmag_all_update(aux, addr, mag, ma, err_fourier, pbound) - self.benchmark.C_Fourier_update += time.time() - t1 data_mag.record_done(self.queue, 'compute') data_ma.record_done(self.queue, 'compute') - t1 = time.time() PROP.bw(aux, aux) ## apply changes - AWK.build_exit(aux, addr, ob, pr, ex) + AWK.build_exit(aux, addr, ob, pr, ex, alpha=self.p.alpha) FUK.exit_error(aux, addr) FUK.error_reduce(addr, err_exit) - self.benchmark.E_Build_exit += time.time() - t1 - self.benchmark.calls_fourier += 1 - prestr = '%d Iteration (Overlap) #%02d: ' % (parallel.rank, inner) # Update object if do_update_object: log(4, prestr + '----- object update -----', True) - t1 = time.time() - addrt = addr if atomics_object else addr2 self.queue.wait_for_event(ev_ex) POK.ob_update(addrt, obb, obn, pr, ex, atomics=atomics_object) - self.benchmark.object_update += time.time() - t1 - self.benchmark.calls_object += 1 data_ex.record_done(self.queue, 'compute') if iblock + len(self.ex_data) < len(self.dID_list): @@ -280,29 +266,21 @@ def engine_iterate(self, num=1): for oID, ob in self.ob.storages.items(): obn = self.ob_nrm.S[oID] obb = self.ob_buf.S[oID] - # MPI test - if MPI: - obb.data[:] = obb.gpu.get() - obn.data[:] = obn.gpu.get() - parallel.allreduce(obb.data) - parallel.allreduce(obn.data) - obb.data /= obn.data - self.clip_object(obb) - ob.gpu.set(obb.data) - else: - obb.gpu /= obn.gpu - ob.gpu[:] = obb.gpu + self.multigpu.allReduceSum(obb.gpu) + self.multigpu.allReduceSum(obn.gpu) + obb.gpu /= obn.gpu + + self.clip_object(obb.gpu) + ob.gpu[:] = obb.gpu # Exit if probe should not yet be updated 
if not do_update_probe: break - self.ex_data.syncback = False + # Update probe log(4, prestr + '----- probe update -----', True) - change = self.probe_update(MPI=MPI) - # change = self.probe_update(MPI=(parallel.size>1 and MPI)) - + change = self.probe_update() log(4, prestr + 'change in probe is %.3f' % change, True) # stop iteration if probe change is small @@ -320,7 +298,7 @@ def engine_iterate(self, num=1): """ Iterates through all positions and refines them by a given algorithm. """ - log(3, "----------- START POS REF -------------") + log(4, "----------- START POS REF -------------") for dID in self.di.S.keys(): prep = self.diff_info[dID] @@ -331,43 +309,61 @@ def engine_iterate(self, num=1): aux = kern.aux addr = prep.addr_gpu original_addr = prep.original_addr + mangled_addr = prep.mangled_addr_gpu ma_sum = prep.ma_sum_gpu + err_fourier = prep.err_fourier_gpu + error_state = prep.error_state_gpu + PCK = kern.PCK - AUK = kern.AUK + TK = kern.TK PROP = kern.PROP + # Make sure our data arrays are on device ev_ma, ma, data_ma = self.ma_data.to_gpu(prep.ma, dID, self.qu_htod) ev_mag, mag, data_mag = self.mag_data.to_gpu(prep.mag, dID, self.qu_htod) - # error_state = np.zeros(err_fourier.shape, dtype=np.float32) - # err_fourier.get_async(streamdata.queue, error_state) - cuda.memcpy_dtod(dest=prep.error_state_gpu.ptr, - src=prep.err_fourier_gpu.ptr, - size=prep.err_fourier_gpu.nbytes)#, stream=self.queue) + + # Keep track of object boundaries + max_oby = ob.shape[-2] - aux.shape[-2] - 1 + max_obx = ob.shape[-1] - aux.shape[-1] - 1 + + # We need to re-calculate the current error + PCK.build_aux(aux, addr, ob, pr) + PROP.fw(aux, aux) + # wait for data to arrive + self.queue.wait_for_event(ev_mag) + + # We need to re-calculate the current error + if self.p.position_refinement.metric == "fourier": + PCK.fourier_error(aux, addr, mag, ma, ma_sum) + PCK.error_reduce(addr, err_fourier) + if self.p.position_refinement.metric == "photon": + PCK.log_likelihood(aux, addr, mag, ma, err_fourier) + cuda.memcpy_dtod_async(dest=error_state.ptr, + src=err_fourier.ptr, + size=err_fourier.nbytes, stream=self.queue) + log(4, 'Position refinement trial: iteration %s' % (self.curiter)) - for i in range(self.p.position_refinement.nshifts): - mangled_addr = PCK.address_mangler.mangle_address(addr.get(), original_addr, self.curiter) - mangled_addr_gpu = gpuarray.to_gpu(mangled_addr) - PCK.build_aux(aux, mangled_addr_gpu, ob, pr) + PCK.mangler.setup_shifts(self.curiter, nframes=addr.shape[0]) + for i in range(PCK.mangler.nshifts): + PCK.mangler.get_address(i, addr, mangled_addr, max_oby, max_obx) + PCK.build_aux(aux, mangled_addr, ob, pr) PROP.fw(aux, aux) - # wait for data to arrive - self.queue.wait_for_event(ev_mag) - PCK.fourier_error(aux, mangled_addr_gpu, mag, ma, ma_sum) - PCK.error_reduce(mangled_addr_gpu, prep.err_fourier_gpu) - # err_fourier_cpu = err_fourier.get_async(streamdata.queue) - PCK.update_addr_and_error_state(addr, - prep.error_state_gpu, - mangled_addr_gpu, - prep.err_fourier_gpu) + if self.p.position_refinement.metric == "fourier": + PCK.fourier_error(aux, mangled_addr, mag, ma, ma_sum) + PCK.error_reduce(mangled_addr, err_fourier) + if self.p.position_refinement.metric == "photon": + PCK.log_likelihood(aux, mangled_addr, mag, ma, err_fourier) + PCK.update_addr_and_error_state(addr, error_state, mangled_addr, err_fourier) data_mag.record_done(self.queue, 'compute') data_ma.record_done(self.queue, 'compute') - cuda.memcpy_dtod(dest=prep.err_fourier_gpu.ptr, - src=prep.error_state_gpu.ptr, - 
size=prep.err_fourier_gpu.nbytes) #stream=self.queue) + cuda.memcpy_dtod_async(dest=err_fourier.ptr, + src=error_state.ptr, + size=err_fourier.nbytes, stream=self.queue) if use_tiles: s1 = prep.addr_gpu.shape[0] * prep.addr_gpu.shape[1] s2 = prep.addr_gpu.shape[2] * prep.addr_gpu.shape[3] - AUK.transpose(prep.addr_gpu.reshape(s1, s2), prep.addr2_gpu.reshape(s2, s1)) + TK.transpose(prep.addr_gpu.reshape(s1, s2), prep.addr2_gpu.reshape(s2, s1)) self.curiter += 1 self.queue.synchronize() @@ -392,11 +388,10 @@ def engine_iterate(self, num=1): ## probe update def probe_update(self, MPI=False): - t1 = time.time() queue = self.queue use_atomics = self.p.probe_update_cuda_atomics # storage for-loop - change = 0 + change_gpu = gpuarray.zeros((1,), dtype=np.float32) for pID, pr in self.pr.storages.items(): prn = self.pr_nrm.S[pID] cfact = self.pr_cfact[pID] @@ -434,40 +429,22 @@ def probe_update(self, MPI=False): buf = self.pr_buf.S[pID] prn = self.pr_nrm.S[pID] - # MPI test - if MPI: - # if False: - pr.data[:] = pr.gpu.get() - prn.data[:] = prn.gpu.get() - # queue.synchronize() - parallel.allreduce(pr.data) - parallel.allreduce(prn.data) - pr.data /= prn.data - - self.support_constraint(pr) - - pr.gpu.set(pr.data) - else: - pr.gpu /= prn.gpu - # ca. 0.3 ms - # self.pr.S[pID].gpu = probe_gpu - pr.data[:] = pr.gpu.get() - - ## this should be done on GPU - - # queue.synchronize() - change += u.norm2(pr.data - buf.data) / u.norm2(pr.data) - buf.data[:] = pr.data - if MPI: - change = parallel.allreduce(change) / parallel.size - - # print 'probe update: ' + str(time.time()-t1) - self.benchmark.probe_update += time.time() - t1 - self.benchmark.calls_probe += 1 + self.multigpu.allReduceSum(pr.gpu) + self.multigpu.allReduceSum(prn.gpu) + pr.gpu /= prn.gpu + self.support_constraint(pr) + + ## calculate change on GPU + AUK = self.kernels[list(self.kernels)[0]].AUK + buf.gpu -= pr.gpu + change_gpu += (AUK.norm2(buf.gpu) / AUK.norm2(pr.gpu)) + buf.gpu[:] = pr.gpu + self.multigpu.allReduceSum(change_gpu) + change = change_gpu.get().item() / parallel.size return np.sqrt(change) - def engine_finalize(self): + def engine_finalize(self, benchmark=False): """ Clear all GPU data, pinned memory, etc """ @@ -479,4 +456,4 @@ def engine_finalize(self): for name, s in self.pr.S.items(): s.data = np.copy(s.data) # is this the same as s.data.get()? - super().engine_finalize() + super().engine_finalize(benchmark) diff --git a/ptypy/accelerate/cuda_pycuda/engines/DM_pycuda_streams.py b/ptypy/accelerate/cuda_pycuda/engines/DM_pycuda_streams.py index 36aadfe1b..d2db342f5 100644 --- a/ptypy/accelerate/cuda_pycuda/engines/DM_pycuda_streams.py +++ b/ptypy/accelerate/cuda_pycuda/engines/DM_pycuda_streams.py @@ -20,9 +20,6 @@ from . 
import DM_pycuda from ..mem_utils import GpuDataManager -MPI = parallel.size > 1 -MPI = True - # factor how many more exit waves we wanna keep on GPU compared to # ma / mag data EX_MA_BLOCKS_RATIO = 2 @@ -149,6 +146,12 @@ def engine_prepare(self): s.data = cuda.pagelocked_empty(d.shape, d.dtype, order="C", mem_flags=0) s.data[:] = d s.gpu = gpuarray.to_gpu(s.data) + for name, s in self.pr_buf.S.items(): + # pr + d = s.data + s.data = cuda.pagelocked_empty(d.shape, d.dtype, order="C", mem_flags=0) + s.data[:] = d + s.gpu = gpuarray.to_gpu(s.data) for name, s in self.pr_nrm.S.items(): # prn d = s.data @@ -169,6 +172,8 @@ def engine_prepare(self): if use_tiles: prep.addr2 = np.ascontiguousarray(np.transpose(prep.addr, (2, 3, 0, 1))) prep.addr2_gpu = gpuarray.to_gpu(prep.addr2) + if self.do_position_refinement: + prep.mangled_addr_gpu = prep.addr_gpu.copy() prep.ma_sum_gpu = gpuarray.to_gpu(prep.ma_sum) # prepare page-locked mems: @@ -206,7 +211,7 @@ def engine_prepare(self): nma = min(fit, blocks) nstreams = min(MAX_STREAMS, blocks) - log(3, 'PyCUDA blocks fitting on GPU: exit arrays={}, ma_arrays={}, streams={}, totalblocks={}'.format(nex, nma, nstreams, blocks)) + log(4, 'PyCUDA blocks fitting on GPU: exit arrays={}, ma_arrays={}, streams={}, totalblocks={}'.format(nex, nma, nstreams, blocks)) # reset memory or create new if self.ex_data is not None: self.ex_data.reset(ex_mem, nex) @@ -256,13 +261,13 @@ def engine_iterate(self, num=1): cfact = self.ob_cfact[oID] obn = self.ob_nrm.S[oID] obb = self.ob_buf.S[oID] - + if self.p.obj_smooth_std is not None: - logger.info('Smoothing object, cfact is %.2f' % cfact) + log(4,'Smoothing object, cfact is %.2f' % cfact) smooth_mfs = [self.p.obj_smooth_std, self.p.obj_smooth_std] - self.GSK.convolution(ob.gpu, obb.gpu, smooth_mfs) + self.GSK.convolution(ob.gpu, smooth_mfs, tmp=obb.gpu) - obb.gpu._axpbz(np.complex64(cfact), 0, obb.gpu, stream=streamdata.queue) + ob.gpu._axpbz(np.complex64(cfact), 0, obb.gpu, stream=streamdata.queue) obn.gpu.fill(np.float32(cfact), stream=streamdata.queue) self.ex_data.syncback = True @@ -350,7 +355,7 @@ def engine_iterate(self, num=1): t1 = time.time() PROP.bw(aux, aux) ## apply changes - AWK.build_exit(aux, addr, ob, pr, ex) + AWK.build_exit(aux, addr, ob, pr, ex, alpha=self.p.alpha) FUK.exit_error(aux, addr) FUK.error_reduce(addr, err_exit) self.benchmark.E_Build_exit += time.time() - t1 @@ -391,8 +396,7 @@ def engine_iterate(self, num=1): # Update probe log(4, prestr + '----- probe update -----', True) self.ex_data.syncback = False - change = self.probe_update(MPI=MPI) - # change = self.probe_update(MPI=(parallel.size>1 and MPI)) + change = self.probe_update() # swap direction for next time self.dID_list.reverse() @@ -416,7 +420,7 @@ def engine_iterate(self, num=1): """ Iterates through all positions and refines them by a given algorithm. 
""" - log(3, "----------- START POS REF -------------") + log(4, "----------- START POS REF -------------") prev_event = None for dID in self.di.S.keys(): streamdata = self.streams[self.cur_stream] @@ -429,46 +433,59 @@ def engine_iterate(self, num=1): aux = kern.aux addr = prep.addr_gpu original_addr = prep.original_addr + mangled_addr = prep.mangled_addr_gpu ma_sum = prep.ma_sum_gpu ma, mag = streamdata.ma_to_gpu(dID, prep.ma, prep.mag) + err_fourier = prep.err_fourier_gpu + error_state = prep.error_state_gpu PCK = kern.PCK - AUK = kern.AUK + TK = kern.TK + PROP = kern.PROP PCK.queue = streamdata.queue + TK.queue = streamdata.queue PROP.queue = streamdata.queue - AUK.queue = streamdata.queue - #error_state = np.zeros(err_fourier.shape, dtype=np.float32) - #err_fourier.get_async(streamdata.queue, error_state) - cuda.memcpy_dtod_async(dest=prep.error_state_gpu.ptr, - src=prep.err_fourier_gpu.ptr, - size=prep.err_fourier_gpu.nbytes, + # Keep track of object boundaries + max_oby = ob.shape[-2] - aux.shape[-2] - 1 + max_obx = ob.shape[-1] - aux.shape[-1] - 1 + + # We need to re-calculate the current error + PCK.build_aux(aux, addr, ob, pr) + PROP.fw(aux, aux) + if self.p.position_refinement.metric == "fourier": + PCK.fourier_error(aux, addr, mag, ma, ma_sum) + PCK.error_reduce(addr, err_fourier) + if self.p.position_refinement.metric == "photon": + PCK.log_likelihood(aux, addr, mag, ma, err_fourier) + cuda.memcpy_dtod_async(dest=error_state.ptr, + src=err_fourier.ptr, + size=err_fourier.nbytes, stream=streamdata.queue) streamdata.start_compute(prev_event) log(4, 'Position refinement trial: iteration %s' % (self.curiter)) - for i in range(self.p.position_refinement.nshifts): - addr_cpu = addr.get_async(streamdata.queue) + PCK.mangler.setup_shifts(self.curiter, nframes=addr.shape[0]) + for i in range(PCK.mangler.nshifts): streamdata.queue.synchronize() - mangled_addr = PCK.address_mangler.mangle_address(addr_cpu, original_addr, self.curiter) - mangled_addr_gpu = gpuarray.to_gpu_async(mangled_addr, stream=streamdata.queue) - PCK.build_aux(aux, mangled_addr_gpu, ob, pr) + PCK.mangler.get_address(i, addr, mangled_addr, max_oby, max_obx) + PCK.build_aux(aux, mangled_addr, ob, pr) PROP.fw(aux, aux) - PCK.fourier_error(aux, mangled_addr_gpu, mag, ma, ma_sum) - PCK.error_reduce(mangled_addr_gpu, prep.err_fourier_gpu) - # err_fourier_cpu = err_fourier.get_async(streamdata.queue) - PCK.update_addr_and_error_state(addr, - prep.error_state_gpu, - mangled_addr_gpu, - prep.err_fourier_gpu) - cuda.memcpy_dtod_async(dest=prep.err_fourier_gpu.ptr, - src=prep.error_state_gpu.ptr, - size=prep.err_fourier_gpu.nbytes, + if self.p.position_refinement.metric == "fourier": + PCK.fourier_error(aux, mangled_addr, mag, ma, ma_sum) + PCK.error_reduce(mangled_addr, err_fourier) + if self.p.position_refinement.metric == "photon": + PCK.log_likelihood(aux, mangled_addr, mag, ma, err_fourier) + PCK.update_addr_and_error_state(addr, error_state, mangled_addr, err_fourier) + + cuda.memcpy_dtod_async(dest=err_fourier.ptr, + src=error_state.ptr, + size=err_fourier.nbytes, stream=streamdata.queue) if use_tiles: s1 = prep.addr_gpu.shape[0] * prep.addr_gpu.shape[1] s2 = prep.addr_gpu.shape[2] * prep.addr_gpu.shape[3] - AUK.transpose(prep.addr_gpu.reshape(s1, s2), prep.addr2_gpu.reshape(s2, s1)) + TK.transpose(prep.addr_gpu.reshape(s1, s2), prep.addr2_gpu.reshape(s2, s1)) prev_event = streamdata.end_compute() @@ -486,7 +503,6 @@ def engine_iterate(self, num=1): for name, s in self.pr.S.items(): s.gpu.get(s.data) - # FIXXME: copy to 
pinned memory for dID, prep in self.diff_info.items(): err_fourier = prep.err_fourier_gpu.get() @@ -498,7 +514,6 @@ def engine_iterate(self, num=1): self.error = error return error - def _object_allreduce(self): # make sure that all transfers etc are finished for sd in self.streams: @@ -507,20 +522,12 @@ def _object_allreduce(self): for oID, ob in self.ob.storages.items(): obn = self.ob_nrm.S[oID] obb = self.ob_buf.S[oID] - if MPI: - obb.gpu.get(obb.data) - obn.gpu.get(obn.data) - parallel.allreduce(obb.data) - parallel.allreduce(obn.data) - obb.data /= obn.data - self.clip_object(obb) - tt1 = time.time() - ob.gpu.set(obb.data) # async tx on same stream? - - else: - obb.gpu /= obn.gpu - ob.gpu[:] = obb.gpu - + self.multigpu.allReduceSum(obb.gpu) + self.multigpu.allReduceSum(obn.gpu) + obb.gpu /= obn.gpu + + self.clip_object(obb.gpu) + ob.gpu[:] = obb.gpu ## probe update def probe_update(self, MPI=False): @@ -528,7 +535,7 @@ def probe_update(self, MPI=False): streamdata = self.streams[self.cur_stream] use_atomics = self.p.probe_update_cuda_atomics # storage for-loop - change = 0 + change_gpu = gpuarray.zeros((1,), dtype=np.float32) prev_event = None for pID, pr in self.pr.storages.items(): prn = self.pr_nrm.S[pID] @@ -560,7 +567,6 @@ def probe_update(self, MPI=False): prev_event = streamdata.end_compute() self.cur_stream = (self.cur_stream + self.stream_direction) % len(self.streams) - # sync all streams first for sd in self.streams: sd.synchronize() @@ -569,31 +575,19 @@ def probe_update(self, MPI=False): buf = self.pr_buf.S[pID] prn = self.pr_nrm.S[pID] - - # MPI test - if MPI: - # if False: - pr.gpu.get(pr.data) - prn.gpu.get(prn.data) - parallel.allreduce(pr.data) - parallel.allreduce(prn.data) - pr.data /= prn.data - self.support_constraint(pr) - pr.gpu.set(pr.data) - else: - pr.gpu /= prn.gpu - # ca. 0.3 ms - # self.pr.S[pID].gpu = probe_gpu - pr.gpu.get(pr.data) - - ## this should be done on GPU - tt1 = time.time() - change += u.norm2(pr.data - buf.data) / u.norm2(pr.data) - buf.data[:] = pr.data - if MPI: - change = parallel.allreduce(change) / parallel.size - tt2 = time.time() - #print('time for pr change: {}s'.format(tt2-tt1)) + + self.multigpu.allReduceSum(pr.gpu) + self.multigpu.allReduceSum(prn.gpu) + pr.gpu /= prn.gpu + self.support_constraint(pr) + + ## calculate change on GPU + AUK = self.kernels[list(self.kernels)[0]].AUK + buf.gpu -= pr.gpu + change_gpu += (AUK.norm2(buf.gpu) / AUK.norm2(pr.gpu)) + buf.gpu[:] = pr.gpu + self.multigpu.allReduceSum(change_gpu) + change = change_gpu.get().item() / parallel.size # print 'probe update: ' + str(time.time()-t1) self.benchmark.probe_update += time.time() - t1 @@ -601,7 +595,7 @@ def probe_update(self, MPI=False): return np.sqrt(change) - def engine_finalize(self): + def engine_finalize(self, benchmark=False): """ Clear all GPU data, pinned memory, etc """ @@ -610,4 +604,4 @@ def engine_finalize(self): self.ma_data = None self.mag_data = None - super().engine_finalize() + super().engine_finalize(benchmark) diff --git a/ptypy/accelerate/cuda_pycuda/engines/DR_pycuda.py b/ptypy/accelerate/cuda_pycuda/engines/DR_pycuda.py new file mode 100644 index 000000000..0454e753c --- /dev/null +++ b/ptypy/accelerate/cuda_pycuda/engines/DR_pycuda.py @@ -0,0 +1,288 @@ +# -*- coding: utf-8 -*- +""" +Local Douglas-Rachford reconstruction engine. + +This file is part of the PTYPY package. + + :copyright: Copyright 2014 by the PTYPY team, see AUTHORS. + :license: GPLv2, see LICENSE for details. 
+""" + +import numpy as np +import time +from pycuda import gpuarray +import pycuda.driver as cuda + +from ptypy import utils as u +from ptypy.utils.verbose import logger, log +from ptypy.utils import parallel +from ptypy.engines import register +from ptypy.accelerate.base.engines import DR_serial +from ptypy.accelerate.base import address_manglers +from .. import get_context +from ..kernels import FourierUpdateKernel, AuxiliaryWaveKernel, PoUpdateKernel, PositionCorrectionKernel, PropagationKernel +from ..array_utils import ArrayUtilsKernel, GaussianSmoothingKernel, TransposeKernel +from ..mem_utils import make_pagelocked_paired_arrays as mppa + +MPI = False + + +__all__ = ['DR_pycuda'] + +@register() +class DR_pycuda(DR_serial.DR_serial): + + """ + Defaults: + + [fft_lib] + default = reikna + type = str + help = Choose the pycuda-compatible FFT module. + doc = One of: + - ``'reikna'`` : the reikna packaga (fast load, competitive compute for streaming) + - ``'cuda'`` : ptypy's cuda wrapper (delayed load, but fastest compute if all data is on GPU) + - ``'skcuda'`` : scikit-cuda (fast load, slowest compute due to additional store/load stages) + choices = 'reikna','cuda','skcuda' + userlevel = 2 + + """ + + def __init__(self, ptycho_parent, pars=None): + """ + Difference map reconstruction engine. + """ + super(DR_pycuda, self).__init__(ptycho_parent, pars) + + + def engine_initialize(self): + """ + Prepare for reconstruction. + """ + self.context, self.queue = get_context(new_context=True, new_queue=True) + + super(DR_pycuda, self).engine_initialize() + + def _setup_kernels(self): + """ + Setup kernels, one for each scan. Derive scans from ptycho class + """ + # get the scans + for label, scan in self.ptycho.model.scans.items(): + + kern = u.Param() + self.kernels[label] = kern + # TODO: needs to be adapted for broad bandwidth + geo = scan.geometries[0] + + # Get info to shape buffer arrays + # TODO: make this part of the engine rather than scan + fpc = self.ptycho.frames_per_block + + # Currently modes not implemented for DR algorithm + #assert scan.p.coherence.num_probe_modes == 1 + #assert scan.p.coherence.num_object_modes == 1 + try: + nmodes = scan.p.coherence.num_probe_modes * \ + scan.p.coherence.num_object_modes + except: + nmodes = 1 + + # create buffer arrays + fpc = 1 + ash = (fpc * nmodes,) + tuple(geo.shape) + aux = np.zeros(ash, dtype=np.complex64) + kern.aux = gpuarray.to_gpu(aux) + + # setup kernels, one for each SCAN. 
+ logger.info("Setting up FourierUpdateKernel") + kern.FUK = FourierUpdateKernel(aux, nmodes, queue_thread=self.queue) + kern.FUK.fshape = (1,) + kern.FUK.fshape[1:] + kern.FUK.allocate() + + logger.info("Setting up PoUpdateKernel") + kern.POK = PoUpdateKernel(queue_thread=self.queue) + kern.POK.allocate() + + logger.info("Setting up AuxiliaryWaveKernel") + kern.AWK = AuxiliaryWaveKernel(queue_thread=self.queue) + kern.AWK.allocate() + + logger.info("Setting up ArrayUtilsKernel") + kern.AUK = ArrayUtilsKernel(queue=self.queue) + + #logger.info("Setting up TransposeKernel") + #kern.TK = TransposeKernel(queue=self.queue) + + logger.info("Setting up PropagationKernel") + kern.PROP = PropagationKernel(aux, geo.propagator, self.queue, self.p.fft_lib) + kern.PROP.allocate() + kern.resolution = geo.resolution[0] + + # if self.do_position_refinement: + # logger.info("Setting up position correction") + # addr_mangler = address_manglers.RandomIntMangle(int(self.p.position_refinement.amplitude // geo.resolution[0]), + # self.p.position_refinement.start, + # self.p.position_refinement.stop, + # max_bound=int(self.p.position_refinement.max_shift // geo.resolution[0]), + # randomseed=0) + # logger.warning("amplitude is %s " % (self.p.position_refinement.amplitude // geo.resolution[0])) + # logger.warning("max bound is %s " % (self.p.position_refinement.max_shift // geo.resolution[0])) + + # kern.PCK = PositionCorrectionKernel(aux, nmodes, queue_thread=self.queue) + # kern.PCK.allocate() + # kern.PCK.address_mangler = addr_mangler + + logger.info("Kernel setup completed") + + + def engine_prepare(self): + + super(DR_pycuda, self).engine_prepare() + + for name, s in self.ob.S.items(): + s.gpu = gpuarray.to_gpu(s.data) + for name, s in self.pr.S.items(): + s.gpu, s.data = mppa(s.data) + + # TODO : like the serialization this one is needed due to object reformatting + for label, d in self.di.storages.items(): + prep = self.diff_info[d.ID] + prep.addr_gpu = gpuarray.to_gpu(prep.addr) + + for label, d in self.ptycho.new_data: + prep = self.diff_info[d.ID] + prep.ex = gpuarray.to_gpu(prep.ex) + prep.mag = gpuarray.to_gpu(prep.mag) + prep.ma = gpuarray.to_gpu(prep.ma) + prep.ma_sum = gpuarray.to_gpu(prep.ma_sum) + prep.err_fourier_gpu = gpuarray.to_gpu(prep.err_fourier) + prep.err_phot_gpu = gpuarray.to_gpu(prep.err_phot) + prep.err_exit_gpu = gpuarray.to_gpu(prep.err_exit) + # if self.do_position_refinement: + # prep.error_state_gpu = gpuarray.empty_like(prep.err_fourier_gpu) + + + def engine_iterate(self, num=1): + """ + Compute one iteration. 
+ """ + queue = self.queue + error = {} + for it in range(num): + + for dID in self.di.S.keys(): + + # find probe, object and exit ID in dependence of dID + prep = self.diff_info[dID] + pID, oID, eID = prep.poe_IDs + + # references for kernels + kern = self.kernels[prep.label] + FUK = kern.FUK + AWK = kern.AWK + POK = kern.POK + PROP = kern.PROP + + # get aux buffer + aux = kern.aux + + # local references + ob = self.ob.S[oID].gpu + pr = self.pr.S[pID].gpu + + # shuffle view order + vieworder = prep.vieworder + prep.rng.shuffle(vieworder) + + # Iterate through views + for i in vieworder: + + # Get local adress and arrays + addr = prep.addr_gpu[i,None] + ex_from, ex_to = prep.addr_ex[i] + ex = prep.ex[ex_from:ex_to] + mag = prep.mag[i,None] + ma = prep.ma[i,None] + ma_sum = prep.ma_sum[i,None] + err_phot = prep.err_phot_gpu[i,None] + err_fourier = prep.err_fourier_gpu[i,None] + err_exit = prep.err_exit_gpu[i,None] + + ## build auxilliary wave + AWK.build_aux2(aux, addr, ob, pr, ex, alpha=self.p.alpha) + + ## forward FFT + PROP.fw(aux, aux) + + ## Deviation from measured data + if self.p.compute_fourier_error: + FUK.fourier_error(aux, addr, mag, ma, ma_sum) + FUK.error_reduce(addr, err_fourier) + else: + FUK.fourier_deviation(aux, addr, mag) + FUK.fmag_update_nopbound(aux, addr, mag, ma) + + ## backward FFT + PROP.bw(aux, aux) + + ## build exit wave + AWK.build_exit_alpha_tau(aux, addr, ob, pr, ex, alpha=self.p.alpha, tau=self.p.tau) + if self.p.compute_exit_error: + FUK.exit_error(aux,addr) + FUK.error_reduce(addr, err_exit) + + ## probe/object rescale + #if self.p.rescale_probe: + # pr *= np.sqrt(self.mean_power / (np.abs(pr)**2).mean()) + + ## build auxilliary wave (ob * pr product) + AWK.build_aux2_no_ex(aux, addr, ob, pr) + + # object update + POK.ob_update_local(addr, ob, pr, ex, aux) + + # probe update + POK.pr_update_local(addr, pr, ob, ex, aux) + + ## compute log-likelihood + if self.p.compute_log_likelihood: + PROP.fw(aux, aux) + FUK.log_likelihood2(aux, addr, mag, ma, err_phot) + + self.curiter += 1 + + queue.synchronize() + for name, s in self.ob.S.items(): + s.gpu.get(s.data) + for name, s in self.pr.S.items(): + s.gpu.get(s.data) + + for dID, prep in self.diff_info.items(): + err_fourier = prep.err_fourier_gpu.get() + err_phot = prep.err_phot_gpu.get() + err_exit = prep.err_exit_gpu.get() + errs = np.ascontiguousarray(np.vstack([err_fourier, err_phot, err_exit]).T) + error.update(zip(prep.view_IDs, errs)) + + self.error = error + return error + + def engine_finalize(self): + """ + clear GPU data and destroy context. + """ + for name, s in self.ob.S.items(): + del s.gpu + for name, s in self.pr.S.items(): + del s.gpu + for dID, prep in self.diff_info.items(): + prep.addr = prep.addr_gpu.get() + + # copy data to cpu + # this kills the pagelock memory (otherwise we get segfaults in h5py) + for name, s in self.pr.S.items(): + s.data = np.copy(s.data) + + self.context.detach() + super(DR_pycuda, self).engine_finalize() \ No newline at end of file diff --git a/ptypy/accelerate/cuda_pycuda/engines/DR_pycuda_stream.py b/ptypy/accelerate/cuda_pycuda/engines/DR_pycuda_stream.py new file mode 100644 index 000000000..fd8dd4b5e --- /dev/null +++ b/ptypy/accelerate/cuda_pycuda/engines/DR_pycuda_stream.py @@ -0,0 +1,260 @@ +# -*- coding: utf-8 -*- +""" +Local Douglas-Rachford reconstruction engine for NVIDIA GPUs. + +This engine uses three streams, one for the compute queue and one for each I/O queue. +Events are used to synchronize download / compute/ upload. 
we cannot manipulate memory +for each loop over the state vector, a certain number of memory sections is preallocated +and reused. + +This file is part of the PTYPY package. + + :copyright: Copyright 2014 by the PTYPY team, see AUTHORS. + :license: GPLv2, see LICENSE for details. +""" + +from ptypy.accelerate.cuda_pycuda.engines.DM_pycuda_stream import DM_pycuda_stream +import numpy as np +from pycuda import gpuarray +import pycuda.driver as cuda + +from ptypy import utils as u +from ptypy.utils.verbose import logger, log +from ptypy.utils import parallel +from ptypy.engines import register +from . import DR_pycuda + +from ..mem_utils import make_pagelocked_paired_arrays as mppa +from ..mem_utils import GpuDataManager2 + +MPI = False + +EX_MA_BLOCKS_RATIO = 2 +MAX_BLOCKS = 99999 # can be used to limit the number of blocks, simulating that they don't fit +#MAX_BLOCKS = 4 # can be used to limit the number of blocks, simulating that they don't fit + +__all__ = ['DR_pycuda_stream'] + +@register() +class DR_pycuda_stream(DR_pycuda.DR_pycuda): + + def __init__(self, ptycho_parent, pars=None): + + super(DR_pycuda_stream, self).__init__(ptycho_parent, pars) + self.ma_data = None + self.mag_data = None + self.ex_data = None + + def engine_initialize(self): + super().engine_initialize() + self.qu_htod = cuda.Stream() + self.qu_dtoh = cuda.Stream() + + def _setup_kernels(self): + super()._setup_kernels() + ex_mem = 0 + mag_mem = 0 + fpc = self.ptycho.frames_per_block + for scan, kern in self.kernels.items(): + ex_mem = max(kern.aux.nbytes * fpc, ex_mem) + mag_mem = max(kern.FUK.gpu.fdev.nbytes * fpc, mag_mem) + ma_mem = mag_mem + mem = cuda.mem_get_info()[0] + blk = ex_mem * EX_MA_BLOCKS_RATIO + ma_mem + mag_mem + fit = int(mem - 200 * 1024 * 1024) // blk # leave 200MB room for safety + + # TODO grow blocks dynamically + nex = min(fit * EX_MA_BLOCKS_RATIO, MAX_BLOCKS) + nma = min(fit, MAX_BLOCKS) + + log(3, 'PyCUDA max blocks fitting on GPU: exit arrays={}, ma_arrays={}'.format(nex, nma)) + # reset memory or create new + self.ex_data = GpuDataManager2(ex_mem, 0, nex, True) + self.ma_data = GpuDataManager2(ma_mem, 0, nma, False) + self.mag_data = GpuDataManager2(mag_mem, 0, nma, False) + + def engine_prepare(self): + + super(DR_pycuda.DR_pycuda, self).engine_prepare() + + for name, s in self.ob.S.items(): + s.gpu, s.data = mppa(s.data) + for name, s in self.pr.S.items(): + s.gpu, s.data = mppa(s.data) + + for label, d in self.di.storages.items(): + prep = self.diff_info[d.ID] + prep.addr_gpu = gpuarray.to_gpu(prep.addr) + + for label, d in self.ptycho.new_data: + dID = d.ID + prep = self.diff_info[dID] + pID, oID, eID = prep.poe_IDs + + prep.ma_sum_gpu = gpuarray.to_gpu(prep.ma_sum) + # prepare page-locked mems: + prep.err_fourier_gpu = gpuarray.to_gpu(prep.err_fourier) + prep.err_phot_gpu = gpuarray.to_gpu(prep.err_phot) + prep.err_exit_gpu = gpuarray.to_gpu(prep.err_exit) + ma = self.ma.S[dID].data.astype(np.float32) + prep.ma = cuda.pagelocked_empty(ma.shape, ma.dtype, order="C", mem_flags=4) + prep.ma[:] = ma + ex = self.ex.S[eID].data + prep.ex = cuda.pagelocked_empty(ex.shape, ex.dtype, order="C", mem_flags=4) + prep.ex[:] = ex + mag = prep.mag + prep.mag = cuda.pagelocked_empty(mag.shape, mag.dtype, order="C", mem_flags=4) + prep.mag[:] = mag + + self.ex_data.add_data_block() + self.ma_data.add_data_block() + self.mag_data.add_data_block() + + def engine_iterate(self, num=1): + """ + Compute one iteration. 
+ """ + self.dID_list = list(self.di.S.keys()) + error = {} + + for it in range(num): + + for iblock, dID in enumerate(self.dID_list): + + # find probe, object and exit ID in dependence of dID + prep = self.diff_info[dID] + pID, oID, eID = prep.poe_IDs + + # references for kernels + kern = self.kernels[prep.label] + FUK = kern.FUK + AWK = kern.AWK + POK = kern.POK + PROP = kern.PROP + + # get aux buffer + aux = kern.aux + + # local references + ob = self.ob.S[oID].gpu + pr = self.pr.S[pID].gpu + + # shuffle view order + vieworder = prep.vieworder + prep.rng.shuffle(vieworder) + + # Schedule ex, ma, mag to device + ev_ex, ex_full, data_ex = self.ex_data.to_gpu(prep.ex, dID, self.qu_htod) + ev_mag, mag_full, data_mag = self.mag_data.to_gpu(prep.mag, dID, self.qu_htod) + ev_ma, ma_full, data_ma = self.ma_data.to_gpu(prep.ma, dID, self.qu_htod) + + ## synchronize h2d stream with compute stream + self.queue.wait_for_event(ev_ex) + + # Iterate through views + for i in vieworder: + + # Get local adress and arrays + addr = prep.addr_gpu[i,None] + ex = ex_full[i,None] + mag = mag_full[i,None] + ma = ma_full[i,None] + ma_sum = prep.ma_sum[i,None] + err_phot = prep.err_phot_gpu[i,None] + err_fourier = prep.err_fourier_gpu[i,None] + err_exit = prep.err_exit_gpu[i,None] + + ## build auxilliary wave + AWK.build_aux2(aux, addr, ob, pr, ex, alpha=self.p.alpha) + + ## forward FFT + PROP.fw(aux, aux) + + ## Deviation from measured data + self.queue.wait_for_event(ev_mag) + if self.p.compute_fourier_error: + self.queue.wait_for_event(ev_ma) + FUK.fourier_error(aux, addr, mag, ma, ma_sum) + FUK.error_reduce(addr, err_fourier) + else: + FUK.fourier_deviation(aux, addr, mag) + self.queue.wait_for_event(ev_ma) + FUK.fmag_update_nopbound(aux, addr, mag, ma) + + ## backward FFT + PROP.bw(aux, aux) + + ## build exit wave + AWK.build_exit_alpha_tau(aux, addr, ob, pr, ex, alpha=self.p.alpha, tau=self.p.tau) + if self.p.compute_exit_error: + FUK.exit_error(aux,addr) + FUK.error_reduce(addr, err_exit) + + ## probe/object rescale + #if self.p.rescale_probe: + # pr *= np.sqrt(self.mean_power / (np.abs(pr)**2).mean()) + + ## build auxilliary wave (ob * pr product) + AWK.build_aux2_no_ex(aux, addr, ob, pr) + + # object update + POK.ob_update_local(addr, ob, pr, ex, aux) + + # probe update + POK.pr_update_local(addr, pr, ob, ex, aux) + + ## compute log-likelihood + if self.p.compute_log_likelihood: + PROP.fw(aux, aux) + FUK.log_likelihood2(aux, addr, mag, ma, err_phot) + + data_ex.record_done(self.queue, 'compute') + if iblock + len(self.ex_data) < len(self.dID_list): + data_ex.from_gpu(self.qu_dtoh) + + # swap direction + self.dID_list.reverse() + + self.curiter += 1 + self.ex_data.syncback = False + + # finish all the compute + self.queue.synchronize() + + for name, s in self.ob.S.items(): + s.gpu.get_async(stream=self.qu_dtoh, ary=s.data) + for name, s in self.pr.S.items(): + s.gpu.get_async(stream=self.qu_dtoh, ary=s.data) + + for dID, prep in self.diff_info.items(): + prep.err_fourier_gpu.get(prep.err_fourier) + prep.err_phot_gpu.get(prep.err_phot) + prep.err_exit_gpu.get(prep.err_exit) + errs = np.ascontiguousarray(np.vstack([ + prep.err_fourier, prep.err_phot, prep.err_exit + ]).T) + error.update(zip(prep.view_IDs, errs)) + + # wait for the async transfers + self.qu_dtoh.synchronize() + + self.error = error + return error + + def engine_finalize(self): + """ + Clear all GPU data, pinned memory, etc + """ + self.ex_data = None + self.ma_data = None + self.mag_data = None + + # replacing page-locked data with normal 
npy to avoid + # crash on context destroy + for name, s in self.pr.S.items(): + s.data = np.copy(s.data) + for name, s in self.ob.S.items(): + s.data = np.copy(s.data) + + super().engine_finalize() + \ No newline at end of file diff --git a/ptypy/accelerate/cuda_pycuda/engines/ML_pycuda.py b/ptypy/accelerate/cuda_pycuda/engines/ML_pycuda.py index 4112df968..5f36b9121 100644 --- a/ptypy/accelerate/cuda_pycuda/engines/ML_pycuda.py +++ b/ptypy/accelerate/cuda_pycuda/engines/ML_pycuda.py @@ -23,8 +23,7 @@ from ptypy.utils.verbose import logger from ptypy.utils import parallel from .. import get_context -from ..kernels import GradientDescentKernel, AuxiliaryWaveKernel, PoUpdateKernel, \ - PositionCorrectionKernel, PropagationKernel +from ..kernels import GradientDescentKernel, AuxiliaryWaveKernel, PoUpdateKernel, PropagationKernel from ..array_utils import ArrayUtilsKernel, DerivativesKernel, GaussianSmoothingKernel from ptypy.accelerate.base import address_manglers @@ -160,6 +159,7 @@ def engine_initialize(self): self.queue_transfer = cuda.Stream() self.GSK = GaussianSmoothingKernel(queue=self.queue) + self.GSK.tmp = None super().engine_initialize() #self._setup_kernels() @@ -168,13 +168,6 @@ def _setup_kernels(self): """ Setup kernels, one for each scan. Derive scans from ptycho class """ - - try: - from ptypy.accelerate.cuda_pycuda.cufft import FFT - except: - logger.warning('Unable to import cuFFT version - using Reikna instead') - from ptypy.accelerate.cuda_pycuda.fft import FFT - AUK = ArrayUtilsKernel(queue=self.queue) self._dot_kernel = AUK.dot # get the scans @@ -208,7 +201,7 @@ def _setup_kernels(self): kern.GDK = GradientDescentKernel(aux, nmodes, queue=self.queue) kern.GDK.allocate() - kern.POK = PoUpdateKernel(queue_thread=self.queue, denom_type=np.float32) + kern.POK = PoUpdateKernel(queue_thread=self.queue) kern.POK.allocate() kern.AWK = AuxiliaryWaveKernel(queue_thread=self.queue) @@ -217,20 +210,6 @@ def _setup_kernels(self): kern.PROP = PropagationKernel(aux, geo.propagator, queue_thread=self.queue) kern.PROP.allocate() - - if self.do_position_refinement: - addr_mangler = address_manglers.RandomIntMangle(int(self.p.position_refinement.amplitude // geo.resolution[0]), - self.p.position_refinement.start, - self.p.position_refinement.stop, - max_bound=int(self.p.position_refinement.max_shift // geo.resolution[0]), - randomseed=0) - logger.warning("amplitude is %s " % (self.p.position_refinement.amplitude // geo.resolution[0])) - logger.warning("max bound is %s " % (self.p.position_refinement.max_shift // geo.resolution[0])) - - kern.PCK = PositionCorrectionKernel(aux, nmodes, queue_thread=self.queue) - kern.PCK.allocate() - kern.PCK.address_mangler = addr_mangler - def _initialize_model(self): # Create noise model @@ -264,9 +243,10 @@ def _set_pr_ob_ref_for_data(self, dev='gpu', container=None, sync_copy=False): self._set_pr_ob_ref_for_data(dev=dev, container=container, sync_copy=sync_copy) def _get_smooth_gradient(self, data, sigma): - tmp = gpuarray.empty(data.shape, dtype=np.complex64) - self.GSK.convolution(data, tmp, [sigma, sigma]) - return tmp + if self.GSK.tmp is None: + self.GSK.tmp = gpuarray.empty(data.shape, dtype=np.complex64) + self.GSK.convolution(data, [sigma, sigma], tmp=self.GSK.tmp) + return data def _replace_ob_grad(self): new_ob_grad = self.ob_grad_new diff --git a/ptypy/accelerate/cuda_pycuda/import_fft.py b/ptypy/accelerate/cuda_pycuda/import_fft.py deleted file mode 100644 index 6a3d3312e..000000000 --- a/ptypy/accelerate/cuda_pycuda/import_fft.py +++ 
/dev/null @@ -1,176 +0,0 @@ -''' -"Just-in-time" compilation for callbacks in cufft. -''' -import os -import sys -import importlib -import tempfile -import setuptools -import sysconfig -from pycuda import driver as cuda_driver -import pybind11 -import contextlib -from io import StringIO -from ptypy.utils.verbose import log -import distutils -from distutils.unixccompiler import UnixCCompiler -from distutils.command.build_ext import build_ext - - -def find_in_path(name, path): - "Find a file in a search path" - # adapted fom http://code.activestate.com/recipes/52224-find-a-file-given-a-search-path/ - for dir in path.split(os.pathsep): - binpath = os.path.join(dir, name) - if os.path.exists(binpath): - return os.path.abspath(binpath) - return None - -def locate_cuda(): - """ - Locate the CUDA environment on the system - Returns a dict with keys 'home', 'nvcc', 'include', and 'lib64' - and values giving the absolute path to each directory. - Starts by looking for the CUDAHOME env variable. If not found, everything - is based on finding 'nvcc' in the PATH. - """ - # first check if the CUDAHOME env variable is in use - if 'CUDAHOME' in os.environ: - home = os.environ['CUDAHOME'] - nvcc = os.path.join(home, 'bin', 'nvcc') - else: - # otherwise, search the PATH for NVCC - nvcc = find_in_path('nvcc', os.environ['PATH']) - if nvcc is None: - raise EnvironmentError('The nvcc binary could not be ' - 'located in your $PATH. Either add it to your path, or set $CUDAHOME') - home = os.path.dirname(os.path.dirname(nvcc)) - - cudaconfig = {'home': home, 'nvcc': nvcc, - 'include': os.path.join(home, 'include'), - 'lib64': os.path.join(home, 'lib64')} - for k, v in cudaconfig.items(): - if not os.path.exists(v): - raise EnvironmentError('The CUDA %s path could not be located in %s' % (k, v)) - return cudaconfig - -class NvccCompiler(UnixCCompiler): - def __init__(self, *args, **kwargs): - super(NvccCompiler, self).__init__(*args, **kwargs) - self.CUDA = locate_cuda() - module_dir = os.path.join(__file__.strip('import_fft.py'), 'cuda', 'filtered_fft') - cmp = cuda_driver.Context.get_device().compute_capability() - archflag = '-arch=sm_{}{}'.format(cmp[0], cmp[1]) - self.src_extensions.append('.cu') - self.LD_FLAGS = [archflag, "-lcufft_static", "-lculibos", "-ldl", "-lrt", "-lpthread", "-cudart shared"] - self.NVCC_FLAGS = ["-dc", archflag] - self.CXXFLAGS = ['"-fPIC"'] - pybind_includes = [pybind11.get_include(), sysconfig.get_path('include')] - INCLUDES = pybind_includes + [self.CUDA['lib64'], module_dir] - self.INCLUDES = ["-I%s" % ix for ix in INCLUDES] - self.OPTFLAGS = ["-O3", "-std=c++14"] - - def _compile(self, obj, src, ext, cc_args, extra_postargs, pp_opts): - default_compiler_so = self.compiler_so - CPPFLAGS = self.INCLUDES + extra_postargs # little hack here, since postargs usually goes at the end, which we won't do. 
- # makefile line is - # $(NVCC) $(NVCC_FLAGS) $(OPTFLAGS) -Xcompiler "$(CXXFLAGS)" $(CPPFLAGS) - compiler_command = [self.CUDA["nvcc"]] + self.NVCC_FLAGS + self.OPTFLAGS + ["-Xcompiler"] + self.CXXFLAGS + CPPFLAGS - compiler_exec = " ".join(compiler_command) - self.set_executable('compiler_so', compiler_exec) - postargs = [] # we don't actually have any postargs - super(NvccCompiler, self)._compile(obj, src, ext, cc_args, postargs, pp_opts) # the _compile method - # reset the default compiler_so, which we might have changed for cuda - self.compiler_so = default_compiler_so - - def link(self, target_desc, objects, - output_filename, output_dir=None, libraries=None, - library_dirs=None, runtime_library_dirs=None, - export_symbols=None, debug=0, extra_preargs=None, - extra_postargs=None, build_temp=None, target_lang=None): - default_linker_so = self.linker_so - # make file line is - # $(NVCC) $(OPTFLAGS) -shared $(LD_FLAGS) $(OBJ) $(OBJ_MOD) -o $@ - linker_command = [self.CUDA["nvcc"]] + self.OPTFLAGS + ["-shared"] + self.LD_FLAGS - linker_exec = " ".join(linker_command) - self.set_executable('linker_so', linker_exec) - super(NvccCompiler, self).link(target_desc, objects, - output_filename, output_dir=None, libraries=None, - library_dirs=None, runtime_library_dirs=None, - export_symbols=None, debug=0, extra_preargs=None, - extra_postargs=None, build_temp=None, target_lang=None) - self.linker_so = default_linker_so - -class CustomBuildExt(build_ext): - def build_extensions(self): - old_compiler = self.compiler - self.compiler = NvccCompiler(verbose=old_compiler.verbose, - dry_run=old_compiler.dry_run, - force=old_compiler.force) # this is our bespoke compiler - super(CustomBuildExt, self).build_extensions() - self.compiler=old_compiler - -@contextlib.contextmanager -def stdchannel_redirected(stdchannel): - """ - Redirects stdout or stderr to a StringIO object. As of python 3.4, there is a - standard library contextmanager for this, but backwards compatibility! - """ - old = getattr(sys, stdchannel) - try: - s = StringIO() - setattr(sys, stdchannel, s) - yield s - finally: - setattr(sys, stdchannel, old) - - -class ImportFFT: - def __init__(self, rows, columns, build_path=None, quiet=True): - self.build_path = build_path - self.cleanup_build_path = None - if self.build_path is None: - self.build_path = tempfile.mkdtemp(prefix="ptypy_fft") - self.cleanup_build_path = True - - full_module_name = "module" - module_dir = os.path.join(__file__.strip('import_fft.py'), 'cuda', 'filtered_fft') - # If we specify the libraries through the extension we soon run into trouble since distutils adds a -l infront of all of these (add_library_option:https://github.com/python/cpython/blob/1c1e68cf3e3a2a19a0edca9a105273e11ddddc6e/Lib/distutils/ccompiler.py#L1115) - ext = distutils.extension.Extension(full_module_name, - sources=[os.path.join(module_dir, "module.cpp"), - os.path.join(module_dir, "filtered_fft.cu")], - extra_compile_args=["-DMY_FFT_COLS=%s" % str(columns) , "-DMY_FFT_ROWS=%s" % str(rows)]) - - script_args = ['build_ext', - '--build-temp=%s' % self.build_path, - '--build-lib=%s' % self.build_path] - # do I need full_module_name here? 
- setuptools_args = {"name": full_module_name, - "ext_modules": [ext], - "script_args": script_args, - "cmdclass":{"build_ext": CustomBuildExt - }} - - if quiet: - # we really don't care about the make print for almost all cases so we redirect - with stdchannel_redirected("stdout"): - with stdchannel_redirected("stderr"): - setuptools.setup(**setuptools_args) - else: - setuptools.setup(**setuptools_args) - - spec = importlib.util.spec_from_file_location(full_module_name, - os.path.join(self.build_path, - "module" + distutils.sysconfig.get_config_var('EXT_SUFFIX') - ) - ) - self.mod = importlib.util.module_from_spec(spec) - - def get_mod(self): - return self.mod - - def __del__(self): - import shutil - if self.cleanup_build_path: - log(5, "cleaning up the build directory") - shutil.rmtree(self.build_path) diff --git a/ptypy/accelerate/cuda_pycuda/kernels.py b/ptypy/accelerate/cuda_pycuda/kernels.py index 9064ab593..47dd4cb79 100644 --- a/ptypy/accelerate/cuda_pycuda/kernels.py +++ b/ptypy/accelerate/cuda_pycuda/kernels.py @@ -3,9 +3,28 @@ from pycuda import gpuarray from ptypy.utils.verbose import log, logger from . import load_kernel +from .array_utils import CropPadKernel +from .array_utils import MaxAbs2Kernel from ..base import kernels as ab from ..base.kernels import Adict +def choose_fft(fft_type): + if fft_type=='cuda': + try: + from ptypy.accelerate.cuda_pycuda.cufft import FFT_cuda as FFT + except: + logger.warning('Unable to import cufft version - using Reikna instead') + from ptypy.accelerate.cuda_pycuda.fft import FFT + elif fft_type=='skcuda': + try: + from ptypy.accelerate.cuda_pycuda.cufft import FFT_skcuda as FFT + except: + logger.warning('Unable to import skcuda.fft version - using Reikna instead') + from ptypy.accelerate.cuda_pycuda.fft import FFT + else: + from ptypy.accelerate.cuda_pycuda.fft import FFT + return FFT + class PropagationKernel: def __init__(self, aux, propagator, queue_thread=None, fft='reikna'): @@ -22,35 +41,49 @@ def __init__(self, aux, propagator, queue_thread=None, fft='reikna'): def allocate(self): aux = self.aux - - if self._fft_type=='cuda': - try: - from ptypy.accelerate.cuda_pycuda.cufft import FFT_cuda as FFT - except: - logger.warning('Unable to import cufft version - using Reikna instead') - from ptypy.accelerate.cuda_pycuda.fft import FFT - elif self._fft_type=='skcuda': - try: - from ptypy.accelerate.cuda_pycuda.cufft import FFT_skcuda as FFT - except: - logger.warning('Unable to import skcuda.fft version - using Reikna instead') - from ptypy.accelerate.cuda_pycuda.fft import FFT - else: - from ptypy.accelerate.cuda_pycuda.fft import FFT + FFT = choose_fft(self._fft_type) if self.prop_type == 'farfield': - self._fft1 = FFT(aux, self.queue, + + self._do_crop_pad = (self._p.crop_pad != 0).any() + if self._do_crop_pad: + self._tmp = np.zeros(aux.shape + self._p.crop_pad, dtype=aux.dtype) + self._CPK = CropPadKernel(queue=self._queue) + else: + self._tmp = aux + + self._fft1 = FFT(self._tmp, self.queue, pre_fft=self._p.pre_fft, post_fft=self._p.post_fft, symmetric=True, forward=True) - self._fft2 = FFT(aux, self.queue, + self._fft2 = FFT(self._tmp, self.queue, pre_fft=self._p.pre_ifft, post_fft=self._p.post_ifft, symmetric=True, forward=False) - self.fw = self._fft1.ft - self.bw = self._fft2.ift + if self._do_crop_pad: + self._tmp = gpuarray.to_gpu(self._tmp) + + def _fw(x,y): + if self._do_crop_pad: + self._CPK.crop_pad_2d_simple(self._tmp, x) + self._fft1.ft(self._tmp, self._tmp) + self._CPK.crop_pad_2d_simple(y, self._tmp) + else: + 
self._fft1.ft(x,y) + + def _bw(x,y): + if self._do_crop_pad: + self._CPK.crop_pad_2d_simple(self._tmp, x) + self._fft2.ift(self._tmp, self._tmp) + self._CPK.crop_pad_2d_simple(y, self._tmp) + else: + self._fft2.ift(x,y) + + self.fw = _fw + self.bw = _bw + elif self.prop_type == "nearfield": self._fft1 = FFT(aux, self.queue, post_fft=self._p.kernel, @@ -90,18 +123,78 @@ def queue(self, queue): if self.prop_type == "nearfield": self._fft3.queue = queue +class FourierSupportKernel: + def __init__(self, support, queue_thread=None, fft='reikna'): + self.support = support + self.queue = queue_thread + self._fft_type = fft + def allocate(self): + FFT = choose_fft(self._fft_type) + + self._fft1 = FFT(self.support, self.queue, + post_fft=self.support, + symmetric=True, + forward=True) + self._fft2 = FFT(self.support, self.queue, + symmetric=True, + forward=False) + def apply_fourier_support(self,x): + self._fft1.ft(x,x) + self._fft2.ift(x,x) + +class RealSupportKernel: + def __init__(self, support): + self.support = support + def allocate(self): + self.support = gpuarray.to_gpu(self.support) + def apply_real_support(self, x): + x *= self.support + class FourierUpdateKernel(ab.FourierUpdateKernel): - def __init__(self, aux, nmodes=1, queue_thread=None): + def __init__(self, aux, nmodes=1, queue_thread=None, accumulate_type='float', math_type='float'): super(FourierUpdateKernel, self).__init__(aux, nmodes=nmodes) + + if accumulate_type not in ['float', 'double']: + raise ValueError('Only float or double types are supported') + if math_type not in ['float', 'double']: + raise ValueError('Only float or double types are supported') + self.accumulate_type = accumulate_type + self.math_type = math_type self.queue = queue_thread - self.fmag_all_update_cuda = load_kernel("fmag_all_update") - self.fourier_error_cuda = load_kernel("fourier_error") + self.fmag_all_update_cuda = load_kernel("fmag_all_update", { + 'IN_TYPE': 'float', + 'OUT_TYPE': 'float', + 'MATH_TYPE': self.math_type + }) + self.fmag_update_nopbound_cuda = None + self.fourier_deviation_cuda = None + self.fourier_error_cuda = load_kernel("fourier_error", { + 'IN_TYPE': 'float', + 'OUT_TYPE': 'float', + 'MATH_TYPE': self.math_type + }) self.fourier_error2_cuda = None - self.error_reduce_cuda = load_kernel("error_reduce") + self.error_reduce_cuda = load_kernel("error_reduce", { + 'IN_TYPE': 'float', + 'OUT_TYPE': 'float', + 'ACC_TYPE': self.accumulate_type, + 'BDIM_X': 32, + 'BDIM_Y': 32, + }) self.fourier_update_cuda = None - self.log_likelihood_cuda = load_kernel("log_likelihood") - self.exit_error_cuda = load_kernel("exit_error") + self.log_likelihood_cuda, self.log_likelihood2_cuda = load_kernel( + ("log_likelihood", "log_likelihood2"), { + 'IN_TYPE': 'float', + 'OUT_TYPE': 'float', + 'MATH_TYPE': self.math_type + }, + "log_likelihood.cu") + self.exit_error_cuda = load_kernel("exit_error", { + 'IN_TYPE': 'float', + 'OUT_TYPE': 'float', + 'MATH_TYPE': self.math_type + }) self.gpu = Adict() self.gpu.fdev = None @@ -156,6 +249,28 @@ def fourier_error(self, f, addr, fmag, fmask, mask_sum): shared=int(bx*by*bz*4), stream=self.queue) + def fourier_deviation(self, f, addr, fmag): + fdev = self.gpu.fdev + if self.fourier_deviation_cuda is None: + self.fourier_deviation_cuda = load_kernel("fourier_deviation",{ + 'IN_TYPE': 'float', + 'OUT_TYPE': 'float', + 'MATH_TYPE': self.math_type + }) + bx = 64 + by = 1 + self.fourier_deviation_cuda(np.int32(self.nmodes), + f, + fmag, + fdev, + addr, + np.int32(self.fshape[1]), + np.int32(self.fshape[2]), + 
block=(bx, by, 1), + grid=(1, int((self.fshape[2] + by - 1)//by), int(fmag.shape[0])), + stream=self.queue) + + def error_reduce(self, addr, err_sum): self.error_reduce_cuda(self.gpu.ferr, err_sum, @@ -163,7 +278,6 @@ def error_reduce(self, addr, err_sum): np.int32(self.fshape[2]), block=(32, 32, 1), grid=(int(err_sum.shape[0]), 1, 1), - shared=32*32*4, stream=self.queue) def fmag_all_update(self, f, addr, fmag, fmask, err_fmag, pbound=0.0): @@ -180,6 +294,29 @@ def fmag_all_update(self, f, addr, fmag, fmask, err_fmag, pbound=0.0): block=(32, 32, 1), grid=(int(fmag.shape[0]*self.nmodes), 1, 1), stream=self.queue) + + def fmag_update_nopbound(self, f, addr, fmag, fmask): + fdev = self.gpu.fdev + bx = 64 + by = 1 + if self.fmag_update_nopbound_cuda is None: + self.fmag_update_nopbound_cuda = load_kernel("fmag_update_nopbound", { + 'IN_TYPE': 'float', + 'OUT_TYPE': 'float', + 'MATH_TYPE': self.math_type + }) + self.fmag_update_nopbound_cuda(f, + fmask, + fmag, + fdev, + addr, + np.int32(self.fshape[1]), + np.int32(self.fshape[2]), + block=(bx, by, 1), + grid=(1, + int((self.fshape[2] + by - 1) // by), + int(fmag.shape[0]*self.nmodes)), + stream=self.queue) # Note: this was a test to join the kernels, but it's > 2x slower! def fourier_update(self, f, addr, fmag, fmask, mask_sum, err_fmag, pbound=0): @@ -229,6 +366,24 @@ def log_likelihood(self, b_aux, addr, mag, mask, err_phot): # TODO: we might want to move this call outside of here self.error_reduce(addr, err_phot) + def log_likelihood2(self, b_aux, addr, mag, mask, err_phot): + ferr = self.gpu.ferr + bx = 64 + by = 1 + self.log_likelihood2_cuda(np.int32(self.nmodes), + b_aux, + mask, + mag, + addr, + ferr, + np.int32(self.fshape[1]), + np.int32(self.fshape[2]), + block=(bx, by, 1), + grid=(1, int((self.fshape[1] + by - 1) // by), int(mag.shape[0])), + stream=self.queue) + # TODO: we might want to move this call outside of here + self.error_reduce(addr, err_phot) + def exit_error(self, aux, addr): sh = addr.shape maxz = sh[0] @@ -261,17 +416,36 @@ def execute(self, kernel_name=None, compare=False, sync=False): class AuxiliaryWaveKernel(ab.AuxiliaryWaveKernel): - def __init__(self, queue_thread=None): + def __init__(self, queue_thread=None, math_type = 'float'): super(AuxiliaryWaveKernel, self).__init__() # and now initialise the cuda self.queue = queue_thread self._ob_shape = None self._ob_id = None - self.build_aux_cuda = load_kernel("build_aux") - self.build_exit_cuda = load_kernel("build_exit") - self.build_aux_no_ex_cuda = load_kernel("build_aux_no_ex", { - 'CTYPE': 'complex', - 'FTYPE': 'float' + self.math_type = math_type + if math_type not in ['float', 'double']: + raise ValueError('Only double or float math is supported') + self.build_aux_cuda, self.build_aux2_cuda = load_kernel( + ("build_aux", "build_aux2"), { + 'IN_TYPE': 'float', + 'OUT_TYPE': 'float', + 'MATH_TYPE': self.math_type + }, "build_aux.cu") + self.build_exit_cuda = load_kernel("build_exit", { + 'IN_TYPE': 'float', + 'OUT_TYPE': 'float', + 'MATH_TYPE': self.math_type + }) + self.build_aux_no_ex_cuda, self.build_aux2_no_ex_cuda = load_kernel( + ("build_aux_no_ex", "build_aux2_no_ex"), { + 'IN_TYPE': 'float', + 'OUT_TYPE': 'float', + 'MATH_TYPE': self.math_type + }, "build_aux_no_ex.cu") + self.build_exit_alpha_tau_cuda = load_kernel("build_exit_alpha_tau", { + 'IN_TYPE': 'float', + 'OUT_TYPE': 'float', + 'MATH_TYPE': self.math_type }) # DEPRECATED? 
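All of the kernel constructors above route their type choices through load_kernel, which substitutes template parameters (IN_TYPE, OUT_TYPE, MATH_TYPE, ACC_TYPE, and block dimensions such as BDIM_X/BDIM_Y) into the .cu sources before compilation, and returns several compiled kernels at once when given a tuple of names plus an explicit file. A minimal sketch of such a substitution-based loader follows; the directory layout and the use of #define lines are illustrative assumptions, not ptypy's actual loader (the real one is imported at the top of this module via "from . import load_kernel"):

import os
from pycuda.compiler import SourceModule

def load_kernel_sketch(names, subs, file=None):
    # `names` is one kernel name, or a tuple of names defined in one .cu file
    single = isinstance(names, str)
    if file is None:
        file = names + '.cu'  # tuple callers must pass file= explicitly
    with open(os.path.join(os.path.dirname(__file__), 'cuda', file)) as f:
        source = f.read()
    # inject the substitution dict, e.g. {'MATH_TYPE': 'float'}, as defines
    header = '\n'.join('#define %s %s' % kv for kv in subs.items())
    mod = SourceModule(header + '\n' + source, no_extern_c=True)
    if single:
        return mod.get_function(names)
    return tuple(mod.get_function(n) for n in names)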
@@ -282,14 +456,9 @@ def load(self, aux, ob, pr, ex, addr): def build_aux(self, b_aux, addr, ob, pr, ex, alpha=1.0): obr, obc = self._cache_object_shape(ob) - # print('grid={}, 1, 1'.format(int(ex.shape[0]))) - # print('b_aux={}, sh={}'.format(type(b_aux), b_aux.shape)) - # print('ex={}, sh={}'.format(type(ex), ex.shape)) - # print('pr={}, sh={}'.format(type(pr), pr.shape)) - # print('ob={}, sh={}'.format(type(ob), ob.shape)) - # print('obr={}, obc={}'.format(obr, obc)) - # print('addr={}, sh={}'.format(type(addr), addr.shape)) - # print('stream={}'.format(self.queue)) + sh = addr.shape + nmodes = sh[1] + maxz = sh[0] self.build_aux_cuda(b_aux, ex, np.int32(ex.shape[1]), np.int32(ex.shape[2]), @@ -298,11 +467,37 @@ def build_aux(self, b_aux, addr, ob, pr, ex, alpha=1.0): ob, obr, obc, addr, - np.float32(alpha), - block=(32, 32, 1), grid=(int(ex.shape[0]), 1, 1), stream=self.queue) + np.float32(alpha) if ex.dtype == np.complex64 else np.float64(alpha), + block=(32, 32, 1), grid=(int(maxz * nmodes), 1, 1), stream=self.queue) + + def build_aux2(self, b_aux, addr, ob, pr, ex, alpha=1.0): + obr, obc = self._cache_object_shape(ob) + sh = addr.shape + nmodes = sh[1] + maxz = sh[0] + bx = 64 + by = 1 + self.build_aux2_cuda(b_aux, + ex, + np.int32(ex.shape[1]), np.int32(ex.shape[2]), + pr, + np.int32(ex.shape[1]), np.int32(ex.shape[2]), + ob, + obr, obc, + addr, + np.float32(alpha) if ex.dtype == np.complex64 else np.float64(alpha), + block=(bx, by, 1), + grid=( + 1, + int((ex.shape[1] + by - 1)//by), + int(maxz * nmodes)), + stream=self.queue) - def build_exit(self, b_aux, addr, ob, pr, ex): + def build_exit(self, b_aux, addr, ob, pr, ex, alpha=1): obr, obc = self._cache_object_shape(ob) + sh = addr.shape + nmodes = sh[1] + maxz = sh[0] self.build_exit_cuda(b_aux, ex, np.int32(ex.shape[1]), np.int32(ex.shape[2]), @@ -311,7 +506,28 @@ def build_exit(self, b_aux, addr, ob, pr, ex): ob, obr, obc, addr, - block=(32, 32, 1), grid=(int(ex.shape[0]), 1, 1), stream=self.queue) + np.float32(alpha) if ex.dtype == np.complex64 else np.float64(alpha), + block=(32, 32, 1), grid=(int(maxz * nmodes), 1, 1), stream=self.queue) + + def build_exit_alpha_tau(self, b_aux, addr, ob, pr, ex, alpha=1, tau=1): + obr, obc = self._cache_object_shape(ob) + sh = addr.shape + nmodes = sh[1] + maxz = sh[0] + bx = 64 + by = 1 + self.build_exit_alpha_tau_cuda(b_aux, + ex, + np.int32(ex.shape[1]), np.int32(ex.shape[2]), + pr, + np.int32(ex.shape[1]), np.int32(ex.shape[2]), + ob, + obr, obc, + addr, + np.float32(alpha), np.float32(tau), + block=(bx, by, 1), + grid=(1, int((ex.shape[1] + by - 1) // by), int(maxz * nmodes)), + stream=self.queue) def build_aux_no_ex(self, b_aux, addr, ob, pr, fac=1.0, add=False): obr, obc = self._cache_object_shape(ob) @@ -327,12 +543,36 @@ def build_aux_no_ex(self, b_aux, addr, ob, pr, fac=1.0, add=False): ob, obr, obc, addr, - np.float32(fac), + np.float32(fac) if pr.dtype == np.complex64 else np.float64(fac), np.int32(add), block=(32, 32, 1), grid=(int(maxz * nmodes), 1, 1), stream=self.queue) + + def build_aux2_no_ex(self, b_aux, addr, ob, pr, fac=1.0, add=False): + obr, obc = self._cache_object_shape(ob) + sh = addr.shape + nmodes = sh[1] + maxz = sh[0] + bx = 64 + by = 1 + self.build_aux2_no_ex_cuda(b_aux, + np.int32(b_aux.shape[-2]), + np.int32(b_aux.shape[-1]), + pr, + np.int32(pr.shape[-2]), + np.int32(pr.shape[-1]), + ob, + obr, obc, + addr, + np.float32(fac) if pr.dtype == np.complex64 else np.float64(fac), + np.int32(add), + block=(bx, by, 1), + grid=(1, int((b_aux.shape[-2] + by - 
1)//by), int(maxz * nmodes)), + stream=self.queue) + + + def _cache_object_shape(self, ob): oid = id(ob) @@ -345,28 +585,44 @@ def _cache_object_shape(self, ob): class GradientDescentKernel(ab.GradientDescentKernel): - def __init__(self, aux, nmodes=1, queue=None): + def __init__(self, aux, nmodes=1, queue=None, accumulate_type = 'double', math_type='float'): super().__init__(aux, nmodes) self.queue = queue - + self.accumulate_type = accumulate_type + self.math_type = math_type + if (accumulate_type not in ['double', 'float']) or (math_type not in ['double', 'float']): + raise ValueError("accumulate and math types must be 'double' or 'float'") + self.gpu = Adict() self.gpu.LLden = None self.gpu.LLerr = None self.gpu.Imodel = None subs = { - 'CTYPE': 'complex<float>' if self.ctype == np.complex64 else 'complex<double>', - 'FTYPE': 'float' if self.ftype == np.float32 else 'double' + 'IN_TYPE': 'float' if self.ftype == np.float32 else 'double', + 'OUT_TYPE': 'float' if self.ftype == np.float32 else 'double', + 'ACC_TYPE': self.accumulate_type, + 'MATH_TYPE': self.math_type } self.make_model_cuda = load_kernel('make_model', subs) self.make_a012_cuda = load_kernel('make_a012', subs) - self.error_reduce_cuda = load_kernel('error_reduce', subs) - self.fill_b_cuda = load_kernel('fill_b', {**subs, 'BDIM_X': 1024}) - self.fill_b_reduce_cuda = load_kernel( - 'fill_b_reduce', {**subs, 'BDIM_X': 1024}) + self.error_reduce_cuda = load_kernel('error_reduce', { + **subs, + 'OUT_TYPE': 'float' if self.ftype == np.float32 else 'double', + 'BDIM_X': 32, + 'BDIM_Y': 32 + }) + self.fill_b_cuda, self.fill_b_reduce_cuda = load_kernel( + ('fill_b', 'fill_b_reduce'), + { + **subs, + 'BDIM_X': 1024, + 'OUT_TYPE': 'float' if self.ftype == np.float32 else 'double' + }, + file="fill_b.cu") self.main_cuda = load_kernel('gd_main', subs) - self.floating_intensity_cuda_step1 = load_kernel('step1', subs,'intens_renorm.cu') - self.floating_intensity_cuda_step2 = load_kernel('step2', subs,'intens_renorm.cu') + self.floating_intensity_cuda_step1, self.floating_intensity_cuda_step2 = \ + load_kernel(('step1', 'step2'), subs,'intens_renorm.cu') def allocate(self): self.gpu.LLden = gpuarray.zeros(self.fshape, dtype=self.ftype) @@ -377,7 +633,7 @@ def allocate(self): # temporary array for the reduction in fill_b sh = (3, int((np.prod(self.fshape)*self.nmodes + 1023) // 1024)) - self.gpu.Btmp = gpuarray.zeros(sh, dtype=np.float64) + self.gpu.Btmp = gpuarray.zeros(sh, dtype=np.float64 if self.accumulate_type == 'double' else np.float32) def make_model(self, b_aux, addr): # reference shape @@ -463,7 +719,6 @@ def error_reduce(self, addr, err_sum): np.int32(ferr.shape[-1]), block=(32, 32, 1), grid=(int(maxz), 1, 1), - shared=32*32*4, stream=self.queue) def floating_intensity(self, addr, w, I, fic): @@ -481,14 +736,13 @@ def floating_intensity(self, addr, w, I, fic): fic_tmp = self.gpu.fic_tmp ## math ## - x = np.int32(sh[1] * sh[2]) - z = np.int32(maxz) + xall = np.int32(maxz * sh[1] * sh[2]) bx = 1024 self.floating_intensity_cuda_step1(Imodel, I, w, num, den, - z, x, + xall, block=(bx, 1, 1), - grid=(int((x + bx - 1) // bx), 1, int(z)), + grid=(int((xall + bx - 1) // bx), 1, 1), stream=self.queue) self.error_reduce_cuda(num, fic, @@ -496,7 +750,6 @@ np.int32(num.shape[-2]), np.int32(num.shape[-1]), block=(32, 32, 1), grid=(int(maxz), 1, 1), - shared=32*32*4, stream=self.queue) self.error_reduce_cuda(den, fic_tmp, @@ -504,13 +757,13 @@ np.int32(den.shape[-2]), np.int32(den.shape[-1]), block=(32, 32, 1), 
grid=(int(maxz), 1, 1), - shared=32*32*4, stream=self.queue) self.floating_intensity_cuda_step2(fic_tmp, fic, Imodel, - z, x, - block=(bx, 1, 1), - grid=(int((x + bx - 1) // bx), 1, int(z)), + np.int32(Imodel.shape[-2]), + np.int32(Imodel.shape[-1]), + block=(32, 32, 1), + grid=(1, 1, int(maxz)), stream=self.queue) @@ -542,39 +795,61 @@ def main(self, b_aux, addr, w, I): class PoUpdateKernel(ab.PoUpdateKernel): - def __init__(self, queue_thread=None, denom_type=np.complex64): + def __init__(self, queue_thread=None, + math_type='float', accumulator_type='float'): super(PoUpdateKernel, self).__init__() # and now initialise the cuda - if denom_type == np.complex64: - dtype = 'complex' - elif denom_type == np.float32: - dtype = 'float' - else: - raise ValueError('only complex64 and float32 types supported') - self.dtype = dtype + if math_type not in ['double', 'float']: + raise ValueError('only float and double are supported for math_type') + if accumulator_type not in ['double', 'float']: + raise ValueError('only float and double are supported for accumulator_type') + self.math_type = math_type + self.accumulator_type = accumulator_type self.queue = queue_thread + self.norm = None + self.MAK = MaxAbs2Kernel(self.queue) self.ob_update_cuda = load_kernel("ob_update", { - 'DENOM_TYPE': dtype + 'IN_TYPE': 'float', + 'OUT_TYPE': 'float', + 'MATH_TYPE': self.math_type }) self.ob_update2_cuda = None # load_kernel("ob_update2") self.pr_update_cuda = load_kernel("pr_update", { - 'DENOM_TYPE': dtype + 'IN_TYPE': 'float', + 'OUT_TYPE': 'float', + 'MATH_TYPE': self.math_type }) self.pr_update2_cuda = None self.ob_update_ML_cuda = load_kernel("ob_update_ML", { - 'CTYPE': 'complex', - 'FTYPE': 'float' + 'IN_TYPE': 'float', + 'OUT_TYPE': 'float', + 'MATH_TYPE': self.math_type }) self.ob_update2_ML_cuda = None self.pr_update_ML_cuda = load_kernel("pr_update_ML", { - 'CTYPE': 'complex', - 'FTYPE': 'float' + 'IN_TYPE': 'float', + 'OUT_TYPE': 'float', + 'MATH_TYPE': self.math_type }) self.pr_update2_ML_cuda = None + self.ob_update_local_cuda = load_kernel("ob_update_local", { + 'IN_TYPE': 'float', + 'OUT_TYPE': 'float', + 'MATH_TYPE': self.math_type, + 'ACC_TYPE': self.accumulator_type + }) + self.pr_update_local_cuda = load_kernel("pr_update_local", { + 'IN_TYPE': 'float', + 'OUT_TYPE': 'float', + 'MATH_TYPE': self.math_type, + 'ACC_TYPE': self.accumulator_type + }) def ob_update(self, addr, ob, obn, pr, ex, atomics=True): obsh = [np.int32(ax) for ax in ob.shape] prsh = [np.int32(ax) for ax in pr.shape] + if obn.dtype != np.float32: + raise ValueError("Denominator must be float32 in current implementation") if atomics: if addr.shape[3] != 3 or addr.shape[2] != 5: @@ -595,12 +870,15 @@ def ob_update(self, addr, ob, obn, pr, ex, atomics=True): "NUM_MODES": obsh[0], "BDIM_X": 16, "BDIM_Y": 16, - 'DENOM_TYPE': self.dtype + 'IN_TYPE': 'float', + 'OUT_TYPE': 'float', + 'MATH_TYPE': self.math_type, + 'ACC_TYPE': self.accumulator_type }) grid = [int((x+15)//16) for x in ob.shape[-2:]] - grid = (grid[0], grid[1], int(1)) - self.ob_update2_cuda(prsh[-1], obsh[0], num_pods, obsh[-2], + grid = (grid[1], grid[0], int(1)) + self.ob_update2_cuda(prsh[-1], obsh[0], num_pods, obsh[-2], obsh[-1], prsh[0], np.int32(ex.shape[0]), np.int32(ex.shape[1]), @@ -611,6 +889,8 @@ def ob_update(self, addr, ob, obn, pr, ex, atomics=True): def pr_update(self, addr, pr, prn, ob, ex, atomics=True): obsh = [np.int32(ax) for ax in ob.shape] prsh = [np.int32(ax) for ax in pr.shape] + if prn.dtype != np.float32: + raise ValueError("Denominator 
must be float32 in current implementation") if atomics: if addr.shape[3] != 3 or addr.shape[2] != 5: raise ValueError('Address not in required shape for atomics pr_update') @@ -632,7 +912,10 @@ def pr_update(self, addr, pr, prn, ob, ex, atomics=True): "NUM_MODES": prsh[0], "BDIM_X": 16, "BDIM_Y": 16, - 'DENOM_TYPE': self.dtype + 'IN_TYPE': 'float', + 'OUT_TYPE': 'float', + 'MATH_TYPE': self.math_type, + 'ACC_TYPE': self.accumulator_type }) grid = [int((x+15)//16) for x in pr.shape[-2:]] @@ -645,17 +928,18 @@ def pr_update(self, addr, pr, prn, ob, ex, atomics=True): def ob_update_ML(self, addr, ob, pr, ex, fac=2.0, atomics=True): obsh = [np.int32(ax) for ax in ob.shape] prsh = [np.int32(ax) for ax in pr.shape] + exsh = [np.int32(ax) for ax in ex.shape] if atomics: if addr.shape[3] != 3 or addr.shape[2] != 5: raise ValueError('Address not in required shape for tiled ob_update') num_pods = np.int32(addr.shape[0] * addr.shape[1]) - self.ob_update_ML_cuda(ex, num_pods, prsh[1], prsh[2], + self.ob_update_ML_cuda(ex, num_pods, exsh[1], exsh[2], pr, prsh[0], prsh[1], prsh[2], ob, obsh[0], obsh[1], obsh[2], addr, - np.float32(fac), + np.float32(fac) if ex.dtype == np.complex64 else np.float64(fac), block=(32, 32, 1), grid=(int(num_pods), 1, 1), stream=self.queue) else: if addr.shape[0] != 5 or addr.shape[1] != 3: @@ -667,17 +951,20 @@ def ob_update_ML(self, addr, ob, pr, ex, fac=2.0, atomics=True): "NUM_MODES": obsh[0], "BDIM_X": 16, "BDIM_Y": 16, - 'CTYPE': 'complex', - 'FTYPE': 'float' + 'IN_TYPE': 'float', + 'OUT_TYPE': 'float', + 'MATH_TYPE': self.math_type, + 'ACC_TYPE': self.accumulator_type }) grid = [int((x+15)//16) for x in ob.shape[-2:]] - grid = (grid[0], grid[1], int(1)) - self.ob_update2_ML_cuda(prsh[-1], obsh[0], num_pods, obsh[-2], + grid = (grid[1], grid[0], int(1)) + self.ob_update2_ML_cuda(prsh[-1], obsh[0], num_pods, obsh[-2], obsh[-1], prsh[0], np.int32(ex.shape[0]), np.int32(ex.shape[1]), np.int32(ex.shape[2]), - ob, pr, ex, addr, np.float32(fac), + ob, pr, ex, addr, + np.float32(fac) if ex.dtype == np.complex64 else np.float64(fac), block=(16, 16, 1), grid=grid, stream=self.queue) def pr_update_ML(self, addr, pr, ob, ex, fac=2.0, atomics=False): @@ -691,7 +978,7 @@ def pr_update_ML(self, addr, pr, ob, ex, fac=2.0, atomics=False): pr, prsh[0], prsh[1], prsh[2], ob, obsh[0], obsh[1], obsh[2], addr, - np.float32(fac), + np.float32(fac) if ex.dtype == np.complex64 else np.float64(fac), block=(32, 32, 1), grid=(int(num_pods), 1, 1), stream=self.queue) else: if addr.shape[0] != 5 or addr.shape[1] != 3: @@ -702,29 +989,128 @@ def pr_update_ML(self, addr, pr, ob, ex, fac=2.0, atomics=False): "NUM_MODES": prsh[0], "BDIM_X": 16, "BDIM_Y": 16, - 'CTYPE': 'complex', - 'FTYPE': 'float' + 'IN_TYPE': 'float', + 'OUT_TYPE': 'float', + 'MATH_TYPE': self.math_type, + 'ACC_TYPE': self.accumulator_type }) grid = [int((x+15)//16) for x in pr.shape[-2:]] grid = (grid[0], grid[1], int(1)) self.pr_update2_ML_cuda(prsh[-1], obsh[-2], obsh[-1], prsh[0], obsh[0], num_pods, - pr, ob, ex, addr, np.float32(fac), + pr, ob, ex, addr, + np.float32(fac) if ex.dtype == np.complex64 else np.float64(fac), block=(16, 16, 1), grid=grid, stream=self.queue) + def ob_update_local(self, addr, ob, pr, ex, aux): + # lazy allocation of temporary 1-element array + if self.norm is None: + self.norm = gpuarray.empty((1,), dtype=np.float32) + self.MAK.max_abs2(pr, self.norm) + + obsh = [np.int32(ax) for ax in ob.shape] + prsh = [np.int32(ax) for ax in pr.shape] + exsh = [np.int32(ax) for ax in ex.shape] + # atomics version 
only + if addr.shape[3] != 3 or addr.shape[2] != 5: + raise ValueError('Address not in required shape for tiled pr_update') + num_pods = np.int32(addr.shape[0] * addr.shape[1]) + bx = 64 + by = 1 + self.ob_update_local_cuda(ex, aux, + exsh[0], exsh[1], exsh[2], + pr, + prsh[0], prsh[1], prsh[2], + self.norm, + ob, + obsh[0], obsh[1], obsh[2], + addr, + block=(bx, by, 1), + grid=(1, int((exsh[1] + by - 1)//by), int(num_pods)), + stream=self.queue) + + def pr_update_local(self, addr, pr, ob, ex, aux): + # lazy allocation of temporary 1-element array + if self.norm is None: + self.norm = gpuarray.empty((1,), dtype=np.float32) + self.MAK.max_abs2(ob, self.norm) + + obsh = [np.int32(ax) for ax in ob.shape] + prsh = [np.int32(ax) for ax in pr.shape] + exsh = [np.int32(ax) for ax in ex.shape] + # atomics version only + if addr.shape[3] != 3 or addr.shape[2] != 5: + raise ValueError('Address not in required shape for tiled pr_update') + num_pods = np.int32(addr.shape[0] * addr.shape[1]) + + bx = 64 + by = 1 + self.pr_update_local_cuda(ex, aux, + exsh[0], exsh[1], exsh[2], + pr, + prsh[0], prsh[1], prsh[2], + self.norm, + ob, + obsh[0], obsh[1], obsh[2], + addr, + block=(bx, by, 1), + grid=(1, int((exsh[1] + by - 1) // by), int(num_pods)), + stream=self.queue) + + + class PositionCorrectionKernel(ab.PositionCorrectionKernel): - def __init__(self, aux, nmodes, queue_thread=None): - super(PositionCorrectionKernel, self).__init__(aux, nmodes) + from ptypy.accelerate.cuda_pycuda import address_manglers + + # these are used by the self.setup method - replacing them with the GPU implementation + MANGLERS = { + 'Annealing': address_manglers.RandomIntMangler, + 'GridSearch': address_manglers.GridSearchMangler + } + + def __init__(self, *args, queue_thread=None, math_type='float', accumulate_type='float', **kwargs): + super(PositionCorrectionKernel, self).__init__(*args, **kwargs) + # make sure we set the right stream in the mangler + self.mangler.queue = queue_thread + if math_type not in ['float', 'double']: + raise ValueError('Only float or double math is supported') + if accumulate_type not in ['float', 'double']: + raise ValueError('Only float or double math is supported') + # add kernels + self.math_type = math_type + self.accumulate_type = accumulate_type self.queue = queue_thread self._ob_shape = None self._ob_id = None - self.fourier_error_cuda = load_kernel("fourier_error") - self.error_reduce_cuda = load_kernel("error_reduce") - self.build_aux_pc_cuda = load_kernel("build_aux_position_correction") - self.update_addr_and_error_state_cuda = load_kernel("update_addr_error_state") + self.fourier_error_cuda = load_kernel("fourier_error",{ + 'IN_TYPE': 'float', + 'OUT_TYPE': 'float', + 'MATH_TYPE': self.math_type + }) + self.error_reduce_cuda = load_kernel("error_reduce", { + 'IN_TYPE': 'float', + 'OUT_TYPE': 'float', + 'BDIM_X': 32, + 'BDIM_Y': 32, + 'ACC_TYPE': self.accumulate_type + }) + self.log_likelihood_cuda = load_kernel("log_likelihood", { + 'IN_TYPE': 'float', + 'OUT_TYPE': 'float', + 'MATH_TYPE': self.math_type + }, "log_likelihood.cu") + self.build_aux_pc_cuda = load_kernel("build_aux_position_correction", { + 'IN_TYPE': 'float', + 'OUT_TYPE': 'float', + 'MATH_TYPE': self.math_type + }) + self.update_addr_and_error_state_cuda = load_kernel("update_addr_error_state", { + 'IN_TYPE': 'float', + 'OUT_TYPE': 'float' + }) self.gpu = Adict() self.gpu.fdev = None @@ -736,13 +1122,16 @@ def allocate(self): def build_aux(self, b_aux, addr, ob, pr): obr, obc = self._cache_object_shape(ob) + sh = 
addr.shape + nmodes = sh[1] + maxz = sh[0] self.build_aux_pc_cuda(b_aux, pr, np.int32(pr.shape[1]), np.int32(pr.shape[2]), ob, obr, obc, addr, - block=(32, 32, 1), grid=(int(np.prod(addr.shape[:1])), 1, 1), stream=self.queue) + block=(32, 32, 1), grid=(int(maxz * nmodes), 1, 1), stream=self.queue) def fourier_error(self, f, addr, fmag, fmask, mask_sum): fdev = self.gpu.fdev @@ -772,22 +1161,23 @@ def error_reduce(self, addr, err_fmag): np.int32(self.fshape[2]), block=(32, 32, 1), grid=(int(err_fmag.shape[0]), 1, 1), - shared=32*32*4, stream=self.queue) - def update_addr_and_error_state_old(self, addr, error_state, mangled_addr, err_sum): - ''' - updates the addresses and err state vector corresponding to the smallest error. I think this can be done on the cpu - ''' - update_indices = err_sum < error_state - log(4, "updating %s indices" % np.sum(update_indices)) - print('update ind {}, addr {}, mangled {}'.format(update_indices.shape, addr.shape, mangled_addr.shape)) - addr_cpu = addr.get_async(self.queue) - self.queue.synchronize() - addr_cpu[update_indices] = mangled_addr[update_indices] - addr.set_async(ary=addr_cpu, stream=self.queue) - - error_state[update_indices] = err_sum[update_indices] + def log_likelihood(self, b_aux, addr, mag, mask, err_phot): + ferr = self.gpu.ferr + self.log_likelihood_cuda(np.int32(self.nmodes), + b_aux, + mask, + mag, + addr, + ferr, + np.int32(self.fshape[1]), + np.int32(self.fshape[2]), + block=(32, 32, 1), + grid=(int(mag.shape[0]), 1, 1), + stream=self.queue) + # TODO: we might want to move this call outside of here + self.error_reduce(addr, err_phot) def update_addr_and_error_state(self, addr, error_state, mangled_addr, err_sum): # assume all data is on GPU! diff --git a/ptypy/accelerate/cuda_pycuda/mem_utils.py b/ptypy/accelerate/cuda_pycuda/mem_utils.py index fdded3dfb..2f5917173 100644 --- a/ptypy/accelerate/cuda_pycuda/mem_utils.py +++ b/ptypy/accelerate/cuda_pycuda/mem_utils.py @@ -308,7 +308,7 @@ def add_data_block(self, nbytes=None): Returns ------- """ - if self.max is None or len(self)<=self.max: + if self.max is None or len(self) < self.max: if cuda support isn't enabled, the application simply crashes with a seg fault + +4) For NCCL peer-to-peer transfers, the EXCLUSIVE compute mode cannot be used. + It should be in DEFAULT mode. 
+ +""" + +import mpi4py +from pkg_resources import parse_version +import numpy as np +from pycuda import gpuarray +import pycuda.driver as cuda +from ptypy.utils import parallel +from ptypy.utils.verbose import logger, log +import os + +try: + from cupy.cuda import nccl + import cupy as cp +except ImportError: + nccl = None + +# properties to check which versions are available + +# use NCCL is it is available, and the user didn't override the +# default selection with environment variables +have_nccl = (nccl is not None) and \ + (not 'PTYPY_USE_CUDAMPI' in os.environ) and \ + (not 'PTYPY_USE_MPI' in os.environ) + +# At the moment, we require: +# the OpenMPI env var OMPI_MCA_opal_cuda_support to be set to true, +# mpi4py >= 3.1.0 +# pycuda with __cuda_array_interface__ +# and not setting the PTYPY_USE_MPI environment variable +# +# -> we ideally want to allow enabling support from a parameter in ptypy +have_cuda_mpi = "OMPI_MCA_opal_cuda_support" in os.environ and \ + os.environ["OMPI_MCA_opal_cuda_support"] == "true" and \ + parse_version(parse_version(mpi4py.__version__).base_version) >= parse_version("3.1.0") and \ + hasattr(gpuarray.GPUArray, '__cuda_array_interface__') and \ + not ('PTYPY_USE_MPI' in os.environ) + + +class MultiGpuCommunicatorBase: + """Base class for multi-GPU communicator options, to aggregate common bits""" + + def __init__(self): + self.rank = parallel.rank + self.ndev = parallel.size + + def allReduceSum(self, arr): + """Call MPI.all_reduce in-place, with array on GPU""" + # base class only checks properties of arrays + assert isinstance(arr, gpuarray.GPUArray), "Input must be a GPUArray" + + +class MultiGpuCommunicatorMpi(MultiGpuCommunicatorBase): + """Communicator for AllReduce that uses MPI on the CPU, i.e. D2H, allreduce, H2D""" + + def allReduceSum(self, arr): + """Call MPI.all_reduce in-place, with array on GPU""" + super().allReduceSum(arr) + + if parallel.MPIenabled: + # note: this creates a temporary CPU array + data = arr.get() + parallel.allreduce(data) + arr.set(data) + +class MultiGpuCommunicatorCudaMpi(MultiGpuCommunicatorBase): + + def allReduceSum(self, arr): + """Call MPI.all_reduce in-place, with array on GPU""" + + # Check if cuda array interface is available + if not hasattr(arr, '__cuda_array_interface__'): + raise RuntimeError("input array should have a cuda array interface") + + if parallel.MPIenabled: + comm = parallel.comm + comm.Allreduce(parallel.MPI.IN_PLACE, arr) + + +class MultiGpuCommunicatorNccl(MultiGpuCommunicatorBase): + + def __init__(self): + super().__init__() + + # Check if GPUs are in default mode + if cuda.Context.get_device().get_attributes()[cuda.device_attribute.COMPUTE_MODE] != cuda.compute_mode.DEFAULT: + raise RuntimeError("Compute mode must be default in order to use NCCL") + + # get a unique identifier for the NCCL communicator and + # broadcast it to all MPI processes (assuming one device per process) + if self.rank == 0: + self.id = nccl.get_unique_id() + else: + self.id = None + + self.id = parallel.bcast(self.id) + + self.com = nccl.NcclCommunicator(self.ndev, self.id, self.rank) + + def allReduceSum(self, arr): + """Call MPI.all_reduce in-place, with array on GPU""" + + buf = int(arr.gpudata) + count, datatype = self.__get_NCCL_count_dtype(arr) + + # no stream support here for now - it fails in NCCL when + # pycuda.Stream.handle is used for some unexplained reason + stream = cp.cuda.Stream.null.ptr + + self.com.allReduce(buf, buf, count, datatype, nccl.NCCL_SUM, stream) + + def __get_NCCL_count_dtype(self, 
arr): + if arr.dtype == np.complex64: + return arr.size*2, nccl.NCCL_FLOAT32 + elif arr.dtype == np.complex128: + return arr.size*2, nccl.NCCL_FLOAT64 + elif arr.dtype == np.float32: + return arr.size, nccl.NCCL_FLOAT32 + elif arr.dtype == np.float64: + return arr.size, nccl.NCCL_FLOAT64 + else: + raise ValueError("This dtype is not supported by NCCL.") + + +# pick the appropriate communicator depending on installed packages +def get_multi_gpu_communicator(use_nccl=True, use_cuda_mpi=True): + if have_nccl and use_nccl: + try: + comm = MultiGpuCommunicatorNccl() + log(4, "Using NCCL communicator") + return comm + except RuntimeError: + pass + if have_cuda_mpi and use_cuda_mpi: + try: + comm = MultiGpuCommunicatorCudaMpi() + log(4, "Using CUDA-aware MPI communicator") + return comm + except RuntimeError: + pass + comm = MultiGpuCommunicatorMpi() + log(4, "Using MPI communicator") + return comm \ No newline at end of file diff --git a/ptypy/engines/DM.py b/ptypy/engines/DM.py index 9b8340a63..46fa0a2bc 100644 --- a/ptypy/engines/DM.py +++ b/ptypy/engines/DM.py @@ -363,8 +363,7 @@ def object_update(self): # array and therefore underestimate the strength of the probe terms. cfact = self.p.object_inertia * self.mean_power if self.p.obj_smooth_std is not None: - logger.info( - 'Smoothing object, average cfact is %.2f' + log(4, 'Smoothing object, average cfact is %.2f' % np.mean(cfact).real) smooth_mfs = [0, self.p.obj_smooth_std, diff --git a/ptypy/engines/ML.py b/ptypy/engines/ML.py index b0bbaf678..f6009e9b8 100644 --- a/ptypy/engines/ML.py +++ b/ptypy/engines/ML.py @@ -19,14 +19,15 @@ from ..utils import parallel from .utils import Cnorm2, Cdot from . import register -from .base import PositionCorrectionEngine +from .base import BaseEngine from ..core.manager import Full, Vanilla, Bragg3dModel, BlockVanilla, BlockFull + __all__ = ['ML'] @register() -class ML(PositionCorrectionEngine): +class ML(BaseEngine): """ Maximum likelihood reconstruction engine. @@ -98,7 +99,7 @@ class ML(PositionCorrectionEngine): type = int lowlim = 0 help = Number of iterations before probe update starts - + """ SUPPORTED_MODELS = [Full, Vanilla, Bragg3dModel, BlockVanilla, BlockFull] @@ -153,7 +154,6 @@ def engine_initialize(self): """ Prepare for ML reconstruction. """ - super(ML, self).engine_initialize() # Object gradient and minimization direction self.ob_grad = self.ob.copy(self.ob.ID + '_grad', fill=0.) @@ -232,9 +232,10 @@ def engine_iterate(self, num=1): # probe/object rescaling if self.p.scale_precond: cn2_new_pr_grad = Cnorm2(new_pr_grad) + cn2_new_ob_grad = Cnorm2(new_ob_grad) if cn2_new_pr_grad > 1e-5: - scale_p_o = (self.p.scale_probe_object * Cnorm2(new_ob_grad) - / Cnorm2(new_pr_grad)) + scale_p_o = (self.p.scale_probe_object * cn2_new_ob_grad + / cn2_new_pr_grad) else: scale_p_o = self.p.scale_probe_object if self.scale_p_o is None: @@ -319,7 +320,6 @@ def engine_finalize(self): """ Delete temporary containers. """ - super(ML, self).engine_finalize() del self.ptycho.containers[self.ob_grad.ID] del self.ob_grad del self.ptycho.containers[self.ob_grad_new.ID] diff --git a/ptypy/engines/base.py b/ptypy/engines/base.py index 18f18b65e..1a6a49cdd 100644 --- a/ptypy/engines/base.py +++ b/ptypy/engines/base.py @@ -13,7 +13,7 @@ from .. 
import utils as u from ..utils import parallel from ..utils.verbose import logger, headerline, log -from .posref import AnnealingRefine +from .posref import AnnealingRefine, GridSearchRefine __all__ = ['BaseEngine', 'Base3dBraggEngine', 'DEFAULT_iter_info', 'PositionCorrectionEngine'] @@ -175,16 +175,16 @@ def support_constraint(self, storage=None): for s in self.pr.storages.values(): self.support_contraint(s) - # Real space - support = self._probe_support.get(storage.ID) - if support is not None: - storage.data *= support - # Fourier space support = self._probe_fourier_support.get(storage.ID) if support is not None: storage.data[:] = np.fft.ifft2(support * np.fft.fft2(storage.data)) + # Real space + support = self._probe_support.get(storage.ID) + if support is not None: + storage.data *= support + def iterate(self, num=None): """ Compute one or several iterations. @@ -314,6 +314,11 @@ class PositionCorrectionEngine(BaseEngine): type = Param, bool help = If True refine scan positions + [position_refinement.method] + default = Annealing + type = str + help = Annealing or GridSearch + [position_refinement.start] default = None type = int @@ -357,6 +362,11 @@ class PositionCorrectionEngine(BaseEngine): help = record movement of positions """ + POSREF_ENGINES = { + "Annealing": AnnealingRefine, + "GridSearch": GridSearchRefine + } + def __init__(self, ptycho_parent, pars): """ Position Correction engine. @@ -386,17 +396,17 @@ def engine_initialize(self): self.do_position_refinement = False else: self.do_position_refinement = True - log(3, "Initialising position refinement") + log(3, "Initialising position refinement (%s)" %self.p.position_refinement.method) # Enlarge object arrays, # This can be skipped though if the boundary is less important for name, s in self.ob.storages.items(): - s.padding = int(self.p.position_refinement.max_shift // np.max(s.psize)) - s.reformat() + s.padding = int(self.p.position_refinement.max_shift // np.max(s.psize)) + s.reformat() - # this could be some kind of dictionary lookup if we want to add more - self.position_refinement = AnnealingRefine(self.p.position_refinement, self.ob, metric=self.p.position_refinement.metric) - log(3, "Position refinement initialised") + # Choose position refinement engine from dictionary + PosrefEngine = self.POSREF_ENGINES[self.p.position_refinement.method] + self.position_refinement = PosrefEngine(self.p.position_refinement, self.ob, metric=self.p.position_refinement.metric) self.ptycho.citations.add_article(**self.position_refinement.citation_dictionary) if self.p.position_refinement.stop is None: self.p.position_refinement.stop = self.p.numiter diff --git a/ptypy/engines/posref.py b/ptypy/engines/posref.py index c0f12a857..af27cdaf1 100644 --- a/ptypy/engines/posref.py +++ b/ptypy/engines/posref.py @@ -41,14 +41,61 @@ def update_constraints(self, iteration): iteration : int The current iteration of the engine. ''' + start, end = self.p.start, self.p.stop + # Compute the maximum shift allowed at this iteration + self.max_shift_dist = self.p.amplitude * (end - iteration) / (end - start) - raise NotImplementedError('This method needs to be overridden in order to position correct') + def estimate_fourier_metric(self, di_view, obj): + ''' + Calculates error based on DM fourier error estimate. + + Parameters + ---------- + di_view : ptypy.core.classes.View + A diffraction view for which we wish to calculate the error. + + obj : numpy.ndarray + The current calculated object for which we wish to evaluate the error against. 
+ Returns + ------- + np.float + The calculated fourier error + ''' + af2 = np.zeros_like(di_view.data) + for name, pod in di_view.pods.items(): + af2 += pod.downsample(u.abs2(pod.fw(pod.probe*obj))) + return np.sum(di_view.pod.mask * (np.sqrt(af2) - np.sqrt(np.abs(di_view.data)))**2) / di_view.pod.mask.sum() + + def estimate_photon_metric(self, di_view, obj): + ''' + Calculates error based on reduced likelihood estimate. + + Parameters + ---------- + di_view : ptypy.core.classes.View + A diffraction view for which we wish to calculate the error. + + obj : numpy.ndarray + The current calculated object for which we wish to evaluate the error against. + Returns + ------- + np.float + The calculated photon error + ''' + af2 = np.zeros_like(di_view.data) + for name, pod in di_view.pods.items(): + af2 += pod.downsample(u.abs2(pod.fw(pod.probe*obj))) + return (np.sum(di_view.pod.mask * (af2 - di_view.data)**2 / (di_view.data + 1.)) / np.prod(af2.shape)) def cleanup(self): ''' Cleans up every iteration ''' + @property + def citation_dictionary(self): + return {} + class AnnealingRefine(PositionRefine): @@ -85,48 +132,6 @@ def __init__(self, position_refinement_parameters, Cobj, metric="fourier"): else: raise NotImplementedError("Metric %s is currently not implemented" %metric) - def estimate_fourier_metric(self, di_view, obj): - ''' - Calculates error based on DM fourier error estimate. - - Parameters - ---------- - di_view : ptypy.core.classes.View - A diffraction view for which we wish to calculate the error. - - obj : numpy.ndarray - The current calculated object for which we wish to evaluate the error against. - Returns - ------- - np.float - The calculated fourier error - ''' - af2 = np.zeros_like(di_view.data) - for name, pod in di_view.pods.items(): - af2 += pod.downsample(u.abs2(pod.fw(pod.probe*obj))) - return np.sum(di_view.pod.mask * (np.sqrt(af2) - np.sqrt(np.abs(di_view.data)))**2) - - def estimate_photon_metric(self, di_view, obj): - ''' - Calculates error based on reduced likelihood estimate. - - Parameters - ---------- - di_view : ptypy.core.classes.View - A diffraction view for which we wish to calculate the error. - - obj : numpy.ndarray - The current calculated object for which we wish to evaluate the error against. - Returns - ------- - np.float - The calculated fourier error - ''' - af2 = np.zeros_like(di_view.data) - for name, pod in di_view.pods.items(): - af2 += pod.downsample(u.abs2(pod.fw(pod.probe*obj))) - return (np.sum(di_view.pod.mask * (af2 - di_view.data)**2 / (di_view.data + 1.)) / np.prod(af2.shape)) - def update_view_position(self, di_view): ''' Refines the positions by the following algorithm: @@ -189,24 +194,118 @@ def update_view_position(self, di_view): error = new_error coord = new_coord log(4, "Position correction: %s, coord: %s, delta: %s" % (di_view.ID, coord, delta)) - + ob_view.coord = coord ob_view.storage.update_views(ob_view) return coord - initial_coord - def update_constraints(self, iteration): + @property + def citation_dictionary(self): + return { + "title" : 'An annealing algorithm to correct positioning errors in ptychography', + "author" : 'Maiden et al.', + "journal" : 'Ultramicroscopy', + "volume" : 120, + "year" : 2012, + "page" : 64, + "doi" : '10.1016/j.ultramic.2012.06.001', + "comment" : 'Position Refinement using annealing algorithm'} + +class GridSearchRefine(PositionRefine): + + def __init__(self, position_refinement_parameters, Cobj, metric="fourier"): + ''' + Grid Search Position Refinement. 
- Parameters ---------- - iteration : int - The current iteration of the engine. + position_refinement_parameters : ptypy.utils.parameters.Param + The parameter tree for the refinement + + Cobj : ptypy.core.classes.Container + The current object container + metric : str + "fourier" or "photon" ''' + super(GridSearchRefine, self).__init__(position_refinement_parameters) - start, end = self.p.start, self.p.stop + self.Cobj = Cobj # take a reference here. It would be cool if we could make this read-only or something - # Compute the maximum shift allowed at this iteration - self.max_shift_dist = self.p.amplitude * (end - iteration) / (end - start) + + # Updated before each iteration by self.update_constraints + self.max_shift_dist = None + + # Choose metric for fourier error + if metric == "fourier": + self.fourier_error = self.estimate_fourier_metric + elif metric == "photon": + self.fourier_error = self.estimate_photon_metric + else: + raise NotImplementedError("Metric %s is currently not implemented" %metric) + + def update_view_position(self, di_view): + ''' + Refines the positions by the following algorithm: + + Calculates all shifts in a given radius around the original position and calculates the fourier error. + If the fourier error decreased, the calculated position will be used as the new position. + + Parameters + ---------- + di_view : ptypy.core.classes.View + A diffraction view that we wish to refine. + + Returns + ------- + numpy.ndarray + A length 2 numpy array with the position increments for x and y co-ordinates respectively + ''' + # there might be more than one object view + ob_view = di_view.pod.ob_view + + initial_coord = ob_view.coord.copy() + coord = initial_coord + psize = ob_view.psize.copy() + + # if you cannot move far, do nothing + if np.max(psize) >= self.max_shift_dist: + return np.zeros((2,)) + + # This can be optimized by saving existing iteration fourier error... 
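+ # Score the unshifted position first: a candidate shift from the grid
+ # built below is only accepted if it strictly lowers this error.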
+ error = self.fourier_error(di_view, ob_view.data) + + max_shift_pix = self.max_shift_dist // np.min(psize) + max_bound_pix = self.p.max_shift // np.min(psize) + + # Create the search grid + deltas = np.mgrid[-max_shift_pix:max_shift_pix+1:1, + -max_shift_pix:max_shift_pix+1:1] + within_bound = (deltas[0]**2 + deltas[1]**2) < (max_bound_pix**2) + deltas = (deltas[:,within_bound] * np.min(psize)).T + + for i in range(deltas.shape[0]): + # Current shift + delta = deltas[i] + + # Move view to new position + new_coord = initial_coord + delta + ob_view.coord = new_coord + ob_view.storage.update_views(ob_view) + data = ob_view.data + + # catch bad slicing + if not np.allclose(data.shape, ob_view.shape): + continue + + new_error = self.fourier_error(di_view, data) + + if new_error < error: + # keep + error = new_error + coord = new_coord + log(4, "Position correction: %s, coord: %s, delta: %s" % (di_view.ID, coord, delta)) + + ob_view.coord = coord + ob_view.storage.update_views(ob_view) + return coord - initial_coord @property def citation_dictionary(self): @@ -218,4 +317,4 @@ def citation_dictionary(self): "year" : 2012, "page" : 64, "doi" : '10.1016/j.ultramic.2012.06.001', - "comment" : 'Position Refinement using annealing algorithm'} + "comment" : 'Position Refinement using annealing algorithm'} \ No newline at end of file diff --git a/ptypy/engines/utils.py b/ptypy/engines/utils.py index fadb012c9..39fcbc93c 100644 --- a/ptypy/engines/utils.py +++ b/ptypy/engines/utils.py @@ -152,7 +152,7 @@ def basic_fourier_update(diff_view, pbound=None, alpha=1., LL_error=True): for name, pod in diff_view.pods.items(): if not pod.active: continue - df = pod.bw(pod.upsample(fm) * f[name]) - pod.probe * pod.object + df = pod.bw(pod.upsample(fm) * f[name]) - alpha * pod.probe * pod.object + (alpha - 1) * pod.exit pod.exit += df err_exit += np.mean(u.abs2(df)) elif err_fmag > pbound: @@ -162,7 +162,7 @@ def basic_fourier_update(diff_view, pbound=None, alpha=1., LL_error=True): for name, pod in diff_view.pods.items(): if not pod.active: continue - df = pod.bw(pod.upsample(fm) * f[name]) - pod.probe * pod.object + df = pod.bw(pod.upsample(fm) * f[name]) - alpha * pod.probe * pod.object + (alpha - 1) * pod.exit pod.exit += df err_exit += np.mean(u.abs2(df)) else: @@ -170,7 +170,7 @@ def basic_fourier_update(diff_view, pbound=None, alpha=1., LL_error=True): for name, pod in diff_view.pods.items(): if not pod.active: continue - df = alpha * (pod.probe * pod.object - pod.exit) + df = (pod.probe * pod.object - pod.exit) pod.exit += df err_exit += np.mean(u.abs2(df)) diff --git a/ptypy/utils/array_utils.py b/ptypy/utils/array_utils.py index dbd7a2366..a6dc3ede9 100644 --- a/ptypy/utils/array_utils.py +++ b/ptypy/utils/array_utils.py @@ -54,9 +54,9 @@ def switch_orientation(A, orientation, center=None): o = 0 if orientation is None else orientation if np.isscalar(o): - o = [i=='1' for i in '%03d' % int(np.base_repr(o))] + o = [i == '1' for i in '%03d' % int(np.base_repr(o))] - assert len(o)==3 + assert len(o) == 3 # switch orientation if o[0]: axes = list(range(A.ndim - 2)) + [-1, -2] @@ -101,10 +101,11 @@ def rebin_2d(A, rebin=1): sh = np.asarray(A.shape[-2:]) newdim = sh // rebin if not (sh % rebin == 0).all(): - raise ValueError('Last two axes %s of input array `A` cannot be binned by %s' % (str(tuple(sh)),str(rebin))) + raise ValueError('Last two axes %s of input array `A` cannot be binned by %s' % (str(tuple(sh)), str(rebin))) else: return A.reshape(-1, newdim[0], rebin, newdim[1], rebin).mean(-1).mean(-2) + 
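As a quick numeric check of the reshape/mean trick used by rebin_2d above (plain numpy, mirroring the expression in the function):

import numpy as np

A = np.arange(16, dtype=float).reshape(4, 4)
# rebin=2 groups the last two axes into 2x2 blocks and averages each block,
# which is what A.reshape(-1, newdim[0], rebin, newdim[1], rebin) achieves
B = A.reshape(-1, 2, 2, 2, 2).mean(-1).mean(-2)[0]
# B -> [[ 2.5,  4.5],
#       [10.5, 12.5]]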
def crop_pad_symmetric_2d(A, newshape, center=None): """ Crops or pads Array `A` symmetrically along the last two axes `(-2,-1)` @@ -148,7 +149,8 @@ def crop_pad_symmetric_2d(A, newshape, center=None): return A, c + low -def rebin(a, *args,**kwargs): + +def rebin(a, *args, **kwargs): """ Rebin ndarray data into a smaller ndarray of the same rank whose dimensions are factors of the original dimensions. @@ -184,46 +186,52 @@ def rebin(a, *args,**kwargs): """ shape = a.shape lenShape = a.ndim - factor = np.asarray(shape)//np.asarray(args) + factor = np.asarray(shape) // np.asarray(args) evList = ['a.reshape('] + \ - ['args[%d],factor[%d],'%(i,i) for i in range(lenShape)] + \ - [')'] + ['.sum(%d)'%(i+1) for i in range(lenShape)] + \ - ['*( 1.'] + ['/factor[%d]'%i for i in range(lenShape)] + [')'] - if kwargs.get('verbose',False): + ['args[%d],factor[%d],' % (i, i) for i in range(lenShape)] + \ + [')'] + ['.sum(%d)' % (i + 1) for i in range(lenShape)] + \ + ['*( 1.'] + ['/factor[%d]' % i for i in range(lenShape)] + [')'] + if kwargs.get('verbose', False): print(''.join(evList)) return eval(''.join(evList)) + def _confine(A): """\ Doc TODO. """ - sh=np.asarray(A.shape)[1:] - A=A.astype(float) - m=np.reshape(sh,(len(sh),) + len(sh)*(1,)) - return (A+m//2.0) % m - m//2.0 + sh = np.asarray(A.shape)[1:] + A = A.astype(float) + m = np.reshape(sh, (len(sh),) + len(sh) * (1,)) + return (A + m // 2.0) % m - m // 2.0 -def _translate_to_pix(sh,center): + +def _translate_to_pix(sh, center): """\ Take arbitrary input and translate it to a pixel position with respect to sh. """ - sh=np.array(sh) + sh = np.array(sh) if not isstr(center): cen = np.asarray(center) % sh - elif center=='fftshift': - cen=sh//2.0 - elif center=='geometric': - cen=sh/2.0-0.5 - elif center=='fft': - cen=sh*0.0 + elif center == 'fftshift': + cen = sh // 2.0 + elif center == 'geometric': + cen = sh / 2.0 - 0.5 + elif center == 'fft': + cen = sh * 0.0 else: raise TypeError('Input %s not understood for center' % str(center)) return cen + + """ def center_2d(sh,center): return translate_to_pix(sh[-2:],expect2(center)) """ -def grids(sh,psize=None,center='geometric',FFTlike=True): + + +def grids(sh, psize=None, center='geometric', FFTlike=True): """\ ``q0,q1,... = grids(sh)`` returns centered coordinates for a N-dimensional array of shape sh (pixel units) @@ -258,14 +266,14 @@ def grids(sh,psize=None,center='geometric',FFTlike=True): ndarray The coordinate grids """ - sh=np.asarray(sh) + sh = np.asarray(sh) - cen = _translate_to_pix(sh,center) + cen = _translate_to_pix(sh, center) - grid=np.indices(sh).astype(float) - np.reshape(cen,(len(sh),) + len(sh)*(1,)) + grid = np.indices(sh).astype(float) - np.reshape(cen, (len(sh),) + len(sh) * (1,)) if FFTlike: - grid=_confine(grid) + grid = _confine(grid) if psize is None: return grid @@ -273,16 +281,17 @@ def grids(sh,psize=None,center='geometric',FFTlike=True): psize = np.asarray(psize) if psize.size == 1: psize = psize * np.ones((len(sh),)) - psize = np.asarray(psize).reshape( (len(sh),) + len(sh)*(1,)) + psize = np.asarray(psize).reshape((len(sh),) + len(sh) * (1,)) return grid * psize + def rectangle(grids, dims=None, ew=2): if dims is None: dims = (grids.shape[-2] / 2., grids.shape[-1] / 2.) 
v, h = dims V, H = grids - return (smooth_step(-np.abs(V) + v/2, ew) - * smooth_step(-np.abs(H) + h/2, ew)) + return (smooth_step(-np.abs(V) + v / 2, ew) + * smooth_step(-np.abs(H) + h / 2, ew)) def ellipsis(grids, dims=None, ew=2): @@ -291,9 +300,10 @@ def ellipsis(grids, dims=None, ew=2): v, h = dims V, H = grids return smooth_step( - 0.5 - np.sqrt(V**2/v**2 + H**2/h**2), ew/np.sqrt(v * h)) + 0.5 - np.sqrt(V ** 2 / v ** 2 + H ** 2 / h ** 2), ew / np.sqrt(v * h)) + -def zoom(c,*arg,**kwargs): +def zoom(c, *arg, **kwargs): """ Wrapper `scipy.ndimage.zoom `_ function and shares @@ -311,25 +321,27 @@ def zoom(c,*arg,**kwargs): numpy.ndarray Zoomed array """ - #if np.all(arg[0] == 1): + # if np.all(arg[0] == 1): # return c # from scipy.ndimage import zoom as _zoom if np.iscomplexobj(c): - return complex_overload(_zoom)(c,*arg,**kwargs) + return complex_overload(_zoom)(c, *arg, **kwargs) else: - return _zoom(c,*arg,**kwargs) + return _zoom(c, *arg, **kwargs) + c_zoom = zoom -c_zoom.__doc__='*Deprecated*, kept for backward compatibility only.\n\n' + zoom.__doc__ +c_zoom.__doc__ = '*Deprecated*, kept for backward compatibility only.\n\n' + zoom.__doc__ """ c_affine_transform=complex_overload(ndi.affine_transform) c_affine_transform.__doc__='*complex input*\n\n'+c_affine_transform.__doc__ """ -def shift_zoom(c,zoom,cen_old,cen_new,**kwargs): + +def shift_zoom(c, zoom, cen_old, cen_new, **kwargs): """ Move array from center `cen_old` to `cen_new` and perform a zoom `zoom`. @@ -359,39 +371,40 @@ def shift_zoom(c,zoom,cen_old,cen_new,**kwargs): numpy.ndarray Shifted and zoomed array """ - + from scipy.ndimage import affine_transform as at zoom = np.diag(zoom) - offset=np.asarray(cen_old)-np.asarray(cen_new).dot(zoom) + offset = np.asarray(cen_old) - np.asarray(cen_new).dot(zoom) if np.iscomplexobj(c): - return complex_overload(at)(c,zoom,offset,**kwargs) + return complex_overload(at)(c, zoom, offset, **kwargs) else: - return at(c,zoom,offset,**kwargs) + return at(c, zoom, offset, **kwargs) -def fill3D(A,B,offset=[0,0,0]): +def fill3D(A, B, offset=[0, 0, 0]): """ Fill 3-dimensional array A with B. """ - if A.ndim != 3 or B.ndim!=3: + if A.ndim != 3 or B.ndim != 3: raise ValueError('3D a numpy arrays expected') - Alim=np.array(A.shape) - Blim=np.array(B.shape) - off=np.array(offset) + Alim = np.array(A.shape) + Blim = np.array(B.shape) + off = np.array(offset) Ao = off.copy() - Ao[Ao<0]=0 + Ao[Ao < 0] = 0 Bo = -off.copy() - Bo[Bo<0]=0 - print(Ao,Bo) + Bo[Bo < 0] = 0 if (Bo > Blim).any() or (Ao > Alim).any(): print("misfit") pass else: - A[Ao[0]:min(off[0]+Blim[0],Alim[0]),Ao[1]:min(off[1]+Blim[1],Alim[1]),Ao[2]:min(off[2]+Blim[2],Alim[2])] \ - =B[Bo[0]:min(Alim[0]-off[0],Blim[0]),Bo[1]:min(Alim[1]-off[1],Blim[1]),Bo[2]:min(Alim[2]-off[2],Blim[2])] + A[Ao[0]:min(off[0] + Blim[0], Alim[0]), Ao[1]:min(off[1] + Blim[1], Alim[1]), + Ao[2]:min(off[2] + Blim[2], Alim[2])] \ + = B[Bo[0]:min(Alim[0] - off[0], Blim[0]), Bo[1]:min(Alim[1] - off[1], Blim[1]), + Bo[2]:min(Alim[2] - off[2], Blim[2])] -def mirror(A,axis=-1): +def mirror(A, axis=-1): """ Mirrors array `A` along one axis `axis` @@ -409,9 +422,10 @@ def mirror(A,axis=-1): A view to the mirrored array. """ - return np.flipud(np.asarray(A).swapaxes(axis,0)).swapaxes(0,axis) + return np.flipud(np.asarray(A).swapaxes(axis, 0)).swapaxes(0, axis) + -def pad_lr(A,axis,l,r,fillpar=0.0, filltype='scalar'): +def pad_lr(A, axis, l, r, fillpar=0.0, filltype='scalar'): """ Pads ndarray `A` orthogonal to `axis` with `l` layers (pixels,lines,planes,...) 
on low side an `r` layers on high side. @@ -445,62 +459,61 @@ def pad_lr(A,axis,l,r,fillpar=0.0, filltype='scalar'): crop_pad crop_pad_symmetric_2d """ - fsh=np.array(A.shape) - if l>fsh[axis]: #rare case - l-=fsh[axis] - A=pad_lr(A,axis,fsh[axis],0,fillpar, filltype) - return pad_lr(A,axis,l,r,fillpar, filltype) - elif r>fsh[axis]: - r-=fsh[axis] - A=pad_lr(A,axis,0,fsh[axis],fillpar, filltype) - return pad_lr(A,axis,l,r,fillpar, filltype) - elif filltype=='mirror': - left=mirror(np.split(A,[l],axis)[0],axis) - right=mirror(np.split(A,[A.shape[axis]-r],axis)[1],axis) - elif filltype=='periodic': - right=np.split(A,[r],axis)[0] - left=np.split(A,[A.shape[axis]-l],axis)[1] - elif filltype=='project': - fsh[axis]=l - left=np.ones(fsh,A.dtype)*np.split(A,[1],axis)[0] - fsh[axis]=r - right=np.ones(fsh,A.dtype)*np.split(A,[A.shape[axis]-1],axis)[1] - if filltype=='scalar' or l==0: - fsh[axis]=l - left=np.ones(fsh,A.dtype)*fillpar - if filltype=='scalar' or r==0: - fsh[axis]=r - right=np.ones(fsh,A.dtype)*fillpar - if filltype=='custom': - left=fillpar[0].astype(A.dtype) - right=fillpar[1].astype(A.dtype) - return np.concatenate((left,A,right),axis=axis) - - -def _roll_from_pixcenter(sh,center): + fsh = np.array(A.shape) + if l > fsh[axis]: # rare case + l -= fsh[axis] + A = pad_lr(A, axis, fsh[axis], 0, fillpar, filltype) + return pad_lr(A, axis, l, r, fillpar, filltype) + elif r > fsh[axis]: + r -= fsh[axis] + A = pad_lr(A, axis, 0, fsh[axis], fillpar, filltype) + return pad_lr(A, axis, l, r, fillpar, filltype) + elif filltype == 'mirror': + left = mirror(np.split(A, [l], axis)[0], axis) + right = mirror(np.split(A, [A.shape[axis] - r], axis)[1], axis) + elif filltype == 'periodic': + right = np.split(A, [r], axis)[0] + left = np.split(A, [A.shape[axis] - l], axis)[1] + elif filltype == 'project': + fsh[axis] = l + left = np.ones(fsh, A.dtype) * np.split(A, [1], axis)[0] + fsh[axis] = r + right = np.ones(fsh, A.dtype) * np.split(A, [A.shape[axis] - 1], axis)[1] + if filltype == 'scalar' or l == 0: + fsh[axis] = l + left = np.ones(fsh, A.dtype) * fillpar + if filltype == 'scalar' or r == 0: + fsh[axis] = r + right = np.ones(fsh, A.dtype) * fillpar + if filltype == 'custom': + left = fillpar[0].astype(A.dtype) + right = fillpar[1].astype(A.dtype) + return np.concatenate((left, A, right), axis=axis) + + +def _roll_from_pixcenter(sh, center): """\ returns array of ints as input for np.roll use np.roll(A,-roll_from_pixcenter(sh,cen)[ax],ax) to put 'cen' in geometric center of array A """ - sh=np.array(sh) + sh = np.array(sh) if center != None: - if center=='fftshift': - cen=sh//2.0 - elif center=='geometric': - cen=sh/2.0-0.5 - elif center=='fft': - cen=sh*0.0 + if center == 'fftshift': + cen = sh // 2.0 + elif center == 'geometric': + cen = sh / 2.0 - 0.5 + elif center == 'fft': + cen = sh * 0.0 elif center is not None: - cen=sh*np.asarray(center) % sh - 0.5 + cen = sh * np.asarray(center) % sh - 0.5 - roll=np.ceil(cen - sh/2.0) % sh + roll = np.ceil(cen - sh / 2.0) % sh else: - roll=np.zeros_like(sh) + roll = np.zeros_like(sh) return roll.astype(int) - -def crop_pad_axis(A,hplanes,axis=-1,roll=0,fillpar=0.0, filltype='scalar'): +def crop_pad_axis(A, hplanes, axis=-1, roll=0, fillpar=0.0, filltype='scalar'): """ Crops or pads a volume array `A` at beginning and end of axis `axis` with a number of hyperplanes specified by `hplanes` @@ -573,37 +586,36 @@ def crop_pad_axis(A,hplanes,axis=-1,roll=0,fillpar=0.0, filltype='scalar'): >>> B=crop_pad_axis(V,(3,-2),1,filltype='mirror') """ if 
np.isscalar(hplanes): - hplanes=int(hplanes) - r=np.abs(hplanes) // 2 * np.sign(hplanes) - l=hplanes - r - elif len(hplanes)==2: - l=int(hplanes[0]) - r=int(hplanes[1]) + hplanes = int(hplanes) + r = np.abs(hplanes) // 2 * np.sign(hplanes) + l = hplanes - r + elif len(hplanes) == 2: + l = int(hplanes[0]) + r = int(hplanes[1]) else: raise RuntimeError('unsupported input for \'hplanes\'') - if roll!=0: - A=np.roll(A,-roll,axis=axis) - - if l<=0 and r<=0: - A=np.split(A,[-l,A.shape[axis]+r],axis)[1] - elif l>0 and r>0: - A=pad_lr(A,axis,l,r,fillpar,filltype) - elif l>0 and r<=0: - A=pad_lr(A,axis,l,0,fillpar,filltype) - A=np.split(A,[0,A.shape[axis]+r],axis)[1] - elif l<=0 and r>0: - A=pad_lr(A,axis,0,r,fillpar,filltype) - A=np.split(A,[-l,A.shape[axis]],axis)[1] - - - if roll!=0: - return np.roll(A,roll+r,axis=axis) + if roll != 0: + A = np.roll(A, -roll, axis=axis) + + if l <= 0 and r <= 0: + A = np.split(A, [-l, A.shape[axis] + r], axis)[1] + elif l > 0 and r > 0: + A = pad_lr(A, axis, l, r, fillpar, filltype) + elif l > 0 and r <= 0: + A = pad_lr(A, axis, l, 0, fillpar, filltype) + A = np.split(A, [0, A.shape[axis] + r], axis)[1] + elif l <= 0 and r > 0: + A = pad_lr(A, axis, 0, r, fillpar, filltype) + A = np.split(A, [-l, A.shape[axis]], axis)[1] + + if roll != 0: + return np.roll(A, roll + r, axis=axis) else: return A -def crop_pad(A,hplane_list,axes=None,cen=None,fillpar=0.0,filltype='scalar'): +def crop_pad(A, hplane_list, axes=None, cen=None, fillpar=0.0, filltype='scalar'): """\ Crops or pads a volume array `A` with a number of hyperplanes according to parameters in `hplanes` Wrapper for crop_pad_axis. @@ -660,14 +672,13 @@ def crop_pad(A, hplane_list, axes=None, cen=None, fillpar=0.0, filltype='scalar'): """ if axes is None: - axes=np.arange(len(hplane_list))-len(hplane_list) - elif not(len(axes)==len(hplane_list)): + axes = np.arange(len(hplane_list)) - len(hplane_list) + elif not (len(axes) == len(hplane_list)): raise RuntimeError('if axes is specified, hplane_list has to be same length as axes') - sh=np.array(A.shape) - roll = _roll_from_pixcenter(sh,cen) + sh = np.array(A.shape) + roll = _roll_from_pixcenter(sh, cen) - for ax,cut in zip(axes,hplane_list): - A=crop_pad_axis(A,cut,ax,roll[ax],fillpar,filltype) + for ax, cut in zip(axes, hplane_list): + A = crop_pad_axis(A, cut, ax, roll[ax], fillpar, filltype) return A - diff --git a/setup.py b/setup.py index 43940038c..83d5b9a89 100644 --- a/setup.py +++ b/setup.py @@ -1,7 +1,11 @@ #!/usr/bin/env python +# we should aim to remove the distutils dependency +import distutils import setuptools #, setuptools.command.build_ext from distutils.core import setup +import os +import sys CLASSIFIERS = """\ Development Status :: 3 - Alpha @@ -62,6 +66,38 @@ def write_version_py(filename='ptypy/version.py'): except: vers = VERSION +ext_modules = [] +cmdclass = {} +# filtered Cuda FFT extension module +""" +Alternative options for this switch: + +1. Put the cufft extension module as a separate python package with its own setup.py and + put an optional dependency into ptypy (extras_require={ "cufft": ["pybind11"] }), so that + when users do pip install ptypy it installs it without that dependency, and if users do + pip install ptypy[cufft] it installs the optional dependency module + +2. Use an environment variable to control the setting, as sqlalchemy does for its C extensions, + or detect if cuda is available on the system and enable it in this case, etc.
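+ +As a rough illustration of option 1 (a sketch only, not part of this change; the package name "ptypy-cufft" is hypothetical), ptypy's own setup() would stay free of compiled dependencies and declare + + extras_require={"cufft": ["ptypy-cufft"]} + +so that pip install ptypy[cufft] pulls in the separately packaged extension while a plain pip install ptypy does not.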
+""" +try: + from extensions import locate_cuda # this raises an error if pybind11 is not available + CUDA = locate_cuda() # this raises an error if CUDA is not available + from extensions import CustomBuildExt + cufft_dir = os.path.join('ptypy', 'accelerate', 'cuda_pycuda', 'cuda', 'filtered_fft') + ext_modules.append( + distutils.core.Extension("ptypy.filtered_cufft", + sources=[os.path.join(cufft_dir, "module.cpp"), + os.path.join(cufft_dir, "filtered_fft.cu")] + ) + ) + cmdclass = {"build_ext": CustomBuildExt} + EXTBUILD_MESSAGE = "ptypy has been successfully installed with the pre-compiled cufft extension.\n" +except: + EXTBUILD_MESSAGE = '*' * 75 + "\n" + EXTBUILD_MESSAGE += "ptypy has been installed without the pre-compiled cufft extension.\n" + EXTBUILD_MESSAGE += "If you require cufft, make sure to have CUDA and pybind11 installed.\n" + EXTBUILD_MESSAGE += '*' * 75 + "\n" exclude_packages = [] package_list = setuptools.find_packages(exclude=exclude_packages) @@ -74,12 +110,15 @@ def write_version_py(filename='ptypy/version.py'): package_dir={'ptypy': 'ptypy'}, packages=package_list, package_data={'ptypy': ['resources/*',], - 'ptypy.accelerate.cuda_pycuda.cuda': ['*.cu'], - 'ptypy.accelerate.cuda_pycuda.cuda.filtered_fft': ['*.hpp', '*.cpp', 'Makefile', '*.cu', '*.h']}, + 'ptypy.accelerate.cuda_pycuda.cuda': ['*.cu']}, scripts=['scripts/ptypy.plot', 'scripts/ptypy.inspect', 'scripts/ptypy.plotclient', 'scripts/ptypy.new', 'scripts/ptypy.csv2cp', 'scripts/ptypy.run'], + ext_modules=ext_modules, + cmdclass=cmdclass ) + +print(EXTBUILD_MESSAGE) \ No newline at end of file diff --git a/templates/minimal_prep_and_run_DM_pycuda.py b/templates/minimal_prep_and_run_DM_pycuda.py index 269e3dd42..976a8b0b8 100644 --- a/templates/minimal_prep_and_run_DM_pycuda.py +++ b/templates/minimal_prep_and_run_DM_pycuda.py @@ -16,7 +16,9 @@ p.io = u.Param() p.io.home = "~/dumps/ptypy/" p.io.autosave = u.Param(active=True) -p.io.autoplot = u.Param(active=False) +p.io.autoplot = u.Param(active=True) +p.io.interaction = u.Param(active=True) +p.io.interaction.client = u.Param(poll_timeout=1) # max 200 frames (128x128px) of diffraction data p.scans = u.Param() p.scans.MF = u.Param() diff --git a/templates/minimal_prep_and_run_DR_pycuda.py b/templates/minimal_prep_and_run_DR_pycuda.py new file mode 100644 index 000000000..618616320 --- /dev/null +++ b/templates/minimal_prep_and_run_DR_pycuda.py @@ -0,0 +1,56 @@ +""" +This script is a test for ptychographic reconstruction in the absence +of actual data. It uses the test Scan class +`ptypy.core.data.MoonFlowerScan` to provide "data". +""" + +from ptypy.core import Ptycho +from ptypy import utils as u +from ptypy.accelerate.cuda_pycuda.engines import DR_pycuda +p = u.Param() + +# for verbose output +p.verbose_level = 3 + +# Frames per block +p.frames_per_block = 200 + +# set home path +p.io = u.Param() +p.io.home = "/tmp/ptypy/" +p.io.autosave = u.Param(active=False) +p.io.interaction = u.Param(active=False) +p.io.interaction.client = u.Param() +p.io.interaction.client.poll_timeout = 1 + +# max 200 frames (128x128px) of diffraction data +p.scans = u.Param() +p.scans.MF = u.Param() +# now you have to specify which ScanModel to use with scans.XX.name, +# just as you have to give 'name' for engines and PtyScan subclasses. 
+p.scans.MF.name = 'BlockFull' +p.scans.MF.data = u.Param() +p.scans.MF.data.name = 'MoonFlowerScan' +p.scans.MF.data.shape = 128 +p.scans.MF.data.num_frames = 200 +p.scans.MF.data.save = None + +# position distance in fraction of illumination frame +p.scans.MF.data.density = 0.2 +# total number of photons in the empty beam +p.scans.MF.data.photons = 1e8 +# Gaussian FWHM of possible detector blurring +p.scans.MF.data.psf = 0.0 +p.scans.MF.coherence = u.Param() +p.scans.MF.coherence.num_probe_modes = 3 + +# attach a reconstruction engine +p.engines = u.Param() +p.engines.engine00 = u.Param() +p.engines.engine00.name = 'DR_pycuda' +p.engines.engine00.numiter = 100 +p.engines.engine00.alpha = 0 # alpha=0, tau=1 behaves like ePIE +p.engines.engine00.tau = 1 + +# prepare and run +P = Ptycho(p, level=5) diff --git a/templates/minimal_prep_and_run_DR_pycuda_stream.py b/templates/minimal_prep_and_run_DR_pycuda_stream.py new file mode 100644 index 000000000..38c5157a0 --- /dev/null +++ b/templates/minimal_prep_and_run_DR_pycuda_stream.py @@ -0,0 +1,59 @@ +""" +This script is a test for ptychographic reconstruction in the absence +of actual data. It uses the test Scan class +`ptypy.core.data.MoonFlowerScan` to provide "data". +""" + +from ptypy.core import Ptycho +from ptypy import utils as u +from ptypy.accelerate.cuda_pycuda.engines import DR_pycuda_stream, DR_pycuda +DR_pycuda_stream.MAX_BLOCKS = 3 +p = u.Param() + +# for verbose output +p.verbose_level = 3 + +# Frames per block +p.frames_per_block = 20 + +# set home path +p.io = u.Param() +p.io.home = "/tmp/ptypy/" +p.io.autosave = u.Param(active=False) +p.io.interaction = u.Param(active=False) +p.io.interaction.client = u.Param() +p.io.interaction.client.poll_timeout = 1 + +# max 200 frames (128x128px) of diffraction data +p.scans = u.Param() +p.scans.MF = u.Param() +# now you have to specify which ScanModel to use with scans.XX.name, +# just as you have to give 'name' for engines and PtyScan subclasses. +p.scans.MF.name = 'BlockFull' +p.scans.MF.data = u.Param() +p.scans.MF.data.name = 'MoonFlowerScan' +p.scans.MF.data.shape = 384 +p.scans.MF.data.num_frames = 120 +p.scans.MF.data.save = None + +p.scans.MF.illumination = u.Param(diversity=None) +# position distance in fraction of illumination frame +p.scans.MF.data.density = 0.2 +# total number of photons in the empty beam +p.scans.MF.data.photons = 1e8 +# Gaussian FWHM of possible detector blurring +p.scans.MF.data.psf = 0.0 +p.scans.MF.coherence = u.Param() +p.scans.MF.coherence.num_probe_modes = 3 + +# attach a reconstruction engine +p.engines = u.Param() +p.engines.engine00 = u.Param() +p.engines.engine00.name = 'DR_pycuda_stream' +p.engines.engine00.numiter = 20 +p.engines.engine00.numiter_contiguous = 10 +p.engines.engine00.alpha = 0 # alpha=0, tau=1 behaves like ePIE +p.engines.engine00.tau = 1 + +# prepare and run +P = Ptycho(p, level=5) diff --git a/templates/minimal_prep_and_run_DR_serial.py b/templates/minimal_prep_and_run_DR_serial.py new file mode 100644 index 000000000..a9c3c04ba --- /dev/null +++ b/templates/minimal_prep_and_run_DR_serial.py @@ -0,0 +1,58 @@ +""" +This script is a test for ptychographic reconstruction in the absence +of actual data. It uses the test Scan class +`ptypy.core.data.MoonFlowerScan` to provide "data".
+""" + +from ptypy.core import Ptycho +from ptypy import utils as u +from ptypy.accelerate.base.engines import DR_serial +p = u.Param() + +# for verbose output +p.verbose_level = 3 + +# Frames per block +p.frames_per_block = 200 + +# set home path +p.io = u.Param() +p.io.home = "/tmp/ptypy/" +p.io.autosave = u.Param(active=False) +p.io.interaction = u.Param(active=False) +p.io.interaction.client = u.Param() +p.io.interaction.client.poll_timeout = 1 + +# max 200 frames (128x128px) of diffraction data +p.scans = u.Param() +p.scans.MF = u.Param() +# now you have to specify which ScanModel to use with scans.XX.name, +# just as you have to give 'name' for engines and PtyScan subclasses. +p.scans.MF.name = 'BlockFull' +p.scans.MF.data = u.Param() +p.scans.MF.data.name = 'MoonFlowerScan' +p.scans.MF.data.shape = 128 +p.scans.MF.data.num_frames = 200 +p.scans.MF.data.save = None + +# position distance in fraction of illumination frame +p.scans.MF.data.density = 0.2 +# total number of photons in the empty beam +p.scans.MF.data.photons = 1e8 +# Gaussian FWHM of possible detector blurring +p.scans.MF.data.psf = 0.0 +p.scans.MF.coherence = u.Param() +p.scans.MF.coherence.num_probe_modes = 3 + +# attach a reconstruction engine +p.engines = u.Param() +p.engines.engine00 = u.Param() +p.engines.engine00.name = 'DR_serial' +p.engines.engine00.numiter = 100 +p.engines.engine00.alpha = 0 # alpha=0, tau=1 behaves like ePIE +p.engines.engine00.tau = 1 +#p.engines.engine00.rescale_probe = False +#p.engines.engine00.fourier_power_bound = 0.0 + +# prepare and run +P = Ptycho(p, level=5) diff --git a/templates/minimal_prep_and_run_ML_pycuda.py b/templates/minimal_prep_and_run_ML_pycuda.py index a66f39825..4b0dd5f51 100644 --- a/templates/minimal_prep_and_run_ML_pycuda.py +++ b/templates/minimal_prep_and_run_ML_pycuda.py @@ -11,7 +11,7 @@ p = u.Param() # for verbose output -p.verbose_level = 2 +p.verbose_level = 3 p.frames_per_block = 400 # set home path p.io = u.Param() @@ -27,7 +27,7 @@ p.scans.MF.data= u.Param() p.scans.MF.data.name = 'MoonFlowerScan' p.scans.MF.data.shape = 128 -p.scans.MF.data.num_frames = 600 +p.scans.MF.data.num_frames = 100 p.scans.MF.data.save = None p.scans.MF.illumination = u.Param(diversity=None) @@ -43,15 +43,14 @@ p.engines = u.Param() p.engines.engine00 = u.Param() p.engines.engine00.name = 'ML_pycuda' -p.engines.engine00.numiter = 10 +p.engines.engine00.numiter = 300 p.engines.engine00.numiter_contiguous = 5 p.engines.engine00.reg_del2 = True # Whether to use a Gaussian prior (smoothing) regularizer p.engines.engine00.reg_del2_amplitude = 1. # Amplitude of the Gaussian prior if used -p.engines.engine00.floating_intensities = True - +p.engines.engine00.scale_precond = True +p.engines.engine00.smooth_gradient = 20. +p.engines.engine00.smooth_gradient_decay = 1/50.
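+# Note: smooth_gradient above is the starting width of the gradient-smoothing +# preconditioner and smooth_gradient_decay shrinks that width every iteration +# (see the ML engine parameter documentation for details).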
+p.engines.engine00.floating_intensities = False # prepare and run P = Ptycho(p,level=5) -#P.run() -P.print_stats() -#u.pause(10) diff --git a/templates/minimal_prep_and_run_probe_modes.py b/templates/minimal_prep_and_run_probe_modes.py index 8dbcb4dc4..17d358b6b 100644 --- a/templates/minimal_prep_and_run_probe_modes.py +++ b/templates/minimal_prep_and_run_probe_modes.py @@ -14,7 +14,10 @@ # set home path p.io = u.Param() p.io.home = "/tmp/ptypy/" -p.io.autosave = None +p.io.autosave = u.Param(active=False) +p.io.interaction = u.Param(active=True) +p.io.interaction.client = u.Param() +p.io.interaction.client.poll_timeout = 1 # max 200 frames (128x128px) of diffraction data p.scans = u.Param() diff --git a/templates/minimal_prep_and_run_resample_DM.py b/templates/minimal_prep_and_run_resample_DM.py index 08c12540c..b06281223 100644 --- a/templates/minimal_prep_and_run_resample_DM.py +++ b/templates/minimal_prep_and_run_resample_DM.py @@ -14,7 +14,7 @@ # set home path p.io = u.Param() p.io.home = "/tmp/ptypy/" -p.io.autosave = None +p.io.autosave = u.Param(active=False) # max 200 frames (128x128px) of diffraction data p.scans = u.Param() diff --git a/templates/minimal_prep_and_run_resample_ML.py b/templates/minimal_prep_and_run_resample_ML.py index f0d5619f9..2edbb8bcc 100644 --- a/templates/minimal_prep_and_run_resample_ML.py +++ b/templates/minimal_prep_and_run_resample_ML.py @@ -15,7 +15,7 @@ # set home path p.io = u.Param() p.io.home = "/tmp/ptypy/" -p.io.autosave = None +p.io.autosave = u.Param(active=False) #p.io.autoplot = u.Param() #p.io.autoplot.dump = True #p.io.autoplot = False diff --git a/templates/position_refinement.py b/templates/position_refinement_DM.py similarity index 80% rename from templates/position_refinement.py rename to templates/position_refinement_DM.py index c3a348c24..052b4b679 100644 --- a/templates/position_refinement.py +++ b/templates/position_refinement_DM.py @@ -15,7 +15,8 @@ # set home path p.io = u.Param() p.io.home = "/tmp/ptypy/" -p.io.autosave = u.Param() +p.io.autosave = u.Param(active=False) +p.io.interaction = u.Param(active=False) # max 200 frames (128x128px) of diffraction data p.scans = u.Param() @@ -41,15 +42,15 @@ p.engines.engine00 = u.Param() p.engines.engine00.name = 'DM' p.engines.engine00.probe_support = 1 -# p.engines.engine00.probe_center_tol = 0.5 p.engines.engine00.numiter = 1000 p.engines.engine00.position_refinement = u.Param() p.engines.engine00.position_refinement.start = 50 -p.engines.engine00.position_refinement.stop = 990 +p.engines.engine00.position_refinement.stop = 950 p.engines.engine00.position_refinement.interval = 10 p.engines.engine00.position_refinement.nshifts = 32 -p.engines.engine00.position_refinement.amplitude = 1e-6 -p.engines.engine00.position_refinement.max_shift = 2e-6 +p.engines.engine00.position_refinement.amplitude = 5e-7 +p.engines.engine00.position_refinement.max_shift = 1e-6 +p.engines.engine00.position_refinement.method = "GridSearch" # prepare and run P = Ptycho(p, level=4) @@ -58,26 +59,24 @@ a = 0. 
coords = [] +coords_start = [] for pname, pod in P.pods.items(): + # Save real position coords.append(np.copy(pod.ob_view.coord)) before = pod.ob_view.coord psize = pod.pr_view.psize - # print(pname) - # print(before) perturbation = psize * ((3e-7 * np.array([np.sin(a), np.cos(a)])) // psize) - new_coord = before + perturbation # make sure integer number of pixels shift - - pod.ob_view.coord = new_coord - - #pod.diff *= np.random.uniform(0.1,1)y + coords_start.append(np.copy(pod.ob_view.coord)) + #pod.diff *= np.random.uniform(0.1,1) a += 4. np.savetxt("positions_theory.txt", coords) +np.savetxt("positions_start.txt", coords_start) P.obj.reformat() - # Run P.run() +P.finalize() diff --git a/templates/position_refinement_DM_pycuda.py b/templates/position_refinement_DM_pycuda.py new file mode 100644 index 000000000..ac51ef337 --- /dev/null +++ b/templates/position_refinement_DM_pycuda.py @@ -0,0 +1,93 @@ +""" +This script is a test for ptychographic reconstruction in the absence +of actual data. It uses the test Scan class +`ptypy.core.data.MoonFlowerScan` to provide "data". +""" + +import numpy as np +from ptypy.core import Ptycho +from ptypy import utils as u + +from ptypy.accelerate.cuda_pycuda.engines import DM_pycuda_stream, DM_pycuda_streams, DM_pycuda + +p = u.Param() + +# for verbose output +p.verbose_level = 3 +p.frames_per_block = 100 +# set home path +p.io = u.Param() +p.io.home = "/tmp/ptypy/" +p.io.autosave = u.Param(active=True, interval=500) +p.io.autoplot = u.Param(active=False)#True, interval=100) + +# max 200 frames (128x128px) of diffraction data +p.scans = u.Param() +p.scans.MF = u.Param() +# now you have to specify which ScanModel to use with scans.XX.name, +# just as you have to give 'name' for engines and PtyScan subclasses. +p.scans.MF.name = 'BlockFull' # or 'Full' +p.scans.MF.data = u.Param() +p.scans.MF.data.name = 'MoonFlowerScan' +p.scans.MF.data.shape = 128 +p.scans.MF.data.num_frames = 200 +p.scans.MF.data.save = None + +p.scans.MF.illumination = u.Param(diversity=None) +p.scans.MF.coherence = u.Param(num_probe_modes=1) +# p.scans.MF.illumination.diversity=u.Param() +# p.scans.MF.illumination.diversity.power = 0.1 +# p.scans.MF.illumination.diversity.noise = (np.pi, 3.0) +# position distance in fraction of illumination frame +p.scans.MF.data.density = 0.2 +# total number of photons in the empty beam +p.scans.MF.data.photons = 1e8 +# Gaussian FWHM of possible detector blurring +p.scans.MF.data.psf = 0. +#p.scans.MF.data.add_poisson_noise = False + +# attach a reconstruction engine +p.engines = u.Param() +p.engines.engine00 = u.Param() +p.engines.engine00.name = 'DM_pycuda' +p.engines.engine00.probe_support = 1 +p.engines.engine00.numiter = 1000 +p.engines.engine00.numiter_contiguous = 10 +p.engines.engine00.position_refinement = u.Param() +p.engines.engine00.position_refinement.start = 50 +p.engines.engine00.position_refinement.stop = 950 +p.engines.engine00.position_refinement.interval = 10 +p.engines.engine00.position_refinement.nshifts = 32 +p.engines.engine00.position_refinement.amplitude = 5e-7 +p.engines.engine00.position_refinement.max_shift = 1e-6 +p.engines.engine00.position_refinement.method = "GridSearch" + +# prepare and run +P = Ptycho(p, level=4) + +# Mess up the positions +a = 0.
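+# The loop below records each true scan position, then shifts the view by a +# rotating ~3e-7 m offset floored to a whole number of object pixels, giving +# the position refinement a known error to recover.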
+ +coords = [] +coords_start = [] +for pname, pod in P.pods.items(): + + # Save real position + coords.append(np.copy(pod.ob_view.coord)) + before = pod.ob_view.coord + psize = pod.pr_view.psize + perturbation = psize * ((3e-7 * np.array([np.sin(a), np.cos(a)])) // psize) + new_coord = before + perturbation # make sure integer number of pixels shift + pod.ob_view.coord = new_coord + coords_start.append(np.copy(pod.ob_view.coord)) + #pod.diff *= np.random.uniform(0.1,1) + a += 4. + +np.savetxt("positions_theory.txt", coords) +np.savetxt("positions_start.txt", coords_start) +P.obj.reformat() # update the object storage + +# Run +P.run() +P.finalize() + diff --git a/templates/position_refinement_DM_serial.py b/templates/position_refinement_DM_serial.py index 523dfd486..6c5584cfd 100644 --- a/templates/position_refinement_DM_serial.py +++ b/templates/position_refinement_DM_serial.py @@ -8,7 +8,6 @@ from ptypy.core import Ptycho from ptypy import utils as u -from ptypy.accelerate.cuda_pycuda.engines import DM_pycuda_stream, DM_pycuda_streams, DM_pycuda from ptypy.accelerate.base.engines import DM_serial @@ -16,12 +15,13 @@ # for verbose output p.verbose_level = 3 -p.frames_per_block = 300 +p.frames_per_block = 100 # set home path p.io = u.Param() p.io.home = "~/dumps/ptypy/" p.io.autosave = u.Param(active=True, interval=500) p.io.autoplot = u.Param(active=False)#True, interval=100) +p.io.interaction = u.Param(active=False) # max 200 frames (128x128px) of diffraction data p.scans = u.Param() @@ -32,7 +32,7 @@ p.scans.MF.data= u.Param() p.scans.MF.data.name = 'MoonFlowerScan' p.scans.MF.data.shape = 128 -p.scans.MF.data.num_frames = 2000 +p.scans.MF.data.num_frames = 200 p.scans.MF.data.save = None p.scans.MF.illumination = u.Param(diversity=None) @@ -43,25 +43,26 @@ # position distance in fraction of illumination frame p.scans.MF.data.density = 0.2 # total number of photon in empty beam -p.scans.MF.data.photons = 1e6 +p.scans.MF.data.photons = 1e8 # Gaussian FWHM of possible detector blurring p.scans.MF.data.psf = 0. -p.scans.MF.data.add_poisson_noise = False - +#p.scans.MF.data.add_poisson_noise = False # attach a reconstrucion engine p.engines = u.Param() p.engines.engine00 = u.Param() -p.engines.engine00.name = 'DM_pycuda' -p.engines.engine00.numiter = 1000 +p.engines.engine00.name = 'DM_serial' +p.engines.engine00.probe_support = 1 +p.engines.engine00.numiter = 100 p.engines.engine00.numiter_contiguous = 10 p.engines.engine00.position_refinement = u.Param() p.engines.engine00.position_refinement.start = 50 p.engines.engine00.position_refinement.stop = 950 p.engines.engine00.position_refinement.interval = 10 -p.engines.engine00.position_refinement.nshifts = 16 -p.engines.engine00.position_refinement.amplitude = 1e-6 -p.engines.engine00.position_refinement.max_shift = 2e-6 +p.engines.engine00.position_refinement.nshifts = 32 +p.engines.engine00.position_refinement.amplitude = 5e-7 +p.engines.engine00.position_refinement.max_shift = 1e-6 +p.engines.engine00.position_refinement.method = "GridSearch" # prepare and run P = Ptycho(p, level=4) @@ -70,23 +71,25 @@ a = 0.
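+# As in the pycuda template above: record the true coordinates, apply the +# integer-pixel perturbation, and keep the perturbed starting coordinates +# for later comparison with the refined positions.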
coords = [] +coords_start = [] for pname, pod in P.pods.items(): + # Save real position coords.append(np.copy(pod.ob_view.coord)) before = pod.ob_view.coord psize = pod.pr_view.psize - perturbation = psize * ((3e-7 * np.array([np.sin(a), np.cos(a)])) // psize) new_coord = before + perturbation # make sure integer number of pixels shift pod.ob_view.coord = new_coord - + coords_start.append(np.copy(pod.ob_view.coord)) #pod.diff *= np.random.uniform(0.1,1)y a += 4. -# np.savetxt("positions_theory.txt", coords) +np.savetxt("positions_theory.txt", coords) +np.savetxt("positions_start.txt", coords_start) P.obj.reformat()# update the object storage - # Run P.run() +P.finalize() diff --git a/test/accelerate_tests/base_tests/address_manglers_test.py b/test/accelerate_tests/base_tests/address_manglers_test.py index 11af45e42..7e27c885a 100644 --- a/test/accelerate_tests/base_tests/address_manglers_test.py +++ b/test/accelerate_tests/base_tests/address_manglers_test.py @@ -1,7 +1,7 @@ import unittest import sys import numpy as np -from ptypy.accelerate.base.address_manglers import RandomIntMangle +from ptypy.accelerate.base.address_manglers import BaseMangler, RandomIntMangler COMPLEX_TYPE = np.complex64 FLOAT_TYPE = np.float32 @@ -16,14 +16,8 @@ def setUp(self): def tearDown(self): np.set_printoptions() - def test_addr_original_set(self): - - max_bound = 10 - step_size = 3 - scan_pts = 2 + def prepare_addresses(self, max_bound=10, scan_pts=2, num_modes=3): total_number_scan_positions = scan_pts ** 2 - num_modes = 3 - X, Y = np.meshgrid(range(scan_pts), range(scan_pts)) X = X.reshape((total_number_scan_positions)) + max_bound # max bound is added in the DM_serial engine. Y = Y.reshape((total_number_scan_positions)) + max_bound @@ -44,26 +38,59 @@ def test_addr_original_set(self): mode_idx += 1 exit_idx += 1 position_idx += 1 + + return addr_original - print(repr(addr_original)) - - old_positions = np.zeros((total_number_scan_positions)) - - differences_from_original = np.zeros((len(addr_original), 2)) - differences_from_original[::2] = 12 # so definitely more than the max_bound - new_positions = addr_original[:, 0, 1, 1:] + differences_from_original - - mangler = RandomIntMangle(step_size, 50, 100, max_bound=max_bound, ) - - - mangler.apply_bounding_box(new_positions, old_positions, addr_original) - print(repr(new_positions)) - expected_new_positions = new_positions[:] - expected_new_positions[::2] = 0 - - print(repr(expected_new_positions)) - - np.testing.assert_array_equal(expected_new_positions, new_positions) - - + def test_apply_bounding_box(self): + scan_pts=2 + max_bound=10 + addr = self.prepare_addresses(scan_pts=scan_pts, max_bound=max_bound) + step_size = 3 + + mangler = BaseMangler(step_size, 50, 100, nshifts=1, max_bound=max_bound, ) + min_oby = 1 + max_oby = 10 + min_obx = 2 + max_obx = 9 + mangler.apply_bounding_box(addr[:,:,1,1], min_oby, max_oby) + mangler.apply_bounding_box(addr[:,:,1,2], min_obx, max_obx) + + np.testing.assert_array_less(addr[:,:,1,1], max_oby+1) + np.testing.assert_array_less(addr[:,:,1,2], max_obx+1) + np.testing.assert_array_less(min_oby-1, addr[:,:,1,1]) + np.testing.assert_array_less(min_obx-1, addr[:,:,1,2]) + + + def test_get_address(self): + # the other manglers are using the BaseMangler's get_address function + # so we set the deltas in a BaseMangler object and test get_address + + scan_pts=2 + addr_original = self.prepare_addresses(scan_pts=scan_pts) + total_number_scan_positions = scan_pts ** 2 + addr1 = np.copy(addr_original) + addr2 = 
np.copy(addr_original) + nshifts=1 + step_size=2 + mglr = BaseMangler(step_size, 50, 100, nshifts, max_bound=2) + # 2 shifts, with positive/negative shifting + mglr.delta = np.array([ + [1, 2], + [-4, -2] + ]) + mglr.get_address(0, addr_original, addr1, 10, 9) + mglr.get_address(1, addr_original, addr2, 10, 9) + + exp1 = np.copy(addr_original) + exp2 = np.copy(addr_original) + # element-wise here to prepare reference + for f in range(addr_original.shape[0]): + for m in range(addr_original.shape[1]): + exp1[f, m, 1, 1] = max(0, min(10, addr_original[f, m, 1, 1] + 1)) + exp1[f, m, 1, 2] = max(0, min(9, addr_original[f, m, 1, 2] + 2)) + exp2[f, m, 1, 1] = max(0, min(10, addr_original[f, m, 1, 1] - 4)) + exp2[f, m, 1, 2] = max(0, min(9, addr_original[f, m, 1, 2] - 2)) + + np.testing.assert_array_equal(addr1, exp1) + np.testing.assert_array_equal(addr2, exp2) diff --git a/test/accelerate_tests/base_tests/array_utils_test.py b/test/accelerate_tests/base_tests/array_utils_test.py index f1a182ab0..b1cac58fe 100644 --- a/test/accelerate_tests/base_tests/array_utils_test.py +++ b/test/accelerate_tests/base_tests/array_utils_test.py @@ -2,7 +2,6 @@ Tests for the array_utils module ''' - import unittest import numpy as np from ptypy.accelerate.base import FLOAT_TYPE, COMPLEX_TYPE @@ -12,7 +11,7 @@ class ArrayUtilsTest(unittest.TestCase): def test_dot_resolution(self): - X,Y,Z = np.indices((3,3,1001), dtype=np.float32) + X, Y, Z = np.indices((3, 3, 1001), dtype=np.float32) A = 10 ** Y + 1j * 10 ** X out = au.dot(A, A) np.testing.assert_array_equal(out, 60666606.0) @@ -21,7 +20,7 @@ def test_abs2_real_input(self): single_dim = 50.0 npts = single_dim ** 3 array_to_be_absed = np.arange(npts) - absed = np.array([ix**2 for ix in array_to_be_absed]) + absed = np.array([ix ** 2 for ix in array_to_be_absed]) array_shape = (int(single_dim), int(single_dim), int(single_dim)) array_to_be_absed.reshape(array_shape) absed.reshape(array_shape) @@ -29,13 +28,12 @@ def test_abs2_real_input(self): np.testing.assert_array_equal(absed, out) self.assertEqual(absed.dtype, np.float) - def test_abs2_complex_input(self): single_dim = 50.0 array_shape = (int(single_dim), int(single_dim), int(single_dim)) npts = single_dim ** 3 array_to_be_absed = np.arange(npts) + 1j * np.arange(npts) - absed = np.array([np.abs(ix**2) for ix in array_to_be_absed]) + absed = np.array([np.abs(ix ** 2) for ix in array_to_be_absed]) absed.reshape(array_shape) array_to_be_absed.reshape(array_shape) out = au.abs2(array_to_be_absed) @@ -53,7 +51,7 @@ def test_sum_to_buffer(self): # fill the input array for idx in range(I): - in1[idx] = np.ones((M, N))* (idx + 1.0) + in1[idx] = np.ones((M, N)) * (idx + 1.0) outshape = (X, M, N) expected_out = np.empty(outshape) @@ -64,9 +62,9 @@ def test_sum_to_buffer(self): in1_addr = np.empty((I, 3)) in1_addr = np.array([(0, 0, 0), - (1, 0, 0), - (2, 0, 0), - (3, 0, 0)]) + (1, 0, 0), + (2, 0, 0), + (3, 0, 0)]) out1_addr = np.empty_like(in1_addr) out1_addr = np.array([(0, 0, 0), @@ -77,7 +75,6 @@ def test_sum_to_buffer(self): out = au.sum_to_buffer(in1, outshape, in1_addr, out1_addr, dtype=FLOAT_TYPE) np.testing.assert_array_equal(out, expected_out) - def test_sum_to_buffer_complex(self): I = 4 @@ -89,20 +86,20 @@ def test_sum_to_buffer_complex(self): # fill the input array for idx in range(I): - in1[idx] = np.ones((M, N))* (idx + 1.0) + 1j * np.ones((M, N))* (idx + 1.0) + in1[idx] = np.ones((M, N)) * (idx + 1.0) + 1j * np.ones((M, N)) * (idx + 1.0) outshape = (X, M, N) expected_out = np.empty(outshape, 
dtype=COMPLEX_TYPE) - expected_out[0] = np.ones((M, N)) * 4.0 + 1j * np.ones((M, N))* 4.0 - expected_out[1] = np.ones((M, N)) * 6.0+ 1j * np.ones((M, N))* 6.0 + expected_out[0] = np.ones((M, N)) * 4.0 + 1j * np.ones((M, N)) * 4.0 + expected_out[1] = np.ones((M, N)) * 6.0 + 1j * np.ones((M, N)) * 6.0 in1_addr = np.empty((I, 3)) in1_addr = np.array([(0, 0, 0), - (1, 0, 0), - (2, 0, 0), - (3, 0, 0)]) + (1, 0, 0), + (2, 0, 0), + (3, 0, 0)]) out1_addr = np.empty_like(in1_addr) out1_addr = np.array([(0, 0, 0), @@ -120,7 +117,7 @@ def test_norm2_1d_real(self): np.testing.assert_array_equal(out, 5.0) def test_norm2_1d_complex(self): - a = np.array([1.0+1.0j, 2.0+2.0j], dtype=COMPLEX_TYPE) + a = np.array([1.0 + 1.0j, 2.0 + 2.0j], dtype=COMPLEX_TYPE) out = au.norm2(a) np.testing.assert_array_equal(out, 10.0) @@ -131,22 +128,22 @@ def test_norm2_2d_real(self): np.testing.assert_array_equal(out, 30.0) def test_norm2_2d_complex(self): - a = np.array([[1.0+1.0j, 2.0+2.0j], - [3.0+3.0j, 4.0+4.0j]], dtype=COMPLEX_TYPE) + a = np.array([[1.0 + 1.0j, 2.0 + 2.0j], + [3.0 + 3.0j, 4.0 + 4.0j]], dtype=COMPLEX_TYPE) out = au.norm2(a) np.testing.assert_array_equal(out, 60.0) def test_norm2_3d_real(self): a = np.array([[[1.0, 2.0], - [3.0, 4.0]], + [3.0, 4.0]], [[5.0, 6.0], [7.0, 8.0]]], dtype=FLOAT_TYPE) out = au.norm2(a) np.testing.assert_array_equal(out, 204.0) def test_norm2_3d_complex(self): - a = np.array([[[1.0+1.0j, 2.0+2.0j], - [3.0+3.0j, 4.0+4.0j]], + a = np.array([[[1.0 + 1.0j, 2.0 + 2.0j], + [3.0 + 3.0j, 4.0 + 4.0j]], [[5.0 + 5.0j, 6.0 + 6.0j], [7.0 + 7.0j, 8.0 + 8.0j]]], dtype=COMPLEX_TYPE) out = au.norm2(a) @@ -154,46 +151,45 @@ def test_norm2_3d_complex(self): def test_complex_gaussian_filter_2d(self): data = np.zeros((8, 8), dtype=COMPLEX_TYPE) - data[3:5, 3:5] = 2.0+2.0j - mfs = 3.0,4.0 + data[3:5, 3:5] = 2.0 + 2.0j + mfs = 3.0, 4.0 out = au.complex_gaussian_filter(data, mfs) expected_out = np.array([0.11033735 + 0.11033735j, 0.11888228 + 0.11888228j, 0.13116673 + 0.13116673j , 0.13999543 + 0.13999543j, 0.13999543 + 0.13999543j, 0.13116673 + 0.13116673j , 0.11888228 + 0.11888228j, 0.11033735 + 0.11033735j], dtype=COMPLEX_TYPE) np.testing.assert_array_almost_equal(np.diagonal(out), expected_out) - def test_complex_gaussian_filter_2d_batched(self): batch_number = 2 A = 5 B = 5 data = np.zeros((batch_number, A, B), dtype=COMPLEX_TYPE) - data[:, 2:3, 2:3] = 2.0+2.0j - mfs = 3.0,4.0 + data[:, 2:3, 2:3] = 2.0 + 2.0j + mfs = 3.0, 4.0 out = au.complex_gaussian_filter(data, mfs) - expected_out = np.array([[[ 0.07988770+0.0798877j, 0.07989411+0.07989411j, 0.07989471+0.07989471j, - 0.07989411+0.07989411j, 0.07988770+0.0798877j], - [ 0.08003781+0.08003781j, 0.08004424+0.08004424j, 0.08004485+0.08004485j, - 0.08004424+0.08004424j, 0.08003781+0.08003781j], - [ 0.08012911+0.08012911j, 0.08013555+0.08013555j, 0.08013615+0.08013615j, - 0.08013555+0.08013555j, 0.08012911+0.08012911j], - [ 0.08003781+0.08003781j, 0.08004424+0.08004424j, 0.08004485+0.08004485j, - 0.08004424+0.08004424j, 0.08003781+0.08003781j], - [ 0.07988770+0.0798877j, 0.07989411+0.07989411j, 0.07989471+0.07989471j, - 0.07989411+0.07989411j, 0.07988770+0.0798877j ]], - - [[ 0.07988770+0.0798877j, 0.07989411+0.07989411j, 0.07989471+0.07989471j, - 0.07989411+0.07989411j, 0.07988770+0.0798877j ], - [ 0.08003781+0.08003781j, 0.08004424+0.08004424j, 0.08004485+0.08004485j, - 0.08004424+0.08004424j, 0.08003781+0.08003781j], - [ 0.08012911+0.08012911j, 0.08013555+0.08013555j, 0.08013615+0.08013615j, - 0.08013555+0.08013555j, 0.08012911+0.08012911j], - [ 
0.08003781+0.08003781j, 0.08004424+0.08004424j, 0.08004485+0.08004485j, - 0.08004424+0.08004424j, 0.08003781+0.08003781j], - [ 0.07988770+0.0798877j, 0.07989411+0.07989411j, 0.07989471+0.07989471j, - 0.07989411+0.07989411j, 0.07988770+0.0798877j ]]], dtype=COMPLEX_TYPE) + expected_out = np.array([[[0.07988770 + 0.0798877j, 0.07989411 + 0.07989411j, 0.07989471 + 0.07989471j, + 0.07989411 + 0.07989411j, 0.07988770 + 0.0798877j], + [0.08003781 + 0.08003781j, 0.08004424 + 0.08004424j, 0.08004485 + 0.08004485j, + 0.08004424 + 0.08004424j, 0.08003781 + 0.08003781j], + [0.08012911 + 0.08012911j, 0.08013555 + 0.08013555j, 0.08013615 + 0.08013615j, + 0.08013555 + 0.08013555j, 0.08012911 + 0.08012911j], + [0.08003781 + 0.08003781j, 0.08004424 + 0.08004424j, 0.08004485 + 0.08004485j, + 0.08004424 + 0.08004424j, 0.08003781 + 0.08003781j], + [0.07988770 + 0.0798877j, 0.07989411 + 0.07989411j, 0.07989471 + 0.07989471j, + 0.07989411 + 0.07989411j, 0.07988770 + 0.0798877j]], + + [[0.07988770 + 0.0798877j, 0.07989411 + 0.07989411j, 0.07989471 + 0.07989471j, + 0.07989411 + 0.07989411j, 0.07988770 + 0.0798877j], + [0.08003781 + 0.08003781j, 0.08004424 + 0.08004424j, 0.08004485 + 0.08004485j, + 0.08004424 + 0.08004424j, 0.08003781 + 0.08003781j], + [0.08012911 + 0.08012911j, 0.08013555 + 0.08013555j, 0.08013615 + 0.08013615j, + 0.08013555 + 0.08013555j, 0.08012911 + 0.08012911j], + [0.08003781 + 0.08003781j, 0.08004424 + 0.08004424j, 0.08004485 + 0.08004485j, + 0.08004424 + 0.08004424j, 0.08003781 + 0.08003781j], + [0.07988770 + 0.0798877j, 0.07989411 + 0.07989411j, 0.07989471 + 0.07989471j, + 0.07989411 + 0.07989411j, 0.07988770 + 0.0798877j]]], dtype=COMPLEX_TYPE) np.testing.assert_array_almost_equal(out, expected_out) @@ -206,13 +202,12 @@ def test_mass_center_2d(self): X, Y = np.meshgrid(x, x) Xoff = 5.0 Yoff = 2.0 - probe[0, (X-Xoff)**2 + (Y-Yoff)**2 < rad**2] = probe_vals + probe[0, (X - Xoff) ** 2 + (Y - Yoff) ** 2 < rad ** 2] = probe_vals com = au.mass_center(np.abs(probe[0])) expected_out = np.array([Yoff, Xoff]) + npts // 2 np.testing.assert_array_almost_equal(com, expected_out, decimal=6) - def test_mass_center_3d(self): npts = 64 probe = np.zeros((npts, npts, npts), dtype=COMPLEX_TYPE) @@ -223,7 +218,7 @@ def test_mass_center_3d(self): Xoff = 5.0 Yoff = 2.0 Zoff = 10.0 - probe[(X-Xoff)**2 + (Y-Yoff)**2 + (Z-Zoff)**2< rad**2] = probe_vals + probe[(X - Xoff) ** 2 + (Y - Yoff) ** 2 + (Z - Zoff) ** 2 < rad ** 2] = probe_vals com = au.mass_center(np.abs(probe)) expected_out = np.array([Yoff, Xoff, Zoff]) + npts // 2 @@ -238,28 +233,64 @@ def test_interpolated_shift(self): X, Y = np.meshgrid(x, x) Xoff = 5.0 Yoff = 2.0 - probe[0, (X-Xoff)**2 + (Y-Yoff)**2 < rad**2] = probe_vals + probe[0, (X - Xoff) ** 2 + (Y - Yoff) ** 2 < rad ** 2] = probe_vals offset = np.array([-Yoff, -Xoff]) not_shifted_probe = np.zeros((1, npts, npts), dtype=COMPLEX_TYPE) - not_shifted_probe[0, (X)**2 + (Y)**2 < rad**2] = probe_vals + not_shifted_probe[0, (X) ** 2 + (Y) ** 2 < rad ** 2] = probe_vals probe[0] = au.interpolated_shift(probe[0], offset) np.testing.assert_array_almost_equal(probe, not_shifted_probe, decimal=8) def test_clip_magnitudes_to_range(self): - data = np.ones((5,5), dtype=COMPLEX_TYPE) - data[2, 4] = 20.0*np.exp(1j*np.pi/2) - data[3, 1] = 0.2*np.exp(1j*np.pi/3) + data = np.ones((5, 5), dtype=COMPLEX_TYPE) + data[2, 4] = 20.0 * np.exp(1j * np.pi / 2) + data[3, 1] = 0.2 * np.exp(1j * np.pi / 3) clip_min = 0.5 clip_max = 2.0 expected_out = np.ones_like(data) - expected_out[2, 4] = 2.0*np.exp(1j*np.pi/2) - 
expected_out[3, 1] = 0.5*np.exp(1j*np.pi/3) + expected_out[2, 4] = 2.0 * np.exp(1j * np.pi / 2) + expected_out[3, 1] = 0.5 * np.exp(1j * np.pi / 3) au.clip_complex_magnitudes_to_range(data, clip_min, clip_max) - np.testing.assert_array_almost_equal(data, expected_out, decimal=7) # floating point precision I guess... - - - -if __name__=='__main__': - unittest.main() \ No newline at end of file + np.testing.assert_array_almost_equal(data, expected_out, decimal=7) # floating point precision I guess... + + def test_crop_pad_1(self): + # pad, integer, 2D + B = np.indices((4, 4), dtype=np.int) + A = np.zeros((6, 6), dtype=B.dtype) + au.crop_pad_2d_simple(A, B.sum(0)) + exp_A = np.array([[0, 0, 0, 0, 0, 0], + [0, 0, 1, 2, 3, 0], + [0, 1, 2, 3, 4, 0], + [0, 2, 3, 4, 5, 0], + [0, 3, 4, 5, 6, 0], + [0, 0, 0, 0, 0, 0]]) + np.testing.assert_equal(A, exp_A) + + def test_crop_pad_2(self): + # crop, float, 3D + B = np.indices((4, 4), dtype=np.float32) + A = np.zeros((2, 2, 2), dtype=B.dtype) + au.crop_pad_2d_simple(A, B) + exp_A = np.array([[[1., 1.], + [2., 2.]], + [[1., 2.], + [1., 2.]]], dtype=np.float32) + np.testing.assert_array_almost_equal(A, exp_A) + + def test_crop_pad_3(self): + # crop/pad, complex, 3D + B = np.indices((4, 3), dtype=np.complex64) + B = np.indices((4, 3), dtype=np.complex64) + 1j * B[::-1, :, :] + A = np.zeros((2, 2, 5), dtype=B.dtype) + au.crop_pad_2d_simple(A, B) + exp_A = np.array([[[0. + 0.j, 1. + 0.j, 1. + 1.j, 1. + 2.j, 0. + 0.j], + [0. + 0.j, 2. + 0.j, 2. + 1.j, 2. + 2.j, 0. + 0.j]], + [[0. + 0.j, 0. + 1.j, 1. + 1.j, 2. + 1.j, 0. + 0.j], + [0. + 0.j, 0. + 2.j, 1. + 2.j, 2. + 2.j, 0. + 0.j]]], + dtype=np.complex64) + np.testing.assert_array_almost_equal(A, exp_A) + + +if __name__ == '__main__': + unittest.main() diff --git a/test/accelerate_tests/base_tests/auxiliary_wave_kernel_test.py b/test/accelerate_tests/base_tests/auxiliary_wave_kernel_test.py index e38909e71..93e753a51 100644 --- a/test/accelerate_tests/base_tests/auxiliary_wave_kernel_test.py +++ b/test/accelerate_tests/base_tests/auxiliary_wave_kernel_test.py @@ -21,7 +21,7 @@ def setUp(self): def tearDown(self): np.set_printoptions() - def prepare_arrays(self): + def prepare_arrays(self, scan_points = None): B = 3 # frame size y C = 3 # frame size x @@ -34,7 +34,10 @@ def prepare_arrays(self): H = B + npts_greater_than # object size y I = C + npts_greater_than # object size x - scan_pts = 2 # one dimensional scan point number + if scan_points is None: + scan_pts = 2 # one dimensional scan point number + else: + scan_pts = scan_points total_number_scan_positions = scan_pts ** 2 total_number_modes = G * D @@ -76,25 +79,17 @@ def prepare_arrays(self): return addr, object_array, probe, exit_wave def test_build_aux_same_as_exit(self): - ''' - setup - ''' - - ''' - test - ''' + # setup addr, object_array, probe, exit_wave = self.prepare_arrays() auxiliary_wave = np.zeros_like(exit_wave) + # test AWK = AuxiliaryWaveKernel() alpha_set = 1.0 AWK.allocate() # doesn't actually do anything at the moment - AWK.build_aux(auxiliary_wave, addr, object_array, probe, exit_wave, alpha=alpha_set) - # print("auxiliary_wave after") - # print(repr(auxiliary_wave)) - + # assert expected_auxiliary_wave = np.array([[[-1. + 3.j, -1. + 3.j, -1. + 3.j], [-1. + 3.j, -1. + 3.j, -1. + 3.j], [-1. + 3.j, -1. + 3.j, -1. + 3.j]], @@ -143,32 +138,20 @@ def test_build_aux_same_as_exit(self): [[-16. + 16.j, -16. + 16.j, -16. + 16.j], [-16. + 16.j, -16. + 16.j, -16. + 16.j], [-16. + 16.j, -16. + 16.j, -16. 
+ 16.j]]], dtype=COMPLEX_TYPE) - np.testing.assert_array_equal(expected_auxiliary_wave, expected_auxiliary_wave, err_msg="The auxiliary_wave has not been updated as expected") def test_build_exit_aux_same_as_exit(self): - ''' - setup - ''' + # setup addr, object_array, probe, exit_wave = self.prepare_arrays() - - ''' - test - ''' auxiliary_wave = np.zeros_like(exit_wave) + # test AWK = AuxiliaryWaveKernel() AWK.allocate() - AWK.build_exit(auxiliary_wave, addr, object_array, probe, exit_wave) - # - # print("auxiliary_wave after") - # print(repr(auxiliary_wave)) - # - # print("exit_wave after") - # print(repr(exit_wave)) + # assert expected_auxiliary_wave = np.array([[[0. - 2.j, 0. - 2.j, 0. - 2.j], [0. - 2.j, 0. - 2.j, 0. - 2.j], [0. - 2.j, 0. - 2.j, 0. - 2.j]], @@ -217,10 +200,10 @@ def test_build_exit_aux_same_as_exit(self): [[0. - 16.j, 0. - 16.j, 0. - 16.j], [0. - 16.j, 0. - 16.j, 0. - 16.j], [0. - 16.j, 0. - 16.j, 0. - 16.j]]], dtype=COMPLEX_TYPE) - np.testing.assert_array_equal(auxiliary_wave, expected_auxiliary_wave, err_msg="The auxiliary_wave has not been updated as expected") + # assert expected_exit_wave = np.array([[[1. - 1.j, 1. - 1.j, 1. - 1.j], [1. - 1.j, 1. - 1.j, 1. - 1.j], [1. - 1.j, 1. - 1.j, 1. - 1.j]], @@ -269,24 +252,20 @@ def test_build_exit_aux_same_as_exit(self): [[16. + 0.j, 16. + 0.j, 16. + 0.j], [16. + 0.j, 16. + 0.j, 16. + 0.j], [16. + 0.j, 16. + 0.j, 16. + 0.j]]], dtype=COMPLEX_TYPE) - np.testing.assert_array_equal(exit_wave, expected_exit_wave, err_msg="The exit_wave has not been updated as expected") def test_build_aux_no_ex(self): - ''' - setup - ''' + # setup addr, object_array, probe, exit_wave = self.prepare_arrays() - - ''' - test - ''' auxiliary_wave = np.zeros_like(exit_wave) + # test AWK = AuxiliaryWaveKernel() AWK.allocate() AWK.build_aux_no_ex(auxiliary_wave, addr, object_array, probe, fac=1.0, add=False) + + # assert expected_auxiliary_wave = np.array([[[0. + 2.j, 0. + 2.j, 0. + 2.j], [0. + 2.j, 0. + 2.j, 0. + 2.j], [0. + 2.j, 0. + 2.j, 0. + 2.j]], @@ -337,9 +316,12 @@ def test_build_aux_no_ex(self): [0. + 16.j, 0. + 16.j, 0. + 16.j]]], dtype=np.complex64) np.testing.assert_array_equal(auxiliary_wave, expected_auxiliary_wave, err_msg="The auxiliary_wave has not been updated as expected") + + # test auxiliary_wave = exit_wave AWK.build_aux_no_ex(auxiliary_wave, addr, object_array, probe, fac=2.0, add=True) + # assert expected_auxiliary_wave = np.array([[[1. + 5.j, 1. + 5.j, 1. + 5.j], [1. + 5.j, 1. + 5.j, 1. + 5.j], [1. + 5.j, 1. + 5.j, 1. + 5.j]], @@ -391,5 +373,57 @@ def test_build_aux_no_ex(self): np.testing.assert_array_equal(auxiliary_wave, expected_auxiliary_wave, err_msg="The auxiliary_wave has not been updated as expected") + + def test_build_exit_alpha_tau(self): + + # setup + addr, object_array, probe, exit_wave = self.prepare_arrays(scan_points=1) + auxiliary_wave = np.zeros_like(exit_wave) + + # test + AWK = AuxiliaryWaveKernel() + AWK.allocate() + AWK.build_exit_alpha_tau(auxiliary_wave, addr, object_array, probe, exit_wave) + + # assert + expected_auxiliary_wave = np.array( + [[[0. -2.j, 0. -2.j, 0. -2.j], + [0. -2.j, 0. -2.j, 0. -2.j], + [0. -2.j, 0. -2.j, 0. -2.j]], + + [[0. -8.j, 0. -8.j, 0. -8.j], + [0. -8.j, 0. -8.j, 0. -8.j], + [0. -8.j, 0. -8.j, 0. -8.j]], + + [[0. -4.j, 0. -4.j, 0. -4.j], + [0. -4.j, 0. -4.j, 0. -4.j], + [0. -4.j, 0. -4.j, 0. 
-4.j]], + + [[0.-16.j, 0.-16.j, 0.-16.j], + [0.-16.j, 0.-16.j, 0.-16.j], + [0.-16.j, 0.-16.j, 0.-16.j]]], dtype=np.complex64) + np.testing.assert_array_equal(auxiliary_wave, expected_auxiliary_wave, + err_msg="The auxiliary_wave has not been updated as expected") + + # assert + expected_exit_wave = np.array( + [[[1. -1.j, 1. -1.j, 1. -1.j], + [1. -1.j, 1. -1.j, 1. -1.j], + [1. -1.j, 1. -1.j, 1. -1.j]], + + [[2. -6.j, 2. -6.j, 2. -6.j], + [2. -6.j, 2. -6.j, 2. -6.j], + [2. -6.j, 2. -6.j, 2. -6.j]], + + [[3. -1.j, 3. -1.j, 3. -1.j], + [3. -1.j, 3. -1.j, 3. -1.j], + [3. -1.j, 3. -1.j, 3. -1.j]], + + [[4.-12.j, 4.-12.j, 4.-12.j], + [4.-12.j, 4.-12.j, 4.-12.j], + [4.-12.j, 4.-12.j, 4.-12.j]]], dtype=np.complex64) + np.testing.assert_array_equal(exit_wave, expected_exit_wave, + err_msg="The exit_wave has not been updated as expected") + if __name__ == '__main__': unittest.main() diff --git a/test/accelerate_tests/base_tests/po_update_kernel_test.py b/test/accelerate_tests/base_tests/po_update_kernel_test.py index 15557e3d2..a8d20ce78 100644 --- a/test/accelerate_tests/base_tests/po_update_kernel_test.py +++ b/test/accelerate_tests/base_tests/po_update_kernel_test.py @@ -91,26 +91,15 @@ def prepare_arrays(self): return addr, object_array, object_array_denominator, probe, exit_wave, probe_denominator def test_ob_update(self): - ''' - setup - ''' + # setup addr, object_array, object_array_denominator, probe, exit_wave, probe_denominator = self.prepare_arrays() - ''' - test - ''' + # test POUK = PoUpdateKernel() - POUK.allocate() # doesn't do anything but is the call signature - - # print("object array denom before:") - # print(object_array_denominator) - POUK.ob_update(addr, object_array, object_array_denominator, probe, exit_wave) - # print("object array denom after:") - # print(repr(object_array_denominator)) - + # assert expected_object_array = np.array([[[15. + 1.j, 53. + 1.j, 53. + 1.j, 53. + 1.j, 53. + 1.j, 39. + 1.j, 1. + 1.j], [77. + 1.j, 201. + 1.j, 201. + 1.j, 201. + 1.j, 201. + 1.j, 125. + 1.j, 1. + 1.j], @@ -136,10 +125,10 @@ def test_ob_update(self): 4. + 4.j], [4. + 4.j, 4. + 4.j, 4. + 4.j, 4. + 4.j, 4. + 4.j, 4. + 4.j, 4. + 4.j]]], dtype=COMPLEX_TYPE) - np.testing.assert_array_equal(object_array, expected_object_array, err_msg="The object array has not been updated as expected") + # assert expected_object_array_denominator = np.array([[[12., 22., 22., 22., 22., 12., 2.], [22., 42., 42., 42., 42., 22., 2.], [22., 42., 42., 42., 42., 22., 2.], @@ -159,29 +148,15 @@ def test_ob_update(self): err_msg="The object array denominatorhas not been updated as expected") def test_pr_update(self): - ''' - setup - ''' + # setup addr, object_array, object_array_denominator, probe, exit_wave, probe_denominator = self.prepare_arrays() - ''' - test - ''' - POUK = PoUpdateKernel() + # test + POUK = PoUpdateKernel() POUK.allocate() # this doesn't do anything, but is the call pattern. - - # print("probe array before:") - # print(repr(probe)) - # print("probe denominator array before:") - # print(repr(probe_denominator)) - POUK.pr_update(addr, probe, probe_denominator, object_array, exit_wave) - # print("probe array after:") - # print(repr(probe)) - # print("probe denominator array after:") - # print(repr(probe_denominator)) - + # assert expected_probe = np.array([[[313. + 1.j, 313. + 1.j, 313. + 1.j, 313. + 1.j, 313. + 1.j], [313. + 1.j, 313. + 1.j, 313. + 1.j, 313. + 1.j, 313. + 1.j], [313. + 1.j, 313. + 1.j, 313. + 1.j, 313. + 1.j, 313. + 1.j], @@ -194,9 +169,10 @@ def test_pr_update(self): [394. 
+ 2.j, 394. + 2.j, 394. + 2.j, 394. + 2.j, 394. + 2.j], [394. + 2.j, 394. + 2.j, 394. + 2.j, 394. + 2.j, 394. + 2.j]]], dtype=COMPLEX_TYPE) - np.testing.assert_array_equal(probe, expected_probe, err_msg="The probe has not been updated as expected") + + # assert expected_probe_denominator = np.array([[[138., 138., 138., 138., 138.], [138., 138., 138., 138., 138.], [138., 138., 138., 138., 138.], @@ -212,19 +188,15 @@ def test_pr_update(self): err_msg="The probe denominatorhas not been updated as expected") def test_pr_update_ML(self): - ''' - setup - ''' + # setup addr, object_array, object_array_denominator, probe, exit_wave, probe_denominator = self.prepare_arrays() - ''' - test - ''' - POUK = PoUpdateKernel() + # test + POUK = PoUpdateKernel() POUK.allocate() # this doesn't do anything, but is the call pattern. - POUK.pr_update_ML(addr, probe, object_array, exit_wave) + # assert expected_probe = np.array([[[625. + 1.j, 625. + 1.j, 625. + 1.j, 625. + 1.j, 625. + 1.j], [625. + 1.j, 625. + 1.j, 625. + 1.j, 625. + 1.j, 625. + 1.j], [625. + 1.j, 625. + 1.j, 625. + 1.j, 625. + 1.j, 625. + 1.j], @@ -237,26 +209,19 @@ def test_pr_update_ML(self): [786. + 2.j, 786. + 2.j, 786. + 2.j, 786. + 2.j, 786. + 2.j], [786. + 2.j, 786. + 2.j, 786. + 2.j, 786. + 2.j, 786. + 2.j]]], dtype=COMPLEX_TYPE) - np.testing.assert_array_equal(probe, expected_probe, err_msg="The probe has not been updated as expected") def test_ob_update_ML(self): - ''' - setup - ''' + # setup addr, object_array, object_array_denominator, probe, exit_wave, probe_denominator = self.prepare_arrays() - ''' - test - ''' - POUK = PoUpdateKernel() + # test + POUK = PoUpdateKernel() POUK.allocate() # this doesn't do anything, but is the call pattern. - POUK.ob_update_ML(addr, object_array, probe, exit_wave) - print(repr(object_array)) - + # assert expected_object_array = np.array( [[[29. + 1.j, 105. + 1.j, 105. + 1.j, 105. + 1.j, 105. + 1.j, 77. + 1.j, 1. + 1.j], [153. + 1.j, 401. + 1.j, 401. + 1.j, 401. + 1.j, 401. + 1.j, 249. + 1.j, 1. + 1.j], @@ -274,7 +239,147 @@ def test_ob_update_ML(self): [140. + 4.j, 324. + 4.j, 324. + 4.j, 324. + 4.j, 324. + 4.j, 188. + 4.j, 4. + 4.j], [4. + 4.j, 4. + 4.j, 4. + 4.j, 4. + 4.j, 4. + 4.j, 4. + 4.j, 4. + 4.j]]], dtype=COMPLEX_TYPE) + np.testing.assert_array_equal(object_array, expected_object_array, + err_msg="The object array has not been updated as expected") + + + def test_pr_update_local(self): + # setup + B = 5 # frame size y + C = 5 # frame size x + + D = 1 # number of probe modes + E = B # probe size y + F = C # probe size x + + npts_greater_than = 2 # how many points bigger than the probe the object is. 
+ G = 1 # number of object modes + H = B + npts_greater_than # object size y + I = C + npts_greater_than # object size x + + scan_pts = 1 # one dimensional scan point number + + total_number_scan_positions = scan_pts ** 2 + total_number_modes = G * D + A = total_number_scan_positions * total_number_modes # this is a 16 point scan pattern (4x4 grid) over all the modes + + probe = np.empty(shape=(D, E, F), dtype=COMPLEX_TYPE) + for idx in range(D): + probe[idx] = np.ones((E, F)) * (idx + 1) + 1j * np.ones((E, F)) * (idx + 1) + + object_array = np.empty(shape=(G, H, I), dtype=COMPLEX_TYPE) + for idx in range(G): + object_array[idx] = np.ones((H, I)) * (3 * idx + 1) + 1j * np.ones((H, I)) * (3 * idx + 1) + + exit_wave = np.empty(shape=(A, B, C), dtype=COMPLEX_TYPE) + for idx in range(A): + exit_wave[idx] = np.ones((B, C)) * (idx + 1) + 1j * np.ones((B, C)) * (idx + 1) + auxiliary_wave = exit_wave.copy() * 1.5 + + X, Y = np.meshgrid(range(scan_pts), range(scan_pts)) + X = X.reshape((total_number_scan_positions)) + Y = Y.reshape((total_number_scan_positions)) + + addr = np.zeros((total_number_scan_positions, total_number_modes, 5, 3), dtype=INT_TYPE) + + exit_idx = 0 + position_idx = 0 + for xpos, ypos in zip(X, Y): # + mode_idx = 0 + for pr_mode in range(D): + for ob_mode in range(G): + addr[position_idx, mode_idx] = np.array([[pr_mode, 0, 0], + [ob_mode, ypos, xpos], + [exit_idx, 0, 0], + [0, 0, 0], + [0, 0, 0]], dtype=INT_TYPE) + mode_idx += 1 + exit_idx += 1 + position_idx += 1 + + # test + POUK = PoUpdateKernel() + POUK.allocate() # this doesn't do anything, but is the call pattern. + POUK.pr_update_local(addr, probe, object_array, exit_wave, auxiliary_wave) + + # assert + expected_probe = np.array( + [[[0.49999994+1.j, 0.49999994+1.j, 0.49999994+1.j, 0.49999994+1.j, 0.49999994+1.j], + [0.49999994+1.j, 0.49999994+1.j, 0.49999994+1.j, 0.49999994+1.j, 0.49999994+1.j], + [0.49999994+1.j, 0.49999994+1.j, 0.49999994+1.j, 0.49999994+1.j, 0.49999994+1.j], + [0.49999994+1.j, 0.49999994+1.j, 0.49999994+1.j, 0.49999994+1.j, 0.49999994+1.j], + [0.49999994+1.j, 0.49999994+1.j, 0.49999994+1.j, 0.49999994+1.j, 0.49999994+1.j]]], dtype=COMPLEX_TYPE) + np.testing.assert_array_equal(probe, expected_probe, + err_msg="The probe has not been updated as expected") + + def test_ob_update_local(self): + # setup + B = 5 # frame size y + C = 5 # frame size x + + D = 1 # number of probe modes + E = B # probe size y + F = C # probe size x + + npts_greater_than = 2 # how many points bigger than the probe the object is. 
+ G = 1 # number of object modes + H = B + npts_greater_than # object size y + I = C + npts_greater_than # object size x + + scan_pts = 1 # one dimensional scan point number + + total_number_scan_positions = scan_pts ** 2 + total_number_modes = G * D + A = total_number_scan_positions * total_number_modes # this is a 16 point scan pattern (4x4 grid) over all the modes + + probe = np.empty(shape=(D, E, F), dtype=COMPLEX_TYPE) + for idx in range(D): + probe[idx] = np.ones((E, F)) * (idx + 1) + 1j * np.ones((E, F)) * (idx + 1) + + object_array = np.empty(shape=(G, H, I), dtype=COMPLEX_TYPE) + for idx in range(G): + object_array[idx] = np.ones((H, I)) * (3 * idx + 1) + 1j * np.ones((H, I)) * (3 * idx + 1) + + exit_wave = np.empty(shape=(A, B, C), dtype=COMPLEX_TYPE) + for idx in range(A): + exit_wave[idx] = np.ones((B, C)) * (idx + 1) + 1j * np.ones((B, C)) * (idx + 1) + auxiliary_wave = exit_wave.copy() * 2 + + X, Y = np.meshgrid(range(scan_pts), range(scan_pts)) + X = X.reshape((total_number_scan_positions)) + Y = Y.reshape((total_number_scan_positions)) + + addr = np.zeros((total_number_scan_positions, total_number_modes, 5, 3), dtype=INT_TYPE) + + exit_idx = 0 + position_idx = 0 + for xpos, ypos in zip(X, Y): # + mode_idx = 0 + for pr_mode in range(D): + for ob_mode in range(G): + addr[position_idx, mode_idx] = np.array([[pr_mode, 0, 0], + [ob_mode, ypos, xpos], + [exit_idx, 0, 0], + [0, 0, 0], + [0, 0, 0]], dtype=INT_TYPE) + mode_idx += 1 + exit_idx += 1 + position_idx += 1 + # test + POUK = PoUpdateKernel() + POUK.allocate() # this doesn't do anything, but is the call pattern. + POUK.ob_update_local(addr, object_array, probe, exit_wave, auxiliary_wave) + + # assert + expected_object_array = np.array( + [[[-1.1920929e-07+1.j, -1.1920929e-07+1.j, -1.1920929e-07+1.j, -1.1920929e-07+1.j, -1.1920929e-07+1.j, 1.0000000e+00+1.j, 1.0000000e+00+1.j], + [-1.1920929e-07+1.j, -1.1920929e-07+1.j, -1.1920929e-07+1.j, -1.1920929e-07+1.j, -1.1920929e-07+1.j, 1.0000000e+00+1.j, 1.0000000e+00+1.j], + [-1.1920929e-07+1.j, -1.1920929e-07+1.j, -1.1920929e-07+1.j, -1.1920929e-07+1.j, -1.1920929e-07+1.j, 1.0000000e+00+1.j, 1.0000000e+00+1.j], + [-1.1920929e-07+1.j, -1.1920929e-07+1.j, -1.1920929e-07+1.j, -1.1920929e-07+1.j, -1.1920929e-07+1.j, 1.0000000e+00+1.j, 1.0000000e+00+1.j], + [-1.1920929e-07+1.j, -1.1920929e-07+1.j, -1.1920929e-07+1.j, -1.1920929e-07+1.j, -1.1920929e-07+1.j, 1.0000000e+00+1.j, 1.0000000e+00+1.j], + [ 1.0000000e+00+1.j, 1.0000000e+00+1.j, 1.0000000e+00+1.j, 1.0000000e+00+1.j, 1.0000000e+00+1.j, 1.0000000e+00+1.j, 1.0000000e+00+1.j], + [ 1.0000000e+00+1.j, 1.0000000e+00+1.j, 1.0000000e+00+1.j, 1.0000000e+00+1.j, 1.0000000e+00+1.j, 1.0000000e+00+1.j, 1.0000000e+00+1.j]]], dtype=COMPLEX_TYPE) np.testing.assert_array_equal(object_array, expected_object_array, err_msg="The object array has not been updated as expected") diff --git a/test/accelerate_tests/base_tests/position_correction_kernel_test.py b/test/accelerate_tests/base_tests/position_correction_kernel_test.py index 20764e39a..117915f6b 100644 --- a/test/accelerate_tests/base_tests/position_correction_kernel_test.py +++ b/test/accelerate_tests/base_tests/position_correction_kernel_test.py @@ -6,6 +6,7 @@ import unittest import numpy as np from ptypy.accelerate.base.kernels import PositionCorrectionKernel +from ptypy import utils as u COMPLEX_TYPE = np.complex64 FLOAT_TYPE = np.float32 INT_TYPE = np.int32 @@ -16,6 +17,14 @@ class PositionCorrectionKernelTest(unittest.TestCase): def setUp(self): import sys 
np.set_printoptions(threshold=sys.maxsize, linewidth=np.inf) + self.params = u.Param() + self.params.nshifts = 4 + self.params.method = "Annealing" + self.params.amplitude = 2e-9 + self.params.start = 0 + self.params.stop = 10 + self.params.max_shift = 2e-9 + self.resolution = [1e-9,1e-9] def tearDown(self): np.set_printoptions() @@ -77,7 +86,7 @@ def test_build_aux(self): ''' auxiliary_wave = np.zeros((A, B, C), dtype=COMPLEX_TYPE) - PCK = PositionCorrectionKernel(auxiliary_wave, total_number_modes) + PCK = PositionCorrectionKernel(auxiliary_wave, total_number_modes, self.params, self.resolution) PCK.allocate() # doesn't actually do anything at the moment PCK.build_aux(auxiliary_wave, addr, object_array, probe) @@ -205,7 +214,7 @@ def test_fourier_error(self): mask_sum = mask.sum(-1).sum(-1) - PCK = PositionCorrectionKernel(auxiliary_wave, nmodes=total_number_modes) + PCK = PositionCorrectionKernel(auxiliary_wave, total_number_modes, self.params, self.resolution) PCK.allocate() PCK.fourier_error(auxiliary_wave, addr, fmag, mask, mask_sum) @@ -276,7 +285,7 @@ def test_error_reduce(self): addr = np.zeros((N, 1, 5, 3)) - PCK = PositionCorrectionKernel(fake_aux, nmodes=1) + PCK = PositionCorrectionKernel(fake_aux, 1, self.params, self.resolution) PCK.allocate() err_fmag = np.zeros(N, dtype=FLOAT_TYPE) PCK.error_reduce(addr, err_fmag) diff --git a/test/accelerate_tests/cuda_pycuda_tests/address_manglers_test.py b/test/accelerate_tests/cuda_pycuda_tests/address_manglers_test.py new file mode 100644 index 000000000..2704dcf97 --- /dev/null +++ b/test/accelerate_tests/cuda_pycuda_tests/address_manglers_test.py @@ -0,0 +1,77 @@ +import unittest +import numpy as np +from . import perfrun, PyCudaTest, have_pycuda + +if have_pycuda(): + from pycuda import gpuarray + from ptypy.accelerate.base import address_manglers as am + from ptypy.accelerate.cuda_pycuda import address_manglers as gam + + +COMPLEX_TYPE = np.complex64 +FLOAT_TYPE = np.float32 +INT_TYPE = np.int32 + +class AddressManglersTest(PyCudaTest): + + def prepare_addresses(self, max_bound=10, scan_pts=2, num_modes=3): + total_number_scan_positions = scan_pts ** 2 + X, Y = np.meshgrid(range(scan_pts), range(scan_pts)) + X = X.reshape((total_number_scan_positions)) + max_bound # max bound is added in the DM_serial engine. 
+ Y = Y.reshape((total_number_scan_positions)) + max_bound + + addr_original = np.zeros((total_number_scan_positions, num_modes, 5, 3), dtype=INT_TYPE) + + exit_idx = 0 + position_idx = 0 + for xpos, ypos in zip(X, Y): # + mode_idx = 0 + for pr_mode in range(num_modes): + for ob_mode in range(1): + addr_original[position_idx, mode_idx] = np.array([[pr_mode, 0, 0], + [ob_mode, ypos, xpos], + [exit_idx, 0, 0], + [0, 0, 0], + [0, 0, 0]], dtype=INT_TYPE) + mode_idx += 1 + exit_idx += 1 + position_idx += 1 + + return addr_original + + def test_get_address_REGRESSION(self): + # the other manglers are using the BaseMangler's get_address function + # so we set the deltas in a BaseMangler object and test get_address + + scan_pts=2 + addr_original = self.prepare_addresses(scan_pts=scan_pts) + addr_original_dev = gpuarray.to_gpu(addr_original) + nshifts=1 + step_size=2 + mglr = gam.BaseMangler(step_size, 50, 100, nshifts, max_bound=2) + # 2 shifts, with positive/negative shifting + mglr.delta = np.array([ + [1, 2], + [-4, -2] + ], dtype=np.int32) + mglr._setup_delta_gpu() + + addr1 = addr_original_dev.copy() + mglr.get_address(0, addr_original_dev, addr1, 10, 9) + + addr2 = addr_original_dev.copy() + mglr.get_address(1, addr_original_dev, addr2, 10, 9) + + exp1 = np.copy(addr_original) + exp2 = np.copy(addr_original) + # element-wise here to prepare reference + for f in range(addr_original.shape[0]): + for m in range(addr_original.shape[1]): + exp1[f, m, 1, 1] = max(0, min(10, addr_original[f, m, 1, 1] + 1)) + exp1[f, m, 1, 2] = max(0, min(9, addr_original[f, m, 1, 2] + 2)) + exp2[f, m, 1, 1] = max(0, min(10, addr_original[f, m, 1, 1] - 4)) + exp2[f, m, 1, 2] = max(0, min(9, addr_original[f, m, 1, 2] - 2)) + + np.testing.assert_array_equal(addr2.get(), exp2) + np.testing.assert_array_equal(addr1.get(), exp1) + diff --git a/test/accelerate_tests/cuda_pycuda_tests/array_utils_test.py b/test/accelerate_tests/cuda_pycuda_tests/array_utils_test.py index dcd133344..23950af26 100644 --- a/test/accelerate_tests/cuda_pycuda_tests/array_utils_test.py +++ b/test/accelerate_tests/cuda_pycuda_tests/array_utils_test.py @@ -88,7 +88,7 @@ def test_transpose_2D(self): out_dev = gpuarray.empty((3,5), dtype=np.int32) ## Act - AU = gau.ArrayUtilsKernel() + AU = gau.TransposeKernel() AU.transpose(inp_dev, out_dev) ## Assert @@ -103,7 +103,7 @@ def test_transpose_2D_large(self): out_dev = gpuarray.empty((61,137), dtype=np.int32) ## Act - AU = gau.ArrayUtilsKernel() + AU = gau.TransposeKernel() AU.transpose(inp_dev, out_dev) ## Assert @@ -118,7 +118,7 @@ def test_transpose_4D(self): out_dev = gpuarray.empty((5, 3, 250, 3), dtype=np.int32) ## Act - AU = gau.ArrayUtilsKernel() + AU = gau.TransposeKernel() AU.transpose(inp_dev.reshape(750, 15), out_dev.reshape(15, 750)) ## Assert @@ -128,124 +128,124 @@ def test_transpose_4D(self): def test_complex_gaussian_filter_1d_no_blurring_UNITY(self): # Arrange - inp = np.zeros((11,), dtype=np.complex64) - inp[5] = 1.0 +1.0j + data = np.zeros((11,), dtype=np.complex64) + data[5] = 1.0 +1.0j mfs = [0] - inp_dev = gpuarray.to_gpu(inp) - out_dev = gpuarray.empty((11,), dtype=np.complex64) + data_dev = gpuarray.to_gpu(data) + tmp_dev = gpuarray.empty((11,), dtype=np.complex64) # Act GS = gau.GaussianSmoothingKernel() - GS.convolution(inp_dev, out_dev, mfs) + GS.convolution(data_dev, mfs, tmp=tmp_dev) # Assert - out_exp = au.complex_gaussian_filter(inp, mfs) - out = out_dev.get() + out_exp = au.complex_gaussian_filter(data, mfs) + out = data_dev.get() 
self.assertTrue(np.testing.assert_allclose(out_exp, out, rtol=1e-5) is None) def test_complex_gaussian_filter_1d_little_blurring_UNITY(self): # Arrange - inp = np.zeros((11,), dtype=np.complex64) - inp[5] = 1.0 +1.0j + data = np.zeros((11,), dtype=np.complex64) + data[5] = 1.0 +1.0j mfs = [0.2] - inp_dev = gpuarray.to_gpu(inp) - out_dev = gpuarray.empty((11,), dtype=np.complex64) + data_dev = gpuarray.to_gpu(data) + tmp_dev = gpuarray.empty((11,), dtype=np.complex64) # Act GS = gau.GaussianSmoothingKernel() - GS.convolution(inp_dev, out_dev, mfs) + GS.convolution(data_dev, mfs, tmp=tmp_dev) # Assert - out_exp = au.complex_gaussian_filter(inp, mfs) - out = out_dev.get() + out_exp = au.complex_gaussian_filter(data, mfs) + out = data_dev.get() np.testing.assert_allclose(out_exp, out, rtol=1e-5) def test_complex_gaussian_filter_1d_more_blurring_UNITY(self): # Arrange - inp = np.zeros((11,), dtype=np.complex64) - inp[5] = 1.0 +1.0j + data = np.zeros((11,), dtype=np.complex64) + data[5] = 1.0 +1.0j mfs = [2.0] - inp_dev = gpuarray.to_gpu(inp) - out_dev = gpuarray.empty((11,), dtype=np.complex64) + data_dev = gpuarray.to_gpu(data) + tmp_dev = gpuarray.empty((11,), dtype=np.complex64) # Act GS = gau.GaussianSmoothingKernel() - GS.convolution(inp_dev, out_dev, mfs) + GS.convolution(data_dev, mfs, tmp=tmp_dev) # Assert - out_exp = au.complex_gaussian_filter(inp, mfs) - out = out_dev.get() + out_exp = au.complex_gaussian_filter(data, mfs) + out = data_dev.get() np.testing.assert_allclose(out_exp, out, rtol=1e-5) def test_complex_gaussian_filter_2d_no_blurring_UNITY(self): # Arrange - inp = np.zeros((11, 11), dtype=np.complex64) - inp[5, 5] = 1.0+1.0j + data = np.zeros((11, 11), dtype=np.complex64) + data[5, 5] = 1.0+1.0j mfs = 0.0,0.0 - inp_dev = gpuarray.to_gpu(inp) - out_dev = gpuarray.empty((11,11), dtype=np.complex64) + data_dev = gpuarray.to_gpu(data) + tmp_dev = gpuarray.empty((11,11), dtype=np.complex64) # Act GS = gau.GaussianSmoothingKernel() - GS.convolution(inp_dev, out_dev, mfs) + GS.convolution(data_dev, mfs, tmp=tmp_dev) # Assert - out_exp = au.complex_gaussian_filter(inp, mfs) - out = out_dev.get() + out_exp = au.complex_gaussian_filter(data, mfs) + out = data_dev.get() np.testing.assert_allclose(out_exp, out, rtol=1e-5) def test_complex_gaussian_filter_2d_little_blurring_UNITY(self): # Arrange - inp = np.zeros((11, 11), dtype=np.complex64) - inp[5, 5] = 1.0+1.0j + data = np.zeros((11, 11), dtype=np.complex64) + data[5, 5] = 1.0+1.0j mfs = 0.2,0.2 - inp_dev = gpuarray.to_gpu(inp) - out_dev = gpuarray.empty((11,11),dtype=np.complex64) + data_dev = gpuarray.to_gpu(data) + tmp_dev = gpuarray.empty((11,11),dtype=np.complex64) # Act GS = gau.GaussianSmoothingKernel() - GS.convolution(inp_dev, out_dev, mfs) + GS.convolution(data_dev, mfs, tmp=tmp_dev) # Assert - out_exp = au.complex_gaussian_filter(inp, mfs) - out = out_dev.get() + out_exp = au.complex_gaussian_filter(data, mfs) + out = data_dev.get() np.testing.assert_allclose(out_exp, out, rtol=1e-5) def test_complex_gaussian_filter_2d_more_blurring_UNITY(self): # Arrange - inp = np.zeros((8, 8), dtype=np.complex64) - inp[3:5, 3:5] = 2.0+2.0j + data = np.zeros((8, 8), dtype=np.complex64) + data[3:5, 3:5] = 2.0+2.0j mfs = 3.0,4.0 - inp_dev = gpuarray.to_gpu(inp) - out_dev = gpuarray.empty((8,8), dtype=np.complex64) + data_dev = gpuarray.to_gpu(data) + #tmp_dev = gpuarray.empty((8,8), dtype=np.complex64) # Act GS = gau.GaussianSmoothingKernel() - GS.convolution(inp_dev, out_dev, mfs) + GS.convolution(data_dev, mfs) # Assert - out_exp = 
au.complex_gaussian_filter(inp, mfs) - out = out_dev.get() + out_exp = au.complex_gaussian_filter(data, mfs) + out = data_dev.get() np.testing.assert_allclose(out_exp, out, rtol=1e-4) def test_complex_gaussian_filter_2d_nonsquare_UNITY(self): # Arrange - inp = np.zeros((32, 16), dtype=np.complex64) - inp[3:4, 11:12] = 2.0+2.0j - inp[3:5, 3:5] = 2.0+2.0j - inp[20:25,3:5] = 2.0+2.0j + data = np.zeros((32, 16), dtype=np.complex64) + data[3:4, 11:12] = 2.0+2.0j + data[3:5, 3:5] = 2.0+2.0j + data[20:25,3:5] = 2.0+2.0j mfs = 1.0,1.0 - inp_dev = gpuarray.to_gpu(inp) - out_dev = gpuarray.empty(inp.shape, dtype=np.complex64) + data_dev = gpuarray.to_gpu(data) + tmp_dev = gpuarray.empty(data_dev.shape, dtype=np.complex64) # Act GS = gau.GaussianSmoothingKernel() - GS.convolution(inp_dev, out_dev, mfs) + GS.convolution(data_dev, mfs, tmp=tmp_dev) # Assert - out_exp = au.complex_gaussian_filter(inp, mfs) - out = out_dev.get() + out_exp = au.complex_gaussian_filter(data, mfs) + out = data_dev.get() np.testing.assert_allclose(out_exp, out, rtol=1e-4) @@ -254,17 +254,152 @@ def test_complex_gaussian_filter_2d_batched(self): batch_number = 2 A = 5 B = 5 - inp = np.zeros((batch_number, A, B), dtype=np.complex64) - inp[:, 2:3, 2:3] = 2.0+2.0j + data = np.zeros((batch_number, A, B), dtype=np.complex64) + data[:, 2:3, 2:3] = 2.0+2.0j mfs = 3.0,4.0 - inp_dev = gpuarray.to_gpu(inp) - out_dev = gpuarray.empty((batch_number,A,B), dtype=np.complex64) + data_dev = gpuarray.to_gpu(data) + tmp_dev = gpuarray.empty((batch_number,A,B), dtype=np.complex64) # Act GS = gau.GaussianSmoothingKernel() - GS.convolution(inp_dev, out_dev, mfs) + GS.convolution(data_dev, mfs, tmp=tmp_dev) # Assert - out_exp = au.complex_gaussian_filter(inp, mfs) - out = out_dev.get() + out_exp = au.complex_gaussian_filter(data, mfs) + out = data_dev.get() np.testing.assert_allclose(out_exp, out, rtol=1e-4) + + + def test_crop_pad_simple_1_UNITY(self): + # pad, integer, 2D + B = np.indices((4, 4), dtype=int).sum(0) + A = np.zeros((6, 6), dtype=B.dtype) + B_dev = gpuarray.to_gpu(B) + A_dev = gpuarray.to_gpu(A) + + # Act + au.crop_pad_2d_simple(A, B) + k = gau.CropPadKernel(queue=self.stream) + k.crop_pad_2d_simple(A_dev, B_dev) + + # Assert + np.testing.assert_allclose(A, A_dev.get(), rtol=1e-6, atol=1e-6) + + def test_crop_pad_simple_2_UNITY(self): + # crop, float, 3D + B = np.indices((4, 4), dtype=np.float32) + A = np.zeros((2, 2, 2), dtype=B.dtype) + B_dev = gpuarray.to_gpu(B) + A_dev = gpuarray.to_gpu(A) + + # Act + au.crop_pad_2d_simple(A, B) + k = gau.CropPadKernel(queue=self.stream) + k.crop_pad_2d_simple(A_dev, B_dev) + + + # Assert + np.testing.assert_allclose(A, A_dev.get(), rtol=1e-6, atol=1e-6) + + def test_crop_pad_simple_3_UNITY(self): + # crop/pad, complex, 3D + B = np.indices((4, 3), dtype=np.complex64) + B = np.indices((4, 3), dtype=np.complex64) + 1j * B[::-1, :, :] + A = np.zeros((2, 2, 5), dtype=B.dtype) + B_dev = gpuarray.to_gpu(B) + A_dev = gpuarray.to_gpu(A) + + # Act + au.crop_pad_2d_simple(A, B) + k = gau.CropPadKernel(queue=self.stream) + k.crop_pad_2d_simple(A_dev, B_dev) + + # Assert + np.testing.assert_allclose(A, A_dev.get(), rtol=1e-6, atol=1e-6) + + def test_crop_pad_simple_difflike_UNITY(self): + np.random.seed(1983) + # crop/pad, 3D stack of 2D frames + D = np.random.randint(0, 3000, (100,256,256)).astype(np.float32) + A = np.zeros((100,260,260), dtype=D.dtype) + B = np.zeros((100,250,250), dtype=D.dtype) + B_dev = gpuarray.to_gpu(B) + A_dev = gpuarray.to_gpu(A) + D_dev = gpuarray.to_gpu(D) + + # Act + au.crop_pad_2d_simple(A, D)
+ au.crop_pad_2d_simple(B, D) + k = gau.CropPadKernel(queue=self.stream) + k.crop_pad_2d_simple(A_dev, D_dev) + k.crop_pad_2d_simple(B_dev, D_dev) + + # Assert + np.testing.assert_allclose(A, A_dev.get(), rtol=1e-6, atol=1e-6) + np.testing.assert_allclose(B, B_dev.get(), rtol=1e-6, atol=1e-6) + + def test_crop_pad_simple_oblike_UNITY(self): + np.random.seed(1983) + # crop/pad, 3D stack of 2D frames + B = np.random.rand(2,1230,1434).astype(np.complex64) \ +2j * np.pi * np.random.randn(2,1230,1434).astype(np.complex64) + A = np.ones((2,1000,1500), dtype=B.dtype) + B_dev = gpuarray.to_gpu(B) + A_dev = gpuarray.to_gpu(A) + + # Act + au.crop_pad_2d_simple(A, B) + k = gau.CropPadKernel(queue=self.stream) + k.crop_pad_2d_simple(A_dev, B_dev) + + # Assert + np.testing.assert_allclose(A, A_dev.get(), rtol=1e-6, atol=1e-6) + + def test_max_abs2_complex_UNITY(self): + np.random.seed(1983) + X = (np.random.randint(-1000, 1000, (3,100,200)).astype(np.float32) + \ + 1j * np.random.randint(-1000, 1000, (3,100,200)).astype(np.float32)).astype(np.complex64) + out = np.zeros((1,), dtype=np.float32) + X_dev = gpuarray.to_gpu(X) + out_dev = gpuarray.to_gpu(out) + + out = au.max_abs2(X) + + MAK = gau.MaxAbs2Kernel(queue=self.stream) + MAK.max_abs2(X_dev, out_dev) + + np.testing.assert_allclose(out_dev.get(), out, rtol=1e-6, atol=1e-6, + err_msg="The object norm array has not been updated as expected") + + def test_max_abs2_float_UNITY(self): + np.random.seed(1983) + X = np.random.randint(-1000, 1000, (3,100,200)).astype(np.float32) + + out = np.zeros((1,), dtype=np.float32) + X_dev = gpuarray.to_gpu(X) + out_dev = gpuarray.to_gpu(out) + + out = au.max_abs2(X) + + MAK = gau.MaxAbs2Kernel(queue=self.stream) + MAK.max_abs2(X_dev, out_dev) + + np.testing.assert_allclose(out_dev.get(), out, rtol=1e-6, atol=1e-6, + err_msg="The object norm array has not been updated as expected") + + + def test_clip_magnitudes_to_range_UNITY(self): + np.random.seed(1987) + A = np.random.random((2,10,10)) + B = A[0] + 1j* A[1] + B = B.astype(np.complex64) + B_gpu = gpuarray.to_gpu(B) + + au.clip_complex_magnitudes_to_range(B, 0.2,0.8) + CMK = gau.ClipMagnitudesKernel() + CMK.clip_magnitudes_to_range(B_gpu, 0.2, 0.8) + + np.testing.assert_allclose(B_gpu.get(), B, rtol=1e-6, atol=1e-6, + err_msg="The magnitudes of the array have not been clipped as expected") + + diff --git a/test/accelerate_tests/cuda_pycuda_tests/auxiliary_wave_kernel_test.py b/test/accelerate_tests/cuda_pycuda_tests/auxiliary_wave_kernel_test.py index bc38a62b1..71e8e1e7e 100644 --- a/test/accelerate_tests/cuda_pycuda_tests/auxiliary_wave_kernel_test.py +++ b/test/accelerate_tests/cuda_pycuda_tests/auxiliary_wave_kernel_test.py @@ -17,7 +17,7 @@ class AuxiliaryWaveKernelTest(PyCudaTest): - def prepare_arrays(self, performance=False): + def prepare_arrays(self, performance=False, scan_points=None): if not performance: B = 3 # frame size y C = 3 # frame size x @@ -27,8 +27,10 @@ def prepare_arrays(self): npts_greater_than = 2 # how many points bigger than the probe the object is.
G = 2 # number of object modes - - scan_pts = 2 # one dimensional scan point number + if scan_points is None: + scan_pts = 2 # one dimensional scan point number + else: + scan_pts = scan_points else: B = 128 C = 128 @@ -37,7 +39,10 @@ def prepare_arrays(self, performance=False): F = C npts_greater_than = 1215 G = 4 - scan_pts = 14 + if scan_points is None: + scan_pts = 14 + else: + scan_pts = scan_points H = B + npts_greater_than # object size y I = C + npts_greater_than # object size x @@ -189,6 +194,25 @@ def test_build_aux_same_as_exit_UNITY(self): np.testing.assert_array_equal(auxiliary_wave, auxiliary_wave_dev.get(), err_msg="The gpu auxiliary_wave does not look the same as the numpy version") + def test_build_aux2_same_as_exit_UNITY(self): + ## Arrange + addr, object_array, probe, exit_wave = self.prepare_arrays() + addr_dev, object_array_dev, probe_dev, exit_wave_dev = self.copy_to_gpu(addr, object_array, probe, exit_wave) + auxiliary_wave = np.zeros_like(exit_wave) + auxiliary_wave_dev = gpuarray.zeros_like(exit_wave_dev) + + ## Act + from ptypy.accelerate.base.kernels import AuxiliaryWaveKernel as npAuxiliaryWaveKernel + nAWK = npAuxiliaryWaveKernel() + AWK = AuxiliaryWaveKernel(self.stream) + alpha_set = FLOAT_TYPE(1.0) + + AWK.build_aux2(auxiliary_wave_dev, addr_dev, object_array_dev, probe_dev, exit_wave_dev, alpha=alpha_set) + nAWK.build_aux(auxiliary_wave, addr, object_array, probe, exit_wave, alpha=alpha_set) + + ## Assert + np.testing.assert_array_equal(auxiliary_wave, auxiliary_wave_dev.get(), + err_msg="The gpu auxiliary_wave does not look the same as the numpy version") def test_build_exit_aux_same_as_exit_REGRESSION(self): ## Arrange @@ -413,6 +437,27 @@ def test_build_aux_no_ex_noadd_UNITY(self): np.testing.assert_array_equal(auxiliary_wave_dev.get(), auxiliary_wave, err_msg="The auxiliary_wave does not match numpy") + def test_build_aux2_no_ex_noadd_UNITY(self): + ## Arrange + addr, object_array, probe, exit_wave = self.prepare_arrays() + addr_dev, object_array_dev, probe_dev, exit_wave_dev = self.copy_to_gpu(addr, object_array, probe, exit_wave) + auxiliary_wave_dev = gpuarray.zeros_like(exit_wave_dev) + auxiliary_wave = np.zeros_like(exit_wave) + + ## Act + AWK = AuxiliaryWaveKernel(self.stream) + AWK.allocate() + AWK.build_aux2_no_ex(auxiliary_wave_dev, addr_dev, object_array_dev, probe_dev, + fac=1.0, add=False) + from ptypy.accelerate.base.kernels import AuxiliaryWaveKernel as npAuxiliaryWaveKernel + nAWK = npAuxiliaryWaveKernel() + nAWK.allocate() + nAWK.build_aux_no_ex(auxiliary_wave, addr, object_array, probe, fac=1.0, add=False) + + ## Assert + np.testing.assert_array_equal(auxiliary_wave_dev.get(), auxiliary_wave, + err_msg="The auxiliary_wave does not match numpy") + def test_build_aux_no_ex_add_REGRESSION(self): ## Arrange @@ -500,6 +545,27 @@ def test_build_aux_no_ex_add_UNITY(self): np.testing.assert_array_equal(auxiliary_wave_dev.get(), auxiliary_wave, err_msg="The auxiliary_wave does not match numpy") + def test_build_aux2_no_ex_add_UNITY(self): + ## Arrange + addr, object_array, probe, exit_wave = self.prepare_arrays() + addr_dev, object_array_dev, probe_dev, exit_wave_dev = self.copy_to_gpu(addr, object_array, probe, exit_wave) + auxiliary_wave_dev = gpuarray.ones_like(exit_wave_dev) + auxiliary_wave = np.ones_like(exit_wave) + + ## Act + AWK = AuxiliaryWaveKernel(self.stream) + AWK.allocate() + AWK.build_aux2_no_ex(auxiliary_wave_dev, addr_dev, object_array_dev, probe_dev, + fac=2.0, add=True) + from ptypy.accelerate.base.kernels import 
AuxiliaryWaveKernel as npAuxiliaryWaveKernel + nAWK = npAuxiliaryWaveKernel() + nAWK.allocate() + nAWK.build_aux_no_ex(auxiliary_wave, addr, object_array, probe, fac=2.0, add=True) + + ## Assert + np.testing.assert_array_equal(auxiliary_wave_dev.get(), auxiliary_wave, + err_msg="The auxiliary_wave does not match numpy") + @unittest.skipIf(not perfrun, "performance test") def test_build_aux_no_ex_performance(self): @@ -512,5 +578,89 @@ def test_build_aux_no_ex_performance(self): AWK.build_aux_no_ex(auxiliary_wave, addr, object_array, probe, fac=1.0, add=False) + + def test_build_exit_alpha_tau_REGRESSION(self): + ## Arrange + addr, object_array, probe, exit_wave = self.prepare_arrays(scan_points=1) + addr, object_array, probe, exit_wave = self.copy_to_gpu(addr, object_array, probe, exit_wave) + auxiliary_wave = gpuarray.zeros_like(exit_wave) + + ## Act + AWK = AuxiliaryWaveKernel(self.stream) + AWK.allocate() + AWK.build_exit_alpha_tau(auxiliary_wave, addr, object_array, probe, exit_wave) + + # Assert + expected_auxiliary_wave = np.array( + [[[0. -2.j, 0. -2.j, 0. -2.j], + [0. -2.j, 0. -2.j, 0. -2.j], + [0. -2.j, 0. -2.j, 0. -2.j]], + + [[0. -8.j, 0. -8.j, 0. -8.j], + [0. -8.j, 0. -8.j, 0. -8.j], + [0. -8.j, 0. -8.j, 0. -8.j]], + + [[0. -4.j, 0. -4.j, 0. -4.j], + [0. -4.j, 0. -4.j, 0. -4.j], + [0. -4.j, 0. -4.j, 0. -4.j]], + + [[0.-16.j, 0.-16.j, 0.-16.j], + [0.-16.j, 0.-16.j, 0.-16.j], + [0.-16.j, 0.-16.j, 0.-16.j]]], dtype=np.complex64) + np.testing.assert_allclose(auxiliary_wave.get(), expected_auxiliary_wave, rtol=1e-6, atol=1e-6, + err_msg="The auxiliary_wave has not been updated as expected") + + expected_exit_wave = np.array( + [[[1. -1.j, 1. -1.j, 1. -1.j], + [1. -1.j, 1. -1.j, 1. -1.j], + [1. -1.j, 1. -1.j, 1. -1.j]], + + [[2. -6.j, 2. -6.j, 2. -6.j], + [2. -6.j, 2. -6.j, 2. -6.j], + [2. -6.j, 2. -6.j, 2. -6.j]], + + [[3. -1.j, 3. -1.j, 3. -1.j], + [3. -1.j, 3. -1.j, 3. -1.j], + [3. -1.j, 3. -1.j, 3. 
-1.j]], + + [[4.-12.j, 4.-12.j, 4.-12.j], + [4.-12.j, 4.-12.j, 4.-12.j], + [4.-12.j, 4.-12.j, 4.-12.j]]], dtype=np.complex64) + np.testing.assert_allclose(exit_wave.get(), expected_exit_wave, rtol=1e-6, atol=1e-6, + err_msg="The exit_wave has not been updated as expected") + + def test_build_exit_alpha_tau_UNITY(self): + ## Arrange + addr, object_array, probe, exit_wave = self.prepare_arrays(scan_points=1) + addr_dev, object_array_dev, probe_dev, exit_wave_dev = self.copy_to_gpu(addr, object_array, probe, exit_wave) + auxiliary_wave_dev = gpuarray.ones_like(exit_wave_dev) + auxiliary_wave = np.ones_like(exit_wave) + + ## Act + AWK = AuxiliaryWaveKernel(self.stream) + AWK.allocate() + AWK.build_exit_alpha_tau(auxiliary_wave_dev, addr_dev, object_array_dev, probe_dev, exit_wave_dev, alpha=0.8, tau=0.6) + from ptypy.accelerate.base.kernels import AuxiliaryWaveKernel as npAuxiliaryWaveKernel + nAWK = npAuxiliaryWaveKernel() + nAWK.allocate() + nAWK.build_exit_alpha_tau(auxiliary_wave, addr, object_array, probe, exit_wave, alpha=0.8, tau=0.6) + + ## Assert + np.testing.assert_allclose(auxiliary_wave_dev.get(), auxiliary_wave, rtol=1e-6, atol=1e-6, + err_msg="The auxiliary_wave does not match numpy") + ## Assert + np.testing.assert_allclose(exit_wave_dev.get(), exit_wave, rtol=1e-6, atol=1e-6, + err_msg="The exit_wave does not match numpy") + + @unittest.skipIf(not perfrun, "performance test") + def test_build_exit_alpha_tau_performance(self): + addr, object_array, probe, exit_wave = self.prepare_arrays(performance=True, scan_points=1) + addr, object_array, probe, exit_wave = self.copy_to_gpu(addr, object_array, probe, exit_wave) + auxiliary_wave = gpuarray.zeros_like(exit_wave) + + AWK = AuxiliaryWaveKernel(self.stream) + AWK.allocate() + AWK.build_exit_alpha_tau(auxiliary_wave, addr, object_array, probe, exit_wave, alpha=0.8, tau=0.6) + if __name__ == '__main__': unittest.main() diff --git a/test/accelerate_tests/cuda_pycuda_tests/dls_tests/__init__.py b/test/accelerate_tests/cuda_pycuda_tests/dls_tests/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/test/accelerate_tests/cuda_pycuda_tests/dls_tests/dls_auxiliary_wave_kernel_test.py b/test/accelerate_tests/cuda_pycuda_tests/dls_tests/dls_auxiliary_wave_kernel_test.py new file mode 100644 index 000000000..0d943c28e --- /dev/null +++ b/test/accelerate_tests/cuda_pycuda_tests/dls_tests/dls_auxiliary_wave_kernel_test.py @@ -0,0 +1,57 @@ +''' +Testing based on real data +''' +import h5py +import unittest +import numpy as np +from parameterized import parameterized +from .. 
import perfrun, PyCudaTest, have_pycuda + +if have_pycuda(): + from pycuda import gpuarray + from ptypy.accelerate.cuda_pycuda.kernels import AuxiliaryWaveKernel +from ptypy.accelerate.base.kernels import AuxiliaryWaveKernel as BaseAuxiliaryWaveKernel + +COMPLEX_TYPE = np.complex64 +FLOAT_TYPE = np.float32 +INT_TYPE = np.int32 + +class DlsAuxiliaryWaveKernelTest(PyCudaTest): + + datadir = "/dls/science/users/iat69393/gpu-hackathon/test-data-%s/" + rtol = 1e-6 + atol = 1e-6 + + @parameterized.expand([ + ["base", 10], + ["regul", 50], + ["floating", 0], + ]) + def test_build_aux_no_ex_noadd_UNITY(self, name, iter): + + # Load data + with h5py.File(self.datadir % name + "build_aux_no_ex_%04d.h5" %iter, "r") as f: + aux = f["aux"][:] + addr = f["addr"][:] + ob = f["ob"][:] + pr = f["pr"][:] + + # Copy data to device + aux_dev = gpuarray.to_gpu(aux) + addr_dev = gpuarray.to_gpu(addr) + ob_dev = gpuarray.to_gpu(ob) + pr_dev = gpuarray.to_gpu(pr) + + # CPU kernel + BAWK = BaseAuxiliaryWaveKernel() + BAWK.allocate() + BAWK.build_aux_no_ex(aux, addr, ob, pr, add=False) + + ## GPU kernel + AWK = AuxiliaryWaveKernel(self.stream) + AWK.allocate() + AWK.build_aux_no_ex(aux_dev, addr_dev, ob_dev, pr_dev, add=False) + + ## Assert + np.testing.assert_allclose(aux_dev.get(), aux, rtol=self.rtol, atol=self.atol, + err_msg="The auxiliary_wave does not match the base kernel output") \ No newline at end of file diff --git a/test/accelerate_tests/cuda_pycuda_tests/dls_tests/dls_drpycuda_test.py b/test/accelerate_tests/cuda_pycuda_tests/dls_tests/dls_drpycuda_test.py new file mode 100644 index 000000000..57f62f9dd --- /dev/null +++ b/test/accelerate_tests/cuda_pycuda_tests/dls_tests/dls_drpycuda_test.py @@ -0,0 +1,83 @@ +''' +Testing on real data +''' + +import h5py +import unittest +import numpy as np +from parameterized import parameterized +from .. 
import PyCudaTest, have_pycuda + +if have_pycuda(): + from pycuda import gpuarray + from ptypy.accelerate.cuda_pycuda.kernels import PoUpdateKernel +from ptypy.accelerate.base.kernels import PoUpdateKernel as BasePoUpdateKernel + +COMPLEX_TYPE = np.complex64 +FLOAT_TYPE = np.float32 +INT_TYPE = np.int32 + +class DlsDRpycudaTest(PyCudaTest): + + datadir = "/dls/science/users/iat69393/gpu-hackathon/test-data-dr/" + iter = 0 + rtol = 1e-6 + atol = 1e-6 + + def test_ob_update_local_UNITY(self): + + # Load data + with h5py.File(self.datadir + "ob_update_local_%04d.h5" %self.iter, "r") as f: + aux = f["aux"][:] + addr = f["addr"][:] + ob = f["ob"][:] + pr = f["pr"][:] + ex = f["ex"][:] + + # Copy data to device + aux_dev = gpuarray.to_gpu(aux) + ob_dev = gpuarray.to_gpu(ob) + pr_dev = gpuarray.to_gpu(pr) + ex_dev = gpuarray.to_gpu(ex) + addr_dev = gpuarray.to_gpu(addr) + + # CPU Kernel + BPOK = BasePoUpdateKernel() + BPOK.ob_update_local(addr, ob, pr, ex, aux) + + # GPU Kernel + POK = PoUpdateKernel() + POK.ob_update_local(addr_dev, ob_dev, pr_dev, ex_dev, aux_dev) + + ## Assert + np.testing.assert_allclose(ob_dev.get(), ob, atol=self.atol, rtol=self.rtol, verbose=False, + err_msg="The object array has not been updated as expected") + + def test_pr_update_local_UNITY(self): + + # Load data + with h5py.File(self.datadir + "pr_update_local_%04d.h5" %self.iter, "r") as f: + aux = f["aux"][:] + addr = f["addr"][:] + ob = f["ob"][:] + pr = f["pr"][:] + ex = f["ex"][:] + + # Copy data to device + aux_dev = gpuarray.to_gpu(aux) + ob_dev = gpuarray.to_gpu(ob) + pr_dev = gpuarray.to_gpu(pr) + ex_dev = gpuarray.to_gpu(ex) + addr_dev = gpuarray.to_gpu(addr) + + # CPU Kernel + BPOK = BasePoUpdateKernel() + BPOK.pr_update_local(addr, pr, ob, ex, aux) + + # GPU Kernel + POK = PoUpdateKernel() + POK.pr_update_local(addr_dev, pr_dev, ob_dev, ex_dev, aux_dev) + + ## Assert + np.testing.assert_allclose(pr_dev.get(), pr, atol=self.atol, rtol=self.rtol, verbose=False, + err_msg="The probe array has not been updated as expected") diff --git a/test/accelerate_tests/cuda_pycuda_tests/dls_tests/dls_gradient_descent_kernel_test.py b/test/accelerate_tests/cuda_pycuda_tests/dls_tests/dls_gradient_descent_kernel_test.py new file mode 100644 index 000000000..f62834e2e --- /dev/null +++ b/test/accelerate_tests/cuda_pycuda_tests/dls_tests/dls_gradient_descent_kernel_test.py @@ -0,0 +1,261 @@ +''' +Testing on real data +''' + +import h5py +import unittest +import numpy as np +from parameterized import parameterized +from ..
import perfrun, PyCudaTest, have_pycuda + +if have_pycuda(): + from pycuda import gpuarray + from ptypy.accelerate.cuda_pycuda.kernels import GradientDescentKernel +from ptypy.accelerate.base.kernels import GradientDescentKernel as BaseGradientDescentKernel + +COMPLEX_TYPE = np.complex64 +FLOAT_TYPE = np.float32 +INT_TYPE = np.int32 + +class DlsGradientDescentKernelTest(PyCudaTest): + + datadir = "/dls/science/users/iat69393/gpu-hackathon/test-data-%s/" + rtol = 1e-6 + atol = 1e-6 + + @parameterized.expand([ + ["base", 10], + ["regul", 50], + ["floating", 0], + ]) + def test_make_model_UNITY(self, name, iter): + + # Load data + with h5py.File(self.datadir %name + "make_model_%04d.h5" %iter, "r") as f: + aux = f["aux"][:] + addr = f["addr"][:] + + # Copy data to device + aux_dev = gpuarray.to_gpu(aux) + addr_dev = gpuarray.to_gpu(addr) + + # CPU Kernel + BGDK = BaseGradientDescentKernel(aux, addr.shape[1]) + BGDK.allocate() + BGDK.make_model(aux, addr) + + # GPU kernel + GDK = GradientDescentKernel(aux_dev, addr.shape[1]) + GDK.allocate() + GDK.make_model(aux_dev, addr_dev) + + ## Assert + np.testing.assert_allclose(BGDK.npy.Imodel, GDK.gpu.Imodel.get(), atol=self.atol, rtol=self.rtol, + err_msg="`Imodel` buffer has not been updated as expected") + + @parameterized.expand([ + ["base", 10], + ["regul", 50], + ["floating", 0], + ]) + def test_floating_intensity_UNITY(self, name, iter): + + # Load data + with h5py.File(self.datadir %name + "floating_intensities_%04d.h5" %iter, "r") as f: + w = f["w"][:] + addr = f["addr"][:] + I = f["I"][:] + fic = f["fic"][:] + Imodel = f["Imodel"][:] + with h5py.File(self.datadir %name + "make_model_%04d.h5" %iter, "r") as f: + aux = f["aux"][:] + + # Copy data to device + aux_dev = gpuarray.to_gpu(aux) + w_dev = gpuarray.to_gpu(w) + addr_dev = gpuarray.to_gpu(addr) + I_dev = gpuarray.to_gpu(I) + fic_dev = gpuarray.to_gpu(fic) + Imodel_dev = gpuarray.to_gpu(np.ascontiguousarray(Imodel)) + + # CPU Kernel + BGDK = BaseGradientDescentKernel(aux, addr.shape[1]) + BGDK.allocate() + BGDK.npy.Imodel = Imodel + BGDK.floating_intensity(addr, w, I, fic) + + # GPU kernel + GDK = GradientDescentKernel(aux_dev, addr.shape[1]) + GDK.allocate() + GDK.gpu.Imodel = Imodel_dev + GDK.floating_intensity(addr_dev, w_dev, I_dev, fic_dev) + + ## Assert + np.testing.assert_allclose(BGDK.npy.LLerr, GDK.gpu.LLerr.get(), atol=self.atol, rtol=self.rtol, + verbose=False, equal_nan=False, + err_msg="`LLerr` buffer has not been updated as expected") + np.testing.assert_allclose(BGDK.npy.LLden, GDK.gpu.LLden.get(), atol=self.atol, rtol=self.rtol, + verbose=False, equal_nan=False, + err_msg="`LLden` buffer has not been updated as expected") + np.testing.assert_allclose(BGDK.npy.fic_tmp, GDK.gpu.fic_tmp.get(), atol=self.atol, rtol=self.rtol, + verbose=False, equal_nan=False, + err_msg="`fic_tmp` buffer has not been updated as expected") + + np.testing.assert_allclose(fic, fic_dev.get(), atol=self.atol, rtol=self.rtol, + verbose=False, equal_nan=False, + err_msg="floating intensity coeff (fic) has not been updated as expected") + + np.testing.assert_allclose(BGDK.npy.Imodel, GDK.gpu.Imodel.get(), atol=self.atol, rtol=self.rtol, + verbose=False, equal_nan=False, + err_msg="`Imodel` buffer has not been updated as expected") + + + @parameterized.expand([ + ["base", 10], + ["regul", 50], + ["floating", 0], + ]) + def test_main_and_error_reduce_UNITY(self, name, iter): + + # Load data + with h5py.File(self.datadir %name + "main_%04d.h5" %iter, "r") as f: + aux = f["aux"][:] + addr = f["addr"][:] + 
w = f["w"][:] + I = f["I"][:] + # Load data + with h5py.File(self.datadir %name + "error_reduce_%04d.h5" %iter, "r") as f: + err_phot = f["err_phot"][:] + + # Copy data to device + aux_dev = gpuarray.to_gpu(aux) + w_dev = gpuarray.to_gpu(w) + addr_dev = gpuarray.to_gpu(addr) + I_dev = gpuarray.to_gpu(I) + err_phot_dev = gpuarray.to_gpu(err_phot) + + # CPU Kernel + BGDK = BaseGradientDescentKernel(aux, addr.shape[1]) + BGDK.allocate() + BGDK.main(aux, addr, w, I) + BGDK.error_reduce(addr, err_phot) + + # GPU kernel + GDK = GradientDescentKernel(aux_dev, addr.shape[1]) + GDK.allocate() + GDK.main(aux_dev, addr_dev, w_dev, I_dev) + GDK.error_reduce(addr_dev, err_phot_dev) + + ## Assert + np.testing.assert_allclose(aux, aux_dev.get(), atol=self.atol, rtol=self.rtol, + err_msg="Auxiliary has not been updated as expected") + np.testing.assert_allclose(BGDK.npy.LLerr, GDK.gpu.LLerr.get(), atol=self.atol, rtol=self.rtol, + err_msg="LogLikelihood error has not been updated as expected") + np.testing.assert_allclose(err_phot, err_phot_dev.get(), atol=self.atol, rtol=self.rtol, + err_msg="`err_phot` has not been updated as expected") + + @parameterized.expand([ + ["base", 10], + ["regul", 50], + ["floating", 0], + ]) + def test_make_a012_UNITY(self, name, iter): + + # Reduce the array size to make the tests run faster + Nmax = 10 + Ymax = 128 + Xmax = 128 + + # Load data + with h5py.File(self.datadir %name + "make_a012_%04d.h5" %iter, "r") as g: + addr = g["addr"][:Nmax] + I = g["I"][:Nmax,:Ymax,:Xmax] + f = g["f"][:Nmax,:Ymax,:Xmax] + a = g["a"][:Nmax,:Ymax,:Xmax] + b = g["b"][:Nmax,:Ymax,:Xmax] + fic = g["fic"][:Nmax] + with h5py.File(self.datadir %name + "make_model_%04d.h5" %iter, "r") as h: + aux = h["aux"][:Nmax,:Ymax,:Xmax] + + # Copy data to device + aux_dev = gpuarray.to_gpu(aux) + addr_dev = gpuarray.to_gpu(addr) + I_dev = gpuarray.to_gpu(I) + f_dev = gpuarray.to_gpu(f) + a_dev = gpuarray.to_gpu(a) + b_dev = gpuarray.to_gpu(b) + fic_dev = gpuarray.to_gpu(fic) + + # CPU Kernel + BGDK = BaseGradientDescentKernel(aux, addr.shape[1]) + BGDK.allocate() + BGDK.make_a012(f, a, b, addr, I, fic) + + # GPU kernel + GDK = GradientDescentKernel(aux_dev, addr.shape[1], queue=self.stream) + GDK.allocate() + GDK.gpu.Imodel.fill(np.nan) + GDK.gpu.LLerr.fill(np.nan) + GDK.gpu.LLden.fill(np.nan) + GDK.make_a012(f_dev, a_dev, b_dev, addr_dev, I_dev, fic_dev) + + ## Assert + np.testing.assert_allclose(GDK.gpu.Imodel.get(), BGDK.npy.Imodel, atol=self.atol, rtol=self.rtol, + err_msg="Imodel error has not been updated as expected") + np.testing.assert_allclose(GDK.gpu.LLerr.get(), BGDK.npy.LLerr, atol=self.atol, rtol=self.rtol, + err_msg="LLerr error has not been updated as expected") + np.testing.assert_allclose(GDK.gpu.LLden.get(), BGDK.npy.LLden, atol=self.atol, rtol=self.rtol, + err_msg="LLden error has not been updated as expected") + + @parameterized.expand([ + ["base", 10], + ["regul", 50], + ["floating", 0], + ]) + def test_fill_b_UNITY(self, name, iter): + + Nmax = 10 + Ymax = 128 + Xmax = 128 + + # Load data + with h5py.File(self.datadir %name + "fill_b_%04d.h5" %iter, "r") as f: + w = f["w"][:Nmax, :Ymax, :Xmax] + addr = f["addr"][:] + B = f["B"][:] + Brenorm = f["Brenorm"][...] 
+ A0 = f["A0"][:Nmax, :Ymax, :Xmax] + A1 = f["A1"][:Nmax, :Ymax, :Xmax] + A2 = f["A2"][:Nmax, :Ymax, :Xmax] + with h5py.File(self.datadir %name + "make_model_%04d.h5" %iter, "r") as f: + aux = f["aux"][:Nmax, :Ymax, :Xmax] + + # Copy data to device + aux_dev = gpuarray.to_gpu(aux) + w_dev = gpuarray.to_gpu(w) + addr_dev = gpuarray.to_gpu(addr) + B_dev = gpuarray.to_gpu(B.astype(np.float32)) + A0_dev = gpuarray.to_gpu(A0) + A1_dev = gpuarray.to_gpu(A1) + A2_dev = gpuarray.to_gpu(A2) + + # CPU Kernel + BGDK = BaseGradientDescentKernel(aux, addr.shape[1]) + BGDK.allocate() + BGDK.npy.Imodel = A0 + BGDK.npy.LLerr = A1 + BGDK.npy.LLden = A2 + BGDK.fill_b(addr, Brenorm, w, B) + + # GPU kernel + GDK = GradientDescentKernel(aux_dev, addr.shape[1]) + GDK.allocate() + GDK.gpu.Imodel = A0_dev + GDK.gpu.LLerr = A1_dev + GDK.gpu.LLden = A2_dev + GDK.fill_b(addr_dev, Brenorm, w_dev, B_dev) + + ## Assert + np.testing.assert_allclose(B, B_dev.get(), rtol=self.rtol, atol=self.atol, + err_msg="`B` has not been updated as expected") + diff --git a/test/accelerate_tests/cuda_pycuda_tests/dls_tests/dls_po_update_kernel_test.py b/test/accelerate_tests/cuda_pycuda_tests/dls_tests/dls_po_update_kernel_test.py new file mode 100644 index 000000000..3b8ee0474 --- /dev/null +++ b/test/accelerate_tests/cuda_pycuda_tests/dls_tests/dls_po_update_kernel_test.py @@ -0,0 +1,106 @@ +''' +Testing on real data +''' + +import h5py +import unittest +import numpy as np +from parameterized import parameterized +from .. import PyCudaTest, have_pycuda + +if have_pycuda(): + from pycuda import gpuarray + from ptypy.accelerate.cuda_pycuda.kernels import PoUpdateKernel +from ptypy.accelerate.base.kernels import PoUpdateKernel as BasePoUpdateKernel + +COMPLEX_TYPE = np.complex64 +FLOAT_TYPE = np.float32 +INT_TYPE = np.int32 + +class DlsPoUpdateKernelTest(PyCudaTest): + + datadir = "/dls/science/users/iat69393/gpu-hackathon/test-data-%s/" + rtol = 1e-6 + atol = 1e-6 + + @parameterized.expand([ + ["base", 10, False], + ["regul", 50, False], + ["floating", 0, False], + ["base", 10, True], + ["regul", 50, True], + ["floating", 0, True], + ]) + def test_op_update_ml_UNITY(self, name, iter, atomics): + + # Load data + with h5py.File(self.datadir %name + "op_update_ml_%04d.h5" %iter, "r") as f: + aux = f["aux"][:] + addr = f["addr"][:] + obg = f["obg"][:] + pr = f["pr"][:] + + # Copy data to device + aux_dev = gpuarray.to_gpu(aux) + obg_dev = gpuarray.to_gpu(obg) + pr_dev = gpuarray.to_gpu(pr) + + # If not using atomics we need to change the addresses + if not atomics: + addr2 = np.ascontiguousarray(np.transpose(addr, (2, 3, 0, 1))) + addr_dev = gpuarray.to_gpu(addr2) + else: + addr_dev = gpuarray.to_gpu(addr) + + # CPU Kernel + BPOK = BasePoUpdateKernel() + BPOK.ob_update_ML(addr, obg, pr, aux) + + # GPU Kernel + POK = PoUpdateKernel() + POK.ob_update_ML(addr_dev, obg_dev, pr_dev, aux_dev, atomics=atomics) + + ## Assert + np.testing.assert_allclose(obg_dev.get(), obg, atol=self.atol, rtol=self.rtol, verbose=False, + err_msg="The object array has not been updated as expected") + + @parameterized.expand([ + ["base", 10, False], + ["regul", 50, False], + ["floating", 0, False], + ["base", 10, True], + ["regul", 50, True], + ["floating", 0, True], + ]) + def test_pr_update_ml_UNITY(self, name, iter, atomics): + + # Load data + with h5py.File(self.datadir %name + "pr_update_ml_%04d.h5" %iter, "r") as f: + aux = f["aux"][:] + addr = f["addr"][:] + ob = f["ob"][:] + prg = f["prg"][:] + + # Copy data to device + aux_dev = gpuarray.to_gpu(aux) + 
ob_dev = gpuarray.to_gpu(ob) + prg_dev = gpuarray.to_gpu(prg) + + # If not using atomics we need to change the addresses + if not atomics: + addr2 = np.ascontiguousarray(np.transpose(addr, (2, 3, 0, 1))) + addr_dev = gpuarray.to_gpu(addr2) + else: + addr_dev = gpuarray.to_gpu(addr) + + # CPU Kernel + BPOK = BasePoUpdateKernel() + BPOK.pr_update_ML(addr, prg, ob, aux) + + # GPU Kernel + POK = PoUpdateKernel() + POK.pr_update_ML(addr_dev, prg_dev, ob_dev, aux_dev, atomics=atomics) + + ## Assert + np.testing.assert_allclose(prg, prg_dev.get(), atol=self.atol, rtol=self.rtol, verbose=False, + err_msg="The probe array has not been updated as expected") \ No newline at end of file diff --git a/test/accelerate_tests/cuda_pycuda_tests/dls_tests/dls_propagation_kernel_test.py b/test/accelerate_tests/cuda_pycuda_tests/dls_tests/dls_propagation_kernel_test.py new file mode 100644 index 000000000..ac9fa0402 --- /dev/null +++ b/test/accelerate_tests/cuda_pycuda_tests/dls_tests/dls_propagation_kernel_test.py @@ -0,0 +1,102 @@ +''' +testing on real data +''' + +import h5py +import unittest +import numpy as np +from parameterized import parameterized +from .. import PyCudaTest, have_pycuda + +if have_pycuda(): + from pycuda import gpuarray + from ptypy.accelerate.cuda_pycuda.kernels import PropagationKernel + +import ptypy.utils as u +from ptypy.core import geometry +from ptypy.core import Base as theBase + +# subclass for dictionary access +Base = type('Base',(theBase,),{}) + +COMPLEX_TYPE = np.complex64 +FLOAT_TYPE = np.float32 +INT_TYPE = np.int32 + +class DLsPropagationKernelTest(PyCudaTest): + + datadir = "/dls/science/users/iat69393/gpu-hackathon/test-data-%s/" + rtol = 1e-6 + atol = 1e-6 + + def set_up_farfield(self,shape): + P = Base() + P.CType = COMPLEX_TYPE + P.Ftype = FLOAT_TYPE + g = u.Param() + g.energy = None # u.keV2m(1.0)/6.32e-7 + g.lam = 5.32e-7 + g.distance = 15e-2 + g.psize = 24e-6 + g.shape = shape + g.propagation = "farfield" + G = geometry.Geo(owner=P, pars=g) + return G + + @parameterized.expand([ + ["base", 10], + ["regul", 50], + ["floating", 0], + ]) + def test_forward_UNITY(self, name, iter): + + # Load data + with h5py.File(self.datadir % name + "forward_%04d.h5" %iter, "r") as f: + aux = f["aux"][0] + + # Copy data to device + aux_dev = gpuarray.to_gpu(aux) + + # Geometry + geo = self.set_up_farfield(aux.shape) + + # CPU kernel + aux = geo.propagator.fw(aux) + + # GPU kernel + PropK = PropagationKernel(aux_dev, geo.propagator, queue_thread=self.stream) + PropK.allocate() + PropK.fw(aux_dev, aux_dev) + + ## Assert + np.testing.assert_allclose(aux, aux_dev.get(), atol=self.atol, rtol=self.rtol, + err_msg="Forward propagation was not as expected") + + @parameterized.expand([ + ["base", 10], + ["regul", 50], + ["floating", 0], + ]) + def test_backward_UNITY(self, name, iter): + + # Load data + with h5py.File(self.datadir % name + "backward_%04d.h5" %iter, "r") as f: + aux = f["aux"][0] + + # Copy data to device + aux_dev = gpuarray.to_gpu(aux) + + # Geometry + geo = self.set_up_farfield(aux.shape) + + # CPU kernel + aux = geo.propagator.bw(aux) + + # GPU kernel + PropK = PropagationKernel(aux_dev, geo.propagator, queue_thread=self.stream) + PropK.allocate() + PropK.bw(aux_dev, aux_dev) + + ## Assert + np.testing.assert_allclose(aux, aux_dev.get(), atol=self.atol, rtol=self.rtol, + err_msg="Backward propagation was not as expected") \ No newline at end of file diff --git a/test/accelerate_tests/cuda_pycuda_tests/dls_tests/dls_regularizer_kernel_test.py 
b/test/accelerate_tests/cuda_pycuda_tests/dls_tests/dls_regularizer_kernel_test.py new file mode 100644 index 000000000..972648552 --- /dev/null +++ b/test/accelerate_tests/cuda_pycuda_tests/dls_tests/dls_regularizer_kernel_test.py @@ -0,0 +1,77 @@ +''' +Testing on real data +''' + +import h5py +import unittest +import numpy as np +from parameterized import parameterized +from .. import PyCudaTest, have_pycuda + +if have_pycuda(): + from pycuda import gpuarray + from ptypy.accelerate.cuda_pycuda.engines.ML_pycuda import Regul_del2_pycuda + import pycuda.driver as cuda +from ptypy.engines.ML import Regul_del2 + +COMPLEX_TYPE = np.complex64 +FLOAT_TYPE = np.float32 +INT_TYPE = np.int32 + +class DlsRegularizerTest(PyCudaTest): + + datadir = "/dls/science/users/iat69393/gpu-hackathon/test-data-%s/" + rtol = 1e-6 + atol = 1e-6 + + @parameterized.expand([ + ["regul", 50] + ]) + def test_regularizer_grad_UNITY(self, name, iter): + + # Load data + with h5py.File(self.datadir %name + "regul_grad_%04d.h5" %iter, "r") as f: + ob = f["ob"][:] + + # Copy data to device + ob_dev = gpuarray.to_gpu(ob) + + # CPU Kernel + regul = Regul_del2(0.1) + obr = regul.grad(ob) + + # GPU Kernel + regul_pycuda = Regul_del2_pycuda(0.1, queue=self.stream, allocator=cuda.mem_alloc) + obr_dev = regul_pycuda.grad(ob_dev) + + ## Assert + np.testing.assert_allclose(obr, obr_dev.get(), atol=self.atol, rtol=self.rtol, + err_msg="The object gradient has not been computed as expected") + np.testing.assert_allclose(regul.LL, regul_pycuda.LL, atol=self.atol, rtol=self.rtol, + err_msg="The LL array has not been updated as expected") + + @parameterized.expand([ + ["regul", 50], + ]) + def test_regularizer_poly_line_coeffs_UNITY(self, name, iter): + + # Load data + with h5py.File(self.datadir % name + "regul_poly_line_coeffs_%04d.h5" %iter, "r") as f: + ob = f["ob"][:] + obh = f["obh"][:] + + # Copy data to device + ob_dev = gpuarray.to_gpu(ob) + obh_dev = gpuarray.to_gpu(obh) + + # CPU Kernel + regul = Regul_del2(0.1) + res = regul.poly_line_coeffs(obh, ob) + + # GPU Kernel + regul_pycuda = Regul_del2_pycuda(0.1, queue=self.stream, allocator=cuda.mem_alloc) + res_pycuda = regul_pycuda.poly_line_coeffs(obh_dev, ob_dev) + + ## Assert + np.testing.assert_allclose(res, res_pycuda, atol=self.atol, rtol=self.rtol, + err_msg="The poly line coefficients have not been computed as expected") diff --git a/test/accelerate_tests/cuda_pycuda_tests/fft_accuracy_test.py b/test/accelerate_tests/cuda_pycuda_tests/fft_accuracy_test.py deleted file mode 100644 index ed6929865..000000000 --- a/test/accelerate_tests/cuda_pycuda_tests/fft_accuracy_test.py +++ /dev/null @@ -1,48 +0,0 @@ -''' -''' - -import unittest -import numpy as np -import scipy.fft as fft -from .
import PyCudaTest, have_pycuda - - -if have_pycuda(): - from pycuda import gpuarray - from ptypy.accelerate.cuda_pycuda.fft import FFT as ReiknaFFT - from ptypy.accelerate.cuda_pycuda.cufft import FFT_cuda as cuFFT - -class FftAccurracyTest(PyCudaTest): - - def gen_input(self): - rows = cols = 32 - batches = 1 - f = np.random.randn(batches, rows, cols) + 1j * np.random.randn(batches,rows, cols) - f = np.ascontiguousarray(f.astype(np.complex64)) - return f - - def test_random_cufft_fwd(self): - f = self.gen_input() - cuft = cuFFT(f, self.stream, inplace=True, pre_fft=None, post_fft=None, symmetric=None, forward=True).ft - reikft = ReiknaFFT(f, self.stream, inplace=True, pre_fft=None, post_fft=None, symmetric=False).ft - for i in range(10): - f = self.gen_input() - y = fft.fft2(f) - - x_d = gpuarray.to_gpu(f) - cuft(x_d, x_d) - y_cufft = x_d.get().reshape(y.shape) - - x_d = gpuarray.to_gpu(f) - reikft(x_d, x_d) - y_reikna = x_d.get().reshape(y.shape) - - # cufft_diff = np.max(np.abs(y_cufft - y)) - # reikna_diff = np.max(np.abs(y_reikna-y)) - # cufft_rdiff = np.max(np.abs(y_cufft - y) / np.abs(y)) - # reikna_rdiff = np.max(np.abs(y_reikna - y) / np.abs(y)) - # print('{}: {}\t{}\t{}\t{}'.format(i, cufft_diff, reikna_diff, cufft_rdiff, reikna_rdiff)) - - # Note: check if this tolerance and test case is ok - np.testing.assert_allclose(y, y_cufft, rtol=5e-5, err_msg='cuFFT error at index {}'.format(i)) - np.testing.assert_allclose(y, y_reikna, rtol=5e-5, err_msg='reikna FFT error at index {}'.format(i)) diff --git a/test/accelerate_tests/cuda_pycuda_tests/fft_tests/cufft_init_test.py b/test/accelerate_tests/cuda_pycuda_tests/fft_tests/cufft_init_test.py new file mode 100644 index 000000000..ac28436b4 --- /dev/null +++ b/test/accelerate_tests/cuda_pycuda_tests/fft_tests/cufft_init_test.py @@ -0,0 +1,28 @@ + +import unittest +from test.accelerate_tests.cuda_pycuda_tests import PyCudaTest, have_pycuda + +if have_pycuda(): + from ptypy.filtered_cufft import FilteredFFT + +class CuFFTInitTest(PyCudaTest): + + def test_import_fft(self): + ft = FilteredFFT(2, 32, 32, False, True, 0, 0, 0) + + + def test_import_fft_different_shape(self): + ft = FilteredFFT(2, 128, 128, False, True, 0, 0, 0) + + + @unittest.expectedFailure + def test_import_fft_not_square(self): + ft = FilteredFFT(2, 32, 64, False, True, 0, 0, 0) + + @unittest.expectedFailure + def test_import_fft_not_pow2(self): + ft = FilteredFFT(2, 40, 40, False, True, 0, 0, 0) + + +if __name__=="__main__": + unittest.main() diff --git a/test/accelerate_tests/cuda_pycuda_tests/fft_tests/fft_accuracy_test.py b/test/accelerate_tests/cuda_pycuda_tests/fft_tests/fft_accuracy_test.py index 9c87e34f2..7c30c3221 100644 --- a/test/accelerate_tests/cuda_pycuda_tests/fft_tests/fft_accuracy_test.py +++ b/test/accelerate_tests/cuda_pycuda_tests/fft_tests/fft_accuracy_test.py @@ -44,5 +44,5 @@ def test_random_cufft_fwd(self): # print('{}: {}\t{}\t{}\t{}'.format(i, cufft_diff, reikna_diff, cufft_rdiff, reikna_rdiff)) # Note: check if this tolerance and test case is ok - np.testing.assert_allclose(y, y_cufft, rtol=5e-5, err_msg='cuFFT error at index {}'.format(i)) - np.testing.assert_allclose(y, y_reikna, rtol=5e-5, err_msg='reikna FFT error at index {}'.format(i)) + np.testing.assert_allclose(y, y_cufft, atol=1e-6, rtol=5e-5, err_msg='cuFFT error at index {}'.format(i)) + np.testing.assert_allclose(y, y_reikna, atol=1e-6, rtol=5e-5, err_msg='reikna FFT error at index {}'.format(i)) diff --git 
a/test/accelerate_tests/cuda_pycuda_tests/fft_tests/fft_import_fft_test.py b/test/accelerate_tests/cuda_pycuda_tests/fft_tests/fft_import_fft_test.py deleted file mode 100644 index 7d60ce46a..000000000 --- a/test/accelerate_tests/cuda_pycuda_tests/fft_tests/fft_import_fft_test.py +++ /dev/null @@ -1,27 +0,0 @@ - -import unittest, pytest -from test.accelerate_tests.cuda_pycuda_tests import PyCudaTest, have_pycuda -import os, shutil -from distutils import sysconfig - -if have_pycuda(): - import pycuda.driver as cuda - from pycuda import gpuarray - from ptypy.accelerate.cuda_pycuda import import_fft - from pycuda.tools import make_default_context - -class ImportFFTTest(PyCudaTest): - - def test_import_fft(self): - import_fft.ImportFFT(32, 32) - - - def test_import_fft_different_shape(self): - import_fft.ImportFFT(128, 128) - - def test_import_fft_same_module_again(self): - import_fft.ImportFFT(32, 32) - - -if __name__=="__main__": - unittest.main() diff --git a/test/accelerate_tests/cuda_pycuda_tests/fourier_update_kernel_test.py b/test/accelerate_tests/cuda_pycuda_tests/fourier_update_kernel_test.py index dfea1e19b..3d7cb5fa6 100644 --- a/test/accelerate_tests/cuda_pycuda_tests/fourier_update_kernel_test.py +++ b/test/accelerate_tests/cuda_pycuda_tests/fourier_update_kernel_test.py @@ -109,11 +109,104 @@ def test_fmag_all_update_UNITY(self): nFUK.fmag_all_update(f, addr, fmag, mask, err_fmag, pbound=pbound_set) expected_f = f measured_f = f_d.get() - np.testing.assert_array_equal(expected_f, measured_f, err_msg="Numpy f " + np.testing.assert_allclose(expected_f, measured_f, rtol=1e-6, err_msg="Numpy f " "is \n%s, \nbut gpu f is \n %s, \n mask is:\n %s \n" % (repr(expected_f), repr(measured_f), repr(mask))) + def test_fmag_update_nopbound_UNITY(self): + ''' + setup + ''' + B = 5 # frame size y + C = 5 # frame size x + + D = 2 # number of probe modes + G = 2 # number of object modes + + E = B # probe size y + F = C # probe size x + + scan_pts = 2 # one dimensional scan point number + + N = scan_pts ** 2 + total_number_modes = G * D + A = N * total_number_modes # 16 exit waves in total: a 2x2 grid of scan positions with 4 modes each + + f = np.empty(shape=(A, B, C), dtype=COMPLEX_TYPE) + for idx in range(A): + f[idx] = np.ones((B, C)) * (idx + 1) + 1j * np.ones((B, C)) * (idx + 1) + + fmag = np.empty(shape=(N, B, C), dtype=FLOAT_TYPE) # the measured magnitudes, (N, B, C) + fmag_fill = np.arange(np.prod(fmag.shape)).reshape(fmag.shape).astype(fmag.dtype) + fmag[:] = fmag_fill + + mask = np.empty(shape=(N, B, C), dtype=FLOAT_TYPE) # the masks for the measured magnitudes, either (1, B, C) or (N, B, C) + mask_fill = np.ones_like(mask) + mask_fill[::2, ::2] = 0 # checkerboard for testing + mask[:] = mask_fill + + X, Y = np.meshgrid(range(scan_pts), range(scan_pts)) + X = X.reshape((N,)) + Y = Y.reshape((N,)) + + addr = np.zeros((N, total_number_modes, 5, 3), dtype=INT_TYPE) + + exit_idx = 0 + position_idx = 0 + for xpos, ypos in zip(X, Y): + mode_idx = 0 + for pr_mode in range(D): + for ob_mode in range(G): + addr[position_idx, mode_idx] = np.array([[pr_mode, 0, 0], + [ob_mode, ypos, xpos], + [exit_idx, 0, 0], + [position_idx, 0, 0], + [position_idx, 0, 0]]) + mode_idx += 1 + exit_idx += 1 + position_idx += 1 + + # print("address book is:") + # print(repr(addr)) + + ''' + test + ''' + mask_sum = mask.sum(-1).sum(-1) + + err_fmag = np.zeros(N, dtype=FLOAT_TYPE) + from ptypy.accelerate.base.kernels import FourierUpdateKernel as npFourierUpdateKernel + nFUK = npFourierUpdateKernel(f, nmodes=total_number_modes) + FUK =
FourierUpdateKernel(f, nmodes=total_number_modes) + + nFUK.allocate() + FUK.allocate() + + nFUK.fourier_error(f, addr, fmag, mask, mask_sum) + nFUK.error_reduce(addr, err_fmag) + # print(np.sqrt(pbound_set/err_fmag)) + f_d = gpuarray.to_gpu(f) + fmag_d = gpuarray.to_gpu(fmag) + mask_d = gpuarray.to_gpu(mask) + addr_d = gpuarray.to_gpu(addr) + + # now set the state for both. + + FUK.gpu.fdev = gpuarray.to_gpu(nFUK.npy.fdev) + FUK.gpu.ferr = gpuarray.to_gpu(nFUK.npy.ferr) + + FUK.fmag_update_nopbound(f_d, addr_d, fmag_d, mask_d) + nFUK.fmag_update_nopbound(f, addr, fmag, mask) + + expected_f = f + measured_f = f_d.get() + np.testing.assert_allclose(measured_f, expected_f, rtol=1e-6, err_msg="Numpy f " + "is \n%s, \nbut gpu f is \n %s, \n mask is:\n %s \n" % (repr(expected_f), + repr(measured_f), + repr(mask))) + + def test_fourier_error_UNITY(self): ''' setup @@ -191,7 +284,7 @@ def test_fourier_error_UNITY(self): expected_fdev = nFUK.npy.fdev measured_fdev = FUK.gpu.fdev.get() - np.testing.assert_array_equal(expected_fdev, measured_fdev, err_msg="Numpy fdev " + np.testing.assert_allclose(expected_fdev, measured_fdev, rtol=1e-6, err_msg="Numpy fdev " "is \n%s, \nbut gpu fdev is \n %s, \n " % ( repr(expected_fdev), repr(measured_fdev))) @@ -203,6 +296,87 @@ def test_fourier_error_UNITY(self): "is \n%s, \nbut gpu ferr is \n %s, \n " % ( repr(expected_ferr), repr(measured_ferr))) + def test_fourier_deviation_UNITY(self): + ''' + setup - using the fourier_error as reference, so we need mask, etc. + ''' + B = 5 # frame size y + C = 5 # frame size x + + D = 2 # number of probe modes + G = 2 # number of object modes + + E = B # probe size y + F = C # probe size x + + scan_pts = 2 # one dimensional scan point number + + N = scan_pts ** 2 + total_number_modes = G * D + A = N * total_number_modes # 16 exit waves in total: a 2x2 grid of scan positions with 4 modes each + + f = np.empty(shape=(A, B, C), dtype=COMPLEX_TYPE) + for idx in range(A): + f[idx] = np.ones((B, C)) * (idx + 1) + 1j * np.ones((B, C)) * (idx + 1) + + fmag = np.empty(shape=(N, B, C), dtype=FLOAT_TYPE) # the measured magnitudes, (N, B, C) + fmag_fill = np.arange(np.prod(fmag.shape)).reshape(fmag.shape).astype(fmag.dtype) + fmag[:] = fmag_fill + + mask = np.empty(shape=(N, B, C), + dtype=FLOAT_TYPE) # the masks for the measured magnitudes, either (1, B, C) or (N, B, C) + mask_fill = np.ones_like(mask) + mask_fill[::2, ::2] = 0 # checkerboard for testing + mask[:] = mask_fill + + X, Y = np.meshgrid(range(scan_pts), range(scan_pts)) + X = X.reshape((N,)) + Y = Y.reshape((N,)) + + addr = np.zeros((N, total_number_modes, 5, 3), dtype=INT_TYPE) + + exit_idx = 0 + position_idx = 0 + for xpos, ypos in zip(X, Y): + mode_idx = 0 + for pr_mode in range(D): + for ob_mode in range(G): + addr[position_idx, mode_idx] = np.array([[pr_mode, 0, 0], + [ob_mode, ypos, xpos], + [exit_idx, 0, 0], + [position_idx, 0, 0], + [position_idx, 0, 0]]) + mode_idx += 1 + exit_idx += 1 + position_idx += 1 + + ''' + test + ''' + mask_sum = mask.sum(-1).sum(-1) + + from ptypy.accelerate.base.kernels import FourierUpdateKernel as npFourierUpdateKernel + f_d = gpuarray.to_gpu(f) + fmag_d = gpuarray.to_gpu(fmag) + addr_d = gpuarray.to_gpu(addr) + + nFUK = npFourierUpdateKernel(f, nmodes=total_number_modes) + FUK = FourierUpdateKernel(f, nmodes=total_number_modes) + + nFUK.allocate() + FUK.allocate() + + nFUK.fourier_deviation(f, addr, fmag) + FUK.fourier_deviation(f_d, addr_d, fmag_d) + + expected_fdev = nFUK.npy.fdev + measured_fdev = FUK.gpu.fdev.get() + 
np.testing.assert_allclose(measured_fdev, expected_fdev, rtol=1e-6, err_msg="Numpy fdev " + "is \n%s, \nbut gpu fdev is \n %s, \n " % ( + repr(expected_fdev), + repr(measured_fdev))) + + def test_error_reduce_UNITY(self): ''' @@ -348,7 +522,7 @@ def test_error_reduce(self): "is not behaving as expected.") - def test_log_likelihood_UNITY(self): + def log_likelihood_UNITY_tester(self, use_version2=False): ''' setup ''' @@ -420,7 +594,10 @@ def test_log_likelihood_UNITY(self): FUK = FourierUpdateKernel(f, nmodes=total_number_modes) FUK.allocate() - FUK.log_likelihood(f_d, addr_d, fmag_d, mask_d, LLerr_d) + if use_version2: + FUK.log_likelihood2(f_d, addr_d, fmag_d, mask_d, LLerr_d) + else: + FUK.log_likelihood(f_d, addr_d, fmag_d, mask_d, LLerr_d) expected_err_phot = LLerr measured_err_phot = LLerr_d.get() @@ -429,6 +606,11 @@ def test_log_likelihood_UNITY(self): "is \n%s, \nbut gpu log-likelihood error is \n%s, \n " % ( repr(expected_err_phot), repr(measured_err_phot)), rtol=1e-5) + def test_log_likelihood_UNITY(self): + self.log_likelihood_UNITY_tester(False) + + def test_log_likelihood2_UNITY(self): + self.log_likelihood_UNITY_tester(True) def test_exit_error_UNITY(self): ''' diff --git a/test/accelerate_tests/cuda_pycuda_tests/multi_gpu_test.py b/test/accelerate_tests/cuda_pycuda_tests/multi_gpu_test.py new file mode 100644 index 000000000..64cc5110d --- /dev/null +++ b/test/accelerate_tests/cuda_pycuda_tests/multi_gpu_test.py @@ -0,0 +1,84 @@ +''' +''' + +import unittest +from mpi4py.MPI import Get_version +import numpy as np +from . import PyCudaTest, have_pycuda + +if have_pycuda(): + from pycuda import gpuarray + import pycuda.driver as cuda + from ptypy.accelerate.cuda_pycuda import multi_gpu as mgpu + from ptypy.utils import parallel + +from pkg_resources import parse_version + +class GpuDataTest(PyCudaTest): + """ + This is a test class for MPI - to really check if it all works, it needs + to be run as: + + mpirun -np 2 pytest multi_gpu_test.py + + For CUDA-aware MPI testing, currently the environment variable + + OMPI_MCA_opal_cuda_support=true + + needs to be set, mpi4py version 3.1.0+ used, a pycuda build from master, + and a cuda-aware MPI version. 
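+ + Putting these together, a typical invocation (illustrative; adjust -np + to the number of locally available GPUs) would be: + + OMPI_MCA_opal_cuda_support=true mpirun -np 2 pytest multi_gpu_test.py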
+ """ + + def setUp(self): + if parallel.rank_local < cuda.Device.count(): + self.device = cuda.Device(parallel.rank_local) + self.ctx = self.device.make_context() + self.ctx.push() + else: + self.ctx = None + + def tearDown(self): + if self.ctx is not None: + self.ctx.pop() + self.ctx.detach() + + @unittest.skipIf(parallel.rank != 0, "Only in MPI rank 0") + def test_version(self): + v1 = parse_version("3.1.0") + v2 = parse_version(parse_version("3.1.0a").base_version) + + self.assertGreaterEqual(v2, v1) + + def test_compute_mode(self): + attr = cuda.Context.get_device().get_attributes() + self.assertIn(cuda.device_attribute.COMPUTE_MODE, attr) + mode = attr[cuda.device_attribute.COMPUTE_MODE] + self.assertIn(mode, + [cuda.compute_mode.DEFAULT, cuda.compute_mode.PROHIBITED, cuda.compute_mode.EXCLUSIVE_PROCESS] + ) + + def multigpu_tester(self, com): + if self.ctx is None: + return + + data = np.ones((2, 1), dtype=np.float32) + data_dev = gpuarray.to_gpu(data) + sz = parallel.size + com.allReduceSum(data_dev) + + out = data_dev.get() + np.testing.assert_allclose(out, sz * data, rtol=1e-6) + + def test_multigpu_auto(self): + self.multigpu_tester(mgpu.get_multi_gpu_communicator()) + + def test_multigpu_mpi(self): + self.multigpu_tester(mgpu.MultiGpuCommunicatorMpi()) + + @unittest.skipIf(not mgpu.have_cuda_mpi, "Cuda-aware MPI not available") + def test_multigpu_cudampi(self): + self.multigpu_tester(mgpu.MultiGpuCommunicatorCudaMpi()) + + @unittest.skipIf(not mgpu.have_nccl, "NCCL not available") + def test_multigpu_nccl(self): + self.multigpu_tester(mgpu.MultiGpuCommunicatorNccl()) \ No newline at end of file diff --git a/test/accelerate_tests/cuda_pycuda_tests/po_update_kernel_test.py b/test/accelerate_tests/cuda_pycuda_tests/po_update_kernel_test.py index 81674d610..d626c0ca2 100644 --- a/test/accelerate_tests/cuda_pycuda_tests/po_update_kernel_test.py +++ b/test/accelerate_tests/cuda_pycuda_tests/po_update_kernel_test.py @@ -6,6 +6,8 @@ import unittest import numpy as np from . 
+
+        out = data_dev.get()
+        np.testing.assert_allclose(out, sz * data, rtol=1e-6)
+
+    def test_multigpu_auto(self):
+        self.multigpu_tester(mgpu.get_multi_gpu_communicator())
+
+    def test_multigpu_mpi(self):
+        self.multigpu_tester(mgpu.MultiGpuCommunicatorMpi())
+
+    @unittest.skipIf(not mgpu.have_cuda_mpi, "Cuda-aware MPI not available")
+    def test_multigpu_cudampi(self):
+        self.multigpu_tester(mgpu.MultiGpuCommunicatorCudaMpi())
+
+    @unittest.skipIf(not mgpu.have_nccl, "NCCL not available")
+    def test_multigpu_nccl(self):
+        self.multigpu_tester(mgpu.MultiGpuCommunicatorNccl())
\ No newline at end of file
diff --git a/test/accelerate_tests/cuda_pycuda_tests/po_update_kernel_test.py b/test/accelerate_tests/cuda_pycuda_tests/po_update_kernel_test.py
index 81674d610..d626c0ca2 100644
--- a/test/accelerate_tests/cuda_pycuda_tests/po_update_kernel_test.py
+++ b/test/accelerate_tests/cuda_pycuda_tests/po_update_kernel_test.py
@@ -6,6 +6,8 @@
 import unittest
 import numpy as np
 from . import PyCudaTest, have_pycuda
+from ptypy.accelerate.base.array_utils import max_abs2
+from parameterized import parameterized
 
 if have_pycuda():
     from pycuda import gpuarray
@@ -18,7 +20,7 @@ class PoUpdateKernelTest(PyCudaTest):
 
-    def prepare_arrays(self):
+    def prepare_arrays(self, scan_points=None):
         B = 5  # frame size y
         C = 5  # frame size x
@@ -31,7 +33,10 @@ def prepare_arrays(self):
         H = B + npts_greater_than  # object size y
         I = C + npts_greater_than  # object size x
 
-        scan_pts = 2  # one dimensional scan point number
+        if scan_points is None:
+            scan_pts = 2  # one dimensional scan point number
+        else:
+            scan_pts = scan_points
 
         total_number_scan_positions = scan_pts ** 2
         total_number_modes = G * D
@@ -72,11 +77,11 @@ def prepare_arrays(self):
         object_array_denominator = np.empty_like(object_array, dtype=FLOAT_TYPE)
         for idx in range(G):
-            object_array_denominator[idx] = np.ones((H, I)) * (5 * idx + 2)  # + 1j * np.ones((H, I)) * (5 * idx + 2)
+            object_array_denominator[idx] = np.ones((H, I)) * (5 * idx + 2)
 
         probe_denominator = np.empty_like(probe, dtype=FLOAT_TYPE)
         for idx in range(D):
-            probe_denominator[idx] = np.ones((E, F)) * (5 * idx + 2)  # + 1j * np.ones((E, F)) * (5 * idx + 2)
+            probe_denominator[idx] = np.ones((E, F)) * (5 * idx + 2)
 
         return (gpuarray.to_gpu(addr),
                 gpuarray.to_gpu(object_array),
@@ -87,17 +92,12 @@
 
     def test_init(self):
         POUK = PoUpdateKernel()
-
-        np.testing.assert_equal(POUK.kernels,
-                                ['pr_update', 'ob_update'],
+        np.testing.assert_equal(POUK.kernels, ['pr_update', 'ob_update'],
                                 err_msg='PoUpdateKernel does not have the correct functions registered.')
 
     def ob_update_REGRESSION_tester(self, atomics=True):
-        '''
-        setup
-        '''
+
         B = 5  # frame size y
         C = 5  # frame size x
@@ -149,14 +149,13 @@ def ob_update_REGRESSION_tester(self, atomics=True):
             mode_idx += 1
             exit_idx += 1
         position_idx += 1
-
         '''
         test
         '''
-        object_array_denominator = np.empty_like(object_array)
+        object_array_denominator = np.empty_like(object_array, dtype=FLOAT_TYPE)
         for idx in range(G):
-            object_array_denominator[idx] = np.ones((H, I)) * (5 * idx + 2) + 1j * np.ones((H, I)) * (5 * idx + 2)
+            object_array_denominator[idx] = np.ones((H, I)) * (5 * idx + 2)
 
         POUK = PoUpdateKernel()
@@ -204,22 +203,22 @@ def ob_update_REGRESSION_tester(self, atomics=True):
         np.testing.assert_array_equal(object_array, expected_object_array,
                                       err_msg="The object array has not been updated as expected")
 
-        expected_object_array_denominator = np.array([[[12.+2.j, 22.+2.j, 22.+2.j, 22.+2.j, 22.+2.j, 12.+2.j, 2.+2.j],
-                                                       [22.+2.j, 42.+2.j, 42.+2.j, 42.+2.j, 42.+2.j, 22.+2.j, 2.+2.j],
-                                                       [22.+2.j, 42.+2.j, 42.+2.j, 42.+2.j, 42.+2.j, 22.+2.j, 2.+2.j],
-                                                       [22.+2.j, 42.+2.j, 42.+2.j, 42.+2.j, 42.+2.j, 22.+2.j, 2.+2.j],
-                                                       [22.+2.j, 42.+2.j, 42.+2.j, 42.+2.j, 42.+2.j, 22.+2.j, 2.+2.j],
-                                                       [12.+2.j, 22.+2.j, 22.+2.j, 22.+2.j, 22.+2.j, 12.+2.j, 2.+2.j],
-                                                       [ 2.+2.j, 2.+2.j, 2.+2.j, 2.+2.j, 2.+2.j, 2.+2.j, 2.+2.j]],
+        expected_object_array_denominator = np.array([[[12., 22., 22., 22., 22., 12., 2.],
+                                                       [22., 42., 42., 42., 42., 22., 2.],
+                                                       [22., 42., 42., 42., 42., 22., 2.],
+                                                       [22., 42., 42., 42., 42., 22., 2.],
+                                                       [22., 42., 42., 42., 42., 22., 2.],
+                                                       [12., 22., 22., 22., 22., 12., 2.],
+                                                       [ 2., 2., 2., 2., 2., 2., 2.]],
-                                                      [[17.+7.j, 27.+7.j, 27.+7.j, 27.+7.j, 27.+7.j, 17.+7.j, 7.+7.j],
-                                                       [27.+7.j, 47.+7.j, 47.+7.j, 47.+7.j, 47.+7.j, 27.+7.j, 7.+7.j],
-                                                       [27.+7.j, 47.+7.j, 47.+7.j, 47.+7.j, 47.+7.j, 27.+7.j, 7.+7.j],
-                                                       [27.+7.j, 47.+7.j, 47.+7.j, 47.+7.j, 47.+7.j, 27.+7.j, 7.+7.j],
-                                                       [27.+7.j, 47.+7.j, 47.+7.j, 47.+7.j, 47.+7.j, 27.+7.j, 7.+7.j],
-                                                       [17.+7.j, 27.+7.j, 27.+7.j, 27.+7.j, 27.+7.j, 17.+7.j, 7.+7.j],
-                                                       [ 7.+7.j, 7.+7.j, 7.+7.j, 7.+7.j, 7.+7.j, 7.+7.j, 7.+7.j]]],
-                                                      dtype=COMPLEX_TYPE)
+                                                      [[17., 27., 27., 27., 27., 17., 7.],
+                                                       [27., 47., 47., 47., 47., 27., 7.],
+                                                       [27., 47., 47., 47., 47., 27., 7.],
+                                                       [27., 47., 47., 47., 47., 27., 7.],
+                                                       [27., 47., 47., 47., 47., 27., 7.],
+                                                       [17., 27., 27., 27., 27., 17., 7.],
+                                                       [ 7., 7., 7., 7., 7., 7., 7.]]],
+                                                      dtype=FLOAT_TYPE)
 
         np.testing.assert_array_equal(object_array_denominator_dev.get(), expected_object_array_denominator,
@@ -291,9 +290,9 @@ def ob_update_UNITY_tester(self, atomics=True):
         '''
         test
         '''
-        object_array_denominator = np.empty_like(object_array)
+        object_array_denominator = np.empty_like(object_array, dtype=FLOAT_TYPE)
         for idx in range(G):
-            object_array_denominator[idx] = np.ones((H, I)) * (5 * idx + 2) + 1j * np.ones((H, I)) * (5 * idx + 2)
+            object_array_denominator[idx] = np.ones((H, I)) * (5 * idx + 2)
 
         POUK = PoUpdateKernel()
@@ -394,9 +393,9 @@ def pr_update_REGRESSION_tester(self, atomics=True):
         '''
         test
         '''
-        probe_denominator = np.empty_like(probe)
+        probe_denominator = np.empty_like(probe, dtype=FLOAT_TYPE)
         for idx in range(D):
-            probe_denominator[idx] = np.ones((E, F)) * (5 * idx + 2) + 1j * np.ones((E, F)) * (5 * idx + 2)
+            probe_denominator[idx] = np.ones((E, F)) * (5 * idx + 2)
 
         POUK = PoUpdateKernel()
@@ -438,18 +437,18 @@ def pr_update_REGRESSION_tester(self, atomics=True):
         np.testing.assert_array_equal(probe_dev.get(), expected_probe,
                                       err_msg="The probe has not been updated as expected")
 
-        expected_probe_denominator = np.array([[[138.+2.j, 138.+2.j, 138.+2.j, 138.+2.j, 138.+2.j],
-                                                [138.+2.j, 138.+2.j, 138.+2.j, 138.+2.j, 138.+2.j],
-                                                [138.+2.j, 138.+2.j, 138.+2.j, 138.+2.j, 138.+2.j],
-                                                [138.+2.j, 138.+2.j, 138.+2.j, 138.+2.j, 138.+2.j],
-                                                [138.+2.j, 138.+2.j, 138.+2.j, 138.+2.j, 138.+2.j]],
+        expected_probe_denominator = np.array([[[138., 138., 138., 138., 138.],
+                                                [138., 138., 138., 138., 138.],
+                                                [138., 138., 138., 138., 138.],
+                                                [138., 138., 138., 138., 138.],
+                                                [138., 138., 138., 138., 138.]],
-                                               [[143.+7.j, 143.+7.j, 143.+7.j, 143.+7.j, 143.+7.j],
-                                                [143.+7.j, 143.+7.j, 143.+7.j, 143.+7.j, 143.+7.j],
-                                                [143.+7.j, 143.+7.j, 143.+7.j, 143.+7.j, 143.+7.j],
-                                                [143.+7.j, 143.+7.j, 143.+7.j, 143.+7.j, 143.+7.j],
-                                                [143.+7.j, 143.+7.j, 143.+7.j, 143.+7.j, 143.+7.j]]],
-                                               dtype=COMPLEX_TYPE)
+                                               [[143., 143., 143., 143., 143.],
+                                                [143., 143., 143., 143., 143.],
+                                                [143., 143., 143., 143., 143.],
+                                                [143., 143., 143., 143., 143.],
+                                                [143., 143., 143., 143., 143.]]],
+                                               dtype=FLOAT_TYPE)
 
         np.testing.assert_array_equal(probe_denominator_dev.get(), expected_probe_denominator,
                                       err_msg="The probe denominator has not been updated as expected")
@@ -519,9 +518,9 @@ def pr_update_UNITY_tester(self, atomics=True):
         '''
         test
         '''
-        probe_denominator = np.empty_like(probe)
+        probe_denominator = np.empty_like(probe, dtype=FLOAT_TYPE)
         for idx in range(D):
-            probe_denominator[idx] = np.ones((E, F)) * (5 * idx + 2) + 1j * np.ones((E, F)) * (5 * idx + 2)
+            probe_denominator[idx] = np.ones((E, F)) * (5 * idx + 2)
 
         POUK = PoUpdateKernel()
         from ptypy.accelerate.base.kernels import PoUpdateKernel as npPoUpdateKernel
@@ -650,6 +649,158 @@ def test_ob_update_ML_atomics_REGRESSION(self):
     def test_ob_update_ML_tiled_REGRESSION(self):
         self.ob_update_ML_tester(False)
 
+    def test_ob_update_local_UNITY(self):
+        '''
+        setup
+        '''
+        B = 5  # frame size y
+        C = 5  # frame size x
+
+        D = 2  # number of probe modes
+        E = B  # probe size y
+        F = C  # probe size x
+
+        npts_greater_than = 2  # how many points bigger than the probe the object is.
+        G = 2  # number of object modes
+        H = B + npts_greater_than  # object size y
+        I = C + npts_greater_than  # object size x
+
+        scan_pts = 1  # one dimensional scan point number
+
+        total_number_scan_positions = scan_pts ** 2
+        total_number_modes = G * D
+        A = total_number_scan_positions * total_number_modes  # one scan position times the 4 modes: 4 address entries
+
+
+        probe = np.empty(shape=(D, E, F), dtype=COMPLEX_TYPE)
+        for idx in range(D):
+            probe[idx] = np.ones((E, F)) * (idx + 1) + 1j * np.ones((E, F)) * (idx + 1)
+
+        object_array = np.empty(shape=(G, H, I), dtype=COMPLEX_TYPE)
+        for idx in range(G):
+            object_array[idx] = np.ones((H, I)) * (3 * idx + 1) + 1j * np.ones((H, I)) * (3 * idx + 1)
+
+        exit_wave = np.empty(shape=(A, B, C), dtype=COMPLEX_TYPE)
+        for idx in range(A):
+            exit_wave[idx] = np.ones((B, C)) * (idx + 1) + 1j * np.ones((B, C)) * (idx + 1)
+        auxiliary_wave = exit_wave.copy() * 2
+
+        X, Y = np.meshgrid(range(scan_pts), range(scan_pts))
+        X = X.reshape((total_number_scan_positions))
+        Y = Y.reshape((total_number_scan_positions))
+
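+        # Each 5x3 address entry built below holds one (index, y, x) row per
+        # array the kernel touches: probe, object and exit wave. The last two
+        # rows appear unused by the local update kernels and are left as zeros.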
+        addr = np.zeros((total_number_scan_positions, total_number_modes, 5, 3), dtype=INT_TYPE)
+
+        exit_idx = 0
+        position_idx = 0
+        for xpos, ypos in zip(X, Y):
+            mode_idx = 0
+            for pr_mode in range(D):
+                for ob_mode in range(G):
+                    addr[position_idx, mode_idx] = np.array([[pr_mode, 0, 0],
+                                                             [ob_mode, ypos, xpos],
+                                                             [exit_idx, 0, 0],
+                                                             [0, 0, 0],
+                                                             [0, 0, 0]], dtype=INT_TYPE)
+                    mode_idx += 1
+                    exit_idx += 1
+            position_idx += 1
+
+        '''
+        test
+        '''
+        from ptypy.accelerate.base.kernels import PoUpdateKernel as npPoUpdateKernel
+        nPOUK = npPoUpdateKernel()
+        POUK = PoUpdateKernel(queue_thread=self.stream)
+
+        object_array_dev = gpuarray.to_gpu(object_array)
+        probe_dev = gpuarray.to_gpu(probe)
+        exit_wave_dev = gpuarray.to_gpu(exit_wave)
+        auxiliary_wave_dev = gpuarray.to_gpu(auxiliary_wave)
+        addr_dev = gpuarray.to_gpu(addr)
+
+        POUK.ob_update_local(addr_dev, object_array_dev, probe_dev, exit_wave_dev, auxiliary_wave_dev)
+        nPOUK.ob_update_local(addr, object_array, probe, exit_wave, auxiliary_wave)
+
+        np.testing.assert_allclose(object_array_dev.get(), object_array, rtol=1e-6, atol=1e-6,
+                                   err_msg="The object array has not been updated as expected")
+
+    def test_pr_update_local_UNITY(self):
+        '''
+        setup
+        '''
+        B = 5  # frame size y
+        C = 5  # frame size x
+
+        D = 2  # number of probe modes
+        E = B  # probe size y
+        F = C  # probe size x
+
+        npts_greater_than = 2  # how many points bigger than the probe the object is.
+        G = 2  # number of object modes
+        H = B + npts_greater_than  # object size y
+        I = C + npts_greater_than  # object size x
+
+        scan_pts = 1  # one dimensional scan point number
+
+        total_number_scan_positions = scan_pts ** 2
+        total_number_modes = G * D
+        A = total_number_scan_positions * total_number_modes  # one scan position times the 4 modes: 4 address entries
+
+
+        probe = np.empty(shape=(D, E, F), dtype=COMPLEX_TYPE)
+        for idx in range(D):
+            probe[idx] = np.ones((E, F)) * (idx + 1) + 1j * np.ones((E, F)) * (idx + 1)
+
+        object_array = np.empty(shape=(G, H, I), dtype=COMPLEX_TYPE)
+        for idx in range(G):
+            object_array[idx] = np.ones((H, I)) * (3 * idx + 1) + 1j * np.ones((H, I)) * (3 * idx + 1)
+
+        exit_wave = np.empty(shape=(A, B, C), dtype=COMPLEX_TYPE)
+        for idx in range(A):
+            exit_wave[idx] = np.ones((B, C)) * (idx + 1) + 1j * np.ones((B, C)) * (idx + 1)
+        auxiliary_wave = exit_wave.copy() * 1.5
+
+        X, Y = np.meshgrid(range(scan_pts), range(scan_pts))
+        X = X.reshape((total_number_scan_positions))
+        Y = Y.reshape((total_number_scan_positions))
+
+        addr = np.zeros((total_number_scan_positions, total_number_modes, 5, 3), dtype=INT_TYPE)
+
+        exit_idx = 0
+        position_idx = 0
+        for xpos, ypos in zip(X, Y):
+            mode_idx = 0
+            for pr_mode in range(D):
+                for ob_mode in range(G):
+                    addr[position_idx, mode_idx] = np.array([[pr_mode, 0, 0],
+                                                             [ob_mode, ypos, xpos],
+                                                             [exit_idx, 0, 0],
+                                                             [0, 0, 0],
+                                                             [0, 0, 0]], dtype=INT_TYPE)
+                    mode_idx += 1
+                    exit_idx += 1
+            position_idx += 1
+
+        '''
+        test
+        '''
+        from ptypy.accelerate.base.kernels import PoUpdateKernel as npPoUpdateKernel
+        nPOUK = npPoUpdateKernel()
+        POUK = PoUpdateKernel()
+
+        object_array_dev = gpuarray.to_gpu(object_array)
+        probe_dev = gpuarray.to_gpu(probe)
+        exit_wave_dev = gpuarray.to_gpu(exit_wave)
+        auxiliary_wave_dev = gpuarray.to_gpu(auxiliary_wave)
+        addr_dev = gpuarray.to_gpu(addr)
+
+        POUK.pr_update_local(addr_dev, probe_dev, object_array_dev, exit_wave_dev, auxiliary_wave_dev)
+        nPOUK.pr_update_local(addr, probe, object_array, exit_wave, auxiliary_wave)
+
+        np.testing.assert_allclose(probe_dev.get(), probe, rtol=1e-6, atol=1e-6,
+                                   err_msg="The probe has not been updated as expected")
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/test/accelerate_tests/cuda_pycuda_tests/position_correction_kernel_test.py b/test/accelerate_tests/cuda_pycuda_tests/position_correction_kernel_test.py
index a8deebdc6..7f36f138c 100644
--- a/test/accelerate_tests/cuda_pycuda_tests/position_correction_kernel_test.py
+++ b/test/accelerate_tests/cuda_pycuda_tests/position_correction_kernel_test.py
@@ -6,6 +6,7 @@
 import unittest
 import numpy as np
 from . import PyCudaTest, have_pycuda
+from ptypy import utils as u
 
 if have_pycuda():
     from pycuda import gpuarray
@@ -19,6 +20,17 @@
 
 class PositionCorrectionKernelTest(PyCudaTest):
 
+    def setUp(self):
+        PyCudaTest.setUp(self)
+        self.params = u.Param()
+        self.params.nshifts = 4
+        self.params.method = "Annealing"
+        self.params.amplitude = 2e-9
+        self.params.start = 0
+        self.params.stop = 10
+        self.params.max_shift = 2e-9
+        self.resolution = [1e-9,1e-9]
+
     def update_addr_and_error_state_UNITY_helper(self, size, modes):
         ## Arrange
         addr = np.ones((size, modes, 5, 3), dtype=np.int32)
@@ -33,9 +45,9 @@ def update_addr_and_error_state_UNITY_helper(self, size, modes):
         aux = np.ones((1,1,1), dtype=np.complex64)
 
         ## Act
-        PCK = PositionCorrectionKernel(aux, modes, queue_thread=self.stream)
+        PCK = PositionCorrectionKernel(aux, modes, self.params, self.resolution, queue_thread=self.stream)
         PCK.update_addr_and_error_state(addr_gpu, err_state_gpu, mangled_addr_gpu, err_sum_gpu)
-        abPCK = abPositionCorrectionKernel(aux, modes)
+        abPCK = abPositionCorrectionKernel(aux, modes, self.params, self.resolution)
         abPCK.update_addr_and_error_state(addr, err_state, mangled_addr, err_sum)
 
         ## Assert
diff --git a/test/accelerate_tests/cuda_pycuda_tests/propagation_kernel_test.py b/test/accelerate_tests/cuda_pycuda_tests/propagation_kernel_test.py
index 28f576b9e..794a547fd 100644
--- a/test/accelerate_tests/cuda_pycuda_tests/propagation_kernel_test.py
+++ b/test/accelerate_tests/cuda_pycuda_tests/propagation_kernel_test.py
@@ -23,7 +23,7 @@
 
 class PropagationKernelTest(PyCudaTest):
 
-    def set_up_farfield(self,shape):
+    def set_up_farfield(self,shape, resolution=None):
         P = Base()
         P.CType = COMPLEX_TYPE
         P.Ftype = FLOAT_TYPE
@@ -34,6 +34,8 @@ def set_up_farfield(self,shape):
         g.psize = 24e-6
         g.shape = shape
         g.propagation = "farfield"
+        if resolution is not None:
+            g.resolution = resolution
         G = geometry.Geo(owner=P, pars=g)
         return G
@@ -65,7 +67,8 @@ def test_farfield_propagator_forward_UNITY(self):
         PropK.allocate()
         PropK.fw(aux_d, aux_d)
 
-        np.testing.assert_allclose(aux, aux_d.get(), atol=1e-06, rtol=5e-5, err_msg="Numpy aux is \n%s, \nbut gpu aux is \n %s, \n " % (repr(aux), repr(aux_d.get())))
+        np.testing.assert_allclose(aux_d.get(), aux, atol=1e-06, rtol=5e-5,
+            err_msg="Numpy aux is \n%s, \nbut gpu aux is \n %s, \n " % (repr(aux), repr(aux_d.get())))
 
     def test_farfield_propagator_backward_UNITY(self):
         # setup
@@ -81,7 +84,44 @@ def test_farfield_propagator_backward_UNITY(self):
         PropK.allocate()
         PropK.bw(aux_d, aux_d)
 
-        np.testing.assert_allclose(aux, aux_d.get(), atol=1e-06, rtol=5e-5, err_msg="Numpy aux is \n%s, \nbut gpu aux is \n %s, \n " % (repr(aux), repr(aux_d.get())))
+        np.testing.assert_allclose(aux_d.get(), aux, atol=1e-06, rtol=5e-5,
+            err_msg="Numpy aux is \n%s, \nbut gpu aux is \n %s, \n " % (repr(aux), repr(aux_d.get())))
+
+    def test_farfield_propagator_forward_crop_pad_UNITY(self):
+        # setup
+        SH = (16,16)
+        aux = np.zeros((SH), dtype=COMPLEX_TYPE)
+        aux[5:11,5:11] = 1. + 2j
+        aux_d = gpuarray.to_gpu(aux)
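+        # build a default geometry first just to obtain its resolution, then
+        # request half of it so the propagator has to crop/pad, as the test
+        # name suggests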
+        geo = self.set_up_farfield(SH)
+        geo = self.set_up_farfield(SH, resolution=0.5*geo.resolution)
+
+        # test
+        aux = geo.propagator.fw(aux)
+        PropK = PropagationKernel(aux_d, geo.propagator, queue_thread=self.stream)
+        PropK.allocate()
+        PropK.fw(aux_d, aux_d)
+
+        np.testing.assert_allclose(aux_d.get(), aux, atol=1e-06, rtol=5e-5,
+            err_msg="Numpy aux is \n%s, \nbut gpu aux is \n %s, \n " % (repr(aux), repr(aux_d.get())))
+
+    def test_farfield_propagator_backward_crop_pad_UNITY(self):
+        # setup
+        SH = (16,16)
+        aux = np.zeros((SH), dtype=COMPLEX_TYPE)
+        aux[5:11,5:11] = 1. + 2j
+        aux_d = gpuarray.to_gpu(aux)
+        geo = self.set_up_farfield(SH)
+        geo = self.set_up_farfield(SH, resolution=0.5*geo.resolution)
+
+        # test
+        aux = geo.propagator.bw(aux)
+        PropK = PropagationKernel(aux_d, geo.propagator, queue_thread=self.stream)
+        PropK.allocate()
+        PropK.bw(aux_d, aux_d)
+
+        np.testing.assert_allclose(aux_d.get(), aux, atol=1e-06, rtol=5e-5,
+            err_msg="Numpy aux is \n%s, \nbut gpu aux is \n %s, \n " % (repr(aux), repr(aux_d.get())))
 
     def test_nearfield_propagator_forward_UNITY(self):
         # setup
@@ -97,7 +137,8 @@ def test_nearfield_propagator_forward_UNITY(self):
         PropK.allocate()
         PropK.fw(aux_d, aux_d)
 
-        np.testing.assert_allclose(aux, aux_d.get(), atol=1e-06, rtol=5e-5, err_msg="Numpy aux is \n%s, \nbut gpu aux is \n %s, \n " % (repr(aux), repr(aux_d.get())))
+        np.testing.assert_allclose(aux_d.get(), aux, atol=1e-06, rtol=5e-5,
+            err_msg="Numpy aux is \n%s, \nbut gpu aux is \n %s, \n " % (repr(aux), repr(aux_d.get())))
 
     def test_nearfield_propagator_backward_UNITY(self):
         # setup
@@ -113,4 +154,5 @@ def test_nearfield_propagator_backward_UNITY(self):
         PropK.allocate()
         PropK.bw(aux_d, aux_d)
 
-        np.testing.assert_allclose(aux, aux_d.get(), atol=1e-06, rtol=5e-5, err_msg="Numpy aux is \n%s, \nbut gpu aux is \n %s, \n " % (repr(aux), repr(aux_d.get())))
\ No newline at end of file
+        np.testing.assert_allclose(aux_d.get(), aux, atol=1e-06, rtol=5e-5,
+            err_msg="Numpy aux is \n%s, \nbut gpu aux is \n %s, \n " % (repr(aux), repr(aux_d.get())))
\ No newline at end of file
diff --git a/test/engine_tests/ML_test.py b/test/engine_tests/ML_test.py
index b7ae3525e..fd95b816e 100644
--- a/test/engine_tests/ML_test.py
+++ b/test/engine_tests/ML_test.py
@@ -12,21 +12,6 @@ class MLTest(unittest.TestCase):
 
-    def test_ML_farfield_position_refinement(self):
-        engine_params = u.Param()
-        engine_params.name = 'ML'
-        engine_params.numiter = 5
-        engine_params.probe_update_start = 2
-        engine_params.floating_intensities = False
-        engine_params.intensity_renormalization = 1.0
-        engine_params.reg_del2 =True
-        engine_params.reg_del2_amplitude = 0.01
-        engine_params.smooth_gradient = 0.0
-        engine_params.scale_precond =False
-        engine_params.probe_update_start = 0
-        engine_params.position_refinement = True
-        tu.EngineTestRunner(engine_params)
-
     def test_ML_farfield_floating_intensities(self):
         engine_params = u.Param()
         engine_params.name = 'ML'