diff --git a/ptypy/accelerate/base/array_utils.py b/ptypy/accelerate/base/array_utils.py index 839b08e70..410d9102f 100644 --- a/ptypy/accelerate/base/array_utils.py +++ b/ptypy/accelerate/base/array_utils.py @@ -3,6 +3,7 @@ ''' import numpy as np from scipy import ndimage as ndi +from ptypy import utils as u def dot(A, B, acc_dtype=np.float64): @@ -148,3 +149,26 @@ def crop_pad_2d_simple(A, B): b1, b2 = B.shape[-2:] offset = [0, a1 // 2 - b1 // 2, a2 // 2 - b2 // 2] fill3D(A, B, offset) + +def resample(A, B): + """ + Resamples the last two dimensions of B onto shape of A and places it in A. + The ratio between shapes needs to be a power of 2 along the last two dimension. + upsampling (A larger than B): nearest neighbour interpolation + downsampling (B larger than A): integrate over neighbouring regions + """ + assert A.ndim > 2, "Arrays must have at least 2 dimensions" + assert B.ndim > 2, "Arrays must have at least 2 dimensions" + assert A.shape[:-2] == B.shape[:-2], "Arrays must have same shape expect along the last 2 dimensions" + assert A.shape[-2] == A.shape[-1], "Last two dimensions must be of equal length" + assert B.shape[-2] == B.shape[-2], "Last two dimensions must be of equal length" + # same sampling, no need to call this function + assert A.shape != B.shape, "A and B have the same shape, no need to do any resampling" + # upsampling + if A.shape[-1] > B.shape[-1]: + resample = A.shape[-1] // B.shape[-1] + A[:] = u.repeat_2d(B, resample) / (resample**2) + # downsampling + elif A.shape[-1] < B.shape[-1]: + resample = B.shape[-1] // A.shape[-1] + A[:] = u.rebin_2d(B, resample) * (resample**2) diff --git a/ptypy/accelerate/base/engines/DM_serial.py b/ptypy/accelerate/base/engines/DM_serial.py index 44573bf56..1a77eeb65 100644 --- a/ptypy/accelerate/base/engines/DM_serial.py +++ b/ptypy/accelerate/base/engines/DM_serial.py @@ -178,8 +178,18 @@ def _setup_kernels(self): aux = np.zeros(ash, dtype=np.complex64) kern.aux = aux + # create extra array for resampling (if needed) + kern.resample = scan.resample > 1 + if kern.resample: + ish = (ash[0],) + tuple(geo.shape//scan.resample) + kern.aux_tmp1 = np.zeros(ash, dtype=np.float32) + kern.aux_tmp2 = np.zeros(ish, dtype=np.float32) + aux_f = kern.aux_tmp2 # making sure to pass the correct shapes to FUK + else: + aux_f = aux + # setup kernels, one for each SCAN. - kern.FUK = FourierUpdateKernel(aux, nmodes) + kern.FUK = FourierUpdateKernel(aux_f, nmodes) kern.FUK.allocate() kern.POK = PoUpdateKernel() @@ -279,6 +289,12 @@ def engine_iterate(self, num=1): pbound = self.pbound_scan[prep.label] aux = kern.aux + # resampling + resample = kern.resample + if resample: + aux_tmp1 = kern.aux_tmp1 + aux_tmp2 = kern.aux_tmp2 + # local references ma = prep.ma ob = self.ob.S[oID].data @@ -290,7 +306,12 @@ def engine_iterate(self, num=1): t1 = time.time() AWK.build_aux_no_ex(aux, addr, ob, pr) aux[:] = FW(aux) - FUK.log_likelihood(aux, addr, mag, ma, err_phot) + if resample: + aux_tmp1 = np.abs(aux)**2 + au.resample(aux_tmp2, aux_tmp1) + FUK.log_likelihood(aux_tmp2, addr, mag, ma, err_phot, aux_is_intensity=True) + else: + FUK.log_likelihood(aux, addr, mag, ma, err_phot) self.benchmark.F_LLerror += time.time() - t1 ## build auxilliary wave @@ -305,9 +326,18 @@ def engine_iterate(self, num=1): ## Deviation from measured data t1 = time.time() - FUK.fourier_error(aux, addr, mag, ma, ma_sum) - FUK.error_reduce(addr, err_fourier) - FUK.fmag_all_update(aux, addr, mag, ma, err_fourier, pbound) + if resample: + aux_tmp1 = np.abs(aux)**2 + au.resample(aux_tmp2, aux_tmp1) + FUK.fourier_error(aux_tmp2, addr, mag, ma, ma_sum, aux_is_intensity=True) + FUK.error_reduce(addr, err_fourier) + FUK.fmag_all_update(aux_tmp2, addr, mag, ma, err_fourier, pbound, mult=False) + au.resample(aux_tmp1, aux_tmp2) + aux *= aux_tmp1 + else: + FUK.fourier_error(aux, addr, mag, ma, ma_sum) + FUK.error_reduce(addr, err_fourier) + FUK.fmag_all_update(aux, addr, mag, ma, err_fourier, pbound) self.benchmark.C_Fourier_update += time.time() - t1 ## backward FFT @@ -318,7 +348,12 @@ def engine_iterate(self, num=1): ## build exit wave t1 = time.time() AWK.build_exit(aux, addr, ob, pr, ex, alpha=self.p.alpha) - FUK.exit_error(aux,addr) + if resample: + aux_tmp1 = np.abs(aux)**2 + au.resample(aux_tmp2, aux_tmp1) + FUK.exit_error(aux_tmp2, addr, aux_is_intensity=True) + else: + FUK.exit_error(aux,addr) FUK.error_reduce(addr, err_exit) self.benchmark.E_Build_exit += time.time() - t1 diff --git a/ptypy/accelerate/base/engines/ML_serial.py b/ptypy/accelerate/base/engines/ML_serial.py index 214aa0536..21a473fbe 100644 --- a/ptypy/accelerate/base/engines/ML_serial.py +++ b/ptypy/accelerate/base/engines/ML_serial.py @@ -21,8 +21,9 @@ from ptypy.utils import parallel from ptypy.engines.utils import Cnorm2, Cdot from ptypy.engines import register -from ptypy.accelerate.base.kernels import GradientDescentKernel, AuxiliaryWaveKernel, PoUpdateKernel +from ptypy.accelerate.base.kernels import GradientDescentKernel, AuxiliaryWaveKernel, PoUpdateKernel, PositionCorrectionKernel from ptypy.accelerate.base import address_manglers +from ptypy.accelerate.base import array_utils as au __all__ = ['ML_serial'] @@ -90,6 +91,13 @@ def _setup_kernels(self): kern.a = np.zeros(ash, dtype=np.complex64) kern.b = np.zeros(ash, dtype=np.complex64) + # create extra array for resampling (if needed) + kern.resample = scan.resample > 1 + if kern.resample: + ish = (ash[0],) + tuple(geo.shape//scan.resample) + kern.aux_tmp1 = np.zeros(ash, dtype=np.float32) + kern.aux_tmp2 = np.zeros(ish, dtype=np.float32) + # setup kernels, one for each SCAN. kern.GDK = GradientDescentKernel(aux, nmodes) kern.GDK.allocate() @@ -284,6 +292,13 @@ def prepare(self): prep = self.engine.diff_info[d.ID] prep.weights = (self.Irenorm * self.engine.ma.S[d.ID].data / (1. / self.Irenorm + d.data)).astype(d.data.dtype) + prep.intensity = self.engine.di.S[d.ID].data + kern = self.engine.kernels[prep.label] + if kern.resample: + prep.weights2 = np.zeros((prep.weights.shape[0],) + kern.aux_tmp1.shape[-2:], dtype=prep.weights.dtype) + au.resample(prep.weights2, prep.weights) + prep.intensity2 = np.zeros((prep.intensity.shape[0],) + kern.aux_tmp1.shape[-2:], dtype=prep.weights.dtype) + au.resample(prep.intensity2, prep.intensity) def __del__(self): """ @@ -325,7 +340,8 @@ def new_grad(self): # get addresses and auxilliary array addr = prep.addr - w = prep.weights + w = prep.weights2 + I = prep.intensity2 err_phot = prep.err_phot fic = prep.float_intens_coeff @@ -334,7 +350,11 @@ def new_grad(self): obg = ob_grad.S[oID].data pr = self.engine.pr.S[pID].data prg = pr_grad.S[pID].data - I = self.engine.di.S[dID].data + + # resampling + if kern.resample: + aux_tmp1 = kern.aux_tmp1 + aux_tmp2 = kern.aux_tmp2 # make propagated exit (to buffer) AWK.build_aux_no_ex(aux, addr, ob, pr, add=False) @@ -342,7 +362,14 @@ def new_grad(self): # forward prop aux[:] = FW(aux) - GDK.make_model(aux, addr) + if kern.resample: + aux_tmp1 = np.abs(aux)**2 + au.resample(aux_tmp2, aux_tmp1) + au.resample(aux_tmp1, aux_tmp2) + GDK.make_model(aux_tmp1, addr, aux_is_intensity=True) + else: + GDK.make_model(aux, addr) + if self.p.floating_intensities: GDK.floating_intensity(addr, w, I, fic) GDK.main(aux, addr, w, I) @@ -406,6 +433,7 @@ def poly_line_coeffs(self, c_ob_h, c_pr_h): # get addresses and auxilliary array addr = prep.addr w = prep.weights + I = prep.intensity fic = prep.float_intens_coeff # local references @@ -413,7 +441,11 @@ def poly_line_coeffs(self, c_ob_h, c_pr_h): ob_h = c_ob_h.S[oID].data pr = self.pr.S[pID].data pr_h = c_pr_h.S[pID].data - I = self.di.S[dID].data + + # resampling + if kern.resample: + w = prep.weights2 + I = prep.intensity2 # make propagated exit (to buffer) AWK.build_aux_no_ex(f, addr, ob, pr, add=False) diff --git a/ptypy/accelerate/base/kernels.py b/ptypy/accelerate/base/kernels.py index b1f109444..d11d2e77e 100644 --- a/ptypy/accelerate/base/kernels.py +++ b/ptypy/accelerate/base/kernels.py @@ -49,9 +49,9 @@ def allocate(self): self.npy.fdev = np.zeros(self.fshape, dtype=np.float32) self.npy.ferr = np.zeros(self.fshape, dtype=np.float32) - def fourier_error(self, b_aux, addr, mag, mask, mask_sum): + def fourier_error(self, b_aux, addr, mag, mask, mask_sum, aux_is_intensity=False): # reference shape (write-to shape) - sh = self.fshape + sh = b_aux.shape # stopper maxz = mag.shape[0] @@ -65,7 +65,10 @@ def fourier_error(self, b_aux, addr, mag, mask, mask_sum): # build model from complex fourier magnitudes, summing up # all modes incoherently tf = aux.reshape(maxz, self.nmodes, sh[1], sh[2]) - af = np.sqrt((np.abs(tf) ** 2).sum(1)) + if aux_is_intensity: + af = np.sqrt(tf.sum(1)) + else: + af = np.sqrt((np.abs(tf) ** 2).sum(1)) # calculate difference to real data (g_mag) fdev[:] = af - mag @@ -74,9 +77,9 @@ def fourier_error(self, b_aux, addr, mag, mask, mask_sum): ferr[:] = mask * np.abs(fdev) ** 2 / mask_sum.reshape((maxz, 1, 1)) return - def fourier_deviation(self, b_aux, addr, mag): + def fourier_deviation(self, b_aux, addr, mag, aux_is_intensity=False): # reference shape (write-to shape) - sh = self.fshape + sh = b_aux.shape # stopper maxz = mag.shape[0] @@ -89,7 +92,10 @@ def fourier_deviation(self, b_aux, addr, mag): # build model from complex fourier magnitudes, summing up # all modes incoherently tf = aux.reshape(maxz, self.nmodes, sh[1], sh[2]) - af = np.sqrt((np.abs(tf) ** 2).sum(1)) + if aux_is_intensity: + af = np.sqrt(tf.sum(1)) + else: + af = np.sqrt((np.abs(tf) ** 2).sum(1)) # calculate difference to real data (g_mag) fdev[:] = af - mag @@ -97,8 +103,6 @@ def fourier_deviation(self, b_aux, addr, mag): return def error_reduce(self, addr, err_sum): - # reference shape (write-to shape) - sh = self.fshape # stopper maxz = err_sum.shape[0] @@ -113,9 +117,9 @@ def error_reduce(self, addr, err_sum): err_sum[:] = ferr.sum(-1).sum(-1) return - def fmag_all_update(self, b_aux, addr, mag, mask, err_sum, pbound=0.0): + def fmag_all_update(self, b_aux, addr, mag, mask, err_sum, pbound=0.0, mult=True): - sh = self.fshape + sh = b_aux.shape nmodes = self.nmodes # stopper @@ -153,12 +157,15 @@ def fmag_all_update(self, b_aux, addr, mag, mask, err_sum, pbound=0.0): #fm[:] = mag / (af + 1e-6) # upcasting - aux[:] = (aux.reshape(ish[0] // nmodes, nmodes, ish[1], ish[2]) * fm[:, np.newaxis, :, :]).reshape(ish) + if mult: + aux[:] = (aux.reshape(ish[0] // nmodes, nmodes, ish[1], ish[2]) * fm[:, np.newaxis, :, :]).reshape(ish) + else: + aux[:] = (np.ones((ish[0] // nmodes, nmodes, ish[1], ish[2])) * fm[:, np.newaxis, :, :]).reshape(ish) return - def fmag_update_nopbound(self, b_aux, addr, mag, mask): + def fmag_update_nopbound(self, b_aux, addr, mag, mask, mult=True): - sh = self.fshape + sh = b_aux.shape nmodes = self.nmodes # stopper @@ -180,12 +187,15 @@ def fmag_update_nopbound(self, b_aux, addr, mag, mask): fm[:] = (1 - mask) + mask * mag / (af + self.denom) # upcasting - aux[:] = (aux.reshape(ish[0] // nmodes, nmodes, ish[1], ish[2]) * fm[:, np.newaxis, :, :]).reshape(ish) + if mult: + aux[:] = (aux.reshape(ish[0] // nmodes, nmodes, ish[1], ish[2]) * fm[:, np.newaxis, :, :]).reshape(ish) + else: + aux[:] = (np.ones((ish[0] // nmodes, nmodes, ish[1], ish[2])) * fm[:, np.newaxis, :, :]).reshape(ish) return - def log_likelihood(self, b_aux, addr, mag, mask, err_phot): + def log_likelihood(self, b_aux, addr, mag, mask, err_phot, aux_is_intensity=False): # reference shape (write-to shape) - sh = self.fshape + sh = b_aux.shape # stopper maxz = mag.shape[0] @@ -195,7 +205,10 @@ def log_likelihood(self, b_aux, addr, mag, mask, err_phot): # build model from complex fourier magnitudes, summing up # all modes incoherently tf = aux.reshape(maxz, self.nmodes, sh[1], sh[2]) - LL = (np.abs(tf) ** 2).sum(1) + if aux_is_intensity: + LL = tf.sum(1) + else: + LL = (np.abs(tf) ** 2).sum(1) # Intensity data I = mag**2 @@ -204,7 +217,7 @@ def log_likelihood(self, b_aux, addr, mag, mask, err_phot): err_phot[:] = ((mask * (LL - I)**2 / (I + 1.)).sum(-1).sum(-1) / np.prod(LL.shape[-2:])) return - def exit_error(self, aux, addr): + def exit_error(self, aux, addr, aux_is_intensity=False): sh = addr.shape maxz = sh[0] @@ -212,7 +225,11 @@ def exit_error(self, aux, addr): ferr = self.npy.ferr[:maxz] dex = aux[:maxz * self.nmodes] fsh = dex.shape[-2:] - ferr[:] = (np.abs(dex.reshape((maxz,self.nmodes,fsh[0], fsh[1])))**2).sum(axis=1) / np.prod(fsh) + tf = dex.reshape((maxz,self.nmodes,fsh[0], fsh[1])) + if aux_is_intensity: + ferr[:] = (tf).sum(axis=1) / np.prod(fsh) + else: + ferr[:] = (np.abs(tf)**2).sum(axis=1) / np.prod(fsh) class GradientDescentKernel(BaseKernel): @@ -256,10 +273,10 @@ def allocate(self): self.npy.fic_tmp = np.ones((self.fshape[0],), dtype=self.ftype) - def make_model(self, b_aux, addr): + def make_model(self, b_aux, addr, aux_is_intensity=False): # reference shape (= GPU global dims) - sh = self.fshape + sh = b_aux.shape # batch buffers Imodel = self.npy.Imodel @@ -267,7 +284,10 @@ def make_model(self, b_aux, addr): ## Actual math ## (subset of FUK.fourier_error) tf = aux.reshape(sh[0], self.nmodes, sh[1], sh[2]) - Imodel[:] = (np.abs(tf) ** 2).sum(1) + if aux_is_intensity: + Imodel[:] = tf.sum(1) + else: + Imodel[:] = (np.abs(tf) ** 2).sum(1) def make_a012(self, b_f, b_a, b_b, addr, I, fic): @@ -377,8 +397,8 @@ def main(self, b_aux, addr, w, I): ## math ## DI = Imodel - I tmp = w * DI - err[:] = tmp * DI + err[:] = tmp * DI aux[:] = (aux.reshape(ish[0] // nmodes, nmodes, ish[1], ish[2]) * tmp[:, np.newaxis, :, :]).reshape(ish) return diff --git a/ptypy/accelerate/cuda_pycuda/array_utils.py b/ptypy/accelerate/cuda_pycuda/array_utils.py index 85f816223..ed40475db 100644 --- a/ptypy/accelerate/cuda_pycuda/array_utils.py +++ b/ptypy/accelerate/cuda_pycuda/array_utils.py @@ -239,6 +239,85 @@ def crop_pad_2d_simple(self, A, B): self.fill3D(A, B, offset) +class ResampleKernel: + def __init__(self, queue=None): + self.queue = queue + # we lazy-load this depending on the data types we get + self.upsample_cuda = {} + self.downsample_cuda = {} + + def resample(self, A, B): + assert A.ndim > 2, "Arrays must have at least 2 dimensions" + assert B.ndim > 2, "Arrays must have at least 2 dimensions" + assert A.shape[:-2] == B.shape[:-2], "Arrays must have same shape expect along the last 2 dimensions" + assert A.shape[-2] == A.shape[-1], "Last two dimensions must be of equal length" + assert B.shape[-2] == B.shape[-2], "Last two dimensions must be of equal length" + # same sampling, no need to call this function + assert A.shape != B.shape, "A and B have the same shape, no need to do any resampling" + assert A.dtype == B.dtype, "A and B must have the same data type" + + def map_type(dt): + if dt == np.float32: + return 'float' + elif dt == np.float64: + return 'double' + elif dt == np.complex64: + return 'complex' + elif dt == np.complex128: + return 'complex' + elif dt == np.int32: + return 'int' + elif dt == np.int64: + return 'long long' + else: + raise ValueError('No mapping for {}'.format(dt)) + + bx = 16 + by = 16 + # upsampling + if A.shape[-1] > B.shape[-1]: + resample = A.shape[-1] // B.shape[-1] + assert resample <= bx, "Upsampling up to factor {} is supported".format(bx) + + version = '{}'.format(map_type(A.dtype)) + if version not in self.upsample_cuda: + self.upsample_cuda[version] = load_kernel('upsample', { + 'TYPE': map_type(A.dtype) + }) + self.upsample_cuda[version](A, B, + np.int32(A.shape[1]), + np.int32(A.shape[2]), + np.int32(resample), + block=(bx, by, 1), + grid=( + int((A.shape[2] + bx - 1) // bx), + int((A.shape[1] + by - 1) // by), + int(A.shape[0])), + shared=int(bx*by*A.dtype.itemsize/resample/resample), + stream=self.queue) + + # downsampling + elif A.shape[-1] < B.shape[-1]: + resample = B.shape[-1] // A.shape[-1] + assert resample <= bx, "Downsampling up to factor {} is supported".format(bx) + + version = '{}'.format(map_type(A.dtype)) + if version not in self.downsample_cuda: + self.downsample_cuda[version] = load_kernel('downsample', { + 'TYPE': map_type(A.dtype) + }) + self.downsample_cuda[version](A, B, + np.int32(B.shape[1]), + np.int32(B.shape[2]), + np.int32(resample), + block=(bx, by, 1), + grid=( + int((B.shape[2] + bx - 1) // bx), + int((B.shape[1] + by - 1) // by), + int(A.shape[0])), + shared=int(bx*by*A.dtype.itemsize), + stream=self.queue) + class DerivativesKernel: def __init__(self, dtype, queue=None): if dtype == np.float32: diff --git a/ptypy/accelerate/cuda_pycuda/cuda/downsample.cu b/ptypy/accelerate/cuda_pycuda/cuda/downsample.cu new file mode 100644 index 000000000..7c9544ad3 --- /dev/null +++ b/ptypy/accelerate/cuda_pycuda/cuda/downsample.cu @@ -0,0 +1,60 @@ +#include +using thrust::complex; + +extern "C" __global__ void downsample(TYPE* A, + const TYPE* B, + int Brows, + int Bcols, + int factor) +{ + // shared mem is BDIMx x BDIMy, declared on the call-side + extern __shared__ char sh_raw[]; + auto block = reinterpret_cast(sh_raw); + + const int tx = threadIdx.x; + const int ty = threadIdx.y; + const int bx = threadIdx.x + blockIdx.x * blockDim.x; + const int by = threadIdx.y + blockIdx.y * blockDim.y; + + // offset A and B to the right point in first dimension + A += blockIdx.z * Brows / factor * Bcols / factor; + B += blockIdx.z * Brows * Bcols; + + // read input data into shared memory + if (bx < Bcols && by < Brows) { + block[ty * blockDim.x + tx] = B[by * Bcols + bx]; + } else { + block[ty * blockDim.x + tx] = TYPE(0); + } + __syncthreads(); + + // reduce across the downsampling area + // first in x direction + int c = 1; + while (c != factor) + { + c *= 2; + if (tx % c == 0) { + block[ty * blockDim.x + tx] += block[ty * blockDim.x + tx + c/2]; + } + __syncthreads(); + } + + // now in y direction + c = 1; + while (c != factor) + { + c *= 2; + if (ty % c == 0) { + block[ty * blockDim.x + tx] += block[(ty +c/2) * blockDim.x + tx]; + } + __syncthreads(); + } + + // now write out + if (tx % factor == 0 && ty % factor == 0 && bx < Bcols && by < Brows) { + const int ax = bx / factor; + const int ay = by / factor; + A[ay * Bcols / factor + ax] = block[ty * blockDim.x + tx]; + } +} \ No newline at end of file diff --git a/ptypy/accelerate/cuda_pycuda/cuda/exit_error.cu b/ptypy/accelerate/cuda_pycuda/cuda/exit_error.cu index fdac52e46..93671d158 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/exit_error.cu +++ b/ptypy/accelerate/cuda_pycuda/cuda/exit_error.cu @@ -5,22 +5,33 @@ using std::sqrt; using thrust::abs; using thrust::complex; -// specify max number of threads/block and min number of blocks per SM, -// to assist the compiler in register optimisations. -// We achieve a higher occupancy in this case, as less registers are used -// (guided by profiler) -extern "C" __global__ void __launch_bounds__(1024, 2) - exit_error(int nmodes, - const complex * __restrict aux, - OUT_TYPE *ferr, - const int * __restrict addr, - int A, - int B) +// version if input is complex, i.e. we need to calc abs(.)**2 +inline __device__ MATH_TYPE aux_intensity(complex aux_t) { + MATH_TYPE abst = abs(aux_t); + return abst * abst; // if we do this manually (real*real +imag*imag) + // we get differences to numpy due to rounding +} + +// version if input is real, so we can just return it +inline __device__ MATH_TYPE aux_intensity(MATH_TYPE aux_t) { + return aux_t; +} + + +// generic template that works for complex aux, or intensity aux +template +inline __device__ void exit_error_impl(int nmodes, + const AUX_T* __restrict aux, + OUT_TYPE *ferr, + const int* __restrict addr, + int A, + int B) { int tx = threadIdx.x; int ty = threadIdx.y; int addr_stride = 15; MATH_TYPE denom = A * B; + const int *ea = addr + 6 + (blockIdx.x * nmodes) * addr_stride; const int *da = addr + 9 + (blockIdx.x * nmodes) * addr_stride; @@ -35,13 +46,40 @@ extern "C" __global__ void __launch_bounds__(1024, 2) MATH_TYPE acc = 0.0; for (int idx = 0; idx < nmodes; ++idx) { - complex t_aux = aux[a * B + b + idx * A * B]; - MATH_TYPE abs_exit_wave = abs(t_aux); - acc += abs_exit_wave * - abs_exit_wave; // if we do this manually (real*real +imag*imag) - // we get differences to numpy due to rounding + acc += aux_intensity(aux[a * B + b + idx * A * B]); } ferr[a * B + b] = OUT_TYPE(acc / denom); } } } + +// 2 kernels using the same template above - only difference is the input +// type for aux (and the name) + + +// specify max number of threads/block and min number of blocks per SM, +// to assist the compiler in register optimisations. +// We achieve a higher occupancy in this case, as less registers are used +// (guided by profiler) +extern "C" __global__ void __launch_bounds__(1024, 2) + exit_error(int nmodes, + const complex* __restrict aux, + OUT_TYPE *ferr, + const int * __restrict addr, + int A, + int B) +{ + exit_error_impl(nmodes, aux, ferr, addr, A, B); +} + + +extern "C" __global__ void __launch_bounds__(1024, 2) + exit_error_auxintensity(int nmodes, + const IN_TYPE* __restrict aux, + OUT_TYPE *ferr, + const int * __restrict addr, + int A, + int B) +{ + exit_error_impl(nmodes, aux, ferr, addr, A, B); +} \ No newline at end of file diff --git a/ptypy/accelerate/cuda_pycuda/cuda/fmag_all_update.cu b/ptypy/accelerate/cuda_pycuda/cuda/fmag_all_update.cu index f8f695ca5..5babcd291 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/fmag_all_update.cu +++ b/ptypy/accelerate/cuda_pycuda/cuda/fmag_all_update.cu @@ -11,15 +11,16 @@ using std::sqrt; using thrust::complex; -extern "C" __global__ void fmag_all_update(complex* f, - const IN_TYPE* fmask, - const IN_TYPE* fmag, - const IN_TYPE* fdev, - const IN_TYPE* err_fmag, - const int* addr_info, - IN_TYPE pbound_, - int A, - int B) +template +inline __device__ void fmag_all_update_impl(FType* f, + const IN_TYPE* fmask, + const IN_TYPE* fmag, + const IN_TYPE* fdev, + const IN_TYPE* err_fmag, + const int* addr_info, + IN_TYPE pbound_, + int A, + int B) { int batch = blockIdx.x; int tx = threadIdx.x; @@ -55,8 +56,38 @@ extern "C" __global__ void fmag_all_update(complex* f, MATH_TYPE fdevv = fdev[a * A + b]; MATH_TYPE fm = (MATH_TYPE(1) - m) + m * ((fmagv + fdevv * renorm) / (fmagv + fdevv + MATH_TYPE(1e-7))); - f[a * A + b] *= fm; + if (doMult) // compile-time constant, so only one branch ends up in kernel + f[a * A + b] *= fm; + else + f[a * A + b] = fm; } } } } + +extern "C" __global__ void fmag_all_update(complex* f, + const IN_TYPE* fmask, + const IN_TYPE* fmag, + const IN_TYPE* fdev, + const IN_TYPE* err_fmag, + const int* addr_info, + IN_TYPE pbound_, + int A, + int B) +{ + fmag_all_update_impl(f, fmask, fmag, fdev, err_fmag, addr_info, pbound_, A, B); +} + + +extern "C" __global__ void fmag_all_update_nomult(OUT_TYPE* f, + const IN_TYPE* fmask, + const IN_TYPE* fmag, + const IN_TYPE* fdev, + const IN_TYPE* err_fmag, + const int* addr_info, + IN_TYPE pbound_, + int A, + int B) +{ + fmag_all_update_impl(f, fmask, fmag, fdev, err_fmag, addr_info, pbound_, A, B); +} \ No newline at end of file diff --git a/ptypy/accelerate/cuda_pycuda/cuda/fmag_update_nopbound.cu b/ptypy/accelerate/cuda_pycuda/cuda/fmag_update_nopbound.cu index 40a65c172..166e22f4c 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/fmag_update_nopbound.cu +++ b/ptypy/accelerate/cuda_pycuda/cuda/fmag_update_nopbound.cu @@ -11,13 +11,14 @@ using std::sqrt; using thrust::complex; -extern "C" __global__ void fmag_update_nopbound(complex* f, - const IN_TYPE* fmask, - const IN_TYPE* fmag, - const IN_TYPE* fdev, - const int* addr_info, - int A, - int B) +template +inline __device__ void fmag_update_nopbound_impl(FType* f, + const IN_TYPE* fmask, + const IN_TYPE* fmag, + const IN_TYPE* fdev, + const int* addr_info, + int A, + int B) { const int bid = blockIdx.z; const int tx = threadIdx.x; @@ -48,6 +49,32 @@ extern "C" __global__ void fmag_update_nopbound(complex* f, MATH_TYPE fdevv = fdev[a * A + b]; MATH_TYPE fm = (MATH_TYPE(1) - m) + m * (fmagv / (fmagv + fdevv + MATH_TYPE(1e-7))); - f[a * A + b] *= fm; + if (doMult) // compile-time constant, so only one branch ends up in kernel + f[a * A + b] *= fm; + else + f[a * A + b] = fm; } } + +extern "C" __global__ void fmag_update_nopbound(complex* f, + const IN_TYPE* fmask, + const IN_TYPE* fmag, + const IN_TYPE* fdev, + const int* addr_info, + int A, + int B) +{ + fmag_update_nopbound_impl(f, fmask, fmag, fdev, addr_info, A, B); +} + + +extern "C" __global__ void fmag_update_nopbound_nomult(OUT_TYPE* f, + const IN_TYPE* fmask, + const IN_TYPE* fmag, + const IN_TYPE* fdev, + const int* addr_info, + int A, + int B) +{ + fmag_update_nopbound_impl(f, fmask, fmag, fdev, addr_info, A, B); +} \ No newline at end of file diff --git a/ptypy/accelerate/cuda_pycuda/cuda/fourier_deviation.cu b/ptypy/accelerate/cuda_pycuda/cuda/fourier_deviation.cu index 3427222c3..5c6526b37 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/fourier_deviation.cu +++ b/ptypy/accelerate/cuda_pycuda/cuda/fourier_deviation.cu @@ -13,13 +13,23 @@ using std::sqrt; using thrust::abs; using thrust::complex; -// specify max number of threads/block and min number of blocks per SM, -// to assist the compiler in register optimisations. -// We achieve a higher occupancy in this case, as less registers are used -// (guided by profiler) -extern "C" __global__ void __launch_bounds__(1024, 2) - fourier_deviation(int nmodes, - const complex *f, +// version where input is complex and we need to calculate intensity +inline __device__ MATH_TYPE intensity(complex f) +{ + MATH_TYPE abst = abs(f); + return abst * abst; // if we do this manually (real*real +imag*imag) + // we get differences to numpy due to rounding +} + +// version where input is already an intensity - just return +inline __device__ MATH_TYPE intensity(MATH_TYPE f) +{ + return f; +} + +template +inline __device__ void fourier_deviation_impl(int nmodes, + const AUX_T *f, const IN_TYPE *fmag, OUT_TYPE *fdev, const int *addr, @@ -46,13 +56,40 @@ extern "C" __global__ void __launch_bounds__(1024, 2) MATH_TYPE acc = MATH_TYPE(0); for (int idx = 0; idx < nmodes; ++idx) { - complex t_f = f[a * B + b + idx * A * B]; - MATH_TYPE abs_exit_wave = abs(t_f); - acc += abs_exit_wave * - abs_exit_wave; // if we do this manually (real*real +imag*imag) - // we get differences to numpy due to rounding + acc += intensity(f[a * B + b + idx * A * B]); } auto fdevv = sqrt(acc) - MATH_TYPE(fmag[a * B + b]); fdev[a * B + b] = fdevv; } } + +// two kernels - one if input is already an intensity and one where it's not + +// specify max number of threads/block and min number of blocks per SM, +// to assist the compiler in register optimisations. +// We achieve a higher occupancy in this case, as less registers are used +// (guided by profiler) +extern "C" __global__ void __launch_bounds__(1024, 2) + fourier_deviation(int nmodes, + const complex *f, + const IN_TYPE *fmag, + OUT_TYPE *fdev, + const int *addr, + int A, + int B) +{ + fourier_deviation_impl(nmodes, f, fmag, fdev, addr, A, B); +} + + +extern "C" __global__ void __launch_bounds__(1024, 2) + fourier_deviation_auxintensity(int nmodes, + const IN_TYPE *f, + const IN_TYPE *fmag, + OUT_TYPE *fdev, + const int *addr, + int A, + int B) +{ + fourier_deviation_impl(nmodes, f, fmag, fdev, addr, A, B); +} \ No newline at end of file diff --git a/ptypy/accelerate/cuda_pycuda/cuda/fourier_error.cu b/ptypy/accelerate/cuda_pycuda/cuda/fourier_error.cu index ad483c870..93e86c0f7 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/fourier_error.cu +++ b/ptypy/accelerate/cuda_pycuda/cuda/fourier_error.cu @@ -14,13 +14,22 @@ using std::sqrt; using thrust::abs; using thrust::complex; -// specify max number of threads/block and min number of blocks per SM, -// to assist the compiler in register optimisations. -// We achieve a higher occupancy in this case, as less registers are used -// (guided by profiler) -extern "C" __global__ void __launch_bounds__(1024, 2) - fourier_error(int nmodes, - const complex *f, +// version if input is complex, i.e. we need to calc abs(.)**2 +inline __device__ MATH_TYPE aux_intensity(complex aux_t) { + MATH_TYPE abst = abs(aux_t); + return abst * abst; // if we do this manually (real*real +imag*imag) + // we get differences to numpy due to rounding +} + +// version if input is real, so we can just return it +inline __device__ MATH_TYPE aux_intensity(MATH_TYPE aux_t) { + return aux_t; +} + +template +inline __device__ void fourier_error_impl( + int nmodes, + const AUX_T *f, const IN_TYPE *fmask, const IN_TYPE *fmag, OUT_TYPE *fdev, @@ -51,11 +60,7 @@ extern "C" __global__ void __launch_bounds__(1024, 2) MATH_TYPE acc = MATH_TYPE(0); for (int idx = 0; idx < nmodes; ++idx) { - complex t_f = f[a * B + b + idx * A * B]; - MATH_TYPE abs_exit_wave = abs(t_f); - acc += abs_exit_wave * - abs_exit_wave; // if we do this manually (real*real +imag*imag) - // we get differences to numpy due to rounding + acc += aux_intensity(f[a * B + b + idx * A * B]); } auto fdevv = sqrt(acc) - MATH_TYPE(fmag[a * B + b]); ferr[a * B + b] = (fmask[a * B + b] * fdevv * fdevv) / mask_sum[ma[0]]; @@ -63,3 +68,38 @@ extern "C" __global__ void __launch_bounds__(1024, 2) } } } + +// specify max number of threads/block and min number of blocks per SM, +// to assist the compiler in register optimisations. +// We achieve a higher occupancy in this case, as less registers are used +// (guided by profiler) +extern "C" __global__ void __launch_bounds__(1024, 2) + fourier_error(int nmodes, + const complex *f, + const IN_TYPE *fmask, + const IN_TYPE *fmag, + OUT_TYPE *fdev, + OUT_TYPE *ferr, + const IN_TYPE *mask_sum, + const int *addr, + int A, + int B) +{ + fourier_error_impl(nmodes, f, fmask, fmag, fdev, ferr, mask_sum, addr, A, B); +} + + +extern "C" __global__ void __launch_bounds__(1024, 2) + fourier_error_auxintensity(int nmodes, + const IN_TYPE *f, + const IN_TYPE *fmask, + const IN_TYPE *fmag, + OUT_TYPE *fdev, + OUT_TYPE *ferr, + const IN_TYPE *mask_sum, + const int *addr, + int A, + int B) +{ + fourier_error_impl(nmodes, f, fmask, fmag, fdev, ferr, mask_sum, addr, A, B); +} diff --git a/ptypy/accelerate/cuda_pycuda/cuda/log_likelihood.cu b/ptypy/accelerate/cuda_pycuda/cuda/log_likelihood.cu index 90455b1e2..04fa202a7 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/log_likelihood.cu +++ b/ptypy/accelerate/cuda_pycuda/cuda/log_likelihood.cu @@ -13,13 +13,24 @@ using std::sqrt; using thrust::abs; using thrust::complex; -// specify max number of threads/block and min number of blocks per SM, -// to assist the compiler in register optimisations. -// We achieve a higher occupancy in this case, as less registers are used -// (guided by profiler) -extern "C" __global__ void __launch_bounds__(1024, 2) - log_likelihood(int nmodes, - complex *aux, +// version if input is complex, i.e. we need to calc abs(.)**2 +inline __device__ MATH_TYPE aux_intensity(complex aux_t) { + MATH_TYPE abst = abs(aux_t); + return abst * abst; // if we do this manually (real*real +imag*imag) + // we get differences to numpy due to rounding +} + +// version if input is real, so we can just return it +inline __device__ MATH_TYPE aux_intensity(MATH_TYPE aux_t) { + return aux_t; +} + +////////// log_likelihood with 1 thread block per image + +template +inline __device__ void log_likelihood_impl( + int nmodes, + AUX_T *aux, const IN_TYPE *fmask, const IN_TYPE *fmag, const int *addr, @@ -48,11 +59,7 @@ extern "C" __global__ void __launch_bounds__(1024, 2) MATH_TYPE acc = 0.0; for (int idx = 0; idx < nmodes; ++idx) { - complex t_aux = aux[a * B + b + idx * A * B]; - MATH_TYPE abs_exit_wave = abs(t_aux); - acc += abs_exit_wave * - abs_exit_wave; // if we do this manually (real*real +imag*imag) - // we get differences to numpy due to rounding + acc += aux_intensity(aux[a * B + b + idx * A * B]); } auto I = MATH_TYPE(fmag[a * B + b]) * MATH_TYPE(fmag[a * B + b]); llerr[a * B + b] = @@ -61,9 +68,12 @@ extern "C" __global__ void __launch_bounds__(1024, 2) } } - -extern "C" __global__ void - log_likelihood2(int nmodes, +// specify max number of threads/block and min number of blocks per SM, +// to assist the compiler in register optimisations. +// We achieve a higher occupancy in this case, as less registers are used +// (guided by profiler) +extern "C" __global__ void __launch_bounds__(1024, 2) + log_likelihood(int nmodes, complex *aux, const IN_TYPE *fmask, const IN_TYPE *fmag, @@ -71,6 +81,34 @@ extern "C" __global__ void IN_TYPE *llerr, int A, int B) +{ + log_likelihood_impl(nmodes, aux, fmask, fmag, addr, llerr, A, B); +} + +extern "C" __global__ void __launch_bounds__(1024, 2) + log_likelihood_auxintensity(int nmodes, + OUT_TYPE *aux, + const IN_TYPE *fmask, + const IN_TYPE *fmag, + const int *addr, + IN_TYPE *llerr, + int A, + int B) +{ + log_likelihood_impl(nmodes, aux, fmask, fmag, addr, llerr, A, B); +} + +/////////////////// version with 1 thread block per x dimension only +template +__device__ inline void log_likelihood2_impl( + int nmodes, + AUX_T *aux, + const IN_TYPE *fmask, + const IN_TYPE *fmag, + const int *addr, + IN_TYPE *llerr, + int A, + int B) { int bid = blockIdx.z; int tx = threadIdx.x; @@ -94,14 +132,37 @@ extern "C" __global__ void MATH_TYPE acc = 0.0; for (int idx = 0; idx < nmodes; ++idx) { - complex t_aux = aux[a * B + b + idx * A * B]; - MATH_TYPE abs_exit_wave = abs(t_aux); - acc += abs_exit_wave * - abs_exit_wave; // if we do this manually (real*real +imag*imag) - // we get differences to numpy due to rounding + acc += aux_intensity(aux[a * B + b + idx * A * B]); } auto I = MATH_TYPE(fmag[a * B + b]) * MATH_TYPE(fmag[a * B + b]); llerr[a * B + b] = MATH_TYPE(fmask[a * B + b]) * (acc - I) * (acc - I) / (I + 1) / norm; } +} + + +extern "C" __global__ void + log_likelihood2(int nmodes, + complex *aux, + const IN_TYPE *fmask, + const IN_TYPE *fmag, + const int *addr, + IN_TYPE *llerr, + int A, + int B) +{ + log_likelihood2_impl(nmodes, aux, fmask, fmag, addr, llerr, A, B); +} + +extern "C" __global__ void + log_likelihood2_auxintensity(int nmodes, + OUT_TYPE *aux, + const IN_TYPE *fmask, + const IN_TYPE *fmag, + const int *addr, + IN_TYPE *llerr, + int A, + int B) +{ + log_likelihood2_impl(nmodes, aux, fmask, fmag, addr, llerr, A, B); } \ No newline at end of file diff --git a/ptypy/accelerate/cuda_pycuda/cuda/upsample.cu b/ptypy/accelerate/cuda_pycuda/cuda/upsample.cu new file mode 100644 index 000000000..ed4024114 --- /dev/null +++ b/ptypy/accelerate/cuda_pycuda/cuda/upsample.cu @@ -0,0 +1,41 @@ +#include +using thrust::complex; + +extern "C" __global__ void upsample(TYPE* A, + const TYPE* B, + int Arows, + int Acols, + int factor) +{ + // shared mem is BDIM/factor x BDIM/factor of TYPE, declared on the call-side + extern __shared__ char sh_raw[]; + auto block = reinterpret_cast(sh_raw); + + const int tx = threadIdx.x; + const int ty = threadIdx.y; + const int ax = threadIdx.x + blockIdx.x * blockDim.x; + const int ay = threadIdx.y + blockIdx.y * blockDim.y; + + // offset A and B to the right point in first dimension + A += blockIdx.z * Arows * Acols; + B += blockIdx.z * Arows / factor * Acols / factor; + + // read the input data into the shared memory + // we use a thread block per output size, so we only need a + // subset of the threads to read the data in + if (tx < blockDim.x / factor && ty < blockDim.y / factor && ax < Acols && + ay < Arows) + { + const int tbx = blockIdx.x * blockDim.x / factor + tx; + const int tby = blockIdx.y * blockDim.y / factor + ty; + block[ty * blockDim.x / factor + tx] = + B[tby * Acols / factor + tbx] / (factor * factor); + } + __syncthreads(); + + // now all the threads read from shared mem to output + if (ax < Acols && ay < Arows) + { + A[ay * Acols + ax] = block[(ty / factor * blockDim.x + tx) / factor]; + } +} \ No newline at end of file diff --git a/ptypy/accelerate/cuda_pycuda/engines/DM_pycuda.py b/ptypy/accelerate/cuda_pycuda/engines/DM_pycuda.py index 961851072..8e15a1516 100644 --- a/ptypy/accelerate/cuda_pycuda/engines/DM_pycuda.py +++ b/ptypy/accelerate/cuda_pycuda/engines/DM_pycuda.py @@ -12,16 +12,17 @@ import time from pycuda import gpuarray import pycuda.driver as cuda +from pycuda.elementwise import ElementwiseKernel from ptypy import utils as u from ptypy.utils.verbose import logger, log from ptypy.utils import parallel from ptypy.engines import register from ptypy.accelerate.base.engines import DM_serial -from .. import get_context +from .. import get_context, debug_options from ..kernels import FourierUpdateKernel, AuxiliaryWaveKernel, PoUpdateKernel, PositionCorrectionKernel from ..kernels import PropagationKernel, RealSupportKernel, FourierSupportKernel -from ..array_utils import ArrayUtilsKernel, GaussianSmoothingKernel, TransposeKernel, ClipMagnitudesKernel +from ..array_utils import ArrayUtilsKernel, GaussianSmoothingKernel, ResampleKernel, TransposeKernel, ClipMagnitudesKernel from ..mem_utils import make_pagelocked_paired_arrays as mppa from ..multi_gpu import get_multi_gpu_communicator @@ -111,9 +112,19 @@ def _setup_kernels(self): aux = np.zeros(ash, dtype=np.complex64) kern.aux = gpuarray.to_gpu(aux) + # create extra array for resampling (if needed) + kern.resample = scan.resample > 1 + if kern.resample: + ish = (ash[0],) + tuple(geo.shape//scan.resample) + kern.aux_tmp1 = np.zeros(ash, dtype=np.float32) + kern.aux_tmp2 = np.zeros(ish, dtype=np.float32) + aux_f = kern.aux_tmp2 # making sure to pass the correct shapes to FUK + else: + aux_f = aux + # setup kernels, one for each SCAN. log(4, "Setting up FourierUpdateKernel") - kern.FUK = FourierUpdateKernel(aux, nmodes, queue_thread=self.queue) + kern.FUK = FourierUpdateKernel(aux_f, nmodes, queue_thread=self.queue) kern.FUK.allocate() log(4, "Setting up PoUpdateKernel") @@ -135,6 +146,18 @@ def _setup_kernels(self): kern.PROP.allocate() kern.resolution = geo.resolution[0] + if kern.resample: + logger.info("Setting up Resample kernel") + kern.RSMP = ResampleKernel(queue=self.queue) + + logger.info("Setting up abs**2 kernel") + kern.ABS_SQR = ElementwiseKernel( + "pycuda::complex* in, float* out", + "float abst = abs(in[i]); out[i] = abst*abst;", + "abs_sqr", + options=debug_options + ) + if self.do_position_refinement: log(4, "Setting up PositionCorrectionKernel") kern.PCK = PositionCorrectionKernel(aux, nmodes, self.p.position_refinement, geo.resolution, queue_thread=self.queue) @@ -186,6 +209,11 @@ def engine_prepare(self): if self.do_position_refinement: prep.error_state_gpu = gpuarray.empty_like(prep.err_fourier_gpu) + kern = self.kernels[prep.label] + if kern.resample: + kern.aux_tmp1_gpu = gpuarray.to_gpu(kern.aux_tmp1) + kern.aux_tmp2_gpu = gpuarray.to_gpu(kern.aux_tmp2) + def engine_iterate(self, num=1): """ Compute one iteration. @@ -206,6 +234,9 @@ def engine_iterate(self, num=1): FUK = kern.FUK AWK = kern.AWK PROP = kern.PROP + if kern.resample: + ABS_SQR = kern.ABS_SQR + RSMP = kern.RSMP # get addresses and buffers addr = prep.addr_gpu @@ -217,6 +248,12 @@ def engine_iterate(self, num=1): pbound = self.pbound_scan[prep.label] aux = kern.aux + # resampling + resample = kern.resample + if resample: + aux_tmp1 = kern.aux_tmp1_gpu + aux_tmp2 = kern.aux_tmp2_gpu + # local references ma = self.ma.S[dID].gpu ob = self.ob.S[oID].gpu @@ -227,7 +264,12 @@ def engine_iterate(self, num=1): if self.p.compute_log_likelihood: AWK.build_aux_no_ex(aux, addr, ob, pr) PROP.fw(aux, aux) - FUK.log_likelihood(aux, addr, mag, ma, err_phot) + if resample: + ABS_SQR(aux, aux_tmp1, stream=self.queue) + RSMP.resample(aux_tmp2, aux_tmp1) + FUK.log_likelihood(aux_tmp2, addr, mag, ma, err_phot, aux_is_intensity=True) + else: + FUK.log_likelihood(aux, addr, mag, ma, err_phot) ## build auxilliary wave AWK.build_aux(aux, addr, ob, pr, ex, alpha=self.p.alpha) @@ -236,16 +278,31 @@ def engine_iterate(self, num=1): PROP.fw(aux, aux) ## Deviation from measured data - FUK.fourier_error(aux, addr, mag, ma, ma_sum) - FUK.error_reduce(addr, err_fourier) - FUK.fmag_all_update(aux, addr, mag, ma, err_fourier, pbound) + if resample: + ABS_SQR(aux, aux_tmp1, stream=self.queue) + RSMP.resample(aux_tmp2, aux_tmp1) + FUK.fourier_error(aux_tmp2, addr, mag, ma, ma_sum, aux_is_intensity=True) + FUK.error_reduce(addr, err_fourier) + FUK.fmag_all_update(aux_tmp2, addr, mag, ma, err_fourier, pbound, mult=False) + RSMP.resample(aux_tmp1, aux_tmp2) + # aux *= aux_tmp1 , but with stream + aux._elwise_multiply(aux_tmp1, aux, stream=self.queue) + else: + FUK.fourier_error(aux, addr, mag, ma, ma_sum) + FUK.error_reduce(addr, err_fourier) + FUK.fmag_all_update(aux, addr, mag, ma, err_fourier, pbound) ## backward FFT PROP.bw(aux, aux) ## build exit wave AWK.build_exit(aux, addr, ob, pr, ex, alpha=self.p.alpha) - FUK.exit_error(aux, addr) + if resample: + ABS_SQR(aux, aux_tmp1, stream=self.queue) + RSMP.resample(aux_tmp2, aux_tmp1) + FUK.exit_error(aux_tmp2, addr, aux_is_intensity=True) + else: + FUK.exit_error(aux, addr) FUK.error_reduce(addr, err_exit) parallel.barrier() diff --git a/ptypy/accelerate/cuda_pycuda/engines/DM_pycuda_stream.py b/ptypy/accelerate/cuda_pycuda/engines/DM_pycuda_stream.py index 9306475b1..3f28d3c78 100644 --- a/ptypy/accelerate/cuda_pycuda/engines/DM_pycuda_stream.py +++ b/ptypy/accelerate/cuda_pycuda/engines/DM_pycuda_stream.py @@ -114,6 +114,10 @@ def engine_prepare(self): prep.err_exit_gpu = gpuarray.to_gpu(prep.err_exit) if self.do_position_refinement: prep.error_state_gpu = gpuarray.empty_like(prep.err_fourier_gpu) + kern = self.kernels[prep.label] + if kern.resample: + kern.aux_tmp1_gpu = gpuarray.to_gpu(kern.aux_tmp1) + kern.aux_tmp2_gpu = gpuarray.to_gpu(kern.aux_tmp2) ma = self.ma.S[dID].data.astype(np.float32) prep.ma = cuda.pagelocked_empty(ma.shape, ma.dtype, order="C", mem_flags=4) prep.ma[:] = ma @@ -179,10 +183,19 @@ def engine_iterate(self, num=1): FUK = kern.FUK AWK = kern.AWK POK = kern.POK + PROP = kern.PROP + if kern.resample: + ABS_SQR = kern.ABS_SQR + RSMP = kern.RSMP pbound = self.pbound_scan[prep.label] aux = kern.aux - PROP = kern.PROP + + # resampling + resample = kern.resample + if resample: + aux_tmp1 = kern.aux_tmp1_gpu + aux_tmp2 = kern.aux_tmp2_gpu # get addresses and auxilliary array addr = prep.addr_gpu @@ -214,6 +227,17 @@ def engine_iterate(self, num=1): if self.p.compute_log_likelihood: AWK.build_aux_no_ex(aux, addr, ob, pr) PROP.fw(aux, aux) + if resample: + ABS_SQR(aux, aux_tmp1, stream=self.queue) + RSMP.resample(aux_tmp2, aux_tmp1) + # synchronize h2d stream with compute stream + self.queue.wait_for_event(ev_mag) + FUK.log_likelihood(aux_tmp2, addr, mag, ma, err_phot, aux_is_intensity=True) + else: + # synchronize h2d stream with compute stream + self.queue.wait_for_event(ev_mag) + FUK.log_likelihood(aux, addr, mag, ma, err_phot) + # synchronize h2d stream with compute stream self.queue.wait_for_event(ev_mag) FUK.log_likelihood(aux, addr, mag, ma, err_phot) @@ -226,11 +250,23 @@ def engine_iterate(self, num=1): PROP.fw(aux, aux) ## Deviation from measured data - # synchronize h2d stream with compute stream - self.queue.wait_for_event(ev_mag) - FUK.fourier_error(aux, addr, mag, ma, ma_sum) - FUK.error_reduce(addr, err_fourier) - FUK.fmag_all_update(aux, addr, mag, ma, err_fourier, pbound) + if resample: + ABS_SQR(aux, aux_tmp1, stream=self.queue) + RSMP.resample(aux_tmp2, aux_tmp1) + # synchronize h2d stream with compute stream + self.queue.wait_for_event(ev_mag) + FUK.fourier_error(aux_tmp2, addr, mag, ma, ma_sum, aux_is_intensity=True) + FUK.error_reduce(addr, err_fourier) + FUK.fmag_all_update(aux_tmp2, addr, mag, ma, err_fourier, pbound, mult=False) + RSMP.resample(aux_tmp1, aux_tmp2) + # aux *= aux_tmp1 , but with stream + aux._elwise_multiply(aux_tmp1, aux, stream=self.queue) + else: + # synchronize h2d stream with compute stream + self.queue.wait_for_event(ev_mag) + FUK.fourier_error(aux, addr, mag, ma, ma_sum) + FUK.error_reduce(addr, err_fourier) + FUK.fmag_all_update(aux, addr, mag, ma, err_fourier, pbound) data_mag.record_done(self.queue, 'compute') data_ma.record_done(self.queue, 'compute') @@ -238,7 +274,12 @@ def engine_iterate(self, num=1): PROP.bw(aux, aux) ## apply changes AWK.build_exit(aux, addr, ob, pr, ex, alpha=self.p.alpha) - FUK.exit_error(aux, addr) + if resample: + ABS_SQR(aux, aux_tmp1, stream=self.queue) + RSMP.resample(aux_tmp2, aux_tmp1) + FUK.exit_error(aux_tmp2, addr, aux_is_intensity=True) + else: + FUK.exit_error(aux, addr) FUK.error_reduce(addr, err_exit) prestr = '%d Iteration (Overlap) #%02d: ' % (parallel.rank, inner) diff --git a/ptypy/accelerate/cuda_pycuda/engines/DM_pycuda_streams.py b/ptypy/accelerate/cuda_pycuda/engines/DM_pycuda_streams.py index d2db342f5..e6b045786 100644 --- a/ptypy/accelerate/cuda_pycuda/engines/DM_pycuda_streams.py +++ b/ptypy/accelerate/cuda_pycuda/engines/DM_pycuda_streams.py @@ -182,6 +182,11 @@ def engine_prepare(self): prep.err_exit_gpu = gpuarray.to_gpu(prep.err_exit) if self.do_position_refinement: prep.error_state_gpu = gpuarray.empty_like(prep.err_fourier_gpu) + kern = self.kernels[prep.label] + if kern.resample: + kern.aux_tmp1_gpu = gpuarray.to_gpu(kern.aux_tmp1) + kern.aux_tmp2_gpu = gpuarray.to_gpu(kern.aux_tmp2) + ma = self.ma.S[dID].data.astype(np.float32) prep.ma = cuda.pagelocked_empty(ma.shape, ma.dtype, order="C", mem_flags=4) prep.ma[:] = ma @@ -290,13 +295,20 @@ def engine_iterate(self, num=1): pbound = self.pbound_scan[prep.label] aux = kern.aux PROP = kern.PROP - + # set streams queue = streamdata.queue FUK.queue = queue AWK.queue = queue POK.queue = queue PROP.queue = queue + resample = kern.resample + if resample: + ABS_SQR = kern.ABS_SQR + RSMP = kern.RSMP + RSMP.queue = queue + aux_tmp1 = kern.aux_tmp1_gpu + aux_tmp2 = kern.aux_tmp2_gpu # get addresses and auxilliary array addr = prep.addr_gpu @@ -331,7 +343,12 @@ def engine_iterate(self, num=1): t1 = time.time() AWK.build_aux_no_ex(aux, addr, ob, pr) PROP.fw(aux, aux) - FUK.log_likelihood(aux, addr, mag, ma, err_phot) + if resample: + ABS_SQR(aux, aux_tmp1, stream=queue) + RSMP.resample(aux_tmp2, aux_tmp1) + FUK.log_likelihood(aux_tmp2, addr, mag, ma, err_phot, aux_is_intensity=True) + else: + FUK.log_likelihood(aux, addr, mag, ma, err_phot) self.benchmark.F_LLerror += time.time() - t1 ## prep + forward FFT @@ -345,9 +362,19 @@ def engine_iterate(self, num=1): ## Deviation from measured data t1 = time.time() - FUK.fourier_error(aux, addr, mag, ma, ma_sum) - FUK.error_reduce(addr, err_fourier) - FUK.fmag_all_update(aux, addr, mag, ma, err_fourier, pbound) + if resample: + ABS_SQR(aux, aux_tmp1, stream=queue) + RSMP.resample(aux_tmp2, aux_tmp1) + FUK.fourier_error(aux_tmp2, addr, mag, ma, ma_sum, aux_is_intensity=True) + FUK.error_reduce(addr, err_fourier) + FUK.fmag_all_update(aux_tmp2, addr, mag, ma, err_fourier, pbound, mult=False) + RSMP.resample(aux_tmp1, aux_tmp2) + # aux *= aux_tmp1 , but with stream + aux._elwise_multiply(aux_tmp1, aux, stream=queue) + else: + FUK.fourier_error(aux, addr, mag, ma, ma_sum) + FUK.error_reduce(addr, err_fourier) + FUK.fmag_all_update(aux, addr, mag, ma, err_fourier, pbound) self.benchmark.C_Fourier_update += time.time() - t1 streamdata.record_done_ma(dID) @@ -356,7 +383,12 @@ def engine_iterate(self, num=1): PROP.bw(aux, aux) ## apply changes AWK.build_exit(aux, addr, ob, pr, ex, alpha=self.p.alpha) - FUK.exit_error(aux, addr) + if resample: + ABS_SQR(aux, aux_tmp1, stream=queue) + RSMP.resample(aux_tmp2, aux_tmp1) + FUK.exit_error(aux_tmp2, addr, aux_is_intensity=True) + else: + FUK.exit_error(aux, addr) FUK.error_reduce(addr, err_exit) self.benchmark.E_Build_exit += time.time() - t1 diff --git a/ptypy/accelerate/cuda_pycuda/kernels.py b/ptypy/accelerate/cuda_pycuda/kernels.py index 47dd4cb79..da0c7848d 100644 --- a/ptypy/accelerate/cuda_pycuda/kernels.py +++ b/ptypy/accelerate/cuda_pycuda/kernels.py @@ -162,18 +162,26 @@ def __init__(self, aux, nmodes=1, queue_thread=None, accumulate_type='float', ma self.accumulate_type = accumulate_type self.math_type = math_type self.queue = queue_thread - self.fmag_all_update_cuda = load_kernel("fmag_all_update", { - 'IN_TYPE': 'float', - 'OUT_TYPE': 'float', - 'MATH_TYPE': self.math_type - }) + self.fmag_all_update_cuda, self.fmag_all_update_nomult_cuda = load_kernel( + ["fmag_all_update", "fmag_all_update_nomult"], + { + 'IN_TYPE': 'float', + 'OUT_TYPE': 'float', + 'MATH_TYPE': self.math_type + }, + "fmag_all_update.cu") self.fmag_update_nopbound_cuda = None + self.fmag_update_nopbound_nomult_cuda = None self.fourier_deviation_cuda = None - self.fourier_error_cuda = load_kernel("fourier_error", { - 'IN_TYPE': 'float', - 'OUT_TYPE': 'float', - 'MATH_TYPE': self.math_type - }) + self.fourier_deviation_auxintensity_cuda = None + self.fourier_error_cuda, self.fourier_error_auxintensity_cuda = load_kernel( + ["fourier_error", "fourier_error_auxintensity"], + { + 'IN_TYPE': 'float', + 'OUT_TYPE': 'float', + 'MATH_TYPE': self.math_type + }, + "fourier_error.cu") self.fourier_error2_cuda = None self.error_reduce_cuda = load_kernel("error_reduce", { 'IN_TYPE': 'float', @@ -183,18 +191,23 @@ def __init__(self, aux, nmodes=1, queue_thread=None, accumulate_type='float', ma 'BDIM_Y': 32, }) self.fourier_update_cuda = None - self.log_likelihood_cuda, self.log_likelihood2_cuda = load_kernel( - ("log_likelihood", "log_likelihood2"), { + self.log_likelihood_cuda, self.log_likelihood_auxintensity_cuda, \ + self.log_likelihood2_cuda, self.log_likelihood2_auxintensity_cuda = \ + load_kernel( + ("log_likelihood", "log_likelihood_auxintensity", "log_likelihood2", "log_likelihood2_auxintensity"), { 'IN_TYPE': 'float', 'OUT_TYPE': 'float', 'MATH_TYPE': self.math_type }, "log_likelihood.cu") - self.exit_error_cuda = load_kernel("exit_error", { - 'IN_TYPE': 'float', - 'OUT_TYPE': 'float', - 'MATH_TYPE': self.math_type - }) + self.exit_error_cuda, self.exit_error_auxintensity_cuda = load_kernel( + ["exit_error", "exit_error_auxintensity"], + { + 'IN_TYPE': 'float', + 'OUT_TYPE': 'float', + 'MATH_TYPE': self.math_type + }, + "exit_error.cu") self.gpu = Adict() self.gpu.fdev = None @@ -204,24 +217,26 @@ def allocate(self): self.gpu.fdev = gpuarray.zeros(self.fshape, dtype=np.float32) self.gpu.ferr = gpuarray.zeros(self.fshape, dtype=np.float32) - def fourier_error(self, f, addr, fmag, fmask, mask_sum): + def fourier_error(self, f, addr, fmag, fmask, mask_sum, aux_is_intensity=False): + assert (aux_is_intensity and f.dtype == np.float32) or (not aux_is_intensity and f.dtype == np.complex64) fdev = self.gpu.fdev ferr = self.gpu.ferr if True: + kernel = self.fourier_error_cuda if not aux_is_intensity else self.fourier_error_auxintensity_cuda # version going over all modes in a single thread (faster) - self.fourier_error_cuda(np.int32(self.nmodes), - f, - fmask, - fmag, - fdev, - ferr, - mask_sum, - addr, - np.int32(self.fshape[1]), - np.int32(self.fshape[2]), - block=(32, 32, 1), - grid=(int(fmag.shape[0]), 1, 1), - stream=self.queue) + kernel(np.int32(self.nmodes), + f, + fmask, + fmag, + fdev, + ferr, + mask_sum, + addr, + np.int32(self.fshape[1]), + np.int32(self.fshape[2]), + block=(32, 32, 1), + grid=(int(fmag.shape[0]), 1, 1), + stream=self.queue) else: # version using one thread per mode + shared mem reduction (slower) if self.fourier_error2_cuda is None: @@ -249,26 +264,31 @@ def fourier_error(self, f, addr, fmag, fmask, mask_sum): shared=int(bx*by*bz*4), stream=self.queue) - def fourier_deviation(self, f, addr, fmag): + def fourier_deviation(self, f, addr, fmag, aux_is_intensity=False): + assert (aux_is_intensity and f.dtype == np.float32) or (not aux_is_intensity and f.dtype == np.complex64) fdev = self.gpu.fdev if self.fourier_deviation_cuda is None: - self.fourier_deviation_cuda = load_kernel("fourier_deviation",{ - 'IN_TYPE': 'float', - 'OUT_TYPE': 'float', - 'MATH_TYPE': self.math_type - }) + self.fourier_deviation_cuda, self.fourier_deviation_auxintensity_cuda = load_kernel( + ["fourier_deviation", "fourier_deviation_auxintensity"], + { + 'IN_TYPE': 'float', + 'OUT_TYPE': 'float', + 'MATH_TYPE': self.math_type + }, + "fourier_deviation.cu") + kernel = self.fourier_deviation_auxintensity_cuda if aux_is_intensity else self.fourier_deviation_cuda bx = 64 by = 1 - self.fourier_deviation_cuda(np.int32(self.nmodes), - f, - fmag, - fdev, - addr, - np.int32(self.fshape[1]), - np.int32(self.fshape[2]), - block=(bx, by, 1), - grid=(1, int((self.fshape[2] + by - 1)//by), int(fmag.shape[0])), - stream=self.queue) + kernel(np.int32(self.nmodes), + f, + fmag, + fdev, + addr, + np.int32(self.fshape[1]), + np.int32(self.fshape[2]), + block=(bx, by, 1), + grid=(1, int((self.fshape[2] + by - 1)//by), int(fmag.shape[0])), + stream=self.queue) def error_reduce(self, addr, err_sum): @@ -280,43 +300,50 @@ def error_reduce(self, addr, err_sum): grid=(int(err_sum.shape[0]), 1, 1), stream=self.queue) - def fmag_all_update(self, f, addr, fmag, fmask, err_fmag, pbound=0.0): + def fmag_all_update(self, f, addr, fmag, fmask, err_fmag, pbound=0.0, mult=True): + assert (mult and f.dtype == np.complex64) or (not mult and f.dtype == np.float32) + fdev = self.gpu.fdev - self.fmag_all_update_cuda(f, - fmask, - fmag, - fdev, - err_fmag, - addr, - np.float32(pbound), - np.int32(self.fshape[1]), - np.int32(self.fshape[2]), - block=(32, 32, 1), - grid=(int(fmag.shape[0]*self.nmodes), 1, 1), - stream=self.queue) + kernel = self.fmag_all_update_cuda if mult else self.fmag_all_update_nomult_cuda + kernel(f, + fmask, + fmag, + fdev, + err_fmag, + addr, + np.float32(pbound), + np.int32(self.fshape[1]), + np.int32(self.fshape[2]), + block=(32, 32, 1), + grid=(int(fmag.shape[0]*self.nmodes), 1, 1), + stream=self.queue) - def fmag_update_nopbound(self, f, addr, fmag, fmask): + def fmag_update_nopbound(self, f, addr, fmag, fmask, mult=True): + assert (mult and f.dtype == np.complex64) or (not mult and f.dtype == np.float32) fdev = self.gpu.fdev bx = 64 by = 1 if self.fmag_update_nopbound_cuda is None: - self.fmag_update_nopbound_cuda = load_kernel("fmag_update_nopbound", { - 'IN_TYPE': 'float', - 'OUT_TYPE': 'float', - 'MATH_TYPE': self.math_type - }) - self.fmag_update_nopbound_cuda(f, - fmask, - fmag, - fdev, - addr, - np.int32(self.fshape[1]), - np.int32(self.fshape[2]), - block=(bx, by, 1), - grid=(1, - int((self.fshape[2] + by - 1) // by), - int(fmag.shape[0]*self.nmodes)), - stream=self.queue) + self.fmag_update_nopbound_cuda, self.fmag_update_nopbound_nomult_cuda = load_kernel( + ["fmag_update_nopbound","fmag_update_nopbound_nomult"], + { + 'IN_TYPE': 'float', + 'OUT_TYPE': 'float', + 'MATH_TYPE': self.math_type + }, "fmag_update_nopbound.cu") + kernel = self.fmag_update_nopbound_cuda if mult else self.fmag_update_nopbound_nomult_cuda + kernel(f, + fmask, + fmag, + fdev, + addr, + np.int32(self.fshape[1]), + np.int32(self.fshape[2]), + block=(bx, by, 1), + grid=(1, + int((self.fshape[2] + by - 1) // by), + int(fmag.shape[0]*self.nmodes)), + stream=self.queue) # Note: this was a test to join the kernels, but it's > 2x slower! def fourier_update(self, f, addr, fmag, fmask, mask_sum, err_fmag, pbound=0): @@ -350,53 +377,61 @@ def fourier_update(self, f, addr, fmag, fmask, mask_sum, err_fmag, pbound=0): shared=smem, stream=self.queue) - def log_likelihood(self, b_aux, addr, mag, mask, err_phot): + def log_likelihood(self, b_aux, addr, mag, mask, err_phot, aux_is_intensity=False): + assert (aux_is_intensity and b_aux.dtype == np.float32) or (not aux_is_intensity and b_aux.dtype == np.complex64) ferr = self.gpu.ferr - self.log_likelihood_cuda(np.int32(self.nmodes), - b_aux, - mask, - mag, - addr, - ferr, - np.int32(self.fshape[1]), - np.int32(self.fshape[2]), - block=(32, 32, 1), - grid=(int(mag.shape[0]), 1, 1), - stream=self.queue) + kernel = self.log_likelihood_cuda if not aux_is_intensity else self.log_likelihood_auxintensity_cuda + kernel(np.int32(self.nmodes), + b_aux, + mask, + mag, + addr, + ferr, + np.int32(self.fshape[1]), + np.int32(self.fshape[2]), + block=(32, 32, 1), + grid=(int(mag.shape[0]), 1, 1), + stream=self.queue) # TODO: we might want to move this call outside of here self.error_reduce(addr, err_phot) - def log_likelihood2(self, b_aux, addr, mag, mask, err_phot): + def log_likelihood2(self, b_aux, addr, mag, mask, err_phot, aux_is_intensity=False): + assert (aux_is_intensity and b_aux.dtype == np.float32) or (not aux_is_intensity and b_aux.dtype == np.complex64) ferr = self.gpu.ferr bx = 64 by = 1 - self.log_likelihood2_cuda(np.int32(self.nmodes), - b_aux, - mask, - mag, - addr, - ferr, - np.int32(self.fshape[1]), - np.int32(self.fshape[2]), - block=(bx, by, 1), - grid=(1, int((self.fshape[1] + by - 1) // by), int(mag.shape[0])), - stream=self.queue) + kernel = self.log_likelihood2_cuda if not aux_is_intensity else self.log_likelihood2_auxintensity_cuda + kernel(np.int32(self.nmodes), + b_aux, + mask, + mag, + addr, + ferr, + np.int32(self.fshape[1]), + np.int32(self.fshape[2]), + block=(bx, by, 1), + grid=(1, int((self.fshape[1] + by - 1) // by), int(mag.shape[0])), + stream=self.queue) # TODO: we might want to move this call outside of here self.error_reduce(addr, err_phot) - def exit_error(self, aux, addr): + def exit_error(self, aux, addr, aux_is_intensity=False): + assert (aux_is_intensity and aux.dtype == np.float32) or \ + (not aux_is_intensity and aux.dtype == np.complex64) + sh = addr.shape maxz = sh[0] ferr = self.gpu.ferr - self.exit_error_cuda(np.int32(self.nmodes), - aux, - ferr, - addr, - np.int32(self.fshape[1]), - np.int32(self.fshape[2]), - block=(32, 32, 1), - grid=(int(maxz), 1, 1), - stream=self.queue) + kernel = self.exit_error_auxintensity_cuda if aux_is_intensity else self.exit_error_cuda + kernel(np.int32(self.nmodes), + aux, + ferr, + addr, + np.int32(self.fshape[1]), + np.int32(self.fshape[2]), + block=(32, 32, 1), + grid=(int(maxz), 1, 1), + stream=self.queue) # DEPRECATED? def execute(self, kernel_name=None, compare=False, sync=False): diff --git a/ptypy/core/geometry.py b/ptypy/core/geometry.py index 05a2b630e..f2f67a731 100644 --- a/ptypy/core/geometry.py +++ b/ptypy/core/geometry.py @@ -386,7 +386,7 @@ def upsample(self, c): """ if self.resample == 1: return c - return u.zoom(c, self.resample, order=0) / (self.resample**2) + return u.repeat_2d(c, self.resample)[0] / (self.resample**2) def downsample(self, a): """ @@ -394,7 +394,7 @@ def downsample(self, a): """ if self.resample == 1: return a - return u.rebin_2d(a, self.resample)[0] + return u.rebin_2d(a, self.resample)[0] * (self.resample**2) def __str__(self): keys = list(self.p.keys()) diff --git a/ptypy/utils/array_utils.py b/ptypy/utils/array_utils.py index a6dc3ede9..0942f3f83 100644 --- a/ptypy/utils/array_utils.py +++ b/ptypy/utils/array_utils.py @@ -15,7 +15,7 @@ __all__ = ['grids', 'switch_orientation', 'mirror', 'crop_pad_symmetric_2d', 'crop_pad_axis', 'crop_pad', 'pad_lr', 'zoom', 'shift_zoom', 'c_zoom', - 'rebin', 'rebin_2d', 'rectangle', 'ellipsis'] + 'rebin', 'rebin_2d', 'repeat_2d', 'rectangle', 'ellipsis'] def switch_orientation(A, orientation, center=None): @@ -105,6 +105,33 @@ def rebin_2d(A, rebin=1): else: return A.reshape(-1, newdim[0], rebin, newdim[1], rebin).mean(-1).mean(-2) +def repeat_2d(A, repeat=1): + """ + Repeats array `A` symmetrically along the last 2 axes + with factor `repeat`. + + Parameters + ---------- + A : array-like + input array, must be at least two-dimensional. + + repeat : int + repeat factor, ``repeat=2`` means that one pixel will be repeated + into a square of 4 pixels. + + Returns + ------- + out : ndarray + repeated array. Note that data type casting follows numpy rules, so that boolean arrays are converted to int. + + See also + -------- + rebin_2d + """ + sh = np.asarray(A.shape[-2:]) + newdim = sh * repeat + assert (sh % repeat == 0).all(), "Last two axes %s of input array `A` cannot be repeated by %s" % (str(tuple(sh)), str(repeat)) + return np.repeat(np.repeat(A.reshape((-1,sh[0],1,sh[1],1)),repeat, axis=-4),repeat,axis=-2).reshape(-1,newdim[0],newdim[1]) def crop_pad_symmetric_2d(A, newshape, center=None): """ diff --git a/templates/minimal_prep_and_run_resample_DM.py b/templates/minimal_prep_and_run_resample_DM.py index b06281223..570d166e2 100644 --- a/templates/minimal_prep_and_run_resample_DM.py +++ b/templates/minimal_prep_and_run_resample_DM.py @@ -48,3 +48,4 @@ # prepare and run P = Ptycho(p,level=4) P.run() +P.finalize() diff --git a/templates/minimal_prep_and_run_resample_DM_serial.py b/templates/minimal_prep_and_run_resample_DM_serial.py new file mode 100644 index 000000000..91b037441 --- /dev/null +++ b/templates/minimal_prep_and_run_resample_DM_serial.py @@ -0,0 +1,53 @@ +""" +This script is a test for ptychographic reconstruction in the absence +of actual data. It uses the test Scan class +`ptypy.core.data.MoonFlowerScan` to provide "data". +""" + +from ptypy.core import Ptycho +from ptypy import utils as u +from ptypy.accelerate.base.engines import DM_serial + +p = u.Param() +# for verbose output +p.verbose_level = 3 +p.frames_per_block = 200 + +# set home path +p.io = u.Param() +p.io.home = "/tmp/ptypy/" +p.io.autosave = u.Param(active=False) + +# max 200 frames (128x128px) of diffraction data +p.scans = u.Param() +p.scans.MF = u.Param() +# now you have to specify which ScanModel to use with scans.XX.name, +# just as you have to give 'name' for engines and PtyScan subclasses. +p.scans.MF.name = 'Vanilla' # or 'Full' +p.scans.MF.data= u.Param() +p.scans.MF.data.name = 'MoonFlowerScan' +p.scans.MF.data.shape = 128 +p.scans.MF.data.num_frames = 200 +p.scans.MF.data.save = None + +# position distance in fraction of illumination frame +p.scans.MF.data.density = 0.2 +# total number of photon in empty beam +p.scans.MF.data.photons = 1e8 +# Gaussian FWHM of possible detector blurring +p.scans.MF.data.psf = 0. + +# Resample by a factor of 2 +p.scans.MF.resample = 2 + +# attach a reconstrucion engine +p.engines = u.Param() +p.engines.engine00 = u.Param() +p.engines.engine00.name = 'DM_serial' +p.engines.engine00.numiter = 100 +p.engines.engine00.probe_support = 0.05 + +# prepare and run +P = Ptycho(p,level=4) +P.run() +P.finalize() diff --git a/templates/minimal_prep_and_run_resample_ML.py b/templates/minimal_prep_and_run_resample_ML.py index 2edbb8bcc..0ebfc05ad 100644 --- a/templates/minimal_prep_and_run_resample_ML.py +++ b/templates/minimal_prep_and_run_resample_ML.py @@ -56,4 +56,5 @@ # prepare and run P = Ptycho(p,level=4) P.run() +P.finalize() diff --git a/templates/minimal_prep_and_run_resample_ML_serial.py b/templates/minimal_prep_and_run_resample_ML_serial.py new file mode 100644 index 000000000..926917089 --- /dev/null +++ b/templates/minimal_prep_and_run_resample_ML_serial.py @@ -0,0 +1,61 @@ +""" +This script is a test for ptychographic reconstruction in the absence +of actual data. It uses the test Scan class +`ptypy.core.data.MoonFlowerScan` to provide "data". +""" +#import ptypy +from ptypy.core import Ptycho +from ptypy import utils as u +from ptypy.accelerate.base.engines import ML_serial + +p = u.Param() + +# for verbose output +p.verbose_level = 4 +p.frames_per_block = 100 + +# set home path +p.io = u.Param() +p.io.home = "/tmp/ptypy/" +p.io.autosave = u.Param(active=False) +#p.io.autoplot = u.Param() +#p.io.autoplot.dump = True +#p.io.autoplot = False + +# max 100 frames (128x128px) of diffraction data +p.scans = u.Param() +p.scans.MF = u.Param() +p.scans.MF.name = 'Full' +p.scans.MF.data= u.Param() +p.scans.MF.data.name = 'MoonFlowerScan' +p.scans.MF.data.shape = 128 +p.scans.MF.data.num_frames = 100 +p.scans.MF.data.save = None + +# position distance in fraction of illumination frame +p.scans.MF.data.density = 0.2 +# total number of photon in empty beam +p.scans.MF.data.photons = 1e8 +# Gaussian FWHM of possible detector blurring +p.scans.MF.data.psf = 0. + +# Resample by a factor of 2 +p.scans.MF.resample = 2 + +p.engines = u.Param() +p.engines.engine01 = u.Param() +p.engines.engine01.name = 'ML_serial' +p.engines.engine01.reg_del2 = True # Whether to use a Gaussian prior (smoothing) regularizer +p.engines.engine01.reg_del2_amplitude = 1. # Amplitude of the Gaussian prior if used +p.engines.engine01.scale_precond = True +p.engines.engine01.scale_probe_object = 1. +p.engines.engine01.smooth_gradient = 20. +p.engines.engine01.smooth_gradient_decay = 1/50. +p.engines.engine01.floating_intensities = True +p.engines.engine01.numiter = 300 + +# prepare and run +P = Ptycho(p,level=4) +P.run() +P.finalize() + diff --git a/test/accelerate_tests/base_tests/array_utils_test.py b/test/accelerate_tests/base_tests/array_utils_test.py index b1cac58fe..158656456 100644 --- a/test/accelerate_tests/base_tests/array_utils_test.py +++ b/test/accelerate_tests/base_tests/array_utils_test.py @@ -291,6 +291,69 @@ def test_crop_pad_3(self): dtype=np.complex64) np.testing.assert_array_almost_equal(A, exp_A) + def test_downsample_factor_2(self): + # downsample, complex, 3D + B = np.indices((4,4), dtype=np.complex64) + B = np.indices((4,4), dtype=np.complex64) + 1j * B[::-1, :, :] + A = np.zeros((2,2,2), dtype=B.dtype) + au.resample(A,B) + exp_A = np.array([[[ 2. +2.j, 2.+10.j], + [10. +2.j, 10.+10.j]], + [[ 2. +2.j, 10.+ 2.j], + [ 2.+10.j, 10.+10.j]]], dtype=np.complex64) + np.testing.assert_almost_equal(A.sum(), B.sum()) + np.testing.assert_array_almost_equal(A, exp_A) + + def test_upsample_factor_2(self): + # upsample, complex, 3D + B = np.indices((2,2), dtype=np.complex64) + B = np.indices((2,2), dtype=np.complex64) + 1j * B[::-1, :, :] + A = np.zeros((2,4,4), dtype=B.dtype) + au.resample(A,B) + exp_A = np.array([[[0. +0.j , 0. +0.j , 0. +0.25j, 0. +0.25j], + [0. +0.j , 0. +0.j , 0. +0.25j, 0. +0.25j], + [0.25+0.j , 0.25+0.j , 0.25+0.25j, 0.25+0.25j], + [0.25+0.j , 0.25+0.j , 0.25+0.25j, 0.25+0.25j]], + [[0. +0.j , 0. +0.j , 0.25+0.j , 0.25+0.j ], + [0. +0.j , 0. +0.j , 0.25+0.j , 0.25+0.j ], + [0. +0.25j, 0. +0.25j, 0.25+0.25j, 0.25+0.25j], + [0. +0.25j, 0. +0.25j, 0.25+0.25j, 0.25+0.25j]]], dtype=np.complex64) + np.testing.assert_almost_equal(A.sum(), B.sum()) + np.testing.assert_array_almost_equal(A, exp_A) + + def test_downsample_factor_4(self): + # downsample, complex, 3D + B = np.indices((8,8), dtype=np.complex64) + B = np.indices((8,8), dtype=np.complex64) + 1j * B[::-1, :, :] + A = np.zeros((2,2,2), dtype=B.dtype) + au.resample(A,B) + exp_A = np.array([[[24.+24.j, 24.+88.j], + [88.+24.j, 88.+88.j]], + [[24.+24.j, 88.+24.j], + [24.+88.j, 88.+88.j]]], dtype=np.complex64) + np.testing.assert_almost_equal(A.sum(), B.sum()) + np.testing.assert_array_almost_equal(A, exp_A) + + def test_upsample_factor_4(self): + # upsample, complex, 3D + Bshape = (2,4,4) + B = np.reshape(np.arange(0, np.prod(Bshape)), Bshape).astype(np.complex64) * 16 + B = B + 1j * B + A = np.zeros((Bshape[0], Bshape[1]*4, Bshape[2]*4), dtype=B.dtype) + au.resample(A, B) + exp_A = np.zeros_like(A) + + # building the expected value element-wise, to ensure correctness + for z in range(A.shape[0]): + for y in range(A.shape[1]): + for x in range(A.shape[2]): + exp_A[z, y, x] = B[z, y//4, x//4] / 16 + if np.abs(A[z, y, x]-exp_A[z, y, x]) > 1e-6: + print("mismatch! i=({},{},{}): act={}, exp={}".format(z, y, x, A[z, y, x], exp_A[z, y, x])) + + np.testing.assert_almost_equal(A.sum(), B.sum()) + np.testing.assert_array_almost_equal(A, exp_A) + if __name__ == '__main__': unittest.main() diff --git a/test/accelerate_tests/base_tests/fourier_update_kernel_test.py b/test/accelerate_tests/base_tests/fourier_update_kernel_test.py index 00ad5e3d3..dd2d3a7a6 100644 --- a/test/accelerate_tests/base_tests/fourier_update_kernel_test.py +++ b/test/accelerate_tests/base_tests/fourier_update_kernel_test.py @@ -196,6 +196,128 @@ def test_fourier_error(self): err_msg="ferr does not give the expected error " "for the fourier_update_kernel.fourier_error emthods") + + def test_fourier_error_aux_is_intensity(self): + ''' + setup + ''' + B = 5 # frame size y + C = 5 # frame size x + + D = 2 # number of probe modes + G = 2 # number og object modes + + E = B # probe size y + F = C # probe size x + + scan_pts = 2 # one dimensional scan point number + + N = scan_pts ** 2 + total_number_modes = G * D + A = N * total_number_modes # this is a 16 point scan pattern (4x4 grid) over all the modes + + f = np.empty(shape=(A, B, C), dtype=COMPLEX_TYPE) + for idx in range(A): + f[idx] = np.ones((B, C)) * (idx + 1) + 1j * np.ones((B, C)) * (idx + 1) + + fmag = np.empty(shape=(N, B, C), dtype=FLOAT_TYPE) # the measured magnitudes NxAxB + fmag_fill = np.arange(np.prod(fmag.shape)).reshape(fmag.shape).astype(fmag.dtype) + fmag[:] = fmag_fill + + mask = np.empty(shape=(N, B, C), dtype=FLOAT_TYPE)# the masks for the measured magnitudes either 1xAxB or NxAxB + mask_fill = np.ones_like(mask) + mask_fill[::2, ::2] = 0 # checkerboard for testing + mask[:] = mask_fill + + X, Y = np.meshgrid(range(scan_pts), range(scan_pts)) + X = X.reshape((N,)) + Y = Y.reshape((N,)) + + addr = np.zeros((N, total_number_modes, 5, 3)) + + exit_idx = 0 + position_idx = 0 + for xpos, ypos in zip(X, Y): + mode_idx = 0 + for pr_mode in range(D): + for ob_mode in range(G): + addr[position_idx, mode_idx] = np.array([[pr_mode, 0, 0], + [ob_mode, ypos, xpos], + [exit_idx, 0, 0], + [position_idx, 0, 0], + [position_idx, 0, 0]]) + mode_idx += 1 + exit_idx += 1 + position_idx += 1 + + + mask_sum = mask.sum(-1).sum(-1) + + err_fmag = np.zeros(N, dtype=FLOAT_TYPE) + pbound_set = 0.9 + FUK = FourierUpdateKernel(f, nmodes=total_number_modes) + FUK.allocate() + i = np.abs(f)**2 + FUK.fourier_error(i, addr, fmag, mask, mask_sum, aux_is_intensity=True) + + + expected_fdev = np.array([[[7.7459664, 6.7459664, 5.7459664, 4.7459664, 3.7459664], + [2.7459664, 1.7459664, 0.74596643, -0.25403357, -1.2540336], + [-2.2540336, -3.2540336, -4.2540336, -5.2540336, -6.2540336], + [-7.2540336, -8.254034, -9.254034, -10.254034, -11.254034], + [-12.254034, -13.254034, -14.254034, -15.254034, -16.254034]], + + [[-6.3452415, -7.3452415, -8.345242, -9.345242, -10.345242], + [-11.345242, -12.345242, -13.345242, -14.345242, -15.345242], + [-16.345242, -17.345242, -18.345242, -19.345242, -20.345242], + [-21.345242, -22.345242, -23.345242, -24.345242, -25.345242], + [-26.345242, -27.345242, -28.345242, -29.345242, -30.345242]], + + [[-20.13363, -21.13363, -22.13363, -23.13363, -24.13363], + [-25.13363, -26.13363, -27.13363, -28.13363, -29.13363], + [-30.13363, -31.13363, -32.13363, -33.13363, -34.13363], + [-35.13363, -36.13363, -37.13363, -38.13363, -39.13363], + [-40.13363, -41.13363, -42.13363, -43.13363, -44.13363]], + + [[-33.866074, -34.866074, -35.866074, -36.866074, -37.866074], + [-38.866074, -39.866074, -40.866074, -41.866074, -42.866074], + [-43.866074, -44.866074, -45.866074, -46.866074, -47.866074], + [-48.866074, -49.866074, -50.866074, -51.866074, -52.866074], + [-53.866074, -54.866074, -55.866074, -56.866074, -57.866074]]], + dtype=FLOAT_TYPE) + np.testing.assert_array_equal(FUK.npy.fdev, expected_fdev, + err_msg="fdev does not give the expected error " + "for the fourier_update_kernel.fourier_error emthods") + + expected_ferr = np.array([[[0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00], + [7.54033208e-01, 3.04839879e-01, 5.56465909e-02, 6.45330548e-03, 1.57260016e-01], + [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00], + [5.26210022e+00, 6.81290817e+00, 8.56371498e+00, 1.05145216e+01, 1.26653280e+01], + [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00]], + + [[1.61048353e+00, 2.15810299e+00, 2.78572226e+00, 3.49334168e+00, 4.28096104e+00], + [5.14858055e+00, 6.09619951e+00, 7.12381887e+00, 8.23143768e+00, 9.41905785e+00], + [1.06866770e+01, 1.20342960e+01, 1.34619150e+01, 1.49695349e+01, 1.65571537e+01], + [1.82247734e+01, 1.99723930e+01, 2.18000126e+01, 2.37076321e+01, 2.56952515e+01], + [2.77628708e+01, 2.99104881e+01, 3.21381073e+01, 3.44457283e+01, 3.68333473e+01]], + + [[0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00], + [6.31699409e+01, 6.82966690e+01, 7.36233978e+01, 7.91501160e+01, 8.48768463e+01], + [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00], + [1.23437180e+02, 1.30563919e+02, 1.37890640e+02, 1.45417374e+02, 1.53144089e+02], + [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00]], + + [[4.58764343e+01, 4.86257210e+01, 5.14550095e+01, 5.43642960e+01, 5.73535805e+01], + [6.04228668e+01, 6.35721550e+01, 6.68014374e+01, 7.01107254e+01, 7.35000076e+01], + [7.69692993e+01, 8.05185852e+01, 8.41478729e+01, 8.78571548e+01, 9.16464386e+01], + [9.55157242e+01, 9.94650116e+01, 1.03494293e+02, 1.07603584e+02, 1.11792870e+02], + [1.16062157e+02, 1.20411446e+02, 1.24840721e+02, 1.29350006e+02, 1.33939301e+02]]], + dtype=FLOAT_TYPE) + np.testing.assert_array_equal(FUK.npy.ferr, expected_ferr, + err_msg="ferr does not give the expected error " + "for the fourier_update_kernel.fourier_error emthods") + + def test_error_reduce(self): # array from the previous test ferr = np.array([[[0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00], diff --git a/test/accelerate_tests/cuda_pycuda_tests/array_utils_test.py b/test/accelerate_tests/cuda_pycuda_tests/array_utils_test.py index 23950af26..411fd59d4 100644 --- a/test/accelerate_tests/cuda_pycuda_tests/array_utils_test.py +++ b/test/accelerate_tests/cuda_pycuda_tests/array_utils_test.py @@ -355,6 +355,119 @@ def test_crop_pad_simple_oblike_UNITY(self): # Assert np.testing.assert_allclose(A, A_dev.get(), rtol=1e-6, atol=1e-6) + def test_downsample_factor_2(self): + # downsample, complex, 3D + B = np.indices((4,4), dtype=np.complex64) + B = np.indices((4,4), dtype=np.complex64) + 1j * B[::-1, :, :] + A = np.zeros((2,2,2), dtype=B.dtype) + + A_dev = gpuarray.to_gpu(A) + B_dev = gpuarray.to_gpu(B) + + k = gau.ResampleKernel(queue=self.stream) + k.resample(A_dev, B_dev) + exp_A = np.array([[[ 2. +2.j, 2.+10.j], + [10. +2.j, 10.+10.j]], + [[ 2. +2.j, 10.+ 2.j], + [ 2.+10.j, 10.+10.j]]], dtype=np.complex64) + + np.testing.assert_almost_equal(A_dev.get().sum(), B_dev.get().sum()) + np.testing.assert_array_almost_equal(A_dev.get(), exp_A) + + def test_upsample_factor_2(self): + # upsample, complex, 3D + B = np.indices((2,2), dtype=np.complex64) + B = np.indices((2,2), dtype=np.complex64) + 1j * B[::-1, :, :] + A = np.zeros((2,4,4), dtype=B.dtype) + A_dev = gpuarray.to_gpu(A) + B_dev = gpuarray.to_gpu(B) + + k = gau.ResampleKernel(queue=self.stream) + k.resample(A_dev, B_dev) + exp_A = np.array([[[0. +0.j , 0. +0.j , 0. +0.25j, 0. +0.25j], + [0. +0.j , 0. +0.j , 0. +0.25j, 0. +0.25j], + [0.25+0.j , 0.25+0.j , 0.25+0.25j, 0.25+0.25j], + [0.25+0.j , 0.25+0.j , 0.25+0.25j, 0.25+0.25j]], + [[0. +0.j , 0. +0.j , 0.25+0.j , 0.25+0.j ], + [0. +0.j , 0. +0.j , 0.25+0.j , 0.25+0.j ], + [0. +0.25j, 0. +0.25j, 0.25+0.25j, 0.25+0.25j], + [0. +0.25j, 0. +0.25j, 0.25+0.25j, 0.25+0.25j]]], dtype=np.complex64) + + np.testing.assert_almost_equal(A_dev.get().sum(), B_dev.get().sum()) + np.testing.assert_array_almost_equal(A_dev.get(), exp_A) + + def test_downsample_factor_4(self): + # downsample, complex, 3D + B = np.indices((8,8), dtype=np.complex64) + B = np.indices((8,8), dtype=np.complex64) + 1j * B[::-1, :, :] + A = np.zeros((2,2,2), dtype=B.dtype) + + A_dev = gpuarray.to_gpu(A) + B_dev = gpuarray.to_gpu(B) + + k = gau.ResampleKernel(queue=self.stream) + k.resample(A_dev, B_dev) + exp_A = np.array([[[24.+24.j, 24.+88.j], + [88.+24.j, 88.+88.j]], + [[24.+24.j, 88.+24.j], + [24.+88.j, 88.+88.j]]], dtype=np.complex64) + + np.testing.assert_almost_equal(A_dev.get().sum(), B_dev.get().sum()) + np.testing.assert_array_almost_equal(A_dev.get(), exp_A) + + def test_upsample_factor_4(self): + # upsample, complex, 3D + Bshape = (2,4,4) + B = np.reshape(np.arange(0, np.prod(Bshape)), Bshape).astype(np.complex64) * 16 + B = B + 1j * B + A = np.zeros((Bshape[0], Bshape[1]*4, Bshape[2]*4), dtype=B.dtype) + A_dev = gpuarray.to_gpu(A) + B_dev = gpuarray.to_gpu(B) + + k = gau.ResampleKernel(queue=self.stream) + k.resample(A_dev, B_dev) + exp_A = np.zeros_like(A) + + # building the expected value element-wise, to ensure correctness + for z in range(A.shape[0]): + for y in range(A.shape[1]): + for x in range(A.shape[2]): + exp_A[z, y, x] = B[z, y//4, x//4] / 16 + + np.testing.assert_almost_equal(A_dev.get().sum(), B_dev.get().sum()) + np.testing.assert_array_almost_equal(A_dev.get(), exp_A) + + def test_upsample_factor_4_UNITY(self): + np.random.seed(1983) + + Bshape = (10,64,64) + B = np.reshape(np.arange(0, np.prod(Bshape)), Bshape).astype(np.complex64) * 16 + B = B + 1j * B + A = np.zeros((Bshape[0], Bshape[1]*4, Bshape[2]*4), dtype=B.dtype) + B_dev = gpuarray.to_gpu(B) + A_dev = gpuarray.to_gpu(A) + + k = gau.ResampleKernel(queue=self.stream) + k.resample(A_dev, B_dev) + au.resample(A, B) + + np.testing.assert_allclose(A_dev.get(), A, rtol=1e-6, atol=1e-6, verbose=False) + + + def test_downsample_factor_4_UNITY(self): + np.random.seed(1983) + + A = np.zeros((10, 64, 64), dtype=np.float32) + B = np.random.rand(10, 256, 256).astype(np.float32) + B_dev = gpuarray.to_gpu(B) + A_dev = gpuarray.to_gpu(A) + + k = gau.ResampleKernel(queue=self.stream) + k.resample(A_dev, B_dev) + au.resample(A, B) + + np.testing.assert_allclose(A_dev.get(), A, rtol=1e-6, atol=1e-6, verbose=False) + def test_max_abs2_complex_UNITY(self): np.random.seed(1983) X = (np.random.randint(-1000, 1000, (3,100,200)).astype(np.float32) + \ diff --git a/test/accelerate_tests/cuda_pycuda_tests/fourier_update_kernel_test.py b/test/accelerate_tests/cuda_pycuda_tests/fourier_update_kernel_test.py index 3d7cb5fa6..e00075f5b 100644 --- a/test/accelerate_tests/cuda_pycuda_tests/fourier_update_kernel_test.py +++ b/test/accelerate_tests/cuda_pycuda_tests/fourier_update_kernel_test.py @@ -19,7 +19,7 @@ class FourierUpdateKernelTest(PyCudaTest): - def test_fmag_all_update_UNITY(self): + def fmag_update_all_UNITY_tester(self, mult): ''' setup ''' @@ -92,6 +92,9 @@ def test_fmag_all_update_UNITY(self): nFUK.fourier_error(f, addr, fmag, mask, mask_sum) nFUK.error_reduce(addr, err_fmag) # print(np.sqrt(pbound_set/err_fmag)) + if not mult: + f = np.real(f) + f_d = gpuarray.to_gpu(f) fmag_d = gpuarray.to_gpu(fmag) mask_d = gpuarray.to_gpu(mask) @@ -103,10 +106,11 @@ def test_fmag_all_update_UNITY(self): FUK.gpu.fdev = gpuarray.to_gpu(nFUK.npy.fdev) FUK.gpu.ferr = gpuarray.to_gpu(nFUK.npy.ferr) - FUK.fmag_all_update(f_d, addr_d, fmag_d, mask_d, err_fmag_d, pbound=pbound_set) + + FUK.fmag_all_update(f_d, addr_d, fmag_d, mask_d, err_fmag_d, pbound=pbound_set, mult=mult) - nFUK.fmag_all_update(f, addr, fmag, mask, err_fmag, pbound=pbound_set) + nFUK.fmag_all_update(f, addr, fmag, mask, err_fmag, pbound=pbound_set, mult=mult) expected_f = f measured_f = f_d.get() np.testing.assert_allclose(expected_f, measured_f, rtol=1e-6, err_msg="Numpy f " @@ -114,7 +118,13 @@ def test_fmag_all_update_UNITY(self): repr(measured_f), repr(mask))) - def test_fmag_update_nopbound_UNITY(self): + def test_fmag_update_all_UNITY(self): + self.fmag_update_all_UNITY_tester(True) + + def test_fmag_update_all_nomult_UNITY(self): + self.fmag_update_all_UNITY_tester(False) + + def fmag_update_nopbound_UNITY_tester(self, mult): ''' setup ''' @@ -186,6 +196,8 @@ def test_fmag_update_nopbound_UNITY(self): nFUK.fourier_error(f, addr, fmag, mask, mask_sum) nFUK.error_reduce(addr, err_fmag) # print(np.sqrt(pbound_set/err_fmag)) + if not mult: + f = np.abs(f)**2 f_d = gpuarray.to_gpu(f) fmag_d = gpuarray.to_gpu(fmag) mask_d = gpuarray.to_gpu(mask) @@ -196,8 +208,8 @@ def test_fmag_update_nopbound_UNITY(self): FUK.gpu.fdev = gpuarray.to_gpu(nFUK.npy.fdev) FUK.gpu.ferr = gpuarray.to_gpu(nFUK.npy.ferr) - FUK.fmag_update_nopbound(f_d, addr_d, fmag_d, mask_d) - nFUK.fmag_update_nopbound(f, addr, fmag, mask) + FUK.fmag_update_nopbound(f_d, addr_d, fmag_d, mask_d, mult=mult) + nFUK.fmag_update_nopbound(f, addr, fmag, mask, mult=mult) expected_f = f measured_f = f_d.get() @@ -205,9 +217,13 @@ def test_fmag_update_nopbound_UNITY(self): "is \n%s, \nbut gpu f is \n %s, \n mask is:\n %s \n" % (repr(expected_f), repr(measured_f), repr(mask))) + def test_fmag_update_nopbound_UNITY(self): + self.fmag_update_nopbound_UNITY_tester(True) + def test_fmag_update_nopbound_nomult_UNITY(self): + self.fmag_update_nopbound_UNITY_tester(False) - def test_fourier_error_UNITY(self): + def fourier_error_UNITY_tester(self, aux_is_int): ''' setup ''' @@ -266,6 +282,9 @@ def test_fourier_error_UNITY(self): ''' mask_sum = mask.sum(-1).sum(-1) + if aux_is_int: + f = np.abs(f)**2 + from ptypy.accelerate.base.kernels import FourierUpdateKernel as npFourierUpdateKernel f_d = gpuarray.to_gpu(f) fmag_d = gpuarray.to_gpu(fmag) @@ -279,8 +298,8 @@ def test_fourier_error_UNITY(self): nFUK.allocate() FUK.allocate() - nFUK.fourier_error(f, addr, fmag, mask, mask_sum) - FUK.fourier_error(f_d, addr_d, fmag_d, mask_d, mask_sum_d) + nFUK.fourier_error(f, addr, fmag, mask, mask_sum, aux_is_intensity=aux_is_int) + FUK.fourier_error(f_d, addr_d, fmag_d, mask_d, mask_sum_d, aux_is_intensity=aux_is_int) expected_fdev = nFUK.npy.fdev measured_fdev = FUK.gpu.fdev.get() @@ -296,7 +315,14 @@ def test_fourier_error_UNITY(self): "is \n%s, \nbut gpu ferr is \n %s, \n " % ( repr(expected_ferr), repr(measured_ferr))) - def test_fourier_deviation_UNITY(self): + + def test_fourier_error_UNITY(self): + self.fourier_error_UNITY_tester(False) + + def test_fourier_error_aux_is_intensity_UNITY(self): + self.fourier_error_UNITY_tester(True) + + def fourier_deviation_UNITY_tester(self, aux_is_int): ''' setup - using the fourier_error as reference, so we need mask, etc. ''' @@ -355,6 +381,9 @@ def test_fourier_deviation_UNITY(self): ''' mask_sum = mask.sum(-1).sum(-1) + if aux_is_int: + f = np.abs(f)**2 + from ptypy.accelerate.base.kernels import FourierUpdateKernel as npFourierUpdateKernel f_d = gpuarray.to_gpu(f) fmag_d = gpuarray.to_gpu(fmag) @@ -366,8 +395,8 @@ def test_fourier_deviation_UNITY(self): nFUK.allocate() FUK.allocate() - nFUK.fourier_deviation(f, addr, fmag) - FUK.fourier_deviation(f_d, addr_d, fmag_d) + nFUK.fourier_deviation(f, addr, fmag, aux_is_intensity=aux_is_int) + FUK.fourier_deviation(f_d, addr_d, fmag_d, aux_is_intensity=aux_is_int) expected_fdev = nFUK.npy.fdev measured_fdev = FUK.gpu.fdev.get() @@ -376,7 +405,11 @@ def test_fourier_deviation_UNITY(self): repr(expected_fdev), repr(measured_fdev))) + def test_fourier_deviation_UNITY(self): + self.fourier_deviation_UNITY_tester(False) + def test_fourier_deviation_aux_is_intensity_UNITY(self): + self.fourier_deviation_UNITY_tester(True) def test_error_reduce_UNITY(self): ''' @@ -522,7 +555,7 @@ def test_error_reduce(self): "is not behaving as expected.") - def log_likelihood_UNITY_tester(self, use_version2=False): + def log_likelihood_UNITY_tester(self, use_version2=False, aux_is_int=False): ''' setup ''' @@ -579,6 +612,10 @@ def log_likelihood_UNITY_tester(self, use_version2=False): ''' test ''' + + if aux_is_int: + f = np.abs(f)**2 + mask_sum = mask.sum(-1).sum(-1) LLerr = np.zeros_like(mask_sum, dtype=np.float32) f_d = gpuarray.to_gpu(f) @@ -590,14 +627,14 @@ def log_likelihood_UNITY_tester(self, use_version2=False): from ptypy.accelerate.base.kernels import FourierUpdateKernel as npFourierUpdateKernel nFUK = npFourierUpdateKernel(f, nmodes=total_number_modes) nFUK.allocate() - nFUK.log_likelihood(f, addr, fmag, mask, LLerr) + nFUK.log_likelihood(f, addr, fmag, mask, LLerr, aux_is_intensity=aux_is_int) FUK = FourierUpdateKernel(f, nmodes=total_number_modes) FUK.allocate() if use_version2: - FUK.log_likelihood2(f_d, addr_d, fmag_d, mask_d, LLerr_d) + FUK.log_likelihood2(f_d, addr_d, fmag_d, mask_d, LLerr_d, aux_is_intensity=aux_is_int) else: - FUK.log_likelihood(f_d, addr_d, fmag_d, mask_d, LLerr_d) + FUK.log_likelihood(f_d, addr_d, fmag_d, mask_d, LLerr_d, aux_is_intensity=aux_is_int) expected_err_phot = LLerr measured_err_phot = LLerr_d.get() @@ -607,12 +644,18 @@ def log_likelihood_UNITY_tester(self, use_version2=False): repr(expected_err_phot), repr(measured_err_phot)), rtol=1e-5) def test_log_likelihood_UNITY(self): - self.log_likelihood_UNITY_tester(False) + self.log_likelihood_UNITY_tester(False, False) def test_log_likelihood2_UNITY(self): - self.log_likelihood_UNITY_tester(True) + self.log_likelihood_UNITY_tester(True, False) - def test_exit_error_UNITY(self): + def test_log_likelihood_aux_is_intensity_UNITY(self): + self.log_likelihood_UNITY_tester(False, True) + + def test_log_likelihood2_aux_is_intensity_UNITY(self): + self.log_likelihood_UNITY_tester(True, True) + + def exit_error_UNITY_tester(self, aux_is_int): ''' setup ''' @@ -659,6 +702,10 @@ def test_exit_error_UNITY(self): ''' test ''' + + if aux_is_int: + aux = np.abs(aux)**2 + from ptypy.accelerate.base.kernels import FourierUpdateKernel as npFourierUpdateKernel aux_d = gpuarray.to_gpu(aux) addr_d = gpuarray.to_gpu(addr) @@ -669,8 +716,8 @@ def test_exit_error_UNITY(self): nFUK.allocate() FUK.allocate() - nFUK.exit_error(aux, addr, ) - FUK.exit_error(aux_d, addr_d) + nFUK.exit_error(aux, addr, aux_is_intensity=aux_is_int) + FUK.exit_error(aux_d, addr_d, aux_is_intensity=aux_is_int) expected_ferr = nFUK.npy.ferr measured_ferr = FUK.gpu.ferr.get() @@ -681,5 +728,11 @@ def test_exit_error_UNITY(self): repr(measured_ferr)), rtol=1e-7) + def test_exit_error_UNITY(self): + self.exit_error_UNITY_tester(False) + + def test_exit_error_aux_is_intensity_UNITY(self): + self.exit_error_UNITY_tester(True) + if __name__ == '__main__': unittest.main() diff --git a/test/core_tests/geometry_test.py b/test/core_tests/geometry_test.py index 84e4c868d..67d26638c 100644 --- a/test/core_tests/geometry_test.py +++ b/test/core_tests/geometry_test.py @@ -58,6 +58,38 @@ def test_geometry_nearfield_resolution(self): G = self.set_up_nearfield() assert (np.round(G.resolution*1e7) == [1.00, 1.00]).all(), "geometry resolution incorrect for the nearfield" + def test_downsample(self): + G = self.set_up_farfield() + G.resample = 2 + + B = np.indices((4,4), dtype=np.complex64)[0] + 1j * np.indices((4,4), dtype=np.complex64)[1] + A = G.downsample(B) + exp_A = np.array([[ 2. +2.j, 2.+10.j], + [10. +2.j, 10.+10.j]], dtype=np.complex64) + np.testing.assert_almost_equal(A.sum(), B.sum()) + np.testing.assert_array_almost_equal(A, exp_A) + + def test_upsample(self): + G = self.set_up_farfield() + G.resample = 2 + + B = np.indices((2,2), dtype=np.complex64)[0] + 1j * np.indices((2,2), dtype=np.complex64)[1] + A = G.upsample(B) + exp_A = np.array([[0. +0.j , 0. +0.j , 0. +0.25j, 0. +0.25j], + [0. +0.j , 0. +0.j , 0. +0.25j, 0. +0.25j], + [0.25+0.j , 0.25+0.j , 0.25+0.25j, 0.25+0.25j], + [0.25+0.j , 0.25+0.j , 0.25+0.25j, 0.25+0.25j]], dtype=np.complex64) + np.testing.assert_almost_equal(A.sum(), B.sum()) + np.testing.assert_array_almost_equal(A, exp_A) + + def test_downsample_upsample(self): + G = self.set_up_farfield() + G.resample = 2 + + A = np.random.random((4,4)) + B = G.downsample(G.upsample(A)) + np.testing.assert_almost_equal(A.sum(), B.sum()) + np.testing.assert_array_almost_equal(A, B) if __name__ == '__main__': unittest.main()