diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 7592947e0..98be46bef 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -1,4 +1,4 @@ -name: ptypy tests +name: Tests on: # Trigger the workflow on push or pull request, @@ -10,6 +10,7 @@ on: branches: - master - dev + - hotfixes # Also trigger on page_build, as well as release created events page_build: release: @@ -20,26 +21,32 @@ jobs: build-linux: runs-on: ubuntu-latest strategy: - max-parallel: 5 + max-parallel: 10 + fail-fast: false matrix: - python-version: ['3.7','3.8','3.9','3.10'] - + python-version: ['3.8','3.9','3.10', '3.11'] + name: Testing with Python ${{ matrix.python-version }} steps: - - uses: actions/checkout@v3 + - name: Checkout + uses: actions/checkout@v4 - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v3 + uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} - name: Add conda to system path run: | # $CONDA is an environment variable pointing to the root of the miniconda directory echo $CONDA/bin >> $GITHUB_PATH + conda --version - name: Install dependencies run: | + # replace python version in core dependencies + sed -i 's/python/python=${{ matrix.python-version }}/' dependencies_core.yml conda env update --file dependencies_core.yml --name base + conda list - name: Prepare ptypy run: | - # Dry install to create ptypy/version.py + # Install ptypy pip install . - name: Lint with flake8 run: | diff --git a/.gitignore b/.gitignore index 70e473b91..3bebd583d 100644 --- a/.gitignore +++ b/.gitignore @@ -28,3 +28,4 @@ ghostdriver* .DS_Store .ipynb_checkpoints .clang-format +pip-wheel-metadata/ diff --git a/CONTRIB.rst b/CONTRIB.rst index a9aada0a4..1f26aebf5 100644 --- a/CONTRIB.rst +++ b/CONTRIB.rst @@ -26,24 +26,25 @@ Please ensure you satisfy most of PEP8_ recommendations. We are not dogmatic abo Testing ^^^^^^^ -Not much testing exists at the time of writing this document, but we are aware that this is something that should change. If you want to contribute code, it would be very good practice to also submit related tests. +All tests are in the ``/test/`` folder and our CI pipeline runs these tests for every commit. Please note that tests that require GPUs are disabled for the CI pipeline. Make sure to supply tests for new code or drastic changes to the existing code base. Smaller commits or bug fixes don't require an extra test. Branches ^^^^^^^^ +We are following the Gitflow development model (https://www.atlassian.com/git/tutorials/comparing-workflows/gitflow-workflow), where a development branch (``dev``) is merged into the master branch for every release. Individual features are developed on topic branches from the development branch and squash-merged back into it when the feature is mature. + The important permanent branches are: - - ``master``: the current cutting-edge but functional package. - - ``stable``: the latest release, recommended for production use. - - ``target``: target for a next release. This branch should stay up-to-date with ``master``, and contain planned updates that will break compatibility with the current version. - - other thematic and temporary branches will appear and disappear as new ideas are tried out and merged in. + - ``master``: (protected) the current release plus bugfixes / hotpatches. + - ``dev``: (protected) the current branch for all development. Features are branched off this branch and merged back into it upon completion.
Development cycle ^^^^^^^^^^^^^^^^^ -There has been only two releases of the code up to now, so what we can tell about the *normal development cycle* for |ptypy| is rather limited. However the plan is as follows: - - Normal development usually happens on thematic branches. These branches are merged back to master when it is clear that (1) the feature is sufficiently debugged and tested and (2) no current functionality will break. - - At regular interval admins will decide to freeze the development for a new stable release. During this period, development will be allowed only on feature branches but master will accept only bug fixes. Once the stable release is done, development will continue. +|ptypy| does not follow a rigid release schedule. Releases are prepared for major events or when a set of features has reached maturity. + + - Normal development usually happens on thematic branches from the ``dev`` branch. These branches are merged back to ``dev`` when it is clear that (1) the feature is sufficiently debugged and tested and (2) no current functionality will break. + - For a release, the ``dev`` branch will be merged back into ``master`` and that merge is tagged as a release. 3. Pull requests @@ -51,16 +52,9 @@ There has been only two releases of the code up to now, so what we can tell abou Most likely you are a member of the |ptypy| team, which give you access to the full repository, but no right to commit changes. The proper way of doing this is *pull requests*. You can read about how this is done on github's `pull requests tutorial`_. -Pull requests can be made against one of the feature branches, or against ``target`` or ``master``. In the latter cases, if your changes are deemed a bit too substantial, the first thing we will do is create a feature branch for your commits, and we will let it live for a little while, making sure that it is all fine. We will then merge it onto ``master`` (or ``target``). - -In principle bug fixes can be requested on the ``stable`` branch. - -3. Direct commits ----------------- - -If you are one of our power-users (or power-developers), you can be given rights to commit directly to |ptypy|. This makes things much simpler of course, but with great power comes great responsibility. +Pull requests shall be made against one of the feature branches, or against ``dev`` or ``master``. For PRs against ``master`` we will only accept bugfixes or smaller changes. Every other PR should be made against ``dev``. Your PR will be reviewed and discussed amongst the core developer team. The more you touch core libraries, the more scrutiny your PR will face. However, we created two folders in the main source folder where you have more freedom to try out things. For example, if you want to provide a new reconstruction engine, place it into the ``custom/`` folder. A new ``PtyScan`` subclass that prepares data from your experiment is best placed in the ``experiment/`` folder. -To make sure that things are done cleanly, we encourage all the core developers to create thematic remote branches instead of committing always onto master. Merging these thematic branches will be done as a collective decision during one of the regular admin meetings. +If you develop a new feature on a topic branch, it is your responsibility to keep it current with the ``dev`` branch to avoid merge conflicts. .. |ptypy| replace:: PtyPy @@ -68,4 +62,4 @@ To make sure that things are done cleanly, we encourage all the core developers .. _PEP8: https://www.python.org/dev/peps/pep-0008/ -..
_`pull requests tutorial`: https://help.github.com/articles/using-pull-requests/ \ No newline at end of file +.. _`pull requests tutorial`: https://help.github.com/articles/using-pull-requests/ diff --git a/archive/cuda_extension/engines/DM_gpu.py b/archive/cuda_extension/engines/DM_gpu.py index 399eb143c..9e81ad7fa 100644 --- a/archive/cuda_extension/engines/DM_gpu.py +++ b/archive/cuda_extension/engines/DM_gpu.py @@ -57,6 +57,7 @@ class DMGpu(DMNpy): default = 'linear' type = str help = Subpixel interpolation; 'fourier','linear' or None for no interpolation + choices = ['fourier','linear',None] [update_object_first] default = True diff --git a/archive/cuda_extension/engines/DM_npy.py b/archive/cuda_extension/engines/DM_npy.py index f601a46dd..6fce4bc5d 100644 --- a/archive/cuda_extension/engines/DM_npy.py +++ b/archive/cuda_extension/engines/DM_npy.py @@ -55,6 +55,7 @@ class DMNpy(DM): default = 'linear' type = str help = Subpixel interpolation; 'fourier','linear' or None for no interpolation + choices = ['fourier','linear',None] [update_object_first] default = True diff --git a/archive/cuda_extension/python/gpu_extension.pyx b/archive/cuda_extension/python/gpu_extension.pyx index aa0b36402..f0db75587 100644 --- a/archive/cuda_extension/python/gpu_extension.pyx +++ b/archive/cuda_extension/python/gpu_extension.pyx @@ -153,7 +153,7 @@ def abs2(input): cdef np.float32_t [:,:,::1] cout_3c cdef np.float64_t [:,::1] cout_d2c cdef np.float64_t [:,:,::1] cout_d3c - cdef int n = np.product(cin.shape) + cdef int n = np.prod(cin.shape) cdef np.float32_t [:, ::1] cin_f2c cdef np.complex64_t [:, ::1] cin_c2c diff --git a/archive/engines/DM.py b/archive/engines/DM.py index 50936bd42..1124158a2 100644 --- a/archive/engines/DM.py +++ b/archive/engines/DM.py @@ -55,6 +55,7 @@ class DM(PositionCorrectionEngine): default = 'linear' type = str help = Subpixel interpolation; 'fourier','linear' or None for no interpolation + choices = ['fourier','linear',None] [update_object_first] default = True diff --git a/benchmark/diamond_benchmarks/moonflower_scripts/i08.py b/benchmark/diamond_benchmarks/moonflower_scripts/i08.py index 273a8ecbf..193c5693e 100644 --- a/benchmark/diamond_benchmarks/moonflower_scripts/i08.py +++ b/benchmark/diamond_benchmarks/moonflower_scripts/i08.py @@ -28,6 +28,7 @@ p.io.autoplot = u.Param(active=False) p.io.interaction = u.Param() p.io.interaction.server = u.Param(active=False) +p.io.benchmark = "all" # max 200 frames (128x128px) of diffraction data p.scans = u.Param() diff --git a/benchmark/diamond_benchmarks/moonflower_scripts/i13.py b/benchmark/diamond_benchmarks/moonflower_scripts/i13.py index 1cf42d5e4..edb0cd1e4 100644 --- a/benchmark/diamond_benchmarks/moonflower_scripts/i13.py +++ b/benchmark/diamond_benchmarks/moonflower_scripts/i13.py @@ -28,6 +28,7 @@ p.io.autoplot = u.Param(active=False) p.io.interaction = u.Param() p.io.interaction.server = u.Param(active=False) +p.io.benchmark = "all" # max 200 frames (128x128px) of diffraction data p.scans = u.Param() diff --git a/benchmark/diamond_benchmarks/moonflower_scripts/i14_1.py b/benchmark/diamond_benchmarks/moonflower_scripts/i14_1.py index 9d1abcccb..eaa848f4a 100644 --- a/benchmark/diamond_benchmarks/moonflower_scripts/i14_1.py +++ b/benchmark/diamond_benchmarks/moonflower_scripts/i14_1.py @@ -28,6 +28,7 @@ p.io.autoplot = u.Param(active=False) p.io.interaction = u.Param() p.io.interaction.server = u.Param(active=False) +p.io.benchmark = "all" # max 200 frames (128x128px) of diffraction data p.scans = u.Param() diff --git 
a/benchmark/diamond_benchmarks/moonflower_scripts/i14_2.py b/benchmark/diamond_benchmarks/moonflower_scripts/i14_2.py index 8e3c7241e..fcf483c47 100644 --- a/benchmark/diamond_benchmarks/moonflower_scripts/i14_2.py +++ b/benchmark/diamond_benchmarks/moonflower_scripts/i14_2.py @@ -29,6 +29,7 @@ p.io.autoplot = u.Param(active=False) p.io.interaction = u.Param() p.io.interaction.server = u.Param(active=False) +p.io.benchmark = "all" # max 200 frames (128x128px) of diffraction data p.scans = u.Param() diff --git a/benchmark/mpi_allreduce_speed.py b/benchmark/mpi_allreduce_speed.py index 5102e35af..2e562d944 100644 --- a/benchmark/mpi_allreduce_speed.py +++ b/benchmark/mpi_allreduce_speed.py @@ -11,7 +11,7 @@ } def run_benchmark(shape): - megabytes = np.product(shape) * 8 / 1024 / 1024 * 2 + megabytes = np.prod(shape) * 8 / 1024 / 1024 * 2 data = np.zeros(shape, dtype=np.complex64) @@ -39,4 +39,4 @@ def run_benchmark(shape): print('Final results for {} processes'.format(parallel.size)) print(','.join(['Name', 'Duration', 'MB', 'MB/s'])) for r in res: - print(','.join([str(x) for x in r])) \ No newline at end of file + print(','.join([str(x) for x in r])) diff --git a/cufft/dependencies.yml b/cufft/dependencies.yml index 949079d36..48f17a1e7 100644 --- a/cufft/dependencies.yml +++ b/cufft/dependencies.yml @@ -2,7 +2,7 @@ name: ptypy_cufft channels: - conda-forge dependencies: - - python=3.9 + - python - cmake>=3.8.0 - pybind11 - compilers diff --git a/cufft/extensions.py b/cufft/extensions.py index 4fabf2d2c..545b43d04 100644 --- a/cufft/extensions.py +++ b/cufft/extensions.py @@ -4,7 +4,6 @@ import os, re import subprocess import sysconfig -import pybind11 from distutils.unixccompiler import UnixCCompiler from distutils.command.build_ext import build_ext @@ -98,6 +97,7 @@ def __init__(self, *args, **kwargs): self.LD_FLAGS = [archflag, "-lcufft_static", "-lculibos", "-ldl", "-lrt", "-lpthread", "-cudart shared"] self.NVCC_FLAGS = ["-dc", archflag] self.CXXFLAGS = ['"-fPIC"'] + import pybind11 pybind_includes = [pybind11.get_include(), sysconfig.get_path('include')] INCLUDES = pybind_includes + [self.CUDA['lib64'], module_dir] self.INCLUDES = ["-I%s" % ix for ix in INCLUDES] diff --git a/cufft/setup.py b/cufft/setup.py index 8cba2f560..5108ebf32 100644 --- a/cufft/setup.py +++ b/cufft/setup.py @@ -39,6 +39,7 @@ description='Extension of CuFFT to include pre- and post-filters using callbacks', packages=package_list, ext_modules=ext_modules, + install_requires=["pybind11"], cmdclass=cmdclass ) diff --git a/dependencies_core.yml b/dependencies_core.yml index 32c492de1..5f0b7c13f 100644 --- a/dependencies_core.yml +++ b/dependencies_core.yml @@ -1,8 +1,6 @@ name: ptypy_core -channels: - - conda-forge dependencies: - - python=3.9 + - python - numpy - scipy - h5py diff --git a/dependencies_dev.yml b/dependencies_dev.yml index 230a7e190..5462c145a 100644 --- a/dependencies_dev.yml +++ b/dependencies_dev.yml @@ -2,7 +2,7 @@ name: ptypy_full channels: - conda-forge dependencies: - - python=3.9 + - python - numpy - scipy - matplotlib diff --git a/dependencies_full.yml b/dependencies_full.yml index 65a1774f6..e43241fce 100644 --- a/dependencies_full.yml +++ b/dependencies_full.yml @@ -2,7 +2,7 @@ name: ptypy_full channels: - conda-forge dependencies: - - python=3.9 + - python - numpy - scipy - matplotlib diff --git a/doc/rst_templates/getting_started.tmp b/doc/rst_templates/getting_started.tmp index db4a6a19c..851734eb5 100644 --- a/doc/rst_templates/getting_started.tmp +++ 
b/doc/rst_templates/getting_started.tmp @@ -96,7 +96,7 @@ GPUs based on our own kernels and the Install the dependencies for this version like so. :: - $ conda env create -f accelerate/cuda_pycuda/dependencies.yml + $ conda env create -f ptypy/accelerate/cuda_pycuda/dependencies.yml $ conda activate ptypy_pycuda (ptypy_pycuda)$ pip install . diff --git a/ptypy/__init__.py b/ptypy/__init__.py index 0ca662fb7..74c336d01 100644 --- a/ptypy/__init__.py +++ b/ptypy/__init__.py @@ -78,11 +78,16 @@ # Convenience loader for GPU engines def load_gpu_engines(arch='cuda'): - if arch=='cuda': + if arch in ['cuda', 'pycuda']: from .accelerate.cuda_pycuda.engines import projectional_pycuda from .accelerate.cuda_pycuda.engines import projectional_pycuda_stream from .accelerate.cuda_pycuda.engines import stochastic from .accelerate.cuda_pycuda.engines import ML_pycuda + if arch in ['cuda', 'cupy']: + from .accelerate.cuda_cupy.engines import projectional_cupy + from .accelerate.cuda_cupy.engines import projectional_cupy_stream + from .accelerate.cuda_cupy.engines import stochastic + from .accelerate.cuda_cupy.engines import ML_cupy if arch=='serial': from .accelerate.base.engines import projectional_serial from .accelerate.base.engines import projectional_serial_stream diff --git a/ptypy/accelerate/base/engines/ML_serial.py b/ptypy/accelerate/base/engines/ML_serial.py index 248110326..38f63f385 100644 --- a/ptypy/accelerate/base/engines/ML_serial.py +++ b/ptypy/accelerate/base/engines/ML_serial.py @@ -348,6 +348,7 @@ def engine_finalize(self): prep = self.diff_info[d.ID] float_intens_coeff[label] = prep.float_intens_coeff self.ptycho.runtime["float_intens"] = parallel.gather_dict(float_intens_coeff) + super().engine_finalize() class BaseModelSerial(BaseModel): diff --git a/ptypy/accelerate/base/kernels.py b/ptypy/accelerate/base/kernels.py index f3a13bad5..c5343f940 100644 --- a/ptypy/accelerate/base/kernels.py +++ b/ptypy/accelerate/base/kernels.py @@ -62,7 +62,7 @@ def fourier_error(self, b_aux, addr, mag, mask, mask_sum): ## Actual math ## - # build model from complex fourier magnitudes, summing up + # build model from complex fourier magnitudes, summing up # all modes incoherently tf = aux.reshape(maxz, self.nmodes, sh[1], sh[2]) af = np.sqrt((np.abs(tf) ** 2).sum(1)) @@ -86,7 +86,7 @@ def fourier_deviation(self, b_aux, addr, mag): ## Actual math ## - # build model from complex fourier magnitudes, summing up + # build model from complex fourier magnitudes, summing up # all modes incoherently tf = aux.reshape(maxz, self.nmodes, sh[1], sh[2]) af = np.sqrt((np.abs(tf) ** 2).sum(1)) @@ -109,7 +109,7 @@ def error_reduce(self, addr, err_sum): ## Actual math ## # Reduces the Fourier error along the last 2 dimensions.fd - #err_sum[:] = ferr.astype(np.double).sum(-1).sum(-1).astype(np.float) + #err_sum[:] = ferr.astype(np.double).sum(-1).sum(-1).astype(float) err_sum[:] = ferr.sum(-1).sum(-1) return @@ -136,12 +136,12 @@ def fmag_all_update(self, b_aux, addr, mag, mask, err_sum, pbound=0.0): ## As opposed to DM we use renorm to differentiate the cases. - # pbound >= g_err_sum + # pbound >= g_err_sum # fm = 1.0 (as renorm = 1, i.e. 
renorm[~ind]) # pbound < g_err_sum : - # fm = (1 - g_mask) + g_mask * (g_mag + fdev * renorm) / (af + 1e-10) + # fm = (1 - g_mask) + g_mask * (g_mag + fdev * renorm) / (af + 1e-10) # (as renorm in [0,1]) - # pbound == 0.0 + # pbound == 0.0 # fm = (1 - g_mask) + g_mask * g_mag / (af + 1e-10) (as renorm=0) ind = err_sum > pbound @@ -192,7 +192,7 @@ def log_likelihood(self, b_aux, addr, mag, mask, err_phot): # batch buffers aux = b_aux[:maxz * self.nmodes] - # build model from complex fourier magnitudes, summing up + # build model from complex fourier magnitudes, summing up # all modes incoherently tf = aux.reshape(maxz, self.nmodes, sh[1], sh[2]) LL = (np.abs(tf) ** 2).sum(1) @@ -516,7 +516,7 @@ def _build_exit_alpha_tau(self, b_aux, addr, ob, pr, ex, alpha=1, tau=1): ex[exc[0], exc[1]:exc[1] + rows, exc[2]:exc[2] + cols] + \ (1 - tau * (1 + alpha)) * \ ob[obc[0], obc[1]:obc[1] + rows, obc[2]:obc[2] + cols] * \ - pr[prc[0], prc[1]:prc[1] + rows, prc[2]:prc[2] + cols] + pr[prc[0], prc[1]:prc[1] + rows, prc[2]:prc[2] + cols] ex[exc[0], exc[1]:exc[1] + rows, exc[2]:exc[2] + cols] += dex aux[ind, :, :] = dex @@ -660,6 +660,40 @@ def pr_norm_local(self, addr, pr, prn): pr[prc[0], prc[1]:prc[1] + rows, prc[2]:prc[2] + cols]).real return + def ob_update_wasp(self, addr, ob, pr, ex, aux, ob_sum_nmr, ob_sum_dnm, alpha=1): + sh = addr.shape + flat_addr = addr.reshape(sh[0] * sh[1], sh[2], sh[3]) + rows, cols = ex.shape[-2:] + + for ind, (prc, obc, exc, mac, dic) in enumerate(flat_addr): + pr_conj = pr[prc[0], prc[1]:prc[1] + rows, prc[2]:prc[2] + cols].conj() + pr_abs2 = abs2(pr[prc[0], prc[1]:prc[1] + rows, prc[2]:prc[2] + cols]) + deltaEW = ex[exc[0], exc[1]:exc[1] + rows, exc[2]:exc[2] + cols] - aux[ind, :, :] + + ob[obc[0], obc[1]:obc[1] + rows, obc[2]:obc[2] + cols] += 0.5 * pr_conj * deltaEW / (pr_abs2.mean() * alpha + pr_abs2) + + ob_sum_nmr[obc[0], obc[1]:obc[1] + rows, obc[2]:obc[2] + cols] += pr_conj * ex[exc[0], exc[1]:exc[1] + rows, exc[2]:exc[2] + cols] + ob_sum_dnm[obc[0], obc[1]:obc[1] + rows, obc[2]:obc[2] + cols] += pr_abs2 + + def pr_update_wasp(self, addr, pr, ob, ex, aux, pr_sum_nmr, pr_sum_dnm, beta=1): + sh = addr.shape + flat_addr = addr.reshape(sh[0] * sh[1], sh[2], sh[3]) + rows, cols = ex.shape[-2:] + + for ind, (prc, obc, exc, mac, dic) in enumerate(flat_addr): + ob_conj = ob[obc[0], obc[1]:obc[1] + rows, obc[2]:obc[2] + cols].conj() + ob_abs2 = abs2(ob[obc[0], obc[1]:obc[1] + rows, obc[2]:obc[2] + cols]) + deltaEW = ex[exc[0], exc[1]:exc[1] + rows, exc[2]:exc[2] + cols] - aux[ind, :, :] + + pr[prc[0], prc[1]:prc[1] + rows, prc[2]:prc[2] + cols] += ob_conj * deltaEW / (beta + ob_abs2) + + pr_sum_nmr[prc[0], prc[1]:prc[1] + rows, prc[2]:prc[2] + cols] += ob_conj * ex[exc[0], exc[1]:exc[1] + rows, exc[2]:exc[2] + cols] + pr_sum_dnm[prc[0], prc[1]:prc[1] + rows, prc[2]:prc[2] + cols] += ob_abs2 + + def avg_wasp(self, arr, nmr, dnm): + is_zero = np.isclose(dnm, 0) + arr[:] = np.where(is_zero, nmr, nmr / dnm) + class PositionCorrectionKernel(BaseKernel): from ptypy.accelerate.base import address_manglers @@ -775,7 +809,7 @@ def log_likelihood(self, b_aux, addr, mag, mask, err_sum): # batch buffers aux = b_aux[:maxz * self.nmodes] - # build model from complex fourier magnitudes, summing up + # build model from complex fourier magnitudes, summing up # all modes incoherently tf = aux.reshape(maxz, self.nmodes, sh[1], sh[2]) LL = (np.abs(tf) ** 2).sum(1) diff --git a/ptypy/accelerate/cuda_pycuda/cuda/__init__.py b/ptypy/accelerate/cuda_common/__init__.py similarity index 100% rename 
from ptypy/accelerate/cuda_pycuda/cuda/__init__.py rename to ptypy/accelerate/cuda_common/__init__.py diff --git a/ptypy/accelerate/cuda_pycuda/cuda/abs2sum.cu b/ptypy/accelerate/cuda_common/abs2sum.cu similarity index 94% rename from ptypy/accelerate/cuda_pycuda/cuda/abs2sum.cu rename to ptypy/accelerate/cuda_common/abs2sum.cu index 475a228bb..9783c20cc 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/abs2sum.cu +++ b/ptypy/accelerate/cuda_common/abs2sum.cu @@ -5,8 +5,7 @@ * - OUT_TYPE: can be float/double */ -#include -using thrust::complex; +#include "common.cuh" extern "C" __global__ void abs2sum(const IN_TYPE* a, const int n, diff --git a/ptypy/accelerate/cuda_common/avg_wasp.cu b/ptypy/accelerate/cuda_common/avg_wasp.cu new file mode 100644 index 000000000..0c2fce77a --- /dev/null +++ b/ptypy/accelerate/cuda_common/avg_wasp.cu @@ -0,0 +1,47 @@ +/** avg_wasp +* +* Data types: +* - IN_TYPE: the data type for the inputs (float or double) +* - OUT_TYPE: the data type for the outputs (float or double) +* - MATH_TYPE: the data type used for computation +*/ + +#include "common.cuh" + +// specify max number of threads/block and min number of blocks per SM, +// to assist the compiler in register optimisations. +// We achieve a higher occupancy in this case, as less registers are used +// (guided by profiler) +extern "C" __global__ void __launch_bounds__(1024, 2) + avg_wasp(complex *arr, + const complex* __restrict__ nmr, + const IN_TYPE* __restrict__ dnm, + int A, + int B, + int C + ) +{ + const int bid = blockIdx.z; + const int tx = threadIdx.x; + const int b = threadIdx.y + blockIdx.y * blockDim.y; + + /*go to this mode*/ + arr += bid * B * C; + nmr += bid * B * C; + dnm += bid * B * C; + + if (b >= B) + return; + + for (int c = tx; c < C; c += blockDim.x) { + if (dnm[b * C + c] != 0) { + auto avg_val_tmp = nmr[b * C + c] / dnm[b * C + c]; + complex avg_val = avg_val_tmp; + arr[b * C + c] = avg_val; + } + else { + complex avg_val = nmr[b * C + c]; + arr[b * C + c] = avg_val; + } + } +} diff --git a/ptypy/accelerate/cuda_pycuda/cuda/batched_multiply.cu b/ptypy/accelerate/cuda_common/batched_multiply.cu similarity index 96% rename from ptypy/accelerate/cuda_pycuda/cuda/batched_multiply.cu rename to ptypy/accelerate/cuda_common/batched_multiply.cu index 1263841b6..f91bb6d38 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/batched_multiply.cu +++ b/ptypy/accelerate/cuda_common/batched_multiply.cu @@ -8,8 +8,7 @@ * - MATH_TYPE: the data type used for computation (filter) */ -#include -using thrust::complex; +#include "common.cuh" extern "C" __global__ void batched_multiply(const complex* input, complex* output, diff --git a/ptypy/accelerate/cuda_pycuda/cuda/build_aux.cu b/ptypy/accelerate/cuda_common/build_aux.cu similarity index 100% rename from ptypy/accelerate/cuda_pycuda/cuda/build_aux.cu rename to ptypy/accelerate/cuda_common/build_aux.cu diff --git a/ptypy/accelerate/cuda_pycuda/cuda/build_aux_no_ex.cu b/ptypy/accelerate/cuda_common/build_aux_no_ex.cu similarity index 98% rename from ptypy/accelerate/cuda_pycuda/cuda/build_aux_no_ex.cu rename to ptypy/accelerate/cuda_common/build_aux_no_ex.cu index ee091c58e..d02500a1a 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/build_aux_no_ex.cu +++ b/ptypy/accelerate/cuda_common/build_aux_no_ex.cu @@ -6,8 +6,7 @@ * - MATH_TYPE: the data type used for computation */ -#include -using thrust::complex; +#include "common.cuh" extern "C" __global__ void build_aux_no_ex(complex* auxilliary_wave, int aRows, diff --git 
a/ptypy/accelerate/cuda_pycuda/cuda/build_aux_position_correction.cu b/ptypy/accelerate/cuda_common/build_aux_position_correction.cu similarity index 96% rename from ptypy/accelerate/cuda_pycuda/cuda/build_aux_position_correction.cu rename to ptypy/accelerate/cuda_common/build_aux_position_correction.cu index 327040371..9d0f44fad 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/build_aux_position_correction.cu +++ b/ptypy/accelerate/cuda_common/build_aux_position_correction.cu @@ -6,8 +6,7 @@ * - MATH_TYPE: the data type used for computation */ -#include -using thrust::complex; +#include "common.cuh" extern "C" __global__ void build_aux_position_correction( complex* auxiliary_wave, diff --git a/ptypy/accelerate/cuda_pycuda/cuda/build_exit.cu b/ptypy/accelerate/cuda_common/build_exit.cu similarity index 100% rename from ptypy/accelerate/cuda_pycuda/cuda/build_exit.cu rename to ptypy/accelerate/cuda_common/build_exit.cu diff --git a/ptypy/accelerate/cuda_pycuda/cuda/build_exit_alpha_tau.cu b/ptypy/accelerate/cuda_common/build_exit_alpha_tau.cu similarity index 100% rename from ptypy/accelerate/cuda_pycuda/cuda/build_exit_alpha_tau.cu rename to ptypy/accelerate/cuda_common/build_exit_alpha_tau.cu diff --git a/ptypy/accelerate/cuda_pycuda/cuda/clip_magnitudes.cu b/ptypy/accelerate/cuda_common/clip_magnitudes.cu similarity index 86% rename from ptypy/accelerate/cuda_pycuda/cuda/clip_magnitudes.cu rename to ptypy/accelerate/cuda_common/clip_magnitudes.cu index 8128091f9..5db29dbe9 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/clip_magnitudes.cu +++ b/ptypy/accelerate/cuda_common/clip_magnitudes.cu @@ -1,10 +1,7 @@ /** clip_magnitudes. * */ - #include - #include - #include - using thrust::complex; + #include "common.cuh" extern "C" __global__ void clip_magnitudes(IN_TYPE *arr, float clip_min, diff --git a/ptypy/accelerate/cuda_common/common.cuh b/ptypy/accelerate/cuda_common/common.cuh new file mode 100644 index 000000000..d2c022373 --- /dev/null +++ b/ptypy/accelerate/cuda_common/common.cuh @@ -0,0 +1,12 @@ +#pragma once + +#ifndef PTYPY_CUPY_NVTRC +// pycuda code +# include +using thrust::complex; + +#else +// cupy code +# include + +#endif \ No newline at end of file diff --git a/ptypy/accelerate/cuda_pycuda/cuda/convolution.cu b/ptypy/accelerate/cuda_common/convolution.cu similarity index 99% rename from ptypy/accelerate/cuda_pycuda/cuda/convolution.cu rename to ptypy/accelerate/cuda_common/convolution.cu index ae42ecba5..d729fd067 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/convolution.cu +++ b/ptypy/accelerate/cuda_common/convolution.cu @@ -6,8 +6,7 @@ * A symmetric convolution kernel is assumed here */ -#include -using thrust::complex; +#include "common.cuh" /** Implements reflect-mode index wrapping * diff --git a/ptypy/accelerate/cuda_pycuda/cuda/delx.cu b/ptypy/accelerate/cuda_common/delx.cu similarity index 99% rename from ptypy/accelerate/cuda_pycuda/cuda/delx.cu rename to ptypy/accelerate/cuda_common/delx.cu index f2e8a934e..23ce09f05 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/delx.cu +++ b/ptypy/accelerate/cuda_common/delx.cu @@ -5,8 +5,7 @@ * - OUT_TYPE: the data type for the outputs */ -#include -using thrust::complex; +#include "common.cuh" /** Finite difference for forward/backward for any axis that is not the diff --git a/ptypy/accelerate/cuda_pycuda/cuda/dot.cu b/ptypy/accelerate/cuda_common/dot.cu similarity index 93% rename from ptypy/accelerate/cuda_pycuda/cuda/dot.cu rename to ptypy/accelerate/cuda_common/dot.cu index 21087abe3..3dfd909cf 100644 --- 
a/ptypy/accelerate/cuda_pycuda/cuda/dot.cu +++ b/ptypy/accelerate/cuda_common/dot.cu @@ -1,6 +1,4 @@ -#include -#include -using thrust::complex; +#include "common.cuh" template __device__ inline T dotmul(const T& a, const T& b) diff --git a/ptypy/accelerate/cuda_pycuda/cuda/error_reduce.cu b/ptypy/accelerate/cuda_common/error_reduce.cu similarity index 100% rename from ptypy/accelerate/cuda_pycuda/cuda/error_reduce.cu rename to ptypy/accelerate/cuda_common/error_reduce.cu diff --git a/ptypy/accelerate/cuda_pycuda/cuda/exit_error.cu b/ptypy/accelerate/cuda_common/exit_error.cu similarity index 91% rename from ptypy/accelerate/cuda_pycuda/cuda/exit_error.cu rename to ptypy/accelerate/cuda_common/exit_error.cu index fdac52e46..2dded7e0f 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/exit_error.cu +++ b/ptypy/accelerate/cuda_common/exit_error.cu @@ -1,9 +1,4 @@ -#include -#include -#include -using std::sqrt; -using thrust::abs; -using thrust::complex; +#include "common.cuh" // specify max number of threads/block and min number of blocks per SM, // to assist the compiler in register optimisations. diff --git a/ptypy/accelerate/cuda_pycuda/cuda/fill3D.cu b/ptypy/accelerate/cuda_common/fill3D.cu similarity index 95% rename from ptypy/accelerate/cuda_pycuda/cuda/fill3D.cu rename to ptypy/accelerate/cuda_common/fill3D.cu index c3f03d8ca..ddaf6b2bc 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/fill3D.cu +++ b/ptypy/accelerate/cuda_common/fill3D.cu @@ -5,9 +5,7 @@ * - OUT_TYPE: data type for outputs */ -#include -#include -using thrust::complex; +#include "common.cuh" extern "C" __global__ void fill3D( OUT_TYPE* A, diff --git a/ptypy/accelerate/cuda_pycuda/cuda/fill_b.cu b/ptypy/accelerate/cuda_common/fill_b.cu similarity index 100% rename from ptypy/accelerate/cuda_pycuda/cuda/fill_b.cu rename to ptypy/accelerate/cuda_common/fill_b.cu diff --git a/ptypy/accelerate/cuda_pycuda/cuda/fmag_all_update.cu b/ptypy/accelerate/cuda_common/fmag_all_update.cu similarity index 95% rename from ptypy/accelerate/cuda_pycuda/cuda/fmag_all_update.cu rename to ptypy/accelerate/cuda_common/fmag_all_update.cu index f8f695ca5..42d217a67 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/fmag_all_update.cu +++ b/ptypy/accelerate/cuda_common/fmag_all_update.cu @@ -6,10 +6,7 @@ * - MATH_TYPE: the data type used for computation */ -#include -#include -using std::sqrt; -using thrust::complex; +#include "common.cuh" extern "C" __global__ void fmag_all_update(complex* f, const IN_TYPE* fmask, diff --git a/ptypy/accelerate/cuda_pycuda/cuda/fmag_update_nopbound.cu b/ptypy/accelerate/cuda_common/fmag_update_nopbound.cu similarity index 95% rename from ptypy/accelerate/cuda_pycuda/cuda/fmag_update_nopbound.cu rename to ptypy/accelerate/cuda_common/fmag_update_nopbound.cu index 40a65c172..89e65450b 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/fmag_update_nopbound.cu +++ b/ptypy/accelerate/cuda_common/fmag_update_nopbound.cu @@ -6,10 +6,7 @@ * - MATH_TYPE: the data type used for computation */ -#include -#include -using std::sqrt; -using thrust::complex; +#include "common.cuh" extern "C" __global__ void fmag_update_nopbound(complex* f, const IN_TYPE* fmask, diff --git a/ptypy/accelerate/cuda_pycuda/cuda/fourier_deviation.cu b/ptypy/accelerate/cuda_common/fourier_deviation.cu similarity index 93% rename from ptypy/accelerate/cuda_pycuda/cuda/fourier_deviation.cu rename to ptypy/accelerate/cuda_common/fourier_deviation.cu index 3427222c3..1548094e9 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/fourier_deviation.cu +++ 
b/ptypy/accelerate/cuda_common/fourier_deviation.cu @@ -6,12 +6,7 @@ * - MATH_TYPE: the data type used for computation */ -#include -#include -#include -using std::sqrt; -using thrust::abs; -using thrust::complex; +#include "common.cuh" // specify max number of threads/block and min number of blocks per SM, // to assist the compiler in register optimisations. diff --git a/ptypy/accelerate/cuda_pycuda/cuda/fourier_error.cu b/ptypy/accelerate/cuda_common/fourier_error.cu similarity index 93% rename from ptypy/accelerate/cuda_pycuda/cuda/fourier_error.cu rename to ptypy/accelerate/cuda_common/fourier_error.cu index ad483c870..43b4e5208 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/fourier_error.cu +++ b/ptypy/accelerate/cuda_common/fourier_error.cu @@ -7,12 +7,7 @@ */ -#include -#include -#include -using std::sqrt; -using thrust::abs; -using thrust::complex; +#include "common.cuh" // specify max number of threads/block and min number of blocks per SM, // to assist the compiler in register optimisations. diff --git a/ptypy/accelerate/cuda_pycuda/cuda/fourier_error2.cu b/ptypy/accelerate/cuda_common/fourier_error2.cu similarity index 96% rename from ptypy/accelerate/cuda_pycuda/cuda/fourier_error2.cu rename to ptypy/accelerate/cuda_common/fourier_error2.cu index 86dddf549..36a80c377 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/fourier_error2.cu +++ b/ptypy/accelerate/cuda_common/fourier_error2.cu @@ -2,10 +2,7 @@ * the modes. It turned out to run about 2x slower than the one without * shared memory, so it's not used at this stage. */ -#include -#include -#include -using thrust::complex; +#include "common.cuh" extern "C" __global__ void fourier_error2(int nmodes, complex *f, diff --git a/ptypy/accelerate/cuda_pycuda/cuda/fourier_update.cu b/ptypy/accelerate/cuda_common/fourier_update.cu similarity index 97% rename from ptypy/accelerate/cuda_pycuda/cuda/fourier_update.cu rename to ptypy/accelerate/cuda_common/fourier_update.cu index a713c4418..5be874424 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/fourier_update.cu +++ b/ptypy/accelerate/cuda_common/fourier_update.cu @@ -6,10 +6,7 @@ is 2x slower than individual as we have many idle threads here. It is not used at the moment. 
*/ -#include -#include -#include -using thrust::complex; +#include "common.cuh" extern "C" __global__ void fourier_update(int nmodes, complex *f_d, diff --git a/ptypy/accelerate/cuda_pycuda/cuda/full_reduce.cu b/ptypy/accelerate/cuda_common/full_reduce.cu similarity index 97% rename from ptypy/accelerate/cuda_pycuda/cuda/full_reduce.cu rename to ptypy/accelerate/cuda_common/full_reduce.cu index 801204aaa..7f53a4b2e 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/full_reduce.cu +++ b/ptypy/accelerate/cuda_common/full_reduce.cu @@ -7,8 +7,6 @@ */ -#include - extern "C" __global__ void full_reduce(const IN_TYPE* in, OUT_TYPE* out, int size) { assert(gridDim.x == 1); diff --git a/ptypy/accelerate/cuda_pycuda/cuda/gd_main.cu b/ptypy/accelerate/cuda_common/gd_main.cu similarity index 95% rename from ptypy/accelerate/cuda_pycuda/cuda/gd_main.cu rename to ptypy/accelerate/cuda_common/gd_main.cu index 1ab643c4c..461e103ae 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/gd_main.cu +++ b/ptypy/accelerate/cuda_common/gd_main.cu @@ -6,8 +6,7 @@ * - MATH_TYPE: the data type used for computation */ -#include -using thrust::complex; +#include "common.cuh" extern "C" __global__ void gd_main(const IN_TYPE* Imodel, const IN_TYPE* I, diff --git a/ptypy/accelerate/cuda_pycuda/cuda/get_address.cu b/ptypy/accelerate/cuda_common/get_address.cu similarity index 94% rename from ptypy/accelerate/cuda_pycuda/cuda/get_address.cu rename to ptypy/accelerate/cuda_common/get_address.cu index dda9b45f1..4c42d295b 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/get_address.cu +++ b/ptypy/accelerate/cuda_common/get_address.cu @@ -1,6 +1,4 @@ -#include -#include -using thrust::complex; +#include "common.cuh" inline __device__ int minimum(int a, int b) { return a < b ? a : b; } diff --git a/ptypy/accelerate/cuda_pycuda/cuda/intens_renorm.cu b/ptypy/accelerate/cuda_common/intens_renorm.cu similarity index 97% rename from ptypy/accelerate/cuda_pycuda/cuda/intens_renorm.cu rename to ptypy/accelerate/cuda_common/intens_renorm.cu index d0033f7f4..4acd11cf1 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/intens_renorm.cu +++ b/ptypy/accelerate/cuda_common/intens_renorm.cu @@ -6,8 +6,7 @@ * - MATH_TYPE: the data type used for computation */ -#include -using thrust::complex; +#include "common.cuh" extern "C" __global__ void step1(const IN_TYPE* Imodel, const IN_TYPE* I, diff --git a/ptypy/accelerate/cuda_pycuda/cuda/interpolated_shift.cu b/ptypy/accelerate/cuda_common/interpolated_shift.cu similarity index 98% rename from ptypy/accelerate/cuda_pycuda/cuda/interpolated_shift.cu rename to ptypy/accelerate/cuda_common/interpolated_shift.cu index 49db445f7..23acce6f9 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/interpolated_shift.cu +++ b/ptypy/accelerate/cuda_common/interpolated_shift.cu @@ -1,10 +1,4 @@ -#include -#include -#include -#include -#include -#include -using thrust::complex; +#include "common.cuh" __device__ inline complex& ascomplex(float2& f2) { diff --git a/ptypy/accelerate/cuda_pycuda/cuda/log_likelihood.cu b/ptypy/accelerate/cuda_common/log_likelihood.cu similarity index 97% rename from ptypy/accelerate/cuda_pycuda/cuda/log_likelihood.cu rename to ptypy/accelerate/cuda_common/log_likelihood.cu index 075d59f0a..c488f8b69 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/log_likelihood.cu +++ b/ptypy/accelerate/cuda_common/log_likelihood.cu @@ -6,12 +6,7 @@ * - MATH_TYPE: the data type used for computation */ -#include -#include -#include -using std::sqrt; -using thrust::abs; -using thrust::complex; +#include "common.cuh" // 
specify max number of threads/block and min number of blocks per SM, // to assist the compiler in register optimisations. diff --git a/ptypy/accelerate/cuda_pycuda/cuda/make_a012.cu b/ptypy/accelerate/cuda_common/make_a012.cu similarity index 97% rename from ptypy/accelerate/cuda_pycuda/cuda/make_a012.cu rename to ptypy/accelerate/cuda_common/make_a012.cu index 11ba29f62..760b28913 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/make_a012.cu +++ b/ptypy/accelerate/cuda_common/make_a012.cu @@ -7,8 +7,7 @@ * - ACC_TYPE: data type used for accumulation */ -#include -using thrust::complex; +#include "common.cuh" extern "C" __global__ void make_a012(const complex* f, const complex* a, diff --git a/ptypy/accelerate/cuda_pycuda/cuda/make_aux.cu b/ptypy/accelerate/cuda_common/make_aux.cu similarity index 98% rename from ptypy/accelerate/cuda_pycuda/cuda/make_aux.cu rename to ptypy/accelerate/cuda_common/make_aux.cu index b2f64ba1d..fde2f7812 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/make_aux.cu +++ b/ptypy/accelerate/cuda_common/make_aux.cu @@ -6,8 +6,7 @@ * - MATH_TYPE: the data type used for computation */ -#include -using thrust::complex; +#include "common.cuh" // core calculation function - used by both kernels and inlined inline __device__ complex calculate( diff --git a/ptypy/accelerate/cuda_pycuda/cuda/make_exit.cu b/ptypy/accelerate/cuda_common/make_exit.cu similarity index 97% rename from ptypy/accelerate/cuda_pycuda/cuda/make_exit.cu rename to ptypy/accelerate/cuda_common/make_exit.cu index 956b292dc..e8613da10 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/make_exit.cu +++ b/ptypy/accelerate/cuda_common/make_exit.cu @@ -7,8 +7,7 @@ */ -#include -using thrust::complex; +#include "common.cuh" template __device__ inline void atomicAdd(complex* x, complex y) diff --git a/ptypy/accelerate/cuda_pycuda/cuda/make_model.cu b/ptypy/accelerate/cuda_common/make_model.cu similarity index 93% rename from ptypy/accelerate/cuda_pycuda/cuda/make_model.cu rename to ptypy/accelerate/cuda_common/make_model.cu index 22bf7d4ab..727388d65 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/make_model.cu +++ b/ptypy/accelerate/cuda_common/make_model.cu @@ -6,8 +6,7 @@ * - MATH_TYPE: the data type used for computation */ -#include -using thrust::complex; +#include "common.cuh" extern "C" __global__ void make_model( const complex* in, OUT_TYPE* out, int z, int y, int x) diff --git a/ptypy/accelerate/cuda_pycuda/cuda/mass_center.cu b/ptypy/accelerate/cuda_common/mass_center.cu similarity index 100% rename from ptypy/accelerate/cuda_pycuda/cuda/mass_center.cu rename to ptypy/accelerate/cuda_common/mass_center.cu diff --git a/ptypy/accelerate/cuda_pycuda/cuda/max_abs2.cu b/ptypy/accelerate/cuda_common/max_abs2.cu similarity index 96% rename from ptypy/accelerate/cuda_pycuda/cuda/max_abs2.cu rename to ptypy/accelerate/cuda_common/max_abs2.cu index 4da8efb3e..1780bc268 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/max_abs2.cu +++ b/ptypy/accelerate/cuda_common/max_abs2.cu @@ -5,10 +5,7 @@ * - IN_TYPE: can be float/double or complex/complex */ -#include -#include -using thrust::complex; -using thrust::norm; +#include "common.cuh" inline __device__ OUT_TYPE norm(const float& in) { return in*in; diff --git a/ptypy/accelerate/cuda_pycuda/cuda/ob_norm_local.cu b/ptypy/accelerate/cuda_common/ob_norm_local.cu similarity index 93% rename from ptypy/accelerate/cuda_pycuda/cuda/ob_norm_local.cu rename to ptypy/accelerate/cuda_common/ob_norm_local.cu index 3969ea6e9..9d14cae6d 100644 --- 
a/ptypy/accelerate/cuda_pycuda/cuda/ob_norm_local.cu +++ b/ptypy/accelerate/cuda_common/ob_norm_local.cu @@ -6,12 +6,7 @@ * - MATH_TYPE: the data type used for computation */ -#include -#include -#include -using std::sqrt; -using thrust::abs; -using thrust::complex; +#include "common.cuh" // specify max number of threads/block and min number of blocks per SM, // to assist the compiler in register optimisations. diff --git a/ptypy/accelerate/cuda_pycuda/cuda/ob_update.cu b/ptypy/accelerate/cuda_common/ob_update.cu similarity index 97% rename from ptypy/accelerate/cuda_pycuda/cuda/ob_update.cu rename to ptypy/accelerate/cuda_common/ob_update.cu index 29b993fb0..7bf8dddd9 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/ob_update.cu +++ b/ptypy/accelerate/cuda_common/ob_update.cu @@ -6,8 +6,7 @@ * - MATH_TYPE: the data type used for computation */ -#include -using thrust::complex; +#include "common.cuh" template __device__ inline void atomicAdd(complex* x, const complex& y) diff --git a/ptypy/accelerate/cuda_pycuda/cuda/ob_update2.cu b/ptypy/accelerate/cuda_common/ob_update2.cu similarity index 98% rename from ptypy/accelerate/cuda_pycuda/cuda/ob_update2.cu rename to ptypy/accelerate/cuda_common/ob_update2.cu index 821c04a6d..1e9717b81 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/ob_update2.cu +++ b/ptypy/accelerate/cuda_common/ob_update2.cu @@ -15,9 +15,7 @@ */ -#include -#include -using thrust::complex; +#include "common.cuh" #define pr_dlayer(k) addr[(k)] #define ex_dlayer(k) addr[6 * num_pods + (k)] diff --git a/ptypy/accelerate/cuda_pycuda/cuda/ob_update2_ML.cu b/ptypy/accelerate/cuda_common/ob_update2_ML.cu similarity index 98% rename from ptypy/accelerate/cuda_pycuda/cuda/ob_update2_ML.cu rename to ptypy/accelerate/cuda_common/ob_update2_ML.cu index b62e66006..8840457c0 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/ob_update2_ML.cu +++ b/ptypy/accelerate/cuda_common/ob_update2_ML.cu @@ -15,9 +15,7 @@ */ -#include -#include -using thrust::complex; +#include "common.cuh" #define pr_dlayer(k) addr[(k)] #define ex_dlayer(k) addr[6 * num_pods + (k)] diff --git a/ptypy/accelerate/cuda_pycuda/cuda/ob_update_ML.cu b/ptypy/accelerate/cuda_common/ob_update_ML.cu similarity index 97% rename from ptypy/accelerate/cuda_pycuda/cuda/ob_update_ML.cu rename to ptypy/accelerate/cuda_common/ob_update_ML.cu index 84e678ebb..3a20024f9 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/ob_update_ML.cu +++ b/ptypy/accelerate/cuda_common/ob_update_ML.cu @@ -6,8 +6,7 @@ * - MATH_TYPE: the data type used for computation */ -#include -using thrust::complex; +#include "common.cuh" template __device__ inline void atomicAdd(complex* x, const complex& y) diff --git a/ptypy/accelerate/cuda_pycuda/cuda/ob_update_local.cu b/ptypy/accelerate/cuda_common/ob_update_local.cu similarity index 97% rename from ptypy/accelerate/cuda_pycuda/cuda/ob_update_local.cu rename to ptypy/accelerate/cuda_common/ob_update_local.cu index b3a955868..9ff5b73e6 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/ob_update_local.cu +++ b/ptypy/accelerate/cuda_common/ob_update_local.cu @@ -6,8 +6,7 @@ * - MATH_TYPE: the data type used for computation */ -#include -using thrust::complex; +#include "common.cuh" template __device__ inline void atomicAdd(complex* x, const complex& y) diff --git a/ptypy/accelerate/cuda_common/ob_update_wasp.cu b/ptypy/accelerate/cuda_common/ob_update_wasp.cu new file mode 100644 index 000000000..7127cc287 --- /dev/null +++ b/ptypy/accelerate/cuda_common/ob_update_wasp.cu @@ -0,0 +1,89 @@ +/** ob_update_wasp - in WASP 
algorithm. + * + * Data types: + * - IN_TYPE: the data type for the inputs (float or double) + * - OUT_TYPE: the data type for the outputs (float or double) + * - MATH_TYPE: the data type used for computation + */ + +#include "common.cuh" + +template +__device__ inline void atomicAdd(complex* x, const complex& y) +{ + auto xf = reinterpret_cast(x); + atomicAdd(xf, y.real()); + atomicAdd(xf + 1, y.imag()); +} + +extern "C" __global__ void ob_update_wasp( + const complex* __restrict__ exit_wave, + const complex* __restrict__ aux, + int A, + int B, + int C, + const complex* __restrict__ probe, + const IN_TYPE* __restrict__ probe_abs2, + int D, + int E, + int F, + complex* obj, + complex* obj_sum_nmr, + OUT_TYPE* obj_sum_dnm, + int G, + int H, + int I, + const int* __restrict__ addr, + const IN_TYPE* __restrict__ probe_abs2_mean, + const IN_TYPE alpha_) +{ + const int bid = blockIdx.z; + const int tx = threadIdx.x; + const int b = threadIdx.y + blockIdx.y * blockDim.y; + if (b >= B) + return; + const int addr_stride = 15; + + const int* oa = addr + 3 + bid * addr_stride; + const int* pa = addr + bid * addr_stride; + const int* ea = addr + 6 + bid * addr_stride; + + probe += pa[0] * E * F + pa[1] * F + pa[2]; + probe_abs2 += pa[0] * E * F + pa[1] * F + pa[2]; + obj += oa[0] * H * I + oa[1] * I + oa[2]; + obj_sum_nmr += oa[0] * H * I + oa[1] * I + oa[2]; + obj_sum_dnm += oa[0] * H * I + oa[1] * I + oa[2]; + aux += bid * B * C; + /*the abs2 mean of this probe mode*/ + const MATH_TYPE probe_abs2_mean_val = probe_abs2_mean[pa[0]]; + const MATH_TYPE alpha = alpha_; + + assert(oa[0] * H * I + oa[1] * I + oa[2] + (B - 1) * I + C - 1 < G * H * I); + + exit_wave += ea[0] * B * C; + + for (int c = tx; c < C; c += blockDim.x) + { + complex probe_val = probe[b * F + c]; + MATH_TYPE probe_abs2_val = probe_abs2[b * F + c]; + complex exit_val = exit_wave[b * C + c]; + complex aux_val = aux[b * C + c]; + + /*(pr_abs2.mean() * alpha + pr_abs2)*/ + MATH_TYPE norm_val = probe_abs2_mean_val * alpha + probe_abs2_val; + + /*0.5 * pr_conj * deltaEW / (pr_abs2.mean() * alpha + pr_abs2)*/ + auto add_val_0 = MATH_TYPE(0.5) * conj(probe_val) * (exit_val - aux_val) / norm_val; + complex add_val = add_val_0; + atomicAdd(&obj[b * I + c], add_val); + + /*pr_conj * ex[exc[0], exc[1]:exc[1] + rows, exc[2]:exc[2] + cols]*/ + auto add_val_1 = conj(probe_val) * exit_val; + complex add_val_nmr = add_val_1; + atomicAdd(&obj_sum_nmr[b * I + c], add_val_nmr); + + /*pr_abs2*/ + OUT_TYPE add_val_dnm = probe_abs2_val; + atomicAdd(&obj_sum_dnm[b * I + c], add_val_dnm); + } +} diff --git a/ptypy/accelerate/cuda_pycuda/cuda/pr_norm_local.cu b/ptypy/accelerate/cuda_common/pr_norm_local.cu similarity index 93% rename from ptypy/accelerate/cuda_pycuda/cuda/pr_norm_local.cu rename to ptypy/accelerate/cuda_common/pr_norm_local.cu index 6e9a8ea76..a89e8f842 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/pr_norm_local.cu +++ b/ptypy/accelerate/cuda_common/pr_norm_local.cu @@ -6,12 +6,7 @@ * - MATH_TYPE: the data type used for computation */ -#include -#include -#include -using std::sqrt; -using thrust::abs; -using thrust::complex; +#include "common.cuh" // specify max number of threads/block and min number of blocks per SM, // to assist the compiler in register optimisations. 
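For orientation between the renamed kernels above and the new WASP kernels in this patch: the update that ``ob_update_wasp.cu`` performs per scan position can be sketched in NumPy as below. This mirrors the serial reference ``ob_update_wasp``/``avg_wasp`` added to ``ptypy/accelerate/base/kernels.py`` earlier in the diff; the array names and the helper functions themselves are hypothetical and not part of the patch. ::

    import numpy as np

    def ob_update_wasp_view(ob, pr, ex, aux, ob_nmr, ob_dnm, alpha=1.0):
        # probe-weighted, per-view object update (cf. ob_update_wasp.cu)
        pr_conj = pr.conj()
        pr_abs2 = np.abs(pr) ** 2
        deltaEW = ex - aux  # exit wave minus auxiliary wave
        ob += 0.5 * pr_conj * deltaEW / (pr_abs2.mean() * alpha + pr_abs2)
        # accumulators combined later by avg_wasp
        ob_nmr += pr_conj * ex
        ob_dnm += pr_abs2
        return ob, ob_nmr, ob_dnm

    def avg_wasp(arr, nmr, dnm):
        # where the denominator is zero, fall back to the numerator
        arr[:] = np.where(np.isclose(dnm, 0), nmr, nmr / dnm)

The probe update in ``pr_update_wasp.cu`` (next file below) follows the same pattern with the roles of object and probe swapped and a constant ``beta`` in the denominator instead of the mean-scaled ``alpha`` term.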
diff --git a/ptypy/accelerate/cuda_pycuda/cuda/pr_update.cu b/ptypy/accelerate/cuda_common/pr_update.cu similarity index 97% rename from ptypy/accelerate/cuda_pycuda/cuda/pr_update.cu rename to ptypy/accelerate/cuda_common/pr_update.cu index 180cf8f14..d7739a569 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/pr_update.cu +++ b/ptypy/accelerate/cuda_common/pr_update.cu @@ -6,8 +6,7 @@ * - MATH_TYPE: the data type used for computation */ -#include -using thrust::complex; +#include "common.cuh" template __device__ inline void atomicAdd(complex* x, const complex& y) diff --git a/ptypy/accelerate/cuda_pycuda/cuda/pr_update2.cu b/ptypy/accelerate/cuda_common/pr_update2.cu similarity index 98% rename from ptypy/accelerate/cuda_pycuda/cuda/pr_update2.cu rename to ptypy/accelerate/cuda_common/pr_update2.cu index e5417cc01..09913bc02 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/pr_update2.cu +++ b/ptypy/accelerate/cuda_common/pr_update2.cu @@ -14,9 +14,7 @@ * and the kernel will get considerably slower. */ -#include -#include -using thrust::complex; +#include "common.cuh" #define pr_dlayer(k) addr[(k)] #define pr_roi_row(k) addr[1 * num_pods + (k)] diff --git a/ptypy/accelerate/cuda_pycuda/cuda/pr_update2_ML.cu b/ptypy/accelerate/cuda_common/pr_update2_ML.cu similarity index 98% rename from ptypy/accelerate/cuda_pycuda/cuda/pr_update2_ML.cu rename to ptypy/accelerate/cuda_common/pr_update2_ML.cu index 8a45891c5..167610ea6 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/pr_update2_ML.cu +++ b/ptypy/accelerate/cuda_common/pr_update2_ML.cu @@ -14,9 +14,7 @@ * and the kernel will get considerably slower. */ -#include -#include -using thrust::complex; +#include "common.cuh" #define pr_dlayer(k) addr[(k)] #define pr_roi_row(k) addr[1 * num_pods + (k)] diff --git a/ptypy/accelerate/cuda_pycuda/cuda/pr_update_ML.cu b/ptypy/accelerate/cuda_common/pr_update_ML.cu similarity index 97% rename from ptypy/accelerate/cuda_pycuda/cuda/pr_update_ML.cu rename to ptypy/accelerate/cuda_common/pr_update_ML.cu index 3fa24137d..ad32dfe8a 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/pr_update_ML.cu +++ b/ptypy/accelerate/cuda_common/pr_update_ML.cu @@ -7,8 +7,7 @@ */ -#include -using thrust::complex; +#include "common.cuh" template __device__ inline void atomicAdd(complex* x, const complex& y) diff --git a/ptypy/accelerate/cuda_pycuda/cuda/pr_update_local.cu b/ptypy/accelerate/cuda_common/pr_update_local.cu similarity index 97% rename from ptypy/accelerate/cuda_pycuda/cuda/pr_update_local.cu rename to ptypy/accelerate/cuda_common/pr_update_local.cu index d515afd55..cf221aadd 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/pr_update_local.cu +++ b/ptypy/accelerate/cuda_common/pr_update_local.cu @@ -7,8 +7,7 @@ * - ACC_TYPE: data type used in norm calculation (input here) */ -#include -using thrust::complex; +#include "common.cuh" template __device__ inline void atomicAdd(complex* x, const complex& y) diff --git a/ptypy/accelerate/cuda_common/pr_update_wasp.cu b/ptypy/accelerate/cuda_common/pr_update_wasp.cu new file mode 100644 index 000000000..04c2a08e5 --- /dev/null +++ b/ptypy/accelerate/cuda_common/pr_update_wasp.cu @@ -0,0 +1,83 @@ +/** pr_update_wasp - in WASP algorithm. 
+ * + * Data types: + * - IN_TYPE: the data type for the inputs (float or double) + * - OUT_TYPE: the data type for the outputs (float or double) + * - MATH_TYPE: the data type used for computation + */ + +#include "common.cuh" + +template +__device__ inline void atomicAdd(complex* x, const complex& y) +{ + auto xf = reinterpret_cast(x); + atomicAdd(xf, y.real()); + atomicAdd(xf + 1, y.imag()); +} + +extern "C" __global__ void pr_update_wasp( + const complex* __restrict__ exit_wave, + const complex* __restrict__ aux, + int A, + int B, + int C, + complex* probe, + complex* probe_sum_nmr, + OUT_TYPE* probe_sum_dnm, + int D, + int E, + int F, + const complex* __restrict__ obj, + int G, + int H, + int I, + const int* __restrict__ addr, + const IN_TYPE beta_) +{ + assert(B == E); // prsh[1] + assert(C == F); // prsh[2] + const int bid = blockIdx.z; + const int tx = threadIdx.x; + const int b = threadIdx.y + blockIdx.y * blockDim.y; + if (b >= B) + return; + const int addr_stride = 15; + + const int* oa = addr + 3 + bid * addr_stride; + const int* pa = addr + bid * addr_stride; + const int* ea = addr + 6 + bid * addr_stride; + + probe += pa[0] * E * F + pa[1] * F + pa[2]; + probe_sum_nmr += pa[0] * E * F + pa[1] * F + pa[2]; + probe_sum_dnm += pa[0] * E * F + pa[1] * F + pa[2]; + obj += oa[0] * H * I + oa[1] * I + oa[2]; + aux += bid * B * C; + const MATH_TYPE beta = beta_; + + assert(oa[0] * H * I + oa[1] * I + oa[2] + (B - 1) * I + C - 1 < G * H * I); + + exit_wave += ea[0] * B * C; + + for (int c = tx; c < C; c += blockDim.x) + { + complex obj_val = obj[b * I + c]; + MATH_TYPE obj_abs2_val = obj_val.real() * obj_val.real() + obj_val.imag() * obj_val.imag(); + complex exit_val = exit_wave[b * C + c]; + complex aux_val = aux[b * C + c]; + + /*ob_conj * deltaEW / (beta + ob_abs2)*/ + auto add_val_0 = conj(obj_val) * (exit_val - aux_val) / (beta + obj_abs2_val); + complex add_val = add_val_0; + atomicAdd(&probe[b * F + c], add_val); + + /*ob_conj * ex[exc[0], exc[1]:exc[1] + rows, exc[2]:exc[2] + cols]*/ + auto add_val_1 = conj(obj_val) * exit_val; + complex add_val_nmr = add_val_1; + atomicAdd(&probe_sum_nmr[b * F + c], add_val_nmr); + + /*ob_abs2*/ + OUT_TYPE add_val_dnm = obj_abs2_val; + atomicAdd(&probe_sum_dnm[b * F + c], add_val_dnm); + } +} diff --git a/ptypy/accelerate/cuda_pycuda/cuda/transpose.cu b/ptypy/accelerate/cuda_common/transpose.cu similarity index 96% rename from ptypy/accelerate/cuda_pycuda/cuda/transpose.cu rename to ptypy/accelerate/cuda_common/transpose.cu index 8de4e7ad7..f00be8937 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/transpose.cu +++ b/ptypy/accelerate/cuda_common/transpose.cu @@ -10,8 +10,7 @@ * - DTYPE - any pod type */ -#include -using thrust::complex; +#include "common.cuh" extern "C" __global__ void transpose(const DTYPE* idata, DTYPE* odata, diff --git a/ptypy/accelerate/cuda_pycuda/cuda/update_addr_error_state.cu b/ptypy/accelerate/cuda_common/update_addr_error_state.cu similarity index 94% rename from ptypy/accelerate/cuda_pycuda/cuda/update_addr_error_state.cu rename to ptypy/accelerate/cuda_common/update_addr_error_state.cu index 1220a0986..e4045a38b 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/update_addr_error_state.cu +++ b/ptypy/accelerate/cuda_common/update_addr_error_state.cu @@ -5,9 +5,7 @@ * - OUT_TYPE: the data type for the outputs (float or double) */ -#include -#include -using thrust::complex; +#include "common.cuh" extern "C" __global__ void update_addr_error_state(int* __restrict addr, const int* __restrict mangled_addr, diff --git 
a/ptypy/accelerate/cuda_common/utils.py b/ptypy/accelerate/cuda_common/utils.py new file mode 100644 index 000000000..a953bfdb4 --- /dev/null +++ b/ptypy/accelerate/cuda_common/utils.py @@ -0,0 +1,18 @@ +import numpy as np + +# maps a numpy dtype to the corresponding C type +def map2ctype(dt): + if dt == np.float32: + return 'float' + elif dt == np.float64: + return 'double' + elif dt == np.complex64: + return 'complex' + elif dt == np.complex128: + return 'complex' + elif dt == np.int32: + return 'int' + elif dt == np.int64: + return 'long long' + else: + raise ValueError('No mapping for {}'.format(dt)) diff --git a/ptypy/accelerate/cuda_cupy/__init__.py b/ptypy/accelerate/cuda_cupy/__init__.py new file mode 100644 index 000000000..717878241 --- /dev/null +++ b/ptypy/accelerate/cuda_cupy/__init__.py @@ -0,0 +1,73 @@ + +from typing import Optional +import cupy as cp +import os + +from ptypy.utils.verbose import headerline, log + +kernel_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'cuda_common')) +compile_options =['-std=c++14', '-DPTYPY_CUPY_NVTRC=1', '-I' + kernel_dir, '-DNDEBUG'] +queue = None +device = None + + +def get_context(new_queue=False): + + from ptypy.utils import parallel + + global queue, device + + if queue is None or new_queue: + ndevs = cp.cuda.runtime.getDeviceCount() + if parallel.rank_local >= ndevs: + raise Exception('Local rank must be smaller than total device count, \ + rank={}, rank_local={}, device_count={}'.format( + parallel.rank, parallel.rank_local, ndevs + )) + device = cp.cuda.Device(parallel.rank_local) + device.use() + queue = cp.cuda.Stream() + + return queue + + +def load_kernel(name, subs={}, file=None, options=None): + + if file is None: + if isinstance(name, str): + fn = "%s/%s.cu" % (kernel_dir, name) + else: + raise ValueError( + "name parameter must be a string if not filename is given") + else: + fn = "%s/%s" % (kernel_dir, file) + + with open(fn, 'r') as f: + kernel = f.read() + for k, v in list(subs.items()): + kernel = kernel.replace(k, str(v)) + # insert a preprocessor line directive to assist compiler errors + escaped = fn.replace("\\", "\\\\") + kernel = '#line 1 "{}"\n'.format(escaped) + kernel + + opt = [*compile_options] + if options is not None: + opt += list(options) + module = cp.RawModule(code=kernel, options=tuple(opt)) + if isinstance(name, str): + return module.get_function(name) + else: # tuple + return tuple(module.get_function(n) for n in name) + +def log_device_memory_stats(level=4, heading: str ='Device Memory Stats'): + mempool = cp.get_default_memory_pool() + pinned_pool = cp.get_default_pinned_memory_pool() + log(level, '\n' + headerline(heading)) + log(level, f'Device id : {cp.cuda.Device().id}') + log(level, f'Total device mem : {cp.cuda.runtime.memGetInfo()[1]/1024/1024} MB') + log(level, f'Free device mem : {cp.cuda.runtime.memGetInfo()[0]/1024/1024} MB') + log(level, f'MemoryPool size : {mempool.total_bytes()/1024/1024} MB') + log(level, f'MemoryPool used : {mempool.used_bytes()/1024/1024} MB') + log(level, f'MemoryPool limit : {mempool.get_limit()/1024/1024} MB') + log(level, f'MemoryPool free blocks: {mempool.n_free_blocks()}') + log(level, f'PinnedPool free blocks: {pinned_pool.n_free_blocks()}') \ No newline at end of file diff --git a/ptypy/accelerate/cuda_cupy/address_manglers.py b/ptypy/accelerate/cuda_cupy/address_manglers.py new file mode 100644 index 000000000..ae4eeadbe --- /dev/null +++ b/ptypy/accelerate/cuda_cupy/address_manglers.py @@ -0,0 +1,81 @@ +from . 
import load_kernel +import numpy as np +from ptypy.accelerate.base import address_manglers as npam +import cupy as cp + + +class BaseMangler(npam.BaseMangler): + + def __init__(self, *args, queue_thread=None, **kwargs): + super().__init__(*args, **kwargs) + self.queue = queue_thread + self.get_address_cuda = load_kernel("get_address") + self.delta = None + self.delta_gpu = None + + def _setup_delta_gpu(self): + if self.queue is not None: + self.queue.use() + assert self.delta is not None, "Setup delta using the setup_shifts method first" + self.delta = np.ascontiguousarray(self.delta, dtype=np.int32) + + if self.delta_gpu is None or self.delta_gpu.shape[0] < self.delta.shape[0]: + self.delta_gpu = cp.empty(self.delta.shape, dtype=np.int32) + # in case self.delta is smaller than delta_gpu, this will only copy the + # relevant part + cp.cuda.runtime.memcpy(dst=self.delta_gpu.data.ptr, + src=self.delta.ctypes.data, + size=self.delta.size * self.delta.itemsize, + kind=1) # host to device + + + def get_address(self, index, addr_current, mangled_addr, max_oby, max_obx): + assert addr_current.dtype == np.int32, "addresses must be int32" + assert mangled_addr.dtype == np.int32, "addresses must be int32" + assert len(addr_current.shape) == 4, "addresses must be 4 dimensions" + assert addr_current.shape == mangled_addr.shape, "output addresses must be pre-allocated" + assert self.delta_gpu is not None, "Deltas are not set yet - call setup_shifts first" + assert index < self.delta_gpu.shape[0], "Index out of range for deltas" + assert isinstance(self.delta_gpu, cp.ndarray), "Only GPU arrays are supported for delta" + + if self.queue is not None: + self.queue.use() + + # only using a single thread block here as it's not enough work + # otherwise + self.get_address_cuda( + (1, 1, 1), + (64, 1, 1), + (addr_current, + mangled_addr, + np.int32(addr_current.shape[0] * addr_current.shape[1]), + self.delta_gpu[index,None], + np.int32(max_oby), + np.int32(max_obx))) + +# with multiple inheritance, we have to be explicit which super class +# we are calling in the methods +class RandomIntMangler(BaseMangler, npam.RandomIntMangler): + + def __init__(self, *args, **kwargs): + BaseMangler.__init__(self, *args, **kwargs) + + def setup_shifts(self, *args, **kwargs): + npam.RandomIntMangler.setup_shifts(self, *args, **kwargs) + self._setup_delta_gpu() + + def get_address(self, *args, **kwargs): + BaseMangler.get_address(self, *args, **kwargs) + + +class GridSearchMangler(BaseMangler, npam.GridSearchMangler): + + def __init__(self, *args, **kwargs): + BaseMangler.__init__(self, *args, **kwargs) + + def setup_shifts(self, *args, **kwargs): + npam.GridSearchMangler.setup_shifts(self, *args, **kwargs) + self._setup_delta_gpu() + + def get_address(self, *args, **kwargs): + BaseMangler.get_address(self, *args, **kwargs) \ No newline at end of file diff --git a/ptypy/accelerate/cuda_cupy/array_utils.py b/ptypy/accelerate/cuda_cupy/array_utils.py new file mode 100644 index 000000000..9c68d9431 --- /dev/null +++ b/ptypy/accelerate/cuda_cupy/array_utils.py @@ -0,0 +1,670 @@ +import cupy as cp +import numpy as np + +from ptypy.accelerate.cuda_common.utils import map2ctype +from ptypy.utils.math_utils import gaussian +from . 
import load_kernel + + +class ArrayUtilsKernel: + def __init__(self, acc_dtype=cp.float64, queue=None): + self.queue = queue + self.acc_dtype = acc_dtype + # Note: cupy's ReductionKernel is far less efficient + self.cdot_cuda = load_kernel("dot", { + 'IN_TYPE': 'complex', + 'ACC_TYPE': 'double' if acc_dtype == np.float64 else 'float' + }) + self.dot_cuda = load_kernel("dot", { + 'IN_TYPE': 'float', + 'ACC_TYPE': 'double' if acc_dtype == np.float64 else 'float' + }) + self.full_reduce_cuda = load_kernel("full_reduce", { + 'IN_TYPE': 'double' if acc_dtype == np.float64 else 'float', + 'OUT_TYPE': 'double' if acc_dtype == np.float64 else 'float', + 'ACC_TYPE': 'double' if acc_dtype == np.float64 else 'float', + 'BDIM_X': 1024 + }) + self.Ctmp = None + + def dot(self, A: cp.ndarray, B: cp.ndarray, out: cp.ndarray = None) -> cp.ndarray: + assert A.dtype == B.dtype, "Input arrays must be of same data type" + assert A.size == B.size, "Input arrays must be of the same size" + + if self.queue is not None: + self.queue.use() + if out is None: + out = cp.empty(1, dtype=self.acc_dtype) + + block = (1024, 1, 1) + grid = (int((B.size + 1023) // 1024), 1, 1) + if self.acc_dtype == np.float32: + elsize = 4 + elif self.acc_dtype == np.float64: + elsize = 8 + if self.Ctmp is None or self.Ctmp.size < grid[0]: + self.Ctmp = cp.zeros((grid[0],), dtype=self.acc_dtype) + Ctmp = self.Ctmp + if grid[0] == 1: + Ctmp = out + if np.iscomplexobj(B): + self.cdot_cuda(grid, block, (A, B, np.int32(A.size), Ctmp), + shared_mem=1024 * elsize) + else: + self.dot_cuda(grid, block, (A, B, np.int32(A.size), Ctmp), + shared_mem=1024 * elsize) + if grid[0] > 1: + self.full_reduce_cuda((1, 1, 1), (1024, 1, 1), (self.Ctmp, out, np.int32(grid[0])), + shared_mem=elsize*1024) + + return out + + def norm2(self, A, out=None): + return self.dot(A, A, out) + + +class TransposeKernel: + + def __init__(self, queue=None): + self.queue = queue + self.transpose_cuda = load_kernel("transpose", { + 'DTYPE': 'int', + 'BDIM': 16 + }) + + def transpose(self, input, output): + # only for int at the moment (addr array), and 2D (reshape pls) + if len(input.shape) != 2: + raise ValueError( + "Only 2D tranpose is supported - reshape as desired") + if input.shape[0] != output.shape[1] or input.shape[1] != output.shape[0]: + raise ValueError("Input/Output must be of flipped shape") + if input.dtype != np.int32 or output.dtype != np.int32: + raise ValueError("Only int types are supported at the moment") + + width = input.shape[1] + height = input.shape[0] + blk = (16, 16, 1) + grd = ( + int((input.shape[1] + 15) // 16), + int((input.shape[0] + 15) // 16), + 1 + ) + if self.queue is not None: + self.queue.use() + self.transpose_cuda( + grd, blk, (input, output, np.int32(width), np.int32(height))) + + +class MaxAbs2Kernel: + + def __init__(self, queue=None): + self.queue = queue + # we lazy-load this depending on the data types we get + self.max_abs2_cuda = {} + + def max_abs2(self, X: cp.ndarray, out: cp.ndarray): + """ Calculate max(abs(x)**2) across the final 2 dimensions""" + rows = np.int32(X.shape[-2]) + cols = np.int32(X.shape[-1]) + firstdims = np.int32(np.prod(X.shape[:-2])) + gy = int(rows) + # lazy-loading, keeping scratch memory and both kernels in the same dictionary + bx = int(64) + version = '{},{},{}'.format( + map2ctype(X.dtype), map2ctype(out.dtype), gy) + if version not in self.max_abs2_cuda: + step1, step2 = load_kernel( + ("max_abs2_step1", "max_abs2_step2"), + { + 'IN_TYPE': map2ctype(X.dtype), + 'OUT_TYPE': map2ctype(out.dtype), + 
'BDIM_X': bx, + }, "max_abs2.cu") + self.max_abs2_cuda[version] = { + 'step1': step1, + 'step2': step2, + 'scratchmem': cp.empty((gy,), dtype=out.dtype) + } + + # if self.max_abs2_cuda[version]['scratchmem'] is None \ + # or self.max_abs2_cuda[version]['scratchmem'].shape[0] != gy: + # self.max_abs2_cuda[version]['scratchmem'] = + scratch = self.max_abs2_cuda[version]['scratchmem'] + + if self.queue is not None: + self.queue.use() + + self.max_abs2_cuda[version]['step1']( + (1, gy, 1), (bx, 1, 1), (X, firstdims, rows, cols, scratch)) + self.max_abs2_cuda[version]['step2']( + (1, 1, 1), (bx, 1, 1), (scratch, np.int32(gy), out)) + + +class CropPadKernel: + + def __init__(self, queue=None): + self.queue = queue + # we lazy-load this depending on the data types we get + self.fill3D_cuda = {} + + def fill3D(self, A, B, offset=[0, 0, 0]): + """ + Fill 3-dimensional array A with B. + """ + if A.ndim < 3 or B.ndim < 3: + raise ValueError('Input arrays must each be at least 3D') + assert A.ndim == B.ndim, "Input and Output must have the same number of dimensions." + ash = A.shape + bsh = B.shape + misfit = np.array(bsh) - np.array(ash) + assert not misfit[:-3].any( + ), "Input and Output must have the same shape everywhere but the last three axes." + + Alim = np.array(A.shape[-3:]) + Blim = np.array(B.shape[-3:]) + off = np.array(offset) + Ao = off.copy() + Ao[Ao < 0] = 0 + Bo = -off.copy() + Bo[Bo < 0] = 0 + assert (Bo < Blim).all() and (Ao < Alim).all( + ), "At least one dimension lacks overlap" + Ao = Ao.astype(np.int32) + Bo = Bo.astype(np.int32) + lengths = np.array([ + min(off[0] + Blim[0], Alim[0]) - Ao[0], + min(off[1] + Blim[1], Alim[1]) - Ao[1], + min(off[2] + Blim[2], Alim[2]) - Ao[2], + ], dtype=np.int32) + lengths2 = np.array([ + min(Alim[0] - off[0], Blim[0]) - Bo[0], + min(Alim[1] - off[1], Blim[1]) - Bo[1], + min(Alim[2] - off[2], Blim[2]) - Bo[2], + ], dtype=np.int32) + assert (lengths == lengths2).all( + ), "left and right lenghts are not matching" + batch = int(np.prod(A.shape[:-3])) + + # lazy loading depending on data type + version = '{},{}'.format(map2ctype(B.dtype), map2ctype(A.dtype)) + if version not in self.fill3D_cuda: + self.fill3D_cuda[version] = load_kernel("fill3D", { + 'IN_TYPE': map2ctype(B.dtype), + 'OUT_TYPE': map2ctype(A.dtype) + }) + bx = by = 32 + if self.queue is not None: + self.queue.use() + self.fill3D_cuda[version]( + (int((lengths[2] + bx - 1)//bx), + int((lengths[1] + by - 1)//by), + int(batch)), + (int(bx), int(by), int(1)), + (A, B, + np.int32(A.shape[-3]), np.int32(A.shape[-2] + ), np.int32(A.shape[-1]), + np.int32(B.shape[-3]), np.int32(B.shape[-2] + ), np.int32(B.shape[-1]), + Ao[0], Ao[1], Ao[2], + Bo[0], Bo[1], Bo[2], + lengths[0], lengths[1], lengths[2]) + ) + + def crop_pad_2d_simple(self, A, B): + """ + Places B in A centered around the last two axis. A and B must be of the same shape + anywhere but the last two dims. + """ + assert A.ndim >= 2, "Arrays must have more than 2 dimensions." + assert A.ndim == B.ndim, "Input and Output must have the same number of dimensions." + misfit = np.array(A.shape) - np.array(B.shape) + assert not misfit[:-2].any( + ), "Input and Output must have the same shape everywhere but the last two axes." 
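+        # Note: 2D inputs are promoted to a leading batch axis of size 1 so that
+        # fill3D can be reused, and the offset below centres B inside A on the
+        # last two axes. Illustrative example (hypothetical shapes): for A of
+        # shape (256, 256) and B of shape (128, 128) the offset evaluates to
+        # [0, 64, 64].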
+ if A.ndim == 2: + A = A.reshape((1,) + A.shape) + if B.ndim == 2: + B = B.reshape((1,) + B.shape) + a1, a2 = A.shape[-2:] + b1, b2 = B.shape[-2:] + offset = [0, a1 // 2 - b1 // 2, a2 // 2 - b2 // 2] + self.fill3D(A, B, offset) + + +class DerivativesKernel: + def __init__(self, dtype, queue=None): + if dtype == np.float32: + stype = "float" + elif dtype == np.complex64: + stype = "complex" + else: + raise NotImplementedError( + "delxf is only implemented for float32 and complex64") + + self.queue = queue + self.dtype = dtype + self.last_axis_block = (256, 4, 1) + self.mid_axis_block = (256, 4, 1) + + self.delxf_last, self.delxf_mid = load_kernel( + ("delx_last", "delx_mid"), + file="delx.cu", + subs={ + 'IS_FORWARD': 'true', + 'BDIM_X': str(self.last_axis_block[0]), + 'BDIM_Y': str(self.last_axis_block[1]), + 'IN_TYPE': stype, + 'OUT_TYPE': stype + }) + self.delxb_last, self.delxb_mid = load_kernel( + ("delx_last", "delx_mid"), + file="delx.cu", + subs={ + 'IS_FORWARD': 'false', + 'BDIM_X': str(self.last_axis_block[0]), + 'BDIM_Y': str(self.last_axis_block[1]), + 'IN_TYPE': stype, + 'OUT_TYPE': stype + }) + + def delxf(self, input, out, axis=-1): + if input.dtype != self.dtype: + raise ValueError('Invalid input data type') + + if axis < 0: + axis = input.ndim + axis + axis = np.int32(axis) + + if self.queue is not None: + self.queue.use() + + if axis == input.ndim - 1: + flat_dim = np.int32(np.prod(input.shape[0:-1])) + self.delxf_last(( + int((flat_dim + + self.last_axis_block[1] - 1) // self.last_axis_block[1]), + 1, 1), + self.last_axis_block, (input, out, flat_dim, np.int32(input.shape[axis]))) + else: + lower_dim = np.int32(np.prod(input.shape[(axis+1):])) + higher_dim = np.int32(np.prod(input.shape[:axis])) + gx = int( + (lower_dim + self.mid_axis_block[0] - 1) // self.mid_axis_block[0]) + gy = 1 + gz = int(higher_dim) + self.delxf_mid((gx, gy, gz), self.mid_axis_block, (input, + out, lower_dim, higher_dim, np.int32(input.shape[axis]))) + + def delxb(self, input, out, axis=-1): + if input.dtype != self.dtype: + raise ValueError('Invalid input data type') + + if axis < 0: + axis = input.ndim + axis + axis = np.int32(axis) + + if self.queue is not None: + self.queue.use() + if axis == input.ndim - 1: + flat_dim = np.int32(np.prod(input.shape[0:-1])) + self.delxb_last(( + int((flat_dim + + self.last_axis_block[1] - 1) // self.last_axis_block[1]), + 1, 1), self.last_axis_block, (input, out, flat_dim, np.int32(input.shape[axis]))) + else: + lower_dim = np.int32(np.prod(input.shape[(axis+1):])) + higher_dim = np.int32(np.prod(input.shape[:axis])) + gx = int( + (lower_dim + self.mid_axis_block[0] - 1) // self.mid_axis_block[0]) + gy = 1 + gz = int(higher_dim) + self.delxb_mid((gx, gy, gz), self.mid_axis_block, (input, + out, lower_dim, higher_dim, np.int32(input.shape[axis]))) + + +class GaussianSmoothingKernel: + def __init__(self, queue=None, num_stdevs=4, kernel_type='float'): + if kernel_type not in ['float', 'double']: + raise ValueError('Invalid data type for kernel') + self.kernel_type = kernel_type + self.dtype = np.complex64 + self.stype = "complex" + self.queue = queue + self.num_stdevs = num_stdevs + self.blockdim_x = 4 + self.blockdim_y = 16 + + # At least 2 blocks per SM + self.max_shared_per_block = 48 * 1024 // 2 + self.max_shared_per_block_complex = self.max_shared_per_block / \ + 2 * np.dtype(np.float32).itemsize + self.max_kernel_radius = int( + self.max_shared_per_block_complex / self.blockdim_y) + + self.convolution_row = load_kernel( + "convolution_row", 
file="convolution.cu", subs={ + 'BDIM_X': self.blockdim_x, + 'BDIM_Y': self.blockdim_y, + 'DTYPE': self.stype, + 'MATH_TYPE': self.kernel_type + }) + self.convolution_col = load_kernel( + "convolution_col", file="convolution.cu", subs={ + 'BDIM_X': self.blockdim_y, # NOTE: we swap x and y in this columns + 'BDIM_Y': self.blockdim_x, + 'DTYPE': self.stype, + 'MATH_TYPE': self.kernel_type + }) + # pre-allocate kernel memory on gpu, with max-radius to accomodate + dtype = np.float32 if self.kernel_type == 'float' else np.float64 + self.kernel_gpu = cp.empty((self.max_kernel_radius,), dtype=dtype) + # keep track of previus radius and std to determine if we need to transfer again + self.r = 0 + self.std = 0 + + def convolution(self, data, mfs, tmp=None): + """ + Calculates a stacked 2D convolution for smoothing, with the standard deviations + given in mfs (stdx, stdy). It works in-place in the data array, + and tmp is a gpu-allocated array of the same size and type as data, + used internally for temporary storage + """ + ndims = data.ndim + shape = data.shape + + # Create temporary array (if not given) + if tmp is None: + tmp = cp.empty(shape, dtype=data.dtype) + assert shape == tmp.shape and data.dtype == tmp.dtype + + # Check input dimensions + if ndims == 3: + batches, y, x = shape + stdy, stdx = mfs + elif ndims == 2: + batches = 1 + y, x = shape + stdy, stdx = mfs + elif ndims == 1: + batches = 1 + y, x = shape[0], 1 + stdy, stdx = mfs[0], 0.0 + else: + raise NotImplementedError( + "input needs to be of dimensions 0 < ndims <= 3") + + input = data + output = tmp + + if self.queue is not None: + self.queue.use() + + # Row convolution kernel + # TODO: is this threshold acceptable in all cases? + if stdx > 0.1: + r = int(self.num_stdevs * stdx + 0.5) + if r > self.max_kernel_radius: + raise ValueError("Size of Gaussian kernel too large") + if r != self.r or stdx != self.std: + # recalculate + transfer + g = gaussian(np.arange(-r, r+1), stdx) + g /= g.sum() + k = np.ascontiguousarray(g[r:].astype( + np.float32 if self.kernel_type == 'float' else np.float64)) + self.kernel_gpu[:r+1] = cp.asarray(k[:]) + self.r = r + self.std = stdx + + bx = self.blockdim_x + by = self.blockdim_y + + shared = (bx + 2*r) * by * np.dtype(np.complex64).itemsize + if shared > self.max_shared_per_block: + raise MemoryError("Cannot run kernel in shared memory") + + blk = (bx, by, 1) + grd = (int((y + bx - 1) // bx), int((x + by-1) // by), batches) + self.convolution_row(grd, blk, (input, output, np.int32(y), np.int32(x), self.kernel_gpu, np.int32(r)), + shared_mem=shared) + + input = output + output = data + + # Column convolution kernel + # TODO: is this threshold acceptable in all cases? 
+ if stdy > 0.1: + r = int(self.num_stdevs * stdy + 0.5) + if r > self.max_kernel_radius: + raise ValueError("Size of Gaussian kernel too large") + if r != self.r or stdy != self.std: + # recalculate + transfer + g = gaussian(np.arange(-r, r+1), stdy) + g /= g.sum() + k = np.ascontiguousarray(g[r:].astype( + np.float32 if self.kernel_type == 'float' else np.float64)) + self.kernel_gpu[:r+1] = cp.asarray(k[:]) + self.r = r + self.std = stdy + + bx = self.blockdim_y + by = self.blockdim_x + + shared = (by + 2*r) * bx * np.dtype(np.complex64).itemsize + if shared > self.max_shared_per_block: + raise MemoryError("Cannot run kernel in shared memory") + + blk = (bx, by, 1) + grd = (int((y + bx - 1) // bx), int((x + by-1) // by), batches) + self.convolution_col(grd, blk, (input, output, np.int32(y), np.int32(x), self.kernel_gpu, np.int32(r)), + shared_mem=shared) + + # TODO: is this threshold acceptable in all cases? + if (stdx <= 0.1 and stdy <= 0.1): + return # nothing to do + elif (stdx > 0.1 and stdy > 0.1): + return # both parts have run, output is back in data + else: + data[:] = tmp[:] # only one of them has run, output is in tmp + + +class ClipMagnitudesKernel: + + def __init__(self, queue=None): + self.queue = queue + self.clip_magnitudes_cuda = load_kernel("clip_magnitudes", { + 'IN_TYPE': 'complex', + }) + + def clip_magnitudes_to_range(self, array, clip_min, clip_max): + if self.queue is not None: + self.queue.use() + + cmin = np.float32(clip_min) + cmax = np.float32(clip_max) + + npixel = np.int32(np.prod(array.shape)) + bx = 256 + gx = int((npixel + bx - 1) // bx) + self.clip_magnitudes_cuda((gx, 1, 1), (bx, 1, 1), (array, cmin, cmax, + npixel)) + +class MassCenterKernel: + + def __init__(self, queue=None): + self.queue = queue + self.threadsPerBlock = 256 + + self.indexed_sum_middim_cuda = load_kernel("indexed_sum_middim", + file="mass_center.cu", subs={ + 'IN_TYPE': 'float', + 'BDIM_X' : self.threadsPerBlock, + 'BDIM_Y' : 1, + } + ) + + self.indexed_sum_lastdim_cuda = load_kernel("indexed_sum_lastdim", + file="mass_center.cu", subs={ + 'IN_TYPE': 'float', + 'BDIM_X' : 32, + 'BDIM_Y' : 32, + } + ) + + self.final_sums_cuda = load_kernel("final_sums", + file="mass_center.cu", subs={ + 'IN_TYPE': 'float', + 'BDIM_X' : 256, + 'BDIM_Y' : 1, + } + ) + + def mass_center(self, array): + if array.dtype != np.float32: + raise NotImplementedError("mass_center is only implemented for float32") + + i = np.int32(array.shape[0]) + m = np.int32(array.shape[1]) + if array.ndim >= 3: + n = np.int32(array.shape[2]) + else: + n = np.int32(1) + + if self.queue is not None: + self.queue.use() + + total_sum = cp.sum(array, dtype=np.float32).get() + sc = np.float32(1. 
/ total_sum.item()) + + i_sum = cp.empty(array.shape[0], dtype=np.float32) + m_sum = cp.empty(array.shape[1], dtype=np.float32) + n_sum = cp.empty(int(n), dtype=np.float32) + out = cp.empty(3 if n>1 else 2, dtype=np.float32) + + # sum all dims except the first, multiplying by the index and scaling factor + block_ = (self.threadsPerBlock, 1, 1) + grid_ = (int(i), 1, 1) + self.indexed_sum_middim_cuda(grid_, block_, (array, i_sum, np.int32(1), i, n*m, sc), + shared_mem=self.threadsPerBlock*4) + + if array.ndim >= 3: + # 3d case + # sum all dims, except the middle, multiplying by the index and scaling factor + block_ = (self.threadsPerBlock, 1, 1) + grid_ = (int(m), 1, 1) + self.indexed_sum_middim_cuda(grid_, block_, (array, m_sum, i, n, m, sc), + shared_mem=self.threadsPerBlock*4) + + # sum the all dims except the last, multiplying by the index and scaling factor + block_ = (32, 32, 1) + grid_ = (1, int(n + 32 - 1) // 32, 1) + self.indexed_sum_lastdim_cuda(grid_, block_, (array, n_sum, i*m, n, sc), + shared_mem=32*32*4) + else: + # 2d case + # sum the all dims except the last, multiplying by the index and scaling factor + block_ = (32, 32, 1) + grid_ = (1, int(m + 32 - 1) // 32, 1) + self.indexed_sum_lastdim_cuda(grid_, block_, (array, m_sum, i, m, sc), + shared_mem=32*32*4) + + block_ = (256, 1, 1) + grid_ = (3 if n>1 else 2, 1, 1) + self.final_sums_cuda(grid_, block_, (i_sum, i, m_sum, m, n_sum, n, out), + shared_mem=256*4) + + return out + +class Abs2SumKernel: + + def __init__(self, dtype, queue=None): + self.in_stype = map2ctype(dtype) + if self.in_stype == 'complex': + self.out_stype = 'float' + self.out_dtype = np.float32 + elif self.in_stype == 'copmlex': + self.out_stype = 'double' + self.out_dtype = np.float64 + else: + self.out_stype = self.in_stype + self.out_dtype = dtype + + self.queue = queue + self.threadsPerBlock = 32 + + self.abs2sum_cuda = load_kernel("abs2sum", subs={ + 'IN_TYPE': self.in_stype, + 'OUT_TYPE' : self.out_stype, + 'BDIM_X' : 32, + } + ) + + def abs2sum(self, array): + nmodes = np.int32(array.shape[0]) + row, col = array.shape[1:] + out = cp.empty(array.shape[1:], dtype=self.out_dtype) + + if self.queue is not None: + self.queue.use() + block_ = (32, 1, 1) + grid_ = (1, row, 1) + self.abs2sum_cuda(grid_, block_, (array, nmodes, np.int32(row), np.int32(col), out)) + + return out + +class InterpolatedShiftKernel: + + def __init__(self, queue=None): + self.queue = queue + + self.integer_shift_cuda, self.linear_interpolate_cuda = load_kernel( + ("integer_shift_kernel", "linear_interpolate_kernel"), + file="interpolated_shift.cu", subs={ + 'IN_TYPE': 'complex', + 'OUT_TYPE': 'complex', + 'BDIM_X' : 32, + 'BDIM_Y' : 32, + } + ) + + def interpolate_shift(self, array, shift): + shift = np.asarray(shift, dtype=np.float32) + if len(shift) != 2: + raise NotImplementedError("Shift only applied to 2D array.") + if array.dtype != np.complex64: + raise NotImplementedError("Only complex single precision supported") + if array.ndim == 3: + items, rows, columns = array.shape + elif array.ndim == 2: + items, rows, columns = 1, *array.shape + else: + raise NotImplementedError("Only 2- or 3-dimensional arrays supported") + + offsetRow, offsetCol = shift + + offsetRowFrac, offsetRowInt = np.modf(offsetRow) + offsetColFrac, offsetColInt = np.modf(offsetCol) + + if self.queue is not None: + self.queue.use() + + out = cp.empty_like(array) + block_ = (32, 32, 1) + grid_ = ((rows + 31) // 32, (columns + 31) // 32, items) + + if np.abs(offsetRowFrac) < 1e-6 and np.abs(offsetColFrac) < 
1e-6: + if offsetRowInt == 0 and offsetColInt == 0: + # no transformation at all + out = array + else: + # no fractional part, so we can just use a shifted copy + self.integer_shift_cuda(grid_, block_, (array, out, np.int32(rows), + np.int32(columns), np.int32(offsetRow), + np.int32(offsetCol))) + else: + self.linear_interpolate_cuda(grid_, block_, (array, out, np.int32(rows), + np.int32(columns), np.float32(offsetRow), + np.float32(offsetCol)), + shared_mem=(32+2)**2*8+32*(32+2)*8) + + return out + diff --git a/ptypy/accelerate/cuda_cupy/cufft.py b/ptypy/accelerate/cuda_cupy/cufft.py new file mode 100644 index 000000000..450d6455e --- /dev/null +++ b/ptypy/accelerate/cuda_cupy/cufft.py @@ -0,0 +1,184 @@ +import cupy as cp +from cupyx.scipy import fft as cuxfft +from cupyx.scipy.fft import get_fft_plan +from . import load_kernel +import numpy as np + +class FFT_base(object): + + def __init__(self, array, queue=None, + inplace=False, + pre_fft=None, + post_fft=None, + symmetric=True, + forward=True): + self._queue = queue + dims = array.ndim + if dims < 2: + raise AssertionError('Input array must be at least 2-dimensional') + self.arr_shape = (array.shape[-2], array.shape[-1]) + self.batches = int(np.prod( + array.shape[0:dims-2]) if dims > 2 else 1) + self.forward = forward + + self._load(array, pre_fft, post_fft, symmetric, forward) + +class FFT_cuda(FFT_base): + + def __init__(self, array, queue=None, + inplace=False, + pre_fft=None, + post_fft=None, + symmetric=True, + forward=True): + rows, columns = (array.shape[-2], array.shape[-1]) + if rows != columns or rows not in [16, 32, 64, 128, 256, 512, 1024, 2048]: + raise ValueError( + "CUDA FFT only supports powers of 2 for rows/columns, from 16 to 2048") + super(FFT_cuda, self).__init__(array, queue=queue, + inplace=inplace, + pre_fft=pre_fft, + post_fft=post_fft, + symmetric=symmetric, + forward=forward) + + def _load(self, array, pre_fft, post_fft, symmetric, forward): + if pre_fft is not None: + self.pre_fft = cp.asarray(pre_fft) + self.pre_fft_ptr = self.pre_fft.data.ptr + else: + self.pre_fft_ptr = 0 + if post_fft is not None: + self.post_fft = cp.asarray(post_fft) + self.post_fft_ptr = self.post_fft.data.ptr + else: + self.post_fft_ptr = 0 + + import filtered_cufft + self.fftobj = filtered_cufft.FilteredFFT( + self.batches, + self.arr_shape[0], + self.arr_shape[1], + symmetric, + forward, + self.pre_fft_ptr, + self.post_fft_ptr, + self._queue.ptr) + + self.ft = self._ft + self.ift = self._ift + + @property + def queue(self): + return self._queue + + @queue.setter + def queue(self, queue): + self._queue = queue + self.fftobj.queue = self._queue.ptr + + def _ft(self, input, output): + self.fftobj.fft(input.data.ptr, output.data.ptr) + + def _ift(self, input, output): + self.fftobj.ifft(input.data.ptr, output.data.ptr) + + +class FFT_cupy(FFT_base): + + @property + def queue(self): + return self._queue + + @queue.setter + def queue(self, queue): + self._queue = queue + + def _load(self, array, pre_fft, post_fft, symmetric, forward): + assert (array.dtype in [np.complex64, np.complex128]) + assert (pre_fft.dtype in [ + np.complex64, np.complex128] if pre_fft is not None else True) + assert (post_fft.dtype in [ + np.complex64, np.complex128] if post_fft is not None else True) + + math_type = 'float' if array.dtype == np.complex64 else 'double' + if pre_fft is not None: + math_type = 'float' if pre_fft.dtype == np.complex64 else 'double' + self.pre_fft_knl = load_kernel("batched_multiply", { + 'MPY_DO_SCALE': 'false', + 'MPY_DO_FILT': 
'true', + 'IN_TYPE': 'float' if array.dtype == np.complex64 else 'double', + 'OUT_TYPE': 'float' if array.dtype == np.complex64 else 'double', + 'MATH_TYPE': math_type + }) if pre_fft is not None else None + + math_type = 'float' if array.dtype == np.complex64 else 'double' + if post_fft is not None: + math_type = 'float' if post_fft.dtype == np.complex64 else 'double' + self.post_fft_knl = load_kernel("batched_multiply", { + 'MPY_DO_SCALE': 'true' if (not forward and not symmetric) or symmetric else 'false', + 'MPY_DO_FILT': 'true' if post_fft is not None else 'false', + 'IN_TYPE': 'float' if array.dtype == np.complex64 else 'double', + 'OUT_TYPE': 'float' if array.dtype == np.complex64 else 'double', + 'MATH_TYPE': math_type + }) if (not (forward and not symmetric) or post_fft is not None) else None + + self.block = (32, 32, 1) + self.grid = ( + int((self.arr_shape[0] + 31) // 32), + int((self.arr_shape[1] + 31) // 32), + int(self.batches) + ) + if self.queue is not None: + self.queue.use() + self.plan = get_fft_plan(array, self.arr_shape, axes=(-2, -1), value_type="C2C") + self.scale = 1.0 + self.norm = 'ortho' if symmetric else 'backward' + + if pre_fft is not None: + self.pre_fft = cp.asarray(pre_fft) + else: + self.pre_fft = np.intp(0) # NULL + if post_fft is not None: + self.post_fft = cp.asarray(post_fft) + else: + self.post_fft = np.intp(0) + + self.ft = self._ft + self.ift = self._ift + + def _prefilt(self, x, y): + if self.pre_fft_knl: + self.pre_fft_knl(grid=self.grid, + block=self.block, + args=(x, y, self.pre_fft, + np.float32(self.scale), + np.int32(self.batches), + np.int32(self.arr_shape[0]), + np.int32(self.arr_shape[1]))) + else: + y[:] = x[:] + + def _postfilt(self, y): + if self.post_fft_knl: + assert self.post_fft is not None + assert self.scale is not None + self.post_fft_knl(grid=self.grid, + block=self.block, + args=(y, y, self.post_fft, np.float32(self.scale), + np.int32(self.batches), + np.int32(self.arr_shape[0]), + np.int32(self.arr_shape[1]))) + def _ft(self, x, y): + if self.queue is not None: + self.queue.use() + self._prefilt(x, y) + cuxfft.fft2(y, axes=(-2, -1), plan=self.plan, overwrite_x=True, norm=self.norm) + self._postfilt(y) + + def _ift(self, x, y): + if self.queue is not None: + self.queue.use() + self._prefilt(x, y) + cuxfft.ifft2(y, axes=(-2, -1), plan=self.plan, overwrite_x=True, norm=self.norm) + self._postfilt(y) diff --git a/ptypy/accelerate/cuda_cupy/dependencies.yml b/ptypy/accelerate/cuda_cupy/dependencies.yml new file mode 100644 index 000000000..6331bbbc5 --- /dev/null +++ b/ptypy/accelerate/cuda_cupy/dependencies.yml @@ -0,0 +1,17 @@ +name: ptypy_cupy +channels: + - conda-forge +dependencies: + - python + - numpy + - scipy + - matplotlib + - h5py + - pyzmq + - mpi4py + - pillow + - pyfftw + - cupy + - cudatoolkit-dev + - pip + - compilers \ No newline at end of file diff --git a/ptypy/accelerate/cuda_cupy/engines/ML_cupy.py b/ptypy/accelerate/cuda_cupy/engines/ML_cupy.py new file mode 100644 index 000000000..efcc42338 --- /dev/null +++ b/ptypy/accelerate/cuda_cupy/engines/ML_cupy.py @@ -0,0 +1,806 @@ +# -*- coding: utf-8 -*- +""" +Maximum Likelihood reconstruction engine. + +TODO. + + * Implement other regularizers + +This file is part of the PTYPY package. + + :copyright: Copyright 2014 by the PTYPY team, see AUTHORS. + :license: see LICENSE for details. 
+""" +import numpy as np +import cupy as cp +import cupyx + +from ptypy.engines import register +from ptypy.accelerate.base.engines.ML_serial import ML_serial, BaseModelSerial +from ptypy import utils as u +from ptypy.utils.verbose import logger, log +from ptypy.utils import parallel +from .. import get_context, log_device_memory_stats +from ..kernels import PropagationKernel, RealSupportKernel, FourierSupportKernel +from ..kernels import GradientDescentKernel, AuxiliaryWaveKernel, PoUpdateKernel, PositionCorrectionKernel +from ..array_utils import ArrayUtilsKernel, DerivativesKernel, GaussianSmoothingKernel, TransposeKernel +from ..mem_utils import GpuDataManager + +#from ..mem_utils import GpuDataManager +from ptypy.accelerate.base import address_manglers + +__all__ = ['ML_cupy'] + +# can be used to limit the number of blocks, simulating that they don't fit +MAX_BLOCKS = 99999 +# MAX_BLOCKS = 3 # can be used to limit the number of blocks, simulating that they don't fit + + +@register() +class ML_cupy(ML_serial): + + """ + Defaults: + + [probe_update_cuda_atomics] + default = False + type = bool + help = For GPU, use the atomics version for probe update kernel + + [object_update_cuda_atomics] + default = True + type = bool + help = For GPU, use the atomics version for object update kernel + + [fft_lib] + default = cuda + type = str + help = Choose the cupy-compatible FFT module. + doc = One of: + - ``'cuda'`` : ptypy's cuda wrapper (delayed load, but fastest compute if all data is on GPU) + - ``'cupy'`` : cupy using cufft (fast load, slowest compute due to additional store/load stages) + choices = 'cuda','cupy' + userlevel = 2 + + """ + + def __init__(self, ptycho_parent, pars=None): + """ + Maximum likelihood reconstruction engine. + """ + super().__init__(ptycho_parent, pars) + + def engine_initialize(self): + """ + Prepare for ML reconstruction. + """ + self.queue = get_context(new_queue=True) + + self.qu_htod = cp.cuda.Stream() + self.qu_dtoh = cp.cuda.Stream() + + self.GSK = GaussianSmoothingKernel(queue=self.queue) + self.GSK.tmp = None + + # Real/Fourier Support Kernel + self.RSK = {} + self.FSK = {} + + super().engine_initialize() + # self._setup_kernels() + + def _setup_kernels(self): + """ + Setup kernels, one for each scan. Derive scans from ptycho class + """ + AUK = ArrayUtilsKernel(queue=self.queue) + self._dot_kernel = AUK.dot + # get the scans + for label, scan in self.ptycho.model.scans.items(): + + kern = u.Param() + kern.scanmodel = type(scan).__name__ + self.kernels[label] = kern + + # TODO: needs to be adapted for broad bandwidth + geo = scan.geometries[0] + + # Get info to shape buffer arrays + fpc = scan.max_frames_per_block + + # TODO : make this more foolproof + try: + nmodes = scan.p.coherence.num_probe_modes * \ + scan.p.coherence.num_object_modes + except: + nmodes = 1 + + # create buffer arrays + ash = (fpc * nmodes,) + tuple([int(s) for s in geo.shape]) + aux = cp.zeros(ash, dtype=np.complex64) + kern.aux = aux + kern.a = cp.zeros(ash, dtype=np.complex64) + kern.b = cp.zeros(ash, dtype=np.complex64) + + # setup kernels, one for each SCAN. 
+ kern.GDK = GradientDescentKernel( + aux, nmodes, queue=self.queue, math_type="double") + kern.GDK.allocate() + + kern.POK = PoUpdateKernel(queue_thread=self.queue) + kern.POK.allocate() + + kern.AWK = AuxiliaryWaveKernel(queue_thread=self.queue) + kern.AWK.allocate() + + kern.TK = TransposeKernel(queue=self.queue) + + kern.PROP = PropagationKernel( + aux, geo.propagator, queue_thread=self.queue, fft_type=self.p.fft_lib) + kern.PROP.allocate() + kern.resolution = geo.resolution[0] + + if self.do_position_refinement: + kern.PCK = PositionCorrectionKernel( + aux, nmodes, self.p.position_refinement, geo.resolution, queue_thread=self.queue) + kern.PCK.allocate() + + mag_mem = 0 + for scan, kern in self.kernels.items(): + mag_mem = max(kern.aux.nbytes // 2, mag_mem) + ma_mem = mag_mem + blk = ma_mem + mag_mem + + # We need to add the free memory from the pool to the free device memory, + # as both will be used for allocations + mempool = cp.get_default_memory_pool() + mem = cp.cuda.runtime.memGetInfo()[0] + mempool.total_bytes() - mempool.used_bytes() + + # leave 200MB room for safety + fit = int(mem - 200 * 1024 * 1024) // blk + if not fit: + log(1, "Cannot fit memory into device, if possible reduce frames per block. Exiting...") + raise SystemExit("ptypy has been exited.") + + # TODO grow blocks dynamically + nma = min(fit, MAX_BLOCKS) + log_device_memory_stats(4) + log(4, 'Free memory available: {:.2f} GB'.format(float(mem)/(1024**3))) + log(4, 'Memory to be allocated per block: {:.2f} GB'.format(float(blk)/(1024**3))) + log(4, 'CuPy max blocks fitting on GPU: ma_arrays={}'.format(nma)) + # reset memory or create new + self.w_data = GpuDataManager(ma_mem, 0, nma, False) + self.I_data = GpuDataManager(mag_mem, 0, nma, False) + + def engine_prepare(self): + + super().engine_prepare() + ## Serialize new data ## + use_tiles = (not self.p.probe_update_cuda_atomics) or ( + not self.p.object_update_cuda_atomics) + + # recursive copy to gpu for probe and object + for _cname, c in self.ptycho.containers.items(): + if c.original != self.pr and c.original != self.ob: + continue + for _sname, s in c.S.items(): + # convert data here + s.gpu = cp.asarray(s.data) + s.cpu = cupyx.empty_pinned( + s.data.shape, s.data.dtype, order="C") + s.cpu[:] = s.data + + for label, d in self.ptycho.new_data: + prep = self.diff_info[d.ID] + prep.err_phot_gpu = cp.asarray(prep.err_phot) + prep.fic_gpu = cp.ones_like(prep.err_phot_gpu) + + if use_tiles: + prep.addr2 = np.ascontiguousarray( + np.transpose(prep.addr, (2, 3, 0, 1))) + + prep.addr_gpu = cp.asarray(prep.addr) + if self.do_position_refinement: + prep.original_addr_gpu = cp.asarray(prep.original_addr) + prep.error_state_gpu = cp.empty_like(prep.err_phot_gpu) + prep.mangled_addr_gpu = prep.addr_gpu.copy() + + # Todo: Which address to pick? 
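+            # Note: the tiled (non-atomic) probe/object update kernels appear to
+            # expect the address array in a transposed layout, hence the extra
+            # addr2 copy kept on the GPU whenever atomics are disabled for either
+            # update.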
+ if use_tiles: + prep.addr2_gpu = cp.asarray(prep.addr2) + + prep.I = cupyx.empty_pinned(d.data.shape, d.data.dtype, order="C") + prep.I[:] = d.data + + # Todo: avoid that extra copy of data + if self.do_position_refinement: + ma = self.ma.S[d.ID].data.astype(np.float32) + prep.ma = cupyx.empty_pinned(ma.shape, ma.dtype, order="C") + prep.ma[:] = ma + + log(4, 'Free memory on device: %.2f GB' % + (float(cp.cuda.runtime.memGetInfo()[0])/1e9)) + self.w_data.add_data_block() + self.I_data.add_data_block() + + self.dID_list = list(self.di.S.keys()) + + def _initialize_model(self): + + # Create noise model + if self.p.ML_type.lower() == "gaussian": + self.ML_model = GaussianModel(self) + elif self.p.ML_type.lower() == "poisson": + raise NotImplementedError('Poisson norm model not yet implemented') + elif self.p.ML_type.lower() == "euclid": + raise NotImplementedError('Euclid norm model not yet implemented') + else: + raise RuntimeError("Unsupported ML_type: '%s'" % self.p.ML_type) + + def _set_pr_ob_ref_for_data(self, dev='gpu', container=None, sync_copy=False): + """ + Overloading the context of Storage.data here, to allow for in-place math on Container instances: + """ + if container is not None: + if container.original == self.pr or container.original == self.ob: + for s in container.S.values(): + # convert data here + if dev == 'gpu': + s.data = s.gpu + if sync_copy: + s.gpu.set(s.cpu) + elif dev == 'cpu': + s.data = s.cpu + if sync_copy: + s.gpu.get(out=s.cpu) + #print('%s to cpu' % s.ID) + else: + for container in self.ptycho.containers.values(): + self._set_pr_ob_ref_for_data( + dev=dev, container=container, sync_copy=sync_copy) + + def _get_smooth_gradient(self, data, sigma): + if self.GSK.tmp is None: + self.GSK.tmp = cp.empty(data.shape, dtype=np.complex64) + self.GSK.convolution(data, [sigma, sigma], tmp=self.GSK.tmp) + return data + + def _replace_ob_grad(self): + new_ob_grad = self.ob_grad_new + # Smoothing preconditioner + if self.smooth_gradient: + self.smooth_gradient.sigma *= (1. - self.p.smooth_gradient_decay) + for name, s in new_ob_grad.storages.items(): + s.gpu = self._get_smooth_gradient( + s.gpu, self.smooth_gradient.sigma) + + return self._replace_grad(self.ob_grad, new_ob_grad) + + def _replace_pr_grad(self): + new_pr_grad = self.pr_grad_new + # probe support + if self.p.probe_update_start <= self.curiter: + # Apply probe support if needed + for name, s in new_pr_grad.storages.items(): + self.support_constraint(s) + else: + new_pr_grad.fill(0.) + + return self._replace_grad(self.pr_grad, new_pr_grad) + + def _replace_grad(self, grad, new_grad): + norm = np.double(0.) + dot = np.double(0.) 
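+        # The loop below accumulates |new|^2 and <new, old> over all storages
+        # with the GPU dot kernel and copies the new gradient into the old one;
+        # the two scalars are returned to the base ML engine, which presumably
+        # uses them when forming the new search direction.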
+ for name, new in new_grad.storages.items(): + old = grad.storages[name] + norm += self._dot_kernel(new.gpu, new.gpu).get()[0] + dot += self._dot_kernel(new.gpu, old.gpu).get()[0] + old.gpu[:] = new.gpu + return norm, dot + + def engine_iterate(self, num=1): + err = super().engine_iterate(num) + # copy all data back to cpu + self._set_pr_ob_ref_for_data(dev='cpu', container=None, sync_copy=True) + return err + + def position_update(self): + """ + Position refinement + """ + if not self.do_position_refinement or (not self.curiter): + return + do_update_pos = (self.p.position_refinement.stop > + self.curiter >= self.p.position_refinement.start) + do_update_pos &= (self.curiter % + self.p.position_refinement.interval) == 0 + use_tiles = (not self.p.probe_update_cuda_atomics) or ( + not self.p.object_update_cuda_atomics) + + # Update positions + if do_update_pos: + """ + Iterates through all positions and refines them by a given algorithm. + """ + log(4, "----------- START POS REF -------------") + for dID in self.dID_list: + + prep = self.diff_info[dID] + pID, oID, eID = prep.poe_IDs + ob = self.ob.S[oID].gpu + pr = self.pr.S[pID].gpu + kern = self.kernels[prep.label] + aux = kern.aux + addr = prep.addr_gpu + original_addr = prep.original_addr + mangled_addr = prep.mangled_addr_gpu + err_phot = prep.err_phot_gpu + error_state = prep.error_state_gpu + + # copy intensities and weights to GPU + ev_w, w, data_w = self.w_data.to_gpu( + prep.weights, dID, self.qu_htod) + ev, I, data_I = self.I_data.to_gpu(prep.I, dID, self.qu_htod) + + PCK = kern.PCK + TK = kern.TK + PROP = kern.PROP + + # Keep track of object boundaries + max_oby = ob.shape[-2] - aux.shape[-2] - 1 + max_obx = ob.shape[-1] - aux.shape[-1] - 1 + + # We need to re-calculate the current error + PCK.build_aux(aux, addr, ob, pr) + PROP.fw(aux, aux) + PCK.queue.wait_event(ev) + # w & I now on device + PCK.log_likelihood_ml(aux, addr, I, w, err_phot) + cp.cuda.runtime.memcpy(dst=error_state.data.ptr, + src=err_phot.data.ptr, + size=err_phot.nbytes, + kind=3) # d2d + + PCK.mangler.setup_shifts(self.curiter, nframes=addr.shape[0]) + + log(4, 'Position refinement trial: iteration %s' % (self.curiter)) + for i in range(PCK.mangler.nshifts): + PCK.mangler.get_address( + i, addr, mangled_addr, max_oby, max_obx) + PCK.build_aux(aux, mangled_addr, ob, pr) + PROP.fw(aux, aux) + PCK.log_likelihood_ml(aux, mangled_addr, I, w, err_phot) + PCK.update_addr_and_error_state( + addr, error_state, mangled_addr, err_phot) + + data_w.record_done(self.queue, 'compute') + data_I.record_done(self.queue, 'compute') + cp.cuda.runtime.memcpy(dst=err_phot.data.ptr, + src=error_state.data.ptr, + size=err_phot.nbytes, + kind=3) # d2d + if use_tiles: + s1 = addr.shape[0] * addr.shape[1] + s2 = addr.shape[2] * addr.shape[3] + TK.transpose(addr.reshape(s1, s2), + prep.addr2_gpu.reshape(s2, s1)) + + self.dID_list.reverse() + + def support_constraint(self, storage=None): + """ + Enforces 2D support constraint on probe. 
+ """ + if storage is None: + for s in self.pr.storages.values(): + self.support_constraint(s) + + # Fourier space + support = self._probe_fourier_support.get(storage.ID) + if support is not None: + if storage.ID not in self.FSK: + supp = support.astype(np.complex64) + self.FSK[storage.ID] = FourierSupportKernel( + supp, self.queue, self.p.fft_lib) + self.FSK[storage.ID].allocate() + self.FSK[storage.ID].apply_fourier_support(storage.gpu) + + # Real space + support = self._probe_support.get(storage.ID) + if support is not None: + if storage.ID not in self.RSK: + self.RSK[storage.ID] = RealSupportKernel( + support.astype(np.complex64)) + self.RSK[storage.ID].allocate() + self.RSK[storage.ID].apply_real_support(storage.gpu) + + def engine_finalize(self): + """ + Clear all GPU data, pinned memory, etc + """ + self.w_data = None + self.I_data = None + + for name, s in self.pr.S.items(): + s.data = s.gpu.get() # need this, otherwise getting segfault once context is detached + # no longer need those + del s.gpu + del s.cpu + for name, s in self.ob.S.items(): + s.data = s.gpu.get() # need this, otherwise getting segfault once context is detached + # no longer need those + del s.gpu + del s.cpu + for dID, prep in self.diff_info.items(): + prep.addr = prep.addr_gpu.get() + prep.float_intens_coeff = prep.fic_gpu.get() + + # self.queue.synchronize() + super().engine_finalize() + + log_device_memory_stats(4) + + +class GaussianModel(BaseModelSerial): + """ + Gaussian noise model. + TODO: feed actual statistical weights instead of using the Poisson statistic heuristic. + """ + + def __init__(self, MLengine): + """ + Core functions for ML computation using a Gaussian model. + """ + super(GaussianModel, self).__init__(MLengine) + + if self.p.reg_del2: + self.regularizer = Regul_del2_cupy( + self.p.reg_del2_amplitude, + queue=self.engine.queue + ) + else: + self.regularizer = None + + def prepare(self): + + super(GaussianModel, self).prepare() + + for label, d in self.engine.ptycho.new_data: + prep = self.engine.diff_info[d.ID] + w = (self.Irenorm * self.engine.ma.S[d.ID].data + / (1. / self.Irenorm + d.data)).astype(d.data.dtype) + prep.weights = cupyx.empty_pinned(w.shape, w.dtype, order="C") + prep.weights[:] = w + + def __del__(self): + """ + Clean up routine + """ + super(GaussianModel, self).__del__() + + def new_grad(self): + """ + Compute a new gradient direction according to a Gaussian noise model. + + Note: The negative log-likelihood and local errors are also computed + here. + """ + ob_grad = self.engine.ob_grad_new + pr_grad = self.engine.pr_grad_new + qu_htod = self.engine.qu_htod + queue = self.engine.queue + + self.engine._set_pr_ob_ref_for_data('gpu') + ob_grad << 0. + pr_grad << 0. 
+ + # We need an array for MPI + LL = np.array([0.]) + error_dct = {} + + for dID in self.engine.dID_list: + prep = self.engine.diff_info[dID] + # find probe, object in exit ID in dependence of dID + pID, oID, eID = prep.poe_IDs + + # references for kernels + kern = self.engine.kernels[prep.label] + GDK = kern.GDK + AWK = kern.AWK + POK = kern.POK + aux = kern.aux + + FW = kern.PROP.fw + BW = kern.PROP.bw + + # get addresses and auxilliary array + addr = prep.addr_gpu + fic = prep.fic_gpu + + err_phot = prep.err_phot_gpu + # local references + ob = self.engine.ob.S[oID].data + obg = ob_grad.S[oID].data + pr = self.engine.pr.S[pID].data + prg = pr_grad.S[pID].data + + # Schedule w & I to device + ev_w, w, data_w = self.engine.w_data.to_gpu( + prep.weights, dID, qu_htod) + ev, I, data_I = self.engine.I_data.to_gpu(prep.I, dID, qu_htod) + + # make propagated exit (to buffer) + AWK.build_aux_no_ex(aux, addr, ob, pr, add=False) + + # forward prop + FW(aux, aux) + GDK.make_model(aux, addr) + + queue.wait_event(ev) + + if self.p.floating_intensities: + GDK.floating_intensity(addr, w, I, fic) + + GDK.main(aux, addr, w, I) + data_w.record_done(queue, 'compute') + data_I.record_done(queue, 'compute') + + GDK.error_reduce(addr, err_phot) + + BW(aux, aux) + + use_atomics = self.p.object_update_cuda_atomics + addr = prep.addr_gpu if use_atomics else prep.addr2_gpu + POK.ob_update_ML(addr, obg, pr, aux, atomics=use_atomics) + + use_atomics = self.p.probe_update_cuda_atomics + addr = prep.addr_gpu if use_atomics else prep.addr2_gpu + POK.pr_update_ML(addr, prg, ob, aux, atomics=use_atomics) + + queue.synchronize() + self.engine.dID_list.reverse() + + # TODO we err_phot.sum, but not necessarily this error_dct until the end of contiguous iteration + for dID, prep in self.engine.diff_info.items(): + err_phot = prep.err_phot_gpu.get() + LL += err_phot.sum() + err_phot /= np.prod(prep.weights.shape[-2:]) + err_fourier = np.zeros_like(err_phot) + err_exit = np.zeros_like(err_phot) + errs = np.ascontiguousarray( + np.vstack([err_fourier, err_phot, err_exit]).T) + error_dct.update(zip(prep.view_IDs, errs)) + + # MPI reduction of gradients + + # DtoH copies + for s in ob_grad.S.values(): + s.gpu.get(out=s.cpu) + for s in pr_grad.S.values(): + s.gpu.get(out=s.cpu) + self.engine._set_pr_ob_ref_for_data('cpu') + + ob_grad.allreduce() + pr_grad.allreduce() + parallel.allreduce(LL) + + # HtoD cause we continue on gpu + for s in ob_grad.S.values(): + s.gpu.set(s.cpu) + for s in pr_grad.S.values(): + s.gpu.set(s.cpu) + self.engine._set_pr_ob_ref_for_data('gpu') + + # Object regularizer + if self.regularizer: + for name, s in self.engine.ob.storages.items(): + ob_grad.storages[name].data += self.regularizer.grad(s.data) + LL += self.regularizer.LL + + self.LL = LL / self.tot_measpts + + return error_dct + + def poly_line_coeffs(self, c_ob_h, c_pr_h): + """ + Compute the coefficients of the polynomial for line minimization + in direction h + """ + self.engine._set_pr_ob_ref_for_data('gpu') + qu_htod = self.engine.qu_htod + queue = self.engine.queue + + # does not accept np.longdouble + B = cp.zeros((3,), dtype=np.float32) + Brenorm = 1. 
/ self.LL[0] ** 2 + + # Outer loop: through diffraction patterns + for dID in self.engine.dID_list: + prep = self.engine.diff_info[dID] + + # find probe, object in exit ID in dependence of dID + pID, oID, eID = prep.poe_IDs + + # references for kernels + kern = self.engine.kernels[prep.label] + GDK = kern.GDK + AWK = kern.AWK + + f = kern.aux + a = kern.a + b = kern.b + + FW = kern.PROP.fw + + # get addresses and auxiliary arrays + addr = prep.addr_gpu + fic = prep.fic_gpu + + # Schedule w & I to device + ev_w, w, data_w = self.engine.w_data.to_gpu( + prep.weights, dID, qu_htod) + ev, I, data_I = self.engine.I_data.to_gpu(prep.I, dID, qu_htod) + + # local references + ob = self.ob.S[oID].data + ob_h = c_ob_h.S[oID].data + pr = self.pr.S[pID].data + pr_h = c_pr_h.S[pID].data + + # make propagated exit (to buffer) + AWK.build_aux_no_ex(f, addr, ob, pr, add=False) + AWK.build_aux_no_ex(a, addr, ob_h, pr, add=False) + AWK.build_aux_no_ex(a, addr, ob, pr_h, add=True) + AWK.build_aux_no_ex(b, addr, ob_h, pr_h, add=False) + + # forward prop + FW(f, f) + FW(a, a) + FW(b, b) + + queue.wait_event(ev) + + GDK.make_a012(f, a, b, addr, I, fic) + GDK.fill_b(addr, Brenorm, w, B) + + data_w.record_done(queue, 'compute') + data_I.record_done(queue, 'compute') + + queue.synchronize() + self.engine.dID_list.reverse() + + B = B.get() + parallel.allreduce(B) + + # Object regularizer + if self.regularizer: + for name, s in self.ob.storages.items(): + B += Brenorm * self.regularizer.poly_line_coeffs( + c_ob_h.storages[name].data, s.data) + + self.B = B + + return B + + +class Regul_del2_cupy(object): + """\ + Squared gradient regularizer (Gaussian prior). + + This class applies to any numpy array. + """ + + def __init__(self, amplitude, axes=[-2, -1], queue=None): + # Regul.__init__(self, axes) + self.axes = axes + self.amplitude = amplitude + self.delxy = None + self.g = None + self.LL = None + self.queue = queue + self.AUK = ArrayUtilsKernel(queue=queue) + self.DELK_c = DerivativesKernel(np.complex64, queue=queue) + self.DELK_f = DerivativesKernel(np.float32, queue=queue) + + + def empty(x): return cp.empty( + x.shape, x.dtype) + + def delxb(x, axis=-1): + out = empty(x) + if x.dtype == np.float32: + self.DELK_f.delxb(x, out, axis) + elif x.dtype == np.complex64: + self.DELK_c.delxb(x, out, axis) + else: + raise TypeError("Type %s invalid for derivatives" % x.dtype) + return out + + self.delxb = delxb + + def delxf(x, axis=-1): + out = empty(x) + if x.dtype == np.float32: + self.DELK_f.delxf(x, out, axis) + elif x.dtype == np.complex64: + self.DELK_c.delxf(x, out, axis) + else: + raise TypeError("Type %s invalid for derivatives" % x.dtype) + return out + + self.delxf = delxf + self.norm = lambda x: self.AUK.norm2(x).get().item() + self.dot = lambda x, y: self.AUK.dot(x, y).get().item() + + self._grad_reg_kernel = cp.ElementwiseKernel( + "float32 fac, complex64 py, complex64 px, complex64 my, complex64 mx", + "complex64 out", + "out = (px+py-my-mx) * fac", + "grad_reg", + no_return=True + ) + + def grad(amp, px, py, mx, my): + out = empty(px) + if self.queue is not None: + self.queue.use() + self._grad_reg_kernel(amp, py, px, mx, my, out) + return out + self.reg_grad = grad + + def grad(self, x): + """ + Compute and return the regularizer gradient given the array x. 
+ """ + ax0, ax1 = self.axes + del_xf = self.delxf(x, axis=ax0) + del_yf = self.delxf(x, axis=ax1) + del_xb = self.delxb(x, axis=ax0) + del_yb = self.delxb(x, axis=ax1) + + self.delxy = [del_xf, del_yf, del_xb, del_yb] + + # TODO this one might be slow, maybe try with elementwise kernel + #self.g = (del_xb + del_yb - del_xf - del_yf) * 2. * self.amplitude + self.g = self.reg_grad(2. * self.amplitude, + del_xb, del_yb, del_xf, del_yf) + + self.LL = self.amplitude * (self.norm(del_xf) + + self.norm(del_yf) + + self.norm(del_xb) + + self.norm(del_yb)) + + return self.g + + def poly_line_coeffs(self, h, x=None): + ax0, ax1 = self.axes + if x is None: + del_xf, del_yf, del_xb, del_yb = self.delxy + else: + del_xf = self.delxf(x, axis=ax0) + del_yf = self.delxf(x, axis=ax1) + del_xb = self.delxb(x, axis=ax0) + del_yb = self.delxb(x, axis=ax1) + + hdel_xf = self.delxf(h, axis=ax0) + hdel_yf = self.delxf(h, axis=ax1) + hdel_xb = self.delxb(h, axis=ax0) + hdel_yb = self.delxb(h, axis=ax1) + + c0 = self.amplitude * (self.norm(del_xf) + + self.norm(del_yf) + + self.norm(del_xb) + + self.norm(del_yb)) + + c1 = 2 * self.amplitude * (self.dot(del_xf, hdel_xf) + + self.dot(del_yf, hdel_yf) + + self.dot(del_xb, hdel_xb) + + self.dot(del_yb, hdel_yb)) + + c2 = self.amplitude * (self.norm(hdel_xf) + + self.norm(hdel_yf) + + self.norm(hdel_xb) + + self.norm(hdel_yb)) + + self.coeff = np.array([c0, c1, c2]) + return self.coeff diff --git a/ptypy/accelerate/cuda_cupy/engines/__init__.py b/ptypy/accelerate/cuda_cupy/engines/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/ptypy/accelerate/cuda_cupy/engines/projectional_cupy.py b/ptypy/accelerate/cuda_cupy/engines/projectional_cupy.py new file mode 100644 index 000000000..45eb4d016 --- /dev/null +++ b/ptypy/accelerate/cuda_cupy/engines/projectional_cupy.py @@ -0,0 +1,623 @@ +# -*- coding: utf-8 -*- +""" +Difference Map reconstruction engine. + +This file is part of the PTYPY package. + + :copyright: Copyright 2014 by the PTYPY team, see AUTHORS. + :license: see LICENSE for details. +""" + +import numpy as np +import cupy as cp + +from ptypy import utils as u +from ptypy.accelerate.cuda_cupy import get_context +from ptypy.utils.verbose import log +from ptypy.utils import parallel +from ptypy.engines import register +from ptypy.engines.projectional import DMMixin, RAARMixin +from ptypy.accelerate.base.engines import projectional_serial +from ..kernels import FourierUpdateKernel, AuxiliaryWaveKernel, PoUpdateKernel, PositionCorrectionKernel +from ..kernels import PropagationKernel, RealSupportKernel, FourierSupportKernel +from ..array_utils import ArrayUtilsKernel, GaussianSmoothingKernel,\ + TransposeKernel, ClipMagnitudesKernel, MassCenterKernel, Abs2SumKernel,\ + InterpolatedShiftKernel +from ..mem_utils import make_pagelocked_paired_arrays as mppa +from ..multi_gpu import get_multi_gpu_communicator + +__all__ = ['DM_cupy', 'RAAR_cupy'] + + +class _ProjectionEngine_cupy(projectional_serial._ProjectionEngine_serial): + + """ + Defaults: + + [probe_update_cuda_atomics] + default = False + type = bool + help = For GPU, use the atomics version for probe update kernel + + [object_update_cuda_atomics] + default = True + type = bool + help = For GPU, use the atomics version for object update kernel + + [fft_lib] + default = cuda + type = str + help = Choose the pycuda-compatible FFT module. 
+ doc = One of: + - ``'cuda'`` : ptypy's cuda wrapper (delayed load, but fastest compute if all data is on GPU) + - ``'cupy'`` : cupy's cuFFT wrapper (fast load, slowest compute due to additional store/load stages) + choices = 'cuda','cupy' + userlevel = 2 + + """ + + def __init__(self, ptycho_parent, pars=None): + """ + Difference map reconstruction engine. + """ + super().__init__(ptycho_parent, pars) + self.multigpu = None + + def engine_initialize(self): + """ + Prepare for reconstruction. + """ + # Context, Multi GPU communicator and Stream (needs to be in this order) + self.queue = get_context(new_queue=False) + self.multigpu = get_multi_gpu_communicator() + + # Gaussian Smoothing Kernel + self.GSK = GaussianSmoothingKernel(queue=self.queue) + + # Real/Fourier Support Kernel + self.RSK = {} + self.FSK = {} + + # Clip Magnitudes Kernel + self.CMK = ClipMagnitudesKernel(queue=self.queue) + + # initialise kernels for centring probe if required + if self.p.probe_center_tol is not None: + # mass center kernel + self.MCK = MassCenterKernel(queue=self.queue) + # absolute sum kernel + self.A2SK = Abs2SumKernel(dtype=self.pr.dtype, queue=self.queue) + # interpolated shift kernel + self.ISK = InterpolatedShiftKernel(queue=self.queue) + + super().engine_initialize() + + def _setup_kernels(self): + """ + Setup kernels, one for each scan. Derive scans from ptycho class + """ + # get the scans + for label, scan in self.ptycho.model.scans.items(): + + kern = u.Param() + kern.scanmodel = type(scan).__name__ + self.kernels[label] = kern + # TODO: needs to be adapted for broad bandwidth + geo = scan.geometries[0] + + # Get info to shape buffer arrays + fpc = scan.max_frames_per_block + + # TODO : make this more foolproof + try: + nmodes = scan.p.coherence.num_probe_modes * \ + scan.p.coherence.num_object_modes + except: + nmodes = 1 + + # create buffer arrays + ash = (fpc * nmodes,) + tuple(geo.shape) + aux = np.zeros(ash, dtype=np.complex64) + mempool = cp.get_default_memory_pool() + mem = cp.cuda.runtime.memGetInfo()[0] + mempool.total_bytes() - mempool.used_bytes() + if not int(mem) // aux.nbytes: + log(1,"Cannot fit memory into device, if possible reduce frames per block or nr. of modes. Exiting...") + raise SystemExit("ptypy has been exited.") + kern.aux = cp.asarray(aux) + + # setup kernels, one for each SCAN. 
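+            # Per-scan kernel summary for the projectional (DM/RAAR) engines:
+            # FUK applies the Fourier-magnitude constraint and error reduction,
+            # POK the probe/object updates, AWK the auxiliary-wave builds, AUK
+            # generic array reductions, TK the address transpose for tiled
+            # updates, PROP the propagator, and PCK (optional) position
+            # correction.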
+ log(4, "Setting up FourierUpdateKernel") + kern.FUK = FourierUpdateKernel(aux, nmodes, queue_thread=self.queue) + kern.FUK.allocate() + + log(4, "Setting up PoUpdateKernel") + kern.POK = PoUpdateKernel(queue_thread=self.queue) + kern.POK.allocate() + + log(4, "Setting up AuxiliaryWaveKernel") + kern.AWK = AuxiliaryWaveKernel(queue_thread=self.queue) + kern.AWK.allocate() + + log(4, "Setting up ArrayUtilsKernel") + kern.AUK = ArrayUtilsKernel(queue=self.queue) + + log(4, "Setting up TransposeKernel") + kern.TK = TransposeKernel(queue=self.queue) + + log(4, "Setting up PropagationKernel") + kern.PROP = PropagationKernel(aux, geo.propagator, self.queue, self.p.fft_lib) + kern.PROP.allocate() + kern.resolution = geo.resolution[0] + + if self.do_position_refinement: + log(4, "Setting up PositionCorrectionKernel") + kern.PCK = PositionCorrectionKernel(aux, nmodes, self.p.position_refinement, geo.resolution, queue_thread=self.queue) + kern.PCK.allocate() + log(4, "Kernel setup completed") + + def engine_prepare(self): + + super().engine_prepare() + + for name, s in self.ob.S.items(): + s.gpu = cp.asarray(s.data) # TODO: investigate if this can be pinned, it's much faster + for name, s in self.ob_buf.S.items(): + s.gpu, s.data = mppa(s.data) + for name, s in self.ob_nrm.S.items(): + s.gpu, s.data = mppa(s.data) + for name, s in self.pr.S.items(): + s.gpu, s.data = mppa(s.data) + for name, s in self.pr_buf.S.items(): + s.gpu, s.data = mppa(s.data) + for name, s in self.pr_nrm.S.items(): + s.gpu, s.data = mppa(s.data) + + use_tiles = (not self.p.probe_update_cuda_atomics) or ( + not self.p.object_update_cuda_atomics) + + # TODO : like the serialization this one is needed due to object reformatting + for label, d in self.di.storages.items(): + prep = self.diff_info[d.ID] + prep.addr_gpu = cp.asarray(prep.addr) + if use_tiles: + prep.addr2 = np.ascontiguousarray(np.transpose(prep.addr, (2, 3, 0, 1))) + prep.addr2_gpu = cp.asarray(prep.addr2) + if self.do_position_refinement: + prep.mangled_addr_gpu = prep.addr_gpu.copy() + + for label, d in self.ptycho.new_data: + prep = self.diff_info[d.ID] + pID, oID, eID = prep.poe_IDs + s = self.ex.S[eID] + s.gpu = cp.asarray(s.data) + s = self.ma.S[d.ID] + s.gpu = cp.asarray(s.data.astype(np.float32)) + + prep.mag = cp.asarray(prep.mag) + prep.ma_sum = cp.asarray(prep.ma_sum) + prep.err_fourier_gpu = cp.asarray(prep.err_fourier) + prep.err_phot_gpu = cp.asarray(prep.err_phot) + prep.err_exit_gpu = cp.asarray(prep.err_exit) + if self.do_position_refinement: + prep.error_state_gpu = cp.empty_like(prep.err_fourier_gpu) + + def engine_iterate(self, num=1): + """ + Compute one iteration. 
+ """ + queue = self.queue + queue.use() + + for it in range(num): + error = {} + for dID in self.di.S.keys(): + + # find probe, object and exit ID in dependence of dID + prep = self.diff_info[dID] + pID, oID, eID = prep.poe_IDs + + # references for kernels + kern = self.kernels[prep.label] + FUK = kern.FUK + AWK = kern.AWK + PROP = kern.PROP + + # get addresses and buffers + addr = prep.addr_gpu + mag = prep.mag + ma_sum = prep.ma_sum + err_fourier = prep.err_fourier_gpu + err_phot = prep.err_phot_gpu + err_exit = prep.err_exit_gpu + pbound = self.pbound_scan[prep.label] + aux = kern.aux + + # local references + ma = self.ma.S[dID].gpu + ob = self.ob.S[oID].gpu + pr = self.pr.S[pID].gpu + ex = self.ex.S[eID].gpu + + # compute log-likelihood + if self.p.compute_log_likelihood: + AWK.build_aux_no_ex(aux, addr, ob, pr) + PROP.fw(aux, aux) + FUK.log_likelihood(aux, addr, mag, ma, err_phot) + + # build auxilliary wave + #AWK.build_aux(aux, addr, ob, pr, ex, alpha=self.p.alpha) + AWK.make_aux(aux, addr, ob, pr, ex, + c_po=self._c, c_e=1-self._c) + + # forward FFT + PROP.fw(aux, aux) + + # Deviation from measured data + FUK.fourier_error(aux, addr, mag, ma, ma_sum) + FUK.error_reduce(addr, err_fourier) + FUK.fmag_all_update(aux, addr, mag, ma, err_fourier, pbound) + + # backward FFT + PROP.bw(aux, aux) + + # build exit wave + #AWK.build_exit(aux, addr, ob, pr, ex, alpha=self.p.alpha) + AWK.make_exit(aux, addr, ob, pr, ex, c_a=self._b, c_po=self._a, c_e=-(self._a + self._b)) + FUK.exit_error(aux, addr) + FUK.error_reduce(addr, err_exit) + + parallel.barrier() + + sync = (self.curiter % 1 == 0) + self.overlap_update() + + self.center_probe() + + parallel.barrier() + self.position_update() + + self.curiter += 1 + queue.synchronize() + + for name, s in self.ob.S.items(): + cp.asnumpy(s.gpu, stream=self.queue, out=s.data) + for name, s in self.pr.S.items(): + cp.asnumpy(s.gpu, stream=self.queue, out=s.data) + + queue.synchronize() + + # costly but needed to sync back with + # for name, s in self.ex.S.items(): + # s.data[:] = s.gpu.get() + for dID, prep in self.diff_info.items(): + err_fourier = prep.err_fourier_gpu.get() + err_phot = prep.err_phot_gpu.get() + err_exit = prep.err_exit_gpu.get() + errs = np.ascontiguousarray(np.vstack([err_fourier, err_phot, err_exit]).T) + error.update(zip(prep.view_IDs, errs)) + + self.error = error + return error + + def position_update(self): + """ + Position refinement + """ + if not self.do_position_refinement or (not self.curiter): + return + do_update_pos = (self.p.position_refinement.stop > self.curiter >= self.p.position_refinement.start) + do_update_pos &= (self.curiter % self.p.position_refinement.interval) == 0 + use_tiles = (not self.p.probe_update_cuda_atomics) or (not self.p.object_update_cuda_atomics) + + # Update positions + if do_update_pos: + self.queue.use() + """ + Iterates through all positions and refines them by a given algorithm. 
+ """ + log(4, "----------- START POS REF -------------") + for dID in self.di.S.keys(): + + prep = self.diff_info[dID] + pID, oID, eID = prep.poe_IDs + ma = self.ma.S[dID].gpu + ob = self.ob.S[oID].gpu + pr = self.pr.S[pID].gpu + kern = self.kernels[prep.label] + aux = kern.aux + addr = prep.addr_gpu + original_addr = prep.original_addr + mangled_addr = prep.mangled_addr_gpu + mag = prep.mag + ma_sum = prep.ma_sum + err_fourier = prep.err_fourier_gpu + error_state = prep.error_state_gpu + + PCK = kern.PCK + TK = kern.TK + PROP = kern.PROP + + # Keep track of object boundaries + max_oby = ob.shape[-2] - aux.shape[-2] - 1 + max_obx = ob.shape[-1] - aux.shape[-1] - 1 + + # We need to re-calculate the current error + PCK.build_aux(aux, addr, ob, pr) + PROP.fw(aux, aux) + if self.p.position_refinement.metric == "fourier": + PCK.fourier_error(aux, addr, mag, ma, ma_sum) + PCK.error_reduce(addr, err_fourier) + if self.p.position_refinement.metric == "photon": + PCK.log_likelihood(aux, addr, mag, ma, err_fourier) + cp.cuda.runtime.memcpyAsync(dst=error_state.data.ptr, + src=err_fourier.data.ptr, + size=err_fourier.nbytes, + kind=3, # device to device + stream=self.queue.ptr) + + PCK.mangler.setup_shifts(self.curiter, nframes=addr.shape[0]) + + log(4, 'Position refinement trial: iteration %s' % (self.curiter)) + for i in range(PCK.mangler.nshifts): + PCK.mangler.get_address(i, addr, mangled_addr, max_oby, max_obx) + PCK.build_aux(aux, mangled_addr, ob, pr) + PROP.fw(aux, aux) + if self.p.position_refinement.metric == "fourier": + PCK.fourier_error(aux, mangled_addr, mag, ma, ma_sum) + PCK.error_reduce(mangled_addr, err_fourier) + if self.p.position_refinement.metric == "photon": + PCK.log_likelihood(aux, mangled_addr, mag, ma, err_fourier) + PCK.update_addr_and_error_state(addr, error_state, mangled_addr, err_fourier) + + cp.cuda.runtime.memcpyAsync(dst=err_fourier.data.ptr, + src=error_state.data.ptr, + size=err_fourier.nbytes, + kind=3, + stream=self.queue.ptr) # d2d + if use_tiles: + s1 = addr.shape[0] * addr.shape[1] + s2 = addr.shape[2] * addr.shape[3] + TK.transpose(addr.reshape(s1, s2), + prep.addr2_gpu.reshape(s2, s1)) + + def center_probe(self): + if self.p.probe_center_tol is not None: + self.queue.use() + for name, pr_s in self.pr.storages.items(): + psum_d = self.A2SK.abs2sum(pr_s.gpu) + c1 = self.MCK.mass_center(psum_d).get() + c2 = (np.asarray(pr_s.shape[-2:]) // 2).astype(c1.dtype) + + shift = c2 - c1 + # exit if the current center of mass is within the tolerance + if u.norm(shift) < self.p.probe_center_tol: + break + + # shift the probe + pr_s.gpu = self.ISK.interpolate_shift(pr_s.gpu, shift) + + # shift the object + ob_s = pr_s.views[0].pod.ob_view.storage + ob_s.gpu = self.ISK.interpolate_shift(ob_s.gpu, shift) + + # shift the exit waves + for dID in self.di.S.keys(): + prep = self.diff_info[dID] + pID, oID, eID = prep.poe_IDs + if pID == name: + self.ex.S[eID].gpu = self.ISK.interpolate_shift(self.ex.S[eID].gpu, shift) + + log(4, 'Probe recentered from %s to %s' + % (str(tuple(c1)), str(tuple(c2)))) + + # object update + + def object_update(self, MPI=False): + use_atomics = self.p.object_update_cuda_atomics + queue = self.queue + queue.synchronize() + queue.use() + for oID, ob in self.ob.storages.items(): + obn = self.ob_nrm.S[oID] + cfact = self.ob_cfact[oID] + + if self.p.obj_smooth_std is not None: + log(4, 'Smoothing object, cfact is %.2f' % cfact) + obb = self.ob_buf.S[oID] + smooth_mfs = [self.p.obj_smooth_std, self.p.obj_smooth_std] + self.GSK.convolution(ob.gpu, 
smooth_mfs, tmp=obb.gpu) + + ob.gpu *= cfact + obn.gpu.fill(cfact) + queue.synchronize() + + # storage for-loop + for dID in self.di.S.keys(): + prep = self.diff_info[dID] + + POK = self.kernels[prep.label].POK + # find probe, object in exit ID in dependence of dID + pID, oID, eID = prep.poe_IDs + + # scan for loop + addr = prep.addr_gpu if use_atomics else prep.addr2_gpu + ev = POK.ob_update(addr, + self.ob.S[oID].gpu, + self.ob_nrm.S[oID].gpu, + self.pr.S[pID].gpu, + self.ex.S[eID].gpu, + atomics=use_atomics) + queue.synchronize() + + for oID, ob in self.ob.storages.items(): + obn = self.ob_nrm.S[oID] + self.multigpu.allReduceSum(ob.gpu) + self.multigpu.allReduceSum(obn.gpu) + with queue: + ob.gpu /= obn.gpu + + self.clip_object(ob.gpu) + queue.synchronize() + + # probe update + def probe_update(self, MPI=False): + queue = self.queue + + # storage for-loop + change_gpu = cp.zeros((1,), dtype=np.float32) + cfact = self.p.probe_inertia + use_atomics = self.p.probe_update_cuda_atomics + for pID, pr in self.pr.storages.items(): + prn = self.pr_nrm.S[pID] + cfact = self.pr_cfact[pID] + pr.gpu *= cfact + prn.gpu.fill(cfact) + + for dID in self.di.S.keys(): + prep = self.diff_info[dID] + + POK = self.kernels[prep.label].POK + # find probe, object in exit ID in dependence of dID + pID, oID, eID = prep.poe_IDs + + # scan for-loop + addr = prep.addr_gpu if use_atomics else prep.addr2_gpu + ev = POK.pr_update(addr, + self.pr.S[pID].gpu, + self.pr_nrm.S[pID].gpu, + self.ob.S[oID].gpu, + self.ex.S[eID].gpu, + atomics=use_atomics) + queue.synchronize() + + for pID, pr in self.pr.storages.items(): + + buf = self.pr_buf.S[pID] + prn = self.pr_nrm.S[pID] + + self.multigpu.allReduceSum(pr.gpu) + self.multigpu.allReduceSum(prn.gpu) + pr.gpu /= prn.gpu + self.support_constraint(pr) + + # calculate change on GPU + queue.synchronize() + AUK = self.kernels[list(self.kernels)[0]].AUK + buf.gpu -= pr.gpu + change_gpu += (AUK.norm2(buf.gpu) / AUK.norm2(pr.gpu)) + buf.gpu[:] = pr.gpu + self.multigpu.allReduceSum(change_gpu) + change = change_gpu.get().item() / parallel.size + + return np.sqrt(change) + + def support_constraint(self, storage=None): + """ + Enforces 2D support constraint on probe. + """ + if storage is None: + for s in self.pr.storages.values(): + self.support_constraint(s) + + # Fourier space + support = self._probe_fourier_support.get(storage.ID) + if support is not None: + if storage.ID not in self.FSK: + supp = support.astype(np.complex64) + self.FSK[storage.ID] = FourierSupportKernel(supp, self.queue, self.p.fft_lib) + self.FSK[storage.ID].allocate() + self.FSK[storage.ID].apply_fourier_support(storage.gpu) + + # Real space + support = self._probe_support.get(storage.ID) + if support is not None: + if storage.ID not in self.RSK: + self.RSK[storage.ID] = RealSupportKernel(support.astype(np.complex64)) + self.RSK[storage.ID].allocate() + self.RSK[storage.ID].apply_real_support(storage.gpu) + + def clip_object(self, ob): + """ + Clips magnitudes of object into given range. + """ + if self.p.clip_object is not None: + cmin, cmax = self.p.clip_object + self.CMK.clip_magnitudes_to_range(ob, cmin, cmax) + + def engine_finalize(self): + """ + clear GPU data and destroy context. 
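+
+        Page-locked buffers are reverted to plain numpy copies, addresses are
+        moved back to the host and cupy's device and pinned memory pools are freed.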
+ """ + # revert page-locked memory + delete GPU memory + for name, s in self.ob.S.items(): + s.data = np.copy(s.data) + del s.gpu + for name, s in self.ob_buf.S.items(): + s.data = np.copy(s.data) + del s.gpu + for name, s in self.ob_nrm.S.items(): + s.data = np.copy(s.data) + del s.gpu + for name, s in self.pr.S.items(): + s.data = np.copy(s.data) + del s.gpu + for name, s in self.pr_buf.S.items(): + s.data = np.copy(s.data) + del s.gpu + for name, s in self.pr_nrm.S.items(): + s.data = np.copy(s.data) + del s.gpu + + # copy addr to cpu + for dID, prep in self.diff_info.items(): + prep.addr = prep.addr_gpu.get() + del prep.addr_gpu + + mempool = cp.get_default_memory_pool() + mempool.free_all_blocks() + pinned_pool = cp.get_default_pinned_memory_pool() + pinned_pool.free_all_blocks() + + # we don't need the "benchmarking" in DM_serial + super().engine_finalize(benchmark=False) + + +@register(name="DM_cupy_nostream") +class DM_cupy(_ProjectionEngine_cupy, DMMixin): + """ + A full-fledged Difference Map engine accelerated with pycuda. + + Defaults: + + [name] + default = DM_cupy + type = str + help = + doc = + + """ + + def __init__(self, ptycho_parent, pars=None): + _ProjectionEngine_cupy.__init__(self, ptycho_parent, pars) + DMMixin.__init__(self, self.p.alpha) + ptycho_parent.citations.add_article(**self.article) + + +@register(name="RAAR_cupy_nostream") +class RAAR_cupy(_ProjectionEngine_cupy, RAARMixin): + """ + A RAAR engine in accelerated with pycuda. + + Defaults: + + [name] + default = RAAR_pycuda + type = str + help = + doc = + + """ + + def __init__(self, ptycho_parent, pars=None): + _ProjectionEngine_cupy.__init__(self, ptycho_parent, pars) + RAARMixin.__init__(self, self.p.beta) diff --git a/ptypy/accelerate/cuda_cupy/engines/projectional_cupy_stream.py b/ptypy/accelerate/cuda_cupy/engines/projectional_cupy_stream.py new file mode 100644 index 000000000..b236874ab --- /dev/null +++ b/ptypy/accelerate/cuda_cupy/engines/projectional_cupy_stream.py @@ -0,0 +1,532 @@ +# -*- coding: utf-8 -*- +""" +Difference Map reconstruction engine for NVIDIA GPUs. + +This engine uses three streams, one for the compute queue and one for each I/O queue. +Events are used to synchronize download / compute/ upload. we cannot manipulate memory +for each loop over the state vector, a certain number of memory sections is preallocated +and reused. + +This file is part of the PTYPY package. + + :copyright: Copyright 2014 by the PTYPY team, see AUTHORS. + :license: see LICENSE for details. +""" + +import numpy as np +import cupy as cp +import cupyx + +from ptypy.accelerate.cuda_cupy import log_device_memory_stats +from ptypy.utils.verbose import log +from ptypy.utils import parallel +from ptypy.engines import register +from ptypy.engines.projectional import DMMixin, RAARMixin +from . 
import projectional_cupy + +from ..mem_utils import make_pagelocked_paired_arrays as mppa +from ..mem_utils import GpuDataManager + +EX_MA_BLOCKS_RATIO = 2 +# can be used to limit the number of blocks, simulating that they don't fit +MAX_BLOCKS = 99999 +# MAX_BLOCKS = 3 # can be used to limit the number of blocks, simulating that they don't fit + +__all__ = ['DM_cupy_stream', 'RAAR_cupy_stream'] + + +class _ProjectionEngine_cupy_stream(projectional_cupy._ProjectionEngine_cupy): + + def __init__(self, ptycho_parent, pars=None): + + super().__init__(ptycho_parent, pars) + self.ma_data = None + self.mag_data = None + self.ex_data = None + + def engine_initialize(self): + super().engine_initialize() + self.qu_htod = cp.cuda.Stream() + self.qu_dtoh = cp.cuda.Stream() + + def _setup_kernels(self): + + super()._setup_kernels() + ex_mem = 0 + mag_mem = 0 + for scan, kern in self.kernels.items(): + ex_mem = max(kern.aux.nbytes, ex_mem) + mag_mem = max(kern.FUK.gpu.fdev.nbytes, mag_mem) + ma_mem = mag_mem + + blk = ex_mem * EX_MA_BLOCKS_RATIO + ma_mem + mag_mem + + # We need to add the free memory from the pool to the free device memory, + # as both will be used for allocations + mempool = cp.get_default_memory_pool() + mem = cp.cuda.runtime.memGetInfo()[0] + mempool.total_bytes() - mempool.used_bytes() + + # leave 200MB room for safety + fit = int(mem - 200 * 1024 * 1024) // blk + if not fit: + log(1, "Cannot fit memory into device, if possible reduce frames per block. Exiting...") + raise SystemExit("ptypy has been exited.") + + # TODO grow blocks dynamically + nex = min(fit * EX_MA_BLOCKS_RATIO, MAX_BLOCKS) + nma = min(fit, MAX_BLOCKS) + log_device_memory_stats(4) + log(4, 'Free memory available: {:.2f} GB'.format(float(mem)/(1024**3))) + log(4, 'Memory to be allocated per block: {:.2f} GB'.format(float(blk)/(1024**3))) + log(4, 'cupy max blocks fitting on GPU: exit arrays={}, ma_arrays={}'.format(nex, nma)) + # reset memory or create new + self.ex_data = GpuDataManager(ex_mem, 0, nex, True) + self.ma_data = GpuDataManager(ma_mem, 0, nma, False) + self.mag_data = GpuDataManager(mag_mem, 0, nma, False) + + def engine_prepare(self): + + super(projectional_cupy._ProjectionEngine_cupy, self).engine_prepare() + + for name, s in self.ob.S.items(): + s.gpu = cp.asarray(s.data) + for name, s in self.ob_buf.S.items(): + s.gpu, s.data = mppa(s.data) + for name, s in self.ob_nrm.S.items(): + s.gpu, s.data = mppa(s.data) + for name, s in self.pr.S.items(): + s.gpu, s.data = mppa(s.data) + for name, s in self.pr_buf.S.items(): + s.gpu, s.data = mppa(s.data) + for name, s in self.pr_nrm.S.items(): + s.gpu, s.data = mppa(s.data) + + use_tiles = (not self.p.probe_update_cuda_atomics) or (not self.p.object_update_cuda_atomics) + + # Extra object buffer for smoothing kernel + if self.p.obj_smooth_std is not None: + for name, s in self.ob_buf.S.items(): + s.tmp = cp.empty(s.gpu.shape, s.gpu.dtype) + + # TODO : like the serialization this one is needed due to object reformatting + for label, d in self.di.storages.items(): + prep = self.diff_info[d.ID] + prep.addr_gpu = cp.asarray(prep.addr) + if use_tiles: + prep.addr2 = np.ascontiguousarray( + np.transpose(prep.addr, (2, 3, 0, 1))) + prep.addr2_gpu = cp.asarray(prep.addr2) + if self.do_position_refinement: + prep.mangled_addr_gpu = prep.addr_gpu.copy() + + for label, d in self.ptycho.new_data: + dID = d.ID + prep = self.diff_info[dID] + pID, oID, eID = prep.poe_IDs + + prep.ma_sum_gpu = cp.asarray(prep.ma_sum) + # prepare page-locked mems: + prep.err_fourier_gpu = 
cp.asarray(prep.err_fourier) + prep.err_phot_gpu = cp.asarray(prep.err_phot) + prep.err_exit_gpu = cp.asarray(prep.err_exit) + if self.do_position_refinement: + prep.error_state_gpu = cp.empty_like(prep.err_fourier_gpu) + ma = self.ma.S[dID].data.astype(np.float32) + prep.ma = cupyx.empty_pinned(ma.shape, ma.dtype, order="C") + prep.ma[:] = ma + ex = self.ex.S[eID].data + prep.ex = cupyx.empty_pinned(ex.shape, ex.dtype, order="C") + prep.ex[:] = ex + mag = prep.mag + prep.mag = cupyx.empty_pinned(mag.shape, mag.dtype, order="C") + prep.mag[:] = mag + + log(4, 'Free memory on device: {:.2f} GB'.format(float(cp.cuda.runtime.memGetInfo()[0])/(1024**3))) + self.ex_data.add_data_block() + self.ma_data.add_data_block() + self.mag_data.add_data_block() + + def engine_iterate(self, num=1): + """ + Compute one iteration. + """ + # ma_buf = ma_c = np.zeros(FUK.fshape, dtype=np.float32) + self.dID_list = list(self.di.S.keys()) + atomics_probe = self.p.probe_update_cuda_atomics + atomics_object = self.p.object_update_cuda_atomics + use_tiles = (not atomics_object) or (not atomics_probe) + + for it in range(num): + + error = {} + + for inner in range(self.p.overlap_max_iterations): + + change = 0 + + do_update_probe = (self.curiter >= self.p.probe_update_start) + do_update_object = (self.p.update_object_first or (inner > 0) or not do_update_probe) + do_update_fourier = (inner == 0) + + # initialize probe and object buffer to receive an update + if do_update_object: + for oID, ob in self.ob.storages.items(): + cfact = self.ob_cfact[oID] + obn = self.ob_nrm.S[oID] + obb = self.ob_buf.S[oID] + + if self.p.obj_smooth_std is not None: + log(4, 'Smoothing object, cfact is %.2f' % cfact) + smooth_mfs = [self.p.obj_smooth_std, + self.p.obj_smooth_std] + # We need a third copy, because we still need ob.gpu for the fourier update + obb.gpu[:] = ob.gpu[:] + self.GSK.convolution(obb.gpu, smooth_mfs, tmp=obb.tmp) + obb.gpu *= np.complex64(cfact) + else: + # obb.gpu[:] = ob.gpu * np.complex64(cfact) + cp.multiply(ob.gpu, np.complex64(cfact), out=obb.gpu) + obn.gpu.fill(np.float32(cfact)) + + # First cycle: Fourier + object update + for iblock, dID in enumerate(self.dID_list): + prep = self.diff_info[dID] + + # find probe, object in exit ID in dependence of dID + pID, oID, eID = prep.poe_IDs + + # references for kernels + kern = self.kernels[prep.label] + FUK = kern.FUK + AWK = kern.AWK + POK = kern.POK + + pbound = self.pbound_scan[prep.label] + aux = kern.aux + PROP = kern.PROP + + # get addresses and auxilliary array + addr = prep.addr_gpu + addr2 = prep.addr2_gpu if use_tiles else None + err_fourier = prep.err_fourier_gpu + err_phot = prep.err_phot_gpu + err_exit = prep.err_exit_gpu + ma_sum = prep.ma_sum_gpu + + # local references + ob = self.ob.S[oID].gpu + obn = self.ob_nrm.S[oID].gpu + obb = self.ob_buf.S[oID].gpu + pr = self.pr.S[pID].gpu + + # Schedule ex to device + ev_ex, ex, data_ex = self.ex_data.to_gpu(prep.ex, dID, self.qu_htod) + + # Fourier update. 
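+                    # Streaming pattern used below (sketch matching the actual calls):
+                    # blocks are staged on the h2d stream and the compute stream waits
+                    # on the returned event before touching them, e.g.
+                    #     ev, arr, blk = self.ma_data.to_gpu(prep.ma, dID, self.qu_htod)
+                    #     self.queue.wait_event(ev)
+                    #     ... kernels consume arr on self.queue ...
+                    #     blk.record_done(self.queue, 'compute')
+                    # so uploads, compute and downloads can overlap across blocks.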
+ if do_update_fourier: + self.ex_data.syncback = True + log(4, '----- Fourier update -----', True) + + # Schedule ma & mag to device + ev_ma, ma, data_ma = self.ma_data.to_gpu(prep.ma, dID, self.qu_htod) + ev_mag, mag, data_mag = self.mag_data.to_gpu(prep.mag, dID, self.qu_htod) + + # compute log-likelihood + if self.p.compute_log_likelihood: + AWK.build_aux_no_ex(aux, addr, ob, pr) + PROP.fw(aux, aux) + # synchronize h2d stream with compute stream + self.queue.wait_event(ev_mag) + FUK.log_likelihood(aux, addr, mag, ma, err_phot) + + # synchronize h2d stream with compute stream + self.queue.wait_event(ev_ex) + #AWK.build_aux(aux, addr, ob, pr, ex, alpha=self.p.alpha) + AWK.make_aux(aux, addr, ob, pr, ex, c_po=self._c, c_e=1-self._c) + + # FFT + PROP.fw(aux, aux) + + # Deviation from measured data + # synchronize h2d stream with compute stream + self.queue.wait_event(ev_mag) + FUK.fourier_error(aux, addr, mag, ma, ma_sum) + FUK.error_reduce(addr, err_fourier) + FUK.fmag_all_update(aux, addr, mag, ma, err_fourier, pbound) + + data_mag.record_done(self.queue, 'compute') + data_ma.record_done(self.queue, 'compute') + + PROP.bw(aux, aux) + # apply changes + #AWK.build_exit(aux, addr, ob, pr, ex, alpha=self.p.alpha) + AWK.make_exit(aux, addr, ob, pr, ex, c_a=self._b, c_po=self._a, c_e=-(self._a + self._b)) + FUK.exit_error(aux, addr) + FUK.error_reduce(addr, err_exit) + + prestr = '%d Iteration (Overlap) #%02d: ' % (parallel.rank, inner) + + # Update object + if do_update_object: + log(4, prestr + '----- object update -----', True) + addrt = addr if atomics_object else addr2 + self.queue.wait_event(ev_ex) + POK.ob_update(addrt, obb, obn, pr, ex, atomics=atomics_object) + + data_ex.record_done(self.queue, 'compute') + if iblock + len(self.ex_data) < len(self.dID_list): + data_ex.from_gpu(self.qu_dtoh) + + # swap direction + if do_update_fourier or do_update_object: + self.dID_list.reverse() + + # wait for compute stream to finish + self.queue.synchronize() + + if do_update_object: + + for oID, ob in self.ob.storages.items(): + obn = self.ob_nrm.S[oID] + obb = self.ob_buf.S[oID] + self.multigpu.allReduceSum(obb.gpu) + self.multigpu.allReduceSum(obn.gpu) + obb.gpu /= obn.gpu + + self.clip_object(obb.gpu) + ob.gpu[:] = obb.gpu + + # Exit if probe should not yet be updated + if not do_update_probe: + break + self.ex_data.syncback = False + + # Update probe + log(4, prestr + '----- probe update -----', True) + change = self.probe_update() + log(4, prestr + 'change in probe is %.3f' % change, True) + + # stop iteration if probe change is small + if change < self.p.overlap_converge_factor: + break + + self.queue.synchronize() + parallel.barrier() + + if self.do_position_refinement and (self.curiter): + do_update_pos = (self.p.position_refinement.stop > self.curiter >= self.p.position_refinement.start) + do_update_pos &= (self.curiter % self.p.position_refinement.interval) == 0 + + # Update positions + if do_update_pos: + """ + Iterates through all positions and refines them by a given algorithm. 
+ """ + log(4, "----------- START POS REF -------------") + for dID in self.di.S.keys(): + + prep = self.diff_info[dID] + pID, oID, eID = prep.poe_IDs + ob = self.ob.S[oID].gpu + pr = self.pr.S[pID].gpu + kern = self.kernels[prep.label] + aux = kern.aux + addr = prep.addr_gpu + original_addr = prep.original_addr + mangled_addr = prep.mangled_addr_gpu + ma_sum = prep.ma_sum_gpu + err_fourier = prep.err_fourier_gpu + error_state = prep.error_state_gpu + + PCK = kern.PCK + TK = kern.TK + PROP = kern.PROP + + # Make sure our data arrays are on device + ev_ma, ma, data_ma = self.ma_data.to_gpu( + prep.ma, dID, self.qu_htod) + ev_mag, mag, data_mag = self.mag_data.to_gpu( + prep.mag, dID, self.qu_htod) + + # Keep track of object boundaries + max_oby = ob.shape[-2] - aux.shape[-2] - 1 + max_obx = ob.shape[-1] - aux.shape[-1] - 1 + + # We need to re-calculate the current error + PCK.build_aux(aux, addr, ob, pr) + PROP.fw(aux, aux) + # wait for data to arrive + self.queue.wait_event(ev_mag) + + # We need to re-calculate the current error + if self.p.position_refinement.metric == "fourier": + PCK.fourier_error(aux, addr, mag, ma, ma_sum) + PCK.error_reduce(addr, err_fourier) + if self.p.position_refinement.metric == "photon": + PCK.log_likelihood(aux, addr, mag, ma, err_fourier) + cp.cuda.runtime.memcpyAsync(dst=error_state.data.ptr, + src=err_fourier.data.ptr, + size=err_fourier.nbytes, + kind=3, # device to device + stream=self.queue.ptr) + + log(4, 'Position refinement trial: iteration %s' % + (self.curiter)) + PCK.mangler.setup_shifts(self.curiter, nframes=addr.shape[0]) + for i in range(PCK.mangler.nshifts): + PCK.mangler.get_address(i, addr, mangled_addr, max_oby, max_obx) + PCK.build_aux(aux, mangled_addr, ob, pr) + PROP.fw(aux, aux) + if self.p.position_refinement.metric == "fourier": + PCK.fourier_error(aux, mangled_addr, mag, ma, ma_sum) + PCK.error_reduce(mangled_addr, err_fourier) + if self.p.position_refinement.metric == "photon": + PCK.log_likelihood( aux, mangled_addr, mag, ma, err_fourier) + PCK.update_addr_and_error_state(addr, error_state, mangled_addr, err_fourier) + + data_mag.record_done(self.queue, 'compute') + data_ma.record_done(self.queue, 'compute') + cp.cuda.runtime.memcpyAsync(dst=err_fourier.data.ptr, + src=error_state.data.ptr, + size=err_fourier.nbytes, + kind=3, # d2d + stream=self.queue.ptr) + if use_tiles: + s1 = prep.addr_gpu.shape[0] * prep.addr_gpu.shape[1] + s2 = prep.addr_gpu.shape[2] * prep.addr_gpu.shape[3] + TK.transpose(prep.addr_gpu.reshape(s1, s2), prep.addr2_gpu.reshape(s2, s1)) + + self.curiter += 1 + self.queue.synchronize() + + for name, s in self.ob.S.items(): + cp.asnumpy(s.gpu, stream=self.queue, out=s.data) + for name, s in self.pr.S.items(): + cp.asnumpy(s.gpu, stream=self.queue, out=s.data) + + self.queue.synchronize() + + # costly but needed to sync back with + # for name, s in self.ex.S.items(): + # s.data[:] = s.gpu.get() + for dID, prep in self.diff_info.items(): + err_fourier = prep.err_fourier_gpu.get() + err_phot = prep.err_phot_gpu.get() + err_exit = prep.err_exit_gpu.get() + errs = np.ascontiguousarray(np.vstack([err_fourier, err_phot, err_exit]).T) + error.update(zip(prep.view_IDs, errs)) + + self.error = error + return error + + # probe update + def probe_update(self, MPI=False): + queue = self.queue + use_atomics = self.p.probe_update_cuda_atomics + # storage for-loop + change_gpu = cp.zeros((1,), dtype=np.float32) + for pID, pr in self.pr.storages.items(): + prn = self.pr_nrm.S[pID] + cfact = self.pr_cfact[pID] + with queue: + 
pr.gpu *= np.float32(cfact) + prn.gpu.fill(np.float32(cfact)) + + for iblock, dID in enumerate(self.dID_list): + prep = self.diff_info[dID] + + POK = self.kernels[prep.label].POK + # find probe, object in exit ID in dependence of dID + pID, oID, eID = prep.poe_IDs + + ev, ex, data_ex = self.ex_data.to_gpu(prep.ex, dID, self.qu_htod) + self.queue.wait_event(ev) + + addrt = prep.addr_gpu if use_atomics else prep.addr2_gpu + ev = POK.pr_update(addrt, + self.pr.S[pID].gpu, + self.pr_nrm.S[pID].gpu, + self.ob.S[oID].gpu, + ex, + atomics=use_atomics) + + data_ex.record_done(self.queue, 'compute') + if iblock + len(self.ex_data) < len(self.dID_list): + data_ex.from_gpu(self.qu_dtoh) + + self.dID_list.reverse() + + self.queue.synchronize() + self.queue.use() + for pID, pr in self.pr.storages.items(): + + buf = self.pr_buf.S[pID] + prn = self.pr_nrm.S[pID] + + self.multigpu.allReduceSum(pr.gpu) + self.multigpu.allReduceSum(prn.gpu) + pr.gpu /= prn.gpu + self.support_constraint(pr) + + # calculate change on GPU + AUK = self.kernels[list(self.kernels)[0]].AUK + buf.gpu -= pr.gpu + change_gpu += (AUK.norm2(buf.gpu) / AUK.norm2(pr.gpu)) + buf.gpu[:] = pr.gpu + self.multigpu.allReduceSum(change_gpu) + change = change_gpu.get().item() / parallel.size + + return np.sqrt(change) + + def engine_finalize(self): + """ + Clear all GPU data, pinned memory, etc + """ + self.ex_data = None + self.ma_data = None + self.mag_data = None + + super().engine_finalize() + + log_device_memory_stats(4) + +@register(name="DM_cupy") +class DM_cupy_stream(_ProjectionEngine_cupy_stream, DMMixin): + """ + A full-fledged Difference Map engine accelerated with cupy. + + Defaults: + + [name] + default = DM_cupy + type = str + help = + doc = + + """ + + def __init__(self, ptycho_parent, pars=None): + _ProjectionEngine_cupy_stream.__init__(self, ptycho_parent, pars) + DMMixin.__init__(self, self.p.alpha) + ptycho_parent.citations.add_article(**self.article) + + +@register(name="RAAR_cupy") +class RAAR_cupy_stream(_ProjectionEngine_cupy_stream, RAARMixin): + """ + A RAAR engine in accelerated with cupy. + + Defaults: + + [name] + default = RAAR_cupy + type = str + help = + doc = + + """ + + def __init__(self, ptycho_parent, pars=None): + + _ProjectionEngine_cupy_stream.__init__(self, ptycho_parent, pars) + RAARMixin.__init__(self, self.p.beta) diff --git a/ptypy/accelerate/cuda_cupy/engines/stochastic.py b/ptypy/accelerate/cuda_cupy/engines/stochastic.py new file mode 100644 index 000000000..f798d569e --- /dev/null +++ b/ptypy/accelerate/cuda_cupy/engines/stochastic.py @@ -0,0 +1,560 @@ +# -*- coding: utf-8 -*- +""" +Accelerated stochastic reconstruction engine. + +This file is part of the PTYPY package. + + :copyright: Copyright 2014 by the PTYPY team, see AUTHORS. + :license: see LICENSE for details. +""" + +import numpy as np +import time +import cupy as cp +import cupyx + +from ptypy import utils as u +from ptypy.utils.verbose import logger, log +from ptypy.utils import parallel +from ptypy.engines import register +from ptypy.engines.stochastic import EPIEMixin, SDRMixin +from ptypy.accelerate.base.engines.stochastic import _StochasticEngineSerial +from ptypy.accelerate.base import address_manglers +from .. 
import get_context +from ..kernels import FourierUpdateKernel, AuxiliaryWaveKernel, PoUpdateKernel,\ + PositionCorrectionKernel, PropagationKernel +from ..array_utils import ArrayUtilsKernel, GaussianSmoothingKernel,\ + TransposeKernel, MaxAbs2Kernel, MassCenterKernel, Abs2SumKernel,\ + InterpolatedShiftKernel +from ..mem_utils import make_pagelocked_paired_arrays as mppa +from ..mem_utils import GpuDataManager + +MPI = False + +EX_MA_BLOCKS_RATIO = 2 +# can be used to limit the number of blocks, simulating that they don't fit +MAX_BLOCKS = 99999 +# MAX_BLOCKS = 10 # can be used to limit the number of blocks, simulating that they don't fit + + +class _StochasticEngineCupy(_StochasticEngineSerial): + + """ + An accelerated implementation of a stochastic algorithm for ptychography + + Defaults: + + [fft_lib] + default = cuda + type = str + help = Choose the cupy-compatible FFT module. + doc = One of: + - ``'cuda'`` : ptypy's cuda wrapper (delayed load, but fastest compute if all data is on GPU) + - ``'cupy'`` : cupy's cufft wrapper (fast load, slowest compute due to additional store/load stages) + choices = 'cuda','cupy' + userlevel = 2 + + """ + + def __init__(self, ptycho_parent, pars=None): + """ + Accelerated base engine for stochastic algorithms. + """ + super().__init__(ptycho_parent, pars) + self.ma_data = None + self.mag_data = None + self.ex_data = None + + def engine_initialize(self): + """ + Prepare for reconstruction. + """ + self.queue = get_context(new_queue=True) + + # initialise kernels for centring probe if required + if self.p.probe_center_tol is not None: + # mass center kernel + self.MCK = MassCenterKernel(queue=self.queue) + # absolute sum kernel + self.A2SK = Abs2SumKernel(dtype=self.pr.dtype, queue=self.queue) + # interpolated shift kernel + self.ISK = InterpolatedShiftKernel(queue=self.queue) + + super().engine_initialize() + self.qu_htod = cp.cuda.Stream() + self.qu_dtoh = cp.cuda.Stream() + + def _setup_kernels(self): + """ + Setup kernels, one for each scan. Derive scans from ptycho class + """ + fpc = 0 + + # get the scans + for label, scan in self.ptycho.model.scans.items(): + + kern = u.Param() + kern.scanmodel = type(scan).__name__ + self.kernels[label] = kern + # TODO: needs to be adapted for broad bandwidth + geo = scan.geometries[0] + + # Get info to shape buffer arrays + fpc = max(scan.max_frames_per_block, fpc) + + # TODO : make this more foolproof + try: + nmodes = scan.p.coherence.num_probe_modes * \ + scan.p.coherence.num_object_modes + except: + nmodes = 1 + + # create buffer arrays + ash = (nmodes,) + tuple(geo.shape) + aux = np.zeros(ash, dtype=np.complex64) + kern.aux = cp.asarray(aux) + + # setup kernels, one for each SCAN. 
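+            # Unlike the block-wise projectional engines, the stochastic engines
+            # process one view at a time (in shuffled order), so aux only needs to
+            # hold the modes of a single view: ash = (nmodes,) + geo.shape.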
+ log(4, "Setting up FourierUpdateKernel") + kern.FUK = FourierUpdateKernel( + aux, nmodes, queue_thread=self.queue) + kern.FUK.fshape = (1,) + kern.FUK.fshape[1:] + kern.FUK.allocate() + + log(4, "Setting up PoUpdateKernel") + kern.POK = PoUpdateKernel(queue_thread=self.queue) + kern.POK.allocate() + + log(4, "Setting up AuxiliaryWaveKernel") + kern.AWK = AuxiliaryWaveKernel(queue_thread=self.queue) + kern.AWK.allocate() + + log(4, "Setting up ArrayUtilsKernel") + kern.AUK = ArrayUtilsKernel(queue=self.queue) + + #log(4, "Setting up TransposeKernel") + #kern.TK = TransposeKernel(queue=self.queue) + + log(4, "setting up MaxAbs2Kernel") + kern.MAK = MaxAbs2Kernel(queue=self.queue) + + log(4, "Setting up PropagationKernel") + kern.PROP = PropagationKernel( + aux, geo.propagator, self.queue, self.p.fft_lib) + kern.PROP.allocate() + kern.resolution = geo.resolution[0] + + if self.do_position_refinement: + log(4, "Setting up position correction") + kern.PCK = PositionCorrectionKernel( + aux, nmodes, self.p.position_refinement, geo.resolution, queue_thread=self.queue) + kern.PCK.allocate() + + ex_mem = 0 + mag_mem = 0 + for scan, kern in self.kernels.items(): + if kern.scanmodel in ("GradFull", "BlockGradFull"): + ex_mem = max(kern.aux.nbytes * 1, ex_mem) + else: + ex_mem = max(kern.aux.nbytes * fpc, ex_mem) + mag_mem = max(kern.FUK.gpu.fdev.nbytes * fpc, mag_mem) + ma_mem = mag_mem + mem = cp.cuda.runtime.memGetInfo()[0] + blk = ex_mem * EX_MA_BLOCKS_RATIO + ma_mem + mag_mem + # leave 200MB room for safety + fit = int(mem - 200 * 1024 * 1024) // blk + if not fit: + log(1, "Cannot fit memory into device, if possible reduce frames per block. Exiting...") + self.context.pop() + self.context.detach() + raise SystemExit("ptypy has been exited.") + + # TODO grow blocks dynamically + nex = min(fit * EX_MA_BLOCKS_RATIO, MAX_BLOCKS) + nma = min(fit, MAX_BLOCKS) + + log(3, 'cupy max blocks fitting on GPU: exit arrays={}, ma_arrays={}'.format(nex, nma)) + # reset memory or create new + self.ex_data = GpuDataManager(ex_mem, 0, nex, True) + self.ma_data = GpuDataManager(ma_mem, 0, nma, False) + self.mag_data = GpuDataManager(mag_mem, 0, nma, False) + log(4, "Kernel setup completed") + + def engine_prepare(self): + super().engine_prepare() + + for name, s in self.ob.S.items(): + s.gpu, s.data = mppa(s.data) + for name, s in self.pr.S.items(): + s.gpu, s.data = mppa(s.data) + + for label, d in self.di.storages.items(): + prep = self.diff_info[d.ID] + prep.addr_gpu = cp.asarray(prep.addr) + if self.do_position_refinement: + prep.mangled_addr_gpu = prep.addr_gpu.copy() + + for label, d in self.ptycho.new_data: + dID = d.ID + prep = self.diff_info[dID] + pID, oID, eID = prep.poe_IDs + + prep.ma_sum_gpu = cp.asarray(prep.ma_sum) + prep.err_fourier_gpu = cp.asarray(prep.err_fourier) + prep.err_phot_gpu = cp.asarray(prep.err_phot) + prep.err_exit_gpu = cp.asarray(prep.err_exit) + if self.do_position_refinement: + prep.error_state_gpu = cp.empty_like(prep.err_fourier_gpu) + prep.obn = cp.asarray(prep.obn) + prep.prn = cp.asarray(prep.prn) + # prepare page-locked mems: + ma = self.ma.S[dID].data.astype(np.float32) + prep.ma = cupyx.empty_pinned(ma.shape, ma.dtype, order="C") + prep.ma[:] = ma + ex = self.ex.S[eID].data + prep.ex = cupyx.empty_pinned(ex.shape, ex.dtype, order="C") + prep.ex[:] = ex + mag = prep.mag + prep.mag = cupyx.empty_pinned(mag.shape, mag.dtype, order="C") + prep.mag[:] = mag + + self.ex_data.add_data_block() + self.ma_data.add_data_block() + self.mag_data.add_data_block() + + def 
engine_iterate(self, num=1): + """ + Compute one iteration. + """ + self.dID_list = list(self.di.S.keys()) + error = {} + for it in range(num): + + for iblock, dID in enumerate(self.dID_list): + + # find probe, object and exit ID in dependence of dID + prep = self.diff_info[dID] + pID, oID, eID = prep.poe_IDs + + # references for kernels + kern = self.kernels[prep.label] + FUK = kern.FUK + AWK = kern.AWK + POK = kern.POK + MAK = kern.MAK + PROP = kern.PROP + + # get aux buffer + aux = kern.aux + + # local references + ob = self.ob.S[oID].gpu + pr = self.pr.S[pID].gpu + + # shuffle view order + vieworder = prep.vieworder + prep.rng.shuffle(vieworder) + + # Schedule ex, ma, mag to device + ev_ex, ex_full, data_ex = self.ex_data.to_gpu( + prep.ex, dID, self.qu_htod) + ev_mag, mag_full, data_mag = self.mag_data.to_gpu( + prep.mag, dID, self.qu_htod) + ev_ma, ma_full, data_ma = self.ma_data.to_gpu( + prep.ma, dID, self.qu_htod) + + # Reference to ex, ma and mag + prep.ex_full = ex_full + prep.mag_full = mag_full + prep.ma_full = ma_full + + # synchronize h2d stream with compute stream + self.queue.wait_event(ev_ex) + + # Iterate through views + for i in vieworder: + + # Get local adress and arrays + addr = prep.addr_gpu[i, None] + ex_from, ex_to = prep.addr_ex[i] + ex = prep.ex_full[ex_from:ex_to] + mag = prep.mag_full[i, None] + ma = prep.ma_full[i, None] + ma_sum = prep.ma_sum_gpu[i, None] + obn = prep.obn + prn = prep.prn + err_phot = prep.err_phot_gpu[i, None] + err_fourier = prep.err_fourier_gpu[i, None] + err_exit = prep.err_exit_gpu[i, None] + + # position update + self.position_update_local(prep, i) + + # build auxilliary wave + AWK.make_aux(aux, addr, ob, pr, ex, + c_po=self._c, c_e=1-self._c) + + # forward FFT + PROP.fw(aux, aux) + + # Deviation from measured data + self.queue.wait_event(ev_mag) + if self.p.compute_fourier_error: + self.queue.wait_event(ev_ma) + FUK.fourier_error(aux, addr, mag, ma, ma_sum) + FUK.error_reduce(addr, err_fourier) + else: + FUK.fourier_deviation(aux, addr, mag) + self.queue.wait_event(ev_ma) + FUK.fmag_update_nopbound(aux, addr, mag, ma) + + # backward FFT + PROP.bw(aux, aux) + + # build exit wave + AWK.make_exit(aux, addr, ob, pr, ex, c_a=self._b, + c_po=self._a, c_e=-(self._a + self._b)) + if self.p.compute_exit_error: + FUK.exit_error(aux, addr) + FUK.error_reduce(addr, err_exit) + + # build auxilliary wave (ob * pr product) + AWK.build_aux2_no_ex(aux, addr, ob, pr) + + # object update + POK.pr_norm_local(addr, pr, prn) + POK.ob_update_local( + addr, ob, pr, ex, aux, prn, a=self._ob_a, b=self._ob_b) + + # probe update + if self._object_norm_is_global and self._pr_a == 0: + obn_max = cp.empty((1,), dtype=np.float32) + MAK.max_abs2(ob, obn_max) + obn.fill(np.float32(0.)) + else: + POK.ob_norm_local(addr, ob, obn) + obn_max = cp.max(obn) + if self.p.probe_update_start <= self.curiter: + POK.pr_update_local( + addr, pr, ob, ex, aux, obn, obn_max, a=self._pr_a, b=self._pr_b) + + # compute log-likelihood + if self.p.compute_log_likelihood: + PROP.fw(aux, aux) + FUK.log_likelihood2(aux, addr, mag, ma, err_phot) + + data_ex.record_done(self.queue, 'compute') + if iblock + len(self.ex_data) < len(self.dID_list): + data_ex.from_gpu(self.qu_dtoh) + + # swap direction + self.dID_list.reverse() + + # Re-center probe + self.center_probe() + + self.curiter += 1 + self.ex_data.syncback = False + + # finish all the compute + self.queue.synchronize() + + for name, s in self.ob.S.items(): + #s.gpu.get_async(stream=self.qu_dtoh, ary=s.data) + 
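+            # Raw cudaMemcpyAsync back into the page-locked host buffer; `kind`
+            # follows cudaMemcpyKind (1 = host-to-device, 2 = device-to-host,
+            # 3 = device-to-device, as used elsewhere in this module).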
cp.cuda.runtime.memcpyAsync(dst=s.data.ctypes.data, + src=s.gpu.data.ptr, + size=s.gpu.nbytes, + kind=2, # d2h + stream=self.queue.ptr) + for name, s in self.pr.S.items(): + #s.gpu.get_async(stream=self.qu_dtoh, ary=s.data) + cp.cuda.runtime.memcpyAsync(dst=s.data.ctypes.data, + src=s.gpu.data.ptr, + size=s.gpu.nbytes, + kind=2, # d2h + stream=self.queue.ptr) + + for dID, prep in self.diff_info.items(): + err_fourier = prep.err_fourier_gpu.get() + err_phot = prep.err_phot_gpu.get() + err_exit = prep.err_exit_gpu.get() + errs = np.ascontiguousarray( + np.vstack([err_fourier, err_phot, err_exit]).T) + error.update(zip(prep.view_IDs, errs)) + + # wait for the async transfers + self.qu_dtoh.synchronize() + + self.error = error + return error + + def position_update_local(self, prep, i): + if not self.do_position_refinement: + return + do_update_pos = (self.p.position_refinement.stop > + self.curiter >= self.p.position_refinement.start) + do_update_pos &= (self.curiter % + self.p.position_refinement.interval) == 0 + + # Update positions + if do_update_pos: + """ + Iterates through all positions and refines them by a given algorithm. + """ + #log(4, "----------- START POS REF -------------") + pID, oID, eID = prep.poe_IDs + mag = prep.mag_full[i, None] + ma = prep.ma_full[i, None] + ma_sum = prep.ma_sum_gpu[i, None] + ob = self.ob.S[oID].gpu + pr = self.pr.S[pID].gpu + kern = self.kernels[prep.label] + aux = kern.aux + addr = prep.addr_gpu[i, None] + mangled_addr = prep.mangled_addr_gpu[i, None] + err_fourier = prep.err_fourier_gpu[i, None] + error_state = prep.error_state_gpu[i, None] + + PCK = kern.PCK + PROP = kern.PROP + + # Keep track of object boundaries + max_oby = ob.shape[-2] - aux.shape[-2] - 1 + max_obx = ob.shape[-1] - aux.shape[-1] - 1 + + # We need to re-calculate the current error + PCK.build_aux(aux, addr, ob, pr) + PROP.fw(aux, aux) + # self.queue.wait_event(ev_mag) + # self.queue.wait_event(ev_ma) + + if self.p.position_refinement.metric == "fourier": + PCK.fourier_error(aux, addr, mag, ma, ma_sum) + PCK.error_reduce(addr, err_fourier) + if self.p.position_refinement.metric == "photon": + PCK.log_likelihood(aux, addr, mag, ma, err_fourier) + cp.cuda.runtime.memcpyAsync(dst=error_state.data.ptr, + src=err_fourier.data.ptr, + size=err_fourier.nbytes, + stream=self.queue.ptr, + kind=3) # d2d + + PCK.mangler.setup_shifts(self.curiter, nframes=addr.shape[0]) + + #log(4, 'Position refinement trial: iteration %s' % (self.curiter)) + for i in range(PCK.mangler.nshifts): + PCK.mangler.get_address( + i, addr, mangled_addr, max_oby, max_obx) + PCK.build_aux(aux, mangled_addr, ob, pr) + PROP.fw(aux, aux) + if self.p.position_refinement.metric == "fourier": + PCK.fourier_error(aux, mangled_addr, mag, ma, ma_sum) + PCK.error_reduce(mangled_addr, err_fourier) + if self.p.position_refinement.metric == "photon": + PCK.log_likelihood(aux, mangled_addr, mag, ma, err_fourier) + PCK.update_addr_and_error_state( + addr, error_state, mangled_addr, err_fourier) + + cp.cuda.runtime.memcpyAsync(dst=err_fourier.data.ptr, + src=error_state.data.ptr, + size=err_fourier.nbytes, + stream=self.queue.ptr, + kind=3) # d2d + + def center_probe(self): + if self.p.probe_center_tol is not None: + for name, pr_s in self.pr.storages.items(): + psum_d = self.A2SK.abs2sum(pr_s.gpu) + c1 = self.MCK.mass_center(psum_d).get() + c2 = (np.asarray(pr_s.shape[-2:]) // 2).astype(c1.dtype) + + shift = c2 - c1 + # exit if the current center of mass is within the tolerance + if u.norm(shift) < self.p.probe_center_tol: + break + + 
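+                # The same interpolated sub-pixel shift is applied to the probe, its
+                # object storage and the matching exit waves so that probe-object
+                # products remain aligned after recentring.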
# shift the probe + pr_s.gpu = self.ISK.interpolate_shift(pr_s.gpu, shift) + + # shift the object + ob_s = pr_s.views[0].pod.ob_view.storage + ob_s.gpu = self.ISK.interpolate_shift(ob_s.gpu, shift) + + # shift the exit waves + for dID in self.di.S.keys(): + prep = self.diff_info[dID] + pID, oID, eID = prep.poe_IDs + if pID == name: + prep.ex_full = self.ISK.interpolate_shift(prep.ex_full, + shift) + + log(4, 'Probe recentered from %s to %s' + % (str(tuple(c1)), str(tuple(c2)))) + + def engine_finalize(self): + """ + clear GPU data and destroy context. + """ + self.ex_data = None + self.ma_data = None + self.mag_data = None + + for name, s in self.ob.S.items(): + del s.gpu + for name, s in self.pr.S.items(): + del s.gpu + for dID, prep in self.diff_info.items(): + prep.addr = prep.addr_gpu.get() + + # copy data to cpu + # this kills the pagelock memory (otherwise we get segfaults in h5py) + for name, s in self.pr.S.items(): + s.data = np.copy(s.data) + for name, s in self.ob.S.items(): + s.data = np.copy(s.data) + + #self.context.detach() + super().engine_finalize() + + +@register() +class EPIE_cupy(_StochasticEngineCupy, EPIEMixin): + """ + An accelerated implementation of the EPIE algorithm. + + Defaults: + + [name] + default = EPIE_cupy + type = str + help = + doc = + + """ + + def __init__(self, ptycho_parent, pars=None): + _StochasticEngineCupy.__init__(self, ptycho_parent, pars) + EPIEMixin.__init__(self, self.p.alpha, self.p.beta) + ptycho_parent.citations.add_article(**self.article) + + +@register() +class SDR_cupy(_StochasticEngineCupy, SDRMixin): + """ + An accelerated implementation of the semi-implicit relaxed Douglas-Rachford (SDR) algorithm. + + Defaults: + + [name] + default = SDR_cupy + type = str + help = + doc = + + """ + + def __init__(self, ptycho_parent, pars=None): + _StochasticEngineCupy.__init__(self, ptycho_parent, pars) + SDRMixin.__init__(self, self.p.sigma, self.p.tau, + self.p.beta_probe, self.p.beta_object) + ptycho_parent.citations.add_article(**self.article) diff --git a/ptypy/accelerate/cuda_cupy/kernels.py b/ptypy/accelerate/cuda_cupy/kernels.py new file mode 100644 index 000000000..049108e71 --- /dev/null +++ b/ptypy/accelerate/cuda_cupy/kernels.py @@ -0,0 +1,1442 @@ +import numpy as np +from ..base import kernels as ab +from . 
import load_kernel +import cupy as cp +from ..base.kernels import Adict +from inspect import getfullargspec +from .array_utils import MaxAbs2Kernel, CropPadKernel +from ptypy.utils.verbose import logger + + +# fourier support +def choose_fft(arr_shape, fft_type=None): + dims_are_powers_of_two = True + rows = arr_shape[0] + columns = arr_shape[1] + if rows != columns or rows not in [16, 32, 64, 128, 256, 512, 1024, 2048]: + dims_are_powers_of_two = False + if fft_type=='cuda' and not dims_are_powers_of_two: + logger.warning('cufft: array dimensions are not powers of two (16 to 2048) - using cufft with seperated callbacks') + from ptypy.accelerate.cuda_cupy.cufft import FFT_cupy as FFT + elif fft_type=='cuda' and dims_are_powers_of_two: + try: + import filtered_cufft + from ptypy.accelerate.cuda_cupy.cufft import FFT_cuda as FFT + except: + logger.warning( + 'Unable to import optimised cufft version - using cufft with separte callbacks instead') + from ptypy.accelerate.cuda_cupy.cufft import FFT_cupy as FFT + else: + from ptypy.accelerate.cuda_cupy.cufft import FFT_cupy as FFT + return FFT + + +class PropagationKernel: + + def __init__(self, aux, propagator, queue_thread=None, fft_type='cuda'): + self.aux = aux + self._queue = queue_thread + self.prop_type = propagator.p.propagation + self.fw = None + self.bw = None + self._fft1 = None + self._fft2 = None + self._p = propagator + self.fft_type = fft_type + + def allocate(self): + + aux = self.aux + FFT = choose_fft(aux.shape[-2:], self.fft_type) + + if self.prop_type == 'farfield': + + self._do_crop_pad = (self._p.crop_pad != 0).any() + if self._do_crop_pad: + aux_shape = tuple(np.array(aux.shape) + + np.append([0], self._p.crop_pad)) + self._tmp = np.zeros(aux_shape, dtype=aux.dtype) + self._CPK = CropPadKernel(queue=self._queue) + else: + self._tmp = aux + + self._fft1 = FFT(self._tmp, self.queue, + pre_fft=self._p.pre_fft, + post_fft=self._p.post_fft, + symmetric=True, + forward=True) + self._fft2 = FFT(self._tmp, self.queue, + pre_fft=self._p.pre_ifft, + post_fft=self._p.post_ifft, + symmetric=True, + forward=False) + if self._do_crop_pad: + self._tmp = cp.asarray(self._tmp) + + def _fw(x, y): + if self._do_crop_pad: + self._CPK.crop_pad_2d_simple(self._tmp, x) + self._fft1.ft(self._tmp, self._tmp) + self._CPK.crop_pad_2d_simple(y, self._tmp) + else: + self._fft1.ft(x, y) + + def _bw(x, y): + if self._do_crop_pad: + self._CPK.crop_pad_2d_simple(self._tmp, x) + self._fft2.ift(self._tmp, self._tmp) + self._CPK.crop_pad_2d_simple(y, self._tmp) + else: + self._fft2.ift(x, y) + + self.fw = _fw + self.bw = _bw + + elif self.prop_type == "nearfield": + self._fft1 = FFT(aux, self.queue, + post_fft=self._p.kernel, + symmetric=True, + forward=True) + self._fft2 = FFT(aux, self.queue, + post_fft=self._p.ikernel, + inplace=True, + symmetric=True, + forward=True) + self._fft3 = FFT(aux, self.queue, + symmetric=True, + forward=False) + + def _fw(x, y): + self._fft1.ft(x, y) + self._fft3.ift(y, y) + + def _bw(x, y): + self._fft2.ft(x, y) + self._fft3.ift(y, y) + + self.fw = _fw + self.bw = _bw + else: + logger.warning( + "Unable to select propagator %s, only nearfield and farfield are supported" % self.prop_type) + + @property + def queue(self): + return self._queue + + @queue.setter + def queue(self, queue): + self._queue = queue + self._fft1.queue = queue + self._fft2.queue = queue + if self.prop_type == "nearfield": + self._fft3.queue = queue + + +class FourierSupportKernel: + def __init__(self, support, queue_thread=None): + self.support = support 
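+        # `support` is attached as the post_fft filter of the forward FFT in
+        # allocate(); apply_fourier_support() then runs FFT -> filter -> IFFT in place.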
+ self.queue = queue_thread + + def allocate(self): + FFT = choose_fft(self.support.shape[-2:]) + + self._fft1 = FFT(self.support, self.queue, + post_fft=self.support, + symmetric=True, + forward=True) + self._fft2 = FFT(self.support, self.queue, + symmetric=True, + forward=False) + + def apply_fourier_support(self, x): + self._fft1.ft(x, x) + self._fft2.ift(x, x) + + +class RealSupportKernel: + def __init__(self, support, queue=None): + self.queue = queue + self.support = support + + def allocate(self): + if self.queue is not None: + self.queue.use() + self.support = cp.asarray(self.support) + + def apply_real_support(self, x): + if self.queue is not None: + self.queue.use() + x *= self.support + + +class FourierUpdateKernel(ab.FourierUpdateKernel): + + def __init__(self, aux, nmodes=1, queue_thread=None, accumulate_type='float', math_type='float'): + super(FourierUpdateKernel, self).__init__(aux, nmodes=nmodes) + + if accumulate_type not in ['float', 'double']: + raise ValueError('Only float or double types are supported') + if math_type not in ['float', 'double']: + raise ValueError('Only float or double types are supported') + self.accumulate_type = accumulate_type + self.math_type = math_type + self.queue = queue_thread + self.fmag_all_update_cuda = load_kernel("fmag_all_update", { + 'IN_TYPE': 'float', + 'OUT_TYPE': 'float', + 'MATH_TYPE': self.math_type + }) + self.fmag_update_nopbound_cuda = None + self.fourier_deviation_cuda = None + self.fourier_error_cuda = load_kernel("fourier_error", { + 'IN_TYPE': 'float', + 'OUT_TYPE': 'float', + 'MATH_TYPE': self.math_type + }) + self.fourier_error2_cuda = None + self.error_reduce_cuda = load_kernel("error_reduce", { + 'IN_TYPE': 'float', + 'OUT_TYPE': 'float', + 'ACC_TYPE': self.accumulate_type, + 'BDIM_X': 32, + 'BDIM_Y': 32, + }) + self.fourier_update_cuda = None + self.log_likelihood_cuda, self.log_likelihood2_cuda = load_kernel( + ("log_likelihood", "log_likelihood2"), { + 'IN_TYPE': 'float', + 'OUT_TYPE': 'float', + 'MATH_TYPE': self.math_type + }, + "log_likelihood.cu") + self.exit_error_cuda = load_kernel("exit_error", { + 'IN_TYPE': 'float', + 'OUT_TYPE': 'float', + 'MATH_TYPE': self.math_type + }) + + self.gpu = Adict() + self.gpu.fdev = None + self.gpu.ferr = None + + def allocate(self): + self.gpu.fdev = cp.zeros(self.fshape, dtype=np.float32) + self.gpu.ferr = cp.zeros(self.fshape, dtype=np.float32) + + def fourier_error(self, f, addr, fmag, fmask, mask_sum): + fdev = self.gpu.fdev + ferr = self.gpu.ferr + if self.queue is not None: + self.queue.use() + if True: + # version going over all modes in a single thread (faster) + self.fourier_error_cuda(grid=(int(fmag.shape[0]), 1, 1), + block=(32, 32, 1), + args=(np.int32(self.nmodes), + f, + fmask, + fmag, + fdev, + ferr, + mask_sum, + addr, + np.int32(self.fshape[1]), + np.int32(self.fshape[2]))) + else: + # version using one thread per mode + shared mem reduction (slower) + if self.fourier_error2_cuda is None: + self.fourier_error2_cuda = load_kernel("fourier_error2") + bx = 16 + by = 16 + bz = int(self.nmodes) + blk = (bx, by, bz) + grd = (int((self.fshape[2] + bx-1) // bx), + int((self.fshape[1] + by-1) // by), + int(self.fshape[0])) + # print('block={}, grid={}, fshape={}'.format(blk, grd, self.fshape)) + self.fourier_error2_cuda(grid=grd, + block=blk, + args=(np.int32(self.nmodes), + f, + fmask, + fmag, + fdev, + ferr, + mask_sum, + addr, + np.int32(self.fshape[1]), + np.int32(self.fshape[2])), + shared_mem=int(bx*by*bz*4)) + + def fourier_deviation(self, f, addr, fmag): + 
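+        # Mask-free companion to fourier_error(): fills gpu.fdev with the deviation
+        # of the modelled magnitudes from the measured ones, used together with
+        # fmag_update_nopbound() by the stochastic engines (no power bound applied).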
fdev = self.gpu.fdev + if self.fourier_deviation_cuda is None: + self.fourier_deviation_cuda = load_kernel("fourier_deviation", { + 'IN_TYPE': 'float', + 'OUT_TYPE': 'float', + 'MATH_TYPE': self.math_type + }) + bx = 64 + by = 1 + if self.queue is not None: + self.queue.use() + self.fourier_deviation_cuda(grid=( + 1, int((self.fshape[2] + by - 1)//by), int(fmag.shape[0])), + block=(bx, by, 1), + args=(np.int32(self.nmodes), + f, + fmag, + fdev, + addr, + np.int32(self.fshape[1]), + np.int32(self.fshape[2]))) + + def error_reduce(self, addr, err_sum): + if self.queue is not None: + self.queue.use() + self.error_reduce_cuda(grid=(int(err_sum.shape[0]), 1, 1), + block=(32, 32, 1), + args=(self.gpu.ferr, + err_sum, + np.int32(self.fshape[1]), + np.int32(self.fshape[2]))) + + def fmag_all_update(self, f, addr, fmag, fmask, err_fmag, pbound=0.0): + fdev = self.gpu.fdev + if self.queue is not None: + self.queue.use() + self.fmag_all_update_cuda(grid=(int(fmag.shape[0]*self.nmodes), 1, 1), + block=(32, 32, 1), + args=(f, + fmask, + fmag, + fdev, + err_fmag, + addr, + np.float32(pbound), + np.int32(self.fshape[1]), + np.int32(self.fshape[2]))) + + def fmag_update_nopbound(self, f, addr, fmag, fmask): + fdev = self.gpu.fdev + bx = 64 + by = 1 + if self.queue is not None: + self.queue.use() + if self.fmag_update_nopbound_cuda is None: + self.fmag_update_nopbound_cuda = load_kernel("fmag_update_nopbound", { + 'IN_TYPE': 'float', + 'OUT_TYPE': 'float', + 'MATH_TYPE': self.math_type + }) + self.fmag_update_nopbound_cuda(grid=(1, + int(( + self.fshape[2] + by - 1) // by), + int(fmag.shape[0]*self.nmodes)), + block=(bx, by, 1), + args=(f, + fmask, + fmag, + fdev, + addr, + np.int32(self.fshape[1]), + np.int32(self.fshape[2]))) + + # Note: this was a test to join the kernels, but it's > 2x slower! 
+ def fourier_update(self, f, addr, fmag, fmask, mask_sum, err_fmag, pbound=0): + if self.fourier_update_cuda is None: + self.fourier_update_cuda = load_kernel("fourier_update") + if self.queue is not None: + self.queue.use() + + fdev = self.gpu.fdev + ferr = self.gpu.ferr + + bx = 16 + by = 16 + bz = int(self.nmodes) + blk = (bx, by, bz) + grd = (int((self.fshape[2] + bx-1) // bx), + int((self.fshape[1] + by-1) // by), + int(self.fshape[0])) + smem = int(bx*by*bz*4) + self.fourier_update_cuda(grid=grd, + block=blk, + args=(np.int32(self.nmodes), + f, + fmask, + fmag, + fdev, + ferr, + mask_sum, + addr, + err_fmag, + np.float32(pbound), + np.int32(self.fshape[1]), + np.int32(self.fshape[2])), + shared_mem=smem) + + def log_likelihood(self, b_aux, addr, mag, mask, err_phot): + ferr = self.gpu.ferr + if self.queue is not None: + self.queue.use() + self.log_likelihood_cuda(grid=(int(mag.shape[0]), 1, 1), + block=(32, 32, 1), + args=(np.int32(self.nmodes), + b_aux, + mask, + mag, + addr, + ferr, + np.int32(self.fshape[1]), + np.int32(self.fshape[2]))) + # TODO: we might want to move this call outside of here + self.error_reduce(addr, err_phot) + + def log_likelihood2(self, b_aux, addr, mag, mask, err_phot): + ferr = self.gpu.ferr + bx = 64 + by = 1 + if self.queue is not None: + self.queue.use() + self.log_likelihood2_cuda(grid=( + 1, int((self.fshape[1] + by - 1) // by), int(mag.shape[0])), + block=(bx, by, 1), + args=(np.int32(self.nmodes), + b_aux, + mask, + mag, + addr, + ferr, + np.int32(self.fshape[1]), + np.int32(self.fshape[2]))) + # TODO: we might want to move this call outside of here + self.error_reduce(addr, err_phot) + + def exit_error(self, aux, addr): + sh = addr.shape + maxz = sh[0] + ferr = self.gpu.ferr + if self.queue is not None: + self.queue.use() + self.exit_error_cuda(grid=(int(maxz), 1, 1), + block=(32, 32, 1), + args=(np.int32(self.nmodes), + aux, + ferr, + addr, + np.int32(self.fshape[1]), + np.int32(self.fshape[2]))) + + +class AuxiliaryWaveKernel(ab.AuxiliaryWaveKernel): + + def __init__(self, queue_thread=None, math_type='float'): + super(AuxiliaryWaveKernel, self).__init__() + # and now initialise the cuda + self.queue = queue_thread + self._ob_shape = None + self._ob_id = None + self.math_type = math_type + if math_type not in ['float', 'double']: + raise ValueError('Only double or float math is supported') + self.make_aux_cuda, self.make_aux2_cuda = load_kernel( + ("make_aux", "make_aux2"), { + 'IN_TYPE': 'float', + 'OUT_TYPE': 'float', + 'MATH_TYPE': self.math_type + }, "make_aux.cu") + self.make_exit_cuda = load_kernel("make_exit", { + 'IN_TYPE': 'float', + 'OUT_TYPE': 'float', + 'MATH_TYPE': self.math_type + }) + self.build_aux_no_ex_cuda, self.build_aux2_no_ex_cuda = load_kernel( + ("build_aux_no_ex", "build_aux2_no_ex"), { + 'IN_TYPE': 'float', + 'OUT_TYPE': 'float', + 'MATH_TYPE': self.math_type + }, "build_aux_no_ex.cu") + # self.build_exit_alpha_tau_cuda = load_kernel("build_exit_alpha_tau", { + # 'IN_TYPE': 'float', + # 'OUT_TYPE': 'float', + # 'MATH_TYPE': self.math_type + # }) + + # DEPRECATED? 
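+    # Note: `cp.to_gpu` in load() below has no cupy equivalent under that name;
+    # `cp.asarray(array)` would be the corresponding call if this legacy path is used.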
+ def load(self, aux, ob, pr, ex, addr): + super(AuxiliaryWaveKernel, self).load(aux, ob, pr, ex, addr) + for key, array in self.npy.__dict__.items(): + self.ocl.__dict__[key] = cp.to_gpu(array) + + def make_aux(self, b_aux, addr, ob, pr, ex, c_po=1.0, c_e=0.0): + obr, obc = self._cache_object_shape(ob) + sh = addr.shape + nmodes = sh[1] + maxz = sh[0] + if self.queue is not None: + self.queue.use() + self.make_aux_cuda(grid=(int(maxz * nmodes), 1, 1), + block=(32, 32, 1), + args=(b_aux, + ex, + np.int32(ex.shape[1]), np.int32(ex.shape[2]), + pr, + np.int32(ex.shape[1]), np.int32(ex.shape[2]), + ob, + obr, obc, + addr, + np.float32( + c_po) if ex.dtype == np.complex64 else np.float64(c_po), + np.float32(c_e) if ex.dtype == np.complex64 else np.float64(c_e))) + + def make_aux2(self, b_aux, addr, ob, pr, ex, c_po=1.0, c_e=0.0): + obr, obc = self._cache_object_shape(ob) + sh = addr.shape + nmodes = sh[1] + maxz = sh[0] + bx = 64 + by = 1 + if self.queue is not None: + self.queue.use() + self.make_aux2_cuda(grid=(1, + int((ex.shape[1] + by - 1)//by), + int(maxz * nmodes)), + block=(bx, by, 1), + args=(b_aux, + ex, + np.int32(ex.shape[1]), np.int32(ex.shape[2]), + pr, + np.int32(ex.shape[1]), np.int32(ex.shape[2]), + ob, + obr, obc, + addr, + np.float32( + c_po) if ex.dtype == np.complex64 else np.float64(c_po), + np.float32( + c_e) if ex.dtype == np.complex64 else np.float64(c_e))) + + def make_exit(self, b_aux, addr, ob, pr, ex, c_a=1.0, c_po=0.0, c_e=-1.0): + obr, obc = self._cache_object_shape(ob) + sh = addr.shape + nmodes = sh[1] + maxz = sh[0] + if self.queue is not None: + self.queue.use() + self.make_exit_cuda(grid=(int(maxz * nmodes), 1, 1), + block=(32, 32, 1), + args=(b_aux, + ex, + np.int32(ex.shape[1]), np.int32(ex.shape[2]), + pr, + np.int32(ex.shape[1]), np.int32(ex.shape[2]), + ob, + obr, obc, + addr, + np.float32( + c_a) if ex.dtype == np.complex64 else np.float64(c_a), + np.float32( + c_po) if ex.dtype == np.complex64 else np.float64(c_po), + np.float32( + c_e) if ex.dtype == np.complex64 else np.float64(c_e))) + + def build_aux2(self, b_aux, addr, ob, pr, ex, alpha=1.0): + # DM only, legacy. 
also make_aux2 does no exit in the parent + self.make_aux2(b_aux, addr, ob, pr, ex, 1.+alpha, -alpha) + + """ + def build_exit_alpha_tau(self, b_aux, addr, ob, pr, ex, alpha=1, tau=1): + obr, obc = self._cache_object_shape(ob) + sh = addr.shape + nmodes = sh[1] + maxz = sh[0] + bx = 64 + by = 1 + if self.queue is not None: + self.queue.use() + self.build_exit_alpha_tau_cuda(grid=(1, int((ex.shape[1] + by - 1) // by), int(maxz * nmodes)), + block=(bx, by, 1), + args=(b_aux, + ex, + np.int32(ex.shape[1]), np.int32(ex.shape[2]), + pr, + np.int32(ex.shape[1]), np.int32(ex.shape[2]), + ob, + obr, obc, + addr, + np.float32(alpha), np.float32(tau))) + """ + + def build_aux_no_ex(self, b_aux, addr, ob, pr, fac=1.0, add=False): + obr, obc = self._cache_object_shape(ob) + sh = addr.shape + nmodes = sh[1] + maxz = sh[0] + if self.queue is not None: + self.queue.use() + self.build_aux_no_ex_cuda(grid=(int(maxz * nmodes), 1, 1), + block=(32, 32, 1), + args=(b_aux, + np.int32(b_aux.shape[-2]), + np.int32(b_aux.shape[-1]), + pr, + np.int32(pr.shape[-2]), + np.int32(pr.shape[-1]), + ob, + obr, obc, + addr, + np.float32( + fac) if pr.dtype == np.complex64 else np.float64(fac), + np.int32(add))) + + def build_aux2_no_ex(self, b_aux, addr, ob, pr, fac=1.0, add=False): + obr, obc = self._cache_object_shape(ob) + sh = addr.shape + nmodes = sh[1] + maxz = sh[0] + bx = 64 + by = 1 + if self.queue is not None: + self.queue.use() + self.build_aux2_no_ex_cuda(grid=(1, int((b_aux.shape[-2] + by - 1)//by), int(maxz * nmodes)), + block=(bx, by, 1), + args=(b_aux, + np.int32(b_aux.shape[-2]), + np.int32(b_aux.shape[-1]), + pr, + np.int32(pr.shape[-2]), + np.int32(pr.shape[-1]), + ob, + obr, obc, + addr, + np.float32( + fac) if pr.dtype == np.complex64 else np.float64(fac), + np.int32(add))) + + def _cache_object_shape(self, ob): + oid = id(ob) + + if not oid == self._ob_id: + self._ob_id = oid + self._ob_shape = (np.int32(ob.shape[-2]), np.int32(ob.shape[-1])) + + return self._ob_shape + + +class GradientDescentKernel(ab.GradientDescentKernel): + + def __init__(self, aux, nmodes=1, queue=None, accumulate_type='double', math_type='float'): + super().__init__(aux, nmodes) + self.queue = queue + self.accumulate_type = accumulate_type + self.math_type = math_type + if (accumulate_type not in ['double', 'float']) or (math_type not in ['double', 'float']): + raise ValueError( + "accumulate and math types must be double for float") + + self.gpu = Adict() + self.gpu.LLden = None + self.gpu.LLerr = None + self.gpu.Imodel = None + + subs = { + 'IN_TYPE': 'float' if self.ftype == np.float32 else 'double', + 'OUT_TYPE': 'float' if self.ftype == np.float32 else 'double', + 'ACC_TYPE': self.accumulate_type, + 'MATH_TYPE': self.math_type + } + self.make_model_cuda = load_kernel('make_model', subs) + self.make_a012_cuda = load_kernel('make_a012', subs) + self.error_reduce_cuda = load_kernel('error_reduce', { + **subs, + 'OUT_TYPE': 'float' if self.ftype == np.float32 else 'double', + 'BDIM_X': 32, + 'BDIM_Y': 32 + }) + self.fill_b_cuda, self.fill_b_reduce_cuda = load_kernel( + ('fill_b', 'fill_b_reduce'), + { + **subs, + 'BDIM_X': 1024, + 'OUT_TYPE': 'float' if self.ftype == np.float32 else 'double' + }, + file="fill_b.cu") + self.main_cuda = load_kernel('gd_main', subs) + self.floating_intensity_cuda_step1, self.floating_intensity_cuda_step2 = \ + load_kernel(('step1', 'step2'), subs, 'intens_renorm.cu') + + def allocate(self): + self.gpu.LLden = cp.zeros(self.fshape, dtype=self.ftype) + self.gpu.LLerr = cp.zeros(self.fshape, 
dtype=self.ftype) + self.gpu.Imodel = cp.zeros(self.fshape, dtype=self.ftype) + tmp = np.ones((self.fshape[0],), dtype=self.ftype) + self.gpu.fic_tmp = cp.asarray(tmp) + + # temporary array for the reduction in fill_b + sh = (3, int((np.prod(self.fshape)*self.nmodes + 1023) // 1024)) + self.gpu.Btmp = cp.zeros( + sh, dtype=np.float64 if self.accumulate_type == 'double' else np.float32) + + def make_model(self, b_aux, addr): + # reference shape + sh = self.fshape + + # batch buffers + Imodel = self.gpu.Imodel + aux = b_aux + + # dimensions / grid + z = np.int32(sh[0]) + y = np.int32(self.nmodes) + x = np.int32(sh[1] * sh[2]) + bx = 1024 + if self.queue is not None: + self.queue.use() + self.make_model_cuda(grid=(int((x + bx - 1) // bx), 1, int(z)), + block=(bx, 1, 1), + args=(aux, Imodel, z, y, x)) + + def make_a012(self, b_f, b_a, b_b, addr, I, fic): + # reference shape (= GPU global dims) + sh = I.shape + + # stopper + maxz = I.shape[0] + + A0 = self.gpu.Imodel + A1 = self.gpu.LLerr + A2 = self.gpu.LLden + + z = np.int32(sh[0]) + maxz = np.int32(maxz) + y = np.int32(self.nmodes) + x = np.int32(sh[1]*sh[2]) + bx = 1024 + if self.queue is not None: + self.queue.use() + self.make_a012_cuda(grid=(int((x + bx - 1) // bx), 1, int(z)), + block=(bx, 1, 1), + args=(b_f, b_a, b_b, I, fic, + A0, A1, A2, z, y, x, maxz)) + + def fill_b(self, addr, Brenorm, w, B): + # stopper + maxz = w.shape[0] + + A0 = self.gpu.Imodel + A1 = self.gpu.LLerr + A2 = self.gpu.LLden + + sz = np.int32(np.prod(w.shape)) + blks = int((sz + 1023) // 1024) + # print('blocks={}, Btmp={}, fshape={}, wshape={}, modes={}'.format(blks, self.gpu.Btmp.shape, self.fshape, w.shape, self.nmodes)) + assert self.gpu.Btmp.shape[1] >= blks + # 2-stage reduction - even if 1 block, as we have a += in second kernel + if self.queue is not None: + self.queue.use() + self.fill_b_cuda(grid=(blks, 1, 1), + block=(1024, 1, 1), + args=(A0, A1, A2, w, + np.float32(Brenorm) if self.ftype == np.float32 else np.float64( + Brenorm), + sz, self.gpu.Btmp)) + self.fill_b_reduce_cuda(grid=(1, 1, 1), + block=(1024, 1, 1), + args=(self.gpu.Btmp, B, np.int32(blks))) + + def error_reduce(self, addr, err_sum): + # reference shape (= GPU global dims) + sh = err_sum.shape + + # stopper + maxz = err_sum.shape[0] + + # batch buffers + ferr = self.gpu.LLerr + + # print('maxz={}, ferr={}'.format(maxz, ferr.shape)) + assert (maxz <= np.prod(ferr.shape[:-2])) + + if self.queue is not None: + self.queue.use() + + # Reduces the LL error along the last 2 dimensions.fd + self.error_reduce_cuda(grid=(int(maxz), 1, 1), + block=(32, 32, 1), + args=(ferr, err_sum, + np.int32(ferr.shape[-2]), + np.int32(ferr.shape[-1]))) + + def floating_intensity(self, addr, w, I, fic): + + # reference shape (= GPU global dims) + sh = I.shape + + # stopper + maxz = I.shape[0] + + # internal buffers + num = self.gpu.LLerr + den = self.gpu.LLden + Imodel = self.gpu.Imodel + fic_tmp = self.gpu.fic_tmp + + ## math ## + xall = np.int32(maxz * sh[1] * sh[2]) + bx = 1024 + + if self.queue is not None: + self.queue.use() + + self.floating_intensity_cuda_step1(grid=(int((xall + bx - 1) // bx), 1, 1), + block=(bx, 1, 1), + args=(Imodel, I, w, num, den, + xall)) + + self.error_reduce_cuda(grid=(int(maxz), 1, 1), + block=(32, 32, 1), + args=(num, fic, + np.int32(num.shape[-2]), + np.int32(num.shape[-1]))) + + self.error_reduce_cuda(grid=(int(maxz), 1, 1), + block=(32, 32, 1), + args=(den, fic_tmp, + np.int32(den.shape[-2]), + np.int32(den.shape[-1]))) + + self.floating_intensity_cuda_step2(grid=(1, 1, 
int(maxz)), + block=(32, 32, 1), + args=(fic_tmp, fic, Imodel, + np.int32(Imodel.shape[-2]), + np.int32(Imodel.shape[-1]))) + + def main(self, b_aux, addr, w, I): + nmodes = self.nmodes + # stopper + maxz = I.shape[0] + + # batch buffers + err = self.gpu.LLerr + Imodel = self.gpu.Imodel + aux = b_aux + + # write-to shape (= GPU global dims) + ish = aux.shape + + x = np.int32(ish[1] * ish[2]) + y = np.int32(nmodes) + z = np.int32(maxz) + bx = 1024 + + if self.queue is not None: + self.queue.use() + + # print(Imodel.dtype, I.dtype, w.dtype, err.dtype, aux.dtype, z, y, x) + self.main_cuda(grid=(int((x + bx - 1) // bx), 1, int(z)), + block=(bx, 1, 1), + args=(Imodel, I, w, err, aux, + z, y, x)) + + +class PoUpdateKernel(ab.PoUpdateKernel): + + def __init__(self, queue_thread=None, + math_type='float', accumulator_type='float'): + super(PoUpdateKernel, self).__init__() + # and now initialise the cuda + if math_type not in ['double', 'float']: + raise ValueError( + 'only float and double are supported for math_type') + if accumulator_type not in ['double', 'float']: + raise ValueError( + 'only float and double are supported for accumulator_type') + self.math_type = math_type + self.accumulator_type = accumulator_type + self.queue = queue_thread + self.norm = None + self.MAK = MaxAbs2Kernel(self.queue) + self.ob_update_cuda = load_kernel("ob_update", { + 'IN_TYPE': 'float', + 'OUT_TYPE': 'float', + 'MATH_TYPE': self.math_type + }) + self.ob_update2_cuda = None # load_kernel("ob_update2") + self.pr_update_cuda = load_kernel("pr_update", { + 'IN_TYPE': 'float', + 'OUT_TYPE': 'float', + 'MATH_TYPE': self.math_type + }) + self.pr_update2_cuda = None + self.ob_update_ML_cuda = load_kernel("ob_update_ML", { + 'IN_TYPE': 'float', + 'OUT_TYPE': 'float', + 'MATH_TYPE': self.math_type + }) + self.ob_update2_ML_cuda = None + self.pr_update_ML_cuda = load_kernel("pr_update_ML", { + 'IN_TYPE': 'float', + 'OUT_TYPE': 'float', + 'MATH_TYPE': self.math_type + }) + self.pr_update2_ML_cuda = None + self.ob_update_local_cuda = load_kernel("ob_update_local", { + 'IN_TYPE': 'float', + 'OUT_TYPE': 'float', + 'MATH_TYPE': self.math_type, + 'ACC_TYPE': self.accumulator_type + }) + self.pr_update_local_cuda = load_kernel("pr_update_local", { + 'IN_TYPE': 'float', + 'OUT_TYPE': 'float', + 'MATH_TYPE': self.math_type, + 'ACC_TYPE': self.accumulator_type + }) + self.ob_norm_local_cuda = load_kernel("ob_norm_local", { + 'IN_TYPE': 'float', + 'OUT_TYPE': 'float', + 'MATH_TYPE': self.math_type, + 'ACC_TYPE': self.accumulator_type + }) + self.pr_norm_local_cuda = load_kernel("pr_norm_local", { + 'IN_TYPE': 'float', + 'OUT_TYPE': 'float', + 'MATH_TYPE': self.math_type, + 'ACC_TYPE': self.accumulator_type + }) + self.ob_update_wasp_cuda = load_kernel("ob_update_wasp", { + 'IN_TYPE': 'float', + 'OUT_TYPE': 'float', + 'MATH_TYPE': self.math_type, + 'ACC_TYPE': self.accumulator_type + }) + self.pr_update_wasp_cuda = load_kernel("pr_update_wasp", { + 'IN_TYPE': 'float', + 'OUT_TYPE': 'float', + 'MATH_TYPE': self.math_type, + 'ACC_TYPE': self.accumulator_type + }) + self.avg_wasp_cuda = load_kernel("avg_wasp", { + 'IN_TYPE': 'float', + 'OUT_TYPE': 'float', + 'MATH_TYPE': self.math_type, + 'ACC_TYPE': self.accumulator_type + }) + + def ob_update(self, addr, ob, obn, pr, ex, atomics=True): + obsh = [np.int32(ax) for ax in ob.shape] + prsh = [np.int32(ax) for ax in pr.shape] + if obn.dtype != np.float32: + raise ValueError( + "Denominator must be float32 in current implementation") + + if self.queue is not None: + self.queue.use() + if 
atomics: + if addr.shape[3] != 3 or addr.shape[2] != 5: + raise ValueError( + 'Address not in required shape for atomics ob_update') + num_pods = np.int32(addr.shape[0] * addr.shape[1]) + self.ob_update_cuda(grid=(int(num_pods), 1, 1), + block=(32, 32, 1), + args=(ex, num_pods, prsh[1], prsh[2], + pr, prsh[0], prsh[1], prsh[2], + ob, obsh[0], obsh[1], obsh[2], + addr, + obn)) + else: + if addr.shape[0] != 5 or addr.shape[1] != 3: + raise ValueError( + 'Address not in required shape for tiled ob_update') + num_pods = np.int32(addr.shape[2] * addr.shape[3]) + if not self.ob_update2_cuda: + self.ob_update2_cuda = load_kernel("ob_update2", { + "NUM_MODES": obsh[0], + "BDIM_X": 16, + "BDIM_Y": 16, + 'IN_TYPE': 'float', + 'OUT_TYPE': 'float', + 'MATH_TYPE': self.math_type, + 'ACC_TYPE': self.accumulator_type + }) + + grid = [int((x+15)//16) for x in ob.shape[-2:]] + grid = (grid[1], grid[0], int(1)) + self.ob_update2_cuda(grid=grid, + block=(16, 16, 1), + args=(prsh[-1], obsh[0], num_pods, obsh[-2], obsh[-1], + prsh[0], + np.int32(ex.shape[0]), + np.int32(ex.shape[1]), + np.int32(ex.shape[2]), + ob, obn, pr, ex, addr)) + + def pr_update(self, addr, pr, prn, ob, ex, atomics=True): + obsh = [np.int32(ax) for ax in ob.shape] + prsh = [np.int32(ax) for ax in pr.shape] + if prn.dtype != np.float32: + raise ValueError( + "Denominator must be float32 in current implementation") + if self.queue is not None: + self.queue.use() + if atomics: + if addr.shape[3] != 3 or addr.shape[2] != 5: + raise ValueError( + 'Address not in required shape for atomics pr_update') + + num_pods = np.int32(addr.shape[0] * addr.shape[1]) + self.pr_update_cuda(grid=(int(num_pods), 1, 1), + block=(32, 32, 1), + args=(ex, num_pods, prsh[1], prsh[2], + pr, prsh[0], prsh[1], prsh[2], + ob, obsh[0], obsh[1], obsh[2], + addr, + prn)) + else: + if addr.shape[0] != 5 or addr.shape[1] != 3: + raise ValueError( + 'Address not in required shape for tiled pr_update') + + num_pods = np.int32(addr.shape[2] * addr.shape[3]) + if not self.pr_update2_cuda: + self.pr_update2_cuda = load_kernel("pr_update2", { + "NUM_MODES": prsh[0], + "BDIM_X": 16, + "BDIM_Y": 16, + 'IN_TYPE': 'float', + 'OUT_TYPE': 'float', + 'MATH_TYPE': self.math_type, + 'ACC_TYPE': self.accumulator_type + }) + + grid = [int((x+15)//16) for x in pr.shape[-2:]] + grid = (grid[0], grid[1], int(1)) + self.pr_update2_cuda(grid=grid, + block=(16, 16, 1), + args=(prsh[-1], obsh[-2], obsh[-1], + prsh[0], obsh[0], num_pods, + pr, prn, ob, ex, addr)) + + def ob_update_ML(self, addr, ob, pr, ex, fac=2.0, atomics=True): + obsh = [np.int32(ax) for ax in ob.shape] + prsh = [np.int32(ax) for ax in pr.shape] + exsh = [np.int32(ax) for ax in ex.shape] + + if self.queue is not None: + self.queue.use() + if atomics: + if addr.shape[3] != 3 or addr.shape[2] != 5: + raise ValueError( + 'Address not in required shape for tiled ob_update') + + num_pods = np.int32(addr.shape[0] * addr.shape[1]) + self.ob_update_ML_cuda(grid=(int(num_pods), 1, 1), + block=(32, 32, 1), + args=(ex, num_pods, exsh[1], exsh[2], + pr, prsh[0], prsh[1], prsh[2], + ob, obsh[0], obsh[1], obsh[2], + addr, + np.float32( + fac) if ex.dtype == np.complex64 else np.float64(fac))) + else: + if addr.shape[0] != 5 or addr.shape[1] != 3: + raise ValueError( + 'Address not in required shape for tiled ob_update') + + num_pods = np.int32(addr.shape[2] * addr.shape[3]) + if not self.ob_update2_ML_cuda: + self.ob_update2_ML_cuda = load_kernel("ob_update2_ML", { + "NUM_MODES": obsh[0], + "BDIM_X": 16, + "BDIM_Y": 16, + 'IN_TYPE': 
'float', + 'OUT_TYPE': 'float', + 'MATH_TYPE': self.math_type, + 'ACC_TYPE': self.accumulator_type + }) + grid = [int((x+15)//16) for x in ob.shape[-2:]] + grid = (grid[1], grid[0], int(1)) + self.ob_update2_ML_cuda(grid=grid, + block=(16, 16, 1), + args=(prsh[-1], obsh[0], num_pods, obsh[-2], obsh[-1], + prsh[0], + np.int32(ex.shape[0]), + np.int32(ex.shape[1]), + np.int32(ex.shape[2]), + ob, pr, ex, addr, + np.float32( + fac) if ex.dtype == np.complex64 else np.float64(fac))) + + def pr_update_ML(self, addr, pr, ob, ex, fac=2.0, atomics=False): + obsh = [np.int32(ax) for ax in ob.shape] + prsh = [np.int32(ax) for ax in pr.shape] + if self.queue is not None: + self.queue.use() + if atomics: + if addr.shape[3] != 3 or addr.shape[2] != 5: + raise ValueError( + 'Address not in required shape for tiled pr_update') + num_pods = np.int32(addr.shape[0] * addr.shape[1]) + self.pr_update_ML_cuda(grid=(int(num_pods), 1, 1), + block=(32, 32, 1), + args=(ex, num_pods, prsh[1], prsh[2], + pr, prsh[0], prsh[1], prsh[2], + ob, obsh[0], obsh[1], obsh[2], + addr, + np.float32( + fac) if ex.dtype == np.complex64 else np.float64(fac))) + else: + if addr.shape[0] != 5 or addr.shape[1] != 3: + raise ValueError( + 'Address not in required shape for tiled pr_update') + num_pods = np.int32(addr.shape[2] * addr.shape[3]) + if not self.pr_update2_ML_cuda: + self.pr_update2_ML_cuda = load_kernel("pr_update2_ML", { + "NUM_MODES": prsh[0], + "BDIM_X": 16, + "BDIM_Y": 16, + 'IN_TYPE': 'float', + 'OUT_TYPE': 'float', + 'MATH_TYPE': self.math_type, + 'ACC_TYPE': self.accumulator_type + }) + + grid = [int((x+15)//16) for x in pr.shape[-2:]] + grid = (grid[0], grid[1], int(1)) + self.pr_update2_ML_cuda(grid=grid, + block=(16, 16, 1), + args=(prsh[-1], obsh[-2], obsh[-1], + prsh[0], obsh[0], num_pods, + pr, ob, ex, addr, + np.float32( + fac) if ex.dtype == np.complex64 else np.float64(fac))) + + def ob_update_local(self, addr, ob, pr, ex, aux, prn, a=0., b=1.): + if self.queue is not None: + self.queue.use() + prn_max = cp.max(prn) + obsh = [np.int32(ax) for ax in ob.shape] + prsh = [np.int32(ax) for ax in pr.shape] + exsh = [np.int32(ax) for ax in ex.shape] + # atomics version only + if addr.shape[3] != 3 or addr.shape[2] != 5: + raise ValueError( + 'Address not in required shape for tiled ob_update') + num_pods = np.int32(addr.shape[0] * addr.shape[1]) + bx = 64 + by = 1 + self.ob_update_local_cuda(grid=( + 1, int((exsh[1] + by - 1)//by), int(num_pods)), + block=(bx, by, 1), + args=(ex, aux, + exsh[0], exsh[1], exsh[2], + pr, + prsh[0], prsh[1], prsh[2], + prn, + ob, + obsh[0], obsh[1], obsh[2], + addr, + prn_max, + np.float32(a), + np.float32(b))) + + def pr_update_local(self, addr, pr, ob, ex, aux, obn, obn_max, a=0., b=1.): + obsh = [np.int32(ax) for ax in ob.shape] + prsh = [np.int32(ax) for ax in pr.shape] + exsh = [np.int32(ax) for ax in ex.shape] + # atomics version only + if addr.shape[3] != 3 or addr.shape[2] != 5: + raise ValueError( + 'Address not in required shape for tiled pr_update') + if self.queue is not None: + self.queue.use() + num_pods = np.int32(addr.shape[0] * addr.shape[1]) + bx = 64 + by = 1 + self.pr_update_local_cuda(grid=( + 1, int((exsh[1] + by - 1) // by), int(num_pods)), + block=(bx, by, 1), + args=(ex, aux, + exsh[0], exsh[1], exsh[2], + pr, + prsh[0], prsh[1], prsh[2], + obn, + ob, + obsh[0], obsh[1], obsh[2], + addr, + obn_max, + np.float32(a), + np.float32(b))) + + def ob_norm_local(self, addr, ob, obn): + obsh = [np.int32(ax) for ax in ob.shape] + obnsh = [np.int32(ax) for ax in 
obn.shape] + bx = 64 + by = 1 + if self.queue is not None: + self.queue.use() + self.ob_norm_local_cuda(grid=( + 1, int((obnsh[1] + by - 1)//by), int(obnsh[0])), + block=(bx, by, 1), + args=(obn, + obnsh[0], obnsh[1], obnsh[2], + ob, + obsh[0], obsh[1], obsh[2], + addr)) + + def pr_norm_local(self, addr, pr, prn): + prsh = [np.int32(ax) for ax in pr.shape] + prnsh = [np.int32(ax) for ax in prn.shape] + bx = 64 + by = 1 + if self.queue is not None: + self.queue.use() + self.pr_norm_local_cuda(grid=( + 1, int((prnsh[1] + by - 1)//by), int(prnsh[0])), + block=(bx, by, 1), + args=(prn, + prnsh[0], prnsh[1], prnsh[2], + pr, + prsh[0], prsh[1], prsh[2], + addr)) + + def ob_update_wasp(self, addr, ob, pr, ex, aux, ob_sum_nmr, ob_sum_dnm, + alpha=1): + # ensure it is C-contiguous! + pr_abs2 = cp.ascontiguousarray((pr * pr.conj()).real) + pr_abs2_mean = cp.mean(pr_abs2, axis=(1,2)) + + obsh = [np.int32(ax) for ax in ob.shape] + prsh = [np.int32(ax) for ax in pr.shape] + exsh = [np.int32(ax) for ax in ex.shape] + + # atomics version only + if addr.shape[3] != 3 or addr.shape[2] != 5: + raise ValueError('Address not in required shape for tiled ob_update') + + num_pods = np.int32(addr.shape[0] * addr.shape[1]) + bx = 64 + by = 1 + if self.queue is not None: + self.queue.use() + self.ob_update_wasp_cuda( + grid=(1, int((exsh[1] + by - 1)//by), int(num_pods)), + block=(bx, by, 1), + args=(ex, aux, + exsh[0], exsh[1], exsh[2], + pr, + pr_abs2, + prsh[0], prsh[1], prsh[2], + ob, + ob_sum_nmr, + ob_sum_dnm, + obsh[0], obsh[1], obsh[2], + addr, + pr_abs2_mean, + np.float32(alpha))) + + def pr_update_wasp(self, addr, pr, ob, ex, aux, pr_sum_nmr, pr_sum_dnm, + beta=1): + if self.queue is not None: + self.queue.use() + + obsh = [np.int32(ax) for ax in ob.shape] + prsh = [np.int32(ax) for ax in pr.shape] + exsh = [np.int32(ax) for ax in ex.shape] + + # atomics version only + if addr.shape[3] != 3 or addr.shape[2] != 5: + raise ValueError('Address not in required shape for tiled ob_update') + + num_pods = np.int32(addr.shape[0] * addr.shape[1]) + bx = 64 + by = 1 + self.pr_update_wasp_cuda( + grid=(1, int((exsh[1] + by - 1)//by), int(num_pods)), + block=(bx, by, 1), + args=(ex, aux, + exsh[0], exsh[1], exsh[2], + pr, + pr_sum_nmr, + pr_sum_dnm, + prsh[0], prsh[1], prsh[2], + ob, + obsh[0], obsh[1], obsh[2], + addr, + np.float32(beta))) + + def avg_wasp(self, arr, nmr, dnm): + arrsh = [np.int32(ax) for ax in arr.shape] + bx = 64 + by = 1 + + if self.queue is not None: + self.queue.use() + self.avg_wasp_cuda( + grid=(1, int((arrsh[1] + by - 1)//by), int(arrsh[0])), + block=(bx, by, 1), + args=(arr, nmr, dnm, arrsh[0], arrsh[1], arrsh[2])) + + +class PositionCorrectionKernel(ab.PositionCorrectionKernel): + from ptypy.accelerate.cuda_cupy import address_manglers + + # these are used by the self.setup method - replacing them with the GPU implementation + MANGLERS = { + 'Annealing': address_manglers.RandomIntMangler, + 'GridSearch': address_manglers.GridSearchMangler + } + + def __init__(self, *args, queue_thread=None, math_type='float', accumulate_type='float', **kwargs): + super(PositionCorrectionKernel, self).__init__(*args, **kwargs) + # make sure we set the right stream in the mangler + self.mangler.queue = queue_thread + if math_type not in ['float', 'double']: + raise ValueError('Only float or double math is supported') + if accumulate_type not in ['float', 'double']: + raise ValueError('Only float or double math is supported') + + # add kernels + self.math_type = math_type + self.accumulate_type = 
accumulate_type + self.queue = queue_thread + self._ob_shape = None + self._ob_id = None + self.fourier_error_cuda = load_kernel("fourier_error", { + 'IN_TYPE': 'float', + 'OUT_TYPE': 'float', + 'MATH_TYPE': self.math_type + }) + self.error_reduce_cuda = load_kernel("error_reduce", { + 'IN_TYPE': 'float', + 'OUT_TYPE': 'float', + 'BDIM_X': 32, + 'BDIM_Y': 32, + 'ACC_TYPE': self.accumulate_type + }) + self.log_likelihood_cuda, self.log_likelihood_ml_cuda = load_kernel( + ("log_likelihood", "log_likelihood_ml"), { + 'IN_TYPE': 'float', + 'OUT_TYPE': 'float', + 'MATH_TYPE': self.math_type + }, "log_likelihood.cu") + self.build_aux_pc_cuda = load_kernel("build_aux_position_correction", { + 'IN_TYPE': 'float', + 'OUT_TYPE': 'float', + 'MATH_TYPE': self.math_type + }) + self.update_addr_and_error_state_cuda = load_kernel("update_addr_error_state", { + 'IN_TYPE': 'float', + 'OUT_TYPE': 'float' + }) + + self.gpu = Adict() + self.gpu.fdev = None + self.gpu.ferr = None + + def allocate(self): + self.gpu.fdev = cp.zeros(self.fshape, dtype=np.float32) + self.gpu.ferr = cp.zeros(self.fshape, dtype=np.float32) + + def build_aux(self, b_aux, addr, ob, pr): + obr, obc = self._cache_object_shape(ob) + sh = addr.shape + nmodes = sh[1] + maxz = sh[0] + if self.queue is not None: + self.queue.use() + self.build_aux_pc_cuda(grid=(int(maxz * nmodes), 1, 1), + block=(32, 32, 1), + args=(b_aux, + pr, + np.int32(pr.shape[1]), np.int32( + pr.shape[2]), + ob, + obr, obc, + addr)) + + def fourier_error(self, f, addr, fmag, fmask, mask_sum): + fdev = self.gpu.fdev + ferr = self.gpu.ferr + if self.queue is not None: + self.queue.use() + self.fourier_error_cuda(grid=(int(fmag.shape[0]), 1, 1), + block=(32, 32, 1), + args=(np.int32(self.nmodes), + f, + fmask, + fmag, + fdev, + ferr, + mask_sum, + addr, + np.int32(self.fshape[1]), + np.int32(self.fshape[2]))) + + def error_reduce(self, addr, err_fmag): + # import sys + # float_size = sys.getsizeof(np.float32(4)) + # shared_memory_size =int(2 * 32 * 32 *float_size) # this doesn't work even though its the same... + # shared_memory_size = int(49152) + if self.queue is not None: + self.queue.use() + self.error_reduce_cuda(grid=(int(err_fmag.shape[0]), 1, 1), + block=(32, 32, 1), + args=(self.gpu.ferr, + err_fmag, + np.int32(self.fshape[1]), + np.int32(self.fshape[2]))) + + def log_likelihood(self, b_aux, addr, mag, mask, err_phot): + ferr = self.gpu.ferr + if self.queue is not None: + self.queue.use() + self.log_likelihood_cuda(grid=(int(mag.shape[0]), 1, 1), + block=(32, 32, 1), + args=(np.int32(self.nmodes), + b_aux, + mask, + mag, + addr, + ferr, + np.int32(self.fshape[1]), + np.int32(self.fshape[2]))) + # TODO: we might want to move this call outside of here + self.error_reduce(addr, err_phot) + + def log_likelihood_ml(self, b_aux, addr, I, weights, err_phot): + ferr = self.gpu.ferr + if self.queue is not None: + self.queue.use() + self.log_likelihood_ml_cuda(grid=(int(I.shape[0]), 1, 1), + block=(32, 32, 1), + args=(np.int32(self.nmodes), + b_aux, + weights, + I, + addr, + ferr, + np.int32(self.fshape[1]), + np.int32(self.fshape[2]))) + # TODO: we might want to move this call outside of here + self.error_reduce(addr, err_phot) + + def update_addr_and_error_state(self, addr, error_state, mangled_addr, err_sum): + # assume all data is on GPU! 
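+        # Grid covers all frames, two per block (blockDim.y = 2). For each frame
+        # the kernel keeps the lower of error_state and err_sum; where the trial
+        # (mangled) positions improved the error it also copies mangled_addr
+        # into addr.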
+ if self.queue is not None: + self.queue.use() + self.update_addr_and_error_state_cuda(grid=( + 1, int((err_sum.shape[0] + 1) // 2), 1), + block=(32, 2, 1), + args=(addr, mangled_addr, error_state, err_sum, + np.int32(addr.shape[1]))) + + def _cache_object_shape(self, ob): + oid = id(ob) + + if not oid == self._ob_id: + self._ob_id = oid + self._ob_shape = (np.int32(ob.shape[-2]), np.int32(ob.shape[-1])) + + return self._ob_shape diff --git a/ptypy/accelerate/cuda_cupy/mem_utils.py b/ptypy/accelerate/cuda_cupy/mem_utils.py new file mode 100644 index 000000000..a92a9657b --- /dev/null +++ b/ptypy/accelerate/cuda_cupy/mem_utils.py @@ -0,0 +1,319 @@ +import numpy as np +import cupy as cp +import cupyx +from collections import deque + + +def make_pagelocked_paired_arrays(ar): + mem = cupyx.empty_pinned(ar.shape, ar.dtype, order="C") + mem[:] = ar + return cp.asarray(mem), mem + + +class GpuData: + """ + Manages one block of GPU data with corresponding CPU data. + Keeps track of which cpu array is currently on GPU by its id, + and transfers if it's not already there. + + To be used for the exit wave, ma, and mag arrays. + Note: Allocator should be pooled for best performance + """ + + def __init__(self, nbytes, syncback=False): + """ + New instance of GpuData. Allocates the GPU-side array. + + :param nbytes: Number of bytes held by this instance. + :param syncback: Should the data be synced back to CPU any time it's swapped out + """ + + self.gpu = None + self.gpuraw = cp.cuda.alloc(nbytes) + self.nbytes = nbytes + self.nbytes_buffer = nbytes + self.gpuId = None + self.cpu = None + self.syncback = syncback + self.ev_done = None + + def _allocator(self, nbytes): + if nbytes > self.nbytes: + raise Exception('requested more bytes than maximum given before: {} vs {}'.format( + nbytes, self.nbytes)) + return self.gpuraw + + def record_done(self, stream): + self.ev_done = cp.cuda.Event() + with stream: + self.ev_done.record() + + def to_gpu(self, cpu, id, stream): + """ + Transfer cpu array to GPU on stream (async), keeping track of its id + """ + if self.gpuId != id: + if self.syncback: + self.from_gpu(stream) + self.gpuId = id + self.cpu = cpu + if self.ev_done is not None: + self.ev_done.synchronize() + alloc = cp.cuda.get_allocator() + try: + cp.cuda.set_allocator(self._allocator) + with stream: + self.gpu = cp.asarray(cpu) + finally: + cp.cuda.set_allocator(alloc) + return self.gpu + + def from_gpu(self, stream): + """ + Transfer data back to CPU, into same data handle it was copied from + before. + """ + if self.cpu is not None and self.gpuId is not None and self.gpu is not None: + if self.ev_done is not None: + stream.wait_event(self.ev_done) + cp.cuda.runtime.memcpyAsync(dst=self.cpu.ctypes.data, + src=self.gpu.data.ptr, + size=self.gpu.nbytes, + kind=2, # d2h + stream=stream.ptr) + self.ev_done = cp.cuda.Event() + self.ev_done.record(stream) + + def resize(self, nbytes): + """ + Resize the size of the underlying buffer, to allow re-use in different contexts. 
+ Note that memory will only be freed/reallocated if the new number of bytes are + either larger than before, or if they are less than 90% of the original size - + otherwise it reuses the existing buffer + """ + if nbytes > self.nbytes_buffer or nbytes < self.nbytes_buffer * .9: + self.nbytes_buffer = nbytes + self.gpuraw.mem.free() + self.gpuraw = cp.cuda.alloc(nbytes) + + self.nbytes = nbytes + self.reset() + + def reset(self): + """ + Resets handles of cpu references and ids, so that all data will be transfered + again even if IDs match. + """ + self.gpuId = None + self.cpu = None + self.ev_done = None + + def free(self): + """ + Free the underlying buffer on GPU - this object should not be used afterwards + """ + self.gpuraw.mem.free() + self.gpuraw = None + + +class GpuData2(GpuData): + """ + Manages one block of GPU data with corresponding CPU data. + Keeps track of which cpu array is currently on GPU by its id, + and transfers if it's not already there. + + To be used for the exit wave, ma, and mag arrays. + Note: Allocator should be pooled for best performance + """ + + def __init__(self, nbytes, syncback=False): + """ + New instance of GpuData. Allocates the GPU-side array. + + :param nbytes: Number of bytes held by this instance. + :param syncback: Should the data be synced back to CPU any time it's swapped out + """ + self.done_what = None + super().__init__(nbytes, syncback) + + def record_done(self, stream, what): + assert what in ['dtoh', 'htod', 'compute'] + self.ev_done = cp.cuda.Event() + with stream: + self.ev_done.record() + self.done_what = what + + def to_gpu(self, cpu, ident, stream): + """ + Transfer cpu array to GPU on stream (async), keeping track of its id + """ + ident = id(cpu) if ident is None else ident + if self.gpuId != ident: + if self.ev_done is not None: + stream.wait_event(self.ev_done) + # Safety measure. This is asynchronous, but it should still work + # Essentially we want to copy the data held in gpu array back to its CPU + # handle before the buffer can be reused. + if self.done_what != 'dtoh' and self.syncback: + # uploads on the download stream, easy to spot in nsight-sys + self.from_gpu(stream) + self.gpuId = ident + self.cpu = cpu + alloc = cp.cuda.get_allocator() + try: + cp.cuda.set_allocator(self._allocator) + with stream: + self.gpu = cp.asarray(cpu) + finally: + cp.cuda.set_allocator(alloc) + self.record_done(stream, 'htod') + return self.ev_done, self.gpu + + def from_gpu(self, stream): + """ + Transfer data back to CPU, into same data handle it was copied from + before. + """ + if self.cpu is not None and self.gpuId is not None and self.gpu is not None: + # Wait for any action recorded with this array + if self.ev_done is not None: + stream.wait_event(self.ev_done) + cp.cuda.runtime.memcpyAsync(dst=self.cpu.ctypes.data, + src=self.gpu.data.ptr, + size=self.gpu.nbytes, + kind=2, # d2h + stream=stream.ptr) + self.record_done(stream, 'dtoh') + # Mark for reuse + self.gpuId = None + return self.ev_done + else: + return None + + +class GpuDataManager: + """ + Manages a set of GpuData instances, to keep several blocks on device. + + Currently all blocks must be the same size. + + Note that the syncback property is used so that during fourier updates, + the exit wave array is synced bck to cpu (it is updated), + while during probe update, it's not. + """ + + def __init__(self, nbytes, num, max=None, syncback=False): + """ + Create an instance of GpuDataManager. 
+ Parameters are the same as for GpuData, and num is the number of + GpuData instances to create (blocks on device). + """ + self._syncback = syncback + self._nbytes = nbytes + self.data = [] + self.max = max + for i in range(num): + self.add_data_block() + + def add_data_block(self, nbytes=None): + """ + Add a GpuData block. + + Parameters + ---------- + nbytes - Size of block + + Returns + ------- + """ + if self.max is None or len(self) < self.max: + nbytes = nbytes if nbytes is not None else self._nbytes + self.data.append(GpuData2(nbytes, self._syncback)) + + @property + def syncback(self): + """ + Get if syncback of data to CPU on swapout is enabled. + """ + return self._syncback + + @syncback.setter + def syncback(self, whether): + """ + Adjust the syncback setting + """ + self._syncback = whether + for d in self.data: + d.syncback = whether + + @property + def nbytes(self): + """ + Get the number of bytes in each block + """ + return self.data[0].nbytes + + @property + def memory(self): + """ + Get all memory occupied by all blocks + """ + m = 0 + for d in self.data: + m += d.nbytes_buffer + return m + + def __len__(self): + return len(self.data) + + def reset(self, nbytes, num): + """ + Reset this object as if these parameters were given to the constructor. + The syncback property is untouched. + """ + sync = self.syncback + # remove if too many, explictly freeing memory + for i in range(num, len(self.data)): + self.data[i].free() + # cut short if too many + self.data = self.data[:num] + # reset existing + for d in self.data: + d.resize(nbytes) + # append new ones + for i in range(len(self.data), num): + self.data.append(GpuData2(nbytes, sync)) + + def free(self): + """ + Explicitly clear all data blocks - same as resetting to 0 blocks + """ + self.reset(0, 0) + + def to_gpu(self, cpu, id, stream, pop_id="none"): + """ + Transfer a block to the GPU, given its ID and CPU data array + """ + idx = 0 + for x in self.data: + if x.gpuId == id or x.gpuId == pop_id: + break + idx += 1 + if idx == len(self.data): + idx = 0 + else: + pass + m = self.data.pop(idx) + self.data.append(m) + #print("Swap %s for %s and move from %d to %d" % (m.gpuId,id,idx,len(self.data))) + ev, gpu = m.to_gpu(cpu, id, stream) + # return the wait event, the gpu array and the function to register a finished computation + return ev, gpu, m + + def sync_to_cpu(self, stream): + """ + Sync back all data to CPU + """ + for x in self.data: + x.from_gpu(stream) + diff --git a/ptypy/accelerate/cuda_cupy/multi_gpu.py b/ptypy/accelerate/cuda_cupy/multi_gpu.py new file mode 100644 index 000000000..79f511423 --- /dev/null +++ b/ptypy/accelerate/cuda_cupy/multi_gpu.py @@ -0,0 +1,151 @@ +""" +Multi-GPU AllReduce Wrapper, that uses NCCL via cupy if it's available, +and otherwise falls back to CUDA-aware MPI, +and if that doesn't work, uses host/device copies with regular MPI. + +Findings: + +1) OpenMPI with CUDA support needs to be available, and: + - mpi4py needs to be compiled from master (3.1.0a - latest stable release 3.0.x doesn't have it) + - OpenMPI in a conda install needs to have the environment variable + --> if cuda support isn't enabled, the application simply crashes with a seg fault + +2) For NCCL peer-to-peer transfers, the EXCLUSIVE compute mode cannot be used. + It should be in DEFAULT mode. 
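+
+Typical use (a sketch; assumes one GPU per MPI rank has already been selected
+and `arr` is a cupy array of identical shape on every rank):
+
+    comm = get_multi_gpu_communicator()
+    comm.allReduceSum(arr)   # in-place elementwise sum across all ranks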
+ +""" + +from pkg_resources import parse_version +import numpy as np +import cupy as cp +from ptypy.utils import parallel +from ptypy.utils.verbose import logger, log +import os +from cupy.cuda import nccl + +try: + import mpi4py +except ImportError: + mpi4py = None + +# properties to check which versions are available + +# use NCCL if it is available, and the user didn't override the +# default selection with environment variables +have_nccl = (not 'PTYPY_USE_CUDAMPI' in os.environ) and \ + (not 'PTYPY_USE_MPI' in os.environ) + +# At the moment, we require: +# the OpenMPI env var OMPI_MCA_opal_cuda_support to be set to true, +# mpi4py >= 3.1.0 +# and not setting the PTYPY_USE_MPI environment variable +# +# -> we ideally want to allow enabling support from a parameter in ptypy +have_cuda_mpi = (mpi4py is not None) and \ + "OMPI_MCA_opal_cuda_support" in os.environ and \ + os.environ["OMPI_MCA_opal_cuda_support"] == "true" and \ + parse_version(parse_version(mpi4py.__version__).base_version) >= parse_version("3.1.0") and \ + not ('PTYPY_USE_MPI' in os.environ) + + +class MultiGpuCommunicatorBase: + """Base class for multi-GPU communicator options, to aggregate common bits""" + + def __init__(self): + self.rank = parallel.rank + self.ndev = parallel.size + + def allReduceSum(self, arr): + """Call MPI.all_reduce in-place, with array on GPU""" + # base class only checks properties of arrays + assert isinstance(arr, cp.ndarray), "Input must be a GPU Array" + + +class MultiGpuCommunicatorMpi(MultiGpuCommunicatorBase): + """Communicator for AllReduce that uses MPI on the CPU, i.e. D2H, allreduce, H2D""" + + def allReduceSum(self, arr): + """Call MPI.all_reduce in-place, with array on GPU""" + super().allReduceSum(arr) + + if parallel.MPIenabled: + # note: this creates a temporary CPU array + data = arr.get() + parallel.allreduce(data) + arr.set(data) + + +class MultiGpuCommunicatorCudaMpi(MultiGpuCommunicatorBase): + + def allReduceSum(self, arr): + """Call MPI.all_reduce in-place, with array on GPU""" + + if parallel.MPIenabled: + comm = parallel.comm + comm.Allreduce(parallel.MPI.IN_PLACE, arr) + + +class MultiGpuCommunicatorNccl(MultiGpuCommunicatorBase): + + def __init__(self): + super().__init__() + + # Check if GPUs are in default mode + if cp.cuda.Device().attributes["ComputeMode"] != 0: ## ComputeModeDefault + raise RuntimeError( + "Compute mode must be default in order to use NCCL") + + # get a unique identifier for the NCCL communicator and + # broadcast it to all MPI processes (assuming one device per process) + if self.rank == 0: + self.id = nccl.get_unique_id() + else: + self.id = None + + self.id = parallel.bcast(self.id) + + self.com = nccl.NcclCommunicator(self.ndev, self.id, self.rank) + + def allReduceSum(self, arr): + """Call MPI.all_reduce in-place, with array on GPU""" + + count, datatype = self.__get_NCCL_count_dtype(arr) + + self.com.allReduce(arr.data.ptr, arr.data.ptr, count, datatype, nccl.NCCL_SUM, + cp.cuda.get_current_stream().ptr) + + def __get_NCCL_count_dtype(self, arr): + if arr.dtype == np.complex64: + return arr.size*2, nccl.NCCL_FLOAT32 + elif arr.dtype == np.complex128: + return arr.size*2, nccl.NCCL_FLOAT64 + elif arr.dtype == np.float32: + return arr.size, nccl.NCCL_FLOAT32 + elif arr.dtype == np.float64: + return arr.size, nccl.NCCL_FLOAT64 + else: + raise ValueError("This dtype is not supported by NCCL.") + + +# pick the appropriate communicator depending on installed packages +def get_multi_gpu_communicator(use_nccl=True, use_cuda_mpi=True): + if 
have_nccl and use_nccl: + try: + comm = MultiGpuCommunicatorNccl() + log(4, "Using NCCL communicator") + return comm + except RuntimeError: + pass + except AttributeError: + # see issue #323 + pass + if have_cuda_mpi and use_cuda_mpi: + try: + comm = MultiGpuCommunicatorCudaMpi() + log(4, "Using CUDA-aware MPI communicator") + return comm + except RuntimeError: + pass + comm = MultiGpuCommunicatorMpi() + log(4, "Using MPI communicator") + return comm diff --git a/ptypy/accelerate/cuda_cupy/porting_notes.md b/ptypy/accelerate/cuda_cupy/porting_notes.md new file mode 100644 index 000000000..a10869492 --- /dev/null +++ b/ptypy/accelerate/cuda_cupy/porting_notes.md @@ -0,0 +1,60 @@ +# PyCUDA to CuPy Porting Notes + +This file collects notes for things to consider and issues that were fixed when +porting the pycuda code to cupy. + +## Simple Conversions + +- `gpuarray.to_gpu` => `cp.asarray` +- `gpuarray.zeros`, etc, typically have cupy equivalents in `cp.` +- `gpuarray.get` generally works with `cp.get` as well, but cupy has a more descriptive `cp.asnumpy` as well +- all functions that don't have a direct numpy equivalent are in `cupyx` rather than `cupy` + (for example for pinned arrays) +- raw data pointers to GPU arrays can be retrieved with `x.data.ptr` +- raw data pointers to streams: `stream.ptr` +- low-level APIs, are closer to the standard CUDA runtime calls and are in `cupy.cuda.runtime` module, for example `memcpyAsync` +- streams are not parameters, but rather contexts: + +```python +stream = cp.cuda.Stream() +with stream: + ... # kernel calls etc will go onto this stream + +# alternative: +stream.use() +... # next kernel calls will use that stream +``` + + +## Sticky Points + +### Memory Pool + +- cupy uses a device memory pool by default, which re-uses freed memory blocks +- the pool is empty at the start and new allocations are using the regular cudaAlloc functions +- once blocks are freed, they are not given back to the device with cudaFree, but are rather + kept in a free list and re-used in further allocations +- therefore the flag for using device memory pool that some engines had made no sense +- this also affects are total available memory should be calculated - it is in fact the free + device memory + the free memory in the pool + +### Page-locked Memory Pool + +- cupy also uses a `PinnedMemoryPool` for obtaining page-locked blocks +- these will be kept in a free list when they are not required anymore +- it works similar to the `DeviceMemoryPool` + +### Context Management + +- cupy does not have explicit context creation or deletion of the context +- everything runs in the CUDA runtime's default context (created on first use by default) +- no functions are available to pop the context (as in PyCuda), so need to be + careful with cleanup + + +### Kernel Compilation + +- cupy uses NVTRC, which is slightly different to NVCC. 
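+
+  As a rough illustration (not ptypy's actual loader), this is the kind of
+  runtime compilation cupy performs for a raw kernel:
+
+  ```python
+  import cupy as cp
+
+  src = r'''
+  extern "C" __global__ void scale(float *x, float a, int n) {
+      int i = blockIdx.x * blockDim.x + threadIdx.x;
+      if (i < n) x[i] *= a;
+  }
+  '''
+  # compiled by the runtime compiler (NVRTC) on first use
+  mod = cp.RawModule(code=src, options=('-std=c++14',))
+  scale = mod.get_function('scale')
+  x = cp.ones(1024, dtype=cp.float32)
+  scale((4,), (256,), (x, cp.float32(2.0), cp.int32(x.size)))
+  ```
+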
+- the generated device code is not exactly the same for some reason +- Kernels might therefore perform a little bit different - faster or slower, but tests showed + that they are largely equivalent in performance diff --git a/ptypy/accelerate/cuda_pycuda/__init__.py b/ptypy/accelerate/cuda_pycuda/__init__.py index e6c51d49f..92267c775 100644 --- a/ptypy/accelerate/cuda_pycuda/__init__.py +++ b/ptypy/accelerate/cuda_pycuda/__init__.py @@ -1,10 +1,15 @@ +import sys +import os + +import numpy as np import pycuda.driver as cuda from pycuda.compiler import SourceModule -import numpy as np -import os -# debug_options = [] -# debug_options = ['-O0', '-G', '-g'] -debug_options = ['-O3', '-DNDEBUG', '-lineinfo'] # release mode flags +from pycuda.tools import DeviceMemoryPool + +from ptypy.utils import parallel + +kernel_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'cuda_common')) +debug_options = ['-O3', '-DNDEBUG', '-lineinfo', '-I' + kernel_dir] # release mode flags # C++14 support was added with CUDA 9, so we only enable the flag there if cuda.get_version()[0] >= 9: @@ -12,45 +17,70 @@ else: debug_options += ['-std=c++11'] -context = None +# ensure pycuda's make_default_context picks up the correct GPU for that rank +os.environ['CUDA_DEVICE'] = str(parallel.rank_local) + queue = None +dev_pool = None + + +def _pycuda_excepthook(type, value, tb): + global dev_pool + + # memory pool clean-up, avoid memory leak in the case of raising exception + if dev_pool is not None: + # only do the clean-up if it is present + dev_pool.stop_holding() -def get_context(new_context=False, new_queue=False): + # raise the original exception + sys.__excepthook__(type, value, tb) +sys.excepthook = _pycuda_excepthook - from ptypy.utils import parallel - global context +def get_context(new_queue=False): + global queue - if context is None or new_context: - cuda.init() - if parallel.rank_local >= cuda.Device.count(): - raise Exception('Local rank must be smaller than total device count, \ - rank={}, rank_local={}, device_count={}'.format( - parallel.rank, parallel.rank_local, cuda.Device.count() - )) - context = cuda.Device(parallel.rank_local).make_context() - context.push() - # print("made context %s on rank %s" % (str(context), str(parallel.rank))) - # print("The cuda device count on %s is:%s" % (str(parallel.rank), - # str(cuda.Device.count()))) - # print("parallel.rank:%s, parallel.rank_local:%s" % (str(parallel.rank), - # str(parallel.rank_local))) + # idempotent anyway + cuda.init() + + if parallel.rank_local >= cuda.Device.count(): + raise Exception('Local rank must be smaller than total device count, \ + rank={}, rank_local={}, device_count={}'.format( + parallel.rank, parallel.rank_local, cuda.Device.count() + )) + + # the existing context will always be the primary context, unless + # explicitly created elsewhere + if (context := cuda.Context.get_current()) is None: + from pycuda import autoprimaryctx + context = autoprimaryctx.context + if queue is None or new_queue: queue = cuda.Stream() - + return context, queue +def get_dev_pool(): + global dev_pool + + # retain a single global instance of device memory pool + if dev_pool is None: + dev_pool = DeviceMemoryPool() + + return dev_pool + + def load_kernel(name, subs={}, file=None): if file is None: if isinstance(name, str): - fn = "%s/cuda/%s.cu" % (os.path.dirname(__file__), name) + fn = "%s/%s.cu" % (kernel_dir, name) else: raise ValueError("name parameter must be a string if not filename is given") else: - fn = "%s/cuda/%s" % 
(os.path.dirname(__file__), file) + fn = "%s/%s" % (kernel_dir, file) with open(fn, 'r') as f: kernel = f.read() @@ -60,9 +90,8 @@ def load_kernel(name, subs={}, file=None): escaped = fn.replace("\\", "\\\\") kernel = '#line 1 "{}"\n'.format(escaped) + kernel mod = SourceModule(kernel, include_dirs=[np.get_include()], no_extern_c=True, options=debug_options) - + if isinstance(name, str): return mod.get_function(name) else: # tuple return tuple(mod.get_function(n) for n in name) - diff --git a/ptypy/accelerate/cuda_pycuda/array_utils.py b/ptypy/accelerate/cuda_pycuda/array_utils.py index 7c2de8f3f..72eae996f 100644 --- a/ptypy/accelerate/cuda_pycuda/array_utils.py +++ b/ptypy/accelerate/cuda_pycuda/array_utils.py @@ -1,26 +1,11 @@ +from ptypy.accelerate.cuda_common.utils import map2ctype + from . import load_kernel from pycuda import gpuarray import pycuda.driver as cuda from ptypy.utils import gaussian import numpy as np -# maps a numpy dtype to the corresponding C type -def map2ctype(dt): - if dt == np.float32: - return 'float' - elif dt == np.float64: - return 'double' - elif dt == np.complex64: - return 'complex' - elif dt == np.complex128: - return 'complex' - elif dt == np.int32: - return 'int' - elif dt == np.int64: - return 'long long' - else: - raise ValueError('No mapping for {}'.format(dt)) - class ArrayUtilsKernel: def __init__(self, acc_dtype=np.float64, queue=None): @@ -285,7 +270,7 @@ def delxf(self, input, out, axis=-1): axis = np.int32(axis) if axis == input.ndim - 1: - flat_dim = np.int32(np.product(input.shape[0:-1])) + flat_dim = np.int32(np.prod(input.shape[0:-1])) self.delxf_last(input, out, flat_dim, np.int32(input.shape[axis]), block=self.last_axis_block, grid=( @@ -295,8 +280,8 @@ def delxf(self, input, out, axis=-1): stream=self.queue ) else: - lower_dim = np.int32(np.product(input.shape[(axis+1):])) - higher_dim = np.int32(np.product(input.shape[:axis])) + lower_dim = np.int32(np.prod(input.shape[(axis+1):])) + higher_dim = np.int32(np.prod(input.shape[:axis])) gx = int( (lower_dim + self.mid_axis_block[0] - 1) // self.mid_axis_block[0]) gy = 1 @@ -316,7 +301,7 @@ def delxb(self, input, out, axis=-1): axis = np.int32(axis) if axis == input.ndim - 1: - flat_dim = np.int32(np.product(input.shape[0:-1])) + flat_dim = np.int32(np.prod(input.shape[0:-1])) self.delxb_last(input, out, flat_dim, np.int32(input.shape[axis]), block=self.last_axis_block, grid=( @@ -326,8 +311,8 @@ def delxb(self, input, out, axis=-1): stream=self.queue ) else: - lower_dim = np.int32(np.product(input.shape[(axis+1):])) - higher_dim = np.int32(np.product(input.shape[:axis])) + lower_dim = np.int32(np.prod(input.shape[(axis+1):])) + higher_dim = np.int32(np.prod(input.shape[:axis])) gx = int( (lower_dim + self.mid_axis_block[0] - 1) // self.mid_axis_block[0]) gy = 1 diff --git a/ptypy/accelerate/cuda_pycuda/cufft.py b/ptypy/accelerate/cuda_pycuda/cufft.py index d10e82b1a..5364f092d 100644 --- a/ptypy/accelerate/cuda_pycuda/cufft.py +++ b/ptypy/accelerate/cuda_pycuda/cufft.py @@ -1,10 +1,9 @@ -import skcuda.fft as cu_fft -from skcuda.fft import cufft as cufftlib + from pycuda import gpuarray from . 
import load_kernel import numpy as np -class FFT_cuda(object): +class FFT_base(object): def __init__(self, array, queue=None, inplace=False, @@ -17,15 +16,29 @@ def __init__(self, array, queue=None, if dims < 2: raise AssertionError('Input array must be at least 2-dimensional') self.arr_shape = (array.shape[-2], array.shape[-1]) - rows = self.arr_shape[0] - columns = self.arr_shape[1] - if rows != columns or rows not in [16, 32, 64, 128, 256, 512, 1024, 2048]: - raise ValueError("CUDA FFT only supports powers of 2 for rows/columns, from 16 to 2048") - self.batches = int(np.product(array.shape[0:dims-2]) if dims > 2 else 1) + self.batches = int(np.prod(array.shape[0:dims-2]) if dims > 2 else 1) self.forward = forward self._load(array, pre_fft, post_fft, symmetric, forward) +class FFT_cuda(FFT_base): + + def __init__(self, array, queue=None, + inplace=False, + pre_fft=None, + post_fft=None, + symmetric=True, + forward=True): + rows, columns = (array.shape[-2], array.shape[-1]) + if rows != columns or rows not in [16, 32, 64, 128, 256, 512, 1024, 2048]: + raise ValueError("CUDA FFT only supports powers of 2 for rows/columns, from 16 to 2048") + super(FFT_cuda, self).__init__(array, queue=queue, + inplace=inplace, + pre_fft=pre_fft, + post_fft=post_fft, + symmetric=symmetric, + forward=forward) + def _load(self, array, pre_fft, post_fft, symmetric, forward): if pre_fft is not None: self.pre_fft = gpuarray.to_gpu(pre_fft) @@ -68,7 +81,23 @@ def _ift(self, input, output): self.fftobj.ifft(input.gpudata, output.gpudata) -class FFT_skcuda(FFT_cuda): +class FFT_skcuda(FFT_base): + + def __init__(self, array, queue=None, + inplace=False, + pre_fft=None, + post_fft=None, + symmetric=True, + forward=True): + import skcuda.fft as cu_fft + self._fft = cu_fft.fft + self._ifft = cu_fft.ifft + super(FFT_cuda, self).__init__(array, queue=queue, + inplace=inplace, + pre_fft=pre_fft, + post_fft=post_fft, + symmetric=symmetric, + forward=forward) @property def queue(self): @@ -77,6 +106,7 @@ def queue(self): @queue.setter def queue(self, queue): self._queue = queue + from skcuda.fft import cufft as cufftlib cufftlib.cufftSetStream(self.plan.handle, queue.handle) def _load(self, array, pre_fft, post_fft, symmetric, forward): @@ -112,6 +142,7 @@ def _load(self, array, pre_fft, post_fft, symmetric, forward): int((self.arr_shape[1] + 31) // 32), int(self.batches) ) + import skcuda.fft as cu_fft self.plan = cu_fft.Plan( self.arr_shape, array.dtype, @@ -121,11 +152,11 @@ def _load(self, array, pre_fft, post_fft, symmetric, forward): ) # with cuFFT, we need to scale ifft if not symmetric and not forward: - self.scale = 1 / np.product(self.arr_shape) + self.scale = 1 / np.prod(self.arr_shape) elif forward and not symmetric: self.scale = 1.0 else: - self.scale = 1 / np.sqrt(np.product(self.arr_shape)) + self.scale = 1 / np.sqrt(np.prod(self.arr_shape)) if pre_fft is not None: self.pre_fft = gpuarray.to_gpu(pre_fft) @@ -166,11 +197,11 @@ def _postfilt(self, y): def _ft(self, x, y): d = self._prefilt(x, y) - cu_fft.fft(d, y, self.plan) + self._fft(d, y, self.plan) self._postfilt(y) def _ift(self, x, y): d = self._prefilt(x, y) - cu_fft.ifft(d, y, self.plan) + self._ifft(d, y, self.plan) self._postfilt(y) diff --git a/ptypy/accelerate/cuda_pycuda/dependencies.yml b/ptypy/accelerate/cuda_pycuda/dependencies.yml index d8b9dfad9..455d60479 100644 --- a/ptypy/accelerate/cuda_pycuda/dependencies.yml +++ b/ptypy/accelerate/cuda_pycuda/dependencies.yml @@ -2,7 +2,7 @@ name: ptypy_pycuda channels: - conda-forge dependencies: - - 
python=3.9 + - python - numpy - scipy - matplotlib diff --git a/ptypy/accelerate/cuda_pycuda/engines/ML_pycuda.py b/ptypy/accelerate/cuda_pycuda/engines/ML_pycuda.py index 7e1320357..339102452 100644 --- a/ptypy/accelerate/cuda_pycuda/engines/ML_pycuda.py +++ b/ptypy/accelerate/cuda_pycuda/engines/ML_pycuda.py @@ -15,14 +15,13 @@ from pycuda import gpuarray import pycuda.driver as cuda import pycuda.cumath -from pycuda.tools import DeviceMemoryPool from ptypy.engines import register from ptypy.accelerate.base.engines.ML_serial import ML_serial, BaseModelSerial from ptypy import utils as u from ptypy.utils.verbose import logger, log from ptypy.utils import parallel -from .. import get_context +from .. import get_context, get_dev_pool from ..kernels import PropagationKernel, RealSupportKernel, FourierSupportKernel from ..kernels import GradientDescentKernel, AuxiliaryWaveKernel, PoUpdateKernel, PositionCorrectionKernel from ..array_utils import ArrayUtilsKernel, DerivativesKernel, GaussianSmoothingKernel, TransposeKernel @@ -79,13 +78,13 @@ def engine_initialize(self): """ Prepare for ML reconstruction. """ - self.context, self.queue = get_context(new_context=True, new_queue=True) + self.context, self.queue = get_context(new_queue=True) if self.p.use_cuda_device_memory_pool: - self._dmp = DeviceMemoryPool() - self.allocate = self._dmp.allocate + self._dev_pool = get_dev_pool() + self.allocate = self._dev_pool.allocate else: - self._dmp = None + self._dev_pool = None self.allocate = cuda.mem_alloc self.qu_htod = cuda.Stream() @@ -163,13 +162,12 @@ def _setup_kernels(self): fit = int(mem - 200 * 1024 * 1024) // blk # leave 200MB room for safety if not fit: log(1,"Cannot fit memory into device, if possible reduce frames per block. Exiting...") - self.context.pop() - self.context.detach() raise SystemExit("ptypy has been exited.") # TODO grow blocks dynamically nma = min(fit, MAX_BLOCKS) - log(4, 'Free memory on device: %.2f GB' % (float(mem)/1e9)) + log(4, 'Free memory available: {:.2f} GB'.format(float(mem)/(1024**3))) + log(4, 'Memory to be allocated per block {:.2f} GB'.format(float(blk)/(1024**3))) log(4, 'PyCUDA max blocks fitting on GPU: ma_arrays={}'.format(nma)) # reset memory or create new self.w_data = GpuDataManager(ma_mem, 0, nma, False) @@ -301,7 +299,7 @@ def engine_iterate(self, num=1): return err def position_update(self): - """ + """ Position refinement """ if not self.do_position_refinement or (not self.curiter): @@ -342,7 +340,7 @@ def position_update(self): max_oby = ob.shape[-2] - aux.shape[-2] - 1 max_obx = ob.shape[-1] - aux.shape[-1] - 1 - # We need to re-calculate the current error + # We need to re-calculate the current error PCK.build_aux(aux, addr, ob, pr) PROP.fw(aux, aux) PCK.queue.wait_for_event(ev) @@ -351,9 +349,9 @@ def position_update(self): cuda.memcpy_dtod(dest=error_state.ptr, src=err_phot.ptr, size=err_phot.nbytes) - + PCK.mangler.setup_shifts(self.curiter, nframes=addr.shape[0]) - + log(4, 'Position refinement trial: iteration %s' % (self.curiter)) for i in range(PCK.mangler.nshifts): PCK.mangler.get_address(i, addr, mangled_addr, max_oby, max_obx) @@ -422,8 +420,6 @@ def engine_finalize(self): #self.queue.synchronize() - self.context.pop() - self.context.detach() super().engine_finalize() class GaussianModel(BaseModelSerial): @@ -683,11 +679,11 @@ def __init__(self, amplitude, axes=[-2, -1], queue=None, allocator=None): self.DELK_f = DerivativesKernel(np.float32, queue=queue) if allocator is None: - self._dmp = DeviceMemoryPool() - 
self.allocator=self._dmp.allocate + self._dev_pool = get_dev_pool() + self.allocator=self._dev_pool.allocate else: self.allocator = allocator - self._dmp= None + self._dev_pool= None empty = lambda x: gpuarray.empty(x.shape, x.dtype, allocator=self.allocator) diff --git a/ptypy/accelerate/cuda_pycuda/engines/projectional_pycuda.py b/ptypy/accelerate/cuda_pycuda/engines/projectional_pycuda.py index c02445265..a10fceff2 100644 --- a/ptypy/accelerate/cuda_pycuda/engines/projectional_pycuda.py +++ b/ptypy/accelerate/cuda_pycuda/engines/projectional_pycuda.py @@ -69,9 +69,9 @@ def engine_initialize(self): Prepare for reconstruction. """ # Context, Multi GPU communicator and Stream (needs to be in this order) - self.context, self.queue = get_context(new_context=True, new_queue=False) + self.context, self.queue = get_context(new_queue=False) self.multigpu = get_multi_gpu_communicator() - self.context, self.queue = get_context(new_context=False, new_queue=True) + self.context, self.queue = get_context(new_queue=True) # Gaussian Smoothing Kernel self.GSK = GaussianSmoothingKernel(queue=self.queue) @@ -120,6 +120,10 @@ def _setup_kernels(self): # create buffer arrays ash = (fpc * nmodes,) + tuple(geo.shape) aux = np.zeros(ash, dtype=np.complex64) + mem = cuda.mem_get_info()[0] + if not int(mem) // aux.nbytes: + log(1,"Cannot fit memory into device, if possible reduce frames per block or nr. of modes. Exiting...") + raise SystemExit("ptypy has been exited.") kern.aux = gpuarray.to_gpu(aux) # setup kernels, one for each SCAN. @@ -555,9 +559,6 @@ def engine_finalize(self): for name, s in self.pr.S.items(): s.data = np.copy(s.data) - self.context.pop() - self.context.detach() - # we don't need the "benchmarking" in DM_serial super().engine_finalize(benchmark=False) diff --git a/ptypy/accelerate/cuda_pycuda/engines/projectional_pycuda_stream.py b/ptypy/accelerate/cuda_pycuda/engines/projectional_pycuda_stream.py index e9204f80b..6c54a8074 100644 --- a/ptypy/accelerate/cuda_pycuda/engines/projectional_pycuda_stream.py +++ b/ptypy/accelerate/cuda_pycuda/engines/projectional_pycuda_stream.py @@ -64,14 +64,13 @@ def _setup_kernels(self): fit = int(mem - 200 * 1024 * 1024) // blk # leave 200MB room for safety if not fit: log(1,"Cannot fit memory into device, if possible reduce frames per block. Exiting...") - self.context.pop() - self.context.detach() raise SystemExit("ptypy has been exited.") # TODO grow blocks dynamically nex = min(fit * EX_MA_BLOCKS_RATIO, MAX_BLOCKS) nma = min(fit, MAX_BLOCKS) - log(4, 'Free memory on device: %.2f GB' % (float(mem)/1e9)) + log(4, 'Free memory available: {:.2f} GB'.format(float(mem)/(1024**3))) + log(4, 'Memory to be allocated per block: {:.2f} GB'.format(float(blk)/(1024**3))) log(4, 'PyCUDA max blocks fitting on GPU: exit arrays={}, ma_arrays={}'.format(nex, nma)) # reset memory or create new self.ex_data = GpuDataManager(ex_mem, 0, nex, True) @@ -134,11 +133,11 @@ def engine_prepare(self): prep.mag = cuda.pagelocked_empty(mag.shape, mag.dtype, order="C", mem_flags=4) prep.mag[:] = mag - log(4, 'Free memory on device: %.2f GB' % (float(cuda.mem_get_info()[0])/1e9)) + log(4, 'Free memory on device: {:.2f} GB'.format(float(cuda.mem_get_info()[0])/(1024**3))) self.ex_data.add_data_block() self.ma_data.add_data_block() self.mag_data.add_data_block() - + def engine_iterate(self, num=1): """ Compute one iteration. 
@@ -148,7 +147,7 @@ def engine_iterate(self, num=1): atomics_probe = self.p.probe_update_cuda_atomics atomics_object = self.p.object_update_cuda_atomics use_tiles = (not atomics_object) or (not atomics_probe) - + for it in range(num): error = {} @@ -311,7 +310,7 @@ def engine_iterate(self, num=1): # Update positions if do_update_pos: """ - Iterates through all positions and refines them by a given algorithm. + Iterates through all positions and refines them by a given algorithm. """ log(4, "----------- START POS REF -------------") for dID in self.di.S.keys(): @@ -347,7 +346,7 @@ def engine_iterate(self, num=1): # wait for data to arrive self.queue.wait_for_event(ev_mag) - # We need to re-calculate the current error + # We need to re-calculate the current error if self.p.position_refinement.metric == "fourier": PCK.fourier_error(aux, addr, mag, ma, ma_sum) PCK.error_reduce(addr, err_fourier) diff --git a/ptypy/accelerate/cuda_pycuda/engines/stochastic.py b/ptypy/accelerate/cuda_pycuda/engines/stochastic.py index a3363e161..d45a67218 100644 --- a/ptypy/accelerate/cuda_pycuda/engines/stochastic.py +++ b/ptypy/accelerate/cuda_pycuda/engines/stochastic.py @@ -68,7 +68,7 @@ def engine_initialize(self): """ Prepare for reconstruction. """ - self.context, self.queue = get_context(new_context=True, new_queue=True) + self.context, self.queue = get_context(new_queue=True) # initialise kernels for centring probe if required if self.p.probe_center_tol is not None: @@ -160,15 +160,15 @@ def _setup_kernels(self): fit = int(mem - 200 * 1024 * 1024) // blk # leave 200MB room for safety if not fit: log(1,"Cannot fit memory into device, if possible reduce frames per block. Exiting...") - self.context.pop() - self.context.detach() raise SystemExit("ptypy has been exited.") # TODO grow blocks dynamically nex = min(fit * EX_MA_BLOCKS_RATIO, MAX_BLOCKS) nma = min(fit, MAX_BLOCKS) - log(3, 'PyCUDA max blocks fitting on GPU: exit arrays={}, ma_arrays={}'.format(nex, nma)) + log(4, 'Free memory available: {:.2f} GB'.format(float(mem)/(1024**3))) + log(4, 'Memory to be allocated per block: {:.2f} GB'.format(float(blk)/(1024**3))) + log(4, 'PyCUDA max blocks fitting on GPU: exit arrays={}, ma_arrays={}'.format(nex, nma)) # reset memory or create new self.ex_data = GpuDataManager(ex_mem, 0, nex, True) self.ma_data = GpuDataManager(ma_mem, 0, nma, False) @@ -484,7 +484,6 @@ def engine_finalize(self): for name, s in self.ob.S.items(): s.data = np.copy(s.data) - self.context.detach() super().engine_finalize() diff --git a/ptypy/accelerate/cuda_pycuda/kernels.py b/ptypy/accelerate/cuda_pycuda/kernels.py index fcb1448bb..8f7378715 100644 --- a/ptypy/accelerate/cuda_pycuda/kernels.py +++ b/ptypy/accelerate/cuda_pycuda/kernels.py @@ -21,6 +21,7 @@ def choose_fft(fft_type, arr_shape): try: from ptypy.accelerate.cuda_pycuda.cufft import FFT_cuda as FFT except: + import filtered_cufft logger.warning('Unable to import cufft version - using Reikna instead') from ptypy.accelerate.cuda_pycuda.fft import FFT elif fft_type=='skcuda': @@ -89,7 +90,7 @@ def _bw(x,y): self._CPK.crop_pad_2d_simple(y, self._tmp) else: self._fft2.ift(x,y) - + self.fw = _fw self.bw = _bw @@ -110,11 +111,11 @@ def _bw(x,y): def _fw(x,y): self._fft1.ft(x,y) self._fft3.ift(y,y) - + def _bw(x,y): self._fft2.ft(x,y) self._fft3.ift(y,y) - + self.fw = _fw self.bw = _bw else: @@ -303,7 +304,7 @@ def fmag_all_update(self, f, addr, fmag, fmask, err_fmag, pbound=0.0): block=(32, 32, 1), grid=(int(fmag.shape[0]*self.nmodes), 1, 1), stream=self.queue) - + def 
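The recurring switch to get_context(new_queue=...) reflects a CUDA context owned once by the accelerate module and merely reused by each engine, which is why the pop()/detach() calls disappear from engine_finalize. A hypothetical minimal accessor illustrating that convention (ptypy's actual get_context may differ in detail):

    import pycuda.driver as cuda

    _context = None

    def get_context(new_queue=False):
        # Create the shared context on first use; optionally hand out a fresh stream.
        global _context
        if _context is None:
            cuda.init()
            _context = cuda.Device(0).make_context()
        queue = cuda.Stream() if new_queue else None
        return _context, queue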
fmag_update_nopbound(self, f, addr, fmag, fmask): fdev = self.gpu.fdev bx = 64 @@ -322,8 +323,8 @@ def fmag_update_nopbound(self, f, addr, fmag, fmask): np.int32(self.fshape[1]), np.int32(self.fshape[2]), block=(bx, by, 1), - grid=(1, - int((self.fshape[2] + by - 1) // by), + grid=(1, + int((self.fshape[2] + by - 1) // by), int(fmag.shape[0]*self.nmodes)), stream=self.queue) @@ -499,9 +500,9 @@ def make_aux2(self, b_aux, addr, ob, pr, ex, c_po=1.0, c_e=0.0): np.float32(c_e) if ex.dtype == np.complex64 else np.float64(c_e), block=(bx, by, 1), grid=( - 1, - int((ex.shape[1] + by - 1)//by), - int(maxz * nmodes)), + 1, + int((ex.shape[1] + by - 1)//by), + int(maxz * nmodes)), stream=self.queue) @@ -544,8 +545,8 @@ def build_exit_alpha_tau(self, b_aux, addr, ob, pr, ex, alpha=1, tau=1): obr, obc, addr, np.float32(alpha), np.float32(tau), - block=(bx, by, 1), - grid=(1, int((ex.shape[1] + by - 1) // by), int(maxz * nmodes)), + block=(bx, by, 1), + grid=(1, int((ex.shape[1] + by - 1) // by), int(maxz * nmodes)), stream=self.queue) """ def build_aux_no_ex(self, b_aux, addr, ob, pr, fac=1.0, add=False): @@ -590,8 +591,8 @@ def build_aux2_no_ex(self, b_aux, addr, ob, pr, fac=1.0, add=False): block=(bx, by, 1), grid=(1, int((b_aux.shape[-2] + by - 1)//by), int(maxz * nmodes)), stream=self.queue) - - + + def _cache_object_shape(self, ob): oid = id(ob) @@ -611,7 +612,7 @@ def __init__(self, aux, nmodes=1, queue=None, accumulate_type = 'double', math_t self.math_type = math_type if (accumulate_type not in ['double', 'float']) or (math_type not in ['double', 'float']): raise ValueError("accumulate and math types must be double for float") - + self.gpu = Adict() self.gpu.LLden = None self.gpu.LLerr = None @@ -632,9 +633,9 @@ def __init__(self, aux, nmodes=1, queue=None, accumulate_type = 'double', math_t 'BDIM_Y': 32 }) self.fill_b_cuda, self.fill_b_reduce_cuda = load_kernel( - ('fill_b', 'fill_b_reduce'), + ('fill_b', 'fill_b_reduce'), { - **subs, + **subs, 'BDIM_X': 1024, 'OUT_TYPE': 'float' if self.ftype == np.float32 else 'double' }, @@ -814,7 +815,7 @@ def main(self, b_aux, addr, w, I): class PoUpdateKernel(ab.PoUpdateKernel): - def __init__(self, queue_thread=None, + def __init__(self, queue_thread=None, math_type='float', accumulator_type='float'): super(PoUpdateKernel, self).__init__() # and now initialise the cuda @@ -875,7 +876,24 @@ def __init__(self, queue_thread=None, 'MATH_TYPE': self.math_type, 'ACC_TYPE': self.accumulator_type }) - + self.ob_update_wasp_cuda = load_kernel("ob_update_wasp", { + 'IN_TYPE': 'float', + 'OUT_TYPE': 'float', + 'MATH_TYPE': self.math_type, + 'ACC_TYPE': self.accumulator_type + }) + self.pr_update_wasp_cuda = load_kernel("pr_update_wasp", { + 'IN_TYPE': 'float', + 'OUT_TYPE': 'float', + 'MATH_TYPE': self.math_type, + 'ACC_TYPE': self.accumulator_type + }) + self.avg_wasp_cuda = load_kernel("avg_wasp", { + 'IN_TYPE': 'float', + 'OUT_TYPE': 'float', + 'MATH_TYPE': self.math_type, + 'ACC_TYPE': self.accumulator_type + }) def ob_update(self, addr, ob, obn, pr, ex, atomics=True): obsh = [np.int32(ax) for ax in ob.shape] @@ -995,7 +1013,7 @@ def ob_update_ML(self, addr, ob, pr, ex, fac=2.0, atomics=True): np.int32(ex.shape[0]), np.int32(ex.shape[1]), np.int32(ex.shape[2]), - ob, pr, ex, addr, + ob, pr, ex, addr, np.float32(fac) if ex.dtype == np.complex64 else np.float64(fac), block=(16, 16, 1), grid=grid, stream=self.queue) @@ -1031,7 +1049,7 @@ def pr_update_ML(self, addr, pr, ob, ex, fac=2.0, atomics=False): grid = (grid[0], grid[1], int(1)) 
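The kernel launches above all size their grids with the same ceiling-division idiom, so a dimension of length n is covered by blocks of size b even when b does not divide n evenly; in isolation:

    def grid_dim(n, b):
        # Number of blocks of size b needed to cover n elements.
        return (n + b - 1) // b

    assert grid_dim(256, 64) == 4
    assert grid_dim(257, 64) == 5    # the trailing partial block is still launched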
self.pr_update2_ML_cuda(prsh[-1], obsh[-2], obsh[-1], prsh[0], obsh[0], num_pods, - pr, ob, ex, addr, + pr, ob, ex, addr, np.float32(fac) if ex.dtype == np.complex64 else np.float64(fac), block=(16, 16, 1), grid=grid, stream=self.queue) @@ -1092,7 +1110,7 @@ def ob_norm_local(self, addr, ob, obn): obnsh = [np.int32(ax) for ax in obn.shape] bx = 64 by = 1 - self.ob_norm_local_cuda(obn, + self.ob_norm_local_cuda(obn, obnsh[0], obnsh[1], obnsh[2], ob, obsh[0], obsh[1], obsh[2], @@ -1101,12 +1119,12 @@ def ob_norm_local(self, addr, ob, obn): grid=(1, int((obnsh[1] + by - 1)//by), int(obnsh[0])), stream=self.queue) - def pr_norm_local(self, addr, pr, prn): + def pr_norm_local(self, addr, pr, prn): prsh = [np.int32(ax) for ax in pr.shape] prnsh = [np.int32(ax) for ax in prn.shape] bx = 64 by = 1 - self.pr_norm_local_cuda(prn, + self.pr_norm_local_cuda(prn, prnsh[0], prnsh[1], prnsh[2], pr, prsh[0], prsh[1], prsh[2], @@ -1115,6 +1133,80 @@ def pr_norm_local(self, addr, pr, prn): grid=(1, int((prnsh[1] + by - 1)//by), int(prnsh[0])), stream=self.queue) + def ob_update_wasp(self, addr, ob, pr, ex, aux, ob_sum_nmr, ob_sum_dnm, + alpha=1): + pr_abs2 = (pr * pr.conj()).real + # this looks absolutely ugly, may need a separate kernel? + # PyCUDA doesn't provide sum across particular axis or a mean/avg method + pr_sz = pr_abs2.shape[1] * pr_abs2.shape[2] + pr_abs2_mean = gpuarray.empty(pr_abs2.shape[0], dtype=pr_abs2.dtype) + for k, pr_abs2_i in enumerate(pr_abs2): + pr_abs2_mean[k] = gpuarray.sum(pr_abs2_i, stream=self.queue) / pr_sz + + obsh = [np.int32(ax) for ax in ob.shape] + prsh = [np.int32(ax) for ax in pr.shape] + exsh = [np.int32(ax) for ax in ex.shape] + + # atomics version only + if addr.shape[3] != 3 or addr.shape[2] != 5: + raise ValueError('Address not in required shape for tiled ob_update') + + num_pods = np.int32(addr.shape[0] * addr.shape[1]) + bx = 64 + by = 1 + self.ob_update_wasp_cuda(ex, aux, + exsh[0], exsh[1], exsh[2], + pr, + pr_abs2, + prsh[0], prsh[1], prsh[2], + ob, + ob_sum_nmr, + ob_sum_dnm, + obsh[0], obsh[1], obsh[2], + addr, + pr_abs2_mean, + np.float32(alpha), + block=(bx, by, 1), + grid=(1, int((exsh[1] + by - 1)//by), int(num_pods)), + stream=self.queue) + + def pr_update_wasp(self, addr, pr, ob, ex, aux, pr_sum_nmr, pr_sum_dnm, + beta=1): + obsh = [np.int32(ax) for ax in ob.shape] + prsh = [np.int32(ax) for ax in pr.shape] + exsh = [np.int32(ax) for ax in ex.shape] + + # atomics version only + if addr.shape[3] != 3 or addr.shape[2] != 5: + raise ValueError('Address not in required shape for tiled ob_update') + + num_pods = np.int32(addr.shape[0] * addr.shape[1]) + bx = 64 + by = 1 + self.pr_update_wasp_cuda(ex, aux, + exsh[0], exsh[1], exsh[2], + pr, + pr_sum_nmr, + pr_sum_dnm, + prsh[0], prsh[1], prsh[2], + ob, + obsh[0], obsh[1], obsh[2], + addr, + np.float32(beta), + block=(bx, by, 1), + grid=(1, int((exsh[1] + by - 1)//by), int(num_pods)), + stream=self.queue) + + def avg_wasp(self, arr, nmr, dnm): + arrsh = [np.int32(ax) for ax in arr.shape] + bx = 64 + by = 1 + self.avg_wasp_cuda(arr, nmr, dnm, + arrsh[0], arrsh[1], arrsh[2], + block=(bx, by, 1), + grid=(1, int((arrsh[1] + by - 1)//by), int(arrsh[0])), + stream=self.queue) + class PositionCorrectionKernel(ab.PositionCorrectionKernel): from ptypy.accelerate.cuda_pycuda import address_manglers @@ -1133,7 +1225,7 @@ def __init__(self, *args, queue_thread=None, math_type='float', accumulate_type= raise ValueError('Only float or double math is supported') if accumulate_type not in ['float', 'double']: raise 
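The ob_update_wasp wrapper above averages |pr|**2 over each probe mode with an explicit Python loop over gpuarray.sum, since PyCUDA's gpuarray offers no axis-wise reduction. The quantity itself is just a per-mode mean, shown here in NumPy for clarity (shapes are illustrative):

    import numpy as np

    pr = (np.random.rand(2, 8, 8) + 1j * np.random.rand(2, 8, 8)).astype(np.complex64)
    pr_abs2 = (pr * pr.conj()).real
    pr_abs2_mean = pr_abs2.mean(axis=(-2, -1))   # one scalar per probe mode
    assert pr_abs2_mean.shape == (2,)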
ValueError('Only float or double math is supported') - + # add kernels self.math_type = math_type self.accumulate_type = accumulate_type diff --git a/ptypy/accelerate/cuda_pycuda/multi_gpu.py b/ptypy/accelerate/cuda_pycuda/multi_gpu.py index 33113c273..73138c3ee 100644 --- a/ptypy/accelerate/cuda_pycuda/multi_gpu.py +++ b/ptypy/accelerate/cuda_pycuda/multi_gpu.py @@ -25,6 +25,8 @@ 4) For NCCL peer-to-peer transfers, the EXCLUSIVE compute mode cannot be used. It should be in DEFAULT mode. +5) NCCL support has been dropped from PyCUDA module, but can be used with CuPy module instead + """ from pkg_resources import parse_version @@ -35,12 +37,6 @@ from ptypy.utils.verbose import logger, log import os -try: - from cupy.cuda import nccl - import cupy as cp -except ImportError: - nccl = None - try: import mpi4py except ImportError: @@ -48,13 +44,6 @@ # properties to check which versions are available -# use NCCL is it is available, and the user didn't override the -# default selection with environment variables -have_nccl = (nccl is not None) and \ - (not 'PTYPY_USE_CUDAMPI' in os.environ) and \ - (not 'PTYPY_USE_MPI' in os.environ) and \ - ('PTYPY_USE_NCCL' in os.environ) - # At the moment, we require: # the OpenMPI env var OMPI_MCA_opal_cuda_support to be set to true, # mpi4py >= 3.1.0 @@ -109,64 +98,9 @@ def allReduceSum(self, arr): comm = parallel.comm comm.Allreduce(parallel.MPI.IN_PLACE, arr) - -class MultiGpuCommunicatorNccl(MultiGpuCommunicatorBase): - - def __init__(self): - super().__init__() - - # Check if GPUs are in default mode - if cuda.Context.get_device().get_attributes()[cuda.device_attribute.COMPUTE_MODE] != cuda.compute_mode.DEFAULT: - raise RuntimeError("Compute mode must be default in order to use NCCL") - - # get a unique identifier for the NCCL communicator and - # broadcast it to all MPI processes (assuming one device per process) - if self.rank == 0: - self.id = nccl.get_unique_id() - else: - self.id = None - - self.id = parallel.bcast(self.id) - - self.com = nccl.NcclCommunicator(self.ndev, self.id, self.rank) - - def allReduceSum(self, arr): - """Call MPI.all_reduce in-place, with array on GPU""" - - buf = int(arr.gpudata) - count, datatype = self.__get_NCCL_count_dtype(arr) - - # no stream support here for now - it fails in NCCL when - # pycuda.Stream.handle is used for some unexplained reason - stream = cp.cuda.Stream.null.ptr - - self.com.allReduce(buf, buf, count, datatype, nccl.NCCL_SUM, stream) - - def __get_NCCL_count_dtype(self, arr): - if arr.dtype == np.complex64: - return arr.size*2, nccl.NCCL_FLOAT32 - elif arr.dtype == np.complex128: - return arr.size*2, nccl.NCCL_FLOAT64 - elif arr.dtype == np.float32: - return arr.size, nccl.NCCL_FLOAT32 - elif arr.dtype == np.float64: - return arr.size, nccl.NCCL_FLOAT64 - else: - raise ValueError("This dtype is not supported by NCCL.") - # pick the appropriate communicator depending on installed packages -def get_multi_gpu_communicator(use_nccl=True, use_cuda_mpi=True): - if have_nccl and use_nccl: - try: - comm = MultiGpuCommunicatorNccl() - log(4, "Using NCCL communicator") - return comm - except RuntimeError: - pass - except AttributeError: - # see issue #323 - pass +def get_multi_gpu_communicator(use_cuda_mpi=True): if have_cuda_mpi and use_cuda_mpi: try: comm = MultiGpuCommunicatorCudaMpi() diff --git a/ptypy/accelerate/ocl_pyopencl/npy_kernels.py b/ptypy/accelerate/ocl_pyopencl/npy_kernels.py index 3c87978ae..8f09e94d0 100644 --- a/ptypy/accelerate/ocl_pyopencl/npy_kernels.py +++ 
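With the NCCL path removed from the PyCUDA backend, GPU arrays are reduced either through CUDA-aware MPI or by staging through host memory. A hedged sketch of what the CUDA-aware route relies on, assuming an OpenMPI build with CUDA support (OMPI_MCA_opal_cuda_support=true), mpi4py >= 3.1.0 and a pycuda recent enough to expose __cuda_array_interface__:

    import numpy as np
    import pycuda.autoinit                       # creates a context on the default GPU
    from pycuda import gpuarray
    from mpi4py import MPI

    comm = MPI.COMM_WORLD
    arr = gpuarray.to_gpu(np.full(4, comm.rank, dtype=np.float32))
    comm.Allreduce(MPI.IN_PLACE, arr)            # in-place sum across ranks, no host copy
    print(comm.rank, arr.get())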
b/ptypy/accelerate/ocl_pyopencl/npy_kernels.py @@ -102,7 +102,7 @@ def error_reduce(self, g_err_sum, offset=0): ## Actual math ## # Reduceses the Fourier error along the last 2 dimensions.fd - error_sum[:] = ferr.astype(np.double).sum(-1).sum(-1).astype(np.float) + error_sum[:] = ferr.astype(np.double).sum(-1).sum(-1).astype(float) def fmag_all_update(self, pbound, g_mag, g_mask, g_err_sum, offset=0): diff --git a/ptypy/accelerate/ocl_pyopencl/npy_kernels_for_block.py b/ptypy/accelerate/ocl_pyopencl/npy_kernels_for_block.py index 85c01d4be..b8a284492 100644 --- a/ptypy/accelerate/ocl_pyopencl/npy_kernels_for_block.py +++ b/ptypy/accelerate/ocl_pyopencl/npy_kernels_for_block.py @@ -87,7 +87,7 @@ def error_reduce(self, addr, err_sum): ## Actual math ## # Reduceses the Fourier error along the last 2 dimensions.fd - #err_sum[:] = ferr.astype(np.double).sum(-1).sum(-1).astype(np.float) + #err_sum[:] = ferr.astype(np.double).sum(-1).sum(-1).astype(float) err_sum[:] = ferr.sum(-1).sum(-1) return diff --git a/ptypy/accelerate/ocl_pyopencl/ocl_fft.py b/ptypy/accelerate/ocl_pyopencl/ocl_fft.py index 26e08c298..4cfa33905 100644 --- a/ptypy/accelerate/ocl_pyopencl/ocl_fft.py +++ b/ptypy/accelerate/ocl_pyopencl/ocl_fft.py @@ -174,7 +174,7 @@ def __init__(self, queue, array, # attach scaling from reikna.transformations import mul_param - sc = mul_param(array, np.float) + sc = mul_param(array, float) ftreikna.parameter.output.connect(sc, sc.input, out=sc.output, scale=sc.param) iscale = np.sqrt(np.prod(array.shape[-2:])) if symmetric else 1.0 scale = 1.0 / iscale diff --git a/ptypy/accelerate/ocl_pyopencl/ocl_kernels_self_contained_for_future_reference.py b/ptypy/accelerate/ocl_pyopencl/ocl_kernels_self_contained_for_future_reference.py index 9d4b5d88a..20793ea94 100644 --- a/ptypy/accelerate/ocl_pyopencl/ocl_kernels_self_contained_for_future_reference.py +++ b/ptypy/accelerate/ocl_pyopencl/ocl_kernels_self_contained_for_future_reference.py @@ -253,7 +253,7 @@ def ocl_fourier_error(self, f, fmag, fdev, ferr, fmask, mask_sum): self.queue.finish() def npy_error_reduce(self, ferr, err_fmag): - err_fmag[:] = ferr.astype(np.double).sum(-1).sum(-1).astype(np.float) + err_fmag[:] = ferr.astype(np.double).sum(-1).sum(-1).astype(float) def ocl_error_reduce(self, ferr, err_fmag): shape = (self.fshape[0], 64), diff --git a/ptypy/core/classes.py b/ptypy/core/classes.py index c55ee8bbe..622077a48 100644 --- a/ptypy/core/classes.py +++ b/ptypy/core/classes.py @@ -81,7 +81,7 @@ # Hard-coded limit in array size # TODO: make this dynamic from available memory. -MEGAPIXEL_LIMIT = 50 +MEGAPIXEL_LIMIT = 100 class Base(object): @@ -709,8 +709,8 @@ def reformat(self, newID=None, update=True): megapixels = np.array(new_shape).astype(float).prod() / 1e6 if megapixels > MEGAPIXEL_LIMIT: - raise RuntimeError('Arrays larger than %dM not supported. You ' - 'requested %.2fM pixels.' % (MEGAPIXEL_LIMIT, megapixels)) + logger.warning('Arrays larger than %dM not recommended. You ' + 'requested %.2fM pixels.' % (MEGAPIXEL_LIMIT, megapixels)) # Apply Nd misfit if self.data is not None: diff --git a/ptypy/core/data.py b/ptypy/core/data.py index 636857bca..3b55495e0 100644 --- a/ptypy/core/data.py +++ b/ptypy/core/data.py @@ -114,6 +114,7 @@ class PtyScan(object): default = data help = Determines what will be loaded in parallel doc = Choose from ``None``, ``'data'``, ``'common'``, ``'all'`` + choices = ['data', 'common', 'all'] [rebin] type = int @@ -122,7 +123,7 @@ class PtyScan(object): doc = Rebinning factor for the raw data frames. 
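The astype(np.float) and mul_param(array, np.float) replacements in the OpenCL-backend files above follow NumPy's removal of the deprecated scalar aliases (np.float, np.int, ...) in NumPy 1.24; the builtin float is the drop-in substitute and still resolves to float64:

    import numpy as np

    ferr = np.random.rand(4, 3, 3)
    err_sum = ferr.astype(np.double).sum(-1).sum(-1).astype(float)   # formerly .astype(np.float)
    assert err_sum.dtype == np.float64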
``'None'`` or ``1`` both mean *no binning* userlevel = 1 lowlim = 1 - uplim = 8 + uplim = 32 [orientation] type = int, tuple, list @@ -139,6 +140,7 @@ class PtyScan(object): Alternatively, a 3-tuple of booleans may be provided ``(do_transpose, do_flipud, do_fliplr)`` + choices = [0, 1, 2, 3, 4, 5, 6, 7] userlevel = 1 [min_frames] @@ -797,7 +799,7 @@ def get_data_chunk(self, chunksize, start=None): rebin = self.rebin if rebin <= 1: pass - elif (rebin in range(2, 6) + elif (rebin in range(2, 32+1) and (((sh / float(rebin)) % 1) == 0.0).all()): mask = w > 0 d = u.rebin_2d(d, rebin) diff --git a/ptypy/core/geometry.py b/ptypy/core/geometry.py index d8a10f7b5..ab7852216 100644 --- a/ptypy/core/geometry.py +++ b/ptypy/core/geometry.py @@ -7,7 +7,7 @@ :license: see LICENSE for details. """ import numpy as np -from scipy import fftpack +import scipy.fft from .. import utils as u from ..utils.verbose import logger @@ -55,7 +55,7 @@ class Geo(Base): If set to True, changes to properties like :py:meth:`energy`, :py:meth:`lam`, :py:meth:`shape` or :py:meth:`psize` will cause a call to :py:meth:`update`. - + Default geometry parameters. See also :py:data:`.scan.geometry` @@ -104,7 +104,16 @@ class Geo(Base): type = str default = farfield help = Propagation type - doc = Either "farfield" or "nearfield" + doc = Choose between "farfield" or "nearfield" + choices = 'farfield', 'nearfield' + userlevel = 1 + + [ffttype] + type = str + default = scipy + help = FFT library + doc = Choose which library to use for FFTs + choices = 'numpy', 'scipy', 'fftw' userlevel = 1 [shape] @@ -429,9 +438,9 @@ def get_propagator(geo_dct, **kwargs): Helper function to determine propagator to be attached to Geometry class. """ if geo_dct['propagation'] == 'farfield': - return BasicFarfieldPropagator(geo_dct, **kwargs) + return BasicFarfieldPropagator(geo_dct, ffttype=geo_dct["ffttype"], **kwargs) else: - return BasicNearfieldPropagator(geo_dct, **kwargs) + return BasicNearfieldPropagator(geo_dct, ffttype=geo_dct["ffttype"], **kwargs) class FFTchooser(object): @@ -439,7 +448,7 @@ class FFTchooser(object): Chooses the desired FFT algo, and assigns scaling. If pyFFTW is not available, falls back to scipy. """ - def __init__(self, ffttype='std'): + def __init__(self, ffttype='scipy'): """ Parameters ---------- @@ -462,12 +471,12 @@ def _FFTW_fft(self): self.ifft = lambda x: fftw_np.ifft2(x, planner_effort=pe) def _scipy_fft(self): - self.fft = lambda x: fftpack.fft2(x).astype(x.dtype) - self.ifft = lambda x: fftpack.ifft2(x).astype(x.dtype) + self.fft = lambda x: scipy.fft.fft2(x).astype(x.dtype) + self.ifft = lambda x: scipy.fft.ifft2(x).astype(x.dtype) def _numpy_fft(self): - self.fft = lambda x: np.fft.fft2(x).astype(x.dtype) - self.ifft = lambda x: np.fft.ifft2(x).astype(x.dtype) + self.fft = lambda x: np.ascontiguousarray(np.fft.fft2(x).astype(x.dtype)) + self.ifft = lambda x: np.ascontiguousarray(np.fft.ifft2(x).astype(x.dtype)) def assign_scaling(self, shape): if isinstance(self.ffttype, tuple) and len(self.ffttype) > 2: @@ -506,7 +515,7 @@ class BasicFarfieldPropagator(object): coordinates are rolled periodically, just like in the conventional fft case. """ - def __init__(self, geo_pars=None, ffttype='numpy', **kwargs): + def __init__(self, geo_pars=None, ffttype='scipy', **kwargs): """ Parameters ---------- @@ -685,7 +694,7 @@ class BasicNearfieldPropagator(object): Basic two step (i.e. two ffts) Nearfield Propagator. 
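The widened rebin guard in get_data_chunk above now accepts any factor from 2 to 32 that divides the frame shape exactly, matching the raised uplim of 32 on the rebin parameter; the test in isolation:

    import numpy as np

    def rebin_allowed(sh, rebin):
        sh = np.asarray(sh, dtype=float)
        return rebin in range(2, 32 + 1) and (((sh / float(rebin)) % 1) == 0.0).all()

    assert rebin_allowed((512, 512), 8)
    assert not rebin_allowed((512, 512), 3)      # 512/3 is not integral
    assert not rebin_allowed((512, 512), 64)     # above the new upper limit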
""" - def __init__(self, geo_pars=None, ffttype='numpy', **kwargs): + def __init__(self, geo_pars=None, ffttype='scipy', **kwargs): """ Parameters ---------- diff --git a/ptypy/core/illumination.py b/ptypy/core/illumination.py index af5d4c06b..ddf1f98f7 100644 --- a/ptypy/core/illumination.py +++ b/ptypy/core/illumination.py @@ -130,6 +130,7 @@ - *