Skip to content

Commit

Permalink
Changes to Cupy backend (#483)
Browse files Browse the repository at this point in the history
* Improve FFT chooser for GPUs

* improve logging of available GPU memory

* formatting and more info about GPU memory

* separate filtered FFT from Cupy FFT

* reorganised engine templates

* fixes need to get EPIE_cupy engine working

* load both pycuda and cupy

* separate filtered FFT from Skcuda fft (which is broken atm)

* import was missing

* changed named of templates folder
  • Loading branch information
daurer authored Feb 2, 2024
1 parent db8b26c commit a7b635d
Show file tree
Hide file tree
Showing 51 changed files with 858 additions and 126 deletions.
2 changes: 1 addition & 1 deletion ptypy/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ def load_gpu_engines(arch='cuda'):
from .accelerate.cuda_pycuda.engines import projectional_pycuda_stream
from .accelerate.cuda_pycuda.engines import stochastic
from .accelerate.cuda_pycuda.engines import ML_pycuda
if arch=='cupy':
if arch in ['cuda', 'cupy']:
from .accelerate.cuda_cupy.engines import projectional_cupy
from .accelerate.cuda_cupy.engines import projectional_cupy_stream
from .accelerate.cuda_cupy.engines import stochastic
Expand Down
29 changes: 21 additions & 8 deletions ptypy/accelerate/cuda_cupy/cufft.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,7 @@
from . import load_kernel
import numpy as np


class FFT_cuda(object):
class FFT_base(object):

def __init__(self, array, queue=None,
inplace=False,
Expand All @@ -18,17 +17,31 @@ def __init__(self, array, queue=None,
if dims < 2:
raise AssertionError('Input array must be at least 2-dimensional')
self.arr_shape = (array.shape[-2], array.shape[-1])
rows = self.arr_shape[0]
columns = self.arr_shape[1]
if rows != columns or rows not in [16, 32, 64, 128, 256, 512, 1024, 2048]:
raise ValueError(
"CUDA FFT only supports powers of 2 for rows/columns, from 16 to 2048")
self.batches = int(np.prod(
array.shape[0:dims-2]) if dims > 2 else 1)
self.forward = forward

self._load(array, pre_fft, post_fft, symmetric, forward)

class FFT_cuda(FFT_base):

def __init__(self, array, queue=None,
inplace=False,
pre_fft=None,
post_fft=None,
symmetric=True,
forward=True):
rows, columns = (array.shape[-2], array.shape[-1])
if rows != columns or rows not in [16, 32, 64, 128, 256, 512, 1024, 2048]:
raise ValueError(
"CUDA FFT only supports powers of 2 for rows/columns, from 16 to 2048")
super(FFT_cuda, self).__init__(array, queue=queue,
inplace=inplace,
pre_fft=pre_fft,
post_fft=post_fft,
symmetric=symmetric,
forward=forward)

def _load(self, array, pre_fft, post_fft, symmetric, forward):
if pre_fft is not None:
self.pre_fft = cp.asarray(pre_fft)
Expand Down Expand Up @@ -71,7 +84,7 @@ def _ift(self, input, output):
self.fftobj.ifft(input.data.ptr, output.data.ptr)


class FFT_cupy(FFT_cuda):
class FFT_cupy(FFT_base):

@property
def queue(self):
Expand Down
2 changes: 2 additions & 0 deletions ptypy/accelerate/cuda_cupy/engines/ML_cupy.py
Original file line number Diff line number Diff line change
Expand Up @@ -165,6 +165,8 @@ def _setup_kernels(self):
# TODO grow blocks dynamically
nma = min(fit, MAX_BLOCKS)
log_device_memory_stats(4)
log(4, 'Free memory available: {:.2f} GB'.format(float(mem)/(1024**3)))
log(4, 'Memory to be allocated per block: {:.2f} GB'.format(float(blk)/(1024**3)))
log(4, 'CuPy max blocks fitting on GPU: ma_arrays={}'.format(nma))
# reset memory or create new
self.w_data = GpuDataManager(ma_mem, 0, nma, False)
Expand Down
57 changes: 22 additions & 35 deletions ptypy/accelerate/cuda_cupy/engines/projectional_cupy.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,11 @@
"""

import numpy as np
import time
import cupy as cp

from ptypy import utils as u
from ptypy.accelerate.cuda_cupy import get_context, log_device_memory_stats
from ptypy.utils.verbose import logger, log
from ptypy.accelerate.cuda_cupy import get_context
from ptypy.utils.verbose import log
from ptypy.utils import parallel
from ptypy.engines import register
from ptypy.engines.projectional import DMMixin, RAARMixin
Expand Down Expand Up @@ -119,12 +118,16 @@ def _setup_kernels(self):
# create buffer arrays
ash = (fpc * nmodes,) + tuple(geo.shape)
aux = np.zeros(ash, dtype=np.complex64)
mempool = cp.get_default_memory_pool()
mem = cp.cuda.runtime.memGetInfo()[0] + mempool.total_bytes() - mempool.used_bytes()
if not int(mem) // aux.nbytes:
log(1,"Cannot fit memory into device, if possible reduce frames per block or nr. of modes. Exiting...")
raise SystemExit("ptypy has been exited.")
kern.aux = cp.asarray(aux)

# setup kernels, one for each SCAN.
log(4, "Setting up FourierUpdateKernel")
kern.FUK = FourierUpdateKernel(
aux, nmodes, queue_thread=self.queue)
kern.FUK = FourierUpdateKernel(aux, nmodes, queue_thread=self.queue)
kern.FUK.allocate()

log(4, "Setting up PoUpdateKernel")
Expand All @@ -142,15 +145,13 @@ def _setup_kernels(self):
kern.TK = TransposeKernel(queue=self.queue)

log(4, "Setting up PropagationKernel")
kern.PROP = PropagationKernel(
aux, geo.propagator, self.queue, self.p.fft_lib)
kern.PROP = PropagationKernel(aux, geo.propagator, self.queue, self.p.fft_lib)
kern.PROP.allocate()
kern.resolution = geo.resolution[0]

if self.do_position_refinement:
log(4, "Setting up PositionCorrectionKernel")
kern.PCK = PositionCorrectionKernel(
aux, nmodes, self.p.position_refinement, geo.resolution, queue_thread=self.queue)
kern.PCK = PositionCorrectionKernel(aux, nmodes, self.p.position_refinement, geo.resolution, queue_thread=self.queue)
kern.PCK.allocate()
log(4, "Kernel setup completed")

Expand Down Expand Up @@ -179,8 +180,7 @@ def engine_prepare(self):
prep = self.diff_info[d.ID]
prep.addr_gpu = cp.asarray(prep.addr)
if use_tiles:
prep.addr2 = np.ascontiguousarray(
np.transpose(prep.addr, (2, 3, 0, 1)))
prep.addr2 = np.ascontiguousarray(np.transpose(prep.addr, (2, 3, 0, 1)))
prep.addr2_gpu = cp.asarray(prep.addr2)
if self.do_position_refinement:
prep.mangled_addr_gpu = prep.addr_gpu.copy()
Expand Down Expand Up @@ -262,8 +262,7 @@ def engine_iterate(self, num=1):

# build exit wave
#AWK.build_exit(aux, addr, ob, pr, ex, alpha=self.p.alpha)
AWK.make_exit(aux, addr, ob, pr, ex, c_a=self._b,
c_po=self._a, c_e=-(self._a + self._b))
AWK.make_exit(aux, addr, ob, pr, ex, c_a=self._b, c_po=self._a, c_e=-(self._a + self._b))
FUK.exit_error(aux, addr)
FUK.error_reduce(addr, err_exit)

Expand Down Expand Up @@ -294,8 +293,7 @@ def engine_iterate(self, num=1):
err_fourier = prep.err_fourier_gpu.get()
err_phot = prep.err_phot_gpu.get()
err_exit = prep.err_exit_gpu.get()
errs = np.ascontiguousarray(
np.vstack([err_fourier, err_phot, err_exit]).T)
errs = np.ascontiguousarray(np.vstack([err_fourier, err_phot, err_exit]).T)
error.update(zip(prep.view_IDs, errs))

self.error = error
Expand All @@ -307,12 +305,9 @@ def position_update(self):
"""
if not self.do_position_refinement or (not self.curiter):
return
do_update_pos = (self.p.position_refinement.stop >
self.curiter >= self.p.position_refinement.start)
do_update_pos &= (self.curiter %
self.p.position_refinement.interval) == 0
use_tiles = (not self.p.probe_update_cuda_atomics) or (
not self.p.object_update_cuda_atomics)
do_update_pos = (self.p.position_refinement.stop > self.curiter >= self.p.position_refinement.start)
do_update_pos &= (self.curiter % self.p.position_refinement.interval) == 0
use_tiles = (not self.p.probe_update_cuda_atomics) or (not self.p.object_update_cuda_atomics)

# Update positions
if do_update_pos:
Expand Down Expand Up @@ -364,18 +359,15 @@ def position_update(self):

log(4, 'Position refinement trial: iteration %s' % (self.curiter))
for i in range(PCK.mangler.nshifts):
PCK.mangler.get_address(
i, addr, mangled_addr, max_oby, max_obx)
PCK.mangler.get_address(i, addr, mangled_addr, max_oby, max_obx)
PCK.build_aux(aux, mangled_addr, ob, pr)
PROP.fw(aux, aux)
if self.p.position_refinement.metric == "fourier":
PCK.fourier_error(aux, mangled_addr, mag, ma, ma_sum)
PCK.error_reduce(mangled_addr, err_fourier)
if self.p.position_refinement.metric == "photon":
PCK.log_likelihood(
aux, mangled_addr, mag, ma, err_fourier)
PCK.update_addr_and_error_state(
addr, error_state, mangled_addr, err_fourier)
PCK.log_likelihood(aux, mangled_addr, mag, ma, err_fourier)
PCK.update_addr_and_error_state(addr, error_state, mangled_addr, err_fourier)

cp.cuda.runtime.memcpyAsync(dst=err_fourier.data.ptr,
src=error_state.data.ptr,
Expand Down Expand Up @@ -413,8 +405,7 @@ def center_probe(self):
prep = self.diff_info[dID]
pID, oID, eID = prep.poe_IDs
if pID == name:
self.ex.S[eID].gpu = self.ISK.interpolate_shift(
self.ex.S[eID].gpu, shift)
self.ex.S[eID].gpu = self.ISK.interpolate_shift(self.ex.S[eID].gpu, shift)

log(4, 'Probe recentered from %s to %s'
% (str(tuple(c1)), str(tuple(c2))))
Expand Down Expand Up @@ -533,17 +524,15 @@ def support_constraint(self, storage=None):
if support is not None:
if storage.ID not in self.FSK:
supp = support.astype(np.complex64)
self.FSK[storage.ID] = FourierSupportKernel(
supp, self.queue, self.p.fft_lib)
self.FSK[storage.ID] = FourierSupportKernel(supp, self.queue, self.p.fft_lib)
self.FSK[storage.ID].allocate()
self.FSK[storage.ID].apply_fourier_support(storage.gpu)

# Real space
support = self._probe_support.get(storage.ID)
if support is not None:
if storage.ID not in self.RSK:
self.RSK[storage.ID] = RealSupportKernel(
support.astype(np.complex64))
self.RSK[storage.ID] = RealSupportKernel(support.astype(np.complex64))
self.RSK[storage.ID].allocate()
self.RSK[storage.ID].apply_real_support(storage.gpu)

Expand Down Expand Up @@ -584,13 +573,11 @@ def engine_finalize(self):
prep.addr = prep.addr_gpu.get()
del prep.addr_gpu


mempool = cp.get_default_memory_pool()
mempool.free_all_blocks()
pinned_pool = cp.get_default_pinned_memory_pool()
pinned_pool.free_all_blocks()


# we don't need the "benchmarking" in DM_serial
super().engine_finalize(benchmark=False)

Expand Down
Loading

0 comments on commit a7b635d

Please sign in to comment.