Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Changes to Cupy backend #483

Merged
merged 10 commits into from
Feb 2, 2024
2 changes: 1 addition & 1 deletion ptypy/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ def load_gpu_engines(arch='cuda'):
from .accelerate.cuda_pycuda.engines import projectional_pycuda_stream
from .accelerate.cuda_pycuda.engines import stochastic
from .accelerate.cuda_pycuda.engines import ML_pycuda
if arch=='cupy':
if arch in ['cuda', 'cupy']:
from .accelerate.cuda_cupy.engines import projectional_cupy
from .accelerate.cuda_cupy.engines import projectional_cupy_stream
from .accelerate.cuda_cupy.engines import stochastic
Expand Down
29 changes: 21 additions & 8 deletions ptypy/accelerate/cuda_cupy/cufft.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,7 @@
from . import load_kernel
import numpy as np


class FFT_cuda(object):
class FFT_base(object):

def __init__(self, array, queue=None,
inplace=False,
Expand All @@ -18,17 +17,31 @@ def __init__(self, array, queue=None,
if dims < 2:
raise AssertionError('Input array must be at least 2-dimensional')
self.arr_shape = (array.shape[-2], array.shape[-1])
rows = self.arr_shape[0]
columns = self.arr_shape[1]
if rows != columns or rows not in [16, 32, 64, 128, 256, 512, 1024, 2048]:
raise ValueError(
"CUDA FFT only supports powers of 2 for rows/columns, from 16 to 2048")
self.batches = int(np.prod(
array.shape[0:dims-2]) if dims > 2 else 1)
self.forward = forward

self._load(array, pre_fft, post_fft, symmetric, forward)

class FFT_cuda(FFT_base):

def __init__(self, array, queue=None,
inplace=False,
pre_fft=None,
post_fft=None,
symmetric=True,
forward=True):
rows, columns = (array.shape[-2], array.shape[-1])
if rows != columns or rows not in [16, 32, 64, 128, 256, 512, 1024, 2048]:
raise ValueError(
"CUDA FFT only supports powers of 2 for rows/columns, from 16 to 2048")
super(FFT_cuda, self).__init__(array, queue=queue,
inplace=inplace,
pre_fft=pre_fft,
post_fft=post_fft,
symmetric=symmetric,
forward=forward)

def _load(self, array, pre_fft, post_fft, symmetric, forward):
if pre_fft is not None:
self.pre_fft = cp.asarray(pre_fft)
Expand Down Expand Up @@ -71,7 +84,7 @@ def _ift(self, input, output):
self.fftobj.ifft(input.data.ptr, output.data.ptr)


class FFT_cupy(FFT_cuda):
class FFT_cupy(FFT_base):

@property
def queue(self):
Expand Down
2 changes: 2 additions & 0 deletions ptypy/accelerate/cuda_cupy/engines/ML_cupy.py
Original file line number Diff line number Diff line change
Expand Up @@ -165,6 +165,8 @@ def _setup_kernels(self):
# TODO grow blocks dynamically
nma = min(fit, MAX_BLOCKS)
log_device_memory_stats(4)
log(4, 'Free memory available: {:.2f} GB'.format(float(mem)/(1024**3)))
log(4, 'Memory to be allocated per block: {:.2f} GB'.format(float(blk)/(1024**3)))
log(4, 'CuPy max blocks fitting on GPU: ma_arrays={}'.format(nma))
# reset memory or create new
self.w_data = GpuDataManager(ma_mem, 0, nma, False)
Expand Down
57 changes: 22 additions & 35 deletions ptypy/accelerate/cuda_cupy/engines/projectional_cupy.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,11 @@
"""

import numpy as np
import time
import cupy as cp

from ptypy import utils as u
from ptypy.accelerate.cuda_cupy import get_context, log_device_memory_stats
from ptypy.utils.verbose import logger, log
from ptypy.accelerate.cuda_cupy import get_context
from ptypy.utils.verbose import log
from ptypy.utils import parallel
from ptypy.engines import register
from ptypy.engines.projectional import DMMixin, RAARMixin
Expand Down Expand Up @@ -119,12 +118,16 @@ def _setup_kernels(self):
# create buffer arrays
ash = (fpc * nmodes,) + tuple(geo.shape)
aux = np.zeros(ash, dtype=np.complex64)
mempool = cp.get_default_memory_pool()
mem = cp.cuda.runtime.memGetInfo()[0] + mempool.total_bytes() - mempool.used_bytes()
if not int(mem) // aux.nbytes:
log(1,"Cannot fit memory into device, if possible reduce frames per block or nr. of modes. Exiting...")
raise SystemExit("ptypy has been exited.")
kern.aux = cp.asarray(aux)

# setup kernels, one for each SCAN.
log(4, "Setting up FourierUpdateKernel")
kern.FUK = FourierUpdateKernel(
aux, nmodes, queue_thread=self.queue)
kern.FUK = FourierUpdateKernel(aux, nmodes, queue_thread=self.queue)
kern.FUK.allocate()

log(4, "Setting up PoUpdateKernel")
Expand All @@ -142,15 +145,13 @@ def _setup_kernels(self):
kern.TK = TransposeKernel(queue=self.queue)

log(4, "Setting up PropagationKernel")
kern.PROP = PropagationKernel(
aux, geo.propagator, self.queue, self.p.fft_lib)
kern.PROP = PropagationKernel(aux, geo.propagator, self.queue, self.p.fft_lib)
kern.PROP.allocate()
kern.resolution = geo.resolution[0]

if self.do_position_refinement:
log(4, "Setting up PositionCorrectionKernel")
kern.PCK = PositionCorrectionKernel(
aux, nmodes, self.p.position_refinement, geo.resolution, queue_thread=self.queue)
kern.PCK = PositionCorrectionKernel(aux, nmodes, self.p.position_refinement, geo.resolution, queue_thread=self.queue)
kern.PCK.allocate()
log(4, "Kernel setup completed")

Expand Down Expand Up @@ -179,8 +180,7 @@ def engine_prepare(self):
prep = self.diff_info[d.ID]
prep.addr_gpu = cp.asarray(prep.addr)
if use_tiles:
prep.addr2 = np.ascontiguousarray(
np.transpose(prep.addr, (2, 3, 0, 1)))
prep.addr2 = np.ascontiguousarray(np.transpose(prep.addr, (2, 3, 0, 1)))
prep.addr2_gpu = cp.asarray(prep.addr2)
if self.do_position_refinement:
prep.mangled_addr_gpu = prep.addr_gpu.copy()
Expand Down Expand Up @@ -262,8 +262,7 @@ def engine_iterate(self, num=1):

# build exit wave
#AWK.build_exit(aux, addr, ob, pr, ex, alpha=self.p.alpha)
AWK.make_exit(aux, addr, ob, pr, ex, c_a=self._b,
c_po=self._a, c_e=-(self._a + self._b))
AWK.make_exit(aux, addr, ob, pr, ex, c_a=self._b, c_po=self._a, c_e=-(self._a + self._b))
FUK.exit_error(aux, addr)
FUK.error_reduce(addr, err_exit)

Expand Down Expand Up @@ -294,8 +293,7 @@ def engine_iterate(self, num=1):
err_fourier = prep.err_fourier_gpu.get()
err_phot = prep.err_phot_gpu.get()
err_exit = prep.err_exit_gpu.get()
errs = np.ascontiguousarray(
np.vstack([err_fourier, err_phot, err_exit]).T)
errs = np.ascontiguousarray(np.vstack([err_fourier, err_phot, err_exit]).T)
error.update(zip(prep.view_IDs, errs))

self.error = error
Expand All @@ -307,12 +305,9 @@ def position_update(self):
"""
if not self.do_position_refinement or (not self.curiter):
return
do_update_pos = (self.p.position_refinement.stop >
self.curiter >= self.p.position_refinement.start)
do_update_pos &= (self.curiter %
self.p.position_refinement.interval) == 0
use_tiles = (not self.p.probe_update_cuda_atomics) or (
not self.p.object_update_cuda_atomics)
do_update_pos = (self.p.position_refinement.stop > self.curiter >= self.p.position_refinement.start)
do_update_pos &= (self.curiter % self.p.position_refinement.interval) == 0
use_tiles = (not self.p.probe_update_cuda_atomics) or (not self.p.object_update_cuda_atomics)

# Update positions
if do_update_pos:
Expand Down Expand Up @@ -364,18 +359,15 @@ def position_update(self):

log(4, 'Position refinement trial: iteration %s' % (self.curiter))
for i in range(PCK.mangler.nshifts):
PCK.mangler.get_address(
i, addr, mangled_addr, max_oby, max_obx)
PCK.mangler.get_address(i, addr, mangled_addr, max_oby, max_obx)
PCK.build_aux(aux, mangled_addr, ob, pr)
PROP.fw(aux, aux)
if self.p.position_refinement.metric == "fourier":
PCK.fourier_error(aux, mangled_addr, mag, ma, ma_sum)
PCK.error_reduce(mangled_addr, err_fourier)
if self.p.position_refinement.metric == "photon":
PCK.log_likelihood(
aux, mangled_addr, mag, ma, err_fourier)
PCK.update_addr_and_error_state(
addr, error_state, mangled_addr, err_fourier)
PCK.log_likelihood(aux, mangled_addr, mag, ma, err_fourier)
PCK.update_addr_and_error_state(addr, error_state, mangled_addr, err_fourier)

cp.cuda.runtime.memcpyAsync(dst=err_fourier.data.ptr,
src=error_state.data.ptr,
Expand Down Expand Up @@ -413,8 +405,7 @@ def center_probe(self):
prep = self.diff_info[dID]
pID, oID, eID = prep.poe_IDs
if pID == name:
self.ex.S[eID].gpu = self.ISK.interpolate_shift(
self.ex.S[eID].gpu, shift)
self.ex.S[eID].gpu = self.ISK.interpolate_shift(self.ex.S[eID].gpu, shift)

log(4, 'Probe recentered from %s to %s'
% (str(tuple(c1)), str(tuple(c2))))
Expand Down Expand Up @@ -533,17 +524,15 @@ def support_constraint(self, storage=None):
if support is not None:
if storage.ID not in self.FSK:
supp = support.astype(np.complex64)
self.FSK[storage.ID] = FourierSupportKernel(
supp, self.queue, self.p.fft_lib)
self.FSK[storage.ID] = FourierSupportKernel(supp, self.queue, self.p.fft_lib)
self.FSK[storage.ID].allocate()
self.FSK[storage.ID].apply_fourier_support(storage.gpu)

# Real space
support = self._probe_support.get(storage.ID)
if support is not None:
if storage.ID not in self.RSK:
self.RSK[storage.ID] = RealSupportKernel(
support.astype(np.complex64))
self.RSK[storage.ID] = RealSupportKernel(support.astype(np.complex64))
self.RSK[storage.ID].allocate()
self.RSK[storage.ID].apply_real_support(storage.gpu)

Expand Down Expand Up @@ -584,13 +573,11 @@ def engine_finalize(self):
prep.addr = prep.addr_gpu.get()
del prep.addr_gpu


mempool = cp.get_default_memory_pool()
mempool.free_all_blocks()
pinned_pool = cp.get_default_pinned_memory_pool()
pinned_pool.free_all_blocks()


# we don't need the "benchmarking" in DM_serial
super().engine_finalize(benchmark=False)

Expand Down
Loading