Skip to content

Commit

Permalink
Provide a safe value for frames_per_block for all pycuda engines
Browse files Browse the repository at this point in the history
  • Loading branch information
ptim0626 committed Nov 22, 2024
1 parent dd4a3db commit f7141e8
Show file tree
Hide file tree
Showing 4 changed files with 62 additions and 3 deletions.
18 changes: 17 additions & 1 deletion ptypy/accelerate/cuda_pycuda/engines/ML_pycuda.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@
from ptypy import utils as u
from ptypy.utils.verbose import logger, log
from ptypy.utils import parallel
from ptypy.accelerate.base.mem_utils import (max_fpb_from_scans,
calculate_safe_fpb)
from .. import get_context, get_dev_pool
from ..kernels import PropagationKernel, RealSupportKernel, FourierSupportKernel
from ..kernels import GradientDescentKernel, AuxiliaryWaveKernel, PoUpdateKernel, PositionCorrectionKernel
Expand All @@ -33,6 +35,8 @@

MAX_BLOCKS = 99999 # can be used to limit the number of blocks, simulating that they don't fit
#MAX_BLOCKS = 3 # can be used to limit the number of blocks, simulating that they don't fit
# number of blocks to aim for when suggesting a safe value of 'frames_per_block'
NUM_BLK_SAFE_FPB = 3

@register()
class ML_pycuda(ML_serial):
Expand Down Expand Up @@ -163,9 +167,21 @@ def _setup_kernels(self):
ma_mem = mag_mem
mem = cuda.mem_get_info()[0]
blk = ma_mem + mag_mem
fit = int(mem - 200 * 1024 * 1024) // blk # leave 200MB room for safety

# leave 200MB room for safety
avail_mem = max(int(mem - 200 * 1024 * 1024), 0)
fit = avail_mem // blk
if not fit:
log(1,"Cannot fit memory into device, if possible reduce frames per block. Exiting...")
# max_fpb is None if there is a GradFull in the scan models
# as 'frames_per_block' is irrelevant
max_fpb = max_fpb_from_scans(self.ptycho.model.scans)
if max_fpb is not None:
per_frame = blk / max_fpb
safe_fpb = calculate_safe_fpb(avail_mem, per_frame, NUM_BLK_SAFE_FPB)
log(1,f"Your current 'frames_per_block' is {max_fpb}.")
log(1,f"With current reconstruction parameters and computing resources, you can try setting 'frames_per_block' to {safe_fpb}.")
log(1,f"This would divide your reonstruction into {NUM_BLK_SAFE_FPB} blocks.")
raise SystemExit("ptypy has been exited.")

# TODO grow blocks dynamically
Expand Down
12 changes: 12 additions & 0 deletions ptypy/accelerate/cuda_pycuda/engines/projectional_pycuda.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
from ptypy.engines import register
from ptypy.engines.projectional import DMMixin, RAARMixin
from ptypy.accelerate.base.engines import projectional_serial
from ptypy.accelerate.base.mem_utils import calculate_safe_fpb
from .. import get_context
from ..kernels import FourierUpdateKernel, AuxiliaryWaveKernel, PoUpdateKernel, PositionCorrectionKernel
from ..kernels import PropagationKernel, RealSupportKernel, FourierSupportKernel
Expand All @@ -30,6 +31,10 @@

__all__ = ['DM_pycuda', 'RAAR_pycuda']

# number of blocks to aim for when suggesting a safe value of 'frames_per_block'
NUM_BLK_SAFE_FPB = 3


class _ProjectionEngine_pycuda(projectional_serial._ProjectionEngine_serial):

"""
Expand Down Expand Up @@ -123,6 +128,13 @@ def _setup_kernels(self):
mem = cuda.mem_get_info()[0]
if not int(mem) // aux.nbytes:
log(1,"Cannot fit memory into device, if possible reduce frames per block or nr. of modes. Exiting...")
if scan.__class__.__name__ != "GradFull":
# only makes sense if the model is not GradFull
per_frame = (aux.nbytes / aux.shape[0]) * nmodes
safe_fpb = calculate_safe_fpb(mem, per_frame, NUM_BLK_SAFE_FPB)
log(1,f"Your current 'frames_per_block' is {fpc}.")
log(1,f"With current reconstruction parameters and computing resources, you can try setting 'frames_per_block' to {safe_fpb}.")
log(1,f"This would divide your reonstruction into {NUM_BLK_SAFE_FPB} blocks.")
raise SystemExit("ptypy has been exited.")
kern.aux = gpuarray.to_gpu(aux)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@
from ptypy import utils as u
from ptypy.utils.verbose import log, logger
from ptypy.utils import parallel
from ptypy.accelerate.base.mem_utils import (max_fpb_from_scans,
calculate_safe_fpb)
from ptypy.engines import register
from ptypy.engines.projectional import DMMixin, RAARMixin
from . import projectional_pycuda
Expand All @@ -32,6 +34,8 @@
EX_MA_BLOCKS_RATIO = 2
MAX_BLOCKS = 99999 # can be used to limit the number of blocks, simulating that they don't fit
#MAX_BLOCKS = 3 # can be used to limit the number of blocks, simulating that they don't fit
# number of blocks to aim for when suggesting a safe value of 'frames_per_block'
NUM_BLK_SAFE_FPB = 3

__all__ = ['DM_pycuda_stream', 'RAAR_pycuda_stream']

Expand Down Expand Up @@ -61,9 +65,21 @@ def _setup_kernels(self):
ma_mem = mag_mem
mem = cuda.mem_get_info()[0]
blk = ex_mem * EX_MA_BLOCKS_RATIO + ma_mem + mag_mem
fit = int(mem - 200 * 1024 * 1024) // blk # leave 200MB room for safety

# leave 200MB room for safety
avail_mem = max(int(mem - 200 * 1024 * 1024), 0)
fit = avail_mem // blk
if not fit:
log(1,"Cannot fit memory into device, if possible reduce frames per block. Exiting...")
# max_fpb is None if there is a GradFull in the scan models
# as 'frames_per_block' is irrelevant
max_fpb = max_fpb_from_scans(self.ptycho.model.scans)
if max_fpb is not None:
per_frame = blk / max_fpb
safe_fpb = calculate_safe_fpb(avail_mem, per_frame, NUM_BLK_SAFE_FPB)
log(1,f"Your current 'frames_per_block' is {max_fpb}.")
log(1,f"With current reconstruction parameters and computing resources, you can try setting 'frames_per_block' to {safe_fpb}.")
log(1,f"This would divide your reonstruction into {NUM_BLK_SAFE_FPB} blocks.")
raise SystemExit("ptypy has been exited.")

# TODO grow blocks dynamically
Expand Down
17 changes: 16 additions & 1 deletion ptypy/accelerate/cuda_pycuda/engines/stochastic.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@
from ptypy.engines import register
from ptypy.engines.stochastic import EPIEMixin, SDRMixin
from ptypy.accelerate.base.engines.stochastic import _StochasticEngineSerial
from ptypy.accelerate.base.mem_utils import (max_fpb_from_scans,
calculate_safe_fpb)
from ptypy.accelerate.base import address_manglers
from .. import get_context
from ..kernels import FourierUpdateKernel, AuxiliaryWaveKernel, PoUpdateKernel,\
Expand All @@ -34,6 +36,8 @@
EX_MA_BLOCKS_RATIO = 2
MAX_BLOCKS = 99999 # can be used to limit the number of blocks, simulating that they don't fit
#MAX_BLOCKS = 10 # can be used to limit the number of blocks, simulating that they don't fit
# number of blocks to aim for when suggesting a safe value of 'frames_per_block'
NUM_BLK_SAFE_FPB = 3

class _StochasticEnginePycuda(_StochasticEngineSerial):

Expand Down Expand Up @@ -157,9 +161,20 @@ def _setup_kernels(self):
ma_mem = mag_mem
mem = cuda.mem_get_info()[0]
blk = ex_mem * EX_MA_BLOCKS_RATIO + ma_mem + mag_mem
fit = int(mem - 200 * 1024 * 1024) // blk # leave 200MB room for safety
# leave 200MB room for safety
avail_mem = max(int(mem - 200 * 1024 * 1024), 0)
fit = avail_mem // blk
if not fit:
log(1,"Cannot fit memory into device, if possible reduce frames per block. Exiting...")
# max_fpb is None if there is a GradFull in the scan models
# as 'frames_per_block' is irrelevant
max_fpb = max_fpb_from_scans(self.ptycho.model.scans)
if max_fpb is not None:
per_frame = blk / max_fpb
safe_fpb = calculate_safe_fpb(avail_mem, per_frame, NUM_BLK_SAFE_FPB)
log(1,f"Your current 'frames_per_block' is {max_fpb}.")
log(1,f"With current reconstruction parameters and computing resources, you can try setting 'frames_per_block' to {safe_fpb}.")
log(1,f"This would divide your reonstruction into {NUM_BLK_SAFE_FPB} blocks.")
raise SystemExit("ptypy has been exited.")

# TODO grow blocks dynamically
Expand Down

0 comments on commit f7141e8

Please sign in to comment.