Maximise the usage of dynamic shared memory for GaussianSmoothingKernel #519

Closed
wants to merge 2 commits
31 changes: 27 additions & 4 deletions ptypy/accelerate/cuda_cupy/__init__.py
@@ -31,7 +31,9 @@ def get_context(new_queue=False):
return queue


def load_kernel(name, subs={}, file=None, options=None):
def load_kernel(name, subs={}, file=None, options=None, use_max_shm_optin=False):

global device

if file is None:
if isinstance(name, str):
@@ -54,10 +56,31 @@ def load_kernel(name, subs={}, file=None, options=None):
if options is not None:
opt += list(options)
module = cp.RawModule(code=kernel, options=tuple(opt))

# explicit opt-in to use the max shared memory available for this device
if use_max_shm_optin:
devprop = cp.cuda.runtime.getDeviceProperties(device)
# default to the static limit
max_shm = devprop.get('sharedMemPerBlockOptin', 48*1024)

if isinstance(name, str):
return module.get_function(name)
func = module.get_function(name)
if use_max_shm_optin:
try:
func.max_dynamic_shared_size_bytes = max_shm
except Exception:
# keep the default limit if the attribute cannot be set
pass
return func
else: # tuple
return tuple(module.get_function(n) for n in name)
func = tuple(module.get_function(n) for n in name)
if use_max_shm_optin:
for f in func:
try:
# opt each kernel in the tuple in to the larger limit
f.max_dynamic_shared_size_bytes = max_shm
except Exception:
pass
return func

def log_device_memory_stats(level=4, heading: str ='Device Memory Stats'):
mempool = cp.get_default_memory_pool()
@@ -70,4 +93,4 @@ def log_device_memory_stats(level=4, heading: str ='Device Memory Stats'):
log(level, f'MemoryPool used : {mempool.used_bytes()/1024/1024} MB')
log(level, f'MemoryPool limit : {mempool.get_limit()/1024/1024} MB')
log(level, f'MemoryPool free blocks: {mempool.n_free_blocks()}')
log(level, f'PinnedPool free blocks: {pinned_pool.n_free_blocks()}')
log(level, f'PinnedPool free blocks: {pinned_pool.n_free_blocks()}')
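
For context, here is a minimal standalone sketch of what the new `use_max_shm_optin` path does on the CuPy side. The device index `0` and the `dummy` kernel are illustrative placeholders, not part of this PR:

```python
import cupy as cp

# Placeholder kernel that only exercises dynamic shared memory.
src = r'''
extern "C" __global__ void dummy(float* out) {
    extern __shared__ float smem[];
    smem[threadIdx.x] = (float)threadIdx.x;
    out[threadIdx.x] = smem[threadIdx.x];
}
'''
module = cp.RawModule(code=src)
func = module.get_function("dummy")

# Query the opt-in limit; fall back to the 48 kB static limit if missing.
props = cp.cuda.runtime.getDeviceProperties(0)
max_shm = props.get('sharedMemPerBlockOptin', 48 * 1024)

# Raise the per-kernel dynamic shared memory cap above the 48 kB default.
func.max_dynamic_shared_size_bytes = max_shm
print(func.attributes['max_dynamic_shared_size_bytes'])
```

On devices whose opt-in limit exceeds 48 kB, a single block can then use the larger carve-out, which is what the convolution kernels below rely on.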
27 changes: 18 additions & 9 deletions ptypy/accelerate/cuda_cupy/array_utils.py
@@ -334,27 +334,36 @@ def __init__(self, queue=None, num_stdevs=4, kernel_type='float'):
self.blockdim_x = 4
self.blockdim_y = 16

# At least 2 blocks per SM
self.max_shared_per_block = 48 * 1024 // 2
self.max_shared_per_block_complex = self.max_shared_per_block / \
2 * np.dtype(np.float32).itemsize
self.max_kernel_radius = int(
self.max_shared_per_block_complex / self.blockdim_y)

# explicit opt-in to use the max dynamic shared memory available on the
# device for these two kernels
self.convolution_row = load_kernel(
"convolution_row", file="convolution.cu", subs={
"convolution_row", file="convolution.cu", use_max_shm_optin=True,
subs={
'BDIM_X': self.blockdim_x,
'BDIM_Y': self.blockdim_y,
'DTYPE': self.stype,
'MATH_TYPE': self.kernel_type
})

self.convolution_col = load_kernel(
"convolution_col", file="convolution.cu", subs={
"convolution_col", file="convolution.cu", use_max_shm_optin=True,
subs={
'BDIM_X': self.blockdim_y, # NOTE: x and y are swapped for the column kernel
'BDIM_Y': self.blockdim_x,
'DTYPE': self.stype,
'MATH_TYPE': self.kernel_type
})

# At least 2 blocks per SM
max_shm = min(self.convolution_row.attributes['max_dynamic_shared_size_bytes'],
self.convolution_col.attributes['max_dynamic_shared_size_bytes'])
# 1 kB is reserved by the device
self.max_shared_per_block = (max_shm - 1024) // 2
self.max_shared_per_block_complex = self.max_shared_per_block / \
2 * np.dtype(np.float32).itemsize
self.max_kernel_radius = int(
self.max_shared_per_block_complex / self.blockdim_y)

# pre-allocate kernel memory on gpu, with max-radius to accommodate
dtype = np.float32 if self.kernel_type == 'float' else np.float64
self.kernel_gpu = cp.empty((self.max_kernel_radius,), dtype=dtype)
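
To illustrate the effect of the new sizing, here is the same arithmetic applied to a hypothetical device that reports a 96 kB opt-in limit (the exact limit is device dependent):

```python
import numpy as np

blockdim_y = 16
max_shm = 96 * 1024                      # hypothetical sharedMemPerBlockOptin

# New sizing: reserve 1 kB, keep at least 2 blocks per SM.
max_shared_per_block = (max_shm - 1024) // 2
max_shared_per_block_complex = max_shared_per_block / 2 * np.dtype(np.float32).itemsize
max_kernel_radius = int(max_shared_per_block_complex / blockdim_y)
print(max_kernel_radius)                 # 6080

# Old sizing: hard-coded 48 kB static limit.
old_radius = int((48 * 1024 // 2) / 2 * np.dtype(np.float32).itemsize / blockdim_y)
print(old_radius)                        # 3072
```

So on such a device the maximum smoothing kernel radius roughly doubles compared to the old hard-coded limit.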
37 changes: 31 additions & 6 deletions ptypy/accelerate/cuda_pycuda/__init__.py
@@ -1,5 +1,6 @@
import pycuda.driver as cuda
from pycuda.compiler import SourceModule
from pycuda._driver import function_attribute
import numpy as np
import os
kernel_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'cuda_common'))
@@ -37,11 +38,13 @@ def get_context(new_context=False, new_queue=False):
# str(parallel.rank_local)))
if queue is None or new_queue:
queue = cuda.Stream()

return context, queue


def load_kernel(name, subs={}, file=None):
def load_kernel(name, subs={}, file=None, use_max_shm_optin=False):

global context

if file is None:
if isinstance(name, str):
@@ -59,9 +62,31 @@ def load_kernel(name, subs={}, file=None):
escaped = fn.replace("\\", "\\\\")
kernel = '#line 1 "{}"\n'.format(escaped) + kernel
mod = SourceModule(kernel, include_dirs=[np.get_include()], no_extern_c=True, options=debug_options)


# explicit opt-in to use the max shared memory available for this device
if use_max_shm_optin:
dev = context.get_device()
try:
max_shm = dev.get_attribute(cuda.device_attribute.MAX_SHARED_MEMORY_PER_BLOCK_OPTIN)
except Exception:
# fall back to the default 48 kB static limit
max_shm = 48 * 1024

if isinstance(name, str):
return mod.get_function(name)
func = mod.get_function(name)
if use_max_shm_optin:
try:
func.set_attribute(function_attribute.MAX_DYNAMIC_SHARED_SIZE_BYTES, max_shm)
except Exception:
# keep the default limit if the attribute cannot be set
pass
return func
else: # tuple
return tuple(mod.get_function(n) for n in name)

func = tuple(mod.get_function(n) for n in name)
if use_max_shm_optin:
for f in func:
try:
# opt each kernel in the tuple in to the larger limit
f.set_attribute(function_attribute.MAX_DYNAMIC_SHARED_SIZE_BYTES, max_shm)
except Exception:
pass
return func
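
For the PyCUDA path the equivalent opt-in looks roughly like the sketch below. It assumes `pycuda.autoinit` for context creation, a PyCUDA build against CUDA 9 or newer, and a placeholder `dummy` kernel; none of these are part of this PR:

```python
import pycuda.autoinit                    # creates a context on the default device
import pycuda.driver as cuda
from pycuda.compiler import SourceModule
from pycuda._driver import function_attribute

# Placeholder kernel that only exercises dynamic shared memory.
mod = SourceModule(r'''
extern "C" __global__ void dummy(float* out) {
    extern __shared__ float smem[];
    smem[threadIdx.x] = (float)threadIdx.x;
    out[threadIdx.x] = smem[threadIdx.x];
}
''', no_extern_c=True)
func = mod.get_function("dummy")

dev = pycuda.autoinit.device
max_shm = dev.get_attribute(cuda.device_attribute.MAX_SHARED_MEMORY_PER_BLOCK_OPTIN)

# Opt this kernel in to the larger dynamic shared memory carve-out.
func.set_attribute(function_attribute.MAX_DYNAMIC_SHARED_SIZE_BYTES, max_shm)
print(func.get_attribute(function_attribute.MAX_DYNAMIC_SHARED_SIZE_BYTES))
```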
24 changes: 15 additions & 9 deletions ptypy/accelerate/cuda_pycuda/array_utils.py
@@ -336,34 +336,40 @@ def __init__(self, queue=None, num_stdevs=4, kernel_type='float'):
self.blockdim_x = 4
self.blockdim_y = 16


# At least 2 blocks per SM
self.max_shared_per_block = 48 * 1024 // 2
self.max_shared_per_block_complex = self.max_shared_per_block / 2 * np.dtype(np.float32).itemsize
self.max_kernel_radius = int(self.max_shared_per_block_complex / self.blockdim_y)

# explicit opt-in to use the max dynamic shared memory available on the
# device for these two kernels
self.convolution_row = load_kernel(
"convolution_row", file="convolution.cu", subs={
"convolution_row", file="convolution.cu", use_max_shm_optin=True,
subs={
'BDIM_X': self.blockdim_x,
'BDIM_Y': self.blockdim_y,
'DTYPE': self.stype,
'MATH_TYPE': self.kernel_type
})
self.convolution_col = load_kernel(
"convolution_col", file="convolution.cu", subs={
"convolution_col", file="convolution.cu", use_max_shm_optin=True,
subs={
'BDIM_X': self.blockdim_y, # NOTE: x and y are swapped for the column kernel
'BDIM_Y': self.blockdim_x,
'DTYPE': self.stype,
'MATH_TYPE': self.kernel_type
})

# At least 2 blocks per SM
max_shm = min(self.convolution_row.max_dynamic_shared_size_bytes,
self.convolution_col.max_dynamic_shared_size_bytes)
# 1 kB is reserved by the device
self.max_shared_per_block = (max_shm - 1024) // 2
self.max_shared_per_block_complex = self.max_shared_per_block / 2 * np.dtype(np.float32).itemsize
self.max_kernel_radius = int(self.max_shared_per_block_complex / self.blockdim_y)

# pre-allocate kernel memory on gpu, with max-radius to accommodate
dtype=np.float32 if self.kernel_type == 'float' else np.float64
self.kernel_gpu = gpuarray.empty((self.max_kernel_radius,), dtype=dtype)
# keep track of previous radius and std to determine if we need to transfer again
self.r = 0
self.std = 0


def convolution(self, data, mfs, tmp=None):
"""
Calculates a stacked 2D convolution for smoothing, with the standard deviations
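
Finally, a quick, hypothetical way to check that the opt-in took effect end to end (not part of the PR; it assumes the import paths shown in this diff and a working CUDA setup):

```python
from ptypy.accelerate.cuda_pycuda import get_context
from ptypy.accelerate.cuda_pycuda.array_utils import GaussianSmoothingKernel
from pycuda._driver import function_attribute

# Set up ptypy's PyCUDA context and stream, then build the smoothing kernel.
context, queue = get_context(new_queue=True)
gsk = GaussianSmoothingKernel(queue=queue)

# On devices whose opt-in shared memory exceeds 48 kB, both the per-kernel
# limit and the maximum smoothing radius should now be larger than before.
limit = gsk.convolution_row.get_attribute(
    function_attribute.MAX_DYNAMIC_SHARED_SIZE_BYTES)
print(limit, gsk.max_kernel_radius)
```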