Maximise the usage of dynamic shared memory for GaussianSmoothingKernel #519

Closed
wants to merge 2 commits
31 changes: 27 additions & 4 deletions ptypy/accelerate/cuda_cupy/__init__.py
@@ -31,7 +31,9 @@ def get_context(new_queue=False):
return queue


def load_kernel(name, subs={}, file=None, options=None):
def load_kernel(name, subs={}, file=None, options=None, use_max_shm_optin=False):

global device

if file is None:
if isinstance(name, str):
@@ -54,10 +56,31 @@ def load_kernel(name, subs={}, file=None, options=None):
if options is not None:
opt += list(options)
module = cp.RawModule(code=kernel, options=tuple(opt))

# explicit opt-in to use the max shared memory available for this device
if use_max_shm_optin:
devprop = cp.cuda.runtime.getDeviceProperties(device)
# default to the static limit
max_shm = devprop.get('sharedMemPerBlockOptin', 48*1024)

if isinstance(name, str):
return module.get_function(name)
func = module.get_function(name)
if use_max_shm_optin:
try:
func.max_dynamic_shared_size_bytes = max_shm
except Exception:
# keep the default limit if the attribute cannot be set
pass
return func
else: # tuple
return tuple(module.get_function(n) for n in name)
func = tuple(module.get_function(n) for n in name)
if use_max_shm_optin:
for f in func:
try:
# opt each kernel in the tuple in to the larger limit
f.max_dynamic_shared_size_bytes = max_shm
except Exception:
pass
return func

def log_device_memory_stats(level=4, heading: str ='Device Memory Stats'):
mempool = cp.get_default_memory_pool()
@@ -70,4 +93,4 @@ def log_device_memory_stats(level=4, heading: str ='Device Memory Stats'):
log(level, f'MemoryPool used : {mempool.used_bytes()/1024/1024} MB')
log(level, f'MemoryPool limit : {mempool.get_limit()/1024/1024} MB')
log(level, f'MemoryPool free blocks: {mempool.n_free_blocks()}')
log(level, f'PinnedPool free blocks: {pinned_pool.n_free_blocks()}')
log(level, f'PinnedPool free blocks: {pinned_pool.n_free_blocks()}')
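
For context, here is a minimal standalone sketch of what the new `use_max_shm_optin` path does on the CuPy side. The device index `0` and the `dummy` kernel are illustrative placeholders, not part of this PR:

```python
import cupy as cp

# Placeholder kernel that only exercises dynamic shared memory.
src = r'''
extern "C" __global__ void dummy(float* out) {
    extern __shared__ float smem[];
    smem[threadIdx.x] = (float)threadIdx.x;
    out[threadIdx.x] = smem[threadIdx.x];
}
'''
module = cp.RawModule(code=src)
func = module.get_function("dummy")

# Query the opt-in limit; fall back to the 48 kB static limit if missing.
props = cp.cuda.runtime.getDeviceProperties(0)
max_shm = props.get('sharedMemPerBlockOptin', 48 * 1024)

# Raise the per-kernel dynamic shared memory cap above the 48 kB default.
func.max_dynamic_shared_size_bytes = max_shm
print(func.attributes['max_dynamic_shared_size_bytes'])
```

On devices whose opt-in limit exceeds 48 kB, a single block can then use the larger carve-out, which is what the convolution kernels below rely on.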
27 changes: 18 additions & 9 deletions ptypy/accelerate/cuda_cupy/array_utils.py
@@ -334,27 +334,36 @@ def __init__(self, queue=None, num_stdevs=4, kernel_type='float'):
self.blockdim_x = 4
self.blockdim_y = 16

# At least 2 blocks per SM
self.max_shared_per_block = 48 * 1024 // 2
self.max_shared_per_block_complex = self.max_shared_per_block / \
2 * np.dtype(np.float32).itemsize
self.max_kernel_radius = int(
self.max_shared_per_block_complex / self.blockdim_y)

# explicit opt-in to use the max dynamic shared memory available on the
# device for these two kernels
self.convolution_row = load_kernel(
"convolution_row", file="convolution.cu", subs={
"convolution_row", file="convolution.cu", use_max_shm_optin=True,
subs={
'BDIM_X': self.blockdim_x,
'BDIM_Y': self.blockdim_y,
'DTYPE': self.stype,
'MATH_TYPE': self.kernel_type
})

self.convolution_col = load_kernel(
"convolution_col", file="convolution.cu", subs={
"convolution_col", file="convolution.cu", use_max_shm_optin=True,
subs={
'BDIM_X': self.blockdim_y, # NOTE: x and y are swapped for the column kernel
'BDIM_Y': self.blockdim_x,
'DTYPE': self.stype,
'MATH_TYPE': self.kernel_type
})

# At least 2 blocks per SM
max_shm = min(self.convolution_row.attributes['max_dynamic_shared_size_bytes'],
self.convolution_col.attributes['max_dynamic_shared_size_bytes'])
# 1 kB is reserved by the device
self.max_shared_per_block = (max_shm - 1024) // 2
self.max_shared_per_block_complex = self.max_shared_per_block / \
2 * np.dtype(np.float32).itemsize
self.max_kernel_radius = int(
self.max_shared_per_block_complex / self.blockdim_y)

# pre-allocate kernel memory on gpu, with max-radius to accommodate
dtype = np.float32 if self.kernel_type == 'float' else np.float64
self.kernel_gpu = cp.empty((self.max_kernel_radius,), dtype=dtype)
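
To illustrate the effect of the new sizing, here is the same arithmetic applied to a hypothetical device that reports a 96 kB opt-in limit (the exact limit is device dependent):

```python
import numpy as np

blockdim_y = 16
max_shm = 96 * 1024                      # hypothetical sharedMemPerBlockOptin

# New sizing: reserve 1 kB, keep at least 2 blocks per SM.
max_shared_per_block = (max_shm - 1024) // 2
max_shared_per_block_complex = max_shared_per_block / 2 * np.dtype(np.float32).itemsize
max_kernel_radius = int(max_shared_per_block_complex / blockdim_y)
print(max_kernel_radius)                 # 6080

# Old sizing: hard-coded 48 kB static limit.
old_radius = int((48 * 1024 // 2) / 2 * np.dtype(np.float32).itemsize / blockdim_y)
print(old_radius)                        # 3072
```

So on such a device the maximum smoothing kernel radius roughly doubles compared to the old hard-coded limit.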
37 changes: 31 additions & 6 deletions ptypy/accelerate/cuda_pycuda/__init__.py
@@ -1,5 +1,6 @@
import pycuda.driver as cuda
from pycuda.compiler import SourceModule
from pycuda._driver import function_attribute
import numpy as np
import os
kernel_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'cuda_common'))
@@ -37,11 +38,13 @@ def get_context(new_context=False, new_queue=False):
# str(parallel.rank_local)))
if queue is None or new_queue:
queue = cuda.Stream()

return context, queue


def load_kernel(name, subs={}, file=None):
def load_kernel(name, subs={}, file=None, use_max_shm_optin=False):

global context

if file is None:
if isinstance(name, str):
@@ -59,9 +62,31 @@ def load_kernel(name, subs={}, file=None):
escaped = fn.replace("\\", "\\\\")
kernel = '#line 1 "{}"\n'.format(escaped) + kernel
mod = SourceModule(kernel, include_dirs=[np.get_include()], no_extern_c=True, options=debug_options)


# explicit opt-in to use the max shared memory available for this device
if use_max_shm_optin:
dev = context.get_device()
try:
max_shm = dev.get_attribute(cuda.device_attribute.MAX_SHARED_MEMORY_PER_BLOCK_OPTIN)
except Exception:
# fall back to the default 48 kB static limit
max_shm = 48 * 1024

if isinstance(name, str):
return mod.get_function(name)
func = mod.get_function(name)
if use_max_shm_optin:
try:
func.set_attribute(function_attribute.MAX_DYNAMIC_SHARED_SIZE_BYTES, max_shm)
except Exception:
# keep the default limit if the attribute cannot be set
pass
return func
else: # tuple
return tuple(mod.get_function(n) for n in name)

func = tuple(mod.get_function(n) for n in name)
if use_max_shm_optin:
for f in func:
try:
# opt each kernel in the tuple in to the larger limit
f.set_attribute(function_attribute.MAX_DYNAMIC_SHARED_SIZE_BYTES, max_shm)
except Exception:
pass
return func
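
For the PyCUDA path the equivalent opt-in looks roughly like the sketch below. It assumes `pycuda.autoinit` for context creation, a PyCUDA build against CUDA 9 or newer, and a placeholder `dummy` kernel; none of these are part of this PR:

```python
import pycuda.autoinit                    # creates a context on the default device
import pycuda.driver as cuda
from pycuda.compiler import SourceModule
from pycuda._driver import function_attribute

# Placeholder kernel that only exercises dynamic shared memory.
mod = SourceModule(r'''
extern "C" __global__ void dummy(float* out) {
    extern __shared__ float smem[];
    smem[threadIdx.x] = (float)threadIdx.x;
    out[threadIdx.x] = smem[threadIdx.x];
}
''', no_extern_c=True)
func = mod.get_function("dummy")

dev = pycuda.autoinit.device
max_shm = dev.get_attribute(cuda.device_attribute.MAX_SHARED_MEMORY_PER_BLOCK_OPTIN)

# Opt this kernel in to the larger dynamic shared memory carve-out.
func.set_attribute(function_attribute.MAX_DYNAMIC_SHARED_SIZE_BYTES, max_shm)
print(func.get_attribute(function_attribute.MAX_DYNAMIC_SHARED_SIZE_BYTES))
```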
24 changes: 15 additions & 9 deletions ptypy/accelerate/cuda_pycuda/array_utils.py
@@ -336,34 +336,40 @@ def __init__(self, queue=None, num_stdevs=4, kernel_type='float'):
self.blockdim_x = 4
self.blockdim_y = 16


# At least 2 blocks per SM
self.max_shared_per_block = 48 * 1024 // 2
self.max_shared_per_block_complex = self.max_shared_per_block / 2 * np.dtype(np.float32).itemsize
self.max_kernel_radius = int(self.max_shared_per_block_complex / self.blockdim_y)

# explicit opt-in to use the max dynamic shared memory available on the
# device for these two kernels
self.convolution_row = load_kernel(
"convolution_row", file="convolution.cu", subs={
"convolution_row", file="convolution.cu", use_max_shm_optin=True,
subs={
'BDIM_X': self.blockdim_x,
'BDIM_Y': self.blockdim_y,
'DTYPE': self.stype,
'MATH_TYPE': self.kernel_type
})
self.convolution_col = load_kernel(
"convolution_col", file="convolution.cu", subs={
"convolution_col", file="convolution.cu", use_max_shm_optin=True,
subs={
'BDIM_X': self.blockdim_y, # NOTE: x and y are swapped for the column kernel
'BDIM_Y': self.blockdim_x,
'DTYPE': self.stype,
'MATH_TYPE': self.kernel_type
})

# At least 2 blocks per SM
max_shm = min(self.convolution_row.max_dynamic_shared_size_bytes,
self.convolution_col.max_dynamic_shared_size_bytes)
# 1 kB is reserved by the device
self.max_shared_per_block = (max_shm - 1024) // 2
self.max_shared_per_block_complex = self.max_shared_per_block / 2 * np.dtype(np.float32).itemsize
self.max_kernel_radius = int(self.max_shared_per_block_complex / self.blockdim_y)

# pre-allocate kernel memory on gpu, with max-radius to accommodate
dtype=np.float32 if self.kernel_type == 'float' else np.float64
self.kernel_gpu = gpuarray.empty((self.max_kernel_radius,), dtype=dtype)
# keep track of previous radius and std to determine if we need to transfer again
self.r = 0
self.std = 0


def convolution(self, data, mfs, tmp=None):
"""
Calculates a stacked 2D convolution for smoothing, with the standard deviations
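
Finally, a quick, hypothetical way to check that the opt-in took effect end to end (not part of the PR; it assumes the import paths shown in this diff and a working CUDA setup):

```python
from ptypy.accelerate.cuda_pycuda import get_context
from ptypy.accelerate.cuda_pycuda.array_utils import GaussianSmoothingKernel
from pycuda._driver import function_attribute

# Set up ptypy's PyCUDA context and stream, then build the smoothing kernel.
context, queue = get_context(new_queue=True)
gsk = GaussianSmoothingKernel(queue=queue)

# On devices whose opt-in shared memory exceeds 48 kB, both the per-kernel
# limit and the maximum smoothing radius should now be larger than before.
limit = gsk.convolution_row.get_attribute(
    function_attribute.MAX_DYNAMIC_SHARED_SIZE_BYTES)
print(limit, gsk.max_kernel_radius)
```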