diff --git a/cuda_core/cuda/core/experimental/_launcher.py b/cuda_core/cuda/core/experimental/_launcher.py
index 614fabcf..4b9533cb 100644
--- a/cuda_core/cuda/core/experimental/_launcher.py
+++ b/cuda_core/cuda/core/experimental/_launcher.py
@@ -3,6 +3,7 @@
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
 from dataclasses import dataclass
+import importlib.metadata
 from typing import Optional, Union
 
 import numpy as np
@@ -15,10 +16,30 @@
 from cuda.core.experimental._utils import CUDAError, check_or_create_options, handle_return
 
 
+# TODO: revisit this treatment for py313t builds
+_inited = False
+_use_ex = None
+
+
+def _lazy_init():
+    global _inited
+    if _inited:
+        return
+
+    global _use_ex
+    # binding availability depends on cuda-python version
+    _py_major_minor = tuple(int(v) for v in (
+        importlib.metadata.version("cuda-python").split(".")[:2]))
+    _driver_ver = handle_return(cuda.cuDriverGetVersion())
+    _use_ex = (_driver_ver >= 11080) and (_py_major_minor >= (11, 8))
+    _inited = True
+
+
 @dataclass
 class LaunchConfig:
     """
     """
+    # TODO: expand LaunchConfig to include other attributes
     grid: Union[tuple, int] = None
     block: Union[tuple, int] = None
     stream: Stream = None
@@ -39,6 +60,8 @@ def __post_init__(self):
         if self.shmem_size is None:
             self.shmem_size = 0
 
+        _lazy_init()
+
     def _cast_to_3_tuple(self, cfg):
         if isinstance(cfg, int):
             if cfg < 1:
@@ -67,24 +90,34 @@ def launch(kernel, config, *kernel_args):
     if not isinstance(kernel, Kernel):
         raise ValueError
     config = check_or_create_options(LaunchConfig, config, "launch config")
+    if config.stream is None:
+        raise CUDAError("stream cannot be None")
+
+    # TODO: can we ensure kernel_args is valid/safe to use here?
+    # TODO: merge with HelperKernelParams?
+    kernel_args = ParamHolder(kernel_args)
+    args_ptr = kernel_args.ptr
 
-    driver_ver = handle_return(cuda.cuDriverGetVersion())
-    if driver_ver >= 12000:
+    # Note: CUkernel can still be launched via the old cuLaunchKernel and we do not care
+    # about the CUfunction/CUkernel difference (which depends on whether the "old" or
+    # "new" module loading APIs are in use). We check both binding & driver versions here
+    # mainly to see if the "Ex" API is available and if so we use it, as it's more feature
+    # rich.
+    if _use_ex:
         drv_cfg = cuda.CUlaunchConfig()
         drv_cfg.gridDimX, drv_cfg.gridDimY, drv_cfg.gridDimZ = config.grid
         drv_cfg.blockDimX, drv_cfg.blockDimY, drv_cfg.blockDimZ = config.block
-        if config.stream is None:
-            raise CUDAError("stream cannot be None")
         drv_cfg.hStream = config.stream._handle
         drv_cfg.sharedMemBytes = config.shmem_size
-        drv_cfg.numAttrs = 0  # FIXME
-
-        # TODO: merge with HelperKernelParams?
-        kernel_args = ParamHolder(kernel_args)
-        args_ptr = kernel_args.ptr
-
+        drv_cfg.numAttrs = 0  # TODO
         handle_return(cuda.cuLaunchKernelEx(
             drv_cfg, int(kernel._handle), args_ptr, 0))
     else:
-        raise NotImplementedError("TODO")
+        # TODO: check if config has any unsupported attrs
+        handle_return(cuda.cuLaunchKernel(
+            int(kernel._handle),
+            *config.grid,
+            *config.block,
+            config.shmem_size,
+            config.stream._handle,
+            args_ptr, 0))
diff --git a/cuda_core/cuda/core/experimental/_module.py b/cuda_core/cuda/core/experimental/_module.py
index a179faf8..60d4db97 100644
--- a/cuda_core/cuda/core/experimental/_module.py
+++ b/cuda_core/cuda/core/experimental/_module.py
@@ -2,16 +2,13 @@
 #
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
+import importlib.metadata
+
 from cuda import cuda, cudart
 from cuda.core.experimental._utils import handle_return
 
 
 _backend = {
-    "new": {
-        "file": cuda.cuLibraryLoadFromFile,
-        "data": cuda.cuLibraryLoadData,
-        "kernel": cuda.cuLibraryGetKernel,
-    },
     "old": {
         "file": cuda.cuModuleLoad,
         "data": cuda.cuModuleLoadDataEx,
@@ -20,6 +17,34 @@
 }
 
 
+# TODO: revisit this treatment for py313t builds
+_inited = False
+_py_major_ver = None
+_driver_ver = None
+_kernel_ctypes = None
+
+
+def _lazy_init():
+    global _inited
+    if _inited:
+        return
+
+    global _py_major_ver, _driver_ver, _kernel_ctypes
+    # binding availability depends on cuda-python version
+    _py_major_ver = int(importlib.metadata.version("cuda-python").split(".")[0])
+    if _py_major_ver >= 12:
+        _backend["new"] = {
+            "file": cuda.cuLibraryLoadFromFile,
+            "data": cuda.cuLibraryLoadData,
+            "kernel": cuda.cuLibraryGetKernel,
+        }
+        _kernel_ctypes = (cuda.CUfunction, cuda.CUkernel)
+    else:
+        _kernel_ctypes = (cuda.CUfunction,)
+    _driver_ver = handle_return(cuda.cuDriverGetVersion())
+    _inited = True
+
+
 class Kernel:
 
     __slots__ = ("_handle", "_module",)
@@ -29,13 +54,15 @@ def __init__(self):
 
     @staticmethod
     def _from_obj(obj, mod):
-        assert isinstance(obj, (cuda.CUkernel, cuda.CUfunction))
+        assert isinstance(obj, _kernel_ctypes)
         assert isinstance(mod, ObjectCode)
         ker = Kernel.__new__(Kernel)
         ker._handle = obj
         ker._module = mod
         return ker
 
+    # TODO: implement from_handle()
+
 
 class ObjectCode:
 
@@ -46,13 +73,16 @@ def __init__(self, module, code_type, jit_options=None, *,
                  symbol_mapping=None):
         if code_type not in self._supported_code_type:
             raise ValueError
+        _lazy_init()
         self._handle = None
 
-        driver_ver = handle_return(cuda.cuDriverGetVersion())
-        self._loader = _backend["new"] if driver_ver >= 12000 else _backend["old"]
+        backend = "new" if (_py_major_ver >= 12 and _driver_ver >= 12000) else "old"
+        self._loader = _backend[backend]
 
         if isinstance(module, str):
-            if driver_ver < 12000 and jit_options is not None:
+            # TODO: this option is only taken by the new library APIs, but we have
+            # a bug that we can't easily support it just yet (NVIDIA/cuda-python#73).
+            if jit_options is not None:
                 raise ValueError
             module = module.encode()
             self._handle = handle_return(self._loader["file"](module))
@@ -60,12 +90,12 @@
             assert isinstance(module, bytes)
             if jit_options is None:
                 jit_options = {}
-            if driver_ver >= 12000:
+            if backend == "new":
                 args = (module, list(jit_options.keys()), list(jit_options.values()), len(jit_options),
                         # TODO: support library options
                         [], [], 0)
-            else:
-                args = (module, len(jit_options), jit_options.keys(), jit_options.values())
+            else:  # "old" backend
+                args = (module, len(jit_options), list(jit_options.keys()), list(jit_options.values()))
             self._handle = handle_return(self._loader["data"](*args))
 
         self._code_type = code_type
@@ -83,3 +113,5 @@ def get_kernel(self, name):
         name = name.encode()
         data = handle_return(self._loader["kernel"](self._handle, name))
         return Kernel._from_obj(data, self)
+
+    # TODO: implement from_handle()
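For review context, a minimal sketch of how the two dispatch paths above are exercised end to end. This is illustrative only: the saxpy kernel name, the saxpy.cubin file, and the private-module imports are assumptions, not part of this patch.

# Hypothetical usage (not part of this patch); assumes a prebuilt saxpy.cubin
# and an installed CUDA driver + cuda-python binding. Which driver API gets
# used is decided lazily by the _lazy_init() helpers added above.
from cuda.core.experimental import Device
from cuda.core.experimental._launcher import LaunchConfig, launch
from cuda.core.experimental._module import ObjectCode

dev = Device()
dev.set_current()
stream = dev.create_stream()

# ObjectCode selects the cuLibrary* ("new") loader only when both the
# cuda-python binding is 12+ and the driver reports >= 12.0; otherwise it
# falls back to the cuModule* ("old") loader.
with open("saxpy.cubin", "rb") as f:
    mod = ObjectCode(f.read(), "cubin")
ker = mod.get_kernel("saxpy")

# launch() takes the cuLaunchKernelEx path when _use_ex is True (driver and
# binding both >= 11.8); otherwise it uses plain cuLaunchKernel.
config = LaunchConfig(grid=32, block=128, stream=stream, shmem_size=0)
launch(ker, config)  # kernel arguments, if any, would follow config

Note that the "old" data-loading branch now wraps the jit_options keys and values in list(), since cuModuleLoadDataEx expects sequences rather than dict views.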