Merge pull request #188 from leofang/cuda_core_11
Support JIT compilation for CUDA driver & bindings 11.x
leofang authored Oct 28, 2024
2 parents 5066c9f + 74de685 commit e426810
Showing 2 changed files with 88 additions and 23 deletions.
55 changes: 44 additions & 11 deletions cuda_core/cuda/core/experimental/_launcher.py
@@ -3,6 +3,7 @@
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
 from dataclasses import dataclass
+import importlib.metadata
 from typing import Optional, Union
 
 import numpy as np
@@ -15,10 +16,30 @@
 from cuda.core.experimental._utils import CUDAError, check_or_create_options, handle_return
 
 
+# TODO: revisit this treatment for py313t builds
+_inited = False
+_use_ex = None
+
+
+def _lazy_init():
+    global _inited
+    if _inited:
+        return
+
+    global _use_ex
+    # binding availability depends on cuda-python version
+    _py_major_minor = tuple(int(v) for v in (
+        importlib.metadata.version("cuda-python").split(".")[:2]))
+    _driver_ver = handle_return(cuda.cuDriverGetVersion())
+    _use_ex = (_driver_ver >= 11080) and (_py_major_minor >= (11, 8))
+    _inited = True
+
+
 @dataclass
 class LaunchConfig:
     """
     """
+    # TODO: expand LaunchConfig to include other attributes
     grid: Union[tuple, int] = None
     block: Union[tuple, int] = None
     stream: Stream = None
@@ -39,6 +60,8 @@ def __post_init__(self):
         if self.shmem_size is None:
             self.shmem_size = 0
 
+        _lazy_init()
+
     def _cast_to_3_tuple(self, cfg):
         if isinstance(cfg, int):
             if cfg < 1:
@@ -67,24 +90,34 @@ def launch(kernel, config, *kernel_args):
     if not isinstance(kernel, Kernel):
         raise ValueError
     config = check_or_create_options(LaunchConfig, config, "launch config")
+    if config.stream is None:
+        raise CUDAError("stream cannot be None")
+
+    # TODO: can we ensure kernel_args is valid/safe to use here?
+    # TODO: merge with HelperKernelParams?
+    kernel_args = ParamHolder(kernel_args)
+    args_ptr = kernel_args.ptr
 
-    driver_ver = handle_return(cuda.cuDriverGetVersion())
-    if driver_ver >= 12000:
+    # Note: CUkernel can still be launched via the old cuLaunchKernel and we do not care
+    # about the CUfunction/CUkernel difference (which depends on whether the "old" or
+    # "new" module loading APIs are in use). We check both binding & driver versions here
+    # mainly to see if the "Ex" API is available and if so we use it, as it's more feature
+    # rich.
+    if _use_ex:
         drv_cfg = cuda.CUlaunchConfig()
         drv_cfg.gridDimX, drv_cfg.gridDimY, drv_cfg.gridDimZ = config.grid
         drv_cfg.blockDimX, drv_cfg.blockDimY, drv_cfg.blockDimZ = config.block
-        if config.stream is None:
-            raise CUDAError("stream cannot be None")
         drv_cfg.hStream = config.stream._handle
         drv_cfg.sharedMemBytes = config.shmem_size
-        drv_cfg.numAttrs = 0 # FIXME
-
-        # TODO: merge with HelperKernelParams?
-        kernel_args = ParamHolder(kernel_args)
-        args_ptr = kernel_args.ptr
-
+        drv_cfg.numAttrs = 0 # TODO
         handle_return(cuda.cuLaunchKernelEx(
             drv_cfg, int(kernel._handle), args_ptr, 0))
     else:
-        raise NotImplementedError("TODO")
+        # TODO: check if config has any unsupported attrs
+        handle_return(cuda.cuLaunchKernel(
+            int(kernel._handle),
+            *config.grid,
+            *config.block,
+            config.shmem_size,
+            config.stream._handle,
+            args_ptr, 0))
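
With both pieces in place, launch() goes through cuLaunchKernelEx only when the installed cuda-python bindings and the driver are both at least 11.8, and otherwise takes the cuLaunchKernel fallback added above. A minimal end-to-end sketch of the API this touches — the kernel source, the names, and the Device/Program/Stream calls are illustrative assumptions, not part of this diff:

from cuda.core.experimental import Device, LaunchConfig, Program, launch

dev = Device()
dev.set_current()
stream = dev.create_stream()

# Hypothetical no-op kernel, for illustration only.
code = 'extern "C" __global__ void noop() {}'
mod = Program(code, code_type="c++").compile("cubin")
ker = mod.get_kernel("noop")

# On bindings & driver >= 11.8 this dispatches to cuLaunchKernelEx;
# otherwise it uses the cuLaunchKernel path added in this commit.
launch(ker, LaunchConfig(grid=1, block=1, stream=stream))
stream.sync()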
56 changes: 44 additions & 12 deletions cuda_core/cuda/core/experimental/_module.py
@@ -2,16 +2,13 @@
 #
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
+import importlib.metadata
+
 from cuda import cuda, cudart
 from cuda.core.experimental._utils import handle_return
 
 
 _backend = {
-    "new": {
-        "file": cuda.cuLibraryLoadFromFile,
-        "data": cuda.cuLibraryLoadData,
-        "kernel": cuda.cuLibraryGetKernel,
-    },
     "old": {
         "file": cuda.cuModuleLoad,
         "data": cuda.cuModuleLoadDataEx,
@@ -20,6 +17,34 @@
 }
 
 
+# TODO: revisit this treatment for py313t builds
+_inited = False
+_py_major_ver = None
+_driver_ver = None
+_kernel_ctypes = None
+
+
+def _lazy_init():
+    global _inited
+    if _inited:
+        return
+
+    global _py_major_ver, _driver_ver, _kernel_ctypes
+    # binding availability depends on cuda-python version
+    _py_major_ver = int(importlib.metadata.version("cuda-python").split(".")[0])
+    if _py_major_ver >= 12:
+        _backend["new"] = {
+            "file": cuda.cuLibraryLoadFromFile,
+            "data": cuda.cuLibraryLoadData,
+            "kernel": cuda.cuLibraryGetKernel,
+        }
+        _kernel_ctypes = (cuda.CUfunction, cuda.CUkernel)
+    else:
+        _kernel_ctypes = (cuda.CUfunction,)
+    _driver_ver = handle_return(cuda.cuDriverGetVersion())
+    _inited = True
+
+
 class Kernel:
 
     __slots__ = ("_handle", "_module",)
@@ -29,13 +54,15 @@ def __init__(self):
 
     @staticmethod
     def _from_obj(obj, mod):
-        assert isinstance(obj, (cuda.CUkernel, cuda.CUfunction))
+        assert isinstance(obj, _kernel_ctypes)
         assert isinstance(mod, ObjectCode)
         ker = Kernel.__new__(Kernel)
         ker._handle = obj
         ker._module = mod
         return ker
 
+    # TODO: implement from_handle()
+
 
 class ObjectCode:
 
@@ -46,26 +73,29 @@ def __init__(self, module, code_type, jit_options=None, *,
                  symbol_mapping=None):
         if code_type not in self._supported_code_type:
             raise ValueError
+        _lazy_init()
         self._handle = None
 
-        driver_ver = handle_return(cuda.cuDriverGetVersion())
-        self._loader = _backend["new"] if driver_ver >= 12000 else _backend["old"]
+        backend = "new" if (_py_major_ver >= 12 and _driver_ver >= 12000) else "old"
+        self._loader = _backend[backend]
 
         if isinstance(module, str):
-            if driver_ver < 12000 and jit_options is not None:
+            # TODO: this option is only taken by the new library APIs, but we have
+            # a bug that we can't easily support it just yet (NVIDIA/cuda-python#73).
+            if jit_options is not None:
                 raise ValueError
             module = module.encode()
             self._handle = handle_return(self._loader["file"](module))
         else:
             assert isinstance(module, bytes)
             if jit_options is None:
                 jit_options = {}
-            if driver_ver >= 12000:
+            if backend == "new":
                 args = (module, list(jit_options.keys()), list(jit_options.values()), len(jit_options),
                         # TODO: support library options
                         [], [], 0)
-            else:
-                args = (module, len(jit_options), jit_options.keys(), jit_options.values())
+            else: # "old" backend
+                args = (module, len(jit_options), list(jit_options.keys()), list(jit_options.values()))
             self._handle = handle_return(self._loader["data"](*args))
 
         self._code_type = code_type
@@ -83,3 +113,5 @@ def get_kernel(self, name):
             name = name.encode()
         data = handle_return(self._loader["kernel"](self._handle, name))
         return Kernel._from_obj(data, self)
+
+    # TODO: implement from_handle()
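
On the module side, the backend selection means that under CUDA 11.x (bindings or driver), ObjectCode JIT-loads in-memory code through cuModuleLoadDataEx rather than the cuLibrary* APIs. A hedged sketch of that flow, assuming a current context has already been set up (e.g. via Device().set_current()) and that "ptx" is among ObjectCode._supported_code_type; the PTX body is a made-up placeholder, not output from this commit:

from cuda.core.experimental._module import ObjectCode

# Placeholder PTX for illustration; in practice it would come from
# Program.compile("ptx") or an offline build.
ptx = b"""
.version 7.8
.target sm_80
.address_size 64
.visible .entry noop() { ret; }
"""

# With 11.x bindings or driver this resolves to the "old" backend, so the
# driver JIT-compiles the PTX for the current device via cuModuleLoadDataEx.
mod = ObjectCode(ptx, "ptx")
ker = mod.get_kernel("noop")  # resolved through the "old" backend's kernel loader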
