Skip to content

Commit

Permalink
Merge pull request #237 from NVIDIA/ksimpson/add_program_options
Browse files Browse the repository at this point in the history
Add the options data class to program
  • Loading branch information
leofang authored Jan 19, 2025
2 parents fc8188a + c7cb7eb commit d62c7fa
Show file tree
Hide file tree
Showing 19 changed files with 574 additions and 131 deletions.
2 changes: 1 addition & 1 deletion cuda_core/cuda/core/experimental/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from cuda.core.experimental._event import EventOptions
from cuda.core.experimental._launcher import LaunchConfig, launch
from cuda.core.experimental._linker import Linker, LinkerOptions
from cuda.core.experimental._program import Program
from cuda.core.experimental._program import Program, ProgramOptions
from cuda.core.experimental._stream import Stream, StreamOptions
from cuda.core.experimental._system import System

Expand Down
13 changes: 8 additions & 5 deletions cuda_core/cuda/core/experimental/_linker.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from dataclasses import dataclass
from typing import List, Optional

from cuda.core.experimental._device import Device
from cuda.core.experimental._module import ObjectCode
from cuda.core.experimental._utils import check_or_create_options, driver, handle_return

Expand Down Expand Up @@ -91,10 +92,10 @@ class LinkerOptions:
Attributes
----------
arch : str
Pass the SM architecture value, such as ``-arch=sm_<CC>`` (for generating CUBIN) or
``compute_<CC>`` (for generating PTX).
This is a required option.
arch : str, optional
Pass the SM architecture value, such as ``sm_<CC>`` (for generating CUBIN) or
``compute_<CC>`` (for generating PTX). If not provided, the current device's architecture
will be used.
max_register_count : int, optional
Maximum register count.
Maps to: ``-maxrregcount=<N>``.
Expand Down Expand Up @@ -172,7 +173,7 @@ class LinkerOptions:
Default: False.
"""

arch: str
arch: Optional[str] = None
max_register_count: Optional[int] = None
time: Optional[bool] = None
verbose: Optional[bool] = None
Expand Down Expand Up @@ -204,6 +205,8 @@ def __post_init__(self):
def _init_nvjitlink(self):
if self.arch is not None:
self.formatted_options.append(f"-arch={self.arch}")
else:
self.formatted_options.append("-arch=sm_" + "".join(f"{i}" for i in Device().compute_capability))
if self.max_register_count is not None:
self.formatted_options.append(f"-maxrregcount={self.max_register_count}")
if self.time is not None:
Expand Down
384 changes: 372 additions & 12 deletions cuda_core/cuda/core/experimental/_program.py

Large diffs are not rendered by default.

22 changes: 22 additions & 0 deletions cuda_core/cuda/core/experimental/_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import functools
import importlib.metadata
from collections import namedtuple
from collections.abc import Sequence
from typing import Callable, Dict

try:
Expand Down Expand Up @@ -93,6 +94,13 @@ def check_or_create_options(cls, options, options_description, *, keep_none=Fals
return options


def _handle_boolean_option(option: bool) -> str:
"""
Convert a boolean option to a string representation.
"""
return "true" if bool(option) else "false"


def precondition(checker: Callable[..., None], what: str = "") -> Callable:
"""
A decorator that adds checks to ensure any preconditions are met.
Expand Down Expand Up @@ -142,6 +150,20 @@ def get_device_from_ctx(ctx_handle) -> int:
return device_id


def is_sequence(obj):
    """
    Return True when *obj* implements the :class:`collections.abc.Sequence` ABC
    (e.g. list or tuple).

    Note that ``str`` and ``bytes`` are registered sequences and therefore also
    satisfy this check.
    """
    return isinstance(obj, Sequence)


def is_nested_sequence(obj):
    """
    Return True when *obj* is a nested sequence, i.e. a sequence containing at
    least one element that is itself a sequence (list, tuple, str, ...).
    """
    # Non-sequences can never be nested sequences.
    if not isinstance(obj, Sequence):
        return False
    return any(isinstance(item, Sequence) for item in obj)


def get_binding_version():
try:
major_minor = importlib.metadata.version("cuda-bindings").split(".")[:2]
Expand Down
1 change: 1 addition & 0 deletions cuda_core/docs/source/api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ CUDA compilation toolchain

:template: dataclass.rst

ProgramOptions
LinkerOptions


Expand Down
2 changes: 1 addition & 1 deletion cuda_core/docs/source/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ and other functionalities.
:maxdepth: 2
:caption: Contents:

release.md
release.rst
install.md
interoperability.rst
api.rst
Expand Down
12 changes: 0 additions & 12 deletions cuda_core/docs/source/release.md

This file was deleted.

9 changes: 9 additions & 0 deletions cuda_core/docs/source/release.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
Release Notes
=============

.. toctree::
:maxdepth: 3

release/0.2.0-notes
release/0.1.1-notes
release/0.1.0-notes
Original file line number Diff line number Diff line change
@@ -1,17 +1,21 @@
# `cuda.core` v0.1.0 Release notes
``cuda.core`` 0.1.0 Release Notes
=================================

Released on Nov 8, 2024

## Hightlights
Highlights
----------

- Initial beta release
- Supports all platforms that CUDA is supported
- Supports all CUDA 11.x/12.x drivers
- Supports all CUDA 11.x/12.x Toolkits
- Pythonic CUDA runtime and other core functionalities

## Limitations
Limitations
-----------

- All APIs are currently *experimental* and subject to change without deprecation notice.
Please kindly share your feedbacks with us so that we can make `cuda.core` better!
Please kindly share your feedback with us so that we can make ``cuda.core`` better!
- Source code release only; ``pip``/``conda`` support is coming in a future release
- Windows TCC mode is [not yet supported](https://github.com/NVIDIA/cuda-python/issues/206)
- Windows TCC mode is `not yet supported <https://github.com/NVIDIA/cuda-python/issues/206>`_
43 changes: 0 additions & 43 deletions cuda_core/docs/source/release/0.1.1-notes.md

This file was deleted.

51 changes: 51 additions & 0 deletions cuda_core/docs/source/release/0.1.1-notes.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
.. currentmodule:: cuda.core.experimental

``cuda.core`` 0.1.1 Release Notes
=================================

Released on Dec 20, 2024

Highlights
----------

- Add :obj:`~utils.StridedMemoryView` and :func:`~utils.args_viewable_as_strided_memory` that provide a concrete
implementation of DLPack & CUDA Array Interface supports.
- Add :obj:`~Linker` that can link one or multiple :obj:`~_module.ObjectCode` instances generated by :obj:`~Program`. Under
the hood, it uses either the nvJitLink or driver (``cuLink*``) APIs depending on the CUDA version
detected in the current environment.
- Support ``pip install cuda-core``. Please see the Installation Guide for further details.

New features
------------

- Add a :obj:`cuda.core.experimental.system` module for querying system- or process-wide information.
- Add :obj:`~LaunchConfig.cluster` to support thread block clusters on Hopper GPUs.

Enhancements
------------

- The internal handle held by :obj:`~_module.ObjectCode` is now lazily initialized upon first touch.
- Support TCC devices with a default synchronous memory resource to avoid the use of memory pools.
- Ensure ``"ltoir"`` is a valid code type to :obj:`~_module.ObjectCode`.
- Document the ``__cuda_stream__`` protocol.
- Improve test coverage & documentation cross-references.
- Enforce code formatting.

Bug fixes
---------

- Eliminate potential class destruction issues.
- Fix circular import during handling a foreign CUDA stream.

Limitations
-----------

- All APIs are currently *experimental* and subject to change without deprecation notice.
Please kindly share your feedback with us so that we can make ``cuda.core`` better!
- Using ``cuda.core`` with NVRTC or nvJitLink installed from PyPI via ``pip install`` is currently
not supported. This will be fixed in a future release.
- Some :class:`~LinkerOptions` are only available when using a modern version of CUDA. When using CUDA <12,
the backend is the cuLink API which supports only a subset of the options that nvjitlink does.
Further, some options aren't available on CUDA versions <12.6.
- To use ``cuda.core`` with Python 3.13, it currently requires building ``cuda-python`` from source
prior to ``pip install``. This extra step will be fixed soon.
23 changes: 19 additions & 4 deletions cuda_core/docs/source/release/0.2.0-notes.rst
Original file line number Diff line number Diff line change
@@ -1,7 +1,22 @@
``cuda.core`` v0.2.0 Release notes
==================================
.. currentmodule:: cuda.core.experimental

Breaking changes
``cuda.core`` 0.2.0 Release Notes
=================================

Released on <TODO>, 2024

Highlights
----------

- Add :class:`~ProgramOptions` to facilitate the passing of runtime compile options to :obj:`~Program`.

Limitations
-----------

- <TODO>

Breaking Changes
----------------

- Change ``__cuda_stream__`` from attribute to method
- Change ``__cuda_stream__`` from attribute to method
- The :meth:`~Program.compile` method no longer accepts the `options` argument. Instead, you can optionally pass an instance of :class:`~ProgramOptions` to the constructor of :obj:`~Program`.
10 changes: 4 additions & 6 deletions cuda_core/examples/saxpy.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

import cupy as cp

from cuda.core.experimental import Device, LaunchConfig, Program, launch
from cuda.core.experimental import Device, LaunchConfig, Program, ProgramOptions, launch

# compute out = a * x + y
code = """
Expand All @@ -29,13 +29,11 @@
s = dev.create_stream()

# prepare program
prog = Program(code, code_type="c++")
arch = "".join(f"{i}" for i in dev.compute_capability)
program_options = ProgramOptions(std="c++11", arch=f"sm_{arch}")
prog = Program(code, code_type="c++", options=program_options)
mod = prog.compile(
"cubin",
options=(
"-std=c++11",
"-arch=sm_" + "".join(f"{i}" for i in dev.compute_capability),
),
logs=sys.stdout,
name_expressions=("saxpy<float>", "saxpy<double>"),
)
Expand Down
11 changes: 4 additions & 7 deletions cuda_core/examples/strided_memory_view.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@
cp = None
import numpy as np

from cuda.core.experimental import Device, LaunchConfig, Program, launch
from cuda.core.experimental import Device, LaunchConfig, Program, ProgramOptions, launch
from cuda.core.experimental.utils import StridedMemoryView, args_viewable_as_strided_memory

# ################################################################################
Expand Down Expand Up @@ -88,16 +88,13 @@
}
}
""").substitute(func_sig=func_sig)
gpu_prog = Program(gpu_code, code_type="c++")

# To know the GPU's compute capability, we need to identify which GPU to use.
dev = Device(0)
dev.set_current()
arch = "".join(f"{i}" for i in dev.compute_capability)
mod = gpu_prog.compile(
target_type="cubin",
# TODO: update this after NVIDIA/cuda-python#237 is merged
options=(f"-arch=sm_{arch}", "-std=c++11"),
)
gpu_prog = Program(gpu_code, code_type="c++", options=ProgramOptions(arch=f"sm_{arch}", std="c++11"))
mod = gpu_prog.compile(target_type="cubin")
gpu_ker = mod.get_kernel(func_name)

# Now we are prepared to run the code from the user's perspective!
Expand Down
12 changes: 6 additions & 6 deletions cuda_core/examples/thread_block_cluster.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import os
import sys

from cuda.core.experimental import Device, LaunchConfig, Program, launch
from cuda.core.experimental import Device, LaunchConfig, Program, ProgramOptions, launch

# prepare include
cuda_path = os.environ.get("CUDA_PATH", os.environ.get("CUDA_HOME"))
Expand Down Expand Up @@ -44,12 +44,12 @@

# prepare program & compile kernel
dev.set_current()
prog = Program(code, code_type="c++")
mod = prog.compile(
target_type="cubin",
# TODO: update this after NVIDIA/cuda-python#237 is merged
options=(f"-arch=sm_{arch}", "-std=c++17", f"-I{cuda_include_path}"),
prog = Program(
code,
code_type="c++",
options=ProgramOptions(arch=f"sm_{arch}", std="c++17", include_path=cuda_include_path),
)
mod = prog.compile(target_type="cubin")
ker = mod.get_kernel("check_cluster_info")

# prepare launch config
Expand Down
15 changes: 5 additions & 10 deletions cuda_core/examples/vector_add.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

import cupy as cp

from cuda.core.experimental import Device, LaunchConfig, Program, launch
from cuda.core.experimental import Device, LaunchConfig, Program, ProgramOptions, launch

# compute c = a + b
code = """
Expand All @@ -26,15 +26,10 @@
s = dev.create_stream()

# prepare program
prog = Program(code, code_type="c++")
mod = prog.compile(
"cubin",
options=(
"-std=c++17",
"-arch=sm_" + "".join(f"{i}" for i in dev.compute_capability),
),
name_expressions=("vector_add<float>",),
)
arch = "".join(f"{i}" for i in dev.compute_capability)
program_options = ProgramOptions(std="c++17", arch=f"sm_{arch}")
prog = Program(code, code_type="c++", options=program_options)
mod = prog.compile("cubin", name_expressions=("vector_add<float>",))

# run in single precision
ker = mod.get_kernel("vector_add<float>")
Expand Down
Loading

0 comments on commit d62c7fa

Please sign in to comment.