diff --git a/cuda_core/cuda/core/experimental/__init__.py b/cuda_core/cuda/core/experimental/__init__.py index 15df70bb..3db9e8ab 100644 --- a/cuda_core/cuda/core/experimental/__init__.py +++ b/cuda_core/cuda/core/experimental/__init__.py @@ -7,7 +7,7 @@ from cuda.core.experimental._event import EventOptions from cuda.core.experimental._launcher import LaunchConfig, launch from cuda.core.experimental._linker import Linker, LinkerOptions -from cuda.core.experimental._program import Program +from cuda.core.experimental._program import Program, ProgramOptions from cuda.core.experimental._stream import Stream, StreamOptions from cuda.core.experimental._system import System diff --git a/cuda_core/cuda/core/experimental/_linker.py b/cuda_core/cuda/core/experimental/_linker.py index 6c2ca4d4..b5a6b675 100644 --- a/cuda_core/cuda/core/experimental/_linker.py +++ b/cuda_core/cuda/core/experimental/_linker.py @@ -9,6 +9,7 @@ from dataclasses import dataclass from typing import List, Optional +from cuda.core.experimental._device import Device from cuda.core.experimental._module import ObjectCode from cuda.core.experimental._utils import check_or_create_options, driver, handle_return @@ -91,10 +92,10 @@ class LinkerOptions: Attributes ---------- - arch : str - Pass the SM architecture value, such as ``-arch=sm_`` (for generating CUBIN) or - ``compute_`` (for generating PTX). - This is a required option. + arch : str, optional + Pass the SM architecture value, such as ``sm_`` (for generating CUBIN) or + ``compute_`` (for generating PTX). If not provided, the current device's architecture + will be used. max_register_count : int, optional Maximum register count. Maps to: ``-maxrregcount=``. @@ -172,7 +173,7 @@ class LinkerOptions: Default: False. """ - arch: str + arch: Optional[str] = None max_register_count: Optional[int] = None time: Optional[bool] = None verbose: Optional[bool] = None @@ -204,6 +205,8 @@ def __post_init__(self): def _init_nvjitlink(self): if self.arch is not None: self.formatted_options.append(f"-arch={self.arch}") + else: + self.formatted_options.append("-arch=sm_" + "".join(f"{i}" for i in Device().compute_capability)) if self.max_register_count is not None: self.formatted_options.append(f"-maxrregcount={self.max_register_count}") if self.time is not None: diff --git a/cuda_core/cuda/core/experimental/_program.py b/cuda_core/cuda/core/experimental/_program.py index 99bc274d..c10fd077 100644 --- a/cuda_core/cuda/core/experimental/_program.py +++ b/cuda_core/cuda/core/experimental/_program.py @@ -1,11 +1,368 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE import weakref +from dataclasses import dataclass +from typing import List, Optional, Tuple, Union +from cuda.core.experimental._device import Device from cuda.core.experimental._module import ObjectCode -from cuda.core.experimental._utils import handle_return, nvrtc +from cuda.core.experimental._utils import ( + _handle_boolean_option, + check_or_create_options, + handle_return, + is_nested_sequence, + is_sequence, + nvrtc, +) + + +@dataclass +class ProgramOptions: + """Customizable options for configuring `Program`. + + Attributes + ---------- + arch : str, optional + Pass the SM architecture value, such as ``sm_`` (for generating CUBIN) or + ``compute_`` (for generating PTX). If not provided, the current device's architecture + will be used. + relocatable_device_code : bool, optional + Enable (disable) the generation of relocatable device code. + Default: False + Maps to: ``--relocatable-device-code={true|false}`` (``-rdc``) + extensible_whole_program : bool, optional + Do extensible whole program compilation of device code. + Default: False + Maps to: ``--extensible-whole-program`` (``-ewp``) + debug : bool, optional + Generate debug information. If --dopt is not specified, then turns off all optimizations. + Default: False + Maps to: ``--device-debug`` (``-G``) + lineinfo: bool, optional + Generate line-number information. + Default: False + Maps to: ``--generate-line-info`` (``-lineinfo``) + device_code_optimize : bool, optional + Enable device code optimization. When specified along with ā€˜-Gā€™, enables limited debug information generation + for optimized device code. + Default: None + Maps to: ``--dopt on`` (``-dopt``) + ptxas_options : Union[str, List[str]], optional + Specify one or more options directly to ptxas, the PTX optimizing assembler. Options should be strings. + For example ["-v", "-O2"]. + Default: None + Maps to: ``--ptxas-options `` (``-Xptxas``) + max_register_count : int, optional + Specify the maximum amount of registers that GPU functions can use. + Default: None + Maps to: ``--maxrregcount=`` (``-maxrregcount``) + ftz : bool, optional + When performing single-precision floating-point operations, flush denormal values to zero or preserve denormal + values. + Default: False + Maps to: ``--ftz={true|false}`` (``-ftz``) + prec_sqrt : bool, optional + For single-precision floating-point square root, use IEEE round-to-nearest mode or use a faster approximation. + Default: True + Maps to: ``--prec-sqrt={true|false}`` (``-prec-sqrt``) + prec_div : bool, optional + For single-precision floating-point division and reciprocals, use IEEE round-to-nearest mode or use a faster + approximation. + Default: True + Maps to: ``--prec-div={true|false}`` (``-prec-div``) + fma : bool, optional + Enables (disables) the contraction of floating-point multiplies and adds/subtracts into floating-point + multiply-add operations. + Default: True + Maps to: ``--fmad={true|false}`` (``-fmad``) + use_fast_math : bool, optional + Make use of fast math operations. + Default: False + Maps to: ``--use_fast_math`` (``-use_fast_math``) + extra_device_vectorization : bool, optional + Enables more aggressive device code vectorization in the NVVM optimizer. + Default: False + Maps to: ``--extra-device-vectorization`` (``-extra-device-vectorization``) + link_time_optimization : bool, optional + Generate intermediate code for later link-time optimization. + Default: False + Maps to: ``--dlink-time-opt`` (``-dlto``) + gen_opt_lto : bool, optional + Run the optimizer passes before generating the LTO IR. + Default: False + Maps to: ``--gen-opt-lto`` (``-gen-opt-lto``) + define_macro : Union[str, Tuple[str, str], List[Union[str, Tuple[str, str]]]], optional + Predefine a macro. Can be either a string, in which case that macro will be set to 1, a 2 element tuple of + strings, in which case the first element is defined as the second, or a list of strings or tuples. + Default: None + Maps to: ``--define-macro=`` (``-D``) + undefine_macro : Union[str, List[str]], optional + Cancel any previous definition of a macro, or list of macros. + Default: None + Maps to: ``--undefine-macro=`` (``-U``) + include_path : Union[str, List[str]], optional + Add the directory or directories to the list of directories to be searched for headers. + Default: None + Maps to: ``--include-path=`` (``-I``) + pre_include : Union[str, List[str]], optional + Preinclude one or more headers during preprocessing. Can be either a string or a list of strings. + Default: None + Maps to: ``--pre-include=
`` (``-include``) + no_source_include : bool, optional + Disable the default behavior of adding the directory of each input source to the include path. + Default: False + Maps to: ``--no-source-include`` (``-no-source-include``) + std : str, optional + Set language dialect to C++03, C++11, C++14, C++17 or C++20. + Default: c++17 + Maps to: ``--std={c++03|c++11|c++14|c++17|c++20}`` (``-std``) + builtin_move_forward : bool, optional + Provide builtin definitions of std::move and std::forward. + Default: True + Maps to: ``--builtin-move-forward={true|false}`` (``-builtin-move-forward``) + builtin_initializer_list : bool, optional + Provide builtin definitions of std::initializer_list class and member functions. + Default: True + Maps to: ``--builtin-initializer-list={true|false}`` (``-builtin-initializer-list``) + disable_warnings : bool, optional + Inhibit all warning messages. + Default: False + Maps to: ``--disable-warnings`` (``-w``) + restrict : bool, optional + Programmer assertion that all kernel pointer parameters are restrict pointers. + Default: False + Maps to: ``--restrict`` (``-restrict``) + device_as_default_execution_space : bool, optional + Treat entities with no execution space annotation as __device__ entities. + Default: False + Maps to: ``--device-as-default-execution-space`` (``-default-device``) + device_int128 : bool, optional + Allow the __int128 type in device code. + Default: False + Maps to: ``--device-int128`` (``-device-int128``) + optimization_info : str, optional + Provide optimization reports for the specified kind of optimization. + Default: None + Maps to: ``--optimization-info=`` (``-opt-info``) + no_display_error_number : bool, optional + Disable the display of a diagnostic number for warning messages. + Default: False + Maps to: ``--no-display-error-number`` (``-no-err-no``) + diag_error : Union[int, List[int]], optional + Emit error for a specified diagnostic message number or comma separated list of numbers. + Default: None + Maps to: ``--diag-error=, ...`` (``-diag-error``) + diag_suppress : Union[int, List[int]], optional + Suppress a specified diagnostic message number or comma separated list of numbers. + Default: None + Maps to: ``--diag-suppress=,ā€¦`` (``-diag-suppress``) + diag_warn : Union[int, List[int]], optional + Emit warning for a specified diagnostic message number or comma separated lis of numbers. + Default: None + Maps to: ``--diag-warn=,ā€¦`` (``-diag-warn``) + brief_diagnostics : bool, optional + Disable or enable showing source line and column info in a diagnostic. + Default: False + Maps to: ``--brief-diagnostics={true|false}`` (``-brief-diag``) + time : str, optional + Generate a CSV table with the time taken by each compilation phase. + Default: None + Maps to: ``--time=`` (``-time``) + split_compile : int, optional + Perform compiler optimizations in parallel. + Default: 1 + Maps to: ``--split-compile= `` (``-split-compile``) + fdevice_syntax_only : bool, optional + Ends device compilation after front-end syntax checking. + Default: False + Maps to: ``--fdevice-syntax-only`` (``-fdevice-syntax-only``) + minimal : bool, optional + Omit certain language features to reduce compile time for small programs. + Default: False + Maps to: ``--minimal`` (``-minimal``) + """ + + arch: Optional[str] = None + relocatable_device_code: Optional[bool] = None + extensible_whole_program: Optional[bool] = None + debug: Optional[bool] = None + lineinfo: Optional[bool] = None + device_code_optimize: Optional[bool] = None + ptxas_options: Optional[Union[str, List[str], Tuple[str]]] = None + max_register_count: Optional[int] = None + ftz: Optional[bool] = None + prec_sqrt: Optional[bool] = None + prec_div: Optional[bool] = None + fma: Optional[bool] = None + use_fast_math: Optional[bool] = None + extra_device_vectorization: Optional[bool] = None + link_time_optimization: Optional[bool] = None + gen_opt_lto: Optional[bool] = None + define_macro: Optional[ + Union[str, Tuple[str, str], List[Union[str, Tuple[str, str]]], Tuple[Union[str, Tuple[str, str]]]] + ] = None + undefine_macro: Optional[Union[str, List[str], Tuple[str]]] = None + include_path: Optional[Union[str, List[str], Tuple[str]]] = None + pre_include: Optional[Union[str, List[str], Tuple[str]]] = None + no_source_include: Optional[bool] = None + std: Optional[str] = None + builtin_move_forward: Optional[bool] = None + builtin_initializer_list: Optional[bool] = None + disable_warnings: Optional[bool] = None + restrict: Optional[bool] = None + device_as_default_execution_space: Optional[bool] = None + device_int128: Optional[bool] = None + optimization_info: Optional[str] = None + no_display_error_number: Optional[bool] = None + diag_error: Optional[Union[int, List[int], Tuple[int]]] = None + diag_suppress: Optional[Union[int, List[int], Tuple[int]]] = None + diag_warn: Optional[Union[int, List[int], Tuple[int]]] = None + brief_diagnostics: Optional[bool] = None + time: Optional[str] = None + split_compile: Optional[int] = None + fdevice_syntax_only: Optional[bool] = None + minimal: Optional[bool] = None + + def __post_init__(self): + self._formatted_options = [] + if self.arch is not None: + self._formatted_options.append(f"--gpu-architecture={self.arch}") + else: + self._formatted_options.append( + "--gpu-architecture=sm_" + "".join(f"{i}" for i in Device().compute_capability) + ) + if self.relocatable_device_code is not None: + self._formatted_options.append( + f"--relocatable-device-code={_handle_boolean_option(self.relocatable_device_code)}" + ) + if self.extensible_whole_program is not None and self.extensible_whole_program: + self._formatted_options.append("--extensible-whole-program") + if self.debug is not None and self.debug: + self._formatted_options.append("--device-debug") + if self.lineinfo is not None and self.lineinfo: + self._formatted_options.append("--generate-line-info") + if self.device_code_optimize is not None: + self._formatted_options.append(f"--dopt={'on' if self.device_code_optimize else 'off'}") + if self.ptxas_options is not None: + self._formatted_options.append("--ptxas-options") + if isinstance(self.ptxas_options, str): + self._formatted_options.append(self.ptxas_options) + elif is_sequence(self.ptxas_options): + for option in self.ptxas_options: + self._formatted_options.append(option) + if self.max_register_count is not None: + self._formatted_options.append(f"--maxrregcount={self.max_register_count}") + if self.ftz is not None: + self._formatted_options.append(f"--ftz={_handle_boolean_option(self.ftz)}") + if self.prec_sqrt is not None: + self._formatted_options.append(f"--prec-sqrt={_handle_boolean_option(self.prec_sqrt)}") + if self.prec_div is not None: + self._formatted_options.append(f"--prec-div={_handle_boolean_option(self.prec_div)}") + if self.fma is not None: + self._formatted_options.append(f"--fmad={_handle_boolean_option(self.fma)}") + if self.use_fast_math is not None and self.use_fast_math: + self._formatted_options.append("--use_fast_math") + if self.extra_device_vectorization is not None and self.extra_device_vectorization: + self._formatted_options.append("--extra-device-vectorization") + if self.link_time_optimization is not None and self.link_time_optimization: + self._formatted_options.append("--dlink-time-opt") + if self.gen_opt_lto is not None and self.gen_opt_lto: + self._formatted_options.append("--gen-opt-lto") + if self.define_macro is not None: + if isinstance(self.define_macro, str): + self._formatted_options.append(f"--define-macro={self.define_macro}") + elif isinstance(self.define_macro, tuple): + assert len(self.define_macro) == 2 + self._formatted_options.append(f"--define-macro={self.define_macro[0]}={self.define_macro[1]}") + elif is_nested_sequence(self.define_macro): + for macro in self.define_macro: + if isinstance(macro, tuple): + assert len(macro) == 2 + self._formatted_options.append(f"--define-macro={macro[0]}={macro[1]}") + else: + self._formatted_options.append(f"--define-macro={macro}") + + if self.undefine_macro is not None: + if isinstance(self.undefine_macro, str): + self._formatted_options.append(f"--undefine-macro={self.undefine_macro}") + elif is_sequence(self.undefine_macro): + for macro in self.undefine_macro: + self._formatted_options.append(f"--undefine-macro={macro}") + if self.include_path is not None: + if isinstance(self.include_path, str): + self._formatted_options.append(f"--include-path={self.include_path}") + elif is_sequence(self.include_path): + for path in self.include_path: + self._formatted_options.append(f"--include-path={path}") + if self.pre_include is not None: + if isinstance(self.pre_include, str): + self._formatted_options.append(f"--pre-include={self.pre_include}") + elif is_sequence(self.pre_include): + for header in self.pre_include: + self._formatted_options.append(f"--pre-include={header}") + + if self.no_source_include is not None and self.no_source_include: + self._formatted_options.append("--no-source-include") + if self.std is not None: + self._formatted_options.append(f"--std={self.std}") + if self.builtin_move_forward is not None: + self._formatted_options.append( + f"--builtin-move-forward={_handle_boolean_option(self.builtin_move_forward)}" + ) + if self.builtin_initializer_list is not None: + self._formatted_options.append( + f"--builtin-initializer-list={_handle_boolean_option(self.builtin_initializer_list)}" + ) + if self.disable_warnings is not None and self.disable_warnings: + self._formatted_options.append("--disable-warnings") + if self.restrict is not None and self.restrict: + self._formatted_options.append("--restrict") + if self.device_as_default_execution_space is not None and self.device_as_default_execution_space: + self._formatted_options.append("--device-as-default-execution-space") + if self.device_int128 is not None and self.device_int128: + self._formatted_options.append("--device-int128") + if self.optimization_info is not None: + self._formatted_options.append(f"--optimization-info={self.optimization_info}") + if self.no_display_error_number is not None and self.no_display_error_number: + self._formatted_options.append("--no-display-error-number") + if self.diag_error is not None: + if isinstance(self.diag_error, int): + self._formatted_options.append(f"--diag-error={self.diag_error}") + elif is_sequence(self.diag_error): + for error in self.diag_error: + self._formatted_options.append(f"--diag-error={error}") + if self.diag_suppress is not None: + if isinstance(self.diag_suppress, int): + self._formatted_options.append(f"--diag-suppress={self.diag_suppress}") + elif is_sequence(self.diag_suppress): + for suppress in self.diag_suppress: + self._formatted_options.append(f"--diag-suppress={suppress}") + if self.diag_warn is not None: + if isinstance(self.diag_warn, int): + self._formatted_options.append(f"--diag-warn={self.diag_warn}") + elif is_sequence(self.diag_warn): + for warn in self.diag_warn: + self._formatted_options.append(f"--diag-warn={warn}") + if self.brief_diagnostics is not None: + self._formatted_options.append(f"--brief-diagnostics={_handle_boolean_option(self.brief_diagnostics)}") + if self.time is not None: + self._formatted_options.append(f"--time={self.time}") + if self.split_compile is not None: + self._formatted_options.append(f"--split-compile={self.split_compile}") + if self.fdevice_syntax_only is not None and self.fdevice_syntax_only: + self._formatted_options.append("--fdevice-syntax-only") + if self.minimal is not None and self.minimal: + self._formatted_options.append("--minimal") + + def _as_bytes(self): + # TODO: allow tuples once NVIDIA/cuda-python#72 is resolved + return list(o.encode() for o in self._formatted_options) + + def __repr__(self): + # __TODO__ improve this + return self._formatted_options class Program: @@ -22,7 +379,9 @@ class Program: String of the CUDA Runtime Compilation program. code_type : Any String of the code type. Currently only ``"c++"`` is supported. - + options : ProgramOptions, optional + A ProgramOptions object to customize the compilation process. + See :obj:`ProgramOptions` for more information. """ class _MembersNeededForFinalize: @@ -37,13 +396,15 @@ def close(self): handle_return(nvrtc.nvrtcDestroyProgram(self.handle)) self.handle = None - __slots__ = ("__weakref__", "_mnff", "_backend") + __slots__ = ("__weakref__", "_mnff", "_backend", "_options") _supported_code_type = ("c++",) _supported_target_type = ("ptx", "cubin", "ltoir") - def __init__(self, code, code_type): + def __init__(self, code, code_type, options: ProgramOptions = None): self._mnff = Program._MembersNeededForFinalize(self, None) + self._options = options = check_or_create_options(ProgramOptions, options, "Program options") + if code_type not in self._supported_code_type: raise NotImplementedError @@ -61,7 +422,7 @@ def close(self): """Destroy this program.""" self._mnff.close() - def compile(self, target_type, options=(), name_expressions=(), logs=None): + def compile(self, target_type, name_expressions=(), logs=None): """Compile the program with a specific compilation type. Parameters @@ -69,9 +430,6 @@ def compile(self, target_type, options=(), name_expressions=(), logs=None): target_type : Any String of the targeted compilation type. Supported options are "ptx", "cubin" and "ltoir". - options : Union[List, Tuple], optional - List of compilation options associated with the backend - of this :obj:`~_program.Program`. (Default to no options) name_expressions : Union[List, Tuple], optional List of explicit name expressions to become accessible. (Default to no expressions) @@ -93,9 +451,11 @@ def compile(self, target_type, options=(), name_expressions=(), logs=None): if name_expressions: for n in name_expressions: handle_return(nvrtc.nvrtcAddNameExpression(self._mnff.handle, n.encode()), handle=self._mnff.handle) - # TODO: allow tuples once NVIDIA/cuda-python#72 is resolved - options = list(o.encode() for o in options) - handle_return(nvrtc.nvrtcCompileProgram(self._mnff.handle, len(options), options), handle=self._mnff.handle) + options = self._options._as_bytes() + handle_return( + nvrtc.nvrtcCompileProgram(self._mnff.handle, len(options), options), + handle=self._mnff.handle, + ) size_func = getattr(nvrtc, f"nvrtcGet{target_type.upper()}Size") comp_func = getattr(nvrtc, f"nvrtcGet{target_type.upper()}") diff --git a/cuda_core/cuda/core/experimental/_utils.py b/cuda_core/cuda/core/experimental/_utils.py index ff5844a4..3538ae6c 100644 --- a/cuda_core/cuda/core/experimental/_utils.py +++ b/cuda_core/cuda/core/experimental/_utils.py @@ -5,6 +5,7 @@ import functools import importlib.metadata from collections import namedtuple +from collections.abc import Sequence from typing import Callable, Dict try: @@ -93,6 +94,13 @@ def check_or_create_options(cls, options, options_description, *, keep_none=Fals return options +def _handle_boolean_option(option: bool) -> str: + """ + Convert a boolean option to a string representation. + """ + return "true" if bool(option) else "false" + + def precondition(checker: Callable[..., None], what: str = "") -> Callable: """ A decorator that adds checks to ensure any preconditions are met. @@ -142,6 +150,20 @@ def get_device_from_ctx(ctx_handle) -> int: return device_id +def is_sequence(obj): + """ + Check if the given object is a sequence (list or tuple). + """ + return isinstance(obj, Sequence) + + +def is_nested_sequence(obj): + """ + Check if the given object is a nested sequence (list or tuple with atleast one list or tuple element). + """ + return is_sequence(obj) and any(is_sequence(elem) for elem in obj) + + def get_binding_version(): try: major_minor = importlib.metadata.version("cuda-bindings").split(".")[:2] diff --git a/cuda_core/docs/source/api.rst b/cuda_core/docs/source/api.rst index 4b30c6ef..5239174f 100644 --- a/cuda_core/docs/source/api.rst +++ b/cuda_core/docs/source/api.rst @@ -35,6 +35,7 @@ CUDA compilation toolchain :template: dataclass.rst + ProgramOptions LinkerOptions diff --git a/cuda_core/docs/source/index.rst b/cuda_core/docs/source/index.rst index 19c14c2a..1aedfaa8 100644 --- a/cuda_core/docs/source/index.rst +++ b/cuda_core/docs/source/index.rst @@ -8,7 +8,7 @@ and other functionalities. :maxdepth: 2 :caption: Contents: - release.md + release.rst install.md interoperability.rst api.rst diff --git a/cuda_core/docs/source/release.md b/cuda_core/docs/source/release.md deleted file mode 100644 index b38d3260..00000000 --- a/cuda_core/docs/source/release.md +++ /dev/null @@ -1,12 +0,0 @@ -# Release Notes - -```{toctree} ---- -maxdepth: 3 ---- - - 0.2.0 - 0.1.1 - 0.1.0 - -``` diff --git a/cuda_core/docs/source/release.rst b/cuda_core/docs/source/release.rst new file mode 100644 index 00000000..4e74423f --- /dev/null +++ b/cuda_core/docs/source/release.rst @@ -0,0 +1,9 @@ +Release Notes +============= + +.. toctree:: + :maxdepth: 3 + + release/0.2.0-notes + release/0.1.1-notes + release/0.1.0-notes diff --git a/cuda_core/docs/source/release/0.1.0-notes.md b/cuda_core/docs/source/release/0.1.0-notes.rst similarity index 57% rename from cuda_core/docs/source/release/0.1.0-notes.md rename to cuda_core/docs/source/release/0.1.0-notes.rst index 5d1c7fd5..bc327971 100644 --- a/cuda_core/docs/source/release/0.1.0-notes.md +++ b/cuda_core/docs/source/release/0.1.0-notes.rst @@ -1,17 +1,21 @@ -# `cuda.core` v0.1.0 Release notes +``cuda.core`` 0.1.0 Release Notes +================================= Released on Nov 8, 2024 -## Hightlights +Highlights +---------- + - Initial beta release - Supports all platforms that CUDA is supported - Supports all CUDA 11.x/12.x drivers - Supports all CUDA 11.x/12.x Toolkits - Pythonic CUDA runtime and other core functionalities -## Limitations +Limitations +----------- - All APIs are currently *experimental* and subject to change without deprecation notice. - Please kindly share your feedbacks with us so that we can make `cuda.core` better! + Please kindly share your feedback with us so that we can make ``cuda.core`` better! - Source code release only; `pip`/`conda` support is coming in a future release -- Windows TCC mode is [not yet supported](https://github.com/NVIDIA/cuda-python/issues/206) +- Windows TCC mode is `not yet supported `_ diff --git a/cuda_core/docs/source/release/0.1.1-notes.md b/cuda_core/docs/source/release/0.1.1-notes.md deleted file mode 100644 index 9550cb23..00000000 --- a/cuda_core/docs/source/release/0.1.1-notes.md +++ /dev/null @@ -1,43 +0,0 @@ -# `cuda.core` v0.1.1 Release notes - -Released on Dec 20, 2024 - -## Hightlights - -- Add `StridedMemoryView` and `@args_viewable_as_strided_memory` that provide a concrete - implementation of DLPack & CUDA Array Interface supports. -- Add `Linker` that can link one or multiple `ObjectCode` instances generated by `Program`. Under - the hood, it uses either the nvJitLink or driver (`cuLink*`) APIs depending on the CUDA version - detected in the current environment. -- Support `pip install cuda-core`. Please see the Installation Guide for further details. - -## New features - -- Add a `cuda.core.experimental.system` module for querying system- or process- wide information. -- Add `LaunchConfig.cluster` to support thread block clusters on Hopper GPUs. - -## Enhancements - -- The internal handle held by `ObjectCode` is now lazily initialized upon first touch. -- Support TCC devices with a default synchronous memory resource to avoid the use of memory pools. -- Ensure `"ltoir"` is a valid code type to `ObjectCode`. -- Document the `__cuda_stream__` protocol. -- Improve test coverage & documentation cross-references. -- Enforce code formatting. - -## Bug fixes - -- Eliminate potential class destruction issues. -- Fix circular import during handling a foreign CUDA stream. - -## Limitations - -- All APIs are currently *experimental* and subject to change without deprecation notice. - Please kindly share your feedbacks with us so that we can make `cuda.core` better! -- Using `cuda.core` with NVRTC or nvJitLink installed from PyPI via `pip install` is currently - not supported. This will be fixed in a future release. -- Some `LinkerOptions` are only available when using a modern version of CUDA. When using CUDA <12, - the backend is the cuLink api which supports only a subset of the options that nvjitlink does. - Further, some options aren't available on CUDA versions <12.6. -- To use `cuda.core` with Python 3.13, it currently requires building `cuda-python` from source - prior to `pip install`. This extra step will be fixed soon. diff --git a/cuda_core/docs/source/release/0.1.1-notes.rst b/cuda_core/docs/source/release/0.1.1-notes.rst new file mode 100644 index 00000000..68cca671 --- /dev/null +++ b/cuda_core/docs/source/release/0.1.1-notes.rst @@ -0,0 +1,51 @@ +.. currentmodule:: cuda.core.experimental + +``cuda.core`` 0.1.1 Release Notes +================================= + +Released on Dec 20, 2024 + +Highlights +---------- + +- Add :obj:`~utils.StridedMemoryView` and :func:`~utils.args_viewable_as_strided_memory` that provide a concrete + implementation of DLPack & CUDA Array Interface supports. +- Add :obj:`~Linker` that can link one or multiple :obj:`~_module.ObjectCode` instances generated by :obj:`~Program`. Under + the hood, it uses either the nvJitLink or driver (``cuLink*``) APIs depending on the CUDA version + detected in the current environment. +- Support ``pip install cuda-core``. Please see the Installation Guide for further details. + +New features +------------ + +- Add a :obj:`cuda.core.experiemental.system` module for querying system- or process-wide information. +- Add :obj:`~LaunchConfig.cluster` to support thread block clusters on Hopper GPUs. + +Enhancements +------------ + +- The internal handle held by :obj:`~_module.ObjectCode` is now lazily initialized upon first touch. +- Support TCC devices with a default synchronous memory resource to avoid the use of memory pools. +- Ensure ``"ltoir"`` is a valid code type to :obj:`~_module.ObjectCode`. +- Document the ``__cuda_stream__`` protocol. +- Improve test coverage & documentation cross-references. +- Enforce code formatting. + +Bug fixes +--------- + +- Eliminate potential class destruction issues. +- Fix circular import during handling a foreign CUDA stream. + +Limitations +----------- + +- All APIs are currently *experimental* and subject to change without deprecation notice. + Please kindly share your feedback with us so that we can make ``cuda.core`` better! +- Using ``cuda.core`` with NVRTC or nvJitLink installed from PyPI via `pip install` is currently + not supported. This will be fixed in a future release. +- Some :class:`~LinkerOptions` are only available when using a modern version of CUDA. When using CUDA <12, + the backend is the cuLink API which supports only a subset of the options that nvjitlink does. + Further, some options aren't available on CUDA versions <12.6. +- To use ``cuda.core`` with Python 3.13, it currently requires building ``cuda-python`` from source + prior to `pip install`. This extra step will be fixed soon. diff --git a/cuda_core/docs/source/release/0.2.0-notes.rst b/cuda_core/docs/source/release/0.2.0-notes.rst index 572567fd..0a34f825 100644 --- a/cuda_core/docs/source/release/0.2.0-notes.rst +++ b/cuda_core/docs/source/release/0.2.0-notes.rst @@ -1,7 +1,22 @@ -``cuda.core`` v0.2.0 Release notes -================================== +.. currentmodule:: cuda.core.experimental -Breaking changes +``cuda.core`` 0.2.0 Release Notes +================================= + +Released on , 2024 + +Highlights +---------- + +- Add :class:`~ProgramOptions` to facilitate the passing of runtime compile options to :obj:`~Program`. + +Limitations +----------- + +- + +Breaking Changes ---------------- -- Change ``__cuda_stream__`` from attribute to method \ No newline at end of file +- Change ``__cuda_stream__`` from attribute to method +- The :meth:`~Program.compile` method no longer accepts the `options` argument. Instead, you can optionally pass an instance of :class:`~ProgramOptions` to the constructor of :obj:`~Program`. diff --git a/cuda_core/examples/saxpy.py b/cuda_core/examples/saxpy.py index d1d7211f..8fc4b91b 100644 --- a/cuda_core/examples/saxpy.py +++ b/cuda_core/examples/saxpy.py @@ -6,7 +6,7 @@ import cupy as cp -from cuda.core.experimental import Device, LaunchConfig, Program, launch +from cuda.core.experimental import Device, LaunchConfig, Program, ProgramOptions, launch # compute out = a * x + y code = """ @@ -29,13 +29,11 @@ s = dev.create_stream() # prepare program -prog = Program(code, code_type="c++") +arch = "".join(f"{i}" for i in dev.compute_capability) +program_options = ProgramOptions(std="c++11", arch=f"sm_{arch}") +prog = Program(code, code_type="c++", options=program_options) mod = prog.compile( "cubin", - options=( - "-std=c++11", - "-arch=sm_" + "".join(f"{i}" for i in dev.compute_capability), - ), logs=sys.stdout, name_expressions=("saxpy", "saxpy"), ) diff --git a/cuda_core/examples/strided_memory_view.py b/cuda_core/examples/strided_memory_view.py index 2cc25989..ed970d50 100644 --- a/cuda_core/examples/strided_memory_view.py +++ b/cuda_core/examples/strided_memory_view.py @@ -31,7 +31,7 @@ cp = None import numpy as np -from cuda.core.experimental import Device, LaunchConfig, Program, launch +from cuda.core.experimental import Device, LaunchConfig, Program, ProgramOptions, launch from cuda.core.experimental.utils import StridedMemoryView, args_viewable_as_strided_memory # ################################################################################ @@ -88,16 +88,13 @@ } } """).substitute(func_sig=func_sig) - gpu_prog = Program(gpu_code, code_type="c++") + # To know the GPU's compute capability, we need to identify which GPU to use. dev = Device(0) dev.set_current() arch = "".join(f"{i}" for i in dev.compute_capability) - mod = gpu_prog.compile( - target_type="cubin", - # TODO: update this after NVIDIA/cuda-python#237 is merged - options=(f"-arch=sm_{arch}", "-std=c++11"), - ) + gpu_prog = Program(gpu_code, code_type="c++", options=ProgramOptions(arch=f"sm_{arch}", std="c++11")) + mod = gpu_prog.compile(target_type="cubin") gpu_ker = mod.get_kernel(func_name) # Now we are prepared to run the code from the user's perspective! diff --git a/cuda_core/examples/thread_block_cluster.py b/cuda_core/examples/thread_block_cluster.py index fa70738d..dfbe71a5 100644 --- a/cuda_core/examples/thread_block_cluster.py +++ b/cuda_core/examples/thread_block_cluster.py @@ -5,7 +5,7 @@ import os import sys -from cuda.core.experimental import Device, LaunchConfig, Program, launch +from cuda.core.experimental import Device, LaunchConfig, Program, ProgramOptions, launch # prepare include cuda_path = os.environ.get("CUDA_PATH", os.environ.get("CUDA_HOME")) @@ -44,12 +44,12 @@ # prepare program & compile kernel dev.set_current() -prog = Program(code, code_type="c++") -mod = prog.compile( - target_type="cubin", - # TODO: update this after NVIDIA/cuda-python#237 is merged - options=(f"-arch=sm_{arch}", "-std=c++17", f"-I{cuda_include_path}"), +prog = Program( + code, + code_type="c++", + options=ProgramOptions(arch=f"sm_{arch}", std="c++17", include_path=cuda_include_path), ) +mod = prog.compile(target_type="cubin") ker = mod.get_kernel("check_cluster_info") # prepare launch config diff --git a/cuda_core/examples/vector_add.py b/cuda_core/examples/vector_add.py index 17265327..ec398209 100644 --- a/cuda_core/examples/vector_add.py +++ b/cuda_core/examples/vector_add.py @@ -4,7 +4,7 @@ import cupy as cp -from cuda.core.experimental import Device, LaunchConfig, Program, launch +from cuda.core.experimental import Device, LaunchConfig, Program, ProgramOptions, launch # compute c = a + b code = """ @@ -26,15 +26,10 @@ s = dev.create_stream() # prepare program -prog = Program(code, code_type="c++") -mod = prog.compile( - "cubin", - options=( - "-std=c++17", - "-arch=sm_" + "".join(f"{i}" for i in dev.compute_capability), - ), - name_expressions=("vector_add",), -) +arch = "".join(f"{i}" for i in dev.compute_capability) +program_options = ProgramOptions(std="c++17", arch=f"sm_{arch}") +prog = Program(code, code_type="c++", options=program_options) +mod = prog.compile("cubin", name_expressions=("vector_add",)) # run in single precision ker = mod.get_kernel("vector_add") diff --git a/cuda_core/tests/test_linker.py b/cuda_core/tests/test_linker.py index 54cd8cf4..b81c1654 100644 --- a/cuda_core/tests/test_linker.py +++ b/cuda_core/tests/test_linker.py @@ -4,10 +4,10 @@ import pytest -from cuda.core.experimental import Linker, LinkerOptions, Program, _linker +from cuda.core.experimental import Device, Linker, LinkerOptions, Program, ProgramOptions, _linker from cuda.core.experimental._module import ObjectCode -ARCH = "sm_80" # use sm_80 for testing the oop nvJitLink wrapper +ARCH = "sm_" + "".join(f"{i}" for i in Device().compute_capability) kernel_a = """ extern __device__ int B(); @@ -18,35 +18,43 @@ device_function_c = "__device__ int C(int a, int b) { return a + b; }" culink_backend = _linker._decide_nvjitlink_or_driver() +if not culink_backend: + from cuda.bindings import nvjitlink @pytest.fixture(scope="function") def compile_ptx_functions(init_cuda): # Without -rdc (relocatable device code) option, the generated ptx will not included any unreferenced # device functions, causing the link to fail - object_code_a_ptx = Program(kernel_a, "c++").compile("ptx", options=("-rdc=true",)) - object_code_b_ptx = Program(device_function_b, "c++").compile("ptx", options=("-rdc=true",)) - object_code_c_ptx = Program(device_function_c, "c++").compile("ptx", options=("-rdc=true",)) + object_code_a_ptx = Program(kernel_a, "c++", ProgramOptions(relocatable_device_code=True)).compile("ptx") + object_code_b_ptx = Program(device_function_b, "c++", ProgramOptions(relocatable_device_code=True)).compile("ptx") + object_code_c_ptx = Program(device_function_c, "c++", ProgramOptions(relocatable_device_code=True)).compile("ptx") return object_code_a_ptx, object_code_b_ptx, object_code_c_ptx @pytest.fixture(scope="function") def compile_ltoir_functions(init_cuda): - object_code_a_ltoir = Program(kernel_a, "c++").compile("ltoir", options=("-dlto",)) - object_code_b_ltoir = Program(device_function_b, "c++").compile("ltoir", options=("-dlto",)) - object_code_c_ltoir = Program(device_function_c, "c++").compile("ltoir", options=("-dlto",)) + object_code_a_ltoir = Program(kernel_a, "c++", ProgramOptions(link_time_optimization=True)).compile("ltoir") + object_code_b_ltoir = Program(device_function_b, "c++", ProgramOptions(link_time_optimization=True)).compile( + "ltoir" + ) + object_code_c_ltoir = Program(device_function_c, "c++", ProgramOptions(link_time_optimization=True)).compile( + "ltoir" + ) return object_code_a_ltoir, object_code_b_ltoir, object_code_c_ltoir culink_options = [ + LinkerOptions(), LinkerOptions(arch=ARCH, verbose=True), LinkerOptions(arch=ARCH, max_register_count=32), LinkerOptions(arch=ARCH, optimization_level=3), LinkerOptions(arch=ARCH, debug=True), LinkerOptions(arch=ARCH, lineinfo=True), - LinkerOptions(arch=ARCH, no_cache=True), + LinkerOptions(arch=ARCH, no_cache=True), # TODO: consider adding cuda 12.4 to test matrix in which case this + # will fail. Tracked in issue #337 ] @@ -77,10 +85,11 @@ def test_linker_init(compile_ptx_functions, options): assert isinstance(object_code, ObjectCode) -def test_linker_init_invalid_arch(): - options = LinkerOptions(arch=None) - with pytest.raises(TypeError): - Linker(options) +def test_linker_init_invalid_arch(compile_ptx_functions): + err = AttributeError if culink_backend else nvjitlink.nvJitLinkError + with pytest.raises(err): + options = LinkerOptions(arch="99", ptx=True) + Linker(*compile_ptx_functions, options=options) @pytest.mark.skipif(culink_backend, reason="culink does not support ptx option") diff --git a/cuda_core/tests/test_module.py b/cuda_core/tests/test_module.py index 61933d41..7db017f1 100644 --- a/cuda_core/tests/test_module.py +++ b/cuda_core/tests/test_module.py @@ -10,13 +10,13 @@ import pytest from conftest import can_load_generated_ptx -from cuda.core.experimental import Program +from cuda.core.experimental import Program, ProgramOptions @pytest.mark.xfail(not can_load_generated_ptx(), reason="PTX version too new") def test_get_kernel(): kernel = """extern "C" __global__ void ABC() { }""" - object_code = Program(kernel, "c++").compile("ptx", options=("-rdc=true",)) + object_code = Program(kernel, "c++", options=ProgramOptions(relocatable_device_code=True)).compile("ptx") assert object_code._handle is None kernel = object_code.get_kernel("ABC") assert object_code._handle is not None diff --git a/cuda_core/tests/test_program.py b/cuda_core/tests/test_program.py index cca01af5..0f9b8e3b 100644 --- a/cuda_core/tests/test_program.py +++ b/cuda_core/tests/test_program.py @@ -9,8 +9,43 @@ import pytest from conftest import can_load_generated_ptx -from cuda.core.experimental import Device, Program from cuda.core.experimental._module import Kernel, ObjectCode +from cuda.core.experimental._program import Program, ProgramOptions + + +# TODO handle and test options whcih are only supported on more modern CUDA versions +# tracked in #337 +def test_program_with_various_options(init_cuda): + code = 'extern "C" __global__ void my_kernel() {}' + + options_list = [ + ProgramOptions(device_code_optimize=True, debug=True), + ProgramOptions(relocatable_device_code=True, max_register_count=32), + ProgramOptions(ftz=True, prec_sqrt=False, prec_div=False), + ProgramOptions(fma=False, use_fast_math=True), + ProgramOptions(extra_device_vectorization=True), + ProgramOptions(link_time_optimization=True), + ProgramOptions(define_macro="MY_MACRO"), + ProgramOptions(define_macro=("MY_MACRO", "99")), + ProgramOptions(define_macro=[("MY_MACRO", "99")]), + ProgramOptions(define_macro=[("MY_MACRO", "99"), ("MY_OTHER_MACRO", "100")]), + ProgramOptions(undefine_macro=["MY_MACRO", "MY_OTHER_MACRO"]), + ProgramOptions(undefine_macro="MY_MACRO", include_path="/usr/local/include"), + ProgramOptions(builtin_initializer_list=False, disable_warnings=True), + ProgramOptions(restrict=True, device_as_default_execution_space=True), + ProgramOptions(device_int128=True, optimization_info="inline"), + ProgramOptions(no_display_error_number=True), + ProgramOptions(diag_error=1234, diag_suppress=1234), + ProgramOptions(diag_error=[1234, 1223], diag_suppress=(1234, 1223)), + ProgramOptions(diag_warn=1000), + ] + + for options in options_list: + program = Program(code, "c++", options) + assert program.backend == "nvrtc" + program.compile("ptx") + program.close() + assert program.handle is None def test_program_init_valid_code_type(): @@ -33,13 +68,12 @@ def test_program_init_invalid_code_format(): # TODO: incorporate this check in Program +# This is tested against the current device's arch @pytest.mark.xfail(not can_load_generated_ptx(), reason="PTX version too new") def test_program_compile_valid_target_type(): code = 'extern "C" __global__ void my_kernel() {}' program = Program(code, "c++") - arch = "".join(str(i) for i in Device().compute_capability) - object_code = program.compile("ptx", options=(f"-arch=compute_{arch}",)) - print(object_code._module.decode()) + object_code = program.compile("ptx") kernel = object_code.get_kernel("my_kernel") assert isinstance(object_code, ObjectCode) assert isinstance(kernel, Kernel)