Skip to content

Commit

Permalink
Merge pull request #237 from NVIDIA/ksimpson/add_program_options
Browse files Browse the repository at this point in the history
Add the options data class to program
  • Loading branch information
leofang authored Jan 19, 2025
2 parents fc8188a + c7cb7eb commit d62c7fa
Show file tree
Hide file tree
Showing 19 changed files with 574 additions and 131 deletions.
2 changes: 1 addition & 1 deletion cuda_core/cuda/core/experimental/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from cuda.core.experimental._event import EventOptions
from cuda.core.experimental._launcher import LaunchConfig, launch
from cuda.core.experimental._linker import Linker, LinkerOptions
from cuda.core.experimental._program import Program
from cuda.core.experimental._program import Program, ProgramOptions
from cuda.core.experimental._stream import Stream, StreamOptions
from cuda.core.experimental._system import System

Expand Down
13 changes: 8 additions & 5 deletions cuda_core/cuda/core/experimental/_linker.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from dataclasses import dataclass
from typing import List, Optional

from cuda.core.experimental._device import Device
from cuda.core.experimental._module import ObjectCode
from cuda.core.experimental._utils import check_or_create_options, driver, handle_return

Expand Down Expand Up @@ -91,10 +92,10 @@ class LinkerOptions:
Attributes
----------
arch : str
Pass the SM architecture value, such as ``-arch=sm_<CC>`` (for generating CUBIN) or
``compute_<CC>`` (for generating PTX).
This is a required option.
arch : str, optional
Pass the SM architecture value, such as ``sm_<CC>`` (for generating CUBIN) or
``compute_<CC>`` (for generating PTX). If not provided, the current device's architecture
will be used.
max_register_count : int, optional
Maximum register count.
Maps to: ``-maxrregcount=<N>``.
Expand Down Expand Up @@ -172,7 +173,7 @@ class LinkerOptions:
Default: False.
"""

arch: str
arch: Optional[str] = None
max_register_count: Optional[int] = None
time: Optional[bool] = None
verbose: Optional[bool] = None
Expand Down Expand Up @@ -204,6 +205,8 @@ def __post_init__(self):
def _init_nvjitlink(self):
if self.arch is not None:
self.formatted_options.append(f"-arch={self.arch}")
else:
self.formatted_options.append("-arch=sm_" + "".join(f"{i}" for i in Device().compute_capability))
if self.max_register_count is not None:
self.formatted_options.append(f"-maxrregcount={self.max_register_count}")
if self.time is not None:
Expand Down
384 changes: 372 additions & 12 deletions cuda_core/cuda/core/experimental/_program.py

Large diffs are not rendered by default.

22 changes: 22 additions & 0 deletions cuda_core/cuda/core/experimental/_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import functools
import importlib.metadata
from collections import namedtuple
from collections.abc import Sequence
from typing import Callable, Dict

try:
Expand Down Expand Up @@ -93,6 +94,13 @@ def check_or_create_options(cls, options, options_description, *, keep_none=Fals
return options


def _handle_boolean_option(option: bool) -> str:
"""
Convert a boolean option to a string representation.
"""
return "true" if bool(option) else "false"


def precondition(checker: Callable[..., None], what: str = "") -> Callable:
"""
A decorator that adds checks to ensure any preconditions are met.
Expand Down Expand Up @@ -142,6 +150,20 @@ def get_device_from_ctx(ctx_handle) -> int:
return device_id


def is_sequence(obj):
    """
    Return True when *obj* implements the :class:`collections.abc.Sequence` ABC
    (e.g. list or tuple).

    Note that ``str`` and ``bytes`` are registered sequences and therefore also
    satisfy this check.
    """
    return isinstance(obj, Sequence)


def is_nested_sequence(obj):
    """
    Return True when *obj* is a nested sequence, i.e. a sequence containing at
    least one element that is itself a sequence (list, tuple, str, ...).
    """
    # Non-sequences can never be nested sequences.
    if not isinstance(obj, Sequence):
        return False
    return any(isinstance(item, Sequence) for item in obj)


def get_binding_version():
try:
major_minor = importlib.metadata.version("cuda-bindings").split(".")[:2]
Expand Down
1 change: 1 addition & 0 deletions cuda_core/docs/source/api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ CUDA compilation toolchain

:template: dataclass.rst

ProgramOptions
LinkerOptions


Expand Down
2 changes: 1 addition & 1 deletion cuda_core/docs/source/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ and other functionalities.
:maxdepth: 2
:caption: Contents:

release.md
release.rst
install.md
interoperability.rst
api.rst
Expand Down
12 changes: 0 additions & 12 deletions cuda_core/docs/source/release.md

This file was deleted.

9 changes: 9 additions & 0 deletions cuda_core/docs/source/release.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
Release Notes
=============

.. toctree::
:maxdepth: 3

release/0.2.0-notes
release/0.1.1-notes
release/0.1.0-notes
Original file line number Diff line number Diff line change
@@ -1,17 +1,21 @@
# `cuda.core` v0.1.0 Release notes
``cuda.core`` 0.1.0 Release Notes
=================================

Released on Nov 8, 2024

## Hightlights
Highlights
----------

- Initial beta release
- Supports all platforms that CUDA is supported
- Supports all CUDA 11.x/12.x drivers
- Supports all CUDA 11.x/12.x Toolkits
- Pythonic CUDA runtime and other core functionalities

## Limitations
Limitations
-----------

- All APIs are currently *experimental* and subject to change without deprecation notice.
Please kindly share your feedbacks with us so that we can make `cuda.core` better!
Please kindly share your feedback with us so that we can make ``cuda.core`` better!
- Source code release only; ``pip``/``conda`` support is coming in a future release
- Windows TCC mode is [not yet supported](https://github.com/NVIDIA/cuda-python/issues/206)
- Windows TCC mode is `not yet supported <https://github.com/NVIDIA/cuda-python/issues/206>`_
43 changes: 0 additions & 43 deletions cuda_core/docs/source/release/0.1.1-notes.md

This file was deleted.

51 changes: 51 additions & 0 deletions cuda_core/docs/source/release/0.1.1-notes.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
.. currentmodule:: cuda.core.experimental

``cuda.core`` 0.1.1 Release Notes
=================================

Released on Dec 20, 2024

Highlights
----------

- Add :obj:`~utils.StridedMemoryView` and :func:`~utils.args_viewable_as_strided_memory` that provide a concrete
implementation of DLPack & CUDA Array Interface supports.
- Add :obj:`~Linker` that can link one or multiple :obj:`~_module.ObjectCode` instances generated by :obj:`~Program`. Under
the hood, it uses either the nvJitLink or driver (``cuLink*``) APIs depending on the CUDA version
detected in the current environment.
- Support ``pip install cuda-core``. Please see the Installation Guide for further details.

New features
------------

- Add a :obj:`cuda.core.experimental.system` module for querying system- or process-wide information.
- Add :obj:`~LaunchConfig.cluster` to support thread block clusters on Hopper GPUs.

Enhancements
------------

- The internal handle held by :obj:`~_module.ObjectCode` is now lazily initialized upon first touch.
- Support TCC devices with a default synchronous memory resource to avoid the use of memory pools.
- Ensure ``"ltoir"`` is a valid code type to :obj:`~_module.ObjectCode`.
- Document the ``__cuda_stream__`` protocol.
- Improve test coverage & documentation cross-references.
- Enforce code formatting.

Bug fixes
---------

- Eliminate potential class destruction issues.
- Fix circular import during handling a foreign CUDA stream.

Limitations
-----------

- All APIs are currently *experimental* and subject to change without deprecation notice.
Please kindly share your feedback with us so that we can make ``cuda.core`` better!
- Using ``cuda.core`` with NVRTC or nvJitLink installed from PyPI via ``pip install`` is currently
not supported. This will be fixed in a future release.
- Some :class:`~LinkerOptions` are only available when using a modern version of CUDA. When using CUDA <12,
the backend is the cuLink API which supports only a subset of the options that nvjitlink does.
Further, some options aren't available on CUDA versions <12.6.
- To use ``cuda.core`` with Python 3.13, it currently requires building ``cuda-python`` from source
prior to ``pip install``. This extra step will be fixed soon.
23 changes: 19 additions & 4 deletions cuda_core/docs/source/release/0.2.0-notes.rst
Original file line number Diff line number Diff line change
@@ -1,7 +1,22 @@
``cuda.core`` v0.2.0 Release notes
==================================
.. currentmodule:: cuda.core.experimental

Breaking changes
``cuda.core`` 0.2.0 Release Notes
=================================

Released on <TODO>, 2024

Highlights
----------

- Add :class:`~ProgramOptions` to facilitate the passing of runtime compile options to :obj:`~Program`.

Limitations
-----------

- <TODO>

Breaking Changes
----------------

- Change ``__cuda_stream__`` from attribute to method
- Change ``__cuda_stream__`` from attribute to method
- The :meth:`~Program.compile` method no longer accepts the `options` argument. Instead, you can optionally pass an instance of :class:`~ProgramOptions` to the constructor of :obj:`~Program`.
10 changes: 4 additions & 6 deletions cuda_core/examples/saxpy.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

import cupy as cp

from cuda.core.experimental import Device, LaunchConfig, Program, launch
from cuda.core.experimental import Device, LaunchConfig, Program, ProgramOptions, launch

# compute out = a * x + y
code = """
Expand All @@ -29,13 +29,11 @@
s = dev.create_stream()

# prepare program
prog = Program(code, code_type="c++")
arch = "".join(f"{i}" for i in dev.compute_capability)
program_options = ProgramOptions(std="c++11", arch=f"sm_{arch}")
prog = Program(code, code_type="c++", options=program_options)
mod = prog.compile(
"cubin",
options=(
"-std=c++11",
"-arch=sm_" + "".join(f"{i}" for i in dev.compute_capability),
),
logs=sys.stdout,
name_expressions=("saxpy<float>", "saxpy<double>"),
)
Expand Down
11 changes: 4 additions & 7 deletions cuda_core/examples/strided_memory_view.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@
cp = None
import numpy as np

from cuda.core.experimental import Device, LaunchConfig, Program, launch
from cuda.core.experimental import Device, LaunchConfig, Program, ProgramOptions, launch
from cuda.core.experimental.utils import StridedMemoryView, args_viewable_as_strided_memory

# ################################################################################
Expand Down Expand Up @@ -88,16 +88,13 @@
}
}
""").substitute(func_sig=func_sig)
gpu_prog = Program(gpu_code, code_type="c++")

# To know the GPU's compute capability, we need to identify which GPU to use.
dev = Device(0)
dev.set_current()
arch = "".join(f"{i}" for i in dev.compute_capability)
mod = gpu_prog.compile(
target_type="cubin",
# TODO: update this after NVIDIA/cuda-python#237 is merged
options=(f"-arch=sm_{arch}", "-std=c++11"),
)
gpu_prog = Program(gpu_code, code_type="c++", options=ProgramOptions(arch=f"sm_{arch}", std="c++11"))
mod = gpu_prog.compile(target_type="cubin")
gpu_ker = mod.get_kernel(func_name)

# Now we are prepared to run the code from the user's perspective!
Expand Down
12 changes: 6 additions & 6 deletions cuda_core/examples/thread_block_cluster.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import os
import sys

from cuda.core.experimental import Device, LaunchConfig, Program, launch
from cuda.core.experimental import Device, LaunchConfig, Program, ProgramOptions, launch

# prepare include
cuda_path = os.environ.get("CUDA_PATH", os.environ.get("CUDA_HOME"))
Expand Down Expand Up @@ -44,12 +44,12 @@

# prepare program & compile kernel
dev.set_current()
prog = Program(code, code_type="c++")
mod = prog.compile(
target_type="cubin",
# TODO: update this after NVIDIA/cuda-python#237 is merged
options=(f"-arch=sm_{arch}", "-std=c++17", f"-I{cuda_include_path}"),
prog = Program(
code,
code_type="c++",
options=ProgramOptions(arch=f"sm_{arch}", std="c++17", include_path=cuda_include_path),
)
mod = prog.compile(target_type="cubin")
ker = mod.get_kernel("check_cluster_info")

# prepare launch config
Expand Down
15 changes: 5 additions & 10 deletions cuda_core/examples/vector_add.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

import cupy as cp

from cuda.core.experimental import Device, LaunchConfig, Program, launch
from cuda.core.experimental import Device, LaunchConfig, Program, ProgramOptions, launch

# compute c = a + b
code = """
Expand All @@ -26,15 +26,10 @@
s = dev.create_stream()

# prepare program
prog = Program(code, code_type="c++")
mod = prog.compile(
"cubin",
options=(
"-std=c++17",
"-arch=sm_" + "".join(f"{i}" for i in dev.compute_capability),
),
name_expressions=("vector_add<float>",),
)
arch = "".join(f"{i}" for i in dev.compute_capability)
program_options = ProgramOptions(std="c++17", arch=f"sm_{arch}")
prog = Program(code, code_type="c++", options=program_options)
mod = prog.compile("cubin", name_expressions=("vector_add<float>",))

# run in single precision
ker = mod.get_kernel("vector_add<float>")
Expand Down
Loading

0 comments on commit d62c7fa

Please sign in to comment.