Introducing cuda.core: pythonic access to CUDA core functionalities #87

Merged
merged 34 commits on Oct 10, 2024

Commits (34)
3c7f0e1  check in cuda.py prototype + build system (leofang, Sep 2, 2024)
e276683  hide all submodules (leofang, Sep 2, 2024)
8ecb291  update in Device (leofang, Sep 2, 2024)
b79b9f1  update in Stream (leofang, Sep 2, 2024)
f79ddd2  improve circular type hints (leofang, Sep 2, 2024)
96ba14e  complete Stream impl (leofang, Sep 2, 2024)
2c52e43  add Event impl (leofang, Sep 2, 2024)
690faba  change query to is_done (leofang, Sep 3, 2024)
4b9faf6  enforce device init checking wherever appropriate (leofang, Sep 3, 2024)
8baa103  expose options to top level namespace (leofang, Sep 3, 2024)
19ea607  add basic MR/Buffer properties + copy methods (leofang, Sep 3, 2024)
b64405e  add basic DLPack support to Buffer (leofang, Sep 3, 2024)
4535be9  fixes (leofang, Sep 3, 2024)
ee50ae9  add license header (leofang, Sep 3, 2024)
16f541d  add a simple build system for Cython modules (leofang, Sep 3, 2024)
8c49acc  support dlpack 1.0 (leofang, Sep 4, 2024)
8fe6ac7  add simple build instruction (leofang, Sep 4, 2024)
fb952d8  split dlpack include + support cython 3 (leofang, Sep 5, 2024)
7076a6c  add viewable & basic GPUMemoryView support (leofang, Sep 5, 2024)
ab83c5b  give GPUMemoryView a nicer __repr__ (leofang, Sep 5, 2024)
48a305c  fix dtype repr and stream pass-through (leofang, Sep 5, 2024)
94ec937  more robust repr handling (leofang, Sep 6, 2024)
60682de  support CAI too (leofang, Sep 6, 2024)
7770a63  fix viewable return & event leak (leofang, Sep 6, 2024)
d765fb7  rename cuda.py to cuda.core (leofang, Oct 5, 2024)
4a5457e  update README (leofang, Oct 5, 2024)
905e5f4  align with latest design (leofang, Oct 6, 2024)
7e1c8f5  implement kernel arg handling + check in saxpy sample (leofang, Oct 7, 2024)
f0c155c  add vector_add example (leofang, Oct 7, 2024)
df017dd  micro optimization + make numpy as required (leofang, Oct 8, 2024)
94b693c  Merge branch 'main' into cuda_py (leofang, Oct 8, 2024)
a41a4b7  rename GPUMemoryView to StridedMemoryView (leofang, Oct 10, 2024)
64c3a8e  enable parallel build (leofang, Oct 10, 2024)
317dd13  update readme (leofang, Oct 10, 2024)
1 change: 1 addition & 0 deletions cuda_core/MANIFEST.in
@@ -0,0 +1 @@
recursive-include cuda/core *.pyx *.pxd
9 changes: 9 additions & 0 deletions cuda_core/README.md
@@ -0,0 +1,9 @@
# `cuda.core`: (experimental) pythonic CUDA module

Currently under active development. To build from source, just do:
```shell
$ git clone https://github.com/NVIDIA/cuda-python
$ cd cuda-python/cuda_core  # move to the directory where this README is located
$ pip install .
```
For now `cuda-python` is a required dependency.
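
As a quick sanity check after installing, the public API can be exercised directly. A minimal sketch, assuming a CUDA-capable GPU and driver are available; all names below come from this PR's `cuda.core` namespace:
```python
# Smoke test for a local build (assumes a CUDA-capable GPU is visible).
from cuda.core import Device, __version__

print(__version__)
dev = Device()       # defaults to the device returned by cudaGetDevice()
dev.set_current()    # initializes the device (primary context)
print(dev.name, dev.compute_capability)
```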
10 changes: 10 additions & 0 deletions cuda_core/cuda/core/__init__.py
@@ -0,0 +1,10 @@
# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
#
# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE

from cuda.core._device import Device
from cuda.core._event import EventOptions
from cuda.core._launcher import LaunchConfig, launch
from cuda.core._program import Program
from cuda.core._stream import Stream, StreamOptions
from cuda.core._version import __version__
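
All public names are re-exported at the package root; the underscore-prefixed submodules are private. A small sketch of the intended import style, using only the names exported above:
```python
# Import from the package root, not from the private _-prefixed submodules.
from cuda.core import Device, Stream, StreamOptions, EventOptions

dev = Device(0)
dev.set_current()
s = dev.create_stream()  # a cuda.core Stream wrapping a newly created CUDA stream
assert isinstance(s, Stream)
```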
29 changes: 29 additions & 0 deletions cuda_core/cuda/core/_context.py
@@ -0,0 +1,29 @@
# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
#
# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE

from dataclasses import dataclass

from cuda import cuda, cudart
from cuda.core._utils import handle_return


@dataclass
class ContextOptions:
pass # TODO


class Context:

__slots__ = ("_handle", "_id")

def __init__(self):
raise NotImplementedError("TODO")

@staticmethod
def _from_ctx(obj, dev_id):
assert isinstance(obj, cuda.CUcontext)
ctx = Context.__new__(Context)
ctx._handle = obj
ctx._id = dev_id
return ctx
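
`Context` is intentionally not user-constructible (`Context()` raises `NotImplementedError`); instances are produced internally through the `_from_ctx` factory and handed back by `Device`. A minimal sketch of the intended flow, based on the `Device` code later in this diff:
```python
# Contexts come back from Device APIs rather than being built directly.
from cuda.core import Device

dev = Device(0)
dev.set_current()
ctx = dev.context            # wraps the CUcontext current on this thread
prev = dev.set_current(ctx)  # binding a context returns the one it replaced
```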
187 changes: 187 additions & 0 deletions cuda_core/cuda/core/_device.py
@@ -0,0 +1,187 @@
# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
#
# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE

import threading
from typing import Optional, Union
import warnings

from cuda import cuda, cudart
from cuda.core._utils import handle_return, ComputeCapability, CUDAError, \
precondition
from cuda.core._context import Context, ContextOptions
from cuda.core._memory import _DefaultAsyncMempool, Buffer, MemoryResource
from cuda.core._stream import default_stream, Stream, StreamOptions


_tls = threading.local()
_tls_lock = threading.Lock()


class Device:

__slots__ = ("_id", "_mr", "_has_inited")

def __new__(cls, device_id=None):
# important: creating a Device instance does not initialize the GPU!
if device_id is None:
device_id = handle_return(cudart.cudaGetDevice())
assert isinstance(device_id, int), f"{device_id=}"
else:
total = handle_return(cudart.cudaGetDeviceCount())
if not isinstance(device_id, int) or not (0 <= device_id < total):
raise ValueError(
f"device_id must be within [0, {total}), got {device_id}")

# ensure Device is singleton
with _tls_lock:
if not hasattr(_tls, "devices"):
total = handle_return(cudart.cudaGetDeviceCount())
_tls.devices = []
for dev_id in range(total):
dev = super().__new__(cls)
dev._id = dev_id
dev._mr = _DefaultAsyncMempool(dev_id)
dev._has_inited = False
_tls.devices.append(dev)

return _tls.devices[device_id]

def _check_context_initialized(self, *args, **kwargs):
if not self._has_inited:
raise CUDAError("the device is not yet initialized, "
"perhaps you forgot to call .set_current() first?")

@property
def device_id(self) -> int:
return self._id

@property
def pci_bus_id(self) -> str:
bus_id = handle_return(cudart.cudaDeviceGetPCIBusId(13, self._id))
return bus_id[:12].decode()

@property
def uuid(self) -> str:
driver_ver = handle_return(cuda.cuDriverGetVersion())
if driver_ver >= 11040:
uuid = handle_return(cuda.cuDeviceGetUuid_v2(self._id))
else:
uuid = handle_return(cuda.cuDeviceGetUuid(self._id))
uuid = uuid.bytes.hex()
# 8-4-4-4-12
return f"{uuid[:8]}-{uuid[8:12]}-{uuid[12:16]}-{uuid[16:20]}-{uuid[20:]}"

@property
def name(self) -> str:
# assuming a GPU name is less than 128 characters...
name = handle_return(cuda.cuDeviceGetName(128, self._id))
name = name.split(b'\0')[0]
return name.decode()

@property
def properties(self) -> dict:
# TODO: pythonize the key names
return handle_return(cudart.cudaGetDeviceProperties(self._id))

@property
def compute_capability(self) -> ComputeCapability:
"""Returns a named tuple with 2 fields: major and minor. """
major = handle_return(cudart.cudaDeviceGetAttribute(
cudart.cudaDeviceAttr.cudaDevAttrComputeCapabilityMajor, self._id))
minor = handle_return(cudart.cudaDeviceGetAttribute(
cudart.cudaDeviceAttr.cudaDevAttrComputeCapabilityMinor, self._id))
return ComputeCapability(major, minor)

@property
@precondition(_check_context_initialized)
def context(self) -> Context:
ctx = handle_return(cuda.cuCtxGetCurrent())
assert int(ctx) != 0
return Context._from_ctx(ctx, self._id)

@property
def memory_resource(self) -> MemoryResource:
return self._mr

@memory_resource.setter
def memory_resource(self, mr):
if not isinstance(mr, MemoryResource):
raise TypeError
self._mr = mr

@property
def default_stream(self) -> Stream:
return default_stream()

def __int__(self):
return self._id

def __repr__(self):
return f"<Device {self._id} ({self.name})>"

def set_current(self, ctx: Context=None) -> Union[Context, None]:
"""
        Entry point of this object. Users always start by calling
        this method, e.g.

>>> from cuda.core import Device
>>> dev0 = Device(0)
>>> dev0.set_current()
>>> # ... do work on device 0 ...

        The optional ctx argument lets advanced users bind a CUDA
        context to the device; in that case, the previously set
        context is popped and returned.
"""
if ctx is not None:
if not isinstance(ctx, Context):
raise TypeError("a Context object is required")
if ctx._id != self._id:
raise RuntimeError("the provided context was created on a different "
f"device {ctx._id} other than the target {self._id}")
prev_ctx = handle_return(cuda.cuCtxPopCurrent())
handle_return(cuda.cuCtxPushCurrent(ctx._handle))
self._has_inited = True
if int(prev_ctx) != 0:
return Context._from_ctx(prev_ctx, self._id)
else:
ctx = handle_return(cuda.cuCtxGetCurrent())
if int(ctx) == 0:
# use primary ctx
ctx = handle_return(cuda.cuDevicePrimaryCtxRetain(self._id))
handle_return(cuda.cuCtxPushCurrent(ctx))
else:
ctx_id = handle_return(cuda.cuCtxGetDevice())
if ctx_id != self._id:
# use primary ctx
ctx = handle_return(cuda.cuDevicePrimaryCtxRetain(self._id))
handle_return(cuda.cuCtxPushCurrent(ctx))
else:
# no-op, a valid context already exists and is set current
pass
self._has_inited = True

def create_context(self, options: ContextOptions = None) -> Context:
# Create a Context object (but do NOT set it current yet!).
# ContextOptions is a dataclass for setting e.g. affinity or CIG
# options.
raise NotImplementedError("TODO")

@precondition(_check_context_initialized)
def create_stream(self, obj=None, options: StreamOptions=None) -> Stream:
# Create a Stream object by either holding a newly created
# CUDA stream or wrapping an existing foreign object supporting
# the __cuda_stream__ protocol. In the latter case, a reference
# to obj is held internally so that its lifetime is managed.
return Stream._init(obj=obj, options=options)

@precondition(_check_context_initialized)
def allocate(self, size, stream=None) -> Buffer:
if stream is None:
stream = default_stream()
return self._mr.allocate(size, stream)

@precondition(_check_context_initialized)
def sync(self):
handle_return(cudart.cudaDeviceSynchronize())
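
Two behaviors above deserve emphasis: `Device` is a per-thread singleton (one cached instance per device ID in `threading.local` storage), and `@precondition(_check_context_initialized)` makes the guarded methods raise until `set_current()` has been called. A minimal sketch (note that `cuda.core._utils` is a private module, imported here only to name the error type):
```python
from cuda.core import Device
from cuda.core._utils import CUDAError  # private module; used only for this demo

dev = Device(0)
assert dev is Device(0)      # per-thread singleton: same instance per device ID

try:
    dev.allocate(1024)       # guarded by @precondition(_check_context_initialized)
except CUDAError:
    pass                     # not initialized until set_current() is called

dev.set_current()            # retains & pushes the primary context
s = dev.create_stream()      # obj=None means create a new stream, not wrap one
buf = dev.allocate(1024, stream=s)  # async allocation from the default mempool
dev.sync()                   # cudaDeviceSynchronize under the hood
```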
79 changes: 79 additions & 0 deletions cuda_core/cuda/core/_dlpack.pxd
@@ -0,0 +1,79 @@
# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
#
# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE

cimport cpython

from libc cimport stdlib
from libc.stdint cimport uint8_t
from libc.stdint cimport uint16_t
from libc.stdint cimport uint32_t
from libc.stdint cimport int32_t
from libc.stdint cimport int64_t
from libc.stdint cimport uint64_t
from libc.stdint cimport intptr_t


cdef extern from "dlpack.h" nogil:
"""
#define DLPACK_TENSOR_UNUSED_NAME "dltensor"
#define DLPACK_VERSIONED_TENSOR_UNUSED_NAME "dltensor_versioned"
#define DLPACK_TENSOR_USED_NAME "used_dltensor"
#define DLPACK_VERSIONED_TENSOR_USED_NAME "used_dltensor_versioned"
"""
ctypedef enum _DLDeviceType "DLDeviceType":
_kDLCPU "kDLCPU"
_kDLCUDA "kDLCUDA"
_kDLCUDAHost "kDLCUDAHost"
_kDLCUDAManaged "kDLCUDAManaged"

ctypedef struct DLDevice:
_DLDeviceType device_type
int32_t device_id

cdef enum DLDataTypeCode:
kDLInt
kDLUInt
kDLFloat
kDLBfloat
kDLComplex
kDLBool

ctypedef struct DLDataType:
uint8_t code
uint8_t bits
uint16_t lanes

ctypedef struct DLTensor:
void* data
DLDevice device
int32_t ndim
DLDataType dtype
int64_t* shape
int64_t* strides
uint64_t byte_offset

ctypedef struct DLManagedTensor:
DLTensor dl_tensor
void* manager_ctx
void (*deleter)(DLManagedTensor*)

ctypedef struct DLPackVersion:
uint32_t major
uint32_t minor

ctypedef struct DLManagedTensorVersioned:
DLPackVersion version
void* manager_ctx
void (*deleter)(DLManagedTensorVersioned*)
uint64_t flags
DLTensor dl_tensor

int DLPACK_MAJOR_VERSION
int DLPACK_MINOR_VERSION
int DLPACK_FLAG_BITMASK_READ_ONLY

const char* DLPACK_TENSOR_UNUSED_NAME
const char* DLPACK_VERSIONED_TENSOR_UNUSED_NAME
const char* DLPACK_TENSOR_USED_NAME
const char* DLPACK_VERSIONED_TENSOR_USED_NAME
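
These declarations mirror the DLPack ABI, covering both the pre-1.0 `DLManagedTensor` and the versioned 1.0 `DLManagedTensorVersioned` that the `Buffer` DLPack support builds on. On the Python side the exchange goes through the standard `__dlpack__`/`__dlpack_device__` protocol; a consumer-side sketch using NumPy (which also implements the protocol) illustrates the zero-copy hand-off:
```python
# DLPack exchange is zero-copy: the capsule carries a DLManagedTensor
# pointing at the producer's memory, as declared in the .pxd above.
import numpy as np

a = np.arange(10)
b = np.from_dlpack(a)            # consumes a's __dlpack__ capsule
assert np.shares_memory(a, b)    # same underlying buffer, no copy

device_type, device_id = a.__dlpack_device__()  # (kDLCPU, 0) for host memory
```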