-
Notifications
You must be signed in to change notification settings - Fork 3
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
bump version, merge pull request #1 from AMYPAD/devel
- Loading branch information
Showing
12 changed files
with
206 additions
and
50 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,23 @@ | ||
# Build script for the `example_mod` example extension module.
project(example_mod)

# Collect the CUDA sources (files only, no directories) matching `*.cu`.
file(GLOB SRC LIST_DIRECTORIES false "*.cu")

# Python headers are required by the sources (`#include "Python.h"`).
include_directories(${Python3_INCLUDE_DIRS})
#include_directories(${Python3_NumPy_INCLUDE_DIRS})

# MODULE: a plugin-style library loaded at runtime rather than linked against.
add_library(${PROJECT_NAME} MODULE ${SRC})
target_include_directories(
  ${PROJECT_NAME}
  PUBLIC "$<BUILD_INTERFACE:${${CMAKE_PROJECT_NAME}_INCLUDE_DIRS}>"
         "$<INSTALL_INTERFACE:${CMAKE_PROJECT_NAME}/include>")
target_link_libraries(${PROJECT_NAME} ${Python3_LIBRARIES} ${CUDA_LIBRARIES})

# NOTE(review): `python_extension_module` is not defined here -- presumably it is
# provided by the scikit-build helpers when SKBUILD is set; confirm.
if(SKBUILD)
  python_extension_module(${PROJECT_NAME})
endif()

# Language standard and versioning; the custom INTERFACE_* property carries the
# major version so that dependents can be checked for compatibility below.
set_target_properties(
  ${PROJECT_NAME}
  PROPERTIES CXX_STANDARD 11
             VERSION ${CMAKE_PROJECT_VERSION}
             SOVERSION ${CMAKE_PROJECT_VERSION_MAJOR}
             INTERFACE_${PROJECT_NAME}_MAJOR_VERSION ${CMAKE_PROJECT_VERSION_MAJOR})
set_property(
  TARGET ${PROJECT_NAME}
  APPEND
  PROPERTY COMPATIBLE_INTERFACE_STRING ${PROJECT_NAME}_MAJOR_VERSION)

# Install the built module under the top-level project's directory.
install(
  TARGETS ${PROJECT_NAME}
  INCLUDES DESTINATION ${CMAKE_PROJECT_NAME}/include
  LIBRARY DESTINATION ${CMAKE_PROJECT_NAME})
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,55 @@ | ||
/** | ||
* Example external extension module using CuVec. | ||
* | ||
* Copyright (2021) Casper da Costa-Luis | ||
*/ | ||
#include "Python.h" | ||
#include "pycuvec.cuh" // PyCuVec | ||
/** functions */ | ||
/**
 * dst = src + 1
 *
 * Element-wise increment over a row-major array of `Y` rows by `X` columns.
 * Each thread handles at most one element: column `x` and row `y` are derived from
 * the thread's 2D block/thread indices, and threads falling outside `X` x `Y`
 * return without touching memory (so the grid may over-cover the array).
 *
 * @param dst  output buffer, at least `X * Y` floats.
 * @param src  input buffer, at least `X * Y` floats (only read).
 * @param X    number of columns (fastest-varying dimension).
 * @param Y    number of rows.
 */
__global__ void _d_incr(float *dst, const float *src, int X, int Y) {
  int x = threadIdx.x + blockDim.x * blockIdx.x;
  int y = threadIdx.y + blockDim.y * blockIdx.y;
  if (x >= X || y >= Y) return;
  // widen before multiplying: `y * X` in 32-bit int overflows once X * Y exceeds INT_MAX
  size_t i = (size_t)y * (size_t)X + (size_t)x;
  dst[i] = src[i] + 1.0f;
}
/**
 * Python-callable `increment_f(src)`.
 *
 * Allocates an output with `PyCuVec_zeros_like(src)`, launches `_d_incr` to fill it
 * with `src + 1`, and times both steps with CUDA events.
 *
 * @param self  unused.
 * @param args  tuple holding exactly one object.
 *              NOTE(review): the "O" format accepts any Python object and the result
 *              is used as a `PyCuVec<float>` without a type check -- the caller is
 *              trusted to pass the right type; confirm whether a check is available.
 * @return new reference to the tuple `(alloc_ms, kernel_ms, dst)`, or NULL with a
 *         Python exception set (bad arguments, non-2D input, allocation failure,
 *         or a CUDA launch/execution error).
 */
static PyObject *increment_f(PyObject *self, PyObject *args) {
  PyCuVec<float> *src = NULL;
  if (!PyArg_ParseTuple(args, "O", (PyObject **)&src)) return NULL;
  std::vector<Py_ssize_t> &N = src->shape;
  // N[0] and N[1] are indexed below: reject anything that is not 2D up front
  if (N.size() != 2) {
    PyErr_SetString(PyExc_IndexError, "increment_f: input must be 2-dimensional");
    return NULL;
  }

  cudaEvent_t eStart, eAlloc, eKern;
  cudaEventCreate(&eStart);
  cudaEventCreate(&eAlloc);
  cudaEventCreate(&eKern);
  cudaEventRecord(eStart);
  PyCuVec<float> *dst = PyCuVec_zeros_like(src);
  cudaEventRecord(eAlloc);

  cudaError_t err = cudaSuccess;
  // a zero-sized grid is an invalid launch configuration, so skip empty arrays
  if (dst != NULL && N[0] > 0 && N[1] > 0) {
    dim3 grid((N[1] + 31) / 32, (N[0] + 31) / 32); // ceil-div: enough 32x32 blocks to cover
    dim3 block(32, 32);
    _d_incr<<<grid, block>>>(dst->vec.data(), src->vec.data(), N[1], N[0]);
    err = cudaGetLastError(); // launch-configuration errors are only reported here
  }
  // cudaDeviceSynchronize();
  cudaEventRecord(eKern);
  cudaError_t sync_err = cudaEventSynchronize(eKern);
  if (err == cudaSuccess) err = sync_err;

  float alloc_ms = 0, kernel_ms = 0;
  cudaEventElapsedTime(&alloc_ms, eStart, eAlloc);
  cudaEventElapsedTime(&kernel_ms, eAlloc, eKern);
  // events are created on every call, so release them on every path
  cudaEventDestroy(eStart);
  cudaEventDestroy(eAlloc);
  cudaEventDestroy(eKern);

  if (dst == NULL) {
    // make sure the NULL return is always accompanied by an exception
    if (!PyErr_Occurred()) PyErr_SetString(PyExc_MemoryError, "increment_f: could not create output");
    return NULL;
  }
  if (err != cudaSuccess) {
    Py_DECREF((PyObject *)dst);
    PyErr_SetString(PyExc_RuntimeError, cudaGetErrorString(err));
    return NULL;
  }
  // fprintf(stderr, "%.3f ms, %.3f ms\n", alloc_ms, kernel_ms);
  // "N" hands our reference to the tuple; "O" would add a second one that is never
  // released. Assumes PyCuVec_zeros_like returns a new (owned) reference -- confirm.
  return Py_BuildValue("ddN", double(alloc_ms), double(kernel_ms), (PyObject *)dst);
}
/// Method table for the module; the all-NULL entry marks the end of the list.
static PyMethodDef example_methods[] = {
    {
        "increment_f", // name exposed to Python
        increment_f,   // implementing C function
        METH_VARARGS,  // called with a tuple of positional arguments
        "Returns (alloc_ms, kernel_ms, input + 1)." // docstring
    },
    {NULL, NULL, 0, NULL} // Sentinel
};
|
||
/** module */
/// Static description of the module: name, docstring, state size and method table.
static struct PyModuleDef example_mod = {
    PyModuleDef_HEAD_INIT,
    "example_mod", // module
    "Example external module.",
    -1, // module keeps state in global variables
    example_methods};

/// Initialisation function for `example_mod`; creates the module from the definition above.
PyMODINIT_FUNC PyInit_example_mod(void) {
  Py_Initialize();
  PyObject *module = PyModule_Create(&example_mod);
  return module;
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,72 @@ | ||
from functools import wraps | ||
from time import time | ||
|
||
import numpy as np | ||
|
||
import cuvec as cu | ||
|
||
|
||
def _time_overhead(): | ||
tic = time() | ||
pass | ||
res = time() - tic | ||
return res | ||
|
||
|
||
def timer(func):
    """Decorator making `func` return `(elapsed_ms, result)`.

    The elapsed wall-clock time is corrected by the mean of 100 samples of
    `_time_overhead()` (taken on every call) and converted to milliseconds.
    """
    @wraps(func)
    def inner(*args, **kwargs):
        baseline = np.mean([_time_overhead() for _ in range(100)])
        start = time()
        result = func(*args, **kwargs)
        elapsed_ms = (time() - start - baseline) * 1000
        return elapsed_ms, result

    return inner
|
||
|
||
def test_perf(shape=(1337, 42), quiet=False):
    """Time each stage of an `increment_f` round trip and sanity-check the result.

    Args:
        shape: 2-tuple shape of the float32 input array.
        quiet: when true, skip the warm-up call and do not print the timings.

    Returns:
        dict mapping stage name to elapsed milliseconds, in execution order.

    Raises:
        AssertionError: if the output is not `src + 1`, or if the timing ratios
            fall outside the thresholds checked at the end.
    """
    # `example_mod` is defined in ../cuvec/src/example_mod/
    from cuvec.example_mod import increment_f

    clock_cost = np.mean([_time_overhead() for _ in range(100)])
    timed_increment = timer(increment_f)
    t = {}
    t['create src'], src = timer(cu.zeros)(shape, "float32")

    values = np.random.random(shape)
    start = time()
    src[:] = values
    t['assign'] = (time() - start - clock_cost) * 1000

    if not quiet:
        # first call is recorded separately from the measured call below
        warm_ms, (warm_alloc_ms, warm_kernel_ms, _) = timed_increment(src.cuvec)
        t['warmup'] = warm_ms
        t['> create dst'] = warm_alloc_ms
        t['> kernel'] = warm_kernel_ms
    call_ms, (alloc_ms, kernel_ms, res) = timed_increment(src.cuvec)
    t['call ext'] = call_ms
    t['- create dst'] = alloc_ms
    t['- kernel'] = kernel_ms
    t['view'], dst = timer(cu.asarray)(res)

    if not quiet:
        print("\n".join(f"{k.ljust(14)} | {v:.3f}" for k, v in t.items()))
    assert (src + 1 == dst).all()
    # even a fast kernel takes longer than API overhead
    assert kernel_ms / (call_ms - alloc_ms) > 0.5
    # API call should be <0.1 ms... but set a higher threshold of 2 ms
    assert call_ms - alloc_ms - kernel_ms < 2
    return t
|
||
|
||
if __name__ == "__main__":
    # Script mode: one verbose run, then `nruns` quiet runs summarised as
    # mean ± standard error per stage.
    try:
        from tqdm import trange
    except ImportError:
        # tqdm is optional: fall back to a plain range (no progress bar)
        trange = range
    nruns = 1000

    print("# One run:")
    test_perf((1000, 1000))

    # f-string so that the run count is interpolated instead of printing a literal "{nruns}"
    print(f"Repeating & averaging performance test metrics over {nruns} runs.")
    runs = [test_perf((1000, 1000), True) for _ in trange(nruns)]
    pretty = {
        'create src': 'Create input', 'assign': 'Assign', 'call ext': 'Call extension',
        '- create dst': '-- Create output', '- kernel': '-- Launch kernel', 'view': 'View'}
    # regroup the per-run dicts into {label: [value for each run]}
    runs = {pretty[k]: [i[k] for i in runs] for k in runs[0]}
    print("\n".join(f"{k.ljust(16)} | {np.mean(v):.3f} ± {np.std(v, ddof=1)/np.sqrt(len(v)):.3f}"
                    for k, v in runs.items()))