From 80d91083ca2e40df1d1ea6f938c4cecf751d1683 Mon Sep 17 00:00:00 2001 From: Casper da Costa-Luis Date: Fri, 22 Jan 2021 16:57:09 +0000 Subject: [PATCH 1/8] add half-precision (fp16) --- cuvec/include/pycuvec.cuh | 14 +++++++++----- cuvec/pycuvec.py | 4 +++- cuvec/src/pycuvec.cu | 5 +++++ 3 files changed, 17 insertions(+), 6 deletions(-) diff --git a/cuvec/include/pycuvec.cuh b/cuvec/include/pycuvec.cuh index 7224ed4..47591b8 100644 --- a/cuvec/include/pycuvec.cuh +++ b/cuvec/include/pycuvec.cuh @@ -10,11 +10,12 @@ #define _PYCUVEC_H_ #include "Python.h" -#include "cuvec.cuh" // CuVec -#include // malloc, free -#include // std::stringstream -#include // typeid -#include // std::vector +#include "cuda_fp16.h" // __half +#include "cuvec.cuh" // CuVec +#include // malloc, free +#include // std::stringstream +#include // typeid +#include // std::vector template struct PyType { static const char *format() { return typeid(T).name(); } @@ -51,6 +52,9 @@ template <> struct PyType { template <> struct PyType { static const char *format() { return "Q"; } }; +template <> struct PyType<__half> { + static const char *format() { return "e"; } +}; template <> struct PyType { static const char *format() { return "f"; } }; diff --git a/cuvec/pycuvec.py b/cuvec/pycuvec.py index f00d083..ea02a08 100644 --- a/cuvec/pycuvec.py +++ b/cuvec/pycuvec.py @@ -9,6 +9,7 @@ Vector_B, Vector_c, Vector_d, + Vector_e, Vector_f, Vector_h, Vector_H, @@ -18,7 +19,7 @@ Vector_Q, ) -typecodes = [i for i in array.typecodes if i not in "ulL"] +typecodes = [i for i in array.typecodes if i not in "ulL"] + ["e"] vec_types = { np.dtype('int8'): Vector_b, np.dtype('uint8'): Vector_B, @@ -29,6 +30,7 @@ np.dtype('uint32'): Vector_I, np.dtype('int64'): Vector_q, np.dtype('uint64'): Vector_Q, + np.dtype('float16'): Vector_e, np.dtype('float32'): Vector_f, np.dtype('float64'): Vector_d} diff --git a/cuvec/src/pycuvec.cu b/cuvec/src/pycuvec.cu index 86045e3..ab8f39f 100644 --- a/cuvec/src/pycuvec.cu +++ b/cuvec/src/pycuvec.cu @@ -32,6 +32,7 @@ static PyCuVec_tp Vector_i; static PyCuVec_tp Vector_I; static PyCuVec_tp Vector_q; // _l static PyCuVec_tp Vector_Q; // _L +static PyCuVec_tp<__half> Vector_e; static PyCuVec_tp Vector_f; static PyCuVec_tp Vector_d; @@ -92,6 +93,10 @@ PyMODINIT_FUNC PyInit_cuvec(void) { Py_INCREF(&Vector_Q.tp_obj); PyModule_AddObject(m, "Vector_L", (PyObject *)&Vector_Q.tp_obj); + if (PyType_Ready(&Vector_e.tp_obj) < 0) return NULL; + Py_INCREF(&Vector_e.tp_obj); + PyModule_AddObject(m, Vector_e.name.c_str(), (PyObject *)&Vector_e.tp_obj); + if (PyType_Ready(&Vector_f.tp_obj) < 0) return NULL; Py_INCREF(&Vector_f.tp_obj); PyModule_AddObject(m, Vector_f.name.c_str(), (PyObject *)&Vector_f.tp_obj); From 52200888d2943f4e25654c5b1fc50a547f1c6841 Mon Sep 17 00:00:00 2001 From: Casper da Costa-Luis Date: Sat, 23 Jan 2021 03:06:00 +0000 Subject: [PATCH 2/8] tidy typecodes list => string --- cuvec/pycuvec.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cuvec/pycuvec.py b/cuvec/pycuvec.py index ea02a08..fb92ae4 100644 --- a/cuvec/pycuvec.py +++ b/cuvec/pycuvec.py @@ -19,7 +19,8 @@ Vector_Q, ) -typecodes = [i for i in array.typecodes if i not in "ulL"] + ["e"] +# u: non-standard np.dype('S2'); l/L: inconsistent between `array` and `numpy` +typecodes = ''.join(i for i in array.typecodes if i not in "ulL") + "e" vec_types = { np.dtype('int8'): Vector_b, np.dtype('uint8'): Vector_B, From fbb3152812ff026e267e5a6cf419e5eabe5f1c8f Mon Sep 17 00:00:00 2001 From: Casper da Costa-Luis Date: Sat, 23 Jan 2021 03:05:27 +0000 Subject: [PATCH 3/8] tests: add performance and external module test --- cuvec/src/pycuvec.cu | 30 ++++++++++++++++++++++++++++++ tests/test_perf.py | 42 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 72 insertions(+) create mode 100644 tests/test_perf.py diff --git a/cuvec/src/pycuvec.cu b/cuvec/src/pycuvec.cu index ab8f39f..3b43407 100644 --- a/cuvec/src/pycuvec.cu +++ b/cuvec/src/pycuvec.cu @@ -15,8 +15,38 @@ static PyObject *dev_sync(PyObject *self, PyObject *args) { Py_INCREF(Py_None); return Py_None; } +/// tests: add 1 and return +__global__ void _d_incr(float *dst, float *src, int X, int Y) { + int x = threadIdx.x + blockDim.x * blockIdx.x; + if (x >= X) return; + int y = threadIdx.y + blockDim.y * blockIdx.y; + if (y >= Y) return; + dst[y * X + x] = src[y * X + x] + 1; +} +static PyObject *_increment_f(PyObject *self, PyObject *args) { + PyCuVec *src; + if (!PyArg_ParseTuple(args, "O", (PyObject **)&src)) return NULL; + std::vector &N = src->shape; + PyCuVec *dst = PyCuVec_zeros_like(src); + + cudaEvent_t start, stop; + cudaEventCreate(&start); + cudaEventCreate(&stop); + cudaEventRecord(start); + dim3 thrds((N[1] + 31) / 32, (N[0] + 31) / 32); + dim3 blcks(32, 32); + _d_incr<<>>(dst->vec.data(), src->vec.data(), N[1], N[0]); + cudaDeviceSynchronize(); + cudaEventRecord(stop); + cudaEventSynchronize(stop); + float msec = 0; + cudaEventElapsedTime(&msec, start, stop); + // fprintf(stderr, "%.5g ms\n", msec); + return Py_BuildValue("dO", double(msec), (PyObject *)dst); +} static PyMethodDef cuvec_methods[] = { {"dev_sync", dev_sync, METH_NOARGS, "Required before accessing cuvec on host."}, + {"_increment_f", _increment_f, METH_VARARGS, "Returns the input + 1."}, {NULL, NULL, 0, NULL} // Sentinel }; diff --git a/tests/test_perf.py b/tests/test_perf.py new file mode 100644 index 0000000..cb07461 --- /dev/null +++ b/tests/test_perf.py @@ -0,0 +1,42 @@ +from time import time + +import numpy as np + +import cuvec as cu + + +def _time_overhead(): + tic = time() + pass + res = time() - tic + return res + + +def timer(func): + def inner(*args, **kwargs): + overhead = np.mean([_time_overhead() for _ in range(100)]) + tic = time() + res = func(*args, **kwargs) + return (time() - tic - overhead) * 1000, res + + return inner + + +def test_perf(): + overhead = np.mean([_time_overhead() for _ in range(100)]) + t = {} + t['init'], _ = timer(cu.dev_sync)() + t['create'], src = timer(cu.zeros)((1337, 42), "float32") + + tic = time() + src.flat = range(1337, 42) + t['assign'] = (time() - tic - overhead) * 1000 + + # `_increment_f` is defined in ../cuvec/src/pycuvec.cu + t['call'], (t['kernel'], res) = timer(cu.cuvec._increment_f)(src.cuvec) + t['view'], dst = timer(cu.asarray)(res) + + assert (src + 1 == dst).all() + print(t) + # even a fast kernel takes longer than API overhead + assert t['kernel'] / t['call'] >= 0.5 From 3df0c57eed531f3e882f6b9210b74f8a1af88c6b Mon Sep 17 00:00:00 2001 From: Casper da Costa-Luis Date: Sat, 23 Jan 2021 04:21:47 +0000 Subject: [PATCH 4/8] tests: update perf timing --- cuvec/src/pycuvec.cu | 27 +++++++++++++++------------ tests/test_perf.py | 15 +++++++++------ 2 files changed, 24 insertions(+), 18 deletions(-) diff --git a/cuvec/src/pycuvec.cu b/cuvec/src/pycuvec.cu index 3b43407..2f37276 100644 --- a/cuvec/src/pycuvec.cu +++ b/cuvec/src/pycuvec.cu @@ -27,22 +27,25 @@ static PyObject *_increment_f(PyObject *self, PyObject *args) { PyCuVec *src; if (!PyArg_ParseTuple(args, "O", (PyObject **)&src)) return NULL; std::vector &N = src->shape; - PyCuVec *dst = PyCuVec_zeros_like(src); - cudaEvent_t start, stop; - cudaEventCreate(&start); - cudaEventCreate(&stop); - cudaEventRecord(start); + cudaEvent_t eStart, eAlloc, eKern; + cudaEventCreate(&eStart); + cudaEventCreate(&eAlloc); + cudaEventCreate(&eKern); + cudaEventRecord(eStart); + PyCuVec *dst = PyCuVec_zeros_like(src); + cudaEventRecord(eAlloc); dim3 thrds((N[1] + 31) / 32, (N[0] + 31) / 32); dim3 blcks(32, 32); _d_incr<<>>(dst->vec.data(), src->vec.data(), N[1], N[0]); - cudaDeviceSynchronize(); - cudaEventRecord(stop); - cudaEventSynchronize(stop); - float msec = 0; - cudaEventElapsedTime(&msec, start, stop); - // fprintf(stderr, "%.5g ms\n", msec); - return Py_BuildValue("dO", double(msec), (PyObject *)dst); + // cudaDeviceSynchronize(); + cudaEventRecord(eKern); + cudaEventSynchronize(eKern); + float alloc_ms, kernel_ms; + cudaEventElapsedTime(&alloc_ms, eStart, eAlloc); + cudaEventElapsedTime(&kernel_ms, eAlloc, eKern); + // fprintf(stderr, "%.3f ms, %.3f ms\n", alloc_ms, kernel_ms); + return Py_BuildValue("ddO", double(alloc_ms), double(kernel_ms), (PyObject *)dst); } static PyMethodDef cuvec_methods[] = { {"dev_sync", dev_sync, METH_NOARGS, "Required before accessing cuvec on host."}, diff --git a/tests/test_perf.py b/tests/test_perf.py index cb07461..f67b3bb 100644 --- a/tests/test_perf.py +++ b/tests/test_perf.py @@ -22,21 +22,24 @@ def inner(*args, **kwargs): return inner -def test_perf(): +def test_perf(shape=(1337, 42)): overhead = np.mean([_time_overhead() for _ in range(100)]) t = {} t['init'], _ = timer(cu.dev_sync)() - t['create'], src = timer(cu.zeros)((1337, 42), "float32") + t['create'], src = timer(cu.zeros)(shape, "float32") + rnd = np.random.random(shape) tic = time() - src.flat = range(1337, 42) + src[:] = rnd t['assign'] = (time() - tic - overhead) * 1000 # `_increment_f` is defined in ../cuvec/src/pycuvec.cu - t['call'], (t['kernel'], res) = timer(cu.cuvec._increment_f)(src.cuvec) + t['call'], (t['create out'], t['kernel'], res) = timer(cu.cuvec._increment_f)(src.cuvec) t['view'], dst = timer(cu.asarray)(res) - assert (src + 1 == dst).all() print(t) + assert (src + 1 == dst).all() # even a fast kernel takes longer than API overhead - assert t['kernel'] / t['call'] >= 0.5 + assert t['kernel'] / (t['call'] - t['create out']) > 0.5 + # API call should be <1 ms + assert t['call'] - t['create out'] - t['kernel'] < 1 From 915905667840114806c209a1666fb635b44e5554 Mon Sep 17 00:00:00 2001 From: Casper da Costa-Luis Date: Sat, 23 Jan 2021 04:22:09 +0000 Subject: [PATCH 5/8] fix unneeded `asarray` memcopy on raw objects --- cuvec/helpers.py | 4 ++++ tests/test_helpers.py | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/cuvec/helpers.py b/cuvec/helpers.py index d262da6..d7fbaa8 100644 --- a/cuvec/helpers.py +++ b/cuvec/helpers.py @@ -72,4 +72,8 @@ def asarray(arr, dtype=None, order=None): Returns a `cuvec.CuVec` view of `arr`, avoiding memory copies if possible. (`cuvec` equivalent of `numpy.asarray`). """ + if not isinstance(arr, np.ndarray): # probably raw pointer + res = CuVec(arr) + if dtype is None or res.dtype == np.dtype(dtype): + return CuVec(np.asanyarray(res, order=order)) return CuVec(np.asanyarray(arr, dtype=dtype, order=order)) diff --git a/tests/test_helpers.py b/tests/test_helpers.py index da5cd3d..f60813d 100644 --- a/tests/test_helpers.py +++ b/tests/test_helpers.py @@ -50,7 +50,7 @@ def test_CuVec_creation(caplog): w = cu.CuVec(v) assert [i[1:] for i in caplog.record_tuples] == [(10, "new view")] nested = cu.asarray(w.cuvec).cuvec - assert nested != w.cuvec, "expected different object" + assert nested == w.cuvec, "expected different object" assert np.asarray(nested).data == np.asarray(w.cuvec).data, "expected same data" caplog.clear() From 9efb921a889435528aa3cc6062cab8597aff7c42 Mon Sep 17 00:00:00 2001 From: Casper da Costa-Luis Date: Sat, 23 Jan 2021 12:47:35 +0000 Subject: [PATCH 6/8] tests: add repeat & average perf test script --- tests/test_perf.py | 38 +++++++++++++++++++++++++++++++------- 1 file changed, 31 insertions(+), 7 deletions(-) diff --git a/tests/test_perf.py b/tests/test_perf.py index f67b3bb..c0925e6 100644 --- a/tests/test_perf.py +++ b/tests/test_perf.py @@ -1,3 +1,4 @@ +from functools import wraps from time import time import numpy as np @@ -13,6 +14,7 @@ def _time_overhead(): def timer(func): + @wraps(func) def inner(*args, **kwargs): overhead = np.mean([_time_overhead() for _ in range(100)]) tic = time() @@ -22,11 +24,11 @@ def inner(*args, **kwargs): return inner -def test_perf(shape=(1337, 42)): +def test_perf(shape=(1337, 42), quiet=False): overhead = np.mean([_time_overhead() for _ in range(100)]) t = {} t['init'], _ = timer(cu.dev_sync)() - t['create'], src = timer(cu.zeros)(shape, "float32") + t['create src'], src = timer(cu.zeros)(shape, "float32") rnd = np.random.random(shape) tic = time() @@ -34,12 +36,34 @@ def test_perf(shape=(1337, 42)): t['assign'] = (time() - tic - overhead) * 1000 # `_increment_f` is defined in ../cuvec/src/pycuvec.cu - t['call'], (t['create out'], t['kernel'], res) = timer(cu.cuvec._increment_f)(src.cuvec) + t['call ext'], (t['- create dst'], t['- kernel'], + res) = timer(cu.cuvec._increment_f)(src.cuvec) t['view'], dst = timer(cu.asarray)(res) - print(t) + if not quiet: + print("\n".join(f"{k.ljust(14)} | {v:.3f}" for k, v in t.items())) assert (src + 1 == dst).all() # even a fast kernel takes longer than API overhead - assert t['kernel'] / (t['call'] - t['create out']) > 0.5 - # API call should be <1 ms - assert t['call'] - t['create out'] - t['kernel'] < 1 + assert t['- kernel'] / (t['call ext'] - t['- create dst']) > 0.5 + # API call should be <0.1 ms... but set a higher threshold of 2 ms + assert t['call ext'] - t['- create dst'] - t['- kernel'] < 2 + return t + + +if __name__ == "__main__": + try: + from tqdm import trange + except ImportError: + trange = range + nruns = 1000 + print("Repeating & averaging performance test metrics over {nruns} runs.") + + runs = [test_perf((1000, 1000), True) for _ in trange(nruns)] + pretty = { + 'init': 'Initialise', 'create src': 'Create input', 'assign': 'Assign', + 'call ext': 'Call extension', '- create dst': '-- Create output', + '- kernel': '-- Launch kernel', 'view': 'View'} + runs = {pretty[k]: [i[k] for i in runs] for k in runs[0]} + + print("\n".join(f"{k.ljust(16)} | {np.mean(v):.3f} ± {np.std(v, ddof=1)/np.sqrt(len(v)):.3f}" + for k, v in runs.items())) From cfe107b6df7c66c8f2f9665cdb29007312394b34 Mon Sep 17 00:00:00 2001 From: Casper da Costa-Luis Date: Sat, 23 Jan 2021 13:14:56 +0000 Subject: [PATCH 7/8] fix and test asarray --- cuvec/helpers.py | 2 +- tests/test_helpers.py | 19 ++++++++++++++++--- 2 files changed, 17 insertions(+), 4 deletions(-) diff --git a/cuvec/helpers.py b/cuvec/helpers.py index d7fbaa8..8a2f2ee 100644 --- a/cuvec/helpers.py +++ b/cuvec/helpers.py @@ -72,7 +72,7 @@ def asarray(arr, dtype=None, order=None): Returns a `cuvec.CuVec` view of `arr`, avoiding memory copies if possible. (`cuvec` equivalent of `numpy.asarray`). """ - if not isinstance(arr, np.ndarray): # probably raw pointer + if not isinstance(arr, np.ndarray) and is_raw_cuvec(arr): res = CuVec(arr) if dtype is None or res.dtype == np.dtype(dtype): return CuVec(np.asanyarray(res, order=order)) diff --git a/tests/test_helpers.py b/tests/test_helpers.py index f60813d..c08818e 100644 --- a/tests/test_helpers.py +++ b/tests/test_helpers.py @@ -49,9 +49,6 @@ def test_CuVec_creation(caplog): assert not caplog.record_tuples w = cu.CuVec(v) assert [i[1:] for i in caplog.record_tuples] == [(10, "new view")] - nested = cu.asarray(w.cuvec).cuvec - assert nested == w.cuvec, "expected different object" - assert np.asarray(nested).data == np.asarray(w.cuvec).data, "expected same data" caplog.clear() assert w[0, 0, 0] == 1 @@ -60,3 +57,19 @@ def test_CuVec_creation(caplog): assert v.cuvec is w.cuvec assert v.data == w.data assert not caplog.record_tuples + + +def test_asarray(): + v = cu.asarray(np.random.random(shape)) + w = cu.CuVec(v) + assert w.cuvec == v.cuvec + assert (w == v).all() + assert np.asarray(w.cuvec).data == np.asarray(v.cuvec).data + x = cu.asarray(w.cuvec) + assert x.cuvec == v.cuvec + assert (x == v).all() + assert np.asarray(x.cuvec).data == np.asarray(v.cuvec).data + y = cu.asarray(x.tolist()) + assert y.cuvec != v.cuvec + assert (y == v).all() + assert np.asarray(y.cuvec).data == np.asarray(v.cuvec).data From 576e790bbda25f363e9982614e2bf72ff29b4d27 Mon Sep 17 00:00:00 2001 From: Casper da Costa-Luis Date: Sat, 23 Jan 2021 14:26:28 +0000 Subject: [PATCH 8/8] split external module example, fix includes --- cuvec/CMakeLists.txt | 2 + cuvec/include/cuhelpers.h | 10 ----- cuvec/include/cuvec.cuh | 23 ++++++++---- cuvec/src/cuhelpers.cu | 22 ----------- cuvec/src/{pycuvec.cu => cuvec.cu} | 35 +----------------- cuvec/src/example_mod/CMakeLists.txt | 23 ++++++++++++ cuvec/src/example_mod/example_mod.cu | 55 ++++++++++++++++++++++++++++ tests/test_perf.py | 21 ++++++----- 8 files changed, 108 insertions(+), 83 deletions(-) delete mode 100644 cuvec/include/cuhelpers.h delete mode 100644 cuvec/src/cuhelpers.cu rename cuvec/src/{pycuvec.cu => cuvec.cu} (75%) create mode 100644 cuvec/src/example_mod/CMakeLists.txt create mode 100644 cuvec/src/example_mod/example_mod.cu diff --git a/cuvec/CMakeLists.txt b/cuvec/CMakeLists.txt index d04f1e6..ab096b7 100644 --- a/cuvec/CMakeLists.txt +++ b/cuvec/CMakeLists.txt @@ -60,6 +60,8 @@ install(TARGETS ${PROJECT_NAME} EXPORT ${PROJECT_NAME}Targets install(EXPORT ${PROJECT_NAME}Targets FILE AMYPAD${PROJECT_NAME}Targets.cmake NAMESPACE AMYPAD:: DESTINATION ${CMAKE_PROJECT_NAME}/cmake) +add_subdirectory(src/example_mod) + # install project include(CMakePackageConfigHelpers) diff --git a/cuvec/include/cuhelpers.h b/cuvec/include/cuhelpers.h deleted file mode 100644 index 8ad2c29..0000000 --- a/cuvec/include/cuhelpers.h +++ /dev/null @@ -1,10 +0,0 @@ -#ifndef _CU_HELPERS_ -#define _CU_HELPERS_ - -void HandleError(cudaError_t err, const char *file, int line); -#define HANDLE_ERROR(err) (HandleError(err, __FILE__, __LINE__)) - -bool PyHandleError(cudaError_t err, const char *file, int line); -#define HANDLE_PyErr(err) (PyHandleError(err, __FILE__, __LINE__)) - -#endif // _CU_HELPERS_ diff --git a/cuvec/include/cuvec.cuh b/cuvec/include/cuvec.cuh index 2529361..e94a566 100644 --- a/cuvec/include/cuvec.cuh +++ b/cuvec/include/cuvec.cuh @@ -5,12 +5,18 @@ #ifndef _CUVEC_H_ #define _CUVEC_H_ -#include "cuhelpers.h" // HANDLE_ERROR -#include // fprintf -#include // std::size_t -#include // std::numeric_limits -#include // std::bad_alloc -#include // std::vector +#include // fprintf +#include // std::size_t +#include // std::numeric_limits +#include // std::bad_alloc +#include // std::vector + +void HandleError(cudaError_t err, const char *file, int line) { + if (err != cudaSuccess) { + fprintf(stderr, "%s in %s at line %d\n", cudaGetErrorString(err), file, line); + exit(EXIT_FAILURE); + } +} template struct CuAlloc { typedef T value_type; @@ -26,7 +32,8 @@ template struct CuAlloc { if (n > std::numeric_limits::max() / sizeof(T)) throw std::bad_alloc(); T *p; - HANDLE_ERROR(cudaMallocManaged(&p, n * sizeof(T))); // p = (T *)malloc(n * sizeof(T)); + // p = (T *)malloc(n * sizeof(T)); + HandleError(cudaMallocManaged(&p, n * sizeof(T)), __FILE__, __LINE__); if (p) { report(p, n); return p; @@ -37,7 +44,7 @@ template struct CuAlloc { void deallocate(T *p, std::size_t n) noexcept { report(p, n, 0); - HANDLE_ERROR(cudaFree(p)); // free(p); + HandleError(cudaFree(p), __FILE__, __LINE__); // free(p); } private: diff --git a/cuvec/src/cuhelpers.cu b/cuvec/src/cuhelpers.cu deleted file mode 100644 index fcf2cda..0000000 --- a/cuvec/src/cuhelpers.cu +++ /dev/null @@ -1,22 +0,0 @@ -#include "Python.h" -#include "cuhelpers.h" -#include // printf -#include // std::stringstream - -void HandleError(cudaError_t err, const char *file, int line) { - if (err != cudaSuccess) { - fprintf(stderr, "%s in %s at line %d\n", cudaGetErrorString(err), file, line); - exit(EXIT_FAILURE); - } -} - -bool PyHandleError(cudaError_t err, const char *file, int line) { - std::stringstream ss; - ss << file << ':' << line << ": " << cudaGetErrorString(err); - std::string s = ss.str(); - if (err != cudaSuccess) { - PyErr_SetString(PyExc_ValueError, s.c_str()); - return false; - } - return true; -} diff --git a/cuvec/src/pycuvec.cu b/cuvec/src/cuvec.cu similarity index 75% rename from cuvec/src/pycuvec.cu rename to cuvec/src/cuvec.cu index 2f37276..6f4ae48 100644 --- a/cuvec/src/pycuvec.cu +++ b/cuvec/src/cuvec.cu @@ -15,41 +15,8 @@ static PyObject *dev_sync(PyObject *self, PyObject *args) { Py_INCREF(Py_None); return Py_None; } -/// tests: add 1 and return -__global__ void _d_incr(float *dst, float *src, int X, int Y) { - int x = threadIdx.x + blockDim.x * blockIdx.x; - if (x >= X) return; - int y = threadIdx.y + blockDim.y * blockIdx.y; - if (y >= Y) return; - dst[y * X + x] = src[y * X + x] + 1; -} -static PyObject *_increment_f(PyObject *self, PyObject *args) { - PyCuVec *src; - if (!PyArg_ParseTuple(args, "O", (PyObject **)&src)) return NULL; - std::vector &N = src->shape; - - cudaEvent_t eStart, eAlloc, eKern; - cudaEventCreate(&eStart); - cudaEventCreate(&eAlloc); - cudaEventCreate(&eKern); - cudaEventRecord(eStart); - PyCuVec *dst = PyCuVec_zeros_like(src); - cudaEventRecord(eAlloc); - dim3 thrds((N[1] + 31) / 32, (N[0] + 31) / 32); - dim3 blcks(32, 32); - _d_incr<<>>(dst->vec.data(), src->vec.data(), N[1], N[0]); - // cudaDeviceSynchronize(); - cudaEventRecord(eKern); - cudaEventSynchronize(eKern); - float alloc_ms, kernel_ms; - cudaEventElapsedTime(&alloc_ms, eStart, eAlloc); - cudaEventElapsedTime(&kernel_ms, eAlloc, eKern); - // fprintf(stderr, "%.3f ms, %.3f ms\n", alloc_ms, kernel_ms); - return Py_BuildValue("ddO", double(alloc_ms), double(kernel_ms), (PyObject *)dst); -} static PyMethodDef cuvec_methods[] = { {"dev_sync", dev_sync, METH_NOARGS, "Required before accessing cuvec on host."}, - {"_increment_f", _increment_f, METH_VARARGS, "Returns the input + 1."}, {NULL, NULL, 0, NULL} // Sentinel }; @@ -146,7 +113,7 @@ PyMODINIT_FUNC PyInit_cuvec(void) { if (date == NULL) return NULL; PyModule_AddObject(m, "__date__", date); - PyObject *version = Py_BuildValue("s", "0.2.0"); + PyObject *version = Py_BuildValue("s", "0.3.0"); if (version == NULL) return NULL; PyModule_AddObject(m, "__version__", version); diff --git a/cuvec/src/example_mod/CMakeLists.txt b/cuvec/src/example_mod/CMakeLists.txt new file mode 100644 index 0000000..3322036 --- /dev/null +++ b/cuvec/src/example_mod/CMakeLists.txt @@ -0,0 +1,23 @@ +project(example_mod) +file(GLOB SRC LIST_DIRECTORIES false "*.cu") + +include_directories(${Python3_INCLUDE_DIRS}) +#include_directories(${Python3_NumPy_INCLUDE_DIRS}) + +add_library(${PROJECT_NAME} MODULE ${SRC}) +target_include_directories(${PROJECT_NAME} PUBLIC + "$" + "$") +target_link_libraries(${PROJECT_NAME} ${Python3_LIBRARIES} ${CUDA_LIBRARIES}) + +if(SKBUILD) + python_extension_module(${PROJECT_NAME}) +endif() +set_target_properties(${PROJECT_NAME} PROPERTIES + CXX_STANDARD 11 + VERSION ${CMAKE_PROJECT_VERSION} SOVERSION ${CMAKE_PROJECT_VERSION_MAJOR} + INTERFACE_${PROJECT_NAME}_MAJOR_VERSION ${CMAKE_PROJECT_VERSION_MAJOR}) +set_property(TARGET ${PROJECT_NAME} APPEND PROPERTY COMPATIBLE_INTERFACE_STRING ${PROJECT_NAME}_MAJOR_VERSION) +install(TARGETS ${PROJECT_NAME} + INCLUDES DESTINATION ${CMAKE_PROJECT_NAME}/include + LIBRARY DESTINATION ${CMAKE_PROJECT_NAME}) diff --git a/cuvec/src/example_mod/example_mod.cu b/cuvec/src/example_mod/example_mod.cu new file mode 100644 index 0000000..539836f --- /dev/null +++ b/cuvec/src/example_mod/example_mod.cu @@ -0,0 +1,55 @@ +/** + * Example external extension module using CuVec. + * + * Copyright (2021) Casper da Costa-Luis + */ +#include "Python.h" +#include "pycuvec.cuh" // PyCuVec +/** functions */ +/// dst = src + 1 +__global__ void _d_incr(float *dst, float *src, int X, int Y) { + int x = threadIdx.x + blockDim.x * blockIdx.x; + if (x >= X) return; + int y = threadIdx.y + blockDim.y * blockIdx.y; + if (y >= Y) return; + dst[y * X + x] = src[y * X + x] + 1; +} +static PyObject *increment_f(PyObject *self, PyObject *args) { + PyCuVec *src; + if (!PyArg_ParseTuple(args, "O", (PyObject **)&src)) return NULL; + std::vector &N = src->shape; + + cudaEvent_t eStart, eAlloc, eKern; + cudaEventCreate(&eStart); + cudaEventCreate(&eAlloc); + cudaEventCreate(&eKern); + cudaEventRecord(eStart); + PyCuVec *dst = PyCuVec_zeros_like(src); + cudaEventRecord(eAlloc); + dim3 thrds((N[1] + 31) / 32, (N[0] + 31) / 32); + dim3 blcks(32, 32); + _d_incr<<>>(dst->vec.data(), src->vec.data(), N[1], N[0]); + // cudaDeviceSynchronize(); + cudaEventRecord(eKern); + cudaEventSynchronize(eKern); + float alloc_ms, kernel_ms; + cudaEventElapsedTime(&alloc_ms, eStart, eAlloc); + cudaEventElapsedTime(&kernel_ms, eAlloc, eKern); + // fprintf(stderr, "%.3f ms, %.3f ms\n", alloc_ms, kernel_ms); + return Py_BuildValue("ddO", double(alloc_ms), double(kernel_ms), (PyObject *)dst); +} +static PyMethodDef example_methods[] = { + {"increment_f", increment_f, METH_VARARGS, "Returns (alloc_ms, kernel_ms, input + 1)."}, + {NULL, NULL, 0, NULL} // Sentinel +}; + +/** module */ +static struct PyModuleDef example_mod = {PyModuleDef_HEAD_INIT, + "example_mod", // module + "Example external module.", + -1, // module keeps state in global variables + example_methods}; +PyMODINIT_FUNC PyInit_example_mod(void) { + Py_Initialize(); + return PyModule_Create(&example_mod); +} diff --git a/tests/test_perf.py b/tests/test_perf.py index c0925e6..652ad20 100644 --- a/tests/test_perf.py +++ b/tests/test_perf.py @@ -25,9 +25,11 @@ def inner(*args, **kwargs): def test_perf(shape=(1337, 42), quiet=False): + # `example_mod` is defined in ../cuvec/src/example_mod/ + from cuvec.example_mod import increment_f + overhead = np.mean([_time_overhead() for _ in range(100)]) t = {} - t['init'], _ = timer(cu.dev_sync)() t['create src'], src = timer(cu.zeros)(shape, "float32") rnd = np.random.random(shape) @@ -35,9 +37,9 @@ def test_perf(shape=(1337, 42), quiet=False): src[:] = rnd t['assign'] = (time() - tic - overhead) * 1000 - # `_increment_f` is defined in ../cuvec/src/pycuvec.cu - t['call ext'], (t['- create dst'], t['- kernel'], - res) = timer(cu.cuvec._increment_f)(src.cuvec) + if not quiet: + t['warmup'], (t['> create dst'], t['> kernel'], _) = timer(increment_f)(src.cuvec) + t['call ext'], (t['- create dst'], t['- kernel'], res) = timer(increment_f)(src.cuvec) t['view'], dst = timer(cu.asarray)(res) if not quiet: @@ -56,14 +58,15 @@ def test_perf(shape=(1337, 42), quiet=False): except ImportError: trange = range nruns = 1000 - print("Repeating & averaging performance test metrics over {nruns} runs.") + print("# One run:") + test_perf((1000, 1000)) + + print("Repeating & averaging performance test metrics over {nruns} runs.") runs = [test_perf((1000, 1000), True) for _ in trange(nruns)] pretty = { - 'init': 'Initialise', 'create src': 'Create input', 'assign': 'Assign', - 'call ext': 'Call extension', '- create dst': '-- Create output', - '- kernel': '-- Launch kernel', 'view': 'View'} + 'create src': 'Create input', 'assign': 'Assign', 'call ext': 'Call extension', + '- create dst': '-- Create output', '- kernel': '-- Launch kernel', 'view': 'View'} runs = {pretty[k]: [i[k] for i in runs] for k in runs[0]} - print("\n".join(f"{k.ljust(16)} | {np.mean(v):.3f} ± {np.std(v, ddof=1)/np.sqrt(len(v)):.3f}" for k, v in runs.items()))