From 80d91083ca2e40df1d1ea6f938c4cecf751d1683 Mon Sep 17 00:00:00 2001
From: Casper da Costa-Luis <casper.dcl@physics.org>
Date: Fri, 22 Jan 2021 16:57:09 +0000
Subject: [PATCH 1/8] add half-precision (fp16)

---
 cuvec/include/pycuvec.cuh | 14 +++++++++-----
 cuvec/pycuvec.py          |  4 +++-
 cuvec/src/pycuvec.cu      |  5 +++++
 3 files changed, 17 insertions(+), 6 deletions(-)
diff --git a/cuvec/include/pycuvec.cuh b/cuvec/include/pycuvec.cuh
index 7224ed4..47591b8 100644
--- a/cuvec/include/pycuvec.cuh
+++ b/cuvec/include/pycuvec.cuh
@@ -10,11 +10,12 @@
 #define _PYCUVEC_H_
 
 #include "Python.h"
-#include "cuvec.cuh" // CuVec
-#include <cstdlib>   // malloc, free
-#include <sstream>   // std::stringstream
-#include <typeinfo>  // typeid
-#include <vector>    // std::vector
+#include "cuda_fp16.h" // __half
+#include "cuvec.cuh"   // CuVec
+#include <cstdlib>     // malloc, free
+#include <sstream>     // std::stringstream
+#include <typeinfo>    // typeid
+#include <vector>      // std::vector
 
 template <typename T> struct PyType {
   static const char *format() { return typeid(T).name(); }
@@ -51,6 +52,9 @@ template <> struct PyType<long long> {
 template <> struct PyType<unsigned long long> {
   static const char *format() { return "Q"; }
 };
+template <> struct PyType<__half> {
+  static const char *format() { return "e"; }
+};
 template <> struct PyType<float> {
   static const char *format() { return "f"; }
 };
diff --git a/cuvec/pycuvec.py b/cuvec/pycuvec.py
index f00d083..ea02a08 100644
--- a/cuvec/pycuvec.py
+++ b/cuvec/pycuvec.py
@@ -9,6 +9,7 @@
     Vector_B,
     Vector_c,
     Vector_d,
+    Vector_e,
     Vector_f,
     Vector_h,
     Vector_H,
@@ -18,7 +19,7 @@
     Vector_Q,
 )
 
-typecodes = [i for i in array.typecodes if i not in "ulL"]
+typecodes = [i for i in array.typecodes if i not in "ulL"] + ["e"]
 vec_types = {
     np.dtype('int8'): Vector_b,
     np.dtype('uint8'): Vector_B,
@@ -29,6 +30,7 @@
     np.dtype('uint32'): Vector_I,
     np.dtype('int64'): Vector_q,
     np.dtype('uint64'): Vector_Q,
+    np.dtype('float16'): Vector_e,
     np.dtype('float32'): Vector_f,
     np.dtype('float64'): Vector_d}
 
diff --git a/cuvec/src/pycuvec.cu b/cuvec/src/pycuvec.cu
index 86045e3..ab8f39f 100644
--- a/cuvec/src/pycuvec.cu
+++ b/cuvec/src/pycuvec.cu
@@ -32,6 +32,7 @@ static PyCuVec_tp<int> Vector_i;
 static PyCuVec_tp<unsigned int> Vector_I;
 static PyCuVec_tp<long long> Vector_q;          // _l
 static PyCuVec_tp<unsigned long long> Vector_Q; // _L
+static PyCuVec_tp<__half> Vector_e;
 static PyCuVec_tp<float> Vector_f;
 static PyCuVec_tp<double> Vector_d;
 
@@ -92,6 +93,10 @@ PyMODINIT_FUNC PyInit_cuvec(void) {
   Py_INCREF(&Vector_Q.tp_obj);
   PyModule_AddObject(m, "Vector_L", (PyObject *)&Vector_Q.tp_obj);
 
+  if (PyType_Ready(&Vector_e.tp_obj) < 0) return NULL;
+  Py_INCREF(&Vector_e.tp_obj);
+  PyModule_AddObject(m, Vector_e.name.c_str(), (PyObject *)&Vector_e.tp_obj);
+
   if (PyType_Ready(&Vector_f.tp_obj) < 0) return NULL;
   Py_INCREF(&Vector_f.tp_obj);
   PyModule_AddObject(m, Vector_f.name.c_str(), (PyObject *)&Vector_f.tp_obj);

From 52200888d2943f4e25654c5b1fc50a547f1c6841 Mon Sep 17 00:00:00 2001
From: Casper da Costa-Luis <casper.dcl@physics.org>
Date: Sat, 23 Jan 2021 03:06:00 +0000
Subject: [PATCH 2/8] tidy typecodes list => string

---
 cuvec/pycuvec.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/cuvec/pycuvec.py b/cuvec/pycuvec.py
index ea02a08..fb92ae4 100644
--- a/cuvec/pycuvec.py
+++ b/cuvec/pycuvec.py
@@ -19,7 +19,8 @@
     Vector_Q,
 )
 
-typecodes = [i for i in array.typecodes if i not in "ulL"] + ["e"]
+# u: non-standard np.dype('S2'); l/L: inconsistent between `array` and `numpy`
+typecodes = ''.join(i for i in array.typecodes if i not in "ulL") + "e"
 vec_types = {
     np.dtype('int8'): Vector_b,
     np.dtype('uint8'): Vector_B,

From fbb3152812ff026e267e5a6cf419e5eabe5f1c8f Mon Sep 17 00:00:00 2001
From: Casper da Costa-Luis <casper.dcl@physics.org>
Date: Sat, 23 Jan 2021 03:05:27 +0000
Subject: [PATCH 3/8] tests: add performance and external module test

---
 cuvec/src/pycuvec.cu | 30 ++++++++++++++++++++++++++++++
 tests/test_perf.py   | 42 ++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 72 insertions(+)
 create mode 100644 tests/test_perf.py

diff --git a/cuvec/src/pycuvec.cu b/cuvec/src/pycuvec.cu
index ab8f39f..3b43407 100644
--- a/cuvec/src/pycuvec.cu
+++ b/cuvec/src/pycuvec.cu
@@ -15,8 +15,38 @@ static PyObject *dev_sync(PyObject *self, PyObject *args) {
   Py_INCREF(Py_None);
   return Py_None;
 }
+/// tests: add 1 and return
+__global__ void _d_incr(float *dst, float *src, int X, int Y) {
+  int x = threadIdx.x + blockDim.x * blockIdx.x;
+  if (x >= X) return;
+  int y = threadIdx.y + blockDim.y * blockIdx.y;
+  if (y >= Y) return;
+  dst[y * X + x] = src[y * X + x] + 1;
+}
+static PyObject *_increment_f(PyObject *self, PyObject *args) {
+  PyCuVec<float> *src;
+  if (!PyArg_ParseTuple(args, "O", (PyObject **)&src)) return NULL;
+  std::vector<Py_ssize_t> &N = src->shape;
+  PyCuVec<float> *dst = PyCuVec_zeros_like(src);
+
+  cudaEvent_t start, stop;
+  cudaEventCreate(&start);
+  cudaEventCreate(&stop);
+  cudaEventRecord(start);
+  dim3 thrds((N[1] + 31) / 32, (N[0] + 31) / 32);
+  dim3 blcks(32, 32);
+  _d_incr<<<thrds, blcks>>>(dst->vec.data(), src->vec.data(), N[1], N[0]);
+  cudaDeviceSynchronize();
+  cudaEventRecord(stop);
+  cudaEventSynchronize(stop);
+  float msec = 0;
+  cudaEventElapsedTime(&msec, start, stop);
+  // fprintf(stderr, "%.5g ms\n", msec);
+  return Py_BuildValue("dO", double(msec), (PyObject *)dst);
+}
 static PyMethodDef cuvec_methods[] = {
     {"dev_sync", dev_sync, METH_NOARGS, "Required before accessing cuvec on host."},
+    {"_increment_f", _increment_f, METH_VARARGS, "Returns the input + 1."},
     {NULL, NULL, 0, NULL} // Sentinel
 };
 
diff --git a/tests/test_perf.py b/tests/test_perf.py
new file mode 100644
index 0000000..cb07461
--- /dev/null
+++ b/tests/test_perf.py
@@ -0,0 +1,42 @@
+from time import time
+
+import numpy as np
+
+import cuvec as cu
+
+
+def _time_overhead():
+    tic = time()
+    pass
+    res = time() - tic
+    return res
+
+
+def timer(func):
+    def inner(*args, **kwargs):
+        overhead = np.mean([_time_overhead() for _ in range(100)])
+        tic = time()
+        res = func(*args, **kwargs)
+        return (time() - tic - overhead) * 1000, res
+
+    return inner
+
+
+def test_perf():
+    overhead = np.mean([_time_overhead() for _ in range(100)])
+    t = {}
+    t['init'], _ = timer(cu.dev_sync)()
+    t['create'], src = timer(cu.zeros)((1337, 42), "float32")
+
+    tic = time()
+    src.flat = range(1337, 42)
+    t['assign'] = (time() - tic - overhead) * 1000
+
+    # `_increment_f` is defined in ../cuvec/src/pycuvec.cu
+    t['call'], (t['kernel'], res) = timer(cu.cuvec._increment_f)(src.cuvec)
+    t['view'], dst = timer(cu.asarray)(res)
+
+    assert (src + 1 == dst).all()
+    print(t)
+    # even a fast kernel takes longer than API overhead
+    assert t['kernel'] / t['call'] >= 0.5

From 3df0c57eed531f3e882f6b9210b74f8a1af88c6b Mon Sep 17 00:00:00 2001
From: Casper da Costa-Luis <casper.dcl@physics.org>
Date: Sat, 23 Jan 2021 04:21:47 +0000
Subject: [PATCH 4/8] tests: update perf timing

---
 cuvec/src/pycuvec.cu | 27 +++++++++++++++------------
 tests/test_perf.py   | 15 +++++++++------
 2 files changed, 24 insertions(+), 18 deletions(-)

diff --git a/cuvec/src/pycuvec.cu b/cuvec/src/pycuvec.cu
index 3b43407..2f37276 100644
--- a/cuvec/src/pycuvec.cu
+++ b/cuvec/src/pycuvec.cu
@@ -27,22 +27,25 @@ static PyObject *_increment_f(PyObject *self, PyObject *args) {
   PyCuVec<float> *src;
   if (!PyArg_ParseTuple(args, "O", (PyObject **)&src)) return NULL;
   std::vector<Py_ssize_t> &N = src->shape;
-  PyCuVec<float> *dst = PyCuVec_zeros_like(src);
 
-  cudaEvent_t start, stop;
-  cudaEventCreate(&start);
-  cudaEventCreate(&stop);
-  cudaEventRecord(start);
+  cudaEvent_t eStart, eAlloc, eKern;
+  cudaEventCreate(&eStart);
+  cudaEventCreate(&eAlloc);
+  cudaEventCreate(&eKern);
+  cudaEventRecord(eStart);
+  PyCuVec<float> *dst = PyCuVec_zeros_like(src);
+  cudaEventRecord(eAlloc);
   dim3 thrds((N[1] + 31) / 32, (N[0] + 31) / 32);
   dim3 blcks(32, 32);
   _d_incr<<<thrds, blcks>>>(dst->vec.data(), src->vec.data(), N[1], N[0]);
-  cudaDeviceSynchronize();
-  cudaEventRecord(stop);
-  cudaEventSynchronize(stop);
-  float msec = 0;
-  cudaEventElapsedTime(&msec, start, stop);
-  // fprintf(stderr, "%.5g ms\n", msec);
-  return Py_BuildValue("dO", double(msec), (PyObject *)dst);
+  // cudaDeviceSynchronize();
+  cudaEventRecord(eKern);
+  cudaEventSynchronize(eKern);
+  float alloc_ms, kernel_ms;
+  cudaEventElapsedTime(&alloc_ms, eStart, eAlloc);
+  cudaEventElapsedTime(&kernel_ms, eAlloc, eKern);
+  // fprintf(stderr, "%.3f ms, %.3f ms\n", alloc_ms, kernel_ms);
+  return Py_BuildValue("ddO", double(alloc_ms), double(kernel_ms), (PyObject *)dst);
 }
 static PyMethodDef cuvec_methods[] = {
     {"dev_sync", dev_sync, METH_NOARGS, "Required before accessing cuvec on host."},
diff --git a/tests/test_perf.py b/tests/test_perf.py
index cb07461..f67b3bb 100644
--- a/tests/test_perf.py
+++ b/tests/test_perf.py
@@ -22,21 +22,24 @@ def inner(*args, **kwargs):
     return inner
 
 
-def test_perf():
+def test_perf(shape=(1337, 42)):
     overhead = np.mean([_time_overhead() for _ in range(100)])
     t = {}
     t['init'], _ = timer(cu.dev_sync)()
-    t['create'], src = timer(cu.zeros)((1337, 42), "float32")
+    t['create'], src = timer(cu.zeros)(shape, "float32")
 
+    rnd = np.random.random(shape)
     tic = time()
-    src.flat = range(1337, 42)
+    src[:] = rnd
     t['assign'] = (time() - tic - overhead) * 1000
 
     # `_increment_f` is defined in ../cuvec/src/pycuvec.cu
-    t['call'], (t['kernel'], res) = timer(cu.cuvec._increment_f)(src.cuvec)
+    t['call'], (t['create out'], t['kernel'], res) = timer(cu.cuvec._increment_f)(src.cuvec)
     t['view'], dst = timer(cu.asarray)(res)
 
-    assert (src + 1 == dst).all()
     print(t)
+    assert (src + 1 == dst).all()
     # even a fast kernel takes longer than API overhead
-    assert t['kernel'] / t['call'] >= 0.5
+    assert t['kernel'] / (t['call'] - t['create out']) > 0.5
+    # API call should be <1 ms
+    assert t['call'] - t['create out'] - t['kernel'] < 1

From 915905667840114806c209a1666fb635b44e5554 Mon Sep 17 00:00:00 2001
From: Casper da Costa-Luis <casper.dcl@physics.org>
Date: Sat, 23 Jan 2021 04:22:09 +0000
Subject: [PATCH 5/8] fix unneeded `asarray` memcopy on raw objects

---
 cuvec/helpers.py      | 4 ++++
 tests/test_helpers.py | 2 +-
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/cuvec/helpers.py b/cuvec/helpers.py
index d262da6..d7fbaa8 100644
--- a/cuvec/helpers.py
+++ b/cuvec/helpers.py
@@ -72,4 +72,8 @@ def asarray(arr, dtype=None, order=None):
     Returns a `cuvec.CuVec` view of `arr`, avoiding memory copies if possible.
     (`cuvec` equivalent of `numpy.asarray`).
     """
+    if not isinstance(arr, np.ndarray): # probably raw pointer
+        res = CuVec(arr)
+        if dtype is None or res.dtype == np.dtype(dtype):
+            return CuVec(np.asanyarray(res, order=order))
     return CuVec(np.asanyarray(arr, dtype=dtype, order=order))
diff --git a/tests/test_helpers.py b/tests/test_helpers.py
index da5cd3d..f60813d 100644
--- a/tests/test_helpers.py
+++ b/tests/test_helpers.py
@@ -50,7 +50,7 @@ def test_CuVec_creation(caplog):
     w = cu.CuVec(v)
     assert [i[1:] for i in caplog.record_tuples] == [(10, "new view")]
     nested = cu.asarray(w.cuvec).cuvec
-    assert nested != w.cuvec, "expected different object"
+    assert nested == w.cuvec, "expected different object"
     assert np.asarray(nested).data == np.asarray(w.cuvec).data, "expected same data"
 
     caplog.clear()

From 9efb921a889435528aa3cc6062cab8597aff7c42 Mon Sep 17 00:00:00 2001
From: Casper da Costa-Luis <casper.dcl@physics.org>
Date: Sat, 23 Jan 2021 12:47:35 +0000
Subject: [PATCH 6/8] tests: add repeat & average perf test script

---
 tests/test_perf.py | 38 +++++++++++++++++++++++++++++++-------
 1 file changed, 31 insertions(+), 7 deletions(-)

diff --git a/tests/test_perf.py b/tests/test_perf.py
index f67b3bb..c0925e6 100644
--- a/tests/test_perf.py
+++ b/tests/test_perf.py
@@ -1,3 +1,4 @@
+from functools import wraps
 from time import time
 
 import numpy as np
@@ -13,6 +14,7 @@ def _time_overhead():
 
 
 def timer(func):
+    @wraps(func)
     def inner(*args, **kwargs):
         overhead = np.mean([_time_overhead() for _ in range(100)])
         tic = time()
@@ -22,11 +24,11 @@ def inner(*args, **kwargs):
     return inner
 
 
-def test_perf(shape=(1337, 42)):
+def test_perf(shape=(1337, 42), quiet=False):
     overhead = np.mean([_time_overhead() for _ in range(100)])
     t = {}
     t['init'], _ = timer(cu.dev_sync)()
-    t['create'], src = timer(cu.zeros)(shape, "float32")
+    t['create src'], src = timer(cu.zeros)(shape, "float32")
 
     rnd = np.random.random(shape)
     tic = time()
@@ -34,12 +36,34 @@ def test_perf(shape=(1337, 42)):
     t['assign'] = (time() - tic - overhead) * 1000
 
     # `_increment_f` is defined in ../cuvec/src/pycuvec.cu
-    t['call'], (t['create out'], t['kernel'], res) = timer(cu.cuvec._increment_f)(src.cuvec)
+    t['call ext'], (t['- create dst'], t['- kernel'],
+                    res) = timer(cu.cuvec._increment_f)(src.cuvec)
     t['view'], dst = timer(cu.asarray)(res)
 
-    print(t)
+    if not quiet:
+        print("\n".join(f"{k.ljust(14)} | {v:.3f}" for k, v in t.items()))
     assert (src + 1 == dst).all()
     # even a fast kernel takes longer than API overhead
-    assert t['kernel'] / (t['call'] - t['create out']) > 0.5
-    # API call should be <1 ms
-    assert t['call'] - t['create out'] - t['kernel'] < 1
+    assert t['- kernel'] / (t['call ext'] - t['- create dst']) > 0.5
+    # API call should be <0.1 ms... but set a higher threshold of 2 ms
+    assert t['call ext'] - t['- create dst'] - t['- kernel'] < 2
+    return t
+
+
+if __name__ == "__main__":
+    try:
+        from tqdm import trange
+    except ImportError:
+        trange = range
+    nruns = 1000
+    print("Repeating & averaging performance test metrics over {nruns} runs.")
+
+    runs = [test_perf((1000, 1000), True) for _ in trange(nruns)]
+    pretty = {
+        'init': 'Initialise', 'create src': 'Create input', 'assign': 'Assign',
+        'call ext': 'Call extension', '- create dst': '-- Create output',
+        '- kernel': '-- Launch kernel', 'view': 'View'}
+    runs = {pretty[k]: [i[k] for i in runs] for k in runs[0]}
+
+    print("\n".join(f"{k.ljust(16)} | {np.mean(v):.3f} ± {np.std(v, ddof=1)/np.sqrt(len(v)):.3f}"
+                    for k, v in runs.items()))

From cfe107b6df7c66c8f2f9665cdb29007312394b34 Mon Sep 17 00:00:00 2001
From: Casper da Costa-Luis <casper.dcl@physics.org>
Date: Sat, 23 Jan 2021 13:14:56 +0000
Subject: [PATCH 7/8] fix and test asarray

---
 cuvec/helpers.py      |  2 +-
 tests/test_helpers.py | 19 ++++++++++++++++---
 2 files changed, 17 insertions(+), 4 deletions(-)

diff --git a/cuvec/helpers.py b/cuvec/helpers.py
index d7fbaa8..8a2f2ee 100644
--- a/cuvec/helpers.py
+++ b/cuvec/helpers.py
@@ -72,7 +72,7 @@ def asarray(arr, dtype=None, order=None):
     Returns a `cuvec.CuVec` view of `arr`, avoiding memory copies if possible.
     (`cuvec` equivalent of `numpy.asarray`).
     """
-    if not isinstance(arr, np.ndarray): # probably raw pointer
+    if not isinstance(arr, np.ndarray) and is_raw_cuvec(arr):
         res = CuVec(arr)
         if dtype is None or res.dtype == np.dtype(dtype):
             return CuVec(np.asanyarray(res, order=order))
diff --git a/tests/test_helpers.py b/tests/test_helpers.py
index f60813d..c08818e 100644
--- a/tests/test_helpers.py
+++ b/tests/test_helpers.py
@@ -49,9 +49,6 @@ def test_CuVec_creation(caplog):
     assert not caplog.record_tuples
     w = cu.CuVec(v)
     assert [i[1:] for i in caplog.record_tuples] == [(10, "new view")]
-    nested = cu.asarray(w.cuvec).cuvec
-    assert nested == w.cuvec, "expected different object"
-    assert np.asarray(nested).data == np.asarray(w.cuvec).data, "expected same data"
 
     caplog.clear()
     assert w[0, 0, 0] == 1
@@ -60,3 +57,19 @@ def test_CuVec_creation(caplog):
     assert v.cuvec is w.cuvec
     assert v.data == w.data
     assert not caplog.record_tuples
+
+
+def test_asarray():
+    v = cu.asarray(np.random.random(shape))
+    w = cu.CuVec(v)
+    assert w.cuvec == v.cuvec
+    assert (w == v).all()
+    assert np.asarray(w.cuvec).data == np.asarray(v.cuvec).data
+    x = cu.asarray(w.cuvec)
+    assert x.cuvec == v.cuvec
+    assert (x == v).all()
+    assert np.asarray(x.cuvec).data == np.asarray(v.cuvec).data
+    y = cu.asarray(x.tolist())
+    assert y.cuvec != v.cuvec
+    assert (y == v).all()
+    assert np.asarray(y.cuvec).data == np.asarray(v.cuvec).data

From 576e790bbda25f363e9982614e2bf72ff29b4d27 Mon Sep 17 00:00:00 2001
From: Casper da Costa-Luis <casper.dcl@physics.org>
Date: Sat, 23 Jan 2021 14:26:28 +0000
Subject: [PATCH 8/8] split external module example, fix includes

---
 cuvec/CMakeLists.txt                 |  2 +
 cuvec/include/cuhelpers.h            | 10 -----
 cuvec/include/cuvec.cuh              | 23 ++++++++----
 cuvec/src/cuhelpers.cu               | 22 -----------
 cuvec/src/{pycuvec.cu => cuvec.cu}   | 35 +-----------------
 cuvec/src/example_mod/CMakeLists.txt | 23 ++++++++++++
 cuvec/src/example_mod/example_mod.cu | 55 ++++++++++++++++++++++++++++
 tests/test_perf.py                   | 21 ++++++-----
 8 files changed, 108 insertions(+), 83 deletions(-)
 delete mode 100644 cuvec/include/cuhelpers.h
 delete mode 100644 cuvec/src/cuhelpers.cu
 rename cuvec/src/{pycuvec.cu => cuvec.cu} (75%)
 create mode 100644 cuvec/src/example_mod/CMakeLists.txt
 create mode 100644 cuvec/src/example_mod/example_mod.cu

diff --git a/cuvec/CMakeLists.txt b/cuvec/CMakeLists.txt
index d04f1e6..ab096b7 100644
--- a/cuvec/CMakeLists.txt
+++ b/cuvec/CMakeLists.txt
@@ -60,6 +60,8 @@ install(TARGETS ${PROJECT_NAME} EXPORT ${PROJECT_NAME}Targets
 install(EXPORT ${PROJECT_NAME}Targets FILE AMYPAD${PROJECT_NAME}Targets.cmake
   NAMESPACE AMYPAD:: DESTINATION ${CMAKE_PROJECT_NAME}/cmake)
 
+add_subdirectory(src/example_mod)
+
 # install project
 
 include(CMakePackageConfigHelpers)
diff --git a/cuvec/include/cuhelpers.h b/cuvec/include/cuhelpers.h
deleted file mode 100644
index 8ad2c29..0000000
--- a/cuvec/include/cuhelpers.h
+++ /dev/null
@@ -1,10 +0,0 @@
-#ifndef _CU_HELPERS_
-#define _CU_HELPERS_
-
-void HandleError(cudaError_t err, const char *file, int line);
-#define HANDLE_ERROR(err) (HandleError(err, __FILE__, __LINE__))
-
-bool PyHandleError(cudaError_t err, const char *file, int line);
-#define HANDLE_PyErr(err) (PyHandleError(err, __FILE__, __LINE__))
-
-#endif // _CU_HELPERS_
diff --git a/cuvec/include/cuvec.cuh b/cuvec/include/cuvec.cuh
index 2529361..e94a566 100644
--- a/cuvec/include/cuvec.cuh
+++ b/cuvec/include/cuvec.cuh
@@ -5,12 +5,18 @@
 #ifndef _CUVEC_H_
 #define _CUVEC_H_
 
-#include "cuhelpers.h" // HANDLE_ERROR
-#include <cstdio>      // fprintf
-#include <cstdlib>     // std::size_t
-#include <limits>      // std::numeric_limits
-#include <new>         // std::bad_alloc
-#include <vector>      // std::vector
+#include <cstdio>  // fprintf
+#include <cstdlib> // std::size_t
+#include <limits>  // std::numeric_limits
+#include <new>     // std::bad_alloc
+#include <vector>  // std::vector
+
+void HandleError(cudaError_t err, const char *file, int line) {
+  if (err != cudaSuccess) {
+    fprintf(stderr, "%s in %s at line %d\n", cudaGetErrorString(err), file, line);
+    exit(EXIT_FAILURE);
+  }
+}
 
 template <class T> struct CuAlloc {
   typedef T value_type;
@@ -26,7 +32,8 @@ template <class T> struct CuAlloc {
         if (n > std::numeric_limits<std::size_t>::max() / sizeof(T)) throw std::bad_alloc();
 
         T *p;
-        HANDLE_ERROR(cudaMallocManaged(&p, n * sizeof(T))); // p = (T *)malloc(n * sizeof(T));
+        // p = (T *)malloc(n * sizeof(T));
+        HandleError(cudaMallocManaged(&p, n * sizeof(T)), __FILE__, __LINE__);
         if (p) {
           report(p, n);
           return p;
@@ -37,7 +44,7 @@ template <class T> struct CuAlloc {
 
   void deallocate(T *p, std::size_t n) noexcept {
     report(p, n, 0);
-    HANDLE_ERROR(cudaFree(p)); // free(p);
+    HandleError(cudaFree(p), __FILE__, __LINE__); // free(p);
   }
 
 private:
diff --git a/cuvec/src/cuhelpers.cu b/cuvec/src/cuhelpers.cu
deleted file mode 100644
index fcf2cda..0000000
--- a/cuvec/src/cuhelpers.cu
+++ /dev/null
@@ -1,22 +0,0 @@
-#include "Python.h"
-#include "cuhelpers.h"
-#include <cstdio>  // printf
-#include <sstream> // std::stringstream
-
-void HandleError(cudaError_t err, const char *file, int line) {
-  if (err != cudaSuccess) {
-    fprintf(stderr, "%s in %s at line %d\n", cudaGetErrorString(err), file, line);
-    exit(EXIT_FAILURE);
-  }
-}
-
-bool PyHandleError(cudaError_t err, const char *file, int line) {
-  std::stringstream ss;
-  ss << file << ':' << line << ": " << cudaGetErrorString(err);
-  std::string s = ss.str();
-  if (err != cudaSuccess) {
-    PyErr_SetString(PyExc_ValueError, s.c_str());
-    return false;
-  }
-  return true;
-}
diff --git a/cuvec/src/pycuvec.cu b/cuvec/src/cuvec.cu
similarity index 75%
rename from cuvec/src/pycuvec.cu
rename to cuvec/src/cuvec.cu
index 2f37276..6f4ae48 100644
--- a/cuvec/src/pycuvec.cu
+++ b/cuvec/src/cuvec.cu
@@ -15,41 +15,8 @@ static PyObject *dev_sync(PyObject *self, PyObject *args) {
   Py_INCREF(Py_None);
   return Py_None;
 }
-/// tests: add 1 and return
-__global__ void _d_incr(float *dst, float *src, int X, int Y) {
-  int x = threadIdx.x + blockDim.x * blockIdx.x;
-  if (x >= X) return;
-  int y = threadIdx.y + blockDim.y * blockIdx.y;
-  if (y >= Y) return;
-  dst[y * X + x] = src[y * X + x] + 1;
-}
-static PyObject *_increment_f(PyObject *self, PyObject *args) {
-  PyCuVec<float> *src;
-  if (!PyArg_ParseTuple(args, "O", (PyObject **)&src)) return NULL;
-  std::vector<Py_ssize_t> &N = src->shape;
-
-  cudaEvent_t eStart, eAlloc, eKern;
-  cudaEventCreate(&eStart);
-  cudaEventCreate(&eAlloc);
-  cudaEventCreate(&eKern);
-  cudaEventRecord(eStart);
-  PyCuVec<float> *dst = PyCuVec_zeros_like(src);
-  cudaEventRecord(eAlloc);
-  dim3 thrds((N[1] + 31) / 32, (N[0] + 31) / 32);
-  dim3 blcks(32, 32);
-  _d_incr<<<thrds, blcks>>>(dst->vec.data(), src->vec.data(), N[1], N[0]);
-  // cudaDeviceSynchronize();
-  cudaEventRecord(eKern);
-  cudaEventSynchronize(eKern);
-  float alloc_ms, kernel_ms;
-  cudaEventElapsedTime(&alloc_ms, eStart, eAlloc);
-  cudaEventElapsedTime(&kernel_ms, eAlloc, eKern);
-  // fprintf(stderr, "%.3f ms, %.3f ms\n", alloc_ms, kernel_ms);
-  return Py_BuildValue("ddO", double(alloc_ms), double(kernel_ms), (PyObject *)dst);
-}
 static PyMethodDef cuvec_methods[] = {
     {"dev_sync", dev_sync, METH_NOARGS, "Required before accessing cuvec on host."},
-    {"_increment_f", _increment_f, METH_VARARGS, "Returns the input + 1."},
     {NULL, NULL, 0, NULL} // Sentinel
 };
 
@@ -146,7 +113,7 @@ PyMODINIT_FUNC PyInit_cuvec(void) {
   if (date == NULL) return NULL;
   PyModule_AddObject(m, "__date__", date);
 
-  PyObject *version = Py_BuildValue("s", "0.2.0");
+  PyObject *version = Py_BuildValue("s", "0.3.0");
   if (version == NULL) return NULL;
   PyModule_AddObject(m, "__version__", version);
 
diff --git a/cuvec/src/example_mod/CMakeLists.txt b/cuvec/src/example_mod/CMakeLists.txt
new file mode 100644
index 0000000..3322036
--- /dev/null
+++ b/cuvec/src/example_mod/CMakeLists.txt
@@ -0,0 +1,23 @@
+project(example_mod)
+file(GLOB SRC LIST_DIRECTORIES false "*.cu")
+
+include_directories(${Python3_INCLUDE_DIRS})
+#include_directories(${Python3_NumPy_INCLUDE_DIRS})
+
+add_library(${PROJECT_NAME} MODULE ${SRC})
+target_include_directories(${PROJECT_NAME} PUBLIC
+  "$<BUILD_INTERFACE:${${CMAKE_PROJECT_NAME}_INCLUDE_DIRS}>"
+  "$<INSTALL_INTERFACE:${CMAKE_PROJECT_NAME}/include>")
+target_link_libraries(${PROJECT_NAME} ${Python3_LIBRARIES} ${CUDA_LIBRARIES})
+
+if(SKBUILD)
+  python_extension_module(${PROJECT_NAME})
+endif()
+set_target_properties(${PROJECT_NAME} PROPERTIES
+  CXX_STANDARD 11
+  VERSION ${CMAKE_PROJECT_VERSION} SOVERSION ${CMAKE_PROJECT_VERSION_MAJOR}
+  INTERFACE_${PROJECT_NAME}_MAJOR_VERSION ${CMAKE_PROJECT_VERSION_MAJOR})
+set_property(TARGET ${PROJECT_NAME} APPEND PROPERTY COMPATIBLE_INTERFACE_STRING ${PROJECT_NAME}_MAJOR_VERSION)
+install(TARGETS ${PROJECT_NAME}
+  INCLUDES DESTINATION ${CMAKE_PROJECT_NAME}/include
+  LIBRARY DESTINATION ${CMAKE_PROJECT_NAME})
diff --git a/cuvec/src/example_mod/example_mod.cu b/cuvec/src/example_mod/example_mod.cu
new file mode 100644
index 0000000..539836f
--- /dev/null
+++ b/cuvec/src/example_mod/example_mod.cu
@@ -0,0 +1,55 @@
+/**
+ * Example external extension module using CuVec.
+ *
+ * Copyright (2021) Casper da Costa-Luis
+ */
+#include "Python.h"
+#include "pycuvec.cuh" // PyCuVec
+/** functions */
+/// dst = src + 1
+__global__ void _d_incr(float *dst, float *src, int X, int Y) {
+  int x = threadIdx.x + blockDim.x * blockIdx.x;
+  if (x >= X) return;
+  int y = threadIdx.y + blockDim.y * blockIdx.y;
+  if (y >= Y) return;
+  dst[y * X + x] = src[y * X + x] + 1;
+}
+static PyObject *increment_f(PyObject *self, PyObject *args) {
+  PyCuVec<float> *src;
+  if (!PyArg_ParseTuple(args, "O", (PyObject **)&src)) return NULL;
+  std::vector<Py_ssize_t> &N = src->shape;
+
+  cudaEvent_t eStart, eAlloc, eKern;
+  cudaEventCreate(&eStart);
+  cudaEventCreate(&eAlloc);
+  cudaEventCreate(&eKern);
+  cudaEventRecord(eStart);
+  PyCuVec<float> *dst = PyCuVec_zeros_like(src);
+  cudaEventRecord(eAlloc);
+  dim3 thrds((N[1] + 31) / 32, (N[0] + 31) / 32);
+  dim3 blcks(32, 32);
+  _d_incr<<<thrds, blcks>>>(dst->vec.data(), src->vec.data(), N[1], N[0]);
+  // cudaDeviceSynchronize();
+  cudaEventRecord(eKern);
+  cudaEventSynchronize(eKern);
+  float alloc_ms, kernel_ms;
+  cudaEventElapsedTime(&alloc_ms, eStart, eAlloc);
+  cudaEventElapsedTime(&kernel_ms, eAlloc, eKern);
+  // fprintf(stderr, "%.3f ms, %.3f ms\n", alloc_ms, kernel_ms);
+  return Py_BuildValue("ddO", double(alloc_ms), double(kernel_ms), (PyObject *)dst);
+}
+static PyMethodDef example_methods[] = {
+    {"increment_f", increment_f, METH_VARARGS, "Returns (alloc_ms, kernel_ms, input + 1)."},
+    {NULL, NULL, 0, NULL} // Sentinel
+};
+
+/** module */
+static struct PyModuleDef example_mod = {PyModuleDef_HEAD_INIT,
+                                         "example_mod", // module
+                                         "Example external module.",
+                                         -1, // module keeps state in global variables
+                                         example_methods};
+PyMODINIT_FUNC PyInit_example_mod(void) {
+  Py_Initialize();
+  return PyModule_Create(&example_mod);
+}
diff --git a/tests/test_perf.py b/tests/test_perf.py
index c0925e6..652ad20 100644
--- a/tests/test_perf.py
+++ b/tests/test_perf.py
@@ -25,9 +25,11 @@ def inner(*args, **kwargs):
 
 
 def test_perf(shape=(1337, 42), quiet=False):
+    # `example_mod` is defined in ../cuvec/src/example_mod/
+    from cuvec.example_mod import increment_f
+
     overhead = np.mean([_time_overhead() for _ in range(100)])
     t = {}
-    t['init'], _ = timer(cu.dev_sync)()
     t['create src'], src = timer(cu.zeros)(shape, "float32")
 
     rnd = np.random.random(shape)
@@ -35,9 +37,9 @@ def test_perf(shape=(1337, 42), quiet=False):
     src[:] = rnd
     t['assign'] = (time() - tic - overhead) * 1000
 
-    # `_increment_f` is defined in ../cuvec/src/pycuvec.cu
-    t['call ext'], (t['- create dst'], t['- kernel'],
-                    res) = timer(cu.cuvec._increment_f)(src.cuvec)
+    if not quiet:
+        t['warmup'], (t['> create dst'], t['> kernel'], _) = timer(increment_f)(src.cuvec)
+    t['call ext'], (t['- create dst'], t['- kernel'], res) = timer(increment_f)(src.cuvec)
     t['view'], dst = timer(cu.asarray)(res)
 
     if not quiet:
@@ -56,14 +58,15 @@ def test_perf(shape=(1337, 42), quiet=False):
     except ImportError:
         trange = range
     nruns = 1000
-    print("Repeating & averaging performance test metrics over {nruns} runs.")
 
+    print("# One run:")
+    test_perf((1000, 1000))
+
+    print("Repeating & averaging performance test metrics over {nruns} runs.")
     runs = [test_perf((1000, 1000), True) for _ in trange(nruns)]
     pretty = {
-        'init': 'Initialise', 'create src': 'Create input', 'assign': 'Assign',
-        'call ext': 'Call extension', '- create dst': '-- Create output',
-        '- kernel': '-- Launch kernel', 'view': 'View'}
+        'create src': 'Create input', 'assign': 'Assign', 'call ext': 'Call extension',
+        '- create dst': '-- Create output', '- kernel': '-- Launch kernel', 'view': 'View'}
     runs = {pretty[k]: [i[k] for i in runs] for k in runs[0]}
-
     print("\n".join(f"{k.ljust(16)} | {np.mean(v):.3f} ± {np.std(v, ddof=1)/np.sqrt(len(v)):.3f}"
                     for k, v in runs.items()))