bump version, merge pull request #1 from AMYPAD/devel

AMYPAD · Jan 23, 2021 · dac6aca · dac6aca
2 parents 0c8778b + 576e790
commit dac6aca
Show file tree

Hide file tree

Showing 12 changed files with 206 additions and 50 deletions.
diff --git a/cuvec/CMakeLists.txt b/cuvec/CMakeLists.txt
@@ -60,6 +60,8 @@ install(TARGETS ${PROJECT_NAME} EXPORT ${PROJECT_NAME}Targets
 install(EXPORT ${PROJECT_NAME}Targets FILE AMYPAD${PROJECT_NAME}Targets.cmake
   NAMESPACE AMYPAD:: DESTINATION ${CMAKE_PROJECT_NAME}/cmake)
 
+add_subdirectory(src/example_mod)
+
 # install project
 
 include(CMakePackageConfigHelpers)

diff --git a/cuvec/helpers.py b/cuvec/helpers.py
@@ -72,4 +72,8 @@ def asarray(arr, dtype=None, order=None):
     Returns a `cuvec.CuVec` view of `arr`, avoiding memory copies if possible.
     (`cuvec` equivalent of `numpy.asarray`).
     """
+    if not isinstance(arr, np.ndarray) and is_raw_cuvec(arr):
+        res = CuVec(arr)
+        if dtype is None or res.dtype == np.dtype(dtype):
+            return CuVec(np.asanyarray(res, order=order))
     return CuVec(np.asanyarray(arr, dtype=dtype, order=order))
diff --git a/cuvec/include/cuhelpers.h b/cuvec/include/cuhelpers.h
diff --git a/cuvec/include/cuvec.cuh b/cuvec/include/cuvec.cuh
@@ -5,12 +5,18 @@
 #ifndef _CUVEC_H_
 #define _CUVEC_H_
 
-#include "cuhelpers.h" // HANDLE_ERROR
-#include <cstdio>      // fprintf
-#include <cstdlib>     // std::size_t
-#include <limits>      // std::numeric_limits
-#include <new>         // std::bad_alloc
-#include <vector>      // std::vector
+#include <cstdio>  // fprintf
+#include <cstdlib> // std::size_t
+#include <limits>  // std::numeric_limits
+#include <new>     // std::bad_alloc
+#include <vector>  // std::vector
+
+/// Exit with a message on CUDA failure; `inline` so multiple TUs can include this header.
+inline void HandleError(cudaError_t err, const char *file, int line) {
+  if (err == cudaSuccess) return;
+  fprintf(stderr, "%s in %s at line %d\n", cudaGetErrorString(err), file, line);
+  exit(EXIT_FAILURE);
+}
 
 template <class T> struct CuAlloc {
   typedef T value_type;
@@ -26,7 +32,8 @@ template <class T> struct CuAlloc {
         if (n > std::numeric_limits<std::size_t>::max() / sizeof(T)) throw std::bad_alloc();
 
         T *p;
-        HANDLE_ERROR(cudaMallocManaged(&p, n * sizeof(T))); // p = (T *)malloc(n * sizeof(T));
+        // p = (T *)malloc(n * sizeof(T));
+        HandleError(cudaMallocManaged(&p, n * sizeof(T)), __FILE__, __LINE__);
         if (p) {
           report(p, n);
           return p;
@@ -37,7 +44,7 @@ template <class T> struct CuAlloc {
 
   void deallocate(T *p, std::size_t n) noexcept {
     report(p, n, 0);
-    HANDLE_ERROR(cudaFree(p)); // free(p);
+    HandleError(cudaFree(p), __FILE__, __LINE__); // free(p);
   }
 
 private:

diff --git a/cuvec/include/pycuvec.cuh b/cuvec/include/pycuvec.cuh
@@ -10,11 +10,12 @@
 #define _PYCUVEC_H_
 
 #include "Python.h"
-#include "cuvec.cuh" // CuVec
-#include <cstdlib>   // malloc, free
-#include <sstream>   // std::stringstream
-#include <typeinfo>  // typeid
-#include <vector>    // std::vector
+#include "cuda_fp16.h" // __half
+#include "cuvec.cuh"   // CuVec
+#include <cstdlib>     // malloc, free
+#include <sstream>     // std::stringstream
+#include <typeinfo>    // typeid
+#include <vector>      // std::vector
 
 template <typename T> struct PyType {
   static const char *format() { return typeid(T).name(); }
@@ -51,6 +52,9 @@ template <> struct PyType<long long> {
 template <> struct PyType<unsigned long long> {
   static const char *format() { return "Q"; }
 };
+template <> struct PyType<__half> { // half precision (`__half` from cuda_fp16.h)
+  static const char *format() { return "e"; } // exposed to Python as `Vector_e` (np.float16)
+};
 template <> struct PyType<float> {
   static const char *format() { return "f"; }
 };

diff --git a/cuvec/pycuvec.py b/cuvec/pycuvec.py
@@ -9,6 +9,7 @@
     Vector_B,
     Vector_c,
     Vector_d,
+    Vector_e,
     Vector_f,
     Vector_h,
     Vector_H,
@@ -18,7 +19,8 @@
     Vector_Q,
 )
 
-typecodes = [i for i in array.typecodes if i not in "ulL"]
+# u: non-standard np.dtype('S2'); l/L: inconsistent between `array` and `numpy`
+typecodes = ''.join(i for i in array.typecodes if i not in "ulL") + "e"
 vec_types = {
     np.dtype('int8'): Vector_b,
     np.dtype('uint8'): Vector_B,
@@ -29,6 +31,7 @@
     np.dtype('uint32'): Vector_I,
     np.dtype('int64'): Vector_q,
     np.dtype('uint64'): Vector_Q,
+    np.dtype('float16'): Vector_e,
     np.dtype('float32'): Vector_f,
     np.dtype('float64'): Vector_d}
 

diff --git a/cuvec/src/cuhelpers.cu b/cuvec/src/cuhelpers.cu
diff --git a/cuvec/src/pycuvec.cu → cuvec/src/cuvec.cu b/cuvec/src/pycuvec.cu → cuvec/src/cuvec.cu
@@ -32,6 +32,7 @@ static PyCuVec_tp<int> Vector_i;
 static PyCuVec_tp<unsigned int> Vector_I;
 static PyCuVec_tp<long long> Vector_q;          // _l
 static PyCuVec_tp<unsigned long long> Vector_Q; // _L
+static PyCuVec_tp<__half> Vector_e;
 static PyCuVec_tp<float> Vector_f;
 static PyCuVec_tp<double> Vector_d;
 
@@ -92,6 +93,10 @@ PyMODINIT_FUNC PyInit_cuvec(void) {
   Py_INCREF(&Vector_Q.tp_obj);
   PyModule_AddObject(m, "Vector_L", (PyObject *)&Vector_Q.tp_obj);
 
+  if (PyType_Ready(&Vector_e.tp_obj) < 0) return NULL;
+  Py_INCREF(&Vector_e.tp_obj);
+  PyModule_AddObject(m, Vector_e.name.c_str(), (PyObject *)&Vector_e.tp_obj);
+
   if (PyType_Ready(&Vector_f.tp_obj) < 0) return NULL;
   Py_INCREF(&Vector_f.tp_obj);
   PyModule_AddObject(m, Vector_f.name.c_str(), (PyObject *)&Vector_f.tp_obj);
@@ -108,7 +113,7 @@ PyMODINIT_FUNC PyInit_cuvec(void) {
   if (date == NULL) return NULL;
   PyModule_AddObject(m, "__date__", date);
 
-  PyObject *version = Py_BuildValue("s", "0.2.0");
+  PyObject *version = Py_BuildValue("s", "0.3.0");
   if (version == NULL) return NULL;
   PyModule_AddObject(m, "__version__", version);
 

diff --git a/cuvec/src/example_mod/CMakeLists.txt b/cuvec/src/example_mod/CMakeLists.txt
@@ -0,0 +1,23 @@
+project(example_mod)
+file(GLOB SRC LIST_DIRECTORIES false "*.cu") # all CUDA sources next to this file
+
+include_directories(${Python3_INCLUDE_DIRS}) # needed for "Python.h"
+#include_directories(${Python3_NumPy_INCLUDE_DIRS})
+
+add_library(${PROJECT_NAME} MODULE ${SRC}) # MODULE: a plugin loaded at runtime, not linked to
+target_include_directories(${PROJECT_NAME} PUBLIC # NOTE(review): *_INCLUDE_DIRS presumably set by the parent project -- confirm
+  "$<BUILD_INTERFACE:${${CMAKE_PROJECT_NAME}_INCLUDE_DIRS}>"
+  "$<INSTALL_INTERFACE:${CMAKE_PROJECT_NAME}/include>")
+target_link_libraries(${PROJECT_NAME} ${Python3_LIBRARIES} ${CUDA_LIBRARIES})
+
+if(SKBUILD) # presumably set by scikit-build, which would provide python_extension_module() -- confirm
+  python_extension_module(${PROJECT_NAME})
+endif()
+set_target_properties(${PROJECT_NAME} PROPERTIES
+  CXX_STANDARD 11
+  VERSION ${CMAKE_PROJECT_VERSION} SOVERSION ${CMAKE_PROJECT_VERSION_MAJOR}
+  INTERFACE_${PROJECT_NAME}_MAJOR_VERSION ${CMAKE_PROJECT_VERSION_MAJOR})
+set_property(TARGET ${PROJECT_NAME} APPEND PROPERTY COMPATIBLE_INTERFACE_STRING ${PROJECT_NAME}_MAJOR_VERSION)
+install(TARGETS ${PROJECT_NAME}
+  INCLUDES DESTINATION ${CMAKE_PROJECT_NAME}/include
+  LIBRARY DESTINATION ${CMAKE_PROJECT_NAME}) # relative destinations resolve against the install prefix
diff --git a/cuvec/src/example_mod/example_mod.cu b/cuvec/src/example_mod/example_mod.cu
@@ -0,0 +1,85 @@
+/**
+ * Example external extension module using CuVec.
+ *
+ * Copyright (2021) Casper da Costa-Luis
+ */
+#include "Python.h"
+#include "pycuvec.cuh" // PyCuVec
+/** functions */
+/**
+ * dst = src + 1, element-wise over a row-major array of Y rows by X columns.
+ * Expects a 2D launch covering at least X threads along x and Y threads along y;
+ * surplus threads return without touching memory.
+ */
+__global__ void _d_incr(float *dst, const float *src, int X, int Y) {
+  int x = threadIdx.x + blockDim.x * blockIdx.x;
+  if (x >= X) return;
+  int y = threadIdx.y + blockDim.y * blockIdx.y;
+  if (y >= Y) return;
+  dst[y * X + x] = src[y * X + x] + 1;
+}
+/**
+ * increment_f(src) -> (alloc_ms, kernel_ms, dst), where dst = src + 1.
+ * `src` must be a 2D float cuvec object; it is not type-checked here.
+ * alloc_ms and kernel_ms are CUDA-event timings of creating `dst` and of running the kernel.
+ * Returns NULL with a Python exception set on failure.
+ */
+static PyObject *increment_f(PyObject *self, PyObject *args) {
+  PyCuVec<float> *src;
+  // NOTE(review): "O" accepts any object; passing anything but a float cuvec is undefined
+  if (!PyArg_ParseTuple(args, "O", (PyObject **)&src)) return NULL;
+  std::vector<Py_ssize_t> &N = src->shape;
+  if (N.size() != 2) { // N[0] (rows) and N[1] (columns) are indexed below
+    PyErr_SetString(PyExc_IndexError, "`src` must be 2D");
+    return NULL;
+  }
+
+  cudaEvent_t eStart, eAlloc, eKern;
+  cudaEventCreate(&eStart);
+  cudaEventCreate(&eAlloc);
+  cudaEventCreate(&eKern);
+  cudaEventRecord(eStart);
+  PyCuVec<float> *dst = PyCuVec_zeros_like(src);
+  cudaEventRecord(eAlloc);
+  cudaError_t err = cudaSuccess;
+  if (dst && N[0] > 0 && N[1] > 0) { // an empty grid would be an invalid launch configuration
+    dim3 grid((N[1] + 31) / 32, (N[0] + 31) / 32);
+    dim3 block(32, 32);
+    _d_incr<<<grid, block>>>(dst->vec.data(), src->vec.data(), N[1], N[0]);
+    err = cudaGetLastError(); // kernel launches do not return their errors directly
+  }
+  cudaEventRecord(eKern);
+  if (err == cudaSuccess) err = cudaEventSynchronize(eKern); // wait for (and check) the kernel
+  float alloc_ms = 0, kernel_ms = 0;
+  if (err == cudaSuccess) {
+    cudaEventElapsedTime(&alloc_ms, eStart, eAlloc);
+    cudaEventElapsedTime(&kernel_ms, eAlloc, eKern);
+  }
+  cudaEventDestroy(eStart);
+  cudaEventDestroy(eAlloc);
+  cudaEventDestroy(eKern);
+
+  if (!dst) return PyErr_Occurred() ? NULL : PyErr_NoMemory();
+  if (err != cudaSuccess) {
+    Py_DECREF((PyObject *)dst);
+    PyErr_SetString(PyExc_RuntimeError, cudaGetErrorString(err));
+    return NULL;
+  }
+  // "N" hands our reference to `dst` over to the tuple ("O" would add one and leak `dst`);
+  // assumes PyCuVec_zeros_like returns a new (owned) reference -- confirm
+  return Py_BuildValue("ddN", double(alloc_ms), double(kernel_ms), (PyObject *)dst);
+}
+static PyMethodDef example_methods[] = {
+    {"increment_f", increment_f, METH_VARARGS, "Returns (alloc_ms, kernel_ms, input + 1)."},
+    {NULL, NULL, 0, NULL} // Sentinel
+};
+
+/** module */
+static struct PyModuleDef example_mod = {PyModuleDef_HEAD_INIT,
+                                         "example_mod", // module
+                                         "Example external module.",
+                                         -1, // module keeps state in global variables
+                                         example_methods};
+PyMODINIT_FUNC PyInit_example_mod(void) {
+  Py_Initialize();
+  return PyModule_Create(&example_mod);
+}
diff --git a/tests/test_helpers.py b/tests/test_helpers.py
@@ -49,9 +49,6 @@ def test_CuVec_creation(caplog):
     assert not caplog.record_tuples
     w = cu.CuVec(v)
     assert [i[1:] for i in caplog.record_tuples] == [(10, "new view")]
-    nested = cu.asarray(w.cuvec).cuvec
-    assert nested != w.cuvec, "expected different object"
-    assert np.asarray(nested).data == np.asarray(w.cuvec).data, "expected same data"
 
     caplog.clear()
     assert w[0, 0, 0] == 1
@@ -60,3 +57,19 @@ def test_CuVec_creation(caplog):
     assert v.cuvec is w.cuvec
     assert v.data == w.data
     assert not caplog.record_tuples
+
+
+def test_asarray():
+    v = cu.asarray(np.random.random(shape))
+    w = cu.CuVec(v)  # re-wrapping a CuVec: expect the same underlying cuvec and values
+    assert w.cuvec == v.cuvec
+    assert (w == v).all()
+    assert np.asarray(w.cuvec).data == np.asarray(v.cuvec).data
+    x = cu.asarray(w.cuvec)  # from the underlying `.cuvec` object: again expect the same cuvec
+    assert x.cuvec == v.cuvec
+    assert (x == v).all()
+    assert np.asarray(x.cuvec).data == np.asarray(v.cuvec).data
+    y = cu.asarray(x.tolist())  # from a plain list: expect a different cuvec with equal values
+    assert y.cuvec != v.cuvec
+    assert (y == v).all()
+    assert np.asarray(y.cuvec).data == np.asarray(v.cuvec).data  # NOTE(review): content comparison? confirm
diff --git a/tests/test_perf.py b/tests/test_perf.py
@@ -0,0 +1,72 @@
+from functools import wraps
+from time import time
+
+import numpy as np
+
+import cuvec as cu
+
+
+def _time_overhead():
+    tic = time()
+    pass  # nothing is timed: this measures only the cost of the `time()` calls themselves
+    res = time() - tic
+    return res
+
+
+def timer(func):
+    @wraps(func)
+    def inner(*args, **kwargs):
+        overhead = np.mean([_time_overhead() for _ in range(100)])  # mean cost of timing
+        tic = time()
+        res = func(*args, **kwargs)
+        return (time() - tic - overhead) * 1000, res  # (overhead-corrected ms, func's result)
+
+    return inner
+
+
+def test_perf(shape=(1337, 42), quiet=False):  # returns the dict `t` of timings (ms)
+    # `example_mod` is defined in ../cuvec/src/example_mod/
+    from cuvec.example_mod import increment_f
+
+    overhead = np.mean([_time_overhead() for _ in range(100)])  # mean cost of timing
+    t = {}
+    t['create src'], src = timer(cu.zeros)(shape, "float32")
+
+    rnd = np.random.random(shape)
+    tic = time()
+    src[:] = rnd
+    t['assign'] = (time() - tic - overhead) * 1000
+
+    if not quiet:  # extra first call, recorded separately under the 'warmup' and '>' keys
+        t['warmup'], (t['> create dst'], t['> kernel'], _) = timer(increment_f)(src.cuvec)
+    t['call ext'], (t['- create dst'], t['- kernel'], res) = timer(increment_f)(src.cuvec)
+    t['view'], dst = timer(cu.asarray)(res)  # wrap the object returned by the extension
+
+    if not quiet:
+        print("\n".join(f"{k.ljust(14)} | {v:.3f}" for k, v in t.items()))
+    assert (src + 1 == dst).all()
+    # even a fast kernel takes longer than API overhead
+    assert t['- kernel'] / (t['call ext'] - t['- create dst']) > 0.5
+    # API call should be <0.1 ms... but set a higher threshold of 2 ms
+    assert t['call ext'] - t['- create dst'] - t['- kernel'] < 2
+    return t
+
+
+if __name__ == "__main__":
+    try:
+        from tqdm import trange
+    except ImportError:
+        trange = range  # fall back to the builtin when tqdm is not installed
+    nruns = 1000
+
+    print("# One run:")
+    test_perf((1000, 1000))
+
+    print(f"Repeating & averaging performance test metrics over {nruns} runs.")
+    runs = [test_perf((1000, 1000), True) for _ in trange(nruns)]
+    pretty = {
+        'create src': 'Create input', 'assign': 'Assign', 'call ext': 'Call extension',
+        '- create dst': '-- Create output', '- kernel': '-- Launch kernel', 'view': 'View'}
+    runs = {pretty[k]: [i[k] for i in runs] for k in runs[0]}  # per-run dicts -> per-metric lists
+    print("\n".join(f"{k.ljust(16)} | {np.mean(v):.3f} ± {np.std(v, ddof=1)/np.sqrt(len(v)):.3f}"
+                    for k, v in runs.items()))  # mean ± standard error of the mean