From 0ebdd3bc5beb7adfbdd604844353cd75f1b6cffc Mon Sep 17 00:00:00 2001
From: Cory Perry
Date: Fri, 24 Jan 2020 14:18:51 -0800
Subject: [PATCH] Add cuda-vmm examples

---
 posts/cuda-vmm/Makefile        |  38 ++++
 posts/cuda-vmm/cuvector.cpp    | 359 +++++++++++++++++++++++++++++++++
 posts/cuda-vmm/cuvector.h      | 144 +++++++++++++
 posts/cuda-vmm/sync_main.cu    | 167 +++++++++++++++
 posts/cuda-vmm/vector_main.cpp | 173 ++++++++++++++++
 5 files changed, 881 insertions(+)
 create mode 100644 posts/cuda-vmm/Makefile
 create mode 100644 posts/cuda-vmm/cuvector.cpp
 create mode 100644 posts/cuda-vmm/cuvector.h
 create mode 100644 posts/cuda-vmm/sync_main.cu
 create mode 100644 posts/cuda-vmm/vector_main.cpp

diff --git a/posts/cuda-vmm/Makefile b/posts/cuda-vmm/Makefile
new file mode 100644
index 0000000..8e6ea59
--- /dev/null
+++ b/posts/cuda-vmm/Makefile
@@ -0,0 +1,38 @@
+# Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#  * Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#  * Neither the name of NVIDIA CORPORATION nor the names of its
+#    contributors may be used to endorse or promote products derived
+#    from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+NVCC ?= nvcc
+
+all: vector_example sync_example
+
+vector_example: vector_main.cpp cuvector.cpp
+	$(NVCC) $^ -o $@ -lcuda -std=c++11
+
+sync_example: sync_main.cu
+	$(NVCC) $^ -o $@ -lcuda -std=c++11
+
+clean:
+	$(RM) vector_example sync_example
diff --git a/posts/cuda-vmm/cuvector.cpp b/posts/cuda-vmm/cuvector.cpp
new file mode 100644
index 0000000..f8ef0d9
--- /dev/null
+++ b/posts/cuda-vmm/cuvector.cpp
@@ -0,0 +1,359 @@
+/* Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <cuda.h>
+#include <cassert>
+#include "cuvector.h"
+
+// **************
+// VectorMemAlloc
+// **************
+
+namespace cuda_utils {
+
+VectorMemAlloc::VectorMemAlloc(CUcontext context) : ctx(context), d_p(0ULL), alloc_sz(0ULL)
+{
+
+}
+
+VectorMemAlloc::~VectorMemAlloc()
+{
+    (void)cuMemFree(d_p);
+}
+
+// Although we're not supposed to "commit" memory in a reserve call,
+// doing so for this sample demonstrates why reserve is so important
+CUresult
+VectorMemAlloc::reserve(size_t new_sz)
+{
+    CUresult status = CUDA_SUCCESS;
+    CUdeviceptr new_ptr = 0ULL;
+    CUcontext prev_ctx;
+
+    if (new_sz <= alloc_sz) {
+        return CUDA_SUCCESS;
+    }
+    (void)cuCtxGetCurrent(&prev_ctx);
+    // Make sure we allocate on the correct context
+    if ((status = cuCtxSetCurrent(ctx)) != CUDA_SUCCESS) {
+        return status;
+    }
+    // Allocate the bigger buffer
+    if ((status = cuMemAlloc(&new_ptr, new_sz)) == CUDA_SUCCESS) {
+        // Copy over the bigger buffer. We'll explicitly use the per thread
+        // stream to ensure we don't add false dependencies on other threads
+        // using the null stream, but we may have issues with other prior
+        // work on this stream. Luckily, that's not the case in our sample.
+        //
+        // We only want to copy over the alloc_sz here, as that's what's
+        // actually committed at the moment
+        if ((status = cuMemcpyAsync(new_ptr, d_p, alloc_sz, CU_STREAM_PER_THREAD)) == CUDA_SUCCESS) {
+            // Free the smaller buffer. We don't need to synchronize
+            // CU_STREAM_PER_THREAD, since cuMemFree synchronizes for us
+            (void)cuMemFree(d_p);
+            d_p = new_ptr;
+            alloc_sz = new_sz;
+        }
+        else {
+            // Failed to copy the bigger buffer, free the smaller one
+            (void)cuMemFree(new_ptr);
+        }
+    }
+    // Make sure to always return to the previous context the caller had
+    (void)cuCtxSetCurrent(prev_ctx);
+
+    return status;
+}
+
+// *********************
+// VectorMemAllocManaged
+// *********************
+
+VectorMemAllocManaged::VectorMemAllocManaged(CUcontext context) : ctx(context), dev(CU_DEVICE_INVALID), d_p(0ULL),
+        alloc_sz(0ULL), reserve_sz(0ULL)
+{
+    CUcontext prev_ctx;
+    (void)cuCtxGetCurrent(&prev_ctx);
+    if (cuCtxSetCurrent(context) == CUDA_SUCCESS) {
+        (void)cuCtxGetDevice(&dev);
+        (void)cuCtxSetCurrent(prev_ctx);
+    }
+}
+
+VectorMemAllocManaged::~VectorMemAllocManaged()
+{
+    (void)cuMemFree(d_p);
+}
+
+CUresult
+VectorMemAllocManaged::reserve(size_t new_sz)
+{
+    CUresult status = CUDA_SUCCESS;
+    CUcontext prev_ctx;
+    CUdeviceptr new_ptr = 0ULL;
+
+    if (new_sz <= reserve_sz) {
+        return CUDA_SUCCESS;
+    }
+
+    (void)cuCtxGetCurrent(&prev_ctx);
+    if ((status = cuCtxSetCurrent(ctx)) != CUDA_SUCCESS) {
+        return status;
+    }
+
+    // Allocate the bigger buffer
+    if ((status = cuMemAllocManaged(&new_ptr, new_sz, CU_MEM_ATTACH_GLOBAL)) == CUDA_SUCCESS) {
+        // Set the preferred location for this managed allocation, to bias
+        // any migration requests ("pinning" it under most circumstances to
+        // the requested device)
+        (void)cuMemAdvise(new_ptr, new_sz, CU_MEM_ADVISE_SET_PREFERRED_LOCATION, dev);
+        // Copy over the bigger buffer. We'll explicitly use the per thread
+        // stream to ensure we don't add false dependencies on other threads
+        // using the null stream, but we may have issues with other prior
+        // work on this stream. Luckily, that's not the case in our sample.
+        //
+        // We only want to copy over the alloc_sz here, as that's what's
+        // actually committed at the moment
+        if (alloc_sz > 0) {
+            if ((status = cuMemcpyAsync(new_ptr, d_p, alloc_sz, CU_STREAM_PER_THREAD)) == CUDA_SUCCESS) {
+                // Free the smaller buffer. We don't need to synchronize
+                // CU_STREAM_PER_THREAD, since cuMemFree synchronizes for us
+                (void)cuMemFree(d_p);
+            }
+            else {
+                // Failed to copy the bigger buffer, free the smaller one
+                (void)cuMemFree(new_ptr);
+            }
+        }
+        if (status == CUDA_SUCCESS) {
+            d_p = new_ptr;
+            reserve_sz = new_sz;
+        }
+    }
+
+    // Make sure to always return to the previous context the caller had
+    (void)cuCtxSetCurrent(prev_ctx);
+
+    return status;
+}
+
+// Actually commits num bytes of additional memory
+CUresult
+VectorMemAllocManaged::grow(size_t new_sz)
+{
+    CUresult status = CUDA_SUCCESS;
+    CUcontext prev_ctx;
+
+    if (new_sz <= alloc_sz) {
+        return CUDA_SUCCESS;
+    }
+    if ((status = reserve(new_sz)) != CUDA_SUCCESS) {
+        return status;
+    }
+
+    (void)cuCtxGetCurrent(&prev_ctx);
+    // Make sure we allocate on the correct context
+    if ((status = cuCtxSetCurrent(ctx)) != CUDA_SUCCESS) {
+        return status;
+    }
+    // Actually commit the needed memory
+    // We explicitly use the per thread stream here to ensure we're not
+    // conflicting with other uses of the null stream from other threads
+    if ((status = cuMemPrefetchAsync(d_p + alloc_sz, (new_sz - alloc_sz), dev,
+                                     CU_STREAM_PER_THREAD)) == CUDA_SUCCESS) {
+        // Not completely necessary, but will ensure the prefetch is complete
+        // and prevent future runtime faults. Also makes for a more fair
+        // benchmark comparison
+        if ((status = cuStreamSynchronize(CU_STREAM_PER_THREAD)) == CUDA_SUCCESS) {
+            alloc_sz = new_sz;
+        }
+    }
+    // Make sure to always return to the previous context the caller had
+    (void)cuCtxSetCurrent(prev_ctx);
+    return status;
+}
+
+// *********************
+// VectorMemMap
+// *********************
+
+VectorMemMap::VectorMemMap(CUcontext context) : d_p(0ULL), prop(), handles(), alloc_sz(0ULL), reserve_sz(0ULL), chunk_sz(0ULL)
+{
+    CUdevice device;
+    CUcontext prev_ctx;
+    CUresult status = CUDA_SUCCESS;
+    (void)status;
+
+    status = cuCtxGetCurrent(&prev_ctx);
+    assert(status == CUDA_SUCCESS);
+    if (cuCtxSetCurrent(context) == CUDA_SUCCESS) {
+        status = cuCtxGetDevice(&device);
+        assert(status == CUDA_SUCCESS);
+        status = cuCtxSetCurrent(prev_ctx);
+        assert(status == CUDA_SUCCESS);
+    }
+
+    prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
+    prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
+    prop.location.id = (int)device;
+    prop.win32HandleMetaData = NULL;
+
+    accessDesc.location = prop.location;
+    accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
+
+    status = cuMemGetAllocationGranularity(&chunk_sz, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM);
+    assert(status == CUDA_SUCCESS);
+}
+
+VectorMemMap::~VectorMemMap()
+{
+    CUresult status = CUDA_SUCCESS;
+    (void)status;
+    if (d_p != 0ULL) {
+        status = cuMemUnmap(d_p, alloc_sz);
+        assert(status == CUDA_SUCCESS);
+        for (size_t i = 0ULL; i < va_ranges.size(); i++) {
+            status = cuMemAddressFree(va_ranges[i].start, va_ranges[i].sz);
+            assert(status == CUDA_SUCCESS);
+        }
+        for (size_t i = 0ULL; i < handles.size(); i++) {
+            status = cuMemRelease(handles[i]);
+            assert(status == CUDA_SUCCESS);
+        }
+    }
+}
+
+CUresult
+VectorMemMap::reserve(size_t new_sz)
+{
+    CUresult status = CUDA_SUCCESS;
+    CUdeviceptr new_ptr = 0ULL;
+
+    if (new_sz <= reserve_sz) {
+        return CUDA_SUCCESS;
+    }
+
+    const size_t aligned_sz = ((new_sz + chunk_sz - 1) / chunk_sz) * chunk_sz;
+
+    // Try to reserve an address just after what we already have reserved
+    status = cuMemAddressReserve(&new_ptr, (aligned_sz - reserve_sz), 0ULL, d_p + reserve_sz, 0ULL);
+
+    if (status != CUDA_SUCCESS || (new_ptr != d_p + reserve_sz)) {
+        if (new_ptr != 0ULL) {
+            (void)cuMemAddressFree(new_ptr, (aligned_sz - reserve_sz));
+        }
+        // Slow path - try to find a new address reservation big enough for us
+        status = cuMemAddressReserve(&new_ptr, aligned_sz, 0ULL, 0U, 0);
+        if (status == CUDA_SUCCESS && d_p != 0ULL) {
+            CUdeviceptr ptr = new_ptr;
+            // Found one, now unmap our previous allocations
+            status = cuMemUnmap(d_p, alloc_sz);
+            assert(status == CUDA_SUCCESS);
+            for (size_t i = 0ULL; i < handles.size(); i++) {
+                const size_t hdl_sz = handle_sizes[i];
+                // And remap them, enabling their access
+                if ((status = cuMemMap(ptr, hdl_sz, 0ULL, handles[i], 0ULL)) != CUDA_SUCCESS)
+                    break;
+                if ((status = cuMemSetAccess(ptr, hdl_sz, &accessDesc, 1ULL)) != CUDA_SUCCESS)
+                    break;
+                ptr += hdl_sz;
+            }
+            if (status != CUDA_SUCCESS) {
+                // Failed the mapping somehow... clean up!
+                status = cuMemUnmap(new_ptr, aligned_sz);
+                assert(status == CUDA_SUCCESS);
+                status = cuMemAddressFree(new_ptr, aligned_sz);
+                assert(status == CUDA_SUCCESS);
+            }
+            else {
+                // Clean up our old VA reservations!
+                for (size_t i = 0ULL; i < va_ranges.size(); i++) {
+                    (void)cuMemAddressFree(va_ranges[i].start, va_ranges[i].sz);
+                }
+                va_ranges.clear();
+            }
+        }
+        // Assuming everything went well, update everything
+        if (status == CUDA_SUCCESS) {
+            Range r;
+            d_p = new_ptr;
+            reserve_sz = aligned_sz;
+            r.start = new_ptr;
+            r.sz = aligned_sz;
+            va_ranges.push_back(r);
+        }
+    }
+    else {
+        Range r;
+        r.start = new_ptr;
+        r.sz = aligned_sz - reserve_sz;
+        va_ranges.push_back(r);
+        if (d_p == 0ULL) {
+            d_p = new_ptr;
+        }
+        reserve_sz = aligned_sz;
+    }
+
+    return status;
+}
+
+CUresult
+VectorMemMap::grow(size_t new_sz)
+{
+    CUresult status = CUDA_SUCCESS;
+    CUmemGenericAllocationHandle handle;
+    if (new_sz <= alloc_sz) {
+        return CUDA_SUCCESS;
+    }
+
+    const size_t size_diff = new_sz - alloc_sz;
+    // Round up to the next chunk size
+    const size_t sz = ((size_diff + chunk_sz - 1) / chunk_sz) * chunk_sz;
+
+    if ((status = reserve(alloc_sz + sz)) != CUDA_SUCCESS) {
+        return status;
+    }
+
+    if ((status = cuMemCreate(&handle, sz, &prop, 0)) == CUDA_SUCCESS) {
+        if ((status = cuMemMap(d_p + alloc_sz, sz, 0ULL, handle, 0ULL)) == CUDA_SUCCESS) {
+            if ((status = cuMemSetAccess(d_p + alloc_sz, sz, &accessDesc, 1ULL)) == CUDA_SUCCESS) {
+                handles.push_back(handle);
+                handle_sizes.push_back(sz);
+                alloc_sz += sz;
+            }
+            if (status != CUDA_SUCCESS) {
+                (void)cuMemUnmap(d_p + alloc_sz, sz);
+            }
+        }
+        if (status != CUDA_SUCCESS) {
+            (void)cuMemRelease(handle);
+        }
+    }
+
+    return status;
+}
+
+}
diff --git a/posts/cuda-vmm/cuvector.h b/posts/cuda-vmm/cuvector.h
new file mode 100644
index 0000000..47b1167
--- /dev/null
+++ b/posts/cuda-vmm/cuvector.h
@@ -0,0 +1,144 @@
+/* Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#pragma once
+#include <cuda.h>
+#include <vector>
+
+namespace cuda_utils {
+
+// Typed wrapper class
+template <class T, class VectorAllocator>
+class Vector {
+    VectorAllocator allocator;
+public:
+    Vector(CUcontext ctx) : allocator(ctx) {}
+
+    CUresult reserve(size_t num) {
+        return allocator.reserve(num * sizeof(T));
+    }
+
+    CUresult grow(size_t num) {
+        return allocator.grow(num * sizeof(T));
+    }
+
+    T *getPointer() const {
+        return (T*)allocator.getPointer();
+    }
+
+    size_t getSize() const {
+        return allocator.getSize();
+    }
+};
+
+class VectorMemAlloc {
+private:
+    CUcontext ctx;
+    CUdeviceptr d_p;
+    size_t alloc_sz;
+public:
+    VectorMemAlloc(CUcontext context);
+    ~VectorMemAlloc();
+
+    CUdeviceptr getPointer() const {
+        return d_p;
+    }
+
+    size_t getSize() const {
+        return alloc_sz;
+    }
+
+    // Reserves some extra space in order to speed up grow()
+    CUresult reserve(size_t new_sz);
+
+    // Actually commits num bytes of additional memory
+    CUresult grow(size_t new_sz) {
+        return reserve(new_sz);
+    }
+};
+
+class VectorMemAllocManaged {
+private:
+    CUcontext ctx;
+    CUdevice dev;
+    CUdeviceptr d_p;
+    size_t alloc_sz;
+    size_t reserve_sz;
+
+public:
+    VectorMemAllocManaged(CUcontext context);
+    ~VectorMemAllocManaged();
+
+    CUdeviceptr getPointer() const {
+        return d_p;
+    }
+
+    size_t getSize() const {
+        return alloc_sz;
+    }
+
+    // Reserves some extra space in order to speed up grow()
+    CUresult reserve(size_t new_sz);
+
+    // Actually commits num bytes of additional memory
+    CUresult grow(size_t new_sz);
+};
+
+class VectorMemMap {
+private:
+    CUdeviceptr d_p;
+    CUmemAllocationProp prop;
+    CUmemAccessDesc accessDesc;
+    struct Range {
+        CUdeviceptr start;
+        size_t sz;
+    };
+    std::vector<Range> va_ranges;
+    std::vector<CUmemGenericAllocationHandle> handles;
+    std::vector<size_t> handle_sizes;
+    size_t alloc_sz;
+    size_t reserve_sz;
+    size_t chunk_sz;
+public:
+    VectorMemMap(CUcontext context);
+    ~VectorMemMap();
+
+    CUdeviceptr getPointer() const {
+        return d_p;
+    }
+
+    size_t getSize() const {
+        return alloc_sz;
+    }
+
+    // Reserves some extra space in order to speed up grow()
+    CUresult reserve(size_t new_sz);
+
+    // Actually commits num bytes of additional memory
+    CUresult grow(size_t new_sz);
+};
+
+}
diff --git a/posts/cuda-vmm/sync_main.cu b/posts/cuda-vmm/sync_main.cu
new file mode 100644
index 0000000..f052cc9
--- /dev/null
+++ b/posts/cuda-vmm/sync_main.cu
@@ -0,0 +1,167 @@
+/* Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <cuda.h>
+#include <iostream>
+#include <thread>
+#include <atomic>
+#include <vector>
+
+static inline void
+checkRtError(cudaError_t res, const char *tok, const char *file, unsigned line)
+{
+    if (res != cudaSuccess) {
+        std::cerr << file << ':' << line << ' ' << tok
+                  << " failed (" << (unsigned)res << "): " << cudaGetErrorString(res) << std::endl;
+        abort();
+    }
+}
+
+#define CHECK_RT(x) checkRtError(x, #x, __FILE__, __LINE__);
+
+static inline void
+checkDrvError(CUresult res, const char *tok, const char *file, unsigned line)
+{
+    if (res != CUDA_SUCCESS) {
+        const char *errStr = NULL;
+        (void)cuGetErrorString(res, &errStr);
+        std::cerr << file << ':' << line << ' ' << tok
+                  << " failed (" << (unsigned)res << "): " << errStr << std::endl;
+        abort();
+    }
+}
+
+#define CHECK_DRV(x) checkDrvError(x, #x, __FILE__, __LINE__);
+
+__global__ void spinKernel(unsigned long long timeout_clocks = 100000ULL)
+{
+    register unsigned long long start_time, sample_time;
+    start_time = clock64();
+    while (1) {
+        sample_time = clock64();
+        if (timeout_clocks != ~0ULL && (sample_time - start_time) > timeout_clocks) {
+            break;
+        }
+    }
+}
+
+class MMAPAllocation {
+    size_t sz;
+    CUmemGenericAllocationHandle hdl;
+    CUmemAccessDesc accessDesc;
+    CUdeviceptr ptr;
+public:
+    MMAPAllocation(size_t size, int dev = 0) {
+        size_t aligned_sz;
+        CUmemAllocationProp prop = {};
+        prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
+        prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
+        prop.location.id = dev;
+        accessDesc.location = prop.location;
+        accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
+
+        CHECK_DRV(cuMemGetAllocationGranularity(&aligned_sz, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM));
+        sz = ((size + aligned_sz - 1) / aligned_sz) * aligned_sz;
+
+        CHECK_DRV(cuMemAddressReserve(&ptr, sz, 0ULL, 0ULL, 0ULL));
+        CHECK_DRV(cuMemCreate(&hdl, sz, &prop, 0));
+        CHECK_DRV(cuMemMap(ptr, sz, 0ULL, hdl, 0ULL));
+        CHECK_DRV(cuMemSetAccess(ptr, sz, &accessDesc, 1ULL));
+    }
+    ~MMAPAllocation() {
+        CHECK_DRV(cuMemUnmap(ptr, sz));
+        CHECK_DRV(cuMemAddressFree(ptr, sz));
+        CHECK_DRV(cuMemRelease(hdl));
+    }
+};
+
+void launch_work(std::atomic<bool> &keep_going, std::atomic<size_t> &ready, cudaStream_t stream)
+{
+    spinKernel<<<1,1,0,stream>>>();
+    CHECK_RT(cudaGetLastError());
+
+    // We've launched at least one thing, tell the master thread
+    ready.fetch_add(1, std::memory_order_release);
+
+    while (keep_going.load(std::memory_order_acquire)) {
+        spinKernel<<<1,1,0,stream>>>();
+        CHECK_RT(cudaGetLastError());
+    }
+}
+
+
+int main()
+{
+    const size_t N = 4ULL;
+    std::atomic<bool> keep_going(true);
+    std::atomic<size_t> ready(0);
+    std::vector<std::thread> threads;
+    std::vector<cudaStream_t> streams;
+    int supportsVMM = 0;
+    CUdevice dev;
+
+    CHECK_RT(cudaFree(0)); // Force and check the initialization of the runtime
+
+    CHECK_DRV(cuCtxGetDevice(&dev));
+    CHECK_DRV(cuDeviceGetAttribute(&supportsVMM, CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED, dev));
+
+    for (size_t i = 0; i < N; i++) {
+        cudaStream_t stream;
+        CHECK_RT(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
+
+        std::thread t1(launch_work, std::ref(keep_going), std::ref(ready), stream);
+
+        threads.push_back(std::move(t1));
+        streams.push_back(stream);
+    }
+
+    // Wait for all the threads to have launched at least one thing
+    while (ready.load(std::memory_order_acquire) != N);
+
+    // Use standard cudaMalloc/cudaFree
+    for (size_t i = 0; i < 100; i++) {
+        int *x = nullptr;
+        CHECK_RT(cudaMalloc(&x, sizeof(*x)));
+        CHECK_RT(cudaFree(x));
+    }
+
+
+    if (supportsVMM) {
+        // Now use the Virtual Memory Management APIs
+        for (size_t i = 0; i < 100; i++) {
+            MMAPAllocation allocMMAP(1);
+        }
+    }
+
+    keep_going.store(false, std::memory_order_release);
+
+    for (size_t i = 0; i < N; i++) {
+        threads[i].join();
+        CHECK_RT(cudaStreamDestroy(streams[i]));
+    }
+
+    return 0;
+}
diff --git a/posts/cuda-vmm/vector_main.cpp b/posts/cuda-vmm/vector_main.cpp
new file mode 100644
--- /dev/null
+++ b/posts/cuda-vmm/vector_main.cpp
@@ -0,0 +1,173 @@
+/* Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <cuda.h>
+#include <cassert>
+#include <chrono>
+#include <iomanip>
+#include <iostream>
+#include <thread>
+
+#include "cuvector.h"
+
+typedef std::chrono::nanoseconds ReportingDuration;
+
+static inline void
+checkDrvError(CUresult res, const char *tok, const char *file, unsigned line)
+{
+    if (res != CUDA_SUCCESS) {
+        const char *errStr = NULL;
+        (void)cuGetErrorString(res, &errStr);
+        std::cerr << file << ':' << line << ' ' << tok
+                  << " failed (" << (unsigned)res << "): " << errStr << std::endl;
+    }
+}
+
+#define CHECK_DRV(x) checkDrvError(x, #x, __FILE__, __LINE__);
+
+template <class V>
+void measureGrow(V& v, size_t minN, size_t maxN, std::vector<ReportingDuration>& durations)
+{
+    for (size_t n = minN; n <= maxN; n <<= 1) {
+        typedef std::chrono::time_point<std::chrono::steady_clock> time_point;
+
+        time_point start = std::chrono::steady_clock::now();
+        CUresult status = v.grow(n);
+        time_point end = std::chrono::steady_clock::now();
+
+        durations.push_back(std::chrono::duration_cast<ReportingDuration>(end - start));
+        // In non-release, verify the memory is accessible and everything worked properly
+        assert(CUDA_SUCCESS == status);
+        assert(CUDA_SUCCESS == cuMemsetD8((CUdeviceptr)v.getPointer(), 0, v.getSize()));
+        assert(CUDA_SUCCESS == cuCtxSynchronize());
+    }
+}
+
+template <class T, class VectorAllocator>
+void runVectorPerfTest(CUcontext ctx, size_t minN, size_t maxN,
+                       std::vector<ReportingDuration>& noReserveDurations,
+                       std::vector<ReportingDuration>& reserveDurations)
+{
+    typedef cuda_utils::Vector<T, VectorAllocator> VectorDUT;
+
+    if (false) {
+        // Warm-up
+        VectorDUT dut(ctx);
+        if (dut.grow(maxN) != CUDA_SUCCESS) {
+            std::cerr << "Failed to grow to max elements, test invalid!" << std::endl;
+            return;
+        }
+    }
+
+    // Wait for the OS to settle its GPU pages from past perf runs
+    std::this_thread::sleep_for(std::chrono::seconds(2));
+    {
+        // Measure without reserving
+        VectorDUT dut(ctx);
+        measureGrow(dut, minN, maxN, noReserveDurations);
+    }
+
+    // Wait for the OS to settle its GPU pages from past perf runs
+    std::this_thread::sleep_for(std::chrono::seconds(2));
+    {
+        size_t free = 0ULL;
+        VectorDUT dut(ctx);
+
+        dut.reserve(maxN);
+        CHECK_DRV(cuMemGetInfo(&free, NULL));
+        std::cout << "\tReserved " << maxN << " elements..." << std::endl
+                  << "\tFree Memory: " << (float)free / std::giga::num << "GB" << std::endl;
+
+        measureGrow(dut, minN, maxN, reserveDurations);
+    }
+}
+
+int main()
+{
+    size_t free;
+    typedef unsigned char ElemType;
+    CUcontext ctx;
+    CUdevice dev;
+    int supportsVMM = 0;
+
+    CHECK_DRV(cuInit(0));
+    CHECK_DRV(cuDevicePrimaryCtxRetain(&ctx, 0));
+    CHECK_DRV(cuCtxSetCurrent(ctx));
+    CHECK_DRV(cuCtxGetDevice(&dev));
+
+    std::vector<std::vector<ReportingDuration> > durations(4);
+
+    CHECK_DRV(cuMemGetInfo(&free, NULL));
+
+    std::cout << "Total Free Memory: " << (float)free / std::giga::num << "GB" << std::endl;
+
+    // Skip the smaller cases
+    const size_t minN = (2ULL * 1024ULL * 1024ULL + sizeof(ElemType) - 1ULL) / sizeof(ElemType);
+    // Use at max about 75% of all vidmem for perf testing
+    // Also, some vector allocators like MemAlloc cannot handle more than this,
+    // as they would run out of memory during the grow algorithm
+    const size_t maxN = 3ULL * free / (4ULL * sizeof(ElemType));
+
+    std::cout << "====== cuMemAlloc ElemSz=" << sizeof(ElemType) << " ======" << std::endl;
+    runVectorPerfTest<ElemType, cuda_utils::VectorMemAlloc>(ctx, minN, maxN, durations[0], durations[1]);
+    std::cout << "====== cuMemAllocManaged ElemSz=" << sizeof(ElemType) << " ======" << std::endl;
+    runVectorPerfTest<ElemType, cuda_utils::VectorMemAllocManaged>(ctx, minN, maxN, durations[2], durations[3]);
+
+    CHECK_DRV(cuDeviceGetAttribute(&supportsVMM, CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED, dev));
+
+    if (supportsVMM) {
+        durations.resize(durations.size() + 2);
+        std::cout << "====== cuMemMap ElemSz=" << sizeof(ElemType) << " ======" << std::endl;
+        runVectorPerfTest<ElemType, cuda_utils::VectorMemMap>(ctx, minN, maxN, durations[4], durations[5]);
+    }
+
+    // Quick and dirty table of results
+    std::cout << "Size(bytes)    | "
+              << "Alloc(us)      | "
+              << "AllocRes(us)   | "
+              << "Managed(us)    | "
+              << "ManagedRes(us) | ";
+
+    if (supportsVMM) {
+        std::cout << "cuMemMap(us)   | "
+                  << "cuMemMapRes(us)| ";
+    }
+
+    std::cout << std::endl;
+
+    for (size_t i = 0; i < durations[0].size(); i++) {
+        std::cout << std::left << std::setw(15) << std::setfill(' ') << (minN << i) << "| ";
+        for (size_t j = 0; j < durations.size(); j++) {
+            std::cout << std::left << std::setw(15) << std::setfill(' ')
+                      << std::setprecision(2) << std::fixed
+                      << std::chrono::duration_cast<std::chrono::duration<double, std::micro> >(durations[j][i]).count() << "| ";
+        }
+        std::cout << std::endl;
+    }
+
+    CHECK_DRV(cuDevicePrimaryCtxRelease(0));
+
+    return 0;
+}
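
For reference, a minimal usage sketch for the cuda_utils::Vector wrapper added above (illustrative only, not part of the patch; it assumes the translation unit is compiled together with cuvector.cpp and linked with -lcuda, as the Makefile does for vector_main.cpp):

    #include <cuda.h>
    #include "cuvector.h"

    int main() {
        // Error checking omitted for brevity; see CHECK_DRV in the examples above.
        CUcontext ctx;
        CUdevice dev;
        cuInit(0);
        cuDeviceGet(&dev, 0);
        cuDevicePrimaryCtxRetain(&ctx, dev);
        cuCtxSetCurrent(ctx);

        // A growable device buffer of ints backed by the cuMemMap-based allocator.
        cuda_utils::Vector<int, cuda_utils::VectorMemMap> vec(ctx);
        vec.reserve(1 << 20);   // reserve virtual address space for 1M ints up front
        vec.grow(1 << 20);      // commit and map physical memory behind that range
        cuMemsetD32((CUdeviceptr)vec.getPointer(), 0, vec.getSize() / sizeof(int));

        cuDevicePrimaryCtxRelease(dev);
        return 0;
    }

Because VectorMemMap::reserve only claims virtual address space, later grow calls map new physical chunks in place; the cuMemAlloc-based allocator instead has to allocate a larger buffer and copy the old contents over.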