From 0ebdd3bc5beb7adfbdd604844353cd75f1b6cffc Mon Sep 17 00:00:00 2001
From: Cory Perry
Date: Fri, 24 Jan 2020 14:18:51 -0800
Subject: [PATCH] Add cuda-vmm examples

---
 posts/cuda-vmm/Makefile        |  38 ++++
 posts/cuda-vmm/cuvector.cpp    | 359 +++++++++++++++++++++++++++++++++
 posts/cuda-vmm/cuvector.h      | 144 +++++++++++++
 posts/cuda-vmm/sync_main.cu    | 167 +++++++++++++++
 posts/cuda-vmm/vector_main.cpp | 173 ++++++++++++++++
 5 files changed, 881 insertions(+)
 create mode 100644 posts/cuda-vmm/Makefile
 create mode 100644 posts/cuda-vmm/cuvector.cpp
 create mode 100644 posts/cuda-vmm/cuvector.h
 create mode 100644 posts/cuda-vmm/sync_main.cu
 create mode 100644 posts/cuda-vmm/vector_main.cpp

diff --git a/posts/cuda-vmm/Makefile b/posts/cuda-vmm/Makefile
new file mode 100644
index 0000000..8e6ea59
--- /dev/null
+++ b/posts/cuda-vmm/Makefile
@@ -0,0 +1,38 @@
+# Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#  * Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#  * Neither the name of NVIDIA CORPORATION nor the names of its
+#    contributors may be used to endorse or promote products derived
+#    from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+NVCC ?= nvcc
+
+all: vector_example sync_example
+
+vector_example: vector_main.cpp cuvector.cpp
+	$(NVCC) $^ -o $@ -lcuda -std=c++11
+
+sync_example: sync_main.cu
+	$(NVCC) $^ -o $@ -lcuda -std=c++11
+
+clean:
+	$(RM) vector_example sync_example
diff --git a/posts/cuda-vmm/cuvector.cpp b/posts/cuda-vmm/cuvector.cpp
new file mode 100644
index 0000000..f8ef0d9
--- /dev/null
+++ b/posts/cuda-vmm/cuvector.cpp
@@ -0,0 +1,359 @@
+/* Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <cuda.h>
+#include <cassert>
+#include "cuvector.h"
+
+// **************
+// VectorMemAlloc
+// **************
+
+namespace cuda_utils {
+
+VectorMemAlloc::VectorMemAlloc(CUcontext context) : ctx(context), d_p(0ULL), alloc_sz(0ULL)
+{
+
+}
+
+VectorMemAlloc::~VectorMemAlloc()
+{
+    (void)cuMemFree(d_p);
+}
+
+// Although we're not supposed to "commit" memory in a reserve call,
+// doing so for this sample demonstrates why reserve is so important
+CUresult
+VectorMemAlloc::reserve(size_t new_sz)
+{
+    CUresult status = CUDA_SUCCESS;
+    CUdeviceptr new_ptr = 0ULL;
+    CUcontext prev_ctx;
+
+    if (new_sz <= alloc_sz) {
+        return CUDA_SUCCESS;
+    }
+    (void)cuCtxGetCurrent(&prev_ctx);
+    // Make sure we allocate on the correct context
+    if ((status = cuCtxSetCurrent(ctx)) != CUDA_SUCCESS) {
+        return status;
+    }
+    // Allocate the bigger buffer
+    if ((status = cuMemAlloc(&new_ptr, new_sz)) == CUDA_SUCCESS) {
+        // Copy over the bigger buffer. We'll explicitly use the per thread
+        // stream to ensure we don't add false dependencies on other threads
+        // using the null stream, but we may have issues with other prior
+        // work on this stream. Luckily, that's not the case in our sample.
+        //
+        // We only want to copy over the alloc_sz here, as that's what's
+        // actually committed at the moment
+        if ((status = cuMemcpyAsync(new_ptr, d_p, alloc_sz, CU_STREAM_PER_THREAD)) == CUDA_SUCCESS) {
+            // Free the smaller buffer. We don't need to synchronize
+            // CU_STREAM_PER_THREAD, since cuMemFree synchronizes for us
+            (void)cuMemFree(d_p);
+            d_p = new_ptr;
+            alloc_sz = new_sz;
+        }
+        else {
+            // Failed to copy the bigger buffer, free the smaller one
+            (void)cuMemFree(new_ptr);
+        }
+    }
+    // Make sure to always return to the previous context the caller had
+    (void)cuCtxSetCurrent(prev_ctx);
+
+    return status;
+}
+
+// *********************
+// VectorMemAllocManaged
+// *********************
+
+VectorMemAllocManaged::VectorMemAllocManaged(CUcontext context) : ctx(context), dev(CU_DEVICE_INVALID), d_p(0ULL),
+        alloc_sz(0ULL), reserve_sz(0ULL)
+{
+    CUcontext prev_ctx;
+    (void)cuCtxGetCurrent(&prev_ctx);
+    if (cuCtxSetCurrent(context) == CUDA_SUCCESS) {
+        (void)cuCtxGetDevice(&dev);
+        (void)cuCtxSetCurrent(prev_ctx);
+    }
+}
+
+VectorMemAllocManaged::~VectorMemAllocManaged()
+{
+    (void)cuMemFree(d_p);
+}
+
+CUresult
+VectorMemAllocManaged::reserve(size_t new_sz)
+{
+    CUresult status = CUDA_SUCCESS;
+    CUcontext prev_ctx;
+    CUdeviceptr new_ptr = 0ULL;
+
+    if (new_sz <= reserve_sz) {
+        return CUDA_SUCCESS;
+    }
+
+    (void)cuCtxGetCurrent(&prev_ctx);
+    if ((status = cuCtxSetCurrent(ctx)) != CUDA_SUCCESS) {
+        return status;
+    }
+
+    // Allocate the bigger buffer
+    if ((status = cuMemAllocManaged(&new_ptr, new_sz, CU_MEM_ATTACH_GLOBAL)) == CUDA_SUCCESS) {
+        // Set the preferred location for this managed allocation, to bias
+        // any migration requests ("pinning" it under most circumstances to
+        // the requested device)
+        (void)cuMemAdvise(new_ptr, new_sz, CU_MEM_ADVISE_SET_PREFERRED_LOCATION, dev);
+        // Copy over the bigger buffer. We'll explicitly use the per thread
+        // stream to ensure we don't add false dependencies on other threads
+        // using the null stream, but we may have issues with other prior
+        // work on this stream. Luckily, that's not the case in our sample.
+        //
+        // We only want to copy over the alloc_sz here, as that's what's
+        // actually committed at the moment
+        if (alloc_sz > 0) {
+            if ((status = cuMemcpyAsync(new_ptr, d_p, alloc_sz, CU_STREAM_PER_THREAD)) == CUDA_SUCCESS) {
+                // Free the smaller buffer. We don't need to synchronize
+                // CU_STREAM_PER_THREAD, since cuMemFree synchronizes for us
+                (void)cuMemFree(d_p);
+            }
+            else {
+                // Failed to copy the bigger buffer, free the smaller one
+                (void)cuMemFree(new_ptr);
+            }
+        }
+        if (status == CUDA_SUCCESS) {
+            d_p = new_ptr;
+            reserve_sz = new_sz;
+        }
+    }
+
+    // Make sure to always return to the previous context the caller had
+    (void)cuCtxSetCurrent(prev_ctx);
+
+    return status;
+}
+
+// Actually commits num bytes of additional memory
+CUresult
+VectorMemAllocManaged::grow(size_t new_sz)
+{
+    CUresult status = CUDA_SUCCESS;
+    CUcontext prev_ctx;
+
+    if (new_sz <= alloc_sz) {
+        return CUDA_SUCCESS;
+    }
+    if ((status = reserve(new_sz)) != CUDA_SUCCESS) {
+        return status;
+    }
+
+    (void)cuCtxGetCurrent(&prev_ctx);
+    // Make sure we allocate on the correct context
+    if ((status = cuCtxSetCurrent(ctx)) != CUDA_SUCCESS) {
+        return status;
+    }
+    // Actually commit the needed memory
+    // We explicitly use the per thread stream here to ensure we're not
+    // conflicting with other uses of the null stream from other threads
+    if ((status = cuMemPrefetchAsync(d_p + alloc_sz, (new_sz - alloc_sz), dev,
+                                     CU_STREAM_PER_THREAD)) == CUDA_SUCCESS) {
+        // Not completely necessary, but will ensure the prefetch is complete
+        // and prevent future runtime faults. Also makes for a more fair
+        // benchmark comparison
+        if ((status = cuStreamSynchronize(CU_STREAM_PER_THREAD)) == CUDA_SUCCESS) {
+            alloc_sz = new_sz;
+        }
+    }
+    // Make sure to always return to the previous context the caller had
+    (void)cuCtxSetCurrent(prev_ctx);
+    return status;
+}
+
+// *********************
+// VectorMemMap
+// *********************
+
+VectorMemMap::VectorMemMap(CUcontext context) : d_p(0ULL), prop(), handles(), alloc_sz(0ULL), reserve_sz(0ULL), chunk_sz(0ULL)
+{
+    CUdevice device;
+    CUcontext prev_ctx;
+    CUresult status = CUDA_SUCCESS;
+    (void)status;
+
+    status = cuCtxGetCurrent(&prev_ctx);
+    assert(status == CUDA_SUCCESS);
+    if (cuCtxSetCurrent(context) == CUDA_SUCCESS) {
+        status = cuCtxGetDevice(&device);
+        assert(status == CUDA_SUCCESS);
+        status = cuCtxSetCurrent(prev_ctx);
+        assert(status == CUDA_SUCCESS);
+    }
+
+    prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
+    prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
+    prop.location.id = (int)device;
+    prop.win32HandleMetaData = NULL;
+
+    accessDesc.location = prop.location;
+    accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
+
+    status = cuMemGetAllocationGranularity(&chunk_sz, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM);
+    assert(status == CUDA_SUCCESS);
+}
+
+VectorMemMap::~VectorMemMap()
+{
+    CUresult status = CUDA_SUCCESS;
+    (void)status;
+    if (d_p != 0ULL) {
+        status = cuMemUnmap(d_p, alloc_sz);
+        assert(status == CUDA_SUCCESS);
+        for (size_t i = 0ULL; i < va_ranges.size(); i++) {
+            status = cuMemAddressFree(va_ranges[i].start, va_ranges[i].sz);
+            assert(status == CUDA_SUCCESS);
+        }
+        for (size_t i = 0ULL; i < handles.size(); i++) {
+            status = cuMemRelease(handles[i]);
+            assert(status == CUDA_SUCCESS);
+        }
+    }
+}
+
+CUresult
+VectorMemMap::reserve(size_t new_sz)
+{
+    CUresult status = CUDA_SUCCESS;
+    CUdeviceptr new_ptr = 0ULL;
+
+    if (new_sz <= reserve_sz) {
+        return CUDA_SUCCESS;
+    }
+
+    const size_t aligned_sz = ((new_sz + chunk_sz - 1) / chunk_sz) * chunk_sz;
+
+    // Try to reserve an address just after what we already have reserved
+    status = cuMemAddressReserve(&new_ptr, (aligned_sz - reserve_sz), 0ULL, d_p + reserve_sz, 0ULL);
+
+    if (status != CUDA_SUCCESS || (new_ptr != d_p + reserve_sz)) {
+        if (new_ptr != 0ULL) {
+            (void)cuMemAddressFree(new_ptr, (aligned_sz - reserve_sz));
+        }
+        // Slow path - try to find a new address reservation big enough for us
+        status = cuMemAddressReserve(&new_ptr, aligned_sz, 0ULL, 0U, 0);
+        if (status == CUDA_SUCCESS && d_p != 0ULL) {
+            CUdeviceptr ptr = new_ptr;
+            // Found one, now unmap our previous allocations
+            status = cuMemUnmap(d_p, alloc_sz);
+            assert(status == CUDA_SUCCESS);
+            for (size_t i = 0ULL; i < handles.size(); i++) {
+                const size_t hdl_sz = handle_sizes[i];
+                // And remap them, enabling their access
+                if ((status = cuMemMap(ptr, hdl_sz, 0ULL, handles[i], 0ULL)) != CUDA_SUCCESS)
+                    break;
+                if ((status = cuMemSetAccess(ptr, hdl_sz, &accessDesc, 1ULL)) != CUDA_SUCCESS)
+                    break;
+                ptr += hdl_sz;
+            }
+            if (status != CUDA_SUCCESS) {
+                // Failed the mapping somehow... clean up!
+                status = cuMemUnmap(new_ptr, aligned_sz);
+                assert(status == CUDA_SUCCESS);
+                status = cuMemAddressFree(new_ptr, aligned_sz);
+                assert(status == CUDA_SUCCESS);
+            }
+            else {
+                // Clean up our old VA reservations!
+                for (size_t i = 0ULL; i < va_ranges.size(); i++) {
+                    (void)cuMemAddressFree(va_ranges[i].start, va_ranges[i].sz);
+                }
+                va_ranges.clear();
+            }
+        }
+        // Assuming everything went well, update everything
+        if (status == CUDA_SUCCESS) {
+            Range r;
+            d_p = new_ptr;
+            reserve_sz = aligned_sz;
+            r.start = new_ptr;
+            r.sz = aligned_sz;
+            va_ranges.push_back(r);
+        }
+    }
+    else {
+        Range r;
+        r.start = new_ptr;
+        r.sz = aligned_sz - reserve_sz;
+        va_ranges.push_back(r);
+        if (d_p == 0ULL) {
+            d_p = new_ptr;
+        }
+        reserve_sz = aligned_sz;
+    }
+
+    return status;
+}
+
+CUresult
+VectorMemMap::grow(size_t new_sz)
+{
+    CUresult status = CUDA_SUCCESS;
+    CUmemGenericAllocationHandle handle;
+    if (new_sz <= alloc_sz) {
+        return CUDA_SUCCESS;
+    }
+
+    const size_t size_diff = new_sz - alloc_sz;
+    // Round up to the next chunk size
+    const size_t sz = ((size_diff + chunk_sz - 1) / chunk_sz) * chunk_sz;
+
+    if ((status = reserve(alloc_sz + sz)) != CUDA_SUCCESS) {
+        return status;
+    }
+
+    if ((status = cuMemCreate(&handle, sz, &prop, 0)) == CUDA_SUCCESS) {
+        if ((status = cuMemMap(d_p + alloc_sz, sz, 0ULL, handle, 0ULL)) == CUDA_SUCCESS) {
+            if ((status = cuMemSetAccess(d_p + alloc_sz, sz, &accessDesc, 1ULL)) == CUDA_SUCCESS) {
+                handles.push_back(handle);
+                handle_sizes.push_back(sz);
+                alloc_sz += sz;
+            }
+            if (status != CUDA_SUCCESS) {
+                (void)cuMemUnmap(d_p + alloc_sz, sz);
+            }
+        }
+        if (status != CUDA_SUCCESS) {
+            (void)cuMemRelease(handle);
+        }
+    }
+
+    return status;
+}
+
+}
diff --git a/posts/cuda-vmm/cuvector.h b/posts/cuda-vmm/cuvector.h
new file mode 100644
index 0000000..47b1167
--- /dev/null
+++ b/posts/cuda-vmm/cuvector.h
@@ -0,0 +1,144 @@
+/* Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#pragma once
+#include <cuda.h>
+#include <vector>
+
+namespace cuda_utils {
+
+// Typed wrapper class
+template <class T, class VectorAllocator>
+class Vector {
+    VectorAllocator allocator;
+public:
+    Vector(CUcontext ctx) : allocator(ctx) {}
+
+    CUresult reserve(size_t num) {
+        return allocator.reserve(num * sizeof(T));
+    }
+
+    CUresult grow(size_t num) {
+        return allocator.grow(num * sizeof(T));
+    }
+
+    T *getPointer() const {
+        return (T*)allocator.getPointer();
+    }
+
+    size_t getSize() const {
+        return allocator.getSize();
+    }
+};
+
+class VectorMemAlloc {
+private:
+    CUcontext ctx;
+    CUdeviceptr d_p;
+    size_t alloc_sz;
+public:
+    VectorMemAlloc(CUcontext context);
+    ~VectorMemAlloc();
+
+    CUdeviceptr getPointer() const {
+        return d_p;
+    }
+
+    size_t getSize() const {
+        return alloc_sz;
+    }
+
+    // Reserves some extra space in order to speed up grow()
+    CUresult reserve(size_t new_sz);
+
+    // Actually commits num bytes of additional memory
+    CUresult grow(size_t new_sz) {
+        return reserve(new_sz);
+    }
+};
+
+class VectorMemAllocManaged {
+private:
+    CUcontext ctx;
+    CUdevice dev;
+    CUdeviceptr d_p;
+    size_t alloc_sz;
+    size_t reserve_sz;
+
+public:
+    VectorMemAllocManaged(CUcontext context);
+    ~VectorMemAllocManaged();
+
+    CUdeviceptr getPointer() const {
+        return d_p;
+    }
+
+    size_t getSize() const {
+        return alloc_sz;
+    }
+
+    // Reserves some extra space in order to speed up grow()
+    CUresult reserve(size_t new_sz);
+
+    // Actually commits num bytes of additional memory
+    CUresult grow(size_t new_sz);
+};
+
+class VectorMemMap {
+private:
+    CUdeviceptr d_p;
+    CUmemAllocationProp prop;
+    CUmemAccessDesc accessDesc;
+    struct Range {
+        CUdeviceptr start;
+        size_t sz;
+    };
+    std::vector<Range> va_ranges;
+    std::vector<CUmemGenericAllocationHandle> handles;
+    std::vector<size_t> handle_sizes;
+    size_t alloc_sz;
+    size_t reserve_sz;
+    size_t chunk_sz;
+public:
+    VectorMemMap(CUcontext context);
+    ~VectorMemMap();
+
+    CUdeviceptr getPointer() const {
+        return d_p;
+    }
+
+    size_t getSize() const {
+        return alloc_sz;
+    }
+
+    // Reserves some extra space in order to speed up grow()
+    CUresult reserve(size_t new_sz);
+
+    // Actually commits num bytes of additional memory
+    CUresult grow(size_t new_sz);
+};
+
+}
diff --git a/posts/cuda-vmm/sync_main.cu b/posts/cuda-vmm/sync_main.cu
new file mode 100644
index 0000000..f052cc9
--- /dev/null
+++ b/posts/cuda-vmm/sync_main.cu
@@ -0,0 +1,167 @@
+/* Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <cuda.h>
+#include <iostream>
+#include <thread>
+#include <atomic>
+#include <vector>
+
+static inline void
+checkRtError(cudaError_t res, const char *tok, const char *file, unsigned line)
+{
+    if (res != cudaSuccess) {
+        std::cerr << file << ':' << line << ' ' << tok
+                  << " failed (" << (unsigned)res << "): " << cudaGetErrorString(res) << std::endl;
+        abort();
+    }
+}
+
+#define CHECK_RT(x) checkRtError(x, #x, __FILE__, __LINE__);
+
+static inline void
+checkDrvError(CUresult res, const char *tok, const char *file, unsigned line)
+{
+    if (res != CUDA_SUCCESS) {
+        const char *errStr = NULL;
+        (void)cuGetErrorString(res, &errStr);
+        std::cerr << file << ':' << line << ' ' << tok
+                  << " failed (" << (unsigned)res << "): " << errStr << std::endl;
+        abort();
+    }
+}
+
+#define CHECK_DRV(x) checkDrvError(x, #x, __FILE__, __LINE__);
+
+__global__ void spinKernel(unsigned long long timeout_clocks = 100000ULL)
+{
+    register unsigned long long start_time, sample_time;
+    start_time = clock64();
+    while (1) {
+        sample_time = clock64();
+        if (timeout_clocks != ~0ULL && (sample_time - start_time) > timeout_clocks) {
+            break;
+        }
+    }
+}
+
+class MMAPAllocation {
+    size_t sz;
+    CUmemGenericAllocationHandle hdl;
+    CUmemAccessDesc accessDesc;
+    CUdeviceptr ptr;
+public:
+    MMAPAllocation(size_t size, int dev = 0) {
+        size_t aligned_sz;
+        CUmemAllocationProp prop = {};
+        prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
+        prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
+        prop.location.id = dev;
+        accessDesc.location = prop.location;
+        accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
+
+        CHECK_DRV(cuMemGetAllocationGranularity(&aligned_sz, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM));
+        sz = ((size + aligned_sz - 1) / aligned_sz) * aligned_sz;
+
+        CHECK_DRV(cuMemAddressReserve(&ptr, sz, 0ULL, 0ULL, 0ULL));
+        CHECK_DRV(cuMemCreate(&hdl, sz, &prop, 0));
+        CHECK_DRV(cuMemMap(ptr, sz, 0ULL, hdl, 0ULL));
+        CHECK_DRV(cuMemSetAccess(ptr, sz, &accessDesc, 1ULL));
+    }
+    ~MMAPAllocation() {
+        CHECK_DRV(cuMemUnmap(ptr, sz));
+        CHECK_DRV(cuMemAddressFree(ptr, sz));
+        CHECK_DRV(cuMemRelease(hdl));
+    }
+};
+
+void launch_work(std::atomic<bool> &keep_going, std::atomic<size_t> &ready, cudaStream_t stream)
+{
+    spinKernel<<<1,1,0,stream>>>();
+    CHECK_RT(cudaGetLastError());
+
+    // We've launched at least one thing, tell the master thread
+    ready.fetch_add(1, std::memory_order_release);
+
+    while (keep_going.load(std::memory_order_acquire)) {
+        spinKernel<<<1,1,0,stream>>>();
+        CHECK_RT(cudaGetLastError());
+    }
+}
+
+
+int main()
+{
+    const size_t N = 4ULL;
+    std::atomic<bool> keep_going(true);
+    std::atomic<size_t> ready(0);
+    std::vector<std::thread> threads;
+    std::vector<cudaStream_t> streams;
+    int supportsVMM = 0;
+    CUdevice dev;
+
+    CHECK_RT(cudaFree(0)); // Force and check the initialization of the runtime
+
+    CHECK_DRV(cuCtxGetDevice(&dev));
+    CHECK_DRV(cuDeviceGetAttribute(&supportsVMM, CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED, dev));
+
+    for (size_t i = 0; i < N; i++) {
+        cudaStream_t stream;
+        CHECK_RT(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
+
+        std::thread t1(launch_work, std::ref(keep_going), std::ref(ready), stream);
+
+        threads.push_back(std::move(t1));
+        streams.push_back(stream);
+    }
+
+    // Wait for all the threads to have launched at least one thing
+    while (ready.load(std::memory_order_acquire) != N);
+
+    // Use standard cudaMalloc/cudaFree
+    for (size_t i = 0; i < 100; i++) {
+        int *x = nullptr;
+        CHECK_RT(cudaMalloc(&x, sizeof(*x)));
+        CHECK_RT(cudaFree(x));
+    }
+
+
+    if (supportsVMM) {
+        // Now use the Virtual Memory Management APIs
+        for (size_t i = 0; i < 100; i++) {
+            MMAPAllocation allocMMAP(1);
+        }
+    }
+
+    keep_going.store(false, std::memory_order_release);
+
+    for (size_t i = 0; i < N; i++) {
+        threads[i].join();
+        CHECK_RT(cudaStreamDestroy(streams[i]));
+    }
+
+    return 0;
+}
diff --git a/posts/cuda-vmm/vector_main.cpp b/posts/cuda-vmm/vector_main.cpp
new file mode 100644
--- /dev/null
+++ b/posts/cuda-vmm/vector_main.cpp
@@ -0,0 +1,173 @@
+/* Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <cuda.h>
+#include <cassert>
+#include <chrono>
+#include <iomanip>
+#include <iostream>
+#include <thread>
+
+#include "cuvector.h"
+
+typedef std::chrono::nanoseconds ReportingDuration;
+
+static inline void
+checkDrvError(CUresult res, const char *tok, const char *file, unsigned line)
+{
+    if (res != CUDA_SUCCESS) {
+        const char *errStr = NULL;
+        (void)cuGetErrorString(res, &errStr);
+        std::cerr << file << ':' << line << ' ' << tok
+                  << " failed (" << (unsigned)res << "): " << errStr << std::endl;
+    }
+}
+
+#define CHECK_DRV(x) checkDrvError(x, #x, __FILE__, __LINE__);
+
+template <class V>
+void measureGrow(V& v, size_t minN, size_t maxN, std::vector<ReportingDuration>& durations)
+{
+    for (size_t n = minN; n <= maxN; n <<= 1) {
+        typedef std::chrono::time_point<std::chrono::steady_clock> time_point;
+
+        time_point start = std::chrono::steady_clock::now();
+        CUresult status = v.grow(n);
+        time_point end = std::chrono::steady_clock::now();
+
+        durations.push_back(std::chrono::duration_cast<ReportingDuration>(end - start));
+        // In non-release, verify the memory is accessible and everything worked properly
+        assert(CUDA_SUCCESS == status);
+        assert(CUDA_SUCCESS == cuMemsetD8((CUdeviceptr)v.getPointer(), 0, v.getSize()));
+        assert(CUDA_SUCCESS == cuCtxSynchronize());
+    }
+}
+
+template <class T, class VectorAllocator>
+void runVectorPerfTest(CUcontext ctx, size_t minN, size_t maxN,
+                       std::vector<ReportingDuration>& noReserveDurations,
+                       std::vector<ReportingDuration>& reserveDurations)
+{
+    typedef cuda_utils::Vector<T, VectorAllocator> VectorDUT;
+
+    if (false) {
+        // Warm-up
+        VectorDUT dut(ctx);
+        if (dut.grow(maxN) != CUDA_SUCCESS) {
+            std::cerr << "Failed to grow to max elements, test invalid!" << std::endl;
+            return;
+        }
+    }
+
+    // Wait for the OS to settle its GPU pages from past perf runs
+    std::this_thread::sleep_for(std::chrono::seconds(2));
+    {
+        // Measure without reserving
+        VectorDUT dut(ctx);
+        measureGrow(dut, minN, maxN, noReserveDurations);
+    }
+
+    // Wait for the OS to settle its GPU pages from past perf runs
+    std::this_thread::sleep_for(std::chrono::seconds(2));
+    {
+        size_t free = 0ULL;
+        VectorDUT dut(ctx);
+
+        dut.reserve(maxN);
+        CHECK_DRV(cuMemGetInfo(&free, NULL));
+        std::cout << "\tReserved " << maxN << " elements..." << std::endl
+                  << "\tFree Memory: " << (float)free / std::giga::num << "GB" << std::endl;
+
+        measureGrow(dut, minN, maxN, reserveDurations);
+    }
+}
+
+int main()
+{
+    size_t free;
+    typedef unsigned char ElemType;
+    CUcontext ctx;
+    CUdevice dev;
+    int supportsVMM = 0;
+
+    CHECK_DRV(cuInit(0));
+    CHECK_DRV(cuDevicePrimaryCtxRetain(&ctx, 0));
+    CHECK_DRV(cuCtxSetCurrent(ctx));
+    CHECK_DRV(cuCtxGetDevice(&dev));
+
+    std::vector<std::vector<ReportingDuration> > durations(4);
+
+    CHECK_DRV(cuMemGetInfo(&free, NULL));
+
+    std::cout << "Total Free Memory: " << (float)free / std::giga::num << "GB" << std::endl;
+
+    // Skip the smaller cases
+    const size_t minN = (2ULL * 1024ULL * 1024ULL + sizeof(ElemType) - 1ULL) / sizeof(ElemType);
+    // Use at max about 75% of all vidmem for perf testing
+    // Also, some vector allocators like MemAlloc cannot handle more than this,
+    // as they would run out of memory during the grow algorithm
+    const size_t maxN = 3ULL * free / (4ULL * sizeof(ElemType));
+
+    std::cout << "====== cuMemAlloc ElemSz=" << sizeof(ElemType) << " ======" << std::endl;
+    runVectorPerfTest<ElemType, cuda_utils::VectorMemAlloc>(ctx, minN, maxN, durations[0], durations[1]);
+    std::cout << "====== cuMemAllocManaged ElemSz=" << sizeof(ElemType) << " ======" << std::endl;
+    runVectorPerfTest<ElemType, cuda_utils::VectorMemAllocManaged>(ctx, minN, maxN, durations[2], durations[3]);
+
+    CHECK_DRV(cuDeviceGetAttribute(&supportsVMM, CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED, dev));
+
+    if (supportsVMM) {
+        durations.resize(durations.size() + 2);
+        std::cout << "====== cuMemMap ElemSz=" << sizeof(ElemType) << " ======" << std::endl;
+        runVectorPerfTest<ElemType, cuda_utils::VectorMemMap>(ctx, minN, maxN, durations[4], durations[5]);
+    }
+
+    // Quick and dirty table of results
+    std::cout << "Size(bytes)    | "
+              << "Alloc(us)      | "
+              << "AllocRes(us)   | "
+              << "Managed(us)    | "
+              << "ManagedRes(us) | ";
+
+    if (supportsVMM) {
+        std::cout << "cuMemMap(us)   | "
+                  << "cuMemMapRes(us)| ";
+    }
+
+    std::cout << std::endl;
+
+    for (size_t i = 0; i < durations[0].size(); i++) {
+        std::cout << std::left << std::setw(15) << std::setfill(' ') << (minN << i) << "| ";
+        for (size_t j = 0; j < durations.size(); j++) {
+            std::cout << std::left << std::setw(15) << std::setfill(' ')
+                      << std::setprecision(2) << std::fixed
+                      << std::chrono::duration_cast<std::chrono::duration<double, std::micro> >(durations[j][i]).count() << "| ";
+        }
+        std::cout << std::endl;
+    }
+
+    CHECK_DRV(cuDevicePrimaryCtxRelease(0));
+
+    return 0;
+}
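
For reference, a minimal usage sketch for the cuda_utils::Vector wrapper added above (illustrative only, not part of the patch; it assumes the translation unit is compiled together with cuvector.cpp and linked with -lcuda, as the Makefile does for vector_main.cpp):

    #include <cuda.h>
    #include "cuvector.h"

    int main() {
        // Error checking omitted for brevity; see CHECK_DRV in the examples above.
        CUcontext ctx;
        CUdevice dev;
        cuInit(0);
        cuDeviceGet(&dev, 0);
        cuDevicePrimaryCtxRetain(&ctx, dev);
        cuCtxSetCurrent(ctx);

        // A growable device buffer of ints backed by the cuMemMap-based allocator.
        cuda_utils::Vector<int, cuda_utils::VectorMemMap> vec(ctx);
        vec.reserve(1 << 20);   // reserve virtual address space for 1M ints up front
        vec.grow(1 << 20);      // commit and map physical memory behind that range
        cuMemsetD32((CUdeviceptr)vec.getPointer(), 0, vec.getSize() / sizeof(int));

        cuDevicePrimaryCtxRelease(dev);
        return 0;
    }

Because VectorMemMap::reserve only claims virtual address space, later grow calls map new physical chunks in place; the cuMemAlloc-based allocator instead has to allocate a larger buffer and copy the old contents over.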