diff --git a/tools/perf/ucc_pt_coll.cc b/tools/perf/ucc_pt_coll.cc index 31e6af22bc..e013615dd8 100644 --- a/tools/perf/ucc_pt_coll.cc +++ b/tools/perf/ucc_pt_coll.cc @@ -6,6 +6,7 @@ #include "ucc_pt_coll.h" #include "ucc_pt_cuda.h" +#include "utils/ucc_malloc.h" ucc_status_t ucc_pt_alloc(ucc_mc_buffer_header_t **h_ptr, size_t len, ucc_memory_type_t mem_type) @@ -13,20 +14,46 @@ ucc_status_t ucc_pt_alloc(ucc_mc_buffer_header_t **h_ptr, size_t len, ucc_status_t status; int cuda_st; - if (mem_type == UCC_MEMORY_TYPE_CUDA) { + switch (mem_type) { + case UCC_MEMORY_TYPE_CUDA: *h_ptr = new ucc_mc_buffer_header_t; (*h_ptr)->mt = UCC_MEMORY_TYPE_CUDA; cuda_st = ucc_pt_cudaMalloc(&((*h_ptr)->addr), len); if (cuda_st != 0) { - return UCC_ERR_NO_RESOURCE; + return UCC_ERR_NO_MEMORY; } cuda_st = ucc_pt_cudaMemset((*h_ptr)->addr, 0, len); if (cuda_st != 0) { ucc_pt_cudaFree((*h_ptr)->addr); delete *h_ptr; - return UCC_ERR_NO_RESOURCE; + return UCC_ERR_NO_MEMORY; } return UCC_OK; + case UCC_MEMORY_TYPE_CUDA_MANAGED: + *h_ptr = new ucc_mc_buffer_header_t; + (*h_ptr)->mt = UCC_MEMORY_TYPE_CUDA_MANAGED; + cuda_st = ucc_pt_cudaMallocManaged(&((*h_ptr)->addr), len); + if (cuda_st != 0) { + return UCC_ERR_NO_MEMORY; + } + cuda_st = ucc_pt_cudaMemset((*h_ptr)->addr, 0, len); + if (cuda_st != 0) { + ucc_pt_cudaFree((*h_ptr)->addr); + delete *h_ptr; + return UCC_ERR_NO_MEMORY; + } + return UCC_OK; + case UCC_MEMORY_TYPE_HOST: + *h_ptr = new ucc_mc_buffer_header_t; + (*h_ptr)->mt = UCC_MEMORY_TYPE_HOST; + (*h_ptr)->addr = ucc_malloc(len, "perftest data"); + if (!((*h_ptr)->addr)) { + return UCC_ERR_NO_MEMORY; + } + memset((*h_ptr)->addr, 0, len); + return UCC_OK; + default: + break; } status = ucc_mc_alloc(h_ptr, len, mem_type); @@ -44,10 +71,18 @@ ucc_status_t ucc_pt_alloc(ucc_mc_buffer_header_t **h_ptr, size_t len, ucc_status_t ucc_pt_free(ucc_mc_buffer_header_t *h_ptr) { - if (h_ptr->mt == UCC_MEMORY_TYPE_CUDA) { + switch (h_ptr->mt) { + case UCC_MEMORY_TYPE_CUDA: + case UCC_MEMORY_TYPE_CUDA_MANAGED: ucc_pt_cudaFree(h_ptr->addr); delete h_ptr; return UCC_OK; + case UCC_MEMORY_TYPE_HOST: + ucc_free(h_ptr->addr); + delete h_ptr; + return UCC_OK; + default: + break; } return ucc_mc_free(h_ptr); diff --git a/tools/perf/ucc_pt_cuda.cc b/tools/perf/ucc_pt_cuda.cc index f0807f88a9..1d9e55ab4a 100644 --- a/tools/perf/ucc_pt_cuda.cc +++ b/tools/perf/ucc_pt_cuda.cc @@ -39,6 +39,7 @@ void ucc_pt_cuda_init(void) LOAD_CUDA_SYM("cudaMalloc", cudaMalloc); LOAD_CUDA_SYM("cudaFree", cudaFree); LOAD_CUDA_SYM("cudaMemset", cudaMemset); + LOAD_CUDA_SYM("cudaMallocManaged", cudaMallocManaged); ucc_pt_cuda_iface.available = 1; } diff --git a/tools/perf/ucc_pt_cuda.h b/tools/perf/ucc_pt_cuda.h index 05c1fbbbf8..5a370c3528 100644 --- a/tools/perf/ucc_pt_cuda.h +++ b/tools/perf/ucc_pt_cuda.h @@ -10,6 +10,7 @@ #define cudaSuccess 0 #define cudaStreamNonBlocking 0x01 /**< Stream does not synchronize with stream 0 (the NULL stream) */ +#define cudaMemAttachGlobal 0x01 /**< Memory can be accessed by any stream on any device*/ typedef struct CUStream_st *cudaStream_t; #define STR(x) #x @@ -32,6 +33,7 @@ typedef struct ucc_pt_cuda_iface { int (*streamDestroy)(cudaStream_t stream); char* (*getErrorString)(int err); int (*cudaMalloc)(void **devptr, size_t size); + int (*cudaMallocManaged)(void **ptr, size_t size, unsigned int flags); int (*cudaFree)(void *devptr); int (*cudaMemset)(void *devptr, int value, size_t count); } ucc_pt_cuda_iface_t; @@ -86,6 +88,16 @@ static inline int ucc_pt_cudaMalloc(void **devptr, size_t size) return 0; } +static inline int ucc_pt_cudaMallocManaged(void **ptr, size_t size) +{ + if (!ucc_pt_cuda_iface.available) { + return 1; + } + CUDA_CHECK(ucc_pt_cuda_iface.cudaMallocManaged(ptr, size, + cudaMemAttachGlobal)); + return 0; +} + static inline int ucc_pt_cudaFree(void *devptr) { if (!ucc_pt_cuda_iface.available) {