Skip to content

Commit

Permalink
WIP
Browse files Browse the repository at this point in the history
  • Loading branch information
Binyang2014 committed Oct 31, 2024
1 parent 3964225 commit d394175
Show file tree
Hide file tree
Showing 2 changed files with 72 additions and 45 deletions.
79 changes: 35 additions & 44 deletions include/mscclpp/gpu_utils.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -133,29 +133,42 @@ PhysicalCudaMemory<T>* cudaPhysicalCalloc(size_t nelem, size_t gran) {
}

template <class T>
T* cudaPhysicalCallocPtr(size_t nelem, CUmemAllocationProp prop, size_t gran) {
T* cudaPhysicalCallocPtr(size_t nbytes) {
AvoidCudaGraphCaptureGuard cgcGuard;
CUmemGenericAllocationHandle memHandle;
int deviceId = -1;
CUdevice currentDevice;
MSCCLPP_CUDATHROW(cudaGetDevice(&deviceId));
MSCCLPP_CUTHROW(cuDeviceGet(&currentDevice, deviceId));

CUmemAllocationProp prop = {};
prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
#if defined(__HIP_PLATFORM_AMD__)
prop.requestedHandleType = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR;
#else
prop.requestedHandleTypes =
(CUmemAllocationHandleType)(CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR | CU_MEM_HANDLE_TYPE_FABRIC);
#endif
prop.location.id = currentDevice;

size_t bufferSize = sizeof(T) * nelem;
// allocate physical memory
MSCCLPP_CUTHROW(cuMemCreate(&memHandle, bufferSize, &prop, 0 /*flags*/));
CUmemGenericAllocationHandle memHandle;
MSCCLPP_CUTHROW(cuMemCreate(&memHandle, nbytes, &prop, 0 /*flags*/));

CUmemAccessDesc accessDesc = {};
accessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
accessDesc.location.id = deviceId;
accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;

T* devicePtr = nullptr;
size_t gran = 0;
// Map the device pointer
MSCCLPP_CUTHROW(cuMemAddressReserve((CUdeviceptr*)&devicePtr, bufferSize, gran, 0U, 0));
MSCCLPP_CUTHROW(cuMemMap((CUdeviceptr)devicePtr, bufferSize, 0, memHandle, 0));
MSCCLPP_CUTHROW(cuMemSetAccess((CUdeviceptr)devicePtr, bufferSize, &accessDesc, 1));
MSCCLPP_CUTHROW(cuMemGetAllocationGranularity(&gran, &prop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED));
MSCCLPP_CUTHROW(cuMemAddressReserve((CUdeviceptr*)&devicePtr, nbytes, gran, 0U, 0));
MSCCLPP_CUTHROW(cuMemMap((CUdeviceptr)devicePtr, nbytes, 0, memHandle, 0));
MSCCLPP_CUTHROW(cuMemSetAccess((CUdeviceptr)devicePtr, nbytes, &accessDesc, 1));
CudaStreamWithFlags stream(cudaStreamNonBlocking);
MSCCLPP_CUDATHROW(cudaMemsetAsync(devicePtr, 0, bufferSize, stream));

MSCCLPP_CUDATHROW(cudaMemsetAsync(devicePtr, 0, nbytes, stream));
MSCCLPP_CUDATHROW(cudaStreamSynchronize(stream));

return devicePtr;
Expand Down Expand Up @@ -231,25 +244,6 @@ Memory safeAlloc(size_t nelem, size_t gran) {
return Memory(ptr, Deleter());
}

template <class T, T*(alloc)(size_t, CUmemAllocationProp, size_t), class Deleter, class Memory>
Memory safeAlloc(size_t nelem, CUmemAllocationProp memProp, size_t gran) {
if ((nelem * sizeof(T)) % gran) {
throw Error("The request allocation size is not divisible by the required granularity:" +
std::to_string(nelem * sizeof(T)) + " vs " + std::to_string(gran),
ErrorCode::InvalidUsage);
}
T* ptr = nullptr;
try {
ptr = alloc(nelem, memProp, gran);
} catch (...) {
if (ptr) {
Deleter()(ptr);
}
throw;
}
return Memory(ptr, Deleter());
}

} // namespace detail

/// A deleter that calls cudaFree for use with std::unique_ptr or std::shared_ptr.
Expand Down Expand Up @@ -329,26 +323,23 @@ std::shared_ptr<PhysicalCudaMemory<T>> allocSharedPhysicalCuda(size_t count, siz
/// Allocates physical (cuMemCreate/cuMemMap-backed) device memory and returns it
/// as a std::shared_ptr<T>.
///
/// The requested byte size is rounded up to the recommended multicast
/// granularity so the buffer can later be bound to a multicast object.
///
/// @tparam T    Element type.
/// @param count Number of elements to allocate.
/// @return std::shared_ptr<T> owning the physically-backed device buffer.
template <class T>
std::shared_ptr<T> allocSharedPhysicalCudaPtr(size_t count) {
  size_t gran = 0;
  int numDevices = 0;
  MSCCLPP_CUDATHROW(cudaGetDeviceCount(&numDevices));

  CUmulticastObjectProp prop = {};
  prop.size = count * sizeof(T);
  // This is a dummy value, it might affect the granularity in the future
  prop.numDevices = numDevices;
#if defined(__HIP_PLATFORM_AMD__)
  // TODO: revisit when HIP fixes this typo in the field name
  prop.handleTypes = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR;
#else
  prop.handleTypes = (CUmemAllocationHandleType)(CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR | CU_MEM_HANDLE_TYPE_FABRIC);
#endif
  prop.flags = 0;
  // Round the allocation up to the recommended multicast granularity.
  MSCCLPP_CUTHROW(cuMulticastGetGranularity(&gran, &prop, CU_MULTICAST_GRANULARITY_RECOMMENDED));
  size_t nbytes = (count * sizeof(T) + gran - 1) / gran * gran;
  return detail::safeAlloc<T, detail::cudaPhysicalCallocPtr<T>, CudaPhysicalPtrDeleter<T>, std::shared_ptr<T>>(nbytes);
}
#endif

Expand Down
38 changes: 37 additions & 1 deletion src/registered_memory.cc
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,42 @@
#include "debug.h"
#include "utils_internal.hpp"

namespace {
// Check if ptr was allocated through the CUDA VMM API (cuMemCreate/cuMemMap).
// Returns true for cuMemMap-backed pointers, false otherwise.
bool isCuMemMapAllocated(void* ptr) {
  CUmemGenericAllocationHandle handle;
  // cuMemRetainAllocationHandle only succeeds for pointers backed by a
  // cuMemCreate allocation; failure means ptr came from another allocator.
  CUresult result = cuMemRetainAllocationHandle(&handle, ptr);
  if (result != CUDA_SUCCESS) {
    return false;
  }
  // Drop the extra reference taken by the retain call above.
  cuMemRelease(handle);
  return true;
}

// Get the recommended granularity for cuMemAddressReserve
// Queries the allocation granularity of the current device for pinned,
// device-located physical allocations; used as the alignment when reserving
// virtual address ranges to map cuMemCreate handles into.
size_t getRecommendedGranularity() {
  size_t gran = 0;
  int deviceId = -1;
  int currentDevice = -1;
  MSCCLPP_CUDATHROW(cudaGetDevice(&deviceId));
  MSCCLPP_CUTHROW(cuDeviceGet(&currentDevice, deviceId));

  CUmemAllocationProp prop = {};
  prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
  prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
#if defined(__HIP_PLATFORM_AMD__)
  // HIP spells the field without the trailing 's'.
  prop.requestedHandleType = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR;
#else
  // Allow both POSIX FD and fabric export so the handle type matches
  // whatever the allocation path requested.
  prop.requestedHandleTypes =
      (CUmemAllocationHandleType)(CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR | CU_MEM_HANDLE_TYPE_FABRIC);
#endif
  prop.location.id = currentDevice;
  MSCCLPP_CUTHROW(cuMemGetAllocationGranularity(&gran, &prop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED));
  return gran;
}
} // namespace

namespace mscclpp {

RegisteredMemory::Impl::Impl(void* data, size_t size, TransportFlags transports, Context::Impl& contextImpl)
Expand Down Expand Up @@ -190,7 +226,7 @@ RegisteredMemory::Impl::Impl(const std::vector<char>& serialization) {
accessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
accessDesc.location.id = deviceId;
accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
size_t gran = 2 * 1024 * 1024;
size_t gran = getRecommendedGranularity();
MSCCLPP_CUTHROW(cuMemAddressReserve((CUdeviceptr*)&base, this->size, gran, 0, 0));
MSCCLPP_CUTHROW(cuMemMap((CUdeviceptr)base, this->size, 0, handle, 0));
MSCCLPP_CUTHROW(cuMemSetAccess((CUdeviceptr)base, this->size, &accessDesc, 1));
Expand Down

0 comments on commit d394175

Please sign in to comment.