Skip to content

Commit

Permalink
WIP
Browse files Browse the repository at this point in the history
  • Loading branch information
Binyang2014 committed Oct 31, 2024
1 parent 3964225 commit d394175
Show file tree
Hide file tree
Showing 2 changed files with 72 additions and 45 deletions.
79 changes: 35 additions & 44 deletions include/mscclpp/gpu_utils.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -133,29 +133,42 @@ PhysicalCudaMemory<T>* cudaPhysicalCalloc(size_t nelem, size_t gran) {
}

template <class T>
T* cudaPhysicalCallocPtr(size_t nelem, CUmemAllocationProp prop, size_t gran) {
T* cudaPhysicalCallocPtr(size_t nbytes) {
AvoidCudaGraphCaptureGuard cgcGuard;
CUmemGenericAllocationHandle memHandle;
int deviceId = -1;
CUdevice currentDevice;
MSCCLPP_CUDATHROW(cudaGetDevice(&deviceId));
MSCCLPP_CUTHROW(cuDeviceGet(&currentDevice, deviceId));

CUmemAllocationProp prop = {};
prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
#if defined(__HIP_PLATFORM_AMD__)
prop.requestedHandleType = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR;
#else
prop.requestedHandleTypes =
(CUmemAllocationHandleType)(CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR | CU_MEM_HANDLE_TYPE_FABRIC);
#endif
prop.location.id = currentDevice;

size_t bufferSize = sizeof(T) * nelem;
// allocate physical memory
MSCCLPP_CUTHROW(cuMemCreate(&memHandle, bufferSize, &prop, 0 /*flags*/));
CUmemGenericAllocationHandle memHandle;
MSCCLPP_CUTHROW(cuMemCreate(&memHandle, nbytes, &prop, 0 /*flags*/));

CUmemAccessDesc accessDesc = {};
accessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
accessDesc.location.id = deviceId;
accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;

T* devicePtr = nullptr;
size_t gran = 0;
// Map the device pointer
MSCCLPP_CUTHROW(cuMemAddressReserve((CUdeviceptr*)&devicePtr, bufferSize, gran, 0U, 0));
MSCCLPP_CUTHROW(cuMemMap((CUdeviceptr)devicePtr, bufferSize, 0, memHandle, 0));
MSCCLPP_CUTHROW(cuMemSetAccess((CUdeviceptr)devicePtr, bufferSize, &accessDesc, 1));
MSCCLPP_CUTHROW(cuMemGetAllocationGranularity(&gran, &prop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED));
MSCCLPP_CUTHROW(cuMemAddressReserve((CUdeviceptr*)&devicePtr, nbytes, gran, 0U, 0));
MSCCLPP_CUTHROW(cuMemMap((CUdeviceptr)devicePtr, nbytes, 0, memHandle, 0));
MSCCLPP_CUTHROW(cuMemSetAccess((CUdeviceptr)devicePtr, nbytes, &accessDesc, 1));
CudaStreamWithFlags stream(cudaStreamNonBlocking);
MSCCLPP_CUDATHROW(cudaMemsetAsync(devicePtr, 0, bufferSize, stream));

MSCCLPP_CUDATHROW(cudaMemsetAsync(devicePtr, 0, nbytes, stream));
MSCCLPP_CUDATHROW(cudaStreamSynchronize(stream));

return devicePtr;
Expand Down Expand Up @@ -231,25 +244,6 @@ Memory safeAlloc(size_t nelem, size_t gran) {
return Memory(ptr, Deleter());
}

template <class T, T*(alloc)(size_t, CUmemAllocationProp, size_t), class Deleter, class Memory>
Memory safeAlloc(size_t nelem, CUmemAllocationProp memProp, size_t gran) {
if ((nelem * sizeof(T)) % gran) {
throw Error("The request allocation size is not divisible by the required granularity:" +
std::to_string(nelem * sizeof(T)) + " vs " + std::to_string(gran),
ErrorCode::InvalidUsage);
}
T* ptr = nullptr;
try {
ptr = alloc(nelem, memProp, gran);
} catch (...) {
if (ptr) {
Deleter()(ptr);
}
throw;
}
return Memory(ptr, Deleter());
}

} // namespace detail

/// A deleter that calls cudaFree for use with std::unique_ptr or std::shared_ptr.
Expand Down Expand Up @@ -329,26 +323,23 @@ std::shared_ptr<PhysicalCudaMemory<T>> allocSharedPhysicalCuda(size_t count, siz
/// Allocates physical (cuMemCreate/cuMemMap-backed) device memory and returns it
/// as a std::shared_ptr<T>.
///
/// The requested byte size is rounded up to the recommended multicast
/// granularity so the buffer can later be bound to a multicast object.
///
/// @tparam T    Element type.
/// @param count Number of elements to allocate.
/// @return std::shared_ptr<T> owning the physically-backed device buffer.
template <class T>
std::shared_ptr<T> allocSharedPhysicalCudaPtr(size_t count) {
  size_t gran = 0;
  int numDevices = 0;
  MSCCLPP_CUDATHROW(cudaGetDeviceCount(&numDevices));

  CUmulticastObjectProp prop = {};
  prop.size = count * sizeof(T);
  // This is a dummy value, it might affect the granularity in the future
  prop.numDevices = numDevices;
#if defined(__HIP_PLATFORM_AMD__)
  // TODO: revisit when HIP fixes this typo in the field name
  prop.handleTypes = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR;
#else
  prop.handleTypes = (CUmemAllocationHandleType)(CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR | CU_MEM_HANDLE_TYPE_FABRIC);
#endif
  prop.flags = 0;
  // Round the allocation up to the recommended multicast granularity.
  MSCCLPP_CUTHROW(cuMulticastGetGranularity(&gran, &prop, CU_MULTICAST_GRANULARITY_RECOMMENDED));
  size_t nbytes = (count * sizeof(T) + gran - 1) / gran * gran;
  return detail::safeAlloc<T, detail::cudaPhysicalCallocPtr<T>, CudaPhysicalPtrDeleter<T>, std::shared_ptr<T>>(nbytes);
}
#endif

Expand Down
38 changes: 37 additions & 1 deletion src/registered_memory.cc
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,42 @@
#include "debug.h"
#include "utils_internal.hpp"

namespace {
// Check if ptr was allocated through the CUDA VMM API (cuMemCreate/cuMemMap).
// Returns true for cuMemMap-backed pointers, false otherwise.
bool isCuMemMapAllocated(void* ptr) {
  CUmemGenericAllocationHandle handle;
  // cuMemRetainAllocationHandle only succeeds for pointers backed by a
  // cuMemCreate allocation; failure means ptr came from another allocator.
  CUresult result = cuMemRetainAllocationHandle(&handle, ptr);
  if (result != CUDA_SUCCESS) {
    return false;
  }
  // Drop the extra reference taken by the retain call above.
  cuMemRelease(handle);
  return true;
}

// Get the recommended granularity for cuMemAddressReserve
// Queries the allocation granularity of the current device for pinned,
// device-located physical allocations; used as the alignment when reserving
// virtual address ranges to map cuMemCreate handles into.
size_t getRecommendedGranularity() {
  size_t gran = 0;
  int deviceId = -1;
  int currentDevice = -1;
  MSCCLPP_CUDATHROW(cudaGetDevice(&deviceId));
  MSCCLPP_CUTHROW(cuDeviceGet(&currentDevice, deviceId));

  CUmemAllocationProp prop = {};
  prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
  prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
#if defined(__HIP_PLATFORM_AMD__)
  // HIP spells the field without the trailing 's'.
  prop.requestedHandleType = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR;
#else
  // Allow both POSIX FD and fabric export so the handle type matches
  // whatever the allocation path requested.
  prop.requestedHandleTypes =
      (CUmemAllocationHandleType)(CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR | CU_MEM_HANDLE_TYPE_FABRIC);
#endif
  prop.location.id = currentDevice;
  MSCCLPP_CUTHROW(cuMemGetAllocationGranularity(&gran, &prop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED));
  return gran;
}
} // namespace

namespace mscclpp {

RegisteredMemory::Impl::Impl(void* data, size_t size, TransportFlags transports, Context::Impl& contextImpl)
Expand Down Expand Up @@ -190,7 +226,7 @@ RegisteredMemory::Impl::Impl(const std::vector<char>& serialization) {
accessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
accessDesc.location.id = deviceId;
accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
size_t gran = 2 * 1024 * 1024;
size_t gran = getRecommendedGranularity();
MSCCLPP_CUTHROW(cuMemAddressReserve((CUdeviceptr*)&base, this->size, gran, 0, 0));
MSCCLPP_CUTHROW(cuMemMap((CUdeviceptr)base, this->size, 0, handle, 0));
MSCCLPP_CUTHROW(cuMemSetAccess((CUdeviceptr)base, this->size, &accessDesc, 1));
Expand Down

0 comments on commit d394175

Please sign in to comment.