Skip to content

Commit

Permalink
merge main and address comments
Browse files Browse the repository at this point in the history
  • Loading branch information
Binyang2014 committed Nov 13, 2024
2 parents 821af20 + 1baea89 commit 755629a
Show file tree
Hide file tree
Showing 20 changed files with 376 additions and 272 deletions.
9 changes: 8 additions & 1 deletion include/mscclpp/executor.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,18 @@

#include <memory>
#include <mscclpp/core.hpp>
#include <mscclpp/gpu_data_types.hpp>
#include <unordered_map>

namespace mscclpp {

// Element data types accepted by the executor API.
// (This commit moves the enum here from gpu_data_types.hpp so that
// executor.hpp is the single owner of the definition.)
enum class DataType {
INT32,     // 32-bit signed integer (sized via sizeof(int) in callers)
UINT32,    // 32-bit unsigned integer
FLOAT16,   // IEEE half precision (__half)
FLOAT32,   // IEEE single precision (float)
BFLOAT16,  // bfloat16 (__bfloat16)
};

enum class PacketType {
LL8,
LL16,
Expand Down
7 changes: 0 additions & 7 deletions include/mscclpp/gpu.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -103,13 +103,6 @@ constexpr auto CU_MEM_ACCESS_FLAGS_PROT_READWRITE = hipMemAccessFlagsProtReadWri
#define CUDA_NVLS_SUPPORTED 0
#endif // !defined(__HIP_PLATFORM_AMD__)

// Fabric
#if !defined(__HIP_PLATFORM_AMD__)
#define CUDA_FABRIC_SUPPORTED ((CUDART_VERSION >= 12040))
#else // !defined(__HIP_PLATFORM_AMD__)
#define CUDA_FABRIC_SUPPORTED 0
#endif // !defined(__HIP_PLATFORM_AMD__)

// GPU sync threads
#if defined(__HIP_PLATFORM_AMD__)
#define __syncshm() asm volatile("s_waitcnt lgkmcnt(0) \n s_barrier");
Expand Down
12 changes: 0 additions & 12 deletions include/mscclpp/gpu_data_types.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,16 +29,4 @@ using __bfloat162 = __nv_bfloat162;

#endif

namespace mscclpp {

// NOTE(review): this is the copy of DataType removed from
// gpu_data_types.hpp by this commit; the definition now lives in
// include/mscclpp/executor.hpp. Kept here only as diff context.
enum class DataType {
INT32,
UINT32,
FLOAT16,
FLOAT32,
BFLOAT16,
};

} // namespace mscclpp

#endif // MSCCLPP_GPU_DATA_TYPES_HPP_
16 changes: 8 additions & 8 deletions include/mscclpp/gpu_utils.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,7 @@ T* cudaCalloc(size_t nelem) {
return ptr;
}

#if (CUDA_FABRIC_SUPPORTED)
#if (CUDA_NVLS_SUPPORTED)
template <class T>
T* cudaPhysicalCalloc(size_t nbytes, size_t gran) {
AvoidCudaGraphCaptureGuard cgcGuard;
Expand Down Expand Up @@ -246,7 +246,7 @@ std::shared_ptr<T> allocSharedCuda(size_t count = 1) {
return detail::safeAlloc<T, detail::cudaCalloc<T>, CudaDeleter<T>, std::shared_ptr<T>>(count);
}

#if (CUDA_FABRIC_SUPPORTED)
#if (CUDA_NVLS_SUPPORTED)
static inline size_t getMulticastGranularity(size_t size, CUmulticastGranularity_flags granFlag) {
size_t gran = 0;
int numDevices = 0;
Expand All @@ -270,9 +270,9 @@ static inline size_t getMulticastGranularity(size_t size, CUmulticastGranularity
/// @return A std::shared_ptr to the allocated memory.
template <class T>
std::shared_ptr<T> allocSharedPhysicalCuda([[maybe_unused]] size_t count, [[maybe_unused]] size_t gran = 0) {
#if (CUDA_FABRIC_SUPPORTED)
if (!isFabricSupported()) {
throw Error("Only suupport GPU with Fabric support", ErrorCode::InvalidUsage);
#if (CUDA_NVLS_SUPPORTED)
if (!isNvlsSupported()) {
throw Error("Only support GPU with NVLS support", ErrorCode::InvalidUsage);
}
if (count == 0) {
return nullptr;
Expand Down Expand Up @@ -387,9 +387,9 @@ UniqueCudaHostPtr<T> makeUniqueCudaHost(size_t count) {
/// @return A std::unique_ptr to the allocated memory.
template <class T>
std::unique_ptr<T> allocUniquePhysicalCuda([[maybe_unused]] size_t count, [[maybe_unused]] size_t gran = 0) {
#if (CUDA_FABRIC_SUPPORTED)
if (!isFabricSupported()) {
throw Error("Only suupport GPU with Fabric support", ErrorCode::InvalidUsage);
#if (CUDA_NVLS_SUPPORTED)
if (!isNvlsSupported()) {
    throw Error("Only support GPU with NVLS support", ErrorCode::InvalidUsage);
}
if (count == 0) {
return nullptr;
Expand Down
9 changes: 2 additions & 7 deletions include/mscclpp/nvls.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,25 +26,20 @@ class NvlsConnection {
struct DeviceMulticastPointer {
private:
void* devicePtr_;
std::shared_ptr<char> ptr_;
std::shared_ptr<char> mcPtr_;
size_t bufferSize_;

public:
using DeviceHandle = DeviceMulticastPointerDeviceHandle;
DeviceMulticastPointer(std::shared_ptr<char> ptr, std::shared_ptr<char> mcPtr, size_t bufferSize)
: devicePtr_(ptr.get()), ptr_(ptr), mcPtr_(mcPtr), bufferSize_(bufferSize) {}
DeviceMulticastPointer(void* devicePtr, std::shared_ptr<char> mcPtr, size_t bufferSize)
: devicePtr_(devicePtr), ptr_(nullptr), mcPtr_(mcPtr), bufferSize_(bufferSize) {}
: devicePtr_(devicePtr), mcPtr_(mcPtr), bufferSize_(bufferSize) {}
DeviceHandle deviceHandle();
void* getDevicePtr();

friend class NvlsConnection;
};

std::shared_ptr<DeviceMulticastPointer> allocateAndBindCuda(size_t size);

std::shared_ptr<char> bindAllocatedCuda(CUdeviceptr devicePtr, size_t size);
DeviceMulticastPointer bindAllocatedMemory(CUdeviceptr devicePtr, size_t size);

size_t getMultiCastMinGranularity();

Expand Down
1 change: 0 additions & 1 deletion include/mscclpp/utils.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,6 @@ struct ScopedTimer : public Timer {
std::string getHostName(int maxlen, const char delim);

bool isNvlsSupported();
bool isFabricSupported();

} // namespace mscclpp

Expand Down
34 changes: 5 additions & 29 deletions python/mscclpp/gpu_utils_py.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,45 +10,21 @@ using namespace mscclpp;

class PyCudaMemory {
public:
PyCudaMemory(size_t size, DataType dtype) : size_(size), dtype_(dtype) { ptr_ = allocSharedPhysicalCuda<char>(size); }
PyCudaMemory(size_t size) : size_(size) { ptr_ = allocSharedPhysicalCuda<char>(size); }

uintptr_t getPtr() const { return (uintptr_t)(ptr_.get()); }
size_t size() const { return size_; }
DataType dtype() const { return dtype_; }

private:
std::shared_ptr<char> ptr_;
size_t size_;
DataType dtype_;
};

std::shared_ptr<PyCudaMemory> allocSharedPhysicalCudaDispatcher(size_t count, DataType dtype) {
size_t size = 0;
switch (dtype) {
case DataType::FLOAT32:
size = count * sizeof(float);
break;
case DataType::FLOAT16:
size = count * sizeof(__half);
break;
case DataType::BFLOAT16:
size = count * sizeof(__bfloat16);
break;
case DataType::INT32:
size = count * sizeof(int);
break;
default:
throw std::runtime_error("Unsupported data type.");
}

return std::make_shared<PyCudaMemory>(size, dtype);
}

void register_gpu_utils(nb::module_& m) {
nb::class_<PyCudaMemory>(m, "PyCudaMemory")
.def(nb::init<size_t, DataType>(), nb::arg("size"), nb::arg("dtype"))
.def(nb::init<size_t>(), nb::arg("size"))
.def("get_ptr", &PyCudaMemory::getPtr, "Get the raw pointer")
.def("size", &PyCudaMemory::size, "Get the size of the allocated memory")
.def("dtype", &PyCudaMemory::dtype, "Get the data type of the memory");
m.def("alloc_shared_physical_cuda", &allocSharedPhysicalCudaDispatcher, nb::arg("count"), nb::arg("dtype"));
.def("size", &PyCudaMemory::size, "Get the size of the allocated memory");
m.def(
"alloc_shared_physical_cuda", [](size_t size) { return std::make_shared<PyCudaMemory>(size); }, nb::arg("size"));
}
2 changes: 1 addition & 1 deletion python/mscclpp/nvls_py.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ void register_nvls(nb::module_& m) {
});

nb::class_<NvlsConnection>(m, "NvlsConnection")
.def("allocate_bind_memory", &NvlsConnection::allocateAndBindCuda)
.def("bind_allocated_memory", &NvlsConnection::bindAllocatedMemory, nb::arg("devicePtr"), nb::arg("size"))
.def("get_multicast_min_granularity", &NvlsConnection::getMultiCastMinGranularity);

m.def("connect_nvls_collective", &connectNvlsCollective, nb::arg("communicator"), nb::arg("allRanks"),
Expand Down
7 changes: 5 additions & 2 deletions python/mscclpp_benchmark/mscclpp_op.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import os
import cupy as cp
import ctypes
from mscclpp import Transport, ProxyService, SmDevice2DeviceSemaphore
from mscclpp import Transport, ProxyService, SmDevice2DeviceSemaphore, alloc_shared_physical_cuda
import mscclpp.comm as mscclpp_comm
from mscclpp.utils import KernelBuilder, pack

Expand Down Expand Up @@ -443,10 +443,13 @@ def __init__(
self.nvls_connection = group.make_connection(all_ranks, Transport.Nvls)
min_gran = self.nvls_connection.get_multicast_min_granularity()
aligned_buffer_size = int(((buffer_size + min_gran - 1) // min_gran) * min_gran)
self.nvls_mem_handle = self.nvls_connection.allocate_bind_memory(
buffer_raw = alloc_shared_physical_cuda(aligned_buffer_size)
self.nvls_mem_handle = self.nvls_connection.bind_allocated_memory(
buffer_raw.get_ptr(),
aligned_buffer_size
) # just using recommended size for now
self.memory_ptr = self.nvls_mem_handle.get_device_ptr()
self.buffer_raw = buffer_raw

self.cp_memory_ptr = cp.cuda.MemoryPointer(cp.cuda.UnownedMemory(self.memory_ptr, aligned_buffer_size, None), 0)
self.memory = cp.ndarray(nelem, memory_dtype, self.cp_memory_ptr)
Expand Down
Loading

0 comments on commit 755629a

Please sign in to comment.