Skip to content

Commit

Permalink
merge main and address comments
Browse files Browse the repository at this point in the history
  • Loading branch information
Binyang2014 committed Nov 13, 2024
2 parents 821af20 + 1baea89 commit 755629a
Show file tree
Hide file tree
Showing 20 changed files with 376 additions and 272 deletions.
9 changes: 8 additions & 1 deletion include/mscclpp/executor.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,18 @@

#include <memory>
#include <mscclpp/core.hpp>
#include <mscclpp/gpu_data_types.hpp>
#include <unordered_map>

namespace mscclpp {

// Element data types accepted by the executor API.
// (This commit moves the enum here from gpu_data_types.hpp so that
// executor.hpp is the single owner of the definition.)
enum class DataType {
INT32,     // 32-bit signed integer (sized via sizeof(int) in callers)
UINT32,    // 32-bit unsigned integer
FLOAT16,   // IEEE half precision (__half)
FLOAT32,   // IEEE single precision (float)
BFLOAT16,  // bfloat16 (__bfloat16)
};

enum class PacketType {
LL8,
LL16,
Expand Down
7 changes: 0 additions & 7 deletions include/mscclpp/gpu.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -103,13 +103,6 @@ constexpr auto CU_MEM_ACCESS_FLAGS_PROT_READWRITE = hipMemAccessFlagsProtReadWri
#define CUDA_NVLS_SUPPORTED 0
#endif // !defined(__HIP_PLATFORM_AMD__)

// Fabric
#if !defined(__HIP_PLATFORM_AMD__)
#define CUDA_FABRIC_SUPPORTED ((CUDART_VERSION >= 12040))
#else // !defined(__HIP_PLATFORM_AMD__)
#define CUDA_FABRIC_SUPPORTED 0
#endif // !defined(__HIP_PLATFORM_AMD__)

// GPU sync threads
#if defined(__HIP_PLATFORM_AMD__)
#define __syncshm() asm volatile("s_waitcnt lgkmcnt(0) \n s_barrier");
Expand Down
12 changes: 0 additions & 12 deletions include/mscclpp/gpu_data_types.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,16 +29,4 @@ using __bfloat162 = __nv_bfloat162;

#endif

namespace mscclpp {

// NOTE(review): this is the copy of DataType removed from
// gpu_data_types.hpp by this commit; the definition now lives in
// include/mscclpp/executor.hpp. Kept here only as diff context.
enum class DataType {
INT32,
UINT32,
FLOAT16,
FLOAT32,
BFLOAT16,
};

} // namespace mscclpp

#endif // MSCCLPP_GPU_DATA_TYPES_HPP_
16 changes: 8 additions & 8 deletions include/mscclpp/gpu_utils.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,7 @@ T* cudaCalloc(size_t nelem) {
return ptr;
}

#if (CUDA_FABRIC_SUPPORTED)
#if (CUDA_NVLS_SUPPORTED)
template <class T>
T* cudaPhysicalCalloc(size_t nbytes, size_t gran) {
AvoidCudaGraphCaptureGuard cgcGuard;
Expand Down Expand Up @@ -246,7 +246,7 @@ std::shared_ptr<T> allocSharedCuda(size_t count = 1) {
return detail::safeAlloc<T, detail::cudaCalloc<T>, CudaDeleter<T>, std::shared_ptr<T>>(count);
}

#if (CUDA_FABRIC_SUPPORTED)
#if (CUDA_NVLS_SUPPORTED)
static inline size_t getMulticastGranularity(size_t size, CUmulticastGranularity_flags granFlag) {
size_t gran = 0;
int numDevices = 0;
Expand All @@ -270,9 +270,9 @@ static inline size_t getMulticastGranularity(size_t size, CUmulticastGranularity
/// @return A std::shared_ptr to the allocated memory.
template <class T>
std::shared_ptr<T> allocSharedPhysicalCuda([[maybe_unused]] size_t count, [[maybe_unused]] size_t gran = 0) {
#if (CUDA_FABRIC_SUPPORTED)
if (!isFabricSupported()) {
throw Error("Only suupport GPU with Fabric support", ErrorCode::InvalidUsage);
#if (CUDA_NVLS_SUPPORTED)
if (!isNvlsSupported()) {
throw Error("Only support GPU with NVLS support", ErrorCode::InvalidUsage);
}
if (count == 0) {
return nullptr;
Expand Down Expand Up @@ -387,9 +387,9 @@ UniqueCudaHostPtr<T> makeUniqueCudaHost(size_t count) {
/// @return A std::unique_ptr to the allocated memory.
template <class T>
std::unique_ptr<T> allocUniquePhysicalCuda([[maybe_unused]] size_t count, [[maybe_unused]] size_t gran = 0) {
#if (CUDA_FABRIC_SUPPORTED)
if (!isFabricSupported()) {
throw Error("Only suupport GPU with Fabric support", ErrorCode::InvalidUsage);
#if (CUDA_NVLS_SUPPORTED)
if (!isNvlsSupported()) {
    throw Error("Only support GPU with NVLS support", ErrorCode::InvalidUsage);
}
if (count == 0) {
return nullptr;
Expand Down
9 changes: 2 additions & 7 deletions include/mscclpp/nvls.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,25 +26,20 @@ class NvlsConnection {
struct DeviceMulticastPointer {
private:
void* devicePtr_;
std::shared_ptr<char> ptr_;
std::shared_ptr<char> mcPtr_;
size_t bufferSize_;

public:
using DeviceHandle = DeviceMulticastPointerDeviceHandle;
DeviceMulticastPointer(std::shared_ptr<char> ptr, std::shared_ptr<char> mcPtr, size_t bufferSize)
: devicePtr_(ptr.get()), ptr_(ptr), mcPtr_(mcPtr), bufferSize_(bufferSize) {}
DeviceMulticastPointer(void* devicePtr, std::shared_ptr<char> mcPtr, size_t bufferSize)
: devicePtr_(devicePtr), ptr_(nullptr), mcPtr_(mcPtr), bufferSize_(bufferSize) {}
: devicePtr_(devicePtr), mcPtr_(mcPtr), bufferSize_(bufferSize) {}
DeviceHandle deviceHandle();
void* getDevicePtr();

friend class NvlsConnection;
};

std::shared_ptr<DeviceMulticastPointer> allocateAndBindCuda(size_t size);

std::shared_ptr<char> bindAllocatedCuda(CUdeviceptr devicePtr, size_t size);
DeviceMulticastPointer bindAllocatedMemory(CUdeviceptr devicePtr, size_t size);

size_t getMultiCastMinGranularity();

Expand Down
1 change: 0 additions & 1 deletion include/mscclpp/utils.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,6 @@ struct ScopedTimer : public Timer {
std::string getHostName(int maxlen, const char delim);

bool isNvlsSupported();
bool isFabricSupported();

} // namespace mscclpp

Expand Down
34 changes: 5 additions & 29 deletions python/mscclpp/gpu_utils_py.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,45 +10,21 @@ using namespace mscclpp;

class PyCudaMemory {
public:
PyCudaMemory(size_t size, DataType dtype) : size_(size), dtype_(dtype) { ptr_ = allocSharedPhysicalCuda<char>(size); }
PyCudaMemory(size_t size) : size_(size) { ptr_ = allocSharedPhysicalCuda<char>(size); }

uintptr_t getPtr() const { return (uintptr_t)(ptr_.get()); }
size_t size() const { return size_; }
DataType dtype() const { return dtype_; }

private:
std::shared_ptr<char> ptr_;
size_t size_;
DataType dtype_;
};

std::shared_ptr<PyCudaMemory> allocSharedPhysicalCudaDispatcher(size_t count, DataType dtype) {
size_t size = 0;
switch (dtype) {
case DataType::FLOAT32:
size = count * sizeof(float);
break;
case DataType::FLOAT16:
size = count * sizeof(__half);
break;
case DataType::BFLOAT16:
size = count * sizeof(__bfloat16);
break;
case DataType::INT32:
size = count * sizeof(int);
break;
default:
throw std::runtime_error("Unsupported data type.");
}

return std::make_shared<PyCudaMemory>(size, dtype);
}

void register_gpu_utils(nb::module_& m) {
nb::class_<PyCudaMemory>(m, "PyCudaMemory")
.def(nb::init<size_t, DataType>(), nb::arg("size"), nb::arg("dtype"))
.def(nb::init<size_t>(), nb::arg("size"))
.def("get_ptr", &PyCudaMemory::getPtr, "Get the raw pointer")
.def("size", &PyCudaMemory::size, "Get the size of the allocated memory")
.def("dtype", &PyCudaMemory::dtype, "Get the data type of the memory");
m.def("alloc_shared_physical_cuda", &allocSharedPhysicalCudaDispatcher, nb::arg("count"), nb::arg("dtype"));
.def("size", &PyCudaMemory::size, "Get the size of the allocated memory");
m.def(
"alloc_shared_physical_cuda", [](size_t size) { return std::make_shared<PyCudaMemory>(size); }, nb::arg("size"));
}
2 changes: 1 addition & 1 deletion python/mscclpp/nvls_py.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ void register_nvls(nb::module_& m) {
});

nb::class_<NvlsConnection>(m, "NvlsConnection")
.def("allocate_bind_memory", &NvlsConnection::allocateAndBindCuda)
.def("bind_allocated_memory", &NvlsConnection::bindAllocatedMemory, nb::arg("devicePtr"), nb::arg("size"))
.def("get_multicast_min_granularity", &NvlsConnection::getMultiCastMinGranularity);

m.def("connect_nvls_collective", &connectNvlsCollective, nb::arg("communicator"), nb::arg("allRanks"),
Expand Down
7 changes: 5 additions & 2 deletions python/mscclpp_benchmark/mscclpp_op.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import os
import cupy as cp
import ctypes
from mscclpp import Transport, ProxyService, SmDevice2DeviceSemaphore
from mscclpp import Transport, ProxyService, SmDevice2DeviceSemaphore, alloc_shared_physical_cuda
import mscclpp.comm as mscclpp_comm
from mscclpp.utils import KernelBuilder, pack

Expand Down Expand Up @@ -443,10 +443,13 @@ def __init__(
self.nvls_connection = group.make_connection(all_ranks, Transport.Nvls)
min_gran = self.nvls_connection.get_multicast_min_granularity()
aligned_buffer_size = int(((buffer_size + min_gran - 1) // min_gran) * min_gran)
self.nvls_mem_handle = self.nvls_connection.allocate_bind_memory(
buffer_raw = alloc_shared_physical_cuda(aligned_buffer_size)
self.nvls_mem_handle = self.nvls_connection.bind_allocated_memory(
buffer_raw.get_ptr(),
aligned_buffer_size
) # just using recommended size for now
self.memory_ptr = self.nvls_mem_handle.get_device_ptr()
self.buffer_raw = buffer_raw

self.cp_memory_ptr = cp.cuda.MemoryPointer(cp.cuda.UnownedMemory(self.memory_ptr, aligned_buffer_size, None), 0)
self.memory = cp.ndarray(nelem, memory_dtype, self.cp_memory_ptr)
Expand Down
Loading

0 comments on commit 755629a

Please sign in to comment.