diff --git a/apps/nccl/src/allgather.hpp b/apps/nccl/src/allgather.hpp index 30a93d898..5e0a74fea 100644 --- a/apps/nccl/src/allgather.hpp +++ b/apps/nccl/src/allgather.hpp @@ -102,6 +102,7 @@ template cudaError_t allgather(T* buff, [[maybe_unused]] T* scratch, [[maybe_unused]] T* resultBuff, mscclpp::DeviceHandle* smChannels, int rank, int nRanksPerNode, int worldSize, size_t nelems, cudaStream_t stream) { + return cudaSuccess; allgather6<<<28, 1024, 0, stream>>>((void*)buff, smChannels, rank, worldSize, nRanksPerNode, nelems * sizeof(T) / sizeof(int)); return cudaGetLastError(); diff --git a/apps/nccl/src/allreduce.hpp b/apps/nccl/src/allreduce.hpp index 5237706b9..fdcbf0ccb 100644 --- a/apps/nccl/src/allreduce.hpp +++ b/apps/nccl/src/allreduce.hpp @@ -452,6 +452,7 @@ template cudaError_t allreduce(T* buff, T* scratch, T* resultBuff, mscclpp::DeviceHandle* smChannels, mscclpp::DeviceHandle* smOutChannels, int rank, int nRanksPerNode, int worldSize, size_t nelems, cudaStream_t stream) { + return cudaSuccess; static uint32_t flag = 1; #if defined(__HIP_PLATFORM_AMD__) if (sizeof(T) * nelems <= (1 << 20)) { diff --git a/apps/nccl/src/nccl.cu b/apps/nccl/src/nccl.cu index d03e6fdcc..b8508ca61 100644 --- a/apps/nccl/src/nccl.cu +++ b/apps/nccl/src/nccl.cu @@ -8,6 +8,7 @@ #include #include #include +#include #include "allgather.hpp" #include "allreduce.hpp" @@ -147,10 +148,12 @@ static std::shared_ptr> setupSmChannel [](const mscclpp::SmChannel& smChannel) { return mscclpp::deviceHandle(smChannel); }); std::shared_ptr> ptr = mscclpp::allocSharedCuda>(smChannelDeviceHandles.size()); - mscclpp::AvoidCudaGraphCaptureGuard guard; - CUDACHECK(cudaMemcpy(ptr.get(), smChannelDeviceHandles.data(), - sizeof(mscclpp::DeviceHandle) * smChannelDeviceHandles.size(), - cudaMemcpyHostToDevice)); + // mscclpp::AvoidCudaGraphCaptureGuard guard; + // CUDACHECK(cudaMemcpy(ptr.get(), smChannelDeviceHandles.data(), + // sizeof(mscclpp::DeviceHandle) * smChannelDeviceHandles.size(), + // cudaMemcpyHostToDevice)); + mscclpp::memcpyCuda>(ptr.get(), smChannelDeviceHandles.data(), + smChannelDeviceHandles.size(), cudaMemcpyHostToDevice); return ptr; } @@ -327,6 +330,7 @@ NCCL_API ncclResult_t ncclBroadcast(const void*, void*, size_t, ncclDataType_t, NCCL_API ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, ncclRedOp_t, ncclComm_t comm, cudaStream_t stream) { + std::cerr << "ncclAllReduce called\n"; size_t bytes = count * ncclTypeSize(datatype); if (sendbuff == nullptr || recvbuff == nullptr || bytes == 0 || comm == nullptr) return ncclInvalidArgument; int rank = comm->comm->bootstrap()->getRank(); @@ -336,10 +340,16 @@ NCCL_API ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t auto it = comm->channelInfos.find(key); if (it == comm->channelInfos.end()) { + std::cerr << "Entering Barrier1\n"; + comm->comm->bootstrap()->barrier(); + std::cerr << "Done Barrier1\n"; // setup smChannels (src: sendbuff, dst: remote scratch buff) std::vector channels = setupSmChannels(comm, comm->remoteScratchRegMemories, const_cast(sendbuff)); ChannelInfo channelInfo{channels, {}, setupSmChannelDeviceHandles(channels), nullptr}; it = comm->channelInfos.emplace(key, channelInfo).first; + std::cerr << "Entering Barrier2\n"; + comm->comm->bootstrap()->barrier(); + std::cerr << "Done Barrier2\n"; // setup smOutChannels (src: recvbuff, dst: remote recvbuff) if (bytes > (1 << 20)) { @@ -349,6 +359,9 @@ NCCL_API ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t it->second.smOutChannels = outChannels; it->second.smOutChannelDeviceHandles = setupSmChannelDeviceHandles(outChannels); } + std::cerr << "Entering Barrier3\n"; + comm->comm->bootstrap()->barrier(); + std::cerr << "Done Barrier3\n"; } smChannels = it->second.smChannelDeviceHandles.get(); @@ -384,6 +397,7 @@ NCCL_API ncclResult_t ncclReduceScatter(const void*, void*, size_t, ncclDataType NCCL_API ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount, ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream) { + std::cerr << "ncclAllGather called\n"; size_t bytes = sendcount * ncclTypeSize(datatype); if (sendbuff == nullptr || recvbuff == nullptr || bytes == 0 || comm == nullptr) return ncclInvalidArgument; int rank = comm->comm->bootstrap()->getRank(); @@ -393,16 +407,19 @@ NCCL_API ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t auto it = comm->channelInfos.find(key); if (it == comm->channelInfos.end()) { + std::cerr << "Entering Barrier4\n"; + comm->comm->bootstrap()->barrier(); + std::cerr << "Done Barrier4\n"; std::vector remoteMemories = setupRemoteMemories(comm->comm, rank, const_cast(recvbuff), bytes * nRank, mscclpp::Transport::CudaIpc); std::vector channels = setupSmChannels(comm, remoteMemories, const_cast(recvbuff)); - std::vector> smChannelDeviceHandles; - std::transform(channels.begin(), channels.end(), std::back_inserter(smChannelDeviceHandles), - [](const mscclpp::SmChannel& smChannel) { return mscclpp::deviceHandle(smChannel); }); ChannelInfo channelInfo{channels, {}, setupSmChannelDeviceHandles(channels), nullptr}; it = comm->channelInfos.emplace(key, channelInfo).first; + std::cerr << "Entering Barrier5\n"; + comm->comm->bootstrap()->barrier(); + std::cerr << "Done Barrier5\n"; } smChannels = it->second.smChannelDeviceHandles.get(); if ((char*)sendbuff == (char*)recvbuff + rank * sendcount) { diff --git a/include/mscclpp/gpu.hpp b/include/mscclpp/gpu.hpp index 01f875099..8e9e17ab5 100644 --- a/include/mscclpp/gpu.hpp +++ b/include/mscclpp/gpu.hpp @@ -6,7 +6,7 @@ #if defined(__HIP_PLATFORM_AMD__) -#include +// #include #include #include diff --git a/src/registered_memory.cc b/src/registered_memory.cc index 6d5fd79f5..cf5bc0bb4 100644 --- a/src/registered_memory.cc +++ b/src/registered_memory.cc @@ -146,7 +146,7 @@ RegisteredMemory::Impl::Impl(const std::vector& serialization) { // The memory is local to the machine but not to the process, so we need to open the CUDA IPC handle auto entry = getTransportInfo(Transport::CudaIpc); void* base; - MSCCLPP_CUDATHROW(cudaIpcOpenMemHandle(&base, entry.cudaIpcBaseHandle, cudaIpcMemLazyEnablePeerAccess)); + // MSCCLPP_CUDATHROW(cudaIpcOpenMemHandle(&base, entry.cudaIpcBaseHandle, cudaIpcMemLazyEnablePeerAccess)); this->data = static_cast(base) + entry.cudaIpcOffsetFromBase; INFO(MSCCLPP_P2P, "Opened CUDA IPC handle at pointer %p", this->data); } else {