Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

debugging #270

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions apps/nccl/src/allgather.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,7 @@ template <bool IsOutOfPlace, typename T>
cudaError_t allgather(T* buff, [[maybe_unused]] T* scratch, [[maybe_unused]] T* resultBuff,
mscclpp::DeviceHandle<mscclpp::SmChannel>* smChannels, int rank, int nRanksPerNode, int worldSize,
size_t nelems, cudaStream_t stream) {
return cudaSuccess;
allgather6<IsOutOfPlace><<<28, 1024, 0, stream>>>((void*)buff, smChannels, rank, worldSize, nRanksPerNode,
nelems * sizeof(T) / sizeof(int));
return cudaGetLastError();
Expand Down
1 change: 1 addition & 0 deletions apps/nccl/src/allreduce.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -452,6 +452,7 @@ template <typename T>
cudaError_t allreduce(T* buff, T* scratch, T* resultBuff, mscclpp::DeviceHandle<mscclpp::SmChannel>* smChannels,
mscclpp::DeviceHandle<mscclpp::SmChannel>* smOutChannels, int rank, int nRanksPerNode,
int worldSize, size_t nelems, cudaStream_t stream) {
return cudaSuccess;
static uint32_t flag = 1;
#if defined(__HIP_PLATFORM_AMD__)
if (sizeof(T) * nelems <= (1 << 20)) {
Expand Down
31 changes: 24 additions & 7 deletions apps/nccl/src/nccl.cu
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
#include <mscclpp/sm_channel_device.hpp>
#include <unordered_map>
#include <vector>
#include <iostream>

#include "allgather.hpp"
#include "allreduce.hpp"
Expand Down Expand Up @@ -147,10 +148,12 @@ static std::shared_ptr<mscclpp::DeviceHandle<mscclpp::SmChannel>> setupSmChannel
[](const mscclpp::SmChannel& smChannel) { return mscclpp::deviceHandle(smChannel); });
std::shared_ptr<mscclpp::DeviceHandle<mscclpp::SmChannel>> ptr =
mscclpp::allocSharedCuda<mscclpp::DeviceHandle<mscclpp::SmChannel>>(smChannelDeviceHandles.size());
mscclpp::AvoidCudaGraphCaptureGuard guard;
CUDACHECK(cudaMemcpy(ptr.get(), smChannelDeviceHandles.data(),
sizeof(mscclpp::DeviceHandle<mscclpp::SmChannel>) * smChannelDeviceHandles.size(),
cudaMemcpyHostToDevice));
// mscclpp::AvoidCudaGraphCaptureGuard guard;
// CUDACHECK(cudaMemcpy(ptr.get(), smChannelDeviceHandles.data(),
// sizeof(mscclpp::DeviceHandle<mscclpp::SmChannel>) * smChannelDeviceHandles.size(),
// cudaMemcpyHostToDevice));
mscclpp::memcpyCuda<mscclpp::DeviceHandle<mscclpp::SmChannel>>(ptr.get(), smChannelDeviceHandles.data(),
smChannelDeviceHandles.size(), cudaMemcpyHostToDevice);
return ptr;
}

Expand Down Expand Up @@ -327,6 +330,7 @@ NCCL_API ncclResult_t ncclBroadcast(const void*, void*, size_t, ncclDataType_t,

NCCL_API ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype,
ncclRedOp_t, ncclComm_t comm, cudaStream_t stream) {
std::cerr << "ncclAllReduce called\n";
size_t bytes = count * ncclTypeSize(datatype);
if (sendbuff == nullptr || recvbuff == nullptr || bytes == 0 || comm == nullptr) return ncclInvalidArgument;
int rank = comm->comm->bootstrap()->getRank();
Expand All @@ -336,10 +340,16 @@ NCCL_API ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t

auto it = comm->channelInfos.find(key);
if (it == comm->channelInfos.end()) {
std::cerr << "Entering Barrier1\n";
comm->comm->bootstrap()->barrier();
std::cerr << "Done Barrier1\n";
// setup smChannels (src: sendbuff, dst: remote scratch buff)
std::vector<mscclpp::SmChannel> channels = setupSmChannels(comm, comm->remoteScratchRegMemories, const_cast<void*>(sendbuff));
ChannelInfo channelInfo{channels, {}, setupSmChannelDeviceHandles(channels), nullptr};
it = comm->channelInfos.emplace(key, channelInfo).first;
std::cerr << "Entering Barrier2\n";
comm->comm->bootstrap()->barrier();
std::cerr << "Done Barrier2\n";

// setup smOutChannels (src: recvbuff, dst: remote recvbuff)
if (bytes > (1 << 20)) {
Expand All @@ -349,6 +359,9 @@ NCCL_API ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t
it->second.smOutChannels = outChannels;
it->second.smOutChannelDeviceHandles = setupSmChannelDeviceHandles(outChannels);
}
std::cerr << "Entering Barrier3\n";
comm->comm->bootstrap()->barrier();
std::cerr << "Done Barrier3\n";
}

smChannels = it->second.smChannelDeviceHandles.get();
Expand Down Expand Up @@ -384,6 +397,7 @@ NCCL_API ncclResult_t ncclReduceScatter(const void*, void*, size_t, ncclDataType

NCCL_API ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount, ncclDataType_t datatype,
ncclComm_t comm, cudaStream_t stream) {
std::cerr << "ncclAllGather called\n";
size_t bytes = sendcount * ncclTypeSize(datatype);
if (sendbuff == nullptr || recvbuff == nullptr || bytes == 0 || comm == nullptr) return ncclInvalidArgument;
int rank = comm->comm->bootstrap()->getRank();
Expand All @@ -393,16 +407,19 @@ NCCL_API ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t

auto it = comm->channelInfos.find(key);
if (it == comm->channelInfos.end()) {
std::cerr << "Entering Barrier4\n";
comm->comm->bootstrap()->barrier();
std::cerr << "Done Barrier4\n";
std::vector<mscclpp::RegisteredMemory> remoteMemories =
setupRemoteMemories(comm->comm, rank, const_cast<void*>(recvbuff), bytes * nRank,
mscclpp::Transport::CudaIpc);
std::vector<mscclpp::SmChannel> channels =
setupSmChannels(comm, remoteMemories, const_cast<void*>(recvbuff));
std::vector<mscclpp::DeviceHandle<mscclpp::SmChannel>> smChannelDeviceHandles;
std::transform(channels.begin(), channels.end(), std::back_inserter(smChannelDeviceHandles),
[](const mscclpp::SmChannel& smChannel) { return mscclpp::deviceHandle(smChannel); });
ChannelInfo channelInfo{channels, {}, setupSmChannelDeviceHandles(channels), nullptr};
it = comm->channelInfos.emplace(key, channelInfo).first;
std::cerr << "Entering Barrier5\n";
comm->comm->bootstrap()->barrier();
std::cerr << "Done Barrier5\n";
}
smChannels = it->second.smChannelDeviceHandles.get();
if ((char*)sendbuff == (char*)recvbuff + rank * sendcount) {
Expand Down
2 changes: 1 addition & 1 deletion include/mscclpp/gpu.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

#if defined(__HIP_PLATFORM_AMD__)

#include <hip/hip_bf16.h>
// #include <hip/hip_bf16.h>
#include <hip/hip_fp16.h>
#include <hip/hip_runtime.h>

Expand Down
2 changes: 1 addition & 1 deletion src/registered_memory.cc
Original file line number Diff line number Diff line change
Expand Up @@ -146,7 +146,7 @@ RegisteredMemory::Impl::Impl(const std::vector<char>& serialization) {
// The memory is local to the machine but not to the process, so we need to open the CUDA IPC handle
auto entry = getTransportInfo(Transport::CudaIpc);
void* base;
MSCCLPP_CUDATHROW(cudaIpcOpenMemHandle(&base, entry.cudaIpcBaseHandle, cudaIpcMemLazyEnablePeerAccess));
// MSCCLPP_CUDATHROW(cudaIpcOpenMemHandle(&base, entry.cudaIpcBaseHandle, cudaIpcMemLazyEnablePeerAccess));
this->data = static_cast<char*>(base) + entry.cudaIpcOffsetFromBase;
INFO(MSCCLPP_P2P, "Opened CUDA IPC handle at pointer %p", this->data);
} else {
Expand Down
Loading