From d208486e5503efa41cc420f2ad286a05ad89242b Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Mon, 15 Jan 2024 19:18:48 +0000 Subject: [PATCH 01/89] Add allgather5 --- test/mscclpp-test/allgather_test.cu | 55 +++++++++++++++++++++++++++-- test/mscclpp-test/common.cc | 20 ++++++----- test/mscclpp-test/common.hpp | 2 +- 3 files changed, 65 insertions(+), 12 deletions(-) diff --git a/test/mscclpp-test/allgather_test.cu b/test/mscclpp-test/allgather_test.cu index 4b2eff78f..495236a88 100644 --- a/test/mscclpp-test/allgather_test.cu +++ b/test/mscclpp-test/allgather_test.cu @@ -23,7 +23,7 @@ using DeviceHandle = mscclpp::DeviceHandle; __constant__ DeviceHandle constProxyChans[16]; __constant__ DeviceHandle constRawProxyChan[16]; -__constant__ DeviceHandle constSmChans[8]; +__constant__ DeviceHandle constSmChans[256]; __global__ void allgather0(int rank, size_t nelemsPerGPU) { int warpId = threadIdx.x / WARP_SIZE; @@ -288,6 +288,49 @@ __global__ void allgather4(int rank, int worldSize, int nRanksPerNode, size_t ne nBlocksForLocalAllGather); } +__global__ void allgather5(int rank, int worldSize, int nRanksPerNode, size_t nelemsPerGPU) { + const int tid = threadIdx.x + blockIdx.x * blockDim.x; + const int lid = tid % WARP_SIZE; + const int wid = tid / WARP_SIZE; + const int nWarp = blockDim.x * gridDim.x / WARP_SIZE; + const int nPeer = nRanksPerNode - 1; + const int chanOffset = nPeer * blockIdx.x; + auto smChans = constSmChans + chanOffset; + + if (wid < nPeer) { + smChans[wid].signal(); + smChans[wid].wait(); + } + __syncthreads(); + constexpr size_t unitBytesPerThread = 16; + constexpr size_t unitBytesPerWarp = unitBytesPerThread * WARP_SIZE; + const size_t unitBytes = unitBytesPerWarp * nWarp; + const size_t bytesPerGPU = nelemsPerGPU * sizeof(int); + const size_t bytes = bytesPerGPU * nPeer; + const size_t nLoop = bytes / unitBytes; + for (size_t i = 0; i < nLoop; ++i) { + const size_t gWid = wid + i * nWarp; + const int peerIdx = gWid % nPeer; + const int remoteRankLocalIndex = (peerIdx < rank ? peerIdx : peerIdx + 1); + const size_t offset = bytesPerGPU * remoteRankLocalIndex + (gWid / nPeer) * unitBytesPerWarp; + smChans[peerIdx].get(offset, unitBytesPerWarp, lid, WARP_SIZE); + } + + if (bytes % unitBytes > 0) { + const size_t gWid = wid + nLoop * nWarp; + const int peerIdx = gWid % nPeer; + const int remoteRankLocalIndex = (peerIdx < rank ? peerIdx : peerIdx + 1); + const size_t offsetWithinRank = (gWid / nPeer) * unitBytesPerWarp; + const size_t offset = bytesPerGPU * remoteRankLocalIndex + offsetWithinRank; + const size_t remainBytes = (offsetWithinRank + unitBytesPerWarp > bytesPerGPU) + ? ((bytesPerGPU > offsetWithinRank) ? 
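// Worked example of the tail computation in progress here (hypothetical sizes,
// not from the patch): with unitBytesPerWarp = 2048, bytesPerGPU = 9216, and
// offsetWithinRank = 8192, the full unit would overrun this rank's region, so
// remainBytes = bytesPerGPU - offsetWithinRank = 1024.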
(bytesPerGPU - offsetWithinRank) : 0) + : unitBytesPerWarp; + if (remainBytes > 0) { + smChans[peerIdx].get(offset, remainBytes, lid, WARP_SIZE); + } + } +} + class AllGatherProxyService : public mscclpp::BaseProxyService { public: AllGatherProxyService(int worldSize, int rank, int cudaDevice); @@ -387,6 +430,9 @@ void AllGatherTestColl::runColl(const TestArgs& args, cudaStream_t stream) { if (kernelNum == 4) { nBlocks = 21; nThreads = 1024; + } else if (kernelNum == 5) { + nBlocks = 32; + nThreads = 1024; } else { nBlocks = 1; nThreads = WARP_SIZE * (worldSize - 1); @@ -401,6 +447,8 @@ void AllGatherTestColl::runColl(const TestArgs& args, cudaStream_t stream) { allgather3<<>>(); } else if (kernelNum == 4) { allgather4<<>>(rank, worldSize, nRanksPerNode, paramCount_); + } else if (kernelNum == 5) { + allgather5<<>>(rank, worldSize, nRanksPerNode, paramCount_); } } @@ -453,7 +501,8 @@ std::vector AllGatherTestColl::getKernelRestrictions() { {1, "allgather1", false, 1, 4 * worldSize_}, {2, "allgather2", true, 3, 4 * worldSize_}, {3, "allgather3", true, 1, 4 * worldSize_}, - {4, "allgather4", true, 3, 16 * worldSize_ /*use ulong2 to transfer data*/}}; + {4, "allgather4", true, 3, 16 * worldSize_ /*use ulong2 to transfer data*/}, + {5, "allgather5", false, 1, 16 * worldSize_ /*use ulong2 to transfer data*/}}; } class AllGatherTestEngine : public BaseTestEngine { @@ -494,7 +543,7 @@ void AllGatherTestEngine::setupConnections() { CUDATHROW(cudaMemcpyToSymbol(constProxyChans, devProxyChannels.data(), sizeof(DeviceHandle) * devProxyChannels.size())); - setupMeshConnections(smChannels_, sendBuff_.get(), args_.maxBytes); + setupMeshConnections(smChannels_, sendBuff_.get(), args_.maxBytes, nullptr, 0, ChannelSemantic::PUT, 32); std::vector> smChannelHandles(smChannels_.size()); if (smChannels_.size() > sizeof(constSmChans) / sizeof(DeviceHandle)) { std::runtime_error("unexpected error"); diff --git a/test/mscclpp-test/common.cc b/test/mscclpp-test/common.cc index c5653b3fc..fe07df511 100644 --- a/test/mscclpp-test/common.cc +++ b/test/mscclpp-test/common.cc @@ -428,7 +428,7 @@ void BaseTestEngine::setupMeshConnections(std::vector& smChannels, void* inputBuff, size_t inputBuffBytes, void* outputBuff, size_t outputBuffBytes, - ChannelSemantic semantic) { + ChannelSemantic semantic, size_t nChannelPerConnection) { const mscclpp::TransportFlags allTransports = mscclpp::Transport::CudaIpc | IBs[args_.gpuNum]; mscclpp::RegisteredMemory inputBufRegMem = comm_->registerMemory(inputBuff, inputBuffBytes, allTransports); mscclpp::RegisteredMemory getPacketBufRegMem; @@ -443,19 +443,23 @@ void BaseTestEngine::setupMeshConnections(std::vector& smCha (outputBuff && semantic == ChannelSemantic::PUT) ? outputBufRegMem : inputBufRegMem; setupMeshConnectionsInternal(connections, localRegMemory, remoteRegMemories); - std::unordered_map> smSemaphores; + std::unordered_map>> smSemaphores; for (size_t cid = 0; cid < connections.size(); ++cid) { if (connections[cid]->transport() == mscclpp::Transport::CudaIpc) { - smSemaphores.emplace(cid, std::make_shared(*comm_, connections[cid])); + for (size_t i = 0; i < nChannelPerConnection; ++i) { + smSemaphores[cid].emplace_back(std::make_shared(*comm_, connections[cid])); + } } } comm_->setup(); - for (size_t cid = 0; cid < connections.size(); ++cid) { - if (connections[cid]->transport() == mscclpp::Transport::CudaIpc) { - smChannels.emplace_back(smSemaphores[cid], remoteRegMemories[cid].get(), - (outputBuff && semantic == ChannelSemantic::GET) ? 
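// As this ternary reads: a GET-semantic channel pulls remote data into the
// local output buffer, so outputBuff becomes the channel's local pointer;
// PUT-semantic channels push out of the input buffer instead.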
outputBuff : inputBufRegMem.data(), - nullptr); + for (size_t i = 0; i < nChannelPerConnection; ++i) { + for (size_t cid = 0; cid < connections.size(); ++cid) { + if (connections[cid]->transport() == mscclpp::Transport::CudaIpc) { + smChannels.emplace_back(smSemaphores[cid][i], remoteRegMemories[cid].get(), + (outputBuff && semantic == ChannelSemantic::GET) ? outputBuff : inputBufRegMem.data(), + nullptr); + } } } } diff --git a/test/mscclpp-test/common.hpp b/test/mscclpp-test/common.hpp index 665ff9119..7e3e8c423 100644 --- a/test/mscclpp-test/common.hpp +++ b/test/mscclpp-test/common.hpp @@ -118,7 +118,7 @@ class BaseTestEngine { SetupChannelFunc setupChannel = nullptr); void setupMeshConnections(std::vector& smChannels, void* inputBuff, size_t inputBuffBytes, void* outputBuff = nullptr, size_t outputBuffBytes = 0, - ChannelSemantic semantic = ChannelSemantic::PUT); + ChannelSemantic semantic = ChannelSemantic::PUT, size_t nChannelPerConnection = 1); void setupMeshConnections(std::vector& smChannels, std::vector>& proxyChannels, void* inputBuff, size_t inputBuffBytes, void* putPacketBuff = nullptr, size_t putPacketBuffBytes = 0, From 6c425e35383ab55ad43651241a737fed2b0ae6e7 Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Wed, 17 Jan 2024 02:17:01 +0000 Subject: [PATCH 02/89] optimized allgather5 --- test/mscclpp-test/allgather_test.cu | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/test/mscclpp-test/allgather_test.cu b/test/mscclpp-test/allgather_test.cu index 495236a88..99191bb2f 100644 --- a/test/mscclpp-test/allgather_test.cu +++ b/test/mscclpp-test/allgather_test.cu @@ -292,7 +292,8 @@ __global__ void allgather5(int rank, int worldSize, int nRanksPerNode, size_t ne const int tid = threadIdx.x + blockIdx.x * blockDim.x; const int lid = tid % WARP_SIZE; const int wid = tid / WARP_SIZE; - const int nWarp = blockDim.x * gridDim.x / WARP_SIZE; + const int nThread = blockDim.x * gridDim.x; + const int nWarp = nThread / WARP_SIZE; const int nPeer = nRanksPerNode - 1; const int chanOffset = nPeer * blockIdx.x; auto smChans = constSmChans + chanOffset; @@ -302,11 +303,16 @@ __global__ void allgather5(int rank, int worldSize, int nRanksPerNode, size_t ne smChans[wid].wait(); } __syncthreads(); - constexpr size_t unitBytesPerThread = 16; - constexpr size_t unitBytesPerWarp = unitBytesPerThread * WARP_SIZE; - const size_t unitBytes = unitBytesPerWarp * nWarp; const size_t bytesPerGPU = nelemsPerGPU * sizeof(int); const size_t bytes = bytesPerGPU * nPeer; + size_t unitBytesPerThread; + if (bytes >= nThread * 64) { + unitBytesPerThread = 64; + } else { + unitBytesPerThread = 16; + } + const size_t unitBytesPerWarp = unitBytesPerThread * WARP_SIZE; + const size_t unitBytes = unitBytesPerWarp * nWarp; const size_t nLoop = bytes / unitBytes; for (size_t i = 0; i < nLoop; ++i) { const size_t gWid = wid + i * nWarp; @@ -431,7 +437,7 @@ void AllGatherTestColl::runColl(const TestArgs& args, cudaStream_t stream) { nBlocks = 21; nThreads = 1024; } else if (kernelNum == 5) { - nBlocks = 32; + nBlocks = 24; nThreads = 1024; } else { nBlocks = 1; From cc9c1505207848efd1d7d68dcaddac48387142a7 Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Fri, 19 Jan 2024 21:50:01 +0000 Subject: [PATCH 03/89] fix --- test/mscclpp-test/allgather_test.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/mscclpp-test/allgather_test.cu b/test/mscclpp-test/allgather_test.cu index 99191bb2f..523ea251c 100644 --- a/test/mscclpp-test/allgather_test.cu +++ 
b/test/mscclpp-test/allgather_test.cu @@ -298,8 +298,8 @@ __global__ void allgather5(int rank, int worldSize, int nRanksPerNode, size_t ne const int chanOffset = nPeer * blockIdx.x; auto smChans = constSmChans + chanOffset; - if (wid < nPeer) { - smChans[wid].signal(); + if (wid < nPeer && lid == 0) { + smChans[wid].relaxedSignal(); smChans[wid].wait(); } __syncthreads(); From c10d01b4278b700685475a2f0957bc7526157e3d Mon Sep 17 00:00:00 2001 From: amduser Date: Sun, 21 Jan 2024 21:55:50 +0000 Subject: [PATCH 04/89] updates --- test/mscclpp-test/allgather_test.cu | 47 ++++++++++++++++++----------- 1 file changed, 30 insertions(+), 17 deletions(-) diff --git a/test/mscclpp-test/allgather_test.cu b/test/mscclpp-test/allgather_test.cu index 523ea251c..2e11a02f1 100644 --- a/test/mscclpp-test/allgather_test.cu +++ b/test/mscclpp-test/allgather_test.cu @@ -23,7 +23,7 @@ using DeviceHandle = mscclpp::DeviceHandle; __constant__ DeviceHandle constProxyChans[16]; __constant__ DeviceHandle constRawProxyChan[16]; -__constant__ DeviceHandle constSmChans[256]; +__constant__ DeviceHandle constSmChans[512]; __global__ void allgather0(int rank, size_t nelemsPerGPU) { int warpId = threadIdx.x / WARP_SIZE; @@ -288,14 +288,18 @@ __global__ void allgather4(int rank, int worldSize, int nRanksPerNode, size_t ne nBlocksForLocalAllGather); } -__global__ void allgather5(int rank, int worldSize, int nRanksPerNode, size_t nelemsPerGPU) { - const int tid = threadIdx.x + blockIdx.x * blockDim.x; - const int lid = tid % WARP_SIZE; - const int wid = tid / WARP_SIZE; - const int nThread = blockDim.x * gridDim.x; - const int nWarp = nThread / WARP_SIZE; - const int nPeer = nRanksPerNode - 1; - const int chanOffset = nPeer * blockIdx.x; +__global__ void __launch_bounds__(1024, 1) allgather5(size_t rank, [[maybe_unused]] size_t worldSize, size_t nRanksPerNode, size_t nelemsPerGPU) { + const size_t nBlock = gridDim.x; + if (blockIdx.x >= nBlock) return; + + const size_t tid = threadIdx.x + blockIdx.x * blockDim.x; + const size_t lid = tid % WARP_SIZE; + const size_t wid = tid / WARP_SIZE; + + const size_t nThread = blockDim.x * nBlock; + const size_t nWarp = nThread / WARP_SIZE; + const size_t nPeer = nRanksPerNode - 1; + const size_t chanOffset = nPeer * blockIdx.x; auto smChans = constSmChans + chanOffset; if (wid < nPeer && lid == 0) { @@ -314,25 +318,34 @@ __global__ void allgather5(int rank, int worldSize, int nRanksPerNode, size_t ne const size_t unitBytesPerWarp = unitBytesPerThread * WARP_SIZE; const size_t unitBytes = unitBytesPerWarp * nWarp; const size_t nLoop = bytes / unitBytes; - for (size_t i = 0; i < nLoop; ++i) { + + if (nLoop > 0) { + // First loop unrolling + const size_t peerIdx = wid % nPeer; + const size_t remoteRankLocalIndex = (peerIdx < rank ? peerIdx : peerIdx + 1); + const size_t offset = bytesPerGPU * remoteRankLocalIndex + (wid / nPeer) * unitBytesPerWarp; + smChans[peerIdx].get<16, false>(offset, unitBytesPerWarp, lid, WARP_SIZE); + } + + for (size_t i = 1; i < nLoop; ++i) { const size_t gWid = wid + i * nWarp; - const int peerIdx = gWid % nPeer; - const int remoteRankLocalIndex = (peerIdx < rank ? peerIdx : peerIdx + 1); + const size_t peerIdx = gWid % nPeer; + const size_t remoteRankLocalIndex = (peerIdx < rank ? 
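// Peer-index mapping sketch (hypothetical rank = 3, nPeer = 7): peerIdx
// 0,1,2,3,4,5,6 maps to remote ranks 0,1,2,4,5,6,7 -- the local rank is
// skipped, which is exactly what this conditional computes.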
peerIdx : peerIdx + 1); const size_t offset = bytesPerGPU * remoteRankLocalIndex + (gWid / nPeer) * unitBytesPerWarp; - smChans[peerIdx].get(offset, unitBytesPerWarp, lid, WARP_SIZE); + smChans[peerIdx].get<16, false>(offset, unitBytesPerWarp, lid, WARP_SIZE); } if (bytes % unitBytes > 0) { const size_t gWid = wid + nLoop * nWarp; - const int peerIdx = gWid % nPeer; - const int remoteRankLocalIndex = (peerIdx < rank ? peerIdx : peerIdx + 1); + const size_t peerIdx = gWid % nPeer; + const size_t remoteRankLocalIndex = (peerIdx < rank ? peerIdx : peerIdx + 1); const size_t offsetWithinRank = (gWid / nPeer) * unitBytesPerWarp; const size_t offset = bytesPerGPU * remoteRankLocalIndex + offsetWithinRank; const size_t remainBytes = (offsetWithinRank + unitBytesPerWarp > bytesPerGPU) ? ((bytesPerGPU > offsetWithinRank) ? (bytesPerGPU - offsetWithinRank) : 0) : unitBytesPerWarp; if (remainBytes > 0) { - smChans[peerIdx].get(offset, remainBytes, lid, WARP_SIZE); + smChans[peerIdx].get<16, true>(offset, remainBytes, lid, WARP_SIZE); } } } @@ -549,7 +562,7 @@ void AllGatherTestEngine::setupConnections() { CUDATHROW(cudaMemcpyToSymbol(constProxyChans, devProxyChannels.data(), sizeof(DeviceHandle) * devProxyChannels.size())); - setupMeshConnections(smChannels_, sendBuff_.get(), args_.maxBytes, nullptr, 0, ChannelSemantic::PUT, 32); + setupMeshConnections(smChannels_, sendBuff_.get(), args_.maxBytes, nullptr, 0, ChannelSemantic::PUT, 64); std::vector> smChannelHandles(smChannels_.size()); if (smChannels_.size() > sizeof(constSmChans) / sizeof(DeviceHandle)) { std::runtime_error("unexpected error"); From 80bade5674dd9d9eb1e3716c13fc055e77ee2f46 Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Wed, 24 Jan 2024 06:44:43 +0000 Subject: [PATCH 05/89] Doing some experiments --- test/mscclpp-test/allgather_test.cu | 324 +++++++++++++++++++++++++++- test/mscclpp-test/allreduce_test.cu | 29 +-- test/mscclpp-test/common.cc | 2 +- 3 files changed, 340 insertions(+), 15 deletions(-) diff --git a/test/mscclpp-test/allgather_test.cu b/test/mscclpp-test/allgather_test.cu index 2e11a02f1..0027cda74 100644 --- a/test/mscclpp-test/allgather_test.cu +++ b/test/mscclpp-test/allgather_test.cu @@ -24,6 +24,8 @@ __constant__ DeviceHandle constProxyChans[16]; __constant__ DeviceHandle constRawProxyChan[16]; __constant__ DeviceHandle constSmChans[512]; +__constant__ DeviceHandle constSmOutOfPlaceChans[16]; +__device__ uint64_t globalFlag; __global__ void allgather0(int rank, size_t nelemsPerGPU) { int warpId = threadIdx.x / WARP_SIZE; @@ -350,6 +352,285 @@ __global__ void __launch_bounds__(1024, 1) allgather5(size_t rank, [[maybe_unuse } } +__global__ void __launch_bounds__(1024, 1) allgather6(size_t rank, [[maybe_unused]] size_t worldSize, size_t nRanksPerNode, size_t nelemsPerGPU) { + const size_t nBlock = gridDim.x; + if (blockIdx.x >= nBlock) return; + + const size_t tid = threadIdx.x + blockIdx.x * blockDim.x; + const size_t lid = tid % WARP_SIZE; + const size_t wid = tid / WARP_SIZE; + + const size_t nThread = blockDim.x * nBlock; + const size_t nWarp = nThread / WARP_SIZE; + const size_t nPeer = nRanksPerNode - 1; + const size_t chanOffset = nPeer * blockIdx.x; + auto smChans = constSmChans + chanOffset; + + if (wid < nPeer && lid == 0) { + smChans[wid].relaxedSignal(); + smChans[wid].wait(); + } + __syncthreads(); + const size_t bytesPerGPU = nelemsPerGPU * sizeof(int); + const size_t bytes = bytesPerGPU * nPeer; + size_t unitBytesPerThread; + if (bytes >= nThread * 64) { + unitBytesPerThread = 64; + } else { + 
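// Same heuristic as in allgather5: when the total payload gives every thread
    // at least 64 bytes, use 64 B/thread so each warp moves 64 * WARP_SIZE bytes
    // per loop iteration; otherwise drop to 16 B/thread (one 16-byte vector per
    // thread) so that small messages still spread across all warps.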
unitBytesPerThread = 16; + } + const size_t unitBytesPerWarp = unitBytesPerThread * WARP_SIZE; + const size_t unitBytes = unitBytesPerWarp * nWarp; + const size_t nLoop = bytes / unitBytes; + + if (nLoop > 0) { + // First loop unrolling + const size_t peerIdx = wid % nPeer; + // const size_t remoteRankLocalIndex = (peerIdx < rank ? peerIdx : peerIdx + 1); + const size_t offset = bytesPerGPU * rank + (wid / nPeer) * unitBytesPerWarp; + smChans[peerIdx].put<16, false>(offset, unitBytesPerWarp, lid, WARP_SIZE); + } + + for (size_t i = 1; i < nLoop; ++i) { + const size_t gWid = wid + i * nWarp; + const size_t peerIdx = gWid % nPeer; + // const size_t remoteRankLocalIndex = (peerIdx < rank ? peerIdx : peerIdx + 1); + const size_t offset = bytesPerGPU * rank + (gWid / nPeer) * unitBytesPerWarp; + smChans[peerIdx].put<16, false>(offset, unitBytesPerWarp, lid, WARP_SIZE); + } + + if (bytes % unitBytes > 0) { + const size_t gWid = wid + nLoop * nWarp; + const size_t peerIdx = gWid % nPeer; + const size_t remoteRankLocalIndex = (peerIdx < rank ? peerIdx : peerIdx + 1); + const size_t offsetWithinRank = (gWid / nPeer) * unitBytesPerWarp; + const size_t offset = bytesPerGPU * rank + offsetWithinRank; + const size_t remainBytes = (offsetWithinRank + unitBytesPerWarp > bytesPerGPU) + ? ((bytesPerGPU > offsetWithinRank) ? (bytesPerGPU - offsetWithinRank) : 0) + : unitBytesPerWarp; + if (remainBytes > 0) { + smChans[peerIdx].put<16, true>(offset, remainBytes, lid, WARP_SIZE); + } + } +} + +__global__ void __launch_bounds__(1024, 1) allgather7(size_t rank, [[maybe_unused]] size_t worldSize, size_t nRanksPerNode, size_t nelemsPerGPU) { + const size_t nBlock = gridDim.x; + if (blockIdx.x >= nBlock) return; + + const size_t tid = threadIdx.x + blockIdx.x * blockDim.x; + const size_t lid = tid % WARP_SIZE; + const size_t wid = tid / WARP_SIZE; + + const size_t nThread = blockDim.x * nBlock; + const size_t nWarp = nThread / WARP_SIZE; + const size_t nPeer = nRanksPerNode - 1; + // const size_t chanOffset = nPeer * blockIdx.x; + auto smChans = constSmOutOfPlaceChans; + + const uint32_t flag = (uint32_t)globalFlag; + // if (wid < nPeer && lid == 0) { + // smChans[wid].relaxedSignal(); + // smChans[wid].wait(); + // } + // __syncthreads(); + const size_t bytesPerGPU = nelemsPerGPU * sizeof(int); + const size_t bytes = bytesPerGPU * nPeer; + size_t unitBytesPerThread = 8; + // if (bytes >= nThread * 64) { + // unitBytesPerThread = 64; + // } else { + // unitBytesPerThread = 16; + // } + const size_t unitBytesPerWarp = unitBytesPerThread * WARP_SIZE; + const size_t unitBytes = unitBytesPerWarp * nWarp; + const size_t nLoop = bytes / unitBytes; + + // double buffering + const size_t scratchOffset = (flag & 1) ? 0 : bytesPerGPU * nRanksPerNode * 2; + + if (nLoop > 0) { + // First loop unrolling + const size_t peerIdx = wid % nPeer; + // const size_t remoteRankLocalIndex = (peerIdx < rank ? peerIdx : peerIdx + 1); + const size_t offset = bytesPerGPU * rank + (wid / nPeer) * unitBytesPerWarp; + smChans[peerIdx].putPackets(scratchOffset + offset * 2, offset, unitBytesPerWarp, lid, WARP_SIZE, flag); + } + + + if (nLoop > 0) { + // First loop unrolling + const size_t peerIdx = wid % nPeer; + const size_t remoteRankLocalIndex = (peerIdx < rank ? 
peerIdx : peerIdx + 1); + const size_t offset = bytesPerGPU * remoteRankLocalIndex + (wid / nPeer) * unitBytesPerWarp; + // if (lid == 0) printf("get %p, rank %zu, wid %zu, remoteRankLocalIndex %zu, offset %zu\n", smChans[peerIdx].getPacketBuffer_, rank, wid, remoteRankLocalIndex, offset); + smChans[peerIdx].getPackets(scratchOffset + offset * 2, offset, unitBytesPerWarp, lid, WARP_SIZE, flag); + } + + for (size_t i = 1; i < nLoop; ++i) { + const size_t gWid = wid + i * nWarp; + const size_t peerIdx = gWid % nPeer; + // const size_t remoteRankLocalIndex = (peerIdx < rank ? peerIdx : peerIdx + 1); + const size_t offset = bytesPerGPU * rank + (gWid / nPeer) * unitBytesPerWarp; + smChans[peerIdx].putPackets(scratchOffset + offset * 2, offset, unitBytesPerWarp, lid, WARP_SIZE, flag); + } + + for (size_t i = 1; i < nLoop; ++i) { + const size_t gWid = wid + i * nWarp; + const size_t peerIdx = gWid % nPeer; + const size_t remoteRankLocalIndex = (peerIdx < rank ? peerIdx : peerIdx + 1); + const size_t offset = bytesPerGPU * remoteRankLocalIndex + (gWid / nPeer) * unitBytesPerWarp; + smChans[peerIdx].getPackets(scratchOffset + offset * 2, offset, unitBytesPerWarp, lid, WARP_SIZE, flag); + } + + if (bytes % unitBytes > 0) { + const size_t gWid = wid + nLoop * nWarp; + const size_t peerIdx = gWid % nPeer; + const size_t remoteRankLocalIndex = (peerIdx < rank ? peerIdx : peerIdx + 1); + const size_t offsetWithinRank = (gWid / nPeer) * unitBytesPerWarp; + const size_t offset = bytesPerGPU * rank + offsetWithinRank; + const size_t remainBytes = (offsetWithinRank + unitBytesPerWarp > bytesPerGPU) + ? ((bytesPerGPU > offsetWithinRank) ? (bytesPerGPU - offsetWithinRank) : 0) + : unitBytesPerWarp; + if (remainBytes > 0) { + smChans[peerIdx].putPackets(scratchOffset + offset * 2, offset, remainBytes, lid, WARP_SIZE, flag); + } + } + if (bytes % unitBytes > 0) { + const size_t gWid = wid + nLoop * nWarp; + const size_t peerIdx = gWid % nPeer; + const size_t remoteRankLocalIndex = (peerIdx < rank ? peerIdx : peerIdx + 1); + const size_t offsetWithinRank = (gWid / nPeer) * unitBytesPerWarp; + const size_t offset = bytesPerGPU * remoteRankLocalIndex + offsetWithinRank; + const size_t remainBytes = (offsetWithinRank + unitBytesPerWarp > bytesPerGPU) + ? ((bytesPerGPU > offsetWithinRank) ? 
(bytesPerGPU - offsetWithinRank) : 0) + : unitBytesPerWarp; + if (remainBytes > 0) { + smChans[peerIdx].getPackets(scratchOffset + offset * 2, offset, remainBytes, lid, WARP_SIZE, flag); + } + } + + if (threadIdx.x == 0 && blockIdx.x == 0) { + globalFlag += 1; + } +} + +__global__ void __launch_bounds__(1024, 1) allgather8(size_t rank, [[maybe_unused]] size_t worldSize, size_t nRanksPerNode, size_t nelemsPerGPU) { + const size_t nBlock = gridDim.x / 2; + const bool isPut = blockIdx.x < nBlock; + + const size_t tid = threadIdx.x + (blockIdx.x % nBlock) * blockDim.x; + const size_t lid = tid % WARP_SIZE; + const size_t wid = tid / WARP_SIZE; + + const size_t nThread = blockDim.x * nBlock; + const size_t nWarp = nThread / WARP_SIZE; + const size_t nPeer = nRanksPerNode - 1; + // const size_t chanOffset = nPeer * blockIdx.x; + auto smChans = constSmOutOfPlaceChans; + + const uint32_t flag = (uint32_t)globalFlag; + // if (wid < nPeer && lid == 0) { + // smChans[wid].relaxedSignal(); + // smChans[wid].wait(); + // } + // __syncthreads(); + const size_t bytesPerGPU = nelemsPerGPU * sizeof(int); + const size_t bytes = bytesPerGPU * nPeer; + size_t unitBytesPerThread = 8; + // if (bytes >= nThread * 64) { + // unitBytesPerThread = 64; + // } else { + // unitBytesPerThread = 16; + // } + const size_t unitBytesPerWarp = unitBytesPerThread * WARP_SIZE; + const size_t unitBytes = unitBytesPerWarp * nWarp; + const size_t nLoop = bytes / unitBytes; + + // double buffering + const size_t scratchOffset = (flag & 1) ? 0 : bytesPerGPU * nRanksPerNode * 2; + + if (isPut) { + if (nLoop > 0) { + // First loop unrolling + const size_t peerIdx = wid % nPeer; + // const size_t remoteRankLocalIndex = (peerIdx < rank ? peerIdx : peerIdx + 1); + const size_t offset = bytesPerGPU * rank + (wid / nPeer) * unitBytesPerWarp; + // smChans[peerIdx].putPackets(scratchOffset + offset * 2, offset, unitBytesPerWarp, lid, WARP_SIZE, flag); + mscclpp::putPackets(smChans[peerIdx].dst_, scratchOffset + offset * 2, smChans[peerIdx].src_, offset, + unitBytesPerWarp, lid, WARP_SIZE, flag); + } + + + for (size_t i = 1; i < nLoop; ++i) { + const size_t gWid = wid + i * nWarp; + const size_t peerIdx = gWid % nPeer; + // const size_t remoteRankLocalIndex = (peerIdx < rank ? peerIdx : peerIdx + 1); + const size_t offset = bytesPerGPU * rank + (gWid / nPeer) * unitBytesPerWarp; + // smChans[peerIdx].putPackets(scratchOffset + offset * 2, offset, unitBytesPerWarp, lid, WARP_SIZE, flag); + mscclpp::putPackets(smChans[peerIdx].dst_, scratchOffset + offset * 2, smChans[peerIdx].src_, offset, + unitBytesPerWarp, lid, WARP_SIZE, flag); + } + + if (bytes % unitBytes > 0) { + const size_t gWid = wid + nLoop * nWarp; + const size_t peerIdx = gWid % nPeer; + const size_t remoteRankLocalIndex = (peerIdx < rank ? peerIdx : peerIdx + 1); + const size_t offsetWithinRank = (gWid / nPeer) * unitBytesPerWarp; + const size_t offset = bytesPerGPU * rank + offsetWithinRank; + const size_t remainBytes = (offsetWithinRank + unitBytesPerWarp > bytesPerGPU) + ? ((bytesPerGPU > offsetWithinRank) ? 
(bytesPerGPU - offsetWithinRank) : 0) + : unitBytesPerWarp; + if (remainBytes > 0) { + // smChans[peerIdx].putPackets(scratchOffset + offset * 2, offset, remainBytes, lid, WARP_SIZE, flag); + mscclpp::putPackets(smChans[peerIdx].dst_, scratchOffset + offset * 2, smChans[peerIdx].src_, offset, + remainBytes, lid, WARP_SIZE, flag); + } + } + } else { + if (nLoop > 0) { + // First loop unrolling + const size_t peerIdx = wid % nPeer; + const size_t remoteRankLocalIndex = (peerIdx < rank ? peerIdx : peerIdx + 1); + const size_t offset = bytesPerGPU * remoteRankLocalIndex + (wid / nPeer) * unitBytesPerWarp; + // if (lid == 0) printf("get %p, rank %zu, wid %zu, remoteRankLocalIndex %zu, offset %zu\n", smChans[peerIdx].getPacketBuffer_, rank, wid, remoteRankLocalIndex, offset); + // smChans[peerIdx].getPackets(scratchOffset + offset * 2, offset, unitBytesPerWarp, lid, WARP_SIZE, flag); + mscclpp::getPackets(smChans[peerIdx].getPacketBuffer_, scratchOffset + offset * 2, smChans[peerIdx].src_, offset, + unitBytesPerWarp, lid, WARP_SIZE, flag); + } + + for (size_t i = 1; i < nLoop; ++i) { + const size_t gWid = wid + i * nWarp; + const size_t peerIdx = gWid % nPeer; + const size_t remoteRankLocalIndex = (peerIdx < rank ? peerIdx : peerIdx + 1); + const size_t offset = bytesPerGPU * remoteRankLocalIndex + (gWid / nPeer) * unitBytesPerWarp; + // smChans[peerIdx].getPackets(scratchOffset + offset * 2, offset, unitBytesPerWarp, lid, WARP_SIZE, flag); + mscclpp::getPackets(smChans[peerIdx].getPacketBuffer_, scratchOffset + offset * 2, smChans[peerIdx].src_, offset, + unitBytesPerWarp, lid, WARP_SIZE, flag); + } + + if (bytes % unitBytes > 0) { + const size_t gWid = wid + nLoop * nWarp; + const size_t peerIdx = gWid % nPeer; + const size_t remoteRankLocalIndex = (peerIdx < rank ? peerIdx : peerIdx + 1); + const size_t offsetWithinRank = (gWid / nPeer) * unitBytesPerWarp; + const size_t offset = bytesPerGPU * remoteRankLocalIndex + offsetWithinRank; + const size_t remainBytes = (offsetWithinRank + unitBytesPerWarp > bytesPerGPU) + ? ((bytesPerGPU > offsetWithinRank) ? 
(bytesPerGPU - offsetWithinRank) : 0) + : unitBytesPerWarp; + if (remainBytes > 0) { + // smChans[peerIdx].getPackets(scratchOffset + offset * 2, offset, remainBytes, lid, WARP_SIZE, flag); + mscclpp::getPackets(smChans[peerIdx].getPacketBuffer_, scratchOffset + offset * 2, smChans[peerIdx].src_, offset, + remainBytes, lid, WARP_SIZE, flag); + } + } + } + + if (threadIdx.x == 0 && blockIdx.x == 0) { + globalFlag += 1; + } +} + class AllGatherProxyService : public mscclpp::BaseProxyService { public: AllGatherProxyService(int worldSize, int rank, int cudaDevice); @@ -452,6 +733,15 @@ void AllGatherTestColl::runColl(const TestArgs& args, cudaStream_t stream) { } else if (kernelNum == 5) { nBlocks = 24; nThreads = 1024; + } else if (kernelNum == 6) { + nBlocks = 24; + nThreads = 1024; + } else if (kernelNum == 7) { + nBlocks = 4; + nThreads = 896; + } else if (kernelNum == 8) { + nBlocks = 2; + nThreads = 896; } else { nBlocks = 1; nThreads = WARP_SIZE * (worldSize - 1); @@ -468,6 +758,12 @@ void AllGatherTestColl::runColl(const TestArgs& args, cudaStream_t stream) { allgather4<<>>(rank, worldSize, nRanksPerNode, paramCount_); } else if (kernelNum == 5) { allgather5<<>>(rank, worldSize, nRanksPerNode, paramCount_); + } else if (kernelNum == 6) { + allgather6<<>>(rank, worldSize, nRanksPerNode, paramCount_); + } else if (kernelNum == 7) { + allgather7<<>>(rank, worldSize, nRanksPerNode, paramCount_); + } else if (kernelNum == 8) { + allgather8<<>>(rank, worldSize, nRanksPerNode, paramCount_); } } @@ -521,7 +817,10 @@ std::vector AllGatherTestColl::getKernelRestrictions() { {2, "allgather2", true, 3, 4 * worldSize_}, {3, "allgather3", true, 1, 4 * worldSize_}, {4, "allgather4", true, 3, 16 * worldSize_ /*use ulong2 to transfer data*/}, - {5, "allgather5", false, 1, 16 * worldSize_ /*use ulong2 to transfer data*/}}; + {5, "allgather5", false, 1, 16 * worldSize_ /*use ulong2 to transfer data*/}, + {6, "allgather6", false, 1, 16 * worldSize_ /*use ulong2 to transfer data*/}, + {7, "allgather7", false, 1, 16 * worldSize_ /*use ulong2 to transfer data*/}, + {8, "allgather8", false, 1, 16 * worldSize_ /*use ulong2 to transfer data*/}}; } class AllGatherTestEngine : public BaseTestEngine { @@ -542,7 +841,9 @@ class AllGatherTestEngine : public BaseTestEngine { std::shared_ptr sendBuff_; std::shared_ptr expectedBuff_; + std::shared_ptr scratchPacketBuff_; std::vector smChannels_; + std::vector smOutOfPlaceChannels_; }; AllGatherTestEngine::AllGatherTestEngine(const TestArgs& args) : BaseTestEngine(args, "allgather") {} @@ -550,6 +851,12 @@ AllGatherTestEngine::AllGatherTestEngine(const TestArgs& args) : BaseTestEngine( void AllGatherTestEngine::allocateBuffer() { sendBuff_ = mscclpp::allocExtSharedCuda(args_.maxBytes / sizeof(int)); expectedBuff_ = std::shared_ptr(new int[args_.maxBytes / sizeof(int)]); + if (args_.kernelNum == 7 || args_.kernelNum == 8) { + const size_t nPacket = (args_.maxBytes + sizeof(uint64_t) - 1) / sizeof(uint64_t); + // 2x for double-buffering, scratchBuff used to store original data and reduced results + const size_t scratchBuffNelem = nPacket * 2 /*original data & reduced result */ * 2 /* double buffering*/; + scratchPacketBuff_ = mscclpp::allocExtSharedCuda(scratchBuffNelem); + } } void AllGatherTestEngine::setupConnections() { @@ -571,6 +878,21 @@ void AllGatherTestEngine::setupConnections() { [](const mscclpp::SmChannel& smChannel) { return mscclpp::deviceHandle(smChannel); }); CUDATHROW(cudaMemcpyToSymbol(constSmChans, smChannelHandles.data(), sizeof(DeviceHandle) * 
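// Capacity note: constSmChans was grown to 512 handles earlier in this series;
// with nChannelPerConnection = 64 this copy moves nPeer * 64 handles (448 on
// an 8-GPU node, an assumed configuration), so the size check above guards the
// __constant__ array bound.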
smChannelHandles.size())); + + if (args_.kernelNum == 7 || args_.kernelNum == 8) { + const size_t nPacket = (args_.maxBytes + sizeof(uint64_t) - 1) / sizeof(uint64_t); + const size_t scratchPacketBuffBytes = nPacket * 2 * 2 * sizeof(mscclpp::LLPacket); + setupMeshConnections(smOutOfPlaceChannels_, sendBuff_.get(), args_.maxBytes, scratchPacketBuff_.get(), + scratchPacketBuffBytes); + std::vector> smOutOfPlaceChannelHandles(smOutOfPlaceChannels_.size()); + if (smOutOfPlaceChannels_.size() > sizeof(constSmOutOfPlaceChans) / sizeof(DeviceHandle)) { + std::runtime_error("unexpected error"); + } + std::transform(smOutOfPlaceChannels_.begin(), smOutOfPlaceChannels_.end(), smOutOfPlaceChannelHandles.begin(), + [](const mscclpp::SmChannel& smChannel) { return mscclpp::deviceHandle(smChannel); }); + CUDATHROW(cudaMemcpyToSymbol(constSmOutOfPlaceChans, smOutOfPlaceChannelHandles.data(), + sizeof(DeviceHandle) * smOutOfPlaceChannelHandles.size())); + } } else { auto service = std::dynamic_pointer_cast(chanService_); setupMeshConnections(devProxyChannels, sendBuff_.get(), args_.maxBytes, nullptr, 0, diff --git a/test/mscclpp-test/allreduce_test.cu b/test/mscclpp-test/allreduce_test.cu index 2748681b4..32a43e0c3 100644 --- a/test/mscclpp-test/allreduce_test.cu +++ b/test/mscclpp-test/allreduce_test.cu @@ -970,9 +970,8 @@ __global__ void allreduce5(int* buff, int rank, int nRanksPerNode, int worldSize __global__ void allreduce6(int* buff, int* scratch, void* resultBuff, int rank, int nRanksPerNode, int worldSize, size_t nelems) { // This version of allreduce only works for single nodes - if (worldSize != nRanksPerNode) return; const int nPeers = nRanksPerNode - 1; - const int nPkts = nelems / 2; + const size_t nPkts = nelems / 2; const int nelemsPerRank = nelems / worldSize; const int nPktsPerRank = nelemsPerRank / 2; // flag for packets. Initially 1 @@ -982,7 +981,6 @@ __global__ void allreduce6(int* buff, int* scratch, void* resultBuff, int rank, const int localBlockIdx = blockIdx.x % nBlocksPerPeer; const int peerIdx = blockIdx.x / nBlocksPerPeer; const int remoteRank = peerIdx < rank ? peerIdx : peerIdx + 1; - DeviceHandle smChan = constSmOutOfPlaceChans[peerIdx]; const int tid = threadIdx.x + localBlockIdx * blockDim.x; // double buffering size_t scratchBaseOffset = (flag & 1) ? 
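// Double buffering, as the offset chosen here implies: odd flag values use the
  // first half of the scratch area (base offset 0), even values the second half
  // (nPkts packets in), so iteration k can write fresh packets while the
  // packets of iteration k-1 are still being drained.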
0 : nPkts * sizeof(mscclpp::LLPacket); @@ -995,7 +993,7 @@ __global__ void allreduce6(int* buff, int* scratch, void* resultBuff, int rank, uint2* dst = (uint2*)((char*)resultBuff + rank * nelemsPerRank * sizeof(int)); // step 1: write to scratch buffer - smChan.putPackets(scratchOffset, srcOffset, nelemsPerRank * sizeof(int), tid, blockDim.x * nBlocksPerPeer, flag); + constSmOutOfPlaceChans[peerIdx].putPackets(scratchOffset, srcOffset, nelemsPerRank * sizeof(int), tid, blockDim.x * nBlocksPerPeer, flag); // step 2: get data from scratch buffer, reduce data and write result to remote scratch buffer for (int idx = threadIdx.x + blockIdx.x * blockDim.x; idx < nPktsPerRank; idx += blockDim.x * gridDim.x) { uint2 data = make_uint2(0, 0); @@ -1008,11 +1006,16 @@ __global__ void allreduce6(int* buff, int* scratch, void* resultBuff, int rank, } data.x += src[idx].x; data.y += src[idx].y; - dst[idx].x = data.x; - dst[idx].y = data.y; + dst[idx] = data; + + mscclpp::LLPacket packet; + packet.data1 = data.x; + packet.flag1 = flag; + packet.data2 = data.y; + packet.flag2 = flag; + size_t offset = scratchResultOffset / sizeof(mscclpp::LLPacket) + (idx + rank * nPktsPerRank); for (int index = 0; index < nPeers; index++) { - mscclpp::LLPacket* dstPkt = (mscclpp::LLPacket*)((char*)constSmOutOfPlaceChans[index].dst_ + scratchResultOffset); - dstPkt[idx + rank * nPktsPerRank].write(data.x, data.y, flag); + constSmOutOfPlaceChans[index].write(offset, packet); } } // step 3: get data result from scratch buffer @@ -1180,7 +1183,7 @@ class AllReduceTestEngine : public BaseTestEngine { std::shared_ptr expectedBuff_; std::vector smOutOfPlaceChannels_; std::vector smInPlaceChannels_; - std::vector smOutputPlaceGetChannels_; + std::vector smOutOfPlaceGetChannels_; }; AllReduceTestEngine::AllReduceTestEngine(const TestArgs& args) : BaseTestEngine(args, "allreduce") { @@ -1301,14 +1304,14 @@ void AllReduceTestEngine::setupConnections() { CUDATHROW(cudaMemcpyToSymbol(constSmInPlaceChans, smChannelDeviceHandles.data(), sizeof(DeviceHandle) * smChannelDeviceHandles.size())); - setupMeshConnections(smOutputPlaceGetChannels_, inputBuff_.get(), args_.maxBytes, scratchBuff_.get(), + setupMeshConnections(smOutOfPlaceGetChannels_, inputBuff_.get(), args_.maxBytes, scratchBuff_.get(), args_.maxBytes, ChannelSemantic::GET); - if (smOutputPlaceGetChannels_.size() > + if (smOutOfPlaceGetChannels_.size() > sizeof(constSmOutOfPlaceGetChans) / sizeof(DeviceHandle)) { std::runtime_error("unexpected error"); } - smChannelDeviceHandles.resize(smOutputPlaceGetChannels_.size()); - getChannelDeviceHandle(smOutputPlaceGetChannels_, smChannelDeviceHandles); + smChannelDeviceHandles.resize(smOutOfPlaceGetChannels_.size()); + getChannelDeviceHandle(smOutOfPlaceGetChannels_, smChannelDeviceHandles); CUDATHROW(cudaMemcpyToSymbol(constSmOutOfPlaceGetChans, smChannelDeviceHandles.data(), sizeof(DeviceHandle) * smChannelDeviceHandles.size())); } diff --git a/test/mscclpp-test/common.cc b/test/mscclpp-test/common.cc index fe07df511..9c52f9f4a 100644 --- a/test/mscclpp-test/common.cc +++ b/test/mscclpp-test/common.cc @@ -458,7 +458,7 @@ void BaseTestEngine::setupMeshConnections(std::vector& smCha if (connections[cid]->transport() == mscclpp::Transport::CudaIpc) { smChannels.emplace_back(smSemaphores[cid][i], remoteRegMemories[cid].get(), (outputBuff && semantic == ChannelSemantic::GET) ? 
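// The last SmChannel constructor argument is the local packet buffer; this
        // patch presumably routes outputBuff there (below) so that packet-based
        // kernels can call getPackets() against the scratch packet area.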
outputBuff : inputBufRegMem.data(), - nullptr); + outputBuff); } } } From 3d178f3335dd3007f108401cd01453124a847d46 Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Thu, 25 Jan 2024 06:24:41 +0000 Subject: [PATCH 06/89] measure perf --- test/mp_unit/sm_channel_tests.cu | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/test/mp_unit/sm_channel_tests.cu b/test/mp_unit/sm_channel_tests.cu index ea5241053..f448c7168 100644 --- a/test/mp_unit/sm_channel_tests.cu +++ b/test/mp_unit/sm_channel_tests.cu @@ -238,12 +238,11 @@ TEST_F(SmChannelOneToOneTest, GetPingPong) { EXPECT_EQ(*ret, 0); } -__global__ void kernelSmPacketPingPong(int* buff, int rank, int nElem, int* ret) { +__global__ void kernelSmPacketPingPong(int* buff, int rank, int nElem, int* ret, int nTries = 1000) { if (rank > 1) return; DeviceHandle& smChan = gChannelOneToOneTestConstSmChans; volatile int* sendBuff = (volatile int*)buff; - int nTries = 1000; int putOffset = (rank == 0) ? 0 : 10000000; int getOffset = (rank == 0) ? 10000000 : 0; for (int i = 0; i < nTries; i++) { @@ -305,8 +304,6 @@ TEST_F(SmChannelOneToOneTest, PacketPingPong) { // The least nelem is 2 for packet ping pong kernelSmPacketPingPong<<<1, 1024>>>(buff.get(), gEnv->rank, 2, ret.get()); MSCCLPP_CUDATHROW(cudaDeviceSynchronize()); - - EXPECT_EQ(*ret, 0); *ret = 0; kernelSmPacketPingPong<<<1, 1024>>>(buff.get(), gEnv->rank, 1024, ret.get()); @@ -325,4 +322,17 @@ TEST_F(SmChannelOneToOneTest, PacketPingPong) { MSCCLPP_CUDATHROW(cudaDeviceSynchronize()); EXPECT_EQ(*ret, 0); + *ret = 0; + + int nTries = 1000000; + communicator->bootstrap()->barrier(); + mscclpp::Timer timer; + kernelSmPacketPingPong<<<1, 1024>>>(buff.get(), gEnv->rank, 1024, ret.get(), nTries); + MSCCLPP_CUDATHROW(cudaDeviceSynchronize()); + communicator->bootstrap()->barrier(); + + if (gEnv->rank == 0) { + std::cout << "smPacketPingPong" + << ": " << std::setprecision(4) << (float)timer.elapsed() / (float)(nTries) << " us/iter\n"; + } } From 43bdd35ee0e7eb24994bd0d8f34732478c9a95a2 Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Tue, 30 Jan 2024 09:17:42 +0000 Subject: [PATCH 07/89] new algo --- include/mscclpp/packet_device.hpp | 101 +++++++++++++++++++++++++- include/mscclpp/sm_channel_device.hpp | 10 +++ test/mscclpp-test/allreduce_test.cu | 86 ++++++++++++++++++++-- 3 files changed, 189 insertions(+), 8 deletions(-) diff --git a/include/mscclpp/packet_device.hpp b/include/mscclpp/packet_device.hpp index bf2139938..14b33c2d3 100644 --- a/include/mscclpp/packet_device.hpp +++ b/include/mscclpp/packet_device.hpp @@ -14,7 +14,6 @@ #endif // defined(MSCCLPP_DEVICE_COMPILE) namespace mscclpp { - /// LL (low latency) protocol packet. union alignas(16) LLPacket { // Assume data is written with an atomicity of 8 bytes (IB/RDMA). @@ -43,6 +42,8 @@ union alignas(16) LLPacket { ulonglong2* p = reinterpret_cast(®); atomicStore(&(raw_.x), p->x, memoryOrderRelaxed); atomicStore(&(raw_.y), p->y, memoryOrderRelaxed); + // __builtin_nontemporal_store(p->x, &(raw_.x)); + // __builtin_nontemporal_store(p->y, &(raw_.y)); #endif } @@ -77,9 +78,22 @@ union alignas(16) LLPacket { /// @param flag The flag to read. /// @param maxSpinCount The maximum number of spin counts before asserting. Never assert if negative. /// @return The 8-byte data read. 
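  ///
  /// A minimal producer/consumer sketch (hypothetical pointers; both sides
  /// must agree on the per-iteration flag value):
  ///
  ///   mscclpp::LLPacket* pkt = /* 16-byte slot in a shared scratch buffer */;
  ///   pkt->write(val1, val2, flag);  // producer: stores two (data, flag) pairs
  ///   uint2 v = pkt->read(flag);     // consumer: polls until both flags match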
- MSCCLPP_DEVICE_INLINE uint2 read(uint32_t flag, int64_t maxSpinCount = 100000000) const { + MSCCLPP_DEVICE_INLINE uint2 read(uint32_t flag, int64_t maxSpinCount = 1000000000) const { uint2 data; POLL_MAYBE_JAILBREAK(readOnce(flag, data), maxSpinCount); + // int64_t spins = 0; + // ulonglong2 reg; + // uint4* ptr; + + // do { + // reg.x = __builtin_nontemporal_load(&(raw_.x)); + // reg.y = __builtin_nontemporal_load(&(raw_.y)); + // ptr = reinterpret_cast(®); + // // if (spins >= maxSpinCount) break; + // // spins++; + // } while ((ptr->y != flag) || (ptr->w != flag)); + // data.x = ptr->x; + // data.y = ptr->z; return data; } @@ -88,6 +102,61 @@ union alignas(16) LLPacket { #endif // defined(MSCCLPP_DEVICE_COMPILE) }; +union alignas(8) LLPacket2 { + // Assume data is written with an atomicity of 8 bytes (IB/RDMA). + struct { + uint32_t data; + uint32_t flag; + }; + uint64_t raw_; +#if defined(MSCCLPP_DEVICE_COMPILE) + + MSCCLPP_DEVICE_INLINE LLPacket2() {} + + MSCCLPP_DEVICE_INLINE void write(uint32_t val, uint32_t flag) { +#if defined(MSCCLPP_DEVICE_CUDA) +#else // !defined(MSCCLPP_DEVICE_CUDA) + uint2 reg = make_uint2(val, flag); + uint64_t* p = reinterpret_cast(®); + // __builtin_nontemporal_store(*p, &(raw_)); + atomicStore(&(raw_), *p, memoryOrderRelaxed); +#endif + } + + MSCCLPP_DEVICE_INLINE bool readOnce(uint32_t flag, uint32_t& data) const { +#if defined(MSCCLPP_DEVICE_CUDA) +#else // !defined(MSCCLPP_DEVICE_CUDA) + uint64_t reg; + reg = atomicLoad(&(raw_), memoryOrderRelaxed); + // reg = __builtin_nontemporal_load(&(raw_)); + uint2* ptr = reinterpret_cast(®); + data = ptr->x; + return (ptr->y != flag); +#endif + } + + MSCCLPP_DEVICE_INLINE uint32_t read(uint32_t flag, int64_t maxSpinCount = 100000000) const { + uint32_t data; + POLL_MAYBE_JAILBREAK(readOnce(flag, data), maxSpinCount); + // int64_t spins = 0; + // uint64_t reg; + // uint2* ptr; + + // do { + // reg = __builtin_nontemporal_load(&(raw_)); + // ptr = reinterpret_cast(®); + // if (spins >= maxSpinCount) break; + // spins++; + // } while ((ptr->y != flag)); + // data = ptr->x; + return data; + } + + /// Clear the packet. + MSCCLPP_DEVICE_INLINE void clear() { raw_ = 0; } +#endif // defined(MSCCLPP_DEVICE_COMPILE) +}; + #if defined(MSCCLPP_DEVICE_COMPILE) /// Read from the origin and write to the target buffer. MSCCLPP_DEVICE_INLINE void putPackets(void* targetPtr, uint64_t targetOffset, const void* originPtr, @@ -116,6 +185,34 @@ MSCCLPP_DEVICE_INLINE void getPackets(const void* targetPtr, uint64_t targetOffs originBase[i] = pkt->read(flag); } } + +/// Read from the origin and write to the target buffer. +MSCCLPP_DEVICE_INLINE void putPackets2(void* targetPtr, uint64_t targetOffset, const void* originPtr, + uint64_t originOffset, uint64_t originBytes, uint32_t threadId, + uint32_t numThreads, uint32_t flag) { + // Offsets should be aligned to 8 bytes & size should be a multiple of 8 bytes + const uint32_t* originBase = (const uint32_t*)((const char*)originPtr + originOffset); + LLPacket2* targetBase = (LLPacket2*)((char*)targetPtr + targetOffset); + size_t nElem = originBytes / sizeof(uint32_t); + for (size_t i = threadId; i < nElem; i += numThreads) { + LLPacket2* pkt = &targetBase[i]; + pkt->write(originBase[i], flag); + } +} + +/// Read from the target buffer and write to the origin. 
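/// Same contract as getPackets(), but over 8-byte LLPacket2 slots: each slot
/// carries 4 bytes of payload plus a 4-byte flag and is polled with a single
/// 8-byte load, instead of the two 8-byte loads an LLPacket requires.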
+MSCCLPP_DEVICE_INLINE void getPackets2(const void* targetPtr, uint64_t targetOffset, void* originPtr, + uint64_t originOffset, uint64_t originBytes, uint32_t threadId, + uint32_t numThreads, uint32_t flag) { + // Offsets should be aligned to 8 bytes & size should be a multiple of 8 bytes + const LLPacket2* targetBase = (const LLPacket2*)((const char*)targetPtr + targetOffset); + uint32_t* originBase = (uint32_t*)((char*)originPtr + originOffset); + size_t nElem = originBytes / sizeof(uint32_t); + for (size_t i = threadId; i < nElem; i += numThreads) { + const LLPacket2* pkt = &targetBase[i]; + originBase[i] = pkt->read(flag); + } +} #endif // defined(MSCCLPP_DEVICE_COMPILE) }; // namespace mscclpp diff --git a/include/mscclpp/sm_channel_device.hpp b/include/mscclpp/sm_channel_device.hpp index 29993f8e8..f2a6c5838 100644 --- a/include/mscclpp/sm_channel_device.hpp +++ b/include/mscclpp/sm_channel_device.hpp @@ -233,6 +233,16 @@ struct SmChannelDeviceHandle { mscclpp::getPackets(getPacketBuffer_, targetOffset, src_, originOffset, originBytes, threadId, numThreads, flag); } + MSCCLPP_DEVICE_INLINE void putPackets2(uint64_t targetOffset, uint64_t originOffset, uint64_t originBytes, + uint32_t threadId, uint32_t numThreads, uint32_t flag) { + mscclpp::putPackets2(dst_, targetOffset, src_, originOffset, originBytes, threadId, numThreads, flag); + } + + MSCCLPP_DEVICE_INLINE void getPackets2(uint64_t targetOffset, uint64_t originOffset, uint64_t originBytes, + uint32_t threadId, uint32_t numThreads, uint32_t flag) { + mscclpp::getPackets2(getPacketBuffer_, targetOffset, src_, originOffset, originBytes, threadId, numThreads, flag); + } + /// Signal the remote semaphore. /// /// This function guarantees that all the memory operation before this function is completed before the remote diff --git a/test/mscclpp-test/allreduce_test.cu b/test/mscclpp-test/allreduce_test.cu index 32a43e0c3..a235e2d94 100644 --- a/test/mscclpp-test/allreduce_test.cu +++ b/test/mscclpp-test/allreduce_test.cu @@ -1032,6 +1032,66 @@ __global__ void allreduce6(int* buff, int* scratch, void* resultBuff, int rank, } } +__global__ void allreduce7(int* buff, int* scratch, void* resultBuff, int rank, int nRanksPerNode, int worldSize, + size_t nelems) { + // This version of allreduce only works for single nodes + const int nPeers = nRanksPerNode - 1; + const size_t nPkts = nelems; + const int nelemsPerRank = nelems / worldSize; + const int nPktsPerRank = nelemsPerRank; + // flag for packets. Initially 1 + const uint32_t flag = (uint32_t)globalFlag; + // thread block & channel info + const int nBlocksPerPeer = gridDim.x / nPeers; + const int localBlockIdx = blockIdx.x % nBlocksPerPeer; + const int peerIdx = blockIdx.x / nBlocksPerPeer; + const int remoteRank = peerIdx < rank ? peerIdx : peerIdx + 1; + const int tid = threadIdx.x + localBlockIdx * blockDim.x; + // double buffering + size_t scratchBaseOffset = (flag & 1) ? 0 : nPkts * sizeof(mscclpp::LLPacket2); + void* scratchBuff = (void*)((char*)scratch + scratchBaseOffset); + size_t scratchOffset = scratchBaseOffset + rank * nPktsPerRank * sizeof(mscclpp::LLPacket2); + size_t scratchResultOffset = + (flag & 1) ? 
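// Scratch layout in LLPacket2 slots, as the offsets above and below imply:
  // [0, nPkts) and [nPkts, 2*nPkts) double-buffer the original data, while
  // [2*nPkts, 3*nPkts) and [3*nPkts, 4*nPkts) double-buffer the reduced
  // results selected here.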
2 * nPkts * sizeof(mscclpp::LLPacket2) : 3 * nPkts * sizeof(mscclpp::LLPacket2); + size_t srcOffset = remoteRank * nelemsPerRank * sizeof(int); + uint32_t* src = (uint32_t*)((char*)buff + rank * nelemsPerRank * sizeof(int)); + uint32_t* dst = (uint32_t*)((char*)resultBuff + rank * nelemsPerRank * sizeof(int)); + + // step 1: write to scratch buffer + constSmOutOfPlaceChans[peerIdx].putPackets2(scratchOffset, srcOffset, nelemsPerRank * sizeof(int), tid, blockDim.x * nBlocksPerPeer, flag); + // step 2: get data from scratch buffer, reduce data and write result to remote scratch buffer + for (int idx = threadIdx.x + blockIdx.x * blockDim.x; idx < nPktsPerRank; idx += blockDim.x * gridDim.x) { + uint32_t data = 0; + for (int index = 0; index < nPeers; index++) { + const int remoteRank = index < rank ? index : index + 1; + mscclpp::LLPacket2* dstPkt = (mscclpp::LLPacket2*)scratchBuff + remoteRank * nPktsPerRank; + uint32_t val = dstPkt[idx].read(flag); + data += val; + } + data += src[idx]; + dst[idx] = data; + + mscclpp::LLPacket2 packet; + packet.data = data; + packet.flag = flag; + size_t offset = scratchResultOffset / sizeof(mscclpp::LLPacket2) + (idx + rank * nPktsPerRank); + for (int index = 0; index < nPeers; index++) { + constSmOutOfPlaceChans[index].write(offset, packet); + } + } + // step 3: get data result from scratch buffer + mscclpp::LLPacket2* dstPkt = (mscclpp::LLPacket2*)((char*)scratch + scratchResultOffset); + const int dstOffset = remoteRank * nPktsPerRank; + uint32_t* result = (uint32_t*)((char*)resultBuff + remoteRank * nelemsPerRank * sizeof(int)); + for (int idx = threadIdx.x + localBlockIdx * blockDim.x; idx < nPktsPerRank; idx += blockDim.x * nBlocksPerPeer) { + uint32_t data = dstPkt[idx + dstOffset].read(flag); + result[idx] = data; + } + if (threadIdx.x == 0 && blockIdx.x == 0) { + globalFlag += 1; + } +} + class AllReduceTestColl : public BaseTestColl { public: AllReduceTestColl() = default; @@ -1075,7 +1135,12 @@ void AllReduceTestColl::runColl(const TestArgs& args, cudaStream_t stream) { nBlocks = 21; tmpBuff = scratchPacketBuff; nThreadsPerBlock = 512; - } else { + } else if (kernelNum == 7) { + nBlocks = 28; + tmpBuff = scratchPacketBuff; + nThreadsPerBlock = 1024; + + }else { nBlocks = std::max(args.nRanksPerNode - 1, 1) * BLOCKS_PER_PEER; tmpBuff = scratchPacketBuff; nThreadsPerBlock = 1024; @@ -1101,6 +1166,10 @@ void AllReduceTestColl::runColl(const TestArgs& args, cudaStream_t stream) { allreduce6<<>>((int*)inputBuff, (int*)tmpBuff, resultBuff, rank, args.nRanksPerNode, worldSize, paramCount_); } + else if (kernelNum == 7) { + allreduce7<<>>((int*)inputBuff, (int*)tmpBuff, resultBuff, rank, + args.nRanksPerNode, worldSize, paramCount_); + } } void AllReduceTestColl::initData(const TestArgs& args, std::vector sendBuff, void* expectedBuff) { @@ -1153,7 +1222,8 @@ std::vector AllReduceTestColl::getKernelRestrictions() { 16 * worldSize_ /*use ulong2 to transfer data*/, }, {5, "allreduce5", false, 1, 4 * worldSize_}, - {6, "allreduce6", false, 1, 4 * worldSize_}}; + {6, "allreduce6", false, 1, 4 * worldSize_}, + {7, "allreduce7", false, 1, 4 * worldSize_}}; } class AllReduceTestEngine : public BaseTestEngine { @@ -1190,9 +1260,13 @@ AllReduceTestEngine::AllReduceTestEngine(const TestArgs& args) : BaseTestEngine( inPlace_ = isInPlace(); } -bool AllReduceTestEngine::isUsePacket() const { return (args_.kernelNum == 2 || args_.kernelNum == 6); } +bool AllReduceTestEngine::isUsePacket() const { + return (args_.kernelNum == 2 || args_.kernelNum == 6 || 
args_.kernelNum == 7); +} -bool AllReduceTestEngine::isInPlace() const { return (args_.kernelNum != 2 && args_.kernelNum != 6); } +bool AllReduceTestEngine::isInPlace() const { + return (args_.kernelNum != 2 && args_.kernelNum != 6 && args_.kernelNum != 7); +} void AllReduceTestEngine::allocateBuffer() { inputBuff_ = mscclpp::allocExtSharedCuda(args_.maxBytes / sizeof(int)); @@ -1214,7 +1288,7 @@ void AllReduceTestEngine::allocateBuffer() { getPacketBuff_ = mscclpp::allocExtSharedCuda(packetBuffNelem); putPacketBuff = putPacketBuff_.get(); getPacketBuff = getPacketBuff_.get(); - } else if (args_.kernelNum == 6) { + } else if (args_.kernelNum == 6 || args_.kernelNum == 7) { const size_t nPacket = (args_.maxBytes + sizeof(uint64_t) - 1) / sizeof(uint64_t); // 2x for double-buffering, scratchBuff used to store original data and reduced results const size_t scratchBuffNelem = nPacket * 2 /*original data & reduced result */ * 2 /* double buffering*/; @@ -1235,7 +1309,7 @@ void AllReduceTestEngine::setupConnections() { std::vector> proxyChannels; const size_t nPacket = (args_.maxBytes + sizeof(uint64_t) - 1) / sizeof(uint64_t); - if (args_.kernelNum == 6) { + if (args_.kernelNum == 6 || args_.kernelNum == 7) { const size_t scratchPacketBuffBytes = nPacket * 2 * 2 * sizeof(mscclpp::LLPacket); setupMeshConnections(smOutOfPlaceChannels_, inputBuff_.get(), args_.maxBytes, scratchPacketBuff_.get(), scratchPacketBuffBytes); From 28c5d21024be9f9eba39d971e0e4964e393a3a21 Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Thu, 1 Feb 2024 22:14:50 +0000 Subject: [PATCH 08/89] Add abortFlag --- CMakeLists.txt | 4 ++-- include/mscclpp/packet_device.hpp | 30 +++++++++++++++-------------- include/mscclpp/poll_device.hpp | 10 +++++++++- test/CMakeLists.txt | 2 +- test/mscclpp-test/CMakeLists.txt | 2 +- test/mscclpp-test/allgather_test.cu | 29 ++++++++++++++-------------- test/mscclpp-test/allreduce_test.cu | 15 ++++++++------- 7 files changed, 51 insertions(+), 41 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 66ed4b94b..302febab7 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -83,11 +83,11 @@ if(USE_CUDA) else() set(CMAKE_HIP_STANDARD 17) set(CMAKE_HIP_FLAGS "${CMAKE_HIP_FLAGS} -Wall -Wextra") - project(mscclpp LANGUAGES CXX HIP) + project(mscclpp LANGUAGES CXX) set(CMAKE_HIP_ARCHITECTURES gfx90a gfx941 gfx942) - set(GPU_LIBRARIES hip::host) + set(GPU_LIBRARIES hip::device) set(GPU_INCLUDE_DIRS ${hip_INCLUDE_DIRS}) endif() diff --git a/include/mscclpp/packet_device.hpp b/include/mscclpp/packet_device.hpp index 14b33c2d3..d53be572e 100644 --- a/include/mscclpp/packet_device.hpp +++ b/include/mscclpp/packet_device.hpp @@ -40,10 +40,10 @@ union alignas(16) LLPacket { #else // !defined(MSCCLPP_DEVICE_CUDA) uint4 reg = make_uint4(val1, flag, val2, flag); ulonglong2* p = reinterpret_cast(®); - atomicStore(&(raw_.x), p->x, memoryOrderRelaxed); - atomicStore(&(raw_.y), p->y, memoryOrderRelaxed); - // __builtin_nontemporal_store(p->x, &(raw_.x)); - // __builtin_nontemporal_store(p->y, &(raw_.y)); + // atomicStore(&(raw_.x), p->x, memoryOrderRelaxed); + // atomicStore(&(raw_.y), p->y, memoryOrderRelaxed); + __builtin_nontemporal_store(p->x, &(raw_.x)); + __builtin_nontemporal_store(p->y, &(raw_.y)); #endif } @@ -65,8 +65,10 @@ union alignas(16) LLPacket { return (flag1 != flag) || (flag2 != flag); #else // !defined(MSCCLPP_DEVICE_CUDA) ulonglong2 reg; - reg.x = atomicLoad(&(raw_.x), memoryOrderRelaxed); - reg.y = atomicLoad(&(raw_.y), memoryOrderRelaxed); + // reg.x = atomicLoad(&(raw_.x), 
memoryOrderRelaxed); + // reg.y = atomicLoad(&(raw_.y), memoryOrderRelaxed); + reg.x = __builtin_nontemporal_load(&(raw_.x)); + reg.y = __builtin_nontemporal_load(&(raw_.y)); uint4* ptr = reinterpret_cast(®); data.x = ptr->x; data.y = ptr->z; @@ -118,8 +120,8 @@ union alignas(8) LLPacket2 { #else // !defined(MSCCLPP_DEVICE_CUDA) uint2 reg = make_uint2(val, flag); uint64_t* p = reinterpret_cast(®); - // __builtin_nontemporal_store(*p, &(raw_)); - atomicStore(&(raw_), *p, memoryOrderRelaxed); + __builtin_nontemporal_store(*p, &(raw_)); + // atomicStore(&(raw_), *p, memoryOrderRelaxed); #endif } @@ -127,8 +129,8 @@ union alignas(8) LLPacket2 { #if defined(MSCCLPP_DEVICE_CUDA) #else // !defined(MSCCLPP_DEVICE_CUDA) uint64_t reg; - reg = atomicLoad(&(raw_), memoryOrderRelaxed); - // reg = __builtin_nontemporal_load(&(raw_)); + // reg = atomicLoad(&(raw_), memoryOrderRelaxed); + reg = __builtin_nontemporal_load(&(raw_)); uint2* ptr = reinterpret_cast(®); data = ptr->x; return (ptr->y != flag); @@ -188,8 +190,8 @@ MSCCLPP_DEVICE_INLINE void getPackets(const void* targetPtr, uint64_t targetOffs /// Read from the origin and write to the target buffer. MSCCLPP_DEVICE_INLINE void putPackets2(void* targetPtr, uint64_t targetOffset, const void* originPtr, - uint64_t originOffset, uint64_t originBytes, uint32_t threadId, - uint32_t numThreads, uint32_t flag) { + uint64_t originOffset, uint64_t originBytes, uint32_t threadId, + uint32_t numThreads, uint32_t flag) { // Offsets should be aligned to 8 bytes & size should be a multiple of 8 bytes const uint32_t* originBase = (const uint32_t*)((const char*)originPtr + originOffset); LLPacket2* targetBase = (LLPacket2*)((char*)targetPtr + targetOffset); @@ -202,8 +204,8 @@ MSCCLPP_DEVICE_INLINE void putPackets2(void* targetPtr, uint64_t targetOffset, c /// Read from the target buffer and write to the origin. 
MSCCLPP_DEVICE_INLINE void getPackets2(const void* targetPtr, uint64_t targetOffset, void* originPtr, - uint64_t originOffset, uint64_t originBytes, uint32_t threadId, - uint32_t numThreads, uint32_t flag) { + uint64_t originOffset, uint64_t originBytes, uint32_t threadId, + uint32_t numThreads, uint32_t flag) { // Offsets should be aligned to 8 bytes & size should be a multiple of 8 bytes const LLPacket2* targetBase = (const LLPacket2*)((const char*)targetPtr + targetOffset); uint32_t* originBase = (uint32_t*)((char*)originPtr + originOffset); diff --git a/include/mscclpp/poll_device.hpp b/include/mscclpp/poll_device.hpp index 0cdb6b019..324423ef0 100644 --- a/include/mscclpp/poll_device.hpp +++ b/include/mscclpp/poll_device.hpp @@ -4,12 +4,14 @@ #ifndef MSCCLPP_POLL_DEVICE_HPP_ #define MSCCLPP_POLL_DEVICE_HPP_ -#include "device.hpp" +#include "atomic_device.hpp" #if defined(MSCCLPP_DEVICE_COMPILE) #include +__device__ uint64_t abortFlag; + #if defined(MSCCLPP_DEVICE_HIP) extern "C" __device__ void __assert_fail(const char *__assertion, const char *__file, unsigned int __line, const char *__function); @@ -26,6 +28,9 @@ extern "C" __device__ void __assert_fail(const char *__assertion, const char *__ if (__max_spin_cnt >= 0 && __spin_cnt++ == __max_spin_cnt) { \ __assert_fail(#__cond, __FILE__, __LINE__, __PRETTY_FUNCTION__); \ } \ + if ((atomicLoad(&abortFlag, memoryOrderAcquire) != 0)) { \ + break; \ + } \ } \ } while (0); @@ -43,6 +48,9 @@ extern "C" __device__ void __assert_fail(const char *__assertion, const char *__ if (__max_spin_cnt >= 0 && __spin_cnt++ == __max_spin_cnt) { \ __assert_fail(#__cond1 #__cond2, __FILE__, __LINE__, __PRETTY_FUNCTION__); \ } \ + if ((atomicLoad(&abortFlag, memoryOrderAcquire) != 0)) { \ + break; \ + } \ } \ } while (0); diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 087fdeb86..d3c33c901 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -10,7 +10,7 @@ set(TEST_INC_INTERNAL PRIVATE ${PROJECT_SOURCE_DIR}/src/include) if(USE_ROCM) file(GLOB_RECURSE CU_SOURCES CONFIGURE_DEPENDS *.cu) - set_source_files_properties(${CU_SOURCES} PROPERTIES LANGUAGE HIP) + set_source_files_properties(${CU_SOURCES} PROPERTIES LANGUAGE CXX) endif() function(add_test_executable name sources) diff --git a/test/mscclpp-test/CMakeLists.txt b/test/mscclpp-test/CMakeLists.txt index cbbdfea65..e2ec8c2ea 100644 --- a/test/mscclpp-test/CMakeLists.txt +++ b/test/mscclpp-test/CMakeLists.txt @@ -6,7 +6,7 @@ FetchContent_MakeAvailable(json) function(add_mscclpp_test_executable name sources) if(USE_ROCM) - set_source_files_properties(${sources} PROPERTIES LANGUAGE HIP) + set_source_files_properties(${sources} PROPERTIES LANGUAGE CXX) endif() add_executable(${name} ${sources} common.cc) target_link_libraries(${name} ${TEST_LIBS_COMMON} MPI::MPI_CXX nlohmann_json::nlohmann_json) diff --git a/test/mscclpp-test/allgather_test.cu b/test/mscclpp-test/allgather_test.cu index 0027cda74..e7b3e407e 100644 --- a/test/mscclpp-test/allgather_test.cu +++ b/test/mscclpp-test/allgather_test.cu @@ -290,7 +290,8 @@ __global__ void allgather4(int rank, int worldSize, int nRanksPerNode, size_t ne nBlocksForLocalAllGather); } -__global__ void __launch_bounds__(1024, 1) allgather5(size_t rank, [[maybe_unused]] size_t worldSize, size_t nRanksPerNode, size_t nelemsPerGPU) { +__global__ void __launch_bounds__(1024, 1) + allgather5(size_t rank, [[maybe_unused]] size_t worldSize, size_t nRanksPerNode, size_t nelemsPerGPU) { const size_t nBlock = gridDim.x; if (blockIdx.x >= nBlock) return; @@ 
-352,7 +353,8 @@ __global__ void __launch_bounds__(1024, 1) allgather5(size_t rank, [[maybe_unuse } } -__global__ void __launch_bounds__(1024, 1) allgather6(size_t rank, [[maybe_unused]] size_t worldSize, size_t nRanksPerNode, size_t nelemsPerGPU) { +__global__ void __launch_bounds__(1024, 1) + allgather6(size_t rank, [[maybe_unused]] size_t worldSize, size_t nRanksPerNode, size_t nelemsPerGPU) { const size_t nBlock = gridDim.x; if (blockIdx.x >= nBlock) return; @@ -414,7 +416,8 @@ __global__ void __launch_bounds__(1024, 1) allgather6(size_t rank, [[maybe_unuse } } -__global__ void __launch_bounds__(1024, 1) allgather7(size_t rank, [[maybe_unused]] size_t worldSize, size_t nRanksPerNode, size_t nelemsPerGPU) { +__global__ void __launch_bounds__(1024, 1) + allgather7(size_t rank, [[maybe_unused]] size_t worldSize, size_t nRanksPerNode, size_t nelemsPerGPU) { const size_t nBlock = gridDim.x; if (blockIdx.x >= nBlock) return; @@ -457,13 +460,11 @@ __global__ void __launch_bounds__(1024, 1) allgather7(size_t rank, [[maybe_unuse smChans[peerIdx].putPackets(scratchOffset + offset * 2, offset, unitBytesPerWarp, lid, WARP_SIZE, flag); } - if (nLoop > 0) { // First loop unrolling const size_t peerIdx = wid % nPeer; const size_t remoteRankLocalIndex = (peerIdx < rank ? peerIdx : peerIdx + 1); const size_t offset = bytesPerGPU * remoteRankLocalIndex + (wid / nPeer) * unitBytesPerWarp; - // if (lid == 0) printf("get %p, rank %zu, wid %zu, remoteRankLocalIndex %zu, offset %zu\n", smChans[peerIdx].getPacketBuffer_, rank, wid, remoteRankLocalIndex, offset); smChans[peerIdx].getPackets(scratchOffset + offset * 2, offset, unitBytesPerWarp, lid, WARP_SIZE, flag); } @@ -515,7 +516,8 @@ __global__ void __launch_bounds__(1024, 1) allgather7(size_t rank, [[maybe_unuse } } -__global__ void __launch_bounds__(1024, 1) allgather8(size_t rank, [[maybe_unused]] size_t worldSize, size_t nRanksPerNode, size_t nelemsPerGPU) { +__global__ void __launch_bounds__(1024, 1) + allgather8(size_t rank, [[maybe_unused]] size_t worldSize, size_t nRanksPerNode, size_t nelemsPerGPU) { const size_t nBlock = gridDim.x / 2; const bool isPut = blockIdx.x < nBlock; @@ -561,7 +563,6 @@ __global__ void __launch_bounds__(1024, 1) allgather8(size_t rank, [[maybe_unuse unitBytesPerWarp, lid, WARP_SIZE, flag); } - for (size_t i = 1; i < nLoop; ++i) { const size_t gWid = wid + i * nWarp; const size_t peerIdx = gWid % nPeer; @@ -579,8 +580,8 @@ __global__ void __launch_bounds__(1024, 1) allgather8(size_t rank, [[maybe_unuse const size_t offsetWithinRank = (gWid / nPeer) * unitBytesPerWarp; const size_t offset = bytesPerGPU * rank + offsetWithinRank; const size_t remainBytes = (offsetWithinRank + unitBytesPerWarp > bytesPerGPU) - ? ((bytesPerGPU > offsetWithinRank) ? (bytesPerGPU - offsetWithinRank) : 0) - : unitBytesPerWarp; + ? ((bytesPerGPU > offsetWithinRank) ? (bytesPerGPU - offsetWithinRank) : 0) + : unitBytesPerWarp; if (remainBytes > 0) { // smChans[peerIdx].putPackets(scratchOffset + offset * 2, offset, remainBytes, lid, WARP_SIZE, flag); mscclpp::putPackets(smChans[peerIdx].dst_, scratchOffset + offset * 2, smChans[peerIdx].src_, offset, @@ -593,8 +594,6 @@ __global__ void __launch_bounds__(1024, 1) allgather8(size_t rank, [[maybe_unuse const size_t peerIdx = wid % nPeer; const size_t remoteRankLocalIndex = (peerIdx < rank ? 
peerIdx : peerIdx + 1); const size_t offset = bytesPerGPU * remoteRankLocalIndex + (wid / nPeer) * unitBytesPerWarp; - // if (lid == 0) printf("get %p, rank %zu, wid %zu, remoteRankLocalIndex %zu, offset %zu\n", smChans[peerIdx].getPacketBuffer_, rank, wid, remoteRankLocalIndex, offset); - // smChans[peerIdx].getPackets(scratchOffset + offset * 2, offset, unitBytesPerWarp, lid, WARP_SIZE, flag); mscclpp::getPackets(smChans[peerIdx].getPacketBuffer_, scratchOffset + offset * 2, smChans[peerIdx].src_, offset, unitBytesPerWarp, lid, WARP_SIZE, flag); } @@ -616,12 +615,12 @@ __global__ void __launch_bounds__(1024, 1) allgather8(size_t rank, [[maybe_unuse const size_t offsetWithinRank = (gWid / nPeer) * unitBytesPerWarp; const size_t offset = bytesPerGPU * remoteRankLocalIndex + offsetWithinRank; const size_t remainBytes = (offsetWithinRank + unitBytesPerWarp > bytesPerGPU) - ? ((bytesPerGPU > offsetWithinRank) ? (bytesPerGPU - offsetWithinRank) : 0) - : unitBytesPerWarp; + ? ((bytesPerGPU > offsetWithinRank) ? (bytesPerGPU - offsetWithinRank) : 0) + : unitBytesPerWarp; if (remainBytes > 0) { // smChans[peerIdx].getPackets(scratchOffset + offset * 2, offset, remainBytes, lid, WARP_SIZE, flag); - mscclpp::getPackets(smChans[peerIdx].getPacketBuffer_, scratchOffset + offset * 2, smChans[peerIdx].src_, offset, - remainBytes, lid, WARP_SIZE, flag); + mscclpp::getPackets(smChans[peerIdx].getPacketBuffer_, scratchOffset + offset * 2, smChans[peerIdx].src_, + offset, remainBytes, lid, WARP_SIZE, flag); } } } diff --git a/test/mscclpp-test/allreduce_test.cu b/test/mscclpp-test/allreduce_test.cu index a235e2d94..af6d94499 100644 --- a/test/mscclpp-test/allreduce_test.cu +++ b/test/mscclpp-test/allreduce_test.cu @@ -993,7 +993,8 @@ __global__ void allreduce6(int* buff, int* scratch, void* resultBuff, int rank, uint2* dst = (uint2*)((char*)resultBuff + rank * nelemsPerRank * sizeof(int)); // step 1: write to scratch buffer - constSmOutOfPlaceChans[peerIdx].putPackets(scratchOffset, srcOffset, nelemsPerRank * sizeof(int), tid, blockDim.x * nBlocksPerPeer, flag); + constSmOutOfPlaceChans[peerIdx].putPackets(scratchOffset, srcOffset, nelemsPerRank * sizeof(int), tid, + blockDim.x * nBlocksPerPeer, flag); // step 2: get data from scratch buffer, reduce data and write result to remote scratch buffer for (int idx = threadIdx.x + blockIdx.x * blockDim.x; idx < nPktsPerRank; idx += blockDim.x * gridDim.x) { uint2 data = make_uint2(0, 0); @@ -1058,7 +1059,8 @@ __global__ void allreduce7(int* buff, int* scratch, void* resultBuff, int rank, uint32_t* dst = (uint32_t*)((char*)resultBuff + rank * nelemsPerRank * sizeof(int)); // step 1: write to scratch buffer - constSmOutOfPlaceChans[peerIdx].putPackets2(scratchOffset, srcOffset, nelemsPerRank * sizeof(int), tid, blockDim.x * nBlocksPerPeer, flag); + constSmOutOfPlaceChans[peerIdx].putPackets2(scratchOffset, srcOffset, nelemsPerRank * sizeof(int), tid, + blockDim.x * nBlocksPerPeer, flag); // step 2: get data from scratch buffer, reduce data and write result to remote scratch buffer for (int idx = threadIdx.x + blockIdx.x * blockDim.x; idx < nPktsPerRank; idx += blockDim.x * gridDim.x) { uint32_t data = 0; @@ -1140,7 +1142,7 @@ void AllReduceTestColl::runColl(const TestArgs& args, cudaStream_t stream) { tmpBuff = scratchPacketBuff; nThreadsPerBlock = 1024; - }else { + } else { nBlocks = std::max(args.nRanksPerNode - 1, 1) * BLOCKS_PER_PEER; tmpBuff = scratchPacketBuff; nThreadsPerBlock = 1024; @@ -1165,8 +1167,7 @@ void AllReduceTestColl::runColl(const TestArgs& 
args, cudaStream_t stream) { else if (kernelNum == 6) { allreduce6<<>>((int*)inputBuff, (int*)tmpBuff, resultBuff, rank, args.nRanksPerNode, worldSize, paramCount_); - } - else if (kernelNum == 7) { + } else if (kernelNum == 7) { allreduce7<<>>((int*)inputBuff, (int*)tmpBuff, resultBuff, rank, args.nRanksPerNode, worldSize, paramCount_); } @@ -1378,8 +1379,8 @@ void AllReduceTestEngine::setupConnections() { CUDATHROW(cudaMemcpyToSymbol(constSmInPlaceChans, smChannelDeviceHandles.data(), sizeof(DeviceHandle) * smChannelDeviceHandles.size())); - setupMeshConnections(smOutOfPlaceGetChannels_, inputBuff_.get(), args_.maxBytes, scratchBuff_.get(), - args_.maxBytes, ChannelSemantic::GET); + setupMeshConnections(smOutOfPlaceGetChannels_, inputBuff_.get(), args_.maxBytes, scratchBuff_.get(), args_.maxBytes, + ChannelSemantic::GET); if (smOutOfPlaceGetChannels_.size() > sizeof(constSmOutOfPlaceGetChans) / sizeof(DeviceHandle)) { std::runtime_error("unexpected error"); From e06fbebd8d0866fed9253c3139d087c3c6ab97cf Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Thu, 1 Feb 2024 23:03:59 +0000 Subject: [PATCH 09/89] 16.14us for 512KB allreduce --- include/mscclpp/packet_device.hpp | 16 ++++++++-------- include/mscclpp/poll_device.hpp | 8 -------- 2 files changed, 8 insertions(+), 16 deletions(-) diff --git a/include/mscclpp/packet_device.hpp b/include/mscclpp/packet_device.hpp index d53be572e..55e284cb4 100644 --- a/include/mscclpp/packet_device.hpp +++ b/include/mscclpp/packet_device.hpp @@ -40,10 +40,10 @@ union alignas(16) LLPacket { #else // !defined(MSCCLPP_DEVICE_CUDA) uint4 reg = make_uint4(val1, flag, val2, flag); ulonglong2* p = reinterpret_cast(®); - // atomicStore(&(raw_.x), p->x, memoryOrderRelaxed); - // atomicStore(&(raw_.y), p->y, memoryOrderRelaxed); - __builtin_nontemporal_store(p->x, &(raw_.x)); - __builtin_nontemporal_store(p->y, &(raw_.y)); + atomicStore(&(raw_.x), p->x, memoryOrderRelaxed); + atomicStore(&(raw_.y), p->y, memoryOrderRelaxed); + // __builtin_nontemporal_store(p->x, &(raw_.x)); + // __builtin_nontemporal_store(p->y, &(raw_.y)); #endif } @@ -65,10 +65,10 @@ union alignas(16) LLPacket { return (flag1 != flag) || (flag2 != flag); #else // !defined(MSCCLPP_DEVICE_CUDA) ulonglong2 reg; - // reg.x = atomicLoad(&(raw_.x), memoryOrderRelaxed); - // reg.y = atomicLoad(&(raw_.y), memoryOrderRelaxed); - reg.x = __builtin_nontemporal_load(&(raw_.x)); - reg.y = __builtin_nontemporal_load(&(raw_.y)); + reg.x = atomicLoad(&(raw_.x), memoryOrderRelaxed); + reg.y = atomicLoad(&(raw_.y), memoryOrderRelaxed); + // reg.x = __builtin_nontemporal_load(&(raw_.x)); + // reg.y = __builtin_nontemporal_load(&(raw_.y)); uint4* ptr = reinterpret_cast(®); data.x = ptr->x; data.y = ptr->z; diff --git a/include/mscclpp/poll_device.hpp b/include/mscclpp/poll_device.hpp index 324423ef0..f8c5d3a9a 100644 --- a/include/mscclpp/poll_device.hpp +++ b/include/mscclpp/poll_device.hpp @@ -26,10 +26,6 @@ extern "C" __device__ void __assert_fail(const char *__assertion, const char *__ int64_t __spin_cnt = 0; \ while (__cond) { \ if (__max_spin_cnt >= 0 && __spin_cnt++ == __max_spin_cnt) { \ - __assert_fail(#__cond, __FILE__, __LINE__, __PRETTY_FUNCTION__); \ - } \ - if ((atomicLoad(&abortFlag, memoryOrderAcquire) != 0)) { \ - break; \ } \ } \ } while (0); @@ -46,10 +42,6 @@ extern "C" __device__ void __assert_fail(const char *__assertion, const char *__ break; \ } \ if (__max_spin_cnt >= 0 && __spin_cnt++ == __max_spin_cnt) { \ - __assert_fail(#__cond1 #__cond2, __FILE__, __LINE__, __PRETTY_FUNCTION__); \ 
- } \ - if ((atomicLoad(&abortFlag, memoryOrderAcquire) != 0)) { \ - break; \ } \ } \ } while (0); From 1bef17e592da69e26d5e74c8e317d22a6ea77ad2 Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Fri, 2 Feb 2024 04:33:40 +0000 Subject: [PATCH 10/89] 16.72us for 512KB allreduce7 & pass pingpong test --- include/mscclpp/packet_device.hpp | 32 ++++++++++++++++--------------- test/mp_unit/sm_channel_tests.cu | 26 ++++++++++++------------- 2 files changed, 30 insertions(+), 28 deletions(-) diff --git a/include/mscclpp/packet_device.hpp b/include/mscclpp/packet_device.hpp index 55e284cb4..abb115b45 100644 --- a/include/mscclpp/packet_device.hpp +++ b/include/mscclpp/packet_device.hpp @@ -120,8 +120,8 @@ union alignas(8) LLPacket2 { #else // !defined(MSCCLPP_DEVICE_CUDA) uint2 reg = make_uint2(val, flag); uint64_t* p = reinterpret_cast(®); - __builtin_nontemporal_store(*p, &(raw_)); - // atomicStore(&(raw_), *p, memoryOrderRelaxed); + // __builtin_nontemporal_store(*p, &(raw_)); + atomicStore(&(raw_), *p, memoryOrderRelaxed); #endif } @@ -137,20 +137,22 @@ union alignas(8) LLPacket2 { #endif } - MSCCLPP_DEVICE_INLINE uint32_t read(uint32_t flag, int64_t maxSpinCount = 100000000) const { + MSCCLPP_DEVICE_INLINE uint32_t read(uint32_t flag, int64_t maxSpinCount = 1000000) const { uint32_t data; - POLL_MAYBE_JAILBREAK(readOnce(flag, data), maxSpinCount); - // int64_t spins = 0; - // uint64_t reg; - // uint2* ptr; - - // do { - // reg = __builtin_nontemporal_load(&(raw_)); - // ptr = reinterpret_cast(®); - // if (spins >= maxSpinCount) break; - // spins++; - // } while ((ptr->y != flag)); - // data = ptr->x; + // POLL_MAYBE_JAILBREAK(readOnce(flag, data), maxSpinCount); + int64_t spins = 0; + uint64_t reg; + uint2* ptr; + + do { + reg = __builtin_nontemporal_load(&(raw_)); + ptr = reinterpret_cast(®); + // if (spins >= maxSpinCount) { + asm volatile("s_waitcnt vmcnt(0)"); + // spins = 0; + // } + } while ((ptr->y != flag)); + data = ptr->x; return data; } diff --git a/test/mp_unit/sm_channel_tests.cu b/test/mp_unit/sm_channel_tests.cu index f448c7168..eccd791a0 100644 --- a/test/mp_unit/sm_channel_tests.cu +++ b/test/mp_unit/sm_channel_tests.cu @@ -252,29 +252,29 @@ __global__ void kernelSmPacketPingPong(int* buff, int rank, int nElem, int* ret, // rank=1: 1, 0, 1, 0, ... if ((rank ^ (i & 1)) == 0) { // If each thread writes 8 bytes at once, we don't need a barrier before putPackets(). - for (int j = threadIdx.x; j < nElem / 2; j += blockDim.x) { - sendBuff[2 * j] = putOffset + i + 2 * j; - sendBuff[2 * j + 1] = putOffset + i + 2 * j + 1; + for (int j = threadIdx.x; j < nElem; j += blockDim.x) { + sendBuff[j] = putOffset + i + j; + // sendBuff[2 * j + 1] = putOffset + i + 2 * j + 1; } // __syncthreads(); - smChan.putPackets(0, 0, nElem * sizeof(int), threadIdx.x, blockDim.x, flag); + smChan.putPackets2(0, 0, nElem * sizeof(int), threadIdx.x, blockDim.x, flag); } else { - smChan.getPackets(0, 0, nElem * sizeof(int), threadIdx.x, blockDim.x, flag); + smChan.getPackets2(0, 0, nElem * sizeof(int), threadIdx.x, blockDim.x, flag); // If each thread reads 8 bytes at once, we don't need a barrier after getPackets(). // __syncthreads(); - for (int j = threadIdx.x; j < nElem / 2; j += blockDim.x) { - if (sendBuff[2 * j] != getOffset + i + 2 * j) { + for (int j = threadIdx.x; j < nElem; j += blockDim.x) { + if (sendBuff[j] != getOffset + i + j) { // printf("ERROR: rank = %d, sendBuff[%d] = %d, expected %d. 
Skipping following errors\n", rank, 2 * j, // sendBuff[2 * j], getOffset + i + 2 * j); *ret = 1; break; } - if (sendBuff[2 * j + 1] != getOffset + i + 2 * j + 1) { - // printf("ERROR: rank = %d, sendBuff[%d] = %d, expected %d. Skipping following errors\n", rank, 2 * j + 1, - // sendBuff[2 * j + 1], getOffset + i + 2 * j + 1); - *ret = 1; - break; - } + // if (sendBuff[2 * j + 1] != getOffset + i + 2 * j + 1) { + // // printf("ERROR: rank = %d, sendBuff[%d] = %d, expected %d. Skipping following errors\n", rank, 2 * j + 1, + // // sendBuff[2 * j + 1], getOffset + i + 2 * j + 1); + // *ret = 1; + // break; + // } } } // Make sure all threads are done in this iteration From 24420b039f867a15f101732f065c6880bfbe400f Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Fri, 2 Feb 2024 09:19:06 +0000 Subject: [PATCH 11/89] clean up --- include/mscclpp/packet_device.hpp | 47 +++++++++------------------ include/mscclpp/sm_channel_device.hpp | 38 ++++++++++++++++++---- test/mp_unit/sm_channel_tests.cu | 4 +-- test/mscclpp-test/allreduce_test.cu | 18 +++++----- 4 files changed, 59 insertions(+), 48 deletions(-) diff --git a/include/mscclpp/packet_device.hpp b/include/mscclpp/packet_device.hpp index abb115b45..4d571518e 100644 --- a/include/mscclpp/packet_device.hpp +++ b/include/mscclpp/packet_device.hpp @@ -104,7 +104,7 @@ union alignas(16) LLPacket { #endif // defined(MSCCLPP_DEVICE_COMPILE) }; -union alignas(8) LLPacket2 { +union alignas(8) LLPacket64 { // Assume data is written with an atomicity of 8 bytes (IB/RDMA). struct { uint32_t data; @@ -113,14 +113,13 @@ union alignas(8) LLPacket2 { uint64_t raw_; #if defined(MSCCLPP_DEVICE_COMPILE) - MSCCLPP_DEVICE_INLINE LLPacket2() {} + MSCCLPP_DEVICE_INLINE LLPacket64() {} MSCCLPP_DEVICE_INLINE void write(uint32_t val, uint32_t flag) { #if defined(MSCCLPP_DEVICE_CUDA) #else // !defined(MSCCLPP_DEVICE_CUDA) uint2 reg = make_uint2(val, flag); uint64_t* p = reinterpret_cast(®); - // __builtin_nontemporal_store(*p, &(raw_)); atomicStore(&(raw_), *p, memoryOrderRelaxed); #endif } @@ -129,8 +128,7 @@ union alignas(8) LLPacket2 { #if defined(MSCCLPP_DEVICE_CUDA) #else // !defined(MSCCLPP_DEVICE_CUDA) uint64_t reg; - // reg = atomicLoad(&(raw_), memoryOrderRelaxed); - reg = __builtin_nontemporal_load(&(raw_)); + reg = atomicLoad(&(raw_), memoryOrderRelaxed); uint2* ptr = reinterpret_cast(®); data = ptr->x; return (ptr->y != flag); @@ -139,20 +137,7 @@ union alignas(8) LLPacket2 { MSCCLPP_DEVICE_INLINE uint32_t read(uint32_t flag, int64_t maxSpinCount = 1000000) const { uint32_t data; - // POLL_MAYBE_JAILBREAK(readOnce(flag, data), maxSpinCount); - int64_t spins = 0; - uint64_t reg; - uint2* ptr; - - do { - reg = __builtin_nontemporal_load(&(raw_)); - ptr = reinterpret_cast(®); - // if (spins >= maxSpinCount) { - asm volatile("s_waitcnt vmcnt(0)"); - // spins = 0; - // } - } while ((ptr->y != flag)); - data = ptr->x; + POLL_MAYBE_JAILBREAK(readOnce(flag, data), maxSpinCount); return data; } @@ -190,30 +175,30 @@ MSCCLPP_DEVICE_INLINE void getPackets(const void* targetPtr, uint64_t targetOffs } } -/// Read from the origin and write to the target buffer. -MSCCLPP_DEVICE_INLINE void putPackets2(void* targetPtr, uint64_t targetOffset, const void* originPtr, - uint64_t originOffset, uint64_t originBytes, uint32_t threadId, - uint32_t numThreads, uint32_t flag) { +/// Read from the origin and write to the target buffer. Write 64-bit data at a time (32bit data + 32bit flag). 
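+/// (This is the low-latency packet scheme: each 8-byte store carries a 32-bit payload together with its 32-bit flag, so the receiver polls the flag of each packet instead of waiting on a separate semaphore.)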
+MSCCLPP_DEVICE_INLINE void putPackets64(void* targetPtr, uint64_t targetOffset, const void* originPtr, + uint64_t originOffset, uint64_t originBytes, uint32_t threadId, + uint32_t numThreads, uint32_t flag) { // Offsets should be aligned to 8 bytes & size should be a multiple of 8 bytes const uint32_t* originBase = (const uint32_t*)((const char*)originPtr + originOffset); - LLPacket2* targetBase = (LLPacket2*)((char*)targetPtr + targetOffset); + LLPacket64* targetBase = (LLPacket64*)((char*)targetPtr + targetOffset); size_t nElem = originBytes / sizeof(uint32_t); for (size_t i = threadId; i < nElem; i += numThreads) { - LLPacket2* pkt = &targetBase[i]; + LLPacket64* pkt = &targetBase[i]; pkt->write(originBase[i], flag); } } -/// Read from the target buffer and write to the origin. -MSCCLPP_DEVICE_INLINE void getPackets2(const void* targetPtr, uint64_t targetOffset, void* originPtr, - uint64_t originOffset, uint64_t originBytes, uint32_t threadId, - uint32_t numThreads, uint32_t flag) { +/// Read from the target buffer and write to the origin. Read 64-bit data at a time (32bit data + 32bit flag). +MSCCLPP_DEVICE_INLINE void getPackets64(const void* targetPtr, uint64_t targetOffset, void* originPtr, + uint64_t originOffset, uint64_t originBytes, uint32_t threadId, + uint32_t numThreads, uint32_t flag) { // Offsets should be aligned to 8 bytes & size should be a multiple of 8 bytes - const LLPacket2* targetBase = (const LLPacket2*)((const char*)targetPtr + targetOffset); + const LLPacket64* targetBase = (const LLPacket64*)((const char*)targetPtr + targetOffset); uint32_t* originBase = (uint32_t*)((char*)originPtr + originOffset); size_t nElem = originBytes / sizeof(uint32_t); for (size_t i = threadId; i < nElem; i += numThreads) { - const LLPacket2* pkt = &targetBase[i]; + const LLPacket64* pkt = &targetBase[i]; originBase[i] = pkt->read(flag); } } diff --git a/include/mscclpp/sm_channel_device.hpp b/include/mscclpp/sm_channel_device.hpp index f2a6c5838..62e4efc16 100644 --- a/include/mscclpp/sm_channel_device.hpp +++ b/include/mscclpp/sm_channel_device.hpp @@ -233,14 +233,40 @@ struct SmChannelDeviceHandle { mscclpp::getPackets(getPacketBuffer_, targetOffset, src_, originOffset, originBytes, threadId, numThreads, flag); } - MSCCLPP_DEVICE_INLINE void putPackets2(uint64_t targetOffset, uint64_t originOffset, uint64_t originBytes, - uint32_t threadId, uint32_t numThreads, uint32_t flag) { - mscclpp::putPackets2(dst_, targetOffset, src_, originOffset, originBytes, threadId, numThreads, flag); + /// Construct @ref LLPacket64 from the data in the local memory (origin) and write it on the remote packet buffer + /// (target). + /// + /// This function is intended to be collectively called by multiple threads. Each thread copies a part of the packets. + /// Note that this function is intended to be used with @ref getPackets64() on the remote side to copy data in 64-bit units. + /// + /// @param targetOffset The offset in bytes of the remote packet buffer. + /// @param originOffset The offset in bytes of the local data. + /// @param originBytes Bytes of the origin to be copied. + /// @param threadId The index of the current thread among all threads running this function. This is different from + /// the `threadIdx` in CUDA. + /// @param numThreads The total number of threads that run this function.
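+ /// @param flag The flag to write. + /// + /// A minimal caller sketch (hypothetical kernel; a single thread block collectively calls it): + /// __global__ void putKernel(mscclpp::DeviceHandle<mscclpp::SmChannel> chan, size_t nBytes, uint32_t flag) { + /// chan.putPackets64(0 /*targetOffset*/, 0 /*originOffset*/, nBytes, threadIdx.x, blockDim.x, flag); + /// }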
+ /// + MSCCLPP_DEVICE_INLINE void putPackets64(uint64_t targetOffset, uint64_t originOffset, uint64_t originBytes, + uint32_t threadId, uint32_t numThreads, uint32_t flag) { + mscclpp::putPackets64(dst_, targetOffset, src_, originOffset, originBytes, threadId, numThreads, flag); } - MSCCLPP_DEVICE_INLINE void getPackets2(uint64_t targetOffset, uint64_t originOffset, uint64_t originBytes, - uint32_t threadId, uint32_t numThreads, uint32_t flag) { - mscclpp::getPackets2(getPacketBuffer_, targetOffset, src_, originOffset, originBytes, threadId, numThreads, flag); + + /// Retrieve data from @ref LLPacket64 in the local packet buffer (target) and write it on the local data (origin). + /// + /// This function is intended to be collectively called by multiple threads. Each thread copies a part of the data. + /// Note that this function is intended to be used with @ref putPackets64() on the remote side to copy data in 64-bit units. + /// + /// @param targetOffset The offset in bytes of the local packet buffer. + /// @param originOffset The offset in bytes of the local data. + /// @param originBytes Bytes of the origin to be copied. + /// @param threadId The index of the current thread among all threads running this function. This is different from + /// the `threadIdx` in CUDA. + /// @param numThreads The total number of threads that run this function. + /// @param flag The flag to read. + /// + MSCCLPP_DEVICE_INLINE void getPackets64(uint64_t targetOffset, uint64_t originOffset, uint64_t originBytes, + uint32_t threadId, uint32_t numThreads, uint32_t flag) { + mscclpp::getPackets64(getPacketBuffer_, targetOffset, src_, originOffset, originBytes, threadId, numThreads, flag); } /// Signal the remote semaphore. diff --git a/test/mp_unit/sm_channel_tests.cu b/test/mp_unit/sm_channel_tests.cu index eccd791a0..f60980a8a 100644 --- a/test/mp_unit/sm_channel_tests.cu +++ b/test/mp_unit/sm_channel_tests.cu @@ -257,9 +257,9 @@ __global__ void kernelSmPacketPingPong(int* buff, int rank, int nElem, int* ret, // sendBuff[2 * j + 1] = putOffset + i + 2 * j + 1; } // __syncthreads(); - smChan.putPackets2(0, 0, nElem * sizeof(int), threadIdx.x, blockDim.x, flag); + smChan.putPackets64(0, 0, nElem * sizeof(int), threadIdx.x, blockDim.x, flag); } else { - smChan.getPackets2(0, 0, nElem * sizeof(int), threadIdx.x, blockDim.x, flag); + smChan.getPackets64(0, 0, nElem * sizeof(int), threadIdx.x, blockDim.x, flag); // If each thread reads 8 bytes at once, we don't need a barrier after getPackets(). // __syncthreads(); for (int j = threadIdx.x; j < nElem; j += blockDim.x) { if (sendBuff[j] != getOffset + i + j) { // printf("ERROR: rank = %d, sendBuff[%d] = %d, expected %d. Skipping following errors\n", rank, 2 * j, // sendBuff[2 * j], getOffset + i + 2 * j); *ret = 1; break; } } } // Make sure all threads are done in this iteration diff --git a/test/mscclpp-test/allreduce_test.cu b/test/mscclpp-test/allreduce_test.cu index af6d94499..dc17a4e11 100644 --- a/test/mscclpp-test/allreduce_test.cu +++ b/test/mscclpp-test/allreduce_test.cu @@ -1049,40 +1049,40 @@ __global__ void allreduce7(int* buff, int* scratch, void* resultBuff, int rank, const int remoteRank = peerIdx < rank ? peerIdx : peerIdx + 1; const int tid = threadIdx.x + localBlockIdx * blockDim.x; // double buffering - size_t scratchBaseOffset = (flag & 1) ? 0 : nPkts * sizeof(mscclpp::LLPacket2); + size_t scratchBaseOffset = (flag & 1) ? 0 : nPkts * sizeof(mscclpp::LLPacket64); void* scratchBuff = (void*)((char*)scratch + scratchBaseOffset); - size_t scratchOffset = scratchBaseOffset + rank * nPktsPerRank * sizeof(mscclpp::LLPacket2); + size_t scratchOffset = scratchBaseOffset + rank * nPktsPerRank * sizeof(mscclpp::LLPacket64); size_t scratchResultOffset = - (flag & 1) ? 
2 * nPkts * sizeof(mscclpp::LLPacket64) : 3 * nPkts * sizeof(mscclpp::LLPacket64); size_t srcOffset = remoteRank * nelemsPerRank * sizeof(int); uint32_t* src = (uint32_t*)((char*)buff + rank * nelemsPerRank * sizeof(int)); uint32_t* dst = (uint32_t*)((char*)resultBuff + rank * nelemsPerRank * sizeof(int)); // step 1: write to scratch buffer - constSmOutOfPlaceChans[peerIdx].putPackets2(scratchOffset, srcOffset, nelemsPerRank * sizeof(int), tid, - blockDim.x * nBlocksPerPeer, flag); + constSmOutOfPlaceChans[peerIdx].putPackets64(scratchOffset, srcOffset, nelemsPerRank * sizeof(int), tid, + blockDim.x * nBlocksPerPeer, flag); // step 2: get data from scratch buffer, reduce data and write result to remote scratch buffer for (int idx = threadIdx.x + blockIdx.x * blockDim.x; idx < nPktsPerRank; idx += blockDim.x * gridDim.x) { uint32_t data = 0; for (int index = 0; index < nPeers; index++) { const int remoteRank = index < rank ? index : index + 1; - mscclpp::LLPacket2* dstPkt = (mscclpp::LLPacket2*)scratchBuff + remoteRank * nPktsPerRank; + mscclpp::LLPacket64* dstPkt = (mscclpp::LLPacket64*)scratchBuff + remoteRank * nPktsPerRank; uint32_t val = dstPkt[idx].read(flag); data += val; } data += src[idx]; dst[idx] = data; - mscclpp::LLPacket2 packet; + mscclpp::LLPacket64 packet; packet.data = data; packet.flag = flag; - size_t offset = scratchResultOffset / sizeof(mscclpp::LLPacket2) + (idx + rank * nPktsPerRank); + size_t offset = scratchResultOffset / sizeof(mscclpp::LLPacket64) + (idx + rank * nPktsPerRank); for (int index = 0; index < nPeers; index++) { constSmOutOfPlaceChans[index].write(offset, packet); } } // step 3: get data result from scratch buffer - mscclpp::LLPacket2* dstPkt = (mscclpp::LLPacket2*)((char*)scratch + scratchResultOffset); + mscclpp::LLPacket64* dstPkt = (mscclpp::LLPacket64*)((char*)scratch + scratchResultOffset); const int dstOffset = remoteRank * nPktsPerRank; uint32_t* result = (uint32_t*)((char*)resultBuff + remoteRank * nelemsPerRank * sizeof(int)); for (int idx = threadIdx.x + localBlockIdx * blockDim.x; idx < nPktsPerRank; idx += blockDim.x * nBlocksPerPeer) { From 83c7e0607d948256e8e1985367329c42698bc917 Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Fri, 2 Feb 2024 09:48:43 +0000 Subject: [PATCH 12/89] update --- include/mscclpp/packet_device.hpp | 8 +++++--- include/mscclpp/poll_device.hpp | 6 ++++++ include/mscclpp/sm_channel_device.hpp | 1 - 3 files changed, 11 insertions(+), 4 deletions(-) diff --git a/include/mscclpp/packet_device.hpp b/include/mscclpp/packet_device.hpp index 4d571518e..a653bbc18 100644 --- a/include/mscclpp/packet_device.hpp +++ b/include/mscclpp/packet_device.hpp @@ -83,7 +83,7 @@ union alignas(16) LLPacket { MSCCLPP_DEVICE_INLINE uint2 read(uint32_t flag, int64_t maxSpinCount = 1000000000) const { uint2 data; POLL_MAYBE_JAILBREAK(readOnce(flag, data), maxSpinCount); - // int64_t spins = 0; + // uint32_t spins = 0; // ulonglong2 reg; // uint4* ptr; @@ -91,8 +91,10 @@ union alignas(16) LLPacket { // reg.x = __builtin_nontemporal_load(&(raw_.x)); // reg.y = __builtin_nontemporal_load(&(raw_.y)); // ptr = reinterpret_cast(®); - // // if (spins >= maxSpinCount) break; - // // spins++; + // if (spins >= maxSpinCount) { + // asm volatile("s_waitcnt vmcnt(0)"); + // spins = 0; + // } // } while ((ptr->y != flag) || (ptr->w != flag)); // data.x = ptr->x; // data.y = ptr->z; diff --git a/include/mscclpp/poll_device.hpp b/include/mscclpp/poll_device.hpp index f8c5d3a9a..24dd0a7b6 100644 --- a/include/mscclpp/poll_device.hpp +++ 
b/include/mscclpp/poll_device.hpp @@ -12,6 +12,9 @@ __device__ uint64_t abortFlag; +#if defined(NDEBUG) +#define __assert_fail(__assertion, __file, __line, __function) ; +#else // !defined(NDEBUG) #if defined(MSCCLPP_DEVICE_HIP) extern "C" __device__ void __assert_fail(const char *__assertion, const char *__file, unsigned int __line, const char *__function); @@ -19,6 +22,7 @@ extern "C" __device__ void __assert_fail(const char *__assertion, const char *__ extern "C" __device__ void __assert_fail(const char *__assertion, const char *__file, unsigned int __line, const char *__function) __THROW; #endif // !defined(MSCCLPP_DEVICE_HIP) +#endif // NDEBUG // If a spin is stuck, print a warning and keep spinning. #define POLL_MAYBE_JAILBREAK(__cond, __max_spin_cnt) \ @@ -26,6 +30,7 @@ extern "C" __device__ void __assert_fail(const char *__assertion, const char *__ int64_t __spin_cnt = 0; \ while (__cond) { \ if (__max_spin_cnt >= 0 && __spin_cnt++ == __max_spin_cnt) { \ + __assert_fail(#__cond, __FILE__, __LINE__, __PRETTY_FUNCTION__); \ } \ } \ } while (0); @@ -42,6 +47,7 @@ extern "C" __device__ void __assert_fail(const char *__assertion, const char *__ break; \ } \ if (__max_spin_cnt >= 0 && __spin_cnt++ == __max_spin_cnt) { \ + __assert_fail(#__cond1 #__cond2, __FILE__, __LINE__, __PRETTY_FUNCTION__); \ } \ } \ } while (0); diff --git a/include/mscclpp/sm_channel_device.hpp b/include/mscclpp/sm_channel_device.hpp index 62e4efc16..494c6e5ff 100644 --- a/include/mscclpp/sm_channel_device.hpp +++ b/include/mscclpp/sm_channel_device.hpp @@ -251,7 +251,6 @@ struct SmChannelDeviceHandle { mscclpp::putPackets64(dst_, targetOffset, src_, originOffset, originBytes, threadId, numThreads, flag); } - /// Retrieve data from @ref LLPacket in the local packet buffer (target) and write it on the local data (origin). /// /// This function is intended to be collectively called by multiple threads. Each thread copies a part of data. From ea4e1f506e947dcd6b93a3f24329f38299be117a Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Fri, 2 Feb 2024 22:52:35 +0000 Subject: [PATCH 13/89] cleanup --- include/mscclpp/packet_device.hpp | 25 ++---- include/mscclpp/poll_device.hpp | 4 +- test/mscclpp-test/allgather_test.cu | 131 +--------------------------- 3 files changed, 9 insertions(+), 151 deletions(-) diff --git a/include/mscclpp/packet_device.hpp b/include/mscclpp/packet_device.hpp index a653bbc18..64943ed41 100644 --- a/include/mscclpp/packet_device.hpp +++ b/include/mscclpp/packet_device.hpp @@ -42,8 +42,6 @@ union alignas(16) LLPacket { ulonglong2* p = reinterpret_cast(®); atomicStore(&(raw_.x), p->x, memoryOrderRelaxed); atomicStore(&(raw_.y), p->y, memoryOrderRelaxed); - // __builtin_nontemporal_store(p->x, &(raw_.x)); - // __builtin_nontemporal_store(p->y, &(raw_.y)); #endif } @@ -67,8 +65,6 @@ union alignas(16) LLPacket { ulonglong2 reg; reg.x = atomicLoad(&(raw_.x), memoryOrderRelaxed); reg.y = atomicLoad(&(raw_.y), memoryOrderRelaxed); - // reg.x = __builtin_nontemporal_load(&(raw_.x)); - // reg.y = __builtin_nontemporal_load(&(raw_.y)); uint4* ptr = reinterpret_cast(®); data.x = ptr->x; data.y = ptr->z; @@ -80,24 +76,9 @@ union alignas(16) LLPacket { /// @param flag The flag to read. /// @param maxSpinCount The maximum number of spin counts before asserting. Never assert if negative. /// @return The 8-byte data read. 
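+ /// Internally this spins on readOnce() via POLL_MAYBE_JAILBREAK until the flag matches, asserting once maxSpinCount is exceeded.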
- MSCCLPP_DEVICE_INLINE uint2 read(uint32_t flag, int64_t maxSpinCount = 1000000000) const { + MSCCLPP_DEVICE_INLINE uint2 read(uint32_t flag, int64_t maxSpinCount = 100000000) const { uint2 data; POLL_MAYBE_JAILBREAK(readOnce(flag, data), maxSpinCount); - // uint32_t spins = 0; - // ulonglong2 reg; - // uint4* ptr; - - // do { - // reg.x = __builtin_nontemporal_load(&(raw_.x)); - // reg.y = __builtin_nontemporal_load(&(raw_.y)); - // ptr = reinterpret_cast(®); - // if (spins >= maxSpinCount) { - // asm volatile("s_waitcnt vmcnt(0)"); - // spins = 0; - // } - // } while ((ptr->y != flag) || (ptr->w != flag)); - // data.x = ptr->x; - // data.y = ptr->z; return data; } @@ -119,6 +100,7 @@ union alignas(8) LLPacket64 { MSCCLPP_DEVICE_INLINE void write(uint32_t val, uint32_t flag) { #if defined(MSCCLPP_DEVICE_CUDA) + asm volatile("st.volatile.global.v2.u32 [%0], {%1,%2};" ::"l"(&raw_), "r"(val), "r"(flag)); #else // !defined(MSCCLPP_DEVICE_CUDA) uint2 reg = make_uint2(val, flag); uint64_t* p = reinterpret_cast(®); @@ -128,6 +110,9 @@ union alignas(8) LLPacket64 { MSCCLPP_DEVICE_INLINE bool readOnce(uint32_t flag, uint32_t& data) const { #if defined(MSCCLPP_DEVICE_CUDA) + uint32_t f; + asm volatile("ld.volatile.global.v2.u32 {%0,%1}, [%4];" : "=r"(data), "=r"(f) : "l"(&raw_)); + return (f != flag); #else // !defined(MSCCLPP_DEVICE_CUDA) uint64_t reg; reg = atomicLoad(&(raw_), memoryOrderRelaxed); diff --git a/include/mscclpp/poll_device.hpp b/include/mscclpp/poll_device.hpp index 24dd0a7b6..9ad116f84 100644 --- a/include/mscclpp/poll_device.hpp +++ b/include/mscclpp/poll_device.hpp @@ -4,14 +4,12 @@ #ifndef MSCCLPP_POLL_DEVICE_HPP_ #define MSCCLPP_POLL_DEVICE_HPP_ -#include "atomic_device.hpp" +#include "device.hpp" #if defined(MSCCLPP_DEVICE_COMPILE) #include -__device__ uint64_t abortFlag; - #if defined(NDEBUG) #define __assert_fail(__assertion, __file, __line, __function) ; #else // !defined(NDEBUG) diff --git a/test/mscclpp-test/allgather_test.cu b/test/mscclpp-test/allgather_test.cu index e7b3e407e..e8915aebe 100644 --- a/test/mscclpp-test/allgather_test.cu +++ b/test/mscclpp-test/allgather_test.cu @@ -388,7 +388,6 @@ __global__ void __launch_bounds__(1024, 1) if (nLoop > 0) { // First loop unrolling const size_t peerIdx = wid % nPeer; - // const size_t remoteRankLocalIndex = (peerIdx < rank ? peerIdx : peerIdx + 1); const size_t offset = bytesPerGPU * rank + (wid / nPeer) * unitBytesPerWarp; smChans[peerIdx].put<16, false>(offset, unitBytesPerWarp, lid, WARP_SIZE); } @@ -396,7 +395,6 @@ __global__ void __launch_bounds__(1024, 1) for (size_t i = 1; i < nLoop; ++i) { const size_t gWid = wid + i * nWarp; const size_t peerIdx = gWid % nPeer; - // const size_t remoteRankLocalIndex = (peerIdx < rank ? peerIdx : peerIdx + 1); const size_t offset = bytesPerGPU * rank + (gWid / nPeer) * unitBytesPerWarp; smChans[peerIdx].put<16, false>(offset, unitBytesPerWarp, lid, WARP_SIZE); } @@ -404,7 +402,6 @@ __global__ void __launch_bounds__(1024, 1) if (bytes % unitBytes > 0) { const size_t gWid = wid + nLoop * nWarp; const size_t peerIdx = gWid % nPeer; - const size_t remoteRankLocalIndex = (peerIdx < rank ? 
peerIdx : peerIdx + 1); const size_t offsetWithinRank = (gWid / nPeer) * unitBytesPerWarp; const size_t offset = bytesPerGPU * rank + offsetWithinRank; const size_t remainBytes = (offsetWithinRank + unitBytesPerWarp > bytesPerGPU) @@ -471,7 +468,6 @@ __global__ void __launch_bounds__(1024, 1) for (size_t i = 1; i < nLoop; ++i) { const size_t gWid = wid + i * nWarp; const size_t peerIdx = gWid % nPeer; - // const size_t remoteRankLocalIndex = (peerIdx < rank ? peerIdx : peerIdx + 1); const size_t offset = bytesPerGPU * rank + (gWid / nPeer) * unitBytesPerWarp; smChans[peerIdx].putPackets(scratchOffset + offset * 2, offset, unitBytesPerWarp, lid, WARP_SIZE, flag); } @@ -487,7 +483,6 @@ __global__ void __launch_bounds__(1024, 1) if (bytes % unitBytes > 0) { const size_t gWid = wid + nLoop * nWarp; const size_t peerIdx = gWid % nPeer; - const size_t remoteRankLocalIndex = (peerIdx < rank ? peerIdx : peerIdx + 1); const size_t offsetWithinRank = (gWid / nPeer) * unitBytesPerWarp; const size_t offset = bytesPerGPU * rank + offsetWithinRank; const size_t remainBytes = (offsetWithinRank + unitBytesPerWarp > bytesPerGPU) @@ -516,120 +511,6 @@ __global__ void __launch_bounds__(1024, 1) } } -__global__ void __launch_bounds__(1024, 1) - allgather8(size_t rank, [[maybe_unused]] size_t worldSize, size_t nRanksPerNode, size_t nelemsPerGPU) { - const size_t nBlock = gridDim.x / 2; - const bool isPut = blockIdx.x < nBlock; - - const size_t tid = threadIdx.x + (blockIdx.x % nBlock) * blockDim.x; - const size_t lid = tid % WARP_SIZE; - const size_t wid = tid / WARP_SIZE; - - const size_t nThread = blockDim.x * nBlock; - const size_t nWarp = nThread / WARP_SIZE; - const size_t nPeer = nRanksPerNode - 1; - // const size_t chanOffset = nPeer * blockIdx.x; - auto smChans = constSmOutOfPlaceChans; - - const uint32_t flag = (uint32_t)globalFlag; - // if (wid < nPeer && lid == 0) { - // smChans[wid].relaxedSignal(); - // smChans[wid].wait(); - // } - // __syncthreads(); - const size_t bytesPerGPU = nelemsPerGPU * sizeof(int); - const size_t bytes = bytesPerGPU * nPeer; - size_t unitBytesPerThread = 8; - // if (bytes >= nThread * 64) { - // unitBytesPerThread = 64; - // } else { - // unitBytesPerThread = 16; - // } - const size_t unitBytesPerWarp = unitBytesPerThread * WARP_SIZE; - const size_t unitBytes = unitBytesPerWarp * nWarp; - const size_t nLoop = bytes / unitBytes; - - // double buffering - const size_t scratchOffset = (flag & 1) ? 0 : bytesPerGPU * nRanksPerNode * 2; - - if (isPut) { - if (nLoop > 0) { - // First loop unrolling - const size_t peerIdx = wid % nPeer; - // const size_t remoteRankLocalIndex = (peerIdx < rank ? peerIdx : peerIdx + 1); - const size_t offset = bytesPerGPU * rank + (wid / nPeer) * unitBytesPerWarp; - // smChans[peerIdx].putPackets(scratchOffset + offset * 2, offset, unitBytesPerWarp, lid, WARP_SIZE, flag); - mscclpp::putPackets(smChans[peerIdx].dst_, scratchOffset + offset * 2, smChans[peerIdx].src_, offset, - unitBytesPerWarp, lid, WARP_SIZE, flag); - } - - for (size_t i = 1; i < nLoop; ++i) { - const size_t gWid = wid + i * nWarp; - const size_t peerIdx = gWid % nPeer; - // const size_t remoteRankLocalIndex = (peerIdx < rank ? 
peerIdx : peerIdx + 1); - const size_t offset = bytesPerGPU * rank + (gWid / nPeer) * unitBytesPerWarp; - // smChans[peerIdx].putPackets(scratchOffset + offset * 2, offset, unitBytesPerWarp, lid, WARP_SIZE, flag); - mscclpp::putPackets(smChans[peerIdx].dst_, scratchOffset + offset * 2, smChans[peerIdx].src_, offset, - unitBytesPerWarp, lid, WARP_SIZE, flag); - } - - if (bytes % unitBytes > 0) { - const size_t gWid = wid + nLoop * nWarp; - const size_t peerIdx = gWid % nPeer; - const size_t remoteRankLocalIndex = (peerIdx < rank ? peerIdx : peerIdx + 1); - const size_t offsetWithinRank = (gWid / nPeer) * unitBytesPerWarp; - const size_t offset = bytesPerGPU * rank + offsetWithinRank; - const size_t remainBytes = (offsetWithinRank + unitBytesPerWarp > bytesPerGPU) - ? ((bytesPerGPU > offsetWithinRank) ? (bytesPerGPU - offsetWithinRank) : 0) - : unitBytesPerWarp; - if (remainBytes > 0) { - // smChans[peerIdx].putPackets(scratchOffset + offset * 2, offset, remainBytes, lid, WARP_SIZE, flag); - mscclpp::putPackets(smChans[peerIdx].dst_, scratchOffset + offset * 2, smChans[peerIdx].src_, offset, - remainBytes, lid, WARP_SIZE, flag); - } - } - } else { - if (nLoop > 0) { - // First loop unrolling - const size_t peerIdx = wid % nPeer; - const size_t remoteRankLocalIndex = (peerIdx < rank ? peerIdx : peerIdx + 1); - const size_t offset = bytesPerGPU * remoteRankLocalIndex + (wid / nPeer) * unitBytesPerWarp; - mscclpp::getPackets(smChans[peerIdx].getPacketBuffer_, scratchOffset + offset * 2, smChans[peerIdx].src_, offset, - unitBytesPerWarp, lid, WARP_SIZE, flag); - } - - for (size_t i = 1; i < nLoop; ++i) { - const size_t gWid = wid + i * nWarp; - const size_t peerIdx = gWid % nPeer; - const size_t remoteRankLocalIndex = (peerIdx < rank ? peerIdx : peerIdx + 1); - const size_t offset = bytesPerGPU * remoteRankLocalIndex + (gWid / nPeer) * unitBytesPerWarp; - // smChans[peerIdx].getPackets(scratchOffset + offset * 2, offset, unitBytesPerWarp, lid, WARP_SIZE, flag); - mscclpp::getPackets(smChans[peerIdx].getPacketBuffer_, scratchOffset + offset * 2, smChans[peerIdx].src_, offset, - unitBytesPerWarp, lid, WARP_SIZE, flag); - } - - if (bytes % unitBytes > 0) { - const size_t gWid = wid + nLoop * nWarp; - const size_t peerIdx = gWid % nPeer; - const size_t remoteRankLocalIndex = (peerIdx < rank ? peerIdx : peerIdx + 1); - const size_t offsetWithinRank = (gWid / nPeer) * unitBytesPerWarp; - const size_t offset = bytesPerGPU * remoteRankLocalIndex + offsetWithinRank; - const size_t remainBytes = (offsetWithinRank + unitBytesPerWarp > bytesPerGPU) - ? ((bytesPerGPU > offsetWithinRank) ? 
(bytesPerGPU - offsetWithinRank) : 0) - : unitBytesPerWarp; - if (remainBytes > 0) { - // smChans[peerIdx].getPackets(scratchOffset + offset * 2, offset, remainBytes, lid, WARP_SIZE, flag); - mscclpp::getPackets(smChans[peerIdx].getPacketBuffer_, scratchOffset + offset * 2, smChans[peerIdx].src_, - offset, remainBytes, lid, WARP_SIZE, flag); - } - } - } - - if (threadIdx.x == 0 && blockIdx.x == 0) { - globalFlag += 1; - } -} - class AllGatherProxyService : public mscclpp::BaseProxyService { public: AllGatherProxyService(int worldSize, int rank, int cudaDevice); @@ -738,9 +619,6 @@ void AllGatherTestColl::runColl(const TestArgs& args, cudaStream_t stream) { } else if (kernelNum == 7) { nBlocks = 4; nThreads = 896; - } else if (kernelNum == 8) { - nBlocks = 2; - nThreads = 896; } else { nBlocks = 1; nThreads = WARP_SIZE * (worldSize - 1); @@ -761,8 +639,6 @@ void AllGatherTestColl::runColl(const TestArgs& args, cudaStream_t stream) { allgather6<<>>(rank, worldSize, nRanksPerNode, paramCount_); } else if (kernelNum == 7) { allgather7<<>>(rank, worldSize, nRanksPerNode, paramCount_); - } else if (kernelNum == 8) { - allgather8<<>>(rank, worldSize, nRanksPerNode, paramCount_); } } @@ -818,8 +694,7 @@ std::vector AllGatherTestColl::getKernelRestrictions() { {4, "allgather4", true, 3, 16 * worldSize_ /*use ulong2 to transfer data*/}, {5, "allgather5", false, 1, 16 * worldSize_ /*use ulong2 to transfer data*/}, {6, "allgather6", false, 1, 16 * worldSize_ /*use ulong2 to transfer data*/}, - {7, "allgather7", false, 1, 16 * worldSize_ /*use ulong2 to transfer data*/}, - {8, "allgather8", false, 1, 16 * worldSize_ /*use ulong2 to transfer data*/}}; + {7, "allgather7", false, 1, 16 * worldSize_ /*use ulong2 to transfer data*/}}; } class AllGatherTestEngine : public BaseTestEngine { @@ -850,7 +725,7 @@ AllGatherTestEngine::AllGatherTestEngine(const TestArgs& args) : BaseTestEngine( void AllGatherTestEngine::allocateBuffer() { sendBuff_ = mscclpp::allocExtSharedCuda(args_.maxBytes / sizeof(int)); expectedBuff_ = std::shared_ptr(new int[args_.maxBytes / sizeof(int)]); - if (args_.kernelNum == 7 || args_.kernelNum == 8) { + if (args_.kernelNum == 7) { const size_t nPacket = (args_.maxBytes + sizeof(uint64_t) - 1) / sizeof(uint64_t); // 2x for double-buffering, scratchBuff used to store original data and reduced results const size_t scratchBuffNelem = nPacket * 2 /*original data & reduced result */ * 2 /* double buffering*/; @@ -878,7 +753,7 @@ void AllGatherTestEngine::setupConnections() { CUDATHROW(cudaMemcpyToSymbol(constSmChans, smChannelHandles.data(), sizeof(DeviceHandle) * smChannelHandles.size())); - if (args_.kernelNum == 7 || args_.kernelNum == 8) { + if (args_.kernelNum == 7) { const size_t nPacket = (args_.maxBytes + sizeof(uint64_t) - 1) / sizeof(uint64_t); const size_t scratchPacketBuffBytes = nPacket * 2 * 2 * sizeof(mscclpp::LLPacket); setupMeshConnections(smOutOfPlaceChannels_, sendBuff_.get(), args_.maxBytes, scratchPacketBuff_.get(), From f79b939ddf56eda3717e022c777087c8eaafe877 Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Fri, 2 Feb 2024 23:12:03 +0000 Subject: [PATCH 14/89] update docs --- docs/quickstart.md | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/docs/quickstart.md b/docs/quickstart.md index f2b12d187..af1bbe5f3 100644 --- a/docs/quickstart.md +++ b/docs/quickstart.md @@ -27,10 +27,24 @@ CMake 3.25 or later is required. 
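+You can check your installed CMake version with `cmake --version`.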
```bash $ git clone https://github.com/microsoft/mscclpp.git $ mkdir -p mscclpp/build && cd mscclpp/build +``` + +For NVIDIA platforms, build MSCCL++ as follows. + +```bash +# For NVIDIA platforms $ cmake -DCMAKE_BUILD_TYPE=Release .. $ make -j ``` +For AMD platforms, use HIPCC instead of the default C++ compiler. Replace `/path/to/hipcc` in the command below with your HIPCC path. + +```bash +# For AMD platforms +$ CXX=/path/to/hipcc cmake -DCMAKE_BUILD_TYPE=Release .. +$ make -j +``` + ## Install from Source (Libraries and Headers) ```bash From 214fa22594c95f3a0e8dd6673c71caf05fcbcbd6 Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Fri, 2 Feb 2024 23:14:31 +0000 Subject: [PATCH 15/89] fix a typo --- include/mscclpp/packet_device.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/mscclpp/packet_device.hpp b/include/mscclpp/packet_device.hpp index 64943ed41..f43abba6d 100644 --- a/include/mscclpp/packet_device.hpp +++ b/include/mscclpp/packet_device.hpp @@ -111,7 +111,7 @@ union alignas(8) LLPacket64 { MSCCLPP_DEVICE_INLINE bool readOnce(uint32_t flag, uint32_t& data) const { #if defined(MSCCLPP_DEVICE_CUDA) uint32_t f; - asm volatile("ld.volatile.global.v2.u32 {%0,%1}, [%4];" : "=r"(data), "=r"(f) : "l"(&raw_)); + asm volatile("ld.volatile.global.v2.u32 {%0,%1}, [%2];" : "=r"(data), "=r"(f) : "l"(&raw_)); return (f != flag); #else // !defined(MSCCLPP_DEVICE_CUDA) uint64_t reg; From f6e515a61d4e47930652bb10b67c55b9ffc3b356 Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Fri, 2 Feb 2024 23:35:02 +0000 Subject: [PATCH 16/89] update names --- test/mp_unit/sm_channel_tests.cu | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/test/mp_unit/sm_channel_tests.cu b/test/mp_unit/sm_channel_tests.cu index f60980a8a..7b081db91 100644 --- a/test/mp_unit/sm_channel_tests.cu +++ b/test/mp_unit/sm_channel_tests.cu @@ -238,7 +238,7 @@ TEST_F(SmChannelOneToOneTest, GetPingPong) { EXPECT_EQ(*ret, 0); } -__global__ void kernelSmPacketPingPong(int* buff, int rank, int nElem, int* ret, int nTries = 1000) { +__global__ void kernelSmPacket64PingPong(int* buff, int rank, int nElem, int* ret, int nTries = 1000) { if (rank > 1) return; DeviceHandle& smChan = gChannelOneToOneTestConstSmChans; @@ -282,7 +282,7 @@ __global__ void kernelSmPacketPingPong(int* buff, int rank, int nElem, int* ret, } } -TEST_F(SmChannelOneToOneTest, PacketPingPong) { +TEST_F(SmChannelOneToOneTest, Packet64PingPong) { if (gEnv->rank >= numRanksToUse) return; const int nElem = 4 * 1024 * 1024; @@ -302,23 +302,23 @@ TEST_F(SmChannelOneToOneTest, PacketPingPong) { std::shared_ptr ret = mscclpp::makeSharedCudaHost(0); // The least nelem is 2 for packet ping pong - kernelSmPacketPingPong<<<1, 1024>>>(buff.get(), gEnv->rank, 2, ret.get()); + kernelSmPacket64PingPong<<<1, 1024>>>(buff.get(), gEnv->rank, 2, ret.get()); MSCCLPP_CUDATHROW(cudaDeviceSynchronize()); *ret = 0; - kernelSmPacketPingPong<<<1, 1024>>>(buff.get(), gEnv->rank, 1024, ret.get()); + kernelSmPacket64PingPong<<<1, 1024>>>(buff.get(), gEnv->rank, 1024, ret.get()); MSCCLPP_CUDATHROW(cudaDeviceSynchronize()); EXPECT_EQ(*ret, 0); *ret = 0; - kernelSmPacketPingPong<<<1, 1024>>>(buff.get(), gEnv->rank, 1024 * 1024, ret.get()); + kernelSmPacket64PingPong<<<1, 1024>>>(buff.get(), gEnv->rank, 1024 * 1024, ret.get()); MSCCLPP_CUDATHROW(cudaDeviceSynchronize()); EXPECT_EQ(*ret, 0); *ret = 0; - kernelSmPacketPingPong<<<1, 1024>>>(buff.get(), gEnv->rank, 4 * 1024 * 1024, ret.get()); + kernelSmPacket64PingPong<<<1, 
1024>>>(buff.get(), gEnv->rank, 4 * 1024 * 1024, ret.get()); MSCCLPP_CUDATHROW(cudaDeviceSynchronize()); EXPECT_EQ(*ret, 0); @@ -327,12 +327,12 @@ TEST_F(SmChannelOneToOneTest, PacketPingPong) { int nTries = 1000000; communicator->bootstrap()->barrier(); mscclpp::Timer timer; - kernelSmPacketPingPong<<<1, 1024>>>(buff.get(), gEnv->rank, 1024, ret.get(), nTries); + kernelSmPacket64PingPong<<<1, 1024>>>(buff.get(), gEnv->rank, 1024, ret.get(), nTries); MSCCLPP_CUDATHROW(cudaDeviceSynchronize()); communicator->bootstrap()->barrier(); if (gEnv->rank == 0) { - std::cout << "smPacketPingPong" + std::cout << "smPacket64PingPong" << ": " << std::setprecision(4) << (float)timer.elapsed() / (float)(nTries) << " us/iter\n"; } } From 31d427f7b886c19573b7b8a7c124d6baac22ff85 Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Sun, 4 Feb 2024 04:26:18 +0000 Subject: [PATCH 17/89] clean up tests --- test/mp_unit/mp_unit_tests.hpp | 2 + test/mp_unit/sm_channel_tests.cu | 163 ++++++++++++++++++++----------- 2 files changed, 109 insertions(+), 56 deletions(-) diff --git a/test/mp_unit/mp_unit_tests.hpp b/test/mp_unit/mp_unit_tests.hpp index de1f4b6b4..f36c57639 100644 --- a/test/mp_unit/mp_unit_tests.hpp +++ b/test/mp_unit/mp_unit_tests.hpp @@ -148,6 +148,8 @@ class SmChannelOneToOneTest : public CommunicatorTestBase { void setupMeshConnections(std::vector& smChannels, void* inputBuff, size_t inputBuffBytes, void* outputBuff = nullptr, size_t outputBuffBytes = 0); + using PacketPingPongKernelWrapper = std::function; + void packetPingPongTest(const std::string testName, PacketPingPongKernelWrapper kernelWrapper); std::unordered_map> smSemaphores; }; diff --git a/test/mp_unit/sm_channel_tests.cu b/test/mp_unit/sm_channel_tests.cu index 7b081db91..5870c7987 100644 --- a/test/mp_unit/sm_channel_tests.cu +++ b/test/mp_unit/sm_channel_tests.cu @@ -70,6 +70,61 @@ void SmChannelOneToOneTest::setupMeshConnections(std::vector __constant__ DeviceHandle gChannelOneToOneTestConstSmChans; +void SmChannelOneToOneTest::packetPingPongTest(const std::string testName, PacketPingPongKernelWrapper kernelWrapper) { + if (gEnv->rank >= numRanksToUse) return; + + const int nElem = 4 * 1024 * 1024; + const int defaultNTries = 1000; + + std::vector smChannels; + std::shared_ptr buff = mscclpp::allocExtSharedCuda(nElem); + std::shared_ptr intermBuff = mscclpp::allocExtSharedCuda(nElem * 2); + setupMeshConnections(smChannels, buff.get(), nElem * sizeof(int), intermBuff.get(), nElem * 2 * sizeof(int)); + std::vector> deviceHandles(smChannels.size()); + std::transform(smChannels.begin(), smChannels.end(), deviceHandles.begin(), + [](const mscclpp::SmChannel& smChan) { return mscclpp::deviceHandle(smChan); }); + + ASSERT_EQ(smChannels.size(), 1); + MSCCLPP_CUDATHROW(cudaMemcpyToSymbol(gChannelOneToOneTestConstSmChans, deviceHandles.data(), + sizeof(DeviceHandle))); + + std::shared_ptr ret = mscclpp::makeSharedCudaHost(0); + + // The least nelem is 2 for packet ping pong + kernelWrapper(buff.get(), gEnv->rank, 2, ret.get(), defaultNTries); + MSCCLPP_CUDATHROW(cudaDeviceSynchronize()); + *ret = 0; + + kernelWrapper(buff.get(), gEnv->rank, 1024, ret.get(), defaultNTries); + MSCCLPP_CUDATHROW(cudaDeviceSynchronize()); + + EXPECT_EQ(*ret, 0); + *ret = 0; + + kernelWrapper(buff.get(), gEnv->rank, 1024 * 1024, ret.get(), defaultNTries); + MSCCLPP_CUDATHROW(cudaDeviceSynchronize()); + + EXPECT_EQ(*ret, 0); + *ret = 0; + + kernelWrapper(buff.get(), gEnv->rank, 4 * 1024 * 1024, ret.get(), defaultNTries); + MSCCLPP_CUDATHROW(cudaDeviceSynchronize()); + 
+ EXPECT_EQ(*ret, 0); + *ret = 0; + + int nTries = 1000000; + communicator->bootstrap()->barrier(); + mscclpp::Timer timer; + kernelWrapper(buff.get(), gEnv->rank, 1024, ret.get(), nTries); + MSCCLPP_CUDATHROW(cudaDeviceSynchronize()); + communicator->bootstrap()->barrier(); + + if (gEnv->rank == 0) { + std::cout << testName << ": " << std::setprecision(4) << (float)timer.elapsed() / (float)(nTries) << " us/iter\n"; + } +} + __global__ void kernelSmPutPingPong(int* buff, int rank, int nElem, int* ret) { DeviceHandle& smChan = gChannelOneToOneTestConstSmChans; volatile int* sendBuff = (volatile int*)buff; @@ -269,12 +324,6 @@ __global__ void kernelSmPacket64PingPong(int* buff, int rank, int nElem, int* re *ret = 1; break; } - // if (sendBuff[2 * j + 1] != getOffset + i + 2 * j + 1) { - // // printf("ERROR: rank = %d, sendBuff[%d] = %d, expected %d. Skipping following errors\n", rank, 2 * j + 1, - // // sendBuff[2 * j + 1], getOffset + i + 2 * j + 1); - // *ret = 1; - // break; - // } } } // Make sure all threads are done in this iteration @@ -282,57 +331,59 @@ __global__ void kernelSmPacket64PingPong(int* buff, int rank, int nElem, int* re } } -TEST_F(SmChannelOneToOneTest, Packet64PingPong) { - if (gEnv->rank >= numRanksToUse) return; - - const int nElem = 4 * 1024 * 1024; - - std::vector smChannels; - std::shared_ptr buff = mscclpp::allocExtSharedCuda(nElem); - std::shared_ptr intermBuff = mscclpp::allocExtSharedCuda(nElem * 2); - setupMeshConnections(smChannels, buff.get(), nElem * sizeof(int), intermBuff.get(), nElem * 2 * sizeof(int)); - std::vector> deviceHandles(smChannels.size()); - std::transform(smChannels.begin(), smChannels.end(), deviceHandles.begin(), - [](const mscclpp::SmChannel& smChan) { return mscclpp::deviceHandle(smChan); }); - - ASSERT_EQ(smChannels.size(), 1); - MSCCLPP_CUDATHROW(cudaMemcpyToSymbol(gChannelOneToOneTestConstSmChans, deviceHandles.data(), - sizeof(DeviceHandle))); - - std::shared_ptr ret = mscclpp::makeSharedCudaHost(0); - - // The least nelem is 2 for packet ping pong - kernelSmPacket64PingPong<<<1, 1024>>>(buff.get(), gEnv->rank, 2, ret.get()); - MSCCLPP_CUDATHROW(cudaDeviceSynchronize()); - *ret = 0; - - kernelSmPacket64PingPong<<<1, 1024>>>(buff.get(), gEnv->rank, 1024, ret.get()); - MSCCLPP_CUDATHROW(cudaDeviceSynchronize()); - - EXPECT_EQ(*ret, 0); - *ret = 0; - - kernelSmPacket64PingPong<<<1, 1024>>>(buff.get(), gEnv->rank, 1024 * 1024, ret.get()); - MSCCLPP_CUDATHROW(cudaDeviceSynchronize()); - - EXPECT_EQ(*ret, 0); - *ret = 0; - - kernelSmPacket64PingPong<<<1, 1024>>>(buff.get(), gEnv->rank, 4 * 1024 * 1024, ret.get()); - MSCCLPP_CUDATHROW(cudaDeviceSynchronize()); +__global__ void kernelSmPacketPingPong(int* buff, int rank, int nElem, int* ret, int nTries) { + if (rank > 1) return; - EXPECT_EQ(*ret, 0); - *ret = 0; + DeviceHandle& smChan = gChannelOneToOneTestConstSmChans; + volatile int* sendBuff = (volatile int*)buff; + int putOffset = (rank == 0) ? 0 : 10000000; + int getOffset = (rank == 0) ? 10000000 : 0; + for (int i = 0; i < nTries; i++) { + uint64_t flag = (uint64_t)i + 1; + // rank=0: 0, 1, 0, 1, ... + // rank=1: 1, 0, 1, 0, ... + if ((rank ^ (i & 1)) == 0) { + // If each thread writes 8 bytes at once, we don't need a barrier before putPackets(). 
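+ // (Each thread fills exactly the 8-byte span that it will pack itself in putPackets(), so no cross-thread visibility is required.)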
+ for (int j = threadIdx.x; j < nElem / 2; j += blockDim.x) { + sendBuff[2 * j] = putOffset + i + 2 * j; + sendBuff[2 * j + 1] = putOffset + i + 2 * j + 1; + } + // __syncthreads(); + smChan.putPackets(0, 0, nElem * sizeof(int), threadIdx.x, blockDim.x, flag); + } else { + smChan.getPackets(0, 0, nElem * sizeof(int), threadIdx.x, blockDim.x, flag); + // If each thread reads 8 bytes at once, we don't need a barrier after getPackets(). + // __syncthreads(); + for (int j = threadIdx.x; j < nElem / 2; j += blockDim.x) { + if (sendBuff[2 * j] != getOffset + i + 2 * j) { + // printf("ERROR: rank = %d, sendBuff[%d] = %d, expected %d. Skipping following errors\n", rank, 2 * j, + // sendBuff[2 * j], getOffset + i + 2 * j); + *ret = 1; + break; + } + if (sendBuff[2 * j + 1] != getOffset + i + 2 * j + 1) { + // printf("ERROR: rank = %d, sendBuff[%d] = %d, expected %d. Skipping following errors\n", rank, 2 * j + 1, + // sendBuff[2 * j + 1], getOffset + i + 2 * j + 1); + *ret = 1; + break; + } + } + } + // Make sure all threads are done in this iteration + __syncthreads(); + } +} - int nTries = 1000000; - communicator->bootstrap()->barrier(); - mscclpp::Timer timer; - kernelSmPacket64PingPong<<<1, 1024>>>(buff.get(), gEnv->rank, 1024, ret.get(), nTries); - MSCCLPP_CUDATHROW(cudaDeviceSynchronize()); - communicator->bootstrap()->barrier(); +TEST_F(SmChannelOneToOneTest, Packet64PingPong) { + auto kernelSmPacket64PingPongWrapper = [](int* buff, int rank, int nElem, int* ret, int nTries) { + kernelSmPacket64PingPong<<<1, 1024>>>(buff, rank, nElem, ret, nTries); + }; + packetPingPongTest("smPacket64PingPong", kernelSmPacket64PingPongWrapper); +} - if (gEnv->rank == 0) { - std::cout << "smPacket64PingPong" - << ": " << std::setprecision(4) << (float)timer.elapsed() / (float)(nTries) << " us/iter\n"; - } +TEST_F(SmChannelOneToOneTest, PacketPingPong) { + auto kernelSmPacketPingPongWrapper = [](int* buff, int rank, int nElem, int* ret, int nTries) { + kernelSmPacketPingPong<<<1, 1024>>>(buff, rank, nElem, ret, nTries); + }; + packetPingPongTest("smPacketPingPong", kernelSmPacketPingPongWrapper); } From 80413a024899b222565f945c141e62516e302260 Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Mon, 5 Feb 2024 02:33:18 +0000 Subject: [PATCH 18/89] minor update --- test/mp_unit/sm_channel_tests.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/mp_unit/sm_channel_tests.cu b/test/mp_unit/sm_channel_tests.cu index 5870c7987..5f607248c 100644 --- a/test/mp_unit/sm_channel_tests.cu +++ b/test/mp_unit/sm_channel_tests.cu @@ -293,7 +293,7 @@ TEST_F(SmChannelOneToOneTest, GetPingPong) { EXPECT_EQ(*ret, 0); } -__global__ void kernelSmPacket64PingPong(int* buff, int rank, int nElem, int* ret, int nTries = 1000) { +__global__ void kernelSmPacket64PingPong(int* buff, int rank, int nElem, int* ret, int nTries) { if (rank > 1) return; DeviceHandle& smChan = gChannelOneToOneTestConstSmChans; From eef3c275286f4eeb64e4068bf6f28fb38330ca28 Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Mon, 5 Feb 2024 05:02:03 +0000 Subject: [PATCH 19/89] cleanup --- test/mscclpp-test/allgather_test.cu | 12 ------------ test/mscclpp-test/allreduce_test.cu | 1 - 2 files changed, 13 deletions(-) diff --git a/test/mscclpp-test/allgather_test.cu b/test/mscclpp-test/allgather_test.cu index e8915aebe..5c101bbd2 100644 --- a/test/mscclpp-test/allgather_test.cu +++ b/test/mscclpp-test/allgather_test.cu @@ -425,23 +425,12 @@ __global__ void __launch_bounds__(1024, 1) const size_t nThread = blockDim.x * nBlock; const size_t nWarp = 
nThread / WARP_SIZE;
   const size_t nPeer = nRanksPerNode - 1;
-  // const size_t chanOffset = nPeer * blockIdx.x;
   auto smChans = constSmOutOfPlaceChans;
   const uint32_t flag = (uint32_t)globalFlag;
 
-  // if (wid < nPeer && lid == 0) {
-  //   smChans[wid].relaxedSignal();
-  //   smChans[wid].wait();
-  // }
-  // __syncthreads();
   const size_t bytesPerGPU = nelemsPerGPU * sizeof(int);
   const size_t bytes = bytesPerGPU * nPeer;
   size_t unitBytesPerThread = 8;
-  // if (bytes >= nThread * 64) {
-  //   unitBytesPerThread = 64;
-  // } else {
-  //   unitBytesPerThread = 16;
-  // }
   const size_t unitBytesPerWarp = unitBytesPerThread * WARP_SIZE;
   const size_t unitBytes = unitBytesPerWarp * nWarp;
   const size_t nLoop = bytes / unitBytes;
@@ -452,7 +441,6 @@
   if (nLoop > 0) {
     // First loop unrolling
     const size_t peerIdx = wid % nPeer;
-    // const size_t remoteRankLocalIndex = (peerIdx < rank ? peerIdx : peerIdx + 1);
     const size_t offset = bytesPerGPU * rank + (wid / nPeer) * unitBytesPerWarp;
     smChans[peerIdx].putPackets(scratchOffset + offset * 2, offset, unitBytesPerWarp, lid, WARP_SIZE, flag);
   }
diff --git a/test/mscclpp-test/allreduce_test.cu b/test/mscclpp-test/allreduce_test.cu
index dc17a4e11..577539037 100644
--- a/test/mscclpp-test/allreduce_test.cu
+++ b/test/mscclpp-test/allreduce_test.cu
@@ -1141,7 +1141,6 @@ void AllReduceTestColl::runColl(const TestArgs& args, cudaStream_t stream) {
     nBlocks = 28;
     tmpBuff = scratchPacketBuff;
     nThreadsPerBlock = 1024;
-  } else {
     nBlocks = std::max(args.nRanksPerNode - 1, 1) * BLOCKS_PER_PEER;
     tmpBuff = scratchPacketBuff;

From 98334b840be85ae4aa6487c3bee095837536297a Mon Sep 17 00:00:00 2001
From: Changho Hwang
Date: Mon, 5 Feb 2024 20:17:15 +0000
Subject: [PATCH 20/89] fix wrong offset calculation

---
 src/connection.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/connection.cc b/src/connection.cc
index 4d719f3b2..65b76b33f 100644
--- a/src/connection.cc
+++ b/src/connection.cc
@@ -72,9 +72,9 @@ void CudaIpcConnection::updateAndSync(RegisteredMemory dst, uint64_t dstOffset,
   validateTransport(dst, remoteTransport());
   uint64_t oldValue = *src;
   *src = newValue;
-  uint64_t* dstPtr = (uint64_t*)dst.data();
+  uint64_t* dstPtr = reinterpret_cast<uint64_t*>(reinterpret_cast<char*>(dst.data()) + dstOffset);
 
-  MSCCLPP_CUDATHROW(cudaMemcpyAsync(dstPtr + dstOffset, src, sizeof(uint64_t), cudaMemcpyHostToDevice, stream_));
+  MSCCLPP_CUDATHROW(cudaMemcpyAsync(dstPtr, src, sizeof(uint64_t), cudaMemcpyHostToDevice, stream_));
 
   INFO(MSCCLPP_P2P, "CudaIpcConnection atomic write: from %p to %p, %lu -> %lu", src, dstPtr + dstOffset, oldValue,
        newValue);

From 71f6c85c3f50162cfc8b16aff974f20a344d78ab Mon Sep 17 00:00:00 2001
From: Changho Hwang
Date: Mon, 5 Feb 2024 21:31:32 +0000
Subject: [PATCH 21/89] Do not compile nvls_test with ROCm

---
 include/mscclpp/nvls_device.hpp |  4 ++--
 test/nvls_test.cu               | 16 +++++++++++++---
 2 files changed, 15 insertions(+), 5 deletions(-)

diff --git a/include/mscclpp/nvls_device.hpp b/include/mscclpp/nvls_device.hpp
index 52ade275d..edaf2e256 100644
--- a/include/mscclpp/nvls_device.hpp
+++ b/include/mscclpp/nvls_device.hpp
@@ -20,7 +20,7 @@ struct DeviceMulticastPointerDeviceHandle {
   void* mcPtr;
   size_t bufferSize;
 
-#if defined(MSCCLPP_DEVICE_COMPILE)
+#if defined(MSCCLPP_DEVICE_CUDA)
  template <int NElemPerThread = 4, typename TVaule = float4, typename T = float>
  MSCCLPP_DEVICE_INLINE void multimemLoad(TVaule& val, T* ptr) {
    static_assert(NElemPerThread == 4, "Only support NElemPerThread == 4");
@@ -54,7 +54,7 @@ struct DeviceMulticastPointerDeviceHandle { 
static_assert(dependentFalse, "Not supported type"); } }; -#endif +#endif // defined(MSCCLPP_DEVICE_CUDA) }; } // namespace mscclpp diff --git a/test/nvls_test.cu b/test/nvls_test.cu index e01b4d790..55ece3fcf 100644 --- a/test/nvls_test.cu +++ b/test/nvls_test.cu @@ -1,11 +1,13 @@ // Copyright (c) Microsoft Corporation. // Licensed under the MIT license. +#include + +#if (USE_NVLS) #include #include #include #include -#include #include #include #include @@ -71,7 +73,6 @@ __global__ void testing(float* mc_ptr, int size, int myrank, int nranks) { } int main() { -#if (USE_NVLS) int myrank, nranks; MPI_Init(NULL, NULL); MPI_Comm_rank(MPI_COMM_WORLD, &myrank); @@ -199,5 +200,14 @@ int main() { } MPI_Barrier(MPI_COMM_WORLD); MPI_Finalize(); -#endif // (USE_NVLS) + return 0; } + +#else // !(USE_NVLS) + +int main() { + printf("This test requires NVLS to be enabled\n"); + return 0; +} + +#endif // !(USE_NVLS) From 5eb35cd45bbe0c2b26e1a6c647ec92f6e2acd86a Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Mon, 5 Feb 2024 22:23:42 +0000 Subject: [PATCH 22/89] MSRCHA-371 workaround --- include/mscclpp/semaphore_device.hpp | 12 ++++++++++-- src/semaphore.cc | 3 ++- test/mp_unit/proxy_channel_tests.cu | 5 +++++ 3 files changed, 17 insertions(+), 3 deletions(-) diff --git a/include/mscclpp/semaphore_device.hpp b/include/mscclpp/semaphore_device.hpp index cd455078a..088f27bf3 100644 --- a/include/mscclpp/semaphore_device.hpp +++ b/include/mscclpp/semaphore_device.hpp @@ -20,14 +20,22 @@ struct Host2DeviceSemaphoreDeviceHandle { /// @return true if the host has signaled. MSCCLPP_DEVICE_INLINE bool poll() { bool signaled = (atomicLoad(inboundSemaphoreId, memoryOrderAcquire) > (*expectedInboundSemaphoreId)); - if (signaled) (*expectedInboundSemaphoreId) += 1; + if (signaled) { + (*expectedInboundSemaphoreId) += 1; + } else { + // TODO: MSRCHA-371 + atomicStore(&inboundSemaphoreId[1], uint64_t{0}, memoryOrderRelaxed); + } return signaled; } /// Wait for the host to signal. MSCCLPP_DEVICE_INLINE void wait(int64_t maxSpinCount = 100000000) { (*expectedInboundSemaphoreId) += 1; - POLL_MAYBE_JAILBREAK((atomicLoad(inboundSemaphoreId, memoryOrderAcquire) < (*expectedInboundSemaphoreId)), + // TODO: MSRCHA-371 + POLL_MAYBE_JAILBREAK((atomicLoad(inboundSemaphoreId, memoryOrderAcquire) < (*expectedInboundSemaphoreId)) + ? 
(atomicStore(&inboundSemaphoreId[1], uint64_t{0}, memoryOrderRelaxed), true)
+                             : false,
                          maxSpinCount);
   }
 #endif  // defined(MSCCLPP_DEVICE_COMPILE)
diff --git a/src/semaphore.cc b/src/semaphore.cc
index 7dec60c3d..74aaaf485 100644
--- a/src/semaphore.cc
+++ b/src/semaphore.cc
@@ -11,8 +11,9 @@ namespace mscclpp {
 
 static NonblockingFuture<RegisteredMemory> setupInboundSemaphoreId(Communicator& communicator, Connection* connection,
                                                                    void* localInboundSemaphoreId) {
+  // TODO: MSRCHA-371
   auto localInboundSemaphoreIdsRegMem =
-      communicator.registerMemory(localInboundSemaphoreId, sizeof(uint64_t), connection->transport());
+      communicator.registerMemory(localInboundSemaphoreId, sizeof(uint64_t) * 2, connection->transport());
   int remoteRank = communicator.remoteRankOf(*connection);
   int tag = communicator.tagOf(*connection);
   communicator.sendMemoryOnSetup(localInboundSemaphoreIdsRegMem, remoteRank, tag);
diff --git a/test/mp_unit/proxy_channel_tests.cu b/test/mp_unit/proxy_channel_tests.cu
index 796a565d4..2c276619e 100644
--- a/test/mp_unit/proxy_channel_tests.cu
+++ b/test/mp_unit/proxy_channel_tests.cu
@@ -435,6 +435,9 @@ void ProxyChannelOneToOneTest::testPacketPingPongPerf(bool useIbOnly) {
   proxyService->stopProxy();
 }
 
+// TODO: MSRCHA-371
+#if defined(MSCCLPP_DEVICE_CUDA)
+
 TEST_F(ProxyChannelOneToOneTest, PacketPingPong) { testPacketPingPong(false); }
 
 TEST_F(ProxyChannelOneToOneTest, PacketPingPongIb) { testPacketPingPong(true); }
@@ -442,3 +445,5 @@ TEST_F(ProxyChannelOneToOneTest, PacketPingPongIb) { testPacketPingPong(true); }
 TEST_F(ProxyChannelOneToOneTest, PacketPingPongPerf) { testPacketPingPongPerf(false); }
 
 TEST_F(ProxyChannelOneToOneTest, PacketPingPongPerfIb) { testPacketPingPongPerf(true); }
+
+#endif  // defined(MSCCLPP_DEVICE_CUDA)

From 58eb956b25bc6b552595847f697d535d39f4af39 Mon Sep 17 00:00:00 2001
From: Changho Hwang
Date: Tue, 6 Feb 2024 22:16:51 +0000
Subject: [PATCH 23/89] fix multi-node connection

---
 python/mscclpp/comm.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/python/mscclpp/comm.py b/python/mscclpp/comm.py
index d84410668..918cd97fb 100644
--- a/python/mscclpp/comm.py
+++ b/python/mscclpp/comm.py
@@ -86,15 +86,18 @@ def make_connection(
     ) -> dict[int, Connection]:
         if type(endpoints) is Transport:
             endpoints = EndpointConfig(endpoints)
-        if endpoints.transport == Transport.Nvls:
-            return self.communicator.connct_nvls_collective(all_ranks, endpoints)
+        elif type(endpoints) is dict:
+            endpoints = {k: EndpointConfig(v) if type(v) is Transport else v for k, v in endpoints.items()}
         connections = {}
         for rank in all_ranks:
             if type(endpoints) is dict:
                 endpoint = endpoints[rank]
             else:
                 endpoint = endpoints
-            connections[rank] = self.communicator.connect_on_setup(rank, 0, endpoint)
+            if endpoint.transport == Transport.Nvls:
+                connections[rank] = self.communicator.connct_nvls_collective(all_ranks, endpoint)
+            else:
+                connections[rank] = self.communicator.connect_on_setup(rank, 0, endpoint)
         self.communicator.setup()
         connections = {rank: connections[rank].get() for rank in connections}
         return connections

From b841e91341203b3e791abe6e5ee98a37e06c5418 Mon Sep 17 00:00:00 2001
From: Changho Hwang
Date: Tue, 6 Feb 2024 22:58:31 +0000
Subject: [PATCH 24/89] wip

---
 include/mscclpp/core.hpp   |  12 +-
 include/mscclpp/nccl.h     | 475 +++++++++++++++++++++++++++++++++++++
 src/bootstrap/bootstrap.cc |   8 +-
 src/nccl/nccl.cc           |  33 +++
 4 files changed, 517 insertions(+), 11 deletions(-)
 create mode 100644 include/mscclpp/nccl.h
 create mode 100644 src/nccl/nccl.cc

diff --git a/include/mscclpp/core.hpp b/include/mscclpp/core.hpp
index 02c277a3e..143b4c8dc 100644
--- a/include/mscclpp/core.hpp
+++ b/include/mscclpp/core.hpp
@@ -52,9 +52,7 @@ class Bootstrap {
 class TcpBootstrap : public Bootstrap {
  public:
   /// Constructor.
-  /// @param rank The rank of the process.
-  /// @param nRanks The total number of ranks.
-  TcpBootstrap(int rank, int nRanks);
+  TcpBootstrap();
 
   /// Destructor.
   ~TcpBootstrap();
@@ -69,13 +67,17 @@ class TcpBootstrap : public Bootstrap {
 
   /// Initialize the @ref TcpBootstrap with a given unique ID.
   /// @param uniqueId The unique ID to initialize the @ref TcpBootstrap with.
+  /// @param rank The rank of the process.
+  /// @param nRanks The total number of ranks.
   /// @param timeoutSec The connection timeout in seconds.
-  void initialize(UniqueId uniqueId, int64_t timeoutSec = 30);
+  void initialize(UniqueId uniqueId, int rank, int nRanks, int64_t timeoutSec = 30);
 
   /// Initialize the @ref TcpBootstrap with a string formatted as "ip:port" or "interface:ip:port".
   /// @param ifIpPortTrio The string formatted as "ip:port" or "interface:ip:port".
+  /// @param rank The rank of the process.
+  /// @param nRanks The total number of ranks.
   /// @param timeoutSec The connection timeout in seconds.
-  void initialize(const std::string& ifIpPortTrio, int64_t timeoutSec = 30);
+  void initialize(const std::string& ifIpPortTrio, int rank, int nRanks, int64_t timeoutSec = 30);
 
   /// Return the rank of the process.
   int getRank() override;
diff --git a/include/mscclpp/nccl.h b/include/mscclpp/nccl.h
new file mode 100644
index 000000000..ac852f9af
--- /dev/null
+++ b/include/mscclpp/nccl.h
@@ -0,0 +1,475 @@
+/*************************************************************************
+ * Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) Microsoft Corporation. Licensed under the MIT License.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NCCL_H_
+#define NCCL_H_
+
+#include <cuda_runtime.h>
+#include <cuda_fp16.h>
+#if CUDART_VERSION >= 11000
+#include <cuda_bf16.h>
+#endif
+#if CUDART_VERSION >= 11080
+#include <cuda_fp8.h>
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <limits.h>
+/* Opaque handle to communicator */
+typedef struct ncclComm* ncclComm_t;
+#define NCCL_COMM_NULL NULL
+
+#define NCCL_UNIQUE_ID_BYTES 128
+typedef struct { char internal[NCCL_UNIQUE_ID_BYTES]; } ncclUniqueId;
+
+/* Error type */
+typedef enum { ncclSuccess            = 0,
+               ncclUnhandledCudaError = 1,
+               ncclSystemError        = 2,
+               ncclInternalError      = 3,
+               ncclInvalidArgument    = 4,
+               ncclInvalidUsage       = 5,
+               ncclRemoteError        = 6,
+               ncclInProgress         = 7,
+               ncclNumResults         = 8 } ncclResult_t;
+
+#define NCCL_CONFIG_UNDEF_INT INT_MIN
+#define NCCL_CONFIG_UNDEF_PTR NULL
+#define NCCL_SPLIT_NOCOLOR -1
+
+/* Communicator configuration. Users can assign value to attributes to specify the
+ * behavior of a communicator. */
+typedef struct ncclConfig_v21700 {
+  /* attributes that users should never touch. */
+  size_t size;
+  unsigned int magic;
+  unsigned int version;
+  /* attributes that users are able to customize. */
+  int blocking;
+  int cgaClusterSize;
+  int minCTAs;
+  int maxCTAs;
+  const char *netName;
+  int splitShare;
+} ncclConfig_t;
+
+/* Config initializer must be assigned to initialize config structure when it is created.
+ * Not initialized config will result in NCCL error. 
*/ +#define NCCL_CONFIG_INITIALIZER { \ + sizeof(ncclConfig_t), /* size */ \ + 0xcafebeef, /* magic */ \ + NCCL_VERSION(NCCL_MAJOR, NCCL_MINOR, NCCL_PATCH), /* version */ \ + NCCL_CONFIG_UNDEF_INT, /* blocking */ \ + NCCL_CONFIG_UNDEF_INT, /* cgaClusterSize */ \ + NCCL_CONFIG_UNDEF_INT, /* minCTAs */ \ + NCCL_CONFIG_UNDEF_INT, /* maxCTAs */ \ + NCCL_CONFIG_UNDEF_PTR, /* netName */ \ + NCCL_CONFIG_UNDEF_INT /* splitShare */ \ +} + +/* Return the NCCL_VERSION_CODE of the NCCL library in the supplied integer. + * This integer is coded with the MAJOR, MINOR and PATCH level of the + * NCCL library + */ +ncclResult_t ncclGetVersion(int *version); +ncclResult_t pncclGetVersion(int *version); + +/* Generates an Id to be used in ncclCommInitRank. ncclGetUniqueId should be + * called once and the Id should be distributed to all ranks in the + * communicator before calling ncclCommInitRank. */ +ncclResult_t ncclGetUniqueId(ncclUniqueId* uniqueId); +ncclResult_t pncclGetUniqueId(ncclUniqueId* uniqueId); + +/* Create a new communicator (multi thread/process version) with a configuration + * set by users. */ +ncclResult_t ncclCommInitRankConfig(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank, ncclConfig_t* config); +ncclResult_t pncclCommInitRankConfig(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank, ncclConfig_t* config); + +/* Creates a new communicator (multi thread/process version). + * rank must be between 0 and nranks-1 and unique within a communicator clique. + * Each rank is associated to a CUDA device, which has to be set before calling + * ncclCommInitRank. + * ncclCommInitRank implicitly syncronizes with other ranks, so it must be + * called by different threads/processes or use ncclGroupStart/ncclGroupEnd. */ +ncclResult_t ncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank); +ncclResult_t pncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank); + +/* Creates a clique of communicators (single process version). + * This is a convenience function to create a single-process communicator clique. + * Returns an array of ndev newly initialized communicators in comm. + * comm should be pre-allocated with size at least ndev*sizeof(ncclComm_t). + * If devlist is NULL, the first ndev CUDA devices are used. + * Order of devlist defines user-order of processors within the communicator. */ +ncclResult_t ncclCommInitAll(ncclComm_t* comm, int ndev, const int* devlist); +ncclResult_t pncclCommInitAll(ncclComm_t* comm, int ndev, const int* devlist); + +/* Finalize a communicator. ncclCommFinalize flushes all issued communications, + * and marks communicator state as ncclInProgress. The state will change to ncclSuccess + * when the communicator is globally quiescent and related resources are freed; then, + * calling ncclCommDestroy can locally free the rest of the resources (e.g. communicator + * itself) without blocking. */ +ncclResult_t ncclCommFinalize(ncclComm_t comm); +ncclResult_t pncclCommFinalize(ncclComm_t comm); + +/* Frees local resources associated with communicator object. */ +ncclResult_t ncclCommDestroy(ncclComm_t comm); +ncclResult_t pncclCommDestroy(ncclComm_t comm); + +/* Frees resources associated with communicator object and aborts any operations + * that might still be running on the device. */ +ncclResult_t ncclCommAbort(ncclComm_t comm); +ncclResult_t pncclCommAbort(ncclComm_t comm); + +/* Creates one or more communicators from an existing one. + * Ranks with the same color will end up in the same communicator. 
+ * Within the new communicator, key will be used to order ranks. + * NCCL_SPLIT_NOCOLOR as color will indicate the rank will not be part of any group + * and will therefore return a NULL communicator. + * If config is NULL, the new communicator will inherit the original communicator's + * configuration*/ +ncclResult_t ncclCommSplit(ncclComm_t comm, int color, int key, ncclComm_t *newcomm, ncclConfig_t* config); +ncclResult_t pncclCommSplit(ncclComm_t comm, int color, int key, ncclComm_t *newcomm, ncclConfig_t* config); + +/* Returns a string for each error code. */ +const char* ncclGetErrorString(ncclResult_t result); +const char* pncclGetErrorString(ncclResult_t result); + +/* Returns a human-readable message of the last error that occurred. + * comm is currently unused and can be set to NULL + */ +const char* ncclGetLastError(ncclComm_t comm); +const char* pncclGetLastError(ncclComm_t comm); + +/* Checks whether the comm has encountered any asynchronous errors */ +ncclResult_t ncclCommGetAsyncError(ncclComm_t comm, ncclResult_t *asyncError); +ncclResult_t pncclCommGetAsyncError(ncclComm_t comm, ncclResult_t *asyncError); + +/* Gets the number of ranks in the communicator clique. */ +ncclResult_t ncclCommCount(const ncclComm_t comm, int* count); +ncclResult_t pncclCommCount(const ncclComm_t comm, int* count); + +/* Returns the cuda device number associated with the communicator. */ +ncclResult_t ncclCommCuDevice(const ncclComm_t comm, int* device); +ncclResult_t pncclCommCuDevice(const ncclComm_t comm, int* device); + +/* Returns the user-ordered "rank" associated with the communicator. */ +ncclResult_t ncclCommUserRank(const ncclComm_t comm, int* rank); +ncclResult_t pncclCommUserRank(const ncclComm_t comm, int* rank); + +/* Reduction operation selector */ +typedef enum { ncclNumOps_dummy = 5 } ncclRedOp_dummy_t; +typedef enum { ncclSum = 0, + ncclProd = 1, + ncclMax = 2, + ncclMin = 3, + ncclAvg = 4, + /* ncclNumOps: The number of built-in ncclRedOp_t values. Also + * serves as the least possible value for dynamic ncclRedOp_t's + * as constructed by ncclRedOpCreate*** functions. */ + ncclNumOps = 5, + /* ncclMaxRedOp: The largest valid value for ncclRedOp_t. + * It is defined to be the largest signed value (since compilers + * are permitted to use signed enums) that won't grow + * sizeof(ncclRedOp_t) when compared to previous NCCL versions to + * maintain ABI compatibility. */ + ncclMaxRedOp = 0x7fffffff>>(32-8*sizeof(ncclRedOp_dummy_t)) + } ncclRedOp_t; + +/* Data types */ +typedef enum { ncclInt8 = 0, ncclChar = 0, + ncclUint8 = 1, + ncclInt32 = 2, ncclInt = 2, + ncclUint32 = 3, + ncclInt64 = 4, + ncclUint64 = 5, + ncclFloat16 = 6, ncclHalf = 6, + ncclFloat32 = 7, ncclFloat = 7, + ncclFloat64 = 8, ncclDouble = 8, +#if defined(__CUDA_BF16_TYPES_EXIST__) && defined(__CUDA_FP8_TYPES_EXIST__) + ncclBfloat16 = 9, + ncclFp8E4M3 = 10, + ncclFp8E5M2 = 11, + ncclNumTypes = 12 +#elif defined(__CUDA_BF16_TYPES_EXIST__) + ncclBfloat16 = 9, + ncclNumTypes = 10 +#else + ncclNumTypes = 9 +#endif +} ncclDataType_t; + +/* ncclScalarResidence_t: Location and dereferencing logic for scalar arguments. */ +typedef enum { + /* ncclScalarDevice: The scalar is in device-visible memory and will be + * dereferenced while the collective is running. */ + ncclScalarDevice = 0, + + /* ncclScalarHostImmediate: The scalar is in host-visible memory and will be + * dereferenced before the ncclRedOpCreate***() function returns. 
*/ + ncclScalarHostImmediate = 1 +} ncclScalarResidence_t; + +/* + * ncclRedOpCreatePreMulSum + * + * Creates a new reduction operator which pre-multiplies input values by a given + * scalar locally before reducing them with peer values via summation. For use + * only with collectives launched against *comm* and *datatype*. The + * *residence* argument indicates how/when the memory pointed to by *scalar* + * will be dereferenced. Upon return, the newly created operator's handle + * is stored in *op*. + */ +ncclResult_t ncclRedOpCreatePreMulSum(ncclRedOp_t *op, void *scalar, ncclDataType_t datatype, ncclScalarResidence_t residence, ncclComm_t comm); +ncclResult_t pncclRedOpCreatePreMulSum(ncclRedOp_t *op, void *scalar, ncclDataType_t datatype, ncclScalarResidence_t residence, ncclComm_t comm); + +/* + * ncclRedOpDestroy + * + * Destroys the reduction operator *op*. The operator must have been created by + * ncclRedOpCreatePreMul with the matching communicator *comm*. An operator may be + * destroyed as soon as the last NCCL function which is given that operator returns. + */ +ncclResult_t ncclRedOpDestroy(ncclRedOp_t op, ncclComm_t comm); +ncclResult_t pncclRedOpDestroy(ncclRedOp_t op, ncclComm_t comm); + +/* + * Collective communication operations + * + * Collective communication operations must be called separately for each + * communicator in a communicator clique. + * + * They return when operations have been enqueued on the CUDA stream. + * + * Since they may perform inter-CPU synchronization, each call has to be done + * from a different thread or process, or need to use Group Semantics (see + * below). + */ + +/* + * Reduce + * + * Reduces data arrays of length count in sendbuff into recvbuff using op + * operation. + * recvbuff may be NULL on all calls except for root device. + * root is the rank (not the CUDA device) where data will reside after the + * operation is complete. + * + * In-place operation will happen if sendbuff == recvbuff. + */ +ncclResult_t ncclReduce(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, + ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream); +ncclResult_t pncclReduce(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, + ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream); + +/* + * (deprecated) Broadcast (in-place) + * + * Copies count values from root to all other devices. + * root is the rank (not the CUDA device) where data resides before the + * operation is started. + * + * This operation is implicitely in place. + */ +ncclResult_t ncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root, + ncclComm_t comm, cudaStream_t stream); +ncclResult_t pncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root, + ncclComm_t comm, cudaStream_t stream); + +/* + * Broadcast + * + * Copies count values from root to all other devices. + * root is the rank (not the CUDA device) where data resides before the + * operation is started. + * + * In-place operation will happen if sendbuff == recvbuff. + */ +ncclResult_t ncclBroadcast(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root, + ncclComm_t comm, cudaStream_t stream); +ncclResult_t pncclBroadcast(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root, + ncclComm_t comm, cudaStream_t stream); + +/* + * All-Reduce + * + * Reduces data arrays of length count in sendbuff using op operation, and + * leaves identical copies of result on each recvbuff. 
+ * + * In-place operation will happen if sendbuff == recvbuff. + */ +ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count, + ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream); +ncclResult_t pncclAllReduce(const void* sendbuff, void* recvbuff, size_t count, + ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream); + +/* + * Reduce-Scatter + * + * Reduces data in sendbuff using op operation and leaves reduced result + * scattered over the devices so that recvbuff on rank i will contain the i-th + * block of the result. + * Assumes sendcount is equal to nranks*recvcount, which means that sendbuff + * should have a size of at least nranks*recvcount elements. + * + * In-place operations will happen if recvbuff == sendbuff + rank * recvcount. + */ +ncclResult_t ncclReduceScatter(const void* sendbuff, void* recvbuff, + size_t recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, + cudaStream_t stream); +ncclResult_t pncclReduceScatter(const void* sendbuff, void* recvbuff, + size_t recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, + cudaStream_t stream); + +/* + * All-Gather + * + * Each device gathers sendcount values from other GPUs into recvbuff, + * receiving data from rank i at offset i*sendcount. + * Assumes recvcount is equal to nranks*sendcount, which means that recvbuff + * should have a size of at least nranks*sendcount elements. + * + * In-place operations will happen if sendbuff == recvbuff + rank * sendcount. + */ +ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount, + ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream); +ncclResult_t pncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount, + ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream); + +/* + * Send + * + * Send data from sendbuff to rank peer. + * + * Rank peer needs to call ncclRecv with the same datatype and the same count from this + * rank. + * + * This operation is blocking for the GPU. If multiple ncclSend and ncclRecv operations + * need to progress concurrently to complete, they must be fused within a ncclGroupStart/ + * ncclGroupEnd section. + */ +ncclResult_t ncclSend(const void* sendbuff, size_t count, ncclDataType_t datatype, int peer, + ncclComm_t comm, cudaStream_t stream); +ncclResult_t pncclSend(const void* sendbuff, size_t count, ncclDataType_t datatype, int peer, + ncclComm_t comm, cudaStream_t stream); + +/* + * Receive + * + * Receive data from rank peer into recvbuff. + * + * Rank peer needs to call ncclSend with the same datatype and the same count to this + * rank. + * + * This operation is blocking for the GPU. If multiple ncclSend and ncclRecv operations + * need to progress concurrently to complete, they must be fused within a ncclGroupStart/ + * ncclGroupEnd section. + */ +ncclResult_t pncclRecv(void* recvbuff, size_t count, ncclDataType_t datatype, int peer, + ncclComm_t comm, cudaStream_t stream); +ncclResult_t ncclRecv(void* recvbuff, size_t count, ncclDataType_t datatype, int peer, + ncclComm_t comm, cudaStream_t stream); + +/* All-To-All + * + * Device (i) send (j)th block of data to device (j) and be placed as (i)th + * block. Each block for sending/receiving has count elements, which means + * that recvbuff and sendbuff should have a size of nranks*count elements. + * + * In-place operation will happen if sendbuff == recvbuff. 
+ */ +ncclResult_t ncclAllToAll(const void* sendbuff, void* recvbuff, size_t count, + ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream); +ncclResult_t pncclAllToAll(const void* sendbuff, void* recvbuff, size_t count, + ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream); +/*! @brief Opaque handle to MSCCL algorithm */ +typedef int mscclAlgoHandle_t; + +/*! @brief MSCCL Load Algorithm + * + * @details Load MSCCL algorithm file specified in mscclAlgoFilePath and return + * its handle via mscclAlgoHandle. This API is expected to be called by MSCCL + * scheduler instead of end users. + */ +ncclResult_t mscclLoadAlgo(const char *mscclAlgoFilePath, mscclAlgoHandle_t *mscclAlgoHandle, int rank); +ncclResult_t pmscclLoadAlgo(const char *mscclAlgoFilePath, mscclAlgoHandle_t *mscclAlgoHandle, int rank); + +/*! @brief MSCCL Run Algorithm + * + * @details Run MSCCL algorithm specified by mscclAlgoHandle. The parameter + * list merges all possible parameters required by different operations as this + * is a general-purposed API. This API is expected to be called by MSCCL + * scheduler instead of end users. + */ +ncclResult_t mscclRunAlgo( + const void* sendBuff, const size_t sendCounts[], const size_t sDisPls[], + void* recvBuff, const size_t recvCounts[], const size_t rDisPls[], + size_t count, ncclDataType_t dataType, int root, int peer, ncclRedOp_t op, + mscclAlgoHandle_t mscclAlgoHandle, ncclComm_t comm, cudaStream_t stream); +ncclResult_t pmscclRunAlgo( + const void* sendBuff, const size_t sendCounts[], const size_t sDisPls[], + void* recvBuff, const size_t recvCounts[], const size_t rDisPls[], + size_t count, ncclDataType_t dataType, int root, int peer, ncclRedOp_t op, + mscclAlgoHandle_t mscclAlgoHandle, ncclComm_t comm, cudaStream_t stream); + +/*! @brief MSCCL Load Algorithm + * + * @details Unload MSCCL algorithm previous loaded using its handle. This API + * is expected to be called by MSCCL scheduler instead of end users. + */ +ncclResult_t mscclUnloadAlgo(mscclAlgoHandle_t mscclAlgoHandle); +ncclResult_t pmscclUnloadAlgo(mscclAlgoHandle_t mscclAlgoHandle); + +/* + * Group semantics + * + * When managing multiple GPUs from a single thread, and since NCCL collective + * calls may perform inter-CPU synchronization, we need to "group" calls for + * different ranks/devices into a single call. + * + * Grouping NCCL calls as being part of the same collective operation is done + * using ncclGroupStart and ncclGroupEnd. ncclGroupStart will enqueue all + * collective calls until the ncclGroupEnd call, which will wait for all calls + * to be complete. Note that for collective communication, ncclGroupEnd only + * guarantees that the operations are enqueued on the streams, not that + * the operation is effectively done. + * + * Both collective communication and ncclCommInitRank can be used in conjunction + * of ncclGroupStart/ncclGroupEnd, but not together. + * + * Group semantics also allow to fuse multiple operations on the same device + * to improve performance (for aggregated collective calls), or to permit + * concurrent progress of multiple send/receive operations. + */ + +/* + * Group Start + * + * Start a group call. All calls to NCCL until ncclGroupEnd will be fused into + * a single NCCL operation. Nothing will be started on the CUDA stream until + * ncclGroupEnd. + */ +ncclResult_t ncclGroupStart(); +ncclResult_t pncclGroupStart(); + +/* + * Group End + * + * End a group call. Start a fused NCCL operation consisting of all calls since + * ncclGroupStart. 
Operations on the CUDA stream depending on the NCCL operations + * need to be called after ncclGroupEnd. + */ +ncclResult_t ncclGroupEnd(); +ncclResult_t pncclGroupEnd(); + +#ifdef __cplusplus +} // end extern "C" +#endif + +#endif // end include guard diff --git a/src/bootstrap/bootstrap.cc b/src/bootstrap/bootstrap.cc index 00a58b992..86bb89dc3 100644 --- a/src/bootstrap/bootstrap.cc +++ b/src/bootstrap/bootstrap.cc @@ -70,7 +70,7 @@ static_assert(sizeof(UniqueIdInternal) <= sizeof(UniqueId), "UniqueIdInternal is class TcpBootstrap::Impl { public: - Impl(int rank, int nRanks); + Impl(); ~Impl(); void initialize(const UniqueId& uniqueId, int64_t timeoutSec); void initialize(const std::string& ifIpPortTrio, int64_t timeoutSec); @@ -120,11 +120,7 @@ class TcpBootstrap::Impl { }; TcpBootstrap::Impl::Impl(int rank, int nRanks) - : rank_(rank), - nRanks_(nRanks), - netInitialized(false), - peerCommAddresses_(nRanks, SocketAddress()), - barrierArr_(nRanks, 0), + : netInitialized(false), abortFlagStorage_(new uint32_t(0)), abortFlag_(abortFlagStorage_.get()) {} diff --git a/src/nccl/nccl.cc b/src/nccl/nccl.cc new file mode 100644 index 000000000..14b324042 --- /dev/null +++ b/src/nccl/nccl.cc @@ -0,0 +1,33 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. + +#include "api.h" +#include "nccl.h" +#include "debug.h" + +#include + +namespace mscclpp { + +MSCCLPP_API ncclResult_t ncclGetVersion(int *version) { + if (version == nullptr) { + WARN("version is nullptr"); + return ncclInvalidArgument; + } + *version = MSCCLPP_VERSION; + return ncclSuccess; +} + +MSCCLPP_API ncclResult_t ncclGetUniqueId(ncclUniqueId* uniqueId) { + if (uniqueId == nullptr) { + WARN("uniqueId is nullptr"); + return ncclInvalidArgument; + } + if (MSCCLPP_UNIQUE_ID_BYTES != NCCL_UNIQUE_ID_BYTES) { + WARN("UNIQUE_ID_BYTES mismatch"); + return ncclInternalError; + } + +} + +} // namespace mscclpp From b6bec539050d464573903295d10c1c261b04faac Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Wed, 7 Feb 2024 05:43:56 +0000 Subject: [PATCH 25/89] Restructuring --- CMakeLists.txt | 6 ++++ apps/nccl/CMakeLists.txt | 2 ++ {include/mscclpp => apps/nccl/include}/nccl.h | 0 apps/nccl/src/nccl.cc | 20 +++++++++++ src/nccl/nccl.cc | 33 ------------------- 5 files changed, 28 insertions(+), 33 deletions(-) create mode 100644 apps/nccl/CMakeLists.txt rename {include/mscclpp => apps/nccl/include}/nccl.h (100%) create mode 100644 apps/nccl/src/nccl.cc delete mode 100644 src/nccl/nccl.cc diff --git a/CMakeLists.txt b/CMakeLists.txt index 66ed4b94b..07c35e1f3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -18,6 +18,7 @@ option(ENABLE_TRACE "Enable tracing" OFF) option(USE_NPKIT "Use NPKIT" ON) option(BUILD_TESTS "Build tests" ON) option(BUILD_PYTHON_BINDINGS "Build Python bindings" ON) +option(BUILD_APPS_NCCL "Build NCCL interfaces" OFF) option(USE_CUDA "Use NVIDIA/CUDA." OFF) option(USE_ROCM "Use AMD/ROCm." OFF) option(BYPASS_GPU_CHECK "Bypass GPU check." OFF) @@ -151,3 +152,8 @@ endif() if(BUILD_PYTHON_BINDINGS) add_subdirectory(python) endif() + +# NCCL interfaces +if(BUILD_APPS_NCCL) + add_subdirectory(apps/nccl) +endif() diff --git a/apps/nccl/CMakeLists.txt b/apps/nccl/CMakeLists.txt new file mode 100644 index 000000000..9a0454564 --- /dev/null +++ b/apps/nccl/CMakeLists.txt @@ -0,0 +1,2 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. 
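
[Patches 25-28 build out an NCCL-compatible front end on top of MSCCLPP. For orientation, below is a
minimal sketch of the call sequence this shim is being built toward. Note that ncclCommInitRank and
ncclCommDestroy only land in patch 28 further down, the collective entry points are still
unimplemented at this stage, and the MPI-based exchange of the unique ID is an assumption of this
sketch (mirroring NCCL's documented usage), not part of the patches themselves.]

    // Sketch: driving the apps/nccl shim the way a regular NCCL client would.
    // Assumes MPI distributes the bootstrap ID; error checking omitted for brevity.
    #include <mpi.h>
    #include "nccl.h"

    int main(int argc, char** argv) {
      MPI_Init(&argc, &argv);
      int rank, worldSize;
      MPI_Comm_rank(MPI_COMM_WORLD, &rank);
      MPI_Comm_size(MPI_COMM_WORLD, &worldSize);

      ncclUniqueId id;
      if (rank == 0) ncclGetUniqueId(&id);  // backed by mscclpp::TcpBootstrap::createUniqueId() as of patch 28
      MPI_Bcast(&id, sizeof(id), MPI_BYTE, 0, MPI_COMM_WORLD);

      ncclComm_t comm;
      ncclCommInitRank(&comm, worldSize, id, rank);  // wraps a mscclpp::Communicator as of patch 28

      // Collectives (ncclAllReduce, ncclAllGather, ...) are declared in nccl.h but not yet implemented here.

      ncclCommDestroy(comm);
      MPI_Finalize();
      return 0;
    }
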
diff --git a/include/mscclpp/nccl.h b/apps/nccl/include/nccl.h
similarity index 100%
rename from include/mscclpp/nccl.h
rename to apps/nccl/include/nccl.h
diff --git a/apps/nccl/src/nccl.cc b/apps/nccl/src/nccl.cc
new file mode 100644
index 000000000..856bebded
--- /dev/null
+++ b/apps/nccl/src/nccl.cc
@@ -0,0 +1,20 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT license.
+
+#include "nccl.h"
+
+#include <mscclpp/core.hpp>
+
+#define NCCL_API extern "C" __attribute__((visibility("default")))
+
+NCCL_API ncclResult_t ncclGetVersion(int *version) {
+  if (version == nullptr) return ncclInvalidArgument;
+  *version = MSCCLPP_VERSION;
+  return ncclSuccess;
+}
+
+NCCL_API ncclResult_t ncclGetUniqueId(ncclUniqueId* uniqueId) {
+  if (uniqueId == nullptr) return ncclInvalidArgument;
+  if (MSCCLPP_UNIQUE_ID_BYTES != NCCL_UNIQUE_ID_BYTES) return ncclInternalError;
+  return ncclSuccess;
+}
diff --git a/src/nccl/nccl.cc b/src/nccl/nccl.cc
deleted file mode 100644
index 14b324042..000000000
--- a/src/nccl/nccl.cc
+++ /dev/null
@@ -1,33 +0,0 @@
-// Copyright (c) Microsoft Corporation.
-// Licensed under the MIT license.
-
-#include "api.h"
-#include "nccl.h"
-#include "debug.h"
-
-#include <mscclpp/core.hpp>
-
-namespace mscclpp {
-
-MSCCLPP_API ncclResult_t ncclGetVersion(int *version) {
-  if (version == nullptr) {
-    WARN("version is nullptr");
-    return ncclInvalidArgument;
-  }
-  *version = MSCCLPP_VERSION;
-  return ncclSuccess;
-}
-
-MSCCLPP_API ncclResult_t ncclGetUniqueId(ncclUniqueId* uniqueId) {
-  if (uniqueId == nullptr) {
-    WARN("uniqueId is nullptr");
-    return ncclInvalidArgument;
-  }
-  if (MSCCLPP_UNIQUE_ID_BYTES != NCCL_UNIQUE_ID_BYTES) {
-    WARN("UNIQUE_ID_BYTES mismatch");
-    return ncclInternalError;
-  }
-
-}
-
-} // namespace mscclpp

From 5eb35cd45bbe0c2b26e1a6c647ec92f6e2acd86a Mon Sep 17 00:00:00 2001
From: Changho Hwang
Date: Wed, 7 Feb 2024 06:18:14 +0000
Subject: [PATCH 26/89] cmake

---
 apps/nccl/CMakeLists.txt               | 29 ++++++++++++++++++++++++++
 apps/nccl/src/CMakeLists.txt           |  0
 apps/nccl/src/{nccl.cc => nccl.cpp}    |  0
 src/bootstrap/bootstrap.cc             |  8 +++----
 test/allgather_test_cpp.cu             |  4 ++--
 test/allgather_test_host_offloading.cu |  4 ++--
 test/mp_unit/bootstrap_tests.cc        | 22 +++++++++----------
 test/mp_unit/communicator_tests.cu     |  4 ++--
 test/mp_unit/ib_tests.cu               |  4 ++--
 test/mscclpp-test/common.cc            |  4 ++--
 test/unit/core_tests.cc                |  4 ++--
 11 files changed, 56 insertions(+), 27 deletions(-)
 create mode 100644 apps/nccl/src/CMakeLists.txt
 rename apps/nccl/src/{nccl.cc => nccl.cpp} (100%)

diff --git a/apps/nccl/CMakeLists.txt b/apps/nccl/CMakeLists.txt
index 9a0454564..d92ded891 100644
--- a/apps/nccl/CMakeLists.txt
+++ b/apps/nccl/CMakeLists.txt
@@ -1,2 +1,31 @@
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT license. 
+ +file(GLOB_RECURSE SOURCES CONFIGURE_DEPENDS src/*.cpp) +file(GLOB_RECURSE HEADERS CONFIGURE_DEPENDS include/nccl.h) + +add_library(mscclpp_nccl_obj OBJECT) +target_sources(mscclpp_nccl_obj PRIVATE ${SOURCES}) +target_sources(mscclpp_nccl_obj PUBLIC FILE_SET HEADERS FILES ${HEADERS}) +target_include_directories(mscclpp_nccl_obj PRIVATE ${GPU_INCLUDE_DIRS} include) +target_link_libraries(mscclpp_nccl_obj PRIVATE ${GPU_LIBRARIES} mscclpp_obj) +set_target_properties(mscclpp_nccl_obj PROPERTIES LINKER_LANGUAGE CXX POSITION_INDEPENDENT_CODE 1 VERSION ${MSCCLPP_VERSION} SOVERSION ${MSCCLPP_SOVERSION}) +if(USE_CUDA) + target_compile_definitions(mscclpp_nccl_obj PRIVATE USE_CUDA) +elseif(USE_ROCM) + target_compile_definitions(mscclpp_nccl_obj PRIVATE USE_ROCM) +endif() + +add_library(mscclpp_nccl SHARED) +target_link_libraries(mscclpp_nccl PUBLIC mscclpp_nccl_obj) +set_target_properties(mscclpp_nccl PROPERTIES VERSION ${MSCCLPP_VERSION} SOVERSION ${MSCCLPP_SOVERSION}) +add_library(mscclpp_nccl_static STATIC) +target_link_libraries(mscclpp_nccl_static PUBLIC mscclpp_nccl_obj) +set_target_properties(mscclpp_nccl_static PROPERTIES VERSION ${MSCCLPP_VERSION} SOVERSION ${MSCCLPP_SOVERSION}) + +install(TARGETS mscclpp_nccl_obj + FILE_SET HEADERS DESTINATION ${INSTALL_PREFIX}/include) +install(TARGETS mscclpp_nccl + LIBRARY DESTINATION ${INSTALL_PREFIX}/lib) +install(TARGETS mscclpp_nccl_static + ARCHIVE DESTINATION ${INSTALL_PREFIX}/lib) diff --git a/apps/nccl/src/CMakeLists.txt b/apps/nccl/src/CMakeLists.txt new file mode 100644 index 000000000..e69de29bb diff --git a/apps/nccl/src/nccl.cc b/apps/nccl/src/nccl.cpp similarity index 100% rename from apps/nccl/src/nccl.cc rename to apps/nccl/src/nccl.cpp diff --git a/src/bootstrap/bootstrap.cc b/src/bootstrap/bootstrap.cc index 86bb89dc3..1fb3eef97 100644 --- a/src/bootstrap/bootstrap.cc +++ b/src/bootstrap/bootstrap.cc @@ -119,7 +119,7 @@ class TcpBootstrap::Impl { void netInit(std::string ipPortPair, std::string interface); }; -TcpBootstrap::Impl::Impl(int rank, int nRanks) +TcpBootstrap::Impl::Impl() : netInitialized(false), abortFlagStorage_(new uint32_t(0)), abortFlag_(abortFlagStorage_.get()) {} @@ -482,7 +482,7 @@ void TcpBootstrap::Impl::close() { peerRecvSockets_.clear(); } -MSCCLPP_API_CPP TcpBootstrap::TcpBootstrap(int rank, int nRanks) { pimpl_ = std::make_unique(rank, nRanks); } +MSCCLPP_API_CPP TcpBootstrap::TcpBootstrap() { pimpl_ = std::make_unique(); } MSCCLPP_API_CPP UniqueId TcpBootstrap::createUniqueId() { return pimpl_->createUniqueId(); } @@ -502,11 +502,11 @@ MSCCLPP_API_CPP void TcpBootstrap::recv(void* data, int size, int peer, int tag) MSCCLPP_API_CPP void TcpBootstrap::allGather(void* allData, int size) { pimpl_->allGather(allData, size); } -MSCCLPP_API_CPP void TcpBootstrap::initialize(UniqueId uniqueId, int64_t timeoutSec) { +MSCCLPP_API_CPP void TcpBootstrap::initialize(UniqueId uniqueId, int rank, int nRanks, int64_t timeoutSec) { pimpl_->initialize(uniqueId, timeoutSec); } -MSCCLPP_API_CPP void TcpBootstrap::initialize(const std::string& ipPortPair, int64_t timeoutSec) { +MSCCLPP_API_CPP void TcpBootstrap::initialize(const std::string& ipPortPair, int rank, int nRanks, int64_t timeoutSec) { pimpl_->initialize(ipPortPair, timeoutSec); } diff --git a/test/allgather_test_cpp.cu b/test/allgather_test_cpp.cu index 2f56b221d..aeb8003d9 100644 --- a/test/allgather_test_cpp.cu +++ b/test/allgather_test_cpp.cu @@ -393,8 +393,8 @@ int main(int argc, const char* argv[]) { try { if (rank == 0) printf("Initializing MSCCL++\n"); - auto 
bootstrap = std::make_shared(rank, world_size); - bootstrap->initialize(ip_port); + auto bootstrap = std::make_shared(); + bootstrap->initialize(ip_port, rank, world_size); mscclpp::Communicator comm(bootstrap); mscclpp::ProxyService proxyService; diff --git a/test/allgather_test_host_offloading.cu b/test/allgather_test_host_offloading.cu index 7f50994b9..d186f6be7 100644 --- a/test/allgather_test_host_offloading.cu +++ b/test/allgather_test_host_offloading.cu @@ -228,11 +228,11 @@ int main(int argc, char* argv[]) { MSCCLPP_CUDATHROW(cudaSetDevice(cudaNum)); if (rank == 0) printf("Initializing MSCCL++\n"); - auto bootstrap = std::make_shared(rank, world_size); + auto bootstrap = std::make_shared(); mscclpp::UniqueId uniqueId; if (rank == 0) uniqueId = bootstrap->createUniqueId(); MPI_Bcast(&uniqueId, sizeof(uniqueId), MPI_BYTE, 0, MPI_COMM_WORLD); - bootstrap->initialize(uniqueId); + bootstrap->initialize(uniqueId, rank, world_size); mscclpp::Communicator comm(bootstrap); int* data_d; diff --git a/test/mp_unit/bootstrap_tests.cc b/test/mp_unit/bootstrap_tests.cc index 82120a1f7..c303662aa 100644 --- a/test/mp_unit/bootstrap_tests.cc +++ b/test/mp_unit/bootstrap_tests.cc @@ -49,17 +49,17 @@ void BootstrapTest::bootstrapTestAll(std::shared_ptr bootstr } TEST_F(BootstrapTest, WithId) { - auto bootstrap = std::make_shared(gEnv->rank, gEnv->worldSize); + auto bootstrap = std::make_shared(); mscclpp::UniqueId id; if (bootstrap->getRank() == 0) id = bootstrap->createUniqueId(); MPI_Bcast(&id, sizeof(id), MPI_BYTE, 0, MPI_COMM_WORLD); - bootstrap->initialize(id); + bootstrap->initialize(id, gEnv->rank, gEnv->worldSize); bootstrapTestAll(bootstrap); } TEST_F(BootstrapTest, WithIpPortPair) { - auto bootstrap = std::make_shared(gEnv->rank, gEnv->worldSize); - bootstrap->initialize(gEnv->args["ip_port"]); + auto bootstrap = std::make_shared(); + bootstrap->initialize(gEnv->args["ip_port"], gEnv->rank, gEnv->worldSize); bootstrapTestAll(bootstrap); } @@ -68,23 +68,23 @@ TEST_F(BootstrapTest, ResumeWithId) { bootstrapTestTimer.set(300); for (int i = 0; i < 3000; ++i) { - auto bootstrap = std::make_shared(gEnv->rank, gEnv->worldSize); + auto bootstrap = std::make_shared(); mscclpp::UniqueId id; if (bootstrap->getRank() == 0) id = bootstrap->createUniqueId(); MPI_Bcast(&id, sizeof(id), MPI_BYTE, 0, MPI_COMM_WORLD); - bootstrap->initialize(id, 300); + bootstrap->initialize(id, gEnv->rank, gEnv->worldSize, 300); } } TEST_F(BootstrapTest, ResumeWithIpPortPair) { for (int i = 0; i < 5; ++i) { - auto bootstrap = std::make_shared(gEnv->rank, gEnv->worldSize); - bootstrap->initialize(gEnv->args["ip_port"]); + auto bootstrap = std::make_shared(); + bootstrap->initialize(gEnv->args["ip_port"], gEnv->rank, gEnv->worldSize); } } TEST_F(BootstrapTest, ExitBeforeConnect) { - auto bootstrap = std::make_shared(gEnv->rank, gEnv->worldSize); + auto bootstrap = std::make_shared(); bootstrap->createUniqueId(); } @@ -92,12 +92,12 @@ TEST_F(BootstrapTest, TimeoutWithId) { mscclpp::Timer timer; // All ranks initialize a bootstrap with their own id (will hang) - auto bootstrap = std::make_shared(gEnv->rank, gEnv->worldSize); + auto bootstrap = std::make_shared(); mscclpp::UniqueId id = bootstrap->createUniqueId(); try { // Set bootstrap timeout to 1 second - bootstrap->initialize(id, 1); + bootstrap->initialize(id, gEnv->rank, gEnv->worldSize, 1); } catch (const mscclpp::Error& e) { ASSERT_EQ(e.getErrorCode(), mscclpp::ErrorCode::Timeout); } diff --git a/test/mp_unit/communicator_tests.cu b/test/mp_unit/communicator_tests.cu 
index 30727667d..beb47ebc7 100644 --- a/test/mp_unit/communicator_tests.cu +++ b/test/mp_unit/communicator_tests.cu @@ -22,7 +22,7 @@ void CommunicatorTestBase::SetUp() { std::shared_ptr bootstrap; mscclpp::UniqueId id; if (gEnv->rank < numRanksToUse) { - bootstrap = std::make_shared(gEnv->rank, numRanksToUse); + bootstrap = std::make_shared(); if (gEnv->rank == 0) id = bootstrap->createUniqueId(); } MPI_Bcast(&id, sizeof(id), MPI_BYTE, 0, MPI_COMM_WORLD); @@ -30,7 +30,7 @@ void CommunicatorTestBase::SetUp() { if (gEnv->rank >= numRanksToUse) { return; } - bootstrap->initialize(id); + bootstrap->initialize(id, gEnv->rank, numRanksToUse); communicator = std::make_shared(bootstrap); } diff --git a/test/mp_unit/ib_tests.cu b/test/mp_unit/ib_tests.cu index e878154d7..605019cd6 100644 --- a/test/mp_unit/ib_tests.cu +++ b/test/mp_unit/ib_tests.cu @@ -24,7 +24,7 @@ void IbPeerToPeerTest::SetUp() { if (gEnv->rank < 2) { // This test needs only two ranks - bootstrap = std::make_shared(gEnv->rank, 2); + bootstrap = std::make_shared(); if (bootstrap->getRank() == 0) id = bootstrap->createUniqueId(); } MPI_Bcast(&id, sizeof(id), MPI_BYTE, 0, MPI_COMM_WORLD); @@ -33,7 +33,7 @@ void IbPeerToPeerTest::SetUp() { return; } - bootstrap->initialize(id); + bootstrap->initialize(id, gEnv->rank, 2); ibCtx = std::make_shared(ibDevName); qp = ibCtx->createQp(1024, 1, 8192, 0, 64); diff --git a/test/mscclpp-test/common.cc b/test/mscclpp-test/common.cc index c5653b3fc..d3dd77327 100644 --- a/test/mscclpp-test/common.cc +++ b/test/mscclpp-test/common.cc @@ -324,11 +324,11 @@ void BaseTestEngine::runTest() { } void BaseTestEngine::bootstrap() { - auto bootstrap = std::make_shared(args_.rank, args_.totalRanks); + auto bootstrap = std::make_shared(); mscclpp::UniqueId id; if (bootstrap->getRank() == 0) id = bootstrap->createUniqueId(); MPI_Bcast(&id, sizeof(id), MPI_BYTE, 0, MPI_COMM_WORLD); - bootstrap->initialize(id); + bootstrap->initialize(id, args_.rank, args_.totalRanks); comm_ = std::make_shared(bootstrap); } diff --git a/test/unit/core_tests.cc b/test/unit/core_tests.cc index 90da5dd7f..745f71079 100644 --- a/test/unit/core_tests.cc +++ b/test/unit/core_tests.cc @@ -9,8 +9,8 @@ class LocalCommunicatorTest : public ::testing::Test { protected: void SetUp() override { - bootstrap = std::make_shared(0, 1); - bootstrap->initialize(bootstrap->createUniqueId()); + bootstrap = std::make_shared(); + bootstrap->initialize(bootstrap->createUniqueId(), 0, 1); comm = std::make_shared(bootstrap); } From ae3fb4d6b11e441685c14dd9aaee80294a7292ee Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Wed, 7 Feb 2024 06:56:26 +0000 Subject: [PATCH 27/89] wip --- apps/nccl/include/comm.h | 8 ++++++++ apps/nccl/src/nccl.cpp | 1 + python/mscclpp/core_py.cpp | 17 +++++++++-------- src/bootstrap/bootstrap.cc | 24 +++++++++++++----------- 4 files changed, 31 insertions(+), 19 deletions(-) create mode 100644 apps/nccl/include/comm.h diff --git a/apps/nccl/include/comm.h b/apps/nccl/include/comm.h new file mode 100644 index 000000000..225ad3357 --- /dev/null +++ b/apps/nccl/include/comm.h @@ -0,0 +1,8 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. 
+ +#include + +struct ncclComm { + std::shared_ptr comm; +}; diff --git a/apps/nccl/src/nccl.cpp b/apps/nccl/src/nccl.cpp index 856bebded..fe8fec14c 100644 --- a/apps/nccl/src/nccl.cpp +++ b/apps/nccl/src/nccl.cpp @@ -16,5 +16,6 @@ NCCL_API ncclResult_t ncclGetVersion(int *version) { NCCL_API ncclResult_t ncclGetUniqueId(ncclUniqueId* uniqueId) { if (uniqueId == nullptr) return ncclInvalidArgument; if (MSCCLPP_UNIQUE_ID_BYTES != NCCL_UNIQUE_ID_BYTES) return ncclInternalError; + // std::shared_ptr return ncclSuccess; } diff --git a/python/mscclpp/core_py.cpp b/python/mscclpp/core_py.cpp index 5fd4bd317..1bc9560b5 100644 --- a/python/mscclpp/core_py.cpp +++ b/python/mscclpp/core_py.cpp @@ -59,16 +59,17 @@ void register_core(nb::module_& m) { nb::class_(m, "UniqueId"); nb::class_(m, "TcpBootstrap") - .def(nb::init(), "Do not use this constructor. Use create instead.") - .def_static( - "create", [](int rank, int nRanks) { return std::make_shared(rank, nRanks); }, nb::arg("rank"), - nb::arg("nRanks")) + .def(nb::init<>(), "Do not use this constructor. Use create instead.") + .def_static("create", []() { return std::make_shared(); }) .def("create_unique_id", &TcpBootstrap::createUniqueId) .def("get_unique_id", &TcpBootstrap::getUniqueId) - .def("initialize", static_cast(&TcpBootstrap::initialize), - nb::call_guard(), nb::arg("uniqueId"), nb::arg("timeoutSec") = 30) - .def("initialize", static_cast(&TcpBootstrap::initialize), - nb::call_guard(), nb::arg("ifIpPortTrio"), nb::arg("timeoutSec") = 30); + .def("initialize", static_cast(&TcpBootstrap::initialize), + nb::call_guard(), nb::arg("uniqueId"), nb::arg("rank"), nb::arg("nRanks"), + nb::arg("timeoutSec") = 30) + .def("initialize", + static_cast(&TcpBootstrap::initialize), + nb::call_guard(), nb::arg("ifIpPortTrio"), nb::arg("rank"), nb::arg("nRanks"), + nb::arg("timeoutSec") = 30); nb::enum_(m, "Transport") .value("Unknown", Transport::Unknown) diff --git a/src/bootstrap/bootstrap.cc b/src/bootstrap/bootstrap.cc index 1fb3eef97..8746a4326 100644 --- a/src/bootstrap/bootstrap.cc +++ b/src/bootstrap/bootstrap.cc @@ -72,8 +72,8 @@ class TcpBootstrap::Impl { public: Impl(); ~Impl(); - void initialize(const UniqueId& uniqueId, int64_t timeoutSec); - void initialize(const std::string& ifIpPortTrio, int64_t timeoutSec); + void initialize(const UniqueId& uniqueId, int rank, int nRanks, int64_t timeoutSec); + void initialize(const std::string& ifIpPortTrio, int rank, int nRanks, int64_t timeoutSec); void establishConnections(int64_t timeoutSec); UniqueId createUniqueId(); UniqueId getUniqueId() const; @@ -120,9 +120,7 @@ class TcpBootstrap::Impl { }; TcpBootstrap::Impl::Impl() - : netInitialized(false), - abortFlagStorage_(new uint32_t(0)), - abortFlag_(abortFlagStorage_.get()) {} + : netInitialized(false), abortFlagStorage_(new uint32_t(0)), abortFlag_(abortFlagStorage_.get()) {} UniqueId TcpBootstrap::Impl::getUniqueId() const { UniqueId ret; @@ -142,15 +140,19 @@ int TcpBootstrap::Impl::getRank() { return rank_; } int TcpBootstrap::Impl::getNranks() { return nRanks_; } -void TcpBootstrap::Impl::initialize(const UniqueId& uniqueId, int64_t timeoutSec) { - netInit("", ""); +void TcpBootstrap::Impl::initialize(const UniqueId& uniqueId, int rank, int nRanks, int64_t timeoutSec) { + rank_ = rank; + nRanks_ = nRanks; + netInit("", ""); std::memcpy(&uniqueId_, &uniqueId, sizeof(uniqueId_)); - establishConnections(timeoutSec); } -void TcpBootstrap::Impl::initialize(const std::string& ifIpPortTrio, int64_t timeoutSec) { +void 
TcpBootstrap::Impl::initialize(const std::string& ifIpPortTrio, int rank, int nRanks, int64_t timeoutSec) { + rank_ = rank; + nRanks_ = nRanks; + // first check if it is a trio int nColons = 0; for (auto c : ifIpPortTrio) { @@ -503,11 +505,11 @@ MSCCLPP_API_CPP void TcpBootstrap::recv(void* data, int size, int peer, int tag) MSCCLPP_API_CPP void TcpBootstrap::allGather(void* allData, int size) { pimpl_->allGather(allData, size); } MSCCLPP_API_CPP void TcpBootstrap::initialize(UniqueId uniqueId, int rank, int nRanks, int64_t timeoutSec) { - pimpl_->initialize(uniqueId, timeoutSec); + pimpl_->initialize(uniqueId, rank, nRanks, timeoutSec); } MSCCLPP_API_CPP void TcpBootstrap::initialize(const std::string& ipPortPair, int rank, int nRanks, int64_t timeoutSec) { - pimpl_->initialize(ipPortPair, timeoutSec); + pimpl_->initialize(ipPortPair, rank, nRanks, timeoutSec); } MSCCLPP_API_CPP void TcpBootstrap::barrier() { pimpl_->barrier(); } From e15d22ad88cc109603fe3b73cdc95c161577a5f8 Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Wed, 7 Feb 2024 10:58:20 +0000 Subject: [PATCH 28/89] wip --- apps/nccl/include/comm.h | 8 -- apps/nccl/src/nccl.cpp | 30 ++++++- include/mscclpp/core.hpp | 20 +++-- python/mscclpp/core_py.cpp | 17 ++-- src/bootstrap/bootstrap.cc | 107 +++++++++++++++---------- src/bootstrap/socket.cc | 6 +- src/include/socket.h | 12 +-- test/allgather_test_cpp.cu | 4 +- test/allgather_test_host_offloading.cu | 4 +- test/mp_unit/bootstrap_tests.cc | 22 ++--- test/mp_unit/communicator_tests.cu | 4 +- test/mp_unit/ib_tests.cu | 4 +- test/mscclpp-test/common.cc | 4 +- test/nccl_api_test.cc | 0 test/unit/core_tests.cc | 4 +- test/unit/socket_tests.cc | 2 +- 16 files changed, 144 insertions(+), 104 deletions(-) delete mode 100644 apps/nccl/include/comm.h create mode 100644 test/nccl_api_test.cc diff --git a/apps/nccl/include/comm.h b/apps/nccl/include/comm.h deleted file mode 100644 index 225ad3357..000000000 --- a/apps/nccl/include/comm.h +++ /dev/null @@ -1,8 +0,0 @@ -// Copyright (c) Microsoft Corporation. -// Licensed under the MIT license. 
-
-#include <mscclpp/core.hpp>
-
-struct ncclComm {
-  std::shared_ptr<mscclpp::Communicator> comm;
-};
diff --git a/apps/nccl/src/nccl.cpp b/apps/nccl/src/nccl.cpp
index fe8fec14c..6164b0aea 100644
--- a/apps/nccl/src/nccl.cpp
+++ b/apps/nccl/src/nccl.cpp
@@ -7,7 +7,12 @@
 
 #define NCCL_API extern "C" __attribute__((visibility("default")))
 
+struct ncclComm {
+  std::shared_ptr<mscclpp::Communicator> comm;
+  // we need a map for registered buffers
+};
+
 NCCL_API ncclResult_t ncclGetVersion(int* version) {
   if (version == nullptr) return ncclInvalidArgument;
   *version = MSCCLPP_VERSION;
   return ncclSuccess;
@@ -16,6 +21,27 @@ NCCL_API ncclResult_t ncclGetVersion(int* version) {
 NCCL_API ncclResult_t ncclGetUniqueId(ncclUniqueId* uniqueId) {
   if (uniqueId == nullptr) return ncclInvalidArgument;
   if (MSCCLPP_UNIQUE_ID_BYTES != NCCL_UNIQUE_ID_BYTES) return ncclInternalError;
-  // std::shared_ptr
+  mscclpp::UniqueId id = mscclpp::TcpBootstrap::createUniqueId();
+  memcpy(uniqueId, &id, sizeof(ncclUniqueId));
+  return ncclSuccess;
+}
+
+NCCL_API ncclResult_t ncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank) {
+  if (comm == nullptr) return ncclInvalidArgument;
+  if (nranks < 0 || rank < 0 || rank >= nranks) return ncclInvalidArgument;
+  std::shared_ptr<mscclpp::TcpBootstrap> bootstrap = std::make_shared<mscclpp::TcpBootstrap>(rank, nranks);
+  mscclpp::UniqueId id;
+  memcpy(id.data(), &commId, sizeof(ncclUniqueId));
+  bootstrap->initialize(id);
+  std::shared_ptr<mscclpp::Communicator> mscclppComm = std::make_shared<mscclpp::Communicator>(bootstrap);
+  ncclComm* comm_ptr = new ncclComm();
+  comm_ptr->comm = mscclppComm;
+  *comm = comm_ptr;
+  return ncclSuccess;
+}
+
+NCCL_API ncclResult_t ncclCommDestroy(ncclComm_t comm) {
+  if (comm == nullptr) return ncclInvalidArgument;
+  delete comm;
   return ncclSuccess;
 }
diff --git a/include/mscclpp/core.hpp b/include/mscclpp/core.hpp
index 143b4c8dc..c2a4dff44 100644
--- a/include/mscclpp/core.hpp
+++ b/include/mscclpp/core.hpp
@@ -51,33 +51,31 @@ class Bootstrap {
 /// A native implementation of the bootstrap using TCP sockets.
 class TcpBootstrap : public Bootstrap {
  public:
+  /// Create a random unique ID.
+  /// @return The created unique ID.
+  static UniqueId createUniqueId();
+
   /// Constructor.
-  TcpBootstrap();
+  /// @param rank The rank of the process.
+  /// @param nRanks The total number of ranks.
+  TcpBootstrap(int rank, int nRanks);
 
   /// Destructor.
   ~TcpBootstrap();
 
-  /// Create a random unique ID and store it in the @ref TcpBootstrap.
-  /// @return The created unique ID.
-  UniqueId createUniqueId();
-
   /// Return the unique ID stored in the @ref TcpBootstrap.
   /// @return The unique ID stored in the @ref TcpBootstrap.
   UniqueId getUniqueId() const;
 
   /// Initialize the @ref TcpBootstrap with a given unique ID.
   /// @param uniqueId The unique ID to initialize the @ref TcpBootstrap with.
-  /// @param rank The rank of the process.
-  /// @param nRanks The total number of ranks.
   /// @param timeoutSec The connection timeout in seconds.
-  void initialize(UniqueId uniqueId, int rank, int nRanks, int64_t timeoutSec = 30);
+  void initialize(UniqueId uniqueId, int64_t timeoutSec = 30);
 
   /// Initialize the @ref TcpBootstrap with a string formatted as "ip:port" or "interface:ip:port".
   /// @param ifIpPortTrio The string formatted as "ip:port" or "interface:ip:port".
-  /// @param rank The rank of the process.
-  /// @param nRanks The total number of ranks.
   /// @param timeoutSec The connection timeout in seconds. 
- void initialize(const std::string& ifIpPortTrio, int rank, int nRanks, int64_t timeoutSec = 30); + void initialize(const std::string& ifIpPortTrio, int64_t timeoutSec = 30); /// Return the rank of the process. int getRank() override; diff --git a/python/mscclpp/core_py.cpp b/python/mscclpp/core_py.cpp index 1bc9560b5..5fd4bd317 100644 --- a/python/mscclpp/core_py.cpp +++ b/python/mscclpp/core_py.cpp @@ -59,17 +59,16 @@ void register_core(nb::module_& m) { nb::class_(m, "UniqueId"); nb::class_(m, "TcpBootstrap") - .def(nb::init<>(), "Do not use this constructor. Use create instead.") - .def_static("create", []() { return std::make_shared(); }) + .def(nb::init(), "Do not use this constructor. Use create instead.") + .def_static( + "create", [](int rank, int nRanks) { return std::make_shared(rank, nRanks); }, nb::arg("rank"), + nb::arg("nRanks")) .def("create_unique_id", &TcpBootstrap::createUniqueId) .def("get_unique_id", &TcpBootstrap::getUniqueId) - .def("initialize", static_cast(&TcpBootstrap::initialize), - nb::call_guard(), nb::arg("uniqueId"), nb::arg("rank"), nb::arg("nRanks"), - nb::arg("timeoutSec") = 30) - .def("initialize", - static_cast(&TcpBootstrap::initialize), - nb::call_guard(), nb::arg("ifIpPortTrio"), nb::arg("rank"), nb::arg("nRanks"), - nb::arg("timeoutSec") = 30); + .def("initialize", static_cast(&TcpBootstrap::initialize), + nb::call_guard(), nb::arg("uniqueId"), nb::arg("timeoutSec") = 30) + .def("initialize", static_cast(&TcpBootstrap::initialize), + nb::call_guard(), nb::arg("ifIpPortTrio"), nb::arg("timeoutSec") = 30); nb::enum_(m, "Transport") .value("Unknown", Transport::Unknown) diff --git a/src/bootstrap/bootstrap.cc b/src/bootstrap/bootstrap.cc index 8746a4326..fd03a592b 100644 --- a/src/bootstrap/bootstrap.cc +++ b/src/bootstrap/bootstrap.cc @@ -70,12 +70,14 @@ static_assert(sizeof(UniqueIdInternal) <= sizeof(UniqueId), "UniqueIdInternal is class TcpBootstrap::Impl { public: - Impl(); + static UniqueId createUniqueId(); + static UniqueId getUniqueId(const UniqueIdInternal& uniqueId); + + Impl(int rank, int nRanks); ~Impl(); - void initialize(const UniqueId& uniqueId, int rank, int nRanks, int64_t timeoutSec); - void initialize(const std::string& ifIpPortTrio, int rank, int nRanks, int64_t timeoutSec); + void initialize(const UniqueId& uniqueId, int64_t timeoutSec); + void initialize(const std::string& ifIpPortTrio, int64_t timeoutSec); void establishConnections(int64_t timeoutSec); - UniqueId createUniqueId(); UniqueId getUniqueId() const; int getRank(); int getNranks(); @@ -99,7 +101,6 @@ class TcpBootstrap::Impl { std::unique_ptr abortFlagStorage_; volatile uint32_t* abortFlag_; std::thread rootThread_; - char netIfName_[MAX_IF_NAME_SIZE + 1]; SocketAddress netIfAddr_; std::unordered_map, std::shared_ptr, PairHash> peerSendSockets_; std::unordered_map, std::shared_ptr, PairHash> peerRecvSockets_; @@ -110,49 +111,62 @@ class TcpBootstrap::Impl { std::shared_ptr getPeerSendSocket(int peer, int tag); std::shared_ptr getPeerRecvSocket(int peer, int tag); + static void assignPortToUniqueId(UniqueIdInternal& uniqueId); + static void netInit(std::string ipPortPair, std::string interface, SocketAddress& netIfAddr); + void bootstrapCreateRoot(); void bootstrapRoot(); void getRemoteAddresses(Socket* listenSock, std::vector& rankAddresses, std::vector& rankAddressesRoot, int& rank); void sendHandleToPeer(int peer, const std::vector& rankAddresses, const std::vector& rankAddressesRoot); - void netInit(std::string ipPortPair, std::string interface); }; 
-TcpBootstrap::Impl::Impl() - : netInitialized(false), abortFlagStorage_(new uint32_t(0)), abortFlag_(abortFlagStorage_.get()) {} +UniqueId TcpBootstrap::Impl::createUniqueId() { + UniqueIdInternal uniqueId; + SocketAddress netIfAddr; + netInit("", "", netIfAddr); + getRandomData(&uniqueId.magic, sizeof(uniqueId_.magic)); + std::memcpy(&uniqueId.addr, &netIfAddr, sizeof(SocketAddress)); + assignPortToUniqueId(uniqueId); + return getUniqueId(uniqueId); +} -UniqueId TcpBootstrap::Impl::getUniqueId() const { +UniqueId TcpBootstrap::Impl::getUniqueId(const UniqueIdInternal& uniqueId) { UniqueId ret; - std::memcpy(&ret, &uniqueId_, sizeof(uniqueId_)); + std::memcpy(&ret, &uniqueId, sizeof(uniqueId)); return ret; } -UniqueId TcpBootstrap::Impl::createUniqueId() { - netInit("", ""); - getRandomData(&uniqueId_.magic, sizeof(uniqueId_.magic)); - std::memcpy(&uniqueId_.addr, &netIfAddr_, sizeof(SocketAddress)); - bootstrapCreateRoot(); - return getUniqueId(); -} +TcpBootstrap::Impl::Impl(int rank, int nRanks) + : rank_(rank), + nRanks_(nRanks), + netInitialized(false), + peerCommAddresses_(nRanks, SocketAddress()), + barrierArr_(nRanks, 0), + abortFlagStorage_(new uint32_t(0)), + abortFlag_(abortFlagStorage_.get()) {} + +UniqueId TcpBootstrap::Impl::getUniqueId() const { return getUniqueId(uniqueId_); } int TcpBootstrap::Impl::getRank() { return rank_; } int TcpBootstrap::Impl::getNranks() { return nRanks_; } -void TcpBootstrap::Impl::initialize(const UniqueId& uniqueId, int rank, int nRanks, int64_t timeoutSec) { - rank_ = rank; - nRanks_ = nRanks; - - netInit("", ""); +void TcpBootstrap::Impl::initialize(const UniqueId& uniqueId, int64_t timeoutSec) { + netInit("", "", netIfAddr_); std::memcpy(&uniqueId_, &uniqueId, sizeof(uniqueId_)); + if (rank_ == 0) { + bootstrapCreateRoot(); + } + + char line[MAX_IF_NAME_SIZE + 1]; + SocketToString(&uniqueId_.addr, line); + INFO(MSCCLPP_INIT, "rank %d nranks %d - connecting to %s", rank_, nRanks_, line); establishConnections(timeoutSec); } -void TcpBootstrap::Impl::initialize(const std::string& ifIpPortTrio, int rank, int nRanks, int64_t timeoutSec) { - rank_ = rank; - nRanks_ = nRanks; - +void TcpBootstrap::Impl::initialize(const std::string& ifIpPortTrio, int64_t timeoutSec) { // first check if it is a trio int nColons = 0; for (auto c : ifIpPortTrio) { @@ -168,7 +182,7 @@ void TcpBootstrap::Impl::initialize(const std::string& ifIpPortTrio, int rank, i ipPortPair = ifIpPortTrio.substr(ipPortPair.find_first_of(':') + 1); } - netInit(ipPortPair, interface); + netInit(ipPortPair, interface, netIfAddr_); uniqueId_.magic = 0xdeadbeef; std::memcpy(&uniqueId_.addr, &netIfAddr_, sizeof(SocketAddress)); @@ -228,9 +242,15 @@ void TcpBootstrap::Impl::sendHandleToPeer(int peer, const std::vector socket = std::make_unique(&uniqueId.addr, uniqueId.magic, SocketTypeBootstrap); + socket->bind(); + uniqueId.addr = socket->getAddr(); +} + void TcpBootstrap::Impl::bootstrapCreateRoot() { listenSockRoot_ = std::make_unique(&uniqueId_.addr, uniqueId_.magic, SocketTypeBootstrap, abortFlag_, 0); - listenSockRoot_->listen(); + listenSockRoot_->bindAndListen(); uniqueId_.addr = listenSockRoot_->getAddr(); rootThread_ = std::thread([this]() { @@ -277,34 +297,33 @@ void TcpBootstrap::Impl::bootstrapRoot() { TRACE(MSCCLPP_INIT, "DONE"); } -void TcpBootstrap::Impl::netInit(std::string ipPortPair, std::string interface) { - if (netInitialized) return; +void TcpBootstrap::Impl::netInit(std::string ipPortPair, std::string interface, SocketAddress& netIfAddr) { + char 
netIfName[MAX_IF_NAME_SIZE + 1]; if (!ipPortPair.empty()) { if (interface != "") { // we know the - int ret = FindInterfaces(netIfName_, &netIfAddr_, MAX_IF_NAME_SIZE, 1, interface.c_str()); + int ret = FindInterfaces(netIfName, &netIfAddr, MAX_IF_NAME_SIZE, 1, interface.c_str()); if (ret <= 0) throw Error("NET/Socket : No interface named " + interface + " found.", ErrorCode::InternalError); } else { // we do not know the try to match it next SocketAddress remoteAddr; SocketGetAddrFromString(&remoteAddr, ipPortPair.c_str()); - if (FindInterfaceMatchSubnet(netIfName_, &netIfAddr_, &remoteAddr, MAX_IF_NAME_SIZE, 1) <= 0) { + if (FindInterfaceMatchSubnet(netIfName, &netIfAddr, &remoteAddr, MAX_IF_NAME_SIZE, 1) <= 0) { throw Error("NET/Socket : No usable listening interface found", ErrorCode::InternalError); } } } else { - int ret = FindInterfaces(netIfName_, &netIfAddr_, MAX_IF_NAME_SIZE, 1); + int ret = FindInterfaces(netIfName, &netIfAddr, MAX_IF_NAME_SIZE, 1); if (ret <= 0) { throw Error("TcpBootstrap : no socket interface found", ErrorCode::InternalError); } } char line[SOCKET_NAME_MAXLEN + MAX_IF_NAME_SIZE + 2]; - std::sprintf(line, " %s:", netIfName_); - SocketToString(&netIfAddr_, line + strlen(line)); + std::sprintf(line, " %s:", netIfName); + SocketToString(&netIfAddr, line + strlen(line)); INFO(MSCCLPP_INIT, "TcpBootstrap : Using%s", line); - netInitialized = true; } #define TIMEOUT(__exp) \ @@ -343,13 +362,13 @@ void TcpBootstrap::Impl::establishConnections(int64_t timeoutSec) { uint64_t magic = uniqueId_.magic; // Create socket for other ranks to contact me listenSock_ = std::make_unique(&netIfAddr_, magic, SocketTypeBootstrap, abortFlag_); - listenSock_->listen(); + listenSock_->bindAndListen(); info.extAddressListen = listenSock_->getAddr(); { // Create socket for root to contact me Socket lsock(&netIfAddr_, magic, SocketTypeBootstrap, abortFlag_); - lsock.listen(); + lsock.bindAndListen(); info.extAddressListenRoot = lsock.getAddr(); // stagger connection times to avoid an overload of the root @@ -484,9 +503,9 @@ void TcpBootstrap::Impl::close() { peerRecvSockets_.clear(); } -MSCCLPP_API_CPP TcpBootstrap::TcpBootstrap() { pimpl_ = std::make_unique(); } +MSCCLPP_API_CPP UniqueId TcpBootstrap::createUniqueId() { return Impl::createUniqueId(); } -MSCCLPP_API_CPP UniqueId TcpBootstrap::createUniqueId() { return pimpl_->createUniqueId(); } +MSCCLPP_API_CPP TcpBootstrap::TcpBootstrap(int rank, int nRanks) { pimpl_ = std::make_unique(rank, nRanks); } MSCCLPP_API_CPP UniqueId TcpBootstrap::getUniqueId() const { return pimpl_->getUniqueId(); } @@ -504,12 +523,12 @@ MSCCLPP_API_CPP void TcpBootstrap::recv(void* data, int size, int peer, int tag) MSCCLPP_API_CPP void TcpBootstrap::allGather(void* allData, int size) { pimpl_->allGather(allData, size); } -MSCCLPP_API_CPP void TcpBootstrap::initialize(UniqueId uniqueId, int rank, int nRanks, int64_t timeoutSec) { - pimpl_->initialize(uniqueId, rank, nRanks, timeoutSec); +MSCCLPP_API_CPP void TcpBootstrap::initialize(UniqueId uniqueId, int64_t timeoutSec) { + pimpl_->initialize(uniqueId, timeoutSec); } -MSCCLPP_API_CPP void TcpBootstrap::initialize(const std::string& ipPortPair, int rank, int nRanks, int64_t timeoutSec) { - pimpl_->initialize(ipPortPair, rank, nRanks, timeoutSec); +MSCCLPP_API_CPP void TcpBootstrap::initialize(const std::string& ipPortPair, int64_t timeoutSec) { + pimpl_->initialize(ipPortPair, timeoutSec); } MSCCLPP_API_CPP void TcpBootstrap::barrier() { pimpl_->barrier(); } diff --git a/src/bootstrap/socket.cc 
b/src/bootstrap/socket.cc index 2267af9b3..a79821f1b 100644 --- a/src/bootstrap/socket.cc +++ b/src/bootstrap/socket.cc @@ -390,7 +390,7 @@ Socket::Socket(const SocketAddress* addr, uint64_t magic, enum SocketType type, Socket::~Socket() { close(); } -void Socket::listen() { +void Socket::bind() { if (fd_ == -1) { throw Error("file descriptor is -1", ErrorCode::InvalidUsage); } @@ -433,7 +433,11 @@ void Socket::listen() { if (::getsockname(fd_, &addr_.sa, &size) != 0) { throw SysError("getsockname failed", errno); } + state_ = SocketStateBound; +} +void Socket::bindAndListen() { + bind(); #ifdef ENABLE_TRACE char line[SOCKET_NAME_MAXLEN + 1]; TRACE(MSCCLPP_INIT | MSCCLPP_NET, "Listening on socket %s", SocketToString(&addr_, line)); diff --git a/src/include/socket.h b/src/include/socket.h index 9f043414e..ed125c990 100644 --- a/src/include/socket.h +++ b/src/include/socket.h @@ -35,10 +35,11 @@ enum SocketState { SocketStateConnecting = 4, SocketStateConnectPolling = 5, SocketStateConnected = 6, - SocketStateReady = 7, - SocketStateClosed = 8, - SocketStateError = 9, - SocketStateNum = 10 + SocketStateBound = 7, + SocketStateReady = 8, + SocketStateClosed = 9, + SocketStateError = 10, + SocketStateNum = 11 }; enum SocketType { @@ -62,7 +63,8 @@ class Socket { enum SocketType type = SocketTypeUnknown, volatile uint32_t* abortFlag = nullptr, int asyncFlag = 0); ~Socket(); - void listen(); + void bind(); + void bindAndListen(); void connect(int64_t timeout = -1); void accept(const Socket* listenSocket, int64_t timeout = -1); void send(void* ptr, int size); diff --git a/test/allgather_test_cpp.cu b/test/allgather_test_cpp.cu index aeb8003d9..2f56b221d 100644 --- a/test/allgather_test_cpp.cu +++ b/test/allgather_test_cpp.cu @@ -393,8 +393,8 @@ int main(int argc, const char* argv[]) { try { if (rank == 0) printf("Initializing MSCCL++\n"); - auto bootstrap = std::make_shared(); - bootstrap->initialize(ip_port, rank, world_size); + auto bootstrap = std::make_shared(rank, world_size); + bootstrap->initialize(ip_port); mscclpp::Communicator comm(bootstrap); mscclpp::ProxyService proxyService; diff --git a/test/allgather_test_host_offloading.cu b/test/allgather_test_host_offloading.cu index d186f6be7..7f50994b9 100644 --- a/test/allgather_test_host_offloading.cu +++ b/test/allgather_test_host_offloading.cu @@ -228,11 +228,11 @@ int main(int argc, char* argv[]) { MSCCLPP_CUDATHROW(cudaSetDevice(cudaNum)); if (rank == 0) printf("Initializing MSCCL++\n"); - auto bootstrap = std::make_shared(); + auto bootstrap = std::make_shared(rank, world_size); mscclpp::UniqueId uniqueId; if (rank == 0) uniqueId = bootstrap->createUniqueId(); MPI_Bcast(&uniqueId, sizeof(uniqueId), MPI_BYTE, 0, MPI_COMM_WORLD); - bootstrap->initialize(uniqueId, rank, world_size); + bootstrap->initialize(uniqueId); mscclpp::Communicator comm(bootstrap); int* data_d; diff --git a/test/mp_unit/bootstrap_tests.cc b/test/mp_unit/bootstrap_tests.cc index c303662aa..82120a1f7 100644 --- a/test/mp_unit/bootstrap_tests.cc +++ b/test/mp_unit/bootstrap_tests.cc @@ -49,17 +49,17 @@ void BootstrapTest::bootstrapTestAll(std::shared_ptr bootstr } TEST_F(BootstrapTest, WithId) { - auto bootstrap = std::make_shared(); + auto bootstrap = std::make_shared(gEnv->rank, gEnv->worldSize); mscclpp::UniqueId id; if (bootstrap->getRank() == 0) id = bootstrap->createUniqueId(); MPI_Bcast(&id, sizeof(id), MPI_BYTE, 0, MPI_COMM_WORLD); - bootstrap->initialize(id, gEnv->rank, gEnv->worldSize); + bootstrap->initialize(id); bootstrapTestAll(bootstrap); } 
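The listen() split into bind() and bindAndListen() above is what lets a unique ID carry a concrete port before any listener exists: assignPortToUniqueId() binds to port 0 so the kernel picks a free port, then reads the chosen address back. A sketch of that idiom in raw POSIX calls; the mscclpp Socket wraps the same steps, and error handling is trimmed here for brevity:

    #include <arpa/inet.h>
    #include <netinet/in.h>
    #include <sys/socket.h>
    #include <unistd.h>

    // Bind to port 0: the kernel assigns a free ephemeral port, and
    // getsockname() reads it back. Note the reservation ends when the fd is
    // closed; the bootstrap later re-binds the same port to actually listen.
    in_port_t pickEphemeralPort() {
      int fd = ::socket(AF_INET, SOCK_STREAM, 0);
      if (fd < 0) return 0;
      sockaddr_in addr{};
      addr.sin_family = AF_INET;
      addr.sin_addr.s_addr = htonl(INADDR_ANY);
      addr.sin_port = 0;  // 0 means "kernel chooses"
      ::bind(fd, reinterpret_cast<sockaddr*>(&addr), sizeof(addr));
      socklen_t len = sizeof(addr);
      ::getsockname(fd, reinterpret_cast<sockaddr*>(&addr), &len);
      ::close(fd);
      return ntohs(addr.sin_port);
    }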
TEST_F(BootstrapTest, WithIpPortPair) { - auto bootstrap = std::make_shared(); - bootstrap->initialize(gEnv->args["ip_port"], gEnv->rank, gEnv->worldSize); + auto bootstrap = std::make_shared(gEnv->rank, gEnv->worldSize); + bootstrap->initialize(gEnv->args["ip_port"]); bootstrapTestAll(bootstrap); } @@ -68,23 +68,23 @@ TEST_F(BootstrapTest, ResumeWithId) { bootstrapTestTimer.set(300); for (int i = 0; i < 3000; ++i) { - auto bootstrap = std::make_shared(); + auto bootstrap = std::make_shared(gEnv->rank, gEnv->worldSize); mscclpp::UniqueId id; if (bootstrap->getRank() == 0) id = bootstrap->createUniqueId(); MPI_Bcast(&id, sizeof(id), MPI_BYTE, 0, MPI_COMM_WORLD); - bootstrap->initialize(id, gEnv->rank, gEnv->worldSize, 300); + bootstrap->initialize(id, 300); } } TEST_F(BootstrapTest, ResumeWithIpPortPair) { for (int i = 0; i < 5; ++i) { - auto bootstrap = std::make_shared(); - bootstrap->initialize(gEnv->args["ip_port"], gEnv->rank, gEnv->worldSize); + auto bootstrap = std::make_shared(gEnv->rank, gEnv->worldSize); + bootstrap->initialize(gEnv->args["ip_port"]); } } TEST_F(BootstrapTest, ExitBeforeConnect) { - auto bootstrap = std::make_shared(); + auto bootstrap = std::make_shared(gEnv->rank, gEnv->worldSize); bootstrap->createUniqueId(); } @@ -92,12 +92,12 @@ TEST_F(BootstrapTest, TimeoutWithId) { mscclpp::Timer timer; // All ranks initialize a bootstrap with their own id (will hang) - auto bootstrap = std::make_shared(); + auto bootstrap = std::make_shared(gEnv->rank, gEnv->worldSize); mscclpp::UniqueId id = bootstrap->createUniqueId(); try { // Set bootstrap timeout to 1 second - bootstrap->initialize(id, gEnv->rank, gEnv->worldSize, 1); + bootstrap->initialize(id, 1); } catch (const mscclpp::Error& e) { ASSERT_EQ(e.getErrorCode(), mscclpp::ErrorCode::Timeout); } diff --git a/test/mp_unit/communicator_tests.cu b/test/mp_unit/communicator_tests.cu index beb47ebc7..30727667d 100644 --- a/test/mp_unit/communicator_tests.cu +++ b/test/mp_unit/communicator_tests.cu @@ -22,7 +22,7 @@ void CommunicatorTestBase::SetUp() { std::shared_ptr bootstrap; mscclpp::UniqueId id; if (gEnv->rank < numRanksToUse) { - bootstrap = std::make_shared(); + bootstrap = std::make_shared(gEnv->rank, numRanksToUse); if (gEnv->rank == 0) id = bootstrap->createUniqueId(); } MPI_Bcast(&id, sizeof(id), MPI_BYTE, 0, MPI_COMM_WORLD); @@ -30,7 +30,7 @@ void CommunicatorTestBase::SetUp() { if (gEnv->rank >= numRanksToUse) { return; } - bootstrap->initialize(id, gEnv->rank, numRanksToUse); + bootstrap->initialize(id); communicator = std::make_shared(bootstrap); } diff --git a/test/mp_unit/ib_tests.cu b/test/mp_unit/ib_tests.cu index 605019cd6..e878154d7 100644 --- a/test/mp_unit/ib_tests.cu +++ b/test/mp_unit/ib_tests.cu @@ -24,7 +24,7 @@ void IbPeerToPeerTest::SetUp() { if (gEnv->rank < 2) { // This test needs only two ranks - bootstrap = std::make_shared(); + bootstrap = std::make_shared(gEnv->rank, 2); if (bootstrap->getRank() == 0) id = bootstrap->createUniqueId(); } MPI_Bcast(&id, sizeof(id), MPI_BYTE, 0, MPI_COMM_WORLD); @@ -33,7 +33,7 @@ void IbPeerToPeerTest::SetUp() { return; } - bootstrap->initialize(id, gEnv->rank, 2); + bootstrap->initialize(id); ibCtx = std::make_shared(ibDevName); qp = ibCtx->createQp(1024, 1, 8192, 0, 64); diff --git a/test/mscclpp-test/common.cc b/test/mscclpp-test/common.cc index d3dd77327..c5653b3fc 100644 --- a/test/mscclpp-test/common.cc +++ b/test/mscclpp-test/common.cc @@ -324,11 +324,11 @@ void BaseTestEngine::runTest() { } void BaseTestEngine::bootstrap() { - auto bootstrap = 
std::make_shared(); + auto bootstrap = std::make_shared(args_.rank, args_.totalRanks); mscclpp::UniqueId id; if (bootstrap->getRank() == 0) id = bootstrap->createUniqueId(); MPI_Bcast(&id, sizeof(id), MPI_BYTE, 0, MPI_COMM_WORLD); - bootstrap->initialize(id, args_.rank, args_.totalRanks); + bootstrap->initialize(id); comm_ = std::make_shared(bootstrap); } diff --git a/test/nccl_api_test.cc b/test/nccl_api_test.cc new file mode 100644 index 000000000..e69de29bb diff --git a/test/unit/core_tests.cc b/test/unit/core_tests.cc index 745f71079..90da5dd7f 100644 --- a/test/unit/core_tests.cc +++ b/test/unit/core_tests.cc @@ -9,8 +9,8 @@ class LocalCommunicatorTest : public ::testing::Test { protected: void SetUp() override { - bootstrap = std::make_shared(); - bootstrap->initialize(bootstrap->createUniqueId(), 0, 1); + bootstrap = std::make_shared(0, 1); + bootstrap->initialize(bootstrap->createUniqueId()); comm = std::make_shared(bootstrap); } diff --git a/test/unit/socket_tests.cc b/test/unit/socket_tests.cc index fe0a063e5..4fa8d3915 100644 --- a/test/unit/socket_tests.cc +++ b/test/unit/socket_tests.cc @@ -17,7 +17,7 @@ TEST(Socket, ListenAndConnect) { ASSERT_NO_THROW(mscclpp::SocketGetAddrFromString(&listenAddr, ipPortPair.c_str())); mscclpp::Socket listenSock(&listenAddr); - listenSock.listen(); + listenSock.bindAndListen(); std::thread clientThread([&listenAddr]() { mscclpp::Socket sock(&listenAddr); From ad926786e9abc52518f0a6ecaaa528fd91fbf8d8 Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Wed, 7 Feb 2024 11:27:46 +0000 Subject: [PATCH 29/89] wip --- python/mscclpp/core_py.cpp | 2 +- src/bootstrap/bootstrap.cc | 11 +++++++++-- test/CMakeLists.txt | 5 +++-- test/nccl_api_test.cc | 21 +++++++++++++++++++++ 4 files changed, 34 insertions(+), 5 deletions(-) diff --git a/python/mscclpp/core_py.cpp b/python/mscclpp/core_py.cpp index 5fd4bd317..1a1cd2780 100644 --- a/python/mscclpp/core_py.cpp +++ b/python/mscclpp/core_py.cpp @@ -63,7 +63,7 @@ void register_core(nb::module_& m) { .def_static( "create", [](int rank, int nRanks) { return std::make_shared(rank, nRanks); }, nb::arg("rank"), nb::arg("nRanks")) - .def("create_unique_id", &TcpBootstrap::createUniqueId) + .def_static("create_unique_id", &TcpBootstrap::createUniqueId) .def("get_unique_id", &TcpBootstrap::getUniqueId) .def("initialize", static_cast(&TcpBootstrap::initialize), nb::call_guard(), nb::arg("uniqueId"), nb::arg("timeoutSec") = 30) diff --git a/src/bootstrap/bootstrap.cc b/src/bootstrap/bootstrap.cc index fd03a592b..6ae8b2c10 100644 --- a/src/bootstrap/bootstrap.cc +++ b/src/bootstrap/bootstrap.cc @@ -154,7 +154,11 @@ int TcpBootstrap::Impl::getRank() { return rank_; } int TcpBootstrap::Impl::getNranks() { return nRanks_; } void TcpBootstrap::Impl::initialize(const UniqueId& uniqueId, int64_t timeoutSec) { - netInit("", "", netIfAddr_); + if (!netInitialized) { + netInit("", "", netIfAddr_); + netInitialized = true; + } + std::memcpy(&uniqueId_, &uniqueId, sizeof(uniqueId_)); if (rank_ == 0) { bootstrapCreateRoot(); @@ -182,7 +186,10 @@ void TcpBootstrap::Impl::initialize(const std::string& ifIpPortTrio, int64_t tim ipPortPair = ifIpPortTrio.substr(ipPortPair.find_first_of(':') + 1); } - netInit(ipPortPair, interface, netIfAddr_); + if (!netInitialized) { + netInit("", "", netIfAddr_); + netInitialized = true; + } uniqueId_.magic = 0xdeadbeef; std::memcpy(&uniqueId_.addr, &netIfAddr_, sizeof(SocketAddress)); diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index ef85cde5a..1c8cb2b04 100644 --- a/test/CMakeLists.txt +++ 
b/test/CMakeLists.txt @@ -3,9 +3,9 @@ find_package(MPI) -set(TEST_LIBS_COMMON mscclpp ${GPU_LIBRARIES} ${NUMA_LIBRARIES} ${IBVERBS_LIBRARIES} Threads::Threads) +set(TEST_LIBS_COMMON mscclpp_nccl mscclpp ${GPU_LIBRARIES} ${NUMA_LIBRARIES} ${IBVERBS_LIBRARIES} Threads::Threads) set(TEST_LIBS_GTEST GTest::gtest_main GTest::gmock_main) -set(TEST_INC_COMMON PRIVATE ${PROJECT_SOURCE_DIR}/include ${GPU_INCLUDE_DIRS}) +set(TEST_INC_COMMON PRIVATE ${PROJECT_SOURCE_DIR}/include ${PROJECT_SOURCE_DIR}/apps/nccl/include ${GPU_INCLUDE_DIRS}) set(TEST_INC_INTERNAL PRIVATE ${PROJECT_SOURCE_DIR}/src/include) if(USE_ROCM) @@ -24,6 +24,7 @@ endfunction() add_test_executable(allgather_test_cpp allgather_test_cpp.cu) add_test_executable(allgather_test_host_offloading allgather_test_host_offloading.cu) add_test_executable(nvls_test nvls_test.cu) +add_test_executable(nccl_api_test nccl_api_test.cc) configure_file(run_mpi_test.sh.in run_mpi_test.sh) diff --git a/test/nccl_api_test.cc b/test/nccl_api_test.cc index e69de29bb..80ae8dee5 100644 --- a/test/nccl_api_test.cc +++ b/test/nccl_api_test.cc @@ -0,0 +1,21 @@ + +#include + +#include "nccl.h" + +int main(int argc, char** argv) { + int rank, world_size; + MPI_Init(&argc, &argv); + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + MPI_Comm_size(MPI_COMM_WORLD, &world_size); + ncclUniqueId id; + if (rank == 0) { + ncclGetUniqueId(&id); + } + MPI_Bcast(&id, sizeof(id), MPI_BYTE, 0, MPI_COMM_WORLD); + ncclComm_t comm; + ncclCommInitRank(&comm, world_size, id, rank); + + MPI_Finalize(); + return 0; +} From 1b4a8fb929fb3897e8a1b79d88de199bc3c93177 Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Wed, 7 Feb 2024 11:32:18 +0000 Subject: [PATCH 30/89] update --- test/nccl_api_test.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/test/nccl_api_test.cc b/test/nccl_api_test.cc index 80ae8dee5..8590e4c87 100644 --- a/test/nccl_api_test.cc +++ b/test/nccl_api_test.cc @@ -15,6 +15,8 @@ int main(int argc, char** argv) { MPI_Bcast(&id, sizeof(id), MPI_BYTE, 0, MPI_COMM_WORLD); ncclComm_t comm; ncclCommInitRank(&comm, world_size, id, rank); + // ncclAllReduce(nullptr, nullptr, 1, ncclInt, ncclSum, comm, 0, MPI_COMM_WORLD); + ncclCommDestroy(comm); MPI_Finalize(); return 0; From 8a4c0be85d547cbdd5d74150ee20e8d44e029d42 Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Wed, 7 Feb 2024 19:47:20 +0000 Subject: [PATCH 31/89] borrow simple test code --- apps/nccl/src/CMakeLists.txt | 0 apps/nccl/src/nccl.cpp | 14 ++++ test/nccl_api_test.cc | 140 +++++++++++++++++++++++++++++++---- 3 files changed, 140 insertions(+), 14 deletions(-) delete mode 100644 apps/nccl/src/CMakeLists.txt diff --git a/apps/nccl/src/CMakeLists.txt b/apps/nccl/src/CMakeLists.txt deleted file mode 100644 index e69de29bb..000000000 diff --git a/apps/nccl/src/nccl.cpp b/apps/nccl/src/nccl.cpp index 6164b0aea..00f9f2e92 100644 --- a/apps/nccl/src/nccl.cpp +++ b/apps/nccl/src/nccl.cpp @@ -45,3 +45,17 @@ NCCL_API ncclResult_t ncclCommDestroy(ncclComm_t comm) { delete comm; return ncclSuccess; } + +NCCL_API const char* ncclGetErrorString(ncclResult_t result) { + switch (result) { + case ncclSuccess : return "no error"; + case ncclUnhandledCudaError : return "unhandled cuda error (run with NCCL_DEBUG=INFO for details)"; + case ncclSystemError : return "unhandled system error (run with NCCL_DEBUG=INFO for details)"; + case ncclInternalError : return "internal error - please report this issue to the NCCL developers"; + case ncclInvalidArgument : return "invalid argument (run with NCCL_DEBUG=WARN for details)"; + case 
ncclInvalidUsage : return "invalid usage (run with NCCL_DEBUG=WARN for details)"; + case ncclRemoteError : return "remote process exited or there was a network error"; + case ncclInProgress : return "NCCL operation in progress"; + default : return "unknown result code"; + } +} diff --git a/test/nccl_api_test.cc b/test/nccl_api_test.cc index 8590e4c87..8f0891044 100644 --- a/test/nccl_api_test.cc +++ b/test/nccl_api_test.cc @@ -1,23 +1,135 @@ +// Code borrowed from https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/examples.html -#include - +#include +#include "cuda_runtime.h" #include "nccl.h" +#include "mpi.h" +#include +#include +#include -int main(int argc, char** argv) { - int rank, world_size; - MPI_Init(&argc, &argv); - MPI_Comm_rank(MPI_COMM_WORLD, &rank); - MPI_Comm_size(MPI_COMM_WORLD, &world_size); - ncclUniqueId id; - if (rank == 0) { - ncclGetUniqueId(&id); + +#define MPICHECK(cmd) do { \ + int e = cmd; \ + if( e != MPI_SUCCESS ) { \ + printf("Failed: MPI error %s:%d '%d'\n", \ + __FILE__,__LINE__, e); \ + exit(EXIT_FAILURE); \ + } \ +} while(0) + + +#define CUDACHECK(cmd) do { \ + cudaError_t e = cmd; \ + if( e != cudaSuccess ) { \ + printf("Failed: Cuda error %s:%d '%s'\n", \ + __FILE__,__LINE__,cudaGetErrorString(e)); \ + exit(EXIT_FAILURE); \ + } \ +} while(0) + + +#define NCCLCHECK(cmd) do { \ + ncclResult_t r = cmd; \ + if (r!= ncclSuccess) { \ + printf("Failed, NCCL error %s:%d '%s'\n", \ + __FILE__,__LINE__,ncclGetErrorString(r)); \ + exit(EXIT_FAILURE); \ + } \ +} while(0) + + +static uint64_t getHostHash(const char* string) { + // Based on DJB2a, result = result * 33 ^ char + uint64_t result = 5381; + for (int c = 0; string[c] != '\0'; c++){ + result = ((result << 5) + result) ^ string[c]; + } + return result; +} + + +static void getHostName(char* hostname, int maxlen) { + gethostname(hostname, maxlen); + for (int i=0; i< maxlen; i++) { + if (hostname[i] == '.') { + hostname[i] = '\0'; + return; + } } - MPI_Bcast(&id, sizeof(id), MPI_BYTE, 0, MPI_COMM_WORLD); +} + + +int main(int argc, char* argv[]) +{ + int size = 32*1024*1024; + + + int myRank, nRanks, localRank = 0; + + + //initializing MPI + MPICHECK(MPI_Init(&argc, &argv)); + MPICHECK(MPI_Comm_rank(MPI_COMM_WORLD, &myRank)); + MPICHECK(MPI_Comm_size(MPI_COMM_WORLD, &nRanks)); + + + //calculating localRank based on hostname which is used in selecting a GPU + uint64_t hostHashs[nRanks]; + char hostname[1024]; + getHostName(hostname, 1024); + hostHashs[myRank] = getHostHash(hostname); + MPICHECK(MPI_Allgather(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, hostHashs, sizeof(uint64_t), MPI_BYTE, MPI_COMM_WORLD)); + for (int p=0; p Date: Wed, 7 Feb 2024 19:53:33 +0000 Subject: [PATCH 32/89] Lint --- apps/nccl/src/nccl.cpp | 5 ++ test/nccl_api_test.cc | 124 +++++++++++++++++------------------------ 2 files changed, 57 insertions(+), 72 deletions(-) diff --git a/apps/nccl/src/nccl.cpp b/apps/nccl/src/nccl.cpp index 00f9f2e92..ff579e900 100644 --- a/apps/nccl/src/nccl.cpp +++ b/apps/nccl/src/nccl.cpp @@ -59,3 +59,8 @@ NCCL_API const char* ncclGetErrorString(ncclResult_t result) { default : return "unknown result code"; } } + +NCCL_API ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count, + ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream) { + return ncclSuccess; +} diff --git a/test/nccl_api_test.cc b/test/nccl_api_test.cc index 8f0891044..a2113aea3 100644 --- a/test/nccl_api_test.cc +++ b/test/nccl_api_test.cc @@ -1,135 +1,115 @@ // Code borrowed from 
https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/examples.html -#include -#include "cuda_runtime.h" -#include "nccl.h" -#include "mpi.h" -#include #include +#include #include +#include +#include "cuda_runtime.h" +#include "mpi.h" +#include "nccl.h" -#define MPICHECK(cmd) do { \ - int e = cmd; \ - if( e != MPI_SUCCESS ) { \ - printf("Failed: MPI error %s:%d '%d'\n", \ - __FILE__,__LINE__, e); \ - exit(EXIT_FAILURE); \ - } \ -} while(0) - - -#define CUDACHECK(cmd) do { \ - cudaError_t e = cmd; \ - if( e != cudaSuccess ) { \ - printf("Failed: Cuda error %s:%d '%s'\n", \ - __FILE__,__LINE__,cudaGetErrorString(e)); \ - exit(EXIT_FAILURE); \ - } \ -} while(0) - - -#define NCCLCHECK(cmd) do { \ - ncclResult_t r = cmd; \ - if (r!= ncclSuccess) { \ - printf("Failed, NCCL error %s:%d '%s'\n", \ - __FILE__,__LINE__,ncclGetErrorString(r)); \ - exit(EXIT_FAILURE); \ - } \ -} while(0) - +#define MPICHECK(cmd) \ + do { \ + int e = cmd; \ + if (e != MPI_SUCCESS) { \ + printf("Failed: MPI error %s:%d '%d'\n", __FILE__, __LINE__, e); \ + exit(EXIT_FAILURE); \ + } \ + } while (0) + +#define CUDACHECK(cmd) \ + do { \ + cudaError_t e = cmd; \ + if (e != cudaSuccess) { \ + printf("Failed: Cuda error %s:%d '%s'\n", __FILE__, __LINE__, cudaGetErrorString(e)); \ + exit(EXIT_FAILURE); \ + } \ + } while (0) + +#define NCCLCHECK(cmd) \ + do { \ + ncclResult_t r = cmd; \ + if (r != ncclSuccess) { \ + printf("Failed, NCCL error %s:%d '%s'\n", __FILE__, __LINE__, ncclGetErrorString(r)); \ + exit(EXIT_FAILURE); \ + } \ + } while (0) static uint64_t getHostHash(const char* string) { // Based on DJB2a, result = result * 33 ^ char uint64_t result = 5381; - for (int c = 0; string[c] != '\0'; c++){ + for (int c = 0; string[c] != '\0'; c++) { result = ((result << 5) + result) ^ string[c]; } return result; } - static void getHostName(char* hostname, int maxlen) { gethostname(hostname, maxlen); - for (int i=0; i< maxlen; i++) { + for (int i = 0; i < maxlen; i++) { if (hostname[i] == '.') { - hostname[i] = '\0'; - return; + hostname[i] = '\0'; + return; } } } - -int main(int argc, char* argv[]) -{ - int size = 32*1024*1024; - +int main(int argc, char* argv[]) { + int size = 32 * 1024 * 1024; int myRank, nRanks, localRank = 0; - - //initializing MPI + // initializing MPI MPICHECK(MPI_Init(&argc, &argv)); MPICHECK(MPI_Comm_rank(MPI_COMM_WORLD, &myRank)); MPICHECK(MPI_Comm_size(MPI_COMM_WORLD, &nRanks)); - - //calculating localRank based on hostname which is used in selecting a GPU + // calculating localRank based on hostname which is used in selecting a GPU uint64_t hostHashs[nRanks]; char hostname[1024]; getHostName(hostname, 1024); hostHashs[myRank] = getHostHash(hostname); MPICHECK(MPI_Allgather(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, hostHashs, sizeof(uint64_t), MPI_BYTE, MPI_COMM_WORLD)); - for (int p=0; p Date: Wed, 7 Feb 2024 23:03:16 +0000 Subject: [PATCH 33/89] all-reduce wip --- apps/nccl/CMakeLists.txt | 2 +- apps/nccl/src/allreduce.cu | 2 + apps/nccl/src/nccl.cpp | 66 ---------- apps/nccl/src/nccl.cu | 252 +++++++++++++++++++++++++++++++++++++ 4 files changed, 255 insertions(+), 67 deletions(-) create mode 100644 apps/nccl/src/allreduce.cu delete mode 100644 apps/nccl/src/nccl.cpp create mode 100644 apps/nccl/src/nccl.cu diff --git a/apps/nccl/CMakeLists.txt b/apps/nccl/CMakeLists.txt index d92ded891..83a797c91 100644 --- a/apps/nccl/CMakeLists.txt +++ b/apps/nccl/CMakeLists.txt @@ -1,7 +1,7 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. 
-file(GLOB_RECURSE SOURCES CONFIGURE_DEPENDS src/*.cpp) +file(GLOB_RECURSE SOURCES CONFIGURE_DEPENDS src/nccl.cu src/allreduce.cu) file(GLOB_RECURSE HEADERS CONFIGURE_DEPENDS include/nccl.h) add_library(mscclpp_nccl_obj OBJECT) diff --git a/apps/nccl/src/allreduce.cu b/apps/nccl/src/allreduce.cu new file mode 100644 index 000000000..fd2aca680 --- /dev/null +++ b/apps/nccl/src/allreduce.cu @@ -0,0 +1,2 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. diff --git a/apps/nccl/src/nccl.cpp b/apps/nccl/src/nccl.cpp deleted file mode 100644 index ff579e900..000000000 --- a/apps/nccl/src/nccl.cpp +++ /dev/null @@ -1,66 +0,0 @@ -// Copyright (c) Microsoft Corporation. -// Licensed under the MIT license. - -#include "nccl.h" - -#include - -#define NCCL_API extern "C" __attribute__((visibility("default"))) - -struct ncclComm { - std::shared_ptr comm; - // we need a map for registered buffers -}; - -NCCL_API ncclResult_t ncclGetVersion(int* version) { - if (version == nullptr) return ncclInvalidArgument; - *version = MSCCLPP_VERSION; - return ncclSuccess; -} - -NCCL_API ncclResult_t ncclGetUniqueId(ncclUniqueId* uniqueId) { - if (uniqueId == nullptr) return ncclInvalidArgument; - if (MSCCLPP_UNIQUE_ID_BYTES != NCCL_UNIQUE_ID_BYTES) return ncclInternalError; - mscclpp::UniqueId id = mscclpp::TcpBootstrap::createUniqueId(); - memcpy(uniqueId, &id, sizeof(ncclUniqueId)); - return ncclSuccess; -} - -NCCL_API ncclResult_t ncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank) { - if (comm == nullptr) return ncclInvalidArgument; - if (nranks < 0 || rank < 0 || rank >= nranks) return ncclInvalidArgument; - std::shared_ptr bootstrap = std::make_shared(rank, nranks); - mscclpp::UniqueId id; - memcpy(id.data(), &commId, sizeof(ncclUniqueId)); - bootstrap->initialize(id); - std::shared_ptr mscclppComm = std::make_shared(bootstrap); - ncclComm* comm_ptr = new ncclComm(); - comm_ptr->comm = mscclppComm; - *comm = comm_ptr; - return ncclSuccess; -} - -NCCL_API ncclResult_t ncclCommDestroy(ncclComm_t comm) { - if (comm == nullptr) return ncclInvalidArgument; - delete comm; - return ncclSuccess; -} - -NCCL_API const char* ncclGetErrorString(ncclResult_t result) { - switch (result) { - case ncclSuccess : return "no error"; - case ncclUnhandledCudaError : return "unhandled cuda error (run with NCCL_DEBUG=INFO for details)"; - case ncclSystemError : return "unhandled system error (run with NCCL_DEBUG=INFO for details)"; - case ncclInternalError : return "internal error - please report this issue to the NCCL developers"; - case ncclInvalidArgument : return "invalid argument (run with NCCL_DEBUG=WARN for details)"; - case ncclInvalidUsage : return "invalid usage (run with NCCL_DEBUG=WARN for details)"; - case ncclRemoteError : return "remote process exited or there was a network error"; - case ncclInProgress : return "NCCL operation in progress"; - default : return "unknown result code"; - } -} - -NCCL_API ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count, - ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream) { - return ncclSuccess; -} diff --git a/apps/nccl/src/nccl.cu b/apps/nccl/src/nccl.cu new file mode 100644 index 000000000..1f0e25010 --- /dev/null +++ b/apps/nccl/src/nccl.cu @@ -0,0 +1,252 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. 
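The new nccl.cu that continues below builds its allreduce on mscclpp's low-latency (LL) packet channels: each packet couples payload words with flag words so data and readiness arrive in one store. The layout and spin-wait in this sketch mirror what mscclpp::LLPacket's write()/read() do, but take it as an illustration rather than the library's exact definition:

    #include <cstdint>

    // 16B packet: two 32-bit payload words, each tagged with a flag word.
    // The writer emits payload+flags in a single vectorized store; the reader
    // spins until both flags match, so no separate ready-barrier is needed.
    struct LLPacketSketch {
      uint32_t data1, flag1, data2, flag2;

      __device__ void write(uint32_t d1, uint32_t d2, uint32_t flag) {
        asm volatile("st.volatile.global.v4.u32 [%0], {%1,%2,%3,%4};" ::"l"(this),
                     "r"(d1), "r"(flag), "r"(d2), "r"(flag));
      }

      __device__ uint2 read(uint32_t flag) {
        uint4 v;
        do {
          asm volatile("ld.volatile.global.v4.u32 {%0,%1,%2,%3}, [%4];"
                       : "=r"(v.x), "=r"(v.y), "=r"(v.z), "=r"(v.w)
                       : "l"(this));
        } while (v.y != flag || v.w != flag);  // spin until both flags arrive
        return make_uint2(v.x, v.z);           // drop the flags, keep payload
      }
    };

Flipping the flag value on every call (the globalFlag counter in the kernel below) doubles as cheap invalidation: packets left over from a previous invocation can never match the current flag, so no cleanup pass over the scratch buffer is required.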
+ +#include "nccl.h" + +#include +#include +#include +#include +#include +#include + +#define NCCL_API extern "C" __attribute__((visibility("default"))) + +#define CUDACHECK(cmd) \ + do { \ + cudaError_t e = cmd; \ + if (e != cudaSuccess) { \ + printf("Failed: Cuda error %s:%d '%s'\n", __FILE__, __LINE__, cudaGetErrorString(e)); \ + exit(EXIT_FAILURE); \ + } \ + } while (0) + +// TODO: +static const int nRanksPerNode = 8; + +static const mscclpp::Transport IBs[] = {mscclpp::Transport::IB0, mscclpp::Transport::IB1, mscclpp::Transport::IB2, + mscclpp::Transport::IB3, mscclpp::Transport::IB4, mscclpp::Transport::IB5, + mscclpp::Transport::IB6, mscclpp::Transport::IB7}; + +__constant__ mscclpp::DeviceHandle constSmChannels[8]; + +struct ncclComm { + std::shared_ptr comm; + std::unordered_map registeredMemories; + std::vector> connections; + std::vector smChannels; + std::shared_ptr scratchBuff; +}; + +cudaError_t allreduce(int* buff, int* scratch, void* resultBuff, int rank, int nRanksPerNode, int worldSize, + size_t nelems, cudaStream_t stream); + +#include +#include + +// extern __constant__ mscclpp::SmChannelDeviceHandle *constSmChannels; +__device__ uint64_t globalFlag; + +__global__ void allreduce6(int* buff, int* scratch, void* resultBuff, int rank, int nRanksPerNode, int worldSize, + size_t nelems) { + // This version of allreduce only works for single nodes + if (worldSize != nRanksPerNode) return; + const int nPeers = nRanksPerNode - 1; + const int nPkts = nelems / 2; + const int nelemsPerRank = nelems / worldSize; + const int nPktsPerRank = nelemsPerRank / 2; + // flag for packets. Initially 1 + const uint32_t flag = (uint32_t)globalFlag; + // thread block & channel info + const int nBlocksPerPeer = gridDim.x / nPeers; + const int localBlockIdx = blockIdx.x % nBlocksPerPeer; + const int peerIdx = blockIdx.x / nBlocksPerPeer; + const int remoteRank = peerIdx < rank ? peerIdx : peerIdx + 1; + mscclpp::SmChannelDeviceHandle smChan = constSmChannels[peerIdx]; + const int tid = threadIdx.x + localBlockIdx * blockDim.x; + // double buffering + size_t scratchBaseOffset = (flag & 1) ? 0 : nPkts * sizeof(mscclpp::LLPacket); + void* scratchBuff = (void*)((char*)scratch + scratchBaseOffset); + size_t scratchOffset = scratchBaseOffset + rank * nPktsPerRank * sizeof(mscclpp::LLPacket); + size_t scratchResultOffset = + (flag & 1) ? 2 * nPkts * sizeof(mscclpp::LLPacket) : 3 * nPkts * sizeof(mscclpp::LLPacket); + size_t srcOffset = remoteRank * nelemsPerRank * sizeof(int); + uint2* src = (uint2*)((char*)buff + rank * nelemsPerRank * sizeof(int)); + uint2* dst = (uint2*)((char*)resultBuff + rank * nelemsPerRank * sizeof(int)); + + // step 1: write to scratch buffer + smChan.putPackets(scratchOffset, srcOffset, nelemsPerRank * sizeof(int), tid, blockDim.x * nBlocksPerPeer, flag); + // step 2: get data from scratch buffer, reduce data and write result to remote scratch buffer + for (int idx = threadIdx.x + blockIdx.x * blockDim.x; idx < nPktsPerRank; idx += blockDim.x * gridDim.x) { + uint2 data = make_uint2(0, 0); + for (int index = 0; index < nPeers; index++) { + const int remoteRank = index < rank ? 
index : index + 1; + mscclpp::LLPacket* dstPkt = (mscclpp::LLPacket*)scratchBuff + remoteRank * nPktsPerRank; + uint2 val = dstPkt[idx].read(flag); + data.x += val.x; + data.y += val.y; + } + data.x += src[idx].x; + data.y += src[idx].y; + dst[idx].x = data.x; + dst[idx].y = data.y; + for (int index = 0; index < nPeers; index++) { + mscclpp::LLPacket* dstPkt = (mscclpp::LLPacket*)((char*)constSmChannels[index].dst_ + scratchResultOffset); + dstPkt[idx + rank * nPktsPerRank].write(data.x, data.y, flag); + } + } + // step 3: get data result from scratch buffer + mscclpp::LLPacket* dstPkt = (mscclpp::LLPacket*)((char*)scratch + scratchResultOffset); + const int dstOffset = remoteRank * nPktsPerRank; + uint2* result = (uint2*)((char*)resultBuff + remoteRank * nelemsPerRank * sizeof(int)); + for (int idx = threadIdx.x + localBlockIdx * blockDim.x; idx < nPktsPerRank; idx += blockDim.x * nBlocksPerPeer) { + uint2 data = dstPkt[idx + dstOffset].read(flag); + result[idx].x = data.x; + result[idx].y = data.y; + } + if (threadIdx.x == 0 && blockIdx.x == 0) { + globalFlag += 1; + } +} + +cudaError_t allreduce(int* buff, int* scratch, void* resultBuff, int rank, int nRanksPerNode, int worldSize, + size_t nelems, cudaStream_t stream) { + allreduce6<<<21, 512, 0, stream>>>(buff, scratch, resultBuff, rank, nRanksPerNode, worldSize, nelems); + return cudaGetLastError(); +} + +static size_t ncclTypeSize(ncclDataType_t type) { + switch (type) { + case ncclInt8: + case ncclUint8: + return 1; + case ncclFloat16: + return 2; + case ncclInt32: + case ncclUint32: + return 4; + case ncclInt64: + case ncclUint64: + return 8; + case ncclFloat32: + return 4; + case ncclFloat64: + return 8; +#if defined(__CUDA_BF16_TYPES_EXIST__) + case ncclBfloat16: + return 2; +#endif // defined(__CUDA_BF16_TYPES_EXIST__) +#if defined(__CUDA_FP8_TYPES_EXIST__) + case ncclFp8E4M3: + case ncclFp8E5M2: + return 1; +#endif // defined(__CUDA_FP8_TYPES_EXIST__) + case ncclNumTypes: + return 0; + } + return 0; +} + +NCCL_API ncclResult_t ncclGetVersion(int* version) { + if (version == nullptr) return ncclInvalidArgument; + *version = MSCCLPP_VERSION; + return ncclSuccess; +} + +NCCL_API ncclResult_t ncclGetUniqueId(ncclUniqueId* uniqueId) { + if (uniqueId == nullptr) return ncclInvalidArgument; + if (MSCCLPP_UNIQUE_ID_BYTES != NCCL_UNIQUE_ID_BYTES) return ncclInternalError; + mscclpp::UniqueId id = mscclpp::TcpBootstrap::createUniqueId(); + memcpy(uniqueId, &id, sizeof(ncclUniqueId)); + return ncclSuccess; +} + +NCCL_API ncclResult_t ncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank) { + if (comm == nullptr) return ncclInvalidArgument; + if (nranks < 0 || rank < 0 || rank >= nranks) return ncclInvalidArgument; + std::shared_ptr bootstrap = std::make_shared(rank, nranks); + mscclpp::UniqueId id; + memcpy(id.data(), &commId, sizeof(ncclUniqueId)); + bootstrap->initialize(id); + std::shared_ptr mscclppComm = std::make_shared(bootstrap); + ncclComm* comm_ptr = new ncclComm(); + comm_ptr->comm = mscclppComm; + *comm = comm_ptr; + return ncclSuccess; +} + +NCCL_API ncclResult_t ncclCommDestroy(ncclComm_t comm) { + if (comm == nullptr) return ncclInvalidArgument; + delete comm; + return ncclSuccess; +} + +NCCL_API const char* ncclGetErrorString(ncclResult_t result) { + switch (result) { + case ncclSuccess : return "no error"; + case ncclUnhandledCudaError : return "unhandled cuda error (run with NCCL_DEBUG=INFO for details)"; + case ncclSystemError : return "unhandled system error (run with NCCL_DEBUG=INFO for 
details)"; + case ncclInternalError : return "internal error - please report this issue to the NCCL developers"; + case ncclInvalidArgument : return "invalid argument (run with NCCL_DEBUG=WARN for details)"; + case ncclInvalidUsage : return "invalid usage (run with NCCL_DEBUG=WARN for details)"; + case ncclRemoteError : return "remote process exited or there was a network error"; + case ncclInProgress : return "NCCL operation in progress"; + default : return "unknown result code"; + } +} + +NCCL_API ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count, + ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream) { + size_t bytes = count * ncclTypeSize(datatype); + if (sendbuff == nullptr || recvbuff == nullptr || bytes == 0 || comm == nullptr) return ncclInvalidArgument; + int rank = comm->comm->bootstrap()->getRank(); + int localRank = rank % nRanksPerNode; + comm->scratchBuff = mscclpp::allocExtSharedCuda(bytes * 4); + comm->registeredMemories.emplace(comm->scratchBuff.get(), comm->comm->registerMemory(comm->scratchBuff.get(), bytes, mscclpp::Transport::CudaIpc | IBs[localRank])); + auto& localRegMemory = comm->registeredMemories.at(comm->scratchBuff.get()); + if (comm->connections.empty()) { + std::vector>> connectionFutures; + std::vector> remoteRegMemoryFutures; + auto rankToNode = [&](int r) { return r / nRanksPerNode; }; + for (int i = 0; i < comm->comm->bootstrap()->getNranks(); i++) { + if (i == rank) continue; + mscclpp::Transport transport; + if (rankToNode(i) == rankToNode(rank)) { + transport = mscclpp::Transport::CudaIpc; + } else { + transport = IBs[localRank]; + } + connectionFutures.push_back(comm->comm->connectOnSetup(i, 0, transport)); + remoteRegMemoryFutures.push_back(comm->comm->recvMemoryOnSetup(i, 0)); + comm->comm->sendMemoryOnSetup(localRegMemory, i, 0); + } + comm->comm->setup(); + + std::transform( + connectionFutures.begin(), connectionFutures.end(), std::back_inserter(comm->connections), + [](const mscclpp::NonblockingFuture>& future) { return future.get(); }); + + std::vector> smSemaphores; + for (size_t cid = 0; cid < comm->connections.size(); ++cid) { + if (comm->connections[cid]->transport() == mscclpp::Transport::CudaIpc) { + smSemaphores.emplace_back(std::make_shared(*(comm->comm), comm->connections[cid])); + } + } + comm->comm->setup(); + + for (size_t cid = 0; cid < comm->connections.size(); ++cid) { + if (comm->connections[cid]->transport() == mscclpp::Transport::CudaIpc) { + comm->smChannels.emplace_back(smSemaphores[cid], remoteRegMemoryFutures[cid].get(), localRegMemory.data(), nullptr); + } + } + std::vector> smChannelDeviceHandles; + std::transform( + comm->smChannels.begin(), comm->smChannels.end(), std::back_inserter(smChannelDeviceHandles), + [](const mscclpp::SmChannel& smChannel) { return mscclpp::deviceHandle(smChannel); }); + CUDACHECK(cudaMemcpyToSymbol(constSmChannels, smChannelDeviceHandles.data(), + sizeof(mscclpp::DeviceHandle) * smChannelDeviceHandles.size())); + } + CUDACHECK(allreduce((int*)sendbuff, (int*)localRegMemory.data(), recvbuff, comm->comm->bootstrap()->getRank(), nRanksPerNode, + comm->comm->bootstrap()->getNranks(), count, stream)); + return ncclSuccess; +} From bbf2b2f029868ac21de0c4fa6d624e4408e14341 Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Thu, 8 Feb 2024 03:10:57 +0000 Subject: [PATCH 34/89] fix a mistake --- apps/nccl/src/nccl.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/nccl/src/nccl.cu b/apps/nccl/src/nccl.cu index 
1f0e25010..ced370c6b 100644 --- a/apps/nccl/src/nccl.cu +++ b/apps/nccl/src/nccl.cu @@ -201,7 +201,7 @@ NCCL_API ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_ if (sendbuff == nullptr || recvbuff == nullptr || bytes == 0 || comm == nullptr) return ncclInvalidArgument; int rank = comm->comm->bootstrap()->getRank(); int localRank = rank % nRanksPerNode; - comm->scratchBuff = mscclpp::allocExtSharedCuda(bytes * 4); + comm->scratchBuff = mscclpp::allocExtSharedCuda(bytes * 8); comm->registeredMemories.emplace(comm->scratchBuff.get(), comm->comm->registerMemory(comm->scratchBuff.get(), bytes, mscclpp::Transport::CudaIpc | IBs[localRank])); auto& localRegMemory = comm->registeredMemories.at(comm->scratchBuff.get()); if (comm->connections.empty()) { From 60bf6a9e854cf9aa61d92ffbf3c6b6a97fa07a8e Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Thu, 8 Feb 2024 04:28:27 +0000 Subject: [PATCH 35/89] nccl-tests works --- apps/nccl/src/nccl.cu | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/apps/nccl/src/nccl.cu b/apps/nccl/src/nccl.cu index ced370c6b..f9ef9749f 100644 --- a/apps/nccl/src/nccl.cu +++ b/apps/nccl/src/nccl.cu @@ -201,10 +201,10 @@ NCCL_API ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_ if (sendbuff == nullptr || recvbuff == nullptr || bytes == 0 || comm == nullptr) return ncclInvalidArgument; int rank = comm->comm->bootstrap()->getRank(); int localRank = rank % nRanksPerNode; - comm->scratchBuff = mscclpp::allocExtSharedCuda(bytes * 8); - comm->registeredMemories.emplace(comm->scratchBuff.get(), comm->comm->registerMemory(comm->scratchBuff.get(), bytes, mscclpp::Transport::CudaIpc | IBs[localRank])); - auto& localRegMemory = comm->registeredMemories.at(comm->scratchBuff.get()); if (comm->connections.empty()) { + comm->scratchBuff = mscclpp::allocExtSharedCuda(bytes * 8); + comm->registeredMemories.emplace(comm->scratchBuff.get(), comm->comm->registerMemory(comm->scratchBuff.get(), bytes, mscclpp::Transport::CudaIpc | IBs[localRank])); + auto& localRegMemory = comm->registeredMemories.at(comm->scratchBuff.get()); std::vector>> connectionFutures; std::vector> remoteRegMemoryFutures; auto rankToNode = [&](int r) { return r / nRanksPerNode; }; @@ -246,7 +246,7 @@ NCCL_API ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_ CUDACHECK(cudaMemcpyToSymbol(constSmChannels, smChannelDeviceHandles.data(), sizeof(mscclpp::DeviceHandle) * smChannelDeviceHandles.size())); } - CUDACHECK(allreduce((int*)sendbuff, (int*)localRegMemory.data(), recvbuff, comm->comm->bootstrap()->getRank(), nRanksPerNode, + CUDACHECK(allreduce((int*)sendbuff, (int*)comm->scratchBuff.get(), recvbuff, comm->comm->bootstrap()->getRank(), nRanksPerNode, comm->comm->bootstrap()->getNranks(), count, stream)); return ncclSuccess; } From f264eabd553e8de3a0c90ce248777f2821c1c42e Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Thu, 8 Feb 2024 07:08:55 +0000 Subject: [PATCH 36/89] out of place pass --- apps/nccl/src/nccl.cu | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/apps/nccl/src/nccl.cu b/apps/nccl/src/nccl.cu index f9ef9749f..e9a843c16 100644 --- a/apps/nccl/src/nccl.cu +++ b/apps/nccl/src/nccl.cu @@ -56,7 +56,7 @@ __global__ void allreduce6(int* buff, int* scratch, void* resultBuff, int rank, const int nelemsPerRank = nelems / worldSize; const int nPktsPerRank = nelemsPerRank / 2; // flag for packets. 
Initially 1 - const uint32_t flag = (uint32_t)globalFlag; + const uint32_t flag = (uint32_t)globalFlag + 1; // thread block & channel info const int nBlocksPerPeer = gridDim.x / nPeers; const int localBlockIdx = blockIdx.x % nBlocksPerPeer; @@ -225,7 +225,7 @@ NCCL_API ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_ std::transform( connectionFutures.begin(), connectionFutures.end(), std::back_inserter(comm->connections), [](const mscclpp::NonblockingFuture>& future) { return future.get(); }); - + std::vector> smSemaphores; for (size_t cid = 0; cid < comm->connections.size(); ++cid) { if (comm->connections[cid]->transport() == mscclpp::Transport::CudaIpc) { @@ -236,7 +236,8 @@ NCCL_API ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_ for (size_t cid = 0; cid < comm->connections.size(); ++cid) { if (comm->connections[cid]->transport() == mscclpp::Transport::CudaIpc) { - comm->smChannels.emplace_back(smSemaphores[cid], remoteRegMemoryFutures[cid].get(), localRegMemory.data(), nullptr); + comm->smChannels.emplace_back(smSemaphores[cid], remoteRegMemoryFutures[cid].get(), const_cast(sendbuff), + nullptr); } } std::vector> smChannelDeviceHandles; From ea601952b3a1e8fa17084f8e1cb73c76b55c0eda Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Thu, 8 Feb 2024 08:43:23 +0000 Subject: [PATCH 37/89] support more data types --- apps/nccl/src/nccl.cu | 152 +++++++++++++++++++++++++++++++++++++----- 1 file changed, 135 insertions(+), 17 deletions(-) diff --git a/apps/nccl/src/nccl.cu b/apps/nccl/src/nccl.cu index e9a843c16..d606a90b8 100644 --- a/apps/nccl/src/nccl.cu +++ b/apps/nccl/src/nccl.cu @@ -21,6 +21,104 @@ } \ } while (0) +template +__forceinline__ __device__ To bit_cast(const From& src) { + static_assert(sizeof(To) == sizeof(From), "Size mismatch for bit_cast"); + + union { + From f; + To t; + } u; + u.f = src; + return u.t; +} + +template +__forceinline__ __device__ T add_elements(T a, T b) { + return a + b; +} + +template <> +__forceinline__ __device__ __half2 add_elements(__half2 a, __half2 b) { + return __hadd2(a, b); +} + +template +__forceinline__ __device__ int4 add_vectors_helper(int4 a, int4 b) { + int4 ret; + ret.w = bit_cast(add_elements(bit_cast(a.w), bit_cast(b.w))); + ret.x = bit_cast(add_elements(bit_cast(a.x), bit_cast(b.x))); + ret.y = bit_cast(add_elements(bit_cast(a.y), bit_cast(b.y))); + ret.z = bit_cast(add_elements(bit_cast(a.z), bit_cast(b.z))); + return ret; +} + +template +__forceinline__ __device__ int4 add_vectors(int4 a, int4 b) { + return add_vectors_helper(a, b); +} + +template <> +__forceinline__ __device__ int4 add_vectors<__half>(int4 a, int4 b) { + return add_vectors_helper<__half2>(a, b); +} + +template +__forceinline__ __device__ uint2 add_vectors_helper(uint2 a, uint2 b) { + uint2 ret; + ret.x = bit_cast(add_elements(bit_cast(a.x), bit_cast(b.x))); + ret.y = bit_cast(add_elements(bit_cast(a.y), bit_cast(b.y))); + return ret; +} + +template +__forceinline__ __device__ uint2 add_vectors(uint2 a, uint2 b) { + return add_vectors_helper(a, b); +} + +template <> +__forceinline__ __device__ uint2 add_vectors<__half>(uint2 a, uint2 b) { + return add_vectors_helper<__half2>(a, b); +} + +template +__forceinline__ __device__ int add_vectors_helper(int a, int b) { + return bit_cast(add_elements(bit_cast(a), bit_cast(b))); +} + +template +__forceinline__ __device__ int add_vectors(int a, int b) { + return add_vectors_helper(a, b); +} + +template <> +__forceinline__ __device__ int add_vectors<__half>(int a, int b) { + return 
add_vectors_helper<__half2>(a, b); +} + +template +__forceinline__ __device__ void vectorSum(T* dst, T* src, size_t nElem, int blockId, int nBlocks) { + size_t nInt4 = nElem / 4; + size_t nLastInts = nElem % 4; + int4* dst4 = (int4*)dst; + int4* src4 = (int4*)src; + for (int i = threadIdx.x + blockId * blockDim.x; i < nInt4; i += blockDim.x * nBlocks) { + dst4[i] = add_vectors(dst4[i], src4[i]); + } + if (nLastInts > 0) { + int* dstLast = ((int*)dst) + nInt4 * 4; + int* srcLast = ((int*)src) + nInt4 * 4; + for (int i = threadIdx.x + blockId * blockDim.x; i < nLastInts; i += blockDim.x * nBlocks) { + dstLast[i] = add_vectors(dstLast[i], srcLast[i]); + } + } +} + +template +__forceinline__ __device__ void vectorSum(T* dst, T* src, size_t nElem) { + vectorSum(dst, src, nElem, blockIdx.x, gridDim.x); +} + // TODO: static const int nRanksPerNode = 8; @@ -47,10 +145,12 @@ cudaError_t allreduce(int* buff, int* scratch, void* resultBuff, int rank, int n // extern __constant__ mscclpp::SmChannelDeviceHandle *constSmChannels; __device__ uint64_t globalFlag; -__global__ void allreduce6(int* buff, int* scratch, void* resultBuff, int rank, int nRanksPerNode, int worldSize, +template +__global__ void allreduce6(T* buff, T* scratch, T* resultBuff, int rank, int nRanksPerNode, int worldSize, size_t nelems) { // This version of allreduce only works for single nodes if (worldSize != nRanksPerNode) return; + nelems = nelems / (sizeof(int) / sizeof(T)); const int nPeers = nRanksPerNode - 1; const int nPkts = nelems / 2; const int nelemsPerRank = nelems / worldSize; @@ -83,11 +183,9 @@ __global__ void allreduce6(int* buff, int* scratch, void* resultBuff, int rank, const int remoteRank = index < rank ? index : index + 1; mscclpp::LLPacket* dstPkt = (mscclpp::LLPacket*)scratchBuff + remoteRank * nPktsPerRank; uint2 val = dstPkt[idx].read(flag); - data.x += val.x; - data.y += val.y; + data = add_vectors(val, data); } - data.x += src[idx].x; - data.y += src[idx].y; + data = add_vectors(data, src[idx]); dst[idx].x = data.x; dst[idx].y = data.y; for (int index = 0; index < nPeers; index++) { @@ -109,8 +207,9 @@ __global__ void allreduce6(int* buff, int* scratch, void* resultBuff, int rank, } } -cudaError_t allreduce(int* buff, int* scratch, void* resultBuff, int rank, int nRanksPerNode, int worldSize, - size_t nelems, cudaStream_t stream) { +template +cudaError_t allreduce(T* buff, T* scratch, T* resultBuff, int rank, int nRanksPerNode, int worldSize, size_t nelems, + cudaStream_t stream) { allreduce6<<<21, 512, 0, stream>>>(buff, scratch, resultBuff, rank, nRanksPerNode, worldSize, nelems); return cudaGetLastError(); } @@ -195,15 +294,17 @@ NCCL_API const char* ncclGetErrorString(ncclResult_t result) { } } -NCCL_API ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count, - ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream) { +NCCL_API ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, + ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream) { size_t bytes = count * ncclTypeSize(datatype); if (sendbuff == nullptr || recvbuff == nullptr || bytes == 0 || comm == nullptr) return ncclInvalidArgument; int rank = comm->comm->bootstrap()->getRank(); int localRank = rank % nRanksPerNode; if (comm->connections.empty()) { comm->scratchBuff = mscclpp::allocExtSharedCuda(bytes * 8); - comm->registeredMemories.emplace(comm->scratchBuff.get(), comm->comm->registerMemory(comm->scratchBuff.get(), bytes, mscclpp::Transport::CudaIpc 
| IBs[localRank])); + comm->registeredMemories.emplace( + comm->scratchBuff.get(), + comm->comm->registerMemory(comm->scratchBuff.get(), bytes, mscclpp::Transport::CudaIpc | IBs[localRank])); auto& localRegMemory = comm->registeredMemories.at(comm->scratchBuff.get()); std::vector>> connectionFutures; std::vector> remoteRegMemoryFutures; @@ -229,7 +330,8 @@ NCCL_API ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_ std::vector> smSemaphores; for (size_t cid = 0; cid < comm->connections.size(); ++cid) { if (comm->connections[cid]->transport() == mscclpp::Transport::CudaIpc) { - smSemaphores.emplace_back(std::make_shared(*(comm->comm), comm->connections[cid])); + smSemaphores.emplace_back( + std::make_shared(*(comm->comm), comm->connections[cid])); } } comm->comm->setup(); @@ -241,13 +343,29 @@ NCCL_API ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_ } } std::vector> smChannelDeviceHandles; - std::transform( - comm->smChannels.begin(), comm->smChannels.end(), std::back_inserter(smChannelDeviceHandles), - [](const mscclpp::SmChannel& smChannel) { return mscclpp::deviceHandle(smChannel); }); + std::transform(comm->smChannels.begin(), comm->smChannels.end(), std::back_inserter(smChannelDeviceHandles), + [](const mscclpp::SmChannel& smChannel) { return mscclpp::deviceHandle(smChannel); }); CUDACHECK(cudaMemcpyToSymbol(constSmChannels, smChannelDeviceHandles.data(), - sizeof(mscclpp::DeviceHandle) * smChannelDeviceHandles.size())); + sizeof(mscclpp::DeviceHandle) * smChannelDeviceHandles.size())); + } + switch (datatype) { + case ncclFloat16: + CUDACHECK(allreduce((half*)sendbuff, (half*)comm->scratchBuff.get(), (half*)recvbuff, + comm->comm->bootstrap()->getRank(), nRanksPerNode, comm->comm->bootstrap()->getNranks(), + count, stream)); + break; + case ncclFloat32: + CUDACHECK(allreduce((float*)sendbuff, (float*)comm->scratchBuff.get(), (float*)recvbuff, + comm->comm->bootstrap()->getRank(), nRanksPerNode, comm->comm->bootstrap()->getNranks(), + count, stream)); + break; + case ncclInt32: + CUDACHECK(allreduce((int*)sendbuff, (int*)comm->scratchBuff.get(), (int*)recvbuff, + comm->comm->bootstrap()->getRank(), nRanksPerNode, comm->comm->bootstrap()->getNranks(), + count, stream)); + break; + default: + return ncclInvalidArgument; } - CUDACHECK(allreduce((int*)sendbuff, (int*)comm->scratchBuff.get(), recvbuff, comm->comm->bootstrap()->getRank(), nRanksPerNode, - comm->comm->bootstrap()->getNranks(), count, stream)); return ncclSuccess; } From 2093e27eae7d5b57f3ade498d4b7654122815554 Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Thu, 8 Feb 2024 09:02:54 +0000 Subject: [PATCH 38/89] cllean up --- apps/nccl/src/nccl.cu | 36 ++++++++++++++++++++++-------------- 1 file changed, 22 insertions(+), 14 deletions(-) diff --git a/apps/nccl/src/nccl.cu b/apps/nccl/src/nccl.cu index d606a90b8..3733cadf7 100644 --- a/apps/nccl/src/nccl.cu +++ b/apps/nccl/src/nccl.cu @@ -246,6 +246,14 @@ static size_t ncclTypeSize(ncclDataType_t type) { return 0; } +static mscclpp::Transport getTransport(int rank, int peerRank) { + if (rank / nRanksPerNode == peerRank / nRanksPerNode) { + return mscclpp::Transport::CudaIpc; + } else { + return IBs[rank % nRanksPerNode]; + } +} + NCCL_API ncclResult_t ncclGetVersion(int* version) { if (version == nullptr) return ncclInvalidArgument; *version = MSCCLPP_VERSION; @@ -268,8 +276,19 @@ NCCL_API ncclResult_t ncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueI memcpy(id.data(), &commId, sizeof(ncclUniqueId)); bootstrap->initialize(id); 
std::shared_ptr mscclppComm = std::make_shared(bootstrap); + std::vector>> connectionFutures; + + for (int i = 0; i < mscclppComm->bootstrap()->getNranks(); i++) { + if (i == rank) continue; + mscclpp::Transport transport = getTransport(rank, i); + connectionFutures.push_back(mscclppComm->connectOnSetup(i, 0, transport)); + } + mscclppComm->setup(); ncclComm* comm_ptr = new ncclComm(); comm_ptr->comm = mscclppComm; + std::transform( + connectionFutures.begin(), connectionFutures.end(), std::back_inserter(comm_ptr->connections), + [](const mscclpp::NonblockingFuture>& future) { return future.get(); }); *comm = comm_ptr; return ncclSuccess; } @@ -300,33 +319,21 @@ NCCL_API ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t if (sendbuff == nullptr || recvbuff == nullptr || bytes == 0 || comm == nullptr) return ncclInvalidArgument; int rank = comm->comm->bootstrap()->getRank(); int localRank = rank % nRanksPerNode; - if (comm->connections.empty()) { + if (comm->registeredMemories.empty()) { comm->scratchBuff = mscclpp::allocExtSharedCuda(bytes * 8); comm->registeredMemories.emplace( comm->scratchBuff.get(), comm->comm->registerMemory(comm->scratchBuff.get(), bytes, mscclpp::Transport::CudaIpc | IBs[localRank])); auto& localRegMemory = comm->registeredMemories.at(comm->scratchBuff.get()); - std::vector>> connectionFutures; std::vector> remoteRegMemoryFutures; - auto rankToNode = [&](int r) { return r / nRanksPerNode; }; for (int i = 0; i < comm->comm->bootstrap()->getNranks(); i++) { if (i == rank) continue; - mscclpp::Transport transport; - if (rankToNode(i) == rankToNode(rank)) { - transport = mscclpp::Transport::CudaIpc; - } else { - transport = IBs[localRank]; - } - connectionFutures.push_back(comm->comm->connectOnSetup(i, 0, transport)); + mscclpp::Transport transport = getTransport(rank, i); remoteRegMemoryFutures.push_back(comm->comm->recvMemoryOnSetup(i, 0)); comm->comm->sendMemoryOnSetup(localRegMemory, i, 0); } comm->comm->setup(); - std::transform( - connectionFutures.begin(), connectionFutures.end(), std::back_inserter(comm->connections), - [](const mscclpp::NonblockingFuture>& future) { return future.get(); }); - std::vector> smSemaphores; for (size_t cid = 0; cid < comm->connections.size(); ++cid) { if (comm->connections[cid]->transport() == mscclpp::Transport::CudaIpc) { @@ -348,6 +355,7 @@ NCCL_API ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t CUDACHECK(cudaMemcpyToSymbol(constSmChannels, smChannelDeviceHandles.data(), sizeof(mscclpp::DeviceHandle) * smChannelDeviceHandles.size())); } + switch (datatype) { case ncclFloat16: CUDACHECK(allreduce((half*)sendbuff, (half*)comm->scratchBuff.get(), (half*)recvbuff, From a5ab22fc75fedf08a0c845cc1586e9518563aab4 Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Thu, 8 Feb 2024 09:38:06 +0000 Subject: [PATCH 39/89] wip --- apps/nccl/src/nccl.cu | 37 ++++++++++++++++++++++++------------- 1 file changed, 24 insertions(+), 13 deletions(-) diff --git a/apps/nccl/src/nccl.cu b/apps/nccl/src/nccl.cu index 3733cadf7..b0d840de3 100644 --- a/apps/nccl/src/nccl.cu +++ b/apps/nccl/src/nccl.cu @@ -130,8 +130,11 @@ __constant__ mscclpp::DeviceHandle constSmChannels[8]; struct ncclComm { std::shared_ptr comm; - std::unordered_map registeredMemories; std::vector> connections; + std::vector> smSemaphores; + + // Maybe changed during communication collectives + std::unordered_map registeredMemories; std::vector smChannels; std::shared_ptr scratchBuff; }; @@ -284,11 +287,24 @@ NCCL_API ncclResult_t 
ncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueI connectionFutures.push_back(mscclppComm->connectOnSetup(i, 0, transport)); } mscclppComm->setup(); + + std::vector> connections; + std::transform(connectionFutures.begin(), connectionFutures.end(), std::back_inserter(connections), + [](const auto& future) { return future.get(); }); + + std::vector> smSemaphores; + for (size_t cid = 0; cid < connections.size(); ++cid) { + if (connections[cid]->transport() == mscclpp::Transport::CudaIpc) { + smSemaphores.emplace_back( + std::make_shared(*(mscclppComm), connections[cid])); + } + } + mscclppComm->setup(); + ncclComm* comm_ptr = new ncclComm(); comm_ptr->comm = mscclppComm; - std::transform( - connectionFutures.begin(), connectionFutures.end(), std::back_inserter(comm_ptr->connections), - [](const mscclpp::NonblockingFuture>& future) { return future.get(); }); + comm_ptr->connections = connections; + comm_ptr->smSemaphores = smSemaphores; *comm = comm_ptr; return ncclSuccess; } @@ -319,6 +335,9 @@ NCCL_API ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t if (sendbuff == nullptr || recvbuff == nullptr || bytes == 0 || comm == nullptr) return ncclInvalidArgument; int rank = comm->comm->bootstrap()->getRank(); int localRank = rank % nRanksPerNode; + // TODO: For each API, we may use different channels and registered memories. For registered memories, we can use the + // memory address as the key. Then we can get the related registered memory from the map. For smChannels, it is related + // to (cid, dst, src). If the tuple (cid, dst, src) is the same, we can use the same smChannel. if (comm->registeredMemories.empty()) { comm->scratchBuff = mscclpp::allocExtSharedCuda(bytes * 8); comm->registeredMemories.emplace( From cd2c85d3ad9d0291136d84b9a1b7a81d0264b00b Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Thu, 8 Feb 2024 09:51:04 +0000 Subject: [PATCH 40/89] leave some comments --- apps/nccl/src/nccl.cu | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/apps/nccl/src/nccl.cu index b0d840de3..0d2bf47cd 100644 --- a/apps/nccl/src/nccl.cu +++ b/apps/nccl/src/nccl.cu @@ -135,6 +135,10 @@ struct ncclComm { // Maybe changed during communication collectives std::unordered_map registeredMemories; + // The key is addr, rank + // std::unordered_map, mscclpp::RegisteredMemory> registeredMemories; + // The key is (cid, dst, src) + // std::unordered_map, mscclpp::SmChannel> smChannels; std::vector smChannels; std::shared_ptr scratchBuff; }; @@ -338,6 +342,8 @@ NCCL_API ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t // TODO: For each API, we may use different channels and registered memories. For registered memories, we can use the // memory address as the key. Then we can get the related registered memory from the map. 
For smChannels, it is related // to (cid, dst, src). If the tuple (cid, dst, src) is the same, we can use the same smChannel. + // This assumes each memory area can only communicate with other peers' fixed memory areas. We can use local memory address + // to get the remote memory addresses. They are not changed for the same comm. if (comm->registeredMemories.empty()) { comm->scratchBuff = mscclpp::allocExtSharedCuda(bytes * 8); comm->registeredMemories.emplace( From f420ee3431c64e1308e3d16dd7d70d23808597ef Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Mon, 19 Feb 2024 08:10:20 +0000 Subject: [PATCH 41/89] WIP --- apps/nccl/src/nccl.cu | 85 ++++++++++++++++++++++--------------------- 1 file changed, 43 insertions(+), 42 deletions(-) diff --git a/apps/nccl/src/nccl.cu index 0d2bf47cd..8956a3d9c 100644 --- a/apps/nccl/src/nccl.cu +++ b/apps/nccl/src/nccl.cu @@ -4,7 +4,7 @@ #include "nccl.h" #include -#include +#include #include #include #include @@ -121,6 +121,8 @@ __forceinline__ __device__ void vectorSum(T* dst, T* src, size_t nElem) { // TODO: static const int nRanksPerNode = 8; +// Only use scratch buffer for message size less than 1MB +static const int scratchSize = 1024 * 1024 * 8; static const mscclpp::Transport IBs[] = {mscclpp::Transport::IB0, mscclpp::Transport::IB1, mscclpp::Transport::IB2, mscclpp::Transport::IB3, mscclpp::Transport::IB4, mscclpp::Transport::IB5, mscclpp::Transport::IB6, mscclpp::Transport::IB7}; @@ -133,14 +135,10 @@ struct ncclComm { std::shared_ptr comm; std::vector> connections; std::vector> smSemaphores; - // Maybe changed during communication collectives - std::unordered_map registeredMemories; - // The key is addr, rank - // std::unordered_map, mscclpp::RegisteredMemory> registeredMemories; - // The key is (cid, dst, src) - // std::unordered_map, mscclpp::SmChannel> smChannels; - std::vector smChannels; + // key is the pair of sendbuff and recvbuff std::map, std::vector> smChannels; std::shared_ptr scratchBuff; + std::vector remoteScratchRegMemories; }; cudaError_t allreduce(int* buff, int* scratch, void* resultBuff, int rank, int nRanksPerNode, int worldSize, @@ -305,11 +303,27 @@ NCCL_API ncclResult_t ncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueI } mscclppComm->setup(); - ncclComm* comm_ptr = new ncclComm(); - comm_ptr->comm = mscclppComm; - comm_ptr->connections = connections; - comm_ptr->smSemaphores = smSemaphores; + ncclComm* commPtr = new ncclComm(); + commPtr->comm = mscclppComm; + commPtr->connections = connections; + commPtr->smSemaphores = smSemaphores; + // using scratch buffer for message size less than 1MB + commPtr->scratchBuff = mscclpp::allocExtSharedCuda(scratchSize); + + mscclpp::RegisteredMemory memory = mscclppComm->registerMemory( + commPtr->scratchBuff.get(), scratchSize, mscclpp::Transport::CudaIpc | IBs[rank % nRanksPerNode]); + std::vector> remoteRegMemoryFutures; + for (int i = 0; i < commPtr->comm->bootstrap()->getNranks(); i++) { + if (i == rank) continue; + mscclpp::Transport transport = getTransport(rank, i); + remoteRegMemoryFutures.push_back(commPtr->comm->recvMemoryOnSetup(i, 0)); + commPtr->comm->sendMemoryOnSetup(memory, i, 0); + } + commPtr->comm->setup(); + std::transform(remoteRegMemoryFutures.begin(), remoteRegMemoryFutures.end(), + std::back_inserter(commPtr->remoteScratchRegMemories), + [](const auto& future) { return future.get(); }); *comm = commPtr; return ncclSuccess; } @@ -338,45 +352,31 @@ NCCL_API ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t size_t bytes = count * ncclTypeSize(datatype); if 
(sendbuff == nullptr || recvbuff == nullptr || bytes == 0 || comm == nullptr) return ncclInvalidArgument; int rank = comm->comm->bootstrap()->getRank(); - int localRank = rank % nRanksPerNode; - // TODO: For each API, we may use different channels and registered memories. For registered memories, we can use the - // memory address as the key. Then we can get the related registered memory from the map. For smChannels, it is related - // to (cid, dst, src). If the tuple (cid, dst, src) is the same, we can use the same smChannel. - // This assumes each memory area can only communicate with other peers' fixed memory areas. We can use local memory address - // to get the remote memory addresses. They are not changed for the same comm. - if (comm->registeredMemories.empty()) { - comm->scratchBuff = mscclpp::allocExtSharedCuda(bytes * 8); - comm->registeredMemories.emplace( - comm->scratchBuff.get(), - comm->comm->registerMemory(comm->scratchBuff.get(), bytes, mscclpp::Transport::CudaIpc | IBs[localRank])); - auto& localRegMemory = comm->registeredMemories.at(comm->scratchBuff.get()); - std::vector> remoteRegMemoryFutures; + std::pair key(sendbuff, recvbuff); + std::vector channels; + if (comm->smChannels.find(key) == comm->smChannels.end()) { std::vector>& smSemaphores = comm->smSemaphores; for (size_t cid = 0; cid < comm->connections.size(); ++cid) { if (comm->connections[cid]->transport() == mscclpp::Transport::CudaIpc) { - channels.emplace_back(smSemaphores[cid], remoteRegMemoryFutures[cid].get(), const_cast(sendbuff), - nullptr); + channels.emplace_back(smSemaphores[cid], comm->remoteScratchRegMemories[cid], const_cast(sendbuff), + nullptr); } } - comm->smChannels.emplace(key, channels); + comm->smChannels.emplace(key, channels); } else { channels = comm->smChannels[key]; } std::vector> smChannelDeviceHandles; std::transform(channels.begin(), channels.end(), std::back_inserter(smChannelDeviceHandles), [](const mscclpp::SmChannel& smChannel) { return mscclpp::deviceHandle(smChannel); }); + // TODO: if sendbuff and recvbuff don't change, we can avoid copying smChannelDeviceHandles to device CUDACHECK(cudaMemcpyToSymbol(constSmChannels, smChannelDeviceHandles.data(), sizeof(mscclpp::DeviceHandle) * smChannelDeviceHandles.size())); switch (datatype) { case ncclFloat16: CUDACHECK(allreduce((half*)sendbuff, (half*)comm->scratchBuff.get(), (half*)recvbuff, - comm->comm->bootstrap()->getRank(), nRanksPerNode, comm->comm->bootstrap()->getNranks(), + rank, nRanksPerNode, comm->comm->bootstrap()->getNranks(), count, stream)); break; case ncclFloat32: @@ -385,6 +385,7 @@ NCCL_API ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count, stream)); break; case ncclInt32: + case ncclUint32: CUDACHECK(allreduce((int*)sendbuff, (int*)comm->scratchBuff.get(), (int*)recvbuff, comm->comm->bootstrap()->getRank(), nRanksPerNode, 
comm->comm->bootstrap()->getNranks(), count, stream)); From adbabf9f137d8d59466ca950927f3b2cdbaca978 Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Mon, 19 Feb 2024 09:34:23 +0000 Subject: [PATCH 42/89] WIP --- apps/nccl/src/nccl.cu | 157 +++++++++++++++++++++++++++++++++++++----- 1 file changed, 139 insertions(+), 18 deletions(-) diff --git a/apps/nccl/src/nccl.cu b/apps/nccl/src/nccl.cu index 8956a3d9c..e7d8d1da6 100644 --- a/apps/nccl/src/nccl.cu +++ b/apps/nccl/src/nccl.cu @@ -1,15 +1,16 @@ // Copyright (c) Microsoft Corporation. // Licensed under the MIT license. -#include "nccl.h" - #include #include +#include #include #include #include #include +#include "nccl.h" + #define NCCL_API extern "C" __attribute__((visibility("default"))) #define CUDACHECK(cmd) \ @@ -129,6 +130,8 @@ static const mscclpp::Transport IBs[] = {mscclpp::Transport::IB0, mscclpp::Trans mscclpp::Transport::IB6, mscclpp::Transport::IB7}; __constant__ mscclpp::DeviceHandle constSmChannels[8]; +__constant__ mscclpp::DeviceHandle constSmOutChannels[8]; +__device__ mscclpp::DeviceSyncer deviceSyncer; struct ncclComm { std::shared_ptr comm; @@ -212,10 +215,93 @@ __global__ void allreduce6(T* buff, T* scratch, T* resultBuff, int rank, int nRa } } +template +__global__ void allreduce1(mscclpp::SmChannelDeviceHandle* smChans, mscclpp::SmChannelDeviceHandle* smOutChans, T* src, + T* dst, int rank, int nranks, size_t nelems) { + const size_t chunkSize = nelems / nranks; + if (nranks == 1) return; + const int nPeer = nranks - 1; + const size_t indexOffset = rank * chunkSize; + const size_t vectorSize = sizeof(int4) / sizeof(T); + const size_t indexOffset4 = indexOffset / vectorSize; + int4* src4 = (int4*)src; + int4* dst4 = (int4*)dst; + const int tid = threadIdx.x + blockIdx.x * blockDim.x; + + // synchronize everyone + if (tid == 0) { + __threadfence_system(); + } + __syncthreads(); + if (tid < nPeer) { + smChans[tid].relaxedSignal(); + } + if (tid >= nPeer && tid < nPeer * 2) { + smChans[tid - nPeer].wait(); + } + deviceSyncer.sync(gridDim.x); + + // use int4 as much as possible + const size_t nInt4 = chunkSize / vectorSize; + for (int idx = threadIdx.x + blockIdx.x * blockDim.x; idx < nInt4; idx += blockDim.x * gridDim.x) { + int4 tmp = src4[indexOffset4 + idx]; + for (int index = 0; index < nPeer; ++index) { + int4 val; + int peerIdx = (index + rank); + if (peerIdx >= nPeer) peerIdx -= nPeer; + val = smChans[peerIdx].read(indexOffset4 + idx); + tmp = add_vectors(tmp, val); + } + dst4[indexOffset4 + idx] = tmp; + } + + // use the given TYPE for the rest + size_t processed = nInt4 * vectorSize * nranks; + const size_t nRemElems = nelems - processed; + const size_t startIdx = processed + (nRemElems * rank) / nranks; + const size_t endIdx = processed + (nRemElems * (rank + 1)) / nranks; + for (int idx = threadIdx.x + blockIdx.x * blockDim.x + startIdx; idx < endIdx; idx += blockDim.x * gridDim.x) { + T tmp = src[idx]; + for (int index = 0; index < nPeer; ++index) { + int peerIdx = (index + rank); + if (peerIdx >= nPeer) peerIdx -= nPeer; + T val = smChans[peerIdx].read(idx); + tmp += val; + } + dst[idx] = tmp; + } + + // synchronize everyone again + deviceSyncer.sync(gridDim.x); + if (tid == 0) { + __threadfence_system(); + } + __syncthreads(); + if (tid < nPeer) { + smChans[tid].relaxedSignal(); + } + if (tid >= nPeer && tid < nPeer * 2) { + smChans[tid - nPeer].wait(); + } + + deviceSyncer.sync(gridDim.x); + for (int i = 0; i < nPeer; ++i) { + int peerIdx = (i + rank); + if (peerIdx >= nPeer) peerIdx -= nPeer; + const int 
remoteRank = (peerIdx < rank ? peerIdx : peerIdx + 1); + size_t offset = chunkSize * remoteRank * sizeof(T); + smOutChans[peerIdx].get(offset, chunkSize * sizeof(T), tid, blockDim.x * gridDim.x); + } +} + template cudaError_t allreduce(T* buff, T* scratch, T* resultBuff, int rank, int nRanksPerNode, int worldSize, size_t nelems, cudaStream_t stream) { - allreduce6<<<21, 512, 0, stream>>>(buff, scratch, resultBuff, rank, nRanksPerNode, worldSize, nelems); + if (sizeof(T) * nelems <= (1 << 20)) { + allreduce6<<<21, 512, 0, stream>>>(buff, scratch, resultBuff, rank, nRanksPerNode, worldSize, nelems); + } else { + allreduce1<<<24, 1024, 0, stream>>>(constSmChannels, constSmOutChannels, buff, resultBuff, rank, worldSize, nelems); + } return cudaGetLastError(); } @@ -353,25 +439,60 @@ NCCL_API ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t if (sendbuff == nullptr || recvbuff == nullptr || bytes == 0 || comm == nullptr) return ncclInvalidArgument; int rank = comm->comm->bootstrap()->getRank(); std::pair key(sendbuff, recvbuff); - std::vector channels; - if (comm->smChannels.find(key) == comm->smChannels.end()) { - std::vector>& smSemaphores = comm->smSemaphores; - for (size_t cid = 0; cid < comm->connections.size(); ++cid) { - if (comm->connections[cid]->transport() == mscclpp::Transport::CudaIpc) { - channels.emplace_back(smSemaphores[cid], comm->remoteScratchRegMemories[cid], const_cast(sendbuff), - nullptr); + if (bytes <= 1 << 20) { + std::vector channels; + if (comm->smChannels.find(key) == comm->smChannels.end()) { + std::vector>& smSemaphores = comm->smSemaphores; + for (size_t cid = 0; cid < comm->connections.size(); ++cid) { + if (comm->connections[cid]->transport() == mscclpp::Transport::CudaIpc) { + channels.emplace_back(smSemaphores[cid], comm->remoteScratchRegMemories[cid], const_cast(sendbuff), + nullptr); + } } + comm->smChannels.emplace(key, channels); + } else { + channels = comm->smChannels[key]; } - comm->smChannels.emplace(key, channels); + std::vector> smChannelDeviceHandles; + std::transform(channels.begin(), channels.end(), std::back_inserter(smChannelDeviceHandles), + [](const mscclpp::SmChannel& smChannel) { return mscclpp::deviceHandle(smChannel); }); + // TODO: if sendbuff and recvbuff don't change, we can avoid copying smChannelDeviceHandles to device + CUDACHECK(cudaMemcpyToSymbol(constSmChannels, smChannelDeviceHandles.data(), + sizeof(mscclpp::DeviceHandle) * smChannelDeviceHandles.size())); } else { - channels = comm->smChannels[key]; + // TODO: Debug this + std::vector channels; + if (comm->smChannels.find(key) == comm->smChannels.end()) { + mscclpp::RegisteredMemory memory = comm->comm->registerMemory( + const_cast(sendbuff), bytes, mscclpp::Transport::CudaIpc | IBs[rank % nRanksPerNode]); + std::vector> remoteRegMemoryFutures; + for (int i = 0; i < comm->comm->bootstrap()->getNranks(); i++) { + if (i == rank) continue; + mscclpp::Transport transport = getTransport(rank, i); + remoteRegMemoryFutures.push_back(comm->comm->recvMemoryOnSetup(i, 0)); + comm->comm->sendMemoryOnSetup(memory, i, 0); + } + comm->comm->setup(); + std::vector>& smSemaphores = comm->smSemaphores; + for (size_t cid = 0; cid < comm->connections.size(); ++cid) { + if (comm->connections[cid]->transport() == mscclpp::Transport::CudaIpc) { + channels.emplace_back(smSemaphores[cid], remoteRegMemoryFutures[cid].get(), const_cast(sendbuff), + nullptr); + } + } + comm->smChannels.emplace(key, channels); + } else { + channels = comm->smChannels[key]; + } + std::vector> 
smChannelDeviceHandles; + std::transform(channels.begin(), channels.end(), std::back_inserter(smChannelDeviceHandles), + [](const mscclpp::SmChannel& smChannel) { return mscclpp::deviceHandle(smChannel); }); + // TODO: if sendbuff and recvbuff don't change, we can avoid copying smChannelDeviceHandles to device + CUDACHECK(cudaMemcpyToSymbol(constSmChannels, smChannelDeviceHandles.data(), + sizeof(mscclpp::DeviceHandle) * smChannelDeviceHandles.size())); + CUDACHECK(cudaMemcpyToSymbol(constSmOutChannels, smChannelDeviceHandles.data(), + sizeof(mscclpp::DeviceHandle) * smChannelDeviceHandles.size())); } - std::vector> smChannelDeviceHandles; - std::transform(channels.begin(), channels.end(), std::back_inserter(smChannelDeviceHandles), - [](const mscclpp::SmChannel& smChannel) { return mscclpp::deviceHandle(smChannel); }); - // TODO: if sendbuff and recvbuff don't change, we can avoid copying smChannelDeviceHandles to device - CUDACHECK(cudaMemcpyToSymbol(constSmChannels, smChannelDeviceHandles.data(), - sizeof(mscclpp::DeviceHandle) * smChannelDeviceHandles.size())); switch (datatype) { case ncclFloat16: From 7b8e4f132692519a186b7ec1e8304816fbd4ef47 Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Tue, 20 Feb 2024 08:22:59 +0000 Subject: [PATCH 43/89] WIP --- apps/nccl/src/nccl.cu | 56 ++++++++++++++++++++++++++++++++----------- 1 file changed, 42 insertions(+), 14 deletions(-) diff --git a/apps/nccl/src/nccl.cu b/apps/nccl/src/nccl.cu index e7d8d1da6..13a69576c 100644 --- a/apps/nccl/src/nccl.cu +++ b/apps/nccl/src/nccl.cu @@ -140,6 +140,7 @@ struct ncclComm { // key is the pair of sendbuff and recvbuff std::map, std::vector> smChannels; + std::map, std::vector> smOutChannels; std::shared_ptr scratchBuff; std::vector remoteScratchRegMemories; }; @@ -216,8 +217,7 @@ __global__ void allreduce6(T* buff, T* scratch, T* resultBuff, int rank, int nRa } template -__global__ void allreduce1(mscclpp::SmChannelDeviceHandle* smChans, mscclpp::SmChannelDeviceHandle* smOutChans, T* src, - T* dst, int rank, int nranks, size_t nelems) { +__global__ void allreduce1(T* src, T* dst, int rank, int nranks, size_t nelems) { const size_t chunkSize = nelems / nranks; if (nranks == 1) return; const int nPeer = nranks - 1; @@ -234,10 +234,10 @@ __global__ void allreduce1(mscclpp::SmChannelDeviceHandle* smChans, mscclpp::SmC } __syncthreads(); if (tid < nPeer) { - smChans[tid].relaxedSignal(); + constSmChannels[tid].relaxedSignal(); } if (tid >= nPeer && tid < nPeer * 2) { - smChans[tid - nPeer].wait(); + constSmChannels[tid - nPeer].wait(); } deviceSyncer.sync(gridDim.x); @@ -249,7 +249,7 @@ __global__ void allreduce1(mscclpp::SmChannelDeviceHandle* smChans, mscclpp::SmC int4 val; int peerIdx = (index + rank); if (peerIdx >= nPeer) peerIdx -= nPeer; - val = smChans[peerIdx].read(indexOffset4 + idx); + val = constSmChannels[peerIdx].read(indexOffset4 + idx); tmp = add_vectors(tmp, val); } dst4[indexOffset4 + idx] = tmp; @@ -265,7 +265,7 @@ __global__ void allreduce1(mscclpp::SmChannelDeviceHandle* smChans, mscclpp::SmC for (int index = 0; index < nPeer; ++index) { int peerIdx = (index + rank); if (peerIdx >= nPeer) peerIdx -= nPeer; - T val = smChans[peerIdx].read(idx); + T val = constSmChannels[peerIdx].read(idx); tmp += val; } dst[idx] = tmp; @@ -278,10 +278,10 @@ __global__ void allreduce1(mscclpp::SmChannelDeviceHandle* smChans, mscclpp::SmC } __syncthreads(); if (tid < nPeer) { - smChans[tid].relaxedSignal(); + constSmChannels[tid].relaxedSignal(); } if (tid >= nPeer && tid < nPeer * 2) { - smChans[tid - 
nPeer].wait(); + constSmChannels[tid - nPeer].wait(); } deviceSyncer.sync(gridDim.x); @@ -290,7 +290,7 @@ __global__ void allreduce1(mscclpp::SmChannelDeviceHandle* smChans, mscclpp::SmC if (peerIdx >= nPeer) peerIdx -= nPeer; const int remoteRank = (peerIdx < rank ? peerIdx : peerIdx + 1); size_t offset = chunkSize * remoteRank * sizeof(T); - smOutChans[peerIdx].get(offset, chunkSize * sizeof(T), tid, blockDim.x * gridDim.x); + constSmOutChannels[peerIdx].get(offset, chunkSize * sizeof(T), tid, blockDim.x * gridDim.x); } } @@ -300,7 +300,7 @@ cudaError_t allreduce(T* buff, T* scratch, T* resultBuff, int rank, int nRanksPe if (sizeof(T) * nelems <= (1 << 20)) { allreduce6<<<21, 512, 0, stream>>>(buff, scratch, resultBuff, rank, nRanksPerNode, worldSize, nelems); } else { - allreduce1<<<24, 1024, 0, stream>>>(constSmChannels, constSmOutChannels, buff, resultBuff, rank, worldSize, nelems); + allreduce1<<<24, 1024, 0, stream>>>(buff, resultBuff, rank, worldSize, nelems); } return cudaGetLastError(); } @@ -391,8 +391,8 @@ NCCL_API ncclResult_t ncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueI ncclComm* commPtr = new ncclComm(); commPtr->comm = mscclppComm; - commPtr->connections = connections; - commPtr->smSemaphores = smSemaphores; + commPtr->connections = std::move(connections); + commPtr->smSemaphores = std::move(smSemaphores); // using scratch buffer for message size less then 1MB commPtr->scratchBuff = mscclpp::allocExtSharedCuda(scratchSize); @@ -460,8 +460,8 @@ NCCL_API ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t CUDACHECK(cudaMemcpyToSymbol(constSmChannels, smChannelDeviceHandles.data(), sizeof(mscclpp::DeviceHandle) * smChannelDeviceHandles.size())); } else { - // TODO: Debug this std::vector channels; + std::vector outChannels; if (comm->smChannels.find(key) == comm->smChannels.end()) { mscclpp::RegisteredMemory memory = comm->comm->registerMemory( const_cast(sendbuff), bytes, mscclpp::Transport::CudaIpc | IBs[rank % nRanksPerNode]); @@ -481,8 +481,31 @@ NCCL_API ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t } } comm->smChannels.emplace(key, channels); + if (sendbuff != recvbuff) { + mscclpp::RegisteredMemory memory = comm->comm->registerMemory( + const_cast(recvbuff), bytes, mscclpp::Transport::CudaIpc | IBs[rank % nRanksPerNode]); + std::vector> remoteRegMemoryFutures; + for (int i = 0; i < comm->comm->bootstrap()->getNranks(); i++) { + if (i == rank) continue; + mscclpp::Transport transport = getTransport(rank, i); + remoteRegMemoryFutures.push_back(comm->comm->recvMemoryOnSetup(i, 0)); + comm->comm->sendMemoryOnSetup(memory, i, 0); + } + comm->comm->setup(); + std::vector>& smSemaphores = comm->smSemaphores; + for (size_t cid = 0; cid < comm->connections.size(); ++cid) { + if (comm->connections[cid]->transport() == mscclpp::Transport::CudaIpc) { + outChannels.emplace_back(smSemaphores[cid], remoteRegMemoryFutures[cid].get(), const_cast(recvbuff), + nullptr); + } + } + comm->smOutChannels.emplace(key, outChannels); + } } else { channels = comm->smChannels[key]; + if (sendbuff != recvbuff) { + outChannels = comm->smOutChannels[key]; + } } std::vector> smChannelDeviceHandles; std::transform(channels.begin(), channels.end(), std::back_inserter(smChannelDeviceHandles), @@ -490,8 +513,13 @@ NCCL_API ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t // TODO: if sendbuff and recvbuff don't change, we can avoid copying smChannelDeviceHandles to device CUDACHECK(cudaMemcpyToSymbol(constSmChannels, 
smChannelDeviceHandles.data(), sizeof(mscclpp::DeviceHandle) * smChannelDeviceHandles.size())); + if (sendbuff != recvbuff) { + smChannelDeviceHandles.clear(); + std::transform(outChannels.begin(), outChannels.end(), std::back_inserter(smChannelDeviceHandles), + [](const mscclpp::SmChannel& smChannel) { return mscclpp::deviceHandle(smChannel); }); + } CUDACHECK(cudaMemcpyToSymbol(constSmOutChannels, smChannelDeviceHandles.data(), - sizeof(mscclpp::DeviceHandle) * smChannelDeviceHandles.size())); + sizeof(mscclpp::DeviceHandle) * smChannelDeviceHandles.size())); } switch (datatype) { From 1c08a8029bb7fcb66d84e3f3579d9711720442da Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Tue, 20 Feb 2024 09:25:22 +0000 Subject: [PATCH 44/89] WIP --- apps/nccl/src/nccl.cu | 127 ++++++++++++++++++------------------------ 1 file changed, 54 insertions(+), 73 deletions(-) diff --git a/apps/nccl/src/nccl.cu b/apps/nccl/src/nccl.cu index 13a69576c..4a2a2b1e6 100644 --- a/apps/nccl/src/nccl.cu +++ b/apps/nccl/src/nccl.cu @@ -345,6 +345,37 @@ static mscclpp::Transport getTransport(int rank, int peerRank) { } } +static std::vector setupRemoteMemories(std::shared_ptr comm, int rank, + void* buff, size_t bytes, + mscclpp::TransportFlags transport) { + std::vector remoteMemories; + mscclpp::RegisteredMemory memory = comm->registerMemory(buff, bytes, transport); + std::vector> remoteRegMemoryFutures; + for (int i = 0; i < comm->bootstrap()->getNranks(); i++) { + if (i == rank) continue; + mscclpp::Transport transport = getTransport(rank, i); + remoteRegMemoryFutures.push_back(comm->recvMemoryOnSetup(i, 0)); + comm->sendMemoryOnSetup(memory, i, 0); + } + comm->setup(); + std::transform(remoteRegMemoryFutures.begin(), remoteRegMemoryFutures.end(), std::back_inserter(remoteMemories), + [](const auto& future) { return future.get(); }); + return remoteMemories; +} + +static std::vector setupSmChannels(ncclComm_t comm, + const std::vector& remoteMemories, + void* src) { + std::vector channels; + std::vector>& smSemaphores = comm->smSemaphores; + for (size_t cid = 0; cid < comm->connections.size(); ++cid) { + if (comm->connections[cid]->transport() == mscclpp::Transport::CudaIpc) { + channels.emplace_back(smSemaphores[cid], remoteMemories[cid], src, nullptr); + } + } + return channels; +} + NCCL_API ncclResult_t ncclGetVersion(int* version) { if (version == nullptr) return ncclInvalidArgument; *version = MSCCLPP_VERSION; @@ -395,20 +426,9 @@ NCCL_API ncclResult_t ncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueI commPtr->smSemaphores = std::move(smSemaphores); // using scratch buffer for message size less then 1MB commPtr->scratchBuff = mscclpp::allocExtSharedCuda(scratchSize); + commPtr->remoteScratchRegMemories = setupRemoteMemories(commPtr->comm, rank, commPtr->scratchBuff.get(), scratchSize, + mscclpp::Transport::CudaIpc | IBs[rank % nRanksPerNode]); - mscclpp::RegisteredMemory memory = mscclppComm->registerMemory( - commPtr->scratchBuff.get(), scratchSize, mscclpp::Transport::CudaIpc | IBs[rank % nRanksPerNode]); - std::vector> remoteRegMemoryFutures; - for (int i = 0; i < commPtr->comm->bootstrap()->getNranks(); i++) { - if (i == rank) continue; - mscclpp::Transport transport = getTransport(rank, i); - remoteRegMemoryFutures.push_back(commPtr->comm->recvMemoryOnSetup(i, 0)); - commPtr->comm->sendMemoryOnSetup(memory, i, 0); - } - commPtr->comm->setup(); - std::transform(remoteRegMemoryFutures.begin(), remoteRegMemoryFutures.end(), - std::back_inserter(commPtr->remoteScratchRegMemories), - [](const auto& 
future) { return future.get(); }); *comm = commPtr; return ncclSuccess; } @@ -440,83 +460,44 @@ NCCL_API ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t int rank = comm->comm->bootstrap()->getRank(); std::pair key(sendbuff, recvbuff); if (bytes <= 1 << 20) { - std::vector channels; - if (comm->smChannels.find(key) == comm->smChannels.end()) { - std::vector>& smSemaphores = comm->smSemaphores; - for (size_t cid = 0; cid < comm->connections.size(); ++cid) { - if (comm->connections[cid]->transport() == mscclpp::Transport::CudaIpc) { - channels.emplace_back(smSemaphores[cid], comm->remoteScratchRegMemories[cid], const_cast(sendbuff), - nullptr); - } - } - comm->smChannels.emplace(key, channels); - } else { - channels = comm->smChannels[key]; + auto it = comm->smChannels.find(key); + if (it == comm->smChannels.end()) { + std::vector channels = + setupSmChannels(comm, comm->remoteScratchRegMemories, const_cast(sendbuff)); + it = comm->smChannels.emplace(key, channels).first; } std::vector> smChannelDeviceHandles; - std::transform(channels.begin(), channels.end(), std::back_inserter(smChannelDeviceHandles), + std::transform(it->second.begin(), it->second.end(), std::back_inserter(smChannelDeviceHandles), [](const mscclpp::SmChannel& smChannel) { return mscclpp::deviceHandle(smChannel); }); // TODO: if sendbuff and recvbuff don't change, we can avoid copying smChannelDeviceHandles to device CUDACHECK(cudaMemcpyToSymbol(constSmChannels, smChannelDeviceHandles.data(), sizeof(mscclpp::DeviceHandle) * smChannelDeviceHandles.size())); } else { - std::vector channels; - std::vector outChannels; - if (comm->smChannels.find(key) == comm->smChannels.end()) { - mscclpp::RegisteredMemory memory = comm->comm->registerMemory( - const_cast(sendbuff), bytes, mscclpp::Transport::CudaIpc | IBs[rank % nRanksPerNode]); - std::vector> remoteRegMemoryFutures; - for (int i = 0; i < comm->comm->bootstrap()->getNranks(); i++) { - if (i == rank) continue; - mscclpp::Transport transport = getTransport(rank, i); - remoteRegMemoryFutures.push_back(comm->comm->recvMemoryOnSetup(i, 0)); - comm->comm->sendMemoryOnSetup(memory, i, 0); - } - comm->comm->setup(); - std::vector>& smSemaphores = comm->smSemaphores; - for (size_t cid = 0; cid < comm->connections.size(); ++cid) { - if (comm->connections[cid]->transport() == mscclpp::Transport::CudaIpc) { - channels.emplace_back(smSemaphores[cid], remoteRegMemoryFutures[cid].get(), const_cast(sendbuff), - nullptr); - } - } - comm->smChannels.emplace(key, channels); - if (sendbuff != recvbuff) { - mscclpp::RegisteredMemory memory = comm->comm->registerMemory( - const_cast(recvbuff), bytes, mscclpp::Transport::CudaIpc | IBs[rank % nRanksPerNode]); - std::vector> remoteRegMemoryFutures; - for (int i = 0; i < comm->comm->bootstrap()->getNranks(); i++) { - if (i == rank) continue; - mscclpp::Transport transport = getTransport(rank, i); - remoteRegMemoryFutures.push_back(comm->comm->recvMemoryOnSetup(i, 0)); - comm->comm->sendMemoryOnSetup(memory, i, 0); - } - comm->comm->setup(); - std::vector>& smSemaphores = comm->smSemaphores; - for (size_t cid = 0; cid < comm->connections.size(); ++cid) { - if (comm->connections[cid]->transport() == mscclpp::Transport::CudaIpc) { - outChannels.emplace_back(smSemaphores[cid], remoteRegMemoryFutures[cid].get(), const_cast(recvbuff), - nullptr); - } - } - comm->smOutChannels.emplace(key, outChannels); - } - } else { - channels = comm->smChannels[key]; + auto it = comm->smChannels.find(key); + auto outIt = 
comm->smOutChannels.find(key); + if (it == comm->smChannels.end()) { + std::vector remoteMemories = + setupRemoteMemories(comm->comm, rank, const_cast(sendbuff), bytes, + mscclpp::Transport::CudaIpc | IBs[rank % nRanksPerNode]); + std::vector channels = setupSmChannels(comm, remoteMemories, const_cast(sendbuff)); + it = comm->smChannels.emplace(key, channels).first; if (sendbuff != recvbuff) { - outChannels = comm->smOutChannels[key]; + std::vector remoteMemories = + setupRemoteMemories(comm->comm, rank, recvbuff, bytes, mscclpp::Transport::CudaIpc | IBs[rank % nRanksPerNode]); + std::vector outChannels = setupSmChannels(comm, remoteMemories, recvbuff); + outIt = comm->smOutChannels.emplace(key, outChannels).first; } } std::vector> smChannelDeviceHandles; - std::transform(channels.begin(), channels.end(), std::back_inserter(smChannelDeviceHandles), + std::transform(it->second.begin(), it->second.end(), std::back_inserter(smChannelDeviceHandles), [](const mscclpp::SmChannel& smChannel) { return mscclpp::deviceHandle(smChannel); }); // TODO: if sendbuff and recvbuff don't change, we can avoid copying smChannelDeviceHandles to device CUDACHECK(cudaMemcpyToSymbol(constSmChannels, smChannelDeviceHandles.data(), sizeof(mscclpp::DeviceHandle) * smChannelDeviceHandles.size())); if (sendbuff != recvbuff) { smChannelDeviceHandles.clear(); - std::transform(outChannels.begin(), outChannels.end(), std::back_inserter(smChannelDeviceHandles), - [](const mscclpp::SmChannel& smChannel) { return mscclpp::deviceHandle(smChannel); }); + std::transform(outIt->second.begin(), outIt->second.end(), std::back_inserter(smChannelDeviceHandles), + [](const mscclpp::SmChannel& smChannel) { return mscclpp::deviceHandle(smChannel); }); } CUDACHECK(cudaMemcpyToSymbol(constSmOutChannels, smChannelDeviceHandles.data(), sizeof(mscclpp::DeviceHandle) * smChannelDeviceHandles.size())); From 1e6a7a57833940464e03d3672a3f93a6e034b420 Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Tue, 20 Feb 2024 09:58:54 +0000 Subject: [PATCH 45/89] workable version --- apps/nccl/src/nccl.cu | 26 +++++++++++++++++++++----- 1 file changed, 21 insertions(+), 5 deletions(-) diff --git a/apps/nccl/src/nccl.cu b/apps/nccl/src/nccl.cu index 4a2a2b1e6..734790b57 100644 --- a/apps/nccl/src/nccl.cu +++ b/apps/nccl/src/nccl.cu @@ -2,7 +2,6 @@ // Licensed under the MIT license. 
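// A minimal sketch of the handshake that the setupRemoteMemories() helper added
// above performs, assuming an mscclpp::Communicator `comm`, the current `rank`,
// and a buffer `buff` of `bytes` bytes to expose over `transport`: each rank
// pushes its RegisteredMemory handle to every peer and collects the peers'
// handles in return; setup() flushes the exchange so the futures become ready.
//
//   mscclpp::RegisteredMemory local = comm->registerMemory(buff, bytes, transport);
//   std::vector<mscclpp::NonblockingFuture<mscclpp::RegisteredMemory>> futures;
//   for (int peer = 0; peer < comm->bootstrap()->getNranks(); ++peer) {
//     if (peer == rank) continue;
//     comm->sendMemoryOnSetup(local, peer, /*tag=*/0);
//     futures.push_back(comm->recvMemoryOnSetup(peer, /*tag=*/0));
//   }
//   comm->setup();  // after this, futures[i].get() yields each peer's memory handle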
#include -#include #include #include #include @@ -133,14 +132,31 @@ __constant__ mscclpp::DeviceHandle constSmChannels[8]; __constant__ mscclpp::DeviceHandle constSmOutChannels[8]; __device__ mscclpp::DeviceSyncer deviceSyncer; +struct channelKey { + const void* sendbuff; + const void* recvbuff; + size_t bytes; + bool operator==(const channelKey& other) const { + return sendbuff == other.sendbuff && recvbuff == other.recvbuff && bytes == other.bytes; + } +}; + +namespace std { +template <> +struct hash { + std::size_t operator()(const channelKey& k) const { + return std::hash()(k.sendbuff) ^ std::hash()(k.recvbuff) ^ std::hash()(k.bytes); + } +}; +} // namespace std + struct ncclComm { std::shared_ptr comm; std::vector> connections; std::vector> smSemaphores; - // key is the pair of sendbuff and recvbuff - std::map, std::vector> smChannels; - std::map, std::vector> smOutChannels; + std::unordered_map> smChannels; + std::unordered_map> smOutChannels; std::shared_ptr scratchBuff; std::vector remoteScratchRegMemories; }; @@ -458,7 +474,7 @@ NCCL_API ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t size_t bytes = count * ncclTypeSize(datatype); if (sendbuff == nullptr || recvbuff == nullptr || bytes == 0 || comm == nullptr) return ncclInvalidArgument; int rank = comm->comm->bootstrap()->getRank(); - std::pair key(sendbuff, recvbuff); + channelKey key{sendbuff, recvbuff, bytes}; if (bytes <= 1 << 20) { auto it = comm->smChannels.find(key); if (it == comm->smChannels.end()) { From 9790dfc3196c8d69230e3a43d6e0172ddf187077 Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Tue, 20 Feb 2024 18:57:49 +0000 Subject: [PATCH 46/89] Revert "MSRCHA-371 workaround" This reverts commit 5eb35cd45bbe0c2b26e1a6c647ec92f6e2acd86a. --- include/mscclpp/semaphore_device.hpp | 12 ++---------- src/semaphore.cc | 3 +-- test/mp_unit/proxy_channel_tests.cu | 5 ----- 3 files changed, 3 insertions(+), 17 deletions(-) diff --git a/include/mscclpp/semaphore_device.hpp b/include/mscclpp/semaphore_device.hpp index 088f27bf3..cd455078a 100644 --- a/include/mscclpp/semaphore_device.hpp +++ b/include/mscclpp/semaphore_device.hpp @@ -20,22 +20,14 @@ struct Host2DeviceSemaphoreDeviceHandle { /// @return true if the host has signaled. MSCCLPP_DEVICE_INLINE bool poll() { bool signaled = (atomicLoad(inboundSemaphoreId, memoryOrderAcquire) > (*expectedInboundSemaphoreId)); - if (signaled) { - (*expectedInboundSemaphoreId) += 1; - } else { - // TODO: MSRCHA-371 - atomicStore(&inboundSemaphoreId[1], uint64_t{0}, memoryOrderRelaxed); - } + if (signaled) (*expectedInboundSemaphoreId) += 1; return signaled; } /// Wait for the host to signal. MSCCLPP_DEVICE_INLINE void wait(int64_t maxSpinCount = 100000000) { (*expectedInboundSemaphoreId) += 1; - // TODO: MSRCHA-371 - POLL_MAYBE_JAILBREAK((atomicLoad(inboundSemaphoreId, memoryOrderAcquire) < (*expectedInboundSemaphoreId)) - ? 
(atomicStore(&inboundSemaphoreId[1], uint64_t{0}, memoryOrderRelaxed), true) - : false, + POLL_MAYBE_JAILBREAK((atomicLoad(inboundSemaphoreId, memoryOrderAcquire) < (*expectedInboundSemaphoreId)), maxSpinCount); } #endif // defined(MSCCLPP_DEVICE_COMPILE) diff --git a/src/semaphore.cc b/src/semaphore.cc index 74aaaf485..7dec60c3d 100644 --- a/src/semaphore.cc +++ b/src/semaphore.cc @@ -11,9 +11,8 @@ namespace mscclpp { static NonblockingFuture setupInboundSemaphoreId(Communicator& communicator, Connection* connection, void* localInboundSemaphoreId) { - // TODO: MSRCHA-371 auto localInboundSemaphoreIdsRegMem = - communicator.registerMemory(localInboundSemaphoreId, sizeof(uint64_t) * 2, connection->transport()); + communicator.registerMemory(localInboundSemaphoreId, sizeof(uint64_t), connection->transport()); int remoteRank = communicator.remoteRankOf(*connection); int tag = communicator.tagOf(*connection); communicator.sendMemoryOnSetup(localInboundSemaphoreIdsRegMem, remoteRank, tag); diff --git a/test/mp_unit/proxy_channel_tests.cu b/test/mp_unit/proxy_channel_tests.cu index 2c276619e..796a565d4 100644 --- a/test/mp_unit/proxy_channel_tests.cu +++ b/test/mp_unit/proxy_channel_tests.cu @@ -435,9 +435,6 @@ void ProxyChannelOneToOneTest::testPacketPingPongPerf(bool useIbOnly) { proxyService->stopProxy(); } -// TODO: MSRCHA-371 -#if defined(MSCCLPP_DEVICE_CUDA) - TEST_F(ProxyChannelOneToOneTest, PacketPingPong) { testPacketPingPong(false); } TEST_F(ProxyChannelOneToOneTest, PacketPingPongIb) { testPacketPingPong(true); } @@ -445,5 +442,3 @@ TEST_F(ProxyChannelOneToOneTest, PacketPingPongIb) { testPacketPingPong(true); } TEST_F(ProxyChannelOneToOneTest, PacketPingPongPerf) { testPacketPingPongPerf(false); } TEST_F(ProxyChannelOneToOneTest, PacketPingPongPerfIb) { testPacketPingPongPerf(true); } - -#endif // defined(MSCCLPP_DEVICE_CUDA) From 53b0953c82813e2c16c10e83c5e69c075864206d Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Tue, 20 Feb 2024 21:59:09 +0000 Subject: [PATCH 47/89] Update names & interfaces --- include/mscclpp/packet_device.hpp | 155 +++++++++++++++++++++----- include/mscclpp/sm_channel_device.hpp | 43 +------ test/mp_unit/sm_channel_tests.cu | 28 ++--- test/mscclpp-test/allreduce_test.cu | 16 +-- 4 files changed, 153 insertions(+), 89 deletions(-) diff --git a/include/mscclpp/packet_device.hpp b/include/mscclpp/packet_device.hpp index f43abba6d..11f63b53f 100644 --- a/include/mscclpp/packet_device.hpp +++ b/include/mscclpp/packet_device.hpp @@ -5,6 +5,7 @@ #define MSCCLPP_PACKET_DEVICE_HPP_ #include +#include #include "device.hpp" @@ -15,7 +16,7 @@ namespace mscclpp { /// LL (low latency) protocol packet. -union alignas(16) LLPacket { +union alignas(16) LL16Packet { // Assume data is written with an atomicity of 8 bytes (IB/RDMA). struct { uint32_t data1; @@ -27,7 +28,7 @@ union alignas(16) LLPacket { #if defined(MSCCLPP_DEVICE_COMPILE) ulonglong2 raw_; - MSCCLPP_DEVICE_INLINE LLPacket() {} + MSCCLPP_DEVICE_INLINE LL16Packet() {} /// Write 8 bytes of data to the packet. /// @param val1 The first 4-byte data to write. @@ -87,7 +88,7 @@ union alignas(16) LLPacket { #endif // defined(MSCCLPP_DEVICE_COMPILE) }; -union alignas(8) LLPacket64 { +union alignas(8) LL8Packet { // Assume data is written with an atomicity of 8 bytes (IB/RDMA). 
struct { uint32_t data; @@ -96,7 +97,7 @@ union alignas(8) LLPacket64 { uint64_t raw_; #if defined(MSCCLPP_DEVICE_COMPILE) - MSCCLPP_DEVICE_INLINE LLPacket64() {} + MSCCLPP_DEVICE_INLINE LL8Packet() {} MSCCLPP_DEVICE_INLINE void write(uint32_t val, uint32_t flag) { #if defined(MSCCLPP_DEVICE_CUDA) @@ -133,62 +134,156 @@ union alignas(8) LLPacket64 { #endif // defined(MSCCLPP_DEVICE_COMPILE) }; +using LLPacket = LL16Packet; + #if defined(MSCCLPP_DEVICE_COMPILE) -/// Read from the origin and write to the target buffer. -MSCCLPP_DEVICE_INLINE void putPackets(void* targetPtr, uint64_t targetOffset, const void* originPtr, - uint64_t originOffset, uint64_t originBytes, uint32_t threadId, - uint32_t numThreads, uint32_t flag) { +/// Read data from the origin and write LL16Packets to the target buffer. +/// +/// @param targetPtr The target buffer. +/// @param targetOffset The offset in the target buffer. +/// @param originPtr The origin buffer. +/// @param originOffset The offset in the origin buffer. +/// @param originBytes The number of bytes to write to the target buffer. +/// @param threadId The thread ID. The thread ID should be less than @p numThreads. +/// @param numThreads The number of threads that call this function. +/// @param flag The flag to write. +/// +MSCCLPP_DEVICE_INLINE void putLL16Packets(void* targetPtr, uint64_t targetOffset, const void* originPtr, + uint64_t originOffset, uint64_t originBytes, uint32_t threadId, + uint32_t numThreads, uint32_t flag) { // Offsets should be aligned to 8 bytes & size should be a multiple of 8 bytes const uint32_t* originBase = (const uint32_t*)((const char*)originPtr + originOffset); - LLPacket* targetBase = (LLPacket*)((char*)targetPtr + targetOffset); + LL16Packet* targetBase = (LL16Packet*)((char*)targetPtr + targetOffset); size_t nElem = originBytes / sizeof(uint64_t); for (size_t i = threadId; i < nElem; i += numThreads) { - LLPacket* pkt = &targetBase[i]; + LL16Packet* pkt = &targetBase[i]; pkt->write(originBase[2 * i], originBase[2 * i + 1], flag); } } -/// Read from the target buffer and write to the origin. -MSCCLPP_DEVICE_INLINE void getPackets(const void* targetPtr, uint64_t targetOffset, void* originPtr, - uint64_t originOffset, uint64_t originBytes, uint32_t threadId, - uint32_t numThreads, uint32_t flag) { +/// Read LL16Packets from the target buffer and write retrieved data to the origin. +/// +/// @param targetPtr The target buffer. +/// @param targetOffset The offset in the target buffer. +/// @param originPtr The origin buffer. +/// @param originOffset The offset in the origin buffer. +/// @param originBytes The number of bytes to write to the target buffer. +/// @param threadId The thread ID. The thread ID should be less than @p numThreads. +/// @param numThreads The number of threads that call this function. +/// @param flag The flag to write. 
+/// +MSCCLPP_DEVICE_INLINE void getLL16Packets(const void* targetPtr, uint64_t targetOffset, void* originPtr, + uint64_t originOffset, uint64_t originBytes, uint32_t threadId, + uint32_t numThreads, uint32_t flag) { // Offsets should be aligned to 8 bytes & size should be a multiple of 8 bytes - const LLPacket* targetBase = (const LLPacket*)((const char*)targetPtr + targetOffset); + const LL16Packet* targetBase = (const LL16Packet*)((const char*)targetPtr + targetOffset); uint2* originBase = (uint2*)((char*)originPtr + originOffset); size_t nElem = originBytes / sizeof(uint2); for (size_t i = threadId; i < nElem; i += numThreads) { - const LLPacket* pkt = &targetBase[i]; + const LL16Packet* pkt = &targetBase[i]; originBase[i] = pkt->read(flag); } } -/// Read from the origin and write to the target buffer. Write 64-bit data at a time (32bit data + 32bit flag). -MSCCLPP_DEVICE_INLINE void putPackets64(void* targetPtr, uint64_t targetOffset, const void* originPtr, - uint64_t originOffset, uint64_t originBytes, uint32_t threadId, - uint32_t numThreads, uint32_t flag) { - // Offsets should be aligned to 8 bytes & size should be a multiple of 8 bytes +/// Read data from the origin and write LL8Packets to the target buffer. +/// +/// @param targetPtr The target buffer. +/// @param targetOffset The offset in the target buffer. +/// @param originPtr The origin buffer. +/// @param originOffset The offset in the origin buffer. +/// @param originBytes The number of bytes to write to the target buffer. +/// @param threadId The thread ID. The thread ID should be less than @p numThreads. +/// @param numThreads The number of threads that call this function. +/// @param flag The flag to write. +/// +MSCCLPP_DEVICE_INLINE void putLL8Packets(void* targetPtr, uint64_t targetOffset, const void* originPtr, + uint64_t originOffset, uint64_t originBytes, uint32_t threadId, + uint32_t numThreads, uint32_t flag) { + // Offsets should be aligned to 4 bytes & size should be a multiple of 4 bytes const uint32_t* originBase = (const uint32_t*)((const char*)originPtr + originOffset); - LLPacket64* targetBase = (LLPacket64*)((char*)targetPtr + targetOffset); + LL8Packet* targetBase = (LL8Packet*)((char*)targetPtr + targetOffset); size_t nElem = originBytes / sizeof(uint32_t); for (size_t i = threadId; i < nElem; i += numThreads) { - LLPacket64* pkt = &targetBase[i]; + LL8Packet* pkt = &targetBase[i]; pkt->write(originBase[i], flag); } } -/// Read from the target buffer and write to the origin. Read 64-bit data at a time (32bit data + 32bit flag). -MSCCLPP_DEVICE_INLINE void getPackets64(const void* targetPtr, uint64_t targetOffset, void* originPtr, - uint64_t originOffset, uint64_t originBytes, uint32_t threadId, - uint32_t numThreads, uint32_t flag) { - // Offsets should be aligned to 8 bytes & size should be a multiple of 8 bytes - const LLPacket64* targetBase = (const LLPacket64*)((const char*)targetPtr + targetOffset); +/// Read LL8Packets from the target buffer and write retrieved data to the origin. +/// +/// @param targetPtr The target buffer. +/// @param targetOffset The offset in the target buffer. +/// @param originPtr The origin buffer. +/// @param originOffset The offset in the origin buffer. +/// @param originBytes The number of bytes to write to the target buffer. +/// @param threadId The thread ID. The thread ID should be less than @p numThreads. +/// @param numThreads The number of threads that call this function. +/// @param flag The flag to write. 
+/// +MSCCLPP_DEVICE_INLINE void getLL8Packets(const void* targetPtr, uint64_t targetOffset, void* originPtr, + uint64_t originOffset, uint64_t originBytes, uint32_t threadId, + uint32_t numThreads, uint32_t flag) { + // Offsets should be aligned to 4 bytes & size should be a multiple of 4 bytes + const LL8Packet* targetBase = (const LL8Packet*)((const char*)targetPtr + targetOffset); uint32_t* originBase = (uint32_t*)((char*)originPtr + originOffset); size_t nElem = originBytes / sizeof(uint32_t); for (size_t i = threadId; i < nElem; i += numThreads) { - const LLPacket64* pkt = &targetBase[i]; + const LL8Packet* pkt = &targetBase[i]; originBase[i] = pkt->read(flag); } } + +/// Read data from the origin and write packets to the target buffer. +/// +/// @param targetPtr The target buffer. +/// @param targetOffset The offset in the target buffer. +/// @param originPtr The origin buffer. +/// @param originOffset The offset in the origin buffer. +/// @param originBytes The number of bytes to write to the target buffer. +/// @param threadId The thread ID. The thread ID should be less than @p numThreads. +/// @param numThreads The number of threads that call this function. +/// @param flag The flag to write. +/// @tparam PacketType The packet type. It should be either @ref LL16Packet or @ref LL8Packet. +/// +template +MSCCLPP_DEVICE_INLINE void putPackets(void* targetPtr, uint64_t targetOffset, const void* originPtr, + uint64_t originOffset, uint64_t originBytes, uint32_t threadId, + uint32_t numThreads, uint32_t flag) { + if constexpr (std::is_same::value) { + putLL16Packets(targetPtr, targetOffset, originPtr, originOffset, originBytes, threadId, numThreads, flag); + } else if constexpr (std::is_same::value) { + putLL8Packets(targetPtr, targetOffset, originPtr, originOffset, originBytes, threadId, numThreads, flag); + } else { + static_assert(std::is_same::value || std::is_same::value, + "Unsupported packet type"); + } +} + +/// Read packets from the target buffer and write retrieved data to the origin. +/// +/// @param targetPtr The target buffer. +/// @param targetOffset The offset in the target buffer. +/// @param originPtr The origin buffer. +/// @param originOffset The offset in the origin buffer. +/// @param originBytes The number of bytes to read from the origin buffer. +/// @param threadId The thread ID. The thread ID should be less than @p numThreads. +/// @param numThreads The number of threads that call this function. +/// @param flag The flag to read. +/// @tparam PacketType The packet type. It should be either @ref LL16Packet or @ref LL8Packet. 
+/// +template +MSCCLPP_DEVICE_INLINE void getPackets(const void* targetPtr, uint64_t targetOffset, void* originPtr, + uint64_t originOffset, uint64_t originBytes, uint32_t threadId, + uint32_t numThreads, uint32_t flag) { + if constexpr (std::is_same::value) { + getLL16Packets(targetPtr, targetOffset, originPtr, originOffset, originBytes, threadId, numThreads, flag); + } else if constexpr (std::is_same::value) { + getLL8Packets(targetPtr, targetOffset, originPtr, originOffset, originBytes, threadId, numThreads, flag); + } else { + static_assert(std::is_same::value || std::is_same::value, + "Unsupported packet type"); + } +} #endif // defined(MSCCLPP_DEVICE_COMPILE) }; // namespace mscclpp diff --git a/include/mscclpp/sm_channel_device.hpp b/include/mscclpp/sm_channel_device.hpp index 494c6e5ff..ee02e73c7 100644 --- a/include/mscclpp/sm_channel_device.hpp +++ b/include/mscclpp/sm_channel_device.hpp @@ -211,10 +211,12 @@ struct SmChannelDeviceHandle { /// @param threadId The index of the current thread among all threads running this function. This is different from /// the `threadIdx` in CUDA. /// @param numThreads The total number of threads that run this function. + /// @tparam PacketType The packet type. It should be either @ref LL16Packet or @ref LL8Packet. /// + template MSCCLPP_DEVICE_INLINE void putPackets(uint64_t targetOffset, uint64_t originOffset, uint64_t originBytes, uint32_t threadId, uint32_t numThreads, uint32_t flag) { - mscclpp::putPackets(dst_, targetOffset, src_, originOffset, originBytes, threadId, numThreads, flag); + mscclpp::putPackets(dst_, targetOffset, src_, originOffset, originBytes, threadId, numThreads, flag); } /// Retrieve data from @ref LLPacket in the local packet buffer (target) and write it on the local data (origin). @@ -227,45 +229,12 @@ struct SmChannelDeviceHandle { /// @param threadId The index of the current thread among all threads running this function. This is different from /// the `threadIdx` in CUDA. /// @param numThreads The total number of threads that run this function. + /// @tparam PacketType The packet type. It should be either @ref LL16Packet or @ref LL8Packet. /// + template MSCCLPP_DEVICE_INLINE void getPackets(uint64_t targetOffset, uint64_t originOffset, uint64_t originBytes, uint32_t threadId, uint32_t numThreads, uint32_t flag) { - mscclpp::getPackets(getPacketBuffer_, targetOffset, src_, originOffset, originBytes, threadId, numThreads, flag); - } - - /// Construct @ref LLPacket64 from the data in the local memory (origin) and write it on the remote packet buffer - /// (target). - /// - /// This function is intended to be collectively called by multiple threads. Each thread copies a part of packets. - /// Note that this function is intended to be used with @ref getPacket64() on the remote side to copy data in 64-bit. - /// - /// @param targetOffset The offset in bytes of the remote packet buffer. - /// @param originOffset The offset in bytes of the local data. - /// @param originBytes Bytes of the origin to be copied. - /// @param threadId The index of the current thread among all threads running this function. This is different from - /// the `threadIdx` in CUDA. - /// @param numThreads The total number of threads that run this function. 
- /// - MSCCLPP_DEVICE_INLINE void putPackets64(uint64_t targetOffset, uint64_t originOffset, uint64_t originBytes, - uint32_t threadId, uint32_t numThreads, uint32_t flag) { - mscclpp::putPackets64(dst_, targetOffset, src_, originOffset, originBytes, threadId, numThreads, flag); - } - - /// Retrieve data from @ref LLPacket in the local packet buffer (target) and write it on the local data (origin). - /// - /// This function is intended to be collectively called by multiple threads. Each thread copies a part of data. - /// Note that this function is intended to be used with @ref putPacket64() on the remote side to copy data in 64-bit. - /// - /// @param targetOffset The offset in bytes of the local packet buffer. - /// @param originOffset The offset in bytes of the local data. - /// @param originBytes Bytes of the origin to be copied. - /// @param threadId The index of the current thread among all threads running this function. This is different from - /// the `threadIdx` in CUDA. - /// @param numThreads The total number of threads that run this function. - /// - MSCCLPP_DEVICE_INLINE void getPackets64(uint64_t targetOffset, uint64_t originOffset, uint64_t originBytes, - uint32_t threadId, uint32_t numThreads, uint32_t flag) { - mscclpp::getPackets64(getPacketBuffer_, targetOffset, src_, originOffset, originBytes, threadId, numThreads, flag); + mscclpp::getPackets(getPacketBuffer_, targetOffset, src_, originOffset, originBytes, threadId, numThreads, flag); } /// Signal the remote semaphore. diff --git a/test/mp_unit/sm_channel_tests.cu b/test/mp_unit/sm_channel_tests.cu index 5f607248c..eba7bc811 100644 --- a/test/mp_unit/sm_channel_tests.cu +++ b/test/mp_unit/sm_channel_tests.cu @@ -293,7 +293,7 @@ TEST_F(SmChannelOneToOneTest, GetPingPong) { EXPECT_EQ(*ret, 0); } -__global__ void kernelSmPacket64PingPong(int* buff, int rank, int nElem, int* ret, int nTries) { +__global__ void kernelSmLL8PacketPingPong(int* buff, int rank, int nElem, int* ret, int nTries) { if (rank > 1) return; DeviceHandle& smChan = gChannelOneToOneTestConstSmChans; @@ -312,9 +312,9 @@ __global__ void kernelSmPacket64PingPong(int* buff, int rank, int nElem, int* re // sendBuff[2 * j + 1] = putOffset + i + 2 * j + 1; } // __syncthreads(); - smChan.putPackets64(0, 0, nElem * sizeof(int), threadIdx.x, blockDim.x, flag); + smChan.putPackets(0, 0, nElem * sizeof(int), threadIdx.x, blockDim.x, flag); } else { - smChan.getPackets64(0, 0, nElem * sizeof(int), threadIdx.x, blockDim.x, flag); + smChan.getPackets(0, 0, nElem * sizeof(int), threadIdx.x, blockDim.x, flag); // If each thread reads 8 bytes at once, we don't need a barrier after getPackets(). 
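// Why no barrier is needed at this point (a simplified sketch of what the
// LL8Packet::read() added earlier in this series does): each packet read is a
// single 8-byte load that carries the 4-byte payload together with its 4-byte
// flag, so a thread that observes the expected flag is guaranteed to observe the
// matching payload; no cross-thread coordination is required until one thread
// consumes elements written by another.
//
//   __device__ uint32_t readLL8(const mscclpp::LL8Packet* pkt, uint32_t flag) {
//     mscclpp::LL8Packet tmp;
//     do {
//       tmp.raw_ = *(const volatile uint64_t*)&pkt->raw_;  // one atomic 8-byte load
//     } while (tmp.flag != flag);  // spin until the writer's flag arrives
//     return tmp.data;  // payload observed together with its flag
//   }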
// __syncthreads(); for (int j = threadIdx.x; j < nElem; j += blockDim.x) { @@ -331,7 +331,7 @@ __global__ void kernelSmPacket64PingPong(int* buff, int rank, int nElem, int* re } } -__global__ void kernelSmPacketPingPong(int* buff, int rank, int nElem, int* ret, int nTries) { +__global__ void kernelSmLL16PacketPingPong(int* buff, int rank, int nElem, int* ret, int nTries) { if (rank > 1) return; DeviceHandle& smChan = gChannelOneToOneTestConstSmChans; @@ -349,9 +349,9 @@ __global__ void kernelSmPacketPingPong(int* buff, int rank, int nElem, int* ret, sendBuff[2 * j + 1] = putOffset + i + 2 * j + 1; } // __syncthreads(); - smChan.putPackets(0, 0, nElem * sizeof(int), threadIdx.x, blockDim.x, flag); + smChan.putPackets(0, 0, nElem * sizeof(int), threadIdx.x, blockDim.x, flag); } else { - smChan.getPackets(0, 0, nElem * sizeof(int), threadIdx.x, blockDim.x, flag); + smChan.getPackets(0, 0, nElem * sizeof(int), threadIdx.x, blockDim.x, flag); // If each thread reads 8 bytes at once, we don't need a barrier after getPackets(). // __syncthreads(); for (int j = threadIdx.x; j < nElem / 2; j += blockDim.x) { @@ -374,16 +374,16 @@ __global__ void kernelSmPacketPingPong(int* buff, int rank, int nElem, int* ret, } } -TEST_F(SmChannelOneToOneTest, Packet64PingPong) { - auto kernelSmPacket64PingPongWrapper = [](int* buff, int rank, int nElem, int* ret, int nTries) { - kernelSmPacket64PingPong<<<1, 1024>>>(buff, rank, nElem, ret, nTries); +TEST_F(SmChannelOneToOneTest, LL8PacketPingPong) { + auto kernelSmLL8PacketPingPongWrapper = [](int* buff, int rank, int nElem, int* ret, int nTries) { + kernelSmLL8PacketPingPong<<<1, 1024>>>(buff, rank, nElem, ret, nTries); }; - packetPingPongTest("smPacket64PingPong", kernelSmPacket64PingPongWrapper); + packetPingPongTest("smLL8PacketPingPong", kernelSmLL8PacketPingPongWrapper); } -TEST_F(SmChannelOneToOneTest, PacketPingPong) { - auto kernelSmPacketPingPongWrapper = [](int* buff, int rank, int nElem, int* ret, int nTries) { - kernelSmPacketPingPong<<<1, 1024>>>(buff, rank, nElem, ret, nTries); +TEST_F(SmChannelOneToOneTest, LL16PacketPingPong) { + auto kernelSmLL16PacketPingPongWrapper = [](int* buff, int rank, int nElem, int* ret, int nTries) { + kernelSmLL16PacketPingPong<<<1, 1024>>>(buff, rank, nElem, ret, nTries); }; - packetPingPongTest("smPacketPingPong", kernelSmPacketPingPongWrapper); + packetPingPongTest("smLL16PacketPingPong", kernelSmLL16PacketPingPongWrapper); } diff --git a/test/mscclpp-test/allreduce_test.cu b/test/mscclpp-test/allreduce_test.cu index 577539037..3a2485319 100644 --- a/test/mscclpp-test/allreduce_test.cu +++ b/test/mscclpp-test/allreduce_test.cu @@ -1049,40 +1049,40 @@ __global__ void allreduce7(int* buff, int* scratch, void* resultBuff, int rank, const int remoteRank = peerIdx < rank ? peerIdx : peerIdx + 1; const int tid = threadIdx.x + localBlockIdx * blockDim.x; // double buffering - size_t scratchBaseOffset = (flag & 1) ? 0 : nPkts * sizeof(mscclpp::LLPacket64); + size_t scratchBaseOffset = (flag & 1) ? 0 : nPkts * sizeof(mscclpp::LL8Packet); void* scratchBuff = (void*)((char*)scratch + scratchBaseOffset); - size_t scratchOffset = scratchBaseOffset + rank * nPktsPerRank * sizeof(mscclpp::LLPacket64); + size_t scratchOffset = scratchBaseOffset + rank * nPktsPerRank * sizeof(mscclpp::LL8Packet); size_t scratchResultOffset = - (flag & 1) ? 2 * nPkts * sizeof(mscclpp::LLPacket64) : 3 * nPkts * sizeof(mscclpp::LLPacket64); + (flag & 1) ? 
2 * nPkts * sizeof(mscclpp::LL8Packet) : 3 * nPkts * sizeof(mscclpp::LL8Packet);
   size_t srcOffset = remoteRank * nelemsPerRank * sizeof(int);
   uint32_t* src = (uint32_t*)((char*)buff + rank * nelemsPerRank * sizeof(int));
   uint32_t* dst = (uint32_t*)((char*)resultBuff + rank * nelemsPerRank * sizeof(int));
 
   // step 1: write to scratch buffer
-  constSmOutOfPlaceChans[peerIdx].putPackets64(scratchOffset, srcOffset, nelemsPerRank * sizeof(int), tid,
+  constSmOutOfPlaceChans[peerIdx].putPackets(scratchOffset, srcOffset, nelemsPerRank * sizeof(int), tid,
                                                blockDim.x * nBlocksPerPeer, flag);
   // step 2: get data from scratch buffer, reduce data and write result to remote scratch buffer
   for (int idx = threadIdx.x + blockIdx.x * blockDim.x; idx < nPktsPerRank; idx += blockDim.x * gridDim.x) {
     uint32_t data = 0;
     for (int index = 0; index < nPeers; index++) {
       const int remoteRank = index < rank ? index : index + 1;
-      mscclpp::LLPacket64* dstPkt = (mscclpp::LLPacket64*)scratchBuff + remoteRank * nPktsPerRank;
+      mscclpp::LL8Packet* dstPkt = (mscclpp::LL8Packet*)scratchBuff + remoteRank * nPktsPerRank;
       uint32_t val = dstPkt[idx].read(flag);
       data += val;
     }
     data += src[idx];
     dst[idx] = data;
 
-    mscclpp::LLPacket64 packet;
+    mscclpp::LL8Packet packet;
     packet.data = data;
     packet.flag = flag;
-    size_t offset = scratchResultOffset / sizeof(mscclpp::LLPacket64) + (idx + rank * nPktsPerRank);
+    size_t offset = scratchResultOffset / sizeof(mscclpp::LL8Packet) + (idx + rank * nPktsPerRank);
     for (int index = 0; index < nPeers; index++) {
       constSmOutOfPlaceChans[index].write(offset, packet);
     }
   }
 
   // step 3: get data result from scratch buffer
-  mscclpp::LLPacket64* dstPkt = (mscclpp::LLPacket64*)((char*)scratch + scratchResultOffset);
+  mscclpp::LL8Packet* dstPkt = (mscclpp::LL8Packet*)((char*)scratch + scratchResultOffset);
   const int dstOffset = remoteRank * nPktsPerRank;
   uint32_t* result = (uint32_t*)((char*)resultBuff + remoteRank * nelemsPerRank * sizeof(int));
   for (int idx = threadIdx.x + localBlockIdx * blockDim.x; idx < nPktsPerRank; idx += blockDim.x * nBlocksPerPeer) {

From 90f93a7b4304070c11b2a135752fc857972fe702 Mon Sep 17 00:00:00 2001
From: Changho Hwang
Date: Tue, 20 Feb 2024 22:03:06 +0000
Subject: [PATCH 48/89] Lint

---
 include/mscclpp/sm_channel_device.hpp | 3 ++-
 test/mscclpp-test/allreduce_test.cu   | 2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/include/mscclpp/sm_channel_device.hpp b/include/mscclpp/sm_channel_device.hpp
index ee02e73c7..e49a431b7 100644
--- a/include/mscclpp/sm_channel_device.hpp
+++ b/include/mscclpp/sm_channel_device.hpp
@@ -234,7 +234,8 @@ struct SmChannelDeviceHandle {
   template <typename PacketType>
   MSCCLPP_DEVICE_INLINE void getPackets(uint64_t targetOffset, uint64_t originOffset, uint64_t originBytes,
                                         uint32_t threadId, uint32_t numThreads, uint32_t flag) {
-    mscclpp::getPackets<PacketType>(getPacketBuffer_, targetOffset, src_, originOffset, originBytes, threadId, numThreads, flag);
+    mscclpp::getPackets<PacketType>(getPacketBuffer_, targetOffset, src_, originOffset, originBytes, threadId,
+                                    numThreads, flag);
   }
 
   /// Signal the remote semaphore.
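The packet-transfer entry points above are templated on the packet type, so a kernel picks the wire format at the call site. A minimal sketch of that choice follows; the kernel, the zero offsets, and the `PacketType` parameter name are assumptions for illustration, while the argument order matches the `getPackets` signature shown above and the packet names (`mscclpp::LL8Packet`, 8 bytes carrying a 4-byte payload; `mscclpp::LLPacket`, 16 bytes carrying two payload words) come from this series.

```cpp
#include <mscclpp/sm_channel_device.hpp>

// Illustrative only: the same collective put with each packet flavor.
__global__ void putBothFlavors(mscclpp::SmChannelDeviceHandle chan, size_t bytes, uint32_t flag) {
  // 8-byte LL packets: half payload, half flag; per-word validation, 2x buffer overhead.
  chan.putPackets<mscclpp::LL8Packet>(0 /*targetOffset*/, 0 /*originOffset*/, bytes,
                                      threadIdx.x, blockDim.x, flag);
  // 16-byte LL packets: two 4-byte payload words, each paired with its own flag word.
  chan.putPackets<mscclpp::LLPacket>(0, 0, bytes, threadIdx.x, blockDim.x, flag);
}
```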
diff --git a/test/mscclpp-test/allreduce_test.cu b/test/mscclpp-test/allreduce_test.cu index 3a2485319..980f4c18b 100644 --- a/test/mscclpp-test/allreduce_test.cu +++ b/test/mscclpp-test/allreduce_test.cu @@ -1060,7 +1060,7 @@ __global__ void allreduce7(int* buff, int* scratch, void* resultBuff, int rank, // step 1: write to scratch buffer constSmOutOfPlaceChans[peerIdx].putPackets(scratchOffset, srcOffset, nelemsPerRank * sizeof(int), tid, - blockDim.x * nBlocksPerPeer, flag); + blockDim.x * nBlocksPerPeer, flag); // step 2: get data from scratch buffer, reduce data and write result to remote scratch buffer for (int idx = threadIdx.x + blockIdx.x * blockDim.x; idx < nPktsPerRank; idx += blockDim.x * gridDim.x) { uint32_t data = 0; From c1da8dd737e6ad2cea4881719a7b87e74ba96f5f Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Tue, 20 Feb 2024 22:04:38 +0000 Subject: [PATCH 49/89] spelling --- apps/nccl/include/nccl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/nccl/include/nccl.h b/apps/nccl/include/nccl.h index ac852f9af..bd0a3f264 100644 --- a/apps/nccl/include/nccl.h +++ b/apps/nccl/include/nccl.h @@ -275,7 +275,7 @@ ncclResult_t pncclReduce(const void* sendbuff, void* recvbuff, size_t count, ncc * root is the rank (not the CUDA device) where data resides before the * operation is started. * - * This operation is implicitely in place. + * This operation is implicitly in place. */ ncclResult_t ncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root, ncclComm_t comm, cudaStream_t stream); From c7ebf3a53eb2828c97b7efc44aa197ad7c3e3282 Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Wed, 21 Feb 2024 00:11:30 +0000 Subject: [PATCH 50/89] Updates for ROCm --- apps/nccl/CMakeLists.txt | 6 +++++- apps/nccl/include/nccl.h | 9 +-------- apps/nccl/src/allreduce.cu | 2 -- apps/nccl/src/nccl.cu | 9 ++++----- include/mscclpp/gpu.hpp | 11 ++++++++++- test/nccl_api_test.cc | 2 +- 6 files changed, 21 insertions(+), 18 deletions(-) delete mode 100644 apps/nccl/src/allreduce.cu diff --git a/apps/nccl/CMakeLists.txt b/apps/nccl/CMakeLists.txt index 83a797c91..32159d3c2 100644 --- a/apps/nccl/CMakeLists.txt +++ b/apps/nccl/CMakeLists.txt @@ -1,9 +1,13 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. -file(GLOB_RECURSE SOURCES CONFIGURE_DEPENDS src/nccl.cu src/allreduce.cu) +file(GLOB_RECURSE SOURCES CONFIGURE_DEPENDS src/nccl.cu) file(GLOB_RECURSE HEADERS CONFIGURE_DEPENDS include/nccl.h) +if(USE_ROCM) + set_source_files_properties(src/nccl.cu PROPERTIES LANGUAGE CXX) +endif() + add_library(mscclpp_nccl_obj OBJECT) target_sources(mscclpp_nccl_obj PRIVATE ${SOURCES}) target_sources(mscclpp_nccl_obj PUBLIC FILE_SET HEADERS FILES ${HEADERS}) diff --git a/apps/nccl/include/nccl.h b/apps/nccl/include/nccl.h index bd0a3f264..f94173ef8 100644 --- a/apps/nccl/include/nccl.h +++ b/apps/nccl/include/nccl.h @@ -8,14 +8,7 @@ #ifndef NCCL_H_ #define NCCL_H_ -#include -#include -#if CUDART_VERSION >= 11000 -#include -#endif -#if CUDART_VERSION >= 11080 -#include -#endif +#include #ifdef __cplusplus extern "C" { diff --git a/apps/nccl/src/allreduce.cu b/apps/nccl/src/allreduce.cu deleted file mode 100644 index fd2aca680..000000000 --- a/apps/nccl/src/allreduce.cu +++ /dev/null @@ -1,2 +0,0 @@ -// Copyright (c) Microsoft Corporation. -// Licensed under the MIT license. 
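Before the nccl.cu changes below, note the shape of the portability layer this patch leans on: `include/mscclpp/gpu.hpp` spells every runtime symbol the CUDA way and, under `__HIP_PLATFORM_AMD__`, forwards it to HIP through type aliases and variadic macros. A condensed sketch of the pattern follows; `using cudaError_t = hipError_t;` and the `cudaStreamCreate` macro appear verbatim in the hunks here, while the `cudaStream_t` alias is an assumption (gpu.hpp defines the full set of aliases, only a few of which are visible in this diff).

```cpp
#if defined(__HIP_PLATFORM_AMD__)
#include <hip/hip_runtime.h>
using cudaError_t = hipError_t;    // CUDA spelling, HIP type (as in the hunk below)
using cudaStream_t = hipStream_t;  // assumed alias; gpu.hpp carries the complete list
#define cudaStreamCreate(...) hipStreamCreate(__VA_ARGS__)
#else
#include <cuda_runtime.h>
#endif

// Call sites in common code then compile unchanged against either runtime:
inline cudaError_t makeStream(cudaStream_t* s) { return cudaStreamCreate(s); }
```

This single indirection is why `apps/nccl/include/nccl.h` can drop its direct CUDA runtime includes above: pulling in the one portability header is enough for both back ends.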
diff --git a/apps/nccl/src/nccl.cu b/apps/nccl/src/nccl.cu index 734790b57..42dd32a81 100644 --- a/apps/nccl/src/nccl.cu +++ b/apps/nccl/src/nccl.cu @@ -102,13 +102,13 @@ __forceinline__ __device__ void vectorSum(T* dst, T* src, size_t nElem, int bloc size_t nLastInts = nElem % 4; int4* dst4 = (int4*)dst; int4* src4 = (int4*)src; - for (int i = threadIdx.x + blockId * blockDim.x; i < nInt4; i += blockDim.x * nBlocks) { + for (size_t i = threadIdx.x + blockId * blockDim.x; i < nInt4; i += blockDim.x * nBlocks) { dst4[i] = add_vectors(dst4[i], src4[i]); } if (nLastInts > 0) { int* dstLast = ((int*)dst) + nInt4 * 4; int* srcLast = ((int*)src) + nInt4 * 4; - for (int i = threadIdx.x + blockId * blockDim.x; i < nLastInts; i += blockDim.x * nBlocks) { + for (size_t i = threadIdx.x + blockId * blockDim.x; i < nLastInts; i += blockDim.x * nBlocks) { dstLast[i] = add_vectors(dstLast[i], srcLast[i]); } } @@ -259,7 +259,7 @@ __global__ void allreduce1(T* src, T* dst, int rank, int nranks, size_t nelems) // use int4 as much as possible const size_t nInt4 = chunkSize / vectorSize; - for (int idx = threadIdx.x + blockIdx.x * blockDim.x; idx < nInt4; idx += blockDim.x * gridDim.x) { + for (size_t idx = threadIdx.x + blockIdx.x * blockDim.x; idx < nInt4; idx += blockDim.x * gridDim.x) { int4 tmp = src4[indexOffset4 + idx]; for (int index = 0; index < nPeer; ++index) { int4 val; @@ -276,7 +276,7 @@ __global__ void allreduce1(T* src, T* dst, int rank, int nranks, size_t nelems) const size_t nRemElems = nelems - processed; const size_t startIdx = processed + (nRemElems * rank) / nranks; const size_t endIdx = processed + (nRemElems * (rank + 1)) / nranks; - for (int idx = threadIdx.x + blockIdx.x * blockDim.x + startIdx; idx < endIdx; idx += blockDim.x * gridDim.x) { + for (size_t idx = threadIdx.x + blockIdx.x * blockDim.x + startIdx; idx < endIdx; idx += blockDim.x * gridDim.x) { T tmp = src[idx]; for (int index = 0; index < nPeer; ++index) { int peerIdx = (index + rank); @@ -369,7 +369,6 @@ static std::vector setupRemoteMemories(std::shared_pt std::vector> remoteRegMemoryFutures; for (int i = 0; i < comm->bootstrap()->getNranks(); i++) { if (i == rank) continue; - mscclpp::Transport transport = getTransport(rank, i); remoteRegMemoryFutures.push_back(comm->recvMemoryOnSetup(i, 0)); comm->sendMemoryOnSetup(memory, i, 0); } diff --git a/include/mscclpp/gpu.hpp b/include/mscclpp/gpu.hpp index f560a655c..d25f9ce63 100644 --- a/include/mscclpp/gpu.hpp +++ b/include/mscclpp/gpu.hpp @@ -7,6 +7,8 @@ #if defined(__HIP_PLATFORM_AMD__) #include +#include +#include using cudaError_t = hipError_t; using cudaGraph_t = hipGraph_t; @@ -61,6 +63,7 @@ constexpr auto CU_MEM_ACCESS_FLAGS_PROT_READWRITE = hipMemAccessFlagsProtReadWri #define cudaMemcpy(...) hipMemcpy(__VA_ARGS__) #define cudaMemcpyAsync(...) hipMemcpyAsync(__VA_ARGS__) #define cudaMemcpyToSymbol(...) hipMemcpyToSymbol(__VA_ARGS__) +#define cudaStreamCreate(...) hipStreamCreate(__VA_ARGS__) #define cudaStreamCreateWithFlags(...) hipStreamCreateWithFlags(__VA_ARGS__) #define cudaStreamSynchronize(...) hipStreamSynchronize(__VA_ARGS__) #define cudaStreamBeginCapture(...) 
hipStreamBeginCapture(__VA_ARGS__) @@ -88,8 +91,14 @@ constexpr auto CU_MEM_ACCESS_FLAGS_PROT_READWRITE = hipMemAccessFlagsProtReadWri #else #include -#include #include +#include +#if (CUDART_VERSION >= 11000) +#include +#endif +#if (CUDART_VERSION >= 11080) +#include +#endif #endif diff --git a/test/nccl_api_test.cc b/test/nccl_api_test.cc index a2113aea3..788993866 100644 --- a/test/nccl_api_test.cc +++ b/test/nccl_api_test.cc @@ -5,7 +5,7 @@ #include #include -#include "cuda_runtime.h" +#include #include "mpi.h" #include "nccl.h" From 182a8e3918d9ab20d4908119b35ffd95cef78847 Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Wed, 21 Feb 2024 00:13:03 +0000 Subject: [PATCH 51/89] fixes --- test/mp_unit/sm_channel_tests.cu | 8 ++++---- test/mscclpp-test/allreduce_test.cu | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/test/mp_unit/sm_channel_tests.cu b/test/mp_unit/sm_channel_tests.cu index eba7bc811..45c5fa644 100644 --- a/test/mp_unit/sm_channel_tests.cu +++ b/test/mp_unit/sm_channel_tests.cu @@ -312,9 +312,9 @@ __global__ void kernelSmLL8PacketPingPong(int* buff, int rank, int nElem, int* r // sendBuff[2 * j + 1] = putOffset + i + 2 * j + 1; } // __syncthreads(); - smChan.putPackets(0, 0, nElem * sizeof(int), threadIdx.x, blockDim.x, flag); + smChan.putPackets(0, 0, nElem * sizeof(int), threadIdx.x, blockDim.x, flag); } else { - smChan.getPackets(0, 0, nElem * sizeof(int), threadIdx.x, blockDim.x, flag); + smChan.getPackets(0, 0, nElem * sizeof(int), threadIdx.x, blockDim.x, flag); // If each thread reads 8 bytes at once, we don't need a barrier after getPackets(). // __syncthreads(); for (int j = threadIdx.x; j < nElem; j += blockDim.x) { @@ -349,9 +349,9 @@ __global__ void kernelSmLL16PacketPingPong(int* buff, int rank, int nElem, int* sendBuff[2 * j + 1] = putOffset + i + 2 * j + 1; } // __syncthreads(); - smChan.putPackets(0, 0, nElem * sizeof(int), threadIdx.x, blockDim.x, flag); + smChan.putPackets(0, 0, nElem * sizeof(int), threadIdx.x, blockDim.x, flag); } else { - smChan.getPackets(0, 0, nElem * sizeof(int), threadIdx.x, blockDim.x, flag); + smChan.getPackets(0, 0, nElem * sizeof(int), threadIdx.x, blockDim.x, flag); // If each thread reads 8 bytes at once, we don't need a barrier after getPackets(). 
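      // (With 16-byte LL packets each packet still carries a flag per payload word, and
      // the verification loop below walks nElem / 2 packet slots, two ints per packet.)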
// __syncthreads(); for (int j = threadIdx.x; j < nElem / 2; j += blockDim.x) { diff --git a/test/mscclpp-test/allreduce_test.cu b/test/mscclpp-test/allreduce_test.cu index 980f4c18b..cbedcefd0 100644 --- a/test/mscclpp-test/allreduce_test.cu +++ b/test/mscclpp-test/allreduce_test.cu @@ -1059,8 +1059,8 @@ __global__ void allreduce7(int* buff, int* scratch, void* resultBuff, int rank, uint32_t* dst = (uint32_t*)((char*)resultBuff + rank * nelemsPerRank * sizeof(int)); // step 1: write to scratch buffer - constSmOutOfPlaceChans[peerIdx].putPackets(scratchOffset, srcOffset, nelemsPerRank * sizeof(int), tid, - blockDim.x * nBlocksPerPeer, flag); + constSmOutOfPlaceChans[peerIdx].putPackets(scratchOffset, srcOffset, nelemsPerRank * sizeof(int), + tid, blockDim.x * nBlocksPerPeer, flag); // step 2: get data from scratch buffer, reduce data and write result to remote scratch buffer for (int idx = threadIdx.x + blockIdx.x * blockDim.x; idx < nPktsPerRank; idx += blockDim.x * gridDim.x) { uint32_t data = 0; From 34a1c6b097cba3b5ce66442504e47197dfdc43d4 Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Wed, 21 Feb 2024 06:23:57 +0000 Subject: [PATCH 52/89] Add more function definitions --- apps/nccl/src/nccl.cu | 122 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 122 insertions(+) diff --git a/apps/nccl/src/nccl.cu b/apps/nccl/src/nccl.cu index 42dd32a81..0ce6f8bab 100644 --- a/apps/nccl/src/nccl.cu +++ b/apps/nccl/src/nccl.cu @@ -405,6 +405,11 @@ NCCL_API ncclResult_t ncclGetUniqueId(ncclUniqueId* uniqueId) { return ncclSuccess; } +NCCL_API ncclResult_t ncclCommInitRankConfig(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank, ncclConfig_t* config) { + // TODO: implement this function + return ncclInternalError; +} + NCCL_API ncclResult_t ncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank) { if (comm == nullptr) return ncclInvalidArgument; if (nranks < 0 || rank < 0 || rank >= nranks) return ncclInvalidArgument; @@ -448,12 +453,32 @@ NCCL_API ncclResult_t ncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueI return ncclSuccess; } +NCCL_API ncclResult_t ncclCommInitAll(ncclComm_t* comm, int ndev, const int* devlist) { + // TODO: implement this function + return ncclInternalError; +} + +NCCL_API ncclResult_t ncclCommFinalize(ncclComm_t comm) { + // TODO: implement this function + return ncclInternalError; +} + NCCL_API ncclResult_t ncclCommDestroy(ncclComm_t comm) { if (comm == nullptr) return ncclInvalidArgument; delete comm; return ncclSuccess; } +NCCL_API ncclResult_t ncclCommAbort(ncclComm_t comm) { + // TODO: implement this function + return ncclInternalError; +} + +NCCL_API ncclResult_t ncclCommSplit(ncclComm_t comm, int color, int key, ncclComm_t *newcomm, ncclConfig_t* config) { + // TODO: implement this function + return ncclInternalError; +} + NCCL_API const char* ncclGetErrorString(ncclResult_t result) { switch (result) { case ncclSuccess : return "no error"; @@ -468,6 +493,62 @@ NCCL_API const char* ncclGetErrorString(ncclResult_t result) { } } +NCCL_API const char* ncclGetLastError(ncclComm_t comm) { + // TODO: implement this function + return nullptr; +} + +NCCL_API ncclResult_t ncclCommGetAsyncError(ncclComm_t comm, ncclResult_t *asyncError) { + // TODO: implement this function + return ncclInternalError; + +} + +NCCL_API ncclResult_t ncclCommCount(const ncclComm_t comm, int* count) { + if (comm == nullptr || count == nullptr) return ncclInvalidArgument; + *count = comm->comm->bootstrap()->getNranks(); + return ncclSuccess; +} + 
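+// Design note for this batch of stubs: entry points that are not implemented yet
+// fail fast with ncclInternalError rather than pretending to succeed, while the
+// queries the bootstrap can already answer (ncclCommCount above, ncclCommUserRank
+// below) and the no-op group calls (ncclGroupStart/ncclGroupEnd) return ncclSuccess.
+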
+NCCL_API ncclResult_t ncclCommCuDevice(const ncclComm_t comm, int* device) { + // TODO: implement this function + return ncclInternalError; +} + +NCCL_API ncclResult_t ncclCommUserRank(const ncclComm_t comm, int* rank) { + if (comm == nullptr || rank == nullptr) return ncclInvalidArgument; + *rank = comm->comm->bootstrap()->getRank(); + return ncclSuccess; +} + +NCCL_API ncclResult_t ncclRedOpCreatePreMulSum(ncclRedOp_t *op, void *scalar, ncclDataType_t datatype, ncclScalarResidence_t residence, ncclComm_t comm) { + // TODO: implement this function + return ncclInternalError; +} + +NCCL_API ncclResult_t ncclRedOpDestroy(ncclRedOp_t op, ncclComm_t comm) { + // TODO: implement this function + return ncclInternalError; +} + +NCCL_API ncclResult_t ncclReduce(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, + ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) { + // TODO: implement this function + return ncclInternalError; +} + +NCCL_API ncclResult_t ncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root, + ncclComm_t comm, cudaStream_t stream) { + // TODO: implement this function + return ncclInternalError; +} + +NCCL_API ncclResult_t ncclBroadcast(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root, + ncclComm_t comm, cudaStream_t stream) { + // TODO: implement this function + return ncclInternalError; +} + NCCL_API ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream) { size_t bytes = count * ncclTypeSize(datatype); @@ -540,3 +621,44 @@ NCCL_API ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t } return ncclSuccess; } + +NCCL_API ncclResult_t ncclReduceScatter(const void* sendbuff, void* recvbuff, + size_t recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, + cudaStream_t stream) { + // TODO: implement this function + return ncclInternalError; +} + +NCCL_API ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount, + ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream) { + // TODO: implement this function + return ncclInternalError; +} + +NCCL_API ncclResult_t ncclSend(const void* sendbuff, size_t count, ncclDataType_t datatype, int peer, + ncclComm_t comm, cudaStream_t stream) { + // TODO: implement this function + return ncclInternalError; +} + +NCCL_API ncclResult_t ncclRecv(void* recvbuff, size_t count, ncclDataType_t datatype, int peer, + ncclComm_t comm, cudaStream_t stream) { + // TODO: implement this function + return ncclInternalError; +} + +NCCL_API ncclResult_t ncclAllToAll(const void* sendbuff, void* recvbuff, size_t count, + ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream) { + // TODO: implement this function + return ncclInternalError; +} + +NCCL_API ncclResult_t ncclGroupStart() { + // Do nothing + return ncclSuccess; +} + +NCCL_API ncclResult_t ncclGroupEnd() { + // Do nothing + return ncclSuccess; +} From b8fa212c95ebfe463ae5b568f1fc7080bf060600 Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Wed, 21 Feb 2024 10:07:37 +0000 Subject: [PATCH 53/89] make it work for rocm6.0 --- apps/nccl/src/nccl.cu | 4 ++-- include/mscclpp/gpu.hpp | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/apps/nccl/src/nccl.cu b/apps/nccl/src/nccl.cu index 0ce6f8bab..ec47362cf 100644 --- a/apps/nccl/src/nccl.cu +++ b/apps/nccl/src/nccl.cu @@ -511,8 +511,8 @@ NCCL_API ncclResult_t ncclCommCount(const 
ncclComm_t comm, int* count) {
 }
 
 NCCL_API ncclResult_t ncclCommCuDevice(const ncclComm_t comm, int* device) {
-  // TODO: implement this function
-  return ncclInternalError;
+  *device = comm->comm->bootstrap()->getRank();
+  return ncclSuccess;
 }
 
 NCCL_API ncclResult_t ncclCommUserRank(const ncclComm_t comm, int* rank) {
diff --git a/include/mscclpp/gpu.hpp b/include/mscclpp/gpu.hpp
index d25f9ce63..188f04966 100644
--- a/include/mscclpp/gpu.hpp
+++ b/include/mscclpp/gpu.hpp
@@ -8,7 +8,7 @@
 #include
 #include
-#include
+// #include
 
 using cudaError_t = hipError_t;
 using cudaGraph_t = hipGraph_t;

From 4b3e27c555e7e6d4be824e586f96cf06f0c5272b Mon Sep 17 00:00:00 2001
From: Binyang Li
Date: Wed, 21 Feb 2024 11:17:26 +0000
Subject: [PATCH 54/89] allreduce7 works

---
 apps/nccl/src/nccl.cu               | 91 +++++++++++++++++++++++++++++
 src/bootstrap/bootstrap.cc          |  2 +-
 test/mscclpp-test/allreduce_test.cu | 10 +++-
 3 files changed, 100 insertions(+), 3 deletions(-)

diff --git a/apps/nccl/src/nccl.cu b/apps/nccl/src/nccl.cu
index ec47362cf..39f08f411 100644
--- a/apps/nccl/src/nccl.cu
+++ b/apps/nccl/src/nccl.cu
@@ -96,6 +96,21 @@ __forceinline__ __device__ int add_vectors<__half>(int a, int b) {
   return add_vectors_helper<__half2>(a, b);
 }
 
+template <typename T>
+__forceinline__ __device__ uint32_t add_vectors_helper(uint32_t a, uint32_t b) {
+  return bit_cast<uint32_t, T>(add_elements(bit_cast<T, uint32_t>(a), bit_cast<T, uint32_t>(b)));
+}
+
+template <typename T>
+__forceinline__ __device__ uint32_t add_vectors(uint32_t a, uint32_t b) {
+  return add_vectors_helper<T>(a, b);
+}
+
+template <>
+__forceinline__ __device__ uint32_t add_vectors<__half>(uint32_t a, uint32_t b) {
+  return add_vectors_helper<__half2>(a, b);
+}
+
 template <typename T>
 __forceinline__ __device__ void vectorSum(T* dst, T* src, size_t nElem, int blockId, int nBlocks) {
   size_t nInt4 = nElem / 4;
@@ -310,11 +325,86 @@ __global__ void allreduce1(T* src, T* dst, int rank, int nranks, size_t nelems)
   }
 }
 
+template <typename T>
+__global__ void allreduce7(T* buff, T* scratch, T* resultBuff, int rank, int nRanksPerNode, int worldSize,
+                           size_t nelems) {
+  // This version of allreduce only works for single nodes
+  if (worldSize != nRanksPerNode) return;
+  nelems = nelems / (sizeof(int) / sizeof(T));
+  const int nPeers = nRanksPerNode - 1;
+  const size_t nPkts = nelems;
+  const int nelemsPerRank = nelems / worldSize;
+  const int nPktsPerRank = nelemsPerRank;
+  // flag for packets. Initially 1
+  const uint32_t flag = (uint32_t)globalFlag;
+  // thread block & channel info
+  const int nBlocksPerPeer = gridDim.x / nPeers;
+  const int localBlockIdx = blockIdx.x % nBlocksPerPeer;
+  const int peerIdx = blockIdx.x / nBlocksPerPeer;
+  const int remoteRank = peerIdx < rank ? peerIdx : peerIdx + 1;
+  const int tid = threadIdx.x + localBlockIdx * blockDim.x;
+  // double buffering
+  size_t scratchBaseOffset = (flag & 1) ? 0 : nPkts * sizeof(mscclpp::LL8Packet);
+  void* scratchBuff = (void*)((char*)scratch + scratchBaseOffset);
+  size_t scratchOffset = scratchBaseOffset + rank * nPktsPerRank * sizeof(mscclpp::LL8Packet);
+  size_t scratchResultOffset =
+      (flag & 1) ?
2 * nPkts * sizeof(mscclpp::LL8Packet) : 3 * nPkts * sizeof(mscclpp::LL8Packet); + size_t srcOffset = remoteRank * nelemsPerRank * sizeof(int); + uint32_t* src = (uint32_t*)((char*)buff + rank * nelemsPerRank * sizeof(int)); + uint32_t* dst = (uint32_t*)((char*)resultBuff + rank * nelemsPerRank * sizeof(int)); + + // step 1: write to scratch buffer + constSmChannels[peerIdx].putPackets(scratchOffset, srcOffset, nelemsPerRank * sizeof(int), tid, + blockDim.x * nBlocksPerPeer, flag); + // step 2: get data from scratch buffer, reduce data and write result to remote scratch buffer + for (int idx = threadIdx.x + blockIdx.x * blockDim.x; idx < nPktsPerRank; idx += blockDim.x * gridDim.x) { + uint32_t data = 0; + for (int index = 0; index < nPeers; index++) { + const int remoteRank = index < rank ? index : index + 1; + mscclpp::LL8Packet* dstPkt = (mscclpp::LL8Packet*)scratchBuff + remoteRank * nPktsPerRank; + uint32_t val = dstPkt[idx].read(flag); + data = add_vectors(val, data); + } + data = add_vectors(data, src[idx]); + dst[idx] = data; + + mscclpp::LL8Packet packet; + packet.data = data; + packet.flag = flag; + size_t offset = scratchResultOffset / sizeof(mscclpp::LL8Packet) + (idx + rank * nPktsPerRank); + for (int index = 0; index < nPeers; index++) { + constSmChannels[index].write(offset, packet); + } + } + // step 3: get data result from scratch buffer + mscclpp::LL8Packet* dstPkt = (mscclpp::LL8Packet*)((char*)scratch + scratchResultOffset); + const int dstOffset = remoteRank * nPktsPerRank; + uint32_t* result = (uint32_t*)((char*)resultBuff + remoteRank * nelemsPerRank * sizeof(int)); + for (int idx = threadIdx.x + localBlockIdx * blockDim.x; idx < nPktsPerRank; idx += blockDim.x * nBlocksPerPeer) { + uint32_t data = dstPkt[idx + dstOffset].read(flag); + result[idx] = data; + } + if (threadIdx.x == 0 && blockIdx.x == 0) { + globalFlag += 1; + } +} + template cudaError_t allreduce(T* buff, T* scratch, T* resultBuff, int rank, int nRanksPerNode, int worldSize, size_t nelems, cudaStream_t stream) { if (sizeof(T) * nelems <= (1 << 20)) { +#if defined(__HIP_PLATFORM_AMD__) + int nBlocks = 28; + int nThreadsPerBlock = 1024; + if (nelems >= 8192) { + nBlocks = 56; + nThreadsPerBlock = (nelems <= 76800) ? 
512 : 1024; + } + allreduce7<<>>(buff, scratch, resultBuff, rank, nRanksPerNode, worldSize, + nelems); +#else allreduce6<<<21, 512, 0, stream>>>(buff, scratch, resultBuff, rank, nRanksPerNode, worldSize, nelems); +#endif } else { allreduce1<<<24, 1024, 0, stream>>>(buff, resultBuff, rank, worldSize, nelems); } @@ -511,6 +601,7 @@ NCCL_API ncclResult_t ncclCommCount(const ncclComm_t comm, int* count) { } NCCL_API ncclResult_t ncclCommCuDevice(const ncclComm_t comm, int* device) { + if (comm == nullptr || device == nullptr) return ncclInvalidArgument; *device = comm->comm->bootstrap()->getRank(); return ncclSuccess; } diff --git a/src/bootstrap/bootstrap.cc b/src/bootstrap/bootstrap.cc index 6ae8b2c10..c9cea10f4 100644 --- a/src/bootstrap/bootstrap.cc +++ b/src/bootstrap/bootstrap.cc @@ -187,7 +187,7 @@ void TcpBootstrap::Impl::initialize(const std::string& ifIpPortTrio, int64_t tim } if (!netInitialized) { - netInit("", "", netIfAddr_); + netInit(ipPortPair, interface, netIfAddr_); netInitialized = true; } diff --git a/test/mscclpp-test/allreduce_test.cu b/test/mscclpp-test/allreduce_test.cu index cbedcefd0..56de36159 100644 --- a/test/mscclpp-test/allreduce_test.cu +++ b/test/mscclpp-test/allreduce_test.cu @@ -1138,9 +1138,15 @@ void AllReduceTestColl::runColl(const TestArgs& args, cudaStream_t stream) { tmpBuff = scratchPacketBuff; nThreadsPerBlock = 512; } else if (kernelNum == 7) { - nBlocks = 28; tmpBuff = scratchPacketBuff; - nThreadsPerBlock = 1024; + // tune the #blocks and #threads for MI300X + if (paramCount_ < 8192) { + nBlocks = 28; + nThreadsPerBlock = 1024; + } else { + nBlocks = 56; + nThreadsPerBlock = (paramCount_ <= 76800) ? 512 : 1024; + } } else { nBlocks = std::max(args.nRanksPerNode - 1, 1) * BLOCKS_PER_PEER; tmpBuff = scratchPacketBuff; From 5bf684b69fc1a56eb4234978bbf184a24bb5681e Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Wed, 21 Feb 2024 23:08:21 +0000 Subject: [PATCH 55/89] disable IB for now --- apps/nccl/CMakeLists.txt | 2 +- apps/nccl/README.md | 14 +++++++++++ apps/nccl/rccl_test.py | 53 ++++++++++++++++++++++++++++++++++++++++ apps/nccl/src/nccl.cu | 23 ++++++++--------- 4 files changed, 80 insertions(+), 12 deletions(-) create mode 100644 apps/nccl/README.md create mode 100644 apps/nccl/rccl_test.py diff --git a/apps/nccl/CMakeLists.txt b/apps/nccl/CMakeLists.txt index 32159d3c2..2432fe562 100644 --- a/apps/nccl/CMakeLists.txt +++ b/apps/nccl/CMakeLists.txt @@ -12,7 +12,7 @@ add_library(mscclpp_nccl_obj OBJECT) target_sources(mscclpp_nccl_obj PRIVATE ${SOURCES}) target_sources(mscclpp_nccl_obj PUBLIC FILE_SET HEADERS FILES ${HEADERS}) target_include_directories(mscclpp_nccl_obj PRIVATE ${GPU_INCLUDE_DIRS} include) -target_link_libraries(mscclpp_nccl_obj PRIVATE ${GPU_LIBRARIES} mscclpp_obj) +target_link_libraries(mscclpp_nccl_obj PRIVATE ${GPU_LIBRARIES} PUBLIC mscclpp_obj) set_target_properties(mscclpp_nccl_obj PROPERTIES LINKER_LANGUAGE CXX POSITION_INDEPENDENT_CODE 1 VERSION ${MSCCLPP_VERSION} SOVERSION ${MSCCLPP_SOVERSION}) if(USE_CUDA) target_compile_definitions(mscclpp_nccl_obj PRIVATE USE_CUDA) diff --git a/apps/nccl/README.md b/apps/nccl/README.md new file mode 100644 index 000000000..b6c537ed7 --- /dev/null +++ b/apps/nccl/README.md @@ -0,0 +1,14 @@ +# NCCL Interfaces of MSCCL++ + +Compile + +```bash +CXX=/opt/rocm/bin/hipcc cmake -DCMAKE_BUILD_TYPE=Release -DBUILD_APPS_NCCL=ON -DBUILD_PYTHON_BINDINGS=OFF .. 
+make -j +``` + +Run rccl-tests + +```bash +mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD="$MSCCLPP_BUILD/libmscclpp.so $MSCCLPP_BUILD/apps/nccl/libmscclpp_nccl.so" -x MSCCLPP_DEBUG=WARN -x MSCCLPP_DEBUG_SUBSYS=ALL -x NCCL_DEBUG=WARN ./build/all_reduce_perf -b 1K -e 256M -f 2 -d half -G 20 -w 10 -n 50 +``` diff --git a/apps/nccl/rccl_test.py b/apps/nccl/rccl_test.py new file mode 100644 index 000000000..9cdbed1ac --- /dev/null +++ b/apps/nccl/rccl_test.py @@ -0,0 +1,53 @@ +import os +from mpi4py import MPI +import torch +from cupy.cuda import nccl + +ROOT_RANK = 0 +comm = MPI.COMM_WORLD +rank =comm.Get_rank() + +is_group_root = rank == ROOT_RANK + +world_size = comm.Get_size() + +os.environ["CUDA_VISIBLE_DEVICES"] = str(rank) + +device_type = "cuda" +torch.cuda.set_device(0) +device_index = 0 +device = torch.device(type=device_type, index=device_index) + +if is_group_root: + id_ = nccl.get_unique_id() +else: + id_ = None + +ranks = range(world_size) +id_, ranks = comm.bcast((id_, ranks), root=0) +group = nccl.NcclCommunicator(len(ranks), id_, rank) +print(f"{rank=}, {device=}, {group=}") + +M = 1024 +N = 4096 +K = 2048 +shape_a = (M,K) +shape_b = (K,N) +shape_c = (M,N) + +a = torch.ones(shape_a, device="cuda") +b = torch.ones(shape_b, device="cuda") +c = torch.mm(a, b) + +print(c) + +nccl_op = nccl.NCCL_SUM +group.allReduce( + sendbuf=c.data_ptr(), + recvbuf=c.data_ptr(), + count=c.nelement(), + datatype=nccl.NCCL_FLOAT, + op=nccl_op, + stream=torch.cuda.current_stream().cuda_stream) + +print(c) diff --git a/apps/nccl/src/nccl.cu b/apps/nccl/src/nccl.cu index 39f08f411..715211c7d 100644 --- a/apps/nccl/src/nccl.cu +++ b/apps/nccl/src/nccl.cu @@ -139,9 +139,9 @@ static const int nRanksPerNode = 8; // Only use scratch buffer for message size less then 1MB static const int scratchSize = 1024 * 1024 * 8; -static const mscclpp::Transport IBs[] = {mscclpp::Transport::IB0, mscclpp::Transport::IB1, mscclpp::Transport::IB2, - mscclpp::Transport::IB3, mscclpp::Transport::IB4, mscclpp::Transport::IB5, - mscclpp::Transport::IB6, mscclpp::Transport::IB7}; +// static const mscclpp::Transport IBs[] = {mscclpp::Transport::IB0, mscclpp::Transport::IB1, mscclpp::Transport::IB2, +// mscclpp::Transport::IB3, mscclpp::Transport::IB4, mscclpp::Transport::IB5, +// mscclpp::Transport::IB6, mscclpp::Transport::IB7}; __constant__ mscclpp::DeviceHandle constSmChannels[8]; __constant__ mscclpp::DeviceHandle constSmOutChannels[8]; @@ -444,11 +444,12 @@ static size_t ncclTypeSize(ncclDataType_t type) { } static mscclpp::Transport getTransport(int rank, int peerRank) { - if (rank / nRanksPerNode == peerRank / nRanksPerNode) { - return mscclpp::Transport::CudaIpc; - } else { - return IBs[rank % nRanksPerNode]; - } + // if (rank / nRanksPerNode == peerRank / nRanksPerNode) { + // return mscclpp::Transport::CudaIpc; + // } else { + // return IBs[rank % nRanksPerNode]; + // } + return mscclpp::Transport::CudaIpc; } static std::vector setupRemoteMemories(std::shared_ptr comm, int rank, @@ -537,7 +538,7 @@ NCCL_API ncclResult_t ncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueI // using scratch buffer for message size less then 1MB commPtr->scratchBuff = mscclpp::allocExtSharedCuda(scratchSize); commPtr->remoteScratchRegMemories = setupRemoteMemories(commPtr->comm, rank, commPtr->scratchBuff.get(), scratchSize, - mscclpp::Transport::CudaIpc | IBs[rank % nRanksPerNode]); + mscclpp::Transport::CudaIpc); *comm = commPtr; return ncclSuccess; @@ -665,12 +666,12 @@ NCCL_API ncclResult_t ncclAllReduce(const 
void* sendbuff, void* recvbuff, size_t if (it == comm->smChannels.end()) { std::vector remoteMemories = setupRemoteMemories(comm->comm, rank, const_cast(sendbuff), bytes, - mscclpp::Transport::CudaIpc | IBs[rank % nRanksPerNode]); + mscclpp::Transport::CudaIpc); std::vector channels = setupSmChannels(comm, remoteMemories, const_cast(sendbuff)); it = comm->smChannels.emplace(key, channels).first; if (sendbuff != recvbuff) { std::vector remoteMemories = - setupRemoteMemories(comm->comm, rank, recvbuff, bytes, mscclpp::Transport::CudaIpc | IBs[rank % nRanksPerNode]); + setupRemoteMemories(comm->comm, rank, recvbuff, bytes, mscclpp::Transport::CudaIpc); std::vector outChannels = setupSmChannels(comm, remoteMemories, recvbuff); outIt = comm->smOutChannels.emplace(key, outChannels).first; } From d92e248b9811da908315074e49f3b8719c07b14a Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Thu, 22 Feb 2024 01:20:37 +0000 Subject: [PATCH 56/89] allgather works --- apps/nccl/rccl_test.py | 33 +++++++---- apps/nccl/src/nccl.cu | 128 +++++++++++++++++++++++++++++++++++++---- 2 files changed, 140 insertions(+), 21 deletions(-) diff --git a/apps/nccl/rccl_test.py b/apps/nccl/rccl_test.py index 9cdbed1ac..298d26eee 100644 --- a/apps/nccl/rccl_test.py +++ b/apps/nccl/rccl_test.py @@ -38,16 +38,29 @@ a = torch.ones(shape_a, device="cuda") b = torch.ones(shape_b, device="cuda") c = torch.mm(a, b) - + print(c) - -nccl_op = nccl.NCCL_SUM -group.allReduce( - sendbuf=c.data_ptr(), - recvbuf=c.data_ptr(), - count=c.nelement(), + +# nccl_op = nccl.NCCL_SUM +# group.allReduce( +# sendbuf=c.data_ptr(), +# recvbuf=c.data_ptr(), +# count=c.nelement(), +# datatype=nccl.NCCL_FLOAT, +# op=nccl_op, +# stream=torch.cuda.current_stream().cuda_stream) + +# print(c) + +d = torch.ones((1024*1024,), device="cuda") +e = torch.zeros((8*1024*1024,), device="cuda") +e[rank*1024*1024:(rank+1)*1024*1024] = d + +group.allGather( + sendbuf=d.data_ptr(), + recvbuf=e.data_ptr(), + count=d.nelement(), datatype=nccl.NCCL_FLOAT, - op=nccl_op, stream=torch.cuda.current_stream().cuda_stream) - -print(c) + +print(e) diff --git a/apps/nccl/src/nccl.cu b/apps/nccl/src/nccl.cu index 715211c7d..73d9d6b2d 100644 --- a/apps/nccl/src/nccl.cu +++ b/apps/nccl/src/nccl.cu @@ -21,6 +21,14 @@ } \ } while (0) +#define NUM_CHANNELS_PER_CONNECTION 32 + +#if defined(__HIP_PLATFORM_AMD__) +#define WARP_SIZE 64 +#else +#define WARP_SIZE 32 +#endif + template __forceinline__ __device__ To bit_cast(const From& src) { static_assert(sizeof(To) == sizeof(From), "Size mismatch for bit_cast"); @@ -143,8 +151,8 @@ static const int scratchSize = 1024 * 1024 * 8; // mscclpp::Transport::IB3, mscclpp::Transport::IB4, mscclpp::Transport::IB5, // mscclpp::Transport::IB6, mscclpp::Transport::IB7}; -__constant__ mscclpp::DeviceHandle constSmChannels[8]; -__constant__ mscclpp::DeviceHandle constSmOutChannels[8]; +__constant__ mscclpp::DeviceHandle constSmChannels[256]; +__constant__ mscclpp::DeviceHandle constSmOutChannels[256]; __device__ mscclpp::DeviceSyncer deviceSyncer; struct channelKey { @@ -411,6 +419,76 @@ cudaError_t allreduce(T* buff, T* scratch, T* resultBuff, int rank, int nRanksPe return cudaGetLastError(); } +__global__ void __launch_bounds__(1024, 1) + allgather5(size_t rank, [[maybe_unused]] size_t worldSize, size_t nRanksPerNode, size_t nelemsPerGPU) { + const size_t nBlock = gridDim.x; + if (blockIdx.x >= nBlock) return; + + const size_t tid = threadIdx.x + blockIdx.x * blockDim.x; + const size_t lid = tid % WARP_SIZE; + const size_t wid = tid / WARP_SIZE; + + 
const size_t nThread = blockDim.x * nBlock; + const size_t nWarp = nThread / WARP_SIZE; + const size_t nPeer = nRanksPerNode - 1; + const size_t chanOffset = nPeer * blockIdx.x; + auto smChans = constSmChannels + chanOffset; + + if (wid < nPeer && lid == 0) { + smChans[wid].relaxedSignal(); + smChans[wid].wait(); + } + __syncthreads(); + const size_t bytesPerGPU = nelemsPerGPU * sizeof(int); + const size_t bytes = bytesPerGPU * nPeer; + size_t unitBytesPerThread; + if (bytes >= nThread * 64) { + unitBytesPerThread = 64; + } else { + unitBytesPerThread = 16; + } + const size_t unitBytesPerWarp = unitBytesPerThread * WARP_SIZE; + const size_t unitBytes = unitBytesPerWarp * nWarp; + const size_t nLoop = bytes / unitBytes; + + if (nLoop > 0) { + // First loop unrolling + const size_t peerIdx = wid % nPeer; + const size_t remoteRankLocalIndex = (peerIdx < rank ? peerIdx : peerIdx + 1); + const size_t offset = bytesPerGPU * remoteRankLocalIndex + (wid / nPeer) * unitBytesPerWarp; + smChans[peerIdx].get<16, false>(offset, unitBytesPerWarp, lid, WARP_SIZE); + } + + for (size_t i = 1; i < nLoop; ++i) { + const size_t gWid = wid + i * nWarp; + const size_t peerIdx = gWid % nPeer; + const size_t remoteRankLocalIndex = (peerIdx < rank ? peerIdx : peerIdx + 1); + const size_t offset = bytesPerGPU * remoteRankLocalIndex + (gWid / nPeer) * unitBytesPerWarp; + smChans[peerIdx].get<16, false>(offset, unitBytesPerWarp, lid, WARP_SIZE); + } + + if (bytes % unitBytes > 0) { + const size_t gWid = wid + nLoop * nWarp; + const size_t peerIdx = gWid % nPeer; + const size_t remoteRankLocalIndex = (peerIdx < rank ? peerIdx : peerIdx + 1); + const size_t offsetWithinRank = (gWid / nPeer) * unitBytesPerWarp; + const size_t offset = bytesPerGPU * remoteRankLocalIndex + offsetWithinRank; + const size_t remainBytes = (offsetWithinRank + unitBytesPerWarp > bytesPerGPU) + ? ((bytesPerGPU > offsetWithinRank) ? 
(bytesPerGPU - offsetWithinRank) : 0) + : unitBytesPerWarp; + if (remainBytes > 0) { + smChans[peerIdx].get<16, true>(offset, remainBytes, lid, WARP_SIZE); + } + } +} + +template +cudaError_t allgather(T* buff, T* scratch, T* resultBuff, int rank, int nRanksPerNode, int worldSize, size_t nelems, + cudaStream_t stream) { + allgather5<<<24, 1024, 0, stream>>>(rank, worldSize, nRanksPerNode, nelems); + return cudaGetLastError(); +} + static size_t ncclTypeSize(ncclDataType_t type) { switch (type) { case ncclInt8: @@ -474,9 +552,12 @@ static std::vector setupSmChannels(ncclComm_t comm, void* src) { std::vector channels; std::vector>& smSemaphores = comm->smSemaphores; - for (size_t cid = 0; cid < comm->connections.size(); ++cid) { - if (comm->connections[cid]->transport() == mscclpp::Transport::CudaIpc) { - channels.emplace_back(smSemaphores[cid], remoteMemories[cid], src, nullptr); + size_t nConnections = comm->connections.size(); + for (size_t idx = 0; idx < NUM_CHANNELS_PER_CONNECTION; ++idx) { + for (size_t cid = 0; cid < nConnections; ++cid) { + if (comm->connections[cid]->transport() == mscclpp::Transport::CudaIpc) { + channels.emplace_back(smSemaphores[idx * nConnections + cid], remoteMemories[cid], src, nullptr); + } } } return channels; @@ -523,10 +604,12 @@ NCCL_API ncclResult_t ncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueI [](const auto& future) { return future.get(); }); std::vector> smSemaphores; - for (size_t cid = 0; cid < connections.size(); ++cid) { - if (connections[cid]->transport() == mscclpp::Transport::CudaIpc) { - smSemaphores.emplace_back( - std::make_shared(*(mscclppComm), connections[cid])); + for (size_t idx = 0; idx < NUM_CHANNELS_PER_CONNECTION; ++idx) { + for (size_t cid = 0; cid < connections.size(); ++cid) { + if (connections[cid]->transport() == mscclpp::Transport::CudaIpc) { + smSemaphores.emplace_back( + std::make_shared(*(mscclppComm), connections[cid])); + } } } mscclppComm->setup(); @@ -723,8 +806,31 @@ NCCL_API ncclResult_t ncclReduceScatter(const void* sendbuff, void* recvbuff, NCCL_API ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount, ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream) { - // TODO: implement this function - return ncclInternalError; + size_t bytes = sendcount * ncclTypeSize(datatype); + if (sendbuff == nullptr || recvbuff == nullptr || bytes == 0 || comm == nullptr) return ncclInvalidArgument; + int rank = comm->comm->bootstrap()->getRank(); + int nRank = comm->comm->bootstrap()->getNranks(); + channelKey key{sendbuff, recvbuff, bytes}; + + auto it = comm->smChannels.find(key); + if (it == comm->smChannels.end()) { + std::vector remoteMemories = + setupRemoteMemories(comm->comm, rank, const_cast(recvbuff), bytes * nRank, + mscclpp::Transport::CudaIpc); + std::vector channels = + setupSmChannels(comm, remoteMemories, const_cast(recvbuff)); + it = comm->smChannels.emplace(key, channels).first; + } + std::vector> smChannelDeviceHandles; + std::transform(it->second.begin(), it->second.end(), std::back_inserter(smChannelDeviceHandles), + [](const mscclpp::SmChannel& smChannel) { return mscclpp::deviceHandle(smChannel); }); + // TODO: if sendbuff and recvbuff don't change, we can avoid copying smChannelDeviceHandles to device + CUDACHECK(cudaMemcpyToSymbol(constSmChannels, smChannelDeviceHandles.data(), + sizeof(mscclpp::DeviceHandle) * smChannelDeviceHandles.size())); + + CUDACHECK(allgather((int*)sendbuff, (int*)comm->scratchBuff.get(), (int*)recvbuff, + rank, nRanksPerNode, nRank, 
bytes / sizeof(int), stream)); + return ncclSuccess; } NCCL_API ncclResult_t ncclSend(const void* sendbuff, size_t count, ncclDataType_t datatype, int peer, From 141e7da21437b0b44aa1730fe12679393d390ee3 Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Thu, 22 Feb 2024 01:49:58 +0000 Subject: [PATCH 57/89] add some impls --- apps/nccl/src/nccl.cu | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/apps/nccl/src/nccl.cu b/apps/nccl/src/nccl.cu index 73d9d6b2d..5b84fb095 100644 --- a/apps/nccl/src/nccl.cu +++ b/apps/nccl/src/nccl.cu @@ -633,8 +633,8 @@ NCCL_API ncclResult_t ncclCommInitAll(ncclComm_t* comm, int ndev, const int* dev } NCCL_API ncclResult_t ncclCommFinalize(ncclComm_t comm) { - // TODO: implement this function - return ncclInternalError; + comm->comm->bootstrap()->barrier(); + return ncclSuccess; } NCCL_API ncclResult_t ncclCommDestroy(ncclComm_t comm) { @@ -645,7 +645,7 @@ NCCL_API ncclResult_t ncclCommDestroy(ncclComm_t comm) { NCCL_API ncclResult_t ncclCommAbort(ncclComm_t comm) { // TODO: implement this function - return ncclInternalError; + return ncclSuccess; } NCCL_API ncclResult_t ncclCommSplit(ncclComm_t comm, int color, int key, ncclComm_t *newcomm, ncclConfig_t* config) { @@ -673,9 +673,9 @@ NCCL_API const char* ncclGetLastError(ncclComm_t comm) { } NCCL_API ncclResult_t ncclCommGetAsyncError(ncclComm_t comm, ncclResult_t *asyncError) { - // TODO: implement this function - return ncclInternalError; - + if (asyncError == nullptr) return ncclInvalidArgument; + *asyncError = ncclSuccess; + return ncclSuccess; } NCCL_API ncclResult_t ncclCommCount(const ncclComm_t comm, int* count) { From c50f7b5964bec2fc70f549d1afb49b096f0b05cd Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Thu, 22 Feb 2024 03:46:44 +0000 Subject: [PATCH 58/89] out-of-place allgather --- apps/nccl/src/nccl.cu | 2 ++ 1 file changed, 2 insertions(+) diff --git a/apps/nccl/src/nccl.cu b/apps/nccl/src/nccl.cu index 5b84fb095..f4cb31db2 100644 --- a/apps/nccl/src/nccl.cu +++ b/apps/nccl/src/nccl.cu @@ -485,6 +485,8 @@ __global__ void __launch_bounds__(1024, 1) template cudaError_t allgather(T* buff, T* scratch, T* resultBuff, int rank, int nRanksPerNode, int worldSize, size_t nelems, cudaStream_t stream) { + cudaError_t err = cudaMemcpyAsync(resultBuff + nelems * rank, buff, nelems * sizeof(T), cudaMemcpyDeviceToDevice, stream); + if (err != cudaSuccess) return err; allgather5<<<24, 1024, 0, stream>>>(rank, worldSize, nRanksPerNode, nelems); return cudaGetLastError(); } From cf91552ed85080bc06123e4d8fcc25455f6e86c4 Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Thu, 22 Feb 2024 11:30:41 +0000 Subject: [PATCH 59/89] hipGraph work --- apps/nccl/src/nccl.cu | 10 +++++++--- include/mscclpp/gpu.hpp | 1 + 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/apps/nccl/src/nccl.cu b/apps/nccl/src/nccl.cu index f4cb31db2..beac7c8aa 100644 --- a/apps/nccl/src/nccl.cu +++ b/apps/nccl/src/nccl.cu @@ -182,6 +182,8 @@ struct ncclComm { std::unordered_map> smOutChannels; std::shared_ptr scratchBuff; std::vector remoteScratchRegMemories; + std::vector> smDeviceHandles; + std::vector> smOutDeviceHandles; }; cudaError_t allreduce(int* buff, int* scratch, void* resultBuff, int rank, int nRanksPerNode, int worldSize, @@ -739,12 +741,14 @@ NCCL_API ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t setupSmChannels(comm, comm->remoteScratchRegMemories, const_cast(sendbuff)); it = comm->smChannels.emplace(key, channels).first; } - std::vector> smChannelDeviceHandles; 
+ std::vector>& smChannelDeviceHandles = comm->smDeviceHandles; + smChannelDeviceHandles.clear(); std::transform(it->second.begin(), it->second.end(), std::back_inserter(smChannelDeviceHandles), [](const mscclpp::SmChannel& smChannel) { return mscclpp::deviceHandle(smChannel); }); // TODO: if sendbuff and recvbuff don't change, we can avoid copying smChannelDeviceHandles to device - CUDACHECK(cudaMemcpyToSymbol(constSmChannels, smChannelDeviceHandles.data(), - sizeof(mscclpp::DeviceHandle) * smChannelDeviceHandles.size())); + CUDACHECK(cudaMemcpyToSymbolAsync(constSmChannels, smChannelDeviceHandles.data(), + sizeof(mscclpp::DeviceHandle) * smChannelDeviceHandles.size(), + 0, cudaMemcpyHostToDevice, stream)); } else { auto it = comm->smChannels.find(key); auto outIt = comm->smOutChannels.find(key); diff --git a/include/mscclpp/gpu.hpp b/include/mscclpp/gpu.hpp index 188f04966..a73532afa 100644 --- a/include/mscclpp/gpu.hpp +++ b/include/mscclpp/gpu.hpp @@ -63,6 +63,7 @@ constexpr auto CU_MEM_ACCESS_FLAGS_PROT_READWRITE = hipMemAccessFlagsProtReadWri #define cudaMemcpy(...) hipMemcpy(__VA_ARGS__) #define cudaMemcpyAsync(...) hipMemcpyAsync(__VA_ARGS__) #define cudaMemcpyToSymbol(...) hipMemcpyToSymbol(__VA_ARGS__) +#define cudaMemcpyToSymbolAsync(...) hipMemcpyToSymbolAsync(__VA_ARGS__) #define cudaStreamCreate(...) hipStreamCreate(__VA_ARGS__) #define cudaStreamCreateWithFlags(...) hipStreamCreateWithFlags(__VA_ARGS__) #define cudaStreamSynchronize(...) hipStreamSynchronize(__VA_ARGS__) From 1becb792a5987b69c6b63876576a220a27221342 Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Thu, 22 Feb 2024 13:55:02 +0000 Subject: [PATCH 60/89] hipGraph works --- apps/nccl/src/nccl.cu | 175 ++++++++++++++++++++++++------------------ 1 file changed, 99 insertions(+), 76 deletions(-) diff --git a/apps/nccl/src/nccl.cu b/apps/nccl/src/nccl.cu index beac7c8aa..1cb7c71d6 100644 --- a/apps/nccl/src/nccl.cu +++ b/apps/nccl/src/nccl.cu @@ -173,24 +173,28 @@ struct hash { }; } // namespace std +struct ChannelInfo { + std::vector smChannels; + std::vector smOutChannels; + std::vector> smChannelDeviceHandles; + std::vector> smOutChannelDeviceHandles; +}; + struct ncclComm { std::shared_ptr comm; std::vector> connections; std::vector> smSemaphores; - std::unordered_map> smChannels; - std::unordered_map> smOutChannels; + std::unordered_map channelInfos; std::shared_ptr scratchBuff; std::vector remoteScratchRegMemories; - std::vector> smDeviceHandles; - std::vector> smOutDeviceHandles; }; cudaError_t allreduce(int* buff, int* scratch, void* resultBuff, int rank, int nRanksPerNode, int worldSize, size_t nelems, cudaStream_t stream); -#include #include +#include // extern __constant__ mscclpp::SmChannelDeviceHandle *constSmChannels; __device__ uint64_t globalFlag; @@ -513,12 +517,12 @@ static size_t ncclTypeSize(ncclDataType_t type) { #if defined(__CUDA_BF16_TYPES_EXIST__) case ncclBfloat16: return 2; -#endif // defined(__CUDA_BF16_TYPES_EXIST__) +#endif // defined(__CUDA_BF16_TYPES_EXIST__) #if defined(__CUDA_FP8_TYPES_EXIST__) case ncclFp8E4M3: case ncclFp8E5M2: return 1; -#endif // defined(__CUDA_FP8_TYPES_EXIST__) +#endif // defined(__CUDA_FP8_TYPES_EXIST__) case ncclNumTypes: return 0; } @@ -581,7 +585,8 @@ NCCL_API ncclResult_t ncclGetUniqueId(ncclUniqueId* uniqueId) { return ncclSuccess; } -NCCL_API ncclResult_t ncclCommInitRankConfig(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank, ncclConfig_t* config) { +NCCL_API ncclResult_t ncclCommInitRankConfig(ncclComm_t* comm, int nranks, ncclUniqueId 
commId, int rank, + ncclConfig_t* config) { // TODO: implement this function return ncclInternalError; } @@ -624,8 +629,8 @@ NCCL_API ncclResult_t ncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueI commPtr->smSemaphores = std::move(smSemaphores); // using scratch buffer for message size less then 1MB commPtr->scratchBuff = mscclpp::allocExtSharedCuda(scratchSize); - commPtr->remoteScratchRegMemories = setupRemoteMemories(commPtr->comm, rank, commPtr->scratchBuff.get(), scratchSize, - mscclpp::Transport::CudaIpc); + commPtr->remoteScratchRegMemories = + setupRemoteMemories(commPtr->comm, rank, commPtr->scratchBuff.get(), scratchSize, mscclpp::Transport::CudaIpc); *comm = commPtr; return ncclSuccess; @@ -652,22 +657,31 @@ NCCL_API ncclResult_t ncclCommAbort(ncclComm_t comm) { return ncclSuccess; } -NCCL_API ncclResult_t ncclCommSplit(ncclComm_t comm, int color, int key, ncclComm_t *newcomm, ncclConfig_t* config) { +NCCL_API ncclResult_t ncclCommSplit(ncclComm_t comm, int color, int key, ncclComm_t* newcomm, ncclConfig_t* config) { // TODO: implement this function return ncclInternalError; } NCCL_API const char* ncclGetErrorString(ncclResult_t result) { switch (result) { - case ncclSuccess : return "no error"; - case ncclUnhandledCudaError : return "unhandled cuda error (run with NCCL_DEBUG=INFO for details)"; - case ncclSystemError : return "unhandled system error (run with NCCL_DEBUG=INFO for details)"; - case ncclInternalError : return "internal error - please report this issue to the NCCL developers"; - case ncclInvalidArgument : return "invalid argument (run with NCCL_DEBUG=WARN for details)"; - case ncclInvalidUsage : return "invalid usage (run with NCCL_DEBUG=WARN for details)"; - case ncclRemoteError : return "remote process exited or there was a network error"; - case ncclInProgress : return "NCCL operation in progress"; - default : return "unknown result code"; + case ncclSuccess: + return "no error"; + case ncclUnhandledCudaError: + return "unhandled cuda error (run with NCCL_DEBUG=INFO for details)"; + case ncclSystemError: + return "unhandled system error (run with NCCL_DEBUG=INFO for details)"; + case ncclInternalError: + return "internal error - please report this issue to the NCCL developers"; + case ncclInvalidArgument: + return "invalid argument (run with NCCL_DEBUG=WARN for details)"; + case ncclInvalidUsage: + return "invalid usage (run with NCCL_DEBUG=WARN for details)"; + case ncclRemoteError: + return "remote process exited or there was a network error"; + case ncclInProgress: + return "NCCL operation in progress"; + default: + return "unknown result code"; } } @@ -676,7 +690,7 @@ NCCL_API const char* ncclGetLastError(ncclComm_t comm) { return nullptr; } -NCCL_API ncclResult_t ncclCommGetAsyncError(ncclComm_t comm, ncclResult_t *asyncError) { +NCCL_API ncclResult_t ncclCommGetAsyncError(ncclComm_t comm, ncclResult_t* asyncError) { if (asyncError == nullptr) return ncclInvalidArgument; *asyncError = ncclSuccess; return ncclSuccess; @@ -700,7 +714,8 @@ NCCL_API ncclResult_t ncclCommUserRank(const ncclComm_t comm, int* rank) { return ncclSuccess; } -NCCL_API ncclResult_t ncclRedOpCreatePreMulSum(ncclRedOp_t *op, void *scalar, ncclDataType_t datatype, ncclScalarResidence_t residence, ncclComm_t comm) { +NCCL_API ncclResult_t ncclRedOpCreatePreMulSum(ncclRedOp_t* op, void* scalar, ncclDataType_t datatype, + ncclScalarResidence_t residence, ncclComm_t comm) { // TODO: implement this function return ncclInternalError; } @@ -716,14 +731,14 @@ NCCL_API ncclResult_t 
ncclReduce(const void* sendbuff, void* recvbuff, size_t co return ncclInternalError; } -NCCL_API ncclResult_t ncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root, - ncclComm_t comm, cudaStream_t stream) { +NCCL_API ncclResult_t ncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root, ncclComm_t comm, + cudaStream_t stream) { // TODO: implement this function return ncclInternalError; } -NCCL_API ncclResult_t ncclBroadcast(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root, - ncclComm_t comm, cudaStream_t stream) { +NCCL_API ncclResult_t ncclBroadcast(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, + int root, ncclComm_t comm, cudaStream_t stream) { // TODO: implement this function return ncclInternalError; } @@ -735,56 +750,63 @@ NCCL_API ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t int rank = comm->comm->bootstrap()->getRank(); channelKey key{sendbuff, recvbuff, bytes}; if (bytes <= 1 << 20) { - auto it = comm->smChannels.find(key); - if (it == comm->smChannels.end()) { + auto it = comm->channelInfos.find(key); + if (it == comm->channelInfos.end()) { std::vector channels = setupSmChannels(comm, comm->remoteScratchRegMemories, const_cast(sendbuff)); - it = comm->smChannels.emplace(key, channels).first; + std::vector> smChannelDeviceHandles; + std::transform(channels.begin(), channels.end(), std::back_inserter(smChannelDeviceHandles), + [](const mscclpp::SmChannel& smChannel) { return mscclpp::deviceHandle(smChannel); }); + ChannelInfo channelInfo{channels, {}, smChannelDeviceHandles, {}}; + it = comm->channelInfos.emplace(key, channelInfo).first; } - std::vector>& smChannelDeviceHandles = comm->smDeviceHandles; - smChannelDeviceHandles.clear(); - std::transform(it->second.begin(), it->second.end(), std::back_inserter(smChannelDeviceHandles), - [](const mscclpp::SmChannel& smChannel) { return mscclpp::deviceHandle(smChannel); }); // TODO: if sendbuff and recvbuff don't change, we can avoid copying smChannelDeviceHandles to device - CUDACHECK(cudaMemcpyToSymbolAsync(constSmChannels, smChannelDeviceHandles.data(), - sizeof(mscclpp::DeviceHandle) * smChannelDeviceHandles.size(), - 0, cudaMemcpyHostToDevice, stream)); + CUDACHECK(cudaMemcpyToSymbolAsync( + constSmChannels, it->second.smChannelDeviceHandles.data(), + sizeof(mscclpp::DeviceHandle) * it->second.smChannelDeviceHandles.size(), 0, + cudaMemcpyHostToDevice, stream)); } else { - auto it = comm->smChannels.find(key); - auto outIt = comm->smOutChannels.find(key); - if (it == comm->smChannels.end()) { + auto it = comm->channelInfos.find(key); + if (it == comm->channelInfos.end()) { std::vector remoteMemories = - setupRemoteMemories(comm->comm, rank, const_cast(sendbuff), bytes, - mscclpp::Transport::CudaIpc); + setupRemoteMemories(comm->comm, rank, const_cast(sendbuff), bytes, mscclpp::Transport::CudaIpc); std::vector channels = setupSmChannels(comm, remoteMemories, const_cast(sendbuff)); - it = comm->smChannels.emplace(key, channels).first; + std::vector> smChannelDeviceHandles; + std::transform(channels.begin(), channels.end(), std::back_inserter(smChannelDeviceHandles), + [](const mscclpp::SmChannel& smChannel) { return mscclpp::deviceHandle(smChannel); }); + ChannelInfo channelInfo{channels, {}, smChannelDeviceHandles, {}}; + it = comm->channelInfos.emplace(key, channelInfo).first; if (sendbuff != recvbuff) { std::vector remoteMemories = setupRemoteMemories(comm->comm, rank, recvbuff, bytes, 
mscclpp::Transport::CudaIpc); std::vector outChannels = setupSmChannels(comm, remoteMemories, recvbuff); - outIt = comm->smOutChannels.emplace(key, outChannels).first; + it->second.smOutChannels = outChannels; + std::transform(outChannels.begin(), outChannels.end(), std::back_inserter(it->second.smOutChannelDeviceHandles), + [](const mscclpp::SmChannel& smChannel) { return mscclpp::deviceHandle(smChannel); }); } } - std::vector> smChannelDeviceHandles; - std::transform(it->second.begin(), it->second.end(), std::back_inserter(smChannelDeviceHandles), - [](const mscclpp::SmChannel& smChannel) { return mscclpp::deviceHandle(smChannel); }); // TODO: if sendbuff and recvbuff don't change, we can avoid copying smChannelDeviceHandles to device - CUDACHECK(cudaMemcpyToSymbol(constSmChannels, smChannelDeviceHandles.data(), - sizeof(mscclpp::DeviceHandle) * smChannelDeviceHandles.size())); + CUDACHECK(cudaMemcpyToSymbolAsync( + constSmChannels, it->second.smChannelDeviceHandles.data(), + sizeof(mscclpp::DeviceHandle) * it->second.smChannelDeviceHandles.size(), 0, + cudaMemcpyHostToDevice, stream)); if (sendbuff != recvbuff) { - smChannelDeviceHandles.clear(); - std::transform(outIt->second.begin(), outIt->second.end(), std::back_inserter(smChannelDeviceHandles), - [](const mscclpp::SmChannel& smChannel) { return mscclpp::deviceHandle(smChannel); }); + CUDACHECK(cudaMemcpyToSymbolAsync( + constSmOutChannels, it->second.smOutChannelDeviceHandles.data(), + sizeof(mscclpp::DeviceHandle) * it->second.smOutChannelDeviceHandles.size(), 0, + cudaMemcpyHostToDevice, stream)); + } else { + CUDACHECK(cudaMemcpyToSymbolAsync( + constSmOutChannels, it->second.smChannelDeviceHandles.data(), + sizeof(mscclpp::DeviceHandle) * it->second.smChannelDeviceHandles.size(), 0, + cudaMemcpyHostToDevice, stream)); } - CUDACHECK(cudaMemcpyToSymbol(constSmOutChannels, smChannelDeviceHandles.data(), - sizeof(mscclpp::DeviceHandle) * smChannelDeviceHandles.size())); } switch (datatype) { case ncclFloat16: - CUDACHECK(allreduce((half*)sendbuff, (half*)comm->scratchBuff.get(), (half*)recvbuff, - rank, nRanksPerNode, comm->comm->bootstrap()->getNranks(), - count, stream)); + CUDACHECK(allreduce((half*)sendbuff, (half*)comm->scratchBuff.get(), (half*)recvbuff, rank, nRanksPerNode, + comm->comm->bootstrap()->getNranks(), count, stream)); break; case ncclFloat32: CUDACHECK(allreduce((float*)sendbuff, (float*)comm->scratchBuff.get(), (float*)recvbuff, @@ -803,56 +825,57 @@ NCCL_API ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t return ncclSuccess; } -NCCL_API ncclResult_t ncclReduceScatter(const void* sendbuff, void* recvbuff, - size_t recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, - cudaStream_t stream) { +NCCL_API ncclResult_t ncclReduceScatter(const void* sendbuff, void* recvbuff, size_t recvcount, ncclDataType_t datatype, + ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream) { // TODO: implement this function return ncclInternalError; } -NCCL_API ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount, - ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream) { +NCCL_API ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount, ncclDataType_t datatype, + ncclComm_t comm, cudaStream_t stream) { size_t bytes = sendcount * ncclTypeSize(datatype); if (sendbuff == nullptr || recvbuff == nullptr || bytes == 0 || comm == nullptr) return ncclInvalidArgument; int rank = comm->comm->bootstrap()->getRank(); int nRank = 
comm->comm->bootstrap()->getNranks(); channelKey key{sendbuff, recvbuff, bytes}; - auto it = comm->smChannels.find(key); - if (it == comm->smChannels.end()) { + auto it = comm->channelInfos.find(key); + if (it == comm->channelInfos.end()) { std::vector remoteMemories = setupRemoteMemories(comm->comm, rank, const_cast(recvbuff), bytes * nRank, mscclpp::Transport::CudaIpc); std::vector channels = setupSmChannels(comm, remoteMemories, const_cast(recvbuff)); - it = comm->smChannels.emplace(key, channels).first; + std::vector> smChannelDeviceHandles; + std::transform(channels.begin(), channels.end(), std::back_inserter(smChannelDeviceHandles), + [](const mscclpp::SmChannel& smChannel) { return mscclpp::deviceHandle(smChannel); }); + ChannelInfo channelInfo{channels, {}, smChannelDeviceHandles, {}}; + it = comm->channelInfos.emplace(key, channelInfo).first; } - std::vector> smChannelDeviceHandles; - std::transform(it->second.begin(), it->second.end(), std::back_inserter(smChannelDeviceHandles), - [](const mscclpp::SmChannel& smChannel) { return mscclpp::deviceHandle(smChannel); }); // TODO: if sendbuff and recvbuff don't change, we can avoid copying smChannelDeviceHandles to device - CUDACHECK(cudaMemcpyToSymbol(constSmChannels, smChannelDeviceHandles.data(), - sizeof(mscclpp::DeviceHandle) * smChannelDeviceHandles.size())); - + CUDACHECK(cudaMemcpyToSymbolAsync( + constSmChannels, it->second.smChannelDeviceHandles.data(), + sizeof(mscclpp::DeviceHandle) * it->second.smChannelDeviceHandles.size(), 0, + cudaMemcpyHostToDevice, stream)); CUDACHECK(allgather((int*)sendbuff, (int*)comm->scratchBuff.get(), (int*)recvbuff, rank, nRanksPerNode, nRank, bytes / sizeof(int), stream)); return ncclSuccess; } -NCCL_API ncclResult_t ncclSend(const void* sendbuff, size_t count, ncclDataType_t datatype, int peer, - ncclComm_t comm, cudaStream_t stream) { +NCCL_API ncclResult_t ncclSend(const void* sendbuff, size_t count, ncclDataType_t datatype, int peer, ncclComm_t comm, + cudaStream_t stream) { // TODO: implement this function return ncclInternalError; } -NCCL_API ncclResult_t ncclRecv(void* recvbuff, size_t count, ncclDataType_t datatype, int peer, - ncclComm_t comm, cudaStream_t stream) { +NCCL_API ncclResult_t ncclRecv(void* recvbuff, size_t count, ncclDataType_t datatype, int peer, ncclComm_t comm, + cudaStream_t stream) { // TODO: implement this function return ncclInternalError; } -NCCL_API ncclResult_t ncclAllToAll(const void* sendbuff, void* recvbuff, size_t count, - ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream) { +NCCL_API ncclResult_t ncclAllToAll(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, + ncclComm_t comm, cudaStream_t stream) { // TODO: implement this function return ncclInternalError; } From 2e0013f915bef1fbb24bd2382c0022006cc9503c Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Sat, 24 Feb 2024 00:02:40 +0000 Subject: [PATCH 61/89] fixed out-of-place allgather --- apps/nccl/src/nccl.cu | 60 +++++++++++++++++++++++++++++++------------ 1 file changed, 44 insertions(+), 16 deletions(-) diff --git a/apps/nccl/src/nccl.cu b/apps/nccl/src/nccl.cu index 1cb7c71d6..e95b4fced 100644 --- a/apps/nccl/src/nccl.cu +++ b/apps/nccl/src/nccl.cu @@ -6,6 +6,7 @@ #include #include #include +#include #include #include "nccl.h" @@ -425,8 +426,9 @@ cudaError_t allreduce(T* buff, T* scratch, T* resultBuff, int rank, int nRanksPe return cudaGetLastError(); } +template __global__ void __launch_bounds__(1024, 1) - allgather5(size_t rank, [[maybe_unused]] size_t 
worldSize, size_t nRanksPerNode, size_t nelemsPerGPU) { + allgather6(void *sendbuff, size_t rank, [[maybe_unused]] size_t worldSize, size_t nRanksPerNode, size_t nelemsPerGPU) { const size_t nBlock = gridDim.x; if (blockIdx.x >= nBlock) return; @@ -460,40 +462,61 @@ __global__ void __launch_bounds__(1024, 1) if (nLoop > 0) { // First loop unrolling const size_t peerIdx = wid % nPeer; - const size_t remoteRankLocalIndex = (peerIdx < rank ? peerIdx : peerIdx + 1); - const size_t offset = bytesPerGPU * remoteRankLocalIndex + (wid / nPeer) * unitBytesPerWarp; - smChans[peerIdx].get<16, false>(offset, unitBytesPerWarp, lid, WARP_SIZE); + const size_t offset = bytesPerGPU * rank + (wid / nPeer) * unitBytesPerWarp; + if constexpr (IsOutOfPlace) { + char* dst = reinterpret_cast(smChans[peerIdx].dst_); + char* src = reinterpret_cast(smChans[peerIdx].src_); + char* buff = reinterpret_cast(sendbuff); + const size_t offsetWithinRank = (wid / nPeer) * unitBytesPerWarp; + smChans[peerIdx].copy<16, false>(src + offset, buff + offsetWithinRank, unitBytesPerWarp, lid, WARP_SIZE); + smChans[peerIdx].copy<16, false>(dst + offset, buff + offsetWithinRank, unitBytesPerWarp, lid, WARP_SIZE); + } else { + smChans[peerIdx].put<16, false>(offset, unitBytesPerWarp, lid, WARP_SIZE); + } } for (size_t i = 1; i < nLoop; ++i) { const size_t gWid = wid + i * nWarp; const size_t peerIdx = gWid % nPeer; - const size_t remoteRankLocalIndex = (peerIdx < rank ? peerIdx : peerIdx + 1); - const size_t offset = bytesPerGPU * remoteRankLocalIndex + (gWid / nPeer) * unitBytesPerWarp; - smChans[peerIdx].get<16, false>(offset, unitBytesPerWarp, lid, WARP_SIZE); + const size_t offset = bytesPerGPU * rank + (gWid / nPeer) * unitBytesPerWarp; + if constexpr (IsOutOfPlace) { + char* dst = reinterpret_cast(smChans[peerIdx].dst_); + char* src = reinterpret_cast(smChans[peerIdx].src_); + char* buff = reinterpret_cast(sendbuff); + const size_t offsetWithinRank = (gWid / nPeer) * unitBytesPerWarp; + smChans[peerIdx].copy<16, false>(src + offset, buff + offsetWithinRank, unitBytesPerWarp, lid, WARP_SIZE); + smChans[peerIdx].copy<16, false>(dst + offset, buff + offsetWithinRank, unitBytesPerWarp, lid, WARP_SIZE); + } else { + smChans[peerIdx].put<16, false>(offset, unitBytesPerWarp, lid, WARP_SIZE); + } } if (bytes % unitBytes > 0) { const size_t gWid = wid + nLoop * nWarp; const size_t peerIdx = gWid % nPeer; - const size_t remoteRankLocalIndex = (peerIdx < rank ? peerIdx : peerIdx + 1); const size_t offsetWithinRank = (gWid / nPeer) * unitBytesPerWarp; - const size_t offset = bytesPerGPU * remoteRankLocalIndex + offsetWithinRank; + const size_t offset = bytesPerGPU * rank + offsetWithinRank; const size_t remainBytes = (offsetWithinRank + unitBytesPerWarp > bytesPerGPU) ? ((bytesPerGPU > offsetWithinRank) ? 
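/* Tail handling for the last, partial unit of each rank's chunk. As a worked
   example (sizes illustrative, not from a real run): with WARP_SIZE = 64,
   unitBytesPerWarp is 16 * 64 = 1024 bytes, so for bytesPerGPU = 10000 the
   warp that lands at offsetWithinRank = 9216 copies only 10000 - 9216 = 784
   bytes, and a warp whose offsetWithinRank is at or beyond 10000 copies
   nothing: */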
(bytesPerGPU - offsetWithinRank) : 0) : unitBytesPerWarp; if (remainBytes > 0) { - smChans[peerIdx].get<16, true>(offset, remainBytes, lid, WARP_SIZE); + if constexpr (IsOutOfPlace) { + char* dst = reinterpret_cast(smChans[peerIdx].dst_); + char* src = reinterpret_cast(smChans[peerIdx].src_); + char* buff = reinterpret_cast(sendbuff); + smChans[peerIdx].copy<16, true>(src + offset, buff + offsetWithinRank, remainBytes, lid, WARP_SIZE); + smChans[peerIdx].copy<16, true>(dst + offset, buff + offsetWithinRank, remainBytes, lid, WARP_SIZE); + } else { + smChans[peerIdx].put<16, true>(offset, remainBytes, lid, WARP_SIZE); + } } } } -template +template cudaError_t allgather(T* buff, T* scratch, T* resultBuff, int rank, int nRanksPerNode, int worldSize, size_t nelems, cudaStream_t stream) { - cudaError_t err = cudaMemcpyAsync(resultBuff + nelems * rank, buff, nelems * sizeof(T), cudaMemcpyDeviceToDevice, stream); - if (err != cudaSuccess) return err; - allgather5<<<24, 1024, 0, stream>>>(rank, worldSize, nRanksPerNode, nelems); + allgather6<<<24, 1024, 0, stream>>>((void*)buff, rank, worldSize, nRanksPerNode, nelems); return cudaGetLastError(); } @@ -857,8 +880,13 @@ NCCL_API ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t constSmChannels, it->second.smChannelDeviceHandles.data(), sizeof(mscclpp::DeviceHandle) * it->second.smChannelDeviceHandles.size(), 0, cudaMemcpyHostToDevice, stream)); - CUDACHECK(allgather((int*)sendbuff, (int*)comm->scratchBuff.get(), (int*)recvbuff, - rank, nRanksPerNode, nRank, bytes / sizeof(int), stream)); + if ((char*)sendbuff == (char*)recvbuff + rank * sendcount) { + CUDACHECK(allgather((int*)sendbuff, (int*)comm->scratchBuff.get(), (int*)recvbuff, + rank, nRanksPerNode, nRank, bytes / sizeof(int), stream)); + } else { + CUDACHECK(allgather((int*)sendbuff, (int*)comm->scratchBuff.get(), (int*)recvbuff, + rank, nRanksPerNode, nRank, bytes / sizeof(int), stream)); + } return ncclSuccess; } From f01c7023891a218cffc8cfc2dc17a7777b397d88 Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Sat, 24 Feb 2024 01:24:14 +0000 Subject: [PATCH 62/89] fixed allreduce hanging --- apps/nccl/src/nccl.cu | 20 +++++--------------- 1 file changed, 5 insertions(+), 15 deletions(-) diff --git a/apps/nccl/src/nccl.cu b/apps/nccl/src/nccl.cu index e95b4fced..c4f90590e 100644 --- a/apps/nccl/src/nccl.cu +++ b/apps/nccl/src/nccl.cu @@ -198,11 +198,10 @@ cudaError_t allreduce(int* buff, int* scratch, void* resultBuff, int rank, int n #include // extern __constant__ mscclpp::SmChannelDeviceHandle *constSmChannels; -__device__ uint64_t globalFlag; template __global__ void allreduce6(T* buff, T* scratch, T* resultBuff, int rank, int nRanksPerNode, int worldSize, - size_t nelems) { + size_t nelems, uint32_t flag) { // This version of allreduce only works for single nodes if (worldSize != nRanksPerNode) return; nelems = nelems / (sizeof(int) / sizeof(T)); @@ -210,8 +209,6 @@ __global__ void allreduce6(T* buff, T* scratch, T* resultBuff, int rank, int nRa const int nPkts = nelems / 2; const int nelemsPerRank = nelems / worldSize; const int nPktsPerRank = nelemsPerRank / 2; - // flag for packets. 
Initially 1 - const uint32_t flag = (uint32_t)globalFlag + 1; // thread block & channel info const int nBlocksPerPeer = gridDim.x / nPeers; const int localBlockIdx = blockIdx.x % nBlocksPerPeer; @@ -257,9 +254,6 @@ __global__ void allreduce6(T* buff, T* scratch, T* resultBuff, int rank, int nRa result[idx].x = data.x; result[idx].y = data.y; } - if (threadIdx.x == 0 && blockIdx.x == 0) { - globalFlag += 1; - } } template @@ -342,7 +336,7 @@ __global__ void allreduce1(T* src, T* dst, int rank, int nranks, size_t nelems) template __global__ void allreduce7(T* buff, T* scratch, T* resultBuff, int rank, int nRanksPerNode, int worldSize, - size_t nelems) { + size_t nelems, uint32_t flag) { // This version of allreduce only works for single nodes if (worldSize != nRanksPerNode) return; nelems = nelems / (sizeof(int) / sizeof(T)); @@ -350,8 +344,6 @@ __global__ void allreduce7(T* buff, T* scratch, T* resultBuff, int rank, int nRa const size_t nPkts = nelems; const int nelemsPerRank = nelems / worldSize; const int nPktsPerRank = nelemsPerRank; - // flag for packets. Initially 1 - const uint32_t flag = (uint32_t)globalFlag; // thread block & channel info const int nBlocksPerPeer = gridDim.x / nPeers; const int localBlockIdx = blockIdx.x % nBlocksPerPeer; @@ -399,14 +391,12 @@ __global__ void allreduce7(T* buff, T* scratch, T* resultBuff, int rank, int nRa uint32_t data = dstPkt[idx + dstOffset].read(flag); result[idx] = data; } - if (threadIdx.x == 0 && blockIdx.x == 0) { - globalFlag += 1; - } } template cudaError_t allreduce(T* buff, T* scratch, T* resultBuff, int rank, int nRanksPerNode, int worldSize, size_t nelems, cudaStream_t stream) { + static uint32_t flag = 1; if (sizeof(T) * nelems <= (1 << 20)) { #if defined(__HIP_PLATFORM_AMD__) int nBlocks = 28; @@ -416,9 +406,9 @@ cudaError_t allreduce(T* buff, T* scratch, T* resultBuff, int rank, int nRanksPe nThreadsPerBlock = (nelems <= 76800) ? 
512 : 1024; } allreduce7<<>>(buff, scratch, resultBuff, rank, nRanksPerNode, worldSize, - nelems); + nelems, flag++); #else - allreduce6<<<21, 512, 0, stream>>>(buff, scratch, resultBuff, rank, nRanksPerNode, worldSize, nelems); + allreduce6<<<21, 512, 0, stream>>>(buff, scratch, resultBuff, rank, nRanksPerNode, worldSize, nelems, flag++); #endif } else { allreduce1<<<24, 1024, 0, stream>>>(buff, resultBuff, rank, worldSize, nelems); From 9537c97d9d7df9bb1a41c6539adcecce26cd5e86 Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Sat, 24 Feb 2024 02:18:19 +0000 Subject: [PATCH 63/89] perf improve --- apps/nccl/src/nccl.cu | 170 ++++++++++++++++++++++++------------------ 1 file changed, 97 insertions(+), 73 deletions(-) diff --git a/apps/nccl/src/nccl.cu b/apps/nccl/src/nccl.cu index c4f90590e..020b9dcfb 100644 --- a/apps/nccl/src/nccl.cu +++ b/apps/nccl/src/nccl.cu @@ -152,8 +152,6 @@ static const int scratchSize = 1024 * 1024 * 8; // mscclpp::Transport::IB3, mscclpp::Transport::IB4, mscclpp::Transport::IB5, // mscclpp::Transport::IB6, mscclpp::Transport::IB7}; -__constant__ mscclpp::DeviceHandle constSmChannels[256]; -__constant__ mscclpp::DeviceHandle constSmOutChannels[256]; __device__ mscclpp::DeviceSyncer deviceSyncer; struct channelKey { @@ -177,8 +175,8 @@ struct hash { struct ChannelInfo { std::vector smChannels; std::vector smOutChannels; - std::vector> smChannelDeviceHandles; - std::vector> smOutChannelDeviceHandles; + std::shared_ptr> smChannelDeviceHandles; + std::shared_ptr> smOutChannelDeviceHandles; }; struct ncclComm { @@ -197,11 +195,9 @@ cudaError_t allreduce(int* buff, int* scratch, void* resultBuff, int rank, int n #include #include -// extern __constant__ mscclpp::SmChannelDeviceHandle *constSmChannels; - template -__global__ void allreduce6(T* buff, T* scratch, T* resultBuff, int rank, int nRanksPerNode, int worldSize, - size_t nelems, uint32_t flag) { +__global__ void allreduce6(T* buff, T* scratch, T* resultBuff, mscclpp::DeviceHandle* smChannels, + int rank, int nRanksPerNode, int worldSize, size_t nelems, uint32_t flag) { // This version of allreduce only works for single nodes if (worldSize != nRanksPerNode) return; nelems = nelems / (sizeof(int) / sizeof(T)); @@ -214,7 +210,7 @@ __global__ void allreduce6(T* buff, T* scratch, T* resultBuff, int rank, int nRa const int localBlockIdx = blockIdx.x % nBlocksPerPeer; const int peerIdx = blockIdx.x / nBlocksPerPeer; const int remoteRank = peerIdx < rank ? peerIdx : peerIdx + 1; - mscclpp::SmChannelDeviceHandle smChan = constSmChannels[peerIdx]; + mscclpp::SmChannelDeviceHandle smChan = smChannels[peerIdx]; const int tid = threadIdx.x + localBlockIdx * blockDim.x; // double buffering size_t scratchBaseOffset = (flag & 1) ? 
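/* Double buffering: the host bumps flag once per call, and (flag & 1) flips
   this offset between the two halves of the scratch area, so call N+1 cannot
   overwrite packets a slow peer is still reading from call N. The same flag
   value is also stamped into each LLPacket, so readers spin until data
   written by the current call actually arrives. */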
0 : nPkts * sizeof(mscclpp::LLPacket); @@ -241,7 +237,7 @@ __global__ void allreduce6(T* buff, T* scratch, T* resultBuff, int rank, int nRa dst[idx].x = data.x; dst[idx].y = data.y; for (int index = 0; index < nPeers; index++) { - mscclpp::LLPacket* dstPkt = (mscclpp::LLPacket*)((char*)constSmChannels[index].dst_ + scratchResultOffset); + mscclpp::LLPacket* dstPkt = (mscclpp::LLPacket*)((char*)smChannels[index].dst_ + scratchResultOffset); dstPkt[idx + rank * nPktsPerRank].write(data.x, data.y, flag); } } @@ -257,7 +253,9 @@ __global__ void allreduce6(T* buff, T* scratch, T* resultBuff, int rank, int nRa } template -__global__ void allreduce1(T* src, T* dst, int rank, int nranks, size_t nelems) { +__global__ void allreduce1(T* src, T* dst, mscclpp::DeviceHandle* smChannels, + mscclpp::DeviceHandle* smOutChannels, int rank, int nranks, + size_t nelems) { const size_t chunkSize = nelems / nranks; if (nranks == 1) return; const int nPeer = nranks - 1; @@ -274,10 +272,10 @@ __global__ void allreduce1(T* src, T* dst, int rank, int nranks, size_t nelems) } __syncthreads(); if (tid < nPeer) { - constSmChannels[tid].relaxedSignal(); + smChannels[tid].relaxedSignal(); } if (tid >= nPeer && tid < nPeer * 2) { - constSmChannels[tid - nPeer].wait(); + smChannels[tid - nPeer].wait(); } deviceSyncer.sync(gridDim.x); @@ -289,7 +287,7 @@ __global__ void allreduce1(T* src, T* dst, int rank, int nranks, size_t nelems) int4 val; int peerIdx = (index + rank); if (peerIdx >= nPeer) peerIdx -= nPeer; - val = constSmChannels[peerIdx].read(indexOffset4 + idx); + val = smChannels[peerIdx].read(indexOffset4 + idx); tmp = add_vectors(tmp, val); } dst4[indexOffset4 + idx] = tmp; @@ -305,7 +303,7 @@ __global__ void allreduce1(T* src, T* dst, int rank, int nranks, size_t nelems) for (int index = 0; index < nPeer; ++index) { int peerIdx = (index + rank); if (peerIdx >= nPeer) peerIdx -= nPeer; - T val = constSmChannels[peerIdx].read(idx); + T val = smChannels[peerIdx].read(idx); tmp += val; } dst[idx] = tmp; @@ -318,10 +316,10 @@ __global__ void allreduce1(T* src, T* dst, int rank, int nranks, size_t nelems) } __syncthreads(); if (tid < nPeer) { - constSmChannels[tid].relaxedSignal(); + smChannels[tid].relaxedSignal(); } if (tid >= nPeer && tid < nPeer * 2) { - constSmChannels[tid - nPeer].wait(); + smChannels[tid - nPeer].wait(); } deviceSyncer.sync(gridDim.x); @@ -330,13 +328,13 @@ __global__ void allreduce1(T* src, T* dst, int rank, int nranks, size_t nelems) if (peerIdx >= nPeer) peerIdx -= nPeer; const int remoteRank = (peerIdx < rank ? 
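/* Peer indices exclude the local rank, so they are mapped back to real ranks
   here: with 8 ranks, rank 3 sees peers 0, 1, 2 as ranks 0, 1, 2 and peers
   3, 4, 5, 6 as ranks 4, 5, 6, 7. */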
peerIdx : peerIdx + 1); size_t offset = chunkSize * remoteRank * sizeof(T); - constSmOutChannels[peerIdx].get(offset, chunkSize * sizeof(T), tid, blockDim.x * gridDim.x); + smOutChannels[peerIdx].get(offset, chunkSize * sizeof(T), tid, blockDim.x * gridDim.x); } } template -__global__ void allreduce7(T* buff, T* scratch, T* resultBuff, int rank, int nRanksPerNode, int worldSize, - size_t nelems, uint32_t flag) { +__global__ void allreduce7(T* buff, T* scratch, T* resultBuff, mscclpp::DeviceHandle* smChannels, + int rank, int nRanksPerNode, int worldSize, size_t nelems, uint32_t flag) { // This version of allreduce only works for single nodes if (worldSize != nRanksPerNode) return; nelems = nelems / (sizeof(int) / sizeof(T)); @@ -361,8 +359,8 @@ __global__ void allreduce7(T* buff, T* scratch, T* resultBuff, int rank, int nRa uint32_t* dst = (uint32_t*)((char*)resultBuff + rank * nelemsPerRank * sizeof(int)); // step 1: write to scratch buffer - constSmChannels[peerIdx].putPackets(scratchOffset, srcOffset, nelemsPerRank * sizeof(int), tid, - blockDim.x * nBlocksPerPeer, flag); + smChannels[peerIdx].putPackets(scratchOffset, srcOffset, nelemsPerRank * sizeof(int), tid, + blockDim.x * nBlocksPerPeer, flag); // step 2: get data from scratch buffer, reduce data and write result to remote scratch buffer for (int idx = threadIdx.x + blockIdx.x * blockDim.x; idx < nPktsPerRank; idx += blockDim.x * gridDim.x) { uint32_t data = 0; @@ -380,7 +378,7 @@ __global__ void allreduce7(T* buff, T* scratch, T* resultBuff, int rank, int nRa packet.flag = flag; size_t offset = scratchResultOffset / sizeof(mscclpp::LL8Packet) + (idx + rank * nPktsPerRank); for (int index = 0; index < nPeers; index++) { - constSmChannels[index].write(offset, packet); + smChannels[index].write(offset, packet); } } // step 3: get data result from scratch buffer @@ -394,8 +392,9 @@ __global__ void allreduce7(T* buff, T* scratch, T* resultBuff, int rank, int nRa } template -cudaError_t allreduce(T* buff, T* scratch, T* resultBuff, int rank, int nRanksPerNode, int worldSize, size_t nelems, - cudaStream_t stream) { +cudaError_t allreduce(T* buff, T* scratch, T* resultBuff, mscclpp::DeviceHandle* smChannels, + mscclpp::DeviceHandle* smOutChannels, int rank, int nRanksPerNode, + int worldSize, size_t nelems, cudaStream_t stream) { static uint32_t flag = 1; if (sizeof(T) * nelems <= (1 << 20)) { #if defined(__HIP_PLATFORM_AMD__) @@ -405,20 +404,22 @@ cudaError_t allreduce(T* buff, T* scratch, T* resultBuff, int rank, int nRanksPe nBlocks = 56; nThreadsPerBlock = (nelems <= 76800) ? 
512 : 1024; } - allreduce7<<>>(buff, scratch, resultBuff, rank, nRanksPerNode, worldSize, - nelems, flag++); + allreduce7<<>>(buff, scratch, resultBuff, smChannels, rank, nRanksPerNode, + worldSize, nelems, flag++); #else - allreduce6<<<21, 512, 0, stream>>>(buff, scratch, resultBuff, rank, nRanksPerNode, worldSize, nelems, flag++); + allreduce6<<<21, 512, 0, stream>>>(buff, scratch, resultBuff, smChannels, rank, nRanksPerNode, worldSize, nelems, + flag++); #endif } else { - allreduce1<<<24, 1024, 0, stream>>>(buff, resultBuff, rank, worldSize, nelems); + allreduce1<<<24, 1024, 0, stream>>>(buff, resultBuff, smChannels, smOutChannels, rank, worldSize, nelems); } return cudaGetLastError(); } template __global__ void __launch_bounds__(1024, 1) - allgather6(void *sendbuff, size_t rank, [[maybe_unused]] size_t worldSize, size_t nRanksPerNode, size_t nelemsPerGPU) { + allgather6(void* sendbuff, mscclpp::DeviceHandle* smChannels, size_t rank, + [[maybe_unused]] size_t worldSize, size_t nRanksPerNode, size_t nelemsPerGPU) { const size_t nBlock = gridDim.x; if (blockIdx.x >= nBlock) return; @@ -430,7 +431,7 @@ __global__ void __launch_bounds__(1024, 1) const size_t nWarp = nThread / WARP_SIZE; const size_t nPeer = nRanksPerNode - 1; const size_t chanOffset = nPeer * blockIdx.x; - auto smChans = constSmChannels + chanOffset; + auto smChans = smChannels + chanOffset; if (wid < nPeer && lid == 0) { smChans[wid].relaxedSignal(); @@ -504,9 +505,9 @@ __global__ void __launch_bounds__(1024, 1) } template -cudaError_t allgather(T* buff, T* scratch, T* resultBuff, int rank, int nRanksPerNode, int worldSize, size_t nelems, - cudaStream_t stream) { - allgather6<<<24, 1024, 0, stream>>>((void*)buff, rank, worldSize, nRanksPerNode, nelems); +cudaError_t allgather(T* buff, T* scratch, T* resultBuff, mscclpp::DeviceHandle* smChannels, + int rank, int nRanksPerNode, int worldSize, size_t nelems, cudaStream_t stream) { + allgather6<<<24, 1024, 0, stream>>>((void*)buff, smChannels, rank, worldSize, nRanksPerNode, nelems); return cudaGetLastError(); } @@ -762,6 +763,8 @@ NCCL_API ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t if (sendbuff == nullptr || recvbuff == nullptr || bytes == 0 || comm == nullptr) return ncclInvalidArgument; int rank = comm->comm->bootstrap()->getRank(); channelKey key{sendbuff, recvbuff, bytes}; + mscclpp::DeviceHandle* smChannels = nullptr; + mscclpp::DeviceHandle* smOutChannels = nullptr; if (bytes <= 1 << 20) { auto it = comm->channelInfos.find(key); if (it == comm->channelInfos.end()) { @@ -770,14 +773,18 @@ NCCL_API ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t std::vector> smChannelDeviceHandles; std::transform(channels.begin(), channels.end(), std::back_inserter(smChannelDeviceHandles), [](const mscclpp::SmChannel& smChannel) { return mscclpp::deviceHandle(smChannel); }); - ChannelInfo channelInfo{channels, {}, smChannelDeviceHandles, {}}; + std::shared_ptr> ptr = + mscclpp::allocSharedCuda>(smChannelDeviceHandles.size()); + { + mscclpp::AvoidCudaGraphCaptureGuard guard; + CUDACHECK(cudaMemcpy(ptr.get(), smChannelDeviceHandles.data(), + sizeof(mscclpp::DeviceHandle) * smChannelDeviceHandles.size(), + cudaMemcpyHostToDevice)); + } + ChannelInfo channelInfo{channels, {}, ptr, nullptr}; it = comm->channelInfos.emplace(key, channelInfo).first; } - // TODO: if sendbuff and recvbuff don't change, we can avoid copying smChannelDeviceHandles to device - CUDACHECK(cudaMemcpyToSymbolAsync( - constSmChannels, 
it->second.smChannelDeviceHandles.data(), - sizeof(mscclpp::DeviceHandle) * it->second.smChannelDeviceHandles.size(), 0, - cudaMemcpyHostToDevice, stream)); + smChannels = it->second.smChannelDeviceHandles.get(); } else { auto it = comm->channelInfos.find(key); if (it == comm->channelInfos.end()) { @@ -787,48 +794,60 @@ NCCL_API ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t std::vector> smChannelDeviceHandles; std::transform(channels.begin(), channels.end(), std::back_inserter(smChannelDeviceHandles), [](const mscclpp::SmChannel& smChannel) { return mscclpp::deviceHandle(smChannel); }); - ChannelInfo channelInfo{channels, {}, smChannelDeviceHandles, {}}; + std::shared_ptr> ptr = + mscclpp::allocSharedCuda>(smChannelDeviceHandles.size()); + { + mscclpp::AvoidCudaGraphCaptureGuard guard; + CUDACHECK(cudaMemcpy(ptr.get(), smChannelDeviceHandles.data(), + sizeof(mscclpp::DeviceHandle) * smChannelDeviceHandles.size(), + cudaMemcpyHostToDevice)); + } + ChannelInfo channelInfo{channels, {}, ptr, nullptr}; it = comm->channelInfos.emplace(key, channelInfo).first; if (sendbuff != recvbuff) { std::vector remoteMemories = setupRemoteMemories(comm->comm, rank, recvbuff, bytes, mscclpp::Transport::CudaIpc); std::vector outChannels = setupSmChannels(comm, remoteMemories, recvbuff); - it->second.smOutChannels = outChannels; - std::transform(outChannels.begin(), outChannels.end(), std::back_inserter(it->second.smOutChannelDeviceHandles), + std::vector> smOutChannelDeviceHandles; + std::transform(outChannels.begin(), outChannels.end(), std::back_inserter(smOutChannelDeviceHandles), [](const mscclpp::SmChannel& smChannel) { return mscclpp::deviceHandle(smChannel); }); + std::shared_ptr> outPtr = + mscclpp::allocSharedCuda>(smOutChannelDeviceHandles.size()); + { + mscclpp::AvoidCudaGraphCaptureGuard guard; + CUDACHECK(cudaMemcpy(outPtr.get(), smOutChannelDeviceHandles.data(), + sizeof(mscclpp::DeviceHandle) * smOutChannelDeviceHandles.size(), + cudaMemcpyHostToDevice)); + } + it->second.smOutChannels = outChannels; + it->second.smOutChannelDeviceHandles = outPtr; + } else { + std::shared_ptr> outPtr = + mscclpp::allocSharedCuda>(smChannelDeviceHandles.size()); + mscclpp::AvoidCudaGraphCaptureGuard guard; + CUDACHECK(cudaMemcpy(outPtr.get(), smChannelDeviceHandles.data(), + sizeof(mscclpp::DeviceHandle) * smChannelDeviceHandles.size(), + cudaMemcpyHostToDevice)); + it->second.smOutChannelDeviceHandles = outPtr; } } - // TODO: if sendbuff and recvbuff don't change, we can avoid copying smChannelDeviceHandles to device - CUDACHECK(cudaMemcpyToSymbolAsync( - constSmChannels, it->second.smChannelDeviceHandles.data(), - sizeof(mscclpp::DeviceHandle) * it->second.smChannelDeviceHandles.size(), 0, - cudaMemcpyHostToDevice, stream)); - if (sendbuff != recvbuff) { - CUDACHECK(cudaMemcpyToSymbolAsync( - constSmOutChannels, it->second.smOutChannelDeviceHandles.data(), - sizeof(mscclpp::DeviceHandle) * it->second.smOutChannelDeviceHandles.size(), 0, - cudaMemcpyHostToDevice, stream)); - } else { - CUDACHECK(cudaMemcpyToSymbolAsync( - constSmOutChannels, it->second.smChannelDeviceHandles.data(), - sizeof(mscclpp::DeviceHandle) * it->second.smChannelDeviceHandles.size(), 0, - cudaMemcpyHostToDevice, stream)); - } + smChannels = it->second.smChannelDeviceHandles.get(); + smOutChannels = it->second.smOutChannelDeviceHandles.get(); } switch (datatype) { case ncclFloat16: - CUDACHECK(allreduce((half*)sendbuff, (half*)comm->scratchBuff.get(), (half*)recvbuff, rank, nRanksPerNode, - 
comm->comm->bootstrap()->getNranks(), count, stream)); + CUDACHECK(allreduce((half*)sendbuff, (half*)comm->scratchBuff.get(), (half*)recvbuff, smChannels, smOutChannels, + rank, nRanksPerNode, comm->comm->bootstrap()->getNranks(), count, stream)); break; case ncclFloat32: - CUDACHECK(allreduce((float*)sendbuff, (float*)comm->scratchBuff.get(), (float*)recvbuff, - comm->comm->bootstrap()->getRank(), nRanksPerNode, comm->comm->bootstrap()->getNranks(), - count, stream)); + CUDACHECK(allreduce((float*)sendbuff, (float*)comm->scratchBuff.get(), (float*)recvbuff, smChannels, + smOutChannels, comm->comm->bootstrap()->getRank(), nRanksPerNode, + comm->comm->bootstrap()->getNranks(), count, stream)); break; case ncclInt32: case ncclUint32: - CUDACHECK(allreduce((int*)sendbuff, (int*)comm->scratchBuff.get(), (int*)recvbuff, + CUDACHECK(allreduce((int*)sendbuff, (int*)comm->scratchBuff.get(), (int*)recvbuff, smChannels, smOutChannels, comm->comm->bootstrap()->getRank(), nRanksPerNode, comm->comm->bootstrap()->getNranks(), count, stream)); break; @@ -851,6 +870,7 @@ NCCL_API ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t int rank = comm->comm->bootstrap()->getRank(); int nRank = comm->comm->bootstrap()->getNranks(); channelKey key{sendbuff, recvbuff, bytes}; + mscclpp::DeviceHandle* smChannels = nullptr; auto it = comm->channelInfos.find(key); if (it == comm->channelInfos.end()) { @@ -862,19 +882,23 @@ NCCL_API ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t std::vector> smChannelDeviceHandles; std::transform(channels.begin(), channels.end(), std::back_inserter(smChannelDeviceHandles), [](const mscclpp::SmChannel& smChannel) { return mscclpp::deviceHandle(smChannel); }); - ChannelInfo channelInfo{channels, {}, smChannelDeviceHandles, {}}; + std::shared_ptr> ptr = + mscclpp::allocSharedCuda>(smChannelDeviceHandles.size()); + { + mscclpp::AvoidCudaGraphCaptureGuard guard; + CUDACHECK(cudaMemcpy(ptr.get(), smChannelDeviceHandles.data(), + sizeof(mscclpp::DeviceHandle) * smChannelDeviceHandles.size(), + cudaMemcpyHostToDevice)); + } + ChannelInfo channelInfo{channels, {}, ptr, nullptr}; it = comm->channelInfos.emplace(key, channelInfo).first; } - // TODO: if sendbuff and recvbuff don't change, we can avoid copying smChannelDeviceHandles to device - CUDACHECK(cudaMemcpyToSymbolAsync( - constSmChannels, it->second.smChannelDeviceHandles.data(), - sizeof(mscclpp::DeviceHandle) * it->second.smChannelDeviceHandles.size(), 0, - cudaMemcpyHostToDevice, stream)); + smChannels = it->second.smChannelDeviceHandles.get(); if ((char*)sendbuff == (char*)recvbuff + rank * sendcount) { - CUDACHECK(allgather((int*)sendbuff, (int*)comm->scratchBuff.get(), (int*)recvbuff, + CUDACHECK(allgather((int*)sendbuff, (int*)comm->scratchBuff.get(), (int*)recvbuff, smChannels, rank, nRanksPerNode, nRank, bytes / sizeof(int), stream)); } else { - CUDACHECK(allgather((int*)sendbuff, (int*)comm->scratchBuff.get(), (int*)recvbuff, + CUDACHECK(allgather((int*)sendbuff, (int*)comm->scratchBuff.get(), (int*)recvbuff, smChannels, rank, nRanksPerNode, nRank, bytes / sizeof(int), stream)); } return ncclSuccess; From c39cbefd8490c7cd12373b9428a1f23e1efc1705 Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Sat, 24 Feb 2024 02:52:01 +0000 Subject: [PATCH 64/89] update --- apps/nccl/src/nccl.cu | 69 ++++++++++++------------------------------- 1 file changed, 19 insertions(+), 50 deletions(-) diff --git a/apps/nccl/src/nccl.cu b/apps/nccl/src/nccl.cu index 020b9dcfb..8bd4b7ecd 100644 --- 
a/apps/nccl/src/nccl.cu +++ b/apps/nccl/src/nccl.cu @@ -585,6 +585,20 @@ static std::vector setupSmChannels(ncclComm_t comm, return channels; } +static std::shared_ptr> setupSmChannelDeviceHandles( + const std::vector& smChannels) { + std::vector> smChannelDeviceHandles; + std::transform(smChannels.begin(), smChannels.end(), std::back_inserter(smChannelDeviceHandles), + [](const mscclpp::SmChannel& smChannel) { return mscclpp::deviceHandle(smChannel); }); + std::shared_ptr> ptr = + mscclpp::allocSharedCuda>(smChannelDeviceHandles.size()); + mscclpp::AvoidCudaGraphCaptureGuard guard; + CUDACHECK(cudaMemcpy(ptr.get(), smChannelDeviceHandles.data(), + sizeof(mscclpp::DeviceHandle) * smChannelDeviceHandles.size(), + cudaMemcpyHostToDevice)); + return ptr; +} + NCCL_API ncclResult_t ncclGetVersion(int* version) { if (version == nullptr) return ncclInvalidArgument; *version = MSCCLPP_VERSION; @@ -770,18 +784,7 @@ NCCL_API ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t if (it == comm->channelInfos.end()) { std::vector channels = setupSmChannels(comm, comm->remoteScratchRegMemories, const_cast(sendbuff)); - std::vector> smChannelDeviceHandles; - std::transform(channels.begin(), channels.end(), std::back_inserter(smChannelDeviceHandles), - [](const mscclpp::SmChannel& smChannel) { return mscclpp::deviceHandle(smChannel); }); - std::shared_ptr> ptr = - mscclpp::allocSharedCuda>(smChannelDeviceHandles.size()); - { - mscclpp::AvoidCudaGraphCaptureGuard guard; - CUDACHECK(cudaMemcpy(ptr.get(), smChannelDeviceHandles.data(), - sizeof(mscclpp::DeviceHandle) * smChannelDeviceHandles.size(), - cudaMemcpyHostToDevice)); - } - ChannelInfo channelInfo{channels, {}, ptr, nullptr}; + ChannelInfo channelInfo{channels, {}, setupSmChannelDeviceHandles(channels), nullptr}; it = comm->channelInfos.emplace(key, channelInfo).first; } smChannels = it->second.smChannelDeviceHandles.get(); @@ -791,43 +794,17 @@ NCCL_API ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t std::vector remoteMemories = setupRemoteMemories(comm->comm, rank, const_cast(sendbuff), bytes, mscclpp::Transport::CudaIpc); std::vector channels = setupSmChannels(comm, remoteMemories, const_cast(sendbuff)); - std::vector> smChannelDeviceHandles; - std::transform(channels.begin(), channels.end(), std::back_inserter(smChannelDeviceHandles), - [](const mscclpp::SmChannel& smChannel) { return mscclpp::deviceHandle(smChannel); }); - std::shared_ptr> ptr = - mscclpp::allocSharedCuda>(smChannelDeviceHandles.size()); - { - mscclpp::AvoidCudaGraphCaptureGuard guard; - CUDACHECK(cudaMemcpy(ptr.get(), smChannelDeviceHandles.data(), - sizeof(mscclpp::DeviceHandle) * smChannelDeviceHandles.size(), - cudaMemcpyHostToDevice)); - } - ChannelInfo channelInfo{channels, {}, ptr, nullptr}; + ChannelInfo channelInfo{channels, {}, setupSmChannelDeviceHandles(channels), nullptr}; it = comm->channelInfos.emplace(key, channelInfo).first; if (sendbuff != recvbuff) { std::vector remoteMemories = setupRemoteMemories(comm->comm, rank, recvbuff, bytes, mscclpp::Transport::CudaIpc); std::vector outChannels = setupSmChannels(comm, remoteMemories, recvbuff); - std::vector> smOutChannelDeviceHandles; - std::transform(outChannels.begin(), outChannels.end(), std::back_inserter(smOutChannelDeviceHandles), - [](const mscclpp::SmChannel& smChannel) { return mscclpp::deviceHandle(smChannel); }); - std::shared_ptr> outPtr = - mscclpp::allocSharedCuda>(smOutChannelDeviceHandles.size()); - { - mscclpp::AvoidCudaGraphCaptureGuard guard; - 
CUDACHECK(cudaMemcpy(outPtr.get(), smOutChannelDeviceHandles.data(), - sizeof(mscclpp::DeviceHandle) * smOutChannelDeviceHandles.size(), - cudaMemcpyHostToDevice)); - } + std::shared_ptr> outPtr = setupSmChannelDeviceHandles(outChannels); it->second.smOutChannels = outChannels; it->second.smOutChannelDeviceHandles = outPtr; } else { - std::shared_ptr> outPtr = - mscclpp::allocSharedCuda>(smChannelDeviceHandles.size()); - mscclpp::AvoidCudaGraphCaptureGuard guard; - CUDACHECK(cudaMemcpy(outPtr.get(), smChannelDeviceHandles.data(), - sizeof(mscclpp::DeviceHandle) * smChannelDeviceHandles.size(), - cudaMemcpyHostToDevice)); + std::shared_ptr> outPtr = setupSmChannelDeviceHandles(channels); it->second.smOutChannelDeviceHandles = outPtr; } } @@ -882,15 +859,7 @@ NCCL_API ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t std::vector> smChannelDeviceHandles; std::transform(channels.begin(), channels.end(), std::back_inserter(smChannelDeviceHandles), [](const mscclpp::SmChannel& smChannel) { return mscclpp::deviceHandle(smChannel); }); - std::shared_ptr> ptr = - mscclpp::allocSharedCuda>(smChannelDeviceHandles.size()); - { - mscclpp::AvoidCudaGraphCaptureGuard guard; - CUDACHECK(cudaMemcpy(ptr.get(), smChannelDeviceHandles.data(), - sizeof(mscclpp::DeviceHandle) * smChannelDeviceHandles.size(), - cudaMemcpyHostToDevice)); - } - ChannelInfo channelInfo{channels, {}, ptr, nullptr}; + ChannelInfo channelInfo{channels, {}, setupSmChannelDeviceHandles(channels), nullptr}; it = comm->channelInfos.emplace(key, channelInfo).first; } smChannels = it->second.smChannelDeviceHandles.get(); From f0e3063ffd45f2af94a35b846dd1ebea9943ae84 Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Sat, 24 Feb 2024 05:50:52 +0000 Subject: [PATCH 65/89] allreduce8 wip --- apps/nccl/src/nccl.cu | 168 ++++++++++++++++++++++++++---------------- 1 file changed, 105 insertions(+), 63 deletions(-) diff --git a/apps/nccl/src/nccl.cu b/apps/nccl/src/nccl.cu index 8bd4b7ecd..5ca10fd29 100644 --- a/apps/nccl/src/nccl.cu +++ b/apps/nccl/src/nccl.cu @@ -22,7 +22,7 @@ } \ } while (0) -#define NUM_CHANNELS_PER_CONNECTION 32 +#define NUM_CHANNELS_PER_CONNECTION 64 #if defined(__HIP_PLATFORM_AMD__) #define WARP_SIZE 64 @@ -145,8 +145,8 @@ __forceinline__ __device__ void vectorSum(T* dst, T* src, size_t nElem) { // TODO: static const int nRanksPerNode = 8; -// Only use scratch buffer for message size less then 1MB -static const int scratchSize = 1024 * 1024 * 8; + +static const int scratchSize = 1024 * 1024 * 40; // static const mscclpp::Transport IBs[] = {mscclpp::Transport::IB0, mscclpp::Transport::IB1, mscclpp::Transport::IB2, // mscclpp::Transport::IB3, mscclpp::Transport::IB4, mscclpp::Transport::IB5, @@ -391,28 +391,83 @@ __global__ void allreduce7(T* buff, T* scratch, T* resultBuff, mscclpp::DeviceHa } } +template +__global__ void allreduce8(T* buff, T* scratch, T* resultBuff, mscclpp::DeviceHandle* smChannels, + mscclpp::DeviceHandle* smOutChannels, + int rank, int nRanksPerNode, int worldSize, size_t nelems, uint32_t flag) { + const size_t nPeer = nRanksPerNode - 1; + const size_t chanOffset = nPeer * blockIdx.x; + // assume (nelems * sizeof(T)) is divisible by (16 * worldSize) + const size_t nInt4 = nelems * sizeof(T) / sizeof(int4); + const size_t nInt4PerRank = nInt4 / worldSize; + auto smOutChans = smOutChannels + chanOffset; + + int4* buff4 = reinterpret_cast(buff); + int4* scratch4 = reinterpret_cast(scratch); + int4* resultBuff4 = reinterpret_cast(resultBuff); + + /// Starts reduce-scatter + + if 
(threadIdx.x < nPeer) { + smOutChans[threadIdx.x].relaxedSignal(); + smOutChans[threadIdx.x].wait(); + } + __syncthreads(); + + // Distribute `nInt4PerRank` across all blocks with the unit size `unitNInt4PerBlock` + constexpr size_t unitNInt4PerBlock = 1024; + const size_t nUnitsPerBlock = (nInt4PerRank + unitNInt4PerBlock - 1) / unitNInt4PerBlock; + size_t offsetOfThisBlock = unitNInt4PerBlock * nUnitsPerBlock * blockIdx.x; + size_t nInt4OfThisBlock = unitNInt4PerBlock * nUnitsPerBlock; + if ((nInt4PerRank % unitNInt4PerBlock != 0) && (blockIdx.x == gridDim.x - 1)) { + // The last block may have fewer int4 than others + nInt4OfThisBlock = unitNInt4PerBlock * (nUnitsPerBlock - 1) + nInt4PerRank % unitNInt4PerBlock; + } + + for (size_t idx = offsetOfThisBlock + threadIdx.x; idx < offsetOfThisBlock + nInt4OfThisBlock; idx += blockDim.x) { + int4 data = buff4[nInt4PerRank * rank + idx]; + for (size_t peerIdx = 0; peerIdx < nPeer; peerIdx++) { + const size_t remoteRank = (peerIdx < rank) ? peerIdx : peerIdx + 1; + int4 val = scratch4[nInt4PerRank * remoteRank + idx]; + data = add_vectors(val, data); + } + resultBuff4[nInt4PerRank * rank + idx] = data; + for (size_t peerIdx = 0; peerIdx < nPeer; peerIdx++) { + const size_t remoteRank = (peerIdx < rank) ? peerIdx : peerIdx + 1; + smOutChans[peerIdx].write(nInt4PerRank * remoteRank + idx, data); + } + } +} + template cudaError_t allreduce(T* buff, T* scratch, T* resultBuff, mscclpp::DeviceHandle* smChannels, mscclpp::DeviceHandle* smOutChannels, int rank, int nRanksPerNode, int worldSize, size_t nelems, cudaStream_t stream) { static uint32_t flag = 1; - if (sizeof(T) * nelems <= (1 << 20)) { #if defined(__HIP_PLATFORM_AMD__) - int nBlocks = 28; - int nThreadsPerBlock = 1024; - if (nelems >= 8192) { - nBlocks = 56; - nThreadsPerBlock = (nelems <= 76800) ? 512 : 1024; - } + // int nBlocks = 28; + // int nThreadsPerBlock = 1024; + // if (nelems >= 8192) { + // nBlocks = 56; + // nThreadsPerBlock = (nelems <= 76800) ? 
512 : 1024; + // } + int nBlocks = 1; + int nThreadsPerBlock = 1024; + if (sizeof(T) * nelems <= (1 << 20)) { allreduce7<<>>(buff, scratch, resultBuff, smChannels, rank, nRanksPerNode, worldSize, nelems, flag++); + } else { + allreduce8<<>>(buff, scratch, resultBuff, smChannels, smOutChannels, rank, nRanksPerNode, + worldSize, nelems, flag++); + } #else + if (sizeof(T) * nelems <= (1 << 20)) { allreduce6<<<21, 512, 0, stream>>>(buff, scratch, resultBuff, smChannels, rank, nRanksPerNode, worldSize, nelems, flag++); -#endif } else { allreduce1<<<24, 1024, 0, stream>>>(buff, resultBuff, smChannels, smOutChannels, rank, worldSize, nelems); } +#endif return cudaGetLastError(); } @@ -613,8 +668,8 @@ NCCL_API ncclResult_t ncclGetUniqueId(ncclUniqueId* uniqueId) { return ncclSuccess; } -NCCL_API ncclResult_t ncclCommInitRankConfig(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank, - ncclConfig_t* config) { +NCCL_API ncclResult_t ncclCommInitRankConfig(ncclComm_t*, int, ncclUniqueId, int, + ncclConfig_t*) { // TODO: implement this function return ncclInternalError; } @@ -655,7 +710,6 @@ NCCL_API ncclResult_t ncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueI commPtr->comm = mscclppComm; commPtr->connections = std::move(connections); commPtr->smSemaphores = std::move(smSemaphores); - // using scratch buffer for message size less then 1MB commPtr->scratchBuff = mscclpp::allocExtSharedCuda(scratchSize); commPtr->remoteScratchRegMemories = setupRemoteMemories(commPtr->comm, rank, commPtr->scratchBuff.get(), scratchSize, mscclpp::Transport::CudaIpc); @@ -664,7 +718,7 @@ NCCL_API ncclResult_t ncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueI return ncclSuccess; } -NCCL_API ncclResult_t ncclCommInitAll(ncclComm_t* comm, int ndev, const int* devlist) { +NCCL_API ncclResult_t ncclCommInitAll(ncclComm_t*, int, const int*) { // TODO: implement this function return ncclInternalError; } @@ -680,12 +734,12 @@ NCCL_API ncclResult_t ncclCommDestroy(ncclComm_t comm) { return ncclSuccess; } -NCCL_API ncclResult_t ncclCommAbort(ncclComm_t comm) { +NCCL_API ncclResult_t ncclCommAbort(ncclComm_t) { // TODO: implement this function return ncclSuccess; } -NCCL_API ncclResult_t ncclCommSplit(ncclComm_t comm, int color, int key, ncclComm_t* newcomm, ncclConfig_t* config) { +NCCL_API ncclResult_t ncclCommSplit(ncclComm_t, int, int, ncclComm_t*, ncclConfig_t*) { // TODO: implement this function return ncclInternalError; } @@ -742,31 +796,31 @@ NCCL_API ncclResult_t ncclCommUserRank(const ncclComm_t comm, int* rank) { return ncclSuccess; } -NCCL_API ncclResult_t ncclRedOpCreatePreMulSum(ncclRedOp_t* op, void* scalar, ncclDataType_t datatype, - ncclScalarResidence_t residence, ncclComm_t comm) { +NCCL_API ncclResult_t ncclRedOpCreatePreMulSum(ncclRedOp_t*, void*, ncclDataType_t, + ncclScalarResidence_t, ncclComm_t) { // TODO: implement this function return ncclInternalError; } -NCCL_API ncclResult_t ncclRedOpDestroy(ncclRedOp_t op, ncclComm_t comm) { +NCCL_API ncclResult_t ncclRedOpDestroy(ncclRedOp_t, ncclComm_t) { // TODO: implement this function return ncclInternalError; } -NCCL_API ncclResult_t ncclReduce(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, - ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) { +NCCL_API ncclResult_t ncclReduce(const void*, void*, size_t, ncclDataType_t, + ncclRedOp_t, int, ncclComm_t, cudaStream_t) { // TODO: implement this function return ncclInternalError; } -NCCL_API ncclResult_t ncclBcast(void* buff, size_t count, 
ncclDataType_t datatype, int root, ncclComm_t comm, - cudaStream_t stream) { +NCCL_API ncclResult_t ncclBcast(void*, size_t, ncclDataType_t, int, ncclComm_t, + cudaStream_t) { // TODO: implement this function return ncclInternalError; } -NCCL_API ncclResult_t ncclBroadcast(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, - int root, ncclComm_t comm, cudaStream_t stream) { +NCCL_API ncclResult_t ncclBroadcast(const void*, void*, size_t, ncclDataType_t, + int, ncclComm_t, cudaStream_t) { // TODO: implement this function return ncclInternalError; } @@ -779,39 +833,27 @@ NCCL_API ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t channelKey key{sendbuff, recvbuff, bytes}; mscclpp::DeviceHandle* smChannels = nullptr; mscclpp::DeviceHandle* smOutChannels = nullptr; - if (bytes <= 1 << 20) { - auto it = comm->channelInfos.find(key); - if (it == comm->channelInfos.end()) { - std::vector channels = - setupSmChannels(comm, comm->remoteScratchRegMemories, const_cast(sendbuff)); - ChannelInfo channelInfo{channels, {}, setupSmChannelDeviceHandles(channels), nullptr}; - it = comm->channelInfos.emplace(key, channelInfo).first; - } - smChannels = it->second.smChannelDeviceHandles.get(); - } else { - auto it = comm->channelInfos.find(key); - if (it == comm->channelInfos.end()) { + + auto it = comm->channelInfos.find(key); + if (it == comm->channelInfos.end()) { + // setup smChannels (src: sendbuff, dst: remote scratch buff) + std::vector channels = setupSmChannels(comm, comm->remoteScratchRegMemories, const_cast(sendbuff)); + ChannelInfo channelInfo{channels, {}, setupSmChannelDeviceHandles(channels), nullptr}; + it = comm->channelInfos.emplace(key, channelInfo).first; + + // setup smOutChannels (src: recvbuff, dst: remote recvbuff) + if (bytes > (1 << 20)) { std::vector remoteMemories = - setupRemoteMemories(comm->comm, rank, const_cast(sendbuff), bytes, mscclpp::Transport::CudaIpc); - std::vector channels = setupSmChannels(comm, remoteMemories, const_cast(sendbuff)); - ChannelInfo channelInfo{channels, {}, setupSmChannelDeviceHandles(channels), nullptr}; - it = comm->channelInfos.emplace(key, channelInfo).first; - if (sendbuff != recvbuff) { - std::vector remoteMemories = - setupRemoteMemories(comm->comm, rank, recvbuff, bytes, mscclpp::Transport::CudaIpc); - std::vector outChannels = setupSmChannels(comm, remoteMemories, recvbuff); - std::shared_ptr> outPtr = setupSmChannelDeviceHandles(outChannels); - it->second.smOutChannels = outChannels; - it->second.smOutChannelDeviceHandles = outPtr; - } else { - std::shared_ptr> outPtr = setupSmChannelDeviceHandles(channels); - it->second.smOutChannelDeviceHandles = outPtr; - } + setupRemoteMemories(comm->comm, rank, recvbuff, bytes, mscclpp::Transport::CudaIpc); + std::vector outChannels = setupSmChannels(comm, remoteMemories, recvbuff); + it->second.smOutChannels = outChannels; + it->second.smOutChannelDeviceHandles = setupSmChannelDeviceHandles(outChannels); } - smChannels = it->second.smChannelDeviceHandles.get(); - smOutChannels = it->second.smOutChannelDeviceHandles.get(); } + smChannels = it->second.smChannelDeviceHandles.get(); + smOutChannels = it->second.smOutChannelDeviceHandles.get(); + switch (datatype) { case ncclFloat16: CUDACHECK(allreduce((half*)sendbuff, (half*)comm->scratchBuff.get(), (half*)recvbuff, smChannels, smOutChannels, @@ -834,8 +876,8 @@ NCCL_API ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t return ncclSuccess; } -NCCL_API ncclResult_t ncclReduceScatter(const 
void* sendbuff, void* recvbuff, size_t recvcount, ncclDataType_t datatype, - ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream) { +NCCL_API ncclResult_t ncclReduceScatter(const void*, void*, size_t, ncclDataType_t, + ncclRedOp_t, ncclComm_t, cudaStream_t) { // TODO: implement this function return ncclInternalError; } @@ -873,20 +915,20 @@ NCCL_API ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t return ncclSuccess; } -NCCL_API ncclResult_t ncclSend(const void* sendbuff, size_t count, ncclDataType_t datatype, int peer, ncclComm_t comm, - cudaStream_t stream) { +NCCL_API ncclResult_t ncclSend(const void*, size_t, ncclDataType_t, int, ncclComm_t, + cudaStream_t) { // TODO: implement this function return ncclInternalError; } -NCCL_API ncclResult_t ncclRecv(void* recvbuff, size_t count, ncclDataType_t datatype, int peer, ncclComm_t comm, - cudaStream_t stream) { +NCCL_API ncclResult_t ncclRecv(void*, size_t, ncclDataType_t, int, ncclComm_t, + cudaStream_t) { // TODO: implement this function return ncclInternalError; } -NCCL_API ncclResult_t ncclAllToAll(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, - ncclComm_t comm, cudaStream_t stream) { +NCCL_API ncclResult_t ncclAllToAll(const void*, void*, size_t, ncclDataType_t, + ncclComm_t, cudaStream_t) { // TODO: implement this function return ncclInternalError; } From 674ef34ed9b537dc96270561084e3601e1a3bd4e Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Sat, 24 Feb 2024 06:36:38 +0000 Subject: [PATCH 66/89] fix reduce-scatter --- apps/nccl/src/nccl.cu | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/apps/nccl/src/nccl.cu b/apps/nccl/src/nccl.cu index 5ca10fd29..d14cfd9a9 100644 --- a/apps/nccl/src/nccl.cu +++ b/apps/nccl/src/nccl.cu @@ -414,14 +414,16 @@ __global__ void allreduce8(T* buff, T* scratch, T* resultBuff, mscclpp::DeviceHa } __syncthreads(); - // Distribute `nInt4PerRank` across all blocks with the unit size `unitNInt4PerBlock` - constexpr size_t unitNInt4PerBlock = 1024; - const size_t nUnitsPerBlock = (nInt4PerRank + unitNInt4PerBlock - 1) / unitNInt4PerBlock; - size_t offsetOfThisBlock = unitNInt4PerBlock * nUnitsPerBlock * blockIdx.x; - size_t nInt4OfThisBlock = unitNInt4PerBlock * nUnitsPerBlock; - if ((nInt4PerRank % unitNInt4PerBlock != 0) && (blockIdx.x == gridDim.x - 1)) { - // The last block may have fewer int4 than others - nInt4OfThisBlock = unitNInt4PerBlock * (nUnitsPerBlock - 1) + nInt4PerRank % unitNInt4PerBlock; + // Distribute `nInt4PerRank` across all blocks with the unit size `unitNInt4` + constexpr size_t unitNInt4 = 1024; + const size_t maxNInt4PerBlock = (((nInt4PerRank + gridDim.x - 1) / gridDim.x) + unitNInt4 - 1) / unitNInt4 * unitNInt4; + size_t offsetOfThisBlock = maxNInt4PerBlock * blockIdx.x; + size_t nInt4OfThisBlock = maxNInt4PerBlock; + size_t nNeededBlocks = (nInt4PerRank + maxNInt4PerBlock - 1) / maxNInt4PerBlock; + if (blockIdx.x >= nNeededBlocks) { + nInt4OfThisBlock = 0; + } else if (blockIdx.x == nNeededBlocks - 1) { + nInt4OfThisBlock = nInt4PerRank - maxNInt4PerBlock * (nNeededBlocks - 1); } for (size_t idx = offsetOfThisBlock + threadIdx.x; idx < offsetOfThisBlock + nInt4OfThisBlock; idx += blockDim.x) { @@ -451,7 +453,7 @@ cudaError_t allreduce(T* buff, T* scratch, T* resultBuff, mscclpp::DeviceHandle< // nBlocks = 56; // nThreadsPerBlock = (nelems <= 76800) ? 
512 : 1024; // } - int nBlocks = 1; + int nBlocks = 28; int nThreadsPerBlock = 1024; if (sizeof(T) * nelems <= (1 << 20)) { allreduce7<<>>(buff, scratch, resultBuff, smChannels, rank, nRanksPerNode, From d8452c2d66287aabe00e03a64b74b6ea6f8eaa65 Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Sat, 24 Feb 2024 06:45:04 +0000 Subject: [PATCH 67/89] wip --- apps/nccl/src/nccl.cu | 32 +++++++++++++++++++++----------- 1 file changed, 21 insertions(+), 11 deletions(-) diff --git a/apps/nccl/src/nccl.cu b/apps/nccl/src/nccl.cu index d14cfd9a9..f6bc58e23 100644 --- a/apps/nccl/src/nccl.cu +++ b/apps/nccl/src/nccl.cu @@ -393,27 +393,20 @@ __global__ void allreduce7(T* buff, T* scratch, T* resultBuff, mscclpp::DeviceHa template __global__ void allreduce8(T* buff, T* scratch, T* resultBuff, mscclpp::DeviceHandle* smChannels, - mscclpp::DeviceHandle* smOutChannels, - int rank, int nRanksPerNode, int worldSize, size_t nelems, uint32_t flag) { + mscclpp::DeviceHandle* smOutChannels, int rank, int nRanksPerNode, + int worldSize, size_t nelems) { const size_t nPeer = nRanksPerNode - 1; const size_t chanOffset = nPeer * blockIdx.x; // assume (nelems * sizeof(T)) is divisible by (16 * worldSize) const size_t nInt4 = nelems * sizeof(T) / sizeof(int4); const size_t nInt4PerRank = nInt4 / worldSize; + auto smChans = smChannels + chanOffset; auto smOutChans = smOutChannels + chanOffset; int4* buff4 = reinterpret_cast(buff); int4* scratch4 = reinterpret_cast(scratch); int4* resultBuff4 = reinterpret_cast(resultBuff); - /// Starts reduce-scatter - - if (threadIdx.x < nPeer) { - smOutChans[threadIdx.x].relaxedSignal(); - smOutChans[threadIdx.x].wait(); - } - __syncthreads(); - // Distribute `nInt4PerRank` across all blocks with the unit size `unitNInt4` constexpr size_t unitNInt4 = 1024; const size_t maxNInt4PerBlock = (((nInt4PerRank + gridDim.x - 1) / gridDim.x) + unitNInt4 - 1) / unitNInt4 * unitNInt4; @@ -426,6 +419,23 @@ __global__ void allreduce8(T* buff, T* scratch, T* resultBuff, mscclpp::DeviceHa nInt4OfThisBlock = nInt4PerRank - maxNInt4PerBlock * (nNeededBlocks - 1); } + /// Starts allgather + for (size_t idx = offsetOfThisBlock + threadIdx.x; idx < offsetOfThisBlock + nInt4OfThisBlock; idx += blockDim.x) { + for (size_t peerIdx = 0; peerIdx < nPeer; peerIdx++) { + const size_t remoteRank = (peerIdx < rank) ? 
peerIdx : peerIdx + 1; + int4 val = buff4[nInt4PerRank * remoteRank + idx]; + smChans[peerIdx].write(nInt4PerRank * remoteRank + idx, val); + } + } + + /// Starts reduce-scatter + + if (threadIdx.x < nPeer) { + smOutChans[threadIdx.x].relaxedSignal(); + smOutChans[threadIdx.x].wait(); + } + __syncthreads(); + for (size_t idx = offsetOfThisBlock + threadIdx.x; idx < offsetOfThisBlock + nInt4OfThisBlock; idx += blockDim.x) { int4 data = buff4[nInt4PerRank * rank + idx]; for (size_t peerIdx = 0; peerIdx < nPeer; peerIdx++) { @@ -460,7 +470,7 @@ cudaError_t allreduce(T* buff, T* scratch, T* resultBuff, mscclpp::DeviceHandle< worldSize, nelems, flag++); } else { allreduce8<<>>(buff, scratch, resultBuff, smChannels, smOutChannels, rank, nRanksPerNode, - worldSize, nelems, flag++); + worldSize, nelems); } #else if (sizeof(T) * nelems <= (1 << 20)) { From 5c071841687921aba0300d0b192c7776594ab777 Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Sat, 24 Feb 2024 07:11:25 +0000 Subject: [PATCH 68/89] allreduce8 looks good --- apps/nccl/src/nccl.cu | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/apps/nccl/src/nccl.cu b/apps/nccl/src/nccl.cu index f6bc58e23..3a9ef4153 100644 --- a/apps/nccl/src/nccl.cu +++ b/apps/nccl/src/nccl.cu @@ -424,7 +424,7 @@ __global__ void allreduce8(T* buff, T* scratch, T* resultBuff, mscclpp::DeviceHa for (size_t peerIdx = 0; peerIdx < nPeer; peerIdx++) { const size_t remoteRank = (peerIdx < rank) ? peerIdx : peerIdx + 1; int4 val = buff4[nInt4PerRank * remoteRank + idx]; - smChans[peerIdx].write(nInt4PerRank * remoteRank + idx, val); + smChans[peerIdx].write(nInt4PerRank * rank + idx, val); } } @@ -446,7 +446,7 @@ __global__ void allreduce8(T* buff, T* scratch, T* resultBuff, mscclpp::DeviceHa resultBuff4[nInt4PerRank * rank + idx] = data; for (size_t peerIdx = 0; peerIdx < nPeer; peerIdx++) { const size_t remoteRank = (peerIdx < rank) ? peerIdx : peerIdx + 1; - smOutChans[peerIdx].write(nInt4PerRank * remoteRank + idx, data); + smOutChans[peerIdx].write(nInt4PerRank * rank + idx, data); } } } @@ -457,18 +457,18 @@ cudaError_t allreduce(T* buff, T* scratch, T* resultBuff, mscclpp::DeviceHandle< int worldSize, size_t nelems, cudaStream_t stream) { static uint32_t flag = 1; #if defined(__HIP_PLATFORM_AMD__) - // int nBlocks = 28; - // int nThreadsPerBlock = 1024; - // if (nelems >= 8192) { - // nBlocks = 56; - // nThreadsPerBlock = (nelems <= 76800) ? 512 : 1024; - // } - int nBlocks = 28; - int nThreadsPerBlock = 1024; if (sizeof(T) * nelems <= (1 << 20)) { + int nBlocks = 28; + int nThreadsPerBlock = 1024; + if (nelems >= 8192) { + nBlocks = 56; + nThreadsPerBlock = (nelems <= 76800) ? 
512 : 1024; + } allreduce7<<>>(buff, scratch, resultBuff, smChannels, rank, nRanksPerNode, worldSize, nelems, flag++); } else { + int nBlocks = 16; + int nThreadsPerBlock = 1024; allreduce8<<>>(buff, scratch, resultBuff, smChannels, smOutChannels, rank, nRanksPerNode, worldSize, nelems); } From efa4da276ffa7057a2239fab04a138b10c1908c4 Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Sat, 24 Feb 2024 07:34:07 +0000 Subject: [PATCH 69/89] some tuning --- apps/nccl/src/nccl.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/apps/nccl/src/nccl.cu b/apps/nccl/src/nccl.cu index 3a9ef4153..3ce1fc035 100644 --- a/apps/nccl/src/nccl.cu +++ b/apps/nccl/src/nccl.cu @@ -467,8 +467,8 @@ cudaError_t allreduce(T* buff, T* scratch, T* resultBuff, mscclpp::DeviceHandle< allreduce7<<>>(buff, scratch, resultBuff, smChannels, rank, nRanksPerNode, worldSize, nelems, flag++); } else { - int nBlocks = 16; - int nThreadsPerBlock = 1024; + int nBlocks = 32; + int nThreadsPerBlock = 512; allreduce8<<>>(buff, scratch, resultBuff, smChannels, smOutChannels, rank, nRanksPerNode, worldSize, nelems); } From 4f13178fdb55ae2216e90a63daf5a6b3aa04f2ba Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Sat, 24 Feb 2024 07:47:52 +0000 Subject: [PATCH 70/89] safety guard --- apps/nccl/src/nccl.cu | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/apps/nccl/src/nccl.cu b/apps/nccl/src/nccl.cu index 3ce1fc035..4a4967622 100644 --- a/apps/nccl/src/nccl.cu +++ b/apps/nccl/src/nccl.cu @@ -466,11 +466,13 @@ cudaError_t allreduce(T* buff, T* scratch, T* resultBuff, mscclpp::DeviceHandle< } allreduce7<<>>(buff, scratch, resultBuff, smChannels, rank, nRanksPerNode, worldSize, nelems, flag++); - } else { + } else if (sizeof(T) * nelems <= (40 << 20)) { int nBlocks = 32; int nThreadsPerBlock = 512; allreduce8<<>>(buff, scratch, resultBuff, smChannels, smOutChannels, rank, nRanksPerNode, worldSize, nelems); + } else { + // TODO: } #else if (sizeof(T) * nelems <= (1 << 20)) { From 0e4367b6404cdd71e274523d3e53a0d681fe7bb7 Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Mon, 26 Feb 2024 03:09:09 +0000 Subject: [PATCH 71/89] fix --- apps/nccl/src/nccl.cu | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/apps/nccl/src/nccl.cu b/apps/nccl/src/nccl.cu index 4a4967622..de639a7eb 100644 --- a/apps/nccl/src/nccl.cu +++ b/apps/nccl/src/nccl.cu @@ -576,7 +576,8 @@ __global__ void __launch_bounds__(1024, 1) template cudaError_t allgather(T* buff, T* scratch, T* resultBuff, mscclpp::DeviceHandle* smChannels, int rank, int nRanksPerNode, int worldSize, size_t nelems, cudaStream_t stream) { - allgather6<<<24, 1024, 0, stream>>>((void*)buff, smChannels, rank, worldSize, nRanksPerNode, nelems); + allgather6<<<28, 1024, 0, stream>>>((void*)buff, smChannels, rank, worldSize, nRanksPerNode, + nelems * sizeof(T) / sizeof(int)); return cudaGetLastError(); } From 86334d79bc1172debd846dcdf701a7664d0d978c Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Mon, 26 Feb 2024 07:10:02 +0000 Subject: [PATCH 72/89] fix perf issue --- apps/nccl/src/nccl.cu | 57 ++++++++++++++++++++++++++----------------- 1 file changed, 34 insertions(+), 23 deletions(-) diff --git a/apps/nccl/src/nccl.cu b/apps/nccl/src/nccl.cu index de639a7eb..80f9688ba 100644 --- a/apps/nccl/src/nccl.cu +++ b/apps/nccl/src/nccl.cu @@ -26,6 +26,7 @@ #if defined(__HIP_PLATFORM_AMD__) #define WARP_SIZE 64 +#define __syncwarp() __builtin_amdgcn_wave_barrier() #else #define WARP_SIZE 32 #endif @@ -144,9 +145,8 @@ __forceinline__ __device__ void 
vectorSum(T* dst, T* src, size_t nElem) {
 }
 
 // TODO:
-static const int nRanksPerNode = 8;
-
-static const int scratchSize = 1024 * 1024 * 40;
+static const int NRANKS_PER_NODE = 8;
+static const int SCRATCH_SIZE = 1024 * 1024 * 40;
 
 // static const mscclpp::Transport IBs[] = {mscclpp::Transport::IB0, mscclpp::Transport::IB1, mscclpp::Transport::IB2,
 //                                          mscclpp::Transport::IB3, mscclpp::Transport::IB4, mscclpp::Transport::IB5,
 //                                          mscclpp::Transport::IB6, mscclpp::Transport::IB7};
@@ -196,8 +196,9 @@ cudaError_t allreduce(int* buff, int* scratch, void* resultBuff, int rank, int n
 #include
 
 template <typename T>
-__global__ void allreduce6(T* buff, T* scratch, T* resultBuff, mscclpp::DeviceHandle<mscclpp::SmChannel>* smChannels,
-                           int rank, int nRanksPerNode, int worldSize, size_t nelems, uint32_t flag) {
+__global__ void __launch_bounds__(1024, 1)
+    allreduce6(T* buff, T* scratch, T* resultBuff, mscclpp::DeviceHandle<mscclpp::SmChannel>* smChannels, int rank,
+               int nRanksPerNode, int worldSize, size_t nelems, uint32_t flag) {
   // This version of allreduce only works for single nodes
   if (worldSize != nRanksPerNode) return;
   nelems = nelems / (sizeof(int) / sizeof(T));
@@ -253,9 +254,9 @@ __global__ void allreduce6(T* buff, T* scratch, T* resultBuff, mscclpp::DeviceHa
 }
 
 template <typename T>
-__global__ void allreduce1(T* src, T* dst, mscclpp::DeviceHandle<mscclpp::SmChannel>* smChannels,
-                           mscclpp::DeviceHandle<mscclpp::SmChannel>* smOutChannels, int rank, int nranks,
-                           size_t nelems) {
+__global__ void __launch_bounds__(1024, 1)
+    allreduce1(T* src, T* dst, mscclpp::DeviceHandle<mscclpp::SmChannel>* smChannels,
+               mscclpp::DeviceHandle<mscclpp::SmChannel>* smOutChannels, int rank, int nranks, size_t nelems) {
   const size_t chunkSize = nelems / nranks;
   if (nranks == 1) return;
   const int nPeer = nranks - 1;
@@ -333,8 +334,9 @@ __global__ void allreduce1(T* src, T* dst, mscclpp::DeviceHandle
 }
 
 template <typename T>
-__global__ void allreduce7(T* buff, T* scratch, T* resultBuff, mscclpp::DeviceHandle<mscclpp::SmChannel>* smChannels,
-                           int rank, int nRanksPerNode, int worldSize, size_t nelems, uint32_t flag) {
+__global__ void __launch_bounds__(1024, 1)
+    allreduce7(T* buff, T* scratch, T* resultBuff, mscclpp::DeviceHandle<mscclpp::SmChannel>* smChannels, int rank,
+               int nRanksPerNode, int worldSize, size_t nelems, uint32_t flag) {
   // This version of allreduce only works for single nodes
   if (worldSize != nRanksPerNode) return;
   nelems = nelems / (sizeof(int) / sizeof(T));
@@ -358,9 +360,17 @@ __global__ void allreduce7(T* buff, T* scratch, T* resultBuff, mscclpp::DeviceHa
   uint32_t* src = (uint32_t*)((char*)buff + rank * nelemsPerRank * sizeof(int));
   uint32_t* dst = (uint32_t*)((char*)resultBuff + rank * nelemsPerRank * sizeof(int));
 
+  // Put channels into shared memory; reading channel info from global memory is unexpectedly slow.
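The comment above is the crux of this patch: re-reading DeviceHandle structs from global memory on every channel operation is the bottleneck the commit identifies, so each block now stages the per-peer handles in shared memory once and works from the staged copies afterwards. A minimal standalone sketch of the pattern (kernel name and body are illustrative, not from the patch; the warp size is hard-coded here, whereas common.hpp picks 32 or 64 per platform):

#include <mscclpp/sm_channel_device.hpp>

constexpr int kWarpSize = 32;  // 64 on the AMD path, per common.hpp

// Stage per-peer channel handles in shared memory once per block; every later
// put/get/write/putPackets call then reads the handle from shared memory.
template <int kMaxPeers>
__global__ void withStagedChannels(mscclpp::DeviceHandle<mscclpp::SmChannel>* gChans, int nPeers) {
  __shared__ mscclpp::DeviceHandle<mscclpp::SmChannel> sChans[kMaxPeers];
  const int lid = threadIdx.x % kWarpSize;
  if (lid < nPeers) {
    sChans[lid] = gChans[lid];  // all warps write identical values: a benign race
  }
  __syncwarp();
  // ... use sChans[peerIdx] instead of gChans[peerIdx] from here on ...
}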
+ __shared__ mscclpp::DeviceHandle channels[NRANKS_PER_NODE - 1]; + const int lid = tid % WARP_SIZE; + if (lid < nPeers) { + channels[lid] = smChannels[lid]; + } + __syncwarp(); + // step 1: write to scratch buffer - smChannels[peerIdx].putPackets(scratchOffset, srcOffset, nelemsPerRank * sizeof(int), tid, - blockDim.x * nBlocksPerPeer, flag); + channels[peerIdx].putPackets(scratchOffset, srcOffset, nelemsPerRank * sizeof(int), tid, + blockDim.x * nBlocksPerPeer, flag); // step 2: get data from scratch buffer, reduce data and write result to remote scratch buffer for (int idx = threadIdx.x + blockIdx.x * blockDim.x; idx < nPktsPerRank; idx += blockDim.x * gridDim.x) { uint32_t data = 0; @@ -378,7 +388,7 @@ __global__ void allreduce7(T* buff, T* scratch, T* resultBuff, mscclpp::DeviceHa packet.flag = flag; size_t offset = scratchResultOffset / sizeof(mscclpp::LL8Packet) + (idx + rank * nPktsPerRank); for (int index = 0; index < nPeers; index++) { - smChannels[index].write(offset, packet); + channels[index].write(offset, packet); } } // step 3: get data result from scratch buffer @@ -392,9 +402,10 @@ __global__ void allreduce7(T* buff, T* scratch, T* resultBuff, mscclpp::DeviceHa } template -__global__ void allreduce8(T* buff, T* scratch, T* resultBuff, mscclpp::DeviceHandle* smChannels, - mscclpp::DeviceHandle* smOutChannels, int rank, int nRanksPerNode, - int worldSize, size_t nelems) { +__global__ void __launch_bounds__(1024, 1) + allreduce8(T* buff, T* scratch, T* resultBuff, mscclpp::DeviceHandle* smChannels, + mscclpp::DeviceHandle* smOutChannels, int rank, int nRanksPerNode, int worldSize, + size_t nelems) { const size_t nPeer = nRanksPerNode - 1; const size_t chanOffset = nPeer * blockIdx.x; // assume (nelems * sizeof(T)) is divisible by (16 * worldSize) @@ -725,9 +736,9 @@ NCCL_API ncclResult_t ncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueI commPtr->comm = mscclppComm; commPtr->connections = std::move(connections); commPtr->smSemaphores = std::move(smSemaphores); - commPtr->scratchBuff = mscclpp::allocExtSharedCuda(scratchSize); + commPtr->scratchBuff = mscclpp::allocExtSharedCuda(SCRATCH_SIZE); commPtr->remoteScratchRegMemories = - setupRemoteMemories(commPtr->comm, rank, commPtr->scratchBuff.get(), scratchSize, mscclpp::Transport::CudaIpc); + setupRemoteMemories(commPtr->comm, rank, commPtr->scratchBuff.get(), SCRATCH_SIZE, mscclpp::Transport::CudaIpc); *comm = commPtr; return ncclSuccess; @@ -872,17 +883,17 @@ NCCL_API ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t switch (datatype) { case ncclFloat16: CUDACHECK(allreduce((half*)sendbuff, (half*)comm->scratchBuff.get(), (half*)recvbuff, smChannels, smOutChannels, - rank, nRanksPerNode, comm->comm->bootstrap()->getNranks(), count, stream)); + rank, NRANKS_PER_NODE, comm->comm->bootstrap()->getNranks(), count, stream)); break; case ncclFloat32: CUDACHECK(allreduce((float*)sendbuff, (float*)comm->scratchBuff.get(), (float*)recvbuff, smChannels, - smOutChannels, comm->comm->bootstrap()->getRank(), nRanksPerNode, + smOutChannels, comm->comm->bootstrap()->getRank(), NRANKS_PER_NODE, comm->comm->bootstrap()->getNranks(), count, stream)); break; case ncclInt32: case ncclUint32: CUDACHECK(allreduce((int*)sendbuff, (int*)comm->scratchBuff.get(), (int*)recvbuff, smChannels, smOutChannels, - comm->comm->bootstrap()->getRank(), nRanksPerNode, comm->comm->bootstrap()->getNranks(), + comm->comm->bootstrap()->getRank(), NRANKS_PER_NODE, comm->comm->bootstrap()->getNranks(), count, stream)); break; default: @@ 
-922,10 +933,10 @@ NCCL_API ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t smChannels = it->second.smChannelDeviceHandles.get(); if ((char*)sendbuff == (char*)recvbuff + rank * sendcount) { CUDACHECK(allgather((int*)sendbuff, (int*)comm->scratchBuff.get(), (int*)recvbuff, smChannels, - rank, nRanksPerNode, nRank, bytes / sizeof(int), stream)); + rank, NRANKS_PER_NODE, nRank, bytes / sizeof(int), stream)); } else { CUDACHECK(allgather((int*)sendbuff, (int*)comm->scratchBuff.get(), (int*)recvbuff, smChannels, - rank, nRanksPerNode, nRank, bytes / sizeof(int), stream)); + rank, NRANKS_PER_NODE, nRank, bytes / sizeof(int), stream)); } return ncclSuccess; } From 12ae9db1aa59a020d5e7f67742e752f31b23eceb Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Mon, 26 Feb 2024 10:36:47 +0000 Subject: [PATCH 73/89] update --- apps/nccl/src/nccl.cu | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/apps/nccl/src/nccl.cu b/apps/nccl/src/nccl.cu index 80f9688ba..141e121e2 100644 --- a/apps/nccl/src/nccl.cu +++ b/apps/nccl/src/nccl.cu @@ -430,20 +430,29 @@ __global__ void __launch_bounds__(1024, 1) nInt4OfThisBlock = nInt4PerRank - maxNInt4PerBlock * (nNeededBlocks - 1); } + __shared__ mscclpp::DeviceHandle channels[NRANKS_PER_NODE - 1]; + __shared__ mscclpp::DeviceHandle outChannels[NRANKS_PER_NODE - 1]; + const int lid = threadIdx.x % WARP_SIZE; + if (lid < nPeer) { + channels[lid] = smChans[lid]; + outChannels[lid] = smOutChans[lid]; + } + __syncwarp(); + /// Starts allgather for (size_t idx = offsetOfThisBlock + threadIdx.x; idx < offsetOfThisBlock + nInt4OfThisBlock; idx += blockDim.x) { for (size_t peerIdx = 0; peerIdx < nPeer; peerIdx++) { const size_t remoteRank = (peerIdx < rank) ? peerIdx : peerIdx + 1; int4 val = buff4[nInt4PerRank * remoteRank + idx]; - smChans[peerIdx].write(nInt4PerRank * rank + idx, val); + channels[peerIdx].write(nInt4PerRank * rank + idx, val); } } /// Starts reduce-scatter if (threadIdx.x < nPeer) { - smOutChans[threadIdx.x].relaxedSignal(); - smOutChans[threadIdx.x].wait(); + outChannels[threadIdx.x].relaxedSignal(); + outChannels[threadIdx.x].wait(); } __syncthreads(); @@ -457,7 +466,7 @@ __global__ void __launch_bounds__(1024, 1) resultBuff4[nInt4PerRank * rank + idx] = data; for (size_t peerIdx = 0; peerIdx < nPeer; peerIdx++) { const size_t remoteRank = (peerIdx < rank) ? peerIdx : peerIdx + 1; - smOutChans[peerIdx].write(nInt4PerRank * rank + idx, data); + outChannels[peerIdx].write(nInt4PerRank * rank + idx, data); } } } From 057886ee824d167892647b0aa1a18662f76f78a4 Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Wed, 28 Feb 2024 04:14:47 +0000 Subject: [PATCH 74/89] clean code --- apps/nccl/src/allgather.hpp | 110 ++++++++ apps/nccl/src/allreduce.hpp | 438 ++++++++++++++++++++++++++++ apps/nccl/src/common.hpp | 17 ++ apps/nccl/src/nccl.cu | 549 +----------------------------------- 4 files changed, 572 insertions(+), 542 deletions(-) create mode 100644 apps/nccl/src/allgather.hpp create mode 100644 apps/nccl/src/allreduce.hpp create mode 100644 apps/nccl/src/common.hpp diff --git a/apps/nccl/src/allgather.hpp b/apps/nccl/src/allgather.hpp new file mode 100644 index 000000000..30a93d898 --- /dev/null +++ b/apps/nccl/src/allgather.hpp @@ -0,0 +1,110 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. 
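In the allgather6 body this new header introduces just below, work is handed out in warp-sized units: each thread moves unitBytesPerThread bytes (64 when the payload is large enough to keep every thread busy, otherwise 16), a warp moves unitBytesPerThread * WARP_SIZE bytes, and nLoop = bytes / unitBytes full rounds are spread round-robin across peers, with a tail branch for any remainder. A quick host-side check of those sizes under illustrative assumptions (8 ranks, 64 KB per GPU, 28 blocks of 1024 threads, warp size 64; plain C++, not part of the patch):

#include <cstddef>
#include <cstdio>

int main() {
  const size_t warpSize = 64, nBlock = 28, blockDim = 1024;  // launch shape used by allgather6
  const size_t nPeer = 7;                                    // 8 ranks per node - 1
  const size_t bytesPerGPU = 64 * 1024;                      // illustrative payload
  const size_t bytes = bytesPerGPU * nPeer;
  const size_t nThread = nBlock * blockDim;
  const size_t nWarp = nThread / warpSize;
  const size_t unitBytesPerThread = (bytes >= nThread * 64) ? 64 : 16;  // same rule as the kernel
  const size_t unitBytes = unitBytesPerThread * warpSize * nWarp;
  printf("nLoop=%zu, remainder=%zu bytes\n", bytes / unitBytes, bytes % unitBytes);
  return 0;
}

With these numbers every warp handles exactly one 1 KB unit (nLoop = 1, no remainder); shrinking the payload drives nLoop to 0 and routes everything through the remainder branch.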
+ +#ifndef ALLGATHER_HPP_ +#define ALLGATHER_HPP_ + +#include +#include +#include + +#include "common.hpp" + +template +__global__ void __launch_bounds__(1024, 1) + allgather6(void* sendbuff, mscclpp::DeviceHandle* smChannels, size_t rank, + [[maybe_unused]] size_t worldSize, size_t nRanksPerNode, size_t nelemsPerGPU) { + const size_t nBlock = gridDim.x; + if (blockIdx.x >= nBlock) return; + + const size_t tid = threadIdx.x + blockIdx.x * blockDim.x; + const size_t lid = tid % WARP_SIZE; + const size_t wid = tid / WARP_SIZE; + + const size_t nThread = blockDim.x * nBlock; + const size_t nWarp = nThread / WARP_SIZE; + const size_t nPeer = nRanksPerNode - 1; + const size_t chanOffset = nPeer * blockIdx.x; + auto smChans = smChannels + chanOffset; + + if (wid < nPeer && lid == 0) { + smChans[wid].relaxedSignal(); + smChans[wid].wait(); + } + __syncthreads(); + const size_t bytesPerGPU = nelemsPerGPU * sizeof(int); + const size_t bytes = bytesPerGPU * nPeer; + size_t unitBytesPerThread; + if (bytes >= nThread * 64) { + unitBytesPerThread = 64; + } else { + unitBytesPerThread = 16; + } + const size_t unitBytesPerWarp = unitBytesPerThread * WARP_SIZE; + const size_t unitBytes = unitBytesPerWarp * nWarp; + const size_t nLoop = bytes / unitBytes; + + if (nLoop > 0) { + // First loop unrolling + const size_t peerIdx = wid % nPeer; + const size_t offset = bytesPerGPU * rank + (wid / nPeer) * unitBytesPerWarp; + if constexpr (IsOutOfPlace) { + char* dst = reinterpret_cast(smChans[peerIdx].dst_); + char* src = reinterpret_cast(smChans[peerIdx].src_); + char* buff = reinterpret_cast(sendbuff); + const size_t offsetWithinRank = (wid / nPeer) * unitBytesPerWarp; + smChans[peerIdx].copy<16, false>(src + offset, buff + offsetWithinRank, unitBytesPerWarp, lid, WARP_SIZE); + smChans[peerIdx].copy<16, false>(dst + offset, buff + offsetWithinRank, unitBytesPerWarp, lid, WARP_SIZE); + } else { + smChans[peerIdx].put<16, false>(offset, unitBytesPerWarp, lid, WARP_SIZE); + } + } + + for (size_t i = 1; i < nLoop; ++i) { + const size_t gWid = wid + i * nWarp; + const size_t peerIdx = gWid % nPeer; + const size_t offset = bytesPerGPU * rank + (gWid / nPeer) * unitBytesPerWarp; + if constexpr (IsOutOfPlace) { + char* dst = reinterpret_cast(smChans[peerIdx].dst_); + char* src = reinterpret_cast(smChans[peerIdx].src_); + char* buff = reinterpret_cast(sendbuff); + const size_t offsetWithinRank = (gWid / nPeer) * unitBytesPerWarp; + smChans[peerIdx].copy<16, false>(src + offset, buff + offsetWithinRank, unitBytesPerWarp, lid, WARP_SIZE); + smChans[peerIdx].copy<16, false>(dst + offset, buff + offsetWithinRank, unitBytesPerWarp, lid, WARP_SIZE); + } else { + smChans[peerIdx].put<16, false>(offset, unitBytesPerWarp, lid, WARP_SIZE); + } + } + + if (bytes % unitBytes > 0) { + const size_t gWid = wid + nLoop * nWarp; + const size_t peerIdx = gWid % nPeer; + const size_t offsetWithinRank = (gWid / nPeer) * unitBytesPerWarp; + const size_t offset = bytesPerGPU * rank + offsetWithinRank; + const size_t remainBytes = (offsetWithinRank + unitBytesPerWarp > bytesPerGPU) + ? ((bytesPerGPU > offsetWithinRank) ? 
(bytesPerGPU - offsetWithinRank) : 0) + : unitBytesPerWarp; + if (remainBytes > 0) { + if constexpr (IsOutOfPlace) { + char* dst = reinterpret_cast(smChans[peerIdx].dst_); + char* src = reinterpret_cast(smChans[peerIdx].src_); + char* buff = reinterpret_cast(sendbuff); + smChans[peerIdx].copy<16, true>(src + offset, buff + offsetWithinRank, remainBytes, lid, WARP_SIZE); + smChans[peerIdx].copy<16, true>(dst + offset, buff + offsetWithinRank, remainBytes, lid, WARP_SIZE); + } else { + smChans[peerIdx].put<16, true>(offset, remainBytes, lid, WARP_SIZE); + } + } + } +} + +template +cudaError_t allgather(T* buff, [[maybe_unused]] T* scratch, [[maybe_unused]] T* resultBuff, + mscclpp::DeviceHandle* smChannels, int rank, int nRanksPerNode, int worldSize, + size_t nelems, cudaStream_t stream) { + allgather6<<<28, 1024, 0, stream>>>((void*)buff, smChannels, rank, worldSize, nRanksPerNode, + nelems * sizeof(T) / sizeof(int)); + return cudaGetLastError(); +} + +#endif // ALLGATHER_HPP_ diff --git a/apps/nccl/src/allreduce.hpp b/apps/nccl/src/allreduce.hpp new file mode 100644 index 000000000..b4ea6d132 --- /dev/null +++ b/apps/nccl/src/allreduce.hpp @@ -0,0 +1,438 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. + +#ifndef ALLREDUCE_HPP_ +#define ALLREDUCE_HPP_ + +#include +#include +#include +#include + +#include "common.hpp" + +extern __device__ mscclpp::DeviceSyncer deviceSyncer; + +template +__forceinline__ __device__ To bit_cast(const From& src) { + static_assert(sizeof(To) == sizeof(From), "Size mismatch for bit_cast"); + + union { + From f; + To t; + } u; + u.f = src; + return u.t; +} + +template +__forceinline__ __device__ T add_elements(T a, T b) { + return a + b; +} + +template <> +__forceinline__ __device__ __half2 add_elements(__half2 a, __half2 b) { + return __hadd2(a, b); +} + +template +__forceinline__ __device__ int4 add_vectors_helper(int4 a, int4 b) { + int4 ret; + ret.w = bit_cast(add_elements(bit_cast(a.w), bit_cast(b.w))); + ret.x = bit_cast(add_elements(bit_cast(a.x), bit_cast(b.x))); + ret.y = bit_cast(add_elements(bit_cast(a.y), bit_cast(b.y))); + ret.z = bit_cast(add_elements(bit_cast(a.z), bit_cast(b.z))); + return ret; +} + +template +__forceinline__ __device__ int4 add_vectors(int4 a, int4 b) { + return add_vectors_helper(a, b); +} + +template <> +__forceinline__ __device__ int4 add_vectors<__half>(int4 a, int4 b) { + return add_vectors_helper<__half2>(a, b); +} + +template +__forceinline__ __device__ uint2 add_vectors_helper(uint2 a, uint2 b) { + uint2 ret; + ret.x = bit_cast(add_elements(bit_cast(a.x), bit_cast(b.x))); + ret.y = bit_cast(add_elements(bit_cast(a.y), bit_cast(b.y))); + return ret; +} + +template +__forceinline__ __device__ uint2 add_vectors(uint2 a, uint2 b) { + return add_vectors_helper(a, b); +} + +template <> +__forceinline__ __device__ uint2 add_vectors<__half>(uint2 a, uint2 b) { + return add_vectors_helper<__half2>(a, b); +} + +template +__forceinline__ __device__ int add_vectors_helper(int a, int b) { + return bit_cast(add_elements(bit_cast(a), bit_cast(b))); +} + +template +__forceinline__ __device__ int add_vectors(int a, int b) { + return add_vectors_helper(a, b); +} + +template <> +__forceinline__ __device__ int add_vectors<__half>(int a, int b) { + return add_vectors_helper<__half2>(a, b); +} + +template +__forceinline__ __device__ uint32_t add_vectors_helper(uint32_t a, uint32_t b) { + return bit_cast(add_elements(bit_cast(a), bit_cast(b))); +} + +template +__forceinline__ __device__ uint32_t 
add_vectors(uint32_t a, uint32_t b) { + return add_vectors_helper(a, b); +} + +template <> +__forceinline__ __device__ uint32_t add_vectors<__half>(uint32_t a, uint32_t b) { + return add_vectors_helper<__half2>(a, b); +} + +template +__forceinline__ __device__ void vectorSum(T* dst, T* src, size_t nElem, int blockId, int nBlocks) { + size_t nInt4 = nElem / 4; + size_t nLastInts = nElem % 4; + int4* dst4 = (int4*)dst; + int4* src4 = (int4*)src; + for (size_t i = threadIdx.x + blockId * blockDim.x; i < nInt4; i += blockDim.x * nBlocks) { + dst4[i] = add_vectors(dst4[i], src4[i]); + } + if (nLastInts > 0) { + int* dstLast = ((int*)dst) + nInt4 * 4; + int* srcLast = ((int*)src) + nInt4 * 4; + for (size_t i = threadIdx.x + blockId * blockDim.x; i < nLastInts; i += blockDim.x * nBlocks) { + dstLast[i] = add_vectors(dstLast[i], srcLast[i]); + } + } +} + +template +__forceinline__ __device__ void vectorSum(T* dst, T* src, size_t nElem) { + vectorSum(dst, src, nElem, blockIdx.x, gridDim.x); +} + +template +__global__ void __launch_bounds__(1024, 1) + allreduce6(T* buff, T* scratch, T* resultBuff, mscclpp::DeviceHandle* smChannels, int rank, + int nRanksPerNode, int worldSize, size_t nelems, uint32_t flag) { + // This version of allreduce only works for single nodes + if (worldSize != nRanksPerNode) return; + nelems = nelems / (sizeof(int) / sizeof(T)); + const int nPeers = nRanksPerNode - 1; + const int nPkts = nelems / 2; + const int nelemsPerRank = nelems / worldSize; + const int nPktsPerRank = nelemsPerRank / 2; + // thread block & channel info + const int nBlocksPerPeer = gridDim.x / nPeers; + const int localBlockIdx = blockIdx.x % nBlocksPerPeer; + const int peerIdx = blockIdx.x / nBlocksPerPeer; + const int remoteRank = peerIdx < rank ? peerIdx : peerIdx + 1; + mscclpp::SmChannelDeviceHandle smChan = smChannels[peerIdx]; + const int tid = threadIdx.x + localBlockIdx * blockDim.x; + // double buffering + size_t scratchBaseOffset = (flag & 1) ? 0 : nPkts * sizeof(mscclpp::LLPacket); + void* scratchBuff = (void*)((char*)scratch + scratchBaseOffset); + size_t scratchOffset = scratchBaseOffset + rank * nPktsPerRank * sizeof(mscclpp::LLPacket); + size_t scratchResultOffset = + (flag & 1) ? 2 * nPkts * sizeof(mscclpp::LLPacket) : 3 * nPkts * sizeof(mscclpp::LLPacket); + size_t srcOffset = remoteRank * nelemsPerRank * sizeof(int); + uint2* src = (uint2*)((char*)buff + rank * nelemsPerRank * sizeof(int)); + uint2* dst = (uint2*)((char*)resultBuff + rank * nelemsPerRank * sizeof(int)); + + // step 1: write to scratch buffer + smChan.putPackets(scratchOffset, srcOffset, nelemsPerRank * sizeof(int), tid, blockDim.x * nBlocksPerPeer, flag); + // step 2: get data from scratch buffer, reduce data and write result to remote scratch buffer + for (int idx = threadIdx.x + blockIdx.x * blockDim.x; idx < nPktsPerRank; idx += blockDim.x * gridDim.x) { + uint2 data = make_uint2(0, 0); + for (int index = 0; index < nPeers; index++) { + const int remoteRank = index < rank ? 
index : index + 1; + mscclpp::LLPacket* dstPkt = (mscclpp::LLPacket*)scratchBuff + remoteRank * nPktsPerRank; + uint2 val = dstPkt[idx].read(flag); + data = add_vectors(val, data); + } + data = add_vectors(data, src[idx]); + dst[idx].x = data.x; + dst[idx].y = data.y; + for (int index = 0; index < nPeers; index++) { + mscclpp::LLPacket* dstPkt = (mscclpp::LLPacket*)((char*)smChannels[index].dst_ + scratchResultOffset); + dstPkt[idx + rank * nPktsPerRank].write(data.x, data.y, flag); + } + } + // step 3: get data result from scratch buffer + mscclpp::LLPacket* dstPkt = (mscclpp::LLPacket*)((char*)scratch + scratchResultOffset); + const int dstOffset = remoteRank * nPktsPerRank; + uint2* result = (uint2*)((char*)resultBuff + remoteRank * nelemsPerRank * sizeof(int)); + for (int idx = threadIdx.x + localBlockIdx * blockDim.x; idx < nPktsPerRank; idx += blockDim.x * nBlocksPerPeer) { + uint2 data = dstPkt[idx + dstOffset].read(flag); + result[idx].x = data.x; + result[idx].y = data.y; + } +} + +template +__global__ void __launch_bounds__(1024, 1) + allreduce1(T* src, T* dst, mscclpp::DeviceHandle* smChannels, + mscclpp::DeviceHandle* smOutChannels, int rank, int nranks, size_t nelems) { + const size_t chunkSize = nelems / nranks; + if (nranks == 1) return; + const int nPeer = nranks - 1; + const size_t indexOffset = rank * chunkSize; + const size_t vectorSize = sizeof(int4) / sizeof(T); + const size_t indexOffset4 = indexOffset / vectorSize; + int4* src4 = (int4*)src; + int4* dst4 = (int4*)dst; + const int tid = threadIdx.x + blockIdx.x * blockDim.x; + + // synchronize everyone + if (tid == 0) { + __threadfence_system(); + } + __syncthreads(); + if (tid < nPeer) { + smChannels[tid].relaxedSignal(); + } + if (tid >= nPeer && tid < nPeer * 2) { + smChannels[tid - nPeer].wait(); + } + deviceSyncer.sync(gridDim.x); + + // use int4 as much as possible + const size_t nInt4 = chunkSize / vectorSize; + for (size_t idx = threadIdx.x + blockIdx.x * blockDim.x; idx < nInt4; idx += blockDim.x * gridDim.x) { + int4 tmp = src4[indexOffset4 + idx]; + for (int index = 0; index < nPeer; ++index) { + int4 val; + int peerIdx = (index + rank); + if (peerIdx >= nPeer) peerIdx -= nPeer; + val = smChannels[peerIdx].read(indexOffset4 + idx); + tmp = add_vectors(tmp, val); + } + dst4[indexOffset4 + idx] = tmp; + } + + // use the given TYPE for the rest + size_t processed = nInt4 * vectorSize * nranks; + const size_t nRemElems = nelems - processed; + const size_t startIdx = processed + (nRemElems * rank) / nranks; + const size_t endIdx = processed + (nRemElems * (rank + 1)) / nranks; + for (size_t idx = threadIdx.x + blockIdx.x * blockDim.x + startIdx; idx < endIdx; idx += blockDim.x * gridDim.x) { + T tmp = src[idx]; + for (int index = 0; index < nPeer; ++index) { + int peerIdx = (index + rank); + if (peerIdx >= nPeer) peerIdx -= nPeer; + T val = smChannels[peerIdx].read(idx); + tmp += val; + } + dst[idx] = tmp; + } + + // synchronize everyone again + deviceSyncer.sync(gridDim.x); + if (tid == 0) { + __threadfence_system(); + } + __syncthreads(); + if (tid < nPeer) { + smChannels[tid].relaxedSignal(); + } + if (tid >= nPeer && tid < nPeer * 2) { + smChannels[tid - nPeer].wait(); + } + + deviceSyncer.sync(gridDim.x); + for (int i = 0; i < nPeer; ++i) { + int peerIdx = (i + rank); + if (peerIdx >= nPeer) peerIdx -= nPeer; + const int remoteRank = (peerIdx < rank ? 
peerIdx : peerIdx + 1);
+    size_t offset = chunkSize * remoteRank * sizeof(T);
+    smOutChannels[peerIdx].get(offset, chunkSize * sizeof(T), tid, blockDim.x * gridDim.x);
+  }
+}
+
+template <typename T>
+__global__ void __launch_bounds__(1024, 1)
+    allreduce7(T* buff, T* scratch, T* resultBuff, mscclpp::DeviceHandle<mscclpp::SmChannel>* smChannels, int rank,
+               int nRanksPerNode, int worldSize, size_t nelems, uint32_t flag) {
+  // This version of allreduce only works for single nodes
+  if (worldSize != nRanksPerNode) return;
+  nelems = nelems / (sizeof(int) / sizeof(T));
+  const int nPeers = nRanksPerNode - 1;
+  const size_t nPkts = nelems;
+  const int nelemsPerRank = nelems / worldSize;
+  const int nPktsPerRank = nelemsPerRank;
+  // thread block & channel info
+  const int nBlocksPerPeer = gridDim.x / nPeers;
+  const int localBlockIdx = blockIdx.x % nBlocksPerPeer;
+  const int peerIdx = blockIdx.x / nBlocksPerPeer;
+  const int remoteRank = peerIdx < rank ? peerIdx : peerIdx + 1;
+  const int tid = threadIdx.x + localBlockIdx * blockDim.x;
+  // double buffering
+  size_t scratchBaseOffset = (flag & 1) ? 0 : nPkts * sizeof(mscclpp::LL8Packet);
+  void* scratchBuff = (void*)((char*)scratch + scratchBaseOffset);
+  size_t scratchOffset = scratchBaseOffset + rank * nPktsPerRank * sizeof(mscclpp::LL8Packet);
+  size_t scratchResultOffset =
+      (flag & 1) ? 2 * nPkts * sizeof(mscclpp::LL8Packet) : 3 * nPkts * sizeof(mscclpp::LL8Packet);
+  size_t srcOffset = remoteRank * nelemsPerRank * sizeof(int);
+  uint32_t* src = (uint32_t*)((char*)buff + rank * nelemsPerRank * sizeof(int));
+  uint32_t* dst = (uint32_t*)((char*)resultBuff + rank * nelemsPerRank * sizeof(int));
+
+  // Put channels into shared memory; reading channel info from global memory is unexpectedly slow.
+  __shared__ mscclpp::DeviceHandle<mscclpp::SmChannel> channels[NRANKS_PER_NODE - 1];
+  const int lid = tid % WARP_SIZE;
+  if (lid < nPeers) {
+    channels[lid] = smChannels[lid];
+  }
+  __syncwarp();
+
+  // step 1: write to scratch buffer
+  channels[peerIdx].putPackets(scratchOffset, srcOffset, nelemsPerRank * sizeof(int), tid,
+                               blockDim.x * nBlocksPerPeer, flag);
+  // step 2: get data from scratch buffer, reduce data and write result to remote scratch buffer
+  for (int idx = threadIdx.x + blockIdx.x * blockDim.x; idx < nPktsPerRank; idx += blockDim.x * gridDim.x) {
+    uint32_t data = 0;
+    for (int index = 0; index < nPeers; index++) {
+      const int remoteRank = index < rank ?
index : index + 1; + mscclpp::LL8Packet* dstPkt = (mscclpp::LL8Packet*)scratchBuff + remoteRank * nPktsPerRank; + uint32_t val = dstPkt[idx].read(flag); + data = add_vectors(val, data); + } + data = add_vectors(data, src[idx]); + dst[idx] = data; + + mscclpp::LL8Packet packet; + packet.data = data; + packet.flag = flag; + size_t offset = scratchResultOffset / sizeof(mscclpp::LL8Packet) + (idx + rank * nPktsPerRank); + for (int index = 0; index < nPeers; index++) { + channels[index].write(offset, packet); + } + } + // step 3: get data result from scratch buffer + mscclpp::LL8Packet* dstPkt = (mscclpp::LL8Packet*)((char*)scratch + scratchResultOffset); + const int dstOffset = remoteRank * nPktsPerRank; + uint32_t* result = (uint32_t*)((char*)resultBuff + remoteRank * nelemsPerRank * sizeof(int)); + for (int idx = threadIdx.x + localBlockIdx * blockDim.x; idx < nPktsPerRank; idx += blockDim.x * nBlocksPerPeer) { + uint32_t data = dstPkt[idx + dstOffset].read(flag); + result[idx] = data; + } +} + +template +__global__ void __launch_bounds__(1024, 1) + allreduce8(T* buff, T* scratch, T* resultBuff, mscclpp::DeviceHandle* smChannels, + mscclpp::DeviceHandle* smOutChannels, int rank, int nRanksPerNode, int worldSize, + size_t nelems) { + const int nPeer = nRanksPerNode - 1; + const size_t chanOffset = nPeer * blockIdx.x; + // assume (nelems * sizeof(T)) is divisible by (16 * worldSize) + const size_t nInt4 = nelems * sizeof(T) / sizeof(int4); + const size_t nInt4PerRank = nInt4 / worldSize; + auto smChans = smChannels + chanOffset; + auto smOutChans = smOutChannels + chanOffset; + + int4* buff4 = reinterpret_cast(buff); + int4* scratch4 = reinterpret_cast(scratch); + int4* resultBuff4 = reinterpret_cast(resultBuff); + + // Distribute `nInt4PerRank` across all blocks with the unit size `unitNInt4` + constexpr size_t unitNInt4 = 1024; + const size_t maxNInt4PerBlock = (((nInt4PerRank + gridDim.x - 1) / gridDim.x) + unitNInt4 - 1) / unitNInt4 * unitNInt4; + size_t offsetOfThisBlock = maxNInt4PerBlock * blockIdx.x; + size_t nInt4OfThisBlock = maxNInt4PerBlock; + size_t nNeededBlocks = (nInt4PerRank + maxNInt4PerBlock - 1) / maxNInt4PerBlock; + if (blockIdx.x >= nNeededBlocks) { + nInt4OfThisBlock = 0; + } else if (blockIdx.x == nNeededBlocks - 1) { + nInt4OfThisBlock = nInt4PerRank - maxNInt4PerBlock * (nNeededBlocks - 1); + } + + __shared__ mscclpp::DeviceHandle channels[NRANKS_PER_NODE - 1]; + __shared__ mscclpp::DeviceHandle outChannels[NRANKS_PER_NODE - 1]; + const int lid = threadIdx.x % WARP_SIZE; + if (lid < nPeer) { + channels[lid] = smChans[lid]; + outChannels[lid] = smOutChans[lid]; + } + __syncwarp(); + + /// Starts allgather + for (size_t idx = offsetOfThisBlock + threadIdx.x; idx < offsetOfThisBlock + nInt4OfThisBlock; idx += blockDim.x) { + for (int peerIdx = 0; peerIdx < nPeer; peerIdx++) { + const int remoteRank = (peerIdx < rank) ? peerIdx : peerIdx + 1; + int4 val = buff4[nInt4PerRank * remoteRank + idx]; + channels[peerIdx].write(nInt4PerRank * rank + idx, val); + } + } + + /// Starts reduce-scatter + + if (threadIdx.x < static_cast(nPeer)) { + outChannels[threadIdx.x].relaxedSignal(); + outChannels[threadIdx.x].wait(); + } + __syncthreads(); + + for (size_t idx = offsetOfThisBlock + threadIdx.x; idx < offsetOfThisBlock + nInt4OfThisBlock; idx += blockDim.x) { + int4 data = buff4[nInt4PerRank * rank + idx]; + for (int peerIdx = 0; peerIdx < nPeer; peerIdx++) { + const int remoteRank = (peerIdx < rank) ? 
peerIdx : peerIdx + 1; + int4 val = scratch4[nInt4PerRank * remoteRank + idx]; + data = add_vectors(val, data); + } + resultBuff4[nInt4PerRank * rank + idx] = data; + for (int peerIdx = 0; peerIdx < nPeer; peerIdx++) { + outChannels[peerIdx].write(nInt4PerRank * rank + idx, data); + } + } +} + +template +cudaError_t allreduce(T* buff, T* scratch, T* resultBuff, mscclpp::DeviceHandle* smChannels, + mscclpp::DeviceHandle* smOutChannels, int rank, int nRanksPerNode, + int worldSize, size_t nelems, cudaStream_t stream) { + static uint32_t flag = 1; +#if defined(__HIP_PLATFORM_AMD__) + if (sizeof(T) * nelems <= (1 << 20)) { + int nBlocks = 28; + int nThreadsPerBlock = 1024; + if (nelems >= 8192) { + nBlocks = 56; + nThreadsPerBlock = (nelems <= 76800) ? 512 : 1024; + } + allreduce7<<>>(buff, scratch, resultBuff, smChannels, rank, nRanksPerNode, + worldSize, nelems, flag++); + } else if (sizeof(T) * nelems <= (40 << 20)) { + int nBlocks = 32; + int nThreadsPerBlock = 512; + allreduce8<<>>(buff, scratch, resultBuff, smChannels, smOutChannels, rank, nRanksPerNode, + worldSize, nelems); + } else { + // TODO: + } +#else + if (sizeof(T) * nelems <= (1 << 20)) { + allreduce6<<<21, 512, 0, stream>>>(buff, scratch, resultBuff, smChannels, rank, nRanksPerNode, worldSize, nelems, + flag++); + } else { + allreduce1<<<24, 1024, 0, stream>>>(buff, resultBuff, smChannels, smOutChannels, rank, worldSize, nelems); + } +#endif + return cudaGetLastError(); +} + +#endif // ALLREDUCE_KERNEL_H diff --git a/apps/nccl/src/common.hpp b/apps/nccl/src/common.hpp new file mode 100644 index 000000000..3843f8ea3 --- /dev/null +++ b/apps/nccl/src/common.hpp @@ -0,0 +1,17 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. + +#ifndef NCCL_COMMON_HPP_ +#define NCCL_COMMON_HPP_ + +#if defined(__HIP_PLATFORM_AMD__) +#define WARP_SIZE 64 +#define __syncwarp() __builtin_amdgcn_wave_barrier() +#else +#define WARP_SIZE 32 +#endif + +constexpr int NRANKS_PER_NODE = 8; +constexpr int SCRATCH_SIZE = 1024 * 1024 * 40; + +#endif // NCCL_COMMON_HPP_ diff --git a/apps/nccl/src/nccl.cu b/apps/nccl/src/nccl.cu index 141e121e2..d03e6fdcc 100644 --- a/apps/nccl/src/nccl.cu +++ b/apps/nccl/src/nccl.cu @@ -9,6 +9,8 @@ #include #include +#include "allgather.hpp" +#include "allreduce.hpp" #include "nccl.h" #define NCCL_API extern "C" __attribute__((visibility("default"))) @@ -23,137 +25,12 @@ } while (0) #define NUM_CHANNELS_PER_CONNECTION 64 - -#if defined(__HIP_PLATFORM_AMD__) -#define WARP_SIZE 64 -#define __syncwarp() __builtin_amdgcn_wave_barrier() -#else -#define WARP_SIZE 32 -#endif - -template -__forceinline__ __device__ To bit_cast(const From& src) { - static_assert(sizeof(To) == sizeof(From), "Size mismatch for bit_cast"); - - union { - From f; - To t; - } u; - u.f = src; - return u.t; -} - -template -__forceinline__ __device__ T add_elements(T a, T b) { - return a + b; -} - -template <> -__forceinline__ __device__ __half2 add_elements(__half2 a, __half2 b) { - return __hadd2(a, b); -} - -template -__forceinline__ __device__ int4 add_vectors_helper(int4 a, int4 b) { - int4 ret; - ret.w = bit_cast(add_elements(bit_cast(a.w), bit_cast(b.w))); - ret.x = bit_cast(add_elements(bit_cast(a.x), bit_cast(b.x))); - ret.y = bit_cast(add_elements(bit_cast(a.y), bit_cast(b.y))); - ret.z = bit_cast(add_elements(bit_cast(a.z), bit_cast(b.z))); - return ret; -} - -template -__forceinline__ __device__ int4 add_vectors(int4 a, int4 b) { - return add_vectors_helper(a, b); -} - -template <> -__forceinline__ __device__ int4 
add_vectors<__half>(int4 a, int4 b) { - return add_vectors_helper<__half2>(a, b); -} - -template -__forceinline__ __device__ uint2 add_vectors_helper(uint2 a, uint2 b) { - uint2 ret; - ret.x = bit_cast(add_elements(bit_cast(a.x), bit_cast(b.x))); - ret.y = bit_cast(add_elements(bit_cast(a.y), bit_cast(b.y))); - return ret; -} - -template -__forceinline__ __device__ uint2 add_vectors(uint2 a, uint2 b) { - return add_vectors_helper(a, b); -} - -template <> -__forceinline__ __device__ uint2 add_vectors<__half>(uint2 a, uint2 b) { - return add_vectors_helper<__half2>(a, b); -} - -template -__forceinline__ __device__ int add_vectors_helper(int a, int b) { - return bit_cast(add_elements(bit_cast(a), bit_cast(b))); -} - -template -__forceinline__ __device__ int add_vectors(int a, int b) { - return add_vectors_helper(a, b); -} - -template <> -__forceinline__ __device__ int add_vectors<__half>(int a, int b) { - return add_vectors_helper<__half2>(a, b); -} - -template -__forceinline__ __device__ uint32_t add_vectors_helper(uint32_t a, uint32_t b) { - return bit_cast(add_elements(bit_cast(a), bit_cast(b))); -} - -template -__forceinline__ __device__ uint32_t add_vectors(uint32_t a, uint32_t b) { - return add_vectors_helper(a, b); -} - -template <> -__forceinline__ __device__ uint32_t add_vectors<__half>(uint32_t a, uint32_t b) { - return add_vectors_helper<__half2>(a, b); -} - -template -__forceinline__ __device__ void vectorSum(T* dst, T* src, size_t nElem, int blockId, int nBlocks) { - size_t nInt4 = nElem / 4; - size_t nLastInts = nElem % 4; - int4* dst4 = (int4*)dst; - int4* src4 = (int4*)src; - for (size_t i = threadIdx.x + blockId * blockDim.x; i < nInt4; i += blockDim.x * nBlocks) { - dst4[i] = add_vectors(dst4[i], src4[i]); - } - if (nLastInts > 0) { - int* dstLast = ((int*)dst) + nInt4 * 4; - int* srcLast = ((int*)src) + nInt4 * 4; - for (size_t i = threadIdx.x + blockId * blockDim.x; i < nLastInts; i += blockDim.x * nBlocks) { - dstLast[i] = add_vectors(dstLast[i], srcLast[i]); - } - } -} - -template -__forceinline__ __device__ void vectorSum(T* dst, T* src, size_t nElem) { - vectorSum(dst, src, nElem, blockIdx.x, gridDim.x); -} - -// TODO: -static const int NRANKS_PER_NODE = 8; -static const int SCRATCH_SIZE = 1024 * 1024 * 40; +__device__ mscclpp::DeviceSyncer deviceSyncer; // static const mscclpp::Transport IBs[] = {mscclpp::Transport::IB0, mscclpp::Transport::IB1, mscclpp::Transport::IB2, // mscclpp::Transport::IB3, mscclpp::Transport::IB4, mscclpp::Transport::IB5, // mscclpp::Transport::IB6, mscclpp::Transport::IB7}; -__device__ mscclpp::DeviceSyncer deviceSyncer; - struct channelKey { const void* sendbuff; const void* recvbuff; @@ -189,418 +66,6 @@ struct ncclComm { std::vector remoteScratchRegMemories; }; -cudaError_t allreduce(int* buff, int* scratch, void* resultBuff, int rank, int nRanksPerNode, int worldSize, - size_t nelems, cudaStream_t stream); - -#include -#include - -template -__global__ void __launch_bounds__(1024, 1) - allreduce6(T* buff, T* scratch, T* resultBuff, mscclpp::DeviceHandle* smChannels, int rank, - int nRanksPerNode, int worldSize, size_t nelems, uint32_t flag) { - // This version of allreduce only works for single nodes - if (worldSize != nRanksPerNode) return; - nelems = nelems / (sizeof(int) / sizeof(T)); - const int nPeers = nRanksPerNode - 1; - const int nPkts = nelems / 2; - const int nelemsPerRank = nelems / worldSize; - const int nPktsPerRank = nelemsPerRank / 2; - // thread block & channel info - const int nBlocksPerPeer = gridDim.x / nPeers; - const 
int localBlockIdx = blockIdx.x % nBlocksPerPeer; - const int peerIdx = blockIdx.x / nBlocksPerPeer; - const int remoteRank = peerIdx < rank ? peerIdx : peerIdx + 1; - mscclpp::SmChannelDeviceHandle smChan = smChannels[peerIdx]; - const int tid = threadIdx.x + localBlockIdx * blockDim.x; - // double buffering - size_t scratchBaseOffset = (flag & 1) ? 0 : nPkts * sizeof(mscclpp::LLPacket); - void* scratchBuff = (void*)((char*)scratch + scratchBaseOffset); - size_t scratchOffset = scratchBaseOffset + rank * nPktsPerRank * sizeof(mscclpp::LLPacket); - size_t scratchResultOffset = - (flag & 1) ? 2 * nPkts * sizeof(mscclpp::LLPacket) : 3 * nPkts * sizeof(mscclpp::LLPacket); - size_t srcOffset = remoteRank * nelemsPerRank * sizeof(int); - uint2* src = (uint2*)((char*)buff + rank * nelemsPerRank * sizeof(int)); - uint2* dst = (uint2*)((char*)resultBuff + rank * nelemsPerRank * sizeof(int)); - - // step 1: write to scratch buffer - smChan.putPackets(scratchOffset, srcOffset, nelemsPerRank * sizeof(int), tid, blockDim.x * nBlocksPerPeer, flag); - // step 2: get data from scratch buffer, reduce data and write result to remote scratch buffer - for (int idx = threadIdx.x + blockIdx.x * blockDim.x; idx < nPktsPerRank; idx += blockDim.x * gridDim.x) { - uint2 data = make_uint2(0, 0); - for (int index = 0; index < nPeers; index++) { - const int remoteRank = index < rank ? index : index + 1; - mscclpp::LLPacket* dstPkt = (mscclpp::LLPacket*)scratchBuff + remoteRank * nPktsPerRank; - uint2 val = dstPkt[idx].read(flag); - data = add_vectors(val, data); - } - data = add_vectors(data, src[idx]); - dst[idx].x = data.x; - dst[idx].y = data.y; - for (int index = 0; index < nPeers; index++) { - mscclpp::LLPacket* dstPkt = (mscclpp::LLPacket*)((char*)smChannels[index].dst_ + scratchResultOffset); - dstPkt[idx + rank * nPktsPerRank].write(data.x, data.y, flag); - } - } - // step 3: get data result from scratch buffer - mscclpp::LLPacket* dstPkt = (mscclpp::LLPacket*)((char*)scratch + scratchResultOffset); - const int dstOffset = remoteRank * nPktsPerRank; - uint2* result = (uint2*)((char*)resultBuff + remoteRank * nelemsPerRank * sizeof(int)); - for (int idx = threadIdx.x + localBlockIdx * blockDim.x; idx < nPktsPerRank; idx += blockDim.x * nBlocksPerPeer) { - uint2 data = dstPkt[idx + dstOffset].read(flag); - result[idx].x = data.x; - result[idx].y = data.y; - } -} - -template -__global__ void __launch_bounds__(1024, 1) - allreduce1(T* src, T* dst, mscclpp::DeviceHandle* smChannels, - mscclpp::DeviceHandle* smOutChannels, int rank, int nranks, size_t nelems) { - const size_t chunkSize = nelems / nranks; - if (nranks == 1) return; - const int nPeer = nranks - 1; - const size_t indexOffset = rank * chunkSize; - const size_t vectorSize = sizeof(int4) / sizeof(T); - const size_t indexOffset4 = indexOffset / vectorSize; - int4* src4 = (int4*)src; - int4* dst4 = (int4*)dst; - const int tid = threadIdx.x + blockIdx.x * blockDim.x; - - // synchronize everyone - if (tid == 0) { - __threadfence_system(); - } - __syncthreads(); - if (tid < nPeer) { - smChannels[tid].relaxedSignal(); - } - if (tid >= nPeer && tid < nPeer * 2) { - smChannels[tid - nPeer].wait(); - } - deviceSyncer.sync(gridDim.x); - - // use int4 as much as possible - const size_t nInt4 = chunkSize / vectorSize; - for (size_t idx = threadIdx.x + blockIdx.x * blockDim.x; idx < nInt4; idx += blockDim.x * gridDim.x) { - int4 tmp = src4[indexOffset4 + idx]; - for (int index = 0; index < nPeer; ++index) { - int4 val; - int peerIdx = (index + rank); - if (peerIdx >= 
nPeer) peerIdx -= nPeer; - val = smChannels[peerIdx].read(indexOffset4 + idx); - tmp = add_vectors(tmp, val); - } - dst4[indexOffset4 + idx] = tmp; - } - - // use the given TYPE for the rest - size_t processed = nInt4 * vectorSize * nranks; - const size_t nRemElems = nelems - processed; - const size_t startIdx = processed + (nRemElems * rank) / nranks; - const size_t endIdx = processed + (nRemElems * (rank + 1)) / nranks; - for (size_t idx = threadIdx.x + blockIdx.x * blockDim.x + startIdx; idx < endIdx; idx += blockDim.x * gridDim.x) { - T tmp = src[idx]; - for (int index = 0; index < nPeer; ++index) { - int peerIdx = (index + rank); - if (peerIdx >= nPeer) peerIdx -= nPeer; - T val = smChannels[peerIdx].read(idx); - tmp += val; - } - dst[idx] = tmp; - } - - // synchronize everyone again - deviceSyncer.sync(gridDim.x); - if (tid == 0) { - __threadfence_system(); - } - __syncthreads(); - if (tid < nPeer) { - smChannels[tid].relaxedSignal(); - } - if (tid >= nPeer && tid < nPeer * 2) { - smChannels[tid - nPeer].wait(); - } - - deviceSyncer.sync(gridDim.x); - for (int i = 0; i < nPeer; ++i) { - int peerIdx = (i + rank); - if (peerIdx >= nPeer) peerIdx -= nPeer; - const int remoteRank = (peerIdx < rank ? peerIdx : peerIdx + 1); - size_t offset = chunkSize * remoteRank * sizeof(T); - smOutChannels[peerIdx].get(offset, chunkSize * sizeof(T), tid, blockDim.x * gridDim.x); - } -} - -template -__global__ void __launch_bounds__(1024, 1) - allreduce7(T* buff, T* scratch, T* resultBuff, mscclpp::DeviceHandle* smChannels, int rank, - int nRanksPerNode, int worldSize, size_t nelems, uint32_t flag) { - // This version of allreduce only works for single nodes - if (worldSize != nRanksPerNode) return; - nelems = nelems / (sizeof(int) / sizeof(T)); - const int nPeers = nRanksPerNode - 1; - const size_t nPkts = nelems; - const int nelemsPerRank = nelems / worldSize; - const int nPktsPerRank = nelemsPerRank; - // thread block & channel info - const int nBlocksPerPeer = gridDim.x / nPeers; - const int localBlockIdx = blockIdx.x % nBlocksPerPeer; - const int peerIdx = blockIdx.x / nBlocksPerPeer; - const int remoteRank = peerIdx < rank ? peerIdx : peerIdx + 1; - const int tid = threadIdx.x + localBlockIdx * blockDim.x; - // double buffering - size_t scratchBaseOffset = (flag & 1) ? 0 : nPkts * sizeof(mscclpp::LL8Packet); - void* scratchBuff = (void*)((char*)scratch + scratchBaseOffset); - size_t scratchOffset = scratchBaseOffset + rank * nPktsPerRank * sizeof(mscclpp::LL8Packet); - size_t scratchResultOffset = - (flag & 1) ? 2 * nPkts * sizeof(mscclpp::LL8Packet) : 3 * nPkts * sizeof(mscclpp::LL8Packet); - size_t srcOffset = remoteRank * nelemsPerRank * sizeof(int); - uint32_t* src = (uint32_t*)((char*)buff + rank * nelemsPerRank * sizeof(int)); - uint32_t* dst = (uint32_t*)((char*)resultBuff + rank * nelemsPerRank * sizeof(int)); - - // Put channels into shared memory, read channel info from global memory is unexpectable slow. 
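Note the indexing idiom in the allreduce1 body being deleted above (it moves verbatim into allreduce.hpp): peers are visited as peerIdx = (index + rank) mod nPeer, spelled as an add plus a conditional subtract, so different ranks start their sweep at different peers instead of all reading from peer 0 first. A quick standalone check of the rotation (plain C++, illustrative only):

#include <cstdio>

int main() {
  const int nPeer = 7;  // 8 ranks per node -> 7 peers each
  for (int rank = 0; rank < 3; ++rank) {
    printf("rank %d visits peers:", rank);
    for (int index = 0; index < nPeer; ++index) {
      int peerIdx = (index + rank) % nPeer;  // equivalent to the conditional subtract in the code
      printf(" %d", peerIdx);
    }
    printf("\n");
  }
  return 0;
}

rank 0 sweeps 0,1,...,6 while rank 1 sweeps 1,2,...,0, which staggers read traffic across the node.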
- __shared__ mscclpp::DeviceHandle channels[NRANKS_PER_NODE - 1]; - const int lid = tid % WARP_SIZE; - if (lid < nPeers) { - channels[lid] = smChannels[lid]; - } - __syncwarp(); - - // step 1: write to scratch buffer - channels[peerIdx].putPackets(scratchOffset, srcOffset, nelemsPerRank * sizeof(int), tid, - blockDim.x * nBlocksPerPeer, flag); - // step 2: get data from scratch buffer, reduce data and write result to remote scratch buffer - for (int idx = threadIdx.x + blockIdx.x * blockDim.x; idx < nPktsPerRank; idx += blockDim.x * gridDim.x) { - uint32_t data = 0; - for (int index = 0; index < nPeers; index++) { - const int remoteRank = index < rank ? index : index + 1; - mscclpp::LL8Packet* dstPkt = (mscclpp::LL8Packet*)scratchBuff + remoteRank * nPktsPerRank; - uint32_t val = dstPkt[idx].read(flag); - data = add_vectors(val, data); - } - data = add_vectors(data, src[idx]); - dst[idx] = data; - - mscclpp::LL8Packet packet; - packet.data = data; - packet.flag = flag; - size_t offset = scratchResultOffset / sizeof(mscclpp::LL8Packet) + (idx + rank * nPktsPerRank); - for (int index = 0; index < nPeers; index++) { - channels[index].write(offset, packet); - } - } - // step 3: get data result from scratch buffer - mscclpp::LL8Packet* dstPkt = (mscclpp::LL8Packet*)((char*)scratch + scratchResultOffset); - const int dstOffset = remoteRank * nPktsPerRank; - uint32_t* result = (uint32_t*)((char*)resultBuff + remoteRank * nelemsPerRank * sizeof(int)); - for (int idx = threadIdx.x + localBlockIdx * blockDim.x; idx < nPktsPerRank; idx += blockDim.x * nBlocksPerPeer) { - uint32_t data = dstPkt[idx + dstOffset].read(flag); - result[idx] = data; - } -} - -template -__global__ void __launch_bounds__(1024, 1) - allreduce8(T* buff, T* scratch, T* resultBuff, mscclpp::DeviceHandle* smChannels, - mscclpp::DeviceHandle* smOutChannels, int rank, int nRanksPerNode, int worldSize, - size_t nelems) { - const size_t nPeer = nRanksPerNode - 1; - const size_t chanOffset = nPeer * blockIdx.x; - // assume (nelems * sizeof(T)) is divisible by (16 * worldSize) - const size_t nInt4 = nelems * sizeof(T) / sizeof(int4); - const size_t nInt4PerRank = nInt4 / worldSize; - auto smChans = smChannels + chanOffset; - auto smOutChans = smOutChannels + chanOffset; - - int4* buff4 = reinterpret_cast(buff); - int4* scratch4 = reinterpret_cast(scratch); - int4* resultBuff4 = reinterpret_cast(resultBuff); - - // Distribute `nInt4PerRank` across all blocks with the unit size `unitNInt4` - constexpr size_t unitNInt4 = 1024; - const size_t maxNInt4PerBlock = (((nInt4PerRank + gridDim.x - 1) / gridDim.x) + unitNInt4 - 1) / unitNInt4 * unitNInt4; - size_t offsetOfThisBlock = maxNInt4PerBlock * blockIdx.x; - size_t nInt4OfThisBlock = maxNInt4PerBlock; - size_t nNeededBlocks = (nInt4PerRank + maxNInt4PerBlock - 1) / maxNInt4PerBlock; - if (blockIdx.x >= nNeededBlocks) { - nInt4OfThisBlock = 0; - } else if (blockIdx.x == nNeededBlocks - 1) { - nInt4OfThisBlock = nInt4PerRank - maxNInt4PerBlock * (nNeededBlocks - 1); - } - - __shared__ mscclpp::DeviceHandle channels[NRANKS_PER_NODE - 1]; - __shared__ mscclpp::DeviceHandle outChannels[NRANKS_PER_NODE - 1]; - const int lid = threadIdx.x % WARP_SIZE; - if (lid < nPeer) { - channels[lid] = smChans[lid]; - outChannels[lid] = smOutChans[lid]; - } - __syncwarp(); - - /// Starts allgather - for (size_t idx = offsetOfThisBlock + threadIdx.x; idx < offsetOfThisBlock + nInt4OfThisBlock; idx += blockDim.x) { - for (size_t peerIdx = 0; peerIdx < nPeer; peerIdx++) { - const size_t remoteRank = (peerIdx < 
rank) ? peerIdx : peerIdx + 1; - int4 val = buff4[nInt4PerRank * remoteRank + idx]; - channels[peerIdx].write(nInt4PerRank * rank + idx, val); - } - } - - /// Starts reduce-scatter - - if (threadIdx.x < nPeer) { - outChannels[threadIdx.x].relaxedSignal(); - outChannels[threadIdx.x].wait(); - } - __syncthreads(); - - for (size_t idx = offsetOfThisBlock + threadIdx.x; idx < offsetOfThisBlock + nInt4OfThisBlock; idx += blockDim.x) { - int4 data = buff4[nInt4PerRank * rank + idx]; - for (size_t peerIdx = 0; peerIdx < nPeer; peerIdx++) { - const size_t remoteRank = (peerIdx < rank) ? peerIdx : peerIdx + 1; - int4 val = scratch4[nInt4PerRank * remoteRank + idx]; - data = add_vectors(val, data); - } - resultBuff4[nInt4PerRank * rank + idx] = data; - for (size_t peerIdx = 0; peerIdx < nPeer; peerIdx++) { - const size_t remoteRank = (peerIdx < rank) ? peerIdx : peerIdx + 1; - outChannels[peerIdx].write(nInt4PerRank * rank + idx, data); - } - } -} - -template -cudaError_t allreduce(T* buff, T* scratch, T* resultBuff, mscclpp::DeviceHandle* smChannels, - mscclpp::DeviceHandle* smOutChannels, int rank, int nRanksPerNode, - int worldSize, size_t nelems, cudaStream_t stream) { - static uint32_t flag = 1; -#if defined(__HIP_PLATFORM_AMD__) - if (sizeof(T) * nelems <= (1 << 20)) { - int nBlocks = 28; - int nThreadsPerBlock = 1024; - if (nelems >= 8192) { - nBlocks = 56; - nThreadsPerBlock = (nelems <= 76800) ? 512 : 1024; - } - allreduce7<<>>(buff, scratch, resultBuff, smChannels, rank, nRanksPerNode, - worldSize, nelems, flag++); - } else if (sizeof(T) * nelems <= (40 << 20)) { - int nBlocks = 32; - int nThreadsPerBlock = 512; - allreduce8<<>>(buff, scratch, resultBuff, smChannels, smOutChannels, rank, nRanksPerNode, - worldSize, nelems); - } else { - // TODO: - } -#else - if (sizeof(T) * nelems <= (1 << 20)) { - allreduce6<<<21, 512, 0, stream>>>(buff, scratch, resultBuff, smChannels, rank, nRanksPerNode, worldSize, nelems, - flag++); - } else { - allreduce1<<<24, 1024, 0, stream>>>(buff, resultBuff, smChannels, smOutChannels, rank, worldSize, nelems); - } -#endif - return cudaGetLastError(); -} - -template -__global__ void __launch_bounds__(1024, 1) - allgather6(void* sendbuff, mscclpp::DeviceHandle* smChannels, size_t rank, - [[maybe_unused]] size_t worldSize, size_t nRanksPerNode, size_t nelemsPerGPU) { - const size_t nBlock = gridDim.x; - if (blockIdx.x >= nBlock) return; - - const size_t tid = threadIdx.x + blockIdx.x * blockDim.x; - const size_t lid = tid % WARP_SIZE; - const size_t wid = tid / WARP_SIZE; - - const size_t nThread = blockDim.x * nBlock; - const size_t nWarp = nThread / WARP_SIZE; - const size_t nPeer = nRanksPerNode - 1; - const size_t chanOffset = nPeer * blockIdx.x; - auto smChans = smChannels + chanOffset; - - if (wid < nPeer && lid == 0) { - smChans[wid].relaxedSignal(); - smChans[wid].wait(); - } - __syncthreads(); - const size_t bytesPerGPU = nelemsPerGPU * sizeof(int); - const size_t bytes = bytesPerGPU * nPeer; - size_t unitBytesPerThread; - if (bytes >= nThread * 64) { - unitBytesPerThread = 64; - } else { - unitBytesPerThread = 16; - } - const size_t unitBytesPerWarp = unitBytesPerThread * WARP_SIZE; - const size_t unitBytes = unitBytesPerWarp * nWarp; - const size_t nLoop = bytes / unitBytes; - - if (nLoop > 0) { - // First loop unrolling - const size_t peerIdx = wid % nPeer; - const size_t offset = bytesPerGPU * rank + (wid / nPeer) * unitBytesPerWarp; - if constexpr (IsOutOfPlace) { - char* dst = reinterpret_cast(smChans[peerIdx].dst_); - char* src = 
reinterpret_cast(smChans[peerIdx].src_); - char* buff = reinterpret_cast(sendbuff); - const size_t offsetWithinRank = (wid / nPeer) * unitBytesPerWarp; - smChans[peerIdx].copy<16, false>(src + offset, buff + offsetWithinRank, unitBytesPerWarp, lid, WARP_SIZE); - smChans[peerIdx].copy<16, false>(dst + offset, buff + offsetWithinRank, unitBytesPerWarp, lid, WARP_SIZE); - } else { - smChans[peerIdx].put<16, false>(offset, unitBytesPerWarp, lid, WARP_SIZE); - } - } - - for (size_t i = 1; i < nLoop; ++i) { - const size_t gWid = wid + i * nWarp; - const size_t peerIdx = gWid % nPeer; - const size_t offset = bytesPerGPU * rank + (gWid / nPeer) * unitBytesPerWarp; - if constexpr (IsOutOfPlace) { - char* dst = reinterpret_cast(smChans[peerIdx].dst_); - char* src = reinterpret_cast(smChans[peerIdx].src_); - char* buff = reinterpret_cast(sendbuff); - const size_t offsetWithinRank = (gWid / nPeer) * unitBytesPerWarp; - smChans[peerIdx].copy<16, false>(src + offset, buff + offsetWithinRank, unitBytesPerWarp, lid, WARP_SIZE); - smChans[peerIdx].copy<16, false>(dst + offset, buff + offsetWithinRank, unitBytesPerWarp, lid, WARP_SIZE); - } else { - smChans[peerIdx].put<16, false>(offset, unitBytesPerWarp, lid, WARP_SIZE); - } - } - - if (bytes % unitBytes > 0) { - const size_t gWid = wid + nLoop * nWarp; - const size_t peerIdx = gWid % nPeer; - const size_t offsetWithinRank = (gWid / nPeer) * unitBytesPerWarp; - const size_t offset = bytesPerGPU * rank + offsetWithinRank; - const size_t remainBytes = (offsetWithinRank + unitBytesPerWarp > bytesPerGPU) - ? ((bytesPerGPU > offsetWithinRank) ? (bytesPerGPU - offsetWithinRank) : 0) - : unitBytesPerWarp; - if (remainBytes > 0) { - if constexpr (IsOutOfPlace) { - char* dst = reinterpret_cast(smChans[peerIdx].dst_); - char* src = reinterpret_cast(smChans[peerIdx].src_); - char* buff = reinterpret_cast(sendbuff); - smChans[peerIdx].copy<16, true>(src + offset, buff + offsetWithinRank, remainBytes, lid, WARP_SIZE); - smChans[peerIdx].copy<16, true>(dst + offset, buff + offsetWithinRank, remainBytes, lid, WARP_SIZE); - } else { - smChans[peerIdx].put<16, true>(offset, remainBytes, lid, WARP_SIZE); - } - } - } -} - -template -cudaError_t allgather(T* buff, T* scratch, T* resultBuff, mscclpp::DeviceHandle* smChannels, - int rank, int nRanksPerNode, int worldSize, size_t nelems, cudaStream_t stream) { - allgather6<<<28, 1024, 0, stream>>>((void*)buff, smChannels, rank, worldSize, nRanksPerNode, - nelems * sizeof(T) / sizeof(int)); - return cudaGetLastError(); -} - static size_t ncclTypeSize(ncclDataType_t type) { switch (type) { case ncclInt8: @@ -633,7 +98,7 @@ static size_t ncclTypeSize(ncclDataType_t type) { return 0; } -static mscclpp::Transport getTransport(int rank, int peerRank) { +static mscclpp::Transport getTransport(int, int) { // if (rank / nRanksPerNode == peerRank / nRanksPerNode) { // return mscclpp::Transport::CudaIpc; // } else { @@ -802,12 +267,12 @@ NCCL_API const char* ncclGetErrorString(ncclResult_t result) { } } -NCCL_API const char* ncclGetLastError(ncclComm_t comm) { +NCCL_API const char* ncclGetLastError(ncclComm_t) { // TODO: implement this function return nullptr; } -NCCL_API ncclResult_t ncclCommGetAsyncError(ncclComm_t comm, ncclResult_t* asyncError) { +NCCL_API ncclResult_t ncclCommGetAsyncError(ncclComm_t, ncclResult_t* asyncError) { if (asyncError == nullptr) return ncclInvalidArgument; *asyncError = ncclSuccess; return ncclSuccess; @@ -861,7 +326,7 @@ NCCL_API ncclResult_t ncclBroadcast(const void*, void*, size_t, ncclDataType_t, } 
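A recurring cleanup in this hunk: functions keep their public NCCL signatures but drop the names of parameters they never use (getTransport(int, int), ncclGetLastError(ncclComm_t), and the ignored ncclRedOp_t in ncclAllReduce just below), the portable C++ way to silence unused-parameter warnings. A minimal illustration (sketch, not from the patch):

#include <cstdio>

// An unnamed parameter still takes part in the signature and the call,
// but it cannot be referenced, so the compiler has nothing to warn about.
static const char* lastErrorString(int /*comm*/) { return nullptr; }

int main() {
  const char* msg = lastErrorString(0);
  printf("%s\n", msg ? msg : "(no error recorded)");
  return 0;
}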
NCCL_API ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, - ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream) { + ncclRedOp_t, ncclComm_t comm, cudaStream_t stream) { size_t bytes = count * ncclTypeSize(datatype); if (sendbuff == nullptr || recvbuff == nullptr || bytes == 0 || comm == nullptr) return ncclInvalidArgument; int rank = comm->comm->bootstrap()->getRank(); From f58b6f8164f3332dd7d473f448cf6f69778a9ced Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Wed, 28 Feb 2024 04:20:36 +0000 Subject: [PATCH 75/89] lint --- apps/nccl/rccl_test.py | 37 +++++++++++++++++++------------------ include/mscclpp/gpu.hpp | 6 +++--- test/nccl_api_test.cc | 1 + 3 files changed, 23 insertions(+), 21 deletions(-) diff --git a/apps/nccl/rccl_test.py b/apps/nccl/rccl_test.py index 298d26eee..bca7af815 100644 --- a/apps/nccl/rccl_test.py +++ b/apps/nccl/rccl_test.py @@ -2,39 +2,39 @@ from mpi4py import MPI import torch from cupy.cuda import nccl - + ROOT_RANK = 0 comm = MPI.COMM_WORLD -rank =comm.Get_rank() - +rank = comm.Get_rank() + is_group_root = rank == ROOT_RANK - + world_size = comm.Get_size() - + os.environ["CUDA_VISIBLE_DEVICES"] = str(rank) - -device_type = "cuda" + +device_type = "cuda" torch.cuda.set_device(0) device_index = 0 device = torch.device(type=device_type, index=device_index) - + if is_group_root: id_ = nccl.get_unique_id() else: id_ = None - + ranks = range(world_size) id_, ranks = comm.bcast((id_, ranks), root=0) group = nccl.NcclCommunicator(len(ranks), id_, rank) print(f"{rank=}, {device=}, {group=}") - + M = 1024 N = 4096 K = 2048 -shape_a = (M,K) -shape_b = (K,N) -shape_c = (M,N) - +shape_a = (M, K) +shape_b = (K, N) +shape_c = (M, N) + a = torch.ones(shape_a, device="cuda") b = torch.ones(shape_b, device="cuda") c = torch.mm(a, b) @@ -52,15 +52,16 @@ # print(c) -d = torch.ones((1024*1024,), device="cuda") -e = torch.zeros((8*1024*1024,), device="cuda") -e[rank*1024*1024:(rank+1)*1024*1024] = d +d = torch.ones((1024 * 1024,), device="cuda") +e = torch.zeros((8 * 1024 * 1024,), device="cuda") +e[rank * 1024 * 1024 : (rank + 1) * 1024 * 1024] = d group.allGather( sendbuf=d.data_ptr(), recvbuf=e.data_ptr(), count=d.nelement(), datatype=nccl.NCCL_FLOAT, - stream=torch.cuda.current_stream().cuda_stream) + stream=torch.cuda.current_stream().cuda_stream, +) print(e) diff --git a/include/mscclpp/gpu.hpp b/include/mscclpp/gpu.hpp index a73532afa..29b48a4f7 100644 --- a/include/mscclpp/gpu.hpp +++ b/include/mscclpp/gpu.hpp @@ -6,9 +6,9 @@ #if defined(__HIP_PLATFORM_AMD__) -#include #include -// #include +#include +#include using cudaError_t = hipError_t; using cudaGraph_t = hipGraph_t; @@ -92,8 +92,8 @@ constexpr auto CU_MEM_ACCESS_FLAGS_PROT_READWRITE = hipMemAccessFlagsProtReadWri #else #include -#include #include +#include #if (CUDART_VERSION >= 11000) #include #endif diff --git a/test/nccl_api_test.cc b/test/nccl_api_test.cc index 788993866..4e23a217b 100644 --- a/test/nccl_api_test.cc +++ b/test/nccl_api_test.cc @@ -6,6 +6,7 @@ #include #include + #include "mpi.h" #include "nccl.h" From d72f09f85621e5f7786853175be8c4c5cb0aa876 Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Wed, 28 Feb 2024 11:07:47 +0000 Subject: [PATCH 76/89] allreduce for all message sizes --- apps/nccl/src/allreduce.hpp | 95 +++++++++++++++++++++++++++---------- apps/nccl/src/common.hpp | 2 +- include/mscclpp/gpu.hpp | 2 +- 3 files changed, 72 insertions(+), 27 deletions(-) diff --git a/apps/nccl/src/allreduce.hpp b/apps/nccl/src/allreduce.hpp index 
b4ea6d132..5237706b9 100644 --- a/apps/nccl/src/allreduce.hpp +++ b/apps/nccl/src/allreduce.hpp @@ -355,11 +355,17 @@ __global__ void __launch_bounds__(1024, 1) size_t offsetOfThisBlock = maxNInt4PerBlock * blockIdx.x; size_t nInt4OfThisBlock = maxNInt4PerBlock; size_t nNeededBlocks = (nInt4PerRank + maxNInt4PerBlock - 1) / maxNInt4PerBlock; + constexpr size_t nInt4PerChunk = 1024 * 256 / sizeof(int4); // 256KB if (blockIdx.x >= nNeededBlocks) { nInt4OfThisBlock = 0; } else if (blockIdx.x == nNeededBlocks - 1) { nInt4OfThisBlock = nInt4PerRank - maxNInt4PerBlock * (nNeededBlocks - 1); } + const size_t nItrs = nInt4OfThisBlock / nInt4PerChunk; + const size_t restNInt4 = nInt4OfThisBlock % nInt4PerChunk; + const size_t chunkSizePerRank = nNeededBlocks * nInt4PerChunk; + const size_t blockOffset = nInt4PerChunk * blockIdx.x; + const size_t scratchChunkRankOffset = chunkSizePerRank * rank; __shared__ mscclpp::DeviceHandle channels[NRANKS_PER_NODE - 1]; __shared__ mscclpp::DeviceHandle outChannels[NRANKS_PER_NODE - 1]; @@ -370,33 +376,74 @@ __global__ void __launch_bounds__(1024, 1) } __syncwarp(); - /// Starts allgather - for (size_t idx = offsetOfThisBlock + threadIdx.x; idx < offsetOfThisBlock + nInt4OfThisBlock; idx += blockDim.x) { - for (int peerIdx = 0; peerIdx < nPeer; peerIdx++) { - const int remoteRank = (peerIdx < rank) ? peerIdx : peerIdx + 1; - int4 val = buff4[nInt4PerRank * remoteRank + idx]; - channels[peerIdx].write(nInt4PerRank * rank + idx, val); + // we can use double buffering to hide synchronization overhead + for (size_t itr = 0; itr < nItrs; itr++) { + if (threadIdx.x < static_cast(nPeer)) { + outChannels[threadIdx.x].relaxedSignal(); + outChannels[threadIdx.x].wait(); + } + __syncthreads(); + // Starts allgather + for (size_t idx = threadIdx.x; idx < nInt4PerChunk; idx += blockDim.x) { + for (int peerIdx = 0; peerIdx < nPeer; peerIdx++) { + const int remoteRank = (peerIdx < rank) ? peerIdx : peerIdx + 1; + int4 val = buff4[nInt4PerRank * remoteRank + idx + offsetOfThisBlock]; + channels[peerIdx].write(scratchChunkRankOffset + blockOffset + idx, val); + } } - } - - /// Starts reduce-scatter - if (threadIdx.x < static_cast(nPeer)) { - outChannels[threadIdx.x].relaxedSignal(); - outChannels[threadIdx.x].wait(); + /// Starts reduce-scatter + if (threadIdx.x < static_cast(nPeer)) { + outChannels[threadIdx.x].relaxedSignal(); + outChannels[threadIdx.x].wait(); + } + __syncthreads(); + + for (size_t idx = threadIdx.x; idx < nInt4PerChunk; idx += blockDim.x) { + int4 data = buff4[nInt4PerRank * rank + idx + offsetOfThisBlock]; + for (int peerIdx = 0; peerIdx < nPeer; peerIdx++) { + const int remoteRank = (peerIdx < rank) ? peerIdx : peerIdx + 1; + int4 val = scratch4[chunkSizePerRank * remoteRank + blockOffset + idx]; + data = add_vectors(val, data); + } + resultBuff4[nInt4PerRank * rank + idx + offsetOfThisBlock] = data; + for (int peerIdx = 0; peerIdx < nPeer; peerIdx++) { + outChannels[peerIdx].write(nInt4PerRank * rank + idx + offsetOfThisBlock, data); + } + } + offsetOfThisBlock += nInt4PerChunk; } - __syncthreads(); + if (restNInt4 > 0) { + if (threadIdx.x < static_cast(nPeer)) { + outChannels[threadIdx.x].relaxedSignal(); + outChannels[threadIdx.x].wait(); + } + __syncthreads(); + for (size_t idx = threadIdx.x; idx < restNInt4; idx += blockDim.x) { + for (int peerIdx = 0; peerIdx < nPeer; peerIdx++) { + const int remoteRank = (peerIdx < rank) ? 
+        int4 val = buff4[nInt4PerRank * remoteRank + idx + offsetOfThisBlock];
+        channels[peerIdx].write(scratchChunkRankOffset + blockOffset + idx, val);
+      }
+    }
 
-  for (size_t idx = offsetOfThisBlock + threadIdx.x; idx < offsetOfThisBlock + nInt4OfThisBlock; idx += blockDim.x) {
-    int4 data = buff4[nInt4PerRank * rank + idx];
-    for (int peerIdx = 0; peerIdx < nPeer; peerIdx++) {
-      const int remoteRank = (peerIdx < rank) ? peerIdx : peerIdx + 1;
-      int4 val = scratch4[nInt4PerRank * remoteRank + idx];
-      data = add_vectors(val, data);
+    if (threadIdx.x < static_cast(nPeer)) {
+      outChannels[threadIdx.x].relaxedSignal();
+      outChannels[threadIdx.x].wait();
     }
-    resultBuff4[nInt4PerRank * rank + idx] = data;
-    for (int peerIdx = 0; peerIdx < nPeer; peerIdx++) {
-      outChannels[peerIdx].write(nInt4PerRank * rank + idx, data);
+    __syncthreads();
+
+    for (size_t idx = threadIdx.x; idx < restNInt4; idx += blockDim.x) {
+      int4 data = buff4[nInt4PerRank * rank + idx + offsetOfThisBlock];
+      for (int peerIdx = 0; peerIdx < nPeer; peerIdx++) {
+        const int remoteRank = (peerIdx < rank) ? peerIdx : peerIdx + 1;
+        int4 val = scratch4[chunkSizePerRank * remoteRank + blockOffset + idx];
+        data = add_vectors(val, data);
+      }
+      resultBuff4[nInt4PerRank * rank + idx + offsetOfThisBlock] = data;
+      for (int peerIdx = 0; peerIdx < nPeer; peerIdx++) {
+        outChannels[peerIdx].write(nInt4PerRank * rank + idx + offsetOfThisBlock, data);
+      }
     }
   }
 }
@@ -416,13 +463,11 @@ cudaError_t allreduce(T* buff, T* scratch, T* resultBuff, mscclpp::DeviceHandle<
     }
     allreduce7<<>>(buff, scratch, resultBuff, smChannels, rank, nRanksPerNode,
                      worldSize, nelems, flag++);
-  } else if (sizeof(T) * nelems <= (40 << 20)) {
+  } else {
     int nBlocks = 32;
     int nThreadsPerBlock = 512;
     allreduce8<<>>(buff, scratch, resultBuff, smChannels, smOutChannels, rank,
                      nRanksPerNode, worldSize, nelems);
-  } else {
-    // TODO:
   }
 #else
   if (sizeof(T) * nelems <= (1 << 20)) {
diff --git a/apps/nccl/src/common.hpp b/apps/nccl/src/common.hpp
index 3843f8ea3..9ad538439 100644
--- a/apps/nccl/src/common.hpp
+++ b/apps/nccl/src/common.hpp
@@ -12,6 +12,6 @@
 #endif
 
 constexpr int NRANKS_PER_NODE = 8;
-constexpr int SCRATCH_SIZE = 1024 * 1024 * 40;
+constexpr int SCRATCH_SIZE = 1024 * 1024 * 64;  // 32 thread-blocks * 8 ranks * 256KB = 64MB
 
 #endif  // NCCL_COMMON_HPP_
diff --git a/include/mscclpp/gpu.hpp b/include/mscclpp/gpu.hpp
index 29b48a4f7..01f875099 100644
--- a/include/mscclpp/gpu.hpp
+++ b/include/mscclpp/gpu.hpp
@@ -6,9 +6,9 @@
 
 #if defined(__HIP_PLATFORM_AMD__)
 
+#include
 #include
 #include
-#include
 
 using cudaError_t = hipError_t;
 using cudaGraph_t = hipGraph_t;
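
The chunked allreduce8 above stages every 256KB chunk in a scratch region laid out per rank and per block, and the SCRATCH_SIZE bump to 64MB follows directly from that layout. A small host-side sketch of the arithmetic; the example rank and block indices are assumptions for illustration:

```cpp
#include <cstddef>
#include <cstdio>

// Reproduces allreduce8's scratch layout arithmetic on the host: each peer
// stages one 256KB chunk per thread-block, so the scratch buffer must hold
// nBlocks * nRanks * 256KB. With 32 blocks and 8 ranks that is 64MB, which
// matches the SCRATCH_SIZE comment in common.hpp.
int main() {
  constexpr size_t chunkBytes = 1024 * 256;  // 256KB per block, per rank
  constexpr size_t nBlocks = 32;             // grid size used by allreduce8
  constexpr size_t nRanks = 8;               // NRANKS_PER_NODE
  constexpr size_t chunkBytesPerRank = nBlocks * chunkBytes;
  const size_t rank = 3;      // assumed example rank
  const size_t blockIdx = 5;  // assumed example block
  // Where rank 3's block 5 stages its current chunk inside scratch:
  const size_t offset = chunkBytesPerRank * rank + chunkBytes * blockIdx;
  printf("scratch bytes needed: %zu (= %zu MB)\n", chunkBytesPerRank * nRanks,
         (chunkBytesPerRank * nRanks) >> 20);
  printf("rank %zu, block %zu stages at byte offset %zu\n", rank, blockIdx, offset);
  return 0;
}
```
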
From 96e31bf2260fb69124c671a7cf37662dde677d5b Mon Sep 17 00:00:00 2001
From: Pashupati Kumar <74680231+pash-msft@users.noreply.github.com>
Date: Thu, 7 Mar 2024 23:25:37 -0800
Subject: [PATCH 77/89] Fixed issues found on AMD (#271)

---
 apps/nccl/src/nccl.cu   | 6 ++----
 include/mscclpp/gpu.hpp | 2 +-
 src/fifo.cc             | 1 +
 3 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/apps/nccl/src/nccl.cu b/apps/nccl/src/nccl.cu
index d03e6fdcc..b1fabdf21 100644
--- a/apps/nccl/src/nccl.cu
+++ b/apps/nccl/src/nccl.cu
@@ -147,10 +147,8 @@ static std::shared_ptr> setupSmChannel
                  [](const mscclpp::SmChannel& smChannel) { return mscclpp::deviceHandle(smChannel); });
   std::shared_ptr> ptr =
       mscclpp::allocSharedCuda>(smChannelDeviceHandles.size());
-  mscclpp::AvoidCudaGraphCaptureGuard guard;
-  CUDACHECK(cudaMemcpy(ptr.get(), smChannelDeviceHandles.data(),
-                       sizeof(mscclpp::DeviceHandle) * smChannelDeviceHandles.size(),
-                       cudaMemcpyHostToDevice));
+  mscclpp::memcpyCuda>(ptr.get(), smChannelDeviceHandles.data(),
+                       smChannelDeviceHandles.size(), cudaMemcpyHostToDevice);
   return ptr;
 }
diff --git a/include/mscclpp/gpu.hpp b/include/mscclpp/gpu.hpp
index 01f875099..8e9e17ab5 100644
--- a/include/mscclpp/gpu.hpp
+++ b/include/mscclpp/gpu.hpp
@@ -6,7 +6,7 @@
 
 #if defined(__HIP_PLATFORM_AMD__)
 
-#include
+// #include
 #include
 #include
 
diff --git a/src/fifo.cc b/src/fifo.cc
index 4255bcdcd..592bf7d00 100644
--- a/src/fifo.cc
+++ b/src/fifo.cc
@@ -56,6 +56,7 @@ MSCCLPP_API_CPP void Fifo::pop() {
 MSCCLPP_API_CPP void Fifo::flushTail(bool sync) {
   // Flush the tail to device memory. This is either triggered every ProxyFlushPeriod to make sure that the fifo can
   // make progress even if there is no request mscclppSync. However, mscclppSync type is for flush request.
+  AvoidCudaGraphCaptureGuard cgcGuard;
   MSCCLPP_CUDATHROW(cudaMemcpyAsync(pimpl->tailReplica.get(), &pimpl->hostTail, sizeof(uint64_t),
                                     cudaMemcpyHostToDevice, pimpl->stream));
   if (sync) {
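
The fifo.cc fix above is a pattern worth calling out: a memcpy issued from the host proxy thread can land while another thread is capturing a CUDA/HIP graph, which the runtime rejects. mscclpp's `AvoidCudaGraphCaptureGuard` scopes such operations out of capture. A rough sketch of the RAII idea; the guard body below is an assumption for illustration, not mscclpp's exact implementation:

```cpp
#include <cstdint>
#include <cuda_runtime.h>

// Sketch of an RAII guard around graph capture: exchanging the calling
// thread's capture mode for "relaxed" lets legal operations proceed even
// while another thread is capturing; the destructor swaps the old mode back.
class CaptureGuardSketch {
 public:
  CaptureGuardSketch() { cudaThreadExchangeStreamCaptureMode(&mode_); }
  ~CaptureGuardSketch() { cudaThreadExchangeStreamCaptureMode(&mode_); }

 private:
  cudaStreamCaptureMode mode_ = cudaStreamCaptureModeRelaxed;
};

// Hypothetical usage mirroring Fifo::flushTail: keep the tail copy out of
// any capture that may be in progress on another thread.
void flushTailSketch(uint64_t* devTail, const uint64_t* hostTail, cudaStream_t stream) {
  CaptureGuardSketch guard;
  cudaMemcpyAsync(devTail, hostTail, sizeof(uint64_t), cudaMemcpyHostToDevice, stream);
}
```
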
From 8d02754a00f3e4c0504455457d9cd74abda5fcef Mon Sep 17 00:00:00 2001
From: Binyang Li
Date: Tue, 12 Mar 2024 04:15:31 +0000
Subject: [PATCH 78/89] improve perf

---
 apps/nccl/src/allreduce.hpp | 16 +++++++++-------
 apps/nccl/src/common.hpp    |  2 +-
 2 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/apps/nccl/src/allreduce.hpp b/apps/nccl/src/allreduce.hpp
index 5237706b9..a11b6e8db 100644
--- a/apps/nccl/src/allreduce.hpp
+++ b/apps/nccl/src/allreduce.hpp
@@ -379,13 +379,14 @@
   // we can use double buffering to hide synchronization overhead
   for (size_t itr = 0; itr < nItrs; itr++) {
     if (threadIdx.x < static_cast(nPeer)) {
-      outChannels[threadIdx.x].relaxedSignal();
+      outChannels[threadIdx.x].signal();
       outChannels[threadIdx.x].wait();
     }
     __syncthreads();
     // Starts allgather
     for (size_t idx = threadIdx.x; idx < nInt4PerChunk; idx += blockDim.x) {
-      for (int peerIdx = 0; peerIdx < nPeer; peerIdx++) {
+      for (int i = 0; i < nPeer; i++) {
+        const int peerIdx = (i + blockIdx.x) % nPeer;
         const int remoteRank = (peerIdx < rank) ? peerIdx : peerIdx + 1;
         int4 val = buff4[nInt4PerRank * remoteRank + idx + offsetOfThisBlock];
         channels[peerIdx].write(scratchChunkRankOffset + blockOffset + idx, val);
@@ -394,7 +395,7 @@
     }
     /// Starts reduce-scatter
     if (threadIdx.x < static_cast(nPeer)) {
-      outChannels[threadIdx.x].relaxedSignal();
+      outChannels[threadIdx.x].signal();
       outChannels[threadIdx.x].wait();
     }
     __syncthreads();
@@ -415,12 +416,13 @@
   }
   if (restNInt4 > 0) {
     if (threadIdx.x < static_cast(nPeer)) {
-      outChannels[threadIdx.x].relaxedSignal();
+      outChannels[threadIdx.x].signal();
       outChannels[threadIdx.x].wait();
     }
     __syncthreads();
     for (size_t idx = threadIdx.x; idx < restNInt4; idx += blockDim.x) {
-      for (int peerIdx = 0; peerIdx < nPeer; peerIdx++) {
+      for (int i = 0; i < nPeer; i++) {
+        const int peerIdx = (i + blockIdx.x) % nPeer;
         const int remoteRank = (peerIdx < rank) ? peerIdx : peerIdx + 1;
         int4 val = buff4[nInt4PerRank * remoteRank + idx + offsetOfThisBlock];
         channels[peerIdx].write(scratchChunkRankOffset + blockOffset + idx, val);
@@ -428,7 +430,7 @@
     }
 
     if (threadIdx.x < static_cast(nPeer)) {
-      outChannels[threadIdx.x].relaxedSignal();
+      outChannels[threadIdx.x].signal();
       outChannels[threadIdx.x].wait();
     }
     __syncthreads();
@@ -464,7 +466,7 @@
     allreduce7<<>>(buff, scratch, resultBuff, smChannels, rank, nRanksPerNode,
                      worldSize, nelems, flag++);
   } else {
-    int nBlocks = 32;
+    int nBlocks = 35;
     int nThreadsPerBlock = 512;
     allreduce8<<>>(buff, scratch, resultBuff, smChannels, smOutChannels, rank,
                      nRanksPerNode, worldSize, nelems);
diff --git a/apps/nccl/src/common.hpp b/apps/nccl/src/common.hpp
index 9ad538439..cddc69625 100644
--- a/apps/nccl/src/common.hpp
+++ b/apps/nccl/src/common.hpp
@@ -12,6 +12,6 @@
 #endif
 
 constexpr int NRANKS_PER_NODE = 8;
-constexpr int SCRATCH_SIZE = 1024 * 1024 * 64;  // 32 thread-blocks * 8 ranks * 256KB = 64MB
+constexpr int SCRATCH_SIZE = 1024 * 1024 * 70;  // 35 thread-blocks * 8 ranks * 256KB = 70MB
 
 #endif  // NCCL_COMMON_HPP_

From f81d53d8031ca2e36c4df04bc26253448f81be36 Mon Sep 17 00:00:00 2001
From: Binyang Li
Date: Tue, 12 Mar 2024 07:13:19 +0000
Subject: [PATCH 79/89] perf improve

---
 apps/nccl/src/allreduce.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/apps/nccl/src/allreduce.hpp b/apps/nccl/src/allreduce.hpp
index a11b6e8db..075bd2801 100644
--- a/apps/nccl/src/allreduce.hpp
+++ b/apps/nccl/src/allreduce.hpp
@@ -350,7 +350,7 @@ __global__ void __launch_bounds__(1024, 1)
   int4* resultBuff4 = reinterpret_cast(resultBuff);
 
   // Distribute `nInt4PerRank` across all blocks with the unit size `unitNInt4`
-  constexpr size_t unitNInt4 = 1024;
+  constexpr size_t unitNInt4 = 512;
   const size_t maxNInt4PerBlock = (((nInt4PerRank + gridDim.x - 1) / gridDim.x) + unitNInt4 - 1) / unitNInt4 * unitNInt4;
   size_t offsetOfThisBlock = maxNInt4PerBlock * blockIdx.x;
   size_t nInt4OfThisBlock = maxNInt4PerBlock;
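
The `(i + blockIdx.x) % nPeer` rotation introduced above staggers which peer each thread-block touches first, spreading simultaneous writes across all peer GPUs instead of having every block start on peer 0. A tiny host-side sketch of the resulting visit order; the block count shown is an assumption for illustration:

```cpp
#include <cstdio>

// Prints the order in which each block visits its peers under the rotation
// used by allreduce8. Without the rotation, every block would hit peer 0
// first, serializing traffic onto one remote GPU at a time.
int main() {
  const int nPeer = 7;         // 8 GPUs per node -> 7 peers
  const int nBlocksShown = 4;  // assumed, for illustration
  for (int blockIdx = 0; blockIdx < nBlocksShown; ++blockIdx) {
    printf("block %d visits peers:", blockIdx);
    for (int i = 0; i < nPeer; ++i) {
      printf(" %d", (i + blockIdx) % nPeer);
    }
    printf("\n");
  }
  return 0;
}
```
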
From a075f71f93e6394e55f42ff5d2816a5d8a31145c Mon Sep 17 00:00:00 2001
From: Changho Hwang
Date: Tue, 19 Mar 2024 04:54:41 +0000
Subject: [PATCH 80/89] Fix build dependencies

---
 apps/nccl/CMakeLists.txt                  | 4 ++++
 apps/nccl/test/CMakeLists.txt             | 6 ++++++
 {test => apps/nccl/test}/nccl_api_test.cc | 0
 test/CMakeLists.txt                       | 5 ++---
 4 files changed, 12 insertions(+), 3 deletions(-)
 create mode 100644 apps/nccl/test/CMakeLists.txt
 rename {test => apps/nccl/test}/nccl_api_test.cc (100%)

diff --git a/apps/nccl/CMakeLists.txt b/apps/nccl/CMakeLists.txt
index 2432fe562..5e301fb5e 100644
--- a/apps/nccl/CMakeLists.txt
+++ b/apps/nccl/CMakeLists.txt
@@ -33,3 +33,7 @@ install(TARGETS mscclpp_nccl
     LIBRARY DESTINATION ${INSTALL_PREFIX}/lib)
 install(TARGETS mscclpp_nccl_static
     ARCHIVE DESTINATION ${INSTALL_PREFIX}/lib)
+
+if(BUILD_TESTS)
+  add_subdirectory(test)
+endif()
diff --git a/apps/nccl/test/CMakeLists.txt b/apps/nccl/test/CMakeLists.txt
new file mode 100644
index 000000000..c7cfbaa7d
--- /dev/null
+++ b/apps/nccl/test/CMakeLists.txt
@@ -0,0 +1,6 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT license.
+
+add_executable(nccl_api_test nccl_api_test.cc)
+target_link_libraries(nccl_api_test mscclpp ${GPU_LIBRARIES} ${NUMA_LIBRARIES} ${IBVERBS_LIBRARIES} Threads::Threads MPI::MPI_CXX)
+target_include_directories(nccl_api_test ${PROJECT_SOURCE_DIR}/include ${PROJECT_SOURCE_DIR}/apps/nccl/include)
diff --git a/test/nccl_api_test.cc b/apps/nccl/test/nccl_api_test.cc
similarity index 100%
rename from test/nccl_api_test.cc
rename to apps/nccl/test/nccl_api_test.cc
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 5b236ebfe..0268af1c6 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -3,9 +3,9 @@
 
 find_package(MPI)
 
-set(TEST_LIBS_COMMON mscclpp_nccl mscclpp ${GPU_LIBRARIES} ${NUMA_LIBRARIES} ${IBVERBS_LIBRARIES} Threads::Threads)
+set(TEST_LIBS_COMMON mscclpp ${GPU_LIBRARIES} ${NUMA_LIBRARIES} ${IBVERBS_LIBRARIES} Threads::Threads)
 set(TEST_LIBS_GTEST GTest::gtest_main GTest::gmock_main)
-set(TEST_INC_COMMON PRIVATE ${PROJECT_SOURCE_DIR}/include ${PROJECT_SOURCE_DIR}/apps/nccl/include ${GPU_INCLUDE_DIRS})
+set(TEST_INC_COMMON PRIVATE ${PROJECT_SOURCE_DIR}/include ${GPU_INCLUDE_DIRS})
 set(TEST_INC_INTERNAL PRIVATE ${PROJECT_SOURCE_DIR}/src/include)
 
 if(USE_ROCM)
@@ -24,7 +24,6 @@ endfunction()
 
 add_test_executable(allgather_test_cpp allgather_test_cpp.cu)
 add_test_executable(allgather_test_host_offloading allgather_test_host_offloading.cu)
 add_test_executable(nvls_test nvls_test.cu)
-add_test_executable(nccl_api_test nccl_api_test.cc)
 
 configure_file(run_mpi_test.sh.in run_mpi_test.sh)
From 7aa2ddd96532b1f51d4c9299d0ff339cfc4aece0 Mon Sep 17 00:00:00 2001
From: Changho Hwang
Date: Tue, 19 Mar 2024 05:29:52 +0000
Subject: [PATCH 81/89] Further fix build dependencies

---
 apps/nccl/CMakeLists.txt      | 4 ++--
 apps/nccl/README.md           | 2 +-
 apps/nccl/src/allgather.hpp   | 2 ++
 apps/nccl/src/allreduce.hpp   | 2 ++
 apps/nccl/test/CMakeLists.txt | 6 ++++--
 docs/quickstart.md            | 3 +++
 6 files changed, 14 insertions(+), 5 deletions(-)

diff --git a/apps/nccl/CMakeLists.txt b/apps/nccl/CMakeLists.txt
index 5e301fb5e..c15d43ecb 100644
--- a/apps/nccl/CMakeLists.txt
+++ b/apps/nccl/CMakeLists.txt
@@ -1,11 +1,11 @@
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT license.
 
-file(GLOB_RECURSE SOURCES CONFIGURE_DEPENDS src/nccl.cu)
+file(GLOB_RECURSE SOURCES CONFIGURE_DEPENDS src/*)
 file(GLOB_RECURSE HEADERS CONFIGURE_DEPENDS include/nccl.h)
 
 if(USE_ROCM)
-  set_source_files_properties(src/nccl.cu PROPERTIES LANGUAGE CXX)
+  set_source_files_properties(${SOURCES} PROPERTIES LANGUAGE CXX)
 endif()
 
 add_library(mscclpp_nccl_obj OBJECT)
diff --git a/apps/nccl/README.md b/apps/nccl/README.md
index b6c537ed7..08a807f72 100644
--- a/apps/nccl/README.md
+++ b/apps/nccl/README.md
@@ -3,7 +3,7 @@
 Compile
 
 ```bash
-CXX=/opt/rocm/bin/hipcc cmake -DCMAKE_BUILD_TYPE=Release -DBUILD_APPS_NCCL=ON -DBUILD_PYTHON_BINDINGS=OFF ..
+CXX=/opt/rocm/bin/hipcc cmake -DCMAKE_BUILD_TYPE=Release -DBUILD_APPS_NCCL=ON ..
 make -j
 ```
 
diff --git a/apps/nccl/src/allgather.hpp b/apps/nccl/src/allgather.hpp
index 30a93d898..4e352c070 100644
--- a/apps/nccl/src/allgather.hpp
+++ b/apps/nccl/src/allgather.hpp
@@ -5,7 +5,9 @@
 #define ALLGATHER_HPP_
 
 #include
+#include
 #include
+#include
 #include
 
 #include "common.hpp"
diff --git a/apps/nccl/src/allreduce.hpp b/apps/nccl/src/allreduce.hpp
index 075bd2801..6cd4deb3f 100644
--- a/apps/nccl/src/allreduce.hpp
+++ b/apps/nccl/src/allreduce.hpp
@@ -5,8 +5,10 @@
 #define ALLREDUCE_HPP_
 
 #include
+#include
 #include
 #include
+#include
 #include
 
 #include "common.hpp"
diff --git a/apps/nccl/test/CMakeLists.txt b/apps/nccl/test/CMakeLists.txt
index c7cfbaa7d..025d2db79 100644
--- a/apps/nccl/test/CMakeLists.txt
+++ b/apps/nccl/test/CMakeLists.txt
@@ -1,6 +1,8 @@
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT license.
 
+find_package(MPI)
+
 add_executable(nccl_api_test nccl_api_test.cc)
-target_link_libraries(nccl_api_test mscclpp ${GPU_LIBRARIES} ${NUMA_LIBRARIES} ${IBVERBS_LIBRARIES} Threads::Threads MPI::MPI_CXX)
-target_include_directories(nccl_api_test ${PROJECT_SOURCE_DIR}/include ${PROJECT_SOURCE_DIR}/apps/nccl/include)
+target_link_libraries(nccl_api_test mscclpp mscclpp_nccl ${GPU_LIBRARIES} ${NUMA_LIBRARIES} ${IBVERBS_LIBRARIES} Threads::Threads MPI::MPI_CXX)
+target_include_directories(nccl_api_test PRIVATE ${PROJECT_SOURCE_DIR}/include ${PROJECT_SOURCE_DIR}/apps/nccl/include)
diff --git a/docs/quickstart.md b/docs/quickstart.md
index af1bbe5f3..a30c42032 100644
--- a/docs/quickstart.md
+++ b/docs/quickstart.md
@@ -59,7 +59,10 @@ $ sudo make install/fast
 Python 3.8 or later is required.
 
 ```bash
+# For NVIDIA platforms
 $ python -m pip install .
+# For AMD platforms
+$ CXX=/path/to/hipcc python -m pip install .
 ```
 
 ## Docker Images

From 2dfb5cf039056e50336acb6bdece60a0d220017c Mon Sep 17 00:00:00 2001
From: Changho Hwang
Date: Tue, 19 Mar 2024 05:37:18 +0000
Subject: [PATCH 82/89] Temporary flags

---
 apps/nccl/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/apps/nccl/README.md b/apps/nccl/README.md
index 08a807f72..251fc03af 100644
--- a/apps/nccl/README.md
+++ b/apps/nccl/README.md
@@ -10,5 +10,5 @@ make -j
 Run rccl-tests
 
 ```bash
-mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD="$MSCCLPP_BUILD/libmscclpp.so $MSCCLPP_BUILD/apps/nccl/libmscclpp_nccl.so" -x MSCCLPP_DEBUG=WARN -x MSCCLPP_DEBUG_SUBSYS=ALL -x NCCL_DEBUG=WARN ./build/all_reduce_perf -b 1K -e 256M -f 2 -d half -G 20 -w 10 -n 50
+mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD="$MSCCLPP_BUILD/libmscclpp.so $MSCCLPP_BUILD/apps/nccl/libmscclpp_nccl.so" -x HIP_FORCE_DEV_KERNARG=1 -x HSA_ENABLE_IPC_MODE_LEGACY=1 -x MSCCLPP_DEBUG=WARN -x MSCCLPP_DEBUG_SUBSYS=ALL -x NCCL_DEBUG=WARN ./build/all_reduce_perf -b 1K -e 256M -f 2 -d half -G 20 -w 10 -n 50
 ```
From c685788a29670041ede973c9bd2a473533d489b4 Mon Sep 17 00:00:00 2001
From: Changho Hwang
Date: Tue, 19 Mar 2024 05:43:02 +0000
Subject: [PATCH 83/89] Include mscclpp object in mscclpp_nccl

---
 apps/nccl/CMakeLists.txt | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/apps/nccl/CMakeLists.txt b/apps/nccl/CMakeLists.txt
index c15d43ecb..d11780be1 100644
--- a/apps/nccl/CMakeLists.txt
+++ b/apps/nccl/CMakeLists.txt
@@ -21,10 +21,10 @@ elseif(USE_ROCM)
 endif()
 
 add_library(mscclpp_nccl SHARED)
-target_link_libraries(mscclpp_nccl PUBLIC mscclpp_nccl_obj)
+target_link_libraries(mscclpp_nccl PUBLIC mscclpp_obj mscclpp_nccl_obj)
 set_target_properties(mscclpp_nccl PROPERTIES VERSION ${MSCCLPP_VERSION} SOVERSION ${MSCCLPP_SOVERSION})
 add_library(mscclpp_nccl_static STATIC)
-target_link_libraries(mscclpp_nccl_static PUBLIC mscclpp_nccl_obj)
+target_link_libraries(mscclpp_nccl_static PUBLIC mscclpp_obj mscclpp_nccl_obj)
 set_target_properties(mscclpp_nccl_static PROPERTIES VERSION ${MSCCLPP_VERSION} SOVERSION ${MSCCLPP_SOVERSION})
 
 install(TARGETS mscclpp_nccl_obj

From 84eb0efaab4e98590c9c181be9bd3d8196e80aba Mon Sep 17 00:00:00 2001
From: Changho Hwang
Date: Tue, 19 Mar 2024 05:44:08 +0000
Subject: [PATCH 84/89] Update readme

---
 apps/nccl/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/apps/nccl/README.md b/apps/nccl/README.md
index 251fc03af..62a24d9e2 100644
--- a/apps/nccl/README.md
+++ b/apps/nccl/README.md
@@ -10,5 +10,5 @@ make -j
 Run rccl-tests
 
 ```bash
-mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD="$MSCCLPP_BUILD/libmscclpp.so $MSCCLPP_BUILD/apps/nccl/libmscclpp_nccl.so" -x HIP_FORCE_DEV_KERNARG=1 -x HSA_ENABLE_IPC_MODE_LEGACY=1 -x MSCCLPP_DEBUG=WARN -x MSCCLPP_DEBUG_SUBSYS=ALL -x NCCL_DEBUG=WARN ./build/all_reduce_perf -b 1K -e 256M -f 2 -d half -G 20 -w 10 -n 50
+mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=$MSCCLPP_BUILD/apps/nccl/libmscclpp_nccl.so -x HIP_FORCE_DEV_KERNARG=1 -x HSA_ENABLE_IPC_MODE_LEGACY=1 -x MSCCLPP_DEBUG=WARN -x MSCCLPP_DEBUG_SUBSYS=ALL -x NCCL_DEBUG=WARN ./build/all_reduce_perf -b 1K -e 256M -f 2 -d half -G 20 -w 10 -n 50
 ```
From f3d4a3c2edf28031d4c9d8ec3cfb91dca2dae4a9 Mon Sep 17 00:00:00 2001
From: Changho Hwang
Date: Wed, 20 Mar 2024 04:12:17 +0000
Subject: [PATCH 85/89] minor updates

---
 CMakeLists.txt                | 2 +-
 apps/nccl/CMakeLists.txt      | 2 +-
 python/mscclpp/CMakeLists.txt | 4 ++--
 python/test/CMakeLists.txt    | 4 ++--
 test/CMakeLists.txt           | 2 +-
 5 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 78844191f..b77c7e1b6 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -102,7 +102,7 @@ find_package(Threads REQUIRED)
 
 add_library(mscclpp_obj OBJECT)
 target_include_directories(mscclpp_obj
-    PRIVATE
+    SYSTEM PRIVATE
     ${GPU_INCLUDE_DIRS}
     ${IBVERBS_INCLUDE_DIRS}
     ${NUMA_INCLUDE_DIRS})
diff --git a/apps/nccl/CMakeLists.txt b/apps/nccl/CMakeLists.txt
index d11780be1..33f385da0 100644
--- a/apps/nccl/CMakeLists.txt
+++ b/apps/nccl/CMakeLists.txt
@@ -11,7 +11,7 @@ endif()
 add_library(mscclpp_nccl_obj OBJECT)
 target_sources(mscclpp_nccl_obj PRIVATE ${SOURCES})
 target_sources(mscclpp_nccl_obj PUBLIC FILE_SET HEADERS FILES ${HEADERS})
-target_include_directories(mscclpp_nccl_obj PRIVATE ${GPU_INCLUDE_DIRS} include)
+target_include_directories(mscclpp_nccl_obj PRIVATE include SYSTEM PRIVATE ${GPU_INCLUDE_DIRS})
 target_link_libraries(mscclpp_nccl_obj PRIVATE ${GPU_LIBRARIES} PUBLIC mscclpp_obj)
 set_target_properties(mscclpp_nccl_obj PROPERTIES LINKER_LANGUAGE CXX POSITION_INDEPENDENT_CODE 1 VERSION ${MSCCLPP_VERSION} SOVERSION ${MSCCLPP_SOVERSION})
 if(USE_CUDA)
diff --git a/python/mscclpp/CMakeLists.txt b/python/mscclpp/CMakeLists.txt
index 0fe510c80..bb9eadf32 100644
--- a/python/mscclpp/CMakeLists.txt
+++ b/python/mscclpp/CMakeLists.txt
@@ -9,6 +9,6 @@ FetchContent_MakeAvailable(nanobind)
 file(GLOB_RECURSE SOURCES CONFIGURE_DEPENDS *.cpp)
 nanobind_add_module(mscclpp_py ${SOURCES})
 set_target_properties(mscclpp_py PROPERTIES OUTPUT_NAME _mscclpp)
-target_link_libraries(mscclpp_py PRIVATE ${GPU_LIBRARIES} mscclpp_static)
-target_include_directories(mscclpp_py PRIVATE ${GPU_INCLUDE_DIRS})
+target_link_libraries(mscclpp_py PRIVATE mscclpp_static ${GPU_LIBRARIES})
+target_include_directories(mscclpp_py SYSTEM PRIVATE ${GPU_INCLUDE_DIRS})
 install(TARGETS mscclpp_py LIBRARY DESTINATION .)
diff --git a/python/test/CMakeLists.txt b/python/test/CMakeLists.txt
index cf705841c..be62aea99 100644
--- a/python/test/CMakeLists.txt
+++ b/python/test/CMakeLists.txt
@@ -9,5 +9,5 @@ FetchContent_MakeAvailable(nanobind)
 file(GLOB_RECURSE SOURCES CONFIGURE_DEPENDS *.cpp)
 nanobind_add_module(mscclpp_py_test ${SOURCES})
 set_target_properties(mscclpp_py_test PROPERTIES OUTPUT_NAME _ext)
-target_link_libraries(mscclpp_py_test PRIVATE ${GPU_LIBRARIES} mscclpp_static)
-target_include_directories(mscclpp_py_test PRIVATE ${GPU_INCLUDE_DIRS})
+target_link_libraries(mscclpp_py_test PRIVATE mscclpp_static ${GPU_LIBRARIES})
+target_include_directories(mscclpp_py_test SYSTEM PRIVATE ${GPU_INCLUDE_DIRS})
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 0268af1c6..da47066ea 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -5,7 +5,7 @@ find_package(MPI)
 
 set(TEST_LIBS_COMMON mscclpp ${GPU_LIBRARIES} ${NUMA_LIBRARIES} ${IBVERBS_LIBRARIES} Threads::Threads)
 set(TEST_LIBS_GTEST GTest::gtest_main GTest::gmock_main)
-set(TEST_INC_COMMON PRIVATE ${PROJECT_SOURCE_DIR}/include ${GPU_INCLUDE_DIRS})
+set(TEST_INC_COMMON PRIVATE ${PROJECT_SOURCE_DIR}/include SYSTEM PRIVATE ${GPU_INCLUDE_DIRS})
 set(TEST_INC_INTERNAL PRIVATE ${PROJECT_SOURCE_DIR}/src/include)
 
 if(USE_ROCM)

From ba6a4e9fac7c664693bcb9887917186e840e7ca9 Mon Sep 17 00:00:00 2001
From: Changho Hwang
Date: Fri, 22 Mar 2024 00:40:09 +0000
Subject: [PATCH 86/89] A bug fix

---
 apps/nccl/src/allgather.hpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/apps/nccl/src/allgather.hpp b/apps/nccl/src/allgather.hpp
index 4e352c070..673074857 100644
--- a/apps/nccl/src/allgather.hpp
+++ b/apps/nccl/src/allgather.hpp
@@ -29,9 +29,9 @@ __global__ void __launch_bounds__(1024, 1)
   const size_t chanOffset = nPeer * blockIdx.x;
   auto smChans = smChannels + chanOffset;
 
-  if (wid < nPeer && lid == 0) {
-    smChans[wid].relaxedSignal();
-    smChans[wid].wait();
+  if (threadIdx.x / WARP_SIZE < nPeer && lid == 0) {
+    smChans[threadIdx.x / WARP_SIZE].signal();
+    smChans[threadIdx.x / WARP_SIZE].wait();
   }
   __syncthreads();
   const size_t bytesPerGPU = nelemsPerGPU * sizeof(int);
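
The one-line condition fixed above is subtle: `wid` is computed from the grid-global thread id, so `wid < nPeer` held only for the first few warps of block 0, and no other block ever signaled or waited on its channel slice. The guard needs a block-local warp index. A reduced illustration of the two indexings; the WARP_SIZE value is an assumption (AMD wavefronts are 64 wide):

```cpp
constexpr int WARP_SIZE = 64;  // assumed; matches AMD wavefronts

// Illustrates the indexing bug fixed in allgather6. With blockDim.x = 1024
// and nPeer = 7, the "buggy" predicate is true only for warps of block 0,
// while the "fixed" one is true for the first nPeer warps of every block.
__global__ void indexingSketch(int nPeer, bool* buggyPass, bool* fixedPass) {
  const int tid = threadIdx.x + blockIdx.x * blockDim.x;
  const int globalWarp = tid / WARP_SIZE;         // unique across the grid
  const int localWarp = threadIdx.x / WARP_SIZE;  // restarts at 0 per block
  if (threadIdx.x % WARP_SIZE == 0) {
    buggyPass[globalWarp] = (globalWarp < nPeer);  // only block 0 passes
    fixedPass[globalWarp] = (localWarp < nPeer);   // every block handshakes
  }
}
```
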
From bfb665c2802ba297f5ff679fb7ee8f1cdf4c011c Mon Sep 17 00:00:00 2001
From: Changho Hwang
Date: Fri, 22 Mar 2024 21:23:08 +0000
Subject: [PATCH 87/89] minor fixes

---
 apps/nccl/src/allreduce.hpp | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/apps/nccl/src/allreduce.hpp b/apps/nccl/src/allreduce.hpp
index 6cd4deb3f..437a79557 100644
--- a/apps/nccl/src/allreduce.hpp
+++ b/apps/nccl/src/allreduce.hpp
@@ -164,7 +164,7 @@ __global__ void __launch_bounds__(1024, 1)
     for (int index = 0; index < nPeers; index++) {
       const int remoteRank = index < rank ? index : index + 1;
       mscclpp::LLPacket* dstPkt = (mscclpp::LLPacket*)scratchBuff + remoteRank * nPktsPerRank;
-      uint2 val = dstPkt[idx].read(flag);
+      uint2 val = dstPkt[idx].read(flag, -1);
       data = add_vectors(val, data);
     }
     data = add_vectors(data, src[idx]);
@@ -180,7 +180,7 @@ __global__ void __launch_bounds__(1024, 1)
   const int dstOffset = remoteRank * nPktsPerRank;
   uint2* result = (uint2*)((char*)resultBuff + remoteRank * nelemsPerRank * sizeof(int));
   for (int idx = threadIdx.x + localBlockIdx * blockDim.x; idx < nPktsPerRank; idx += blockDim.x * nBlocksPerPeer) {
-    uint2 data = dstPkt[idx + dstOffset].read(flag);
+    uint2 data = dstPkt[idx + dstOffset].read(flag, -1);
     result[idx].x = data.x;
     result[idx].y = data.y;
   }
@@ -310,7 +310,7 @@ __global__ void __launch_bounds__(1024, 1)
     for (int index = 0; index < nPeers; index++) {
      const int remoteRank = index < rank ? index : index + 1;
       mscclpp::LL8Packet* dstPkt = (mscclpp::LL8Packet*)scratchBuff + remoteRank * nPktsPerRank;
-      uint32_t val = dstPkt[idx].read(flag);
+      uint32_t val = dstPkt[idx].read(flag, -1);
       data = add_vectors(val, data);
     }
     data = add_vectors(data, src[idx]);
@@ -329,13 +329,13 @@ __global__ void __launch_bounds__(1024, 1)
   const int dstOffset = remoteRank * nPktsPerRank;
   uint32_t* result = (uint32_t*)((char*)resultBuff + remoteRank * nelemsPerRank * sizeof(int));
   for (int idx = threadIdx.x + localBlockIdx * blockDim.x; idx < nPktsPerRank; idx += blockDim.x * nBlocksPerPeer) {
-    uint32_t data = dstPkt[idx + dstOffset].read(flag);
+    uint32_t data = dstPkt[idx + dstOffset].read(flag, -1);
     result[idx] = data;
   }
 }
 
 template
-__global__ void __launch_bounds__(1024, 1)
+__global__ void __launch_bounds__(512, 1)
     allreduce8(T* buff, T* scratch, T* resultBuff, mscclpp::DeviceHandle* smChannels,
                mscclpp::DeviceHandle* smOutChannels, int rank, int nRanksPerNode, int worldSize,
                size_t nelems) {

From 36248ef5aa097bf21781a6fdad58d6eae0c2efed Mon Sep 17 00:00:00 2001
From: Changho Hwang
Date: Fri, 22 Mar 2024 21:28:46 +0000
Subject: [PATCH 88/89] WIP allgather sync issue

---
 apps/nccl/src/allgather.hpp | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/apps/nccl/src/allgather.hpp b/apps/nccl/src/allgather.hpp
index 673074857..2afe0bb13 100644
--- a/apps/nccl/src/allgather.hpp
+++ b/apps/nccl/src/allgather.hpp
@@ -12,6 +12,21 @@
 
 #include "common.hpp"
 
+extern __device__ mscclpp::DeviceSyncer deviceSyncer;
+
+__forceinline__ __device__ void world_barrier(mscclpp::DeviceHandle* smChannels) {
+  if (threadIdx.x == 0) {
+    __threadfence_system();
+  }
+  __syncthreads();
+  deviceSyncer.sync(gridDim.x);
+  if (blockIdx.x == 0 && threadIdx.x < 7) {
+    smChannels[threadIdx.x].signal();
+    smChannels[threadIdx.x].wait();
+  }
+  deviceSyncer.sync(gridDim.x);
+}
+
 template
 __global__ void __launch_bounds__(1024, 1)
     allgather6(void* sendbuff, mscclpp::DeviceHandle* smChannels, size_t rank,
@@ -34,6 +49,7 @@ __global__ void __launch_bounds__(1024, 1)
     smChans[threadIdx.x / WARP_SIZE].signal();
     smChans[threadIdx.x / WARP_SIZE].wait();
   }
   __syncthreads();
+  world_barrier(smChannels);
   const size_t bytesPerGPU = nelemsPerGPU * sizeof(int);
@@ -98,6 +114,7 @@ __global__ void __launch_bounds__(1024, 1)
       }
     }
   }
+  world_barrier(smChannels);
 }
 
 template

From fa3d0d22a89b05e3a2ad14cf2a54b5b4f10680e2 Mon Sep 17 00:00:00 2001
From: Changho Hwang
Date: Tue, 26 Mar 2024 03:09:45 +0000
Subject: [PATCH 89/89] Fixed allgather

---
 apps/nccl/src/allgather.hpp | 29 +++++------------------------
 1 file changed, 5 insertions(+), 24 deletions(-)
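
The `world_barrier` experiment above pays for a full grid-wide rendezvous (`deviceSyncer.sync(gridDim.x)` twice, plus a cross-GPU handshake hard-coded to 7 peers) on every call. The fix below drops it: each block handshakes over its own slice of channels instead. That per-block pattern, distilled as a standalone sketch; it assumes `SmChannelDeviceHandle` exposes the `relaxedSignal()`/`wait()` pair used throughout these patches:

```cpp
#include <mscclpp/sm_channel_device.hpp>

// Per-block rendezvous with this block's nPeer channels, as in the fixed
// allgather6 below: the first nPeer threads exchange a relaxed signal with
// their peer, then the whole block synchronizes locally. No grid-wide
// barrier and no cross-block coordination are required.
__device__ void blockRendezvous(mscclpp::SmChannelDeviceHandle* smChans, size_t nPeer) {
  if (threadIdx.x < nPeer) {
    smChans[threadIdx.x].relaxedSignal();  // tell the peer this block arrived
    smChans[threadIdx.x].wait();           // wait for the peer's matching signal
  }
  __syncthreads();  // publish the handshake to all threads in the block
}
```
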
diff --git a/apps/nccl/src/allgather.hpp b/apps/nccl/src/allgather.hpp
index 2afe0bb13..9d8e1fbc7 100644
--- a/apps/nccl/src/allgather.hpp
+++ b/apps/nccl/src/allgather.hpp
@@ -12,44 +12,26 @@
 
 #include "common.hpp"
 
-extern __device__ mscclpp::DeviceSyncer deviceSyncer;
-
-__forceinline__ __device__ void world_barrier(mscclpp::DeviceHandle* smChannels) {
-  if (threadIdx.x == 0) {
-    __threadfence_system();
-  }
-  __syncthreads();
-  deviceSyncer.sync(gridDim.x);
-  if (blockIdx.x == 0 && threadIdx.x < 7) {
-    smChannels[threadIdx.x].signal();
-    smChannels[threadIdx.x].wait();
-  }
-  deviceSyncer.sync(gridDim.x);
-}
-
 template
 __global__ void __launch_bounds__(1024, 1)
     allgather6(void* sendbuff, mscclpp::DeviceHandle* smChannels, size_t rank,
                [[maybe_unused]] size_t worldSize, size_t nRanksPerNode, size_t nelemsPerGPU) {
-  const size_t nBlock = gridDim.x;
-  if (blockIdx.x >= nBlock) return;
-
   const size_t tid = threadIdx.x + blockIdx.x * blockDim.x;
   const size_t lid = tid % WARP_SIZE;
   const size_t wid = tid / WARP_SIZE;
-  const size_t nThread = blockDim.x * nBlock;
+  const size_t nThread = blockDim.x * gridDim.x;
   const size_t nWarp = nThread / WARP_SIZE;
   const size_t nPeer = nRanksPerNode - 1;
   const size_t chanOffset = nPeer * blockIdx.x;
   auto smChans = smChannels + chanOffset;
 
-  if (threadIdx.x / WARP_SIZE < nPeer && lid == 0) {
-    smChans[threadIdx.x / WARP_SIZE].signal();
-    smChans[threadIdx.x / WARP_SIZE].wait();
+  if (threadIdx.x < nPeer) {
+    smChans[threadIdx.x].relaxedSignal();
+    smChans[threadIdx.x].wait();
   }
   __syncthreads();
-  world_barrier(smChannels);
+
   const size_t bytesPerGPU = nelemsPerGPU * sizeof(int);
   const size_t bytes = bytesPerGPU * nPeer;
   size_t unitBytesPerThread;
@@ -114,7 +96,6 @@ __global__ void __launch_bounds__(1024, 1)
       }
     }
   }
-  world_barrier(smChannels);
 }
 
 template