From 20a5b6d31b6bb82da624ee1c48664c0dcbaf9e38 Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Thu, 28 Mar 2024 09:13:42 +0000 Subject: [PATCH 01/51] init --- src/executor/executor.cc | 4 ++++ 1 file changed, 4 insertions(+) create mode 100644 src/executor/executor.cc diff --git a/src/executor/executor.cc b/src/executor/executor.cc new file mode 100644 index 000000000..71705089e --- /dev/null +++ b/src/executor/executor.cc @@ -0,0 +1,4 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. + +// This is used for execute json file generated by msccl scheduler From abd4b3c35822fd9113db32ae741f4b6eb0bc173e Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Fri, 29 Mar 2024 07:34:26 +0000 Subject: [PATCH 02/51] init --- src/executor/execution_kernel.cu | 17 ++++++++ src/executor/execution_plan.cpp | 3 ++ src/executor/executor.cc | 9 ++++ src/include/execution_plan.hpp | 70 ++++++++++++++++++++++++++++++++ src/include/executor.hpp | 38 +++++++++++++++++ 5 files changed, 137 insertions(+) create mode 100644 src/executor/execution_kernel.cu create mode 100644 src/executor/execution_plan.cpp create mode 100644 src/include/execution_plan.hpp create mode 100644 src/include/executor.hpp diff --git a/src/executor/execution_kernel.cu b/src/executor/execution_kernel.cu new file mode 100644 index 000000000..f7b79315e --- /dev/null +++ b/src/executor/execution_kernel.cu @@ -0,0 +1,17 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. + +#include "execution_plan.hpp" + +extern __shared__ mscclpp::DeviceExecutionPlan sharedMem[]; + +__global__ void commnuication_kernel(void* sendbuff, void* recvbuff, void* scratchbuff) { + // read data from shared memory + // 1. get the number of command from shared memory + int nOps = sharedMem->nOperations; + for (int opId= 0; opId < nOps; opId++) { + // 2. get the command + mscclpp::Operation* op = sharedMem->operations + opId; + // 3. execute the command + } +} diff --git a/src/executor/execution_plan.cpp b/src/executor/execution_plan.cpp new file mode 100644 index 000000000..048847f54 --- /dev/null +++ b/src/executor/execution_plan.cpp @@ -0,0 +1,3 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. + diff --git a/src/executor/executor.cc b/src/executor/executor.cc index 71705089e..de9e8ebd7 100644 --- a/src/executor/executor.cc +++ b/src/executor/executor.cc @@ -2,3 +2,12 @@ // Licensed under the MIT license. // This is used for execute json file generated by msccl scheduler + +#include "executor.hpp" + +namespace mscclpp { + +void Executor::Impl::launchKernel() { +} + +} // namespace mscclpp diff --git a/src/include/execution_plan.hpp b/src/include/execution_plan.hpp new file mode 100644 index 000000000..aa964ae1a --- /dev/null +++ b/src/include/execution_plan.hpp @@ -0,0 +1,70 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. + +#ifndef MSCCLPP_EXECUTOR_PLAN_HPP_ +#define MSCCLPP_EXECUTOR_PLAN_HPP_ + +#include +#include +#include + +#include + +namespace mscclpp { + +constexpr int MAX_CHANNEL = 24; +constexpr int MAX_CHANNEL_PER_OPERATION = 8; + +enum class OperationType { + BARRIER, + PUT, + GET, + COPY, + SIGNAL, + WAIT, + FLUSH, + REDUCE, + READ_REDUCE_COPY, + READ_REDUCE_COPY_PUT, +}; + +enum class ChannelType { + SM, + PROXY, +}; + +struct Channels { + mscclpp::DeviceHandle smChannels[MAX_CHANNEL]; + mscclpp::DeviceHandle proxyChannels[MAX_CHANNEL]; +}; + +struct Operation { + OperationType type; + ChannelType channelType; + uint16_t inputChannelIndex[MAX_CHANNEL_PER_OPERATION]; + uint16_t outputChannelIndex[MAX_CHANNEL_PER_OPERATION]; + size_t inputOffset[MAX_CHANNEL_PER_OPERATION]; + size_t outputOffset[MAX_CHANNEL_PER_OPERATION]; + size_t srcOffset; + size_t dstOffset; + size_t size; +}; + +struct DeviceExecutionPlan { + int nSmChannels; + int nProxyChannels; + Channels channels; + int nOperations; + Operation operations[1]; +}; + +class ExectionPlan { + public: + ExectionPlan(); + void loadExecutionPlan(const std::string& json); + ~ExectionPlan(); +}; + +} // namespace mscclpp + +#endif // MSCCLPP_EXECUTOR_PLAN_HPP_ diff --git a/src/include/executor.hpp b/src/include/executor.hpp new file mode 100644 index 000000000..788b09e9e --- /dev/null +++ b/src/include/executor.hpp @@ -0,0 +1,38 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. + +#ifndef MSCCLPP_EXECUTOR_HPP_ +#define MSCCLPP_EXECUTOR_HPP_ + +#include +#include + +#include "execution_plan.hpp" + +namespace mscclpp { + +class Executor { + public: + Executor(); + template + void execute(std::shared_ptr sendbuff, std::shared_ptr recvBuff, size_t sendBuffSize, size_t recvBuffSize, + const ExectionPlan& plan); + ~Executor(); + + private: + struct Impl; + + std::shared_ptr impl_; +}; + +struct Executor::Impl { + Impl(); + void setupCommnucation(void* sendbuff, void* recvBuff, size_t sendBuffSize, size_t recvBuffSize, + const ExectionPlan& plan); + void launchKernel(); + ~Impl(); +}; + +} // namespace mscclpp + +#endif // MSCCLPP_EXECUTOR_HPP_ From c3e0e022f4f467e9ff2e4ccc26b18c141d1c3417 Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Fri, 29 Mar 2024 11:38:31 +0000 Subject: [PATCH 03/51] WIP --- CMakeLists.txt | 6 +++++- src/executor/execution_kernel.cu | 10 ++++++++-- src/executor/execution_plan.cpp | 10 ++++++++++ src/include/execution_plan.hpp | 32 +++++++++++++++++++++++++++----- src/include/executor.hpp | 2 +- 5 files changed, 51 insertions(+), 9 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index ccddb366b..865ab81f5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -99,13 +99,17 @@ find_package(IBVerbs REQUIRED) find_package(NUMA REQUIRED) find_package(Threads REQUIRED) +include(FetchContent) +FetchContent_Declare(json URL https://github.com/nlohmann/json/releases/download/v3.11.3/json.tar.xz) +FetchContent_MakeAvailable(json) + add_library(mscclpp_obj OBJECT) target_include_directories(mscclpp_obj SYSTEM PRIVATE ${GPU_INCLUDE_DIRS} ${IBVERBS_INCLUDE_DIRS} ${NUMA_INCLUDE_DIRS}) -target_link_libraries(mscclpp_obj PRIVATE ${GPU_LIBRARIES} ${NUMA_LIBRARIES} ${IBVERBS_LIBRARIES} Threads::Threads) +target_link_libraries(mscclpp_obj PRIVATE ${GPU_LIBRARIES} ${NUMA_LIBRARIES} ${IBVERBS_LIBRARIES} nlohmann_json::nlohmann_json Threads::Threads) set_target_properties(mscclpp_obj PROPERTIES LINKER_LANGUAGE CXX POSITION_INDEPENDENT_CODE 1 VERSION ${MSCCLPP_VERSION} SOVERSION ${MSCCLPP_SOVERSION}) if(USE_CUDA) target_compile_definitions(mscclpp_obj PRIVATE USE_CUDA) diff --git a/src/executor/execution_kernel.cu b/src/executor/execution_kernel.cu index f7b79315e..cecf0605e 100644 --- a/src/executor/execution_kernel.cu +++ b/src/executor/execution_kernel.cu @@ -5,13 +5,19 @@ extern __shared__ mscclpp::DeviceExecutionPlan sharedMem[]; -__global__ void commnuication_kernel(void* sendbuff, void* recvbuff, void* scratchbuff) { +__global__ void commnuication_kernel(void* sendbuff, void* recvbuff, void* scratchbuff, size_t chunkSize) { // read data from shared memory // 1. get the number of command from shared memory int nOps = sharedMem->nOperations; - for (int opId= 0; opId < nOps; opId++) { + mscclpp::DeviceHandle* smChannel = sharedMem->channels.smChannels; + mscclpp::DeviceHandle* proxyChannel = sharedMem->channels.proxyChannels; + for (int opId = 0; opId < nOps; opId++) { // 2. get the command mscclpp::Operation* op = sharedMem->operations + opId; // 3. execute the command + switch (op->type) { + default: + break; + } } } diff --git a/src/executor/execution_plan.cpp b/src/executor/execution_plan.cpp index 048847f54..1a9278cd2 100644 --- a/src/executor/execution_plan.cpp +++ b/src/executor/execution_plan.cpp @@ -1,3 +1,13 @@ // Copyright (c) Microsoft Corporation. // Licensed under the MIT license. +#include "execution_plan.hpp" + +#include + +namespace mscclpp { +using json = nlohmann::json; +void ExecutionPlan::loadExecutionPlan(std::ifstream& file) { + json obj = json::parse(file); +} +} // namespace mscclpp diff --git a/src/include/execution_plan.hpp b/src/include/execution_plan.hpp index aa964ae1a..368ccdc13 100644 --- a/src/include/execution_plan.hpp +++ b/src/include/execution_plan.hpp @@ -33,6 +33,19 @@ enum class ChannelType { PROXY, }; +enum class BufferType { + INPUT, + OUTPUT, + SCRATCH, +}; + +struct ChannelInfo { + BufferType srcBufferType; + BufferType dstBufferType; + ChannelType channelType; + std::vector connectedPeers; +}; + struct Channels { mscclpp::DeviceHandle smChannels[MAX_CHANNEL]; mscclpp::DeviceHandle proxyChannels[MAX_CHANNEL]; @@ -53,16 +66,25 @@ struct Operation { struct DeviceExecutionPlan { int nSmChannels; int nProxyChannels; - Channels channels; int nOperations; + Channels channels; Operation operations[1]; }; -class ExectionPlan { +class ExecutionPlan { public: - ExectionPlan(); - void loadExecutionPlan(const std::string& json); - ~ExectionPlan(); + ExecutionPlan(); + void loadExecutionPlan(std::ifstream& file); + std::vector getConnectedPeers(int rank); + size_t getScratchSize(size_t inputSize); + std::vector getOperations(int rank, int threadblock); + std::pair getThreadBlockChannelRange(int rank, int threadblock, BufferType srcBufferType, + BufferType dstBufferType, ChannelType channelType); + ~ExecutionPlan(); + + private: + // operations for [rank][threadblock] + std::vector> operations_; }; } // namespace mscclpp diff --git a/src/include/executor.hpp b/src/include/executor.hpp index 788b09e9e..647e55193 100644 --- a/src/include/executor.hpp +++ b/src/include/executor.hpp @@ -28,7 +28,7 @@ class Executor { struct Executor::Impl { Impl(); void setupCommnucation(void* sendbuff, void* recvBuff, size_t sendBuffSize, size_t recvBuffSize, - const ExectionPlan& plan); + const ExecutionPlan& plan); void launchKernel(); ~Impl(); }; From 91d4df203b5887f8a554f9f7c10bed180d31a973 Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Sun, 31 Mar 2024 08:11:14 +0000 Subject: [PATCH 04/51] WIP --- src/executor/executor.cc | 60 +++++++++++++++++++++++++++++++++- src/include/execution_plan.hpp | 11 +++++-- src/include/executor.hpp | 44 ++++++++++++++++++++++--- 3 files changed, 106 insertions(+), 9 deletions(-) diff --git a/src/executor/executor.cc b/src/executor/executor.cc index de9e8ebd7..96889092a 100644 --- a/src/executor/executor.cc +++ b/src/executor/executor.cc @@ -5,9 +5,67 @@ #include "executor.hpp" +namespace { +static const mscclpp::Transport IBs[] = {mscclpp::Transport::IB0, mscclpp::Transport::IB1, mscclpp::Transport::IB2, + mscclpp::Transport::IB3, mscclpp::Transport::IB4, mscclpp::Transport::IB5, + mscclpp::Transport::IB6, mscclpp::Transport::IB7}; +} // namespace + namespace mscclpp { -void Executor::Impl::launchKernel() { +ExecutionContext Executor::Impl::setupExecutionContext(int rank, void* sendbuff, void* recvBuff, size_t sendBuffSize, + size_t recvBuffSize, const ExecutionPlan& plan) { + ExecutionPlanKey key = {sendbuff, recvBuff, sendBuffSize, recvBuffSize, plan.getName()}; + if (this->contexts.find(key) != this->contexts.end()) { + return this->contexts[key]; + } + ExecutionContext context; + size_t scratchBufferSize = plan.getScratchBufferSize(rank, sendBuffSize); + std::shared_ptr scratchBuffer = allocExtSharedCuda(scratchBufferSize); + context.scratchBuffer = scratchBuffer; + + std::vector bufferTypes = plan.getConnectedBufferTypes(rank, ChannelType::SM); + int nranksPerNode = plan.nranksPerNode(); + auto getTransportFlags = [&](std::vector& infos, int rank) -> mscclpp::TransportFlags { + return mscclpp::Transport::CudaIpc; + }; + auto getBufferInfo = [&](BufferType type) { + switch (type) { + case BufferType::INPUT: + return std::make_pair(sendbuff, sendBuffSize); + case BufferType::OUTPUT: + return std::make_pair(recvBuff, recvBuffSize); + case BufferType::SCRATCH: + return std::make_pair((void*)scratchBuffer.get(), scratchBufferSize); + default: + throw std::runtime_error("Invalid buffer type"); + } + }; + auto getConnectedPeers = [&](std::vector& infos) { + std::vector peers; + return peers; + }; + + for (BufferType bufferType : bufferTypes) { + std::vector channelInfos = plan.getChannelInfos(rank, bufferType); + mscclpp::TransportFlags transportFlags = getTransportFlags(channelInfos, rank); + mscclpp::RegisteredMemory memory = + this->comm->registerMemory(getBufferInfo(bufferType).first, getBufferInfo(bufferType).second, transportFlags); + std::vector connectedPeers = getConnectedPeers(channelInfos); + std::vector> remoteRegMemoryFutures; + for (int peer : connectedPeers) { + remoteRegMemoryFutures.push_back(comm->recvMemoryOnSetup(peer, 0)); + comm->sendMemoryOnSetup(memory, peer, 0); + } + comm->setup(); + for (int i = 0; i < remoteRegMemoryFutures.size(); i++) { + context.registeredMemories[{bufferType, connectedPeers[i]}].push_back(remoteRegMemoryFutures[i].get()); + } + } + std::vector smChannelInfos = plan.getChannelInfos(rank, ChannelType::SM); + return context; } +void Executor::Impl::launchKernel() {} + } // namespace mscclpp diff --git a/src/include/execution_plan.hpp b/src/include/execution_plan.hpp index 368ccdc13..66b3297a1 100644 --- a/src/include/execution_plan.hpp +++ b/src/include/execution_plan.hpp @@ -73,10 +73,14 @@ struct DeviceExecutionPlan { class ExecutionPlan { public: - ExecutionPlan(); + ExecutionPlan(std::string name); + std::string getName() const; void loadExecutionPlan(std::ifstream& file); - std::vector getConnectedPeers(int rank); - size_t getScratchSize(size_t inputSize); + int nranksPerNode() const; + std::vector getChannelInfos(int rank, ChannelType channelType) const; + std::vector getChannelInfos(int rank, BufferType bufferType) const; + std::vector getConnectedBufferTypes(int rank, ChannelType channelType) const; + size_t getScratchBufferSize(int rank, size_t inputSize) const; std::vector getOperations(int rank, int threadblock); std::pair getThreadBlockChannelRange(int rank, int threadblock, BufferType srcBufferType, BufferType dstBufferType, ChannelType channelType); @@ -85,6 +89,7 @@ class ExecutionPlan { private: // operations for [rank][threadblock] std::vector> operations_; + std::string name_; }; } // namespace mscclpp diff --git a/src/include/executor.hpp b/src/include/executor.hpp index 647e55193..0b6c379a7 100644 --- a/src/include/executor.hpp +++ b/src/include/executor.hpp @@ -4,8 +4,10 @@ #ifndef MSCCLPP_EXECUTOR_HPP_ #define MSCCLPP_EXECUTOR_HPP_ +#include #include #include +#include #include "execution_plan.hpp" @@ -13,10 +15,10 @@ namespace mscclpp { class Executor { public: - Executor(); + Executor(const std::unordered_map connections); template void execute(std::shared_ptr sendbuff, std::shared_ptr recvBuff, size_t sendBuffSize, size_t recvBuffSize, - const ExectionPlan& plan); + const ExecutionPlan& plan); ~Executor(); private: @@ -25,14 +27,46 @@ class Executor { std::shared_ptr impl_; }; +struct ExecutionContext { + std::unordered_map, std::vector> registeredMemories; + std::vector smChannels; + std::vector proxyChannels; + std::vector deviceExecutionPlans; + std::shared_ptr scratchBuffer; +}; + +struct ExecutionPlanKey { + void* sendBuff; + void* recvBuff; + size_t sendBuffSize; + size_t recvBuffSize; + std::string plan; +}; + struct Executor::Impl { - Impl(); - void setupCommnucation(void* sendbuff, void* recvBuff, size_t sendBuffSize, size_t recvBuffSize, - const ExecutionPlan& plan); + std::unordered_map contexts; + const std::unordered_map connections; + std::shared_ptr comm; + + Impl(const std::unordered_map connections); + ExecutionContext setupExecutionContext(int rank, void* sendbuff, void* recvBuff, size_t sendBuffSize, + size_t recvBuffSize, const ExecutionPlan& plan); + void setupRegisteredMemories(ExecutionContext& context, int rank, const ExecutionPlan& plan); + void setupChannels(ExecutionContext& context, int rank, const ExecutionPlan& plan); void launchKernel(); ~Impl(); }; } // namespace mscclpp +namespace std { +template <> +struct hash { + std::size_t operator()(const mscclpp::ExecutionPlanKey& key) const { + return std::hash()(key.sendBuff) ^ std::hash()(key.recvBuff) ^ std::hash()(key.sendBuffSize) ^ + std::hash()(key.recvBuffSize) ^ std::hash()(key.plan); + } +}; +} // namespace std + #endif // MSCCLPP_EXECUTOR_HPP_ From d6ec1439dba739212a65f358df2da971b198c35f Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Mon, 1 Apr 2024 07:51:33 +0000 Subject: [PATCH 05/51] WIP --- src/executor/executor.cc | 118 +++++++++++++++++++++++++++------ src/include/execution_plan.hpp | 4 +- src/include/executor.hpp | 70 +++++++++++-------- 3 files changed, 141 insertions(+), 51 deletions(-) diff --git a/src/executor/executor.cc b/src/executor/executor.cc index 96889092a..2d0650cec 100644 --- a/src/executor/executor.cc +++ b/src/executor/executor.cc @@ -1,10 +1,10 @@ // Copyright (c) Microsoft Corporation. // Licensed under the MIT license. -// This is used for execute json file generated by msccl scheduler - #include "executor.hpp" +#include + namespace { static const mscclpp::Transport IBs[] = {mscclpp::Transport::IB0, mscclpp::Transport::IB1, mscclpp::Transport::IB2, mscclpp::Transport::IB3, mscclpp::Transport::IB4, mscclpp::Transport::IB5, @@ -13,59 +13,135 @@ static const mscclpp::Transport IBs[] = {mscclpp::Transport::IB0, mscclpp::Trans namespace mscclpp { -ExecutionContext Executor::Impl::setupExecutionContext(int rank, void* sendbuff, void* recvBuff, size_t sendBuffSize, - size_t recvBuffSize, const ExecutionPlan& plan) { - ExecutionPlanKey key = {sendbuff, recvBuff, sendBuffSize, recvBuffSize, plan.getName()}; +ExecutionContext Executor::Impl::setupExecutionContext(int rank, void* sendbuff, void* recvbuff, size_t sendBufferSize, + size_t recvBufferSize, const ExecutionPlan& plan) { + ExecutionContextKey key = {sendbuff, recvbuff, sendBufferSize, recvBufferSize, plan.getName()}; if (this->contexts.find(key) != this->contexts.end()) { return this->contexts[key]; } ExecutionContext context; - size_t scratchBufferSize = plan.getScratchBufferSize(rank, sendBuffSize); + size_t scratchBufferSize = plan.getScratchBufferSize(rank, sendBufferSize); std::shared_ptr scratchBuffer = allocExtSharedCuda(scratchBufferSize); context.scratchBuffer = scratchBuffer; + context.scratchBufferSize = scratchBufferSize; + this->setupRegisteredMemories(context, sendbuff, recvbuff, sendBufferSize, recvBufferSize, rank, plan); + this->setupChannels(context, sendbuff, recvbuff, rank, plan); + return context; +} - std::vector bufferTypes = plan.getConnectedBufferTypes(rank, ChannelType::SM); +void Executor::Impl::setupRegisteredMemories(ExecutionContext& context, void* sendbuff, void* recvbuff, + size_t sendBufferSize, size_t recvBufferSize, int rank, + const ExecutionPlan& plan) { int nranksPerNode = plan.nranksPerNode(); - auto getTransportFlags = [&](std::vector& infos, int rank) -> mscclpp::TransportFlags { - return mscclpp::Transport::CudaIpc; + auto getTransportFlags = [&](std::vector& infos, int rank) { + TransportFlags flags; + for (ChannelInfo& info : infos) { + if (info.channelType == ChannelType::SM) { + flags |= Transport::CudaIpc; + } else if (info.channelType == ChannelType::PROXY) { + flags |= IBs[rank % nranksPerNode]; + } + } + return flags; }; auto getBufferInfo = [&](BufferType type) { switch (type) { case BufferType::INPUT: - return std::make_pair(sendbuff, sendBuffSize); + return std::make_pair(sendbuff, sendBufferSize); case BufferType::OUTPUT: - return std::make_pair(recvBuff, recvBuffSize); + return std::make_pair(recvbuff, recvBufferSize); case BufferType::SCRATCH: - return std::make_pair((void*)scratchBuffer.get(), scratchBufferSize); + return std::make_pair((void*)context.scratchBuffer.get(), context.scratchBufferSize); default: throw std::runtime_error("Invalid buffer type"); } }; auto getConnectedPeers = [&](std::vector& infos) { - std::vector peers; - return peers; + std::set peers; + for (ChannelInfo& info : infos) { + for (int peer : info.connectedPeers) { + peers.insert(peer); + } + } + return std::vector(peers.begin(), peers.end()); }; + std::vector bufferTypes = plan.getConnectedBufferTypes(rank); for (BufferType bufferType : bufferTypes) { std::vector channelInfos = plan.getChannelInfos(rank, bufferType); - mscclpp::TransportFlags transportFlags = getTransportFlags(channelInfos, rank); - mscclpp::RegisteredMemory memory = + TransportFlags transportFlags = getTransportFlags(channelInfos, rank); + RegisteredMemory memory = this->comm->registerMemory(getBufferInfo(bufferType).first, getBufferInfo(bufferType).second, transportFlags); std::vector connectedPeers = getConnectedPeers(channelInfos); std::vector> remoteRegMemoryFutures; for (int peer : connectedPeers) { - remoteRegMemoryFutures.push_back(comm->recvMemoryOnSetup(peer, 0)); comm->sendMemoryOnSetup(memory, peer, 0); + remoteRegMemoryFutures.push_back(comm->recvMemoryOnSetup(peer, 0)); } comm->setup(); for (int i = 0; i < remoteRegMemoryFutures.size(); i++) { - context.registeredMemories[{bufferType, connectedPeers[i]}].push_back(remoteRegMemoryFutures[i].get()); + context.registeredMemories[{bufferType, connectedPeers[i]}] = std::move(remoteRegMemoryFutures[i].get()); + } + } +} + +void Executor::Impl::setupChannels(ExecutionContext& context, void* sendbuff, void* recvbuff, size_t sendBufferSize, + int rank, const ExecutionPlan& plan) { + const auto channelTypes = {ChannelType::SM, ChannelType::PROXY}; + std::vector> smSemaphores; + std::vector proxySemaphores; + for (ChannelType channelType : channelTypes) { + std::vector channelInfos = plan.getChannelInfos(rank, channelType); + for (ChannelInfo& info : channelInfos) { + for (int peer : info.connectedPeers) { + if (channelType == ChannelType::SM) { + smSemaphores.push_back( + std::make_shared(*this->comm, this->connections.at(peer))); + } else if (channelType == ChannelType::PROXY) { + proxySemaphores.push_back(this->proxyService->buildAndAddSemaphore(*this->comm, this->connections.at(peer))); + } + } + } + } + this->comm->setup(); + context.smSemaphores = std::move(smSemaphores); + context.proxySemaphores = std::move(proxySemaphores); + + auto getBuffer = [&](BufferType type) { + switch (type) { + case BufferType::INPUT: + return sendbuff; + case BufferType::OUTPUT: + return recvbuff; + case BufferType::SCRATCH: + return (void*)context.scratchBuffer.get(); + default: + throw std::runtime_error("Invalid buffer type"); + } + }; + for (ChannelType channelType : channelTypes) { + std::vector channelInfos = plan.getChannelInfos(rank, channelType); + int index = 0; + for (ChannelInfo& info : channelInfos) { + void* src = getBuffer(info.srcBufferType); + void* dst = getBuffer(info.dstBufferType); + TransportFlags transport = context.registeredMemories.begin()->second.transports(); + RegisteredMemory localMemory = this->comm->registerMemory(src, sendBufferSize, transport); + for (int peer : info.connectedPeers) { + if (channelType == ChannelType::SM) { + context.smChannels.emplace_back(smSemaphores[index], context.registeredMemories[{info.dstBufferType, peer}], + src, nullptr); + } else if (channelType == ChannelType::PROXY) { + context.proxyChannels.emplace_back( + this->proxyService->proxyChannel(proxySemaphores[index]), + this->proxyService->addMemory(context.registeredMemories[{info.dstBufferType, peer}]), + this->proxyService->addMemory(localMemory)); + } + } } } - std::vector smChannelInfos = plan.getChannelInfos(rank, ChannelType::SM); - return context; } -void Executor::Impl::launchKernel() {} +void Executor::Impl::launchKernel(ExecutionContext& context) {} } // namespace mscclpp diff --git a/src/include/execution_plan.hpp b/src/include/execution_plan.hpp index 66b3297a1..8b4342c8f 100644 --- a/src/include/execution_plan.hpp +++ b/src/include/execution_plan.hpp @@ -48,7 +48,7 @@ struct ChannelInfo { struct Channels { mscclpp::DeviceHandle smChannels[MAX_CHANNEL]; - mscclpp::DeviceHandle proxyChannels[MAX_CHANNEL]; + mscclpp::DeviceHandle proxyChannels[MAX_CHANNEL]; }; struct Operation { @@ -79,7 +79,7 @@ class ExecutionPlan { int nranksPerNode() const; std::vector getChannelInfos(int rank, ChannelType channelType) const; std::vector getChannelInfos(int rank, BufferType bufferType) const; - std::vector getConnectedBufferTypes(int rank, ChannelType channelType) const; + std::vector getConnectedBufferTypes(int rank) const; size_t getScratchBufferSize(int rank, size_t inputSize) const; std::vector getOperations(int rank, int threadblock); std::pair getThreadBlockChannelRange(int rank, int threadblock, BufferType srcBufferType, diff --git a/src/include/executor.hpp b/src/include/executor.hpp index 0b6c379a7..66518388a 100644 --- a/src/include/executor.hpp +++ b/src/include/executor.hpp @@ -12,7 +12,33 @@ #include "execution_plan.hpp" namespace mscclpp { +struct ExecutionContextKey { + void* sendBuff; + void* recvBuff; + size_t sendBuffSize; + size_t recvBuffSize; + std::string plan; +}; +} // namespace mscclpp + +namespace std { +template <> +struct hash> { + std::size_t operator()(const std::pair& key) const { + return std::hash()(key.second) ^ std::hash()(static_cast(key.first)); + } +}; +template <> +struct hash { + std::size_t operator()(const mscclpp::ExecutionContextKey& key) const { + return std::hash()(key.sendBuff) ^ std::hash()(key.recvBuff) ^ std::hash()(key.sendBuffSize) ^ + std::hash()(key.recvBuffSize) ^ std::hash()(key.plan); + } +}; +} // namespace std + +namespace mscclpp { class Executor { public: Executor(const std::unordered_map connections); @@ -28,45 +54,33 @@ class Executor { }; struct ExecutionContext { - std::unordered_map, std::vector> registeredMemories; + std::unordered_map, mscclpp::RegisteredMemory> registeredMemories; + std::vector> smSemaphores; + std::vector proxySemaphores; std::vector smChannels; - std::vector proxyChannels; + std::vector proxyChannels; std::vector deviceExecutionPlans; std::shared_ptr scratchBuffer; -}; - -struct ExecutionPlanKey { - void* sendBuff; - void* recvBuff; - size_t sendBuffSize; - size_t recvBuffSize; - std::string plan; + size_t scratchBufferSize; }; struct Executor::Impl { - std::unordered_map contexts; - const std::unordered_map connections; + std::unordered_map contexts; + const std::unordered_map> connections; std::shared_ptr comm; + std::shared_ptr proxyService; Impl(const std::unordered_map connections); - ExecutionContext setupExecutionContext(int rank, void* sendbuff, void* recvBuff, size_t sendBuffSize, - size_t recvBuffSize, const ExecutionPlan& plan); - void setupRegisteredMemories(ExecutionContext& context, int rank, const ExecutionPlan& plan); - void setupChannels(ExecutionContext& context, int rank, const ExecutionPlan& plan); - void launchKernel(); - ~Impl(); + ExecutionContext setupExecutionContext(int rank, void* sendbuff, void* recvbuff, size_t sendBufferSize, + size_t recvBufferSize, const ExecutionPlan& plan); + void setupRegisteredMemories(ExecutionContext& context, void* sendbuff, void* recvbuff, size_t sendBufferSize, + size_t recvBufferSize, int rank, const ExecutionPlan& plan); + void setupChannels(ExecutionContext& context, void* sendbuff, void* recvbuff, size_t sendBufferSize, int rank, + const ExecutionPlan& plan); + void launchKernel(ExecutionContext& context); + ~Impl() = default; }; } // namespace mscclpp -namespace std { -template <> -struct hash { - std::size_t operator()(const mscclpp::ExecutionPlanKey& key) const { - return std::hash()(key.sendBuff) ^ std::hash()(key.recvBuff) ^ std::hash()(key.sendBuffSize) ^ - std::hash()(key.recvBuffSize) ^ std::hash()(key.plan); - } -}; -} // namespace std - #endif // MSCCLPP_EXECUTOR_HPP_ From 3fdd602c7a79d30f71099dfd540d7fd758668247 Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Mon, 1 Apr 2024 09:45:29 +0000 Subject: [PATCH 06/51] WIP --- src/executor/execution_plan.cpp | 68 ++++++++++++++++++++++++++++++++- src/executor/executor.cc | 26 +++++++++++-- src/include/execution_plan.hpp | 11 ++++-- src/include/executor.hpp | 16 ++++---- 4 files changed, 103 insertions(+), 18 deletions(-) diff --git a/src/executor/execution_plan.cpp b/src/executor/execution_plan.cpp index 1a9278cd2..a4fa11d25 100644 --- a/src/executor/execution_plan.cpp +++ b/src/executor/execution_plan.cpp @@ -5,9 +5,75 @@ #include +namespace { +template +std::vector filter(const std::vector& vec, Predicate pred) { + std::vector filtered; + std::copy_if(vec.begin(), vec.end(), std::back_inserter(filtered), pred); + return filtered; +} +} // namespace + namespace mscclpp { using json = nlohmann::json; + +ExecutionPlan::ExecutionPlan(std::ifstream& file) { this->loadExecutionPlan(file); } + +std::string ExecutionPlan::getName() const { return this->name_; } + +int ExecutionPlan::nranksPerNode() const { return this->nranksPerNode_; } + +std::vector ExecutionPlan::getChannelInfos(int rank, ChannelType channelType) const { + auto pred = [channelType](const ChannelInfo& info) { return info.channelType == channelType; }; + return filter(this->channelInfos_.at(rank), pred); +} + +std::vector ExecutionPlan::getChannelInfos(int rank, BufferType dstBufferType) const { + auto pred = [dstBufferType](const ChannelInfo& info) { return info.dstBufferType == dstBufferType; }; + return filter(this->channelInfos_.at(rank), pred); +} + void ExecutionPlan::loadExecutionPlan(std::ifstream& file) { - json obj = json::parse(file); + auto convertToBufferType = [](const std::string& str) { + if (str == "input") { + return BufferType::INPUT; + } else if (str == "output") { + return BufferType::OUTPUT; + } else if (str == "scratch") { + return BufferType::SCRATCH; + } else { + throw std::runtime_error("Invalid buffer type"); + } + }; + auto convertToChannelType = [](const std::string& str) { + if (str == "sm") { + return ChannelType::SM; + } else if (str == "proxy") { + return ChannelType::PROXY; + } else { + throw std::runtime_error("Invalid channel type"); + } + }; + + json obj = json::parse(file); + this->name_ = obj["name"]; + this->nranksPerNode_ = obj["nranksPerNode"]; + auto gpus = obj["gpus"]; + for (const auto& gpu : gpus) { + int rank = gpu["rank"]; + std::vector channelInfos; + for (const auto& channel : gpu["channels"]) { + ChannelInfo info; + info.srcBufferType = convertToBufferType(channel["srcBuffer"]); + info.dstBufferType = convertToBufferType(channel["dstBuffer"]); + info.channelType = convertToChannelType(channel["type"]); + for (const auto& peer : channel["connectedTo"]) { + info.connectedPeers.push_back(peer); + } + channelInfos.push_back(info); + } + this->channelInfos_[rank] = channelInfos; + } } + } // namespace mscclpp diff --git a/src/executor/executor.cc b/src/executor/executor.cc index 2d0650cec..f17808fa7 100644 --- a/src/executor/executor.cc +++ b/src/executor/executor.cc @@ -13,6 +13,22 @@ static const mscclpp::Transport IBs[] = {mscclpp::Transport::IB0, mscclpp::Trans namespace mscclpp { +Executor::Executor(std::shared_ptr comm, const std::unordered_map connections) + : impl_(std::make_shared(comm, connections)) {} + +void Executor::execute(void* sendbuff, void* recvBuff, size_t sendBuffSize, size_t recvBuffSize, + const ExecutionPlan& plan) { + ExecutionContext context = + this->impl_->setupExecutionContext(0, sendbuff, recvBuff, sendBuffSize, recvBuffSize, plan); + this->impl_->launchKernel(context); +} + +Executor::Impl::Impl(std::shared_ptr comm, + const std::unordered_map> connections) + : comm(comm), connections(connections) { + this->proxyService = std::make_shared(); +} + ExecutionContext Executor::Impl::setupExecutionContext(int rank, void* sendbuff, void* recvbuff, size_t sendBufferSize, size_t recvBufferSize, const ExecutionPlan& plan) { ExecutionContextKey key = {sendbuff, recvbuff, sendBufferSize, recvBufferSize, plan.getName()}; @@ -25,7 +41,7 @@ ExecutionContext Executor::Impl::setupExecutionContext(int rank, void* sendbuff, context.scratchBuffer = scratchBuffer; context.scratchBufferSize = scratchBufferSize; this->setupRegisteredMemories(context, sendbuff, recvbuff, sendBufferSize, recvBufferSize, rank, plan); - this->setupChannels(context, sendbuff, recvbuff, rank, plan); + this->setupChannels(context, sendbuff, recvbuff, sendBufferSize, rank, plan); return context; } @@ -95,8 +111,7 @@ void Executor::Impl::setupChannels(ExecutionContext& context, void* sendbuff, vo for (ChannelInfo& info : channelInfos) { for (int peer : info.connectedPeers) { if (channelType == ChannelType::SM) { - smSemaphores.push_back( - std::make_shared(*this->comm, this->connections.at(peer))); + smSemaphores.push_back(std::make_shared(*this->comm, this->connections.at(peer))); } else if (channelType == ChannelType::PROXY) { proxySemaphores.push_back(this->proxyService->buildAndAddSemaphore(*this->comm, this->connections.at(peer))); } @@ -142,6 +157,9 @@ void Executor::Impl::setupChannels(ExecutionContext& context, void* sendbuff, vo } } -void Executor::Impl::launchKernel(ExecutionContext& context) {} +void Executor::Impl::launchKernel(ExecutionContext& context) { + // Need to change to use flush function and make sure the proxy service will get the latest data. + this->proxyService->startProxy(); +} } // namespace mscclpp diff --git a/src/include/execution_plan.hpp b/src/include/execution_plan.hpp index 8b4342c8f..220cda9a6 100644 --- a/src/include/execution_plan.hpp +++ b/src/include/execution_plan.hpp @@ -7,8 +7,8 @@ #include #include #include - #include +#include namespace mscclpp { @@ -73,9 +73,8 @@ struct DeviceExecutionPlan { class ExecutionPlan { public: - ExecutionPlan(std::string name); + ExecutionPlan(std::ifstream& file); std::string getName() const; - void loadExecutionPlan(std::ifstream& file); int nranksPerNode() const; std::vector getChannelInfos(int rank, ChannelType channelType) const; std::vector getChannelInfos(int rank, BufferType bufferType) const; @@ -84,12 +83,16 @@ class ExecutionPlan { std::vector getOperations(int rank, int threadblock); std::pair getThreadBlockChannelRange(int rank, int threadblock, BufferType srcBufferType, BufferType dstBufferType, ChannelType channelType); - ~ExecutionPlan(); + ~ExecutionPlan() = default; private: + void loadExecutionPlan(std::ifstream& file); + // operations for [rank][threadblock] std::vector> operations_; + std::unordered_map> channelInfos_; std::string name_; + int nranksPerNode_; }; } // namespace mscclpp diff --git a/src/include/executor.hpp b/src/include/executor.hpp index 66518388a..88469416d 100644 --- a/src/include/executor.hpp +++ b/src/include/executor.hpp @@ -41,11 +41,9 @@ struct hash { namespace mscclpp { class Executor { public: - Executor(const std::unordered_map connections); - template - void execute(std::shared_ptr sendbuff, std::shared_ptr recvBuff, size_t sendBuffSize, size_t recvBuffSize, - const ExecutionPlan& plan); - ~Executor(); + Executor(std::shared_ptr comm, const std::unordered_map connections); + void execute(void* sendbuff, void* recvBuff, size_t sendBuffSize, size_t recvBuffSize, const ExecutionPlan& plan); + ~Executor() = default; private: struct Impl; @@ -65,12 +63,12 @@ struct ExecutionContext { }; struct Executor::Impl { + std::shared_ptr comm; + const std::unordered_map> connections; + std::shared_ptr proxyService; std::unordered_map contexts; - const std::unordered_map> connections; - std::shared_ptr comm; - std::shared_ptr proxyService; - Impl(const std::unordered_map connections); + Impl(std::shared_ptr comm, const std::unordered_map> connections); ExecutionContext setupExecutionContext(int rank, void* sendbuff, void* recvbuff, size_t sendBufferSize, size_t recvBufferSize, const ExecutionPlan& plan); void setupRegisteredMemories(ExecutionContext& context, void* sendbuff, void* recvbuff, size_t sendBufferSize, From bf681b327f8f15f2840cb3d1cf8f7ab34c51c1a4 Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Mon, 1 Apr 2024 11:31:40 +0000 Subject: [PATCH 07/51] compiled --- include/mscclpp/executor.hpp | 40 ++++++++ src/executor/execution_kernel.cu | 40 ++++---- .../{execution_plan.cpp => execution_plan.cc} | 36 +++++--- src/executor/executor.cc | 92 ++++++++++++++++--- src/include/execution_plan.hpp | 20 ++-- src/include/executor.hpp | 84 ----------------- 6 files changed, 169 insertions(+), 143 deletions(-) create mode 100644 include/mscclpp/executor.hpp rename src/executor/{execution_plan.cpp => execution_plan.cc} (60%) delete mode 100644 src/include/executor.hpp diff --git a/include/mscclpp/executor.hpp b/include/mscclpp/executor.hpp new file mode 100644 index 000000000..c9f7e7f59 --- /dev/null +++ b/include/mscclpp/executor.hpp @@ -0,0 +1,40 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. + +#ifndef MSCCLPP_EXECUTOR_HPP_ +#define MSCCLPP_EXECUTOR_HPP_ + +#include +#include +#include + +namespace mscclpp { + +class ExecutionPlan { + public: + ExecutionPlan(std::ifstream& file); + ~ExecutionPlan() = default; + + private: + struct Impl; + std::shared_ptr impl_; + + friend class Executor; +}; + +class Executor { + public: + Executor(std::shared_ptr comm, const std::unordered_map> connections); + Executor(const Executor&) = delete; + Executor& operator=(const Executor&) = delete; + ~Executor() = default; + + void execute(void* sendbuff, void* recvBuff, size_t sendBuffSize, size_t recvBuffSize, const ExecutionPlan& plan); + + private: + struct Impl; + std::unique_ptr impl_; +}; +} // namespace mscclpp + +#endif // MSCCLPP_EXECUTOR_HPP_ diff --git a/src/executor/execution_kernel.cu b/src/executor/execution_kernel.cu index cecf0605e..6b467dbcb 100644 --- a/src/executor/execution_kernel.cu +++ b/src/executor/execution_kernel.cu @@ -1,23 +1,23 @@ -// Copyright (c) Microsoft Corporation. -// Licensed under the MIT license. +// // Copyright (c) Microsoft Corporation. +// // Licensed under the MIT license. -#include "execution_plan.hpp" +// #include "execution_plan.hpp" -extern __shared__ mscclpp::DeviceExecutionPlan sharedMem[]; +// extern __shared__ mscclpp::DeviceExecutionPlan sharedMem[]; -__global__ void commnuication_kernel(void* sendbuff, void* recvbuff, void* scratchbuff, size_t chunkSize) { - // read data from shared memory - // 1. get the number of command from shared memory - int nOps = sharedMem->nOperations; - mscclpp::DeviceHandle* smChannel = sharedMem->channels.smChannels; - mscclpp::DeviceHandle* proxyChannel = sharedMem->channels.proxyChannels; - for (int opId = 0; opId < nOps; opId++) { - // 2. get the command - mscclpp::Operation* op = sharedMem->operations + opId; - // 3. execute the command - switch (op->type) { - default: - break; - } - } -} +// __global__ void commnuication_kernel(void* sendbuff, void* recvbuff, void* scratchbuff, size_t chunkSize) { +// // read data from shared memory +// // 1. get the number of command from shared memory +// int nOps = sharedMem->nOperations; +// mscclpp::DeviceHandle* smChannel = sharedMem->channels.smChannels; +// mscclpp::DeviceHandle* proxyChannel = sharedMem->channels.proxyChannels; +// for (int opId = 0; opId < nOps; opId++) { +// // 2. get the command +// mscclpp::Operation* op = sharedMem->operations + opId; +// // 3. execute the command +// switch (op->type) { +// default: +// break; +// } +// } +// } diff --git a/src/executor/execution_plan.cpp b/src/executor/execution_plan.cc similarity index 60% rename from src/executor/execution_plan.cpp rename to src/executor/execution_plan.cc index a4fa11d25..5fe314cfc 100644 --- a/src/executor/execution_plan.cpp +++ b/src/executor/execution_plan.cc @@ -3,6 +3,7 @@ #include "execution_plan.hpp" +#include #include namespace { @@ -17,23 +18,30 @@ std::vector filter(const std::vector& vec, Predicate pred) { namespace mscclpp { using json = nlohmann::json; -ExecutionPlan::ExecutionPlan(std::ifstream& file) { this->loadExecutionPlan(file); } +ExecutionPlan::Impl::Impl(std::ifstream& file) { this->loadExecutionPlan(file); } -std::string ExecutionPlan::getName() const { return this->name_; } - -int ExecutionPlan::nranksPerNode() const { return this->nranksPerNode_; } - -std::vector ExecutionPlan::getChannelInfos(int rank, ChannelType channelType) const { +std::vector ExecutionPlan::Impl::getChannelInfos(int rank, ChannelType channelType) const { auto pred = [channelType](const ChannelInfo& info) { return info.channelType == channelType; }; - return filter(this->channelInfos_.at(rank), pred); + return filter(this->channelInfos.at(rank), pred); } - -std::vector ExecutionPlan::getChannelInfos(int rank, BufferType dstBufferType) const { +std::vector ExecutionPlan::Impl::getChannelInfos(int rank, BufferType dstBufferType) const { auto pred = [dstBufferType](const ChannelInfo& info) { return info.dstBufferType == dstBufferType; }; - return filter(this->channelInfos_.at(rank), pred); + return filter(this->channelInfos.at(rank), pred); +} + +std::vector ExecutionPlan::Impl::getConnectedBufferTypes(int rank) const { + return std::vector(); +} +size_t ExecutionPlan::Impl::getScratchBufferSize(int rank, size_t inputSize) const { return 0; }; +std::vector ExecutionPlan::Impl::getOperations(int rank, int threadblock) { + return std::vector(); +} +std::pair ExecutionPlan::Impl::getThreadBlockChannelRange(int rank, int threadblock, BufferType srcBufferType, + BufferType dstBufferType, ChannelType channelType) { + return std::make_pair(0, 0); } -void ExecutionPlan::loadExecutionPlan(std::ifstream& file) { +void ExecutionPlan::Impl::loadExecutionPlan(std::ifstream& file) { auto convertToBufferType = [](const std::string& str) { if (str == "input") { return BufferType::INPUT; @@ -56,8 +64,8 @@ void ExecutionPlan::loadExecutionPlan(std::ifstream& file) { }; json obj = json::parse(file); - this->name_ = obj["name"]; - this->nranksPerNode_ = obj["nranksPerNode"]; + this->name = obj["name"]; + this->nranksPerNode = obj["nranksPerNode"]; auto gpus = obj["gpus"]; for (const auto& gpu : gpus) { int rank = gpu["rank"]; @@ -72,7 +80,7 @@ void ExecutionPlan::loadExecutionPlan(std::ifstream& file) { } channelInfos.push_back(info); } - this->channelInfos_[rank] = channelInfos; + this->channelInfos[rank] = channelInfos; } } diff --git a/src/executor/executor.cc b/src/executor/executor.cc index f17808fa7..7ae13dd94 100644 --- a/src/executor/executor.cc +++ b/src/executor/executor.cc @@ -1,10 +1,45 @@ // Copyright (c) Microsoft Corporation. // Licensed under the MIT license. -#include "executor.hpp" - +#include +#include +#include #include +#include "execution_plan.hpp" + +namespace mscclpp { +struct ExecutionContextKey { + void* sendBuff; + void* recvBuff; + size_t sendBuffSize; + size_t recvBuffSize; + std::string plan; + + bool operator==(const ExecutionContextKey& other) const { + return sendBuff == other.sendBuff && recvBuff == other.recvBuff && sendBuffSize == other.sendBuffSize && + recvBuffSize == other.recvBuffSize && plan == other.plan; + } +}; +} // namespace mscclpp + +namespace std { +template <> +struct hash> { + std::size_t operator()(const std::pair& key) const { + return std::hash()(key.second) ^ std::hash()(static_cast(key.first)); + } +}; + +template <> +struct hash { + std::size_t operator()(const mscclpp::ExecutionContextKey& key) const { + return std::hash()(key.sendBuff) ^ std::hash()(key.recvBuff) ^ std::hash()(key.sendBuffSize) ^ + std::hash()(key.recvBuffSize) ^ std::hash()(key.plan); + } +}; +} // namespace std + namespace { static const mscclpp::Transport IBs[] = {mscclpp::Transport::IB0, mscclpp::Transport::IB1, mscclpp::Transport::IB2, mscclpp::Transport::IB3, mscclpp::Transport::IB4, mscclpp::Transport::IB5, @@ -13,8 +48,37 @@ static const mscclpp::Transport IBs[] = {mscclpp::Transport::IB0, mscclpp::Trans namespace mscclpp { -Executor::Executor(std::shared_ptr comm, const std::unordered_map connections) - : impl_(std::make_shared(comm, connections)) {} +struct ExecutionContext { + std::unordered_map, mscclpp::RegisteredMemory> registeredMemories; + std::vector> smSemaphores; + std::vector proxySemaphores; + std::vector smChannels; + std::vector proxyChannels; + std::vector deviceExecutionPlans; + std::shared_ptr scratchBuffer; + size_t scratchBufferSize; +}; + +struct Executor::Impl { + std::shared_ptr comm; + const std::unordered_map> connections; + std::shared_ptr proxyService; + std::unordered_map contexts; + + Impl(std::shared_ptr comm, const std::unordered_map> connections); + ExecutionContext setupExecutionContext(int rank, void* sendbuff, void* recvbuff, size_t sendBufferSize, + size_t recvBufferSize, const ExecutionPlan& plan); + void setupRegisteredMemories(ExecutionContext& context, void* sendbuff, void* recvbuff, size_t sendBufferSize, + size_t recvBufferSize, int rank, const ExecutionPlan& plan); + void setupChannels(ExecutionContext& context, void* sendbuff, void* recvbuff, size_t sendBufferSize, int rank, + const ExecutionPlan& plan); + void launchKernel(ExecutionContext& context); + ~Impl() = default; +}; + +Executor::Executor(std::shared_ptr comm, + const std::unordered_map> connections) + : impl_(std::make_unique(comm, connections)) {} void Executor::execute(void* sendbuff, void* recvBuff, size_t sendBuffSize, size_t recvBuffSize, const ExecutionPlan& plan) { @@ -31,12 +95,12 @@ Executor::Impl::Impl(std::shared_ptr comm, ExecutionContext Executor::Impl::setupExecutionContext(int rank, void* sendbuff, void* recvbuff, size_t sendBufferSize, size_t recvBufferSize, const ExecutionPlan& plan) { - ExecutionContextKey key = {sendbuff, recvbuff, sendBufferSize, recvBufferSize, plan.getName()}; + ExecutionContextKey key = {sendbuff, recvbuff, sendBufferSize, recvBufferSize, plan.impl_->name}; if (this->contexts.find(key) != this->contexts.end()) { return this->contexts[key]; } ExecutionContext context; - size_t scratchBufferSize = plan.getScratchBufferSize(rank, sendBufferSize); + size_t scratchBufferSize = plan.impl_->getScratchBufferSize(rank, sendBufferSize); std::shared_ptr scratchBuffer = allocExtSharedCuda(scratchBufferSize); context.scratchBuffer = scratchBuffer; context.scratchBufferSize = scratchBufferSize; @@ -48,7 +112,7 @@ ExecutionContext Executor::Impl::setupExecutionContext(int rank, void* sendbuff, void Executor::Impl::setupRegisteredMemories(ExecutionContext& context, void* sendbuff, void* recvbuff, size_t sendBufferSize, size_t recvBufferSize, int rank, const ExecutionPlan& plan) { - int nranksPerNode = plan.nranksPerNode(); + int nranksPerNode = plan.impl_->nranksPerNode; auto getTransportFlags = [&](std::vector& infos, int rank) { TransportFlags flags; for (ChannelInfo& info : infos) { @@ -82,9 +146,9 @@ void Executor::Impl::setupRegisteredMemories(ExecutionContext& context, void* se return std::vector(peers.begin(), peers.end()); }; - std::vector bufferTypes = plan.getConnectedBufferTypes(rank); + std::vector bufferTypes = plan.impl_->getConnectedBufferTypes(rank); for (BufferType bufferType : bufferTypes) { - std::vector channelInfos = plan.getChannelInfos(rank, bufferType); + std::vector channelInfos = plan.impl_->getChannelInfos(rank, bufferType); TransportFlags transportFlags = getTransportFlags(channelInfos, rank); RegisteredMemory memory = this->comm->registerMemory(getBufferInfo(bufferType).first, getBufferInfo(bufferType).second, transportFlags); @@ -95,7 +159,7 @@ void Executor::Impl::setupRegisteredMemories(ExecutionContext& context, void* se remoteRegMemoryFutures.push_back(comm->recvMemoryOnSetup(peer, 0)); } comm->setup(); - for (int i = 0; i < remoteRegMemoryFutures.size(); i++) { + for (size_t i = 0; i < remoteRegMemoryFutures.size(); i++) { context.registeredMemories[{bufferType, connectedPeers[i]}] = std::move(remoteRegMemoryFutures[i].get()); } } @@ -107,7 +171,7 @@ void Executor::Impl::setupChannels(ExecutionContext& context, void* sendbuff, vo std::vector> smSemaphores; std::vector proxySemaphores; for (ChannelType channelType : channelTypes) { - std::vector channelInfos = plan.getChannelInfos(rank, channelType); + std::vector channelInfos = plan.impl_->getChannelInfos(rank, channelType); for (ChannelInfo& info : channelInfos) { for (int peer : info.connectedPeers) { if (channelType == ChannelType::SM) { @@ -135,11 +199,10 @@ void Executor::Impl::setupChannels(ExecutionContext& context, void* sendbuff, vo } }; for (ChannelType channelType : channelTypes) { - std::vector channelInfos = plan.getChannelInfos(rank, channelType); + std::vector channelInfos = plan.impl_->getChannelInfos(rank, channelType); int index = 0; for (ChannelInfo& info : channelInfos) { void* src = getBuffer(info.srcBufferType); - void* dst = getBuffer(info.dstBufferType); TransportFlags transport = context.registeredMemories.begin()->second.transports(); RegisteredMemory localMemory = this->comm->registerMemory(src, sendBufferSize, transport); for (int peer : info.connectedPeers) { @@ -159,7 +222,8 @@ void Executor::Impl::setupChannels(ExecutionContext& context, void* sendbuff, vo void Executor::Impl::launchKernel(ExecutionContext& context) { // Need to change to use flush function and make sure the proxy service will get the latest data. - this->proxyService->startProxy(); + // may need atomic variable + // this->proxyService->startProxy(); } } // namespace mscclpp diff --git a/src/include/execution_plan.hpp b/src/include/execution_plan.hpp index 220cda9a6..e70ef5c6a 100644 --- a/src/include/execution_plan.hpp +++ b/src/include/execution_plan.hpp @@ -5,6 +5,7 @@ #define MSCCLPP_EXECUTOR_PLAN_HPP_ #include +#include #include #include #include @@ -71,11 +72,11 @@ struct DeviceExecutionPlan { Operation operations[1]; }; -class ExecutionPlan { +struct ExecutionPlan::Impl { public: - ExecutionPlan(std::ifstream& file); - std::string getName() const; - int nranksPerNode() const; + Impl(std::ifstream& file); + ~Impl() = default; + std::vector getChannelInfos(int rank, ChannelType channelType) const; std::vector getChannelInfos(int rank, BufferType bufferType) const; std::vector getConnectedBufferTypes(int rank) const; @@ -83,16 +84,13 @@ class ExecutionPlan { std::vector getOperations(int rank, int threadblock); std::pair getThreadBlockChannelRange(int rank, int threadblock, BufferType srcBufferType, BufferType dstBufferType, ChannelType channelType); - ~ExecutionPlan() = default; - - private: void loadExecutionPlan(std::ifstream& file); // operations for [rank][threadblock] - std::vector> operations_; - std::unordered_map> channelInfos_; - std::string name_; - int nranksPerNode_; + std::vector> operations; + std::unordered_map> channelInfos; + std::string name; + int nranksPerNode; }; } // namespace mscclpp diff --git a/src/include/executor.hpp b/src/include/executor.hpp deleted file mode 100644 index 88469416d..000000000 --- a/src/include/executor.hpp +++ /dev/null @@ -1,84 +0,0 @@ -// Copyright (c) Microsoft Corporation. -// Licensed under the MIT license. - -#ifndef MSCCLPP_EXECUTOR_HPP_ -#define MSCCLPP_EXECUTOR_HPP_ - -#include -#include -#include -#include - -#include "execution_plan.hpp" - -namespace mscclpp { -struct ExecutionContextKey { - void* sendBuff; - void* recvBuff; - size_t sendBuffSize; - size_t recvBuffSize; - std::string plan; -}; -} // namespace mscclpp - -namespace std { -template <> -struct hash> { - std::size_t operator()(const std::pair& key) const { - return std::hash()(key.second) ^ std::hash()(static_cast(key.first)); - } -}; - -template <> -struct hash { - std::size_t operator()(const mscclpp::ExecutionContextKey& key) const { - return std::hash()(key.sendBuff) ^ std::hash()(key.recvBuff) ^ std::hash()(key.sendBuffSize) ^ - std::hash()(key.recvBuffSize) ^ std::hash()(key.plan); - } -}; -} // namespace std - -namespace mscclpp { -class Executor { - public: - Executor(std::shared_ptr comm, const std::unordered_map connections); - void execute(void* sendbuff, void* recvBuff, size_t sendBuffSize, size_t recvBuffSize, const ExecutionPlan& plan); - ~Executor() = default; - - private: - struct Impl; - - std::shared_ptr impl_; -}; - -struct ExecutionContext { - std::unordered_map, mscclpp::RegisteredMemory> registeredMemories; - std::vector> smSemaphores; - std::vector proxySemaphores; - std::vector smChannels; - std::vector proxyChannels; - std::vector deviceExecutionPlans; - std::shared_ptr scratchBuffer; - size_t scratchBufferSize; -}; - -struct Executor::Impl { - std::shared_ptr comm; - const std::unordered_map> connections; - std::shared_ptr proxyService; - std::unordered_map contexts; - - Impl(std::shared_ptr comm, const std::unordered_map> connections); - ExecutionContext setupExecutionContext(int rank, void* sendbuff, void* recvbuff, size_t sendBufferSize, - size_t recvBufferSize, const ExecutionPlan& plan); - void setupRegisteredMemories(ExecutionContext& context, void* sendbuff, void* recvbuff, size_t sendBufferSize, - size_t recvBufferSize, int rank, const ExecutionPlan& plan); - void setupChannels(ExecutionContext& context, void* sendbuff, void* recvbuff, size_t sendBufferSize, int rank, - const ExecutionPlan& plan); - void launchKernel(ExecutionContext& context); - ~Impl() = default; -}; - -} // namespace mscclpp - -#endif // MSCCLPP_EXECUTOR_HPP_ From edf93df44e3b29f0329fcd8db5227090cd511290 Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Mon, 1 Apr 2024 11:40:54 +0000 Subject: [PATCH 08/51] WIP --- src/executor/executor.cc | 270 +++++++++++++++++++-------------------- 1 file changed, 128 insertions(+), 142 deletions(-) diff --git a/src/executor/executor.cc b/src/executor/executor.cc index 7ae13dd94..324671e27 100644 --- a/src/executor/executor.cc +++ b/src/executor/executor.cc @@ -65,165 +65,151 @@ struct Executor::Impl { std::shared_ptr proxyService; std::unordered_map contexts; - Impl(std::shared_ptr comm, const std::unordered_map> connections); - ExecutionContext setupExecutionContext(int rank, void* sendbuff, void* recvbuff, size_t sendBufferSize, - size_t recvBufferSize, const ExecutionPlan& plan); - void setupRegisteredMemories(ExecutionContext& context, void* sendbuff, void* recvbuff, size_t sendBufferSize, - size_t recvBufferSize, int rank, const ExecutionPlan& plan); - void setupChannels(ExecutionContext& context, void* sendbuff, void* recvbuff, size_t sendBufferSize, int rank, - const ExecutionPlan& plan); - void launchKernel(ExecutionContext& context); + Impl(std::shared_ptr comm, const std::unordered_map> connections) + : comm(comm), connections(connections) { + this->proxyService = std::make_shared(); + } ~Impl() = default; -}; -Executor::Executor(std::shared_ptr comm, - const std::unordered_map> connections) - : impl_(std::make_unique(comm, connections)) {} - -void Executor::execute(void* sendbuff, void* recvBuff, size_t sendBuffSize, size_t recvBuffSize, - const ExecutionPlan& plan) { - ExecutionContext context = - this->impl_->setupExecutionContext(0, sendbuff, recvBuff, sendBuffSize, recvBuffSize, plan); - this->impl_->launchKernel(context); -} - -Executor::Impl::Impl(std::shared_ptr comm, - const std::unordered_map> connections) - : comm(comm), connections(connections) { - this->proxyService = std::make_shared(); -} - -ExecutionContext Executor::Impl::setupExecutionContext(int rank, void* sendbuff, void* recvbuff, size_t sendBufferSize, - size_t recvBufferSize, const ExecutionPlan& plan) { - ExecutionContextKey key = {sendbuff, recvbuff, sendBufferSize, recvBufferSize, plan.impl_->name}; - if (this->contexts.find(key) != this->contexts.end()) { - return this->contexts[key]; + ExecutionContext setupExecutionContext(int rank, void* sendbuff, void* recvbuff, size_t sendBufferSize, + size_t recvBufferSize, const ExecutionPlan& plan) { + ExecutionContextKey key = {sendbuff, recvbuff, sendBufferSize, recvBufferSize, plan.impl_->name}; + if (this->contexts.find(key) != this->contexts.end()) { + return this->contexts[key]; + } + ExecutionContext context; + size_t scratchBufferSize = plan.impl_->getScratchBufferSize(rank, sendBufferSize); + std::shared_ptr scratchBuffer = allocExtSharedCuda(scratchBufferSize); + context.scratchBuffer = scratchBuffer; + context.scratchBufferSize = scratchBufferSize; + this->setupRegisteredMemories(context, sendbuff, recvbuff, sendBufferSize, recvBufferSize, rank, plan); + this->setupChannels(context, sendbuff, recvbuff, sendBufferSize, rank, plan); + return context; } - ExecutionContext context; - size_t scratchBufferSize = plan.impl_->getScratchBufferSize(rank, sendBufferSize); - std::shared_ptr scratchBuffer = allocExtSharedCuda(scratchBufferSize); - context.scratchBuffer = scratchBuffer; - context.scratchBufferSize = scratchBufferSize; - this->setupRegisteredMemories(context, sendbuff, recvbuff, sendBufferSize, recvBufferSize, rank, plan); - this->setupChannels(context, sendbuff, recvbuff, sendBufferSize, rank, plan); - return context; -} -void Executor::Impl::setupRegisteredMemories(ExecutionContext& context, void* sendbuff, void* recvbuff, - size_t sendBufferSize, size_t recvBufferSize, int rank, - const ExecutionPlan& plan) { - int nranksPerNode = plan.impl_->nranksPerNode; - auto getTransportFlags = [&](std::vector& infos, int rank) { - TransportFlags flags; - for (ChannelInfo& info : infos) { - if (info.channelType == ChannelType::SM) { - flags |= Transport::CudaIpc; - } else if (info.channelType == ChannelType::PROXY) { - flags |= IBs[rank % nranksPerNode]; + void setupRegisteredMemories(ExecutionContext& context, void* sendbuff, void* recvbuff, size_t sendBufferSize, + size_t recvBufferSize, int rank, const ExecutionPlan& plan) { + int nranksPerNode = plan.impl_->nranksPerNode; + auto getTransportFlags = [&](std::vector& infos, int rank) { + TransportFlags flags; + for (ChannelInfo& info : infos) { + if (info.channelType == ChannelType::SM) { + flags |= Transport::CudaIpc; + } else if (info.channelType == ChannelType::PROXY) { + flags |= IBs[rank % nranksPerNode]; + } } - } - return flags; - }; - auto getBufferInfo = [&](BufferType type) { - switch (type) { - case BufferType::INPUT: - return std::make_pair(sendbuff, sendBufferSize); - case BufferType::OUTPUT: - return std::make_pair(recvbuff, recvBufferSize); - case BufferType::SCRATCH: - return std::make_pair((void*)context.scratchBuffer.get(), context.scratchBufferSize); - default: - throw std::runtime_error("Invalid buffer type"); - } - }; - auto getConnectedPeers = [&](std::vector& infos) { - std::set peers; - for (ChannelInfo& info : infos) { - for (int peer : info.connectedPeers) { - peers.insert(peer); + return flags; + }; + auto getBufferInfo = [&](BufferType type) { + switch (type) { + case BufferType::INPUT: + return std::make_pair(sendbuff, sendBufferSize); + case BufferType::OUTPUT: + return std::make_pair(recvbuff, recvBufferSize); + case BufferType::SCRATCH: + return std::make_pair((void*)context.scratchBuffer.get(), context.scratchBufferSize); + default: + throw std::runtime_error("Invalid buffer type"); + } + }; + auto getConnectedPeers = [&](std::vector& infos) { + std::set peers; + for (ChannelInfo& info : infos) { + for (int peer : info.connectedPeers) { + peers.insert(peer); + } + } + return std::vector(peers.begin(), peers.end()); + }; + + std::vector bufferTypes = plan.impl_->getConnectedBufferTypes(rank); + for (BufferType bufferType : bufferTypes) { + std::vector channelInfos = plan.impl_->getChannelInfos(rank, bufferType); + TransportFlags transportFlags = getTransportFlags(channelInfos, rank); + RegisteredMemory memory = + this->comm->registerMemory(getBufferInfo(bufferType).first, getBufferInfo(bufferType).second, transportFlags); + std::vector connectedPeers = getConnectedPeers(channelInfos); + std::vector> remoteRegMemoryFutures; + for (int peer : connectedPeers) { + comm->sendMemoryOnSetup(memory, peer, 0); + remoteRegMemoryFutures.push_back(comm->recvMemoryOnSetup(peer, 0)); + } + comm->setup(); + for (size_t i = 0; i < remoteRegMemoryFutures.size(); i++) { + context.registeredMemories[{bufferType, connectedPeers[i]}] = std::move(remoteRegMemoryFutures[i].get()); } - } - return std::vector(peers.begin(), peers.end()); - }; - - std::vector bufferTypes = plan.impl_->getConnectedBufferTypes(rank); - for (BufferType bufferType : bufferTypes) { - std::vector channelInfos = plan.impl_->getChannelInfos(rank, bufferType); - TransportFlags transportFlags = getTransportFlags(channelInfos, rank); - RegisteredMemory memory = - this->comm->registerMemory(getBufferInfo(bufferType).first, getBufferInfo(bufferType).second, transportFlags); - std::vector connectedPeers = getConnectedPeers(channelInfos); - std::vector> remoteRegMemoryFutures; - for (int peer : connectedPeers) { - comm->sendMemoryOnSetup(memory, peer, 0); - remoteRegMemoryFutures.push_back(comm->recvMemoryOnSetup(peer, 0)); - } - comm->setup(); - for (size_t i = 0; i < remoteRegMemoryFutures.size(); i++) { - context.registeredMemories[{bufferType, connectedPeers[i]}] = std::move(remoteRegMemoryFutures[i].get()); } } -} -void Executor::Impl::setupChannels(ExecutionContext& context, void* sendbuff, void* recvbuff, size_t sendBufferSize, - int rank, const ExecutionPlan& plan) { - const auto channelTypes = {ChannelType::SM, ChannelType::PROXY}; - std::vector> smSemaphores; - std::vector proxySemaphores; - for (ChannelType channelType : channelTypes) { - std::vector channelInfos = plan.impl_->getChannelInfos(rank, channelType); - for (ChannelInfo& info : channelInfos) { - for (int peer : info.connectedPeers) { - if (channelType == ChannelType::SM) { - smSemaphores.push_back(std::make_shared(*this->comm, this->connections.at(peer))); - } else if (channelType == ChannelType::PROXY) { - proxySemaphores.push_back(this->proxyService->buildAndAddSemaphore(*this->comm, this->connections.at(peer))); + void setupChannels(ExecutionContext& context, void* sendbuff, void* recvbuff, size_t sendBufferSize, int rank, + const ExecutionPlan& plan) { + const auto channelTypes = {ChannelType::SM, ChannelType::PROXY}; + std::vector> smSemaphores; + std::vector proxySemaphores; + for (ChannelType channelType : channelTypes) { + std::vector channelInfos = plan.impl_->getChannelInfos(rank, channelType); + for (ChannelInfo& info : channelInfos) { + for (int peer : info.connectedPeers) { + if (channelType == ChannelType::SM) { + smSemaphores.push_back(std::make_shared(*this->comm, this->connections.at(peer))); + } else if (channelType == ChannelType::PROXY) { + proxySemaphores.push_back( + this->proxyService->buildAndAddSemaphore(*this->comm, this->connections.at(peer))); + } } } } - } - this->comm->setup(); - context.smSemaphores = std::move(smSemaphores); - context.proxySemaphores = std::move(proxySemaphores); - - auto getBuffer = [&](BufferType type) { - switch (type) { - case BufferType::INPUT: - return sendbuff; - case BufferType::OUTPUT: - return recvbuff; - case BufferType::SCRATCH: - return (void*)context.scratchBuffer.get(); - default: - throw std::runtime_error("Invalid buffer type"); - } - }; - for (ChannelType channelType : channelTypes) { - std::vector channelInfos = plan.impl_->getChannelInfos(rank, channelType); - int index = 0; - for (ChannelInfo& info : channelInfos) { - void* src = getBuffer(info.srcBufferType); - TransportFlags transport = context.registeredMemories.begin()->second.transports(); - RegisteredMemory localMemory = this->comm->registerMemory(src, sendBufferSize, transport); - for (int peer : info.connectedPeers) { - if (channelType == ChannelType::SM) { - context.smChannels.emplace_back(smSemaphores[index], context.registeredMemories[{info.dstBufferType, peer}], - src, nullptr); - } else if (channelType == ChannelType::PROXY) { - context.proxyChannels.emplace_back( - this->proxyService->proxyChannel(proxySemaphores[index]), - this->proxyService->addMemory(context.registeredMemories[{info.dstBufferType, peer}]), - this->proxyService->addMemory(localMemory)); + this->comm->setup(); + context.smSemaphores = std::move(smSemaphores); + context.proxySemaphores = std::move(proxySemaphores); + + auto getBuffer = [&](BufferType type) { + switch (type) { + case BufferType::INPUT: + return sendbuff; + case BufferType::OUTPUT: + return recvbuff; + case BufferType::SCRATCH: + return (void*)context.scratchBuffer.get(); + default: + throw std::runtime_error("Invalid buffer type"); + } + }; + for (ChannelType channelType : channelTypes) { + std::vector channelInfos = plan.impl_->getChannelInfos(rank, channelType); + int index = 0; + for (ChannelInfo& info : channelInfos) { + void* src = getBuffer(info.srcBufferType); + TransportFlags transport = context.registeredMemories.begin()->second.transports(); + RegisteredMemory localMemory = this->comm->registerMemory(src, sendBufferSize, transport); + for (int peer : info.connectedPeers) { + if (channelType == ChannelType::SM) { + context.smChannels.emplace_back(smSemaphores[index], context.registeredMemories[{info.dstBufferType, peer}], + src, nullptr); + } else if (channelType == ChannelType::PROXY) { + context.proxyChannels.emplace_back( + this->proxyService->proxyChannel(proxySemaphores[index]), + this->proxyService->addMemory(context.registeredMemories[{info.dstBufferType, peer}]), + this->proxyService->addMemory(localMemory)); + } } } } } -} -void Executor::Impl::launchKernel(ExecutionContext& context) { - // Need to change to use flush function and make sure the proxy service will get the latest data. - // may need atomic variable - // this->proxyService->startProxy(); + void launchKernel(ExecutionContext& context) {} +}; + +Executor::Executor(std::shared_ptr comm, + const std::unordered_map> connections) + : impl_(std::make_unique(comm, connections)) {} + +void Executor::execute(void* sendbuff, void* recvBuff, size_t sendBuffSize, size_t recvBuffSize, + const ExecutionPlan& plan) { + ExecutionContext context = + this->impl_->setupExecutionContext(0, sendbuff, recvBuff, sendBuffSize, recvBuffSize, plan); + this->impl_->launchKernel(context); } } // namespace mscclpp From c071e5d789a372def895b315126e1ea8d49fadc7 Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Mon, 1 Apr 2024 12:00:38 +0000 Subject: [PATCH 09/51] add test file --- include/mscclpp/executor.hpp | 2 +- src/executor/execution_plan.cc | 2 ++ src/executor/executor.cc | 2 ++ test/CMakeLists.txt | 1 + test/executor_test.cc | 30 ++++++++++++++++++++++++++++++ 5 files changed, 36 insertions(+), 1 deletion(-) create mode 100644 test/executor_test.cc diff --git a/include/mscclpp/executor.hpp b/include/mscclpp/executor.hpp index c9f7e7f59..c64230aa6 100644 --- a/include/mscclpp/executor.hpp +++ b/include/mscclpp/executor.hpp @@ -27,7 +27,7 @@ class Executor { Executor(std::shared_ptr comm, const std::unordered_map> connections); Executor(const Executor&) = delete; Executor& operator=(const Executor&) = delete; - ~Executor() = default; + ~Executor(); void execute(void* sendbuff, void* recvBuff, size_t sendBuffSize, size_t recvBuffSize, const ExecutionPlan& plan); diff --git a/src/executor/execution_plan.cc b/src/executor/execution_plan.cc index 5fe314cfc..65d372502 100644 --- a/src/executor/execution_plan.cc +++ b/src/executor/execution_plan.cc @@ -84,4 +84,6 @@ void ExecutionPlan::Impl::loadExecutionPlan(std::ifstream& file) { } } +ExecutionPlan::ExecutionPlan(std::ifstream& file) : impl_(std::make_shared(file)) {} + } // namespace mscclpp diff --git a/src/executor/executor.cc b/src/executor/executor.cc index 324671e27..fd75b4d61 100644 --- a/src/executor/executor.cc +++ b/src/executor/executor.cc @@ -212,4 +212,6 @@ void Executor::execute(void* sendbuff, void* recvBuff, size_t sendBuffSize, size this->impl_->launchKernel(context); } +Executor::~Executor() = default; + } // namespace mscclpp diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index da47066ea..501f96ab0 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -24,6 +24,7 @@ endfunction() add_test_executable(allgather_test_cpp allgather_test_cpp.cu) add_test_executable(allgather_test_host_offloading allgather_test_host_offloading.cu) add_test_executable(nvls_test nvls_test.cu) +add_test_executable(executor_test executor_test.cc) configure_file(run_mpi_test.sh.in run_mpi_test.sh) diff --git a/test/executor_test.cc b/test/executor_test.cc new file mode 100644 index 000000000..f6ad8bd02 --- /dev/null +++ b/test/executor_test.cc @@ -0,0 +1,30 @@ +#include + +#include +#include + +int main() { + int rank; + int world_size; + MPI_Init(NULL, NULL); + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + MPI_Comm_size(MPI_COMM_WORLD, &world_size); + auto bootstrap = std::make_shared(rank, world_size); + mscclpp::UniqueId id; + if (rank == 0) { + id = bootstrap->createUniqueId(); + } + MPI_Bcast(&id, sizeof(id), MPI_BYTE, 0, MPI_COMM_WORLD); + bootstrap->initialize(id); + auto comm = std::make_shared(bootstrap); + std::shared_ptr executor = + std::make_shared(comm, std::unordered_map>()); + std::ifstream file("execution_plan.json"); + mscclpp::ExecutionPlan plan(file); + std::shared_ptr sendbuff = mscclpp::allocExtSharedCuda(1024); + std::shared_ptr recvbuff = mscclpp::allocExtSharedCuda(1024); + executor->execute(sendbuff.get(), recvbuff.get(), 1024, 1024, plan); + + MPI_Finalize(); + return 0; +} From 580e4a4b36f68c73bf77e8c05b58cc0f3d1c1ea6 Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Mon, 1 Apr 2024 14:33:07 +0000 Subject: [PATCH 10/51] WIP --- include/mscclpp/executor.hpp | 2 +- src/executor/execution_plan.cc | 11 +++++++++++ src/executor/executor.cc | 36 +++++++++++++++++++++++++--------- src/include/execution_plan.hpp | 1 + test/executor_test.cc | 3 +-- 5 files changed, 41 insertions(+), 12 deletions(-) diff --git a/include/mscclpp/executor.hpp b/include/mscclpp/executor.hpp index c64230aa6..895fc03d8 100644 --- a/include/mscclpp/executor.hpp +++ b/include/mscclpp/executor.hpp @@ -24,7 +24,7 @@ class ExecutionPlan { class Executor { public: - Executor(std::shared_ptr comm, const std::unordered_map> connections); + Executor(std::shared_ptr comm); Executor(const Executor&) = delete; Executor& operator=(const Executor&) = delete; ~Executor(); diff --git a/src/executor/execution_plan.cc b/src/executor/execution_plan.cc index 65d372502..2020b970d 100644 --- a/src/executor/execution_plan.cc +++ b/src/executor/execution_plan.cc @@ -5,6 +5,7 @@ #include #include +#include namespace { template @@ -29,6 +30,16 @@ std::vector ExecutionPlan::Impl::getChannelInfos(int rank, BufferTy return filter(this->channelInfos.at(rank), pred); } +std::vector ExecutionPlan::Impl::getConnectedPeers(int rank) const { + std::set peers; + for (const auto& info : this->channelInfos.at(rank)) { + for (int peer : info.connectedPeers) { + peers.insert(peer); + } + } + return std::vector(peers.begin(), peers.end()); +} + std::vector ExecutionPlan::Impl::getConnectedBufferTypes(int rank) const { return std::vector(); } diff --git a/src/executor/executor.cc b/src/executor/executor.cc index fd75b4d61..4a3e367e3 100644 --- a/src/executor/executor.cc +++ b/src/executor/executor.cc @@ -41,6 +41,10 @@ struct hash { } // namespace std namespace { +auto inSameNode = [](int rank1, int rank2, int nranksPerNode) { + return rank1 / nranksPerNode == rank2 / nranksPerNode; +}; + static const mscclpp::Transport IBs[] = {mscclpp::Transport::IB0, mscclpp::Transport::IB1, mscclpp::Transport::IB2, mscclpp::Transport::IB3, mscclpp::Transport::IB4, mscclpp::Transport::IB5, mscclpp::Transport::IB6, mscclpp::Transport::IB7}; @@ -61,14 +65,11 @@ struct ExecutionContext { struct Executor::Impl { std::shared_ptr comm; - const std::unordered_map> connections; + std::unordered_map> connections; std::shared_ptr proxyService; std::unordered_map contexts; - Impl(std::shared_ptr comm, const std::unordered_map> connections) - : comm(comm), connections(connections) { - this->proxyService = std::make_shared(); - } + Impl(std::shared_ptr comm) : comm(comm) { this->proxyService = std::make_shared(); } ~Impl() = default; ExecutionContext setupExecutionContext(int rank, void* sendbuff, void* recvbuff, size_t sendBufferSize, @@ -82,11 +83,26 @@ struct Executor::Impl { std::shared_ptr scratchBuffer = allocExtSharedCuda(scratchBufferSize); context.scratchBuffer = scratchBuffer; context.scratchBufferSize = scratchBufferSize; + this->setupConnections(context, rank, plan); this->setupRegisteredMemories(context, sendbuff, recvbuff, sendBufferSize, recvBufferSize, rank, plan); this->setupChannels(context, sendbuff, recvbuff, sendBufferSize, rank, plan); return context; } + void setupConnections(ExecutionContext& context, int rank, const ExecutionPlan& plan) { + std::vector connectedPeers = plan.impl_->getConnectedPeers(rank); + std::vector>> connectionFutures; + for (int peer : connectedPeers) { + Transport transport = inSameNode(rank, peer, plan.impl_->nranksPerNode) ? Transport::CudaIpc + : IBs[rank % plan.impl_->nranksPerNode]; + connectionFutures.push_back(this->comm->connectOnSetup(peer, 0, transport)); + } + this->comm->setup(); + for (size_t i = 0; i < connectionFutures.size(); i++) { + this->connections[connectedPeers[i]] = connectionFutures[i].get(); + } + } + void setupRegisteredMemories(ExecutionContext& context, void* sendbuff, void* recvbuff, size_t sendBufferSize, size_t recvBufferSize, int rank, const ExecutionPlan& plan) { int nranksPerNode = plan.impl_->nranksPerNode; @@ -96,7 +112,11 @@ struct Executor::Impl { if (info.channelType == ChannelType::SM) { flags |= Transport::CudaIpc; } else if (info.channelType == ChannelType::PROXY) { - flags |= IBs[rank % nranksPerNode]; + for (int peer : info.connectedPeers) { + if (inSameNode(rank, peer, nranksPerNode)) { + flags |= IBs[rank % nranksPerNode]; + } + } } } return flags; @@ -201,9 +221,7 @@ struct Executor::Impl { void launchKernel(ExecutionContext& context) {} }; -Executor::Executor(std::shared_ptr comm, - const std::unordered_map> connections) - : impl_(std::make_unique(comm, connections)) {} +Executor::Executor(std::shared_ptr comm) : impl_(std::make_unique(comm)) {} void Executor::execute(void* sendbuff, void* recvBuff, size_t sendBuffSize, size_t recvBuffSize, const ExecutionPlan& plan) { diff --git a/src/include/execution_plan.hpp b/src/include/execution_plan.hpp index e70ef5c6a..de1ba4496 100644 --- a/src/include/execution_plan.hpp +++ b/src/include/execution_plan.hpp @@ -79,6 +79,7 @@ struct ExecutionPlan::Impl { std::vector getChannelInfos(int rank, ChannelType channelType) const; std::vector getChannelInfos(int rank, BufferType bufferType) const; + std::vector getConnectedPeers(int rank) const; std::vector getConnectedBufferTypes(int rank) const; size_t getScratchBufferSize(int rank, size_t inputSize) const; std::vector getOperations(int rank, int threadblock); diff --git a/test/executor_test.cc b/test/executor_test.cc index f6ad8bd02..7059631ab 100644 --- a/test/executor_test.cc +++ b/test/executor_test.cc @@ -17,8 +17,7 @@ int main() { MPI_Bcast(&id, sizeof(id), MPI_BYTE, 0, MPI_COMM_WORLD); bootstrap->initialize(id); auto comm = std::make_shared(bootstrap); - std::shared_ptr executor = - std::make_shared(comm, std::unordered_map>()); + std::shared_ptr executor = std::make_shared(comm); std::ifstream file("execution_plan.json"); mscclpp::ExecutionPlan plan(file); std::shared_ptr sendbuff = mscclpp::allocExtSharedCuda(1024); From d7026fbfdd63f5743ede571024420237485f7ef5 Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Tue, 2 Apr 2024 06:03:33 +0000 Subject: [PATCH 11/51] WIP --- src/executor/execution_plan.cc | 19 +- src/include/execution_plan.hpp | 3 + test/execution-files/allreduce.json | 451 ++++++++++++++++++++++++++++ test/executor_test.cc | 7 +- 4 files changed, 472 insertions(+), 8 deletions(-) create mode 100644 test/execution-files/allreduce.json diff --git a/src/executor/execution_plan.cc b/src/executor/execution_plan.cc index 2020b970d..a962e956f 100644 --- a/src/executor/execution_plan.cc +++ b/src/executor/execution_plan.cc @@ -43,7 +43,9 @@ std::vector ExecutionPlan::Impl::getConnectedPeers(int rank) const { std::vector ExecutionPlan::Impl::getConnectedBufferTypes(int rank) const { return std::vector(); } -size_t ExecutionPlan::Impl::getScratchBufferSize(int rank, size_t inputSize) const { return 0; }; +size_t ExecutionPlan::Impl::getScratchBufferSize(int rank, size_t inputSize) const { + return inputSize / this->inputChunks.at(rank) * this->scratchChunks.at(rank); +} std::vector ExecutionPlan::Impl::getOperations(int rank, int threadblock) { return std::vector(); } @@ -54,11 +56,11 @@ std::pair ExecutionPlan::Impl::getThreadBlockChannelRange(int rank, in void ExecutionPlan::Impl::loadExecutionPlan(std::ifstream& file) { auto convertToBufferType = [](const std::string& str) { - if (str == "input") { + if (str == "i") { return BufferType::INPUT; - } else if (str == "output") { + } else if (str == "o") { return BufferType::OUTPUT; - } else if (str == "scratch") { + } else if (str == "s") { return BufferType::SCRATCH; } else { throw std::runtime_error("Invalid buffer type"); @@ -79,12 +81,15 @@ void ExecutionPlan::Impl::loadExecutionPlan(std::ifstream& file) { this->nranksPerNode = obj["nranksPerNode"]; auto gpus = obj["gpus"]; for (const auto& gpu : gpus) { - int rank = gpu["rank"]; + int rank = gpu["id"]; + this->inputChunks[rank] = gpu["inputChunks"]; + this->outputChunks[rank] = gpu["outputChunks"]; + this->scratchChunks[rank] = gpu["scratchChunks"]; std::vector channelInfos; for (const auto& channel : gpu["channels"]) { ChannelInfo info; - info.srcBufferType = convertToBufferType(channel["srcBuffer"]); - info.dstBufferType = convertToBufferType(channel["dstBuffer"]); + info.srcBufferType = convertToBufferType(channel["srcbuff"]); + info.dstBufferType = convertToBufferType(channel["dstbuff"]); info.channelType = convertToChannelType(channel["type"]); for (const auto& peer : channel["connectedTo"]) { info.connectedPeers.push_back(peer); diff --git a/src/include/execution_plan.hpp b/src/include/execution_plan.hpp index de1ba4496..70df39c47 100644 --- a/src/include/execution_plan.hpp +++ b/src/include/execution_plan.hpp @@ -92,6 +92,9 @@ struct ExecutionPlan::Impl { std::unordered_map> channelInfos; std::string name; int nranksPerNode; + std::unordered_map inputChunks; + std::unordered_map outputChunks; + std::unordered_map scratchChunks; }; } // namespace mscclpp diff --git a/test/execution-files/allreduce.json b/test/execution-files/allreduce.json new file mode 100644 index 000000000..60a8ffe88 --- /dev/null +++ b/test/execution-files/allreduce.json @@ -0,0 +1,451 @@ +{ + "name": "allreduce_pairs", + "colletive": "allreduce", + "protocol": "Simple", + "inplace": true, + "nranksPerNode": 8, + "gpus": [ + { + "id": 0, + "inputChunks": 4, + "outputChunks": 0, + "scratchChunks": 0, + "threadblocks": [ + { + "id": 0, + "ops": [ + { + "name": "signal", + "o_cids": [ + { + "id": 0, + "off": 2 + } + ], + "srcbuff": "i", + "dstbuff": "i", + "ctype": "sm" + }, + { + "name": "wait", + "i_cids": [ + { + "id": 0, + "off": 0 + } + ], + "srcbuff": "i", + "dstbuff": "i", + "ctype": "sm" + }, + { + "name": "nop", + "deps": [ + { + "tb": 0, + "step": 1 + } + ] + }, + { + "name": "rrcs", + "i_cids": [ + { + "id": 0, + "off": 0 + } + ], + "o_cids": [ + { + "id": 0, + "off": 0 + } + ], + "srcbuff": "i", + "dstbuff": "i", + "dstoff": 0, + "ctype": "sm", + "cnt": 1 + }, + { + "name": "nop", + "deps": [ + { + "tb": 0, + "step": 3 + } + ] + }, + { + "name": "signal", + "o_cids": [ + { + "id": 0, + "off": 0 + } + ], + "srcbuff": "i", + "dstbuff": "i", + "ctype": "sm" + }, + { + "name": "wait", + "i_cids": [ + { + "id": 0, + "off": 2 + } + ], + "srcbuff": "i", + "dstbuff": "i", + "ctype": "sm" + } + ], + "channels": [ + { + "src": "i", + "dst": "i", + "ctype": "sm", + "cid": [ + 0 + ] + } + ] + }, + { + "id": 1, + "ops": [ + { + "name": "signal", + "o_cids": [ + { + "id": 0, + "off": 3 + } + ], + "srcbuff": "i", + "dstbuff": "i", + "ctype": "sm" + }, + { + "name": "wait", + "i_cids": [ + { + "id": 0, + "off": 1 + } + ], + "srcbuff": "i", + "dstbuff": "i", + "ctype": "sm" + }, + { + "name": "nop", + "deps": [ + { + "tb": 1, + "step": 1 + } + ] + }, + { + "name": "rrcs", + "i_cids": [ + { + "id": 0, + "off": 1 + } + ], + "o_cids": [ + { + "id": 0, + "off": 0 + } + ], + "srcbuff": "i", + "dstbuff": "i", + "dstoff": 1, + "ctype": "sm", + "cnt": 1 + }, + { + "name": "nop", + "deps": [ + { + "tb": 1, + "step": 3 + } + ] + }, + { + "name": "signal", + "o_cids": [ + { + "id": 0, + "off": 0 + } + ], + "srcbuff": "i", + "dstbuff": "i", + "ctype": "sm" + }, + { + "name": "wait", + "i_cids": [ + { + "id": 0, + "off": 2 + } + ], + "srcbuff": "i", + "dstbuff": "i", + "ctype": "sm" + } + ], + "channels": [ + { + "src": "i", + "dst": "i", + "ctype": "sm", + "cid": [ + 1 + ] + } + ] + } + ], + "channels": [ + { + "srcbuff": "i", + "dstbuff": "i", + "type": "sm", + "connectedTo": [ + 1, + 1 + ] + } + ] + }, + { + "id": 1, + "inputChunks": 4, + "outputChunks": 0, + "scratchChunks": 0, + "threadblocks": [ + { + "id": 0, + "ops": [ + { + "name": "signal", + "o_cids": [ + { + "id": 0, + "off": 0 + } + ], + "srcbuff": "i", + "dstbuff": "i", + "ctype": "sm" + }, + { + "name": "wait", + "i_cids": [ + { + "id": 0, + "off": 2 + } + ], + "srcbuff": "i", + "dstbuff": "i", + "ctype": "sm" + }, + { + "name": "nop", + "deps": [ + { + "tb": 0, + "step": 1 + } + ] + }, + { + "name": "rrcs", + "i_cids": [ + { + "id": 0, + "off": 2 + } + ], + "o_cids": [ + { + "id": 0, + "off": 2 + } + ], + "srcbuff": "i", + "dstbuff": "i", + "dstoff": 2, + "ctype": "sm", + "cnt": 1 + }, + { + "name": "nop", + "deps": [ + { + "tb": 0, + "step": 3 + } + ] + }, + { + "name": "signal", + "o_cids": [ + { + "id": 0, + "off": 2 + } + ], + "srcbuff": "i", + "dstbuff": "i", + "ctype": "sm" + }, + { + "name": "wait", + "i_cids": [ + { + "id": 0, + "off": 0 + } + ], + "srcbuff": "i", + "dstbuff": "i", + "ctype": "sm" + } + ], + "channels": [ + { + "src": "i", + "dst": "i", + "ctype": "sm", + "cid": [ + 0 + ] + } + ] + }, + { + "id": 1, + "ops": [ + { + "name": "signal", + "o_cids": [ + { + "id": 0, + "off": 1 + } + ], + "srcbuff": "i", + "dstbuff": "i", + "ctype": "sm" + }, + { + "name": "wait", + "i_cids": [ + { + "id": 0, + "off": 3 + } + ], + "srcbuff": "i", + "dstbuff": "i", + "ctype": "sm" + }, + { + "name": "nop", + "deps": [ + { + "tb": 1, + "step": 1 + } + ] + }, + { + "name": "rrcs", + "i_cids": [ + { + "id": 0, + "off": 3 + } + ], + "o_cids": [ + { + "id": 0, + "off": 2 + } + ], + "srcbuff": "i", + "dstbuff": "i", + "dstoff": 3, + "ctype": "sm", + "cnt": 1 + }, + { + "name": "nop", + "deps": [ + { + "tb": 1, + "step": 3 + } + ] + }, + { + "name": "signal", + "o_cids": [ + { + "id": 0, + "off": 2 + } + ], + "srcbuff": "i", + "dstbuff": "i", + "ctype": "sm" + }, + { + "name": "wait", + "i_cids": [ + { + "id": 0, + "off": 0 + } + ], + "srcbuff": "i", + "dstbuff": "i", + "ctype": "sm" + } + ], + "channels": [ + { + "src": "i", + "dst": "i", + "ctype": "sm", + "cid": [ + 1 + ] + } + ] + } + ], + "channels": [ + { + "srcbuff": "i", + "dstbuff": "i", + "type": "sm", + "connectedTo": [ + 0, + 0 + ] + } + ] + } + ] +} diff --git a/test/executor_test.cc b/test/executor_test.cc index 7059631ab..9a8bcb72e 100644 --- a/test/executor_test.cc +++ b/test/executor_test.cc @@ -3,6 +3,8 @@ #include #include +const std::string MSCCLPP_ROOT_PATH = "/root/mscclpp"; + int main() { int rank; int world_size; @@ -16,9 +18,12 @@ int main() { } MPI_Bcast(&id, sizeof(id), MPI_BYTE, 0, MPI_COMM_WORLD); bootstrap->initialize(id); + // sleep 10s + // std::this_thread::sleep_for(std::chrono::seconds(20)); auto comm = std::make_shared(bootstrap); std::shared_ptr executor = std::make_shared(comm); - std::ifstream file("execution_plan.json"); + + std::ifstream file(MSCCLPP_ROOT_PATH + "/test/execution-files/allreduce.json"); mscclpp::ExecutionPlan plan(file); std::shared_ptr sendbuff = mscclpp::allocExtSharedCuda(1024); std::shared_ptr recvbuff = mscclpp::allocExtSharedCuda(1024); From b34c9e85056c40211796900e7ddeef4176139575 Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Tue, 2 Apr 2024 06:58:16 +0000 Subject: [PATCH 12/51] WIP --- include/mscclpp/executor.hpp | 3 ++- src/executor/execution_plan.cc | 6 +++++- src/executor/executor.cc | 21 +++++++++++---------- test/executor_test.cc | 2 +- 4 files changed, 19 insertions(+), 13 deletions(-) diff --git a/include/mscclpp/executor.hpp b/include/mscclpp/executor.hpp index 895fc03d8..bf09a0d6e 100644 --- a/include/mscclpp/executor.hpp +++ b/include/mscclpp/executor.hpp @@ -29,7 +29,8 @@ class Executor { Executor& operator=(const Executor&) = delete; ~Executor(); - void execute(void* sendbuff, void* recvBuff, size_t sendBuffSize, size_t recvBuffSize, const ExecutionPlan& plan); + void execute(int rank, void* sendbuff, void* recvBuff, size_t sendBuffSize, size_t recvBuffSize, + const ExecutionPlan& plan); private: struct Impl; diff --git a/src/executor/execution_plan.cc b/src/executor/execution_plan.cc index a962e956f..c6b332fb8 100644 --- a/src/executor/execution_plan.cc +++ b/src/executor/execution_plan.cc @@ -41,7 +41,11 @@ std::vector ExecutionPlan::Impl::getConnectedPeers(int rank) const { } std::vector ExecutionPlan::Impl::getConnectedBufferTypes(int rank) const { - return std::vector(); + std::set bufferTypes; + for (const auto& info : this->channelInfos.at(rank)) { + bufferTypes.insert(info.dstBufferType); + } + return std::vector(bufferTypes.begin(), bufferTypes.end()); } size_t ExecutionPlan::Impl::getScratchBufferSize(int rank, size_t inputSize) const { return inputSize / this->inputChunks.at(rank) * this->scratchChunks.at(rank); diff --git a/src/executor/executor.cc b/src/executor/executor.cc index 4a3e367e3..94565cdcb 100644 --- a/src/executor/executor.cc +++ b/src/executor/executor.cc @@ -53,6 +53,7 @@ static const mscclpp::Transport IBs[] = {mscclpp::Transport::IB0, mscclpp::Trans namespace mscclpp { struct ExecutionContext { + std::unordered_map> connections; std::unordered_map, mscclpp::RegisteredMemory> registeredMemories; std::vector> smSemaphores; std::vector proxySemaphores; @@ -65,7 +66,6 @@ struct ExecutionContext { struct Executor::Impl { std::shared_ptr comm; - std::unordered_map> connections; std::shared_ptr proxyService; std::unordered_map contexts; @@ -99,7 +99,7 @@ struct Executor::Impl { } this->comm->setup(); for (size_t i = 0; i < connectionFutures.size(); i++) { - this->connections[connectedPeers[i]] = connectionFutures[i].get(); + context.connections[connectedPeers[i]] = connectionFutures[i].get(); } } @@ -113,7 +113,7 @@ struct Executor::Impl { flags |= Transport::CudaIpc; } else if (info.channelType == ChannelType::PROXY) { for (int peer : info.connectedPeers) { - if (inSameNode(rank, peer, nranksPerNode)) { + if (!inSameNode(rank, peer, nranksPerNode)) { flags |= IBs[rank % nranksPerNode]; } } @@ -172,10 +172,11 @@ struct Executor::Impl { for (ChannelInfo& info : channelInfos) { for (int peer : info.connectedPeers) { if (channelType == ChannelType::SM) { - smSemaphores.push_back(std::make_shared(*this->comm, this->connections.at(peer))); + smSemaphores.push_back( + std::make_shared(*this->comm, context.connections.at(peer))); } else if (channelType == ChannelType::PROXY) { proxySemaphores.push_back( - this->proxyService->buildAndAddSemaphore(*this->comm, this->connections.at(peer))); + this->proxyService->buildAndAddSemaphore(*this->comm, context.connections.at(peer))); } } } @@ -205,11 +206,11 @@ struct Executor::Impl { RegisteredMemory localMemory = this->comm->registerMemory(src, sendBufferSize, transport); for (int peer : info.connectedPeers) { if (channelType == ChannelType::SM) { - context.smChannels.emplace_back(smSemaphores[index], context.registeredMemories[{info.dstBufferType, peer}], - src, nullptr); + context.smChannels.emplace_back(context.smSemaphores[index], + context.registeredMemories[{info.dstBufferType, peer}], src, nullptr); } else if (channelType == ChannelType::PROXY) { context.proxyChannels.emplace_back( - this->proxyService->proxyChannel(proxySemaphores[index]), + this->proxyService->proxyChannel(context.proxySemaphores[index]), this->proxyService->addMemory(context.registeredMemories[{info.dstBufferType, peer}]), this->proxyService->addMemory(localMemory)); } @@ -223,10 +224,10 @@ struct Executor::Impl { Executor::Executor(std::shared_ptr comm) : impl_(std::make_unique(comm)) {} -void Executor::execute(void* sendbuff, void* recvBuff, size_t sendBuffSize, size_t recvBuffSize, +void Executor::execute(int rank, void* sendbuff, void* recvBuff, size_t sendBuffSize, size_t recvBuffSize, const ExecutionPlan& plan) { ExecutionContext context = - this->impl_->setupExecutionContext(0, sendbuff, recvBuff, sendBuffSize, recvBuffSize, plan); + this->impl_->setupExecutionContext(rank, sendbuff, recvBuff, sendBuffSize, recvBuffSize, plan); this->impl_->launchKernel(context); } diff --git a/test/executor_test.cc b/test/executor_test.cc index 9a8bcb72e..35c4cce2b 100644 --- a/test/executor_test.cc +++ b/test/executor_test.cc @@ -27,7 +27,7 @@ int main() { mscclpp::ExecutionPlan plan(file); std::shared_ptr sendbuff = mscclpp::allocExtSharedCuda(1024); std::shared_ptr recvbuff = mscclpp::allocExtSharedCuda(1024); - executor->execute(sendbuff.get(), recvbuff.get(), 1024, 1024, plan); + executor->execute(rank, sendbuff.get(), recvbuff.get(), 1024, 1024, plan); MPI_Finalize(); return 0; From faef1e4425dc7080a18496156393a8fab6bae7f8 Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Tue, 2 Apr 2024 07:23:52 +0000 Subject: [PATCH 13/51] WIP --- include/mscclpp/executor.hpp | 2 +- src/executor/execution_plan.cc | 1 - src/executor/executor.cc | 17 ++++++++++------- src/include/execution_plan.hpp | 1 - test/execution-files/allreduce.json | 1 - test/executor_test.cc | 2 +- 6 files changed, 12 insertions(+), 12 deletions(-) diff --git a/include/mscclpp/executor.hpp b/include/mscclpp/executor.hpp index bf09a0d6e..1efd2a747 100644 --- a/include/mscclpp/executor.hpp +++ b/include/mscclpp/executor.hpp @@ -24,7 +24,7 @@ class ExecutionPlan { class Executor { public: - Executor(std::shared_ptr comm); + Executor(std::shared_ptr comm, int nranksPerNode); Executor(const Executor&) = delete; Executor& operator=(const Executor&) = delete; ~Executor(); diff --git a/src/executor/execution_plan.cc b/src/executor/execution_plan.cc index c6b332fb8..9b6c284c3 100644 --- a/src/executor/execution_plan.cc +++ b/src/executor/execution_plan.cc @@ -82,7 +82,6 @@ void ExecutionPlan::Impl::loadExecutionPlan(std::ifstream& file) { json obj = json::parse(file); this->name = obj["name"]; - this->nranksPerNode = obj["nranksPerNode"]; auto gpus = obj["gpus"]; for (const auto& gpu : gpus) { int rank = gpu["id"]; diff --git a/src/executor/executor.cc b/src/executor/executor.cc index 94565cdcb..47ee0be70 100644 --- a/src/executor/executor.cc +++ b/src/executor/executor.cc @@ -65,11 +65,14 @@ struct ExecutionContext { }; struct Executor::Impl { + int nranksPerNode; std::shared_ptr comm; std::shared_ptr proxyService; std::unordered_map contexts; - Impl(std::shared_ptr comm) : comm(comm) { this->proxyService = std::make_shared(); } + Impl(std::shared_ptr comm, int nranksPerNode) : nranksPerNode(nranksPerNode), comm(comm) { + this->proxyService = std::make_shared(); + } ~Impl() = default; ExecutionContext setupExecutionContext(int rank, void* sendbuff, void* recvbuff, size_t sendBufferSize, @@ -93,8 +96,8 @@ struct Executor::Impl { std::vector connectedPeers = plan.impl_->getConnectedPeers(rank); std::vector>> connectionFutures; for (int peer : connectedPeers) { - Transport transport = inSameNode(rank, peer, plan.impl_->nranksPerNode) ? Transport::CudaIpc - : IBs[rank % plan.impl_->nranksPerNode]; + Transport transport = + inSameNode(rank, peer, this->nranksPerNode) ? Transport::CudaIpc : IBs[rank % this->nranksPerNode]; connectionFutures.push_back(this->comm->connectOnSetup(peer, 0, transport)); } this->comm->setup(); @@ -105,7 +108,6 @@ struct Executor::Impl { void setupRegisteredMemories(ExecutionContext& context, void* sendbuff, void* recvbuff, size_t sendBufferSize, size_t recvBufferSize, int rank, const ExecutionPlan& plan) { - int nranksPerNode = plan.impl_->nranksPerNode; auto getTransportFlags = [&](std::vector& infos, int rank) { TransportFlags flags; for (ChannelInfo& info : infos) { @@ -113,8 +115,8 @@ struct Executor::Impl { flags |= Transport::CudaIpc; } else if (info.channelType == ChannelType::PROXY) { for (int peer : info.connectedPeers) { - if (!inSameNode(rank, peer, nranksPerNode)) { - flags |= IBs[rank % nranksPerNode]; + if (!inSameNode(rank, peer, this->nranksPerNode)) { + flags |= IBs[rank % this->nranksPerNode]; } } } @@ -222,7 +224,8 @@ struct Executor::Impl { void launchKernel(ExecutionContext& context) {} }; -Executor::Executor(std::shared_ptr comm) : impl_(std::make_unique(comm)) {} +Executor::Executor(std::shared_ptr comm, int nranksPerNode) + : impl_(std::make_unique(comm, nranksPerNode)) {} void Executor::execute(int rank, void* sendbuff, void* recvBuff, size_t sendBuffSize, size_t recvBuffSize, const ExecutionPlan& plan) { diff --git a/src/include/execution_plan.hpp b/src/include/execution_plan.hpp index 70df39c47..759f3ebc7 100644 --- a/src/include/execution_plan.hpp +++ b/src/include/execution_plan.hpp @@ -91,7 +91,6 @@ struct ExecutionPlan::Impl { std::vector> operations; std::unordered_map> channelInfos; std::string name; - int nranksPerNode; std::unordered_map inputChunks; std::unordered_map outputChunks; std::unordered_map scratchChunks; diff --git a/test/execution-files/allreduce.json b/test/execution-files/allreduce.json index 60a8ffe88..58db26c08 100644 --- a/test/execution-files/allreduce.json +++ b/test/execution-files/allreduce.json @@ -3,7 +3,6 @@ "colletive": "allreduce", "protocol": "Simple", "inplace": true, - "nranksPerNode": 8, "gpus": [ { "id": 0, diff --git a/test/executor_test.cc b/test/executor_test.cc index 35c4cce2b..ccc0356a1 100644 --- a/test/executor_test.cc +++ b/test/executor_test.cc @@ -21,7 +21,7 @@ int main() { // sleep 10s // std::this_thread::sleep_for(std::chrono::seconds(20)); auto comm = std::make_shared(bootstrap); - std::shared_ptr executor = std::make_shared(comm); + std::shared_ptr executor = std::make_shared(comm, 8 /*nranksPerNode*/); std::ifstream file(MSCCLPP_ROOT_PATH + "/test/execution-files/allreduce.json"); mscclpp::ExecutionPlan plan(file); From a80bcee1f62ef4f5b51d67851bb6304b60e1066d Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Wed, 3 Apr 2024 07:12:07 +0000 Subject: [PATCH 14/51] build pass --- include/mscclpp/executor.hpp | 2 +- src/executor/execution_plan.cc | 196 ++++++++++++++++++++++----- src/executor/executor.cc | 29 +++- src/include/execution_plan.hpp | 71 +++++++--- test/execution-files/allreduce.json | 200 +++++++++++++++++++--------- test/executor_test.cc | 4 +- 6 files changed, 389 insertions(+), 113 deletions(-) diff --git a/include/mscclpp/executor.hpp b/include/mscclpp/executor.hpp index 1efd2a747..076238336 100644 --- a/include/mscclpp/executor.hpp +++ b/include/mscclpp/executor.hpp @@ -12,7 +12,7 @@ namespace mscclpp { class ExecutionPlan { public: - ExecutionPlan(std::ifstream& file); + ExecutionPlan(std::string planPath); ~ExecutionPlan() = default; private: diff --git a/src/executor/execution_plan.cc b/src/executor/execution_plan.cc index 9b6c284c3..0dff4adbd 100644 --- a/src/executor/execution_plan.cc +++ b/src/executor/execution_plan.cc @@ -4,7 +4,6 @@ #include "execution_plan.hpp" #include -#include #include namespace { @@ -14,17 +13,67 @@ std::vector filter(const std::vector& vec, Predicate pred) { std::copy_if(vec.begin(), vec.end(), std::back_inserter(filtered), pred); return filtered; } + +auto getOpType = [](const std::string& str) { + if (str == "nop") { + return mscclpp::OperationType::BARRIER; + } else if (str == "put") { + return mscclpp::OperationType::PUT; + } else if (str == "get") { + return mscclpp::OperationType::GET; + } else if (str == "copy") { + return mscclpp::OperationType::COPY; + } else if (str == "signal") { + return mscclpp::OperationType::SIGNAL; + } else if (str == "wait") { + return mscclpp::OperationType::WAIT; + } else if (str == "flush") { + return mscclpp::OperationType::FLUSH; + } else if (str == "reduce") { + return mscclpp::OperationType::REDUCE; + } else if (str == "read_reduce_copy") { + return mscclpp::OperationType::READ_REDUCE_COPY; + } else if (str == "read_reduce_copy_put") { + return mscclpp::OperationType::READ_REDUCE_COPY_PUT; + } else { + throw std::runtime_error("Invalid operation type"); + } +}; + +auto convertToBufferType = [](const std::string& str) { + if (str == "i") { + return mscclpp::BufferType::INPUT; + } else if (str == "o") { + return mscclpp::BufferType::OUTPUT; + } else if (str == "s") { + return mscclpp::BufferType::SCRATCH; + } else { + throw std::runtime_error("Invalid buffer type"); + } +}; + +auto convertToChannelType = [](const std::string& str) { + if (str == "sm") { + return mscclpp::ChannelType::SM; + } else if (str == "proxy") { + return mscclpp::ChannelType::PROXY; + } else { + throw std::runtime_error("Invalid channel type"); + } +}; + } // namespace namespace mscclpp { using json = nlohmann::json; -ExecutionPlan::Impl::Impl(std::ifstream& file) { this->loadExecutionPlan(file); } +ExecutionPlan::Impl::Impl(std::string planPath) : planPath(planPath) {} std::vector ExecutionPlan::Impl::getChannelInfos(int rank, ChannelType channelType) const { auto pred = [channelType](const ChannelInfo& info) { return info.channelType == channelType; }; return filter(this->channelInfos.at(rank), pred); } + std::vector ExecutionPlan::Impl::getChannelInfos(int rank, BufferType dstBufferType) const { auto pred = [dstBufferType](const ChannelInfo& info) { return info.dstBufferType == dstBufferType; }; return filter(this->channelInfos.at(rank), pred); @@ -50,44 +99,37 @@ std::vector ExecutionPlan::Impl::getConnectedBufferTypes(int rank) c size_t ExecutionPlan::Impl::getScratchBufferSize(int rank, size_t inputSize) const { return inputSize / this->inputChunks.at(rank) * this->scratchChunks.at(rank); } -std::vector ExecutionPlan::Impl::getOperations(int rank, int threadblock) { - return std::vector(); -} -std::pair ExecutionPlan::Impl::getThreadBlockChannelRange(int rank, int threadblock, BufferType srcBufferType, - BufferType dstBufferType, ChannelType channelType) { - return std::make_pair(0, 0); +std::vector ExecutionPlan::Impl::getOperations(int rank, int threadblock) const { + return this->operations.at(rank)[threadblock]; } -void ExecutionPlan::Impl::loadExecutionPlan(std::ifstream& file) { - auto convertToBufferType = [](const std::string& str) { - if (str == "i") { - return BufferType::INPUT; - } else if (str == "o") { - return BufferType::OUTPUT; - } else if (str == "s") { - return BufferType::SCRATCH; - } else { - throw std::runtime_error("Invalid buffer type"); - } - }; - auto convertToChannelType = [](const std::string& str) { - if (str == "sm") { - return ChannelType::SM; - } else if (str == "proxy") { - return ChannelType::PROXY; - } else { - throw std::runtime_error("Invalid channel type"); - } - }; +int ExecutionPlan::Impl::getThreadblockCount(int rank) const { return this->operations.at(rank).size(); } +void ExecutionPlan::Impl::loadExecutionPlan(size_t inputSize) { + std::ifstream file(this->planPath); json obj = json::parse(file); this->name = obj["name"]; auto gpus = obj["gpus"]; + for (const auto& gpu : gpus) { int rank = gpu["id"]; this->inputChunks[rank] = gpu["inputChunks"]; this->outputChunks[rank] = gpu["outputChunks"]; this->scratchChunks[rank] = gpu["scratchChunks"]; + } + this->setupChannels(gpus); + + uint32_t maxInputChunks = 0; + for (const auto& [rank, chunks] : this->inputChunks) { + maxInputChunks = std::max(maxInputChunks, chunks); + } + this->chunkSize = inputSize / maxInputChunks; + this->setupOperations(gpus); +} + +void ExecutionPlan::Impl::setupChannels(const json& gpus) { + for (const auto& gpu : gpus) { + int rank = gpu["id"]; std::vector channelInfos; for (const auto& channel : gpu["channels"]) { ChannelInfo info; @@ -101,8 +143,102 @@ void ExecutionPlan::Impl::loadExecutionPlan(std::ifstream& file) { } this->channelInfos[rank] = channelInfos; } + + // setup threadblockChannelMap + for (const auto& gpu : gpus) { + int rank = gpu["id"]; + auto channelTypes = {ChannelType::SM, ChannelType::PROXY}; + std::unordered_map> channelMap; + for (auto channelType : channelTypes) { + const std::vector channelInfos = this->getChannelInfos(rank, channelType); + for (size_t i = 0; i < channelInfos.size(); i++) { + const ChannelInfo& info = channelInfos[i]; + ChannelKey key = {info.srcBufferType, info.dstBufferType, info.channelType}; + channelMap[key].push_back(i); + } + } + for (const auto& threadblock : gpu["threadblocks"]) { + for (const auto& channel : threadblock["channels"]) { + ChannelType channelType = convertToChannelType(channel["ctype"]); + ChannelKey key = {convertToBufferType(channel["src"]), convertToBufferType(channel["dst"]), channelType}; + for (int id : channel["cids"]) { + if (channelType == ChannelType::SM) { + this->threadblockSMChannelMap[rank][threadblock["id"]].emplace_back(channelMap[key][id], key); + } else if (channelType == ChannelType::PROXY) { + this->threadblockProxyChannelMap[rank][threadblock["id"]].emplace_back(channelMap[key][id], key); + } + } + } + } + } +} + +void ExecutionPlan::Impl::setupOperations(const json& gpus) { + // setup threadblocks and operations + for (const auto& gpu : gpus) { + int rank = gpu["id"]; + for (const auto& threadblock : gpu["threadblocks"]) { + std::unordered_map> channelIndexes; + std::vector ops; + int threadblockId = threadblock["id"]; + const auto& smChannels = this->threadblockSMChannelMap[rank][threadblockId]; + const auto& proxyChannels = this->threadblockProxyChannelMap[rank][threadblockId]; + for (size_t i = 0; i < smChannels.size(); i++) { + const auto& [_, key] = smChannels[i]; + channelIndexes[key].push_back(i); + } + for (size_t i = 0; i < proxyChannels.size(); i++) { + const auto& [_, key] = proxyChannels[i]; + channelIndexes[key].push_back(i); + } + for (const auto& op : threadblock["ops"]) { + Operation operation = {}; + operation.type = static_cast(getOpType(op["name"])); + if (op.contains("ctype")) { + operation.channelType = convertToChannelType(op["ctype"]); + } + if (op.contains("i_cids")) { + operation.nInputChannels = op["i_cids"].size(); + } + if (op.contains("o_cids")) { + operation.nOutputChannels = op["o_cids"].size(); + } + for (int i = 0; i < operation.nInputChannels; i++) { + BufferType srcBufferType = convertToBufferType(op["i_buff"][i]["src"]); + BufferType dstBufferType = convertToBufferType(op["i_buff"][i]["dst"]); + operation.inputChannelIndex[i] = + channelIndexes[{srcBufferType, dstBufferType, operation.channelType}][op["i_cids"][i]["id"]]; + operation.inputOffset[i] = this->chunkSize * (int)op["i_cids"][i]["offset"]; + } + for (int i = 0; i < operation.nOutputChannels; i++) { + BufferType srcBufferType = convertToBufferType(op["o_buff"][i]["src"]); + BufferType dstBufferType = convertToBufferType(op["o_buff"][i]["dst"]); + operation.outputChannelIndex[i] = + channelIndexes[{srcBufferType, dstBufferType, operation.channelType}][op["o_cids"][i]["id"]]; + operation.outputOffset[i] = this->chunkSize * (int)op["o_cids"][i]["offset"]; + } + if (op.contains("srcbuff")) { + operation.srcBufferType = convertToBufferType(op["srcbuff"]); + } + if (op.contains("srcoff")) { + operation.srcOffset = (int)op["srcoff"] * this->chunkSize; + } + if (op.contains("dstbuff")) { + operation.dstBufferType = convertToBufferType(op["dstbuff"]); + } + if (op.contains("dstoff")) { + operation.dstOffset = (int)op["dstoff"] * this->chunkSize; + } + if (op.contains("cnt")) { + operation.size = this->chunkSize * (int)op["cnt"]; + } + ops.push_back(operation); + } + this->operations[rank].push_back(ops); + } + } } -ExecutionPlan::ExecutionPlan(std::ifstream& file) : impl_(std::make_shared(file)) {} +ExecutionPlan::ExecutionPlan(std::string planPath) : impl_(std::make_shared(planPath)) {} } // namespace mscclpp diff --git a/src/executor/executor.cc b/src/executor/executor.cc index 47ee0be70..445ebccad 100644 --- a/src/executor/executor.cc +++ b/src/executor/executor.cc @@ -60,6 +60,7 @@ struct ExecutionContext { std::vector smChannels; std::vector proxyChannels; std::vector deviceExecutionPlans; + std::vector> operations; std::shared_ptr scratchBuffer; size_t scratchBufferSize; }; @@ -81,6 +82,8 @@ struct Executor::Impl { if (this->contexts.find(key) != this->contexts.end()) { return this->contexts[key]; } + plan.impl_->loadExecutionPlan(sendBufferSize); + ExecutionContext context; size_t scratchBufferSize = plan.impl_->getScratchBufferSize(rank, sendBufferSize); std::shared_ptr scratchBuffer = allocExtSharedCuda(scratchBufferSize); @@ -89,6 +92,7 @@ struct Executor::Impl { this->setupConnections(context, rank, plan); this->setupRegisteredMemories(context, sendbuff, recvbuff, sendBufferSize, recvBufferSize, rank, plan); this->setupChannels(context, sendbuff, recvbuff, sendBufferSize, rank, plan); + this->setupDeviceExecutionPlan(context, rank, plan); return context; } @@ -221,7 +225,30 @@ struct Executor::Impl { } } - void launchKernel(ExecutionContext& context) {} + void setupDeviceExecutionPlan(ExecutionContext& context, int rank, const ExecutionPlan& plan) { + std::vector deviceExecutionPlans; + for (int threadblock = 0; threadblock < plan.impl_->getThreadblockCount(rank); threadblock++) { + DeviceExecutionPlan deviceExecutionPlan; + std::vector ops = plan.impl_->getOperations(rank, threadblock); + context.operations.emplace_back(std::move(ops)); + deviceExecutionPlan.nOperations = ops.size(); + deviceExecutionPlan.nSmChannels = plan.impl_->threadblockSMChannelMap.at(rank).at(threadblock).size(); + deviceExecutionPlan.nProxyChannels = plan.impl_->threadblockProxyChannelMap.at(rank).at(threadblock).size(); + for (const auto& [index, key] : plan.impl_->threadblockSMChannelMap.at(rank).at(threadblock)) { + deviceExecutionPlan.channels.smChannels[index] = mscclpp::deviceHandle(context.smChannels[index]); + } + for (const auto& [index, key] : plan.impl_->threadblockProxyChannelMap.at(rank).at(threadblock)) { + deviceExecutionPlan.channels.proxyChannels[index] = mscclpp::deviceHandle(context.proxyChannels[index]); + } + deviceExecutionPlans.push_back(deviceExecutionPlan); + } + context.deviceExecutionPlans = std::move(deviceExecutionPlans); + } + + void launchKernel(ExecutionContext& context) { + // copy context to shared memory + // launch kernel + } }; Executor::Executor(std::shared_ptr comm, int nranksPerNode) diff --git a/src/include/execution_plan.hpp b/src/include/execution_plan.hpp index 759f3ebc7..768799846 100644 --- a/src/include/execution_plan.hpp +++ b/src/include/execution_plan.hpp @@ -8,11 +8,46 @@ #include #include #include +#include #include #include namespace mscclpp { +enum class BufferType { + INPUT, + OUTPUT, + SCRATCH, +}; + +enum class ChannelType { + SM, + PROXY, +}; + +struct ChannelKey { + BufferType srcBufferType; + BufferType dstBufferType; + ChannelType channelType; + bool operator==(const ChannelKey& other) const { + return srcBufferType == other.srcBufferType && dstBufferType == other.dstBufferType && + channelType == other.channelType; + } +}; +} // namespace mscclpp + +namespace std { +template <> +struct hash { + std::size_t operator()(const mscclpp::ChannelKey& key) const { + return std::hash()(static_cast(key.srcBufferType)) ^ + std::hash()(static_cast(key.dstBufferType)) ^ std::hash()(static_cast(key.channelType)); + } +}; +} // namespace std + +namespace mscclpp { + constexpr int MAX_CHANNEL = 24; constexpr int MAX_CHANNEL_PER_OPERATION = 8; @@ -29,17 +64,6 @@ enum class OperationType { READ_REDUCE_COPY_PUT, }; -enum class ChannelType { - SM, - PROXY, -}; - -enum class BufferType { - INPUT, - OUTPUT, - SCRATCH, -}; - struct ChannelInfo { BufferType srcBufferType; BufferType dstBufferType; @@ -55,10 +79,14 @@ struct Channels { struct Operation { OperationType type; ChannelType channelType; + uint16_t nInputChannels; + uint16_t nOutputChannels; uint16_t inputChannelIndex[MAX_CHANNEL_PER_OPERATION]; uint16_t outputChannelIndex[MAX_CHANNEL_PER_OPERATION]; size_t inputOffset[MAX_CHANNEL_PER_OPERATION]; size_t outputOffset[MAX_CHANNEL_PER_OPERATION]; + BufferType srcBufferType; + BufferType dstBufferType; size_t srcOffset; size_t dstOffset; size_t size; @@ -74,7 +102,7 @@ struct DeviceExecutionPlan { struct ExecutionPlan::Impl { public: - Impl(std::ifstream& file); + Impl(std::string planPath); ~Impl() = default; std::vector getChannelInfos(int rank, ChannelType channelType) const; @@ -82,18 +110,25 @@ struct ExecutionPlan::Impl { std::vector getConnectedPeers(int rank) const; std::vector getConnectedBufferTypes(int rank) const; size_t getScratchBufferSize(int rank, size_t inputSize) const; - std::vector getOperations(int rank, int threadblock); - std::pair getThreadBlockChannelRange(int rank, int threadblock, BufferType srcBufferType, - BufferType dstBufferType, ChannelType channelType); - void loadExecutionPlan(std::ifstream& file); + std::vector getOperations(int rank, int threadblock) const; + int getThreadblockCount(int rank) const; + + void loadExecutionPlan(size_t inputSize); + void setupChannels(const nlohmann::json& gpus); + void setupOperations(const nlohmann::json& gpus); - // operations for [rank][threadblock] - std::vector> operations; + std::string planPath; + // operations for [rank][threadblock] = [operations] + std::unordered_map>> operations; std::unordered_map> channelInfos; + // threadblockChannelMap[rank][threadblock] = [channelIndex] + std::unordered_map>>> threadblockSMChannelMap; + std::unordered_map>>> threadblockProxyChannelMap; std::string name; std::unordered_map inputChunks; std::unordered_map outputChunks; std::unordered_map scratchChunks; + size_t chunkSize; }; } // namespace mscclpp diff --git a/test/execution-files/allreduce.json b/test/execution-files/allreduce.json index 58db26c08..b2032e90b 100644 --- a/test/execution-files/allreduce.json +++ b/test/execution-files/allreduce.json @@ -15,27 +15,33 @@ "ops": [ { "name": "signal", + "o_buff": { + "src": "i", + "dst": "i" + }, "o_cids": [ { "id": 0, "off": 2 } ], - "srcbuff": "i", - "dstbuff": "i", - "ctype": "sm" + "ctype": "sm", + "cnt": 1 }, { "name": "wait", + "i_buff": { + "src": "i", + "dst": "i" + }, "i_cids": [ { "id": 0, "off": 0 } ], - "srcbuff": "i", - "dstbuff": "i", - "ctype": "sm" + "ctype": "sm", + "cnt": 1 }, { "name": "nop", @@ -47,20 +53,28 @@ ] }, { - "name": "rrcs", + "name": "rrs", + "i_buff": { + "src": "i", + "dst": "i" + }, "i_cids": [ { "id": 0, "off": 0 } ], + "o_buff": { + "src": "i", + "dst": "i" + }, "o_cids": [ { "id": 0, "off": 0 } ], - "srcbuff": "i", + "dst": 0, "dstbuff": "i", "dstoff": 0, "ctype": "sm", @@ -77,27 +91,33 @@ }, { "name": "signal", + "o_buff": { + "src": "i", + "dst": "i" + }, "o_cids": [ { "id": 0, "off": 0 } ], - "srcbuff": "i", - "dstbuff": "i", - "ctype": "sm" + "ctype": "sm", + "cnt": 1 }, { "name": "wait", + "i_buff": { + "src": "i", + "dst": "i" + }, "i_cids": [ { "id": 0, "off": 2 } ], - "srcbuff": "i", - "dstbuff": "i", - "ctype": "sm" + "ctype": "sm", + "cnt": 1 } ], "channels": [ @@ -105,7 +125,7 @@ "src": "i", "dst": "i", "ctype": "sm", - "cid": [ + "cids": [ 0 ] } @@ -116,27 +136,33 @@ "ops": [ { "name": "signal", + "o_buff": { + "src": "i", + "dst": "i" + }, "o_cids": [ { "id": 0, "off": 3 } ], - "srcbuff": "i", - "dstbuff": "i", - "ctype": "sm" + "ctype": "sm", + "cnt": 1 }, { "name": "wait", + "i_buff": { + "src": "i", + "dst": "i" + }, "i_cids": [ { "id": 0, "off": 1 } ], - "srcbuff": "i", - "dstbuff": "i", - "ctype": "sm" + "ctype": "sm", + "cnt": 1 }, { "name": "nop", @@ -148,20 +174,28 @@ ] }, { - "name": "rrcs", + "name": "rrs", + "i_buff": { + "src": "i", + "dst": "i" + }, "i_cids": [ { "id": 0, "off": 1 } ], + "o_buff": { + "src": "i", + "dst": "i" + }, "o_cids": [ { "id": 0, "off": 0 } ], - "srcbuff": "i", + "dst": 0, "dstbuff": "i", "dstoff": 1, "ctype": "sm", @@ -178,27 +212,33 @@ }, { "name": "signal", + "o_buff": { + "src": "i", + "dst": "i" + }, "o_cids": [ { "id": 0, "off": 0 } ], - "srcbuff": "i", - "dstbuff": "i", - "ctype": "sm" + "ctype": "sm", + "cnt": 1 }, { "name": "wait", + "i_buff": { + "src": "i", + "dst": "i" + }, "i_cids": [ { "id": 0, "off": 2 } ], - "srcbuff": "i", - "dstbuff": "i", - "ctype": "sm" + "ctype": "sm", + "cnt": 1 } ], "channels": [ @@ -206,7 +246,7 @@ "src": "i", "dst": "i", "ctype": "sm", - "cid": [ + "cids": [ 1 ] } @@ -236,27 +276,33 @@ "ops": [ { "name": "signal", + "o_buff": { + "src": "i", + "dst": "i" + }, "o_cids": [ { "id": 0, "off": 0 } ], - "srcbuff": "i", - "dstbuff": "i", - "ctype": "sm" + "ctype": "sm", + "cnt": 1 }, { "name": "wait", + "i_buff": { + "src": "i", + "dst": "i" + }, "i_cids": [ { "id": 0, "off": 2 } ], - "srcbuff": "i", - "dstbuff": "i", - "ctype": "sm" + "ctype": "sm", + "cnt": 1 }, { "name": "nop", @@ -268,20 +314,28 @@ ] }, { - "name": "rrcs", + "name": "rrs", + "i_buff": { + "src": "i", + "dst": "i" + }, "i_cids": [ { "id": 0, "off": 2 } ], + "o_buff": { + "src": "i", + "dst": "i" + }, "o_cids": [ { "id": 0, "off": 2 } ], - "srcbuff": "i", + "dst": 1, "dstbuff": "i", "dstoff": 2, "ctype": "sm", @@ -298,27 +352,33 @@ }, { "name": "signal", + "o_buff": { + "src": "i", + "dst": "i" + }, "o_cids": [ { "id": 0, "off": 2 } ], - "srcbuff": "i", - "dstbuff": "i", - "ctype": "sm" + "ctype": "sm", + "cnt": 1 }, { "name": "wait", + "i_buff": { + "src": "i", + "dst": "i" + }, "i_cids": [ { "id": 0, "off": 0 } ], - "srcbuff": "i", - "dstbuff": "i", - "ctype": "sm" + "ctype": "sm", + "cnt": 1 } ], "channels": [ @@ -326,7 +386,7 @@ "src": "i", "dst": "i", "ctype": "sm", - "cid": [ + "cids": [ 0 ] } @@ -337,27 +397,33 @@ "ops": [ { "name": "signal", + "o_buff": { + "src": "i", + "dst": "i" + }, "o_cids": [ { "id": 0, "off": 1 } ], - "srcbuff": "i", - "dstbuff": "i", - "ctype": "sm" + "ctype": "sm", + "cnt": 1 }, { "name": "wait", + "i_buff": { + "src": "i", + "dst": "i" + }, "i_cids": [ { "id": 0, "off": 3 } ], - "srcbuff": "i", - "dstbuff": "i", - "ctype": "sm" + "ctype": "sm", + "cnt": 1 }, { "name": "nop", @@ -369,20 +435,28 @@ ] }, { - "name": "rrcs", + "name": "rrs", + "i_buff": { + "src": "i", + "dst": "i" + }, "i_cids": [ { "id": 0, "off": 3 } ], + "o_buff": { + "src": "i", + "dst": "i" + }, "o_cids": [ { "id": 0, "off": 2 } ], - "srcbuff": "i", + "dst": 1, "dstbuff": "i", "dstoff": 3, "ctype": "sm", @@ -399,27 +473,33 @@ }, { "name": "signal", + "o_buff": { + "src": "i", + "dst": "i" + }, "o_cids": [ { "id": 0, "off": 2 } ], - "srcbuff": "i", - "dstbuff": "i", - "ctype": "sm" + "ctype": "sm", + "cnt": 1 }, { "name": "wait", + "i_buff": { + "src": "i", + "dst": "i" + }, "i_cids": [ { "id": 0, "off": 0 } ], - "srcbuff": "i", - "dstbuff": "i", - "ctype": "sm" + "ctype": "sm", + "cnt": 1 } ], "channels": [ @@ -427,7 +507,7 @@ "src": "i", "dst": "i", "ctype": "sm", - "cid": [ + "cids": [ 1 ] } diff --git a/test/executor_test.cc b/test/executor_test.cc index ccc0356a1..865ba2122 100644 --- a/test/executor_test.cc +++ b/test/executor_test.cc @@ -1,6 +1,5 @@ #include -#include #include const std::string MSCCLPP_ROOT_PATH = "/root/mscclpp"; @@ -23,8 +22,7 @@ int main() { auto comm = std::make_shared(bootstrap); std::shared_ptr executor = std::make_shared(comm, 8 /*nranksPerNode*/); - std::ifstream file(MSCCLPP_ROOT_PATH + "/test/execution-files/allreduce.json"); - mscclpp::ExecutionPlan plan(file); + mscclpp::ExecutionPlan plan(MSCCLPP_ROOT_PATH + "/test/execution-files/allreduce.json"); std::shared_ptr sendbuff = mscclpp::allocExtSharedCuda(1024); std::shared_ptr recvbuff = mscclpp::allocExtSharedCuda(1024); executor->execute(rank, sendbuff.get(), recvbuff.get(), 1024, 1024, plan); From a1a11da7ed83a76e0188482ea056a7d6fc6cf656 Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Wed, 3 Apr 2024 08:27:18 +0000 Subject: [PATCH 15/51] update struct --- src/executor/executor.cc | 3 +++ src/include/execution_plan.hpp | 38 ++++++++++++++++++---------------- 2 files changed, 23 insertions(+), 18 deletions(-) diff --git a/src/executor/executor.cc b/src/executor/executor.cc index 445ebccad..173870b74 100644 --- a/src/executor/executor.cc +++ b/src/executor/executor.cc @@ -247,6 +247,9 @@ struct Executor::Impl { void launchKernel(ExecutionContext& context) { // copy context to shared memory + // std::cout << sizeof(Channels) << std::endl; + // std::cout << sizeof(Operation) << std::endl; + // std::cout << sizeof(DeviceExecutionPlan) << std::endl; // launch kernel } }; diff --git a/src/include/execution_plan.hpp b/src/include/execution_plan.hpp index 768799846..25595b093 100644 --- a/src/include/execution_plan.hpp +++ b/src/include/execution_plan.hpp @@ -14,13 +14,13 @@ namespace mscclpp { -enum class BufferType { +enum class BufferType : uint8_t { INPUT, OUTPUT, SCRATCH, }; -enum class ChannelType { +enum class ChannelType : uint8_t { SM, PROXY, }; @@ -48,10 +48,11 @@ struct hash { namespace mscclpp { -constexpr int MAX_CHANNEL = 24; +constexpr int MAX_CHANNEL = 16; constexpr int MAX_CHANNEL_PER_OPERATION = 8; +constexpr int MAX_OPERATION = 64; -enum class OperationType { +enum class OperationType : uint8_t { BARRIER, PUT, GET, @@ -79,25 +80,26 @@ struct Channels { struct Operation { OperationType type; ChannelType channelType; - uint16_t nInputChannels; - uint16_t nOutputChannels; - uint16_t inputChannelIndex[MAX_CHANNEL_PER_OPERATION]; - uint16_t outputChannelIndex[MAX_CHANNEL_PER_OPERATION]; - size_t inputOffset[MAX_CHANNEL_PER_OPERATION]; - size_t outputOffset[MAX_CHANNEL_PER_OPERATION]; BufferType srcBufferType; BufferType dstBufferType; - size_t srcOffset; - size_t dstOffset; - size_t size; + uint8_t nInputChannels; + uint8_t nOutputChannels; + uint8_t inputChannelIndex[MAX_CHANNEL_PER_OPERATION]; + uint8_t outputChannelIndex[MAX_CHANNEL_PER_OPERATION]; + uint32_t inputOffset[MAX_CHANNEL_PER_OPERATION]; + uint32_t outputOffset[MAX_CHANNEL_PER_OPERATION]; + uint32_t srcOffset; + uint32_t dstOffset; + uint32_t size; }; +// total size = 1920 + 6400 + 4 + 4(padding) = 8324 bytes struct DeviceExecutionPlan { - int nSmChannels; - int nProxyChannels; - int nOperations; - Channels channels; - Operation operations[1]; + uint8_t nSmChannels; // 1 bytes + uint8_t nProxyChannels; // 1 bytes + uint16_t nOperations; // 2 bytes + Channels channels; // 1920 bytes + Operation operations[MAX_OPERATION]; // 64 * 100 = 6400 bytes }; struct ExecutionPlan::Impl { From 4b5668c0487f17d4f9d8df77a25ac34c06bc5128 Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Wed, 3 Apr 2024 10:00:57 +0000 Subject: [PATCH 16/51] fix --- src/executor/execution_plan.cc | 27 ++++++++++++++++----------- src/include/execution_plan.hpp | 5 +++-- test/executor_test.cc | 2 +- 3 files changed, 20 insertions(+), 14 deletions(-) diff --git a/src/executor/execution_plan.cc b/src/executor/execution_plan.cc index 0dff4adbd..7753bc0b9 100644 --- a/src/executor/execution_plan.cc +++ b/src/executor/execution_plan.cc @@ -29,12 +29,14 @@ auto getOpType = [](const std::string& str) { return mscclpp::OperationType::WAIT; } else if (str == "flush") { return mscclpp::OperationType::FLUSH; - } else if (str == "reduce") { + } else if (str == "re") { return mscclpp::OperationType::REDUCE; - } else if (str == "read_reduce_copy") { - return mscclpp::OperationType::READ_REDUCE_COPY; - } else if (str == "read_reduce_copy_put") { - return mscclpp::OperationType::READ_REDUCE_COPY_PUT; + } else if (str == "rs") { + return mscclpp::OperationType::REDUCE_SEND; + } else if (str == "rr") { + return mscclpp::OperationType::READ_REDUCE; + } else if (str == "rrs") { + return mscclpp::OperationType::READ_REDUCE_SEND; } else { throw std::runtime_error("Invalid operation type"); } @@ -157,6 +159,9 @@ void ExecutionPlan::Impl::setupChannels(const json& gpus) { channelMap[key].push_back(i); } } + int nthreadblocks = gpu["threadblocks"].size(); + this->threadblockSMChannelMap[rank].resize(nthreadblocks); + this->threadblockProxyChannelMap[rank].resize(nthreadblocks); for (const auto& threadblock : gpu["threadblocks"]) { for (const auto& channel : threadblock["channels"]) { ChannelType channelType = convertToChannelType(channel["ctype"]); @@ -204,18 +209,18 @@ void ExecutionPlan::Impl::setupOperations(const json& gpus) { operation.nOutputChannels = op["o_cids"].size(); } for (int i = 0; i < operation.nInputChannels; i++) { - BufferType srcBufferType = convertToBufferType(op["i_buff"][i]["src"]); - BufferType dstBufferType = convertToBufferType(op["i_buff"][i]["dst"]); + BufferType srcBufferType = convertToBufferType(op["i_buff"]["src"]); + BufferType dstBufferType = convertToBufferType(op["i_buff"]["dst"]); operation.inputChannelIndex[i] = channelIndexes[{srcBufferType, dstBufferType, operation.channelType}][op["i_cids"][i]["id"]]; - operation.inputOffset[i] = this->chunkSize * (int)op["i_cids"][i]["offset"]; + operation.inputOffset[i] = this->chunkSize * (int)op["i_cids"][i]["off"]; } for (int i = 0; i < operation.nOutputChannels; i++) { - BufferType srcBufferType = convertToBufferType(op["o_buff"][i]["src"]); - BufferType dstBufferType = convertToBufferType(op["o_buff"][i]["dst"]); + BufferType srcBufferType = convertToBufferType(op["o_buff"]["src"]); + BufferType dstBufferType = convertToBufferType(op["o_buff"]["dst"]); operation.outputChannelIndex[i] = channelIndexes[{srcBufferType, dstBufferType, operation.channelType}][op["o_cids"][i]["id"]]; - operation.outputOffset[i] = this->chunkSize * (int)op["o_cids"][i]["offset"]; + operation.outputOffset[i] = this->chunkSize * (int)op["o_cids"][i]["off"]; } if (op.contains("srcbuff")) { operation.srcBufferType = convertToBufferType(op["srcbuff"]); diff --git a/src/include/execution_plan.hpp b/src/include/execution_plan.hpp index 25595b093..bfa51b503 100644 --- a/src/include/execution_plan.hpp +++ b/src/include/execution_plan.hpp @@ -61,8 +61,9 @@ enum class OperationType : uint8_t { WAIT, FLUSH, REDUCE, - READ_REDUCE_COPY, - READ_REDUCE_COPY_PUT, + REDUCE_SEND, + READ_REDUCE, + READ_REDUCE_SEND, }; struct ChannelInfo { diff --git a/test/executor_test.cc b/test/executor_test.cc index 865ba2122..995049ccf 100644 --- a/test/executor_test.cc +++ b/test/executor_test.cc @@ -18,7 +18,7 @@ int main() { MPI_Bcast(&id, sizeof(id), MPI_BYTE, 0, MPI_COMM_WORLD); bootstrap->initialize(id); // sleep 10s - // std::this_thread::sleep_for(std::chrono::seconds(20)); + std::this_thread::sleep_for(std::chrono::seconds(20)); auto comm = std::make_shared(bootstrap); std::shared_ptr executor = std::make_shared(comm, 8 /*nranksPerNode*/); From d47ac6581d1beb63667d424dad6e9661e758a6d1 Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Wed, 3 Apr 2024 14:29:17 +0000 Subject: [PATCH 17/51] try to launch kernel --- include/mscclpp/executor.hpp | 2 +- src/CMakeLists.txt | 2 +- src/executor/execution_kernel.cu | 29 ++++-------- src/executor/executor.cc | 31 ++++++++----- src/include/execution_kernel.hpp | 78 ++++++++++++++++++++++++++++++++ src/include/execution_plan.hpp | 63 +------------------------- test/executor_test.cc | 2 +- 7 files changed, 112 insertions(+), 95 deletions(-) create mode 100644 src/include/execution_kernel.hpp diff --git a/include/mscclpp/executor.hpp b/include/mscclpp/executor.hpp index 076238336..985ffba3a 100644 --- a/include/mscclpp/executor.hpp +++ b/include/mscclpp/executor.hpp @@ -29,7 +29,7 @@ class Executor { Executor& operator=(const Executor&) = delete; ~Executor(); - void execute(int rank, void* sendbuff, void* recvBuff, size_t sendBuffSize, size_t recvBuffSize, + void execute(int rank, void* sendbuff, void* recvBuff, size_t sendBuffSize, size_t recvBuffSize, int nthreads, const ExecutionPlan& plan); private: diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index cfbcc927a..45b4075d2 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -1,6 +1,6 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. -file(GLOB_RECURSE SOURCES CONFIGURE_DEPENDS *.cc) +file(GLOB_RECURSE SOURCES CONFIGURE_DEPENDS *.cc *.cu) target_sources(mscclpp_obj PRIVATE ${SOURCES}) target_include_directories(mscclpp_obj PRIVATE include) diff --git a/src/executor/execution_kernel.cu b/src/executor/execution_kernel.cu index 6b467dbcb..9b22bcc0d 100644 --- a/src/executor/execution_kernel.cu +++ b/src/executor/execution_kernel.cu @@ -1,23 +1,12 @@ -// // Copyright (c) Microsoft Corporation. -// // Licensed under the MIT license. +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. -// #include "execution_plan.hpp" +#include "execution_kernel.hpp" -// extern __shared__ mscclpp::DeviceExecutionPlan sharedMem[]; +namespace mscclpp { +__global__ void kernel(DeviceExecutionPlan* plan) {} -// __global__ void commnuication_kernel(void* sendbuff, void* recvbuff, void* scratchbuff, size_t chunkSize) { -// // read data from shared memory -// // 1. get the number of command from shared memory -// int nOps = sharedMem->nOperations; -// mscclpp::DeviceHandle* smChannel = sharedMem->channels.smChannels; -// mscclpp::DeviceHandle* proxyChannel = sharedMem->channels.proxyChannels; -// for (int opId = 0; opId < nOps; opId++) { -// // 2. get the command -// mscclpp::Operation* op = sharedMem->operations + opId; -// // 3. execute the command -// switch (op->type) { -// default: -// break; -// } -// } -// } +void ExecutionKernel::launchKernel(int nthreadblocks, int nthreads, DeviceExecutionPlan* plan, cudaStream_t stream) { + kernel<<>>(plan); +} +} // namespace mscclpp diff --git a/src/executor/executor.cc b/src/executor/executor.cc index 173870b74..26717d2d4 100644 --- a/src/executor/executor.cc +++ b/src/executor/executor.cc @@ -6,6 +6,7 @@ #include #include +#include "execution_kernel.hpp" #include "execution_plan.hpp" namespace mscclpp { @@ -60,9 +61,9 @@ struct ExecutionContext { std::vector smChannels; std::vector proxyChannels; std::vector deviceExecutionPlans; - std::vector> operations; std::shared_ptr scratchBuffer; size_t scratchBufferSize; + std::shared_ptr deviceExecutionPlansBuffer; }; struct Executor::Impl { @@ -70,8 +71,10 @@ struct Executor::Impl { std::shared_ptr comm; std::shared_ptr proxyService; std::unordered_map contexts; + CudaStreamWithFlags stream; - Impl(std::shared_ptr comm, int nranksPerNode) : nranksPerNode(nranksPerNode), comm(comm) { + Impl(std::shared_ptr comm, int nranksPerNode) + : nranksPerNode(nranksPerNode), comm(comm), stream(cudaStreamNonBlocking) { this->proxyService = std::make_shared(); } ~Impl() = default; @@ -93,6 +96,12 @@ struct Executor::Impl { this->setupRegisteredMemories(context, sendbuff, recvbuff, sendBufferSize, recvBufferSize, rank, plan); this->setupChannels(context, sendbuff, recvbuff, sendBufferSize, rank, plan); this->setupDeviceExecutionPlan(context, rank, plan); + context.deviceExecutionPlansBuffer = + allocExtSharedCuda(context.deviceExecutionPlans.size() * sizeof(DeviceExecutionPlan)); + MSCCLPP_CUDATHROW(cudaMemcpyAsync(context.deviceExecutionPlansBuffer.get(), context.deviceExecutionPlans.data(), + context.deviceExecutionPlans.size() * sizeof(DeviceExecutionPlan), + cudaMemcpyHostToDevice, stream)); + MSCCLPP_CUDATHROW(cudaStreamSynchronize(stream)); return context; } @@ -230,7 +239,6 @@ struct Executor::Impl { for (int threadblock = 0; threadblock < plan.impl_->getThreadblockCount(rank); threadblock++) { DeviceExecutionPlan deviceExecutionPlan; std::vector ops = plan.impl_->getOperations(rank, threadblock); - context.operations.emplace_back(std::move(ops)); deviceExecutionPlan.nOperations = ops.size(); deviceExecutionPlan.nSmChannels = plan.impl_->threadblockSMChannelMap.at(rank).at(threadblock).size(); deviceExecutionPlan.nProxyChannels = plan.impl_->threadblockProxyChannelMap.at(rank).at(threadblock).size(); @@ -240,28 +248,29 @@ struct Executor::Impl { for (const auto& [index, key] : plan.impl_->threadblockProxyChannelMap.at(rank).at(threadblock)) { deviceExecutionPlan.channels.proxyChannels[index] = mscclpp::deviceHandle(context.proxyChannels[index]); } + for (size_t i = 0; i < ops.size(); i++) { + deviceExecutionPlan.operations[i] = ops[i]; + } deviceExecutionPlans.push_back(deviceExecutionPlan); } context.deviceExecutionPlans = std::move(deviceExecutionPlans); } - void launchKernel(ExecutionContext& context) { - // copy context to shared memory - // std::cout << sizeof(Channels) << std::endl; - // std::cout << sizeof(Operation) << std::endl; - // std::cout << sizeof(DeviceExecutionPlan) << std::endl; - // launch kernel + void launchKernel(ExecutionContext& context, int nthreadsPerBlock) { + int nthreadblocks = context.deviceExecutionPlans.size(); + ExecutionKernel::launchKernel(nthreadblocks, nthreadsPerBlock, + (DeviceExecutionPlan*)context.deviceExecutionPlansBuffer.get(), this->stream); } }; Executor::Executor(std::shared_ptr comm, int nranksPerNode) : impl_(std::make_unique(comm, nranksPerNode)) {} -void Executor::execute(int rank, void* sendbuff, void* recvBuff, size_t sendBuffSize, size_t recvBuffSize, +void Executor::execute(int rank, void* sendbuff, void* recvBuff, size_t sendBuffSize, size_t recvBuffSize, int nthreads, const ExecutionPlan& plan) { ExecutionContext context = this->impl_->setupExecutionContext(rank, sendbuff, recvBuff, sendBuffSize, recvBuffSize, plan); - this->impl_->launchKernel(context); + this->impl_->launchKernel(context, nthreads); } Executor::~Executor() = default; diff --git a/src/include/execution_kernel.hpp b/src/include/execution_kernel.hpp new file mode 100644 index 000000000..e0c607a7a --- /dev/null +++ b/src/include/execution_kernel.hpp @@ -0,0 +1,78 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. + +#ifndef MSCCLPP_EXECUTION_KERNEL_HPP_ +#define MSCCLPP_EXECUTION_KERNEL_HPP_ + +#include +#include + +namespace mscclpp { + +constexpr int MAX_CHANNEL = 16; +constexpr int MAX_CHANNEL_PER_OPERATION = 8; +constexpr int MAX_OPERATION = 64; + +enum class BufferType : uint8_t { + INPUT, + OUTPUT, + SCRATCH, +}; + +enum class ChannelType : uint8_t { + SM, + PROXY, +}; + +enum class OperationType : uint8_t { + BARRIER, + PUT, + GET, + COPY, + SIGNAL, + WAIT, + FLUSH, + REDUCE, + REDUCE_SEND, + READ_REDUCE, + READ_REDUCE_SEND, +}; + +struct Channels { + mscclpp::DeviceHandle smChannels[MAX_CHANNEL]; + mscclpp::DeviceHandle proxyChannels[MAX_CHANNEL]; +}; + +struct Operation { + OperationType type; + ChannelType channelType; + BufferType srcBufferType; + BufferType dstBufferType; + uint8_t nInputChannels; + uint8_t nOutputChannels; + uint8_t inputChannelIndex[MAX_CHANNEL_PER_OPERATION]; + uint8_t outputChannelIndex[MAX_CHANNEL_PER_OPERATION]; + uint32_t inputOffset[MAX_CHANNEL_PER_OPERATION]; + uint32_t outputOffset[MAX_CHANNEL_PER_OPERATION]; + uint32_t srcOffset; + uint32_t dstOffset; + uint32_t size; +}; + +// total size = 1920 + 6400 + 4 + 4(padding) = 8324 bytes +struct DeviceExecutionPlan { + uint8_t nSmChannels; // 1 bytes + uint8_t nProxyChannels; // 1 bytes + uint16_t nOperations; // 2 bytes + Channels channels; // 1920 bytes + Operation operations[MAX_OPERATION]; // 64 * 100 = 6400 bytes +}; + +class ExecutionKernel { + public: + static void launchKernel(int nthreadblocks, int nthreads, DeviceExecutionPlan* plan, cudaStream_t stream); +}; + +} // namespace mscclpp + +#endif // MSCCLPP_EXECUTION_KERNEL_HPP_ diff --git a/src/include/execution_plan.hpp b/src/include/execution_plan.hpp index bfa51b503..3575390ba 100644 --- a/src/include/execution_plan.hpp +++ b/src/include/execution_plan.hpp @@ -6,24 +6,13 @@ #include #include -#include -#include #include #include #include -namespace mscclpp { - -enum class BufferType : uint8_t { - INPUT, - OUTPUT, - SCRATCH, -}; +#include "execution_kernel.hpp" -enum class ChannelType : uint8_t { - SM, - PROXY, -}; +namespace mscclpp { struct ChannelKey { BufferType srcBufferType; @@ -48,24 +37,6 @@ struct hash { namespace mscclpp { -constexpr int MAX_CHANNEL = 16; -constexpr int MAX_CHANNEL_PER_OPERATION = 8; -constexpr int MAX_OPERATION = 64; - -enum class OperationType : uint8_t { - BARRIER, - PUT, - GET, - COPY, - SIGNAL, - WAIT, - FLUSH, - REDUCE, - REDUCE_SEND, - READ_REDUCE, - READ_REDUCE_SEND, -}; - struct ChannelInfo { BufferType srcBufferType; BufferType dstBufferType; @@ -73,36 +44,6 @@ struct ChannelInfo { std::vector connectedPeers; }; -struct Channels { - mscclpp::DeviceHandle smChannels[MAX_CHANNEL]; - mscclpp::DeviceHandle proxyChannels[MAX_CHANNEL]; -}; - -struct Operation { - OperationType type; - ChannelType channelType; - BufferType srcBufferType; - BufferType dstBufferType; - uint8_t nInputChannels; - uint8_t nOutputChannels; - uint8_t inputChannelIndex[MAX_CHANNEL_PER_OPERATION]; - uint8_t outputChannelIndex[MAX_CHANNEL_PER_OPERATION]; - uint32_t inputOffset[MAX_CHANNEL_PER_OPERATION]; - uint32_t outputOffset[MAX_CHANNEL_PER_OPERATION]; - uint32_t srcOffset; - uint32_t dstOffset; - uint32_t size; -}; - -// total size = 1920 + 6400 + 4 + 4(padding) = 8324 bytes -struct DeviceExecutionPlan { - uint8_t nSmChannels; // 1 bytes - uint8_t nProxyChannels; // 1 bytes - uint16_t nOperations; // 2 bytes - Channels channels; // 1920 bytes - Operation operations[MAX_OPERATION]; // 64 * 100 = 6400 bytes -}; - struct ExecutionPlan::Impl { public: Impl(std::string planPath); diff --git a/test/executor_test.cc b/test/executor_test.cc index 995049ccf..8a82f9fc3 100644 --- a/test/executor_test.cc +++ b/test/executor_test.cc @@ -25,7 +25,7 @@ int main() { mscclpp::ExecutionPlan plan(MSCCLPP_ROOT_PATH + "/test/execution-files/allreduce.json"); std::shared_ptr sendbuff = mscclpp::allocExtSharedCuda(1024); std::shared_ptr recvbuff = mscclpp::allocExtSharedCuda(1024); - executor->execute(rank, sendbuff.get(), recvbuff.get(), 1024, 1024, plan); + executor->execute(rank, sendbuff.get(), recvbuff.get(), 1024, 1024, 512, plan); MPI_Finalize(); return 0; From 0b4c19a89ce72546720839c249d997801b30c365 Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Wed, 3 Apr 2024 14:50:46 +0000 Subject: [PATCH 18/51] WIP --- src/executor/execution_kernel.cu | 26 +++++++++++++++++++++++--- src/executor/executor.cc | 4 +++- src/include/execution_kernel.hpp | 3 ++- 3 files changed, 28 insertions(+), 5 deletions(-) diff --git a/src/executor/execution_kernel.cu b/src/executor/execution_kernel.cu index 9b22bcc0d..4bcd4e7e9 100644 --- a/src/executor/execution_kernel.cu +++ b/src/executor/execution_kernel.cu @@ -1,12 +1,32 @@ // Copyright (c) Microsoft Corporation. // Licensed under the MIT license. +#include + #include "execution_kernel.hpp" +#if defined(MSCCLPP_DEVICE_HIP) +#define __synclds() asm volatile("s_waitcnt lgkmcnt(0) \n s_barrier"); +#endif // defined(MSCCLPP_DEVICE_HIP) + namespace mscclpp { -__global__ void kernel(DeviceExecutionPlan* plan) {} +__global__ void kernel(DeviceExecutionPlan* plan) { + extern __shared__ int sharedMem[]; + int bid = blockIdx.x; + int tid = threadIdx.x; + DeviceExecutionPlan* localPlan = plan + bid; + for (int i = tid; i < sizeof(DeviceExecutionPlan); i += blockDim.x) { + sharedMem[i] = ((int*)localPlan)[i]; + } +#if defined(MSCCLPP_DEVICE_HIP) + __synclds(); +#else // !defined(MSCCLPP_DEVICE_HIP) + __syncthreads(); +#endif // !defined(MSCCLPP_DEVICE_HIP) +} -void ExecutionKernel::launchKernel(int nthreadblocks, int nthreads, DeviceExecutionPlan* plan, cudaStream_t stream) { - kernel<<>>(plan); +void ExecutionKernel::launchKernel(int nthreadblocks, int nthreads, DeviceExecutionPlan* plan, size_t sharedMemSize, + cudaStream_t stream) { + kernel<<>>(plan); } } // namespace mscclpp diff --git a/src/executor/executor.cc b/src/executor/executor.cc index 26717d2d4..507c606d6 100644 --- a/src/executor/executor.cc +++ b/src/executor/executor.cc @@ -258,8 +258,10 @@ struct Executor::Impl { void launchKernel(ExecutionContext& context, int nthreadsPerBlock) { int nthreadblocks = context.deviceExecutionPlans.size(); + size_t sharedMemSize = sizeof(DeviceExecutionPlan); ExecutionKernel::launchKernel(nthreadblocks, nthreadsPerBlock, - (DeviceExecutionPlan*)context.deviceExecutionPlansBuffer.get(), this->stream); + (DeviceExecutionPlan*)context.deviceExecutionPlansBuffer.get(), sharedMemSize, + this->stream); } }; diff --git a/src/include/execution_kernel.hpp b/src/include/execution_kernel.hpp index e0c607a7a..2dfda4011 100644 --- a/src/include/execution_kernel.hpp +++ b/src/include/execution_kernel.hpp @@ -70,7 +70,8 @@ struct DeviceExecutionPlan { class ExecutionKernel { public: - static void launchKernel(int nthreadblocks, int nthreads, DeviceExecutionPlan* plan, cudaStream_t stream); + static void launchKernel(int nthreadblocks, int nthreads, DeviceExecutionPlan* plan, size_t sharedMemSize, + cudaStream_t stream); }; } // namespace mscclpp From c14aac266cf3ed993a9016bccd71bc171045a57d Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Thu, 4 Apr 2024 03:41:26 +0000 Subject: [PATCH 19/51] WIP --- include/mscclpp/executor.hpp | 2 +- src/executor/execution_kernel.cu | 69 +++++++++++++++++++++++++++++--- src/executor/executor.cc | 19 +++++---- src/include/execution_kernel.hpp | 2 +- test/executor_test.cc | 19 +++++++-- 5 files changed, 91 insertions(+), 20 deletions(-) diff --git a/include/mscclpp/executor.hpp b/include/mscclpp/executor.hpp index 985ffba3a..a98853776 100644 --- a/include/mscclpp/executor.hpp +++ b/include/mscclpp/executor.hpp @@ -30,7 +30,7 @@ class Executor { ~Executor(); void execute(int rank, void* sendbuff, void* recvBuff, size_t sendBuffSize, size_t recvBuffSize, int nthreads, - const ExecutionPlan& plan); + const ExecutionPlan& plan, cudaStream_t stream); private: struct Impl; diff --git a/src/executor/execution_kernel.cu b/src/executor/execution_kernel.cu index 4bcd4e7e9..47333b62c 100644 --- a/src/executor/execution_kernel.cu +++ b/src/executor/execution_kernel.cu @@ -10,12 +10,39 @@ #endif // defined(MSCCLPP_DEVICE_HIP) namespace mscclpp { -__global__ void kernel(DeviceExecutionPlan* plan) { + +MSCCLPP_DEVICE_INLINE void handleSignal(int tid, DeviceHandle* smChannels, + DeviceHandle* proxyChannels, uint8_t* channelIndex, + int nChannels, ChannelType chType) { + if (tid < nChannels) { + if (chType == ChannelType::SM) { + smChannels[channelIndex[tid]].signal(); + } + if (chType == ChannelType::PROXY) { + proxyChannels[channelIndex[tid]].signal(); + } + } +} + +MSCCLPP_DEVICE_INLINE void handleWait(int tid, DeviceHandle* smChannels, + DeviceHandle* proxyChannels, uint8_t* channelIndex, + int nChannels, ChannelType chType) { + if (tid < nChannels) { + if (chType == ChannelType::SM) { + smChannels[channelIndex[tid]].wait(); + } + if (chType == ChannelType::PROXY) { + proxyChannels[channelIndex[tid]].wait(); + } + } +} + +__global__ void kernel(int rank, DeviceExecutionPlan* plan) { extern __shared__ int sharedMem[]; int bid = blockIdx.x; int tid = threadIdx.x; DeviceExecutionPlan* localPlan = plan + bid; - for (int i = tid; i < sizeof(DeviceExecutionPlan); i += blockDim.x) { + for (int i = tid; i < sizeof(DeviceExecutionPlan) / sizeof(int); i += blockDim.x) { sharedMem[i] = ((int*)localPlan)[i]; } #if defined(MSCCLPP_DEVICE_HIP) @@ -23,10 +50,42 @@ __global__ void kernel(DeviceExecutionPlan* plan) { #else // !defined(MSCCLPP_DEVICE_HIP) __syncthreads(); #endif // !defined(MSCCLPP_DEVICE_HIP) + Operation* operations = localPlan->operations; + DeviceHandle* smChannels = localPlan->channels.smChannels; + DeviceHandle* proxyChannels = localPlan->channels.proxyChannels; + if (bid > 0) { + return; + } + for (int i = 0; i < localPlan->nOperations; i++) { + switch (operations[i].type) { + case OperationType::BARRIER: + __syncthreads(); + break; + case OperationType::SIGNAL: + // if (tid == 0) { + // printf("rank: %d bid: %d, noutputchannels: %d outputChannelIndex %d\n", rank, bid, + // operations[i].nOutputChannels, operations[i].outputChannelIndex[0]); + // } + handleSignal(tid, smChannels, proxyChannels, operations[i].outputChannelIndex, operations[i].nOutputChannels, + operations[i].channelType); + break; + case OperationType::WAIT: + // if (tid == 0) { + // printf("rank: %d bid: %d, ninputchannels: %d inputChannelIndex %d\n", rank, bid, + // operations[i].nInputChannels, + // operations[i].inputChannelIndex[0]); + // } + handleWait(tid, smChannels, proxyChannels, operations[i].inputChannelIndex, operations[i].nInputChannels, + operations[i].channelType); + break; + default: + break; + } + } } -void ExecutionKernel::launchKernel(int nthreadblocks, int nthreads, DeviceExecutionPlan* plan, size_t sharedMemSize, - cudaStream_t stream) { - kernel<<>>(plan); +void ExecutionKernel::launchKernel(int rank, int nthreadblocks, int nthreads, DeviceExecutionPlan* plan, + size_t sharedMemSize, cudaStream_t stream) { + kernel<<>>(rank, plan); } } // namespace mscclpp diff --git a/src/executor/executor.cc b/src/executor/executor.cc index 507c606d6..dcdebd7ec 100644 --- a/src/executor/executor.cc +++ b/src/executor/executor.cc @@ -71,16 +71,14 @@ struct Executor::Impl { std::shared_ptr comm; std::shared_ptr proxyService; std::unordered_map contexts; - CudaStreamWithFlags stream; - Impl(std::shared_ptr comm, int nranksPerNode) - : nranksPerNode(nranksPerNode), comm(comm), stream(cudaStreamNonBlocking) { + Impl(std::shared_ptr comm, int nranksPerNode) : nranksPerNode(nranksPerNode), comm(comm) { this->proxyService = std::make_shared(); } ~Impl() = default; ExecutionContext setupExecutionContext(int rank, void* sendbuff, void* recvbuff, size_t sendBufferSize, - size_t recvBufferSize, const ExecutionPlan& plan) { + size_t recvBufferSize, const ExecutionPlan& plan, cudaStream_t stream) { ExecutionContextKey key = {sendbuff, recvbuff, sendBufferSize, recvBufferSize, plan.impl_->name}; if (this->contexts.find(key) != this->contexts.end()) { return this->contexts[key]; @@ -102,6 +100,7 @@ struct Executor::Impl { context.deviceExecutionPlans.size() * sizeof(DeviceExecutionPlan), cudaMemcpyHostToDevice, stream)); MSCCLPP_CUDATHROW(cudaStreamSynchronize(stream)); + this->contexts.insert({key, context}); return context; } @@ -256,12 +255,12 @@ struct Executor::Impl { context.deviceExecutionPlans = std::move(deviceExecutionPlans); } - void launchKernel(ExecutionContext& context, int nthreadsPerBlock) { + void launchKernel(ExecutionContext& context, int rank, int nthreadsPerBlock, cudaStream_t stream) { int nthreadblocks = context.deviceExecutionPlans.size(); size_t sharedMemSize = sizeof(DeviceExecutionPlan); - ExecutionKernel::launchKernel(nthreadblocks, nthreadsPerBlock, + ExecutionKernel::launchKernel(rank, nthreadblocks, nthreadsPerBlock, (DeviceExecutionPlan*)context.deviceExecutionPlansBuffer.get(), sharedMemSize, - this->stream); + stream); } }; @@ -269,10 +268,10 @@ Executor::Executor(std::shared_ptr comm, int nranksPerNode) : impl_(std::make_unique(comm, nranksPerNode)) {} void Executor::execute(int rank, void* sendbuff, void* recvBuff, size_t sendBuffSize, size_t recvBuffSize, int nthreads, - const ExecutionPlan& plan) { + const ExecutionPlan& plan, cudaStream_t stream) { ExecutionContext context = - this->impl_->setupExecutionContext(rank, sendbuff, recvBuff, sendBuffSize, recvBuffSize, plan); - this->impl_->launchKernel(context, nthreads); + this->impl_->setupExecutionContext(rank, sendbuff, recvBuff, sendBuffSize, recvBuffSize, plan, stream); + this->impl_->launchKernel(context, rank, nthreads, stream); } Executor::~Executor() = default; diff --git a/src/include/execution_kernel.hpp b/src/include/execution_kernel.hpp index 2dfda4011..ff79dcc39 100644 --- a/src/include/execution_kernel.hpp +++ b/src/include/execution_kernel.hpp @@ -70,7 +70,7 @@ struct DeviceExecutionPlan { class ExecutionKernel { public: - static void launchKernel(int nthreadblocks, int nthreads, DeviceExecutionPlan* plan, size_t sharedMemSize, + static void launchKernel(int rank, int nthreadblocks, int nthreads, DeviceExecutionPlan* plan, size_t sharedMemSize, cudaStream_t stream); }; diff --git a/test/executor_test.cc b/test/executor_test.cc index 8a82f9fc3..76a580d36 100644 --- a/test/executor_test.cc +++ b/test/executor_test.cc @@ -1,6 +1,17 @@ #include #include +#include + +// Check CUDA RT calls +#define CUDACHECK(cmd) \ + do { \ + cudaError_t err = cmd; \ + if (err != cudaSuccess) { \ + printf("%s:%d Cuda failure '%s'\n", __FILE__, __LINE__, cudaGetErrorString(err)); \ + exit(EXIT_FAILURE); \ + } \ + } while (false) const std::string MSCCLPP_ROOT_PATH = "/root/mscclpp"; @@ -17,15 +28,17 @@ int main() { } MPI_Bcast(&id, sizeof(id), MPI_BYTE, 0, MPI_COMM_WORLD); bootstrap->initialize(id); - // sleep 10s - std::this_thread::sleep_for(std::chrono::seconds(20)); + // sleep 20s + // std::this_thread::sleep_for(std::chrono::seconds(20)); auto comm = std::make_shared(bootstrap); std::shared_ptr executor = std::make_shared(comm, 8 /*nranksPerNode*/); mscclpp::ExecutionPlan plan(MSCCLPP_ROOT_PATH + "/test/execution-files/allreduce.json"); std::shared_ptr sendbuff = mscclpp::allocExtSharedCuda(1024); std::shared_ptr recvbuff = mscclpp::allocExtSharedCuda(1024); - executor->execute(rank, sendbuff.get(), recvbuff.get(), 1024, 1024, 512, plan); + mscclpp::CudaStreamWithFlags stream(cudaStreamNonBlocking); + executor->execute(rank, sendbuff.get(), recvbuff.get(), 1024, 1024, 512, plan, stream); + CUDACHECK(cudaStreamSynchronize(stream)); MPI_Finalize(); return 0; From d1c28bb3642b145d29456a787bdec492bce1c6d2 Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Thu, 4 Apr 2024 07:51:20 +0000 Subject: [PATCH 20/51] pass build --- include/mscclpp/executor.hpp | 11 +- src/executor/execution_kernel.cu | 192 ++++++++++++++++++++++++++++++- src/executor/executor.cc | 15 +-- src/include/execution_kernel.hpp | 7 +- test/executor_test.cc | 6 +- 5 files changed, 213 insertions(+), 18 deletions(-) diff --git a/include/mscclpp/executor.hpp b/include/mscclpp/executor.hpp index a98853776..21087a762 100644 --- a/include/mscclpp/executor.hpp +++ b/include/mscclpp/executor.hpp @@ -10,6 +10,13 @@ namespace mscclpp { +enum class DataType { + INT32, + UINT32, + FLOAT16, + FLOAT32, +}; + class ExecutionPlan { public: ExecutionPlan(std::string planPath); @@ -29,8 +36,8 @@ class Executor { Executor& operator=(const Executor&) = delete; ~Executor(); - void execute(int rank, void* sendbuff, void* recvBuff, size_t sendBuffSize, size_t recvBuffSize, int nthreads, - const ExecutionPlan& plan, cudaStream_t stream); + void execute(int rank, void* sendbuff, void* recvBuff, size_t sendBuffSize, size_t recvBuffSize, DataType dataType, + int nthreads, const ExecutionPlan& plan, cudaStream_t stream); private: struct Impl; diff --git a/src/executor/execution_kernel.cu b/src/executor/execution_kernel.cu index 47333b62c..3d2304735 100644 --- a/src/executor/execution_kernel.cu +++ b/src/executor/execution_kernel.cu @@ -9,6 +9,121 @@ #define __synclds() asm volatile("s_waitcnt lgkmcnt(0) \n s_barrier"); #endif // defined(MSCCLPP_DEVICE_HIP) +namespace { +template +MSCCLPP_DEVICE_INLINE To bit_cast(const From& src) { + static_assert(sizeof(To) == sizeof(From), "Size mismatch for bit_cast"); + + union { + From f; + To t; + } u; + u.f = src; + return u.t; +} + +template +MSCCLPP_DEVICE_INLINE T add_elements(T a, T b) { + return a + b; +} + +template <> +MSCCLPP_DEVICE_INLINE __half2 add_elements(__half2 a, __half2 b) { + return __hadd2(a, b); +} + +template +MSCCLPP_DEVICE_INLINE int4 add_vectors_helper(int4 a, int4 b) { + int4 ret; + ret.w = bit_cast(add_elements(bit_cast(a.w), bit_cast(b.w))); + ret.x = bit_cast(add_elements(bit_cast(a.x), bit_cast(b.x))); + ret.y = bit_cast(add_elements(bit_cast(a.y), bit_cast(b.y))); + ret.z = bit_cast(add_elements(bit_cast(a.z), bit_cast(b.z))); + return ret; +} + +template +MSCCLPP_DEVICE_INLINE int4 add_vectors(int4 a, int4 b) { + return add_vectors_helper(a, b); +} + +template <> +MSCCLPP_DEVICE_INLINE int4 add_vectors<__half>(int4 a, int4 b) { + return add_vectors_helper<__half2>(a, b); +} + +template +MSCCLPP_DEVICE_INLINE uint2 add_vectors_helper(uint2 a, uint2 b) { + uint2 ret; + ret.x = bit_cast(add_elements(bit_cast(a.x), bit_cast(b.x))); + ret.y = bit_cast(add_elements(bit_cast(a.y), bit_cast(b.y))); + return ret; +} + +template +MSCCLPP_DEVICE_INLINE uint2 add_vectors(uint2 a, uint2 b) { + return add_vectors_helper(a, b); +} + +template <> +MSCCLPP_DEVICE_INLINE uint2 add_vectors<__half>(uint2 a, uint2 b) { + return add_vectors_helper<__half2>(a, b); +} + +template +MSCCLPP_DEVICE_INLINE int add_vectors_helper(int a, int b) { + return bit_cast(add_elements(bit_cast(a), bit_cast(b))); +} + +template +MSCCLPP_DEVICE_INLINE int add_vectors(int a, int b) { + return add_vectors_helper(a, b); +} + +template <> +MSCCLPP_DEVICE_INLINE int add_vectors<__half>(int a, int b) { + return add_vectors_helper<__half2>(a, b); +} + +template +MSCCLPP_DEVICE_INLINE uint32_t add_vectors_helper(uint32_t a, uint32_t b) { + return bit_cast(add_elements(bit_cast(a), bit_cast(b))); +} + +template +MSCCLPP_DEVICE_INLINE uint32_t add_vectors(uint32_t a, uint32_t b) { + return add_vectors_helper(a, b); +} + +template <> +MSCCLPP_DEVICE_INLINE uint32_t add_vectors<__half>(uint32_t a, uint32_t b) { + return add_vectors_helper<__half2>(a, b); +} + +template +MSCCLPP_DEVICE_INLINE void vectorSum(T* dst, T* src, size_t nElem, int blockId, int nBlocks) { + size_t nInt4 = nElem / 4; + size_t nLastInts = nElem % 4; + int4* dst4 = (int4*)dst; + int4* src4 = (int4*)src; + for (size_t i = threadIdx.x + blockId * blockDim.x; i < nInt4; i += blockDim.x * nBlocks) { + dst4[i] = add_vectors(dst4[i], src4[i]); + } + if (nLastInts > 0) { + int* dstLast = ((int*)dst) + nInt4 * 4; + int* srcLast = ((int*)src) + nInt4 * 4; + for (size_t i = threadIdx.x + blockId * blockDim.x; i < nLastInts; i += blockDim.x * nBlocks) { + dstLast[i] = add_vectors(dstLast[i], srcLast[i]); + } + } +} + +template +MSCCLPP_DEVICE_INLINE void vectorSum(T* dst, T* src, size_t nElem) { + vectorSum(dst, src, nElem, blockIdx.x, gridDim.x); +} +} // namespace + namespace mscclpp { MSCCLPP_DEVICE_INLINE void handleSignal(int tid, DeviceHandle* smChannels, @@ -37,7 +152,52 @@ MSCCLPP_DEVICE_INLINE void handleWait(int tid, DeviceHandle* smChanne } } -__global__ void kernel(int rank, DeviceExecutionPlan* plan) { +template +MSCCLPP_DEVICE_INLINE void handleReadReduceCopySend(T* input, uint32_t inputOffsetByBytes, T* output, + uint32_t outputOffsetByBytes, DeviceHandle* smChannels, + uint8_t* srcChannelIndex, uint8_t* dstChannelIndex, + uint32_t* srcOffsets, uint32_t* dstOffsets, int nSrcChannels, + int nDstChannels, uint32_t size) { + const size_t vectorSize = sizeof(int4) / sizeof(T); + const size_t nInt4 = size / sizeof(int4); + const size_t inputOffset4 = inputOffsetByBytes / sizeof(int4); + const size_t outputOffset4 = outputOffsetByBytes / sizeof(int4); + int4* input4 = (int4*)input; + int4* output4 = (int4*)output; + for (size_t idx = threadIdx.x; idx < nInt4; idx += blockDim.x) { + int4 tmp = input4[inputOffset4 + idx]; + for (int index = 0; index < nSrcChannels; ++index) { + int4 val; + size_t srcOffset = srcOffsets[index] / sizeof(int4); + val = smChannels[srcChannelIndex[index]].read(srcOffset + idx); + tmp = add_vectors(tmp, val); + } + output4[outputOffset4 + idx] = tmp; + for (int index = 0; index < nDstChannels; ++index) { + size_t dstOffset = dstOffsets[index] / sizeof(int4); + smChannels[dstChannelIndex[index]].write(dstOffset + idx, tmp); + } + } + // handle rest of data + size_t processed = nInt4 * sizeof(int4); + const size_t startIdx = (inputOffsetByBytes + processed) / sizeof(T); + const size_t endIdx = (inputOffsetByBytes + size) / sizeof(T); + for (size_t idx = threadIdx.x + startIdx; idx < endIdx; idx += blockDim.x) { + T tmp = input[idx]; + for (int index = 0; index < nSrcChannels; ++index) { + size_t srcOffset = srcOffsets[index] / sizeof(T); + tmp += smChannels[srcChannelIndex[index]].read(srcOffset + idx); + } + output[idx] = tmp; + for (int index = 0; index < nDstChannels; ++index) { + size_t dstOffset = dstOffsets[index] / sizeof(T); + smChannels[dstChannelIndex[index]].write(dstOffset + idx, tmp); + } + } +} + +template +__global__ void kernel(int rank, T* input, T* output, T* scratch, DeviceExecutionPlan* plan) { extern __shared__ int sharedMem[]; int bid = blockIdx.x; int tid = threadIdx.x; @@ -78,14 +238,38 @@ __global__ void kernel(int rank, DeviceExecutionPlan* plan) { handleWait(tid, smChannels, proxyChannels, operations[i].inputChannelIndex, operations[i].nInputChannels, operations[i].channelType); break; + case OperationType::READ_REDUCE_SEND: + handleReadReduceCopySend(input, operations[i].srcOffset, input, operations[i].srcOffset, smChannels, + operations[i].inputChannelIndex, operations[i].outputChannelIndex, + operations[i].inputOffset, operations[i].outputOffset, operations[i].nInputChannels, + operations[i].nOutputChannels, operations[i].size); + break; default: break; } } } -void ExecutionKernel::launchKernel(int rank, int nthreadblocks, int nthreads, DeviceExecutionPlan* plan, - size_t sharedMemSize, cudaStream_t stream) { - kernel<<>>(rank, plan); +void ExecutionKernel::launchKernel(int rank, int nthreadblocks, int nthreads, void* src, void* dst, void* scratch, + DataType dataType, DeviceExecutionPlan* plan, size_t sharedMemSize, + cudaStream_t stream) { + switch (dataType) { + case DataType::INT32: + kernel<<>>(rank, (int32_t*)src, (int32_t*)dst, + (int32_t*)scratch, plan); + break; + case DataType::UINT32: + kernel<<>>(rank, (uint32_t*)src, (uint32_t*)dst, + (uint32_t*)scratch, plan); + break; + case DataType::FLOAT16: + kernel + <<>>(rank, (half*)src, (half*)dst, (half*)scratch, plan); + break; + case DataType::FLOAT32: + kernel + <<>>(rank, (float*)src, (float*)dst, (float*)scratch, plan); + break; + } } } // namespace mscclpp diff --git a/src/executor/executor.cc b/src/executor/executor.cc index dcdebd7ec..38cce1cb4 100644 --- a/src/executor/executor.cc +++ b/src/executor/executor.cc @@ -255,23 +255,24 @@ struct Executor::Impl { context.deviceExecutionPlans = std::move(deviceExecutionPlans); } - void launchKernel(ExecutionContext& context, int rank, int nthreadsPerBlock, cudaStream_t stream) { + void launchKernel(ExecutionContext& context, int rank, int nthreadsPerBlock, void* sendbuff, void* recvbuff, + DataType dataType, cudaStream_t stream) { int nthreadblocks = context.deviceExecutionPlans.size(); size_t sharedMemSize = sizeof(DeviceExecutionPlan); - ExecutionKernel::launchKernel(rank, nthreadblocks, nthreadsPerBlock, - (DeviceExecutionPlan*)context.deviceExecutionPlansBuffer.get(), sharedMemSize, - stream); + ExecutionKernel::launchKernel( + rank, nthreadblocks, nthreadsPerBlock, sendbuff, recvbuff, (void*)context.scratchBuffer.get(), dataType, + (DeviceExecutionPlan*)context.deviceExecutionPlansBuffer.get(), sharedMemSize, stream); } }; Executor::Executor(std::shared_ptr comm, int nranksPerNode) : impl_(std::make_unique(comm, nranksPerNode)) {} -void Executor::execute(int rank, void* sendbuff, void* recvBuff, size_t sendBuffSize, size_t recvBuffSize, int nthreads, - const ExecutionPlan& plan, cudaStream_t stream) { +void Executor::execute(int rank, void* sendbuff, void* recvBuff, size_t sendBuffSize, size_t recvBuffSize, + DataType dataType, int nthreads, const ExecutionPlan& plan, cudaStream_t stream) { ExecutionContext context = this->impl_->setupExecutionContext(rank, sendbuff, recvBuff, sendBuffSize, recvBuffSize, plan, stream); - this->impl_->launchKernel(context, rank, nthreads, stream); + this->impl_->launchKernel(context, rank, nthreads, sendbuff, recvBuff, dataType, stream); } Executor::~Executor() = default; diff --git a/src/include/execution_kernel.hpp b/src/include/execution_kernel.hpp index ff79dcc39..0d86f8231 100644 --- a/src/include/execution_kernel.hpp +++ b/src/include/execution_kernel.hpp @@ -4,6 +4,7 @@ #ifndef MSCCLPP_EXECUTION_KERNEL_HPP_ #define MSCCLPP_EXECUTION_KERNEL_HPP_ +#include #include #include @@ -35,7 +36,9 @@ enum class OperationType : uint8_t { REDUCE, REDUCE_SEND, READ_REDUCE, + READ_REDUCE_COPY, READ_REDUCE_SEND, + READ_REDUCE_COPY_SEND, }; struct Channels { @@ -70,8 +73,8 @@ struct DeviceExecutionPlan { class ExecutionKernel { public: - static void launchKernel(int rank, int nthreadblocks, int nthreads, DeviceExecutionPlan* plan, size_t sharedMemSize, - cudaStream_t stream); + static void launchKernel(int rank, int nthreadblocks, int nthreads, void* src, void* dst, void* scratch, + DataType dataType, DeviceExecutionPlan* plan, size_t sharedMemSize, cudaStream_t stream); }; } // namespace mscclpp diff --git a/test/executor_test.cc b/test/executor_test.cc index 76a580d36..df81dbaad 100644 --- a/test/executor_test.cc +++ b/test/executor_test.cc @@ -34,10 +34,10 @@ int main() { std::shared_ptr executor = std::make_shared(comm, 8 /*nranksPerNode*/); mscclpp::ExecutionPlan plan(MSCCLPP_ROOT_PATH + "/test/execution-files/allreduce.json"); - std::shared_ptr sendbuff = mscclpp::allocExtSharedCuda(1024); - std::shared_ptr recvbuff = mscclpp::allocExtSharedCuda(1024); + std::shared_ptr sendbuff = mscclpp::allocExtSharedCuda(1024 * 1024); mscclpp::CudaStreamWithFlags stream(cudaStreamNonBlocking); - executor->execute(rank, sendbuff.get(), recvbuff.get(), 1024, 1024, 512, plan, stream); + executor->execute(rank, sendbuff.get(), sendbuff.get(), 1024 * 1024, 1024 * 1024, mscclpp::DataType::FLOAT16, 512, + plan, stream); CUDACHECK(cudaStreamSynchronize(stream)); MPI_Finalize(); From 36d31db9de8cf25169f024598f83c402913951b3 Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Thu, 4 Apr 2024 10:10:14 +0000 Subject: [PATCH 21/51] fix channel bugs --- src/executor/execution_kernel.cu | 34 ++++++++++++++++++++--------- src/executor/execution_plan.cc | 24 ++++++++++---------- src/executor/executor.cc | 16 ++++++++------ src/include/execution_kernel.hpp | 10 ++++----- test/execution-files/allreduce.json | 20 +++++++++++++---- 5 files changed, 66 insertions(+), 38 deletions(-) diff --git a/src/executor/execution_kernel.cu b/src/executor/execution_kernel.cu index 3d2304735..b781c71a9 100644 --- a/src/executor/execution_kernel.cu +++ b/src/executor/execution_kernel.cu @@ -126,6 +126,20 @@ MSCCLPP_DEVICE_INLINE void vectorSum(T* dst, T* src, size_t nElem) { namespace mscclpp { +template +MSCCLPP_DEVICE_INLINE T* getBuffer(T* input, T* output, T* scratch, BufferType bufferType) { + if (bufferType == BufferType::INPUT) { + return input; + } + if (bufferType == BufferType::OUTPUT) { + return output; + } + if (bufferType == BufferType::SCRATCH) { + return scratch; + } + return nullptr; +} + MSCCLPP_DEVICE_INLINE void handleSignal(int tid, DeviceHandle* smChannels, DeviceHandle* proxyChannels, uint8_t* channelIndex, int nChannels, ChannelType chType) { @@ -158,7 +172,6 @@ MSCCLPP_DEVICE_INLINE void handleReadReduceCopySend(T* input, uint32_t inputOffs uint8_t* srcChannelIndex, uint8_t* dstChannelIndex, uint32_t* srcOffsets, uint32_t* dstOffsets, int nSrcChannels, int nDstChannels, uint32_t size) { - const size_t vectorSize = sizeof(int4) / sizeof(T); const size_t nInt4 = size / sizeof(int4); const size_t inputOffset4 = inputOffsetByBytes / sizeof(int4); const size_t outputOffset4 = outputOffsetByBytes / sizeof(int4); @@ -213,9 +226,8 @@ __global__ void kernel(int rank, T* input, T* output, T* scratch, DeviceExecutio Operation* operations = localPlan->operations; DeviceHandle* smChannels = localPlan->channels.smChannels; DeviceHandle* proxyChannels = localPlan->channels.proxyChannels; - if (bid > 0) { - return; - } + T* src = nullptr; + T* dst = nullptr; for (int i = 0; i < localPlan->nOperations; i++) { switch (operations[i].type) { case OperationType::BARRIER: @@ -226,7 +238,7 @@ __global__ void kernel(int rank, T* input, T* output, T* scratch, DeviceExecutio // printf("rank: %d bid: %d, noutputchannels: %d outputChannelIndex %d\n", rank, bid, // operations[i].nOutputChannels, operations[i].outputChannelIndex[0]); // } - handleSignal(tid, smChannels, proxyChannels, operations[i].outputChannelIndex, operations[i].nOutputChannels, + handleSignal(tid, smChannels, proxyChannels, operations[i].outputChannelIndexes, operations[i].nOutputChannels, operations[i].channelType); break; case OperationType::WAIT: @@ -235,13 +247,15 @@ __global__ void kernel(int rank, T* input, T* output, T* scratch, DeviceExecutio // operations[i].nInputChannels, // operations[i].inputChannelIndex[0]); // } - handleWait(tid, smChannels, proxyChannels, operations[i].inputChannelIndex, operations[i].nInputChannels, + handleWait(tid, smChannels, proxyChannels, operations[i].inputChannelIndexes, operations[i].nInputChannels, operations[i].channelType); break; - case OperationType::READ_REDUCE_SEND: - handleReadReduceCopySend(input, operations[i].srcOffset, input, operations[i].srcOffset, smChannels, - operations[i].inputChannelIndex, operations[i].outputChannelIndex, - operations[i].inputOffset, operations[i].outputOffset, operations[i].nInputChannels, + case OperationType::READ_REDUCE_COPY_SEND: + src = getBuffer(input, output, scratch, operations[i].srcBufferType); + dst = getBuffer(input, output, scratch, operations[i].dstBufferType); + handleReadReduceCopySend(src, operations[i].srcOffset, dst, operations[i].dstOffset, smChannels, + operations[i].inputChannelIndexes, operations[i].outputChannelIndexes, + operations[i].inputOffsets, operations[i].outputOffsets, operations[i].nInputChannels, operations[i].nOutputChannels, operations[i].size); break; default: diff --git a/src/executor/execution_plan.cc b/src/executor/execution_plan.cc index 7753bc0b9..4a64e86e6 100644 --- a/src/executor/execution_plan.cc +++ b/src/executor/execution_plan.cc @@ -33,10 +33,10 @@ auto getOpType = [](const std::string& str) { return mscclpp::OperationType::REDUCE; } else if (str == "rs") { return mscclpp::OperationType::REDUCE_SEND; - } else if (str == "rr") { - return mscclpp::OperationType::READ_REDUCE; - } else if (str == "rrs") { - return mscclpp::OperationType::READ_REDUCE_SEND; + } else if (str == "rrc") { + return mscclpp::OperationType::READ_REDUCE_COPY; + } else if (str == "rrcs") { + return mscclpp::OperationType::READ_REDUCE_COPY_SEND; } else { throw std::runtime_error("Invalid operation type"); } @@ -153,10 +153,12 @@ void ExecutionPlan::Impl::setupChannels(const json& gpus) { std::unordered_map> channelMap; for (auto channelType : channelTypes) { const std::vector channelInfos = this->getChannelInfos(rank, channelType); - for (size_t i = 0; i < channelInfos.size(); i++) { - const ChannelInfo& info = channelInfos[i]; + int index = 0; + for (const auto& info : channelInfos) { ChannelKey key = {info.srcBufferType, info.dstBufferType, info.channelType}; - channelMap[key].push_back(i); + for (size_t i = 0; i < info.connectedPeers.size(); i++) { + channelMap[key].push_back(index++); + } } } int nthreadblocks = gpu["threadblocks"].size(); @@ -211,16 +213,16 @@ void ExecutionPlan::Impl::setupOperations(const json& gpus) { for (int i = 0; i < operation.nInputChannels; i++) { BufferType srcBufferType = convertToBufferType(op["i_buff"]["src"]); BufferType dstBufferType = convertToBufferType(op["i_buff"]["dst"]); - operation.inputChannelIndex[i] = + operation.inputChannelIndexes[i] = channelIndexes[{srcBufferType, dstBufferType, operation.channelType}][op["i_cids"][i]["id"]]; - operation.inputOffset[i] = this->chunkSize * (int)op["i_cids"][i]["off"]; + operation.inputOffsets[i] = this->chunkSize * (int)op["i_cids"][i]["off"]; } for (int i = 0; i < operation.nOutputChannels; i++) { BufferType srcBufferType = convertToBufferType(op["o_buff"]["src"]); BufferType dstBufferType = convertToBufferType(op["o_buff"]["dst"]); - operation.outputChannelIndex[i] = + operation.outputChannelIndexes[i] = channelIndexes[{srcBufferType, dstBufferType, operation.channelType}][op["o_cids"][i]["id"]]; - operation.outputOffset[i] = this->chunkSize * (int)op["o_cids"][i]["off"]; + operation.outputOffsets[i] = this->chunkSize * (int)op["o_cids"][i]["off"]; } if (op.contains("srcbuff")) { operation.srcBufferType = convertToBufferType(op["srcbuff"]); diff --git a/src/executor/executor.cc b/src/executor/executor.cc index 38cce1cb4..1d6d9305e 100644 --- a/src/executor/executor.cc +++ b/src/executor/executor.cc @@ -220,11 +220,11 @@ struct Executor::Impl { RegisteredMemory localMemory = this->comm->registerMemory(src, sendBufferSize, transport); for (int peer : info.connectedPeers) { if (channelType == ChannelType::SM) { - context.smChannels.emplace_back(context.smSemaphores[index], + context.smChannels.emplace_back(context.smSemaphores[index++], context.registeredMemories[{info.dstBufferType, peer}], src, nullptr); } else if (channelType == ChannelType::PROXY) { context.proxyChannels.emplace_back( - this->proxyService->proxyChannel(context.proxySemaphores[index]), + this->proxyService->proxyChannel(context.proxySemaphores[index++]), this->proxyService->addMemory(context.registeredMemories[{info.dstBufferType, peer}]), this->proxyService->addMemory(localMemory)); } @@ -236,16 +236,18 @@ struct Executor::Impl { void setupDeviceExecutionPlan(ExecutionContext& context, int rank, const ExecutionPlan& plan) { std::vector deviceExecutionPlans; for (int threadblock = 0; threadblock < plan.impl_->getThreadblockCount(rank); threadblock++) { - DeviceExecutionPlan deviceExecutionPlan; + DeviceExecutionPlan deviceExecutionPlan = {}; std::vector ops = plan.impl_->getOperations(rank, threadblock); deviceExecutionPlan.nOperations = ops.size(); deviceExecutionPlan.nSmChannels = plan.impl_->threadblockSMChannelMap.at(rank).at(threadblock).size(); deviceExecutionPlan.nProxyChannels = plan.impl_->threadblockProxyChannelMap.at(rank).at(threadblock).size(); - for (const auto& [index, key] : plan.impl_->threadblockSMChannelMap.at(rank).at(threadblock)) { - deviceExecutionPlan.channels.smChannels[index] = mscclpp::deviceHandle(context.smChannels[index]); + int chanIndex = 0; + for (const auto& [index, _] : plan.impl_->threadblockSMChannelMap.at(rank).at(threadblock)) { + deviceExecutionPlan.channels.smChannels[chanIndex++] = mscclpp::deviceHandle(context.smChannels[index]); } - for (const auto& [index, key] : plan.impl_->threadblockProxyChannelMap.at(rank).at(threadblock)) { - deviceExecutionPlan.channels.proxyChannels[index] = mscclpp::deviceHandle(context.proxyChannels[index]); + chanIndex = 0; + for (const auto& [index, _] : plan.impl_->threadblockProxyChannelMap.at(rank).at(threadblock)) { + deviceExecutionPlan.channels.proxyChannels[chanIndex++] = mscclpp::deviceHandle(context.proxyChannels[index]); } for (size_t i = 0; i < ops.size(); i++) { deviceExecutionPlan.operations[i] = ops[i]; diff --git a/src/include/execution_kernel.hpp b/src/include/execution_kernel.hpp index 0d86f8231..f1934b567 100644 --- a/src/include/execution_kernel.hpp +++ b/src/include/execution_kernel.hpp @@ -35,9 +35,7 @@ enum class OperationType : uint8_t { FLUSH, REDUCE, REDUCE_SEND, - READ_REDUCE, READ_REDUCE_COPY, - READ_REDUCE_SEND, READ_REDUCE_COPY_SEND, }; @@ -53,10 +51,10 @@ struct Operation { BufferType dstBufferType; uint8_t nInputChannels; uint8_t nOutputChannels; - uint8_t inputChannelIndex[MAX_CHANNEL_PER_OPERATION]; - uint8_t outputChannelIndex[MAX_CHANNEL_PER_OPERATION]; - uint32_t inputOffset[MAX_CHANNEL_PER_OPERATION]; - uint32_t outputOffset[MAX_CHANNEL_PER_OPERATION]; + uint8_t inputChannelIndexes[MAX_CHANNEL_PER_OPERATION]; + uint8_t outputChannelIndexes[MAX_CHANNEL_PER_OPERATION]; + uint32_t inputOffsets[MAX_CHANNEL_PER_OPERATION]; + uint32_t outputOffsets[MAX_CHANNEL_PER_OPERATION]; uint32_t srcOffset; uint32_t dstOffset; uint32_t size; diff --git a/test/execution-files/allreduce.json b/test/execution-files/allreduce.json index b2032e90b..67e393fac 100644 --- a/test/execution-files/allreduce.json +++ b/test/execution-files/allreduce.json @@ -53,7 +53,7 @@ ] }, { - "name": "rrs", + "name": "rrcs", "i_buff": { "src": "i", "dst": "i" @@ -74,6 +74,9 @@ "off": 0 } ], + "src": 0, + "srcbuff": "i", + "srcoff": 0, "dst": 0, "dstbuff": "i", "dstoff": 0, @@ -174,7 +177,7 @@ ] }, { - "name": "rrs", + "name": "rrcs", "i_buff": { "src": "i", "dst": "i" @@ -195,6 +198,9 @@ "off": 0 } ], + "src": 0, + "srcbuff": "i", + "srcoff": 1, "dst": 0, "dstbuff": "i", "dstoff": 1, @@ -314,7 +320,7 @@ ] }, { - "name": "rrs", + "name": "rrcs", "i_buff": { "src": "i", "dst": "i" @@ -335,6 +341,9 @@ "off": 2 } ], + "src": 1, + "srcbuff": "i", + "srcoff": 2, "dst": 1, "dstbuff": "i", "dstoff": 2, @@ -435,7 +444,7 @@ ] }, { - "name": "rrs", + "name": "rrcs", "i_buff": { "src": "i", "dst": "i" @@ -456,6 +465,9 @@ "off": 2 } ], + "src": 1, + "srcbuff": "i", + "srcoff": 3, "dst": 1, "dstbuff": "i", "dstoff": 3, From 37c2d7da623df6adca3c9e0863a2ae8df6994213 Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Thu, 4 Apr 2024 10:37:55 +0000 Subject: [PATCH 22/51] minor --- src/executor/execution_kernel.cu | 37 +++-------------------------- test/execution-files/allreduce.json | 12 +++++----- 2 files changed, 9 insertions(+), 40 deletions(-) diff --git a/src/executor/execution_kernel.cu b/src/executor/execution_kernel.cu index b781c71a9..8a07870de 100644 --- a/src/executor/execution_kernel.cu +++ b/src/executor/execution_kernel.cu @@ -66,7 +66,7 @@ MSCCLPP_DEVICE_INLINE uint2 add_vectors(uint2 a, uint2 b) { } template <> -MSCCLPP_DEVICE_INLINE uint2 add_vectors<__half>(uint2 a, uint2 b) { +MSCCLPP_DEVICE_INLINE __attribute__((unused)) uint2 add_vectors<__half>(uint2 a, uint2 b) { return add_vectors_helper<__half2>(a, b); } @@ -81,7 +81,7 @@ MSCCLPP_DEVICE_INLINE int add_vectors(int a, int b) { } template <> -MSCCLPP_DEVICE_INLINE int add_vectors<__half>(int a, int b) { +MSCCLPP_DEVICE_INLINE __attribute__((unused)) int add_vectors<__half>(int a, int b) { return add_vectors_helper<__half2>(a, b); } @@ -96,32 +96,10 @@ MSCCLPP_DEVICE_INLINE uint32_t add_vectors(uint32_t a, uint32_t b) { } template <> -MSCCLPP_DEVICE_INLINE uint32_t add_vectors<__half>(uint32_t a, uint32_t b) { +MSCCLPP_DEVICE_INLINE __attribute__((unused)) uint32_t add_vectors<__half>(uint32_t a, uint32_t b) { return add_vectors_helper<__half2>(a, b); } -template -MSCCLPP_DEVICE_INLINE void vectorSum(T* dst, T* src, size_t nElem, int blockId, int nBlocks) { - size_t nInt4 = nElem / 4; - size_t nLastInts = nElem % 4; - int4* dst4 = (int4*)dst; - int4* src4 = (int4*)src; - for (size_t i = threadIdx.x + blockId * blockDim.x; i < nInt4; i += blockDim.x * nBlocks) { - dst4[i] = add_vectors(dst4[i], src4[i]); - } - if (nLastInts > 0) { - int* dstLast = ((int*)dst) + nInt4 * 4; - int* srcLast = ((int*)src) + nInt4 * 4; - for (size_t i = threadIdx.x + blockId * blockDim.x; i < nLastInts; i += blockDim.x * nBlocks) { - dstLast[i] = add_vectors(dstLast[i], srcLast[i]); - } - } -} - -template -MSCCLPP_DEVICE_INLINE void vectorSum(T* dst, T* src, size_t nElem) { - vectorSum(dst, src, nElem, blockIdx.x, gridDim.x); -} } // namespace namespace mscclpp { @@ -234,19 +212,10 @@ __global__ void kernel(int rank, T* input, T* output, T* scratch, DeviceExecutio __syncthreads(); break; case OperationType::SIGNAL: - // if (tid == 0) { - // printf("rank: %d bid: %d, noutputchannels: %d outputChannelIndex %d\n", rank, bid, - // operations[i].nOutputChannels, operations[i].outputChannelIndex[0]); - // } handleSignal(tid, smChannels, proxyChannels, operations[i].outputChannelIndexes, operations[i].nOutputChannels, operations[i].channelType); break; case OperationType::WAIT: - // if (tid == 0) { - // printf("rank: %d bid: %d, ninputchannels: %d inputChannelIndex %d\n", rank, bid, - // operations[i].nInputChannels, - // operations[i].inputChannelIndex[0]); - // } handleWait(tid, smChannels, proxyChannels, operations[i].inputChannelIndexes, operations[i].nInputChannels, operations[i].channelType); break; diff --git a/test/execution-files/allreduce.json b/test/execution-files/allreduce.json index 67e393fac..a511316ab 100644 --- a/test/execution-files/allreduce.json +++ b/test/execution-files/allreduce.json @@ -195,7 +195,7 @@ "o_cids": [ { "id": 0, - "off": 0 + "off": 1 } ], "src": 0, @@ -225,7 +225,7 @@ "o_cids": [ { "id": 0, - "off": 0 + "off": 1 } ], "ctype": "sm", @@ -240,7 +240,7 @@ "i_cids": [ { "id": 0, - "off": 2 + "off": 3 } ], "ctype": "sm", @@ -462,7 +462,7 @@ "o_cids": [ { "id": 0, - "off": 2 + "off": 3 } ], "src": 1, @@ -492,7 +492,7 @@ "o_cids": [ { "id": 0, - "off": 2 + "off": 3 } ], "ctype": "sm", @@ -507,7 +507,7 @@ "i_cids": [ { "id": 0, - "off": 0 + "off": 1 } ], "ctype": "sm", From 8c7978016a4c5db827a90941c2ebdbdfe18e0dab Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Thu, 4 Apr 2024 15:09:23 +0000 Subject: [PATCH 23/51] add python binding --- python/mscclpp/__init__.py | 3 +++ python/mscclpp/core_py.cpp | 2 ++ python/mscclpp/executor.cpp | 34 +++++++++++++++++++++++++ python/test/executor_test.py | 49 ++++++++++++++++++++++++++++++++++++ test/executor_test.cc | 3 ++- 5 files changed, 90 insertions(+), 1 deletion(-) create mode 100644 python/mscclpp/executor.cpp create mode 100644 python/test/executor_test.py diff --git a/python/mscclpp/__init__.py b/python/mscclpp/__init__.py index 8f013e080..0c8f7eb3b 100644 --- a/python/mscclpp/__init__.py +++ b/python/mscclpp/__init__.py @@ -19,6 +19,9 @@ TcpBootstrap, Transport, TransportFlags, + DataType, + Executor, + ExecutionPlan, version, is_nvls_supported, ) diff --git a/python/mscclpp/core_py.cpp b/python/mscclpp/core_py.cpp index 1a1cd2780..3f78dad35 100644 --- a/python/mscclpp/core_py.cpp +++ b/python/mscclpp/core_py.cpp @@ -20,6 +20,7 @@ extern void register_fifo(nb::module_& m); extern void register_semaphore(nb::module_& m); extern void register_utils(nb::module_& m); extern void register_numa(nb::module_& m); +extern void register_executor(nb::module_& m); template void def_nonblocking_future(nb::handle& m, const std::string& typestr) { @@ -204,4 +205,5 @@ NB_MODULE(_mscclpp, m) { register_utils(m); register_core(m); register_numa(m); + register_executor(m); } diff --git a/python/mscclpp/executor.cpp b/python/mscclpp/executor.cpp new file mode 100644 index 000000000..5276e3336 --- /dev/null +++ b/python/mscclpp/executor.cpp @@ -0,0 +1,34 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. + +#include +#include +#include + +#include +#include + +namespace nb = nanobind; +using namespace mscclpp; + +void register_executor(nb::module_& m) { + nb::enum_(m, "DataType") + .value("int32", DataType::INT32) + .value("uint32", DataType::UINT32) + .value("float16", DataType::FLOAT16) + .value("float32", DataType::FLOAT32); + + nb::class_(m, "ExecutionPlan").def(nb::init(), nb::arg("planPath")); + + nb::class_(m, "Executor") + .def(nb::init, int>(), nb::arg("comm"), nb::arg("nranksPerNode")) + .def( + "execute", + [](Executor* self, int rank, uintptr_t sendbuff, uintptr_t recvBuff, size_t sendBuffSize, size_t recvBuffSize, + DataType dataType, int nthreads, const ExecutionPlan& plan, uintptr_t stream) { + self->execute(rank, reinterpret_cast(sendbuff), reinterpret_cast(recvBuff), sendBuffSize, + recvBuffSize, dataType, nthreads, plan, (cudaStream_t)stream); + }, + nb::arg("rank"), nb::arg("sendbuff"), nb::arg("recvBuff"), nb::arg("sendBuffSize"), nb::arg("recvBuffSize"), + nb::arg("dataType"), nb::arg("nthreads"), nb::arg("plan"), nb::arg("stream")); +} diff --git a/python/test/executor_test.py b/python/test/executor_test.py new file mode 100644 index 000000000..b896bf102 --- /dev/null +++ b/python/test/executor_test.py @@ -0,0 +1,49 @@ +from os import path +from mscclpp import ( + DataType, + Executor, + ExecutionPlan, +) +import mscclpp.comm as mscclpp_comm + +import cupy as cp +from mpi4py import MPI + +MSCCLPP_ROOT_PATH = "/root/mscclpp" + +if __name__ == "__main__": + shm_comm = MPI.COMM_WORLD.Split_type(MPI.COMM_TYPE_SHARED, 0, MPI.INFO_NULL) + N_GPUS_PER_NODE = shm_comm.size + shm_comm.Free() + + cp.cuda.Device(MPI.COMM_WORLD.rank % N_GPUS_PER_NODE).use() + mscclpp_group = mscclpp_comm.CommGroup(MPI.COMM_WORLD) + executor = Executor(mscclpp_group.communicator, N_GPUS_PER_NODE) + execution_plan = ExecutionPlan(path.join(MSCCLPP_ROOT_PATH, "test", "execution-files", "allreduce.json")) + + buffer_size = 1024 * 1024 + cp.random.seed(42) + buffer = cp.random.random(buffer_size).astype(cp.float16) + sub_arrays = cp.split(buffer, MPI.COMM_WORLD.size) + sendbuf = sub_arrays[MPI.COMM_WORLD.rank] + + expected = cp.zeros_like(sendbuf) + for i in range(MPI.COMM_WORLD.size): + expected += sub_arrays[i] + + stream = cp.cuda.Stream(non_blocking=True) + executor.execute( + MPI.COMM_WORLD.rank, + sendbuf.data.ptr, + sendbuf.data.ptr, + buffer_size, + buffer_size, + DataType.float16, + 512, + execution_plan, + stream.ptr, + ) + stream.synchronize() + assert cp.allclose(sendbuf, expected, atol=1e-3) + executor = None + mscclpp_group = None diff --git a/test/executor_test.cc b/test/executor_test.cc index df81dbaad..c58573ce8 100644 --- a/test/executor_test.cc +++ b/test/executor_test.cc @@ -31,8 +31,9 @@ int main() { // sleep 20s // std::this_thread::sleep_for(std::chrono::seconds(20)); auto comm = std::make_shared(bootstrap); - std::shared_ptr executor = std::make_shared(comm, 8 /*nranksPerNode*/); + CUDACHECK(cudaSetDevice(rank)); + std::shared_ptr executor = std::make_shared(comm, 8 /*nranksPerNode*/); mscclpp::ExecutionPlan plan(MSCCLPP_ROOT_PATH + "/test/execution-files/allreduce.json"); std::shared_ptr sendbuff = mscclpp::allocExtSharedCuda(1024 * 1024); mscclpp::CudaStreamWithFlags stream(cudaStreamNonBlocking); From 71b62246bf52c213a39b3de922fa6ad5e2e47365 Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Fri, 5 Apr 2024 03:21:23 +0000 Subject: [PATCH 24/51] Fix --- python/test/executor_test.py | 10 +++++----- test/executor_test.cc | 7 ++++--- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/python/test/executor_test.py b/python/test/executor_test.py index b896bf102..a777c3546 100644 --- a/python/test/executor_test.py +++ b/python/test/executor_test.py @@ -21,9 +21,9 @@ executor = Executor(mscclpp_group.communicator, N_GPUS_PER_NODE) execution_plan = ExecutionPlan(path.join(MSCCLPP_ROOT_PATH, "test", "execution-files", "allreduce.json")) - buffer_size = 1024 * 1024 + nelems = 1024 * 1024 cp.random.seed(42) - buffer = cp.random.random(buffer_size).astype(cp.float16) + buffer = cp.random.random(nelems).astype(cp.float16) sub_arrays = cp.split(buffer, MPI.COMM_WORLD.size) sendbuf = sub_arrays[MPI.COMM_WORLD.rank] @@ -36,14 +36,14 @@ MPI.COMM_WORLD.rank, sendbuf.data.ptr, sendbuf.data.ptr, - buffer_size, - buffer_size, + sendbuf.nbytes, + sendbuf.nbytes, DataType.float16, 512, execution_plan, stream.ptr, ) stream.synchronize() - assert cp.allclose(sendbuf, expected, atol=1e-3) + assert cp.allclose(sendbuf, expected, atol=1e-3 * MPI.COMM_WORLD.size) executor = None mscclpp_group = None diff --git a/test/executor_test.cc b/test/executor_test.cc index c58573ce8..4a7b36a79 100644 --- a/test/executor_test.cc +++ b/test/executor_test.cc @@ -35,10 +35,11 @@ int main() { std::shared_ptr executor = std::make_shared(comm, 8 /*nranksPerNode*/); mscclpp::ExecutionPlan plan(MSCCLPP_ROOT_PATH + "/test/execution-files/allreduce.json"); - std::shared_ptr sendbuff = mscclpp::allocExtSharedCuda(1024 * 1024); + const int bufferSize = 1024 * 1024; + std::shared_ptr sendbuff = mscclpp::allocExtSharedCuda(bufferSize); mscclpp::CudaStreamWithFlags stream(cudaStreamNonBlocking); - executor->execute(rank, sendbuff.get(), sendbuff.get(), 1024 * 1024, 1024 * 1024, mscclpp::DataType::FLOAT16, 512, - plan, stream); + executor->execute(rank, sendbuff.get(), sendbuff.get(), bufferSize, bufferSize, mscclpp::DataType::FLOAT16, 512, plan, + stream); CUDACHECK(cudaStreamSynchronize(stream)); MPI_Finalize(); From 2eb6426f79cb3e665af869b9cae170bf43bedfc9 Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Fri, 5 Apr 2024 03:40:13 +0000 Subject: [PATCH 25/51] update test json --- test/execution-files/allreduce.json | 516 +++++++++++++++++++++++++++- 1 file changed, 508 insertions(+), 8 deletions(-) diff --git a/test/execution-files/allreduce.json b/test/execution-files/allreduce.json index a511316ab..739b8e6ab 100644 --- a/test/execution-files/allreduce.json +++ b/test/execution-files/allreduce.json @@ -6,7 +6,7 @@ "gpus": [ { "id": 0, - "inputChunks": 4, + "inputChunks": 8, "outputChunks": 0, "scratchChunks": 0, "threadblocks": [ @@ -136,6 +136,130 @@ }, { "id": 1, + "ops": [ + { + "name": "signal", + "o_buff": { + "src": "i", + "dst": "i" + }, + "o_cids": [ + { + "id": 0, + "off": 6 + } + ], + "ctype": "sm", + "cnt": 1 + }, + { + "name": "wait", + "i_buff": { + "src": "i", + "dst": "i" + }, + "i_cids": [ + { + "id": 0, + "off": 4 + } + ], + "ctype": "sm", + "cnt": 1 + }, + { + "name": "nop", + "deps": [ + { + "tb": 1, + "step": 1 + } + ] + }, + { + "name": "rrcs", + "i_buff": { + "src": "i", + "dst": "i" + }, + "i_cids": [ + { + "id": 0, + "off": 4 + } + ], + "o_buff": { + "src": "i", + "dst": "i" + }, + "o_cids": [ + { + "id": 0, + "off": 4 + } + ], + "src": 0, + "srcbuff": "i", + "srcoff": 4, + "dst": 0, + "dstbuff": "i", + "dstoff": 4, + "ctype": "sm", + "cnt": 1 + }, + { + "name": "nop", + "deps": [ + { + "tb": 1, + "step": 3 + } + ] + }, + { + "name": "signal", + "o_buff": { + "src": "i", + "dst": "i" + }, + "o_cids": [ + { + "id": 0, + "off": 4 + } + ], + "ctype": "sm", + "cnt": 1 + }, + { + "name": "wait", + "i_buff": { + "src": "i", + "dst": "i" + }, + "i_cids": [ + { + "id": 0, + "off": 6 + } + ], + "ctype": "sm", + "cnt": 1 + } + ], + "channels": [ + { + "src": "i", + "dst": "i", + "ctype": "sm", + "cids": [ + 1 + ] + } + ] + }, + { + "id": 2, "ops": [ { "name": "signal", @@ -171,7 +295,7 @@ "name": "nop", "deps": [ { - "tb": 1, + "tb": 2, "step": 1 } ] @@ -211,7 +335,7 @@ "name": "nop", "deps": [ { - "tb": 1, + "tb": 2, "step": 3 } ] @@ -253,7 +377,131 @@ "dst": "i", "ctype": "sm", "cids": [ - 1 + 2 + ] + } + ] + }, + { + "id": 3, + "ops": [ + { + "name": "signal", + "o_buff": { + "src": "i", + "dst": "i" + }, + "o_cids": [ + { + "id": 0, + "off": 7 + } + ], + "ctype": "sm", + "cnt": 1 + }, + { + "name": "wait", + "i_buff": { + "src": "i", + "dst": "i" + }, + "i_cids": [ + { + "id": 0, + "off": 5 + } + ], + "ctype": "sm", + "cnt": 1 + }, + { + "name": "nop", + "deps": [ + { + "tb": 3, + "step": 1 + } + ] + }, + { + "name": "rrcs", + "i_buff": { + "src": "i", + "dst": "i" + }, + "i_cids": [ + { + "id": 0, + "off": 5 + } + ], + "o_buff": { + "src": "i", + "dst": "i" + }, + "o_cids": [ + { + "id": 0, + "off": 5 + } + ], + "src": 0, + "srcbuff": "i", + "srcoff": 5, + "dst": 0, + "dstbuff": "i", + "dstoff": 5, + "ctype": "sm", + "cnt": 1 + }, + { + "name": "nop", + "deps": [ + { + "tb": 3, + "step": 3 + } + ] + }, + { + "name": "signal", + "o_buff": { + "src": "i", + "dst": "i" + }, + "o_cids": [ + { + "id": 0, + "off": 5 + } + ], + "ctype": "sm", + "cnt": 1 + }, + { + "name": "wait", + "i_buff": { + "src": "i", + "dst": "i" + }, + "i_cids": [ + { + "id": 0, + "off": 7 + } + ], + "ctype": "sm", + "cnt": 1 + } + ], + "channels": [ + { + "src": "i", + "dst": "i", + "ctype": "sm", + "cids": [ + 3 ] } ] @@ -265,6 +513,8 @@ "dstbuff": "i", "type": "sm", "connectedTo": [ + 1, + 1, 1, 1 ] @@ -273,7 +523,7 @@ }, { "id": 1, - "inputChunks": 4, + "inputChunks": 8, "outputChunks": 0, "scratchChunks": 0, "threadblocks": [ @@ -403,6 +653,130 @@ }, { "id": 1, + "ops": [ + { + "name": "signal", + "o_buff": { + "src": "i", + "dst": "i" + }, + "o_cids": [ + { + "id": 0, + "off": 4 + } + ], + "ctype": "sm", + "cnt": 1 + }, + { + "name": "wait", + "i_buff": { + "src": "i", + "dst": "i" + }, + "i_cids": [ + { + "id": 0, + "off": 6 + } + ], + "ctype": "sm", + "cnt": 1 + }, + { + "name": "nop", + "deps": [ + { + "tb": 1, + "step": 1 + } + ] + }, + { + "name": "rrcs", + "i_buff": { + "src": "i", + "dst": "i" + }, + "i_cids": [ + { + "id": 0, + "off": 6 + } + ], + "o_buff": { + "src": "i", + "dst": "i" + }, + "o_cids": [ + { + "id": 0, + "off": 6 + } + ], + "src": 1, + "srcbuff": "i", + "srcoff": 6, + "dst": 1, + "dstbuff": "i", + "dstoff": 6, + "ctype": "sm", + "cnt": 1 + }, + { + "name": "nop", + "deps": [ + { + "tb": 1, + "step": 3 + } + ] + }, + { + "name": "signal", + "o_buff": { + "src": "i", + "dst": "i" + }, + "o_cids": [ + { + "id": 0, + "off": 6 + } + ], + "ctype": "sm", + "cnt": 1 + }, + { + "name": "wait", + "i_buff": { + "src": "i", + "dst": "i" + }, + "i_cids": [ + { + "id": 0, + "off": 4 + } + ], + "ctype": "sm", + "cnt": 1 + } + ], + "channels": [ + { + "src": "i", + "dst": "i", + "ctype": "sm", + "cids": [ + 1 + ] + } + ] + }, + { + "id": 2, "ops": [ { "name": "signal", @@ -438,7 +812,7 @@ "name": "nop", "deps": [ { - "tb": 1, + "tb": 2, "step": 1 } ] @@ -478,7 +852,7 @@ "name": "nop", "deps": [ { - "tb": 1, + "tb": 2, "step": 3 } ] @@ -520,7 +894,131 @@ "dst": "i", "ctype": "sm", "cids": [ - 1 + 2 + ] + } + ] + }, + { + "id": 3, + "ops": [ + { + "name": "signal", + "o_buff": { + "src": "i", + "dst": "i" + }, + "o_cids": [ + { + "id": 0, + "off": 5 + } + ], + "ctype": "sm", + "cnt": 1 + }, + { + "name": "wait", + "i_buff": { + "src": "i", + "dst": "i" + }, + "i_cids": [ + { + "id": 0, + "off": 7 + } + ], + "ctype": "sm", + "cnt": 1 + }, + { + "name": "nop", + "deps": [ + { + "tb": 3, + "step": 1 + } + ] + }, + { + "name": "rrcs", + "i_buff": { + "src": "i", + "dst": "i" + }, + "i_cids": [ + { + "id": 0, + "off": 7 + } + ], + "o_buff": { + "src": "i", + "dst": "i" + }, + "o_cids": [ + { + "id": 0, + "off": 7 + } + ], + "src": 1, + "srcbuff": "i", + "srcoff": 7, + "dst": 1, + "dstbuff": "i", + "dstoff": 7, + "ctype": "sm", + "cnt": 1 + }, + { + "name": "nop", + "deps": [ + { + "tb": 3, + "step": 3 + } + ] + }, + { + "name": "signal", + "o_buff": { + "src": "i", + "dst": "i" + }, + "o_cids": [ + { + "id": 0, + "off": 7 + } + ], + "ctype": "sm", + "cnt": 1 + }, + { + "name": "wait", + "i_buff": { + "src": "i", + "dst": "i" + }, + "i_cids": [ + { + "id": 0, + "off": 5 + } + ], + "ctype": "sm", + "cnt": 1 + } + ], + "channels": [ + { + "src": "i", + "dst": "i", + "ctype": "sm", + "cids": [ + 3 ] } ] @@ -532,6 +1030,8 @@ "dstbuff": "i", "type": "sm", "connectedTo": [ + 0, + 0, 0, 0 ] From 7e74ed8522c84e3c32166f83a198779e9c6c0abe Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Fri, 5 Apr 2024 06:34:38 +0000 Subject: [PATCH 26/51] for rocm --- src/CMakeLists.txt | 2 +- src/executor/execution_kernel.cu | 258 -------------------------- src/include/execution_common.hpp | 73 ++++++++ src/include/execution_kernel.hpp | 298 +++++++++++++++++++++++++------ src/include/execution_plan.hpp | 2 +- 5 files changed, 318 insertions(+), 315 deletions(-) delete mode 100644 src/executor/execution_kernel.cu create mode 100644 src/include/execution_common.hpp diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 45b4075d2..cfbcc927a 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -1,6 +1,6 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. -file(GLOB_RECURSE SOURCES CONFIGURE_DEPENDS *.cc *.cu) +file(GLOB_RECURSE SOURCES CONFIGURE_DEPENDS *.cc) target_sources(mscclpp_obj PRIVATE ${SOURCES}) target_include_directories(mscclpp_obj PRIVATE include) diff --git a/src/executor/execution_kernel.cu b/src/executor/execution_kernel.cu deleted file mode 100644 index 8a07870de..000000000 --- a/src/executor/execution_kernel.cu +++ /dev/null @@ -1,258 +0,0 @@ -// Copyright (c) Microsoft Corporation. -// Licensed under the MIT license. - -#include - -#include "execution_kernel.hpp" - -#if defined(MSCCLPP_DEVICE_HIP) -#define __synclds() asm volatile("s_waitcnt lgkmcnt(0) \n s_barrier"); -#endif // defined(MSCCLPP_DEVICE_HIP) - -namespace { -template -MSCCLPP_DEVICE_INLINE To bit_cast(const From& src) { - static_assert(sizeof(To) == sizeof(From), "Size mismatch for bit_cast"); - - union { - From f; - To t; - } u; - u.f = src; - return u.t; -} - -template -MSCCLPP_DEVICE_INLINE T add_elements(T a, T b) { - return a + b; -} - -template <> -MSCCLPP_DEVICE_INLINE __half2 add_elements(__half2 a, __half2 b) { - return __hadd2(a, b); -} - -template -MSCCLPP_DEVICE_INLINE int4 add_vectors_helper(int4 a, int4 b) { - int4 ret; - ret.w = bit_cast(add_elements(bit_cast(a.w), bit_cast(b.w))); - ret.x = bit_cast(add_elements(bit_cast(a.x), bit_cast(b.x))); - ret.y = bit_cast(add_elements(bit_cast(a.y), bit_cast(b.y))); - ret.z = bit_cast(add_elements(bit_cast(a.z), bit_cast(b.z))); - return ret; -} - -template -MSCCLPP_DEVICE_INLINE int4 add_vectors(int4 a, int4 b) { - return add_vectors_helper(a, b); -} - -template <> -MSCCLPP_DEVICE_INLINE int4 add_vectors<__half>(int4 a, int4 b) { - return add_vectors_helper<__half2>(a, b); -} - -template -MSCCLPP_DEVICE_INLINE uint2 add_vectors_helper(uint2 a, uint2 b) { - uint2 ret; - ret.x = bit_cast(add_elements(bit_cast(a.x), bit_cast(b.x))); - ret.y = bit_cast(add_elements(bit_cast(a.y), bit_cast(b.y))); - return ret; -} - -template -MSCCLPP_DEVICE_INLINE uint2 add_vectors(uint2 a, uint2 b) { - return add_vectors_helper(a, b); -} - -template <> -MSCCLPP_DEVICE_INLINE __attribute__((unused)) uint2 add_vectors<__half>(uint2 a, uint2 b) { - return add_vectors_helper<__half2>(a, b); -} - -template -MSCCLPP_DEVICE_INLINE int add_vectors_helper(int a, int b) { - return bit_cast(add_elements(bit_cast(a), bit_cast(b))); -} - -template -MSCCLPP_DEVICE_INLINE int add_vectors(int a, int b) { - return add_vectors_helper(a, b); -} - -template <> -MSCCLPP_DEVICE_INLINE __attribute__((unused)) int add_vectors<__half>(int a, int b) { - return add_vectors_helper<__half2>(a, b); -} - -template -MSCCLPP_DEVICE_INLINE uint32_t add_vectors_helper(uint32_t a, uint32_t b) { - return bit_cast(add_elements(bit_cast(a), bit_cast(b))); -} - -template -MSCCLPP_DEVICE_INLINE uint32_t add_vectors(uint32_t a, uint32_t b) { - return add_vectors_helper(a, b); -} - -template <> -MSCCLPP_DEVICE_INLINE __attribute__((unused)) uint32_t add_vectors<__half>(uint32_t a, uint32_t b) { - return add_vectors_helper<__half2>(a, b); -} - -} // namespace - -namespace mscclpp { - -template -MSCCLPP_DEVICE_INLINE T* getBuffer(T* input, T* output, T* scratch, BufferType bufferType) { - if (bufferType == BufferType::INPUT) { - return input; - } - if (bufferType == BufferType::OUTPUT) { - return output; - } - if (bufferType == BufferType::SCRATCH) { - return scratch; - } - return nullptr; -} - -MSCCLPP_DEVICE_INLINE void handleSignal(int tid, DeviceHandle* smChannels, - DeviceHandle* proxyChannels, uint8_t* channelIndex, - int nChannels, ChannelType chType) { - if (tid < nChannels) { - if (chType == ChannelType::SM) { - smChannels[channelIndex[tid]].signal(); - } - if (chType == ChannelType::PROXY) { - proxyChannels[channelIndex[tid]].signal(); - } - } -} - -MSCCLPP_DEVICE_INLINE void handleWait(int tid, DeviceHandle* smChannels, - DeviceHandle* proxyChannels, uint8_t* channelIndex, - int nChannels, ChannelType chType) { - if (tid < nChannels) { - if (chType == ChannelType::SM) { - smChannels[channelIndex[tid]].wait(); - } - if (chType == ChannelType::PROXY) { - proxyChannels[channelIndex[tid]].wait(); - } - } -} - -template -MSCCLPP_DEVICE_INLINE void handleReadReduceCopySend(T* input, uint32_t inputOffsetByBytes, T* output, - uint32_t outputOffsetByBytes, DeviceHandle* smChannels, - uint8_t* srcChannelIndex, uint8_t* dstChannelIndex, - uint32_t* srcOffsets, uint32_t* dstOffsets, int nSrcChannels, - int nDstChannels, uint32_t size) { - const size_t nInt4 = size / sizeof(int4); - const size_t inputOffset4 = inputOffsetByBytes / sizeof(int4); - const size_t outputOffset4 = outputOffsetByBytes / sizeof(int4); - int4* input4 = (int4*)input; - int4* output4 = (int4*)output; - for (size_t idx = threadIdx.x; idx < nInt4; idx += blockDim.x) { - int4 tmp = input4[inputOffset4 + idx]; - for (int index = 0; index < nSrcChannels; ++index) { - int4 val; - size_t srcOffset = srcOffsets[index] / sizeof(int4); - val = smChannels[srcChannelIndex[index]].read(srcOffset + idx); - tmp = add_vectors(tmp, val); - } - output4[outputOffset4 + idx] = tmp; - for (int index = 0; index < nDstChannels; ++index) { - size_t dstOffset = dstOffsets[index] / sizeof(int4); - smChannels[dstChannelIndex[index]].write(dstOffset + idx, tmp); - } - } - // handle rest of data - size_t processed = nInt4 * sizeof(int4); - const size_t startIdx = (inputOffsetByBytes + processed) / sizeof(T); - const size_t endIdx = (inputOffsetByBytes + size) / sizeof(T); - for (size_t idx = threadIdx.x + startIdx; idx < endIdx; idx += blockDim.x) { - T tmp = input[idx]; - for (int index = 0; index < nSrcChannels; ++index) { - size_t srcOffset = srcOffsets[index] / sizeof(T); - tmp += smChannels[srcChannelIndex[index]].read(srcOffset + idx); - } - output[idx] = tmp; - for (int index = 0; index < nDstChannels; ++index) { - size_t dstOffset = dstOffsets[index] / sizeof(T); - smChannels[dstChannelIndex[index]].write(dstOffset + idx, tmp); - } - } -} - -template -__global__ void kernel(int rank, T* input, T* output, T* scratch, DeviceExecutionPlan* plan) { - extern __shared__ int sharedMem[]; - int bid = blockIdx.x; - int tid = threadIdx.x; - DeviceExecutionPlan* localPlan = plan + bid; - for (int i = tid; i < sizeof(DeviceExecutionPlan) / sizeof(int); i += blockDim.x) { - sharedMem[i] = ((int*)localPlan)[i]; - } -#if defined(MSCCLPP_DEVICE_HIP) - __synclds(); -#else // !defined(MSCCLPP_DEVICE_HIP) - __syncthreads(); -#endif // !defined(MSCCLPP_DEVICE_HIP) - Operation* operations = localPlan->operations; - DeviceHandle* smChannels = localPlan->channels.smChannels; - DeviceHandle* proxyChannels = localPlan->channels.proxyChannels; - T* src = nullptr; - T* dst = nullptr; - for (int i = 0; i < localPlan->nOperations; i++) { - switch (operations[i].type) { - case OperationType::BARRIER: - __syncthreads(); - break; - case OperationType::SIGNAL: - handleSignal(tid, smChannels, proxyChannels, operations[i].outputChannelIndexes, operations[i].nOutputChannels, - operations[i].channelType); - break; - case OperationType::WAIT: - handleWait(tid, smChannels, proxyChannels, operations[i].inputChannelIndexes, operations[i].nInputChannels, - operations[i].channelType); - break; - case OperationType::READ_REDUCE_COPY_SEND: - src = getBuffer(input, output, scratch, operations[i].srcBufferType); - dst = getBuffer(input, output, scratch, operations[i].dstBufferType); - handleReadReduceCopySend(src, operations[i].srcOffset, dst, operations[i].dstOffset, smChannels, - operations[i].inputChannelIndexes, operations[i].outputChannelIndexes, - operations[i].inputOffsets, operations[i].outputOffsets, operations[i].nInputChannels, - operations[i].nOutputChannels, operations[i].size); - break; - default: - break; - } - } -} - -void ExecutionKernel::launchKernel(int rank, int nthreadblocks, int nthreads, void* src, void* dst, void* scratch, - DataType dataType, DeviceExecutionPlan* plan, size_t sharedMemSize, - cudaStream_t stream) { - switch (dataType) { - case DataType::INT32: - kernel<<>>(rank, (int32_t*)src, (int32_t*)dst, - (int32_t*)scratch, plan); - break; - case DataType::UINT32: - kernel<<>>(rank, (uint32_t*)src, (uint32_t*)dst, - (uint32_t*)scratch, plan); - break; - case DataType::FLOAT16: - kernel - <<>>(rank, (half*)src, (half*)dst, (half*)scratch, plan); - break; - case DataType::FLOAT32: - kernel - <<>>(rank, (float*)src, (float*)dst, (float*)scratch, plan); - break; - } -} -} // namespace mscclpp diff --git a/src/include/execution_common.hpp b/src/include/execution_common.hpp new file mode 100644 index 000000000..59d341612 --- /dev/null +++ b/src/include/execution_common.hpp @@ -0,0 +1,73 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. + +#ifndef MSCCLPP_EXECUTION_COMMON_HPP_ +#define MSCCLPP_EXECUTION_COMMON_HPP_ + +#include +#include + +namespace mscclpp { + +constexpr int MAX_CHANNEL = 16; +constexpr int MAX_CHANNEL_PER_OPERATION = 8; +constexpr int MAX_OPERATION = 64; + +enum class BufferType : uint8_t { + INPUT, + OUTPUT, + SCRATCH, +}; + +enum class ChannelType : uint8_t { + SM, + PROXY, +}; + +enum class OperationType : uint8_t { + BARRIER, + PUT, + GET, + COPY, + SIGNAL, + WAIT, + FLUSH, + REDUCE, + REDUCE_SEND, + READ_REDUCE_COPY, + READ_REDUCE_COPY_SEND, +}; + +struct Channels { + mscclpp::DeviceHandle smChannels[MAX_CHANNEL]; + mscclpp::DeviceHandle proxyChannels[MAX_CHANNEL]; +}; + +struct Operation { + OperationType type; + ChannelType channelType; + BufferType srcBufferType; + BufferType dstBufferType; + uint8_t nInputChannels; + uint8_t nOutputChannels; + uint8_t inputChannelIndexes[MAX_CHANNEL_PER_OPERATION]; + uint8_t outputChannelIndexes[MAX_CHANNEL_PER_OPERATION]; + uint32_t inputOffsets[MAX_CHANNEL_PER_OPERATION]; + uint32_t outputOffsets[MAX_CHANNEL_PER_OPERATION]; + uint32_t srcOffset; + uint32_t dstOffset; + uint32_t size; +}; + +// total size = 1920 + 6400 + 4 + 4(padding) = 8324 bytes +struct DeviceExecutionPlan { + uint8_t nSmChannels; // 1 bytes + uint8_t nProxyChannels; // 1 bytes + uint16_t nOperations; // 2 bytes + Channels channels; // 1920 bytes + Operation operations[MAX_OPERATION]; // 64 * 100 = 6400 bytes +}; + +} // namespace mscclpp + +#endif // MSCCLPP_EXECUTION_COMMON_HPP_ diff --git a/src/include/execution_kernel.hpp b/src/include/execution_kernel.hpp index f1934b567..71c1e140a 100644 --- a/src/include/execution_kernel.hpp +++ b/src/include/execution_kernel.hpp @@ -8,73 +8,261 @@ #include #include -namespace mscclpp { +#include "execution_common.hpp" -constexpr int MAX_CHANNEL = 16; -constexpr int MAX_CHANNEL_PER_OPERATION = 8; -constexpr int MAX_OPERATION = 64; +#if defined(MSCCLPP_DEVICE_HIP) +#define __synclds() asm volatile("s_waitcnt lgkmcnt(0) \n s_barrier"); +#endif // defined(MSCCLPP_DEVICE_HIP) -enum class BufferType : uint8_t { - INPUT, - OUTPUT, - SCRATCH, -}; +namespace { +template +MSCCLPP_DEVICE_INLINE To bit_cast(const From& src) { + static_assert(sizeof(To) == sizeof(From), "Size mismatch for bit_cast"); -enum class ChannelType : uint8_t { - SM, - PROXY, -}; + union { + From f; + To t; + } u; + u.f = src; + return u.t; +} -enum class OperationType : uint8_t { - BARRIER, - PUT, - GET, - COPY, - SIGNAL, - WAIT, - FLUSH, - REDUCE, - REDUCE_SEND, - READ_REDUCE_COPY, - READ_REDUCE_COPY_SEND, -}; +template +MSCCLPP_DEVICE_INLINE T add_elements(T a, T b) { + return a + b; +} -struct Channels { - mscclpp::DeviceHandle smChannels[MAX_CHANNEL]; - mscclpp::DeviceHandle proxyChannels[MAX_CHANNEL]; -}; +template <> +MSCCLPP_DEVICE_INLINE __half2 add_elements(__half2 a, __half2 b) { + return __hadd2(a, b); +} -struct Operation { - OperationType type; - ChannelType channelType; - BufferType srcBufferType; - BufferType dstBufferType; - uint8_t nInputChannels; - uint8_t nOutputChannels; - uint8_t inputChannelIndexes[MAX_CHANNEL_PER_OPERATION]; - uint8_t outputChannelIndexes[MAX_CHANNEL_PER_OPERATION]; - uint32_t inputOffsets[MAX_CHANNEL_PER_OPERATION]; - uint32_t outputOffsets[MAX_CHANNEL_PER_OPERATION]; - uint32_t srcOffset; - uint32_t dstOffset; - uint32_t size; -}; +template +MSCCLPP_DEVICE_INLINE int4 add_vectors_helper(int4 a, int4 b) { + int4 ret; + ret.w = bit_cast(add_elements(bit_cast(a.w), bit_cast(b.w))); + ret.x = bit_cast(add_elements(bit_cast(a.x), bit_cast(b.x))); + ret.y = bit_cast(add_elements(bit_cast(a.y), bit_cast(b.y))); + ret.z = bit_cast(add_elements(bit_cast(a.z), bit_cast(b.z))); + return ret; +} -// total size = 1920 + 6400 + 4 + 4(padding) = 8324 bytes -struct DeviceExecutionPlan { - uint8_t nSmChannels; // 1 bytes - uint8_t nProxyChannels; // 1 bytes - uint16_t nOperations; // 2 bytes - Channels channels; // 1920 bytes - Operation operations[MAX_OPERATION]; // 64 * 100 = 6400 bytes -}; +template +MSCCLPP_DEVICE_INLINE int4 add_vectors(int4 a, int4 b) { + return add_vectors_helper(a, b); +} + +template <> +MSCCLPP_DEVICE_INLINE int4 add_vectors<__half>(int4 a, int4 b) { + return add_vectors_helper<__half2>(a, b); +} + +template +MSCCLPP_DEVICE_INLINE uint2 add_vectors_helper(uint2 a, uint2 b) { + uint2 ret; + ret.x = bit_cast(add_elements(bit_cast(a.x), bit_cast(b.x))); + ret.y = bit_cast(add_elements(bit_cast(a.y), bit_cast(b.y))); + return ret; +} + +template +MSCCLPP_DEVICE_INLINE uint2 add_vectors(uint2 a, uint2 b) { + return add_vectors_helper(a, b); +} + +template <> +MSCCLPP_DEVICE_INLINE __attribute__((unused)) uint2 add_vectors<__half>(uint2 a, uint2 b) { + return add_vectors_helper<__half2>(a, b); +} + +template +MSCCLPP_DEVICE_INLINE int add_vectors_helper(int a, int b) { + return bit_cast(add_elements(bit_cast(a), bit_cast(b))); +} + +template +MSCCLPP_DEVICE_INLINE int add_vectors(int a, int b) { + return add_vectors_helper(a, b); +} + +template <> +MSCCLPP_DEVICE_INLINE __attribute__((unused)) int add_vectors<__half>(int a, int b) { + return add_vectors_helper<__half2>(a, b); +} + +template +MSCCLPP_DEVICE_INLINE uint32_t add_vectors_helper(uint32_t a, uint32_t b) { + return bit_cast(add_elements(bit_cast(a), bit_cast(b))); +} + +template +MSCCLPP_DEVICE_INLINE uint32_t add_vectors(uint32_t a, uint32_t b) { + return add_vectors_helper(a, b); +} + +template <> +MSCCLPP_DEVICE_INLINE __attribute__((unused)) uint32_t add_vectors<__half>(uint32_t a, uint32_t b) { + return add_vectors_helper<__half2>(a, b); +} + +} // namespace + +namespace mscclpp { + +template +MSCCLPP_DEVICE_INLINE T* getBuffer(T* input, T* output, T* scratch, BufferType bufferType) { + if (bufferType == BufferType::INPUT) { + return input; + } + if (bufferType == BufferType::OUTPUT) { + return output; + } + if (bufferType == BufferType::SCRATCH) { + return scratch; + } + return nullptr; +} + +MSCCLPP_DEVICE_INLINE void handleSignal(int tid, DeviceHandle* smChannels, + DeviceHandle* proxyChannels, uint8_t* channelIndex, + int nChannels, ChannelType chType) { + if (tid < nChannels) { + if (chType == ChannelType::SM) { + smChannels[channelIndex[tid]].signal(); + } + if (chType == ChannelType::PROXY) { + proxyChannels[channelIndex[tid]].signal(); + } + } +} + +MSCCLPP_DEVICE_INLINE void handleWait(int tid, DeviceHandle* smChannels, + DeviceHandle* proxyChannels, uint8_t* channelIndex, + int nChannels, ChannelType chType) { + if (tid < nChannels) { + if (chType == ChannelType::SM) { + smChannels[channelIndex[tid]].wait(); + } + if (chType == ChannelType::PROXY) { + proxyChannels[channelIndex[tid]].wait(); + } + } +} + +template +MSCCLPP_DEVICE_INLINE void handleReadReduceCopySend(T* input, uint32_t inputOffsetByBytes, T* output, + uint32_t outputOffsetByBytes, DeviceHandle* smChannels, + uint8_t* srcChannelIndex, uint8_t* dstChannelIndex, + uint32_t* srcOffsets, uint32_t* dstOffsets, int nSrcChannels, + int nDstChannels, uint32_t size) { + const size_t nInt4 = size / sizeof(int4); + const size_t inputOffset4 = inputOffsetByBytes / sizeof(int4); + const size_t outputOffset4 = outputOffsetByBytes / sizeof(int4); + int4* input4 = (int4*)input; + int4* output4 = (int4*)output; + for (size_t idx = threadIdx.x; idx < nInt4; idx += blockDim.x) { + int4 tmp = input4[inputOffset4 + idx]; + for (int index = 0; index < nSrcChannels; ++index) { + int4 val; + size_t srcOffset = srcOffsets[index] / sizeof(int4); + val = smChannels[srcChannelIndex[index]].read(srcOffset + idx); + tmp = add_vectors(tmp, val); + } + output4[outputOffset4 + idx] = tmp; + for (int index = 0; index < nDstChannels; ++index) { + size_t dstOffset = dstOffsets[index] / sizeof(int4); + smChannels[dstChannelIndex[index]].write(dstOffset + idx, tmp); + } + } + // handle rest of data + size_t processed = nInt4 * sizeof(int4); + const size_t startIdx = (inputOffsetByBytes + processed) / sizeof(T); + const size_t endIdx = (inputOffsetByBytes + size) / sizeof(T); + for (size_t idx = threadIdx.x + startIdx; idx < endIdx; idx += blockDim.x) { + T tmp = input[idx]; + for (int index = 0; index < nSrcChannels; ++index) { + size_t srcOffset = srcOffsets[index] / sizeof(T); + tmp += smChannels[srcChannelIndex[index]].read(srcOffset + idx); + } + output[idx] = tmp; + for (int index = 0; index < nDstChannels; ++index) { + size_t dstOffset = dstOffsets[index] / sizeof(T); + smChannels[dstChannelIndex[index]].write(dstOffset + idx, tmp); + } + } +} + +template +__global__ void kernel([[maybe_unused]] int rank /*for debug*/, T* input, T* output, T* scratch, + DeviceExecutionPlan* plan) { + extern __shared__ int sharedMem[]; + int bid = blockIdx.x; + int tid = threadIdx.x; + DeviceExecutionPlan* localPlan = plan + bid; + for (size_t i = tid; i < sizeof(DeviceExecutionPlan) / sizeof(int); i += blockDim.x) { + sharedMem[i] = ((int*)localPlan)[i]; + } +#if defined(MSCCLPP_DEVICE_HIP) + __synclds(); +#else // !defined(MSCCLPP_DEVICE_HIP) + __syncthreads(); +#endif // !defined(MSCCLPP_DEVICE_HIP) + Operation* operations = localPlan->operations; + DeviceHandle* smChannels = localPlan->channels.smChannels; + DeviceHandle* proxyChannels = localPlan->channels.proxyChannels; + T* src = nullptr; + T* dst = nullptr; + for (int i = 0; i < localPlan->nOperations; i++) { + switch (operations[i].type) { + case OperationType::BARRIER: + __syncthreads(); + break; + case OperationType::SIGNAL: + handleSignal(tid, smChannels, proxyChannels, operations[i].outputChannelIndexes, operations[i].nOutputChannels, + operations[i].channelType); + break; + case OperationType::WAIT: + handleWait(tid, smChannels, proxyChannels, operations[i].inputChannelIndexes, operations[i].nInputChannels, + operations[i].channelType); + break; + case OperationType::READ_REDUCE_COPY_SEND: + src = getBuffer(input, output, scratch, operations[i].srcBufferType); + dst = getBuffer(input, output, scratch, operations[i].dstBufferType); + handleReadReduceCopySend(src, operations[i].srcOffset, dst, operations[i].dstOffset, smChannels, + operations[i].inputChannelIndexes, operations[i].outputChannelIndexes, + operations[i].inputOffsets, operations[i].outputOffsets, operations[i].nInputChannels, + operations[i].nOutputChannels, operations[i].size); + break; + default: + break; + } + } +} class ExecutionKernel { public: static void launchKernel(int rank, int nthreadblocks, int nthreads, void* src, void* dst, void* scratch, - DataType dataType, DeviceExecutionPlan* plan, size_t sharedMemSize, cudaStream_t stream); + DataType dataType, DeviceExecutionPlan* plan, size_t sharedMemSize, cudaStream_t stream) { + switch (dataType) { + case DataType::INT32: + kernel<<>>(rank, (int32_t*)src, (int32_t*)dst, + (int32_t*)scratch, plan); + break; + case DataType::UINT32: + kernel<<>>(rank, (uint32_t*)src, (uint32_t*)dst, + (uint32_t*)scratch, plan); + break; + case DataType::FLOAT16: + kernel + <<>>(rank, (half*)src, (half*)dst, (half*)scratch, plan); + break; + case DataType::FLOAT32: + kernel + <<>>(rank, (float*)src, (float*)dst, (float*)scratch, plan); + break; + } + } }; - } // namespace mscclpp #endif // MSCCLPP_EXECUTION_KERNEL_HPP_ diff --git a/src/include/execution_plan.hpp b/src/include/execution_plan.hpp index 3575390ba..6a4aaa80a 100644 --- a/src/include/execution_plan.hpp +++ b/src/include/execution_plan.hpp @@ -10,7 +10,7 @@ #include #include -#include "execution_kernel.hpp" +#include "execution_common.hpp" namespace mscclpp { From 7745c873d0fcb3d7ed7a96ebb45abd174ed55585 Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Fri, 5 Apr 2024 06:43:39 +0000 Subject: [PATCH 27/51] fix build --- src/executor/execution_kernel.cu | 34 ++++++++++++++++++++++++++++++++ src/include/execution_kernel.hpp | 21 ++++++++++++-------- 2 files changed, 47 insertions(+), 8 deletions(-) create mode 100644 src/executor/execution_kernel.cu diff --git a/src/executor/execution_kernel.cu b/src/executor/execution_kernel.cu new file mode 100644 index 000000000..a2e37d9ef --- /dev/null +++ b/src/executor/execution_kernel.cu @@ -0,0 +1,34 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. + +#include + +#include "execution_kernel.hpp" + +namespace mscclpp { + +#if !defined(MSCCLPP_DEVICE_HIP) +void ExecutionKernel::launchKernel(int rank, int nthreadblocks, int nthreads, void* src, void* dst, void* scratch, + DataType dataType, DeviceExecutionPlan* plan, size_t sharedMemSize, + cudaStream_t stream) { + switch (dataType) { + case DataType::INT32: + kernel<<>>(rank, (int32_t*)src, (int32_t*)dst, + (int32_t*)scratch, plan); + break; + case DataType::UINT32: + kernel<<>>(rank, (uint32_t*)src, (uint32_t*)dst, + (uint32_t*)scratch, plan); + break; + case DataType::FLOAT16: + kernel + <<>>(rank, (half*)src, (half*)dst, (half*)scratch, plan); + break; + case DataType::FLOAT32: + kernel + <<>>(rank, (float*)src, (float*)dst, (float*)scratch, plan); + break; + } +} +#endif // !defined(MSCCLPP_DEVICE_HIP) +} // namespace mscclpp diff --git a/src/include/execution_kernel.hpp b/src/include/execution_kernel.hpp index 71c1e140a..ef19f30c2 100644 --- a/src/include/execution_kernel.hpp +++ b/src/include/execution_kernel.hpp @@ -193,8 +193,8 @@ MSCCLPP_DEVICE_INLINE void handleReadReduceCopySend(T* input, uint32_t inputOffs } template -__global__ void kernel([[maybe_unused]] int rank /*for debug*/, T* input, T* output, T* scratch, - DeviceExecutionPlan* plan) { +__global__ void executionKernel([[maybe_unused]] int rank /*for debug*/, T* input, T* output, T* scratch, + DeviceExecutionPlan* plan) { extern __shared__ int sharedMem[]; int bid = blockIdx.x; int tid = threadIdx.x; @@ -241,28 +241,33 @@ __global__ void kernel([[maybe_unused]] int rank /*for debug*/, T* input, T* out class ExecutionKernel { public: +#if defined(MSCCLPP_DEVICE_HIP) static void launchKernel(int rank, int nthreadblocks, int nthreads, void* src, void* dst, void* scratch, DataType dataType, DeviceExecutionPlan* plan, size_t sharedMemSize, cudaStream_t stream) { switch (dataType) { case DataType::INT32: - kernel<<>>(rank, (int32_t*)src, (int32_t*)dst, - (int32_t*)scratch, plan); + executionKernel<<>>(rank, (int32_t*)src, (int32_t*)dst, + (int32_t*)scratch, plan); break; case DataType::UINT32: - kernel<<>>(rank, (uint32_t*)src, (uint32_t*)dst, - (uint32_t*)scratch, plan); + executionKernel<<>>( + rank, (uint32_t*)src, (uint32_t*)dst, (uint32_t*)scratch, plan); break; case DataType::FLOAT16: - kernel + executionKernel <<>>(rank, (half*)src, (half*)dst, (half*)scratch, plan); break; case DataType::FLOAT32: - kernel + executionKernel <<>>(rank, (float*)src, (float*)dst, (float*)scratch, plan); break; } } }; +#else // !defined(MSCCLPP_DEVICE_HIP) + static void launchKernel(int rank, int nthreadblocks, int nthreads, void* src, void* dst, void* scratch, + DataType dataType, DeviceExecutionPlan* plan, size_t sharedMemSize, cudaStream_t stream); +#endif // defined(MSCCLPP_DEVICE_HIP) } // namespace mscclpp #endif // MSCCLPP_EXECUTION_KERNEL_HPP_ From bbf197d508bc6201ba31fcc1dcdaeac1d5d5a56b Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Fri, 5 Apr 2024 07:01:13 +0000 Subject: [PATCH 28/51] update --- src/executor/execution_kernel.cu | 34 -------------------------------- src/include/execution_kernel.hpp | 13 +++++++----- 2 files changed, 8 insertions(+), 39 deletions(-) delete mode 100644 src/executor/execution_kernel.cu diff --git a/src/executor/execution_kernel.cu b/src/executor/execution_kernel.cu deleted file mode 100644 index a2e37d9ef..000000000 --- a/src/executor/execution_kernel.cu +++ /dev/null @@ -1,34 +0,0 @@ -// Copyright (c) Microsoft Corporation. -// Licensed under the MIT license. - -#include - -#include "execution_kernel.hpp" - -namespace mscclpp { - -#if !defined(MSCCLPP_DEVICE_HIP) -void ExecutionKernel::launchKernel(int rank, int nthreadblocks, int nthreads, void* src, void* dst, void* scratch, - DataType dataType, DeviceExecutionPlan* plan, size_t sharedMemSize, - cudaStream_t stream) { - switch (dataType) { - case DataType::INT32: - kernel<<>>(rank, (int32_t*)src, (int32_t*)dst, - (int32_t*)scratch, plan); - break; - case DataType::UINT32: - kernel<<>>(rank, (uint32_t*)src, (uint32_t*)dst, - (uint32_t*)scratch, plan); - break; - case DataType::FLOAT16: - kernel - <<>>(rank, (half*)src, (half*)dst, (half*)scratch, plan); - break; - case DataType::FLOAT32: - kernel - <<>>(rank, (float*)src, (float*)dst, (float*)scratch, plan); - break; - } -} -#endif // !defined(MSCCLPP_DEVICE_HIP) -} // namespace mscclpp diff --git a/src/include/execution_kernel.hpp b/src/include/execution_kernel.hpp index ef19f30c2..e3d110d94 100644 --- a/src/include/execution_kernel.hpp +++ b/src/include/execution_kernel.hpp @@ -10,6 +10,7 @@ #include "execution_common.hpp" +#if defined(MSCCLPP_DEVICE_COMPILE) #if defined(MSCCLPP_DEVICE_HIP) #define __synclds() asm volatile("s_waitcnt lgkmcnt(0) \n s_barrier"); #endif // defined(MSCCLPP_DEVICE_HIP) @@ -106,9 +107,11 @@ MSCCLPP_DEVICE_INLINE __attribute__((unused)) uint32_t add_vectors<__half>(uint3 } } // namespace +#endif // defined(MSCCLPP_DEVICE_COMPILE) namespace mscclpp { +#if defined(MSCCLPP_DEVICE_COMPILE) template MSCCLPP_DEVICE_INLINE T* getBuffer(T* input, T* output, T* scratch, BufferType bufferType) { if (bufferType == BufferType::INPUT) { @@ -238,10 +241,11 @@ __global__ void executionKernel([[maybe_unused]] int rank /*for debug*/, T* inpu } } } +#endif // defined(MSCCLPP_DEVICE_COMPILE) class ExecutionKernel { public: -#if defined(MSCCLPP_DEVICE_HIP) +#if defined(MSCCLPP_DEVICE_COMPILE) static void launchKernel(int rank, int nthreadblocks, int nthreads, void* src, void* dst, void* scratch, DataType dataType, DeviceExecutionPlan* plan, size_t sharedMemSize, cudaStream_t stream) { switch (dataType) { @@ -263,11 +267,10 @@ class ExecutionKernel { break; } } +#else // !defined(MSCCLPP_DEVICE_COMPILE) + static void launchKernel(int, int, int, void*, void*, void*, DataType, DeviceExecutionPlan*, size_t, cudaStream_t) {} +#endif // !defined(MSCCLPP_DEVICE_COMPILE) }; -#else // !defined(MSCCLPP_DEVICE_HIP) - static void launchKernel(int rank, int nthreadblocks, int nthreads, void* src, void* dst, void* scratch, - DataType dataType, DeviceExecutionPlan* plan, size_t sharedMemSize, cudaStream_t stream); -#endif // defined(MSCCLPP_DEVICE_HIP) } // namespace mscclpp #endif // MSCCLPP_EXECUTION_KERNEL_HPP_ From d38c9edff7383e8c34c9764f0c0e15ad0f89955d Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Fri, 5 Apr 2024 11:20:03 +0000 Subject: [PATCH 29/51] build fix --- src/CMakeLists.txt | 2 +- src/executor/execution_kernel.cu | 29 +++++++++++++++++++++++++++++ src/include/execution_kernel.hpp | 9 +++++---- 3 files changed, 35 insertions(+), 5 deletions(-) create mode 100644 src/executor/execution_kernel.cu diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index cfbcc927a..45b4075d2 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -1,6 +1,6 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. -file(GLOB_RECURSE SOURCES CONFIGURE_DEPENDS *.cc) +file(GLOB_RECURSE SOURCES CONFIGURE_DEPENDS *.cc *.cu) target_sources(mscclpp_obj PRIVATE ${SOURCES}) target_include_directories(mscclpp_obj PRIVATE include) diff --git a/src/executor/execution_kernel.cu b/src/executor/execution_kernel.cu new file mode 100644 index 000000000..f5a24ff0f --- /dev/null +++ b/src/executor/execution_kernel.cu @@ -0,0 +1,29 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. + +#include "execution_kernel.hpp" + +namespace mscclpp { +void ExecutionKernel::launchKernel(int rank, int nthreadblocks, int nthreads, void* src, void* dst, void* scratch, + DataType dataType, DeviceExecutionPlan* plan, size_t sharedMemSize, + cudaStream_t stream) { + switch (dataType) { + case DataType::INT32: + executionKernel<<>>(rank, (int32_t*)src, (int32_t*)dst, + (int32_t*)scratch, plan); + break; + case DataType::UINT32: + executionKernel<<>>( + rank, (uint32_t*)src, (uint32_t*)dst, (uint32_t*)scratch, plan); + break; + case DataType::FLOAT16: + executionKernel + <<>>(rank, (half*)src, (half*)dst, (half*)scratch, plan); + break; + case DataType::FLOAT32: + executionKernel + <<>>(rank, (float*)src, (float*)dst, (float*)scratch, plan); + break; + } +} +} // namespace mscclpp diff --git a/src/include/execution_kernel.hpp b/src/include/execution_kernel.hpp index e3d110d94..6ac592eb6 100644 --- a/src/include/execution_kernel.hpp +++ b/src/include/execution_kernel.hpp @@ -245,7 +245,7 @@ __global__ void executionKernel([[maybe_unused]] int rank /*for debug*/, T* inpu class ExecutionKernel { public: -#if defined(MSCCLPP_DEVICE_COMPILE) +#if defined(MSCCLPP_DEVICE_HIP) static void launchKernel(int rank, int nthreadblocks, int nthreads, void* src, void* dst, void* scratch, DataType dataType, DeviceExecutionPlan* plan, size_t sharedMemSize, cudaStream_t stream) { switch (dataType) { @@ -267,9 +267,10 @@ class ExecutionKernel { break; } } -#else // !defined(MSCCLPP_DEVICE_COMPILE) - static void launchKernel(int, int, int, void*, void*, void*, DataType, DeviceExecutionPlan*, size_t, cudaStream_t) {} -#endif // !defined(MSCCLPP_DEVICE_COMPILE) +#else // !defined(MSCCLPP_DEVICE_HIP) + static void launchKernel(int rank, int nthreadblocks, int nthreads, void* src, void* dst, void* scratch, + DataType dataType, DeviceExecutionPlan* plan, size_t sharedMemSize, cudaStream_t stream); +#endif // !defined(MSCCLPP_DEVICE_HIP) }; } // namespace mscclpp From 867101e9aec63d68b72bd1d658b134c4a3e9cc74 Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Fri, 5 Apr 2024 11:28:50 +0000 Subject: [PATCH 30/51] minor update --- src/executor/execution_kernel.cu | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/executor/execution_kernel.cu b/src/executor/execution_kernel.cu index f5a24ff0f..d5e07a3da 100644 --- a/src/executor/execution_kernel.cu +++ b/src/executor/execution_kernel.cu @@ -3,6 +3,7 @@ #include "execution_kernel.hpp" +#if defined(MSCCLPP_DEVICE_CUDA) namespace mscclpp { void ExecutionKernel::launchKernel(int rank, int nthreadblocks, int nthreads, void* src, void* dst, void* scratch, DataType dataType, DeviceExecutionPlan* plan, size_t sharedMemSize, @@ -27,3 +28,4 @@ void ExecutionKernel::launchKernel(int rank, int nthreadblocks, int nthreads, vo } } } // namespace mscclpp +#endif From 6049e9e44194236a560813e179cfeca9fd337867 Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Fri, 5 Apr 2024 12:23:51 +0000 Subject: [PATCH 31/51] more ops --- src/include/execution_kernel.hpp | 47 +++++++++++++++++++++++--------- 1 file changed, 34 insertions(+), 13 deletions(-) diff --git a/src/include/execution_kernel.hpp b/src/include/execution_kernel.hpp index 6ac592eb6..833130fc9 100644 --- a/src/include/execution_kernel.hpp +++ b/src/include/execution_kernel.hpp @@ -140,24 +140,29 @@ MSCCLPP_DEVICE_INLINE void handleSignal(int tid, DeviceHandle* smChan } MSCCLPP_DEVICE_INLINE void handleWait(int tid, DeviceHandle* smChannels, - DeviceHandle* proxyChannels, uint8_t* channelIndex, + DeviceHandle* proxyChannels, uint8_t* channelIndexes, int nChannels, ChannelType chType) { if (tid < nChannels) { if (chType == ChannelType::SM) { - smChannels[channelIndex[tid]].wait(); + smChannels[channelIndexes[tid]].wait(); } if (chType == ChannelType::PROXY) { - proxyChannels[channelIndex[tid]].wait(); + proxyChannels[channelIndexes[tid]].wait(); } } } +MSCCLPP_DEVICE_INLINE void handleGet(DeviceHandle& smChannel, uint32_t srcOffset, uint32_t dstOffset, + uint32_t size) { + smChannel.get(dstOffset, srcOffset, size, threadIdx.x, blockDim.x); +} + template MSCCLPP_DEVICE_INLINE void handleReadReduceCopySend(T* input, uint32_t inputOffsetByBytes, T* output, uint32_t outputOffsetByBytes, DeviceHandle* smChannels, - uint8_t* srcChannelIndex, uint8_t* dstChannelIndex, + uint8_t* srcChannelIndexes, uint8_t* dstChannelIndexes, uint32_t* srcOffsets, uint32_t* dstOffsets, int nSrcChannels, - int nDstChannels, uint32_t size) { + int nDstChannels, uint32_t size, bool sendToRemote = true) { const size_t nInt4 = size / sizeof(int4); const size_t inputOffset4 = inputOffsetByBytes / sizeof(int4); const size_t outputOffset4 = outputOffsetByBytes / sizeof(int4); @@ -168,13 +173,15 @@ MSCCLPP_DEVICE_INLINE void handleReadReduceCopySend(T* input, uint32_t inputOffs for (int index = 0; index < nSrcChannels; ++index) { int4 val; size_t srcOffset = srcOffsets[index] / sizeof(int4); - val = smChannels[srcChannelIndex[index]].read(srcOffset + idx); + val = smChannels[srcChannelIndexes[index]].read(srcOffset + idx); tmp = add_vectors(tmp, val); } output4[outputOffset4 + idx] = tmp; - for (int index = 0; index < nDstChannels; ++index) { - size_t dstOffset = dstOffsets[index] / sizeof(int4); - smChannels[dstChannelIndex[index]].write(dstOffset + idx, tmp); + if (sendToRemote) { + for (int index = 0; index < nDstChannels; ++index) { + size_t dstOffset = dstOffsets[index] / sizeof(int4); + smChannels[dstChannelIndexes[index]].write(dstOffset + idx, tmp); + } } } // handle rest of data @@ -185,12 +192,14 @@ MSCCLPP_DEVICE_INLINE void handleReadReduceCopySend(T* input, uint32_t inputOffs T tmp = input[idx]; for (int index = 0; index < nSrcChannels; ++index) { size_t srcOffset = srcOffsets[index] / sizeof(T); - tmp += smChannels[srcChannelIndex[index]].read(srcOffset + idx); + tmp += smChannels[srcChannelIndexes[index]].read(srcOffset + idx); } output[idx] = tmp; - for (int index = 0; index < nDstChannels; ++index) { - size_t dstOffset = dstOffsets[index] / sizeof(T); - smChannels[dstChannelIndex[index]].write(dstOffset + idx, tmp); + if (sendToRemote) { + for (int index = 0; index < nDstChannels; ++index) { + size_t dstOffset = dstOffsets[index] / sizeof(T); + smChannels[dstChannelIndexes[index]].write(dstOffset + idx, tmp); + } } } } @@ -228,6 +237,10 @@ __global__ void executionKernel([[maybe_unused]] int rank /*for debug*/, T* inpu handleWait(tid, smChannels, proxyChannels, operations[i].inputChannelIndexes, operations[i].nInputChannels, operations[i].channelType); break; + case OperationType::GET: + handleGet(smChannels[operations[i].inputChannelIndexes[0]], operations[i].inputOffsets[0], + operations[i].dstOffset, operations[i].size); + break; case OperationType::READ_REDUCE_COPY_SEND: src = getBuffer(input, output, scratch, operations[i].srcBufferType); dst = getBuffer(input, output, scratch, operations[i].dstBufferType); @@ -236,6 +249,14 @@ __global__ void executionKernel([[maybe_unused]] int rank /*for debug*/, T* inpu operations[i].inputOffsets, operations[i].outputOffsets, operations[i].nInputChannels, operations[i].nOutputChannels, operations[i].size); break; + case OperationType::READ_REDUCE_COPY: + src = getBuffer(input, output, scratch, operations[i].srcBufferType); + dst = getBuffer(input, output, scratch, operations[i].dstBufferType); + handleReadReduceCopySend(src, operations[i].srcOffset, dst, operations[i].dstOffset, smChannels, + operations[i].inputChannelIndexes, operations[i].outputChannelIndexes, + operations[i].inputOffsets, operations[i].outputOffsets, operations[i].nInputChannels, + operations[i].nOutputChannels, operations[i].size, false); + break; default: break; } From d97f31274eee1c0d16c305f7af41ede16796fb8d Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Sun, 7 Apr 2024 11:48:45 +0000 Subject: [PATCH 32/51] WIP --- include/mscclpp/core.hpp | 4 + include/mscclpp/executor.hpp | 7 +- include/mscclpp/packet_device.hpp | 15 ++++ python/mscclpp/__init__.py | 1 + python/mscclpp/executor.cpp | 9 ++- src/executor/execution_kernel.cu | 25 ++++-- src/executor/executor.cc | 25 ++++-- src/include/execution_common.hpp | 4 + src/include/execution_kernel.hpp | 129 +++++++++++++++++++++++------- 9 files changed, 174 insertions(+), 45 deletions(-) diff --git a/include/mscclpp/core.hpp b/include/mscclpp/core.hpp index 50a922bc3..456020975 100644 --- a/include/mscclpp/core.hpp +++ b/include/mscclpp/core.hpp @@ -760,6 +760,10 @@ DeviceHandle> deviceHandle(T&& t) { return t.deviceHandle(); } +/// Packet value type. +template +using PacketValType = typename T::ValueType; + } // namespace mscclpp namespace std { diff --git a/include/mscclpp/executor.hpp b/include/mscclpp/executor.hpp index 21087a762..60a68fbb2 100644 --- a/include/mscclpp/executor.hpp +++ b/include/mscclpp/executor.hpp @@ -17,6 +17,11 @@ enum class DataType { FLOAT32, }; +enum class PacketType { + LL8, + LL16, +}; + class ExecutionPlan { public: ExecutionPlan(std::string planPath); @@ -37,7 +42,7 @@ class Executor { ~Executor(); void execute(int rank, void* sendbuff, void* recvBuff, size_t sendBuffSize, size_t recvBuffSize, DataType dataType, - int nthreads, const ExecutionPlan& plan, cudaStream_t stream); + int nthreads, const ExecutionPlan& plan, cudaStream_t stream, PacketType packetType = PacketType::LL16); private: struct Impl; diff --git a/include/mscclpp/packet_device.hpp b/include/mscclpp/packet_device.hpp index 11f63b53f..7678c81b5 100644 --- a/include/mscclpp/packet_device.hpp +++ b/include/mscclpp/packet_device.hpp @@ -24,12 +24,20 @@ union alignas(16) LL16Packet { uint32_t data2; uint32_t flag2; }; + using ValueType = uint2; #if defined(MSCCLPP_DEVICE_COMPILE) ulonglong2 raw_; MSCCLPP_DEVICE_INLINE LL16Packet() {} + MSCCLPP_DEVICE_INLINE LL16Packet(uint2 val, uint32_t flag) { + data1 = val.x; + flag1 = flag; + data2 = val.y; + flag2 = flag; + } + /// Write 8 bytes of data to the packet. /// @param val1 The first 4-byte data to write. /// @param val2 The second 4-byte data to write. @@ -95,10 +103,17 @@ union alignas(8) LL8Packet { uint32_t flag; }; uint64_t raw_; + + using ValueType = uint32_t; #if defined(MSCCLPP_DEVICE_COMPILE) MSCCLPP_DEVICE_INLINE LL8Packet() {} + MSCCLPP_DEVICE_INLINE LL8Packet(uint32_t val, uint32_t flag) { + data = val; + flag = flag; + } + MSCCLPP_DEVICE_INLINE void write(uint32_t val, uint32_t flag) { #if defined(MSCCLPP_DEVICE_CUDA) asm volatile("st.volatile.global.v2.u32 [%0], {%1,%2};" ::"l"(&raw_), "r"(val), "r"(flag)); diff --git a/python/mscclpp/__init__.py b/python/mscclpp/__init__.py index 0c8f7eb3b..0acc55fc5 100644 --- a/python/mscclpp/__init__.py +++ b/python/mscclpp/__init__.py @@ -22,6 +22,7 @@ DataType, Executor, ExecutionPlan, + PacketType, version, is_nvls_supported, ) diff --git a/python/mscclpp/executor.cpp b/python/mscclpp/executor.cpp index 5276e3336..f57a4294b 100644 --- a/python/mscclpp/executor.cpp +++ b/python/mscclpp/executor.cpp @@ -18,6 +18,8 @@ void register_executor(nb::module_& m) { .value("float16", DataType::FLOAT16) .value("float32", DataType::FLOAT32); + nb::enum_(m, "PacketType").value("LL8", PacketType::LL8).value("LL16", PacketType::LL16); + nb::class_(m, "ExecutionPlan").def(nb::init(), nb::arg("planPath")); nb::class_(m, "Executor") @@ -25,10 +27,11 @@ void register_executor(nb::module_& m) { .def( "execute", [](Executor* self, int rank, uintptr_t sendbuff, uintptr_t recvBuff, size_t sendBuffSize, size_t recvBuffSize, - DataType dataType, int nthreads, const ExecutionPlan& plan, uintptr_t stream) { + DataType dataType, int nthreads, const ExecutionPlan& plan, uintptr_t stream, PacketType packetType) { self->execute(rank, reinterpret_cast(sendbuff), reinterpret_cast(recvBuff), sendBuffSize, - recvBuffSize, dataType, nthreads, plan, (cudaStream_t)stream); + recvBuffSize, dataType, nthreads, plan, (cudaStream_t)stream, packetType); }, nb::arg("rank"), nb::arg("sendbuff"), nb::arg("recvBuff"), nb::arg("sendBuffSize"), nb::arg("recvBuffSize"), - nb::arg("dataType"), nb::arg("nthreads"), nb::arg("plan"), nb::arg("stream")); + nb::arg("dataType"), nb::arg("nthreads"), nb::arg("plan"), nb::arg("stream"), + nb::arg("packetType") = PacketType::LL16); } diff --git a/src/executor/execution_kernel.cu b/src/executor/execution_kernel.cu index d5e07a3da..7aca5b1ed 100644 --- a/src/executor/execution_kernel.cu +++ b/src/executor/execution_kernel.cu @@ -5,27 +5,36 @@ #if defined(MSCCLPP_DEVICE_CUDA) namespace mscclpp { + +template void ExecutionKernel::launchKernel(int rank, int nthreadblocks, int nthreads, void* src, void* dst, void* scratch, DataType dataType, DeviceExecutionPlan* plan, size_t sharedMemSize, - cudaStream_t stream) { + cudaStream_t stream, uint32_t flag) { switch (dataType) { case DataType::INT32: - executionKernel<<>>(rank, (int32_t*)src, (int32_t*)dst, - (int32_t*)scratch, plan); + executionKernel<<>>( + rank, (int32_t*)src, (int32_t*)dst, (int32_t*)scratch, plan, flag); break; case DataType::UINT32: executionKernel<<>>( - rank, (uint32_t*)src, (uint32_t*)dst, (uint32_t*)scratch, plan); + rank, (uint32_t*)src, (uint32_t*)dst, (uint32_t*)scratch, plan, flag); break; case DataType::FLOAT16: - executionKernel - <<>>(rank, (half*)src, (half*)dst, (half*)scratch, plan); + executionKernel<<>>(rank, (half*)src, (half*)dst, + (half*)scratch, plan, flag); break; case DataType::FLOAT32: - executionKernel - <<>>(rank, (float*)src, (float*)dst, (float*)scratch, plan); + executionKernel<<>>(rank, (float*)src, (float*)dst, + (float*)scratch, plan, flag); break; } } + +template void ExecutionKernel::launchKernel(int rank, int nthreadblocks, int nthreads, void* src, void* dst, + void* scratch, DataType dataType, DeviceExecutionPlan* plan, + size_t sharedMemSize, cudaStream_t stream, uint32_t flag); +template void ExecutionKernel::launchKernel(int rank, int nthreadblocks, int nthreads, void* src, void* dst, + void* scratch, DataType dataType, DeviceExecutionPlan* plan, + size_t sharedMemSize, cudaStream_t stream, uint32_t flag); } // namespace mscclpp #endif diff --git a/src/executor/executor.cc b/src/executor/executor.cc index 1d6d9305e..d775cd593 100644 --- a/src/executor/executor.cc +++ b/src/executor/executor.cc @@ -258,12 +258,24 @@ struct Executor::Impl { } void launchKernel(ExecutionContext& context, int rank, int nthreadsPerBlock, void* sendbuff, void* recvbuff, - DataType dataType, cudaStream_t stream) { + DataType dataType, cudaStream_t stream, PacketType packetType) { + static uint32_t flag = 0; int nthreadblocks = context.deviceExecutionPlans.size(); size_t sharedMemSize = sizeof(DeviceExecutionPlan); - ExecutionKernel::launchKernel( - rank, nthreadblocks, nthreadsPerBlock, sendbuff, recvbuff, (void*)context.scratchBuffer.get(), dataType, - (DeviceExecutionPlan*)context.deviceExecutionPlansBuffer.get(), sharedMemSize, stream); + switch (packetType) { + case PacketType::LL16: + ExecutionKernel::launchKernel( + rank, nthreadblocks, nthreadsPerBlock, sendbuff, recvbuff, (void*)context.scratchBuffer.get(), dataType, + (DeviceExecutionPlan*)context.deviceExecutionPlansBuffer.get(), sharedMemSize, stream, ++flag); + break; + case PacketType::LL8: + ExecutionKernel::launchKernel( + rank, nthreadblocks, nthreadsPerBlock, sendbuff, recvbuff, (void*)context.scratchBuffer.get(), dataType, + (DeviceExecutionPlan*)context.deviceExecutionPlansBuffer.get(), sharedMemSize, stream, ++flag); + break; + default: + throw std::runtime_error("Invalid packet type"); + } } }; @@ -271,10 +283,11 @@ Executor::Executor(std::shared_ptr comm, int nranksPerNode) : impl_(std::make_unique(comm, nranksPerNode)) {} void Executor::execute(int rank, void* sendbuff, void* recvBuff, size_t sendBuffSize, size_t recvBuffSize, - DataType dataType, int nthreads, const ExecutionPlan& plan, cudaStream_t stream) { + DataType dataType, int nthreads, const ExecutionPlan& plan, cudaStream_t stream, + PacketType packetType) { ExecutionContext context = this->impl_->setupExecutionContext(rank, sendbuff, recvBuff, sendBuffSize, recvBuffSize, plan, stream); - this->impl_->launchKernel(context, rank, nthreads, sendbuff, recvBuff, dataType, stream); + this->impl_->launchKernel(context, rank, nthreads, sendbuff, recvBuff, dataType, stream, packetType); } Executor::~Executor() = default; diff --git a/src/include/execution_common.hpp b/src/include/execution_common.hpp index 59d341612..5a63859b8 100644 --- a/src/include/execution_common.hpp +++ b/src/include/execution_common.hpp @@ -27,13 +27,17 @@ enum class ChannelType : uint8_t { enum class OperationType : uint8_t { BARRIER, PUT, + PUT_PACKET, GET, COPY, + COPY_PACKET, SIGNAL, WAIT, FLUSH, REDUCE, + REDUCE_PACKET, REDUCE_SEND, + REDUCE_SEND_PACKET, READ_REDUCE_COPY, READ_REDUCE_COPY_SEND, }; diff --git a/src/include/execution_kernel.hpp b/src/include/execution_kernel.hpp index 833130fc9..023ca1a15 100644 --- a/src/include/execution_kernel.hpp +++ b/src/include/execution_kernel.hpp @@ -5,6 +5,7 @@ #define MSCCLPP_EXECUTION_KERNEL_HPP_ #include +#include #include #include @@ -102,7 +103,7 @@ MSCCLPP_DEVICE_INLINE uint32_t add_vectors(uint32_t a, uint32_t b) { } template <> -MSCCLPP_DEVICE_INLINE __attribute__((unused)) uint32_t add_vectors<__half>(uint32_t a, uint32_t b) { +MSCCLPP_DEVICE_INLINE uint32_t add_vectors<__half>(uint32_t a, uint32_t b) { return add_vectors_helper<__half2>(a, b); } @@ -112,6 +113,7 @@ MSCCLPP_DEVICE_INLINE __attribute__((unused)) uint32_t add_vectors<__half>(uint3 namespace mscclpp { #if defined(MSCCLPP_DEVICE_COMPILE) + template MSCCLPP_DEVICE_INLINE T* getBuffer(T* input, T* output, T* scratch, BufferType bufferType) { if (bufferType == BufferType::INPUT) { @@ -158,11 +160,11 @@ MSCCLPP_DEVICE_INLINE void handleGet(DeviceHandle& smChannel, uint32_ } template -MSCCLPP_DEVICE_INLINE void handleReadReduceCopySend(T* input, uint32_t inputOffsetByBytes, T* output, - uint32_t outputOffsetByBytes, DeviceHandle* smChannels, - uint8_t* srcChannelIndexes, uint8_t* dstChannelIndexes, - uint32_t* srcOffsets, uint32_t* dstOffsets, int nSrcChannels, - int nDstChannels, uint32_t size, bool sendToRemote = true) { +MSCCLPP_DEVICE_INLINE void handleReadReduceCopySend(T* output, uint32_t outputOffsetByBytes, T* input, + uint32_t inputOffsetByBytes, DeviceHandle* smChannels, + uint8_t* dstChannelIndexes, uint8_t* srcChannelIndexes, + uint32_t* dstOffsets, uint32_t* srcOffsets, int nDstChannels, + int nSrcChannels, uint32_t size, bool sendToRemote = true) { const size_t nInt4 = size / sizeof(int4); const size_t inputOffset4 = inputOffsetByBytes / sizeof(int4); const size_t outputOffset4 = outputOffsetByBytes / sizeof(int4); @@ -204,9 +206,59 @@ MSCCLPP_DEVICE_INLINE void handleReadReduceCopySend(T* input, uint32_t inputOffs } } -template +template +MSCCLPP_DEVICE_INLINE void handlePutPacket(uint32_t inputOffsetByBytes, DeviceHandle* smChannels, + uint8_t* dstChannelIndexes, uint32_t* dstOffsets, int nDstChannels, + uint32_t size, uint32_t flag) { + for (int index = 0; index < nDstChannels; ++index) { + smChannels[dstChannelIndexes[index]].putPackets(dstOffsets[index], inputOffsetByBytes, size, + threadIdx.x, blockDim.x, flag); + } +} + +template +MSCCLPP_DEVICE_INLINE void handleReduceSendPacket(T* output, uint32_t outputOffsetByBytes, T* input, + uint32_t inputOffsetByBytes, DeviceHandle* smChannels, + uint8_t* dstChannelIndexes, uint32_t* dstOffsets, + uint32_t* srcOffsets, int nDstChannels, int nSrcs, size_t size, + uint32_t flag) { + size_t nPackets = size * 2 / sizeof(PacketType); + uint32_t srcOffset = inputOffsetByBytes / sizeof(PacketValType); + uint32_t dstOffset = outputOffsetByBytes / sizeof(PacketValType); + PacketValType* src = (PacketValType*)input + srcOffset; + PacketValType* dst = (PacketValType*)output + dstOffset; + for (int idx = threadIdx.x; idx < nPackets; idx += blockDim.x) { + PacketValType data = {}; + for (int index = 0; index < nSrcs; ++index) { + PacketType* pkt = (PacketType*)input + srcOffsets[index] / sizeof(PacketType); + PacketValType val = pkt[idx].read(flag); + data = add_vectors(data, val); + } + data = add_vectors(data, src[idx]); + dst[idx] = data; + + PacketType pkt(data, flag); + for (int index = 0; index < nDstChannels; ++index) { + smChannels[dstChannelIndexes[index]].write(dstOffsets[index] / sizeof(PacketValType) + idx, pkt); + } + } +} + +template +MSCCLPP_DEVICE_INLINE void handleCopyPacket(void* dst, void* src, uint32_t dstOffset, uint32_t srcOffset, size_t size, + uint32_t flag) { + PacketType* srcPackets = (PacketType*)src; + PacketValType* result = (PacketValType*)dst; + size_t nPackets = size * 2 / sizeof(PacketType); + for (size_t idx = threadIdx.x; idx < nPackets; idx += blockDim.x) { + PacketValType data = srcPackets[idx].read(flag); + result[idx] = data; + } +} + +template __global__ void executionKernel([[maybe_unused]] int rank /*for debug*/, T* input, T* output, T* scratch, - DeviceExecutionPlan* plan) { + DeviceExecutionPlan* plan, uint32_t flag) { extern __shared__ int sharedMem[]; int bid = blockIdx.x; int tid = threadIdx.x; @@ -242,20 +294,39 @@ __global__ void executionKernel([[maybe_unused]] int rank /*for debug*/, T* inpu operations[i].dstOffset, operations[i].size); break; case OperationType::READ_REDUCE_COPY_SEND: - src = getBuffer(input, output, scratch, operations[i].srcBufferType); dst = getBuffer(input, output, scratch, operations[i].dstBufferType); - handleReadReduceCopySend(src, operations[i].srcOffset, dst, operations[i].dstOffset, smChannels, - operations[i].inputChannelIndexes, operations[i].outputChannelIndexes, - operations[i].inputOffsets, operations[i].outputOffsets, operations[i].nInputChannels, - operations[i].nOutputChannels, operations[i].size); + src = getBuffer(input, output, scratch, operations[i].srcBufferType); + handleReadReduceCopySend(dst, operations[i].dstOffset, src, operations[i].srcOffset, smChannels, + operations[i].outputChannelIndexes, operations[i].inputChannelIndexes, + operations[i].outputOffsets, operations[i].inputOffsets, operations[i].nOutputChannels, + operations[i].nInputChannels, operations[i].size); break; case OperationType::READ_REDUCE_COPY: + dst = getBuffer(input, output, scratch, operations[i].dstBufferType); src = getBuffer(input, output, scratch, operations[i].srcBufferType); + handleReadReduceCopySend(dst, operations[i].dstOffset, src, operations[i].srcOffset, smChannels, + operations[i].outputChannelIndexes, operations[i].inputChannelIndexes, + operations[i].outputOffsets, operations[i].inputOffsets, operations[i].nOutputChannels, + operations[i].nInputChannels, operations[i].size, false); + break; + case OperationType::PUT_PACKET: + handlePutPacket(operations[i].srcOffset, smChannels, operations[i].outputChannelIndexes, + operations[i].outputOffsets, operations[i].nOutputChannels, operations[i].size, + flag); + break; + case OperationType::REDUCE_SEND_PACKET: dst = getBuffer(input, output, scratch, operations[i].dstBufferType); - handleReadReduceCopySend(src, operations[i].srcOffset, dst, operations[i].dstOffset, smChannels, - operations[i].inputChannelIndexes, operations[i].outputChannelIndexes, - operations[i].inputOffsets, operations[i].outputOffsets, operations[i].nInputChannels, - operations[i].nOutputChannels, operations[i].size, false); + src = getBuffer(input, output, scratch, operations[i].srcBufferType); + handleReduceSendPacket(dst, operations[i].dstOffset, src, operations[i].srcOffset, smChannels, + operations[i].outputChannelIndexes, operations[i].outputOffsets, + operations[i].inputOffsets, operations[i].nOutputChannels, + operations[i].nInputChannels, operations[i].size, flag); + break; + case OperationType::COPY_PACKET: + dst = getBuffer(input, output, scratch, operations[i].dstBufferType); + src = getBuffer(input, output, scratch, operations[i].srcBufferType); + handleCopyPacket(dst, src, operations[i].dstOffset, operations[i].srcOffset, operations[i].size, + flag); break; default: break; @@ -267,30 +338,34 @@ __global__ void executionKernel([[maybe_unused]] int rank /*for debug*/, T* inpu class ExecutionKernel { public: #if defined(MSCCLPP_DEVICE_HIP) + template static void launchKernel(int rank, int nthreadblocks, int nthreads, void* src, void* dst, void* scratch, - DataType dataType, DeviceExecutionPlan* plan, size_t sharedMemSize, cudaStream_t stream) { + DataType dataType, DeviceExecutionPlan* plan, size_t sharedMemSize, cudaStream_t stream, + uint32_t flag = 0) { switch (dataType) { case DataType::INT32: - executionKernel<<>>(rank, (int32_t*)src, (int32_t*)dst, - (int32_t*)scratch, plan); + executionKernel<<>>( + rank, (int32_t*)src, (int32_t*)dst, (int32_t*)scratch, plan, flag); break; case DataType::UINT32: - executionKernel<<>>( - rank, (uint32_t*)src, (uint32_t*)dst, (uint32_t*)scratch, plan); + executionKernel<<>>( + rank, (uint32_t*)src, (uint32_t*)dst, (uint32_t*)scratch, plan, flag); break; case DataType::FLOAT16: - executionKernel - <<>>(rank, (half*)src, (half*)dst, (half*)scratch, plan); + executionKernel<<>>( + rank, (half*)src, (half*)dst, (half*)scratch, plan, flag); break; case DataType::FLOAT32: - executionKernel - <<>>(rank, (float*)src, (float*)dst, (float*)scratch, plan); + executionKernel<<>>( + rank, (float*)src, (float*)dst, (float*)scratch, plan, flag); break; } } #else // !defined(MSCCLPP_DEVICE_HIP) + template static void launchKernel(int rank, int nthreadblocks, int nthreads, void* src, void* dst, void* scratch, - DataType dataType, DeviceExecutionPlan* plan, size_t sharedMemSize, cudaStream_t stream); + DataType dataType, DeviceExecutionPlan* plan, size_t sharedMemSize, cudaStream_t stream, + uint32_t flag = 0); #endif // !defined(MSCCLPP_DEVICE_HIP) }; } // namespace mscclpp From 64106f1f419e40f4880f1ca3edb7407c0be1c64c Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Sun, 7 Apr 2024 12:44:09 +0000 Subject: [PATCH 33/51] WIP --- include/mscclpp/packet_device.hpp | 4 +- src/executor/execution_plan.cc | 6 +- src/include/execution_kernel.hpp | 19 +- src/include/execution_plan.hpp | 1 + test/execution-files/allreduce_packet.json | 330 +++++++++++++++++++++ test/executor_test.cc | 4 +- 6 files changed, 350 insertions(+), 14 deletions(-) create mode 100644 test/execution-files/allreduce_packet.json diff --git a/include/mscclpp/packet_device.hpp b/include/mscclpp/packet_device.hpp index 7678c81b5..a20c8abec 100644 --- a/include/mscclpp/packet_device.hpp +++ b/include/mscclpp/packet_device.hpp @@ -110,8 +110,8 @@ union alignas(8) LL8Packet { MSCCLPP_DEVICE_INLINE LL8Packet() {} MSCCLPP_DEVICE_INLINE LL8Packet(uint32_t val, uint32_t flag) { - data = val; - flag = flag; + this->data = val; + this->flag = flag; } MSCCLPP_DEVICE_INLINE void write(uint32_t val, uint32_t flag) { diff --git a/src/executor/execution_plan.cc b/src/executor/execution_plan.cc index 4a64e86e6..ac4a8fdbf 100644 --- a/src/executor/execution_plan.cc +++ b/src/executor/execution_plan.cc @@ -69,7 +69,7 @@ auto convertToChannelType = [](const std::string& str) { namespace mscclpp { using json = nlohmann::json; -ExecutionPlan::Impl::Impl(std::string planPath) : planPath(planPath) {} +ExecutionPlan::Impl::Impl(std::string planPath) : planPath(planPath), isUsingPacket(false) {} std::vector ExecutionPlan::Impl::getChannelInfos(int rank, ChannelType channelType) const { auto pred = [channelType](const ChannelInfo& info) { return info.channelType == channelType; }; @@ -111,6 +111,10 @@ void ExecutionPlan::Impl::loadExecutionPlan(size_t inputSize) { std::ifstream file(this->planPath); json obj = json::parse(file); this->name = obj["name"]; + std::string protocol = obj["protocol"]; + if (protocol == "LL") { + this->isUsingPacket = true; + } auto gpus = obj["gpus"]; for (const auto& gpu : gpus) { diff --git a/src/include/execution_kernel.hpp b/src/include/execution_kernel.hpp index 023ca1a15..be0533f35 100644 --- a/src/include/execution_kernel.hpp +++ b/src/include/execution_kernel.hpp @@ -211,8 +211,8 @@ MSCCLPP_DEVICE_INLINE void handlePutPacket(uint32_t inputOffsetByBytes, DeviceHa uint8_t* dstChannelIndexes, uint32_t* dstOffsets, int nDstChannels, uint32_t size, uint32_t flag) { for (int index = 0; index < nDstChannels; ++index) { - smChannels[dstChannelIndexes[index]].putPackets(dstOffsets[index], inputOffsetByBytes, size, - threadIdx.x, blockDim.x, flag); + smChannels[dstChannelIndexes[index]].putPackets( + dstOffsets[index] * sizeof(PacketType), inputOffsetByBytes, size, threadIdx.x, blockDim.x, flag); } } @@ -223,14 +223,14 @@ MSCCLPP_DEVICE_INLINE void handleReduceSendPacket(T* output, uint32_t outputOffs uint32_t* srcOffsets, int nDstChannels, int nSrcs, size_t size, uint32_t flag) { size_t nPackets = size * 2 / sizeof(PacketType); - uint32_t srcOffset = inputOffsetByBytes / sizeof(PacketValType); - uint32_t dstOffset = outputOffsetByBytes / sizeof(PacketValType); + const uint32_t srcOffset = inputOffsetByBytes / sizeof(PacketValType); + const uint32_t dstOffset = outputOffsetByBytes / sizeof(PacketValType); PacketValType* src = (PacketValType*)input + srcOffset; PacketValType* dst = (PacketValType*)output + dstOffset; - for (int idx = threadIdx.x; idx < nPackets; idx += blockDim.x) { + for (size_t idx = threadIdx.x; idx < nPackets; idx += blockDim.x) { PacketValType data = {}; for (int index = 0; index < nSrcs; ++index) { - PacketType* pkt = (PacketType*)input + srcOffsets[index] / sizeof(PacketType); + PacketType* pkt = (PacketType*)((char*)input + 2 * srcOffsets[index]); PacketValType val = pkt[idx].read(flag); data = add_vectors(data, val); } @@ -239,7 +239,8 @@ MSCCLPP_DEVICE_INLINE void handleReduceSendPacket(T* output, uint32_t outputOffs PacketType pkt(data, flag); for (int index = 0; index < nDstChannels; ++index) { - smChannels[dstChannelIndexes[index]].write(dstOffsets[index] / sizeof(PacketValType) + idx, pkt); + size_t offset = (dstOffsets[index] * 2) / sizeof(PacketType); + smChannels[dstChannelIndexes[index]].write(offset + idx, pkt); } } } @@ -247,8 +248,8 @@ MSCCLPP_DEVICE_INLINE void handleReduceSendPacket(T* output, uint32_t outputOffs template MSCCLPP_DEVICE_INLINE void handleCopyPacket(void* dst, void* src, uint32_t dstOffset, uint32_t srcOffset, size_t size, uint32_t flag) { - PacketType* srcPackets = (PacketType*)src; - PacketValType* result = (PacketValType*)dst; + PacketType* srcPackets = (PacketType*)((char*)src + 2 * srcOffset); + PacketValType* result = (PacketValType*)((char*)dst + dstOffset); size_t nPackets = size * 2 / sizeof(PacketType); for (size_t idx = threadIdx.x; idx < nPackets; idx += blockDim.x) { PacketValType data = srcPackets[idx].read(flag); diff --git a/src/include/execution_plan.hpp b/src/include/execution_plan.hpp index 6a4aaa80a..8c0029f0a 100644 --- a/src/include/execution_plan.hpp +++ b/src/include/execution_plan.hpp @@ -62,6 +62,7 @@ struct ExecutionPlan::Impl { void setupOperations(const nlohmann::json& gpus); std::string planPath; + bool isUsingPacket; // operations for [rank][threadblock] = [operations] std::unordered_map>> operations; std::unordered_map> channelInfos; diff --git a/test/execution-files/allreduce_packet.json b/test/execution-files/allreduce_packet.json new file mode 100644 index 000000000..3eda0ff3d --- /dev/null +++ b/test/execution-files/allreduce_packet.json @@ -0,0 +1,330 @@ +{ + "name": "allreduce_pairs", + "colletive": "allreduce", + "protocol": "LL", + "inplace": true, + "gpus": [ + { + "id": 0, + "inputChunks": 4, + "outputChunks": 0, + "scratchChunks": 8, + "threadblocks": [ + { + "id": 0, + "ops": [ + { + "name": "ppkt", + "o_buff": { + "src": "i", + "dst": "s" + }, + "o_cids": [ + { + "id": 0, + "off": 0 + } + ], + "src": 0, + "srcbuff": "i", + "srcoff": 2, + "ctype": "sm", + "cnt": 1 + }, + { + "name": "rspkt", + "o_buff": { + "src": "i", + "dst": "s" + }, + "o_cids": [ + { + "id": 0, + "off": 4 + } + ], + "src": 0, + "srcs": [ + { + "buff": "s", + "off": 2 + } + ], + "srcbuff": "i", + "srcoff": 0, + "dst": 0, + "dstbuff": "i", + "dstoff": 0, + "ctype": "none", + "cnt": 1 + }, + { + "name": "cpkt", + "src": 0, + "srcbuff": "s", + "srcoff": 6, + "dst": 0, + "dstbuff": "i", + "dstoff": 2, + "ctype": "none", + "cnt": 1 + } + ], + "channels": [ + { + "src": "i", + "dst": "s", + "ctype": "sm", + "cids": [ + 0 + ] + } + ] + }, + { + "id": 1, + "ops": [ + { + "name": "ppkt", + "o_buff": { + "src": "i", + "dst": "s" + }, + "o_cids": [ + { + "id": 0, + "off": 1 + } + ], + "src": 0, + "srcbuff": "i", + "srcoff": 3, + "ctype": "sm", + "cnt": 1 + }, + { + "name": "rspkt", + "o_buff": { + "src": "i", + "dst": "s" + }, + "o_cids": [ + { + "id": 0, + "off": 5 + } + ], + "src": 0, + "srcs": [ + { + "buff": "s", + "off": 3 + } + ], + "srcbuff": "i", + "srcoff": 1, + "dst": 0, + "dstbuff": "i", + "dstoff": 1, + "ctype": "none", + "cnt": 1 + }, + { + "name": "cpkt", + "src": 0, + "srcbuff": "s", + "srcoff": 7, + "dst": 0, + "dstbuff": "i", + "dstoff": 3, + "ctype": "none", + "cnt": 1 + } + ], + "channels": [ + { + "src": "i", + "dst": "s", + "ctype": "sm", + "cids": [ + 1 + ] + } + ] + } + ], + "channels": [ + { + "srcbuff": "i", + "dstbuff": "s", + "type": "sm", + "connectedTo": [ + 1, + 1 + ] + } + ] + }, + { + "id": 1, + "inputChunks": 4, + "outputChunks": 0, + "scratchChunks": 6, + "threadblocks": [ + { + "id": 0, + "ops": [ + { + "name": "ppkt", + "o_buff": { + "src": "i", + "dst": "s" + }, + "o_cids": [ + { + "id": 0, + "off": 2 + } + ], + "src": 1, + "srcbuff": "i", + "srcoff": 0, + "ctype": "sm", + "cnt": 1 + }, + { + "name": "rspkt", + "o_buff": { + "src": "i", + "dst": "s" + }, + "o_cids": [ + { + "id": 0, + "off": 6 + } + ], + "src": 1, + "srcs": [ + { + "buff": "s", + "off": 0 + } + ], + "srcbuff": "i", + "srcoff": 2, + "dst": 1, + "dstbuff": "i", + "dstoff": 2, + "ctype": "none", + "cnt": 1 + }, + { + "name": "cpkt", + "src": 1, + "srcbuff": "s", + "srcoff": 4, + "dst": 1, + "dstbuff": "i", + "dstoff": 0, + "ctype": "none", + "cnt": 1 + } + ], + "channels": [ + { + "src": "i", + "dst": "s", + "ctype": "sm", + "cids": [ + 0 + ] + } + ] + }, + { + "id": 1, + "ops": [ + { + "name": "ppkt", + "o_buff": { + "src": "i", + "dst": "s" + }, + "o_cids": [ + { + "id": 0, + "off": 3 + } + ], + "src": 1, + "srcbuff": "i", + "srcoff": 1, + "ctype": "sm", + "cnt": 1 + }, + { + "name": "rspkt", + "o_buff": { + "src": "i", + "dst": "s" + }, + "o_cids": [ + { + "id": 0, + "off": 7 + } + ], + "src": 1, + "srcs": [ + { + "buff": "s", + "off": 1 + } + ], + "srcbuff": "i", + "srcoff": 3, + "dst": 1, + "dstbuff": "i", + "dstoff": 3, + "ctype": "none", + "cnt": 1 + }, + { + "name": "cpkt", + "src": 1, + "srcbuff": "s", + "srcoff": 5, + "dst": 1, + "dstbuff": "i", + "dstoff": 1, + "ctype": "none", + "cnt": 1 + } + ], + "channels": [ + { + "src": "i", + "dst": "s", + "ctype": "sm", + "cids": [ + 1 + ] + } + ] + } + ], + "channels": [ + { + "srcbuff": "i", + "dstbuff": "s", + "type": "sm", + "connectedTo": [ + 0, + 0 + ] + } + ] + } + ] + } diff --git a/test/executor_test.cc b/test/executor_test.cc index 4a7b36a79..c708a97d3 100644 --- a/test/executor_test.cc +++ b/test/executor_test.cc @@ -29,12 +29,12 @@ int main() { MPI_Bcast(&id, sizeof(id), MPI_BYTE, 0, MPI_COMM_WORLD); bootstrap->initialize(id); // sleep 20s - // std::this_thread::sleep_for(std::chrono::seconds(20)); + std::this_thread::sleep_for(std::chrono::seconds(20)); auto comm = std::make_shared(bootstrap); CUDACHECK(cudaSetDevice(rank)); std::shared_ptr executor = std::make_shared(comm, 8 /*nranksPerNode*/); - mscclpp::ExecutionPlan plan(MSCCLPP_ROOT_PATH + "/test/execution-files/allreduce.json"); + mscclpp::ExecutionPlan plan(MSCCLPP_ROOT_PATH + "/test/execution-files/allreduce_packet.json"); const int bufferSize = 1024 * 1024; std::shared_ptr sendbuff = mscclpp::allocExtSharedCuda(bufferSize); mscclpp::CudaStreamWithFlags stream(cudaStreamNonBlocking); From feaf058e1cee4c84e955bfc80fa8317a408130fa Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Sun, 7 Apr 2024 12:45:55 +0000 Subject: [PATCH 34/51] update --- src/executor/execution_plan.cc | 8 ++++---- src/include/execution_common.hpp | 4 ++-- src/include/execution_kernel.hpp | 19 +++++++++---------- 3 files changed, 15 insertions(+), 16 deletions(-) diff --git a/src/executor/execution_plan.cc b/src/executor/execution_plan.cc index ac4a8fdbf..4c75797bb 100644 --- a/src/executor/execution_plan.cc +++ b/src/executor/execution_plan.cc @@ -209,19 +209,19 @@ void ExecutionPlan::Impl::setupOperations(const json& gpus) { operation.channelType = convertToChannelType(op["ctype"]); } if (op.contains("i_cids")) { - operation.nInputChannels = op["i_cids"].size(); + operation.nInputs = op["i_cids"].size(); } if (op.contains("o_cids")) { - operation.nOutputChannels = op["o_cids"].size(); + operation.nOutputs = op["o_cids"].size(); } - for (int i = 0; i < operation.nInputChannels; i++) { + for (int i = 0; i < operation.nInputs; i++) { BufferType srcBufferType = convertToBufferType(op["i_buff"]["src"]); BufferType dstBufferType = convertToBufferType(op["i_buff"]["dst"]); operation.inputChannelIndexes[i] = channelIndexes[{srcBufferType, dstBufferType, operation.channelType}][op["i_cids"][i]["id"]]; operation.inputOffsets[i] = this->chunkSize * (int)op["i_cids"][i]["off"]; } - for (int i = 0; i < operation.nOutputChannels; i++) { + for (int i = 0; i < operation.nOutputs; i++) { BufferType srcBufferType = convertToBufferType(op["o_buff"]["src"]); BufferType dstBufferType = convertToBufferType(op["o_buff"]["dst"]); operation.outputChannelIndexes[i] = diff --git a/src/include/execution_common.hpp b/src/include/execution_common.hpp index 5a63859b8..ba61fb84c 100644 --- a/src/include/execution_common.hpp +++ b/src/include/execution_common.hpp @@ -52,8 +52,8 @@ struct Operation { ChannelType channelType; BufferType srcBufferType; BufferType dstBufferType; - uint8_t nInputChannels; - uint8_t nOutputChannels; + uint8_t nInputs; + uint8_t nOutputs; uint8_t inputChannelIndexes[MAX_CHANNEL_PER_OPERATION]; uint8_t outputChannelIndexes[MAX_CHANNEL_PER_OPERATION]; uint32_t inputOffsets[MAX_CHANNEL_PER_OPERATION]; diff --git a/src/include/execution_kernel.hpp b/src/include/execution_kernel.hpp index be0533f35..e2ceaf224 100644 --- a/src/include/execution_kernel.hpp +++ b/src/include/execution_kernel.hpp @@ -283,11 +283,11 @@ __global__ void executionKernel([[maybe_unused]] int rank /*for debug*/, T* inpu __syncthreads(); break; case OperationType::SIGNAL: - handleSignal(tid, smChannels, proxyChannels, operations[i].outputChannelIndexes, operations[i].nOutputChannels, + handleSignal(tid, smChannels, proxyChannels, operations[i].outputChannelIndexes, operations[i].nOutputs, operations[i].channelType); break; case OperationType::WAIT: - handleWait(tid, smChannels, proxyChannels, operations[i].inputChannelIndexes, operations[i].nInputChannels, + handleWait(tid, smChannels, proxyChannels, operations[i].inputChannelIndexes, operations[i].nInputs, operations[i].channelType); break; case OperationType::GET: @@ -299,29 +299,28 @@ __global__ void executionKernel([[maybe_unused]] int rank /*for debug*/, T* inpu src = getBuffer(input, output, scratch, operations[i].srcBufferType); handleReadReduceCopySend(dst, operations[i].dstOffset, src, operations[i].srcOffset, smChannels, operations[i].outputChannelIndexes, operations[i].inputChannelIndexes, - operations[i].outputOffsets, operations[i].inputOffsets, operations[i].nOutputChannels, - operations[i].nInputChannels, operations[i].size); + operations[i].outputOffsets, operations[i].inputOffsets, operations[i].nOutputs, + operations[i].nInputs, operations[i].size); break; case OperationType::READ_REDUCE_COPY: dst = getBuffer(input, output, scratch, operations[i].dstBufferType); src = getBuffer(input, output, scratch, operations[i].srcBufferType); handleReadReduceCopySend(dst, operations[i].dstOffset, src, operations[i].srcOffset, smChannels, operations[i].outputChannelIndexes, operations[i].inputChannelIndexes, - operations[i].outputOffsets, operations[i].inputOffsets, operations[i].nOutputChannels, - operations[i].nInputChannels, operations[i].size, false); + operations[i].outputOffsets, operations[i].inputOffsets, operations[i].nOutputs, + operations[i].nInputs, operations[i].size, false); break; case OperationType::PUT_PACKET: handlePutPacket(operations[i].srcOffset, smChannels, operations[i].outputChannelIndexes, - operations[i].outputOffsets, operations[i].nOutputChannels, operations[i].size, - flag); + operations[i].outputOffsets, operations[i].nOutputs, operations[i].size, flag); break; case OperationType::REDUCE_SEND_PACKET: dst = getBuffer(input, output, scratch, operations[i].dstBufferType); src = getBuffer(input, output, scratch, operations[i].srcBufferType); handleReduceSendPacket(dst, operations[i].dstOffset, src, operations[i].srcOffset, smChannels, operations[i].outputChannelIndexes, operations[i].outputOffsets, - operations[i].inputOffsets, operations[i].nOutputChannels, - operations[i].nInputChannels, operations[i].size, flag); + operations[i].inputOffsets, operations[i].nOutputs, operations[i].nInputs, + operations[i].size, flag); break; case OperationType::COPY_PACKET: dst = getBuffer(input, output, scratch, operations[i].dstBufferType); From d52ef41ba5b1836fd812ebd1776087fb82970608 Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Sun, 7 Apr 2024 13:47:36 +0000 Subject: [PATCH 35/51] WIP --- src/executor/execution_plan.cc | 18 + src/include/execution_common.hpp | 1 + test/execution-files/allreduce_packet.json | 636 ++++++++++----------- 3 files changed, 337 insertions(+), 318 deletions(-) diff --git a/src/executor/execution_plan.cc b/src/executor/execution_plan.cc index 4c75797bb..29bd28e07 100644 --- a/src/executor/execution_plan.cc +++ b/src/executor/execution_plan.cc @@ -37,6 +37,12 @@ auto getOpType = [](const std::string& str) { return mscclpp::OperationType::READ_REDUCE_COPY; } else if (str == "rrcs") { return mscclpp::OperationType::READ_REDUCE_COPY_SEND; + } else if (str == "ppkt") { + return mscclpp::OperationType::PUT_PACKET; + } else if (str == "rspkt") { + return mscclpp::OperationType::REDUCE_SEND_PACKET; + } else if (str == "cpkt") { + return mscclpp::OperationType::COPY_PACKET; } else { throw std::runtime_error("Invalid operation type"); } @@ -59,6 +65,8 @@ auto convertToChannelType = [](const std::string& str) { return mscclpp::ChannelType::SM; } else if (str == "proxy") { return mscclpp::ChannelType::PROXY; + } else if (str == "none") { + return mscclpp::ChannelType::NONE; } else { throw std::runtime_error("Invalid channel type"); } @@ -99,6 +107,9 @@ std::vector ExecutionPlan::Impl::getConnectedBufferTypes(int rank) c return std::vector(bufferTypes.begin(), bufferTypes.end()); } size_t ExecutionPlan::Impl::getScratchBufferSize(int rank, size_t inputSize) const { + if (this->isUsingPacket) { + return inputSize / this->inputChunks.at(rank) * this->scratchChunks.at(rank) * 2; + } return inputSize / this->inputChunks.at(rank) * this->scratchChunks.at(rank); } std::vector ExecutionPlan::Impl::getOperations(int rank, int threadblock) const { @@ -221,6 +232,13 @@ void ExecutionPlan::Impl::setupOperations(const json& gpus) { channelIndexes[{srcBufferType, dstBufferType, operation.channelType}][op["i_cids"][i]["id"]]; operation.inputOffsets[i] = this->chunkSize * (int)op["i_cids"][i]["off"]; } + // will have either srcs or i_cids + if (op.contains("srcs")) { + operation.nInputs = op["srcs"].size(); + } + for (int i = 0; i < operation.nInputs; i++) { + operation.inputOffsets[i] = this->chunkSize * (int)op["srcs"][i]["off"]; + } for (int i = 0; i < operation.nOutputs; i++) { BufferType srcBufferType = convertToBufferType(op["o_buff"]["src"]); BufferType dstBufferType = convertToBufferType(op["o_buff"]["dst"]); diff --git a/src/include/execution_common.hpp b/src/include/execution_common.hpp index ba61fb84c..2d03feb61 100644 --- a/src/include/execution_common.hpp +++ b/src/include/execution_common.hpp @@ -20,6 +20,7 @@ enum class BufferType : uint8_t { }; enum class ChannelType : uint8_t { + NONE, SM, PROXY, }; diff --git a/test/execution-files/allreduce_packet.json b/test/execution-files/allreduce_packet.json index 3eda0ff3d..7045f21c2 100644 --- a/test/execution-files/allreduce_packet.json +++ b/test/execution-files/allreduce_packet.json @@ -1,330 +1,330 @@ { - "name": "allreduce_pairs", - "colletive": "allreduce", - "protocol": "LL", - "inplace": true, - "gpus": [ - { - "id": 0, - "inputChunks": 4, - "outputChunks": 0, - "scratchChunks": 8, - "threadblocks": [ - { - "id": 0, - "ops": [ - { - "name": "ppkt", - "o_buff": { - "src": "i", - "dst": "s" - }, - "o_cids": [ - { - "id": 0, - "off": 0 - } - ], - "src": 0, - "srcbuff": "i", - "srcoff": 2, - "ctype": "sm", - "cnt": 1 - }, - { - "name": "rspkt", - "o_buff": { - "src": "i", - "dst": "s" - }, - "o_cids": [ - { - "id": 0, - "off": 4 - } - ], - "src": 0, - "srcs": [ - { - "buff": "s", - "off": 2 - } - ], - "srcbuff": "i", - "srcoff": 0, - "dst": 0, - "dstbuff": "i", - "dstoff": 0, - "ctype": "none", - "cnt": 1 + "name": "allreduce_pairs", + "colletive": "allreduce", + "protocol": "LL", + "inplace": true, + "gpus": [ + { + "id": 0, + "inputChunks": 4, + "outputChunks": 0, + "scratchChunks": 8, + "threadblocks": [ + { + "id": 0, + "ops": [ + { + "name": "ppkt", + "o_buff": { + "src": "i", + "dst": "s" }, - { - "name": "cpkt", - "src": 0, - "srcbuff": "s", - "srcoff": 6, - "dst": 0, - "dstbuff": "i", - "dstoff": 2, - "ctype": "none", - "cnt": 1 - } - ], - "channels": [ - { + "o_cids": [ + { + "id": 0, + "off": 0 + } + ], + "src": 0, + "srcbuff": "i", + "srcoff": 2, + "ctype": "sm", + "cnt": 1 + }, + { + "name": "rspkt", + "o_buff": { "src": "i", - "dst": "s", - "ctype": "sm", - "cids": [ - 0 - ] - } - ] - }, - { - "id": 1, - "ops": [ - { - "name": "ppkt", - "o_buff": { - "src": "i", - "dst": "s" - }, - "o_cids": [ - { - "id": 0, - "off": 1 - } - ], - "src": 0, - "srcbuff": "i", - "srcoff": 3, - "ctype": "sm", - "cnt": 1 + "dst": "s" }, - { - "name": "rspkt", - "o_buff": { - "src": "i", - "dst": "s" - }, - "o_cids": [ - { - "id": 0, - "off": 5 - } - ], - "src": 0, - "srcs": [ - { - "buff": "s", - "off": 3 - } - ], - "srcbuff": "i", - "srcoff": 1, - "dst": 0, - "dstbuff": "i", - "dstoff": 1, - "ctype": "none", - "cnt": 1 + "o_cids": [ + { + "id": 0, + "off": 4 + } + ], + "src": 0, + "srcs": [ + { + "buff": "s", + "off": 2 + } + ], + "srcbuff": "i", + "srcoff": 0, + "dst": 0, + "dstbuff": "i", + "dstoff": 0, + "ctype": "sm", + "cnt": 1 + }, + { + "name": "cpkt", + "src": 0, + "srcbuff": "s", + "srcoff": 6, + "dst": 0, + "dstbuff": "i", + "dstoff": 2, + "ctype": "none", + "cnt": 1 + } + ], + "channels": [ + { + "src": "i", + "dst": "s", + "ctype": "sm", + "cids": [ + 0 + ] + } + ] + }, + { + "id": 1, + "ops": [ + { + "name": "ppkt", + "o_buff": { + "src": "i", + "dst": "s" }, - { - "name": "cpkt", - "src": 0, - "srcbuff": "s", - "srcoff": 7, - "dst": 0, - "dstbuff": "i", - "dstoff": 3, - "ctype": "none", - "cnt": 1 - } - ], - "channels": [ - { + "o_cids": [ + { + "id": 0, + "off": 1 + } + ], + "src": 0, + "srcbuff": "i", + "srcoff": 3, + "ctype": "sm", + "cnt": 1 + }, + { + "name": "rspkt", + "o_buff": { "src": "i", - "dst": "s", - "ctype": "sm", - "cids": [ - 1 - ] - } - ] - } - ], - "channels": [ - { - "srcbuff": "i", - "dstbuff": "s", - "type": "sm", - "connectedTo": [ - 1, - 1 - ] - } - ] - }, - { - "id": 1, - "inputChunks": 4, - "outputChunks": 0, - "scratchChunks": 6, - "threadblocks": [ - { - "id": 0, - "ops": [ - { - "name": "ppkt", - "o_buff": { - "src": "i", - "dst": "s" - }, - "o_cids": [ - { - "id": 0, - "off": 2 - } - ], - "src": 1, - "srcbuff": "i", - "srcoff": 0, - "ctype": "sm", - "cnt": 1 + "dst": "s" }, - { - "name": "rspkt", - "o_buff": { - "src": "i", - "dst": "s" - }, - "o_cids": [ - { - "id": 0, - "off": 6 - } - ], - "src": 1, - "srcs": [ - { - "buff": "s", - "off": 0 - } - ], - "srcbuff": "i", - "srcoff": 2, - "dst": 1, - "dstbuff": "i", - "dstoff": 2, - "ctype": "none", - "cnt": 1 + "o_cids": [ + { + "id": 0, + "off": 5 + } + ], + "src": 0, + "srcs": [ + { + "buff": "s", + "off": 3 + } + ], + "srcbuff": "i", + "srcoff": 1, + "dst": 0, + "dstbuff": "i", + "dstoff": 1, + "ctype": "sm", + "cnt": 1 + }, + { + "name": "cpkt", + "src": 0, + "srcbuff": "s", + "srcoff": 7, + "dst": 0, + "dstbuff": "i", + "dstoff": 3, + "ctype": "none", + "cnt": 1 + } + ], + "channels": [ + { + "src": "i", + "dst": "s", + "ctype": "sm", + "cids": [ + 1 + ] + } + ] + } + ], + "channels": [ + { + "srcbuff": "i", + "dstbuff": "s", + "type": "sm", + "connectedTo": [ + 1, + 1 + ] + } + ] + }, + { + "id": 1, + "inputChunks": 4, + "outputChunks": 0, + "scratchChunks": 6, + "threadblocks": [ + { + "id": 0, + "ops": [ + { + "name": "ppkt", + "o_buff": { + "src": "i", + "dst": "s" }, - { - "name": "cpkt", - "src": 1, - "srcbuff": "s", - "srcoff": 4, - "dst": 1, - "dstbuff": "i", - "dstoff": 0, - "ctype": "none", - "cnt": 1 - } - ], - "channels": [ - { + "o_cids": [ + { + "id": 0, + "off": 2 + } + ], + "src": 1, + "srcbuff": "i", + "srcoff": 0, + "ctype": "sm", + "cnt": 1 + }, + { + "name": "rspkt", + "o_buff": { "src": "i", - "dst": "s", - "ctype": "sm", - "cids": [ - 0 - ] - } - ] - }, - { - "id": 1, - "ops": [ - { - "name": "ppkt", - "o_buff": { - "src": "i", - "dst": "s" - }, - "o_cids": [ - { - "id": 0, - "off": 3 - } - ], - "src": 1, - "srcbuff": "i", - "srcoff": 1, - "ctype": "sm", - "cnt": 1 + "dst": "s" }, - { - "name": "rspkt", - "o_buff": { - "src": "i", - "dst": "s" - }, - "o_cids": [ - { - "id": 0, - "off": 7 - } - ], - "src": 1, - "srcs": [ - { - "buff": "s", - "off": 1 - } - ], - "srcbuff": "i", - "srcoff": 3, - "dst": 1, - "dstbuff": "i", - "dstoff": 3, - "ctype": "none", - "cnt": 1 + "o_cids": [ + { + "id": 0, + "off": 6 + } + ], + "src": 1, + "srcs": [ + { + "buff": "s", + "off": 0 + } + ], + "srcbuff": "i", + "srcoff": 2, + "dst": 1, + "dstbuff": "i", + "dstoff": 2, + "ctype": "sm", + "cnt": 1 + }, + { + "name": "cpkt", + "src": 1, + "srcbuff": "s", + "srcoff": 4, + "dst": 1, + "dstbuff": "i", + "dstoff": 0, + "ctype": "none", + "cnt": 1 + } + ], + "channels": [ + { + "src": "i", + "dst": "s", + "ctype": "sm", + "cids": [ + 0 + ] + } + ] + }, + { + "id": 1, + "ops": [ + { + "name": "ppkt", + "o_buff": { + "src": "i", + "dst": "s" }, - { - "name": "cpkt", - "src": 1, - "srcbuff": "s", - "srcoff": 5, - "dst": 1, - "dstbuff": "i", - "dstoff": 1, - "ctype": "none", - "cnt": 1 - } - ], - "channels": [ - { + "o_cids": [ + { + "id": 0, + "off": 3 + } + ], + "src": 1, + "srcbuff": "i", + "srcoff": 1, + "ctype": "sm", + "cnt": 1 + }, + { + "name": "rspkt", + "o_buff": { "src": "i", - "dst": "s", - "ctype": "sm", - "cids": [ - 1 - ] - } - ] - } - ], - "channels": [ - { - "srcbuff": "i", - "dstbuff": "s", - "type": "sm", - "connectedTo": [ - 0, - 0 - ] - } - ] - } - ] - } + "dst": "s" + }, + "o_cids": [ + { + "id": 0, + "off": 7 + } + ], + "src": 1, + "srcs": [ + { + "buff": "s", + "off": 1 + } + ], + "srcbuff": "i", + "srcoff": 3, + "dst": 1, + "dstbuff": "i", + "dstoff": 3, + "ctype": "sm", + "cnt": 1 + }, + { + "name": "cpkt", + "src": 1, + "srcbuff": "s", + "srcoff": 5, + "dst": 1, + "dstbuff": "i", + "dstoff": 1, + "ctype": "none", + "cnt": 1 + } + ], + "channels": [ + { + "src": "i", + "dst": "s", + "ctype": "sm", + "cids": [ + 1 + ] + } + ] + } + ], + "channels": [ + { + "srcbuff": "i", + "dstbuff": "s", + "type": "sm", + "connectedTo": [ + 0, + 0 + ] + } + ] + } + ] +} From b03be9ab3591631d8d0ac4429cf81cea1381af69 Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Sun, 7 Apr 2024 14:42:21 +0000 Subject: [PATCH 36/51] WIP --- src/executor/execution_plan.cc | 1 + src/include/execution_common.hpp | 5 ++++- src/include/execution_kernel.hpp | 38 +++++++++++++++++--------------- test/executor_test.cc | 2 +- 4 files changed, 26 insertions(+), 20 deletions(-) diff --git a/src/executor/execution_plan.cc b/src/executor/execution_plan.cc index 29bd28e07..55123d6b0 100644 --- a/src/executor/execution_plan.cc +++ b/src/executor/execution_plan.cc @@ -235,6 +235,7 @@ void ExecutionPlan::Impl::setupOperations(const json& gpus) { // will have either srcs or i_cids if (op.contains("srcs")) { operation.nInputs = op["srcs"].size(); + operation.inputBufferType = convertToBufferType(op["srcs"][0]["buff"]); } for (int i = 0; i < operation.nInputs; i++) { operation.inputOffsets[i] = this->chunkSize * (int)op["srcs"][i]["off"]; diff --git a/src/include/execution_common.hpp b/src/include/execution_common.hpp index 2d03feb61..685317268 100644 --- a/src/include/execution_common.hpp +++ b/src/include/execution_common.hpp @@ -55,7 +55,10 @@ struct Operation { BufferType dstBufferType; uint8_t nInputs; uint8_t nOutputs; - uint8_t inputChannelIndexes[MAX_CHANNEL_PER_OPERATION]; + union { + uint8_t inputChannelIndexes[MAX_CHANNEL_PER_OPERATION]; + BufferType inputBufferType; + }; uint8_t outputChannelIndexes[MAX_CHANNEL_PER_OPERATION]; uint32_t inputOffsets[MAX_CHANNEL_PER_OPERATION]; uint32_t outputOffsets[MAX_CHANNEL_PER_OPERATION]; diff --git a/src/include/execution_kernel.hpp b/src/include/execution_kernel.hpp index e2ceaf224..a026905aa 100644 --- a/src/include/execution_kernel.hpp +++ b/src/include/execution_kernel.hpp @@ -211,36 +211,36 @@ MSCCLPP_DEVICE_INLINE void handlePutPacket(uint32_t inputOffsetByBytes, DeviceHa uint8_t* dstChannelIndexes, uint32_t* dstOffsets, int nDstChannels, uint32_t size, uint32_t flag) { for (int index = 0; index < nDstChannels; ++index) { - smChannels[dstChannelIndexes[index]].putPackets( - dstOffsets[index] * sizeof(PacketType), inputOffsetByBytes, size, threadIdx.x, blockDim.x, flag); + smChannels[dstChannelIndexes[index]].putPackets(dstOffsets[index] * 2, inputOffsetByBytes, size, + threadIdx.x, blockDim.x, flag); } } template -MSCCLPP_DEVICE_INLINE void handleReduceSendPacket(T* output, uint32_t outputOffsetByBytes, T* input, - uint32_t inputOffsetByBytes, DeviceHandle* smChannels, - uint8_t* dstChannelIndexes, uint32_t* dstOffsets, - uint32_t* srcOffsets, int nDstChannels, int nSrcs, size_t size, +MSCCLPP_DEVICE_INLINE void handleReduceSendPacket(T* dst, uint32_t dstOffsetByBytes, T* src, uint32_t srcOffsetByBytes, + T* inputBuff, uint32_t* inputOffsets, int nSrcs, + DeviceHandle* smChannels, uint8_t* outputChannelIndexes, + uint32_t* outputOffsets, int nDstChannels, size_t size, uint32_t flag) { size_t nPackets = size * 2 / sizeof(PacketType); - const uint32_t srcOffset = inputOffsetByBytes / sizeof(PacketValType); - const uint32_t dstOffset = outputOffsetByBytes / sizeof(PacketValType); - PacketValType* src = (PacketValType*)input + srcOffset; - PacketValType* dst = (PacketValType*)output + dstOffset; + const uint32_t srcOffset = dstOffsetByBytes / sizeof(PacketValType); + const uint32_t dstOffset = dstOffsetByBytes / sizeof(PacketValType); + PacketValType* srcPacketValue = (PacketValType*)src + srcOffset; + PacketValType* dstPacketValue = (PacketValType*)dst + dstOffset; for (size_t idx = threadIdx.x; idx < nPackets; idx += blockDim.x) { PacketValType data = {}; for (int index = 0; index < nSrcs; ++index) { - PacketType* pkt = (PacketType*)((char*)input + 2 * srcOffsets[index]); + PacketType* pkt = (PacketType*)((char*)inputBuff + 2 * inputOffsets[index]); PacketValType val = pkt[idx].read(flag); data = add_vectors(data, val); } - data = add_vectors(data, src[idx]); - dst[idx] = data; + data = add_vectors(data, srcPacketValue[idx]); + dstPacketValue[idx] = data; PacketType pkt(data, flag); for (int index = 0; index < nDstChannels; ++index) { - size_t offset = (dstOffsets[index] * 2) / sizeof(PacketType); - smChannels[dstChannelIndexes[index]].write(offset + idx, pkt); + size_t offset = (outputOffsets[index] * 2) / sizeof(PacketType); + smChannels[outputChannelIndexes[index]].write(offset + idx, pkt); } } } @@ -277,6 +277,7 @@ __global__ void executionKernel([[maybe_unused]] int rank /*for debug*/, T* inpu DeviceHandle* proxyChannels = localPlan->channels.proxyChannels; T* src = nullptr; T* dst = nullptr; + T* tmp = nullptr; for (int i = 0; i < localPlan->nOperations; i++) { switch (operations[i].type) { case OperationType::BARRIER: @@ -317,10 +318,11 @@ __global__ void executionKernel([[maybe_unused]] int rank /*for debug*/, T* inpu case OperationType::REDUCE_SEND_PACKET: dst = getBuffer(input, output, scratch, operations[i].dstBufferType); src = getBuffer(input, output, scratch, operations[i].srcBufferType); - handleReduceSendPacket(dst, operations[i].dstOffset, src, operations[i].srcOffset, smChannels, + tmp = getBuffer(input, output, scratch, operations[i].inputBufferType); + handleReduceSendPacket(dst, operations[i].dstOffset, src, operations[i].srcOffset, tmp, + operations[i].inputOffsets, operations[i].nInputs, smChannels, operations[i].outputChannelIndexes, operations[i].outputOffsets, - operations[i].inputOffsets, operations[i].nOutputs, operations[i].nInputs, - operations[i].size, flag); + operations[i].nOutputs, operations[i].size, flag); break; case OperationType::COPY_PACKET: dst = getBuffer(input, output, scratch, operations[i].dstBufferType); diff --git a/test/executor_test.cc b/test/executor_test.cc index c708a97d3..4a8822a99 100644 --- a/test/executor_test.cc +++ b/test/executor_test.cc @@ -29,7 +29,7 @@ int main() { MPI_Bcast(&id, sizeof(id), MPI_BYTE, 0, MPI_COMM_WORLD); bootstrap->initialize(id); // sleep 20s - std::this_thread::sleep_for(std::chrono::seconds(20)); + // std::this_thread::sleep_for(std::chrono::seconds(20)); auto comm = std::make_shared(bootstrap); CUDACHECK(cudaSetDevice(rank)); From 695ff9449a5de7b303bd6bfb1d1890daa2dd85d4 Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Mon, 8 Apr 2024 03:29:39 +0000 Subject: [PATCH 37/51] update --- include/mscclpp/executor.hpp | 2 +- python/mscclpp/executor.cpp | 3 ++- python/test/executor_test.py | 38 ++++++++++++++++++++++++++- src/executor/execution_plan.cc | 48 ++++++++++++++++++---------------- src/executor/executor.cc | 10 +++---- src/include/execution_plan.hpp | 6 ++--- test/executor_test.cc | 2 +- 7 files changed, 74 insertions(+), 35 deletions(-) diff --git a/include/mscclpp/executor.hpp b/include/mscclpp/executor.hpp index 60a68fbb2..f54c80585 100644 --- a/include/mscclpp/executor.hpp +++ b/include/mscclpp/executor.hpp @@ -24,7 +24,7 @@ enum class PacketType { class ExecutionPlan { public: - ExecutionPlan(std::string planPath); + ExecutionPlan(const std::string name, const std::string planPath); ~ExecutionPlan() = default; private: diff --git a/python/mscclpp/executor.cpp b/python/mscclpp/executor.cpp index f57a4294b..9b411ba8a 100644 --- a/python/mscclpp/executor.cpp +++ b/python/mscclpp/executor.cpp @@ -20,7 +20,8 @@ void register_executor(nb::module_& m) { nb::enum_(m, "PacketType").value("LL8", PacketType::LL8).value("LL16", PacketType::LL16); - nb::class_(m, "ExecutionPlan").def(nb::init(), nb::arg("planPath")); + nb::class_(m, "ExecutionPlan") + .def(nb::init(), nb::arg("name"), nb::arg("planPath")); nb::class_(m, "Executor") .def(nb::init, int>(), nb::arg("comm"), nb::arg("nranksPerNode")) diff --git a/python/test/executor_test.py b/python/test/executor_test.py index a777c3546..3cef5a318 100644 --- a/python/test/executor_test.py +++ b/python/test/executor_test.py @@ -11,6 +11,29 @@ MSCCLPP_ROOT_PATH = "/root/mscclpp" +def bench_time(niters: int, func): + # capture cuda graph for niters of the kernel launch + stream = cp.cuda.Stream(non_blocking=True) + with stream: + stream.begin_capture() + for i in range(niters): + func(stream) + graph = stream.end_capture() + + # now run a warm up round + graph.launch(stream) + + # now run the benchmark and measure time + start = cp.cuda.Event() + end = cp.cuda.Event() + + start.record(stream) + graph.launch(stream) + end.record(stream) + end.synchronize() + + return cp.cuda.get_elapsed_time(start, end) / niters * 1000.0 + if __name__ == "__main__": shm_comm = MPI.COMM_WORLD.Split_type(MPI.COMM_TYPE_SHARED, 0, MPI.INFO_NULL) N_GPUS_PER_NODE = shm_comm.size @@ -19,7 +42,7 @@ cp.cuda.Device(MPI.COMM_WORLD.rank % N_GPUS_PER_NODE).use() mscclpp_group = mscclpp_comm.CommGroup(MPI.COMM_WORLD) executor = Executor(mscclpp_group.communicator, N_GPUS_PER_NODE) - execution_plan = ExecutionPlan(path.join(MSCCLPP_ROOT_PATH, "test", "execution-files", "allreduce.json")) + execution_plan = ExecutionPlan("allreduce_pairs", path.join(MSCCLPP_ROOT_PATH, "test", "execution-files", "allreduce.json")) nelems = 1024 * 1024 cp.random.seed(42) @@ -45,5 +68,18 @@ ) stream.synchronize() assert cp.allclose(sendbuf, expected, atol=1e-3 * MPI.COMM_WORLD.size) + + execution_time = bench_time(1000, lambda stream: executor.execute( + MPI.COMM_WORLD.rank, + sendbuf.data.ptr, + sendbuf.data.ptr, + sendbuf.nbytes, + sendbuf.nbytes, + DataType.float16, + 512, + execution_plan, + stream.ptr, + )) + print(f"Execution time: {execution_time} us, data size: {sendbuf.nbytes} bytes") executor = None mscclpp_group = None diff --git a/src/executor/execution_plan.cc b/src/executor/execution_plan.cc index 55123d6b0..fb3f3a027 100644 --- a/src/executor/execution_plan.cc +++ b/src/executor/execution_plan.cc @@ -77,7 +77,8 @@ auto convertToChannelType = [](const std::string& str) { namespace mscclpp { using json = nlohmann::json; -ExecutionPlan::Impl::Impl(std::string planPath) : planPath(planPath), isUsingPacket(false) {} +ExecutionPlan::Impl::Impl(const std::string name, const std::string planPath) + : name(name), planPath(planPath), isUsingPacket(false) {} std::vector ExecutionPlan::Impl::getChannelInfos(int rank, ChannelType channelType) const { auto pred = [channelType](const ChannelInfo& info) { return info.channelType == channelType; }; @@ -121,7 +122,9 @@ int ExecutionPlan::Impl::getThreadblockCount(int rank) const { return this->oper void ExecutionPlan::Impl::loadExecutionPlan(size_t inputSize) { std::ifstream file(this->planPath); json obj = json::parse(file); - this->name = obj["name"]; + if (this->name != obj["name"]) { + throw std::runtime_error("Plan name does not match"); + } std::string protocol = obj["protocol"]; if (protocol == "LL") { this->isUsingPacket = true; @@ -221,31 +224,31 @@ void ExecutionPlan::Impl::setupOperations(const json& gpus) { } if (op.contains("i_cids")) { operation.nInputs = op["i_cids"].size(); - } - if (op.contains("o_cids")) { - operation.nOutputs = op["o_cids"].size(); - } - for (int i = 0; i < operation.nInputs; i++) { - BufferType srcBufferType = convertToBufferType(op["i_buff"]["src"]); - BufferType dstBufferType = convertToBufferType(op["i_buff"]["dst"]); - operation.inputChannelIndexes[i] = - channelIndexes[{srcBufferType, dstBufferType, operation.channelType}][op["i_cids"][i]["id"]]; - operation.inputOffsets[i] = this->chunkSize * (int)op["i_cids"][i]["off"]; + for (int i = 0; i < operation.nInputs; i++) { + BufferType srcBufferType = convertToBufferType(op["i_buff"]["src"]); + BufferType dstBufferType = convertToBufferType(op["i_buff"]["dst"]); + operation.inputChannelIndexes[i] = + channelIndexes[{srcBufferType, dstBufferType, operation.channelType}][op["i_cids"][i]["id"]]; + operation.inputOffsets[i] = this->chunkSize * (int)op["i_cids"][i]["off"]; + } } // will have either srcs or i_cids if (op.contains("srcs")) { operation.nInputs = op["srcs"].size(); operation.inputBufferType = convertToBufferType(op["srcs"][0]["buff"]); + for (int i = 0; i < operation.nInputs; i++) { + operation.inputOffsets[i] = this->chunkSize * (int)op["srcs"][i]["off"]; + } } - for (int i = 0; i < operation.nInputs; i++) { - operation.inputOffsets[i] = this->chunkSize * (int)op["srcs"][i]["off"]; - } - for (int i = 0; i < operation.nOutputs; i++) { - BufferType srcBufferType = convertToBufferType(op["o_buff"]["src"]); - BufferType dstBufferType = convertToBufferType(op["o_buff"]["dst"]); - operation.outputChannelIndexes[i] = - channelIndexes[{srcBufferType, dstBufferType, operation.channelType}][op["o_cids"][i]["id"]]; - operation.outputOffsets[i] = this->chunkSize * (int)op["o_cids"][i]["off"]; + if (op.contains("o_cids")) { + operation.nOutputs = op["o_cids"].size(); + for (int i = 0; i < operation.nOutputs; i++) { + BufferType srcBufferType = convertToBufferType(op["o_buff"]["src"]); + BufferType dstBufferType = convertToBufferType(op["o_buff"]["dst"]); + operation.outputChannelIndexes[i] = + channelIndexes[{srcBufferType, dstBufferType, operation.channelType}][op["o_cids"][i]["id"]]; + operation.outputOffsets[i] = this->chunkSize * (int)op["o_cids"][i]["off"]; + } } if (op.contains("srcbuff")) { operation.srcBufferType = convertToBufferType(op["srcbuff"]); @@ -269,6 +272,7 @@ void ExecutionPlan::Impl::setupOperations(const json& gpus) { } } -ExecutionPlan::ExecutionPlan(std::string planPath) : impl_(std::make_shared(planPath)) {} +ExecutionPlan::ExecutionPlan(const std::string name, const std::string planPath) + : impl_(std::make_shared(name, planPath)) {} } // namespace mscclpp diff --git a/src/executor/executor.cc b/src/executor/executor.cc index d775cd593..62bf33806 100644 --- a/src/executor/executor.cc +++ b/src/executor/executor.cc @@ -78,7 +78,7 @@ struct Executor::Impl { ~Impl() = default; ExecutionContext setupExecutionContext(int rank, void* sendbuff, void* recvbuff, size_t sendBufferSize, - size_t recvBufferSize, const ExecutionPlan& plan, cudaStream_t stream) { + size_t recvBufferSize, const ExecutionPlan& plan) { ExecutionContextKey key = {sendbuff, recvbuff, sendBufferSize, recvBufferSize, plan.impl_->name}; if (this->contexts.find(key) != this->contexts.end()) { return this->contexts[key]; @@ -96,10 +96,8 @@ struct Executor::Impl { this->setupDeviceExecutionPlan(context, rank, plan); context.deviceExecutionPlansBuffer = allocExtSharedCuda(context.deviceExecutionPlans.size() * sizeof(DeviceExecutionPlan)); - MSCCLPP_CUDATHROW(cudaMemcpyAsync(context.deviceExecutionPlansBuffer.get(), context.deviceExecutionPlans.data(), - context.deviceExecutionPlans.size() * sizeof(DeviceExecutionPlan), - cudaMemcpyHostToDevice, stream)); - MSCCLPP_CUDATHROW(cudaStreamSynchronize(stream)); + memcpyCuda(context.deviceExecutionPlansBuffer.get(), (char*)context.deviceExecutionPlans.data(), + context.deviceExecutionPlans.size() * sizeof(DeviceExecutionPlan), cudaMemcpyHostToDevice); this->contexts.insert({key, context}); return context; } @@ -286,7 +284,7 @@ void Executor::execute(int rank, void* sendbuff, void* recvBuff, size_t sendBuff DataType dataType, int nthreads, const ExecutionPlan& plan, cudaStream_t stream, PacketType packetType) { ExecutionContext context = - this->impl_->setupExecutionContext(rank, sendbuff, recvBuff, sendBuffSize, recvBuffSize, plan, stream); + this->impl_->setupExecutionContext(rank, sendbuff, recvBuff, sendBuffSize, recvBuffSize, plan); this->impl_->launchKernel(context, rank, nthreads, sendbuff, recvBuff, dataType, stream, packetType); } diff --git a/src/include/execution_plan.hpp b/src/include/execution_plan.hpp index 8c0029f0a..450bb4f55 100644 --- a/src/include/execution_plan.hpp +++ b/src/include/execution_plan.hpp @@ -46,7 +46,7 @@ struct ChannelInfo { struct ExecutionPlan::Impl { public: - Impl(std::string planPath); + Impl(const std::string name, const std::string planPath); ~Impl() = default; std::vector getChannelInfos(int rank, ChannelType channelType) const; @@ -61,7 +61,8 @@ struct ExecutionPlan::Impl { void setupChannels(const nlohmann::json& gpus); void setupOperations(const nlohmann::json& gpus); - std::string planPath; + const std::string name; + const std::string planPath; bool isUsingPacket; // operations for [rank][threadblock] = [operations] std::unordered_map>> operations; @@ -69,7 +70,6 @@ struct ExecutionPlan::Impl { // threadblockChannelMap[rank][threadblock] = [channelIndex] std::unordered_map>>> threadblockSMChannelMap; std::unordered_map>>> threadblockProxyChannelMap; - std::string name; std::unordered_map inputChunks; std::unordered_map outputChunks; std::unordered_map scratchChunks; diff --git a/test/executor_test.cc b/test/executor_test.cc index 4a8822a99..213b3cdb6 100644 --- a/test/executor_test.cc +++ b/test/executor_test.cc @@ -34,7 +34,7 @@ int main() { CUDACHECK(cudaSetDevice(rank)); std::shared_ptr executor = std::make_shared(comm, 8 /*nranksPerNode*/); - mscclpp::ExecutionPlan plan(MSCCLPP_ROOT_PATH + "/test/execution-files/allreduce_packet.json"); + mscclpp::ExecutionPlan plan("allreduce_pairs", MSCCLPP_ROOT_PATH + "/test/execution-files/allreduce.json"); const int bufferSize = 1024 * 1024; std::shared_ptr sendbuff = mscclpp::allocExtSharedCuda(bufferSize); mscclpp::CudaStreamWithFlags stream(cudaStreamNonBlocking); From 5f37c0aa94d86021af3ea311ad3c385c77c681f8 Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Mon, 8 Apr 2024 03:31:44 +0000 Subject: [PATCH 38/51] lint --- python/test/executor_test.py | 31 +++++++++++++++++++------------ 1 file changed, 19 insertions(+), 12 deletions(-) diff --git a/python/test/executor_test.py b/python/test/executor_test.py index 3cef5a318..395d089b8 100644 --- a/python/test/executor_test.py +++ b/python/test/executor_test.py @@ -11,6 +11,7 @@ MSCCLPP_ROOT_PATH = "/root/mscclpp" + def bench_time(niters: int, func): # capture cuda graph for niters of the kernel launch stream = cp.cuda.Stream(non_blocking=True) @@ -34,6 +35,7 @@ def bench_time(niters: int, func): return cp.cuda.get_elapsed_time(start, end) / niters * 1000.0 + if __name__ == "__main__": shm_comm = MPI.COMM_WORLD.Split_type(MPI.COMM_TYPE_SHARED, 0, MPI.INFO_NULL) N_GPUS_PER_NODE = shm_comm.size @@ -42,7 +44,9 @@ def bench_time(niters: int, func): cp.cuda.Device(MPI.COMM_WORLD.rank % N_GPUS_PER_NODE).use() mscclpp_group = mscclpp_comm.CommGroup(MPI.COMM_WORLD) executor = Executor(mscclpp_group.communicator, N_GPUS_PER_NODE) - execution_plan = ExecutionPlan("allreduce_pairs", path.join(MSCCLPP_ROOT_PATH, "test", "execution-files", "allreduce.json")) + execution_plan = ExecutionPlan( + "allreduce_pairs", path.join(MSCCLPP_ROOT_PATH, "test", "execution-files", "allreduce.json") + ) nelems = 1024 * 1024 cp.random.seed(42) @@ -69,17 +73,20 @@ def bench_time(niters: int, func): stream.synchronize() assert cp.allclose(sendbuf, expected, atol=1e-3 * MPI.COMM_WORLD.size) - execution_time = bench_time(1000, lambda stream: executor.execute( - MPI.COMM_WORLD.rank, - sendbuf.data.ptr, - sendbuf.data.ptr, - sendbuf.nbytes, - sendbuf.nbytes, - DataType.float16, - 512, - execution_plan, - stream.ptr, - )) + execution_time = bench_time( + 1000, + lambda stream: executor.execute( + MPI.COMM_WORLD.rank, + sendbuf.data.ptr, + sendbuf.data.ptr, + sendbuf.nbytes, + sendbuf.nbytes, + DataType.float16, + 512, + execution_plan, + stream.ptr, + ), + ) print(f"Execution time: {execution_time} us, data size: {sendbuf.nbytes} bytes") executor = None mscclpp_group = None From 78c56650cef1d67077589c4305b4aca48709c787 Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Mon, 8 Apr 2024 03:45:53 +0000 Subject: [PATCH 39/51] fix --- src/include/execution_kernel.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/include/execution_kernel.hpp b/src/include/execution_kernel.hpp index a026905aa..ae0c40e26 100644 --- a/src/include/execution_kernel.hpp +++ b/src/include/execution_kernel.hpp @@ -223,7 +223,7 @@ MSCCLPP_DEVICE_INLINE void handleReduceSendPacket(T* dst, uint32_t dstOffsetByBy uint32_t* outputOffsets, int nDstChannels, size_t size, uint32_t flag) { size_t nPackets = size * 2 / sizeof(PacketType); - const uint32_t srcOffset = dstOffsetByBytes / sizeof(PacketValType); + const uint32_t srcOffset = srcOffsetByBytes / sizeof(PacketValType); const uint32_t dstOffset = dstOffsetByBytes / sizeof(PacketValType); PacketValType* srcPacketValue = (PacketValType*)src + srcOffset; PacketValType* dstPacketValue = (PacketValType*)dst + dstOffset; From 309f8f2db636c6920b40aeeaa94e8e3b6b45ef9a Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Tue, 9 Apr 2024 07:15:47 +0000 Subject: [PATCH 40/51] add test --- python/test/executor_test.py | 20 +----------------- python/test/test_mscclpp.py | 41 +++++++++++++++++++++++++++++++++++- 2 files changed, 41 insertions(+), 20 deletions(-) diff --git a/python/test/executor_test.py b/python/test/executor_test.py index 395d089b8..c4cd0a87c 100644 --- a/python/test/executor_test.py +++ b/python/test/executor_test.py @@ -53,25 +53,7 @@ def bench_time(niters: int, func): buffer = cp.random.random(nelems).astype(cp.float16) sub_arrays = cp.split(buffer, MPI.COMM_WORLD.size) sendbuf = sub_arrays[MPI.COMM_WORLD.rank] - - expected = cp.zeros_like(sendbuf) - for i in range(MPI.COMM_WORLD.size): - expected += sub_arrays[i] - - stream = cp.cuda.Stream(non_blocking=True) - executor.execute( - MPI.COMM_WORLD.rank, - sendbuf.data.ptr, - sendbuf.data.ptr, - sendbuf.nbytes, - sendbuf.nbytes, - DataType.float16, - 512, - execution_plan, - stream.ptr, - ) - stream.synchronize() - assert cp.allclose(sendbuf, expected, atol=1e-3 * MPI.COMM_WORLD.size) + mscclpp_group.barrier() execution_time = bench_time( 1000, diff --git a/python/test/test_mscclpp.py b/python/test/test_mscclpp.py index 4b3cb6ebf..f007718e3 100644 --- a/python/test/test_mscclpp.py +++ b/python/test/test_mscclpp.py @@ -12,7 +12,10 @@ import pytest from mscclpp import ( + DataType, EndpointConfig, + ExecutionPlan, + Executor, Fifo, Host2DeviceSemaphore, Host2HostSemaphore, @@ -25,7 +28,7 @@ import mscclpp.comm as mscclpp_comm from mscclpp.utils import KernelBuilder, pack from ._cpp import _ext -from .mscclpp_mpi import MpiGroup, parametrize_mpi_groups, mpi_group +from .mscclpp_mpi import MpiGroup, parametrize_mpi_groups, mpi_group, N_GPUS_PER_NODE ethernet_interface_name = "eth0" @@ -590,3 +593,39 @@ def test_nvls(mpi_group: MpiGroup): kernel() cp.cuda.runtime.deviceSynchronize() group.barrier() + + +@parametrize_mpi_groups(2) +@pytest.mark.parametrize("filename", ["allreduce.json", "allreduce_packet.json"]) +def test_executor(mpi_group: MpiGroup, filename: str): + if all_ranks_on_the_same_node(mpi_group) is False: + pytest.skip("algo not support cross node") + project_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + mscclpp_group = mscclpp_comm.CommGroup(mpi_group.comm) + executor = Executor(mscclpp_group.communicator, N_GPUS_PER_NODE) + execution_plan = ExecutionPlan("allreduce_pairs", os.path.join(project_dir, "test", "execution-files", filename)) + + nelems = 1024 * 1024 + cp.random.seed(42) + buffer = cp.random.random(nelems).astype(cp.float16) + sub_arrays = cp.split(buffer, mpi_group.comm.size) + sendbuf = sub_arrays[mpi_group.comm.rank] + expected = cp.zeros_like(sendbuf) + for i in range(mpi_group.comm.size): + expected += sub_arrays[i] + mscclpp_group.barrier() + + stream = cp.cuda.Stream(non_blocking=True) + executor.execute( + mpi_group.comm.rank, + sendbuf.data.ptr, + sendbuf.data.ptr, + sendbuf.nbytes, + sendbuf.nbytes, + DataType.float16, + 512, + execution_plan, + stream.ptr, + ) + stream.synchronize() + assert cp.allclose(sendbuf, expected, atol=1e-3 * mpi_group.comm.size) From 80a513e75ad9dfd61888dcbed2103160fb7c9842 Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Tue, 9 Apr 2024 08:03:39 +0000 Subject: [PATCH 41/51] minor update --- python/test/executor_test.py | 3 +++ src/executor/execution_plan.cc | 11 +++++------ src/executor/executor.cc | 3 ++- src/include/execution_plan.hpp | 4 ++-- 4 files changed, 12 insertions(+), 9 deletions(-) diff --git a/python/test/executor_test.py b/python/test/executor_test.py index c4cd0a87c..50b296b2d 100644 --- a/python/test/executor_test.py +++ b/python/test/executor_test.py @@ -1,3 +1,6 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + from os import path from mscclpp import ( DataType, diff --git a/src/executor/execution_plan.cc b/src/executor/execution_plan.cc index fb3f3a027..5948d029d 100644 --- a/src/executor/execution_plan.cc +++ b/src/executor/execution_plan.cc @@ -119,7 +119,7 @@ std::vector ExecutionPlan::Impl::getOperations(int rank, int threadbl int ExecutionPlan::Impl::getThreadblockCount(int rank) const { return this->operations.at(rank).size(); } -void ExecutionPlan::Impl::loadExecutionPlan(size_t inputSize) { +void ExecutionPlan::Impl::loadExecutionPlan(int rank, size_t inputSize) { std::ifstream file(this->planPath); json obj = json::parse(file); if (this->name != obj["name"]) { @@ -139,14 +139,12 @@ void ExecutionPlan::Impl::loadExecutionPlan(size_t inputSize) { } this->setupChannels(gpus); - uint32_t maxInputChunks = 0; - for (const auto& [rank, chunks] : this->inputChunks) { - maxInputChunks = std::max(maxInputChunks, chunks); - } - this->chunkSize = inputSize / maxInputChunks; + this->chunkSize = inputSize / this->inputChunks[rank]; this->setupOperations(gpus); } +// Construct the channel info. Step 1. Flatten SM and PROXY channels into separate vectors. +// Step 2. For each threadblock, construct a vector of channel indexes and keys. void ExecutionPlan::Impl::setupChannels(const json& gpus) { for (const auto& gpu : gpus) { int rank = gpu["id"]; @@ -227,6 +225,7 @@ void ExecutionPlan::Impl::setupOperations(const json& gpus) { for (int i = 0; i < operation.nInputs; i++) { BufferType srcBufferType = convertToBufferType(op["i_buff"]["src"]); BufferType dstBufferType = convertToBufferType(op["i_buff"]["dst"]); + // Get the relevant channel index in rank channelInfos operation.inputChannelIndexes[i] = channelIndexes[{srcBufferType, dstBufferType, operation.channelType}][op["i_cids"][i]["id"]]; operation.inputOffsets[i] = this->chunkSize * (int)op["i_cids"][i]["off"]; diff --git a/src/executor/executor.cc b/src/executor/executor.cc index 62bf33806..d4112f99b 100644 --- a/src/executor/executor.cc +++ b/src/executor/executor.cc @@ -83,7 +83,7 @@ struct Executor::Impl { if (this->contexts.find(key) != this->contexts.end()) { return this->contexts[key]; } - plan.impl_->loadExecutionPlan(sendBufferSize); + plan.impl_->loadExecutionPlan(rank, sendBufferSize); ExecutionContext context; size_t scratchBufferSize = plan.impl_->getScratchBufferSize(rank, sendBufferSize); @@ -285,6 +285,7 @@ void Executor::execute(int rank, void* sendbuff, void* recvBuff, size_t sendBuff PacketType packetType) { ExecutionContext context = this->impl_->setupExecutionContext(rank, sendbuff, recvBuff, sendBuffSize, recvBuffSize, plan); + // TODO(binyli): need to flush proxy channel here this->impl_->proxyService->startProxy(); this->impl_->launchKernel(context, rank, nthreads, sendbuff, recvBuff, dataType, stream, packetType); } diff --git a/src/include/execution_plan.hpp b/src/include/execution_plan.hpp index 450bb4f55..924d1358c 100644 --- a/src/include/execution_plan.hpp +++ b/src/include/execution_plan.hpp @@ -57,7 +57,7 @@ struct ExecutionPlan::Impl { std::vector getOperations(int rank, int threadblock) const; int getThreadblockCount(int rank) const; - void loadExecutionPlan(size_t inputSize); + void loadExecutionPlan(int rank, size_t inputSize); void setupChannels(const nlohmann::json& gpus); void setupOperations(const nlohmann::json& gpus); @@ -67,7 +67,7 @@ struct ExecutionPlan::Impl { // operations for [rank][threadblock] = [operations] std::unordered_map>> operations; std::unordered_map> channelInfos; - // threadblockChannelMap[rank][threadblock] = [channelIndex] + // threadblockChannelMap[rank][threadblock] = [channelIndex, channelKey] std::unordered_map>>> threadblockSMChannelMap; std::unordered_map>>> threadblockProxyChannelMap; std::unordered_map inputChunks; From ab8d6d74f56d539479b16ea46f75ffbff1afeaf8 Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Wed, 17 Apr 2024 05:57:23 +0000 Subject: [PATCH 42/51] minor improve --- src/include/execution_kernel.hpp | 62 +++++++++++++++----------------- 1 file changed, 29 insertions(+), 33 deletions(-) diff --git a/src/include/execution_kernel.hpp b/src/include/execution_kernel.hpp index ae0c40e26..5cf63af83 100644 --- a/src/include/execution_kernel.hpp +++ b/src/include/execution_kernel.hpp @@ -272,63 +272,59 @@ __global__ void executionKernel([[maybe_unused]] int rank /*for debug*/, T* inpu #else // !defined(MSCCLPP_DEVICE_HIP) __syncthreads(); #endif // !defined(MSCCLPP_DEVICE_HIP) + localPlan = (DeviceExecutionPlan*)sharedMem; + int nOperations = localPlan->nOperations; Operation* operations = localPlan->operations; DeviceHandle* smChannels = localPlan->channels.smChannels; DeviceHandle* proxyChannels = localPlan->channels.proxyChannels; T* src = nullptr; T* dst = nullptr; T* tmp = nullptr; - for (int i = 0; i < localPlan->nOperations; i++) { - switch (operations[i].type) { + for (int i = 0; i < nOperations; i++) { + Operation* op = &operations[i]; + switch (op->type) { case OperationType::BARRIER: __syncthreads(); break; case OperationType::SIGNAL: - handleSignal(tid, smChannels, proxyChannels, operations[i].outputChannelIndexes, operations[i].nOutputs, - operations[i].channelType); + handleSignal(tid, smChannels, proxyChannels, op->outputChannelIndexes, op->nOutputs, op->channelType); break; case OperationType::WAIT: - handleWait(tid, smChannels, proxyChannels, operations[i].inputChannelIndexes, operations[i].nInputs, - operations[i].channelType); + handleWait(tid, smChannels, proxyChannels, op->inputChannelIndexes, op->nInputs, op->channelType); break; case OperationType::GET: - handleGet(smChannels[operations[i].inputChannelIndexes[0]], operations[i].inputOffsets[0], - operations[i].dstOffset, operations[i].size); + handleGet(smChannels[op->inputChannelIndexes[0]], op->inputOffsets[0], op->dstOffset, op->size); break; case OperationType::READ_REDUCE_COPY_SEND: - dst = getBuffer(input, output, scratch, operations[i].dstBufferType); - src = getBuffer(input, output, scratch, operations[i].srcBufferType); - handleReadReduceCopySend(dst, operations[i].dstOffset, src, operations[i].srcOffset, smChannels, - operations[i].outputChannelIndexes, operations[i].inputChannelIndexes, - operations[i].outputOffsets, operations[i].inputOffsets, operations[i].nOutputs, - operations[i].nInputs, operations[i].size); + dst = getBuffer(input, output, scratch, op->dstBufferType); + src = getBuffer(input, output, scratch, op->srcBufferType); + handleReadReduceCopySend(dst, op->dstOffset, src, op->srcOffset, smChannels, op->outputChannelIndexes, + op->inputChannelIndexes, op->outputOffsets, op->inputOffsets, op->nOutputs, + op->nInputs, op->size); break; case OperationType::READ_REDUCE_COPY: - dst = getBuffer(input, output, scratch, operations[i].dstBufferType); - src = getBuffer(input, output, scratch, operations[i].srcBufferType); - handleReadReduceCopySend(dst, operations[i].dstOffset, src, operations[i].srcOffset, smChannels, - operations[i].outputChannelIndexes, operations[i].inputChannelIndexes, - operations[i].outputOffsets, operations[i].inputOffsets, operations[i].nOutputs, - operations[i].nInputs, operations[i].size, false); + dst = getBuffer(input, output, scratch, op->dstBufferType); + src = getBuffer(input, output, scratch, op->srcBufferType); + handleReadReduceCopySend(dst, op->dstOffset, src, op->srcOffset, smChannels, op->outputChannelIndexes, + op->inputChannelIndexes, op->outputOffsets, op->inputOffsets, op->nOutputs, + op->nInputs, op->size, false); break; case OperationType::PUT_PACKET: - handlePutPacket(operations[i].srcOffset, smChannels, operations[i].outputChannelIndexes, - operations[i].outputOffsets, operations[i].nOutputs, operations[i].size, flag); + handlePutPacket(op->srcOffset, smChannels, op->outputChannelIndexes, op->outputOffsets, + op->nOutputs, op->size, flag); break; case OperationType::REDUCE_SEND_PACKET: - dst = getBuffer(input, output, scratch, operations[i].dstBufferType); - src = getBuffer(input, output, scratch, operations[i].srcBufferType); - tmp = getBuffer(input, output, scratch, operations[i].inputBufferType); - handleReduceSendPacket(dst, operations[i].dstOffset, src, operations[i].srcOffset, tmp, - operations[i].inputOffsets, operations[i].nInputs, smChannels, - operations[i].outputChannelIndexes, operations[i].outputOffsets, - operations[i].nOutputs, operations[i].size, flag); + dst = getBuffer(input, output, scratch, op->dstBufferType); + src = getBuffer(input, output, scratch, op->srcBufferType); + tmp = getBuffer(input, output, scratch, op->inputBufferType); + handleReduceSendPacket(dst, op->dstOffset, src, op->srcOffset, tmp, op->inputOffsets, + op->nInputs, smChannels, op->outputChannelIndexes, op->outputOffsets, + op->nOutputs, op->size, flag); break; case OperationType::COPY_PACKET: - dst = getBuffer(input, output, scratch, operations[i].dstBufferType); - src = getBuffer(input, output, scratch, operations[i].srcBufferType); - handleCopyPacket(dst, src, operations[i].dstOffset, operations[i].srcOffset, operations[i].size, - flag); + dst = getBuffer(input, output, scratch, op->dstBufferType); + src = getBuffer(input, output, scratch, op->srcBufferType); + handleCopyPacket(dst, src, op->dstOffset, op->srcOffset, op->size, flag); break; default: break; From 48d877c9b1c4a4e80516c310228153b53154a6c4 Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Wed, 17 Apr 2024 08:38:23 +0000 Subject: [PATCH 43/51] WIP --- src/include/execution_common.hpp | 4 ++-- src/include/execution_kernel.hpp | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/include/execution_common.hpp b/src/include/execution_common.hpp index 685317268..38d624109 100644 --- a/src/include/execution_common.hpp +++ b/src/include/execution_common.hpp @@ -67,8 +67,8 @@ struct Operation { uint32_t size; }; -// total size = 1920 + 6400 + 4 + 4(padding) = 8324 bytes -struct DeviceExecutionPlan { +// total size = 1920 + 6400 + 4 + 4(padding) + 12(align) = 8336 bytes +struct __attribute__((aligned(16))) DeviceExecutionPlan { uint8_t nSmChannels; // 1 bytes uint8_t nProxyChannels; // 1 bytes uint16_t nOperations; // 2 bytes diff --git a/src/include/execution_kernel.hpp b/src/include/execution_kernel.hpp index 5cf63af83..9fadb2bc1 100644 --- a/src/include/execution_kernel.hpp +++ b/src/include/execution_kernel.hpp @@ -260,12 +260,12 @@ MSCCLPP_DEVICE_INLINE void handleCopyPacket(void* dst, void* src, uint32_t dstOf template __global__ void executionKernel([[maybe_unused]] int rank /*for debug*/, T* input, T* output, T* scratch, DeviceExecutionPlan* plan, uint32_t flag) { - extern __shared__ int sharedMem[]; + extern __shared__ int4 sharedMem[]; int bid = blockIdx.x; int tid = threadIdx.x; DeviceExecutionPlan* localPlan = plan + bid; - for (size_t i = tid; i < sizeof(DeviceExecutionPlan) / sizeof(int); i += blockDim.x) { - sharedMem[i] = ((int*)localPlan)[i]; + for (size_t i = tid; i < sizeof(DeviceExecutionPlan) / sizeof(int4); i += blockDim.x) { + sharedMem[i] = ((int4*)localPlan)[i]; } #if defined(MSCCLPP_DEVICE_HIP) __synclds(); From d6b03669070e0244280789ba9033831db0ce997d Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Wed, 17 Apr 2024 14:53:35 +0000 Subject: [PATCH 44/51] packet fix --- src/executor/execution_kernel.cu | 26 ++++++++------- src/executor/execution_plan.cc | 3 +- src/executor/executor.cc | 10 +++--- src/include/execution_kernel.hpp | 55 ++++++++++++++++---------------- 4 files changed, 50 insertions(+), 44 deletions(-) diff --git a/src/executor/execution_kernel.cu b/src/executor/execution_kernel.cu index 7aca5b1ed..4e96af9ab 100644 --- a/src/executor/execution_kernel.cu +++ b/src/executor/execution_kernel.cu @@ -8,33 +8,35 @@ namespace mscclpp { template void ExecutionKernel::launchKernel(int rank, int nthreadblocks, int nthreads, void* src, void* dst, void* scratch, - DataType dataType, DeviceExecutionPlan* plan, size_t sharedMemSize, - cudaStream_t stream, uint32_t flag) { + size_t scratchSize, DataType dataType, DeviceExecutionPlan* plan, + size_t sharedMemSize, cudaStream_t stream, uint32_t flag) { switch (dataType) { case DataType::INT32: executionKernel<<>>( - rank, (int32_t*)src, (int32_t*)dst, (int32_t*)scratch, plan, flag); + rank, (int32_t*)src, (int32_t*)dst, (int32_t*)scratch, scratchSize, plan, flag); break; case DataType::UINT32: executionKernel<<>>( - rank, (uint32_t*)src, (uint32_t*)dst, (uint32_t*)scratch, plan, flag); + rank, (uint32_t*)src, (uint32_t*)dst, (uint32_t*)scratch, scratchSize, plan, flag); break; case DataType::FLOAT16: - executionKernel<<>>(rank, (half*)src, (half*)dst, - (half*)scratch, plan, flag); + executionKernel<<>>( + rank, (half*)src, (half*)dst, (half*)scratch, scratchSize, plan, flag); break; case DataType::FLOAT32: - executionKernel<<>>(rank, (float*)src, (float*)dst, - (float*)scratch, plan, flag); + executionKernel<<>>( + rank, (float*)src, (float*)dst, (float*)scratch, scratchSize, plan, flag); break; } } template void ExecutionKernel::launchKernel(int rank, int nthreadblocks, int nthreads, void* src, void* dst, - void* scratch, DataType dataType, DeviceExecutionPlan* plan, - size_t sharedMemSize, cudaStream_t stream, uint32_t flag); + void* scratch, size_t scratchSize, DataType dataType, + DeviceExecutionPlan* plan, size_t sharedMemSize, + cudaStream_t stream, uint32_t flag); template void ExecutionKernel::launchKernel(int rank, int nthreadblocks, int nthreads, void* src, void* dst, - void* scratch, DataType dataType, DeviceExecutionPlan* plan, - size_t sharedMemSize, cudaStream_t stream, uint32_t flag); + void* scratch, size_t scratchSize, DataType dataType, + DeviceExecutionPlan* plan, size_t sharedMemSize, + cudaStream_t stream, uint32_t flag); } // namespace mscclpp #endif diff --git a/src/executor/execution_plan.cc b/src/executor/execution_plan.cc index 5948d029d..341c9bd9f 100644 --- a/src/executor/execution_plan.cc +++ b/src/executor/execution_plan.cc @@ -109,7 +109,8 @@ std::vector ExecutionPlan::Impl::getConnectedBufferTypes(int rank) c } size_t ExecutionPlan::Impl::getScratchBufferSize(int rank, size_t inputSize) const { if (this->isUsingPacket) { - return inputSize / this->inputChunks.at(rank) * this->scratchChunks.at(rank) * 2; + return inputSize / this->inputChunks.at(rank) * this->scratchChunks.at(rank) * 2 /* data + flag*/ * + 2 /*double buffer*/; } return inputSize / this->inputChunks.at(rank) * this->scratchChunks.at(rank); } diff --git a/src/executor/executor.cc b/src/executor/executor.cc index d4112f99b..5b1a827c6 100644 --- a/src/executor/executor.cc +++ b/src/executor/executor.cc @@ -263,13 +263,15 @@ struct Executor::Impl { switch (packetType) { case PacketType::LL16: ExecutionKernel::launchKernel( - rank, nthreadblocks, nthreadsPerBlock, sendbuff, recvbuff, (void*)context.scratchBuffer.get(), dataType, - (DeviceExecutionPlan*)context.deviceExecutionPlansBuffer.get(), sharedMemSize, stream, ++flag); + rank, nthreadblocks, nthreadsPerBlock, sendbuff, recvbuff, (void*)context.scratchBuffer.get(), + context.scratchBufferSize, dataType, (DeviceExecutionPlan*)context.deviceExecutionPlansBuffer.get(), + sharedMemSize, stream, ++flag); break; case PacketType::LL8: ExecutionKernel::launchKernel( - rank, nthreadblocks, nthreadsPerBlock, sendbuff, recvbuff, (void*)context.scratchBuffer.get(), dataType, - (DeviceExecutionPlan*)context.deviceExecutionPlansBuffer.get(), sharedMemSize, stream, ++flag); + rank, nthreadblocks, nthreadsPerBlock, sendbuff, recvbuff, (void*)context.scratchBuffer.get(), + context.scratchBufferSize, dataType, (DeviceExecutionPlan*)context.deviceExecutionPlansBuffer.get(), + sharedMemSize, stream, ++flag); break; default: throw std::runtime_error("Invalid packet type"); diff --git a/src/include/execution_kernel.hpp b/src/include/execution_kernel.hpp index 9fadb2bc1..4cfd1698a 100644 --- a/src/include/execution_kernel.hpp +++ b/src/include/execution_kernel.hpp @@ -207,22 +207,24 @@ MSCCLPP_DEVICE_INLINE void handleReadReduceCopySend(T* output, uint32_t outputOf } template -MSCCLPP_DEVICE_INLINE void handlePutPacket(uint32_t inputOffsetByBytes, DeviceHandle* smChannels, - uint8_t* dstChannelIndexes, uint32_t* dstOffsets, int nDstChannels, - uint32_t size, uint32_t flag) { +MSCCLPP_DEVICE_INLINE void handlePutPacket(uint32_t inputOffsetByBytes, size_t scratchSize, + DeviceHandle* smChannels, uint8_t* dstChannelIndexes, + uint32_t* dstOffsets, int nDstChannels, uint32_t size, uint32_t flag) { + const size_t scratchBaseOffset = flag & 0x1 ? 0 : scratchSize >> 1; for (int index = 0; index < nDstChannels; ++index) { - smChannels[dstChannelIndexes[index]].putPackets(dstOffsets[index] * 2, inputOffsetByBytes, size, - threadIdx.x, blockDim.x, flag); + smChannels[dstChannelIndexes[index]].putPackets( + scratchBaseOffset + dstOffsets[index] * 2, inputOffsetByBytes, size, threadIdx.x, blockDim.x, flag); } } template MSCCLPP_DEVICE_INLINE void handleReduceSendPacket(T* dst, uint32_t dstOffsetByBytes, T* src, uint32_t srcOffsetByBytes, - T* inputBuff, uint32_t* inputOffsets, int nSrcs, + T* inputBuff, size_t inputBuffSize, uint32_t* inputOffsets, int nSrcs, DeviceHandle* smChannels, uint8_t* outputChannelIndexes, uint32_t* outputOffsets, int nDstChannels, size_t size, uint32_t flag) { size_t nPackets = size * 2 / sizeof(PacketType); + const size_t intputBaseOffset = flag & 0x1 ? 0 : inputBuffSize >> 1; const uint32_t srcOffset = srcOffsetByBytes / sizeof(PacketValType); const uint32_t dstOffset = dstOffsetByBytes / sizeof(PacketValType); PacketValType* srcPacketValue = (PacketValType*)src + srcOffset; @@ -230,7 +232,7 @@ MSCCLPP_DEVICE_INLINE void handleReduceSendPacket(T* dst, uint32_t dstOffsetByBy for (size_t idx = threadIdx.x; idx < nPackets; idx += blockDim.x) { PacketValType data = {}; for (int index = 0; index < nSrcs; ++index) { - PacketType* pkt = (PacketType*)((char*)inputBuff + 2 * inputOffsets[index]); + PacketType* pkt = (PacketType*)((char*)inputBuff + intputBaseOffset + 2 * inputOffsets[index]); PacketValType val = pkt[idx].read(flag); data = add_vectors(data, val); } @@ -239,16 +241,17 @@ MSCCLPP_DEVICE_INLINE void handleReduceSendPacket(T* dst, uint32_t dstOffsetByBy PacketType pkt(data, flag); for (int index = 0; index < nDstChannels; ++index) { - size_t offset = (outputOffsets[index] * 2) / sizeof(PacketType); + size_t offset = (intputBaseOffset + outputOffsets[index] * 2) / sizeof(PacketType); smChannels[outputChannelIndexes[index]].write(offset + idx, pkt); } } } template -MSCCLPP_DEVICE_INLINE void handleCopyPacket(void* dst, void* src, uint32_t dstOffset, uint32_t srcOffset, size_t size, - uint32_t flag) { - PacketType* srcPackets = (PacketType*)((char*)src + 2 * srcOffset); +MSCCLPP_DEVICE_INLINE void handleCopyPacket(void* dst, void* src, size_t srcSize, uint32_t dstOffset, + uint32_t srcOffset, size_t size, uint32_t flag) { + const size_t outputScratchBaseOffset = flag & 0x1 ? 0 : srcSize >> 1; + PacketType* srcPackets = (PacketType*)((char*)src + outputScratchBaseOffset + 2 * srcOffset); PacketValType* result = (PacketValType*)((char*)dst + dstOffset); size_t nPackets = size * 2 / sizeof(PacketType); for (size_t idx = threadIdx.x; idx < nPackets; idx += blockDim.x) { @@ -259,7 +262,7 @@ MSCCLPP_DEVICE_INLINE void handleCopyPacket(void* dst, void* src, uint32_t dstOf template __global__ void executionKernel([[maybe_unused]] int rank /*for debug*/, T* input, T* output, T* scratch, - DeviceExecutionPlan* plan, uint32_t flag) { + size_t scratchSize, DeviceExecutionPlan* plan, uint32_t flag) { extern __shared__ int4 sharedMem[]; int bid = blockIdx.x; int tid = threadIdx.x; @@ -279,7 +282,6 @@ __global__ void executionKernel([[maybe_unused]] int rank /*for debug*/, T* inpu DeviceHandle* proxyChannels = localPlan->channels.proxyChannels; T* src = nullptr; T* dst = nullptr; - T* tmp = nullptr; for (int i = 0; i < nOperations; i++) { Operation* op = &operations[i]; switch (op->type) { @@ -310,21 +312,20 @@ __global__ void executionKernel([[maybe_unused]] int rank /*for debug*/, T* inpu op->nInputs, op->size, false); break; case OperationType::PUT_PACKET: - handlePutPacket(op->srcOffset, smChannels, op->outputChannelIndexes, op->outputOffsets, + handlePutPacket(op->srcOffset, scratchSize, smChannels, op->outputChannelIndexes, op->outputOffsets, op->nOutputs, op->size, flag); break; case OperationType::REDUCE_SEND_PACKET: dst = getBuffer(input, output, scratch, op->dstBufferType); src = getBuffer(input, output, scratch, op->srcBufferType); - tmp = getBuffer(input, output, scratch, op->inputBufferType); - handleReduceSendPacket(dst, op->dstOffset, src, op->srcOffset, tmp, op->inputOffsets, - op->nInputs, smChannels, op->outputChannelIndexes, op->outputOffsets, - op->nOutputs, op->size, flag); + handleReduceSendPacket(dst, op->dstOffset, src, op->srcOffset, scratch, scratchSize, + op->inputOffsets, op->nInputs, smChannels, op->outputChannelIndexes, + op->outputOffsets, op->nOutputs, op->size, flag); break; case OperationType::COPY_PACKET: dst = getBuffer(input, output, scratch, op->dstBufferType); src = getBuffer(input, output, scratch, op->srcBufferType); - handleCopyPacket(dst, src, op->dstOffset, op->srcOffset, op->size, flag); + handleCopyPacket(dst, src, scratchSize, op->dstOffset, op->srcOffset, op->size, flag); break; default: break; @@ -338,32 +339,32 @@ class ExecutionKernel { #if defined(MSCCLPP_DEVICE_HIP) template static void launchKernel(int rank, int nthreadblocks, int nthreads, void* src, void* dst, void* scratch, - DataType dataType, DeviceExecutionPlan* plan, size_t sharedMemSize, cudaStream_t stream, - uint32_t flag = 0) { + size_t scratchSize, DataType dataType, DeviceExecutionPlan* plan, size_t sharedMemSize, + cudaStream_t stream, uint32_t flag = 0) { switch (dataType) { case DataType::INT32: executionKernel<<>>( - rank, (int32_t*)src, (int32_t*)dst, (int32_t*)scratch, plan, flag); + rank, (int32_t*)src, (int32_t*)dst, (int32_t*)scratch, scratchSize, plan, flag); break; case DataType::UINT32: executionKernel<<>>( - rank, (uint32_t*)src, (uint32_t*)dst, (uint32_t*)scratch, plan, flag); + rank, (uint32_t*)src, (uint32_t*)dst, (uint32_t*)scratch, scratchSize, plan, flag); break; case DataType::FLOAT16: executionKernel<<>>( - rank, (half*)src, (half*)dst, (half*)scratch, plan, flag); + rank, (half*)src, (half*)dst, (half*)scratch, scratchSize, plan, flag); break; case DataType::FLOAT32: executionKernel<<>>( - rank, (float*)src, (float*)dst, (float*)scratch, plan, flag); + rank, (float*)src, (float*)dst, (float*)scratch, scratchSize, plan, flag); break; } } #else // !defined(MSCCLPP_DEVICE_HIP) template static void launchKernel(int rank, int nthreadblocks, int nthreads, void* src, void* dst, void* scratch, - DataType dataType, DeviceExecutionPlan* plan, size_t sharedMemSize, cudaStream_t stream, - uint32_t flag = 0); + size_t scratchSize, DataType dataType, DeviceExecutionPlan* plan, size_t sharedMemSize, + cudaStream_t stream, uint32_t flag = 0); #endif // !defined(MSCCLPP_DEVICE_HIP) }; } // namespace mscclpp From 0c2b2c14f4b82118b06cf79c8410e40223a19c54 Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Wed, 17 Apr 2024 19:07:25 +0000 Subject: [PATCH 45/51] Minor updates --- include/mscclpp/executor.hpp | 2 +- src/executor/execution_plan.cc | 2 +- src/include/execution_kernel.hpp | 53 ++++++++++++++++---------------- 3 files changed, 28 insertions(+), 29 deletions(-) diff --git a/include/mscclpp/executor.hpp b/include/mscclpp/executor.hpp index f54c80585..ab54c2596 100644 --- a/include/mscclpp/executor.hpp +++ b/include/mscclpp/executor.hpp @@ -24,7 +24,7 @@ enum class PacketType { class ExecutionPlan { public: - ExecutionPlan(const std::string name, const std::string planPath); + ExecutionPlan(const std::string& name, const std::string& planPath); ~ExecutionPlan() = default; private: diff --git a/src/executor/execution_plan.cc b/src/executor/execution_plan.cc index 341c9bd9f..9decbed1b 100644 --- a/src/executor/execution_plan.cc +++ b/src/executor/execution_plan.cc @@ -130,7 +130,7 @@ void ExecutionPlan::Impl::loadExecutionPlan(int rank, size_t inputSize) { if (protocol == "LL") { this->isUsingPacket = true; } - auto gpus = obj["gpus"]; + const auto& gpus = obj["gpus"]; for (const auto& gpu : gpus) { int rank = gpu["id"]; diff --git a/src/include/execution_kernel.hpp b/src/include/execution_kernel.hpp index 4cfd1698a..3e8d05f50 100644 --- a/src/include/execution_kernel.hpp +++ b/src/include/execution_kernel.hpp @@ -280,52 +280,51 @@ __global__ void executionKernel([[maybe_unused]] int rank /*for debug*/, T* inpu Operation* operations = localPlan->operations; DeviceHandle* smChannels = localPlan->channels.smChannels; DeviceHandle* proxyChannels = localPlan->channels.proxyChannels; - T* src = nullptr; - T* dst = nullptr; + for (int i = 0; i < nOperations; i++) { - Operation* op = &operations[i]; - switch (op->type) { + Operation& op = operations[i]; + switch (op.type) { case OperationType::BARRIER: __syncthreads(); break; case OperationType::SIGNAL: - handleSignal(tid, smChannels, proxyChannels, op->outputChannelIndexes, op->nOutputs, op->channelType); + handleSignal(tid, smChannels, proxyChannels, op.outputChannelIndexes, op.nOutputs, op.channelType); break; case OperationType::WAIT: - handleWait(tid, smChannels, proxyChannels, op->inputChannelIndexes, op->nInputs, op->channelType); + handleWait(tid, smChannels, proxyChannels, op.inputChannelIndexes, op.nInputs, op.channelType); break; case OperationType::GET: - handleGet(smChannels[op->inputChannelIndexes[0]], op->inputOffsets[0], op->dstOffset, op->size); + handleGet(smChannels[op.inputChannelIndexes[0]], op.inputOffsets[0], op.dstOffset, op.size); break; case OperationType::READ_REDUCE_COPY_SEND: - dst = getBuffer(input, output, scratch, op->dstBufferType); - src = getBuffer(input, output, scratch, op->srcBufferType); - handleReadReduceCopySend(dst, op->dstOffset, src, op->srcOffset, smChannels, op->outputChannelIndexes, - op->inputChannelIndexes, op->outputOffsets, op->inputOffsets, op->nOutputs, - op->nInputs, op->size); + T* dst = getBuffer(input, output, scratch, op.dstBufferType); + T* src = getBuffer(input, output, scratch, op.srcBufferType); + handleReadReduceCopySend(dst, op.dstOffset, src, op.srcOffset, smChannels, op.outputChannelIndexes, + op.inputChannelIndexes, op.outputOffsets, op.inputOffsets, op.nOutputs, + op.nInputs, op.size); break; case OperationType::READ_REDUCE_COPY: - dst = getBuffer(input, output, scratch, op->dstBufferType); - src = getBuffer(input, output, scratch, op->srcBufferType); - handleReadReduceCopySend(dst, op->dstOffset, src, op->srcOffset, smChannels, op->outputChannelIndexes, - op->inputChannelIndexes, op->outputOffsets, op->inputOffsets, op->nOutputs, - op->nInputs, op->size, false); + T* dst = getBuffer(input, output, scratch, op.dstBufferType); + T* src = getBuffer(input, output, scratch, op.srcBufferType); + handleReadReduceCopySend(dst, op.dstOffset, src, op.srcOffset, smChannels, op.outputChannelIndexes, + op.inputChannelIndexes, op.outputOffsets, op.inputOffsets, op.nOutputs, + op.nInputs, op.size, false); break; case OperationType::PUT_PACKET: - handlePutPacket(op->srcOffset, scratchSize, smChannels, op->outputChannelIndexes, op->outputOffsets, - op->nOutputs, op->size, flag); + handlePutPacket(op.srcOffset, scratchSize, smChannels, op.outputChannelIndexes, op.outputOffsets, + op.nOutputs, op.size, flag); break; case OperationType::REDUCE_SEND_PACKET: - dst = getBuffer(input, output, scratch, op->dstBufferType); - src = getBuffer(input, output, scratch, op->srcBufferType); - handleReduceSendPacket(dst, op->dstOffset, src, op->srcOffset, scratch, scratchSize, - op->inputOffsets, op->nInputs, smChannels, op->outputChannelIndexes, - op->outputOffsets, op->nOutputs, op->size, flag); + T* dst = getBuffer(input, output, scratch, op.dstBufferType); + T* src = getBuffer(input, output, scratch, op.srcBufferType); + handleReduceSendPacket(dst, op.dstOffset, src, op.srcOffset, scratch, scratchSize, + op.inputOffsets, op.nInputs, smChannels, op.outputChannelIndexes, + op.outputOffsets, op.nOutputs, op.size, flag); break; case OperationType::COPY_PACKET: - dst = getBuffer(input, output, scratch, op->dstBufferType); - src = getBuffer(input, output, scratch, op->srcBufferType); - handleCopyPacket(dst, src, scratchSize, op->dstOffset, op->srcOffset, op->size, flag); + T* dst = getBuffer(input, output, scratch, op.dstBufferType); + T* src = getBuffer(input, output, scratch, op.srcBufferType); + handleCopyPacket(dst, src, scratchSize, op.dstOffset, op.srcOffset, op.size, flag); break; default: break; From 8d1b644e05b57931246d6f57d7bf2c3273672d58 Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Wed, 17 Apr 2024 20:16:15 +0000 Subject: [PATCH 46/51] minor updates --- src/executor/execution_plan.cc | 2 +- src/include/execution_kernel.hpp | 78 ++++++++++++++------------------ 2 files changed, 34 insertions(+), 46 deletions(-) diff --git a/src/executor/execution_plan.cc b/src/executor/execution_plan.cc index 9decbed1b..60fb2b438 100644 --- a/src/executor/execution_plan.cc +++ b/src/executor/execution_plan.cc @@ -272,7 +272,7 @@ void ExecutionPlan::Impl::setupOperations(const json& gpus) { } } -ExecutionPlan::ExecutionPlan(const std::string name, const std::string planPath) +ExecutionPlan::ExecutionPlan(const std::string& name, const std::string& planPath) : impl_(std::make_shared(name, planPath)) {} } // namespace mscclpp diff --git a/src/include/execution_kernel.hpp b/src/include/execution_kernel.hpp index 3e8d05f50..9749faf69 100644 --- a/src/include/execution_kernel.hpp +++ b/src/include/execution_kernel.hpp @@ -283,51 +283,39 @@ __global__ void executionKernel([[maybe_unused]] int rank /*for debug*/, T* inpu for (int i = 0; i < nOperations; i++) { Operation& op = operations[i]; - switch (op.type) { - case OperationType::BARRIER: - __syncthreads(); - break; - case OperationType::SIGNAL: - handleSignal(tid, smChannels, proxyChannels, op.outputChannelIndexes, op.nOutputs, op.channelType); - break; - case OperationType::WAIT: - handleWait(tid, smChannels, proxyChannels, op.inputChannelIndexes, op.nInputs, op.channelType); - break; - case OperationType::GET: - handleGet(smChannels[op.inputChannelIndexes[0]], op.inputOffsets[0], op.dstOffset, op.size); - break; - case OperationType::READ_REDUCE_COPY_SEND: - T* dst = getBuffer(input, output, scratch, op.dstBufferType); - T* src = getBuffer(input, output, scratch, op.srcBufferType); - handleReadReduceCopySend(dst, op.dstOffset, src, op.srcOffset, smChannels, op.outputChannelIndexes, - op.inputChannelIndexes, op.outputOffsets, op.inputOffsets, op.nOutputs, - op.nInputs, op.size); - break; - case OperationType::READ_REDUCE_COPY: - T* dst = getBuffer(input, output, scratch, op.dstBufferType); - T* src = getBuffer(input, output, scratch, op.srcBufferType); - handleReadReduceCopySend(dst, op.dstOffset, src, op.srcOffset, smChannels, op.outputChannelIndexes, - op.inputChannelIndexes, op.outputOffsets, op.inputOffsets, op.nOutputs, - op.nInputs, op.size, false); - break; - case OperationType::PUT_PACKET: - handlePutPacket(op.srcOffset, scratchSize, smChannels, op.outputChannelIndexes, op.outputOffsets, - op.nOutputs, op.size, flag); - break; - case OperationType::REDUCE_SEND_PACKET: - T* dst = getBuffer(input, output, scratch, op.dstBufferType); - T* src = getBuffer(input, output, scratch, op.srcBufferType); - handleReduceSendPacket(dst, op.dstOffset, src, op.srcOffset, scratch, scratchSize, - op.inputOffsets, op.nInputs, smChannels, op.outputChannelIndexes, - op.outputOffsets, op.nOutputs, op.size, flag); - break; - case OperationType::COPY_PACKET: - T* dst = getBuffer(input, output, scratch, op.dstBufferType); - T* src = getBuffer(input, output, scratch, op.srcBufferType); - handleCopyPacket(dst, src, scratchSize, op.dstOffset, op.srcOffset, op.size, flag); - break; - default: - break; + if (op.type == OperationType::BARRIER) { + __syncthreads(); + } else if (op.type == OperationType::SIGNAL) { + handleSignal(tid, smChannels, proxyChannels, op.outputChannelIndexes, op.nOutputs, op.channelType); + } else if (op.type == OperationType::WAIT) { + handleWait(tid, smChannels, proxyChannels, op.inputChannelIndexes, op.nInputs, op.channelType); + } else if (op.type == OperationType::GET) { + handleGet(smChannels[op.inputChannelIndexes[0]], op.inputOffsets[0], op.dstOffset, op.size); + } else if (op.type == OperationType::READ_REDUCE_COPY_SEND) { + T* dst = getBuffer(input, output, scratch, op.dstBufferType); + T* src = getBuffer(input, output, scratch, op.srcBufferType); + handleReadReduceCopySend(dst, op.dstOffset, src, op.srcOffset, smChannels, op.outputChannelIndexes, + op.inputChannelIndexes, op.outputOffsets, op.inputOffsets, op.nOutputs, op.nInputs, + op.size); + } else if (op.type == OperationType::READ_REDUCE_COPY) { + T* dst = getBuffer(input, output, scratch, op.dstBufferType); + T* src = getBuffer(input, output, scratch, op.srcBufferType); + handleReadReduceCopySend(dst, op.dstOffset, src, op.srcOffset, smChannels, op.outputChannelIndexes, + op.inputChannelIndexes, op.outputOffsets, op.inputOffsets, op.nOutputs, op.nInputs, + op.size, false); + } else if (op.type == OperationType::PUT_PACKET) { + handlePutPacket(op.srcOffset, scratchSize, smChannels, op.outputChannelIndexes, op.outputOffsets, + op.nOutputs, op.size, flag); + } else if (op.type == OperationType::REDUCE_SEND_PACKET) { + T* dst = getBuffer(input, output, scratch, op.dstBufferType); + T* src = getBuffer(input, output, scratch, op.srcBufferType); + handleReduceSendPacket(dst, op.dstOffset, src, op.srcOffset, scratch, scratchSize, op.inputOffsets, + op.nInputs, smChannels, op.outputChannelIndexes, op.outputOffsets, + op.nOutputs, op.size, flag); + } else if (op.type == OperationType::COPY_PACKET) { + T* dst = getBuffer(input, output, scratch, op.dstBufferType); + T* src = getBuffer(input, output, scratch, op.srcBufferType); + handleCopyPacket(dst, src, scratchSize, op.dstOffset, op.srcOffset, op.size, flag); } } } From c29df8eca260fe4326f8a3703c41c6dbd0174e36 Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Thu, 18 Apr 2024 03:17:50 +0000 Subject: [PATCH 47/51] Fix hang --- python/test/executor_test.py | 12 +++++++----- test/execution-files/allreduce_packet.json | 2 +- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/python/test/executor_test.py b/python/test/executor_test.py index 50b296b2d..239ecd4a7 100644 --- a/python/test/executor_test.py +++ b/python/test/executor_test.py @@ -15,7 +15,7 @@ MSCCLPP_ROOT_PATH = "/root/mscclpp" -def bench_time(niters: int, func): +def bench_time(niters: int, ngraphIters: int, func): # capture cuda graph for niters of the kernel launch stream = cp.cuda.Stream(non_blocking=True) with stream: @@ -32,11 +32,12 @@ def bench_time(niters: int, func): end = cp.cuda.Event() start.record(stream) - graph.launch(stream) + for _ in range(ngraphIters): + graph.launch(stream) end.record(stream) end.synchronize() - return cp.cuda.get_elapsed_time(start, end) / niters * 1000.0 + return cp.cuda.get_elapsed_time(start, end) / niters * 1000.0 / ngraphIters if __name__ == "__main__": @@ -59,7 +60,8 @@ def bench_time(niters: int, func): mscclpp_group.barrier() execution_time = bench_time( - 1000, + 100, + 10, lambda stream: executor.execute( MPI.COMM_WORLD.rank, sendbuf.data.ptr, @@ -72,6 +74,6 @@ def bench_time(niters: int, func): stream.ptr, ), ) - print(f"Execution time: {execution_time} us, data size: {sendbuf.nbytes} bytes") + print(f"Rank: {MPI.COMM_WORLD.rank} Execution time: {execution_time} us, data size: {sendbuf.nbytes} bytes") executor = None mscclpp_group = None diff --git a/test/execution-files/allreduce_packet.json b/test/execution-files/allreduce_packet.json index 7045f21c2..c01ae4fd9 100644 --- a/test/execution-files/allreduce_packet.json +++ b/test/execution-files/allreduce_packet.json @@ -169,7 +169,7 @@ "id": 1, "inputChunks": 4, "outputChunks": 0, - "scratchChunks": 6, + "scratchChunks": 8, "threadblocks": [ { "id": 0, From 4a739909c9c27959cb4b26fc8bf0a0db0419489f Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Thu, 18 Apr 2024 05:36:10 +0000 Subject: [PATCH 48/51] address comments --- include/mscclpp/core.hpp | 2 +- include/mscclpp/errors.hpp | 1 + include/mscclpp/packet_device.hpp | 4 ++-- src/errors.cc | 4 ++++ src/executor/execution_plan.cc | 8 ++++---- src/executor/executor.cc | 6 +++--- src/include/execution_kernel.hpp | 20 ++++++++++---------- 7 files changed, 25 insertions(+), 20 deletions(-) diff --git a/include/mscclpp/core.hpp b/include/mscclpp/core.hpp index 456020975..132df587a 100644 --- a/include/mscclpp/core.hpp +++ b/include/mscclpp/core.hpp @@ -762,7 +762,7 @@ DeviceHandle> deviceHandle(T&& t) { /// Packet value type. template -using PacketValType = typename T::ValueType; +using PacketPayload = typename T::Payload; } // namespace mscclpp diff --git a/include/mscclpp/errors.hpp b/include/mscclpp/errors.hpp index 4e90c8d84..8d3fde4d1 100644 --- a/include/mscclpp/errors.hpp +++ b/include/mscclpp/errors.hpp @@ -16,6 +16,7 @@ enum class ErrorCode { InvalidUsage, // The function was used incorrectly. Timeout, // The operation timed out. Aborted, // The operation was aborted. + ExecutorError, // An error occurred in the MSCCL++ executor. }; /// Convert an error code to a string. diff --git a/include/mscclpp/packet_device.hpp b/include/mscclpp/packet_device.hpp index a20c8abec..8cff4c790 100644 --- a/include/mscclpp/packet_device.hpp +++ b/include/mscclpp/packet_device.hpp @@ -24,7 +24,7 @@ union alignas(16) LL16Packet { uint32_t data2; uint32_t flag2; }; - using ValueType = uint2; + using Payload = uint2; #if defined(MSCCLPP_DEVICE_COMPILE) ulonglong2 raw_; @@ -104,7 +104,7 @@ union alignas(8) LL8Packet { }; uint64_t raw_; - using ValueType = uint32_t; + using Payload = uint32_t; #if defined(MSCCLPP_DEVICE_COMPILE) MSCCLPP_DEVICE_INLINE LL8Packet() {} diff --git a/src/errors.cc b/src/errors.cc index 537b3fc27..fbc7a5734 100644 --- a/src/errors.cc +++ b/src/errors.cc @@ -19,6 +19,10 @@ std::string errorToString(enum ErrorCode error) { return "InvalidUsage"; case ErrorCode::Timeout: return "Timeout"; + case ErrorCode::Aborted: + return "Aborted"; + case ErrorCode::ExecutorError: + return "ExecutorError"; default: return "UnknownError"; } diff --git a/src/executor/execution_plan.cc b/src/executor/execution_plan.cc index 60fb2b438..da7e135a7 100644 --- a/src/executor/execution_plan.cc +++ b/src/executor/execution_plan.cc @@ -44,7 +44,7 @@ auto getOpType = [](const std::string& str) { } else if (str == "cpkt") { return mscclpp::OperationType::COPY_PACKET; } else { - throw std::runtime_error("Invalid operation type"); + throw mscclpp::Error("Invalid operation type", mscclpp::ErrorCode::ExecutorError); } }; @@ -56,7 +56,7 @@ auto convertToBufferType = [](const std::string& str) { } else if (str == "s") { return mscclpp::BufferType::SCRATCH; } else { - throw std::runtime_error("Invalid buffer type"); + throw mscclpp::Error("Invalid buffer type", mscclpp::ErrorCode::ExecutorError); } }; @@ -68,7 +68,7 @@ auto convertToChannelType = [](const std::string& str) { } else if (str == "none") { return mscclpp::ChannelType::NONE; } else { - throw std::runtime_error("Invalid channel type"); + throw mscclpp::Error("Invalid channel type", mscclpp::ErrorCode::ExecutorError); } }; @@ -124,7 +124,7 @@ void ExecutionPlan::Impl::loadExecutionPlan(int rank, size_t inputSize) { std::ifstream file(this->planPath); json obj = json::parse(file); if (this->name != obj["name"]) { - throw std::runtime_error("Plan name does not match"); + throw Error("Plan name does not match", ErrorCode::ExecutorError); } std::string protocol = obj["protocol"]; if (protocol == "LL") { diff --git a/src/executor/executor.cc b/src/executor/executor.cc index 5b1a827c6..fb033a73e 100644 --- a/src/executor/executor.cc +++ b/src/executor/executor.cc @@ -142,7 +142,7 @@ struct Executor::Impl { case BufferType::SCRATCH: return std::make_pair((void*)context.scratchBuffer.get(), context.scratchBufferSize); default: - throw std::runtime_error("Invalid buffer type"); + throw Error("Invalid buffer type", ErrorCode::ExecutorError); } }; auto getConnectedPeers = [&](std::vector& infos) { @@ -206,7 +206,7 @@ struct Executor::Impl { case BufferType::SCRATCH: return (void*)context.scratchBuffer.get(); default: - throw std::runtime_error("Invalid buffer type"); + throw Error("Invalid buffer type", ErrorCode::ExecutorError); } }; for (ChannelType channelType : channelTypes) { @@ -274,7 +274,7 @@ struct Executor::Impl { sharedMemSize, stream, ++flag); break; default: - throw std::runtime_error("Invalid packet type"); + throw Error("Invalid packet type", ErrorCode::ExecutorError); } } }; diff --git a/src/include/execution_kernel.hpp b/src/include/execution_kernel.hpp index 9749faf69..08e8796a5 100644 --- a/src/include/execution_kernel.hpp +++ b/src/include/execution_kernel.hpp @@ -225,19 +225,19 @@ MSCCLPP_DEVICE_INLINE void handleReduceSendPacket(T* dst, uint32_t dstOffsetByBy uint32_t flag) { size_t nPackets = size * 2 / sizeof(PacketType); const size_t intputBaseOffset = flag & 0x1 ? 0 : inputBuffSize >> 1; - const uint32_t srcOffset = srcOffsetByBytes / sizeof(PacketValType); - const uint32_t dstOffset = dstOffsetByBytes / sizeof(PacketValType); - PacketValType* srcPacketValue = (PacketValType*)src + srcOffset; - PacketValType* dstPacketValue = (PacketValType*)dst + dstOffset; + const uint32_t srcOffset = srcOffsetByBytes / sizeof(PacketPayload); + const uint32_t dstOffset = dstOffsetByBytes / sizeof(PacketPayload); + PacketPayload* srcPacketPayload = (PacketPayload*)src + srcOffset; + PacketPayload* dstPacketPayload = (PacketPayload*)dst + dstOffset; for (size_t idx = threadIdx.x; idx < nPackets; idx += blockDim.x) { - PacketValType data = {}; + PacketPayload data = {}; for (int index = 0; index < nSrcs; ++index) { PacketType* pkt = (PacketType*)((char*)inputBuff + intputBaseOffset + 2 * inputOffsets[index]); - PacketValType val = pkt[idx].read(flag); + PacketPayload val = pkt[idx].read(flag); data = add_vectors(data, val); } - data = add_vectors(data, srcPacketValue[idx]); - dstPacketValue[idx] = data; + data = add_vectors(data, srcPacketPayload[idx]); + dstPacketPayload[idx] = data; PacketType pkt(data, flag); for (int index = 0; index < nDstChannels; ++index) { @@ -252,10 +252,10 @@ MSCCLPP_DEVICE_INLINE void handleCopyPacket(void* dst, void* src, size_t srcSize uint32_t srcOffset, size_t size, uint32_t flag) { const size_t outputScratchBaseOffset = flag & 0x1 ? 0 : srcSize >> 1; PacketType* srcPackets = (PacketType*)((char*)src + outputScratchBaseOffset + 2 * srcOffset); - PacketValType* result = (PacketValType*)((char*)dst + dstOffset); + PacketPayload* result = (PacketPayload*)((char*)dst + dstOffset); size_t nPackets = size * 2 / sizeof(PacketType); for (size_t idx = threadIdx.x; idx < nPackets; idx += blockDim.x) { - PacketValType data = srcPackets[idx].read(flag); + PacketPayload data = srcPackets[idx].read(flag); result[idx] = data; } } From d4671bc6b77f012f6c37a59bfe139a99254e5a8d Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Thu, 18 Apr 2024 11:10:19 +0000 Subject: [PATCH 49/51] address comments --- include/mscclpp/core.hpp | 4 ++++ include/mscclpp/executor.hpp | 2 +- python/mscclpp/comm.py | 1 + python/mscclpp/core_py.cpp | 1 + python/mscclpp/executor.cpp | 2 +- python/test/executor_test.py | 8 ++------ python/test/test_mscclpp.py | 4 ++-- src/bootstrap/bootstrap.cc | 25 +++++++++++++++++++++++++ src/executor/executor.cc | 6 +++--- test/executor_test.cc | 4 ++-- test/mp_unit/bootstrap_tests.cc | 7 +++++++ 11 files changed, 49 insertions(+), 15 deletions(-) diff --git a/include/mscclpp/core.hpp b/include/mscclpp/core.hpp index 132df587a..01b8096a9 100644 --- a/include/mscclpp/core.hpp +++ b/include/mscclpp/core.hpp @@ -38,6 +38,7 @@ class Bootstrap { virtual ~Bootstrap() = default; virtual int getRank() = 0; virtual int getNranks() = 0; + virtual int getNranksPerNode() = 0; virtual void send(void* data, int size, int peer, int tag) = 0; virtual void recv(void* data, int size, int peer, int tag) = 0; virtual void allGather(void* allData, int size) = 0; @@ -83,6 +84,9 @@ class TcpBootstrap : public Bootstrap { /// Return the total number of ranks. int getNranks() override; + /// Return the total number of ranks per node. + int getNranksPerNode() override; + /// Send data to another process. /// /// Data sent via `send(senderBuff, size, receiverRank, tag)` can be received via `recv(receiverBuff, size, diff --git a/include/mscclpp/executor.hpp b/include/mscclpp/executor.hpp index ab54c2596..23dc7cece 100644 --- a/include/mscclpp/executor.hpp +++ b/include/mscclpp/executor.hpp @@ -36,7 +36,7 @@ class ExecutionPlan { class Executor { public: - Executor(std::shared_ptr comm, int nranksPerNode); + Executor(std::shared_ptr comm); Executor(const Executor&) = delete; Executor& operator=(const Executor&) = delete; ~Executor(); diff --git a/python/mscclpp/comm.py b/python/mscclpp/comm.py index b3cc51f12..1cf9ebb41 100644 --- a/python/mscclpp/comm.py +++ b/python/mscclpp/comm.py @@ -51,6 +51,7 @@ def __init__( self.communicator = Communicator(self.bootstrap) self.my_rank = self.bootstrap.get_rank() self.nranks = self.bootstrap.get_n_ranks() + self.nranks_per_node = self.bootstrap.get_n_ranks_per_node() def barrier(self): self.bootstrap.barrier() diff --git a/python/mscclpp/core_py.cpp b/python/mscclpp/core_py.cpp index 3f78dad35..68da91599 100644 --- a/python/mscclpp/core_py.cpp +++ b/python/mscclpp/core_py.cpp @@ -36,6 +36,7 @@ void register_core(nb::module_& m) { nb::class_(m, "Bootstrap") .def("get_rank", &Bootstrap::getRank) .def("get_n_ranks", &Bootstrap::getNranks) + .def("get_n_ranks_per_node", &Bootstrap::getNranksPerNode) .def( "send", [](Bootstrap* self, uintptr_t ptr, size_t size, int peer, int tag) { diff --git a/python/mscclpp/executor.cpp b/python/mscclpp/executor.cpp index 9b411ba8a..9f58eac8f 100644 --- a/python/mscclpp/executor.cpp +++ b/python/mscclpp/executor.cpp @@ -24,7 +24,7 @@ void register_executor(nb::module_& m) { .def(nb::init(), nb::arg("name"), nb::arg("planPath")); nb::class_(m, "Executor") - .def(nb::init, int>(), nb::arg("comm"), nb::arg("nranksPerNode")) + .def(nb::init>(), nb::arg("comm")) .def( "execute", [](Executor* self, int rank, uintptr_t sendbuff, uintptr_t recvBuff, size_t sendBuffSize, size_t recvBuffSize, diff --git a/python/test/executor_test.py b/python/test/executor_test.py index 239ecd4a7..b0e4342dd 100644 --- a/python/test/executor_test.py +++ b/python/test/executor_test.py @@ -41,13 +41,9 @@ def bench_time(niters: int, ngraphIters: int, func): if __name__ == "__main__": - shm_comm = MPI.COMM_WORLD.Split_type(MPI.COMM_TYPE_SHARED, 0, MPI.INFO_NULL) - N_GPUS_PER_NODE = shm_comm.size - shm_comm.Free() - - cp.cuda.Device(MPI.COMM_WORLD.rank % N_GPUS_PER_NODE).use() mscclpp_group = mscclpp_comm.CommGroup(MPI.COMM_WORLD) - executor = Executor(mscclpp_group.communicator, N_GPUS_PER_NODE) + cp.cuda.Device(MPI.COMM_WORLD.rank % mscclpp_group.nranks_per_node).use() + executor = Executor(mscclpp_group.communicator) execution_plan = ExecutionPlan( "allreduce_pairs", path.join(MSCCLPP_ROOT_PATH, "test", "execution-files", "allreduce.json") ) diff --git a/python/test/test_mscclpp.py b/python/test/test_mscclpp.py index f007718e3..c6014b84e 100644 --- a/python/test/test_mscclpp.py +++ b/python/test/test_mscclpp.py @@ -28,7 +28,7 @@ import mscclpp.comm as mscclpp_comm from mscclpp.utils import KernelBuilder, pack from ._cpp import _ext -from .mscclpp_mpi import MpiGroup, parametrize_mpi_groups, mpi_group, N_GPUS_PER_NODE +from .mscclpp_mpi import MpiGroup, parametrize_mpi_groups, mpi_group ethernet_interface_name = "eth0" @@ -602,7 +602,7 @@ def test_executor(mpi_group: MpiGroup, filename: str): pytest.skip("algo not support cross node") project_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) mscclpp_group = mscclpp_comm.CommGroup(mpi_group.comm) - executor = Executor(mscclpp_group.communicator, N_GPUS_PER_NODE) + executor = Executor(mscclpp_group.communicator) execution_plan = ExecutionPlan("allreduce_pairs", os.path.join(project_dir, "test", "execution-files", filename)) nelems = 1024 * 1024 diff --git a/src/bootstrap/bootstrap.cc b/src/bootstrap/bootstrap.cc index c9cea10f4..d6e9a0dfb 100644 --- a/src/bootstrap/bootstrap.cc +++ b/src/bootstrap/bootstrap.cc @@ -81,6 +81,7 @@ class TcpBootstrap::Impl { UniqueId getUniqueId() const; int getRank(); int getNranks(); + int getNranksPerNode(); void allGather(void* allData, int size); void send(void* data, int size, int peer, int tag); void recv(void* data, int size, int peer, int tag); @@ -91,6 +92,7 @@ class TcpBootstrap::Impl { UniqueIdInternal uniqueId_; int rank_; int nRanks_; + int nRanksPerNode_; bool netInitialized; std::unique_ptr listenSockRoot_; std::unique_ptr listenSock_; @@ -141,6 +143,7 @@ UniqueId TcpBootstrap::Impl::getUniqueId(const UniqueIdInternal& uniqueId) { TcpBootstrap::Impl::Impl(int rank, int nRanks) : rank_(rank), nRanks_(nRanks), + nRanksPerNode_(0), netInitialized(false), peerCommAddresses_(nRanks, SocketAddress()), barrierArr_(nRanks, 0), @@ -418,6 +421,26 @@ void TcpBootstrap::Impl::establishConnections(int64_t timeoutSec) { TRACE(MSCCLPP_INIT, "rank %d nranks %d - DONE", rank_, nRanks_); } +int TcpBootstrap::Impl::getNranksPerNode() { + if (nRanksPerNode_ > 0) return nRanksPerNode_; + int nRanksPerNode = 0; + bool useIpv4 = peerCommAddresses_[rank_].sa.sa_family == AF_INET; + for (int i = 0; i < nRanks_; i++) { + if (useIpv4) { + if (peerCommAddresses_[i].sin.sin_addr.s_addr == peerCommAddresses_[rank_].sin.sin_addr.s_addr) { + nRanksPerNode++; + } + } else { + if (std::memcmp(&(peerCommAddresses_[i].sin6.sin6_addr), &(peerCommAddresses_[rank_].sin6.sin6_addr), + sizeof(in6_addr)) == 0) { + nRanksPerNode++; + } + } + } + nRanksPerNode_ = nRanksPerNode; + return nRanksPerNode_; +} + void TcpBootstrap::Impl::allGather(void* allData, int size) { char* data = static_cast(allData); int rank = rank_; @@ -520,6 +543,8 @@ MSCCLPP_API_CPP int TcpBootstrap::getRank() { return pimpl_->getRank(); } MSCCLPP_API_CPP int TcpBootstrap::getNranks() { return pimpl_->getNranks(); } +MSCCLPP_API_CPP int TcpBootstrap::getNranksPerNode() { return pimpl_->getNranksPerNode(); } + MSCCLPP_API_CPP void TcpBootstrap::send(void* data, int size, int peer, int tag) { pimpl_->send(data, size, peer, tag); } diff --git a/src/executor/executor.cc b/src/executor/executor.cc index fb033a73e..2f4fdd264 100644 --- a/src/executor/executor.cc +++ b/src/executor/executor.cc @@ -72,7 +72,8 @@ struct Executor::Impl { std::shared_ptr proxyService; std::unordered_map contexts; - Impl(std::shared_ptr comm, int nranksPerNode) : nranksPerNode(nranksPerNode), comm(comm) { + Impl(std::shared_ptr comm) : comm(comm) { + this->nranksPerNode = comm->bootstrap()->getNranksPerNode(); this->proxyService = std::make_shared(); } ~Impl() = default; @@ -279,8 +280,7 @@ struct Executor::Impl { } }; -Executor::Executor(std::shared_ptr comm, int nranksPerNode) - : impl_(std::make_unique(comm, nranksPerNode)) {} +Executor::Executor(std::shared_ptr comm) : impl_(std::make_unique(comm)) {} void Executor::execute(int rank, void* sendbuff, void* recvBuff, size_t sendBuffSize, size_t recvBuffSize, DataType dataType, int nthreads, const ExecutionPlan& plan, cudaStream_t stream, diff --git a/test/executor_test.cc b/test/executor_test.cc index 213b3cdb6..fafa71412 100644 --- a/test/executor_test.cc +++ b/test/executor_test.cc @@ -27,13 +27,13 @@ int main() { id = bootstrap->createUniqueId(); } MPI_Bcast(&id, sizeof(id), MPI_BYTE, 0, MPI_COMM_WORLD); - bootstrap->initialize(id); // sleep 20s // std::this_thread::sleep_for(std::chrono::seconds(20)); + bootstrap->initialize(id); auto comm = std::make_shared(bootstrap); CUDACHECK(cudaSetDevice(rank)); - std::shared_ptr executor = std::make_shared(comm, 8 /*nranksPerNode*/); + std::shared_ptr executor = std::make_shared(comm); mscclpp::ExecutionPlan plan("allreduce_pairs", MSCCLPP_ROOT_PATH + "/test/execution-files/allreduce.json"); const int bufferSize = 1024 * 1024; std::shared_ptr sendbuff = mscclpp::allocExtSharedCuda(bufferSize); diff --git a/test/mp_unit/bootstrap_tests.cc b/test/mp_unit/bootstrap_tests.cc index 69e566dbd..65ec17027 100644 --- a/test/mp_unit/bootstrap_tests.cc +++ b/test/mp_unit/bootstrap_tests.cc @@ -120,6 +120,13 @@ class MPIBootstrap : public mscclpp::Bootstrap { MPI_Comm_size(MPI_COMM_WORLD, &worldSize); return worldSize; } + int getNranksPerNode() override { + MPI_Comm shmcomm; + MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &shmcomm); + int shmrank; + MPI_Comm_size(shmcomm, &shmrank); + return shmrank; + } void allGather(void* sendbuf, int size) override { MPI_Allgather(MPI_IN_PLACE, 0, MPI_BYTE, sendbuf, size, MPI_BYTE, MPI_COMM_WORLD); } From 149eb416f25de1c5fcf4848cd1efede6ab333c10 Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Thu, 18 Apr 2024 13:53:53 +0000 Subject: [PATCH 50/51] address comments --- test/CMakeLists.txt | 1 - test/executor_test.cc | 47 --------------------------- test/mp_unit/CMakeLists.txt | 1 + test/mp_unit/executor_tests.cc | 58 ++++++++++++++++++++++++++++++++++ test/mp_unit/mp_unit_tests.hpp | 8 +++++ 5 files changed, 67 insertions(+), 48 deletions(-) delete mode 100644 test/executor_test.cc create mode 100644 test/mp_unit/executor_tests.cc diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 501f96ab0..da47066ea 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -24,7 +24,6 @@ endfunction() add_test_executable(allgather_test_cpp allgather_test_cpp.cu) add_test_executable(allgather_test_host_offloading allgather_test_host_offloading.cu) add_test_executable(nvls_test nvls_test.cu) -add_test_executable(executor_test executor_test.cc) configure_file(run_mpi_test.sh.in run_mpi_test.sh) diff --git a/test/executor_test.cc b/test/executor_test.cc deleted file mode 100644 index fafa71412..000000000 --- a/test/executor_test.cc +++ /dev/null @@ -1,47 +0,0 @@ -#include - -#include -#include - -// Check CUDA RT calls -#define CUDACHECK(cmd) \ - do { \ - cudaError_t err = cmd; \ - if (err != cudaSuccess) { \ - printf("%s:%d Cuda failure '%s'\n", __FILE__, __LINE__, cudaGetErrorString(err)); \ - exit(EXIT_FAILURE); \ - } \ - } while (false) - -const std::string MSCCLPP_ROOT_PATH = "/root/mscclpp"; - -int main() { - int rank; - int world_size; - MPI_Init(NULL, NULL); - MPI_Comm_rank(MPI_COMM_WORLD, &rank); - MPI_Comm_size(MPI_COMM_WORLD, &world_size); - auto bootstrap = std::make_shared(rank, world_size); - mscclpp::UniqueId id; - if (rank == 0) { - id = bootstrap->createUniqueId(); - } - MPI_Bcast(&id, sizeof(id), MPI_BYTE, 0, MPI_COMM_WORLD); - // sleep 20s - // std::this_thread::sleep_for(std::chrono::seconds(20)); - bootstrap->initialize(id); - auto comm = std::make_shared(bootstrap); - CUDACHECK(cudaSetDevice(rank)); - - std::shared_ptr executor = std::make_shared(comm); - mscclpp::ExecutionPlan plan("allreduce_pairs", MSCCLPP_ROOT_PATH + "/test/execution-files/allreduce.json"); - const int bufferSize = 1024 * 1024; - std::shared_ptr sendbuff = mscclpp::allocExtSharedCuda(bufferSize); - mscclpp::CudaStreamWithFlags stream(cudaStreamNonBlocking); - executor->execute(rank, sendbuff.get(), sendbuff.get(), bufferSize, bufferSize, mscclpp::DataType::FLOAT16, 512, plan, - stream); - CUDACHECK(cudaStreamSynchronize(stream)); - - MPI_Finalize(); - return 0; -} diff --git a/test/mp_unit/CMakeLists.txt b/test/mp_unit/CMakeLists.txt index dc388844f..8e37d2405 100644 --- a/test/mp_unit/CMakeLists.txt +++ b/test/mp_unit/CMakeLists.txt @@ -8,4 +8,5 @@ target_sources(mp_unit_tests PRIVATE communicator_tests.cu proxy_channel_tests.cu sm_channel_tests.cu + executor_tests.cc ) diff --git a/test/mp_unit/executor_tests.cc b/test/mp_unit/executor_tests.cc new file mode 100644 index 000000000..6bfe09516 --- /dev/null +++ b/test/mp_unit/executor_tests.cc @@ -0,0 +1,58 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. + +#include + +#include + +#include "mp_unit_tests.hpp" + +namespace { +std::string getExecutablePath() { + char result[PATH_MAX]; + ssize_t count = readlink("/proc/self/exe", result, PATH_MAX); + if (count == -1) { + throw std::runtime_error("Failed to get executable path"); + } + return std::string(result, count); +} +} // namespace + +void ExecutorTest::SetUp() { + MultiProcessTest::SetUp(); + + MSCCLPP_CUDATHROW(cudaSetDevice(rankToLocalRank(gEnv->rank))); + std::shared_ptr bootstrap; + mscclpp::UniqueId id; + if (gEnv->rank < gEnv->worldSize) { + bootstrap = std::make_shared(gEnv->rank, gEnv->worldSize); + if (gEnv->rank == 0) id = bootstrap->createUniqueId(); + } + MPI_Bcast(&id, sizeof(id), MPI_BYTE, 0, MPI_COMM_WORLD); + bootstrap->initialize(id); + std::shared_ptr communicator = std::make_shared(bootstrap); + executor = std::make_shared(communicator); +} + +void ExecutorTest::TearDown() { + executor.reset(); + MultiProcessTest::TearDown(); +} + +TEST_F(ExecutorTest, TwoNodesAllreduce) { + if (gEnv->worldSize != 2 || gEnv->nRanksPerNode != 2) { + GTEST_SKIP() << "This test requires world size to be 2 and ranks per node to be 2"; + return; + } + std::string executablePath = getExecutablePath(); + std::filesystem::path path = executablePath; + std::filesystem::path executionFilesPath = + path.parent_path().parent_path().parent_path() / "test/execution-files/allreduce.json"; + mscclpp::ExecutionPlan plan("allreduce_pairs", executionFilesPath.string()); + const int bufferSize = 1024 * 1024; + std::shared_ptr sendbuff = mscclpp::allocExtSharedCuda(bufferSize); + mscclpp::CudaStreamWithFlags stream(cudaStreamNonBlocking); + executor->execute(gEnv->rank, sendbuff.get(), sendbuff.get(), bufferSize, bufferSize, mscclpp::DataType::FLOAT16, 512, + plan, stream); + MSCCLPP_CUDATHROW(cudaStreamSynchronize(stream)); +} diff --git a/test/mp_unit/mp_unit_tests.hpp b/test/mp_unit/mp_unit_tests.hpp index e934dee49..6cb159c67 100644 --- a/test/mp_unit/mp_unit_tests.hpp +++ b/test/mp_unit/mp_unit_tests.hpp @@ -7,6 +7,7 @@ #include #include +#include #include #include #include @@ -155,4 +156,11 @@ class SmChannelOneToOneTest : public CommunicatorTestBase { std::unordered_map> smSemaphores; }; +class ExecutorTest : public MultiProcessTest { + protected: + void SetUp() override; + void TearDown() override; + + std::shared_ptr executor; +}; #endif // MSCCLPP_MP_UNIT_TESTS_HPP_ From c407f29f9d420f81540ef0e11a6267530c886569 Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Thu, 18 Apr 2024 18:16:29 +0000 Subject: [PATCH 51/51] Python binding for ExecutorError --- python/mscclpp/error_py.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/mscclpp/error_py.cpp b/python/mscclpp/error_py.cpp index 18d4b834a..af78ac880 100644 --- a/python/mscclpp/error_py.cpp +++ b/python/mscclpp/error_py.cpp @@ -16,7 +16,8 @@ void register_error(nb::module_& m) { .value("RemoteError", ErrorCode::RemoteError) .value("InvalidUsage", ErrorCode::InvalidUsage) .value("Timeout", ErrorCode::Timeout) - .value("Aborted", ErrorCode::Aborted); + .value("Aborted", ErrorCode::Aborted) + .value("ExecutorError", ErrorCode::ExecutorError); nb::class_(m, "BaseError") .def(nb::init(), nb::arg("message"), nb::arg("errorCode"))